PyPI - natural-pdf - Versions diffs - 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl - Mend

natural-pdf 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (134)

natural_pdf/__init__.py +1 -0
natural_pdf/analyzers/layout/base.py +1 -5
natural_pdf/analyzers/layout/gemini.py +61 -51
natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
natural_pdf/analyzers/layout/layout_manager.py +26 -84
natural_pdf/analyzers/layout/layout_options.py +7 -0
natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
natural_pdf/analyzers/layout/surya.py +46 -123
natural_pdf/analyzers/layout/tatr.py +51 -4
natural_pdf/analyzers/text_structure.py +3 -5
natural_pdf/analyzers/utils.py +3 -3
natural_pdf/classification/manager.py +230 -151
natural_pdf/classification/mixin.py +49 -35
natural_pdf/classification/results.py +64 -46
natural_pdf/collections/mixins.py +68 -20
natural_pdf/collections/pdf_collection.py +177 -64
natural_pdf/core/element_manager.py +30 -14
natural_pdf/core/highlighting_service.py +13 -22
natural_pdf/core/page.py +423 -101
natural_pdf/core/pdf.py +633 -190
natural_pdf/elements/base.py +134 -40
natural_pdf/elements/collections.py +503 -131
natural_pdf/elements/region.py +659 -90
natural_pdf/elements/text.py +1 -1
natural_pdf/export/mixin.py +137 -0
natural_pdf/exporters/base.py +3 -3
natural_pdf/exporters/paddleocr.py +4 -3
natural_pdf/extraction/manager.py +50 -49
natural_pdf/extraction/mixin.py +90 -57
natural_pdf/extraction/result.py +9 -23
natural_pdf/ocr/__init__.py +5 -5
natural_pdf/ocr/engine_doctr.py +346 -0
natural_pdf/ocr/ocr_factory.py +24 -4
natural_pdf/ocr/ocr_manager.py +61 -25
natural_pdf/ocr/ocr_options.py +70 -10
natural_pdf/ocr/utils.py +6 -4
natural_pdf/search/__init__.py +20 -34
natural_pdf/search/haystack_search_service.py +309 -265
natural_pdf/search/haystack_utils.py +99 -75
natural_pdf/search/search_service_protocol.py +11 -12
natural_pdf/selectors/parser.py +219 -143
natural_pdf/utils/debug.py +3 -3
natural_pdf/utils/identifiers.py +1 -1
natural_pdf/utils/locks.py +1 -1
natural_pdf/utils/packaging.py +8 -6
natural_pdf/utils/text_extraction.py +24 -16
natural_pdf/utils/tqdm_utils.py +18 -10
natural_pdf/utils/visualization.py +18 -0
natural_pdf/widgets/viewer.py +4 -25
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +12 -3
natural_pdf-0.1.9.dist-info/RECORD +80 -0
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
docs/api/index.md +0 -386
docs/assets/favicon.png +0 -3
docs/assets/favicon.svg +0 -3
docs/assets/javascripts/custom.js +0 -17
docs/assets/logo.svg +0 -3
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +0 -17
docs/assets/social-preview.svg +0 -17
docs/assets/stylesheets/custom.css +0 -65
docs/categorizing-documents/index.md +0 -168
docs/data-extraction/index.md +0 -87
docs/document-qa/index.ipynb +0 -435
docs/document-qa/index.md +0 -79
docs/element-selection/index.ipynb +0 -969
docs/element-selection/index.md +0 -249
docs/finetuning/index.md +0 -176
docs/index.md +0 -189
docs/installation/index.md +0 -69
docs/interactive-widget/index.ipynb +0 -962
docs/interactive-widget/index.md +0 -12
docs/layout-analysis/index.ipynb +0 -818
docs/layout-analysis/index.md +0 -185
docs/ocr/index.md +0 -256
docs/pdf-navigation/index.ipynb +0 -314
docs/pdf-navigation/index.md +0 -97
docs/regions/index.ipynb +0 -816
docs/regions/index.md +0 -294
docs/tables/index.ipynb +0 -658
docs/tables/index.md +0 -144
docs/text-analysis/index.ipynb +0 -370
docs/text-analysis/index.md +0 -105
docs/text-extraction/index.ipynb +0 -1478
docs/text-extraction/index.md +0 -292
docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
docs/tutorials/01-loading-and-extraction.md +0 -95
docs/tutorials/02-finding-elements.ipynb +0 -417
docs/tutorials/02-finding-elements.md +0 -149
docs/tutorials/03-extracting-blocks.ipynb +0 -152
docs/tutorials/03-extracting-blocks.md +0 -48
docs/tutorials/04-table-extraction.ipynb +0 -119
docs/tutorials/04-table-extraction.md +0 -50
docs/tutorials/05-excluding-content.ipynb +0 -275
docs/tutorials/05-excluding-content.md +0 -109
docs/tutorials/06-document-qa.ipynb +0 -337
docs/tutorials/06-document-qa.md +0 -91
docs/tutorials/07-layout-analysis.ipynb +0 -293
docs/tutorials/07-layout-analysis.md +0 -66
docs/tutorials/07-working-with-regions.ipynb +0 -414
docs/tutorials/07-working-with-regions.md +0 -151
docs/tutorials/08-spatial-navigation.ipynb +0 -513
docs/tutorials/08-spatial-navigation.md +0 -190
docs/tutorials/09-section-extraction.ipynb +0 -2439
docs/tutorials/09-section-extraction.md +0 -256
docs/tutorials/10-form-field-extraction.ipynb +0 -517
docs/tutorials/10-form-field-extraction.md +0 -201
docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
docs/tutorials/11-enhanced-table-processing.md +0 -9
docs/tutorials/12-ocr-integration.ipynb +0 -3712
docs/tutorials/12-ocr-integration.md +0 -137
docs/tutorials/13-semantic-search.ipynb +0 -1718
docs/tutorials/13-semantic-search.md +0 -77
docs/visual-debugging/index.ipynb +0 -2970
docs/visual-debugging/index.md +0 -157
docs/visual-debugging/region.png +0 -0
natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
natural_pdf/templates/spa/css/style.css +0 -334
natural_pdf/templates/spa/index.html +0 -31
natural_pdf/templates/spa/js/app.js +0 -472
natural_pdf/templates/spa/words.txt +0 -235976
natural_pdf/widgets/frontend/viewer.js +0 -88
natural_pdf-0.1.8.dist-info/RECORD +0 -156
notebooks/Examples.ipynb +0 -1293
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +0 -543
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0

natural_pdf/classification/mixin.py CHANGED Viewed

@@ -3,21 +3,23 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Un
 # Assuming PIL is installed as it's needed for vision
 try:
-     from PIL import Image
+    from PIL import Image
 except ImportError:
-     Image = None # type: ignore
+    Image = None  # type: ignore
 # Import result classes
-from .results import ClassificationResult # Assuming results.py is in the same dir
+from .results import ClassificationResult  # Assuming results.py is in the same dir
 if TYPE_CHECKING:
     # Avoid runtime import cycle
     from natural_pdf.core.page import Page
     from natural_pdf.elements.region import Region
     from .manager import ClassificationManager
 logger = logging.getLogger(__name__)
 class ClassificationMixin:
     """
     Mixin class providing classification capabilities to Page and Region objects.
@@ -38,18 +40,18 @@ class ClassificationMixin:
     # Host class needs 'analyses' attribute initialized as Dict[str, Any]
     # analyses: Dict[str, Any]
-    # --- End Abstract --- #
+    # --- End Abstract --- #
     def classify(
         self,
         categories: List[str],
-        model: Optional[str] = None, # Default handled by manager
-        using: Optional[str] = None, # Renamed parameter
+        model: Optional[str] = None,  # Default handled by manager
+        using: Optional[str] = None,  # Renamed parameter
         min_confidence: float = 0.0,
-        analysis_key: str = 'classification', # Default key
+        analysis_key: str = "classification",  # Default key
         multi_label: bool = False,
-        **kwargs
-    ) -> "ClassificationMixin": # Return self for chaining
+        **kwargs,
+    ) -> "ClassificationMixin":  # Return self for chaining
         """
         Classifies this item (Page or Region) using the configured manager.
@@ -71,22 +73,30 @@ class ClassificationMixin:
             Self for method chaining.
         """
         # Ensure analyses dict exists
-        if not hasattr(self, 'analyses') or self.analyses is None:
-             logger.warning("'analyses' attribute not found or is None. Initializing as empty dict.")
-             self.analyses = {}
+        if not hasattr(self, "analyses") or self.analyses is None:
+            logger.warning("'analyses' attribute not found or is None. Initializing as empty dict.")
+            self.analyses = {}
         try:
             manager = self._get_classification_manager()
             # Determine the effective model ID and engine type
             effective_model_id = model
-            inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
+            inferred_using = manager.infer_using(
+                model if model else manager.DEFAULT_TEXT_MODEL, using
+            )
             # If model was not provided, use the manager's default for the inferred engine type
             if effective_model_id is None:
-                effective_model_id = manager.DEFAULT_TEXT_MODEL if inferred_using == 'text' else manager.DEFAULT_VISION_MODEL
-                logger.debug(f"No model provided, using default for mode '{inferred_using}': '{effective_model_id}'")
+                effective_model_id = (
+                    manager.DEFAULT_TEXT_MODEL
+                    if inferred_using == "text"
+                    else manager.DEFAULT_VISION_MODEL
+                )
+                logger.debug(
+                    f"No model provided, using default for mode '{inferred_using}': '{effective_model_id}'"
+                )
             # Get content based on the *final* determined engine type
             content = self._get_classification_content(model_type=inferred_using, **kwargs)
@@ -94,11 +104,11 @@ class ClassificationMixin:
             result_obj: ClassificationResult = manager.classify_item(
                 item_content=content,
                 categories=categories,
-                model_id=effective_model_id, # Pass the resolved model ID
-                using=inferred_using, # Pass renamed argument
+                model_id=effective_model_id,  # Pass the resolved model ID
+                using=inferred_using,  # Pass renamed argument
                 min_confidence=min_confidence,
                 multi_label=multi_label,
-                **kwargs
+                **kwargs,
             )
             # Store the structured result object under the specified key
@@ -106,8 +116,8 @@ class ClassificationMixin:
             logger.debug(f"Stored classification result under key '{analysis_key}': {result_obj}")
         except NotImplementedError as nie:
-             logger.error(f"Classification cannot proceed: {nie}")
-             raise
+            logger.error(f"Classification cannot proceed: {nie}")
+            raise
         except Exception as e:
             logger.error(f"Classification failed: {e}", exc_info=True)
             # Optionally re-raise or just log and return self
@@ -118,32 +128,36 @@ class ClassificationMixin:
     @property
     def classification_results(self) -> Optional[ClassificationResult]:
         """Returns the ClassificationResult from the *default* ('classification') key, or None."""
-        if not hasattr(self, 'analyses') or self.analyses is None:
+        if not hasattr(self, "analyses") or self.analyses is None:
             return None
         # Return the result object directly from the default key
-        return self.analyses.get('classification')
+        return self.analyses.get("classification")
     @property
     def category(self) -> Optional[str]:
         """Returns the top category label from the *default* ('classification') key, or None."""
-        result_obj = self.classification_results # Uses the property above
+        result_obj = self.classification_results  # Uses the property above
         # Access the property on the result object
         return result_obj.top_category if result_obj else None
     @property
     def category_confidence(self) -> Optional[float]:
         """Returns the top category confidence from the *default* ('classification') key, or None."""
-        result_obj = self.classification_results # Uses the property above
+        result_obj = self.classification_results  # Uses the property above
         # Access the property on the result object
         return result_obj.top_confidence if result_obj else None
     # Maybe add a helper to get results by specific key?
-    def get_classification_result(self, analysis_key: str = 'classification') -> Optional[ClassificationResult]:
-         """Gets a classification result object stored under a specific key."""
-         if not hasattr(self, 'analyses') or self.analyses is None:
-             return None
-         result = self.analyses.get(analysis_key)
-         if result is not None and not isinstance(result, ClassificationResult):
-              logger.warning(f"Item found under key '{analysis_key}' is not a ClassificationResult (type: {type(result)}). Returning None.")
-              return None
-         return result
+    def get_classification_result(
+        self, analysis_key: str = "classification"
+    ) -> Optional[ClassificationResult]:
+        """Gets a classification result object stored under a specific key."""
+        if not hasattr(self, "analyses") or self.analyses is None:
+            return None
+        result = self.analyses.get(analysis_key)
+        if result is not None and not isinstance(result, ClassificationResult):
+            logger.warning(
+                f"Item found under key '{analysis_key}' is not a ClassificationResult (type: {type(result)}). Returning None."
+            )
+            return None
+        return result

natural_pdf/classification/results.py CHANGED Viewed

@@ -1,62 +1,80 @@
 # natural_pdf/classification/results.py
-from typing import List, Optional, Dict, Any
-from datetime import datetime
 import logging
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Any, Dict, List, Optional
 logger = logging.getLogger(__name__)
+@dataclass
 class CategoryScore:
-    """Represents the score for a single category."""
-    label: str
-    confidence: float # Score between 0.0 and 1.0
-    def __init__(self, label: str, confidence: float):
-        # Basic validation
-        if not isinstance(label, str) or not label:
-             logger.warning(f"Initializing CategoryScore with invalid label: {label}")
-             # Fallback or raise? For now, allow but log.
-             # raise ValueError("Category label must be a non-empty string.")
-        if not isinstance(confidence, (float, int)) or not (0.0 <= confidence <= 1.0):
-             logger.warning(f"Initializing CategoryScore with invalid confidence: {confidence} for label '{label}'. Clamping to [0, 1].")
-             confidence = max(0.0, min(1.0, float(confidence)))
-             # raise ValueError("Category confidence must be a float between 0.0 and 1.0.")
-        self.label = str(label)
-        self.confidence = float(confidence)
-    def __repr__(self):
-        return f"<CategoryScore label='{self.label}' confidence={self.confidence:.3f}>"
+    """Represents a category and its confidence score from classification."""
+    category: str
+    score: float
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary for serialization."""
+        return {"category": self.category, "score": self.score}
+@dataclass
 class ClassificationResult:
-    """Holds the structured results of a classification task."""
+    """Results from a classification operation."""
+    category: str
+    score: float
+    scores: List[CategoryScore]
     model_id: str
-    using: str # Renamed from engine_type ('text' or 'vision')
     timestamp: datetime
-    parameters: Dict[str, Any] # e.g., {'categories': [...], 'min_confidence': 0.1}
-    scores: List[CategoryScore] # List of scores above threshold, sorted by confidence
-    def __init__(self, model_id: str, using: str, timestamp: datetime, parameters: Dict[str, Any], scores: List[CategoryScore]):
-        if not isinstance(scores, list) or not all(isinstance(s, CategoryScore) for s in scores):
-             raise TypeError("Scores must be a list of CategoryScore objects.")
-        self.model_id = str(model_id)
-        self.using = str(using) # Renamed from engine_type
-        self.timestamp = timestamp
-        self.parameters = parameters if parameters is not None else {}
-        # Ensure scores are sorted descending by confidence
-        self.scores = sorted(scores, key=lambda s: s.confidence, reverse=True)
+    using: str  # 'text' or 'vision'
+    parameters: Optional[Dict[str, Any]] = None
+    def __init__(
+        self,
+        category: str,
+        score: float,
+        scores: List[CategoryScore],
+        model_id: str,
+        using: str,
+        parameters: Optional[Dict[str, Any]] = None,
+        timestamp: Optional[datetime] = None,
+    ):
+        self.category = category
+        self.score = score
+        self.scores = scores
+        self.model_id = model_id
+        self.using = using
+        self.parameters = parameters or {}
+        self.timestamp = timestamp or datetime.now()
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Convert the classification result to a dictionary for serialization.
+        Returns:
+            Dictionary representation of the classification result
+        """
+        return {
+            "category": self.category,
+            "score": self.score,
+            "scores": [s.to_dict() for s in self.scores],
+            "model_id": self.model_id,
+            "using": self.using,
+            "parameters": self.parameters,
+            "timestamp": self.timestamp.isoformat(),
+        }
     @property
-    def top_category(self) -> Optional[str]:
-        """Returns the label of the category with the highest confidence."""
-        return self.scores[0].label if self.scores else None
+    def top_category(self) -> str:
+        """Returns the category with the highest score."""
+        return self.category
     @property
-    def top_confidence(self) -> Optional[float]:
+    def top_confidence(self) -> float:
         """Returns the confidence score of the top category."""
-        return self.scores[0].confidence if self.scores else None
+        return self.score
-    def __repr__(self):
-        top_cat = f" top='{self.top_category}' ({self.top_confidence:.2f})" if self.scores else ""
-        num_scores = len(self.scores)
-        return f"<ClassificationResult model='{self.model_id}' using='{self.using}' scores={num_scores}{top_cat}>"
+    def __repr__(self) -> str:
+        return f"<ClassificationResult category='{self.category}' score={self.score:.3f} model='{self.model_id}'>"

natural_pdf/collections/mixins.py CHANGED Viewed

@@ -1,10 +1,38 @@
 import logging
-from typing import Callable, Iterable, Any, TypeVar
+from typing import Any, Callable, Iterable, TypeVar
 from tqdm.auto import tqdm
 logger = logging.getLogger(__name__)
-T = TypeVar("T") # Generic type for items in the collection
+T = TypeVar("T")  # Generic type for items in the collection
+class DirectionalCollectionMixin:
+    """
+    Mixin providing directional methods for collections of elements/regions.
+    """
+    def below(self, **kwargs) -> "ElementCollection":
+        """Find regions below all elements in this collection."""
+        return self.apply(lambda element: element.below(**kwargs))
+    def above(self, **kwargs) -> "ElementCollection":
+        """Find regions above all elements in this collection."""
+        return self.apply(lambda element: element.above(**kwargs))
+    def left(self, **kwargs) -> "ElementCollection":
+        """Find regions to the left of all elements in this collection."""
+        return self.apply(lambda element: element.left(**kwargs))
+    def right(self, **kwargs) -> "ElementCollection":
+        """Find regions to the right of all elements in this collection."""
+        return self.apply(lambda element: element.right(**kwargs))
+    def expand(self, **kwargs) -> "ElementCollection":
+        """Expand all elements in this collection."""
+        return self.apply(lambda element: element.expand(**kwargs))
 class ApplyMixin:
     """
@@ -13,6 +41,7 @@ class ApplyMixin:
     Assumes the inheriting class implements `__iter__` and `__len__` appropriately
     for the items to be processed by `apply`.
     """
     def _get_items_for_apply(self) -> Iterable[Any]:
         """
         Returns the iterable of items to apply the function to.
@@ -22,7 +51,7 @@ class ApplyMixin:
         # Default to standard iteration over the collection itself
         return iter(self)
-    def apply(self: Any, func: Callable[[Any, ...], Any], *args, **kwargs) -> None:
+    def apply(self: Any, func: Callable[[Any, ...], Any], *args, **kwargs) -> Iterable[Any]:
         """
         Applies a function to each item in the collection.
@@ -34,7 +63,7 @@ class ApplyMixin:
                       A special keyword argument 'show_progress' (bool, default=False)
                       can be used to display a progress bar.
         """
-        show_progress = kwargs.pop('show_progress', False)
+        show_progress = kwargs.pop("show_progress", False)
         # Derive unit name from class name
         unit_name = self.__class__.__name__.lower()
         items_iterable = self._get_items_for_apply()
@@ -42,22 +71,41 @@ class ApplyMixin:
         # Need total count for tqdm, assumes __len__ is implemented by the inheriting class
         total_items = 0
         try:
-             total_items = len(self)
-        except TypeError: # Handle cases where __len__ might not be defined on self
-             logger.warning(f"Could not determine collection length for progress bar.")
+            total_items = len(self)
+        except TypeError:  # Handle cases where __len__ might not be defined on self
+            logger.warning(f"Could not determine collection length for progress bar.")
         if show_progress and total_items > 0:
-            items_iterable = tqdm(items_iterable, total=total_items, desc=f"Applying {func.__name__}", unit=unit_name)
+            items_iterable = tqdm(
+                items_iterable, total=total_items, desc=f"Applying {func.__name__}", unit=unit_name
+            )
         elif show_progress:
-             logger.info(f"Applying {func.__name__} (progress bar disabled for zero/unknown length).")
-        for item in items_iterable:
-            try:
-                # Apply the function with the item and any extra args/kwargs
-                func(item, *args, **kwargs)
-            except Exception as e:
-                # Log and continue for batch operations
-                logger.error(f"Error applying {func.__name__} to {item}: {e}", exc_info=True)
-                # Optionally add a mechanism to collect errors
-        # Returns None, primarily used for side effects.
+            logger.info(
+                f"Applying {func.__name__} (progress bar disabled for zero/unknown length)."
+            )
+        results = [func(item, *args, **kwargs) for item in items_iterable]
+        # If results is empty, return an empty list
+        if not results:
+            return []
+        # Import here to avoid circular imports
+        from natural_pdf import PDF, Page
+        from natural_pdf.collections.pdf_collection import PDFCollection
+        from natural_pdf.elements.base import Element
+        from natural_pdf.elements.collections import ElementCollection, PageCollection
+        from natural_pdf.elements.region import Region
+        first_non_none = next((r for r in results if r is not None), None)
+        first_type = type(first_non_none) if first_non_none is not None else None
+        # Return the appropriate collection based on result type (...generally)
+        if issubclass(first_type, Element) or issubclass(first_type, Region):
+            return ElementCollection(results)
+        elif first_type == PDF:
+            return PDFCollection(results)
+        elif first_type == Page:
+            return PageCollection(results)
+        return results

natural-pdf 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl

natural-pdf 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl