PyPI - natural-pdf - Versions diffs - 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl - Mend

natural-pdf 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61)

docs/finetuning/index.md +176 -0
docs/ocr/index.md +34 -47
docs/tutorials/01-loading-and-extraction.ipynb +34 -1536
docs/tutorials/02-finding-elements.ipynb +42 -42
docs/tutorials/03-extracting-blocks.ipynb +17 -17
docs/tutorials/04-table-extraction.ipynb +12 -12
docs/tutorials/05-excluding-content.ipynb +30 -30
docs/tutorials/06-document-qa.ipynb +28 -28
docs/tutorials/07-layout-analysis.ipynb +63 -35
docs/tutorials/07-working-with-regions.ipynb +55 -51
docs/tutorials/07-working-with-regions.md +2 -2
docs/tutorials/08-spatial-navigation.ipynb +60 -60
docs/tutorials/09-section-extraction.ipynb +113 -113
docs/tutorials/10-form-field-extraction.ipynb +78 -50
docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
docs/tutorials/12-ocr-integration.ipynb +149 -131
docs/tutorials/12-ocr-integration.md +0 -13
docs/tutorials/13-semantic-search.ipynb +313 -873
natural_pdf/__init__.py +21 -22
natural_pdf/analyzers/layout/gemini.py +280 -0
natural_pdf/analyzers/layout/layout_manager.py +28 -1
natural_pdf/analyzers/layout/layout_options.py +11 -0
natural_pdf/analyzers/layout/yolo.py +6 -2
natural_pdf/collections/pdf_collection.py +24 -0
natural_pdf/core/element_manager.py +18 -13
natural_pdf/core/page.py +174 -36
natural_pdf/core/pdf.py +156 -42
natural_pdf/elements/base.py +9 -17
natural_pdf/elements/collections.py +99 -38
natural_pdf/elements/region.py +77 -37
natural_pdf/elements/text.py +5 -0
natural_pdf/exporters/__init__.py +4 -0
natural_pdf/exporters/base.py +61 -0
natural_pdf/exporters/paddleocr.py +345 -0
natural_pdf/ocr/__init__.py +57 -36
natural_pdf/ocr/engine.py +160 -49
natural_pdf/ocr/engine_easyocr.py +178 -157
natural_pdf/ocr/engine_paddle.py +114 -189
natural_pdf/ocr/engine_surya.py +87 -144
natural_pdf/ocr/ocr_factory.py +125 -0
natural_pdf/ocr/ocr_manager.py +65 -89
natural_pdf/ocr/ocr_options.py +8 -13
natural_pdf/ocr/utils.py +113 -0
natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
natural_pdf/templates/spa/css/style.css +334 -0
natural_pdf/templates/spa/index.html +31 -0
natural_pdf/templates/spa/js/app.js +472 -0
natural_pdf/templates/spa/words.txt +235976 -0
natural_pdf/utils/debug.py +34 -0
natural_pdf/utils/identifiers.py +33 -0
natural_pdf/utils/packaging.py +485 -0
natural_pdf/utils/text_extraction.py +44 -64
natural_pdf/utils/visualization.py +1 -1
{natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +44 -20
{natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +58 -47
{natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +1 -1
{natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -1
natural_pdf/templates/ocr_debug.html +0 -517
tests/test_loading.py +0 -50
tests/test_optional_deps.py +0 -298
{natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0

natural_pdf/elements/base.py CHANGED Viewed

@@ -306,17 +306,17 @@ class DirectionalMixin:
             **kwargs,
         )
+    def to_region(self):
+        return self.expand()
     def expand(
         self,
         left: float = 0,
         right: float = 0,
-        top_expand: float = 0,  # Renamed to avoid conflict
-        bottom_expand: float = 0,  # Renamed to avoid conflict
+        top: float = 0,
+        bottom: float = 0,
         width_factor: float = 1.0,
         height_factor: float = 1.0,
-        # Keep original parameter names for backward compatibility
-        top: float = None,
-        bottom: float = None,
     ) -> "Region":
         """
         Create a new region expanded from this element/region.
@@ -324,12 +324,10 @@ class DirectionalMixin:
         Args:
             left: Amount to expand left edge (positive value expands leftwards)
             right: Amount to expand right edge (positive value expands rightwards)
-            top_expand: Amount to expand top edge (positive value expands upwards)
-            bottom_expand: Amount to expand bottom edge (positive value expands downwards)
+            top: Amount to expand top edge (positive value expands upwards)
+            bottom: Amount to expand bottom edge (positive value expands downwards)
             width_factor: Factor to multiply width by (applied after absolute expansion)
             height_factor: Factor to multiply height by (applied after absolute expansion)
-            top: (DEPRECATED, use top_expand) Amount to expand top edge (upward)
-            bottom: (DEPRECATED, use bottom_expand) Amount to expand bottom edge (downward)
         Returns:
             New expanded Region object
@@ -340,17 +338,11 @@ class DirectionalMixin:
         new_top = self.top
         new_bottom = self.bottom
-        # Handle the deprecated parameter names for backward compatibility
-        if top is not None:
-            top_expand = top
-        if bottom is not None:
-            bottom_expand = bottom
         # Apply absolute expansions first
         new_x0 -= left
         new_x1 += right
-        new_top -= top_expand  # Expand upward (decrease top coordinate)
-        new_bottom += bottom_expand  # Expand downward (increase bottom coordinate)
+        new_top -= top  # Expand upward (decrease top coordinate)
+        new_bottom += bottom  # Expand downward (increase bottom coordinate)
         # Apply percentage factors if provided
         if width_factor != 1.0 or height_factor != 1.0:

natural_pdf/elements/collections.py CHANGED Viewed

@@ -21,6 +21,7 @@ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_t
 from natural_pdf.elements.text import TextElement  # Needed for isinstance check
 from natural_pdf.ocr import OCROptions
 from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
+from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements  # Import the new utility
 logger = logging.getLogger(__name__)
@@ -1118,6 +1119,42 @@ class ElementCollection(Generic[T]):
         results = self.find_all(selector, regex=regex, case=case, **kwargs)
         return results.first
+    def correct_ocr(
+        self,
+        correction_callback: Callable[[Any], Optional[str]],
+    ) -> "ElementCollection":
+        """
+        Applies corrections to OCR-generated text elements within this collection
+        using a user-provided callback function.
+        Iterates through elements currently in the collection. If an element's
+        'source' attribute starts with 'ocr', it calls the `correction_callback`
+        for that element, passing the element itself.
+        The `correction_callback` should contain the logic to:
+        1. Determine if the element needs correction.
+        2. Perform the correction (e.g., call an LLM).
+        3. Return the new text (`str`) or `None`.
+        If the callback returns a string, the element's `.text` is updated in place.
+        Metadata updates (source, confidence, etc.) should happen within the callback.
+        Elements without a source starting with 'ocr' are skipped.
+        Args:
+            correction_callback: A function accepting an element and returning
+                                 `Optional[str]` (new text or None).
+        Returns:
+            Self for method chaining.
+        """
+        # Delegate to the utility function
+        _apply_ocr_correction_to_elements(
+            elements=self._elements,
+            correction_callback=correction_callback,
+            caller_info=f"ElementCollection(len={len(self._elements)})",  # Pass caller info
+        )
+        return self  # Return self for chaining
 class PageCollection(Generic[P]):
     """
@@ -1178,33 +1215,38 @@ class PageCollection(Generic[P]):
     def apply_ocr(
         self,
         engine: Optional[str] = None,
-        options: Optional[OCROptions] = None,
+        # --- Common OCR Parameters (Direct Arguments) ---
         languages: Optional[List[str]] = None,
-        min_confidence: Optional[float] = None,
+        min_confidence: Optional[float] = None,  # Min confidence threshold
         device: Optional[str] = None,
+        resolution: Optional[int] = None,  # DPI for rendering
+        apply_exclusions: bool = True,  # New parameter
+        # --- Engine-Specific Options ---
+        options: Optional[Any] = None,  # e.g., EasyOCROptions(...)
     ) -> "PageCollection[P]":
         """
         Applies OCR to all pages within this collection using batch processing.
-        This delegates the work to the parent PDF object's `apply_ocr` method for efficiency. The OCR results (TextElements) are added directly
-        to the respective Page objects within this collection.
+        This delegates the work to the parent PDF object's `apply_ocr` method.
         Args:
-            engine: Name of the engine (e.g., 'easyocr', 'paddleocr', 'surya').
-                    Uses manager's default if None. Ignored if 'options' is provided.
-            options: An specific Options object (e.g., EasyOCROptions) for
-                     advanced configuration. Overrides simple arguments.
-            languages: List of language codes for simple mode.
-            min_confidence: Minimum confidence threshold for simple mode.
-            device: Device string ('cpu', 'cuda', etc.) for simple mode.
+            engine: Name of the OCR engine (e.g., 'easyocr', 'paddleocr').
+            languages: List of language codes (e.g., ['en', 'fr'], ['en', 'ch']).
+                       **Must be codes understood by the specific selected engine.**
+                       No mapping is performed.
+            min_confidence: Minimum confidence threshold for detected text (0.0 to 1.0).
+            device: Device to run OCR on (e.g., 'cpu', 'cuda', 'mps').
+            resolution: DPI resolution to render page images before OCR (e.g., 150, 300).
+            apply_exclusions: If True (default), render page images for OCR with
+                              excluded areas masked (whited out). If False, OCR
+                              the raw page images without masking exclusions.
+            options: An engine-specific options object (e.g., EasyOCROptions) or dict.
         Returns:
             Self for method chaining.
         Raises:
-            RuntimeError: If pages in the collection lack a parent PDF object
-                          or if the parent PDF object lacks the required
-                          `apply_ocr` method.
+            RuntimeError: If pages lack a parent PDF or parent lacks `apply_ocr`.
             (Propagates exceptions from PDF.apply_ocr)
         """
         if not self.pages:
@@ -1218,7 +1260,6 @@ class PageCollection(Generic[P]):
         parent_pdf = first_page._parent
-        # Updated check for renamed method
         if not hasattr(parent_pdf, "apply_ocr") or not callable(parent_pdf.apply_ocr):
             raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")
@@ -1227,15 +1268,16 @@ class PageCollection(Generic[P]):
         logger.info(f"Applying OCR via parent PDF to page indices: {page_indices} in collection.")
-        # Delegate the batch call to the parent PDF object (using renamed method)
+        # Delegate the batch call to the parent PDF object, passing direct args and apply_exclusions
         parent_pdf.apply_ocr(
             pages=page_indices,
             engine=engine,
-            options=options,
             languages=languages,
-            min_confidence=min_confidence,
+            min_confidence=min_confidence,  # Pass the renamed parameter
             device=device,
-            # Pass any other relevant simple_kwargs here if added
+            resolution=resolution,
+            apply_exclusions=apply_exclusions,  # Pass down
+            options=options,
         )
         # The PDF method modifies the Page objects directly by adding elements.
@@ -1279,25 +1321,44 @@ class PageCollection(Generic[P]):
         return ElementCollection(all_elements)
-    # def debug_ocr(self, output_path):
-    #     """
-    #     Generate an interactive HTML debug report for OCR results.
-    #     This creates a single-file HTML report with:
-    #     - Side-by-side view of image regions and OCR text
-    #     - Confidence scores with color coding
-    #     - Editable correction fields
-    #     - Filtering and sorting options
-    #     - Export functionality for corrected text
-    #     Args:
-    #         output_path: Path to save the HTML report
-    #     Returns:
-    #         Path to the generated HTML file
-    #     """
-    #     from natural_pdf.utils.ocr import debug_ocr_to_html
-    #     return debug_ocr_to_html(self.pages, output_path)
+    def correct_ocr(
+        self,
+        correction_callback: Callable[[Any], Optional[str]],
+    ) -> "PageCollection[P]":
+        """
+        Applies corrections to OCR-generated text elements across all pages
+        in this collection using a user-provided callback function.
+        This method delegates to the parent PDF's `correct_ocr` method,
+        targeting all pages within this collection.
+        Args:
+            correction_callback: A function that accepts a single argument (an element
+                                 object) and returns `Optional[str]` (new text or None).
+        Returns:
+            A dictionary containing aggregate statistics for the process across all pages:
+            {'elements_checked': total_checked, 'corrections_applied': total_applied}
+        Raises:
+            RuntimeError: If the collection is empty, pages lack a parent PDF reference,
+                          or the parent PDF lacks the `correct_ocr` method.
+        """
+        if not self.pages:
+            logger.warning("Cannot correct OCR for an empty PageCollection.")
+        # Assume all pages share the same parent PDF object
+        parent_pdf = self.pages[0]._parent
+        page_indices = [p.index for p in self.pages]
+        logger.info(
+            f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices}."
+        )
+        # Delegate the call to the parent PDF object for the relevant pages
+        parent_pdf.correct_ocr(correction_callback=correction_callback, pages=page_indices)
+        return self
     def get_sections(
         self,

natural_pdf/elements/region.py CHANGED Viewed

@@ -11,6 +11,8 @@ from natural_pdf.elements.base import DirectionalMixin
 # Import new utils
 from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
+from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements  # Import utility
 if TYPE_CHECKING:
     from natural_pdf.core.page import Page
     from natural_pdf.elements.text import TextElement
@@ -1082,12 +1084,18 @@ class Region(DirectionalMixin):
         filtered_elements = [e for e in page_elements if self._is_element_in_region(e)]
         return ElementCollection(filtered_elements)
-    def apply_ocr(self, **ocr_params) -> List["TextElement"]:  # Return type hint updated
+    def apply_ocr(self, **ocr_params) -> "Region":
         """
         Apply OCR to this region and return the created text elements.
         Args:
-            **ocr_params: OCR parameters to override defaults (passed to OCRManager)
+            **ocr_params: Keyword arguments passed to the OCR Manager.
+                          Common parameters like `engine`, `languages`, `min_confidence`,
+                          `device`, and `resolution` (for image rendering) should be
+                          provided here. **The `languages` list must contain codes
+                          understood by the specific engine selected.** No mapping
+                          is performed. Engine-specific settings can be passed in
+                          an `options` object (e.g., `options=EasyOCROptions(...)`).
         Returns:
             List of created TextElement objects representing OCR words/lines.
@@ -1098,20 +1106,20 @@ class Region(DirectionalMixin):
             return []
         ocr_mgr = self.page._parent._ocr_manager
-        # Get OCR configuration from kwargs or PDF defaults if needed
-        # We'll mostly rely on passing ocr_params directly to the manager
-        # For rendering, use a reasonable default scale
-        ocr_image_scale = self.page._parent._config.get("ocr_image_scale", 2.0)
+        # Determine rendering resolution from parameters
+        final_resolution = ocr_params.get("resolution")
+        if final_resolution is None and hasattr(self.page, "_parent") and self.page._parent:
+            final_resolution = getattr(self.page._parent, "_config", {}).get("resolution", 150)
+        elif final_resolution is None:
+            final_resolution = 150
         logger.debug(
-            f"Region {self.bbox}: Applying OCR with scale {ocr_image_scale} and params: {ocr_params}"
+            f"Region {self.bbox}: Applying OCR with resolution {final_resolution} DPI and params: {ocr_params}"
         )
-        # Render the page region to an image
+        # Render the page region to an image using the determined resolution
         try:
-            # Crop the page image to this region's bbox
             region_image = self.to_image(
-                scale=ocr_image_scale, include_highlights=False, crop_only=True
+                resolution=final_resolution, include_highlights=False, crop_only=True
             )
             if not region_image:
                 logger.error("Failed to render region to image for OCR.")
@@ -1121,12 +1129,21 @@ class Region(DirectionalMixin):
             logger.error(f"Error rendering region to image for OCR: {e}", exc_info=True)
             return []
+        # Prepare args for the OCR Manager
+        manager_args = {
+            "images": region_image,
+            "engine": ocr_params.get("engine"),
+            "languages": ocr_params.get("languages"),
+            "min_confidence": ocr_params.get("min_confidence"),
+            "device": ocr_params.get("device"),
+            "options": ocr_params.get("options"),
+            "detect_only": ocr_params.get("detect_only"),
+        }
+        manager_args = {k: v for k, v in manager_args.items() if v is not None}
         # Run OCR on this region's image using the manager
         try:
-            # Pass the single image and any specific options/kwargs
-            # The manager handles engine selection based on ocr_params or defaults
-            results = ocr_mgr.apply_ocr(images=region_image, **ocr_params)
-            # apply_ocr returns List[Dict] for single image
+            results = ocr_mgr.apply_ocr(**manager_args)
             if not isinstance(results, list):
                 logger.error(
                     f"OCRManager returned unexpected type for single region image: {type(results)}"
@@ -1137,25 +1154,19 @@ class Region(DirectionalMixin):
             logger.error(f"Error during OCRManager processing for region: {e}", exc_info=True)
             return []
-        # Convert results to TextElements, scaling coordinates relative to the page
-        # Calculate scaling factors based on the region image vs the region PDF coords
+        # Convert results to TextElements
         scale_x = self.width / region_image.width if region_image.width > 0 else 1.0
         scale_y = self.height / region_image.height if region_image.height > 0 else 1.0
         logger.debug(f"Region OCR scaling factors (PDF/Img): x={scale_x:.2f}, y={scale_y:.2f}")
         created_elements = []
         for result in results:
             try:
                 img_x0, img_top, img_x1, img_bottom = map(float, result["bbox"])
                 pdf_height = (img_bottom - img_top) * scale_y
-                # Convert IMAGE coordinates (relative to region crop) to PAGE coordinates
                 page_x0 = self.x0 + (img_x0 * scale_x)
                 page_top = self.top + (img_top * scale_y)
                 page_x1 = self.x0 + (img_x1 * scale_x)
                 page_bottom = self.top + (img_bottom * scale_y)
-                # Create element data using PAGE coordinates
                 element_data = {
                     "text": result["text"],
                     "x0": page_x0,
@@ -1164,45 +1175,34 @@ class Region(DirectionalMixin):
                     "bottom": page_bottom,
                     "width": page_x1 - page_x0,
                     "height": page_bottom - page_top,
-                    "object_type": "word",  # Treat as word
+                    "object_type": "word",
                     "source": "ocr",
                     "confidence": float(result.get("confidence", 0.0)),
                     "fontname": "OCR",
-                    "size": round(pdf_height) if pdf_height > 0 else 10.0,  # Size based on height
+                    "size": round(pdf_height) if pdf_height > 0 else 10.0,
                     "page_number": self.page.number,
                     "bold": False,
                     "italic": False,
                     "upright": True,
                     "doctop": page_top + self.page._page.initial_doctop,
                 }
-                # Create the representative char dict
                 ocr_char_dict = element_data.copy()
                 ocr_char_dict["object_type"] = "char"
                 ocr_char_dict.setdefault("adv", ocr_char_dict.get("width", 0))
-                # Add char dicts to word data
                 element_data["_char_dicts"] = [ocr_char_dict]
-                # Create the TextElement word
-                from natural_pdf.elements.text import TextElement  # Local import ok here
+                from natural_pdf.elements.text import TextElement
                 elem = TextElement(element_data, self.page)
                 created_elements.append(elem)
-                # Add the element to the page's element manager
                 self.page._element_mgr.add_element(elem, element_type="words")
-                # Add the char dict to the manager's char list
                 self.page._element_mgr.add_element(ocr_char_dict, element_type="chars")
             except Exception as e:
                 logger.error(
                     f"Failed to convert region OCR result to element: {result}. Error: {e}",
                     exc_info=True,
                 )
         logger.info(f"Region {self.bbox}: Added {len(created_elements)} elements from OCR.")
-        return created_elements
+        return self
     def get_section_between(self, start_element=None, end_element=None, boundary_inclusion="both"):
         """
@@ -1689,3 +1689,43 @@ class Region(DirectionalMixin):
         type_info = f" type='{self.region_type}'" if self.region_type else ""
         source_info = f" source='{self.source}'" if self.source else ""
         return f"<Region{name_info}{type_info}{source_info} bbox={self.bbox}{poly_info}>"
+    def correct_ocr(
+        self,
+        correction_callback: Callable[[Any], Optional[str]],
+    ) -> "Region":  # Return self for chaining
+        """
+        Applies corrections to OCR-generated text elements within this region
+        using a user-provided callback function.
+        Finds text elements within this region whose 'source' attribute starts
+        with 'ocr' and calls the `correction_callback` for each, passing the
+        element itself.
+        The `correction_callback` should contain the logic to:
+        1. Determine if the element needs correction.
+        2. Perform the correction (e.g., call an LLM).
+        3. Return the new text (`str`) or `None`.
+        If the callback returns a string, the element's `.text` is updated.
+        Metadata updates (source, confidence, etc.) should happen within the callback.
+        Args:
+            correction_callback: A function accepting an element and returning
+                                 `Optional[str]` (new text or None).
+        Returns:
+            Self for method chaining.
+        """
+        # Find OCR elements specifically within this region
+        # Note: We typically want to correct even if the element falls in an excluded area
+        target_elements = self.find_all(selector="text[source^=ocr]", apply_exclusions=False)
+        # Delegate to the utility function
+        _apply_ocr_correction_to_elements(
+            elements=target_elements,  # Pass the ElementCollection directly
+            correction_callback=correction_callback,
+            caller_info=f"Region({self.bbox})",  # Pass caller info
+        )
+        return self  # Return self for chaining

natural_pdf/elements/text.py CHANGED Viewed

@@ -41,6 +41,11 @@ class TextElement(Element):
         """Get the text content."""
         return self._obj.get("text", "")
+    @text.setter
+    def text(self, value: str):
+        """Set the text content."""
+        self._obj["text"] = value
     @property
     def source(self) -> str:
         """Get the source of this text element (pdf or ocr)."""

natural_pdf/exporters/__init__.py CHANGED Viewed

@@ -0,0 +1,4 @@
+from .base import FinetuneExporter
+from .paddleocr import PaddleOCRRecognitionExporter
+__all__ = ["FinetuneExporter", "PaddleOCRRecognitionExporter"]

natural_pdf/exporters/base.py ADDED Viewed

@@ -0,0 +1,61 @@
+import abc
+import logging
+from typing import Union, List, TYPE_CHECKING
+if TYPE_CHECKING:
+    from natural_pdf.core.pdf import PDF
+    from natural_pdf.collections.pdf_collection import PDFCollection
+logger = logging.getLogger(__name__)
+class FinetuneExporter(abc.ABC):
+    """
+    Abstract base class for exporting data suitable for fine-tuning models.
+    """
+    @abc.abstractmethod
+    def __init__(self, **kwargs):
+        """
+        Initialize the exporter with format-specific options.
+        """
+        pass
+    @abc.abstractmethod
+    def export(self, source: Union["PDF", "PDFCollection", List["PDF"]], output_dir: str, **kwargs):
+        """
+        Exports the data from the source PDF(s) to the specified output directory
+        in a format suitable for fine-tuning a specific model type.
+        Args:
+            source: The PDF object, PDFCollection, or list of PDF objects to process.
+            output_dir: The path to the directory where the exported files will be saved.
+            **kwargs: Additional export-time arguments.
+        """
+        pass
+    def _resolve_source_pdfs(
+        self, source: Union["PDF", "PDFCollection", List["PDF"]]
+    ) -> List["PDF"]:
+        """
+        Helper to consistently resolve the input source to a list of PDF objects.
+        """
+        from natural_pdf.core.pdf import PDF  # Avoid circular import at module level
+        from natural_pdf.collections.pdf_collection import PDFCollection  # Avoid circular import
+        pdfs_to_process: List["PDF"] = []
+        if isinstance(source, PDF):
+            pdfs_to_process = [source]
+        elif isinstance(source, PDFCollection):
+            pdfs_to_process = source.pdfs
+        elif isinstance(source, list) and all(isinstance(p, PDF) for p in source):
+            pdfs_to_process = source
+        else:
+            raise TypeError(
+                f"Unsupported source type: {type(source)}. Must be PDF, PDFCollection, or List[PDF]."
+            )
+        if not pdfs_to_process:
+            logger.warning("No PDF documents provided in the source.")
+        return pdfs_to_process

natural-pdf 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

natural-pdf 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl