natural-pdf 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. docs/categorizing-documents/index.md +168 -0
  2. docs/data-extraction/index.md +87 -0
  3. docs/element-selection/index.ipynb +218 -164
  4. docs/element-selection/index.md +20 -0
  5. docs/finetuning/index.md +176 -0
  6. docs/index.md +19 -0
  7. docs/ocr/index.md +63 -16
  8. docs/tutorials/01-loading-and-extraction.ipynb +411 -248
  9. docs/tutorials/02-finding-elements.ipynb +123 -46
  10. docs/tutorials/03-extracting-blocks.ipynb +24 -19
  11. docs/tutorials/04-table-extraction.ipynb +17 -12
  12. docs/tutorials/05-excluding-content.ipynb +37 -32
  13. docs/tutorials/06-document-qa.ipynb +36 -31
  14. docs/tutorials/07-layout-analysis.ipynb +45 -40
  15. docs/tutorials/07-working-with-regions.ipynb +61 -60
  16. docs/tutorials/08-spatial-navigation.ipynb +76 -71
  17. docs/tutorials/09-section-extraction.ipynb +160 -155
  18. docs/tutorials/10-form-field-extraction.ipynb +71 -66
  19. docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
  20. docs/tutorials/12-ocr-integration.ipynb +3420 -312
  21. docs/tutorials/12-ocr-integration.md +68 -106
  22. docs/tutorials/13-semantic-search.ipynb +641 -251
  23. natural_pdf/__init__.py +3 -0
  24. natural_pdf/analyzers/layout/gemini.py +63 -47
  25. natural_pdf/classification/manager.py +343 -0
  26. natural_pdf/classification/mixin.py +149 -0
  27. natural_pdf/classification/results.py +62 -0
  28. natural_pdf/collections/mixins.py +63 -0
  29. natural_pdf/collections/pdf_collection.py +326 -17
  30. natural_pdf/core/element_manager.py +73 -4
  31. natural_pdf/core/page.py +255 -83
  32. natural_pdf/core/pdf.py +385 -367
  33. natural_pdf/elements/base.py +1 -3
  34. natural_pdf/elements/collections.py +279 -49
  35. natural_pdf/elements/region.py +106 -21
  36. natural_pdf/elements/text.py +5 -2
  37. natural_pdf/exporters/__init__.py +4 -0
  38. natural_pdf/exporters/base.py +61 -0
  39. natural_pdf/exporters/paddleocr.py +345 -0
  40. natural_pdf/extraction/manager.py +134 -0
  41. natural_pdf/extraction/mixin.py +246 -0
  42. natural_pdf/extraction/result.py +37 -0
  43. natural_pdf/ocr/__init__.py +16 -8
  44. natural_pdf/ocr/engine.py +46 -30
  45. natural_pdf/ocr/engine_easyocr.py +86 -42
  46. natural_pdf/ocr/engine_paddle.py +39 -28
  47. natural_pdf/ocr/engine_surya.py +32 -16
  48. natural_pdf/ocr/ocr_factory.py +34 -23
  49. natural_pdf/ocr/ocr_manager.py +98 -34
  50. natural_pdf/ocr/ocr_options.py +38 -10
  51. natural_pdf/ocr/utils.py +59 -33
  52. natural_pdf/qa/document_qa.py +0 -4
  53. natural_pdf/selectors/parser.py +363 -238
  54. natural_pdf/templates/finetune/fine_tune_paddleocr.md +420 -0
  55. natural_pdf/utils/debug.py +4 -2
  56. natural_pdf/utils/identifiers.py +9 -5
  57. natural_pdf/utils/locks.py +8 -0
  58. natural_pdf/utils/packaging.py +172 -105
  59. natural_pdf/utils/text_extraction.py +96 -65
  60. natural_pdf/utils/tqdm_utils.py +43 -0
  61. natural_pdf/utils/visualization.py +1 -1
  62. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +10 -3
  63. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +66 -51
  64. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
  65. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
  66. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
@@ -11,7 +11,15 @@ from natural_pdf.elements.base import DirectionalMixin
11
11
  # Import new utils
12
12
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
13
13
 
14
- from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
14
+ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
15
+
16
+ # --- Classification Imports --- #
17
+ from natural_pdf.classification.mixin import ClassificationMixin
18
+ from natural_pdf.classification.manager import ClassificationManager # Keep for type hint
19
+ # --- End Classification Imports --- #
20
+
21
+ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
22
+ from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
15
23
 
16
24
  if TYPE_CHECKING:
17
25
  from natural_pdf.core.page import Page
@@ -27,7 +35,7 @@ except ImportError:
27
35
  logger = logging.getLogger(__name__)
28
36
 
29
37
 
30
- class Region(DirectionalMixin):
38
+ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
31
39
  """
32
40
  Represents a rectangular region on a page.
33
41
  """
@@ -57,6 +65,12 @@ class Region(DirectionalMixin):
57
65
  self.start_element = None
58
66
  self.end_element = None
59
67
 
68
+ # --- ADDED --- Metadata store for mixins
69
+ self.metadata: Dict[str, Any] = {}
70
+ # --- NEW --- Central registry for analysis results
71
+ self.analyses: Dict[str, Any] = {}
72
+ # --- END ADDED ---
73
+
60
74
  # Standard attributes for all elements
61
75
  self.object_type = "region" # For selector compatibility
62
76
 
@@ -600,6 +614,18 @@ class Region(DirectionalMixin):
600
614
  x1 = int(self.x1 * scale_factor)
601
615
  bottom = int(self.bottom * scale_factor)
602
616
 
617
+ # Ensure coords are valid for cropping (left < right, top < bottom)
618
+ if x0 >= x1:
619
+ logger.warning(
620
+ f"Region {self.bbox} resulted in non-positive width after scaling ({x0} >= {x1}). Cannot create image."
621
+ )
622
+ return None
623
+ if top >= bottom:
624
+ logger.warning(
625
+ f"Region {self.bbox} resulted in non-positive height after scaling ({top} >= {bottom}). Cannot create image."
626
+ )
627
+ return None
628
+
603
629
  # Crop the image to just this region
604
630
  region_image = page_image.crop((x0, top, x1, bottom))
605
631
 
@@ -776,11 +802,6 @@ class Region(DirectionalMixin):
776
802
  debug = kwargs.get("debug", debug or kwargs.get("debug_exclusions", False))
777
803
  logger.debug(f"Region {self.bbox}: extract_text called with kwargs: {kwargs}")
778
804
 
779
- # --- Handle Docling source (priority) --- DEPRECATED or Adapt?
780
- # For now, let's bypass this and always use the standard extraction flow
781
- # based on contained elements to ensure consistency.
782
- # if self.model == 'docling' or hasattr(self, 'text_content'): ...
783
-
784
805
  # 1. Get Word Elements potentially within this region (initial broad phase)
785
806
  # Optimization: Could use spatial query if page elements were indexed
786
807
  page_words = self.page.words # Get all words from the page
@@ -829,7 +850,7 @@ class Region(DirectionalMixin):
829
850
  result = generate_text_layout(
830
851
  char_dicts=filtered_chars,
831
852
  layout_context_bbox=self.bbox, # Use region's bbox for context
832
- user_kwargs=kwargs,
853
+ user_kwargs=kwargs, # Pass original kwargs to layout generator
833
854
  )
834
855
 
835
856
  logger.debug(f"Region {self.bbox}: extract_text finished, result length: {len(result)}.")
@@ -1084,11 +1105,14 @@ class Region(DirectionalMixin):
1084
1105
  filtered_elements = [e for e in page_elements if self._is_element_in_region(e)]
1085
1106
  return ElementCollection(filtered_elements)
1086
1107
 
1087
- def apply_ocr(self, **ocr_params) -> "Region":
1108
+ def apply_ocr(self, replace=True, **ocr_params) -> "Region":
1088
1109
  """
1089
1110
  Apply OCR to this region and return the created text elements.
1090
1111
 
1091
1112
  Args:
1113
+ replace: If True (default), removes existing OCR elements in the region
1114
+ before adding new ones. If False, adds new OCR elements without
1115
+ removing existing ones.
1092
1116
  **ocr_params: Keyword arguments passed to the OCR Manager.
1093
1117
  Common parameters like `engine`, `languages`, `min_confidence`,
1094
1118
  `device`, and `resolution` (for image rendering) should be
@@ -1098,17 +1122,33 @@ class Region(DirectionalMixin):
1098
1122
  an `options` object (e.g., `options=EasyOCROptions(...)`).
1099
1123
 
1100
1124
  Returns:
1101
- List of created TextElement objects representing OCR words/lines.
1125
+ Self for method chaining.
1102
1126
  """
1103
1127
  # Ensure OCRManager is available
1104
1128
  if not hasattr(self.page._parent, "_ocr_manager") or self.page._parent._ocr_manager is None:
1105
1129
  logger.error("OCRManager not available on parent PDF. Cannot apply OCR to region.")
1106
- return []
1130
+ return self
1131
+
1132
+ # If replace is True, find and remove existing OCR elements in this region
1133
+ if replace:
1134
+ logger.info(f"Region {self.bbox}: Removing existing OCR elements before applying new OCR.")
1135
+ # Find all OCR elements in this region
1136
+ ocr_selector = "text[source=ocr]"
1137
+ ocr_elements = self.find_all(ocr_selector)
1138
+
1139
+ if ocr_elements:
1140
+ logger.info(f"Region {self.bbox}: Found {len(ocr_elements)} existing OCR elements to remove.")
1141
+ # Remove these elements from their page
1142
+ removed_count = ocr_elements.remove()
1143
+ logger.info(f"Region {self.bbox}: Removed {removed_count} OCR elements.")
1144
+ else:
1145
+ logger.info(f"Region {self.bbox}: No existing OCR elements found to remove.")
1146
+
1107
1147
  ocr_mgr = self.page._parent._ocr_manager
1108
1148
 
1109
1149
  # Determine rendering resolution from parameters
1110
1150
  final_resolution = ocr_params.get("resolution")
1111
- if final_resolution is None and hasattr(self.page, '_parent') and self.page._parent:
1151
+ if final_resolution is None and hasattr(self.page, "_parent") and self.page._parent:
1112
1152
  final_resolution = getattr(self.page._parent, "_config", {}).get("resolution", 150)
1113
1153
  elif final_resolution is None:
1114
1154
  final_resolution = 150
@@ -1123,11 +1163,11 @@ class Region(DirectionalMixin):
1123
1163
  )
1124
1164
  if not region_image:
1125
1165
  logger.error("Failed to render region to image for OCR.")
1126
- return []
1166
+ return self
1127
1167
  logger.debug(f"Region rendered to image size: {region_image.size}")
1128
1168
  except Exception as e:
1129
1169
  logger.error(f"Error rendering region to image for OCR: {e}", exc_info=True)
1130
- return []
1170
+ return self
1131
1171
 
1132
1172
  # Prepare args for the OCR Manager
1133
1173
  manager_args = {
@@ -1148,11 +1188,11 @@ class Region(DirectionalMixin):
1148
1188
  logger.error(
1149
1189
  f"OCRManager returned unexpected type for single region image: {type(results)}"
1150
1190
  )
1151
- return []
1191
+ return self
1152
1192
  logger.debug(f"Region OCR processing returned {len(results)} results.")
1153
1193
  except Exception as e:
1154
1194
  logger.error(f"Error during OCRManager processing for region: {e}", exc_info=True)
1155
- return []
1195
+ return self
1156
1196
 
1157
1197
  # Convert results to TextElements
1158
1198
  scale_x = self.width / region_image.width if region_image.width > 0 else 1.0
@@ -1191,6 +1231,7 @@ class Region(DirectionalMixin):
1191
1231
  ocr_char_dict.setdefault("adv", ocr_char_dict.get("width", 0))
1192
1232
  element_data["_char_dicts"] = [ocr_char_dict]
1193
1233
  from natural_pdf.elements.text import TextElement
1234
+
1194
1235
  elem = TextElement(element_data, self.page)
1195
1236
  created_elements.append(elem)
1196
1237
  self.page._element_mgr.add_element(elem, element_type="words")
@@ -1692,7 +1733,7 @@ class Region(DirectionalMixin):
1692
1733
  def correct_ocr(
1693
1734
  self,
1694
1735
  correction_callback: Callable[[Any], Optional[str]],
1695
- ) -> "Region": # Return self for chaining
1736
+ ) -> "Region": # Return self for chaining
1696
1737
  """
1697
1738
  Applies corrections to OCR-generated text elements within this region
1698
1739
  using a user-provided callback function.
@@ -1718,13 +1759,57 @@ class Region(DirectionalMixin):
1718
1759
  """
1719
1760
  # Find OCR elements specifically within this region
1720
1761
  # Note: We typically want to correct even if the element falls in an excluded area
1721
- target_elements = self.find_all(selector="text[source^=ocr]", apply_exclusions=False)
1762
+ target_elements = self.find_all(selector="text[source=ocr]", apply_exclusions=False)
1722
1763
 
1723
1764
  # Delegate to the utility function
1724
1765
  _apply_ocr_correction_to_elements(
1725
- elements=target_elements, # Pass the ElementCollection directly
1766
+ elements=target_elements, # Pass the ElementCollection directly
1726
1767
  correction_callback=correction_callback,
1727
- caller_info=f"Region({self.bbox})", # Pass caller info
1768
+ caller_info=f"Region({self.bbox})", # Pass caller info
1728
1769
  )
1729
1770
 
1730
- return self # Return self for chaining
1771
+ return self # Return self for chaining
1772
+
1773
+ # --- Classification Mixin Implementation --- #
1774
+ def _get_classification_manager(self) -> "ClassificationManager":
1775
+ if not hasattr(self, 'page') or not hasattr(self.page, 'pdf') or not hasattr(self.page.pdf, 'get_manager'):
1776
+ raise AttributeError("ClassificationManager cannot be accessed: Parent Page, PDF, or get_manager method missing.")
1777
+ try:
1778
+ # Use the PDF's manager registry accessor via page
1779
+ return self.page.pdf.get_manager('classification')
1780
+ except (ValueError, RuntimeError, AttributeError) as e:
1781
+ # Wrap potential errors from get_manager for clarity
1782
+ raise AttributeError(f"Failed to get ClassificationManager from PDF via Page: {e}") from e
1783
+
1784
+ def _get_classification_content(self, model_type: str, **kwargs) -> Union[str, "Image"]: # Use "Image" for lazy import
1785
+ if model_type == 'text':
1786
+ text_content = self.extract_text(layout=False) # Simple join for classification
1787
+ if not text_content or text_content.isspace():
1788
+ raise ValueError("Cannot classify region with 'text' model: No text content found.")
1789
+ return text_content
1790
+ elif model_type == 'vision':
1791
+ # Get resolution from manager/kwargs if possible, else default
1792
+ # We access manager via the method to ensure it's available
1793
+ manager = self._get_classification_manager()
1794
+ default_resolution = 150 # Manager doesn't store default res, set here
1795
+ # Note: classify() passes resolution via **kwargs if user specifies
1796
+ resolution = kwargs.get('resolution', default_resolution) if 'kwargs' in locals() else default_resolution
1797
+
1798
+ img = self.to_image(
1799
+ resolution=resolution,
1800
+ include_highlights=False, # No highlights for classification input
1801
+ crop_only=True # Just the region content
1802
+ )
1803
+ if img is None:
1804
+ raise ValueError("Cannot classify region with 'vision' model: Failed to render image.")
1805
+ return img
1806
+ else:
1807
+ raise ValueError(f"Unsupported model_type for classification: {model_type}")
1808
+
1809
+ def _get_metadata_storage(self) -> Dict[str, Any]:
1810
+ # Ensure metadata exists
1811
+ if not hasattr(self, 'metadata') or self.metadata is None:
1812
+ self.metadata = {}
1813
+ return self.metadata
1814
+
1815
+ # --- End Classification Mixin Implementation --- #
@@ -274,9 +274,12 @@ class TextElement(Element):
274
274
 
275
275
  return False
276
276
 
277
- def __repr__(self) -> str:
277
+ def __repr__(self) -> str:
278
278
  """String representation of the text element."""
279
- preview = self.text[:10] + "..." if len(self.text) > 10 else self.text
279
+ if self.text:
280
+ preview = self.text[:10] + "..." if len(self.text) > 10 else self.text
281
+ else:
282
+ preview = "..."
280
283
  font_style = []
281
284
  if self.bold:
282
285
  font_style.append("bold")
@@ -0,0 +1,4 @@
1
+ from .base import FinetuneExporter
2
+ from .paddleocr import PaddleOCRRecognitionExporter
3
+
4
+ __all__ = ["FinetuneExporter", "PaddleOCRRecognitionExporter"]
@@ -0,0 +1,61 @@
1
+ import abc
2
+ import logging
3
+ from typing import Union, List, TYPE_CHECKING
4
+
5
+ if TYPE_CHECKING:
6
+ from natural_pdf.core.pdf import PDF
7
+ from natural_pdf.collections.pdf_collection import PDFCollection
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ class FinetuneExporter(abc.ABC):
13
+ """
14
+ Abstract base class for exporting data suitable for fine-tuning models.
15
+ """
16
+
17
+ @abc.abstractmethod
18
+ def __init__(self, **kwargs):
19
+ """
20
+ Initialize the exporter with format-specific options.
21
+ """
22
+ pass
23
+
24
+ @abc.abstractmethod
25
+ def export(self, source: Union["PDF", "PDFCollection", List["PDF"]], output_dir: str, **kwargs):
26
+ """
27
+ Exports the data from the source PDF(s) to the specified output directory
28
+ in a format suitable for fine-tuning a specific model type.
29
+
30
+ Args:
31
+ source: The PDF object, PDFCollection, or list of PDF objects to process.
32
+ output_dir: The path to the directory where the exported files will be saved.
33
+ **kwargs: Additional export-time arguments.
34
+ """
35
+ pass
36
+
37
+ def _resolve_source_pdfs(
38
+ self, source: Union["PDF", "PDFCollection", List["PDF"]]
39
+ ) -> List["PDF"]:
40
+ """
41
+ Helper to consistently resolve the input source to a list of PDF objects.
42
+ """
43
+ from natural_pdf.core.pdf import PDF # Avoid circular import at module level
44
+ from natural_pdf.collections.pdf_collection import PDFCollection # Avoid circular import
45
+
46
+ pdfs_to_process: List["PDF"] = []
47
+ if isinstance(source, PDF):
48
+ pdfs_to_process = [source]
49
+ elif isinstance(source, PDFCollection):
50
+ pdfs_to_process = source.pdfs
51
+ elif isinstance(source, list) and all(isinstance(p, PDF) for p in source):
52
+ pdfs_to_process = source
53
+ else:
54
+ raise TypeError(
55
+ f"Unsupported source type: {type(source)}. Must be PDF, PDFCollection, or List[PDF]."
56
+ )
57
+
58
+ if not pdfs_to_process:
59
+ logger.warning("No PDF documents provided in the source.")
60
+
61
+ return pdfs_to_process
@@ -0,0 +1,345 @@
1
+ import os
2
+ import logging
3
+ import random
4
+ import shutil
5
+ from typing import Union, List, Optional, TYPE_CHECKING, Set, Tuple
6
+ from tqdm import tqdm
7
+
8
+ from natural_pdf.exporters.base import FinetuneExporter
9
+
10
+ # Need to import this utility
11
+ from natural_pdf.utils.identifiers import generate_short_path_hash
12
+
13
+ if TYPE_CHECKING:
14
+ from natural_pdf.core.pdf import PDF
15
+ from natural_pdf.collections.pdf_collection import PDFCollection
16
+ from natural_pdf.elements.text import TextElement
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ DEFAULT_SELECTOR_CORRECTED = "text[source^=manifest]" # Match manifest-import etc.
21
+
22
+
23
+ class PaddleOCRRecognitionExporter(FinetuneExporter):
24
+ """
25
+ Exports data for fine-tuning a PaddleOCR text recognition model.
26
+
27
+ Creates a directory structure with cropped text images and label files
28
+ (`train.txt`, `val.txt`, or `label.txt`) suitable for PaddleOCR training.
29
+ Optionally includes a Jupyter Notebook guide for fine-tuning on Colab.
30
+ """
31
+
32
+ def __init__(
33
+ self,
34
+ resolution: int = 150,
35
+ padding: int = 2,
36
+ selector: Optional[str] = None,
37
+ corrected_only: bool = False,
38
+ split_ratio: Optional[float] = 0.9,
39
+ include_guide: bool = True,
40
+ random_seed: Optional[int] = 42,
41
+ ):
42
+ """
43
+ Initialize the PaddleOCR Recognition Exporter.
44
+
45
+ Args:
46
+ resolution: DPI resolution for rendering text region images (default: 150).
47
+ padding: Padding (in points) to add around text element bbox before cropping (default: 2).
48
+ selector: CSS-like selector to filter which TextElements to export.
49
+ If None and corrected_only is False, all 'text' elements are considered.
50
+ corrected_only: If True, overrides selector and exports only elements likely
51
+ originating from a correction manifest (selector="text[source=manifest]").
52
+ (default: False).
53
+ split_ratio: Ratio for splitting data into training/validation sets (e.g., 0.9 for 90% train).
54
+ If None, creates a single `label.txt` file (default: 0.9).
55
+ include_guide: If True, includes a template Jupyter Notebook guide for fine-tuning
56
+ in the output directory (default: True).
57
+ random_seed: Seed for the random number generator used for train/val split shuffling,
58
+ ensuring reproducibility (default: 42).
59
+ """
60
+ if corrected_only and selector:
61
+ logger.warning(
62
+ f"Both 'corrected_only=True' and 'selector=\"{selector}\"' were provided. "
63
+ f"Using corrected_only=True (selector='{DEFAULT_SELECTOR_CORRECTED}')."
64
+ )
65
+ self.selector = DEFAULT_SELECTOR_CORRECTED
66
+ elif corrected_only:
67
+ self.selector = DEFAULT_SELECTOR_CORRECTED
68
+ elif selector:
69
+ self.selector = selector
70
+ else:
71
+ self.selector = "text" # Default to all text elements if nothing else specified
72
+
73
+ self.resolution = resolution
74
+ self.padding = padding
75
+ self.split_ratio = split_ratio
76
+ self.include_guide = include_guide
77
+ self.random_seed = random_seed
78
+
79
+ logger.info(
80
+ f"Initialized PaddleOCRRecognitionExporter: selector='{self.selector}', resolution={resolution}, "
81
+ f"padding={padding}, split_ratio={split_ratio}, include_guide={include_guide}"
82
+ )
83
+
84
+ def export(
85
+ self,
86
+ source: Union["PDF", "PDFCollection", List["PDF"]],
87
+ output_dir: str,
88
+ **kwargs, # Allow for potential future args
89
+ ):
90
+ """
91
+ Exports text elements from the source PDF(s) to the specified output directory
92
+ in PaddleOCR text recognition format.
93
+
94
+ Args:
95
+ source: The PDF object, PDFCollection, or list of PDF objects to process.
96
+ output_dir: The path to the directory where the exported files will be saved.
97
+ The directory will be created if it doesn't exist.
98
+ **kwargs: Optional keyword arguments (currently unused).
99
+ """
100
+ # --- 1. Setup and Validation ---
101
+ pdfs_to_process = self._resolve_source_pdfs(source)
102
+ if not pdfs_to_process:
103
+ logger.error("No valid PDF sources found. Aborting export.")
104
+ return
105
+
106
+ try:
107
+ os.makedirs(output_dir, exist_ok=True)
108
+ images_dir = os.path.join(output_dir, "images")
109
+ os.makedirs(images_dir, exist_ok=True)
110
+ except OSError as e:
111
+ logger.error(f"Failed to create output directory '{output_dir}': {e}", exc_info=True)
112
+ raise
113
+
114
+ # --- 2. Collect Elements and Render Images ---
115
+ labels: List[Tuple[str, str]] = [] # List of (relative_image_path, text_label)
116
+ char_set: Set[str] = set()
117
+ elements_processed = 0
118
+ elements_skipped = 0
119
+
120
+ logger.info(
121
+ f"Processing {len(pdfs_to_process)} PDF(s) to find elements matching selector: '{self.selector}'"
122
+ )
123
+
124
+ for pdf in tqdm(pdfs_to_process, desc="Processing PDFs"):
125
+ # Need to ensure pdf.path exists and is string
126
+ if not hasattr(pdf, "path") or not isinstance(pdf.path, str):
127
+ logger.warning(f"Skipping PDF object without a valid path attribute: {pdf}")
128
+ continue
129
+ pdf_hash = generate_short_path_hash(pdf.path)
130
+ try:
131
+ # Find elements using the specified selector
132
+ # Need to check if pdf has find_all method
133
+ if not hasattr(pdf, "find_all"):
134
+ logger.warning(
135
+ f"PDF object {pdf.path} does not have find_all method. Skipping."
136
+ )
137
+ continue
138
+
139
+ elements = pdf.find_all(
140
+ self.selector, apply_exclusions=False
141
+ ) # Usually want all text, even if excluded
142
+ if not elements:
143
+ logger.debug(f"No elements matching '{self.selector}' found in {pdf.path}")
144
+ continue
145
+
146
+ for i, element in enumerate(
147
+ tqdm(
148
+ elements,
149
+ desc=f"Exporting '{os.path.basename(pdf.path)}'",
150
+ leave=False,
151
+ position=1,
152
+ )
153
+ ):
154
+ # Ensure it's a TextElement with necessary methods/attributes
155
+ # Removed check for to_image as it's called after expand()
156
+ if not (
157
+ hasattr(element, "page")
158
+ and hasattr(element, "text")
159
+ and hasattr(element, "expand")
160
+ ):
161
+ logger.warning(f"Skipping invalid/non-text element {i} in {pdf.path}")
162
+ elements_skipped += 1
163
+ continue
164
+
165
+ element_text = element.text
166
+ # Skip elements with no text, non-string text, or newlines
167
+ if (
168
+ not element_text
169
+ or not isinstance(element_text, str)
170
+ or "\n" in element_text
171
+ ):
172
+ if "\n" in str(element_text):
173
+ reason = "contains newline"
174
+ elif not element_text:
175
+ reason = "empty text"
176
+ else:
177
+ reason = "invalid text type"
178
+ logger.debug(
179
+ f"Skipping element {i} in {pdf.path} page {getattr(element.page, 'number', 'N/A')} because {reason}."
180
+ )
181
+ elements_skipped += 1
182
+ continue
183
+
184
+ # Use page index if available, otherwise fallback or skip? Fallback to 0 for now.
185
+ page_index = getattr(element.page, "index", 0)
186
+ image_filename = f"{pdf_hash}_p{page_index}_e{i}.png"
187
+ relative_image_path = os.path.join("images", image_filename)
188
+ absolute_image_path = os.path.join(output_dir, relative_image_path)
189
+
190
+ try:
191
+ # Expand region, render, and save image
192
+ region = element.expand(self.padding)
193
+ img = region.to_image(
194
+ resolution=self.resolution, crop_only=True, include_highlights=False
195
+ )
196
+ img.save(absolute_image_path, "PNG")
197
+
198
+ # Add to labels and character set
199
+ labels.append(
200
+ (relative_image_path.replace(os.path.sep, "/"), element_text)
201
+ ) # Use forward slashes for labels
202
+ char_set.update(element_text)
203
+ elements_processed += 1
204
+
205
+ except Exception as e:
206
+ page_num_str = getattr(
207
+ element.page, "number", "N/A"
208
+ ) # Get page number safely
209
+ logger.error(
210
+ f"Failed to process/save image for element {i} in {pdf.path} page {page_num_str}: {e}",
211
+ exc_info=False, # Keep log cleaner
212
+ )
213
+ elements_skipped += 1
214
+
215
+ except Exception as e:
216
+ logger.error(f"Failed to process PDF {pdf.path}: {e}", exc_info=True)
217
+ # Continue with other PDFs if possible
218
+
219
+ if elements_processed == 0:
220
+ logger.error(
221
+ f"No text elements were successfully processed and exported matching '{self.selector}'. Aborting."
222
+ )
223
+ # Clean up potentially created directories? Or leave them empty? Let's leave them.
224
+ return
225
+
226
+ logger.info(f"Processed {elements_processed} text elements, skipped {elements_skipped}.")
227
+
228
+ # --- 3. Generate Dictionary File (`dict.txt`) ---
229
+ dict_path = os.path.join(output_dir, "dict.txt")
230
+ try:
231
+ # Log the character set before sorting/writing
232
+ logger.debug(f"Exporter final char_set before sorting: {repr(char_set)}")
233
+ # PaddleOCR typically doesn't require special tokens like <UNK> or <BLK> in the dict
234
+ # for recognition models, but this might depend on the specific base model.
235
+ # Start with just the characters found.
236
+ sorted_chars = sorted(list(char_set), reverse=True)
237
+ with open(dict_path, "w", encoding="utf-8") as f_dict:
238
+ for char in sorted_chars:
239
+ # Ensure we don't write empty strings or just newlines as dictionary entries
240
+ if char and char != "\n":
241
+ f_dict.write(char + "\n")
242
+ logger.info(f"Created dictionary file with {len(sorted_chars)} characters: {dict_path}")
243
+ except Exception as e:
244
+ logger.error(f"Failed to write dictionary file '{dict_path}': {e}", exc_info=True)
245
+ raise # Re-raise as this is critical
246
+
247
+ # --- 4. Generate Label Files (`train.txt`, `val.txt` or `label.txt`) ---
248
+ if self.split_ratio is not None and 0 < self.split_ratio < 1:
249
+ if self.random_seed is not None:
250
+ random.seed(self.random_seed)
251
+ random.shuffle(labels)
252
+ split_index = int(len(labels) * self.split_ratio)
253
+ train_labels = labels[:split_index]
254
+ val_labels = labels[split_index:]
255
+
256
+ try:
257
+ train_path = os.path.join(output_dir, "train.txt")
258
+ with open(train_path, "w", encoding="utf-8") as f_train:
259
+ for img_path, text in train_labels:
260
+ f_train.write(f"{img_path}\t{text}\n") # Use literal tabs and newlines
261
+ logger.info(
262
+ f"Created training label file with {len(train_labels)} entries: {train_path}"
263
+ )
264
+
265
+ val_path = os.path.join(output_dir, "val.txt")
266
+ with open(val_path, "w", encoding="utf-8") as f_val:
267
+ for img_path, text in val_labels:
268
+ f_val.write(f"{img_path}\t{text}\n") # Use literal tabs and newlines
269
+ logger.info(
270
+ f"Created validation label file with {len(val_labels)} entries: {val_path}"
271
+ )
272
+ except Exception as e:
273
+ logger.error(f"Failed to write train/validation label files: {e}", exc_info=True)
274
+ raise
275
+ else:
276
+ # Create a single label file
277
+ label_path = os.path.join(output_dir, "label.txt")
278
+ try:
279
+ with open(label_path, "w", encoding="utf-8") as f_label:
280
+ for img_path, text in labels:
281
+ f_label.write(f"{img_path}\t{text}\n") # Use literal tabs and newlines
282
+ logger.info(f"Created single label file with {len(labels)} entries: {label_path}")
283
+ except Exception as e:
284
+ logger.error(f"Failed to write label file '{label_path}': {e}", exc_info=True)
285
+ raise
286
+
287
+ # --- 5. Include Guide Notebook ---
288
+ if self.include_guide:
289
+ self._copy_guide_notebook(output_dir)
290
+
291
+ logger.info(f"PaddleOCR recognition data export completed successfully to '{output_dir}'.")
292
+
293
+ def _copy_guide_notebook(self, output_dir: str):
294
+ """Locates, converts (md->ipynb), and copies the guide notebook."""
295
+ try:
296
+ # Try importing conversion library
297
+ import jupytext
298
+ from nbformat import write as write_notebook
299
+ except ImportError:
300
+ logger.warning(
301
+ "Could not import 'jupytext' or 'nbformat'. Skipping guide notebook generation. "
302
+ "Install with 'pip install natural-pdf[dev]' or 'pip install jupytext nbformat'."
303
+ )
304
+ return
305
+
306
+ try:
307
+ # Locate the template .md file relative to this script
308
+ exporter_dir = os.path.dirname(os.path.abspath(__file__))
309
+ # Go up two levels (exporters -> natural_pdf) then down to templates/finetune
310
+ template_dir = os.path.abspath(
311
+ os.path.join(exporter_dir, "..", "templates", "finetune")
312
+ )
313
+ template_md_path = os.path.join(template_dir, "fine_tune_paddleocr.md")
314
+ output_ipynb_path = os.path.join(output_dir, "fine_tune_paddleocr.ipynb")
315
+
316
+ if not os.path.exists(template_md_path):
317
+ logger.error(
318
+ f"Guide template not found at expected location: {template_md_path}. Trying alternate path."
319
+ )
320
+ # Try path relative to workspace root as fallback if run from project root
321
+ alt_template_path = os.path.abspath(
322
+ os.path.join("natural_pdf", "templates", "finetune", "fine_tune_paddleocr.md")
323
+ )
324
+ if os.path.exists(alt_template_path):
325
+ template_md_path = alt_template_path
326
+ logger.info(f"Found guide template at alternate path: {template_md_path}")
327
+ else:
328
+ logger.error(
329
+ f"Guide template also not found at: {alt_template_path}. Cannot copy guide."
330
+ )
331
+ return
332
+
333
+ # Convert Markdown to Notebook object using jupytext
334
+ logger.debug(f"Reading guide template from: {template_md_path}")
335
+ notebook = jupytext.read(template_md_path) # Reads md and returns NotebookNode
336
+
337
+ # Write the Notebook object to the output .ipynb file
338
+ logger.debug(f"Writing guide notebook to: {output_ipynb_path}")
339
+ with open(output_ipynb_path, "w", encoding="utf-8") as f_nb:
340
+ write_notebook(notebook, f_nb)
341
+
342
+ logger.info(f"Copied and converted fine-tuning guide notebook to: {output_ipynb_path}")
343
+
344
+ except Exception as e:
345
+ logger.error(f"Failed to copy/convert guide notebook: {e}", exc_info=True)