PyPI - natural-pdf - Versions diffs - 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl - Mend

natural-pdf 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

docs/finetuning/index.md +176 -0
docs/tutorials/01-loading-and-extraction.ipynb +34 -1550
natural_pdf/__init__.py +1 -0
natural_pdf/analyzers/layout/gemini.py +63 -47
natural_pdf/collections/pdf_collection.py +5 -2
natural_pdf/core/element_manager.py +6 -4
natural_pdf/core/page.py +36 -27
natural_pdf/core/pdf.py +25 -16
natural_pdf/elements/base.py +1 -3
natural_pdf/elements/collections.py +13 -14
natural_pdf/elements/region.py +7 -6
natural_pdf/exporters/__init__.py +4 -0
natural_pdf/exporters/base.py +61 -0
natural_pdf/exporters/paddleocr.py +345 -0
natural_pdf/ocr/__init__.py +16 -8
natural_pdf/ocr/engine.py +46 -30
natural_pdf/ocr/engine_easyocr.py +81 -40
natural_pdf/ocr/engine_paddle.py +39 -28
natural_pdf/ocr/engine_surya.py +32 -16
natural_pdf/ocr/ocr_factory.py +34 -23
natural_pdf/ocr/ocr_manager.py +15 -11
natural_pdf/ocr/ocr_options.py +5 -0
natural_pdf/ocr/utils.py +46 -31
natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
natural_pdf/utils/debug.py +4 -2
natural_pdf/utils/identifiers.py +9 -5
natural_pdf/utils/packaging.py +172 -105
natural_pdf/utils/text_extraction.py +44 -64
natural_pdf/utils/visualization.py +1 -1
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +5 -3
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +34 -30
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -0

natural_pdf/ocr/ocr_factory.py CHANGED Viewed

@@ -13,14 +13,14 @@ class OCRFactory:
     @staticmethod
     def create_engine(engine_type: str, **kwargs) -> OCREngine:
         """Create and return an OCR engine instance.
         Args:
             engine_type: One of 'surya', 'easyocr', 'paddle'
             **kwargs: Arguments to pass to the engine constructor
         Returns:
             An initialized OCR engine
         Raises:
             ImportError: If the required dependencies aren't installed
             ValueError: If the engine_type is unknown
@@ -28,72 +28,83 @@ class OCRFactory:
         if engine_type == "surya":
             try:
                 from .engine_surya import SuryaOCREngine
                 return SuryaOCREngine(**kwargs)
             except ImportError:
-                raise ImportError("Surya engine requires the 'surya' package. "
-                                 "Install with: pip install surya")
+                raise ImportError(
+                    "Surya engine requires the 'surya' package. " "Install with: pip install surya"
+                )
         elif engine_type == "easyocr":
             try:
                 from .engine_easyocr import EasyOCREngine
                 return EasyOCREngine(**kwargs)
             except ImportError:
-                raise ImportError("EasyOCR engine requires the 'easyocr' package. "
-                                 "Install with: pip install easyocr")
+                raise ImportError(
+                    "EasyOCR engine requires the 'easyocr' package. "
+                    "Install with: pip install easyocr"
+                )
         elif engine_type == "paddle":
             try:
                 from .engine_paddle import PaddleOCREngine
                 return PaddleOCREngine(**kwargs)
             except ImportError:
-                raise ImportError("PaddleOCR engine requires 'paddleocr' and 'paddlepaddle'. "
-                                 "Install with: pip install paddleocr paddlepaddle")
+                raise ImportError(
+                    "PaddleOCR engine requires 'paddleocr' and 'paddlepaddle'. "
+                    "Install with: pip install paddleocr paddlepaddle"
+                )
         else:
             raise ValueError(f"Unknown engine type: {engine_type}")
     @staticmethod
     def list_available_engines() -> Dict[str, bool]:
         """Returns a dictionary of engine names and their availability status."""
         engines = {}
         # Check Surya
         try:
             engines["surya"] = importlib.util.find_spec("surya") is not None
         except ImportError:
             engines["surya"] = False
         # Check EasyOCR
         try:
             engines["easyocr"] = importlib.util.find_spec("easyocr") is not None
         except ImportError:
             engines["easyocr"] = False
         # Check PaddleOCR
         try:
-            paddle = importlib.util.find_spec("paddle") is not None or importlib.util.find_spec("paddlepaddle") is not None
+            paddle = (
+                importlib.util.find_spec("paddle") is not None
+                or importlib.util.find_spec("paddlepaddle") is not None
+            )
             paddleocr = importlib.util.find_spec("paddleocr") is not None
             engines["paddle"] = paddle and paddleocr
         except ImportError:
             engines["paddle"] = False
         return engines
     @staticmethod
     def get_recommended_engine(**kwargs) -> OCREngine:
         """Returns the best available OCR engine based on what's installed.
         First tries engines in order of preference: EasyOCR, Paddle, Surya.
         If none are available, raises ImportError with installation instructions.
         Args:
             **kwargs: Arguments to pass to the engine constructor
         Returns:
             The best available OCR engine instance
         Raises:
             ImportError: If no engines are available
         """
         available = OCRFactory.list_available_engines()
         # Try engines in order of recommendation
         if available.get("easyocr", False):
             logger.info("Using EasyOCR engine (recommended)")
@@ -104,11 +115,11 @@ class OCRFactory:
         elif available.get("surya", False):
             logger.info("Using Surya OCR engine")
             return OCRFactory.create_engine("surya", **kwargs)
         # If we get here, no engines are available
         raise ImportError(
             "No OCR engines available. Please install at least one of: \n"
             "- EasyOCR (recommended): pip install easyocr\n"
             "- PaddleOCR: pip install paddleocr paddlepaddle\n"
             "- Surya OCR: pip install surya"
-        )
+        )

natural_pdf/ocr/ocr_manager.py CHANGED Viewed

@@ -65,7 +65,7 @@ class OCRManager:
         device: Optional[str] = None,
         detect_only: bool = False,
         # --- Engine-Specific Options ---
-        options: Optional[Any] = None, # e.g. EasyOCROptions(), PaddleOCROptions()
+        options: Optional[Any] = None,  # e.g. EasyOCROptions(), PaddleOCROptions()
     ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
         """
         Applies OCR to a single image or a batch of images.
@@ -100,7 +100,7 @@ class OCRManager:
         if not is_batch and not isinstance(images, Image.Image):
             raise TypeError("Input 'images' must be a PIL Image or a list of PIL Images.")
-        # --- Determine Engine ---
+        # --- Determine Engine ---
         selected_engine_name = (engine or "easyocr").lower()
         if selected_engine_name not in self.ENGINE_REGISTRY:
             raise ValueError(
@@ -108,35 +108,39 @@ class OCRManager:
             )
         logger.debug(f"Selected engine: '{selected_engine_name}'")
-        # --- Prepare Options ---
+        # --- Prepare Options ---
         final_options = copy.deepcopy(options) if options is not None else None
         # Type check options object if provided
         if final_options is not None:
-            options_class = self.ENGINE_REGISTRY[selected_engine_name].get("options_class", BaseOCROptions)
+            options_class = self.ENGINE_REGISTRY[selected_engine_name].get(
+                "options_class", BaseOCROptions
+            )
             if not isinstance(final_options, options_class):
-                 # Allow dicts to be passed directly too, assuming engine handles them
+                # Allow dicts to be passed directly too, assuming engine handles them
                 if not isinstance(final_options, dict):
-                     raise TypeError(
+                    raise TypeError(
                         f"Provided options type '{type(final_options).__name__}' is not compatible with engine '{selected_engine_name}'. Expected '{options_class.__name__}' or dict."
                     )
-        # --- Get Engine Instance and Process ---
+        # --- Get Engine Instance and Process ---
         try:
             engine_instance = self._get_engine_instance(selected_engine_name)
             processing_mode = "batch" if is_batch else "single image"
             logger.info(f"Processing {processing_mode} with engine '{selected_engine_name}'...")
-            logger.debug(f"  Engine Args: languages={languages}, min_confidence={min_confidence}, device={device}, options={final_options}")
+            logger.debug(
+                f"  Engine Args: languages={languages}, min_confidence={min_confidence}, device={device}, options={final_options}"
+            )
             # Call the engine's process_image, passing common args and options object
             # **ASSUMPTION**: Engine process_image signatures are updated to accept these common args.
             results = engine_instance.process_image(
-                images=images,
+                images=images,
                 languages=languages,
                 min_confidence=min_confidence,
                 device=device,
                 detect_only=detect_only,
-                options=final_options
+                options=final_options,
             )
             # Log result summary based on mode

natural_pdf/ocr/ocr_options.py CHANGED Viewed

@@ -13,6 +13,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 @dataclass
 class BaseOCROptions:
     """Base class for OCR engine options."""
     extra_args: Dict[str, Any] = field(default_factory=dict)
@@ -20,6 +21,7 @@ class BaseOCROptions:
 @dataclass
 class EasyOCROptions(BaseOCROptions):
     """Specific options for the EasyOCR engine."""
     model_storage_directory: Optional[str] = None
     user_network_directory: Optional[str] = None
     recog_network: str = "english_g2"
@@ -64,6 +66,7 @@ class EasyOCROptions(BaseOCROptions):
 @dataclass
 class PaddleOCROptions(BaseOCROptions):
     """Specific options for the PaddleOCR engine."""
     use_angle_cls: bool = True
     use_gpu: Optional[bool] = None
     gpu_mem: int = 500
@@ -90,6 +93,7 @@ class PaddleOCROptions(BaseOCROptions):
     def __post_init__(self):
         pass
     #     if self.use_gpu is None:
     #         if self.device and "cuda" in self.device.lower():
     #             self.use_gpu = True
@@ -102,6 +106,7 @@ class PaddleOCROptions(BaseOCROptions):
 @dataclass
 class SuryaOCROptions(BaseOCROptions):
     """Specific options for the Surya OCR engine."""
     # Currently, Surya example shows languages passed at prediction time.
     pass

natural_pdf/ocr/utils.py CHANGED Viewed

@@ -10,51 +10,71 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)
 def _apply_ocr_correction_to_elements(
     elements: Iterable["Element"],
     correction_callback: Callable[[Any], Optional[str]],
+    caller_info: str = "Utility",
 ) -> None:
     """
-    Applies correction callback to a list of elements in place,
+    Applies OCR correction callback to a list of elements in place,
     showing a progress bar.
-    Iterates through elements, calls the callback, and updates
-    element.text if a new string is returned.
+    Iterates through elements, checks if source starts with 'ocr', calls
+    the callback, and updates element.text if a new string is returned.
     Args:
         elements: An iterable of Element objects.
         correction_callback: A function accepting an element and returning
                              Optional[str] (new text or None).
+        caller_info: String identifying the calling context for logs.
     """
+    if not callable(correction_callback):
+        # Raise error here so individual methods don't need to repeat the check
+        raise TypeError("`correction_callback` must be a callable function.")
+    if not elements:
+        logger.warning(f"{caller_info}: No elements provided for correction.")
+        return
     corrections_applied = 0
     elements_checked = 0
     # Prepare the iterable with tqdm
-    element_iterable = tqdm(elements, desc=f"Correcting OCR", unit="element")
+    element_iterable = tqdm(elements, desc=f"Correcting OCR ({caller_info})", unit="element")
     for element in element_iterable:
         # Check if the element is likely from OCR and has text attribute
-        element_source = getattr(element, 'source', None)
-        if isinstance(element_source, str) and element_source.startswith('ocr') and hasattr(element, 'text'):
+        element_source = getattr(element, "source", None)
+        if (
+            isinstance(element_source, str)
+            and element_source.startswith("ocr")
+            and hasattr(element, "text")
+        ):
             elements_checked += 1
-            current_text = getattr(element, 'text')
+            current_text = getattr(element, "text")  # Already checked hasattr
             new_text = correction_callback(element)
             if new_text is not None:
                 if new_text != current_text:
-                    element.text = new_text
+                    element.text = new_text  # Update in place
                     corrections_applied += 1
-    logger.info(f"OCR correction finished. Checked: {elements_checked}, Applied: {corrections_applied}")
+    logger.info(
+        f"{caller_info}: OCR correction finished. Checked: {elements_checked}, Applied: {corrections_applied}"
+    )
+    # No return value needed, modifies elements in place
-def direct_ocr_llm(element,
-                   client,
-                   model="",
-                   resolution=150,
-                   prompt="OCR this image. Return only the exact text from the image. Include misspellings, punctuation, etc.",
-                   padding=2) -> str:
+def direct_ocr_llm(
+    element,
+    client,
+    model="",
+    resolution=150,
+    prompt="OCR this image. Return only the exact text from the image. Include misspellings, punctuation, etc.",
+    padding=2,
+) -> str:
     """Convenience method to directly OCR a region of the page."""
     if isinstance(element, TextElement):
@@ -65,34 +85,29 @@ def direct_ocr_llm(element,
     buffered = io.BytesIO()
     region_img = region.to_image(resolution=resolution, include_highlights=False)
     region_img.save(buffered, format="PNG")
-    base64_image = base64.b64encode(buffered.getvalue()).decode('utf-8')
+    base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
     response = client.chat.completions.create(
         model=model,
         messages=[
             {
                 "role": "system",
-                "content": "You are an expert OCR engineer. You will be given an image of a region of a page. You will return the exact text from the image."
+                "content": "You are an expert OCR engineer. You will be given an image of a region of a page. You will return the exact text from the image.",
             },
             {
                 "role": "user",
                 "content": [
-                    {
-                        "type": "text",
-                        "text": prompt
-                    },
+                    {"type": "text", "text": prompt},
                     {
                         "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image/png;base64,{base64_image}"
-                        }
-                    }
-                ]
-            }
-        ]
+                        "image_url": {"url": f"data:image/png;base64,{base64_image}"},
+                    },
+                ],
+            },
+        ],
     )
     corrected = response.choices[0].message.content
     logger.debug(f"Corrected {region.extract_text()} to {corrected}")
-    return corrected
+    return corrected

natural-pdf 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

natural-pdf 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl