PyPI - natural-pdf - Versions diffs - 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl - Mend

natural-pdf 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

docs/finetuning/index.md +176 -0
docs/tutorials/01-loading-and-extraction.ipynb +34 -1550
natural_pdf/__init__.py +1 -0
natural_pdf/analyzers/layout/gemini.py +63 -47
natural_pdf/collections/pdf_collection.py +5 -2
natural_pdf/core/element_manager.py +6 -4
natural_pdf/core/page.py +36 -27
natural_pdf/core/pdf.py +25 -16
natural_pdf/elements/base.py +1 -3
natural_pdf/elements/collections.py +13 -14
natural_pdf/elements/region.py +7 -6
natural_pdf/exporters/__init__.py +4 -0
natural_pdf/exporters/base.py +61 -0
natural_pdf/exporters/paddleocr.py +345 -0
natural_pdf/ocr/__init__.py +16 -8
natural_pdf/ocr/engine.py +46 -30
natural_pdf/ocr/engine_easyocr.py +81 -40
natural_pdf/ocr/engine_paddle.py +39 -28
natural_pdf/ocr/engine_surya.py +32 -16
natural_pdf/ocr/ocr_factory.py +34 -23
natural_pdf/ocr/ocr_manager.py +15 -11
natural_pdf/ocr/ocr_options.py +5 -0
natural_pdf/ocr/utils.py +46 -31
natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
natural_pdf/utils/debug.py +4 -2
natural_pdf/utils/identifiers.py +9 -5
natural_pdf/utils/packaging.py +172 -105
natural_pdf/utils/text_extraction.py +44 -64
natural_pdf/utils/visualization.py +1 -1
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +5 -3
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +34 -30
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -0

natural_pdf/__init__.py CHANGED Viewed

@@ -33,6 +33,7 @@ def configure_logging(level=logging.INFO, handler=None):
     logger.propagate = False
 from natural_pdf.core.page import Page
 from natural_pdf.core.pdf import PDF
 from natural_pdf.elements.collections import ElementCollection

natural_pdf/analyzers/layout/gemini.py CHANGED Viewed

@@ -13,6 +13,7 @@ from PIL import Image
 try:
     from openai import OpenAI
     from openai.types.chat import ChatCompletion
     # Import OpenAIError for exception handling if needed
 except ImportError:
     OpenAI = None
@@ -32,7 +33,7 @@ except ImportError:
     class LayoutDetector:
         def __init__(self):
             self.logger = logging.getLogger()
-            self.supported_classes = set() # Will be dynamic based on user request
+            self.supported_classes = set()  # Will be dynamic based on user request
         def _get_model(self, options):
             raise NotImplementedError
@@ -41,17 +42,20 @@ except ImportError:
             return n.lower().replace("_", "-").replace(" ", "-")
         def validate_classes(self, c):
-            pass # Less strict validation needed for LLM
+            pass  # Less strict validation needed for LLM
     logging.basicConfig()
 logger = logging.getLogger(__name__)
 # Define Pydantic model for the expected output structure
 # This is used by the openai library's `response_format`
 class DetectedRegion(BaseModel):
     label: str = Field(description="The identified class name.")
-    bbox: List[float] = Field(description="Bounding box coordinates [xmin, ymin, xmax, ymax].", min_items=4, max_items=4)
+    bbox: List[float] = Field(
+        description="Bounding box coordinates [xmin, ymin, xmax, ymax].", min_items=4, max_items=4
+    )
     confidence: float = Field(description="Confidence score [0.0, 1.0].", ge=0.0, le=1.0)
@@ -63,23 +67,27 @@ class GeminiLayoutDetector(LayoutDetector):
     def __init__(self):
         super().__init__()
-        self.supported_classes = set() # Indicate dynamic nature
+        self.supported_classes = set()  # Indicate dynamic nature
     def is_available(self) -> bool:
         """Check if openai library is installed and GOOGLE_API_KEY is available."""
         api_key = os.environ.get("GOOGLE_API_KEY")
         if not api_key:
-            logger.warning("GOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.")
+            logger.warning(
+                "GOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available."
+            )
             return False
         if OpenAI is None:
-             logger.warning("openai package not found. Gemini detector (via OpenAI lib) will not be available.")
-             return False
+            logger.warning(
+                "openai package not found. Gemini detector (via OpenAI lib) will not be available."
+            )
+            return False
         return True
     def _get_cache_key(self, options: GeminiLayoutOptions) -> str:
         """Generate cache key based on model name."""
         if not isinstance(options, GeminiLayoutOptions):
-            options = GeminiLayoutOptions() # Use defaults
+            options = GeminiLayoutOptions()  # Use defaults
         model_key = options.model_name
         # Prompt is built dynamically, so not part of cache key based on options
@@ -101,9 +109,7 @@ class GeminiLayoutDetector(LayoutDetector):
     def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
         """Detect layout elements in an image using Gemini via OpenAI library."""
         if not self.is_available():
-            raise RuntimeError(
-                "OpenAI library not installed or GOOGLE_API_KEY not set."
-            )
+            raise RuntimeError("OpenAI library not installed or GOOGLE_API_KEY not set.")
         # Ensure options are the correct type
         if not isinstance(options, GeminiLayoutOptions):
@@ -124,10 +130,7 @@ class GeminiLayoutDetector(LayoutDetector):
         detections = []
         try:
             # --- 1. Initialize OpenAI Client for Gemini ---
-            client = OpenAI(
-                api_key=api_key,
-                base_url=self.GEMINI_BASE_URL
-            )
+            client = OpenAI(api_key=api_key, base_url=self.GEMINI_BASE_URL)
             # --- 2. Prepare Input for OpenAI API ---
             if not options.classes:
@@ -139,11 +142,11 @@ class GeminiLayoutDetector(LayoutDetector):
             # Convert image to base64
             buffered = io.BytesIO()
             image.save(buffered, format="PNG")
-            img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
+            img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
             image_url = f"data:image/png;base64,{img_base64}"
             # Construct the prompt text
-            class_list_str = ", ".join(f'`{c}`' for c in options.classes)
+            class_list_str = ", ".join(f"`{c}`" for c in options.classes)
             prompt_text = (
                 f"Analyze the provided image of a document page ({width}x{height}). "
                 f"Identify all regions corresponding to the following types: {class_list_str}. "
@@ -165,14 +168,18 @@ class GeminiLayoutDetector(LayoutDetector):
             ]
             # --- 3. Call OpenAI API using .parse for structured output ---
-            logger.debug(f"Running Gemini detection via OpenAI lib (Model: {model_name}). Asking for classes: {options.classes}")
+            logger.debug(
+                f"Running Gemini detection via OpenAI lib (Model: {model_name}). Asking for classes: {options.classes}"
+            )
             # Extract relevant generation parameters from extra_args if provided
             # Mapping common names: temperature, top_p, max_tokens
             completion_kwargs = {
-                "temperature": options.extra_args.get("temperature", 0.2), # Default to low temp
+                "temperature": options.extra_args.get("temperature", 0.2),  # Default to low temp
                 "top_p": options.extra_args.get("top_p"),
-                "max_tokens": options.extra_args.get("max_tokens", 4096), # Map from max_output_tokens
+                "max_tokens": options.extra_args.get(
+                    "max_tokens", 4096
+                ),  # Map from max_output_tokens
             }
             # Filter out None values
             completion_kwargs = {k: v for k, v in completion_kwargs.items() if v is not None}
@@ -180,13 +187,13 @@ class GeminiLayoutDetector(LayoutDetector):
             completion: ChatCompletion = client.beta.chat.completions.parse(
                 model=model_name,
                 messages=messages,
-                response_format=List[DetectedRegion], # Pass the Pydantic model list
-                **completion_kwargs
+                response_format=List[DetectedRegion],  # Pass the Pydantic model list
+                **completion_kwargs,
             )
             logger.debug(f"Gemini response received via OpenAI lib.")
-            # --- 4. Process Parsed Response ---
+            # --- 4. Process Parsed Response ---
             if not completion.choices:
                 logger.error("Gemini response (via OpenAI lib) contained no choices.")
                 return []
@@ -194,16 +201,18 @@ class GeminiLayoutDetector(LayoutDetector):
             # Get the parsed Pydantic objects
             parsed_results = completion.choices[0].message.parsed
             if not parsed_results or not isinstance(parsed_results, list):
-                 logger.error(f"Gemini response (via OpenAI lib) did not contain a valid list of parsed regions. Found: {type(parsed_results)}")
-                 return []
+                logger.error(
+                    f"Gemini response (via OpenAI lib) did not contain a valid list of parsed regions. Found: {type(parsed_results)}"
+                )
+                return []
-            # --- 5. Convert to Detections & Filter ---
-            normalized_classes_req = {
-                self._normalize_class_name(c) for c in options.classes
-            }
-            normalized_classes_excl = {
-                self._normalize_class_name(c) for c in options.exclude_classes
-            } if options.exclude_classes else set()
+            # --- 5. Convert to Detections & Filter ---
+            normalized_classes_req = {self._normalize_class_name(c) for c in options.classes}
+            normalized_classes_excl = (
+                {self._normalize_class_name(c) for c in options.exclude_classes}
+                if options.exclude_classes
+                else set()
+            )
             for item in parsed_results:
                 # The item is already a validated DetectedRegion Pydantic object
@@ -215,33 +224,41 @@ class GeminiLayoutDetector(LayoutDetector):
                 # Coordinates should already be floats, but ensure tuple format
                 xmin, ymin, xmax, ymax = tuple(bbox_raw)
-                # --- Apply Filtering ---
+                # --- Apply Filtering ---
                 normalized_class = self._normalize_class_name(label)
                 # Check against requested classes (Should be guaranteed by schema, but doesn't hurt)
                 if normalized_class not in normalized_classes_req:
-                    logger.warning(f"Gemini (via OpenAI) returned unexpected class '{label}' despite schema. Skipping.")
+                    logger.warning(
+                        f"Gemini (via OpenAI) returned unexpected class '{label}' despite schema. Skipping."
+                    )
                     continue
                 # Check against excluded classes
                 if normalized_class in normalized_classes_excl:
-                    logger.debug(f"Skipping excluded class '{label}' (normalized: {normalized_class}).")
+                    logger.debug(
+                        f"Skipping excluded class '{label}' (normalized: {normalized_class})."
+                    )
                     continue
                 # Check against base confidence threshold from options
                 if confidence_score < options.confidence:
-                    logger.debug(f"Skipping item with confidence {confidence_score:.3f} below threshold {options.confidence}.")
+                    logger.debug(
+                        f"Skipping item with confidence {confidence_score:.3f} below threshold {options.confidence}."
+                    )
                     continue
                 # Add detection
-                detections.append({
-                    "bbox": (xmin, ymin, xmax, ymax),
-                    "class": label, # Use original label from LLM
-                    "confidence": confidence_score,
-                    "normalized_class": normalized_class,
-                    "source": "layout",
-                    "model": "gemini", # Keep model name generic as gemini
-                })
+                detections.append(
+                    {
+                        "bbox": (xmin, ymin, xmax, ymax),
+                        "class": label,  # Use original label from LLM
+                        "confidence": confidence_score,
+                        "normalized_class": normalized_class,
+                        "source": "layout",
+                        "model": "gemini",  # Keep model name generic as gemini
+                    }
+                )
             self.logger.info(
                 f"Gemini (via OpenAI lib) processed response. Detected {len(detections)} layout elements matching criteria."
@@ -260,5 +277,4 @@ class GeminiLayoutDetector(LayoutDetector):
     def validate_classes(self, classes: List[str]):
         """Validation is less critical as we pass requested classes to the LLM."""
-        pass # Override base validation if needed, but likely not necessary
+        pass  # Override base validation if needed, but likely not necessary

natural_pdf/collections/pdf_collection.py CHANGED Viewed

@@ -279,14 +279,17 @@ class PDFCollection(SearchableMixin):  # Inherit from the mixin
         """
         try:
             from natural_pdf.utils.packaging import create_correction_task_package
             # Pass the collection itself (self) as the source
             create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
         except ImportError:
-            logger.error("Failed to import 'create_correction_task_package'. Packaging utility might be missing.")
+            logger.error(
+                "Failed to import 'create_correction_task_package'. Packaging utility might be missing."
+            )
             # Or raise
         except Exception as e:
             logger.error(f"Failed to export correction task for collection: {e}", exc_info=True)
-            raise # Re-raise the exception from the utility function
+            raise  # Re-raise the exception from the utility function
     # --- Mixin Required Implementation ---
     def get_indexable_items(self) -> Iterable[Indexable]:

natural_pdf/core/element_manager.py CHANGED Viewed

@@ -359,8 +359,10 @@ class ElementManager:
                 # Handle potential None confidence
                 raw_confidence = result.get("confidence")
-                confidence_value = float(raw_confidence) if raw_confidence is not None else None # Keep None if it was None
-                ocr_text = result.get("text") # Get text, will be None if detect_only
+                confidence_value = (
+                    float(raw_confidence) if raw_confidence is not None else None
+                )  # Keep None if it was None
+                ocr_text = result.get("text")  # Get text, will be None if detect_only
                 # Create the TextElement for the word
                 word_element_data = {
@@ -373,7 +375,7 @@ class ElementManager:
                     "height": pdf_height,
                     "object_type": "word",  # Treat OCR results as whole words
                     "source": "ocr",
-                    "confidence": confidence_value, # Use the handled confidence
+                    "confidence": confidence_value,  # Use the handled confidence
                     "fontname": "OCR",  # Use consistent OCR fontname
                     "size": (
                         round(pdf_height) if pdf_height > 0 else 10.0
@@ -391,7 +393,7 @@ class ElementManager:
                 ocr_char_dict.setdefault("adv", ocr_char_dict.get("width", 0))
                 # Add the char dict list to the word data before creating TextElement
-                word_element_data["_char_dicts"] = [ocr_char_dict] # Store itself as its only char
+                word_element_data["_char_dicts"] = [ocr_char_dict]  # Store itself as its only char
                 word_elem = TextElement(word_element_data, self._page)
                 added_word_elements.append(word_elem)

natural_pdf/core/page.py CHANGED Viewed

@@ -1233,7 +1233,7 @@ class Page:
         render_ocr: bool = False,
         resolution: Optional[float] = None,
         include_highlights: bool = True,
-        exclusions: Optional[str] = None, # New parameter
+        exclusions: Optional[str] = None,  # New parameter
         **kwargs,
     ) -> Optional[Image.Image]:
         """
@@ -1262,11 +1262,11 @@ class Page:
                 # Delegate rendering to the central service
                 image = self._highlighter.render_page(
                     page_index=self.index,
-                    scale=scale, # Note: scale is used by highlighter internally for drawing
+                    scale=scale,  # Note: scale is used by highlighter internally for drawing
                     labels=labels,
                     legend_position=legend_position,
                     render_ocr=render_ocr,
-                    resolution=render_resolution, # Pass the calculated resolution
+                    resolution=render_resolution,  # Pass the calculated resolution
                     **kwargs,
                 )
             else:
@@ -1322,16 +1322,21 @@ class Page:
                             max(0, img_x0),
                             max(0, img_top),
                             min(image.width, img_x1),
-                            min(image.height, img_bottom)
+                            min(image.height, img_bottom),
                         )
                         if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
-                           draw.rectangle(img_coords, fill="white")
+                            draw.rectangle(img_coords, fill="white")
                         else:
-                             logger.warning(f"Skipping invalid exclusion rect for masking: {img_coords}")
+                            logger.warning(
+                                f"Skipping invalid exclusion rect for masking: {img_coords}"
+                            )
-                    del draw # Release drawing context
+                    del draw  # Release drawing context
             except Exception as mask_error:
-                logger.error(f"Error applying exclusion mask to page {self.index}: {mask_error}", exc_info=True)
+                logger.error(
+                    f"Error applying exclusion mask to page {self.index}: {mask_error}",
+                    exc_info=True,
+                )
                 # Decide if you want to return None or continue without mask
                 # For now, continue without mask
@@ -1398,7 +1403,7 @@ class Page:
         """
         if not hasattr(self._parent, "apply_ocr"):
             logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr'. Cannot apply OCR.")
-            return [] # Return empty list for consistency
+            return []  # Return empty list for consistency
         logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
         try:
@@ -1459,11 +1464,11 @@ class Page:
             return []
         logger.info(f"Page {self.number}: Extracting OCR elements (extract only)...")
         # Determine rendering resolution
-        final_resolution = resolution if resolution is not None else 150 # Default to 150 DPI
+        final_resolution = resolution if resolution is not None else 150  # Default to 150 DPI
         logger.debug(f"  Using rendering resolution: {final_resolution} DPI")
         try:
             # Get base image without highlights using the determined resolution
             image = self.to_image(resolution=final_resolution, include_highlights=False)
@@ -1477,12 +1482,12 @@ class Page:
         # Prepare arguments for the OCR Manager call
         manager_args = {
-             "images": image,
-             "engine": engine,
-             "languages": languages,
-             "min_confidence": min_confidence,
-             "device": device,
-             "options": options
+            "images": image,
+            "engine": engine,
+            "languages": languages,
+            "min_confidence": min_confidence,
+            "device": device,
+            "options": options,
         }
         manager_args = {k: v for k, v in manager_args.items() if v is not None}
@@ -1514,7 +1519,7 @@ class Page:
         scale_x = self.width / image.width if image.width else 1
         scale_y = self.height / image.height if image.height else 1
         for result in results:
-            try: # Added try-except around result processing
+            try:  # Added try-except around result processing
                 x0, top, x1, bottom = [float(c) for c in result["bbox"]]
                 elem_data = {
                     "text": result["text"],
@@ -1525,15 +1530,17 @@ class Page:
                     "bottom": bottom * scale_y,
                     "width": (x1 - x0) * scale_x,
                     "height": (bottom - top) * scale_y,
-                    "object_type": "text", # Using text for temporary elements
+                    "object_type": "text",  # Using text for temporary elements
                     "source": "ocr",
-                    "fontname": "OCR-extract", # Different name for clarity
+                    "fontname": "OCR-extract",  # Different name for clarity
                     "size": 10.0,
                     "page_number": self.number,
                 }
                 temp_elements.append(TextElement(elem_data, self))
             except (KeyError, ValueError, TypeError) as convert_err:
-                 logger.warning(f"  Skipping invalid OCR result during conversion: {result}. Error: {convert_err}")
+                logger.warning(
+                    f"  Skipping invalid OCR result during conversion: {result}. Error: {convert_err}"
+                )
         logger.info(f"  Created {len(temp_elements)} TextElements from OCR (extract only).")
         return temp_elements
@@ -2020,7 +2027,7 @@ class Page:
     def correct_ocr(
         self,
         correction_callback: Callable[[Any], Optional[str]],
-    ) -> "Page": # Return self for chaining
+    ) -> "Page":  # Return self for chaining
         """
         Applies corrections to OCR-generated text elements on this page
         using a user-provided callback function.
@@ -2044,7 +2051,9 @@ class Page:
         Returns:
             Self for method chaining.
         """
-        logger.info(f"Page {self.number}: Starting OCR correction process using callback '{correction_callback.__name__}'")
+        logger.info(
+            f"Page {self.number}: Starting OCR correction process using callback '{correction_callback.__name__}'"
+        )
         # Find OCR elements specifically on this page
         # Note: We typically want to correct even if the element falls in an excluded area
@@ -2052,9 +2061,9 @@ class Page:
         # Delegate to the utility function
         _apply_ocr_correction_to_elements(
-            elements=target_elements, # Pass the ElementCollection directly
+            elements=target_elements,  # Pass the ElementCollection directly
             correction_callback=correction_callback,
-            caller_info=f"Page({self.number})", # Pass caller info
+            caller_info=f"Page({self.number})",  # Pass caller info
         )
-        return self # Return self for chaining
+        return self  # Return self for chaining

natural_pdf/core/pdf.py CHANGED Viewed

@@ -239,13 +239,13 @@ class PDF:
         engine: Optional[str] = None,
         # --- Common OCR Parameters (Direct Arguments) ---
         languages: Optional[List[str]] = None,
-        min_confidence: Optional[float] = None, # Min confidence threshold
+        min_confidence: Optional[float] = None,  # Min confidence threshold
         device: Optional[str] = None,
-        resolution: Optional[int] = None, # DPI for rendering before OCR
-        apply_exclusions: bool = True, # New parameter
+        resolution: Optional[int] = None,  # DPI for rendering before OCR
+        apply_exclusions: bool = True,  # New parameter
         detect_only: bool = False,
         # --- Engine-Specific Options --- Use 'options=' for this
-        options: Optional[Any] = None, # e.g., EasyOCROptions(...), PaddleOCROptions(...), or dict
+        options: Optional[Any] = None,  # e.g., EasyOCROptions(...), PaddleOCROptions(...), or dict
         # **kwargs: Optional[Dict[str, Any]] = None # Allow potential extra args?
     ) -> "PDF":
         """
@@ -314,7 +314,7 @@ class PDF:
         logger.info(f"Applying batch OCR to pages: {page_numbers}...")
         # --- Determine Rendering Resolution ---
         # Priority: 1. direct `resolution` arg, 2. PDF config, 3. default 150
-        final_resolution = resolution # Use direct arg if provided
+        final_resolution = resolution  # Use direct arg if provided
         if final_resolution is None:
             final_resolution = getattr(self, "_config", {}).get("resolution", 150)
@@ -323,7 +323,9 @@ class PDF:
         # --- Render Images for Batch ---
         images_pil: List[Image.Image] = []
         page_image_map: List[Tuple[Page, Image.Image]] = []  # Store page and its image
-        logger.info(f"Rendering {len(target_pages)} pages to images at {final_resolution} DPI (apply_exclusions={apply_exclusions})...")
+        logger.info(
+            f"Rendering {len(target_pages)} pages to images at {final_resolution} DPI (apply_exclusions={apply_exclusions})..."
+        )
         failed_page_num = "unknown"  # Keep track of potentially failing page
         try:
             for i, page in enumerate(target_pages):
@@ -339,7 +341,7 @@ class PDF:
                 if img is None:
                     logger.error(f"  Failed to render page {page.number} to image.")
                     # Decide how to handle: skip page, raise error? For now, skip.
-                    continue # Skip this page if rendering failed
+                    continue  # Skip this page if rendering failed
                 images_pil.append(img)
                 page_image_map.append((page, img))  # Store pair
         except Exception as e:
@@ -356,7 +358,7 @@ class PDF:
             "images": images_pil,
             "engine": engine,
             "languages": languages,
-            "min_confidence": min_confidence, # Use the renamed parameter
+            "min_confidence": min_confidence,  # Use the renamed parameter
             "device": device,
             "options": options,
             "detect_only": detect_only,
@@ -366,7 +368,9 @@ class PDF:
         manager_args = {k: v for k, v in manager_args.items() if v is not None}
         # --- Call OCR Manager for Batch Processing ---
-        logger.info(f"Calling OCR Manager with args: { {k:v for k,v in manager_args.items() if k!='images'} } ...")
+        logger.info(
+            f"Calling OCR Manager with args: { {k:v for k,v in manager_args.items() if k!='images'} } ..."
+        )
         try:
             # Manager's apply_ocr signature needs to accept common args directly
             batch_results = self._ocr_manager.apply_ocr(**manager_args)
@@ -948,19 +952,22 @@ class PDF:
         """
         try:
             from natural_pdf.utils.packaging import create_correction_task_package
             create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
         except ImportError:
-            logger.error("Failed to import 'create_correction_task_package'. Packaging utility might be missing.")
+            logger.error(
+                "Failed to import 'create_correction_task_package'. Packaging utility might be missing."
+            )
             # Or raise
         except Exception as e:
             logger.error(f"Failed to export correction task for {self.path}: {e}", exc_info=True)
-            raise # Re-raise the exception from the utility function
+            raise  # Re-raise the exception from the utility function
     def correct_ocr(
         self,
         correction_callback: Callable[[Any], Optional[str]],
         pages: Optional[Union[Iterable[int], range, slice]] = None,
-    ) -> "PDF": # Return self for chaining
+    ) -> "PDF":  # Return self for chaining
         """
         Applies corrections to OCR-generated text elements using a callback function,
         delegating the core work to the `Page.correct_ocr` method.
@@ -989,7 +996,9 @@ class PDF:
                     if not (0 <= idx < len(self._pages)):
                         raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
             except (IndexError, TypeError, ValueError) as e:
-                raise ValueError(f"Invalid page index or type provided in 'pages': {pages}. Error: {e}") from e
+                raise ValueError(
+                    f"Invalid page index or type provided in 'pages': {pages}. Error: {e}"
+                ) from e
         else:
             raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
@@ -997,7 +1006,9 @@ class PDF:
             logger.warning("No pages selected for OCR correction.")
             return self
-        logger.info(f"Starting OCR correction process via Page delegation for pages: {target_page_indices}")
+        logger.info(
+            f"Starting OCR correction process via Page delegation for pages: {target_page_indices}"
+        )
         # Iterate through target pages and call their correct_ocr method
         for page_idx in target_page_indices:
@@ -1071,8 +1082,6 @@ class PDF:
         """Context manager exit."""
         self.close()
     # --- Indexable Protocol Methods --- Needed for search/sync
     def get_id(self) -> str:
         return self.path

natural_pdf/elements/base.py CHANGED Viewed

@@ -306,9 +306,7 @@ class DirectionalMixin:
             **kwargs,
         )
-    def to_region(
-        self
-    ):
+    def to_region(self):
         return self.expand()
     def expand(

natural_pdf/elements/collections.py CHANGED Viewed

@@ -21,7 +21,7 @@ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_t
 from natural_pdf.elements.text import TextElement  # Needed for isinstance check
 from natural_pdf.ocr import OCROptions
 from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
-from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import the new utility
+from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements  # Import the new utility
 logger = logging.getLogger(__name__)
@@ -1151,9 +1151,9 @@ class ElementCollection(Generic[T]):
         _apply_ocr_correction_to_elements(
             elements=self._elements,
             correction_callback=correction_callback,
-            caller_info=f"ElementCollection(len={len(self._elements)})", # Pass caller info
+            caller_info=f"ElementCollection(len={len(self._elements)})",  # Pass caller info
         )
-        return self # Return self for chaining
+        return self  # Return self for chaining
 class PageCollection(Generic[P]):
@@ -1217,12 +1217,12 @@ class PageCollection(Generic[P]):
         engine: Optional[str] = None,
         # --- Common OCR Parameters (Direct Arguments) ---
         languages: Optional[List[str]] = None,
-        min_confidence: Optional[float] = None, # Min confidence threshold
+        min_confidence: Optional[float] = None,  # Min confidence threshold
         device: Optional[str] = None,
-        resolution: Optional[int] = None, # DPI for rendering
-        apply_exclusions: bool = True, # New parameter
+        resolution: Optional[int] = None,  # DPI for rendering
+        apply_exclusions: bool = True,  # New parameter
         # --- Engine-Specific Options ---
-        options: Optional[Any] = None, # e.g., EasyOCROptions(...)
+        options: Optional[Any] = None,  # e.g., EasyOCROptions(...)
     ) -> "PageCollection[P]":
         """
         Applies OCR to all pages within this collection using batch processing.
@@ -1273,10 +1273,10 @@ class PageCollection(Generic[P]):
             pages=page_indices,
             engine=engine,
             languages=languages,
-            min_confidence=min_confidence, # Pass the renamed parameter
+            min_confidence=min_confidence,  # Pass the renamed parameter
             device=device,
             resolution=resolution,
-            apply_exclusions=apply_exclusions, # Pass down
+            apply_exclusions=apply_exclusions,  # Pass down
             options=options,
         )
         # The PDF method modifies the Page objects directly by adding elements.
@@ -1351,13 +1351,12 @@ class PageCollection(Generic[P]):
         parent_pdf = self.pages[0]._parent
         page_indices = [p.index for p in self.pages]
-        logger.info(f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices}.")
+        logger.info(
+            f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices}."
+        )
         # Delegate the call to the parent PDF object for the relevant pages
-        parent_pdf.correct_ocr(
-            correction_callback=correction_callback,
-            pages=page_indices
-        )
+        parent_pdf.correct_ocr(correction_callback=correction_callback, pages=page_indices)
         return self

natural-pdf 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

natural-pdf 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl