PyPI - natural-pdf - Versions diffs - 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl - Mend

natural-pdf 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (132) hide show

docs/api/index.md +386 -0
docs/assets/favicon.png +3 -0
docs/assets/favicon.svg +3 -0
docs/assets/javascripts/custom.js +17 -0
docs/assets/logo.svg +3 -0
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +17 -0
docs/assets/social-preview.svg +17 -0
docs/assets/stylesheets/custom.css +65 -0
docs/document-qa/index.ipynb +435 -0
docs/document-qa/index.md +79 -0
docs/element-selection/index.ipynb +915 -0
docs/element-selection/index.md +229 -0
docs/index.md +170 -0
docs/installation/index.md +69 -0
docs/interactive-widget/index.ipynb +962 -0
docs/interactive-widget/index.md +12 -0
docs/layout-analysis/index.ipynb +818 -0
docs/layout-analysis/index.md +185 -0
docs/ocr/index.md +222 -0
docs/pdf-navigation/index.ipynb +314 -0
docs/pdf-navigation/index.md +97 -0
docs/regions/index.ipynb +816 -0
docs/regions/index.md +294 -0
docs/tables/index.ipynb +658 -0
docs/tables/index.md +144 -0
docs/text-analysis/index.ipynb +370 -0
docs/text-analysis/index.md +105 -0
docs/text-extraction/index.ipynb +1478 -0
docs/text-extraction/index.md +292 -0
docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
docs/tutorials/01-loading-and-extraction.md +95 -0
docs/tutorials/02-finding-elements.ipynb +340 -0
docs/tutorials/02-finding-elements.md +149 -0
docs/tutorials/03-extracting-blocks.ipynb +147 -0
docs/tutorials/03-extracting-blocks.md +48 -0
docs/tutorials/04-table-extraction.ipynb +114 -0
docs/tutorials/04-table-extraction.md +50 -0
docs/tutorials/05-excluding-content.ipynb +270 -0
docs/tutorials/05-excluding-content.md +109 -0
docs/tutorials/06-document-qa.ipynb +332 -0
docs/tutorials/06-document-qa.md +91 -0
docs/tutorials/07-layout-analysis.ipynb +260 -0
docs/tutorials/07-layout-analysis.md +66 -0
docs/tutorials/07-working-with-regions.ipynb +409 -0
docs/tutorials/07-working-with-regions.md +151 -0
docs/tutorials/08-spatial-navigation.ipynb +508 -0
docs/tutorials/08-spatial-navigation.md +190 -0
docs/tutorials/09-section-extraction.ipynb +2434 -0
docs/tutorials/09-section-extraction.md +256 -0
docs/tutorials/10-form-field-extraction.ipynb +484 -0
docs/tutorials/10-form-field-extraction.md +201 -0
docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
docs/tutorials/11-enhanced-table-processing.md +9 -0
docs/tutorials/12-ocr-integration.ipynb +586 -0
docs/tutorials/12-ocr-integration.md +188 -0
docs/tutorials/13-semantic-search.ipynb +1888 -0
docs/tutorials/13-semantic-search.md +77 -0
docs/visual-debugging/index.ipynb +2970 -0
docs/visual-debugging/index.md +157 -0
docs/visual-debugging/region.png +0 -0
natural_pdf/__init__.py +39 -20
natural_pdf/analyzers/__init__.py +2 -1
natural_pdf/analyzers/layout/base.py +32 -24
natural_pdf/analyzers/layout/docling.py +131 -72
natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
natural_pdf/analyzers/layout/layout_manager.py +98 -58
natural_pdf/analyzers/layout/layout_options.py +32 -17
natural_pdf/analyzers/layout/paddle.py +152 -95
natural_pdf/analyzers/layout/surya.py +164 -92
natural_pdf/analyzers/layout/tatr.py +149 -84
natural_pdf/analyzers/layout/yolo.py +84 -44
natural_pdf/analyzers/text_options.py +22 -15
natural_pdf/analyzers/text_structure.py +131 -85
natural_pdf/analyzers/utils.py +30 -23
natural_pdf/collections/pdf_collection.py +125 -97
natural_pdf/core/__init__.py +1 -1
natural_pdf/core/element_manager.py +416 -337
natural_pdf/core/highlighting_service.py +268 -196
natural_pdf/core/page.py +907 -513
natural_pdf/core/pdf.py +385 -287
natural_pdf/elements/__init__.py +1 -1
natural_pdf/elements/base.py +302 -214
natural_pdf/elements/collections.py +708 -508
natural_pdf/elements/line.py +39 -36
natural_pdf/elements/rect.py +32 -30
natural_pdf/elements/region.py +854 -883
natural_pdf/elements/text.py +122 -99
natural_pdf/exporters/__init__.py +0 -1
natural_pdf/exporters/searchable_pdf.py +261 -102
natural_pdf/ocr/__init__.py +23 -14
natural_pdf/ocr/engine.py +17 -8
natural_pdf/ocr/engine_easyocr.py +63 -47
natural_pdf/ocr/engine_paddle.py +97 -68
natural_pdf/ocr/engine_surya.py +54 -44
natural_pdf/ocr/ocr_manager.py +88 -62
natural_pdf/ocr/ocr_options.py +16 -10
natural_pdf/qa/__init__.py +1 -1
natural_pdf/qa/document_qa.py +119 -111
natural_pdf/search/__init__.py +37 -31
natural_pdf/search/haystack_search_service.py +312 -189
natural_pdf/search/haystack_utils.py +186 -122
natural_pdf/search/search_options.py +25 -14
natural_pdf/search/search_service_protocol.py +12 -6
natural_pdf/search/searchable_mixin.py +261 -176
natural_pdf/selectors/__init__.py +2 -1
natural_pdf/selectors/parser.py +159 -316
natural_pdf/templates/__init__.py +1 -1
natural_pdf/utils/highlighting.py +8 -2
natural_pdf/utils/reading_order.py +65 -63
natural_pdf/utils/text_extraction.py +195 -0
natural_pdf/utils/visualization.py +70 -61
natural_pdf/widgets/__init__.py +2 -3
natural_pdf/widgets/viewer.py +749 -718
{natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +15 -1
natural_pdf-0.1.5.dist-info/RECORD +134 -0
natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
notebooks/Examples.ipynb +1293 -0
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +543 -0
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
tests/test_loading.py +50 -0
tests/test_optional_deps.py +298 -0
natural_pdf-0.1.4.dist-info/RECORD +0 -61
natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
{natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0

natural_pdf/analyzers/layout/paddle.py CHANGED Viewed

@@ -1,24 +1,38 @@
 # layout_detector_paddle.py
-import logging
 import importlib.util
+import logging
 import os
 import tempfile
-from typing import List, Dict, Any, Optional
+from typing import Any, Dict, List, Optional
 from PIL import Image
 # Assuming base class and options are importable
 try:
     from .base import LayoutDetector
-    from .layout_options import PaddleLayoutOptions, BaseLayoutOptions
+    from .layout_options import BaseLayoutOptions, PaddleLayoutOptions
 except ImportError:
     # Placeholders if run standalone or imports fail
-    class BaseLayoutOptions: pass
-    class PaddleLayoutOptions(BaseLayoutOptions): pass
+    class BaseLayoutOptions:
+        pass
+    class PaddleLayoutOptions(BaseLayoutOptions):
+        pass
     class LayoutDetector:
-         def __init__(self): self.logger=logging.getLogger(); self.supported_classes=set()
-         def _get_model(self, options): raise NotImplementedError
-         def _normalize_class_name(self, n): return n
-         def validate_classes(self, c): pass
+        def __init__(self):
+            self.logger = logging.getLogger()
+            self.supported_classes = set()
+        def _get_model(self, options):
+            raise NotImplementedError
+        def _normalize_class_name(self, n):
+            return n
+        def validate_classes(self, c):
+            pass
     logging.basicConfig()
 logger = logging.getLogger(__name__)
@@ -27,15 +41,17 @@ logger = logging.getLogger(__name__)
 paddle_spec = importlib.util.find_spec("paddle") or importlib.util.find_spec("paddlepaddle")
 paddleocr_spec = importlib.util.find_spec("paddleocr")
 PPStructure = None
-PaddleOCR = None # For optional text detection
+PaddleOCR = None  # For optional text detection
 if paddle_spec and paddleocr_spec:
     try:
-        from paddleocr import PPStructure, PaddleOCR
+        from paddleocr import PaddleOCR, PPStructure
     except ImportError as e:
         logger.warning(f"Could not import Paddle dependencies: {e}")
 else:
-    logger.warning("paddlepaddle or paddleocr not found. PaddleLayoutDetector will not be available.")
+    logger.warning(
+        "paddlepaddle or paddleocr not found. PaddleLayoutDetector will not be available."
+    )
 class PaddleLayoutDetector(LayoutDetector):
@@ -45,9 +61,17 @@ class PaddleLayoutDetector(LayoutDetector):
         super().__init__()
         # Supported classes by PP-Structure (adjust based on model version/capabilities)
         self.supported_classes = {
-            'text', 'title', 'figure', 'figure_caption',
-            'table', 'table_caption', 'table_cell', # Added table_cell
-            'header', 'footer', 'reference', 'equation',
+            "text",
+            "title",
+            "figure",
+            "figure_caption",
+            "table",
+            "table_caption",
+            "table_cell",  # Added table_cell
+            "header",
+            "footer",
+            "reference",
+            "equation",
             # PP-StructureV2 might add others like list, pub_number etc.
         }
         # Models are loaded via _get_model
@@ -59,9 +83,9 @@ class PaddleLayoutDetector(LayoutDetector):
     def _get_cache_key(self, options: BaseLayoutOptions) -> str:
         """Generate cache key based on language and device."""
         if not isinstance(options, PaddleLayoutOptions):
-            options = PaddleLayoutOptions(device=options.device) # Use base device
+            options = PaddleLayoutOptions(device=options.device)  # Use base device
-        device_key = str(options.device).lower() if options.device else 'default_device'
+        device_key = str(options.device).lower() if options.device else "default_device"
         lang_key = options.lang
         # Key could also include enable_table, use_angle_cls if these affect model loading fundamentally
         # For PPStructure, they are primarily runtime flags, so lang/device might suffice for caching the *instance*.
@@ -70,12 +94,14 @@ class PaddleLayoutDetector(LayoutDetector):
     def _load_model_from_options(self, options: BaseLayoutOptions) -> Any:
         """Load the PPStructure model based on options."""
         if not self.is_available():
-             raise RuntimeError("Paddle dependencies (paddlepaddle, paddleocr) not installed.")
+            raise RuntimeError("Paddle dependencies (paddlepaddle, paddleocr) not installed.")
         if not isinstance(options, PaddleLayoutOptions):
             raise TypeError("Incorrect options type provided for Paddle model loading.")
-        self.logger.info(f"Loading PPStructure model (lang={options.lang}, device={options.device}, table={options.enable_table})...")
+        self.logger.info(
+            f"Loading PPStructure model (lang={options.lang}, device={options.device}, table={options.enable_table})..."
+        )
         try:
             # PPStructure init takes several arguments that control runtime behavior
             # We cache the instance based on lang/device, assuming other flags don't require reloading.
@@ -86,15 +112,17 @@ class PaddleLayoutDetector(LayoutDetector):
             # However, PPStructure call signature is simple (__call__(self, img, ...))
             # So, we likely need to initialize with most settings.
             model_instance = PPStructure(
-                 lang=options.lang,
-                 use_gpu=('cuda' in str(options.device).lower() or 'gpu' in str(options.device).lower()),
-                 use_angle_cls=options.use_angle_cls,
-                 show_log=options.show_log,
-                 layout=True, # Ensure layout analysis is on
-                 table=options.enable_table, # Control table analysis
-                 ocr=False # Usually disable internal OCR if only using for layout/table
-                 # Add other PPStructure init args from options.extra_args if needed
-                 # **options.extra_args
+                lang=options.lang,
+                use_gpu=(
+                    "cuda" in str(options.device).lower() or "gpu" in str(options.device).lower()
+                ),
+                use_angle_cls=options.use_angle_cls,
+                show_log=options.show_log,
+                layout=True,  # Ensure layout analysis is on
+                table=options.enable_table,  # Control table analysis
+                ocr=False,  # Usually disable internal OCR if only using for layout/table
+                # Add other PPStructure init args from options.extra_args if needed
+                # **options.extra_args
             )
             self.logger.info("PPStructure model loaded.")
             return model_instance
@@ -108,13 +136,17 @@ class PaddleLayoutDetector(LayoutDetector):
             raise RuntimeError("Paddle dependencies (paddlepaddle, paddleocr) not installed.")
         if not isinstance(options, PaddleLayoutOptions):
-             self.logger.warning("Received BaseLayoutOptions, expected PaddleLayoutOptions. Using defaults.")
-             options = PaddleLayoutOptions(
-                 confidence=options.confidence, classes=options.classes,
-                 exclude_classes=options.exclude_classes, device=options.device,
-                 extra_args=options.extra_args
-                 # Other Paddle options will use defaults
-             )
+            self.logger.warning(
+                "Received BaseLayoutOptions, expected PaddleLayoutOptions. Using defaults."
+            )
+            options = PaddleLayoutOptions(
+                confidence=options.confidence,
+                classes=options.classes,
+                exclude_classes=options.exclude_classes,
+                device=options.device,
+                extra_args=options.extra_args,
+                # Other Paddle options will use defaults
+            )
         self.validate_classes(options.classes or [])
         if options.exclude_classes:
@@ -128,8 +160,10 @@ class PaddleLayoutDetector(LayoutDetector):
         with tempfile.TemporaryDirectory() as temp_dir:
             temp_image_path = os.path.join(temp_dir, f"paddle_input_{os.getpid()}.png")
             try:
-                self.logger.debug(f"Saving temporary image for Paddle detector to: {temp_image_path}")
-                image.convert("RGB").save(temp_image_path) # Ensure RGB
+                self.logger.debug(
+                    f"Saving temporary image for Paddle detector to: {temp_image_path}"
+                )
+                image.convert("RGB").save(temp_image_path)  # Ensure RGB
                 # Process image with PP-Structure instance
                 # The instance was configured during _load_model_from_options
@@ -141,15 +175,19 @@ class PaddleLayoutDetector(LayoutDetector):
                 self.logger.error(f"Error during PPStructure analysis: {e}", exc_info=True)
                 # Clean up temp file before raising or returning
                 if os.path.exists(temp_image_path):
-                    try: os.remove(temp_image_path)
-                    except OSError as e_rm: self.logger.warning(f"Could not remove temp file {temp_image_path}: {e_rm}")
-                raise # Re-raise error
+                    try:
+                        os.remove(temp_image_path)
+                    except OSError as e_rm:
+                        self.logger.warning(f"Could not remove temp file {temp_image_path}: {e_rm}")
+                raise  # Re-raise error
             finally:
-                 # Ensure cleanup even if analysis worked
-                 if os.path.exists(temp_image_path):
-                      try: os.remove(temp_image_path)
-                      except OSError as e_rm: self.logger.warning(f"Could not remove temp file {temp_image_path}: {e_rm}")
+                # Ensure cleanup even if analysis worked
+                if os.path.exists(temp_image_path):
+                    try:
+                        os.remove(temp_image_path)
+                    except OSError as e_rm:
+                        self.logger.warning(f"Could not remove temp file {temp_image_path}: {e_rm}")
         # --- Process Results ---
         if not result:
@@ -157,66 +195,85 @@ class PaddleLayoutDetector(LayoutDetector):
             return []
         # Prepare normalized class filters once
-        normalized_classes_req = {self._normalize_class_name(c) for c in options.classes} if options.classes else None
-        normalized_classes_excl = {self._normalize_class_name(c) for c in options.exclude_classes} if options.exclude_classes else set()
+        normalized_classes_req = (
+            {self._normalize_class_name(c) for c in options.classes} if options.classes else None
+        )
+        normalized_classes_excl = (
+            {self._normalize_class_name(c) for c in options.exclude_classes}
+            if options.exclude_classes
+            else set()
+        )
         for region in result:
             try:
-                region_type_orig = region.get('type', 'unknown')
+                region_type_orig = region.get("type", "unknown")
                 # Handle potential list returns for type (seen in some versions)
                 if isinstance(region_type_orig, list):
-                     region_type_orig = region_type_orig[0] if region_type_orig else 'unknown'
+                    region_type_orig = region_type_orig[0] if region_type_orig else "unknown"
                 region_type = region_type_orig.lower()
                 normalized_class = self._normalize_class_name(region_type)
                 # Apply class filtering
-                if normalized_classes_req and normalized_class not in normalized_classes_req: continue
-                if normalized_class in normalized_classes_excl: continue
+                if normalized_classes_req and normalized_class not in normalized_classes_req:
+                    continue
+                if normalized_class in normalized_classes_excl:
+                    continue
                 # PP-Structure results don't always have confidence, use threshold or default
-                confidence_score = region.get('score', 1.0) # Default to 1.0 if missing
-                if confidence_score < options.confidence: continue
+                confidence_score = region.get("score", 1.0)  # Default to 1.0 if missing
+                if confidence_score < options.confidence:
+                    continue
-                bbox = region.get('bbox')
+                bbox = region.get("bbox")
                 if not bbox or len(bbox) != 4:
-                     self.logger.warning(f"Skipping region with invalid bbox: {region}")
-                     continue
+                    self.logger.warning(f"Skipping region with invalid bbox: {region}")
+                    continue
                 x_min, y_min, x_max, y_max = map(float, bbox)
                 # Add detection
                 detection_data = {
-                    'bbox': (x_min, y_min, x_max, y_max),
-                    'class': region_type_orig, # Keep original case if needed
-                    'confidence': confidence_score,
-                    'normalized_class': normalized_class,
-                    'source': 'layout',
-                    'model': 'paddle'
+                    "bbox": (x_min, y_min, x_max, y_max),
+                    "class": region_type_orig,  # Keep original case if needed
+                    "confidence": confidence_score,
+                    "normalized_class": normalized_class,
+                    "source": "layout",
+                    "model": "paddle",
                 }
                 detections.append(detection_data)
                 # --- Process Table Cells (if enabled and present) ---
-                if region_type == 'table' and options.enable_table and 'res' in region:
-                    process_cells = (normalized_classes_req is None or 'table-cell' in normalized_classes_req) and \
-                                    ('table-cell' not in normalized_classes_excl)
-                    if process_cells and isinstance(region['res'], list): # V2 structure
-                         for cell in region['res']:
-                              if 'box' not in cell or len(cell['box']) != 4: continue
-                              cell_bbox = cell['box']
-                              cell_x_min, cell_y_min, cell_x_max, cell_y_max = map(float, cell_bbox)
-                              # Add cell detection (confidence often not available per cell)
-                              detections.append({
-                                  'bbox': (cell_x_min, cell_y_min, cell_x_max, cell_y_max),
-                                  'class': 'table cell', # Standardize name
-                                  'confidence': confidence_score * 0.95, # Inherit table confidence (slightly reduced)
-                                  'normalized_class': 'table-cell',
-                                  'text': cell.get('text', ''), # Include text if available
-                                  'source': 'layout', 'model': 'paddle'
-                              })
-                    elif process_cells and isinstance(region['res'], dict) and 'cells' in region['res']: # Older structure
-                         # Handle older 'cells' list if needed (logic from original file)
-                         pass # Add logic based on original paddle.txt if supporting older PP-Structure
+                if region_type == "table" and options.enable_table and "res" in region:
+                    process_cells = (
+                        normalized_classes_req is None or "table-cell" in normalized_classes_req
+                    ) and ("table-cell" not in normalized_classes_excl)
+                    if process_cells and isinstance(region["res"], list):  # V2 structure
+                        for cell in region["res"]:
+                            if "box" not in cell or len(cell["box"]) != 4:
+                                continue
+                            cell_bbox = cell["box"]
+                            cell_x_min, cell_y_min, cell_x_max, cell_y_max = map(float, cell_bbox)
+                            # Add cell detection (confidence often not available per cell)
+                            detections.append(
+                                {
+                                    "bbox": (cell_x_min, cell_y_min, cell_x_max, cell_y_max),
+                                    "class": "table cell",  # Standardize name
+                                    "confidence": confidence_score
+                                    * 0.95,  # Inherit table confidence (slightly reduced)
+                                    "normalized_class": "table-cell",
+                                    "text": cell.get("text", ""),  # Include text if available
+                                    "source": "layout",
+                                    "model": "paddle",
+                                }
+                            )
+                    elif (
+                        process_cells
+                        and isinstance(region["res"], dict)
+                        and "cells" in region["res"]
+                    ):  # Older structure
+                        # Handle older 'cells' list if needed (logic from original file)
+                        pass  # Add logic based on original paddle.txt if supporting older PP-Structure
             except (TypeError, KeyError, IndexError, ValueError) as e:
                 self.logger.warning(f"Error processing Paddle region: {region}. Error: {e}")
@@ -224,17 +281,17 @@ class PaddleLayoutDetector(LayoutDetector):
         # --- Optional: Add Text Boxes from separate OCR run ---
         if options.detect_text:
-             # This requires another model instance (PaddleOCR) and adds complexity.
-             # Consider if this is truly needed or if layout regions are sufficient.
-             # If needed, implement similar to original paddle.txt:
-             # - Instantiate PaddleOCR (potentially cache separately)
-             # - Run ocr(img_path, det=True, rec=False)
-             # - Process results, adding 'text' class detections
-             self.logger.info("Paddle detect_text=True: Running separate OCR text detection...")
-             # (Implementation omitted for brevity - requires PaddleOCR instance)
-             pass
-        self.logger.info(f"PaddleLayout detected {len(detections)} layout elements matching criteria.")
+            # This requires another model instance (PaddleOCR) and adds complexity.
+            # Consider if this is truly needed or if layout regions are sufficient.
+            # If needed, implement similar to original paddle.txt:
+            # - Instantiate PaddleOCR (potentially cache separately)
+            # - Run ocr(img_path, det=True, rec=False)
+            # - Process results, adding 'text' class detections
+            self.logger.info("Paddle detect_text=True: Running separate OCR text detection...")
+            # (Implementation omitted for brevity - requires PaddleOCR instance)
+            pass
+        self.logger.info(
+            f"PaddleLayout detected {len(detections)} layout elements matching criteria."
+        )
         return detections

natural-pdf 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

natural-pdf 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl