PyPI - natural-pdf - Versions diffs - 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl - Mend

natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (132) hide show

docs/api/index.md +386 -0
docs/assets/favicon.png +3 -0
docs/assets/favicon.svg +3 -0
docs/assets/javascripts/custom.js +17 -0
docs/assets/logo.svg +3 -0
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +17 -0
docs/assets/social-preview.svg +17 -0
docs/assets/stylesheets/custom.css +65 -0
docs/document-qa/index.ipynb +435 -0
docs/document-qa/index.md +79 -0
docs/element-selection/index.ipynb +915 -0
docs/element-selection/index.md +229 -0
docs/index.md +170 -0
docs/installation/index.md +69 -0
docs/interactive-widget/index.ipynb +962 -0
docs/interactive-widget/index.md +12 -0
docs/layout-analysis/index.ipynb +818 -0
docs/layout-analysis/index.md +185 -0
docs/ocr/index.md +222 -0
docs/pdf-navigation/index.ipynb +314 -0
docs/pdf-navigation/index.md +97 -0
docs/regions/index.ipynb +816 -0
docs/regions/index.md +294 -0
docs/tables/index.ipynb +658 -0
docs/tables/index.md +144 -0
docs/text-analysis/index.ipynb +370 -0
docs/text-analysis/index.md +105 -0
docs/text-extraction/index.ipynb +1478 -0
docs/text-extraction/index.md +292 -0
docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
docs/tutorials/01-loading-and-extraction.md +95 -0
docs/tutorials/02-finding-elements.ipynb +340 -0
docs/tutorials/02-finding-elements.md +149 -0
docs/tutorials/03-extracting-blocks.ipynb +147 -0
docs/tutorials/03-extracting-blocks.md +48 -0
docs/tutorials/04-table-extraction.ipynb +114 -0
docs/tutorials/04-table-extraction.md +50 -0
docs/tutorials/05-excluding-content.ipynb +270 -0
docs/tutorials/05-excluding-content.md +109 -0
docs/tutorials/06-document-qa.ipynb +332 -0
docs/tutorials/06-document-qa.md +91 -0
docs/tutorials/07-layout-analysis.ipynb +260 -0
docs/tutorials/07-layout-analysis.md +66 -0
docs/tutorials/07-working-with-regions.ipynb +409 -0
docs/tutorials/07-working-with-regions.md +151 -0
docs/tutorials/08-spatial-navigation.ipynb +508 -0
docs/tutorials/08-spatial-navigation.md +190 -0
docs/tutorials/09-section-extraction.ipynb +2434 -0
docs/tutorials/09-section-extraction.md +256 -0
docs/tutorials/10-form-field-extraction.ipynb +484 -0
docs/tutorials/10-form-field-extraction.md +201 -0
docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
docs/tutorials/11-enhanced-table-processing.md +9 -0
docs/tutorials/12-ocr-integration.ipynb +586 -0
docs/tutorials/12-ocr-integration.md +188 -0
docs/tutorials/13-semantic-search.ipynb +1888 -0
docs/tutorials/13-semantic-search.md +77 -0
docs/visual-debugging/index.ipynb +2970 -0
docs/visual-debugging/index.md +157 -0
docs/visual-debugging/region.png +0 -0
natural_pdf/__init__.py +39 -20
natural_pdf/analyzers/__init__.py +2 -1
natural_pdf/analyzers/layout/base.py +32 -24
natural_pdf/analyzers/layout/docling.py +131 -72
natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
natural_pdf/analyzers/layout/layout_manager.py +98 -58
natural_pdf/analyzers/layout/layout_options.py +32 -17
natural_pdf/analyzers/layout/paddle.py +152 -95
natural_pdf/analyzers/layout/surya.py +164 -92
natural_pdf/analyzers/layout/tatr.py +149 -84
natural_pdf/analyzers/layout/yolo.py +84 -44
natural_pdf/analyzers/text_options.py +22 -15
natural_pdf/analyzers/text_structure.py +131 -85
natural_pdf/analyzers/utils.py +30 -23
natural_pdf/collections/pdf_collection.py +126 -98
natural_pdf/core/__init__.py +1 -1
natural_pdf/core/element_manager.py +416 -337
natural_pdf/core/highlighting_service.py +268 -196
natural_pdf/core/page.py +910 -516
natural_pdf/core/pdf.py +387 -289
natural_pdf/elements/__init__.py +1 -1
natural_pdf/elements/base.py +302 -214
natural_pdf/elements/collections.py +714 -514
natural_pdf/elements/line.py +39 -36
natural_pdf/elements/rect.py +32 -30
natural_pdf/elements/region.py +854 -883
natural_pdf/elements/text.py +122 -99
natural_pdf/exporters/__init__.py +0 -1
natural_pdf/exporters/searchable_pdf.py +261 -102
natural_pdf/ocr/__init__.py +23 -14
natural_pdf/ocr/engine.py +17 -8
natural_pdf/ocr/engine_easyocr.py +63 -47
natural_pdf/ocr/engine_paddle.py +97 -68
natural_pdf/ocr/engine_surya.py +54 -44
natural_pdf/ocr/ocr_manager.py +88 -62
natural_pdf/ocr/ocr_options.py +16 -10
natural_pdf/qa/__init__.py +1 -1
natural_pdf/qa/document_qa.py +119 -111
natural_pdf/search/__init__.py +37 -31
natural_pdf/search/haystack_search_service.py +312 -189
natural_pdf/search/haystack_utils.py +186 -122
natural_pdf/search/search_options.py +25 -14
natural_pdf/search/search_service_protocol.py +12 -6
natural_pdf/search/searchable_mixin.py +261 -176
natural_pdf/selectors/__init__.py +2 -1
natural_pdf/selectors/parser.py +159 -316
natural_pdf/templates/__init__.py +1 -1
natural_pdf/utils/highlighting.py +8 -2
natural_pdf/utils/reading_order.py +65 -63
natural_pdf/utils/text_extraction.py +195 -0
natural_pdf/utils/visualization.py +70 -61
natural_pdf/widgets/__init__.py +2 -3
natural_pdf/widgets/viewer.py +749 -718
{natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
natural_pdf-0.1.5.dist-info/RECORD +134 -0
natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
notebooks/Examples.ipynb +1293 -0
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +543 -0
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
tests/test_loading.py +50 -0
tests/test_optional_deps.py +298 -0
natural_pdf-0.1.3.dist-info/RECORD +0 -61
natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
{natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0

natural_pdf/analyzers/layout/docling.py CHANGED Viewed

@@ -1,24 +1,38 @@
 # layout_detector_docling.py
-import logging
 import importlib.util
+import logging
 import os
 import tempfile
-from typing import List, Dict, Any, Optional
+from typing import Any, Dict, List, Optional
 from PIL import Image
 # Assuming base class and options are importable
 try:
     from .base import LayoutDetector
-    from .layout_options import DoclingLayoutOptions, BaseLayoutOptions
+    from .layout_options import BaseLayoutOptions, DoclingLayoutOptions
 except ImportError:
     # Placeholders if run standalone or imports fail
-    class BaseLayoutOptions: pass
-    class DoclingLayoutOptions(BaseLayoutOptions): pass
+    class BaseLayoutOptions:
+        pass
+    class DoclingLayoutOptions(BaseLayoutOptions):
+        pass
     class LayoutDetector:
-         def __init__(self): self.logger=logging.getLogger(); self.supported_classes=set()
-         def _get_model(self, options): raise NotImplementedError
-         def _normalize_class_name(self, n): return n
-         def validate_classes(self, c): pass
+        def __init__(self):
+            self.logger = logging.getLogger()
+            self.supported_classes = set()
+        def _get_model(self, options):
+            raise NotImplementedError
+        def _normalize_class_name(self, n):
+            return n
+        def validate_classes(self, c):
+            pass
     logging.basicConfig()
 logger = logging.getLogger(__name__)
@@ -42,11 +56,27 @@ class DoclingLayoutDetector(LayoutDetector):
         super().__init__()
         # Docling classes are dynamic/hierarchical, define common ones
         self.supported_classes = {
-            'Header', 'Footer', 'Paragraph', 'Heading', 'List', 'ListItem',
-            'Table', 'Figure', 'Caption', 'Footnote', 'PageNumber', 'Equation',
-            'Code', 'Title', 'Author', 'Abstract', 'Section', 'Unknown', 'Metadata' # Add more as needed
+            "Header",
+            "Footer",
+            "Paragraph",
+            "Heading",
+            "List",
+            "ListItem",
+            "Table",
+            "Figure",
+            "Caption",
+            "Footnote",
+            "PageNumber",
+            "Equation",
+            "Code",
+            "Title",
+            "Author",
+            "Abstract",
+            "Section",
+            "Unknown",
+            "Metadata",  # Add more as needed
         }
-        self._docling_document_cache = {} # Cache the output doc per image/options if needed
+        self._docling_document_cache = {}  # Cache the output doc per image/options if needed
     def is_available(self) -> bool:
         """Check if docling is installed."""
@@ -55,9 +85,9 @@ class DoclingLayoutDetector(LayoutDetector):
     def _get_cache_key(self, options: BaseLayoutOptions) -> str:
         """Generate cache key based on device and potentially converter args."""
         if not isinstance(options, DoclingLayoutOptions):
-             options = DoclingLayoutOptions(device=options.device, extra_args=options.extra_args)
+            options = DoclingLayoutOptions(device=options.device, extra_args=options.extra_args)
-        device_key = str(options.device).lower() if options.device else 'default_device'
+        device_key = str(options.device).lower() if options.device else "default_device"
         # Include hash of extra_args if they affect model loading/converter init
         extra_args_key = hash(frozenset(options.extra_args.items()))
         return f"{self.__class__.__name__}_{device_key}_{extra_args_key}"
@@ -88,12 +118,17 @@ class DoclingLayoutDetector(LayoutDetector):
             raise RuntimeError("Docling dependency not installed.")
         if not isinstance(options, DoclingLayoutOptions):
-             self.logger.warning("Received BaseLayoutOptions, expected DoclingLayoutOptions. Using defaults.")
-             options = DoclingLayoutOptions(
-                 confidence=options.confidence, classes=options.classes,
-                 exclude_classes=options.exclude_classes, device=options.device,
-                 extra_args=options.extra_args, verbose=options.extra_args.get('verbose', False)
-             )
+            self.logger.warning(
+                "Received BaseLayoutOptions, expected DoclingLayoutOptions. Using defaults."
+            )
+            options = DoclingLayoutOptions(
+                confidence=options.confidence,
+                classes=options.classes,
+                exclude_classes=options.exclude_classes,
+                device=options.device,
+                extra_args=options.extra_args,
+                verbose=options.extra_args.get("verbose", False),
+            )
         # Validate classes before proceeding (note: Docling classes are case-sensitive)
         # self.validate_classes(options.classes or []) # Validation might be tricky due to case sensitivity
@@ -105,18 +140,20 @@ class DoclingLayoutDetector(LayoutDetector):
         # Docling convert method requires an image path. Save temp file.
         detections = []
-        docling_doc = None # To store the result
+        docling_doc = None  # To store the result
         with tempfile.TemporaryDirectory() as temp_dir:
             temp_image_path = os.path.join(temp_dir, f"docling_input_{os.getpid()}.png")
             try:
-                self.logger.debug(f"Saving temporary image for Docling detector to: {temp_image_path}")
-                image.convert("RGB").save(temp_image_path) # Ensure RGB
+                self.logger.debug(
+                    f"Saving temporary image for Docling detector to: {temp_image_path}"
+                )
+                image.convert("RGB").save(temp_image_path)  # Ensure RGB
                 # Convert the document using Docling's DocumentConverter
                 self.logger.debug("Running Docling conversion...")
                 # Docling convert returns a Result object with a 'document' attribute
                 result = converter.convert(temp_image_path)
-                docling_doc = result.document # Store the DoclingDocument
+                docling_doc = result.document  # Store the DoclingDocument
                 self.logger.info(f"Docling conversion complete.")
                 # Convert Docling document to our detection format
@@ -124,12 +161,14 @@ class DoclingLayoutDetector(LayoutDetector):
             except Exception as e:
                 self.logger.error(f"Error during Docling detection: {e}", exc_info=True)
-                raise # Re-raise the exception
+                raise  # Re-raise the exception
             finally:
-                 # Ensure temp file is removed
-                 if os.path.exists(temp_image_path):
-                      try: os.remove(temp_image_path)
-                      except OSError as e_rm: self.logger.warning(f"Could not remove temp file {temp_image_path}: {e_rm}")
+                # Ensure temp file is removed
+                if os.path.exists(temp_image_path):
+                    try:
+                        os.remove(temp_image_path)
+                    except OSError as e_rm:
+                        self.logger.warning(f"Could not remove temp file {temp_image_path}: {e_rm}")
         # Cache the docling document if needed elsewhere (maybe associate with page?)
         # self._docling_document_cache[image_hash] = docling_doc # Needs a way to key this
@@ -137,26 +176,37 @@ class DoclingLayoutDetector(LayoutDetector):
         self.logger.info(f"Docling detected {len(detections)} layout elements matching criteria.")
         return detections
-    def _convert_docling_to_detections(self, doc, options: DoclingLayoutOptions) -> List[Dict[str, Any]]:
+    def _convert_docling_to_detections(
+        self, doc, options: DoclingLayoutOptions
+    ) -> List[Dict[str, Any]]:
         """Convert a Docling document to our standard detection format."""
-        if not doc or not hasattr(doc, 'pages') or not doc.pages:
+        if not doc or not hasattr(doc, "pages") or not doc.pages:
             self.logger.warning("Invalid or empty Docling document for conversion.")
             return []
         detections = []
-        id_to_detection_index = {} # Map Docling ID to index in detections list
+        id_to_detection_index = {}  # Map Docling ID to index in detections list
         # Prepare normalized class filters once
-        normalized_classes_req = {self._normalize_class_name(c) for c in options.classes} if options.classes else None
-        normalized_classes_excl = {self._normalize_class_name(c) for c in options.exclude_classes} if options.exclude_classes else set()
+        normalized_classes_req = (
+            {self._normalize_class_name(c) for c in options.classes} if options.classes else None
+        )
+        normalized_classes_excl = (
+            {self._normalize_class_name(c) for c in options.exclude_classes}
+            if options.exclude_classes
+            else set()
+        )
         # --- Iterate through elements using Docling's structure ---
         # This requires traversing the hierarchy (e.g., doc.body.children)
         # or iterating through specific lists like doc.texts, doc.tables etc.
         elements_to_process = []
-        if hasattr(doc, 'texts'): elements_to_process.extend(doc.texts)
-        if hasattr(doc, 'tables'): elements_to_process.extend(doc.tables)
-        if hasattr(doc, 'pictures'): elements_to_process.extend(doc.pictures)
+        if hasattr(doc, "texts"):
+            elements_to_process.extend(doc.texts)
+        if hasattr(doc, "tables"):
+            elements_to_process.extend(doc.tables)
+        if hasattr(doc, "pictures"):
+            elements_to_process.extend(doc.pictures)
         # Add other element types from DoclingDocument as needed
         self.logger.debug(f"Converting {len(elements_to_process)} Docling elements...")
@@ -164,16 +214,19 @@ class DoclingLayoutDetector(LayoutDetector):
         for elem in elements_to_process:
             try:
                 # Get Provenance (bbox and page number)
-                if not hasattr(elem, 'prov') or not elem.prov: continue
-                prov = elem.prov[0] # Use first provenance
-                if not hasattr(prov, 'bbox') or not prov.bbox: continue
+                if not hasattr(elem, "prov") or not elem.prov:
+                    continue
+                prov = elem.prov[0]  # Use first provenance
+                if not hasattr(prov, "bbox") or not prov.bbox:
+                    continue
                 bbox = prov.bbox
                 page_no = prov.page_no
                 # Get Page Dimensions (crucial for coordinate conversion)
-                if not hasattr(doc.pages.get(page_no), 'size'): continue
+                if not hasattr(doc.pages.get(page_no), "size"):
+                    continue
                 page_height = doc.pages[page_no].size.height
-                page_width = doc.pages[page_no].size.width # Needed? Bbox seems absolute
+                page_width = doc.pages[page_no].size.width  # Needed? Bbox seems absolute
                 # Convert coordinates from Docling's system (often bottom-left origin)
                 # to standard top-left origin (0,0 at top-left)
@@ -182,46 +235,51 @@ class DoclingLayoutDetector(LayoutDetector):
                 x1 = float(bbox.r)
                 # Convert y: top_y = page_height - bottom_left_t
                 #            bottom_y = page_height - bottom_left_b
-                y0 = float(page_height - bbox.t) # Top y
-                y1 = float(page_height - bbox.b) # Bottom y
+                y0 = float(page_height - bbox.t)  # Top y
+                y1 = float(page_height - bbox.b)  # Bottom y
                 # Ensure y0 < y1
-                if y0 > y1: y0, y1 = y1, y0
+                if y0 > y1:
+                    y0, y1 = y1, y0
                 # Ensure x0 < x1
-                if x0 > x1: x0, x1 = x1, x0
+                if x0 > x1:
+                    x0, x1 = x1, x0
                 # Get Class Label
-                label_orig = str(getattr(elem, 'label', 'Unknown')) # Default if no label
+                label_orig = str(getattr(elem, "label", "Unknown"))  # Default if no label
                 normalized_label = self._normalize_class_name(label_orig)
                 # Apply Class Filtering
-                if normalized_classes_req and normalized_label not in normalized_classes_req: continue
-                if normalized_label in normalized_classes_excl: continue
+                if normalized_classes_req and normalized_label not in normalized_classes_req:
+                    continue
+                if normalized_label in normalized_classes_excl:
+                    continue
                 # Get Confidence (Docling often doesn't provide per-element confidence)
-                confidence = getattr(elem, 'confidence', 0.95) # Assign default confidence
-                if confidence < options.confidence: continue # Apply confidence threshold
+                confidence = getattr(elem, "confidence", 0.95)  # Assign default confidence
+                if confidence < options.confidence:
+                    continue  # Apply confidence threshold
                 # Get Text Content
-                text_content = getattr(elem, 'text', None)
+                text_content = getattr(elem, "text", None)
                 # Get IDs for hierarchy
-                docling_id = getattr(elem, 'self_ref', None)
-                parent_id_obj = getattr(elem, 'parent', None)
-                parent_id = getattr(parent_id_obj, 'self_ref', None) if parent_id_obj else None
+                docling_id = getattr(elem, "self_ref", None)
+                parent_id_obj = getattr(elem, "parent", None)
+                parent_id = getattr(parent_id_obj, "self_ref", None) if parent_id_obj else None
                 # Create Detection Dictionary
                 detection = {
-                    'bbox': (x0, y0, x1, y1),
-                    'class': label_orig,
-                    'normalized_class': normalized_label,
-                    'confidence': confidence,
-                    'text': text_content,
-                    'docling_id': docling_id,
-                    'parent_id': parent_id,
-                    'page_number': page_no, # Add page number if useful
-                    'source': 'layout',
-                    'model': 'docling'
+                    "bbox": (x0, y0, x1, y1),
+                    "class": label_orig,
+                    "normalized_class": normalized_label,
+                    "confidence": confidence,
+                    "text": text_content,
+                    "docling_id": docling_id,
+                    "parent_id": parent_id,
+                    "page_number": page_no,  # Add page number if useful
+                    "source": "layout",
+                    "model": "docling",
                 }
                 detections.append(detection)
@@ -229,8 +287,8 @@ class DoclingLayoutDetector(LayoutDetector):
                 # if docling_id: id_to_detection_index[docling_id] = len(detections) - 1
             except Exception as conv_e:
-                 self.logger.warning(f"Could not convert Docling element: {elem}. Error: {conv_e}")
-                 continue
+                self.logger.warning(f"Could not convert Docling element: {elem}. Error: {conv_e}")
+                continue
         return detections
@@ -241,7 +299,8 @@ class DoclingLayoutDetector(LayoutDetector):
         """
         # This requires caching the doc based on image/options or re-running.
         # For simplicity, let's just re-run detect if needed.
-        self.logger.warning("get_docling_document: Re-running detection to ensure document is generated.")
-        self.detect(image, options) # Run detect to populate internal doc
-        return getattr(self, '_docling_document', None) # Return the stored doc
+        self.logger.warning(
+            "get_docling_document: Re-running detection to ensure document is generated."
+        )
+        self.detect(image, options)  # Run detect to populate internal doc
+        return getattr(self, "_docling_document", None)  # Return the stored doc

natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl