natural-pdf 25.3.16.2__py3-none-any.whl → 25.3.17.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package as they appear in their public registries. It is provided for informational purposes only.
- examples/direct_qa_example.py +17 -111
- examples/docling_comprehensive_test.py +325 -0
- examples/docling_example.py +192 -0
- examples/docling_hierarchy_example.py +230 -0
- examples/docling_text_sources.py +241 -0
- examples/improved_qa_example.py +66 -0
- natural_pdf/analyzers/document_layout.py +276 -0
- natural_pdf/core/page.py +72 -21
- natural_pdf/core/pdf.py +50 -68
- natural_pdf/elements/region.py +174 -19
- natural_pdf/qa/document_qa.py +29 -38
- natural_pdf/selectors/parser.py +6 -2
- {natural_pdf-25.3.16.2.dist-info → natural_pdf-25.3.17.2.dist-info}/METADATA +19 -2
- {natural_pdf-25.3.16.2.dist-info → natural_pdf-25.3.17.2.dist-info}/RECORD +17 -12
- {natural_pdf-25.3.16.2.dist-info → natural_pdf-25.3.17.2.dist-info}/LICENSE +0 -0
- {natural_pdf-25.3.16.2.dist-info → natural_pdf-25.3.17.2.dist-info}/WHEEL +0 -0
- {natural_pdf-25.3.16.2.dist-info → natural_pdf-25.3.17.2.dist-info}/top_level.txt +0 -0
natural_pdf/analyzers/document_layout.py CHANGED

```diff
@@ -399,6 +399,276 @@ class TableTransformerDetector(LayoutDetector):
         return all_detections
 
 
+class DoclingLayoutDetector(LayoutDetector):
+    """
+    Document layout and text recognition using Docling.
+
+    Docling provides a hierarchical document understanding system that can analyze:
+    - Document structure (headers, text, figures, tables)
+    - Text content via integrated OCR
+    - Hierarchical relationships between document elements
+    """
+
+    def __init__(self, verbose=False, **kwargs):
+        """
+        Initialize the Docling document analyzer.
+
+        Args:
+            verbose: Whether to enable verbose logging
+            **kwargs: Additional parameters to pass to DocumentConverter
+        """
+        # Set up logger with optional verbose mode
+        import logging
+        self.logger = logging.getLogger("natural_pdf.analyzers.layout.docling")
+        self.original_level = self.logger.level
+        if verbose:
+            self.logger.setLevel(logging.DEBUG)
+
+        super().__init__()
+        self.verbose = verbose
+        self.converter_kwargs = kwargs
+        self._docling_document = None
+        self._converter = None
+
+    def __del__(self):
+        # Restore the original logging level when done
+        if hasattr(self, 'logger') and hasattr(self, 'original_level'):
+            self.logger.setLevel(self.original_level)
+
+    @property
+    def converter(self):
+        """Lazy-load the DocumentConverter on first use."""
+        if self._converter is None:
+            try:
+                from docling.document_converter import DocumentConverter
+                self.logger.debug("Initializing Docling DocumentConverter")
+                self._converter = DocumentConverter(**self.converter_kwargs)
+            except ImportError:
+                raise ImportError(
+                    "Docling integration requires docling. "
+                    "Install with: pip install docling"
+                )
+        return self._converter
+
+    def detect(self, image_path, confidence=0.5, classes=None, exclude_classes=None):
+        """
+        Detect document structure and text using Docling.
+
+        Args:
+            image_path: Path to the image or PDF to analyze
+            confidence: Minimum confidence threshold for detections (not used by Docling)
+            classes: Specific classes to detect (used for filtering)
+            exclude_classes: Classes to exclude from detection (used for filtering)
+
+        Returns:
+            List of detection dictionaries with hierarchical information
+        """
+        self.logger.info(f"Processing {image_path} with Docling")
+
+        try:
+            # Convert the document using Docling's DocumentConverter
+            result = self.converter.convert(image_path)
+            doc = result.document
+
+            # Store for later use
+            self._docling_document = doc
+            self.logger.info(f"Docling document created with {len(doc.body.children)} top-level elements")
+
+            # Convert Docling document to our detection format
+            detections = self._convert_docling_to_detections(doc, confidence, classes, exclude_classes)
+
+            return detections
+        except Exception as e:
+            self.logger.error(f"Error processing with Docling: {e}")
+            raise
+
+    def _convert_docling_to_detections(self, doc, confidence, classes, exclude_classes):
+        """
+        Convert a Docling document to our standard detection format.
+
+        Args:
+            doc: DoclingDocument object
+            confidence: Confidence threshold to apply (not used by Docling)
+            classes: Classes to include (if specified)
+            exclude_classes: Classes to exclude
+
+        Returns:
+            List of detection dictionaries with hierarchy information
+        """
+        if not doc or not hasattr(doc, 'body') or not hasattr(doc.body, 'children'):
+            self.logger.warning("Invalid or empty Docling document")
+            return []
+
+        detections = []
+        id_to_detection = {}  # Map from Docling ID to detection index
+
+        # Process text elements
+        if hasattr(doc, 'texts') and doc.texts:
+            self.logger.debug(f"Processing {len(doc.texts)} text elements")
+
+            # First pass: create detections for all text elements
+            for text_elem in doc.texts:
+                # Skip if no provenance information
+                if not hasattr(text_elem, 'prov') or not text_elem.prov:
+                    continue
+
+                # Get the bounding box
+                prov = text_elem.prov[0]  # Take first provenance entry
+                if not hasattr(prov, 'bbox') or not prov.bbox:
+                    continue
+
+                bbox = prov.bbox
+
+                page_height = doc.pages.get(prov.page_no).size.height if hasattr(doc, 'pages') else 792  # Default letter size
+                # Already in top-left coordinates
+                t = page_height - bbox.t
+                b = page_height - bbox.b
+
+                # Ensure top is always less than bottom for PIL coordinates
+                if t > b:
+                    t, b = b, t
+
+                # Get the label and normalize it
+                label = str(text_elem.label) if hasattr(text_elem, 'label') else 'text'
+                normalized_label = self._normalize_class_name(label)
+
+                # Skip if filtered by class
+                if classes and normalized_label not in classes:
+                    continue
+                if exclude_classes and normalized_label in exclude_classes:
+                    continue
+
+                # Create detection
+                detection = {
+                    'bbox': (bbox.l, t, bbox.r, b),
+                    'class': label,
+                    'normalized_class': normalized_label,
+                    'confidence': 0.95,  # Default confidence for Docling
+                    'text': text_elem.text if hasattr(text_elem, 'text') else None,
+                    'docling_id': text_elem.self_ref if hasattr(text_elem, 'self_ref') else None,
+                    'parent_id': text_elem.parent.self_ref if hasattr(text_elem, 'parent') and hasattr(text_elem.parent, 'self_ref') else None,
+                    'model': 'docling'
+                }
+
+                detections.append(detection)
+
+                # Track by ID for hierarchy reconstruction
+                if detection['docling_id']:
+                    id_to_detection[detection['docling_id']] = len(detections) - 1
+
+        # Process pictures if available
+        if hasattr(doc, 'pictures') and doc.pictures:
+            self.logger.debug(f"Processing {len(doc.pictures)} picture elements")
+
+            for pic_elem in doc.pictures:
+                # Skip if no provenance information
+                if not hasattr(pic_elem, 'prov') or not pic_elem.prov:
+                    continue
+
+                # Get the bounding box
+                prov = pic_elem.prov[0]  # Take first provenance entry
+                if not hasattr(prov, 'bbox') or not prov.bbox:
+                    continue
+
+                bbox = prov.bbox
+
+                page_height = doc.pages.get(prov.page_no).size.height if hasattr(doc, 'pages') else 792
+                # In BOTTOMLEFT system, bbox.t is distance from bottom (higher value = higher on page)
+                # In TOPLEFT system, we need distance from top (convert using page_height)
+                t = page_height - bbox.t  # Correct: Top is page_height minus the top in BOTTOMLEFT
+                b = page_height - bbox.b  # Correct: Bottom is page_height minus the bottom in BOTTOMLEFT
+
+                # Ensure top is always less than bottom for PIL coordinates
+                if t > b:
+                    t, b = b, t
+
+                label = 'figure'  # Default label for pictures
+                normalized_label = 'figure'
+
+                # Skip if filtered by class
+                if classes and normalized_label not in classes:
+                    continue
+                if exclude_classes and normalized_label in exclude_classes:
+                    continue
+
+                # Create detection
+                detection = {
+                    'bbox': (bbox.l, t, bbox.r, b),
+                    'class': label,
+                    'normalized_class': normalized_label,
+                    'confidence': 0.95,  # Default confidence
+                    'docling_id': pic_elem.self_ref if hasattr(pic_elem, 'self_ref') else None,
+                    'parent_id': pic_elem.parent.self_ref if hasattr(pic_elem, 'parent') and hasattr(pic_elem.parent, 'self_ref') else None,
+                    'model': 'docling'
+                }
+
+                detections.append(detection)
+
+                # Track by ID for hierarchy reconstruction
+                if detection['docling_id']:
+                    id_to_detection[detection['docling_id']] = len(detections) - 1
+
+        # Process tables if available
+        if hasattr(doc, 'tables') and doc.tables:
+            self.logger.debug(f"Processing {len(doc.tables)} table elements")
+
+            for table_elem in doc.tables:
+                # Skip if no provenance information
+                if not hasattr(table_elem, 'prov') or not table_elem.prov:
+                    continue
+
+                # Get the bounding box
+                prov = table_elem.prov[0]  # Take first provenance entry
+                if not hasattr(prov, 'bbox') or not prov.bbox:
+                    continue
+
+                bbox = prov.bbox
+
+                # Convert from bottom-left to top-left coordinates
+                page_height = doc.pages.get(prov.page_no).size.height if hasattr(doc, 'pages') else 792
+                # In BOTTOMLEFT system, bbox.t is distance from bottom (higher value = higher on page)
+                # In TOPLEFT system, we need distance from top (convert using page_height)
+                t = page_height - bbox.t  # Correct: Top is page_height minus the top in BOTTOMLEFT
+                b = page_height - bbox.b  # Correct: Bottom is page_height minus the bottom in BOTTOMLEFT
+
+                # Ensure top is always less than bottom for PIL coordinates
+                if t > b:
+                    t, b = b, t
+
+                label = 'table'  # Default label for tables
+                normalized_label = 'table'
+
+                # Skip if filtered by class
+                if classes and normalized_label not in classes:
+                    continue
+                if exclude_classes and normalized_label in exclude_classes:
+                    continue
+
+                # Create detection
+                detection = {
+                    'bbox': (bbox.l, t, bbox.r, b),
+                    'class': label,
+                    'normalized_class': normalized_label,
+                    'confidence': 0.95,  # Default confidence
+                    'docling_id': table_elem.self_ref if hasattr(table_elem, 'self_ref') else None,
+                    'parent_id': table_elem.parent.self_ref if hasattr(table_elem, 'parent') and hasattr(table_elem.parent, 'self_ref') else None,
+                    'model': 'docling'
+                }
+
+                detections.append(detection)
+
+                # Track by ID for hierarchy reconstruction
+                if detection['docling_id']:
+                    id_to_detection[detection['docling_id']] = len(detections) - 1
+
+        self.logger.info(f"Created {len(detections)} detections from Docling document")
+        return detections
+
+    def get_docling_document(self):
+        """Get the original Docling document for advanced usage."""
+        return self._docling_document
+
+
 class PaddleLayoutDetector(LayoutDetector):
     """
     Document layout and table structure detector using PaddlePaddle's PP-Structure.
```
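For orientation, a minimal sketch of driving the new detector directly. It assumes `docling` is installed (`pip install docling`, per the ImportError message above), and the file path is a placeholder; `DoclingLayoutDetector(verbose=...)`, `detect()`, the detection dict keys, and `get_docling_document()` all come from the added code.

```python
from natural_pdf.analyzers.document_layout import DoclingLayoutDetector

detector = DoclingLayoutDetector(verbose=True)

# "paper.pdf" is a placeholder; detect() takes a PDF or image path.
detections = detector.detect("paper.pdf")

for det in detections:
    # Each detection is a plain dict in the format built by
    # _convert_docling_to_detections() above.
    print(det['class'], det['bbox'], det.get('text'), det.get('parent_id'))

# The underlying DoclingDocument stays available for advanced usage.
docling_doc = detector.get_docling_document()
```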
```diff
@@ -708,6 +978,12 @@ def convert_to_regions(page: Any, detections: List[Dict[str, Any]],
         # Extract detection info
         x_min, y_min, x_max, y_max = det['bbox']
 
+        # Ensure coordinates are in proper order (min values are smaller)
+        if x_min > x_max:
+            x_min, x_max = x_max, x_min
+        if y_min > y_max:
+            y_min, y_max = y_max, y_min
+
         # Scale coordinates from image to PDF space
         if scale_factor != 1.0:
             x_min *= scale_factor
```
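The guard added here is plain min/max normalization, needed because Docling's bottom-left-origin boxes can arrive with swapped corners after conversion. A standalone sketch of the same idea:

```python
# Standalone illustration of the normalization added above: swap
# coordinates so (x_min, y_min) is always the top-left corner.
def normalize_bbox(bbox):
    x_min, y_min, x_max, y_max = bbox
    if x_min > x_max:
        x_min, x_max = x_max, x_min
    if y_min > y_max:
        y_min, y_max = y_max, y_min
    return (x_min, y_min, x_max, y_max)

assert normalize_bbox((10, 80, 5, 20)) == (5, 20, 10, 80)
```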
natural_pdf/core/page.py CHANGED

```diff
@@ -17,6 +17,7 @@ from natural_pdf.analyzers.document_layout import (
     YOLODocLayoutDetector,
     TableTransformerDetector,
     PaddleLayoutDetector,
+    DoclingLayoutDetector,
     convert_to_regions
 )
 from natural_pdf.utils.ocr import OCRManager
```
```diff
@@ -808,6 +809,8 @@ class Page:
             except (KeyError, AttributeError, TypeError):
                 pass
 
+            # Add source attribute for native text elements
+            c['source'] = 'native'
             chars.append(TextElement(c, self))
 
         # Create word-level text elements by grouping chars
```
```diff
@@ -872,6 +875,8 @@ class Page:
                     if attr in current_word[0]:
                         word_obj[attr] = current_word[0][attr]
 
+                # Add source attribute for native text elements
+                word_obj['source'] = 'native'
                 words.append(TextElement(word_obj, self))
                 current_word = []
                 continue
```
```diff
@@ -927,6 +932,8 @@ class Page:
                     if attr in current_word[0]:
                         word_obj[attr] = current_word[0][attr]
 
+                # Add source attribute for native text elements
+                word_obj['source'] = 'native'
                 words.append(TextElement(word_obj, self))
                 current_word = [char]
                 # If the gap between chars is larger than a threshold, it's a new word
```
```diff
@@ -965,6 +972,8 @@ class Page:
                     if attr in current_word[0]:
                         word_obj[attr] = current_word[0][attr]
 
+                # Add source attribute for native text elements
+                word_obj['source'] = 'native'
                 words.append(TextElement(word_obj, self))
                 current_word = [char]
             else:
```
```diff
@@ -1005,6 +1014,8 @@ class Page:
                 if attr in current_word[0]:
                     word_obj[attr] = current_word[0][attr]
 
+            # Add source attribute for native text elements
+            word_obj['source'] = 'native'
             words.append(TextElement(word_obj, self))
 
         line_groups.extend(words)
```
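All four word-building paths now stamp native text with `source='native'`, and detected layout regions are stamped `source='detected'` further down. Combined with the `selectors/parser.py` change in this release, the source is presumably filterable from selectors. A hedged sketch; the `[source=...]` attribute syntax and the path are assumptions:

```python
from natural_pdf import PDF

page = PDF("doc.pdf").pages[0]  # placeholder path

# Assumed selector syntax: filter elements by the new source attribute.
native_words = page.find_all('text[source=native]')        # text from the PDF itself
detected_regions = page.find_all('region[source=detected]')  # layout-model regions
```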
```diff
@@ -1853,7 +1864,7 @@ class Page:
         return elements
 
     def analyze_layout(self,
-                      model: str = "
+                      model: str = "docling",
                       confidence: float = 0.2,
                       classes: Optional[List[str]] = None,
                       exclude_classes: Optional[List[str]] = None,
```
```diff
@@ -1868,7 +1879,7 @@ class Page:
         Analyze the page layout using a machine learning model.
 
         Args:
-            model: Model type to use ('yolo', 'tatr', or '
+            model: Model type to use ('yolo', 'tatr', 'paddle', or 'docling')
             confidence: Minimum confidence threshold for detections
             classes: Specific classes to detect (None for all supported classes)
             exclude_classes: Classes to exclude from detection
```
```diff
@@ -1878,6 +1889,7 @@ class Page:
                 - YOLO: {"model_path": "...", "image_size": 1024}
                 - TATR: {"model_path": "...", "create_cells": False}
                 - Paddle: {"lang": "en", "use_angle_cls": False, "enable_table": True}
+                - Docling: {"model_name": "ds4sd/SmolDocling-256M-preview", "prompt_text": "...", "verbose": False}
             model_path: (Legacy) Optional path to custom model file
             image_size: (Legacy) Size to resize the image to before detection (YOLO only)
             create_cells: (Legacy) Whether to create cell regions for TATR table regions
```
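Putting the new default model and the documented `model_params` together, a call might look like the sketch below. The path is a placeholder, and the return value of `analyze_layout` is not shown in this diff, so it is not relied on here:

```python
from natural_pdf import PDF

page = PDF("doc.pdf").pages[0]  # placeholder path

# "docling" is now the default model; model_params are forwarded to
# Docling's DocumentConverter, with "verbose" peeled off for the detector.
page.analyze_layout(
    model="docling",
    confidence=0.2,
    model_params={"verbose": True},
)

# The raw DoclingDocument is stored on the page for advanced usage.
docling_doc = page.docling_document
```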
```diff
@@ -1969,8 +1981,32 @@ class Page:
                 exclude_classes=exclude_classes
             )
 
+        elif model.lower() == "docling":
+            # Extract Docling-specific parameters
+            verbose = model_params.get('verbose', False)
+
+            # Pass all other model_params directly to DocumentConverter
+            detector_kwargs = {k: v for k, v in model_params.items() if k != 'verbose'}
+
+            # Initialize DoclingLayoutDetector
+            detector = DoclingLayoutDetector(
+                verbose=verbose,
+                **detector_kwargs
+            )
+
+            # Run detection
+            detections = detector.detect(
+                temp_image_path,
+                confidence=confidence,
+                classes=classes,
+                exclude_classes=exclude_classes
+            )
+
+            # Store the original Docling document for advanced usage
+            self.docling_document = detector.get_docling_document()
+
         else:
-            raise ValueError(f"Unsupported model type: {model}. Currently supported: 'yolo', 'tatr', 'paddle'")
+            raise ValueError(f"Unsupported model type: {model}. Currently supported: 'yolo', 'tatr', 'paddle', 'docling'")
 
         # Calculate the scale factor to convert from image to PDF coordinates
         # Note: This assumes the image resolution is 150 DPI
```
```diff
@@ -1981,6 +2017,9 @@ class Page:
         layout_regions = []
 
         # Convert detections to regions
+        # First create all regions and track by docling_id if available
+        docling_id_to_region = {}
+
         for detection in detections:
             x_min, y_min, x_max, y_max = detection['bbox']
 
```
```diff
@@ -1998,7 +2037,30 @@ class Page:
             region.model = model  # Store which model detected this region
             region.source = 'detected'  # Set the source for selectors
 
+            # If this is a Docling detection, include text content
+            if model.lower() == 'docling':
+                if 'text' in detection:
+                    region.text_content = detection.get('text')
+
+                # Track by docling_id for building hierarchy later
+                if 'docling_id' in detection:
+                    region.docling_id = detection['docling_id']
+                    docling_id_to_region[detection['docling_id']] = region
+
+                # Store parent ID for hierarchy building
+                if 'parent_id' in detection:
+                    region.parent_id = detection.get('parent_id')
+
             layout_regions.append(region)
+
+        # If using Docling model, build parent-child relationships
+        if model.lower() == 'docling':
+            # Second pass to establish parent-child relationships
+            for region in layout_regions:
+                if hasattr(region, 'parent_id') and region.parent_id:
+                    parent_region = docling_id_to_region.get(region.parent_id)
+                    if parent_region:
+                        parent_region.add_child(region)
 
         # Handle existing regions based on mode
         if existing.lower() == 'append':
```
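The two passes above (create regions, then link children to parents through `docling_id`/`parent_id`) can be illustrated on plain detection dicts, independent of the Region class; the `#/texts/...` ids below are illustrative:

```python
# Re-illustration of the two-pass hierarchy linking over plain dicts
# in the detection format produced by DoclingLayoutDetector.
detections = [
    {'docling_id': '#/texts/0', 'parent_id': None, 'class': 'section_header'},
    {'docling_id': '#/texts/1', 'parent_id': '#/texts/0', 'class': 'text'},
]

# First pass: index every detection by its id.
by_id = {d['docling_id']: d for d in detections if d['docling_id']}

# Second pass: attach each detection to its parent, if the parent exists.
for det in detections:
    parent = by_id.get(det.get('parent_id'))
    if parent is not None:
        parent.setdefault('children', []).append(det)

assert by_id['#/texts/0']['children'][0]['class'] == 'text'
```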
```diff
@@ -2356,21 +2418,10 @@ class Page:
                 "source_elements": list of elements that contain the answer (if found)
             }
         """
-
-
-
-
-
-
-
-            return qa_engine.ask_pdf_page(self, question, min_confidence=min_confidence, debug=debug, **kwargs)
-        except ImportError as e:
-            import logging
-            logger = logging.getLogger("natural_pdf.core.page")
-            logger.warning(f"QA functionality not available: {e}")
-            return {
-                "answer": "",
-                "confidence": 0.0,
-                "error": "QA functionality not available",
-                "found": False
-            }
+        from natural_pdf.qa.document_qa import get_qa_engine
+
+        # Get or initialize QA engine with specified model
+        qa_engine = get_qa_engine(model_name=model) if model else get_qa_engine()
+
+        # Ask the question using the QA engine
+        return qa_engine.ask_pdf_page(self, question, min_confidence=min_confidence, debug=debug, **kwargs)
```
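With the defensive wrapper gone, page-level QA is a single call into the shared engine. A usage sketch; the method name `ask` and the path are assumptions, while the result keys checked (`found`, `answer`, `confidence`) come from the docstring above:

```python
from natural_pdf import PDF

page = PDF("report.pdf").pages[0]  # placeholder path

result = page.ask("What is the invoice total?", min_confidence=0.1)
if result.get("found"):
    print(result["answer"], result["confidence"])
```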
natural_pdf/core/pdf.py CHANGED

```diff
@@ -110,8 +110,13 @@ class PDF:
         if HAS_OCR_ENGINES:
             # Handle OCR engine selection
             if ocr_engine is None:
-                # Use default engine (
-
+                # Use default engine (PaddleOCR)
+                try:
+                    self._ocr_engine = PaddleOCREngine()
+                except (ImportError, ValueError) as e:
+                    logger.warning(f"PaddleOCR engine could not be loaded: {e}")
+                    logger.warning("Falling back to EasyOCR engine.")
+                    self._ocr_engine = EasyOCREngine()
             elif isinstance(ocr_engine, str):
                 # String-based engine selection
                 try:
```
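The constructor now prefers PaddleOCR and degrades gracefully to EasyOCR when it cannot be imported or initialized. The string-based branch in the surrounding context suggests an engine can still be pinned explicitly; a hedged sketch, where the accepted engine name strings (e.g. "easyocr") and the path are assumptions:

```python
from natural_pdf import PDF

# Default behaviour per the change above: PaddleOCR if loadable,
# otherwise a logged fallback to EasyOCR.
pdf = PDF("scan.pdf")  # placeholder path

# Explicit string-based selection (branch shown in context above);
# the exact accepted names are an assumption here.
pdf_easy = PDF("scan.pdf", ocr_engine="easyocr")
```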
```diff
@@ -519,74 +524,51 @@ class PDF:
             **kwargs: Additional parameters passed to the QA engine
 
         Returns:
-
+            A dictionary containing the answer, confidence, and other metadata.
+            Result will have an 'answer' key containing the answer text.
         """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            page
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            else:
-                return {
-                    "answer": "",
-                    "confidence": 0.0,
-                    "found": False,
-                    "message": "No answer found in document"
-                }
+        from natural_pdf.qa import get_qa_engine
+
+        # Initialize or get QA engine
+        qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)
+
+        # Determine which pages to query
+        if pages is None:
+            target_pages = list(range(len(self.pages)))
+        elif isinstance(pages, int):
+            # Single page
+            target_pages = [pages]
+        elif isinstance(pages, (list, range)):
+            # List or range of pages
+            target_pages = pages
+        else:
+            raise ValueError(f"Invalid pages parameter: {pages}")
+
+        # Actually query each page and gather results
+        results = []
+        for page_idx in target_pages:
+            if 0 <= page_idx < len(self.pages):
+                page = self.pages[page_idx]
+                page_result = qa_engine.ask_pdf_page(
+                    page=page,
+                    question=question,
+                    min_confidence=min_confidence,
+                    **kwargs
+                )
+
+                # Add to results if it found an answer
+                if page_result.get("found", False):
+                    results.append(page_result)
+
+        # Sort results by confidence
+        results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
+
+        # Return the best result, or a default result if none found
+        if results:
+            return results[0]
+        else:
+            return None
 
-        except ImportError as e:
-            logger.warning(f"QA functionality not available: {e}")
-            return {
-                "answer": "",
-                "confidence": 0.0,
-                "error": "QA functionality not available",
-                "found": False
-            }
-        except Exception as e:
-            logger.error(f"Error in document QA: {e}")
-            return {
-                "answer": "",
-                "confidence": 0.0,
-                "error": str(e),
-                "found": False
-            }
-
     def __len__(self) -> int:
         """Return the number of pages in the PDF."""
         return len(self.pages)
```