PyPI - natural-pdf - Versions diffs - 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl - Mend

natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (141) hide show

docs/api/index.md +386 -0
docs/assets/favicon.png +3 -0
docs/assets/favicon.svg +3 -0
docs/assets/javascripts/custom.js +17 -0
docs/assets/logo.svg +3 -0
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +17 -0
docs/assets/social-preview.svg +17 -0
docs/assets/stylesheets/custom.css +65 -0
docs/document-qa/index.ipynb +435 -0
docs/document-qa/index.md +79 -0
docs/element-selection/index.ipynb +915 -0
docs/element-selection/index.md +229 -0
docs/index.md +170 -0
docs/installation/index.md +69 -0
docs/interactive-widget/index.ipynb +962 -0
docs/interactive-widget/index.md +12 -0
docs/layout-analysis/index.ipynb +818 -0
docs/layout-analysis/index.md +185 -0
docs/ocr/index.md +209 -0
docs/pdf-navigation/index.ipynb +314 -0
docs/pdf-navigation/index.md +97 -0
docs/regions/index.ipynb +816 -0
docs/regions/index.md +294 -0
docs/tables/index.ipynb +658 -0
docs/tables/index.md +144 -0
docs/text-analysis/index.ipynb +370 -0
docs/text-analysis/index.md +105 -0
docs/text-extraction/index.ipynb +1478 -0
docs/text-extraction/index.md +292 -0
docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
docs/tutorials/01-loading-and-extraction.md +95 -0
docs/tutorials/02-finding-elements.ipynb +340 -0
docs/tutorials/02-finding-elements.md +149 -0
docs/tutorials/03-extracting-blocks.ipynb +147 -0
docs/tutorials/03-extracting-blocks.md +48 -0
docs/tutorials/04-table-extraction.ipynb +114 -0
docs/tutorials/04-table-extraction.md +50 -0
docs/tutorials/05-excluding-content.ipynb +270 -0
docs/tutorials/05-excluding-content.md +109 -0
docs/tutorials/06-document-qa.ipynb +332 -0
docs/tutorials/06-document-qa.md +91 -0
docs/tutorials/07-layout-analysis.ipynb +288 -0
docs/tutorials/07-layout-analysis.md +66 -0
docs/tutorials/07-working-with-regions.ipynb +413 -0
docs/tutorials/07-working-with-regions.md +151 -0
docs/tutorials/08-spatial-navigation.ipynb +508 -0
docs/tutorials/08-spatial-navigation.md +190 -0
docs/tutorials/09-section-extraction.ipynb +2434 -0
docs/tutorials/09-section-extraction.md +256 -0
docs/tutorials/10-form-field-extraction.ipynb +512 -0
docs/tutorials/10-form-field-extraction.md +201 -0
docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
docs/tutorials/11-enhanced-table-processing.md +9 -0
docs/tutorials/12-ocr-integration.ipynb +604 -0
docs/tutorials/12-ocr-integration.md +175 -0
docs/tutorials/13-semantic-search.ipynb +1328 -0
docs/tutorials/13-semantic-search.md +77 -0
docs/visual-debugging/index.ipynb +2970 -0
docs/visual-debugging/index.md +157 -0
docs/visual-debugging/region.png +0 -0
natural_pdf/__init__.py +50 -33
natural_pdf/analyzers/__init__.py +2 -1
natural_pdf/analyzers/layout/base.py +32 -24
natural_pdf/analyzers/layout/docling.py +131 -72
natural_pdf/analyzers/layout/gemini.py +264 -0
natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
natural_pdf/analyzers/layout/layout_manager.py +125 -58
natural_pdf/analyzers/layout/layout_options.py +43 -17
natural_pdf/analyzers/layout/paddle.py +152 -95
natural_pdf/analyzers/layout/surya.py +164 -92
natural_pdf/analyzers/layout/tatr.py +149 -84
natural_pdf/analyzers/layout/yolo.py +89 -45
natural_pdf/analyzers/text_options.py +22 -15
natural_pdf/analyzers/text_structure.py +131 -85
natural_pdf/analyzers/utils.py +30 -23
natural_pdf/collections/pdf_collection.py +146 -97
natural_pdf/core/__init__.py +1 -1
natural_pdf/core/element_manager.py +419 -337
natural_pdf/core/highlighting_service.py +268 -196
natural_pdf/core/page.py +1044 -521
natural_pdf/core/pdf.py +516 -313
natural_pdf/elements/__init__.py +1 -1
natural_pdf/elements/base.py +307 -225
natural_pdf/elements/collections.py +805 -543
natural_pdf/elements/line.py +39 -36
natural_pdf/elements/rect.py +32 -30
natural_pdf/elements/region.py +889 -879
natural_pdf/elements/text.py +127 -99
natural_pdf/exporters/__init__.py +0 -1
natural_pdf/exporters/searchable_pdf.py +261 -102
natural_pdf/ocr/__init__.py +57 -35
natural_pdf/ocr/engine.py +150 -46
natural_pdf/ocr/engine_easyocr.py +146 -150
natural_pdf/ocr/engine_paddle.py +118 -175
natural_pdf/ocr/engine_surya.py +78 -141
natural_pdf/ocr/ocr_factory.py +114 -0
natural_pdf/ocr/ocr_manager.py +122 -124
natural_pdf/ocr/ocr_options.py +16 -20
natural_pdf/ocr/utils.py +98 -0
natural_pdf/qa/__init__.py +1 -1
natural_pdf/qa/document_qa.py +119 -111
natural_pdf/search/__init__.py +37 -31
natural_pdf/search/haystack_search_service.py +312 -189
natural_pdf/search/haystack_utils.py +186 -122
natural_pdf/search/search_options.py +25 -14
natural_pdf/search/search_service_protocol.py +12 -6
natural_pdf/search/searchable_mixin.py +261 -176
natural_pdf/selectors/__init__.py +2 -1
natural_pdf/selectors/parser.py +159 -316
natural_pdf/templates/__init__.py +1 -1
natural_pdf/templates/spa/css/style.css +334 -0
natural_pdf/templates/spa/index.html +31 -0
natural_pdf/templates/spa/js/app.js +472 -0
natural_pdf/templates/spa/words.txt +235976 -0
natural_pdf/utils/debug.py +32 -0
natural_pdf/utils/highlighting.py +8 -2
natural_pdf/utils/identifiers.py +29 -0
natural_pdf/utils/packaging.py +418 -0
natural_pdf/utils/reading_order.py +65 -63
natural_pdf/utils/text_extraction.py +195 -0
natural_pdf/utils/visualization.py +70 -61
natural_pdf/widgets/__init__.py +2 -3
natural_pdf/widgets/viewer.py +749 -718
{natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
natural_pdf-0.1.6.dist-info/RECORD +141 -0
{natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
notebooks/Examples.ipynb +1293 -0
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +543 -0
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
natural_pdf/templates/ocr_debug.html +0 -517
natural_pdf-0.1.4.dist-info/RECORD +0 -61
natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
{natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0

natural_pdf/elements/text.py CHANGED Viewed

@@ -1,7 +1,8 @@
 """
 Text element classes for natural-pdf.
 """
-from typing import Dict, Any, Optional, TYPE_CHECKING
+from typing import TYPE_CHECKING, Any, Dict, Optional
 from natural_pdf.elements.base import Element
@@ -12,117 +13,133 @@ if TYPE_CHECKING:
 class TextElement(Element):
     """
     Represents a text element in a PDF.
     This class is a wrapper around pdfplumber's character objects,
     providing additional functionality for text extraction and analysis.
     """
-    def __init__(self, obj: Dict[str, Any], page: 'Page'):
+    def __init__(self, obj: Dict[str, Any], page: "Page"):
         """
         Initialize a text element.
         Args:
             obj: The underlying pdfplumber object. For OCR text elements,
                  should include 'text', 'bbox', 'source', and 'confidence'
             page: The parent Page object
         """
         # Add object_type if not present
-        if 'object_type' not in obj:
-            obj['object_type'] = 'text'
+        if "object_type" not in obj:
+            obj["object_type"] = "text"
         super().__init__(obj, page)
+        # Explicitly store constituent characters if provided
+        # (Pop from obj to avoid storing it twice if super() stores _obj by ref)
+        self._char_dicts = obj.pop("_char_dicts", [])
     @property
     def text(self) -> str:
         """Get the text content."""
-        return self._obj.get('text', '')
+        return self._obj.get("text", "")
+    @text.setter
+    def text(self, value: str):
+        """Set the text content."""
+        self._obj["text"] = value
     @property
     def source(self) -> str:
         """Get the source of this text element (pdf or ocr)."""
-        return self._obj.get('source', 'pdf')
+        return self._obj.get("source", "pdf")
     @property
     def confidence(self) -> float:
         """Get the confidence score for OCR text elements."""
-        return self._obj.get('confidence', 1.0)
+        return self._obj.get("confidence", 1.0)
     @property
     def fontname(self) -> str:
         """Get the font name."""
         # First check if we have a real fontname from PDF resources
-        if 'real_fontname' in self._obj:
-            return self._obj['real_fontname']
+        if "real_fontname" in self._obj:
+            return self._obj["real_fontname"]
         # Otherwise use standard fontname
-        return self._obj.get('fontname', '') or self._obj.get('font', '')
+        return self._obj.get("fontname", "") or self._obj.get("font", "")
     @property
     def font_family(self) -> str:
         """
         Get a cleaner font family name by stripping PDF-specific prefixes.
         PDF font names often include prefixes like 'ABCDEF+' followed by the font name
         or unique identifiers. This method attempts to extract a more readable font name.
         """
         font = self.fontname
         # Remove common PDF font prefixes (e.g., 'ABCDEF+')
-        if '+' in font:
-            font = font.split('+', 1)[1]
+        if "+" in font:
+            font = font.split("+", 1)[1]
         # Try to extract common font family names
         common_fonts = [
-            'Arial', 'Helvetica', 'Times', 'Courier', 'Calibri',
-            'Cambria', 'Georgia', 'Verdana', 'Tahoma', 'Trebuchet'
+            "Arial",
+            "Helvetica",
+            "Times",
+            "Courier",
+            "Calibri",
+            "Cambria",
+            "Georgia",
+            "Verdana",
+            "Tahoma",
+            "Trebuchet",
         ]
         for common in common_fonts:
             if common.lower() in font.lower():
                 return common
         return font
     @property
     def font_variant(self) -> str:
         """
         Get the font variant identifier (prefix before the '+' in PDF font names).
         PDF embeds font subsets with unique identifiers like 'AAAAAB+FontName'.
         Different variants of the same base font will have different prefixes.
-        This can be used to differentiate text that looks different despite
+        This can be used to differentiate text that looks different despite
         having the same font name and size.
         Returns:
             The font variant prefix, or empty string if no variant is present
         """
         font = self.fontname
         # Extract the prefix before '+' if it exists
-        if '+' in font:
-            return font.split('+', 1)[0]
+        if "+" in font:
+            return font.split("+", 1)[0]
         return ""
     @property
     def size(self) -> float:
         """Get the font size."""
-        return self._obj.get('size', 0)
+        return self._obj.get("size", 0)
     @property
     def color(self) -> tuple:
         """Get the text color (RGB tuple)."""
         # PDFs often use non-RGB values, so we handle different formats
         # In pdfplumber, colors can be in various formats depending on the PDF
-        color = self._obj.get('non_stroking_color', (0, 0, 0))
+        color = self._obj.get("non_stroking_color", (0, 0, 0))
         # If it's a single value, treat as grayscale
         if isinstance(color, (int, float)):
             return (color, color, color)
         # If it's a tuple of 3 values, treat as RGB
         if isinstance(color, tuple) and len(color) == 3:
             return color
         # If it's a tuple of 4 values, treat as CMYK and convert to approximate RGB
         if isinstance(color, tuple) and len(color) == 4:
             c, m, y, k = color
@@ -130,33 +147,33 @@ class TextElement(Element):
             g = 1 - min(1, m + k)
             b = 1 - min(1, y + k)
             return (r, g, b)
         # Default to black
         return (0, 0, 0)
     def extract_text(self, keep_blank_chars=True, **kwargs) -> str:
         """
         Extract text from this element.
         Args:
             keep_blank_chars: Whether to keep blank characters (default: True)
             **kwargs: Additional extraction parameters
         Returns:
             Text content
         """
         # For text elements, keep_blank_chars doesn't affect anything as we're
         # simply returning the text property. Included for API consistency.
         return self.text
     def contains(self, substring: str, case_sensitive: bool = True) -> bool:
         """
         Check if this text element contains a substring.
         Args:
             substring: The substring to check for
             case_sensitive: Whether the check is case-sensitive
         Returns:
             True if the text contains the substring
         """
@@ -164,25 +181,26 @@ class TextElement(Element):
             return substring in self.text
         else:
             return substring.lower() in self.text.lower()
     def matches(self, pattern: str) -> bool:
         """
         Check if this text element matches a regular expression pattern.
         Args:
             pattern: Regular expression pattern
         Returns:
             True if the text matches the pattern
         """
         import re
         return bool(re.search(pattern, self.text))
     @property
     def bold(self) -> bool:
         """
         Check if the text is bold based on multiple indicators in the PDF.
         PDFs encode boldness in several ways:
         1. Font name containing 'bold' or 'black'
         2. Font descriptor flags (bit 2 indicates bold)
@@ -192,43 +210,43 @@ class TextElement(Element):
         """
         # Check font name (original method)
         fontname = self.fontname.lower()
-        if 'bold' in fontname or 'black' in fontname or self.fontname.endswith('-B'):
+        if "bold" in fontname or "black" in fontname or self.fontname.endswith("-B"):
             return True
         # Check font descriptor flags if available (bit 2 = bold)
-        flags = self._obj.get('flags')
+        flags = self._obj.get("flags")
         if flags is not None and (flags & 4) != 0:  # Check if bit 2 is set
             return True
         # Check StemV (vertical stem width) if available
         # Higher StemV values indicate bolder fonts
-        stemv = self._obj.get('stemv') or self._obj.get('StemV')
+        stemv = self._obj.get("stemv") or self._obj.get("StemV")
         if stemv is not None and isinstance(stemv, (int, float)) and stemv > 120:
             return True
         # Check font weight if available (700+ is typically bold)
-        weight = self._obj.get('weight') or self._obj.get('FontWeight')
+        weight = self._obj.get("weight") or self._obj.get("FontWeight")
         if weight is not None and isinstance(weight, (int, float)) and weight >= 700:
             return True
         # Check text rendering mode (mode 2 = fill and stroke, can make text appear bold)
-        render_mode = self._obj.get('render_mode')
+        render_mode = self._obj.get("render_mode")
         if render_mode is not None and render_mode == 2:
             return True
         # Additional check: if we have text with the same font but different paths/strokes
         # Path widths or stroke widths can indicate boldness
-        stroke_width = self._obj.get('stroke_width') or self._obj.get('lineWidth')
+        stroke_width = self._obj.get("stroke_width") or self._obj.get("lineWidth")
         if stroke_width is not None and isinstance(stroke_width, (int, float)) and stroke_width > 0:
             return True
         return False
     @property
     def italic(self) -> bool:
         """
         Check if the text is italic based on multiple indicators in the PDF.
         PDFs encode italic (oblique) text in several ways:
         1. Font name containing 'italic' or 'oblique'
         2. Font descriptor flags (bit 6 indicates italic)
@@ -236,69 +254,79 @@ class TextElement(Element):
         """
         # Check font name (original method)
         fontname = self.fontname.lower()
-        if 'italic' in fontname or 'oblique' in fontname or self.fontname.endswith('-I'):
+        if "italic" in fontname or "oblique" in fontname or self.fontname.endswith("-I"):
             return True
         # Check font descriptor flags if available (bit 6 = italic)
-        flags = self._obj.get('flags')
+        flags = self._obj.get("flags")
         if flags is not None and (flags & 64) != 0:  # Check if bit 6 is set
             return True
         # Check italic angle if available
         # Non-zero italic angle indicates italic font
-        italic_angle = self._obj.get('italic_angle') or self._obj.get('ItalicAngle')
-        if italic_angle is not None and isinstance(italic_angle, (int, float)) and italic_angle != 0:
+        italic_angle = self._obj.get("italic_angle") or self._obj.get("ItalicAngle")
+        if (
+            italic_angle is not None
+            and isinstance(italic_angle, (int, float))
+            and italic_angle != 0
+        ):
             return True
         return False
     def __repr__(self) -> str:
         """String representation of the text element."""
-        preview = self.text[:10] + '...' if len(self.text) > 10 else self.text
+        preview = self.text[:10] + "..." if len(self.text) > 10 else self.text
         font_style = []
         if self.bold:
             font_style.append("bold")
         if self.italic:
             font_style.append("italic")
         style_str = f", style={font_style}" if font_style else ""
         # Use font_family for display but include raw fontname and variant
         font_display = self.font_family
         variant = self.font_variant
         variant_str = f", variant='{variant}'" if variant else ""
-        if font_display != self.fontname and '+' in self.fontname:
-            base_font = self.fontname.split('+', 1)[1]
+        if font_display != self.fontname and "+" in self.fontname:
+            base_font = self.fontname.split("+", 1)[1]
             font_display = f"{font_display} ({base_font})"
         return f"<TextElement text='{preview}' font='{font_display}'{variant_str} size={self.size}{style_str} bbox={self.bbox}>"
     def font_info(self) -> dict:
         """
         Get detailed font information for this text element.
         Returns a dictionary with all available font-related properties,
         useful for debugging font detection issues.
         """
         info = {
-            'text': self.text,
-            'fontname': self.fontname,
-            'font_family': self.font_family,
-            'font_variant': self.font_variant,
-            'size': self.size,
-            'bold': self.bold,
-            'italic': self.italic,
-            'color': self.color
+            "text": self.text,
+            "fontname": self.fontname,
+            "font_family": self.font_family,
+            "font_variant": self.font_variant,
+            "size": self.size,
+            "bold": self.bold,
+            "italic": self.italic,
+            "color": self.color,
         }
         # Include raw font properties from the PDF
         font_props = [
-            'flags', 'stemv', 'StemV', 'weight', 'FontWeight',
-            'render_mode', 'stroke_width', 'lineWidth'
+            "flags",
+            "stemv",
+            "StemV",
+            "weight",
+            "FontWeight",
+            "render_mode",
+            "stroke_width",
+            "lineWidth",
         ]
         for prop in font_props:
             if prop in self._obj:
                 info[f"raw_{prop}"] = self._obj[prop]
-        return info
+        return info

natural_pdf/exporters/__init__.py CHANGED Viewed

	@@ -1 +0,0 @@
1	-

natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl