PyPI - natural-pdf - Versions diffs - 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl - Mend

natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (77) hide show

natural_pdf/analyzers/__init__.py +2 -2
natural_pdf/analyzers/guides.py +670 -595
natural_pdf/analyzers/layout/base.py +53 -6
natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
natural_pdf/analyzers/layout/layout_manager.py +18 -14
natural_pdf/analyzers/layout/layout_options.py +1 -0
natural_pdf/analyzers/layout/paddle.py +102 -64
natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
natural_pdf/analyzers/layout/yolo.py +2 -6
natural_pdf/analyzers/shape_detection_mixin.py +15 -6
natural_pdf/classification/manager.py +92 -77
natural_pdf/classification/mixin.py +49 -5
natural_pdf/classification/results.py +1 -1
natural_pdf/cli.py +7 -3
natural_pdf/collections/pdf_collection.py +96 -101
natural_pdf/core/element_manager.py +188 -82
natural_pdf/core/highlighting_service.py +5 -6
natural_pdf/core/page.py +132 -16
natural_pdf/core/pdf.py +486 -71
natural_pdf/describe/__init__.py +18 -12
natural_pdf/describe/base.py +179 -172
natural_pdf/describe/elements.py +155 -155
natural_pdf/describe/mixin.py +27 -19
natural_pdf/describe/summary.py +44 -55
natural_pdf/elements/base.py +134 -18
natural_pdf/elements/collections.py +90 -18
natural_pdf/elements/image.py +2 -1
natural_pdf/elements/line.py +0 -31
natural_pdf/elements/rect.py +0 -14
natural_pdf/elements/region.py +238 -111
natural_pdf/elements/text.py +18 -12
natural_pdf/exporters/__init__.py +4 -1
natural_pdf/exporters/original_pdf.py +12 -4
natural_pdf/extraction/mixin.py +66 -10
natural_pdf/extraction/result.py +1 -1
natural_pdf/flows/flow.py +63 -4
natural_pdf/flows/region.py +4 -4
natural_pdf/ocr/engine.py +83 -2
natural_pdf/ocr/engine_paddle.py +5 -5
natural_pdf/ocr/ocr_factory.py +2 -1
natural_pdf/ocr/ocr_manager.py +24 -13
natural_pdf/ocr/ocr_options.py +3 -10
natural_pdf/qa/document_qa.py +21 -8
natural_pdf/qa/qa_result.py +3 -7
natural_pdf/search/__init__.py +3 -2
natural_pdf/search/lancedb_search_service.py +5 -6
natural_pdf/search/numpy_search_service.py +5 -2
natural_pdf/selectors/parser.py +51 -6
natural_pdf/tables/__init__.py +2 -2
natural_pdf/tables/result.py +7 -6
natural_pdf/utils/bidi_mirror.py +2 -1
natural_pdf/utils/reading_order.py +3 -2
natural_pdf/utils/visualization.py +3 -3
natural_pdf/widgets/viewer.py +0 -1
{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
natural_pdf-0.1.34.dist-info/RECORD +121 -0
optimization/memory_comparison.py +73 -58
optimization/pdf_analyzer.py +141 -96
optimization/performance_analysis.py +111 -110
optimization/test_cleanup_methods.py +47 -36
optimization/test_memory_fix.py +40 -39
tools/bad_pdf_eval/__init__.py +0 -1
tools/bad_pdf_eval/analyser.py +35 -18
tools/bad_pdf_eval/collate_summaries.py +22 -18
tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
tools/bad_pdf_eval/eval_suite.py +21 -9
tools/bad_pdf_eval/evaluate_quality.py +198 -0
tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
tools/bad_pdf_eval/llm_enrich.py +71 -39
tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
tools/bad_pdf_eval/reporter.py +1 -1
tools/bad_pdf_eval/utils.py +7 -4
natural_pdf-0.1.32.dist-info/RECORD +0 -118
{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0

natural_pdf/core/page.py CHANGED Viewed

@@ -77,7 +77,6 @@ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerW
 # --- End Classification Imports --- #
 # --- End Shape Detection Mixin --- #
@@ -94,26 +93,112 @@ logger = logging.getLogger(__name__)
 class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
-    """
-    Enhanced Page wrapper built on top of pdfplumber.Page.
+    """Enhanced Page wrapper built on top of pdfplumber.Page.
     This class provides a fluent interface for working with PDF pages,
     with improved selection, navigation, extraction, and question-answering capabilities.
+    It integrates multiple analysis capabilities through mixins and provides spatial
+    navigation with CSS-like selectors.
+    The Page class serves as the primary interface for document analysis, offering:
+    - Element selection and spatial navigation
+    - OCR and layout analysis integration
+    - Table detection and extraction
+    - AI-powered classification and data extraction
+    - Visual debugging with highlighting and cropping
+    - Text style analysis and structure detection
+    Attributes:
+        index: Zero-based index of this page in the PDF.
+        number: One-based page number (index + 1).
+        width: Page width in points.
+        height: Page height in points.
+        bbox: Bounding box tuple (x0, top, x1, bottom) of the page.
+        chars: Collection of character elements on the page.
+        words: Collection of word elements on the page.
+        lines: Collection of line elements on the page.
+        rects: Collection of rectangle elements on the page.
+        images: Collection of image elements on the page.
+        metadata: Dictionary for storing analysis results and custom data.
+    Example:
+        Basic usage:
+        ```python
+        pdf = npdf.PDF("document.pdf")
+        page = pdf.pages[0]
+        # Find elements with CSS-like selectors
+        headers = page.find_all('text[size>12]:bold')
+        summaries = page.find('text:contains("Summary")')
+        # Spatial navigation
+        content_below = summaries.below(until='text[size>12]:bold')
+        # Table extraction
+        tables = page.extract_table()
+        ```
+        Advanced usage:
+        ```python
+        # Apply OCR if needed
+        page.apply_ocr(engine='easyocr', resolution=300)
+        # Layout analysis
+        page.analyze_layout(engine='yolo')
+        # AI-powered extraction
+        data = page.extract_structured_data(MySchema)
+        # Visual debugging
+        page.find('text:contains("Important")').show()
+        ```
     """
-    def __init__(self, page: "pdfplumber.page.Page", parent: "PDF", index: int, font_attrs=None):
-        """
-        Initialize a page wrapper.
+    def __init__(
+        self,
+        page: "pdfplumber.page.Page",
+        parent: "PDF",
+        index: int,
+        font_attrs=None,
+        load_text: bool = True,
+    ):
+        """Initialize a page wrapper.
+        Creates an enhanced Page object that wraps a pdfplumber page with additional
+        functionality for spatial navigation, analysis, and AI-powered extraction.
         Args:
-            page: pdfplumber page object
-            parent: Parent PDF object
-            index: Index of this page in the PDF (0-based)
-            font_attrs: Font attributes to consider when grouping characters into words.
+            page: The underlying pdfplumber page object that provides raw PDF data.
+            parent: Parent PDF object that contains this page and provides access
+                to managers and global settings.
+            index: Zero-based index of this page in the PDF document.
+            font_attrs: List of font attributes to consider when grouping characters
+                into words. Common attributes include ['fontname', 'size', 'flags'].
+                If None, uses default character-to-word grouping rules.
+            load_text: If True, load and process text elements from the PDF's text layer.
+                If False, skip text layer processing (useful for OCR-only workflows).
+        Note:
+            This constructor is typically called automatically when accessing pages
+            through the PDF.pages collection. Direct instantiation is rarely needed.
+        Example:
+            ```python
+            # Pages are usually accessed through the PDF object
+            pdf = npdf.PDF("document.pdf")
+            page = pdf.pages[0]  # Page object created automatically
+            # Direct construction (advanced usage)
+            import pdfplumber
+            with pdfplumber.open("document.pdf") as plumber_pdf:
+                plumber_page = plumber_pdf.pages[0]
+                page = Page(plumber_page, pdf, 0, load_text=True)
+            ```
         """
         self._page = page
         self._parent = parent
         self._index = index
+        self._load_text = load_text
         self._text_styles = None  # Lazy-loaded text style analyzer results
         self._exclusions = []  # List to store exclusion functions/regions
         self._skew_angle: Optional[float] = None  # Stores detected skew angle
@@ -136,7 +221,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
         self._config = dict(getattr(self._parent, "_config", {}))
         # Initialize ElementManager, passing font_attrs
-        self._element_mgr = ElementManager(self, font_attrs=font_attrs)
+        self._element_mgr = ElementManager(self, font_attrs=font_attrs, load_text=self._load_text)
         # self._highlighter = HighlightingService(self) # REMOVED - Use property accessor
         # --- NEW --- Central registry for analysis results
         self.analyses: Dict[str, Any] = {}
@@ -1188,6 +1273,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
             if _contains_rtl(result):
                 try:
                     from bidi.algorithm import get_display  # type: ignore
                     from natural_pdf.utils.bidi_mirror import mirror_brackets
                     result = "\n".join(
@@ -1197,8 +1283,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
                                 base_dir=(
                                     "R"
                                     if any(
-                                        unicodedata.bidirectional(ch)
-                                        in ("R", "AL", "AN")
+                                        unicodedata.bidirectional(ch) in ("R", "AL", "AN")
                                         for ch in line
                                     )
                                     else "L"
@@ -1394,11 +1479,17 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
                         table_settings.setdefault("text_y_tolerance", y_tol)
                 # pdfplumber's text strategy benefits from a tight snap tolerance.
-                if "snap_tolerance" not in table_settings and "snap_x_tolerance" not in table_settings:
+                if (
+                    "snap_tolerance" not in table_settings
+                    and "snap_x_tolerance" not in table_settings
+                ):
                     # Derive from y_tol if available, else default 1
                     snap = max(1, round((pdf_cfg.get("y_tolerance", 1)) * 0.9))
                     table_settings.setdefault("snap_tolerance", snap)
-                if "join_tolerance" not in table_settings and "join_x_tolerance" not in table_settings:
+                if (
+                    "join_tolerance" not in table_settings
+                    and "join_x_tolerance" not in table_settings
+                ):
                     join = table_settings.get("snap_tolerance", 1)
                     table_settings.setdefault("join_tolerance", join)
                     table_settings.setdefault("join_x_tolerance", join)
@@ -2996,7 +3087,32 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
             InspectionSummary with element tables showing coordinates,
             properties, and other details for each element
         """
-        return self.find_all('*').inspect(limit=limit)
+        return self.find_all("*").inspect(limit=limit)
+    def remove_text_layer(self) -> "Page":
+        """
+        Remove all text elements from this page.
+        This removes all text elements (words and characters) from the page,
+        effectively clearing the text layer.
+        Returns:
+            Self for method chaining
+        """
+        logger.info(f"Page {self.number}: Removing all text elements...")
+        # Remove all words and chars from the element manager
+        removed_words = len(self._element_mgr.words)
+        removed_chars = len(self._element_mgr.chars)
+        # Clear the lists
+        self._element_mgr._elements["words"] = []
+        self._element_mgr._elements["chars"] = []
+        logger.info(
+            f"Page {self.number}: Removed {removed_words} words and {removed_chars} characters"
+        )
+        return self
     @property
     def lines(self) -> List[Any]:

natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl

natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl