PyPI - natural-pdf - Versions diffs - 0.1.30__py3-none-any.whl → 0.1.32__py3-none-any.whl - Mend

natural-pdf 0.1.30__py3-none-any.whl → 0.1.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

natural_pdf/analyzers/__init__.py +18 -4
natural_pdf/analyzers/guides.py +2176 -0
natural_pdf/analyzers/shape_detection_mixin.py +0 -650
natural_pdf/core/element_manager.py +86 -27
natural_pdf/core/page.py +49 -1
natural_pdf/core/pdf.py +22 -0
natural_pdf/elements/collections.py +61 -0
natural_pdf/elements/region.py +257 -14
natural_pdf/elements/text.py +29 -0
{natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/METADATA +1 -1
{natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/RECORD +15 -19
bad_pdf_analysis/analyze_10_more.py +0 -300
bad_pdf_analysis/analyze_final_10.py +0 -552
bad_pdf_analysis/analyze_specific_pages.py +0 -394
bad_pdf_analysis/analyze_specific_pages_direct.py +0 -382
tools/rtl_smoke_test.py +0 -80
{natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/top_level.txt +0 -0

natural_pdf/core/element_manager.py CHANGED Viewed

@@ -7,6 +7,7 @@ characters, words, rectangles, and lines extracted from a page.
 import logging
 import re
+from contextlib import contextmanager
 from itertools import groupby
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -47,6 +48,33 @@ HIGHLIGHT_DEFAULTS = {
     "color_value_min": 0.4,        # HSV V >
 }
+@contextmanager
+def disable_text_sync():
+    """
+    Temporarily disable text synchronization for performance.
+    This is used when bulk-updating text content where character-level
+    synchronization is not needed, such as during bidi processing.
+    Fixes exponential recursion issue with Arabic/RTL text processing.
+    """
+    # Save original setter
+    original_setter = TextElement.text.fset
+    # Create a fast setter that skips sync
+    def fast_setter(self, value):
+        self._obj["text"] = value
+        # Skip character synchronization for performance
+    # Apply fast setter
+    TextElement.text = property(TextElement.text.fget, fast_setter)
+    try:
+        yield
+    finally:
+        # Restore original setter
+        TextElement.text = property(TextElement.text.fget, original_setter)
 class NaturalWordExtractor(WordExtractor):
     """
     Custom WordExtractor that splits words based on specified character attributes
@@ -202,14 +230,52 @@ class ElementManager:
             char_to_index[key] = idx
         # 2. Instantiate the custom word extractor
-        # Get config settings from the parent PDF or use defaults
+        # Prefer page-level config over PDF-level for tolerance lookup
+        page_config = getattr(self._page, "_config", {})
         pdf_config = getattr(self._page._parent, "_config", {})
-        xt = pdf_config.get("x_tolerance", 3)
-        yt = pdf_config.get("y_tolerance", 3)
+        # Start with any explicitly supplied tolerances (may be None)
+        xt = page_config.get("x_tolerance", pdf_config.get("x_tolerance"))
+        yt = page_config.get("y_tolerance", pdf_config.get("y_tolerance"))
         use_flow = pdf_config.get("use_text_flow", False)
-        # Define which attributes to preserve on the merged word object
-        # Should include split attributes + any others needed for filtering (like color)
+        # ------------------------------------------------------------------
+        # Auto-adaptive tolerance: scale based on median character size when
+        # requested and explicit values are absent.
+        # ------------------------------------------------------------------
+        if pdf_config.get("auto_text_tolerance", True):
+            import statistics
+            sizes = [c.get("size", 0) for c in prepared_char_dicts if c.get("size")]
+            median_size = None
+            if sizes:
+                median_size = statistics.median(sizes)
+                if xt is None:
+                    xt = 0.25 * median_size  # ~kerning width
+                    # Record back to page config for downstream users
+                    page_config["x_tolerance"] = xt
+                if yt is None:
+                    yt = 0.6 * median_size   # ~line spacing fraction
+                    page_config["y_tolerance"] = yt
+            # Warn users when the page's font size is extremely small –
+            # this is often the root cause of merged-row/column issues.
+            if median_size and median_size < 6:  # 6 pt is unusually small
+                logger.warning(
+                    f"Page {self._page.number}: Median font size is only {median_size:.1f} pt; "
+                    f"auto-set x_tolerance={xt:.2f}, y_tolerance={yt:.2f}. "
+                    "If the output looks wrong you can override these values via "
+                    "PDF(..., text_tolerance={'x_tolerance': X, 'y_tolerance': Y}, "
+                    "auto_text_tolerance=False)."
+                )
+        # Fallback to pdfplumber defaults if still None
+        if xt is None:
+            xt = 3
+        if yt is None:
+            yt = 3
+        # List of attributes to preserve on word objects
         attributes_to_preserve = list(
             set(
                 self._word_split_attributes
@@ -223,7 +289,7 @@ class ElementManager:
             )
         )
-        # -------------------------------------------------------------
+        # ------------------------------------------------------------------
         # NEW: Detect direction (LTR vs RTL) per visual line and feed
         #       pdfplumber's WordExtractor with the correct settings.
         # -------------------------------------------------------------
@@ -271,7 +337,9 @@ class ElementManager:
             # Build a WordExtractor tailored for this line's direction
             if is_rtl_line:
                 line_dir = "ttb"  # horizontal lines stacked top→bottom
-                char_dir = "rtl"  # characters right→left within the line
+                # Feed characters in right→left x-order; extractor can then treat
+                # them as left-to-right so that resulting text stays logical.
+                char_dir = "ltr"
             else:
                 line_dir = "ttb"
                 char_dir = "ltr"
@@ -288,9 +356,8 @@ class ElementManager:
             )
             # Prepare character sequence for the extractor:
-            #  • For LTR lines -> left→right order (x0 ascending)
-            #  • For RTL lines -> feed **reversed** list so that neighbouring
-            #    characters appear adjacent when the extractor walks right→left.
+            # Always feed characters in spatial order (x0 ascending)
+            # PDF stores glyphs in visual order, so this gives us the visual sequence
             line_chars_for_extractor = sorted(line_chars, key=lambda c: c.get("x0", 0))
             try:
@@ -324,15 +391,18 @@ class ElementManager:
                 # on the whole-line heuristic.
                 rtl_in_word = any(_is_rtl_char(ch.get("text", "")) for ch in char_list)
                 if rtl_in_word:
+                    # Convert from visual order (from PDF) to logical order using bidi
                     try:
                         from bidi.algorithm import get_display  # type: ignore
                         from natural_pdf.utils.bidi_mirror import mirror_brackets
-                        word_element.text = mirror_brackets(
-                            get_display(word_element.text, base_dir="R")
-                        )
+                        with disable_text_sync():
+                            # word_element.text is currently in visual order (from PDF)
+                            # Convert to logical order using bidi with auto direction detection
+                            logical_text = get_display(word_element.text, base_dir='L')
+                            # Apply bracket mirroring for logical order
+                            word_element.text = mirror_brackets(logical_text)
                     except Exception:
-                        # Fallback: keep original text if python-bidi fails
                         pass
         # ------------------------------------------------------------------
@@ -415,19 +485,6 @@ class ElementManager:
             f"Page {self._page.number}: Generated {len(generated_words)} words using NaturalWordExtractor."
         )
-        # --- Post-processing pass to ensure every word containing RTL characters is
-        #     stored in logical order and with mirrored brackets.  This is a
-        #     safeguard in case the per-line loop above missed some tokens.
-        try:
-            from bidi.algorithm import get_display  # type: ignore
-            from natural_pdf.utils.bidi_mirror import mirror_brackets
-            for w in generated_words:
-                if any(_is_rtl_char(ch) for ch in w.text):
-                    w.text = mirror_brackets(get_display(w.text, base_dir="R"))
-        except Exception:
-            pass  # graceful degradation – keep original text
         # 4. Load other elements (rects, lines)
         rect_elements = [RectangleElement(r, self._page) for r in self._page._page.rects]
         line_elements = [LineElement(l, self._page) for l in self._page._page.lines]
@@ -463,6 +520,8 @@ class ElementManager:
         logger.debug(f"Page {self._page.number}: Element loading complete.")
+        # If per-word BiDi was skipped, generated_words already stay in logical order.
     def _prepare_char_dicts(self) -> List[Dict[str, Any]]:
         """
         Prepares a list of character dictionaries from native PDF characters,

natural_pdf/core/page.py CHANGED Viewed

@@ -128,6 +128,13 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
             "named": {},  # Named regions (name -> region)
         }
+        # -------------------------------------------------------------
+        # Page-scoped configuration begins as a shallow copy of the parent
+        # PDF-level configuration so that auto-computed tolerances or other
+        # page-specific values do not overwrite siblings.
+        # -------------------------------------------------------------
+        self._config = dict(getattr(self._parent, "_config", {}))
         # Initialize ElementManager, passing font_attrs
         self._element_mgr = ElementManager(self, font_attrs=font_attrs)
         # self._highlighter = HighlightingService(self) # REMOVED - Use property accessor
@@ -1153,10 +1160,20 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
         # 5. Generate Text Layout using Utility
         # Pass page bbox as layout context
         page_bbox = (0, 0, self.width, self.height)
+        # Merge PDF-level default tolerances if caller did not override
+        merged_kwargs = dict(kwargs)
+        tol_keys = ["x_tolerance", "x_tolerance_ratio", "y_tolerance"]
+        for k in tol_keys:
+            if k not in merged_kwargs:
+                if k in self._config:
+                    merged_kwargs[k] = self._config[k]
+                elif k in getattr(self._parent, "_config", {}):
+                    merged_kwargs[k] = self._parent._config[k]
         result = generate_text_layout(
             char_dicts=filtered_chars,
             layout_context_bbox=page_bbox,
-            user_kwargs=kwargs,  # Pass original user kwargs
+            user_kwargs=merged_kwargs,
         )
         # --- Optional: apply Unicode BiDi algorithm for mixed RTL/LTR correctness ---
@@ -1356,6 +1373,37 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
         # Use the selected method
         if effective_method == "pdfplumber":
+            # ---------------------------------------------------------
+            # Inject auto-computed or user-specified text tolerances so
+            # pdfplumber uses the same numbers we used for word grouping
+            # whenever the table algorithm relies on word positions.
+            # ---------------------------------------------------------
+            if "text" in (
+                table_settings.get("vertical_strategy"),
+                table_settings.get("horizontal_strategy"),
+            ):
+                print("SETTING IT UP")
+                pdf_cfg = getattr(self, "_config", getattr(self._parent, "_config", {}))
+                if "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
+                    x_tol = pdf_cfg.get("x_tolerance")
+                    if x_tol is not None:
+                        table_settings.setdefault("text_x_tolerance", x_tol)
+                if "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
+                    y_tol = pdf_cfg.get("y_tolerance")
+                    if y_tol is not None:
+                        table_settings.setdefault("text_y_tolerance", y_tol)
+                # pdfplumber's text strategy benefits from a tight snap tolerance.
+                if "snap_tolerance" not in table_settings and "snap_x_tolerance" not in table_settings:
+                    # Derive from y_tol if available, else default 1
+                    snap = max(1, round((pdf_cfg.get("y_tolerance", 1)) * 0.9))
+                    table_settings.setdefault("snap_tolerance", snap)
+                if "join_tolerance" not in table_settings and "join_x_tolerance" not in table_settings:
+                    join = table_settings.get("snap_tolerance", 1)
+                    table_settings.setdefault("join_tolerance", join)
+                    table_settings.setdefault("join_x_tolerance", join)
+                    table_settings.setdefault("join_y_tolerance", join)
             return self._page.extract_tables(table_settings)
         else:
             raise ValueError(

natural_pdf/core/pdf.py CHANGED Viewed

@@ -168,6 +168,8 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         reading_order: bool = True,
         font_attrs: Optional[List[str]] = None,
         keep_spaces: bool = True,
+        text_tolerance: Optional[dict] = None,
+        auto_text_tolerance: bool = True,
     ):
         """
         Initialize the enhanced PDF object.
@@ -177,6 +179,8 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             reading_order: Whether to use natural reading order
             font_attrs: Font attributes for grouping characters into words
             keep_spaces: Whether to include spaces in word elements
+            text_tolerance: PDFplumber-style tolerance settings
+            auto_text_tolerance: Whether to automatically scale text tolerance
         """
         self._original_path_or_stream = path_or_url_or_stream
         self._temp_file = None
@@ -274,6 +278,24 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             getattr(self, "_is_stream", False),
         )
+        # --- Text tolerance settings ------------------------------------
+        # Users can pass pdfplumber-style keys (x_tolerance, x_tolerance_ratio,
+        # y_tolerance, etc.) via *text_tolerance*.  We also keep a flag that
+        # enables automatic tolerance scaling when explicit values are not
+        # supplied.
+        self._config["auto_text_tolerance"] = bool(auto_text_tolerance)
+        if text_tolerance:
+            # Only copy recognised primitives (numbers / None); ignore junk.
+            allowed = {
+                "x_tolerance",
+                "x_tolerance_ratio",
+                "y_tolerance",
+                "keep_blank_chars",  # passthrough convenience
+            }
+            for k, v in text_tolerance.items():
+                if k in allowed:
+                    self._config[k] = v
     def _initialize_managers(self):
         """Set up manager factories for lazy instantiation."""
         # Store factories/classes for each manager key

natural_pdf/elements/collections.py CHANGED Viewed

@@ -1901,7 +1901,68 @@ class ElementCollection(
             )
         )
+    # ------------------------------------------------------------------
+    # NEW METHOD: apply_ocr for collections (supports custom function)
+    # ------------------------------------------------------------------
+    def apply_ocr(
+        self,
+        *,
+        function: Optional[Callable[["Region"], Optional[str]]] = None,
+        show_progress: bool = True,
+        **kwargs,
+    ) -> "ElementCollection":
+        """Apply OCR to every element in the collection.
+        This is a convenience wrapper that simply iterates over the collection
+        and calls ``el.apply_ocr(...)`` on each item.
+        Two modes are supported depending on the arguments provided:
+        1. **Built-in OCR engines** – pass parameters like ``engine='easyocr'``
+           or ``languages=['en']`` and each element delegates to the global
+           OCRManager.
+        2. **Custom function** – pass a *callable* via the ``function`` keyword
+           (alias ``ocr_function`` also recognised).  The callable will receive
+           the element/region and must return the recognised text (or ``None``).
+           Internally this is forwarded through the element's own
+           :py:meth:`apply_ocr` implementation, so the behaviour mirrors the
+           single-element API.
+        Parameters
+        ----------
+        function : callable, optional
+            Custom OCR function to use instead of the built-in engines.
+        show_progress : bool, default True
+            Display a tqdm progress bar while processing.
+        **kwargs
+            Additional parameters forwarded to each element's ``apply_ocr``.
+        Returns
+        -------
+        ElementCollection
+            *Self* for fluent chaining.
+        """
+        # Alias for backward-compatibility
+        if function is None and "ocr_function" in kwargs:
+            function = kwargs.pop("ocr_function")
+        def _process(el):
+            if hasattr(el, "apply_ocr"):
+                if function is not None:
+                    return el.apply_ocr(function=function, **kwargs)
+                else:
+                    return el.apply_ocr(**kwargs)
+            else:
+                logger.warning(
+                    f"Element of type {type(el).__name__} does not support apply_ocr. Skipping."
+                )
+                return el
+        # Use collection's apply helper for optional progress bar
+        self.apply(_process, show_progress=show_progress)
+        return self
+    # ------------------------------------------------------------------
 class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):

natural-pdf 0.1.30__py3-none-any.whl → 0.1.32__py3-none-any.whl

natural-pdf 0.1.30__py3-none-any.whl → 0.1.32__py3-none-any.whl