PyPI - natural-pdf - Versions diffs - 0.1.31__py3-none-any.whl → 0.1.33__py3-none-any.whl - Mend

natural-pdf 0.1.31__py3-none-any.whl → 0.1.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

natural_pdf/analyzers/__init__.py +18 -4
natural_pdf/analyzers/guides.py +2176 -0
natural_pdf/analyzers/shape_detection_mixin.py +0 -650
natural_pdf/core/element_manager.py +99 -40
natural_pdf/core/page.py +76 -3
natural_pdf/core/pdf.py +38 -3
natural_pdf/elements/collections.py +61 -0
natural_pdf/elements/region.py +270 -14
{natural_pdf-0.1.31.dist-info → natural_pdf-0.1.33.dist-info}/METADATA +1 -1
{natural_pdf-0.1.31.dist-info → natural_pdf-0.1.33.dist-info}/RECORD +14 -18
bad_pdf_analysis/analyze_10_more.py +0 -300
bad_pdf_analysis/analyze_final_10.py +0 -552
bad_pdf_analysis/analyze_specific_pages.py +0 -394
bad_pdf_analysis/analyze_specific_pages_direct.py +0 -382
tools/rtl_smoke_test.py +0 -80
{natural_pdf-0.1.31.dist-info → natural_pdf-0.1.33.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.31.dist-info → natural_pdf-0.1.33.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.31.dist-info → natural_pdf-0.1.33.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.31.dist-info → natural_pdf-0.1.33.dist-info}/top_level.txt +0 -0

natural_pdf/core/element_manager.py CHANGED Viewed

@@ -146,7 +146,7 @@ class ElementManager:
     contained in the Page class, providing better separation of concerns.
     """
-    def __init__(self, page, font_attrs=None):
+    def __init__(self, page, font_attrs=None, load_text: bool = True):
         """
         Initialize the ElementManager.
@@ -156,9 +156,11 @@ class ElementManager:
                        Default: ['fontname', 'size', 'bold', 'italic']
                        None: Only consider spatial relationships
                        List: Custom attributes to consider
+            load_text: Whether to load text elements from the PDF (default: True).
         """
         self._page = page
         self._elements = None  # Lazy-loaded
+        self._load_text = load_text
         # Default to splitting by fontname, size, bold, italic if not specified
         # Renamed internal variable for clarity
         self._word_split_attributes = (
@@ -175,11 +177,15 @@ class ElementManager:
         logger.debug(f"Page {self._page.number}: Loading elements...")
-        # 1. Prepare character dictionaries (native + OCR) with necessary attributes
-        prepared_char_dicts = self._prepare_char_dicts()
-        logger.debug(
-            f"Page {self._page.number}: Prepared {len(prepared_char_dicts)} character dictionaries."
-        )
+        # 1. Prepare character dictionaries only if loading text
+        if self._load_text:
+            prepared_char_dicts = self._prepare_char_dicts()
+            logger.debug(
+                f"Page {self._page.number}: Prepared {len(prepared_char_dicts)} character dictionaries."
+            )
+        else:
+            prepared_char_dicts = []
+            logger.debug(f"Page {self._page.number}: Skipping text loading (load_text=False)")
         # -------------------------------------------------------------
         # Detect strikethrough (horizontal strike-out lines) on raw
@@ -189,52 +195,105 @@ class ElementManager:
         # belong to the same word.
         # -------------------------------------------------------------
-        try:
-            self._mark_strikethrough_chars(prepared_char_dicts)
-        except Exception as strike_err:  # pragma: no cover – strike detection must never crash loading
-            logger.warning(
-                f"Page {self._page.number}: Strikethrough detection failed – {strike_err}",
-                exc_info=True,
-            )
+        if self._load_text and prepared_char_dicts:
+            try:
+                self._mark_strikethrough_chars(prepared_char_dicts)
+            except Exception as strike_err:  # pragma: no cover – strike detection must never crash loading
+                logger.warning(
+                    f"Page {self._page.number}: Strikethrough detection failed – {strike_err}",
+                    exc_info=True,
+                )
         # -------------------------------------------------------------
         # Detect underlines on raw characters (must come after strike so
         # both attributes are present before word grouping).
         # -------------------------------------------------------------
-        try:
-            self._mark_underline_chars(prepared_char_dicts)
-        except Exception as u_err:  # pragma: no cover
-            logger.warning(
-                f"Page {self._page.number}: Underline detection failed – {u_err}",
-                exc_info=True,
-            )
+        if self._load_text and prepared_char_dicts:
+            try:
+                self._mark_underline_chars(prepared_char_dicts)
+            except Exception as u_err:  # pragma: no cover
+                logger.warning(
+                    f"Page {self._page.number}: Underline detection failed – {u_err}",
+                    exc_info=True,
+                )
         # Detect highlights
-        try:
-            self._mark_highlight_chars(prepared_char_dicts)
-        except Exception as h_err:
-            logger.warning(
-                f"Page {self._page.number}: Highlight detection failed – {h_err}",
-                exc_info=True,
-            )
+        if self._load_text and prepared_char_dicts:
+            try:
+                self._mark_highlight_chars(prepared_char_dicts)
+            except Exception as h_err:
+                logger.warning(
+                    f"Page {self._page.number}: Highlight detection failed – {h_err}",
+                    exc_info=True,
+                )
         # Create a mapping from character dict to index for efficient lookup
-        char_to_index = {}
-        for idx, char_dict in enumerate(prepared_char_dicts):
-            key = (
-                char_dict.get("x0", 0),
-                char_dict.get("top", 0),
-                char_dict.get("text", ""),
-            )
-            char_to_index[key] = idx
+        if self._load_text:
+            char_to_index = {}
+            for idx, char_dict in enumerate(prepared_char_dicts):
+                key = (
+                    char_dict.get("x0", 0),
+                    char_dict.get("top", 0),
+                    char_dict.get("text", ""),
+                )
+                char_to_index[key] = idx
+        else:
+            char_to_index = {}
         # 2. Instantiate the custom word extractor
-        # Get config settings from the parent PDF or use defaults
+        # Prefer page-level config over PDF-level for tolerance lookup
+        word_elements: List[TextElement] = []
+        # Get config objects (needed for auto_text_tolerance check)
+        page_config = getattr(self._page, "_config", {})
         pdf_config = getattr(self._page._parent, "_config", {})
-        xt = pdf_config.get("x_tolerance", 3)
-        yt = pdf_config.get("y_tolerance", 3)
+        # Initialize tolerance variables
+        xt = None
+        yt = None
         use_flow = pdf_config.get("use_text_flow", False)
+        if self._load_text and prepared_char_dicts:
+            # Start with any explicitly supplied tolerances (may be None)
+            xt = page_config.get("x_tolerance", pdf_config.get("x_tolerance"))
+            yt = page_config.get("y_tolerance", pdf_config.get("y_tolerance"))
+        # ------------------------------------------------------------------
+        # Auto-adaptive tolerance: scale based on median character size when
+        # requested and explicit values are absent.
+        # ------------------------------------------------------------------
+        if self._load_text and pdf_config.get("auto_text_tolerance", True):
+            import statistics
+            sizes = [c.get("size", 0) for c in prepared_char_dicts if c.get("size")]
+            median_size = None
+            if sizes:
+                median_size = statistics.median(sizes)
+                if xt is None:
+                    xt = 0.25 * median_size  # ~kerning width
+                    # Record back to page config for downstream users
+                    page_config["x_tolerance"] = xt
+                if yt is None:
+                    yt = 0.6 * median_size   # ~line spacing fraction
+                    page_config["y_tolerance"] = yt
+            # Warn users when the page's font size is extremely small –
+            # this is often the root cause of merged-row/column issues.
+            if median_size and median_size < 6:  # 6 pt is unusually small
+                logger.warning(
+                    f"Page {self._page.number}: Median font size is only {median_size:.1f} pt; "
+                    f"auto-set x_tolerance={xt:.2f}, y_tolerance={yt:.2f}. "
+                    "If the output looks wrong you can override these values via "
+                    "PDF(..., text_tolerance={'x_tolerance': X, 'y_tolerance': Y}, "
+                    "auto_text_tolerance=False)."
+                )
+        # Fallback to pdfplumber defaults if still None
+        if xt is None:
+            xt = 3
+        if yt is None:
+            yt = 3
         # List of attributes to preserve on word objects
         attributes_to_preserve = list(
@@ -284,7 +343,6 @@ class ElementManager:
                 current_line_key = line_key
             lines[-1].append(char_dict)
-        word_elements: List[TextElement] = []
         # Process each line separately with direction detection
         for line_chars in lines:
             if not line_chars:
@@ -441,7 +499,8 @@ class ElementManager:
                         except Exception:
                             w._obj["highlight_color"] = dominant_color
-        generated_words = word_elements
+        # generated_words defaults to empty list if text loading is disabled
+        generated_words = word_elements if self._load_text else []
         logger.debug(
             f"Page {self._page.number}: Generated {len(generated_words)} words using NaturalWordExtractor."
         )

natural_pdf/core/page.py CHANGED Viewed

@@ -101,7 +101,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
     with improved selection, navigation, extraction, and question-answering capabilities.
     """
-    def __init__(self, page: "pdfplumber.page.Page", parent: "PDF", index: int, font_attrs=None):
+    def __init__(self, page: "pdfplumber.page.Page", parent: "PDF", index: int, font_attrs=None, load_text: bool = True):
         """
         Initialize a page wrapper.
@@ -110,10 +110,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
             parent: Parent PDF object
             index: Index of this page in the PDF (0-based)
             font_attrs: Font attributes to consider when grouping characters into words.
+            load_text: Whether to load text elements from the PDF (default: True).
         """
         self._page = page
         self._parent = parent
         self._index = index
+        self._load_text = load_text
         self._text_styles = None  # Lazy-loaded text style analyzer results
         self._exclusions = []  # List to store exclusion functions/regions
         self._skew_angle: Optional[float] = None  # Stores detected skew angle
@@ -128,8 +130,15 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
             "named": {},  # Named regions (name -> region)
         }
+        # -------------------------------------------------------------
+        # Page-scoped configuration begins as a shallow copy of the parent
+        # PDF-level configuration so that auto-computed tolerances or other
+        # page-specific values do not overwrite siblings.
+        # -------------------------------------------------------------
+        self._config = dict(getattr(self._parent, "_config", {}))
         # Initialize ElementManager, passing font_attrs
-        self._element_mgr = ElementManager(self, font_attrs=font_attrs)
+        self._element_mgr = ElementManager(self, font_attrs=font_attrs, load_text=self._load_text)
         # self._highlighter = HighlightingService(self) # REMOVED - Use property accessor
         # --- NEW --- Central registry for analysis results
         self.analyses: Dict[str, Any] = {}
@@ -1153,10 +1162,20 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
         # 5. Generate Text Layout using Utility
         # Pass page bbox as layout context
         page_bbox = (0, 0, self.width, self.height)
+        # Merge PDF-level default tolerances if caller did not override
+        merged_kwargs = dict(kwargs)
+        tol_keys = ["x_tolerance", "x_tolerance_ratio", "y_tolerance"]
+        for k in tol_keys:
+            if k not in merged_kwargs:
+                if k in self._config:
+                    merged_kwargs[k] = self._config[k]
+                elif k in getattr(self._parent, "_config", {}):
+                    merged_kwargs[k] = self._parent._config[k]
         result = generate_text_layout(
             char_dicts=filtered_chars,
             layout_context_bbox=page_bbox,
-            user_kwargs=kwargs,  # Pass original user kwargs
+            user_kwargs=merged_kwargs,
         )
         # --- Optional: apply Unicode BiDi algorithm for mixed RTL/LTR correctness ---
@@ -1356,6 +1375,37 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
         # Use the selected method
         if effective_method == "pdfplumber":
+            # ---------------------------------------------------------
+            # Inject auto-computed or user-specified text tolerances so
+            # pdfplumber uses the same numbers we used for word grouping
+            # whenever the table algorithm relies on word positions.
+            # ---------------------------------------------------------
+            if "text" in (
+                table_settings.get("vertical_strategy"),
+                table_settings.get("horizontal_strategy"),
+            ):
+                print("SETTING IT UP")
+                pdf_cfg = getattr(self, "_config", getattr(self._parent, "_config", {}))
+                if "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
+                    x_tol = pdf_cfg.get("x_tolerance")
+                    if x_tol is not None:
+                        table_settings.setdefault("text_x_tolerance", x_tol)
+                if "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
+                    y_tol = pdf_cfg.get("y_tolerance")
+                    if y_tol is not None:
+                        table_settings.setdefault("text_y_tolerance", y_tol)
+                # pdfplumber's text strategy benefits from a tight snap tolerance.
+                if "snap_tolerance" not in table_settings and "snap_x_tolerance" not in table_settings:
+                    # Derive from y_tol if available, else default 1
+                    snap = max(1, round((pdf_cfg.get("y_tolerance", 1)) * 0.9))
+                    table_settings.setdefault("snap_tolerance", snap)
+                if "join_tolerance" not in table_settings and "join_x_tolerance" not in table_settings:
+                    join = table_settings.get("snap_tolerance", 1)
+                    table_settings.setdefault("join_tolerance", join)
+                    table_settings.setdefault("join_x_tolerance", join)
+                    table_settings.setdefault("join_y_tolerance", join)
             return self._page.extract_tables(table_settings)
         else:
             raise ValueError(
@@ -2950,6 +3000,29 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
         """
         return self.find_all('*').inspect(limit=limit)
+    def remove_text_layer(self) -> "Page":
+        """
+        Remove all text elements from this page.
+        This removes all text elements (words and characters) from the page,
+        effectively clearing the text layer.
+        Returns:
+            Self for method chaining
+        """
+        logger.info(f"Page {self.number}: Removing all text elements...")
+        # Remove all words and chars from the element manager
+        removed_words = len(self._element_mgr.words)
+        removed_chars = len(self._element_mgr.chars)
+        # Clear the lists
+        self._element_mgr._elements["words"] = []
+        self._element_mgr._elements["chars"] = []
+        logger.info(f"Page {self.number}: Removed {removed_words} words and {removed_chars} characters")
+        return self
     @property
     def lines(self) -> List[Any]:
         """Get all line elements on this page."""

natural_pdf/core/pdf.py CHANGED Viewed

@@ -108,12 +108,13 @@ class _LazyPageList(Sequence):
     also supported and will materialise pages on demand.
     """
-    def __init__(self, parent_pdf: "PDF", plumber_pdf: "pdfplumber.PDF", font_attrs=None):
+    def __init__(self, parent_pdf: "PDF", plumber_pdf: "pdfplumber.PDF", font_attrs=None, load_text=True):
         self._parent_pdf = parent_pdf
         self._plumber_pdf = plumber_pdf
         self._font_attrs = font_attrs
         # One slot per pdfplumber page – initially all None
         self._cache: List[Optional["Page"]] = [None] * len(self._plumber_pdf.pages)
+        self._load_text = load_text
     # Internal helper -----------------------------------------------------
     def _create_page(self, index: int) -> "Page":
@@ -123,7 +124,7 @@ class _LazyPageList(Sequence):
             from natural_pdf.core.page import Page
             plumber_page = self._plumber_pdf.pages[index]
-            cached = Page(plumber_page, parent=self._parent_pdf, index=index, font_attrs=self._font_attrs)
+            cached = Page(plumber_page, parent=self._parent_pdf, index=index, font_attrs=self._font_attrs, load_text=self._load_text)
             self._cache[index] = cached
         return cached
@@ -168,6 +169,9 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         reading_order: bool = True,
         font_attrs: Optional[List[str]] = None,
         keep_spaces: bool = True,
+        text_tolerance: Optional[dict] = None,
+        auto_text_tolerance: bool = True,
+        text_layer: bool = True,
     ):
         """
         Initialize the enhanced PDF object.
@@ -177,11 +181,16 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             reading_order: Whether to use natural reading order
             font_attrs: Font attributes for grouping characters into words
             keep_spaces: Whether to include spaces in word elements
+            text_tolerance: PDFplumber-style tolerance settings
+            auto_text_tolerance: Whether to automatically scale text tolerance
+            text_layer: Whether to keep the existing text layer from the PDF (default: True).
+                       If False, removes all existing text elements during initialization.
         """
         self._original_path_or_stream = path_or_url_or_stream
         self._temp_file = None
         self._resolved_path = None
         self._is_stream = False
+        self._text_layer = text_layer
         stream_to_open = None
         if hasattr(path_or_url_or_stream, "read"):  # Check if it's file-like
@@ -253,7 +262,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         self._manager_registry = {}
         # Lazily instantiate pages only when accessed
-        self._pages = _LazyPageList(self, self._pdf, font_attrs=font_attrs)
+        self._pages = _LazyPageList(self, self._pdf, font_attrs=font_attrs, load_text=self._text_layer)
         self._element_cache = {}
         self._exclusions = []
@@ -263,6 +272,13 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         self._initialize_managers()
         self._initialize_highlighter()
+        # Remove text layer if requested
+        if not self._text_layer:
+            logger.info("Removing text layer as requested (text_layer=False)")
+            # Text layer is not loaded when text_layer=False, so no need to remove
+            pass
         # Analysis results accessed via self.analyses property (see below)
         # --- Automatic cleanup when object is garbage-collected ---
@@ -274,6 +290,24 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             getattr(self, "_is_stream", False),
         )
+        # --- Text tolerance settings ------------------------------------
+        # Users can pass pdfplumber-style keys (x_tolerance, x_tolerance_ratio,
+        # y_tolerance, etc.) via *text_tolerance*.  We also keep a flag that
+        # enables automatic tolerance scaling when explicit values are not
+        # supplied.
+        self._config["auto_text_tolerance"] = bool(auto_text_tolerance)
+        if text_tolerance:
+            # Only copy recognised primitives (numbers / None); ignore junk.
+            allowed = {
+                "x_tolerance",
+                "x_tolerance_ratio",
+                "y_tolerance",
+                "keep_blank_chars",  # passthrough convenience
+            }
+            for k, v in text_tolerance.items():
+                if k in allowed:
+                    self._config[k] = v
     def _initialize_managers(self):
         """Set up manager factories for lazy instantiation."""
         # Store factories/classes for each manager key
@@ -1441,6 +1475,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
                 reading_order=self._reading_order,
                 font_attrs=self._font_attrs,
                 keep_spaces=self._config.get("keep_spaces", True),
+                text_layer=self._text_layer,
             )
             return new_pdf
         except Exception as e:

natural_pdf/elements/collections.py CHANGED Viewed

@@ -1901,7 +1901,68 @@ class ElementCollection(
             )
         )
+    # ------------------------------------------------------------------
+    # NEW METHOD: apply_ocr for collections (supports custom function)
+    # ------------------------------------------------------------------
+    def apply_ocr(
+        self,
+        *,
+        function: Optional[Callable[["Region"], Optional[str]]] = None,
+        show_progress: bool = True,
+        **kwargs,
+    ) -> "ElementCollection":
+        """Apply OCR to every element in the collection.
+        This is a convenience wrapper that simply iterates over the collection
+        and calls ``el.apply_ocr(...)`` on each item.
+        Two modes are supported depending on the arguments provided:
+        1. **Built-in OCR engines** – pass parameters like ``engine='easyocr'``
+           or ``languages=['en']`` and each element delegates to the global
+           OCRManager.
+        2. **Custom function** – pass a *callable* via the ``function`` keyword
+           (alias ``ocr_function`` also recognised).  The callable will receive
+           the element/region and must return the recognised text (or ``None``).
+           Internally this is forwarded through the element's own
+           :py:meth:`apply_ocr` implementation, so the behaviour mirrors the
+           single-element API.
+        Parameters
+        ----------
+        function : callable, optional
+            Custom OCR function to use instead of the built-in engines.
+        show_progress : bool, default True
+            Display a tqdm progress bar while processing.
+        **kwargs
+            Additional parameters forwarded to each element's ``apply_ocr``.
+        Returns
+        -------
+        ElementCollection
+            *Self* for fluent chaining.
+        """
+        # Alias for backward-compatibility
+        if function is None and "ocr_function" in kwargs:
+            function = kwargs.pop("ocr_function")
+        def _process(el):
+            if hasattr(el, "apply_ocr"):
+                if function is not None:
+                    return el.apply_ocr(function=function, **kwargs)
+                else:
+                    return el.apply_ocr(**kwargs)
+            else:
+                logger.warning(
+                    f"Element of type {type(el).__name__} does not support apply_ocr. Skipping."
+                )
+                return el
+        # Use collection's apply helper for optional progress bar
+        self.apply(_process, show_progress=show_progress)
+        return self
+    # ------------------------------------------------------------------
 class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):

natural-pdf 0.1.31__py3-none-any.whl → 0.1.33__py3-none-any.whl

natural-pdf 0.1.31__py3-none-any.whl → 0.1.33__py3-none-any.whl