PyPI - natural-pdf - Versions diffs - 0.1.31__py3-none-any.whl → 0.1.32__py3-none-any.whl - Mend

natural-pdf 0.1.31 (py3-none-any.whl) → 0.1.32 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

natural_pdf/analyzers/__init__.py +18 -4
natural_pdf/analyzers/guides.py +2176 -0
natural_pdf/analyzers/shape_detection_mixin.py +0 -650
natural_pdf/core/element_manager.py +42 -3
natural_pdf/core/page.py +49 -1
natural_pdf/core/pdf.py +22 -0
natural_pdf/elements/collections.py +61 -0
natural_pdf/elements/region.py +257 -14
{natural_pdf-0.1.31.dist-info → natural_pdf-0.1.32.dist-info}/METADATA +1 -1
{natural_pdf-0.1.31.dist-info → natural_pdf-0.1.32.dist-info}/RECORD +14 -18
bad_pdf_analysis/analyze_10_more.py +0 -300
bad_pdf_analysis/analyze_final_10.py +0 -552
bad_pdf_analysis/analyze_specific_pages.py +0 -394
bad_pdf_analysis/analyze_specific_pages_direct.py +0 -382
tools/rtl_smoke_test.py +0 -80
{natural_pdf-0.1.31.dist-info → natural_pdf-0.1.32.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.31.dist-info → natural_pdf-0.1.32.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.31.dist-info → natural_pdf-0.1.32.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.31.dist-info → natural_pdf-0.1.32.dist-info}/top_level.txt +0 -0

natural_pdf/core/element_manager.py CHANGED Viewed

@@ -230,12 +230,51 @@ class ElementManager:
             char_to_index[key] = idx
         # 2. Instantiate the custom word extractor
-        # Get config settings from the parent PDF or use defaults
+        # Prefer page-level config over PDF-level for tolerance lookup
+        page_config = getattr(self._page, "_config", {})
         pdf_config = getattr(self._page._parent, "_config", {})
-        xt = pdf_config.get("x_tolerance", 3)
-        yt = pdf_config.get("y_tolerance", 3)
+        # Start with any explicitly supplied tolerances (may be None)
+        xt = page_config.get("x_tolerance", pdf_config.get("x_tolerance"))
+        yt = page_config.get("y_tolerance", pdf_config.get("y_tolerance"))
         use_flow = pdf_config.get("use_text_flow", False)
+        # ------------------------------------------------------------------
+        # Auto-adaptive tolerance: scale based on median character size when
+        # requested and explicit values are absent.
+        # ------------------------------------------------------------------
+        if pdf_config.get("auto_text_tolerance", True):
+            import statistics
+            sizes = [c.get("size", 0) for c in prepared_char_dicts if c.get("size")]
+            median_size = None
+            if sizes:
+                median_size = statistics.median(sizes)
+                if xt is None:
+                    xt = 0.25 * median_size  # ~kerning width
+                    # Record back to page config for downstream users
+                    page_config["x_tolerance"] = xt
+                if yt is None:
+                    yt = 0.6 * median_size   # ~line spacing fraction
+                    page_config["y_tolerance"] = yt
+            # Warn users when the page's font size is extremely small –
+            # this is often the root cause of merged-row/column issues.
+            if median_size and median_size < 6:  # 6 pt is unusually small
+                logger.warning(
+                    f"Page {self._page.number}: Median font size is only {median_size:.1f} pt; "
+                    f"auto-set x_tolerance={xt:.2f}, y_tolerance={yt:.2f}. "
+                    "If the output looks wrong you can override these values via "
+                    "PDF(..., text_tolerance={'x_tolerance': X, 'y_tolerance': Y}, "
+                    "auto_text_tolerance=False)."
+                )
+        # Fallback to pdfplumber defaults if still None
+        if xt is None:
+            xt = 3
+        if yt is None:
+            yt = 3
         # List of attributes to preserve on word objects
         attributes_to_preserve = list(
             set(

natural_pdf/core/page.py CHANGED Viewed

@@ -128,6 +128,13 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
             "named": {},  # Named regions (name -> region)
         }
+        # -------------------------------------------------------------
+        # Page-scoped configuration begins as a shallow copy of the parent
+        # PDF-level configuration so that auto-computed tolerances or other
+        # page-specific values do not overwrite siblings.
+        # -------------------------------------------------------------
+        self._config = dict(getattr(self._parent, "_config", {}))
         # Initialize ElementManager, passing font_attrs
         self._element_mgr = ElementManager(self, font_attrs=font_attrs)
         # self._highlighter = HighlightingService(self) # REMOVED - Use property accessor
@@ -1153,10 +1160,20 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
         # 5. Generate Text Layout using Utility
         # Pass page bbox as layout context
         page_bbox = (0, 0, self.width, self.height)
+        # Merge PDF-level default tolerances if caller did not override
+        merged_kwargs = dict(kwargs)
+        tol_keys = ["x_tolerance", "x_tolerance_ratio", "y_tolerance"]
+        for k in tol_keys:
+            if k not in merged_kwargs:
+                if k in self._config:
+                    merged_kwargs[k] = self._config[k]
+                elif k in getattr(self._parent, "_config", {}):
+                    merged_kwargs[k] = self._parent._config[k]
         result = generate_text_layout(
             char_dicts=filtered_chars,
             layout_context_bbox=page_bbox,
-            user_kwargs=kwargs,  # Pass original user kwargs
+            user_kwargs=merged_kwargs,
         )
         # --- Optional: apply Unicode BiDi algorithm for mixed RTL/LTR correctness ---
@@ -1356,6 +1373,37 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
         # Use the selected method
         if effective_method == "pdfplumber":
+            # ---------------------------------------------------------
+            # Inject auto-computed or user-specified text tolerances so
+            # pdfplumber uses the same numbers we used for word grouping
+            # whenever the table algorithm relies on word positions.
+            # ---------------------------------------------------------
+            if "text" in (
+                table_settings.get("vertical_strategy"),
+                table_settings.get("horizontal_strategy"),
+            ):
+                print("SETTING IT UP")
+                pdf_cfg = getattr(self, "_config", getattr(self._parent, "_config", {}))
+                if "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
+                    x_tol = pdf_cfg.get("x_tolerance")
+                    if x_tol is not None:
+                        table_settings.setdefault("text_x_tolerance", x_tol)
+                if "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
+                    y_tol = pdf_cfg.get("y_tolerance")
+                    if y_tol is not None:
+                        table_settings.setdefault("text_y_tolerance", y_tol)
+                # pdfplumber's text strategy benefits from a tight snap tolerance.
+                if "snap_tolerance" not in table_settings and "snap_x_tolerance" not in table_settings:
+                    # Derive from y_tol if available, else default 1
+                    snap = max(1, round((pdf_cfg.get("y_tolerance", 1)) * 0.9))
+                    table_settings.setdefault("snap_tolerance", snap)
+                if "join_tolerance" not in table_settings and "join_x_tolerance" not in table_settings:
+                    join = table_settings.get("snap_tolerance", 1)
+                    table_settings.setdefault("join_tolerance", join)
+                    table_settings.setdefault("join_x_tolerance", join)
+                    table_settings.setdefault("join_y_tolerance", join)
             return self._page.extract_tables(table_settings)
         else:
             raise ValueError(

natural_pdf/core/pdf.py CHANGED Viewed

@@ -168,6 +168,8 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         reading_order: bool = True,
         font_attrs: Optional[List[str]] = None,
         keep_spaces: bool = True,
+        text_tolerance: Optional[dict] = None,
+        auto_text_tolerance: bool = True,
     ):
         """
         Initialize the enhanced PDF object.
@@ -177,6 +179,8 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             reading_order: Whether to use natural reading order
             font_attrs: Font attributes for grouping characters into words
             keep_spaces: Whether to include spaces in word elements
+            text_tolerance: PDFplumber-style tolerance settings
+            auto_text_tolerance: Whether to automatically scale text tolerance
         """
         self._original_path_or_stream = path_or_url_or_stream
         self._temp_file = None
@@ -274,6 +278,24 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             getattr(self, "_is_stream", False),
         )
+        # --- Text tolerance settings ------------------------------------
+        # Users can pass pdfplumber-style keys (x_tolerance, x_tolerance_ratio,
+        # y_tolerance, etc.) via *text_tolerance*.  We also keep a flag that
+        # enables automatic tolerance scaling when explicit values are not
+        # supplied.
+        self._config["auto_text_tolerance"] = bool(auto_text_tolerance)
+        if text_tolerance:
+            # Only copy recognised primitives (numbers / None); ignore junk.
+            allowed = {
+                "x_tolerance",
+                "x_tolerance_ratio",
+                "y_tolerance",
+                "keep_blank_chars",  # passthrough convenience
+            }
+            for k, v in text_tolerance.items():
+                if k in allowed:
+                    self._config[k] = v
     def _initialize_managers(self):
         """Set up manager factories for lazy instantiation."""
         # Store factories/classes for each manager key

natural_pdf/elements/collections.py CHANGED Viewed

@@ -1901,7 +1901,68 @@ class ElementCollection(
             )
         )
+    # ------------------------------------------------------------------
+    # NEW METHOD: apply_ocr for collections (supports custom function)
+    # ------------------------------------------------------------------
+    def apply_ocr(
+        self,
+        *,
+        function: Optional[Callable[["Region"], Optional[str]]] = None,
+        show_progress: bool = True,
+        **kwargs,
+    ) -> "ElementCollection":
+        """Run OCR on each element of this collection and return the collection.
+        Every element that exposes an ``apply_ocr`` method has it invoked once;
+        elements without one are logged with a warning and left untouched.
+        The call is forwarded in one of two shapes:
+        * **Engine mode** – no callable supplied; ``**kwargs`` (for example
+          ``engine='easyocr'`` or ``languages=['en']``) are passed straight to
+          ``el.apply_ocr(**kwargs)``.
+        * **Custom-function mode** – a callable is supplied through ``function``
+          (or through the ``ocr_function`` alias when ``function`` is not given);
+          it is passed on as ``el.apply_ocr(function=..., **kwargs)``.
+        Parameters
+        ----------
+        function : callable, optional
+            Callable taking the element/region and returning text or ``None``.
+        show_progress : bool, default True
+            Forwarded to ``self.apply`` as its ``show_progress`` argument.
+        **kwargs
+            Extra keyword arguments handed to each element's ``apply_ocr``.
+        Returns
+        -------
+        ElementCollection
+            This same collection, so calls can be chained.
+        """
+        # Honour the legacy keyword only when the primary one was not supplied
+        if function is None and "ocr_function" in kwargs:
+            function = kwargs.pop("ocr_function")
+        def _ocr_one(element):
+            # Guard clause: anything that cannot be OCR'd is reported and passed through
+            if not hasattr(element, "apply_ocr"):
+                logger.warning(
+                    f"Element of type {type(element).__name__} does not support apply_ocr. Skipping."
+                )
+                return element
+            if function is None:
+                return element.apply_ocr(**kwargs)
+            return element.apply_ocr(function=function, **kwargs)
+        # Delegate iteration (and the optional progress display) to the apply helper
+        self.apply(_ocr_one, show_progress=show_progress)
+        return self
+    # ------------------------------------------------------------------
 class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):

natural_pdf/elements/region.py CHANGED Viewed

@@ -1319,6 +1319,28 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
             table_settings.setdefault("vertical_strategy", "lines")
             table_settings.setdefault("horizontal_strategy", "lines")
+        # -------------------------------------------------------------
+        # Auto-inject tolerances when text-based strategies are requested.
+        # This must happen AFTER alias handling (so strategies are final)
+        # and BEFORE we delegate to _extract_table_* helpers.
+        # -------------------------------------------------------------
+        if "text" in (table_settings.get("vertical_strategy"), table_settings.get("horizontal_strategy")):
+            page_cfg = getattr(self.page, "_config", {})
+            # Ensure text_* tolerances passed to pdfplumber
+            if "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
+                if page_cfg.get("x_tolerance") is not None:
+                    table_settings["text_x_tolerance"] = page_cfg["x_tolerance"]
+            if "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
+                if page_cfg.get("y_tolerance") is not None:
+                    table_settings["text_y_tolerance"] = page_cfg["y_tolerance"]
+            # Snap / join tolerances (~ line spacing)
+            if "snap_tolerance" not in table_settings and "snap_x_tolerance" not in table_settings:
+                snap = max(1, round((page_cfg.get("y_tolerance", 1)) * 0.9))
+                table_settings["snap_tolerance"] = snap
+            if "join_tolerance" not in table_settings and "join_x_tolerance" not in table_settings:
+                table_settings["join_tolerance"] = table_settings["snap_tolerance"]
         logger.debug(f"Region {self.bbox}: Extracting table using method '{effective_method}'")
         # Use the selected method
@@ -1438,6 +1460,30 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         Returns:
             List of tables, where each table is a list of rows, and each row is a list of cell values
         """
+        # Inject global PDF-level text tolerances if not explicitly present
+        pdf_cfg = getattr(self.page, "_config", getattr(self.page._parent, "_config", {}))
+        _uses_text = "text" in (
+            table_settings.get("vertical_strategy"),
+            table_settings.get("horizontal_strategy"),
+        )
+        if _uses_text and "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
+            x_tol = pdf_cfg.get("x_tolerance")
+            if x_tol is not None:
+                table_settings.setdefault("text_x_tolerance", x_tol)
+        if _uses_text and "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
+            y_tol = pdf_cfg.get("y_tolerance")
+            if y_tol is not None:
+                table_settings.setdefault("text_y_tolerance", y_tol)
+        if _uses_text and "snap_tolerance" not in table_settings and "snap_x_tolerance" not in table_settings:
+            snap = max(1, round((pdf_cfg.get("y_tolerance", 1)) * 0.9))
+            table_settings.setdefault("snap_tolerance", snap)
+        if _uses_text and "join_tolerance" not in table_settings and "join_x_tolerance" not in table_settings:
+            join = table_settings.get("snap_tolerance", 1)
+            table_settings.setdefault("join_tolerance", join)
+            table_settings.setdefault("join_x_tolerance", join)
+            table_settings.setdefault("join_y_tolerance", join)
         # Create a crop of the page for this region
         cropped = self.page._page.crop(self.bbox)
@@ -1458,6 +1504,21 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         Returns:
             Table data as a list of rows, where each row is a list of cell values
         """
+        # Inject global PDF-level text tolerances if not explicitly present
+        pdf_cfg = getattr(self.page, "_config", getattr(self.page._parent, "_config", {}))
+        _uses_text = "text" in (
+            table_settings.get("vertical_strategy"),
+            table_settings.get("horizontal_strategy"),
+        )
+        if _uses_text and "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
+            x_tol = pdf_cfg.get("x_tolerance")
+            if x_tol is not None:
+                table_settings.setdefault("text_x_tolerance", x_tol)
+        if _uses_text and "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
+            y_tol = pdf_cfg.get("y_tolerance")
+            if y_tol is not None:
+                table_settings.setdefault("text_y_tolerance", y_tol)
         # Create a crop of the page for this region
         cropped = self.page._page.crop(self.bbox)
@@ -1943,21 +2004,45 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         """
         Apply OCR to this region and return the created text elements.
+        This method supports two modes:
+        1. **Built-in OCR Engines** (default) – identical to previous behaviour. Pass typical
+           parameters like ``engine='easyocr'`` or ``languages=['en']`` and the method will
+           route the request through :class:`OCRManager`.
+        2. **Custom OCR Function** – pass a *callable* under the keyword ``function`` (or
+           ``ocr_function``). The callable will receive *this* Region instance and should
+           return the extracted text (``str``) or ``None``.  Internally the call is
+           delegated to :pymeth:`apply_custom_ocr` so the same logic (replacement, element
+           creation, etc.) is re-used.
+        Examples
+        ---------
+        >>> def llm_ocr(region):
+        ...     image = region.to_image(resolution=300, crop=True)
+        ...     return my_llm_client.ocr(image)
+        >>> region.apply_ocr(function=llm_ocr)
         Args:
-            replace: If True (default), removes existing OCR elements in the region
-                    before adding new ones. If False, adds new OCR elements without
-                    removing existing ones.
-            **ocr_params: Keyword arguments passed to the OCR Manager.
-                          Common parameters like `engine`, `languages`, `min_confidence`,
-                          `device`, and `resolution` (for image rendering) should be
-                          provided here. **The `languages` list must contain codes
-                          understood by the specific engine selected.** No mapping
-                          is performed. Engine-specific settings can be passed in
-                          an `options` object (e.g., `options=EasyOCROptions(...)`).
+            replace: Whether to remove existing OCR elements first (default ``True``).
+            **ocr_params: Parameters for the built-in OCR manager *or* the special
+                          ``function``/``ocr_function`` keyword to trigger custom mode.
+        Returns
+        -------
+            Self – for chaining.
+        """
+        # --- Custom OCR function path --------------------------------------------------
+        custom_func = ocr_params.pop("function", None) or ocr_params.pop("ocr_function", None)
+        if callable(custom_func):
+            # Delegate to the specialised helper while preserving key kwargs
+            return self.apply_custom_ocr(
+                ocr_function=custom_func,
+                source_label=ocr_params.pop("source_label", "custom-ocr"),
+                replace=replace,
+                confidence=ocr_params.pop("confidence", None),
+                add_to_page=ocr_params.pop("add_to_page", True),
+            )
-        Returns:
-            Self for method chaining.
-        """
+        # --- Original built-in OCR engine path (unchanged except docstring) ------------
         # Ensure OCRManager is available
         if not hasattr(self.page._parent, "_ocr_manager") or self.page._parent._ocr_manager is None:
             logger.error("OCRManager not available on parent PDF. Cannot apply OCR to region.")
@@ -2123,6 +2208,133 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         logger.info(f"Region {self.bbox}: Added {len(created_elements)} elements from OCR.")
         return self
+    def apply_custom_ocr(
+        self,
+        ocr_function: Callable[["Region"], Optional[str]],
+        source_label: str = "custom-ocr",
+        replace: bool = True,
+        confidence: Optional[float] = None,
+        add_to_page: bool = True,
+    ) -> "Region":
+        """
+        Apply a custom OCR function to this region and create a text element from the result.
+        This is useful when you want to use a custom OCR method (e.g., an LLM API,
+        specialized OCR service, or any custom logic) instead of the built-in OCR engines.
+        Args:
+            ocr_function: A callable that takes a Region and returns the OCR'd text (or None).
+                          The function receives this region as its argument. A non-string,
+                          non-None return value is converted with ``str()`` (a warning is logged).
+            source_label: Label to identify the source of these text elements (default: "custom-ocr").
+                          Passed to ``to_text_element`` and also used to find earlier results
+                          to remove when ``replace`` is True.
+            replace: If True (default), removes existing word elements that intersect this
+                     region and whose ``source`` starts with "ocr" or equals ``source_label``
+                     before calling ``ocr_function``. If False, nothing is removed.
+            confidence: Optional confidence score, forwarded unchanged to ``to_text_element``.
+            add_to_page: Forwarded to ``to_text_element``; if True (default) the created
+                         element is added to the page.
+        Returns:
+            Self for method chaining. Self is also returned (after logging the error) when
+            ``ocr_function`` raises, and when it returns None (no element is created).
+        Example:
+            # Using with an LLM
+            def ocr_with_llm(region):
+                image = region.to_image(resolution=300, crop=True)
+                # Call your LLM API here
+                return llm_client.ocr(image)
+            region.apply_custom_ocr(ocr_with_llm)
+            # Using with a custom OCR service
+            def ocr_with_service(region):
+                img_bytes = region.to_image(crop=True).tobytes()
+                response = ocr_service.process(img_bytes)
+                return response.text
+            region.apply_custom_ocr(ocr_with_service, source_label="my-ocr-service")
+        """
+        # If replace is True, remove existing OCR elements in this region
+        if replace:
+            logger.info(
+                f"Region {self.bbox}: Removing existing OCR elements before applying custom OCR."
+            )
+            # Helper to remove a single element safely; reports whether it was removed
+            def _safe_remove(elem) -> bool:
+                if not (hasattr(elem, "page") and hasattr(elem.page, "_element_mgr")):
+                    return False
+                etype = getattr(elem, "object_type", "word")
+                if etype == "word":
+                    etype_key = "words"
+                elif etype == "char":
+                    etype_key = "chars"
+                else:
+                    etype_key = etype + "s" if not etype.endswith("s") else etype
+                try:
+                    return bool(elem.page._element_mgr.remove_element(elem, etype_key))
+                except Exception as exc:
+                    # Best-effort removal: keep going, but leave a trace instead of hiding it
+                    logger.debug(f"Region {self.bbox}: Could not remove element {elem!r}: {exc}")
+                    return False
+            removed_count = 0
+            # Single pass over a snapshot (the underlying list is mutated by removal)
+            for word in list(self.page._element_mgr.words):
+                source = getattr(word, "source", None)
+                # Elements whose source is None / non-string cannot be OCR output; skip them
+                if not isinstance(source, str):
+                    continue
+                is_ocr_sourced = source.startswith("ocr") or source == source_label
+                if is_ocr_sourced and self.intersects(word) and _safe_remove(word):
+                    removed_count += 1
+            if removed_count > 0:
+                logger.info(
+                    f"Region {self.bbox}: Removed {removed_count} existing OCR elements."
+                )
+        # Call the custom OCR function
+        try:
+            logger.debug(f"Region {self.bbox}: Calling custom OCR function...")
+            ocr_text = ocr_function(self)
+            if ocr_text is not None and not isinstance(ocr_text, str):
+                logger.warning(
+                    f"Custom OCR function returned non-string type ({type(ocr_text)}). "
+                    f"Converting to string."
+                )
+                ocr_text = str(ocr_text)
+        except Exception as e:
+            logger.error(
+                f"Error calling custom OCR function for region {self.bbox}: {e}",
+                exc_info=True
+            )
+            return self
+        # Nothing recognised: no element is created
+        if ocr_text is None:
+            logger.debug(f"Region {self.bbox}: Custom OCR function returned None (no text found)")
+            return self
+        # Use the to_text_element method to create the element
+        self.to_text_element(
+            text_content=ocr_text,
+            source_label=source_label,
+            confidence=confidence,
+            add_to_page=add_to_page
+        )
+        logger.info(
+            f"Region {self.bbox}: Created text element with {len(ocr_text)} chars"
+            f"{' and added to page' if add_to_page else ''}"
+        )
+        return self
     def get_section_between(self, start_element=None, end_element=None, boundary_inclusion="both"):
         """
         Get a section between two elements within this region.
@@ -2917,6 +3129,33 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         if not hasattr(self, "page") or self.page is None:
             raise ValueError("Region must have a valid 'page' attribute to create a TextElement.")
+        # Create character dictionaries for the text
+        char_dicts = []
+        if actual_text:
+            # Create a single character dict that spans the entire region
+            # This is a simplified approach - OCR engines typically create one per character
+            char_dict = {
+                "text": actual_text,
+                "x0": self.x0,
+                "top": self.top,
+                "x1": self.x1,
+                "bottom": self.bottom,
+                "width": self.width,
+                "height": self.height,
+                "object_type": "char",
+                "page_number": self.page.page_number,
+                "fontname": default_font_name,
+                "size": default_font_size,
+                "upright": True,
+                "direction": 1,
+                "adv": self.width,
+                "source": source_label,
+                "confidence": final_confidence,
+                "stroking_color": (0, 0, 0),
+                "non_stroking_color": (0, 0, 0),
+            }
+            char_dicts.append(char_dict)
         elem_data = {
             "text": actual_text,
             "x0": self.x0,
@@ -2936,7 +3175,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
             "adv": self.width,
             "source": source_label,
             "confidence": final_confidence,
-            "_char_dicts": [],
+            "_char_dicts": char_dicts,
         }
         text_element = TextElement(elem_data, self.page)
@@ -2952,6 +3191,10 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
                 logger.debug(
                     f"TextElement created from region {self.bbox} and added to page {self.page.page_number} as {add_as_type}."
                 )
+                # Also add character dictionaries to the chars collection
+                if char_dicts and object_type == "word":
+                    for char_dict in char_dicts:
+                        self.page._element_mgr.add_element(char_dict, element_type="chars")
             else:
                 page_num_str = (
                     str(self.page.page_number) if hasattr(self.page, "page_number") else "N/A"

{natural_pdf-0.1.31.dist-info → natural_pdf-0.1.32.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: natural-pdf
-Version: 0.1.31
+Version: 0.1.32
 Summary: A more intuitive interface for working with PDFs
 Author-email: Jonathan Soma <jonathan.soma@gmail.com>
 License-Expression: MIT

{natural_pdf-0.1.31.dist-info → natural_pdf-0.1.32.dist-info}/RECORD RENAMED Viewed

@@ -1,11 +1,8 @@
-bad_pdf_analysis/analyze_10_more.py,sha256=UjsTuHE1GUMoVjkX3afy3x6DfpXyfZXHgS2W1GQqUmw,11906
-bad_pdf_analysis/analyze_final_10.py,sha256=xYkIId0nF9LpWHRLDP1_nlJfJfC0b0Tu4mLu-3mim-0,25170
-bad_pdf_analysis/analyze_specific_pages.py,sha256=wzq3_ZWR28hFdT7GEkayHPYgsk20OpD476LYmy2rAEk,13725
-bad_pdf_analysis/analyze_specific_pages_direct.py,sha256=307gSNplwOtNTR9a0lEQWxlAKGeoZIcDe5z1pROKUXY,14846
 natural_pdf/__init__.py,sha256=qDFJNF8sbEDO-2WSFAxoWEM8updOUP6dB-ckya0kxfs,3275
 natural_pdf/cli.py,sha256=IXrP2lCHihr-ed-CFiDbMTnSsutQa1j1PYALOLGbpsc,4019
-natural_pdf/analyzers/__init__.py,sha256=dIXjsMqoxKmd9OOnSBzn12wvdIz7D7YNQRAnXslpJSM,142
-natural_pdf/analyzers/shape_detection_mixin.py,sha256=0a4uuKQ4Z1Ta_UVuUtX7mVhlwmXdAkoHTyC5wZyp5do,94455
+natural_pdf/analyzers/__init__.py,sha256=MQRctn4i5Q7u8pb8vQVHKEXUiVGpKyPZUECrlDH4AuU,673
+natural_pdf/analyzers/guides.py,sha256=tzyViSBDdM66mT0niwFTDIJ16UzRCZ18Iqv8wA5DYAk,90302
+natural_pdf/analyzers/shape_detection_mixin.py,sha256=q7gDM-z2t7bSTxjfV2aaW3533CySu1qsEpu4wb5Rp-I,62688
 natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
 natural_pdf/analyzers/text_structure.py,sha256=VfKTsTFrK877sC0grsis9jK3rrgp0Mbp13VWEbukTcs,28437
 natural_pdf/analyzers/utils.py,sha256=PYbzJzSAHZ7JsMes84WIrSbA0zkjJGs0CLvIeINsf_k,2100
@@ -28,10 +25,10 @@ natural_pdf/classification/results.py,sha256=Mcay-xLBHbYoZ8U7f4gMj2IhhH_yORNEkZH
 natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiYLI1w,5117
 natural_pdf/collections/pdf_collection.py,sha256=HLlyakM--23ZOeHDPucoM6Tw3yUyMXm0SSoqJwxRc2E,30744
 natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
-natural_pdf/core/element_manager.py,sha256=Mn4cYqPL_2LD_GK9lf2duExaJF1qhASCKsOdAZdQb00,49821
+natural_pdf/core/element_manager.py,sha256=A6GJk9kwTzt-aSz4-SWaRHLZRbIMFFLce3CpxSyfkV4,51749
 natural_pdf/core/highlighting_service.py,sha256=WKDqRpex1yS8CWhkNitWtKhxbyRRCLu3Xsct_HTPsD4,40774
-natural_pdf/core/page.py,sha256=kQKKqsbOaNeLhW3ark6mueDS-4tsopJcGcoMmKPK6B8,125624
-natural_pdf/core/pdf.py,sha256=YfniZp54AyptzMyr7ZP8n617n4wlV28SPrajt32nNBk,74233
+natural_pdf/core/page.py,sha256=843_Fyk1gxZ8nqERJjjjoRD3iM4pFJy9a0zQSyMthiQ,128476
+natural_pdf/core/pdf.py,sha256=mC4GZjPXx_bK6RUlhLpnJnapkHDhbgJpgpcUJOvb7OE,75290
 natural_pdf/describe/__init__.py,sha256=B3zjuHjFI_dFuBLgXR1Q4v7c72fVDyk84d2hs0H4KV8,561
 natural_pdf/describe/base.py,sha256=HaWlHltb-dw6ug4mfR_iBLHWxr1OdPwLaUshXRxO7gg,18462
 natural_pdf/describe/elements.py,sha256=COvKF3B_RbAxXl5ORJDubV4C5PsiuSfuzD0ufPIJTFM,12983
@@ -39,11 +36,11 @@ natural_pdf/describe/mixin.py,sha256=U0x6v8r57KQb8qC3VVo64hvhfXQWsti8vdKBM7AXnMo
 natural_pdf/describe/summary.py,sha256=7FIF3zF6bzNx-gx4pCJr2XQFKiVzOEDnWsAYQ_mr9L0,7982
 natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
 natural_pdf/elements/base.py,sha256=VshU4RstdzONJFq_8UVIjT_lVOai0MwMFsSFrCN-IO8,47299
-natural_pdf/elements/collections.py,sha256=52Oac96svzm_QMJcVaItnCG9P7d6JMNiGEx9lHgDEQg,125915
+natural_pdf/elements/collections.py,sha256=1E2MSg2NNcEcoRM2rumrv_CqIdO7DgbRHYEtfw35FaQ,128457
 natural_pdf/elements/image.py,sha256=UjHNzCgDzOseQmLpkKshcxg51DPmWNIAVYxZ0TAMyUI,1423
 natural_pdf/elements/line.py,sha256=aQm4pDdlQSDAAXqrdg4AU-oTl9JCXgYuaJN0EYls6E0,4920
 natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
-natural_pdf/elements/region.py,sha256=v1PzWvQoGHGdn7SQiPf4Oq3hIGueIfYGwcZ05ZU6XPE,127692
+natural_pdf/elements/region.py,sha256=8SKhzCJ6sELZxJcM2i_58YhEKU6HBvaJ7Oj6E3bOsHw,139523
 natural_pdf/elements/text.py,sha256=kw7u2KfHtDB905YawP7Hs89kcR8XnbtpkYQGEk6LNyk,18860
 natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
 natural_pdf/exporters/__init__.py,sha256=g1WRPCDVzceaUUsm8dchPhzdHFSjYM0NfRyc8iN0mtE,644
@@ -100,13 +97,12 @@ natural_pdf/utils/text_extraction.py,sha256=mDeN1_VevNi3RwvFe48PM5vBh-A5WeBlYgP6
 natural_pdf/utils/visualization.py,sha256=n3IZpbY5cf9LItzGavBcNyVZZrrUVxjYnmqZHYPa7NU,9386
 natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
 natural_pdf/widgets/viewer.py,sha256=2VUY1TzWMDe9I-IVNOosKZ2LaqpjLB62ftMAdk-s6_8,24952
-natural_pdf-0.1.31.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
+natural_pdf-0.1.32.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
 optimization/memory_comparison.py,sha256=XEHtjduSmzXzxnsJMvemTcq-OAlvGUBAm5wwnOXq8TY,6524
 optimization/pdf_analyzer.py,sha256=G3XWhsEqIYbohEgTqz6wzxkAnOx4MkbvbSspx577-8w,19145
 optimization/performance_analysis.py,sha256=vVlFDywEXxhJLd9n2KVVqqQnS6rwWoHV_jlogboGF2k,13784
 optimization/test_cleanup_methods.py,sha256=B_zHiJr1hI8q-tdfBoFi0Jf5lj2PURjA_6teRBGoz8o,6277
 optimization/test_memory_fix.py,sha256=CWc0OSvFfKE0-nxqJOi_HAQc0GXUPKzkQbTeJp5UqxU,6364
-tools/rtl_smoke_test.py,sha256=-ogcbvNzumJasICP0NNQHk4Zb4M1VRx0TnGkJUQC7SM,3043
 tools/bad_pdf_eval/__init__.py,sha256=Nqnn8clbgv-5l0PgxcTOldg8mkMKrFn4TvPL-rYUUGg,1
 tools/bad_pdf_eval/analyser.py,sha256=sR31aVVmTXRHS8uwLZXlPefTH2_lskxtAzuZwlhsyOo,13391
 tools/bad_pdf_eval/collate_summaries.py,sha256=Mcmf1OvVn0S0efj5ypk0syXKSrfUf6L5dowoGvOTgjU,5047
@@ -115,8 +111,8 @@ tools/bad_pdf_eval/export_enrichment_csv.py,sha256=SMEm9WxFUN_RIf8AGfZfjGEmvBvrO
 tools/bad_pdf_eval/llm_enrich.py,sha256=PsFMymPc8BNck21T3vupTN18pLdum-A_OLoJEKr6f80,12234
 tools/bad_pdf_eval/reporter.py,sha256=LIhcguDZ5XKgb0WeJsyA7m0kcliebOohzveShvt_KmY,400
 tools/bad_pdf_eval/utils.py,sha256=FuxaPX6f26IjQXu1vP0a2i9h1jgJNbASb8mRyj5-elE,4849
-natural_pdf-0.1.31.dist-info/METADATA,sha256=tqimu2ZReyYu5pS0PsbCo-Z9fIzkpMj1ljGPNbaOFss,6711
-natural_pdf-0.1.31.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-natural_pdf-0.1.31.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
-natural_pdf-0.1.31.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
-natural_pdf-0.1.31.dist-info/RECORD,,
+natural_pdf-0.1.32.dist-info/METADATA,sha256=CMZIo2BjeLh-b9hezQHMLehZP8brUflCQ69dLtfFyxo,6711
+natural_pdf-0.1.32.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+natural_pdf-0.1.32.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
+natural_pdf-0.1.32.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
+natural_pdf-0.1.32.dist-info/RECORD,,

natural-pdf 0.1.31__py3-none-any.whl → 0.1.32__py3-none-any.whl

natural-pdf 0.1.31py3-none-any.whl → 0.1.32py3-none-any.whl