PyPI - natural-pdf - Versions diffs - 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl - Mend

natural-pdf 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (77) hide show

natural_pdf/analyzers/__init__.py +2 -2
natural_pdf/analyzers/guides.py +751 -607
natural_pdf/analyzers/layout/base.py +53 -6
natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
natural_pdf/analyzers/layout/layout_manager.py +18 -14
natural_pdf/analyzers/layout/layout_options.py +1 -0
natural_pdf/analyzers/layout/paddle.py +102 -64
natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
natural_pdf/analyzers/layout/yolo.py +2 -6
natural_pdf/analyzers/shape_detection_mixin.py +15 -6
natural_pdf/classification/manager.py +92 -77
natural_pdf/classification/mixin.py +49 -5
natural_pdf/classification/results.py +1 -1
natural_pdf/cli.py +7 -3
natural_pdf/collections/pdf_collection.py +96 -101
natural_pdf/core/element_manager.py +131 -45
natural_pdf/core/highlighting_service.py +5 -6
natural_pdf/core/page.py +120 -23
natural_pdf/core/pdf.py +477 -75
natural_pdf/describe/__init__.py +18 -12
natural_pdf/describe/base.py +179 -172
natural_pdf/describe/elements.py +155 -155
natural_pdf/describe/mixin.py +27 -19
natural_pdf/describe/summary.py +44 -55
natural_pdf/elements/base.py +134 -18
natural_pdf/elements/collections.py +90 -18
natural_pdf/elements/image.py +2 -1
natural_pdf/elements/line.py +0 -31
natural_pdf/elements/rect.py +0 -14
natural_pdf/elements/region.py +222 -108
natural_pdf/elements/text.py +18 -12
natural_pdf/exporters/__init__.py +4 -1
natural_pdf/exporters/original_pdf.py +12 -4
natural_pdf/extraction/mixin.py +66 -10
natural_pdf/extraction/result.py +1 -1
natural_pdf/flows/flow.py +63 -4
natural_pdf/flows/region.py +4 -4
natural_pdf/ocr/engine.py +83 -2
natural_pdf/ocr/engine_paddle.py +5 -5
natural_pdf/ocr/ocr_factory.py +2 -1
natural_pdf/ocr/ocr_manager.py +24 -13
natural_pdf/ocr/ocr_options.py +3 -10
natural_pdf/qa/document_qa.py +21 -8
natural_pdf/qa/qa_result.py +3 -7
natural_pdf/search/__init__.py +3 -2
natural_pdf/search/lancedb_search_service.py +5 -6
natural_pdf/search/numpy_search_service.py +5 -2
natural_pdf/selectors/parser.py +51 -6
natural_pdf/tables/__init__.py +2 -2
natural_pdf/tables/result.py +7 -6
natural_pdf/utils/bidi_mirror.py +2 -1
natural_pdf/utils/reading_order.py +3 -2
natural_pdf/utils/visualization.py +3 -3
natural_pdf/widgets/viewer.py +0 -1
{natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/METADATA +1 -1
natural_pdf-0.1.35.dist-info/RECORD +121 -0
optimization/memory_comparison.py +73 -58
optimization/pdf_analyzer.py +141 -96
optimization/performance_analysis.py +111 -110
optimization/test_cleanup_methods.py +47 -36
optimization/test_memory_fix.py +40 -39
tools/bad_pdf_eval/__init__.py +0 -1
tools/bad_pdf_eval/analyser.py +35 -18
tools/bad_pdf_eval/collate_summaries.py +22 -18
tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
tools/bad_pdf_eval/eval_suite.py +21 -9
tools/bad_pdf_eval/evaluate_quality.py +198 -0
tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
tools/bad_pdf_eval/llm_enrich.py +71 -39
tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
tools/bad_pdf_eval/reporter.py +1 -1
tools/bad_pdf_eval/utils.py +7 -4
natural_pdf-0.1.33.dist-info/RECORD +0 -118
{natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/top_level.txt +0 -0

natural_pdf/core/element_manager.py CHANGED Viewed

@@ -1,8 +1,16 @@
-"""
-Element Manager for natural-pdf.
-This class handles the loading, creation, and management of PDF elements like
-characters, words, rectangles, and lines extracted from a page.
+"""Element Manager for natural-pdf.
+This module handles the loading, creation, and management of PDF elements like
+characters, words, rectangles, lines, and images extracted from a page. The
+ElementManager class serves as the central coordinator for element lifecycle
+management and provides enhanced word extraction capabilities.
+The module includes:
+- Element creation and caching for performance
+- Custom word extraction that respects font boundaries
+- OCR coordinate transformation and integration
+- Text decoration detection (underline, strikethrough, highlights)
+- Performance optimizations for bulk text processing
 """
 import logging
@@ -13,10 +21,10 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 from pdfplumber.utils.text import WordExtractor
+from natural_pdf.elements.image import ImageElement
 from natural_pdf.elements.line import LineElement
 from natural_pdf.elements.rect import RectangleElement
 from natural_pdf.elements.text import TextElement
-from natural_pdf.elements.image import ImageElement
 logger = logging.getLogger(__name__)
@@ -25,8 +33,8 @@ logger = logging.getLogger(__name__)
 # ------------------------------------------------------------------
 STRIKE_DEFAULTS = {
-    "thickness_tol": 1.5,   # pt ; max height of line/rect to be considered strike
-    "horiz_tol": 1.0,       # pt ; vertical tolerance for horizontality
+    "thickness_tol": 1.5,  # pt ; max height of line/rect to be considered strike
+    "horiz_tol": 1.0,  # pt ; vertical tolerance for horizontality
     "coverage_ratio": 0.7,  # proportion of glyph width to be overlapped
     "band_top_frac": 0.35,  # fraction of glyph height above top baseline band
     "band_bottom_frac": 0.65,  # fraction below top (same used internally)
@@ -36,48 +44,90 @@ UNDERLINE_DEFAULTS = {
     "thickness_tol": 1.5,
     "horiz_tol": 1.0,
     "coverage_ratio": 0.8,
-    "band_frac": 0.25,   # height fraction above baseline
-    "below_pad": 0.7,    # pt ; pad below baseline
+    "band_frac": 0.25,  # height fraction above baseline
+    "below_pad": 0.7,  # pt ; pad below baseline
 }
 HIGHLIGHT_DEFAULTS = {
     "height_min_ratio": 0.6,  # rect height relative to char height lower bound
     "height_max_ratio": 2.0,  # upper bound
-    "coverage_ratio": 0.6,    # horizontal overlap with glyph
+    "coverage_ratio": 0.6,  # horizontal overlap with glyph
     "color_saturation_min": 0.4,  # HSV S >
-    "color_value_min": 0.4,        # HSV V >
+    "color_value_min": 0.4,  # HSV V >
 }
 @contextmanager
 def disable_text_sync():
-    """
-    Temporarily disable text synchronization for performance.
-    This is used when bulk-updating text content where character-level
-    synchronization is not needed, such as during bidi processing.
-    Fixes exponential recursion issue with Arabic/RTL text processing.
+    """Temporarily disable text synchronization for performance.
+    This context manager is used when bulk-updating text content where character-level
+    synchronization is not needed, such as during bidi processing or large-scale
+    text transformations. It prevents exponential recursion issues with Arabic/RTL
+    text processing by bypassing the normal text property setter.
+    Yields:
+        None: The context where text synchronization is disabled.
+    Example:
+        ```python
+        with disable_text_sync():
+            for element in text_elements:
+                element.text = process_arabic_text(element.text)
+        # Text sync automatically restored after the block
+        ```
+    Note:
+        This optimization is critical for performance when processing documents
+        with complex text layouts or right-to-left scripts that would otherwise
+        trigger expensive character synchronization operations.
     """
     # Save original setter
     original_setter = TextElement.text.fset
     # Create a fast setter that skips sync
     def fast_setter(self, value):
         self._obj["text"] = value
         # Skip character synchronization for performance
     # Apply fast setter
     TextElement.text = property(TextElement.text.fget, fast_setter)
     try:
         yield
     finally:
         # Restore original setter
         TextElement.text = property(TextElement.text.fget, original_setter)
 class NaturalWordExtractor(WordExtractor):
-    """
-    Custom WordExtractor that splits words based on specified character attributes
+    """Custom WordExtractor that splits words based on specified character attributes.
+    This class extends pdfplumber's WordExtractor to provide more intelligent word
+    segmentation that respects font boundaries and other character attributes.
+    It prevents words from spanning across different fonts, sizes, or styles,
+    which is essential for maintaining semantic meaning in document analysis.
+    The extractor considers multiple character attributes when determining word
+    boundaries, ensuring that visually distinct text elements (like bold headers
+    mixed with regular text) are properly separated into distinct words.
+    Attributes:
+        font_attrs: List of character attributes to consider for word boundaries.
+            Common attributes include 'fontname', 'size', 'flags', etc.
+    Example:
+        ```python
+        # Create extractor that splits on font and size changes
+        extractor = NaturalWordExtractor(['fontname', 'size'])
+        # Extract words with font-aware boundaries
+        words = extractor.extract_words(page_chars)
+        # Each word will have consistent font properties
+        for word in words:
+            print(f"'{word['text']}' in {word['fontname']} size {word['size']}")
+        ```
     in addition to pdfplumber's default spatial logic.
     """
@@ -198,7 +248,9 @@ class ElementManager:
         if self._load_text and prepared_char_dicts:
             try:
                 self._mark_strikethrough_chars(prepared_char_dicts)
-            except Exception as strike_err:  # pragma: no cover – strike detection must never crash loading
+            except (
+                Exception
+            ) as strike_err:  # pragma: no cover – strike detection must never crash loading
                 logger.warning(
                     f"Page {self._page.number}: Strikethrough detection failed – {strike_err}",
                     exc_info=True,
@@ -244,16 +296,16 @@ class ElementManager:
         # 2. Instantiate the custom word extractor
         # Prefer page-level config over PDF-level for tolerance lookup
         word_elements: List[TextElement] = []
         # Get config objects (needed for auto_text_tolerance check)
         page_config = getattr(self._page, "_config", {})
         pdf_config = getattr(self._page._parent, "_config", {})
         # Initialize tolerance variables
         xt = None
         yt = None
         use_flow = pdf_config.get("use_text_flow", False)
         if self._load_text and prepared_char_dicts:
             # Start with any explicitly supplied tolerances (may be None)
             xt = page_config.get("x_tolerance", pdf_config.get("x_tolerance"))
@@ -275,7 +327,7 @@ class ElementManager:
                     # Record back to page config for downstream users
                     page_config["x_tolerance"] = xt
                 if yt is None:
-                    yt = 0.6 * median_size   # ~line spacing fraction
+                    yt = 0.6 * median_size  # ~line spacing fraction
                     page_config["y_tolerance"] = yt
             # Warn users when the page's font size is extremely small –
@@ -364,7 +416,8 @@ class ElementManager:
                 char_dir = "ltr"
             extractor = NaturalWordExtractor(
-                word_split_attributes=self._word_split_attributes + ["strike", "underline", "highlight"],
+                word_split_attributes=self._word_split_attributes
+                + ["strike", "underline", "highlight"],
                 extra_attrs=attributes_to_preserve,
                 x_tolerance=xt,
                 y_tolerance=yt,
@@ -413,12 +466,13 @@ class ElementManager:
                     # Convert from visual order (from PDF) to logical order using bidi
                     try:
                         from bidi.algorithm import get_display  # type: ignore
                         from natural_pdf.utils.bidi_mirror import mirror_brackets
                         with disable_text_sync():
                             # word_element.text is currently in visual order (from PDF)
                             # Convert to logical order using bidi with auto direction detection
-                            logical_text = get_display(word_element.text, base_dir='L')
+                            logical_text = get_display(word_element.text, base_dir="L")
                             # Apply bracket mirroring for logical order
                             word_element.text = mirror_brackets(logical_text)
                     except Exception:
@@ -495,7 +549,11 @@ class ElementManager:
                     if color_counts:
                         dominant_color = max(color_counts.items(), key=lambda t: t[1])[0]
                         try:
-                            w._obj["highlight_color"] = tuple(dominant_color) if isinstance(dominant_color, (list, tuple)) else dominant_color
+                            w._obj["highlight_color"] = (
+                                tuple(dominant_color)
+                                if isinstance(dominant_color, (list, tuple))
+                                else dominant_color
+                            )
                         except Exception:
                             w._obj["highlight_color"] = dominant_color
@@ -998,12 +1056,16 @@ class ElementManager:
     #  Strikethrough detection (horizontal strike-out lines)
     # ------------------------------------------------------------------
-    def _mark_strikethrough_chars(self, char_dicts: List[Dict[str, Any]], *,
-                                  thickness_tol: float = 1.5,
-                                  horiz_tol: float = 1.0,
-                                  coverage_ratio: float = 0.7,
-                                  band_top: float = 0.35,
-                                  band_bottom: float = 0.65) -> None:
+    def _mark_strikethrough_chars(
+        self,
+        char_dicts: List[Dict[str, Any]],
+        *,
+        thickness_tol: float = 1.5,
+        horiz_tol: float = 1.0,
+        coverage_ratio: float = 0.7,
+        band_top: float = 0.35,
+        band_bottom: float = 0.65,
+    ) -> None:
         """Annotate character dictionaries with a boolean ``strike`` flag.
         Args
@@ -1102,11 +1164,31 @@ class ElementManager:
         # Allow user overrides via PDF._config["underline_detection"]
         pdf_cfg = getattr(self._page._parent, "_config", {}).get("underline_detection", {})
-        thickness_tol = thickness_tol if thickness_tol is not None else pdf_cfg.get("thickness_tol", UNDERLINE_DEFAULTS["thickness_tol"])
-        horiz_tol     = horiz_tol     if horiz_tol     is not None else pdf_cfg.get("horiz_tol", UNDERLINE_DEFAULTS["horiz_tol"])
-        coverage_ratio= coverage_ratio if coverage_ratio is not None else pdf_cfg.get("coverage_ratio", UNDERLINE_DEFAULTS["coverage_ratio"])
-        band_frac     = band_frac     if band_frac     is not None else pdf_cfg.get("band_frac", UNDERLINE_DEFAULTS["band_frac"])
-        below_pad     = below_pad     if below_pad     is not None else pdf_cfg.get("below_pad", UNDERLINE_DEFAULTS["below_pad"])
+        thickness_tol = (
+            thickness_tol
+            if thickness_tol is not None
+            else pdf_cfg.get("thickness_tol", UNDERLINE_DEFAULTS["thickness_tol"])
+        )
+        horiz_tol = (
+            horiz_tol
+            if horiz_tol is not None
+            else pdf_cfg.get("horiz_tol", UNDERLINE_DEFAULTS["horiz_tol"])
+        )
+        coverage_ratio = (
+            coverage_ratio
+            if coverage_ratio is not None
+            else pdf_cfg.get("coverage_ratio", UNDERLINE_DEFAULTS["coverage_ratio"])
+        )
+        band_frac = (
+            band_frac
+            if band_frac is not None
+            else pdf_cfg.get("band_frac", UNDERLINE_DEFAULTS["band_frac"])
+        )
+        below_pad = (
+            below_pad
+            if below_pad is not None
+            else pdf_cfg.get("below_pad", UNDERLINE_DEFAULTS["below_pad"])
+        )
         raw_lines = list(getattr(self._page._page, "lines", []))
         raw_rects = list(getattr(self._page._page, "rects", []))
@@ -1148,7 +1230,7 @@ class ElementManager:
         table_y = {k for k, v in y_groups.items() if v >= 3}
         # filter out candidates on those y values
-        filtered_candidates = [c for c in candidates if int((c[1]+c[3])/2) not in table_y]
+        filtered_candidates = [c for c in candidates if int((c[1] + c[3]) / 2) not in table_y]
         # annotate chars
         for ch in char_dicts:
@@ -1205,7 +1287,9 @@ class ElementManager:
             y0_rect = min(rc.get("y0", 0), rc.get("y1", 0))
             y1_rect = max(rc.get("y0", 0), rc.get("y1", 0))
             rheight = y1_rect - y0_rect
-            highlight_rects.append((rc.get("x0", 0), y0_rect, rc.get("x1", 0), y1_rect, rheight, fill_col))
+            highlight_rects.append(
+                (rc.get("x0", 0), y0_rect, rc.get("x1", 0), y1_rect, rheight, fill_col)
+            )
         if not highlight_rects:
             for ch in char_dicts:
@@ -1238,7 +1322,9 @@ class ElementManager:
                 if overlap > 0 and (overlap / width) >= coverage_ratio:
                     ch["highlight"] = True
                     try:
-                        ch["highlight_color"] = tuple(rcolor) if isinstance(rcolor, (list, tuple)) else rcolor
+                        ch["highlight_color"] = (
+                            tuple(rcolor) if isinstance(rcolor, (list, tuple)) else rcolor
+                        )
                     except Exception:
                         ch["highlight_color"] = rcolor
                     break

natural_pdf/core/highlighting_service.py CHANGED Viewed

@@ -98,7 +98,9 @@ class HighlightRenderer:
             scaled_bbox = None
             if highlight.is_polygon:
-                scaled_polygon = [(p[0] * self.scale_factor, p[1] * self.scale_factor) for p in highlight.polygon]
+                scaled_polygon = [
+                    (p[0] * self.scale_factor, p[1] * self.scale_factor) for p in highlight.polygon
+                ]
                 # Draw polygon fill and border
                 draw.polygon(
                     scaled_polygon, fill=highlight.color, outline=highlight.border_color, width=2
@@ -597,7 +599,7 @@ class HighlightingService:
         if page_index in self._highlights_by_page:
             del self._highlights_by_page[page_index]
             logger.debug(f"Cleared highlights for page {page_index}.")
         # Also clear any cached rendered images for this page so the next render
         # reflects the removal of highlights.
         try:
@@ -683,7 +685,6 @@ class HighlightingService:
             )
         try:
-            # base_image = render_plain_page(page_obj, actual_scale_x * 72 if actual_scale_x else scale * 72) # Old call
             img_object = page_obj._page.to_image(**to_image_args)
             base_image_pil = (
                 img_object.annotated
@@ -929,9 +930,7 @@ class HighlightingService:
                 right_px = max(left_px + 1, min(right_px, rendered_image.width))
                 bottom_px = max(top_px + 1, min(bottom_px, rendered_image.height))
-                rendered_image = rendered_image.crop(
-                    (left_px, top_px, right_px, bottom_px)
-                )
+                rendered_image = rendered_image.crop((left_px, top_px, right_px, bottom_px))
             legend = None
             if labels:

natural_pdf/core/page.py CHANGED Viewed

@@ -77,7 +77,6 @@ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerW
 # --- End Classification Imports --- #
 # --- End Shape Detection Mixin --- #
@@ -94,23 +93,107 @@ logger = logging.getLogger(__name__)
 class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
-    """
-    Enhanced Page wrapper built on top of pdfplumber.Page.
+    """Enhanced Page wrapper built on top of pdfplumber.Page.
     This class provides a fluent interface for working with PDF pages,
     with improved selection, navigation, extraction, and question-answering capabilities.
+    It integrates multiple analysis capabilities through mixins and provides spatial
+    navigation with CSS-like selectors.
+    The Page class serves as the primary interface for document analysis, offering:
+    - Element selection and spatial navigation
+    - OCR and layout analysis integration
+    - Table detection and extraction
+    - AI-powered classification and data extraction
+    - Visual debugging with highlighting and cropping
+    - Text style analysis and structure detection
+    Attributes:
+        index: Zero-based index of this page in the PDF.
+        number: One-based page number (index + 1).
+        width: Page width in points.
+        height: Page height in points.
+        bbox: Bounding box tuple (x0, top, x1, bottom) of the page.
+        chars: Collection of character elements on the page.
+        words: Collection of word elements on the page.
+        lines: Collection of line elements on the page.
+        rects: Collection of rectangle elements on the page.
+        images: Collection of image elements on the page.
+        metadata: Dictionary for storing analysis results and custom data.
+    Example:
+        Basic usage:
+        ```python
+        pdf = npdf.PDF("document.pdf")
+        page = pdf.pages[0]
+        # Find elements with CSS-like selectors
+        headers = page.find_all('text[size>12]:bold')
+        summaries = page.find('text:contains("Summary")')
+        # Spatial navigation
+        content_below = summaries.below(until='text[size>12]:bold')
+        # Table extraction
+        tables = page.extract_table()
+        ```
+        Advanced usage:
+        ```python
+        # Apply OCR if needed
+        page.apply_ocr(engine='easyocr', resolution=300)
+        # Layout analysis
+        page.analyze_layout(engine='yolo')
+        # AI-powered extraction
+        data = page.extract_structured_data(MySchema)
+        # Visual debugging
+        page.find('text:contains("Important")').show()
+        ```
     """
-    def __init__(self, page: "pdfplumber.page.Page", parent: "PDF", index: int, font_attrs=None, load_text: bool = True):
-        """
-        Initialize a page wrapper.
+    def __init__(
+        self,
+        page: "pdfplumber.page.Page",
+        parent: "PDF",
+        index: int,
+        font_attrs=None,
+        load_text: bool = True,
+    ):
+        """Initialize a page wrapper.
+        Creates an enhanced Page object that wraps a pdfplumber page with additional
+        functionality for spatial navigation, analysis, and AI-powered extraction.
         Args:
-            page: pdfplumber page object
-            parent: Parent PDF object
-            index: Index of this page in the PDF (0-based)
-            font_attrs: Font attributes to consider when grouping characters into words.
-            load_text: Whether to load text elements from the PDF (default: True).
+            page: The underlying pdfplumber page object that provides raw PDF data.
+            parent: Parent PDF object that contains this page and provides access
+                to managers and global settings.
+            index: Zero-based index of this page in the PDF document.
+            font_attrs: List of font attributes to consider when grouping characters
+                into words. Common attributes include ['fontname', 'size', 'flags'].
+                If None, uses default character-to-word grouping rules.
+            load_text: If True, load and process text elements from the PDF's text layer.
+                If False, skip text layer processing (useful for OCR-only workflows).
+        Note:
+            This constructor is typically called automatically when accessing pages
+            through the PDF.pages collection. Direct instantiation is rarely needed.
+        Example:
+            ```python
+            # Pages are usually accessed through the PDF object
+            pdf = npdf.PDF("document.pdf")
+            page = pdf.pages[0]  # Page object created automatically
+            # Direct construction (advanced usage)
+            import pdfplumber
+            with pdfplumber.open("document.pdf") as plumber_pdf:
+                plumber_page = plumber_pdf.pages[0]
+                page = Page(plumber_page, pdf, 0, load_text=True)
+            ```
         """
         self._page = page
         self._parent = parent
@@ -1190,6 +1273,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
             if _contains_rtl(result):
                 try:
                     from bidi.algorithm import get_display  # type: ignore
                     from natural_pdf.utils.bidi_mirror import mirror_brackets
                     result = "\n".join(
@@ -1199,8 +1283,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
                                 base_dir=(
                                     "R"
                                     if any(
-                                        unicodedata.bidirectional(ch)
-                                        in ("R", "AL", "AN")
+                                        unicodedata.bidirectional(ch) in ("R", "AL", "AN")
                                         for ch in line
                                     )
                                     else "L"
@@ -1396,11 +1479,17 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
                         table_settings.setdefault("text_y_tolerance", y_tol)
                 # pdfplumber's text strategy benefits from a tight snap tolerance.
-                if "snap_tolerance" not in table_settings and "snap_x_tolerance" not in table_settings:
+                if (
+                    "snap_tolerance" not in table_settings
+                    and "snap_x_tolerance" not in table_settings
+                ):
                     # Derive from y_tol if available, else default 1
                     snap = max(1, round((pdf_cfg.get("y_tolerance", 1)) * 0.9))
                     table_settings.setdefault("snap_tolerance", snap)
-                if "join_tolerance" not in table_settings and "join_x_tolerance" not in table_settings:
+                if (
+                    "join_tolerance" not in table_settings
+                    and "join_x_tolerance" not in table_settings
+                ):
                     join = table_settings.get("snap_tolerance", 1)
                     table_settings.setdefault("join_tolerance", join)
                     table_settings.setdefault("join_x_tolerance", join)
@@ -1691,8 +1780,14 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
         # Apply global options as defaults, but allow explicit parameters to override
         import natural_pdf
+        # Determine if this is likely a computational use (OCR, analysis, etc.)
+        # If resolution is explicitly provided but width is not, assume computational use
+        # and don't apply global display width settings
+        is_computational_use = (resolution is not None and width is None and
+                               kwargs.get('include_highlights', True) is False)
         # Use global options if parameters are not explicitly set
-        if width is None:
+        if width is None and not is_computational_use:
             width = natural_pdf.options.image.width
         if resolution is None:
             if natural_pdf.options.image.resolution is not None:
@@ -2998,29 +3093,31 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
             InspectionSummary with element tables showing coordinates,
             properties, and other details for each element
         """
-        return self.find_all('*').inspect(limit=limit)
+        return self.find_all("*").inspect(limit=limit)
     def remove_text_layer(self) -> "Page":
         """
         Remove all text elements from this page.
         This removes all text elements (words and characters) from the page,
         effectively clearing the text layer.
         Returns:
             Self for method chaining
         """
         logger.info(f"Page {self.number}: Removing all text elements...")
         # Remove all words and chars from the element manager
         removed_words = len(self._element_mgr.words)
         removed_chars = len(self._element_mgr.chars)
         # Clear the lists
         self._element_mgr._elements["words"] = []
         self._element_mgr._elements["chars"] = []
-        logger.info(f"Page {self.number}: Removed {removed_words} words and {removed_chars} characters")
+        logger.info(
+            f"Page {self.number}: Removed {removed_words} words and {removed_chars} characters"
+        )
         return self
     @property

natural-pdf 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl

natural-pdf 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl