PyPI - natural-pdf - Versions diffs - 0.1.32__py3-none-any.whl → 0.1.33__py3-none-any.whl - Mend

natural-pdf 0.1.32__py3-none-any.whl → 0.1.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

natural_pdf/core/element_manager.py CHANGED Viewed

@@ -146,7 +146,7 @@ class ElementManager:
     contained in the Page class, providing better separation of concerns.
     """
-    def __init__(self, page, font_attrs=None):
+    def __init__(self, page, font_attrs=None, load_text: bool = True):
         """
         Initialize the ElementManager.
@@ -156,9 +156,11 @@ class ElementManager:
                        Default: ['fontname', 'size', 'bold', 'italic']
                        None: Only consider spatial relationships
                        List: Custom attributes to consider
+            load_text: Whether to load text elements from the PDF (default: True).
         """
         self._page = page
         self._elements = None  # Lazy-loaded
+        self._load_text = load_text
         # Default to splitting by fontname, size, bold, italic if not specified
         # Renamed internal variable for clarity
         self._word_split_attributes = (
@@ -175,11 +177,15 @@ class ElementManager:
         logger.debug(f"Page {self._page.number}: Loading elements...")
-        # 1. Prepare character dictionaries (native + OCR) with necessary attributes
-        prepared_char_dicts = self._prepare_char_dicts()
-        logger.debug(
-            f"Page {self._page.number}: Prepared {len(prepared_char_dicts)} character dictionaries."
-        )
+        # 1. Prepare character dictionaries only if loading text
+        if self._load_text:
+            prepared_char_dicts = self._prepare_char_dicts()
+            logger.debug(
+                f"Page {self._page.number}: Prepared {len(prepared_char_dicts)} character dictionaries."
+            )
+        else:
+            prepared_char_dicts = []
+            logger.debug(f"Page {self._page.number}: Skipping text loading (load_text=False)")
         # -------------------------------------------------------------
         # Detect strikethrough (horizontal strike-out lines) on raw
@@ -189,61 +195,75 @@ class ElementManager:
         # belong to the same word.
         # -------------------------------------------------------------
-        try:
-            self._mark_strikethrough_chars(prepared_char_dicts)
-        except Exception as strike_err:  # pragma: no cover – strike detection must never crash loading
-            logger.warning(
-                f"Page {self._page.number}: Strikethrough detection failed – {strike_err}",
-                exc_info=True,
-            )
+        if self._load_text and prepared_char_dicts:
+            try:
+                self._mark_strikethrough_chars(prepared_char_dicts)
+            except Exception as strike_err:  # pragma: no cover – strike detection must never crash loading
+                logger.warning(
+                    f"Page {self._page.number}: Strikethrough detection failed – {strike_err}",
+                    exc_info=True,
+                )
         # -------------------------------------------------------------
         # Detect underlines on raw characters (must come after strike so
         # both attributes are present before word grouping).
         # -------------------------------------------------------------
-        try:
-            self._mark_underline_chars(prepared_char_dicts)
-        except Exception as u_err:  # pragma: no cover
-            logger.warning(
-                f"Page {self._page.number}: Underline detection failed – {u_err}",
-                exc_info=True,
-            )
+        if self._load_text and prepared_char_dicts:
+            try:
+                self._mark_underline_chars(prepared_char_dicts)
+            except Exception as u_err:  # pragma: no cover
+                logger.warning(
+                    f"Page {self._page.number}: Underline detection failed – {u_err}",
+                    exc_info=True,
+                )
         # Detect highlights
-        try:
-            self._mark_highlight_chars(prepared_char_dicts)
-        except Exception as h_err:
-            logger.warning(
-                f"Page {self._page.number}: Highlight detection failed – {h_err}",
-                exc_info=True,
-            )
+        if self._load_text and prepared_char_dicts:
+            try:
+                self._mark_highlight_chars(prepared_char_dicts)
+            except Exception as h_err:
+                logger.warning(
+                    f"Page {self._page.number}: Highlight detection failed – {h_err}",
+                    exc_info=True,
+                )
         # Create a mapping from character dict to index for efficient lookup
-        char_to_index = {}
-        for idx, char_dict in enumerate(prepared_char_dicts):
-            key = (
-                char_dict.get("x0", 0),
-                char_dict.get("top", 0),
-                char_dict.get("text", ""),
-            )
-            char_to_index[key] = idx
+        if self._load_text:
+            char_to_index = {}
+            for idx, char_dict in enumerate(prepared_char_dicts):
+                key = (
+                    char_dict.get("x0", 0),
+                    char_dict.get("top", 0),
+                    char_dict.get("text", ""),
+                )
+                char_to_index[key] = idx
+        else:
+            char_to_index = {}
         # 2. Instantiate the custom word extractor
         # Prefer page-level config over PDF-level for tolerance lookup
+        word_elements: List[TextElement] = []
+        # Get config objects (needed for auto_text_tolerance check)
         page_config = getattr(self._page, "_config", {})
         pdf_config = getattr(self._page._parent, "_config", {})
-        # Start with any explicitly supplied tolerances (may be None)
-        xt = page_config.get("x_tolerance", pdf_config.get("x_tolerance"))
-        yt = page_config.get("y_tolerance", pdf_config.get("y_tolerance"))
+        # Initialize tolerance variables
+        xt = None
+        yt = None
         use_flow = pdf_config.get("use_text_flow", False)
+        if self._load_text and prepared_char_dicts:
+            # Start with any explicitly supplied tolerances (may be None)
+            xt = page_config.get("x_tolerance", pdf_config.get("x_tolerance"))
+            yt = page_config.get("y_tolerance", pdf_config.get("y_tolerance"))
         # ------------------------------------------------------------------
         # Auto-adaptive tolerance: scale based on median character size when
         # requested and explicit values are absent.
         # ------------------------------------------------------------------
-        if pdf_config.get("auto_text_tolerance", True):
+        if self._load_text and pdf_config.get("auto_text_tolerance", True):
             import statistics
             sizes = [c.get("size", 0) for c in prepared_char_dicts if c.get("size")]
@@ -323,7 +343,6 @@ class ElementManager:
                 current_line_key = line_key
             lines[-1].append(char_dict)
-        word_elements: List[TextElement] = []
         # Process each line separately with direction detection
         for line_chars in lines:
             if not line_chars:
@@ -480,7 +499,8 @@ class ElementManager:
                         except Exception:
                             w._obj["highlight_color"] = dominant_color
-        generated_words = word_elements
+        # generated_words defaults to empty list if text loading is disabled
+        generated_words = word_elements if self._load_text else []
         logger.debug(
             f"Page {self._page.number}: Generated {len(generated_words)} words using NaturalWordExtractor."
         )

natural_pdf/core/page.py CHANGED Viewed

@@ -101,7 +101,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
     with improved selection, navigation, extraction, and question-answering capabilities.
     """
-    def __init__(self, page: "pdfplumber.page.Page", parent: "PDF", index: int, font_attrs=None):
+    def __init__(self, page: "pdfplumber.page.Page", parent: "PDF", index: int, font_attrs=None, load_text: bool = True):
         """
         Initialize a page wrapper.
@@ -110,10 +110,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
             parent: Parent PDF object
             index: Index of this page in the PDF (0-based)
             font_attrs: Font attributes to consider when grouping characters into words.
+            load_text: Whether to load text elements from the PDF (default: True).
         """
         self._page = page
         self._parent = parent
         self._index = index
+        self._load_text = load_text
         self._text_styles = None  # Lazy-loaded text style analyzer results
         self._exclusions = []  # List to store exclusion functions/regions
         self._skew_angle: Optional[float] = None  # Stores detected skew angle
@@ -136,7 +138,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
         self._config = dict(getattr(self._parent, "_config", {}))
         # Initialize ElementManager, passing font_attrs
-        self._element_mgr = ElementManager(self, font_attrs=font_attrs)
+        self._element_mgr = ElementManager(self, font_attrs=font_attrs, load_text=self._load_text)
         # self._highlighter = HighlightingService(self) # REMOVED - Use property accessor
         # --- NEW --- Central registry for analysis results
         self.analyses: Dict[str, Any] = {}
@@ -2998,6 +3000,29 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
         """
         return self.find_all('*').inspect(limit=limit)
+    def remove_text_layer(self) -> "Page":
+        """
+        Remove all text elements from this page.
+        This removes all text elements (words and characters) from the page,
+        effectively clearing the text layer.
+        Returns:
+            Self for method chaining
+        """
+        logger.info(f"Page {self.number}: Removing all text elements...")
+        # Remove all words and chars from the element manager
+        removed_words = len(self._element_mgr.words)
+        removed_chars = len(self._element_mgr.chars)
+        # Clear the lists
+        self._element_mgr._elements["words"] = []
+        self._element_mgr._elements["chars"] = []
+        logger.info(f"Page {self.number}: Removed {removed_words} words and {removed_chars} characters")
+        return self
     @property
     def lines(self) -> List[Any]:
         """Get all line elements on this page."""

natural_pdf/core/pdf.py CHANGED Viewed

@@ -108,12 +108,13 @@ class _LazyPageList(Sequence):
     also supported and will materialise pages on demand.
     """
-    def __init__(self, parent_pdf: "PDF", plumber_pdf: "pdfplumber.PDF", font_attrs=None):
+    def __init__(self, parent_pdf: "PDF", plumber_pdf: "pdfplumber.PDF", font_attrs=None, load_text=True):
         self._parent_pdf = parent_pdf
         self._plumber_pdf = plumber_pdf
         self._font_attrs = font_attrs
         # One slot per pdfplumber page – initially all None
         self._cache: List[Optional["Page"]] = [None] * len(self._plumber_pdf.pages)
+        self._load_text = load_text
     # Internal helper -----------------------------------------------------
     def _create_page(self, index: int) -> "Page":
@@ -123,7 +124,7 @@ class _LazyPageList(Sequence):
             from natural_pdf.core.page import Page
             plumber_page = self._plumber_pdf.pages[index]
-            cached = Page(plumber_page, parent=self._parent_pdf, index=index, font_attrs=self._font_attrs)
+            cached = Page(plumber_page, parent=self._parent_pdf, index=index, font_attrs=self._font_attrs, load_text=self._load_text)
             self._cache[index] = cached
         return cached
@@ -170,6 +171,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         keep_spaces: bool = True,
         text_tolerance: Optional[dict] = None,
         auto_text_tolerance: bool = True,
+        text_layer: bool = True,
     ):
         """
         Initialize the enhanced PDF object.
@@ -181,11 +183,14 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             keep_spaces: Whether to include spaces in word elements
             text_tolerance: PDFplumber-style tolerance settings
             auto_text_tolerance: Whether to automatically scale text tolerance
+            text_layer: Whether to keep the existing text layer from the PDF (default: True).
+                       If False, removes all existing text elements during initialization.
         """
         self._original_path_or_stream = path_or_url_or_stream
         self._temp_file = None
         self._resolved_path = None
         self._is_stream = False
+        self._text_layer = text_layer
         stream_to_open = None
         if hasattr(path_or_url_or_stream, "read"):  # Check if it's file-like
@@ -257,7 +262,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         self._manager_registry = {}
         # Lazily instantiate pages only when accessed
-        self._pages = _LazyPageList(self, self._pdf, font_attrs=font_attrs)
+        self._pages = _LazyPageList(self, self._pdf, font_attrs=font_attrs, load_text=self._text_layer)
         self._element_cache = {}
         self._exclusions = []
@@ -267,6 +272,13 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         self._initialize_managers()
         self._initialize_highlighter()
+        # Remove text layer if requested
+        if not self._text_layer:
+            logger.info("Removing text layer as requested (text_layer=False)")
+            # Text layer is not loaded when text_layer=False, so no need to remove
+            pass
         # Analysis results accessed via self.analyses property (see below)
         # --- Automatic cleanup when object is garbage-collected ---
@@ -1463,6 +1475,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
                 reading_order=self._reading_order,
                 font_attrs=self._font_attrs,
                 keep_spaces=self._config.get("keep_spaces", True),
+                text_layer=self._text_layer,
             )
             return new_pdf
         except Exception as e:

natural_pdf/elements/region.py CHANGED Viewed

@@ -2282,15 +2282,28 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
                 if success:
                     removed_count += 1
-            # Remove OCR elements overlapping this region
+            # Remove ALL OCR elements overlapping this region
+            # Remove elements with source=="ocr" (built-in OCR) or matching the source_label (previous custom OCR)
             for word in list(self.page._element_mgr.words):
-                if getattr(word, "source", "").startswith("ocr") and self.intersects(word):
+                word_source = getattr(word, "source", "")
+                # Match built-in OCR behavior: remove elements with source "ocr" exactly
+                # Also remove elements with the same source_label to avoid duplicates
+                if (word_source == "ocr" or word_source == source_label) and self.intersects(word):
                     _safe_remove(word)
-            # Also check custom-ocr sources
-            for word in list(self.page._element_mgr.words):
-                if getattr(word, "source", "") == source_label and self.intersects(word):
-                    _safe_remove(word)
+            # Also remove char dicts if needed (matching built-in OCR)
+            for char in list(self.page._element_mgr.chars):
+                # char can be dict or TextElement; normalize
+                char_src = char.get("source") if isinstance(char, dict) else getattr(char, "source", None)
+                if char_src == "ocr" or char_src == source_label:
+                    # Rough bbox for dicts
+                    if isinstance(char, dict):
+                        cx0, ctop, cx1, cbottom = char.get("x0", 0), char.get("top", 0), char.get("x1", 0), char.get("bottom", 0)
+                    else:
+                        cx0, ctop, cx1, cbottom = char.x0, char.top, char.x1, char.bottom
+                    # Quick overlap check
+                    if not (cx1 < self.x0 or cx0 > self.x1 or cbottom < self.top or ctop > self.bottom):
+                        _safe_remove(char)
             if removed_count > 0:
                 logger.info(

{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.33.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: natural-pdf
-Version: 0.1.32
+Version: 0.1.33
 Summary: A more intuitive interface for working with PDFs
 Author-email: Jonathan Soma <jonathan.soma@gmail.com>
 License-Expression: MIT

{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.33.dist-info}/RECORD RENAMED Viewed

@@ -25,10 +25,10 @@ natural_pdf/classification/results.py,sha256=Mcay-xLBHbYoZ8U7f4gMj2IhhH_yORNEkZH
 natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiYLI1w,5117
 natural_pdf/collections/pdf_collection.py,sha256=HLlyakM--23ZOeHDPucoM6Tw3yUyMXm0SSoqJwxRc2E,30744
 natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
-natural_pdf/core/element_manager.py,sha256=A6GJk9kwTzt-aSz4-SWaRHLZRbIMFFLce3CpxSyfkV4,51749
+natural_pdf/core/element_manager.py,sha256=DbRzAKD3to5NpKc73Q-TXZIZkhx8zZtbi_UNu5K7AAU,52766
 natural_pdf/core/highlighting_service.py,sha256=WKDqRpex1yS8CWhkNitWtKhxbyRRCLu3Xsct_HTPsD4,40774
-natural_pdf/core/page.py,sha256=843_Fyk1gxZ8nqERJjjjoRD3iM4pFJy9a0zQSyMthiQ,128476
-natural_pdf/core/pdf.py,sha256=mC4GZjPXx_bK6RUlhLpnJnapkHDhbgJpgpcUJOvb7OE,75290
+natural_pdf/core/page.py,sha256=k4jezvsLqL07Raglc-rZmMnsVwBMo_A_OerklpBIejY,129477
+natural_pdf/core/pdf.py,sha256=u0ZCPuIijNecU-AJHLvqfAYVCr9h7MgUKnlEtH6RoZI,75969
 natural_pdf/describe/__init__.py,sha256=B3zjuHjFI_dFuBLgXR1Q4v7c72fVDyk84d2hs0H4KV8,561
 natural_pdf/describe/base.py,sha256=HaWlHltb-dw6ug4mfR_iBLHWxr1OdPwLaUshXRxO7gg,18462
 natural_pdf/describe/elements.py,sha256=COvKF3B_RbAxXl5ORJDubV4C5PsiuSfuzD0ufPIJTFM,12983
@@ -40,7 +40,7 @@ natural_pdf/elements/collections.py,sha256=1E2MSg2NNcEcoRM2rumrv_CqIdO7DgbRHYEtf
 natural_pdf/elements/image.py,sha256=UjHNzCgDzOseQmLpkKshcxg51DPmWNIAVYxZ0TAMyUI,1423
 natural_pdf/elements/line.py,sha256=aQm4pDdlQSDAAXqrdg4AU-oTl9JCXgYuaJN0EYls6E0,4920
 natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
-natural_pdf/elements/region.py,sha256=8SKhzCJ6sELZxJcM2i_58YhEKU6HBvaJ7Oj6E3bOsHw,139523
+natural_pdf/elements/region.py,sha256=23J5Tv7ffAgz3IBgDXPq9Ab_lLg2Sog7elFRb6nvvZE,140541
 natural_pdf/elements/text.py,sha256=kw7u2KfHtDB905YawP7Hs89kcR8XnbtpkYQGEk6LNyk,18860
 natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
 natural_pdf/exporters/__init__.py,sha256=g1WRPCDVzceaUUsm8dchPhzdHFSjYM0NfRyc8iN0mtE,644
@@ -97,7 +97,7 @@ natural_pdf/utils/text_extraction.py,sha256=mDeN1_VevNi3RwvFe48PM5vBh-A5WeBlYgP6
 natural_pdf/utils/visualization.py,sha256=n3IZpbY5cf9LItzGavBcNyVZZrrUVxjYnmqZHYPa7NU,9386
 natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
 natural_pdf/widgets/viewer.py,sha256=2VUY1TzWMDe9I-IVNOosKZ2LaqpjLB62ftMAdk-s6_8,24952
-natural_pdf-0.1.32.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
+natural_pdf-0.1.33.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
 optimization/memory_comparison.py,sha256=XEHtjduSmzXzxnsJMvemTcq-OAlvGUBAm5wwnOXq8TY,6524
 optimization/pdf_analyzer.py,sha256=G3XWhsEqIYbohEgTqz6wzxkAnOx4MkbvbSspx577-8w,19145
 optimization/performance_analysis.py,sha256=vVlFDywEXxhJLd9n2KVVqqQnS6rwWoHV_jlogboGF2k,13784
@@ -111,8 +111,8 @@ tools/bad_pdf_eval/export_enrichment_csv.py,sha256=SMEm9WxFUN_RIf8AGfZfjGEmvBvrO
 tools/bad_pdf_eval/llm_enrich.py,sha256=PsFMymPc8BNck21T3vupTN18pLdum-A_OLoJEKr6f80,12234
 tools/bad_pdf_eval/reporter.py,sha256=LIhcguDZ5XKgb0WeJsyA7m0kcliebOohzveShvt_KmY,400
 tools/bad_pdf_eval/utils.py,sha256=FuxaPX6f26IjQXu1vP0a2i9h1jgJNbASb8mRyj5-elE,4849
-natural_pdf-0.1.32.dist-info/METADATA,sha256=CMZIo2BjeLh-b9hezQHMLehZP8brUflCQ69dLtfFyxo,6711
-natural_pdf-0.1.32.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-natural_pdf-0.1.32.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
-natural_pdf-0.1.32.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
-natural_pdf-0.1.32.dist-info/RECORD,,
+natural_pdf-0.1.33.dist-info/METADATA,sha256=mSAwh3vuD9aRvO_AC_XBZG5sw9SeiuidC86a7kuV--I,6711
+natural_pdf-0.1.33.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+natural_pdf-0.1.33.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
+natural_pdf-0.1.33.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
+natural_pdf-0.1.33.dist-info/RECORD,,

{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.33.dist-info}/WHEEL RENAMED Viewed

File without changes

{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.33.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.33.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.33.dist-info}/top_level.txt RENAMED Viewed

File without changes

natural-pdf 0.1.32__py3-none-any.whl → 0.1.33__py3-none-any.whl

natural-pdf 0.1.32__py3-none-any.whl → 0.1.33__py3-none-any.whl