PyPI - natural-pdf - Versions diffs - 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl - Mend

natural-pdf 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

natural_pdf/core/page.py CHANGED Viewed

@@ -717,14 +717,23 @@ class Page(
         # Add PDF-level exclusions if we have a parent PDF
         if hasattr(self, "_parent") and self._parent and hasattr(self._parent, "_exclusions"):
+            # Get existing labels to check for duplicates
+            existing_labels = set()
+            for exc in all_exclusions:
+                if len(exc) >= 2 and exc[1]:  # Has a label
+                    existing_labels.add(exc[1])
             for pdf_exclusion in self._parent._exclusions:
-                # Check if this exclusion is already in our list (avoid duplicates)
-                if pdf_exclusion not in all_exclusions:
-                    # Ensure consistent format (PDF exclusions might be 2-tuples, need to be 3-tuples)
-                    if len(pdf_exclusion) == 2:
-                        # Convert to 3-tuple format with default method
-                        pdf_exclusion = (pdf_exclusion[0], pdf_exclusion[1], "region")
-                    all_exclusions.append(pdf_exclusion)
+                # Check if this exclusion label is already in our list (avoid duplicates)
+                label = pdf_exclusion[1] if len(pdf_exclusion) >= 2 else None
+                if label and label in existing_labels:
+                    continue  # Skip this exclusion as it's already been applied
+                # Ensure consistent format (PDF exclusions might be 2-tuples, need to be 3-tuples)
+                if len(pdf_exclusion) == 2:
+                    # Convert to 3-tuple format with default method
+                    pdf_exclusion = (pdf_exclusion[0], pdf_exclusion[1], "region")
+                all_exclusions.append(pdf_exclusion)
         if debug:
             print(
@@ -829,6 +838,36 @@ class Page(
                 regions.append(exclusion_item)  # Label is already on the Region object
                 if debug:
                     print(f"  - Added direct region '{label}': {exclusion_item}")
+            # Process string selectors (from PDF-level exclusions)
+            elif isinstance(exclusion_item, str):
+                selector_str = exclusion_item
+                matching_elements = self.find_all(selector_str, apply_exclusions=False)
+                if debug:
+                    print(
+                        f"  - Evaluating selector '{exclusion_label}': found {len(matching_elements)} elements"
+                    )
+                if method == "region":
+                    # Convert each matching element to a region
+                    for el in matching_elements:
+                        try:
+                            bbox_coords = (
+                                float(el.x0),
+                                float(el.top),
+                                float(el.x1),
+                                float(el.bottom),
+                            )
+                            region = Region(self, bbox_coords, label=label)
+                            regions.append(region)
+                            if debug:
+                                print(f"    ✓ Added region from selector match: {bbox_coords}")
+                        except Exception as e:
+                            if debug:
+                                print(f"    ✗ Failed to create region from element: {e}")
+                # If method is "element", it will be handled in _filter_elements_by_exclusions
             # Element-based exclusions are not converted to regions here
             # They will be handled separately in _filter_elements_by_exclusions
@@ -852,7 +891,16 @@ class Page(
         Returns:
             A new list containing only the elements not excluded.
         """
-        if not self._exclusions:
+        # Check both page-level and PDF-level exclusions
+        has_page_exclusions = bool(self._exclusions)
+        has_pdf_exclusions = (
+            hasattr(self, "_parent")
+            and self._parent
+            and hasattr(self._parent, "_exclusions")
+            and bool(self._parent._exclusions)
+        )
+        if not has_page_exclusions and not has_pdf_exclusions:
             if debug_exclusions:
                 print(
                     f"Page {self.index}: No exclusions defined, returning all {len(elements)} elements."
@@ -865,9 +913,15 @@ class Page(
         )
         # Collect element-based exclusions
-        excluded_elements = set()  # Use set for O(1) lookup
+        # Store element bboxes for comparison instead of object ids
+        excluded_element_bboxes = set()  # Use set for O(1) lookup
+        # Process both page-level and PDF-level exclusions
+        all_exclusions = list(self._exclusions) if has_page_exclusions else []
+        if has_pdf_exclusions:
+            all_exclusions.extend(self._parent._exclusions)
-        for exclusion_data in self._exclusions:
+        for exclusion_data in all_exclusions:
             # Handle both old format (2-tuple) and new format (3-tuple)
             if len(exclusion_data) == 2:
                 exclusion_item, label = exclusion_data
@@ -883,16 +937,31 @@ class Page(
             if isinstance(exclusion_item, Region):
                 continue
+            # Handle string selectors for element-based exclusions
+            if isinstance(exclusion_item, str) and method == "element":
+                selector_str = exclusion_item
+                matching_elements = self.find_all(selector_str, apply_exclusions=False)
+                for el in matching_elements:
+                    if hasattr(el, "bbox"):
+                        bbox = tuple(el.bbox)
+                        excluded_element_bboxes.add(bbox)
+                        if debug_exclusions:
+                            print(
+                                f"  - Added element exclusion from selector '{selector_str}': {bbox}"
+                            )
             # Handle element-based exclusions
-            if method == "element" and hasattr(exclusion_item, "bbox"):
-                excluded_elements.add(id(exclusion_item))
+            elif method == "element" and hasattr(exclusion_item, "bbox"):
+                # Store bbox tuple for comparison
+                bbox = tuple(exclusion_item.bbox)
+                excluded_element_bboxes.add(bbox)
                 if debug_exclusions:
-                    print(f"  - Added element exclusion: {exclusion_item}")
+                    print(f"  - Added element exclusion with bbox {bbox}: {exclusion_item}")
         if debug_exclusions:
             print(
                 f"Page {self.index}: Applying {len(exclusion_regions)} region exclusions "
-                f"and {len(excluded_elements)} element exclusions to {len(elements)} elements."
+                f"and {len(excluded_element_bboxes)} element exclusions to {len(elements)} elements."
             )
         filtered_elements = []
@@ -903,7 +972,7 @@ class Page(
             exclude = False
             # Check element-based exclusions first (faster)
-            if id(element) in excluded_elements:
+            if hasattr(element, "bbox") and tuple(element.bbox) in excluded_element_bboxes:
                 exclude = True
                 element_excluded_count += 1
                 if debug_exclusions:
@@ -2487,10 +2556,23 @@ class Page(
         return self
     def get_section_between(
-        self, start_element=None, end_element=None, include_boundaries="both"
+        self,
+        start_element=None,
+        end_element=None,
+        include_boundaries="both",
+        orientation="vertical",
     ) -> Optional["Region"]:  # Return Optional
         """
         Get a section between two elements on this page.
+        Args:
+            start_element: Element marking the start of the section
+            end_element: Element marking the end of the section
+            include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
+            orientation: 'vertical' (default) or 'horizontal' - determines section direction
+        Returns:
+            Region representing the section
         """
         # Create a full-page region to operate within
         page_region = self.create_region(0, 0, self.width, self.height)
@@ -2501,6 +2583,7 @@ class Page(
                 start_element=start_element,
                 end_element=end_element,
                 include_boundaries=include_boundaries,
+                orientation=orientation,
             )
         except Exception as e:
             logger.error(
@@ -2575,10 +2658,23 @@ class Page(
         if include_boundaries not in valid_inclusions:
             raise ValueError(f"include_boundaries must be one of {valid_inclusions}")
-        if not start_elements:
-            # Return an empty ElementCollection if no start elements
+        if not start_elements and not end_elements:
+            # Return an empty ElementCollection if no boundary elements at all
             return ElementCollection([])
+        # If we only have end elements, create implicit start elements
+        if not start_elements and end_elements:
+            # Delegate to PageCollection implementation for consistency
+            from natural_pdf.core.page_collection import PageCollection
+            pages = PageCollection([self])
+            return pages.get_sections(
+                start_elements=start_elements,
+                end_elements=end_elements,
+                include_boundaries=include_boundaries,
+                orientation=orientation,
+            )
         # Combine start and end elements with their type
         all_boundaries = []
         for el in start_elements:

natural_pdf/core/page_collection.py CHANGED Viewed

@@ -537,10 +537,14 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
             first_page = self.pages[0]
             first_start = Region(first_page, (0, 0, first_page.width, 1))
             first_start.is_implicit_start = True
+            # Don't mark this as created from any end element, so it can pair with any end
             start_elements.append(first_start)
             # For each end element (except the last), add an implicit start after it
-            sorted_end_elements = sorted(end_elements, key=lambda e: (e.page.index, e.top, e.x0))
+            # Sort by page, then top, then bottom (for elements with same top), then x0
+            sorted_end_elements = sorted(
+                end_elements, key=lambda e: (e.page.index, e.top, e.bottom, e.x0)
+            )
             for i, end_elem in enumerate(sorted_end_elements[:-1]):  # Exclude last end element
                 # Create implicit start element right after this end element
                 implicit_start = Region(
@@ -838,29 +842,47 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
                 # Create a section from current_start to just before this boundary
                 start_element = current_start["element"]
-                # Find the last element before this boundary on the same page
+                # Create section from current start to just before this new start
                 if start_element.page == boundary["element"].page:
-                    # Find elements on this page
-                    page_elements = [e for e in all_elements if e.page == start_element.page]
-                    # Sort by position based on orientation
+                    from natural_pdf.elements.region import Region
+                    next_start = boundary["element"]
+                    # Create section based on orientation
                     if orientation == "vertical":
-                        page_elements.sort(key=lambda e: (e.top, e.x0))
+                        # Determine vertical bounds
+                        if include_boundaries in ["start", "both"]:
+                            top = start_element.top
+                        else:
+                            top = start_element.bottom
+                        # The section ends just before the next start
+                        bottom = next_start.top
+                        # Create the section with full page width
+                        if top < bottom:
+                            section = Region(
+                                start_element.page, (0, top, start_element.page.width, bottom)
+                            )
+                            section.start_element = start_element
+                            sections.append(section)
                     else:  # horizontal
-                        page_elements.sort(key=lambda e: (e.x0, e.top))
+                        # Determine horizontal bounds
+                        if include_boundaries in ["start", "both"]:
+                            left = start_element.x0
+                        else:
+                            left = start_element.x1
-                    # Find the last element before the boundary
-                    end_idx = (
-                        page_elements.index(boundary["element"]) - 1
-                        if boundary["element"] in page_elements
-                        else -1
-                    )
-                    end_element = page_elements[end_idx] if end_idx >= 0 else None
+                        # The section ends just before the next start
+                        right = next_start.x0
-                    # Create the section
-                    section = start_element.page.get_section_between(
-                        start_element, end_element, include_boundaries, orientation
-                    )
-                    sections.append(section)
+                        # Create the section with full page height
+                        if left < right:
+                            section = Region(
+                                start_element.page, (left, 0, right, start_element.page.height)
+                            )
+                            section.start_element = start_element
+                            sections.append(section)
                 else:
                     # Cross-page section - create from current_start to the end of its page
                     from natural_pdf.elements.region import Region

natural_pdf/core/pdf.py CHANGED Viewed

@@ -252,6 +252,16 @@ class _LazyPageList(Sequence):
                         logger.warning(f"Failed to apply region to page {cached.number}: {e}")
             self._cache[index] = cached
+            # Also cache in the parent PDF's main page list if this is a slice
+            if (
+                hasattr(self._parent_pdf, "_pages")
+                and hasattr(self._parent_pdf._pages, "_cache")
+                and actual_page_index < len(self._parent_pdf._pages._cache)
+                and self._parent_pdf._pages._cache[actual_page_index] is None
+            ):
+                self._parent_pdf._pages._cache[actual_page_index] = cached
         return cached
     # Sequence protocol ---------------------------------------------------
@@ -720,26 +730,16 @@ class PDF(
             # Store for bookkeeping and lazy application
             self._exclusions.append((exclusion_func, label))
-            # Apply only to already-created (cached) pages to avoid forcing page creation
-            for i in range(len(self._pages)):
-                if self._pages._cache[i] is not None:  # Only apply to existing pages
-                    try:
-                        self._pages._cache[i].add_exclusion(exclusion_func, label=label)
-                    except Exception as e:
-                        logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
+            # Don't modify already-cached pages - they will get PDF-level exclusions
+            # dynamically through _get_exclusion_regions()
             return self
         # Fallback to original callable / Region behaviour ------------------
         exclusion_data = (exclusion_func, label)
         self._exclusions.append(exclusion_data)
-        # Apply only to already-created (cached) pages to avoid forcing page creation
-        for i in range(len(self._pages)):
-            if self._pages._cache[i] is not None:  # Only apply to existing pages
-                try:
-                    self._pages._cache[i].add_exclusion(exclusion_func, label=label)
-                except Exception as e:
-                    logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
+        # Don't modify already-cached pages - they will get PDF-level exclusions
+        # dynamically through _get_exclusion_regions()
         return self

natural_pdf/elements/element_collection.py CHANGED Viewed

@@ -621,6 +621,7 @@ class ElementCollection(
     def extract_text(
         self,
+        separator: str = " ",
         preserve_whitespace: bool = True,
         use_exclusions: bool = True,
         strip: Optional[bool] = None,
@@ -632,6 +633,9 @@ class ElementCollection(
         pdfplumber's layout engine if layout=True is specified.
         Args:
+            separator: String to insert between text from different elements when
+                      using simple joining (layout=False). Default is a single space.
+                      Ignored when layout=True as the layout engine handles spacing.
             preserve_whitespace: Deprecated. Use layout=False for simple joining.
             use_exclusions: Deprecated. Exclusions should be applied *before* creating
                           the collection or by filtering the collection itself.
@@ -668,7 +672,7 @@ class ElementCollection(
             logger.warning(
                 "ElementCollection.extract_text: No character dictionaries found in TextElements."
             )
-            return " ".join(
+            return separator.join(
                 getattr(el, "text", "") for el in text_elements
             )  # Fallback to simple join of word text
@@ -733,18 +737,33 @@ class ElementCollection(
                 all_char_dicts.sort(
                     key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
                 )
-                result = " ".join(c.get("text", "") for c in all_char_dicts)
+                result = separator.join(c.get("text", "") for c in all_char_dicts)
         else:
             # Default: Simple join without layout
             logger.debug("ElementCollection.extract_text: Using simple join (layout=False).")
-            # Sort chars by document order (page, top, x0)
-            all_char_dicts.sort(
-                key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
+            # Instead of joining all characters individually, we need to:
+            # 1. Extract text from each element
+            # 2. Join the element texts with the separator
+            # Sort elements by document order (page, top, x0)
+            sorted_elements = sorted(
+                text_elements,
+                key=lambda el: (
+                    el.page.index if hasattr(el, "page") else 0,
+                    el.top if hasattr(el, "top") else 0,
+                    el.x0 if hasattr(el, "x0") else 0,
+                ),
             )
-            # Simple join of character text
-            result = "".join(c.get("text", "") for c in all_char_dicts)
-            # Replace multiple spaces created by joining possibly overlapping chars? Maybe not necessary.
+            # Extract text from each element
+            element_texts = []
+            for el in sorted_elements:
+                if hasattr(el, "text") and el.text:
+                    element_texts.append(el.text)
+            result = separator.join(element_texts)
         # Determine final strip flag – same rule as global helper unless caller overrides
         strip_text = strip if strip is not None else (not use_layout)

{natural_pdf-0.2.6.dist-info → natural_pdf-0.2.8.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: natural-pdf
-Version: 0.2.6
+Version: 0.2.8
 Summary: A more intuitive interface for working with PDFs
 Author-email: Jonathan Soma <jonathan.soma@gmail.com>
 License-Expression: MIT

{natural_pdf-0.2.6.dist-info → natural_pdf-0.2.8.dist-info}/RECORD RENAMED Viewed

@@ -27,10 +27,10 @@ natural_pdf/collections/mixins.py,sha256=Se2C5AcpP9B5E0d0pIrey6-f_P32tAXTK4M7666
 natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
 natural_pdf/core/element_manager.py,sha256=KPuKM7SstfErTkRnGq4vrgE0Tv8iazN13Jp7yAXGKso,55575
 natural_pdf/core/highlighting_service.py,sha256=7on8nErhi50CEH2L4XzGIZ6tIqZtMzmmFlp-2lmwnYE,68856
-natural_pdf/core/page.py,sha256=U0wAEw6z_lFuv6BBY8DKIpD5Y4wiZCo7x7qtjPf3hcM,148300
-natural_pdf/core/page_collection.py,sha256=itVSWeY6285G7_bIP7vjrMygnGQTX2SdNbJxYW5Eypc,62196
+natural_pdf/core/page.py,sha256=XmXii652iM-JVKgzpbKQ8f59U0TvDLD5iAfdtx92gis,152675
+natural_pdf/core/page_collection.py,sha256=IjdFq9q0D0P6ZKWInf0H25rLzxfMb7RsUXucogkhNkU,63169
 natural_pdf/core/page_groupby.py,sha256=V2e_RNlHaasUzYm2h2vNJI7_aV_fl3_pg7kU3F2j0z8,8218
-natural_pdf/core/pdf.py,sha256=XMEPyd6LlwAhFvnTAU5ZtE_Hr4WpkExxw16DpYsZpvQ,104410
+natural_pdf/core/pdf.py,sha256=ovdeu9TRPnVYyMltD7QpcdcFYBLZFXh3LlfC5ifj6RY,104227
 natural_pdf/core/pdf_collection.py,sha256=s3ogu4CEHrHMTRqQMJUKJZ-9Ii8b_B9dWbVLTFj0s7g,34992
 natural_pdf/core/render_spec.py,sha256=y9QkMiIvWaKiEBlV0TjyldADIEUY3YfWLQXxStHu1S4,15480
 natural_pdf/describe/__init__.py,sha256=kIV7ORmWWB1SAur7nK2aAwR-wHqSedhKfUsaUl4hG0A,586
@@ -40,7 +40,7 @@ natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ
 natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
 natural_pdf/elements/__init__.py,sha256=ICNikmLeIEuSYypz-KnkBn8xR1hR7rge4hsa1KLkyWY,42
 natural_pdf/elements/base.py,sha256=92ukTtRCQFsa5KvKflChCt4mt0ZGS4ecGYCQTNMO4zU,58907
-natural_pdf/elements/element_collection.py,sha256=42SUzjD2nYFPNEQA-4oMi2QOwwwsxBmcrY4FKgGumJ0,128700
+natural_pdf/elements/element_collection.py,sha256=-piFQGiDPiqmnl-Cpoi3PGPmGe4AYvpl0IqaJGxBsBc,129405
 natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
 natural_pdf/elements/line.py,sha256=TFn7KXjPT_jUQyQyabU0F7XYU4dC-qadwodJMZF4DCU,3844
 natural_pdf/elements/rect.py,sha256=0lNkVkPkvbRbrFED856RXoUcTcDkeeOIs5xldKGAQT8,3324
@@ -107,7 +107,7 @@ natural_pdf/vision/results.py,sha256=F2zXG3MVZIpOUvPkJHotOq6-9rFz68BaO_8pnSndlOs
 natural_pdf/vision/similarity.py,sha256=YH8legN-t9uf1b_XULi4JLNDaRfPNKQwU1FZ4Qu08jY,11740
 natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
 natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
-natural_pdf-0.2.6.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
+natural_pdf-0.2.8.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
 optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
 optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
 optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
@@ -124,8 +124,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
 tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
 tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
 tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
-natural_pdf-0.2.6.dist-info/METADATA,sha256=SJ7AqaSiRD-4NYz9Pk0Iz7IlEMiiv1aha3V8do8qvbo,6959
-natural_pdf-0.2.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-natural_pdf-0.2.6.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
-natural_pdf-0.2.6.dist-info/top_level.txt,sha256=80t0F2ZeX4vN4Ke5iTflcOk_PN_0USn33ha3X6X86Ik,36
-natural_pdf-0.2.6.dist-info/RECORD,,
+natural_pdf-0.2.8.dist-info/METADATA,sha256=tuWXV-mY9zU0qsVsXhrrp3aGBfSxlklUxS_Dlllqmp4,6959
+natural_pdf-0.2.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+natural_pdf-0.2.8.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
+natural_pdf-0.2.8.dist-info/top_level.txt,sha256=80t0F2ZeX4vN4Ke5iTflcOk_PN_0USn33ha3X6X86Ik,36
+natural_pdf-0.2.8.dist-info/RECORD,,

{natural_pdf-0.2.6.dist-info → natural_pdf-0.2.8.dist-info}/WHEEL RENAMED Viewed

File without changes

{natural_pdf-0.2.6.dist-info → natural_pdf-0.2.8.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{natural_pdf-0.2.6.dist-info → natural_pdf-0.2.8.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{natural_pdf-0.2.6.dist-info → natural_pdf-0.2.8.dist-info}/top_level.txt RENAMED Viewed

File without changes

natural-pdf 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl

natural-pdf 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl