PyPI - natural-pdf - Versions diffs - 0.2.6__tar.gz → 0.2.9__tar.gz - Mend

natural-pdf 0.2.6__tar.gz → 0.2.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (320) hide show

{natural_pdf-0.2.6/natural_pdf.egg-info → natural_pdf-0.2.9}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: natural-pdf
-Version: 0.2.6
+Version: 0.2.9
 Summary: A more intuitive interface for working with PDFs
 Author-email: Jonathan Soma <jonathan.soma@gmail.com>
 License-Expression: MIT

{natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/core/page.py RENAMED Viewed

@@ -717,14 +717,23 @@ class Page(
         # Add PDF-level exclusions if we have a parent PDF
         if hasattr(self, "_parent") and self._parent and hasattr(self._parent, "_exclusions"):
+            # Get existing labels to check for duplicates
+            existing_labels = set()
+            for exc in all_exclusions:
+                if len(exc) >= 2 and exc[1]:  # Has a label
+                    existing_labels.add(exc[1])
             for pdf_exclusion in self._parent._exclusions:
-                # Check if this exclusion is already in our list (avoid duplicates)
-                if pdf_exclusion not in all_exclusions:
-                    # Ensure consistent format (PDF exclusions might be 2-tuples, need to be 3-tuples)
-                    if len(pdf_exclusion) == 2:
-                        # Convert to 3-tuple format with default method
-                        pdf_exclusion = (pdf_exclusion[0], pdf_exclusion[1], "region")
-                    all_exclusions.append(pdf_exclusion)
+                # Check if this exclusion label is already in our list (avoid duplicates)
+                label = pdf_exclusion[1] if len(pdf_exclusion) >= 2 else None
+                if label and label in existing_labels:
+                    continue  # Skip this exclusion as it's already been applied
+                # Ensure consistent format (PDF exclusions might be 2-tuples, need to be 3-tuples)
+                if len(pdf_exclusion) == 2:
+                    # Convert to 3-tuple format with default method
+                    pdf_exclusion = (pdf_exclusion[0], pdf_exclusion[1], "region")
+                all_exclusions.append(pdf_exclusion)
         if debug:
             print(
@@ -829,6 +838,36 @@ class Page(
                 regions.append(exclusion_item)  # Label is already on the Region object
                 if debug:
                     print(f"  - Added direct region '{label}': {exclusion_item}")
+            # Process string selectors (from PDF-level exclusions)
+            elif isinstance(exclusion_item, str):
+                selector_str = exclusion_item
+                matching_elements = self.find_all(selector_str, apply_exclusions=False)
+                if debug:
+                    print(
+                        f"  - Evaluating selector '{exclusion_label}': found {len(matching_elements)} elements"
+                    )
+                if method == "region":
+                    # Convert each matching element to a region
+                    for el in matching_elements:
+                        try:
+                            bbox_coords = (
+                                float(el.x0),
+                                float(el.top),
+                                float(el.x1),
+                                float(el.bottom),
+                            )
+                            region = Region(self, bbox_coords, label=label)
+                            regions.append(region)
+                            if debug:
+                                print(f"    ✓ Added region from selector match: {bbox_coords}")
+                        except Exception as e:
+                            if debug:
+                                print(f"    ✗ Failed to create region from element: {e}")
+                # If method is "element", it will be handled in _filter_elements_by_exclusions
             # Element-based exclusions are not converted to regions here
             # They will be handled separately in _filter_elements_by_exclusions
@@ -852,7 +891,16 @@ class Page(
         Returns:
             A new list containing only the elements not excluded.
         """
-        if not self._exclusions:
+        # Check both page-level and PDF-level exclusions
+        has_page_exclusions = bool(self._exclusions)
+        has_pdf_exclusions = (
+            hasattr(self, "_parent")
+            and self._parent
+            and hasattr(self._parent, "_exclusions")
+            and bool(self._parent._exclusions)
+        )
+        if not has_page_exclusions and not has_pdf_exclusions:
             if debug_exclusions:
                 print(
                     f"Page {self.index}: No exclusions defined, returning all {len(elements)} elements."
@@ -865,9 +913,15 @@ class Page(
         )
         # Collect element-based exclusions
-        excluded_elements = set()  # Use set for O(1) lookup
+        # Store element bboxes for comparison instead of object ids
+        excluded_element_bboxes = set()  # Use set for O(1) lookup
+        # Process both page-level and PDF-level exclusions
+        all_exclusions = list(self._exclusions) if has_page_exclusions else []
+        if has_pdf_exclusions:
+            all_exclusions.extend(self._parent._exclusions)
-        for exclusion_data in self._exclusions:
+        for exclusion_data in all_exclusions:
             # Handle both old format (2-tuple) and new format (3-tuple)
             if len(exclusion_data) == 2:
                 exclusion_item, label = exclusion_data
@@ -883,16 +937,31 @@ class Page(
             if isinstance(exclusion_item, Region):
                 continue
+            # Handle string selectors for element-based exclusions
+            if isinstance(exclusion_item, str) and method == "element":
+                selector_str = exclusion_item
+                matching_elements = self.find_all(selector_str, apply_exclusions=False)
+                for el in matching_elements:
+                    if hasattr(el, "bbox"):
+                        bbox = tuple(el.bbox)
+                        excluded_element_bboxes.add(bbox)
+                        if debug_exclusions:
+                            print(
+                                f"  - Added element exclusion from selector '{selector_str}': {bbox}"
+                            )
             # Handle element-based exclusions
-            if method == "element" and hasattr(exclusion_item, "bbox"):
-                excluded_elements.add(id(exclusion_item))
+            elif method == "element" and hasattr(exclusion_item, "bbox"):
+                # Store bbox tuple for comparison
+                bbox = tuple(exclusion_item.bbox)
+                excluded_element_bboxes.add(bbox)
                 if debug_exclusions:
-                    print(f"  - Added element exclusion: {exclusion_item}")
+                    print(f"  - Added element exclusion with bbox {bbox}: {exclusion_item}")
         if debug_exclusions:
             print(
                 f"Page {self.index}: Applying {len(exclusion_regions)} region exclusions "
-                f"and {len(excluded_elements)} element exclusions to {len(elements)} elements."
+                f"and {len(excluded_element_bboxes)} element exclusions to {len(elements)} elements."
             )
         filtered_elements = []
@@ -903,7 +972,7 @@ class Page(
             exclude = False
             # Check element-based exclusions first (faster)
-            if id(element) in excluded_elements:
+            if hasattr(element, "bbox") and tuple(element.bbox) in excluded_element_bboxes:
                 exclude = True
                 element_excluded_count += 1
                 if debug_exclusions:
@@ -2487,10 +2556,23 @@ class Page(
         return self
     def get_section_between(
-        self, start_element=None, end_element=None, include_boundaries="both"
+        self,
+        start_element=None,
+        end_element=None,
+        include_boundaries="both",
+        orientation="vertical",
     ) -> Optional["Region"]:  # Return Optional
         """
         Get a section between two elements on this page.
+        Args:
+            start_element: Element marking the start of the section
+            end_element: Element marking the end of the section
+            include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
+            orientation: 'vertical' (default) or 'horizontal' - determines section direction
+        Returns:
+            Region representing the section
         """
         # Create a full-page region to operate within
         page_region = self.create_region(0, 0, self.width, self.height)
@@ -2501,6 +2583,7 @@ class Page(
                 start_element=start_element,
                 end_element=end_element,
                 include_boundaries=include_boundaries,
+                orientation=orientation,
             )
         except Exception as e:
             logger.error(
@@ -2575,10 +2658,23 @@ class Page(
         if include_boundaries not in valid_inclusions:
             raise ValueError(f"include_boundaries must be one of {valid_inclusions}")
-        if not start_elements:
-            # Return an empty ElementCollection if no start elements
+        if not start_elements and not end_elements:
+            # Return an empty ElementCollection if no boundary elements at all
             return ElementCollection([])
+        # If we only have end elements, create implicit start elements
+        if not start_elements and end_elements:
+            # Delegate to PageCollection implementation for consistency
+            from natural_pdf.core.page_collection import PageCollection
+            pages = PageCollection([self])
+            return pages.get_sections(
+                start_elements=start_elements,
+                end_elements=end_elements,
+                include_boundaries=include_boundaries,
+                orientation=orientation,
+            )
         # Combine start and end elements with their type
         all_boundaries = []
         for el in start_elements:

{natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/core/page_collection.py RENAMED Viewed

@@ -537,10 +537,14 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
             first_page = self.pages[0]
             first_start = Region(first_page, (0, 0, first_page.width, 1))
             first_start.is_implicit_start = True
+            # Don't mark this as created from any end element, so it can pair with any end
             start_elements.append(first_start)
             # For each end element (except the last), add an implicit start after it
-            sorted_end_elements = sorted(end_elements, key=lambda e: (e.page.index, e.top, e.x0))
+            # Sort by page, then top, then bottom (for elements with same top), then x0
+            sorted_end_elements = sorted(
+                end_elements, key=lambda e: (e.page.index, e.top, e.bottom, e.x0)
+            )
             for i, end_elem in enumerate(sorted_end_elements[:-1]):  # Exclude last end element
                 # Create implicit start element right after this end element
                 implicit_start = Region(
@@ -838,29 +842,47 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
                 # Create a section from current_start to just before this boundary
                 start_element = current_start["element"]
-                # Find the last element before this boundary on the same page
+                # Create section from current start to just before this new start
                 if start_element.page == boundary["element"].page:
-                    # Find elements on this page
-                    page_elements = [e for e in all_elements if e.page == start_element.page]
-                    # Sort by position based on orientation
+                    from natural_pdf.elements.region import Region
+                    next_start = boundary["element"]
+                    # Create section based on orientation
                     if orientation == "vertical":
-                        page_elements.sort(key=lambda e: (e.top, e.x0))
+                        # Determine vertical bounds
+                        if include_boundaries in ["start", "both"]:
+                            top = start_element.top
+                        else:
+                            top = start_element.bottom
+                        # The section ends just before the next start
+                        bottom = next_start.top
+                        # Create the section with full page width
+                        if top < bottom:
+                            section = Region(
+                                start_element.page, (0, top, start_element.page.width, bottom)
+                            )
+                            section.start_element = start_element
+                            sections.append(section)
                     else:  # horizontal
-                        page_elements.sort(key=lambda e: (e.x0, e.top))
+                        # Determine horizontal bounds
+                        if include_boundaries in ["start", "both"]:
+                            left = start_element.x0
+                        else:
+                            left = start_element.x1
-                    # Find the last element before the boundary
-                    end_idx = (
-                        page_elements.index(boundary["element"]) - 1
-                        if boundary["element"] in page_elements
-                        else -1
-                    )
-                    end_element = page_elements[end_idx] if end_idx >= 0 else None
+                        # The section ends just before the next start
+                        right = next_start.x0
-                    # Create the section
-                    section = start_element.page.get_section_between(
-                        start_element, end_element, include_boundaries, orientation
-                    )
-                    sections.append(section)
+                        # Create the section with full page height
+                        if left < right:
+                            section = Region(
+                                start_element.page, (left, 0, right, start_element.page.height)
+                            )
+                            section.start_element = start_element
+                            sections.append(section)
                 else:
                     # Cross-page section - create from current_start to the end of its page
                     from natural_pdf.elements.region import Region

{natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/core/pdf.py RENAMED Viewed

@@ -252,6 +252,16 @@ class _LazyPageList(Sequence):
                         logger.warning(f"Failed to apply region to page {cached.number}: {e}")
             self._cache[index] = cached
+            # Also cache in the parent PDF's main page list if this is a slice
+            if (
+                hasattr(self._parent_pdf, "_pages")
+                and hasattr(self._parent_pdf._pages, "_cache")
+                and actual_page_index < len(self._parent_pdf._pages._cache)
+                and self._parent_pdf._pages._cache[actual_page_index] is None
+            ):
+                self._parent_pdf._pages._cache[actual_page_index] = cached
         return cached
     # Sequence protocol ---------------------------------------------------
@@ -720,26 +730,16 @@ class PDF(
             # Store for bookkeeping and lazy application
             self._exclusions.append((exclusion_func, label))
-            # Apply only to already-created (cached) pages to avoid forcing page creation
-            for i in range(len(self._pages)):
-                if self._pages._cache[i] is not None:  # Only apply to existing pages
-                    try:
-                        self._pages._cache[i].add_exclusion(exclusion_func, label=label)
-                    except Exception as e:
-                        logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
+            # Don't modify already-cached pages - they will get PDF-level exclusions
+            # dynamically through _get_exclusion_regions()
             return self
         # Fallback to original callable / Region behaviour ------------------
         exclusion_data = (exclusion_func, label)
         self._exclusions.append(exclusion_data)
-        # Apply only to already-created (cached) pages to avoid forcing page creation
-        for i in range(len(self._pages)):
-            if self._pages._cache[i] is not None:  # Only apply to existing pages
-                try:
-                    self._pages._cache[i].add_exclusion(exclusion_func, label=label)
-                except Exception as e:
-                    logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
+        # Don't modify already-cached pages - they will get PDF-level exclusions
+        # dynamically through _get_exclusion_regions()
         return self

{natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/elements/element_collection.py RENAMED Viewed

@@ -621,6 +621,7 @@ class ElementCollection(
     def extract_text(
         self,
+        separator: str = " ",
         preserve_whitespace: bool = True,
         use_exclusions: bool = True,
         strip: Optional[bool] = None,
@@ -632,6 +633,7 @@ class ElementCollection(
         pdfplumber's layout engine if layout=True is specified.
         Args:
+            separator: String to join text from elements. Default is a single space.
             preserve_whitespace: Deprecated. Use layout=False for simple joining.
             use_exclusions: Deprecated. Exclusions should be applied *before* creating
                           the collection or by filtering the collection itself.
@@ -648,15 +650,49 @@ class ElementCollection(
         Returns:
             Combined text from elements, potentially with layout-based spacing.
         """
-        # Filter to just TextElements that likely have _char_dicts
-        text_elements = [
+        # Check if we have any elements at all
+        if not self._elements:
+            return ""
+        # Check if all elements are TextElements with character data
+        text_elements_with_chars = [
             el
             for el in self._elements
-            if isinstance(el, TextElement) and hasattr(el, "_char_dicts")
+            if isinstance(el, TextElement) and hasattr(el, "_char_dicts") and el._char_dicts
         ]
-        if not text_elements:
-            return ""
+        # If we have a mixed collection (Regions, TextElements without chars, etc),
+        # use a simpler approach: call extract_text on each element
+        if len(text_elements_with_chars) < len(self._elements):
+            # Mixed collection - extract text from each element
+            element_texts = []
+            # Sort elements by position first
+            sorted_elements = sorted(
+                self._elements,
+                key=lambda el: (
+                    el.page.index if hasattr(el, "page") else 0,
+                    el.top if hasattr(el, "top") else 0,
+                    el.x0 if hasattr(el, "x0") else 0,
+                ),
+            )
+            for el in sorted_elements:
+                if hasattr(el, "extract_text"):
+                    # Call extract_text on the element (works for TextElement, Region, etc)
+                    text = el.extract_text(**kwargs)
+                    if text:
+                        element_texts.append(text)
+                elif hasattr(el, "text"):
+                    # Fallback to text property if available
+                    text = getattr(el, "text", "")
+                    if text:
+                        element_texts.append(text)
+            return separator.join(element_texts)
+        # All elements are TextElements with char data - use the original approach
+        text_elements = text_elements_with_chars
         # Collect all character dictionaries
         all_char_dicts = []
@@ -665,11 +701,20 @@ class ElementCollection(
         if not all_char_dicts:
             # Handle case where elements exist but have no char dicts
-            logger.warning(
+            logger.debug(
                 "ElementCollection.extract_text: No character dictionaries found in TextElements."
             )
-            return " ".join(
-                getattr(el, "text", "") for el in text_elements
+            # Sort elements by position before joining
+            sorted_text_elements = sorted(
+                text_elements,
+                key=lambda el: (
+                    el.page.index if hasattr(el, "page") else 0,
+                    el.top if hasattr(el, "top") else 0,
+                    el.x0 if hasattr(el, "x0") else 0,
+                ),
+            )
+            return separator.join(
+                getattr(el, "text", "") for el in sorted_text_elements
             )  # Fallback to simple join of word text
         # Apply content filtering if provided
@@ -736,15 +781,17 @@ class ElementCollection(
                 result = " ".join(c.get("text", "") for c in all_char_dicts)
         else:
+            print("JOIN WITHOUT LAYOUT")
             # Default: Simple join without layout
             logger.debug("ElementCollection.extract_text: Using simple join (layout=False).")
-            # Sort chars by document order (page, top, x0)
-            all_char_dicts.sort(
-                key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
-            )
-            # Simple join of character text
-            result = "".join(c.get("text", "") for c in all_char_dicts)
-            # Replace multiple spaces created by joining possibly overlapping chars? Maybe not necessary.
+            result = separator.join(el.extract_text() for el in text_elements)
+            # # Sort chars by document order (page, top, x0)
+            # all_char_dicts.sort(
+            #     key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
+            # )
+            # # Simple join of character text
+            # result = "".join(c.get("text", "") for c in all_char_dicts)
         # Determine final strip flag – same rule as global helper unless caller overrides
         strip_text = strip if strip is not None else (not use_layout)

{natural_pdf-0.2.6 → natural_pdf-0.2.9/natural_pdf.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: natural-pdf
-Version: 0.2.6
+Version: 0.2.9
 Summary: A more intuitive interface for working with PDFs
 Author-email: Jonathan Soma <jonathan.soma@gmail.com>
 License-Expression: MIT

{natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_color_hex_display.py RENAMED Viewed

@@ -114,8 +114,9 @@ class TestGroupByColorDisplay:
         colors = [(1, 0, 0), (0, 1, 0), (0, 0, 1)]
         for i, color in enumerate(colors):
             page = MagicMock()
-            # Create a closure to capture the correct color
-            page.find.return_value = MagicMock(extract_text=lambda c=color: str(c))
+            # PageGroupBy groups by the text content of the element found
+            # So we need to return the color tuple as the extracted text
+            page.find.return_value = MagicMock(extract_text=lambda c=color: c)
             mock_pages.append(page)
         collection = PageCollection(mock_pages)
@@ -141,7 +142,7 @@ class TestGroupByColorDisplay:
         colors = [(255, 0, 0), (0, 255, 0)]
         for color in colors:
             page = MagicMock()
-            page.find.return_value = MagicMock(extract_text=lambda c=color: str(c))
+            page.find.return_value = MagicMock(extract_text=lambda c=color: c)
             mock_pages.append(page)
         collection = PageCollection(mock_pages)

{natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_fix_get_sections_zero_height.py RENAMED Viewed

@@ -68,7 +68,8 @@ def test_edge_case_single_end_element():
         print(f"\nSingle end element: bottom={end_elem.bottom}")
         # Create sections with single end element
-        sections = page.get_sections(end_elements=[end_elem])
+        # When using only end elements, we typically want to include the end boundary
+        sections = page.get_sections(end_elements=[end_elem], include_boundaries="end")
         print(f"Sections created: {len(sections)}")
@@ -80,7 +81,8 @@ def test_edge_case_single_end_element():
         print(f"Expected height: {end_elem.bottom}")
         # Height should be approximately end_elem.bottom (from top of page)
-        assert abs(section.height - end_elem.bottom) < 1.0
+        # Allow for small rounding differences
+        assert abs(section.height - end_elem.bottom) <= 1.0
 def test_mixed_start_end_elements():

{natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_get_sections_fix_comprehensive.py RENAMED Viewed

@@ -115,13 +115,16 @@ def test_implicit_start_not_paired_with_source_end():
         print(f"\nSections created: {len(sections)}")
-        # The first section should go from top of page to first end
-        # The second section should go from first end to second end
+        # With default include_boundaries="start", sections exclude the end boundary
+        # So the first section should go from top of page to TOP of first end element
         # There should NOT be a zero-height section at first end
+        # Sort end elements like the implementation does
+        sorted_ends = sorted(end_elements, key=lambda e: (e.page.index, e.top, e.bottom, e.x0))
         expected_sections = [
-            (0, end_elements[0].bottom),  # Top to first end
-            (end_elements[0].bottom, end_elements[1].bottom),  # First end to second end
+            (0, sorted_ends[0].top),  # Top to TOP of first sorted end (exclude end boundary)
+            # Second section continues from there - we don't check its end
         ]
         for i, section in enumerate(sections):

{natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_guides_extract_table.py RENAMED Viewed

@@ -96,6 +96,7 @@ def test_guides_extract_table_with_parameters():
             cell_extraction_func=None,
             show_progress=False,
             content_filter=None,
+            apply_exclusions=True,
         )

{natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_guides_extract_table_collections.py RENAMED Viewed

@@ -77,13 +77,13 @@ def test_extract_table_collection_header_options():
     # Test header=None
     result2 = guide.extract_table(pages, header=None)
-    df2 = result2.to_df()
+    df2 = result2.to_df(header=None)  # Need to pass header=None to to_df as well
     assert isinstance(df2.columns[0], int)  # Should use numeric indices
     # Test custom headers
     custom_headers = ["A", "B", "C", "D", "E", "F", "G", "H"]
     result3 = guide.extract_table(pages, header=custom_headers)
-    df3 = result3.to_df()
+    df3 = result3.to_df(header=custom_headers)  # Pass custom headers to to_df
     assert list(df3.columns) == custom_headers

natural-pdf 0.2.6__tar.gz → 0.2.9__tar.gz

natural-pdf 0.2.6__tar.gz → 0.2.9__tar.gz