PyPI - natural-pdf - Versions diffs - 0.2.6__tar.gz → 0.2.8__tar.gz - Mend

natural-pdf 0.2.6__tar.gz → 0.2.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (320) hide show

{natural_pdf-0.2.6/natural_pdf.egg-info → natural_pdf-0.2.8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: natural-pdf
-Version: 0.2.6
+Version: 0.2.8
 Summary: A more intuitive interface for working with PDFs
 Author-email: Jonathan Soma <jonathan.soma@gmail.com>
 License-Expression: MIT

{natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/core/page.py RENAMED Viewed

@@ -717,14 +717,23 @@ class Page(
         # Add PDF-level exclusions if we have a parent PDF
         if hasattr(self, "_parent") and self._parent and hasattr(self._parent, "_exclusions"):
+            # Get existing labels to check for duplicates
+            existing_labels = set()
+            for exc in all_exclusions:
+                if len(exc) >= 2 and exc[1]:  # Has a label
+                    existing_labels.add(exc[1])
             for pdf_exclusion in self._parent._exclusions:
-                # Check if this exclusion is already in our list (avoid duplicates)
-                if pdf_exclusion not in all_exclusions:
-                    # Ensure consistent format (PDF exclusions might be 2-tuples, need to be 3-tuples)
-                    if len(pdf_exclusion) == 2:
-                        # Convert to 3-tuple format with default method
-                        pdf_exclusion = (pdf_exclusion[0], pdf_exclusion[1], "region")
-                    all_exclusions.append(pdf_exclusion)
+                # Check if this exclusion label is already in our list (avoid duplicates)
+                label = pdf_exclusion[1] if len(pdf_exclusion) >= 2 else None
+                if label and label in existing_labels:
+                    continue  # Skip this exclusion as it's already been applied
+                # Ensure consistent format (PDF exclusions might be 2-tuples, need to be 3-tuples)
+                if len(pdf_exclusion) == 2:
+                    # Convert to 3-tuple format with default method
+                    pdf_exclusion = (pdf_exclusion[0], pdf_exclusion[1], "region")
+                all_exclusions.append(pdf_exclusion)
         if debug:
             print(
@@ -829,6 +838,36 @@ class Page(
                 regions.append(exclusion_item)  # Label is already on the Region object
                 if debug:
                     print(f"  - Added direct region '{label}': {exclusion_item}")
+            # Process string selectors (from PDF-level exclusions)
+            elif isinstance(exclusion_item, str):
+                selector_str = exclusion_item
+                matching_elements = self.find_all(selector_str, apply_exclusions=False)
+                if debug:
+                    print(
+                        f"  - Evaluating selector '{exclusion_label}': found {len(matching_elements)} elements"
+                    )
+                if method == "region":
+                    # Convert each matching element to a region
+                    for el in matching_elements:
+                        try:
+                            bbox_coords = (
+                                float(el.x0),
+                                float(el.top),
+                                float(el.x1),
+                                float(el.bottom),
+                            )
+                            region = Region(self, bbox_coords, label=label)
+                            regions.append(region)
+                            if debug:
+                                print(f"    ✓ Added region from selector match: {bbox_coords}")
+                        except Exception as e:
+                            if debug:
+                                print(f"    ✗ Failed to create region from element: {e}")
+                # If method is "element", it will be handled in _filter_elements_by_exclusions
             # Element-based exclusions are not converted to regions here
             # They will be handled separately in _filter_elements_by_exclusions
@@ -852,7 +891,16 @@ class Page(
         Returns:
             A new list containing only the elements not excluded.
         """
-        if not self._exclusions:
+        # Check both page-level and PDF-level exclusions
+        has_page_exclusions = bool(self._exclusions)
+        has_pdf_exclusions = (
+            hasattr(self, "_parent")
+            and self._parent
+            and hasattr(self._parent, "_exclusions")
+            and bool(self._parent._exclusions)
+        )
+        if not has_page_exclusions and not has_pdf_exclusions:
             if debug_exclusions:
                 print(
                     f"Page {self.index}: No exclusions defined, returning all {len(elements)} elements."
@@ -865,9 +913,15 @@ class Page(
         )
         # Collect element-based exclusions
-        excluded_elements = set()  # Use set for O(1) lookup
+        # Store element bboxes for comparison instead of object ids
+        excluded_element_bboxes = set()  # Use set for O(1) lookup
+        # Process both page-level and PDF-level exclusions
+        all_exclusions = list(self._exclusions) if has_page_exclusions else []
+        if has_pdf_exclusions:
+            all_exclusions.extend(self._parent._exclusions)
-        for exclusion_data in self._exclusions:
+        for exclusion_data in all_exclusions:
             # Handle both old format (2-tuple) and new format (3-tuple)
             if len(exclusion_data) == 2:
                 exclusion_item, label = exclusion_data
@@ -883,16 +937,31 @@ class Page(
             if isinstance(exclusion_item, Region):
                 continue
+            # Handle string selectors for element-based exclusions
+            if isinstance(exclusion_item, str) and method == "element":
+                selector_str = exclusion_item
+                matching_elements = self.find_all(selector_str, apply_exclusions=False)
+                for el in matching_elements:
+                    if hasattr(el, "bbox"):
+                        bbox = tuple(el.bbox)
+                        excluded_element_bboxes.add(bbox)
+                        if debug_exclusions:
+                            print(
+                                f"  - Added element exclusion from selector '{selector_str}': {bbox}"
+                            )
             # Handle element-based exclusions
-            if method == "element" and hasattr(exclusion_item, "bbox"):
-                excluded_elements.add(id(exclusion_item))
+            elif method == "element" and hasattr(exclusion_item, "bbox"):
+                # Store bbox tuple for comparison
+                bbox = tuple(exclusion_item.bbox)
+                excluded_element_bboxes.add(bbox)
                 if debug_exclusions:
-                    print(f"  - Added element exclusion: {exclusion_item}")
+                    print(f"  - Added element exclusion with bbox {bbox}: {exclusion_item}")
         if debug_exclusions:
             print(
                 f"Page {self.index}: Applying {len(exclusion_regions)} region exclusions "
-                f"and {len(excluded_elements)} element exclusions to {len(elements)} elements."
+                f"and {len(excluded_element_bboxes)} element exclusions to {len(elements)} elements."
             )
         filtered_elements = []
@@ -903,7 +972,7 @@ class Page(
             exclude = False
             # Check element-based exclusions first (faster)
-            if id(element) in excluded_elements:
+            if hasattr(element, "bbox") and tuple(element.bbox) in excluded_element_bboxes:
                 exclude = True
                 element_excluded_count += 1
                 if debug_exclusions:
@@ -2487,10 +2556,23 @@ class Page(
         return self
     def get_section_between(
-        self, start_element=None, end_element=None, include_boundaries="both"
+        self,
+        start_element=None,
+        end_element=None,
+        include_boundaries="both",
+        orientation="vertical",
     ) -> Optional["Region"]:  # Return Optional
         """
         Get a section between two elements on this page.
+        Args:
+            start_element: Element marking the start of the section
+            end_element: Element marking the end of the section
+            include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
+            orientation: 'vertical' (default) or 'horizontal' - determines section direction
+        Returns:
+            Region representing the section
         """
         # Create a full-page region to operate within
         page_region = self.create_region(0, 0, self.width, self.height)
@@ -2501,6 +2583,7 @@ class Page(
                 start_element=start_element,
                 end_element=end_element,
                 include_boundaries=include_boundaries,
+                orientation=orientation,
             )
         except Exception as e:
             logger.error(
@@ -2575,10 +2658,23 @@ class Page(
         if include_boundaries not in valid_inclusions:
             raise ValueError(f"include_boundaries must be one of {valid_inclusions}")
-        if not start_elements:
-            # Return an empty ElementCollection if no start elements
+        if not start_elements and not end_elements:
+            # Return an empty ElementCollection if no boundary elements at all
             return ElementCollection([])
+        # If we only have end elements, create implicit start elements
+        if not start_elements and end_elements:
+            # Delegate to PageCollection implementation for consistency
+            from natural_pdf.core.page_collection import PageCollection
+            pages = PageCollection([self])
+            return pages.get_sections(
+                start_elements=start_elements,
+                end_elements=end_elements,
+                include_boundaries=include_boundaries,
+                orientation=orientation,
+            )
         # Combine start and end elements with their type
         all_boundaries = []
         for el in start_elements:

{natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/core/page_collection.py RENAMED Viewed

@@ -537,10 +537,14 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
             first_page = self.pages[0]
             first_start = Region(first_page, (0, 0, first_page.width, 1))
             first_start.is_implicit_start = True
+            # Don't mark this as created from any end element, so it can pair with any end
             start_elements.append(first_start)
             # For each end element (except the last), add an implicit start after it
-            sorted_end_elements = sorted(end_elements, key=lambda e: (e.page.index, e.top, e.x0))
+            # Sort by page, then top, then bottom (for elements with same top), then x0
+            sorted_end_elements = sorted(
+                end_elements, key=lambda e: (e.page.index, e.top, e.bottom, e.x0)
+            )
             for i, end_elem in enumerate(sorted_end_elements[:-1]):  # Exclude last end element
                 # Create implicit start element right after this end element
                 implicit_start = Region(
@@ -838,29 +842,47 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
                 # Create a section from current_start to just before this boundary
                 start_element = current_start["element"]
-                # Find the last element before this boundary on the same page
+                # Create section from current start to just before this new start
                 if start_element.page == boundary["element"].page:
-                    # Find elements on this page
-                    page_elements = [e for e in all_elements if e.page == start_element.page]
-                    # Sort by position based on orientation
+                    from natural_pdf.elements.region import Region
+                    next_start = boundary["element"]
+                    # Create section based on orientation
                     if orientation == "vertical":
-                        page_elements.sort(key=lambda e: (e.top, e.x0))
+                        # Determine vertical bounds
+                        if include_boundaries in ["start", "both"]:
+                            top = start_element.top
+                        else:
+                            top = start_element.bottom
+                        # The section ends just before the next start
+                        bottom = next_start.top
+                        # Create the section with full page width
+                        if top < bottom:
+                            section = Region(
+                                start_element.page, (0, top, start_element.page.width, bottom)
+                            )
+                            section.start_element = start_element
+                            sections.append(section)
                     else:  # horizontal
-                        page_elements.sort(key=lambda e: (e.x0, e.top))
+                        # Determine horizontal bounds
+                        if include_boundaries in ["start", "both"]:
+                            left = start_element.x0
+                        else:
+                            left = start_element.x1
-                    # Find the last element before the boundary
-                    end_idx = (
-                        page_elements.index(boundary["element"]) - 1
-                        if boundary["element"] in page_elements
-                        else -1
-                    )
-                    end_element = page_elements[end_idx] if end_idx >= 0 else None
+                        # The section ends just before the next start
+                        right = next_start.x0
-                    # Create the section
-                    section = start_element.page.get_section_between(
-                        start_element, end_element, include_boundaries, orientation
-                    )
-                    sections.append(section)
+                        # Create the section with full page height
+                        if left < right:
+                            section = Region(
+                                start_element.page, (left, 0, right, start_element.page.height)
+                            )
+                            section.start_element = start_element
+                            sections.append(section)
                 else:
                     # Cross-page section - create from current_start to the end of its page
                     from natural_pdf.elements.region import Region

{natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/core/pdf.py RENAMED Viewed

@@ -252,6 +252,16 @@ class _LazyPageList(Sequence):
                         logger.warning(f"Failed to apply region to page {cached.number}: {e}")
             self._cache[index] = cached
+            # Also cache in the parent PDF's main page list if this is a slice
+            if (
+                hasattr(self._parent_pdf, "_pages")
+                and hasattr(self._parent_pdf._pages, "_cache")
+                and actual_page_index < len(self._parent_pdf._pages._cache)
+                and self._parent_pdf._pages._cache[actual_page_index] is None
+            ):
+                self._parent_pdf._pages._cache[actual_page_index] = cached
         return cached
     # Sequence protocol ---------------------------------------------------
@@ -720,26 +730,16 @@ class PDF(
             # Store for bookkeeping and lazy application
             self._exclusions.append((exclusion_func, label))
-            # Apply only to already-created (cached) pages to avoid forcing page creation
-            for i in range(len(self._pages)):
-                if self._pages._cache[i] is not None:  # Only apply to existing pages
-                    try:
-                        self._pages._cache[i].add_exclusion(exclusion_func, label=label)
-                    except Exception as e:
-                        logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
+            # Don't modify already-cached pages - they will get PDF-level exclusions
+            # dynamically through _get_exclusion_regions()
             return self
         # Fallback to original callable / Region behaviour ------------------
         exclusion_data = (exclusion_func, label)
         self._exclusions.append(exclusion_data)
-        # Apply only to already-created (cached) pages to avoid forcing page creation
-        for i in range(len(self._pages)):
-            if self._pages._cache[i] is not None:  # Only apply to existing pages
-                try:
-                    self._pages._cache[i].add_exclusion(exclusion_func, label=label)
-                except Exception as e:
-                    logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
+        # Don't modify already-cached pages - they will get PDF-level exclusions
+        # dynamically through _get_exclusion_regions()
         return self

{natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/elements/element_collection.py RENAMED Viewed

@@ -621,6 +621,7 @@ class ElementCollection(
     def extract_text(
         self,
+        separator: str = " ",
         preserve_whitespace: bool = True,
         use_exclusions: bool = True,
         strip: Optional[bool] = None,
@@ -632,6 +633,9 @@ class ElementCollection(
         pdfplumber's layout engine if layout=True is specified.
         Args:
+            separator: String to insert between text from different elements when
+                      using simple joining (layout=False). Default is a single space.
+                      Ignored when layout=True as the layout engine handles spacing.
             preserve_whitespace: Deprecated. Use layout=False for simple joining.
             use_exclusions: Deprecated. Exclusions should be applied *before* creating
                           the collection or by filtering the collection itself.
@@ -668,7 +672,7 @@ class ElementCollection(
             logger.warning(
                 "ElementCollection.extract_text: No character dictionaries found in TextElements."
             )
-            return " ".join(
+            return separator.join(
                 getattr(el, "text", "") for el in text_elements
             )  # Fallback to simple join of word text
@@ -733,18 +737,33 @@ class ElementCollection(
                 all_char_dicts.sort(
                     key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
                 )
-                result = " ".join(c.get("text", "") for c in all_char_dicts)
+                result = separator.join(c.get("text", "") for c in all_char_dicts)
         else:
             # Default: Simple join without layout
             logger.debug("ElementCollection.extract_text: Using simple join (layout=False).")
-            # Sort chars by document order (page, top, x0)
-            all_char_dicts.sort(
-                key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
+            # Instead of joining all characters individually, we need to:
+            # 1. Extract text from each element
+            # 2. Join the element texts with the separator
+            # Sort elements by document order (page, top, x0)
+            sorted_elements = sorted(
+                text_elements,
+                key=lambda el: (
+                    el.page.index if hasattr(el, "page") else 0,
+                    el.top if hasattr(el, "top") else 0,
+                    el.x0 if hasattr(el, "x0") else 0,
+                ),
             )
-            # Simple join of character text
-            result = "".join(c.get("text", "") for c in all_char_dicts)
-            # Replace multiple spaces created by joining possibly overlapping chars? Maybe not necessary.
+            # Extract text from each element
+            element_texts = []
+            for el in sorted_elements:
+                if hasattr(el, "text") and el.text:
+                    element_texts.append(el.text)
+            result = separator.join(element_texts)
         # Determine final strip flag – same rule as global helper unless caller overrides
         strip_text = strip if strip is not None else (not use_layout)

{natural_pdf-0.2.6 → natural_pdf-0.2.8/natural_pdf.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: natural-pdf
-Version: 0.2.6
+Version: 0.2.8
 Summary: A more intuitive interface for working with PDFs
 Author-email: Jonathan Soma <jonathan.soma@gmail.com>
 License-Expression: MIT

{natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_color_hex_display.py RENAMED Viewed

@@ -114,8 +114,9 @@ class TestGroupByColorDisplay:
         colors = [(1, 0, 0), (0, 1, 0), (0, 0, 1)]
         for i, color in enumerate(colors):
             page = MagicMock()
-            # Create a closure to capture the correct color
-            page.find.return_value = MagicMock(extract_text=lambda c=color: str(c))
+            # PageGroupBy groups by the text content of the element found
+            # So we need to return the color tuple as the extracted text
+            page.find.return_value = MagicMock(extract_text=lambda c=color: c)
             mock_pages.append(page)
         collection = PageCollection(mock_pages)
@@ -141,7 +142,7 @@ class TestGroupByColorDisplay:
         colors = [(255, 0, 0), (0, 255, 0)]
         for color in colors:
             page = MagicMock()
-            page.find.return_value = MagicMock(extract_text=lambda c=color: str(c))
+            page.find.return_value = MagicMock(extract_text=lambda c=color: c)
             mock_pages.append(page)
         collection = PageCollection(mock_pages)

{natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_fix_get_sections_zero_height.py RENAMED Viewed

@@ -68,7 +68,8 @@ def test_edge_case_single_end_element():
         print(f"\nSingle end element: bottom={end_elem.bottom}")
         # Create sections with single end element
-        sections = page.get_sections(end_elements=[end_elem])
+        # When using only end elements, we typically want to include the end boundary
+        sections = page.get_sections(end_elements=[end_elem], include_boundaries="end")
         print(f"Sections created: {len(sections)}")
@@ -80,7 +81,8 @@ def test_edge_case_single_end_element():
         print(f"Expected height: {end_elem.bottom}")
         # Height should be approximately end_elem.bottom (from top of page)
-        assert abs(section.height - end_elem.bottom) < 1.0
+        # Allow for small rounding differences
+        assert abs(section.height - end_elem.bottom) <= 1.0
 def test_mixed_start_end_elements():

{natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_get_sections_fix_comprehensive.py RENAMED Viewed

@@ -115,13 +115,16 @@ def test_implicit_start_not_paired_with_source_end():
         print(f"\nSections created: {len(sections)}")
-        # The first section should go from top of page to first end
-        # The second section should go from first end to second end
+        # With default include_boundaries="start", sections exclude the end boundary
+        # So the first section should go from top of page to TOP of first end element
         # There should NOT be a zero-height section at first end
+        # Sort end elements like the implementation does
+        sorted_ends = sorted(end_elements, key=lambda e: (e.page.index, e.top, e.bottom, e.x0))
         expected_sections = [
-            (0, end_elements[0].bottom),  # Top to first end
-            (end_elements[0].bottom, end_elements[1].bottom),  # First end to second end
+            (0, sorted_ends[0].top),  # Top to TOP of first sorted end (exclude end boundary)
+            # Second section continues from there - we don't check its end
         ]
         for i, section in enumerate(sections):

{natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_guides_extract_table.py RENAMED Viewed

@@ -96,6 +96,7 @@ def test_guides_extract_table_with_parameters():
             cell_extraction_func=None,
             show_progress=False,
             content_filter=None,
+            apply_exclusions=True,
         )

{natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_guides_extract_table_collections.py RENAMED Viewed

@@ -77,13 +77,13 @@ def test_extract_table_collection_header_options():
     # Test header=None
     result2 = guide.extract_table(pages, header=None)
-    df2 = result2.to_df()
+    df2 = result2.to_df(header=None)  # Need to pass header=None to to_df as well
     assert isinstance(df2.columns[0], int)  # Should use numeric indices
     # Test custom headers
     custom_headers = ["A", "B", "C", "D", "E", "F", "G", "H"]
     result3 = guide.extract_table(pages, header=custom_headers)
-    df3 = result3.to_df()
+    df3 = result3.to_df(header=custom_headers)  # Pass custom headers to to_df
     assert list(df3.columns) == custom_headers

{natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_include_boundaries_mock.py RENAMED Viewed

@@ -40,18 +40,19 @@ def test_get_sections_include_boundaries():
     page.pdf = pdf
     # Create mock elements on the page
-    # Header at top of page
-    header_element = create_mock_element(page, "Section 1", top=700, bottom=720)
+    # In PDF coordinates, top of page has higher Y value
+    # Header at top of page (high Y value)
+    header_element = create_mock_element(page, "Section 1", top=100, bottom=120)
     # Content in middle
     content_elements = [
-        create_mock_element(page, "Content line 1", top=650, bottom=670),
-        create_mock_element(page, "Content line 2", top=620, bottom=640),
-        create_mock_element(page, "Content line 3", top=590, bottom=610),
+        create_mock_element(page, "Content line 1", top=150, bottom=170),
+        create_mock_element(page, "Content line 2", top=200, bottom=220),
+        create_mock_element(page, "Content line 3", top=250, bottom=270),
     ]
-    # Next header
-    next_header = create_mock_element(page, "Section 2", top=550, bottom=570)
+    # Next header (lower on page, higher Y value)
+    next_header = create_mock_element(page, "Section 2", top=300, bottom=320)
     # Set up the page's element finding
     all_elements = [header_element] + content_elements + [next_header]
@@ -63,24 +64,38 @@ def test_get_sections_include_boundaries():
     page.find_all = mock_find_all
+    # Mock get_elements to return all elements
+    page.get_elements = Mock(return_value=all_elements)
     # Mock get_section_between to return regions with correct boundaries
-    def mock_get_section_between(start, end, include_boundaries="both"):
+    def mock_get_section_between(start, end, include_boundaries="both", orientation="vertical"):
+        # Ensure start and end are in the right order
+        # In this test setup, start should come before end (lower top value)
+        if not end:
+            end_top = page.height
+            end_bottom = page.height
+        else:
+            end_top = end.top
+            end_bottom = end.bottom
         if include_boundaries == "both":
             top = start.top
-            bottom = end.bottom if end else page.height
+            bottom = end_bottom
         elif include_boundaries == "start":
             top = start.top
-            bottom = end.top if end else page.height
+            bottom = end_top
         elif include_boundaries == "end":
             top = start.bottom
-            bottom = end.bottom if end else page.height
+            bottom = end_bottom
         else:  # none
             top = start.bottom
-            bottom = end.top if end else page.height
+            bottom = end_top
+        # Ensure top < bottom for valid region
+        if top > bottom:
+            top, bottom = bottom, top
         region = Region(page, (0, top, page.width, bottom))
-        # Store which elements would be in this region
-        region._included_elements = [e for e in all_elements if e.top >= bottom and e.bottom <= top]
         return region
     page.get_section_between = mock_get_section_between
@@ -106,37 +121,33 @@ def test_get_sections_include_boundaries():
     for boundaries in ["both", "start", "end", "none"]:
         sections = collection.get_sections("text:contains(Section)", include_boundaries=boundaries)
+        print(f"\ninclude_boundaries='{boundaries}':")
+        print(f"  Number of sections: {len(sections)}")
         if len(sections) > 0:
             section = sections[0]
-            print(f"\ninclude_boundaries='{boundaries}':")
             print(f"  Section bbox: {section.bbox}")
             print(f"  Top: {section.bbox[1]}, Bottom: {section.bbox[3]}")
-            # Verify boundaries are correct
-            if boundaries == "both":
+            # When we have only start elements, sections go from start to next start
+            # The section always ends at the TOP of the next start element
+            # include_boundaries only affects whether we include the START element
+            if boundaries == "both" or boundaries == "start":
+                # Should include the start element
                 assert (
                     section.bbox[1] == header_element.top
-                ), f"'both' should include start element top"
+                ), f"'{boundaries}' should start at first element top"
                 assert (
-                    section.bbox[3] == next_header.bottom
-                ), f"'both' should include end element bottom"
-            elif boundaries == "start":
-                assert (
-                    section.bbox[1] == header_element.top
-                ), f"'start' should include start element top"
-                assert section.bbox[3] == next_header.top, f"'start' should exclude end element"
-            elif boundaries == "end":
+                    section.bbox[3] == next_header.top
+                ), f"Section should always end at next element top"
+            else:  # "end" or "none"
+                # Should exclude the start element
                 assert (
                     section.bbox[1] == header_element.bottom
-                ), f"'end' should exclude start element"
+                ), f"'{boundaries}' should start after first element"
                 assert (
-                    section.bbox[3] == next_header.bottom
-                ), f"'end' should include end element bottom"
-            else:  # none
-                assert (
-                    section.bbox[1] == header_element.bottom
-                ), f"'none' should exclude start element"
-                assert section.bbox[3] == next_header.top, f"'none' should exclude end element"
+                    section.bbox[3] == next_header.top
+                ), f"Section should always end at next element top"
     print("\n✅ All mock tests passed! include_boundaries parameter is working correctly.")

natural-pdf 0.2.6__tar.gz → 0.2.8__tar.gz

natural-pdf 0.2.6__tar.gz → 0.2.8__tar.gz