natural-pdf 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
natural_pdf/analyzers/guides.py CHANGED
@@ -143,7 +143,7 @@ class GuidesList(UserList):
 
     def from_content(
         self,
-        markers: Union[str, List[str], "ElementCollection", None],
+        markers: Union[str, List[str], "ElementCollection", Callable, None],
         obj: Optional[Union["Page", "Region", "FlowRegion"]] = None,
         align: Literal["left", "right", "center", "between"] = "left",
        outer: bool = True,
@@ -160,6 +160,7 @@ class GuidesList(UserList):
                 - str: single selector (e.g., 'text:contains("Name")') or literal text
                 - List[str]: list of selectors or literal text strings
                 - ElementCollection: collection of elements to extract text from
+                - Callable: function that takes a page and returns markers
                 - None: no markers
             obj: Page/Region/FlowRegion to search (uses parent's context if None)
             align: How to align guides relative to found elements
@@ -174,13 +175,22 @@ class GuidesList(UserList):
         if target_obj is None:
             raise ValueError("No object provided and no context available")
 
+        # Store callable markers for later evaluation
+        if callable(markers):
+            self._callable = markers
+            # For now, evaluate with the current target object to get initial guides
+            actual_markers = markers(target_obj)
+        else:
+            self._callable = None
+            actual_markers = markers
+
         # Check if parent is in flow mode
         if self._parent.is_flow_region:
             # Create guides across all constituent regions
             all_guides = []
             for region in self._parent.context.constituent_regions:
                 # Normalize markers for this region
-                marker_texts = _normalize_markers(markers, region)
+                marker_texts = _normalize_markers(actual_markers, region)
 
                 # Create guides for this region
                 region_guides = Guides.from_content(
@@ -263,7 +273,7 @@ class GuidesList(UserList):
 
         # Original single-region logic
         # Normalize markers to list of text strings
-        marker_texts = _normalize_markers(markers, target_obj)
+        marker_texts = _normalize_markers(actual_markers, target_obj)
 
         # Create guides for this axis
         new_guides = Guides.from_content(
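Taken together, the hunks above let `from_content` accept a callable that is stored on the guide and re-evaluated per target. A minimal sketch, assuming an open `natural_pdf.PDF`; the file name and marker values are illustrative, while the selector syntax matches the docstring examples elsewhere in this diff:

```python
from natural_pdf import PDF
from natural_pdf.analyzers.guides import Guides

pdf = PDF("report.pdf")  # illustrative input file
guide = Guides(pdf.pages[0])

# Static markers: normalized once against the page supplied here
guide.vertical.from_content(["Name", "Amount", "Date"])

# Callable markers: kept in GuidesList._callable and re-evaluated against
# each page or region the guide is later applied to
guide.horizontal.from_content(lambda p: p.find_all('text:starts-with(NF-)'))
```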
@@ -1541,11 +1551,15 @@ class Guides:
         # Add outer guides if requested
         if outer and bounds:
             if axis == "vertical":
-                guides_coords.insert(0, bounds[0])  # x0
-                guides_coords.append(bounds[2])  # x1
+                if outer == True or outer == "first":
+                    guides_coords.insert(0, bounds[0])  # x0
+                if outer == True or outer == "last":
+                    guides_coords.append(bounds[2])  # x1
             else:
-                guides_coords.insert(0, bounds[1])  # y0
-                guides_coords.append(bounds[3])  # y1
+                if outer == True or outer == "first":
+                    guides_coords.insert(0, bounds[1])  # y0
+                if outer == True or outer == "last":
+                    guides_coords.append(bounds[3])  # y1
 
         # Remove duplicates and sort
         guides_coords = sorted(list(set(guides_coords)))
@@ -3302,7 +3316,7 @@ class Guides:
         markers: Union[str, List[str], "ElementCollection", None] = None,
         obj: Optional[Union["Page", "Region"]] = None,
         align: Literal["left", "right", "center", "between"] = "left",
-        outer: bool = True,
+        outer: Union[str, bool] = True,
         tolerance: float = 5,
         apply_exclusions: bool = True,
     ) -> "Guides":
@@ -3319,7 +3333,10 @@ class Guides:
                 - None: no markers
             obj: Page or Region to search (uses self.context if None)
             align: How to align guides relative to found elements
-            outer: Whether to add outer boundary guides
+            outer: Whether to add outer boundary guides. Can be:
+                - bool: True/False to add/not add both
+                - "first": add a boundary before the first element
+                - "last": add a boundary after the last element
             tolerance: Tolerance for snapping to element edges
             apply_exclusions: Whether to apply exclusion zones when searching for text
 
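A short sketch of the widened `outer` parameter; the column labels are illustrative, and the `outer="last"` form matches the docstring example later in this diff:

```python
guide = Guides(page)

# True (default): insert both bounding guides (x0 and x1 for vertical)
guide.vertical.from_content(["Name", "Amount"], outer=True)

# "first": only the leading boundary (guides_coords.insert(0, bounds[0]))
guide.vertical.from_content(["Name", "Amount"], outer="first")

# "last": only the trailing boundary (guides_coords.append(bounds[2]))
guide.vertical.from_content(["Name", "Amount"], outer="last")
```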
@@ -3457,6 +3474,7 @@ class Guides:
         cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
         show_progress: bool = False,
         content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None,
+        apply_exclusions: bool = True,
         *,
         multi_page: Literal["auto", True, False] = "auto",
     ) -> "TableResult":
@@ -3482,6 +3500,7 @@ class Guides:
             cell_extraction_func: Optional callable for custom cell text extraction
             show_progress: Controls progress bar for text method
             content_filter: Content filtering function or patterns
+            apply_exclusions: Whether to apply exclusion regions during text extraction (default: True)
             multi_page: Controls multi-region table creation for FlowRegions
 
         Returns:
@@ -3552,6 +3571,7 @@ class Guides:
             cell_extraction_func=cell_extraction_func,
             show_progress=show_progress,
             content_filter=content_filter,
+            apply_exclusions=apply_exclusions,
         )
 
         return table_result
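A hedged sketch of the new `apply_exclusions` flag on `extract_table`, assuming `page` already has header/footer exclusions registered; the marker values and selector are illustrative:

```python
guide = Guides(page)
guide.vertical.from_content(["Name", "Amount"])
guide.horizontal.from_content('text:contains("Total")')

filtered = guide.extract_table()                   # default: excluded text omitted
raw = guide.extract_table(apply_exclusions=False)  # excluded text kept in cells
```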
@@ -3577,6 +3597,162 @@ class Guides:
         except Exception as cleanup_err:
             logger.warning(f"Failed to clean up temporary regions: {cleanup_err}")
 
+    def extract_table_from_pages(
+        self,
+        pages: Union["PageCollection", List["Page"]],
+        header: Union[str, List[str], None] = "first",
+        skip_repeating_headers: Optional[bool] = None,
+        method: Optional[str] = None,
+        table_settings: Optional[dict] = None,
+        use_ocr: bool = False,
+        ocr_config: Optional[dict] = None,
+        text_options: Optional[Dict] = None,
+        cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
+        show_progress: bool = True,
+        content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None,
+        apply_exclusions: bool = True,
+    ) -> "TableResult":
+        """
+        Extract tables from multiple pages using this guide pattern.
+
+        This method applies the guide to each page, extracts tables, and combines
+        them into a single TableResult. Dynamic guides (using lambdas) are evaluated
+        for each page.
+
+        Args:
+            pages: PageCollection or list of Pages to extract from
+            header: How to handle headers:
+                - "first": Use first row of first page as headers (default)
+                - "all": Expect headers on each page, use from first page
+                - None: No headers, use numeric indices
+                - List[str]: Custom column names
+            skip_repeating_headers: Whether to remove duplicate header rows.
+                Defaults to True when header is "first" or "all", False otherwise.
+            method: Table extraction method (passed to extract_table)
+            table_settings: Settings for pdfplumber table extraction
+            use_ocr: Whether to use OCR for text extraction
+            ocr_config: OCR configuration parameters
+            text_options: Dictionary of options for the 'text' method
+            cell_extraction_func: Optional callable for custom cell text extraction
+            show_progress: Show progress bar for multi-page extraction (default: True)
+            content_filter: Content filtering function or patterns
+            apply_exclusions: Whether to apply exclusion regions during extraction
+
+        Returns:
+            TableResult: Combined table data from all pages
+
+        Example:
+            ```python
+            # Create guide with static vertical, dynamic horizontal
+            guide = Guides(pages[0])
+            guide.vertical.from_content(columns, outer="last")
+            guide.horizontal.from_content(lambda p: p.find_all('text:starts-with(NF-)'))
+
+            # Extract from all pages
+            table_result = guide.extract_table_from_pages(pages, header=columns)
+            df = table_result.to_df()
+            ```
+        """
+        from natural_pdf.core.page_collection import PageCollection
+        from natural_pdf.tables.result import TableResult
+
+        # Convert to list if it's a PageCollection
+        if isinstance(pages, PageCollection):
+            page_list = list(pages)
+        else:
+            page_list = pages
+
+        if not page_list:
+            return TableResult([])
+
+        # Determine header handling
+        if skip_repeating_headers is None:
+            skip_repeating_headers = header in ["first", "all"] or isinstance(header, list)
+
+        all_rows = []
+        header_row = None
+
+        # Configure progress bar
+        iterator = page_list
+        if show_progress and len(page_list) > 1:
+            try:
+                from tqdm.auto import tqdm
+
+                iterator = tqdm(page_list, desc="Extracting tables from pages", unit="page")
+            except ImportError:
+                pass
+
+        for i, page in enumerate(iterator):
+            # Create a new Guides object for this page
+            page_guide = Guides(page)
+
+            # Copy vertical guides (usually static)
+            if hasattr(self.vertical, "_callable") and self.vertical._callable is not None:
+                # If vertical is dynamic (lambda), evaluate it
+                page_guide.vertical.from_content(self.vertical._callable(page))
+            else:
+                # Copy static vertical positions
+                page_guide.vertical.data = self.vertical.data.copy()
+
+            # Handle horizontal guides
+            if hasattr(self.horizontal, "_callable") and self.horizontal._callable is not None:
+                # If horizontal is dynamic (lambda), evaluate it
+                page_guide.horizontal.from_content(self.horizontal._callable(page))
+            else:
+                # Copy static horizontal positions
+                page_guide.horizontal.data = self.horizontal.data.copy()
+
+            # Extract table from this page
+            table_result = page_guide.extract_table(
+                method=method,
+                table_settings=table_settings,
+                use_ocr=use_ocr,
+                ocr_config=ocr_config,
+                text_options=text_options,
+                cell_extraction_func=cell_extraction_func,
+                show_progress=False,  # Don't show nested progress
+                content_filter=content_filter,
+                apply_exclusions=apply_exclusions,
+            )
+
+            # Convert to list of rows
+            rows = list(table_result)
+
+            # Handle headers based on strategy
+            if i == 0:  # First page
+                if header == "first" or header == "all":
+                    # Use first row as header
+                    if rows:
+                        header_row = rows[0]
+                        rows = rows[1:]  # Remove header from data
+                elif isinstance(header, list):
+                    # Custom headers provided
+                    header_row = header
+            else:  # Subsequent pages
+                if header == "all" and skip_repeating_headers and rows:
+                    # Expect and remove header row
+                    if rows and header_row and rows[0] == header_row:
+                        rows = rows[1:]
+                    elif rows:
+                        # Still remove first row if it looks like a header
+                        rows = rows[1:]
+
+            # Add rows to combined result
+            all_rows.extend(rows)
+
+        # Create final TableResult
+        if isinstance(header, list):
+            # Custom headers - prepend to data
+            final_result = TableResult(all_rows)
+        elif header_row is not None:
+            # Prepend discovered header
+            final_result = TableResult([header_row] + all_rows)
+        else:
+            # No headers
+            final_result = TableResult(all_rows)
+
+        return final_result
+
     def _get_flow_orientation(self) -> Literal["vertical", "horizontal", "unknown"]:
         """Determines if a FlowRegion's constituent parts are arranged vertically or horizontally."""
         if not self.is_flow_region or len(self.context.constituent_regions) < 2:
natural_pdf/core/element_manager.py CHANGED
@@ -939,6 +939,11 @@ class ElementManager:
         self.load_elements()
         return self._elements.get("chars", [])
 
+    def invalidate_cache(self):
+        """Invalidate the cached elements, forcing a reload on next access."""
+        self._elements = None
+        logger.debug(f"Page {self._page.number}: ElementManager cache invalidated")
+
     @property
     def words(self):
         """Get all word elements."""
natural_pdf/core/page.py CHANGED
@@ -494,6 +494,9 @@ class Page(
                     exc_info=False,
                 )
                 raise
+            # Invalidate ElementManager cache since exclusions affect element filtering
+            if hasattr(self, "_element_mgr") and self._element_mgr:
+                self._element_mgr.invalidate_cache()
             return self  # Completed processing for selector input
 
         # ElementCollection -----------------------------------------------
@@ -526,6 +529,9 @@ class Page(
                     exc_info=False,
                 )
                 raise
+            # Invalidate ElementManager cache since exclusions affect element filtering
+            if hasattr(self, "_element_mgr") and self._element_mgr:
+                self._element_mgr.invalidate_cache()
             return self  # Completed processing for ElementCollection input
 
         # ------------------------------------------------------------------
@@ -618,6 +624,9 @@ class Page(
                         f"Page {self.index}: Failed to convert list item to Region: {e}"
                     )
                     continue
+            # Invalidate ElementManager cache since exclusions affect element filtering
+            if hasattr(self, "_element_mgr") and self._element_mgr:
+                self._element_mgr.invalidate_cache()
             return self
         else:
             # Reject invalid types
@@ -629,6 +638,10 @@ class Page(
         if exclusion_data:
             self._exclusions.append(exclusion_data)
 
+        # Invalidate ElementManager cache since exclusions affect element filtering
+        if hasattr(self, "_element_mgr") and self._element_mgr:
+            self._element_mgr.invalidate_cache()
+
         return self
 
     def add_region(self, region: "Region", name: Optional[str] = None) -> "Page":
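A sketch of why each `add_exclusion` path now invalidates the cache; `.words` and `.region()` follow natural-pdf's public API, and the coordinates are illustrative:

```python
words_before = page.words  # first access populates the ElementManager cache
page.add_exclusion(page.region(0, 0, page.width, 50))  # e.g., a header band
# add_exclusion() now calls invalidate_cache(), so the next access reloads
words_after = page.words  # reflects the newly added exclusion
```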
@@ -699,10 +712,26 @@ class Page(
         """
         regions = []
 
+        # Combine page-specific exclusions with PDF-level exclusions
+        all_exclusions = list(self._exclusions)  # Start with page-specific
+
+        # Add PDF-level exclusions if we have a parent PDF
+        if hasattr(self, "_parent") and self._parent and hasattr(self._parent, "_exclusions"):
+            for pdf_exclusion in self._parent._exclusions:
+                # Check if this exclusion is already in our list (avoid duplicates)
+                if pdf_exclusion not in all_exclusions:
+                    # Ensure consistent format (PDF exclusions might be 2-tuples, need to be 3-tuples)
+                    if len(pdf_exclusion) == 2:
+                        # Convert to 3-tuple format with default method
+                        pdf_exclusion = (pdf_exclusion[0], pdf_exclusion[1], "region")
+                    all_exclusions.append(pdf_exclusion)
+
         if debug:
-            print(f"\nPage {self.index}: Evaluating {len(self._exclusions)} exclusions")
+            print(
+                f"\nPage {self.index}: Evaluating {len(all_exclusions)} exclusions ({len(self._exclusions)} page-specific, {len(all_exclusions) - len(self._exclusions)} from PDF)"
+            )
 
-        for i, exclusion_data in enumerate(self._exclusions):
+        for i, exclusion_data in enumerate(all_exclusions):
             # Handle both old format (2-tuple) and new format (3-tuple) for backward compatibility
             if len(exclusion_data) == 2:
                 # Old format: (exclusion_item, label)
@@ -1598,7 +1627,14 @@ class Page(
             return ""
 
         # 2. Apply element-based exclusions if enabled
-        if use_exclusions and self._exclusions:
+        # Check both page-level and PDF-level exclusions
+        has_exclusions = bool(self._exclusions) or (
+            hasattr(self, "_parent")
+            and self._parent
+            and hasattr(self._parent, "_exclusions")
+            and self._parent._exclusions
+        )
+        if use_exclusions and has_exclusions:
             # Filter word elements through _filter_elements_by_exclusions
             # This handles both element-based and region-based exclusions
             word_elements = self._filter_elements_by_exclusions(
@@ -1612,7 +1648,7 @@ class Page(
         # 3. Get region-based exclusions for spatial filtering
         apply_exclusions_flag = kwargs.get("use_exclusions", use_exclusions)
         exclusion_regions = []
-        if apply_exclusions_flag and self._exclusions:
+        if apply_exclusions_flag and has_exclusions:
             exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=debug)
             if debug:
                 logger.debug(
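Sketching the interplay this enables: a PDF-level exclusion alone is now enough to trigger filtering in `extract_text`. Callable exclusions are existing natural-pdf API; the region is illustrative:

```python
pdf = PDF("report.pdf")  # illustrative input file
# PDF-level exclusion: a callable applied lazily to each page
pdf.add_exclusion(lambda page: page.region(0, 0, page.width, 50))

# has_exclusions is now True even when page._exclusions itself is empty,
# so the header band is filtered out of the extracted text
text = pdf.pages[0].extract_text()
```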
natural_pdf/core/pdf.py CHANGED
@@ -173,11 +173,26 @@ class _LazyPageList(Sequence):
         """Create and cache a page at the given index within this list."""
         cached = self._cache[index]
         if cached is None:
+            # Get the actual page index in the full PDF
+            actual_page_index = self._indices[index]
+
+            # First check if this page is already cached in the parent PDF's main page list
+            if (
+                hasattr(self._parent_pdf, "_pages")
+                and hasattr(self._parent_pdf._pages, "_cache")
+                and actual_page_index < len(self._parent_pdf._pages._cache)
+                and self._parent_pdf._pages._cache[actual_page_index] is not None
+            ):
+                # Reuse the already-cached page from the parent PDF
+                # This ensures we get any exclusions that were already applied
+                cached = self._parent_pdf._pages._cache[actual_page_index]
+                self._cache[index] = cached
+                return cached
+
             # Import here to avoid circular import problems
             from natural_pdf.core.page import Page
 
-            # Get the actual page index in the full PDF
-            actual_page_index = self._indices[index]
+            # Create new page
             plumber_page = self._plumber_pdf.pages[actual_page_index]
             cached = Page(
                 plumber_page,
@@ -196,6 +211,30 @@ class _LazyPageList(Sequence):
             except Exception as e:
                 logger.warning(f"Failed to apply exclusion to page {cached.number}: {e}")
 
+            # Check if the parent PDF already has a cached page with page-specific exclusions
+            if hasattr(self._parent_pdf, "_pages") and hasattr(self._parent_pdf._pages, "_cache"):
+                parent_cache = self._parent_pdf._pages._cache
+                if (
+                    actual_page_index < len(parent_cache)
+                    and parent_cache[actual_page_index] is not None
+                ):
+                    existing_page = parent_cache[actual_page_index]
+                    # Copy over any page-specific exclusions from the existing page
+                    # Only copy non-callable exclusions (regions/elements) to avoid duplicating PDF-level exclusions
+                    if hasattr(existing_page, "_exclusions") and existing_page._exclusions:
+                        for exclusion_data in existing_page._exclusions:
+                            exclusion_item = exclusion_data[0]
+                            # Skip callable exclusions as they're PDF-level and already applied above
+                            if not callable(exclusion_item):
+                                try:
+                                    cached.add_exclusion(
+                                        *exclusion_data[:2]
+                                    )  # exclusion_item and label
+                                except Exception as e:
+                                    logger.warning(
+                                        f"Failed to copy page-specific exclusion to page {cached.number}: {e}"
+                                    )
+
             # Apply any stored regions to the newly created page
             if hasattr(self._parent_pdf, "_regions"):
                 for region_data in self._parent_pdf._regions:
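A sketch of the behavior these two cache checks target, assuming slicing `pdf.pages` yields a second `_LazyPageList` over the same document (as `_indices` implies); the exclusion is illustrative:

```python
pdf = PDF("report.pdf")  # illustrative input file
page = pdf.pages[0]  # cached in the PDF's main lazy page list
page.add_exclusion(page.region(0, 0, page.width, 50))  # page-specific

subset = pdf.pages[:1]  # a sliced lazy page list over the same PDF
# The sublist now reuses the already-cached Page object, so the
# page-specific exclusion applied above is not silently dropped
assert subset[0] is page
```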
natural_pdf/elements/region.py CHANGED
@@ -1236,6 +1236,7 @@ class Region(
         content_filter: Optional[
             Union[str, Callable[[str], bool], List[str]]
         ] = None,  # NEW: Content filtering
+        apply_exclusions: bool = True,  # Whether to apply exclusion regions during extraction
     ) -> TableResult:  # Return type allows Optional[str] for cells
         """
         Extract a table from this region.
@@ -1260,6 +1261,8 @@ class Region(
                 - A callable that takes text and returns True to KEEP the character
                 - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
                 Works with all extraction methods by filtering cell content.
+            apply_exclusions: Whether to apply exclusion regions during text extraction (default: True).
+                When True, text within excluded regions (e.g., headers/footers) will not be extracted.
 
         Returns:
             Table data as a list of rows, where each row is a list of cell values (str or None).
@@ -1297,7 +1300,9 @@ class Region(
             )
             return TableResult(
                 self._extract_table_from_cells(
-                    cell_regions_in_table, content_filter=content_filter
+                    cell_regions_in_table,
+                    content_filter=content_filter,
+                    apply_exclusions=apply_exclusions,
                 )
             )
 
@@ -1381,16 +1386,22 @@ class Region(
         # Use the selected method
         if effective_method == "tatr":
             table_rows = self._extract_table_tatr(
-                use_ocr=use_ocr, ocr_config=ocr_config, content_filter=content_filter
+                use_ocr=use_ocr,
+                ocr_config=ocr_config,
+                content_filter=content_filter,
+                apply_exclusions=apply_exclusions,
             )
         elif effective_method == "text":
             current_text_options = text_options.copy()
             current_text_options["cell_extraction_func"] = cell_extraction_func
             current_text_options["show_progress"] = show_progress
             current_text_options["content_filter"] = content_filter
+            current_text_options["apply_exclusions"] = apply_exclusions
             table_rows = self._extract_table_text(**current_text_options)
         elif effective_method == "pdfplumber":
-            table_rows = self._extract_table_plumber(table_settings, content_filter=content_filter)
+            table_rows = self._extract_table_plumber(
+                table_settings, content_filter=content_filter, apply_exclusions=apply_exclusions
+            )
         else:
             raise ValueError(
                 f"Unknown table extraction method: '{method}'. Choose from 'tatr', 'pdfplumber', 'text', 'stream', 'lattice'."
@@ -1604,7 +1615,9 @@ class Region(
         # Return empty list if no tables found
         return []
 
-    def _extract_table_plumber(self, table_settings: dict, content_filter=None) -> List[List[str]]:
+    def _extract_table_plumber(
+        self, table_settings: dict, content_filter=None, apply_exclusions=True
+    ) -> List[List[str]]:
         """
         Extract table using pdfplumber's table extraction.
         This method extracts the largest table within the region.
@@ -1646,7 +1659,7 @@ class Region(
         # -------------------------------------------------------------
         base_plumber_page = self.page._page
 
-        if getattr(self.page, "_exclusions", None):
+        if apply_exclusions and getattr(self.page, "_exclusions", None):
             exclusion_regions = self.page._get_exclusion_regions(include_callable=True)
 
             def _keep_char(obj):
@@ -1701,7 +1714,7 @@ class Region(
             return []
 
     def _extract_table_tatr(
-        self, use_ocr=False, ocr_config=None, content_filter=None
+        self, use_ocr=False, ocr_config=None, content_filter=None, apply_exclusions=True
     ) -> List[List[str]]:
         """
         Extract table using TATR structure detection.
@@ -1789,7 +1802,7 @@ class Region(
                 continue
 
             # Fallback to normal extraction
-            header_text = header.extract_text().strip()
+            header_text = header.extract_text(apply_exclusions=apply_exclusions).strip()
             if content_filter is not None:
                 header_text = self._apply_content_filter_to_text(header_text, content_filter)
             header_texts.append(header_text)
@@ -1824,7 +1837,7 @@ class Region(
                     continue
 
                 # Fallback to normal extraction
-                cell_text = cell_region.extract_text().strip()
+                cell_text = cell_region.extract_text(apply_exclusions=apply_exclusions).strip()
                 if content_filter is not None:
                     cell_text = self._apply_content_filter_to_text(cell_text, content_filter)
                 row_cells.append(cell_text)
@@ -1840,7 +1853,7 @@ class Region(
                 continue
 
             # Fallback to normal extraction
-            row_text = row.extract_text().strip()
+            row_text = row.extract_text(apply_exclusions=apply_exclusions).strip()
             if content_filter is not None:
                 row_text = self._apply_content_filter_to_text(row_text, content_filter)
             row_cells.append(row_text)
@@ -1866,6 +1879,8 @@ class Region(
         show_progress = text_options.pop("show_progress", False)
         # --- Get content_filter option --- #
         content_filter = text_options.pop("content_filter", None)
+        # --- Get apply_exclusions option --- #
+        apply_exclusions = text_options.pop("apply_exclusions", True)
 
         # Analyze structure first (or use cached results)
         if "text_table_structure" in self.analyses:
@@ -1946,7 +1961,9 @@ class Region(
                     cell_value = None
                 else:
                     cell_value = cell_region.extract_text(
-                        layout=False, apply_exclusions=False, content_filter=content_filter
+                        layout=False,
+                        apply_exclusions=apply_exclusions,
+                        content_filter=content_filter,
                     ).strip()
 
                 rounded_top = round(cell_data["top"] / coord_tolerance) * coord_tolerance
@@ -3397,7 +3414,7 @@ class Region(
     # ------------------------------------------------------------------
 
     def _extract_table_from_cells(
-        self, cell_regions: List["Region"], content_filter=None
+        self, cell_regions: List["Region"], content_filter=None, apply_exclusions=True
     ) -> List[List[Optional[str]]]:
         """Construct a table (list-of-lists) from table_cell regions.
 
@@ -3439,7 +3456,9 @@ class Region(
                 r_idx = int(cell.metadata.get("row_index"))
                 c_idx = int(cell.metadata.get("col_index"))
                 text_val = cell.extract_text(
-                    layout=False, apply_exclusions=True, content_filter=content_filter
+                    layout=False,
+                    apply_exclusions=apply_exclusions,
+                    content_filter=content_filter,
                 ).strip()
                 table_grid[r_idx][c_idx] = text_val if text_val else None
             except Exception as _err:
@@ -3488,7 +3507,7 @@ class Region(
                 col_idx = int(np.argmin([abs(cx - cc) for cc in col_centers]))
 
                 text_val = cell.extract_text(
-                    layout=False, apply_exclusions=False, content_filter=content_filter
+                    layout=False, apply_exclusions=apply_exclusions, content_filter=content_filter
                 ).strip()
                 table_grid[row_idx][col_idx] = text_val if text_val else None
 
natural_pdf/tables/result.py CHANGED
@@ -41,7 +41,7 @@ class TableResult(Sequence):
 
     def to_df(
         self,
-        header: Union[str, int, List[int], None] = "first",
+        header: Union[str, int, List[int], List[str], None] = "first",
         index_col=None,
         skip_repeating_headers=None,
         keep_blank: bool = False,
@@ -51,8 +51,8 @@ class TableResult(Sequence):
 
         Parameters
         ----------
-        header : "first" | int | list[int] | None, default "first"
-            • "first" – use row 0 as column names.\n • int – use that row index.\n • list[int] – multi-row header.\n • None/False– no header.
+        header : "first" | int | list[int] | list[str] | None, default "first"
+            • "first" – use row 0 as column names.\n • int – use that row index.\n • list[int] – multi-row header.\n • list[str] – custom column names.\n • None/False– no header.
 
         Note: If the header row has a different number of columns than the
         body rows, the method will automatically fall back to header=None
@@ -84,7 +84,11 @@ class TableResult(Sequence):
 
         # Determine default for skip_repeating_headers based on header parameter
         if skip_repeating_headers is None:
-            skip_repeating_headers = header is not None and header is not False
+            skip_repeating_headers = (
+                header is not None
+                and header is not False
+                and not (isinstance(header, (list, tuple)) and len(header) == 0)
+            )
 
         # Determine header rows and body rows
         body = rows
@@ -97,10 +101,31 @@ class TableResult(Sequence):
         elif isinstance(header, int):
             hdr = rows[header]
             body = rows[:header] + rows[header + 1 :]
-        elif isinstance(header, (list, tuple)):
+        elif isinstance(header, (list, tuple)) and all(isinstance(i, int) for i in header):
+            # List of integers - multi-row header
             hdr_rows = [rows[i] for i in header]
             body = [r for idx, r in enumerate(rows) if idx not in header]
             hdr = hdr_rows
+        elif (
+            isinstance(header, (list, tuple))
+            and len(header) > 0
+            and all(isinstance(i, str) for i in header)
+        ):
+            # List of strings - custom column names
+            hdr = list(header)
+            body = rows
+            # Validate column count matches
+            if body:
+                max_cols = max(len(row) for row in body)
+                if len(hdr) != max_cols:
+                    raise ValueError(
+                        f"Number of column names ({len(hdr)}) must match "
+                        f"number of columns in data ({max_cols})"
+                    )
+        elif isinstance(header, (list, tuple)) and len(header) == 0:
+            # Empty list behaves like None
+            hdr = None
+            body = rows
         else:
             raise ValueError("Invalid value for header parameter")
 
@@ -125,7 +150,12 @@ class TableResult(Sequence):
             pass
 
         # Check for header/body column count mismatch and fallback to no header
-        if hdr is not None and body:
+        if (
+            hdr is not None
+            and body
+            and not (isinstance(header, (list, tuple)) and all(isinstance(i, str) for i in header))
+        ):
+            # Skip this check for custom string headers
             # Get the maximum number of columns from all body rows
             # This handles cases where some rows have different column counts
             max_cols = max(len(row) for row in body) if body else 0
@@ -144,6 +174,9 @@ class TableResult(Sequence):
             hdr = None
             body = self._rows  # Use all rows as body
 
+        # Handle empty list case - pandas needs None not empty list
+        if isinstance(hdr, list) and len(hdr) == 0:
+            hdr = None
         df = pd.DataFrame(body, columns=hdr)
 
         # Convert empty strings to NaN by default
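A sketch of the new `to_df` header forms accepted after this change; the row values are illustrative:

```python
from natural_pdf.tables.result import TableResult

result = TableResult([["Acme", "120"], ["Globex", "95"]])

df = result.to_df(header=["company", "amount"])  # list[str]: custom column names
df_plain = result.to_df(header=[])  # empty list now behaves like header=None
```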
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: natural-pdf
-Version: 0.2.4
+Version: 0.2.5
 Summary: A more intuitive interface for working with PDFs
 Author-email: Jonathan Soma <jonathan.soma@gmail.com>
 License-Expression: MIT
@@ -2,7 +2,7 @@ natural_pdf/__init__.py,sha256=N4pR0LbuPEnUYFZqbdVqc_FGKldgwPQc1wjJhYKTBBM,3417
 natural_pdf/cli.py,sha256=SkPwhhMM-GhLsj3O1n1Agxz4KOxcZ08sj8hVQSFJB5c,4064
 natural_pdf/text_mixin.py,sha256=eFCiHj6Okcw3aum4955BepcI2NPRalkf9UFFVTc_H30,4012
 natural_pdf/analyzers/__init__.py,sha256=3XGoNq3OgiVkZP7tOdeP5XVUl7fDgyztdA8DlOcMLXg,1138
-natural_pdf/analyzers/guides.py,sha256=9FUbxk4XBOyktXgq9q5-bB949JFrzT1kBPikg2ENoIw,150032
+natural_pdf/analyzers/guides.py,sha256=mLWPPEwywo_FbU3gSoegiRlzxYmkHEo2c4DLX9krH9k,157691
 natural_pdf/analyzers/shape_detection_mixin.py,sha256=mgpyJ4jIulz9l9HCqThabJIsLSrXh9BB2AmLxUoHmw0,62584
 natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
 natural_pdf/analyzers/text_structure.py,sha256=3WWusi-BI0krUnJxB05DD6XmKj5qRNvQBqH7zOQGm1M,28451
@@ -25,12 +25,12 @@ natural_pdf/classification/mixin.py,sha256=CXygXXhe_qx1563SmIjiu4uSnZkxCkuRR4fGv
 natural_pdf/classification/results.py,sha256=5ha77CxK0GYwkBMJbvUBZkBjsL5GpOveIZDK9nO4j8I,3239
 natural_pdf/collections/mixins.py,sha256=Se2C5AcpP9B5E0d0pIrey6-f_P32tAXTK4M7666MNj0,5688
 natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
-natural_pdf/core/element_manager.py,sha256=DRZvntd99wjXy6KeDjCq5uRhjMftZop9QklOZqlUH8M,55349
+natural_pdf/core/element_manager.py,sha256=KPuKM7SstfErTkRnGq4vrgE0Tv8iazN13Jp7yAXGKso,55575
 natural_pdf/core/highlighting_service.py,sha256=7on8nErhi50CEH2L4XzGIZ6tIqZtMzmmFlp-2lmwnYE,68856
-natural_pdf/core/page.py,sha256=XrDePXZgXgB3w8hvxh4-EhPQnrwmw-0z-I_K24__OtY,142550
+natural_pdf/core/page.py,sha256=Q3hBvB9KFB8doeXY7YVQt3G1ULdBDfA-0BQD6YPN4oo,144640
 natural_pdf/core/page_collection.py,sha256=hEeXs_fzB73XZ8ZkHz2kIuSgBYcVYydvGMMdGuB1rvw,52486
 natural_pdf/core/page_groupby.py,sha256=550ME6kd-h-2u75oUIIIqTYsmh8VvdQO1nXXioL8J6A,7378
-natural_pdf/core/pdf.py,sha256=Loe6sbQzBp9VDeIAuDS3zQmeDWvQMj5SWIQMky5bPDA,101964
+natural_pdf/core/pdf.py,sha256=VslSn00So6157XfiYbrB9URpx5VlWyshQOt7upi9us4,104248
 natural_pdf/core/pdf_collection.py,sha256=s3ogu4CEHrHMTRqQMJUKJZ-9Ii8b_B9dWbVLTFj0s7g,34992
 natural_pdf/core/render_spec.py,sha256=rLicaS9EPyojpJcjy2Lzn5DLWQwjrFyDJyRo7jbjdGU,14505
 natural_pdf/describe/__init__.py,sha256=kIV7ORmWWB1SAur7nK2aAwR-wHqSedhKfUsaUl4hG0A,586
@@ -87,7 +87,7 @@ natural_pdf/search/searchable_mixin.py,sha256=hqQ_AuID5eTGRCtKYdFLZ1zF35y73uk3x1
 natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
 natural_pdf/elements/line.py,sha256=TFn7KXjPT_jUQyQyabU0F7XYU4dC-qadwodJMZF4DCU,3844
 natural_pdf/elements/rect.py,sha256=0lNkVkPkvbRbrFED856RXoUcTcDkeeOIs5xldKGAQT8,3324
-natural_pdf/elements/region.py,sha256=RxWidI7oNrdbuuj94SfdFXmcSDTfy89uGCeVMQvAfks,155591
+natural_pdf/elements/region.py,sha256=_NNBewHlyUHvA4g9kApilP6it0cn2IRlcGG4r993oUI,156660
 natural_pdf/elements/text.py,sha256=829uSJv9E-8cC6T6iR_Va7Xtv54pJoyRN78fq4NN1d4,20687
 natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
 natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
@@ -106,7 +106,7 @@ natural_pdf/vision/results.py,sha256=F2zXG3MVZIpOUvPkJHotOq6-9rFz68BaO_8pnSndlOs
 natural_pdf/vision/similarity.py,sha256=YH8legN-t9uf1b_XULi4JLNDaRfPNKQwU1FZ4Qu08jY,11740
 natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
 natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
-natural_pdf-0.2.4.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
+natural_pdf-0.2.5.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
 optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
 optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
 optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
@@ -123,8 +123,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
 tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
 tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
 tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
-natural_pdf-0.2.4.dist-info/METADATA,sha256=G1tmes61GVEt6zLeDISuJZgceLQywIU-uRspGA_90Q8,6959
-natural_pdf-0.2.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-natural_pdf-0.2.4.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
-natural_pdf-0.2.4.dist-info/top_level.txt,sha256=80t0F2ZeX4vN4Ke5iTflcOk_PN_0USn33ha3X6X86Ik,36
-natural_pdf-0.2.4.dist-info/RECORD,,
+natural_pdf-0.2.5.dist-info/METADATA,sha256=H9nhjh1zRBmz2vUTe_j6FT-Zvn1sgoWT0nyoZG5GTYg,6959
+natural_pdf-0.2.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+natural_pdf-0.2.5.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
+natural_pdf-0.2.5.dist-info/top_level.txt,sha256=80t0F2ZeX4vN4Ke5iTflcOk_PN_0USn33ha3X6X86Ik,36
+natural_pdf-0.2.5.dist-info/RECORD,,