natural-pdf 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
natural_pdf/analyzers/guides.py CHANGED
@@ -143,7 +143,7 @@ class GuidesList(UserList):
 
     def from_content(
         self,
-        markers: Union[str, List[str], "ElementCollection", None],
+        markers: Union[str, List[str], "ElementCollection", Callable, None],
         obj: Optional[Union["Page", "Region", "FlowRegion"]] = None,
         align: Literal["left", "right", "center", "between"] = "left",
        outer: bool = True,
@@ -160,6 +160,7 @@ class GuidesList(UserList):
                 - str: single selector (e.g., 'text:contains("Name")') or literal text
                 - List[str]: list of selectors or literal text strings
                 - ElementCollection: collection of elements to extract text from
+                - Callable: function that takes a page and returns markers
                 - None: no markers
             obj: Page/Region/FlowRegion to search (uses parent's context if None)
             align: How to align guides relative to found elements
@@ -174,13 +175,22 @@ class GuidesList(UserList):
         if target_obj is None:
             raise ValueError("No object provided and no context available")
 
+        # Store callable markers for later evaluation
+        if callable(markers):
+            self._callable = markers
+            # For now, evaluate with the current target object to get initial guides
+            actual_markers = markers(target_obj)
+        else:
+            self._callable = None
+            actual_markers = markers
+
         # Check if parent is in flow mode
         if self._parent.is_flow_region:
             # Create guides across all constituent regions
             all_guides = []
             for region in self._parent.context.constituent_regions:
                 # Normalize markers for this region
-                marker_texts = _normalize_markers(markers, region)
+                marker_texts = _normalize_markers(actual_markers, region)
 
                 # Create guides for this region
                 region_guides = Guides.from_content(
@@ -263,7 +273,7 @@ class GuidesList(UserList):
 
         # Original single-region logic
         # Normalize markers to list of text strings
-        marker_texts = _normalize_markers(markers, target_obj)
+        marker_texts = _normalize_markers(actual_markers, target_obj)
 
         # Create guides for this axis
         new_guides = Guides.from_content(
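Taken together, the hunks above let `from_content` accept a callable that is stored on the guide and re-evaluated per target. A minimal sketch, assuming an open `natural_pdf.PDF`; the file name and marker values are illustrative, while the selector syntax matches the docstring examples elsewhere in this diff:

```python
from natural_pdf import PDF
from natural_pdf.analyzers.guides import Guides

pdf = PDF("report.pdf")  # illustrative input file
guide = Guides(pdf.pages[0])

# Static markers: normalized once against the page supplied here
guide.vertical.from_content(["Name", "Amount", "Date"])

# Callable markers: kept in GuidesList._callable and re-evaluated against
# each page or region the guide is later applied to
guide.horizontal.from_content(lambda p: p.find_all('text:starts-with(NF-)'))
```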
@@ -1541,11 +1551,15 @@ class Guides:
         # Add outer guides if requested
         if outer and bounds:
             if axis == "vertical":
-                guides_coords.insert(0, bounds[0])  # x0
-                guides_coords.append(bounds[2])  # x1
+                if outer == True or outer == "first":
+                    guides_coords.insert(0, bounds[0])  # x0
+                if outer == True or outer == "last":
+                    guides_coords.append(bounds[2])  # x1
             else:
-                guides_coords.insert(0, bounds[1])  # y0
-                guides_coords.append(bounds[3])  # y1
+                if outer == True or outer == "first":
+                    guides_coords.insert(0, bounds[1])  # y0
+                if outer == True or outer == "last":
+                    guides_coords.append(bounds[3])  # y1
 
         # Remove duplicates and sort
         guides_coords = sorted(list(set(guides_coords)))
@@ -3302,7 +3316,7 @@ class Guides:
         markers: Union[str, List[str], "ElementCollection", None] = None,
         obj: Optional[Union["Page", "Region"]] = None,
         align: Literal["left", "right", "center", "between"] = "left",
-        outer: bool = True,
+        outer: Union[str, bool] = True,
         tolerance: float = 5,
         apply_exclusions: bool = True,
     ) -> "Guides":
@@ -3319,7 +3333,10 @@ class Guides:
                 - None: no markers
             obj: Page or Region to search (uses self.context if None)
             align: How to align guides relative to found elements
-            outer: Whether to add outer boundary guides
+            outer: Whether to add outer boundary guides. Can be:
+                - bool: True/False to add/not add both
+                - "first": add a boundary before the first element
+                - "last": add a boundary after the last element
             tolerance: Tolerance for snapping to element edges
             apply_exclusions: Whether to apply exclusion zones when searching for text
 
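A short sketch of the widened `outer` parameter; the column labels are illustrative, and the `outer="last"` form matches the docstring example later in this diff:

```python
guide = Guides(page)

# True (default): insert both bounding guides (x0 and x1 for vertical)
guide.vertical.from_content(["Name", "Amount"], outer=True)

# "first": only the leading boundary (guides_coords.insert(0, bounds[0]))
guide.vertical.from_content(["Name", "Amount"], outer="first")

# "last": only the trailing boundary (guides_coords.append(bounds[2]))
guide.vertical.from_content(["Name", "Amount"], outer="last")
```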
@@ -3457,6 +3474,7 @@ class Guides:
         cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
         show_progress: bool = False,
         content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None,
+        apply_exclusions: bool = True,
         *,
         multi_page: Literal["auto", True, False] = "auto",
     ) -> "TableResult":
@@ -3482,6 +3500,7 @@ class Guides:
             cell_extraction_func: Optional callable for custom cell text extraction
             show_progress: Controls progress bar for text method
             content_filter: Content filtering function or patterns
+            apply_exclusions: Whether to apply exclusion regions during text extraction (default: True)
             multi_page: Controls multi-region table creation for FlowRegions
 
         Returns:
@@ -3552,6 +3571,7 @@ class Guides:
             cell_extraction_func=cell_extraction_func,
             show_progress=show_progress,
             content_filter=content_filter,
+            apply_exclusions=apply_exclusions,
         )
 
         return table_result
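A hedged sketch of the new `apply_exclusions` flag on `extract_table`, assuming `page` already has header/footer exclusions registered; the marker values and selector are illustrative:

```python
guide = Guides(page)
guide.vertical.from_content(["Name", "Amount"])
guide.horizontal.from_content('text:contains("Total")')

filtered = guide.extract_table()                   # default: excluded text omitted
raw = guide.extract_table(apply_exclusions=False)  # excluded text kept in cells
```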
@@ -3577,6 +3597,162 @@ class Guides:
         except Exception as cleanup_err:
             logger.warning(f"Failed to clean up temporary regions: {cleanup_err}")
 
+    def extract_table_from_pages(
+        self,
+        pages: Union["PageCollection", List["Page"]],
+        header: Union[str, List[str], None] = "first",
+        skip_repeating_headers: Optional[bool] = None,
+        method: Optional[str] = None,
+        table_settings: Optional[dict] = None,
+        use_ocr: bool = False,
+        ocr_config: Optional[dict] = None,
+        text_options: Optional[Dict] = None,
+        cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
+        show_progress: bool = True,
+        content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None,
+        apply_exclusions: bool = True,
+    ) -> "TableResult":
+        """
+        Extract tables from multiple pages using this guide pattern.
+
+        This method applies the guide to each page, extracts tables, and combines
+        them into a single TableResult. Dynamic guides (using lambdas) are evaluated
+        for each page.
+
+        Args:
+            pages: PageCollection or list of Pages to extract from
+            header: How to handle headers:
+                - "first": Use first row of first page as headers (default)
+                - "all": Expect headers on each page, use from first page
+                - None: No headers, use numeric indices
+                - List[str]: Custom column names
+            skip_repeating_headers: Whether to remove duplicate header rows.
+                Defaults to True when header is "first" or "all", False otherwise.
+            method: Table extraction method (passed to extract_table)
+            table_settings: Settings for pdfplumber table extraction
+            use_ocr: Whether to use OCR for text extraction
+            ocr_config: OCR configuration parameters
+            text_options: Dictionary of options for the 'text' method
+            cell_extraction_func: Optional callable for custom cell text extraction
+            show_progress: Show progress bar for multi-page extraction (default: True)
+            content_filter: Content filtering function or patterns
+            apply_exclusions: Whether to apply exclusion regions during extraction
+
+        Returns:
+            TableResult: Combined table data from all pages
+
+        Example:
+            ```python
+            # Create guide with static vertical, dynamic horizontal
+            guide = Guides(pages[0])
+            guide.vertical.from_content(columns, outer="last")
+            guide.horizontal.from_content(lambda p: p.find_all('text:starts-with(NF-)'))
+
+            # Extract from all pages
+            table_result = guide.extract_table_from_pages(pages, header=columns)
+            df = table_result.to_df()
+            ```
+        """
+        from natural_pdf.core.page_collection import PageCollection
+        from natural_pdf.tables.result import TableResult
+
+        # Convert to list if it's a PageCollection
+        if isinstance(pages, PageCollection):
+            page_list = list(pages)
+        else:
+            page_list = pages
+
+        if not page_list:
+            return TableResult([])
+
+        # Determine header handling
+        if skip_repeating_headers is None:
+            skip_repeating_headers = header in ["first", "all"] or isinstance(header, list)
+
+        all_rows = []
+        header_row = None
+
+        # Configure progress bar
+        iterator = page_list
+        if show_progress and len(page_list) > 1:
+            try:
+                from tqdm.auto import tqdm
+
+                iterator = tqdm(page_list, desc="Extracting tables from pages", unit="page")
+            except ImportError:
+                pass
+
+        for i, page in enumerate(iterator):
+            # Create a new Guides object for this page
+            page_guide = Guides(page)
+
+            # Copy vertical guides (usually static)
+            if hasattr(self.vertical, "_callable") and self.vertical._callable is not None:
+                # If vertical is dynamic (lambda), evaluate it
+                page_guide.vertical.from_content(self.vertical._callable(page))
+            else:
+                # Copy static vertical positions
+                page_guide.vertical.data = self.vertical.data.copy()
+
+            # Handle horizontal guides
+            if hasattr(self.horizontal, "_callable") and self.horizontal._callable is not None:
+                # If horizontal is dynamic (lambda), evaluate it
+                page_guide.horizontal.from_content(self.horizontal._callable(page))
+            else:
+                # Copy static horizontal positions
+                page_guide.horizontal.data = self.horizontal.data.copy()
+
+            # Extract table from this page
+            table_result = page_guide.extract_table(
+                method=method,
+                table_settings=table_settings,
+                use_ocr=use_ocr,
+                ocr_config=ocr_config,
+                text_options=text_options,
+                cell_extraction_func=cell_extraction_func,
+                show_progress=False,  # Don't show nested progress
+                content_filter=content_filter,
+                apply_exclusions=apply_exclusions,
+            )
+
+            # Convert to list of rows
+            rows = list(table_result)
+
+            # Handle headers based on strategy
+            if i == 0:  # First page
+                if header == "first" or header == "all":
+                    # Use first row as header
+                    if rows:
+                        header_row = rows[0]
+                        rows = rows[1:]  # Remove header from data
+                elif isinstance(header, list):
+                    # Custom headers provided
+                    header_row = header
+            else:  # Subsequent pages
+                if header == "all" and skip_repeating_headers and rows:
+                    # Expect and remove header row
+                    if rows and header_row and rows[0] == header_row:
+                        rows = rows[1:]
+                    elif rows:
+                        # Still remove first row if it looks like a header
+                        rows = rows[1:]
+
+            # Add rows to combined result
+            all_rows.extend(rows)
+
+        # Create final TableResult
+        if isinstance(header, list):
+            # Custom headers - prepend to data
+            final_result = TableResult(all_rows)
+        elif header_row is not None:
+            # Prepend discovered header
+            final_result = TableResult([header_row] + all_rows)
+        else:
+            # No headers
+            final_result = TableResult(all_rows)
+
+        return final_result
+
     def _get_flow_orientation(self) -> Literal["vertical", "horizontal", "unknown"]:
         """Determines if a FlowRegion's constituent parts are arranged vertically or horizontally."""
         if not self.is_flow_region or len(self.context.constituent_regions) < 2:
natural_pdf/core/element_manager.py CHANGED
@@ -939,6 +939,11 @@ class ElementManager:
         self.load_elements()
         return self._elements.get("chars", [])
 
+    def invalidate_cache(self):
+        """Invalidate the cached elements, forcing a reload on next access."""
+        self._elements = None
+        logger.debug(f"Page {self._page.number}: ElementManager cache invalidated")
+
     @property
     def words(self):
         """Get all word elements."""
natural_pdf/core/page.py CHANGED
@@ -494,6 +494,9 @@ class Page(
                     exc_info=False,
                 )
                 raise
+            # Invalidate ElementManager cache since exclusions affect element filtering
+            if hasattr(self, "_element_mgr") and self._element_mgr:
+                self._element_mgr.invalidate_cache()
             return self  # Completed processing for selector input
 
         # ElementCollection -----------------------------------------------
@@ -526,6 +529,9 @@ class Page(
                     exc_info=False,
                 )
                 raise
+            # Invalidate ElementManager cache since exclusions affect element filtering
+            if hasattr(self, "_element_mgr") and self._element_mgr:
+                self._element_mgr.invalidate_cache()
             return self  # Completed processing for ElementCollection input
 
         # ------------------------------------------------------------------
@@ -618,6 +624,9 @@ class Page(
                         f"Page {self.index}: Failed to convert list item to Region: {e}"
                     )
                     continue
+            # Invalidate ElementManager cache since exclusions affect element filtering
+            if hasattr(self, "_element_mgr") and self._element_mgr:
+                self._element_mgr.invalidate_cache()
             return self
         else:
             # Reject invalid types
@@ -629,6 +638,10 @@ class Page(
         if exclusion_data:
             self._exclusions.append(exclusion_data)
 
+        # Invalidate ElementManager cache since exclusions affect element filtering
+        if hasattr(self, "_element_mgr") and self._element_mgr:
+            self._element_mgr.invalidate_cache()
+
         return self
 
     def add_region(self, region: "Region", name: Optional[str] = None) -> "Page":
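A sketch of why each `add_exclusion` path now invalidates the cache; `.words` and `.region()` follow natural-pdf's public API, and the coordinates are illustrative:

```python
words_before = page.words  # first access populates the ElementManager cache
page.add_exclusion(page.region(0, 0, page.width, 50))  # e.g., a header band
# add_exclusion() now calls invalidate_cache(), so the next access reloads
words_after = page.words  # reflects the newly added exclusion
```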
@@ -699,10 +712,26 @@ class Page(
         """
         regions = []
 
+        # Combine page-specific exclusions with PDF-level exclusions
+        all_exclusions = list(self._exclusions)  # Start with page-specific
+
+        # Add PDF-level exclusions if we have a parent PDF
+        if hasattr(self, "_parent") and self._parent and hasattr(self._parent, "_exclusions"):
+            for pdf_exclusion in self._parent._exclusions:
+                # Check if this exclusion is already in our list (avoid duplicates)
+                if pdf_exclusion not in all_exclusions:
+                    # Ensure consistent format (PDF exclusions might be 2-tuples, need to be 3-tuples)
+                    if len(pdf_exclusion) == 2:
+                        # Convert to 3-tuple format with default method
+                        pdf_exclusion = (pdf_exclusion[0], pdf_exclusion[1], "region")
+                    all_exclusions.append(pdf_exclusion)
+
         if debug:
-            print(f"\nPage {self.index}: Evaluating {len(self._exclusions)} exclusions")
+            print(
+                f"\nPage {self.index}: Evaluating {len(all_exclusions)} exclusions ({len(self._exclusions)} page-specific, {len(all_exclusions) - len(self._exclusions)} from PDF)"
+            )
 
-        for i, exclusion_data in enumerate(self._exclusions):
+        for i, exclusion_data in enumerate(all_exclusions):
             # Handle both old format (2-tuple) and new format (3-tuple) for backward compatibility
             if len(exclusion_data) == 2:
                 # Old format: (exclusion_item, label)
@@ -1598,7 +1627,14 @@ class Page(
             return ""
 
         # 2. Apply element-based exclusions if enabled
-        if use_exclusions and self._exclusions:
+        # Check both page-level and PDF-level exclusions
+        has_exclusions = bool(self._exclusions) or (
+            hasattr(self, "_parent")
+            and self._parent
+            and hasattr(self._parent, "_exclusions")
+            and self._parent._exclusions
+        )
+        if use_exclusions and has_exclusions:
             # Filter word elements through _filter_elements_by_exclusions
             # This handles both element-based and region-based exclusions
             word_elements = self._filter_elements_by_exclusions(
@@ -1612,7 +1648,7 @@ class Page(
         # 3. Get region-based exclusions for spatial filtering
         apply_exclusions_flag = kwargs.get("use_exclusions", use_exclusions)
         exclusion_regions = []
-        if apply_exclusions_flag and self._exclusions:
+        if apply_exclusions_flag and has_exclusions:
             exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=debug)
             if debug:
                 logger.debug(
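Sketching the interplay this enables: a PDF-level exclusion alone is now enough to trigger filtering in `extract_text`. Callable exclusions are existing natural-pdf API; the region is illustrative:

```python
pdf = PDF("report.pdf")  # illustrative input file
# PDF-level exclusion: a callable applied lazily to each page
pdf.add_exclusion(lambda page: page.region(0, 0, page.width, 50))

# has_exclusions is now True even when page._exclusions itself is empty,
# so the header band is filtered out of the extracted text
text = pdf.pages[0].extract_text()
```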
natural_pdf/core/pdf.py CHANGED
@@ -173,11 +173,26 @@ class _LazyPageList(Sequence):
         """Create and cache a page at the given index within this list."""
         cached = self._cache[index]
         if cached is None:
+            # Get the actual page index in the full PDF
+            actual_page_index = self._indices[index]
+
+            # First check if this page is already cached in the parent PDF's main page list
+            if (
+                hasattr(self._parent_pdf, "_pages")
+                and hasattr(self._parent_pdf._pages, "_cache")
+                and actual_page_index < len(self._parent_pdf._pages._cache)
+                and self._parent_pdf._pages._cache[actual_page_index] is not None
+            ):
+                # Reuse the already-cached page from the parent PDF
+                # This ensures we get any exclusions that were already applied
+                cached = self._parent_pdf._pages._cache[actual_page_index]
+                self._cache[index] = cached
+                return cached
+
             # Import here to avoid circular import problems
             from natural_pdf.core.page import Page
 
-            # Get the actual page index in the full PDF
-            actual_page_index = self._indices[index]
+            # Create new page
             plumber_page = self._plumber_pdf.pages[actual_page_index]
             cached = Page(
                 plumber_page,
@@ -196,6 +211,30 @@ class _LazyPageList(Sequence):
             except Exception as e:
                 logger.warning(f"Failed to apply exclusion to page {cached.number}: {e}")
 
+            # Check if the parent PDF already has a cached page with page-specific exclusions
+            if hasattr(self._parent_pdf, "_pages") and hasattr(self._parent_pdf._pages, "_cache"):
+                parent_cache = self._parent_pdf._pages._cache
+                if (
+                    actual_page_index < len(parent_cache)
+                    and parent_cache[actual_page_index] is not None
+                ):
+                    existing_page = parent_cache[actual_page_index]
+                    # Copy over any page-specific exclusions from the existing page
+                    # Only copy non-callable exclusions (regions/elements) to avoid duplicating PDF-level exclusions
+                    if hasattr(existing_page, "_exclusions") and existing_page._exclusions:
+                        for exclusion_data in existing_page._exclusions:
+                            exclusion_item = exclusion_data[0]
+                            # Skip callable exclusions as they're PDF-level and already applied above
+                            if not callable(exclusion_item):
+                                try:
+                                    cached.add_exclusion(
+                                        *exclusion_data[:2]
+                                    )  # exclusion_item and label
+                                except Exception as e:
+                                    logger.warning(
+                                        f"Failed to copy page-specific exclusion to page {cached.number}: {e}"
+                                    )
+
             # Apply any stored regions to the newly created page
             if hasattr(self._parent_pdf, "_regions"):
                 for region_data in self._parent_pdf._regions:
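A sketch of the behavior these two cache checks target, assuming slicing `pdf.pages` yields a second `_LazyPageList` over the same document (as `_indices` implies); the exclusion is illustrative:

```python
pdf = PDF("report.pdf")  # illustrative input file
page = pdf.pages[0]  # cached in the PDF's main lazy page list
page.add_exclusion(page.region(0, 0, page.width, 50))  # page-specific

subset = pdf.pages[:1]  # a sliced lazy page list over the same PDF
# The sublist now reuses the already-cached Page object, so the
# page-specific exclusion applied above is not silently dropped
assert subset[0] is page
```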
natural_pdf/elements/region.py CHANGED
@@ -1236,6 +1236,7 @@ class Region(
         content_filter: Optional[
             Union[str, Callable[[str], bool], List[str]]
         ] = None,  # NEW: Content filtering
+        apply_exclusions: bool = True,  # Whether to apply exclusion regions during extraction
     ) -> TableResult:  # Return type allows Optional[str] for cells
         """
         Extract a table from this region.
@@ -1260,6 +1261,8 @@ class Region(
                 - A callable that takes text and returns True to KEEP the character
                 - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
                 Works with all extraction methods by filtering cell content.
+            apply_exclusions: Whether to apply exclusion regions during text extraction (default: True).
+                When True, text within excluded regions (e.g., headers/footers) will not be extracted.
 
         Returns:
             Table data as a list of rows, where each row is a list of cell values (str or None).
@@ -1297,7 +1300,9 @@ class Region(
             )
             return TableResult(
                 self._extract_table_from_cells(
-                    cell_regions_in_table, content_filter=content_filter
+                    cell_regions_in_table,
+                    content_filter=content_filter,
+                    apply_exclusions=apply_exclusions,
                 )
             )
 
@@ -1381,16 +1386,22 @@ class Region(
         # Use the selected method
         if effective_method == "tatr":
             table_rows = self._extract_table_tatr(
-                use_ocr=use_ocr, ocr_config=ocr_config, content_filter=content_filter
+                use_ocr=use_ocr,
+                ocr_config=ocr_config,
+                content_filter=content_filter,
+                apply_exclusions=apply_exclusions,
             )
         elif effective_method == "text":
             current_text_options = text_options.copy()
             current_text_options["cell_extraction_func"] = cell_extraction_func
             current_text_options["show_progress"] = show_progress
             current_text_options["content_filter"] = content_filter
+            current_text_options["apply_exclusions"] = apply_exclusions
             table_rows = self._extract_table_text(**current_text_options)
         elif effective_method == "pdfplumber":
-            table_rows = self._extract_table_plumber(table_settings, content_filter=content_filter)
+            table_rows = self._extract_table_plumber(
+                table_settings, content_filter=content_filter, apply_exclusions=apply_exclusions
+            )
         else:
             raise ValueError(
                 f"Unknown table extraction method: '{method}'. Choose from 'tatr', 'pdfplumber', 'text', 'stream', 'lattice'."
@@ -1604,7 +1615,9 @@ class Region(
         # Return empty list if no tables found
         return []
 
-    def _extract_table_plumber(self, table_settings: dict, content_filter=None) -> List[List[str]]:
+    def _extract_table_plumber(
+        self, table_settings: dict, content_filter=None, apply_exclusions=True
+    ) -> List[List[str]]:
         """
         Extract table using pdfplumber's table extraction.
         This method extracts the largest table within the region.
@@ -1646,7 +1659,7 @@ class Region(
         # -------------------------------------------------------------
         base_plumber_page = self.page._page
 
-        if getattr(self.page, "_exclusions", None):
+        if apply_exclusions and getattr(self.page, "_exclusions", None):
             exclusion_regions = self.page._get_exclusion_regions(include_callable=True)
 
             def _keep_char(obj):
@@ -1701,7 +1714,7 @@ class Region(
             return []
 
     def _extract_table_tatr(
-        self, use_ocr=False, ocr_config=None, content_filter=None
+        self, use_ocr=False, ocr_config=None, content_filter=None, apply_exclusions=True
     ) -> List[List[str]]:
         """
         Extract table using TATR structure detection.
@@ -1789,7 +1802,7 @@ class Region(
                 continue
 
             # Fallback to normal extraction
-            header_text = header.extract_text().strip()
+            header_text = header.extract_text(apply_exclusions=apply_exclusions).strip()
             if content_filter is not None:
                 header_text = self._apply_content_filter_to_text(header_text, content_filter)
             header_texts.append(header_text)
@@ -1824,7 +1837,7 @@ class Region(
                     continue
 
                 # Fallback to normal extraction
-                cell_text = cell_region.extract_text().strip()
+                cell_text = cell_region.extract_text(apply_exclusions=apply_exclusions).strip()
                 if content_filter is not None:
                     cell_text = self._apply_content_filter_to_text(cell_text, content_filter)
                 row_cells.append(cell_text)
@@ -1840,7 +1853,7 @@ class Region(
                 continue
 
             # Fallback to normal extraction
-            row_text = row.extract_text().strip()
+            row_text = row.extract_text(apply_exclusions=apply_exclusions).strip()
             if content_filter is not None:
                 row_text = self._apply_content_filter_to_text(row_text, content_filter)
             row_cells.append(row_text)
@@ -1866,6 +1879,8 @@ class Region(
         show_progress = text_options.pop("show_progress", False)
         # --- Get content_filter option --- #
         content_filter = text_options.pop("content_filter", None)
+        # --- Get apply_exclusions option --- #
+        apply_exclusions = text_options.pop("apply_exclusions", True)
 
         # Analyze structure first (or use cached results)
         if "text_table_structure" in self.analyses:
@@ -1946,7 +1961,9 @@ class Region(
                     cell_value = None
                 else:
                     cell_value = cell_region.extract_text(
-                        layout=False, apply_exclusions=False, content_filter=content_filter
+                        layout=False,
+                        apply_exclusions=apply_exclusions,
+                        content_filter=content_filter,
                     ).strip()
 
                 rounded_top = round(cell_data["top"] / coord_tolerance) * coord_tolerance
@@ -3397,7 +3414,7 @@ class Region(
     # ------------------------------------------------------------------
 
     def _extract_table_from_cells(
-        self, cell_regions: List["Region"], content_filter=None
+        self, cell_regions: List["Region"], content_filter=None, apply_exclusions=True
     ) -> List[List[Optional[str]]]:
         """Construct a table (list-of-lists) from table_cell regions.
 
@@ -3439,7 +3456,9 @@ class Region(
                 r_idx = int(cell.metadata.get("row_index"))
                 c_idx = int(cell.metadata.get("col_index"))
                 text_val = cell.extract_text(
-                    layout=False, apply_exclusions=True, content_filter=content_filter
+                    layout=False,
+                    apply_exclusions=apply_exclusions,
+                    content_filter=content_filter,
                 ).strip()
                 table_grid[r_idx][c_idx] = text_val if text_val else None
             except Exception as _err:
@@ -3488,7 +3507,7 @@ class Region(
                 col_idx = int(np.argmin([abs(cx - cc) for cc in col_centers]))
 
                 text_val = cell.extract_text(
-                    layout=False, apply_exclusions=False, content_filter=content_filter
+                    layout=False, apply_exclusions=apply_exclusions, content_filter=content_filter
                 ).strip()
                 table_grid[row_idx][col_idx] = text_val if text_val else None
 
natural_pdf/tables/result.py CHANGED
@@ -41,7 +41,7 @@ class TableResult(Sequence):
 
     def to_df(
         self,
-        header: Union[str, int, List[int], None] = "first",
+        header: Union[str, int, List[int], List[str], None] = "first",
         index_col=None,
         skip_repeating_headers=None,
         keep_blank: bool = False,
@@ -51,8 +51,8 @@ class TableResult(Sequence):
 
         Parameters
         ----------
-        header : "first" | int | list[int] | None, default "first"
-            • "first" – use row 0 as column names.\n • int – use that row index.\n • list[int] – multi-row header.\n • None/False– no header.
+        header : "first" | int | list[int] | list[str] | None, default "first"
+            • "first" – use row 0 as column names.\n • int – use that row index.\n • list[int] – multi-row header.\n • list[str] – custom column names.\n • None/False– no header.
 
         Note: If the header row has a different number of columns than the
         body rows, the method will automatically fall back to header=None
@@ -84,7 +84,11 @@ class TableResult(Sequence):
 
         # Determine default for skip_repeating_headers based on header parameter
         if skip_repeating_headers is None:
-            skip_repeating_headers = header is not None and header is not False
+            skip_repeating_headers = (
+                header is not None
+                and header is not False
+                and not (isinstance(header, (list, tuple)) and len(header) == 0)
+            )
 
         # Determine header rows and body rows
         body = rows
@@ -97,10 +101,31 @@ class TableResult(Sequence):
         elif isinstance(header, int):
             hdr = rows[header]
             body = rows[:header] + rows[header + 1 :]
-        elif isinstance(header, (list, tuple)):
+        elif isinstance(header, (list, tuple)) and all(isinstance(i, int) for i in header):
+            # List of integers - multi-row header
             hdr_rows = [rows[i] for i in header]
             body = [r for idx, r in enumerate(rows) if idx not in header]
             hdr = hdr_rows
+        elif (
+            isinstance(header, (list, tuple))
+            and len(header) > 0
+            and all(isinstance(i, str) for i in header)
+        ):
+            # List of strings - custom column names
+            hdr = list(header)
+            body = rows
+            # Validate column count matches
+            if body:
+                max_cols = max(len(row) for row in body)
+                if len(hdr) != max_cols:
+                    raise ValueError(
+                        f"Number of column names ({len(hdr)}) must match "
+                        f"number of columns in data ({max_cols})"
+                    )
+        elif isinstance(header, (list, tuple)) and len(header) == 0:
+            # Empty list behaves like None
+            hdr = None
+            body = rows
         else:
             raise ValueError("Invalid value for header parameter")
 
@@ -125,7 +150,12 @@ class TableResult(Sequence):
             pass
 
         # Check for header/body column count mismatch and fallback to no header
-        if hdr is not None and body:
+        if (
+            hdr is not None
+            and body
+            and not (isinstance(header, (list, tuple)) and all(isinstance(i, str) for i in header))
+        ):
+            # Skip this check for custom string headers
             # Get the maximum number of columns from all body rows
             # This handles cases where some rows have different column counts
             max_cols = max(len(row) for row in body) if body else 0
@@ -144,6 +174,9 @@ class TableResult(Sequence):
             hdr = None
             body = self._rows  # Use all rows as body
 
+        # Handle empty list case - pandas needs None not empty list
+        if isinstance(hdr, list) and len(hdr) == 0:
+            hdr = None
         df = pd.DataFrame(body, columns=hdr)
 
         # Convert empty strings to NaN by default
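A sketch of the new `to_df` header forms accepted after this change; the row values are illustrative:

```python
from natural_pdf.tables.result import TableResult

result = TableResult([["Acme", "120"], ["Globex", "95"]])

df = result.to_df(header=["company", "amount"])  # list[str]: custom column names
df_plain = result.to_df(header=[])  # empty list now behaves like header=None
```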
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: natural-pdf
-Version: 0.2.4
+Version: 0.2.5
 Summary: A more intuitive interface for working with PDFs
 Author-email: Jonathan Soma <jonathan.soma@gmail.com>
 License-Expression: MIT
@@ -2,7 +2,7 @@ natural_pdf/__init__.py,sha256=N4pR0LbuPEnUYFZqbdVqc_FGKldgwPQc1wjJhYKTBBM,3417
 natural_pdf/cli.py,sha256=SkPwhhMM-GhLsj3O1n1Agxz4KOxcZ08sj8hVQSFJB5c,4064
 natural_pdf/text_mixin.py,sha256=eFCiHj6Okcw3aum4955BepcI2NPRalkf9UFFVTc_H30,4012
 natural_pdf/analyzers/__init__.py,sha256=3XGoNq3OgiVkZP7tOdeP5XVUl7fDgyztdA8DlOcMLXg,1138
-natural_pdf/analyzers/guides.py,sha256=9FUbxk4XBOyktXgq9q5-bB949JFrzT1kBPikg2ENoIw,150032
+natural_pdf/analyzers/guides.py,sha256=mLWPPEwywo_FbU3gSoegiRlzxYmkHEo2c4DLX9krH9k,157691
 natural_pdf/analyzers/shape_detection_mixin.py,sha256=mgpyJ4jIulz9l9HCqThabJIsLSrXh9BB2AmLxUoHmw0,62584
 natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
 natural_pdf/analyzers/text_structure.py,sha256=3WWusi-BI0krUnJxB05DD6XmKj5qRNvQBqH7zOQGm1M,28451
@@ -25,12 +25,12 @@ natural_pdf/classification/mixin.py,sha256=CXygXXhe_qx1563SmIjiu4uSnZkxCkuRR4fGv
 natural_pdf/classification/results.py,sha256=5ha77CxK0GYwkBMJbvUBZkBjsL5GpOveIZDK9nO4j8I,3239
 natural_pdf/collections/mixins.py,sha256=Se2C5AcpP9B5E0d0pIrey6-f_P32tAXTK4M7666MNj0,5688
 natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
-natural_pdf/core/element_manager.py,sha256=DRZvntd99wjXy6KeDjCq5uRhjMftZop9QklOZqlUH8M,55349
+natural_pdf/core/element_manager.py,sha256=KPuKM7SstfErTkRnGq4vrgE0Tv8iazN13Jp7yAXGKso,55575
 natural_pdf/core/highlighting_service.py,sha256=7on8nErhi50CEH2L4XzGIZ6tIqZtMzmmFlp-2lmwnYE,68856
-natural_pdf/core/page.py,sha256=XrDePXZgXgB3w8hvxh4-EhPQnrwmw-0z-I_K24__OtY,142550
+natural_pdf/core/page.py,sha256=Q3hBvB9KFB8doeXY7YVQt3G1ULdBDfA-0BQD6YPN4oo,144640
 natural_pdf/core/page_collection.py,sha256=hEeXs_fzB73XZ8ZkHz2kIuSgBYcVYydvGMMdGuB1rvw,52486
 natural_pdf/core/page_groupby.py,sha256=550ME6kd-h-2u75oUIIIqTYsmh8VvdQO1nXXioL8J6A,7378
-natural_pdf/core/pdf.py,sha256=Loe6sbQzBp9VDeIAuDS3zQmeDWvQMj5SWIQMky5bPDA,101964
+natural_pdf/core/pdf.py,sha256=VslSn00So6157XfiYbrB9URpx5VlWyshQOt7upi9us4,104248
 natural_pdf/core/pdf_collection.py,sha256=s3ogu4CEHrHMTRqQMJUKJZ-9Ii8b_B9dWbVLTFj0s7g,34992
 natural_pdf/core/render_spec.py,sha256=rLicaS9EPyojpJcjy2Lzn5DLWQwjrFyDJyRo7jbjdGU,14505
 natural_pdf/describe/__init__.py,sha256=kIV7ORmWWB1SAur7nK2aAwR-wHqSedhKfUsaUl4hG0A,586
@@ -87,7 +87,7 @@ natural_pdf/search/searchable_mixin.py,sha256=hqQ_AuID5eTGRCtKYdFLZ1zF35y73uk3x1
 natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
 natural_pdf/elements/line.py,sha256=TFn7KXjPT_jUQyQyabU0F7XYU4dC-qadwodJMZF4DCU,3844
 natural_pdf/elements/rect.py,sha256=0lNkVkPkvbRbrFED856RXoUcTcDkeeOIs5xldKGAQT8,3324
-natural_pdf/elements/region.py,sha256=RxWidI7oNrdbuuj94SfdFXmcSDTfy89uGCeVMQvAfks,155591
+natural_pdf/elements/region.py,sha256=_NNBewHlyUHvA4g9kApilP6it0cn2IRlcGG4r993oUI,156660
 natural_pdf/elements/text.py,sha256=829uSJv9E-8cC6T6iR_Va7Xtv54pJoyRN78fq4NN1d4,20687
 natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
 natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
@@ -106,7 +106,7 @@ natural_pdf/vision/results.py,sha256=F2zXG3MVZIpOUvPkJHotOq6-9rFz68BaO_8pnSndlOs
 natural_pdf/vision/similarity.py,sha256=YH8legN-t9uf1b_XULi4JLNDaRfPNKQwU1FZ4Qu08jY,11740
 natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
 natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
-natural_pdf-0.2.4.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
+natural_pdf-0.2.5.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
 optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
 optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
 optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
@@ -123,8 +123,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
 tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
 tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
 tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
-natural_pdf-0.2.4.dist-info/METADATA,sha256=G1tmes61GVEt6zLeDISuJZgceLQywIU-uRspGA_90Q8,6959
-natural_pdf-0.2.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-natural_pdf-0.2.4.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
-natural_pdf-0.2.4.dist-info/top_level.txt,sha256=80t0F2ZeX4vN4Ke5iTflcOk_PN_0USn33ha3X6X86Ik,36
-natural_pdf-0.2.4.dist-info/RECORD,,
+natural_pdf-0.2.5.dist-info/METADATA,sha256=H9nhjh1zRBmz2vUTe_j6FT-Zvn1sgoWT0nyoZG5GTYg,6959
+natural_pdf-0.2.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+natural_pdf-0.2.5.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
+natural_pdf-0.2.5.dist-info/top_level.txt,sha256=80t0F2ZeX4vN4Ke5iTflcOk_PN_0USn33ha3X6X86Ik,36
+natural_pdf-0.2.5.dist-info/RECORD,,