natural-pdf 0.1.36__py3-none-any.whl → 0.1.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1221,7 +1221,7 @@ class Region(
  # Filter to elements in this region
  return [e for e in page_elements if self._is_element_in_region(e)]

- def extract_text(self, apply_exclusions=True, debug=False, **kwargs) -> str:
+ def extract_text(self, apply_exclusions=True, debug=False, content_filter=None, **kwargs) -> str:
  """
  Extract text from this region, respecting page exclusions and using pdfplumber's
  layout engine (chars_to_textmap).
@@ -1229,6 +1229,10 @@ class Region(
  Args:
  apply_exclusions: Whether to apply exclusion regions defined on the parent page.
  debug: Enable verbose debugging output for filtering steps.
+ content_filter: Optional content filter to exclude specific text patterns. Can be:
+ - A regex pattern string (characters matching the pattern are EXCLUDED)
+ - A callable that takes text and returns True to KEEP the character
+ - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
  **kwargs: Additional layout parameters passed directly to pdfplumber's
  `chars_to_textmap` function (e.g., layout, x_density, y_density).
  See Page.extract_text docstring for more.
@@ -1285,10 +1289,15 @@ class Region(
  )

  # 5. Generate Text Layout using Utility
+ # Add content_filter to kwargs if provided
+ final_kwargs = kwargs.copy()
+ if content_filter is not None:
+ final_kwargs["content_filter"] = content_filter
+
  result = generate_text_layout(
  char_dicts=filtered_chars,
  layout_context_bbox=self.bbox, # Use region's bbox for context
- user_kwargs=kwargs, # Pass original kwargs to layout generator
+ user_kwargs=final_kwargs, # Pass kwargs including content_filter
  )

  logger.debug(f"Region {self.bbox}: extract_text finished, result length: {len(result)}.")
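
A minimal usage sketch of the new content_filter argument on Region.extract_text, grounded in the docstring added above; the region variable and the patterns are illustrative, not part of this diff:

    # Given a natural_pdf Region object `region` (construction omitted):
    text_no_digits = region.extract_text(content_filter=r"\d")                    # regex: matching chars EXCLUDED
    text_clean = region.extract_text(content_filter=[r"\d", r"®"])                # list: chars matching ANY pattern EXCLUDED
    text_ascii = region.extract_text(content_filter=lambda ch: ch.isascii())      # callable: return True to KEEP the char
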
@@ -1304,6 +1313,7 @@ class Region(
  cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
  # --- NEW: Add tqdm control option --- #
  show_progress: bool = False, # Controls progress bar for text method
+ content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None, # NEW: Content filtering
  ) -> TableResult: # Return type allows Optional[str] for cells
  """
  Extract a table from this region.
@@ -1323,6 +1333,11 @@ class Region(
  and returns its string content. Overrides default text extraction
  for the 'text' method.
  show_progress: If True, display a progress bar during cell text extraction for the 'text' method.
+ content_filter: Optional content filter to apply during cell text extraction. Can be:
+ - A regex pattern string (characters matching the pattern are EXCLUDED)
+ - A callable that takes text and returns True to KEEP the character
+ - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
+ Works with all extraction methods by filtering cell content.

  Returns:
  Table data as a list of rows, where each row is a list of cell values (str or None).
@@ -1358,7 +1373,7 @@ class Region(
  logger.debug(
  f"Region {self.bbox}: Found {len(cell_regions_in_table)} pre-computed table_cell regions – using 'cells' method."
  )
- return TableResult(self._extract_table_from_cells(cell_regions_in_table))
+ return TableResult(self._extract_table_from_cells(cell_regions_in_table, content_filter=content_filter))

  # --------------------------------------------------------------- #

@@ -1439,14 +1454,15 @@ class Region(

  # Use the selected method
  if effective_method == "tatr":
- table_rows = self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config)
+ table_rows = self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config, content_filter=content_filter)
  elif effective_method == "text":
  current_text_options = text_options.copy()
  current_text_options["cell_extraction_func"] = cell_extraction_func
  current_text_options["show_progress"] = show_progress
+ current_text_options["content_filter"] = content_filter
  table_rows = self._extract_table_text(**current_text_options)
  elif effective_method == "pdfplumber":
- table_rows = self._extract_table_plumber(table_settings)
+ table_rows = self._extract_table_plumber(table_settings, content_filter=content_filter)
  else:
  raise ValueError(
  f"Unknown table extraction method: '{method}'. Choose from 'tatr', 'pdfplumber', 'text', 'stream', 'lattice'."
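
Continuing the sketch above, the same filter is applied to each cell during table extraction; 'pdfplumber' is one of the documented method choices, and iterating the TableResult row by row is how the library itself consumes it elsewhere in this diff. The patterns are illustrative:

    result = region.extract_table(
        method="pdfplumber",
        content_filter=[r"[$€£]", r"\*"],   # strip currency symbols and asterisks from cell text
    )
    rows = list(result)                     # TableResult is iterable over rows
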
@@ -1603,13 +1619,14 @@ class Region(
  # Return the tables or an empty list if none found
  return tables if tables else []

- def _extract_table_plumber(self, table_settings: dict) -> List[List[str]]:
+ def _extract_table_plumber(self, table_settings: dict, content_filter=None) -> List[List[str]]:
  """
  Extract table using pdfplumber's table extraction.
  This method extracts the largest table within the region.

  Args:
  table_settings: Settings for pdfplumber table extraction
+ content_filter: Optional content filter to apply to cell values

  Returns:
  Table data as a list of rows, where each row is a list of cell values
@@ -1645,16 +1662,31 @@ class Region(

  # Return the table or an empty list if none found
  if table:
+ # Apply content filtering if provided
+ if content_filter is not None:
+ filtered_table = []
+ for row in table:
+ filtered_row = []
+ for cell in row:
+ if cell is not None:
+ # Apply content filter to cell text
+ filtered_cell = self._apply_content_filter_to_text(cell, content_filter)
+ filtered_row.append(filtered_cell)
+ else:
+ filtered_row.append(cell)
+ filtered_table.append(filtered_row)
+ return filtered_table
  return table
  return []

- def _extract_table_tatr(self, use_ocr=False, ocr_config=None) -> List[List[str]]:
+ def _extract_table_tatr(self, use_ocr=False, ocr_config=None, content_filter=None) -> List[List[str]]:
  """
  Extract table using TATR structure detection.

  Args:
  use_ocr: Whether to apply OCR to each cell for better text extraction
  ocr_config: Optional OCR configuration parameters
+ content_filter: Optional content filter to apply to cell values

  Returns:
  Table data as a list of rows, where each row is a list of cell values
@@ -1734,7 +1766,10 @@ class Region(
  continue

  # Fallback to normal extraction
- header_texts.append(header.extract_text().strip())
+ header_text = header.extract_text().strip()
+ if content_filter is not None:
+ header_text = self._apply_content_filter_to_text(header_text, content_filter)
+ header_texts.append(header_text)
  table_data.append(header_texts)

  # Process rows
@@ -1767,6 +1802,8 @@ class Region(

  # Fallback to normal extraction
  cell_text = cell_region.extract_text().strip()
+ if content_filter is not None:
+ cell_text = self._apply_content_filter_to_text(cell_text, content_filter)
  row_cells.append(cell_text)
  else:
  # No column information, just extract the whole row text
@@ -1780,7 +1817,10 @@ class Region(
  continue

  # Fallback to normal extraction
- row_cells.append(row.extract_text().strip())
+ row_text = row.extract_text().strip()
+ if content_filter is not None:
+ row_text = self._apply_content_filter_to_text(row_text, content_filter)
+ row_cells.append(row_text)

  table_data.append(row_cells)

@@ -1793,7 +1833,7 @@ class Region(
  Args:
  **text_options: Options passed to analyze_text_table_structure,
  plus optional 'cell_extraction_func', 'coordinate_grouping_tolerance',
- and 'show_progress'.
+ 'show_progress', and 'content_filter'.

  Returns:
  Table data as list of lists of strings (or None for empty cells).
@@ -1801,6 +1841,8 @@ class Region(
  cell_extraction_func = text_options.pop("cell_extraction_func", None)
  # --- Get show_progress option --- #
  show_progress = text_options.pop("show_progress", False)
+ # --- Get content_filter option --- #
+ content_filter = text_options.pop("content_filter", None)

  # Analyze structure first (or use cached results)
  if "text_table_structure" in self.analyses:
@@ -1881,7 +1923,7 @@ class Region(
  cell_value = None
  else:
  cell_value = cell_region.extract_text(
- layout=False, apply_exclusions=False
+ layout=False, apply_exclusions=False, content_filter=content_filter
  ).strip()

  rounded_top = round(cell_data["top"] / coord_tolerance) * coord_tolerance
@@ -3356,12 +3398,16 @@ class Region(
  # New helper: build table from pre-computed table_cell regions
  # ------------------------------------------------------------------

- def _extract_table_from_cells(self, cell_regions: List["Region"]) -> List[List[Optional[str]]]:
+ def _extract_table_from_cells(self, cell_regions: List["Region"], content_filter=None) -> List[List[Optional[str]]]:
  """Construct a table (list-of-lists) from table_cell regions.

  This assumes each cell Region has metadata.row_index / col_index as written by
  detect_table_structure_from_lines(). If these keys are missing we will
  fall back to sorting by geometry.
+
+ Args:
+ cell_regions: List of table cell Region objects to extract text from
+ content_filter: Optional content filter to apply to cell text extraction
  """
  if not cell_regions:
  return []
@@ -3392,7 +3438,7 @@ class Region(
  try:
  r_idx = int(cell.metadata.get("row_index"))
  c_idx = int(cell.metadata.get("col_index"))
- text_val = cell.extract_text(layout=False, apply_exclusions=False).strip()
+ text_val = cell.extract_text(layout=False, apply_exclusions=False, content_filter=content_filter).strip()
  table_grid[r_idx][c_idx] = text_val if text_val else None
  except Exception as _err:
  # Skip problematic cell
@@ -3439,7 +3485,53 @@ class Region(
  row_idx = int(np.argmin([abs(cy - rc) for rc in row_centers]))
  col_idx = int(np.argmin([abs(cx - cc) for cc in col_centers]))

- text_val = cell.extract_text(layout=False, apply_exclusions=False).strip()
+ text_val = cell.extract_text(layout=False, apply_exclusions=False, content_filter=content_filter).strip()
  table_grid[row_idx][col_idx] = text_val if text_val else None

  return table_grid
+
+ def _apply_content_filter_to_text(self, text: str, content_filter) -> str:
+ """
+ Apply content filter to a text string.
+
+ Args:
+ text: Input text string
+ content_filter: Content filter (regex, callable, or list of regexes)
+
+ Returns:
+ Filtered text string
+ """
+ if not text or content_filter is None:
+ return text
+
+ import re
+
+ if isinstance(content_filter, str):
+ # Single regex pattern - remove matching parts
+ try:
+ return re.sub(content_filter, '', text)
+ except re.error:
+ return text # Invalid regex, return original
+
+ elif isinstance(content_filter, list):
+ # List of regex patterns - remove parts matching ANY pattern
+ try:
+ result = text
+ for pattern in content_filter:
+ result = re.sub(pattern, '', result)
+ return result
+ except re.error:
+ return text # Invalid regex, return original
+
+ elif callable(content_filter):
+ # Callable filter - apply to individual characters
+ try:
+ filtered_chars = []
+ for char in text:
+ if content_filter(char):
+ filtered_chars.append(char)
+ return ''.join(filtered_chars)
+ except Exception:
+ return text # Function error, return original
+
+ return text
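
To make the three filter modes of the new _apply_content_filter_to_text helper concrete, a standalone sketch on a plain string (the sample value is illustrative): string and list filters remove matching substrings via re.sub, while a callable is applied character by character.

    import re

    sample = "Qty † 1,234"

    # 1. Single regex pattern: matching parts are removed
    re.sub(r"†", "", sample)                                        # -> "Qty  1,234"

    # 2. List of patterns: parts matching ANY pattern are removed
    out = sample
    for pattern in [r"†", r","]:
        out = re.sub(pattern, "", out)                              # out -> "Qty  1234"

    # 3. Callable: applied per character, True means KEEP
    "".join(ch for ch in sample if ch.isdigit() or ch.isspace())    # -> "  1234"
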
@@ -230,7 +230,7 @@ class TextElement(Element):
  # Default to black
  return (0, 0, 0)

- def extract_text(self, keep_blank_chars=True, strip: Optional[bool] = True, **kwargs) -> str:
+ def extract_text(self, keep_blank_chars=True, strip: Optional[bool] = True, content_filter=None, **kwargs) -> str:
  """
  Extract text from this element.

@@ -238,14 +238,48 @@ class TextElement(Element):
  keep_blank_chars: Retained for API compatibility (unused).
  strip: If True (default) remove leading/trailing whitespace. Users may
  pass ``strip=False`` to preserve whitespace exactly as stored.
+ content_filter: Optional content filter to exclude specific text patterns. Can be:
+ - A regex pattern string (characters matching the pattern are EXCLUDED)
+ - A callable that takes text and returns True to KEEP the character
+ - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
  **kwargs: Accepted for forward-compatibility and ignored here.

  Returns:
- The text content, optionally stripped.
+ The text content, optionally stripped and filtered.
  """
  # Basic retrieval
  result = self.text or ""

+ # Apply content filtering if provided
+ if content_filter is not None and result:
+ import re
+
+ if isinstance(content_filter, str):
+ # Single regex pattern - remove matching characters
+ try:
+ result = re.sub(content_filter, '', result)
+ except re.error:
+ pass # Invalid regex, skip filtering
+
+ elif isinstance(content_filter, list):
+ # List of regex patterns - remove characters matching ANY pattern
+ try:
+ for pattern in content_filter:
+ result = re.sub(pattern, '', result)
+ except re.error:
+ pass # Invalid regex, skip filtering
+
+ elif callable(content_filter):
+ # Callable filter - apply to individual characters
+ try:
+ filtered_chars = []
+ for char in result:
+ if content_filter(char):
+ filtered_chars.append(char)
+ result = ''.join(filtered_chars)
+ except Exception:
+ pass # Function error, skip filtering
+
  # Apply optional stripping – align with global convention where simple
  # element extraction is stripped by default.
  if strip:
@@ -1,11 +1,13 @@
  import logging
  from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

- from pdfplumber.utils.geometry import objects_to_bbox # For calculating combined bbox
+ from pdfplumber.utils.geometry import merge_bboxes # Import merge_bboxes directly

  # For runtime image manipulation
  from PIL import Image as PIL_Image_Runtime

+ from natural_pdf.tables import TableResult
+
  if TYPE_CHECKING:
  from PIL.Image import Image as PIL_Image # For type hints

@@ -53,28 +55,46 @@ class FlowRegion:
  self.source_flow_element: "FlowElement" = source_flow_element
  self.boundary_element_found: Optional["PhysicalElement"] = boundary_element_found

+ # Add attributes for grid building, similar to Region
+ self.source: Optional[str] = None
+ self.region_type: Optional[str] = None
+ self.metadata: Dict[str, Any] = {}
+
  # Cache for expensive operations
  self._cached_text: Optional[str] = None
  self._cached_elements: Optional["ElementCollection"] = None # Stringized
  self._cached_bbox: Optional[Tuple[float, float, float, float]] = None

+ def __getattr__(self, name: str) -> Any:
+ """
+ Dynamically proxy attribute access to the source FlowElement if the
+ attribute is not found in this instance.
+ """
+ if name in self.__dict__:
+ return self.__dict__[name]
+ elif self.source_flow_element is not None:
+ return getattr(self.source_flow_element, name)
+ else:
+ raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
+
  @property
  def bbox(self) -> Optional[Tuple[float, float, float, float]]:
  """
- Calculates a conceptual bounding box that encompasses all constituent physical regions.
- This is the union of the bounding boxes of the constituent regions in their
- original physical coordinates.
- Returns None if there are no constituent regions.
+ The bounding box that encloses all constituent regions.
+ Calculated dynamically and cached.
  """
  if self._cached_bbox is not None:
  return self._cached_bbox
  if not self.constituent_regions:
  return None

- # Use objects_to_bbox from pdfplumber.utils.geometry to merge bboxes
- # This helper expects a list of objects that have .x0, .top, .x1, .bottom attributes.
- # Our PhysicalRegion objects satisfy this.
- self._cached_bbox = objects_to_bbox(self.constituent_regions)
+ # Use merge_bboxes from pdfplumber.utils.geometry to merge bboxes
+ # Extract bbox tuples from regions first
+ region_bboxes = [region.bbox for region in self.constituent_regions if hasattr(region, "bbox")]
+ if not region_bboxes:
+ return None
+
+ self._cached_bbox = merge_bboxes(region_bboxes)
  return self._cached_bbox

  @property
@@ -200,22 +220,72 @@ class FlowRegion:
  self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs
  ) -> Optional["PhysicalElement"]: # Stringized
  """
- Finds the first physical element within this FlowRegion that matches the selector or text.
+ Find the first element in flow order that matches the selector or text.
+
+ This implementation iterates through the constituent regions *in the order
+ they appear in ``self.constituent_regions`` (i.e. document flow order),
+ delegating the search to each region's own ``find`` method. It therefore
+ avoids constructing a huge intermediate ElementCollection and returns as
+ soon as a match is found, which is substantially faster and ensures that
+ selectors such as 'table' work exactly as they do on an individual
+ Region.
  """
- # Uses self.elements() which respects exclusions if apply_exclusions=True by default
- all_elems = self.elements(apply_exclusions=kwargs.get("apply_exclusions", True))
- return all_elems.find(selector=selector, text=text, **kwargs) # ElementCollection.find
+ if not self.constituent_regions:
+ return None
+
+ for region in self.constituent_regions:
+ try:
+ result = region.find(selector=selector, text=text, **kwargs)
+ if result is not None:
+ return result
+ except Exception as e:
+ logger.warning(
+ f"FlowRegion.find: error searching region {region}: {e}",
+ exc_info=False,
+ )
+ return None # No match found

  def find_all(
  self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs
  ) -> "ElementCollection": # Stringized
  """
- Finds all physical elements within this FlowRegion that match the selector or text.
+ Find **all** elements across the constituent regions that match the given
+ selector or text.
+
+ Rather than first materialising *every* element in the FlowRegion (which
+ can be extremely slow for multi-page flows), this implementation simply
+ chains each region's native ``find_all`` call and concatenates their
+ results into a single ElementCollection while preserving flow order.
  """
- all_elems = self.elements(apply_exclusions=kwargs.get("apply_exclusions", True))
- return all_elems.find_all(
+ from natural_pdf.elements.collections import (
+ ElementCollection as RuntimeElementCollection,
+ )
+
+ matched_elements = [] # type: List["PhysicalElement"]
+
+ if not self.constituent_regions:
+ return RuntimeElementCollection([])
+
+ for region in self.constituent_regions:
+ try:
+ region_matches = region.find_all(
  selector=selector, text=text, **kwargs
- ) # ElementCollection.find_all
+ )
+ if region_matches:
+ # ``region_matches`` is an ElementCollection – extend with its
+ # underlying list so we don't create nested collections.
+ matched_elements.extend(
+ region_matches.elements
+ if hasattr(region_matches, "elements")
+ else list(region_matches)
+ )
+ except Exception as e:
+ logger.warning(
+ f"FlowRegion.find_all: error searching region {region}: {e}",
+ exc_info=False,
+ )
+
+ return RuntimeElementCollection(matched_elements)

  def highlight(
  self, label: Optional[str] = None, color: Optional[Union[Tuple, str]] = None, **kwargs
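
A usage sketch for the reworked FlowRegion.find/find_all: the flow_region variable is illustrative, the 'table' selector is the case called out in the new docstring, and the text selector mirrors the examples in the selector module's own header.

    first_table = flow_region.find("table")                    # stops at the first match, in flow order
    totals = flow_region.find_all('text:contains("Total")')    # one ElementCollection spanning all regions
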
@@ -253,6 +323,7 @@ class FlowRegion:
  stack_direction: str = "vertical",
  stack_gap: int = 5,
  stack_background_color: Tuple[int, int, int] = (255, 255, 255),
+ crop: bool = False,
  **kwargs,
  ) -> Optional["PIL_Image"]:
  """
@@ -269,6 +340,7 @@ class FlowRegion:
  stack_direction: Direction to stack multiple pages ('vertical' or 'horizontal').
  stack_gap: Gap in pixels between stacked pages.
  stack_background_color: RGB background color for the stacked image.
+ crop: If True, crop each rendered page to the bounding box of constituent regions on that page.
  **kwargs: Additional arguments passed to the underlying rendering methods.

  Returns:
@@ -358,6 +430,16 @@ class FlowRegion:
  if not temp_highlights_for_page:
  continue

+ # Calculate crop bbox if cropping is enabled
+ crop_bbox = None
+ if crop and constituent_regions_on_this_page:
+ # Calculate the bounding box that encompasses all constituent regions on this page
+ min_x0 = min(region.bbox[0] for region in constituent_regions_on_this_page)
+ min_y0 = min(region.bbox[1] for region in constituent_regions_on_this_page)
+ max_x1 = max(region.bbox[2] for region in constituent_regions_on_this_page)
+ max_y1 = max(region.bbox[3] for region in constituent_regions_on_this_page)
+ crop_bbox = (min_x0, min_y0, max_x1, max_y1)
+
  page_image = highlighter_service.render_preview(
  page_index=(
  page_obj.index
@@ -369,6 +451,7 @@ class FlowRegion:
  width=width,
  labels=labels, # Pass through labels
  legend_position=legend_position,
+ crop_bbox=crop_bbox,
  **kwargs,
  )
  if page_image:
@@ -549,7 +632,7 @@ class FlowRegion:
  cell_extraction_func: Optional[Callable[["PhysicalRegion"], Optional[str]]] = None,
  show_progress: bool = False,
  **kwargs,
- ) -> List[List[Optional[str]]]:
+ ) -> TableResult:
  """Extracts a single logical table from the FlowRegion.

  This is a convenience wrapper that iterates through the constituent
@@ -565,9 +648,9 @@ class FlowRegion:
  ``Region.extract_table`` implementation.

  Returns:
- A list of rows (``List[List[Optional[str]]]``). Rows returned from
+ A TableResult object containing the aggregated table data. Rows returned from
  consecutive constituent regions are appended in document order. If
- no tables are detected in any region, an empty list is returned.
+ no tables are detected in any region, an empty TableResult is returned.
  """

  if table_settings is None:
@@ -576,13 +659,13 @@ class FlowRegion:
  text_options = {}

  if not self.constituent_regions:
- return []
+ return TableResult([])

  aggregated_rows: List[List[Optional[str]]] = []

  for region in self.constituent_regions:
  try:
- region_rows = region.extract_table(
+ region_result = region.extract_table(
  method=method,
  table_settings=table_settings.copy(), # Avoid side-effects
  use_ocr=use_ocr,
@@ -593,16 +676,16 @@ class FlowRegion:
  **kwargs,
  )

- # ``region_rows`` can legitimately be [] if no table found.
- if region_rows:
- aggregated_rows.extend(region_rows)
+ # region_result is now a TableResult object, extract the rows
+ if region_result:
+ aggregated_rows.extend(region_result)
  except Exception as e:
  logger.error(
  f"FlowRegion.extract_table: Error extracting table from constituent region {region}: {e}",
  exc_info=True,
  )

- return aggregated_rows
+ return TableResult(aggregated_rows)

  def extract_tables(
  self,
@@ -649,3 +732,22 @@ class FlowRegion:
  )

  return all_tables
+
+ @property
+ def normalized_type(self) -> Optional[str]:
+ """
+ Return the normalized type for selector compatibility.
+ This allows FlowRegion to be found by selectors like 'table'.
+ """
+ if self.region_type:
+ # Convert region_type to normalized format (replace spaces with underscores, lowercase)
+ return self.region_type.lower().replace(" ", "_")
+ return None
+
+ @property
+ def type(self) -> Optional[str]:
+ """
+ Return the type attribute for selector compatibility.
+ This is an alias for normalized_type.
+ """
+ return self.normalized_type
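
Because FlowRegion.extract_table now returns a TableResult rather than a raw list, callers that iterate over rows keep working; a hedged sketch (flow_region is illustrative, and row iteration is the behaviour the hunk above itself relies on via aggregated_rows.extend):

    result = flow_region.extract_table()
    for row in result:            # TableResult iterates over rows
        print(row)
    rows = list(result)           # materialise as List[List[Optional[str]]] if a plain list is needed
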
@@ -24,6 +24,7 @@ This enables powerful document navigation like:
  - page.find('text[size>12]:bold:contains("Summary")')
  - page.find_all('rect[color~="red"]:above(text:contains("Total"))')
  - page.find('text:regex("[0-9]{4}-[0-9]{2}-[0-9]{2}")')
+ - page.find('text:regex("[\u2500-\u257F]")') # Box drawing characters
  """

  import ast
@@ -748,6 +749,29 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any

  filter_lambda = contains_check

+ # --- Handle :regex pseudo-class (same as :contains with regex=True) ---
+ elif name == "regex" and args is not None:
+ ignore_case = not kwargs.get("case", True) # Default case sensitive
+ filter_name = f"pseudo-class :regex({args!r}, ignore_case={ignore_case})"
+
+ def regex_check(element, args=args, ignore_case=ignore_case):
+ if not hasattr(element, "text") or not element.text:
+ return False # Element must have non-empty text
+
+ element_text = element.text
+ search_term = str(args) # Ensure args is string
+
+ try:
+ pattern = re.compile(search_term, re.IGNORECASE if ignore_case else 0)
+ return bool(pattern.search(element_text))
+ except re.error as e:
+ logger.warning(
+ f"Invalid regex '{search_term}' in :regex selector: {e}. Returning False."
+ )
+ return False
+
+ filter_lambda = regex_check
+
  # --- Handle :startswith and :starts-with (alias) --- #
  elif name in ("starts-with", "startswith") and args is not None:
  filter_name = f"pseudo-class :{name}({args!r})"
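
A usage sketch for the new :regex pseudo-class, following the examples in the module header above; passing case=False to control ignore_case is inferred from the kwargs.get("case", True) default in the hunk and is an assumption:

    dates = page.find_all('text:regex("[0-9]{4}-[0-9]{2}-[0-9]{2}")')
    boxes = page.find_all('text:regex("[\u2500-\u257F]")')        # box-drawing characters
    totals = page.find_all('text:regex("total")', case=False)     # assumed: case=False flips ignore_case
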
@@ -0,0 +1,26 @@
+ from typing import List, Optional, Tuple
+
+
+ def merge_bboxes(
+ bboxes: List[Optional[Tuple[float, float, float, float]]]
+ ) -> Optional[Tuple[float, float, float, float]]:
+ """
+ Merge multiple bounding boxes into a single one that encompasses all of them.
+
+ Args:
+ bboxes: A list of bbox tuples (x0, top, x1, bottom). Can contain None values.
+
+ Returns:
+ A single merged bbox tuple, or None if no valid bboxes are provided.
+ """
+ if not bboxes:
+ return None
+
+ # Filter out None or invalid bboxes
+ valid_bboxes = [b for b in bboxes if b and len(b) == 4]
+ if not valid_bboxes:
+ return None
+
+ x0s, tops, x1s, bottoms = zip(*valid_bboxes)
+
+ return (min(x0s), min(tops), max(x1s), max(bottoms))
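
A quick sanity check of the new merge_bboxes helper (its module path within the package is not shown in this diff, so the call is given without an import):

    merge_bboxes([(10, 20, 110, 40), None, (50, 10, 200, 90)])
    # -> (10, 10, 200, 90): min x0/top and max x1/bottom across the valid boxes

    merge_bboxes([None])
    # -> None: no valid bboxes to merge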