natural-pdf 0.1.36__py3-none-any.whl → 0.1.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1221,7 +1221,7 @@ class Region(
1221
1221
  # Filter to elements in this region
1222
1222
  return [e for e in page_elements if self._is_element_in_region(e)]
1223
1223
 
1224
- def extract_text(self, apply_exclusions=True, debug=False, **kwargs) -> str:
1224
+ def extract_text(self, apply_exclusions=True, debug=False, content_filter=None, **kwargs) -> str:
1225
1225
  """
1226
1226
  Extract text from this region, respecting page exclusions and using pdfplumber's
1227
1227
  layout engine (chars_to_textmap).
@@ -1229,6 +1229,10 @@ class Region(
1229
1229
  Args:
1230
1230
  apply_exclusions: Whether to apply exclusion regions defined on the parent page.
1231
1231
  debug: Enable verbose debugging output for filtering steps.
1232
+ content_filter: Optional content filter to exclude specific text patterns. Can be:
1233
+ - A regex pattern string (characters matching the pattern are EXCLUDED)
1234
+ - A callable that takes text and returns True to KEEP the character
1235
+ - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
1232
1236
  **kwargs: Additional layout parameters passed directly to pdfplumber's
1233
1237
  `chars_to_textmap` function (e.g., layout, x_density, y_density).
1234
1238
  See Page.extract_text docstring for more.
@@ -1285,10 +1289,15 @@ class Region(
1285
1289
  )
1286
1290
 
1287
1291
  # 5. Generate Text Layout using Utility
1292
+ # Add content_filter to kwargs if provided
1293
+ final_kwargs = kwargs.copy()
1294
+ if content_filter is not None:
1295
+ final_kwargs["content_filter"] = content_filter
1296
+
1288
1297
  result = generate_text_layout(
1289
1298
  char_dicts=filtered_chars,
1290
1299
  layout_context_bbox=self.bbox, # Use region's bbox for context
1291
- user_kwargs=kwargs, # Pass original kwargs to layout generator
1300
+ user_kwargs=final_kwargs, # Pass kwargs including content_filter
1292
1301
  )
1293
1302
 
1294
1303
  logger.debug(f"Region {self.bbox}: extract_text finished, result length: {len(result)}.")
@@ -1304,6 +1313,7 @@ class Region(
1304
1313
  cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
1305
1314
  # --- NEW: Add tqdm control option --- #
1306
1315
  show_progress: bool = False, # Controls progress bar for text method
1316
+ content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None, # NEW: Content filtering
1307
1317
  ) -> TableResult: # Return type allows Optional[str] for cells
1308
1318
  """
1309
1319
  Extract a table from this region.
@@ -1323,6 +1333,11 @@ class Region(
1323
1333
  and returns its string content. Overrides default text extraction
1324
1334
  for the 'text' method.
1325
1335
  show_progress: If True, display a progress bar during cell text extraction for the 'text' method.
1336
+ content_filter: Optional content filter to apply during cell text extraction. Can be:
1337
+ - A regex pattern string (characters matching the pattern are EXCLUDED)
1338
+ - A callable that takes text and returns True to KEEP the character
1339
+ - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
1340
+ Works with all extraction methods by filtering cell content.
1326
1341
 
1327
1342
  Returns:
1328
1343
  Table data as a list of rows, where each row is a list of cell values (str or None).
@@ -1358,7 +1373,7 @@ class Region(
1358
1373
  logger.debug(
1359
1374
  f"Region {self.bbox}: Found {len(cell_regions_in_table)} pre-computed table_cell regions – using 'cells' method."
1360
1375
  )
1361
- return TableResult(self._extract_table_from_cells(cell_regions_in_table))
1376
+ return TableResult(self._extract_table_from_cells(cell_regions_in_table, content_filter=content_filter))
1362
1377
 
1363
1378
  # --------------------------------------------------------------- #
1364
1379
 
@@ -1439,14 +1454,15 @@ class Region(
1439
1454
 
1440
1455
  # Use the selected method
1441
1456
  if effective_method == "tatr":
1442
- table_rows = self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config)
1457
+ table_rows = self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config, content_filter=content_filter)
1443
1458
  elif effective_method == "text":
1444
1459
  current_text_options = text_options.copy()
1445
1460
  current_text_options["cell_extraction_func"] = cell_extraction_func
1446
1461
  current_text_options["show_progress"] = show_progress
1462
+ current_text_options["content_filter"] = content_filter
1447
1463
  table_rows = self._extract_table_text(**current_text_options)
1448
1464
  elif effective_method == "pdfplumber":
1449
- table_rows = self._extract_table_plumber(table_settings)
1465
+ table_rows = self._extract_table_plumber(table_settings, content_filter=content_filter)
1450
1466
  else:
1451
1467
  raise ValueError(
1452
1468
  f"Unknown table extraction method: '{method}'. Choose from 'tatr', 'pdfplumber', 'text', 'stream', 'lattice'."
@@ -1600,16 +1616,35 @@ class Region(
1600
1616
  # Extract all tables from the cropped area
1601
1617
  tables = cropped.extract_tables(table_settings)
1602
1618
 
1603
- # Return the tables or an empty list if none found
1604
- return tables if tables else []
1619
+ # Apply RTL text processing to all tables
1620
+ if tables:
1621
+ processed_tables = []
1622
+ for table in tables:
1623
+ processed_table = []
1624
+ for row in table:
1625
+ processed_row = []
1626
+ for cell in row:
1627
+ if cell is not None:
1628
+ # Apply RTL text processing to each cell
1629
+ rtl_processed_cell = self._apply_rtl_processing_to_text(cell)
1630
+ processed_row.append(rtl_processed_cell)
1631
+ else:
1632
+ processed_row.append(cell)
1633
+ processed_table.append(processed_row)
1634
+ processed_tables.append(processed_table)
1635
+ return processed_tables
1636
+
1637
+ # Return empty list if no tables found
1638
+ return []
1605
1639
 
1606
- def _extract_table_plumber(self, table_settings: dict) -> List[List[str]]:
1640
+ def _extract_table_plumber(self, table_settings: dict, content_filter=None) -> List[List[str]]:
1607
1641
  """
1608
1642
  Extract table using pdfplumber's table extraction.
1609
1643
  This method extracts the largest table within the region.
1610
1644
 
1611
1645
  Args:
1612
1646
  table_settings: Settings for pdfplumber table extraction
1647
+ content_filter: Optional content filter to apply to cell values
1613
1648
 
1614
1649
  Returns:
1615
1650
  Table data as a list of rows, where each row is a list of cell values
@@ -1645,16 +1680,35 @@ class Region(
1645
1680
 
1646
1681
  # Return the table or an empty list if none found
1647
1682
  if table:
1648
- return table
1683
+ # Apply RTL text processing and content filtering if provided
1684
+ processed_table = []
1685
+ for row in table:
1686
+ processed_row = []
1687
+ for cell in row:
1688
+ if cell is not None:
1689
+ # Apply RTL text processing first
1690
+ rtl_processed_cell = self._apply_rtl_processing_to_text(cell)
1691
+
1692
+ # Then apply content filter if provided
1693
+ if content_filter is not None:
1694
+ filtered_cell = self._apply_content_filter_to_text(rtl_processed_cell, content_filter)
1695
+ processed_row.append(filtered_cell)
1696
+ else:
1697
+ processed_row.append(rtl_processed_cell)
1698
+ else:
1699
+ processed_row.append(cell)
1700
+ processed_table.append(processed_row)
1701
+ return processed_table
1649
1702
  return []
1650
1703
 
1651
- def _extract_table_tatr(self, use_ocr=False, ocr_config=None) -> List[List[str]]:
1704
+ def _extract_table_tatr(self, use_ocr=False, ocr_config=None, content_filter=None) -> List[List[str]]:
1652
1705
  """
1653
1706
  Extract table using TATR structure detection.
1654
1707
 
1655
1708
  Args:
1656
1709
  use_ocr: Whether to apply OCR to each cell for better text extraction
1657
1710
  ocr_config: Optional OCR configuration parameters
1711
+ content_filter: Optional content filter to apply to cell values
1658
1712
 
1659
1713
  Returns:
1660
1714
  Table data as a list of rows, where each row is a list of cell values
@@ -1734,7 +1788,10 @@ class Region(
1734
1788
  continue
1735
1789
 
1736
1790
  # Fallback to normal extraction
1737
- header_texts.append(header.extract_text().strip())
1791
+ header_text = header.extract_text().strip()
1792
+ if content_filter is not None:
1793
+ header_text = self._apply_content_filter_to_text(header_text, content_filter)
1794
+ header_texts.append(header_text)
1738
1795
  table_data.append(header_texts)
1739
1796
 
1740
1797
  # Process rows
@@ -1767,6 +1824,8 @@ class Region(
1767
1824
 
1768
1825
  # Fallback to normal extraction
1769
1826
  cell_text = cell_region.extract_text().strip()
1827
+ if content_filter is not None:
1828
+ cell_text = self._apply_content_filter_to_text(cell_text, content_filter)
1770
1829
  row_cells.append(cell_text)
1771
1830
  else:
1772
1831
  # No column information, just extract the whole row text
@@ -1780,7 +1839,10 @@ class Region(
1780
1839
  continue
1781
1840
 
1782
1841
  # Fallback to normal extraction
1783
- row_cells.append(row.extract_text().strip())
1842
+ row_text = row.extract_text().strip()
1843
+ if content_filter is not None:
1844
+ row_text = self._apply_content_filter_to_text(row_text, content_filter)
1845
+ row_cells.append(row_text)
1784
1846
 
1785
1847
  table_data.append(row_cells)
1786
1848
 
@@ -1793,7 +1855,7 @@ class Region(
1793
1855
  Args:
1794
1856
  **text_options: Options passed to analyze_text_table_structure,
1795
1857
  plus optional 'cell_extraction_func', 'coordinate_grouping_tolerance',
1796
- and 'show_progress'.
1858
+ 'show_progress', and 'content_filter'.
1797
1859
 
1798
1860
  Returns:
1799
1861
  Table data as list of lists of strings (or None for empty cells).
@@ -1801,6 +1863,8 @@ class Region(
1801
1863
  cell_extraction_func = text_options.pop("cell_extraction_func", None)
1802
1864
  # --- Get show_progress option --- #
1803
1865
  show_progress = text_options.pop("show_progress", False)
1866
+ # --- Get content_filter option --- #
1867
+ content_filter = text_options.pop("content_filter", None)
1804
1868
 
1805
1869
  # Analyze structure first (or use cached results)
1806
1870
  if "text_table_structure" in self.analyses:
@@ -1881,7 +1945,7 @@ class Region(
1881
1945
  cell_value = None
1882
1946
  else:
1883
1947
  cell_value = cell_region.extract_text(
1884
- layout=False, apply_exclusions=False
1948
+ layout=False, apply_exclusions=False, content_filter=content_filter
1885
1949
  ).strip()
1886
1950
 
1887
1951
  rounded_top = round(cell_data["top"] / coord_tolerance) * coord_tolerance
@@ -3356,12 +3420,16 @@ class Region(
3356
3420
  # New helper: build table from pre-computed table_cell regions
3357
3421
  # ------------------------------------------------------------------
3358
3422
 
3359
- def _extract_table_from_cells(self, cell_regions: List["Region"]) -> List[List[Optional[str]]]:
3423
+ def _extract_table_from_cells(self, cell_regions: List["Region"], content_filter=None) -> List[List[Optional[str]]]:
3360
3424
  """Construct a table (list-of-lists) from table_cell regions.
3361
3425
 
3362
3426
  This assumes each cell Region has metadata.row_index / col_index as written by
3363
3427
  detect_table_structure_from_lines(). If these keys are missing we will
3364
3428
  fall back to sorting by geometry.
3429
+
3430
+ Args:
3431
+ cell_regions: List of table cell Region objects to extract text from
3432
+ content_filter: Optional content filter to apply to cell text extraction
3365
3433
  """
3366
3434
  if not cell_regions:
3367
3435
  return []
@@ -3392,7 +3460,7 @@ class Region(
3392
3460
  try:
3393
3461
  r_idx = int(cell.metadata.get("row_index"))
3394
3462
  c_idx = int(cell.metadata.get("col_index"))
3395
- text_val = cell.extract_text(layout=False, apply_exclusions=False).strip()
3463
+ text_val = cell.extract_text(layout=False, apply_exclusions=False, content_filter=content_filter).strip()
3396
3464
  table_grid[r_idx][c_idx] = text_val if text_val else None
3397
3465
  except Exception as _err:
3398
3466
  # Skip problematic cell
@@ -3439,7 +3507,101 @@ class Region(
3439
3507
  row_idx = int(np.argmin([abs(cy - rc) for rc in row_centers]))
3440
3508
  col_idx = int(np.argmin([abs(cx - cc) for cc in col_centers]))
3441
3509
 
3442
- text_val = cell.extract_text(layout=False, apply_exclusions=False).strip()
3510
+ text_val = cell.extract_text(layout=False, apply_exclusions=False, content_filter=content_filter).strip()
3443
3511
  table_grid[row_idx][col_idx] = text_val if text_val else None
3444
3512
 
3445
3513
  return table_grid
3514
+
3515
def _apply_rtl_processing_to_text(self, text: str) -> str:
    """
    Apply RTL (Right-to-Left) BiDi processing to a string, line by line.

    The text is returned unchanged when it is empty, whitespace-only, or
    contains no character whose Unicode bidirectional class is "R", "AL"
    or "AN". Otherwise every non-blank line is passed through
    ``bidi.algorithm.get_display`` and then ``mirror_brackets``; blank lines
    are kept as they are.

    Processing is best-effort: if the optional BiDi dependencies cannot be
    imported, or the conversion itself fails, the original text is returned.

    Args:
        text: Input text string (presumably in visual order, as extracted
            from the PDF - confirm against callers).

    Returns:
        The processed text, or ``text`` unchanged when no processing applies
        or processing is not possible.
    """
    if not text or not text.strip():
        return text

    import unicodedata

    def _contains_rtl(s: str) -> bool:
        # "R" / "AL" / "AN" are the bidirectional classes treated as RTL here.
        return any(unicodedata.bidirectional(ch) in ("R", "AL", "AN") for ch in s)

    # Quick exit: nothing to reorder when no RTL character is present.
    if not _contains_rtl(text):
        return text

    try:
        from bidi.algorithm import get_display  # type: ignore
        from natural_pdf.utils.bidi_mirror import mirror_brackets

        processed_lines = []
        for line in text.split("\n"):
            if line.strip():
                # Base direction is decided per line so mixed content is
                # handled line by line rather than for the whole string.
                base_dir = "R" if _contains_rtl(line) else "L"
                logical_line = get_display(line, base_dir=base_dir)
                processed_lines.append(mirror_brackets(logical_line))
            else:
                processed_lines.append(line)

        return "\n".join(processed_lines)

    except ImportError:
        # Optional BiDi support is not installed: leave the text untouched.
        return text
    except Exception:
        # Keep the best-effort contract, but make the failure discoverable
        # instead of swallowing it silently.
        import logging

        logging.getLogger(__name__).debug(
            "RTL processing failed; returning text unchanged", exc_info=True
        )
        return text
3562
+
3563
def _apply_content_filter_to_text(self, text: str, content_filter) -> str:
    """
    Apply a content filter to a text string.

    Args:
        text: Input text string.
        content_filter: One of:
            - a regex pattern string or compiled pattern; every match is
              removed from the text
            - a list or tuple of such patterns, applied in order; every
              match of any pattern is removed
            - a callable invoked once per character; the character is kept
              when the callable returns a truthy value

    Returns:
        The filtered text. The original text is returned unchanged when it
        is empty, when ``content_filter`` is None or of an unsupported type,
        when a pattern is invalid, or when the callable raises.
    """
    if not text or content_filter is None:
        return text

    import re

    if isinstance(content_filter, (str, re.Pattern)):
        patterns = [content_filter]
    elif isinstance(content_filter, (list, tuple)):
        patterns = content_filter
    elif callable(content_filter):
        try:
            return "".join(char for char in text if content_filter(char))
        except Exception:
            # Deliberate best-effort: a failing user callable must not
            # break extraction, so the text is returned unfiltered.
            return text
    else:
        return text

    result = text
    try:
        for pattern in patterns:
            result = re.sub(pattern, "", result)
    except (re.error, TypeError):
        # Invalid regex or non-pattern entry: discard partial filtering and
        # return the original text, as for any other unusable filter.
        return text
    return result
@@ -230,7 +230,7 @@ class TextElement(Element):
230
230
  # Default to black
231
231
  return (0, 0, 0)
232
232
 
233
- def extract_text(self, keep_blank_chars=True, strip: Optional[bool] = True, **kwargs) -> str:
233
+ def extract_text(self, keep_blank_chars=True, strip: Optional[bool] = True, content_filter=None, **kwargs) -> str:
234
234
  """
235
235
  Extract text from this element.
236
236
 
@@ -238,14 +238,48 @@ class TextElement(Element):
238
238
  keep_blank_chars: Retained for API compatibility (unused).
239
239
  strip: If True (default) remove leading/trailing whitespace. Users may
240
240
  pass ``strip=False`` to preserve whitespace exactly as stored.
241
+ content_filter: Optional content filter to exclude specific text patterns. Can be:
242
+ - A regex pattern string (characters matching the pattern are EXCLUDED)
243
+ - A callable that takes text and returns True to KEEP the character
244
+ - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
241
245
  **kwargs: Accepted for forward-compatibility and ignored here.
242
246
 
243
247
  Returns:
244
- The text content, optionally stripped.
248
+ The text content, optionally stripped and filtered.
245
249
  """
246
250
  # Basic retrieval
247
251
  result = self.text or ""
248
252
 
253
+ # Apply content filtering if provided
254
+ if content_filter is not None and result:
255
+ import re
256
+
257
+ if isinstance(content_filter, str):
258
+ # Single regex pattern - remove matching characters
259
+ try:
260
+ result = re.sub(content_filter, '', result)
261
+ except re.error:
262
+ pass # Invalid regex, skip filtering
263
+
264
+ elif isinstance(content_filter, list):
265
+ # List of regex patterns - remove characters matching ANY pattern
266
+ try:
267
+ for pattern in content_filter:
268
+ result = re.sub(pattern, '', result)
269
+ except re.error:
270
+ pass # Invalid regex, skip filtering
271
+
272
+ elif callable(content_filter):
273
+ # Callable filter - apply to individual characters
274
+ try:
275
+ filtered_chars = []
276
+ for char in result:
277
+ if content_filter(char):
278
+ filtered_chars.append(char)
279
+ result = ''.join(filtered_chars)
280
+ except Exception:
281
+ pass # Function error, skip filtering
282
+
249
283
  # Apply optional stripping – align with global convention where simple
250
284
  # element extraction is stripped by default.
251
285
  if strip:
@@ -1,11 +1,13 @@
1
1
  import logging
2
2
  from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
3
3
 
4
- from pdfplumber.utils.geometry import objects_to_bbox # For calculating combined bbox
4
+ from pdfplumber.utils.geometry import merge_bboxes # Import merge_bboxes directly
5
5
 
6
6
  # For runtime image manipulation
7
7
  from PIL import Image as PIL_Image_Runtime
8
8
 
9
+ from natural_pdf.tables import TableResult
10
+
9
11
  if TYPE_CHECKING:
10
12
  from PIL.Image import Image as PIL_Image # For type hints
11
13
 
@@ -53,28 +55,46 @@ class FlowRegion:
53
55
  self.source_flow_element: "FlowElement" = source_flow_element
54
56
  self.boundary_element_found: Optional["PhysicalElement"] = boundary_element_found
55
57
 
58
+ # Add attributes for grid building, similar to Region
59
+ self.source: Optional[str] = None
60
+ self.region_type: Optional[str] = None
61
+ self.metadata: Dict[str, Any] = {}
62
+
56
63
  # Cache for expensive operations
57
64
  self._cached_text: Optional[str] = None
58
65
  self._cached_elements: Optional["ElementCollection"] = None # Stringized
59
66
  self._cached_bbox: Optional[Tuple[float, float, float, float]] = None
60
67
 
68
def __getattr__(self, name: str) -> Any:
    """
    Proxy attribute access to the source FlowElement when the attribute is
    not found on this instance.

    Python only calls ``__getattr__`` after normal attribute lookup has
    failed, so this is the fallback path.

    Args:
        name: Name of the attribute being looked up.

    Returns:
        The value from this instance's ``__dict__`` if present, otherwise
        the attribute of the same name on ``source_flow_element``.

    Raises:
        AttributeError: If the attribute is not in the instance dict and
            there is no source flow element to delegate to (it is None or
            has not been assigned yet). An AttributeError raised by the
            delegate itself also propagates.
    """
    instance_attrs = self.__dict__
    if name in instance_attrs:
        return instance_attrs[name]

    # Read the delegate from the instance dict, not via ``self.<attr>``:
    # if ``source_flow_element`` has not been assigned yet (for example on
    # an object created with ``__new__``), attribute access would re-enter
    # this method and recurse without bound.
    delegate = instance_attrs.get("source_flow_element")
    if delegate is not None:
        return getattr(delegate, name)

    raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
79
+
61
80
@property
def bbox(self) -> Optional[Tuple[float, float, float, float]]:
    """
    The bounding box that encloses all constituent regions.

    The result is computed on first access by merging the ``bbox`` of every
    constituent region with ``merge_bboxes`` and is then cached in
    ``_cached_bbox``.

    Returns:
        The merged bounding box, or None when there are no constituent
        regions or none of them provides a usable ``bbox``.
    """
    if self._cached_bbox is not None:
        return self._cached_bbox
    if not self.constituent_regions:
        return None

    # Skip regions that lack a ``bbox`` attribute or report None for it;
    # a None entry cannot be merged.
    candidate_bboxes = (
        getattr(region, "bbox", None) for region in self.constituent_regions
    )
    region_bboxes = [box for box in candidate_bboxes if box is not None]
    if not region_bboxes:
        return None

    self._cached_bbox = merge_bboxes(region_bboxes)
    return self._cached_bbox
79
99
 
80
100
  @property
@@ -200,22 +220,72 @@ class FlowRegion:
200
220
  self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs
201
221
  ) -> Optional["PhysicalElement"]: # Stringized
202
222
  """
203
- Finds the first physical element within this FlowRegion that matches the selector or text.
223
+ Find the first element in flow order that matches the selector or text.
224
+
225
+ This implementation iterates through the constituent regions *in the order
226
+ they appear in ``self.constituent_regions`` (i.e. document flow order),
227
+ delegating the search to each region's own ``find`` method. It therefore
228
+ avoids constructing a huge intermediate ElementCollection and returns as
229
+ soon as a match is found, which is substantially faster and ensures that
230
+ selectors such as 'table' work exactly as they do on an individual
231
+ Region.
204
232
  """
205
- # Uses self.elements() which respects exclusions if apply_exclusions=True by default
206
- all_elems = self.elements(apply_exclusions=kwargs.get("apply_exclusions", True))
207
- return all_elems.find(selector=selector, text=text, **kwargs) # ElementCollection.find
233
+ if not self.constituent_regions:
234
+ return None
235
+
236
+ for region in self.constituent_regions:
237
+ try:
238
+ result = region.find(selector=selector, text=text, **kwargs)
239
+ if result is not None:
240
+ return result
241
+ except Exception as e:
242
+ logger.warning(
243
+ f"FlowRegion.find: error searching region {region}: {e}",
244
+ exc_info=False,
245
+ )
246
+ return None # No match found
208
247
 
209
248
def find_all(
    self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs
) -> "ElementCollection":  # Stringized
    """
    Collect every element matching the selector or text across all
    constituent regions.

    Each constituent region is queried through its own ``find_all`` in the
    order given by ``self.constituent_regions``, and the per-region matches
    are concatenated into one flat ElementCollection. A region whose search
    raises is logged as a warning and skipped.

    Args:
        selector: Selector string forwarded to each region's ``find_all``.
        text: Text to match, forwarded to each region's ``find_all``.
        **kwargs: Extra keyword arguments forwarded unchanged.

    Returns:
        An ElementCollection of all matches; empty when there are no
        constituent regions or nothing matched.
    """
    from natural_pdf.elements.collections import (
        ElementCollection as RuntimeElementCollection,
    )

    collected: List["PhysicalElement"] = []

    if not self.constituent_regions:
        return RuntimeElementCollection([])

    for region in self.constituent_regions:
        try:
            hits = region.find_all(selector=selector, text=text, **kwargs)
            if not hits:
                continue
            # Flatten: take the collection's underlying list when it exposes
            # one, so the result never contains nested collections.
            if hasattr(hits, "elements"):
                collected.extend(hits.elements)
            else:
                collected.extend(list(hits))
        except Exception as e:
            logger.warning(
                f"FlowRegion.find_all: error searching region {region}: {e}",
                exc_info=False,
            )

    return RuntimeElementCollection(collected)
219
289
 
220
290
  def highlight(
221
291
  self, label: Optional[str] = None, color: Optional[Union[Tuple, str]] = None, **kwargs
@@ -253,6 +323,7 @@ class FlowRegion:
253
323
  stack_direction: str = "vertical",
254
324
  stack_gap: int = 5,
255
325
  stack_background_color: Tuple[int, int, int] = (255, 255, 255),
326
+ crop: bool = False,
256
327
  **kwargs,
257
328
  ) -> Optional["PIL_Image"]:
258
329
  """
@@ -269,6 +340,7 @@ class FlowRegion:
269
340
  stack_direction: Direction to stack multiple pages ('vertical' or 'horizontal').
270
341
  stack_gap: Gap in pixels between stacked pages.
271
342
  stack_background_color: RGB background color for the stacked image.
343
+ crop: If True, crop each rendered page to the bounding box of constituent regions on that page.
272
344
  **kwargs: Additional arguments passed to the underlying rendering methods.
273
345
 
274
346
  Returns:
@@ -358,6 +430,16 @@ class FlowRegion:
358
430
  if not temp_highlights_for_page:
359
431
  continue
360
432
 
433
+ # Calculate crop bbox if cropping is enabled
434
+ crop_bbox = None
435
+ if crop and constituent_regions_on_this_page:
436
+ # Calculate the bounding box that encompasses all constituent regions on this page
437
+ min_x0 = min(region.bbox[0] for region in constituent_regions_on_this_page)
438
+ min_y0 = min(region.bbox[1] for region in constituent_regions_on_this_page)
439
+ max_x1 = max(region.bbox[2] for region in constituent_regions_on_this_page)
440
+ max_y1 = max(region.bbox[3] for region in constituent_regions_on_this_page)
441
+ crop_bbox = (min_x0, min_y0, max_x1, max_y1)
442
+
361
443
  page_image = highlighter_service.render_preview(
362
444
  page_index=(
363
445
  page_obj.index
@@ -369,6 +451,7 @@ class FlowRegion:
369
451
  width=width,
370
452
  labels=labels, # Pass through labels
371
453
  legend_position=legend_position,
454
+ crop_bbox=crop_bbox,
372
455
  **kwargs,
373
456
  )
374
457
  if page_image:
@@ -549,7 +632,7 @@ class FlowRegion:
549
632
  cell_extraction_func: Optional[Callable[["PhysicalRegion"], Optional[str]]] = None,
550
633
  show_progress: bool = False,
551
634
  **kwargs,
552
- ) -> List[List[Optional[str]]]:
635
+ ) -> TableResult:
553
636
  """Extracts a single logical table from the FlowRegion.
554
637
 
555
638
  This is a convenience wrapper that iterates through the constituent
@@ -565,9 +648,9 @@ class FlowRegion:
565
648
  ``Region.extract_table`` implementation.
566
649
 
567
650
  Returns:
568
- A list of rows (``List[List[Optional[str]]]``). Rows returned from
651
+ A TableResult object containing the aggregated table data. Rows returned from
569
652
  consecutive constituent regions are appended in document order. If
570
- no tables are detected in any region, an empty list is returned.
653
+ no tables are detected in any region, an empty TableResult is returned.
571
654
  """
572
655
 
573
656
  if table_settings is None:
@@ -576,13 +659,13 @@ class FlowRegion:
576
659
  text_options = {}
577
660
 
578
661
  if not self.constituent_regions:
579
- return []
662
+ return TableResult([])
580
663
 
581
664
  aggregated_rows: List[List[Optional[str]]] = []
582
665
 
583
666
  for region in self.constituent_regions:
584
667
  try:
585
- region_rows = region.extract_table(
668
+ region_result = region.extract_table(
586
669
  method=method,
587
670
  table_settings=table_settings.copy(), # Avoid side-effects
588
671
  use_ocr=use_ocr,
@@ -593,16 +676,16 @@ class FlowRegion:
593
676
  **kwargs,
594
677
  )
595
678
 
596
- # ``region_rows`` can legitimately be [] if no table found.
597
- if region_rows:
598
- aggregated_rows.extend(region_rows)
679
+ # region_result is now a TableResult object, extract the rows
680
+ if region_result:
681
+ aggregated_rows.extend(region_result)
599
682
  except Exception as e:
600
683
  logger.error(
601
684
  f"FlowRegion.extract_table: Error extracting table from constituent region {region}: {e}",
602
685
  exc_info=True,
603
686
  )
604
687
 
605
- return aggregated_rows
688
+ return TableResult(aggregated_rows)
606
689
 
607
690
  def extract_tables(
608
691
  self,
@@ -649,3 +732,22 @@ class FlowRegion:
649
732
  )
650
733
 
651
734
  return all_tables
735
+
736
@property
def normalized_type(self) -> Optional[str]:
    """
    Selector-friendly form of ``region_type``.

    Returns:
        ``region_type`` lowercased with spaces replaced by underscores, or
        None when ``region_type`` is unset or empty.
    """
    raw_type = self.region_type
    if not raw_type:
        return None
    return raw_type.lower().replace(" ", "_")
746
+
747
@property
def type(self) -> Optional[str]:
    """
    Alias of ``normalized_type``, exposed under the name ``type``.

    Returns:
        Whatever ``normalized_type`` returns for this object.
    """
    alias_value = self.normalized_type
    return alias_value