natural-pdf 0.1.36__py3-none-any.whl → 0.1.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/guides.py +1053 -26
- natural_pdf/core/page.py +274 -46
- natural_pdf/core/pdf.py +116 -30
- natural_pdf/elements/collections.py +48 -7
- natural_pdf/elements/region.py +179 -17
- natural_pdf/elements/text.py +36 -2
- natural_pdf/flows/region.py +128 -26
- natural_pdf/selectors/parser.py +24 -0
- natural_pdf/utils/layout.py +26 -0
- natural_pdf/utils/text_extraction.py +76 -1
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/RECORD +16 -15
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.38.dist-info}/top_level.txt +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -1221,7 +1221,7 @@ class Region(
|
|
1221
1221
|
# Filter to elements in this region
|
1222
1222
|
return [e for e in page_elements if self._is_element_in_region(e)]
|
1223
1223
|
|
1224
|
-
def extract_text(self, apply_exclusions=True, debug=False, **kwargs) -> str:
|
1224
|
+
def extract_text(self, apply_exclusions=True, debug=False, content_filter=None, **kwargs) -> str:
|
1225
1225
|
"""
|
1226
1226
|
Extract text from this region, respecting page exclusions and using pdfplumber's
|
1227
1227
|
layout engine (chars_to_textmap).
|
@@ -1229,6 +1229,10 @@ class Region(
|
|
1229
1229
|
Args:
|
1230
1230
|
apply_exclusions: Whether to apply exclusion regions defined on the parent page.
|
1231
1231
|
debug: Enable verbose debugging output for filtering steps.
|
1232
|
+
content_filter: Optional content filter to exclude specific text patterns. Can be:
|
1233
|
+
- A regex pattern string (characters matching the pattern are EXCLUDED)
|
1234
|
+
- A callable that takes text and returns True to KEEP the character
|
1235
|
+
- A list of regex patterns (characters matching ANY pattern are EXCLUDED)
|
1232
1236
|
**kwargs: Additional layout parameters passed directly to pdfplumber's
|
1233
1237
|
`chars_to_textmap` function (e.g., layout, x_density, y_density).
|
1234
1238
|
See Page.extract_text docstring for more.
|
@@ -1285,10 +1289,15 @@ class Region(
|
|
1285
1289
|
)
|
1286
1290
|
|
1287
1291
|
# 5. Generate Text Layout using Utility
|
1292
|
+
# Add content_filter to kwargs if provided
|
1293
|
+
final_kwargs = kwargs.copy()
|
1294
|
+
if content_filter is not None:
|
1295
|
+
final_kwargs["content_filter"] = content_filter
|
1296
|
+
|
1288
1297
|
result = generate_text_layout(
|
1289
1298
|
char_dicts=filtered_chars,
|
1290
1299
|
layout_context_bbox=self.bbox, # Use region's bbox for context
|
1291
|
-
user_kwargs=
|
1300
|
+
user_kwargs=final_kwargs, # Pass kwargs including content_filter
|
1292
1301
|
)
|
1293
1302
|
|
1294
1303
|
logger.debug(f"Region {self.bbox}: extract_text finished, result length: {len(result)}.")
|
@@ -1304,6 +1313,7 @@ class Region(
|
|
1304
1313
|
cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
|
1305
1314
|
# --- NEW: Add tqdm control option --- #
|
1306
1315
|
show_progress: bool = False, # Controls progress bar for text method
|
1316
|
+
content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None, # NEW: Content filtering
|
1307
1317
|
) -> TableResult: # Return type allows Optional[str] for cells
|
1308
1318
|
"""
|
1309
1319
|
Extract a table from this region.
|
@@ -1323,6 +1333,11 @@ class Region(
|
|
1323
1333
|
and returns its string content. Overrides default text extraction
|
1324
1334
|
for the 'text' method.
|
1325
1335
|
show_progress: If True, display a progress bar during cell text extraction for the 'text' method.
|
1336
|
+
content_filter: Optional content filter to apply during cell text extraction. Can be:
|
1337
|
+
- A regex pattern string (characters matching the pattern are EXCLUDED)
|
1338
|
+
- A callable that takes text and returns True to KEEP the character
|
1339
|
+
- A list of regex patterns (characters matching ANY pattern are EXCLUDED)
|
1340
|
+
Works with all extraction methods by filtering cell content.
|
1326
1341
|
|
1327
1342
|
Returns:
|
1328
1343
|
Table data as a list of rows, where each row is a list of cell values (str or None).
|
@@ -1358,7 +1373,7 @@ class Region(
|
|
1358
1373
|
logger.debug(
|
1359
1374
|
f"Region {self.bbox}: Found {len(cell_regions_in_table)} pre-computed table_cell regions – using 'cells' method."
|
1360
1375
|
)
|
1361
|
-
return TableResult(self._extract_table_from_cells(cell_regions_in_table))
|
1376
|
+
return TableResult(self._extract_table_from_cells(cell_regions_in_table, content_filter=content_filter))
|
1362
1377
|
|
1363
1378
|
# --------------------------------------------------------------- #
|
1364
1379
|
|
@@ -1439,14 +1454,15 @@ class Region(
|
|
1439
1454
|
|
1440
1455
|
# Use the selected method
|
1441
1456
|
if effective_method == "tatr":
|
1442
|
-
table_rows = self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config)
|
1457
|
+
table_rows = self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config, content_filter=content_filter)
|
1443
1458
|
elif effective_method == "text":
|
1444
1459
|
current_text_options = text_options.copy()
|
1445
1460
|
current_text_options["cell_extraction_func"] = cell_extraction_func
|
1446
1461
|
current_text_options["show_progress"] = show_progress
|
1462
|
+
current_text_options["content_filter"] = content_filter
|
1447
1463
|
table_rows = self._extract_table_text(**current_text_options)
|
1448
1464
|
elif effective_method == "pdfplumber":
|
1449
|
-
table_rows = self._extract_table_plumber(table_settings)
|
1465
|
+
table_rows = self._extract_table_plumber(table_settings, content_filter=content_filter)
|
1450
1466
|
else:
|
1451
1467
|
raise ValueError(
|
1452
1468
|
f"Unknown table extraction method: '{method}'. Choose from 'tatr', 'pdfplumber', 'text', 'stream', 'lattice'."
|
@@ -1600,16 +1616,35 @@ class Region(
|
|
1600
1616
|
# Extract all tables from the cropped area
|
1601
1617
|
tables = cropped.extract_tables(table_settings)
|
1602
1618
|
|
1603
|
-
#
|
1604
|
-
|
1619
|
+
# Apply RTL text processing to all tables
|
1620
|
+
if tables:
|
1621
|
+
processed_tables = []
|
1622
|
+
for table in tables:
|
1623
|
+
processed_table = []
|
1624
|
+
for row in table:
|
1625
|
+
processed_row = []
|
1626
|
+
for cell in row:
|
1627
|
+
if cell is not None:
|
1628
|
+
# Apply RTL text processing to each cell
|
1629
|
+
rtl_processed_cell = self._apply_rtl_processing_to_text(cell)
|
1630
|
+
processed_row.append(rtl_processed_cell)
|
1631
|
+
else:
|
1632
|
+
processed_row.append(cell)
|
1633
|
+
processed_table.append(processed_row)
|
1634
|
+
processed_tables.append(processed_table)
|
1635
|
+
return processed_tables
|
1636
|
+
|
1637
|
+
# Return empty list if no tables found
|
1638
|
+
return []
|
1605
1639
|
|
1606
|
-
def _extract_table_plumber(self, table_settings: dict) -> List[List[str]]:
|
1640
|
+
def _extract_table_plumber(self, table_settings: dict, content_filter=None) -> List[List[str]]:
|
1607
1641
|
"""
|
1608
1642
|
Extract table using pdfplumber's table extraction.
|
1609
1643
|
This method extracts the largest table within the region.
|
1610
1644
|
|
1611
1645
|
Args:
|
1612
1646
|
table_settings: Settings for pdfplumber table extraction
|
1647
|
+
content_filter: Optional content filter to apply to cell values
|
1613
1648
|
|
1614
1649
|
Returns:
|
1615
1650
|
Table data as a list of rows, where each row is a list of cell values
|
@@ -1645,16 +1680,35 @@ class Region(
|
|
1645
1680
|
|
1646
1681
|
# Return the table or an empty list if none found
|
1647
1682
|
if table:
|
1648
|
-
|
1683
|
+
# Apply RTL text processing and content filtering if provided
|
1684
|
+
processed_table = []
|
1685
|
+
for row in table:
|
1686
|
+
processed_row = []
|
1687
|
+
for cell in row:
|
1688
|
+
if cell is not None:
|
1689
|
+
# Apply RTL text processing first
|
1690
|
+
rtl_processed_cell = self._apply_rtl_processing_to_text(cell)
|
1691
|
+
|
1692
|
+
# Then apply content filter if provided
|
1693
|
+
if content_filter is not None:
|
1694
|
+
filtered_cell = self._apply_content_filter_to_text(rtl_processed_cell, content_filter)
|
1695
|
+
processed_row.append(filtered_cell)
|
1696
|
+
else:
|
1697
|
+
processed_row.append(rtl_processed_cell)
|
1698
|
+
else:
|
1699
|
+
processed_row.append(cell)
|
1700
|
+
processed_table.append(processed_row)
|
1701
|
+
return processed_table
|
1649
1702
|
return []
|
1650
1703
|
|
1651
|
-
def _extract_table_tatr(self, use_ocr=False, ocr_config=None) -> List[List[str]]:
|
1704
|
+
def _extract_table_tatr(self, use_ocr=False, ocr_config=None, content_filter=None) -> List[List[str]]:
|
1652
1705
|
"""
|
1653
1706
|
Extract table using TATR structure detection.
|
1654
1707
|
|
1655
1708
|
Args:
|
1656
1709
|
use_ocr: Whether to apply OCR to each cell for better text extraction
|
1657
1710
|
ocr_config: Optional OCR configuration parameters
|
1711
|
+
content_filter: Optional content filter to apply to cell values
|
1658
1712
|
|
1659
1713
|
Returns:
|
1660
1714
|
Table data as a list of rows, where each row is a list of cell values
|
@@ -1734,7 +1788,10 @@ class Region(
|
|
1734
1788
|
continue
|
1735
1789
|
|
1736
1790
|
# Fallback to normal extraction
|
1737
|
-
|
1791
|
+
header_text = header.extract_text().strip()
|
1792
|
+
if content_filter is not None:
|
1793
|
+
header_text = self._apply_content_filter_to_text(header_text, content_filter)
|
1794
|
+
header_texts.append(header_text)
|
1738
1795
|
table_data.append(header_texts)
|
1739
1796
|
|
1740
1797
|
# Process rows
|
@@ -1767,6 +1824,8 @@ class Region(
|
|
1767
1824
|
|
1768
1825
|
# Fallback to normal extraction
|
1769
1826
|
cell_text = cell_region.extract_text().strip()
|
1827
|
+
if content_filter is not None:
|
1828
|
+
cell_text = self._apply_content_filter_to_text(cell_text, content_filter)
|
1770
1829
|
row_cells.append(cell_text)
|
1771
1830
|
else:
|
1772
1831
|
# No column information, just extract the whole row text
|
@@ -1780,7 +1839,10 @@ class Region(
|
|
1780
1839
|
continue
|
1781
1840
|
|
1782
1841
|
# Fallback to normal extraction
|
1783
|
-
|
1842
|
+
row_text = row.extract_text().strip()
|
1843
|
+
if content_filter is not None:
|
1844
|
+
row_text = self._apply_content_filter_to_text(row_text, content_filter)
|
1845
|
+
row_cells.append(row_text)
|
1784
1846
|
|
1785
1847
|
table_data.append(row_cells)
|
1786
1848
|
|
@@ -1793,7 +1855,7 @@ class Region(
|
|
1793
1855
|
Args:
|
1794
1856
|
**text_options: Options passed to analyze_text_table_structure,
|
1795
1857
|
plus optional 'cell_extraction_func', 'coordinate_grouping_tolerance',
|
1796
|
-
and '
|
1858
|
+
'show_progress', and 'content_filter'.
|
1797
1859
|
|
1798
1860
|
Returns:
|
1799
1861
|
Table data as list of lists of strings (or None for empty cells).
|
@@ -1801,6 +1863,8 @@ class Region(
|
|
1801
1863
|
cell_extraction_func = text_options.pop("cell_extraction_func", None)
|
1802
1864
|
# --- Get show_progress option --- #
|
1803
1865
|
show_progress = text_options.pop("show_progress", False)
|
1866
|
+
# --- Get content_filter option --- #
|
1867
|
+
content_filter = text_options.pop("content_filter", None)
|
1804
1868
|
|
1805
1869
|
# Analyze structure first (or use cached results)
|
1806
1870
|
if "text_table_structure" in self.analyses:
|
@@ -1881,7 +1945,7 @@ class Region(
|
|
1881
1945
|
cell_value = None
|
1882
1946
|
else:
|
1883
1947
|
cell_value = cell_region.extract_text(
|
1884
|
-
layout=False, apply_exclusions=False
|
1948
|
+
layout=False, apply_exclusions=False, content_filter=content_filter
|
1885
1949
|
).strip()
|
1886
1950
|
|
1887
1951
|
rounded_top = round(cell_data["top"] / coord_tolerance) * coord_tolerance
|
@@ -3356,12 +3420,16 @@ class Region(
|
|
3356
3420
|
# New helper: build table from pre-computed table_cell regions
|
3357
3421
|
# ------------------------------------------------------------------
|
3358
3422
|
|
3359
|
-
def _extract_table_from_cells(self, cell_regions: List["Region"]) -> List[List[Optional[str]]]:
|
3423
|
+
def _extract_table_from_cells(self, cell_regions: List["Region"], content_filter=None) -> List[List[Optional[str]]]:
|
3360
3424
|
"""Construct a table (list-of-lists) from table_cell regions.
|
3361
3425
|
|
3362
3426
|
This assumes each cell Region has metadata.row_index / col_index as written by
|
3363
3427
|
detect_table_structure_from_lines(). If these keys are missing we will
|
3364
3428
|
fall back to sorting by geometry.
|
3429
|
+
|
3430
|
+
Args:
|
3431
|
+
cell_regions: List of table cell Region objects to extract text from
|
3432
|
+
content_filter: Optional content filter to apply to cell text extraction
|
3365
3433
|
"""
|
3366
3434
|
if not cell_regions:
|
3367
3435
|
return []
|
@@ -3392,7 +3460,7 @@ class Region(
|
|
3392
3460
|
try:
|
3393
3461
|
r_idx = int(cell.metadata.get("row_index"))
|
3394
3462
|
c_idx = int(cell.metadata.get("col_index"))
|
3395
|
-
text_val = cell.extract_text(layout=False, apply_exclusions=False).strip()
|
3463
|
+
text_val = cell.extract_text(layout=False, apply_exclusions=False, content_filter=content_filter).strip()
|
3396
3464
|
table_grid[r_idx][c_idx] = text_val if text_val else None
|
3397
3465
|
except Exception as _err:
|
3398
3466
|
# Skip problematic cell
|
@@ -3439,7 +3507,101 @@ class Region(
|
|
3439
3507
|
row_idx = int(np.argmin([abs(cy - rc) for rc in row_centers]))
|
3440
3508
|
col_idx = int(np.argmin([abs(cx - cc) for cc in col_centers]))
|
3441
3509
|
|
3442
|
-
text_val = cell.extract_text(layout=False, apply_exclusions=False).strip()
|
3510
|
+
text_val = cell.extract_text(layout=False, apply_exclusions=False, content_filter=content_filter).strip()
|
3443
3511
|
table_grid[row_idx][col_idx] = text_val if text_val else None
|
3444
3512
|
|
3445
3513
|
return table_grid
|
3514
|
+
|
3515
|
+
def _apply_rtl_processing_to_text(self, text: str) -> str:
|
3516
|
+
"""
|
3517
|
+
Apply RTL (Right-to-Left) text processing to a string.
|
3518
|
+
|
3519
|
+
This converts visual order text (as stored in PDFs) to logical order
|
3520
|
+
for proper display of Arabic, Hebrew, and other RTL scripts.
|
3521
|
+
|
3522
|
+
Args:
|
3523
|
+
text: Input text string in visual order
|
3524
|
+
|
3525
|
+
Returns:
|
3526
|
+
Text string in logical order
|
3527
|
+
"""
|
3528
|
+
if not text or not text.strip():
|
3529
|
+
return text
|
3530
|
+
|
3531
|
+
# Quick check for RTL characters - if none found, return as-is
|
3532
|
+
import unicodedata
|
3533
|
+
|
3534
|
+
def _contains_rtl(s):
|
3535
|
+
return any(unicodedata.bidirectional(ch) in ("R", "AL", "AN") for ch in s)
|
3536
|
+
|
3537
|
+
if not _contains_rtl(text):
|
3538
|
+
return text
|
3539
|
+
|
3540
|
+
try:
|
3541
|
+
from bidi.algorithm import get_display # type: ignore
|
3542
|
+
from natural_pdf.utils.bidi_mirror import mirror_brackets
|
3543
|
+
|
3544
|
+
# Apply BiDi algorithm to convert from visual to logical order
|
3545
|
+
# Process line by line to handle mixed content properly
|
3546
|
+
processed_lines = []
|
3547
|
+
for line in text.split("\n"):
|
3548
|
+
if line.strip():
|
3549
|
+
# Determine base direction for this line
|
3550
|
+
base_dir = "R" if _contains_rtl(line) else "L"
|
3551
|
+
logical_line = get_display(line, base_dir=base_dir)
|
3552
|
+
# Apply bracket mirroring for correct logical order
|
3553
|
+
processed_lines.append(mirror_brackets(logical_line))
|
3554
|
+
else:
|
3555
|
+
processed_lines.append(line)
|
3556
|
+
|
3557
|
+
return "\n".join(processed_lines)
|
3558
|
+
|
3559
|
+
except (ImportError, Exception):
|
3560
|
+
# If bidi library is not available or fails, return original text
|
3561
|
+
return text
|
3562
|
+
|
3563
|
+
def _apply_content_filter_to_text(self, text: str, content_filter) -> str:
|
3564
|
+
"""
|
3565
|
+
Apply content filter to a text string.
|
3566
|
+
|
3567
|
+
Args:
|
3568
|
+
text: Input text string
|
3569
|
+
content_filter: Content filter (regex, callable, or list of regexes)
|
3570
|
+
|
3571
|
+
Returns:
|
3572
|
+
Filtered text string
|
3573
|
+
"""
|
3574
|
+
if not text or content_filter is None:
|
3575
|
+
return text
|
3576
|
+
|
3577
|
+
import re
|
3578
|
+
|
3579
|
+
if isinstance(content_filter, str):
|
3580
|
+
# Single regex pattern - remove matching parts
|
3581
|
+
try:
|
3582
|
+
return re.sub(content_filter, '', text)
|
3583
|
+
except re.error:
|
3584
|
+
return text # Invalid regex, return original
|
3585
|
+
|
3586
|
+
elif isinstance(content_filter, list):
|
3587
|
+
# List of regex patterns - remove parts matching ANY pattern
|
3588
|
+
try:
|
3589
|
+
result = text
|
3590
|
+
for pattern in content_filter:
|
3591
|
+
result = re.sub(pattern, '', result)
|
3592
|
+
return result
|
3593
|
+
except re.error:
|
3594
|
+
return text # Invalid regex, return original
|
3595
|
+
|
3596
|
+
elif callable(content_filter):
|
3597
|
+
# Callable filter - apply to individual characters
|
3598
|
+
try:
|
3599
|
+
filtered_chars = []
|
3600
|
+
for char in text:
|
3601
|
+
if content_filter(char):
|
3602
|
+
filtered_chars.append(char)
|
3603
|
+
return ''.join(filtered_chars)
|
3604
|
+
except Exception:
|
3605
|
+
return text # Function error, return original
|
3606
|
+
|
3607
|
+
return text
|
natural_pdf/elements/text.py
CHANGED
@@ -230,7 +230,7 @@ class TextElement(Element):
|
|
230
230
|
# Default to black
|
231
231
|
return (0, 0, 0)
|
232
232
|
|
233
|
-
def extract_text(self, keep_blank_chars=True, strip: Optional[bool] = True, **kwargs) -> str:
|
233
|
+
def extract_text(self, keep_blank_chars=True, strip: Optional[bool] = True, content_filter=None, **kwargs) -> str:
|
234
234
|
"""
|
235
235
|
Extract text from this element.
|
236
236
|
|
@@ -238,14 +238,48 @@ class TextElement(Element):
|
|
238
238
|
keep_blank_chars: Retained for API compatibility (unused).
|
239
239
|
strip: If True (default) remove leading/trailing whitespace. Users may
|
240
240
|
pass ``strip=False`` to preserve whitespace exactly as stored.
|
241
|
+
content_filter: Optional content filter to exclude specific text patterns. Can be:
|
242
|
+
- A regex pattern string (characters matching the pattern are EXCLUDED)
|
243
|
+
- A callable that takes text and returns True to KEEP the character
|
244
|
+
- A list of regex patterns (characters matching ANY pattern are EXCLUDED)
|
241
245
|
**kwargs: Accepted for forward-compatibility and ignored here.
|
242
246
|
|
243
247
|
Returns:
|
244
|
-
The text content, optionally stripped.
|
248
|
+
The text content, optionally stripped and filtered.
|
245
249
|
"""
|
246
250
|
# Basic retrieval
|
247
251
|
result = self.text or ""
|
248
252
|
|
253
|
+
# Apply content filtering if provided
|
254
|
+
if content_filter is not None and result:
|
255
|
+
import re
|
256
|
+
|
257
|
+
if isinstance(content_filter, str):
|
258
|
+
# Single regex pattern - remove matching characters
|
259
|
+
try:
|
260
|
+
result = re.sub(content_filter, '', result)
|
261
|
+
except re.error:
|
262
|
+
pass # Invalid regex, skip filtering
|
263
|
+
|
264
|
+
elif isinstance(content_filter, list):
|
265
|
+
# List of regex patterns - remove characters matching ANY pattern
|
266
|
+
try:
|
267
|
+
for pattern in content_filter:
|
268
|
+
result = re.sub(pattern, '', result)
|
269
|
+
except re.error:
|
270
|
+
pass # Invalid regex, skip filtering
|
271
|
+
|
272
|
+
elif callable(content_filter):
|
273
|
+
# Callable filter - apply to individual characters
|
274
|
+
try:
|
275
|
+
filtered_chars = []
|
276
|
+
for char in result:
|
277
|
+
if content_filter(char):
|
278
|
+
filtered_chars.append(char)
|
279
|
+
result = ''.join(filtered_chars)
|
280
|
+
except Exception:
|
281
|
+
pass # Function error, skip filtering
|
282
|
+
|
249
283
|
# Apply optional stripping – align with global convention where simple
|
250
284
|
# element extraction is stripped by default.
|
251
285
|
if strip:
|
natural_pdf/flows/region.py
CHANGED
@@ -1,11 +1,13 @@
|
|
1
1
|
import logging
|
2
2
|
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
|
3
3
|
|
4
|
-
from pdfplumber.utils.geometry import
|
4
|
+
from pdfplumber.utils.geometry import merge_bboxes # Import merge_bboxes directly
|
5
5
|
|
6
6
|
# For runtime image manipulation
|
7
7
|
from PIL import Image as PIL_Image_Runtime
|
8
8
|
|
9
|
+
from natural_pdf.tables import TableResult
|
10
|
+
|
9
11
|
if TYPE_CHECKING:
|
10
12
|
from PIL.Image import Image as PIL_Image # For type hints
|
11
13
|
|
@@ -53,28 +55,46 @@ class FlowRegion:
|
|
53
55
|
self.source_flow_element: "FlowElement" = source_flow_element
|
54
56
|
self.boundary_element_found: Optional["PhysicalElement"] = boundary_element_found
|
55
57
|
|
58
|
+
# Add attributes for grid building, similar to Region
|
59
|
+
self.source: Optional[str] = None
|
60
|
+
self.region_type: Optional[str] = None
|
61
|
+
self.metadata: Dict[str, Any] = {}
|
62
|
+
|
56
63
|
# Cache for expensive operations
|
57
64
|
self._cached_text: Optional[str] = None
|
58
65
|
self._cached_elements: Optional["ElementCollection"] = None # Stringized
|
59
66
|
self._cached_bbox: Optional[Tuple[float, float, float, float]] = None
|
60
67
|
|
68
|
+
def __getattr__(self, name: str) -> Any:
|
69
|
+
"""
|
70
|
+
Dynamically proxy attribute access to the source FlowElement if the
|
71
|
+
attribute is not found in this instance.
|
72
|
+
"""
|
73
|
+
if name in self.__dict__:
|
74
|
+
return self.__dict__[name]
|
75
|
+
elif self.source_flow_element is not None:
|
76
|
+
return getattr(self.source_flow_element, name)
|
77
|
+
else:
|
78
|
+
raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
|
79
|
+
|
61
80
|
@property
|
62
81
|
def bbox(self) -> Optional[Tuple[float, float, float, float]]:
|
63
82
|
"""
|
64
|
-
|
65
|
-
|
66
|
-
original physical coordinates.
|
67
|
-
Returns None if there are no constituent regions.
|
83
|
+
The bounding box that encloses all constituent regions.
|
84
|
+
Calculated dynamically and cached.
|
68
85
|
"""
|
69
86
|
if self._cached_bbox is not None:
|
70
87
|
return self._cached_bbox
|
71
88
|
if not self.constituent_regions:
|
72
89
|
return None
|
73
90
|
|
74
|
-
# Use
|
75
|
-
#
|
76
|
-
|
77
|
-
|
91
|
+
# Use merge_bboxes from pdfplumber.utils.geometry to merge bboxes
|
92
|
+
# Extract bbox tuples from regions first
|
93
|
+
region_bboxes = [region.bbox for region in self.constituent_regions if hasattr(region, "bbox")]
|
94
|
+
if not region_bboxes:
|
95
|
+
return None
|
96
|
+
|
97
|
+
self._cached_bbox = merge_bboxes(region_bboxes)
|
78
98
|
return self._cached_bbox
|
79
99
|
|
80
100
|
@property
|
@@ -200,22 +220,72 @@ class FlowRegion:
|
|
200
220
|
self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs
|
201
221
|
) -> Optional["PhysicalElement"]: # Stringized
|
202
222
|
"""
|
203
|
-
|
223
|
+
Find the first element in flow order that matches the selector or text.
|
224
|
+
|
225
|
+
This implementation iterates through the constituent regions *in the order
|
226
|
+
they appear in ``self.constituent_regions`` (i.e. document flow order),
|
227
|
+
delegating the search to each region's own ``find`` method. It therefore
|
228
|
+
avoids constructing a huge intermediate ElementCollection and returns as
|
229
|
+
soon as a match is found, which is substantially faster and ensures that
|
230
|
+
selectors such as 'table' work exactly as they do on an individual
|
231
|
+
Region.
|
204
232
|
"""
|
205
|
-
|
206
|
-
|
207
|
-
|
233
|
+
if not self.constituent_regions:
|
234
|
+
return None
|
235
|
+
|
236
|
+
for region in self.constituent_regions:
|
237
|
+
try:
|
238
|
+
result = region.find(selector=selector, text=text, **kwargs)
|
239
|
+
if result is not None:
|
240
|
+
return result
|
241
|
+
except Exception as e:
|
242
|
+
logger.warning(
|
243
|
+
f"FlowRegion.find: error searching region {region}: {e}",
|
244
|
+
exc_info=False,
|
245
|
+
)
|
246
|
+
return None # No match found
|
208
247
|
|
209
248
|
def find_all(
|
210
249
|
self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs
|
211
250
|
) -> "ElementCollection": # Stringized
|
212
251
|
"""
|
213
|
-
|
252
|
+
Find **all** elements across the constituent regions that match the given
|
253
|
+
selector or text.
|
254
|
+
|
255
|
+
Rather than first materialising *every* element in the FlowRegion (which
|
256
|
+
can be extremely slow for multi-page flows), this implementation simply
|
257
|
+
chains each region's native ``find_all`` call and concatenates their
|
258
|
+
results into a single ElementCollection while preserving flow order.
|
214
259
|
"""
|
215
|
-
|
216
|
-
|
260
|
+
from natural_pdf.elements.collections import (
|
261
|
+
ElementCollection as RuntimeElementCollection,
|
262
|
+
)
|
263
|
+
|
264
|
+
matched_elements = [] # type: List["PhysicalElement"]
|
265
|
+
|
266
|
+
if not self.constituent_regions:
|
267
|
+
return RuntimeElementCollection([])
|
268
|
+
|
269
|
+
for region in self.constituent_regions:
|
270
|
+
try:
|
271
|
+
region_matches = region.find_all(
|
217
272
|
selector=selector, text=text, **kwargs
|
218
|
-
|
273
|
+
)
|
274
|
+
if region_matches:
|
275
|
+
# ``region_matches`` is an ElementCollection – extend with its
|
276
|
+
# underlying list so we don't create nested collections.
|
277
|
+
matched_elements.extend(
|
278
|
+
region_matches.elements
|
279
|
+
if hasattr(region_matches, "elements")
|
280
|
+
else list(region_matches)
|
281
|
+
)
|
282
|
+
except Exception as e:
|
283
|
+
logger.warning(
|
284
|
+
f"FlowRegion.find_all: error searching region {region}: {e}",
|
285
|
+
exc_info=False,
|
286
|
+
)
|
287
|
+
|
288
|
+
return RuntimeElementCollection(matched_elements)
|
219
289
|
|
220
290
|
def highlight(
|
221
291
|
self, label: Optional[str] = None, color: Optional[Union[Tuple, str]] = None, **kwargs
|
@@ -253,6 +323,7 @@ class FlowRegion:
|
|
253
323
|
stack_direction: str = "vertical",
|
254
324
|
stack_gap: int = 5,
|
255
325
|
stack_background_color: Tuple[int, int, int] = (255, 255, 255),
|
326
|
+
crop: bool = False,
|
256
327
|
**kwargs,
|
257
328
|
) -> Optional["PIL_Image"]:
|
258
329
|
"""
|
@@ -269,6 +340,7 @@ class FlowRegion:
|
|
269
340
|
stack_direction: Direction to stack multiple pages ('vertical' or 'horizontal').
|
270
341
|
stack_gap: Gap in pixels between stacked pages.
|
271
342
|
stack_background_color: RGB background color for the stacked image.
|
343
|
+
crop: If True, crop each rendered page to the bounding box of constituent regions on that page.
|
272
344
|
**kwargs: Additional arguments passed to the underlying rendering methods.
|
273
345
|
|
274
346
|
Returns:
|
@@ -358,6 +430,16 @@ class FlowRegion:
|
|
358
430
|
if not temp_highlights_for_page:
|
359
431
|
continue
|
360
432
|
|
433
|
+
# Calculate crop bbox if cropping is enabled
|
434
|
+
crop_bbox = None
|
435
|
+
if crop and constituent_regions_on_this_page:
|
436
|
+
# Calculate the bounding box that encompasses all constituent regions on this page
|
437
|
+
min_x0 = min(region.bbox[0] for region in constituent_regions_on_this_page)
|
438
|
+
min_y0 = min(region.bbox[1] for region in constituent_regions_on_this_page)
|
439
|
+
max_x1 = max(region.bbox[2] for region in constituent_regions_on_this_page)
|
440
|
+
max_y1 = max(region.bbox[3] for region in constituent_regions_on_this_page)
|
441
|
+
crop_bbox = (min_x0, min_y0, max_x1, max_y1)
|
442
|
+
|
361
443
|
page_image = highlighter_service.render_preview(
|
362
444
|
page_index=(
|
363
445
|
page_obj.index
|
@@ -369,6 +451,7 @@ class FlowRegion:
|
|
369
451
|
width=width,
|
370
452
|
labels=labels, # Pass through labels
|
371
453
|
legend_position=legend_position,
|
454
|
+
crop_bbox=crop_bbox,
|
372
455
|
**kwargs,
|
373
456
|
)
|
374
457
|
if page_image:
|
@@ -549,7 +632,7 @@ class FlowRegion:
|
|
549
632
|
cell_extraction_func: Optional[Callable[["PhysicalRegion"], Optional[str]]] = None,
|
550
633
|
show_progress: bool = False,
|
551
634
|
**kwargs,
|
552
|
-
) ->
|
635
|
+
) -> TableResult:
|
553
636
|
"""Extracts a single logical table from the FlowRegion.
|
554
637
|
|
555
638
|
This is a convenience wrapper that iterates through the constituent
|
@@ -565,9 +648,9 @@ class FlowRegion:
|
|
565
648
|
``Region.extract_table`` implementation.
|
566
649
|
|
567
650
|
Returns:
|
568
|
-
A
|
651
|
+
A TableResult object containing the aggregated table data. Rows returned from
|
569
652
|
consecutive constituent regions are appended in document order. If
|
570
|
-
no tables are detected in any region, an empty
|
653
|
+
no tables are detected in any region, an empty TableResult is returned.
|
571
654
|
"""
|
572
655
|
|
573
656
|
if table_settings is None:
|
@@ -576,13 +659,13 @@ class FlowRegion:
|
|
576
659
|
text_options = {}
|
577
660
|
|
578
661
|
if not self.constituent_regions:
|
579
|
-
return []
|
662
|
+
return TableResult([])
|
580
663
|
|
581
664
|
aggregated_rows: List[List[Optional[str]]] = []
|
582
665
|
|
583
666
|
for region in self.constituent_regions:
|
584
667
|
try:
|
585
|
-
|
668
|
+
region_result = region.extract_table(
|
586
669
|
method=method,
|
587
670
|
table_settings=table_settings.copy(), # Avoid side-effects
|
588
671
|
use_ocr=use_ocr,
|
@@ -593,16 +676,16 @@ class FlowRegion:
|
|
593
676
|
**kwargs,
|
594
677
|
)
|
595
678
|
|
596
|
-
#
|
597
|
-
if
|
598
|
-
aggregated_rows.extend(
|
679
|
+
# region_result is now a TableResult object, extract the rows
|
680
|
+
if region_result:
|
681
|
+
aggregated_rows.extend(region_result)
|
599
682
|
except Exception as e:
|
600
683
|
logger.error(
|
601
684
|
f"FlowRegion.extract_table: Error extracting table from constituent region {region}: {e}",
|
602
685
|
exc_info=True,
|
603
686
|
)
|
604
687
|
|
605
|
-
return aggregated_rows
|
688
|
+
return TableResult(aggregated_rows)
|
606
689
|
|
607
690
|
def extract_tables(
|
608
691
|
self,
|
@@ -649,3 +732,22 @@ class FlowRegion:
|
|
649
732
|
)
|
650
733
|
|
651
734
|
return all_tables
|
735
|
+
|
736
|
+
@property
|
737
|
+
def normalized_type(self) -> Optional[str]:
|
738
|
+
"""
|
739
|
+
Return the normalized type for selector compatibility.
|
740
|
+
This allows FlowRegion to be found by selectors like 'table'.
|
741
|
+
"""
|
742
|
+
if self.region_type:
|
743
|
+
# Convert region_type to normalized format (replace spaces with underscores, lowercase)
|
744
|
+
return self.region_type.lower().replace(" ", "_")
|
745
|
+
return None
|
746
|
+
|
747
|
+
@property
|
748
|
+
def type(self) -> Optional[str]:
|
749
|
+
"""
|
750
|
+
Return the type attribute for selector compatibility.
|
751
|
+
This is an alias for normalized_type.
|
752
|
+
"""
|
753
|
+
return self.normalized_type
|