natural-pdf 0.1.36__py3-none-any.whl → 0.1.37__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published in their public registry.
- natural_pdf/analyzers/guides.py +1053 -26
- natural_pdf/core/page.py +205 -45
- natural_pdf/core/pdf.py +16 -1
- natural_pdf/elements/collections.py +10 -0
- natural_pdf/elements/region.py +106 -14
- natural_pdf/elements/text.py +36 -2
- natural_pdf/flows/region.py +128 -26
- natural_pdf/selectors/parser.py +24 -0
- natural_pdf/utils/layout.py +26 -0
- natural_pdf/utils/text_extraction.py +76 -1
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.37.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.37.dist-info}/RECORD +16 -15
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.37.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.37.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.37.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.36.dist-info → natural_pdf-0.1.37.dist-info}/top_level.txt +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -1221,7 +1221,7 @@ class Region(
         # Filter to elements in this region
         return [e for e in page_elements if self._is_element_in_region(e)]
 
-    def extract_text(self, apply_exclusions=True, debug=False, **kwargs) -> str:
+    def extract_text(self, apply_exclusions=True, debug=False, content_filter=None, **kwargs) -> str:
         """
         Extract text from this region, respecting page exclusions and using pdfplumber's
         layout engine (chars_to_textmap).
@@ -1229,6 +1229,10 @@ class Region(
         Args:
             apply_exclusions: Whether to apply exclusion regions defined on the parent page.
             debug: Enable verbose debugging output for filtering steps.
+            content_filter: Optional content filter to exclude specific text patterns. Can be:
+                - A regex pattern string (characters matching the pattern are EXCLUDED)
+                - A callable that takes text and returns True to KEEP the character
+                - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
             **kwargs: Additional layout parameters passed directly to pdfplumber's
                 `chars_to_textmap` function (e.g., layout, x_density, y_density).
                 See Page.extract_text docstring for more.
@@ -1285,10 +1289,15 @@ class Region(
         )
 
         # 5. Generate Text Layout using Utility
+        # Add content_filter to kwargs if provided
+        final_kwargs = kwargs.copy()
+        if content_filter is not None:
+            final_kwargs["content_filter"] = content_filter
+
         result = generate_text_layout(
             char_dicts=filtered_chars,
             layout_context_bbox=self.bbox,  # Use region's bbox for context
-            user_kwargs=
+            user_kwargs=final_kwargs,  # Pass kwargs including content_filter
         )
 
         logger.debug(f"Region {self.bbox}: extract_text finished, result length: {len(result)}.")
@@ -1304,6 +1313,7 @@ class Region(
         cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
         # --- NEW: Add tqdm control option --- #
         show_progress: bool = False,  # Controls progress bar for text method
+        content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None,  # NEW: Content filtering
     ) -> TableResult:  # Return type allows Optional[str] for cells
         """
         Extract a table from this region.
@@ -1323,6 +1333,11 @@ class Region(
                 and returns its string content. Overrides default text extraction
                 for the 'text' method.
             show_progress: If True, display a progress bar during cell text extraction for the 'text' method.
+            content_filter: Optional content filter to apply during cell text extraction. Can be:
+                - A regex pattern string (characters matching the pattern are EXCLUDED)
+                - A callable that takes text and returns True to KEEP the character
+                - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
+                Works with all extraction methods by filtering cell content.
 
         Returns:
             Table data as a list of rows, where each row is a list of cell values (str or None).
@@ -1358,7 +1373,7 @@ class Region(
             logger.debug(
                 f"Region {self.bbox}: Found {len(cell_regions_in_table)} pre-computed table_cell regions – using 'cells' method."
             )
-            return TableResult(self._extract_table_from_cells(cell_regions_in_table))
+            return TableResult(self._extract_table_from_cells(cell_regions_in_table, content_filter=content_filter))
 
         # --------------------------------------------------------------- #
 
@@ -1439,14 +1454,15 @@ class Region(
 
         # Use the selected method
         if effective_method == "tatr":
-            table_rows = self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config)
+            table_rows = self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config, content_filter=content_filter)
         elif effective_method == "text":
             current_text_options = text_options.copy()
             current_text_options["cell_extraction_func"] = cell_extraction_func
             current_text_options["show_progress"] = show_progress
+            current_text_options["content_filter"] = content_filter
             table_rows = self._extract_table_text(**current_text_options)
         elif effective_method == "pdfplumber":
-            table_rows = self._extract_table_plumber(table_settings)
+            table_rows = self._extract_table_plumber(table_settings, content_filter=content_filter)
         else:
             raise ValueError(
                 f"Unknown table extraction method: '{method}'. Choose from 'tatr', 'pdfplumber', 'text', 'stream', 'lattice'."
@@ -1603,13 +1619,14 @@ class Region(
         # Return the tables or an empty list if none found
         return tables if tables else []
 
-    def _extract_table_plumber(self, table_settings: dict) -> List[List[str]]:
+    def _extract_table_plumber(self, table_settings: dict, content_filter=None) -> List[List[str]]:
         """
         Extract table using pdfplumber's table extraction.
         This method extracts the largest table within the region.
 
         Args:
             table_settings: Settings for pdfplumber table extraction
+            content_filter: Optional content filter to apply to cell values
 
         Returns:
             Table data as a list of rows, where each row is a list of cell values
@@ -1645,16 +1662,31 @@ class Region(
 
         # Return the table or an empty list if none found
         if table:
+            # Apply content filtering if provided
+            if content_filter is not None:
+                filtered_table = []
+                for row in table:
+                    filtered_row = []
+                    for cell in row:
+                        if cell is not None:
+                            # Apply content filter to cell text
+                            filtered_cell = self._apply_content_filter_to_text(cell, content_filter)
+                            filtered_row.append(filtered_cell)
+                        else:
+                            filtered_row.append(cell)
+                    filtered_table.append(filtered_row)
+                return filtered_table
             return table
         return []
 
-    def _extract_table_tatr(self, use_ocr=False, ocr_config=None) -> List[List[str]]:
+    def _extract_table_tatr(self, use_ocr=False, ocr_config=None, content_filter=None) -> List[List[str]]:
         """
         Extract table using TATR structure detection.
 
         Args:
             use_ocr: Whether to apply OCR to each cell for better text extraction
            ocr_config: Optional OCR configuration parameters
+            content_filter: Optional content filter to apply to cell values
 
         Returns:
             Table data as a list of rows, where each row is a list of cell values
@@ -1734,7 +1766,10 @@ class Region(
                     continue
 
                 # Fallback to normal extraction
-
+                header_text = header.extract_text().strip()
+                if content_filter is not None:
+                    header_text = self._apply_content_filter_to_text(header_text, content_filter)
+                header_texts.append(header_text)
             table_data.append(header_texts)
 
             # Process rows
@@ -1767,6 +1802,8 @@ class Region(
 
                         # Fallback to normal extraction
                         cell_text = cell_region.extract_text().strip()
+                        if content_filter is not None:
+                            cell_text = self._apply_content_filter_to_text(cell_text, content_filter)
                         row_cells.append(cell_text)
                 else:
                     # No column information, just extract the whole row text
@@ -1780,7 +1817,10 @@ class Region(
                         continue
 
                     # Fallback to normal extraction
-
+                    row_text = row.extract_text().strip()
+                    if content_filter is not None:
+                        row_text = self._apply_content_filter_to_text(row_text, content_filter)
+                    row_cells.append(row_text)
 
             table_data.append(row_cells)
 
@@ -1793,7 +1833,7 @@ class Region(
         Args:
             **text_options: Options passed to analyze_text_table_structure,
                 plus optional 'cell_extraction_func', 'coordinate_grouping_tolerance',
-                and '
+                'show_progress', and 'content_filter'.
 
         Returns:
             Table data as list of lists of strings (or None for empty cells).
@@ -1801,6 +1841,8 @@ class Region(
         cell_extraction_func = text_options.pop("cell_extraction_func", None)
         # --- Get show_progress option --- #
         show_progress = text_options.pop("show_progress", False)
+        # --- Get content_filter option --- #
+        content_filter = text_options.pop("content_filter", None)
 
         # Analyze structure first (or use cached results)
         if "text_table_structure" in self.analyses:
@@ -1881,7 +1923,7 @@ class Region(
                     cell_value = None
                 else:
                     cell_value = cell_region.extract_text(
-                        layout=False, apply_exclusions=False
+                        layout=False, apply_exclusions=False, content_filter=content_filter
                     ).strip()
 
                 rounded_top = round(cell_data["top"] / coord_tolerance) * coord_tolerance
@@ -3356,12 +3398,16 @@ class Region(
     # New helper: build table from pre-computed table_cell regions
     # ------------------------------------------------------------------
 
-    def _extract_table_from_cells(self, cell_regions: List["Region"]) -> List[List[Optional[str]]]:
+    def _extract_table_from_cells(self, cell_regions: List["Region"], content_filter=None) -> List[List[Optional[str]]]:
         """Construct a table (list-of-lists) from table_cell regions.
 
         This assumes each cell Region has metadata.row_index / col_index as written by
         detect_table_structure_from_lines(). If these keys are missing we will
         fall back to sorting by geometry.
+
+        Args:
+            cell_regions: List of table cell Region objects to extract text from
+            content_filter: Optional content filter to apply to cell text extraction
         """
         if not cell_regions:
             return []
@@ -3392,7 +3438,7 @@ class Region(
             try:
                 r_idx = int(cell.metadata.get("row_index"))
                 c_idx = int(cell.metadata.get("col_index"))
-                text_val = cell.extract_text(layout=False, apply_exclusions=False).strip()
+                text_val = cell.extract_text(layout=False, apply_exclusions=False, content_filter=content_filter).strip()
                 table_grid[r_idx][c_idx] = text_val if text_val else None
             except Exception as _err:
                 # Skip problematic cell
@@ -3439,7 +3485,53 @@ class Region(
             row_idx = int(np.argmin([abs(cy - rc) for rc in row_centers]))
             col_idx = int(np.argmin([abs(cx - cc) for cc in col_centers]))
 
-            text_val = cell.extract_text(layout=False, apply_exclusions=False).strip()
+            text_val = cell.extract_text(layout=False, apply_exclusions=False, content_filter=content_filter).strip()
             table_grid[row_idx][col_idx] = text_val if text_val else None
 
         return table_grid
+
+    def _apply_content_filter_to_text(self, text: str, content_filter) -> str:
+        """
+        Apply content filter to a text string.
+
+        Args:
+            text: Input text string
+            content_filter: Content filter (regex, callable, or list of regexes)
+
+        Returns:
+            Filtered text string
+        """
+        if not text or content_filter is None:
+            return text
+
+        import re
+
+        if isinstance(content_filter, str):
+            # Single regex pattern - remove matching parts
+            try:
+                return re.sub(content_filter, '', text)
+            except re.error:
+                return text  # Invalid regex, return original
+
+        elif isinstance(content_filter, list):
+            # List of regex patterns - remove parts matching ANY pattern
+            try:
+                result = text
+                for pattern in content_filter:
+                    result = re.sub(pattern, '', result)
+                return result
+            except re.error:
+                return text  # Invalid regex, return original
+
+        elif callable(content_filter):
+            # Callable filter - apply to individual characters
+            try:
+                filtered_chars = []
+                for char in text:
+                    if content_filter(char):
+                        filtered_chars.append(char)
+                return ''.join(filtered_chars)
+            except Exception:
+                return text  # Function error, return original
+
+        return text
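Taken together, the region.py changes let callers strip unwanted characters at extraction time rather than post-processing the returned strings. A minimal usage sketch; the file name, page index, and the `page.region(left, top, right, bottom)` call below are hypothetical illustrations, not part of this diff:

```python
from natural_pdf import PDF

pdf = PDF("report.pdf")  # hypothetical input file
page = pdf.pages[0]
region = page.region(0, 0, page.width, page.height / 2)  # assumed region helper

# Regex string: characters matching the pattern are removed from the output.
no_boxes = region.extract_text(content_filter=r"[\u2500-\u257F]")

# Callable: invoked per character, return True to keep it.
digits_only = region.extract_text(content_filter=str.isdigit)

# List of patterns: anything matching ANY pattern is dropped, in tables too.
table = region.extract_table(content_filter=[r"\$", r","])
rows = list(table)  # TableResult is iterable over its rows
```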
natural_pdf/elements/text.py
CHANGED
@@ -230,7 +230,7 @@ class TextElement(Element):
         # Default to black
         return (0, 0, 0)
 
-    def extract_text(self, keep_blank_chars=True, strip: Optional[bool] = True, **kwargs) -> str:
+    def extract_text(self, keep_blank_chars=True, strip: Optional[bool] = True, content_filter=None, **kwargs) -> str:
         """
         Extract text from this element.
 
@@ -238,14 +238,48 @@ class TextElement(Element):
             keep_blank_chars: Retained for API compatibility (unused).
             strip: If True (default) remove leading/trailing whitespace. Users may
                 pass ``strip=False`` to preserve whitespace exactly as stored.
+            content_filter: Optional content filter to exclude specific text patterns. Can be:
+                - A regex pattern string (characters matching the pattern are EXCLUDED)
+                - A callable that takes text and returns True to KEEP the character
+                - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
             **kwargs: Accepted for forward-compatibility and ignored here.
 
         Returns:
-            The text content, optionally stripped.
+            The text content, optionally stripped and filtered.
         """
         # Basic retrieval
         result = self.text or ""
 
+        # Apply content filtering if provided
+        if content_filter is not None and result:
+            import re
+
+            if isinstance(content_filter, str):
+                # Single regex pattern - remove matching characters
+                try:
+                    result = re.sub(content_filter, '', result)
+                except re.error:
+                    pass  # Invalid regex, skip filtering
+
+            elif isinstance(content_filter, list):
+                # List of regex patterns - remove characters matching ANY pattern
+                try:
+                    for pattern in content_filter:
+                        result = re.sub(pattern, '', result)
+                except re.error:
+                    pass  # Invalid regex, skip filtering
+
+            elif callable(content_filter):
+                # Callable filter - apply to individual characters
+                try:
+                    filtered_chars = []
+                    for char in result:
+                        if content_filter(char):
+                            filtered_chars.append(char)
+                    result = ''.join(filtered_chars)
+                except Exception:
+                    pass  # Function error, skip filtering
+
         # Apply optional stripping – align with global convention where simple
         # element extraction is stripped by default.
         if strip:
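Note the asymmetry in the filtering logic added above: string and list filters are applied with `re.sub`, so a multi-character match is removed as a whole, while a callable is evaluated one character at a time and returns True to keep that character. A standalone sketch of the same rules, using plain Python with no natural_pdf objects involved:

```python
import re
from typing import Callable, List, Union

def apply_filter(text: str, content_filter: Union[str, List[str], Callable[[str], bool]]) -> str:
    """Mirror of the filtering rules used by TextElement.extract_text (illustrative only)."""
    if isinstance(content_filter, str):
        return re.sub(content_filter, "", text)               # drop regex matches
    if isinstance(content_filter, list):
        for pattern in content_filter:
            text = re.sub(pattern, "", text)                  # drop matches of ANY pattern
        return text
    if callable(content_filter):
        return "".join(c for c in text if content_filter(c))  # keep chars where True
    return text

print(apply_filter("Total: $1,234.56", r"[$,]"))      # -> "Total: 1234.56"
print(apply_filter("Total: $1,234.56", str.isdigit))  # -> "123456"
```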
natural_pdf/flows/region.py
CHANGED
@@ -1,11 +1,13 @@
 import logging
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 
-from pdfplumber.utils.geometry import
+from pdfplumber.utils.geometry import merge_bboxes  # Import merge_bboxes directly
 
 # For runtime image manipulation
 from PIL import Image as PIL_Image_Runtime
 
+from natural_pdf.tables import TableResult
+
 if TYPE_CHECKING:
     from PIL.Image import Image as PIL_Image  # For type hints
 
@@ -53,28 +55,46 @@ class FlowRegion:
         self.source_flow_element: "FlowElement" = source_flow_element
         self.boundary_element_found: Optional["PhysicalElement"] = boundary_element_found
 
+        # Add attributes for grid building, similar to Region
+        self.source: Optional[str] = None
+        self.region_type: Optional[str] = None
+        self.metadata: Dict[str, Any] = {}
+
         # Cache for expensive operations
         self._cached_text: Optional[str] = None
         self._cached_elements: Optional["ElementCollection"] = None  # Stringized
         self._cached_bbox: Optional[Tuple[float, float, float, float]] = None
 
+    def __getattr__(self, name: str) -> Any:
+        """
+        Dynamically proxy attribute access to the source FlowElement if the
+        attribute is not found in this instance.
+        """
+        if name in self.__dict__:
+            return self.__dict__[name]
+        elif self.source_flow_element is not None:
+            return getattr(self.source_flow_element, name)
+        else:
+            raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
+
     @property
     def bbox(self) -> Optional[Tuple[float, float, float, float]]:
         """
-
-
-        original physical coordinates.
-        Returns None if there are no constituent regions.
+        The bounding box that encloses all constituent regions.
+        Calculated dynamically and cached.
         """
         if self._cached_bbox is not None:
             return self._cached_bbox
         if not self.constituent_regions:
             return None
 
-        # Use
-        #
-
-
+        # Use merge_bboxes from pdfplumber.utils.geometry to merge bboxes
+        # Extract bbox tuples from regions first
+        region_bboxes = [region.bbox for region in self.constituent_regions if hasattr(region, "bbox")]
+        if not region_bboxes:
+            return None
+
+        self._cached_bbox = merge_bboxes(region_bboxes)
         return self._cached_bbox
 
     @property
@@ -200,22 +220,72 @@ class FlowRegion:
         self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs
     ) -> Optional["PhysicalElement"]:  # Stringized
         """
-
+        Find the first element in flow order that matches the selector or text.
+
+        This implementation iterates through the constituent regions *in the order
+        they appear in ``self.constituent_regions`` (i.e. document flow order),
+        delegating the search to each region's own ``find`` method. It therefore
+        avoids constructing a huge intermediate ElementCollection and returns as
+        soon as a match is found, which is substantially faster and ensures that
+        selectors such as 'table' work exactly as they do on an individual
+        Region.
         """
-
-
-
+        if not self.constituent_regions:
+            return None
+
+        for region in self.constituent_regions:
+            try:
+                result = region.find(selector=selector, text=text, **kwargs)
+                if result is not None:
+                    return result
+            except Exception as e:
+                logger.warning(
+                    f"FlowRegion.find: error searching region {region}: {e}",
+                    exc_info=False,
+                )
+        return None  # No match found
 
     def find_all(
         self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs
     ) -> "ElementCollection":  # Stringized
         """
-
+        Find **all** elements across the constituent regions that match the given
+        selector or text.
+
+        Rather than first materialising *every* element in the FlowRegion (which
+        can be extremely slow for multi-page flows), this implementation simply
+        chains each region's native ``find_all`` call and concatenates their
+        results into a single ElementCollection while preserving flow order.
         """
-
-
+        from natural_pdf.elements.collections import (
+            ElementCollection as RuntimeElementCollection,
+        )
+
+        matched_elements = []  # type: List["PhysicalElement"]
+
+        if not self.constituent_regions:
+            return RuntimeElementCollection([])
+
+        for region in self.constituent_regions:
+            try:
+                region_matches = region.find_all(
                     selector=selector, text=text, **kwargs
-
+                )
+                if region_matches:
+                    # ``region_matches`` is an ElementCollection – extend with its
+                    # underlying list so we don't create nested collections.
+                    matched_elements.extend(
+                        region_matches.elements
+                        if hasattr(region_matches, "elements")
+                        else list(region_matches)
+                    )
+            except Exception as e:
+                logger.warning(
+                    f"FlowRegion.find_all: error searching region {region}: {e}",
+                    exc_info=False,
+                )
+
+        return RuntimeElementCollection(matched_elements)
 
     def highlight(
         self, label: Optional[str] = None, color: Optional[Union[Tuple, str]] = None, **kwargs
@@ -253,6 +323,7 @@ class FlowRegion:
         stack_direction: str = "vertical",
         stack_gap: int = 5,
         stack_background_color: Tuple[int, int, int] = (255, 255, 255),
+        crop: bool = False,
         **kwargs,
     ) -> Optional["PIL_Image"]:
         """
@@ -269,6 +340,7 @@ class FlowRegion:
             stack_direction: Direction to stack multiple pages ('vertical' or 'horizontal').
             stack_gap: Gap in pixels between stacked pages.
             stack_background_color: RGB background color for the stacked image.
+            crop: If True, crop each rendered page to the bounding box of constituent regions on that page.
             **kwargs: Additional arguments passed to the underlying rendering methods.
 
         Returns:
@@ -358,6 +430,16 @@ class FlowRegion:
             if not temp_highlights_for_page:
                 continue
 
+            # Calculate crop bbox if cropping is enabled
+            crop_bbox = None
+            if crop and constituent_regions_on_this_page:
+                # Calculate the bounding box that encompasses all constituent regions on this page
+                min_x0 = min(region.bbox[0] for region in constituent_regions_on_this_page)
+                min_y0 = min(region.bbox[1] for region in constituent_regions_on_this_page)
+                max_x1 = max(region.bbox[2] for region in constituent_regions_on_this_page)
+                max_y1 = max(region.bbox[3] for region in constituent_regions_on_this_page)
+                crop_bbox = (min_x0, min_y0, max_x1, max_y1)
+
             page_image = highlighter_service.render_preview(
                 page_index=(
                     page_obj.index
@@ -369,6 +451,7 @@ class FlowRegion:
                 width=width,
                 labels=labels,  # Pass through labels
                 legend_position=legend_position,
+                crop_bbox=crop_bbox,
                 **kwargs,
             )
             if page_image:
@@ -549,7 +632,7 @@ class FlowRegion:
         cell_extraction_func: Optional[Callable[["PhysicalRegion"], Optional[str]]] = None,
         show_progress: bool = False,
         **kwargs,
-    ) ->
+    ) -> TableResult:
         """Extracts a single logical table from the FlowRegion.
 
         This is a convenience wrapper that iterates through the constituent
@@ -565,9 +648,9 @@ class FlowRegion:
        ``Region.extract_table`` implementation.
 
        Returns:
-            A
+            A TableResult object containing the aggregated table data. Rows returned from
            consecutive constituent regions are appended in document order. If
-            no tables are detected in any region, an empty
+            no tables are detected in any region, an empty TableResult is returned.
        """
 
        if table_settings is None:
@@ -576,13 +659,13 @@ class FlowRegion:
            text_options = {}
 
        if not self.constituent_regions:
-            return []
+            return TableResult([])
 
        aggregated_rows: List[List[Optional[str]]] = []
 
        for region in self.constituent_regions:
            try:
-
+                region_result = region.extract_table(
                    method=method,
                    table_settings=table_settings.copy(),  # Avoid side-effects
                    use_ocr=use_ocr,
@@ -593,16 +676,16 @@ class FlowRegion:
                    **kwargs,
                )
 
-                #
-                if
-                    aggregated_rows.extend(
+                # region_result is now a TableResult object, extract the rows
+                if region_result:
+                    aggregated_rows.extend(region_result)
            except Exception as e:
                logger.error(
                    f"FlowRegion.extract_table: Error extracting table from constituent region {region}: {e}",
                    exc_info=True,
                )
 
-        return aggregated_rows
+        return TableResult(aggregated_rows)
 
     def extract_tables(
         self,
@@ -649,3 +732,22 @@ class FlowRegion:
             )
 
         return all_tables
+
+    @property
+    def normalized_type(self) -> Optional[str]:
+        """
+        Return the normalized type for selector compatibility.
+        This allows FlowRegion to be found by selectors like 'table'.
+        """
+        if self.region_type:
+            # Convert region_type to normalized format (replace spaces with underscores, lowercase)
+            return self.region_type.lower().replace(" ", "_")
+        return None
+
+    @property
+    def type(self) -> Optional[str]:
+        """
+        Return the type attribute for selector compatibility.
+        This is an alias for normalized_type.
+        """
+        return self.normalized_type
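Because FlowRegion.extract_table now returns a TableResult rather than a bare list, and find/find_all delegate to each constituent region in flow order, a multi-page flow can be queried much like a single Region. A hedged sketch: it assumes a `flow_region` produced elsewhere by the Flow API (setup omitted) and relies only on TableResult being iterable over its rows, as the aggregation code above does:

```python
# flow_region: a FlowRegion obtained from the Flow API (hypothetical setup omitted)
result = flow_region.extract_table()   # TableResult; empty if nothing was detected
for row in result:                     # iterate rows aggregated across regions
    print(row)

# Selectors now behave as they do on an individual Region:
first_table = flow_region.find("table")
big_text = flow_region.find_all("text[size>12]")
```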
natural_pdf/selectors/parser.py
CHANGED
@@ -24,6 +24,7 @@ This enables powerful document navigation like:
 - page.find('text[size>12]:bold:contains("Summary")')
 - page.find_all('rect[color~="red"]:above(text:contains("Total"))')
 - page.find('text:regex("[0-9]{4}-[0-9]{2}-[0-9]{2}")')
+- page.find('text:regex("[\u2500-\u257F]")')  # Box drawing characters
 """
 
 import ast
@@ -748,6 +749,29 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
 
         filter_lambda = contains_check
 
+    # --- Handle :regex pseudo-class (same as :contains with regex=True) ---
+    elif name == "regex" and args is not None:
+        ignore_case = not kwargs.get("case", True)  # Default case sensitive
+        filter_name = f"pseudo-class :regex({args!r}, ignore_case={ignore_case})"
+
+        def regex_check(element, args=args, ignore_case=ignore_case):
+            if not hasattr(element, "text") or not element.text:
+                return False  # Element must have non-empty text
+
+            element_text = element.text
+            search_term = str(args)  # Ensure args is string
+
+            try:
+                pattern = re.compile(search_term, re.IGNORECASE if ignore_case else 0)
+                return bool(pattern.search(element_text))
+            except re.error as e:
+                logger.warning(
+                    f"Invalid regex '{search_term}' in :regex selector: {e}. Returning False."
+                )
+                return False
+
+        filter_lambda = regex_check
+
     # --- Handle :startswith and :starts-with (alias) --- #
     elif name in ("starts-with", "startswith") and args is not None:
         filter_name = f"pseudo-class :{name}({args!r})"
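The new `:regex()` pseudo-class mirrors `:contains(..., regex=True)`: the argument is compiled with `re.compile` and tested against each element's text with `pattern.search`, returning no match if the pattern is invalid. A usage sketch; the `page` object and the `case=False` keyword pass-through are assumptions based on how `:contains` already works in this parser:

```python
# Elements whose text contains an ISO-style date
dates = page.find_all('text:regex("[0-9]{4}-[0-9]{2}-[0-9]{2}")')

# Elements containing box-drawing characters (e.g. borders rendered as text)
borders = page.find_all('text:regex("[\u2500-\u257F]")')

# Case-insensitive matching via the existing `case` keyword (assumed forwarded)
totals = page.find_all('text:regex("total")', case=False)
```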
natural_pdf/utils/layout.py
ADDED
@@ -0,0 +1,26 @@
+from typing import List, Optional, Tuple
+
+
+def merge_bboxes(
+    bboxes: List[Optional[Tuple[float, float, float, float]]]
+) -> Optional[Tuple[float, float, float, float]]:
+    """
+    Merge multiple bounding boxes into a single one that encompasses all of them.
+
+    Args:
+        bboxes: A list of bbox tuples (x0, top, x1, bottom). Can contain None values.
+
+    Returns:
+        A single merged bbox tuple, or None if no valid bboxes are provided.
+    """
+    if not bboxes:
+        return None
+
+    # Filter out None or invalid bboxes
+    valid_bboxes = [b for b in bboxes if b and len(b) == 4]
+    if not valid_bboxes:
+        return None
+
+    x0s, tops, x1s, bottoms = zip(*valid_bboxes)
+
+    return (min(x0s), min(tops), max(x1s), max(bottoms))