natural-pdf 0.2.1.dev0__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/guides.py +159 -3
- natural_pdf/collections/mixins.py +16 -3
- natural_pdf/core/highlighting_service.py +33 -9
- natural_pdf/core/page.py +138 -7
- natural_pdf/core/page_collection.py +51 -14
- natural_pdf/core/page_groupby.py +229 -0
- natural_pdf/core/render_spec.py +62 -4
- natural_pdf/elements/base.py +102 -20
- natural_pdf/elements/element_collection.py +11 -10
- natural_pdf/elements/region.py +21 -21
- natural_pdf/elements/text.py +5 -0
- natural_pdf/extraction/manager.py +8 -14
- natural_pdf/extraction/mixin.py +35 -21
- natural_pdf/selectors/parser.py +2 -2
- natural_pdf/tables/result.py +37 -0
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.3.dist-info}/METADATA +2 -2
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.3.dist-info}/RECORD +23 -22
- optimization/performance_analysis.py +1 -1
- tools/bad_pdf_eval/analyser.py +1 -1
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.3.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.3.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.3.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.3.dist-info}/top_level.txt +0 -0
natural_pdf/elements/base.py
CHANGED
@@ -260,7 +260,7 @@ class DirectionalMixin:
|
|
260
260
|
|
261
261
|
Args:
|
262
262
|
height: Height of the region above, in points
|
263
|
-
width: Width mode - "full" for full page width or "element" for element width
|
263
|
+
width: Width mode - "full" (default) for full page width or "element" for element width
|
264
264
|
include_source: Whether to include this element/region in the result (default: False)
|
265
265
|
until: Optional selector string to specify an upper boundary element
|
266
266
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
@@ -268,6 +268,18 @@ class DirectionalMixin:
|
|
268
268
|
|
269
269
|
Returns:
|
270
270
|
Region object representing the area above
|
271
|
+
|
272
|
+
Examples:
|
273
|
+
```python
|
274
|
+
# Default: full page width
|
275
|
+
signature.above() # Gets everything above across full page width
|
276
|
+
|
277
|
+
# Match element width
|
278
|
+
signature.above(width='element') # Gets region above matching signature width
|
279
|
+
|
280
|
+
# Stop at specific element
|
281
|
+
signature.above(until='text:contains("Date")') # Region from date to signature
|
282
|
+
```
|
271
283
|
"""
|
272
284
|
return self._direction(
|
273
285
|
direction="above",
|
@@ -293,7 +305,7 @@ class DirectionalMixin:
|
|
293
305
|
|
294
306
|
Args:
|
295
307
|
height: Height of the region below, in points
|
296
|
-
width: Width mode - "full" for full page width or "element" for element width
|
308
|
+
width: Width mode - "full" (default) for full page width or "element" for element width
|
297
309
|
include_source: Whether to include this element/region in the result (default: False)
|
298
310
|
until: Optional selector string to specify a lower boundary element
|
299
311
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
@@ -301,6 +313,18 @@ class DirectionalMixin:
|
|
301
313
|
|
302
314
|
Returns:
|
303
315
|
Region object representing the area below
|
316
|
+
|
317
|
+
Examples:
|
318
|
+
```python
|
319
|
+
# Default: full page width
|
320
|
+
header.below() # Gets everything below across full page width
|
321
|
+
|
322
|
+
# Match element width
|
323
|
+
header.below(width='element') # Gets region below matching header width
|
324
|
+
|
325
|
+
# Limited height
|
326
|
+
header.below(height=200) # Gets 200pt tall region below header
|
327
|
+
```
|
304
328
|
"""
|
305
329
|
return self._direction(
|
306
330
|
direction="below",
|
@@ -315,7 +339,7 @@ class DirectionalMixin:
|
|
315
339
|
def left(
|
316
340
|
self,
|
317
341
|
width: Optional[float] = None,
|
318
|
-
height: str = "
|
342
|
+
height: str = "element",
|
319
343
|
include_source: bool = False,
|
320
344
|
until: Optional[str] = None,
|
321
345
|
include_endpoint: bool = True,
|
@@ -326,7 +350,7 @@ class DirectionalMixin:
|
|
326
350
|
|
327
351
|
Args:
|
328
352
|
width: Width of the region to the left, in points
|
329
|
-
height: Height mode - "
|
353
|
+
height: Height mode - "element" (default) for element height or "full" for full page height
|
330
354
|
include_source: Whether to include this element/region in the result (default: False)
|
331
355
|
until: Optional selector string to specify a left boundary element
|
332
356
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
@@ -334,6 +358,18 @@ class DirectionalMixin:
|
|
334
358
|
|
335
359
|
Returns:
|
336
360
|
Region object representing the area to the left
|
361
|
+
|
362
|
+
Examples:
|
363
|
+
```python
|
364
|
+
# Default: matches element height
|
365
|
+
table.left() # Gets region to the left at same height as table
|
366
|
+
|
367
|
+
# Full page height
|
368
|
+
table.left(height='full') # Gets entire left side of page
|
369
|
+
|
370
|
+
# Custom height
|
371
|
+
table.left(height=100) # Gets 100pt tall region to the left
|
372
|
+
```
|
337
373
|
"""
|
338
374
|
return self._direction(
|
339
375
|
direction="left",
|
@@ -348,7 +384,7 @@ class DirectionalMixin:
|
|
348
384
|
def right(
|
349
385
|
self,
|
350
386
|
width: Optional[float] = None,
|
351
|
-
height: str = "
|
387
|
+
height: str = "element",
|
352
388
|
include_source: bool = False,
|
353
389
|
until: Optional[str] = None,
|
354
390
|
include_endpoint: bool = True,
|
@@ -359,7 +395,7 @@ class DirectionalMixin:
|
|
359
395
|
|
360
396
|
Args:
|
361
397
|
width: Width of the region to the right, in points
|
362
|
-
height: Height mode - "
|
398
|
+
height: Height mode - "element" (default) for element height or "full" for full page height
|
363
399
|
include_source: Whether to include this element/region in the result (default: False)
|
364
400
|
until: Optional selector string to specify a right boundary element
|
365
401
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
@@ -367,6 +403,18 @@ class DirectionalMixin:
|
|
367
403
|
|
368
404
|
Returns:
|
369
405
|
Region object representing the area to the right
|
406
|
+
|
407
|
+
Examples:
|
408
|
+
```python
|
409
|
+
# Default: matches element height
|
410
|
+
label.right() # Gets region to the right at same height as label
|
411
|
+
|
412
|
+
# Full page height
|
413
|
+
label.right(height='full') # Gets entire right side of page
|
414
|
+
|
415
|
+
# Custom height
|
416
|
+
label.right(height=50) # Gets 50pt tall region to the right
|
417
|
+
```
|
370
418
|
"""
|
371
419
|
return self._direction(
|
372
420
|
direction="right",
|
@@ -381,8 +429,28 @@ class DirectionalMixin:
|
|
381
429
|
def to_region(self):
|
382
430
|
return self.expand()
|
383
431
|
|
432
|
+
@overload
|
433
|
+
def expand(self, amount: float) -> "Region":
|
434
|
+
"""Expand in all directions by the same amount."""
|
435
|
+
...
|
436
|
+
|
437
|
+
@overload
|
438
|
+
def expand(
|
439
|
+
self,
|
440
|
+
*,
|
441
|
+
left: float = 0,
|
442
|
+
right: float = 0,
|
443
|
+
top: float = 0,
|
444
|
+
bottom: float = 0,
|
445
|
+
width_factor: float = 1.0,
|
446
|
+
height_factor: float = 1.0,
|
447
|
+
) -> "Region":
|
448
|
+
"""Expand by different amounts in each direction."""
|
449
|
+
...
|
450
|
+
|
384
451
|
def expand(
|
385
452
|
self,
|
453
|
+
amount: Optional[float] = None,
|
386
454
|
left: float = 0,
|
387
455
|
right: float = 0,
|
388
456
|
top: float = 0,
|
@@ -394,6 +462,7 @@ class DirectionalMixin:
|
|
394
462
|
Create a new region expanded from this element/region.
|
395
463
|
|
396
464
|
Args:
|
465
|
+
amount: If provided as the first positional argument, expand all edges by this amount
|
397
466
|
left: Amount to expand left edge (positive value expands leftwards)
|
398
467
|
right: Amount to expand right edge (positive value expands rightwards)
|
399
468
|
top: Amount to expand top edge (positive value expands upwards)
|
@@ -403,7 +472,20 @@ class DirectionalMixin:
|
|
403
472
|
|
404
473
|
Returns:
|
405
474
|
New expanded Region object
|
475
|
+
|
476
|
+
Examples:
|
477
|
+
# Expand 5 pixels in all directions
|
478
|
+
expanded = element.expand(5)
|
479
|
+
|
480
|
+
# Expand by different amounts in each direction
|
481
|
+
expanded = element.expand(left=10, right=5, top=3, bottom=7)
|
482
|
+
|
483
|
+
# Use width/height factors
|
484
|
+
expanded = element.expand(width_factor=1.5, height_factor=2.0)
|
406
485
|
"""
|
486
|
+
# If amount is provided as first positional argument, use it for all directions
|
487
|
+
if amount is not None:
|
488
|
+
left = right = top = bottom = amount
|
407
489
|
# Start with current coordinates
|
408
490
|
new_x0 = self.x0
|
409
491
|
new_x1 = self.x1
|
@@ -1212,7 +1294,7 @@ class Element(
|
|
1212
1294
|
self,
|
1213
1295
|
*,
|
1214
1296
|
text: str,
|
1215
|
-
|
1297
|
+
overlap: str = "full",
|
1216
1298
|
apply_exclusions: bool = True,
|
1217
1299
|
regex: bool = False,
|
1218
1300
|
case: bool = True,
|
@@ -1224,7 +1306,7 @@ class Element(
|
|
1224
1306
|
self,
|
1225
1307
|
selector: str,
|
1226
1308
|
*,
|
1227
|
-
|
1309
|
+
overlap: str = "full",
|
1228
1310
|
apply_exclusions: bool = True,
|
1229
1311
|
regex: bool = False,
|
1230
1312
|
case: bool = True,
|
@@ -1236,7 +1318,7 @@ class Element(
|
|
1236
1318
|
selector: Optional[str] = None,
|
1237
1319
|
*,
|
1238
1320
|
text: Optional[str] = None,
|
1239
|
-
|
1321
|
+
overlap: str = "full",
|
1240
1322
|
apply_exclusions: bool = True,
|
1241
1323
|
regex: bool = False,
|
1242
1324
|
case: bool = True,
|
@@ -1251,9 +1333,9 @@ class Element(
|
|
1251
1333
|
Args:
|
1252
1334
|
selector: CSS-like selector string.
|
1253
1335
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
1254
|
-
|
1255
|
-
'
|
1256
|
-
(default: "
|
1336
|
+
overlap: How to determine if elements overlap with this element: 'full' (fully inside),
|
1337
|
+
'partial' (any overlap), or 'center' (center point inside).
|
1338
|
+
(default: "full")
|
1257
1339
|
apply_exclusions: Whether to apply exclusion regions (default: True).
|
1258
1340
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
1259
1341
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -1270,7 +1352,7 @@ class Element(
|
|
1270
1352
|
return temp_region.find(
|
1271
1353
|
selector=selector,
|
1272
1354
|
text=text,
|
1273
|
-
|
1355
|
+
overlap=overlap,
|
1274
1356
|
apply_exclusions=apply_exclusions,
|
1275
1357
|
regex=regex,
|
1276
1358
|
case=case,
|
@@ -1282,7 +1364,7 @@ class Element(
|
|
1282
1364
|
self,
|
1283
1365
|
*,
|
1284
1366
|
text: str,
|
1285
|
-
|
1367
|
+
overlap: str = "full",
|
1286
1368
|
apply_exclusions: bool = True,
|
1287
1369
|
regex: bool = False,
|
1288
1370
|
case: bool = True,
|
@@ -1294,7 +1376,7 @@ class Element(
|
|
1294
1376
|
self,
|
1295
1377
|
selector: str,
|
1296
1378
|
*,
|
1297
|
-
|
1379
|
+
overlap: str = "full",
|
1298
1380
|
apply_exclusions: bool = True,
|
1299
1381
|
regex: bool = False,
|
1300
1382
|
case: bool = True,
|
@@ -1306,7 +1388,7 @@ class Element(
|
|
1306
1388
|
selector: Optional[str] = None,
|
1307
1389
|
*,
|
1308
1390
|
text: Optional[str] = None,
|
1309
|
-
|
1391
|
+
overlap: str = "full",
|
1310
1392
|
apply_exclusions: bool = True,
|
1311
1393
|
regex: bool = False,
|
1312
1394
|
case: bool = True,
|
@@ -1321,9 +1403,9 @@ class Element(
|
|
1321
1403
|
Args:
|
1322
1404
|
selector: CSS-like selector string.
|
1323
1405
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
1324
|
-
|
1325
|
-
'
|
1326
|
-
(default: "
|
1406
|
+
overlap: How to determine if elements overlap with this element: 'full' (fully inside),
|
1407
|
+
'partial' (any overlap), or 'center' (center point inside).
|
1408
|
+
(default: "full")
|
1327
1409
|
apply_exclusions: Whether to apply exclusion regions (default: True).
|
1328
1410
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
1329
1411
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -1340,7 +1422,7 @@ class Element(
|
|
1340
1422
|
return temp_region.find_all(
|
1341
1423
|
selector=selector,
|
1342
1424
|
text=text,
|
1343
|
-
|
1425
|
+
overlap=overlap,
|
1344
1426
|
apply_exclusions=apply_exclusions,
|
1345
1427
|
regex=regex,
|
1346
1428
|
case=case,
|
natural_pdf/elements/element_collection.py
CHANGED
@@ -891,6 +891,7 @@ class ElementCollection(
|
|
891
891
|
label_format: Optional[str] = None,
|
892
892
|
annotate: Optional[List[str]] = None,
|
893
893
|
bins: Optional[Union[int, List[float]]] = None,
|
894
|
+
**kwargs,
|
894
895
|
) -> List[Dict]:
|
895
896
|
"""
|
896
897
|
Determines the parameters for highlighting each element based on the strategy.
|
@@ -1672,9 +1673,9 @@ class ElementCollection(
|
|
1672
1673
|
|
1673
1674
|
Args:
|
1674
1675
|
selector: CSS-like selector string
|
1675
|
-
|
1676
|
-
'
|
1677
|
-
(default: "
|
1676
|
+
overlap: How to determine if elements overlap: 'full' (fully inside),
|
1677
|
+
'partial' (any overlap), or 'center' (center point inside).
|
1678
|
+
(default: "full")
|
1678
1679
|
apply_exclusions: Whether to exclude elements in exclusion regions
|
1679
1680
|
"""
|
1680
1681
|
return self.apply(lambda element: element.find(selector, **kwargs))
|
@@ -1684,7 +1685,7 @@ class ElementCollection(
|
|
1684
1685
|
self,
|
1685
1686
|
*,
|
1686
1687
|
text: str,
|
1687
|
-
|
1688
|
+
overlap: str = "full",
|
1688
1689
|
apply_exclusions: bool = True,
|
1689
1690
|
regex: bool = False,
|
1690
1691
|
case: bool = True,
|
@@ -1696,7 +1697,7 @@ class ElementCollection(
|
|
1696
1697
|
self,
|
1697
1698
|
selector: str,
|
1698
1699
|
*,
|
1699
|
-
|
1700
|
+
overlap: str = "full",
|
1700
1701
|
apply_exclusions: bool = True,
|
1701
1702
|
regex: bool = False,
|
1702
1703
|
case: bool = True,
|
@@ -1708,7 +1709,7 @@ class ElementCollection(
|
|
1708
1709
|
selector: Optional[str] = None,
|
1709
1710
|
*,
|
1710
1711
|
text: Optional[str] = None,
|
1711
|
-
|
1712
|
+
overlap: str = "full",
|
1712
1713
|
apply_exclusions: bool = True,
|
1713
1714
|
regex: bool = False,
|
1714
1715
|
case: bool = True,
|
@@ -1723,9 +1724,9 @@ class ElementCollection(
|
|
1723
1724
|
Args:
|
1724
1725
|
selector: CSS-like selector string.
|
1725
1726
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
1726
|
-
|
1727
|
-
'
|
1728
|
-
(default: "
|
1727
|
+
overlap: How to determine if elements overlap: 'full' (fully inside),
|
1728
|
+
'partial' (any overlap), or 'center' (center point inside).
|
1729
|
+
(default: "full")
|
1729
1730
|
apply_exclusions: Whether to apply exclusion regions (default: True).
|
1730
1731
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
1731
1732
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -1747,7 +1748,7 @@ class ElementCollection(
|
|
1747
1748
|
found_in_element: "ElementCollection" = element.find_all(
|
1748
1749
|
selector=selector,
|
1749
1750
|
text=text,
|
1750
|
-
|
1751
|
+
overlap=overlap,
|
1751
1752
|
apply_exclusions=apply_exclusions,
|
1752
1753
|
regex=regex,
|
1753
1754
|
case=case,
|
natural_pdf/elements/region.py
CHANGED
@@ -960,7 +960,7 @@ class Region(
|
|
960
960
|
right_content_col = min(width - 1, content_col_indices[-1] + padding)
|
961
961
|
|
962
962
|
# Convert trimmed pixel coordinates back to PDF coordinates
|
963
|
-
scale_factor = resolution / 72.0 # Scale factor used in
|
963
|
+
scale_factor = resolution / 72.0 # Scale factor used in render()
|
964
964
|
|
965
965
|
# Calculate new PDF coordinates and ensure they are Python floats
|
966
966
|
trimmed_x0 = float(work_region.x0 + (left_content_col / scale_factor))
|
@@ -1982,7 +1982,7 @@ class Region(
|
|
1982
1982
|
self,
|
1983
1983
|
*,
|
1984
1984
|
text: str,
|
1985
|
-
|
1985
|
+
overlap: str = "full",
|
1986
1986
|
apply_exclusions: bool = True,
|
1987
1987
|
regex: bool = False,
|
1988
1988
|
case: bool = True,
|
@@ -1994,7 +1994,7 @@ class Region(
|
|
1994
1994
|
self,
|
1995
1995
|
selector: str,
|
1996
1996
|
*,
|
1997
|
-
|
1997
|
+
overlap: str = "full",
|
1998
1998
|
apply_exclusions: bool = True,
|
1999
1999
|
regex: bool = False,
|
2000
2000
|
case: bool = True,
|
@@ -2006,7 +2006,7 @@ class Region(
|
|
2006
2006
|
selector: Optional[str] = None, # Now optional
|
2007
2007
|
*,
|
2008
2008
|
text: Optional[str] = None, # New text parameter
|
2009
|
-
|
2009
|
+
overlap: str = "full", # How elements overlap with the region
|
2010
2010
|
apply_exclusions: bool = True,
|
2011
2011
|
regex: bool = False,
|
2012
2012
|
case: bool = True,
|
@@ -2020,9 +2020,9 @@ class Region(
|
|
2020
2020
|
Args:
|
2021
2021
|
selector: CSS-like selector string.
|
2022
2022
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
2023
|
-
|
2024
|
-
'
|
2025
|
-
(default: "
|
2023
|
+
overlap: How to determine if elements overlap with the region: 'full' (fully inside),
|
2024
|
+
'partial' (any overlap), or 'center' (center point inside).
|
2025
|
+
(default: "full")
|
2026
2026
|
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
2027
2027
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
2028
2028
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -2035,7 +2035,7 @@ class Region(
|
|
2035
2035
|
elements = self.find_all(
|
2036
2036
|
selector=selector,
|
2037
2037
|
text=text,
|
2038
|
-
|
2038
|
+
overlap=overlap,
|
2039
2039
|
apply_exclusions=apply_exclusions,
|
2040
2040
|
regex=regex,
|
2041
2041
|
case=case,
|
@@ -2048,7 +2048,7 @@ class Region(
|
|
2048
2048
|
self,
|
2049
2049
|
*,
|
2050
2050
|
text: str,
|
2051
|
-
|
2051
|
+
overlap: str = "full",
|
2052
2052
|
apply_exclusions: bool = True,
|
2053
2053
|
regex: bool = False,
|
2054
2054
|
case: bool = True,
|
@@ -2060,7 +2060,7 @@ class Region(
|
|
2060
2060
|
self,
|
2061
2061
|
selector: str,
|
2062
2062
|
*,
|
2063
|
-
|
2063
|
+
overlap: str = "full",
|
2064
2064
|
apply_exclusions: bool = True,
|
2065
2065
|
regex: bool = False,
|
2066
2066
|
case: bool = True,
|
@@ -2072,7 +2072,7 @@ class Region(
|
|
2072
2072
|
selector: Optional[str] = None, # Now optional
|
2073
2073
|
*,
|
2074
2074
|
text: Optional[str] = None, # New text parameter
|
2075
|
-
|
2075
|
+
overlap: str = "full", # How elements overlap with the region
|
2076
2076
|
apply_exclusions: bool = True,
|
2077
2077
|
regex: bool = False,
|
2078
2078
|
case: bool = True,
|
@@ -2086,9 +2086,9 @@ class Region(
|
|
2086
2086
|
Args:
|
2087
2087
|
selector: CSS-like selector string.
|
2088
2088
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
2089
|
-
|
2090
|
-
'
|
2091
|
-
(default: "
|
2089
|
+
overlap: How to determine if elements overlap with the region: 'full' (fully inside),
|
2090
|
+
'partial' (any overlap), or 'center' (center point inside).
|
2091
|
+
(default: "full")
|
2092
2092
|
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
2093
2093
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
2094
2094
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -2104,10 +2104,10 @@ class Region(
|
|
2104
2104
|
if selector is None and text is None:
|
2105
2105
|
raise ValueError("Provide either 'selector' or 'text'.")
|
2106
2106
|
|
2107
|
-
# Validate
|
2108
|
-
if
|
2107
|
+
# Validate overlap parameter
|
2108
|
+
if overlap not in ["full", "partial", "center"]:
|
2109
2109
|
raise ValueError(
|
2110
|
-
f"Invalid
|
2110
|
+
f"Invalid overlap value: {overlap}. Must be 'full', 'partial', or 'center'"
|
2111
2111
|
)
|
2112
2112
|
|
2113
2113
|
# Construct selector if 'text' is provided
|
@@ -2142,7 +2142,7 @@ class Region(
|
|
2142
2142
|
region_bbox = self.bbox
|
2143
2143
|
matching_elements = []
|
2144
2144
|
|
2145
|
-
if
|
2145
|
+
if overlap == "full": # Fully inside (strict)
|
2146
2146
|
matching_elements = [
|
2147
2147
|
el
|
2148
2148
|
for el in potential_elements
|
@@ -2151,9 +2151,9 @@ class Region(
|
|
2151
2151
|
and el.x1 <= region_bbox[2]
|
2152
2152
|
and el.bottom <= region_bbox[3]
|
2153
2153
|
]
|
2154
|
-
elif
|
2154
|
+
elif overlap == "partial": # Any overlap
|
2155
2155
|
matching_elements = [el for el in potential_elements if self.intersects(el)]
|
2156
|
-
elif
|
2156
|
+
elif overlap == "center": # Center point inside
|
2157
2157
|
matching_elements = [
|
2158
2158
|
el for el in potential_elements if self.is_element_center_inside(el)
|
2159
2159
|
]
|
@@ -3437,7 +3437,7 @@ class Region(
|
|
3437
3437
|
r_idx = int(cell.metadata.get("row_index"))
|
3438
3438
|
c_idx = int(cell.metadata.get("col_index"))
|
3439
3439
|
text_val = cell.extract_text(
|
3440
|
-
layout=False, apply_exclusions=
|
3440
|
+
layout=False, apply_exclusions=True, content_filter=content_filter
|
3441
3441
|
).strip()
|
3442
3442
|
table_grid[r_idx][c_idx] = text_val if text_val else None
|
3443
3443
|
except Exception as _err:
|
natural_pdf/elements/text.py
CHANGED
@@ -215,6 +215,11 @@ class TextElement(Element):
|
|
215
215
|
if isinstance(color, (int, float)):
|
216
216
|
return (color, color, color)
|
217
217
|
|
218
|
+
# If it's a single-value tuple (grayscale), treat as grayscale
|
219
|
+
if isinstance(color, tuple) and len(color) == 1:
|
220
|
+
gray = color[0]
|
221
|
+
return (gray, gray, gray)
|
222
|
+
|
218
223
|
# If it's a tuple of 3 values, treat as RGB
|
219
224
|
if isinstance(color, tuple) and len(color) == 3:
|
220
225
|
return color
|
natural_pdf/extraction/manager.py
CHANGED
@@ -119,17 +119,11 @@ class StructuredDataManager:
|
|
119
119
|
)
|
120
120
|
messages = self._prepare_llm_messages(content, prompt, using, schema)
|
121
121
|
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
)
|
131
|
-
except Exception as e:
|
132
|
-
logger.error(f"Extraction failed: {str(e)}")
|
133
|
-
return StructuredDataResult(
|
134
|
-
data=None, success=False, error_message=str(e), model_used=selected_model
|
135
|
-
)
|
122
|
+
logger.debug(f"Extracting with model '{selected_model}'")
|
123
|
+
completion = client.beta.chat.completions.parse(
|
124
|
+
model=selected_model, messages=messages, response_format=schema, **kwargs
|
125
|
+
)
|
126
|
+
parsed_data = completion.choices[0].message.parsed
|
127
|
+
return StructuredDataResult(
|
128
|
+
data=parsed_data, success=True, error_message=None, model_used=selected_model
|
129
|
+
)
|
natural_pdf/extraction/mixin.py
CHANGED
@@ -35,7 +35,7 @@ class ExtractionMixin(ABC):
|
|
35
35
|
|
36
36
|
Host class requirements:
|
37
37
|
- Must implement extract_text(**kwargs) -> str
|
38
|
-
- Must implement
|
38
|
+
- Must implement render(**kwargs) -> PIL.Image
|
39
39
|
- Must have access to StructuredDataManager (usually via parent PDF)
|
40
40
|
|
41
41
|
Example:
|
@@ -72,25 +72,24 @@ class ExtractionMixin(ABC):
|
|
72
72
|
|
73
73
|
Args:
|
74
74
|
using: 'text' or 'vision'
|
75
|
-
**kwargs: Additional arguments passed to extract_text or
|
75
|
+
**kwargs: Additional arguments passed to extract_text or render
|
76
76
|
|
77
77
|
Returns:
|
78
78
|
str: Extracted text if using='text'
|
79
79
|
PIL.Image.Image: Rendered image if using='vision'
|
80
80
|
None: If content cannot be retrieved
|
81
81
|
"""
|
82
|
-
if not hasattr(self, "extract_text") or not callable(self.extract_text):
|
83
|
-
logger.error(f"ExtractionMixin requires 'extract_text' method on {self!r}")
|
84
|
-
return None
|
85
|
-
if not hasattr(self, "to_image") or not callable(self.to_image):
|
86
|
-
logger.error(f"ExtractionMixin requires 'to_image' method on {self!r}")
|
87
|
-
return None
|
88
|
-
|
89
82
|
try:
|
90
83
|
if using == "text":
|
84
|
+
if not hasattr(self, "extract_text") or not callable(self.extract_text):
|
85
|
+
logger.error(f"ExtractionMixin requires 'extract_text' method on {self!r}")
|
86
|
+
return None
|
91
87
|
layout = kwargs.pop("layout", True)
|
92
88
|
return self.extract_text(layout=layout, **kwargs)
|
93
89
|
elif using == "vision":
|
90
|
+
if not hasattr(self, "render") or not callable(self.render):
|
91
|
+
logger.error(f"ExtractionMixin requires 'render' method on {self!r}")
|
92
|
+
return None
|
94
93
|
resolution = kwargs.pop("resolution", 72)
|
95
94
|
include_highlights = kwargs.pop("include_highlights", False)
|
96
95
|
labels = kwargs.pop("labels", False)
|
@@ -102,8 +101,13 @@ class ExtractionMixin(ABC):
|
|
102
101
|
logger.error(f"Unsupported value for 'using': {using}")
|
103
102
|
return None
|
104
103
|
except Exception as e:
|
105
|
-
|
106
|
-
|
104
|
+
import warnings
|
105
|
+
|
106
|
+
warnings.warn(
|
107
|
+
f"Error getting {using} content from {self!r}: {e}",
|
108
|
+
RuntimeWarning,
|
109
|
+
)
|
110
|
+
raise
|
107
111
|
|
108
112
|
def extract(
|
109
113
|
self: Any,
|
@@ -275,10 +279,7 @@ class ExtractionMixin(ABC):
|
|
275
279
|
raise RuntimeError("StructuredDataManager is not available")
|
276
280
|
|
277
281
|
# Get content
|
278
|
-
|
279
|
-
content = self._get_extraction_content(
|
280
|
-
using=using, layout=layout_for_text, **kwargs
|
281
|
-
) # Pass kwargs
|
282
|
+
content = self._get_extraction_content(using=using, **kwargs) # Pass kwargs
|
282
283
|
|
283
284
|
if content is None or (
|
284
285
|
using == "text" and isinstance(content, str) and not content.strip()
|
@@ -359,10 +360,11 @@ class ExtractionMixin(ABC):
|
|
359
360
|
)
|
360
361
|
|
361
362
|
if not result.success:
|
362
|
-
|
363
|
-
|
364
|
-
f"
|
363
|
+
# Return None for failed extractions to allow batch processing to continue
|
364
|
+
logger.warning(
|
365
|
+
f"Extraction '{target_key}' failed: {result.error_message}. Returning None."
|
365
366
|
)
|
367
|
+
return None
|
366
368
|
|
367
369
|
if result.data is None:
|
368
370
|
# This case might occur if success=True but data is somehow None
|
@@ -591,16 +593,28 @@ class ExtractionMixin(ABC):
|
|
591
593
|
raise RuntimeError("StructuredDataManager is not available")
|
592
594
|
|
593
595
|
# Content preparation
|
594
|
-
|
595
|
-
|
596
|
+
content = self._get_extraction_content(using=using, **kwargs)
|
597
|
+
|
598
|
+
import warnings
|
596
599
|
|
597
600
|
if content is None or (
|
598
601
|
using == "text" and isinstance(content, str) and not content.strip()
|
599
602
|
):
|
603
|
+
preview = None
|
604
|
+
if isinstance(content, str):
|
605
|
+
preview = content[:120]
|
606
|
+
msg = (
|
607
|
+
f"No content available for extraction (using='{using}'). "
|
608
|
+
"Ensure the page has a text layer or render() returns an image. "
|
609
|
+
"For scanned PDFs run apply_ocr() or switch to using='vision'. "
|
610
|
+
f"Content preview: {preview!r}"
|
611
|
+
)
|
612
|
+
warnings.warn(msg, RuntimeWarning)
|
613
|
+
|
600
614
|
result = StructuredDataResult(
|
601
615
|
data=None,
|
602
616
|
success=False,
|
603
|
-
error_message=
|
617
|
+
error_message=msg,
|
604
618
|
model_used=model,
|
605
619
|
)
|
606
620
|
else:
|
natural_pdf/selectors/parser.py
CHANGED
@@ -721,8 +721,8 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
|
|
721
721
|
# Start with a base name, modify for specifics like :not
|
722
722
|
filter_name = f"pseudo-class :{name}"
|
723
723
|
|
724
|
-
# Relational pseudo-classes are handled separately by the caller
|
725
|
-
if name in ("above", "below", "near", "left-of", "right-of"):
|
724
|
+
# Relational pseudo-classes and collection-level pseudo-classes are handled separately by the caller
|
725
|
+
if name in ("above", "below", "near", "left-of", "right-of", "first", "last"):
|
726
726
|
continue
|
727
727
|
|
728
728
|
# --- Handle :not() ---
|