natural-pdf 0.2.2__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/collections/mixins.py +16 -3
- natural_pdf/core/highlighting_service.py +25 -1
- natural_pdf/core/page.py +3 -3
- natural_pdf/core/page_collection.py +14 -14
- natural_pdf/core/render_spec.py +44 -0
- natural_pdf/elements/base.py +48 -14
- natural_pdf/elements/element_collection.py +10 -10
- natural_pdf/elements/region.py +19 -19
- {natural_pdf-0.2.2.dist-info → natural_pdf-0.2.3.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.2.dist-info → natural_pdf-0.2.3.dist-info}/RECORD +14 -14
- {natural_pdf-0.2.2.dist-info → natural_pdf-0.2.3.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.2.dist-info → natural_pdf-0.2.3.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.2.dist-info → natural_pdf-0.2.3.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.2.dist-info → natural_pdf-0.2.3.dist-info}/top_level.txt +0 -0
@@ -29,9 +29,22 @@ class DirectionalCollectionMixin:
|
|
29
29
|
"""Find regions to the right of all elements in this collection."""
|
30
30
|
return self.apply(lambda element: element.right(**kwargs))
|
31
31
|
|
32
|
-
def expand(self, **kwargs) -> "ElementCollection":
|
33
|
-
"""Expand all elements in this collection.
|
34
|
-
|
32
|
+
def expand(self, *args, **kwargs) -> "ElementCollection":
|
33
|
+
"""Expand all elements in this collection.
|
34
|
+
|
35
|
+
Args:
|
36
|
+
*args: If a single positional argument is provided, expands all elements
|
37
|
+
by that amount in all directions.
|
38
|
+
**kwargs: Keyword arguments for directional expansion (left, right, top, bottom, etc.)
|
39
|
+
|
40
|
+
Examples:
|
41
|
+
# Expand all elements by 5 pixels in all directions
|
42
|
+
collection.expand(5)
|
43
|
+
|
44
|
+
# Expand with different amounts in each direction
|
45
|
+
collection.expand(left=10, right=5, top=3, bottom=7)
|
46
|
+
"""
|
47
|
+
return self.apply(lambda element: element.expand(*args, **kwargs))
|
35
48
|
|
36
49
|
|
37
50
|
class ApplyMixin:
|
@@ -335,6 +335,7 @@ class HighlightContext:
|
|
335
335
|
self.show_on_exit = show_on_exit
|
336
336
|
self.highlight_groups = []
|
337
337
|
self._color_manager = ColorManager()
|
338
|
+
self._exit_image = None # Store image for Jupyter display
|
338
339
|
|
339
340
|
def add(
|
340
341
|
self,
|
@@ -421,6 +422,11 @@ class HighlightContext:
|
|
421
422
|
)
|
422
423
|
return None
|
423
424
|
|
425
|
+
@property
|
426
|
+
def image(self) -> Optional[Image.Image]:
|
427
|
+
"""Get the last generated image (useful after context exit)."""
|
428
|
+
return self._exit_image
|
429
|
+
|
424
430
|
def __enter__(self) -> "HighlightContext":
|
425
431
|
"""Enter the context."""
|
426
432
|
return self
|
@@ -428,7 +434,25 @@ class HighlightContext:
|
|
428
434
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
429
435
|
"""Exit the context, optionally showing highlights."""
|
430
436
|
if self.show_on_exit and not exc_type:
|
431
|
-
self.show()
|
437
|
+
self._exit_image = self.show()
|
438
|
+
|
439
|
+
# Check if we're in a Jupyter/IPython environment
|
440
|
+
try:
|
441
|
+
# Try to get IPython instance
|
442
|
+
from IPython import get_ipython
|
443
|
+
|
444
|
+
ipython = get_ipython()
|
445
|
+
if ipython is not None:
|
446
|
+
# We're in IPython/Jupyter
|
447
|
+
from IPython.display import display
|
448
|
+
|
449
|
+
if self._exit_image is not None:
|
450
|
+
display(self._exit_image)
|
451
|
+
except (ImportError, NameError):
|
452
|
+
# Not in Jupyter or IPython not available - that's OK
|
453
|
+
pass
|
454
|
+
|
455
|
+
# __exit__ must return False to not suppress exceptions
|
432
456
|
return False
|
433
457
|
|
434
458
|
|
natural_pdf/core/page.py
CHANGED
@@ -1976,7 +1976,7 @@ class Page(
|
|
1976
1976
|
"""Get all line elements on this page."""
|
1977
1977
|
return self._element_mgr.lines
|
1978
1978
|
|
1979
|
-
def
|
1979
|
+
def add_highlight(
|
1980
1980
|
self,
|
1981
1981
|
bbox: Optional[Tuple[float, float, float, float]] = None,
|
1982
1982
|
color: Optional[Union[Tuple, str]] = None,
|
@@ -1987,7 +1987,7 @@ class Page(
|
|
1987
1987
|
existing: str = "append",
|
1988
1988
|
) -> "Page":
|
1989
1989
|
"""
|
1990
|
-
|
1990
|
+
Add a highlight to a bounding box or the entire page.
|
1991
1991
|
Delegates to the central HighlightingService.
|
1992
1992
|
|
1993
1993
|
Args:
|
@@ -2015,7 +2015,7 @@ class Page(
|
|
2015
2015
|
)
|
2016
2016
|
return self
|
2017
2017
|
|
2018
|
-
def
|
2018
|
+
def add_highlight_polygon(
|
2019
2019
|
self,
|
2020
2020
|
polygon: List[Tuple[float, float]],
|
2021
2021
|
color: Optional[Union[Tuple, str]] = None,
|
@@ -259,7 +259,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
259
259
|
self,
|
260
260
|
*,
|
261
261
|
text: str,
|
262
|
-
|
262
|
+
overlap: str = "full",
|
263
263
|
apply_exclusions: bool = True,
|
264
264
|
regex: bool = False,
|
265
265
|
case: bool = True,
|
@@ -271,7 +271,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
271
271
|
self,
|
272
272
|
selector: str,
|
273
273
|
*,
|
274
|
-
|
274
|
+
overlap: str = "full",
|
275
275
|
apply_exclusions: bool = True,
|
276
276
|
regex: bool = False,
|
277
277
|
case: bool = True,
|
@@ -283,7 +283,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
283
283
|
selector: Optional[str] = None,
|
284
284
|
*,
|
285
285
|
text: Optional[str] = None,
|
286
|
-
|
286
|
+
overlap: str = "full",
|
287
287
|
apply_exclusions: bool = True,
|
288
288
|
regex: bool = False,
|
289
289
|
case: bool = True,
|
@@ -297,9 +297,9 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
297
297
|
Args:
|
298
298
|
selector: CSS-like selector string.
|
299
299
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
300
|
-
|
301
|
-
'
|
302
|
-
(default: "
|
300
|
+
overlap: How to determine if elements overlap: 'full' (fully inside),
|
301
|
+
'partial' (any overlap), or 'center' (center point inside).
|
302
|
+
(default: "full")
|
303
303
|
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
304
304
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
305
305
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -313,7 +313,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
313
313
|
element = page.find(
|
314
314
|
selector=selector,
|
315
315
|
text=text,
|
316
|
-
|
316
|
+
overlap=overlap,
|
317
317
|
apply_exclusions=apply_exclusions,
|
318
318
|
regex=regex,
|
319
319
|
case=case,
|
@@ -328,7 +328,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
328
328
|
self,
|
329
329
|
*,
|
330
330
|
text: str,
|
331
|
-
|
331
|
+
overlap: str = "full",
|
332
332
|
apply_exclusions: bool = True,
|
333
333
|
regex: bool = False,
|
334
334
|
case: bool = True,
|
@@ -340,7 +340,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
340
340
|
self,
|
341
341
|
selector: str,
|
342
342
|
*,
|
343
|
-
|
343
|
+
overlap: str = "full",
|
344
344
|
apply_exclusions: bool = True,
|
345
345
|
regex: bool = False,
|
346
346
|
case: bool = True,
|
@@ -352,7 +352,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
352
352
|
selector: Optional[str] = None,
|
353
353
|
*,
|
354
354
|
text: Optional[str] = None,
|
355
|
-
|
355
|
+
overlap: str = "full",
|
356
356
|
apply_exclusions: bool = True,
|
357
357
|
regex: bool = False,
|
358
358
|
case: bool = True,
|
@@ -366,9 +366,9 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
366
366
|
Args:
|
367
367
|
selector: CSS-like selector string.
|
368
368
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
369
|
-
|
370
|
-
'
|
371
|
-
(default: "
|
369
|
+
overlap: How to determine if elements overlap: 'full' (fully inside),
|
370
|
+
'partial' (any overlap), or 'center' (center point inside).
|
371
|
+
(default: "full")
|
372
372
|
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
373
373
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
374
374
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -383,7 +383,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
383
383
|
elements = page.find_all(
|
384
384
|
selector=selector,
|
385
385
|
text=text,
|
386
|
-
|
386
|
+
overlap=overlap,
|
387
387
|
apply_exclusions=apply_exclusions,
|
388
388
|
regex=regex,
|
389
389
|
case=case,
|
natural_pdf/core/render_spec.py
CHANGED
@@ -92,6 +92,50 @@ class Visualizable:
|
|
92
92
|
_get_render_specs() to gain full image generation capabilities.
|
93
93
|
"""
|
94
94
|
|
95
|
+
def highlight(self, *elements, **kwargs):
|
96
|
+
"""
|
97
|
+
Convenience method for highlighting elements in Jupyter/Colab.
|
98
|
+
|
99
|
+
This method creates a highlight context, adds the elements, and returns
|
100
|
+
the resulting image. It's designed for simple one-liner usage in notebooks.
|
101
|
+
|
102
|
+
Args:
|
103
|
+
*elements: Elements or element collections to highlight
|
104
|
+
**kwargs: Additional parameters passed to show()
|
105
|
+
|
106
|
+
Returns:
|
107
|
+
PIL Image with highlights
|
108
|
+
|
109
|
+
Example:
|
110
|
+
# Simple one-liner highlighting
|
111
|
+
page.highlight(left, mid, right)
|
112
|
+
|
113
|
+
# With custom colors
|
114
|
+
page.highlight(
|
115
|
+
(tables, 'blue'),
|
116
|
+
(headers, 'red'),
|
117
|
+
(footers, 'green')
|
118
|
+
)
|
119
|
+
"""
|
120
|
+
from natural_pdf.core.highlighting_service import HighlightContext
|
121
|
+
|
122
|
+
# Create context and add elements
|
123
|
+
ctx = HighlightContext(self, show_on_exit=False)
|
124
|
+
|
125
|
+
for element in elements:
|
126
|
+
if isinstance(element, tuple) and len(element) == 2:
|
127
|
+
# Element with color: (element, color)
|
128
|
+
ctx.add(element[0], color=element[1])
|
129
|
+
elif isinstance(element, tuple) and len(element) == 3:
|
130
|
+
# Element with color and label: (element, color, label)
|
131
|
+
ctx.add(element[0], color=element[1], label=element[2])
|
132
|
+
else:
|
133
|
+
# Just element
|
134
|
+
ctx.add(element)
|
135
|
+
|
136
|
+
# Return the image directly
|
137
|
+
return ctx.show(**kwargs)
|
138
|
+
|
95
139
|
def _get_render_specs(
|
96
140
|
self, mode: Literal["show", "render"] = "show", **kwargs
|
97
141
|
) -> List[RenderSpec]:
|
natural_pdf/elements/base.py
CHANGED
@@ -429,8 +429,28 @@ class DirectionalMixin:
|
|
429
429
|
def to_region(self):
|
430
430
|
return self.expand()
|
431
431
|
|
432
|
+
@overload
|
433
|
+
def expand(self, amount: float) -> "Region":
|
434
|
+
"""Expand in all directions by the same amount."""
|
435
|
+
...
|
436
|
+
|
437
|
+
@overload
|
438
|
+
def expand(
|
439
|
+
self,
|
440
|
+
*,
|
441
|
+
left: float = 0,
|
442
|
+
right: float = 0,
|
443
|
+
top: float = 0,
|
444
|
+
bottom: float = 0,
|
445
|
+
width_factor: float = 1.0,
|
446
|
+
height_factor: float = 1.0,
|
447
|
+
) -> "Region":
|
448
|
+
"""Expand by different amounts in each direction."""
|
449
|
+
...
|
450
|
+
|
432
451
|
def expand(
|
433
452
|
self,
|
453
|
+
amount: Optional[float] = None,
|
434
454
|
left: float = 0,
|
435
455
|
right: float = 0,
|
436
456
|
top: float = 0,
|
@@ -442,6 +462,7 @@ class DirectionalMixin:
|
|
442
462
|
Create a new region expanded from this element/region.
|
443
463
|
|
444
464
|
Args:
|
465
|
+
amount: If provided as the first positional argument, expand all edges by this amount
|
445
466
|
left: Amount to expand left edge (positive value expands leftwards)
|
446
467
|
right: Amount to expand right edge (positive value expands rightwards)
|
447
468
|
top: Amount to expand top edge (positive value expands upwards)
|
@@ -451,7 +472,20 @@ class DirectionalMixin:
|
|
451
472
|
|
452
473
|
Returns:
|
453
474
|
New expanded Region object
|
475
|
+
|
476
|
+
Examples:
|
477
|
+
# Expand 5 pixels in all directions
|
478
|
+
expanded = element.expand(5)
|
479
|
+
|
480
|
+
# Expand by different amounts in each direction
|
481
|
+
expanded = element.expand(left=10, right=5, top=3, bottom=7)
|
482
|
+
|
483
|
+
# Use width/height factors
|
484
|
+
expanded = element.expand(width_factor=1.5, height_factor=2.0)
|
454
485
|
"""
|
486
|
+
# If amount is provided as first positional argument, use it for all directions
|
487
|
+
if amount is not None:
|
488
|
+
left = right = top = bottom = amount
|
455
489
|
# Start with current coordinates
|
456
490
|
new_x0 = self.x0
|
457
491
|
new_x1 = self.x1
|
@@ -1260,7 +1294,7 @@ class Element(
|
|
1260
1294
|
self,
|
1261
1295
|
*,
|
1262
1296
|
text: str,
|
1263
|
-
|
1297
|
+
overlap: str = "full",
|
1264
1298
|
apply_exclusions: bool = True,
|
1265
1299
|
regex: bool = False,
|
1266
1300
|
case: bool = True,
|
@@ -1272,7 +1306,7 @@ class Element(
|
|
1272
1306
|
self,
|
1273
1307
|
selector: str,
|
1274
1308
|
*,
|
1275
|
-
|
1309
|
+
overlap: str = "full",
|
1276
1310
|
apply_exclusions: bool = True,
|
1277
1311
|
regex: bool = False,
|
1278
1312
|
case: bool = True,
|
@@ -1284,7 +1318,7 @@ class Element(
|
|
1284
1318
|
selector: Optional[str] = None,
|
1285
1319
|
*,
|
1286
1320
|
text: Optional[str] = None,
|
1287
|
-
|
1321
|
+
overlap: str = "full",
|
1288
1322
|
apply_exclusions: bool = True,
|
1289
1323
|
regex: bool = False,
|
1290
1324
|
case: bool = True,
|
@@ -1299,9 +1333,9 @@ class Element(
|
|
1299
1333
|
Args:
|
1300
1334
|
selector: CSS-like selector string.
|
1301
1335
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
1302
|
-
|
1303
|
-
'
|
1304
|
-
(default: "
|
1336
|
+
overlap: How to determine if elements overlap with this element: 'full' (fully inside),
|
1337
|
+
'partial' (any overlap), or 'center' (center point inside).
|
1338
|
+
(default: "full")
|
1305
1339
|
apply_exclusions: Whether to apply exclusion regions (default: True).
|
1306
1340
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
1307
1341
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -1318,7 +1352,7 @@ class Element(
|
|
1318
1352
|
return temp_region.find(
|
1319
1353
|
selector=selector,
|
1320
1354
|
text=text,
|
1321
|
-
|
1355
|
+
overlap=overlap,
|
1322
1356
|
apply_exclusions=apply_exclusions,
|
1323
1357
|
regex=regex,
|
1324
1358
|
case=case,
|
@@ -1330,7 +1364,7 @@ class Element(
|
|
1330
1364
|
self,
|
1331
1365
|
*,
|
1332
1366
|
text: str,
|
1333
|
-
|
1367
|
+
overlap: str = "full",
|
1334
1368
|
apply_exclusions: bool = True,
|
1335
1369
|
regex: bool = False,
|
1336
1370
|
case: bool = True,
|
@@ -1342,7 +1376,7 @@ class Element(
|
|
1342
1376
|
self,
|
1343
1377
|
selector: str,
|
1344
1378
|
*,
|
1345
|
-
|
1379
|
+
overlap: str = "full",
|
1346
1380
|
apply_exclusions: bool = True,
|
1347
1381
|
regex: bool = False,
|
1348
1382
|
case: bool = True,
|
@@ -1354,7 +1388,7 @@ class Element(
|
|
1354
1388
|
selector: Optional[str] = None,
|
1355
1389
|
*,
|
1356
1390
|
text: Optional[str] = None,
|
1357
|
-
|
1391
|
+
overlap: str = "full",
|
1358
1392
|
apply_exclusions: bool = True,
|
1359
1393
|
regex: bool = False,
|
1360
1394
|
case: bool = True,
|
@@ -1369,9 +1403,9 @@ class Element(
|
|
1369
1403
|
Args:
|
1370
1404
|
selector: CSS-like selector string.
|
1371
1405
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
1372
|
-
|
1373
|
-
'
|
1374
|
-
(default: "
|
1406
|
+
overlap: How to determine if elements overlap with this element: 'full' (fully inside),
|
1407
|
+
'partial' (any overlap), or 'center' (center point inside).
|
1408
|
+
(default: "full")
|
1375
1409
|
apply_exclusions: Whether to apply exclusion regions (default: True).
|
1376
1410
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
1377
1411
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -1388,7 +1422,7 @@ class Element(
|
|
1388
1422
|
return temp_region.find_all(
|
1389
1423
|
selector=selector,
|
1390
1424
|
text=text,
|
1391
|
-
|
1425
|
+
overlap=overlap,
|
1392
1426
|
apply_exclusions=apply_exclusions,
|
1393
1427
|
regex=regex,
|
1394
1428
|
case=case,
|
@@ -1673,9 +1673,9 @@ class ElementCollection(
|
|
1673
1673
|
|
1674
1674
|
Args:
|
1675
1675
|
selector: CSS-like selector string
|
1676
|
-
|
1677
|
-
'
|
1678
|
-
(default: "
|
1676
|
+
overlap: How to determine if elements overlap: 'full' (fully inside),
|
1677
|
+
'partial' (any overlap), or 'center' (center point inside).
|
1678
|
+
(default: "full")
|
1679
1679
|
apply_exclusions: Whether to exclude elements in exclusion regions
|
1680
1680
|
"""
|
1681
1681
|
return self.apply(lambda element: element.find(selector, **kwargs))
|
@@ -1685,7 +1685,7 @@ class ElementCollection(
|
|
1685
1685
|
self,
|
1686
1686
|
*,
|
1687
1687
|
text: str,
|
1688
|
-
|
1688
|
+
overlap: str = "full",
|
1689
1689
|
apply_exclusions: bool = True,
|
1690
1690
|
regex: bool = False,
|
1691
1691
|
case: bool = True,
|
@@ -1697,7 +1697,7 @@ class ElementCollection(
|
|
1697
1697
|
self,
|
1698
1698
|
selector: str,
|
1699
1699
|
*,
|
1700
|
-
|
1700
|
+
overlap: str = "full",
|
1701
1701
|
apply_exclusions: bool = True,
|
1702
1702
|
regex: bool = False,
|
1703
1703
|
case: bool = True,
|
@@ -1709,7 +1709,7 @@ class ElementCollection(
|
|
1709
1709
|
selector: Optional[str] = None,
|
1710
1710
|
*,
|
1711
1711
|
text: Optional[str] = None,
|
1712
|
-
|
1712
|
+
overlap: str = "full",
|
1713
1713
|
apply_exclusions: bool = True,
|
1714
1714
|
regex: bool = False,
|
1715
1715
|
case: bool = True,
|
@@ -1724,9 +1724,9 @@ class ElementCollection(
|
|
1724
1724
|
Args:
|
1725
1725
|
selector: CSS-like selector string.
|
1726
1726
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
1727
|
-
|
1728
|
-
'
|
1729
|
-
(default: "
|
1727
|
+
overlap: How to determine if elements overlap: 'full' (fully inside),
|
1728
|
+
'partial' (any overlap), or 'center' (center point inside).
|
1729
|
+
(default: "full")
|
1730
1730
|
apply_exclusions: Whether to apply exclusion regions (default: True).
|
1731
1731
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
1732
1732
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -1748,7 +1748,7 @@ class ElementCollection(
|
|
1748
1748
|
found_in_element: "ElementCollection" = element.find_all(
|
1749
1749
|
selector=selector,
|
1750
1750
|
text=text,
|
1751
|
-
|
1751
|
+
overlap=overlap,
|
1752
1752
|
apply_exclusions=apply_exclusions,
|
1753
1753
|
regex=regex,
|
1754
1754
|
case=case,
|
natural_pdf/elements/region.py
CHANGED
@@ -1982,7 +1982,7 @@ class Region(
|
|
1982
1982
|
self,
|
1983
1983
|
*,
|
1984
1984
|
text: str,
|
1985
|
-
|
1985
|
+
overlap: str = "full",
|
1986
1986
|
apply_exclusions: bool = True,
|
1987
1987
|
regex: bool = False,
|
1988
1988
|
case: bool = True,
|
@@ -1994,7 +1994,7 @@ class Region(
|
|
1994
1994
|
self,
|
1995
1995
|
selector: str,
|
1996
1996
|
*,
|
1997
|
-
|
1997
|
+
overlap: str = "full",
|
1998
1998
|
apply_exclusions: bool = True,
|
1999
1999
|
regex: bool = False,
|
2000
2000
|
case: bool = True,
|
@@ -2006,7 +2006,7 @@ class Region(
|
|
2006
2006
|
selector: Optional[str] = None, # Now optional
|
2007
2007
|
*,
|
2008
2008
|
text: Optional[str] = None, # New text parameter
|
2009
|
-
|
2009
|
+
overlap: str = "full", # How elements overlap with the region
|
2010
2010
|
apply_exclusions: bool = True,
|
2011
2011
|
regex: bool = False,
|
2012
2012
|
case: bool = True,
|
@@ -2020,9 +2020,9 @@ class Region(
|
|
2020
2020
|
Args:
|
2021
2021
|
selector: CSS-like selector string.
|
2022
2022
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
2023
|
-
|
2024
|
-
'
|
2025
|
-
(default: "
|
2023
|
+
overlap: How to determine if elements overlap with the region: 'full' (fully inside),
|
2024
|
+
'partial' (any overlap), or 'center' (center point inside).
|
2025
|
+
(default: "full")
|
2026
2026
|
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
2027
2027
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
2028
2028
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -2035,7 +2035,7 @@ class Region(
|
|
2035
2035
|
elements = self.find_all(
|
2036
2036
|
selector=selector,
|
2037
2037
|
text=text,
|
2038
|
-
|
2038
|
+
overlap=overlap,
|
2039
2039
|
apply_exclusions=apply_exclusions,
|
2040
2040
|
regex=regex,
|
2041
2041
|
case=case,
|
@@ -2048,7 +2048,7 @@ class Region(
|
|
2048
2048
|
self,
|
2049
2049
|
*,
|
2050
2050
|
text: str,
|
2051
|
-
|
2051
|
+
overlap: str = "full",
|
2052
2052
|
apply_exclusions: bool = True,
|
2053
2053
|
regex: bool = False,
|
2054
2054
|
case: bool = True,
|
@@ -2060,7 +2060,7 @@ class Region(
|
|
2060
2060
|
self,
|
2061
2061
|
selector: str,
|
2062
2062
|
*,
|
2063
|
-
|
2063
|
+
overlap: str = "full",
|
2064
2064
|
apply_exclusions: bool = True,
|
2065
2065
|
regex: bool = False,
|
2066
2066
|
case: bool = True,
|
@@ -2072,7 +2072,7 @@ class Region(
|
|
2072
2072
|
selector: Optional[str] = None, # Now optional
|
2073
2073
|
*,
|
2074
2074
|
text: Optional[str] = None, # New text parameter
|
2075
|
-
|
2075
|
+
overlap: str = "full", # How elements overlap with the region
|
2076
2076
|
apply_exclusions: bool = True,
|
2077
2077
|
regex: bool = False,
|
2078
2078
|
case: bool = True,
|
@@ -2086,9 +2086,9 @@ class Region(
|
|
2086
2086
|
Args:
|
2087
2087
|
selector: CSS-like selector string.
|
2088
2088
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
2089
|
-
|
2090
|
-
'
|
2091
|
-
(default: "
|
2089
|
+
overlap: How to determine if elements overlap with the region: 'full' (fully inside),
|
2090
|
+
'partial' (any overlap), or 'center' (center point inside).
|
2091
|
+
(default: "full")
|
2092
2092
|
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
2093
2093
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
2094
2094
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -2104,10 +2104,10 @@ class Region(
|
|
2104
2104
|
if selector is None and text is None:
|
2105
2105
|
raise ValueError("Provide either 'selector' or 'text'.")
|
2106
2106
|
|
2107
|
-
# Validate
|
2108
|
-
if
|
2107
|
+
# Validate overlap parameter
|
2108
|
+
if overlap not in ["full", "partial", "center"]:
|
2109
2109
|
raise ValueError(
|
2110
|
-
f"Invalid
|
2110
|
+
f"Invalid overlap value: {overlap}. Must be 'full', 'partial', or 'center'"
|
2111
2111
|
)
|
2112
2112
|
|
2113
2113
|
# Construct selector if 'text' is provided
|
@@ -2142,7 +2142,7 @@ class Region(
|
|
2142
2142
|
region_bbox = self.bbox
|
2143
2143
|
matching_elements = []
|
2144
2144
|
|
2145
|
-
if
|
2145
|
+
if overlap == "full": # Fully inside (strict)
|
2146
2146
|
matching_elements = [
|
2147
2147
|
el
|
2148
2148
|
for el in potential_elements
|
@@ -2151,9 +2151,9 @@ class Region(
|
|
2151
2151
|
and el.x1 <= region_bbox[2]
|
2152
2152
|
and el.bottom <= region_bbox[3]
|
2153
2153
|
]
|
2154
|
-
elif
|
2154
|
+
elif overlap == "partial": # Any overlap
|
2155
2155
|
matching_elements = [el for el in potential_elements if self.intersects(el)]
|
2156
|
-
elif
|
2156
|
+
elif overlap == "center": # Center point inside
|
2157
2157
|
matching_elements = [
|
2158
2158
|
el for el in potential_elements if self.is_element_center_inside(el)
|
2159
2159
|
]
|
@@ -23,28 +23,28 @@ natural_pdf/analyzers/layout/yolo.py,sha256=2Iz2-WsMy--ftkZQ8j5PGqp_1fTD7Mskl2kN
|
|
23
23
|
natural_pdf/classification/manager.py,sha256=BaqBL9GeMvYgoJsiQeI2J8aUKQ5Qxu_ELRvmCWquld8,22172
|
24
24
|
natural_pdf/classification/mixin.py,sha256=CXygXXhe_qx1563SmIjiu4uSnZkxCkuRR4fGvLokS2w,9416
|
25
25
|
natural_pdf/classification/results.py,sha256=5ha77CxK0GYwkBMJbvUBZkBjsL5GpOveIZDK9nO4j8I,3239
|
26
|
-
natural_pdf/collections/mixins.py,sha256=
|
26
|
+
natural_pdf/collections/mixins.py,sha256=Se2C5AcpP9B5E0d0pIrey6-f_P32tAXTK4M7666MNj0,5688
|
27
27
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
28
28
|
natural_pdf/core/element_manager.py,sha256=DRZvntd99wjXy6KeDjCq5uRhjMftZop9QklOZqlUH8M,55349
|
29
|
-
natural_pdf/core/highlighting_service.py,sha256=
|
30
|
-
natural_pdf/core/page.py,sha256=
|
31
|
-
natural_pdf/core/page_collection.py,sha256=
|
29
|
+
natural_pdf/core/highlighting_service.py,sha256=7on8nErhi50CEH2L4XzGIZ6tIqZtMzmmFlp-2lmwnYE,68856
|
30
|
+
natural_pdf/core/page.py,sha256=4-il2WPMVX4hNSgQ5P6yLc1-3jXfi73WCrpF9912ct4,142472
|
31
|
+
natural_pdf/core/page_collection.py,sha256=hEeXs_fzB73XZ8ZkHz2kIuSgBYcVYydvGMMdGuB1rvw,52486
|
32
32
|
natural_pdf/core/page_groupby.py,sha256=550ME6kd-h-2u75oUIIIqTYsmh8VvdQO1nXXioL8J6A,7378
|
33
33
|
natural_pdf/core/pdf.py,sha256=q54DyhXwAS_zAmsBd3PsCezu1wyQOYmGmB3iKfP8gAM,101884
|
34
34
|
natural_pdf/core/pdf_collection.py,sha256=8tM0qVWS1L5Hwv5cXuZ2X8znAYOjKmlERX62bksDlJU,30144
|
35
|
-
natural_pdf/core/render_spec.py,sha256=
|
35
|
+
natural_pdf/core/render_spec.py,sha256=3GTfnlv8JKzePrruLq_dNr3HFeWMVcZT2fwWmJN44NI,14456
|
36
36
|
natural_pdf/describe/__init__.py,sha256=kIV7ORmWWB1SAur7nK2aAwR-wHqSedhKfUsaUl4hG0A,586
|
37
37
|
natural_pdf/describe/base.py,sha256=Of9WVo9XuShXoeyJr0RN2CpLhF_CeiOjazl-or53RKU,18173
|
38
38
|
natural_pdf/describe/elements.py,sha256=JicXC9SJmmasqxalpCXA47-kVwv-6JnR3Xiu778aNHM,12634
|
39
39
|
natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ,3109
|
40
40
|
natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
|
41
41
|
natural_pdf/elements/__init__.py,sha256=ICNikmLeIEuSYypz-KnkBn8xR1hR7rge4hsa1KLkyWY,42
|
42
|
-
natural_pdf/elements/base.py,sha256=
|
43
|
-
natural_pdf/elements/element_collection.py,sha256=
|
42
|
+
natural_pdf/elements/base.py,sha256=xXdNV1_gt4T_V_4m6qJDieWiysvJxUBhSEEAJzMOzqo,55094
|
43
|
+
natural_pdf/elements/element_collection.py,sha256=slCUnOT04sNOTjSGgmhjcCKKPVPtdDPwU7PX1ebzGMw,101342
|
44
44
|
natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
|
45
45
|
natural_pdf/elements/line.py,sha256=TFn7KXjPT_jUQyQyabU0F7XYU4dC-qadwodJMZF4DCU,3844
|
46
46
|
natural_pdf/elements/rect.py,sha256=0lNkVkPkvbRbrFED856RXoUcTcDkeeOIs5xldKGAQT8,3324
|
47
|
-
natural_pdf/elements/region.py,sha256=
|
47
|
+
natural_pdf/elements/region.py,sha256=Onok5VzmF1CvMCa3UGLUszCuhL-CCGk_IgtSUDva-Cc,155314
|
48
48
|
natural_pdf/elements/text.py,sha256=829uSJv9E-8cC6T6iR_Va7Xtv54pJoyRN78fq4NN1d4,20687
|
49
49
|
natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
|
50
50
|
natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
|
@@ -102,7 +102,7 @@ natural_pdf/utils/text_extraction.py,sha256=CCwPTmMoTgtQt2P00X_ADIf6ZGNfxvjCO9FO
|
|
102
102
|
natural_pdf/utils/visualization.py,sha256=zhZEHgYnZFuX7YxTHXF8Y3D97uHp2beTKMaC-JkCFwk,22364
|
103
103
|
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
104
104
|
natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
|
105
|
-
natural_pdf-0.2.
|
105
|
+
natural_pdf-0.2.3.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
106
106
|
optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
|
107
107
|
optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
|
108
108
|
optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
|
@@ -119,8 +119,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
|
|
119
119
|
tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
|
120
120
|
tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
|
121
121
|
tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
|
122
|
-
natural_pdf-0.2.
|
123
|
-
natural_pdf-0.2.
|
124
|
-
natural_pdf-0.2.
|
125
|
-
natural_pdf-0.2.
|
126
|
-
natural_pdf-0.2.
|
122
|
+
natural_pdf-0.2.3.dist-info/METADATA,sha256=lyx6Cx1xPGhy-p1m0wRfTvv4zSJ4ZJnNo7DeGQZ99yU,6959
|
123
|
+
natural_pdf-0.2.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
124
|
+
natural_pdf-0.2.3.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
125
|
+
natural_pdf-0.2.3.dist-info/top_level.txt,sha256=80t0F2ZeX4vN4Ke5iTflcOk_PN_0USn33ha3X6X86Ik,36
|
126
|
+
natural_pdf-0.2.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|