natural-pdf 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +29 -40
- natural_pdf/analyzers/text_options.py +9 -1
- natural_pdf/analyzers/text_structure.py +371 -58
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/core/element_manager.py +11 -1
- natural_pdf/core/highlighting_service.py +120 -40
- natural_pdf/core/page.py +20 -18
- natural_pdf/core/pdf.py +146 -13
- natural_pdf/elements/base.py +17 -0
- natural_pdf/elements/collections.py +374 -30
- natural_pdf/elements/region.py +45 -14
- natural_pdf/exporters/data/__init__.py +0 -0
- natural_pdf/exporters/data/pdf.ttf +0 -0
- natural_pdf/exporters/data/sRGB.icc +0 -0
- natural_pdf/exporters/hocr.py +519 -0
- natural_pdf/exporters/hocr_font.py +136 -0
- natural_pdf/exporters/original_pdf.py +127 -0
- natural_pdf/exporters/searchable_pdf.py +2 -12
- natural_pdf/ocr/engine_surya.py +1 -1
- natural_pdf/search/__init__.py +65 -52
- natural_pdf/search/lancedb_search_service.py +325 -0
- natural_pdf/search/numpy_search_service.py +255 -0
- natural_pdf/search/searchable_mixin.py +25 -71
- natural_pdf/widgets/viewer.py +22 -31
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/METADATA +54 -50
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/RECORD +29 -23
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/WHEEL +1 -1
- natural_pdf/search/haystack_search_service.py +0 -687
- natural_pdf/search/haystack_utils.py +0 -474
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/top_level.txt +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -55,6 +55,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
55
55
|
bbox: Tuple[float, float, float, float],
|
56
56
|
polygon: List[Tuple[float, float]] = None,
|
57
57
|
parent=None,
|
58
|
+
label: Optional[str] = None,
|
58
59
|
):
|
59
60
|
"""
|
60
61
|
Initialize a region.
|
@@ -74,11 +75,8 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
74
75
|
self.start_element = None
|
75
76
|
self.end_element = None
|
76
77
|
|
77
|
-
# --- ADDED --- Metadata store for mixins
|
78
78
|
self.metadata: Dict[str, Any] = {}
|
79
|
-
# --- NEW --- Central registry for analysis results
|
80
79
|
self.analyses: Dict[str, Any] = {}
|
81
|
-
# --- END ADDED ---
|
82
80
|
|
83
81
|
# Standard attributes for all elements
|
84
82
|
self.object_type = "region" # For selector compatibility
|
@@ -91,6 +89,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
91
89
|
|
92
90
|
# Region management attributes
|
93
91
|
self.name = None
|
92
|
+
self.label = label
|
94
93
|
self.source = None # Will be set by creation methods
|
95
94
|
|
96
95
|
# Hierarchy support for nested document structure
|
@@ -773,6 +772,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
773
772
|
# Add a default color for standalone show
|
774
773
|
color: Optional[Union[Tuple, str]] = "blue",
|
775
774
|
label: Optional[str] = None,
|
775
|
+
width: Optional[int] = None, # Add width parameter
|
776
776
|
) -> "Image.Image":
|
777
777
|
"""
|
778
778
|
Show the page with just this region highlighted temporarily.
|
@@ -783,6 +783,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
783
783
|
legend_position: Position of the legend
|
784
784
|
color: Color to highlight this region (default: blue)
|
785
785
|
label: Optional label for this region in the legend
|
786
|
+
width: Optional width for the output image in pixels
|
786
787
|
|
787
788
|
Returns:
|
788
789
|
PIL Image of the page with only this region highlighted
|
@@ -813,6 +814,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
813
814
|
page_index=self._page.index,
|
814
815
|
temporary_highlights=[temp_highlight_data],
|
815
816
|
scale=scale,
|
817
|
+
width=width, # Pass the width parameter
|
816
818
|
labels=labels,
|
817
819
|
legend_position=legend_position,
|
818
820
|
)
|
@@ -1334,6 +1336,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1334
1336
|
self,
|
1335
1337
|
*,
|
1336
1338
|
text: str,
|
1339
|
+
contains: str = "all",
|
1337
1340
|
apply_exclusions: bool = True,
|
1338
1341
|
regex: bool = False,
|
1339
1342
|
case: bool = True,
|
@@ -1345,6 +1348,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1345
1348
|
self,
|
1346
1349
|
selector: str,
|
1347
1350
|
*,
|
1351
|
+
contains: str = "all",
|
1348
1352
|
apply_exclusions: bool = True,
|
1349
1353
|
regex: bool = False,
|
1350
1354
|
case: bool = True,
|
@@ -1356,6 +1360,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1356
1360
|
selector: Optional[str] = None, # Now optional
|
1357
1361
|
*,
|
1358
1362
|
text: Optional[str] = None, # New text parameter
|
1363
|
+
contains: str = "all", # New parameter for containment behavior
|
1359
1364
|
apply_exclusions: bool = True,
|
1360
1365
|
regex: bool = False,
|
1361
1366
|
case: bool = True,
|
@@ -1369,6 +1374,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1369
1374
|
Args:
|
1370
1375
|
selector: CSS-like selector string.
|
1371
1376
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
1377
|
+
contains: How to determine if elements are inside: 'all' (fully inside),
|
1378
|
+
'any' (any overlap), or 'center' (center point inside).
|
1379
|
+
(default: "all")
|
1372
1380
|
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
1373
1381
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
1374
1382
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -1381,6 +1389,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1381
1389
|
elements = self.find_all(
|
1382
1390
|
selector=selector,
|
1383
1391
|
text=text,
|
1392
|
+
contains=contains,
|
1384
1393
|
apply_exclusions=apply_exclusions,
|
1385
1394
|
regex=regex,
|
1386
1395
|
case=case,
|
@@ -1393,6 +1402,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1393
1402
|
self,
|
1394
1403
|
*,
|
1395
1404
|
text: str,
|
1405
|
+
contains: str = "all",
|
1396
1406
|
apply_exclusions: bool = True,
|
1397
1407
|
regex: bool = False,
|
1398
1408
|
case: bool = True,
|
@@ -1404,6 +1414,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1404
1414
|
self,
|
1405
1415
|
selector: str,
|
1406
1416
|
*,
|
1417
|
+
contains: str = "all",
|
1407
1418
|
apply_exclusions: bool = True,
|
1408
1419
|
regex: bool = False,
|
1409
1420
|
case: bool = True,
|
@@ -1415,6 +1426,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1415
1426
|
selector: Optional[str] = None, # Now optional
|
1416
1427
|
*,
|
1417
1428
|
text: Optional[str] = None, # New text parameter
|
1429
|
+
contains: str = "all", # New parameter to control inside/overlap behavior
|
1418
1430
|
apply_exclusions: bool = True,
|
1419
1431
|
regex: bool = False,
|
1420
1432
|
case: bool = True,
|
@@ -1428,6 +1440,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1428
1440
|
Args:
|
1429
1441
|
selector: CSS-like selector string.
|
1430
1442
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
1443
|
+
contains: How to determine if elements are inside: 'all' (fully inside),
|
1444
|
+
'any' (any overlap), or 'center' (center point inside).
|
1445
|
+
(default: "all")
|
1431
1446
|
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
1432
1447
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
1433
1448
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -1443,6 +1458,10 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1443
1458
|
if selector is None and text is None:
|
1444
1459
|
raise ValueError("Provide either 'selector' or 'text'.")
|
1445
1460
|
|
1461
|
+
# Validate contains parameter
|
1462
|
+
if contains not in ["all", "any", "center"]:
|
1463
|
+
raise ValueError(f"Invalid contains value: {contains}. Must be 'all', 'any', or 'center'")
|
1464
|
+
|
1446
1465
|
# Construct selector if 'text' is provided
|
1447
1466
|
effective_selector = ""
|
1448
1467
|
if text is not None:
|
@@ -1482,22 +1501,34 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1482
1501
|
# Let the page handle its exclusion logic if needed
|
1483
1502
|
potential_elements = self.page.find_all(
|
1484
1503
|
selector=effective_selector,
|
1485
|
-
apply_exclusions=
|
1504
|
+
apply_exclusions=apply_exclusions,
|
1486
1505
|
regex=regex,
|
1487
1506
|
case=case,
|
1488
1507
|
**kwargs,
|
1489
1508
|
)
|
1490
1509
|
|
1491
|
-
# Filter these elements
|
1510
|
+
# Filter these elements based on the specified containment method
|
1492
1511
|
region_bbox = self.bbox
|
1493
|
-
matching_elements = [
|
1494
|
-
|
1495
|
-
|
1496
|
-
|
1497
|
-
|
1498
|
-
|
1499
|
-
|
1500
|
-
|
1512
|
+
matching_elements = []
|
1513
|
+
|
1514
|
+
if contains == "all": # Fully inside (strict)
|
1515
|
+
matching_elements = [
|
1516
|
+
el for el in potential_elements
|
1517
|
+
if el.x0 >= region_bbox[0]
|
1518
|
+
and el.top >= region_bbox[1]
|
1519
|
+
and el.x1 <= region_bbox[2]
|
1520
|
+
and el.bottom <= region_bbox[3]
|
1521
|
+
]
|
1522
|
+
elif contains == "any": # Any overlap
|
1523
|
+
matching_elements = [
|
1524
|
+
el for el in potential_elements
|
1525
|
+
if self.intersects(el)
|
1526
|
+
]
|
1527
|
+
elif contains == "center": # Center point inside
|
1528
|
+
matching_elements = [
|
1529
|
+
el for el in potential_elements
|
1530
|
+
if self.is_element_center_inside(el)
|
1531
|
+
]
|
1501
1532
|
|
1502
1533
|
return ElementCollection(matching_elements)
|
1503
1534
|
|
@@ -1989,7 +2020,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1989
2020
|
from natural_pdf.qa.document_qa import get_qa_engine
|
1990
2021
|
except ImportError:
|
1991
2022
|
logger.error(
|
1992
|
-
"Question answering requires optional dependencies. Install with `pip install natural-pdf[
|
2023
|
+
"Question answering requires optional dependencies. Install with `pip install natural-pdf[core-ml]`"
|
1993
2024
|
)
|
1994
2025
|
return {
|
1995
2026
|
"answer": None,
|
File without changes
|
Binary file
|
Binary file
|