natural-pdf 0.2.18__py3-none-any.whl → 0.2.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +8 -0
- natural_pdf/analyzers/checkbox/__init__.py +6 -0
- natural_pdf/analyzers/checkbox/base.py +265 -0
- natural_pdf/analyzers/checkbox/checkbox_analyzer.py +329 -0
- natural_pdf/analyzers/checkbox/checkbox_manager.py +166 -0
- natural_pdf/analyzers/checkbox/checkbox_options.py +60 -0
- natural_pdf/analyzers/checkbox/mixin.py +95 -0
- natural_pdf/analyzers/checkbox/rtdetr.py +201 -0
- natural_pdf/collections/mixins.py +14 -5
- natural_pdf/core/element_manager.py +5 -1
- natural_pdf/core/page.py +61 -0
- natural_pdf/core/page_collection.py +41 -1
- natural_pdf/core/pdf.py +24 -1
- natural_pdf/describe/base.py +20 -0
- natural_pdf/elements/base.py +152 -10
- natural_pdf/elements/element_collection.py +41 -2
- natural_pdf/elements/region.py +115 -2
- natural_pdf/judge.py +1509 -0
- natural_pdf/selectors/parser.py +42 -1
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/RECORD +41 -17
- temp/check_model.py +49 -0
- temp/check_pdf_content.py +9 -0
- temp/checkbox_checks.py +590 -0
- temp/checkbox_simple.py +117 -0
- temp/checkbox_ux_ideas.py +400 -0
- temp/context_manager_prototype.py +177 -0
- temp/convert_to_hf.py +60 -0
- temp/demo_text_closest.py +66 -0
- temp/inspect_model.py +43 -0
- temp/rtdetr_dinov2_test.py +49 -0
- temp/test_closest_debug.py +26 -0
- temp/test_closest_debug2.py +22 -0
- temp/test_context_exploration.py +85 -0
- temp/test_durham.py +30 -0
- temp/test_empty_string.py +16 -0
- temp/test_similarity.py +15 -0
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/top_level.txt +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -19,6 +19,7 @@ from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to
|
|
19
19
|
from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
|
20
20
|
from tqdm.auto import tqdm
|
21
21
|
|
22
|
+
from natural_pdf.analyzers.checkbox.mixin import CheckboxDetectionMixin
|
22
23
|
from natural_pdf.analyzers.layout.pdfplumber_table_finder import find_text_based_tables
|
23
24
|
|
24
25
|
# --- Shape Detection Mixin --- #
|
@@ -75,12 +76,41 @@ except ImportError:
|
|
75
76
|
logger = logging.getLogger(__name__)
|
76
77
|
|
77
78
|
|
79
|
+
class RegionContext:
    """Context manager that scopes directional operations to a region.

    On entry the global ``natural_pdf.options.layout.directional_within``
    option is pointed at the wrapped region; on exit the value that was in
    effect at the matching entry is put back. The same instance may be
    entered more than once (nested ``with`` blocks): every entry saves its
    own "previous" value, so each exit restores the correct one.
    """

    def __init__(self, region: "Region"):
        """Store the region that will act as the constraint.

        Args:
            region: The Region to install as ``directional_within`` while
                the context is active.
        """
        self.region = region
        # Most recently saved option value; kept as a public attribute for
        # backward compatibility with code that inspects it.
        self.previous_within = None
        # One saved value per active entry. A single slot would be
        # overwritten by a nested entry, and the outermost exit would then
        # leave the global option pointing at this region.
        self._saved_within = []

    def __enter__(self):
        """Install the region as the global ``directional_within`` option.

        Returns:
            The wrapped region, so it can be bound with ``as``.
        """
        # Imported inside the method, as in the original code
        # (presumably to avoid a circular import -- TODO confirm).
        import natural_pdf

        layout_options = natural_pdf.options.layout
        self.previous_within = layout_options.directional_within
        self._saved_within.append(self.previous_within)
        layout_options.directional_within = self.region
        return self.region

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Restore the option value saved by the matching ``__enter__``.

        Returns:
            False, so an exception raised inside the block propagates.
        """
        import natural_pdf

        # If __exit__ is called without a prior __enter__ the stack is
        # empty; fall back to ``previous_within`` exactly as before.
        if self._saved_within:
            self.previous_within = self._saved_within.pop()
        natural_pdf.options.layout.directional_within = self.previous_within
        return False  # Don't suppress exceptions
|
105
|
+
|
106
|
+
|
78
107
|
class Region(
|
79
108
|
TextMixin,
|
80
109
|
DirectionalMixin,
|
81
110
|
ClassificationMixin,
|
82
111
|
ExtractionMixin,
|
83
112
|
ShapeDetectionMixin,
|
113
|
+
CheckboxDetectionMixin,
|
84
114
|
DescribeMixin,
|
85
115
|
VisualSearchMixin,
|
86
116
|
Visualizable,
|
@@ -574,6 +604,16 @@ class Region(
|
|
574
604
|
(self.x0, self.bottom), # bottom-left
|
575
605
|
]
|
576
606
|
|
607
|
+
@property
def origin(self) -> Optional[Union["Element", "Region"]]:
    """Return the element/region this region was created from, if recorded.

    Reads the ``source_element`` attribute (set when the region was created
    via a directional method) and yields ``None`` when it is absent.
    """
    try:
        return self.source_element
    except AttributeError:
        return None
|
611
|
+
|
612
|
+
@property
def endpoint(self) -> Optional["Element"]:
    """Return the element at which this region stopped, if recorded.

    Reads the ``boundary_element`` attribute (set when the region was
    created with an ``until`` parameter) and yields ``None`` when it is
    absent.
    """
    try:
        return self.boundary_element
    except AttributeError:
        return None
|
616
|
+
|
577
617
|
def _is_point_in_polygon(self, x: float, y: float) -> bool:
|
578
618
|
"""
|
579
619
|
Check if a point is inside the polygon using ray casting algorithm.
|
@@ -1297,9 +1337,11 @@ class Region(
|
|
1297
1337
|
|
1298
1338
|
def extract_text(
|
1299
1339
|
self,
|
1340
|
+
granularity: str = "chars",
|
1300
1341
|
apply_exclusions: bool = True,
|
1301
1342
|
debug: bool = False,
|
1302
1343
|
*,
|
1344
|
+
overlap: str = "center",
|
1303
1345
|
newlines: Union[bool, str] = True,
|
1304
1346
|
content_filter=None,
|
1305
1347
|
**kwargs,
|
@@ -1309,8 +1351,15 @@ class Region(
|
|
1309
1351
|
layout engine (chars_to_textmap).
|
1310
1352
|
|
1311
1353
|
Args:
|
1354
|
+
granularity: Level of text extraction - 'chars' (default) or 'words'.
|
1355
|
+
- 'chars': Character-by-character extraction (current behavior)
|
1356
|
+
- 'words': Word-level extraction with configurable overlap
|
1312
1357
|
apply_exclusions: Whether to apply exclusion regions defined on the parent page.
|
1313
1358
|
debug: Enable verbose debugging output for filtering steps.
|
1359
|
+
overlap: How to determine if words overlap with the region (only used when granularity='words'):
|
1360
|
+
- 'center': Word center point must be inside (default)
|
1361
|
+
- 'full': Word must be fully inside the region
|
1362
|
+
- 'partial': Any overlap includes the word
|
1314
1363
|
newlines: Whether to strip newline characters from the extracted text.
|
1315
1364
|
content_filter: Optional content filter to exclude specific text patterns. Can be:
|
1316
1365
|
- A regex pattern string (characters matching the pattern are EXCLUDED)
|
@@ -1323,10 +1372,41 @@ class Region(
|
|
1323
1372
|
Returns:
|
1324
1373
|
Extracted text as string, potentially with layout-based spacing.
|
1325
1374
|
"""
|
1375
|
+
# Validate granularity parameter
|
1376
|
+
if granularity not in ("chars", "words"):
|
1377
|
+
raise ValueError(f"granularity must be 'chars' or 'words', got '{granularity}'")
|
1378
|
+
|
1326
1379
|
# Allow 'debug_exclusions' for backward compatibility
|
1327
1380
|
debug = kwargs.get("debug", debug or kwargs.get("debug_exclusions", False))
|
1328
|
-
logger.debug(
|
1381
|
+
logger.debug(
|
1382
|
+
f"Region {self.bbox}: extract_text called with granularity='{granularity}', overlap='{overlap}', kwargs: {kwargs}"
|
1383
|
+
)
|
1329
1384
|
|
1385
|
+
# Handle word-level extraction
|
1386
|
+
if granularity == "words":
|
1387
|
+
# Use find_all to get words with proper overlap and exclusion handling
|
1388
|
+
word_elements = self.find_all(
|
1389
|
+
"text", overlap=overlap, apply_exclusions=apply_exclusions
|
1390
|
+
)
|
1391
|
+
|
1392
|
+
# Join the text from all matching words
|
1393
|
+
text_parts = []
|
1394
|
+
for word in word_elements:
|
1395
|
+
word_text = word.extract_text()
|
1396
|
+
if word_text: # Skip empty strings
|
1397
|
+
text_parts.append(word_text)
|
1398
|
+
|
1399
|
+
result = " ".join(text_parts)
|
1400
|
+
|
1401
|
+
# Apply newlines processing if requested
|
1402
|
+
if newlines is False:
|
1403
|
+
result = result.replace("\n", " ").replace("\r", " ")
|
1404
|
+
elif isinstance(newlines, str):
|
1405
|
+
result = result.replace("\n", newlines).replace("\r", newlines)
|
1406
|
+
|
1407
|
+
return result
|
1408
|
+
|
1409
|
+
# Original character-level extraction logic follows...
|
1330
1410
|
# 1. Get Word Elements potentially within this region (initial broad phase)
|
1331
1411
|
# Optimization: Could use spatial query if page elements were indexed
|
1332
1412
|
page_words = self.page.words # Get all words from the page
|
@@ -3309,7 +3389,14 @@ class Region(
|
|
3309
3389
|
name_info = f" name='{self.name}'" if self.name else ""
|
3310
3390
|
type_info = f" type='{self.region_type}'" if self.region_type else ""
|
3311
3391
|
source_info = f" source='{self.source}'" if self.source else ""
|
3312
|
-
|
3392
|
+
|
3393
|
+
# Add checkbox state if this is a checkbox
|
3394
|
+
checkbox_info = ""
|
3395
|
+
if self.region_type == "checkbox" and hasattr(self, "is_checked"):
|
3396
|
+
state = "checked" if self.is_checked else "unchecked"
|
3397
|
+
checkbox_info = f" [{state}]"
|
3398
|
+
|
3399
|
+
return f"<Region{name_info}{type_info}{source_info}{checkbox_info} bbox={self.bbox}{poly_info}>"
|
3313
3400
|
|
3314
3401
|
def update_text(
|
3315
3402
|
self,
|
@@ -4038,3 +4125,29 @@ class Region(
|
|
4038
4125
|
except Exception as e:
|
4039
4126
|
logger.error(f"Error creating viewer for region {self.bbox}: {e}", exc_info=True)
|
4040
4127
|
return None
|
4128
|
+
|
4129
|
+
def within(self):
    """Build a context manager that limits directional operations to this region.

    The returned ``RegionContext`` sets the global ``directional_within``
    layout option to this region while the ``with`` block is active, so
    directional navigation (above, below, left, right) is bounded by it.
    The previous option value is restored when the block ends.

    Returns:
        RegionContext: A context manager that yields this region.

    Examples:
        ```python
        # Create a column region
        left_col = page.region(right=page.width/2)

        # Directional operations inside the block are limited to left_col
        with left_col.within() as col:
            header = col.find("text[size>14]")
            content = header.below(until="text[size>14]")
            # content will only include elements within left_col

        # Outside the block the constraint no longer applies
        full_page_below = header.below()  # Searches full page
        ```
    """
    constraint = RegionContext(self)
    return constraint
|