natural-pdf 0.2.18__py3-none-any.whl → 0.2.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +8 -0
- natural_pdf/analyzers/checkbox/__init__.py +6 -0
- natural_pdf/analyzers/checkbox/base.py +265 -0
- natural_pdf/analyzers/checkbox/checkbox_analyzer.py +329 -0
- natural_pdf/analyzers/checkbox/checkbox_manager.py +166 -0
- natural_pdf/analyzers/checkbox/checkbox_options.py +60 -0
- natural_pdf/analyzers/checkbox/mixin.py +95 -0
- natural_pdf/analyzers/checkbox/rtdetr.py +201 -0
- natural_pdf/analyzers/guides.py +26 -2
- natural_pdf/collections/mixins.py +14 -5
- natural_pdf/core/element_manager.py +5 -1
- natural_pdf/core/page.py +61 -0
- natural_pdf/core/page_collection.py +41 -1
- natural_pdf/core/pdf.py +24 -1
- natural_pdf/describe/base.py +20 -0
- natural_pdf/elements/base.py +152 -10
- natural_pdf/elements/element_collection.py +41 -2
- natural_pdf/elements/region.py +115 -2
- natural_pdf/judge.py +1509 -0
- natural_pdf/selectors/parser.py +42 -1
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/RECORD +42 -18
- temp/check_model.py +49 -0
- temp/check_pdf_content.py +9 -0
- temp/checkbox_checks.py +590 -0
- temp/checkbox_simple.py +117 -0
- temp/checkbox_ux_ideas.py +400 -0
- temp/context_manager_prototype.py +177 -0
- temp/convert_to_hf.py +60 -0
- temp/demo_text_closest.py +66 -0
- temp/inspect_model.py +43 -0
- temp/rtdetr_dinov2_test.py +49 -0
- temp/test_closest_debug.py +26 -0
- temp/test_closest_debug2.py +22 -0
- temp/test_context_exploration.py +85 -0
- temp/test_durham.py +30 -0
- temp/test_empty_string.py +16 -0
- temp/test_similarity.py +15 -0
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py
CHANGED
@@ -50,6 +50,7 @@ import numpy as np
|
|
50
50
|
from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
|
51
51
|
from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
|
52
52
|
|
53
|
+
from natural_pdf.analyzers.checkbox.mixin import CheckboxDetectionMixin
|
53
54
|
from natural_pdf.analyzers.layout.layout_analyzer import LayoutAnalyzer
|
54
55
|
from natural_pdf.analyzers.layout.layout_manager import LayoutManager
|
55
56
|
from natural_pdf.analyzers.layout.layout_options import LayoutOptions
|
@@ -103,6 +104,7 @@ class Page(
|
|
103
104
|
ClassificationMixin,
|
104
105
|
ExtractionMixin,
|
105
106
|
ShapeDetectionMixin,
|
107
|
+
CheckboxDetectionMixin,
|
106
108
|
DescribeMixin,
|
107
109
|
VisualSearchMixin,
|
108
110
|
Visualizable,
|
@@ -1491,6 +1493,65 @@ class Page(
|
|
1491
1493
|
"Cannot sort elements in reading order: Missing required attributes (top, x0)."
|
1492
1494
|
)
|
1493
1495
|
|
1496
|
+
# Handle :closest pseudo-class for fuzzy text matching
|
1497
|
+
for pseudo in selector_obj.get("pseudo_classes", []):
|
1498
|
+
name = pseudo.get("name")
|
1499
|
+
if name == "closest" and pseudo.get("args") is not None:
|
1500
|
+
import difflib
|
1501
|
+
|
1502
|
+
# Parse search text and threshold
|
1503
|
+
search_text = str(pseudo["args"]).strip()
|
1504
|
+
threshold = 0.0 # Default threshold
|
1505
|
+
|
1506
|
+
# Handle empty search text
|
1507
|
+
if not search_text:
|
1508
|
+
matching_elements = []
|
1509
|
+
break
|
1510
|
+
|
1511
|
+
# Check if threshold is specified with @ separator
|
1512
|
+
if "@" in search_text and search_text.count("@") == 1:
|
1513
|
+
text_part, threshold_part = search_text.rsplit("@", 1)
|
1514
|
+
try:
|
1515
|
+
threshold = float(threshold_part)
|
1516
|
+
search_text = text_part.strip()
|
1517
|
+
except (ValueError, TypeError):
|
1518
|
+
pass # Keep original search_text and default threshold
|
1519
|
+
|
1520
|
+
# Determine case sensitivity
|
1521
|
+
ignore_case = not kwargs.get("case", False)
|
1522
|
+
|
1523
|
+
# Calculate similarity scores for all elements
|
1524
|
+
scored_elements = []
|
1525
|
+
|
1526
|
+
for el in matching_elements:
|
1527
|
+
if hasattr(el, "text") and el.text:
|
1528
|
+
el_text = el.text.strip()
|
1529
|
+
search_term = search_text
|
1530
|
+
|
1531
|
+
if ignore_case:
|
1532
|
+
el_text = el_text.lower()
|
1533
|
+
search_term = search_term.lower()
|
1534
|
+
|
1535
|
+
# Calculate similarity ratio
|
1536
|
+
ratio = difflib.SequenceMatcher(None, search_term, el_text).ratio()
|
1537
|
+
|
1538
|
+
# Check if element contains the search term as substring
|
1539
|
+
contains_match = search_term in el_text
|
1540
|
+
|
1541
|
+
# Store element with its similarity score and contains flag
|
1542
|
+
if ratio >= threshold:
|
1543
|
+
scored_elements.append((ratio, contains_match, el))
|
1544
|
+
|
1545
|
+
# Sort by:
|
1546
|
+
# 1. Contains match (True before False)
|
1547
|
+
# 2. Similarity score (highest first)
|
1548
|
+
# This ensures substring matches come first but are sorted by similarity
|
1549
|
+
scored_elements.sort(key=lambda x: (x[1], x[0]), reverse=True)
|
1550
|
+
|
1551
|
+
# Extract just the elements
|
1552
|
+
matching_elements = [el for _, _, el in scored_elements]
|
1553
|
+
break # Only process the first :closest pseudo-class
|
1554
|
+
|
1494
1555
|
# Handle collection-level pseudo-classes (:first, :last)
|
1495
1556
|
for pseudo in selector_obj.get("pseudo_classes", []):
|
1496
1557
|
name = pseudo.get("name")
|
@@ -28,6 +28,7 @@ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_t
|
|
28
28
|
from PIL import Image, ImageDraw, ImageFont
|
29
29
|
from tqdm.auto import tqdm
|
30
30
|
|
31
|
+
from natural_pdf.analyzers.checkbox.mixin import CheckboxDetectionMixin
|
31
32
|
from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
|
32
33
|
from natural_pdf.classification.manager import ClassificationManager
|
33
34
|
from natural_pdf.classification.mixin import ClassificationMixin
|
@@ -76,7 +77,9 @@ T = TypeVar("T")
|
|
76
77
|
P = TypeVar("P", bound="Page")
|
77
78
|
|
78
79
|
|
79
|
-
class PageCollection(
|
80
|
+
class PageCollection(
|
81
|
+
TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, CheckboxDetectionMixin, Visualizable
|
82
|
+
):
|
80
83
|
"""
|
81
84
|
Represents a collection of Page objects, often from a single PDF document.
|
82
85
|
Provides methods for batch operations on these pages.
|
@@ -1506,6 +1509,43 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
1506
1509
|
|
1507
1510
|
return ElementCollection(all_regions)
|
1508
1511
|
|
1512
|
+
def detect_checkboxes(self, *args, **kwargs) -> "ElementCollection[Region]":
    """
    Run checkbox detection on every page and merge the results.

    Each page's own detect_checkboxes method is called in turn and the
    checkbox regions it returns are gathered into one ElementCollection.

    Args:
        *args: Forwarded positionally to each page's detect_checkboxes method.
        **kwargs: Forwarded as keywords to each page's detect_checkboxes method.
            The 'show_progress' key (default True) is consumed here and is
            not forwarded; it toggles a progress bar over the pages.

    Returns:
        An ElementCollection of the checkbox Region objects found on all pages.
    """
    # Consume the progress flag so it never reaches the per-page call.
    with_progress = kwargs.pop("show_progress", True)

    pages = self.pages
    if with_progress:
        try:
            from tqdm.auto import tqdm

            pages = tqdm(self.pages, desc="Detecting checkboxes")
        except ImportError:
            pass  # tqdm not installed

    found = []
    for page in pages:
        # Each page hands back an ElementCollection; falsy results add nothing.
        page_result = page.detect_checkboxes(*args, **kwargs)
        if not page_result:
            continue
        found.extend(page_result.elements)

    return ElementCollection(found)
|
1548
|
+
|
1509
1549
|
def highlights(self, show: bool = False) -> "HighlightContext":
|
1510
1550
|
"""
|
1511
1551
|
Create a highlight context for accumulating highlights.
|
natural_pdf/core/pdf.py
CHANGED
@@ -27,6 +27,7 @@ from typing import (
|
|
27
27
|
import pdfplumber
|
28
28
|
from tqdm.auto import tqdm
|
29
29
|
|
30
|
+
from natural_pdf.analyzers.checkbox.mixin import CheckboxDetectionMixin
|
30
31
|
from natural_pdf.analyzers.layout.layout_manager import LayoutManager
|
31
32
|
from natural_pdf.classification.manager import ClassificationError
|
32
33
|
from natural_pdf.classification.mixin import ClassificationMixin
|
@@ -303,7 +304,13 @@ class _LazyPageList(Sequence):
|
|
303
304
|
|
304
305
|
|
305
306
|
class PDF(
|
306
|
-
TextMixin,
|
307
|
+
TextMixin,
|
308
|
+
ExtractionMixin,
|
309
|
+
ExportMixin,
|
310
|
+
ClassificationMixin,
|
311
|
+
CheckboxDetectionMixin,
|
312
|
+
VisualSearchMixin,
|
313
|
+
Visualizable,
|
307
314
|
):
|
308
315
|
"""Enhanced PDF wrapper built on top of pdfplumber.
|
309
316
|
|
@@ -2552,6 +2559,22 @@ class PDF(
|
|
2552
2559
|
"""
|
2553
2560
|
return self.pages.analyze_layout(*args, **kwargs)
|
2554
2561
|
|
2562
|
+
def detect_checkboxes(self, *args, **kwargs) -> "ElementCollection[Region]":
    """
    Detect checkboxes across the whole PDF.

    Convenience wrapper: the call is delegated unchanged to the
    detect_checkboxes method of this PDF's page collection.

    Args:
        *args: Positional arguments handed to pages.detect_checkboxes().
        **kwargs: Keyword arguments handed to pages.detect_checkboxes().

    Returns:
        Whatever pages.detect_checkboxes() returns - an ElementCollection
        of the detected checkbox Region objects.
    """
    page_collection = self.pages
    return page_collection.detect_checkboxes(*args, **kwargs)
|
2577
|
+
|
2555
2578
|
def highlights(self, show: bool = False) -> "HighlightContext":
|
2556
2579
|
"""
|
2557
2580
|
Create a highlight context for accumulating highlights.
|
natural_pdf/describe/base.py
CHANGED
@@ -233,6 +233,17 @@ def inspect_collection(collection: "ElementCollection", limit: int = 30) -> Insp
|
|
233
233
|
# Get appropriate columns for this type
|
234
234
|
columns = _get_columns_for_type(element_type, show_page_column)
|
235
235
|
|
236
|
+
# Add checkbox state column if we have checkbox regions
|
237
|
+
if element_type == "region" and any(
|
238
|
+
getattr(e, "region_type", "") == "checkbox" for e in display_elements
|
239
|
+
):
|
240
|
+
# Insert state column after type column
|
241
|
+
if "type" in columns:
|
242
|
+
type_idx = columns.index("type")
|
243
|
+
columns.insert(type_idx + 1, "state")
|
244
|
+
else:
|
245
|
+
columns.append("state")
|
246
|
+
|
236
247
|
# Extract data for each element
|
237
248
|
element_data = []
|
238
249
|
for element in display_elements:
|
@@ -423,6 +434,15 @@ def _extract_element_value(element: "Element", column: str) -> Any:
|
|
423
434
|
value = getattr(element, column, False)
|
424
435
|
return value if isinstance(value, bool) else False
|
425
436
|
|
437
|
+
elif column == "state":
|
438
|
+
# For checkbox regions, show checked/unchecked state
|
439
|
+
if getattr(element, "region_type", "") == "checkbox":
|
440
|
+
if hasattr(element, "is_checked"):
|
441
|
+
return "checked" if element.is_checked else "unchecked"
|
442
|
+
elif hasattr(element, "checkbox_state"):
|
443
|
+
return element.checkbox_state
|
444
|
+
return ""
|
445
|
+
|
426
446
|
else:
|
427
447
|
# Generic attribute access
|
428
448
|
value = getattr(element, column, "")
|
natural_pdf/elements/base.py
CHANGED
@@ -122,6 +122,8 @@ class DirectionalMixin:
|
|
122
122
|
offset: float = 0.0,
|
123
123
|
apply_exclusions: bool = True,
|
124
124
|
multipage: bool = False,
|
125
|
+
within: Optional["Region"] = None,
|
126
|
+
anchor: str = "start",
|
125
127
|
**kwargs,
|
126
128
|
) -> Union["Region", "FlowRegion"]:
|
127
129
|
"""
|
@@ -136,6 +138,9 @@ class DirectionalMixin:
|
|
136
138
|
include_endpoint: Whether to include the boundary element found by 'until'
|
137
139
|
offset: Pixel offset when excluding source/endpoint (default: None, uses natural_pdf.options.layout.directional_offset)
|
138
140
|
apply_exclusions: Whether to respect exclusions when using 'until' selector (default: True)
|
141
|
+
multipage: If True, allows the region to span multiple pages
|
142
|
+
within: Optional region to constrain the result to (default: None)
|
143
|
+
anchor: Reference point - 'start', 'center', 'end', or explicit edges like 'top', 'bottom', 'left', 'right'
|
139
144
|
**kwargs: Additional parameters for the 'until' selector search
|
140
145
|
|
141
146
|
Returns:
|
@@ -147,6 +152,37 @@ class DirectionalMixin:
|
|
147
152
|
is_positive = direction in ("right", "below") # right/below are positive directions
|
148
153
|
pixel_offset = offset # Use provided offset for excluding elements/endpoints
|
149
154
|
|
155
|
+
# Normalize anchor parameter
|
156
|
+
def normalize_anchor(anchor_value: str, dir: str) -> str:
    """Convert start/end/center to explicit edges based on direction.

    Args:
        anchor_value: 'start', 'center', 'end', or an already explicit edge
            name such as 'top', 'bottom', 'left' or 'right'.
        dir: Direction of travel: 'above', 'below', 'left' or 'right'.
            (The name shadows the ``dir`` builtin; it is kept so the
            existing call signature is unchanged.)

    Returns:
        For 'start', the edge being moved away from; for 'end', the edge
        being moved towards. 'center' and explicit edge names are returned
        unchanged. If the direction is not recognised, the anchor is
        returned as given instead of silently yielding None, so the
        declared ``str`` return type always holds.
    """
    # Start means the edge we're moving away from.
    start_edges = {"below": "top", "above": "bottom", "right": "left", "left": "right"}
    # End means the edge we're moving towards.
    end_edges = {"below": "bottom", "above": "top", "right": "right", "left": "left"}

    if anchor_value == "start":
        return start_edges.get(dir, anchor_value)
    if anchor_value == "end":
        return end_edges.get(dir, anchor_value)

    # 'center' or an already explicit edge (top/bottom/left/right).
    return anchor_value
|
183
|
+
|
184
|
+
normalized_anchor = normalize_anchor(anchor, direction)
|
185
|
+
|
150
186
|
# 1. Determine initial boundaries based on direction and include_source
|
151
187
|
if is_horizontal:
|
152
188
|
# Initial cross-boundaries (vertical)
|
@@ -200,34 +236,84 @@ class DirectionalMixin:
|
|
200
236
|
if until:
|
201
237
|
from natural_pdf.elements.element_collection import ElementCollection
|
202
238
|
|
239
|
+
# Get constraint region (from parameter or global options)
|
240
|
+
constraint_region = within or natural_pdf.options.layout.directional_within
|
241
|
+
|
242
|
+
# Check if until uses :closest selector (preserve ordering)
|
243
|
+
preserve_order = isinstance(until, str) and ":closest" in until
|
244
|
+
|
203
245
|
# If until is an elementcollection, just use it
|
204
246
|
if isinstance(until, ElementCollection):
|
205
247
|
# Only take ones on the same page
|
206
248
|
all_matches = [m for m in until if m.page == self.page]
|
207
249
|
else:
|
208
|
-
|
250
|
+
# If we have a constraint region, search within it instead of the whole page
|
251
|
+
if (
|
252
|
+
constraint_region
|
253
|
+
and hasattr(constraint_region, "page")
|
254
|
+
and constraint_region.page == self.page
|
255
|
+
):
|
256
|
+
all_matches = constraint_region.find_all(
|
257
|
+
until, apply_exclusions=apply_exclusions, **kwargs
|
258
|
+
)
|
259
|
+
else:
|
260
|
+
all_matches = self.page.find_all(
|
261
|
+
until, apply_exclusions=apply_exclusions, **kwargs
|
262
|
+
)
|
209
263
|
matches_in_direction = []
|
210
264
|
|
211
|
-
# Filter and sort matches based on direction
|
265
|
+
# Filter and sort matches based on direction and anchor parameter
|
212
266
|
# Also filter by cross-direction bounds when cross_size='element'
|
267
|
+
|
268
|
+
# IMPORTANT: Exclude self from matches to prevent finding ourselves
|
269
|
+
all_matches = [m for m in all_matches if m is not self]
|
270
|
+
|
271
|
+
# Determine reference point based on normalized_anchor
|
213
272
|
if direction == "above":
|
214
|
-
|
273
|
+
if normalized_anchor == "top":
|
274
|
+
ref_y = self.top
|
275
|
+
elif normalized_anchor == "center":
|
276
|
+
ref_y = (self.top + self.bottom) / 2
|
277
|
+
else: # 'bottom'
|
278
|
+
ref_y = self.bottom
|
279
|
+
|
280
|
+
matches_in_direction = [m for m in all_matches if m.bottom <= ref_y]
|
215
281
|
# Filter by horizontal bounds if cross_size='element'
|
216
282
|
if cross_size == "element":
|
217
283
|
matches_in_direction = [
|
218
284
|
m for m in matches_in_direction if m.x0 < self.x1 and m.x1 > self.x0
|
219
285
|
]
|
220
|
-
|
286
|
+
# Only sort by position if not using :closest (which is already sorted by quality)
|
287
|
+
if not preserve_order:
|
288
|
+
matches_in_direction.sort(key=lambda e: e.bottom, reverse=True)
|
289
|
+
|
221
290
|
elif direction == "below":
|
222
|
-
|
291
|
+
if normalized_anchor == "top":
|
292
|
+
ref_y = self.top
|
293
|
+
elif normalized_anchor == "center":
|
294
|
+
ref_y = (self.top + self.bottom) / 2
|
295
|
+
else: # 'bottom'
|
296
|
+
ref_y = self.bottom
|
297
|
+
|
298
|
+
matches_in_direction = [m for m in all_matches if m.top >= ref_y]
|
223
299
|
# Filter by horizontal bounds if cross_size='element'
|
224
300
|
if cross_size == "element":
|
225
301
|
matches_in_direction = [
|
226
302
|
m for m in matches_in_direction if m.x0 < self.x1 and m.x1 > self.x0
|
227
303
|
]
|
228
|
-
|
304
|
+
# Only sort by position if not using :closest (which is already sorted by quality)
|
305
|
+
if not preserve_order:
|
306
|
+
matches_in_direction.sort(key=lambda e: e.top)
|
307
|
+
|
229
308
|
elif direction == "left":
|
230
|
-
|
309
|
+
if normalized_anchor == "left":
|
310
|
+
ref_x = self.x0
|
311
|
+
elif normalized_anchor == "center":
|
312
|
+
ref_x = (self.x0 + self.x1) / 2
|
313
|
+
else: # 'right'
|
314
|
+
ref_x = self.x1
|
315
|
+
|
316
|
+
matches_in_direction = [m for m in all_matches if m.x1 <= ref_x]
|
231
317
|
# Filter by vertical bounds if cross_size='element'
|
232
318
|
if cross_size == "element":
|
233
319
|
matches_in_direction = [
|
@@ -235,9 +321,19 @@ class DirectionalMixin:
|
|
235
321
|
for m in matches_in_direction
|
236
322
|
if m.top < self.bottom and m.bottom > self.top
|
237
323
|
]
|
238
|
-
|
324
|
+
# Only sort by position if not using :closest (which is already sorted by quality)
|
325
|
+
if not preserve_order:
|
326
|
+
matches_in_direction.sort(key=lambda e: e.x1, reverse=True)
|
327
|
+
|
239
328
|
elif direction == "right":
|
240
|
-
|
329
|
+
if normalized_anchor == "left":
|
330
|
+
ref_x = self.x0
|
331
|
+
elif normalized_anchor == "center":
|
332
|
+
ref_x = (self.x0 + self.x1) / 2
|
333
|
+
else: # 'right'
|
334
|
+
ref_x = self.x1
|
335
|
+
|
336
|
+
matches_in_direction = [m for m in all_matches if m.x0 >= ref_x]
|
241
337
|
# Filter by vertical bounds if cross_size='element'
|
242
338
|
if cross_size == "element":
|
243
339
|
matches_in_direction = [
|
@@ -245,7 +341,9 @@ class DirectionalMixin:
|
|
245
341
|
for m in matches_in_direction
|
246
342
|
if m.top < self.bottom and m.bottom > self.top
|
247
343
|
]
|
248
|
-
|
344
|
+
# Only sort by position if not using :closest (which is already sorted by quality)
|
345
|
+
if not preserve_order:
|
346
|
+
matches_in_direction.sort(key=lambda e: e.x0)
|
249
347
|
|
250
348
|
if matches_in_direction:
|
251
349
|
target = matches_in_direction[0]
|
@@ -284,6 +382,22 @@ class DirectionalMixin:
|
|
284
382
|
final_y1 = max(bbox[1], bbox[3])
|
285
383
|
final_bbox = (final_x0, final_y0, final_x1, final_y1)
|
286
384
|
|
385
|
+
# 4.5. Apply within constraint if provided (or from global options)
|
386
|
+
constraint_region = within or natural_pdf.options.layout.directional_within
|
387
|
+
if constraint_region:
|
388
|
+
# Ensure constraint is on same page
|
389
|
+
if hasattr(constraint_region, "page") and constraint_region.page != self.page:
|
390
|
+
raise ValueError("within constraint must be on the same page as the source element")
|
391
|
+
|
392
|
+
# Apply constraint by intersecting with the constraint region's bounds
|
393
|
+
final_x0 = max(final_x0, constraint_region.x0)
|
394
|
+
final_y0 = max(final_y0, constraint_region.top)
|
395
|
+
final_x1 = min(final_x1, constraint_region.x1)
|
396
|
+
final_y1 = min(final_y1, constraint_region.bottom)
|
397
|
+
|
398
|
+
# Update final_bbox with constrained values
|
399
|
+
final_bbox = (final_x0, final_y0, final_x1, final_y1)
|
400
|
+
|
287
401
|
# 5. Check if multipage is needed
|
288
402
|
# Use global default if not explicitly set
|
289
403
|
use_multipage = multipage
|
@@ -291,6 +405,10 @@ class DirectionalMixin:
|
|
291
405
|
if not multipage and natural_pdf.options.layout.auto_multipage:
|
292
406
|
use_multipage = True
|
293
407
|
|
408
|
+
# Multipage is not supported with within constraint
|
409
|
+
if use_multipage and constraint_region:
|
410
|
+
raise ValueError("multipage navigation is not supported with within constraint")
|
411
|
+
|
294
412
|
# Prevent recursion: if called with internal flag, don't use multipage
|
295
413
|
if kwargs.get("_from_flow", False):
|
296
414
|
use_multipage = False
|
@@ -488,6 +606,8 @@ class DirectionalMixin:
|
|
488
606
|
offset: Optional[float] = None,
|
489
607
|
apply_exclusions: bool = True,
|
490
608
|
multipage: bool = False,
|
609
|
+
within: Optional["Region"] = None,
|
610
|
+
anchor: str = "start",
|
491
611
|
**kwargs,
|
492
612
|
) -> Union["Region", "FlowRegion"]:
|
493
613
|
"""
|
@@ -503,6 +623,8 @@ class DirectionalMixin:
|
|
503
623
|
apply_exclusions: Whether to respect exclusions when using 'until' selector (default: True)
|
504
624
|
multipage: If True, allows the region to span multiple pages. Returns FlowRegion
|
505
625
|
if the result spans multiple pages, Region otherwise (default: False)
|
626
|
+
within: Optional region to constrain the result to (default: None)
|
627
|
+
anchor: Reference point - 'start' (default), 'center', 'end', or explicit edges like 'top', 'bottom'
|
506
628
|
**kwargs: Additional parameters
|
507
629
|
|
508
630
|
Returns:
|
@@ -534,6 +656,8 @@ class DirectionalMixin:
|
|
534
656
|
offset=offset,
|
535
657
|
apply_exclusions=apply_exclusions,
|
536
658
|
multipage=multipage,
|
659
|
+
within=within,
|
660
|
+
anchor=anchor,
|
537
661
|
**kwargs,
|
538
662
|
)
|
539
663
|
|
@@ -547,6 +671,8 @@ class DirectionalMixin:
|
|
547
671
|
offset: Optional[float] = None,
|
548
672
|
apply_exclusions: bool = True,
|
549
673
|
multipage: bool = False,
|
674
|
+
within: Optional["Region"] = None,
|
675
|
+
anchor: str = "start",
|
550
676
|
**kwargs,
|
551
677
|
) -> Union["Region", "FlowRegion"]:
|
552
678
|
"""
|
@@ -562,6 +688,8 @@ class DirectionalMixin:
|
|
562
688
|
if the result spans multiple pages, Region otherwise (default: False)
|
563
689
|
offset: Pixel offset when excluding source/endpoint (default: None, uses natural_pdf.options.layout.directional_offset)
|
564
690
|
apply_exclusions: Whether to respect exclusions when using 'until' selector (default: True)
|
691
|
+
within: Optional region to constrain the result to (default: None)
|
692
|
+
anchor: Reference point - 'start' (default), 'center', 'end', or explicit edges like 'top', 'bottom'
|
565
693
|
**kwargs: Additional parameters
|
566
694
|
|
567
695
|
Returns:
|
@@ -593,6 +721,8 @@ class DirectionalMixin:
|
|
593
721
|
offset=offset,
|
594
722
|
apply_exclusions=apply_exclusions,
|
595
723
|
multipage=multipage,
|
724
|
+
within=within,
|
725
|
+
anchor=anchor,
|
596
726
|
**kwargs,
|
597
727
|
)
|
598
728
|
|
@@ -606,6 +736,8 @@ class DirectionalMixin:
|
|
606
736
|
offset: Optional[float] = None,
|
607
737
|
apply_exclusions: bool = True,
|
608
738
|
multipage: bool = False,
|
739
|
+
within: Optional["Region"] = None,
|
740
|
+
anchor: str = "start",
|
609
741
|
**kwargs,
|
610
742
|
) -> Union["Region", "FlowRegion"]:
|
611
743
|
"""
|
@@ -621,6 +753,8 @@ class DirectionalMixin:
|
|
621
753
|
apply_exclusions: Whether to respect exclusions when using 'until' selector (default: True)
|
622
754
|
multipage: If True, allows the region to span multiple pages. Returns FlowRegion
|
623
755
|
if the result spans multiple pages, Region otherwise (default: False)
|
756
|
+
within: Optional region to constrain the result to (default: None)
|
757
|
+
anchor: Reference point - 'start' (default), 'center', 'end', or explicit edges like 'left', 'right'
|
624
758
|
**kwargs: Additional parameters
|
625
759
|
|
626
760
|
Returns:
|
@@ -652,6 +786,8 @@ class DirectionalMixin:
|
|
652
786
|
offset=offset,
|
653
787
|
apply_exclusions=apply_exclusions,
|
654
788
|
multipage=multipage,
|
789
|
+
within=within,
|
790
|
+
anchor=anchor,
|
655
791
|
**kwargs,
|
656
792
|
)
|
657
793
|
|
@@ -665,6 +801,8 @@ class DirectionalMixin:
|
|
665
801
|
offset: Optional[float] = None,
|
666
802
|
apply_exclusions: bool = True,
|
667
803
|
multipage: bool = False,
|
804
|
+
within: Optional["Region"] = None,
|
805
|
+
anchor: str = "start",
|
668
806
|
**kwargs,
|
669
807
|
) -> Union["Region", "FlowRegion"]:
|
670
808
|
"""
|
@@ -680,6 +818,8 @@ class DirectionalMixin:
|
|
680
818
|
apply_exclusions: Whether to respect exclusions when using 'until' selector (default: True)
|
681
819
|
multipage: If True, allows the region to span multiple pages. Returns FlowRegion
|
682
820
|
if the result spans multiple pages, Region otherwise (default: False)
|
821
|
+
within: Optional region to constrain the result to (default: None)
|
822
|
+
anchor: Reference point - 'start' (default), 'center', 'end', or explicit edges like 'left', 'right'
|
683
823
|
**kwargs: Additional parameters
|
684
824
|
|
685
825
|
Returns:
|
@@ -711,6 +851,8 @@ class DirectionalMixin:
|
|
711
851
|
offset=offset,
|
712
852
|
apply_exclusions=apply_exclusions,
|
713
853
|
multipage=multipage,
|
854
|
+
within=within,
|
855
|
+
anchor=anchor,
|
714
856
|
**kwargs,
|
715
857
|
)
|
716
858
|
|
@@ -28,6 +28,7 @@ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_t
|
|
28
28
|
from PIL import Image, ImageDraw, ImageFont
|
29
29
|
from tqdm.auto import tqdm
|
30
30
|
|
31
|
+
from natural_pdf.analyzers.checkbox.mixin import CheckboxDetectionMixin
|
31
32
|
from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
|
32
33
|
from natural_pdf.classification.manager import ClassificationManager
|
33
34
|
from natural_pdf.classification.mixin import ClassificationMixin
|
@@ -83,6 +84,7 @@ class ElementCollection(
|
|
83
84
|
ApplyMixin,
|
84
85
|
ExportMixin,
|
85
86
|
ClassificationMixin,
|
87
|
+
CheckboxDetectionMixin,
|
86
88
|
DirectionalCollectionMixin,
|
87
89
|
DescribeMixin,
|
88
90
|
InspectMixin,
|
@@ -839,7 +841,6 @@ class ElementCollection(
|
|
839
841
|
result = " ".join(c.get("text", "") for c in all_char_dicts)
|
840
842
|
|
841
843
|
else:
|
842
|
-
print("JOIN WITHOUT LAYOUT")
|
843
844
|
# Default: Simple join without layout
|
844
845
|
logger.debug("ElementCollection.extract_text: Using simple join (layout=False).")
|
845
846
|
result = separator.join(el.extract_text() for el in text_elements)
|
@@ -3344,7 +3345,45 @@ class ElementCollection(
|
|
3344
3345
|
|
3345
3346
|
# Use collection's apply helper for optional progress bar
|
3346
3347
|
self.apply(_process, show_progress=show_progress)
|
3347
|
-
|
3348
|
+
|
3349
|
+
def detect_checkboxes(
    self, *args, show_progress: bool = False, **kwargs
) -> "ElementCollection":
    """
    Detect checkboxes on every element of the collection that supports it.

    Elements exposing a detect_checkboxes method (Pages and Regions) have it
    called; all other elements are passed over untouched.

    Args:
        *args: Positional arguments forwarded to each detect_checkboxes call.
        show_progress: Whether to show a progress bar during processing.
        **kwargs: Keyword arguments forwarded to each detect_checkboxes call.

    Returns:
        A new ElementCollection containing all detected checkbox regions.
    """
    collected = []

    def _collect(element):
        # Elements without checkbox support pass straight through.
        if not hasattr(element, "detect_checkboxes"):
            return element

        outcome = element.detect_checkboxes(*args, **kwargs)
        if hasattr(outcome, "elements"):
            # A collection: take its members.
            collected.extend(outcome.elements)
        elif isinstance(outcome, list):
            # A plain list of results.
            collected.extend(outcome)
        elif outcome:
            # A single truthy result.
            collected.append(outcome)
        return element

    # Use collection's apply helper for optional progress bar
    self.apply(_collect, show_progress=show_progress, desc="Detecting checkboxes")

    return ElementCollection(collected)
|
3348
3387
|
|
3349
3388
|
# ------------------------------------------------------------------
|
3350
3389
|
|