natural-pdf 0.2.17__py3-none-any.whl → 0.2.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. natural_pdf/__init__.py +8 -0
  2. natural_pdf/analyzers/checkbox/__init__.py +6 -0
  3. natural_pdf/analyzers/checkbox/base.py +265 -0
  4. natural_pdf/analyzers/checkbox/checkbox_analyzer.py +329 -0
  5. natural_pdf/analyzers/checkbox/checkbox_manager.py +166 -0
  6. natural_pdf/analyzers/checkbox/checkbox_options.py +60 -0
  7. natural_pdf/analyzers/checkbox/mixin.py +95 -0
  8. natural_pdf/analyzers/checkbox/rtdetr.py +201 -0
  9. natural_pdf/collections/mixins.py +14 -5
  10. natural_pdf/core/element_manager.py +5 -1
  11. natural_pdf/core/page.py +103 -9
  12. natural_pdf/core/page_collection.py +41 -1
  13. natural_pdf/core/pdf.py +24 -1
  14. natural_pdf/describe/base.py +20 -0
  15. natural_pdf/elements/base.py +152 -10
  16. natural_pdf/elements/element_collection.py +41 -2
  17. natural_pdf/elements/region.py +115 -2
  18. natural_pdf/judge.py +1509 -0
  19. natural_pdf/selectors/parser.py +42 -1
  20. natural_pdf/utils/spatial.py +42 -39
  21. {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/METADATA +1 -1
  22. {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/RECORD +42 -18
  23. temp/check_model.py +49 -0
  24. temp/check_pdf_content.py +9 -0
  25. temp/checkbox_checks.py +590 -0
  26. temp/checkbox_simple.py +117 -0
  27. temp/checkbox_ux_ideas.py +400 -0
  28. temp/context_manager_prototype.py +177 -0
  29. temp/convert_to_hf.py +60 -0
  30. temp/demo_text_closest.py +66 -0
  31. temp/inspect_model.py +43 -0
  32. temp/rtdetr_dinov2_test.py +49 -0
  33. temp/test_closest_debug.py +26 -0
  34. temp/test_closest_debug2.py +22 -0
  35. temp/test_context_exploration.py +85 -0
  36. temp/test_durham.py +30 -0
  37. temp/test_empty_string.py +16 -0
  38. temp/test_similarity.py +15 -0
  39. {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/WHEEL +0 -0
  40. {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/entry_points.txt +0 -0
  41. {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/licenses/LICENSE +0 -0
  42. {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py CHANGED
@@ -1,5 +1,6 @@
1
1
  import base64
2
2
  import concurrent.futures # Added import
3
+ import contextlib
3
4
  import hashlib
4
5
  import io
5
6
  import json
@@ -49,6 +50,7 @@ import numpy as np
49
50
  from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
50
51
  from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
51
52
 
53
+ from natural_pdf.analyzers.checkbox.mixin import CheckboxDetectionMixin
52
54
  from natural_pdf.analyzers.layout.layout_analyzer import LayoutAnalyzer
53
55
  from natural_pdf.analyzers.layout.layout_manager import LayoutManager
54
56
  from natural_pdf.analyzers.layout.layout_options import LayoutOptions
@@ -102,6 +104,7 @@ class Page(
102
104
  ClassificationMixin,
103
105
  ExtractionMixin,
104
106
  ShapeDetectionMixin,
107
+ CheckboxDetectionMixin,
105
108
  DescribeMixin,
106
109
  VisualSearchMixin,
107
110
  Visualizable,
@@ -275,6 +278,9 @@ class Page(
275
278
  self._load_elements()
276
279
  self._to_image_cache: Dict[tuple, Optional["Image.Image"]] = {}
277
280
 
281
+ # Flag to prevent infinite recursion when computing exclusions
282
+ self._computing_exclusions = False
283
+
278
284
  def _get_render_specs(
279
285
  self,
280
286
  mode: Literal["show", "render"] = "show",
@@ -412,6 +418,35 @@ class Page(
412
418
  self._exclusions = []
413
419
  return self
414
420
 
421
@contextlib.contextmanager
def without_exclusions(self):
    """
    Temporarily switch off exclusion filtering for this page.

    While the ``with`` block is active, ``_computing_exclusions`` is True,
    which makes the page's exclusion filter hand elements back unfiltered.
    This lets an exclusion callable run its own find() queries without
    re-triggering exclusion evaluation (and recursing forever).

    The flag value seen on entry is restored on exit, even when the body
    raises, so nested uses unwind correctly.

    Example:
        ```python
        # An exclusion defined via a callable that itself searches the page:
        page.add_exclusion(lambda p: p.find("text:contains('Header')").expand())

        # Internally the callable is evaluated like this:
        with page.without_exclusions():
            region = exclusion_callable(page)
        ```

    Yields:
        This page, with exclusion filtering disabled for the duration of
        the block.
    """
    # Remember the incoming state and raise the flag in one step.
    previous_flag, self._computing_exclusions = self._computing_exclusions, True
    try:
        yield self
    finally:
        # Put back whatever was there before (supports nesting).
        self._computing_exclusions = previous_flag
449
+
415
450
  def add_exclusion(
416
451
  self,
417
452
  exclusion_func_or_region: Union[
@@ -759,15 +794,10 @@ class Page(
759
794
  if debug:
760
795
  print(f" - Evaluating callable '{exclusion_label}'...")
761
796
 
762
- # Temporarily clear exclusions (consider if really needed)
763
- temp_original_exclusions = self._exclusions
764
- self._exclusions = []
765
-
766
- # Call the function - Expects it to return a Region or None
767
- region_result = exclusion_item(self)
768
-
769
- # Restore exclusions
770
- self._exclusions = temp_original_exclusions
797
+ # Use context manager to prevent infinite recursion
798
+ with self.without_exclusions():
799
+ # Call the function - Expects it to return a Region or None
800
+ region_result = exclusion_item(self)
771
801
 
772
802
  if isinstance(region_result, Region):
773
803
  # Assign the label to the returned region
@@ -947,6 +977,11 @@ class Page(
947
977
  Returns:
948
978
  A new list containing only the elements not excluded.
949
979
  """
980
+ # Skip exclusion filtering if we're currently computing exclusions
981
+ # This prevents infinite recursion when exclusion callables use find operations
982
+ if self._computing_exclusions:
983
+ return elements
984
+
950
985
  # Check both page-level and PDF-level exclusions
951
986
  has_page_exclusions = bool(self._exclusions)
952
987
  has_pdf_exclusions = (
@@ -1458,6 +1493,65 @@ class Page(
1458
1493
  "Cannot sort elements in reading order: Missing required attributes (top, x0)."
1459
1494
  )
1460
1495
 
1496
+ # Handle :closest pseudo-class for fuzzy text matching
1497
+ for pseudo in selector_obj.get("pseudo_classes", []):
1498
+ name = pseudo.get("name")
1499
+ if name == "closest" and pseudo.get("args") is not None:
1500
+ import difflib
1501
+
1502
+ # Parse search text and threshold
1503
+ search_text = str(pseudo["args"]).strip()
1504
+ threshold = 0.0 # Default threshold
1505
+
1506
+ # Handle empty search text
1507
+ if not search_text:
1508
+ matching_elements = []
1509
+ break
1510
+
1511
+ # Check if threshold is specified with @ separator
1512
+ if "@" in search_text and search_text.count("@") == 1:
1513
+ text_part, threshold_part = search_text.rsplit("@", 1)
1514
+ try:
1515
+ threshold = float(threshold_part)
1516
+ search_text = text_part.strip()
1517
+ except (ValueError, TypeError):
1518
+ pass # Keep original search_text and default threshold
1519
+
1520
+ # Determine case sensitivity
1521
+ ignore_case = not kwargs.get("case", False)
1522
+
1523
+ # Calculate similarity scores for all elements
1524
+ scored_elements = []
1525
+
1526
+ for el in matching_elements:
1527
+ if hasattr(el, "text") and el.text:
1528
+ el_text = el.text.strip()
1529
+ search_term = search_text
1530
+
1531
+ if ignore_case:
1532
+ el_text = el_text.lower()
1533
+ search_term = search_term.lower()
1534
+
1535
+ # Calculate similarity ratio
1536
+ ratio = difflib.SequenceMatcher(None, search_term, el_text).ratio()
1537
+
1538
+ # Check if element contains the search term as substring
1539
+ contains_match = search_term in el_text
1540
+
1541
+ # Store element with its similarity score and contains flag
1542
+ if ratio >= threshold:
1543
+ scored_elements.append((ratio, contains_match, el))
1544
+
1545
+ # Sort by:
1546
+ # 1. Contains match (True before False)
1547
+ # 2. Similarity score (highest first)
1548
+ # This ensures substring matches come first but are sorted by similarity
1549
+ scored_elements.sort(key=lambda x: (x[1], x[0]), reverse=True)
1550
+
1551
+ # Extract just the elements
1552
+ matching_elements = [el for _, _, el in scored_elements]
1553
+ break # Only process the first :closest pseudo-class
1554
+
1461
1555
  # Handle collection-level pseudo-classes (:first, :last)
1462
1556
  for pseudo in selector_obj.get("pseudo_classes", []):
1463
1557
  name = pseudo.get("name")
natural_pdf/core/page_collection.py CHANGED
@@ -28,6 +28,7 @@ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_t
28
28
  from PIL import Image, ImageDraw, ImageFont
29
29
  from tqdm.auto import tqdm
30
30
 
31
+ from natural_pdf.analyzers.checkbox.mixin import CheckboxDetectionMixin
31
32
  from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
32
33
  from natural_pdf.classification.manager import ClassificationManager
33
34
  from natural_pdf.classification.mixin import ClassificationMixin
@@ -76,7 +77,9 @@ T = TypeVar("T")
76
77
  P = TypeVar("P", bound="Page")
77
78
 
78
79
 
79
- class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Visualizable):
80
+ class PageCollection(
81
+ TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, CheckboxDetectionMixin, Visualizable
82
+ ):
80
83
  """
81
84
  Represents a collection of Page objects, often from a single PDF document.
82
85
  Provides methods for batch operations on these pages.
@@ -1506,6 +1509,43 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
1506
1509
 
1507
1510
  return ElementCollection(all_regions)
1508
1511
 
1512
def detect_checkboxes(self, *args, **kwargs) -> "ElementCollection[Region]":
    """
    Run checkbox detection page by page and merge the results.

    Every page in the collection has its own detect_checkboxes method
    invoked with the supplied arguments; the elements of each non-empty
    result are gathered into one flat ElementCollection.

    Args:
        *args: Positional arguments forwarded to each page's
            detect_checkboxes method.
        **kwargs: Keyword arguments forwarded to each page's
            detect_checkboxes method. The 'show_progress' key (default
            True) is consumed here and controls whether a progress bar
            is shown; it is not forwarded to the pages.

    Returns:
        An ElementCollection of all detected checkbox Region objects.
    """
    # Strip our own option before the remaining kwargs go to the pages.
    want_progress = kwargs.pop("show_progress", True)

    pages_iter = self.pages
    if want_progress:
        try:
            from tqdm.auto import tqdm

            pages_iter = tqdm(self.pages, desc="Detecting checkboxes")
        except ImportError:
            pass  # tqdm not installed

    merged = []
    for current_page in pages_iter:
        # Each page returns an ElementCollection (possibly empty or None).
        found = current_page.detect_checkboxes(*args, **kwargs)
        if not found:
            continue
        merged.extend(found.elements)

    return ElementCollection(merged)
1548
+
1509
1549
  def highlights(self, show: bool = False) -> "HighlightContext":
1510
1550
  """
1511
1551
  Create a highlight context for accumulating highlights.
natural_pdf/core/pdf.py CHANGED
@@ -27,6 +27,7 @@ from typing import (
27
27
  import pdfplumber
28
28
  from tqdm.auto import tqdm
29
29
 
30
+ from natural_pdf.analyzers.checkbox.mixin import CheckboxDetectionMixin
30
31
  from natural_pdf.analyzers.layout.layout_manager import LayoutManager
31
32
  from natural_pdf.classification.manager import ClassificationError
32
33
  from natural_pdf.classification.mixin import ClassificationMixin
@@ -303,7 +304,13 @@ class _LazyPageList(Sequence):
303
304
 
304
305
 
305
306
  class PDF(
306
- TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin, VisualSearchMixin, Visualizable
307
+ TextMixin,
308
+ ExtractionMixin,
309
+ ExportMixin,
310
+ ClassificationMixin,
311
+ CheckboxDetectionMixin,
312
+ VisualSearchMixin,
313
+ Visualizable,
307
314
  ):
308
315
  """Enhanced PDF wrapper built on top of pdfplumber.
309
316
 
@@ -2552,6 +2559,22 @@ class PDF(
2552
2559
  """
2553
2560
  return self.pages.analyze_layout(*args, **kwargs)
2554
2561
 
2562
def detect_checkboxes(self, *args, **kwargs) -> "ElementCollection[Region]":
    """
    Detect checkboxes across every page of this PDF.

    Convenience wrapper: the call is handed straight to
    ``self.pages.detect_checkboxes`` with the same arguments.

    Args:
        *args: Positional arguments passed to pages.detect_checkboxes().
        **kwargs: Keyword arguments passed to pages.detect_checkboxes().

    Returns:
        An ElementCollection of all detected checkbox Region objects.
    """
    page_collection = self.pages
    return page_collection.detect_checkboxes(*args, **kwargs)
2577
+
2555
2578
  def highlights(self, show: bool = False) -> "HighlightContext":
2556
2579
  """
2557
2580
  Create a highlight context for accumulating highlights.
natural_pdf/describe/base.py CHANGED
@@ -233,6 +233,17 @@ def inspect_collection(collection: "ElementCollection", limit: int = 30) -> Insp
233
233
  # Get appropriate columns for this type
234
234
  columns = _get_columns_for_type(element_type, show_page_column)
235
235
 
236
+ # Add checkbox state column if we have checkbox regions
237
+ if element_type == "region" and any(
238
+ getattr(e, "region_type", "") == "checkbox" for e in display_elements
239
+ ):
240
+ # Insert state column after type column
241
+ if "type" in columns:
242
+ type_idx = columns.index("type")
243
+ columns.insert(type_idx + 1, "state")
244
+ else:
245
+ columns.append("state")
246
+
236
247
  # Extract data for each element
237
248
  element_data = []
238
249
  for element in display_elements:
@@ -423,6 +434,15 @@ def _extract_element_value(element: "Element", column: str) -> Any:
423
434
  value = getattr(element, column, False)
424
435
  return value if isinstance(value, bool) else False
425
436
 
437
+ elif column == "state":
438
+ # For checkbox regions, show checked/unchecked state
439
+ if getattr(element, "region_type", "") == "checkbox":
440
+ if hasattr(element, "is_checked"):
441
+ return "checked" if element.is_checked else "unchecked"
442
+ elif hasattr(element, "checkbox_state"):
443
+ return element.checkbox_state
444
+ return ""
445
+
426
446
  else:
427
447
  # Generic attribute access
428
448
  value = getattr(element, column, "")
natural_pdf/elements/base.py CHANGED
@@ -122,6 +122,8 @@ class DirectionalMixin:
122
122
  offset: float = 0.0,
123
123
  apply_exclusions: bool = True,
124
124
  multipage: bool = False,
125
+ within: Optional["Region"] = None,
126
+ anchor: str = "start",
125
127
  **kwargs,
126
128
  ) -> Union["Region", "FlowRegion"]:
127
129
  """
@@ -136,6 +138,9 @@ class DirectionalMixin:
136
138
  include_endpoint: Whether to include the boundary element found by 'until'
137
139
  offset: Pixel offset when excluding source/endpoint (default: None, uses natural_pdf.options.layout.directional_offset)
138
140
  apply_exclusions: Whether to respect exclusions when using 'until' selector (default: True)
141
+ multipage: If True, allows the region to span multiple pages
142
+ within: Optional region to constrain the result to (default: None)
143
+ anchor: Reference point - 'start', 'center', 'end', or explicit edges like 'top', 'bottom', 'left', 'right'
139
144
  **kwargs: Additional parameters for the 'until' selector search
140
145
 
141
146
  Returns:
@@ -147,6 +152,37 @@ class DirectionalMixin:
147
152
  is_positive = direction in ("right", "below") # right/below are positive directions
148
153
  pixel_offset = offset # Use provided offset for excluding elements/endpoints
149
154
 
155
+ # Normalize anchor parameter
156
def normalize_anchor(anchor_value: str, dir: str) -> str:
    """Convert start/end/center to explicit edges based on direction.

    Args:
        anchor_value: 'start', 'end', 'center', or an already explicit
            edge name such as 'top', 'bottom', 'left', 'right'.
        dir: Direction of travel: 'below', 'above', 'right' or 'left'.
            (The name shadows the builtin but is kept so existing calls
            keep working.)

    Returns:
        The explicit edge name for 'start'/'end', or ``anchor_value``
        unchanged for 'center', explicit edges, and any combination that
        cannot be resolved. The result is always a string: previously an
        unrecognised direction combined with 'start'/'end' fell off the
        end of the function and returned None.
    """
    # 'start' is the edge we are moving away from.
    start_edges = {"below": "top", "above": "bottom", "right": "left", "left": "right"}
    # 'end' is the edge we are moving towards.
    end_edges = {"below": "bottom", "above": "top", "right": "right", "left": "left"}

    if anchor_value == "start":
        return start_edges.get(dir, anchor_value)
    if anchor_value == "end":
        return end_edges.get(dir, anchor_value)
    # 'center' and explicit edges (top/bottom/left/right) pass through.
    return anchor_value
183
+
184
+ normalized_anchor = normalize_anchor(anchor, direction)
185
+
150
186
  # 1. Determine initial boundaries based on direction and include_source
151
187
  if is_horizontal:
152
188
  # Initial cross-boundaries (vertical)
@@ -200,34 +236,84 @@ class DirectionalMixin:
200
236
  if until:
201
237
  from natural_pdf.elements.element_collection import ElementCollection
202
238
 
239
+ # Get constraint region (from parameter or global options)
240
+ constraint_region = within or natural_pdf.options.layout.directional_within
241
+
242
+ # Check if until uses :closest selector (preserve ordering)
243
+ preserve_order = isinstance(until, str) and ":closest" in until
244
+
203
245
  # If until is an elementcollection, just use it
204
246
  if isinstance(until, ElementCollection):
205
247
  # Only take ones on the same page
206
248
  all_matches = [m for m in until if m.page == self.page]
207
249
  else:
208
- all_matches = self.page.find_all(until, apply_exclusions=apply_exclusions, **kwargs)
250
+ # If we have a constraint region, search within it instead of the whole page
251
+ if (
252
+ constraint_region
253
+ and hasattr(constraint_region, "page")
254
+ and constraint_region.page == self.page
255
+ ):
256
+ all_matches = constraint_region.find_all(
257
+ until, apply_exclusions=apply_exclusions, **kwargs
258
+ )
259
+ else:
260
+ all_matches = self.page.find_all(
261
+ until, apply_exclusions=apply_exclusions, **kwargs
262
+ )
209
263
  matches_in_direction = []
210
264
 
211
- # Filter and sort matches based on direction
265
+ # Filter and sort matches based on direction and anchor parameter
212
266
  # Also filter by cross-direction bounds when cross_size='element'
267
+
268
+ # IMPORTANT: Exclude self from matches to prevent finding ourselves
269
+ all_matches = [m for m in all_matches if m is not self]
270
+
271
+ # Determine reference point based on normalized_anchor
213
272
  if direction == "above":
214
- matches_in_direction = [m for m in all_matches if m.bottom <= self.top]
273
+ if normalized_anchor == "top":
274
+ ref_y = self.top
275
+ elif normalized_anchor == "center":
276
+ ref_y = (self.top + self.bottom) / 2
277
+ else: # 'bottom'
278
+ ref_y = self.bottom
279
+
280
+ matches_in_direction = [m for m in all_matches if m.bottom <= ref_y]
215
281
  # Filter by horizontal bounds if cross_size='element'
216
282
  if cross_size == "element":
217
283
  matches_in_direction = [
218
284
  m for m in matches_in_direction if m.x0 < self.x1 and m.x1 > self.x0
219
285
  ]
220
- matches_in_direction.sort(key=lambda e: e.bottom, reverse=True)
286
+ # Only sort by position if not using :closest (which is already sorted by quality)
287
+ if not preserve_order:
288
+ matches_in_direction.sort(key=lambda e: e.bottom, reverse=True)
289
+
221
290
  elif direction == "below":
222
- matches_in_direction = [m for m in all_matches if m.top >= self.bottom]
291
+ if normalized_anchor == "top":
292
+ ref_y = self.top
293
+ elif normalized_anchor == "center":
294
+ ref_y = (self.top + self.bottom) / 2
295
+ else: # 'bottom'
296
+ ref_y = self.bottom
297
+
298
+ matches_in_direction = [m for m in all_matches if m.top >= ref_y]
223
299
  # Filter by horizontal bounds if cross_size='element'
224
300
  if cross_size == "element":
225
301
  matches_in_direction = [
226
302
  m for m in matches_in_direction if m.x0 < self.x1 and m.x1 > self.x0
227
303
  ]
228
- matches_in_direction.sort(key=lambda e: e.top)
304
+ # Only sort by position if not using :closest (which is already sorted by quality)
305
+ if not preserve_order:
306
+ matches_in_direction.sort(key=lambda e: e.top)
307
+
229
308
  elif direction == "left":
230
- matches_in_direction = [m for m in all_matches if m.x1 <= self.x0]
309
+ if normalized_anchor == "left":
310
+ ref_x = self.x0
311
+ elif normalized_anchor == "center":
312
+ ref_x = (self.x0 + self.x1) / 2
313
+ else: # 'right'
314
+ ref_x = self.x1
315
+
316
+ matches_in_direction = [m for m in all_matches if m.x1 <= ref_x]
231
317
  # Filter by vertical bounds if cross_size='element'
232
318
  if cross_size == "element":
233
319
  matches_in_direction = [
@@ -235,9 +321,19 @@ class DirectionalMixin:
235
321
  for m in matches_in_direction
236
322
  if m.top < self.bottom and m.bottom > self.top
237
323
  ]
238
- matches_in_direction.sort(key=lambda e: e.x1, reverse=True)
324
+ # Only sort by position if not using :closest (which is already sorted by quality)
325
+ if not preserve_order:
326
+ matches_in_direction.sort(key=lambda e: e.x1, reverse=True)
327
+
239
328
  elif direction == "right":
240
- matches_in_direction = [m for m in all_matches if m.x0 >= self.x1]
329
+ if normalized_anchor == "left":
330
+ ref_x = self.x0
331
+ elif normalized_anchor == "center":
332
+ ref_x = (self.x0 + self.x1) / 2
333
+ else: # 'right'
334
+ ref_x = self.x1
335
+
336
+ matches_in_direction = [m for m in all_matches if m.x0 >= ref_x]
241
337
  # Filter by vertical bounds if cross_size='element'
242
338
  if cross_size == "element":
243
339
  matches_in_direction = [
@@ -245,7 +341,9 @@ class DirectionalMixin:
245
341
  for m in matches_in_direction
246
342
  if m.top < self.bottom and m.bottom > self.top
247
343
  ]
248
- matches_in_direction.sort(key=lambda e: e.x0)
344
+ # Only sort by position if not using :closest (which is already sorted by quality)
345
+ if not preserve_order:
346
+ matches_in_direction.sort(key=lambda e: e.x0)
249
347
 
250
348
  if matches_in_direction:
251
349
  target = matches_in_direction[0]
@@ -284,6 +382,22 @@ class DirectionalMixin:
284
382
  final_y1 = max(bbox[1], bbox[3])
285
383
  final_bbox = (final_x0, final_y0, final_x1, final_y1)
286
384
 
385
+ # 4.5. Apply within constraint if provided (or from global options)
386
+ constraint_region = within or natural_pdf.options.layout.directional_within
387
+ if constraint_region:
388
+ # Ensure constraint is on same page
389
+ if hasattr(constraint_region, "page") and constraint_region.page != self.page:
390
+ raise ValueError("within constraint must be on the same page as the source element")
391
+
392
+ # Apply constraint by intersecting with the constraint region's bounds
393
+ final_x0 = max(final_x0, constraint_region.x0)
394
+ final_y0 = max(final_y0, constraint_region.top)
395
+ final_x1 = min(final_x1, constraint_region.x1)
396
+ final_y1 = min(final_y1, constraint_region.bottom)
397
+
398
+ # Update final_bbox with constrained values
399
+ final_bbox = (final_x0, final_y0, final_x1, final_y1)
400
+
287
401
  # 5. Check if multipage is needed
288
402
  # Use global default if not explicitly set
289
403
  use_multipage = multipage
@@ -291,6 +405,10 @@ class DirectionalMixin:
291
405
  if not multipage and natural_pdf.options.layout.auto_multipage:
292
406
  use_multipage = True
293
407
 
408
+ # Multipage is not supported with within constraint
409
+ if use_multipage and constraint_region:
410
+ raise ValueError("multipage navigation is not supported with within constraint")
411
+
294
412
  # Prevent recursion: if called with internal flag, don't use multipage
295
413
  if kwargs.get("_from_flow", False):
296
414
  use_multipage = False
@@ -488,6 +606,8 @@ class DirectionalMixin:
488
606
  offset: Optional[float] = None,
489
607
  apply_exclusions: bool = True,
490
608
  multipage: bool = False,
609
+ within: Optional["Region"] = None,
610
+ anchor: str = "start",
491
611
  **kwargs,
492
612
  ) -> Union["Region", "FlowRegion"]:
493
613
  """
@@ -503,6 +623,8 @@ class DirectionalMixin:
503
623
  apply_exclusions: Whether to respect exclusions when using 'until' selector (default: True)
504
624
  multipage: If True, allows the region to span multiple pages. Returns FlowRegion
505
625
  if the result spans multiple pages, Region otherwise (default: False)
626
+ within: Optional region to constrain the result to (default: None)
627
+ anchor: Reference point - 'start' (default), 'center', 'end', or explicit edges like 'top', 'bottom'
506
628
  **kwargs: Additional parameters
507
629
 
508
630
  Returns:
@@ -534,6 +656,8 @@ class DirectionalMixin:
534
656
  offset=offset,
535
657
  apply_exclusions=apply_exclusions,
536
658
  multipage=multipage,
659
+ within=within,
660
+ anchor=anchor,
537
661
  **kwargs,
538
662
  )
539
663
 
@@ -547,6 +671,8 @@ class DirectionalMixin:
547
671
  offset: Optional[float] = None,
548
672
  apply_exclusions: bool = True,
549
673
  multipage: bool = False,
674
+ within: Optional["Region"] = None,
675
+ anchor: str = "start",
550
676
  **kwargs,
551
677
  ) -> Union["Region", "FlowRegion"]:
552
678
  """
@@ -562,6 +688,8 @@ class DirectionalMixin:
562
688
  if the result spans multiple pages, Region otherwise (default: False)
563
689
  offset: Pixel offset when excluding source/endpoint (default: None, uses natural_pdf.options.layout.directional_offset)
564
690
  apply_exclusions: Whether to respect exclusions when using 'until' selector (default: True)
691
+ within: Optional region to constrain the result to (default: None)
692
+ anchor: Reference point - 'start' (default), 'center', 'end', or explicit edges like 'top', 'bottom'
565
693
  **kwargs: Additional parameters
566
694
 
567
695
  Returns:
@@ -593,6 +721,8 @@ class DirectionalMixin:
593
721
  offset=offset,
594
722
  apply_exclusions=apply_exclusions,
595
723
  multipage=multipage,
724
+ within=within,
725
+ anchor=anchor,
596
726
  **kwargs,
597
727
  )
598
728
 
@@ -606,6 +736,8 @@ class DirectionalMixin:
606
736
  offset: Optional[float] = None,
607
737
  apply_exclusions: bool = True,
608
738
  multipage: bool = False,
739
+ within: Optional["Region"] = None,
740
+ anchor: str = "start",
609
741
  **kwargs,
610
742
  ) -> Union["Region", "FlowRegion"]:
611
743
  """
@@ -621,6 +753,8 @@ class DirectionalMixin:
621
753
  apply_exclusions: Whether to respect exclusions when using 'until' selector (default: True)
622
754
  multipage: If True, allows the region to span multiple pages. Returns FlowRegion
623
755
  if the result spans multiple pages, Region otherwise (default: False)
756
+ within: Optional region to constrain the result to (default: None)
757
+ anchor: Reference point - 'start' (default), 'center', 'end', or explicit edges like 'left', 'right'
624
758
  **kwargs: Additional parameters
625
759
 
626
760
  Returns:
@@ -652,6 +786,8 @@ class DirectionalMixin:
652
786
  offset=offset,
653
787
  apply_exclusions=apply_exclusions,
654
788
  multipage=multipage,
789
+ within=within,
790
+ anchor=anchor,
655
791
  **kwargs,
656
792
  )
657
793
 
@@ -665,6 +801,8 @@ class DirectionalMixin:
665
801
  offset: Optional[float] = None,
666
802
  apply_exclusions: bool = True,
667
803
  multipage: bool = False,
804
+ within: Optional["Region"] = None,
805
+ anchor: str = "start",
668
806
  **kwargs,
669
807
  ) -> Union["Region", "FlowRegion"]:
670
808
  """
@@ -680,6 +818,8 @@ class DirectionalMixin:
680
818
  apply_exclusions: Whether to respect exclusions when using 'until' selector (default: True)
681
819
  multipage: If True, allows the region to span multiple pages. Returns FlowRegion
682
820
  if the result spans multiple pages, Region otherwise (default: False)
821
+ within: Optional region to constrain the result to (default: None)
822
+ anchor: Reference point - 'start' (default), 'center', 'end', or explicit edges like 'left', 'right'
683
823
  **kwargs: Additional parameters
684
824
 
685
825
  Returns:
@@ -711,6 +851,8 @@ class DirectionalMixin:
711
851
  offset=offset,
712
852
  apply_exclusions=apply_exclusions,
713
853
  multipage=multipage,
854
+ within=within,
855
+ anchor=anchor,
714
856
  **kwargs,
715
857
  )
716
858