natural-pdf 0.2.16__py3-none-any.whl → 0.2.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +45 -0
- natural_pdf/analyzers/guides.py +359 -0
- natural_pdf/core/element_manager.py +4 -0
- natural_pdf/core/page.py +130 -31
- natural_pdf/core/page_collection.py +75 -0
- natural_pdf/core/pdf.py +33 -0
- natural_pdf/describe/base.py +48 -7
- natural_pdf/elements/base.py +408 -43
- natural_pdf/elements/element_collection.py +83 -10
- natural_pdf/elements/region.py +217 -178
- natural_pdf/elements/text.py +5 -3
- natural_pdf/flows/element.py +1 -0
- natural_pdf/flows/flow.py +175 -480
- natural_pdf/flows/region.py +76 -0
- natural_pdf/selectors/parser.py +180 -9
- natural_pdf/utils/pdfminer_patches.py +136 -0
- natural_pdf/utils/sections.py +346 -0
- natural_pdf/utils/spatial.py +172 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/RECORD +24 -21
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/top_level.txt +0 -0
natural_pdf/elements/base.py
CHANGED
@@ -6,6 +6,8 @@ from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Uni
|
|
6
6
|
|
7
7
|
from PIL import Image
|
8
8
|
|
9
|
+
# Import global options
|
10
|
+
import natural_pdf
|
9
11
|
from natural_pdf.classification.mixin import ClassificationMixin
|
10
12
|
from natural_pdf.core.render_spec import RenderSpec, Visualizable
|
11
13
|
from natural_pdf.describe.mixin import DescribeMixin
|
@@ -18,6 +20,7 @@ if TYPE_CHECKING:
|
|
18
20
|
from natural_pdf.core.page import Page
|
19
21
|
from natural_pdf.elements.element_collection import ElementCollection
|
20
22
|
from natural_pdf.elements.region import Region
|
23
|
+
from natural_pdf.flows.region import FlowRegion
|
21
24
|
|
22
25
|
|
23
26
|
def extract_bbox(obj: Any) -> Optional[Tuple[float, float, float, float]]:
|
@@ -93,6 +96,16 @@ class DirectionalMixin:
|
|
93
96
|
- above(): Create region above
|
94
97
|
- below(): Create region below
|
95
98
|
|
99
|
+
Smart defaults:
|
100
|
+
- left() and right() default to element height
|
101
|
+
- above() and below() default to full page width
|
102
|
+
- All methods use a small offset (default 0.01 points) to avoid character overlap
|
103
|
+
|
104
|
+
Global offset configuration:
|
105
|
+
The default offset can be changed globally:
|
106
|
+
import natural_pdf as npdf
|
107
|
+
npdf.options.layout.directional_offset = 0.05 # Change to 0.05 points
|
108
|
+
|
96
109
|
Note:
|
97
110
|
This mixin requires the implementing class to have 'page', 'x0', 'top',
|
98
111
|
'x1', and 'bottom' attributes for coordinate calculations.
|
@@ -107,8 +120,10 @@ class DirectionalMixin:
|
|
107
120
|
until: Optional[str] = None,
|
108
121
|
include_endpoint: bool = True,
|
109
122
|
offset: float = 0.0,
|
123
|
+
apply_exclusions: bool = True,
|
124
|
+
multipage: bool = False,
|
110
125
|
**kwargs,
|
111
|
-
) -> "Region":
|
126
|
+
) -> Union["Region", "FlowRegion"]:
|
112
127
|
"""
|
113
128
|
Protected helper method to create a region in a specified direction relative to this element/region.
|
114
129
|
|
@@ -119,7 +134,8 @@ class DirectionalMixin:
|
|
119
134
|
include_source: Whether to include this element/region's area in the result
|
120
135
|
until: Optional selector string to specify a boundary element
|
121
136
|
include_endpoint: Whether to include the boundary element found by 'until'
|
122
|
-
offset: Pixel offset when excluding source/endpoint (default:
|
137
|
+
offset: Pixel offset when excluding source/endpoint (default: None, uses natural_pdf.options.layout.directional_offset)
|
138
|
+
apply_exclusions: Whether to respect exclusions when using 'until' selector (default: True)
|
123
139
|
**kwargs: Additional parameters for the 'until' selector search
|
124
140
|
|
125
141
|
Returns:
|
@@ -189,21 +205,46 @@ class DirectionalMixin:
|
|
189
205
|
# Only take ones on the same page
|
190
206
|
all_matches = [m for m in until if m.page == self.page]
|
191
207
|
else:
|
192
|
-
all_matches = self.page.find_all(until, **kwargs)
|
208
|
+
all_matches = self.page.find_all(until, apply_exclusions=apply_exclusions, **kwargs)
|
193
209
|
matches_in_direction = []
|
194
210
|
|
195
211
|
# Filter and sort matches based on direction
|
212
|
+
# Also filter by cross-direction bounds when cross_size='element'
|
196
213
|
if direction == "above":
|
197
214
|
matches_in_direction = [m for m in all_matches if m.bottom <= self.top]
|
215
|
+
# Filter by horizontal bounds if cross_size='element'
|
216
|
+
if cross_size == "element":
|
217
|
+
matches_in_direction = [
|
218
|
+
m for m in matches_in_direction if m.x0 < self.x1 and m.x1 > self.x0
|
219
|
+
]
|
198
220
|
matches_in_direction.sort(key=lambda e: e.bottom, reverse=True)
|
199
221
|
elif direction == "below":
|
200
222
|
matches_in_direction = [m for m in all_matches if m.top >= self.bottom]
|
223
|
+
# Filter by horizontal bounds if cross_size='element'
|
224
|
+
if cross_size == "element":
|
225
|
+
matches_in_direction = [
|
226
|
+
m for m in matches_in_direction if m.x0 < self.x1 and m.x1 > self.x0
|
227
|
+
]
|
201
228
|
matches_in_direction.sort(key=lambda e: e.top)
|
202
229
|
elif direction == "left":
|
203
230
|
matches_in_direction = [m for m in all_matches if m.x1 <= self.x0]
|
231
|
+
# Filter by vertical bounds if cross_size='element'
|
232
|
+
if cross_size == "element":
|
233
|
+
matches_in_direction = [
|
234
|
+
m
|
235
|
+
for m in matches_in_direction
|
236
|
+
if m.top < self.bottom and m.bottom > self.top
|
237
|
+
]
|
204
238
|
matches_in_direction.sort(key=lambda e: e.x1, reverse=True)
|
205
239
|
elif direction == "right":
|
206
240
|
matches_in_direction = [m for m in all_matches if m.x0 >= self.x1]
|
241
|
+
# Filter by vertical bounds if cross_size='element'
|
242
|
+
if cross_size == "element":
|
243
|
+
matches_in_direction = [
|
244
|
+
m
|
245
|
+
for m in matches_in_direction
|
246
|
+
if m.top < self.bottom and m.bottom > self.top
|
247
|
+
]
|
207
248
|
matches_in_direction.sort(key=lambda e: e.x0)
|
208
249
|
|
209
250
|
if matches_in_direction:
|
@@ -243,7 +284,51 @@ class DirectionalMixin:
|
|
243
284
|
final_y1 = max(bbox[1], bbox[3])
|
244
285
|
final_bbox = (final_x0, final_y0, final_x1, final_y1)
|
245
286
|
|
246
|
-
# 5.
|
287
|
+
# 5. Check if multipage is needed
|
288
|
+
# Use global default if not explicitly set
|
289
|
+
use_multipage = multipage
|
290
|
+
# If multipage is False but auto_multipage is True, use True
|
291
|
+
if not multipage and natural_pdf.options.layout.auto_multipage:
|
292
|
+
use_multipage = True
|
293
|
+
|
294
|
+
# Prevent recursion: if called with internal flag, don't use multipage
|
295
|
+
if kwargs.get("_from_flow", False):
|
296
|
+
use_multipage = False
|
297
|
+
|
298
|
+
if use_multipage:
|
299
|
+
# Check if we need to cross page boundaries
|
300
|
+
needs_multipage = False
|
301
|
+
|
302
|
+
# Case 1: until was specified but target not found on current page
|
303
|
+
if until and not target:
|
304
|
+
needs_multipage = True
|
305
|
+
|
306
|
+
# Case 2: size extends beyond page boundaries
|
307
|
+
if not until:
|
308
|
+
if direction == "below" and final_bbox[3] >= self.page.height:
|
309
|
+
needs_multipage = True
|
310
|
+
elif direction == "above" and final_bbox[1] <= 0:
|
311
|
+
needs_multipage = True
|
312
|
+
elif direction == "right" and final_bbox[2] >= self.page.width:
|
313
|
+
needs_multipage = True
|
314
|
+
elif direction == "left" and final_bbox[0] <= 0:
|
315
|
+
needs_multipage = True
|
316
|
+
|
317
|
+
if needs_multipage:
|
318
|
+
# Use multipage implementation
|
319
|
+
return self._direction_multipage(
|
320
|
+
direction=direction,
|
321
|
+
size=size,
|
322
|
+
cross_size=cross_size,
|
323
|
+
include_source=include_source,
|
324
|
+
until=until,
|
325
|
+
include_endpoint=include_endpoint,
|
326
|
+
offset=offset,
|
327
|
+
apply_exclusions=apply_exclusions,
|
328
|
+
**kwargs,
|
329
|
+
)
|
330
|
+
|
331
|
+
# 6. Create and return appropriate object based on self type
|
247
332
|
from natural_pdf.elements.region import Region
|
248
333
|
|
249
334
|
result = Region(self.page, final_bbox)
|
@@ -255,6 +340,144 @@ class DirectionalMixin:
|
|
255
340
|
|
256
341
|
return result
|
257
342
|
|
343
|
+
def _direction_multipage(
|
344
|
+
self,
|
345
|
+
direction: str,
|
346
|
+
size: Optional[float] = None,
|
347
|
+
cross_size: str = "full",
|
348
|
+
include_source: bool = False,
|
349
|
+
until: Optional[str] = None,
|
350
|
+
include_endpoint: bool = True,
|
351
|
+
offset: float = 0.0,
|
352
|
+
apply_exclusions: bool = True,
|
353
|
+
**kwargs,
|
354
|
+
) -> Union["Region", "FlowRegion"]:
|
355
|
+
"""
|
356
|
+
Handle multipage directional navigation by creating a Flow.
|
357
|
+
|
358
|
+
Returns FlowRegion if result spans multiple pages, Region if on single page.
|
359
|
+
"""
|
360
|
+
# Get access to the PDF to create a Flow
|
361
|
+
pdf = self.page.pdf
|
362
|
+
# Find the index of the current page
|
363
|
+
current_page_idx = None
|
364
|
+
for idx, page in enumerate(pdf.pages):
|
365
|
+
if page == self.page:
|
366
|
+
current_page_idx = idx
|
367
|
+
break
|
368
|
+
|
369
|
+
if current_page_idx is None:
|
370
|
+
# Fallback - just use current page
|
371
|
+
from natural_pdf.flows.flow import Flow
|
372
|
+
|
373
|
+
flow = Flow(segments=[self.page], arrangement="vertical")
|
374
|
+
from natural_pdf.flows.element import FlowElement
|
375
|
+
|
376
|
+
flow_element = FlowElement(physical_object=self, flow=flow)
|
377
|
+
return getattr(flow_element, direction)(**kwargs)
|
378
|
+
|
379
|
+
# Determine which pages to include in the Flow based on direction
|
380
|
+
if direction in ("below", "right"):
|
381
|
+
# Include current page and all following pages
|
382
|
+
flow_pages = pdf.pages[current_page_idx:]
|
383
|
+
else: # above, left
|
384
|
+
# Include all pages up to and including current page
|
385
|
+
flow_pages = pdf.pages[: current_page_idx + 1]
|
386
|
+
|
387
|
+
# Create a temporary Flow
|
388
|
+
from natural_pdf.flows.flow import Flow
|
389
|
+
|
390
|
+
flow = Flow(segments=list(flow_pages), arrangement="vertical")
|
391
|
+
|
392
|
+
# Find the element in the flow
|
393
|
+
# We need to create a FlowElement that corresponds to self
|
394
|
+
from natural_pdf.flows.element import FlowElement
|
395
|
+
|
396
|
+
flow_element = FlowElement(physical_object=self, flow=flow)
|
397
|
+
|
398
|
+
# Call the directional method on the FlowElement
|
399
|
+
# Remove parameters that FlowElement methods don't expect
|
400
|
+
flow_kwargs = kwargs.copy()
|
401
|
+
flow_kwargs.pop("multipage", None) # Remove multipage parameter
|
402
|
+
flow_kwargs.pop("apply_exclusions", None) # FlowElement might not have this
|
403
|
+
flow_kwargs.pop("offset", None) # FlowElement doesn't have offset
|
404
|
+
flow_kwargs.pop("cross_alignment", None) # Remove to avoid duplicate
|
405
|
+
|
406
|
+
# Map cross_size to appropriate FlowElement parameter
|
407
|
+
if direction in ["below", "above"]:
|
408
|
+
# For vertical directions, cross_size maps to width parameters
|
409
|
+
if cross_size == "full":
|
410
|
+
width_absolute = None # Let FlowElement use its defaults
|
411
|
+
elif cross_size == "element":
|
412
|
+
width_absolute = self.width
|
413
|
+
elif isinstance(cross_size, (int, float)):
|
414
|
+
width_absolute = cross_size
|
415
|
+
else:
|
416
|
+
width_absolute = None
|
417
|
+
|
418
|
+
result = (
|
419
|
+
flow_element.below(
|
420
|
+
height=size,
|
421
|
+
width_absolute=width_absolute,
|
422
|
+
include_source=include_source,
|
423
|
+
until=until,
|
424
|
+
include_endpoint=include_endpoint,
|
425
|
+
**flow_kwargs,
|
426
|
+
)
|
427
|
+
if direction == "below"
|
428
|
+
else flow_element.above(
|
429
|
+
height=size,
|
430
|
+
width_absolute=width_absolute,
|
431
|
+
include_source=include_source,
|
432
|
+
until=until,
|
433
|
+
include_endpoint=include_endpoint,
|
434
|
+
**flow_kwargs,
|
435
|
+
)
|
436
|
+
)
|
437
|
+
else: # left, right
|
438
|
+
# For horizontal directions, cross_size maps to height parameters
|
439
|
+
if cross_size == "full":
|
440
|
+
height_absolute = None # Let FlowElement use its defaults
|
441
|
+
elif cross_size == "element":
|
442
|
+
height_absolute = self.height
|
443
|
+
elif isinstance(cross_size, (int, float)):
|
444
|
+
height_absolute = cross_size
|
445
|
+
else:
|
446
|
+
height_absolute = None
|
447
|
+
|
448
|
+
result = (
|
449
|
+
flow_element.left(
|
450
|
+
width=size,
|
451
|
+
height_absolute=height_absolute,
|
452
|
+
include_source=include_source,
|
453
|
+
until=until,
|
454
|
+
include_endpoint=include_endpoint,
|
455
|
+
**flow_kwargs,
|
456
|
+
)
|
457
|
+
if direction == "left"
|
458
|
+
else flow_element.right(
|
459
|
+
width=size,
|
460
|
+
height_absolute=height_absolute,
|
461
|
+
include_source=include_source,
|
462
|
+
until=until,
|
463
|
+
include_endpoint=include_endpoint,
|
464
|
+
**flow_kwargs,
|
465
|
+
)
|
466
|
+
)
|
467
|
+
|
468
|
+
# If the result is a FlowRegion with only one constituent region,
|
469
|
+
# return that Region instead
|
470
|
+
from natural_pdf.flows.region import FlowRegion
|
471
|
+
|
472
|
+
if isinstance(result, FlowRegion) and len(result.constituent_regions) == 1:
|
473
|
+
single_region = result.constituent_regions[0]
|
474
|
+
# Copy over any metadata
|
475
|
+
if hasattr(result, "boundary_element_found"):
|
476
|
+
single_region.boundary_element = result.boundary_element_found
|
477
|
+
return single_region
|
478
|
+
|
479
|
+
return result
|
480
|
+
|
258
481
|
def above(
|
259
482
|
self,
|
260
483
|
height: Optional[float] = None,
|
@@ -262,9 +485,11 @@ class DirectionalMixin:
|
|
262
485
|
include_source: bool = False,
|
263
486
|
until: Optional[str] = None,
|
264
487
|
include_endpoint: bool = True,
|
265
|
-
offset: float =
|
488
|
+
offset: Optional[float] = None,
|
489
|
+
apply_exclusions: bool = True,
|
490
|
+
multipage: bool = False,
|
266
491
|
**kwargs,
|
267
|
-
) -> "Region":
|
492
|
+
) -> Union["Region", "FlowRegion"]:
|
268
493
|
"""
|
269
494
|
Select region above this element/region.
|
270
495
|
|
@@ -274,7 +499,10 @@ class DirectionalMixin:
|
|
274
499
|
include_source: Whether to include this element/region in the result (default: False)
|
275
500
|
until: Optional selector string to specify an upper boundary element
|
276
501
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
277
|
-
offset: Pixel offset when excluding source/endpoint (default:
|
502
|
+
offset: Pixel offset when excluding source/endpoint (default: None, uses natural_pdf.options.layout.directional_offset)
|
503
|
+
apply_exclusions: Whether to respect exclusions when using 'until' selector (default: True)
|
504
|
+
multipage: If True, allows the region to span multiple pages. Returns FlowRegion
|
505
|
+
if the result spans multiple pages, Region otherwise (default: False)
|
278
506
|
**kwargs: Additional parameters
|
279
507
|
|
280
508
|
Returns:
|
@@ -292,6 +520,10 @@ class DirectionalMixin:
|
|
292
520
|
signature.above(until='text:contains("Date")') # Region from date to signature
|
293
521
|
```
|
294
522
|
"""
|
523
|
+
# Use global default if offset not provided
|
524
|
+
if offset is None:
|
525
|
+
offset = natural_pdf.options.layout.directional_offset
|
526
|
+
|
295
527
|
return self._direction(
|
296
528
|
direction="above",
|
297
529
|
size=height,
|
@@ -300,6 +532,8 @@ class DirectionalMixin:
|
|
300
532
|
until=until,
|
301
533
|
include_endpoint=include_endpoint,
|
302
534
|
offset=offset,
|
535
|
+
apply_exclusions=apply_exclusions,
|
536
|
+
multipage=multipage,
|
303
537
|
**kwargs,
|
304
538
|
)
|
305
539
|
|
@@ -310,9 +544,11 @@ class DirectionalMixin:
|
|
310
544
|
include_source: bool = False,
|
311
545
|
until: Optional[str] = None,
|
312
546
|
include_endpoint: bool = True,
|
313
|
-
offset: float =
|
547
|
+
offset: Optional[float] = None,
|
548
|
+
apply_exclusions: bool = True,
|
549
|
+
multipage: bool = False,
|
314
550
|
**kwargs,
|
315
|
-
) -> "Region":
|
551
|
+
) -> Union["Region", "FlowRegion"]:
|
316
552
|
"""
|
317
553
|
Select region below this element/region.
|
318
554
|
|
@@ -322,7 +558,10 @@ class DirectionalMixin:
|
|
322
558
|
include_source: Whether to include this element/region in the result (default: False)
|
323
559
|
until: Optional selector string to specify a lower boundary element
|
324
560
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
325
|
-
|
561
|
+
multipage: If True, allows the region to span multiple pages. Returns FlowRegion
|
562
|
+
if the result spans multiple pages, Region otherwise (default: False)
|
563
|
+
offset: Pixel offset when excluding source/endpoint (default: None, uses natural_pdf.options.layout.directional_offset)
|
564
|
+
apply_exclusions: Whether to respect exclusions when using 'until' selector (default: True)
|
326
565
|
**kwargs: Additional parameters
|
327
566
|
|
328
567
|
Returns:
|
@@ -340,6 +579,10 @@ class DirectionalMixin:
|
|
340
579
|
header.below(height=200) # Gets 200pt tall region below header
|
341
580
|
```
|
342
581
|
"""
|
582
|
+
# Use global default if offset not provided
|
583
|
+
if offset is None:
|
584
|
+
offset = natural_pdf.options.layout.directional_offset
|
585
|
+
|
343
586
|
return self._direction(
|
344
587
|
direction="below",
|
345
588
|
size=height,
|
@@ -348,6 +591,8 @@ class DirectionalMixin:
|
|
348
591
|
until=until,
|
349
592
|
include_endpoint=include_endpoint,
|
350
593
|
offset=offset,
|
594
|
+
apply_exclusions=apply_exclusions,
|
595
|
+
multipage=multipage,
|
351
596
|
**kwargs,
|
352
597
|
)
|
353
598
|
|
@@ -358,9 +603,11 @@ class DirectionalMixin:
|
|
358
603
|
include_source: bool = False,
|
359
604
|
until: Optional[str] = None,
|
360
605
|
include_endpoint: bool = True,
|
361
|
-
offset: float =
|
606
|
+
offset: Optional[float] = None,
|
607
|
+
apply_exclusions: bool = True,
|
608
|
+
multipage: bool = False,
|
362
609
|
**kwargs,
|
363
|
-
) -> "Region":
|
610
|
+
) -> Union["Region", "FlowRegion"]:
|
364
611
|
"""
|
365
612
|
Select region to the left of this element/region.
|
366
613
|
|
@@ -370,7 +617,10 @@ class DirectionalMixin:
|
|
370
617
|
include_source: Whether to include this element/region in the result (default: False)
|
371
618
|
until: Optional selector string to specify a left boundary element
|
372
619
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
373
|
-
offset: Pixel offset when excluding source/endpoint (default:
|
620
|
+
offset: Pixel offset when excluding source/endpoint (default: None, uses natural_pdf.options.layout.directional_offset)
|
621
|
+
apply_exclusions: Whether to respect exclusions when using 'until' selector (default: True)
|
622
|
+
multipage: If True, allows the region to span multiple pages. Returns FlowRegion
|
623
|
+
if the result spans multiple pages, Region otherwise (default: False)
|
374
624
|
**kwargs: Additional parameters
|
375
625
|
|
376
626
|
Returns:
|
@@ -388,6 +638,10 @@ class DirectionalMixin:
|
|
388
638
|
table.left(height=100) # Gets 100pt tall region to the left
|
389
639
|
```
|
390
640
|
"""
|
641
|
+
# Use global default if offset not provided
|
642
|
+
if offset is None:
|
643
|
+
offset = natural_pdf.options.layout.directional_offset
|
644
|
+
|
391
645
|
return self._direction(
|
392
646
|
direction="left",
|
393
647
|
size=width,
|
@@ -396,6 +650,8 @@ class DirectionalMixin:
|
|
396
650
|
until=until,
|
397
651
|
include_endpoint=include_endpoint,
|
398
652
|
offset=offset,
|
653
|
+
apply_exclusions=apply_exclusions,
|
654
|
+
multipage=multipage,
|
399
655
|
**kwargs,
|
400
656
|
)
|
401
657
|
|
@@ -406,9 +662,11 @@ class DirectionalMixin:
|
|
406
662
|
include_source: bool = False,
|
407
663
|
until: Optional[str] = None,
|
408
664
|
include_endpoint: bool = True,
|
409
|
-
offset: float =
|
665
|
+
offset: Optional[float] = None,
|
666
|
+
apply_exclusions: bool = True,
|
667
|
+
multipage: bool = False,
|
410
668
|
**kwargs,
|
411
|
-
) -> "Region":
|
669
|
+
) -> Union["Region", "FlowRegion"]:
|
412
670
|
"""
|
413
671
|
Select region to the right of this element/region.
|
414
672
|
|
@@ -418,7 +676,10 @@ class DirectionalMixin:
|
|
418
676
|
include_source: Whether to include this element/region in the result (default: False)
|
419
677
|
until: Optional selector string to specify a right boundary element
|
420
678
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
421
|
-
offset: Pixel offset when excluding source/endpoint (default:
|
679
|
+
offset: Pixel offset when excluding source/endpoint (default: None, uses natural_pdf.options.layout.directional_offset)
|
680
|
+
apply_exclusions: Whether to respect exclusions when using 'until' selector (default: True)
|
681
|
+
multipage: If True, allows the region to span multiple pages. Returns FlowRegion
|
682
|
+
if the result spans multiple pages, Region otherwise (default: False)
|
422
683
|
**kwargs: Additional parameters
|
423
684
|
|
424
685
|
Returns:
|
@@ -436,6 +697,10 @@ class DirectionalMixin:
|
|
436
697
|
label.right(height=50) # Gets 50pt tall region to the right
|
437
698
|
```
|
438
699
|
"""
|
700
|
+
# Use global default if offset not provided
|
701
|
+
if offset is None:
|
702
|
+
offset = natural_pdf.options.layout.directional_offset
|
703
|
+
|
439
704
|
return self._direction(
|
440
705
|
direction="right",
|
441
706
|
size=width,
|
@@ -444,6 +709,8 @@ class DirectionalMixin:
|
|
444
709
|
until=until,
|
445
710
|
include_endpoint=include_endpoint,
|
446
711
|
offset=offset,
|
712
|
+
apply_exclusions=apply_exclusions,
|
713
|
+
multipage=multipage,
|
447
714
|
**kwargs,
|
448
715
|
)
|
449
716
|
|
@@ -451,7 +718,7 @@ class DirectionalMixin:
|
|
451
718
|
return self.expand()
|
452
719
|
|
453
720
|
@overload
|
454
|
-
def expand(self, amount: float) -> "Region":
|
721
|
+
def expand(self, amount: float, *, apply_exclusions: bool = True) -> "Region":
|
455
722
|
"""Expand in all directions by the same amount."""
|
456
723
|
...
|
457
724
|
|
@@ -459,12 +726,13 @@ class DirectionalMixin:
|
|
459
726
|
def expand(
|
460
727
|
self,
|
461
728
|
*,
|
462
|
-
left: float = 0,
|
463
|
-
right: float = 0,
|
464
|
-
top: float = 0,
|
465
|
-
bottom: float = 0,
|
729
|
+
left: Union[float, bool, str] = 0,
|
730
|
+
right: Union[float, bool, str] = 0,
|
731
|
+
top: Union[float, bool, str] = 0,
|
732
|
+
bottom: Union[float, bool, str] = 0,
|
466
733
|
width_factor: float = 1.0,
|
467
734
|
height_factor: float = 1.0,
|
735
|
+
apply_exclusions: bool = True,
|
468
736
|
) -> "Region":
|
469
737
|
"""Expand by different amounts in each direction."""
|
470
738
|
...
|
@@ -472,24 +740,29 @@ class DirectionalMixin:
|
|
472
740
|
def expand(
|
473
741
|
self,
|
474
742
|
amount: Optional[float] = None,
|
475
|
-
left: float = 0,
|
476
|
-
right: float = 0,
|
477
|
-
top: float = 0,
|
478
|
-
bottom: float = 0,
|
743
|
+
left: Union[float, bool, str] = 0,
|
744
|
+
right: Union[float, bool, str] = 0,
|
745
|
+
top: Union[float, bool, str] = 0,
|
746
|
+
bottom: Union[float, bool, str] = 0,
|
479
747
|
width_factor: float = 1.0,
|
480
748
|
height_factor: float = 1.0,
|
749
|
+
apply_exclusions: bool = True,
|
481
750
|
) -> "Region":
|
482
751
|
"""
|
483
752
|
Create a new region expanded from this element/region.
|
484
753
|
|
485
754
|
Args:
|
486
755
|
amount: If provided as the first positional argument, expand all edges by this amount
|
487
|
-
left: Amount to expand left edge
|
488
|
-
|
489
|
-
|
490
|
-
|
756
|
+
left: Amount to expand left edge:
|
757
|
+
- float: Fixed pixel expansion
|
758
|
+
- True: Expand to page edge
|
759
|
+
- str: Selector to expand until (excludes target by default, prefix with '+' to include)
|
760
|
+
right: Amount to expand right edge (same options as left)
|
761
|
+
top: Amount to expand top edge (same options as left)
|
762
|
+
bottom: Amount to expand bottom edge (same options as left)
|
491
763
|
width_factor: Factor to multiply width by (applied after absolute expansion)
|
492
764
|
height_factor: Factor to multiply height by (applied after absolute expansion)
|
765
|
+
apply_exclusions: Whether to respect exclusions when using selectors (default: True)
|
493
766
|
|
494
767
|
Returns:
|
495
768
|
New expanded Region object
|
@@ -501,31 +774,108 @@ class DirectionalMixin:
|
|
501
774
|
# Expand by different amounts in each direction
|
502
775
|
expanded = element.expand(left=10, right=5, top=3, bottom=7)
|
503
776
|
|
777
|
+
# Expand to page edges
|
778
|
+
expanded = element.expand(left=True, right=True) # Full width
|
779
|
+
|
780
|
+
# Expand until specific elements
|
781
|
+
statute = page.find('text:contains("Statute")')
|
782
|
+
expanded = statute.expand(right='text:contains("Repeat?")') # Excludes "Repeat?"
|
783
|
+
expanded = statute.expand(right='+text:contains("Repeat?")') # Includes "Repeat?"
|
784
|
+
|
504
785
|
# Use width/height factors
|
505
786
|
expanded = element.expand(width_factor=1.5, height_factor=2.0)
|
506
787
|
"""
|
507
788
|
# If amount is provided as first positional argument, use it for all directions
|
508
789
|
if amount is not None:
|
509
790
|
left = right = top = bottom = amount
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
791
|
+
|
792
|
+
# Helper function to process expansion values
|
793
|
+
def process_expansion(value, direction):
|
794
|
+
"""Process expansion value and return the new coordinate."""
|
795
|
+
is_horizontal = direction in ("left", "right")
|
796
|
+
is_positive = direction in ("right", "bottom")
|
797
|
+
|
798
|
+
# Get current bounds
|
799
|
+
if is_horizontal:
|
800
|
+
current_edge = self.x1 if is_positive else self.x0
|
801
|
+
page_limit = self.page.width if is_positive else 0
|
802
|
+
else:
|
803
|
+
current_edge = self.bottom if is_positive else self.top
|
804
|
+
page_limit = self.page.height if is_positive else 0
|
805
|
+
|
806
|
+
# Handle boolean True - expand to page edge
|
807
|
+
if value is True:
|
808
|
+
return page_limit
|
809
|
+
|
810
|
+
# Handle numeric values - fixed pixel expansion
|
811
|
+
elif isinstance(value, (int, float)):
|
812
|
+
if is_positive:
|
813
|
+
return current_edge + value
|
814
|
+
else:
|
815
|
+
return current_edge - value
|
816
|
+
|
817
|
+
# Handle string selectors - use directional methods
|
818
|
+
elif isinstance(value, str):
|
819
|
+
# Check if we should include the endpoint
|
820
|
+
include_endpoint = value.startswith("+")
|
821
|
+
selector = value[1:] if include_endpoint else value
|
822
|
+
|
823
|
+
# Use directional methods to get the region
|
824
|
+
if direction == "left":
|
825
|
+
region = self.left(
|
826
|
+
until=selector,
|
827
|
+
include_endpoint=include_endpoint,
|
828
|
+
include_source=True,
|
829
|
+
apply_exclusions=apply_exclusions,
|
830
|
+
)
|
831
|
+
return region.x0
|
832
|
+
elif direction == "right":
|
833
|
+
region = self.right(
|
834
|
+
until=selector,
|
835
|
+
include_endpoint=include_endpoint,
|
836
|
+
include_source=True,
|
837
|
+
apply_exclusions=apply_exclusions,
|
838
|
+
)
|
839
|
+
return region.x1
|
840
|
+
elif direction == "top":
|
841
|
+
region = self.above(
|
842
|
+
until=selector,
|
843
|
+
include_endpoint=include_endpoint,
|
844
|
+
include_source=True,
|
845
|
+
width="element",
|
846
|
+
apply_exclusions=apply_exclusions,
|
847
|
+
)
|
848
|
+
return region.top
|
849
|
+
elif direction == "bottom":
|
850
|
+
region = self.below(
|
851
|
+
until=selector,
|
852
|
+
include_endpoint=include_endpoint,
|
853
|
+
include_source=True,
|
854
|
+
width="element",
|
855
|
+
apply_exclusions=apply_exclusions,
|
856
|
+
)
|
857
|
+
return region.bottom
|
858
|
+
|
859
|
+
# Should not reach here
|
860
|
+
return current_edge
|
861
|
+
|
862
|
+
else:
|
863
|
+
# Invalid value type, return current edge
|
864
|
+
return current_edge
|
865
|
+
|
866
|
+
# Process each direction
|
867
|
+
new_x0 = process_expansion(left, "left") if left else self.x0
|
868
|
+
new_x1 = process_expansion(right, "right") if right else self.x1
|
869
|
+
new_top = process_expansion(top, "top") if top else self.top
|
870
|
+
new_bottom = process_expansion(bottom, "bottom") if bottom else self.bottom
|
521
871
|
|
522
872
|
# Apply percentage factors if provided
|
523
873
|
if width_factor != 1.0 or height_factor != 1.0:
|
524
|
-
# Calculate center point *after*
|
874
|
+
# Calculate center point *after* expansion
|
525
875
|
center_x = (new_x0 + new_x1) / 2
|
526
876
|
center_y = (new_top + new_bottom) / 2
|
527
877
|
|
528
|
-
# Calculate current width and height *after*
|
878
|
+
# Calculate current width and height *after* expansion
|
529
879
|
current_width = new_x1 - new_x0
|
530
880
|
current_height = new_bottom - new_top
|
531
881
|
|
@@ -1210,7 +1560,22 @@ class Element(
|
|
1210
1560
|
return self
|
1211
1561
|
|
1212
1562
|
def exclude(self):
|
1213
|
-
|
1563
|
+
"""
|
1564
|
+
Exclude this element from text extraction and other operations.
|
1565
|
+
|
1566
|
+
For Region elements, this excludes everything within the region's bounds.
|
1567
|
+
For other elements (like TextElement), this excludes only the specific element,
|
1568
|
+
not the entire area it occupies.
|
1569
|
+
"""
|
1570
|
+
from natural_pdf.elements.region import Region
|
1571
|
+
|
1572
|
+
# Use 'region' method for Region objects, 'element' method for everything else
|
1573
|
+
if isinstance(self, Region):
|
1574
|
+
method = "region"
|
1575
|
+
else:
|
1576
|
+
method = "element"
|
1577
|
+
|
1578
|
+
self.page.add_exclusion(self, method=method)
|
1214
1579
|
|
1215
1580
|
def _get_render_specs(
|
1216
1581
|
self,
|