natural-pdf 0.2.1.dev0__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/guides.py +159 -3
- natural_pdf/collections/mixins.py +16 -3
- natural_pdf/core/highlighting_service.py +33 -9
- natural_pdf/core/page.py +138 -7
- natural_pdf/core/page_collection.py +51 -14
- natural_pdf/core/page_groupby.py +229 -0
- natural_pdf/core/render_spec.py +62 -4
- natural_pdf/elements/base.py +102 -20
- natural_pdf/elements/element_collection.py +11 -10
- natural_pdf/elements/region.py +21 -21
- natural_pdf/elements/text.py +5 -0
- natural_pdf/extraction/manager.py +8 -14
- natural_pdf/extraction/mixin.py +35 -21
- natural_pdf/selectors/parser.py +2 -2
- natural_pdf/tables/result.py +37 -0
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.3.dist-info}/METADATA +2 -2
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.3.dist-info}/RECORD +23 -22
- optimization/performance_analysis.py +1 -1
- tools/bad_pdf_eval/analyser.py +1 -1
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.3.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.3.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.3.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.3.dist-info}/top_level.txt +0 -0
natural_pdf/analyzers/guides.py
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
import json
|
4
4
|
import logging
|
5
5
|
from collections import UserList
|
6
|
-
from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
|
6
|
+
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Tuple, Union
|
7
7
|
|
8
8
|
import numpy as np
|
9
9
|
from PIL import Image, ImageDraw
|
@@ -16,6 +16,7 @@ if TYPE_CHECKING:
|
|
16
16
|
from natural_pdf.elements.element_collection import ElementCollection
|
17
17
|
from natural_pdf.elements.region import Region
|
18
18
|
from natural_pdf.flows.region import FlowRegion
|
19
|
+
from natural_pdf.tables.result import TableResult
|
19
20
|
|
20
21
|
logger = logging.getLogger(__name__)
|
21
22
|
|
@@ -131,6 +132,15 @@ class GuidesList(UserList):
|
|
131
132
|
self._parent = parent_guides
|
132
133
|
self._axis = axis
|
133
134
|
|
135
|
+
def __getitem__(self, i):
|
136
|
+
"""Override to handle slicing properly."""
|
137
|
+
if isinstance(i, slice):
|
138
|
+
# Return a new GuidesList with the sliced data
|
139
|
+
return self.__class__(self._parent, self._axis, self.data[i])
|
140
|
+
else:
|
141
|
+
# For single index, return the value directly
|
142
|
+
return self.data[i]
|
143
|
+
|
134
144
|
def from_content(
|
135
145
|
self,
|
136
146
|
markers: Union[str, List[str], "ElementCollection", None],
|
@@ -140,6 +150,7 @@ class GuidesList(UserList):
|
|
140
150
|
tolerance: float = 5,
|
141
151
|
*,
|
142
152
|
append: bool = False,
|
153
|
+
apply_exclusions: bool = True,
|
143
154
|
) -> "Guides":
|
144
155
|
"""
|
145
156
|
Create guides from content markers and add to this axis.
|
@@ -154,6 +165,7 @@ class GuidesList(UserList):
|
|
154
165
|
align: How to align guides relative to found elements
|
155
166
|
outer: Whether to add outer boundary guides
|
156
167
|
tolerance: Tolerance for snapping to element edges
|
168
|
+
apply_exclusions: Whether to apply exclusion zones when searching for text
|
157
169
|
|
158
170
|
Returns:
|
159
171
|
Parent Guides object for chaining
|
@@ -178,6 +190,7 @@ class GuidesList(UserList):
|
|
178
190
|
align=align,
|
179
191
|
outer=outer,
|
180
192
|
tolerance=tolerance,
|
193
|
+
apply_exclusions=apply_exclusions,
|
181
194
|
)
|
182
195
|
|
183
196
|
# Collect guides from this region
|
@@ -260,6 +273,7 @@ class GuidesList(UserList):
|
|
260
273
|
align=align,
|
261
274
|
outer=outer,
|
262
275
|
tolerance=tolerance,
|
276
|
+
apply_exclusions=apply_exclusions,
|
263
277
|
)
|
264
278
|
|
265
279
|
# Replace or append based on parameter
|
@@ -1398,6 +1412,7 @@ class Guides:
|
|
1398
1412
|
align: Literal["left", "right", "center", "between"] = "left",
|
1399
1413
|
outer: bool = True,
|
1400
1414
|
tolerance: float = 5,
|
1415
|
+
apply_exclusions: bool = True,
|
1401
1416
|
) -> "Guides":
|
1402
1417
|
"""
|
1403
1418
|
Create guides based on text content positions.
|
@@ -1413,6 +1428,7 @@ class Guides:
|
|
1413
1428
|
align: Where to place guides relative to found text
|
1414
1429
|
outer: Whether to add guides at the boundaries
|
1415
1430
|
tolerance: Maximum distance to search for text
|
1431
|
+
apply_exclusions: Whether to apply exclusion zones when searching for text
|
1416
1432
|
|
1417
1433
|
Returns:
|
1418
1434
|
New Guides object aligned to text content
|
@@ -1431,6 +1447,7 @@ class Guides:
|
|
1431
1447
|
align=align,
|
1432
1448
|
outer=outer,
|
1433
1449
|
tolerance=tolerance,
|
1450
|
+
apply_exclusions=apply_exclusions,
|
1434
1451
|
)
|
1435
1452
|
|
1436
1453
|
# Store in flow guides
|
@@ -1469,7 +1486,7 @@ class Guides:
|
|
1469
1486
|
# Find each marker and determine guide position
|
1470
1487
|
for marker in marker_texts:
|
1471
1488
|
if hasattr(obj, "find"):
|
1472
|
-
element = obj.find(f'text:contains("{marker}")')
|
1489
|
+
element = obj.find(f'text:contains("{marker}")', apply_exclusions=apply_exclusions)
|
1473
1490
|
if element:
|
1474
1491
|
if axis == "vertical":
|
1475
1492
|
if align == "left":
|
@@ -1498,7 +1515,9 @@ class Guides:
|
|
1498
1515
|
marker_bounds = []
|
1499
1516
|
for marker in marker_texts:
|
1500
1517
|
if hasattr(obj, "find"):
|
1501
|
-
element = obj.find(
|
1518
|
+
element = obj.find(
|
1519
|
+
f'text:contains("{marker}")', apply_exclusions=apply_exclusions
|
1520
|
+
)
|
1502
1521
|
if element:
|
1503
1522
|
if axis == "vertical":
|
1504
1523
|
marker_bounds.append((element.x0, element.x1))
|
@@ -3285,6 +3304,7 @@ class Guides:
|
|
3285
3304
|
align: Literal["left", "right", "center", "between"] = "left",
|
3286
3305
|
outer: bool = True,
|
3287
3306
|
tolerance: float = 5,
|
3307
|
+
apply_exclusions: bool = True,
|
3288
3308
|
) -> "Guides":
|
3289
3309
|
"""
|
3290
3310
|
Instance method: Add guides from content, allowing chaining.
|
@@ -3301,6 +3321,7 @@ class Guides:
|
|
3301
3321
|
align: How to align guides relative to found elements
|
3302
3322
|
outer: Whether to add outer boundary guides
|
3303
3323
|
tolerance: Tolerance for snapping to element edges
|
3324
|
+
apply_exclusions: Whether to apply exclusion zones when searching for text
|
3304
3325
|
|
3305
3326
|
Returns:
|
3306
3327
|
Self for method chaining
|
@@ -3318,6 +3339,7 @@ class Guides:
|
|
3318
3339
|
align=align,
|
3319
3340
|
outer=outer,
|
3320
3341
|
tolerance=tolerance,
|
3342
|
+
apply_exclusions=apply_exclusions,
|
3321
3343
|
)
|
3322
3344
|
|
3323
3345
|
# Add the appropriate coordinates to this object
|
@@ -3421,6 +3443,140 @@ class Guides:
|
|
3421
3443
|
|
3422
3444
|
return self
|
3423
3445
|
|
3446
|
+
def extract_table(
|
3447
|
+
self,
|
3448
|
+
target: Optional[Union["Page", "Region"]] = None,
|
3449
|
+
source: str = "guides_temp",
|
3450
|
+
cell_padding: float = 0.5,
|
3451
|
+
include_outer_boundaries: bool = False,
|
3452
|
+
method: Optional[str] = None,
|
3453
|
+
table_settings: Optional[dict] = None,
|
3454
|
+
use_ocr: bool = False,
|
3455
|
+
ocr_config: Optional[dict] = None,
|
3456
|
+
text_options: Optional[Dict] = None,
|
3457
|
+
cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
|
3458
|
+
show_progress: bool = False,
|
3459
|
+
content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None,
|
3460
|
+
*,
|
3461
|
+
multi_page: Literal["auto", True, False] = "auto",
|
3462
|
+
) -> "TableResult":
|
3463
|
+
"""
|
3464
|
+
Extract table data directly from guides without leaving temporary regions.
|
3465
|
+
|
3466
|
+
This method:
|
3467
|
+
1. Creates table structure using build_grid()
|
3468
|
+
2. Extracts table data from the created table region
|
3469
|
+
3. Cleans up all temporary regions
|
3470
|
+
4. Returns the TableResult
|
3471
|
+
|
3472
|
+
Args:
|
3473
|
+
target: Page or Region to create regions on (uses self.context if None)
|
3474
|
+
source: Source label for temporary regions (will be cleaned up)
|
3475
|
+
cell_padding: Internal padding for cell regions in points
|
3476
|
+
include_outer_boundaries: Whether to add boundaries at edges if missing
|
3477
|
+
method: Table extraction method ('tatr', 'pdfplumber', 'text', etc.)
|
3478
|
+
table_settings: Settings for pdfplumber table extraction
|
3479
|
+
use_ocr: Whether to use OCR for text extraction
|
3480
|
+
ocr_config: OCR configuration parameters
|
3481
|
+
text_options: Dictionary of options for the 'text' method
|
3482
|
+
cell_extraction_func: Optional callable for custom cell text extraction
|
3483
|
+
show_progress: Controls progress bar for text method
|
3484
|
+
content_filter: Content filtering function or patterns
|
3485
|
+
multi_page: Controls multi-region table creation for FlowRegions
|
3486
|
+
|
3487
|
+
Returns:
|
3488
|
+
TableResult: Extracted table data
|
3489
|
+
|
3490
|
+
Raises:
|
3491
|
+
ValueError: If no table region is created from the guides
|
3492
|
+
|
3493
|
+
Example:
|
3494
|
+
```python
|
3495
|
+
from natural_pdf.analyzers import Guides
|
3496
|
+
|
3497
|
+
# Create guides from detected lines
|
3498
|
+
guides = Guides.from_lines(page, source_label="detected")
|
3499
|
+
|
3500
|
+
# Extract table directly - no temporary regions left behind
|
3501
|
+
table_data = guides.extract_table()
|
3502
|
+
|
3503
|
+
# Convert to pandas DataFrame
|
3504
|
+
df = table_data.to_df()
|
3505
|
+
```
|
3506
|
+
"""
|
3507
|
+
target_obj = target or self.context
|
3508
|
+
if not target_obj:
|
3509
|
+
raise ValueError("No target object available. Provide target parameter or context.")
|
3510
|
+
|
3511
|
+
# Get the page for cleanup later
|
3512
|
+
if hasattr(target_obj, "x0") and hasattr(target_obj, "top"): # Region
|
3513
|
+
page = target_obj._page
|
3514
|
+
element_manager = page._element_mgr
|
3515
|
+
elif hasattr(target_obj, "_element_mgr"): # Page
|
3516
|
+
page = target_obj
|
3517
|
+
element_manager = page._element_mgr
|
3518
|
+
else:
|
3519
|
+
raise ValueError(f"Target object {target_obj} is not a Page or Region")
|
3520
|
+
|
3521
|
+
try:
|
3522
|
+
# Step 1: Build grid structure (creates temporary regions)
|
3523
|
+
grid_result = self.build_grid(
|
3524
|
+
target=target_obj,
|
3525
|
+
source=source,
|
3526
|
+
cell_padding=cell_padding,
|
3527
|
+
include_outer_boundaries=include_outer_boundaries,
|
3528
|
+
multi_page=multi_page,
|
3529
|
+
)
|
3530
|
+
|
3531
|
+
# Step 2: Get the table region and extract table data
|
3532
|
+
table_region = grid_result["regions"]["table"]
|
3533
|
+
if table_region is None:
|
3534
|
+
raise ValueError(
|
3535
|
+
"No table region was created from the guides. Check that you have both vertical and horizontal guides."
|
3536
|
+
)
|
3537
|
+
|
3538
|
+
# Handle multi-page case where table_region might be a list
|
3539
|
+
if isinstance(table_region, list):
|
3540
|
+
if not table_region:
|
3541
|
+
raise ValueError("No table regions were created from the guides.")
|
3542
|
+
# Use the first table region for extraction
|
3543
|
+
table_region = table_region[0]
|
3544
|
+
|
3545
|
+
# Step 3: Extract table data using the region's extract_table method
|
3546
|
+
table_result = table_region.extract_table(
|
3547
|
+
method=method,
|
3548
|
+
table_settings=table_settings,
|
3549
|
+
use_ocr=use_ocr,
|
3550
|
+
ocr_config=ocr_config,
|
3551
|
+
text_options=text_options,
|
3552
|
+
cell_extraction_func=cell_extraction_func,
|
3553
|
+
show_progress=show_progress,
|
3554
|
+
content_filter=content_filter,
|
3555
|
+
)
|
3556
|
+
|
3557
|
+
return table_result
|
3558
|
+
|
3559
|
+
finally:
|
3560
|
+
# Step 4: Clean up all temporary regions created by build_grid
|
3561
|
+
# This ensures no regions are left behind regardless of success/failure
|
3562
|
+
try:
|
3563
|
+
regions_to_remove = [
|
3564
|
+
r
|
3565
|
+
for r in element_manager.regions
|
3566
|
+
if getattr(r, "source", None) == source
|
3567
|
+
and getattr(r, "region_type", None)
|
3568
|
+
in {"table", "table_row", "table_column", "table_cell"}
|
3569
|
+
]
|
3570
|
+
|
3571
|
+
for region in regions_to_remove:
|
3572
|
+
element_manager.remove_element(region, element_type="regions")
|
3573
|
+
|
3574
|
+
if regions_to_remove:
|
3575
|
+
logger.debug(f"Cleaned up {len(regions_to_remove)} temporary regions")
|
3576
|
+
|
3577
|
+
except Exception as cleanup_err:
|
3578
|
+
logger.warning(f"Failed to clean up temporary regions: {cleanup_err}")
|
3579
|
+
|
3424
3580
|
def _get_flow_orientation(self) -> Literal["vertical", "horizontal", "unknown"]:
|
3425
3581
|
"""Determines if a FlowRegion's constituent parts are arranged vertically or horizontally."""
|
3426
3582
|
if not self.is_flow_region or len(self.context.constituent_regions) < 2:
|
@@ -29,9 +29,22 @@ class DirectionalCollectionMixin:
|
|
29
29
|
"""Find regions to the right of all elements in this collection."""
|
30
30
|
return self.apply(lambda element: element.right(**kwargs))
|
31
31
|
|
32
|
-
def expand(self, **kwargs) -> "ElementCollection":
|
33
|
-
"""Expand all elements in this collection.
|
34
|
-
|
32
|
+
def expand(self, *args, **kwargs) -> "ElementCollection":
|
33
|
+
"""Expand all elements in this collection.
|
34
|
+
|
35
|
+
Args:
|
36
|
+
*args: If a single positional argument is provided, expands all elements
|
37
|
+
by that amount in all directions.
|
38
|
+
**kwargs: Keyword arguments for directional expansion (left, right, top, bottom, etc.)
|
39
|
+
|
40
|
+
Examples:
|
41
|
+
# Expand all elements by 5 pixels in all directions
|
42
|
+
collection.expand(5)
|
43
|
+
|
44
|
+
# Expand with different amounts in each direction
|
45
|
+
collection.expand(left=10, right=5, top=3, bottom=7)
|
46
|
+
"""
|
47
|
+
return self.apply(lambda element: element.expand(*args, **kwargs))
|
35
48
|
|
36
49
|
|
37
50
|
class ApplyMixin:
|
@@ -335,6 +335,7 @@ class HighlightContext:
|
|
335
335
|
self.show_on_exit = show_on_exit
|
336
336
|
self.highlight_groups = []
|
337
337
|
self._color_manager = ColorManager()
|
338
|
+
self._exit_image = None # Store image for Jupyter display
|
338
339
|
|
339
340
|
def add(
|
340
341
|
self,
|
@@ -421,6 +422,11 @@ class HighlightContext:
|
|
421
422
|
)
|
422
423
|
return None
|
423
424
|
|
425
|
+
@property
|
426
|
+
def image(self) -> Optional[Image.Image]:
|
427
|
+
"""Get the last generated image (useful after context exit)."""
|
428
|
+
return self._exit_image
|
429
|
+
|
424
430
|
def __enter__(self) -> "HighlightContext":
|
425
431
|
"""Enter the context."""
|
426
432
|
return self
|
@@ -428,7 +434,25 @@ class HighlightContext:
|
|
428
434
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
429
435
|
"""Exit the context, optionally showing highlights."""
|
430
436
|
if self.show_on_exit and not exc_type:
|
431
|
-
self.show()
|
437
|
+
self._exit_image = self.show()
|
438
|
+
|
439
|
+
# Check if we're in a Jupyter/IPython environment
|
440
|
+
try:
|
441
|
+
# Try to get IPython instance
|
442
|
+
from IPython import get_ipython
|
443
|
+
|
444
|
+
ipython = get_ipython()
|
445
|
+
if ipython is not None:
|
446
|
+
# We're in IPython/Jupyter
|
447
|
+
from IPython.display import display
|
448
|
+
|
449
|
+
if self._exit_image is not None:
|
450
|
+
display(self._exit_image)
|
451
|
+
except (ImportError, NameError):
|
452
|
+
# Not in Jupyter or IPython not available - that's OK
|
453
|
+
pass
|
454
|
+
|
455
|
+
# __exit__ must return False to not suppress exceptions
|
432
456
|
return False
|
433
457
|
|
434
458
|
|
@@ -689,7 +713,7 @@ class HighlightingService:
|
|
689
713
|
logger.debug(f"Added highlight to page {page_index}: {highlight}")
|
690
714
|
|
691
715
|
# --- Invalidate page-level image cache --------------------------------
|
692
|
-
# The Page.
|
716
|
+
# The Page.render method maintains an internal cache keyed by rendering
|
693
717
|
# parameters. Because the cache key currently does **not** incorporate
|
694
718
|
# any information about the highlights themselves, it can return stale
|
695
719
|
# images after highlights are added or removed. To ensure the next
|
@@ -700,11 +724,11 @@ class HighlightingService:
|
|
700
724
|
if hasattr(page_obj, "_to_image_cache"):
|
701
725
|
page_obj._to_image_cache.clear()
|
702
726
|
logger.debug(
|
703
|
-
f"Cleared cached
|
727
|
+
f"Cleared cached render images for page {page_index} after adding a highlight."
|
704
728
|
)
|
705
729
|
except Exception as cache_err: # pragma: no cover – never fail highlight creation
|
706
730
|
logger.warning(
|
707
|
-
f"Failed to invalidate
|
731
|
+
f"Failed to invalidate render cache for page {page_index}: {cache_err}",
|
708
732
|
exc_info=True,
|
709
733
|
)
|
710
734
|
|
@@ -737,11 +761,11 @@ class HighlightingService:
|
|
737
761
|
if hasattr(page_obj, "_to_image_cache"):
|
738
762
|
page_obj._to_image_cache.clear()
|
739
763
|
logger.debug(
|
740
|
-
f"Cleared cached
|
764
|
+
f"Cleared cached render images for page {page_index} after removing highlights."
|
741
765
|
)
|
742
766
|
except Exception as cache_err: # pragma: no cover
|
743
767
|
logger.warning(
|
744
|
-
f"Failed to invalidate
|
768
|
+
f"Failed to invalidate render cache for page {page_index}: {cache_err}",
|
745
769
|
exc_info=True,
|
746
770
|
)
|
747
771
|
|
@@ -760,7 +784,7 @@ class HighlightingService:
|
|
760
784
|
labels: bool = True,
|
761
785
|
legend_position: str = "right",
|
762
786
|
render_ocr: bool = False,
|
763
|
-
**kwargs, # Pass other args to pdfplumber.page.to_image if needed
|
787
|
+
**kwargs, # Pass other args to pdfplumber.page.to_image if needed (internal API)
|
764
788
|
) -> Optional[Image.Image]:
|
765
789
|
"""
|
766
790
|
Renders a specific page with its highlights.
|
@@ -773,7 +797,7 @@ class HighlightingService:
|
|
773
797
|
labels: Whether to include a legend for highlights.
|
774
798
|
legend_position: Position of the legend.
|
775
799
|
render_ocr: Whether to render OCR text on the image.
|
776
|
-
kwargs: Additional keyword arguments for pdfplumber's page.to_image (e.g., width, height).
|
800
|
+
kwargs: Additional keyword arguments for pdfplumber's internal page.to_image (e.g., width, height).
|
777
801
|
|
778
802
|
Returns:
|
779
803
|
A PIL Image object of the rendered page, or None if rendering fails.
|
@@ -957,7 +981,7 @@ class HighlightingService:
|
|
957
981
|
crop_bbox: Optional bounding box (x0, top, x1, bottom) in PDF coordinate
|
958
982
|
space to crop the output image to, before legends or other overlays are
|
959
983
|
applied. If None, no cropping is performed.
|
960
|
-
**kwargs: Additional args for pdfplumber's to_image (e.g., width, height).
|
984
|
+
**kwargs: Additional args for pdfplumber's internal to_image (e.g., width, height).
|
961
985
|
|
962
986
|
Returns:
|
963
987
|
PIL Image of the preview, or None if rendering fails.
|
natural_pdf/core/page.py
CHANGED
@@ -341,6 +341,26 @@ class Page(
|
|
341
341
|
for elem in elements:
|
342
342
|
spec.add_highlight(element=elem, color=group_color, label=group_label)
|
343
343
|
|
344
|
+
# Handle exclusions visualization
|
345
|
+
exclusions_param = kwargs.get("exclusions")
|
346
|
+
if exclusions_param:
|
347
|
+
# Get exclusion regions
|
348
|
+
exclusion_regions = self._get_exclusion_regions(include_callable=True)
|
349
|
+
|
350
|
+
if exclusion_regions:
|
351
|
+
# Determine color for exclusions
|
352
|
+
exclusion_color = (
|
353
|
+
exclusions_param if isinstance(exclusions_param, str) else "red"
|
354
|
+
)
|
355
|
+
|
356
|
+
# Add exclusion regions as highlights
|
357
|
+
for region in exclusion_regions:
|
358
|
+
spec.add_highlight(
|
359
|
+
element=region,
|
360
|
+
color=exclusion_color,
|
361
|
+
label=f"Exclusion: {region.label or 'unnamed'}",
|
362
|
+
)
|
363
|
+
|
344
364
|
return [spec]
|
345
365
|
|
346
366
|
@property
|
@@ -391,7 +411,9 @@ class Page(
|
|
391
411
|
|
392
412
|
def add_exclusion(
|
393
413
|
self,
|
394
|
-
exclusion_func_or_region: Union[
|
414
|
+
exclusion_func_or_region: Union[
|
415
|
+
Callable[["Page"], "Region"], "Region", List[Any], Tuple[Any, ...], Any
|
416
|
+
],
|
395
417
|
label: Optional[str] = None,
|
396
418
|
method: str = "region",
|
397
419
|
) -> "Page":
|
@@ -401,7 +423,8 @@ class Page(
|
|
401
423
|
|
402
424
|
Args:
|
403
425
|
exclusion_func_or_region: Either a callable function returning a Region,
|
404
|
-
a Region object,
|
426
|
+
a Region object, a list/tuple of regions or elements,
|
427
|
+
or another object with a valid .bbox attribute.
|
405
428
|
label: Optional label for this exclusion (e.g., 'header', 'footer').
|
406
429
|
method: Exclusion method - 'region' (exclude all elements in bounding box) or
|
407
430
|
'element' (exclude only the specific elements). Default: 'region'.
|
@@ -551,10 +574,53 @@ class Page(
|
|
551
574
|
raise TypeError(
|
552
575
|
f"Failed to convert exclusion object {exclusion_func_or_region} with bbox {getattr(exclusion_func_or_region, 'bbox', 'N/A')} to Region: {e}"
|
553
576
|
) from e
|
577
|
+
elif isinstance(exclusion_func_or_region, (list, tuple)):
|
578
|
+
# Handle lists/tuples of regions or elements
|
579
|
+
if not exclusion_func_or_region:
|
580
|
+
logger.warning(f"Page {self.index}: Empty list provided for exclusion, ignoring.")
|
581
|
+
return self
|
582
|
+
|
583
|
+
if method == "element":
|
584
|
+
# Store each element directly
|
585
|
+
for item in exclusion_func_or_region:
|
586
|
+
if hasattr(item, "bbox") and len(getattr(item, "bbox", [])) == 4:
|
587
|
+
self._exclusions.append((item, label, method))
|
588
|
+
logger.debug(
|
589
|
+
f"Page {self.index}: Added element exclusion from list -> {item}"
|
590
|
+
)
|
591
|
+
else:
|
592
|
+
logger.warning(
|
593
|
+
f"Page {self.index}: Skipping item without valid bbox in list: {item}"
|
594
|
+
)
|
595
|
+
else: # method == "region"
|
596
|
+
# Convert each item to a Region and add
|
597
|
+
for item in exclusion_func_or_region:
|
598
|
+
try:
|
599
|
+
if isinstance(item, Region):
|
600
|
+
item.label = label
|
601
|
+
self._exclusions.append((item, label, method))
|
602
|
+
logger.debug(f"Page {self.index}: Added Region from list: {item}")
|
603
|
+
elif hasattr(item, "bbox") and len(getattr(item, "bbox", [])) == 4:
|
604
|
+
bbox_coords = tuple(float(v) for v in item.bbox)
|
605
|
+
region = Region(self, bbox_coords, label=label)
|
606
|
+
self._exclusions.append((region, label, method))
|
607
|
+
logger.debug(
|
608
|
+
f"Page {self.index}: Added exclusion region from list item {bbox_coords}"
|
609
|
+
)
|
610
|
+
else:
|
611
|
+
logger.warning(
|
612
|
+
f"Page {self.index}: Skipping item without valid bbox in list: {item}"
|
613
|
+
)
|
614
|
+
except Exception as e:
|
615
|
+
logger.error(
|
616
|
+
f"Page {self.index}: Failed to convert list item to Region: {e}"
|
617
|
+
)
|
618
|
+
continue
|
619
|
+
return self
|
554
620
|
else:
|
555
621
|
# Reject invalid types
|
556
622
|
raise TypeError(
|
557
|
-
f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, or have a valid .bbox attribute."
|
623
|
+
f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, list/tuple of regions/elements, or have a valid .bbox attribute."
|
558
624
|
)
|
559
625
|
|
560
626
|
# Append the stored data (tuple of object/callable, label, and method)
|
@@ -668,6 +734,46 @@ class Page(
|
|
668
734
|
regions.append(region_result)
|
669
735
|
if debug:
|
670
736
|
print(f" ✓ Added region from callable '{label}': {region_result}")
|
737
|
+
elif hasattr(region_result, "__iter__") and hasattr(region_result, "__len__"):
|
738
|
+
# Handle ElementCollection or other iterables
|
739
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
740
|
+
|
741
|
+
if isinstance(region_result, ElementCollection) or (
|
742
|
+
hasattr(region_result, "__iter__") and region_result
|
743
|
+
):
|
744
|
+
if debug:
|
745
|
+
print(
|
746
|
+
f" Converting {type(region_result)} with {len(region_result)} elements to regions..."
|
747
|
+
)
|
748
|
+
|
749
|
+
# Convert each element to a region
|
750
|
+
for elem in region_result:
|
751
|
+
try:
|
752
|
+
if hasattr(elem, "bbox") and len(elem.bbox) == 4:
|
753
|
+
bbox_coords = tuple(float(v) for v in elem.bbox)
|
754
|
+
region = Region(self, bbox_coords, label=label)
|
755
|
+
regions.append(region)
|
756
|
+
if debug:
|
757
|
+
print(
|
758
|
+
f" ✓ Added region from element: {bbox_coords}"
|
759
|
+
)
|
760
|
+
else:
|
761
|
+
if debug:
|
762
|
+
print(
|
763
|
+
f" ✗ Skipping element without valid bbox: {elem}"
|
764
|
+
)
|
765
|
+
except Exception as e:
|
766
|
+
if debug:
|
767
|
+
print(f" ✗ Failed to convert element to region: {e}")
|
768
|
+
continue
|
769
|
+
|
770
|
+
if debug and len(region_result) > 0:
|
771
|
+
print(
|
772
|
+
f" ✓ Converted {len(region_result)} elements from callable '{label}'"
|
773
|
+
)
|
774
|
+
else:
|
775
|
+
if debug:
|
776
|
+
print(f" ✗ Empty iterable returned from callable '{label}'")
|
671
777
|
elif region_result:
|
672
778
|
logger.warning(
|
673
779
|
f"Callable exclusion '{exclusion_label}' returned non-Region object: {type(region_result)}. Skipping."
|
@@ -1013,6 +1119,22 @@ class Page(
|
|
1013
1119
|
"Cannot sort elements in reading order: Missing required attributes (top, x0)."
|
1014
1120
|
)
|
1015
1121
|
|
1122
|
+
# Handle collection-level pseudo-classes (:first, :last) for OR selectors
|
1123
|
+
# Note: We only apply :first/:last if they appear in any of the sub-selectors
|
1124
|
+
has_first = False
|
1125
|
+
has_last = False
|
1126
|
+
for sub_selector in selector_obj.get("selectors", []):
|
1127
|
+
for pseudo in sub_selector.get("pseudo_classes", []):
|
1128
|
+
if pseudo.get("name") == "first":
|
1129
|
+
has_first = True
|
1130
|
+
elif pseudo.get("name") == "last":
|
1131
|
+
has_last = True
|
1132
|
+
|
1133
|
+
if has_first:
|
1134
|
+
matching_elements = matching_elements[:1] if matching_elements else []
|
1135
|
+
elif has_last:
|
1136
|
+
matching_elements = matching_elements[-1:] if matching_elements else []
|
1137
|
+
|
1016
1138
|
# Return result collection
|
1017
1139
|
return ElementCollection(matching_elements)
|
1018
1140
|
|
@@ -1134,6 +1256,15 @@ class Page(
|
|
1134
1256
|
"Cannot sort elements in reading order: Missing required attributes (top, x0)."
|
1135
1257
|
)
|
1136
1258
|
|
1259
|
+
# Handle collection-level pseudo-classes (:first, :last)
|
1260
|
+
for pseudo in selector_obj.get("pseudo_classes", []):
|
1261
|
+
name = pseudo.get("name")
|
1262
|
+
|
1263
|
+
if name == "first":
|
1264
|
+
matching_elements = matching_elements[:1] if matching_elements else []
|
1265
|
+
elif name == "last":
|
1266
|
+
matching_elements = matching_elements[-1:] if matching_elements else []
|
1267
|
+
|
1137
1268
|
# Create result collection - exclusions are handled by the calling methods (find, find_all)
|
1138
1269
|
result = ElementCollection(matching_elements)
|
1139
1270
|
|
@@ -1845,7 +1976,7 @@ class Page(
|
|
1845
1976
|
"""Get all line elements on this page."""
|
1846
1977
|
return self._element_mgr.lines
|
1847
1978
|
|
1848
|
-
def
|
1979
|
+
def add_highlight(
|
1849
1980
|
self,
|
1850
1981
|
bbox: Optional[Tuple[float, float, float, float]] = None,
|
1851
1982
|
color: Optional[Union[Tuple, str]] = None,
|
@@ -1856,7 +1987,7 @@ class Page(
|
|
1856
1987
|
existing: str = "append",
|
1857
1988
|
) -> "Page":
|
1858
1989
|
"""
|
1859
|
-
|
1990
|
+
Add a highlight to a bounding box or the entire page.
|
1860
1991
|
Delegates to the central HighlightingService.
|
1861
1992
|
|
1862
1993
|
Args:
|
@@ -1884,7 +2015,7 @@ class Page(
|
|
1884
2015
|
)
|
1885
2016
|
return self
|
1886
2017
|
|
1887
|
-
def
|
2018
|
+
def add_highlight_polygon(
|
1888
2019
|
self,
|
1889
2020
|
polygon: List[Tuple[float, float]],
|
1890
2021
|
color: Optional[Union[Tuple, str]] = None,
|
@@ -1944,7 +2075,7 @@ class Page(
|
|
1944
2075
|
render_ocr: Whether to render OCR text.
|
1945
2076
|
include_highlights: Whether to render highlights.
|
1946
2077
|
resolution: Resolution in DPI for base image rendering (default: 144 DPI, equivalent to previous scale=2.0).
|
1947
|
-
**kwargs: Additional args for pdfplumber's to_image.
|
2078
|
+
**kwargs: Additional args for pdfplumber's internal to_image.
|
1948
2079
|
|
1949
2080
|
Returns:
|
1950
2081
|
Self for method chaining.
|