natural-pdf 0.2.1.dev0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/guides.py +159 -3
- natural_pdf/core/highlighting_service.py +8 -8
- natural_pdf/core/page.py +135 -4
- natural_pdf/core/page_collection.py +37 -0
- natural_pdf/core/page_groupby.py +229 -0
- natural_pdf/core/render_spec.py +18 -4
- natural_pdf/elements/base.py +54 -6
- natural_pdf/elements/element_collection.py +1 -0
- natural_pdf/elements/region.py +2 -2
- natural_pdf/elements/text.py +5 -0
- natural_pdf/extraction/manager.py +8 -14
- natural_pdf/extraction/mixin.py +35 -21
- natural_pdf/selectors/parser.py +2 -2
- natural_pdf/tables/result.py +37 -0
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.2.dist-info}/METADATA +2 -2
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.2.dist-info}/RECORD +22 -21
- optimization/performance_analysis.py +1 -1
- tools/bad_pdf_eval/analyser.py +1 -1
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.2.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.2.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.2.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.2.dist-info}/top_level.txt +0 -0
natural_pdf/analyzers/guides.py
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
import json
|
4
4
|
import logging
|
5
5
|
from collections import UserList
|
6
|
-
from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
|
6
|
+
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Tuple, Union
|
7
7
|
|
8
8
|
import numpy as np
|
9
9
|
from PIL import Image, ImageDraw
|
@@ -16,6 +16,7 @@ if TYPE_CHECKING:
|
|
16
16
|
from natural_pdf.elements.element_collection import ElementCollection
|
17
17
|
from natural_pdf.elements.region import Region
|
18
18
|
from natural_pdf.flows.region import FlowRegion
|
19
|
+
from natural_pdf.tables.result import TableResult
|
19
20
|
|
20
21
|
logger = logging.getLogger(__name__)
|
21
22
|
|
@@ -131,6 +132,15 @@ class GuidesList(UserList):
|
|
131
132
|
self._parent = parent_guides
|
132
133
|
self._axis = axis
|
133
134
|
|
135
|
+
def __getitem__(self, i):
|
136
|
+
"""Override to handle slicing properly."""
|
137
|
+
if isinstance(i, slice):
|
138
|
+
# Return a new GuidesList with the sliced data
|
139
|
+
return self.__class__(self._parent, self._axis, self.data[i])
|
140
|
+
else:
|
141
|
+
# For single index, return the value directly
|
142
|
+
return self.data[i]
|
143
|
+
|
134
144
|
def from_content(
|
135
145
|
self,
|
136
146
|
markers: Union[str, List[str], "ElementCollection", None],
|
@@ -140,6 +150,7 @@ class GuidesList(UserList):
|
|
140
150
|
tolerance: float = 5,
|
141
151
|
*,
|
142
152
|
append: bool = False,
|
153
|
+
apply_exclusions: bool = True,
|
143
154
|
) -> "Guides":
|
144
155
|
"""
|
145
156
|
Create guides from content markers and add to this axis.
|
@@ -154,6 +165,7 @@ class GuidesList(UserList):
|
|
154
165
|
align: How to align guides relative to found elements
|
155
166
|
outer: Whether to add outer boundary guides
|
156
167
|
tolerance: Tolerance for snapping to element edges
|
168
|
+
apply_exclusions: Whether to apply exclusion zones when searching for text
|
157
169
|
|
158
170
|
Returns:
|
159
171
|
Parent Guides object for chaining
|
@@ -178,6 +190,7 @@ class GuidesList(UserList):
|
|
178
190
|
align=align,
|
179
191
|
outer=outer,
|
180
192
|
tolerance=tolerance,
|
193
|
+
apply_exclusions=apply_exclusions,
|
181
194
|
)
|
182
195
|
|
183
196
|
# Collect guides from this region
|
@@ -260,6 +273,7 @@ class GuidesList(UserList):
|
|
260
273
|
align=align,
|
261
274
|
outer=outer,
|
262
275
|
tolerance=tolerance,
|
276
|
+
apply_exclusions=apply_exclusions,
|
263
277
|
)
|
264
278
|
|
265
279
|
# Replace or append based on parameter
|
@@ -1398,6 +1412,7 @@ class Guides:
|
|
1398
1412
|
align: Literal["left", "right", "center", "between"] = "left",
|
1399
1413
|
outer: bool = True,
|
1400
1414
|
tolerance: float = 5,
|
1415
|
+
apply_exclusions: bool = True,
|
1401
1416
|
) -> "Guides":
|
1402
1417
|
"""
|
1403
1418
|
Create guides based on text content positions.
|
@@ -1413,6 +1428,7 @@ class Guides:
|
|
1413
1428
|
align: Where to place guides relative to found text
|
1414
1429
|
outer: Whether to add guides at the boundaries
|
1415
1430
|
tolerance: Maximum distance to search for text
|
1431
|
+
apply_exclusions: Whether to apply exclusion zones when searching for text
|
1416
1432
|
|
1417
1433
|
Returns:
|
1418
1434
|
New Guides object aligned to text content
|
@@ -1431,6 +1447,7 @@ class Guides:
|
|
1431
1447
|
align=align,
|
1432
1448
|
outer=outer,
|
1433
1449
|
tolerance=tolerance,
|
1450
|
+
apply_exclusions=apply_exclusions,
|
1434
1451
|
)
|
1435
1452
|
|
1436
1453
|
# Store in flow guides
|
@@ -1469,7 +1486,7 @@ class Guides:
|
|
1469
1486
|
# Find each marker and determine guide position
|
1470
1487
|
for marker in marker_texts:
|
1471
1488
|
if hasattr(obj, "find"):
|
1472
|
-
element = obj.find(f'text:contains("{marker}")')
|
1489
|
+
element = obj.find(f'text:contains("{marker}")', apply_exclusions=apply_exclusions)
|
1473
1490
|
if element:
|
1474
1491
|
if axis == "vertical":
|
1475
1492
|
if align == "left":
|
@@ -1498,7 +1515,9 @@ class Guides:
|
|
1498
1515
|
marker_bounds = []
|
1499
1516
|
for marker in marker_texts:
|
1500
1517
|
if hasattr(obj, "find"):
|
1501
|
-
element = obj.find(
|
1518
|
+
element = obj.find(
|
1519
|
+
f'text:contains("{marker}")', apply_exclusions=apply_exclusions
|
1520
|
+
)
|
1502
1521
|
if element:
|
1503
1522
|
if axis == "vertical":
|
1504
1523
|
marker_bounds.append((element.x0, element.x1))
|
@@ -3285,6 +3304,7 @@ class Guides:
|
|
3285
3304
|
align: Literal["left", "right", "center", "between"] = "left",
|
3286
3305
|
outer: bool = True,
|
3287
3306
|
tolerance: float = 5,
|
3307
|
+
apply_exclusions: bool = True,
|
3288
3308
|
) -> "Guides":
|
3289
3309
|
"""
|
3290
3310
|
Instance method: Add guides from content, allowing chaining.
|
@@ -3301,6 +3321,7 @@ class Guides:
|
|
3301
3321
|
align: How to align guides relative to found elements
|
3302
3322
|
outer: Whether to add outer boundary guides
|
3303
3323
|
tolerance: Tolerance for snapping to element edges
|
3324
|
+
apply_exclusions: Whether to apply exclusion zones when searching for text
|
3304
3325
|
|
3305
3326
|
Returns:
|
3306
3327
|
Self for method chaining
|
@@ -3318,6 +3339,7 @@ class Guides:
|
|
3318
3339
|
align=align,
|
3319
3340
|
outer=outer,
|
3320
3341
|
tolerance=tolerance,
|
3342
|
+
apply_exclusions=apply_exclusions,
|
3321
3343
|
)
|
3322
3344
|
|
3323
3345
|
# Add the appropriate coordinates to this object
|
@@ -3421,6 +3443,140 @@ class Guides:
|
|
3421
3443
|
|
3422
3444
|
return self
|
3423
3445
|
|
3446
|
+
def extract_table(
|
3447
|
+
self,
|
3448
|
+
target: Optional[Union["Page", "Region"]] = None,
|
3449
|
+
source: str = "guides_temp",
|
3450
|
+
cell_padding: float = 0.5,
|
3451
|
+
include_outer_boundaries: bool = False,
|
3452
|
+
method: Optional[str] = None,
|
3453
|
+
table_settings: Optional[dict] = None,
|
3454
|
+
use_ocr: bool = False,
|
3455
|
+
ocr_config: Optional[dict] = None,
|
3456
|
+
text_options: Optional[Dict] = None,
|
3457
|
+
cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
|
3458
|
+
show_progress: bool = False,
|
3459
|
+
content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None,
|
3460
|
+
*,
|
3461
|
+
multi_page: Literal["auto", True, False] = "auto",
|
3462
|
+
) -> "TableResult":
|
3463
|
+
"""
|
3464
|
+
Extract table data directly from guides without leaving temporary regions.
|
3465
|
+
|
3466
|
+
This method:
|
3467
|
+
1. Creates table structure using build_grid()
|
3468
|
+
2. Extracts table data from the created table region
|
3469
|
+
3. Cleans up all temporary regions
|
3470
|
+
4. Returns the TableResult
|
3471
|
+
|
3472
|
+
Args:
|
3473
|
+
target: Page or Region to create regions on (uses self.context if None)
|
3474
|
+
source: Source label for temporary regions (will be cleaned up)
|
3475
|
+
cell_padding: Internal padding for cell regions in points
|
3476
|
+
include_outer_boundaries: Whether to add boundaries at edges if missing
|
3477
|
+
method: Table extraction method ('tatr', 'pdfplumber', 'text', etc.)
|
3478
|
+
table_settings: Settings for pdfplumber table extraction
|
3479
|
+
use_ocr: Whether to use OCR for text extraction
|
3480
|
+
ocr_config: OCR configuration parameters
|
3481
|
+
text_options: Dictionary of options for the 'text' method
|
3482
|
+
cell_extraction_func: Optional callable for custom cell text extraction
|
3483
|
+
show_progress: Controls progress bar for text method
|
3484
|
+
content_filter: Content filtering function or patterns
|
3485
|
+
multi_page: Controls multi-region table creation for FlowRegions
|
3486
|
+
|
3487
|
+
Returns:
|
3488
|
+
TableResult: Extracted table data
|
3489
|
+
|
3490
|
+
Raises:
|
3491
|
+
ValueError: If no table region is created from the guides
|
3492
|
+
|
3493
|
+
Example:
|
3494
|
+
```python
|
3495
|
+
from natural_pdf.analyzers import Guides
|
3496
|
+
|
3497
|
+
# Create guides from detected lines
|
3498
|
+
guides = Guides.from_lines(page, source_label="detected")
|
3499
|
+
|
3500
|
+
# Extract table directly - no temporary regions left behind
|
3501
|
+
table_data = guides.extract_table()
|
3502
|
+
|
3503
|
+
# Convert to pandas DataFrame
|
3504
|
+
df = table_data.to_df()
|
3505
|
+
```
|
3506
|
+
"""
|
3507
|
+
target_obj = target or self.context
|
3508
|
+
if not target_obj:
|
3509
|
+
raise ValueError("No target object available. Provide target parameter or context.")
|
3510
|
+
|
3511
|
+
# Get the page for cleanup later
|
3512
|
+
if hasattr(target_obj, "x0") and hasattr(target_obj, "top"): # Region
|
3513
|
+
page = target_obj._page
|
3514
|
+
element_manager = page._element_mgr
|
3515
|
+
elif hasattr(target_obj, "_element_mgr"): # Page
|
3516
|
+
page = target_obj
|
3517
|
+
element_manager = page._element_mgr
|
3518
|
+
else:
|
3519
|
+
raise ValueError(f"Target object {target_obj} is not a Page or Region")
|
3520
|
+
|
3521
|
+
try:
|
3522
|
+
# Step 1: Build grid structure (creates temporary regions)
|
3523
|
+
grid_result = self.build_grid(
|
3524
|
+
target=target_obj,
|
3525
|
+
source=source,
|
3526
|
+
cell_padding=cell_padding,
|
3527
|
+
include_outer_boundaries=include_outer_boundaries,
|
3528
|
+
multi_page=multi_page,
|
3529
|
+
)
|
3530
|
+
|
3531
|
+
# Step 2: Get the table region and extract table data
|
3532
|
+
table_region = grid_result["regions"]["table"]
|
3533
|
+
if table_region is None:
|
3534
|
+
raise ValueError(
|
3535
|
+
"No table region was created from the guides. Check that you have both vertical and horizontal guides."
|
3536
|
+
)
|
3537
|
+
|
3538
|
+
# Handle multi-page case where table_region might be a list
|
3539
|
+
if isinstance(table_region, list):
|
3540
|
+
if not table_region:
|
3541
|
+
raise ValueError("No table regions were created from the guides.")
|
3542
|
+
# Use the first table region for extraction
|
3543
|
+
table_region = table_region[0]
|
3544
|
+
|
3545
|
+
# Step 3: Extract table data using the region's extract_table method
|
3546
|
+
table_result = table_region.extract_table(
|
3547
|
+
method=method,
|
3548
|
+
table_settings=table_settings,
|
3549
|
+
use_ocr=use_ocr,
|
3550
|
+
ocr_config=ocr_config,
|
3551
|
+
text_options=text_options,
|
3552
|
+
cell_extraction_func=cell_extraction_func,
|
3553
|
+
show_progress=show_progress,
|
3554
|
+
content_filter=content_filter,
|
3555
|
+
)
|
3556
|
+
|
3557
|
+
return table_result
|
3558
|
+
|
3559
|
+
finally:
|
3560
|
+
# Step 4: Clean up all temporary regions created by build_grid
|
3561
|
+
# This ensures no regions are left behind regardless of success/failure
|
3562
|
+
try:
|
3563
|
+
regions_to_remove = [
|
3564
|
+
r
|
3565
|
+
for r in element_manager.regions
|
3566
|
+
if getattr(r, "source", None) == source
|
3567
|
+
and getattr(r, "region_type", None)
|
3568
|
+
in {"table", "table_row", "table_column", "table_cell"}
|
3569
|
+
]
|
3570
|
+
|
3571
|
+
for region in regions_to_remove:
|
3572
|
+
element_manager.remove_element(region, element_type="regions")
|
3573
|
+
|
3574
|
+
if regions_to_remove:
|
3575
|
+
logger.debug(f"Cleaned up {len(regions_to_remove)} temporary regions")
|
3576
|
+
|
3577
|
+
except Exception as cleanup_err:
|
3578
|
+
logger.warning(f"Failed to clean up temporary regions: {cleanup_err}")
|
3579
|
+
|
3424
3580
|
def _get_flow_orientation(self) -> Literal["vertical", "horizontal", "unknown"]:
|
3425
3581
|
"""Determines if a FlowRegion's constituent parts are arranged vertically or horizontally."""
|
3426
3582
|
if not self.is_flow_region or len(self.context.constituent_regions) < 2:
|
natural_pdf/core/highlighting_service.py
CHANGED
@@ -689,7 +689,7 @@ class HighlightingService:
|
|
689
689
|
logger.debug(f"Added highlight to page {page_index}: {highlight}")
|
690
690
|
|
691
691
|
# --- Invalidate page-level image cache --------------------------------
|
692
|
-
# The Page.
|
692
|
+
# The Page.render method maintains an internal cache keyed by rendering
|
693
693
|
# parameters. Because the cache key currently does **not** incorporate
|
694
694
|
# any information about the highlights themselves, it can return stale
|
695
695
|
# images after highlights are added or removed. To ensure the next
|
@@ -700,11 +700,11 @@ class HighlightingService:
|
|
700
700
|
if hasattr(page_obj, "_to_image_cache"):
|
701
701
|
page_obj._to_image_cache.clear()
|
702
702
|
logger.debug(
|
703
|
-
f"Cleared cached
|
703
|
+
f"Cleared cached render images for page {page_index} after adding a highlight."
|
704
704
|
)
|
705
705
|
except Exception as cache_err: # pragma: no cover – never fail highlight creation
|
706
706
|
logger.warning(
|
707
|
-
f"Failed to invalidate
|
707
|
+
f"Failed to invalidate render cache for page {page_index}: {cache_err}",
|
708
708
|
exc_info=True,
|
709
709
|
)
|
710
710
|
|
@@ -737,11 +737,11 @@ class HighlightingService:
|
|
737
737
|
if hasattr(page_obj, "_to_image_cache"):
|
738
738
|
page_obj._to_image_cache.clear()
|
739
739
|
logger.debug(
|
740
|
-
f"Cleared cached
|
740
|
+
f"Cleared cached render images for page {page_index} after removing highlights."
|
741
741
|
)
|
742
742
|
except Exception as cache_err: # pragma: no cover
|
743
743
|
logger.warning(
|
744
|
-
f"Failed to invalidate
|
744
|
+
f"Failed to invalidate render cache for page {page_index}: {cache_err}",
|
745
745
|
exc_info=True,
|
746
746
|
)
|
747
747
|
|
@@ -760,7 +760,7 @@ class HighlightingService:
|
|
760
760
|
labels: bool = True,
|
761
761
|
legend_position: str = "right",
|
762
762
|
render_ocr: bool = False,
|
763
|
-
**kwargs, # Pass other args to pdfplumber.page.to_image if needed
|
763
|
+
**kwargs, # Pass other args to pdfplumber.page.to_image if needed (internal API)
|
764
764
|
) -> Optional[Image.Image]:
|
765
765
|
"""
|
766
766
|
Renders a specific page with its highlights.
|
@@ -773,7 +773,7 @@ class HighlightingService:
|
|
773
773
|
labels: Whether to include a legend for highlights.
|
774
774
|
legend_position: Position of the legend.
|
775
775
|
render_ocr: Whether to render OCR text on the image.
|
776
|
-
kwargs: Additional keyword arguments for pdfplumber's page.to_image (e.g., width, height).
|
776
|
+
kwargs: Additional keyword arguments for pdfplumber's internal page.to_image (e.g., width, height).
|
777
777
|
|
778
778
|
Returns:
|
779
779
|
A PIL Image object of the rendered page, or None if rendering fails.
|
@@ -957,7 +957,7 @@ class HighlightingService:
|
|
957
957
|
crop_bbox: Optional bounding box (x0, top, x1, bottom) in PDF coordinate
|
958
958
|
space to crop the output image to, before legends or other overlays are
|
959
959
|
applied. If None, no cropping is performed.
|
960
|
-
**kwargs: Additional args for pdfplumber's to_image (e.g., width, height).
|
960
|
+
**kwargs: Additional args for pdfplumber's internal to_image (e.g., width, height).
|
961
961
|
|
962
962
|
Returns:
|
963
963
|
PIL Image of the preview, or None if rendering fails.
|
natural_pdf/core/page.py
CHANGED
@@ -341,6 +341,26 @@ class Page(
|
|
341
341
|
for elem in elements:
|
342
342
|
spec.add_highlight(element=elem, color=group_color, label=group_label)
|
343
343
|
|
344
|
+
# Handle exclusions visualization
|
345
|
+
exclusions_param = kwargs.get("exclusions")
|
346
|
+
if exclusions_param:
|
347
|
+
# Get exclusion regions
|
348
|
+
exclusion_regions = self._get_exclusion_regions(include_callable=True)
|
349
|
+
|
350
|
+
if exclusion_regions:
|
351
|
+
# Determine color for exclusions
|
352
|
+
exclusion_color = (
|
353
|
+
exclusions_param if isinstance(exclusions_param, str) else "red"
|
354
|
+
)
|
355
|
+
|
356
|
+
# Add exclusion regions as highlights
|
357
|
+
for region in exclusion_regions:
|
358
|
+
spec.add_highlight(
|
359
|
+
element=region,
|
360
|
+
color=exclusion_color,
|
361
|
+
label=f"Exclusion: {region.label or 'unnamed'}",
|
362
|
+
)
|
363
|
+
|
344
364
|
return [spec]
|
345
365
|
|
346
366
|
@property
|
@@ -391,7 +411,9 @@ class Page(
|
|
391
411
|
|
392
412
|
def add_exclusion(
|
393
413
|
self,
|
394
|
-
exclusion_func_or_region: Union[
|
414
|
+
exclusion_func_or_region: Union[
|
415
|
+
Callable[["Page"], "Region"], "Region", List[Any], Tuple[Any, ...], Any
|
416
|
+
],
|
395
417
|
label: Optional[str] = None,
|
396
418
|
method: str = "region",
|
397
419
|
) -> "Page":
|
@@ -401,7 +423,8 @@ class Page(
|
|
401
423
|
|
402
424
|
Args:
|
403
425
|
exclusion_func_or_region: Either a callable function returning a Region,
|
404
|
-
a Region object,
|
426
|
+
a Region object, a list/tuple of regions or elements,
|
427
|
+
or another object with a valid .bbox attribute.
|
405
428
|
label: Optional label for this exclusion (e.g., 'header', 'footer').
|
406
429
|
method: Exclusion method - 'region' (exclude all elements in bounding box) or
|
407
430
|
'element' (exclude only the specific elements). Default: 'region'.
|
@@ -551,10 +574,53 @@ class Page(
|
|
551
574
|
raise TypeError(
|
552
575
|
f"Failed to convert exclusion object {exclusion_func_or_region} with bbox {getattr(exclusion_func_or_region, 'bbox', 'N/A')} to Region: {e}"
|
553
576
|
) from e
|
577
|
+
elif isinstance(exclusion_func_or_region, (list, tuple)):
|
578
|
+
# Handle lists/tuples of regions or elements
|
579
|
+
if not exclusion_func_or_region:
|
580
|
+
logger.warning(f"Page {self.index}: Empty list provided for exclusion, ignoring.")
|
581
|
+
return self
|
582
|
+
|
583
|
+
if method == "element":
|
584
|
+
# Store each element directly
|
585
|
+
for item in exclusion_func_or_region:
|
586
|
+
if hasattr(item, "bbox") and len(getattr(item, "bbox", [])) == 4:
|
587
|
+
self._exclusions.append((item, label, method))
|
588
|
+
logger.debug(
|
589
|
+
f"Page {self.index}: Added element exclusion from list -> {item}"
|
590
|
+
)
|
591
|
+
else:
|
592
|
+
logger.warning(
|
593
|
+
f"Page {self.index}: Skipping item without valid bbox in list: {item}"
|
594
|
+
)
|
595
|
+
else: # method == "region"
|
596
|
+
# Convert each item to a Region and add
|
597
|
+
for item in exclusion_func_or_region:
|
598
|
+
try:
|
599
|
+
if isinstance(item, Region):
|
600
|
+
item.label = label
|
601
|
+
self._exclusions.append((item, label, method))
|
602
|
+
logger.debug(f"Page {self.index}: Added Region from list: {item}")
|
603
|
+
elif hasattr(item, "bbox") and len(getattr(item, "bbox", [])) == 4:
|
604
|
+
bbox_coords = tuple(float(v) for v in item.bbox)
|
605
|
+
region = Region(self, bbox_coords, label=label)
|
606
|
+
self._exclusions.append((region, label, method))
|
607
|
+
logger.debug(
|
608
|
+
f"Page {self.index}: Added exclusion region from list item {bbox_coords}"
|
609
|
+
)
|
610
|
+
else:
|
611
|
+
logger.warning(
|
612
|
+
f"Page {self.index}: Skipping item without valid bbox in list: {item}"
|
613
|
+
)
|
614
|
+
except Exception as e:
|
615
|
+
logger.error(
|
616
|
+
f"Page {self.index}: Failed to convert list item to Region: {e}"
|
617
|
+
)
|
618
|
+
continue
|
619
|
+
return self
|
554
620
|
else:
|
555
621
|
# Reject invalid types
|
556
622
|
raise TypeError(
|
557
|
-
f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, or have a valid .bbox attribute."
|
623
|
+
f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, list/tuple of regions/elements, or have a valid .bbox attribute."
|
558
624
|
)
|
559
625
|
|
560
626
|
# Append the stored data (tuple of object/callable, label, and method)
|
@@ -668,6 +734,46 @@ class Page(
|
|
668
734
|
regions.append(region_result)
|
669
735
|
if debug:
|
670
736
|
print(f" ✓ Added region from callable '{label}': {region_result}")
|
737
|
+
elif hasattr(region_result, "__iter__") and hasattr(region_result, "__len__"):
|
738
|
+
# Handle ElementCollection or other iterables
|
739
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
740
|
+
|
741
|
+
if isinstance(region_result, ElementCollection) or (
|
742
|
+
hasattr(region_result, "__iter__") and region_result
|
743
|
+
):
|
744
|
+
if debug:
|
745
|
+
print(
|
746
|
+
f" Converting {type(region_result)} with {len(region_result)} elements to regions..."
|
747
|
+
)
|
748
|
+
|
749
|
+
# Convert each element to a region
|
750
|
+
for elem in region_result:
|
751
|
+
try:
|
752
|
+
if hasattr(elem, "bbox") and len(elem.bbox) == 4:
|
753
|
+
bbox_coords = tuple(float(v) for v in elem.bbox)
|
754
|
+
region = Region(self, bbox_coords, label=label)
|
755
|
+
regions.append(region)
|
756
|
+
if debug:
|
757
|
+
print(
|
758
|
+
f" ✓ Added region from element: {bbox_coords}"
|
759
|
+
)
|
760
|
+
else:
|
761
|
+
if debug:
|
762
|
+
print(
|
763
|
+
f" ✗ Skipping element without valid bbox: {elem}"
|
764
|
+
)
|
765
|
+
except Exception as e:
|
766
|
+
if debug:
|
767
|
+
print(f" ✗ Failed to convert element to region: {e}")
|
768
|
+
continue
|
769
|
+
|
770
|
+
if debug and len(region_result) > 0:
|
771
|
+
print(
|
772
|
+
f" ✓ Converted {len(region_result)} elements from callable '{label}'"
|
773
|
+
)
|
774
|
+
else:
|
775
|
+
if debug:
|
776
|
+
print(f" ✗ Empty iterable returned from callable '{label}'")
|
671
777
|
elif region_result:
|
672
778
|
logger.warning(
|
673
779
|
f"Callable exclusion '{exclusion_label}' returned non-Region object: {type(region_result)}. Skipping."
|
@@ -1013,6 +1119,22 @@ class Page(
|
|
1013
1119
|
"Cannot sort elements in reading order: Missing required attributes (top, x0)."
|
1014
1120
|
)
|
1015
1121
|
|
1122
|
+
# Handle collection-level pseudo-classes (:first, :last) for OR selectors
|
1123
|
+
# Note: We only apply :first/:last if they appear in any of the sub-selectors
|
1124
|
+
has_first = False
|
1125
|
+
has_last = False
|
1126
|
+
for sub_selector in selector_obj.get("selectors", []):
|
1127
|
+
for pseudo in sub_selector.get("pseudo_classes", []):
|
1128
|
+
if pseudo.get("name") == "first":
|
1129
|
+
has_first = True
|
1130
|
+
elif pseudo.get("name") == "last":
|
1131
|
+
has_last = True
|
1132
|
+
|
1133
|
+
if has_first:
|
1134
|
+
matching_elements = matching_elements[:1] if matching_elements else []
|
1135
|
+
elif has_last:
|
1136
|
+
matching_elements = matching_elements[-1:] if matching_elements else []
|
1137
|
+
|
1016
1138
|
# Return result collection
|
1017
1139
|
return ElementCollection(matching_elements)
|
1018
1140
|
|
@@ -1134,6 +1256,15 @@ class Page(
|
|
1134
1256
|
"Cannot sort elements in reading order: Missing required attributes (top, x0)."
|
1135
1257
|
)
|
1136
1258
|
|
1259
|
+
# Handle collection-level pseudo-classes (:first, :last)
|
1260
|
+
for pseudo in selector_obj.get("pseudo_classes", []):
|
1261
|
+
name = pseudo.get("name")
|
1262
|
+
|
1263
|
+
if name == "first":
|
1264
|
+
matching_elements = matching_elements[:1] if matching_elements else []
|
1265
|
+
elif name == "last":
|
1266
|
+
matching_elements = matching_elements[-1:] if matching_elements else []
|
1267
|
+
|
1137
1268
|
# Create result collection - exclusions are handled by the calling methods (find, find_all)
|
1138
1269
|
result = ElementCollection(matching_elements)
|
1139
1270
|
|
@@ -1944,7 +2075,7 @@ class Page(
|
|
1944
2075
|
render_ocr: Whether to render OCR text.
|
1945
2076
|
include_highlights: Whether to render highlights.
|
1946
2077
|
resolution: Resolution in DPI for base image rendering (default: 144 DPI, equivalent to previous scale=2.0).
|
1947
|
-
**kwargs: Additional args for pdfplumber's to_image.
|
2078
|
+
**kwargs: Additional args for pdfplumber's internal to_image.
|
1948
2079
|
|
1949
2080
|
Returns:
|
1950
2081
|
Self for method chaining.
|
natural_pdf/core/page_collection.py
CHANGED
@@ -1247,3 +1247,40 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
1247
1247
|
from natural_pdf.core.highlighting_service import HighlightContext
|
1248
1248
|
|
1249
1249
|
return HighlightContext(self, show_on_exit=show)
|
1250
|
+
|
1251
|
+
def groupby(self, by: Union[str, Callable], *, show_progress: bool = True) -> "PageGroupBy":
|
1252
|
+
"""
|
1253
|
+
Group pages by selector text or callable result.
|
1254
|
+
|
1255
|
+
Args:
|
1256
|
+
by: CSS selector string or callable function
|
1257
|
+
show_progress: Whether to show progress bar during computation (default: True)
|
1258
|
+
|
1259
|
+
Returns:
|
1260
|
+
PageGroupBy object supporting iteration and dict-like access
|
1261
|
+
|
1262
|
+
Examples:
|
1263
|
+
# Group by header text
|
1264
|
+
for title, pages in pdf.pages.groupby('text[size=16]'):
|
1265
|
+
print(f"Section: {title}")
|
1266
|
+
|
1267
|
+
# Group by callable
|
1268
|
+
for city, pages in pdf.pages.groupby(lambda p: p.find('text:contains("CITY")').extract_text()):
|
1269
|
+
process_city_pages(pages)
|
1270
|
+
|
1271
|
+
# Quick exploration with indexing
|
1272
|
+
grouped = pdf.pages.groupby('text[size=16]')
|
1273
|
+
grouped.info() # Show all groups
|
1274
|
+
first_section = grouped[0] # First group
|
1275
|
+
last_section = grouped[-1] # Last group
|
1276
|
+
|
1277
|
+
# Dict-like access by name
|
1278
|
+
madison_pages = grouped.get('CITY OF MADISON')
|
1279
|
+
madison_pages = grouped['CITY OF MADISON'] # Alternative
|
1280
|
+
|
1281
|
+
# Disable progress bar for small collections
|
1282
|
+
grouped = pdf.pages.groupby('text[size=16]', show_progress=False)
|
1283
|
+
"""
|
1284
|
+
from natural_pdf.core.page_groupby import PageGroupBy
|
1285
|
+
|
1286
|
+
return PageGroupBy(self, by, show_progress=show_progress)
|