natural-pdf 0.2.5__py3-none-any.whl → 0.2.6__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- natural_pdf/analyzers/guides.py +94 -42
- natural_pdf/core/page.py +110 -44
- natural_pdf/core/page_collection.py +223 -34
- natural_pdf/core/page_groupby.py +20 -2
- natural_pdf/core/pdf.py +3 -0
- natural_pdf/core/render_spec.py +20 -5
- natural_pdf/describe/base.py +1 -1
- natural_pdf/describe/elements.py +1 -1
- natural_pdf/elements/base.py +84 -8
- natural_pdf/elements/element_collection.py +730 -12
- natural_pdf/elements/region.py +181 -48
- natural_pdf/flows/flow.py +3 -0
- natural_pdf/selectors/parser.py +2 -2
- natural_pdf/utils/color_utils.py +100 -0
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.6.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.6.dist-info}/RECORD +20 -19
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.6.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.6.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.6.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.6.dist-info}/top_level.txt +0 -0
````diff
@@ -21,7 +21,7 @@ from typing import (
     overload,
 )

-from pdfplumber.utils.geometry import objects_to_bbox
+from pdfplumber.utils.geometry import get_bbox_overlap, objects_to_bbox

 # New Imports
 from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
````
````diff
@@ -45,6 +45,7 @@ from natural_pdf.ocr import OCROptions
 from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
 from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
 from natural_pdf.text_mixin import TextMixin
+from natural_pdf.utils.color_utils import format_color_value

 # Potentially lazy imports for optional dependencies needed in save_pdf
 try:
````
````diff
@@ -180,7 +181,7 @@ class ElementCollection(
         mode: Literal["show", "render"] = "show",
         color: Optional[Union[str, Tuple[int, int, int]]] = None,
         highlights: Optional[List[Dict[str, Any]]] = None,
-        crop: Union[bool, Literal["
+        crop: Union[bool, int, str, "Region", Literal["wide"]] = False,
         crop_bbox: Optional[Tuple[float, float, float, float]] = None,
         group_by: Optional[str] = None,
         bins: Optional[Union[int, List[float]]] = None,
````
````diff
@@ -193,7 +194,7 @@ class ElementCollection(
             mode: Rendering mode - 'show' includes highlights, 'render' is clean
             color: Default color for highlights in show mode (or colormap name when using group_by)
             highlights: Additional highlight groups to show
-            crop:
+            crop: Cropping mode (False, True, int for padding, 'wide', or Region)
             crop_bbox: Explicit crop bounds
             group_by: Attribute to group elements by for color mapping
             bins: Binning specification for quantitative data (int for equal-width bins, list for custom bins)
````
````diff
@@ -226,7 +227,7 @@ class ElementCollection(
         # Handle cropping
         if crop_bbox:
             spec.crop_bbox = crop_bbox
-        elif crop
+        elif crop:
             # Calculate bounds of elements on this page
             x_coords = []
             y_coords = []
````
````diff
@@ -237,7 +238,27 @@ class ElementCollection(
                 y_coords.extend([y0, y1])

             if x_coords and y_coords:
-
+                content_bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
+
+                if crop is True:
+                    # Tight crop to content bounds
+                    spec.crop_bbox = content_bbox
+                elif isinstance(crop, (int, float)):
+                    # Add padding around content
+                    padding = float(crop)
+                    x0, y0, x1, y1 = content_bbox
+                    spec.crop_bbox = (
+                        max(0, x0 - padding),
+                        max(0, y0 - padding),
+                        min(page.width, x1 + padding),
+                        min(page.height, y1 + padding),
+                    )
+                elif crop == "wide":
+                    # Full page width, cropped vertically to content
+                    spec.crop_bbox = (0, content_bbox[1], page.width, content_bbox[3])
+                elif hasattr(crop, "bbox"):
+                    # Crop to another region's bounds
+                    spec.crop_bbox = crop.bbox

         # Add highlights in show mode
         if mode == "show":
````
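The hunk above is where the expanded `crop` argument is interpreted: `True` crops tightly to the elements, a number adds that much padding (clamped to the page), `"wide"` keeps the full page width, and anything with a `.bbox` supplies explicit bounds. A usage sketch, assuming the option is surfaced through `ElementCollection.show()` in 0.2.6; the file name and selectors below are placeholders, not taken from this diff:

```python
from natural_pdf import PDF

pdf = PDF("example.pdf")  # placeholder document
headings = pdf.pages[0].find_all("text[size>=14]")

headings.show(crop=True)    # tight crop to the elements' combined bounds
headings.show(crop=20)      # numeric crop adds 20pt of padding, clamped to the page
headings.show(crop="wide")  # full page width, trimmed vertically to the content

table = pdf.pages[0].find("region[type=table]")  # any object with a .bbox
if table is not None:
    headings.show(crop=table)  # crop to that object's bounds
```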
````diff
@@ -413,10 +434,16 @@ class ElementCollection(
         element_type = types.pop()
         return f"<ElementCollection[{element_type}](count={len(self)})>"

-    def __add__(self, other: "ElementCollection") -> "ElementCollection":
-
+    def __add__(self, other: Union["ElementCollection", "Element"]) -> "ElementCollection":
+        from natural_pdf.elements.base import Element
+        from natural_pdf.elements.region import Region
+
+        if isinstance(other, ElementCollection):
+            return ElementCollection(self._elements + other._elements)
+        elif isinstance(other, (Element, Region)):
+            return ElementCollection(self._elements + [other])
+        else:
             return NotImplemented
-        return ElementCollection(self._elements + other._elements)

     def __setitem__(self, index, value):
         self._elements[index] = value
````
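`__add__` now accepts a single `Element` or `Region` on the right-hand side instead of requiring another `ElementCollection`. A hedged sketch of what that enables (placeholder document and selectors):

```python
from natural_pdf import PDF

pdf = PDF("report.pdf")  # placeholder document
page = pdf.pages[0]

bold_words = page.find_all("text:bold")  # ElementCollection
title = page.find("text[size>=18]")      # single Element, or None if absent

if title is not None:
    labelled = bold_words + title             # 0.2.6: append a single element directly
    both = labelled + page.find_all("rect")   # collection + collection still works
    print(len(labelled), len(both))
```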
````diff
@@ -727,6 +754,67 @@ class ElementCollection(

         return result

+    def merge(self) -> "Region":
+        """
+        Merge all elements into a single region encompassing their bounding box.
+
+        Unlike dissolve() which only connects touching elements, merge() creates
+        a single region that spans from the minimum to maximum coordinates of all
+        elements, regardless of whether they touch.
+
+        Returns:
+            A single Region object encompassing all elements
+
+        Raises:
+            ValueError: If the collection is empty or elements have no valid bounding boxes
+
+        Example:
+            ```python
+            # Find scattered form fields and merge into one region
+            fields = pdf.find_all('text:contains(Name|Date|Phone)')
+            merged_region = fields.merge()
+
+            # Extract all text from the merged area
+            text = merged_region.extract_text()
+            ```
+        """
+        if not self._elements:
+            raise ValueError("Cannot merge an empty ElementCollection")
+
+        # Collect all bounding boxes
+        bboxes = []
+        page = None
+
+        for elem in self._elements:
+            if hasattr(elem, "bbox") and elem.bbox:
+                bboxes.append(elem.bbox)
+                # Get the page from the first element that has one
+                if page is None and hasattr(elem, "page"):
+                    page = elem.page
+
+        if not bboxes:
+            raise ValueError("No elements with valid bounding boxes to merge")
+
+        if page is None:
+            raise ValueError("Cannot determine page for merged region")
+
+        # Find min/max coordinates
+        x_coords = []
+        y_coords = []
+
+        for bbox in bboxes:
+            x0, y0, x1, y1 = bbox
+            x_coords.extend([x0, x1])
+            y_coords.extend([y0, y1])
+
+        # Create encompassing bounding box
+        merged_bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
+
+        # Create and return the merged region
+        from natural_pdf.elements.region import Region
+
+        return Region(page, merged_bbox)
+
     def filter(self, func: Callable[["Element"], bool]) -> "ElementCollection":
         """
         Filter elements using a function.
````
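`merge()` reduces to a plain bounding-box union over every element, touching or not. The same min/max computation can be reproduced outside the library; the coordinates below are invented for illustration:

```python
# Three boxes that do not touch each other (x0, y0, x1, y1)
bboxes = [(10, 10, 60, 25), (300, 12, 380, 28), (10, 700, 90, 715)]

xs = [c for (x0, _, x1, _) in bboxes for c in (x0, x1)]
ys = [c for (_, y0, _, y1) in bboxes for c in (y0, y1)]
merged_bbox = (min(xs), min(ys), max(xs), max(ys))

print(merged_bbox)  # (10, 10, 380, 715) -- spans all boxes even though they never touch
```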
````diff
@@ -1514,23 +1602,27 @@ class ElementCollection(
         self, group_key: Any, label_format: Optional[str], sample_element: T, group_by_attr: str
     ) -> str:
         """Formats the label for a group based on the key and format string."""
+        # Format the group_key if it's a color attribute
+        formatted_key = format_color_value(group_key, attr_name=group_by_attr)
+
         if label_format:
             try:
                 element_attrs = sample_element.__dict__.copy()
-
+                # Use the formatted key in the attributes
+                element_attrs[group_by_attr] = formatted_key  # Ensure key is present
                 return label_format.format(**element_attrs)
             except KeyError as e:
                 logger.warning(
                     f"Invalid key '{e}' in label_format '{label_format}'. Using group key as label."
                 )
-                return
+                return formatted_key
             except Exception as format_e:
                 logger.warning(
                     f"Error formatting label '{label_format}': {format_e}. Using group key as label."
                 )
-                return
+                return formatted_key
         else:
-            return
+            return formatted_key

     def _get_element_highlight_params(
         self, element: T, annotate: Optional[List[str]]
````
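The group-label formatter now routes the group key through the new `format_color_value()` helper from `natural_pdf/utils/color_utils.py` (+100 lines, not shown in this diff). Its actual behavior is not visible here; the toy stand-in below only illustrates the kind of normalization such a helper might perform, and is an assumption rather than the library's implementation:

```python
def format_color_value(value, attr_name=None):
    """Toy stand-in: turn RGB tuples into hex strings so legend labels stay readable."""
    color_attrs = {"color", "fill", "stroke", "stroking_color", "non_stroking_color"}
    if attr_name in color_attrs and isinstance(value, (tuple, list)) and len(value) >= 3:
        channels = []
        for c in value[:3]:
            # pdfplumber colors are usually floats in 0..1; ints are assumed 0..255
            channels.append(int(round(c * 255)) if isinstance(c, float) and c <= 1.0 else int(c))
        r, g, b = channels
        return f"#{r:02x}{g:02x}{b:02x}"
    return str(value)

print(format_color_value((1.0, 0.0, 0.0), attr_name="color"))  # '#ff0000'
print(format_color_value(12.0, attr_name="size"))              # '12.0'
```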
````diff
@@ -2336,6 +2428,632 @@ class ElementCollection(
             lambda element: element.clip(obj=obj, left=left, top=top, right=right, bottom=bottom)
         )

+    def merge_connected(
+        self,
+        proximity_threshold: float = 5.0,
+        merge_across_pages: bool = False,
+        merge_non_regions: bool = False,
+        text_separator: str = " ",
+        preserve_order: bool = True,
+    ) -> "ElementCollection":
+        """
+        Merge connected/adjacent regions in the collection into larger regions.
+
+        This method identifies regions that are adjacent or overlapping (within a proximity
+        threshold) and merges them into single regions. This is particularly useful for
+        handling text that gets split due to font variations, accented characters, or
+        other PDF rendering quirks.
+
+        The method uses a graph-based approach (union-find) to identify connected components
+        of regions and merges each component into a single region.
+
+        Args:
+            proximity_threshold: Maximum distance in points between regions to consider
+                them connected. Default is 5.0 points. Use 0 for only overlapping regions.
+            merge_across_pages: If True, allow merging regions from different pages.
+                Default is False (only merge within same page).
+            merge_non_regions: If True, attempt to merge non-Region elements by converting
+                them to regions first. Default is False (skip non-Region elements).
+            text_separator: String to use when joining text from merged regions.
+                Default is a single space.
+            preserve_order: If True, order merged text by reading order (top-to-bottom,
+                left-to-right). Default is True.
+
+        Returns:
+            New ElementCollection containing the merged regions. Non-Region elements
+            (if merge_non_regions=False) and elements that couldn't be merged are
+            included unchanged.
+
+        Example:
+            ```python
+            # Find all text regions with potential splits
+            text_regions = page.find_all('region[type=text]')
+
+            # Merge adjacent regions (useful for accented characters)
+            merged = text_regions.merge_connected(proximity_threshold=2.0)
+
+            # Extract clean text from merged regions
+            for region in merged:
+                print(region.extract_text())
+            ```
+
+        Note:
+            - Regions are considered connected if their bounding boxes are within
+              proximity_threshold distance of each other
+            - The merged region's bbox encompasses all constituent regions
+            - Text content is combined in reading order
+            - Original metadata is preserved from the first region in each group
+        """
+        if not self._elements:
+            return ElementCollection([])
+
+        from natural_pdf.elements.region import Region
+
+        # Separate Region and non-Region elements
+        regions = []
+        region_indices = []
+        non_regions = []
+        non_region_indices = []
+
+        for i, elem in enumerate(self._elements):
+            if isinstance(elem, Region):
+                regions.append(elem)
+                region_indices.append(i)
+            else:
+                non_regions.append(elem)
+                non_region_indices.append(i)
+
+        if not regions:
+            # No regions to merge
+            return ElementCollection(self._elements)
+
+        # Group regions by page if not merging across pages
+        page_groups = {}
+        if not merge_across_pages:
+            for region in regions:
+                page = getattr(region, "page", None)
+                if page is not None:
+                    page_id = id(page)  # Use object id as unique identifier
+                    if page_id not in page_groups:
+                        page_groups[page_id] = []
+                    page_groups[page_id].append(region)
+                else:
+                    # Region without page - treat as separate group
+                    page_groups[id(region)] = [region]
+        else:
+            # All regions in one group
+            page_groups = {0: regions}
+
+        # Process each page group and collect merged regions
+        all_merged_regions = []
+
+        for page_id, page_regions in page_groups.items():
+            if len(page_regions) == 1:
+                # Only one region on this page, nothing to merge
+                all_merged_regions.extend(page_regions)
+                continue
+
+            # Build adjacency graph using union-find
+            parent = list(range(len(page_regions)))
+
+            def find(x):
+                if parent[x] != x:
+                    parent[x] = find(parent[x])
+                return parent[x]
+
+            def union(x, y):
+                px, py = find(x), find(y)
+                if px != py:
+                    parent[px] = py
+
+            # Check all pairs of regions for connectivity
+            for i in range(len(page_regions)):
+                for j in range(i + 1, len(page_regions)):
+                    if self._are_regions_connected(
+                        page_regions[i], page_regions[j], proximity_threshold
+                    ):
+                        union(i, j)
+
+            # Group regions by their connected component
+            components = {}
+            for i, region in enumerate(page_regions):
+                root = find(i)
+                if root not in components:
+                    components[root] = []
+                components[root].append(region)
+
+            # Merge each component
+            for component_regions in components.values():
+                if len(component_regions) == 1:
+                    # Single region, no merge needed
+                    all_merged_regions.append(component_regions[0])
+                else:
+                    # Merge multiple regions
+                    merged = self._merge_region_group(
+                        component_regions, text_separator, preserve_order
+                    )
+                    all_merged_regions.append(merged)
+
+        # Combine merged regions with non-regions (if any)
+        # Reconstruct in original order as much as possible
+        result_elements = []
+
+        if not non_regions:
+            # All elements were regions
+            result_elements = all_merged_regions
+        else:
+            # Need to interleave merged regions and non-regions
+            # This is a simplified approach - just append non-regions at the end
+            # A more sophisticated approach would maintain relative ordering
+            result_elements = all_merged_regions + non_regions
+
+        return ElementCollection(result_elements)
+
+    def _are_regions_connected(
+        self, region1: "Region", region2: "Region", threshold: float
+    ) -> bool:
+        """Check if two regions are connected (adjacent or overlapping)."""
+        bbox1 = region1.bbox
+        bbox2 = region2.bbox
+
+        # Check for overlap first
+        overlap = get_bbox_overlap(bbox1, bbox2)
+        if overlap is not None:
+            return True
+
+        # If no overlap and threshold is 0, regions are not connected
+        if threshold == 0:
+            return False
+
+        # Check proximity - calculate minimum distance between bboxes
+        # bbox format: (x0, top, x1, bottom)
+        x0_1, top_1, x1_1, bottom_1 = bbox1
+        x0_2, top_2, x1_2, bottom_2 = bbox2
+
+        # Calculate horizontal distance
+        if x1_1 < x0_2:
+            h_dist = x0_2 - x1_1
+        elif x1_2 < x0_1:
+            h_dist = x0_1 - x1_2
+        else:
+            h_dist = 0  # Horizontally overlapping
+
+        # Calculate vertical distance
+        if bottom_1 < top_2:
+            v_dist = top_2 - bottom_1
+        elif bottom_2 < top_1:
+            v_dist = top_1 - bottom_2
+        else:
+            v_dist = 0  # Vertically overlapping
+
+        # Use Chebyshev distance (max of horizontal and vertical)
+        # This creates a square proximity zone
+        distance = max(h_dist, v_dist)
+
+        return distance <= threshold
+
+    def _merge_region_group(
+        self, regions: List["Region"], text_separator: str, preserve_order: bool
+    ) -> "Region":
+        """Merge a group of connected regions into a single region."""
+        if not regions:
+            raise ValueError("Cannot merge empty region group")
+
+        if len(regions) == 1:
+            return regions[0]
+
+        # Calculate merged bbox
+        bboxes = [r.bbox for r in regions]
+        x0s = [b[0] for b in bboxes]
+        tops = [b[1] for b in bboxes]
+        x1s = [b[2] for b in bboxes]
+        bottoms = [b[3] for b in bboxes]
+
+        merged_bbox = (min(x0s), min(tops), max(x1s), max(bottoms))
+
+        # Use the page from the first region
+        page = regions[0].page
+
+        # Sort regions for text ordering if requested
+        if preserve_order:
+            # Sort by reading order: top-to-bottom, left-to-right
+            sorted_regions = sorted(regions, key=lambda r: (r.top, r.x0))
+        else:
+            sorted_regions = regions
+
+        # Merge text content
+        text_parts = []
+        for region in sorted_regions:
+            try:
+                text = region.extract_text()
+                if text:
+                    text_parts.append(text)
+            except:
+                # Region might not have text extraction capability
+                pass
+
+        merged_text = text_separator.join(text_parts) if text_parts else None
+
+        # Create merged region
+        from natural_pdf.elements.region import Region
+
+        merged_region = Region(
+            page=page, bbox=merged_bbox, label=f"Merged ({len(regions)} regions)"
+        )
+
+        # Copy metadata from first region and add merge info
+        if hasattr(regions[0], "metadata") and regions[0].metadata:
+            merged_region.metadata = regions[0].metadata.copy()
+
+        merged_region.metadata["merge_info"] = {
+            "source_count": len(regions),
+            "merged_text": merged_text,
+            "source_bboxes": bboxes,
+        }
+
+        # If regions have region_type, preserve it if consistent
+        region_types = set()
+        for r in regions:
+            if hasattr(r, "region_type") and r.region_type:
+                region_types.add(r.region_type)
+
+        if len(region_types) == 1:
+            merged_region.region_type = region_types.pop()
+
+        return merged_region
+
+    def dissolve(
+        self,
+        padding: float = 2.0,
+        geometry: Literal["rect", "polygon"] = "rect",
+        group_by: List[str] = None,
+    ) -> "ElementCollection":
+        """
+        Merge connected elements based on proximity and grouping attributes.
+
+        This method groups elements by specified attributes (if any), then finds
+        connected components within each group based on a proximity threshold.
+        Connected elements are merged by creating new Region objects with merged
+        bounding boxes.
+
+        Args:
+            padding: Maximum distance in points between elements to consider
+                them connected. Default is 2.0 points.
+            geometry: Type of geometry to use for merged regions. Currently only
+                "rect" (bounding box) is supported. "polygon" will raise
+                NotImplementedError.
+            group_by: List of attribute names to group elements by before merging.
+                Elements are grouped by exact attribute values (floats are rounded
+                to 2 decimal places). If None, all elements are considered in the
+                same group. Common attributes include 'size' (for TextElements),
+                'font_family', 'fontname', etc.
+
+        Returns:
+            New ElementCollection containing the dissolved regions. All elements
+            with bbox attributes are processed and converted to Region objects.
+
+        Example:
+            ```python
+            # Dissolve elements that are close together
+            dissolved = elements.dissolve(padding=5.0)
+
+            # Group by font size before dissolving
+            dissolved = elements.dissolve(padding=2.0, group_by=['size'])
+
+            # Group by multiple attributes
+            dissolved = elements.dissolve(
+                padding=3.0,
+                group_by=['size', 'font_family']
+            )
+            ```
+
+        Note:
+            - All elements with bbox attributes are processed
+            - Float attribute values are rounded to 2 decimal places for grouping
+            - The method uses Chebyshev distance (max of dx, dy) for proximity
+            - Merged regions inherit the page from the first element in each group
+            - Output is always Region objects, regardless of input element types
+        """
+        if geometry == "polygon":
+            raise NotImplementedError("Polygon geometry is not yet supported for dissolve()")
+
+        if geometry not in ["rect", "polygon"]:
+            raise ValueError(f"Invalid geometry type: {geometry}. Must be 'rect' or 'polygon'")
+
+        from natural_pdf.elements.region import Region
+
+        # Filter to elements with bbox (all elements that can be dissolved)
+        elements_with_bbox = [
+            elem for elem in self._elements if hasattr(elem, "bbox") and elem.bbox
+        ]
+
+        if not elements_with_bbox:
+            logger.debug("No elements with bbox found in collection for dissolve()")
+            return ElementCollection([])
+
+        # Group elements by specified attributes
+        if group_by:
+            grouped_elements = self._group_elements_by_attributes(elements_with_bbox, group_by)
+        else:
+            # All elements in one group
+            grouped_elements = {None: elements_with_bbox}
+
+        # Process each group and collect dissolved regions
+        all_dissolved_regions = []
+
+        for group_key, group_elements in grouped_elements.items():
+            if not group_elements:
+                continue
+
+            logger.debug(f"Processing group {group_key} with {len(group_elements)} elements")
+
+            # Find connected components within this group
+            components = self._find_connected_components_elements(group_elements, padding)
+
+            # Merge each component
+            for component_elements in components:
+                if len(component_elements) == 1:
+                    # Single element, convert to Region
+                    elem = component_elements[0]
+                    region = Region(
+                        page=elem.page, bbox=elem.bbox, label=f"Dissolved (1 {elem.type})"
+                    )
+                    # Copy relevant attributes from source element
+                    self._copy_element_attributes_to_region(elem, region, group_by)
+                    all_dissolved_regions.append(region)
+                else:
+                    # Merge multiple elements
+                    merged = self._merge_elements_for_dissolve(component_elements, group_by)
+                    all_dissolved_regions.append(merged)
+
+        logger.debug(
+            f"Dissolved {len(elements_with_bbox)} elements into {len(all_dissolved_regions)} regions"
+        )
+
+        return ElementCollection(all_dissolved_regions)
+
+    def _group_elements_by_attributes(
+        self, elements: List["Element"], group_by: List[str]
+    ) -> Dict[Tuple, List["Element"]]:
+        """Group elements by specified attributes."""
+        groups = {}
+
+        for element in elements:
+            # Build group key from attribute values
+            key_values = []
+            for attr in group_by:
+                value = None
+
+                # Try to get attribute value from various sources
+                if hasattr(element, attr):
+                    value = getattr(element, attr)
+                elif hasattr(element, "_obj") and element._obj and attr in element._obj:
+                    value = element._obj[attr]
+                elif hasattr(element, "metadata") and element.metadata and attr in element.metadata:
+                    value = element.metadata[attr]
+
+                # Round float values to 2 decimal places for grouping
+                if isinstance(value, float):
+                    value = round(value, 2)
+
+                key_values.append(value)
+
+            key = tuple(key_values)
+
+            if key not in groups:
+                groups[key] = []
+            groups[key].append(element)
+
+        return groups
+
+    def _find_connected_components_elements(
+        self, elements: List["Element"], padding: float
+    ) -> List[List["Element"]]:
+        """Find connected components among elements using union-find."""
+        if not elements:
+            return []
+
+        if len(elements) == 1:
+            return [elements]
+
+        # Build adjacency using union-find
+        parent = list(range(len(elements)))
+
+        def find(x):
+            if parent[x] != x:
+                parent[x] = find(parent[x])
+            return parent[x]
+
+        def union(x, y):
+            px, py = find(x), find(y)
+            if px != py:
+                parent[px] = py
+
+        # Check all pairs of elements for connectivity
+        for i in range(len(elements)):
+            for j in range(i + 1, len(elements)):
+                if self._are_elements_connected(elements[i], elements[j], padding):
+                    union(i, j)
+
+        # Group elements by their connected component
+        components = {}
+        for i, element in enumerate(elements):
+            root = find(i)
+            if root not in components:
+                components[root] = []
+            components[root].append(element)
+
+        return list(components.values())
+
+    def _merge_elements_for_dissolve(
+        self, elements: List["Element"], group_by: List[str] = None
+    ) -> "Region":
+        """Merge a group of elements for dissolve operation."""
+        if not elements:
+            raise ValueError("Cannot merge empty element group")
+
+        if len(elements) == 1:
+            elem = elements[0]
+            from natural_pdf.elements.region import Region
+
+            region = Region(page=elem.page, bbox=elem.bbox, label=f"Dissolved (1 {elem.type})")
+            self._copy_element_attributes_to_region(elem, region, group_by)
+            return region
+
+        # Calculate merged bbox
+        bboxes = [e.bbox for e in elements]
+        x0s = [b[0] for b in bboxes]
+        tops = [b[1] for b in bboxes]
+        x1s = [b[2] for b in bboxes]
+        bottoms = [b[3] for b in bboxes]
+
+        merged_bbox = (min(x0s), min(tops), max(x1s), max(bottoms))
+
+        # Use the page from the first element
+        page = elements[0].page
+
+        # Count element types for label
+        type_counts = {}
+        for elem in elements:
+            elem_type = elem.type
+            type_counts[elem_type] = type_counts.get(elem_type, 0) + 1
+
+        # Create label showing element types
+        label_parts = []
+        for elem_type, count in sorted(type_counts.items()):
+            # Pluralize element type if count > 1
+            type_label = elem_type + ("s" if count > 1 else "")
+            label_parts.append(f"{count} {type_label}")
+        label = f"Dissolved ({', '.join(label_parts)})"
+
+        # Create merged region
+        from natural_pdf.elements.region import Region
+
+        merged_region = Region(page=page, bbox=merged_bbox, label=label)
+
+        # Copy attributes from first element if they're consistent
+        self._copy_element_attributes_to_region(elements[0], merged_region, group_by)
+
+        # Check if all elements have the same region_type
+        region_types = set()
+        for elem in elements:
+            if hasattr(elem, "region_type") and elem.region_type:
+                region_types.add(elem.region_type)
+
+        # Handle region_type based on consistency
+        if len(region_types) == 1:
+            # All elements have the same region_type, preserve it
+            merged_region.region_type = region_types.pop()
+        elif len(region_types) > 1:
+            # Multiple different region types, clear it
+            merged_region.region_type = None
+
+        # Add dissolve metadata
+        merged_region.metadata["dissolve_info"] = {
+            "source_count": len(elements),
+            "source_bboxes": bboxes,
+            "source_types": type_counts,
+        }
+
+        return merged_region
+
+    def _are_elements_connected(self, elem1: "Element", elem2: "Element", threshold: float) -> bool:
+        """Check if two elements are connected (adjacent or overlapping)."""
+        # Check if elements are on the same page
+        # Handle edge cases where elements might not have a page attribute
+        page1 = getattr(elem1, "page", None)
+        page2 = getattr(elem2, "page", None)
+
+        # If either element doesn't have a page, we can't compare pages
+        # In this case, only consider them connected if both lack pages
+        if page1 is None or page2 is None:
+            if page1 is not page2:  # One has page, one doesn't
+                return False
+            # Both None - continue with proximity check
+        elif page1 != page2:  # Both have pages but different
+            return False
+
+        bbox1 = elem1.bbox
+        bbox2 = elem2.bbox
+
+        # Check for overlap first
+        overlap = get_bbox_overlap(bbox1, bbox2)
+        if overlap is not None:
+            return True
+
+        # If no overlap and threshold is 0, elements are not connected
+        if threshold == 0:
+            return False
+
+        # Check proximity - calculate minimum distance between bboxes
+        # bbox format: (x0, top, x1, bottom)
+        x0_1, top_1, x1_1, bottom_1 = bbox1
+        x0_2, top_2, x1_2, bottom_2 = bbox2
+
+        # Calculate horizontal distance
+        if x1_1 < x0_2:
+            h_dist = x0_2 - x1_1
+        elif x1_2 < x0_1:
+            h_dist = x0_1 - x1_2
+        else:
+            h_dist = 0  # Horizontally overlapping
+
+        # Calculate vertical distance
+        if bottom_1 < top_2:
+            v_dist = top_2 - bottom_1
+        elif bottom_2 < top_1:
+            v_dist = top_1 - bottom_2
+        else:
+            v_dist = 0  # Vertically overlapping
+
+        # Use Chebyshev distance (max of horizontal and vertical)
+        # This creates a square proximity zone
+        distance = max(h_dist, v_dist)
+
+        return distance <= threshold
+
+    def _copy_element_attributes_to_region(
+        self, element: "Element", region: "Region", group_by: List[str] = None
+    ) -> None:
+        """Copy relevant attributes from source element to region."""
+        # Common text attributes to check
+        text_attrs = [
+            "size",
+            "font_family",
+            "fontname",
+            "font_size",
+            "font_name",
+            "bold",
+            "italic",
+            "color",
+            "text_color",
+            "region_type",
+        ]
+
+        # If group_by is specified, prioritize those attributes
+        attrs_to_check = (group_by or []) + text_attrs
+
+        for attr in attrs_to_check:
+            value = None
+
+            # Try different ways to get the attribute
+            if hasattr(element, attr):
+                value = getattr(element, attr)
+            elif hasattr(element, "_obj") and element._obj and attr in element._obj:
+                value = element._obj[attr]
+            elif hasattr(element, "metadata") and element.metadata and attr in element.metadata:
+                value = element.metadata[attr]
+
+            # Set the attribute on the region if we found a value
+            if value is not None:
+                # Map common attribute names
+                if attr == "size" and not hasattr(region, "font_size"):
+                    setattr(region, "font_size", value)
+                elif attr == "fontname" and not hasattr(region, "font_name"):
+                    setattr(region, "font_name", value)
+                else:
+                    setattr(region, attr, value)
+
     # ------------------------------------------------------------------
     # NEW METHOD: apply_ocr for collections (supports custom function)
     # ------------------------------------------------------------------
````
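Both `merge_connected()` and `dissolve()` decide connectivity the same way: two bounding boxes are connected if they overlap or if the Chebyshev gap between them is within the threshold (`proximity_threshold` / `padding`), and union-find then groups the connected components. The self-contained sketch below reproduces that distance test on plain tuples (sample boxes invented for illustration):

```python
def gap(a, b):
    """Chebyshev gap between two (x0, top, x1, bottom) boxes; 0 when they overlap."""
    ax0, atop, ax1, abottom = a
    bx0, btop, bx1, bbottom = b
    h = max(bx0 - ax1, ax0 - bx1, 0)            # horizontal gap (0 if overlapping on x)
    v = max(btop - abottom, atop - bbottom, 0)  # vertical gap (0 if overlapping on y)
    return max(h, v)                            # max of the two -> square proximity zone

boxes = [(10, 10, 50, 20), (52, 11, 90, 21), (400, 10, 440, 20)]
threshold = 5.0
print([gap(boxes[0], b) <= threshold for b in boxes[1:]])  # [True, False]
```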