natural-pdf 0.2.5__py3-none-any.whl → 0.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21,7 +21,7 @@ from typing import (
21
21
  overload,
22
22
  )
23
23
 
24
- from pdfplumber.utils.geometry import objects_to_bbox
24
+ from pdfplumber.utils.geometry import get_bbox_overlap, objects_to_bbox
25
25
 
26
26
  # New Imports
27
27
  from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
@@ -45,6 +45,7 @@ from natural_pdf.ocr import OCROptions
45
45
  from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
46
46
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
47
47
  from natural_pdf.text_mixin import TextMixin
48
+ from natural_pdf.utils.color_utils import format_color_value
48
49
 
49
50
  # Potentially lazy imports for optional dependencies needed in save_pdf
50
51
  try:
@@ -180,7 +181,7 @@ class ElementCollection(
180
181
  mode: Literal["show", "render"] = "show",
181
182
  color: Optional[Union[str, Tuple[int, int, int]]] = None,
182
183
  highlights: Optional[List[Dict[str, Any]]] = None,
183
- crop: Union[bool, Literal["content"]] = False,
184
+ crop: Union[bool, int, str, "Region", Literal["wide"]] = False,
184
185
  crop_bbox: Optional[Tuple[float, float, float, float]] = None,
185
186
  group_by: Optional[str] = None,
186
187
  bins: Optional[Union[int, List[float]]] = None,
@@ -193,7 +194,7 @@ class ElementCollection(
193
194
  mode: Rendering mode - 'show' includes highlights, 'render' is clean
194
195
  color: Default color for highlights in show mode (or colormap name when using group_by)
195
196
  highlights: Additional highlight groups to show
196
- crop: Whether to crop to element bounds
197
+ crop: Cropping mode (False, True, int for padding, 'wide', or Region)
197
198
  crop_bbox: Explicit crop bounds
198
199
  group_by: Attribute to group elements by for color mapping
199
200
  bins: Binning specification for quantitative data (int for equal-width bins, list for custom bins)
@@ -226,7 +227,7 @@ class ElementCollection(
226
227
  # Handle cropping
227
228
  if crop_bbox:
228
229
  spec.crop_bbox = crop_bbox
229
- elif crop == "content" or crop is True:
230
+ elif crop:
230
231
  # Calculate bounds of elements on this page
231
232
  x_coords = []
232
233
  y_coords = []
@@ -237,7 +238,27 @@ class ElementCollection(
237
238
  y_coords.extend([y0, y1])
238
239
 
239
240
  if x_coords and y_coords:
240
- spec.crop_bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
241
+ content_bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
242
+
243
+ if crop is True:
244
+ # Tight crop to content bounds
245
+ spec.crop_bbox = content_bbox
246
+ elif isinstance(crop, (int, float)):
247
+ # Add padding around content
248
+ padding = float(crop)
249
+ x0, y0, x1, y1 = content_bbox
250
+ spec.crop_bbox = (
251
+ max(0, x0 - padding),
252
+ max(0, y0 - padding),
253
+ min(page.width, x1 + padding),
254
+ min(page.height, y1 + padding),
255
+ )
256
+ elif crop == "wide":
257
+ # Full page width, cropped vertically to content
258
+ spec.crop_bbox = (0, content_bbox[1], page.width, content_bbox[3])
259
+ elif hasattr(crop, "bbox"):
260
+ # Crop to another region's bounds
261
+ spec.crop_bbox = crop.bbox
241
262
 
242
263
  # Add highlights in show mode
243
264
  if mode == "show":
@@ -413,10 +434,16 @@ class ElementCollection(
413
434
  element_type = types.pop()
414
435
  return f"<ElementCollection[{element_type}](count={len(self)})>"
415
436
 
416
- def __add__(self, other: "ElementCollection") -> "ElementCollection":
417
- if not isinstance(other, ElementCollection):
437
def __add__(self, other: Union["ElementCollection", "Element"]) -> "ElementCollection":
    """
    Concatenate this collection with another collection or a single element.

    Args:
        other: An ElementCollection whose elements are appended, or a single
            Element/Region appended as one item.

    Returns:
        A new ElementCollection; ``NotImplemented`` for any other operand so
        Python can try the reflected operation.
    """
    # Imported lazily, as elsewhere in this class.
    from natural_pdf.elements.base import Element
    from natural_pdf.elements.region import Region

    if isinstance(other, ElementCollection):
        combined = self._elements + other._elements
        return ElementCollection(combined)

    if isinstance(other, (Element, Region)):
        extended = self._elements + [other]
        return ElementCollection(extended)

    return NotImplemented
419
- return ElementCollection(self._elements + other._elements)
420
447
 
421
448
  def __setitem__(self, index, value):
422
449
  self._elements[index] = value
@@ -727,6 +754,67 @@ class ElementCollection(
727
754
 
728
755
  return result
729
756
 
757
def merge(self) -> "Region":
    """
    Build one Region spanning the bounding boxes of every element.

    Unlike dissolve(), which only joins elements that are close to each
    other, merge() always produces a single region running from the minimum
    to the maximum coordinates of all elements, whether or not they touch.

    Elements without a truthy ``bbox`` are ignored. The page of the result is
    the first non-None ``page`` found among elements that have a bbox.

    Returns:
        A single Region object encompassing all elements.

    Raises:
        ValueError: If the collection is empty, if no element has a usable
            bounding box, or if no page can be determined.

    Example:
        ```python
        fields = pdf.find_all('text:contains(Name|Date|Phone)')
        merged_region = fields.merge()
        text = merged_region.extract_text()
        ```
    """
    if not self._elements:
        raise ValueError("Cannot merge an empty ElementCollection")

    boxes = []
    owner_page = None
    for item in self._elements:
        if not (hasattr(item, "bbox") and item.bbox):
            continue
        boxes.append(item.bbox)
        # Keep looking until an element actually yields a page.
        if owner_page is None and hasattr(item, "page"):
            owner_page = item.page

    if not boxes:
        raise ValueError("No elements with valid bounding boxes to merge")

    if owner_page is None:
        raise ValueError("Cannot determine page for merged region")

    # Flatten the corner coordinates so the extremes can be taken per axis.
    horizontal = []
    vertical = []
    for left, upper, right, lower in boxes:
        horizontal += [left, right]
        vertical += [upper, lower]

    envelope = (min(horizontal), min(vertical), max(horizontal), max(vertical))

    from natural_pdf.elements.region import Region

    return Region(owner_page, envelope)
817
+
730
818
  def filter(self, func: Callable[["Element"], bool]) -> "ElementCollection":
731
819
  """
732
820
  Filter elements using a function.
@@ -1514,23 +1602,27 @@ class ElementCollection(
1514
1602
  self, group_key: Any, label_format: Optional[str], sample_element: T, group_by_attr: str
1515
1603
  ) -> str:
1516
1604
  """Formats the label for a group based on the key and format string."""
1605
+ # Format the group_key if it's a color attribute
1606
+ formatted_key = format_color_value(group_key, attr_name=group_by_attr)
1607
+
1517
1608
  if label_format:
1518
1609
  try:
1519
1610
  element_attrs = sample_element.__dict__.copy()
1520
- element_attrs[group_by_attr] = group_key # Ensure key is present
1611
+ # Use the formatted key in the attributes
1612
+ element_attrs[group_by_attr] = formatted_key # Ensure key is present
1521
1613
  return label_format.format(**element_attrs)
1522
1614
  except KeyError as e:
1523
1615
  logger.warning(
1524
1616
  f"Invalid key '{e}' in label_format '{label_format}'. Using group key as label."
1525
1617
  )
1526
- return str(group_key)
1618
+ return formatted_key
1527
1619
  except Exception as format_e:
1528
1620
  logger.warning(
1529
1621
  f"Error formatting label '{label_format}': {format_e}. Using group key as label."
1530
1622
  )
1531
- return str(group_key)
1623
+ return formatted_key
1532
1624
  else:
1533
- return str(group_key)
1625
+ return formatted_key
1534
1626
 
1535
1627
  def _get_element_highlight_params(
1536
1628
  self, element: T, annotate: Optional[List[str]]
@@ -2336,6 +2428,632 @@ class ElementCollection(
2336
2428
  lambda element: element.clip(obj=obj, left=left, top=top, right=right, bottom=bottom)
2337
2429
  )
2338
2430
 
2431
def merge_connected(
    self,
    proximity_threshold: float = 5.0,
    merge_across_pages: bool = False,
    merge_non_regions: bool = False,
    text_separator: str = " ",
    preserve_order: bool = True,
) -> "ElementCollection":
    """
    Merge connected/adjacent regions in the collection into larger regions.

    Regions whose bounding boxes overlap or lie within ``proximity_threshold``
    of each other are linked; every connected component (found with
    union-find) is replaced by one merged region. This is useful for text
    that gets split due to font variations, accented characters, or other
    PDF rendering quirks.

    Args:
        proximity_threshold: Maximum distance in points between regions to
            consider them connected. Default is 5.0. Use 0 for overlapping
            regions only.
        merge_across_pages: If True, regions from different pages may be
            merged. Default is False (only merge within the same page).
        merge_non_regions: If True, non-Region elements that expose a bbox
            and a page take part in merging through a temporary Region built
            from that bbox. Default is False (such elements are passed
            through untouched).
        text_separator: String used when joining text from merged regions.
        preserve_order: If True, merged text is ordered top-to-bottom,
            left-to-right.

    Returns:
        New ElementCollection holding the merged regions first, followed by
        every element that did not take part in merging. Elements that end up
        alone in their component are returned as the original objects.

    Example:
        ```python
        text_regions = page.find_all('region[type=text]')
        merged = text_regions.merge_connected(proximity_threshold=2.0)
        for region in merged:
            print(region.extract_text())
        ```

    Note:
        - The merged region's bbox encompasses all constituent regions
        - Metadata is taken from the first region in each group
        - Relative ordering between merged regions and passed-through
          elements is not preserved; passed-through elements come last
    """
    if not self._elements:
        return ElementCollection([])

    from natural_pdf.elements.region import Region

    # Each candidate is (original element, region used for geometry/merging).
    candidates = []
    passthrough = []
    for elem in self._elements:
        if isinstance(elem, Region):
            candidates.append((elem, elem))
        elif (
            merge_non_regions
            and getattr(elem, "bbox", None)
            and getattr(elem, "page", None) is not None
        ):
            candidates.append((elem, Region(page=elem.page, bbox=elem.bbox)))
        else:
            passthrough.append(elem)

    if not candidates:
        # Nothing that can be merged
        return ElementCollection(self._elements)

    # Partition candidates so that connectivity is only tested inside a group.
    if merge_across_pages:
        page_groups = {0: candidates}
    else:
        page_groups = {}
        for candidate in candidates:
            page = getattr(candidate[1], "page", None)
            # Tagged keys keep page ids and orphan-region ids from colliding.
            if page is not None:
                group_key = ("page", id(page))
            else:
                group_key = ("orphan", id(candidate[1]))
            page_groups.setdefault(group_key, []).append(candidate)

    def connected_components(group):
        """Return the connected components of one group, in first-seen order."""
        parent = list(range(len(group)))

        def find(node):
            # Iterative lookup with path compression: a long chain of regions
            # must not exhaust the interpreter's recursion limit.
            root = node
            while parent[root] != root:
                root = parent[root]
            while parent[node] != root:
                next_node = parent[node]
                parent[node] = root
                node = next_node
            return root

        for i in range(len(group)):
            for j in range(i + 1, len(group)):
                if self._are_regions_connected(
                    group[i][1], group[j][1], proximity_threshold
                ):
                    root_i, root_j = find(i), find(j)
                    if root_i != root_j:
                        parent[root_i] = root_j

        by_root = {}
        for index, candidate in enumerate(group):
            by_root.setdefault(find(index), []).append(candidate)
        return list(by_root.values())

    merged_results = []
    for group in page_groups.values():
        if len(group) == 1:
            # A lone candidate has nothing to merge with; keep the original.
            merged_results.append(group[0][0])
            continue

        for members in connected_components(group):
            if len(members) == 1:
                merged_results.append(members[0][0])
            else:
                merged_results.append(
                    self._merge_region_group(
                        [region for _, region in members], text_separator, preserve_order
                    )
                )

    # Simplified ordering: merged results first, untouched elements afterwards.
    return ElementCollection(merged_results + passthrough)
2591
+
2592
def _are_regions_connected(
    self, region1: "Region", region2: "Region", threshold: float
) -> bool:
    """
    Decide whether two regions overlap or lie within ``threshold`` of each other.

    Args:
        region1: First region; its ``bbox`` is read as (x0, top, x1, bottom).
        region2: Second region, same bbox layout.
        threshold: Maximum allowed gap in points. With 0, only pairs for which
            ``get_bbox_overlap`` reports an overlap count as connected.

    Returns:
        True when the boxes overlap, or when the larger of their horizontal
        and vertical gaps (Chebyshev distance) does not exceed ``threshold``.
    """
    first_box = region1.bbox
    second_box = region2.bbox

    # Any non-None overlap result means the boxes intersect.
    if get_bbox_overlap(first_box, second_box) is not None:
        return True

    # With a zero threshold only a reported overlap qualifies.
    if threshold == 0:
        return False

    def axis_gap(a_start, a_end, b_start, b_end):
        """Gap between two intervals on one axis; 0 when they overlap."""
        if a_end < b_start:
            return b_start - a_end
        if b_end < a_start:
            return a_start - b_end
        return 0

    left_a, top_a, right_a, bottom_a = first_box
    left_b, top_b, right_b, bottom_b = second_box

    horizontal_gap = axis_gap(left_a, right_a, left_b, right_b)
    vertical_gap = axis_gap(top_a, bottom_a, top_b, bottom_b)

    # Chebyshev distance gives a square proximity zone around each box.
    return max(horizontal_gap, vertical_gap) <= threshold
2634
+
2635
def _merge_region_group(
    self, regions: List["Region"], text_separator: str, preserve_order: bool
) -> "Region":
    """
    Merge a group of connected regions into a single region.

    Args:
        regions: Regions to merge; must not be empty.
        text_separator: String placed between the texts of the source regions.
        preserve_order: If True, texts are joined top-to-bottom, left-to-right
            (sorted by ``top`` then ``x0``); otherwise in the given order.

    Returns:
        The sole region unchanged when only one is given; otherwise a new
        Region on the first region's page whose bbox covers all inputs. Its
        metadata starts as a copy of the first region's metadata (when
        present) and gains a ``merge_info`` entry. ``region_type`` is carried
        over only when all typed inputs agree on it.

    Raises:
        ValueError: If ``regions`` is empty.
    """
    if not regions:
        raise ValueError("Cannot merge empty region group")

    if len(regions) == 1:
        return regions[0]

    # Envelope of all source boxes; bbox layout is (x0, top, x1, bottom).
    bboxes = [r.bbox for r in regions]
    merged_bbox = (
        min(b[0] for b in bboxes),
        min(b[1] for b in bboxes),
        max(b[2] for b in bboxes),
        max(b[3] for b in bboxes),
    )

    page = regions[0].page

    if preserve_order:
        sorted_regions = sorted(regions, key=lambda r: (r.top, r.x0))
    else:
        sorted_regions = regions

    text_parts = []
    for region in sorted_regions:
        try:
            text = region.extract_text()
        except Exception as exc:
            # Text extraction is best-effort, but only ordinary errors are
            # skipped: KeyboardInterrupt/SystemExit must still propagate.
            logger.debug("Skipping text of region during merge: %s", exc)
            continue
        if text:
            text_parts.append(text)

    merged_text = text_separator.join(text_parts) if text_parts else None

    from natural_pdf.elements.region import Region

    merged_region = Region(
        page=page, bbox=merged_bbox, label=f"Merged ({len(regions)} regions)"
    )

    # Shallow-copy so adding merge_info does not touch the source's dict.
    if hasattr(regions[0], "metadata") and regions[0].metadata:
        merged_region.metadata = regions[0].metadata.copy()

    merged_region.metadata["merge_info"] = {
        "source_count": len(regions),
        "merged_text": merged_text,
        "source_bboxes": bboxes,
    }

    region_types = {
        r.region_type for r in regions if hasattr(r, "region_type") and r.region_type
    }
    if len(region_types) == 1:
        merged_region.region_type = region_types.pop()

    return merged_region
2704
+
2705
def dissolve(
    self,
    padding: float = 2.0,
    geometry: Literal["rect", "polygon"] = "rect",
    group_by: List[str] = None,
) -> "ElementCollection":
    """
    Merge nearby elements into Regions, optionally within attribute groups.

    Elements that have a truthy ``bbox`` are first bucketed by the values of
    the ``group_by`` attributes (one bucket when ``group_by`` is empty). Inside
    each bucket, connected components are computed with ``padding`` as the
    proximity threshold and every component becomes one Region.

    Args:
        padding: Maximum distance in points between elements to consider
            them connected. Default is 2.0 points.
        geometry: Geometry of the merged regions. Only "rect" (bounding box)
            is supported; "polygon" raises NotImplementedError.
        group_by: Attribute names used to bucket elements before merging,
            e.g. 'size', 'font_family', 'fontname'. If None, all elements
            share one bucket.

    Returns:
        New ElementCollection of Region objects, one per connected component.
        Empty when no element has a bbox.

    Raises:
        NotImplementedError: If ``geometry`` is "polygon".
        ValueError: If ``geometry`` is any other unsupported value.

    Example:
        ```python
        dissolved = elements.dissolve(padding=5.0)
        dissolved = elements.dissolve(padding=2.0, group_by=['size'])
        dissolved = elements.dissolve(padding=3.0, group_by=['size', 'font_family'])
        ```
    """
    if geometry == "polygon":
        raise NotImplementedError("Polygon geometry is not yet supported for dissolve()")

    if geometry not in ["rect", "polygon"]:
        raise ValueError(f"Invalid geometry type: {geometry}. Must be 'rect' or 'polygon'")

    from natural_pdf.elements.region import Region

    # Only elements that expose a usable bbox can be dissolved.
    candidates = [item for item in self._elements if hasattr(item, "bbox") and item.bbox]
    if not candidates:
        logger.debug("No elements with bbox found in collection for dissolve()")
        return ElementCollection([])

    if group_by:
        buckets = self._group_elements_by_attributes(candidates, group_by)
    else:
        buckets = {None: candidates}

    dissolved = []
    for bucket_key, bucket in buckets.items():
        if not bucket:
            continue

        logger.debug(f"Processing group {bucket_key} with {len(bucket)} elements")

        for component in self._find_connected_components_elements(bucket, padding):
            if len(component) != 1:
                dissolved.append(self._merge_elements_for_dissolve(component, group_by))
                continue

            # A lone element is still converted so the output is all Regions.
            source = component[0]
            region = Region(
                page=source.page, bbox=source.bbox, label=f"Dissolved (1 {source.type})"
            )
            self._copy_element_attributes_to_region(source, region, group_by)
            dissolved.append(region)

    logger.debug(
        f"Dissolved {len(candidates)} elements into {len(dissolved)} regions"
    )

    return ElementCollection(dissolved)
2814
+
2815
def _group_elements_by_attributes(
    self, elements: List["Element"], group_by: List[str]
) -> Dict[Tuple, List["Element"]]:
    """
    Group elements by the values of the given attributes.

    For each attribute the value is looked up, in order, as an attribute of
    the element, as a key of ``element._obj``, then as a key of
    ``element.metadata``; a missing attribute contributes ``None``. Float
    values are rounded to 2 decimal places. Unhashable values (for example a
    colour stored as a list) are converted to an equivalent hashable form so
    they can be part of the group key instead of raising ``TypeError``.

    Args:
        elements: Elements to group.
        group_by: Attribute names that make up the group key.

    Returns:
        Dict mapping a tuple of attribute values to the elements sharing it,
        with groups and members in first-seen order.
    """

    def as_hashable(value):
        """Return ``value`` itself when hashable, else a hashable stand-in."""
        try:
            hash(value)
        except TypeError:
            pass
        else:
            return value
        if isinstance(value, (list, tuple)):
            return tuple(as_hashable(item) for item in value)
        if isinstance(value, (set, frozenset)):
            return frozenset(as_hashable(item) for item in value)
        if isinstance(value, dict):
            pairs = [(repr(key), as_hashable(item)) for key, item in value.items()]
            return tuple(sorted(pairs, key=lambda pair: pair[0]))
        return repr(value)

    groups = {}

    for element in elements:
        key_values = []
        for attr in group_by:
            value = None

            if hasattr(element, attr):
                value = getattr(element, attr)
            elif hasattr(element, "_obj") and element._obj and attr in element._obj:
                value = element._obj[attr]
            elif hasattr(element, "metadata") and element.metadata and attr in element.metadata:
                value = element.metadata[attr]

            # Rounding keeps near-identical sizes in the same group.
            if isinstance(value, float):
                value = round(value, 2)

            key_values.append(as_hashable(value))

        groups.setdefault(tuple(key_values), []).append(element)

    return groups
2848
+
2849
def _find_connected_components_elements(
    self, elements: List["Element"], padding: float
) -> List[List["Element"]]:
    """
    Find connected components among elements using union-find.

    Every pair of elements is tested with ``self._are_elements_connected``;
    connected pairs are unioned and elements are grouped by their final root.

    Args:
        elements: Elements to partition.
        padding: Proximity threshold passed to ``_are_elements_connected``.

    Returns:
        List of components, each a list of elements in input order.
        Components are ordered by their first member. An empty input gives
        ``[]``; a single element gives one component containing it.
    """
    if not elements:
        return []

    if len(elements) == 1:
        return [elements]

    parent = list(range(len(elements)))

    def find(node):
        # Iterative with path compression. A recursive lookup overflows the
        # interpreter stack when many elements form one long chain (e.g. a
        # run of adjacent characters), because unions here are not balanced.
        root = node
        while parent[root] != root:
            root = parent[root]
        while parent[node] != root:
            next_node = parent[node]
            parent[node] = root
            node = next_node
        return root

    # Pairwise connectivity check; quadratic in the number of elements.
    for i, first in enumerate(elements):
        for j in range(i + 1, len(elements)):
            if self._are_elements_connected(first, elements[j], padding):
                root_i, root_j = find(i), find(j)
                if root_i != root_j:
                    parent[root_i] = root_j

    components = {}
    for index, element in enumerate(elements):
        components.setdefault(find(index), []).append(element)

    return list(components.values())
2887
+
2888
def _merge_elements_for_dissolve(
    self, elements: List["Element"], group_by: List[str] = None
) -> "Region":
    """
    Collapse one connected component of elements into a single Region.

    Args:
        elements: Elements of one component; must not be empty.
        group_by: Attribute names forwarded to
            ``_copy_element_attributes_to_region`` so they are copied first.

    Returns:
        A Region on the first element's page. For one element it mirrors that
        element's bbox; for several, the bbox is their envelope, the label
        lists the count per element type, attributes are copied from the
        first element, ``region_type`` is kept only if all typed elements
        agree (cleared if they disagree), and ``metadata["dissolve_info"]``
        records the sources.

    Raises:
        ValueError: If ``elements`` is empty.
    """
    if not elements:
        raise ValueError("Cannot merge empty element group")

    if len(elements) == 1:
        lone = elements[0]
        from natural_pdf.elements.region import Region

        single_region = Region(
            page=lone.page, bbox=lone.bbox, label=f"Dissolved (1 {lone.type})"
        )
        self._copy_element_attributes_to_region(lone, single_region, group_by)
        return single_region

    # Envelope of all member boxes; bbox layout is (x0, top, x1, bottom).
    bboxes = [member.bbox for member in elements]
    merged_bbox = (
        min([box[0] for box in bboxes]),
        min([box[1] for box in bboxes]),
        max([box[2] for box in bboxes]),
        max([box[3] for box in bboxes]),
    )

    page = elements[0].page

    # Tally how many elements of each type went into the merge.
    type_counts = {}
    for member in elements:
        kind = member.type
        if kind in type_counts:
            type_counts[kind] += 1
        else:
            type_counts[kind] = 1

    # Naive pluralisation: append "s" whenever the count exceeds one.
    label_parts = [
        f"{count} {kind + ('s' if count > 1 else '')}"
        for kind, count in sorted(type_counts.items())
    ]
    label = f"Dissolved ({', '.join(label_parts)})"

    from natural_pdf.elements.region import Region

    merged_region = Region(page=page, bbox=merged_bbox, label=label)

    # Attributes come from the first element only; no consistency check here.
    self._copy_element_attributes_to_region(elements[0], merged_region, group_by)

    distinct_region_types = {
        member.region_type
        for member in elements
        if hasattr(member, "region_type") and member.region_type
    }
    if len(distinct_region_types) == 1:
        merged_region.region_type = distinct_region_types.pop()
    elif len(distinct_region_types) > 1:
        # Conflicting types: drop whatever was copied from the first element.
        merged_region.region_type = None

    merged_region.metadata["dissolve_info"] = {
        "source_count": len(elements),
        "source_bboxes": bboxes,
        "source_types": type_counts,
    }

    return merged_region
2959
+
2960
def _are_elements_connected(self, elem1: "Element", elem2: "Element", threshold: float) -> bool:
    """
    Decide whether two elements are on the same page and overlap or lie close.

    Args:
        elem1: First element; ``bbox`` is read as (x0, top, x1, bottom).
        elem2: Second element, same bbox layout.
        threshold: Maximum allowed gap in points. With 0, only pairs for which
            ``get_bbox_overlap`` reports an overlap count as connected.

    Returns:
        False when exactly one element lacks a page or both have pages that
        differ. Otherwise True if the boxes overlap or if the larger of their
        horizontal and vertical gaps (Chebyshev distance) is within
        ``threshold``.
    """
    first_page = getattr(elem1, "page", None)
    second_page = getattr(elem2, "page", None)

    # Exactly one element without a page: pages cannot be compared.
    if (first_page is None) != (second_page is None):
        return False

    # Both have pages: they must match. (Both missing falls through.)
    if first_page is not None and first_page != second_page:
        return False

    first_box = elem1.bbox
    second_box = elem2.bbox

    # Any non-None overlap result means the boxes intersect.
    if get_bbox_overlap(first_box, second_box) is not None:
        return True

    # With a zero threshold only a reported overlap qualifies.
    if threshold == 0:
        return False

    def axis_gap(a_start, a_end, b_start, b_end):
        """Gap between two intervals on one axis; 0 when they overlap."""
        if a_end < b_start:
            return b_start - a_end
        if b_end < a_start:
            return a_start - b_end
        return 0

    left_a, top_a, right_a, bottom_a = first_box
    left_b, top_b, right_b, bottom_b = second_box

    horizontal_gap = axis_gap(left_a, right_a, left_b, right_b)
    vertical_gap = axis_gap(top_a, bottom_a, top_b, bottom_b)

    # Chebyshev distance gives a square proximity zone around each box.
    return max(horizontal_gap, vertical_gap) <= threshold
3014
+
3015
+ def _copy_element_attributes_to_region(
3016
+ self, element: "Element", region: "Region", group_by: List[str] = None
3017
+ ) -> None:
3018
+ """Copy relevant attributes from source element to region."""
3019
+ # Common text attributes to check
3020
+ text_attrs = [
3021
+ "size",
3022
+ "font_family",
3023
+ "fontname",
3024
+ "font_size",
3025
+ "font_name",
3026
+ "bold",
3027
+ "italic",
3028
+ "color",
3029
+ "text_color",
3030
+ "region_type",
3031
+ ]
3032
+
3033
+ # If group_by is specified, prioritize those attributes
3034
+ attrs_to_check = (group_by or []) + text_attrs
3035
+
3036
+ for attr in attrs_to_check:
3037
+ value = None
3038
+
3039
+ # Try different ways to get the attribute
3040
+ if hasattr(element, attr):
3041
+ value = getattr(element, attr)
3042
+ elif hasattr(element, "_obj") and element._obj and attr in element._obj:
3043
+ value = element._obj[attr]
3044
+ elif hasattr(element, "metadata") and element.metadata and attr in element.metadata:
3045
+ value = element.metadata[attr]
3046
+
3047
+ # Set the attribute on the region if we found a value
3048
+ if value is not None:
3049
+ # Map common attribute names
3050
+ if attr == "size" and not hasattr(region, "font_size"):
3051
+ setattr(region, "font_size", value)
3052
+ elif attr == "fontname" and not hasattr(region, "font_name"):
3053
+ setattr(region, "font_name", value)
3054
+ else:
3055
+ setattr(region, attr, value)
3056
+
2339
3057
  # ------------------------------------------------------------------
2340
3058
  # NEW METHOD: apply_ocr for collections (supports custom function)
2341
3059
  # ------------------------------------------------------------------