natural-pdf 0.2.5__py3-none-any.whl → 0.2.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21,7 +21,7 @@ from typing import (
21
21
  overload,
22
22
  )
23
23
 
24
- from pdfplumber.utils.geometry import objects_to_bbox
24
+ from pdfplumber.utils.geometry import get_bbox_overlap, objects_to_bbox
25
25
 
26
26
  # New Imports
27
27
  from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
@@ -45,6 +45,7 @@ from natural_pdf.ocr import OCROptions
45
45
  from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
46
46
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
47
47
  from natural_pdf.text_mixin import TextMixin
48
+ from natural_pdf.utils.color_utils import format_color_value
48
49
 
49
50
  # Potentially lazy imports for optional dependencies needed in save_pdf
50
51
  try:
@@ -180,7 +181,7 @@ class ElementCollection(
180
181
  mode: Literal["show", "render"] = "show",
181
182
  color: Optional[Union[str, Tuple[int, int, int]]] = None,
182
183
  highlights: Optional[List[Dict[str, Any]]] = None,
183
- crop: Union[bool, Literal["content"]] = False,
184
+ crop: Union[bool, int, str, "Region", Literal["wide"]] = False,
184
185
  crop_bbox: Optional[Tuple[float, float, float, float]] = None,
185
186
  group_by: Optional[str] = None,
186
187
  bins: Optional[Union[int, List[float]]] = None,
@@ -193,7 +194,7 @@ class ElementCollection(
193
194
  mode: Rendering mode - 'show' includes highlights, 'render' is clean
194
195
  color: Default color for highlights in show mode (or colormap name when using group_by)
195
196
  highlights: Additional highlight groups to show
196
- crop: Whether to crop to element bounds
197
+ crop: Cropping mode (False, True, int for padding, 'wide', or Region)
197
198
  crop_bbox: Explicit crop bounds
198
199
  group_by: Attribute to group elements by for color mapping
199
200
  bins: Binning specification for quantitative data (int for equal-width bins, list for custom bins)
@@ -226,7 +227,7 @@ class ElementCollection(
226
227
  # Handle cropping
227
228
  if crop_bbox:
228
229
  spec.crop_bbox = crop_bbox
229
- elif crop == "content" or crop is True:
230
+ elif crop:
230
231
  # Calculate bounds of elements on this page
231
232
  x_coords = []
232
233
  y_coords = []
@@ -237,7 +238,27 @@ class ElementCollection(
237
238
  y_coords.extend([y0, y1])
238
239
 
239
240
  if x_coords and y_coords:
240
- spec.crop_bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
241
+ content_bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
242
+
243
+ if crop is True:
244
+ # Tight crop to content bounds
245
+ spec.crop_bbox = content_bbox
246
+ elif isinstance(crop, (int, float)):
247
+ # Add padding around content
248
+ padding = float(crop)
249
+ x0, y0, x1, y1 = content_bbox
250
+ spec.crop_bbox = (
251
+ max(0, x0 - padding),
252
+ max(0, y0 - padding),
253
+ min(page.width, x1 + padding),
254
+ min(page.height, y1 + padding),
255
+ )
256
+ elif crop == "wide":
257
+ # Full page width, cropped vertically to content
258
+ spec.crop_bbox = (0, content_bbox[1], page.width, content_bbox[3])
259
+ elif hasattr(crop, "bbox"):
260
+ # Crop to another region's bounds
261
+ spec.crop_bbox = crop.bbox
241
262
 
242
263
  # Add highlights in show mode
243
264
  if mode == "show":
@@ -413,10 +434,16 @@ class ElementCollection(
413
434
  element_type = types.pop()
414
435
  return f"<ElementCollection[{element_type}](count={len(self)})>"
415
436
 
416
- def __add__(self, other: "ElementCollection") -> "ElementCollection":
417
- if not isinstance(other, ElementCollection):
437
+ def __add__(self, other: Union["ElementCollection", "Element"]) -> "ElementCollection":
438
+ from natural_pdf.elements.base import Element
439
+ from natural_pdf.elements.region import Region
440
+
441
+ if isinstance(other, ElementCollection):
442
+ return ElementCollection(self._elements + other._elements)
443
+ elif isinstance(other, (Element, Region)):
444
+ return ElementCollection(self._elements + [other])
445
+ else:
418
446
  return NotImplemented
419
- return ElementCollection(self._elements + other._elements)
420
447
 
421
448
  def __setitem__(self, index, value):
422
449
  self._elements[index] = value
@@ -594,6 +621,7 @@ class ElementCollection(
594
621
 
595
622
  def extract_text(
596
623
  self,
624
+ separator: str = " ",
597
625
  preserve_whitespace: bool = True,
598
626
  use_exclusions: bool = True,
599
627
  strip: Optional[bool] = None,
@@ -605,6 +633,9 @@ class ElementCollection(
605
633
  pdfplumber's layout engine if layout=True is specified.
606
634
 
607
635
  Args:
636
+ separator: String to insert between text from different elements when
637
+ using simple joining (layout=False). Default is a single space.
638
+ Ignored when layout=True as the layout engine handles spacing.
608
639
  preserve_whitespace: Deprecated. Use layout=False for simple joining.
609
640
  use_exclusions: Deprecated. Exclusions should be applied *before* creating
610
641
  the collection or by filtering the collection itself.
@@ -641,7 +672,7 @@ class ElementCollection(
641
672
  logger.warning(
642
673
  "ElementCollection.extract_text: No character dictionaries found in TextElements."
643
674
  )
644
- return " ".join(
675
+ return separator.join(
645
676
  getattr(el, "text", "") for el in text_elements
646
677
  ) # Fallback to simple join of word text
647
678
 
@@ -706,18 +737,33 @@ class ElementCollection(
706
737
  all_char_dicts.sort(
707
738
  key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
708
739
  )
709
- result = " ".join(c.get("text", "") for c in all_char_dicts)
740
+ result = separator.join(c.get("text", "") for c in all_char_dicts)
710
741
 
711
742
  else:
712
743
  # Default: Simple join without layout
713
744
  logger.debug("ElementCollection.extract_text: Using simple join (layout=False).")
714
- # Sort chars by document order (page, top, x0)
715
- all_char_dicts.sort(
716
- key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
745
+
746
+ # Instead of joining all characters individually, we need to:
747
+ # 1. Extract text from each element
748
+ # 2. Join the element texts with the separator
749
+
750
+ # Sort elements by document order (page, top, x0)
751
+ sorted_elements = sorted(
752
+ text_elements,
753
+ key=lambda el: (
754
+ el.page.index if hasattr(el, "page") else 0,
755
+ el.top if hasattr(el, "top") else 0,
756
+ el.x0 if hasattr(el, "x0") else 0,
757
+ ),
717
758
  )
718
- # Simple join of character text
719
- result = "".join(c.get("text", "") for c in all_char_dicts)
720
- # Replace multiple spaces created by joining possibly overlapping chars? Maybe not necessary.
759
+
760
+ # Extract text from each element
761
+ element_texts = []
762
+ for el in sorted_elements:
763
+ if hasattr(el, "text") and el.text:
764
+ element_texts.append(el.text)
765
+
766
+ result = separator.join(element_texts)
721
767
 
722
768
  # Determine final strip flag – same rule as global helper unless caller overrides
723
769
  strip_text = strip if strip is not None else (not use_layout)
@@ -727,6 +773,67 @@ class ElementCollection(
727
773
 
728
774
  return result
729
775
 
776
def merge(self) -> "Region":
    """
    Collapse every element of the collection into one enclosing Region.

    The returned region's bbox runs from the smallest to the largest
    coordinate found among the elements' bounding boxes, whether or not the
    elements touch each other (contrast with dissolve(), which only joins
    elements that are close together).

    Elements without a truthy ``bbox`` are ignored. The page is taken from
    the first bbox-bearing element whose ``page`` is not None.

    Returns:
        A single Region spanning all elements that have a bounding box.

    Raises:
        ValueError: If the collection is empty, if no element has a usable
            bounding box, or if no page can be determined.

    Example:
        ```python
        fields = pdf.find_all('text:contains(Name|Date|Phone)')
        merged_region = fields.merge()
        text = merged_region.extract_text()
        ```
    """
    if not self._elements:
        raise ValueError("Cannot merge an empty ElementCollection")

    boxes = []
    owner_page = None
    for item in self._elements:
        box = getattr(item, "bbox", None)
        if not box:
            continue
        boxes.append(box)
        # Keep looking until some bbox-bearing element yields a page.
        if owner_page is None:
            owner_page = getattr(item, "page", None)

    if not boxes:
        raise ValueError("No elements with valid bounding boxes to merge")

    if owner_page is None:
        raise ValueError("Cannot determine page for merged region")

    # Both edges of every box take part in the min/max, exactly as a box
    # with swapped edges would still be enclosed.
    horizontal = [value for x0, _y0, x1, _y1 in boxes for value in (x0, x1)]
    vertical = [value for _x0, y0, _x1, y1 in boxes for value in (y0, y1)]
    merged_bbox = (min(horizontal), min(vertical), max(horizontal), max(vertical))

    # Imported here rather than at module level, as in the rest of this file.
    from natural_pdf.elements.region import Region

    return Region(owner_page, merged_bbox)
836
+
730
837
  def filter(self, func: Callable[["Element"], bool]) -> "ElementCollection":
731
838
  """
732
839
  Filter elements using a function.
@@ -1514,23 +1621,27 @@ class ElementCollection(
1514
1621
  self, group_key: Any, label_format: Optional[str], sample_element: T, group_by_attr: str
1515
1622
  ) -> str:
1516
1623
  """Formats the label for a group based on the key and format string."""
1624
+ # Format the group_key if it's a color attribute
1625
+ formatted_key = format_color_value(group_key, attr_name=group_by_attr)
1626
+
1517
1627
  if label_format:
1518
1628
  try:
1519
1629
  element_attrs = sample_element.__dict__.copy()
1520
- element_attrs[group_by_attr] = group_key # Ensure key is present
1630
+ # Use the formatted key in the attributes
1631
+ element_attrs[group_by_attr] = formatted_key # Ensure key is present
1521
1632
  return label_format.format(**element_attrs)
1522
1633
  except KeyError as e:
1523
1634
  logger.warning(
1524
1635
  f"Invalid key '{e}' in label_format '{label_format}'. Using group key as label."
1525
1636
  )
1526
- return str(group_key)
1637
+ return formatted_key
1527
1638
  except Exception as format_e:
1528
1639
  logger.warning(
1529
1640
  f"Error formatting label '{label_format}': {format_e}. Using group key as label."
1530
1641
  )
1531
- return str(group_key)
1642
+ return formatted_key
1532
1643
  else:
1533
- return str(group_key)
1644
+ return formatted_key
1534
1645
 
1535
1646
  def _get_element_highlight_params(
1536
1647
  self, element: T, annotate: Optional[List[str]]
@@ -2336,6 +2447,632 @@ class ElementCollection(
2336
2447
  lambda element: element.clip(obj=obj, left=left, top=top, right=right, bottom=bottom)
2337
2448
  )
2338
2449
 
2450
def merge_connected(
    self,
    proximity_threshold: float = 5.0,
    merge_across_pages: bool = False,
    merge_non_regions: bool = False,
    text_separator: str = " ",
    preserve_order: bool = True,
) -> "ElementCollection":
    """
    Merge connected/adjacent regions in the collection into larger regions.

    Regions whose bounding boxes overlap, or lie within ``proximity_threshold``
    of each other (as decided by ``_are_regions_connected``), are grouped into
    connected components with a union-find structure; every component with
    more than one member is replaced by a single merged region built by
    ``_merge_region_group``.

    Args:
        proximity_threshold: Maximum distance in points between regions to
            consider them connected. Default is 5.0 points. Use 0 for only
            overlapping regions.
        merge_across_pages: If True, all regions are compared with each other
            regardless of page. Default is False: regions are first bucketed
            by page object identity, and a region without a page is kept in a
            bucket of its own.
        merge_non_regions: Accepted for API compatibility.
            NOTE(review): this flag is not read anywhere in this method, so
            non-Region elements are always passed through unmerged.
        text_separator: String to use when joining text from merged regions.
            Default is a single space.
        preserve_order: If True, order merged text by reading order
            (top-to-bottom, left-to-right). Default is True.

    Returns:
        New ElementCollection containing the merged regions followed by all
        non-Region elements (unchanged, appended at the end rather than kept
        at their original positions). If the collection holds no Region at
        all, a collection of the original elements is returned.

    Example:
        ```python
        text_regions = page.find_all('region[type=text]')
        merged = text_regions.merge_connected(proximity_threshold=2.0)
        for region in merged:
            print(region.extract_text())
        ```
    """
    if not self._elements:
        return ElementCollection([])

    from natural_pdf.elements.region import Region

    # Split the collection; only Region instances take part in merging.
    regions = []
    non_regions = []
    for elem in self._elements:
        (regions if isinstance(elem, Region) else non_regions).append(elem)

    if not regions:
        # No regions to merge
        return ElementCollection(self._elements)

    # Bucket regions so that connectivity is only evaluated within a bucket.
    if merge_across_pages:
        page_groups = {0: regions}
    else:
        page_groups = {}
        for region in regions:
            page = getattr(region, "page", None)
            if page is not None:
                # Object identity of the page is the bucket key.
                page_groups.setdefault(id(page), []).append(region)
            else:
                # Region without page - treat as separate group
                page_groups[id(region)] = [region]

    def connected_components(candidates):
        """Return lists of mutually connected regions, in first-seen order."""
        parent = list(range(len(candidates)))

        def find(node):
            # Iterative on purpose: a long chain of pairwise-adjacent regions
            # builds a parent chain as deep as the chain is long, which a
            # recursive find cannot walk past the interpreter recursion limit.
            root = node
            while parent[root] != root:
                root = parent[root]
            # Path compression: point every visited node straight at the root.
            while parent[node] != root:
                parent[node], node = root, parent[node]
            return root

        for i, first in enumerate(candidates):
            for j in range(i + 1, len(candidates)):
                if self._are_regions_connected(first, candidates[j], proximity_threshold):
                    root_i, root_j = find(i), find(j)
                    if root_i != root_j:
                        parent[root_i] = root_j

        components = {}
        for i, region in enumerate(candidates):
            components.setdefault(find(i), []).append(region)
        return list(components.values())

    all_merged_regions = []
    for page_regions in page_groups.values():
        if len(page_regions) == 1:
            # Only one region in this bucket, nothing to merge
            all_merged_regions.extend(page_regions)
            continue

        for component in connected_components(page_regions):
            if len(component) == 1:
                all_merged_regions.append(component[0])
            else:
                all_merged_regions.append(
                    self._merge_region_group(component, text_separator, preserve_order)
                )

    # Simplified ordering: merged regions first, then untouched non-regions.
    return ElementCollection(all_merged_regions + non_regions)
2610
+
2611
def _are_regions_connected(
    self, region1: "Region", region2: "Region", threshold: float
) -> bool:
    """
    Decide whether two regions overlap or lie within ``threshold`` of each other.

    A non-None result from ``get_bbox_overlap`` counts as connected
    immediately. Otherwise, with a threshold of exactly 0 the regions are
    never connected; for any other threshold the gap between the boxes is
    measured per axis and the larger of the two gaps (Chebyshev distance)
    is compared against the threshold.

    Args:
        region1: First region; only its ``bbox`` is read.
        region2: Second region; only its ``bbox`` is read.
        threshold: Maximum allowed gap, in the same units as the bboxes.

    Returns:
        True if the regions are considered connected, False otherwise.
    """
    first_box = region1.bbox
    second_box = region2.bbox

    if get_bbox_overlap(first_box, second_box) is not None:
        return True

    # Without overlap a zero threshold can never be satisfied here.
    if threshold == 0:
        return False

    # bbox format: (x0, top, x1, bottom)
    left_a, top_a, right_a, bottom_a = first_box
    left_b, top_b, right_b, bottom_b = second_box

    def axis_gap(low_a, high_a, low_b, high_b):
        """Gap between two intervals on one axis; 0 when they overlap."""
        if high_a < low_b:
            return low_b - high_a
        if high_b < low_a:
            return low_a - high_b
        return 0

    horizontal_gap = axis_gap(left_a, right_a, left_b, right_b)
    vertical_gap = axis_gap(top_a, bottom_a, top_b, bottom_b)

    # max() of the axis gaps gives a square proximity zone around each box.
    return max(horizontal_gap, vertical_gap) <= threshold
2653
+
2654
+ def _merge_region_group(
2655
+ self, regions: List["Region"], text_separator: str, preserve_order: bool
2656
+ ) -> "Region":
2657
+ """Merge a group of connected regions into a single region."""
2658
+ if not regions:
2659
+ raise ValueError("Cannot merge empty region group")
2660
+
2661
+ if len(regions) == 1:
2662
+ return regions[0]
2663
+
2664
+ # Calculate merged bbox
2665
+ bboxes = [r.bbox for r in regions]
2666
+ x0s = [b[0] for b in bboxes]
2667
+ tops = [b[1] for b in bboxes]
2668
+ x1s = [b[2] for b in bboxes]
2669
+ bottoms = [b[3] for b in bboxes]
2670
+
2671
+ merged_bbox = (min(x0s), min(tops), max(x1s), max(bottoms))
2672
+
2673
+ # Use the page from the first region
2674
+ page = regions[0].page
2675
+
2676
+ # Sort regions for text ordering if requested
2677
+ if preserve_order:
2678
+ # Sort by reading order: top-to-bottom, left-to-right
2679
+ sorted_regions = sorted(regions, key=lambda r: (r.top, r.x0))
2680
+ else:
2681
+ sorted_regions = regions
2682
+
2683
+ # Merge text content
2684
+ text_parts = []
2685
+ for region in sorted_regions:
2686
+ try:
2687
+ text = region.extract_text()
2688
+ if text:
2689
+ text_parts.append(text)
2690
+ except:
2691
+ # Region might not have text extraction capability
2692
+ pass
2693
+
2694
+ merged_text = text_separator.join(text_parts) if text_parts else None
2695
+
2696
+ # Create merged region
2697
+ from natural_pdf.elements.region import Region
2698
+
2699
+ merged_region = Region(
2700
+ page=page, bbox=merged_bbox, label=f"Merged ({len(regions)} regions)"
2701
+ )
2702
+
2703
+ # Copy metadata from first region and add merge info
2704
+ if hasattr(regions[0], "metadata") and regions[0].metadata:
2705
+ merged_region.metadata = regions[0].metadata.copy()
2706
+
2707
+ merged_region.metadata["merge_info"] = {
2708
+ "source_count": len(regions),
2709
+ "merged_text": merged_text,
2710
+ "source_bboxes": bboxes,
2711
+ }
2712
+
2713
+ # If regions have region_type, preserve it if consistent
2714
+ region_types = set()
2715
+ for r in regions:
2716
+ if hasattr(r, "region_type") and r.region_type:
2717
+ region_types.add(r.region_type)
2718
+
2719
+ if len(region_types) == 1:
2720
+ merged_region.region_type = region_types.pop()
2721
+
2722
+ return merged_region
2723
+
2724
def dissolve(
    self,
    padding: float = 2.0,
    geometry: Literal["rect", "polygon"] = "rect",
    group_by: List[str] = None,
) -> "ElementCollection":
    """
    Merge connected elements based on proximity and grouping attributes.

    Elements that have a truthy ``bbox`` are optionally partitioned by the
    attributes named in ``group_by``; inside each partition, connected
    components are found with ``padding`` as the proximity threshold, and
    each component is turned into a Region.

    Args:
        padding: Maximum distance in points between elements to consider
            them connected. Default is 2.0 points.
        geometry: Only "rect" (bounding box) is supported. "polygon" raises
            NotImplementedError.
        group_by: Attribute names to partition elements by before merging
            (float values are rounded to 2 decimal places by the grouping
            helper). If None or empty, all elements form one group.

    Returns:
        New ElementCollection of Region objects, one per connected component.
        Empty when no element has a bbox.

    Raises:
        NotImplementedError: If ``geometry`` is "polygon".
        ValueError: If ``geometry`` is neither "rect" nor "polygon".

    Example:
        ```python
        dissolved = elements.dissolve(padding=5.0)
        dissolved = elements.dissolve(padding=2.0, group_by=['size'])
        dissolved = elements.dissolve(padding=3.0, group_by=['size', 'font_family'])
        ```
    """
    if geometry != "rect":
        if geometry == "polygon":
            raise NotImplementedError("Polygon geometry is not yet supported for dissolve()")
        raise ValueError(f"Invalid geometry type: {geometry}. Must be 'rect' or 'polygon'")

    # Only elements carrying a bounding box can be dissolved.
    candidates = [item for item in self._elements if hasattr(item, "bbox") and item.bbox]
    if not candidates:
        logger.debug("No elements with bbox found in collection for dissolve()")
        return ElementCollection([])

    if group_by:
        partitions = self._group_elements_by_attributes(candidates, group_by)
    else:
        # All elements in one group
        partitions = {None: candidates}

    dissolved = []
    for partition_key, members in partitions.items():
        if not members:
            continue

        logger.debug(f"Processing group {partition_key} with {len(members)} elements")

        # The merge helper builds the same "Dissolved (1 <type>)" region for a
        # one-element component as for larger ones, so no special case is
        # needed here.
        for component in self._find_connected_components_elements(members, padding):
            dissolved.append(self._merge_elements_for_dissolve(component, group_by))

    logger.debug(f"Dissolved {len(candidates)} elements into {len(dissolved)} regions")

    return ElementCollection(dissolved)
2833
+
2834
+ def _group_elements_by_attributes(
2835
+ self, elements: List["Element"], group_by: List[str]
2836
+ ) -> Dict[Tuple, List["Element"]]:
2837
+ """Group elements by specified attributes."""
2838
+ groups = {}
2839
+
2840
+ for element in elements:
2841
+ # Build group key from attribute values
2842
+ key_values = []
2843
+ for attr in group_by:
2844
+ value = None
2845
+
2846
+ # Try to get attribute value from various sources
2847
+ if hasattr(element, attr):
2848
+ value = getattr(element, attr)
2849
+ elif hasattr(element, "_obj") and element._obj and attr in element._obj:
2850
+ value = element._obj[attr]
2851
+ elif hasattr(element, "metadata") and element.metadata and attr in element.metadata:
2852
+ value = element.metadata[attr]
2853
+
2854
+ # Round float values to 2 decimal places for grouping
2855
+ if isinstance(value, float):
2856
+ value = round(value, 2)
2857
+
2858
+ key_values.append(value)
2859
+
2860
+ key = tuple(key_values)
2861
+
2862
+ if key not in groups:
2863
+ groups[key] = []
2864
+ groups[key].append(element)
2865
+
2866
+ return groups
2867
+
2868
+ def _find_connected_components_elements(
2869
+ self, elements: List["Element"], padding: float
2870
+ ) -> List[List["Element"]]:
2871
+ """Find connected components among elements using union-find."""
2872
+ if not elements:
2873
+ return []
2874
+
2875
+ if len(elements) == 1:
2876
+ return [elements]
2877
+
2878
+ # Build adjacency using union-find
2879
+ parent = list(range(len(elements)))
2880
+
2881
+ def find(x):
2882
+ if parent[x] != x:
2883
+ parent[x] = find(parent[x])
2884
+ return parent[x]
2885
+
2886
+ def union(x, y):
2887
+ px, py = find(x), find(y)
2888
+ if px != py:
2889
+ parent[px] = py
2890
+
2891
+ # Check all pairs of elements for connectivity
2892
+ for i in range(len(elements)):
2893
+ for j in range(i + 1, len(elements)):
2894
+ if self._are_elements_connected(elements[i], elements[j], padding):
2895
+ union(i, j)
2896
+
2897
+ # Group elements by their connected component
2898
+ components = {}
2899
+ for i, element in enumerate(elements):
2900
+ root = find(i)
2901
+ if root not in components:
2902
+ components[root] = []
2903
+ components[root].append(element)
2904
+
2905
+ return list(components.values())
2906
+
2907
+ def _merge_elements_for_dissolve(
2908
+ self, elements: List["Element"], group_by: List[str] = None
2909
+ ) -> "Region":
2910
+ """Merge a group of elements for dissolve operation."""
2911
+ if not elements:
2912
+ raise ValueError("Cannot merge empty element group")
2913
+
2914
+ if len(elements) == 1:
2915
+ elem = elements[0]
2916
+ from natural_pdf.elements.region import Region
2917
+
2918
+ region = Region(page=elem.page, bbox=elem.bbox, label=f"Dissolved (1 {elem.type})")
2919
+ self._copy_element_attributes_to_region(elem, region, group_by)
2920
+ return region
2921
+
2922
+ # Calculate merged bbox
2923
+ bboxes = [e.bbox for e in elements]
2924
+ x0s = [b[0] for b in bboxes]
2925
+ tops = [b[1] for b in bboxes]
2926
+ x1s = [b[2] for b in bboxes]
2927
+ bottoms = [b[3] for b in bboxes]
2928
+
2929
+ merged_bbox = (min(x0s), min(tops), max(x1s), max(bottoms))
2930
+
2931
+ # Use the page from the first element
2932
+ page = elements[0].page
2933
+
2934
+ # Count element types for label
2935
+ type_counts = {}
2936
+ for elem in elements:
2937
+ elem_type = elem.type
2938
+ type_counts[elem_type] = type_counts.get(elem_type, 0) + 1
2939
+
2940
+ # Create label showing element types
2941
+ label_parts = []
2942
+ for elem_type, count in sorted(type_counts.items()):
2943
+ # Pluralize element type if count > 1
2944
+ type_label = elem_type + ("s" if count > 1 else "")
2945
+ label_parts.append(f"{count} {type_label}")
2946
+ label = f"Dissolved ({', '.join(label_parts)})"
2947
+
2948
+ # Create merged region
2949
+ from natural_pdf.elements.region import Region
2950
+
2951
+ merged_region = Region(page=page, bbox=merged_bbox, label=label)
2952
+
2953
+ # Copy attributes from first element if they're consistent
2954
+ self._copy_element_attributes_to_region(elements[0], merged_region, group_by)
2955
+
2956
+ # Check if all elements have the same region_type
2957
+ region_types = set()
2958
+ for elem in elements:
2959
+ if hasattr(elem, "region_type") and elem.region_type:
2960
+ region_types.add(elem.region_type)
2961
+
2962
+ # Handle region_type based on consistency
2963
+ if len(region_types) == 1:
2964
+ # All elements have the same region_type, preserve it
2965
+ merged_region.region_type = region_types.pop()
2966
+ elif len(region_types) > 1:
2967
+ # Multiple different region types, clear it
2968
+ merged_region.region_type = None
2969
+
2970
+ # Add dissolve metadata
2971
+ merged_region.metadata["dissolve_info"] = {
2972
+ "source_count": len(elements),
2973
+ "source_bboxes": bboxes,
2974
+ "source_types": type_counts,
2975
+ }
2976
+
2977
+ return merged_region
2978
+
2979
+ def _are_elements_connected(self, elem1: "Element", elem2: "Element", threshold: float) -> bool:
2980
+ """Check if two elements are connected (adjacent or overlapping)."""
2981
+ # Check if elements are on the same page
2982
+ # Handle edge cases where elements might not have a page attribute
2983
+ page1 = getattr(elem1, "page", None)
2984
+ page2 = getattr(elem2, "page", None)
2985
+
2986
+ # If either element doesn't have a page, we can't compare pages
2987
+ # In this case, only consider them connected if both lack pages
2988
+ if page1 is None or page2 is None:
2989
+ if page1 is not page2: # One has page, one doesn't
2990
+ return False
2991
+ # Both None - continue with proximity check
2992
+ elif page1 != page2: # Both have pages but different
2993
+ return False
2994
+
2995
+ bbox1 = elem1.bbox
2996
+ bbox2 = elem2.bbox
2997
+
2998
+ # Check for overlap first
2999
+ overlap = get_bbox_overlap(bbox1, bbox2)
3000
+ if overlap is not None:
3001
+ return True
3002
+
3003
+ # If no overlap and threshold is 0, elements are not connected
3004
+ if threshold == 0:
3005
+ return False
3006
+
3007
+ # Check proximity - calculate minimum distance between bboxes
3008
+ # bbox format: (x0, top, x1, bottom)
3009
+ x0_1, top_1, x1_1, bottom_1 = bbox1
3010
+ x0_2, top_2, x1_2, bottom_2 = bbox2
3011
+
3012
+ # Calculate horizontal distance
3013
+ if x1_1 < x0_2:
3014
+ h_dist = x0_2 - x1_1
3015
+ elif x1_2 < x0_1:
3016
+ h_dist = x0_1 - x1_2
3017
+ else:
3018
+ h_dist = 0 # Horizontally overlapping
3019
+
3020
+ # Calculate vertical distance
3021
+ if bottom_1 < top_2:
3022
+ v_dist = top_2 - bottom_1
3023
+ elif bottom_2 < top_1:
3024
+ v_dist = top_1 - bottom_2
3025
+ else:
3026
+ v_dist = 0 # Vertically overlapping
3027
+
3028
+ # Use Chebyshev distance (max of horizontal and vertical)
3029
+ # This creates a square proximity zone
3030
+ distance = max(h_dist, v_dist)
3031
+
3032
+ return distance <= threshold
3033
+
3034
+ def _copy_element_attributes_to_region(
3035
+ self, element: "Element", region: "Region", group_by: List[str] = None
3036
+ ) -> None:
3037
+ """Copy relevant attributes from source element to region."""
3038
+ # Common text attributes to check
3039
+ text_attrs = [
3040
+ "size",
3041
+ "font_family",
3042
+ "fontname",
3043
+ "font_size",
3044
+ "font_name",
3045
+ "bold",
3046
+ "italic",
3047
+ "color",
3048
+ "text_color",
3049
+ "region_type",
3050
+ ]
3051
+
3052
+ # If group_by is specified, prioritize those attributes
3053
+ attrs_to_check = (group_by or []) + text_attrs
3054
+
3055
+ for attr in attrs_to_check:
3056
+ value = None
3057
+
3058
+ # Try different ways to get the attribute
3059
+ if hasattr(element, attr):
3060
+ value = getattr(element, attr)
3061
+ elif hasattr(element, "_obj") and element._obj and attr in element._obj:
3062
+ value = element._obj[attr]
3063
+ elif hasattr(element, "metadata") and element.metadata and attr in element.metadata:
3064
+ value = element.metadata[attr]
3065
+
3066
+ # Set the attribute on the region if we found a value
3067
+ if value is not None:
3068
+ # Map common attribute names
3069
+ if attr == "size" and not hasattr(region, "font_size"):
3070
+ setattr(region, "font_size", value)
3071
+ elif attr == "fontname" and not hasattr(region, "font_name"):
3072
+ setattr(region, "font_name", value)
3073
+ else:
3074
+ setattr(region, attr, value)
3075
+
2339
3076
  # ------------------------------------------------------------------
2340
3077
  # NEW METHOD: apply_ocr for collections (supports custom function)
2341
3078
  # ------------------------------------------------------------------