natural-pdf 0.2.15__py3-none-any.whl → 0.2.17__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -7,6 +7,7 @@ from typing import (
7
7
  List,
8
8
  Literal,
9
9
  Optional,
10
+ Set,
10
11
  Tuple,
11
12
  Union,
12
13
  overload,
@@ -346,6 +347,7 @@ class Region(
346
347
  include_source: bool = False,
347
348
  until: Optional[str] = None,
348
349
  include_endpoint: bool = True,
350
+ offset: Optional[float] = None,
349
351
  **kwargs,
350
352
  ) -> "Region":
351
353
  """
@@ -357,11 +359,18 @@ class Region(
357
359
  include_source: Whether to include this region in the result (default: False)
358
360
  until: Optional selector string to specify an upper boundary element
359
361
  include_endpoint: Whether to include the boundary element in the region (default: True)
362
+ offset: Pixel offset when excluding source/endpoint (default: None, uses natural_pdf.options.layout.directional_offset)
360
363
  **kwargs: Additional parameters
361
364
 
362
365
  Returns:
363
366
  Region object representing the area above
364
367
  """
368
+ # Use global default if offset not provided
369
+ if offset is None:
370
+ import natural_pdf
371
+
372
+ offset = natural_pdf.options.layout.directional_offset
373
+
365
374
  return self._direction(
366
375
  direction="above",
367
376
  size=height,
@@ -369,6 +378,7 @@ class Region(
369
378
  include_source=include_source,
370
379
  until=until,
371
380
  include_endpoint=include_endpoint,
381
+ offset=offset,
372
382
  **kwargs,
373
383
  )
374
384
 
@@ -379,6 +389,7 @@ class Region(
379
389
  include_source: bool = False,
380
390
  until: Optional[str] = None,
381
391
  include_endpoint: bool = True,
392
+ offset: Optional[float] = None,
382
393
  **kwargs,
383
394
  ) -> "Region":
384
395
  """
@@ -390,11 +401,18 @@ class Region(
390
401
  include_source: Whether to include this region in the result (default: False)
391
402
  until: Optional selector string to specify a lower boundary element
392
403
  include_endpoint: Whether to include the boundary element in the region (default: True)
404
+ offset: Pixel offset when excluding source/endpoint (default: None, uses natural_pdf.options.layout.directional_offset)
393
405
  **kwargs: Additional parameters
394
406
 
395
407
  Returns:
396
408
  Region object representing the area below
397
409
  """
410
+ # Use global default if offset not provided
411
+ if offset is None:
412
+ import natural_pdf
413
+
414
+ offset = natural_pdf.options.layout.directional_offset
415
+
398
416
  return self._direction(
399
417
  direction="below",
400
418
  size=height,
@@ -402,16 +420,18 @@ class Region(
402
420
  include_source=include_source,
403
421
  until=until,
404
422
  include_endpoint=include_endpoint,
423
+ offset=offset,
405
424
  **kwargs,
406
425
  )
407
426
 
408
427
  def left(
409
428
  self,
410
429
  width: Optional[float] = None,
411
- height: str = "full",
430
+ height: str = "element",
412
431
  include_source: bool = False,
413
432
  until: Optional[str] = None,
414
433
  include_endpoint: bool = True,
434
+ offset: Optional[float] = None,
415
435
  **kwargs,
416
436
  ) -> "Region":
417
437
  """
@@ -423,11 +443,18 @@ class Region(
423
443
  include_source: Whether to include this region in the result (default: False)
424
444
  until: Optional selector string to specify a left boundary element
425
445
  include_endpoint: Whether to include the boundary element in the region (default: True)
446
+ offset: Pixel offset when excluding source/endpoint (default: None, uses natural_pdf.options.layout.directional_offset)
426
447
  **kwargs: Additional parameters
427
448
 
428
449
  Returns:
429
450
  Region object representing the area to the left
430
451
  """
452
+ # Use global default if offset not provided
453
+ if offset is None:
454
+ import natural_pdf
455
+
456
+ offset = natural_pdf.options.layout.directional_offset
457
+
431
458
  return self._direction(
432
459
  direction="left",
433
460
  size=width,
@@ -435,16 +462,18 @@ class Region(
435
462
  include_source=include_source,
436
463
  until=until,
437
464
  include_endpoint=include_endpoint,
465
+ offset=offset,
438
466
  **kwargs,
439
467
  )
440
468
 
441
469
  def right(
442
470
  self,
443
471
  width: Optional[float] = None,
444
- height: str = "full",
472
+ height: str = "element",
445
473
  include_source: bool = False,
446
474
  until: Optional[str] = None,
447
475
  include_endpoint: bool = True,
476
+ offset: Optional[float] = None,
448
477
  **kwargs,
449
478
  ) -> "Region":
450
479
  """
@@ -456,11 +485,18 @@ class Region(
456
485
  include_source: Whether to include this region in the result (default: False)
457
486
  until: Optional selector string to specify a right boundary element
458
487
  include_endpoint: Whether to include the boundary element in the region (default: True)
488
+ offset: Pixel offset when excluding source/endpoint (default: None, uses natural_pdf.options.layout.directional_offset)
459
489
  **kwargs: Additional parameters
460
490
 
461
491
  Returns:
462
492
  Region object representing the area to the right
463
493
  """
494
+ # Use global default if offset not provided
495
+ if offset is None:
496
+ import natural_pdf
497
+
498
+ offset = natural_pdf.options.layout.directional_offset
499
+
464
500
  return self._direction(
465
501
  direction="right",
466
502
  size=width,
@@ -468,6 +504,7 @@ class Region(
468
504
  include_source=include_source,
469
505
  until=until,
470
506
  include_endpoint=include_endpoint,
507
+ offset=offset,
471
508
  **kwargs,
472
509
  )
473
510
 
@@ -638,12 +675,10 @@ class Region(
638
675
  Returns:
639
676
  True if the element is in the region, False otherwise
640
677
  """
641
- # Check if element is on the same page
642
- if not hasattr(element, "page") or element.page != self._page:
643
- return False
678
+ # Use centralized spatial utility for consistency
679
+ from natural_pdf.utils.spatial import is_element_in_region
644
680
 
645
- return self.is_element_center_inside(element)
646
- # return self.intersects(element)
681
+ return is_element_in_region(element, self, strategy="center", check_page=True)
647
682
 
648
683
  def contains(self, element: "Element") -> bool:
649
684
  """
@@ -739,7 +774,12 @@ class Region(
739
774
  )
740
775
 
741
776
  def exclude(self):
742
- self.page.add_exclusion(self)
777
+ """
778
+ Exclude this region from text extraction and other operations.
779
+
780
+ This excludes everything within the region's bounds.
781
+ """
782
+ self.page.add_exclusion(self, method="region")
743
783
 
744
784
  def highlight(
745
785
  self,
@@ -1224,12 +1264,36 @@ class Region(
1224
1264
  selector, apply_exclusions=apply_exclusions, **kwargs
1225
1265
  )
1226
1266
  # Filter those elements to only include ones within this region
1227
- return [e for e in page_elements if self._is_element_in_region(e)]
1267
+ elements = [e for e in page_elements if self._is_element_in_region(e)]
1228
1268
  else:
1229
1269
  # Get all elements from the page
1230
1270
  page_elements = self.page.get_elements(apply_exclusions=apply_exclusions)
1231
1271
  # Filter to elements in this region
1232
- return [e for e in page_elements if self._is_element_in_region(e)]
1272
+ elements = [e for e in page_elements if self._is_element_in_region(e)]
1273
+
1274
+ # Apply boundary exclusions if this is a section with boundary settings
1275
+ if hasattr(self, "_boundary_exclusions") and self._boundary_exclusions != "both":
1276
+ excluded_ids = set()
1277
+
1278
+ if self._boundary_exclusions == "none":
1279
+ # Exclude both start and end elements
1280
+ if hasattr(self, "start_element") and self.start_element:
1281
+ excluded_ids.add(id(self.start_element))
1282
+ if hasattr(self, "end_element") and self.end_element:
1283
+ excluded_ids.add(id(self.end_element))
1284
+ elif self._boundary_exclusions == "start":
1285
+ # Exclude only end element
1286
+ if hasattr(self, "end_element") and self.end_element:
1287
+ excluded_ids.add(id(self.end_element))
1288
+ elif self._boundary_exclusions == "end":
1289
+ # Exclude only start element
1290
+ if hasattr(self, "start_element") and self.start_element:
1291
+ excluded_ids.add(id(self.start_element))
1292
+
1293
+ if excluded_ids:
1294
+ elements = [e for e in elements if id(e) not in excluded_ids]
1295
+
1296
+ return elements
1233
1297
 
1234
1298
  def extract_text(
1235
1299
  self,
@@ -1300,6 +1364,34 @@ class Region(
1300
1364
  elif debug:
1301
1365
  logger.debug(f"Region {self.bbox}: Not applying exclusions (apply_exclusions=False).")
1302
1366
 
1367
+ # Add boundary element exclusions if this is a section with boundary settings
1368
+ if hasattr(self, "_boundary_exclusions") and self._boundary_exclusions != "both":
1369
+ boundary_exclusions = []
1370
+
1371
+ if self._boundary_exclusions == "none":
1372
+ # Exclude both start and end elements
1373
+ if hasattr(self, "start_element") and self.start_element:
1374
+ boundary_exclusions.append(self.start_element)
1375
+ if hasattr(self, "end_element") and self.end_element:
1376
+ boundary_exclusions.append(self.end_element)
1377
+ elif self._boundary_exclusions == "start":
1378
+ # Exclude only end element
1379
+ if hasattr(self, "end_element") and self.end_element:
1380
+ boundary_exclusions.append(self.end_element)
1381
+ elif self._boundary_exclusions == "end":
1382
+ # Exclude only start element
1383
+ if hasattr(self, "start_element") and self.start_element:
1384
+ boundary_exclusions.append(self.start_element)
1385
+
1386
+ # Add boundary elements as exclusion regions
1387
+ for elem in boundary_exclusions:
1388
+ if hasattr(elem, "bbox"):
1389
+ exclusion_regions.append(elem)
1390
+ if debug:
1391
+ logger.debug(
1392
+ f"Adding boundary exclusion: {elem.extract_text().strip()} at {elem.bbox}"
1393
+ )
1394
+
1303
1395
  # 4. Spatially Filter Characters using Utility
1304
1396
  # Pass self as the target_region for precise polygon checks etc.
1305
1397
  filtered_chars = filter_chars_spatially(
@@ -1510,6 +1602,49 @@ class Region(
1510
1602
 
1511
1603
  logger.debug(f"Region {self.bbox}: Extracting table using method '{effective_method}'")
1512
1604
 
1605
+ # For stream method with text-based edge detection and explicit vertical lines,
1606
+ # adjust guides to ensure they fall within text bounds for proper intersection
1607
+ if (
1608
+ effective_method == "pdfplumber"
1609
+ and table_settings.get("horizontal_strategy") == "text"
1610
+ and table_settings.get("vertical_strategy") == "explicit"
1611
+ and "explicit_vertical_lines" in table_settings
1612
+ ):
1613
+
1614
+ text_elements = self.find_all("text", apply_exclusions=apply_exclusions)
1615
+ if text_elements:
1616
+ text_bounds = text_elements.merge().bbox
1617
+ text_left = text_bounds[0]
1618
+ text_right = text_bounds[2]
1619
+
1620
+ # Adjust vertical guides to fall within text bounds
1621
+ original_verticals = table_settings["explicit_vertical_lines"]
1622
+ adjusted_verticals = []
1623
+
1624
+ for v in original_verticals:
1625
+ if v < text_left:
1626
+ # Guide is left of text bounds, clip to text start
1627
+ adjusted_verticals.append(text_left)
1628
+ logger.debug(
1629
+ f"Region {self.bbox}: Adjusted left guide from {v:.1f} to {text_left:.1f}"
1630
+ )
1631
+ elif v > text_right:
1632
+ # Guide is right of text bounds, clip to text end
1633
+ adjusted_verticals.append(text_right)
1634
+ logger.debug(
1635
+ f"Region {self.bbox}: Adjusted right guide from {v:.1f} to {text_right:.1f}"
1636
+ )
1637
+ else:
1638
+ # Guide is within text bounds, keep as is
1639
+ adjusted_verticals.append(v)
1640
+
1641
+ # Update table settings with adjusted guides
1642
+ table_settings["explicit_vertical_lines"] = adjusted_verticals
1643
+ logger.debug(
1644
+ f"Region {self.bbox}: Adjusted {len(original_verticals)} guides for stream extraction. "
1645
+ f"Text bounds: {text_left:.1f}-{text_right:.1f}"
1646
+ )
1647
+
1513
1648
  # Use the selected method
1514
1649
  if effective_method == "tatr":
1515
1650
  table_rows = self._extract_table_tatr(
@@ -2765,69 +2900,31 @@ class Region(
2765
2900
  if orientation not in ["vertical", "horizontal"]:
2766
2901
  raise ValueError(f"orientation must be 'vertical' or 'horizontal', got '{orientation}'")
2767
2902
 
2768
- # Calculate the section boundaries based on orientation and include_boundaries
2769
- if orientation == "vertical":
2770
- # Use full width of the parent region for vertical sections
2771
- x0 = self.x0 # Use parent region's left boundary
2772
- x1 = self.x1 # Use parent region's right boundary
2773
-
2774
- # Determine vertical boundaries based on include_boundaries
2775
- if include_boundaries == "both":
2776
- # Include both boundary elements
2777
- top = start_element.top
2778
- bottom = end_element.bottom
2779
- elif include_boundaries == "start":
2780
- # Include start element, exclude end element
2781
- top = start_element.top
2782
- bottom = end_element.top # Stop at the top of end element
2783
- elif include_boundaries == "end":
2784
- # Exclude start element, include end element
2785
- top = start_element.bottom # Start at the bottom of start element
2786
- bottom = end_element.bottom
2787
- else: # "none"
2788
- # Exclude both boundary elements
2789
- top = start_element.bottom # Start at the bottom of start element
2790
- bottom = end_element.top # Stop at the top of end element
2791
-
2792
- # Ensure valid boundaries
2793
- if top >= bottom:
2794
- logger.debug(f"Invalid section boundaries: top={top} >= bottom={bottom}")
2795
- # Return an empty region
2796
- return Region(self.page, (x0, top, x0, top))
2797
- else: # horizontal
2798
- # Use full height of the parent region for horizontal sections
2799
- top = self.top # Use parent region's top boundary
2800
- bottom = self.bottom # Use parent region's bottom boundary
2801
-
2802
- # Determine horizontal boundaries based on include_boundaries
2803
- if include_boundaries == "both":
2804
- # Include both boundary elements
2805
- x0 = start_element.x0
2806
- x1 = end_element.x1
2807
- elif include_boundaries == "start":
2808
- # Include start element, exclude end element
2809
- x0 = start_element.x0
2810
- x1 = end_element.x0 # Stop at the left of end element
2811
- elif include_boundaries == "end":
2812
- # Exclude start element, include end element
2813
- x0 = start_element.x1 # Start at the right of start element
2814
- x1 = end_element.x1
2815
- else: # "none"
2816
- # Exclude both boundary elements
2817
- x0 = start_element.x1 # Start at the right of start element
2818
- x1 = end_element.x0 # Stop at the left of end element
2819
-
2820
- # Ensure valid boundaries
2821
- if x0 >= x1:
2822
- logger.debug(f"Invalid section boundaries: x0={x0} >= x1={x1}")
2823
- # Return an empty region
2824
- return Region(self.page, (x0, top, x0, top))
2903
+ # Use centralized section utilities
2904
+ from natural_pdf.utils.sections import calculate_section_bounds, validate_section_bounds
2905
+
2906
+ # Calculate section boundaries
2907
+ bounds = calculate_section_bounds(
2908
+ start_element=start_element,
2909
+ end_element=end_element,
2910
+ include_boundaries=include_boundaries,
2911
+ orientation=orientation,
2912
+ parent_bounds=self.bbox,
2913
+ )
2914
+
2915
+ # Validate boundaries
2916
+ if not validate_section_bounds(bounds, orientation):
2917
+ # Return an empty region at the start position
2918
+ x0, top, _, _ = bounds
2919
+ return Region(self.page, (x0, top, x0, top))
2825
2920
 
2826
2921
  # Create new region
2827
- section = Region(self.page, (x0, top, x1, bottom))
2828
- # Store the original boundary elements for reference
2922
+ section = Region(self.page, bounds)
2923
+
2924
+ # Store the original boundary elements and exclusion info
2829
2925
  section.start_element = start_element
2830
2926
  section.end_element = end_element
2927
+ section._boundary_exclusions = include_boundaries
2831
2928
 
2832
2929
  return section
2833
2930
 
@@ -2851,121 +2948,63 @@ class Region(
2851
2948
  List of Region objects representing the extracted sections
2852
2949
  """
2853
2950
  from natural_pdf.elements.element_collection import ElementCollection
2951
+ from natural_pdf.utils.sections import extract_sections_from_region
2952
+
2953
+ # Use centralized section extraction logic
2954
+ sections = extract_sections_from_region(
2955
+ region=self,
2956
+ start_elements=start_elements,
2957
+ end_elements=end_elements,
2958
+ include_boundaries=include_boundaries,
2959
+ orientation=orientation,
2960
+ )
2854
2961
 
2855
- # Process string selectors to find elements WITHIN THIS REGION
2856
- if isinstance(start_elements, str):
2857
- start_elements = self.find_all(start_elements) # Use region's find_all
2858
- if hasattr(start_elements, "elements"):
2859
- start_elements = start_elements.elements
2962
+ return ElementCollection(sections)
2860
2963
 
2861
- if isinstance(end_elements, str):
2862
- end_elements = self.find_all(end_elements) # Use region's find_all
2863
- if hasattr(end_elements, "elements"):
2864
- end_elements = end_elements.elements
2964
+ def split(self, divider, **kwargs) -> "ElementCollection[Region]":
2965
+ """
2966
+ Divide this region into sections based on the provided divider elements.
2865
2967
 
2866
- # Ensure start_elements is a list (or similar iterable)
2867
- if start_elements is None or not hasattr(start_elements, "__iter__"):
2868
- logger.warning(
2869
- "get_sections requires valid start_elements (selector or list). Returning empty."
2870
- )
2871
- return []
2872
- # Ensure end_elements is a list if provided
2873
- if end_elements is not None and not hasattr(end_elements, "__iter__"):
2874
- logger.warning("end_elements must be iterable if provided. Ignoring.")
2875
- end_elements = []
2876
- elif end_elements is None:
2877
- end_elements = []
2878
-
2879
- # If no start elements found within the region, return empty list
2880
- if not start_elements:
2881
- return []
2968
+ Args:
2969
+ divider: Elements or selector string that mark section boundaries
2970
+ **kwargs: Additional parameters passed to get_sections()
2971
+ - include_boundaries: How to include boundary elements (default: 'start')
2972
+ - orientation: 'vertical' or 'horizontal' (default: 'vertical')
2882
2973
 
2883
- # Sort all elements within the region based on orientation
2884
- all_elements_in_region = self.get_elements()
2885
- if orientation == "vertical":
2886
- all_elements_in_region.sort(key=lambda e: (e.top, e.x0))
2887
- else: # horizontal
2888
- all_elements_in_region.sort(key=lambda e: (e.x0, e.top))
2889
-
2890
- if not all_elements_in_region:
2891
- return [] # Cannot create sections if region is empty
2892
-
2893
- # Map elements to their indices in the sorted list
2894
- element_to_index = {el: i for i, el in enumerate(all_elements_in_region)}
2895
-
2896
- # Mark section boundaries using indices from the sorted list
2897
- section_boundaries = []
2898
-
2899
- # Add start element indexes
2900
- for element in start_elements:
2901
- idx = element_to_index.get(element)
2902
- if idx is not None:
2903
- section_boundaries.append({"index": idx, "element": element, "type": "start"})
2904
- # else: Element found by selector might not be geometrically in region? Log warning?
2905
-
2906
- # Add end element indexes if provided
2907
- for element in end_elements:
2908
- idx = element_to_index.get(element)
2909
- if idx is not None:
2910
- section_boundaries.append({"index": idx, "element": element, "type": "end"})
2911
-
2912
- # Sort boundaries by index (document order within the region)
2913
- section_boundaries.sort(key=lambda x: x["index"])
2914
-
2915
- # Generate sections
2916
- sections = []
2917
- current_start_boundary = None
2918
-
2919
- for i, boundary in enumerate(section_boundaries):
2920
- # If it's a start boundary and we don't have a current start
2921
- if boundary["type"] == "start" and current_start_boundary is None:
2922
- current_start_boundary = boundary
2923
-
2924
- # If it's an end boundary and we have a current start
2925
- elif boundary["type"] == "end" and current_start_boundary is not None:
2926
- # Create a section from current_start to this boundary
2927
- start_element = current_start_boundary["element"]
2928
- end_element = boundary["element"]
2929
- # Use the helper, ensuring elements are from within the region
2930
- section = self.get_section_between(
2931
- start_element, end_element, include_boundaries, orientation
2932
- )
2933
- sections.append(section)
2934
- current_start_boundary = None # Reset
2935
-
2936
- # If it's another start boundary and we have a current start (split by starts only)
2937
- elif (
2938
- boundary["type"] == "start"
2939
- and current_start_boundary is not None
2940
- and not end_elements
2941
- ):
2942
- # End the previous section just before this start boundary
2943
- start_element = current_start_boundary["element"]
2944
- # Find the element immediately preceding this start in the sorted list
2945
- end_idx = boundary["index"] - 1
2946
- if end_idx >= 0 and end_idx >= current_start_boundary["index"]:
2947
- end_element = all_elements_in_region[end_idx]
2948
- section = self.get_section_between(
2949
- start_element, end_element, include_boundaries, orientation
2974
+ Returns:
2975
+ ElementCollection of Region objects representing the sections
2976
+
2977
+ Example:
2978
+ # Split a region by bold text
2979
+ sections = region.split("text:bold")
2980
+
2981
+ # Split horizontally by vertical lines
2982
+ sections = region.split("line[orientation=vertical]", orientation="horizontal")
2983
+ """
2984
+ # Default to 'start' boundaries for split (include divider at start of each section)
2985
+ if "include_boundaries" not in kwargs:
2986
+ kwargs["include_boundaries"] = "start"
2987
+
2988
+ sections = self.get_sections(start_elements=divider, **kwargs)
2989
+
2990
+ # Add section before first divider if there's content
2991
+ if sections and hasattr(sections[0], "start_element"):
2992
+ first_divider = sections[0].start_element
2993
+ if first_divider:
2994
+ # Get all elements before the first divider
2995
+ all_elements = self.get_elements()
2996
+ if all_elements and all_elements[0] != first_divider:
2997
+ # Create section from start to just before first divider
2998
+ initial_section = self.get_section_between(
2999
+ start_element=None,
3000
+ end_element=first_divider,
3001
+ include_boundaries="none",
3002
+ orientation=kwargs.get("orientation", "vertical"),
2950
3003
  )
2951
- sections.append(section)
2952
- # Else: Section started and ended by consecutive start elements? Create empty?
2953
- # For now, just reset and start new section
2954
-
2955
- # Start the new section
2956
- current_start_boundary = boundary
2957
-
2958
- # Handle the last section if we have a current start
2959
- if current_start_boundary is not None:
2960
- start_element = current_start_boundary["element"]
2961
- # End at the last element within the region
2962
- end_element = all_elements_in_region[-1]
2963
- section = self.get_section_between(
2964
- start_element, end_element, include_boundaries, orientation
2965
- )
2966
- sections.append(section)
3004
+ if initial_section and initial_section.get_elements():
3005
+ sections.insert(0, initial_section)
2967
3006
 
2968
- return ElementCollection(sections)
3007
+ return sections
2969
3008
 
2970
3009
  def create_cells(self):
2971
3010
  """
@@ -459,9 +459,11 @@ class TextElement(Element):
459
459
  @property
460
460
  def highlight_color(self):
461
461
  """Return RGB(A) tuple of highlight colour if stored."""
462
- return self._obj.get("highlight_color") or self.metadata.get("decoration", {}).get(
463
- "highlight_color"
464
- )
462
+ # Check _obj first, being careful with falsy values like 0.0
463
+ if "highlight_color" in self._obj:
464
+ return self._obj["highlight_color"]
465
+ # Fall back to metadata
466
+ return self.metadata.get("decoration", {}).get("highlight_color")
465
467
 
466
468
  def __repr__(self) -> str:
467
469
  """String representation of the text element."""