natural-pdf 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +7 -2
- natural_pdf/analyzers/shape_detection_mixin.py +1092 -0
- natural_pdf/analyzers/text_options.py +9 -1
- natural_pdf/analyzers/text_structure.py +371 -58
- natural_pdf/classification/manager.py +3 -4
- natural_pdf/collections/pdf_collection.py +19 -39
- natural_pdf/core/element_manager.py +11 -1
- natural_pdf/core/highlighting_service.py +146 -75
- natural_pdf/core/page.py +287 -188
- natural_pdf/core/pdf.py +57 -42
- natural_pdf/elements/base.py +51 -0
- natural_pdf/elements/collections.py +362 -67
- natural_pdf/elements/line.py +5 -0
- natural_pdf/elements/region.py +396 -23
- natural_pdf/exporters/data/__init__.py +0 -0
- natural_pdf/exporters/data/pdf.ttf +0 -0
- natural_pdf/exporters/data/sRGB.icc +0 -0
- natural_pdf/exporters/hocr.py +40 -61
- natural_pdf/exporters/hocr_font.py +7 -13
- natural_pdf/exporters/original_pdf.py +10 -13
- natural_pdf/exporters/paddleocr.py +51 -11
- natural_pdf/exporters/searchable_pdf.py +0 -10
- natural_pdf/flows/__init__.py +12 -0
- natural_pdf/flows/collections.py +533 -0
- natural_pdf/flows/element.py +382 -0
- natural_pdf/flows/flow.py +216 -0
- natural_pdf/flows/region.py +458 -0
- natural_pdf/search/__init__.py +65 -52
- natural_pdf/search/lancedb_search_service.py +325 -0
- natural_pdf/search/numpy_search_service.py +255 -0
- natural_pdf/search/searchable_mixin.py +25 -71
- natural_pdf/selectors/parser.py +163 -8
- natural_pdf/widgets/viewer.py +22 -31
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/METADATA +55 -49
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/RECORD +38 -30
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/WHEEL +1 -1
- natural_pdf/search/haystack_search_service.py +0 -687
- natural_pdf/search/haystack_utils.py +0 -474
- natural_pdf/utils/tqdm_utils.py +0 -51
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/top_level.txt +0 -0
natural_pdf/elements/line.py
CHANGED
@@ -28,6 +28,11 @@ class LineElement(Element):
|
|
28
28
|
"""
|
29
29
|
super().__init__(obj, page)
|
30
30
|
|
31
|
+
@property
|
32
|
+
def source(self) -> Optional[str]:
|
33
|
+
"""Get the source of this line element (e.g., 'pdf', 'detected')."""
|
34
|
+
return self._obj.get("source")
|
35
|
+
|
31
36
|
@property
|
32
37
|
def type(self) -> str:
|
33
38
|
"""Element type."""
|
natural_pdf/elements/region.py
CHANGED
@@ -13,6 +13,7 @@ from natural_pdf.classification.manager import ClassificationManager # Keep for
|
|
13
13
|
from natural_pdf.classification.mixin import ClassificationMixin
|
14
14
|
from natural_pdf.elements.base import DirectionalMixin
|
15
15
|
from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
|
16
|
+
from natural_pdf.elements.text import TextElement # ADDED IMPORT
|
16
17
|
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
|
17
18
|
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
18
19
|
from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
@@ -20,11 +21,12 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
|
20
21
|
# Import new utils
|
21
22
|
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
22
23
|
|
23
|
-
|
24
|
-
from natural_pdf.utils.tqdm_utils import get_tqdm
|
25
|
-
|
24
|
+
from tqdm.auto import tqdm
|
26
25
|
# --- End Classification Imports --- #
|
27
26
|
|
27
|
+
# --- Shape Detection Mixin --- #
|
28
|
+
from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
|
29
|
+
# --- End Shape Detection Mixin --- #
|
28
30
|
|
29
31
|
if TYPE_CHECKING:
|
30
32
|
# --- NEW: Add Image type hint for classification --- #
|
@@ -33,6 +35,7 @@ if TYPE_CHECKING:
|
|
33
35
|
from natural_pdf.core.page import Page
|
34
36
|
from natural_pdf.elements.collections import ElementCollection
|
35
37
|
from natural_pdf.elements.text import TextElement
|
38
|
+
from natural_pdf.elements.base import Element # Added for type hint
|
36
39
|
|
37
40
|
# Import OCRManager conditionally to avoid circular imports
|
38
41
|
try:
|
@@ -44,7 +47,7 @@ except ImportError:
|
|
44
47
|
logger = logging.getLogger(__name__)
|
45
48
|
|
46
49
|
|
47
|
-
class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
50
|
+
class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
48
51
|
"""
|
49
52
|
Represents a rectangular region on a page.
|
50
53
|
"""
|
@@ -720,14 +723,36 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
720
723
|
Returns:
|
721
724
|
PIL Image of just this region
|
722
725
|
"""
|
726
|
+
# Handle the case where user wants the cropped region to have a specific width
|
727
|
+
page_kwargs = kwargs.copy()
|
728
|
+
effective_resolution = resolution # Start with the provided resolution
|
729
|
+
|
730
|
+
if crop_only and 'width' in kwargs:
|
731
|
+
target_width = kwargs['width']
|
732
|
+
# Calculate what resolution is needed to make the region crop have target_width
|
733
|
+
region_width_points = self.width # Region width in PDF points
|
734
|
+
|
735
|
+
if region_width_points > 0:
|
736
|
+
# Calculate scale needed: target_width / region_width_points
|
737
|
+
required_scale = target_width / region_width_points
|
738
|
+
# Convert scale to resolution: scale * 72 DPI
|
739
|
+
effective_resolution = required_scale * 72.0
|
740
|
+
page_kwargs.pop('width') # Remove width parameter to avoid conflicts
|
741
|
+
logger.debug(f"Region {self.bbox}: Calculated required resolution {effective_resolution:.1f} DPI for region crop width {target_width}")
|
742
|
+
else:
|
743
|
+
logger.warning(f"Region {self.bbox}: Invalid region width {region_width_points}, using original resolution")
|
744
|
+
|
723
745
|
# First get the full page image with highlights if requested
|
724
746
|
page_image = self._page.to_image(
|
725
|
-
scale=scale, resolution=
|
747
|
+
scale=scale, resolution=effective_resolution, include_highlights=include_highlights, **page_kwargs
|
726
748
|
)
|
727
749
|
|
728
|
-
# Calculate the
|
729
|
-
|
730
|
-
|
750
|
+
# Calculate the actual scale factor used by the page image
|
751
|
+
if page_image.width > 0 and self._page.width > 0:
|
752
|
+
scale_factor = page_image.width / self._page.width
|
753
|
+
else:
|
754
|
+
# Fallback to resolution-based calculation if dimensions are invalid
|
755
|
+
scale_factor = resolution / 72.0
|
731
756
|
|
732
757
|
# Apply scaling to the coordinates
|
733
758
|
x0 = int(self.x0 * scale_factor)
|
@@ -772,6 +797,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
772
797
|
# Add a default color for standalone show
|
773
798
|
color: Optional[Union[Tuple, str]] = "blue",
|
774
799
|
label: Optional[str] = None,
|
800
|
+
width: Optional[int] = None, # Add width parameter
|
775
801
|
) -> "Image.Image":
|
776
802
|
"""
|
777
803
|
Show the page with just this region highlighted temporarily.
|
@@ -782,6 +808,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
782
808
|
legend_position: Position of the legend
|
783
809
|
color: Color to highlight this region (default: blue)
|
784
810
|
label: Optional label for this region in the legend
|
811
|
+
width: Optional width for the output image in pixels
|
785
812
|
|
786
813
|
Returns:
|
787
814
|
PIL Image of the page with only this region highlighted
|
@@ -812,6 +839,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
812
839
|
page_index=self._page.index,
|
813
840
|
temporary_highlights=[temp_highlight_data],
|
814
841
|
scale=scale,
|
842
|
+
width=width, # Pass the width parameter
|
815
843
|
labels=labels,
|
816
844
|
legend_position=legend_position,
|
817
845
|
)
|
@@ -871,6 +899,233 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
871
899
|
image.save(filename)
|
872
900
|
return self
|
873
901
|
|
902
|
+
def trim(self, padding: int = 1, threshold: float = 0.95, resolution: float = 150, pre_shrink: float = 0.5) -> "Region":
|
903
|
+
"""
|
904
|
+
Trim visual whitespace from the edges of this region.
|
905
|
+
|
906
|
+
Similar to Python's string .strip() method, but for visual whitespace in the region image.
|
907
|
+
Uses pixel analysis to detect rows/columns that are predominantly whitespace.
|
908
|
+
|
909
|
+
Args:
|
910
|
+
padding: Number of pixels to keep as padding after trimming (default: 1)
|
911
|
+
threshold: Threshold for considering a row/column as whitespace (0.0-1.0, default: 0.95)
|
912
|
+
Higher values mean more strict whitespace detection.
|
913
|
+
E.g., 0.95 means if 95% of pixels in a row/column are white, consider it whitespace.
|
914
|
+
resolution: Resolution for image rendering in DPI (default: 150)
|
915
|
+
pre_shrink: Amount to shrink region before trimming, then expand back after (default: 0.5)
|
916
|
+
This helps avoid detecting box borders/slivers as content.
|
917
|
+
|
918
|
+
Returns:
|
919
|
+
New Region with visual whitespace trimmed from all edges
|
920
|
+
|
921
|
+
Example:
|
922
|
+
# Basic trimming with 1 pixel padding and 0.5px pre-shrink
|
923
|
+
trimmed = region.trim()
|
924
|
+
|
925
|
+
# More aggressive trimming with no padding and no pre-shrink
|
926
|
+
tight = region.trim(padding=0, threshold=0.9, pre_shrink=0)
|
927
|
+
|
928
|
+
# Conservative trimming with more padding
|
929
|
+
loose = region.trim(padding=3, threshold=0.98)
|
930
|
+
"""
|
931
|
+
# Pre-shrink the region to avoid box slivers
|
932
|
+
work_region = self.expand(left=-pre_shrink, right=-pre_shrink, top=-pre_shrink, bottom=-pre_shrink) if pre_shrink > 0 else self
|
933
|
+
|
934
|
+
# Get the region image
|
935
|
+
image = work_region.to_image(resolution=resolution, crop_only=True, include_highlights=False)
|
936
|
+
|
937
|
+
if image is None:
|
938
|
+
logger.warning(f"Region {self.bbox}: Could not generate image for trimming. Returning original region.")
|
939
|
+
return self
|
940
|
+
|
941
|
+
# Convert to grayscale for easier analysis
|
942
|
+
import numpy as np
|
943
|
+
|
944
|
+
# Convert PIL image to numpy array
|
945
|
+
img_array = np.array(image.convert('L')) # Convert to grayscale
|
946
|
+
height, width = img_array.shape
|
947
|
+
|
948
|
+
if height == 0 or width == 0:
|
949
|
+
logger.warning(f"Region {self.bbox}: Image has zero dimensions. Returning original region.")
|
950
|
+
return self
|
951
|
+
|
952
|
+
# Normalize pixel values to 0-1 range (255 = white = 1.0, 0 = black = 0.0)
|
953
|
+
normalized = img_array.astype(np.float32) / 255.0
|
954
|
+
|
955
|
+
# Find content boundaries by analyzing row and column averages
|
956
|
+
|
957
|
+
# Analyze rows (horizontal strips) to find top and bottom boundaries
|
958
|
+
row_averages = np.mean(normalized, axis=1) # Average each row
|
959
|
+
content_rows = row_averages < threshold # True where there's content (not whitespace)
|
960
|
+
|
961
|
+
# Find first and last rows with content
|
962
|
+
content_row_indices = np.where(content_rows)[0]
|
963
|
+
if len(content_row_indices) == 0:
|
964
|
+
# No content found, return a minimal region at the center
|
965
|
+
logger.warning(f"Region {self.bbox}: No content detected during trimming. Returning center point.")
|
966
|
+
center_x = (self.x0 + self.x1) / 2
|
967
|
+
center_y = (self.top + self.bottom) / 2
|
968
|
+
return Region(self.page, (center_x, center_y, center_x, center_y))
|
969
|
+
|
970
|
+
top_content_row = max(0, content_row_indices[0] - padding)
|
971
|
+
bottom_content_row = min(height - 1, content_row_indices[-1] + padding)
|
972
|
+
|
973
|
+
# Analyze columns (vertical strips) to find left and right boundaries
|
974
|
+
col_averages = np.mean(normalized, axis=0) # Average each column
|
975
|
+
content_cols = col_averages < threshold # True where there's content
|
976
|
+
|
977
|
+
content_col_indices = np.where(content_cols)[0]
|
978
|
+
if len(content_col_indices) == 0:
|
979
|
+
# No content found in columns either
|
980
|
+
logger.warning(f"Region {self.bbox}: No column content detected during trimming. Returning center point.")
|
981
|
+
center_x = (self.x0 + self.x1) / 2
|
982
|
+
center_y = (self.top + self.bottom) / 2
|
983
|
+
return Region(self.page, (center_x, center_y, center_x, center_y))
|
984
|
+
|
985
|
+
left_content_col = max(0, content_col_indices[0] - padding)
|
986
|
+
right_content_col = min(width - 1, content_col_indices[-1] + padding)
|
987
|
+
|
988
|
+
# Convert trimmed pixel coordinates back to PDF coordinates
|
989
|
+
scale_factor = resolution / 72.0 # Scale factor used in to_image()
|
990
|
+
|
991
|
+
# Calculate new PDF coordinates and ensure they are Python floats
|
992
|
+
trimmed_x0 = float(work_region.x0 + (left_content_col / scale_factor))
|
993
|
+
trimmed_top = float(work_region.top + (top_content_row / scale_factor))
|
994
|
+
trimmed_x1 = float(work_region.x0 + ((right_content_col + 1) / scale_factor)) # +1 because we want inclusive right edge
|
995
|
+
trimmed_bottom = float(work_region.top + ((bottom_content_row + 1) / scale_factor)) # +1 because we want inclusive bottom edge
|
996
|
+
|
997
|
+
# Ensure the trimmed region doesn't exceed the work region boundaries
|
998
|
+
final_x0 = max(work_region.x0, trimmed_x0)
|
999
|
+
final_top = max(work_region.top, trimmed_top)
|
1000
|
+
final_x1 = min(work_region.x1, trimmed_x1)
|
1001
|
+
final_bottom = min(work_region.bottom, trimmed_bottom)
|
1002
|
+
|
1003
|
+
# Ensure valid coordinates (width > 0, height > 0)
|
1004
|
+
if final_x1 <= final_x0 or final_bottom <= final_top:
|
1005
|
+
logger.warning(f"Region {self.bbox}: Trimming resulted in invalid dimensions. Returning original region.")
|
1006
|
+
return self
|
1007
|
+
|
1008
|
+
# Create the trimmed region
|
1009
|
+
trimmed_region = Region(self.page, (final_x0, final_top, final_x1, final_bottom))
|
1010
|
+
|
1011
|
+
# Expand back by the pre_shrink amount to restore original positioning
|
1012
|
+
if pre_shrink > 0:
|
1013
|
+
trimmed_region = trimmed_region.expand(left=pre_shrink, right=pre_shrink, top=pre_shrink, bottom=pre_shrink)
|
1014
|
+
|
1015
|
+
# Copy relevant metadata
|
1016
|
+
trimmed_region.region_type = self.region_type
|
1017
|
+
trimmed_region.normalized_type = self.normalized_type
|
1018
|
+
trimmed_region.confidence = self.confidence
|
1019
|
+
trimmed_region.model = self.model
|
1020
|
+
trimmed_region.name = self.name
|
1021
|
+
trimmed_region.label = self.label
|
1022
|
+
trimmed_region.source = "trimmed" # Indicate this is a derived region
|
1023
|
+
trimmed_region.parent_region = self
|
1024
|
+
|
1025
|
+
logger.debug(f"Region {self.bbox}: Trimmed to {trimmed_region.bbox} (padding={padding}, threshold={threshold}, pre_shrink={pre_shrink})")
|
1026
|
+
return trimmed_region
|
1027
|
+
|
1028
|
+
def clip(
|
1029
|
+
self,
|
1030
|
+
obj: Optional[Any] = None,
|
1031
|
+
left: Optional[float] = None,
|
1032
|
+
top: Optional[float] = None,
|
1033
|
+
right: Optional[float] = None,
|
1034
|
+
bottom: Optional[float] = None,
|
1035
|
+
) -> "Region":
|
1036
|
+
"""
|
1037
|
+
Clip this region to specific bounds, either from another object with bbox or explicit coordinates.
|
1038
|
+
|
1039
|
+
The clipped region will be constrained to not exceed the specified boundaries.
|
1040
|
+
You can provide either an object with bounding box properties, specific coordinates, or both.
|
1041
|
+
When both are provided, explicit coordinates take precedence.
|
1042
|
+
|
1043
|
+
Args:
|
1044
|
+
obj: Optional object with bbox properties (Region, Element, TextElement, etc.)
|
1045
|
+
left: Optional left boundary (x0) to clip to
|
1046
|
+
top: Optional top boundary to clip to
|
1047
|
+
right: Optional right boundary (x1) to clip to
|
1048
|
+
bottom: Optional bottom boundary to clip to
|
1049
|
+
|
1050
|
+
Returns:
|
1051
|
+
New Region with bounds clipped to the specified constraints
|
1052
|
+
|
1053
|
+
Examples:
|
1054
|
+
# Clip to another region's bounds
|
1055
|
+
clipped = region.clip(container_region)
|
1056
|
+
|
1057
|
+
# Clip to any element's bounds
|
1058
|
+
clipped = region.clip(text_element)
|
1059
|
+
|
1060
|
+
# Clip to specific coordinates
|
1061
|
+
clipped = region.clip(left=100, right=400)
|
1062
|
+
|
1063
|
+
# Mix object bounds with specific overrides
|
1064
|
+
clipped = region.clip(obj=container, bottom=page.height/2)
|
1065
|
+
"""
|
1066
|
+
from natural_pdf.elements.base import extract_bbox
|
1067
|
+
|
1068
|
+
# Start with current region bounds
|
1069
|
+
clip_x0 = self.x0
|
1070
|
+
clip_top = self.top
|
1071
|
+
clip_x1 = self.x1
|
1072
|
+
clip_bottom = self.bottom
|
1073
|
+
|
1074
|
+
# Apply object constraints if provided
|
1075
|
+
if obj is not None:
|
1076
|
+
obj_bbox = extract_bbox(obj)
|
1077
|
+
if obj_bbox is not None:
|
1078
|
+
obj_x0, obj_top, obj_x1, obj_bottom = obj_bbox
|
1079
|
+
# Constrain to the intersection with the provided object
|
1080
|
+
clip_x0 = max(clip_x0, obj_x0)
|
1081
|
+
clip_top = max(clip_top, obj_top)
|
1082
|
+
clip_x1 = min(clip_x1, obj_x1)
|
1083
|
+
clip_bottom = min(clip_bottom, obj_bottom)
|
1084
|
+
else:
|
1085
|
+
logger.warning(
|
1086
|
+
f"Region {self.bbox}: Cannot extract bbox from clipping object {type(obj)}. "
|
1087
|
+
"Object must have bbox property or x0/top/x1/bottom attributes."
|
1088
|
+
)
|
1089
|
+
|
1090
|
+
# Apply explicit coordinate constraints (these take precedence)
|
1091
|
+
if left is not None:
|
1092
|
+
clip_x0 = max(clip_x0, left)
|
1093
|
+
if top is not None:
|
1094
|
+
clip_top = max(clip_top, top)
|
1095
|
+
if right is not None:
|
1096
|
+
clip_x1 = min(clip_x1, right)
|
1097
|
+
if bottom is not None:
|
1098
|
+
clip_bottom = min(clip_bottom, bottom)
|
1099
|
+
|
1100
|
+
# Ensure valid coordinates
|
1101
|
+
if clip_x1 <= clip_x0 or clip_bottom <= clip_top:
|
1102
|
+
logger.warning(
|
1103
|
+
f"Region {self.bbox}: Clipping resulted in invalid dimensions "
|
1104
|
+
f"({clip_x0}, {clip_top}, {clip_x1}, {clip_bottom}). Returning minimal region."
|
1105
|
+
)
|
1106
|
+
# Return a minimal region at the clip area's top-left
|
1107
|
+
return Region(self.page, (clip_x0, clip_top, clip_x0, clip_top))
|
1108
|
+
|
1109
|
+
# Create the clipped region
|
1110
|
+
clipped_region = Region(self.page, (clip_x0, clip_top, clip_x1, clip_bottom))
|
1111
|
+
|
1112
|
+
# Copy relevant metadata
|
1113
|
+
clipped_region.region_type = self.region_type
|
1114
|
+
clipped_region.normalized_type = self.normalized_type
|
1115
|
+
clipped_region.confidence = self.confidence
|
1116
|
+
clipped_region.model = self.model
|
1117
|
+
clipped_region.name = self.name
|
1118
|
+
clipped_region.label = self.label
|
1119
|
+
clipped_region.source = "clipped" # Indicate this is a derived region
|
1120
|
+
clipped_region.parent_region = self
|
1121
|
+
|
1122
|
+
logger.debug(
|
1123
|
+
f"Region {self.bbox}: Clipped to {clipped_region.bbox} "
|
1124
|
+
f"(constraints: obj={type(obj).__name__ if obj else None}, "
|
1125
|
+
f"left={left}, top={top}, right={right}, bottom={bottom})"
|
1126
|
+
)
|
1127
|
+
return clipped_region
|
1128
|
+
|
874
1129
|
def get_elements(
|
875
1130
|
self, selector: Optional[str] = None, apply_exclusions=True, **kwargs
|
876
1131
|
) -> List["Element"]:
|
@@ -1258,8 +1513,6 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1258
1513
|
unique_tops = cluster_coords(tops)
|
1259
1514
|
unique_lefts = cluster_coords(lefts)
|
1260
1515
|
|
1261
|
-
# --- Setup tqdm --- #
|
1262
|
-
tqdm = get_tqdm()
|
1263
1516
|
# Determine iterable for tqdm
|
1264
1517
|
cell_iterator = cell_dicts
|
1265
1518
|
if show_progress:
|
@@ -1333,6 +1586,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1333
1586
|
self,
|
1334
1587
|
*,
|
1335
1588
|
text: str,
|
1589
|
+
contains: str = "all",
|
1336
1590
|
apply_exclusions: bool = True,
|
1337
1591
|
regex: bool = False,
|
1338
1592
|
case: bool = True,
|
@@ -1344,6 +1598,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1344
1598
|
self,
|
1345
1599
|
selector: str,
|
1346
1600
|
*,
|
1601
|
+
contains: str = "all",
|
1347
1602
|
apply_exclusions: bool = True,
|
1348
1603
|
regex: bool = False,
|
1349
1604
|
case: bool = True,
|
@@ -1355,6 +1610,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1355
1610
|
selector: Optional[str] = None, # Now optional
|
1356
1611
|
*,
|
1357
1612
|
text: Optional[str] = None, # New text parameter
|
1613
|
+
contains: str = "all", # New parameter for containment behavior
|
1358
1614
|
apply_exclusions: bool = True,
|
1359
1615
|
regex: bool = False,
|
1360
1616
|
case: bool = True,
|
@@ -1368,6 +1624,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1368
1624
|
Args:
|
1369
1625
|
selector: CSS-like selector string.
|
1370
1626
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
1627
|
+
contains: How to determine if elements are inside: 'all' (fully inside),
|
1628
|
+
'any' (any overlap), or 'center' (center point inside).
|
1629
|
+
(default: "all")
|
1371
1630
|
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
1372
1631
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
1373
1632
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -1380,6 +1639,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1380
1639
|
elements = self.find_all(
|
1381
1640
|
selector=selector,
|
1382
1641
|
text=text,
|
1642
|
+
contains=contains,
|
1383
1643
|
apply_exclusions=apply_exclusions,
|
1384
1644
|
regex=regex,
|
1385
1645
|
case=case,
|
@@ -1392,6 +1652,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1392
1652
|
self,
|
1393
1653
|
*,
|
1394
1654
|
text: str,
|
1655
|
+
contains: str = "all",
|
1395
1656
|
apply_exclusions: bool = True,
|
1396
1657
|
regex: bool = False,
|
1397
1658
|
case: bool = True,
|
@@ -1403,6 +1664,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1403
1664
|
self,
|
1404
1665
|
selector: str,
|
1405
1666
|
*,
|
1667
|
+
contains: str = "all",
|
1406
1668
|
apply_exclusions: bool = True,
|
1407
1669
|
regex: bool = False,
|
1408
1670
|
case: bool = True,
|
@@ -1414,6 +1676,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1414
1676
|
selector: Optional[str] = None, # Now optional
|
1415
1677
|
*,
|
1416
1678
|
text: Optional[str] = None, # New text parameter
|
1679
|
+
contains: str = "all", # New parameter to control inside/overlap behavior
|
1417
1680
|
apply_exclusions: bool = True,
|
1418
1681
|
regex: bool = False,
|
1419
1682
|
case: bool = True,
|
@@ -1427,6 +1690,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1427
1690
|
Args:
|
1428
1691
|
selector: CSS-like selector string.
|
1429
1692
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
1693
|
+
contains: How to determine if elements are inside: 'all' (fully inside),
|
1694
|
+
'any' (any overlap), or 'center' (center point inside).
|
1695
|
+
(default: "all")
|
1430
1696
|
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
1431
1697
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
1432
1698
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -1442,6 +1708,10 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1442
1708
|
if selector is None and text is None:
|
1443
1709
|
raise ValueError("Provide either 'selector' or 'text'.")
|
1444
1710
|
|
1711
|
+
# Validate contains parameter
|
1712
|
+
if contains not in ["all", "any", "center"]:
|
1713
|
+
raise ValueError(f"Invalid contains value: {contains}. Must be 'all', 'any', or 'center'")
|
1714
|
+
|
1445
1715
|
# Construct selector if 'text' is provided
|
1446
1716
|
effective_selector = ""
|
1447
1717
|
if text is not None:
|
@@ -1481,22 +1751,34 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1481
1751
|
# Let the page handle its exclusion logic if needed
|
1482
1752
|
potential_elements = self.page.find_all(
|
1483
1753
|
selector=effective_selector,
|
1484
|
-
apply_exclusions=
|
1754
|
+
apply_exclusions=apply_exclusions,
|
1485
1755
|
regex=regex,
|
1486
1756
|
case=case,
|
1487
1757
|
**kwargs,
|
1488
1758
|
)
|
1489
1759
|
|
1490
|
-
# Filter these elements
|
1760
|
+
# Filter these elements based on the specified containment method
|
1491
1761
|
region_bbox = self.bbox
|
1492
|
-
matching_elements = [
|
1493
|
-
|
1494
|
-
|
1495
|
-
|
1496
|
-
|
1497
|
-
|
1498
|
-
|
1499
|
-
|
1762
|
+
matching_elements = []
|
1763
|
+
|
1764
|
+
if contains == "all": # Fully inside (strict)
|
1765
|
+
matching_elements = [
|
1766
|
+
el for el in potential_elements
|
1767
|
+
if el.x0 >= region_bbox[0]
|
1768
|
+
and el.top >= region_bbox[1]
|
1769
|
+
and el.x1 <= region_bbox[2]
|
1770
|
+
and el.bottom <= region_bbox[3]
|
1771
|
+
]
|
1772
|
+
elif contains == "any": # Any overlap
|
1773
|
+
matching_elements = [
|
1774
|
+
el for el in potential_elements
|
1775
|
+
if self.intersects(el)
|
1776
|
+
]
|
1777
|
+
elif contains == "center": # Center point inside
|
1778
|
+
matching_elements = [
|
1779
|
+
el for el in potential_elements
|
1780
|
+
if self.is_element_center_inside(el)
|
1781
|
+
]
|
1500
1782
|
|
1501
1783
|
return ElementCollection(matching_elements)
|
1502
1784
|
|
@@ -1745,7 +2027,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1745
2027
|
|
1746
2028
|
def get_sections(
|
1747
2029
|
self, start_elements=None, end_elements=None, boundary_inclusion="both"
|
1748
|
-
) ->
|
2030
|
+
) -> "ElementCollection[Region]":
|
1749
2031
|
"""
|
1750
2032
|
Get sections within this region based on start/end elements.
|
1751
2033
|
|
@@ -1865,7 +2147,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1865
2147
|
section = self.get_section_between(start_element, end_element, boundary_inclusion)
|
1866
2148
|
sections.append(section)
|
1867
2149
|
|
1868
|
-
return sections
|
2150
|
+
return ElementCollection(sections)
|
1869
2151
|
|
1870
2152
|
def create_cells(self):
|
1871
2153
|
"""
|
@@ -1988,7 +2270,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1988
2270
|
from natural_pdf.qa.document_qa import get_qa_engine
|
1989
2271
|
except ImportError:
|
1990
2272
|
logger.error(
|
1991
|
-
"Question answering requires optional dependencies. Install with `pip install natural-pdf[
|
2273
|
+
"Question answering requires optional dependencies. Install with `pip install natural-pdf[core-ml]`"
|
1992
2274
|
)
|
1993
2275
|
return {
|
1994
2276
|
"answer": None,
|
@@ -2381,3 +2663,94 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
2381
2663
|
return ElementCollection(cell_regions)
|
2382
2664
|
|
2383
2665
|
# --- END NEW METHOD ---
|
2666
|
+
|
2667
|
+
def to_text_element(
|
2668
|
+
self,
|
2669
|
+
text_content: Optional[Union[str, Callable[["Region"], Optional[str]]]] = None,
|
2670
|
+
source_label: str = "derived_from_region",
|
2671
|
+
object_type: str = "word", # Or "char", controls how it's categorized
|
2672
|
+
default_font_size: float = 10.0,
|
2673
|
+
default_font_name: str = "RegionContent",
|
2674
|
+
confidence: Optional[float] = None, # Allow overriding confidence
|
2675
|
+
add_to_page: bool = False # NEW: Option to add to page
|
2676
|
+
) -> "TextElement":
|
2677
|
+
"""
|
2678
|
+
Creates a new TextElement object based on this region's geometry.
|
2679
|
+
|
2680
|
+
The text for the new TextElement can be provided directly,
|
2681
|
+
generated by a callback function, or left as None.
|
2682
|
+
|
2683
|
+
Args:
|
2684
|
+
text_content:
|
2685
|
+
- If a string, this will be the text of the new TextElement.
|
2686
|
+
- If a callable, it will be called with this region instance
|
2687
|
+
and its return value (a string or None) will be the text.
|
2688
|
+
- If None (default), the TextElement's text will be None.
|
2689
|
+
source_label: The 'source' attribute for the new TextElement.
|
2690
|
+
object_type: The 'object_type' for the TextElement's data dict
|
2691
|
+
(e.g., "word", "char").
|
2692
|
+
default_font_size: Placeholder font size if text is generated.
|
2693
|
+
default_font_name: Placeholder font name if text is generated.
|
2694
|
+
confidence: Confidence score for the text. If text_content is None,
|
2695
|
+
defaults to 0.0. If text is provided/generated, defaults to 1.0
|
2696
|
+
unless specified.
|
2697
|
+
add_to_page: If True, the created TextElement will be added to the
|
2698
|
+
region's parent page. (Default: False)
|
2699
|
+
|
2700
|
+
Returns:
|
2701
|
+
A new TextElement instance.
|
2702
|
+
|
2703
|
+
Raises:
|
2704
|
+
ValueError: If the region does not have a valid 'page' attribute.
|
2705
|
+
"""
|
2706
|
+
actual_text: Optional[str] = None
|
2707
|
+
if isinstance(text_content, str):
|
2708
|
+
actual_text = text_content
|
2709
|
+
elif callable(text_content):
|
2710
|
+
try:
|
2711
|
+
actual_text = text_content(self)
|
2712
|
+
except Exception as e:
|
2713
|
+
logger.error(f"Error executing text_content callback for region {self.bbox}: {e}", exc_info=True)
|
2714
|
+
actual_text = None # Ensure actual_text is None on error
|
2715
|
+
|
2716
|
+
final_confidence = confidence
|
2717
|
+
if final_confidence is None:
|
2718
|
+
final_confidence = 1.0 if actual_text is not None and actual_text.strip() else 0.0
|
2719
|
+
|
2720
|
+
if not hasattr(self, 'page') or self.page is None:
|
2721
|
+
raise ValueError("Region must have a valid 'page' attribute to create a TextElement.")
|
2722
|
+
|
2723
|
+
elem_data = {
|
2724
|
+
"text": actual_text,
|
2725
|
+
"x0": self.x0,
|
2726
|
+
"top": self.top,
|
2727
|
+
"x1": self.x1,
|
2728
|
+
"bottom": self.bottom,
|
2729
|
+
"width": self.width,
|
2730
|
+
"height": self.height,
|
2731
|
+
"object_type": object_type,
|
2732
|
+
"page_number": self.page.page_number,
|
2733
|
+
"stroking_color": getattr(self, 'stroking_color', (0,0,0)),
|
2734
|
+
"non_stroking_color": getattr(self, 'non_stroking_color', (0,0,0)),
|
2735
|
+
"fontname": default_font_name,
|
2736
|
+
"size": default_font_size,
|
2737
|
+
"upright": True,
|
2738
|
+
"direction": 1,
|
2739
|
+
"adv": self.width,
|
2740
|
+
"source": source_label,
|
2741
|
+
"confidence": final_confidence,
|
2742
|
+
"_char_dicts": []
|
2743
|
+
}
|
2744
|
+
text_element = TextElement(elem_data, self.page)
|
2745
|
+
|
2746
|
+
if add_to_page:
|
2747
|
+
if hasattr(self.page, '_element_mgr') and self.page._element_mgr is not None:
|
2748
|
+
add_as_type = "words" if object_type == "word" else "chars" if object_type == "char" else object_type
|
2749
|
+
# REMOVED try-except block around add_element
|
2750
|
+
self.page._element_mgr.add_element(text_element, element_type=add_as_type)
|
2751
|
+
logger.debug(f"TextElement created from region {self.bbox} and added to page {self.page.page_number} as {add_as_type}.")
|
2752
|
+
else:
|
2753
|
+
page_num_str = str(self.page.page_number) if hasattr(self.page, 'page_number') else 'N/A'
|
2754
|
+
logger.warning(f"Cannot add TextElement to page: Page {page_num_str} for region {self.bbox} is missing '_element_mgr'.")
|
2755
|
+
|
2756
|
+
return text_element
|
File without changes
|
Binary file
|
Binary file
|