natural-pdf 0.1.15__py3-none-any.whl → 0.1.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +31 -0
- natural_pdf/analyzers/layout/gemini.py +137 -162
- natural_pdf/analyzers/layout/layout_manager.py +9 -5
- natural_pdf/analyzers/layout/layout_options.py +77 -7
- natural_pdf/analyzers/layout/paddle.py +318 -165
- natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
- natural_pdf/analyzers/shape_detection_mixin.py +770 -405
- natural_pdf/classification/mixin.py +2 -8
- natural_pdf/collections/pdf_collection.py +25 -30
- natural_pdf/core/highlighting_service.py +47 -32
- natural_pdf/core/page.py +117 -75
- natural_pdf/core/pdf.py +19 -22
- natural_pdf/elements/base.py +9 -9
- natural_pdf/elements/collections.py +105 -50
- natural_pdf/elements/region.py +200 -126
- natural_pdf/exporters/paddleocr.py +38 -13
- natural_pdf/flows/__init__.py +3 -3
- natural_pdf/flows/collections.py +303 -132
- natural_pdf/flows/element.py +277 -132
- natural_pdf/flows/flow.py +33 -16
- natural_pdf/flows/region.py +142 -79
- natural_pdf/ocr/engine_doctr.py +37 -4
- natural_pdf/ocr/engine_easyocr.py +23 -3
- natural_pdf/ocr/engine_paddle.py +281 -30
- natural_pdf/ocr/engine_surya.py +8 -3
- natural_pdf/ocr/ocr_manager.py +75 -76
- natural_pdf/ocr/ocr_options.py +52 -87
- natural_pdf/search/__init__.py +25 -12
- natural_pdf/search/lancedb_search_service.py +91 -54
- natural_pdf/search/numpy_search_service.py +86 -65
- natural_pdf/search/searchable_mixin.py +2 -2
- natural_pdf/selectors/parser.py +125 -81
- natural_pdf/widgets/__init__.py +1 -1
- natural_pdf/widgets/viewer.py +205 -449
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/METADATA +27 -45
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/RECORD +39 -38
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/top_level.txt +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -5,15 +5,19 @@ from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to
|
|
5
5
|
|
6
6
|
# New Imports
|
7
7
|
from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
|
8
|
+
from tqdm.auto import tqdm
|
8
9
|
|
9
10
|
from natural_pdf.analyzers.layout.pdfplumber_table_finder import find_text_based_tables
|
11
|
+
|
12
|
+
# --- Shape Detection Mixin --- #
|
13
|
+
from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
|
10
14
|
from natural_pdf.classification.manager import ClassificationManager # Keep for type hint
|
11
15
|
|
12
16
|
# --- Classification Imports --- #
|
13
17
|
from natural_pdf.classification.mixin import ClassificationMixin
|
14
18
|
from natural_pdf.elements.base import DirectionalMixin
|
19
|
+
from natural_pdf.elements.text import TextElement # ADDED IMPORT
|
15
20
|
from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
|
16
|
-
from natural_pdf.elements.text import TextElement # ADDED IMPORT
|
17
21
|
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
|
18
22
|
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
19
23
|
from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
@@ -21,21 +25,19 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
|
21
25
|
# Import new utils
|
22
26
|
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
23
27
|
|
24
|
-
from tqdm.auto import tqdm
|
25
28
|
# --- End Classification Imports --- #
|
26
29
|
|
27
|
-
|
28
|
-
|
29
|
-
# --- End Shape Detection Mixin --- #
|
30
|
+
|
31
|
+
# --- End Shape Detection Mixin --- #
|
30
32
|
|
31
33
|
if TYPE_CHECKING:
|
32
34
|
# --- NEW: Add Image type hint for classification --- #
|
33
35
|
from PIL.Image import Image
|
34
36
|
|
35
37
|
from natural_pdf.core.page import Page
|
38
|
+
from natural_pdf.elements.base import Element # Added for type hint
|
36
39
|
from natural_pdf.elements.collections import ElementCollection
|
37
40
|
from natural_pdf.elements.text import TextElement
|
38
|
-
from natural_pdf.elements.base import Element # Added for type hint
|
39
41
|
|
40
42
|
# Import OCRManager conditionally to avoid circular imports
|
41
43
|
try:
|
@@ -726,25 +728,32 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
726
728
|
# Handle the case where user wants the cropped region to have a specific width
|
727
729
|
page_kwargs = kwargs.copy()
|
728
730
|
effective_resolution = resolution # Start with the provided resolution
|
729
|
-
|
730
|
-
if crop_only and
|
731
|
-
target_width = kwargs[
|
731
|
+
|
732
|
+
if crop_only and "width" in kwargs:
|
733
|
+
target_width = kwargs["width"]
|
732
734
|
# Calculate what resolution is needed to make the region crop have target_width
|
733
735
|
region_width_points = self.width # Region width in PDF points
|
734
|
-
|
736
|
+
|
735
737
|
if region_width_points > 0:
|
736
738
|
# Calculate scale needed: target_width / region_width_points
|
737
739
|
required_scale = target_width / region_width_points
|
738
740
|
# Convert scale to resolution: scale * 72 DPI
|
739
741
|
effective_resolution = required_scale * 72.0
|
740
|
-
page_kwargs.pop(
|
741
|
-
logger.debug(
|
742
|
+
page_kwargs.pop("width") # Remove width parameter to avoid conflicts
|
743
|
+
logger.debug(
|
744
|
+
f"Region {self.bbox}: Calculated required resolution {effective_resolution:.1f} DPI for region crop width {target_width}"
|
745
|
+
)
|
742
746
|
else:
|
743
|
-
logger.warning(
|
747
|
+
logger.warning(
|
748
|
+
f"Region {self.bbox}: Invalid region width {region_width_points}, using original resolution"
|
749
|
+
)
|
744
750
|
|
745
751
|
# First get the full page image with highlights if requested
|
746
752
|
page_image = self._page.to_image(
|
747
|
-
scale=scale,
|
753
|
+
scale=scale,
|
754
|
+
resolution=effective_resolution,
|
755
|
+
include_highlights=include_highlights,
|
756
|
+
**page_kwargs,
|
748
757
|
)
|
749
758
|
|
750
759
|
# Calculate the actual scale factor used by the page image
|
@@ -899,13 +908,19 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
899
908
|
image.save(filename)
|
900
909
|
return self
|
901
910
|
|
902
|
-
def trim(
|
911
|
+
def trim(
|
912
|
+
self,
|
913
|
+
padding: int = 1,
|
914
|
+
threshold: float = 0.95,
|
915
|
+
resolution: float = 150,
|
916
|
+
pre_shrink: float = 0.5,
|
917
|
+
) -> "Region":
|
903
918
|
"""
|
904
919
|
Trim visual whitespace from the edges of this region.
|
905
|
-
|
920
|
+
|
906
921
|
Similar to Python's string .strip() method, but for visual whitespace in the region image.
|
907
922
|
Uses pixel analysis to detect rows/columns that are predominantly whitespace.
|
908
|
-
|
923
|
+
|
909
924
|
Args:
|
910
925
|
padding: Number of pixels to keep as padding after trimming (default: 1)
|
911
926
|
threshold: Threshold for considering a row/column as whitespace (0.0-1.0, default: 0.95)
|
@@ -914,104 +929,126 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
914
929
|
resolution: Resolution for image rendering in DPI (default: 150)
|
915
930
|
pre_shrink: Amount to shrink region before trimming, then expand back after (default: 0.5)
|
916
931
|
This helps avoid detecting box borders/slivers as content.
|
917
|
-
|
932
|
+
|
918
933
|
Returns:
|
919
934
|
New Region with visual whitespace trimmed from all edges
|
920
|
-
|
935
|
+
|
921
936
|
Example:
|
922
937
|
# Basic trimming with 1 pixel padding and 0.5px pre-shrink
|
923
938
|
trimmed = region.trim()
|
924
|
-
|
939
|
+
|
925
940
|
# More aggressive trimming with no padding and no pre-shrink
|
926
941
|
tight = region.trim(padding=0, threshold=0.9, pre_shrink=0)
|
927
|
-
|
942
|
+
|
928
943
|
# Conservative trimming with more padding
|
929
944
|
loose = region.trim(padding=3, threshold=0.98)
|
930
945
|
"""
|
931
946
|
# Pre-shrink the region to avoid box slivers
|
932
|
-
work_region =
|
933
|
-
|
947
|
+
work_region = (
|
948
|
+
self.expand(left=-pre_shrink, right=-pre_shrink, top=-pre_shrink, bottom=-pre_shrink)
|
949
|
+
if pre_shrink > 0
|
950
|
+
else self
|
951
|
+
)
|
952
|
+
|
934
953
|
# Get the region image
|
935
|
-
image = work_region.to_image(
|
936
|
-
|
954
|
+
image = work_region.to_image(
|
955
|
+
resolution=resolution, crop_only=True, include_highlights=False
|
956
|
+
)
|
957
|
+
|
937
958
|
if image is None:
|
938
|
-
logger.warning(
|
959
|
+
logger.warning(
|
960
|
+
f"Region {self.bbox}: Could not generate image for trimming. Returning original region."
|
961
|
+
)
|
939
962
|
return self
|
940
|
-
|
963
|
+
|
941
964
|
# Convert to grayscale for easier analysis
|
942
965
|
import numpy as np
|
943
|
-
|
966
|
+
|
944
967
|
# Convert PIL image to numpy array
|
945
|
-
img_array = np.array(image.convert(
|
968
|
+
img_array = np.array(image.convert("L")) # Convert to grayscale
|
946
969
|
height, width = img_array.shape
|
947
|
-
|
970
|
+
|
948
971
|
if height == 0 or width == 0:
|
949
|
-
logger.warning(
|
972
|
+
logger.warning(
|
973
|
+
f"Region {self.bbox}: Image has zero dimensions. Returning original region."
|
974
|
+
)
|
950
975
|
return self
|
951
|
-
|
976
|
+
|
952
977
|
# Normalize pixel values to 0-1 range (255 = white = 1.0, 0 = black = 0.0)
|
953
978
|
normalized = img_array.astype(np.float32) / 255.0
|
954
|
-
|
979
|
+
|
955
980
|
# Find content boundaries by analyzing row and column averages
|
956
|
-
|
981
|
+
|
957
982
|
# Analyze rows (horizontal strips) to find top and bottom boundaries
|
958
983
|
row_averages = np.mean(normalized, axis=1) # Average each row
|
959
984
|
content_rows = row_averages < threshold # True where there's content (not whitespace)
|
960
|
-
|
985
|
+
|
961
986
|
# Find first and last rows with content
|
962
987
|
content_row_indices = np.where(content_rows)[0]
|
963
988
|
if len(content_row_indices) == 0:
|
964
989
|
# No content found, return a minimal region at the center
|
965
|
-
logger.warning(
|
990
|
+
logger.warning(
|
991
|
+
f"Region {self.bbox}: No content detected during trimming. Returning center point."
|
992
|
+
)
|
966
993
|
center_x = (self.x0 + self.x1) / 2
|
967
994
|
center_y = (self.top + self.bottom) / 2
|
968
995
|
return Region(self.page, (center_x, center_y, center_x, center_y))
|
969
|
-
|
996
|
+
|
970
997
|
top_content_row = max(0, content_row_indices[0] - padding)
|
971
998
|
bottom_content_row = min(height - 1, content_row_indices[-1] + padding)
|
972
|
-
|
973
|
-
# Analyze columns (vertical strips) to find left and right boundaries
|
999
|
+
|
1000
|
+
# Analyze columns (vertical strips) to find left and right boundaries
|
974
1001
|
col_averages = np.mean(normalized, axis=0) # Average each column
|
975
1002
|
content_cols = col_averages < threshold # True where there's content
|
976
|
-
|
1003
|
+
|
977
1004
|
content_col_indices = np.where(content_cols)[0]
|
978
1005
|
if len(content_col_indices) == 0:
|
979
1006
|
# No content found in columns either
|
980
|
-
logger.warning(
|
1007
|
+
logger.warning(
|
1008
|
+
f"Region {self.bbox}: No column content detected during trimming. Returning center point."
|
1009
|
+
)
|
981
1010
|
center_x = (self.x0 + self.x1) / 2
|
982
1011
|
center_y = (self.top + self.bottom) / 2
|
983
1012
|
return Region(self.page, (center_x, center_y, center_x, center_y))
|
984
|
-
|
1013
|
+
|
985
1014
|
left_content_col = max(0, content_col_indices[0] - padding)
|
986
1015
|
right_content_col = min(width - 1, content_col_indices[-1] + padding)
|
987
|
-
|
1016
|
+
|
988
1017
|
# Convert trimmed pixel coordinates back to PDF coordinates
|
989
1018
|
scale_factor = resolution / 72.0 # Scale factor used in to_image()
|
990
|
-
|
1019
|
+
|
991
1020
|
# Calculate new PDF coordinates and ensure they are Python floats
|
992
1021
|
trimmed_x0 = float(work_region.x0 + (left_content_col / scale_factor))
|
993
1022
|
trimmed_top = float(work_region.top + (top_content_row / scale_factor))
|
994
|
-
trimmed_x1 = float(
|
995
|
-
|
996
|
-
|
1023
|
+
trimmed_x1 = float(
|
1024
|
+
work_region.x0 + ((right_content_col + 1) / scale_factor)
|
1025
|
+
) # +1 because we want inclusive right edge
|
1026
|
+
trimmed_bottom = float(
|
1027
|
+
work_region.top + ((bottom_content_row + 1) / scale_factor)
|
1028
|
+
) # +1 because we want inclusive bottom edge
|
1029
|
+
|
997
1030
|
# Ensure the trimmed region doesn't exceed the work region boundaries
|
998
1031
|
final_x0 = max(work_region.x0, trimmed_x0)
|
999
1032
|
final_top = max(work_region.top, trimmed_top)
|
1000
1033
|
final_x1 = min(work_region.x1, trimmed_x1)
|
1001
1034
|
final_bottom = min(work_region.bottom, trimmed_bottom)
|
1002
|
-
|
1035
|
+
|
1003
1036
|
# Ensure valid coordinates (width > 0, height > 0)
|
1004
1037
|
if final_x1 <= final_x0 or final_bottom <= final_top:
|
1005
|
-
logger.warning(
|
1038
|
+
logger.warning(
|
1039
|
+
f"Region {self.bbox}: Trimming resulted in invalid dimensions. Returning original region."
|
1040
|
+
)
|
1006
1041
|
return self
|
1007
|
-
|
1042
|
+
|
1008
1043
|
# Create the trimmed region
|
1009
1044
|
trimmed_region = Region(self.page, (final_x0, final_top, final_x1, final_bottom))
|
1010
|
-
|
1045
|
+
|
1011
1046
|
# Expand back by the pre_shrink amount to restore original positioning
|
1012
1047
|
if pre_shrink > 0:
|
1013
|
-
trimmed_region = trimmed_region.expand(
|
1014
|
-
|
1048
|
+
trimmed_region = trimmed_region.expand(
|
1049
|
+
left=pre_shrink, right=pre_shrink, top=pre_shrink, bottom=pre_shrink
|
1050
|
+
)
|
1051
|
+
|
1015
1052
|
# Copy relevant metadata
|
1016
1053
|
trimmed_region.region_type = self.region_type
|
1017
1054
|
trimmed_region.normalized_type = self.normalized_type
|
@@ -1021,8 +1058,10 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1021
1058
|
trimmed_region.label = self.label
|
1022
1059
|
trimmed_region.source = "trimmed" # Indicate this is a derived region
|
1023
1060
|
trimmed_region.parent_region = self
|
1024
|
-
|
1025
|
-
logger.debug(
|
1061
|
+
|
1062
|
+
logger.debug(
|
1063
|
+
f"Region {self.bbox}: Trimmed to {trimmed_region.bbox} (padding={padding}, threshold={threshold}, pre_shrink={pre_shrink})"
|
1064
|
+
)
|
1026
1065
|
return trimmed_region
|
1027
1066
|
|
1028
1067
|
def clip(
|
@@ -1035,42 +1074,42 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1035
1074
|
) -> "Region":
|
1036
1075
|
"""
|
1037
1076
|
Clip this region to specific bounds, either from another object with bbox or explicit coordinates.
|
1038
|
-
|
1077
|
+
|
1039
1078
|
The clipped region will be constrained to not exceed the specified boundaries.
|
1040
1079
|
You can provide either an object with bounding box properties, specific coordinates, or both.
|
1041
1080
|
When both are provided, explicit coordinates take precedence.
|
1042
|
-
|
1081
|
+
|
1043
1082
|
Args:
|
1044
1083
|
obj: Optional object with bbox properties (Region, Element, TextElement, etc.)
|
1045
1084
|
left: Optional left boundary (x0) to clip to
|
1046
|
-
top: Optional top boundary to clip to
|
1085
|
+
top: Optional top boundary to clip to
|
1047
1086
|
right: Optional right boundary (x1) to clip to
|
1048
1087
|
bottom: Optional bottom boundary to clip to
|
1049
|
-
|
1088
|
+
|
1050
1089
|
Returns:
|
1051
1090
|
New Region with bounds clipped to the specified constraints
|
1052
|
-
|
1091
|
+
|
1053
1092
|
Examples:
|
1054
1093
|
# Clip to another region's bounds
|
1055
1094
|
clipped = region.clip(container_region)
|
1056
|
-
|
1095
|
+
|
1057
1096
|
# Clip to any element's bounds
|
1058
1097
|
clipped = region.clip(text_element)
|
1059
|
-
|
1098
|
+
|
1060
1099
|
# Clip to specific coordinates
|
1061
1100
|
clipped = region.clip(left=100, right=400)
|
1062
|
-
|
1101
|
+
|
1063
1102
|
# Mix object bounds with specific overrides
|
1064
1103
|
clipped = region.clip(obj=container, bottom=page.height/2)
|
1065
1104
|
"""
|
1066
1105
|
from natural_pdf.elements.base import extract_bbox
|
1067
|
-
|
1106
|
+
|
1068
1107
|
# Start with current region bounds
|
1069
1108
|
clip_x0 = self.x0
|
1070
1109
|
clip_top = self.top
|
1071
1110
|
clip_x1 = self.x1
|
1072
1111
|
clip_bottom = self.bottom
|
1073
|
-
|
1112
|
+
|
1074
1113
|
# Apply object constraints if provided
|
1075
1114
|
if obj is not None:
|
1076
1115
|
obj_bbox = extract_bbox(obj)
|
@@ -1086,7 +1125,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1086
1125
|
f"Region {self.bbox}: Cannot extract bbox from clipping object {type(obj)}. "
|
1087
1126
|
"Object must have bbox property or x0/top/x1/bottom attributes."
|
1088
1127
|
)
|
1089
|
-
|
1128
|
+
|
1090
1129
|
# Apply explicit coordinate constraints (these take precedence)
|
1091
1130
|
if left is not None:
|
1092
1131
|
clip_x0 = max(clip_x0, left)
|
@@ -1096,7 +1135,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1096
1135
|
clip_x1 = min(clip_x1, right)
|
1097
1136
|
if bottom is not None:
|
1098
1137
|
clip_bottom = min(clip_bottom, bottom)
|
1099
|
-
|
1138
|
+
|
1100
1139
|
# Ensure valid coordinates
|
1101
1140
|
if clip_x1 <= clip_x0 or clip_bottom <= clip_top:
|
1102
1141
|
logger.warning(
|
@@ -1105,10 +1144,10 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1105
1144
|
)
|
1106
1145
|
# Return a minimal region at the clip area's top-left
|
1107
1146
|
return Region(self.page, (clip_x0, clip_top, clip_x0, clip_top))
|
1108
|
-
|
1147
|
+
|
1109
1148
|
# Create the clipped region
|
1110
1149
|
clipped_region = Region(self.page, (clip_x0, clip_top, clip_x1, clip_bottom))
|
1111
|
-
|
1150
|
+
|
1112
1151
|
# Copy relevant metadata
|
1113
1152
|
clipped_region.region_type = self.region_type
|
1114
1153
|
clipped_region.normalized_type = self.normalized_type
|
@@ -1118,7 +1157,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1118
1157
|
clipped_region.label = self.label
|
1119
1158
|
clipped_region.source = "clipped" # Indicate this is a derived region
|
1120
1159
|
clipped_region.parent_region = self
|
1121
|
-
|
1160
|
+
|
1122
1161
|
logger.debug(
|
1123
1162
|
f"Region {self.bbox}: Clipped to {clipped_region.bbox} "
|
1124
1163
|
f"(constraints: obj={type(obj).__name__ if obj else None}, "
|
@@ -1279,24 +1318,36 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1279
1318
|
else:
|
1280
1319
|
# Try lattice first, then fall back to stream if no meaningful results
|
1281
1320
|
logger.debug(f"Region {self.bbox}: Auto-detecting table extraction method...")
|
1282
|
-
|
1321
|
+
|
1283
1322
|
try:
|
1284
1323
|
logger.debug(f"Region {self.bbox}: Trying 'lattice' method first...")
|
1285
|
-
lattice_result = self.extract_table(
|
1286
|
-
|
1324
|
+
lattice_result = self.extract_table(
|
1325
|
+
"lattice", table_settings=table_settings.copy()
|
1326
|
+
)
|
1327
|
+
|
1287
1328
|
# Check if lattice found meaningful content
|
1288
|
-
if (
|
1289
|
-
|
1290
|
-
|
1329
|
+
if (
|
1330
|
+
lattice_result
|
1331
|
+
and len(lattice_result) > 0
|
1332
|
+
and any(
|
1333
|
+
any(cell and cell.strip() for cell in row if cell)
|
1334
|
+
for row in lattice_result
|
1335
|
+
)
|
1336
|
+
):
|
1337
|
+
logger.debug(
|
1338
|
+
f"Region {self.bbox}: 'lattice' method found table with {len(lattice_result)} rows"
|
1339
|
+
)
|
1291
1340
|
return lattice_result
|
1292
1341
|
else:
|
1293
|
-
logger.debug(
|
1342
|
+
logger.debug(
|
1343
|
+
f"Region {self.bbox}: 'lattice' method found no meaningful content"
|
1344
|
+
)
|
1294
1345
|
except Exception as e:
|
1295
1346
|
logger.debug(f"Region {self.bbox}: 'lattice' method failed: {e}")
|
1296
|
-
|
1347
|
+
|
1297
1348
|
# Fall back to stream
|
1298
1349
|
logger.debug(f"Region {self.bbox}: Falling back to 'stream' method...")
|
1299
|
-
return self.extract_table(
|
1350
|
+
return self.extract_table("stream", table_settings=table_settings.copy())
|
1300
1351
|
else:
|
1301
1352
|
effective_method = method
|
1302
1353
|
|
@@ -1308,7 +1359,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1308
1359
|
table_settings.setdefault("vertical_strategy", "text")
|
1309
1360
|
table_settings.setdefault("horizontal_strategy", "text")
|
1310
1361
|
elif effective_method == "lattice":
|
1311
|
-
logger.debug(
|
1362
|
+
logger.debug(
|
1363
|
+
"Using 'lattice' method alias for 'pdfplumber' with line-based strategies."
|
1364
|
+
)
|
1312
1365
|
effective_method = "pdfplumber"
|
1313
1366
|
# Set default line strategies if not already provided by the user
|
1314
1367
|
table_settings.setdefault("vertical_strategy", "lines")
|
@@ -1331,7 +1384,6 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1331
1384
|
f"Unknown table extraction method: '{method}'. Choose from 'tatr', 'pdfplumber', 'text', 'stream', 'lattice'."
|
1332
1385
|
)
|
1333
1386
|
|
1334
|
-
|
1335
1387
|
def extract_tables(
|
1336
1388
|
self,
|
1337
1389
|
method: Optional[str] = None,
|
@@ -1357,33 +1409,45 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1357
1409
|
# Auto-detect method if not specified (try lattice first, then stream)
|
1358
1410
|
if method is None:
|
1359
1411
|
logger.debug(f"Region {self.bbox}: Auto-detecting tables extraction method...")
|
1360
|
-
|
1412
|
+
|
1361
1413
|
# Try lattice first
|
1362
1414
|
try:
|
1363
1415
|
lattice_settings = table_settings.copy()
|
1364
1416
|
lattice_settings.setdefault("vertical_strategy", "lines")
|
1365
1417
|
lattice_settings.setdefault("horizontal_strategy", "lines")
|
1366
|
-
|
1418
|
+
|
1367
1419
|
logger.debug(f"Region {self.bbox}: Trying 'lattice' method first for tables...")
|
1368
1420
|
lattice_result = self._extract_tables_plumber(lattice_settings)
|
1369
|
-
|
1421
|
+
|
1370
1422
|
# Check if lattice found meaningful tables
|
1371
|
-
if (
|
1372
|
-
|
1373
|
-
|
1423
|
+
if (
|
1424
|
+
lattice_result
|
1425
|
+
and len(lattice_result) > 0
|
1426
|
+
and any(
|
1427
|
+
any(
|
1428
|
+
any(cell and cell.strip() for cell in row if cell)
|
1429
|
+
for row in table
|
1430
|
+
if table
|
1431
|
+
)
|
1432
|
+
for table in lattice_result
|
1433
|
+
)
|
1434
|
+
):
|
1435
|
+
logger.debug(
|
1436
|
+
f"Region {self.bbox}: 'lattice' method found {len(lattice_result)} tables"
|
1437
|
+
)
|
1374
1438
|
return lattice_result
|
1375
1439
|
else:
|
1376
1440
|
logger.debug(f"Region {self.bbox}: 'lattice' method found no meaningful tables")
|
1377
|
-
|
1441
|
+
|
1378
1442
|
except Exception as e:
|
1379
1443
|
logger.debug(f"Region {self.bbox}: 'lattice' method failed: {e}")
|
1380
|
-
|
1444
|
+
|
1381
1445
|
# Fall back to stream
|
1382
1446
|
logger.debug(f"Region {self.bbox}: Falling back to 'stream' method for tables...")
|
1383
1447
|
stream_settings = table_settings.copy()
|
1384
1448
|
stream_settings.setdefault("vertical_strategy", "text")
|
1385
1449
|
stream_settings.setdefault("horizontal_strategy", "text")
|
1386
|
-
|
1450
|
+
|
1387
1451
|
return self._extract_tables_plumber(stream_settings)
|
1388
1452
|
|
1389
1453
|
effective_method = method
|
@@ -1395,7 +1459,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1395
1459
|
table_settings.setdefault("vertical_strategy", "text")
|
1396
1460
|
table_settings.setdefault("horizontal_strategy", "text")
|
1397
1461
|
elif effective_method == "lattice":
|
1398
|
-
logger.debug(
|
1462
|
+
logger.debug(
|
1463
|
+
"Using 'lattice' method alias for 'pdfplumber' with line-based strategies."
|
1464
|
+
)
|
1399
1465
|
effective_method = "pdfplumber"
|
1400
1466
|
table_settings.setdefault("vertical_strategy", "lines")
|
1401
1467
|
table_settings.setdefault("horizontal_strategy", "lines")
|
@@ -1844,7 +1910,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1844
1910
|
|
1845
1911
|
# Validate contains parameter
|
1846
1912
|
if contains not in ["all", "any", "center"]:
|
1847
|
-
raise ValueError(
|
1913
|
+
raise ValueError(
|
1914
|
+
f"Invalid contains value: {contains}. Must be 'all', 'any', or 'center'"
|
1915
|
+
)
|
1848
1916
|
|
1849
1917
|
# Construct selector if 'text' is provided
|
1850
1918
|
effective_selector = ""
|
@@ -1894,24 +1962,21 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1894
1962
|
# Filter these elements based on the specified containment method
|
1895
1963
|
region_bbox = self.bbox
|
1896
1964
|
matching_elements = []
|
1897
|
-
|
1965
|
+
|
1898
1966
|
if contains == "all": # Fully inside (strict)
|
1899
1967
|
matching_elements = [
|
1900
|
-
el
|
1968
|
+
el
|
1969
|
+
for el in potential_elements
|
1901
1970
|
if el.x0 >= region_bbox[0]
|
1902
1971
|
and el.top >= region_bbox[1]
|
1903
1972
|
and el.x1 <= region_bbox[2]
|
1904
1973
|
and el.bottom <= region_bbox[3]
|
1905
1974
|
]
|
1906
1975
|
elif contains == "any": # Any overlap
|
1907
|
-
matching_elements = [
|
1908
|
-
el for el in potential_elements
|
1909
|
-
if self.intersects(el)
|
1910
|
-
]
|
1976
|
+
matching_elements = [el for el in potential_elements if self.intersects(el)]
|
1911
1977
|
elif contains == "center": # Center point inside
|
1912
1978
|
matching_elements = [
|
1913
|
-
el for el in potential_elements
|
1914
|
-
if self.is_element_center_inside(el)
|
1979
|
+
el for el in potential_elements if self.is_element_center_inside(el)
|
1915
1980
|
]
|
1916
1981
|
|
1917
1982
|
return ElementCollection(matching_elements)
|
@@ -2001,17 +2066,13 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2001
2066
|
manager_args = {k: v for k, v in manager_args.items() if v is not None}
|
2002
2067
|
|
2003
2068
|
# Run OCR on this region's image using the manager
|
2004
|
-
|
2005
|
-
|
2006
|
-
|
2007
|
-
|
2008
|
-
|
2009
|
-
)
|
2010
|
-
return self
|
2011
|
-
logger.debug(f"Region OCR processing returned {len(results)} results.")
|
2012
|
-
except Exception as e:
|
2013
|
-
logger.error(f"Error during OCRManager processing for region: {e}", exc_info=True)
|
2069
|
+
results = ocr_mgr.apply_ocr(**manager_args)
|
2070
|
+
if not isinstance(results, list):
|
2071
|
+
logger.error(
|
2072
|
+
f"OCRManager returned unexpected type for single region image: {type(results)}"
|
2073
|
+
)
|
2014
2074
|
return self
|
2075
|
+
logger.debug(f"Region OCR processing returned {len(results)} results.")
|
2015
2076
|
|
2016
2077
|
# Convert results to TextElements
|
2017
2078
|
scale_x = self.width / region_image.width if region_image.width > 0 else 1.0
|
@@ -2802,11 +2863,11 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2802
2863
|
self,
|
2803
2864
|
text_content: Optional[Union[str, Callable[["Region"], Optional[str]]]] = None,
|
2804
2865
|
source_label: str = "derived_from_region",
|
2805
|
-
object_type: str = "word",
|
2866
|
+
object_type: str = "word", # Or "char", controls how it's categorized
|
2806
2867
|
default_font_size: float = 10.0,
|
2807
2868
|
default_font_name: str = "RegionContent",
|
2808
|
-
confidence: Optional[float] = None,
|
2809
|
-
add_to_page: bool = False
|
2869
|
+
confidence: Optional[float] = None, # Allow overriding confidence
|
2870
|
+
add_to_page: bool = False, # NEW: Option to add to page
|
2810
2871
|
) -> "TextElement":
|
2811
2872
|
"""
|
2812
2873
|
Creates a new TextElement object based on this region's geometry.
|
@@ -2833,7 +2894,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2833
2894
|
|
2834
2895
|
Returns:
|
2835
2896
|
A new TextElement instance.
|
2836
|
-
|
2897
|
+
|
2837
2898
|
Raises:
|
2838
2899
|
ValueError: If the region does not have a valid 'page' attribute.
|
2839
2900
|
"""
|
@@ -2844,14 +2905,17 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2844
2905
|
try:
|
2845
2906
|
actual_text = text_content(self)
|
2846
2907
|
except Exception as e:
|
2847
|
-
logger.error(
|
2848
|
-
|
2908
|
+
logger.error(
|
2909
|
+
f"Error executing text_content callback for region {self.bbox}: {e}",
|
2910
|
+
exc_info=True,
|
2911
|
+
)
|
2912
|
+
actual_text = None # Ensure actual_text is None on error
|
2849
2913
|
|
2850
2914
|
final_confidence = confidence
|
2851
2915
|
if final_confidence is None:
|
2852
2916
|
final_confidence = 1.0 if actual_text is not None and actual_text.strip() else 0.0
|
2853
2917
|
|
2854
|
-
if not hasattr(self,
|
2918
|
+
if not hasattr(self, "page") or self.page is None:
|
2855
2919
|
raise ValueError("Region must have a valid 'page' attribute to create a TextElement.")
|
2856
2920
|
|
2857
2921
|
elem_data = {
|
@@ -2864,8 +2928,8 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2864
2928
|
"height": self.height,
|
2865
2929
|
"object_type": object_type,
|
2866
2930
|
"page_number": self.page.page_number,
|
2867
|
-
"stroking_color": getattr(self,
|
2868
|
-
"non_stroking_color": getattr(self,
|
2931
|
+
"stroking_color": getattr(self, "stroking_color", (0, 0, 0)),
|
2932
|
+
"non_stroking_color": getattr(self, "non_stroking_color", (0, 0, 0)),
|
2869
2933
|
"fontname": default_font_name,
|
2870
2934
|
"size": default_font_size,
|
2871
2935
|
"upright": True,
|
@@ -2873,18 +2937,28 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2873
2937
|
"adv": self.width,
|
2874
2938
|
"source": source_label,
|
2875
2939
|
"confidence": final_confidence,
|
2876
|
-
"_char_dicts": []
|
2940
|
+
"_char_dicts": [],
|
2877
2941
|
}
|
2878
2942
|
text_element = TextElement(elem_data, self.page)
|
2879
2943
|
|
2880
2944
|
if add_to_page:
|
2881
|
-
if hasattr(self.page,
|
2882
|
-
add_as_type =
|
2945
|
+
if hasattr(self.page, "_element_mgr") and self.page._element_mgr is not None:
|
2946
|
+
add_as_type = (
|
2947
|
+
"words"
|
2948
|
+
if object_type == "word"
|
2949
|
+
else "chars" if object_type == "char" else object_type
|
2950
|
+
)
|
2883
2951
|
# REMOVED try-except block around add_element
|
2884
2952
|
self.page._element_mgr.add_element(text_element, element_type=add_as_type)
|
2885
|
-
logger.debug(
|
2953
|
+
logger.debug(
|
2954
|
+
f"TextElement created from region {self.bbox} and added to page {self.page.page_number} as {add_as_type}."
|
2955
|
+
)
|
2886
2956
|
else:
|
2887
|
-
page_num_str =
|
2888
|
-
|
2889
|
-
|
2957
|
+
page_num_str = (
|
2958
|
+
str(self.page.page_number) if hasattr(self.page, "page_number") else "N/A"
|
2959
|
+
)
|
2960
|
+
logger.warning(
|
2961
|
+
f"Cannot add TextElement to page: Page {page_num_str} for region {self.bbox} is missing '_element_mgr'."
|
2962
|
+
)
|
2963
|
+
|
2890
2964
|
return text_element
|