natural-pdf 0.1.14__py3-none-any.whl → 0.1.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +31 -0
- natural_pdf/analyzers/layout/gemini.py +137 -162
- natural_pdf/analyzers/layout/layout_manager.py +9 -5
- natural_pdf/analyzers/layout/layout_options.py +77 -7
- natural_pdf/analyzers/layout/paddle.py +318 -165
- natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
- natural_pdf/analyzers/shape_detection_mixin.py +770 -405
- natural_pdf/classification/mixin.py +2 -8
- natural_pdf/collections/pdf_collection.py +25 -30
- natural_pdf/core/highlighting_service.py +47 -32
- natural_pdf/core/page.py +226 -70
- natural_pdf/core/pdf.py +19 -22
- natural_pdf/elements/base.py +9 -9
- natural_pdf/elements/collections.py +105 -50
- natural_pdf/elements/region.py +320 -113
- natural_pdf/exporters/paddleocr.py +38 -13
- natural_pdf/flows/__init__.py +3 -3
- natural_pdf/flows/collections.py +303 -132
- natural_pdf/flows/element.py +277 -132
- natural_pdf/flows/flow.py +33 -16
- natural_pdf/flows/region.py +142 -79
- natural_pdf/ocr/engine_doctr.py +37 -4
- natural_pdf/ocr/engine_easyocr.py +23 -3
- natural_pdf/ocr/engine_paddle.py +281 -30
- natural_pdf/ocr/engine_surya.py +8 -3
- natural_pdf/ocr/ocr_manager.py +75 -76
- natural_pdf/ocr/ocr_options.py +52 -87
- natural_pdf/search/__init__.py +25 -12
- natural_pdf/search/lancedb_search_service.py +91 -54
- natural_pdf/search/numpy_search_service.py +86 -65
- natural_pdf/search/searchable_mixin.py +2 -2
- natural_pdf/selectors/parser.py +125 -81
- natural_pdf/widgets/__init__.py +1 -1
- natural_pdf/widgets/viewer.py +205 -449
- {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/METADATA +27 -45
- {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/RECORD +39 -38
- {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/top_level.txt +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -5,15 +5,19 @@ from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to
 
 # New Imports
 from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
+from tqdm.auto import tqdm
 
 from natural_pdf.analyzers.layout.pdfplumber_table_finder import find_text_based_tables
+
+# --- Shape Detection Mixin --- #
+from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
 from natural_pdf.classification.manager import ClassificationManager  # Keep for type hint
 
 # --- Classification Imports --- #
 from natural_pdf.classification.mixin import ClassificationMixin
 from natural_pdf.elements.base import DirectionalMixin
+from natural_pdf.elements.text import TextElement  # ADDED IMPORT
 from natural_pdf.extraction.mixin import ExtractionMixin  # Import extraction mixin
-from natural_pdf.elements.text import TextElement  # ADDED IMPORT
 from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements  # Import utility
 from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
 from natural_pdf.utils.locks import pdf_render_lock  # Import the lock
@@ -21,21 +25,19 @@ from natural_pdf.utils.locks import pdf_render_lock  # Import the lock
 # Import new utils
 from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
 
-from tqdm.auto import tqdm
 # --- End Classification Imports --- #
 
-
-
-# --- End Shape Detection Mixin --- #
+
+# --- End Shape Detection Mixin --- #
 
 if TYPE_CHECKING:
     # --- NEW: Add Image type hint for classification --- #
     from PIL.Image import Image
 
     from natural_pdf.core.page import Page
+    from natural_pdf.elements.base import Element  # Added for type hint
    from natural_pdf.elements.collections import ElementCollection
    from natural_pdf.elements.text import TextElement
-    from natural_pdf.elements.base import Element  # Added for type hint
 
 # Import OCRManager conditionally to avoid circular imports
 try:
@@ -726,25 +728,32 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         # Handle the case where user wants the cropped region to have a specific width
         page_kwargs = kwargs.copy()
         effective_resolution = resolution  # Start with the provided resolution
-
-        if crop_only and
-            target_width = kwargs[
+
+        if crop_only and "width" in kwargs:
+            target_width = kwargs["width"]
             # Calculate what resolution is needed to make the region crop have target_width
             region_width_points = self.width  # Region width in PDF points
-
+
             if region_width_points > 0:
                 # Calculate scale needed: target_width / region_width_points
                 required_scale = target_width / region_width_points
                 # Convert scale to resolution: scale * 72 DPI
                 effective_resolution = required_scale * 72.0
-                page_kwargs.pop(
-                logger.debug(
+                page_kwargs.pop("width")  # Remove width parameter to avoid conflicts
+                logger.debug(
+                    f"Region {self.bbox}: Calculated required resolution {effective_resolution:.1f} DPI for region crop width {target_width}"
+                )
             else:
-                logger.warning(
+                logger.warning(
+                    f"Region {self.bbox}: Invalid region width {region_width_points}, using original resolution"
+                )
 
         # First get the full page image with highlights if requested
         page_image = self._page.to_image(
-            scale=scale,
+            scale=scale,
+            resolution=effective_resolution,
+            include_highlights=include_highlights,
+            **page_kwargs,
         )
 
         # Calculate the actual scale factor used by the page image
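The new branch above derives the render DPI from a requested pixel width: required_scale = target_width / region_width_points, then effective_resolution = required_scale * 72. A minimal usage sketch, assuming natural-pdf's documented `PDF` entry point and the `Region(page, bbox)` constructor used throughout this file; the file name, bounds, and width value are illustrative:

```python
from natural_pdf import PDF
from natural_pdf.elements.region import Region

pdf = PDF("document.pdf")  # illustrative file name
page = pdf.pages[0]

# Region(page, (x0, top, x1, bottom)) mirrors how this module builds regions itself.
region = Region(page, (0, 0, page.width / 2, page.height / 2))

# Ask for a cropped render roughly 800 px wide; per the hunk above, the region
# converts this to (800 / region.width) * 72 DPI and pops "width" before
# delegating to the page-level to_image().
img = region.to_image(crop_only=True, include_highlights=False, width=800)
img.save("region_800px.png")
```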
@@ -899,13 +908,19 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         image.save(filename)
         return self
 
-    def trim(
+    def trim(
+        self,
+        padding: int = 1,
+        threshold: float = 0.95,
+        resolution: float = 150,
+        pre_shrink: float = 0.5,
+    ) -> "Region":
         """
         Trim visual whitespace from the edges of this region.
-
+
         Similar to Python's string .strip() method, but for visual whitespace in the region image.
         Uses pixel analysis to detect rows/columns that are predominantly whitespace.
-
+
         Args:
             padding: Number of pixels to keep as padding after trimming (default: 1)
             threshold: Threshold for considering a row/column as whitespace (0.0-1.0, default: 0.95)
@@ -914,104 +929,126 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
             resolution: Resolution for image rendering in DPI (default: 150)
             pre_shrink: Amount to shrink region before trimming, then expand back after (default: 0.5)
                         This helps avoid detecting box borders/slivers as content.
-
+
         Returns:
             New Region with visual whitespace trimmed from all edges
-
+
         Example:
             # Basic trimming with 1 pixel padding and 0.5px pre-shrink
             trimmed = region.trim()
-
+
             # More aggressive trimming with no padding and no pre-shrink
             tight = region.trim(padding=0, threshold=0.9, pre_shrink=0)
-
+
             # Conservative trimming with more padding
             loose = region.trim(padding=3, threshold=0.98)
         """
         # Pre-shrink the region to avoid box slivers
-        work_region =
-
+        work_region = (
+            self.expand(left=-pre_shrink, right=-pre_shrink, top=-pre_shrink, bottom=-pre_shrink)
+            if pre_shrink > 0
+            else self
+        )
+
         # Get the region image
-        image = work_region.to_image(
-
+        image = work_region.to_image(
+            resolution=resolution, crop_only=True, include_highlights=False
+        )
+
         if image is None:
-            logger.warning(
+            logger.warning(
+                f"Region {self.bbox}: Could not generate image for trimming. Returning original region."
+            )
             return self
-
+
         # Convert to grayscale for easier analysis
         import numpy as np
-
+
         # Convert PIL image to numpy array
-        img_array = np.array(image.convert(
+        img_array = np.array(image.convert("L"))  # Convert to grayscale
         height, width = img_array.shape
-
+
         if height == 0 or width == 0:
-            logger.warning(
+            logger.warning(
+                f"Region {self.bbox}: Image has zero dimensions. Returning original region."
+            )
             return self
-
+
         # Normalize pixel values to 0-1 range (255 = white = 1.0, 0 = black = 0.0)
         normalized = img_array.astype(np.float32) / 255.0
-
+
         # Find content boundaries by analyzing row and column averages
-
+
         # Analyze rows (horizontal strips) to find top and bottom boundaries
         row_averages = np.mean(normalized, axis=1)  # Average each row
         content_rows = row_averages < threshold  # True where there's content (not whitespace)
-
+
         # Find first and last rows with content
         content_row_indices = np.where(content_rows)[0]
         if len(content_row_indices) == 0:
             # No content found, return a minimal region at the center
-            logger.warning(
+            logger.warning(
+                f"Region {self.bbox}: No content detected during trimming. Returning center point."
+            )
             center_x = (self.x0 + self.x1) / 2
             center_y = (self.top + self.bottom) / 2
             return Region(self.page, (center_x, center_y, center_x, center_y))
-
+
         top_content_row = max(0, content_row_indices[0] - padding)
         bottom_content_row = min(height - 1, content_row_indices[-1] + padding)
-
-        # Analyze columns (vertical strips) to find left and right boundaries
+
+        # Analyze columns (vertical strips) to find left and right boundaries
         col_averages = np.mean(normalized, axis=0)  # Average each column
         content_cols = col_averages < threshold  # True where there's content
-
+
         content_col_indices = np.where(content_cols)[0]
         if len(content_col_indices) == 0:
             # No content found in columns either
-            logger.warning(
+            logger.warning(
+                f"Region {self.bbox}: No column content detected during trimming. Returning center point."
+            )
             center_x = (self.x0 + self.x1) / 2
             center_y = (self.top + self.bottom) / 2
             return Region(self.page, (center_x, center_y, center_x, center_y))
-
+
         left_content_col = max(0, content_col_indices[0] - padding)
         right_content_col = min(width - 1, content_col_indices[-1] + padding)
-
+
         # Convert trimmed pixel coordinates back to PDF coordinates
         scale_factor = resolution / 72.0  # Scale factor used in to_image()
-
+
         # Calculate new PDF coordinates and ensure they are Python floats
         trimmed_x0 = float(work_region.x0 + (left_content_col / scale_factor))
         trimmed_top = float(work_region.top + (top_content_row / scale_factor))
-        trimmed_x1 = float(
-
-
+        trimmed_x1 = float(
+            work_region.x0 + ((right_content_col + 1) / scale_factor)
+        )  # +1 because we want inclusive right edge
+        trimmed_bottom = float(
+            work_region.top + ((bottom_content_row + 1) / scale_factor)
+        )  # +1 because we want inclusive bottom edge
+
         # Ensure the trimmed region doesn't exceed the work region boundaries
         final_x0 = max(work_region.x0, trimmed_x0)
         final_top = max(work_region.top, trimmed_top)
         final_x1 = min(work_region.x1, trimmed_x1)
         final_bottom = min(work_region.bottom, trimmed_bottom)
-
+
         # Ensure valid coordinates (width > 0, height > 0)
         if final_x1 <= final_x0 or final_bottom <= final_top:
-            logger.warning(
+            logger.warning(
+                f"Region {self.bbox}: Trimming resulted in invalid dimensions. Returning original region."
+            )
             return self
-
+
         # Create the trimmed region
         trimmed_region = Region(self.page, (final_x0, final_top, final_x1, final_bottom))
-
+
         # Expand back by the pre_shrink amount to restore original positioning
         if pre_shrink > 0:
-            trimmed_region = trimmed_region.expand(
-
+            trimmed_region = trimmed_region.expand(
+                left=pre_shrink, right=pre_shrink, top=pre_shrink, bottom=pre_shrink
+            )
+
         # Copy relevant metadata
         trimmed_region.region_type = self.region_type
         trimmed_region.normalized_type = self.normalized_type
@@ -1021,8 +1058,10 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         trimmed_region.label = self.label
         trimmed_region.source = "trimmed"  # Indicate this is a derived region
         trimmed_region.parent_region = self
-
-        logger.debug(
+
+        logger.debug(
+            f"Region {self.bbox}: Trimmed to {trimmed_region.bbox} (padding={padding}, threshold={threshold}, pre_shrink={pre_shrink})"
+        )
         return trimmed_region
 
     def clip(
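For reference, trim() converts detected pixel bounds back to PDF points with scale_factor = resolution / 72 and an inclusive +1 on the far edges, as in the hunk above. A standalone sketch of that arithmetic with made-up numbers:

```python
# Standalone illustration of trim()'s pixel -> PDF-point conversion.
resolution = 150                  # DPI used to render the region image
scale_factor = resolution / 72.0  # pixels per PDF point (about 2.08 at 150 DPI)

region_x0 = 100.0        # left edge of the (pre-shrunk) region, in points
left_content_col = 12    # first content pixel column, padding already applied
right_content_col = 371  # last content pixel column, padding already applied

trimmed_x0 = region_x0 + left_content_col / scale_factor         # about 105.8
trimmed_x1 = region_x0 + (right_content_col + 1) / scale_factor  # +1 keeps the right edge inclusive
print(round(trimmed_x0, 1), round(trimmed_x1, 1))
```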
@@ -1035,42 +1074,42 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
     ) -> "Region":
         """
         Clip this region to specific bounds, either from another object with bbox or explicit coordinates.
-
+
         The clipped region will be constrained to not exceed the specified boundaries.
         You can provide either an object with bounding box properties, specific coordinates, or both.
         When both are provided, explicit coordinates take precedence.
-
+
         Args:
             obj: Optional object with bbox properties (Region, Element, TextElement, etc.)
             left: Optional left boundary (x0) to clip to
-            top: Optional top boundary to clip to
+            top: Optional top boundary to clip to
             right: Optional right boundary (x1) to clip to
             bottom: Optional bottom boundary to clip to
-
+
         Returns:
             New Region with bounds clipped to the specified constraints
-
+
         Examples:
             # Clip to another region's bounds
             clipped = region.clip(container_region)
-
+
             # Clip to any element's bounds
             clipped = region.clip(text_element)
-
+
             # Clip to specific coordinates
             clipped = region.clip(left=100, right=400)
-
+
             # Mix object bounds with specific overrides
             clipped = region.clip(obj=container, bottom=page.height/2)
         """
         from natural_pdf.elements.base import extract_bbox
-
+
         # Start with current region bounds
         clip_x0 = self.x0
         clip_top = self.top
         clip_x1 = self.x1
         clip_bottom = self.bottom
-
+
         # Apply object constraints if provided
         if obj is not None:
             obj_bbox = extract_bbox(obj)
@@ -1086,7 +1125,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
                     f"Region {self.bbox}: Cannot extract bbox from clipping object {type(obj)}. "
                     "Object must have bbox property or x0/top/x1/bottom attributes."
                 )
-
+
         # Apply explicit coordinate constraints (these take precedence)
         if left is not None:
             clip_x0 = max(clip_x0, left)
@@ -1096,7 +1135,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
             clip_x1 = min(clip_x1, right)
         if bottom is not None:
             clip_bottom = min(clip_bottom, bottom)
-
+
         # Ensure valid coordinates
         if clip_x1 <= clip_x0 or clip_bottom <= clip_top:
             logger.warning(
@@ -1105,10 +1144,10 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
             )
             # Return a minimal region at the clip area's top-left
             return Region(self.page, (clip_x0, clip_top, clip_x0, clip_top))
-
+
         # Create the clipped region
         clipped_region = Region(self.page, (clip_x0, clip_top, clip_x1, clip_bottom))
-
+
         # Copy relevant metadata
         clipped_region.region_type = self.region_type
         clipped_region.normalized_type = self.normalized_type
@@ -1118,7 +1157,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         clipped_region.label = self.label
         clipped_region.source = "clipped"  # Indicate this is a derived region
         clipped_region.parent_region = self
-
+
         logger.debug(
             f"Region {self.bbox}: Clipped to {clipped_region.bbox} "
             f"(constraints: obj={type(obj).__name__ if obj else None}, "
@@ -1247,8 +1286,12 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         Extract a table from this region.
 
         Args:
-            method: Method to use: 'tatr', '
-
+            method: Method to use: 'tatr', 'pdfplumber', 'text', 'stream', 'lattice', or None (auto-detect).
+                    'stream' is an alias for 'pdfplumber' with text-based strategies (equivalent to
+                    setting `vertical_strategy` and `horizontal_strategy` to 'text').
+                    'lattice' is an alias for 'pdfplumber' with line-based strategies (equivalent to
+                    setting `vertical_strategy` and `horizontal_strategy` to 'lines').
+            table_settings: Settings for pdfplumber table extraction (used with 'pdfplumber', 'stream', or 'lattice' methods).
             use_ocr: Whether to use OCR for text extraction (currently only applicable with 'tatr' method).
             ocr_config: OCR configuration parameters.
             text_options: Dictionary of options for the 'text' method, corresponding to arguments
@@ -1268,13 +1311,61 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
             text_options = {}  # Initialize empty dict
 
         # Auto-detect method if not specified
-
-        if effective_method is None:
+        if method is None:
             # If this is a TATR-detected region, use TATR method
             if hasattr(self, "model") and self.model == "tatr" and self.region_type == "table":
                 effective_method = "tatr"
             else:
-
+                # Try lattice first, then fall back to stream if no meaningful results
+                logger.debug(f"Region {self.bbox}: Auto-detecting table extraction method...")
+
+                try:
+                    logger.debug(f"Region {self.bbox}: Trying 'lattice' method first...")
+                    lattice_result = self.extract_table(
+                        "lattice", table_settings=table_settings.copy()
+                    )
+
+                    # Check if lattice found meaningful content
+                    if (
+                        lattice_result
+                        and len(lattice_result) > 0
+                        and any(
+                            any(cell and cell.strip() for cell in row if cell)
+                            for row in lattice_result
+                        )
+                    ):
+                        logger.debug(
+                            f"Region {self.bbox}: 'lattice' method found table with {len(lattice_result)} rows"
+                        )
+                        return lattice_result
+                    else:
+                        logger.debug(
+                            f"Region {self.bbox}: 'lattice' method found no meaningful content"
+                        )
+                except Exception as e:
+                    logger.debug(f"Region {self.bbox}: 'lattice' method failed: {e}")
+
+                # Fall back to stream
+                logger.debug(f"Region {self.bbox}: Falling back to 'stream' method...")
+                return self.extract_table("stream", table_settings=table_settings.copy())
+        else:
+            effective_method = method
+
+        # Handle method aliases for pdfplumber
+        if effective_method == "stream":
+            logger.debug("Using 'stream' method alias for 'pdfplumber' with text-based strategies.")
+            effective_method = "pdfplumber"
+            # Set default text strategies if not already provided by the user
+            table_settings.setdefault("vertical_strategy", "text")
+            table_settings.setdefault("horizontal_strategy", "text")
+        elif effective_method == "lattice":
+            logger.debug(
+                "Using 'lattice' method alias for 'pdfplumber' with line-based strategies."
+            )
+            effective_method = "pdfplumber"
+            # Set default line strategies if not already provided by the user
+            table_settings.setdefault("vertical_strategy", "lines")
+            table_settings.setdefault("horizontal_strategy", "lines")
 
         logger.debug(f"Region {self.bbox}: Extracting table using method '{effective_method}'")
 
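The aliases above only preset pdfplumber strategies before falling through to the 'pdfplumber' path, so a 'stream' call and an explicit pdfplumber call with text strategies should behave the same. A hedged usage sketch; the file name and region bounds are illustrative:

```python
from natural_pdf import PDF
from natural_pdf.elements.region import Region

page = PDF("document.pdf").pages[0]          # illustrative file name
region = Region(page, (50, 100, 550, 400))   # illustrative table area in PDF points

# 'lattice' presets vertical/horizontal strategies to "lines" (ruled tables);
# 'stream' presets them to "text" (whitespace-aligned tables).
table = region.extract_table("stream")

# Spelled out explicitly, this is the same request as the 'stream' alias:
table = region.extract_table(
    "pdfplumber",
    table_settings={"vertical_strategy": "text", "horizontal_strategy": "text"},
)

# With method=None (and no TATR-detected table), the region tries 'lattice'
# first and falls back to 'stream' when no non-empty cells are found.
table = region.extract_table()
```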
@@ -1284,16 +1375,124 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         elif effective_method == "text":
             current_text_options = text_options.copy()
             current_text_options["cell_extraction_func"] = cell_extraction_func
-            # --- Pass show_progress to the helper --- #
             current_text_options["show_progress"] = show_progress
             return self._extract_table_text(**current_text_options)
-        elif effective_method == "
+        elif effective_method == "pdfplumber":
             return self._extract_table_plumber(table_settings)
         else:
             raise ValueError(
-                f"Unknown table extraction method: '{
+                f"Unknown table extraction method: '{method}'. Choose from 'tatr', 'pdfplumber', 'text', 'stream', 'lattice'."
             )
 
+    def extract_tables(
+        self,
+        method: Optional[str] = None,
+        table_settings: Optional[dict] = None,
+    ) -> List[List[List[str]]]:
+        """
+        Extract all tables from this region using pdfplumber-based methods.
+
+        Note: Only 'pdfplumber', 'stream', and 'lattice' methods are supported for extract_tables.
+        'tatr' and 'text' methods are designed for single table extraction only.
+
+        Args:
+            method: Method to use: 'pdfplumber', 'stream', 'lattice', or None (auto-detect).
+                    'stream' uses text-based strategies, 'lattice' uses line-based strategies.
+            table_settings: Settings for pdfplumber table extraction.
+
+        Returns:
+            List of tables, where each table is a list of rows, and each row is a list of cell values.
+        """
+        if table_settings is None:
+            table_settings = {}
+
+        # Auto-detect method if not specified (try lattice first, then stream)
+        if method is None:
+            logger.debug(f"Region {self.bbox}: Auto-detecting tables extraction method...")
+
+            # Try lattice first
+            try:
+                lattice_settings = table_settings.copy()
+                lattice_settings.setdefault("vertical_strategy", "lines")
+                lattice_settings.setdefault("horizontal_strategy", "lines")
+
+                logger.debug(f"Region {self.bbox}: Trying 'lattice' method first for tables...")
+                lattice_result = self._extract_tables_plumber(lattice_settings)
+
+                # Check if lattice found meaningful tables
+                if (
+                    lattice_result
+                    and len(lattice_result) > 0
+                    and any(
+                        any(
+                            any(cell and cell.strip() for cell in row if cell)
+                            for row in table
+                            if table
+                        )
+                        for table in lattice_result
+                    )
+                ):
+                    logger.debug(
+                        f"Region {self.bbox}: 'lattice' method found {len(lattice_result)} tables"
+                    )
+                    return lattice_result
+                else:
+                    logger.debug(f"Region {self.bbox}: 'lattice' method found no meaningful tables")
+
+            except Exception as e:
+                logger.debug(f"Region {self.bbox}: 'lattice' method failed: {e}")
+
+            # Fall back to stream
+            logger.debug(f"Region {self.bbox}: Falling back to 'stream' method for tables...")
+            stream_settings = table_settings.copy()
+            stream_settings.setdefault("vertical_strategy", "text")
+            stream_settings.setdefault("horizontal_strategy", "text")
+
+            return self._extract_tables_plumber(stream_settings)
+
+        effective_method = method
+
+        # Handle method aliases
+        if effective_method == "stream":
+            logger.debug("Using 'stream' method alias for 'pdfplumber' with text-based strategies.")
+            effective_method = "pdfplumber"
+            table_settings.setdefault("vertical_strategy", "text")
+            table_settings.setdefault("horizontal_strategy", "text")
+        elif effective_method == "lattice":
+            logger.debug(
+                "Using 'lattice' method alias for 'pdfplumber' with line-based strategies."
+            )
+            effective_method = "pdfplumber"
+            table_settings.setdefault("vertical_strategy", "lines")
+            table_settings.setdefault("horizontal_strategy", "lines")
+
+        # Use the selected method
+        if effective_method == "pdfplumber":
+            return self._extract_tables_plumber(table_settings)
+        else:
+            raise ValueError(
+                f"Unknown tables extraction method: '{method}'. Choose from 'pdfplumber', 'stream', 'lattice'."
+            )
+
+    def _extract_tables_plumber(self, table_settings: dict) -> List[List[List[str]]]:
+        """
+        Extract all tables using pdfplumber's table extraction.
+
+        Args:
+            table_settings: Settings for pdfplumber table extraction
+
+        Returns:
+            List of tables, where each table is a list of rows, and each row is a list of cell values
+        """
+        # Create a crop of the page for this region
+        cropped = self.page._page.crop(self.bbox)
+
+        # Extract all tables from the cropped area
+        tables = cropped.extract_tables(table_settings)
+
+        # Return the tables or an empty list if none found
+        return tables if tables else []
+
     def _extract_table_plumber(self, table_settings: dict) -> List[List[str]]:
         """
         Extract table using pdfplumber's table extraction.
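extract_tables() is new in this release and returns every table pdfplumber finds inside the region's crop, as a list of tables where each table is a list of rows. A hedged usage sketch with illustrative bounds and file name:

```python
from natural_pdf import PDF
from natural_pdf.elements.region import Region

page = PDF("document.pdf").pages[0]                          # illustrative file name
region = Region(page, (0, 0, page.width, page.height / 2))   # top half of the page

# method=None tries line-based ('lattice') settings first, then falls back to
# text-based ('stream') settings when no non-empty cells are found.
tables = region.extract_tables()
for i, table in enumerate(tables):
    rows = len(table)
    cols = len(table[0]) if table else 0
    print(f"table {i}: {rows} rows x {cols} cols")
```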
@@ -1711,7 +1910,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
 
         # Validate contains parameter
         if contains not in ["all", "any", "center"]:
-            raise ValueError(
+            raise ValueError(
+                f"Invalid contains value: {contains}. Must be 'all', 'any', or 'center'"
+            )
 
         # Construct selector if 'text' is provided
         effective_selector = ""
@@ -1761,24 +1962,21 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         # Filter these elements based on the specified containment method
         region_bbox = self.bbox
         matching_elements = []
-
+
         if contains == "all":  # Fully inside (strict)
             matching_elements = [
-                el
+                el
+                for el in potential_elements
                 if el.x0 >= region_bbox[0]
                 and el.top >= region_bbox[1]
                 and el.x1 <= region_bbox[2]
                 and el.bottom <= region_bbox[3]
             ]
         elif contains == "any":  # Any overlap
-            matching_elements = [
-                el for el in potential_elements
-                if self.intersects(el)
-            ]
+            matching_elements = [el for el in potential_elements if self.intersects(el)]
         elif contains == "center":  # Center point inside
             matching_elements = [
-                el for el in potential_elements
-                if self.is_element_center_inside(el)
+                el for el in potential_elements if self.is_element_center_inside(el)
             ]
 
         return ElementCollection(matching_elements)
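The containment filter above distinguishes three modes when collecting elements that belong to a region. A sketch of how they differ, assuming the `contains` option is exposed through the region's element search (shown here as find_all(), which is an assumption about the public surface; the selector string, file name, and bounds are illustrative):

```python
from natural_pdf import PDF
from natural_pdf.elements.region import Region

page = PDF("document.pdf").pages[0]          # illustrative file name
region = Region(page, (50, 100, 550, 400))   # illustrative bounds in PDF points

# "all"    -> element must lie fully inside the region's bbox
# "any"    -> any overlap with the region counts (self.intersects(el))
# "center" -> only the element's center point must fall inside
fully_inside = region.find_all("text", contains="all")
overlapping = region.find_all("text", contains="any")
by_center = region.find_all("text", contains="center")
print(len(fully_inside), len(overlapping), len(by_center))
```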
@@ -1868,17 +2066,13 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         manager_args = {k: v for k, v in manager_args.items() if v is not None}
 
         # Run OCR on this region's image using the manager
-
-
-
-
-
-            )
-            return self
-            logger.debug(f"Region OCR processing returned {len(results)} results.")
-        except Exception as e:
-            logger.error(f"Error during OCRManager processing for region: {e}", exc_info=True)
+        results = ocr_mgr.apply_ocr(**manager_args)
+        if not isinstance(results, list):
+            logger.error(
+                f"OCRManager returned unexpected type for single region image: {type(results)}"
+            )
             return self
+        logger.debug(f"Region OCR processing returned {len(results)} results.")
 
         # Convert results to TextElements
         scale_x = self.width / region_image.width if region_image.width > 0 else 1.0
@@ -2669,11 +2863,11 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         self,
         text_content: Optional[Union[str, Callable[["Region"], Optional[str]]]] = None,
         source_label: str = "derived_from_region",
-        object_type: str = "word",
+        object_type: str = "word",  # Or "char", controls how it's categorized
         default_font_size: float = 10.0,
         default_font_name: str = "RegionContent",
-        confidence: Optional[float] = None,
-        add_to_page: bool = False
+        confidence: Optional[float] = None,  # Allow overriding confidence
+        add_to_page: bool = False,  # NEW: Option to add to page
     ) -> "TextElement":
         """
         Creates a new TextElement object based on this region's geometry.
@@ -2700,7 +2894,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
 
         Returns:
             A new TextElement instance.
-
+
         Raises:
             ValueError: If the region does not have a valid 'page' attribute.
         """
@@ -2711,14 +2905,17 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
             try:
                 actual_text = text_content(self)
             except Exception as e:
-                logger.error(
-
+                logger.error(
+                    f"Error executing text_content callback for region {self.bbox}: {e}",
+                    exc_info=True,
+                )
+                actual_text = None  # Ensure actual_text is None on error
 
         final_confidence = confidence
         if final_confidence is None:
             final_confidence = 1.0 if actual_text is not None and actual_text.strip() else 0.0
 
-        if not hasattr(self,
+        if not hasattr(self, "page") or self.page is None:
             raise ValueError("Region must have a valid 'page' attribute to create a TextElement.")
 
         elem_data = {
@@ -2731,8 +2928,8 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
             "height": self.height,
             "object_type": object_type,
             "page_number": self.page.page_number,
-            "stroking_color": getattr(self,
-            "non_stroking_color": getattr(self,
+            "stroking_color": getattr(self, "stroking_color", (0, 0, 0)),
+            "non_stroking_color": getattr(self, "non_stroking_color", (0, 0, 0)),
             "fontname": default_font_name,
             "size": default_font_size,
             "upright": True,
@@ -2740,18 +2937,28 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
             "adv": self.width,
             "source": source_label,
             "confidence": final_confidence,
-            "_char_dicts": []
+            "_char_dicts": [],
         }
         text_element = TextElement(elem_data, self.page)
 
         if add_to_page:
-            if hasattr(self.page,
-                add_as_type =
+            if hasattr(self.page, "_element_mgr") and self.page._element_mgr is not None:
+                add_as_type = (
+                    "words"
+                    if object_type == "word"
+                    else "chars" if object_type == "char" else object_type
+                )
                 # REMOVED try-except block around add_element
                 self.page._element_mgr.add_element(text_element, element_type=add_as_type)
-                logger.debug(
+                logger.debug(
+                    f"TextElement created from region {self.bbox} and added to page {self.page.page_number} as {add_as_type}."
+                )
             else:
-                page_num_str =
-
-
+                page_num_str = (
+                    str(self.page.page_number) if hasattr(self.page, "page_number") else "N/A"
+                )
+                logger.warning(
+                    f"Cannot add TextElement to page: Page {page_num_str} for region {self.bbox} is missing '_element_mgr'."
+                )
+
         return text_element
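to_text_element() stamps a synthetic TextElement over the region's geometry; per the updated signature, add_to_page=True also registers it with the page's element manager (as "words" for object_type="word"). A hedged usage sketch; the literal text, file name, and bounds are illustrative:

```python
from natural_pdf import PDF
from natural_pdf.elements.region import Region

page = PDF("document.pdf").pages[0]        # illustrative file name
region = Region(page, (72, 72, 200, 90))   # illustrative label area

label = region.to_text_element(
    text_content="TOTAL",                  # or a callable, e.g. lambda r: r.extract_text()
    source_label="derived_from_region",
    object_type="word",
    add_to_page=True,                      # register as a "words" element on the page
)
print(label.bbox)
```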