natural-pdf 0.1.15__py3-none-any.whl → 0.1.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +31 -0
- natural_pdf/analyzers/layout/gemini.py +137 -162
- natural_pdf/analyzers/layout/layout_manager.py +9 -5
- natural_pdf/analyzers/layout/layout_options.py +77 -7
- natural_pdf/analyzers/layout/paddle.py +318 -165
- natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
- natural_pdf/analyzers/shape_detection_mixin.py +770 -405
- natural_pdf/classification/mixin.py +2 -8
- natural_pdf/collections/pdf_collection.py +25 -30
- natural_pdf/core/highlighting_service.py +47 -32
- natural_pdf/core/page.py +119 -76
- natural_pdf/core/pdf.py +19 -22
- natural_pdf/describe/__init__.py +21 -0
- natural_pdf/describe/base.py +457 -0
- natural_pdf/describe/elements.py +411 -0
- natural_pdf/describe/mixin.py +84 -0
- natural_pdf/describe/summary.py +186 -0
- natural_pdf/elements/base.py +11 -10
- natural_pdf/elements/collections.py +116 -51
- natural_pdf/elements/region.py +204 -127
- natural_pdf/exporters/paddleocr.py +38 -13
- natural_pdf/flows/__init__.py +3 -3
- natural_pdf/flows/collections.py +303 -132
- natural_pdf/flows/element.py +277 -132
- natural_pdf/flows/flow.py +33 -16
- natural_pdf/flows/region.py +142 -79
- natural_pdf/ocr/engine_doctr.py +37 -4
- natural_pdf/ocr/engine_easyocr.py +23 -3
- natural_pdf/ocr/engine_paddle.py +281 -30
- natural_pdf/ocr/engine_surya.py +8 -3
- natural_pdf/ocr/ocr_manager.py +75 -76
- natural_pdf/ocr/ocr_options.py +52 -87
- natural_pdf/search/__init__.py +25 -12
- natural_pdf/search/lancedb_search_service.py +91 -54
- natural_pdf/search/numpy_search_service.py +86 -65
- natural_pdf/search/searchable_mixin.py +2 -2
- natural_pdf/selectors/parser.py +125 -81
- natural_pdf/widgets/__init__.py +1 -1
- natural_pdf/widgets/viewer.py +205 -449
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/METADATA +27 -45
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/RECORD +44 -38
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/top_level.txt +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -5,15 +5,20 @@ from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to
|
|
5
5
|
|
6
6
|
# New Imports
|
7
7
|
from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
|
8
|
+
from tqdm.auto import tqdm
|
8
9
|
|
9
10
|
from natural_pdf.analyzers.layout.pdfplumber_table_finder import find_text_based_tables
|
11
|
+
|
12
|
+
# --- Shape Detection Mixin --- #
|
13
|
+
from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
|
10
14
|
from natural_pdf.classification.manager import ClassificationManager # Keep for type hint
|
11
15
|
|
12
16
|
# --- Classification Imports --- #
|
13
17
|
from natural_pdf.classification.mixin import ClassificationMixin
|
18
|
+
from natural_pdf.describe.mixin import DescribeMixin
|
14
19
|
from natural_pdf.elements.base import DirectionalMixin
|
20
|
+
from natural_pdf.elements.text import TextElement # ADDED IMPORT
|
15
21
|
from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
|
16
|
-
from natural_pdf.elements.text import TextElement # ADDED IMPORT
|
17
22
|
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
|
18
23
|
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
19
24
|
from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
@@ -21,21 +26,19 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
|
21
26
|
# Import new utils
|
22
27
|
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
23
28
|
|
24
|
-
from tqdm.auto import tqdm
|
25
29
|
# --- End Classification Imports --- #
|
26
30
|
|
27
|
-
|
28
|
-
|
29
|
-
# --- End Shape Detection Mixin --- #
|
31
|
+
|
32
|
+
# --- End Shape Detection Mixin --- #
|
30
33
|
|
31
34
|
if TYPE_CHECKING:
|
32
35
|
# --- NEW: Add Image type hint for classification --- #
|
33
36
|
from PIL.Image import Image
|
34
37
|
|
35
38
|
from natural_pdf.core.page import Page
|
39
|
+
from natural_pdf.elements.base import Element # Added for type hint
|
36
40
|
from natural_pdf.elements.collections import ElementCollection
|
37
41
|
from natural_pdf.elements.text import TextElement
|
38
|
-
from natural_pdf.elements.base import Element # Added for type hint
|
39
42
|
|
40
43
|
# Import OCRManager conditionally to avoid circular imports
|
41
44
|
try:
|
@@ -47,7 +50,7 @@ except ImportError:
|
|
47
50
|
logger = logging.getLogger(__name__)
|
48
51
|
|
49
52
|
|
50
|
-
class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
53
|
+
class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
|
51
54
|
"""
|
52
55
|
Represents a rectangular region on a page.
|
53
56
|
"""
|
@@ -726,25 +729,32 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
726
729
|
# Handle the case where user wants the cropped region to have a specific width
|
727
730
|
page_kwargs = kwargs.copy()
|
728
731
|
effective_resolution = resolution # Start with the provided resolution
|
729
|
-
|
730
|
-
if crop_only and
|
731
|
-
target_width = kwargs[
|
732
|
+
|
733
|
+
if crop_only and "width" in kwargs:
|
734
|
+
target_width = kwargs["width"]
|
732
735
|
# Calculate what resolution is needed to make the region crop have target_width
|
733
736
|
region_width_points = self.width # Region width in PDF points
|
734
|
-
|
737
|
+
|
735
738
|
if region_width_points > 0:
|
736
739
|
# Calculate scale needed: target_width / region_width_points
|
737
740
|
required_scale = target_width / region_width_points
|
738
741
|
# Convert scale to resolution: scale * 72 DPI
|
739
742
|
effective_resolution = required_scale * 72.0
|
740
|
-
page_kwargs.pop(
|
741
|
-
logger.debug(
|
743
|
+
page_kwargs.pop("width") # Remove width parameter to avoid conflicts
|
744
|
+
logger.debug(
|
745
|
+
f"Region {self.bbox}: Calculated required resolution {effective_resolution:.1f} DPI for region crop width {target_width}"
|
746
|
+
)
|
742
747
|
else:
|
743
|
-
logger.warning(
|
748
|
+
logger.warning(
|
749
|
+
f"Region {self.bbox}: Invalid region width {region_width_points}, using original resolution"
|
750
|
+
)
|
744
751
|
|
745
752
|
# First get the full page image with highlights if requested
|
746
753
|
page_image = self._page.to_image(
|
747
|
-
scale=scale,
|
754
|
+
scale=scale,
|
755
|
+
resolution=effective_resolution,
|
756
|
+
include_highlights=include_highlights,
|
757
|
+
**page_kwargs,
|
748
758
|
)
|
749
759
|
|
750
760
|
# Calculate the actual scale factor used by the page image
|
@@ -899,13 +909,19 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
899
909
|
image.save(filename)
|
900
910
|
return self
|
901
911
|
|
902
|
-
def trim(
|
912
|
+
def trim(
|
913
|
+
self,
|
914
|
+
padding: int = 1,
|
915
|
+
threshold: float = 0.95,
|
916
|
+
resolution: float = 150,
|
917
|
+
pre_shrink: float = 0.5,
|
918
|
+
) -> "Region":
|
903
919
|
"""
|
904
920
|
Trim visual whitespace from the edges of this region.
|
905
|
-
|
921
|
+
|
906
922
|
Similar to Python's string .strip() method, but for visual whitespace in the region image.
|
907
923
|
Uses pixel analysis to detect rows/columns that are predominantly whitespace.
|
908
|
-
|
924
|
+
|
909
925
|
Args:
|
910
926
|
padding: Number of pixels to keep as padding after trimming (default: 1)
|
911
927
|
threshold: Threshold for considering a row/column as whitespace (0.0-1.0, default: 0.95)
|
@@ -914,104 +930,126 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
914
930
|
resolution: Resolution for image rendering in DPI (default: 150)
|
915
931
|
pre_shrink: Amount to shrink region before trimming, then expand back after (default: 0.5)
|
916
932
|
This helps avoid detecting box borders/slivers as content.
|
917
|
-
|
933
|
+
|
918
934
|
Returns:
|
919
935
|
New Region with visual whitespace trimmed from all edges
|
920
|
-
|
936
|
+
|
921
937
|
Example:
|
922
938
|
# Basic trimming with 1 pixel padding and 0.5px pre-shrink
|
923
939
|
trimmed = region.trim()
|
924
|
-
|
940
|
+
|
925
941
|
# More aggressive trimming with no padding and no pre-shrink
|
926
942
|
tight = region.trim(padding=0, threshold=0.9, pre_shrink=0)
|
927
|
-
|
943
|
+
|
928
944
|
# Conservative trimming with more padding
|
929
945
|
loose = region.trim(padding=3, threshold=0.98)
|
930
946
|
"""
|
931
947
|
# Pre-shrink the region to avoid box slivers
|
932
|
-
work_region =
|
933
|
-
|
948
|
+
work_region = (
|
949
|
+
self.expand(left=-pre_shrink, right=-pre_shrink, top=-pre_shrink, bottom=-pre_shrink)
|
950
|
+
if pre_shrink > 0
|
951
|
+
else self
|
952
|
+
)
|
953
|
+
|
934
954
|
# Get the region image
|
935
|
-
image = work_region.to_image(
|
936
|
-
|
955
|
+
image = work_region.to_image(
|
956
|
+
resolution=resolution, crop_only=True, include_highlights=False
|
957
|
+
)
|
958
|
+
|
937
959
|
if image is None:
|
938
|
-
logger.warning(
|
960
|
+
logger.warning(
|
961
|
+
f"Region {self.bbox}: Could not generate image for trimming. Returning original region."
|
962
|
+
)
|
939
963
|
return self
|
940
|
-
|
964
|
+
|
941
965
|
# Convert to grayscale for easier analysis
|
942
966
|
import numpy as np
|
943
|
-
|
967
|
+
|
944
968
|
# Convert PIL image to numpy array
|
945
|
-
img_array = np.array(image.convert(
|
969
|
+
img_array = np.array(image.convert("L")) # Convert to grayscale
|
946
970
|
height, width = img_array.shape
|
947
|
-
|
971
|
+
|
948
972
|
if height == 0 or width == 0:
|
949
|
-
logger.warning(
|
973
|
+
logger.warning(
|
974
|
+
f"Region {self.bbox}: Image has zero dimensions. Returning original region."
|
975
|
+
)
|
950
976
|
return self
|
951
|
-
|
977
|
+
|
952
978
|
# Normalize pixel values to 0-1 range (255 = white = 1.0, 0 = black = 0.0)
|
953
979
|
normalized = img_array.astype(np.float32) / 255.0
|
954
|
-
|
980
|
+
|
955
981
|
# Find content boundaries by analyzing row and column averages
|
956
|
-
|
982
|
+
|
957
983
|
# Analyze rows (horizontal strips) to find top and bottom boundaries
|
958
984
|
row_averages = np.mean(normalized, axis=1) # Average each row
|
959
985
|
content_rows = row_averages < threshold # True where there's content (not whitespace)
|
960
|
-
|
986
|
+
|
961
987
|
# Find first and last rows with content
|
962
988
|
content_row_indices = np.where(content_rows)[0]
|
963
989
|
if len(content_row_indices) == 0:
|
964
990
|
# No content found, return a minimal region at the center
|
965
|
-
logger.warning(
|
991
|
+
logger.warning(
|
992
|
+
f"Region {self.bbox}: No content detected during trimming. Returning center point."
|
993
|
+
)
|
966
994
|
center_x = (self.x0 + self.x1) / 2
|
967
995
|
center_y = (self.top + self.bottom) / 2
|
968
996
|
return Region(self.page, (center_x, center_y, center_x, center_y))
|
969
|
-
|
997
|
+
|
970
998
|
top_content_row = max(0, content_row_indices[0] - padding)
|
971
999
|
bottom_content_row = min(height - 1, content_row_indices[-1] + padding)
|
972
|
-
|
973
|
-
# Analyze columns (vertical strips) to find left and right boundaries
|
1000
|
+
|
1001
|
+
# Analyze columns (vertical strips) to find left and right boundaries
|
974
1002
|
col_averages = np.mean(normalized, axis=0) # Average each column
|
975
1003
|
content_cols = col_averages < threshold # True where there's content
|
976
|
-
|
1004
|
+
|
977
1005
|
content_col_indices = np.where(content_cols)[0]
|
978
1006
|
if len(content_col_indices) == 0:
|
979
1007
|
# No content found in columns either
|
980
|
-
logger.warning(
|
1008
|
+
logger.warning(
|
1009
|
+
f"Region {self.bbox}: No column content detected during trimming. Returning center point."
|
1010
|
+
)
|
981
1011
|
center_x = (self.x0 + self.x1) / 2
|
982
1012
|
center_y = (self.top + self.bottom) / 2
|
983
1013
|
return Region(self.page, (center_x, center_y, center_x, center_y))
|
984
|
-
|
1014
|
+
|
985
1015
|
left_content_col = max(0, content_col_indices[0] - padding)
|
986
1016
|
right_content_col = min(width - 1, content_col_indices[-1] + padding)
|
987
|
-
|
1017
|
+
|
988
1018
|
# Convert trimmed pixel coordinates back to PDF coordinates
|
989
1019
|
scale_factor = resolution / 72.0 # Scale factor used in to_image()
|
990
|
-
|
1020
|
+
|
991
1021
|
# Calculate new PDF coordinates and ensure they are Python floats
|
992
1022
|
trimmed_x0 = float(work_region.x0 + (left_content_col / scale_factor))
|
993
1023
|
trimmed_top = float(work_region.top + (top_content_row / scale_factor))
|
994
|
-
trimmed_x1 = float(
|
995
|
-
|
996
|
-
|
1024
|
+
trimmed_x1 = float(
|
1025
|
+
work_region.x0 + ((right_content_col + 1) / scale_factor)
|
1026
|
+
) # +1 because we want inclusive right edge
|
1027
|
+
trimmed_bottom = float(
|
1028
|
+
work_region.top + ((bottom_content_row + 1) / scale_factor)
|
1029
|
+
) # +1 because we want inclusive bottom edge
|
1030
|
+
|
997
1031
|
# Ensure the trimmed region doesn't exceed the work region boundaries
|
998
1032
|
final_x0 = max(work_region.x0, trimmed_x0)
|
999
1033
|
final_top = max(work_region.top, trimmed_top)
|
1000
1034
|
final_x1 = min(work_region.x1, trimmed_x1)
|
1001
1035
|
final_bottom = min(work_region.bottom, trimmed_bottom)
|
1002
|
-
|
1036
|
+
|
1003
1037
|
# Ensure valid coordinates (width > 0, height > 0)
|
1004
1038
|
if final_x1 <= final_x0 or final_bottom <= final_top:
|
1005
|
-
logger.warning(
|
1039
|
+
logger.warning(
|
1040
|
+
f"Region {self.bbox}: Trimming resulted in invalid dimensions. Returning original region."
|
1041
|
+
)
|
1006
1042
|
return self
|
1007
|
-
|
1043
|
+
|
1008
1044
|
# Create the trimmed region
|
1009
1045
|
trimmed_region = Region(self.page, (final_x0, final_top, final_x1, final_bottom))
|
1010
|
-
|
1046
|
+
|
1011
1047
|
# Expand back by the pre_shrink amount to restore original positioning
|
1012
1048
|
if pre_shrink > 0:
|
1013
|
-
trimmed_region = trimmed_region.expand(
|
1014
|
-
|
1049
|
+
trimmed_region = trimmed_region.expand(
|
1050
|
+
left=pre_shrink, right=pre_shrink, top=pre_shrink, bottom=pre_shrink
|
1051
|
+
)
|
1052
|
+
|
1015
1053
|
# Copy relevant metadata
|
1016
1054
|
trimmed_region.region_type = self.region_type
|
1017
1055
|
trimmed_region.normalized_type = self.normalized_type
|
@@ -1021,8 +1059,10 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1021
1059
|
trimmed_region.label = self.label
|
1022
1060
|
trimmed_region.source = "trimmed" # Indicate this is a derived region
|
1023
1061
|
trimmed_region.parent_region = self
|
1024
|
-
|
1025
|
-
logger.debug(
|
1062
|
+
|
1063
|
+
logger.debug(
|
1064
|
+
f"Region {self.bbox}: Trimmed to {trimmed_region.bbox} (padding={padding}, threshold={threshold}, pre_shrink={pre_shrink})"
|
1065
|
+
)
|
1026
1066
|
return trimmed_region
|
1027
1067
|
|
1028
1068
|
def clip(
|
@@ -1035,42 +1075,42 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1035
1075
|
) -> "Region":
|
1036
1076
|
"""
|
1037
1077
|
Clip this region to specific bounds, either from another object with bbox or explicit coordinates.
|
1038
|
-
|
1078
|
+
|
1039
1079
|
The clipped region will be constrained to not exceed the specified boundaries.
|
1040
1080
|
You can provide either an object with bounding box properties, specific coordinates, or both.
|
1041
1081
|
When both are provided, explicit coordinates take precedence.
|
1042
|
-
|
1082
|
+
|
1043
1083
|
Args:
|
1044
1084
|
obj: Optional object with bbox properties (Region, Element, TextElement, etc.)
|
1045
1085
|
left: Optional left boundary (x0) to clip to
|
1046
|
-
top: Optional top boundary to clip to
|
1086
|
+
top: Optional top boundary to clip to
|
1047
1087
|
right: Optional right boundary (x1) to clip to
|
1048
1088
|
bottom: Optional bottom boundary to clip to
|
1049
|
-
|
1089
|
+
|
1050
1090
|
Returns:
|
1051
1091
|
New Region with bounds clipped to the specified constraints
|
1052
|
-
|
1092
|
+
|
1053
1093
|
Examples:
|
1054
1094
|
# Clip to another region's bounds
|
1055
1095
|
clipped = region.clip(container_region)
|
1056
|
-
|
1096
|
+
|
1057
1097
|
# Clip to any element's bounds
|
1058
1098
|
clipped = region.clip(text_element)
|
1059
|
-
|
1099
|
+
|
1060
1100
|
# Clip to specific coordinates
|
1061
1101
|
clipped = region.clip(left=100, right=400)
|
1062
|
-
|
1102
|
+
|
1063
1103
|
# Mix object bounds with specific overrides
|
1064
1104
|
clipped = region.clip(obj=container, bottom=page.height/2)
|
1065
1105
|
"""
|
1066
1106
|
from natural_pdf.elements.base import extract_bbox
|
1067
|
-
|
1107
|
+
|
1068
1108
|
# Start with current region bounds
|
1069
1109
|
clip_x0 = self.x0
|
1070
1110
|
clip_top = self.top
|
1071
1111
|
clip_x1 = self.x1
|
1072
1112
|
clip_bottom = self.bottom
|
1073
|
-
|
1113
|
+
|
1074
1114
|
# Apply object constraints if provided
|
1075
1115
|
if obj is not None:
|
1076
1116
|
obj_bbox = extract_bbox(obj)
|
@@ -1086,7 +1126,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1086
1126
|
f"Region {self.bbox}: Cannot extract bbox from clipping object {type(obj)}. "
|
1087
1127
|
"Object must have bbox property or x0/top/x1/bottom attributes."
|
1088
1128
|
)
|
1089
|
-
|
1129
|
+
|
1090
1130
|
# Apply explicit coordinate constraints (these take precedence)
|
1091
1131
|
if left is not None:
|
1092
1132
|
clip_x0 = max(clip_x0, left)
|
@@ -1096,7 +1136,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1096
1136
|
clip_x1 = min(clip_x1, right)
|
1097
1137
|
if bottom is not None:
|
1098
1138
|
clip_bottom = min(clip_bottom, bottom)
|
1099
|
-
|
1139
|
+
|
1100
1140
|
# Ensure valid coordinates
|
1101
1141
|
if clip_x1 <= clip_x0 or clip_bottom <= clip_top:
|
1102
1142
|
logger.warning(
|
@@ -1105,10 +1145,10 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1105
1145
|
)
|
1106
1146
|
# Return a minimal region at the clip area's top-left
|
1107
1147
|
return Region(self.page, (clip_x0, clip_top, clip_x0, clip_top))
|
1108
|
-
|
1148
|
+
|
1109
1149
|
# Create the clipped region
|
1110
1150
|
clipped_region = Region(self.page, (clip_x0, clip_top, clip_x1, clip_bottom))
|
1111
|
-
|
1151
|
+
|
1112
1152
|
# Copy relevant metadata
|
1113
1153
|
clipped_region.region_type = self.region_type
|
1114
1154
|
clipped_region.normalized_type = self.normalized_type
|
@@ -1118,7 +1158,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1118
1158
|
clipped_region.label = self.label
|
1119
1159
|
clipped_region.source = "clipped" # Indicate this is a derived region
|
1120
1160
|
clipped_region.parent_region = self
|
1121
|
-
|
1161
|
+
|
1122
1162
|
logger.debug(
|
1123
1163
|
f"Region {self.bbox}: Clipped to {clipped_region.bbox} "
|
1124
1164
|
f"(constraints: obj={type(obj).__name__ if obj else None}, "
|
@@ -1279,24 +1319,36 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1279
1319
|
else:
|
1280
1320
|
# Try lattice first, then fall back to stream if no meaningful results
|
1281
1321
|
logger.debug(f"Region {self.bbox}: Auto-detecting table extraction method...")
|
1282
|
-
|
1322
|
+
|
1283
1323
|
try:
|
1284
1324
|
logger.debug(f"Region {self.bbox}: Trying 'lattice' method first...")
|
1285
|
-
lattice_result = self.extract_table(
|
1286
|
-
|
1325
|
+
lattice_result = self.extract_table(
|
1326
|
+
"lattice", table_settings=table_settings.copy()
|
1327
|
+
)
|
1328
|
+
|
1287
1329
|
# Check if lattice found meaningful content
|
1288
|
-
if (
|
1289
|
-
|
1290
|
-
|
1330
|
+
if (
|
1331
|
+
lattice_result
|
1332
|
+
and len(lattice_result) > 0
|
1333
|
+
and any(
|
1334
|
+
any(cell and cell.strip() for cell in row if cell)
|
1335
|
+
for row in lattice_result
|
1336
|
+
)
|
1337
|
+
):
|
1338
|
+
logger.debug(
|
1339
|
+
f"Region {self.bbox}: 'lattice' method found table with {len(lattice_result)} rows"
|
1340
|
+
)
|
1291
1341
|
return lattice_result
|
1292
1342
|
else:
|
1293
|
-
logger.debug(
|
1343
|
+
logger.debug(
|
1344
|
+
f"Region {self.bbox}: 'lattice' method found no meaningful content"
|
1345
|
+
)
|
1294
1346
|
except Exception as e:
|
1295
1347
|
logger.debug(f"Region {self.bbox}: 'lattice' method failed: {e}")
|
1296
|
-
|
1348
|
+
|
1297
1349
|
# Fall back to stream
|
1298
1350
|
logger.debug(f"Region {self.bbox}: Falling back to 'stream' method...")
|
1299
|
-
return self.extract_table(
|
1351
|
+
return self.extract_table("stream", table_settings=table_settings.copy())
|
1300
1352
|
else:
|
1301
1353
|
effective_method = method
|
1302
1354
|
|
@@ -1308,7 +1360,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1308
1360
|
table_settings.setdefault("vertical_strategy", "text")
|
1309
1361
|
table_settings.setdefault("horizontal_strategy", "text")
|
1310
1362
|
elif effective_method == "lattice":
|
1311
|
-
logger.debug(
|
1363
|
+
logger.debug(
|
1364
|
+
"Using 'lattice' method alias for 'pdfplumber' with line-based strategies."
|
1365
|
+
)
|
1312
1366
|
effective_method = "pdfplumber"
|
1313
1367
|
# Set default line strategies if not already provided by the user
|
1314
1368
|
table_settings.setdefault("vertical_strategy", "lines")
|
@@ -1331,7 +1385,6 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1331
1385
|
f"Unknown table extraction method: '{method}'. Choose from 'tatr', 'pdfplumber', 'text', 'stream', 'lattice'."
|
1332
1386
|
)
|
1333
1387
|
|
1334
|
-
|
1335
1388
|
def extract_tables(
|
1336
1389
|
self,
|
1337
1390
|
method: Optional[str] = None,
|
@@ -1357,33 +1410,45 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1357
1410
|
# Auto-detect method if not specified (try lattice first, then stream)
|
1358
1411
|
if method is None:
|
1359
1412
|
logger.debug(f"Region {self.bbox}: Auto-detecting tables extraction method...")
|
1360
|
-
|
1413
|
+
|
1361
1414
|
# Try lattice first
|
1362
1415
|
try:
|
1363
1416
|
lattice_settings = table_settings.copy()
|
1364
1417
|
lattice_settings.setdefault("vertical_strategy", "lines")
|
1365
1418
|
lattice_settings.setdefault("horizontal_strategy", "lines")
|
1366
|
-
|
1419
|
+
|
1367
1420
|
logger.debug(f"Region {self.bbox}: Trying 'lattice' method first for tables...")
|
1368
1421
|
lattice_result = self._extract_tables_plumber(lattice_settings)
|
1369
|
-
|
1422
|
+
|
1370
1423
|
# Check if lattice found meaningful tables
|
1371
|
-
if (
|
1372
|
-
|
1373
|
-
|
1424
|
+
if (
|
1425
|
+
lattice_result
|
1426
|
+
and len(lattice_result) > 0
|
1427
|
+
and any(
|
1428
|
+
any(
|
1429
|
+
any(cell and cell.strip() for cell in row if cell)
|
1430
|
+
for row in table
|
1431
|
+
if table
|
1432
|
+
)
|
1433
|
+
for table in lattice_result
|
1434
|
+
)
|
1435
|
+
):
|
1436
|
+
logger.debug(
|
1437
|
+
f"Region {self.bbox}: 'lattice' method found {len(lattice_result)} tables"
|
1438
|
+
)
|
1374
1439
|
return lattice_result
|
1375
1440
|
else:
|
1376
1441
|
logger.debug(f"Region {self.bbox}: 'lattice' method found no meaningful tables")
|
1377
|
-
|
1442
|
+
|
1378
1443
|
except Exception as e:
|
1379
1444
|
logger.debug(f"Region {self.bbox}: 'lattice' method failed: {e}")
|
1380
|
-
|
1445
|
+
|
1381
1446
|
# Fall back to stream
|
1382
1447
|
logger.debug(f"Region {self.bbox}: Falling back to 'stream' method for tables...")
|
1383
1448
|
stream_settings = table_settings.copy()
|
1384
1449
|
stream_settings.setdefault("vertical_strategy", "text")
|
1385
1450
|
stream_settings.setdefault("horizontal_strategy", "text")
|
1386
|
-
|
1451
|
+
|
1387
1452
|
return self._extract_tables_plumber(stream_settings)
|
1388
1453
|
|
1389
1454
|
effective_method = method
|
@@ -1395,7 +1460,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1395
1460
|
table_settings.setdefault("vertical_strategy", "text")
|
1396
1461
|
table_settings.setdefault("horizontal_strategy", "text")
|
1397
1462
|
elif effective_method == "lattice":
|
1398
|
-
logger.debug(
|
1463
|
+
logger.debug(
|
1464
|
+
"Using 'lattice' method alias for 'pdfplumber' with line-based strategies."
|
1465
|
+
)
|
1399
1466
|
effective_method = "pdfplumber"
|
1400
1467
|
table_settings.setdefault("vertical_strategy", "lines")
|
1401
1468
|
table_settings.setdefault("horizontal_strategy", "lines")
|
@@ -1844,7 +1911,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1844
1911
|
|
1845
1912
|
# Validate contains parameter
|
1846
1913
|
if contains not in ["all", "any", "center"]:
|
1847
|
-
raise ValueError(
|
1914
|
+
raise ValueError(
|
1915
|
+
f"Invalid contains value: {contains}. Must be 'all', 'any', or 'center'"
|
1916
|
+
)
|
1848
1917
|
|
1849
1918
|
# Construct selector if 'text' is provided
|
1850
1919
|
effective_selector = ""
|
@@ -1894,24 +1963,21 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1894
1963
|
# Filter these elements based on the specified containment method
|
1895
1964
|
region_bbox = self.bbox
|
1896
1965
|
matching_elements = []
|
1897
|
-
|
1966
|
+
|
1898
1967
|
if contains == "all": # Fully inside (strict)
|
1899
1968
|
matching_elements = [
|
1900
|
-
el
|
1969
|
+
el
|
1970
|
+
for el in potential_elements
|
1901
1971
|
if el.x0 >= region_bbox[0]
|
1902
1972
|
and el.top >= region_bbox[1]
|
1903
1973
|
and el.x1 <= region_bbox[2]
|
1904
1974
|
and el.bottom <= region_bbox[3]
|
1905
1975
|
]
|
1906
1976
|
elif contains == "any": # Any overlap
|
1907
|
-
matching_elements = [
|
1908
|
-
el for el in potential_elements
|
1909
|
-
if self.intersects(el)
|
1910
|
-
]
|
1977
|
+
matching_elements = [el for el in potential_elements if self.intersects(el)]
|
1911
1978
|
elif contains == "center": # Center point inside
|
1912
1979
|
matching_elements = [
|
1913
|
-
el for el in potential_elements
|
1914
|
-
if self.is_element_center_inside(el)
|
1980
|
+
el for el in potential_elements if self.is_element_center_inside(el)
|
1915
1981
|
]
|
1916
1982
|
|
1917
1983
|
return ElementCollection(matching_elements)
|
@@ -2001,17 +2067,13 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2001
2067
|
manager_args = {k: v for k, v in manager_args.items() if v is not None}
|
2002
2068
|
|
2003
2069
|
# Run OCR on this region's image using the manager
|
2004
|
-
|
2005
|
-
|
2006
|
-
|
2007
|
-
|
2008
|
-
|
2009
|
-
)
|
2010
|
-
return self
|
2011
|
-
logger.debug(f"Region OCR processing returned {len(results)} results.")
|
2012
|
-
except Exception as e:
|
2013
|
-
logger.error(f"Error during OCRManager processing for region: {e}", exc_info=True)
|
2070
|
+
results = ocr_mgr.apply_ocr(**manager_args)
|
2071
|
+
if not isinstance(results, list):
|
2072
|
+
logger.error(
|
2073
|
+
f"OCRManager returned unexpected type for single region image: {type(results)}"
|
2074
|
+
)
|
2014
2075
|
return self
|
2076
|
+
logger.debug(f"Region OCR processing returned {len(results)} results.")
|
2015
2077
|
|
2016
2078
|
# Convert results to TextElements
|
2017
2079
|
scale_x = self.width / region_image.width if region_image.width > 0 else 1.0
|
@@ -2802,11 +2864,11 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2802
2864
|
self,
|
2803
2865
|
text_content: Optional[Union[str, Callable[["Region"], Optional[str]]]] = None,
|
2804
2866
|
source_label: str = "derived_from_region",
|
2805
|
-
object_type: str = "word",
|
2867
|
+
object_type: str = "word", # Or "char", controls how it's categorized
|
2806
2868
|
default_font_size: float = 10.0,
|
2807
2869
|
default_font_name: str = "RegionContent",
|
2808
|
-
confidence: Optional[float] = None,
|
2809
|
-
add_to_page: bool = False
|
2870
|
+
confidence: Optional[float] = None, # Allow overriding confidence
|
2871
|
+
add_to_page: bool = False, # NEW: Option to add to page
|
2810
2872
|
) -> "TextElement":
|
2811
2873
|
"""
|
2812
2874
|
Creates a new TextElement object based on this region's geometry.
|
@@ -2833,7 +2895,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2833
2895
|
|
2834
2896
|
Returns:
|
2835
2897
|
A new TextElement instance.
|
2836
|
-
|
2898
|
+
|
2837
2899
|
Raises:
|
2838
2900
|
ValueError: If the region does not have a valid 'page' attribute.
|
2839
2901
|
"""
|
@@ -2844,14 +2906,17 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2844
2906
|
try:
|
2845
2907
|
actual_text = text_content(self)
|
2846
2908
|
except Exception as e:
|
2847
|
-
logger.error(
|
2848
|
-
|
2909
|
+
logger.error(
|
2910
|
+
f"Error executing text_content callback for region {self.bbox}: {e}",
|
2911
|
+
exc_info=True,
|
2912
|
+
)
|
2913
|
+
actual_text = None # Ensure actual_text is None on error
|
2849
2914
|
|
2850
2915
|
final_confidence = confidence
|
2851
2916
|
if final_confidence is None:
|
2852
2917
|
final_confidence = 1.0 if actual_text is not None and actual_text.strip() else 0.0
|
2853
2918
|
|
2854
|
-
if not hasattr(self,
|
2919
|
+
if not hasattr(self, "page") or self.page is None:
|
2855
2920
|
raise ValueError("Region must have a valid 'page' attribute to create a TextElement.")
|
2856
2921
|
|
2857
2922
|
elem_data = {
|
@@ -2864,8 +2929,8 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2864
2929
|
"height": self.height,
|
2865
2930
|
"object_type": object_type,
|
2866
2931
|
"page_number": self.page.page_number,
|
2867
|
-
"stroking_color": getattr(self,
|
2868
|
-
"non_stroking_color": getattr(self,
|
2932
|
+
"stroking_color": getattr(self, "stroking_color", (0, 0, 0)),
|
2933
|
+
"non_stroking_color": getattr(self, "non_stroking_color", (0, 0, 0)),
|
2869
2934
|
"fontname": default_font_name,
|
2870
2935
|
"size": default_font_size,
|
2871
2936
|
"upright": True,
|
@@ -2873,18 +2938,30 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2873
2938
|
"adv": self.width,
|
2874
2939
|
"source": source_label,
|
2875
2940
|
"confidence": final_confidence,
|
2876
|
-
"_char_dicts": []
|
2941
|
+
"_char_dicts": [],
|
2877
2942
|
}
|
2878
2943
|
text_element = TextElement(elem_data, self.page)
|
2879
2944
|
|
2880
2945
|
if add_to_page:
|
2881
|
-
if hasattr(self.page,
|
2882
|
-
add_as_type =
|
2946
|
+
if hasattr(self.page, "_element_mgr") and self.page._element_mgr is not None:
|
2947
|
+
add_as_type = (
|
2948
|
+
"words"
|
2949
|
+
if object_type == "word"
|
2950
|
+
else "chars" if object_type == "char" else object_type
|
2951
|
+
)
|
2883
2952
|
# REMOVED try-except block around add_element
|
2884
2953
|
self.page._element_mgr.add_element(text_element, element_type=add_as_type)
|
2885
|
-
logger.debug(
|
2954
|
+
logger.debug(
|
2955
|
+
f"TextElement created from region {self.bbox} and added to page {self.page.page_number} as {add_as_type}."
|
2956
|
+
)
|
2886
2957
|
else:
|
2887
|
-
page_num_str =
|
2888
|
-
|
2889
|
-
|
2958
|
+
page_num_str = (
|
2959
|
+
str(self.page.page_number) if hasattr(self.page, "page_number") else "N/A"
|
2960
|
+
)
|
2961
|
+
logger.warning(
|
2962
|
+
f"Cannot add TextElement to page: Page {page_num_str} for region {self.bbox} is missing '_element_mgr'."
|
2963
|
+
)
|
2964
|
+
|
2890
2965
|
return text_element
|
2966
|
+
|
2967
|
+
|