natural-pdf 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/shape_detection_mixin.py +1373 -0
- natural_pdf/classification/manager.py +2 -3
- natural_pdf/collections/pdf_collection.py +19 -39
- natural_pdf/core/highlighting_service.py +29 -38
- natural_pdf/core/page.py +284 -187
- natural_pdf/core/pdf.py +4 -4
- natural_pdf/elements/base.py +54 -20
- natural_pdf/elements/collections.py +160 -9
- natural_pdf/elements/line.py +5 -0
- natural_pdf/elements/region.py +380 -38
- natural_pdf/exporters/paddleocr.py +51 -11
- natural_pdf/flows/__init__.py +12 -0
- natural_pdf/flows/collections.py +533 -0
- natural_pdf/flows/element.py +382 -0
- natural_pdf/flows/flow.py +216 -0
- natural_pdf/flows/region.py +458 -0
- natural_pdf/selectors/parser.py +163 -8
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.14.dist-info}/METADATA +2 -1
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.14.dist-info}/RECORD +22 -17
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.14.dist-info}/WHEEL +1 -1
- natural_pdf/utils/tqdm_utils.py +0 -51
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.14.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.14.dist-info}/top_level.txt +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -13,6 +13,7 @@ from natural_pdf.classification.manager import ClassificationManager # Keep for
|
|
13
13
|
from natural_pdf.classification.mixin import ClassificationMixin
|
14
14
|
from natural_pdf.elements.base import DirectionalMixin
|
15
15
|
from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
|
16
|
+
from natural_pdf.elements.text import TextElement # ADDED IMPORT
|
16
17
|
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
|
17
18
|
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
18
19
|
from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
@@ -20,11 +21,12 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
|
20
21
|
# Import new utils
|
21
22
|
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
22
23
|
|
23
|
-
|
24
|
-
from natural_pdf.utils.tqdm_utils import get_tqdm
|
25
|
-
|
24
|
+
from tqdm.auto import tqdm
|
26
25
|
# --- End Classification Imports --- #
|
27
26
|
|
27
|
+
# --- Shape Detection Mixin --- #
|
28
|
+
from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
|
29
|
+
# --- End Shape Detection Mixin --- #
|
28
30
|
|
29
31
|
if TYPE_CHECKING:
|
30
32
|
# --- NEW: Add Image type hint for classification --- #
|
@@ -33,6 +35,7 @@ if TYPE_CHECKING:
|
|
33
35
|
from natural_pdf.core.page import Page
|
34
36
|
from natural_pdf.elements.collections import ElementCollection
|
35
37
|
from natural_pdf.elements.text import TextElement
|
38
|
+
from natural_pdf.elements.base import Element # Added for type hint
|
36
39
|
|
37
40
|
# Import OCRManager conditionally to avoid circular imports
|
38
41
|
try:
|
@@ -44,7 +47,7 @@ except ImportError:
|
|
44
47
|
logger = logging.getLogger(__name__)
|
45
48
|
|
46
49
|
|
47
|
-
class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
50
|
+
class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
48
51
|
"""
|
49
52
|
Represents a rectangular region on a page.
|
50
53
|
"""
|
@@ -103,7 +106,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
103
106
|
direction: str,
|
104
107
|
size: Optional[float] = None,
|
105
108
|
cross_size: str = "full",
|
106
|
-
|
109
|
+
include_source: bool = False,
|
107
110
|
until: Optional[str] = None,
|
108
111
|
include_endpoint: bool = True,
|
109
112
|
**kwargs,
|
@@ -115,7 +118,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
115
118
|
direction: 'left', 'right', 'above', or 'below'
|
116
119
|
size: Size in the primary direction (width for horizontal, height for vertical)
|
117
120
|
cross_size: Size in the cross direction ('full' or 'element')
|
118
|
-
|
121
|
+
include_source: Whether to include this region's area in the result
|
119
122
|
until: Optional selector string to specify a boundary element
|
120
123
|
include_endpoint: Whether to include the boundary element found by 'until'
|
121
124
|
**kwargs: Additional parameters for the 'until' selector search
|
@@ -129,7 +132,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
129
132
|
is_positive = direction in ("right", "below") # right/below are positive directions
|
130
133
|
pixel_offset = 1 # Offset for excluding elements/endpoints
|
131
134
|
|
132
|
-
# 1. Determine initial boundaries based on direction and
|
135
|
+
# 1. Determine initial boundaries based on direction and include_source
|
133
136
|
if is_horizontal:
|
134
137
|
# Initial cross-boundaries (vertical)
|
135
138
|
y0 = 0 if cross_size == "full" else self.top
|
@@ -137,11 +140,11 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
137
140
|
|
138
141
|
# Initial primary boundaries (horizontal)
|
139
142
|
if is_positive: # right
|
140
|
-
x0_initial = self.x0 if
|
143
|
+
x0_initial = self.x0 if include_source else self.x1 + pixel_offset
|
141
144
|
x1_initial = self.x1 # This edge moves
|
142
145
|
else: # left
|
143
146
|
x0_initial = self.x0 # This edge moves
|
144
|
-
x1_initial = self.x1 if
|
147
|
+
x1_initial = self.x1 if include_source else self.x0 - pixel_offset
|
145
148
|
else: # Vertical
|
146
149
|
# Initial cross-boundaries (horizontal)
|
147
150
|
x0 = 0 if cross_size == "full" else self.x0
|
@@ -149,11 +152,11 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
149
152
|
|
150
153
|
# Initial primary boundaries (vertical)
|
151
154
|
if is_positive: # below
|
152
|
-
y0_initial = self.top if
|
155
|
+
y0_initial = self.top if include_source else self.bottom + pixel_offset
|
153
156
|
y1_initial = self.bottom # This edge moves
|
154
157
|
else: # above
|
155
158
|
y0_initial = self.top # This edge moves
|
156
|
-
y1_initial = self.bottom if
|
159
|
+
y1_initial = self.bottom if include_source else self.top - pixel_offset
|
157
160
|
|
158
161
|
# 2. Calculate the final primary boundary, considering 'size' or page limits
|
159
162
|
if is_horizontal:
|
@@ -245,7 +248,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
245
248
|
# 5. Create and return Region
|
246
249
|
region = Region(self.page, final_bbox)
|
247
250
|
region.source_element = self
|
248
|
-
region.includes_source =
|
251
|
+
region.includes_source = include_source
|
249
252
|
# Optionally store the boundary element if found
|
250
253
|
if target:
|
251
254
|
region.boundary_element = target
|
@@ -256,7 +259,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
256
259
|
self,
|
257
260
|
height: Optional[float] = None,
|
258
261
|
width: str = "full",
|
259
|
-
|
262
|
+
include_source: bool = False,
|
260
263
|
until: Optional[str] = None,
|
261
264
|
include_endpoint: bool = True,
|
262
265
|
**kwargs,
|
@@ -267,7 +270,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
267
270
|
Args:
|
268
271
|
height: Height of the region above, in points
|
269
272
|
width: Width mode - "full" for full page width or "element" for element width
|
270
|
-
|
273
|
+
include_source: Whether to include this region in the result (default: False)
|
271
274
|
until: Optional selector string to specify an upper boundary element
|
272
275
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
273
276
|
**kwargs: Additional parameters
|
@@ -279,7 +282,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
279
282
|
direction="above",
|
280
283
|
size=height,
|
281
284
|
cross_size=width,
|
282
|
-
|
285
|
+
include_source=include_source,
|
283
286
|
until=until,
|
284
287
|
include_endpoint=include_endpoint,
|
285
288
|
**kwargs,
|
@@ -289,7 +292,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
289
292
|
self,
|
290
293
|
height: Optional[float] = None,
|
291
294
|
width: str = "full",
|
292
|
-
|
295
|
+
include_source: bool = False,
|
293
296
|
until: Optional[str] = None,
|
294
297
|
include_endpoint: bool = True,
|
295
298
|
**kwargs,
|
@@ -300,7 +303,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
300
303
|
Args:
|
301
304
|
height: Height of the region below, in points
|
302
305
|
width: Width mode - "full" for full page width or "element" for element width
|
303
|
-
|
306
|
+
include_source: Whether to include this region in the result (default: False)
|
304
307
|
until: Optional selector string to specify a lower boundary element
|
305
308
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
306
309
|
**kwargs: Additional parameters
|
@@ -312,7 +315,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
312
315
|
direction="below",
|
313
316
|
size=height,
|
314
317
|
cross_size=width,
|
315
|
-
|
318
|
+
include_source=include_source,
|
316
319
|
until=until,
|
317
320
|
include_endpoint=include_endpoint,
|
318
321
|
**kwargs,
|
@@ -322,7 +325,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
322
325
|
self,
|
323
326
|
width: Optional[float] = None,
|
324
327
|
height: str = "full",
|
325
|
-
|
328
|
+
include_source: bool = False,
|
326
329
|
until: Optional[str] = None,
|
327
330
|
include_endpoint: bool = True,
|
328
331
|
**kwargs,
|
@@ -333,7 +336,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
333
336
|
Args:
|
334
337
|
width: Width of the region to the left, in points
|
335
338
|
height: Height mode - "full" for full page height or "element" for element height
|
336
|
-
|
339
|
+
include_source: Whether to include this region in the result (default: False)
|
337
340
|
until: Optional selector string to specify a left boundary element
|
338
341
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
339
342
|
**kwargs: Additional parameters
|
@@ -345,7 +348,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
345
348
|
direction="left",
|
346
349
|
size=width,
|
347
350
|
cross_size=height,
|
348
|
-
|
351
|
+
include_source=include_source,
|
349
352
|
until=until,
|
350
353
|
include_endpoint=include_endpoint,
|
351
354
|
**kwargs,
|
@@ -355,7 +358,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
355
358
|
self,
|
356
359
|
width: Optional[float] = None,
|
357
360
|
height: str = "full",
|
358
|
-
|
361
|
+
include_source: bool = False,
|
359
362
|
until: Optional[str] = None,
|
360
363
|
include_endpoint: bool = True,
|
361
364
|
**kwargs,
|
@@ -366,7 +369,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
366
369
|
Args:
|
367
370
|
width: Width of the region to the right, in points
|
368
371
|
height: Height mode - "full" for full page height or "element" for element height
|
369
|
-
|
372
|
+
include_source: Whether to include this region in the result (default: False)
|
370
373
|
until: Optional selector string to specify a right boundary element
|
371
374
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
372
375
|
**kwargs: Additional parameters
|
@@ -378,7 +381,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
378
381
|
direction="right",
|
379
382
|
size=width,
|
380
383
|
cross_size=height,
|
381
|
-
|
384
|
+
include_source=include_source,
|
382
385
|
until=until,
|
383
386
|
include_endpoint=include_endpoint,
|
384
387
|
**kwargs,
|
@@ -720,14 +723,36 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
720
723
|
Returns:
|
721
724
|
PIL Image of just this region
|
722
725
|
"""
|
726
|
+
# Handle the case where user wants the cropped region to have a specific width
|
727
|
+
page_kwargs = kwargs.copy()
|
728
|
+
effective_resolution = resolution # Start with the provided resolution
|
729
|
+
|
730
|
+
if crop_only and 'width' in kwargs:
|
731
|
+
target_width = kwargs['width']
|
732
|
+
# Calculate what resolution is needed to make the region crop have target_width
|
733
|
+
region_width_points = self.width # Region width in PDF points
|
734
|
+
|
735
|
+
if region_width_points > 0:
|
736
|
+
# Calculate scale needed: target_width / region_width_points
|
737
|
+
required_scale = target_width / region_width_points
|
738
|
+
# Convert scale to resolution: scale * 72 DPI
|
739
|
+
effective_resolution = required_scale * 72.0
|
740
|
+
page_kwargs.pop('width') # Remove width parameter to avoid conflicts
|
741
|
+
logger.debug(f"Region {self.bbox}: Calculated required resolution {effective_resolution:.1f} DPI for region crop width {target_width}")
|
742
|
+
else:
|
743
|
+
logger.warning(f"Region {self.bbox}: Invalid region width {region_width_points}, using original resolution")
|
744
|
+
|
723
745
|
# First get the full page image with highlights if requested
|
724
746
|
page_image = self._page.to_image(
|
725
|
-
scale=scale, resolution=
|
747
|
+
scale=scale, resolution=effective_resolution, include_highlights=include_highlights, **page_kwargs
|
726
748
|
)
|
727
749
|
|
728
|
-
# Calculate the
|
729
|
-
|
730
|
-
|
750
|
+
# Calculate the actual scale factor used by the page image
|
751
|
+
if page_image.width > 0 and self._page.width > 0:
|
752
|
+
scale_factor = page_image.width / self._page.width
|
753
|
+
else:
|
754
|
+
# Fallback to resolution-based calculation if dimensions are invalid
|
755
|
+
scale_factor = resolution / 72.0
|
731
756
|
|
732
757
|
# Apply scaling to the coordinates
|
733
758
|
x0 = int(self.x0 * scale_factor)
|
@@ -874,6 +899,233 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
874
899
|
image.save(filename)
|
875
900
|
return self
|
876
901
|
|
902
|
+
def trim(self, padding: int = 1, threshold: float = 0.95, resolution: float = 150, pre_shrink: float = 0.5) -> "Region":
|
903
|
+
"""
|
904
|
+
Trim visual whitespace from the edges of this region.
|
905
|
+
|
906
|
+
Similar to Python's string .strip() method, but for visual whitespace in the region image.
|
907
|
+
Uses pixel analysis to detect rows/columns that are predominantly whitespace.
|
908
|
+
|
909
|
+
Args:
|
910
|
+
padding: Number of pixels to keep as padding after trimming (default: 1)
|
911
|
+
threshold: Threshold for considering a row/column as whitespace (0.0-1.0, default: 0.95)
|
912
|
+
Higher values mean more strict whitespace detection.
|
913
|
+
E.g., 0.95 means if 95% of pixels in a row/column are white, consider it whitespace.
|
914
|
+
resolution: Resolution for image rendering in DPI (default: 150)
|
915
|
+
pre_shrink: Amount to shrink region before trimming, then expand back after (default: 0.5)
|
916
|
+
This helps avoid detecting box borders/slivers as content.
|
917
|
+
|
918
|
+
Returns:
|
919
|
+
New Region with visual whitespace trimmed from all edges
|
920
|
+
|
921
|
+
Example:
|
922
|
+
# Basic trimming with 1 pixel padding and 0.5px pre-shrink
|
923
|
+
trimmed = region.trim()
|
924
|
+
|
925
|
+
# More aggressive trimming with no padding and no pre-shrink
|
926
|
+
tight = region.trim(padding=0, threshold=0.9, pre_shrink=0)
|
927
|
+
|
928
|
+
# Conservative trimming with more padding
|
929
|
+
loose = region.trim(padding=3, threshold=0.98)
|
930
|
+
"""
|
931
|
+
# Pre-shrink the region to avoid box slivers
|
932
|
+
work_region = self.expand(left=-pre_shrink, right=-pre_shrink, top=-pre_shrink, bottom=-pre_shrink) if pre_shrink > 0 else self
|
933
|
+
|
934
|
+
# Get the region image
|
935
|
+
image = work_region.to_image(resolution=resolution, crop_only=True, include_highlights=False)
|
936
|
+
|
937
|
+
if image is None:
|
938
|
+
logger.warning(f"Region {self.bbox}: Could not generate image for trimming. Returning original region.")
|
939
|
+
return self
|
940
|
+
|
941
|
+
# Convert to grayscale for easier analysis
|
942
|
+
import numpy as np
|
943
|
+
|
944
|
+
# Convert PIL image to numpy array
|
945
|
+
img_array = np.array(image.convert('L')) # Convert to grayscale
|
946
|
+
height, width = img_array.shape
|
947
|
+
|
948
|
+
if height == 0 or width == 0:
|
949
|
+
logger.warning(f"Region {self.bbox}: Image has zero dimensions. Returning original region.")
|
950
|
+
return self
|
951
|
+
|
952
|
+
# Normalize pixel values to 0-1 range (255 = white = 1.0, 0 = black = 0.0)
|
953
|
+
normalized = img_array.astype(np.float32) / 255.0
|
954
|
+
|
955
|
+
# Find content boundaries by analyzing row and column averages
|
956
|
+
|
957
|
+
# Analyze rows (horizontal strips) to find top and bottom boundaries
|
958
|
+
row_averages = np.mean(normalized, axis=1) # Average each row
|
959
|
+
content_rows = row_averages < threshold # True where there's content (not whitespace)
|
960
|
+
|
961
|
+
# Find first and last rows with content
|
962
|
+
content_row_indices = np.where(content_rows)[0]
|
963
|
+
if len(content_row_indices) == 0:
|
964
|
+
# No content found, return a minimal region at the center
|
965
|
+
logger.warning(f"Region {self.bbox}: No content detected during trimming. Returning center point.")
|
966
|
+
center_x = (self.x0 + self.x1) / 2
|
967
|
+
center_y = (self.top + self.bottom) / 2
|
968
|
+
return Region(self.page, (center_x, center_y, center_x, center_y))
|
969
|
+
|
970
|
+
top_content_row = max(0, content_row_indices[0] - padding)
|
971
|
+
bottom_content_row = min(height - 1, content_row_indices[-1] + padding)
|
972
|
+
|
973
|
+
# Analyze columns (vertical strips) to find left and right boundaries
|
974
|
+
col_averages = np.mean(normalized, axis=0) # Average each column
|
975
|
+
content_cols = col_averages < threshold # True where there's content
|
976
|
+
|
977
|
+
content_col_indices = np.where(content_cols)[0]
|
978
|
+
if len(content_col_indices) == 0:
|
979
|
+
# No content found in columns either
|
980
|
+
logger.warning(f"Region {self.bbox}: No column content detected during trimming. Returning center point.")
|
981
|
+
center_x = (self.x0 + self.x1) / 2
|
982
|
+
center_y = (self.top + self.bottom) / 2
|
983
|
+
return Region(self.page, (center_x, center_y, center_x, center_y))
|
984
|
+
|
985
|
+
left_content_col = max(0, content_col_indices[0] - padding)
|
986
|
+
right_content_col = min(width - 1, content_col_indices[-1] + padding)
|
987
|
+
|
988
|
+
# Convert trimmed pixel coordinates back to PDF coordinates
|
989
|
+
scale_factor = resolution / 72.0 # Scale factor used in to_image()
|
990
|
+
|
991
|
+
# Calculate new PDF coordinates and ensure they are Python floats
|
992
|
+
trimmed_x0 = float(work_region.x0 + (left_content_col / scale_factor))
|
993
|
+
trimmed_top = float(work_region.top + (top_content_row / scale_factor))
|
994
|
+
trimmed_x1 = float(work_region.x0 + ((right_content_col + 1) / scale_factor)) # +1 because we want inclusive right edge
|
995
|
+
trimmed_bottom = float(work_region.top + ((bottom_content_row + 1) / scale_factor)) # +1 because we want inclusive bottom edge
|
996
|
+
|
997
|
+
# Ensure the trimmed region doesn't exceed the work region boundaries
|
998
|
+
final_x0 = max(work_region.x0, trimmed_x0)
|
999
|
+
final_top = max(work_region.top, trimmed_top)
|
1000
|
+
final_x1 = min(work_region.x1, trimmed_x1)
|
1001
|
+
final_bottom = min(work_region.bottom, trimmed_bottom)
|
1002
|
+
|
1003
|
+
# Ensure valid coordinates (width > 0, height > 0)
|
1004
|
+
if final_x1 <= final_x0 or final_bottom <= final_top:
|
1005
|
+
logger.warning(f"Region {self.bbox}: Trimming resulted in invalid dimensions. Returning original region.")
|
1006
|
+
return self
|
1007
|
+
|
1008
|
+
# Create the trimmed region
|
1009
|
+
trimmed_region = Region(self.page, (final_x0, final_top, final_x1, final_bottom))
|
1010
|
+
|
1011
|
+
# Expand back by the pre_shrink amount to restore original positioning
|
1012
|
+
if pre_shrink > 0:
|
1013
|
+
trimmed_region = trimmed_region.expand(left=pre_shrink, right=pre_shrink, top=pre_shrink, bottom=pre_shrink)
|
1014
|
+
|
1015
|
+
# Copy relevant metadata
|
1016
|
+
trimmed_region.region_type = self.region_type
|
1017
|
+
trimmed_region.normalized_type = self.normalized_type
|
1018
|
+
trimmed_region.confidence = self.confidence
|
1019
|
+
trimmed_region.model = self.model
|
1020
|
+
trimmed_region.name = self.name
|
1021
|
+
trimmed_region.label = self.label
|
1022
|
+
trimmed_region.source = "trimmed" # Indicate this is a derived region
|
1023
|
+
trimmed_region.parent_region = self
|
1024
|
+
|
1025
|
+
logger.debug(f"Region {self.bbox}: Trimmed to {trimmed_region.bbox} (padding={padding}, threshold={threshold}, pre_shrink={pre_shrink})")
|
1026
|
+
return trimmed_region
|
1027
|
+
|
1028
|
+
def clip(
|
1029
|
+
self,
|
1030
|
+
obj: Optional[Any] = None,
|
1031
|
+
left: Optional[float] = None,
|
1032
|
+
top: Optional[float] = None,
|
1033
|
+
right: Optional[float] = None,
|
1034
|
+
bottom: Optional[float] = None,
|
1035
|
+
) -> "Region":
|
1036
|
+
"""
|
1037
|
+
Clip this region to specific bounds, either from another object with bbox or explicit coordinates.
|
1038
|
+
|
1039
|
+
The clipped region will be constrained to not exceed the specified boundaries.
|
1040
|
+
You can provide either an object with bounding box properties, specific coordinates, or both.
|
1041
|
+
When both are provided, explicit coordinates take precedence.
|
1042
|
+
|
1043
|
+
Args:
|
1044
|
+
obj: Optional object with bbox properties (Region, Element, TextElement, etc.)
|
1045
|
+
left: Optional left boundary (x0) to clip to
|
1046
|
+
top: Optional top boundary to clip to
|
1047
|
+
right: Optional right boundary (x1) to clip to
|
1048
|
+
bottom: Optional bottom boundary to clip to
|
1049
|
+
|
1050
|
+
Returns:
|
1051
|
+
New Region with bounds clipped to the specified constraints
|
1052
|
+
|
1053
|
+
Examples:
|
1054
|
+
# Clip to another region's bounds
|
1055
|
+
clipped = region.clip(container_region)
|
1056
|
+
|
1057
|
+
# Clip to any element's bounds
|
1058
|
+
clipped = region.clip(text_element)
|
1059
|
+
|
1060
|
+
# Clip to specific coordinates
|
1061
|
+
clipped = region.clip(left=100, right=400)
|
1062
|
+
|
1063
|
+
# Mix object bounds with specific overrides
|
1064
|
+
clipped = region.clip(obj=container, bottom=page.height/2)
|
1065
|
+
"""
|
1066
|
+
from natural_pdf.elements.base import extract_bbox
|
1067
|
+
|
1068
|
+
# Start with current region bounds
|
1069
|
+
clip_x0 = self.x0
|
1070
|
+
clip_top = self.top
|
1071
|
+
clip_x1 = self.x1
|
1072
|
+
clip_bottom = self.bottom
|
1073
|
+
|
1074
|
+
# Apply object constraints if provided
|
1075
|
+
if obj is not None:
|
1076
|
+
obj_bbox = extract_bbox(obj)
|
1077
|
+
if obj_bbox is not None:
|
1078
|
+
obj_x0, obj_top, obj_x1, obj_bottom = obj_bbox
|
1079
|
+
# Constrain to the intersection with the provided object
|
1080
|
+
clip_x0 = max(clip_x0, obj_x0)
|
1081
|
+
clip_top = max(clip_top, obj_top)
|
1082
|
+
clip_x1 = min(clip_x1, obj_x1)
|
1083
|
+
clip_bottom = min(clip_bottom, obj_bottom)
|
1084
|
+
else:
|
1085
|
+
logger.warning(
|
1086
|
+
f"Region {self.bbox}: Cannot extract bbox from clipping object {type(obj)}. "
|
1087
|
+
"Object must have bbox property or x0/top/x1/bottom attributes."
|
1088
|
+
)
|
1089
|
+
|
1090
|
+
# Apply explicit coordinate constraints (these take precedence)
|
1091
|
+
if left is not None:
|
1092
|
+
clip_x0 = max(clip_x0, left)
|
1093
|
+
if top is not None:
|
1094
|
+
clip_top = max(clip_top, top)
|
1095
|
+
if right is not None:
|
1096
|
+
clip_x1 = min(clip_x1, right)
|
1097
|
+
if bottom is not None:
|
1098
|
+
clip_bottom = min(clip_bottom, bottom)
|
1099
|
+
|
1100
|
+
# Ensure valid coordinates
|
1101
|
+
if clip_x1 <= clip_x0 or clip_bottom <= clip_top:
|
1102
|
+
logger.warning(
|
1103
|
+
f"Region {self.bbox}: Clipping resulted in invalid dimensions "
|
1104
|
+
f"({clip_x0}, {clip_top}, {clip_x1}, {clip_bottom}). Returning minimal region."
|
1105
|
+
)
|
1106
|
+
# Return a minimal region at the clip area's top-left
|
1107
|
+
return Region(self.page, (clip_x0, clip_top, clip_x0, clip_top))
|
1108
|
+
|
1109
|
+
# Create the clipped region
|
1110
|
+
clipped_region = Region(self.page, (clip_x0, clip_top, clip_x1, clip_bottom))
|
1111
|
+
|
1112
|
+
# Copy relevant metadata
|
1113
|
+
clipped_region.region_type = self.region_type
|
1114
|
+
clipped_region.normalized_type = self.normalized_type
|
1115
|
+
clipped_region.confidence = self.confidence
|
1116
|
+
clipped_region.model = self.model
|
1117
|
+
clipped_region.name = self.name
|
1118
|
+
clipped_region.label = self.label
|
1119
|
+
clipped_region.source = "clipped" # Indicate this is a derived region
|
1120
|
+
clipped_region.parent_region = self
|
1121
|
+
|
1122
|
+
logger.debug(
|
1123
|
+
f"Region {self.bbox}: Clipped to {clipped_region.bbox} "
|
1124
|
+
f"(constraints: obj={type(obj).__name__ if obj else None}, "
|
1125
|
+
f"left={left}, top={top}, right={right}, bottom={bottom})"
|
1126
|
+
)
|
1127
|
+
return clipped_region
|
1128
|
+
|
877
1129
|
def get_elements(
|
878
1130
|
self, selector: Optional[str] = None, apply_exclusions=True, **kwargs
|
879
1131
|
) -> List["Element"]:
|
@@ -1022,7 +1274,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1022
1274
|
if hasattr(self, "model") and self.model == "tatr" and self.region_type == "table":
|
1023
1275
|
effective_method = "tatr"
|
1024
1276
|
else:
|
1025
|
-
effective_method = "
|
1277
|
+
effective_method = "plumber"
|
1026
1278
|
|
1027
1279
|
logger.debug(f"Region {self.bbox}: Extracting table using method '{effective_method}'")
|
1028
1280
|
|
@@ -1045,6 +1297,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1045
1297
|
def _extract_table_plumber(self, table_settings: dict) -> List[List[str]]:
|
1046
1298
|
"""
|
1047
1299
|
Extract table using pdfplumber's table extraction.
|
1300
|
+
This method extracts the largest table within the region.
|
1048
1301
|
|
1049
1302
|
Args:
|
1050
1303
|
table_settings: Settings for pdfplumber table extraction
|
@@ -1055,12 +1308,12 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1055
1308
|
# Create a crop of the page for this region
|
1056
1309
|
cropped = self.page._page.crop(self.bbox)
|
1057
1310
|
|
1058
|
-
# Extract table from the cropped area
|
1059
|
-
|
1311
|
+
# Extract the single largest table from the cropped area
|
1312
|
+
table = cropped.extract_table(table_settings)
|
1060
1313
|
|
1061
|
-
# Return the
|
1062
|
-
if
|
1063
|
-
return
|
1314
|
+
# Return the table or an empty list if none found
|
1315
|
+
if table:
|
1316
|
+
return table
|
1064
1317
|
return []
|
1065
1318
|
|
1066
1319
|
def _extract_table_tatr(self, use_ocr=False, ocr_config=None) -> List[List[str]]:
|
@@ -1261,8 +1514,6 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1261
1514
|
unique_tops = cluster_coords(tops)
|
1262
1515
|
unique_lefts = cluster_coords(lefts)
|
1263
1516
|
|
1264
|
-
# --- Setup tqdm --- #
|
1265
|
-
tqdm = get_tqdm()
|
1266
1517
|
# Determine iterable for tqdm
|
1267
1518
|
cell_iterator = cell_dicts
|
1268
1519
|
if show_progress:
|
@@ -1777,7 +2028,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1777
2028
|
|
1778
2029
|
def get_sections(
|
1779
2030
|
self, start_elements=None, end_elements=None, boundary_inclusion="both"
|
1780
|
-
) ->
|
2031
|
+
) -> "ElementCollection[Region]":
|
1781
2032
|
"""
|
1782
2033
|
Get sections within this region based on start/end elements.
|
1783
2034
|
|
@@ -1897,7 +2148,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1897
2148
|
section = self.get_section_between(start_element, end_element, boundary_inclusion)
|
1898
2149
|
sections.append(section)
|
1899
2150
|
|
1900
|
-
return sections
|
2151
|
+
return ElementCollection(sections)
|
1901
2152
|
|
1902
2153
|
def create_cells(self):
|
1903
2154
|
"""
|
@@ -2413,3 +2664,94 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
2413
2664
|
return ElementCollection(cell_regions)
|
2414
2665
|
|
2415
2666
|
# --- END NEW METHOD ---
|
2667
|
+
|
2668
|
+
def to_text_element(
|
2669
|
+
self,
|
2670
|
+
text_content: Optional[Union[str, Callable[["Region"], Optional[str]]]] = None,
|
2671
|
+
source_label: str = "derived_from_region",
|
2672
|
+
object_type: str = "word", # Or "char", controls how it's categorized
|
2673
|
+
default_font_size: float = 10.0,
|
2674
|
+
default_font_name: str = "RegionContent",
|
2675
|
+
confidence: Optional[float] = None, # Allow overriding confidence
|
2676
|
+
add_to_page: bool = False # NEW: Option to add to page
|
2677
|
+
) -> "TextElement":
|
2678
|
+
"""
|
2679
|
+
Creates a new TextElement object based on this region's geometry.
|
2680
|
+
|
2681
|
+
The text for the new TextElement can be provided directly,
|
2682
|
+
generated by a callback function, or left as None.
|
2683
|
+
|
2684
|
+
Args:
|
2685
|
+
text_content:
|
2686
|
+
- If a string, this will be the text of the new TextElement.
|
2687
|
+
- If a callable, it will be called with this region instance
|
2688
|
+
and its return value (a string or None) will be the text.
|
2689
|
+
- If None (default), the TextElement's text will be None.
|
2690
|
+
source_label: The 'source' attribute for the new TextElement.
|
2691
|
+
object_type: The 'object_type' for the TextElement's data dict
|
2692
|
+
(e.g., "word", "char").
|
2693
|
+
default_font_size: Placeholder font size if text is generated.
|
2694
|
+
default_font_name: Placeholder font name if text is generated.
|
2695
|
+
confidence: Confidence score for the text. If text_content is None,
|
2696
|
+
defaults to 0.0. If text is provided/generated, defaults to 1.0
|
2697
|
+
unless specified.
|
2698
|
+
add_to_page: If True, the created TextElement will be added to the
|
2699
|
+
region's parent page. (Default: False)
|
2700
|
+
|
2701
|
+
Returns:
|
2702
|
+
A new TextElement instance.
|
2703
|
+
|
2704
|
+
Raises:
|
2705
|
+
ValueError: If the region does not have a valid 'page' attribute.
|
2706
|
+
"""
|
2707
|
+
actual_text: Optional[str] = None
|
2708
|
+
if isinstance(text_content, str):
|
2709
|
+
actual_text = text_content
|
2710
|
+
elif callable(text_content):
|
2711
|
+
try:
|
2712
|
+
actual_text = text_content(self)
|
2713
|
+
except Exception as e:
|
2714
|
+
logger.error(f"Error executing text_content callback for region {self.bbox}: {e}", exc_info=True)
|
2715
|
+
actual_text = None # Ensure actual_text is None on error
|
2716
|
+
|
2717
|
+
final_confidence = confidence
|
2718
|
+
if final_confidence is None:
|
2719
|
+
final_confidence = 1.0 if actual_text is not None and actual_text.strip() else 0.0
|
2720
|
+
|
2721
|
+
if not hasattr(self, 'page') or self.page is None:
|
2722
|
+
raise ValueError("Region must have a valid 'page' attribute to create a TextElement.")
|
2723
|
+
|
2724
|
+
elem_data = {
|
2725
|
+
"text": actual_text,
|
2726
|
+
"x0": self.x0,
|
2727
|
+
"top": self.top,
|
2728
|
+
"x1": self.x1,
|
2729
|
+
"bottom": self.bottom,
|
2730
|
+
"width": self.width,
|
2731
|
+
"height": self.height,
|
2732
|
+
"object_type": object_type,
|
2733
|
+
"page_number": self.page.page_number,
|
2734
|
+
"stroking_color": getattr(self, 'stroking_color', (0,0,0)),
|
2735
|
+
"non_stroking_color": getattr(self, 'non_stroking_color', (0,0,0)),
|
2736
|
+
"fontname": default_font_name,
|
2737
|
+
"size": default_font_size,
|
2738
|
+
"upright": True,
|
2739
|
+
"direction": 1,
|
2740
|
+
"adv": self.width,
|
2741
|
+
"source": source_label,
|
2742
|
+
"confidence": final_confidence,
|
2743
|
+
"_char_dicts": []
|
2744
|
+
}
|
2745
|
+
text_element = TextElement(elem_data, self.page)
|
2746
|
+
|
2747
|
+
if add_to_page:
|
2748
|
+
if hasattr(self.page, '_element_mgr') and self.page._element_mgr is not None:
|
2749
|
+
add_as_type = "words" if object_type == "word" else "chars" if object_type == "char" else object_type
|
2750
|
+
# REMOVED try-except block around add_element
|
2751
|
+
self.page._element_mgr.add_element(text_element, element_type=add_as_type)
|
2752
|
+
logger.debug(f"TextElement created from region {self.bbox} and added to page {self.page.page_number} as {add_as_type}.")
|
2753
|
+
else:
|
2754
|
+
page_num_str = str(self.page.page_number) if hasattr(self.page, 'page_number') else 'N/A'
|
2755
|
+
logger.warning(f"Cannot add TextElement to page: Page {page_num_str} for region {self.bbox} is missing '_element_mgr'.")
|
2756
|
+
|
2757
|
+
return text_element
|