natural-pdf 0.1.22__py3-none-any.whl → 0.1.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/shape_detection_mixin.py +43 -3
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/classification/mixin.py +35 -14
- natural_pdf/classification/results.py +16 -1
- natural_pdf/cli.py +1 -0
- natural_pdf/core/highlighting_service.py +23 -0
- natural_pdf/core/page.py +32 -2
- natural_pdf/core/pdf.py +24 -4
- natural_pdf/describe/base.py +11 -1
- natural_pdf/describe/summary.py +26 -0
- natural_pdf/elements/base.py +81 -3
- natural_pdf/elements/collections.py +162 -101
- natural_pdf/elements/region.py +187 -160
- natural_pdf/elements/text.py +15 -7
- natural_pdf/exporters/paddleocr.py +1 -1
- natural_pdf/extraction/manager.py +2 -2
- natural_pdf/extraction/mixin.py +295 -11
- natural_pdf/extraction/result.py +28 -1
- natural_pdf/flows/region.py +117 -2
- natural_pdf/ocr/engine_surya.py +25 -5
- natural_pdf/qa/__init__.py +2 -1
- natural_pdf/qa/document_qa.py +166 -113
- natural_pdf/qa/qa_result.py +55 -0
- natural_pdf/selectors/parser.py +22 -0
- natural_pdf/utils/text_extraction.py +34 -14
- {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/METADATA +22 -13
- {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/RECORD +31 -30
- {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/top_level.txt +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -82,7 +82,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
82
82
|
self.end_element = None
|
83
83
|
|
84
84
|
self.metadata: Dict[str, Any] = {}
|
85
|
-
self.
|
85
|
+
# Analysis results live under self.metadata['analysis'] via property
|
86
86
|
|
87
87
|
# Standard attributes for all elements
|
88
88
|
self.object_type = "region" # For selector compatibility
|
@@ -115,146 +115,28 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
115
115
|
**kwargs,
|
116
116
|
) -> "Region":
|
117
117
|
"""
|
118
|
-
|
118
|
+
Region-specific wrapper around :py:meth:`DirectionalMixin._direction`.
|
119
119
|
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
until: Optional selector string to specify a boundary element
|
126
|
-
include_endpoint: Whether to include the boundary element found by 'until'
|
127
|
-
**kwargs: Additional parameters for the 'until' selector search
|
128
|
-
|
129
|
-
Returns:
|
130
|
-
Region object
|
131
|
-
"""
|
132
|
-
import math # Use math.inf for infinity
|
133
|
-
|
134
|
-
is_horizontal = direction in ("left", "right")
|
135
|
-
is_positive = direction in ("right", "below") # right/below are positive directions
|
136
|
-
pixel_offset = 1 # Offset for excluding elements/endpoints
|
137
|
-
|
138
|
-
# 1. Determine initial boundaries based on direction and include_source
|
139
|
-
if is_horizontal:
|
140
|
-
# Initial cross-boundaries (vertical)
|
141
|
-
y0 = 0 if cross_size == "full" else self.top
|
142
|
-
y1 = self.page.height if cross_size == "full" else self.bottom
|
143
|
-
|
144
|
-
# Initial primary boundaries (horizontal)
|
145
|
-
if is_positive: # right
|
146
|
-
x0_initial = self.x0 if include_source else self.x1 + pixel_offset
|
147
|
-
x1_initial = self.x1 # This edge moves
|
148
|
-
else: # left
|
149
|
-
x0_initial = self.x0 # This edge moves
|
150
|
-
x1_initial = self.x1 if include_source else self.x0 - pixel_offset
|
151
|
-
else: # Vertical
|
152
|
-
# Initial cross-boundaries (horizontal)
|
153
|
-
x0 = 0 if cross_size == "full" else self.x0
|
154
|
-
x1 = self.page.width if cross_size == "full" else self.x1
|
155
|
-
|
156
|
-
# Initial primary boundaries (vertical)
|
157
|
-
if is_positive: # below
|
158
|
-
y0_initial = self.top if include_source else self.bottom + pixel_offset
|
159
|
-
y1_initial = self.bottom # This edge moves
|
160
|
-
else: # above
|
161
|
-
y0_initial = self.top # This edge moves
|
162
|
-
y1_initial = self.bottom if include_source else self.top - pixel_offset
|
163
|
-
|
164
|
-
# 2. Calculate the final primary boundary, considering 'size' or page limits
|
165
|
-
if is_horizontal:
|
166
|
-
if is_positive: # right
|
167
|
-
x1_final = min(
|
168
|
-
self.page.width,
|
169
|
-
x1_initial + (size if size is not None else (self.page.width - x1_initial)),
|
170
|
-
)
|
171
|
-
x0_final = x0_initial
|
172
|
-
else: # left
|
173
|
-
x0_final = max(0, x0_initial - (size if size is not None else x0_initial))
|
174
|
-
x1_final = x1_initial
|
175
|
-
else: # Vertical
|
176
|
-
if is_positive: # below
|
177
|
-
y1_final = min(
|
178
|
-
self.page.height,
|
179
|
-
y1_initial + (size if size is not None else (self.page.height - y1_initial)),
|
180
|
-
)
|
181
|
-
y0_final = y0_initial
|
182
|
-
else: # above
|
183
|
-
y0_final = max(0, y0_initial - (size if size is not None else y0_initial))
|
184
|
-
y1_final = y1_initial
|
185
|
-
|
186
|
-
# 3. Handle 'until' selector if provided
|
187
|
-
target = None
|
188
|
-
if until:
|
189
|
-
all_matches = self.page.find_all(until, **kwargs)
|
190
|
-
matches_in_direction = []
|
191
|
-
|
192
|
-
# Filter and sort matches based on direction
|
193
|
-
if direction == "above":
|
194
|
-
matches_in_direction = [m for m in all_matches if m.bottom <= self.top]
|
195
|
-
matches_in_direction.sort(key=lambda e: e.bottom, reverse=True)
|
196
|
-
elif direction == "below":
|
197
|
-
matches_in_direction = [m for m in all_matches if m.top >= self.bottom]
|
198
|
-
matches_in_direction.sort(key=lambda e: e.top)
|
199
|
-
elif direction == "left":
|
200
|
-
matches_in_direction = [m for m in all_matches if m.x1 <= self.x0]
|
201
|
-
matches_in_direction.sort(key=lambda e: e.x1, reverse=True)
|
202
|
-
elif direction == "right":
|
203
|
-
matches_in_direction = [m for m in all_matches if m.x0 >= self.x1]
|
204
|
-
matches_in_direction.sort(key=lambda e: e.x0)
|
205
|
-
|
206
|
-
if matches_in_direction:
|
207
|
-
target = matches_in_direction[0]
|
208
|
-
|
209
|
-
# Adjust the primary boundary based on the target
|
210
|
-
if is_horizontal:
|
211
|
-
if is_positive: # right
|
212
|
-
x1_final = target.x1 if include_endpoint else target.x0 - pixel_offset
|
213
|
-
else: # left
|
214
|
-
x0_final = target.x0 if include_endpoint else target.x1 + pixel_offset
|
215
|
-
else: # Vertical
|
216
|
-
if is_positive: # below
|
217
|
-
y1_final = target.bottom if include_endpoint else target.top - pixel_offset
|
218
|
-
else: # above
|
219
|
-
y0_final = target.top if include_endpoint else target.bottom + pixel_offset
|
220
|
-
|
221
|
-
# Adjust cross boundaries if cross_size is 'element'
|
222
|
-
if cross_size == "element":
|
223
|
-
if is_horizontal: # Adjust y0, y1
|
224
|
-
target_y0 = (
|
225
|
-
target.top if include_endpoint else target.bottom
|
226
|
-
) # Use opposite boundary if excluding
|
227
|
-
target_y1 = target.bottom if include_endpoint else target.top
|
228
|
-
y0 = min(y0, target_y0)
|
229
|
-
y1 = max(y1, target_y1)
|
230
|
-
else: # Adjust x0, x1
|
231
|
-
target_x0 = (
|
232
|
-
target.x0 if include_endpoint else target.x1
|
233
|
-
) # Use opposite boundary if excluding
|
234
|
-
target_x1 = target.x1 if include_endpoint else target.x0
|
235
|
-
x0 = min(x0, target_x0)
|
236
|
-
x1 = max(x1, target_x1)
|
237
|
-
|
238
|
-
# 4. Finalize bbox coordinates
|
239
|
-
if is_horizontal:
|
240
|
-
bbox = (x0_final, y0, x1_final, y1)
|
241
|
-
else:
|
242
|
-
bbox = (x0, y0_final, x1, y1_final)
|
120
|
+
It performs any pre-processing required by *Region* (none currently),
|
121
|
+
delegates the core geometry work to the mix-in implementation via
|
122
|
+
``super()``, then attaches region-level metadata before returning the
|
123
|
+
new :class:`Region` instance.
|
124
|
+
"""
|
243
125
|
|
244
|
-
#
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
126
|
+
# Delegate to the shared implementation on DirectionalMixin
|
127
|
+
region = super()._direction(
|
128
|
+
direction=direction,
|
129
|
+
size=size,
|
130
|
+
cross_size=cross_size,
|
131
|
+
include_source=include_source,
|
132
|
+
until=until,
|
133
|
+
include_endpoint=include_endpoint,
|
134
|
+
**kwargs,
|
135
|
+
)
|
250
136
|
|
251
|
-
#
|
252
|
-
region = Region(self.page, final_bbox)
|
137
|
+
# Post-process: make sure callers can trace lineage and flags
|
253
138
|
region.source_element = self
|
254
139
|
region.includes_source = include_source
|
255
|
-
# Optionally store the boundary element if found
|
256
|
-
if target:
|
257
|
-
region.boundary_element = target
|
258
140
|
|
259
141
|
return region
|
260
142
|
|
@@ -710,7 +592,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
710
592
|
self,
|
711
593
|
scale: float = 2.0,
|
712
594
|
resolution: float = 150,
|
713
|
-
|
595
|
+
crop: bool = False,
|
714
596
|
include_highlights: bool = True,
|
715
597
|
**kwargs,
|
716
598
|
) -> "Image.Image":
|
@@ -719,7 +601,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
719
601
|
|
720
602
|
Args:
|
721
603
|
resolution: Resolution in DPI for rendering (default: 150)
|
722
|
-
|
604
|
+
crop: If True, only crop the region without highlighting its boundaries
|
723
605
|
include_highlights: Whether to include existing highlights (default: True)
|
724
606
|
**kwargs: Additional parameters for page.to_image()
|
725
607
|
|
@@ -730,7 +612,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
730
612
|
page_kwargs = kwargs.copy()
|
731
613
|
effective_resolution = resolution # Start with the provided resolution
|
732
614
|
|
733
|
-
if
|
615
|
+
if crop and "width" in kwargs:
|
734
616
|
target_width = kwargs["width"]
|
735
617
|
# Calculate what resolution is needed to make the region crop have target_width
|
736
618
|
region_width_points = self.width # Region width in PDF points
|
@@ -785,8 +667,8 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
785
667
|
# Crop the image to just this region
|
786
668
|
region_image = page_image.crop((x0, top, x1, bottom))
|
787
669
|
|
788
|
-
# If not
|
789
|
-
if not
|
670
|
+
# If not crop, add a border to highlight the region boundaries
|
671
|
+
if not crop:
|
790
672
|
from PIL import ImageDraw
|
791
673
|
|
792
674
|
# Create a 1px border around the region
|
@@ -808,6 +690,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
808
690
|
color: Optional[Union[Tuple, str]] = "blue",
|
809
691
|
label: Optional[str] = None,
|
810
692
|
width: Optional[int] = None, # Add width parameter
|
693
|
+
crop: bool = False, # NEW: Crop output to region bounds before legend
|
811
694
|
) -> "Image.Image":
|
812
695
|
"""
|
813
696
|
Show the page with just this region highlighted temporarily.
|
@@ -819,6 +702,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
819
702
|
color: Color to highlight this region (default: blue)
|
820
703
|
label: Optional label for this region in the legend
|
821
704
|
width: Optional width for the output image in pixels
|
705
|
+
crop: If True, crop the rendered image to this region's
|
706
|
+
bounding box (with a small margin handled inside
|
707
|
+
HighlightingService) before legends/overlays are added.
|
822
708
|
|
823
709
|
Returns:
|
824
710
|
PIL Image of the page with only this region highlighted
|
@@ -844,6 +730,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
844
730
|
"use_color_cycling": False, # Explicitly false for single preview
|
845
731
|
}
|
846
732
|
|
733
|
+
# Determine crop bbox if requested
|
734
|
+
crop_bbox = self.bbox if crop else None
|
735
|
+
|
847
736
|
# Use render_preview to show only this highlight
|
848
737
|
return service.render_preview(
|
849
738
|
page_index=self._page.index,
|
@@ -852,6 +741,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
852
741
|
width=width, # Pass the width parameter
|
853
742
|
labels=labels,
|
854
743
|
legend_position=legend_position,
|
744
|
+
crop_bbox=crop_bbox,
|
855
745
|
)
|
856
746
|
|
857
747
|
def save(
|
@@ -880,7 +770,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
880
770
|
self,
|
881
771
|
filename: str,
|
882
772
|
resolution: float = 150,
|
883
|
-
|
773
|
+
crop: bool = False,
|
884
774
|
include_highlights: bool = True,
|
885
775
|
**kwargs,
|
886
776
|
) -> "Region":
|
@@ -890,7 +780,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
890
780
|
Args:
|
891
781
|
filename: Path to save the image to
|
892
782
|
resolution: Resolution in DPI for rendering (default: 150)
|
893
|
-
|
783
|
+
crop: If True, only crop the region without highlighting its boundaries
|
894
784
|
include_highlights: Whether to include existing highlights (default: True)
|
895
785
|
**kwargs: Additional parameters for page.to_image()
|
896
786
|
|
@@ -900,7 +790,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
900
790
|
# Get the region image
|
901
791
|
image = self.to_image(
|
902
792
|
resolution=resolution,
|
903
|
-
|
793
|
+
crop=crop,
|
904
794
|
include_highlights=include_highlights,
|
905
795
|
**kwargs,
|
906
796
|
)
|
@@ -953,7 +843,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
953
843
|
|
954
844
|
# Get the region image
|
955
845
|
image = work_region.to_image(
|
956
|
-
resolution=resolution,
|
846
|
+
resolution=resolution, crop=True, include_highlights=False
|
957
847
|
)
|
958
848
|
|
959
849
|
if image is None:
|
@@ -1320,6 +1210,24 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
1320
1210
|
# Try lattice first, then fall back to stream if no meaningful results
|
1321
1211
|
logger.debug(f"Region {self.bbox}: Auto-detecting table extraction method...")
|
1322
1212
|
|
1213
|
+
# --- NEW: Prefer already-created table_cell regions if they exist --- #
|
1214
|
+
try:
|
1215
|
+
cell_regions_in_table = [
|
1216
|
+
c
|
1217
|
+
for c in self.page.find_all("region[type=table_cell]", apply_exclusions=False)
|
1218
|
+
if self.intersects(c)
|
1219
|
+
]
|
1220
|
+
except Exception as _cells_err:
|
1221
|
+
cell_regions_in_table = [] # Fallback silently
|
1222
|
+
|
1223
|
+
if cell_regions_in_table:
|
1224
|
+
logger.debug(
|
1225
|
+
f"Region {self.bbox}: Found {len(cell_regions_in_table)} pre-computed table_cell regions – using 'cells' method."
|
1226
|
+
)
|
1227
|
+
return self._extract_table_from_cells(cell_regions_in_table)
|
1228
|
+
|
1229
|
+
# --------------------------------------------------------------- #
|
1230
|
+
|
1323
1231
|
try:
|
1324
1232
|
logger.debug(f"Region {self.bbox}: Trying 'lattice' method first...")
|
1325
1233
|
lattice_result = self.extract_table(
|
@@ -2015,19 +1923,19 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2015
1923
|
logger.info(
|
2016
1924
|
f"Region {self.bbox}: Removing existing OCR elements before applying new OCR."
|
2017
1925
|
)
|
2018
|
-
# Find all OCR elements in this region
|
2019
|
-
ocr_selector = "text[source=ocr]"
|
2020
|
-
ocr_elements = self.find_all(ocr_selector)
|
2021
1926
|
|
1927
|
+
# Remove existing OCR word elements strictly inside this region
|
1928
|
+
ocr_selector = "text[source=ocr]"
|
1929
|
+
ocr_elements = self.find_all(ocr_selector, apply_exclusions=False)
|
2022
1930
|
if ocr_elements:
|
1931
|
+
removed_count = ocr_elements.remove()
|
2023
1932
|
logger.info(
|
2024
|
-
f"Region {self.bbox}:
|
1933
|
+
f"Region {self.bbox}: Removed {removed_count} existing OCR word elements in region before re-applying OCR."
|
2025
1934
|
)
|
2026
|
-
# Remove these elements from their page
|
2027
|
-
removed_count = ocr_elements.remove()
|
2028
|
-
logger.info(f"Region {self.bbox}: Removed {removed_count} OCR elements.")
|
2029
1935
|
else:
|
2030
|
-
logger.info(
|
1936
|
+
logger.info(
|
1937
|
+
f"Region {self.bbox}: No existing OCR word elements found within region to remove."
|
1938
|
+
)
|
2031
1939
|
|
2032
1940
|
ocr_mgr = self.page._parent._ocr_manager
|
2033
1941
|
|
@@ -2044,7 +1952,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2044
1952
|
# Render the page region to an image using the determined resolution
|
2045
1953
|
try:
|
2046
1954
|
region_image = self.to_image(
|
2047
|
-
resolution=final_resolution, include_highlights=False,
|
1955
|
+
resolution=final_resolution, include_highlights=False, crop=True
|
2048
1956
|
)
|
2049
1957
|
if not region_image:
|
2050
1958
|
logger.error("Failed to render region to image for OCR.")
|
@@ -2088,8 +1996,17 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2088
1996
|
page_top = self.top + (img_top * scale_y)
|
2089
1997
|
page_x1 = self.x0 + (img_x1 * scale_x)
|
2090
1998
|
page_bottom = self.top + (img_bottom * scale_y)
|
1999
|
+
raw_conf = result.get("confidence")
|
2000
|
+
# Convert confidence to float unless it is None/invalid
|
2001
|
+
try:
|
2002
|
+
confidence_val = float(raw_conf) if raw_conf is not None else None
|
2003
|
+
except (TypeError, ValueError):
|
2004
|
+
confidence_val = None
|
2005
|
+
|
2006
|
+
text_val = result.get("text") # May legitimately be None in detect_only mode
|
2007
|
+
|
2091
2008
|
element_data = {
|
2092
|
-
"text":
|
2009
|
+
"text": text_val,
|
2093
2010
|
"x0": page_x0,
|
2094
2011
|
"top": page_top,
|
2095
2012
|
"x1": page_x1,
|
@@ -2098,7 +2015,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2098
2015
|
"height": page_bottom - page_top,
|
2099
2016
|
"object_type": "word",
|
2100
2017
|
"source": "ocr",
|
2101
|
-
"confidence":
|
2018
|
+
"confidence": confidence_val,
|
2102
2019
|
"fontname": "OCR",
|
2103
2020
|
"size": round(pdf_height) if pdf_height > 0 else 10.0,
|
2104
2021
|
"page_number": self.page.number,
|
@@ -2434,12 +2351,12 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2434
2351
|
|
2435
2352
|
def ask(
|
2436
2353
|
self,
|
2437
|
-
question: str,
|
2354
|
+
question: Union[str, List[str], Tuple[str, ...]],
|
2438
2355
|
min_confidence: float = 0.1,
|
2439
2356
|
model: str = None,
|
2440
2357
|
debug: bool = False,
|
2441
2358
|
**kwargs,
|
2442
|
-
) -> Dict[str, Any]:
|
2359
|
+
) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
|
2443
2360
|
"""
|
2444
2361
|
Ask a question about the region content using document QA.
|
2445
2362
|
|
@@ -2466,7 +2383,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2466
2383
|
from natural_pdf.qa.document_qa import get_qa_engine
|
2467
2384
|
except ImportError:
|
2468
2385
|
logger.error(
|
2469
|
-
"Question answering requires optional dependencies. Install with `pip install natural-pdf[
|
2386
|
+
"Question answering requires optional dependencies. Install with `pip install natural-pdf[ai]`"
|
2470
2387
|
)
|
2471
2388
|
return {
|
2472
2389
|
"answer": None,
|
@@ -2684,7 +2601,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2684
2601
|
img = self.to_image(
|
2685
2602
|
resolution=resolution,
|
2686
2603
|
include_highlights=False, # No highlights for classification input
|
2687
|
-
|
2604
|
+
crop=True, # Just the region content
|
2688
2605
|
)
|
2689
2606
|
if img is None:
|
2690
2607
|
raise ValueError(
|
@@ -2964,4 +2881,114 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2964
2881
|
|
2965
2882
|
return text_element
|
2966
2883
|
|
2884
|
+
# ------------------------------------------------------------------
|
2885
|
+
# Unified analysis storage (maps to metadata["analysis"])
|
2886
|
+
# ------------------------------------------------------------------
|
2887
|
+
|
2888
|
+
@property
|
2889
|
+
def analyses(self) -> Dict[str, Any]:
|
2890
|
+
if not hasattr(self, "metadata") or self.metadata is None:
|
2891
|
+
self.metadata = {}
|
2892
|
+
return self.metadata.setdefault("analysis", {})
|
2893
|
+
|
2894
|
+
@analyses.setter
|
2895
|
+
def analyses(self, value: Dict[str, Any]):
|
2896
|
+
if not hasattr(self, "metadata") or self.metadata is None:
|
2897
|
+
self.metadata = {}
|
2898
|
+
self.metadata["analysis"] = value
|
2899
|
+
|
2900
|
+
# ------------------------------------------------------------------
|
2901
|
+
# New helper: build table from pre-computed table_cell regions
|
2902
|
+
# ------------------------------------------------------------------
|
2903
|
+
|
2904
|
+
def _extract_table_from_cells(self, cell_regions: List["Region"]) -> List[List[Optional[str]]]:
|
2905
|
+
"""Construct a table (list-of-lists) from table_cell regions.
|
2906
|
+
|
2907
|
+
This assumes each cell Region has metadata.row_index / col_index as written by
|
2908
|
+
detect_table_structure_from_lines(). If these keys are missing we will
|
2909
|
+
fall back to sorting by geometry.
|
2910
|
+
"""
|
2911
|
+
if not cell_regions:
|
2912
|
+
return []
|
2913
|
+
|
2914
|
+
# Attempt to use explicit indices first
|
2915
|
+
all_row_idxs = []
|
2916
|
+
all_col_idxs = []
|
2917
|
+
for cell in cell_regions:
|
2918
|
+
try:
|
2919
|
+
r_idx = int(cell.metadata.get("row_index"))
|
2920
|
+
c_idx = int(cell.metadata.get("col_index"))
|
2921
|
+
all_row_idxs.append(r_idx)
|
2922
|
+
all_col_idxs.append(c_idx)
|
2923
|
+
except Exception:
|
2924
|
+
# Not all cells have indices – clear the lists so we switch to geometric sorting
|
2925
|
+
all_row_idxs = []
|
2926
|
+
all_col_idxs = []
|
2927
|
+
break
|
2928
|
+
|
2929
|
+
if all_row_idxs and all_col_idxs:
|
2930
|
+
num_rows = max(all_row_idxs) + 1
|
2931
|
+
num_cols = max(all_col_idxs) + 1
|
2932
|
+
|
2933
|
+
# Initialise blank grid
|
2934
|
+
table_grid: List[List[Optional[str]]] = [[None] * num_cols for _ in range(num_rows)]
|
2935
|
+
|
2936
|
+
for cell in cell_regions:
|
2937
|
+
try:
|
2938
|
+
r_idx = int(cell.metadata.get("row_index"))
|
2939
|
+
c_idx = int(cell.metadata.get("col_index"))
|
2940
|
+
text_val = cell.extract_text(layout=False, apply_exclusions=False).strip()
|
2941
|
+
table_grid[r_idx][c_idx] = text_val if text_val else None
|
2942
|
+
except Exception as _err:
|
2943
|
+
# Skip problematic cell
|
2944
|
+
continue
|
2945
|
+
|
2946
|
+
return table_grid
|
2947
|
+
|
2948
|
+
# ------------------------------------------------------------------
|
2949
|
+
# Fallback: derive order purely from geometry if indices are absent
|
2950
|
+
# ------------------------------------------------------------------
|
2951
|
+
# Sort unique centers to define ordering
|
2952
|
+
try:
|
2953
|
+
import numpy as np
|
2954
|
+
except ImportError:
|
2955
|
+
logger.warning("NumPy required for geometric cell ordering; returning empty result.")
|
2956
|
+
return []
|
2957
|
+
|
2958
|
+
# Build arrays of centers
|
2959
|
+
centers = np.array([
|
2960
|
+
[(c.x0 + c.x1) / 2.0, (c.top + c.bottom) / 2.0] for c in cell_regions
|
2961
|
+
])
|
2962
|
+
xs = centers[:, 0]
|
2963
|
+
ys = centers[:, 1]
|
2964
|
+
|
2965
|
+
# Cluster unique row Y positions and column X positions with a tolerance
|
2966
|
+
def _cluster(vals, tol=1.0):
|
2967
|
+
sorted_vals = np.sort(vals)
|
2968
|
+
groups = [[sorted_vals[0]]]
|
2969
|
+
for v in sorted_vals[1:]:
|
2970
|
+
if abs(v - groups[-1][-1]) <= tol:
|
2971
|
+
groups[-1].append(v)
|
2972
|
+
else:
|
2973
|
+
groups.append([v])
|
2974
|
+
return [np.mean(g) for g in groups]
|
2975
|
+
|
2976
|
+
row_centers = _cluster(ys)
|
2977
|
+
col_centers = _cluster(xs)
|
2978
|
+
|
2979
|
+
num_rows = len(row_centers)
|
2980
|
+
num_cols = len(col_centers)
|
2981
|
+
|
2982
|
+
table_grid: List[List[Optional[str]]] = [[None] * num_cols for _ in range(num_rows)]
|
2983
|
+
|
2984
|
+
# Assign each cell to nearest row & col center
|
2985
|
+
for cell, (cx, cy) in zip(cell_regions, centers):
|
2986
|
+
row_idx = int(np.argmin([abs(cy - rc) for rc in row_centers]))
|
2987
|
+
col_idx = int(np.argmin([abs(cx - cc) for cc in col_centers]))
|
2988
|
+
|
2989
|
+
text_val = cell.extract_text(layout=False, apply_exclusions=False).strip()
|
2990
|
+
table_grid[row_idx][col_idx] = text_val if text_val else None
|
2991
|
+
|
2992
|
+
return table_grid
|
2993
|
+
|
2967
2994
|
|
natural_pdf/elements/text.py
CHANGED
@@ -151,20 +151,28 @@ class TextElement(Element):
|
|
151
151
|
# Default to black
|
152
152
|
return (0, 0, 0)
|
153
153
|
|
154
|
-
def extract_text(self, keep_blank_chars=True, **kwargs) -> str:
|
154
|
+
def extract_text(self, keep_blank_chars=True, strip: Optional[bool] = True, **kwargs) -> str:
|
155
155
|
"""
|
156
156
|
Extract text from this element.
|
157
157
|
|
158
158
|
Args:
|
159
|
-
keep_blank_chars:
|
160
|
-
|
159
|
+
keep_blank_chars: Retained for API compatibility (unused).
|
160
|
+
strip: If True (default) remove leading/trailing whitespace. Users may
|
161
|
+
pass ``strip=False`` to preserve whitespace exactly as stored.
|
162
|
+
**kwargs: Accepted for forward-compatibility and ignored here.
|
161
163
|
|
162
164
|
Returns:
|
163
|
-
|
165
|
+
The text content, optionally stripped.
|
164
166
|
"""
|
165
|
-
#
|
166
|
-
|
167
|
-
|
167
|
+
# Basic retrieval
|
168
|
+
result = self.text or ""
|
169
|
+
|
170
|
+
# Apply optional stripping – align with global convention where simple
|
171
|
+
# element extraction is stripped by default.
|
172
|
+
if strip:
|
173
|
+
result = result.strip()
|
174
|
+
|
175
|
+
return result
|
168
176
|
|
169
177
|
def contains(self, substring: str, case_sensitive: bool = True) -> bool:
|
170
178
|
"""
|
@@ -217,7 +217,7 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
|
|
217
217
|
# Expand region, render, and save image
|
218
218
|
region = element.expand(self.padding)
|
219
219
|
img = region.to_image(
|
220
|
-
resolution=self.resolution,
|
220
|
+
resolution=self.resolution, crop=True, include_highlights=False
|
221
221
|
)
|
222
222
|
img.save(absolute_image_path, "PNG")
|
223
223
|
|
@@ -126,10 +126,10 @@ class StructuredDataManager:
|
|
126
126
|
)
|
127
127
|
parsed_data = completion.choices[0].message.parsed
|
128
128
|
return StructuredDataResult(
|
129
|
-
data=parsed_data, success=True, error_message=None,
|
129
|
+
data=parsed_data, success=True, error_message=None, model_used=selected_model
|
130
130
|
)
|
131
131
|
except Exception as e:
|
132
132
|
logger.error(f"Extraction failed: {str(e)}")
|
133
133
|
return StructuredDataResult(
|
134
|
-
data=None, success=False, error_message=str(e),
|
134
|
+
data=None, success=False, error_message=str(e), model_used=selected_model
|
135
135
|
)
|