natural-pdf 0.1.21__py3-none-any.whl → 0.1.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/shape_detection_mixin.py +3 -3
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/classification/mixin.py +35 -14
- natural_pdf/classification/results.py +16 -1
- natural_pdf/cli.py +9 -27
- natural_pdf/core/highlighting_service.py +23 -0
- natural_pdf/core/page.py +16 -0
- natural_pdf/core/pdf.py +55 -49
- natural_pdf/describe/base.py +2 -2
- natural_pdf/describe/elements.py +1 -1
- natural_pdf/elements/base.py +79 -1
- natural_pdf/elements/collections.py +23 -1
- natural_pdf/elements/region.py +54 -148
- natural_pdf/exporters/paddleocr.py +1 -1
- natural_pdf/extraction/manager.py +2 -2
- natural_pdf/extraction/mixin.py +295 -11
- natural_pdf/extraction/result.py +28 -1
- natural_pdf/flows/region.py +1 -1
- natural_pdf/ocr/engine_surya.py +25 -5
- natural_pdf/qa/__init__.py +2 -1
- natural_pdf/qa/document_qa.py +33 -37
- natural_pdf/qa/qa_result.py +55 -0
- natural_pdf/selectors/parser.py +22 -0
- {natural_pdf-0.1.21.dist-info → natural_pdf-0.1.23.dist-info}/METADATA +21 -14
- {natural_pdf-0.1.21.dist-info → natural_pdf-0.1.23.dist-info}/RECORD +29 -28
- {natural_pdf-0.1.21.dist-info → natural_pdf-0.1.23.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.21.dist-info → natural_pdf-0.1.23.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.21.dist-info → natural_pdf-0.1.23.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.21.dist-info → natural_pdf-0.1.23.dist-info}/top_level.txt +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -82,7 +82,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
82
82
|
self.end_element = None
|
83
83
|
|
84
84
|
self.metadata: Dict[str, Any] = {}
|
85
|
-
self.
|
85
|
+
# Analysis results live under self.metadata['analysis'] via property
|
86
86
|
|
87
87
|
# Standard attributes for all elements
|
88
88
|
self.object_type = "region" # For selector compatibility
|
@@ -115,146 +115,28 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
115
115
|
**kwargs,
|
116
116
|
) -> "Region":
|
117
117
|
"""
|
118
|
-
|
118
|
+
Region-specific wrapper around :py:meth:`DirectionalMixin._direction`.
|
119
119
|
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
until: Optional selector string to specify a boundary element
|
126
|
-
include_endpoint: Whether to include the boundary element found by 'until'
|
127
|
-
**kwargs: Additional parameters for the 'until' selector search
|
128
|
-
|
129
|
-
Returns:
|
130
|
-
Region object
|
131
|
-
"""
|
132
|
-
import math # Use math.inf for infinity
|
133
|
-
|
134
|
-
is_horizontal = direction in ("left", "right")
|
135
|
-
is_positive = direction in ("right", "below") # right/below are positive directions
|
136
|
-
pixel_offset = 1 # Offset for excluding elements/endpoints
|
137
|
-
|
138
|
-
# 1. Determine initial boundaries based on direction and include_source
|
139
|
-
if is_horizontal:
|
140
|
-
# Initial cross-boundaries (vertical)
|
141
|
-
y0 = 0 if cross_size == "full" else self.top
|
142
|
-
y1 = self.page.height if cross_size == "full" else self.bottom
|
143
|
-
|
144
|
-
# Initial primary boundaries (horizontal)
|
145
|
-
if is_positive: # right
|
146
|
-
x0_initial = self.x0 if include_source else self.x1 + pixel_offset
|
147
|
-
x1_initial = self.x1 # This edge moves
|
148
|
-
else: # left
|
149
|
-
x0_initial = self.x0 # This edge moves
|
150
|
-
x1_initial = self.x1 if include_source else self.x0 - pixel_offset
|
151
|
-
else: # Vertical
|
152
|
-
# Initial cross-boundaries (horizontal)
|
153
|
-
x0 = 0 if cross_size == "full" else self.x0
|
154
|
-
x1 = self.page.width if cross_size == "full" else self.x1
|
155
|
-
|
156
|
-
# Initial primary boundaries (vertical)
|
157
|
-
if is_positive: # below
|
158
|
-
y0_initial = self.top if include_source else self.bottom + pixel_offset
|
159
|
-
y1_initial = self.bottom # This edge moves
|
160
|
-
else: # above
|
161
|
-
y0_initial = self.top # This edge moves
|
162
|
-
y1_initial = self.bottom if include_source else self.top - pixel_offset
|
163
|
-
|
164
|
-
# 2. Calculate the final primary boundary, considering 'size' or page limits
|
165
|
-
if is_horizontal:
|
166
|
-
if is_positive: # right
|
167
|
-
x1_final = min(
|
168
|
-
self.page.width,
|
169
|
-
x1_initial + (size if size is not None else (self.page.width - x1_initial)),
|
170
|
-
)
|
171
|
-
x0_final = x0_initial
|
172
|
-
else: # left
|
173
|
-
x0_final = max(0, x0_initial - (size if size is not None else x0_initial))
|
174
|
-
x1_final = x1_initial
|
175
|
-
else: # Vertical
|
176
|
-
if is_positive: # below
|
177
|
-
y1_final = min(
|
178
|
-
self.page.height,
|
179
|
-
y1_initial + (size if size is not None else (self.page.height - y1_initial)),
|
180
|
-
)
|
181
|
-
y0_final = y0_initial
|
182
|
-
else: # above
|
183
|
-
y0_final = max(0, y0_initial - (size if size is not None else y0_initial))
|
184
|
-
y1_final = y1_initial
|
185
|
-
|
186
|
-
# 3. Handle 'until' selector if provided
|
187
|
-
target = None
|
188
|
-
if until:
|
189
|
-
all_matches = self.page.find_all(until, **kwargs)
|
190
|
-
matches_in_direction = []
|
191
|
-
|
192
|
-
# Filter and sort matches based on direction
|
193
|
-
if direction == "above":
|
194
|
-
matches_in_direction = [m for m in all_matches if m.bottom <= self.top]
|
195
|
-
matches_in_direction.sort(key=lambda e: e.bottom, reverse=True)
|
196
|
-
elif direction == "below":
|
197
|
-
matches_in_direction = [m for m in all_matches if m.top >= self.bottom]
|
198
|
-
matches_in_direction.sort(key=lambda e: e.top)
|
199
|
-
elif direction == "left":
|
200
|
-
matches_in_direction = [m for m in all_matches if m.x1 <= self.x0]
|
201
|
-
matches_in_direction.sort(key=lambda e: e.x1, reverse=True)
|
202
|
-
elif direction == "right":
|
203
|
-
matches_in_direction = [m for m in all_matches if m.x0 >= self.x1]
|
204
|
-
matches_in_direction.sort(key=lambda e: e.x0)
|
205
|
-
|
206
|
-
if matches_in_direction:
|
207
|
-
target = matches_in_direction[0]
|
208
|
-
|
209
|
-
# Adjust the primary boundary based on the target
|
210
|
-
if is_horizontal:
|
211
|
-
if is_positive: # right
|
212
|
-
x1_final = target.x1 if include_endpoint else target.x0 - pixel_offset
|
213
|
-
else: # left
|
214
|
-
x0_final = target.x0 if include_endpoint else target.x1 + pixel_offset
|
215
|
-
else: # Vertical
|
216
|
-
if is_positive: # below
|
217
|
-
y1_final = target.bottom if include_endpoint else target.top - pixel_offset
|
218
|
-
else: # above
|
219
|
-
y0_final = target.top if include_endpoint else target.bottom + pixel_offset
|
220
|
-
|
221
|
-
# Adjust cross boundaries if cross_size is 'element'
|
222
|
-
if cross_size == "element":
|
223
|
-
if is_horizontal: # Adjust y0, y1
|
224
|
-
target_y0 = (
|
225
|
-
target.top if include_endpoint else target.bottom
|
226
|
-
) # Use opposite boundary if excluding
|
227
|
-
target_y1 = target.bottom if include_endpoint else target.top
|
228
|
-
y0 = min(y0, target_y0)
|
229
|
-
y1 = max(y1, target_y1)
|
230
|
-
else: # Adjust x0, x1
|
231
|
-
target_x0 = (
|
232
|
-
target.x0 if include_endpoint else target.x1
|
233
|
-
) # Use opposite boundary if excluding
|
234
|
-
target_x1 = target.x1 if include_endpoint else target.x0
|
235
|
-
x0 = min(x0, target_x0)
|
236
|
-
x1 = max(x1, target_x1)
|
237
|
-
|
238
|
-
# 4. Finalize bbox coordinates
|
239
|
-
if is_horizontal:
|
240
|
-
bbox = (x0_final, y0, x1_final, y1)
|
241
|
-
else:
|
242
|
-
bbox = (x0, y0_final, x1, y1_final)
|
120
|
+
It performs any pre-processing required by *Region* (none currently),
|
121
|
+
delegates the core geometry work to the mix-in implementation via
|
122
|
+
``super()``, then attaches region-level metadata before returning the
|
123
|
+
new :class:`Region` instance.
|
124
|
+
"""
|
243
125
|
|
244
|
-
#
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
126
|
+
# Delegate to the shared implementation on DirectionalMixin
|
127
|
+
region = super()._direction(
|
128
|
+
direction=direction,
|
129
|
+
size=size,
|
130
|
+
cross_size=cross_size,
|
131
|
+
include_source=include_source,
|
132
|
+
until=until,
|
133
|
+
include_endpoint=include_endpoint,
|
134
|
+
**kwargs,
|
135
|
+
)
|
250
136
|
|
251
|
-
#
|
252
|
-
region = Region(self.page, final_bbox)
|
137
|
+
# Post-process: make sure callers can trace lineage and flags
|
253
138
|
region.source_element = self
|
254
139
|
region.includes_source = include_source
|
255
|
-
# Optionally store the boundary element if found
|
256
|
-
if target:
|
257
|
-
region.boundary_element = target
|
258
140
|
|
259
141
|
return region
|
260
142
|
|
@@ -710,7 +592,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
710
592
|
self,
|
711
593
|
scale: float = 2.0,
|
712
594
|
resolution: float = 150,
|
713
|
-
|
595
|
+
crop: bool = False,
|
714
596
|
include_highlights: bool = True,
|
715
597
|
**kwargs,
|
716
598
|
) -> "Image.Image":
|
@@ -719,7 +601,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
719
601
|
|
720
602
|
Args:
|
721
603
|
resolution: Resolution in DPI for rendering (default: 150)
|
722
|
-
|
604
|
+
crop: If True, only crop the region without highlighting its boundaries
|
723
605
|
include_highlights: Whether to include existing highlights (default: True)
|
724
606
|
**kwargs: Additional parameters for page.to_image()
|
725
607
|
|
@@ -730,7 +612,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
730
612
|
page_kwargs = kwargs.copy()
|
731
613
|
effective_resolution = resolution # Start with the provided resolution
|
732
614
|
|
733
|
-
if
|
615
|
+
if crop and "width" in kwargs:
|
734
616
|
target_width = kwargs["width"]
|
735
617
|
# Calculate what resolution is needed to make the region crop have target_width
|
736
618
|
region_width_points = self.width # Region width in PDF points
|
@@ -785,8 +667,8 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
785
667
|
# Crop the image to just this region
|
786
668
|
region_image = page_image.crop((x0, top, x1, bottom))
|
787
669
|
|
788
|
-
# If not
|
789
|
-
if not
|
670
|
+
# If not crop, add a border to highlight the region boundaries
|
671
|
+
if not crop:
|
790
672
|
from PIL import ImageDraw
|
791
673
|
|
792
674
|
# Create a 1px border around the region
|
@@ -808,6 +690,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
808
690
|
color: Optional[Union[Tuple, str]] = "blue",
|
809
691
|
label: Optional[str] = None,
|
810
692
|
width: Optional[int] = None, # Add width parameter
|
693
|
+
crop: bool = False, # NEW: Crop output to region bounds before legend
|
811
694
|
) -> "Image.Image":
|
812
695
|
"""
|
813
696
|
Show the page with just this region highlighted temporarily.
|
@@ -819,6 +702,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
819
702
|
color: Color to highlight this region (default: blue)
|
820
703
|
label: Optional label for this region in the legend
|
821
704
|
width: Optional width for the output image in pixels
|
705
|
+
crop: If True, crop the rendered image to this region's
|
706
|
+
bounding box (with a small margin handled inside
|
707
|
+
HighlightingService) before legends/overlays are added.
|
822
708
|
|
823
709
|
Returns:
|
824
710
|
PIL Image of the page with only this region highlighted
|
@@ -844,6 +730,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
844
730
|
"use_color_cycling": False, # Explicitly false for single preview
|
845
731
|
}
|
846
732
|
|
733
|
+
# Determine crop bbox if requested
|
734
|
+
crop_bbox = self.bbox if crop else None
|
735
|
+
|
847
736
|
# Use render_preview to show only this highlight
|
848
737
|
return service.render_preview(
|
849
738
|
page_index=self._page.index,
|
@@ -852,6 +741,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
852
741
|
width=width, # Pass the width parameter
|
853
742
|
labels=labels,
|
854
743
|
legend_position=legend_position,
|
744
|
+
crop_bbox=crop_bbox,
|
855
745
|
)
|
856
746
|
|
857
747
|
def save(
|
@@ -880,7 +770,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
880
770
|
self,
|
881
771
|
filename: str,
|
882
772
|
resolution: float = 150,
|
883
|
-
|
773
|
+
crop: bool = False,
|
884
774
|
include_highlights: bool = True,
|
885
775
|
**kwargs,
|
886
776
|
) -> "Region":
|
@@ -890,7 +780,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
890
780
|
Args:
|
891
781
|
filename: Path to save the image to
|
892
782
|
resolution: Resolution in DPI for rendering (default: 150)
|
893
|
-
|
783
|
+
crop: If True, only crop the region without highlighting its boundaries
|
894
784
|
include_highlights: Whether to include existing highlights (default: True)
|
895
785
|
**kwargs: Additional parameters for page.to_image()
|
896
786
|
|
@@ -900,7 +790,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
900
790
|
# Get the region image
|
901
791
|
image = self.to_image(
|
902
792
|
resolution=resolution,
|
903
|
-
|
793
|
+
crop=crop,
|
904
794
|
include_highlights=include_highlights,
|
905
795
|
**kwargs,
|
906
796
|
)
|
@@ -953,7 +843,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
953
843
|
|
954
844
|
# Get the region image
|
955
845
|
image = work_region.to_image(
|
956
|
-
resolution=resolution,
|
846
|
+
resolution=resolution, crop=True, include_highlights=False
|
957
847
|
)
|
958
848
|
|
959
849
|
if image is None:
|
@@ -2044,7 +1934,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2044
1934
|
# Render the page region to an image using the determined resolution
|
2045
1935
|
try:
|
2046
1936
|
region_image = self.to_image(
|
2047
|
-
resolution=final_resolution, include_highlights=False,
|
1937
|
+
resolution=final_resolution, include_highlights=False, crop=True
|
2048
1938
|
)
|
2049
1939
|
if not region_image:
|
2050
1940
|
logger.error("Failed to render region to image for OCR.")
|
@@ -2466,7 +2356,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2466
2356
|
from natural_pdf.qa.document_qa import get_qa_engine
|
2467
2357
|
except ImportError:
|
2468
2358
|
logger.error(
|
2469
|
-
"Question answering requires optional dependencies. Install with `pip install natural-pdf[
|
2359
|
+
"Question answering requires optional dependencies. Install with `pip install natural-pdf[ai]`"
|
2470
2360
|
)
|
2471
2361
|
return {
|
2472
2362
|
"answer": None,
|
@@ -2684,7 +2574,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2684
2574
|
img = self.to_image(
|
2685
2575
|
resolution=resolution,
|
2686
2576
|
include_highlights=False, # No highlights for classification input
|
2687
|
-
|
2577
|
+
crop=True, # Just the region content
|
2688
2578
|
)
|
2689
2579
|
if img is None:
|
2690
2580
|
raise ValueError(
|
@@ -2964,4 +2854,20 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2964
2854
|
|
2965
2855
|
return text_element
|
2966
2856
|
|
2857
|
+
# ------------------------------------------------------------------
|
2858
|
+
# Unified analysis storage (maps to metadata["analysis"])
|
2859
|
+
# ------------------------------------------------------------------
|
2860
|
+
|
2861
|
+
@property
|
2862
|
+
def analyses(self) -> Dict[str, Any]:
|
2863
|
+
if not hasattr(self, "metadata") or self.metadata is None:
|
2864
|
+
self.metadata = {}
|
2865
|
+
return self.metadata.setdefault("analysis", {})
|
2866
|
+
|
2867
|
+
@analyses.setter
|
2868
|
+
def analyses(self, value: Dict[str, Any]):
|
2869
|
+
if not hasattr(self, "metadata") or self.metadata is None:
|
2870
|
+
self.metadata = {}
|
2871
|
+
self.metadata["analysis"] = value
|
2872
|
+
|
2967
2873
|
|
@@ -217,7 +217,7 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
|
|
217
217
|
# Expand region, render, and save image
|
218
218
|
region = element.expand(self.padding)
|
219
219
|
img = region.to_image(
|
220
|
-
resolution=self.resolution,
|
220
|
+
resolution=self.resolution, crop=True, include_highlights=False
|
221
221
|
)
|
222
222
|
img.save(absolute_image_path, "PNG")
|
223
223
|
|
@@ -126,10 +126,10 @@ class StructuredDataManager:
|
|
126
126
|
)
|
127
127
|
parsed_data = completion.choices[0].message.parsed
|
128
128
|
return StructuredDataResult(
|
129
|
-
data=parsed_data, success=True, error_message=None,
|
129
|
+
data=parsed_data, success=True, error_message=None, model_used=selected_model
|
130
130
|
)
|
131
131
|
except Exception as e:
|
132
132
|
logger.error(f"Extraction failed: {str(e)}")
|
133
133
|
return StructuredDataResult(
|
134
|
-
data=None, success=False, error_message=str(e),
|
134
|
+
data=None, success=False, error_message=str(e), model_used=selected_model
|
135
135
|
)
|