natural-pdf 0.1.40__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +6 -7
- natural_pdf/analyzers/__init__.py +6 -1
- natural_pdf/analyzers/guides.py +354 -258
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +18 -4
- natural_pdf/analyzers/layout/paddle.py +11 -0
- natural_pdf/analyzers/layout/surya.py +2 -3
- natural_pdf/analyzers/shape_detection_mixin.py +25 -34
- natural_pdf/analyzers/text_structure.py +2 -2
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/collections/mixins.py +3 -2
- natural_pdf/core/highlighting_service.py +743 -32
- natural_pdf/core/page.py +236 -383
- natural_pdf/core/page_collection.py +1249 -0
- natural_pdf/core/pdf.py +172 -83
- natural_pdf/{collections → core}/pdf_collection.py +18 -11
- natural_pdf/core/render_spec.py +335 -0
- natural_pdf/describe/base.py +1 -1
- natural_pdf/elements/__init__.py +1 -0
- natural_pdf/elements/base.py +108 -83
- natural_pdf/elements/{collections.py → element_collection.py} +566 -1487
- natural_pdf/elements/line.py +0 -1
- natural_pdf/elements/rect.py +0 -1
- natural_pdf/elements/region.py +318 -243
- natural_pdf/elements/text.py +9 -7
- natural_pdf/exporters/base.py +2 -2
- natural_pdf/exporters/original_pdf.py +1 -1
- natural_pdf/exporters/paddleocr.py +2 -4
- natural_pdf/exporters/searchable_pdf.py +3 -2
- natural_pdf/extraction/mixin.py +1 -3
- natural_pdf/flows/collections.py +1 -69
- natural_pdf/flows/element.py +4 -4
- natural_pdf/flows/flow.py +1200 -243
- natural_pdf/flows/region.py +707 -261
- natural_pdf/ocr/ocr_options.py +0 -2
- natural_pdf/ocr/utils.py +2 -1
- natural_pdf/qa/document_qa.py +21 -5
- natural_pdf/search/search_service_protocol.py +1 -1
- natural_pdf/selectors/parser.py +2 -2
- natural_pdf/tables/result.py +35 -1
- natural_pdf/text_mixin.py +7 -3
- natural_pdf/utils/debug.py +2 -1
- natural_pdf/utils/highlighting.py +1 -0
- natural_pdf/utils/layout.py +2 -2
- natural_pdf/utils/packaging.py +4 -3
- natural_pdf/utils/text_extraction.py +15 -12
- natural_pdf/utils/visualization.py +385 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -53
- optimization/memory_comparison.py +1 -1
- optimization/pdf_analyzer.py +2 -2
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -1,5 +1,16 @@
|
|
1
1
|
import logging
|
2
|
-
from typing import
|
2
|
+
from typing import (
|
3
|
+
TYPE_CHECKING,
|
4
|
+
Any,
|
5
|
+
Callable,
|
6
|
+
Dict,
|
7
|
+
List,
|
8
|
+
Literal,
|
9
|
+
Optional,
|
10
|
+
Tuple,
|
11
|
+
Union,
|
12
|
+
overload,
|
13
|
+
)
|
3
14
|
|
4
15
|
from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
|
5
16
|
|
@@ -15,23 +26,29 @@ from natural_pdf.classification.manager import ClassificationManager # Keep for
|
|
15
26
|
|
16
27
|
# --- Classification Imports --- #
|
17
28
|
from natural_pdf.classification.mixin import ClassificationMixin
|
29
|
+
|
30
|
+
# Add Visualizable import
|
31
|
+
from natural_pdf.core.render_spec import RenderSpec, Visualizable
|
18
32
|
from natural_pdf.describe.mixin import DescribeMixin
|
19
33
|
from natural_pdf.elements.base import DirectionalMixin
|
20
34
|
from natural_pdf.elements.text import TextElement # ADDED IMPORT
|
21
35
|
from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
|
22
36
|
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
|
23
37
|
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
24
|
-
from natural_pdf.text_mixin import TextMixin
|
25
38
|
|
26
39
|
# ------------------------------------------------------------------
|
27
40
|
# Table utilities
|
28
41
|
# ------------------------------------------------------------------
|
29
42
|
from natural_pdf.tables import TableResult
|
43
|
+
from natural_pdf.text_mixin import TextMixin
|
30
44
|
from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
31
45
|
|
32
46
|
# Import new utils
|
33
47
|
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
34
48
|
|
49
|
+
# Import viewer widget support
|
50
|
+
from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerWidget
|
51
|
+
|
35
52
|
# --- End Classification Imports --- #
|
36
53
|
|
37
54
|
|
@@ -43,7 +60,7 @@ if TYPE_CHECKING:
|
|
43
60
|
|
44
61
|
from natural_pdf.core.page import Page
|
45
62
|
from natural_pdf.elements.base import Element # Added for type hint
|
46
|
-
from natural_pdf.elements.
|
63
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
47
64
|
from natural_pdf.elements.text import TextElement
|
48
65
|
|
49
66
|
# Import OCRManager conditionally to avoid circular imports
|
@@ -63,6 +80,7 @@ class Region(
|
|
63
80
|
ExtractionMixin,
|
64
81
|
ShapeDetectionMixin,
|
65
82
|
DescribeMixin,
|
83
|
+
Visualizable,
|
66
84
|
):
|
67
85
|
"""Represents a rectangular region on a page.
|
68
86
|
|
@@ -199,6 +217,62 @@ class Region(
|
|
199
217
|
self.text_content = None # Direct text content (e.g., from Docling)
|
200
218
|
self.associated_text_elements = [] # Native text elements that overlap with this region
|
201
219
|
|
220
|
+
def _get_render_specs(
|
221
|
+
self,
|
222
|
+
mode: Literal["show", "render"] = "show",
|
223
|
+
color: Optional[Union[str, Tuple[int, int, int]]] = None,
|
224
|
+
highlights: Optional[List[Dict[str, Any]]] = None,
|
225
|
+
crop: Union[bool, Literal["content"]] = True, # Default to True for regions
|
226
|
+
crop_bbox: Optional[Tuple[float, float, float, float]] = None,
|
227
|
+
**kwargs,
|
228
|
+
) -> List[RenderSpec]:
|
229
|
+
"""Get render specifications for this region.
|
230
|
+
|
231
|
+
Args:
|
232
|
+
mode: Rendering mode - 'show' includes highlights, 'render' is clean
|
233
|
+
color: Color for highlighting this region in show mode
|
234
|
+
highlights: Additional highlight groups to show
|
235
|
+
crop: Whether to crop to this region
|
236
|
+
crop_bbox: Explicit crop bounds (overrides region bounds)
|
237
|
+
**kwargs: Additional parameters
|
238
|
+
|
239
|
+
Returns:
|
240
|
+
List containing a single RenderSpec for this region's page
|
241
|
+
"""
|
242
|
+
from typing import Literal
|
243
|
+
|
244
|
+
spec = RenderSpec(page=self.page)
|
245
|
+
|
246
|
+
# Handle cropping
|
247
|
+
if crop_bbox:
|
248
|
+
spec.crop_bbox = crop_bbox
|
249
|
+
elif crop:
|
250
|
+
# Crop to this region's bounds
|
251
|
+
spec.crop_bbox = self.bbox
|
252
|
+
|
253
|
+
# Add highlights in show mode
|
254
|
+
if mode == "show":
|
255
|
+
# Highlight this region
|
256
|
+
if color or mode == "show": # Always highlight in show mode
|
257
|
+
spec.add_highlight(
|
258
|
+
bbox=self.bbox,
|
259
|
+
polygon=self.polygon if self.has_polygon else None,
|
260
|
+
color=color or "blue",
|
261
|
+
label=self.label or self.name or "Region",
|
262
|
+
)
|
263
|
+
|
264
|
+
# Add additional highlight groups if provided
|
265
|
+
if highlights:
|
266
|
+
for group in highlights:
|
267
|
+
elements = group.get("elements", [])
|
268
|
+
group_color = group.get("color", color)
|
269
|
+
group_label = group.get("label")
|
270
|
+
|
271
|
+
for elem in elements:
|
272
|
+
spec.add_highlight(element=elem, color=group_color, label=group_label)
|
273
|
+
|
274
|
+
return [spec]
|
275
|
+
|
202
276
|
def _direction(
|
203
277
|
self,
|
204
278
|
direction: str,
|
@@ -639,7 +713,7 @@ class Region(
|
|
639
713
|
label: Optional[str] = None,
|
640
714
|
color: Optional[Union[Tuple, str]] = None,
|
641
715
|
use_color_cycling: bool = False,
|
642
|
-
|
716
|
+
annotate: Optional[List[str]] = None,
|
643
717
|
existing: str = "append",
|
644
718
|
) -> "Region":
|
645
719
|
"""
|
@@ -649,7 +723,7 @@ class Region(
|
|
649
723
|
label: Optional label for the highlight
|
650
724
|
color: Color tuple/string for the highlight, or None to use automatic color
|
651
725
|
use_color_cycling: Force color cycling even with no label (default: False)
|
652
|
-
|
726
|
+
annotate: List of attribute names to display on the highlight (e.g., ['confidence', 'type'])
|
653
727
|
existing: How to handle existing highlights ('append' or 'replace').
|
654
728
|
|
655
729
|
Returns:
|
@@ -665,7 +739,7 @@ class Region(
|
|
665
739
|
"label": label,
|
666
740
|
"use_color_cycling": use_color_cycling,
|
667
741
|
"element": self, # Pass the region itself so attributes can be accessed
|
668
|
-
"
|
742
|
+
"annotate": annotate,
|
669
743
|
"existing": existing,
|
670
744
|
}
|
671
745
|
|
@@ -679,178 +753,6 @@ class Region(
|
|
679
753
|
|
680
754
|
return self
|
681
755
|
|
682
|
-
def to_image(
|
683
|
-
self,
|
684
|
-
resolution: Optional[float] = None,
|
685
|
-
crop: bool = False,
|
686
|
-
include_highlights: bool = True,
|
687
|
-
**kwargs,
|
688
|
-
) -> "Image.Image":
|
689
|
-
"""
|
690
|
-
Generate an image of just this region.
|
691
|
-
|
692
|
-
Args:
|
693
|
-
resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
|
694
|
-
crop: If True, only crop the region without highlighting its boundaries
|
695
|
-
include_highlights: Whether to include existing highlights (default: True)
|
696
|
-
**kwargs: Additional parameters for page.to_image()
|
697
|
-
|
698
|
-
Returns:
|
699
|
-
PIL Image of just this region
|
700
|
-
"""
|
701
|
-
# Apply global options as defaults
|
702
|
-
import natural_pdf
|
703
|
-
|
704
|
-
if resolution is None:
|
705
|
-
if natural_pdf.options.image.resolution is not None:
|
706
|
-
resolution = natural_pdf.options.image.resolution
|
707
|
-
else:
|
708
|
-
resolution = 144 # Default resolution when none specified
|
709
|
-
|
710
|
-
# Handle the case where user wants the cropped region to have a specific width
|
711
|
-
page_kwargs = kwargs.copy()
|
712
|
-
effective_resolution = resolution # Start with the provided resolution
|
713
|
-
|
714
|
-
if crop and "width" in kwargs:
|
715
|
-
target_width = kwargs["width"]
|
716
|
-
# Calculate what resolution is needed to make the region crop have target_width
|
717
|
-
region_width_points = self.width # Region width in PDF points
|
718
|
-
|
719
|
-
if region_width_points > 0:
|
720
|
-
# Calculate scale needed: target_width / region_width_points
|
721
|
-
required_scale = target_width / region_width_points
|
722
|
-
# Convert scale to resolution: scale * 72 DPI
|
723
|
-
effective_resolution = required_scale * 72.0
|
724
|
-
page_kwargs.pop("width") # Remove width parameter to avoid conflicts
|
725
|
-
logger.debug(
|
726
|
-
f"Region {self.bbox}: Calculated required resolution {effective_resolution:.1f} DPI for region crop width {target_width}"
|
727
|
-
)
|
728
|
-
else:
|
729
|
-
logger.warning(
|
730
|
-
f"Region {self.bbox}: Invalid region width {region_width_points}, using original resolution"
|
731
|
-
)
|
732
|
-
|
733
|
-
# First get the full page image with highlights if requested
|
734
|
-
page_image = self._page.to_image(
|
735
|
-
resolution=effective_resolution,
|
736
|
-
include_highlights=include_highlights,
|
737
|
-
**page_kwargs,
|
738
|
-
)
|
739
|
-
|
740
|
-
# Calculate the actual scale factor used by the page image
|
741
|
-
if page_image.width > 0 and self._page.width > 0:
|
742
|
-
scale_factor = page_image.width / self._page.width
|
743
|
-
else:
|
744
|
-
# Fallback to resolution-based calculation if dimensions are invalid
|
745
|
-
scale_factor = resolution / 72.0
|
746
|
-
|
747
|
-
# Apply scaling to the coordinates
|
748
|
-
x0 = int(self.x0 * scale_factor)
|
749
|
-
top = int(self.top * scale_factor)
|
750
|
-
x1 = int(self.x1 * scale_factor)
|
751
|
-
bottom = int(self.bottom * scale_factor)
|
752
|
-
|
753
|
-
# Ensure coords are valid for cropping (left < right, top < bottom)
|
754
|
-
if x0 >= x1:
|
755
|
-
logger.warning(
|
756
|
-
f"Region {self.bbox} resulted in non-positive width after scaling ({x0} >= {x1}). Cannot create image."
|
757
|
-
)
|
758
|
-
return None
|
759
|
-
if top >= bottom:
|
760
|
-
logger.warning(
|
761
|
-
f"Region {self.bbox} resulted in non-positive height after scaling ({top} >= {bottom}). Cannot create image."
|
762
|
-
)
|
763
|
-
return None
|
764
|
-
|
765
|
-
# Crop the image to just this region
|
766
|
-
region_image = page_image.crop((x0, top, x1, bottom))
|
767
|
-
|
768
|
-
# If not crop, add a border to highlight the region boundaries
|
769
|
-
if not crop:
|
770
|
-
from PIL import ImageDraw
|
771
|
-
|
772
|
-
# Create a 1px border around the region
|
773
|
-
draw = ImageDraw.Draw(region_image)
|
774
|
-
draw.rectangle(
|
775
|
-
(0, 0, region_image.width - 1, region_image.height - 1),
|
776
|
-
outline=(255, 0, 0),
|
777
|
-
width=1,
|
778
|
-
)
|
779
|
-
|
780
|
-
return region_image
|
781
|
-
|
782
|
-
def show(
|
783
|
-
self,
|
784
|
-
resolution: Optional[float] = None,
|
785
|
-
labels: bool = True,
|
786
|
-
legend_position: str = "right",
|
787
|
-
# Add a default color for standalone show
|
788
|
-
color: Optional[Union[Tuple, str]] = "blue",
|
789
|
-
label: Optional[str] = None,
|
790
|
-
width: Optional[int] = None, # Add width parameter
|
791
|
-
crop: bool = False, # NEW: Crop output to region bounds before legend
|
792
|
-
) -> "Image.Image":
|
793
|
-
"""
|
794
|
-
Show the page with just this region highlighted temporarily.
|
795
|
-
|
796
|
-
Args:
|
797
|
-
resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
|
798
|
-
labels: Whether to include a legend for labels
|
799
|
-
legend_position: Position of the legend
|
800
|
-
color: Color to highlight this region (default: blue)
|
801
|
-
label: Optional label for this region in the legend
|
802
|
-
width: Optional width for the output image in pixels
|
803
|
-
crop: If True, crop the rendered image to this region's
|
804
|
-
bounding box (with a small margin handled inside
|
805
|
-
HighlightingService) before legends/overlays are added.
|
806
|
-
|
807
|
-
Returns:
|
808
|
-
PIL Image of the page with only this region highlighted
|
809
|
-
"""
|
810
|
-
# Apply global options as defaults
|
811
|
-
import natural_pdf
|
812
|
-
|
813
|
-
if resolution is None:
|
814
|
-
if natural_pdf.options.image.resolution is not None:
|
815
|
-
resolution = natural_pdf.options.image.resolution
|
816
|
-
else:
|
817
|
-
resolution = 144 # Default resolution when none specified
|
818
|
-
|
819
|
-
if not self._page:
|
820
|
-
raise ValueError("Region must be associated with a page to show.")
|
821
|
-
|
822
|
-
# Use the highlighting service via the page's property
|
823
|
-
service = self._page._highlighter
|
824
|
-
|
825
|
-
# Determine the label if not provided
|
826
|
-
display_label = (
|
827
|
-
label if label is not None else f"Region ({self.type})" if self.type else "Region"
|
828
|
-
)
|
829
|
-
|
830
|
-
# Prepare temporary highlight data for just this region
|
831
|
-
temp_highlight_data = {
|
832
|
-
"page_index": self._page.index,
|
833
|
-
"bbox": self.bbox,
|
834
|
-
"polygon": self.polygon if self.has_polygon else None,
|
835
|
-
"color": color, # Use provided or default color
|
836
|
-
"label": display_label,
|
837
|
-
"use_color_cycling": False, # Explicitly false for single preview
|
838
|
-
}
|
839
|
-
|
840
|
-
# Determine crop bbox if requested
|
841
|
-
crop_bbox = self.bbox if crop else None
|
842
|
-
|
843
|
-
# Use render_preview to show only this highlight
|
844
|
-
return service.render_preview(
|
845
|
-
page_index=self._page.index,
|
846
|
-
temporary_highlights=[temp_highlight_data],
|
847
|
-
resolution=resolution,
|
848
|
-
width=width, # Pass the width parameter
|
849
|
-
labels=labels,
|
850
|
-
legend_position=legend_position,
|
851
|
-
crop_bbox=crop_bbox,
|
852
|
-
)
|
853
|
-
|
854
756
|
def save(
|
855
757
|
self,
|
856
758
|
filename: str,
|
@@ -904,7 +806,7 @@ class Region(
|
|
904
806
|
resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
|
905
807
|
crop: If True, only crop the region without highlighting its boundaries
|
906
808
|
include_highlights: Whether to include existing highlights (default: True)
|
907
|
-
**kwargs: Additional parameters for
|
809
|
+
**kwargs: Additional parameters for rendering
|
908
810
|
|
909
811
|
Returns:
|
910
812
|
Self for method chaining
|
@@ -918,16 +820,23 @@ class Region(
|
|
918
820
|
else:
|
919
821
|
resolution = 144 # Default resolution when none specified
|
920
822
|
|
921
|
-
#
|
922
|
-
|
923
|
-
|
924
|
-
|
925
|
-
|
926
|
-
|
927
|
-
|
823
|
+
# Use export() to save the image
|
824
|
+
if include_highlights:
|
825
|
+
# With highlights, use export() which includes them
|
826
|
+
self.export(
|
827
|
+
path=filename,
|
828
|
+
resolution=resolution,
|
829
|
+
crop=crop,
|
830
|
+
**kwargs,
|
831
|
+
)
|
832
|
+
else:
|
833
|
+
# Without highlights, use render() and save manually
|
834
|
+
image = self.render(resolution=resolution, crop=crop, **kwargs)
|
835
|
+
if image:
|
836
|
+
image.save(filename)
|
837
|
+
else:
|
838
|
+
logger.error(f"Failed to render region image for saving to {filename}")
|
928
839
|
|
929
|
-
# Save the image
|
930
|
-
image.save(filename)
|
931
840
|
return self
|
932
841
|
|
933
842
|
def trim(
|
@@ -988,7 +897,8 @@ class Region(
|
|
988
897
|
)
|
989
898
|
|
990
899
|
# Get the region image
|
991
|
-
|
900
|
+
# Use render() for clean image without highlights, with cropping
|
901
|
+
image = work_region.render(resolution=resolution, crop=True)
|
992
902
|
|
993
903
|
if image is None:
|
994
904
|
logger.warning(
|
@@ -1227,7 +1137,9 @@ class Region(
|
|
1227
1137
|
# Filter to elements in this region
|
1228
1138
|
return [e for e in page_elements if self._is_element_in_region(e)]
|
1229
1139
|
|
1230
|
-
def extract_text(
|
1140
|
+
def extract_text(
|
1141
|
+
self, apply_exclusions=True, debug=False, content_filter=None, **kwargs
|
1142
|
+
) -> str:
|
1231
1143
|
"""
|
1232
1144
|
Extract text from this region, respecting page exclusions and using pdfplumber's
|
1233
1145
|
layout engine (chars_to_textmap).
|
@@ -1299,7 +1211,7 @@ class Region(
|
|
1299
1211
|
final_kwargs = kwargs.copy()
|
1300
1212
|
if content_filter is not None:
|
1301
1213
|
final_kwargs["content_filter"] = content_filter
|
1302
|
-
|
1214
|
+
|
1303
1215
|
result = generate_text_layout(
|
1304
1216
|
char_dicts=filtered_chars,
|
1305
1217
|
layout_context_bbox=self.bbox, # Use region's bbox for context
|
@@ -1319,7 +1231,9 @@ class Region(
|
|
1319
1231
|
cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
|
1320
1232
|
# --- NEW: Add tqdm control option --- #
|
1321
1233
|
show_progress: bool = False, # Controls progress bar for text method
|
1322
|
-
content_filter: Optional[
|
1234
|
+
content_filter: Optional[
|
1235
|
+
Union[str, Callable[[str], bool], List[str]]
|
1236
|
+
] = None, # NEW: Content filtering
|
1323
1237
|
) -> TableResult: # Return type allows Optional[str] for cells
|
1324
1238
|
"""
|
1325
1239
|
Extract a table from this region.
|
@@ -1379,7 +1293,11 @@ class Region(
|
|
1379
1293
|
logger.debug(
|
1380
1294
|
f"Region {self.bbox}: Found {len(cell_regions_in_table)} pre-computed table_cell regions – using 'cells' method."
|
1381
1295
|
)
|
1382
|
-
return TableResult(
|
1296
|
+
return TableResult(
|
1297
|
+
self._extract_table_from_cells(
|
1298
|
+
cell_regions_in_table, content_filter=content_filter
|
1299
|
+
)
|
1300
|
+
)
|
1383
1301
|
|
1384
1302
|
# --------------------------------------------------------------- #
|
1385
1303
|
|
@@ -1460,7 +1378,9 @@ class Region(
|
|
1460
1378
|
|
1461
1379
|
# Use the selected method
|
1462
1380
|
if effective_method == "tatr":
|
1463
|
-
table_rows = self._extract_table_tatr(
|
1381
|
+
table_rows = self._extract_table_tatr(
|
1382
|
+
use_ocr=use_ocr, ocr_config=ocr_config, content_filter=content_filter
|
1383
|
+
)
|
1464
1384
|
elif effective_method == "text":
|
1465
1385
|
current_text_options = text_options.copy()
|
1466
1386
|
current_text_options["cell_extraction_func"] = cell_extraction_func
|
@@ -1763,10 +1683,12 @@ class Region(
|
|
1763
1683
|
if cell is not None:
|
1764
1684
|
# Apply RTL text processing first
|
1765
1685
|
rtl_processed_cell = self._apply_rtl_processing_to_text(cell)
|
1766
|
-
|
1686
|
+
|
1767
1687
|
# Then apply content filter if provided
|
1768
1688
|
if content_filter is not None:
|
1769
|
-
filtered_cell = self._apply_content_filter_to_text(
|
1689
|
+
filtered_cell = self._apply_content_filter_to_text(
|
1690
|
+
rtl_processed_cell, content_filter
|
1691
|
+
)
|
1770
1692
|
processed_row.append(filtered_cell)
|
1771
1693
|
else:
|
1772
1694
|
processed_row.append(rtl_processed_cell)
|
@@ -1776,7 +1698,9 @@ class Region(
|
|
1776
1698
|
return processed_table
|
1777
1699
|
return []
|
1778
1700
|
|
1779
|
-
def _extract_table_tatr(
|
1701
|
+
def _extract_table_tatr(
|
1702
|
+
self, use_ocr=False, ocr_config=None, content_filter=None
|
1703
|
+
) -> List[List[str]]:
|
1780
1704
|
"""
|
1781
1705
|
Extract table using TATR structure detection.
|
1782
1706
|
|
@@ -2173,7 +2097,7 @@ class Region(
|
|
2173
2097
|
Returns:
|
2174
2098
|
ElementCollection with matching elements.
|
2175
2099
|
"""
|
2176
|
-
from natural_pdf.elements.
|
2100
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
2177
2101
|
|
2178
2102
|
if selector is not None and text is not None:
|
2179
2103
|
raise ValueError("Provide either 'selector' or 'text', not both.")
|
@@ -2258,7 +2182,7 @@ class Region(
|
|
2258
2182
|
---------
|
2259
2183
|
```python
|
2260
2184
|
def llm_ocr(region):
|
2261
|
-
image = region.
|
2185
|
+
image = region.render(resolution=300, crop=True)
|
2262
2186
|
return my_llm_client.ocr(image)
|
2263
2187
|
region.apply_ocr(function=llm_ocr)
|
2264
2188
|
```
|
@@ -2368,9 +2292,8 @@ class Region(
|
|
2368
2292
|
|
2369
2293
|
# Render the page region to an image using the determined resolution
|
2370
2294
|
try:
|
2371
|
-
|
2372
|
-
|
2373
|
-
)
|
2295
|
+
# Use render() for clean image without highlights, with cropping
|
2296
|
+
region_image = self.render(resolution=final_resolution, crop=True)
|
2374
2297
|
if not region_image:
|
2375
2298
|
logger.error("Failed to render region to image for OCR.")
|
2376
2299
|
return self
|
@@ -2492,7 +2415,7 @@ class Region(
|
|
2492
2415
|
Example:
|
2493
2416
|
# Using with an LLM
|
2494
2417
|
def ocr_with_llm(region):
|
2495
|
-
image = region.
|
2418
|
+
image = region.render(resolution=300, crop=True)
|
2496
2419
|
# Call your LLM API here
|
2497
2420
|
return llm_client.ocr(image)
|
2498
2421
|
|
@@ -2500,7 +2423,7 @@ class Region(
|
|
2500
2423
|
|
2501
2424
|
# Using with a custom OCR service
|
2502
2425
|
def ocr_with_service(region):
|
2503
|
-
img_bytes = region.
|
2426
|
+
img_bytes = region.render(crop=True).tobytes()
|
2504
2427
|
response = ocr_service.process(img_bytes)
|
2505
2428
|
return response.text
|
2506
2429
|
|
@@ -2605,14 +2528,14 @@ class Region(
|
|
2605
2528
|
|
2606
2529
|
return self
|
2607
2530
|
|
2608
|
-
def get_section_between(self, start_element=None, end_element=None,
|
2531
|
+
def get_section_between(self, start_element=None, end_element=None, include_boundaries="both"):
|
2609
2532
|
"""
|
2610
2533
|
Get a section between two elements within this region.
|
2611
2534
|
|
2612
2535
|
Args:
|
2613
2536
|
start_element: Element marking the start of the section
|
2614
2537
|
end_element: Element marking the end of the section
|
2615
|
-
|
2538
|
+
include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
|
2616
2539
|
|
2617
2540
|
Returns:
|
2618
2541
|
Region representing the section
|
@@ -2661,15 +2584,15 @@ class Region(
|
|
2661
2584
|
start_element_for_bbox = start_element
|
2662
2585
|
end_element_for_bbox = end_element
|
2663
2586
|
|
2664
|
-
if
|
2587
|
+
if include_boundaries == "none":
|
2665
2588
|
start_idx += 1
|
2666
2589
|
end_idx -= 1
|
2667
2590
|
start_element_for_bbox = elements[start_idx] if start_idx <= end_idx else None
|
2668
2591
|
end_element_for_bbox = elements[end_idx] if start_idx <= end_idx else None
|
2669
|
-
elif
|
2592
|
+
elif include_boundaries == "start":
|
2670
2593
|
end_idx -= 1
|
2671
2594
|
end_element_for_bbox = elements[end_idx] if start_idx <= end_idx else None
|
2672
|
-
elif
|
2595
|
+
elif include_boundaries == "end":
|
2673
2596
|
start_idx += 1
|
2674
2597
|
start_element_for_bbox = elements[start_idx] if start_idx <= end_idx else None
|
2675
2598
|
|
@@ -2702,7 +2625,7 @@ class Region(
|
|
2702
2625
|
return section
|
2703
2626
|
|
2704
2627
|
def get_sections(
|
2705
|
-
self, start_elements=None, end_elements=None,
|
2628
|
+
self, start_elements=None, end_elements=None, include_boundaries="both"
|
2706
2629
|
) -> "ElementCollection[Region]":
|
2707
2630
|
"""
|
2708
2631
|
Get sections within this region based on start/end elements.
|
@@ -2710,12 +2633,12 @@ class Region(
|
|
2710
2633
|
Args:
|
2711
2634
|
start_elements: Elements or selector string that mark the start of sections
|
2712
2635
|
end_elements: Elements or selector string that mark the end of sections
|
2713
|
-
|
2636
|
+
include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
|
2714
2637
|
|
2715
2638
|
Returns:
|
2716
2639
|
List of Region objects representing the extracted sections
|
2717
2640
|
"""
|
2718
|
-
from natural_pdf.elements.
|
2641
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
2719
2642
|
|
2720
2643
|
# Process string selectors to find elements WITHIN THIS REGION
|
2721
2644
|
if isinstance(start_elements, str):
|
@@ -2789,7 +2712,7 @@ class Region(
|
|
2789
2712
|
start_element = current_start_boundary["element"]
|
2790
2713
|
end_element = boundary["element"]
|
2791
2714
|
# Use the helper, ensuring elements are from within the region
|
2792
|
-
section = self.get_section_between(start_element, end_element,
|
2715
|
+
section = self.get_section_between(start_element, end_element, include_boundaries)
|
2793
2716
|
sections.append(section)
|
2794
2717
|
current_start_boundary = None # Reset
|
2795
2718
|
|
@@ -2806,7 +2729,7 @@ class Region(
|
|
2806
2729
|
if end_idx >= 0 and end_idx >= current_start_boundary["index"]:
|
2807
2730
|
end_element = all_elements_in_region[end_idx]
|
2808
2731
|
section = self.get_section_between(
|
2809
|
-
start_element, end_element,
|
2732
|
+
start_element, end_element, include_boundaries
|
2810
2733
|
)
|
2811
2734
|
sections.append(section)
|
2812
2735
|
# Else: Section started and ended by consecutive start elements? Create empty?
|
@@ -2820,7 +2743,7 @@ class Region(
|
|
2820
2743
|
start_element = current_start_boundary["element"]
|
2821
2744
|
# End at the last element within the region
|
2822
2745
|
end_element = all_elements_in_region[-1]
|
2823
|
-
section = self.get_section_between(start_element, end_element,
|
2746
|
+
section = self.get_section_between(start_element, end_element, include_boundaries)
|
2824
2747
|
sections.append(section)
|
2825
2748
|
|
2826
2749
|
return ElementCollection(sections)
|
@@ -3095,7 +3018,9 @@ class Region(
|
|
3095
3018
|
override simply ensures the search is scoped to the region.
|
3096
3019
|
"""
|
3097
3020
|
|
3098
|
-
return TextMixin.update_text(
|
3021
|
+
return TextMixin.update_text(
|
3022
|
+
self, transform, selector=selector, apply_exclusions=apply_exclusions
|
3023
|
+
)
|
3099
3024
|
|
3100
3025
|
# --- Classification Mixin Implementation --- #
|
3101
3026
|
def _get_classification_manager(self) -> "ClassificationManager":
|
@@ -3136,9 +3061,8 @@ class Region(
|
|
3136
3061
|
else default_resolution
|
3137
3062
|
)
|
3138
3063
|
|
3139
|
-
img = self.
|
3064
|
+
img = self.render(
|
3140
3065
|
resolution=resolution,
|
3141
|
-
include_highlights=False, # No highlights for classification input
|
3142
3066
|
crop=True, # Just the region content
|
3143
3067
|
)
|
3144
3068
|
if img is None:
|
@@ -3268,7 +3192,7 @@ class Region(
|
|
3268
3192
|
An ElementCollection containing temporary Region objects for each detected cell,
|
3269
3193
|
or an empty ElementCollection if no cells are found or an error occurs.
|
3270
3194
|
"""
|
3271
|
-
from natural_pdf.elements.
|
3195
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
3272
3196
|
|
3273
3197
|
# 1. Perform the analysis (or use cached results)
|
3274
3198
|
if "text_table_structure" in self.analyses:
|
@@ -3470,13 +3394,15 @@ class Region(
|
|
3470
3394
|
# New helper: build table from pre-computed table_cell regions
|
3471
3395
|
# ------------------------------------------------------------------
|
3472
3396
|
|
3473
|
-
def _extract_table_from_cells(
|
3397
|
+
def _extract_table_from_cells(
|
3398
|
+
self, cell_regions: List["Region"], content_filter=None
|
3399
|
+
) -> List[List[Optional[str]]]:
|
3474
3400
|
"""Construct a table (list-of-lists) from table_cell regions.
|
3475
3401
|
|
3476
3402
|
This assumes each cell Region has metadata.row_index / col_index as written by
|
3477
3403
|
detect_table_structure_from_lines(). If these keys are missing we will
|
3478
3404
|
fall back to sorting by geometry.
|
3479
|
-
|
3405
|
+
|
3480
3406
|
Args:
|
3481
3407
|
cell_regions: List of table cell Region objects to extract text from
|
3482
3408
|
content_filter: Optional content filter to apply to cell text extraction
|
@@ -3510,7 +3436,9 @@ class Region(
|
|
3510
3436
|
try:
|
3511
3437
|
r_idx = int(cell.metadata.get("row_index"))
|
3512
3438
|
c_idx = int(cell.metadata.get("col_index"))
|
3513
|
-
text_val = cell.extract_text(
|
3439
|
+
text_val = cell.extract_text(
|
3440
|
+
layout=False, apply_exclusions=False, content_filter=content_filter
|
3441
|
+
).strip()
|
3514
3442
|
table_grid[r_idx][c_idx] = text_val if text_val else None
|
3515
3443
|
except Exception as _err:
|
3516
3444
|
# Skip problematic cell
|
@@ -3557,7 +3485,9 @@ class Region(
|
|
3557
3485
|
row_idx = int(np.argmin([abs(cy - rc) for rc in row_centers]))
|
3558
3486
|
col_idx = int(np.argmin([abs(cx - cc) for cc in col_centers]))
|
3559
3487
|
|
3560
|
-
text_val = cell.extract_text(
|
3488
|
+
text_val = cell.extract_text(
|
3489
|
+
layout=False, apply_exclusions=False, content_filter=content_filter
|
3490
|
+
).strip()
|
3561
3491
|
table_grid[row_idx][col_idx] = text_val if text_val else None
|
3562
3492
|
|
3563
3493
|
return table_grid
|
@@ -3565,32 +3495,33 @@ class Region(
|
|
3565
3495
|
def _apply_rtl_processing_to_text(self, text: str) -> str:
|
3566
3496
|
"""
|
3567
3497
|
Apply RTL (Right-to-Left) text processing to a string.
|
3568
|
-
|
3498
|
+
|
3569
3499
|
This converts visual order text (as stored in PDFs) to logical order
|
3570
3500
|
for proper display of Arabic, Hebrew, and other RTL scripts.
|
3571
|
-
|
3501
|
+
|
3572
3502
|
Args:
|
3573
3503
|
text: Input text string in visual order
|
3574
|
-
|
3504
|
+
|
3575
3505
|
Returns:
|
3576
3506
|
Text string in logical order
|
3577
3507
|
"""
|
3578
3508
|
if not text or not text.strip():
|
3579
3509
|
return text
|
3580
|
-
|
3510
|
+
|
3581
3511
|
# Quick check for RTL characters - if none found, return as-is
|
3582
3512
|
import unicodedata
|
3583
|
-
|
3513
|
+
|
3584
3514
|
def _contains_rtl(s):
|
3585
3515
|
return any(unicodedata.bidirectional(ch) in ("R", "AL", "AN") for ch in s)
|
3586
|
-
|
3516
|
+
|
3587
3517
|
if not _contains_rtl(text):
|
3588
3518
|
return text
|
3589
|
-
|
3519
|
+
|
3590
3520
|
try:
|
3591
3521
|
from bidi.algorithm import get_display # type: ignore
|
3522
|
+
|
3592
3523
|
from natural_pdf.utils.bidi_mirror import mirror_brackets
|
3593
|
-
|
3524
|
+
|
3594
3525
|
# Apply BiDi algorithm to convert from visual to logical order
|
3595
3526
|
# Process line by line to handle mixed content properly
|
3596
3527
|
processed_lines = []
|
@@ -3603,9 +3534,9 @@ class Region(
|
|
3603
3534
|
processed_lines.append(mirror_brackets(logical_line))
|
3604
3535
|
else:
|
3605
3536
|
processed_lines.append(line)
|
3606
|
-
|
3537
|
+
|
3607
3538
|
return "\n".join(processed_lines)
|
3608
|
-
|
3539
|
+
|
3609
3540
|
except (ImportError, Exception):
|
3610
3541
|
# If bidi library is not available or fails, return original text
|
3611
3542
|
return text
|
@@ -3613,36 +3544,36 @@ class Region(
|
|
3613
3544
|
def _apply_content_filter_to_text(self, text: str, content_filter) -> str:
|
3614
3545
|
"""
|
3615
3546
|
Apply content filter to a text string.
|
3616
|
-
|
3547
|
+
|
3617
3548
|
Args:
|
3618
3549
|
text: Input text string
|
3619
3550
|
content_filter: Content filter (regex, callable, or list of regexes)
|
3620
|
-
|
3551
|
+
|
3621
3552
|
Returns:
|
3622
3553
|
Filtered text string
|
3623
3554
|
"""
|
3624
3555
|
if not text or content_filter is None:
|
3625
3556
|
return text
|
3626
|
-
|
3557
|
+
|
3627
3558
|
import re
|
3628
|
-
|
3559
|
+
|
3629
3560
|
if isinstance(content_filter, str):
|
3630
3561
|
# Single regex pattern - remove matching parts
|
3631
3562
|
try:
|
3632
|
-
return re.sub(content_filter,
|
3563
|
+
return re.sub(content_filter, "", text)
|
3633
3564
|
except re.error:
|
3634
3565
|
return text # Invalid regex, return original
|
3635
|
-
|
3566
|
+
|
3636
3567
|
elif isinstance(content_filter, list):
|
3637
3568
|
# List of regex patterns - remove parts matching ANY pattern
|
3638
3569
|
try:
|
3639
3570
|
result = text
|
3640
3571
|
for pattern in content_filter:
|
3641
|
-
result = re.sub(pattern,
|
3572
|
+
result = re.sub(pattern, "", result)
|
3642
3573
|
return result
|
3643
3574
|
except re.error:
|
3644
3575
|
return text # Invalid regex, return original
|
3645
|
-
|
3576
|
+
|
3646
3577
|
elif callable(content_filter):
|
3647
3578
|
# Callable filter - apply to individual characters
|
3648
3579
|
try:
|
@@ -3650,8 +3581,152 @@ class Region(
|
|
3650
3581
|
for char in text:
|
3651
3582
|
if content_filter(char):
|
3652
3583
|
filtered_chars.append(char)
|
3653
|
-
return
|
3584
|
+
return "".join(filtered_chars)
|
3654
3585
|
except Exception:
|
3655
3586
|
return text # Function error, return original
|
3656
|
-
|
3587
|
+
|
3657
3588
|
return text
|
3589
|
+
|
3590
|
+
# ------------------------------------------------------------------
|
3591
|
+
# Interactive Viewer Support
|
3592
|
+
# ------------------------------------------------------------------
|
3593
|
+
|
3594
|
+
def viewer(
    self,
    *,
    resolution: int = 150,
    include_chars: bool = False,
    include_attributes: Optional[List[str]] = None,
) -> Optional["InteractiveViewerWidget"]:
    """Build an interactive ipywidget viewer limited to **this region**.

    The region is rendered through ``self.render(..., crop=True)``, the
    resulting image is embedded as a PNG data URI, and every element returned
    by ``self.get_elements(apply_exclusions=False)`` is described by a small
    dict whose coordinates are expressed relative to the region's top-left
    corner and multiplied by ``resolution / 72``.

    Parameters
    ----------
    resolution : int, default 150
        Rendering resolution (DPI) passed to ``render``; also drives the
        scale factor applied to element coordinates.
    include_chars : bool, default False
        When ``False``, elements whose ``type`` (case-insensitively) equals
        ``"char"`` are left out of the overlay.
    include_attributes : list[str], optional
        Extra element attribute names to copy into each overlay entry, in
        addition to the built-in list below.

    Returns
    -------
    InteractiveViewerWidget | None
        The widget, or ``None`` when *ipywidgets* is unavailable, the region
        could not be rendered, or any error was raised while building it
        (the error is logged, not propagated).
    """
    # Bail out early when the optional widget dependency is missing.
    widgets_ready = _IPYWIDGETS_AVAILABLE and InteractiveViewerWidget is not None
    if not widgets_ready:
        logger.error(
            "Interactive viewer requires 'ipywidgets'. "
            'Please install with: pip install "ipywidgets>=7.0.0,<10.0.0"'
        )
        return None

    try:
        import base64
        from io import BytesIO

        # --- Region image -> PNG data URI ---------------------------------
        rendered = self.render(resolution=resolution, crop=True)
        if rendered is None:
            logger.error(f"Failed to render image for region {self.bbox} viewer.")
            return None

        png_buffer = BytesIO()
        rendered.save(png_buffer, format="PNG")
        encoded_png = base64.b64encode(png_buffer.getvalue()).decode()
        image_uri = f"data:image/png;base64,{encoded_png}"

        # --- Overlay data --------------------------------------------------
        # Pixels per coordinate unit, assuming 72 units per inch.
        px_per_unit = resolution / 72.0

        overlay_candidates = self.get_elements(apply_exclusions=False)
        if not include_chars:
            overlay_candidates = [
                item
                for item in overlay_candidates
                if str(getattr(item, "type", "")).lower() != "char"
            ]

        attr_names = [
            "text",
            "fontname",
            "size",
            "bold",
            "italic",
            "color",
            "linewidth",
            "is_horizontal",
            "is_vertical",
            "source",
            "confidence",
            "label",
            "model",
            "upright",
            "direction",
        ]
        if include_attributes:
            # Only names not already in the built-in list are appended.
            attr_names += [name for name in include_attributes if name not in attr_names]

        # Values of these types are passed through as-is; anything else is
        # stringified so the payload stays JSON serialisable.
        json_safe = (str, int, float, bool, list, dict, type(None))

        overlay: List[dict] = []
        for position, item in enumerate(overlay_candidates):
            try:
                # Shift into region-local space, then scale.
                left = (item.x0 - self.x0) * px_per_unit
                upper = (item.top - self.top) * px_per_unit
                right = (item.x1 - self.x0) * px_per_unit
                lower = (item.bottom - self.top) * px_per_unit

                entry = {
                    "id": position,
                    "type": getattr(item, "type", "unknown"),
                    "x0": round(left, 2),
                    "y0": round(upper, 2),
                    "x1": round(right, 2),
                    "y1": round(lower, 2),
                    "width": round(right - left, 2),
                    "height": round(lower - upper, 2),
                }

                for name in attr_names:
                    if not hasattr(item, name):
                        continue
                    value = getattr(item, name)
                    entry[name] = value if isinstance(value, json_safe) else str(value)

                overlay.append(entry)
            except Exception as e:
                # A single bad element is logged and skipped, not fatal.
                logger.warning(f"Error preparing element {position} for region viewer: {e}")

        # --- Widget ----------------------------------------------------------
        return InteractiveViewerWidget(
            pdf_data={"page_image": image_uri, "elements": overlay}
        )

    except Exception as e:
        logger.error(f"Error creating viewer for region {self.bbox}: {e}", exc_info=True)
        return None
|