natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +11 -6
- natural_pdf/analyzers/__init__.py +6 -1
- natural_pdf/analyzers/guides.py +354 -258
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +18 -4
- natural_pdf/analyzers/layout/paddle.py +11 -0
- natural_pdf/analyzers/layout/surya.py +2 -3
- natural_pdf/analyzers/shape_detection_mixin.py +25 -34
- natural_pdf/analyzers/text_structure.py +2 -2
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/collections/mixins.py +3 -2
- natural_pdf/core/highlighting_service.py +743 -32
- natural_pdf/core/page.py +252 -399
- natural_pdf/core/page_collection.py +1249 -0
- natural_pdf/core/pdf.py +231 -89
- natural_pdf/{collections → core}/pdf_collection.py +18 -11
- natural_pdf/core/render_spec.py +335 -0
- natural_pdf/describe/base.py +1 -1
- natural_pdf/elements/__init__.py +1 -0
- natural_pdf/elements/base.py +108 -83
- natural_pdf/elements/{collections.py → element_collection.py} +575 -1372
- natural_pdf/elements/line.py +0 -1
- natural_pdf/elements/rect.py +0 -1
- natural_pdf/elements/region.py +405 -280
- natural_pdf/elements/text.py +9 -7
- natural_pdf/exporters/base.py +2 -2
- natural_pdf/exporters/original_pdf.py +1 -1
- natural_pdf/exporters/paddleocr.py +2 -4
- natural_pdf/exporters/searchable_pdf.py +3 -2
- natural_pdf/extraction/mixin.py +1 -3
- natural_pdf/flows/collections.py +1 -69
- natural_pdf/flows/element.py +25 -0
- natural_pdf/flows/flow.py +1658 -19
- natural_pdf/flows/region.py +757 -263
- natural_pdf/ocr/ocr_options.py +0 -2
- natural_pdf/ocr/utils.py +2 -1
- natural_pdf/qa/document_qa.py +21 -5
- natural_pdf/search/search_service_protocol.py +1 -1
- natural_pdf/selectors/parser.py +35 -2
- natural_pdf/tables/result.py +35 -1
- natural_pdf/text_mixin.py +101 -0
- natural_pdf/utils/debug.py +2 -1
- natural_pdf/utils/highlighting.py +1 -0
- natural_pdf/utils/layout.py +2 -2
- natural_pdf/utils/packaging.py +4 -3
- natural_pdf/utils/text_extraction.py +15 -12
- natural_pdf/utils/visualization.py +385 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -52
- optimization/memory_comparison.py +1 -1
- optimization/pdf_analyzer.py +2 -2
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -1,5 +1,16 @@
|
|
1
1
|
import logging
|
2
|
-
from typing import
|
2
|
+
from typing import (
|
3
|
+
TYPE_CHECKING,
|
4
|
+
Any,
|
5
|
+
Callable,
|
6
|
+
Dict,
|
7
|
+
List,
|
8
|
+
Literal,
|
9
|
+
Optional,
|
10
|
+
Tuple,
|
11
|
+
Union,
|
12
|
+
overload,
|
13
|
+
)
|
3
14
|
|
4
15
|
from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
|
5
16
|
|
@@ -15,6 +26,9 @@ from natural_pdf.classification.manager import ClassificationManager # Keep for
|
|
15
26
|
|
16
27
|
# --- Classification Imports --- #
|
17
28
|
from natural_pdf.classification.mixin import ClassificationMixin
|
29
|
+
|
30
|
+
# Add Visualizable import
|
31
|
+
from natural_pdf.core.render_spec import RenderSpec, Visualizable
|
18
32
|
from natural_pdf.describe.mixin import DescribeMixin
|
19
33
|
from natural_pdf.elements.base import DirectionalMixin
|
20
34
|
from natural_pdf.elements.text import TextElement # ADDED IMPORT
|
@@ -26,11 +40,15 @@ from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
|
26
40
|
# Table utilities
|
27
41
|
# ------------------------------------------------------------------
|
28
42
|
from natural_pdf.tables import TableResult
|
43
|
+
from natural_pdf.text_mixin import TextMixin
|
29
44
|
from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
30
45
|
|
31
46
|
# Import new utils
|
32
47
|
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
33
48
|
|
49
|
+
# Import viewer widget support
|
50
|
+
from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerWidget
|
51
|
+
|
34
52
|
# --- End Classification Imports --- #
|
35
53
|
|
36
54
|
|
@@ -42,7 +60,7 @@ if TYPE_CHECKING:
|
|
42
60
|
|
43
61
|
from natural_pdf.core.page import Page
|
44
62
|
from natural_pdf.elements.base import Element # Added for type hint
|
45
|
-
from natural_pdf.elements.
|
63
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
46
64
|
from natural_pdf.elements.text import TextElement
|
47
65
|
|
48
66
|
# Import OCRManager conditionally to avoid circular imports
|
@@ -56,7 +74,13 @@ logger = logging.getLogger(__name__)
|
|
56
74
|
|
57
75
|
|
58
76
|
class Region(
|
59
|
-
|
77
|
+
TextMixin,
|
78
|
+
DirectionalMixin,
|
79
|
+
ClassificationMixin,
|
80
|
+
ExtractionMixin,
|
81
|
+
ShapeDetectionMixin,
|
82
|
+
DescribeMixin,
|
83
|
+
Visualizable,
|
60
84
|
):
|
61
85
|
"""Represents a rectangular region on a page.
|
62
86
|
|
@@ -193,6 +217,62 @@ class Region(
|
|
193
217
|
self.text_content = None # Direct text content (e.g., from Docling)
|
194
218
|
self.associated_text_elements = [] # Native text elements that overlap with this region
|
195
219
|
|
220
|
+
def _get_render_specs(
|
221
|
+
self,
|
222
|
+
mode: Literal["show", "render"] = "show",
|
223
|
+
color: Optional[Union[str, Tuple[int, int, int]]] = None,
|
224
|
+
highlights: Optional[List[Dict[str, Any]]] = None,
|
225
|
+
crop: Union[bool, Literal["content"]] = True, # Default to True for regions
|
226
|
+
crop_bbox: Optional[Tuple[float, float, float, float]] = None,
|
227
|
+
**kwargs,
|
228
|
+
) -> List[RenderSpec]:
|
229
|
+
"""Get render specifications for this region.
|
230
|
+
|
231
|
+
Args:
|
232
|
+
mode: Rendering mode - 'show' includes highlights, 'render' is clean
|
233
|
+
color: Color for highlighting this region in show mode
|
234
|
+
highlights: Additional highlight groups to show
|
235
|
+
crop: Whether to crop to this region
|
236
|
+
crop_bbox: Explicit crop bounds (overrides region bounds)
|
237
|
+
**kwargs: Additional parameters
|
238
|
+
|
239
|
+
Returns:
|
240
|
+
List containing a single RenderSpec for this region's page
|
241
|
+
"""
|
242
|
+
from typing import Literal
|
243
|
+
|
244
|
+
spec = RenderSpec(page=self.page)
|
245
|
+
|
246
|
+
# Handle cropping
|
247
|
+
if crop_bbox:
|
248
|
+
spec.crop_bbox = crop_bbox
|
249
|
+
elif crop:
|
250
|
+
# Crop to this region's bounds
|
251
|
+
spec.crop_bbox = self.bbox
|
252
|
+
|
253
|
+
# Add highlights in show mode
|
254
|
+
if mode == "show":
|
255
|
+
# Highlight this region
|
256
|
+
if color or mode == "show": # Always highlight in show mode
|
257
|
+
spec.add_highlight(
|
258
|
+
bbox=self.bbox,
|
259
|
+
polygon=self.polygon if self.has_polygon else None,
|
260
|
+
color=color or "blue",
|
261
|
+
label=self.label or self.name or "Region",
|
262
|
+
)
|
263
|
+
|
264
|
+
# Add additional highlight groups if provided
|
265
|
+
if highlights:
|
266
|
+
for group in highlights:
|
267
|
+
elements = group.get("elements", [])
|
268
|
+
group_color = group.get("color", color)
|
269
|
+
group_label = group.get("label")
|
270
|
+
|
271
|
+
for elem in elements:
|
272
|
+
spec.add_highlight(element=elem, color=group_color, label=group_label)
|
273
|
+
|
274
|
+
return [spec]
|
275
|
+
|
196
276
|
def _direction(
|
197
277
|
self,
|
198
278
|
direction: str,
|
@@ -633,7 +713,7 @@ class Region(
|
|
633
713
|
label: Optional[str] = None,
|
634
714
|
color: Optional[Union[Tuple, str]] = None,
|
635
715
|
use_color_cycling: bool = False,
|
636
|
-
|
716
|
+
annotate: Optional[List[str]] = None,
|
637
717
|
existing: str = "append",
|
638
718
|
) -> "Region":
|
639
719
|
"""
|
@@ -643,7 +723,7 @@ class Region(
|
|
643
723
|
label: Optional label for the highlight
|
644
724
|
color: Color tuple/string for the highlight, or None to use automatic color
|
645
725
|
use_color_cycling: Force color cycling even with no label (default: False)
|
646
|
-
|
726
|
+
annotate: List of attribute names to display on the highlight (e.g., ['confidence', 'type'])
|
647
727
|
existing: How to handle existing highlights ('append' or 'replace').
|
648
728
|
|
649
729
|
Returns:
|
@@ -659,7 +739,7 @@ class Region(
|
|
659
739
|
"label": label,
|
660
740
|
"use_color_cycling": use_color_cycling,
|
661
741
|
"element": self, # Pass the region itself so attributes can be accessed
|
662
|
-
"
|
742
|
+
"annotate": annotate,
|
663
743
|
"existing": existing,
|
664
744
|
}
|
665
745
|
|
@@ -673,178 +753,6 @@ class Region(
|
|
673
753
|
|
674
754
|
return self
|
675
755
|
|
676
|
-
def to_image(
|
677
|
-
self,
|
678
|
-
resolution: Optional[float] = None,
|
679
|
-
crop: bool = False,
|
680
|
-
include_highlights: bool = True,
|
681
|
-
**kwargs,
|
682
|
-
) -> "Image.Image":
|
683
|
-
"""
|
684
|
-
Generate an image of just this region.
|
685
|
-
|
686
|
-
Args:
|
687
|
-
resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
|
688
|
-
crop: If True, only crop the region without highlighting its boundaries
|
689
|
-
include_highlights: Whether to include existing highlights (default: True)
|
690
|
-
**kwargs: Additional parameters for page.to_image()
|
691
|
-
|
692
|
-
Returns:
|
693
|
-
PIL Image of just this region
|
694
|
-
"""
|
695
|
-
# Apply global options as defaults
|
696
|
-
import natural_pdf
|
697
|
-
|
698
|
-
if resolution is None:
|
699
|
-
if natural_pdf.options.image.resolution is not None:
|
700
|
-
resolution = natural_pdf.options.image.resolution
|
701
|
-
else:
|
702
|
-
resolution = 144 # Default resolution when none specified
|
703
|
-
|
704
|
-
# Handle the case where user wants the cropped region to have a specific width
|
705
|
-
page_kwargs = kwargs.copy()
|
706
|
-
effective_resolution = resolution # Start with the provided resolution
|
707
|
-
|
708
|
-
if crop and "width" in kwargs:
|
709
|
-
target_width = kwargs["width"]
|
710
|
-
# Calculate what resolution is needed to make the region crop have target_width
|
711
|
-
region_width_points = self.width # Region width in PDF points
|
712
|
-
|
713
|
-
if region_width_points > 0:
|
714
|
-
# Calculate scale needed: target_width / region_width_points
|
715
|
-
required_scale = target_width / region_width_points
|
716
|
-
# Convert scale to resolution: scale * 72 DPI
|
717
|
-
effective_resolution = required_scale * 72.0
|
718
|
-
page_kwargs.pop("width") # Remove width parameter to avoid conflicts
|
719
|
-
logger.debug(
|
720
|
-
f"Region {self.bbox}: Calculated required resolution {effective_resolution:.1f} DPI for region crop width {target_width}"
|
721
|
-
)
|
722
|
-
else:
|
723
|
-
logger.warning(
|
724
|
-
f"Region {self.bbox}: Invalid region width {region_width_points}, using original resolution"
|
725
|
-
)
|
726
|
-
|
727
|
-
# First get the full page image with highlights if requested
|
728
|
-
page_image = self._page.to_image(
|
729
|
-
resolution=effective_resolution,
|
730
|
-
include_highlights=include_highlights,
|
731
|
-
**page_kwargs,
|
732
|
-
)
|
733
|
-
|
734
|
-
# Calculate the actual scale factor used by the page image
|
735
|
-
if page_image.width > 0 and self._page.width > 0:
|
736
|
-
scale_factor = page_image.width / self._page.width
|
737
|
-
else:
|
738
|
-
# Fallback to resolution-based calculation if dimensions are invalid
|
739
|
-
scale_factor = resolution / 72.0
|
740
|
-
|
741
|
-
# Apply scaling to the coordinates
|
742
|
-
x0 = int(self.x0 * scale_factor)
|
743
|
-
top = int(self.top * scale_factor)
|
744
|
-
x1 = int(self.x1 * scale_factor)
|
745
|
-
bottom = int(self.bottom * scale_factor)
|
746
|
-
|
747
|
-
# Ensure coords are valid for cropping (left < right, top < bottom)
|
748
|
-
if x0 >= x1:
|
749
|
-
logger.warning(
|
750
|
-
f"Region {self.bbox} resulted in non-positive width after scaling ({x0} >= {x1}). Cannot create image."
|
751
|
-
)
|
752
|
-
return None
|
753
|
-
if top >= bottom:
|
754
|
-
logger.warning(
|
755
|
-
f"Region {self.bbox} resulted in non-positive height after scaling ({top} >= {bottom}). Cannot create image."
|
756
|
-
)
|
757
|
-
return None
|
758
|
-
|
759
|
-
# Crop the image to just this region
|
760
|
-
region_image = page_image.crop((x0, top, x1, bottom))
|
761
|
-
|
762
|
-
# If not crop, add a border to highlight the region boundaries
|
763
|
-
if not crop:
|
764
|
-
from PIL import ImageDraw
|
765
|
-
|
766
|
-
# Create a 1px border around the region
|
767
|
-
draw = ImageDraw.Draw(region_image)
|
768
|
-
draw.rectangle(
|
769
|
-
(0, 0, region_image.width - 1, region_image.height - 1),
|
770
|
-
outline=(255, 0, 0),
|
771
|
-
width=1,
|
772
|
-
)
|
773
|
-
|
774
|
-
return region_image
|
775
|
-
|
776
|
-
def show(
|
777
|
-
self,
|
778
|
-
resolution: Optional[float] = None,
|
779
|
-
labels: bool = True,
|
780
|
-
legend_position: str = "right",
|
781
|
-
# Add a default color for standalone show
|
782
|
-
color: Optional[Union[Tuple, str]] = "blue",
|
783
|
-
label: Optional[str] = None,
|
784
|
-
width: Optional[int] = None, # Add width parameter
|
785
|
-
crop: bool = False, # NEW: Crop output to region bounds before legend
|
786
|
-
) -> "Image.Image":
|
787
|
-
"""
|
788
|
-
Show the page with just this region highlighted temporarily.
|
789
|
-
|
790
|
-
Args:
|
791
|
-
resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
|
792
|
-
labels: Whether to include a legend for labels
|
793
|
-
legend_position: Position of the legend
|
794
|
-
color: Color to highlight this region (default: blue)
|
795
|
-
label: Optional label for this region in the legend
|
796
|
-
width: Optional width for the output image in pixels
|
797
|
-
crop: If True, crop the rendered image to this region's
|
798
|
-
bounding box (with a small margin handled inside
|
799
|
-
HighlightingService) before legends/overlays are added.
|
800
|
-
|
801
|
-
Returns:
|
802
|
-
PIL Image of the page with only this region highlighted
|
803
|
-
"""
|
804
|
-
# Apply global options as defaults
|
805
|
-
import natural_pdf
|
806
|
-
|
807
|
-
if resolution is None:
|
808
|
-
if natural_pdf.options.image.resolution is not None:
|
809
|
-
resolution = natural_pdf.options.image.resolution
|
810
|
-
else:
|
811
|
-
resolution = 144 # Default resolution when none specified
|
812
|
-
|
813
|
-
if not self._page:
|
814
|
-
raise ValueError("Region must be associated with a page to show.")
|
815
|
-
|
816
|
-
# Use the highlighting service via the page's property
|
817
|
-
service = self._page._highlighter
|
818
|
-
|
819
|
-
# Determine the label if not provided
|
820
|
-
display_label = (
|
821
|
-
label if label is not None else f"Region ({self.type})" if self.type else "Region"
|
822
|
-
)
|
823
|
-
|
824
|
-
# Prepare temporary highlight data for just this region
|
825
|
-
temp_highlight_data = {
|
826
|
-
"page_index": self._page.index,
|
827
|
-
"bbox": self.bbox,
|
828
|
-
"polygon": self.polygon if self.has_polygon else None,
|
829
|
-
"color": color, # Use provided or default color
|
830
|
-
"label": display_label,
|
831
|
-
"use_color_cycling": False, # Explicitly false for single preview
|
832
|
-
}
|
833
|
-
|
834
|
-
# Determine crop bbox if requested
|
835
|
-
crop_bbox = self.bbox if crop else None
|
836
|
-
|
837
|
-
# Use render_preview to show only this highlight
|
838
|
-
return service.render_preview(
|
839
|
-
page_index=self._page.index,
|
840
|
-
temporary_highlights=[temp_highlight_data],
|
841
|
-
resolution=resolution,
|
842
|
-
width=width, # Pass the width parameter
|
843
|
-
labels=labels,
|
844
|
-
legend_position=legend_position,
|
845
|
-
crop_bbox=crop_bbox,
|
846
|
-
)
|
847
|
-
|
848
756
|
def save(
|
849
757
|
self,
|
850
758
|
filename: str,
|
@@ -898,7 +806,7 @@ class Region(
|
|
898
806
|
resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
|
899
807
|
crop: If True, only crop the region without highlighting its boundaries
|
900
808
|
include_highlights: Whether to include existing highlights (default: True)
|
901
|
-
**kwargs: Additional parameters for
|
809
|
+
**kwargs: Additional parameters for rendering
|
902
810
|
|
903
811
|
Returns:
|
904
812
|
Self for method chaining
|
@@ -912,16 +820,23 @@ class Region(
|
|
912
820
|
else:
|
913
821
|
resolution = 144 # Default resolution when none specified
|
914
822
|
|
915
|
-
#
|
916
|
-
|
917
|
-
|
918
|
-
|
919
|
-
|
920
|
-
|
921
|
-
|
823
|
+
# Use export() to save the image
|
824
|
+
if include_highlights:
|
825
|
+
# With highlights, use export() which includes them
|
826
|
+
self.export(
|
827
|
+
path=filename,
|
828
|
+
resolution=resolution,
|
829
|
+
crop=crop,
|
830
|
+
**kwargs,
|
831
|
+
)
|
832
|
+
else:
|
833
|
+
# Without highlights, use render() and save manually
|
834
|
+
image = self.render(resolution=resolution, crop=crop, **kwargs)
|
835
|
+
if image:
|
836
|
+
image.save(filename)
|
837
|
+
else:
|
838
|
+
logger.error(f"Failed to render region image for saving to {filename}")
|
922
839
|
|
923
|
-
# Save the image
|
924
|
-
image.save(filename)
|
925
840
|
return self
|
926
841
|
|
927
842
|
def trim(
|
@@ -982,7 +897,8 @@ class Region(
|
|
982
897
|
)
|
983
898
|
|
984
899
|
# Get the region image
|
985
|
-
|
900
|
+
# Use render() for clean image without highlights, with cropping
|
901
|
+
image = work_region.render(resolution=resolution, crop=True)
|
986
902
|
|
987
903
|
if image is None:
|
988
904
|
logger.warning(
|
@@ -1221,7 +1137,9 @@ class Region(
|
|
1221
1137
|
# Filter to elements in this region
|
1222
1138
|
return [e for e in page_elements if self._is_element_in_region(e)]
|
1223
1139
|
|
1224
|
-
def extract_text(
|
1140
|
+
def extract_text(
|
1141
|
+
self, apply_exclusions=True, debug=False, content_filter=None, **kwargs
|
1142
|
+
) -> str:
|
1225
1143
|
"""
|
1226
1144
|
Extract text from this region, respecting page exclusions and using pdfplumber's
|
1227
1145
|
layout engine (chars_to_textmap).
|
@@ -1293,7 +1211,7 @@ class Region(
|
|
1293
1211
|
final_kwargs = kwargs.copy()
|
1294
1212
|
if content_filter is not None:
|
1295
1213
|
final_kwargs["content_filter"] = content_filter
|
1296
|
-
|
1214
|
+
|
1297
1215
|
result = generate_text_layout(
|
1298
1216
|
char_dicts=filtered_chars,
|
1299
1217
|
layout_context_bbox=self.bbox, # Use region's bbox for context
|
@@ -1313,7 +1231,9 @@ class Region(
|
|
1313
1231
|
cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
|
1314
1232
|
# --- NEW: Add tqdm control option --- #
|
1315
1233
|
show_progress: bool = False, # Controls progress bar for text method
|
1316
|
-
content_filter: Optional[
|
1234
|
+
content_filter: Optional[
|
1235
|
+
Union[str, Callable[[str], bool], List[str]]
|
1236
|
+
] = None, # NEW: Content filtering
|
1317
1237
|
) -> TableResult: # Return type allows Optional[str] for cells
|
1318
1238
|
"""
|
1319
1239
|
Extract a table from this region.
|
@@ -1373,7 +1293,11 @@ class Region(
|
|
1373
1293
|
logger.debug(
|
1374
1294
|
f"Region {self.bbox}: Found {len(cell_regions_in_table)} pre-computed table_cell regions – using 'cells' method."
|
1375
1295
|
)
|
1376
|
-
return TableResult(
|
1296
|
+
return TableResult(
|
1297
|
+
self._extract_table_from_cells(
|
1298
|
+
cell_regions_in_table, content_filter=content_filter
|
1299
|
+
)
|
1300
|
+
)
|
1377
1301
|
|
1378
1302
|
# --------------------------------------------------------------- #
|
1379
1303
|
|
@@ -1454,7 +1378,9 @@ class Region(
|
|
1454
1378
|
|
1455
1379
|
# Use the selected method
|
1456
1380
|
if effective_method == "tatr":
|
1457
|
-
table_rows = self._extract_table_tatr(
|
1381
|
+
table_rows = self._extract_table_tatr(
|
1382
|
+
use_ocr=use_ocr, ocr_config=ocr_config, content_filter=content_filter
|
1383
|
+
)
|
1458
1384
|
elif effective_method == "text":
|
1459
1385
|
current_text_options = text_options.copy()
|
1460
1386
|
current_text_options["cell_extraction_func"] = cell_extraction_func
|
@@ -1610,8 +1536,47 @@ class Region(
|
|
1610
1536
|
table_settings.setdefault("join_x_tolerance", join)
|
1611
1537
|
table_settings.setdefault("join_y_tolerance", join)
|
1612
1538
|
|
1613
|
-
#
|
1614
|
-
|
1539
|
+
# -------------------------------------------------------------
|
1540
|
+
# Apply char-level exclusion filtering, if any exclusions are
|
1541
|
+
# defined on the parent Page. We create a lightweight
|
1542
|
+
# pdfplumber.Page copy whose .chars list omits characters that
|
1543
|
+
# fall inside any exclusion Region. Other object types are
|
1544
|
+
# left untouched for now ("chars-only" strategy).
|
1545
|
+
# -------------------------------------------------------------
|
1546
|
+
base_plumber_page = self.page._page
|
1547
|
+
|
1548
|
+
if getattr(self.page, "_exclusions", None):
|
1549
|
+
# Resolve exclusion Regions (callables already evaluated)
|
1550
|
+
exclusion_regions = self.page._get_exclusion_regions(include_callable=True)
|
1551
|
+
|
1552
|
+
def _keep_char(obj):
|
1553
|
+
"""Return True if pdfplumber obj should be kept."""
|
1554
|
+
if obj.get("object_type") != "char":
|
1555
|
+
# Keep non-char objects unchanged – lattice grids etc.
|
1556
|
+
return True
|
1557
|
+
|
1558
|
+
# Compute character centre point
|
1559
|
+
cx = (obj["x0"] + obj["x1"]) / 2.0
|
1560
|
+
cy = (obj["top"] + obj["bottom"]) / 2.0
|
1561
|
+
|
1562
|
+
# Reject if the centre lies inside ANY exclusion Region
|
1563
|
+
for reg in exclusion_regions:
|
1564
|
+
if reg.x0 <= cx <= reg.x1 and reg.top <= cy <= reg.bottom:
|
1565
|
+
return False
|
1566
|
+
return True
|
1567
|
+
|
1568
|
+
try:
|
1569
|
+
filtered_page = base_plumber_page.filter(_keep_char)
|
1570
|
+
except Exception as _filter_err:
|
1571
|
+
# Fallback – if filtering fails, log and proceed unfiltered
|
1572
|
+
logger.warning(
|
1573
|
+
f"Region {self.bbox}: Failed to filter pdfplumber chars for exclusions: {_filter_err}"
|
1574
|
+
)
|
1575
|
+
filtered_page = base_plumber_page
|
1576
|
+
else:
|
1577
|
+
filtered_page = base_plumber_page
|
1578
|
+
|
1579
|
+
cropped = filtered_page.crop(self.bbox)
|
1615
1580
|
|
1616
1581
|
# Extract all tables from the cropped area
|
1617
1582
|
tables = cropped.extract_tables(table_settings)
|
@@ -1672,8 +1637,38 @@ class Region(
|
|
1672
1637
|
if y_tol is not None:
|
1673
1638
|
table_settings.setdefault("text_y_tolerance", y_tol)
|
1674
1639
|
|
1675
|
-
#
|
1676
|
-
|
1640
|
+
# -------------------------------------------------------------
|
1641
|
+
# Apply char-level exclusion filtering (chars only) just like in
|
1642
|
+
# _extract_tables_plumber so header/footer text does not appear
|
1643
|
+
# in extracted tables.
|
1644
|
+
# -------------------------------------------------------------
|
1645
|
+
base_plumber_page = self.page._page
|
1646
|
+
|
1647
|
+
if getattr(self.page, "_exclusions", None):
|
1648
|
+
exclusion_regions = self.page._get_exclusion_regions(include_callable=True)
|
1649
|
+
|
1650
|
+
def _keep_char(obj):
|
1651
|
+
if obj.get("object_type") != "char":
|
1652
|
+
return True
|
1653
|
+
cx = (obj["x0"] + obj["x1"]) / 2.0
|
1654
|
+
cy = (obj["top"] + obj["bottom"]) / 2.0
|
1655
|
+
for reg in exclusion_regions:
|
1656
|
+
if reg.x0 <= cx <= reg.x1 and reg.top <= cy <= reg.bottom:
|
1657
|
+
return False
|
1658
|
+
return True
|
1659
|
+
|
1660
|
+
try:
|
1661
|
+
filtered_page = base_plumber_page.filter(_keep_char)
|
1662
|
+
except Exception as _filter_err:
|
1663
|
+
logger.warning(
|
1664
|
+
f"Region {self.bbox}: Failed to filter pdfplumber chars for exclusions (single table): {_filter_err}"
|
1665
|
+
)
|
1666
|
+
filtered_page = base_plumber_page
|
1667
|
+
else:
|
1668
|
+
filtered_page = base_plumber_page
|
1669
|
+
|
1670
|
+
# Now crop the (possibly filtered) page to the region bbox
|
1671
|
+
cropped = filtered_page.crop(self.bbox)
|
1677
1672
|
|
1678
1673
|
# Extract the single largest table from the cropped area
|
1679
1674
|
table = cropped.extract_table(table_settings)
|
@@ -1688,10 +1683,12 @@ class Region(
|
|
1688
1683
|
if cell is not None:
|
1689
1684
|
# Apply RTL text processing first
|
1690
1685
|
rtl_processed_cell = self._apply_rtl_processing_to_text(cell)
|
1691
|
-
|
1686
|
+
|
1692
1687
|
# Then apply content filter if provided
|
1693
1688
|
if content_filter is not None:
|
1694
|
-
filtered_cell = self._apply_content_filter_to_text(
|
1689
|
+
filtered_cell = self._apply_content_filter_to_text(
|
1690
|
+
rtl_processed_cell, content_filter
|
1691
|
+
)
|
1695
1692
|
processed_row.append(filtered_cell)
|
1696
1693
|
else:
|
1697
1694
|
processed_row.append(rtl_processed_cell)
|
@@ -1701,7 +1698,9 @@ class Region(
|
|
1701
1698
|
return processed_table
|
1702
1699
|
return []
|
1703
1700
|
|
1704
|
-
def _extract_table_tatr(
|
1701
|
+
def _extract_table_tatr(
|
1702
|
+
self, use_ocr=False, ocr_config=None, content_filter=None
|
1703
|
+
) -> List[List[str]]:
|
1705
1704
|
"""
|
1706
1705
|
Extract table using TATR structure detection.
|
1707
1706
|
|
@@ -2098,7 +2097,7 @@ class Region(
|
|
2098
2097
|
Returns:
|
2099
2098
|
ElementCollection with matching elements.
|
2100
2099
|
"""
|
2101
|
-
from natural_pdf.elements.
|
2100
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
2102
2101
|
|
2103
2102
|
if selector is not None and text is not None:
|
2104
2103
|
raise ValueError("Provide either 'selector' or 'text', not both.")
|
@@ -2183,7 +2182,7 @@ class Region(
|
|
2183
2182
|
---------
|
2184
2183
|
```python
|
2185
2184
|
def llm_ocr(region):
|
2186
|
-
image = region.
|
2185
|
+
image = region.render(resolution=300, crop=True)
|
2187
2186
|
return my_llm_client.ocr(image)
|
2188
2187
|
region.apply_ocr(function=llm_ocr)
|
2189
2188
|
```
|
@@ -2293,9 +2292,8 @@ class Region(
|
|
2293
2292
|
|
2294
2293
|
# Render the page region to an image using the determined resolution
|
2295
2294
|
try:
|
2296
|
-
|
2297
|
-
|
2298
|
-
)
|
2295
|
+
# Use render() for clean image without highlights, with cropping
|
2296
|
+
region_image = self.render(resolution=final_resolution, crop=True)
|
2299
2297
|
if not region_image:
|
2300
2298
|
logger.error("Failed to render region to image for OCR.")
|
2301
2299
|
return self
|
@@ -2417,7 +2415,7 @@ class Region(
|
|
2417
2415
|
Example:
|
2418
2416
|
# Using with an LLM
|
2419
2417
|
def ocr_with_llm(region):
|
2420
|
-
image = region.
|
2418
|
+
image = region.render(resolution=300, crop=True)
|
2421
2419
|
# Call your LLM API here
|
2422
2420
|
return llm_client.ocr(image)
|
2423
2421
|
|
@@ -2425,7 +2423,7 @@ class Region(
|
|
2425
2423
|
|
2426
2424
|
# Using with a custom OCR service
|
2427
2425
|
def ocr_with_service(region):
|
2428
|
-
img_bytes = region.
|
2426
|
+
img_bytes = region.render(crop=True).tobytes()
|
2429
2427
|
response = ocr_service.process(img_bytes)
|
2430
2428
|
return response.text
|
2431
2429
|
|
@@ -2530,14 +2528,14 @@ class Region(
|
|
2530
2528
|
|
2531
2529
|
return self
|
2532
2530
|
|
2533
|
-
def get_section_between(self, start_element=None, end_element=None,
|
2531
|
+
def get_section_between(self, start_element=None, end_element=None, include_boundaries="both"):
|
2534
2532
|
"""
|
2535
2533
|
Get a section between two elements within this region.
|
2536
2534
|
|
2537
2535
|
Args:
|
2538
2536
|
start_element: Element marking the start of the section
|
2539
2537
|
end_element: Element marking the end of the section
|
2540
|
-
|
2538
|
+
include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
|
2541
2539
|
|
2542
2540
|
Returns:
|
2543
2541
|
Region representing the section
|
@@ -2586,15 +2584,15 @@ class Region(
|
|
2586
2584
|
start_element_for_bbox = start_element
|
2587
2585
|
end_element_for_bbox = end_element
|
2588
2586
|
|
2589
|
-
if
|
2587
|
+
if include_boundaries == "none":
|
2590
2588
|
start_idx += 1
|
2591
2589
|
end_idx -= 1
|
2592
2590
|
start_element_for_bbox = elements[start_idx] if start_idx <= end_idx else None
|
2593
2591
|
end_element_for_bbox = elements[end_idx] if start_idx <= end_idx else None
|
2594
|
-
elif
|
2592
|
+
elif include_boundaries == "start":
|
2595
2593
|
end_idx -= 1
|
2596
2594
|
end_element_for_bbox = elements[end_idx] if start_idx <= end_idx else None
|
2597
|
-
elif
|
2595
|
+
elif include_boundaries == "end":
|
2598
2596
|
start_idx += 1
|
2599
2597
|
start_element_for_bbox = elements[start_idx] if start_idx <= end_idx else None
|
2600
2598
|
|
@@ -2627,7 +2625,7 @@ class Region(
|
|
2627
2625
|
return section
|
2628
2626
|
|
2629
2627
|
def get_sections(
|
2630
|
-
self, start_elements=None, end_elements=None,
|
2628
|
+
self, start_elements=None, end_elements=None, include_boundaries="both"
|
2631
2629
|
) -> "ElementCollection[Region]":
|
2632
2630
|
"""
|
2633
2631
|
Get sections within this region based on start/end elements.
|
@@ -2635,12 +2633,12 @@ class Region(
|
|
2635
2633
|
Args:
|
2636
2634
|
start_elements: Elements or selector string that mark the start of sections
|
2637
2635
|
end_elements: Elements or selector string that mark the end of sections
|
2638
|
-
|
2636
|
+
include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
|
2639
2637
|
|
2640
2638
|
Returns:
|
2641
2639
|
List of Region objects representing the extracted sections
|
2642
2640
|
"""
|
2643
|
-
from natural_pdf.elements.
|
2641
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
2644
2642
|
|
2645
2643
|
# Process string selectors to find elements WITHIN THIS REGION
|
2646
2644
|
if isinstance(start_elements, str):
|
@@ -2714,7 +2712,7 @@ class Region(
|
|
2714
2712
|
start_element = current_start_boundary["element"]
|
2715
2713
|
end_element = boundary["element"]
|
2716
2714
|
# Use the helper, ensuring elements are from within the region
|
2717
|
-
section = self.get_section_between(start_element, end_element,
|
2715
|
+
section = self.get_section_between(start_element, end_element, include_boundaries)
|
2718
2716
|
sections.append(section)
|
2719
2717
|
current_start_boundary = None # Reset
|
2720
2718
|
|
@@ -2731,7 +2729,7 @@ class Region(
|
|
2731
2729
|
if end_idx >= 0 and end_idx >= current_start_boundary["index"]:
|
2732
2730
|
end_element = all_elements_in_region[end_idx]
|
2733
2731
|
section = self.get_section_between(
|
2734
|
-
start_element, end_element,
|
2732
|
+
start_element, end_element, include_boundaries
|
2735
2733
|
)
|
2736
2734
|
sections.append(section)
|
2737
2735
|
# Else: Section started and ended by consecutive start elements? Create empty?
|
@@ -2745,7 +2743,7 @@ class Region(
|
|
2745
2743
|
start_element = current_start_boundary["element"]
|
2746
2744
|
# End at the last element within the region
|
2747
2745
|
end_element = all_elements_in_region[-1]
|
2748
|
-
section = self.get_section_between(start_element, end_element,
|
2746
|
+
section = self.get_section_between(start_element, end_element, include_boundaries)
|
2749
2747
|
sections.append(section)
|
2750
2748
|
|
2751
2749
|
return ElementCollection(sections)
|
@@ -3007,46 +3005,23 @@ class Region(
|
|
3007
3005
|
source_info = f" source='{self.source}'" if self.source else ""
|
3008
3006
|
return f"<Region{name_info}{type_info}{source_info} bbox={self.bbox}{poly_info}>"
|
3009
3007
|
|
3010
|
-
def
|
3008
|
+
def update_text(
|
3011
3009
|
self,
|
3012
|
-
|
3013
|
-
|
3014
|
-
""
|
3015
|
-
|
3016
|
-
|
3017
|
-
|
3018
|
-
Finds text elements within this region whose 'source' attribute starts
|
3019
|
-
with 'ocr' and calls the `correction_callback` for each, passing the
|
3020
|
-
element itself.
|
3021
|
-
|
3022
|
-
The `correction_callback` should contain the logic to:
|
3023
|
-
1. Determine if the element needs correction.
|
3024
|
-
2. Perform the correction (e.g., call an LLM).
|
3025
|
-
3. Return the new text (`str`) or `None`.
|
3026
|
-
|
3027
|
-
If the callback returns a string, the element's `.text` is updated.
|
3028
|
-
Metadata updates (source, confidence, etc.) should happen within the callback.
|
3029
|
-
|
3030
|
-
Args:
|
3031
|
-
correction_callback: A function accepting an element and returning
|
3032
|
-
`Optional[str]` (new text or None).
|
3010
|
+
transform: Callable[[Any], Optional[str]],
|
3011
|
+
*,
|
3012
|
+
selector: str = "text",
|
3013
|
+
apply_exclusions: bool = False,
|
3014
|
+
) -> "Region":
|
3015
|
+
"""Apply *transform* to every text element matched by *selector* inside this region.
|
3033
3016
|
|
3034
|
-
|
3035
|
-
|
3017
|
+
The heavy lifting is delegated to :py:meth:`TextMixin.update_text`; this
|
3018
|
+
override simply ensures the search is scoped to the region.
|
3036
3019
|
"""
|
3037
|
-
# Find OCR elements specifically within this region
|
3038
|
-
# Note: We typically want to correct even if the element falls in an excluded area
|
3039
|
-
target_elements = self.find_all(selector="text[source=ocr]", apply_exclusions=False)
|
3040
3020
|
|
3041
|
-
|
3042
|
-
|
3043
|
-
elements=target_elements, # Pass the ElementCollection directly
|
3044
|
-
correction_callback=correction_callback,
|
3045
|
-
caller_info=f"Region({self.bbox})", # Pass caller info
|
3021
|
+
return TextMixin.update_text(
|
3022
|
+
self, transform, selector=selector, apply_exclusions=apply_exclusions
|
3046
3023
|
)
|
3047
3024
|
|
3048
|
-
return self # Return self for chaining
|
3049
|
-
|
3050
3025
|
# --- Classification Mixin Implementation --- #
|
3051
3026
|
def _get_classification_manager(self) -> "ClassificationManager":
|
3052
3027
|
if (
|
@@ -3086,9 +3061,8 @@ class Region(
|
|
3086
3061
|
else default_resolution
|
3087
3062
|
)
|
3088
3063
|
|
3089
|
-
img = self.
|
3064
|
+
img = self.render(
|
3090
3065
|
resolution=resolution,
|
3091
|
-
include_highlights=False, # No highlights for classification input
|
3092
3066
|
crop=True, # Just the region content
|
3093
3067
|
)
|
3094
3068
|
if img is None:
|
@@ -3218,7 +3192,7 @@ class Region(
|
|
3218
3192
|
An ElementCollection containing temporary Region objects for each detected cell,
|
3219
3193
|
or an empty ElementCollection if no cells are found or an error occurs.
|
3220
3194
|
"""
|
3221
|
-
from natural_pdf.elements.
|
3195
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
3222
3196
|
|
3223
3197
|
# 1. Perform the analysis (or use cached results)
|
3224
3198
|
if "text_table_structure" in self.analyses:
|
@@ -3420,13 +3394,15 @@ class Region(
|
|
3420
3394
|
# New helper: build table from pre-computed table_cell regions
|
3421
3395
|
# ------------------------------------------------------------------
|
3422
3396
|
|
3423
|
-
def _extract_table_from_cells(
|
3397
|
+
def _extract_table_from_cells(
|
3398
|
+
self, cell_regions: List["Region"], content_filter=None
|
3399
|
+
) -> List[List[Optional[str]]]:
|
3424
3400
|
"""Construct a table (list-of-lists) from table_cell regions.
|
3425
3401
|
|
3426
3402
|
This assumes each cell Region has metadata.row_index / col_index as written by
|
3427
3403
|
detect_table_structure_from_lines(). If these keys are missing we will
|
3428
3404
|
fall back to sorting by geometry.
|
3429
|
-
|
3405
|
+
|
3430
3406
|
Args:
|
3431
3407
|
cell_regions: List of table cell Region objects to extract text from
|
3432
3408
|
content_filter: Optional content filter to apply to cell text extraction
|
@@ -3460,7 +3436,9 @@ class Region(
|
|
3460
3436
|
try:
|
3461
3437
|
r_idx = int(cell.metadata.get("row_index"))
|
3462
3438
|
c_idx = int(cell.metadata.get("col_index"))
|
3463
|
-
text_val = cell.extract_text(
|
3439
|
+
text_val = cell.extract_text(
|
3440
|
+
layout=False, apply_exclusions=False, content_filter=content_filter
|
3441
|
+
).strip()
|
3464
3442
|
table_grid[r_idx][c_idx] = text_val if text_val else None
|
3465
3443
|
except Exception as _err:
|
3466
3444
|
# Skip problematic cell
|
@@ -3507,7 +3485,9 @@ class Region(
|
|
3507
3485
|
row_idx = int(np.argmin([abs(cy - rc) for rc in row_centers]))
|
3508
3486
|
col_idx = int(np.argmin([abs(cx - cc) for cc in col_centers]))
|
3509
3487
|
|
3510
|
-
text_val = cell.extract_text(
|
3488
|
+
text_val = cell.extract_text(
|
3489
|
+
layout=False, apply_exclusions=False, content_filter=content_filter
|
3490
|
+
).strip()
|
3511
3491
|
table_grid[row_idx][col_idx] = text_val if text_val else None
|
3512
3492
|
|
3513
3493
|
return table_grid
|
@@ -3515,32 +3495,33 @@ class Region(
|
|
3515
3495
|
def _apply_rtl_processing_to_text(self, text: str) -> str:
|
3516
3496
|
"""
|
3517
3497
|
Apply RTL (Right-to-Left) text processing to a string.
|
3518
|
-
|
3498
|
+
|
3519
3499
|
This converts visual order text (as stored in PDFs) to logical order
|
3520
3500
|
for proper display of Arabic, Hebrew, and other RTL scripts.
|
3521
|
-
|
3501
|
+
|
3522
3502
|
Args:
|
3523
3503
|
text: Input text string in visual order
|
3524
|
-
|
3504
|
+
|
3525
3505
|
Returns:
|
3526
3506
|
Text string in logical order
|
3527
3507
|
"""
|
3528
3508
|
if not text or not text.strip():
|
3529
3509
|
return text
|
3530
|
-
|
3510
|
+
|
3531
3511
|
# Quick check for RTL characters - if none found, return as-is
|
3532
3512
|
import unicodedata
|
3533
|
-
|
3513
|
+
|
3534
3514
|
def _contains_rtl(s):
|
3535
3515
|
return any(unicodedata.bidirectional(ch) in ("R", "AL", "AN") for ch in s)
|
3536
|
-
|
3516
|
+
|
3537
3517
|
if not _contains_rtl(text):
|
3538
3518
|
return text
|
3539
|
-
|
3519
|
+
|
3540
3520
|
try:
|
3541
3521
|
from bidi.algorithm import get_display # type: ignore
|
3522
|
+
|
3542
3523
|
from natural_pdf.utils.bidi_mirror import mirror_brackets
|
3543
|
-
|
3524
|
+
|
3544
3525
|
# Apply BiDi algorithm to convert from visual to logical order
|
3545
3526
|
# Process line by line to handle mixed content properly
|
3546
3527
|
processed_lines = []
|
@@ -3553,9 +3534,9 @@ class Region(
|
|
3553
3534
|
processed_lines.append(mirror_brackets(logical_line))
|
3554
3535
|
else:
|
3555
3536
|
processed_lines.append(line)
|
3556
|
-
|
3537
|
+
|
3557
3538
|
return "\n".join(processed_lines)
|
3558
|
-
|
3539
|
+
|
3559
3540
|
except (ImportError, Exception):
|
3560
3541
|
# If bidi library is not available or fails, return original text
|
3561
3542
|
return text
|
@@ -3563,36 +3544,36 @@ class Region(
|
|
3563
3544
|
def _apply_content_filter_to_text(self, text: str, content_filter) -> str:
|
3564
3545
|
"""
|
3565
3546
|
Apply content filter to a text string.
|
3566
|
-
|
3547
|
+
|
3567
3548
|
Args:
|
3568
3549
|
text: Input text string
|
3569
3550
|
content_filter: Content filter (regex, callable, or list of regexes)
|
3570
|
-
|
3551
|
+
|
3571
3552
|
Returns:
|
3572
3553
|
Filtered text string
|
3573
3554
|
"""
|
3574
3555
|
if not text or content_filter is None:
|
3575
3556
|
return text
|
3576
|
-
|
3557
|
+
|
3577
3558
|
import re
|
3578
|
-
|
3559
|
+
|
3579
3560
|
if isinstance(content_filter, str):
|
3580
3561
|
# Single regex pattern - remove matching parts
|
3581
3562
|
try:
|
3582
|
-
return re.sub(content_filter,
|
3563
|
+
return re.sub(content_filter, "", text)
|
3583
3564
|
except re.error:
|
3584
3565
|
return text # Invalid regex, return original
|
3585
|
-
|
3566
|
+
|
3586
3567
|
elif isinstance(content_filter, list):
|
3587
3568
|
# List of regex patterns - remove parts matching ANY pattern
|
3588
3569
|
try:
|
3589
3570
|
result = text
|
3590
3571
|
for pattern in content_filter:
|
3591
|
-
result = re.sub(pattern,
|
3572
|
+
result = re.sub(pattern, "", result)
|
3592
3573
|
return result
|
3593
3574
|
except re.error:
|
3594
3575
|
return text # Invalid regex, return original
|
3595
|
-
|
3576
|
+
|
3596
3577
|
elif callable(content_filter):
|
3597
3578
|
# Callable filter - apply to individual characters
|
3598
3579
|
try:
|
@@ -3600,8 +3581,152 @@ class Region(
|
|
3600
3581
|
for char in text:
|
3601
3582
|
if content_filter(char):
|
3602
3583
|
filtered_chars.append(char)
|
3603
|
-
return
|
3584
|
+
return "".join(filtered_chars)
|
3604
3585
|
except Exception:
|
3605
3586
|
return text # Function error, return original
|
3606
|
-
|
3587
|
+
|
3607
3588
|
return text
|
3589
|
+
|
3590
|
+
# ------------------------------------------------------------------
|
3591
|
+
# Interactive Viewer Support
|
3592
|
+
# ------------------------------------------------------------------
|
3593
|
+
|
3594
|
+
def viewer(
|
3595
|
+
self,
|
3596
|
+
*,
|
3597
|
+
resolution: int = 150,
|
3598
|
+
include_chars: bool = False,
|
3599
|
+
include_attributes: Optional[List[str]] = None,
|
3600
|
+
) -> Optional["InteractiveViewerWidget"]:
|
3601
|
+
"""Create an interactive ipywidget viewer for **this specific region**.
|
3602
|
+
|
3603
|
+
The method renders the region to an image (cropped to the region bounds) and
|
3604
|
+
overlays all elements that intersect the region (optionally excluding noisy
|
3605
|
+
character-level elements). The resulting widget offers the same zoom / pan
|
3606
|
+
experience as :py:meth:`Page.viewer` but scoped to the region.
|
3607
|
+
|
3608
|
+
Parameters
|
3609
|
+
----------
|
3610
|
+
resolution : int, default 150
|
3611
|
+
Rendering resolution (DPI). This should match the value used by the
|
3612
|
+
page-level viewer so element scaling is accurate.
|
3613
|
+
include_chars : bool, default False
|
3614
|
+
Whether to include individual *char* elements in the overlay. These
|
3615
|
+
are often too dense for a meaningful visualisation so are skipped by
|
3616
|
+
default.
|
3617
|
+
include_attributes : list[str], optional
|
3618
|
+
Additional element attributes to expose in the info panel (on top of
|
3619
|
+
the default set used by the page viewer).
|
3620
|
+
|
3621
|
+
Returns
|
3622
|
+
-------
|
3623
|
+
InteractiveViewerWidget | None
|
3624
|
+
The widget instance, or ``None`` if *ipywidgets* is not installed or
|
3625
|
+
an error occurred during creation.
|
3626
|
+
"""
|
3627
|
+
|
3628
|
+
# ------------------------------------------------------------------
|
3629
|
+
# Dependency / environment checks
|
3630
|
+
# ------------------------------------------------------------------
|
3631
|
+
if not _IPYWIDGETS_AVAILABLE or InteractiveViewerWidget is None:
|
3632
|
+
logger.error(
|
3633
|
+
"Interactive viewer requires 'ipywidgets'. "
|
3634
|
+
'Please install with: pip install "ipywidgets>=7.0.0,<10.0.0"'
|
3635
|
+
)
|
3636
|
+
return None
|
3637
|
+
|
3638
|
+
try:
|
3639
|
+
# ------------------------------------------------------------------
|
3640
|
+
# Render region image (cropped) and encode as data URI
|
3641
|
+
# ------------------------------------------------------------------
|
3642
|
+
import base64
|
3643
|
+
from io import BytesIO
|
3644
|
+
|
3645
|
+
# Use unified render() with crop=True to obtain just the region
|
3646
|
+
img = self.render(resolution=resolution, crop=True)
|
3647
|
+
if img is None:
|
3648
|
+
logger.error(f"Failed to render image for region {self.bbox} viewer.")
|
3649
|
+
return None
|
3650
|
+
|
3651
|
+
buf = BytesIO()
|
3652
|
+
img.save(buf, format="PNG")
|
3653
|
+
img_str = base64.b64encode(buf.getvalue()).decode()
|
3654
|
+
image_uri = f"data:image/png;base64,{img_str}"
|
3655
|
+
|
3656
|
+
# ------------------------------------------------------------------
|
3657
|
+
# Prepare element overlay data (coordinates relative to region)
|
3658
|
+
# ------------------------------------------------------------------
|
3659
|
+
scale = resolution / 72.0 # Same convention as page viewer
|
3660
|
+
|
3661
|
+
# Gather elements intersecting the region
|
3662
|
+
region_elements = self.get_elements(apply_exclusions=False)
|
3663
|
+
|
3664
|
+
# Optionally filter out chars
|
3665
|
+
if not include_chars:
|
3666
|
+
region_elements = [
|
3667
|
+
el for el in region_elements if str(getattr(el, "type", "")).lower() != "char"
|
3668
|
+
]
|
3669
|
+
|
3670
|
+
default_attrs = [
|
3671
|
+
"text",
|
3672
|
+
"fontname",
|
3673
|
+
"size",
|
3674
|
+
"bold",
|
3675
|
+
"italic",
|
3676
|
+
"color",
|
3677
|
+
"linewidth",
|
3678
|
+
"is_horizontal",
|
3679
|
+
"is_vertical",
|
3680
|
+
"source",
|
3681
|
+
"confidence",
|
3682
|
+
"label",
|
3683
|
+
"model",
|
3684
|
+
"upright",
|
3685
|
+
"direction",
|
3686
|
+
]
|
3687
|
+
|
3688
|
+
if include_attributes:
|
3689
|
+
default_attrs.extend([a for a in include_attributes if a not in default_attrs])
|
3690
|
+
|
3691
|
+
elements_json: List[dict] = []
|
3692
|
+
for idx, el in enumerate(region_elements):
|
3693
|
+
try:
|
3694
|
+
# Calculate coordinates relative to region bbox and apply scale
|
3695
|
+
x0 = (el.x0 - self.x0) * scale
|
3696
|
+
y0 = (el.top - self.top) * scale
|
3697
|
+
x1 = (el.x1 - self.x0) * scale
|
3698
|
+
y1 = (el.bottom - self.top) * scale
|
3699
|
+
|
3700
|
+
elem_dict = {
|
3701
|
+
"id": idx,
|
3702
|
+
"type": getattr(el, "type", "unknown"),
|
3703
|
+
"x0": round(x0, 2),
|
3704
|
+
"y0": round(y0, 2),
|
3705
|
+
"x1": round(x1, 2),
|
3706
|
+
"y1": round(y1, 2),
|
3707
|
+
"width": round(x1 - x0, 2),
|
3708
|
+
"height": round(y1 - y0, 2),
|
3709
|
+
}
|
3710
|
+
|
3711
|
+
# Add requested / default attributes
|
3712
|
+
for attr_name in default_attrs:
|
3713
|
+
if hasattr(el, attr_name):
|
3714
|
+
val = getattr(el, attr_name)
|
3715
|
+
# Ensure JSON serialisable
|
3716
|
+
if not isinstance(val, (str, int, float, bool, list, dict, type(None))):
|
3717
|
+
val = str(val)
|
3718
|
+
elem_dict[attr_name] = val
|
3719
|
+
elements_json.append(elem_dict)
|
3720
|
+
except Exception as e:
|
3721
|
+
logger.warning(f"Error preparing element {idx} for region viewer: {e}")
|
3722
|
+
|
3723
|
+
viewer_data = {"page_image": image_uri, "elements": elements_json}
|
3724
|
+
|
3725
|
+
# ------------------------------------------------------------------
|
3726
|
+
# Instantiate the widget directly using the prepared data
|
3727
|
+
# ------------------------------------------------------------------
|
3728
|
+
return InteractiveViewerWidget(pdf_data=viewer_data)
|
3729
|
+
|
3730
|
+
except Exception as e:
|
3731
|
+
logger.error(f"Error creating viewer for region {self.bbox}: {e}", exc_info=True)
|
3732
|
+
return None
|