natural-pdf 0.1.28__py3-none-any.whl → 0.1.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bad_pdf_analysis/analyze_10_more.py +300 -0
- bad_pdf_analysis/analyze_final_10.py +552 -0
- bad_pdf_analysis/analyze_specific_pages.py +394 -0
- bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +44 -0
- natural_pdf/analyzers/layout/surya.py +1 -1
- natural_pdf/analyzers/shape_detection_mixin.py +228 -0
- natural_pdf/classification/manager.py +67 -0
- natural_pdf/core/element_manager.py +556 -25
- natural_pdf/core/highlighting_service.py +98 -43
- natural_pdf/core/page.py +86 -20
- natural_pdf/core/pdf.py +0 -2
- natural_pdf/describe/base.py +40 -9
- natural_pdf/describe/elements.py +11 -6
- natural_pdf/elements/base.py +134 -20
- natural_pdf/elements/collections.py +43 -11
- natural_pdf/elements/image.py +43 -0
- natural_pdf/elements/region.py +64 -19
- natural_pdf/elements/text.py +89 -11
- natural_pdf/flows/collections.py +4 -4
- natural_pdf/flows/region.py +17 -2
- natural_pdf/ocr/ocr_manager.py +50 -0
- natural_pdf/selectors/parser.py +27 -7
- natural_pdf/tables/__init__.py +5 -0
- natural_pdf/tables/result.py +101 -0
- natural_pdf/utils/bidi_mirror.py +36 -0
- natural_pdf/utils/visualization.py +15 -1
- {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/METADATA +2 -1
- {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/RECORD +48 -26
- natural_pdf-0.1.30.dist-info/top_level.txt +6 -0
- optimization/memory_comparison.py +172 -0
- optimization/pdf_analyzer.py +410 -0
- optimization/performance_analysis.py +397 -0
- optimization/test_cleanup_methods.py +155 -0
- optimization/test_memory_fix.py +162 -0
- tools/bad_pdf_eval/__init__.py +1 -0
- tools/bad_pdf_eval/analyser.py +302 -0
- tools/bad_pdf_eval/collate_summaries.py +130 -0
- tools/bad_pdf_eval/eval_suite.py +116 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
- tools/bad_pdf_eval/llm_enrich.py +273 -0
- tools/bad_pdf_eval/reporter.py +17 -0
- tools/bad_pdf_eval/utils.py +127 -0
- tools/rtl_smoke_test.py +80 -0
- natural_pdf-0.1.28.dist-info/top_level.txt +0 -2
- {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/licenses/LICENSE +0 -0
@@ -70,16 +70,16 @@ class HighlightRenderer:
|
|
70
70
|
page: Page,
|
71
71
|
base_image: Image.Image,
|
72
72
|
highlights: List[Highlight],
|
73
|
-
|
73
|
+
scale_factor: float,
|
74
74
|
render_ocr: bool,
|
75
75
|
):
|
76
76
|
self.page = page # Keep page reference for OCR rendering
|
77
77
|
self.base_image = base_image.convert("RGBA") # Ensure RGBA
|
78
78
|
self.highlights = highlights
|
79
|
-
self.
|
79
|
+
self.scale_factor = scale_factor # Renamed from scale to scale_factor for clarity
|
80
80
|
self.render_ocr = render_ocr
|
81
81
|
self.result_image = self.base_image.copy()
|
82
|
-
self.vertex_size = max(3, int(2 * self.
|
82
|
+
self.vertex_size = max(3, int(2 * self.scale_factor)) # Size of corner markers
|
83
83
|
|
84
84
|
def render(self) -> Image.Image:
|
85
85
|
"""Executes the rendering process."""
|
@@ -98,7 +98,7 @@ class HighlightRenderer:
|
|
98
98
|
scaled_bbox = None
|
99
99
|
|
100
100
|
if highlight.is_polygon:
|
101
|
-
scaled_polygon = [(p[0] * self.
|
101
|
+
scaled_polygon = [(p[0] * self.scale_factor, p[1] * self.scale_factor) for p in highlight.polygon]
|
102
102
|
# Draw polygon fill and border
|
103
103
|
draw.polygon(
|
104
104
|
scaled_polygon, fill=highlight.color, outline=highlight.border_color, width=2
|
@@ -113,10 +113,10 @@ class HighlightRenderer:
|
|
113
113
|
else: # Rectangle
|
114
114
|
x0, top, x1, bottom = highlight.bbox
|
115
115
|
x0_s, top_s, x1_s, bottom_s = (
|
116
|
-
x0 * self.
|
117
|
-
top * self.
|
118
|
-
x1 * self.
|
119
|
-
bottom * self.
|
116
|
+
x0 * self.scale_factor,
|
117
|
+
top * self.scale_factor,
|
118
|
+
x1 * self.scale_factor,
|
119
|
+
bottom * self.scale_factor,
|
120
120
|
)
|
121
121
|
scaled_bbox = [x0_s, top_s, x1_s, bottom_s]
|
122
122
|
# Draw rectangle fill and border
|
@@ -159,15 +159,15 @@ class HighlightRenderer:
|
|
159
159
|
"""Draws attribute key-value pairs on the highlight."""
|
160
160
|
try:
|
161
161
|
# Slightly larger font, scaled
|
162
|
-
font_size = max(10, int(8 * self.
|
162
|
+
font_size = max(10, int(8 * self.scale_factor))
|
163
163
|
# Prioritize monospace fonts for better alignment
|
164
164
|
font = ImageFont.truetype("Arial.ttf", font_size) # Fallback sans-serif
|
165
165
|
except IOError:
|
166
166
|
font = ImageFont.load_default()
|
167
167
|
font_size = 10 # Reset size for default font
|
168
168
|
|
169
|
-
line_height = font_size + int(4 * self.
|
170
|
-
bg_padding = int(3 * self.
|
169
|
+
line_height = font_size + int(4 * self.scale_factor) # Scaled line spacing
|
170
|
+
bg_padding = int(3 * self.scale_factor)
|
171
171
|
max_width = 0
|
172
172
|
text_lines = []
|
173
173
|
|
@@ -191,8 +191,8 @@ class HighlightRenderer:
|
|
191
191
|
total_height = line_height * len(text_lines)
|
192
192
|
|
193
193
|
# Position near top-right corner with padding
|
194
|
-
x = bbox_scaled[2] - int(2 * self.
|
195
|
-
y = bbox_scaled[1] + int(2 * self.
|
194
|
+
x = bbox_scaled[2] - int(2 * self.scale_factor) - max_width
|
195
|
+
y = bbox_scaled[1] + int(2 * self.scale_factor)
|
196
196
|
|
197
197
|
# Draw background rectangle (semi-transparent white)
|
198
198
|
bg_x0 = x - bg_padding
|
@@ -244,10 +244,10 @@ class HighlightRenderer:
|
|
244
244
|
for element in ocr_elements:
|
245
245
|
x0, top, x1, bottom = element.bbox
|
246
246
|
x0_s, top_s, x1_s, bottom_s = (
|
247
|
-
x0 * self.
|
248
|
-
top * self.
|
249
|
-
x1 * self.
|
250
|
-
bottom * self.
|
247
|
+
x0 * self.scale_factor,
|
248
|
+
top * self.scale_factor,
|
249
|
+
x1 * self.scale_factor,
|
250
|
+
bottom * self.scale_factor,
|
251
251
|
)
|
252
252
|
box_w, box_h = x1_s - x0_s, bottom_s - top_s
|
253
253
|
|
@@ -556,19 +556,62 @@ class HighlightingService:
|
|
556
556
|
self._highlights_by_page[page_index].append(highlight)
|
557
557
|
logger.debug(f"Added highlight to page {page_index}: {highlight}")
|
558
558
|
|
559
|
+
# --- Invalidate page-level image cache --------------------------------
|
560
|
+
# The Page.to_image method maintains an internal cache keyed by rendering
|
561
|
+
# parameters. Because the cache key currently does **not** incorporate
|
562
|
+
# any information about the highlights themselves, it can return stale
|
563
|
+
# images after highlights are added or removed. To ensure the next
|
564
|
+
# render reflects the new highlights, we clear the cache for the
|
565
|
+
# affected page here.
|
566
|
+
try:
|
567
|
+
page_obj = self._pdf[page_index]
|
568
|
+
if hasattr(page_obj, "_to_image_cache"):
|
569
|
+
page_obj._to_image_cache.clear()
|
570
|
+
logger.debug(
|
571
|
+
f"Cleared cached to_image renders for page {page_index} after adding a highlight."
|
572
|
+
)
|
573
|
+
except Exception as cache_err: # pragma: no cover – never fail highlight creation
|
574
|
+
logger.warning(
|
575
|
+
f"Failed to invalidate to_image cache for page {page_index}: {cache_err}",
|
576
|
+
exc_info=True,
|
577
|
+
)
|
578
|
+
|
559
579
|
def clear_all(self):
|
560
580
|
"""Clears all highlights from all pages and resets the color manager."""
|
561
581
|
self._highlights_by_page = {}
|
562
582
|
self._color_manager.reset()
|
563
583
|
logger.info("Cleared all highlights and reset ColorManager.")
|
564
584
|
|
585
|
+
# Clear cached images for *all* pages because their visual state may
|
586
|
+
# depend on highlight visibility.
|
587
|
+
for idx, page in enumerate(self._pdf.pages):
|
588
|
+
try:
|
589
|
+
if hasattr(page, "_to_image_cache"):
|
590
|
+
page._to_image_cache.clear()
|
591
|
+
except Exception:
|
592
|
+
# Non-critical – keep going for remaining pages
|
593
|
+
continue
|
594
|
+
|
565
595
|
def clear_page(self, page_index: int):
|
566
596
|
"""Clears all highlights from a specific page."""
|
567
597
|
if page_index in self._highlights_by_page:
|
568
598
|
del self._highlights_by_page[page_index]
|
569
599
|
logger.debug(f"Cleared highlights for page {page_index}.")
|
570
|
-
|
571
|
-
|
600
|
+
|
601
|
+
# Also clear any cached rendered images for this page so the next render
|
602
|
+
# reflects the removal of highlights.
|
603
|
+
try:
|
604
|
+
page_obj = self._pdf[page_index]
|
605
|
+
if hasattr(page_obj, "_to_image_cache"):
|
606
|
+
page_obj._to_image_cache.clear()
|
607
|
+
logger.debug(
|
608
|
+
f"Cleared cached to_image renders for page {page_index} after removing highlights."
|
609
|
+
)
|
610
|
+
except Exception as cache_err: # pragma: no cover
|
611
|
+
logger.warning(
|
612
|
+
f"Failed to invalidate to_image cache for page {page_index}: {cache_err}",
|
613
|
+
exc_info=True,
|
614
|
+
)
|
572
615
|
|
573
616
|
def get_highlights_for_page(self, page_index: int) -> List[Highlight]:
|
574
617
|
"""Returns a list of Highlight objects for a specific page."""
|
@@ -581,11 +624,10 @@ class HighlightingService:
|
|
581
624
|
def render_page(
|
582
625
|
self,
|
583
626
|
page_index: int,
|
584
|
-
|
627
|
+
resolution: float = 144,
|
585
628
|
labels: bool = True,
|
586
629
|
legend_position: str = "right",
|
587
630
|
render_ocr: bool = False,
|
588
|
-
resolution: Optional[float] = None,
|
589
631
|
**kwargs, # Pass other args to pdfplumber.page.to_image if needed
|
590
632
|
) -> Optional[Image.Image]:
|
591
633
|
"""
|
@@ -594,12 +636,11 @@ class HighlightingService:
|
|
594
636
|
|
595
637
|
Args:
|
596
638
|
page_index: The 0-based index of the page to render.
|
597
|
-
|
639
|
+
resolution: Resolution (DPI) for the base page image if width/height not in kwargs.
|
640
|
+
Defaults to 144 DPI (equivalent to previous scale=2.0).
|
598
641
|
labels: Whether to include a legend for highlights.
|
599
642
|
legend_position: Position of the legend.
|
600
643
|
render_ocr: Whether to render OCR text on the image.
|
601
|
-
resolution: Optional resolution (DPI) for the base page image if width/height not in kwargs.
|
602
|
-
Defaults to scale * 72 if not otherwise specified.
|
603
644
|
kwargs: Additional keyword arguments for pdfplumber's page.to_image (e.g., width, height).
|
604
645
|
|
605
646
|
Returns:
|
@@ -625,13 +666,16 @@ class HighlightingService:
|
|
625
666
|
logger.debug(f"Rendering page {page_index} with height={to_image_args['height']}.")
|
626
667
|
# Actual scale will be calculated after image creation
|
627
668
|
else:
|
628
|
-
# Use explicit resolution
|
629
|
-
|
630
|
-
|
631
|
-
|
669
|
+
# Use explicit resolution if provided via kwargs, otherwise fallback to the
|
670
|
+
# `resolution` parameter (which might be None). If we still end up with
|
671
|
+
# `None`, default to 144 DPI to avoid downstream errors.
|
672
|
+
render_resolution = to_image_args.pop("resolution", resolution)
|
632
673
|
if render_resolution is None:
|
633
|
-
render_resolution =
|
634
|
-
|
674
|
+
render_resolution = 144
|
675
|
+
|
676
|
+
# Reinstate into kwargs for pdfplumber
|
677
|
+
to_image_args["resolution"] = render_resolution
|
678
|
+
|
635
679
|
actual_scale_x = render_resolution / 72.0
|
636
680
|
actual_scale_y = render_resolution / 72.0
|
637
681
|
logger.debug(
|
@@ -657,11 +701,11 @@ class HighlightingService:
|
|
657
701
|
if page_obj.width > 0:
|
658
702
|
actual_scale_x = base_image_pil.width / page_obj.width
|
659
703
|
else:
|
660
|
-
actual_scale_x =
|
704
|
+
actual_scale_x = resolution / 72.0 # Fallback to resolution-based scale
|
661
705
|
if page_obj.height > 0:
|
662
706
|
actual_scale_y = base_image_pil.height / page_obj.height
|
663
707
|
else:
|
664
|
-
actual_scale_y =
|
708
|
+
actual_scale_y = resolution / 72.0 # Fallback to resolution-based scale
|
665
709
|
logger.debug(
|
666
710
|
f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f}"
|
667
711
|
)
|
@@ -682,14 +726,20 @@ class HighlightingService:
|
|
682
726
|
page=page_obj,
|
683
727
|
base_image=base_image_pil,
|
684
728
|
highlights=highlights_on_page,
|
685
|
-
|
729
|
+
scale_factor=renderer_scale, # Use the determined actual scale
|
686
730
|
render_ocr=render_ocr,
|
687
731
|
)
|
688
732
|
rendered_image = renderer.render()
|
689
733
|
else:
|
690
734
|
if render_ocr:
|
691
735
|
# Still render OCR even if no highlights, using the determined actual scale
|
692
|
-
renderer = HighlightRenderer(
|
736
|
+
renderer = HighlightRenderer(
|
737
|
+
page=page_obj,
|
738
|
+
base_image=base_image_pil,
|
739
|
+
highlights=[],
|
740
|
+
scale_factor=renderer_scale,
|
741
|
+
render_ocr=True,
|
742
|
+
)
|
693
743
|
rendered_image = renderer.render()
|
694
744
|
else:
|
695
745
|
rendered_image = base_image_pil # No highlights, no OCR requested
|
@@ -722,11 +772,10 @@ class HighlightingService:
|
|
722
772
|
self,
|
723
773
|
page_index: int,
|
724
774
|
temporary_highlights: List[Dict],
|
725
|
-
|
775
|
+
resolution: float = 144,
|
726
776
|
labels: bool = True,
|
727
777
|
legend_position: str = "right",
|
728
778
|
render_ocr: bool = False,
|
729
|
-
resolution: Optional[float] = None,
|
730
779
|
crop_bbox: Optional[Tuple[float, float, float, float]] = None,
|
731
780
|
**kwargs,
|
732
781
|
) -> Optional[Image.Image]:
|
@@ -737,11 +786,11 @@ class HighlightingService:
|
|
737
786
|
Args:
|
738
787
|
page_index: Index of the page to render.
|
739
788
|
temporary_highlights: List of highlight data dicts (from ElementCollection._prepare).
|
740
|
-
|
789
|
+
resolution: Resolution (DPI) for base page image rendering if width/height not used.
|
790
|
+
Defaults to 144 DPI (equivalent to previous scale=2.0).
|
741
791
|
labels: Whether to include a legend.
|
742
792
|
legend_position: Position of the legend.
|
743
793
|
render_ocr: Whether to render OCR text.
|
744
|
-
resolution: Resolution for base page image rendering if width/height not used.
|
745
794
|
crop_bbox: Optional bounding box (x0, top, x1, bottom) in PDF coordinate
|
746
795
|
space to crop the output image to, before legends or other overlays are
|
747
796
|
applied. If None, no cropping is performed.
|
@@ -777,9 +826,11 @@ class HighlightingService:
|
|
777
826
|
# Resolution is implicitly handled by pdfplumber when height is set
|
778
827
|
# after image is created, we will calculate actual_scale_x and actual_scale_y
|
779
828
|
else:
|
780
|
-
# Neither width nor height is provided,
|
781
|
-
|
829
|
+
# Neither width nor height is provided, rely on `resolution`.
|
830
|
+
# If `resolution` was explicitly passed as `None`, fall back to 144 DPI.
|
831
|
+
render_resolution = 144 if resolution is None else resolution
|
782
832
|
to_image_args["resolution"] = render_resolution
|
833
|
+
|
783
834
|
actual_scale_x = render_resolution / 72.0
|
784
835
|
actual_scale_y = render_resolution / 72.0
|
785
836
|
logger.debug(
|
@@ -804,11 +855,11 @@ class HighlightingService:
|
|
804
855
|
if page_obj.width > 0:
|
805
856
|
actual_scale_x = base_image_pil.width / page_obj.width
|
806
857
|
else:
|
807
|
-
actual_scale_x =
|
858
|
+
actual_scale_x = resolution / 72.0 # Fallback to resolution-based scale
|
808
859
|
if page_obj.height > 0:
|
809
860
|
actual_scale_y = base_image_pil.height / page_obj.height
|
810
861
|
else:
|
811
|
-
actual_scale_y =
|
862
|
+
actual_scale_y = resolution / 72.0 # Fallback to resolution-based scale
|
812
863
|
logger.debug(
|
813
864
|
f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f} from image size {base_image_pil.size} and page size ({page_obj.width}, {page_obj.height})"
|
814
865
|
)
|
@@ -855,7 +906,11 @@ class HighlightingService:
|
|
855
906
|
renderer_scale = actual_scale_x
|
856
907
|
|
857
908
|
renderer = HighlightRenderer(
|
858
|
-
page_obj,
|
909
|
+
page=page_obj,
|
910
|
+
base_image=base_image_pil,
|
911
|
+
highlights=preview_highlights,
|
912
|
+
scale_factor=renderer_scale,
|
913
|
+
render_ocr=render_ocr,
|
859
914
|
)
|
860
915
|
rendered_image = renderer.render()
|
861
916
|
|
natural_pdf/core/page.py
CHANGED
@@ -867,6 +867,28 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
867
867
|
>>> page.region(right=200, width=50) # Region from x=150 to x=200
|
868
868
|
>>> page.region(top=100, bottom=200, width="full") # Explicit full width
|
869
869
|
"""
|
870
|
+
# ------------------------------------------------------------------
|
871
|
+
# Percentage support – convert strings like "30%" to absolute values
|
872
|
+
# based on page dimensions. X-axis params (left, right, width) use
|
873
|
+
# page.width; Y-axis params (top, bottom, height) use page.height.
|
874
|
+
# ------------------------------------------------------------------
|
875
|
+
|
876
|
+
def _pct_to_abs(val, axis: str):
|
877
|
+
if isinstance(val, str) and val.strip().endswith("%"):
|
878
|
+
try:
|
879
|
+
pct = float(val.strip()[:-1]) / 100.0
|
880
|
+
except ValueError:
|
881
|
+
return val # leave unchanged if not a number
|
882
|
+
return pct * (self.width if axis == "x" else self.height)
|
883
|
+
return val
|
884
|
+
|
885
|
+
left = _pct_to_abs(left, "x")
|
886
|
+
right = _pct_to_abs(right, "x")
|
887
|
+
width = _pct_to_abs(width, "x")
|
888
|
+
top = _pct_to_abs(top, "y")
|
889
|
+
bottom = _pct_to_abs(bottom, "y")
|
890
|
+
height = _pct_to_abs(height, "y")
|
891
|
+
|
870
892
|
# --- Type checking and basic validation ---
|
871
893
|
is_width_numeric = isinstance(width, (int, float))
|
872
894
|
is_width_string = isinstance(width, str)
|
@@ -1137,6 +1159,40 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1137
1159
|
user_kwargs=kwargs, # Pass original user kwargs
|
1138
1160
|
)
|
1139
1161
|
|
1162
|
+
# --- Optional: apply Unicode BiDi algorithm for mixed RTL/LTR correctness ---
|
1163
|
+
apply_bidi = kwargs.get("bidi", True)
|
1164
|
+
if apply_bidi and result:
|
1165
|
+
# Quick check for any RTL character
|
1166
|
+
import unicodedata
|
1167
|
+
|
1168
|
+
def _contains_rtl(s):
|
1169
|
+
return any(unicodedata.bidirectional(ch) in ("R", "AL", "AN") for ch in s)
|
1170
|
+
|
1171
|
+
if _contains_rtl(result):
|
1172
|
+
try:
|
1173
|
+
from bidi.algorithm import get_display # type: ignore
|
1174
|
+
from natural_pdf.utils.bidi_mirror import mirror_brackets
|
1175
|
+
|
1176
|
+
result = "\n".join(
|
1177
|
+
mirror_brackets(
|
1178
|
+
get_display(
|
1179
|
+
line,
|
1180
|
+
base_dir=(
|
1181
|
+
"R"
|
1182
|
+
if any(
|
1183
|
+
unicodedata.bidirectional(ch)
|
1184
|
+
in ("R", "AL", "AN")
|
1185
|
+
for ch in line
|
1186
|
+
)
|
1187
|
+
else "L"
|
1188
|
+
),
|
1189
|
+
)
|
1190
|
+
)
|
1191
|
+
for line in result.split("\n")
|
1192
|
+
)
|
1193
|
+
except ModuleNotFoundError:
|
1194
|
+
pass # silently skip if python-bidi not available
|
1195
|
+
|
1140
1196
|
logger.debug(f"Page {self.number}: extract_text finished, result length: {len(result)}.")
|
1141
1197
|
return result
|
1142
1198
|
|
@@ -1440,7 +1496,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1440
1496
|
|
1441
1497
|
def show(
|
1442
1498
|
self,
|
1443
|
-
|
1499
|
+
resolution: float = 144,
|
1444
1500
|
width: Optional[int] = None,
|
1445
1501
|
labels: bool = True,
|
1446
1502
|
legend_position: str = "right",
|
@@ -1450,7 +1506,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1450
1506
|
Generates and returns an image of the page with persistent highlights rendered.
|
1451
1507
|
|
1452
1508
|
Args:
|
1453
|
-
|
1509
|
+
resolution: Resolution in DPI for rendering (default: 144 DPI, equivalent to previous scale=2.0).
|
1454
1510
|
width: Optional width for the output image.
|
1455
1511
|
labels: Whether to include a legend for labels.
|
1456
1512
|
legend_position: Position of the legend.
|
@@ -1460,7 +1516,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1460
1516
|
PIL Image object of the page with highlights, or None if rendering fails.
|
1461
1517
|
"""
|
1462
1518
|
return self.to_image(
|
1463
|
-
|
1519
|
+
resolution=resolution,
|
1464
1520
|
width=width,
|
1465
1521
|
labels=labels,
|
1466
1522
|
legend_position=legend_position,
|
@@ -1471,13 +1527,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1471
1527
|
def save_image(
|
1472
1528
|
self,
|
1473
1529
|
filename: str,
|
1474
|
-
scale: float = 2.0,
|
1475
1530
|
width: Optional[int] = None,
|
1476
1531
|
labels: bool = True,
|
1477
1532
|
legend_position: str = "right",
|
1478
1533
|
render_ocr: bool = False,
|
1479
1534
|
include_highlights: bool = True, # Allow saving without highlights
|
1480
|
-
resolution:
|
1535
|
+
resolution: float = 144,
|
1481
1536
|
**kwargs,
|
1482
1537
|
) -> "Page":
|
1483
1538
|
"""
|
@@ -1485,13 +1540,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1485
1540
|
|
1486
1541
|
Args:
|
1487
1542
|
filename: Path to save the image to.
|
1488
|
-
scale: Scale factor for rendering highlights.
|
1489
1543
|
width: Optional width for the output image.
|
1490
1544
|
labels: Whether to include a legend.
|
1491
1545
|
legend_position: Position of the legend.
|
1492
1546
|
render_ocr: Whether to render OCR text.
|
1493
1547
|
include_highlights: Whether to render highlights.
|
1494
|
-
resolution: Resolution for base image rendering.
|
1548
|
+
resolution: Resolution in DPI for base image rendering (default: 144 DPI, equivalent to previous scale=2.0).
|
1495
1549
|
**kwargs: Additional args for pdfplumber's to_image.
|
1496
1550
|
|
1497
1551
|
Returns:
|
@@ -1500,7 +1554,6 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1500
1554
|
# Use to_image to generate and save the image
|
1501
1555
|
self.to_image(
|
1502
1556
|
path=filename,
|
1503
|
-
scale=scale,
|
1504
1557
|
width=width,
|
1505
1558
|
labels=labels,
|
1506
1559
|
legend_position=legend_position,
|
@@ -1554,7 +1607,6 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1554
1607
|
def to_image(
|
1555
1608
|
self,
|
1556
1609
|
path: Optional[str] = None,
|
1557
|
-
scale: float = 2.0,
|
1558
1610
|
width: Optional[int] = None,
|
1559
1611
|
labels: bool = True,
|
1560
1612
|
legend_position: str = "right",
|
@@ -1569,12 +1621,11 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1569
1621
|
|
1570
1622
|
Args:
|
1571
1623
|
path: Optional path to save the image to.
|
1572
|
-
scale: Scale factor for rendering highlights.
|
1573
1624
|
width: Optional width for the output image.
|
1574
1625
|
labels: Whether to include a legend for highlights.
|
1575
1626
|
legend_position: Position of the legend.
|
1576
1627
|
render_ocr: Whether to render OCR text on highlights.
|
1577
|
-
resolution: Resolution in DPI for base page image
|
1628
|
+
resolution: Resolution in DPI for base page image. If None, uses global setting or defaults to 144 DPI.
|
1578
1629
|
include_highlights: Whether to render highlights.
|
1579
1630
|
exclusions: Accepts one of the following:
|
1580
1631
|
• None – no masking (default)
|
@@ -1593,11 +1644,13 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1593
1644
|
# Use global options if parameters are not explicitly set
|
1594
1645
|
if width is None:
|
1595
1646
|
width = natural_pdf.options.image.width
|
1596
|
-
if resolution is None
|
1597
|
-
|
1647
|
+
if resolution is None:
|
1648
|
+
if natural_pdf.options.image.resolution is not None:
|
1649
|
+
resolution = natural_pdf.options.image.resolution
|
1650
|
+
else:
|
1651
|
+
resolution = 144 # Default resolution when none specified
|
1598
1652
|
# 1. Create cache key (excluding path)
|
1599
1653
|
cache_key_parts = [
|
1600
|
-
scale,
|
1601
1654
|
width,
|
1602
1655
|
labels,
|
1603
1656
|
legend_position,
|
@@ -1641,7 +1694,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1641
1694
|
rendered_image_component: Optional[Image.Image] = (
|
1642
1695
|
None # Renamed from 'image' in original
|
1643
1696
|
)
|
1644
|
-
render_resolution = resolution
|
1697
|
+
render_resolution = resolution
|
1645
1698
|
thread_id = threading.current_thread().name
|
1646
1699
|
logger.debug(
|
1647
1700
|
f"[{thread_id}] Page {self.index}: Attempting to acquire pdf_render_lock for to_image..."
|
@@ -1658,11 +1711,10 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1658
1711
|
# Delegate rendering to the central service
|
1659
1712
|
rendered_image_component = self._highlighter.render_page(
|
1660
1713
|
page_index=self.index,
|
1661
|
-
|
1714
|
+
resolution=render_resolution,
|
1662
1715
|
labels=labels,
|
1663
1716
|
legend_position=legend_position,
|
1664
1717
|
render_ocr=render_ocr,
|
1665
|
-
resolution=render_resolution, # Pass the calculated resolution
|
1666
1718
|
**kwargs,
|
1667
1719
|
)
|
1668
1720
|
else:
|
@@ -2336,7 +2388,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
2336
2388
|
def show_preview(
|
2337
2389
|
self,
|
2338
2390
|
temporary_highlights: List[Dict],
|
2339
|
-
|
2391
|
+
resolution: float = 144,
|
2340
2392
|
width: Optional[int] = None,
|
2341
2393
|
labels: bool = True,
|
2342
2394
|
legend_position: str = "right",
|
@@ -2349,7 +2401,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
2349
2401
|
Args:
|
2350
2402
|
temporary_highlights: List of highlight data dictionaries (as prepared by
|
2351
2403
|
ElementCollection._prepare_highlight_data).
|
2352
|
-
|
2404
|
+
resolution: Resolution in DPI for rendering (default: 144 DPI, equivalent to previous scale=2.0).
|
2353
2405
|
width: Optional width for the output image.
|
2354
2406
|
labels: Whether to include a legend.
|
2355
2407
|
legend_position: Position of the legend.
|
@@ -2363,7 +2415,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
2363
2415
|
img = self._highlighter.render_preview(
|
2364
2416
|
page_index=self.index,
|
2365
2417
|
temporary_highlights=temporary_highlights,
|
2366
|
-
|
2418
|
+
resolution=resolution,
|
2367
2419
|
labels=labels,
|
2368
2420
|
legend_position=legend_position,
|
2369
2421
|
render_ocr=render_ocr,
|
@@ -2897,3 +2949,17 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
2897
2949
|
properties, and other details for each element
|
2898
2950
|
"""
|
2899
2951
|
return self.find_all('*').inspect(limit=limit)
|
2952
|
+
|
2953
|
+
@property
|
2954
|
+
def lines(self) -> List[Any]:
|
2955
|
+
"""Get all line elements on this page."""
|
2956
|
+
return self._element_mgr.lines
|
2957
|
+
|
2958
|
+
# ------------------------------------------------------------------
|
2959
|
+
# Image elements
|
2960
|
+
# ------------------------------------------------------------------
|
2961
|
+
|
2962
|
+
@property
|
2963
|
+
def images(self) -> List[Any]:
|
2964
|
+
"""Get all embedded raster images on this page."""
|
2965
|
+
return self._element_mgr.images
|
natural_pdf/core/pdf.py
CHANGED
@@ -653,8 +653,6 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
653
653
|
raise ValueError("Internal error: No selector or text provided.")
|
654
654
|
|
655
655
|
selector_obj = parse_selector(effective_selector)
|
656
|
-
kwargs["regex"] = regex
|
657
|
-
kwargs["case"] = case
|
658
656
|
|
659
657
|
# Search page by page
|
660
658
|
for page in self.pages:
|
natural_pdf/describe/base.py
CHANGED
@@ -269,15 +269,28 @@ def _get_columns_for_type(element_type: str, show_page_column: bool) -> List[str
|
|
269
269
|
base_columns = ['x0', 'top', 'x1', 'bottom']
|
270
270
|
|
271
271
|
if element_type == 'word':
|
272
|
-
columns = ['text'] + base_columns + [
|
273
|
-
|
272
|
+
columns = ['text'] + base_columns + [
|
273
|
+
'font_family',
|
274
|
+
'font_variant',
|
275
|
+
'size',
|
276
|
+
'bold',
|
277
|
+
'italic',
|
278
|
+
'strike',
|
279
|
+
'underline',
|
280
|
+
'highlight',
|
281
|
+
'source',
|
282
|
+
'confidence',
|
283
|
+
]
|
284
|
+
# Add foreground text colour too
|
274
285
|
columns.append('color')
|
275
286
|
elif element_type == 'rect':
|
276
287
|
columns = base_columns + ['width', 'height', 'stroke', 'fill', 'stroke_width']
|
277
288
|
elif element_type == 'line':
|
278
289
|
columns = base_columns + ['width', 'is_horizontal', 'is_vertical'] # LineElement properties
|
279
290
|
elif element_type == 'region':
|
280
|
-
columns = base_columns + ['width', 'height', 'type']
|
291
|
+
columns = base_columns + ['width', 'height', 'type', 'color']
|
292
|
+
elif element_type == 'blob':
|
293
|
+
columns = base_columns + ['width', 'height', 'color']
|
281
294
|
else:
|
282
295
|
columns = base_columns + ['type']
|
283
296
|
|
@@ -325,19 +338,37 @@ def _extract_element_value(element: "Element", column: str) -> Any:
|
|
325
338
|
return fontname.split("+", 1)[0]
|
326
339
|
return ''
|
327
340
|
|
328
|
-
elif column in ['bold', 'italic']:
|
341
|
+
elif column in ['bold', 'italic', 'strike', 'underline']:
|
329
342
|
value = getattr(element, column, False)
|
330
343
|
return value if isinstance(value, bool) else False
|
331
344
|
|
345
|
+
elif column == 'highlight':
|
346
|
+
# If element is highlighted, return its colour; otherwise blank
|
347
|
+
if getattr(element, 'highlight', False):
|
348
|
+
col_val = getattr(element, 'highlight_color', None)
|
349
|
+
if col_val is None:
|
350
|
+
return 'True' # fallback if colour missing
|
351
|
+
# Convert tuple to hex
|
352
|
+
if isinstance(col_val, (tuple, list)) and len(col_val) >= 3:
|
353
|
+
try:
|
354
|
+
r, g, b = [int(v * 255) if v <= 1 else int(v) for v in col_val[:3]]
|
355
|
+
return f"#{r:02x}{g:02x}{b:02x}"
|
356
|
+
except Exception:
|
357
|
+
return str(col_val)
|
358
|
+
return str(col_val)
|
359
|
+
return ''
|
360
|
+
|
332
361
|
elif column in ['stroke', 'fill', 'color']:
|
333
|
-
# For rectangles and text, these return color tuples
|
334
362
|
value = getattr(element, column, None)
|
363
|
+
# If already a string (e.g. '#ff00aa' or 'red') return as is
|
364
|
+
if isinstance(value, str):
|
365
|
+
return value
|
366
|
+
# If tuple/list convert to hex
|
335
367
|
if value and isinstance(value, (tuple, list)) and len(value) >= 3:
|
336
|
-
# Convert to hex color for display
|
337
368
|
try:
|
338
369
|
r, g, b = [int(v * 255) if v <= 1 else int(v) for v in value[:3]]
|
339
370
|
return f"#{r:02x}{g:02x}{b:02x}"
|
340
|
-
except:
|
371
|
+
except Exception:
|
341
372
|
return str(value)
|
342
373
|
return ""
|
343
374
|
|
@@ -406,7 +437,7 @@ def describe_element(element: "Element") -> "ElementSummary":
|
|
406
437
|
|
407
438
|
# Add common text properties - use dict structure for proper list formatting
|
408
439
|
text_props = {}
|
409
|
-
for prop in ['font_family', 'size', 'bold', 'italic', 'source', 'confidence']:
|
440
|
+
for prop in ['font_family', 'size', 'bold', 'italic', 'strike', 'underline', 'highlight', 'source', 'confidence']:
|
410
441
|
if hasattr(element, prop):
|
411
442
|
value = getattr(element, prop)
|
412
443
|
if value is not None:
|
@@ -414,7 +445,7 @@ def describe_element(element: "Element") -> "ElementSummary":
|
|
414
445
|
text_props[prop] = round(value, 3)
|
415
446
|
elif prop == 'size' and isinstance(value, (int, float)):
|
416
447
|
text_props[prop] = round(value, 1)
|
417
|
-
elif prop in ['bold', 'italic']:
|
448
|
+
elif prop in ['bold', 'italic', 'strike', 'underline']:
|
418
449
|
text_props[prop] = value
|
419
450
|
else:
|
420
451
|
text_props[prop] = value
|