natural-pdf 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +7 -2
- natural_pdf/analyzers/shape_detection_mixin.py +1092 -0
- natural_pdf/analyzers/text_options.py +9 -1
- natural_pdf/analyzers/text_structure.py +371 -58
- natural_pdf/classification/manager.py +3 -4
- natural_pdf/collections/pdf_collection.py +19 -39
- natural_pdf/core/element_manager.py +11 -1
- natural_pdf/core/highlighting_service.py +146 -75
- natural_pdf/core/page.py +287 -188
- natural_pdf/core/pdf.py +57 -42
- natural_pdf/elements/base.py +51 -0
- natural_pdf/elements/collections.py +362 -67
- natural_pdf/elements/line.py +5 -0
- natural_pdf/elements/region.py +396 -23
- natural_pdf/exporters/data/__init__.py +0 -0
- natural_pdf/exporters/data/pdf.ttf +0 -0
- natural_pdf/exporters/data/sRGB.icc +0 -0
- natural_pdf/exporters/hocr.py +40 -61
- natural_pdf/exporters/hocr_font.py +7 -13
- natural_pdf/exporters/original_pdf.py +10 -13
- natural_pdf/exporters/paddleocr.py +51 -11
- natural_pdf/exporters/searchable_pdf.py +0 -10
- natural_pdf/flows/__init__.py +12 -0
- natural_pdf/flows/collections.py +533 -0
- natural_pdf/flows/element.py +382 -0
- natural_pdf/flows/flow.py +216 -0
- natural_pdf/flows/region.py +458 -0
- natural_pdf/search/__init__.py +65 -52
- natural_pdf/search/lancedb_search_service.py +325 -0
- natural_pdf/search/numpy_search_service.py +255 -0
- natural_pdf/search/searchable_mixin.py +25 -71
- natural_pdf/selectors/parser.py +163 -8
- natural_pdf/widgets/viewer.py +22 -31
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/METADATA +55 -49
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/RECORD +38 -30
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/WHEEL +1 -1
- natural_pdf/search/haystack_search_service.py +0 -687
- natural_pdf/search/haystack_utils.py +0 -474
- natural_pdf/utils/tqdm_utils.py +0 -51
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/top_level.txt +0 -0
@@ -25,14 +25,12 @@ from typing import (
|
|
25
25
|
)
|
26
26
|
|
27
27
|
from PIL import Image
|
28
|
-
from tqdm import tqdm
|
29
|
-
from tqdm.auto import tqdm as auto_tqdm
|
30
|
-
from tqdm.notebook import tqdm as notebook_tqdm
|
28
|
+
from tqdm.auto import tqdm
|
31
29
|
|
32
|
-
from natural_pdf.
|
30
|
+
from natural_pdf.exporters.base import FinetuneExporter
|
33
31
|
|
34
|
-
#
|
35
|
-
|
32
|
+
# Need to import this utility
|
33
|
+
from natural_pdf.utils.identifiers import generate_short_path_hash
|
36
34
|
|
37
35
|
# Set up logger early
|
38
36
|
# Configure logging to include thread information
|
@@ -67,8 +65,10 @@ except ImportError as e:
|
|
67
65
|
from natural_pdf.collections.mixins import ApplyMixin
|
68
66
|
from natural_pdf.search.searchable_mixin import SearchableMixin # Import the new mixin
|
69
67
|
|
68
|
+
from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
|
70
69
|
|
71
|
-
|
70
|
+
|
71
|
+
class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixin): # Add ExportMixin and ShapeDetectionMixin
|
72
72
|
def __init__(
|
73
73
|
self,
|
74
74
|
source: Union[str, Iterable[Union[str, "PDF"]]],
|
@@ -119,16 +119,8 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin): # Add ExportMixi
|
|
119
119
|
@staticmethod
|
120
120
|
def _get_pdf_class():
|
121
121
|
"""Helper method to dynamically import the PDF class."""
|
122
|
-
|
123
|
-
|
124
|
-
from natural_pdf.core.pdf import PDF
|
125
|
-
|
126
|
-
return PDF
|
127
|
-
except ImportError as e:
|
128
|
-
logger.error(
|
129
|
-
"Could not import PDF class from natural_pdf.core.pdf. Ensure it exists and there are no circular imports at runtime."
|
130
|
-
)
|
131
|
-
raise ImportError("PDF class is required but could not be imported.") from e
|
122
|
+
from natural_pdf.core.pdf import PDF
|
123
|
+
return PDF
|
132
124
|
|
133
125
|
# --- Internal Helpers ---
|
134
126
|
|
@@ -141,16 +133,13 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin): # Add ExportMixi
|
|
141
133
|
def _execute_glob(self, pattern: str) -> Set[str]:
|
142
134
|
"""Glob for paths and return a set of valid PDF paths."""
|
143
135
|
found_paths = set()
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
for
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
found_paths.add(str(p.resolve())) # Store resolved absolute path
|
152
|
-
except Exception as e:
|
153
|
-
logger.error(f"Error processing glob pattern '{pattern}': {e}")
|
136
|
+
# Use iglob for potentially large directories/matches
|
137
|
+
paths_iter = py_glob.iglob(pattern, recursive=self._recursive)
|
138
|
+
for path_str in paths_iter:
|
139
|
+
# Use Path object for easier checking
|
140
|
+
p = Path(path_str)
|
141
|
+
if p.is_file() and p.suffix.lower() == ".pdf":
|
142
|
+
found_paths.add(str(p.resolve())) # Store resolved absolute path
|
154
143
|
return found_paths
|
155
144
|
|
156
145
|
def _resolve_sources_to_paths(self, source: Union[str, Iterable[str]]) -> List[str]:
|
@@ -534,19 +523,10 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin): # Add ExportMixi
|
|
534
523
|
**kwargs: Additional arguments passed to create_correction_task_package
|
535
524
|
(e.g., image_render_scale, overwrite).
|
536
525
|
"""
|
537
|
-
|
538
|
-
from natural_pdf.utils.packaging import create_correction_task_package
|
526
|
+
from natural_pdf.utils.packaging import create_correction_task_package
|
539
527
|
|
540
|
-
|
541
|
-
|
542
|
-
except ImportError:
|
543
|
-
logger.error(
|
544
|
-
"Failed to import 'create_correction_task_package'. Packaging utility might be missing."
|
545
|
-
)
|
546
|
-
# Or raise
|
547
|
-
except Exception as e:
|
548
|
-
logger.error(f"Failed to export correction task for collection: {e}", exc_info=True)
|
549
|
-
raise # Re-raise the exception from the utility function
|
528
|
+
# Pass the collection itself (self) as the source
|
529
|
+
create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
|
550
530
|
|
551
531
|
# --- Mixin Required Implementation ---
|
552
532
|
def get_indexable_items(self) -> Iterable[Indexable]:
|
@@ -407,7 +407,17 @@ class ElementManager:
|
|
407
407
|
char_dict_data = ocr_char_dict # Use the one we already created
|
408
408
|
char_dict_data["object_type"] = "char" # Mark as char type
|
409
409
|
char_dict_data.setdefault("adv", char_dict_data.get("width", 0))
|
410
|
-
|
410
|
+
|
411
|
+
# Create a TextElement for the char representation
|
412
|
+
# Ensure _char_dicts is handled correctly by TextElement constructor
|
413
|
+
# For an OCR word represented as a char, its _char_dicts can be a list containing its own data
|
414
|
+
char_element_specific_data = char_dict_data.copy()
|
415
|
+
char_element_specific_data["_char_dicts"] = [char_dict_data.copy()]
|
416
|
+
|
417
|
+
ocr_char_as_element = TextElement(char_element_specific_data, self._page)
|
418
|
+
self._elements["chars"].append(
|
419
|
+
ocr_char_as_element
|
420
|
+
) # Append TextElement instance
|
411
421
|
|
412
422
|
except (KeyError, ValueError, TypeError) as e:
|
413
423
|
logger.error(f"Failed to process OCR result: {result}. Error: {e}", exc_info=True)
|
@@ -215,21 +215,14 @@ class HighlightRenderer:
|
|
215
215
|
def _render_ocr_text(self):
|
216
216
|
"""Renders OCR text onto the image. (Adapted from old HighlightManager)"""
|
217
217
|
# Use the page reference to get OCR elements
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
if
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
# Alternative: self.page.extract_ocr_elements() - but might be slow
|
227
|
-
|
228
|
-
except Exception as e:
|
229
|
-
logger.warning(
|
230
|
-
f"Could not get OCR elements for page {self.page.number}: {e}", exc_info=True
|
231
|
-
)
|
232
|
-
return # Don't modify image if OCR elements aren't available
|
218
|
+
# Try finding first, then extracting if necessary
|
219
|
+
ocr_elements = self.page.find_all("text[source=ocr]")
|
220
|
+
if not ocr_elements:
|
221
|
+
# Don't run full OCR here, just extract if already run
|
222
|
+
ocr_elements = [
|
223
|
+
el for el in self.page.words if getattr(el, "source", None) == "ocr"
|
224
|
+
]
|
225
|
+
# Alternative: self.page.extract_ocr_elements() - but might be slow
|
233
226
|
|
234
227
|
if not ocr_elements:
|
235
228
|
logger.debug(f"No OCR elements found for page {self.page.number} to render.")
|
@@ -293,20 +286,15 @@ class HighlightRenderer:
|
|
293
286
|
)
|
294
287
|
|
295
288
|
# Calculate text position (centered vertically, slightly offset from left)
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
text_x = x0_s + padding # Start near left edge with padding
|
306
|
-
|
307
|
-
except Exception:
|
308
|
-
# Fallback positioning
|
309
|
-
text_x, text_y = x0_s + padding, top_s + padding
|
289
|
+
if hasattr(sized_font, "getbbox"): # Modern PIL
|
290
|
+
_, text_top_offset, _, text_bottom_offset = sized_font.getbbox(element.text)
|
291
|
+
text_h = text_bottom_offset - text_top_offset
|
292
|
+
else: # Older PIL approximation
|
293
|
+
text_h = font_size
|
294
|
+
text_y = top_s + (box_h - text_h) / 2
|
295
|
+
# Adjust for vertical offset in some fonts
|
296
|
+
text_y -= text_top_offset if hasattr(sized_font, "getbbox") else 0
|
297
|
+
text_x = x0_s + padding # Start near left edge with padding
|
310
298
|
|
311
299
|
draw.text((text_x, text_y), element.text, fill=(0, 0, 0, 255), font=sized_font)
|
312
300
|
|
@@ -392,9 +380,6 @@ class HighlightingService:
|
|
392
380
|
except ValueError:
|
393
381
|
logger.warning(f"Invalid color string: '{color_input}'")
|
394
382
|
return None
|
395
|
-
except Exception as e:
|
396
|
-
logger.error(f"Error processing color string '{color_input}': {e}")
|
397
|
-
return None
|
398
383
|
else:
|
399
384
|
logger.warning(f"Invalid color input type: {type(color_input)}")
|
400
385
|
return None
|
@@ -611,13 +596,13 @@ class HighlightingService:
|
|
611
596
|
|
612
597
|
Args:
|
613
598
|
page_index: The 0-based index of the page to render.
|
614
|
-
scale: Scale factor for rendering highlights.
|
599
|
+
scale: Scale factor for rendering highlights if width/height/resolution not in kwargs.
|
615
600
|
labels: Whether to include a legend for highlights.
|
616
601
|
legend_position: Position of the legend.
|
617
602
|
render_ocr: Whether to render OCR text on the image.
|
618
|
-
resolution: Optional resolution (DPI) for the base page image.
|
619
|
-
Defaults to scale * 72.
|
620
|
-
kwargs: Additional keyword arguments for pdfplumber's page.to_image.
|
603
|
+
resolution: Optional resolution (DPI) for the base page image if width/height not in kwargs.
|
604
|
+
Defaults to scale * 72 if not otherwise specified.
|
605
|
+
kwargs: Additional keyword arguments for pdfplumber's page.to_image (e.g., width, height).
|
621
606
|
|
622
607
|
Returns:
|
623
608
|
A PIL Image object of the rendered page, or None if rendering fails.
|
@@ -626,34 +611,84 @@ class HighlightingService:
|
|
626
611
|
logger.error(f"Invalid page index {page_index} for rendering.")
|
627
612
|
return None
|
628
613
|
|
629
|
-
|
614
|
+
page_obj = self._pdf[page_index] # Renamed to avoid conflict
|
630
615
|
highlights_on_page = self.get_highlights_for_page(page_index)
|
631
616
|
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
617
|
+
to_image_args = kwargs.copy()
|
618
|
+
actual_scale_x = None
|
619
|
+
actual_scale_y = None
|
620
|
+
|
621
|
+
if "width" in to_image_args and to_image_args["width"] is not None:
|
622
|
+
logger.debug(f"Rendering page {page_index} with width={to_image_args['width']}.")
|
623
|
+
if "height" in to_image_args: to_image_args.pop("height", None)
|
624
|
+
# Actual scale will be calculated after image creation
|
625
|
+
elif "height" in to_image_args and to_image_args["height"] is not None:
|
626
|
+
logger.debug(f"Rendering page {page_index} with height={to_image_args['height']}.")
|
627
|
+
# Actual scale will be calculated after image creation
|
628
|
+
else:
|
629
|
+
# Use explicit resolution from kwargs if present, then the resolution param, then scale
|
630
|
+
render_resolution = to_image_args.pop("resolution", resolution) # Use and remove from kwargs if present
|
631
|
+
if render_resolution is None:
|
632
|
+
render_resolution = scale * 72
|
633
|
+
to_image_args["resolution"] = render_resolution # Add it back for the call
|
634
|
+
actual_scale_x = render_resolution / 72.0
|
635
|
+
actual_scale_y = render_resolution / 72.0
|
636
|
+
logger.debug(f"Rendering page {page_index} with resolution {render_resolution} (scale: {actual_scale_x:.2f}).")
|
637
|
+
|
638
|
+
try:
|
639
|
+
# base_image = render_plain_page(page_obj, actual_scale_x * 72 if actual_scale_x else scale * 72) # Old call
|
640
|
+
img_object = page_obj._page.to_image(**to_image_args)
|
641
|
+
base_image_pil = (
|
642
|
+
img_object.annotated
|
643
|
+
if hasattr(img_object, "annotated")
|
644
|
+
else img_object._repr_png_()
|
645
|
+
)
|
646
|
+
if isinstance(base_image_pil, bytes):
|
647
|
+
from io import BytesIO
|
648
|
+
base_image_pil = Image.open(BytesIO(base_image_pil))
|
649
|
+
base_image_pil = base_image_pil.convert("RGBA") # Ensure RGBA for renderer
|
650
|
+
logger.debug(
|
651
|
+
f"Base image for page {page_index} rendered. Size: {base_image_pil.size}."
|
652
|
+
)
|
653
|
+
|
654
|
+
if actual_scale_x is None or actual_scale_y is None: # If not set by resolution path
|
655
|
+
if page_obj.width > 0:
|
656
|
+
actual_scale_x = base_image_pil.width / page_obj.width
|
657
|
+
else:
|
658
|
+
actual_scale_x = scale # Fallback
|
659
|
+
if page_obj.height > 0:
|
660
|
+
actual_scale_y = base_image_pil.height / page_obj.height
|
661
|
+
else:
|
662
|
+
actual_scale_y = scale # Fallback
|
663
|
+
logger.debug(f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f}")
|
664
|
+
|
665
|
+
except IOError as e:
|
666
|
+
logger.error(f"IOError creating base image for page {page_index}: {e}")
|
667
|
+
raise
|
668
|
+
except AttributeError as e:
|
669
|
+
logger.error(f"AttributeError creating base image for page {page_index}: {e}")
|
670
|
+
raise
|
671
|
+
|
672
|
+
renderer_scale = actual_scale_x # Assuming aspect ratio maintained, use x_scale
|
638
673
|
|
639
674
|
# --- Render Highlights ---
|
640
675
|
rendered_image: Image.Image
|
641
676
|
if highlights_on_page:
|
642
677
|
renderer = HighlightRenderer(
|
643
|
-
page=
|
644
|
-
base_image=
|
678
|
+
page=page_obj,
|
679
|
+
base_image=base_image_pil,
|
645
680
|
highlights=highlights_on_page,
|
646
|
-
scale=scale
|
681
|
+
scale=renderer_scale, # Use the determined actual scale
|
647
682
|
render_ocr=render_ocr,
|
648
683
|
)
|
649
684
|
rendered_image = renderer.render()
|
650
685
|
else:
|
651
686
|
if render_ocr:
|
652
|
-
# Still render OCR even if no highlights
|
653
|
-
renderer = HighlightRenderer(
|
687
|
+
# Still render OCR even if no highlights, using the determined actual scale
|
688
|
+
renderer = HighlightRenderer(page_obj, base_image_pil, [], renderer_scale, True)
|
654
689
|
rendered_image = renderer.render()
|
655
690
|
else:
|
656
|
-
rendered_image =
|
691
|
+
rendered_image = base_image_pil # No highlights, no OCR requested
|
657
692
|
|
658
693
|
# --- Add Legend (Based ONLY on this page's highlights) ---
|
659
694
|
if labels:
|
@@ -697,12 +732,12 @@ class HighlightingService:
|
|
697
732
|
Args:
|
698
733
|
page_index: Index of the page to render.
|
699
734
|
temporary_highlights: List of highlight data dicts (from ElementCollection._prepare).
|
700
|
-
scale:
|
735
|
+
scale: Original scale factor for rendering, used if width/height are not provided.
|
701
736
|
labels: Whether to include a legend.
|
702
737
|
legend_position: Position of the legend.
|
703
738
|
render_ocr: Whether to render OCR text.
|
704
|
-
resolution: Resolution for base page image rendering.
|
705
|
-
**kwargs: Additional args for pdfplumber's to_image.
|
739
|
+
resolution: Resolution for base page image rendering if width/height not used.
|
740
|
+
**kwargs: Additional args for pdfplumber's to_image (e.g., width, height).
|
706
741
|
|
707
742
|
Returns:
|
708
743
|
PIL Image of the preview, or None if rendering fails.
|
@@ -711,35 +746,64 @@ class HighlightingService:
|
|
711
746
|
logger.error(f"Invalid page index {page_index} for render_preview.")
|
712
747
|
return None
|
713
748
|
|
714
|
-
|
715
|
-
|
749
|
+
page_obj = self._pdf.pages[page_index]
|
750
|
+
|
751
|
+
to_image_args = kwargs.copy()
|
752
|
+
actual_scale_x = None
|
753
|
+
actual_scale_y = None
|
754
|
+
|
755
|
+
# Determine arguments for page._page.to_image()
|
756
|
+
if "width" in to_image_args and to_image_args["width"] is not None:
|
757
|
+
logger.debug(f"Rendering preview for page {page_index} with width={to_image_args['width']}.")
|
758
|
+
# Resolution is implicitly handled by pdfplumber when width is set
|
759
|
+
if "height" in to_image_args:
|
760
|
+
to_image_args.pop("height", None)
|
761
|
+
# after image is created, we will calculate actual_scale_x and actual_scale_y
|
762
|
+
|
763
|
+
elif "height" in to_image_args and to_image_args["height"] is not None:
|
764
|
+
logger.debug(f"Rendering preview for page {page_index} with height={to_image_args['height']}.")
|
765
|
+
# Resolution is implicitly handled by pdfplumber when height is set
|
766
|
+
# after image is created, we will calculate actual_scale_x and actual_scale_y
|
767
|
+
else:
|
768
|
+
# Neither width nor height is provided, use resolution or scale.
|
769
|
+
render_resolution = resolution if resolution is not None else scale * 72
|
770
|
+
to_image_args["resolution"] = render_resolution
|
771
|
+
actual_scale_x = render_resolution / 72.0
|
772
|
+
actual_scale_y = render_resolution / 72.0
|
773
|
+
logger.debug(f"Rendering preview for page {page_index} with resolution={render_resolution} (scale: {actual_scale_x:.2f}).")
|
716
774
|
|
717
775
|
try:
|
718
|
-
|
719
|
-
|
720
|
-
base_image = (
|
776
|
+
img_object = page_obj._page.to_image(**to_image_args)
|
777
|
+
base_image_pil = (
|
721
778
|
img_object.annotated
|
722
779
|
if hasattr(img_object, "annotated")
|
723
780
|
else img_object._repr_png_()
|
724
781
|
)
|
725
|
-
if isinstance(
|
782
|
+
if isinstance(base_image_pil, bytes):
|
726
783
|
from io import BytesIO
|
784
|
+
base_image_pil = Image.open(BytesIO(base_image_pil))
|
785
|
+
base_image_pil = base_image_pil.convert("RGB")
|
727
786
|
|
728
|
-
|
729
|
-
|
787
|
+
# If scale was not determined by resolution, calculate it now from base_image_pil dimensions
|
788
|
+
if actual_scale_x is None or actual_scale_y is None:
|
789
|
+
if page_obj.width > 0:
|
790
|
+
actual_scale_x = base_image_pil.width / page_obj.width
|
791
|
+
else:
|
792
|
+
actual_scale_x = scale # Fallback to original scale
|
793
|
+
if page_obj.height > 0:
|
794
|
+
actual_scale_y = base_image_pil.height / page_obj.height
|
795
|
+
else:
|
796
|
+
actual_scale_y = scale # Fallback to original scale
|
797
|
+
logger.debug(f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f} from image size {base_image_pil.size} and page size ({page_obj.width}, {page_obj.height})")
|
730
798
|
|
731
799
|
# Convert temporary highlight dicts to Highlight objects
|
732
|
-
# Note: Colors/labels should be determined *here* for temporary preview
|
733
800
|
preview_highlights = []
|
734
801
|
for hl_data in temporary_highlights:
|
735
|
-
# Determine the final color using the service logic
|
736
802
|
final_color = self._determine_highlight_color(
|
737
803
|
color_input=hl_data.get("color"),
|
738
804
|
label=hl_data.get("label"),
|
739
805
|
use_color_cycling=hl_data.get("use_color_cycling", False),
|
740
806
|
)
|
741
|
-
|
742
|
-
# Extract potential attributes to draw
|
743
807
|
attrs_to_draw = {}
|
744
808
|
element = hl_data.get("element")
|
745
809
|
include_attrs = hl_data.get("include_attrs")
|
@@ -753,25 +817,29 @@ class HighlightingService:
|
|
753
817
|
logger.warning(
|
754
818
|
f"Attribute '{attr_name}' not found on element {element}"
|
755
819
|
)
|
756
|
-
|
757
|
-
# Add highlight if geometry exists
|
758
820
|
if hl_data.get("bbox") or hl_data.get("polygon"):
|
759
821
|
preview_highlights.append(
|
760
822
|
Highlight(
|
761
823
|
page_index=hl_data["page_index"],
|
762
824
|
bbox=hl_data.get("bbox"),
|
763
825
|
polygon=hl_data.get("polygon"),
|
764
|
-
color=final_color,
|
826
|
+
color=final_color,
|
765
827
|
label=hl_data.get("label"),
|
766
828
|
attributes=attrs_to_draw,
|
767
829
|
)
|
768
830
|
)
|
769
|
-
|
770
|
-
#
|
771
|
-
|
831
|
+
|
832
|
+
# Use the calculated actual_scale_x for the HighlightRenderer
|
833
|
+
# Assuming HighlightRenderer can handle a single scale or we adapt it.
|
834
|
+
# For now, pdfplumber usually maintains aspect ratio, so one scale should be okay.
|
835
|
+
# If not, HighlightRenderer needs to accept scale_x and scale_y.
|
836
|
+
# We will use actual_scale_x assuming aspect ratio is maintained by pdfplumber,
|
837
|
+
# or if not, it's a reasonable approximation for highlight scaling.
|
838
|
+
renderer_scale = actual_scale_x
|
839
|
+
|
840
|
+
renderer = HighlightRenderer(page_obj, base_image_pil, preview_highlights, renderer_scale, render_ocr)
|
772
841
|
rendered_image = renderer.render()
|
773
842
|
|
774
|
-
# Create legend only from temporary highlights
|
775
843
|
legend = None
|
776
844
|
if labels:
|
777
845
|
preview_labels = {h.label: h.color for h in preview_highlights if h.label}
|
@@ -781,12 +849,15 @@ class HighlightingService:
|
|
781
849
|
rendered_image, legend, position=legend_position
|
782
850
|
)
|
783
851
|
else:
|
784
|
-
final_image = rendered_image
|
852
|
+
final_image = rendered_image
|
785
853
|
else:
|
786
854
|
final_image = rendered_image
|
787
855
|
|
788
|
-
except
|
789
|
-
logger.error(f"
|
790
|
-
|
856
|
+
except IOError as e:
|
857
|
+
logger.error(f"IOError rendering preview for page {page_index}: {e}")
|
858
|
+
raise
|
859
|
+
except AttributeError as e:
|
860
|
+
logger.error(f"AttributeError rendering preview for page {page_index}: {e}")
|
861
|
+
raise
|
791
862
|
|
792
863
|
return final_image
|