natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +11 -6
- natural_pdf/analyzers/__init__.py +6 -1
- natural_pdf/analyzers/guides.py +354 -258
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +18 -4
- natural_pdf/analyzers/layout/paddle.py +11 -0
- natural_pdf/analyzers/layout/surya.py +2 -3
- natural_pdf/analyzers/shape_detection_mixin.py +25 -34
- natural_pdf/analyzers/text_structure.py +2 -2
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/collections/mixins.py +3 -2
- natural_pdf/core/highlighting_service.py +743 -32
- natural_pdf/core/page.py +252 -399
- natural_pdf/core/page_collection.py +1249 -0
- natural_pdf/core/pdf.py +231 -89
- natural_pdf/{collections → core}/pdf_collection.py +18 -11
- natural_pdf/core/render_spec.py +335 -0
- natural_pdf/describe/base.py +1 -1
- natural_pdf/elements/__init__.py +1 -0
- natural_pdf/elements/base.py +108 -83
- natural_pdf/elements/{collections.py → element_collection.py} +575 -1372
- natural_pdf/elements/line.py +0 -1
- natural_pdf/elements/rect.py +0 -1
- natural_pdf/elements/region.py +405 -280
- natural_pdf/elements/text.py +9 -7
- natural_pdf/exporters/base.py +2 -2
- natural_pdf/exporters/original_pdf.py +1 -1
- natural_pdf/exporters/paddleocr.py +2 -4
- natural_pdf/exporters/searchable_pdf.py +3 -2
- natural_pdf/extraction/mixin.py +1 -3
- natural_pdf/flows/collections.py +1 -69
- natural_pdf/flows/element.py +25 -0
- natural_pdf/flows/flow.py +1658 -19
- natural_pdf/flows/region.py +757 -263
- natural_pdf/ocr/ocr_options.py +0 -2
- natural_pdf/ocr/utils.py +2 -1
- natural_pdf/qa/document_qa.py +21 -5
- natural_pdf/search/search_service_protocol.py +1 -1
- natural_pdf/selectors/parser.py +35 -2
- natural_pdf/tables/result.py +35 -1
- natural_pdf/text_mixin.py +101 -0
- natural_pdf/utils/debug.py +2 -1
- natural_pdf/utils/highlighting.py +1 -0
- natural_pdf/utils/layout.py +2 -2
- natural_pdf/utils/packaging.py +4 -3
- natural_pdf/utils/text_extraction.py +15 -12
- natural_pdf/utils/visualization.py +385 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -52
- optimization/memory_comparison.py +1 -1
- optimization/pdf_analyzer.py +2 -2
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
@@ -11,6 +11,7 @@ from typing import (
|
|
11
11
|
Iterable,
|
12
12
|
Iterator,
|
13
13
|
List,
|
14
|
+
Literal,
|
14
15
|
Optional,
|
15
16
|
Sequence,
|
16
17
|
Tuple,
|
@@ -32,6 +33,9 @@ from natural_pdf.classification.manager import ClassificationManager
|
|
32
33
|
from natural_pdf.classification.mixin import ClassificationMixin
|
33
34
|
from natural_pdf.collections.mixins import ApplyMixin, DirectionalCollectionMixin
|
34
35
|
from natural_pdf.core.pdf import PDF
|
36
|
+
|
37
|
+
# Add Visualizable import
|
38
|
+
from natural_pdf.core.render_spec import RenderSpec, Visualizable
|
35
39
|
from natural_pdf.describe.mixin import DescribeMixin, InspectMixin
|
36
40
|
from natural_pdf.elements.base import Element
|
37
41
|
from natural_pdf.elements.region import Region
|
@@ -40,6 +44,7 @@ from natural_pdf.export.mixin import ExportMixin
|
|
40
44
|
from natural_pdf.ocr import OCROptions
|
41
45
|
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
|
42
46
|
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
47
|
+
from natural_pdf.text_mixin import TextMixin
|
43
48
|
|
44
49
|
# Potentially lazy imports for optional dependencies needed in save_pdf
|
45
50
|
try:
|
@@ -66,6 +71,7 @@ if TYPE_CHECKING:
|
|
66
71
|
from natural_pdf.core.pdf import PDF # ---> ADDED PDF type hint
|
67
72
|
from natural_pdf.elements.region import Region
|
68
73
|
from natural_pdf.elements.text import TextElement # Ensure TextElement is imported
|
74
|
+
from natural_pdf.flows.flow import Flow
|
69
75
|
|
70
76
|
T = TypeVar("T")
|
71
77
|
P = TypeVar("P", bound="Page")
|
@@ -79,6 +85,7 @@ class ElementCollection(
|
|
79
85
|
DirectionalCollectionMixin,
|
80
86
|
DescribeMixin,
|
81
87
|
InspectMixin,
|
88
|
+
Visualizable,
|
82
89
|
MutableSequence,
|
83
90
|
):
|
84
91
|
"""Collection of PDF elements with batch operations.
|
@@ -168,13 +175,234 @@ class ElementCollection(
|
|
168
175
|
"""
|
169
176
|
self._elements = elements or []
|
170
177
|
|
178
|
+
def _get_render_specs(
|
179
|
+
self,
|
180
|
+
mode: Literal["show", "render"] = "show",
|
181
|
+
color: Optional[Union[str, Tuple[int, int, int]]] = None,
|
182
|
+
highlights: Optional[List[Dict[str, Any]]] = None,
|
183
|
+
crop: Union[bool, Literal["content"]] = False,
|
184
|
+
crop_bbox: Optional[Tuple[float, float, float, float]] = None,
|
185
|
+
group_by: Optional[str] = None,
|
186
|
+
bins: Optional[Union[int, List[float]]] = None,
|
187
|
+
annotate: Optional[List[str]] = None,
|
188
|
+
**kwargs,
|
189
|
+
) -> List[RenderSpec]:
|
190
|
+
"""Get render specifications for this element collection.
|
191
|
+
|
192
|
+
Args:
|
193
|
+
mode: Rendering mode - 'show' includes highlights, 'render' is clean
|
194
|
+
color: Default color for highlights in show mode (or colormap name when using group_by)
|
195
|
+
highlights: Additional highlight groups to show
|
196
|
+
crop: Whether to crop to element bounds
|
197
|
+
crop_bbox: Explicit crop bounds
|
198
|
+
group_by: Attribute to group elements by for color mapping
|
199
|
+
bins: Binning specification for quantitative data (int for equal-width bins, list for custom bins)
|
200
|
+
annotate: List of attribute names to display on highlights
|
201
|
+
**kwargs: Additional parameters
|
202
|
+
|
203
|
+
Returns:
|
204
|
+
List of RenderSpec objects, one per page with elements
|
205
|
+
"""
|
206
|
+
if not self._elements:
|
207
|
+
return []
|
208
|
+
|
209
|
+
# Group elements by page
|
210
|
+
elements_by_page = {}
|
211
|
+
for elem in self._elements:
|
212
|
+
if hasattr(elem, "page"):
|
213
|
+
page = elem.page
|
214
|
+
if page not in elements_by_page:
|
215
|
+
elements_by_page[page] = []
|
216
|
+
elements_by_page[page].append(elem)
|
217
|
+
|
218
|
+
if not elements_by_page:
|
219
|
+
return []
|
220
|
+
|
221
|
+
# Create RenderSpec for each page
|
222
|
+
specs = []
|
223
|
+
for page, page_elements in elements_by_page.items():
|
224
|
+
spec = RenderSpec(page=page)
|
225
|
+
|
226
|
+
# Handle cropping
|
227
|
+
if crop_bbox:
|
228
|
+
spec.crop_bbox = crop_bbox
|
229
|
+
elif crop == "content" or crop is True:
|
230
|
+
# Calculate bounds of elements on this page
|
231
|
+
x_coords = []
|
232
|
+
y_coords = []
|
233
|
+
for elem in page_elements:
|
234
|
+
if hasattr(elem, "bbox") and elem.bbox:
|
235
|
+
x0, y0, x1, y1 = elem.bbox
|
236
|
+
x_coords.extend([x0, x1])
|
237
|
+
y_coords.extend([y0, y1])
|
238
|
+
|
239
|
+
if x_coords and y_coords:
|
240
|
+
spec.crop_bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
|
241
|
+
|
242
|
+
# Add highlights in show mode
|
243
|
+
if mode == "show":
|
244
|
+
# Handle group_by parameter for quantitative/categorical grouping
|
245
|
+
if group_by is not None:
|
246
|
+
# Use the improved highlighting logic from _prepare_highlight_data
|
247
|
+
prepared_highlights = self._prepare_highlight_data(
|
248
|
+
group_by=group_by, color=color, bins=bins, annotate=annotate, **kwargs
|
249
|
+
)
|
250
|
+
|
251
|
+
# Check if we have quantitative metadata to preserve
|
252
|
+
quantitative_metadata = None
|
253
|
+
for highlight_data in prepared_highlights:
|
254
|
+
if (
|
255
|
+
"quantitative_metadata" in highlight_data
|
256
|
+
and highlight_data["quantitative_metadata"]
|
257
|
+
):
|
258
|
+
quantitative_metadata = highlight_data["quantitative_metadata"]
|
259
|
+
break
|
260
|
+
|
261
|
+
# Add highlights from prepared data
|
262
|
+
for highlight_data in prepared_highlights:
|
263
|
+
# Only add elements from this page
|
264
|
+
elem = highlight_data.get("element")
|
265
|
+
if elem and hasattr(elem, "page") and elem.page == page:
|
266
|
+
# Create the highlight dict manually to preserve quantitative metadata
|
267
|
+
highlight_dict = {
|
268
|
+
"element": elem,
|
269
|
+
"color": highlight_data.get("color"),
|
270
|
+
"label": highlight_data.get("label"),
|
271
|
+
}
|
272
|
+
|
273
|
+
# Add quantitative metadata to the first highlight
|
274
|
+
if quantitative_metadata and not any(
|
275
|
+
h.get("quantitative_metadata") for h in spec.highlights
|
276
|
+
):
|
277
|
+
highlight_dict["quantitative_metadata"] = quantitative_metadata
|
278
|
+
|
279
|
+
# Add annotate if provided in the prepared data
|
280
|
+
if "annotate" in highlight_data:
|
281
|
+
highlight_dict["annotate"] = highlight_data["annotate"]
|
282
|
+
if "attributes_to_draw" in highlight_data:
|
283
|
+
highlight_dict["attributes_to_draw"] = highlight_data[
|
284
|
+
"attributes_to_draw"
|
285
|
+
]
|
286
|
+
|
287
|
+
# Extract geometry from element
|
288
|
+
if (
|
289
|
+
hasattr(elem, "polygon")
|
290
|
+
and hasattr(elem, "has_polygon")
|
291
|
+
and elem.has_polygon
|
292
|
+
):
|
293
|
+
highlight_dict["polygon"] = elem.polygon
|
294
|
+
elif hasattr(elem, "bbox"):
|
295
|
+
highlight_dict["bbox"] = elem.bbox
|
296
|
+
|
297
|
+
spec.highlights.append(highlight_dict)
|
298
|
+
else:
|
299
|
+
# Default behavior when no group_by is specified
|
300
|
+
# Determine if all elements are of the same type
|
301
|
+
element_types = set(type(elem).__name__ for elem in page_elements)
|
302
|
+
|
303
|
+
if len(element_types) == 1:
|
304
|
+
# All elements are the same type - use a single label
|
305
|
+
type_name = element_types.pop()
|
306
|
+
# Generate a clean label from the type name
|
307
|
+
base_name = (
|
308
|
+
type_name.replace("Element", "").replace("Region", "")
|
309
|
+
if type_name != "Region"
|
310
|
+
else "Region"
|
311
|
+
)
|
312
|
+
# Handle special cases for common types
|
313
|
+
if base_name == "Text":
|
314
|
+
shared_label = "Text Elements"
|
315
|
+
elif base_name == "table_cell" or (
|
316
|
+
hasattr(page_elements[0], "region_type")
|
317
|
+
and page_elements[0].region_type == "table_cell"
|
318
|
+
):
|
319
|
+
shared_label = "Table Cells"
|
320
|
+
elif base_name == "table":
|
321
|
+
shared_label = "Tables"
|
322
|
+
else:
|
323
|
+
shared_label = f"{base_name} Elements" if base_name else "Elements"
|
324
|
+
|
325
|
+
# Add all elements with the same label (no color cycling)
|
326
|
+
for elem in page_elements:
|
327
|
+
# Get element highlight params with annotate
|
328
|
+
element_data = self._get_element_highlight_params(elem, annotate)
|
329
|
+
if element_data:
|
330
|
+
# Use add_highlight with basic params
|
331
|
+
spec.add_highlight(
|
332
|
+
element=elem,
|
333
|
+
color=color, # Use provided color or None
|
334
|
+
label=shared_label,
|
335
|
+
)
|
336
|
+
# Update last highlight with attributes if present
|
337
|
+
if element_data.get("attributes_to_draw") and spec.highlights:
|
338
|
+
spec.highlights[-1]["attributes_to_draw"] = element_data[
|
339
|
+
"attributes_to_draw"
|
340
|
+
]
|
341
|
+
else:
|
342
|
+
# Mixed types - use individual labels (existing behavior)
|
343
|
+
for elem in page_elements:
|
344
|
+
# Get element highlight params with annotate
|
345
|
+
element_data = self._get_element_highlight_params(elem, annotate)
|
346
|
+
if element_data:
|
347
|
+
spec.add_highlight(
|
348
|
+
element=elem,
|
349
|
+
color=color,
|
350
|
+
label=getattr(elem, "text", None) or str(elem),
|
351
|
+
)
|
352
|
+
# Update last highlight with attributes if present
|
353
|
+
if element_data.get("attributes_to_draw") and spec.highlights:
|
354
|
+
spec.highlights[-1]["attributes_to_draw"] = element_data[
|
355
|
+
"attributes_to_draw"
|
356
|
+
]
|
357
|
+
|
358
|
+
# Add additional highlight groups if provided
|
359
|
+
if highlights:
|
360
|
+
for group in highlights:
|
361
|
+
group_elements = group.get("elements", [])
|
362
|
+
group_color = group.get("color", color)
|
363
|
+
group_label = group.get("label")
|
364
|
+
|
365
|
+
# Only add elements from this page
|
366
|
+
for elem in group_elements:
|
367
|
+
if hasattr(elem, "page") and elem.page == page:
|
368
|
+
spec.add_highlight(
|
369
|
+
element=elem, color=group_color, label=group_label
|
370
|
+
)
|
371
|
+
|
372
|
+
specs.append(spec)
|
373
|
+
|
374
|
+
return specs
|
375
|
+
|
376
|
+
def _get_highlighter(self):
|
377
|
+
"""Get the highlighting service for rendering.
|
378
|
+
|
379
|
+
For ElementCollection, we get it from the first element's page.
|
380
|
+
"""
|
381
|
+
if not self._elements:
|
382
|
+
raise RuntimeError("Cannot get highlighter from empty ElementCollection")
|
383
|
+
|
384
|
+
# Try to get highlighter from first element's page
|
385
|
+
for elem in self._elements:
|
386
|
+
if hasattr(elem, "page") and hasattr(elem.page, "_highlighter"):
|
387
|
+
return elem.page._highlighter
|
388
|
+
|
389
|
+
# If no elements have pages, we can't render
|
390
|
+
raise RuntimeError(
|
391
|
+
"Cannot find HighlightingService. ElementCollection elements don't have page access."
|
392
|
+
)
|
393
|
+
|
171
394
|
def __len__(self) -> int:
|
172
395
|
"""Get the number of elements in the collection."""
|
173
396
|
return len(self._elements)
|
174
397
|
|
175
|
-
def __getitem__(self, index: int) -> "Element":
|
176
|
-
"""Get an element by index."""
|
177
|
-
|
398
|
+
def __getitem__(self, index: Union[int, slice]) -> Union["Element", "ElementCollection"]:
|
399
|
+
"""Get an element by index or a collection by slice."""
|
400
|
+
if isinstance(index, slice):
|
401
|
+
# Return a new ElementCollection for slices
|
402
|
+
return ElementCollection(self._elements[index])
|
403
|
+
else:
|
404
|
+
# Return the element for integer indices
|
405
|
+
return self._elements[index]
|
178
406
|
|
179
407
|
def __repr__(self) -> str:
|
180
408
|
"""Return a string representation showing the element count."""
|
@@ -420,6 +648,7 @@ class ElementCollection(
|
|
420
648
|
# Apply content filtering if provided
|
421
649
|
if content_filter is not None:
|
422
650
|
from natural_pdf.utils.text_extraction import _apply_content_filter
|
651
|
+
|
423
652
|
all_char_dicts = _apply_content_filter(all_char_dicts, content_filter)
|
424
653
|
|
425
654
|
# Check if layout is requested
|
@@ -531,8 +760,9 @@ class ElementCollection(
|
|
531
760
|
group_by: Optional[str] = None,
|
532
761
|
label_format: Optional[str] = None,
|
533
762
|
distinct: bool = False,
|
534
|
-
|
763
|
+
annotate: Optional[List[str]] = None,
|
535
764
|
replace: bool = False,
|
765
|
+
bins: Optional[Union[int, List[float]]] = None,
|
536
766
|
) -> "ElementCollection":
|
537
767
|
"""
|
538
768
|
Adds persistent highlights for all elements in the collection to the page
|
@@ -550,12 +780,15 @@ class ElementCollection(
|
|
550
780
|
label: Optional explicit label for the entire collection. If provided,
|
551
781
|
all elements are highlighted as a single group with this label,
|
552
782
|
ignoring 'group_by' and the default type-based grouping.
|
553
|
-
color: Optional explicit color for the highlight (tuple/string)
|
554
|
-
|
783
|
+
color: Optional explicit color for the highlight (tuple/string), or
|
784
|
+
matplotlib colormap name for quantitative group_by (e.g., 'viridis', 'plasma',
|
785
|
+
'inferno', 'coolwarm', 'RdBu'). Applied consistently if 'label' is provided
|
786
|
+
or if grouping occurs.
|
555
787
|
group_by: Optional attribute name present on the elements. If provided
|
556
788
|
(and 'label' is None), elements will be grouped based on the
|
557
789
|
value of this attribute, and each group will be highlighted
|
558
|
-
with a distinct label and color.
|
790
|
+
with a distinct label and color. Automatically detects quantitative
|
791
|
+
data and uses gradient colormaps when appropriate.
|
559
792
|
label_format: Optional Python f-string to format the group label when
|
560
793
|
'group_by' is used. Can reference element attributes
|
561
794
|
(e.g., "Type: {region_type}, Conf: {confidence:.2f}").
|
@@ -563,11 +796,14 @@ class ElementCollection(
|
|
563
796
|
distinct: If True, bypasses all grouping and highlights each element
|
564
797
|
individually with cycling colors (the previous default behavior).
|
565
798
|
(default: False)
|
566
|
-
|
567
|
-
|
799
|
+
annotate: List of attribute names from the element to display directly
|
800
|
+
on the highlight itself (distinct from group label).
|
568
801
|
replace: If True, existing highlights on the affected page(s)
|
569
802
|
are cleared before adding these highlights.
|
570
803
|
If False (default), highlights are appended to existing ones.
|
804
|
+
bins: Optional binning specification for quantitative data when using group_by.
|
805
|
+
Can be an integer (number of equal-width bins) or a list of bin edges.
|
806
|
+
Only used when group_by contains quantitative data.
|
571
807
|
|
572
808
|
Returns:
|
573
809
|
Self for method chaining
|
@@ -589,7 +825,8 @@ class ElementCollection(
|
|
589
825
|
color=color,
|
590
826
|
group_by=group_by,
|
591
827
|
label_format=label_format,
|
592
|
-
|
828
|
+
annotate=annotate,
|
829
|
+
bins=bins,
|
593
830
|
# 'replace' flag is handled during the add call below
|
594
831
|
)
|
595
832
|
|
@@ -630,7 +867,7 @@ class ElementCollection(
|
|
630
867
|
"use_color_cycling", False
|
631
868
|
), # Set by _prepare if distinct
|
632
869
|
"element": data["element"],
|
633
|
-
"
|
870
|
+
"annotate": data["annotate"],
|
634
871
|
# Internal call to service always appends, as clearing was handled above
|
635
872
|
"existing": "append",
|
636
873
|
}
|
@@ -652,7 +889,8 @@ class ElementCollection(
|
|
652
889
|
color: Optional[Union[Tuple, str]] = None,
|
653
890
|
group_by: Optional[str] = None,
|
654
891
|
label_format: Optional[str] = None,
|
655
|
-
|
892
|
+
annotate: Optional[List[str]] = None,
|
893
|
+
bins: Optional[Union[int, List[float]]] = None,
|
656
894
|
) -> List[Dict]:
|
657
895
|
"""
|
658
896
|
Determines the parameters for highlighting each element based on the strategy.
|
@@ -661,7 +899,7 @@ class ElementCollection(
|
|
661
899
|
|
662
900
|
Returns:
|
663
901
|
List of dictionaries, each containing parameters for a single highlight
|
664
|
-
(e.g., page_index, bbox/polygon, color, label, element,
|
902
|
+
(e.g., page_index, bbox/polygon, color, label, element, annotate, attributes_to_draw).
|
665
903
|
Color and label determination happens here.
|
666
904
|
"""
|
667
905
|
prepared_data = []
|
@@ -669,11 +907,25 @@ class ElementCollection(
|
|
669
907
|
return prepared_data
|
670
908
|
|
671
909
|
# Need access to the HighlightingService to determine colors correctly.
|
910
|
+
# Use highlighting protocol to find a valid service from any element
|
672
911
|
highlighter = None
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
|
912
|
+
|
913
|
+
for element in self._elements:
|
914
|
+
# Try direct page access first (for regular elements)
|
915
|
+
if hasattr(element, "page") and hasattr(element.page, "_highlighter"):
|
916
|
+
highlighter = element.page._highlighter
|
917
|
+
break
|
918
|
+
# Try highlighting protocol for FlowRegions and other complex elements
|
919
|
+
elif hasattr(element, "get_highlight_specs"):
|
920
|
+
specs = element.get_highlight_specs()
|
921
|
+
for spec in specs:
|
922
|
+
if "page" in spec and hasattr(spec["page"], "_highlighter"):
|
923
|
+
highlighter = spec["page"]._highlighter
|
924
|
+
break
|
925
|
+
if highlighter:
|
926
|
+
break
|
927
|
+
|
928
|
+
if not highlighter:
|
677
929
|
logger.warning(
|
678
930
|
"Cannot determine highlight colors: HighlightingService not accessible from elements."
|
679
931
|
)
|
@@ -686,7 +938,7 @@ class ElementCollection(
|
|
686
938
|
final_color = highlighter._determine_highlight_color(
|
687
939
|
label=None, color_input=None, use_color_cycling=True
|
688
940
|
)
|
689
|
-
element_data = self._get_element_highlight_params(element,
|
941
|
+
element_data = self._get_element_highlight_params(element, annotate)
|
690
942
|
if element_data:
|
691
943
|
element_data.update(
|
692
944
|
{"color": final_color, "label": None, "use_color_cycling": True}
|
@@ -699,7 +951,7 @@ class ElementCollection(
|
|
699
951
|
label=label, color_input=color, use_color_cycling=False
|
700
952
|
)
|
701
953
|
for element in self._elements:
|
702
|
-
element_data = self._get_element_highlight_params(element,
|
954
|
+
element_data = self._get_element_highlight_params(element, annotate)
|
703
955
|
if element_data:
|
704
956
|
element_data.update({"color": final_color, "label": label})
|
705
957
|
prepared_data.append(element_data)
|
@@ -707,23 +959,84 @@ class ElementCollection(
|
|
707
959
|
elif group_by is not None:
|
708
960
|
logger.debug("_prepare: Grouping by attribute strategy.")
|
709
961
|
grouped_elements = self._group_elements_by_attr(group_by)
|
962
|
+
|
963
|
+
# Collect all values for quantitative detection
|
964
|
+
all_values = []
|
710
965
|
for group_key, group_elements in grouped_elements.items():
|
711
|
-
if
|
712
|
-
|
713
|
-
|
714
|
-
|
715
|
-
|
716
|
-
|
717
|
-
|
718
|
-
|
719
|
-
|
720
|
-
|
966
|
+
if group_elements:
|
967
|
+
all_values.append(group_key)
|
968
|
+
|
969
|
+
# Import the quantitative detection function
|
970
|
+
from natural_pdf.utils.visualization import (
|
971
|
+
create_quantitative_color_mapping,
|
972
|
+
detect_quantitative_data,
|
973
|
+
)
|
974
|
+
|
975
|
+
# Determine if we should use quantitative color mapping
|
976
|
+
use_quantitative = detect_quantitative_data(all_values)
|
977
|
+
|
978
|
+
if use_quantitative:
|
979
|
+
logger.debug(" _prepare: Using quantitative color mapping.")
|
980
|
+
# Use quantitative color mapping with specified colormap
|
981
|
+
colormap_name = color if isinstance(color, str) else "viridis"
|
982
|
+
value_to_color = create_quantitative_color_mapping(
|
983
|
+
all_values, colormap=colormap_name, bins=bins
|
721
984
|
)
|
722
|
-
|
723
|
-
|
724
|
-
|
725
|
-
|
726
|
-
|
985
|
+
|
986
|
+
# Store quantitative metadata for colorbar creation
|
987
|
+
quantitative_metadata = {
|
988
|
+
"values": all_values,
|
989
|
+
"colormap": colormap_name,
|
990
|
+
"bins": bins,
|
991
|
+
"attribute": group_by,
|
992
|
+
}
|
993
|
+
|
994
|
+
for group_key, group_elements in grouped_elements.items():
|
995
|
+
if not group_elements:
|
996
|
+
continue
|
997
|
+
group_label = self._format_group_label(
|
998
|
+
group_key, label_format, group_elements[0], group_by
|
999
|
+
)
|
1000
|
+
|
1001
|
+
# Get quantitative color for this value
|
1002
|
+
final_color = value_to_color.get(group_key)
|
1003
|
+
if final_color is None:
|
1004
|
+
# Fallback to traditional color assignment
|
1005
|
+
final_color = highlighter._determine_highlight_color(
|
1006
|
+
label=group_label, color_input=None, use_color_cycling=False
|
1007
|
+
)
|
1008
|
+
|
1009
|
+
logger.debug(
|
1010
|
+
f" _prepare group '{group_label}' ({len(group_elements)} elements) -> color {final_color}"
|
1011
|
+
)
|
1012
|
+
for element in group_elements:
|
1013
|
+
element_data = self._get_element_highlight_params(element, annotate)
|
1014
|
+
if element_data:
|
1015
|
+
element_data.update({"color": final_color, "label": group_label})
|
1016
|
+
# Add quantitative metadata to the first element in each group
|
1017
|
+
if not any("quantitative_metadata" in pd for pd in prepared_data):
|
1018
|
+
element_data["quantitative_metadata"] = quantitative_metadata
|
1019
|
+
prepared_data.append(element_data)
|
1020
|
+
else:
|
1021
|
+
logger.debug(" _prepare: Using categorical color mapping.")
|
1022
|
+
# Use traditional categorical color mapping
|
1023
|
+
for group_key, group_elements in grouped_elements.items():
|
1024
|
+
if not group_elements:
|
1025
|
+
continue
|
1026
|
+
group_label = self._format_group_label(
|
1027
|
+
group_key, label_format, group_elements[0], group_by
|
1028
|
+
)
|
1029
|
+
final_color = highlighter._determine_highlight_color(
|
1030
|
+
label=group_label, color_input=None, use_color_cycling=False
|
1031
|
+
)
|
1032
|
+
logger.debug(
|
1033
|
+
f" _prepare group '{group_label}' ({len(group_elements)} elements) -> color {final_color}"
|
1034
|
+
)
|
1035
|
+
for element in group_elements:
|
1036
|
+
element_data = self._get_element_highlight_params(element, annotate)
|
1037
|
+
if element_data:
|
1038
|
+
element_data.update({"color": final_color, "label": group_label})
|
1039
|
+
prepared_data.append(element_data)
|
727
1040
|
else:
|
728
1041
|
logger.debug("_prepare: Default grouping strategy.")
|
729
1042
|
element_types = set(type(el).__name__ for el in self._elements)
|
@@ -742,7 +1055,7 @@ class ElementCollection(
|
|
742
1055
|
)
|
743
1056
|
logger.debug(f" _prepare default group '{auto_label}' -> color {final_color}")
|
744
1057
|
for element in self._elements:
|
745
|
-
element_data = self._get_element_highlight_params(element,
|
1058
|
+
element_data = self._get_element_highlight_params(element, annotate)
|
746
1059
|
if element_data:
|
747
1060
|
element_data.update({"color": final_color, "label": auto_label})
|
748
1061
|
prepared_data.append(element_data)
|
@@ -761,7 +1074,7 @@ class ElementCollection(
|
|
761
1074
|
# Determine color *before* logging or using it (already done above for this branch)
|
762
1075
|
logger.debug(f" _prepare default group '{auto_label}' -> color {final_color}")
|
763
1076
|
for element in self._elements:
|
764
|
-
element_data = self._get_element_highlight_params(element,
|
1077
|
+
element_data = self._get_element_highlight_params(element, annotate)
|
765
1078
|
if element_data:
|
766
1079
|
element_data.update({"color": final_color, "label": auto_label})
|
767
1080
|
prepared_data.append(element_data)
|
@@ -774,7 +1087,7 @@ class ElementCollection(
|
|
774
1087
|
color: Optional[Union[Tuple, str]],
|
775
1088
|
label: Optional[str],
|
776
1089
|
use_color_cycling: bool,
|
777
|
-
|
1090
|
+
annotate: Optional[List[str]],
|
778
1091
|
existing: str,
|
779
1092
|
):
|
780
1093
|
"""Low-level helper to call the appropriate HighlightingService method for an element."""
|
@@ -790,7 +1103,7 @@ class ElementCollection(
|
|
790
1103
|
"color": color,
|
791
1104
|
"label": label,
|
792
1105
|
"use_color_cycling": use_color_cycling,
|
793
|
-
"
|
1106
|
+
"annotate": annotate,
|
794
1107
|
"existing": existing,
|
795
1108
|
"element": element,
|
796
1109
|
}
|
@@ -825,7 +1138,7 @@ class ElementCollection(
|
|
825
1138
|
self,
|
826
1139
|
label: str,
|
827
1140
|
color: Optional[Union[Tuple, str]],
|
828
|
-
|
1141
|
+
annotate: Optional[List[str]],
|
829
1142
|
existing: str,
|
830
1143
|
):
|
831
1144
|
"""Highlights all elements with the same explicit label and color."""
|
@@ -835,7 +1148,7 @@ class ElementCollection(
|
|
835
1148
|
color=color, # Use explicit color if provided
|
836
1149
|
label=label, # Use the explicit group label
|
837
1150
|
use_color_cycling=False, # Use consistent color for the label
|
838
|
-
|
1151
|
+
annotate=annotate,
|
839
1152
|
existing=existing,
|
840
1153
|
)
|
841
1154
|
|
@@ -843,7 +1156,7 @@ class ElementCollection(
|
|
843
1156
|
self,
|
844
1157
|
group_by: str,
|
845
1158
|
label_format: Optional[str],
|
846
|
-
|
1159
|
+
annotate: Optional[List[str]],
|
847
1160
|
existing: str,
|
848
1161
|
):
|
849
1162
|
"""Groups elements by attribute and highlights each group distinctly."""
|
@@ -915,11 +1228,11 @@ class ElementCollection(
|
|
915
1228
|
color=None, # Let ColorManager choose based on label
|
916
1229
|
label=group_label, # Use the derived group label
|
917
1230
|
use_color_cycling=False, # Use consistent color for the label
|
918
|
-
|
1231
|
+
annotate=annotate,
|
919
1232
|
existing=existing,
|
920
1233
|
)
|
921
1234
|
|
922
|
-
def _highlight_distinctly(self,
|
1235
|
+
def _highlight_distinctly(self, annotate: Optional[List[str]], existing: str):
|
923
1236
|
"""DEPRECATED: Logic moved to _prepare_highlight_data. Kept for reference/potential reuse."""
|
924
1237
|
# This method is no longer called directly by the main highlight path.
|
925
1238
|
# The distinct logic is handled within _prepare_highlight_data.
|
@@ -929,152 +1242,191 @@ class ElementCollection(
|
|
929
1242
|
color=None, # Let ColorManager cycle
|
930
1243
|
label=None, # No label for distinct elements
|
931
1244
|
use_color_cycling=True, # Force cycling
|
932
|
-
|
1245
|
+
annotate=annotate,
|
933
1246
|
existing=existing,
|
934
1247
|
)
|
935
1248
|
|
936
|
-
def
|
1249
|
+
def _render_multipage_highlights(
|
937
1250
|
self,
|
938
|
-
|
939
|
-
|
940
|
-
|
941
|
-
|
942
|
-
|
943
|
-
|
944
|
-
|
945
|
-
|
946
|
-
|
947
|
-
|
948
|
-
|
949
|
-
render_ocr
|
950
|
-
|
951
|
-
|
952
|
-
|
953
|
-
|
954
|
-
|
955
|
-
|
956
|
-
|
957
|
-
|
958
|
-
Currently only supports collections where all elements are on the same page
|
959
|
-
of the same PDF.
|
960
|
-
|
961
|
-
Allows grouping and coloring elements based on attributes, similar to the
|
962
|
-
persistent `highlight()` method, but only for this temporary view.
|
963
|
-
|
964
|
-
Args:
|
965
|
-
group_by: Attribute name to group elements by for distinct colors/labels.
|
966
|
-
label: Explicit label for all elements (overrides group_by).
|
967
|
-
color: Explicit color for all elements (if label used) or base color.
|
968
|
-
label_format: F-string to format group labels if group_by is used.
|
969
|
-
distinct: Highlight each element distinctly (overrides group_by/label).
|
970
|
-
include_attrs: Attributes to display on individual highlights.
|
971
|
-
resolution: Resolution in DPI for rendering (uses global options if not specified, defaults to 144 DPI).
|
972
|
-
labels: Whether to include a legend for the temporary highlights.
|
973
|
-
legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
|
974
|
-
render_ocr: Whether to render OCR text.
|
975
|
-
width: Optional width for the output image in pixels.
|
976
|
-
crop: If True, crop the resulting image to the tight bounding box
|
977
|
-
containing all elements in the collection. The elements are
|
978
|
-
still highlighted first, then the image is cropped.
|
979
|
-
|
980
|
-
Returns:
|
981
|
-
PIL Image object of the temporary preview, or None if rendering fails or
|
982
|
-
elements span multiple pages/PDFs.
|
1251
|
+
specs_by_page,
|
1252
|
+
resolution,
|
1253
|
+
width,
|
1254
|
+
labels,
|
1255
|
+
legend_position,
|
1256
|
+
group_by,
|
1257
|
+
label,
|
1258
|
+
color,
|
1259
|
+
label_format,
|
1260
|
+
distinct,
|
1261
|
+
annotate,
|
1262
|
+
render_ocr,
|
1263
|
+
crop,
|
1264
|
+
stack_direction="vertical",
|
1265
|
+
stack_gap=5,
|
1266
|
+
stack_background_color=(255, 255, 255),
|
1267
|
+
):
|
1268
|
+
"""Render highlights across multiple pages and stack them."""
|
1269
|
+
from PIL import Image
|
983
1270
|
|
984
|
-
|
985
|
-
|
986
|
-
|
987
|
-
|
988
|
-
import natural_pdf
|
1271
|
+
# Sort pages by index for consistent output
|
1272
|
+
sorted_pages = sorted(
|
1273
|
+
specs_by_page.keys(), key=lambda p: p.index if hasattr(p, "index") else 0
|
1274
|
+
)
|
989
1275
|
|
990
|
-
|
991
|
-
if width is None:
|
992
|
-
width = natural_pdf.options.image.width
|
993
|
-
if resolution is None:
|
994
|
-
if natural_pdf.options.image.resolution is not None:
|
995
|
-
resolution = natural_pdf.options.image.resolution
|
996
|
-
else:
|
997
|
-
resolution = 144 # Default resolution when none specified
|
1276
|
+
page_images = []
|
998
1277
|
|
999
|
-
|
1000
|
-
|
1278
|
+
for page in sorted_pages:
|
1279
|
+
element_specs = specs_by_page[page]
|
1001
1280
|
|
1002
|
-
|
1003
|
-
|
1004
|
-
|
1005
|
-
|
1006
|
-
|
1281
|
+
# Get highlighter service from the page
|
1282
|
+
if not hasattr(page, "_highlighter"):
|
1283
|
+
logger.warning(
|
1284
|
+
f"Page {getattr(page, 'number', '?')} has no highlighter service, skipping"
|
1285
|
+
)
|
1286
|
+
continue
|
1007
1287
|
|
1008
|
-
|
1009
|
-
if self._are_on_multiple_pages():
|
1010
|
-
raise ValueError(
|
1011
|
-
"show() currently only supports collections where all elements are on the same page."
|
1012
|
-
)
|
1288
|
+
service = page._highlighter
|
1013
1289
|
|
1014
|
-
|
1015
|
-
|
1016
|
-
if not hasattr(first_element, "page") or not first_element.page:
|
1017
|
-
logger.warning("Cannot show collection: First element has no associated page.")
|
1018
|
-
return None
|
1019
|
-
page = first_element.page
|
1020
|
-
if not hasattr(page, "pdf") or not page.pdf:
|
1021
|
-
logger.warning("Cannot show collection: Page has no associated PDF object.")
|
1022
|
-
return None
|
1290
|
+
# Prepare highlight data for this page
|
1291
|
+
highlight_data_list = []
|
1023
1292
|
|
1024
|
-
|
1025
|
-
|
1026
|
-
|
1027
|
-
|
1293
|
+
for element_idx, spec in element_specs:
|
1294
|
+
# Use the element index to generate consistent colors/labels across pages
|
1295
|
+
element = spec.get(
|
1296
|
+
"element",
|
1297
|
+
self._elements[element_idx] if element_idx < len(self._elements) else None,
|
1298
|
+
)
|
1028
1299
|
|
1029
|
-
|
1030
|
-
|
1031
|
-
|
1032
|
-
|
1033
|
-
|
1034
|
-
|
1035
|
-
|
1036
|
-
|
1037
|
-
|
1038
|
-
|
1300
|
+
# Prepare highlight data based on grouping parameters
|
1301
|
+
if distinct:
|
1302
|
+
# Use cycling colors for distinct mode
|
1303
|
+
element_color = None # Let the highlighter service pick from palette
|
1304
|
+
use_color_cycling = True
|
1305
|
+
element_label = (
|
1306
|
+
f"Element_{element_idx + 1}"
|
1307
|
+
if label is None
|
1308
|
+
else f"{label}_{element_idx + 1}"
|
1309
|
+
)
|
1310
|
+
elif label:
|
1311
|
+
# Explicit label for all elements
|
1312
|
+
element_color = color
|
1313
|
+
use_color_cycling = color is None
|
1314
|
+
element_label = label
|
1315
|
+
elif group_by and element:
|
1316
|
+
# Group by attribute
|
1317
|
+
try:
|
1318
|
+
group_key = getattr(element, group_by, None)
|
1319
|
+
element_label = self._format_group_label(
|
1320
|
+
group_key, label_format, element, group_by
|
1321
|
+
)
|
1322
|
+
element_color = None # Let service assign color by group
|
1323
|
+
use_color_cycling = True
|
1324
|
+
except:
|
1325
|
+
element_label = f"Element_{element_idx + 1}"
|
1326
|
+
element_color = color
|
1327
|
+
use_color_cycling = color is None
|
1328
|
+
else:
|
1329
|
+
# Default behavior
|
1330
|
+
element_color = color
|
1331
|
+
use_color_cycling = color is None
|
1332
|
+
element_label = f"Element_{element_idx + 1}"
|
1333
|
+
|
1334
|
+
# Build highlight data
|
1335
|
+
highlight_item = {
|
1336
|
+
"page_index": spec["page_index"],
|
1337
|
+
"bbox": spec["bbox"],
|
1338
|
+
"polygon": spec.get("polygon"),
|
1339
|
+
"color": element_color,
|
1340
|
+
"label": element_label if labels else None,
|
1341
|
+
"use_color_cycling": use_color_cycling,
|
1342
|
+
}
|
1343
|
+
|
1344
|
+
# Add attributes if requested
|
1345
|
+
if annotate and element:
|
1346
|
+
highlight_item["attributes_to_draw"] = {}
|
1347
|
+
for attr_name in annotate:
|
1348
|
+
try:
|
1349
|
+
attr_value = getattr(element, attr_name, None)
|
1350
|
+
if attr_value is not None:
|
1351
|
+
highlight_item["attributes_to_draw"][attr_name] = attr_value
|
1352
|
+
except:
|
1353
|
+
pass
|
1039
1354
|
|
1040
|
-
|
1041
|
-
logger.warning("No highlight data generated for show(). Rendering clean page.")
|
1042
|
-
# Render the page without any temporary highlights
|
1043
|
-
highlight_data_list = []
|
1355
|
+
highlight_data_list.append(highlight_item)
|
1044
1356
|
|
1045
|
-
|
1046
|
-
try:
|
1047
|
-
# Calculate crop bounding box in PDF coordinates if crop is requested
|
1357
|
+
# Calculate crop bbox if requested
|
1048
1358
|
crop_bbox = None
|
1049
1359
|
if crop:
|
1050
1360
|
try:
|
1051
|
-
|
1052
|
-
|
1053
|
-
|
1054
|
-
|
1055
|
-
|
1056
|
-
|
1361
|
+
# Get bboxes from all specs on this page
|
1362
|
+
bboxes = [spec["bbox"] for _, spec in element_specs if spec.get("bbox")]
|
1363
|
+
if bboxes:
|
1364
|
+
crop_bbox = (
|
1365
|
+
min(bbox[0] for bbox in bboxes),
|
1366
|
+
min(bbox[1] for bbox in bboxes),
|
1367
|
+
max(bbox[2] for bbox in bboxes),
|
1368
|
+
max(bbox[3] for bbox in bboxes),
|
1369
|
+
)
|
1057
1370
|
except Exception as bbox_err:
|
1058
|
-
logger.error(
|
1059
|
-
f"Error determining crop bbox for collection show: {bbox_err}",
|
1060
|
-
exc_info=True,
|
1061
|
-
)
|
1371
|
+
logger.error(f"Error determining crop bbox: {bbox_err}")
|
1062
1372
|
|
1063
|
-
|
1064
|
-
|
1065
|
-
|
1066
|
-
|
1067
|
-
|
1068
|
-
|
1069
|
-
|
1070
|
-
|
1071
|
-
|
1072
|
-
|
1073
|
-
|
1074
|
-
|
1075
|
-
|
1373
|
+
# Render this page
|
1374
|
+
try:
|
1375
|
+
img = service.render_preview(
|
1376
|
+
page_index=page.index,
|
1377
|
+
temporary_highlights=highlight_data_list,
|
1378
|
+
resolution=resolution,
|
1379
|
+
width=width,
|
1380
|
+
labels=labels,
|
1381
|
+
legend_position=legend_position,
|
1382
|
+
render_ocr=render_ocr,
|
1383
|
+
crop_bbox=crop_bbox,
|
1384
|
+
)
|
1385
|
+
|
1386
|
+
if img:
|
1387
|
+
page_images.append(img)
|
1388
|
+
except Exception as e:
|
1389
|
+
logger.error(
|
1390
|
+
f"Error rendering page {getattr(page, 'number', '?')}: {e}", exc_info=True
|
1391
|
+
)
|
1392
|
+
|
1393
|
+
if not page_images:
|
1394
|
+
logger.warning("Failed to render any pages")
|
1076
1395
|
return None
|
1077
1396
|
|
1397
|
+
if len(page_images) == 1:
|
1398
|
+
return page_images[0]
|
1399
|
+
|
1400
|
+
# Stack the images
|
1401
|
+
if stack_direction == "vertical":
|
1402
|
+
final_width = max(img.width for img in page_images)
|
1403
|
+
final_height = (
|
1404
|
+
sum(img.height for img in page_images) + (len(page_images) - 1) * stack_gap
|
1405
|
+
)
|
1406
|
+
|
1407
|
+
stacked_image = Image.new("RGB", (final_width, final_height), stack_background_color)
|
1408
|
+
|
1409
|
+
current_y = 0
|
1410
|
+
for img in page_images:
|
1411
|
+
# Center horizontally
|
1412
|
+
x_offset = (final_width - img.width) // 2
|
1413
|
+
stacked_image.paste(img, (x_offset, current_y))
|
1414
|
+
current_y += img.height + stack_gap
|
1415
|
+
else: # horizontal
|
1416
|
+
final_width = sum(img.width for img in page_images) + (len(page_images) - 1) * stack_gap
|
1417
|
+
final_height = max(img.height for img in page_images)
|
1418
|
+
|
1419
|
+
stacked_image = Image.new("RGB", (final_width, final_height), stack_background_color)
|
1420
|
+
|
1421
|
+
current_x = 0
|
1422
|
+
for img in page_images:
|
1423
|
+
# Center vertically
|
1424
|
+
y_offset = (final_height - img.height) // 2
|
1425
|
+
stacked_image.paste(img, (current_x, y_offset))
|
1426
|
+
current_x += img.width + stack_gap
|
1427
|
+
|
1428
|
+
return stacked_image
|
1429
|
+
|
1078
1430
|
def save(
|
1079
1431
|
self,
|
1080
1432
|
filename: str,
|
@@ -1110,8 +1462,8 @@ class ElementCollection(
|
|
1110
1462
|
else:
|
1111
1463
|
resolution = 144 # Default resolution when none specified
|
1112
1464
|
|
1113
|
-
# Use
|
1114
|
-
self.
|
1465
|
+
# Use export() to save the image
|
1466
|
+
self.export(
|
1115
1467
|
path=filename,
|
1116
1468
|
resolution=resolution,
|
1117
1469
|
width=width,
|
@@ -1121,42 +1473,6 @@ class ElementCollection(
|
|
1121
1473
|
)
|
1122
1474
|
return self
|
1123
1475
|
|
1124
|
-
def to_image(
|
1125
|
-
self,
|
1126
|
-
path: Optional[str] = None,
|
1127
|
-
resolution: Optional[float] = None,
|
1128
|
-
width: Optional[int] = None,
|
1129
|
-
labels: bool = True,
|
1130
|
-
legend_position: str = "right",
|
1131
|
-
render_ocr: bool = False,
|
1132
|
-
) -> Optional["Image.Image"]:
|
1133
|
-
"""
|
1134
|
-
Generate an image of the page with this collection's elements highlighted,
|
1135
|
-
optionally saving it to a file.
|
1136
|
-
|
1137
|
-
Args:
|
1138
|
-
path: Optional path to save the image to
|
1139
|
-
resolution: Resolution in DPI for rendering (uses global options if not specified, defaults to 144 DPI)
|
1140
|
-
width: Optional width for the output image in pixels (height calculated to maintain aspect ratio)
|
1141
|
-
labels: Whether to include a legend for labels
|
1142
|
-
legend_position: Position of the legend
|
1143
|
-
render_ocr: Whether to render OCR text with white background boxes
|
1144
|
-
|
1145
|
-
Returns:
|
1146
|
-
PIL Image of the page with elements highlighted, or None if no valid page
|
1147
|
-
"""
|
1148
|
-
# Get the page from the first element (if available)
|
1149
|
-
if self._elements and hasattr(self._elements[0], "page"):
|
1150
|
-
page = self._elements[0].page
|
1151
|
-
# Generate the image using to_image
|
1152
|
-
return page.to_image(
|
1153
|
-
path=path,
|
1154
|
-
resolution=resolution,
|
1155
|
-
width=width,
|
1156
|
-
labels=labels,
|
1157
|
-
legend_position=legend_position,
|
1158
|
-
render_ocr=render_ocr,
|
1159
|
-
)
|
1160
1476
|
return None
|
1161
1477
|
|
1162
1478
|
def _group_elements_by_attr(self, group_by: str) -> Dict[Any, List[T]]:
|
@@ -1216,17 +1532,57 @@ class ElementCollection(
|
|
1216
1532
|
return str(group_key)
|
1217
1533
|
|
1218
1534
|
def _get_element_highlight_params(
|
1219
|
-
self, element: T,
|
1535
|
+
self, element: T, annotate: Optional[List[str]]
|
1220
1536
|
) -> Optional[Dict]:
|
1221
1537
|
"""Extracts common parameters needed for highlighting a single element."""
|
1538
|
+
# For FlowRegions and other complex elements, use highlighting protocol
|
1539
|
+
if hasattr(element, "get_highlight_specs"):
|
1540
|
+
specs = element.get_highlight_specs()
|
1541
|
+
if not specs:
|
1542
|
+
logger.warning(f"Element {element} returned no highlight specs")
|
1543
|
+
return None
|
1544
|
+
|
1545
|
+
# For now, we'll use the first spec for the prepared data
|
1546
|
+
# The actual rendering will use all specs
|
1547
|
+
first_spec = specs[0]
|
1548
|
+
page = first_spec["page"]
|
1549
|
+
|
1550
|
+
base_data = {
|
1551
|
+
"page_index": first_spec["page_index"],
|
1552
|
+
"element": element,
|
1553
|
+
"annotate": annotate,
|
1554
|
+
"attributes_to_draw": {},
|
1555
|
+
"bbox": first_spec.get("bbox"),
|
1556
|
+
"polygon": first_spec.get("polygon"),
|
1557
|
+
"multi_spec": len(specs) > 1, # Flag to indicate multiple specs
|
1558
|
+
"all_specs": specs, # Store all specs for rendering
|
1559
|
+
}
|
1560
|
+
|
1561
|
+
# Extract attributes if requested
|
1562
|
+
if annotate:
|
1563
|
+
for attr_name in annotate:
|
1564
|
+
try:
|
1565
|
+
attr_value = getattr(element, attr_name, None)
|
1566
|
+
if attr_value is not None:
|
1567
|
+
base_data["attributes_to_draw"][attr_name] = attr_value
|
1568
|
+
except AttributeError:
|
1569
|
+
logger.warning(
|
1570
|
+
f"Attribute '{attr_name}' not found on element {element} for annotate"
|
1571
|
+
)
|
1572
|
+
|
1573
|
+
return base_data
|
1574
|
+
|
1575
|
+
# Fallback for regular elements with direct page access
|
1222
1576
|
if not hasattr(element, "page"):
|
1577
|
+
logger.warning(f"Element {element} has no page attribute and no highlighting protocol")
|
1223
1578
|
return None
|
1579
|
+
|
1224
1580
|
page = element.page
|
1225
1581
|
|
1226
1582
|
base_data = {
|
1227
1583
|
"page_index": page.index,
|
1228
1584
|
"element": element,
|
1229
|
-
"
|
1585
|
+
"annotate": annotate,
|
1230
1586
|
"attributes_to_draw": {},
|
1231
1587
|
"bbox": None,
|
1232
1588
|
"polygon": None,
|
@@ -1251,15 +1607,15 @@ class ElementCollection(
|
|
1251
1607
|
return None
|
1252
1608
|
|
1253
1609
|
# Extract attributes if requested
|
1254
|
-
if
|
1255
|
-
for attr_name in
|
1610
|
+
if annotate:
|
1611
|
+
for attr_name in annotate:
|
1256
1612
|
try:
|
1257
1613
|
attr_value = getattr(element, attr_name, None)
|
1258
1614
|
if attr_value is not None:
|
1259
1615
|
base_data["attributes_to_draw"][attr_name] = attr_value
|
1260
1616
|
except AttributeError:
|
1261
1617
|
logger.warning(
|
1262
|
-
f"Attribute '{attr_name}' not found on element {element} for
|
1618
|
+
f"Attribute '{attr_name}' not found on element {element} for annotate"
|
1263
1619
|
)
|
1264
1620
|
|
1265
1621
|
return base_data
|
@@ -1416,7 +1772,7 @@ class ElementCollection(
|
|
1416
1772
|
|
1417
1773
|
def correct_ocr(
|
1418
1774
|
self,
|
1419
|
-
|
1775
|
+
transform: Callable[[Any], Optional[str]],
|
1420
1776
|
max_workers: Optional[int] = None,
|
1421
1777
|
) -> "ElementCollection":
|
1422
1778
|
"""
|
@@ -1425,10 +1781,10 @@ class ElementCollection(
|
|
1425
1781
|
in parallel if `max_workers` is specified.
|
1426
1782
|
|
1427
1783
|
Iterates through elements currently in the collection. If an element's
|
1428
|
-
'source' attribute starts with 'ocr', it calls the `
|
1784
|
+
'source' attribute starts with 'ocr', it calls the `transform`
|
1429
1785
|
for that element, passing the element itself.
|
1430
1786
|
|
1431
|
-
The `
|
1787
|
+
The `transform` should contain the logic to:
|
1432
1788
|
1. Determine if the element needs correction.
|
1433
1789
|
2. Perform the correction (e.g., call an LLM).
|
1434
1790
|
3. Return the new text (`str`) or `None`.
|
@@ -1438,8 +1794,8 @@ class ElementCollection(
|
|
1438
1794
|
Elements without a source starting with 'ocr' are skipped.
|
1439
1795
|
|
1440
1796
|
Args:
|
1441
|
-
|
1442
|
-
|
1797
|
+
transform: A function accepting an element and returning
|
1798
|
+
`Optional[str]` (new text or None).
|
1443
1799
|
max_workers: The maximum number of worker threads to use for parallel
|
1444
1800
|
correction on each page. If None, defaults are used.
|
1445
1801
|
|
@@ -1449,7 +1805,7 @@ class ElementCollection(
|
|
1449
1805
|
# Delegate to the utility function
|
1450
1806
|
_apply_ocr_correction_to_elements(
|
1451
1807
|
elements=self._elements,
|
1452
|
-
correction_callback=
|
1808
|
+
correction_callback=transform,
|
1453
1809
|
caller_info=f"ElementCollection(len={len(self._elements)})", # Pass caller info
|
1454
1810
|
max_workers=max_workers,
|
1455
1811
|
)
|
@@ -1696,9 +2052,7 @@ class ElementCollection(
|
|
1696
2052
|
image_path = image_dir / image_filename
|
1697
2053
|
|
1698
2054
|
# Save image
|
1699
|
-
element.
|
1700
|
-
path=str(image_path), resolution=image_resolution, include_highlights=True
|
1701
|
-
)
|
2055
|
+
element.show(path=str(image_path), resolution=image_resolution)
|
1702
2056
|
|
1703
2057
|
# Add relative path to data
|
1704
2058
|
element_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
|
@@ -1986,8 +2340,8 @@ class ElementCollection(
|
|
1986
2340
|
# ------------------------------------------------------------------
|
1987
2341
|
def apply_ocr(
|
1988
2342
|
self,
|
1989
|
-
*,
|
1990
2343
|
function: Optional[Callable[["Region"], Optional[str]]] = None,
|
2344
|
+
*,
|
1991
2345
|
show_progress: bool = True,
|
1992
2346
|
**kwargs,
|
1993
2347
|
) -> "ElementCollection":
|
@@ -2043,1154 +2397,3 @@ class ElementCollection(
|
|
2043
2397
|
return self
|
2044
2398
|
|
2045
2399
|
# ------------------------------------------------------------------
|
2046
|
-
|
2047
|
-
|
2048
|
-
class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
2049
|
-
"""
|
2050
|
-
Represents a collection of Page objects, often from a single PDF document.
|
2051
|
-
Provides methods for batch operations on these pages.
|
2052
|
-
"""
|
2053
|
-
|
2054
|
-
def __init__(self, pages: Union[List[P], Sequence[P]]):
|
2055
|
-
"""
|
2056
|
-
Initialize a page collection.
|
2057
|
-
|
2058
|
-
Args:
|
2059
|
-
pages: List or sequence of Page objects (can be lazy)
|
2060
|
-
"""
|
2061
|
-
# Store the sequence as-is to preserve lazy behavior
|
2062
|
-
# Only convert to list if we need list-specific operations
|
2063
|
-
if hasattr(pages, '__iter__') and hasattr(pages, '__len__'):
|
2064
|
-
self.pages = pages
|
2065
|
-
else:
|
2066
|
-
# Fallback for non-sequence types
|
2067
|
-
self.pages = list(pages)
|
2068
|
-
|
2069
|
-
def __len__(self) -> int:
|
2070
|
-
"""Return the number of pages in the collection."""
|
2071
|
-
return len(self.pages)
|
2072
|
-
|
2073
|
-
def __getitem__(self, idx) -> Union[P, "PageCollection[P]"]:
|
2074
|
-
"""Support indexing and slicing."""
|
2075
|
-
if isinstance(idx, slice):
|
2076
|
-
return PageCollection(self.pages[idx])
|
2077
|
-
return self.pages[idx]
|
2078
|
-
|
2079
|
-
def __iter__(self) -> Iterator[P]:
|
2080
|
-
"""Support iteration."""
|
2081
|
-
return iter(self.pages)
|
2082
|
-
|
2083
|
-
def __repr__(self) -> str:
|
2084
|
-
"""Return a string representation showing the page count."""
|
2085
|
-
return f"<PageCollection(count={len(self)})>"
|
2086
|
-
|
2087
|
-
def _get_items_for_apply(self) -> Iterator[P]:
|
2088
|
-
"""
|
2089
|
-
Override ApplyMixin's _get_items_for_apply to preserve lazy behavior.
|
2090
|
-
|
2091
|
-
Returns an iterator that yields pages on-demand rather than materializing
|
2092
|
-
all pages at once, maintaining the lazy loading behavior.
|
2093
|
-
"""
|
2094
|
-
return iter(self.pages)
|
2095
|
-
|
2096
|
-
def _get_page_indices(self) -> List[int]:
|
2097
|
-
"""
|
2098
|
-
Get page indices without forcing materialization of pages.
|
2099
|
-
|
2100
|
-
Returns:
|
2101
|
-
List of page indices for the pages in this collection.
|
2102
|
-
"""
|
2103
|
-
# Handle different types of page sequences efficiently
|
2104
|
-
if hasattr(self.pages, '_indices'):
|
2105
|
-
# If it's a _LazyPageList (or slice), get indices directly
|
2106
|
-
return list(self.pages._indices)
|
2107
|
-
else:
|
2108
|
-
# Fallback: if pages are already materialized, get indices normally
|
2109
|
-
# This will force materialization but only if pages aren't lazy
|
2110
|
-
return [p.index for p in self.pages]
|
2111
|
-
|
2112
|
-
def extract_text(
|
2113
|
-
self,
|
2114
|
-
keep_blank_chars: bool = True,
|
2115
|
-
apply_exclusions: bool = True,
|
2116
|
-
strip: Optional[bool] = None,
|
2117
|
-
**kwargs,
|
2118
|
-
) -> str:
|
2119
|
-
"""
|
2120
|
-
Extract text from all pages in the collection.
|
2121
|
-
|
2122
|
-
Args:
|
2123
|
-
keep_blank_chars: Whether to keep blank characters (default: True)
|
2124
|
-
apply_exclusions: Whether to apply exclusion regions (default: True)
|
2125
|
-
strip: Whether to strip whitespace from the extracted text.
|
2126
|
-
**kwargs: Additional extraction parameters
|
2127
|
-
|
2128
|
-
Returns:
|
2129
|
-
Combined text from all pages
|
2130
|
-
"""
|
2131
|
-
texts = []
|
2132
|
-
for page in self.pages:
|
2133
|
-
text = page.extract_text(
|
2134
|
-
keep_blank_chars=keep_blank_chars,
|
2135
|
-
apply_exclusions=apply_exclusions,
|
2136
|
-
**kwargs,
|
2137
|
-
)
|
2138
|
-
texts.append(text)
|
2139
|
-
|
2140
|
-
combined = "\n".join(texts)
|
2141
|
-
|
2142
|
-
# Default strip behaviour: if caller picks, honour; else respect layout flag passed via kwargs.
|
2143
|
-
use_layout = kwargs.get("layout", False)
|
2144
|
-
strip_final = strip if strip is not None else (not use_layout)
|
2145
|
-
|
2146
|
-
if strip_final:
|
2147
|
-
combined = "\n".join(line.rstrip() for line in combined.splitlines()).strip()
|
2148
|
-
|
2149
|
-
return combined
|
2150
|
-
|
2151
|
-
def apply_ocr(
|
2152
|
-
self,
|
2153
|
-
engine: Optional[str] = None,
|
2154
|
-
# --- Common OCR Parameters (Direct Arguments) ---
|
2155
|
-
languages: Optional[List[str]] = None,
|
2156
|
-
min_confidence: Optional[float] = None, # Min confidence threshold
|
2157
|
-
device: Optional[str] = None,
|
2158
|
-
resolution: Optional[int] = None, # DPI for rendering
|
2159
|
-
apply_exclusions: bool = True, # New parameter
|
2160
|
-
replace: bool = True, # Whether to replace existing OCR elements
|
2161
|
-
# --- Engine-Specific Options ---
|
2162
|
-
options: Optional[Any] = None, # e.g., EasyOCROptions(...)
|
2163
|
-
) -> "PageCollection[P]":
|
2164
|
-
"""
|
2165
|
-
Applies OCR to all pages within this collection using batch processing.
|
2166
|
-
|
2167
|
-
This delegates the work to the parent PDF object's `apply_ocr` method.
|
2168
|
-
|
2169
|
-
Args:
|
2170
|
-
engine: Name of the OCR engine (e.g., 'easyocr', 'paddleocr').
|
2171
|
-
languages: List of language codes (e.g., ['en', 'fr'], ['en', 'ch']).
|
2172
|
-
**Must be codes understood by the specific selected engine.**
|
2173
|
-
No mapping is performed.
|
2174
|
-
min_confidence: Minimum confidence threshold for detected text (0.0 to 1.0).
|
2175
|
-
device: Device to run OCR on (e.g., 'cpu', 'cuda', 'mps').
|
2176
|
-
resolution: DPI resolution to render page images before OCR (e.g., 150, 300).
|
2177
|
-
apply_exclusions: If True (default), render page images for OCR with
|
2178
|
-
excluded areas masked (whited out). If False, OCR
|
2179
|
-
the raw page images without masking exclusions.
|
2180
|
-
replace: If True (default), remove any existing OCR elements before
|
2181
|
-
adding new ones. If False, add new OCR elements to existing ones.
|
2182
|
-
options: An engine-specific options object (e.g., EasyOCROptions) or dict.
|
2183
|
-
|
2184
|
-
Returns:
|
2185
|
-
Self for method chaining.
|
2186
|
-
|
2187
|
-
Raises:
|
2188
|
-
RuntimeError: If pages lack a parent PDF or parent lacks `apply_ocr`.
|
2189
|
-
(Propagates exceptions from PDF.apply_ocr)
|
2190
|
-
"""
|
2191
|
-
if not self.pages:
|
2192
|
-
logger.warning("Cannot apply OCR to an empty PageCollection.")
|
2193
|
-
return self
|
2194
|
-
|
2195
|
-
# Assume all pages share the same parent PDF object
|
2196
|
-
first_page = self.pages[0]
|
2197
|
-
if not hasattr(first_page, "_parent") or not first_page._parent:
|
2198
|
-
raise RuntimeError("Pages in this collection do not have a parent PDF reference.")
|
2199
|
-
|
2200
|
-
parent_pdf = first_page._parent
|
2201
|
-
|
2202
|
-
if not hasattr(parent_pdf, "apply_ocr") or not callable(parent_pdf.apply_ocr):
|
2203
|
-
raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")
|
2204
|
-
|
2205
|
-
# Get the 0-based indices of the pages in this collection
|
2206
|
-
page_indices = self._get_page_indices()
|
2207
|
-
|
2208
|
-
logger.info(f"Applying OCR via parent PDF to page indices: {page_indices} in collection.")
|
2209
|
-
|
2210
|
-
# Delegate the batch call to the parent PDF object, passing direct args and apply_exclusions
|
2211
|
-
parent_pdf.apply_ocr(
|
2212
|
-
pages=page_indices,
|
2213
|
-
engine=engine,
|
2214
|
-
languages=languages,
|
2215
|
-
min_confidence=min_confidence, # Pass the renamed parameter
|
2216
|
-
device=device,
|
2217
|
-
resolution=resolution,
|
2218
|
-
apply_exclusions=apply_exclusions, # Pass down
|
2219
|
-
replace=replace, # Pass the replace parameter
|
2220
|
-
options=options,
|
2221
|
-
)
|
2222
|
-
# The PDF method modifies the Page objects directly by adding elements.
|
2223
|
-
|
2224
|
-
return self # Return self for chaining
|
2225
|
-
|
2226
|
-
@overload
|
2227
|
-
def find(
|
2228
|
-
self,
|
2229
|
-
*,
|
2230
|
-
text: str,
|
2231
|
-
contains: str = "all",
|
2232
|
-
apply_exclusions: bool = True,
|
2233
|
-
regex: bool = False,
|
2234
|
-
case: bool = True,
|
2235
|
-
**kwargs,
|
2236
|
-
) -> Optional[T]: ...
|
2237
|
-
|
2238
|
-
@overload
|
2239
|
-
def find(
|
2240
|
-
self,
|
2241
|
-
selector: str,
|
2242
|
-
*,
|
2243
|
-
contains: str = "all",
|
2244
|
-
apply_exclusions: bool = True,
|
2245
|
-
regex: bool = False,
|
2246
|
-
case: bool = True,
|
2247
|
-
**kwargs,
|
2248
|
-
) -> Optional[T]: ...
|
2249
|
-
|
2250
|
-
def find(
|
2251
|
-
self,
|
2252
|
-
selector: Optional[str] = None,
|
2253
|
-
*,
|
2254
|
-
text: Optional[str] = None,
|
2255
|
-
contains: str = "all",
|
2256
|
-
apply_exclusions: bool = True,
|
2257
|
-
regex: bool = False,
|
2258
|
-
case: bool = True,
|
2259
|
-
**kwargs,
|
2260
|
-
) -> Optional[T]:
|
2261
|
-
"""
|
2262
|
-
Find the first element matching the selector OR text across all pages in the collection.
|
2263
|
-
|
2264
|
-
Provide EITHER `selector` OR `text`, but not both.
|
2265
|
-
|
2266
|
-
Args:
|
2267
|
-
selector: CSS-like selector string.
|
2268
|
-
text: Text content to search for (equivalent to 'text:contains(...)').
|
2269
|
-
contains: How to determine if elements are inside: 'all' (fully inside),
|
2270
|
-
'any' (any overlap), or 'center' (center point inside).
|
2271
|
-
(default: "all")
|
2272
|
-
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
2273
|
-
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
2274
|
-
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
2275
|
-
**kwargs: Additional filter parameters.
|
2276
|
-
|
2277
|
-
Returns:
|
2278
|
-
First matching element or None.
|
2279
|
-
"""
|
2280
|
-
# Input validation happens within page.find
|
2281
|
-
for page in self.pages:
|
2282
|
-
element = page.find(
|
2283
|
-
selector=selector,
|
2284
|
-
text=text,
|
2285
|
-
contains=contains,
|
2286
|
-
apply_exclusions=apply_exclusions,
|
2287
|
-
regex=regex,
|
2288
|
-
case=case,
|
2289
|
-
**kwargs,
|
2290
|
-
)
|
2291
|
-
if element:
|
2292
|
-
return element
|
2293
|
-
return None
|
2294
|
-
|
2295
|
-
@overload
|
2296
|
-
def find_all(
|
2297
|
-
self,
|
2298
|
-
*,
|
2299
|
-
text: str,
|
2300
|
-
contains: str = "all",
|
2301
|
-
apply_exclusions: bool = True,
|
2302
|
-
regex: bool = False,
|
2303
|
-
case: bool = True,
|
2304
|
-
**kwargs,
|
2305
|
-
) -> "ElementCollection": ...
|
2306
|
-
|
2307
|
-
@overload
|
2308
|
-
def find_all(
|
2309
|
-
self,
|
2310
|
-
selector: str,
|
2311
|
-
*,
|
2312
|
-
contains: str = "all",
|
2313
|
-
apply_exclusions: bool = True,
|
2314
|
-
regex: bool = False,
|
2315
|
-
case: bool = True,
|
2316
|
-
**kwargs,
|
2317
|
-
) -> "ElementCollection": ...
|
2318
|
-
|
2319
|
-
def find_all(
|
2320
|
-
self,
|
2321
|
-
selector: Optional[str] = None,
|
2322
|
-
*,
|
2323
|
-
text: Optional[str] = None,
|
2324
|
-
contains: str = "all",
|
2325
|
-
apply_exclusions: bool = True,
|
2326
|
-
regex: bool = False,
|
2327
|
-
case: bool = True,
|
2328
|
-
**kwargs,
|
2329
|
-
) -> "ElementCollection":
|
2330
|
-
"""
|
2331
|
-
Find all elements matching the selector OR text across all pages in the collection.
|
2332
|
-
|
2333
|
-
Provide EITHER `selector` OR `text`, but not both.
|
2334
|
-
|
2335
|
-
Args:
|
2336
|
-
selector: CSS-like selector string.
|
2337
|
-
text: Text content to search for (equivalent to 'text:contains(...)').
|
2338
|
-
contains: How to determine if elements are inside: 'all' (fully inside),
|
2339
|
-
'any' (any overlap), or 'center' (center point inside).
|
2340
|
-
(default: "all")
|
2341
|
-
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
2342
|
-
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
2343
|
-
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
2344
|
-
**kwargs: Additional filter parameters.
|
2345
|
-
|
2346
|
-
Returns:
|
2347
|
-
ElementCollection with matching elements from all pages.
|
2348
|
-
"""
|
2349
|
-
all_elements = []
|
2350
|
-
# Input validation happens within page.find_all
|
2351
|
-
for page in self.pages:
|
2352
|
-
elements = page.find_all(
|
2353
|
-
selector=selector,
|
2354
|
-
text=text,
|
2355
|
-
contains=contains,
|
2356
|
-
apply_exclusions=apply_exclusions,
|
2357
|
-
regex=regex,
|
2358
|
-
case=case,
|
2359
|
-
**kwargs,
|
2360
|
-
)
|
2361
|
-
if elements:
|
2362
|
-
all_elements.extend(elements.elements)
|
2363
|
-
|
2364
|
-
return ElementCollection(all_elements)
|
2365
|
-
|
2366
|
-
def correct_ocr(
|
2367
|
-
self,
|
2368
|
-
correction_callback: Callable[[Any], Optional[str]],
|
2369
|
-
max_workers: Optional[int] = None,
|
2370
|
-
) -> "PageCollection[P]":
|
2371
|
-
"""
|
2372
|
-
Applies corrections to OCR-generated text elements across all pages
|
2373
|
-
in this collection using a user-provided callback function, executed
|
2374
|
-
in parallel if `max_workers` is specified.
|
2375
|
-
|
2376
|
-
This method delegates to the parent PDF's `correct_ocr` method,
|
2377
|
-
targeting all pages within this collection.
|
2378
|
-
|
2379
|
-
Args:
|
2380
|
-
correction_callback: A function that accepts a single argument (an element
|
2381
|
-
object) and returns `Optional[str]` (new text or None).
|
2382
|
-
max_workers: The maximum number of worker threads to use for parallel
|
2383
|
-
correction on each page. If None, defaults are used.
|
2384
|
-
|
2385
|
-
Returns:
|
2386
|
-
Self for method chaining.
|
2387
|
-
|
2388
|
-
Raises:
|
2389
|
-
RuntimeError: If the collection is empty, pages lack a parent PDF reference,
|
2390
|
-
or the parent PDF lacks the `correct_ocr` method.
|
2391
|
-
"""
|
2392
|
-
if not self.pages:
|
2393
|
-
logger.warning("Cannot correct OCR for an empty PageCollection.")
|
2394
|
-
# Return self even if empty to maintain chaining consistency
|
2395
|
-
return self
|
2396
|
-
|
2397
|
-
# Assume all pages share the same parent PDF object
|
2398
|
-
parent_pdf = self.pages[0]._parent
|
2399
|
-
if (
|
2400
|
-
not parent_pdf
|
2401
|
-
or not hasattr(parent_pdf, "correct_ocr")
|
2402
|
-
or not callable(parent_pdf.correct_ocr)
|
2403
|
-
):
|
2404
|
-
raise RuntimeError(
|
2405
|
-
"Parent PDF reference not found or parent PDF lacks the required 'correct_ocr' method."
|
2406
|
-
)
|
2407
|
-
|
2408
|
-
page_indices = self._get_page_indices()
|
2409
|
-
logger.info(
|
2410
|
-
f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices} with max_workers={max_workers}."
|
2411
|
-
)
|
2412
|
-
|
2413
|
-
# Delegate the call to the parent PDF object for the relevant pages
|
2414
|
-
# Pass the max_workers parameter down
|
2415
|
-
parent_pdf.correct_ocr(
|
2416
|
-
correction_callback=correction_callback,
|
2417
|
-
pages=page_indices,
|
2418
|
-
max_workers=max_workers, # Pass it here
|
2419
|
-
)
|
2420
|
-
|
2421
|
-
return self
|
2422
|
-
|
2423
|
-
def get_sections(
    self,
    start_elements=None,
    end_elements=None,
    new_section_on_page_break=False,
    boundary_inclusion="both",
) -> "ElementCollection[Region]":
    """
    Extract sections from a page collection based on start/end elements.

    Args:
        start_elements: Elements or selector string that mark the start of sections
        end_elements: Elements or selector string that mark the end of sections
        new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
        boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')

    Returns:
        ElementCollection of the extracted sections. Sections that stay on one
        page are whatever ``page.get_section_between`` returns (or a Region that
        runs to the bottom of the page); sections whose start and end live on
        different pages are FlowRegion objects. The collection is empty when no
        start elements are found.

    Note:
        The ``start_elements`` / ``end_elements`` arguments are copied before
        the virtual page-boundary markers are added, so the caller's
        containers are never modified.
    """

    def _as_list(boundary_spec):
        """Resolve a selector string / collection / iterable into a fresh list."""
        if boundary_spec is None:
            return []
        if isinstance(boundary_spec, str):
            boundary_spec = self.find_all(boundary_spec)
        # Collections expose their members through ``.elements``; plain
        # iterables are copied as-is.
        return list(getattr(boundary_spec, "elements", boundary_spec))

    start_elements = _as_list(start_elements)
    end_elements = _as_list(end_elements)

    # Without any start marker there is nothing to build.
    if not start_elements:
        return ElementCollection([])

    # Local import to avoid a top-level import cycle.
    from natural_pdf.elements.region import Region

    if new_section_on_page_break:
        # For every page boundary add a virtual "end" strip at the bottom of the
        # current page and a virtual "start" strip at the top of the next page.
        for page, next_page in zip(self.pages, self.pages[1:]):
            bottom_region = Region(page, (0, page.height - 1, page.width, page.height))
            bottom_region.is_page_boundary = True
            end_elements.append(bottom_region)

            top_region = Region(next_page, (0, 0, next_page.width, 1))
            top_region.is_page_boundary = True
            start_elements.append(top_region)

    # All elements of all pages in document order:
    # page index, then vertical position, then horizontal position.
    all_elements = [el for page in self.pages for el in page.get_elements()]
    all_elements.sort(key=lambda e: (e.page.index, e.top, e.x0))

    def _make_boundary(element, kind):
        """Describe *element* as a boundary dict, or return None if it is unknown."""
        try:
            idx = all_elements.index(element)
        except ValueError:
            if not getattr(element, "is_page_boundary", False):
                return None  # Neither a page element nor a virtual marker
            idx = -1  # Special index for virtual page boundaries
        return {
            "index": idx,
            "element": element,
            "type": kind,
            "page_idx": element.page.index,
        }

    section_boundaries = []
    for kind, candidates in (("start", start_elements), ("end", end_elements)):
        for element in candidates:
            boundary = _make_boundary(element, kind)
            if boundary is not None:
                section_boundaries.append(boundary)

    # Sort by page, then by document position. Virtual starts sort first on
    # their page and virtual ends sort last.
    section_boundaries.sort(
        key=lambda x: (
            x["page_idx"],
            x["index"] if x["index"] != -1 else (0 if x["type"] == "start" else float("inf")),
        )
    )

    def _build_flow_region(start_el, end_el):
        """Return a FlowRegion that covers from *start_el* to *end_el* (inclusive).

        If *end_el* is None, the region continues to the bottom of the last
        page in this PageCollection."""
        # Local imports to avoid top-level cycles
        from natural_pdf.flows.element import FlowElement
        from natural_pdf.flows.flow import Flow
        from natural_pdf.flows.region import FlowRegion

        start_pg = start_el.page
        end_pg = end_el.page if end_el is not None else self.pages[-1]

        # Slice of the first page: from the start element down to the page bottom.
        parts = [Region(start_pg, (0, start_el.top, start_pg.width, start_pg.height))]

        # Full middle pages. Select them by comparing page indices rather than
        # by subscripting self.pages with a document index: the collection may
        # hold only a subset of the document, in which case positions inside
        # self.pages do not line up with page.index.
        for mid_pg in self.pages:
            if start_pg.index < mid_pg.index < end_pg.index:
                parts.append(Region(mid_pg, (0, 0, mid_pg.width, mid_pg.height)))

        # Slice of the last page (if distinct): from the top to the end element.
        if end_pg is not start_pg:
            bottom = end_el.bottom if end_el is not None else end_pg.height
            parts.append(Region(end_pg, (0, 0, end_pg.width, bottom)))

        flow = Flow(segments=parts, arrangement="vertical")
        src_fe = FlowElement(physical_object=start_el, flow=flow)
        return FlowRegion(
            flow=flow,
            constituent_regions=parts,
            source_flow_element=src_fe,
            boundary_element_found=end_el,
        )

    def _region_to_page_bottom(start_el):
        """Return a Region from *start_el* down to the bottom of its own page."""
        pg = start_el.page
        region = Region(pg, (0, start_el.top, pg.width, pg.height))
        region.start_element = start_el
        return region

    sections = []
    current_start = None

    for boundary in section_boundaries:
        # A start boundary while no section is open: open one.
        if boundary["type"] == "start" and current_start is None:
            current_start = boundary

        # An end boundary while a section is open: close it.
        elif boundary["type"] == "end" and current_start is not None:
            start_element = current_start["element"]
            end_element = boundary["element"]

            if start_element.page == end_element.page:
                sections.append(
                    start_element.page.get_section_between(
                        start_element, end_element, boundary_inclusion
                    )
                )
            else:
                sections.append(_build_flow_region(start_element, end_element))

            current_start = None

        # Another start while a section is open and no end markers exist at all:
        # the new start implicitly terminates the open section.
        elif boundary["type"] == "start" and current_start is not None and not end_elements:
            start_element = current_start["element"]

            if start_element.page == boundary["element"].page:
                # all_elements is already in document order, so the filtered
                # list is ordered by (top, x0) within the page.
                page_elements = [e for e in all_elements if e.page == start_element.page]

                # The section ends at the last element before the new start.
                try:
                    end_idx = page_elements.index(boundary["element"]) - 1
                except ValueError:
                    end_idx = -1
                end_element = page_elements[end_idx] if end_idx >= 0 else None

                sections.append(
                    start_element.page.get_section_between(
                        start_element, end_element, boundary_inclusion
                    )
                )
            else:
                # The new start is on another page: run to the end of this page.
                sections.append(_region_to_page_bottom(start_element))

            current_start = boundary

    # Close the section that is still open after the last boundary.
    if current_start is not None:
        start_element = current_start["element"]

        if end_elements:
            # With end markers in play an explicit end is needed: use the last
            # element (in document order) on the last page of the collection.
            last_page = self.pages[-1]
            last_page_elements = [e for e in all_elements if e.page == last_page]
            end_element = last_page_elements[-1] if last_page_elements else None
            sections.append(_build_flow_region(start_element, end_element))
        else:
            # With start markers only, run to the end of the start's page.
            sections.append(_region_to_page_bottom(start_element))

    return ElementCollection(sections)
|
2677
|
-
|
2678
|
-
def _gather_analysis_data(
    self,
    analysis_keys: List[str],
    include_content: bool,
    include_images: bool,
    image_dir: Optional[Path],
    image_format: str,
    image_resolution: int,
) -> List[Dict[str, Any]]:
    """
    Build one flat record per page with its metadata and analysis results.

    Args:
        analysis_keys: Keys in each page's ``analyses`` dictionary to export
        include_content: Whether to add the page's extracted text under "content"
        include_images: Whether to render each page to ``image_dir`` and record the path
        image_dir: Directory the page images are written to
        image_format: File extension used for the saved images
        image_resolution: Resolution passed to ``page.save_image``

    Returns:
        List of dictionaries, one per page. Analysis values are stored under
        "<analysis key>.<field>" keys.

    Raises:
        KeyError: If a page has analyses but one of ``analysis_keys`` is missing.
    """
    pages = self.elements
    if not pages:
        logger.warning("No pages found in collection")
        return []

    records = []
    for page in pages:
        # Basic page information
        record = {
            "page_number": page.number,
            "page_index": page.index,
            "width": page.width,
            "height": page.height,
        }

        # Source PDF information, when the page knows its PDF
        if hasattr(page, "pdf") and page.pdf:
            source_path = page.pdf.path
            record["pdf_path"] = source_path
            record["pdf_filename"] = Path(source_path).name

        # Extracted text; failures are logged and yield an empty string
        if include_content:
            try:
                record["content"] = page.extract_text(preserve_whitespace=True)
            except Exception as e:
                logger.error(f"Error extracting text from page {page.number}: {e}")
                record["content"] = ""

        # Rendered page image; failures are logged and yield a None path
        if include_images:
            try:
                has_pdf = hasattr(page, "pdf") and page.pdf
                pdf_name = Path(page.pdf.path).stem if has_pdf else "unknown"
                image_path = image_dir / f"{pdf_name}_page_{page.number}.{image_format}"

                page.save_image(
                    str(image_path), resolution=image_resolution, include_highlights=True
                )

                # Store the path relative to the image directory's parent
                record["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
            except Exception as e:
                logger.error(f"Error saving image for page {page.number}: {e}")
                record["image_path"] = None

        # Analysis results, flattened with the analysis key as prefix
        if hasattr(page, "analyses") and page.analyses:
            for key in analysis_keys:
                if key not in page.analyses:
                    raise KeyError(f"Analysis key '{key}' not found in page {page.number}")

                result = page.analyses[key]
                if hasattr(result, "to_dict"):
                    flattened = result.to_dict()
                else:
                    try:
                        # Dict-like results are used directly
                        flattened = dict(result)
                    except (TypeError, ValueError):
                        # Last resort: keep the string form
                        flattened = {"raw_result": str(result)}

                record.update({f"{key}.{name}": value for name, value in flattened.items()})

        records.append(record)

    return records
|
2778
|
-
|
2779
|
-
# --- Deskew Method --- #
|
2780
|
-
|
2781
|
-
def deskew(
    self,
    resolution: int = 300,
    detection_resolution: int = 72,
    force_overwrite: bool = False,
    **deskew_kwargs,
) -> "PDF":
    """
    Build a new PDF object holding deskewed versions of this collection's pages.

    The work is handed to the ``deskew`` method of the first page's parent
    PDF object, restricted to the page indices of this collection.

    Important: The returned PDF is image-based. Any existing text, OCR results,
    annotations, or other elements from the original pages will *not* be carried over.

    Args:
        resolution: DPI resolution for rendering the output deskewed pages.
        detection_resolution: DPI resolution used for skew detection if angles are not
                              already cached on the page objects.
        force_overwrite: If False (default), raises a ValueError if any target page
                         already contains processed elements (text, OCR, regions) to
                         prevent accidental data loss. Set to True to proceed anyway.
        **deskew_kwargs: Additional keyword arguments forwarded unchanged to the
                         parent PDF's ``deskew`` (e.g., `max_angle`, `num_peaks`).

    Returns:
        Whatever the parent PDF's ``deskew`` returns (a new PDF object).

    Raises:
        ValueError: If the collection is empty (other ValueErrors may come from PDF.deskew).
        RuntimeError: If the first page lacks a parent PDF reference, or the parent
                      PDF has no callable ``deskew``.
    """
    if not self.pages:
        logger.warning("Cannot deskew an empty PageCollection.")
        raise ValueError("Cannot deskew an empty PageCollection.")

    # All pages are assumed to share the parent PDF of the first page.
    parent_pdf = self.pages[0]._parent

    can_delegate = (
        bool(parent_pdf) and hasattr(parent_pdf, "deskew") and callable(parent_pdf.deskew)
    )
    if not can_delegate:
        raise RuntimeError(
            "Parent PDF reference not found or parent PDF lacks the required 'deskew' method."
        )

    # Indices of the pages held by this collection
    page_indices = self._get_page_indices()
    logger.info(
        f"PageCollection: Delegating deskew to parent PDF for page indices: {page_indices}"
    )

    # Hand everything over to the parent PDF for just these pages
    deskewed_pdf = parent_pdf.deskew(
        pages=page_indices,
        resolution=resolution,
        detection_resolution=detection_resolution,
        force_overwrite=force_overwrite,
        **deskew_kwargs,
    )
    return deskewed_pdf
|
2848
|
-
|
2849
|
-
# --- End Deskew Method --- #
|
2850
|
-
|
2851
|
-
def to_image(
    self,
    page_width: Optional[int] = None,
    cols: Optional[int] = 4,
    rows: Optional[int] = None,
    max_pages: Optional[int] = None,
    spacing: int = 10,
    add_labels: bool = True,
    show_category: bool = False,
) -> Optional["Image.Image"]:
    """
    Generate a grid of page images for this collection.

    Args:
        page_width: Width in pixels for rendering individual pages. When None,
            ``natural_pdf.options.image.width`` is used, falling back to 300.
        cols: Number of columns in grid (default: 4)
        rows: Number of rows in grid (calculated automatically if None)
        max_pages: Maximum number of pages to include (default: all)
        spacing: Spacing between page thumbnails in pixels
        add_labels: Whether to add page number labels
        show_category: Whether to add category and confidence labels (if available)

    Returns:
        PIL Image of the page grid, or None if Pillow is missing, the collection
        is empty, or no page could be rendered. If both ``rows`` and ``cols``
        are given and the grid is too small, the surplus pages are left out.
    """
    # Determine default page width from global options if not explicitly provided
    if page_width is None:
        try:
            import natural_pdf

            page_width = natural_pdf.options.image.width or 300
        except Exception:
            # Fallback if natural_pdf import fails in some edge context
            page_width = 300

    try:
        from PIL import Image, ImageDraw, ImageFont
    except ImportError:
        logger.error(
            "Pillow library not found, required for to_image(). Install with 'pip install Pillow'"
        )
        return None

    if not self.pages:
        logger.warning("Cannot generate image for empty PageCollection")
        return None

    def _load_label_font():
        """Return a 16pt label font, or None when no font can be loaded."""
        try:
            # Try a commonly available TrueType font first
            return ImageFont.truetype("DejaVuSans.ttf", 16)
        except IOError:
            pass
        try:
            return ImageFont.load_default(16)
        except TypeError:
            # Pillow releases before 10.1 do not accept a size argument.
            try:
                return ImageFont.load_default()
            except IOError:
                return None
        except IOError:
            return None

    def _classification_line(page):
        """Return "<label> (<score>)" for the page's classification, or None."""
        analyses = getattr(page, "analyses", None)
        if not analyses or "classification" not in analyses:
            return None
        result = analyses["classification"]
        # Results may be plain dicts or objects exposing label/score attributes.
        if isinstance(result, dict):
            category = result.get("label")
            confidence = result.get("score")
        else:
            category = getattr(result, "label", None)
            confidence = getattr(result, "score", None)
        if category is None or confidence is None:
            return None
        try:
            return f"{category} ({confidence:.2f})"
        except (TypeError, ValueError):
            return None  # Ignore formatting errors

    # Limit pages if max_pages is specified
    pages_to_render = self.pages[:max_pages] if max_pages else self.pages

    # Load the font once, outside the render loop
    font = _load_label_font() if add_labels else None
    if add_labels and font is None:
        logger.warning("Default font not found. Labels cannot be added.")
        add_labels = False

    # Render individual page images
    page_images = []
    for page in pages_to_render:
        try:
            # Render with highlights for visual context
            img = page.to_image(width=page_width, include_highlights=True)
            if img is None:
                logger.warning(f"Failed to generate image for page {page.number}. Skipping.")
                continue
        except Exception as img_err:
            logger.error(
                f"Error generating image for page {page.number}: {img_err}", exc_info=True
            )
            continue

        if add_labels and font:
            draw = ImageDraw.Draw(img)
            pdf_name = (
                Path(page.pdf.path).stem
                if hasattr(page, "pdf") and page.pdf and hasattr(page.pdf, "path")
                else ""
            )
            label_text = f"p{page.number}"
            if pdf_name:
                label_text += f" - {pdf_name}"

            if show_category:
                category_line = _classification_line(page)
                if category_line is not None:
                    # A real line break, so the label is drawn on two lines
                    label_text += f"\n{category_line}"

            try:
                # textbbox needs Pillow 8+
                bbox = draw.textbbox((5, 5), label_text, font=font, spacing=2)
                bg_rect = (
                    max(0, bbox[0] - 2),
                    max(0, bbox[1] - 2),
                    min(img.width, bbox[2] + 2),
                    min(img.height, bbox[3] + 2),
                )

                # Draw a semi-transparent white background behind the label
                overlay = Image.new("RGBA", img.size, (255, 255, 255, 0))
                draw_overlay = ImageDraw.Draw(overlay)
                draw_overlay.rectangle(bg_rect, fill=(255, 255, 255, 180))
                img = Image.alpha_composite(img.convert("RGBA"), overlay).convert("RGB")
                draw = ImageDraw.Draw(img)  # The image object changed; rebuild the drawer

                draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
            except AttributeError:
                # Older Pillow without textbbox: fixed-size background, so the
                # label may not be perfectly framed.
                draw.rectangle((2, 2, 150, 40), fill=(255, 255, 255, 180))
                draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
            except Exception as draw_err:
                logger.error(
                    f"Error drawing label on page {page.number}: {draw_err}", exc_info=True
                )

        page_images.append(img)

    if not page_images:
        logger.warning("No page images were successfully rendered for the grid.")
        return None

    # Calculate grid dimensions if not provided
    num_images = len(page_images)
    if not rows and not cols:
        cols = min(4, int(num_images**0.5) + 1)
        rows = (num_images + cols - 1) // cols
    elif rows and not cols:
        cols = (num_images + rows - 1) // rows
    elif cols and not rows:
        rows = (num_images + cols - 1) // cols
    cols = max(1, cols if cols else 1)  # Ensure at least 1
    rows = max(1, rows if rows else 1)

    # Uniform cell size: the largest rendered page in each dimension
    max_width = max(img.width for img in page_images)
    max_height = max(img.height for img in page_images)

    # Create grid image on a light gray background
    grid_width = cols * max_width + (cols + 1) * spacing
    grid_height = rows * max_height + (rows + 1) * spacing
    grid_img = Image.new("RGB", (grid_width, grid_height), (220, 220, 220))

    # Place images row by row; anything beyond the grid capacity is dropped
    for i, img in enumerate(page_images[: rows * cols]):
        row, col = divmod(i, cols)
        x = col * max_width + (col + 1) * spacing
        y = row * max_height + (row + 1) * spacing
        grid_img.paste(img, (x, y))

    return grid_img
|
3052
|
-
|
3053
|
-
def save_pdf(
    self,
    output_path: Union[str, Path],
    ocr: bool = False,
    original: bool = False,
    dpi: int = 300,
):
    """
    Saves the pages in this collection to a new PDF file.

    Choose one saving mode:
    - `ocr=True`: Creates a new, image-based PDF using OCR results. This
      makes the text generated during the natural-pdf session searchable,
      but loses original vector content. Requires 'ocr-export' extras.
    - `original=True`: Extracts the original pages from the source PDF,
      preserving all vector content, fonts, and annotations. OCR results
      from the natural-pdf session are NOT included. Requires 'ocr-export' extras.

    Args:
        output_path: Path to save the new PDF file.
        ocr: If True, save as a searchable, image-based PDF using OCR data.
        original: If True, save the original, vector-based pages.
        dpi: Resolution (dots per inch) used only when ocr=True for
             rendering page images and aligning the text layer.

    Raises:
        ValueError: If the collection is empty, or if neither or both of
                    'ocr' and 'original' are set. The exporter may raise
                    further ValueErrors of its own.
        ImportError: If the exporter for the chosen mode is unavailable
                     (its optional dependencies are not installed).
        RuntimeError: If creating the searchable (ocr=True) PDF fails.
    """
    if not self.pages:
        raise ValueError("Cannot save an empty PageCollection.")

    # Exactly one mode must be selected. Compare truthiness rather than using
    # bitwise XOR so that truthy non-bool flags are handled correctly.
    if bool(ocr) == bool(original):
        raise ValueError("Exactly one of 'ocr' or 'original' must be True.")

    output_path_str = str(Path(output_path))

    def _has_vector_content(page):
        """True if the page has rects/lines/curves or any non-OCR chars/words."""
        return bool(
            hasattr(page, "rects")
            and page.rects
            or hasattr(page, "lines")
            and page.lines
            or hasattr(page, "curves")
            and page.curves
            or (
                hasattr(page, "chars")
                and any(getattr(el, "source", None) != "ocr" for el in page.chars)
            )
            or (
                hasattr(page, "words")
                and any(getattr(el, "source", None) != "ocr" for el in page.words)
            )
        )

    def _has_ocr_text(page):
        """True if the page holds text elements whose source is OCR."""
        if hasattr(page, "find_all"):
            # find_all returns a collection; an empty one is falsy
            return bool(page.find_all("text[source=ocr]"))
        if hasattr(page, "words"):
            return any(getattr(el, "source", None) == "ocr" for el in page.words)
        return False

    if ocr:
        if create_searchable_pdf is None:
            raise ImportError(
                "Saving with ocr=True requires 'pikepdf' and 'Pillow'. "
                'Install with: pip install "natural-pdf[ocr-export]"'
            )

        # Warn that vector content cannot survive an image-based export
        if any(_has_vector_content(page) for page in self.pages):
            logger.warning(
                "Warning: Saving with ocr=True creates an image-based PDF. "
                "Original vector elements (rects, lines, non-OCR text/chars) "
                "on selected pages will not be preserved in the output file."
            )

        logger.info(f"Saving searchable PDF (OCR text layer) to: {output_path_str}")
        try:
            # The exporter takes this PageCollection itself as the source
            create_searchable_pdf(self, output_path_str, dpi=dpi)
        except Exception as e:
            logger.error(f"Failed to create searchable PDF: {e}", exc_info=True)
            # Re-raise as RuntimeError for a consistent failure type
            raise RuntimeError(f"Failed to create searchable PDF: {e}") from e
        return

    # original=True
    if create_original_pdf is None:
        raise ImportError(
            "Saving with original=True requires 'pikepdf'. "
            'Install with: pip install "natural-pdf[ocr-export]"'
        )

    # Warn that session OCR text is not part of the original pages
    if any(_has_ocr_text(page) for page in self.pages):
        logger.warning(
            "Warning: Saving with original=True preserves original page content. "
            "OCR text generated in this session will not be included in the saved file."
        )

    logger.info(f"Saving original pages PDF to: {output_path_str}")
    # The exporter takes this PageCollection itself as the source. Its
    # exceptions propagate unchanged (ValueError, RuntimeError, etc.).
    create_original_pdf(self, output_path_str)
|
3183
|
-
|
3184
|
-
# Alias .to_image() to .show() for convenience
|
3185
|
-
def show(self, *args, **kwargs) -> Optional["Image.Image"]:
    """Render the collection exactly as ``to_image`` does.

    Convenience alias: every positional and keyword argument is handed to
    :py:meth:`to_image` untouched, and its result (a ``PIL.Image`` or
    ``None``) is returned as-is.
    """
    rendered = self.to_image(*args, **kwargs)
    return rendered
|