natural-pdf 0.1.40__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +6 -7
- natural_pdf/analyzers/__init__.py +6 -1
- natural_pdf/analyzers/guides.py +354 -258
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +18 -4
- natural_pdf/analyzers/layout/paddle.py +11 -0
- natural_pdf/analyzers/layout/surya.py +2 -3
- natural_pdf/analyzers/shape_detection_mixin.py +25 -34
- natural_pdf/analyzers/text_structure.py +2 -2
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/collections/mixins.py +3 -2
- natural_pdf/core/highlighting_service.py +743 -32
- natural_pdf/core/page.py +236 -383
- natural_pdf/core/page_collection.py +1249 -0
- natural_pdf/core/pdf.py +172 -83
- natural_pdf/{collections → core}/pdf_collection.py +18 -11
- natural_pdf/core/render_spec.py +335 -0
- natural_pdf/describe/base.py +1 -1
- natural_pdf/elements/__init__.py +1 -0
- natural_pdf/elements/base.py +108 -83
- natural_pdf/elements/{collections.py → element_collection.py} +566 -1487
- natural_pdf/elements/line.py +0 -1
- natural_pdf/elements/rect.py +0 -1
- natural_pdf/elements/region.py +318 -243
- natural_pdf/elements/text.py +9 -7
- natural_pdf/exporters/base.py +2 -2
- natural_pdf/exporters/original_pdf.py +1 -1
- natural_pdf/exporters/paddleocr.py +2 -4
- natural_pdf/exporters/searchable_pdf.py +3 -2
- natural_pdf/extraction/mixin.py +1 -3
- natural_pdf/flows/collections.py +1 -69
- natural_pdf/flows/element.py +4 -4
- natural_pdf/flows/flow.py +1200 -243
- natural_pdf/flows/region.py +707 -261
- natural_pdf/ocr/ocr_options.py +0 -2
- natural_pdf/ocr/utils.py +2 -1
- natural_pdf/qa/document_qa.py +21 -5
- natural_pdf/search/search_service_protocol.py +1 -1
- natural_pdf/selectors/parser.py +2 -2
- natural_pdf/tables/result.py +35 -1
- natural_pdf/text_mixin.py +7 -3
- natural_pdf/utils/debug.py +2 -1
- natural_pdf/utils/highlighting.py +1 -0
- natural_pdf/utils/layout.py +2 -2
- natural_pdf/utils/packaging.py +4 -3
- natural_pdf/utils/text_extraction.py +15 -12
- natural_pdf/utils/visualization.py +385 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -53
- optimization/memory_comparison.py +1 -1
- optimization/pdf_analyzer.py +2 -2
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,9 @@ from natural_pdf.classification.manager import ClassificationManager
|
|
33
33
|
from natural_pdf.classification.mixin import ClassificationMixin
|
34
34
|
from natural_pdf.collections.mixins import ApplyMixin, DirectionalCollectionMixin
|
35
35
|
from natural_pdf.core.pdf import PDF
|
36
|
+
|
37
|
+
# Add Visualizable import
|
38
|
+
from natural_pdf.core.render_spec import RenderSpec, Visualizable
|
36
39
|
from natural_pdf.describe.mixin import DescribeMixin, InspectMixin
|
37
40
|
from natural_pdf.elements.base import Element
|
38
41
|
from natural_pdf.elements.region import Region
|
@@ -82,6 +85,7 @@ class ElementCollection(
|
|
82
85
|
DirectionalCollectionMixin,
|
83
86
|
DescribeMixin,
|
84
87
|
InspectMixin,
|
88
|
+
Visualizable,
|
85
89
|
MutableSequence,
|
86
90
|
):
|
87
91
|
"""Collection of PDF elements with batch operations.
|
@@ -171,13 +175,234 @@ class ElementCollection(
|
|
171
175
|
"""
|
172
176
|
self._elements = elements or []
|
173
177
|
|
178
|
+
def _get_render_specs(
|
179
|
+
self,
|
180
|
+
mode: Literal["show", "render"] = "show",
|
181
|
+
color: Optional[Union[str, Tuple[int, int, int]]] = None,
|
182
|
+
highlights: Optional[List[Dict[str, Any]]] = None,
|
183
|
+
crop: Union[bool, Literal["content"]] = False,
|
184
|
+
crop_bbox: Optional[Tuple[float, float, float, float]] = None,
|
185
|
+
group_by: Optional[str] = None,
|
186
|
+
bins: Optional[Union[int, List[float]]] = None,
|
187
|
+
annotate: Optional[List[str]] = None,
|
188
|
+
**kwargs,
|
189
|
+
) -> List[RenderSpec]:
|
190
|
+
"""Get render specifications for this element collection.
|
191
|
+
|
192
|
+
Args:
|
193
|
+
mode: Rendering mode - 'show' includes highlights, 'render' is clean
|
194
|
+
color: Default color for highlights in show mode (or colormap name when using group_by)
|
195
|
+
highlights: Additional highlight groups to show
|
196
|
+
crop: Whether to crop to element bounds
|
197
|
+
crop_bbox: Explicit crop bounds
|
198
|
+
group_by: Attribute to group elements by for color mapping
|
199
|
+
bins: Binning specification for quantitative data (int for equal-width bins, list for custom bins)
|
200
|
+
annotate: List of attribute names to display on highlights
|
201
|
+
**kwargs: Additional parameters
|
202
|
+
|
203
|
+
Returns:
|
204
|
+
List of RenderSpec objects, one per page with elements
|
205
|
+
"""
|
206
|
+
if not self._elements:
|
207
|
+
return []
|
208
|
+
|
209
|
+
# Group elements by page
|
210
|
+
elements_by_page = {}
|
211
|
+
for elem in self._elements:
|
212
|
+
if hasattr(elem, "page"):
|
213
|
+
page = elem.page
|
214
|
+
if page not in elements_by_page:
|
215
|
+
elements_by_page[page] = []
|
216
|
+
elements_by_page[page].append(elem)
|
217
|
+
|
218
|
+
if not elements_by_page:
|
219
|
+
return []
|
220
|
+
|
221
|
+
# Create RenderSpec for each page
|
222
|
+
specs = []
|
223
|
+
for page, page_elements in elements_by_page.items():
|
224
|
+
spec = RenderSpec(page=page)
|
225
|
+
|
226
|
+
# Handle cropping
|
227
|
+
if crop_bbox:
|
228
|
+
spec.crop_bbox = crop_bbox
|
229
|
+
elif crop == "content" or crop is True:
|
230
|
+
# Calculate bounds of elements on this page
|
231
|
+
x_coords = []
|
232
|
+
y_coords = []
|
233
|
+
for elem in page_elements:
|
234
|
+
if hasattr(elem, "bbox") and elem.bbox:
|
235
|
+
x0, y0, x1, y1 = elem.bbox
|
236
|
+
x_coords.extend([x0, x1])
|
237
|
+
y_coords.extend([y0, y1])
|
238
|
+
|
239
|
+
if x_coords and y_coords:
|
240
|
+
spec.crop_bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
|
241
|
+
|
242
|
+
# Add highlights in show mode
|
243
|
+
if mode == "show":
|
244
|
+
# Handle group_by parameter for quantitative/categorical grouping
|
245
|
+
if group_by is not None:
|
246
|
+
# Use the improved highlighting logic from _prepare_highlight_data
|
247
|
+
prepared_highlights = self._prepare_highlight_data(
|
248
|
+
group_by=group_by, color=color, bins=bins, annotate=annotate, **kwargs
|
249
|
+
)
|
250
|
+
|
251
|
+
# Check if we have quantitative metadata to preserve
|
252
|
+
quantitative_metadata = None
|
253
|
+
for highlight_data in prepared_highlights:
|
254
|
+
if (
|
255
|
+
"quantitative_metadata" in highlight_data
|
256
|
+
and highlight_data["quantitative_metadata"]
|
257
|
+
):
|
258
|
+
quantitative_metadata = highlight_data["quantitative_metadata"]
|
259
|
+
break
|
260
|
+
|
261
|
+
# Add highlights from prepared data
|
262
|
+
for highlight_data in prepared_highlights:
|
263
|
+
# Only add elements from this page
|
264
|
+
elem = highlight_data.get("element")
|
265
|
+
if elem and hasattr(elem, "page") and elem.page == page:
|
266
|
+
# Create the highlight dict manually to preserve quantitative metadata
|
267
|
+
highlight_dict = {
|
268
|
+
"element": elem,
|
269
|
+
"color": highlight_data.get("color"),
|
270
|
+
"label": highlight_data.get("label"),
|
271
|
+
}
|
272
|
+
|
273
|
+
# Add quantitative metadata to the first highlight
|
274
|
+
if quantitative_metadata and not any(
|
275
|
+
h.get("quantitative_metadata") for h in spec.highlights
|
276
|
+
):
|
277
|
+
highlight_dict["quantitative_metadata"] = quantitative_metadata
|
278
|
+
|
279
|
+
# Add annotate if provided in the prepared data
|
280
|
+
if "annotate" in highlight_data:
|
281
|
+
highlight_dict["annotate"] = highlight_data["annotate"]
|
282
|
+
if "attributes_to_draw" in highlight_data:
|
283
|
+
highlight_dict["attributes_to_draw"] = highlight_data[
|
284
|
+
"attributes_to_draw"
|
285
|
+
]
|
286
|
+
|
287
|
+
# Extract geometry from element
|
288
|
+
if (
|
289
|
+
hasattr(elem, "polygon")
|
290
|
+
and hasattr(elem, "has_polygon")
|
291
|
+
and elem.has_polygon
|
292
|
+
):
|
293
|
+
highlight_dict["polygon"] = elem.polygon
|
294
|
+
elif hasattr(elem, "bbox"):
|
295
|
+
highlight_dict["bbox"] = elem.bbox
|
296
|
+
|
297
|
+
spec.highlights.append(highlight_dict)
|
298
|
+
else:
|
299
|
+
# Default behavior when no group_by is specified
|
300
|
+
# Determine if all elements are of the same type
|
301
|
+
element_types = set(type(elem).__name__ for elem in page_elements)
|
302
|
+
|
303
|
+
if len(element_types) == 1:
|
304
|
+
# All elements are the same type - use a single label
|
305
|
+
type_name = element_types.pop()
|
306
|
+
# Generate a clean label from the type name
|
307
|
+
base_name = (
|
308
|
+
type_name.replace("Element", "").replace("Region", "")
|
309
|
+
if type_name != "Region"
|
310
|
+
else "Region"
|
311
|
+
)
|
312
|
+
# Handle special cases for common types
|
313
|
+
if base_name == "Text":
|
314
|
+
shared_label = "Text Elements"
|
315
|
+
elif base_name == "table_cell" or (
|
316
|
+
hasattr(page_elements[0], "region_type")
|
317
|
+
and page_elements[0].region_type == "table_cell"
|
318
|
+
):
|
319
|
+
shared_label = "Table Cells"
|
320
|
+
elif base_name == "table":
|
321
|
+
shared_label = "Tables"
|
322
|
+
else:
|
323
|
+
shared_label = f"{base_name} Elements" if base_name else "Elements"
|
324
|
+
|
325
|
+
# Add all elements with the same label (no color cycling)
|
326
|
+
for elem in page_elements:
|
327
|
+
# Get element highlight params with annotate
|
328
|
+
element_data = self._get_element_highlight_params(elem, annotate)
|
329
|
+
if element_data:
|
330
|
+
# Use add_highlight with basic params
|
331
|
+
spec.add_highlight(
|
332
|
+
element=elem,
|
333
|
+
color=color, # Use provided color or None
|
334
|
+
label=shared_label,
|
335
|
+
)
|
336
|
+
# Update last highlight with attributes if present
|
337
|
+
if element_data.get("attributes_to_draw") and spec.highlights:
|
338
|
+
spec.highlights[-1]["attributes_to_draw"] = element_data[
|
339
|
+
"attributes_to_draw"
|
340
|
+
]
|
341
|
+
else:
|
342
|
+
# Mixed types - use individual labels (existing behavior)
|
343
|
+
for elem in page_elements:
|
344
|
+
# Get element highlight params with annotate
|
345
|
+
element_data = self._get_element_highlight_params(elem, annotate)
|
346
|
+
if element_data:
|
347
|
+
spec.add_highlight(
|
348
|
+
element=elem,
|
349
|
+
color=color,
|
350
|
+
label=getattr(elem, "text", None) or str(elem),
|
351
|
+
)
|
352
|
+
# Update last highlight with attributes if present
|
353
|
+
if element_data.get("attributes_to_draw") and spec.highlights:
|
354
|
+
spec.highlights[-1]["attributes_to_draw"] = element_data[
|
355
|
+
"attributes_to_draw"
|
356
|
+
]
|
357
|
+
|
358
|
+
# Add additional highlight groups if provided
|
359
|
+
if highlights:
|
360
|
+
for group in highlights:
|
361
|
+
group_elements = group.get("elements", [])
|
362
|
+
group_color = group.get("color", color)
|
363
|
+
group_label = group.get("label")
|
364
|
+
|
365
|
+
# Only add elements from this page
|
366
|
+
for elem in group_elements:
|
367
|
+
if hasattr(elem, "page") and elem.page == page:
|
368
|
+
spec.add_highlight(
|
369
|
+
element=elem, color=group_color, label=group_label
|
370
|
+
)
|
371
|
+
|
372
|
+
specs.append(spec)
|
373
|
+
|
374
|
+
return specs
|
375
|
+
|
376
|
+
def _get_highlighter(self):
|
377
|
+
"""Get the highlighting service for rendering.
|
378
|
+
|
379
|
+
For ElementCollection, we get it from the first element's page.
|
380
|
+
"""
|
381
|
+
if not self._elements:
|
382
|
+
raise RuntimeError("Cannot get highlighter from empty ElementCollection")
|
383
|
+
|
384
|
+
# Try to get highlighter from first element's page
|
385
|
+
for elem in self._elements:
|
386
|
+
if hasattr(elem, "page") and hasattr(elem.page, "_highlighter"):
|
387
|
+
return elem.page._highlighter
|
388
|
+
|
389
|
+
# If no elements have pages, we can't render
|
390
|
+
raise RuntimeError(
|
391
|
+
"Cannot find HighlightingService. ElementCollection elements don't have page access."
|
392
|
+
)
|
393
|
+
|
174
394
|
def __len__(self) -> int:
|
175
395
|
"""Get the number of elements in the collection."""
|
176
396
|
return len(self._elements)
|
177
397
|
|
178
|
-
def __getitem__(self, index: int) -> "Element":
|
179
|
-
"""Get an element by index."""
|
180
|
-
|
398
|
+
def __getitem__(self, index: Union[int, slice]) -> Union["Element", "ElementCollection"]:
|
399
|
+
"""Get an element by index or a collection by slice."""
|
400
|
+
if isinstance(index, slice):
|
401
|
+
# Return a new ElementCollection for slices
|
402
|
+
return ElementCollection(self._elements[index])
|
403
|
+
else:
|
404
|
+
# Return the element for integer indices
|
405
|
+
return self._elements[index]
|
181
406
|
|
182
407
|
def __repr__(self) -> str:
|
183
408
|
"""Return a string representation showing the element count."""
|
@@ -423,6 +648,7 @@ class ElementCollection(
|
|
423
648
|
# Apply content filtering if provided
|
424
649
|
if content_filter is not None:
|
425
650
|
from natural_pdf.utils.text_extraction import _apply_content_filter
|
651
|
+
|
426
652
|
all_char_dicts = _apply_content_filter(all_char_dicts, content_filter)
|
427
653
|
|
428
654
|
# Check if layout is requested
|
@@ -534,8 +760,9 @@ class ElementCollection(
|
|
534
760
|
group_by: Optional[str] = None,
|
535
761
|
label_format: Optional[str] = None,
|
536
762
|
distinct: bool = False,
|
537
|
-
|
763
|
+
annotate: Optional[List[str]] = None,
|
538
764
|
replace: bool = False,
|
765
|
+
bins: Optional[Union[int, List[float]]] = None,
|
539
766
|
) -> "ElementCollection":
|
540
767
|
"""
|
541
768
|
Adds persistent highlights for all elements in the collection to the page
|
@@ -553,12 +780,15 @@ class ElementCollection(
|
|
553
780
|
label: Optional explicit label for the entire collection. If provided,
|
554
781
|
all elements are highlighted as a single group with this label,
|
555
782
|
ignoring 'group_by' and the default type-based grouping.
|
556
|
-
color: Optional explicit color for the highlight (tuple/string)
|
557
|
-
|
783
|
+
color: Optional explicit color for the highlight (tuple/string), or
|
784
|
+
matplotlib colormap name for quantitative group_by (e.g., 'viridis', 'plasma',
|
785
|
+
'inferno', 'coolwarm', 'RdBu'). Applied consistently if 'label' is provided
|
786
|
+
or if grouping occurs.
|
558
787
|
group_by: Optional attribute name present on the elements. If provided
|
559
788
|
(and 'label' is None), elements will be grouped based on the
|
560
789
|
value of this attribute, and each group will be highlighted
|
561
|
-
with a distinct label and color.
|
790
|
+
with a distinct label and color. Automatically detects quantitative
|
791
|
+
data and uses gradient colormaps when appropriate.
|
562
792
|
label_format: Optional Python f-string to format the group label when
|
563
793
|
'group_by' is used. Can reference element attributes
|
564
794
|
(e.g., "Type: {region_type}, Conf: {confidence:.2f}").
|
@@ -566,11 +796,14 @@ class ElementCollection(
|
|
566
796
|
distinct: If True, bypasses all grouping and highlights each element
|
567
797
|
individually with cycling colors (the previous default behavior).
|
568
798
|
(default: False)
|
569
|
-
|
570
|
-
|
799
|
+
annotate: List of attribute names from the element to display directly
|
800
|
+
on the highlight itself (distinct from group label).
|
571
801
|
replace: If True, existing highlights on the affected page(s)
|
572
802
|
are cleared before adding these highlights.
|
573
803
|
If False (default), highlights are appended to existing ones.
|
804
|
+
bins: Optional binning specification for quantitative data when using group_by.
|
805
|
+
Can be an integer (number of equal-width bins) or a list of bin edges.
|
806
|
+
Only used when group_by contains quantitative data.
|
574
807
|
|
575
808
|
Returns:
|
576
809
|
Self for method chaining
|
@@ -592,7 +825,8 @@ class ElementCollection(
|
|
592
825
|
color=color,
|
593
826
|
group_by=group_by,
|
594
827
|
label_format=label_format,
|
595
|
-
|
828
|
+
annotate=annotate,
|
829
|
+
bins=bins,
|
596
830
|
# 'replace' flag is handled during the add call below
|
597
831
|
)
|
598
832
|
|
@@ -633,7 +867,7 @@ class ElementCollection(
|
|
633
867
|
"use_color_cycling", False
|
634
868
|
), # Set by _prepare if distinct
|
635
869
|
"element": data["element"],
|
636
|
-
"
|
870
|
+
"annotate": data["annotate"],
|
637
871
|
# Internal call to service always appends, as clearing was handled above
|
638
872
|
"existing": "append",
|
639
873
|
}
|
@@ -655,7 +889,8 @@ class ElementCollection(
|
|
655
889
|
color: Optional[Union[Tuple, str]] = None,
|
656
890
|
group_by: Optional[str] = None,
|
657
891
|
label_format: Optional[str] = None,
|
658
|
-
|
892
|
+
annotate: Optional[List[str]] = None,
|
893
|
+
bins: Optional[Union[int, List[float]]] = None,
|
659
894
|
) -> List[Dict]:
|
660
895
|
"""
|
661
896
|
Determines the parameters for highlighting each element based on the strategy.
|
@@ -664,7 +899,7 @@ class ElementCollection(
|
|
664
899
|
|
665
900
|
Returns:
|
666
901
|
List of dictionaries, each containing parameters for a single highlight
|
667
|
-
(e.g., page_index, bbox/polygon, color, label, element,
|
902
|
+
(e.g., page_index, bbox/polygon, color, label, element, annotate, attributes_to_draw).
|
668
903
|
Color and label determination happens here.
|
669
904
|
"""
|
670
905
|
prepared_data = []
|
@@ -672,11 +907,25 @@ class ElementCollection(
|
|
672
907
|
return prepared_data
|
673
908
|
|
674
909
|
# Need access to the HighlightingService to determine colors correctly.
|
910
|
+
# Use highlighting protocol to find a valid service from any element
|
675
911
|
highlighter = None
|
676
|
-
|
677
|
-
|
678
|
-
|
679
|
-
|
912
|
+
|
913
|
+
for element in self._elements:
|
914
|
+
# Try direct page access first (for regular elements)
|
915
|
+
if hasattr(element, "page") and hasattr(element.page, "_highlighter"):
|
916
|
+
highlighter = element.page._highlighter
|
917
|
+
break
|
918
|
+
# Try highlighting protocol for FlowRegions and other complex elements
|
919
|
+
elif hasattr(element, "get_highlight_specs"):
|
920
|
+
specs = element.get_highlight_specs()
|
921
|
+
for spec in specs:
|
922
|
+
if "page" in spec and hasattr(spec["page"], "_highlighter"):
|
923
|
+
highlighter = spec["page"]._highlighter
|
924
|
+
break
|
925
|
+
if highlighter:
|
926
|
+
break
|
927
|
+
|
928
|
+
if not highlighter:
|
680
929
|
logger.warning(
|
681
930
|
"Cannot determine highlight colors: HighlightingService not accessible from elements."
|
682
931
|
)
|
@@ -689,7 +938,7 @@ class ElementCollection(
|
|
689
938
|
final_color = highlighter._determine_highlight_color(
|
690
939
|
label=None, color_input=None, use_color_cycling=True
|
691
940
|
)
|
692
|
-
element_data = self._get_element_highlight_params(element,
|
941
|
+
element_data = self._get_element_highlight_params(element, annotate)
|
693
942
|
if element_data:
|
694
943
|
element_data.update(
|
695
944
|
{"color": final_color, "label": None, "use_color_cycling": True}
|
@@ -702,7 +951,7 @@ class ElementCollection(
|
|
702
951
|
label=label, color_input=color, use_color_cycling=False
|
703
952
|
)
|
704
953
|
for element in self._elements:
|
705
|
-
element_data = self._get_element_highlight_params(element,
|
954
|
+
element_data = self._get_element_highlight_params(element, annotate)
|
706
955
|
if element_data:
|
707
956
|
element_data.update({"color": final_color, "label": label})
|
708
957
|
prepared_data.append(element_data)
|
@@ -710,23 +959,84 @@ class ElementCollection(
|
|
710
959
|
elif group_by is not None:
|
711
960
|
logger.debug("_prepare: Grouping by attribute strategy.")
|
712
961
|
grouped_elements = self._group_elements_by_attr(group_by)
|
962
|
+
|
963
|
+
# Collect all values for quantitative detection
|
964
|
+
all_values = []
|
713
965
|
for group_key, group_elements in grouped_elements.items():
|
714
|
-
if
|
715
|
-
|
716
|
-
|
717
|
-
|
718
|
-
|
719
|
-
|
720
|
-
|
721
|
-
|
722
|
-
|
723
|
-
|
966
|
+
if group_elements:
|
967
|
+
all_values.append(group_key)
|
968
|
+
|
969
|
+
# Import the quantitative detection function
|
970
|
+
from natural_pdf.utils.visualization import (
|
971
|
+
create_quantitative_color_mapping,
|
972
|
+
detect_quantitative_data,
|
973
|
+
)
|
974
|
+
|
975
|
+
# Determine if we should use quantitative color mapping
|
976
|
+
use_quantitative = detect_quantitative_data(all_values)
|
977
|
+
|
978
|
+
if use_quantitative:
|
979
|
+
logger.debug(" _prepare: Using quantitative color mapping.")
|
980
|
+
# Use quantitative color mapping with specified colormap
|
981
|
+
colormap_name = color if isinstance(color, str) else "viridis"
|
982
|
+
value_to_color = create_quantitative_color_mapping(
|
983
|
+
all_values, colormap=colormap_name, bins=bins
|
724
984
|
)
|
725
|
-
|
726
|
-
|
727
|
-
|
728
|
-
|
729
|
-
|
985
|
+
|
986
|
+
# Store quantitative metadata for colorbar creation
|
987
|
+
quantitative_metadata = {
|
988
|
+
"values": all_values,
|
989
|
+
"colormap": colormap_name,
|
990
|
+
"bins": bins,
|
991
|
+
"attribute": group_by,
|
992
|
+
}
|
993
|
+
|
994
|
+
for group_key, group_elements in grouped_elements.items():
|
995
|
+
if not group_elements:
|
996
|
+
continue
|
997
|
+
group_label = self._format_group_label(
|
998
|
+
group_key, label_format, group_elements[0], group_by
|
999
|
+
)
|
1000
|
+
|
1001
|
+
# Get quantitative color for this value
|
1002
|
+
final_color = value_to_color.get(group_key)
|
1003
|
+
if final_color is None:
|
1004
|
+
# Fallback to traditional color assignment
|
1005
|
+
final_color = highlighter._determine_highlight_color(
|
1006
|
+
label=group_label, color_input=None, use_color_cycling=False
|
1007
|
+
)
|
1008
|
+
|
1009
|
+
logger.debug(
|
1010
|
+
f" _prepare group '{group_label}' ({len(group_elements)} elements) -> color {final_color}"
|
1011
|
+
)
|
1012
|
+
for element in group_elements:
|
1013
|
+
element_data = self._get_element_highlight_params(element, annotate)
|
1014
|
+
if element_data:
|
1015
|
+
element_data.update({"color": final_color, "label": group_label})
|
1016
|
+
# Add quantitative metadata to the first element in each group
|
1017
|
+
if not any("quantitative_metadata" in pd for pd in prepared_data):
|
1018
|
+
element_data["quantitative_metadata"] = quantitative_metadata
|
1019
|
+
prepared_data.append(element_data)
|
1020
|
+
else:
|
1021
|
+
logger.debug(" _prepare: Using categorical color mapping.")
|
1022
|
+
# Use traditional categorical color mapping
|
1023
|
+
for group_key, group_elements in grouped_elements.items():
|
1024
|
+
if not group_elements:
|
1025
|
+
continue
|
1026
|
+
group_label = self._format_group_label(
|
1027
|
+
group_key, label_format, group_elements[0], group_by
|
1028
|
+
)
|
1029
|
+
final_color = highlighter._determine_highlight_color(
|
1030
|
+
label=group_label, color_input=None, use_color_cycling=False
|
1031
|
+
)
|
1032
|
+
logger.debug(
|
1033
|
+
f" _prepare group '{group_label}' ({len(group_elements)} elements) -> color {final_color}"
|
1034
|
+
)
|
1035
|
+
for element in group_elements:
|
1036
|
+
element_data = self._get_element_highlight_params(element, annotate)
|
1037
|
+
if element_data:
|
1038
|
+
element_data.update({"color": final_color, "label": group_label})
|
1039
|
+
prepared_data.append(element_data)
|
730
1040
|
else:
|
731
1041
|
logger.debug("_prepare: Default grouping strategy.")
|
732
1042
|
element_types = set(type(el).__name__ for el in self._elements)
|
@@ -745,7 +1055,7 @@ class ElementCollection(
|
|
745
1055
|
)
|
746
1056
|
logger.debug(f" _prepare default group '{auto_label}' -> color {final_color}")
|
747
1057
|
for element in self._elements:
|
748
|
-
element_data = self._get_element_highlight_params(element,
|
1058
|
+
element_data = self._get_element_highlight_params(element, annotate)
|
749
1059
|
if element_data:
|
750
1060
|
element_data.update({"color": final_color, "label": auto_label})
|
751
1061
|
prepared_data.append(element_data)
|
@@ -764,7 +1074,7 @@ class ElementCollection(
|
|
764
1074
|
# Determine color *before* logging or using it (already done above for this branch)
|
765
1075
|
logger.debug(f" _prepare default group '{auto_label}' -> color {final_color}")
|
766
1076
|
for element in self._elements:
|
767
|
-
element_data = self._get_element_highlight_params(element,
|
1077
|
+
element_data = self._get_element_highlight_params(element, annotate)
|
768
1078
|
if element_data:
|
769
1079
|
element_data.update({"color": final_color, "label": auto_label})
|
770
1080
|
prepared_data.append(element_data)
|
@@ -777,7 +1087,7 @@ class ElementCollection(
|
|
777
1087
|
color: Optional[Union[Tuple, str]],
|
778
1088
|
label: Optional[str],
|
779
1089
|
use_color_cycling: bool,
|
780
|
-
|
1090
|
+
annotate: Optional[List[str]],
|
781
1091
|
existing: str,
|
782
1092
|
):
|
783
1093
|
"""Low-level helper to call the appropriate HighlightingService method for an element."""
|
@@ -793,7 +1103,7 @@ class ElementCollection(
|
|
793
1103
|
"color": color,
|
794
1104
|
"label": label,
|
795
1105
|
"use_color_cycling": use_color_cycling,
|
796
|
-
"
|
1106
|
+
"annotate": annotate,
|
797
1107
|
"existing": existing,
|
798
1108
|
"element": element,
|
799
1109
|
}
|
@@ -828,7 +1138,7 @@ class ElementCollection(
|
|
828
1138
|
self,
|
829
1139
|
label: str,
|
830
1140
|
color: Optional[Union[Tuple, str]],
|
831
|
-
|
1141
|
+
annotate: Optional[List[str]],
|
832
1142
|
existing: str,
|
833
1143
|
):
|
834
1144
|
"""Highlights all elements with the same explicit label and color."""
|
@@ -838,7 +1148,7 @@ class ElementCollection(
|
|
838
1148
|
color=color, # Use explicit color if provided
|
839
1149
|
label=label, # Use the explicit group label
|
840
1150
|
use_color_cycling=False, # Use consistent color for the label
|
841
|
-
|
1151
|
+
annotate=annotate,
|
842
1152
|
existing=existing,
|
843
1153
|
)
|
844
1154
|
|
@@ -846,7 +1156,7 @@ class ElementCollection(
|
|
846
1156
|
self,
|
847
1157
|
group_by: str,
|
848
1158
|
label_format: Optional[str],
|
849
|
-
|
1159
|
+
annotate: Optional[List[str]],
|
850
1160
|
existing: str,
|
851
1161
|
):
|
852
1162
|
"""Groups elements by attribute and highlights each group distinctly."""
|
@@ -918,11 +1228,11 @@ class ElementCollection(
|
|
918
1228
|
color=None, # Let ColorManager choose based on label
|
919
1229
|
label=group_label, # Use the derived group label
|
920
1230
|
use_color_cycling=False, # Use consistent color for the label
|
921
|
-
|
1231
|
+
annotate=annotate,
|
922
1232
|
existing=existing,
|
923
1233
|
)
|
924
1234
|
|
925
|
-
def _highlight_distinctly(self,
|
1235
|
+
def _highlight_distinctly(self, annotate: Optional[List[str]], existing: str):
|
926
1236
|
"""DEPRECATED: Logic moved to _prepare_highlight_data. Kept for reference/potential reuse."""
|
927
1237
|
# This method is no longer called directly by the main highlight path.
|
928
1238
|
# The distinct logic is handled within _prepare_highlight_data.
|
@@ -932,152 +1242,191 @@ class ElementCollection(
|
|
932
1242
|
color=None, # Let ColorManager cycle
|
933
1243
|
label=None, # No label for distinct elements
|
934
1244
|
use_color_cycling=True, # Force cycling
|
935
|
-
|
1245
|
+
annotate=annotate,
|
936
1246
|
existing=existing,
|
937
1247
|
)
|
938
1248
|
|
939
|
-
def
|
1249
|
+
def _render_multipage_highlights(
|
940
1250
|
self,
|
941
|
-
|
942
|
-
|
943
|
-
|
944
|
-
|
945
|
-
|
946
|
-
|
947
|
-
|
948
|
-
|
949
|
-
|
950
|
-
|
951
|
-
|
952
|
-
render_ocr
|
953
|
-
|
954
|
-
|
955
|
-
|
956
|
-
|
957
|
-
|
958
|
-
|
959
|
-
|
960
|
-
|
961
|
-
Currently only supports collections where all elements are on the same page
|
962
|
-
of the same PDF.
|
963
|
-
|
964
|
-
Allows grouping and coloring elements based on attributes, similar to the
|
965
|
-
persistent `highlight()` method, but only for this temporary view.
|
966
|
-
|
967
|
-
Args:
|
968
|
-
group_by: Attribute name to group elements by for distinct colors/labels.
|
969
|
-
label: Explicit label for all elements (overrides group_by).
|
970
|
-
color: Explicit color for all elements (if label used) or base color.
|
971
|
-
label_format: F-string to format group labels if group_by is used.
|
972
|
-
distinct: Highlight each element distinctly (overrides group_by/label).
|
973
|
-
include_attrs: Attributes to display on individual highlights.
|
974
|
-
resolution: Resolution in DPI for rendering (uses global options if not specified, defaults to 144 DPI).
|
975
|
-
labels: Whether to include a legend for the temporary highlights.
|
976
|
-
legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
|
977
|
-
render_ocr: Whether to render OCR text.
|
978
|
-
width: Optional width for the output image in pixels.
|
979
|
-
crop: If True, crop the resulting image to the tight bounding box
|
980
|
-
containing all elements in the collection. The elements are
|
981
|
-
still highlighted first, then the image is cropped.
|
982
|
-
|
983
|
-
Returns:
|
984
|
-
PIL Image object of the temporary preview, or None if rendering fails or
|
985
|
-
elements span multiple pages/PDFs.
|
1251
|
+
specs_by_page,
|
1252
|
+
resolution,
|
1253
|
+
width,
|
1254
|
+
labels,
|
1255
|
+
legend_position,
|
1256
|
+
group_by,
|
1257
|
+
label,
|
1258
|
+
color,
|
1259
|
+
label_format,
|
1260
|
+
distinct,
|
1261
|
+
annotate,
|
1262
|
+
render_ocr,
|
1263
|
+
crop,
|
1264
|
+
stack_direction="vertical",
|
1265
|
+
stack_gap=5,
|
1266
|
+
stack_background_color=(255, 255, 255),
|
1267
|
+
):
|
1268
|
+
"""Render highlights across multiple pages and stack them."""
|
1269
|
+
from PIL import Image
|
986
1270
|
|
987
|
-
|
988
|
-
|
989
|
-
|
990
|
-
|
991
|
-
import natural_pdf
|
1271
|
+
# Sort pages by index for consistent output
|
1272
|
+
sorted_pages = sorted(
|
1273
|
+
specs_by_page.keys(), key=lambda p: p.index if hasattr(p, "index") else 0
|
1274
|
+
)
|
992
1275
|
|
993
|
-
|
994
|
-
if width is None:
|
995
|
-
width = natural_pdf.options.image.width
|
996
|
-
if resolution is None:
|
997
|
-
if natural_pdf.options.image.resolution is not None:
|
998
|
-
resolution = natural_pdf.options.image.resolution
|
999
|
-
else:
|
1000
|
-
resolution = 144 # Default resolution when none specified
|
1276
|
+
page_images = []
|
1001
1277
|
|
1002
|
-
|
1003
|
-
|
1278
|
+
for page in sorted_pages:
|
1279
|
+
element_specs = specs_by_page[page]
|
1004
1280
|
|
1005
|
-
|
1006
|
-
|
1007
|
-
|
1008
|
-
|
1009
|
-
|
1281
|
+
# Get highlighter service from the page
|
1282
|
+
if not hasattr(page, "_highlighter"):
|
1283
|
+
logger.warning(
|
1284
|
+
f"Page {getattr(page, 'number', '?')} has no highlighter service, skipping"
|
1285
|
+
)
|
1286
|
+
continue
|
1010
1287
|
|
1011
|
-
|
1012
|
-
if self._are_on_multiple_pages():
|
1013
|
-
raise ValueError(
|
1014
|
-
"show() currently only supports collections where all elements are on the same page."
|
1015
|
-
)
|
1288
|
+
service = page._highlighter
|
1016
1289
|
|
1017
|
-
|
1018
|
-
|
1019
|
-
if not hasattr(first_element, "page") or not first_element.page:
|
1020
|
-
logger.warning("Cannot show collection: First element has no associated page.")
|
1021
|
-
return None
|
1022
|
-
page = first_element.page
|
1023
|
-
if not hasattr(page, "pdf") or not page.pdf:
|
1024
|
-
logger.warning("Cannot show collection: Page has no associated PDF object.")
|
1025
|
-
return None
|
1290
|
+
# Prepare highlight data for this page
|
1291
|
+
highlight_data_list = []
|
1026
1292
|
|
1027
|
-
|
1028
|
-
|
1029
|
-
|
1030
|
-
|
1293
|
+
for element_idx, spec in element_specs:
|
1294
|
+
# Use the element index to generate consistent colors/labels across pages
|
1295
|
+
element = spec.get(
|
1296
|
+
"element",
|
1297
|
+
self._elements[element_idx] if element_idx < len(self._elements) else None,
|
1298
|
+
)
|
1031
1299
|
|
1032
|
-
|
1033
|
-
|
1034
|
-
|
1035
|
-
|
1036
|
-
|
1037
|
-
|
1038
|
-
|
1039
|
-
|
1040
|
-
|
1041
|
-
|
1300
|
+
# Prepare highlight data based on grouping parameters
|
1301
|
+
if distinct:
|
1302
|
+
# Use cycling colors for distinct mode
|
1303
|
+
element_color = None # Let the highlighter service pick from palette
|
1304
|
+
use_color_cycling = True
|
1305
|
+
element_label = (
|
1306
|
+
f"Element_{element_idx + 1}"
|
1307
|
+
if label is None
|
1308
|
+
else f"{label}_{element_idx + 1}"
|
1309
|
+
)
|
1310
|
+
elif label:
|
1311
|
+
# Explicit label for all elements
|
1312
|
+
element_color = color
|
1313
|
+
use_color_cycling = color is None
|
1314
|
+
element_label = label
|
1315
|
+
elif group_by and element:
|
1316
|
+
# Group by attribute
|
1317
|
+
try:
|
1318
|
+
group_key = getattr(element, group_by, None)
|
1319
|
+
element_label = self._format_group_label(
|
1320
|
+
group_key, label_format, element, group_by
|
1321
|
+
)
|
1322
|
+
element_color = None # Let service assign color by group
|
1323
|
+
use_color_cycling = True
|
1324
|
+
except:
|
1325
|
+
element_label = f"Element_{element_idx + 1}"
|
1326
|
+
element_color = color
|
1327
|
+
use_color_cycling = color is None
|
1328
|
+
else:
|
1329
|
+
# Default behavior
|
1330
|
+
element_color = color
|
1331
|
+
use_color_cycling = color is None
|
1332
|
+
element_label = f"Element_{element_idx + 1}"
|
1333
|
+
|
1334
|
+
# Build highlight data
|
1335
|
+
highlight_item = {
|
1336
|
+
"page_index": spec["page_index"],
|
1337
|
+
"bbox": spec["bbox"],
|
1338
|
+
"polygon": spec.get("polygon"),
|
1339
|
+
"color": element_color,
|
1340
|
+
"label": element_label if labels else None,
|
1341
|
+
"use_color_cycling": use_color_cycling,
|
1342
|
+
}
|
1343
|
+
|
1344
|
+
# Add attributes if requested
|
1345
|
+
if annotate and element:
|
1346
|
+
highlight_item["attributes_to_draw"] = {}
|
1347
|
+
for attr_name in annotate:
|
1348
|
+
try:
|
1349
|
+
attr_value = getattr(element, attr_name, None)
|
1350
|
+
if attr_value is not None:
|
1351
|
+
highlight_item["attributes_to_draw"][attr_name] = attr_value
|
1352
|
+
except:
|
1353
|
+
pass
|
1042
1354
|
|
1043
|
-
|
1044
|
-
logger.warning("No highlight data generated for show(). Rendering clean page.")
|
1045
|
-
# Render the page without any temporary highlights
|
1046
|
-
highlight_data_list = []
|
1355
|
+
highlight_data_list.append(highlight_item)
|
1047
1356
|
|
1048
|
-
|
1049
|
-
try:
|
1050
|
-
# Calculate crop bounding box in PDF coordinates if crop is requested
|
1357
|
+
# Calculate crop bbox if requested
|
1051
1358
|
crop_bbox = None
|
1052
1359
|
if crop:
|
1053
1360
|
try:
|
1054
|
-
|
1055
|
-
|
1056
|
-
|
1057
|
-
|
1058
|
-
|
1059
|
-
|
1361
|
+
# Get bboxes from all specs on this page
|
1362
|
+
bboxes = [spec["bbox"] for _, spec in element_specs if spec.get("bbox")]
|
1363
|
+
if bboxes:
|
1364
|
+
crop_bbox = (
|
1365
|
+
min(bbox[0] for bbox in bboxes),
|
1366
|
+
min(bbox[1] for bbox in bboxes),
|
1367
|
+
max(bbox[2] for bbox in bboxes),
|
1368
|
+
max(bbox[3] for bbox in bboxes),
|
1369
|
+
)
|
1060
1370
|
except Exception as bbox_err:
|
1061
|
-
logger.error(
|
1062
|
-
f"Error determining crop bbox for collection show: {bbox_err}",
|
1063
|
-
exc_info=True,
|
1064
|
-
)
|
1371
|
+
logger.error(f"Error determining crop bbox: {bbox_err}")
|
1065
1372
|
|
1066
|
-
|
1067
|
-
|
1068
|
-
|
1069
|
-
|
1070
|
-
|
1071
|
-
|
1072
|
-
|
1073
|
-
|
1074
|
-
|
1075
|
-
|
1076
|
-
|
1077
|
-
|
1078
|
-
|
1373
|
+
# Render this page
|
1374
|
+
try:
|
1375
|
+
img = service.render_preview(
|
1376
|
+
page_index=page.index,
|
1377
|
+
temporary_highlights=highlight_data_list,
|
1378
|
+
resolution=resolution,
|
1379
|
+
width=width,
|
1380
|
+
labels=labels,
|
1381
|
+
legend_position=legend_position,
|
1382
|
+
render_ocr=render_ocr,
|
1383
|
+
crop_bbox=crop_bbox,
|
1384
|
+
)
|
1385
|
+
|
1386
|
+
if img:
|
1387
|
+
page_images.append(img)
|
1388
|
+
except Exception as e:
|
1389
|
+
logger.error(
|
1390
|
+
f"Error rendering page {getattr(page, 'number', '?')}: {e}", exc_info=True
|
1391
|
+
)
|
1392
|
+
|
1393
|
+
if not page_images:
|
1394
|
+
logger.warning("Failed to render any pages")
|
1079
1395
|
return None
|
1080
1396
|
|
1397
|
+
if len(page_images) == 1:
|
1398
|
+
return page_images[0]
|
1399
|
+
|
1400
|
+
# Stack the images
|
1401
|
+
if stack_direction == "vertical":
|
1402
|
+
final_width = max(img.width for img in page_images)
|
1403
|
+
final_height = (
|
1404
|
+
sum(img.height for img in page_images) + (len(page_images) - 1) * stack_gap
|
1405
|
+
)
|
1406
|
+
|
1407
|
+
stacked_image = Image.new("RGB", (final_width, final_height), stack_background_color)
|
1408
|
+
|
1409
|
+
current_y = 0
|
1410
|
+
for img in page_images:
|
1411
|
+
# Center horizontally
|
1412
|
+
x_offset = (final_width - img.width) // 2
|
1413
|
+
stacked_image.paste(img, (x_offset, current_y))
|
1414
|
+
current_y += img.height + stack_gap
|
1415
|
+
else: # horizontal
|
1416
|
+
final_width = sum(img.width for img in page_images) + (len(page_images) - 1) * stack_gap
|
1417
|
+
final_height = max(img.height for img in page_images)
|
1418
|
+
|
1419
|
+
stacked_image = Image.new("RGB", (final_width, final_height), stack_background_color)
|
1420
|
+
|
1421
|
+
current_x = 0
|
1422
|
+
for img in page_images:
|
1423
|
+
# Center vertically
|
1424
|
+
y_offset = (final_height - img.height) // 2
|
1425
|
+
stacked_image.paste(img, (current_x, y_offset))
|
1426
|
+
current_x += img.width + stack_gap
|
1427
|
+
|
1428
|
+
return stacked_image
|
1429
|
+
|
1081
1430
|
def save(
|
1082
1431
|
self,
|
1083
1432
|
filename: str,
|
@@ -1113,8 +1462,8 @@ class ElementCollection(
|
|
1113
1462
|
else:
|
1114
1463
|
resolution = 144 # Default resolution when none specified
|
1115
1464
|
|
1116
|
-
# Use
|
1117
|
-
self.
|
1465
|
+
# Use export() to save the image
|
1466
|
+
self.export(
|
1118
1467
|
path=filename,
|
1119
1468
|
resolution=resolution,
|
1120
1469
|
width=width,
|
@@ -1124,42 +1473,6 @@ class ElementCollection(
|
|
1124
1473
|
)
|
1125
1474
|
return self
|
1126
1475
|
|
1127
|
-
def to_image(
|
1128
|
-
self,
|
1129
|
-
path: Optional[str] = None,
|
1130
|
-
resolution: Optional[float] = None,
|
1131
|
-
width: Optional[int] = None,
|
1132
|
-
labels: bool = True,
|
1133
|
-
legend_position: str = "right",
|
1134
|
-
render_ocr: bool = False,
|
1135
|
-
) -> Optional["Image.Image"]:
|
1136
|
-
"""
|
1137
|
-
Generate an image of the page with this collection's elements highlighted,
|
1138
|
-
optionally saving it to a file.
|
1139
|
-
|
1140
|
-
Args:
|
1141
|
-
path: Optional path to save the image to
|
1142
|
-
resolution: Resolution in DPI for rendering (uses global options if not specified, defaults to 144 DPI)
|
1143
|
-
width: Optional width for the output image in pixels (height calculated to maintain aspect ratio)
|
1144
|
-
labels: Whether to include a legend for labels
|
1145
|
-
legend_position: Position of the legend
|
1146
|
-
render_ocr: Whether to render OCR text with white background boxes
|
1147
|
-
|
1148
|
-
Returns:
|
1149
|
-
PIL Image of the page with elements highlighted, or None if no valid page
|
1150
|
-
"""
|
1151
|
-
# Get the page from the first element (if available)
|
1152
|
-
if self._elements and hasattr(self._elements[0], "page"):
|
1153
|
-
page = self._elements[0].page
|
1154
|
-
# Generate the image using to_image
|
1155
|
-
return page.to_image(
|
1156
|
-
path=path,
|
1157
|
-
resolution=resolution,
|
1158
|
-
width=width,
|
1159
|
-
labels=labels,
|
1160
|
-
legend_position=legend_position,
|
1161
|
-
render_ocr=render_ocr,
|
1162
|
-
)
|
1163
1476
|
return None
|
1164
1477
|
|
1165
1478
|
def _group_elements_by_attr(self, group_by: str) -> Dict[Any, List[T]]:
|
@@ -1219,17 +1532,57 @@ class ElementCollection(
|
|
1219
1532
|
return str(group_key)
|
1220
1533
|
|
1221
1534
|
def _get_element_highlight_params(
|
1222
|
-
self, element: T,
|
1535
|
+
self, element: T, annotate: Optional[List[str]]
|
1223
1536
|
) -> Optional[Dict]:
|
1224
1537
|
"""Extracts common parameters needed for highlighting a single element."""
|
1538
|
+
# For FlowRegions and other complex elements, use highlighting protocol
|
1539
|
+
if hasattr(element, "get_highlight_specs"):
|
1540
|
+
specs = element.get_highlight_specs()
|
1541
|
+
if not specs:
|
1542
|
+
logger.warning(f"Element {element} returned no highlight specs")
|
1543
|
+
return None
|
1544
|
+
|
1545
|
+
# For now, we'll use the first spec for the prepared data
|
1546
|
+
# The actual rendering will use all specs
|
1547
|
+
first_spec = specs[0]
|
1548
|
+
page = first_spec["page"]
|
1549
|
+
|
1550
|
+
base_data = {
|
1551
|
+
"page_index": first_spec["page_index"],
|
1552
|
+
"element": element,
|
1553
|
+
"annotate": annotate,
|
1554
|
+
"attributes_to_draw": {},
|
1555
|
+
"bbox": first_spec.get("bbox"),
|
1556
|
+
"polygon": first_spec.get("polygon"),
|
1557
|
+
"multi_spec": len(specs) > 1, # Flag to indicate multiple specs
|
1558
|
+
"all_specs": specs, # Store all specs for rendering
|
1559
|
+
}
|
1560
|
+
|
1561
|
+
# Extract attributes if requested
|
1562
|
+
if annotate:
|
1563
|
+
for attr_name in annotate:
|
1564
|
+
try:
|
1565
|
+
attr_value = getattr(element, attr_name, None)
|
1566
|
+
if attr_value is not None:
|
1567
|
+
base_data["attributes_to_draw"][attr_name] = attr_value
|
1568
|
+
except AttributeError:
|
1569
|
+
logger.warning(
|
1570
|
+
f"Attribute '{attr_name}' not found on element {element} for annotate"
|
1571
|
+
)
|
1572
|
+
|
1573
|
+
return base_data
|
1574
|
+
|
1575
|
+
# Fallback for regular elements with direct page access
|
1225
1576
|
if not hasattr(element, "page"):
|
1577
|
+
logger.warning(f"Element {element} has no page attribute and no highlighting protocol")
|
1226
1578
|
return None
|
1579
|
+
|
1227
1580
|
page = element.page
|
1228
1581
|
|
1229
1582
|
base_data = {
|
1230
1583
|
"page_index": page.index,
|
1231
1584
|
"element": element,
|
1232
|
-
"
|
1585
|
+
"annotate": annotate,
|
1233
1586
|
"attributes_to_draw": {},
|
1234
1587
|
"bbox": None,
|
1235
1588
|
"polygon": None,
|
@@ -1254,15 +1607,15 @@ class ElementCollection(
|
|
1254
1607
|
return None
|
1255
1608
|
|
1256
1609
|
# Extract attributes if requested
|
1257
|
-
if
|
1258
|
-
for attr_name in
|
1610
|
+
if annotate:
|
1611
|
+
for attr_name in annotate:
|
1259
1612
|
try:
|
1260
1613
|
attr_value = getattr(element, attr_name, None)
|
1261
1614
|
if attr_value is not None:
|
1262
1615
|
base_data["attributes_to_draw"][attr_name] = attr_value
|
1263
1616
|
except AttributeError:
|
1264
1617
|
logger.warning(
|
1265
|
-
f"Attribute '{attr_name}' not found on element {element} for
|
1618
|
+
f"Attribute '{attr_name}' not found on element {element} for annotate"
|
1266
1619
|
)
|
1267
1620
|
|
1268
1621
|
return base_data
|
@@ -1699,9 +2052,7 @@ class ElementCollection(
|
|
1699
2052
|
image_path = image_dir / image_filename
|
1700
2053
|
|
1701
2054
|
# Save image
|
1702
|
-
element.
|
1703
|
-
path=str(image_path), resolution=image_resolution, include_highlights=True
|
1704
|
-
)
|
2055
|
+
element.show(path=str(image_path), resolution=image_resolution)
|
1705
2056
|
|
1706
2057
|
# Add relative path to data
|
1707
2058
|
element_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
|
@@ -1989,8 +2340,8 @@ class ElementCollection(
|
|
1989
2340
|
# ------------------------------------------------------------------
|
1990
2341
|
def apply_ocr(
|
1991
2342
|
self,
|
1992
|
-
*,
|
1993
2343
|
function: Optional[Callable[["Region"], Optional[str]]] = None,
|
2344
|
+
*,
|
1994
2345
|
show_progress: bool = True,
|
1995
2346
|
**kwargs,
|
1996
2347
|
) -> "ElementCollection":
|
@@ -2046,1275 +2397,3 @@ class ElementCollection(
|
|
2046
2397
|
return self
|
2047
2398
|
|
2048
2399
|
# ------------------------------------------------------------------
|
2049
|
-
|
2050
|
-
|
2051
|
-
class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin):
|
2052
|
-
"""
|
2053
|
-
Represents a collection of Page objects, often from a single PDF document.
|
2054
|
-
Provides methods for batch operations on these pages.
|
2055
|
-
"""
|
2056
|
-
|
2057
|
-
def __init__(self, pages: Union[List[P], Sequence[P]]):
|
2058
|
-
"""
|
2059
|
-
Initialize a page collection.
|
2060
|
-
|
2061
|
-
Args:
|
2062
|
-
pages: List or sequence of Page objects (can be lazy)
|
2063
|
-
"""
|
2064
|
-
# Store the sequence as-is to preserve lazy behavior
|
2065
|
-
# Only convert to list if we need list-specific operations
|
2066
|
-
if hasattr(pages, '__iter__') and hasattr(pages, '__len__'):
|
2067
|
-
self.pages = pages
|
2068
|
-
else:
|
2069
|
-
# Fallback for non-sequence types
|
2070
|
-
self.pages = list(pages)
|
2071
|
-
|
2072
|
-
def __len__(self) -> int:
|
2073
|
-
"""Return the number of pages in the collection."""
|
2074
|
-
return len(self.pages)
|
2075
|
-
|
2076
|
-
def __getitem__(self, idx) -> Union[P, "PageCollection[P]"]:
|
2077
|
-
"""Support indexing and slicing."""
|
2078
|
-
if isinstance(idx, slice):
|
2079
|
-
return PageCollection(self.pages[idx])
|
2080
|
-
return self.pages[idx]
|
2081
|
-
|
2082
|
-
def __iter__(self) -> Iterator[P]:
|
2083
|
-
"""Support iteration."""
|
2084
|
-
return iter(self.pages)
|
2085
|
-
|
2086
|
-
def __repr__(self) -> str:
|
2087
|
-
"""Return a string representation showing the page count."""
|
2088
|
-
return f"<PageCollection(count={len(self)})>"
|
2089
|
-
|
2090
|
-
def _get_items_for_apply(self) -> Iterator[P]:
|
2091
|
-
"""
|
2092
|
-
Override ApplyMixin's _get_items_for_apply to preserve lazy behavior.
|
2093
|
-
|
2094
|
-
Returns an iterator that yields pages on-demand rather than materializing
|
2095
|
-
all pages at once, maintaining the lazy loading behavior.
|
2096
|
-
"""
|
2097
|
-
return iter(self.pages)
|
2098
|
-
|
2099
|
-
def _get_page_indices(self) -> List[int]:
|
2100
|
-
"""
|
2101
|
-
Get page indices without forcing materialization of pages.
|
2102
|
-
|
2103
|
-
Returns:
|
2104
|
-
List of page indices for the pages in this collection.
|
2105
|
-
"""
|
2106
|
-
# Handle different types of page sequences efficiently
|
2107
|
-
if hasattr(self.pages, '_indices'):
|
2108
|
-
# If it's a _LazyPageList (or slice), get indices directly
|
2109
|
-
return list(self.pages._indices)
|
2110
|
-
else:
|
2111
|
-
# Fallback: if pages are already materialized, get indices normally
|
2112
|
-
# This will force materialization but only if pages aren't lazy
|
2113
|
-
return [p.index for p in self.pages]
|
2114
|
-
|
2115
|
-
def extract_text(
|
2116
|
-
self,
|
2117
|
-
keep_blank_chars: bool = True,
|
2118
|
-
apply_exclusions: bool = True,
|
2119
|
-
strip: Optional[bool] = None,
|
2120
|
-
**kwargs,
|
2121
|
-
) -> str:
|
2122
|
-
"""
|
2123
|
-
Extract text from all pages in the collection.
|
2124
|
-
|
2125
|
-
Args:
|
2126
|
-
keep_blank_chars: Whether to keep blank characters (default: True)
|
2127
|
-
apply_exclusions: Whether to apply exclusion regions (default: True)
|
2128
|
-
strip: Whether to strip whitespace from the extracted text.
|
2129
|
-
**kwargs: Additional extraction parameters
|
2130
|
-
|
2131
|
-
Returns:
|
2132
|
-
Combined text from all pages
|
2133
|
-
"""
|
2134
|
-
texts = []
|
2135
|
-
for page in self.pages:
|
2136
|
-
text = page.extract_text(
|
2137
|
-
keep_blank_chars=keep_blank_chars,
|
2138
|
-
apply_exclusions=apply_exclusions,
|
2139
|
-
**kwargs,
|
2140
|
-
)
|
2141
|
-
texts.append(text)
|
2142
|
-
|
2143
|
-
combined = "\n".join(texts)
|
2144
|
-
|
2145
|
-
# Default strip behaviour: if caller picks, honour; else respect layout flag passed via kwargs.
|
2146
|
-
use_layout = kwargs.get("layout", False)
|
2147
|
-
strip_final = strip if strip is not None else (not use_layout)
|
2148
|
-
|
2149
|
-
if strip_final:
|
2150
|
-
combined = "\n".join(line.rstrip() for line in combined.splitlines()).strip()
|
2151
|
-
|
2152
|
-
return combined
|
2153
|
-
|
2154
|
-
def apply_ocr(
|
2155
|
-
self,
|
2156
|
-
engine: Optional[str] = None,
|
2157
|
-
# --- Common OCR Parameters (Direct Arguments) ---
|
2158
|
-
languages: Optional[List[str]] = None,
|
2159
|
-
min_confidence: Optional[float] = None, # Min confidence threshold
|
2160
|
-
device: Optional[str] = None,
|
2161
|
-
resolution: Optional[int] = None, # DPI for rendering
|
2162
|
-
apply_exclusions: bool = True, # New parameter
|
2163
|
-
replace: bool = True, # Whether to replace existing OCR elements
|
2164
|
-
# --- Engine-Specific Options ---
|
2165
|
-
options: Optional[Any] = None, # e.g., EasyOCROptions(...)
|
2166
|
-
) -> "PageCollection[P]":
|
2167
|
-
"""
|
2168
|
-
Applies OCR to all pages within this collection using batch processing.
|
2169
|
-
|
2170
|
-
This delegates the work to the parent PDF object's `apply_ocr` method.
|
2171
|
-
|
2172
|
-
Args:
|
2173
|
-
engine: Name of the OCR engine (e.g., 'easyocr', 'paddleocr').
|
2174
|
-
languages: List of language codes (e.g., ['en', 'fr'], ['en', 'ch']).
|
2175
|
-
**Must be codes understood by the specific selected engine.**
|
2176
|
-
No mapping is performed.
|
2177
|
-
min_confidence: Minimum confidence threshold for detected text (0.0 to 1.0).
|
2178
|
-
device: Device to run OCR on (e.g., 'cpu', 'cuda', 'mps').
|
2179
|
-
resolution: DPI resolution to render page images before OCR (e.g., 150, 300).
|
2180
|
-
apply_exclusions: If True (default), render page images for OCR with
|
2181
|
-
excluded areas masked (whited out). If False, OCR
|
2182
|
-
the raw page images without masking exclusions.
|
2183
|
-
replace: If True (default), remove any existing OCR elements before
|
2184
|
-
adding new ones. If False, add new OCR elements to existing ones.
|
2185
|
-
options: An engine-specific options object (e.g., EasyOCROptions) or dict.
|
2186
|
-
|
2187
|
-
Returns:
|
2188
|
-
Self for method chaining.
|
2189
|
-
|
2190
|
-
Raises:
|
2191
|
-
RuntimeError: If pages lack a parent PDF or parent lacks `apply_ocr`.
|
2192
|
-
(Propagates exceptions from PDF.apply_ocr)
|
2193
|
-
"""
|
2194
|
-
if not self.pages:
|
2195
|
-
logger.warning("Cannot apply OCR to an empty PageCollection.")
|
2196
|
-
return self
|
2197
|
-
|
2198
|
-
# Assume all pages share the same parent PDF object
|
2199
|
-
first_page = self.pages[0]
|
2200
|
-
if not hasattr(first_page, "_parent") or not first_page._parent:
|
2201
|
-
raise RuntimeError("Pages in this collection do not have a parent PDF reference.")
|
2202
|
-
|
2203
|
-
parent_pdf = first_page._parent
|
2204
|
-
|
2205
|
-
if not hasattr(parent_pdf, "apply_ocr") or not callable(parent_pdf.apply_ocr):
|
2206
|
-
raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")
|
2207
|
-
|
2208
|
-
# Get the 0-based indices of the pages in this collection
|
2209
|
-
page_indices = self._get_page_indices()
|
2210
|
-
|
2211
|
-
logger.info(f"Applying OCR via parent PDF to page indices: {page_indices} in collection.")
|
2212
|
-
|
2213
|
-
# Delegate the batch call to the parent PDF object, passing direct args and apply_exclusions
|
2214
|
-
parent_pdf.apply_ocr(
|
2215
|
-
pages=page_indices,
|
2216
|
-
engine=engine,
|
2217
|
-
languages=languages,
|
2218
|
-
min_confidence=min_confidence, # Pass the renamed parameter
|
2219
|
-
device=device,
|
2220
|
-
resolution=resolution,
|
2221
|
-
apply_exclusions=apply_exclusions, # Pass down
|
2222
|
-
replace=replace, # Pass the replace parameter
|
2223
|
-
options=options,
|
2224
|
-
)
|
2225
|
-
# The PDF method modifies the Page objects directly by adding elements.
|
2226
|
-
|
2227
|
-
return self # Return self for chaining
|
2228
|
-
|
2229
|
-
@overload
|
2230
|
-
def find(
|
2231
|
-
self,
|
2232
|
-
*,
|
2233
|
-
text: str,
|
2234
|
-
contains: str = "all",
|
2235
|
-
apply_exclusions: bool = True,
|
2236
|
-
regex: bool = False,
|
2237
|
-
case: bool = True,
|
2238
|
-
**kwargs,
|
2239
|
-
) -> Optional[T]: ...
|
2240
|
-
|
2241
|
-
@overload
|
2242
|
-
def find(
|
2243
|
-
self,
|
2244
|
-
selector: str,
|
2245
|
-
*,
|
2246
|
-
contains: str = "all",
|
2247
|
-
apply_exclusions: bool = True,
|
2248
|
-
regex: bool = False,
|
2249
|
-
case: bool = True,
|
2250
|
-
**kwargs,
|
2251
|
-
) -> Optional[T]: ...
|
2252
|
-
|
2253
|
-
def find(
|
2254
|
-
self,
|
2255
|
-
selector: Optional[str] = None,
|
2256
|
-
*,
|
2257
|
-
text: Optional[str] = None,
|
2258
|
-
contains: str = "all",
|
2259
|
-
apply_exclusions: bool = True,
|
2260
|
-
regex: bool = False,
|
2261
|
-
case: bool = True,
|
2262
|
-
**kwargs,
|
2263
|
-
) -> Optional[T]:
|
2264
|
-
"""
|
2265
|
-
Find the first element matching the selector OR text across all pages in the collection.
|
2266
|
-
|
2267
|
-
Provide EITHER `selector` OR `text`, but not both.
|
2268
|
-
|
2269
|
-
Args:
|
2270
|
-
selector: CSS-like selector string.
|
2271
|
-
text: Text content to search for (equivalent to 'text:contains(...)').
|
2272
|
-
contains: How to determine if elements are inside: 'all' (fully inside),
|
2273
|
-
'any' (any overlap), or 'center' (center point inside).
|
2274
|
-
(default: "all")
|
2275
|
-
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
2276
|
-
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
2277
|
-
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
2278
|
-
**kwargs: Additional filter parameters.
|
2279
|
-
|
2280
|
-
Returns:
|
2281
|
-
First matching element or None.
|
2282
|
-
"""
|
2283
|
-
# Input validation happens within page.find
|
2284
|
-
for page in self.pages:
|
2285
|
-
element = page.find(
|
2286
|
-
selector=selector,
|
2287
|
-
text=text,
|
2288
|
-
contains=contains,
|
2289
|
-
apply_exclusions=apply_exclusions,
|
2290
|
-
regex=regex,
|
2291
|
-
case=case,
|
2292
|
-
**kwargs,
|
2293
|
-
)
|
2294
|
-
if element:
|
2295
|
-
return element
|
2296
|
-
return None
|
2297
|
-
|
2298
|
-
@overload
|
2299
|
-
def find_all(
|
2300
|
-
self,
|
2301
|
-
*,
|
2302
|
-
text: str,
|
2303
|
-
contains: str = "all",
|
2304
|
-
apply_exclusions: bool = True,
|
2305
|
-
regex: bool = False,
|
2306
|
-
case: bool = True,
|
2307
|
-
**kwargs,
|
2308
|
-
) -> "ElementCollection": ...
|
2309
|
-
|
2310
|
-
@overload
|
2311
|
-
def find_all(
|
2312
|
-
self,
|
2313
|
-
selector: str,
|
2314
|
-
*,
|
2315
|
-
contains: str = "all",
|
2316
|
-
apply_exclusions: bool = True,
|
2317
|
-
regex: bool = False,
|
2318
|
-
case: bool = True,
|
2319
|
-
**kwargs,
|
2320
|
-
) -> "ElementCollection": ...
|
2321
|
-
|
2322
|
-
def find_all(
|
2323
|
-
self,
|
2324
|
-
selector: Optional[str] = None,
|
2325
|
-
*,
|
2326
|
-
text: Optional[str] = None,
|
2327
|
-
contains: str = "all",
|
2328
|
-
apply_exclusions: bool = True,
|
2329
|
-
regex: bool = False,
|
2330
|
-
case: bool = True,
|
2331
|
-
**kwargs,
|
2332
|
-
) -> "ElementCollection":
|
2333
|
-
"""
|
2334
|
-
Find all elements matching the selector OR text across all pages in the collection.
|
2335
|
-
|
2336
|
-
Provide EITHER `selector` OR `text`, but not both.
|
2337
|
-
|
2338
|
-
Args:
|
2339
|
-
selector: CSS-like selector string.
|
2340
|
-
text: Text content to search for (equivalent to 'text:contains(...)').
|
2341
|
-
contains: How to determine if elements are inside: 'all' (fully inside),
|
2342
|
-
'any' (any overlap), or 'center' (center point inside).
|
2343
|
-
(default: "all")
|
2344
|
-
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
2345
|
-
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
2346
|
-
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
2347
|
-
**kwargs: Additional filter parameters.
|
2348
|
-
|
2349
|
-
Returns:
|
2350
|
-
ElementCollection with matching elements from all pages.
|
2351
|
-
"""
|
2352
|
-
all_elements = []
|
2353
|
-
# Input validation happens within page.find_all
|
2354
|
-
for page in self.pages:
|
2355
|
-
elements = page.find_all(
|
2356
|
-
selector=selector,
|
2357
|
-
text=text,
|
2358
|
-
contains=contains,
|
2359
|
-
apply_exclusions=apply_exclusions,
|
2360
|
-
regex=regex,
|
2361
|
-
case=case,
|
2362
|
-
**kwargs,
|
2363
|
-
)
|
2364
|
-
if elements:
|
2365
|
-
all_elements.extend(elements.elements)
|
2366
|
-
|
2367
|
-
return ElementCollection(all_elements)
|
2368
|
-
|
2369
|
-
def update_text(
|
2370
|
-
self,
|
2371
|
-
transform: Callable[[Any], Optional[str]],
|
2372
|
-
selector: str = "text",
|
2373
|
-
max_workers: Optional[int] = None,
|
2374
|
-
) -> "PageCollection[P]":
|
2375
|
-
"""
|
2376
|
-
Applies corrections to text elements across all pages
|
2377
|
-
in this collection using a user-provided callback function, executed
|
2378
|
-
in parallel if `max_workers` is specified.
|
2379
|
-
|
2380
|
-
This method delegates to the parent PDF's `update_text` method,
|
2381
|
-
targeting all pages within this collection.
|
2382
|
-
|
2383
|
-
Args:
|
2384
|
-
transform: A function that accepts a single argument (an element
|
2385
|
-
object) and returns `Optional[str]` (new text or None).
|
2386
|
-
selector: The attribute name to update. Default is 'text'.
|
2387
|
-
max_workers: The maximum number of worker threads to use for parallel
|
2388
|
-
correction on each page. If None, defaults are used.
|
2389
|
-
|
2390
|
-
Returns:
|
2391
|
-
Self for method chaining.
|
2392
|
-
|
2393
|
-
Raises:
|
2394
|
-
RuntimeError: If the collection is empty, pages lack a parent PDF reference,
|
2395
|
-
or the parent PDF lacks the `update_text` method.
|
2396
|
-
"""
|
2397
|
-
if not self.pages:
|
2398
|
-
logger.warning("Cannot update text for an empty PageCollection.")
|
2399
|
-
# Return self even if empty to maintain chaining consistency
|
2400
|
-
return self
|
2401
|
-
|
2402
|
-
# Assume all pages share the same parent PDF object
|
2403
|
-
parent_pdf = self.pages[0]._parent
|
2404
|
-
if (
|
2405
|
-
not parent_pdf
|
2406
|
-
or not hasattr(parent_pdf, "update_text")
|
2407
|
-
or not callable(parent_pdf.update_text)
|
2408
|
-
):
|
2409
|
-
raise RuntimeError(
|
2410
|
-
"Parent PDF reference not found or parent PDF lacks the required 'update_text' method."
|
2411
|
-
)
|
2412
|
-
|
2413
|
-
page_indices = self._get_page_indices()
|
2414
|
-
logger.info(
|
2415
|
-
f"PageCollection: Delegating text update to parent PDF for page indices: {page_indices} with max_workers={max_workers} and selector='{selector}'."
|
2416
|
-
)
|
2417
|
-
|
2418
|
-
# Delegate the call to the parent PDF object for the relevant pages
|
2419
|
-
# Pass the max_workers parameter down
|
2420
|
-
parent_pdf.update_text(
|
2421
|
-
transform=transform,
|
2422
|
-
pages=page_indices,
|
2423
|
-
selector=selector,
|
2424
|
-
max_workers=max_workers,
|
2425
|
-
)
|
2426
|
-
|
2427
|
-
return self
|
2428
|
-
|
2429
|
-
def get_sections(
|
2430
|
-
self,
|
2431
|
-
start_elements=None,
|
2432
|
-
end_elements=None,
|
2433
|
-
new_section_on_page_break=False,
|
2434
|
-
boundary_inclusion="both",
|
2435
|
-
) -> "ElementCollection[Region]":
|
2436
|
-
"""
|
2437
|
-
Extract sections from a page collection based on start/end elements.
|
2438
|
-
|
2439
|
-
Args:
|
2440
|
-
start_elements: Elements or selector string that mark the start of sections (optional)
|
2441
|
-
end_elements: Elements or selector string that mark the end of sections (optional)
|
2442
|
-
new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
|
2443
|
-
boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
|
2444
|
-
|
2445
|
-
Returns:
|
2446
|
-
List of Region objects representing the extracted sections
|
2447
|
-
|
2448
|
-
Note:
|
2449
|
-
You can provide only start_elements, only end_elements, or both.
|
2450
|
-
- With only start_elements: sections go from each start to the next start (or end of page)
|
2451
|
-
- With only end_elements: sections go from beginning of document/page to each end
|
2452
|
-
- With both: sections go from each start to the corresponding end
|
2453
|
-
"""
|
2454
|
-
# Find start and end elements across all pages
|
2455
|
-
if isinstance(start_elements, str):
|
2456
|
-
start_elements = self.find_all(start_elements).elements
|
2457
|
-
|
2458
|
-
if isinstance(end_elements, str):
|
2459
|
-
end_elements = self.find_all(end_elements).elements
|
2460
|
-
|
2461
|
-
# If no start elements and no end elements, return empty list
|
2462
|
-
if not start_elements and not end_elements:
|
2463
|
-
return []
|
2464
|
-
|
2465
|
-
# If there are page break boundaries, we'll need to add them
|
2466
|
-
if new_section_on_page_break:
|
2467
|
-
# For each page boundary, create virtual "end" and "start" elements
|
2468
|
-
for i in range(len(self.pages) - 1):
|
2469
|
-
# Add a virtual "end" element at the bottom of the current page
|
2470
|
-
page = self.pages[i]
|
2471
|
-
# If end_elements is None, initialize it as an empty list
|
2472
|
-
if end_elements is None:
|
2473
|
-
end_elements = []
|
2474
|
-
|
2475
|
-
# Create a region at the bottom of the page as an artificial end marker
|
2476
|
-
from natural_pdf.elements.region import Region
|
2477
|
-
|
2478
|
-
bottom_region = Region(page, (0, page.height - 1, page.width, page.height))
|
2479
|
-
bottom_region.is_page_boundary = True # Mark it as a special boundary
|
2480
|
-
end_elements.append(bottom_region)
|
2481
|
-
|
2482
|
-
# Add a virtual "start" element at the top of the next page
|
2483
|
-
next_page = self.pages[i + 1]
|
2484
|
-
top_region = Region(next_page, (0, 0, next_page.width, 1))
|
2485
|
-
top_region.is_page_boundary = True # Mark it as a special boundary
|
2486
|
-
start_elements.append(top_region)
|
2487
|
-
|
2488
|
-
# Get all elements from all pages and sort them in document order
|
2489
|
-
all_elements = []
|
2490
|
-
for page in self.pages:
|
2491
|
-
elements = page.get_elements()
|
2492
|
-
all_elements.extend(elements)
|
2493
|
-
|
2494
|
-
# Sort by page index, then vertical position, then horizontal position
|
2495
|
-
all_elements.sort(key=lambda e: (e.page.index, e.top, e.x0))
|
2496
|
-
|
2497
|
-
# If we only have end_elements (no start_elements), create implicit start elements
|
2498
|
-
if not start_elements and end_elements:
|
2499
|
-
from natural_pdf.elements.region import Region
|
2500
|
-
|
2501
|
-
start_elements = []
|
2502
|
-
|
2503
|
-
# Add implicit start at the beginning of the first page
|
2504
|
-
first_page = self.pages[0]
|
2505
|
-
first_start = Region(first_page, (0, 0, first_page.width, 1))
|
2506
|
-
first_start.is_implicit_start = True
|
2507
|
-
start_elements.append(first_start)
|
2508
|
-
|
2509
|
-
# For each end element (except the last), add an implicit start after it
|
2510
|
-
sorted_end_elements = sorted(end_elements, key=lambda e: (e.page.index, e.top, e.x0))
|
2511
|
-
for i, end_elem in enumerate(sorted_end_elements[:-1]): # Exclude last end element
|
2512
|
-
# Create implicit start element right after this end element
|
2513
|
-
implicit_start = Region(end_elem.page, (0, end_elem.bottom, end_elem.page.width, end_elem.bottom + 1))
|
2514
|
-
implicit_start.is_implicit_start = True
|
2515
|
-
start_elements.append(implicit_start)
|
2516
|
-
|
2517
|
-
# Mark section boundaries
|
2518
|
-
section_boundaries = []
|
2519
|
-
|
2520
|
-
# Add start element boundaries
|
2521
|
-
for element in start_elements:
|
2522
|
-
if element in all_elements:
|
2523
|
-
idx = all_elements.index(element)
|
2524
|
-
section_boundaries.append(
|
2525
|
-
{
|
2526
|
-
"index": idx,
|
2527
|
-
"element": element,
|
2528
|
-
"type": "start",
|
2529
|
-
"page_idx": element.page.index,
|
2530
|
-
}
|
2531
|
-
)
|
2532
|
-
elif hasattr(element, "is_page_boundary") and element.is_page_boundary:
|
2533
|
-
# This is a virtual page boundary element
|
2534
|
-
section_boundaries.append(
|
2535
|
-
{
|
2536
|
-
"index": -1, # Special index for page boundaries
|
2537
|
-
"element": element,
|
2538
|
-
"type": "start",
|
2539
|
-
"page_idx": element.page.index,
|
2540
|
-
}
|
2541
|
-
)
|
2542
|
-
elif hasattr(element, "is_implicit_start") and element.is_implicit_start:
|
2543
|
-
# This is an implicit start element
|
2544
|
-
section_boundaries.append(
|
2545
|
-
{
|
2546
|
-
"index": -2, # Special index for implicit starts
|
2547
|
-
"element": element,
|
2548
|
-
"type": "start",
|
2549
|
-
"page_idx": element.page.index,
|
2550
|
-
}
|
2551
|
-
)
|
2552
|
-
|
2553
|
-
# Add end element boundaries if provided
|
2554
|
-
if end_elements:
|
2555
|
-
for element in end_elements:
|
2556
|
-
if element in all_elements:
|
2557
|
-
idx = all_elements.index(element)
|
2558
|
-
section_boundaries.append(
|
2559
|
-
{
|
2560
|
-
"index": idx,
|
2561
|
-
"element": element,
|
2562
|
-
"type": "end",
|
2563
|
-
"page_idx": element.page.index,
|
2564
|
-
}
|
2565
|
-
)
|
2566
|
-
elif hasattr(element, "is_page_boundary") and element.is_page_boundary:
|
2567
|
-
# This is a virtual page boundary element
|
2568
|
-
section_boundaries.append(
|
2569
|
-
{
|
2570
|
-
"index": -1, # Special index for page boundaries
|
2571
|
-
"element": element,
|
2572
|
-
"type": "end",
|
2573
|
-
"page_idx": element.page.index,
|
2574
|
-
}
|
2575
|
-
)
|
2576
|
-
|
2577
|
-
# Sort boundaries by page index, then by actual document position
|
2578
|
-
def _sort_key(boundary):
|
2579
|
-
"""Sort boundaries by (page_idx, vertical_top, priority)."""
|
2580
|
-
page_idx = boundary["page_idx"]
|
2581
|
-
element = boundary["element"]
|
2582
|
-
|
2583
|
-
# Vertical position on the page
|
2584
|
-
y_pos = getattr(element, "top", 0.0)
|
2585
|
-
|
2586
|
-
# Ensure starts come before ends at the same coordinate
|
2587
|
-
priority = 0 if boundary["type"] == "start" else 1
|
2588
|
-
|
2589
|
-
return (page_idx, y_pos, priority)
|
2590
|
-
|
2591
|
-
section_boundaries.sort(key=_sort_key)
|
2592
|
-
|
2593
|
-
# Generate sections
|
2594
|
-
sections = []
|
2595
|
-
|
2596
|
-
# --- Helper: build a FlowRegion spanning multiple pages ---
|
2597
|
-
def _build_flow_region(start_el, end_el):
|
2598
|
-
"""Return a FlowRegion that covers from *start_el* to *end_el* (inclusive).
|
2599
|
-
If *end_el* is None, the region continues to the bottom of the last
|
2600
|
-
page in this PageCollection."""
|
2601
|
-
# Local imports to avoid top-level cycles
|
2602
|
-
from natural_pdf.elements.region import Region
|
2603
|
-
from natural_pdf.flows.element import FlowElement
|
2604
|
-
from natural_pdf.flows.flow import Flow
|
2605
|
-
from natural_pdf.flows.region import FlowRegion
|
2606
|
-
|
2607
|
-
start_pg = start_el.page
|
2608
|
-
end_pg = end_el.page if end_el is not None else self.pages[-1]
|
2609
|
-
|
2610
|
-
parts: list[Region] = []
|
2611
|
-
|
2612
|
-
# Use the actual top of the start element (for implicit starts this is
|
2613
|
-
# the bottom of the previous end element) instead of forcing to 0.
|
2614
|
-
start_top = start_el.top
|
2615
|
-
|
2616
|
-
# Slice of first page beginning at *start_top*
|
2617
|
-
parts.append(Region(start_pg, (0, start_top, start_pg.width, start_pg.height)))
|
2618
|
-
|
2619
|
-
# Full middle pages
|
2620
|
-
for pg_idx in range(start_pg.index + 1, end_pg.index):
|
2621
|
-
mid_pg = self.pages[pg_idx]
|
2622
|
-
parts.append(Region(mid_pg, (0, 0, mid_pg.width, mid_pg.height)))
|
2623
|
-
|
2624
|
-
# Slice of last page (if distinct)
|
2625
|
-
if end_pg is not start_pg:
|
2626
|
-
bottom = end_el.bottom if end_el is not None else end_pg.height
|
2627
|
-
parts.append(Region(end_pg, (0, 0, end_pg.width, bottom)))
|
2628
|
-
|
2629
|
-
flow = Flow(segments=parts, arrangement="vertical")
|
2630
|
-
src_fe = FlowElement(physical_object=start_el, flow=flow)
|
2631
|
-
return FlowRegion(
|
2632
|
-
flow=flow,
|
2633
|
-
constituent_regions=parts,
|
2634
|
-
source_flow_element=src_fe,
|
2635
|
-
boundary_element_found=end_el,
|
2636
|
-
)
|
2637
|
-
|
2638
|
-
# ------------------------------------------------------------------
|
2639
|
-
|
2640
|
-
current_start = None
|
2641
|
-
|
2642
|
-
for i, boundary in enumerate(section_boundaries):
|
2643
|
-
# If it's a start boundary and we don't have a current start
|
2644
|
-
if boundary["type"] == "start" and current_start is None:
|
2645
|
-
current_start = boundary
|
2646
|
-
|
2647
|
-
# If it's an end boundary and we have a current start
|
2648
|
-
elif boundary["type"] == "end" and current_start is not None:
|
2649
|
-
# Create a section from current_start to this boundary
|
2650
|
-
start_element = current_start["element"]
|
2651
|
-
end_element = boundary["element"]
|
2652
|
-
|
2653
|
-
# If both elements are on the same page, use the page's get_section_between
|
2654
|
-
if start_element.page == end_element.page:
|
2655
|
-
# For implicit start elements, create a region from the top of the page
|
2656
|
-
if hasattr(start_element, "is_implicit_start"):
|
2657
|
-
from natural_pdf.elements.region import Region
|
2658
|
-
section = Region(
|
2659
|
-
start_element.page,
|
2660
|
-
(0, start_element.top, start_element.page.width, end_element.bottom)
|
2661
|
-
)
|
2662
|
-
section.start_element = start_element
|
2663
|
-
section.boundary_element_found = end_element
|
2664
|
-
else:
|
2665
|
-
section = start_element.page.get_section_between(
|
2666
|
-
start_element, end_element, boundary_inclusion
|
2667
|
-
)
|
2668
|
-
sections.append(section)
|
2669
|
-
else:
|
2670
|
-
# Create FlowRegion spanning pages
|
2671
|
-
flow_region = _build_flow_region(start_element, end_element)
|
2672
|
-
sections.append(flow_region)
|
2673
|
-
|
2674
|
-
current_start = None
|
2675
|
-
|
2676
|
-
# If it's another start boundary and we have a current start (for splitting by starts only)
|
2677
|
-
elif boundary["type"] == "start" and current_start is not None and not end_elements:
|
2678
|
-
# Create a section from current_start to just before this boundary
|
2679
|
-
start_element = current_start["element"]
|
2680
|
-
|
2681
|
-
# Find the last element before this boundary on the same page
|
2682
|
-
if start_element.page == boundary["element"].page:
|
2683
|
-
# Find elements on this page
|
2684
|
-
page_elements = [e for e in all_elements if e.page == start_element.page]
|
2685
|
-
# Sort by position
|
2686
|
-
page_elements.sort(key=lambda e: (e.top, e.x0))
|
2687
|
-
|
2688
|
-
# Find the last element before the boundary
|
2689
|
-
end_idx = (
|
2690
|
-
page_elements.index(boundary["element"]) - 1
|
2691
|
-
if boundary["element"] in page_elements
|
2692
|
-
else -1
|
2693
|
-
)
|
2694
|
-
end_element = page_elements[end_idx] if end_idx >= 0 else None
|
2695
|
-
|
2696
|
-
# Create the section
|
2697
|
-
section = start_element.page.get_section_between(
|
2698
|
-
start_element, end_element, boundary_inclusion
|
2699
|
-
)
|
2700
|
-
sections.append(section)
|
2701
|
-
else:
|
2702
|
-
# Cross-page section - create from current_start to the end of its page
|
2703
|
-
from natural_pdf.elements.region import Region
|
2704
|
-
|
2705
|
-
start_page = start_element.page
|
2706
|
-
|
2707
|
-
# Handle implicit start elements
|
2708
|
-
start_top = start_element.top
|
2709
|
-
region = Region(
|
2710
|
-
start_page, (0, start_top, start_page.width, start_page.height)
|
2711
|
-
)
|
2712
|
-
region.start_element = start_element
|
2713
|
-
sections.append(region)
|
2714
|
-
|
2715
|
-
current_start = boundary
|
2716
|
-
|
2717
|
-
# Handle the last section if we have a current start
|
2718
|
-
if current_start is not None:
|
2719
|
-
start_element = current_start["element"]
|
2720
|
-
start_page = start_element.page
|
2721
|
-
|
2722
|
-
if end_elements:
|
2723
|
-
# With end_elements, we need an explicit end - use the last element
|
2724
|
-
# on the last page of the collection
|
2725
|
-
last_page = self.pages[-1]
|
2726
|
-
last_page_elements = [e for e in all_elements if e.page == last_page]
|
2727
|
-
last_page_elements.sort(key=lambda e: (e.top, e.x0))
|
2728
|
-
end_element = last_page_elements[-1] if last_page_elements else None
|
2729
|
-
|
2730
|
-
# Create FlowRegion spanning multiple pages using helper
|
2731
|
-
flow_region = _build_flow_region(start_element, end_element)
|
2732
|
-
sections.append(flow_region)
|
2733
|
-
else:
|
2734
|
-
# With start_elements only, create a section to the end of the current page
|
2735
|
-
from natural_pdf.elements.region import Region
|
2736
|
-
|
2737
|
-
# Handle implicit start elements
|
2738
|
-
start_top = start_element.top
|
2739
|
-
region = Region(
|
2740
|
-
start_page, (0, start_top, start_page.width, start_page.height)
|
2741
|
-
)
|
2742
|
-
region.start_element = start_element
|
2743
|
-
sections.append(region)
|
2744
|
-
|
2745
|
-
return ElementCollection(sections)
|
2746
|
-
|
2747
|
-
def _gather_analysis_data(
|
2748
|
-
self,
|
2749
|
-
analysis_keys: List[str],
|
2750
|
-
include_content: bool,
|
2751
|
-
include_images: bool,
|
2752
|
-
image_dir: Optional[Path],
|
2753
|
-
image_format: str,
|
2754
|
-
image_resolution: int,
|
2755
|
-
) -> List[Dict[str, Any]]:
|
2756
|
-
"""
|
2757
|
-
Gather analysis data from all pages in the collection.
|
2758
|
-
|
2759
|
-
Args:
|
2760
|
-
analysis_keys: Keys in the analyses dictionary to export
|
2761
|
-
include_content: Whether to include extracted text
|
2762
|
-
include_images: Whether to export images
|
2763
|
-
image_dir: Directory to save images
|
2764
|
-
image_format: Format to save images
|
2765
|
-
image_resolution: Resolution for exported images
|
2766
|
-
|
2767
|
-
Returns:
|
2768
|
-
List of dictionaries containing analysis data
|
2769
|
-
"""
|
2770
|
-
if not self.elements:
|
2771
|
-
logger.warning("No pages found in collection")
|
2772
|
-
return []
|
2773
|
-
|
2774
|
-
all_data = []
|
2775
|
-
|
2776
|
-
for page in self.elements:
|
2777
|
-
# Basic page information
|
2778
|
-
page_data = {
|
2779
|
-
"page_number": page.number,
|
2780
|
-
"page_index": page.index,
|
2781
|
-
"width": page.width,
|
2782
|
-
"height": page.height,
|
2783
|
-
}
|
2784
|
-
|
2785
|
-
# Add PDF information if available
|
2786
|
-
if hasattr(page, "pdf") and page.pdf:
|
2787
|
-
page_data["pdf_path"] = page.pdf.path
|
2788
|
-
page_data["pdf_filename"] = Path(page.pdf.path).name
|
2789
|
-
|
2790
|
-
# Include extracted text if requested
|
2791
|
-
if include_content:
|
2792
|
-
try:
|
2793
|
-
page_data["content"] = page.extract_text(preserve_whitespace=True)
|
2794
|
-
except Exception as e:
|
2795
|
-
logger.error(f"Error extracting text from page {page.number}: {e}")
|
2796
|
-
page_data["content"] = ""
|
2797
|
-
|
2798
|
-
# Save image if requested
|
2799
|
-
if include_images:
|
2800
|
-
try:
|
2801
|
-
# Create image filename
|
2802
|
-
pdf_name = "unknown"
|
2803
|
-
if hasattr(page, "pdf") and page.pdf:
|
2804
|
-
pdf_name = Path(page.pdf.path).stem
|
2805
|
-
|
2806
|
-
image_filename = f"{pdf_name}_page_{page.number}.{image_format}"
|
2807
|
-
image_path = image_dir / image_filename
|
2808
|
-
|
2809
|
-
# Save image
|
2810
|
-
page.save_image(
|
2811
|
-
str(image_path), resolution=image_resolution, include_highlights=True
|
2812
|
-
)
|
2813
|
-
|
2814
|
-
# Add relative path to data
|
2815
|
-
page_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
|
2816
|
-
except Exception as e:
|
2817
|
-
logger.error(f"Error saving image for page {page.number}: {e}")
|
2818
|
-
page_data["image_path"] = None
|
2819
|
-
|
2820
|
-
# Add analyses data
|
2821
|
-
if hasattr(page, "analyses") and page.analyses:
|
2822
|
-
for key in analysis_keys:
|
2823
|
-
if key not in page.analyses:
|
2824
|
-
raise KeyError(f"Analysis key '{key}' not found in page {page.number}")
|
2825
|
-
|
2826
|
-
# Get the analysis result
|
2827
|
-
analysis_result = page.analyses[key]
|
2828
|
-
|
2829
|
-
# If the result has a to_dict method, use it
|
2830
|
-
if hasattr(analysis_result, "to_dict"):
|
2831
|
-
analysis_data = analysis_result.to_dict()
|
2832
|
-
else:
|
2833
|
-
# Otherwise, use the result directly if it's dict-like
|
2834
|
-
try:
|
2835
|
-
analysis_data = dict(analysis_result)
|
2836
|
-
except (TypeError, ValueError):
|
2837
|
-
# Last resort: convert to string
|
2838
|
-
analysis_data = {"raw_result": str(analysis_result)}
|
2839
|
-
|
2840
|
-
# Add analysis data to page data with the key as prefix
|
2841
|
-
for k, v in analysis_data.items():
|
2842
|
-
page_data[f"{key}.{k}"] = v
|
2843
|
-
|
2844
|
-
all_data.append(page_data)
|
2845
|
-
|
2846
|
-
return all_data
|
2847
|
-
|
2848
|
-
# --- Deskew Method --- #
|
2849
|
-
|
2850
|
-
def deskew(
    self,
    resolution: int = 300,
    detection_resolution: int = 72,
    force_overwrite: bool = False,
    **deskew_kwargs,
) -> "PDF":
    """
    Creates a new, in-memory PDF object containing deskewed versions of the pages
    in this collection.

    This method delegates the actual processing to the parent PDF object's
    `deskew` method.

    Important: The returned PDF is image-based. Any existing text, OCR results,
    annotations, or other elements from the original pages will *not* be carried over.

    Args:
        resolution: DPI resolution for rendering the output deskewed pages.
        detection_resolution: DPI resolution used for skew detection if angles are not
                              already cached on the page objects.
        force_overwrite: If False (default), raises a ValueError if any target page
                         already contains processed elements (text, OCR, regions) to
                         prevent accidental data loss. Set to True to proceed anyway.
        **deskew_kwargs: Additional keyword arguments passed to `deskew.determine_skew`
                         during automatic detection (e.g., `max_angle`, `num_peaks`).

    Returns:
        A new PDF object representing the deskewed document.

    Raises:
        ImportError: If 'deskew' or 'img2pdf' libraries are not installed (raised by PDF.deskew).
        ValueError: If `force_overwrite` is False and target pages contain elements (raised by PDF.deskew),
                    or if the collection is empty.
        RuntimeError: If pages lack a parent PDF reference, or the parent PDF lacks the `deskew` method.
    """
    if not self.pages:
        logger.warning("Cannot deskew an empty PageCollection.")
        raise ValueError("Cannot deskew an empty PageCollection.")

    # Assume all pages share the same parent PDF object, so the first page's
    # parent is used. getattr keeps a page without `_parent` on the documented
    # RuntimeError path instead of leaking an AttributeError.
    parent_pdf = getattr(self.pages[0], "_parent", None)

    if not parent_pdf or not callable(getattr(parent_pdf, "deskew", None)):
        raise RuntimeError(
            "Parent PDF reference not found or parent PDF lacks the required 'deskew' method."
        )

    # Get the 0-based indices of the pages in this collection
    page_indices = self._get_page_indices()
    logger.info(
        f"PageCollection: Delegating deskew to parent PDF for page indices: {page_indices}"
    )

    # Delegate the call to the parent PDF object for the relevant pages
    return parent_pdf.deskew(
        pages=page_indices,
        resolution=resolution,
        detection_resolution=detection_resolution,
        force_overwrite=force_overwrite,
        **deskew_kwargs,
    )
|
2917
|
-
|
2918
|
-
# --- End Deskew Method --- #
|
2919
|
-
|
2920
|
-
def to_image(
    self,
    page_width: Optional[int] = None,
    cols: Optional[int] = 4,
    rows: Optional[int] = None,
    max_pages: Optional[int] = None,
    spacing: int = 10,
    add_labels: bool = True,
    show_category: bool = False,
) -> Optional["Image.Image"]:
    """
    Generate a grid of page images for this collection.

    Args:
        page_width: Width in pixels for rendering individual pages. When None,
            ``natural_pdf.options.image.width`` is used, falling back to 300.
        cols: Number of columns in grid (default: 4)
        rows: Number of rows in grid (calculated automatically if None)
        max_pages: Maximum number of pages to include (default: all)
        spacing: Spacing between page thumbnails in pixels
        add_labels: Whether to add page number labels
        show_category: Whether to add category and confidence labels (if available)

    Returns:
        PIL Image of the page grid, or None if Pillow is missing, the collection
        is empty, or no page could be rendered.
    """
    # Determine default page width from global options if not explicitly provided
    if page_width is None:
        try:
            import natural_pdf

            page_width = natural_pdf.options.image.width or 300
        except Exception:
            # Fallback if natural_pdf import fails in some edge context
            page_width = 300

    try:
        from PIL import Image, ImageDraw, ImageFont
    except ImportError:
        logger.error(
            "Pillow library not found, required for to_image(). Install with 'pip install Pillow'"
        )
        return None

    if not self.pages:
        logger.warning("Cannot generate image for empty PageCollection")
        return None

    def _result_field(result, name):
        """Read *name* from a classification result stored as a dict or as an object."""
        if isinstance(result, dict):
            return result.get(name)
        return getattr(result, name, None)

    # Limit pages if max_pages is specified
    pages_to_render = self.pages[:max_pages] if max_pages else self.pages

    # Load font once outside the loop
    font = None
    if add_labels:
        try:
            font = ImageFont.truetype("DejaVuSans.ttf", 16)
        except IOError:
            try:
                try:
                    font = ImageFont.load_default(16)
                except TypeError:
                    # Pillow releases whose load_default() takes no size
                    # argument: fall back to the fixed-size default font.
                    font = ImageFont.load_default()
            except IOError:
                logger.warning("Default font not found. Labels cannot be added.")
                add_labels = False  # Disable if no font

    # Render individual page images; pages that fail to render are skipped
    page_images = []
    for page in pages_to_render:
        try:
            img = page.to_image(width=page_width, include_highlights=True)
            if img is None:
                logger.warning(f"Failed to generate image for page {page.number}. Skipping.")
                continue
        except Exception as img_err:
            logger.error(
                f"Error generating image for page {page.number}: {img_err}", exc_info=True
            )
            continue

        if add_labels and font:
            draw = ImageDraw.Draw(img)
            pdf_name = (
                Path(page.pdf.path).stem
                if hasattr(page, "pdf") and page.pdf and hasattr(page.pdf, "path")
                else ""
            )
            label_text = f"p{page.number}"
            if pdf_name:
                label_text += f" - {pdf_name}"

            # Add category if requested and available
            if show_category:
                category = None
                confidence = None
                if (
                    hasattr(page, "analyses")
                    and page.analyses
                    and "classification" in page.analyses
                ):
                    result = page.analyses["classification"]
                    category = _result_field(result, "label")
                    confidence = _result_field(result, "score")

                if category is not None and confidence is not None:
                    try:
                        category_str = f"{category} ({confidence:.2f})"
                        # Real line break so the category sits under the page label
                        label_text += f"\n{category_str}"
                    except (TypeError, ValueError):
                        pass  # Ignore formatting errors

            # Calculate bounding box for multi-line text and draw background/text
            try:
                bbox = draw.textbbox((5, 5), label_text, font=font, spacing=2)
                bg_rect = (
                    max(0, bbox[0] - 2),
                    max(0, bbox[1] - 2),
                    min(img.width, bbox[2] + 2),
                    min(img.height, bbox[3] + 2),
                )

                # Draw semi-transparent white background behind the label
                overlay = Image.new("RGBA", img.size, (255, 255, 255, 0))
                draw_overlay = ImageDraw.Draw(overlay)
                draw_overlay.rectangle(bg_rect, fill=(255, 255, 255, 180))
                img = Image.alpha_composite(img.convert("RGBA"), overlay).convert("RGB")
                draw = ImageDraw.Draw(img)  # img was replaced, so recreate the draw object

                draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
            except AttributeError:  # Fallback for older Pillow without textbbox
                # Fixed-size background; may not match the text extent exactly
                draw.rectangle((2, 2, 150, 40), fill=(255, 255, 255, 180))
                draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
            except Exception as draw_err:
                logger.error(
                    f"Error drawing label on page {page.number}: {draw_err}", exc_info=True
                )

        page_images.append(img)

    if not page_images:
        logger.warning("No page images were successfully rendered for the grid.")
        return None

    # Calculate grid dimensions if not provided
    num_images = len(page_images)
    if not rows and not cols:
        cols = min(4, int(num_images**0.5) + 1)
        rows = (num_images + cols - 1) // cols
    elif rows and not cols:
        cols = (num_images + rows - 1) // rows
    elif cols and not rows:
        rows = (num_images + cols - 1) // cols
    cols = max(1, cols if cols else 1)  # Ensure at least 1
    rows = max(1, rows if rows else 1)

    # Get maximum dimensions for consistent grid cells
    max_width = max(img.width for img in page_images) if page_images else 1
    max_height = max(img.height for img in page_images) if page_images else 1

    # Create grid image on a light gray background
    grid_width = cols * max_width + (cols + 1) * spacing
    grid_height = rows * max_height + (rows + 1) * spacing
    grid_img = Image.new("RGB", (grid_width, grid_height), (220, 220, 220))

    # Place images in grid, row by row
    for i, img in enumerate(page_images):
        if i >= rows * cols:  # Images beyond the grid capacity are dropped
            break

        row = i // cols
        col = i % cols

        x = col * max_width + (col + 1) * spacing
        y = row * max_height + (row + 1) * spacing

        grid_img.paste(img, (x, y))

    return grid_img
|
3121
|
-
|
3122
|
-
def save_pdf(
    self,
    output_path: Union[str, Path],
    ocr: bool = False,
    original: bool = False,
    dpi: int = 300,
):
    """
    Saves the pages in this collection to a new PDF file.

    Choose one saving mode:
    - `ocr=True`: Creates a new, image-based PDF using OCR results. This
      makes the text generated during the natural-pdf session searchable,
      but loses original vector content. Requires 'ocr-export' extras.
    - `original=True`: Extracts the original pages from the source PDF,
      preserving all vector content, fonts, and annotations. OCR results
      from the natural-pdf session are NOT included. Requires 'ocr-export' extras.

    Args:
        output_path: Path to save the new PDF file.
        ocr: If True, save as a searchable, image-based PDF using OCR data.
        original: If True, save the original, vector-based pages.
        dpi: Resolution (dots per inch) used only when ocr=True for
             rendering page images and aligning the text layer.

    Raises:
        ValueError: If the collection is empty, if neither or both 'ocr'
                    and 'original' are True, or if 'original=True' and
                    pages originate from different PDFs.
        ImportError: If required libraries ('pikepdf', 'Pillow')
                     are not installed for the chosen mode.
        RuntimeError: If an unexpected error occurs during saving.
    """
    if not self.pages:
        raise ValueError("Cannot save an empty PageCollection.")

    if not (ocr ^ original):  # XOR: exactly one must be true
        raise ValueError("Exactly one of 'ocr' or 'original' must be True.")

    output_path_obj = Path(output_path)
    output_path_str = str(output_path_obj)

    if ocr:
        if create_searchable_pdf is None:
            raise ImportError(
                "Saving with ocr=True requires 'pikepdf' and 'Pillow'. "
                'Install with: pip install "natural-pdf[ocr-export]"'
            )

        # Check for non-OCR vector elements (provide a warning)
        has_vector_elements = False
        for page in self.pages:
            # Simplified check for common vector types or non-OCR chars/words
            if (
                hasattr(page, "rects")
                and page.rects
                or hasattr(page, "lines")
                and page.lines
                or hasattr(page, "curves")
                and page.curves
                or (
                    hasattr(page, "chars")
                    and any(getattr(el, "source", None) != "ocr" for el in page.chars)
                )
                or (
                    hasattr(page, "words")
                    and any(getattr(el, "source", None) != "ocr" for el in page.words)
                )
            ):
                has_vector_elements = True
                break
        if has_vector_elements:
            logger.warning(
                "Warning: Saving with ocr=True creates an image-based PDF. "
                "Original vector elements (rects, lines, non-OCR text/chars) "
                "on selected pages will not be preserved in the output file."
            )

        logger.info(f"Saving searchable PDF (OCR text layer) to: {output_path_str}")
        try:
            # Pass `self` (the PageCollection instance) as the source
            create_searchable_pdf(self, output_path_str, dpi=dpi)
        except Exception as e:
            logger.error(f"Failed to create searchable PDF: {e}", exc_info=True)
            # Re-raise as RuntimeError so callers see one failure type for this mode
            raise RuntimeError(f"Failed to create searchable PDF: {e}") from e

    elif original:
        if create_original_pdf is None:
            raise ImportError(
                "Saving with original=True requires 'pikepdf'. "
                'Install with: pip install "natural-pdf[ocr-export]"'
            )

        # Check for OCR elements (provide a warning)
        has_ocr_elements = False
        for page in self.pages:
            # find_all returns a collection; a non-empty one is truthy
            if hasattr(page, "find_all"):
                ocr_text_elements = page.find_all("text[source=ocr]")
                if ocr_text_elements:
                    has_ocr_elements = True
                    break
            elif hasattr(page, "words"):  # Fallback when find_all isn't present
                if any(getattr(el, "source", None) == "ocr" for el in page.words):
                    has_ocr_elements = True
                    break

        if has_ocr_elements:
            logger.warning(
                "Warning: Saving with original=True preserves original page content. "
                "OCR text generated in this session will not be included in the saved file."
            )

        logger.info(f"Saving original pages PDF to: {output_path_str}")
        # Pass `self` (the PageCollection instance) as the source. Exceptions from
        # the exporter propagate unchanged, keeping their original type.
        create_original_pdf(self, output_path_str)
|
3252
|
-
|
3253
|
-
def to_flow(
    self,
    arrangement: Literal["vertical", "horizontal"] = "vertical",
    alignment: Literal["start", "center", "end", "top", "left", "bottom", "right"] = "start",
    segment_gap: float = 0.0,
) -> "Flow":
    """
    Wrap the pages of this PageCollection in a Flow.

    A Flow lets several pages be handled as one continuous logical
    document, which helps with tables that continue over page breaks,
    articles that span columns, and any other content whose reading order
    crosses page boundaries.

    Args:
        arrangement: Main direction in which the pages are laid out.
            'vertical' places pages one below another (the usual choice);
            'horizontal' places them side by side, left to right.
        alignment: How pages of differing sizes line up on the cross axis.
            With a vertical arrangement use 'left'/'start', 'center' or
            'right'/'end'; with a horizontal arrangement use
            'top'/'start', 'center' or 'bottom'/'end'.
        segment_gap: Virtual spacing inserted between consecutive pages,
            expressed in PDF points (default: 0.0).

    Returns:
        A Flow object whose operations run over every page in order.

    Example:
        Extracting a table that spans several pages:
        ```python
        pdf = npdf.PDF("multi_page_report.pdf")

        # Build a flow over pages 2-4, which hold one table
        table_flow = pdf.pages[1:4].to_flow()

        # Pull the table out as though it were unbroken
        table_data = table_flow.extract_table()
        df = table_data.df
        ```

        Searching for elements across pages:
        ```python
        # Collect every header found on a range of pages
        headers = pdf.pages[5:10].to_flow().find_all('text[size>12]:bold')

        # Run layout analysis over the whole document
        regions = pdf.pages.to_flow().analyze_layout(engine='yolo')
        ```
    """
    # Imported at call time, as in the original implementation.
    from natural_pdf.flows.flow import Flow

    # The collection itself is passed as ``segments``; the Flow constructor
    # accepts a PageCollection directly.
    layout_options = {
        "arrangement": arrangement,
        "alignment": alignment,
        "segment_gap": segment_gap,
    }
    return Flow(segments=self, **layout_options)
|
3307
|
-
|
3308
|
-
# Alias .to_image() to .show() for convenience
|
3309
|
-
def show(
    self,
    *args,
    **kwargs,
) -> Optional["Image.Image"]:
    """Render the pages by delegating to ``to_image``.

    Provided so that this collection exposes the same ``show()`` entry
    point that ElementCollection already has. Every positional and keyword
    argument is passed through unchanged, and the value produced by
    :py:meth:`to_image` is returned as-is.
    """
    rendered = self.to_image(*args, **kwargs)
    return rendered
|