natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +11 -6
- natural_pdf/analyzers/__init__.py +6 -1
- natural_pdf/analyzers/guides.py +354 -258
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +18 -4
- natural_pdf/analyzers/layout/paddle.py +11 -0
- natural_pdf/analyzers/layout/surya.py +2 -3
- natural_pdf/analyzers/shape_detection_mixin.py +25 -34
- natural_pdf/analyzers/text_structure.py +2 -2
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/collections/mixins.py +3 -2
- natural_pdf/core/highlighting_service.py +743 -32
- natural_pdf/core/page.py +252 -399
- natural_pdf/core/page_collection.py +1249 -0
- natural_pdf/core/pdf.py +231 -89
- natural_pdf/{collections → core}/pdf_collection.py +18 -11
- natural_pdf/core/render_spec.py +335 -0
- natural_pdf/describe/base.py +1 -1
- natural_pdf/elements/__init__.py +1 -0
- natural_pdf/elements/base.py +108 -83
- natural_pdf/elements/{collections.py → element_collection.py} +575 -1372
- natural_pdf/elements/line.py +0 -1
- natural_pdf/elements/rect.py +0 -1
- natural_pdf/elements/region.py +405 -280
- natural_pdf/elements/text.py +9 -7
- natural_pdf/exporters/base.py +2 -2
- natural_pdf/exporters/original_pdf.py +1 -1
- natural_pdf/exporters/paddleocr.py +2 -4
- natural_pdf/exporters/searchable_pdf.py +3 -2
- natural_pdf/extraction/mixin.py +1 -3
- natural_pdf/flows/collections.py +1 -69
- natural_pdf/flows/element.py +25 -0
- natural_pdf/flows/flow.py +1658 -19
- natural_pdf/flows/region.py +757 -263
- natural_pdf/ocr/ocr_options.py +0 -2
- natural_pdf/ocr/utils.py +2 -1
- natural_pdf/qa/document_qa.py +21 -5
- natural_pdf/search/search_service_protocol.py +1 -1
- natural_pdf/selectors/parser.py +35 -2
- natural_pdf/tables/result.py +35 -1
- natural_pdf/text_mixin.py +101 -0
- natural_pdf/utils/debug.py +2 -1
- natural_pdf/utils/highlighting.py +1 -0
- natural_pdf/utils/layout.py +2 -2
- natural_pdf/utils/packaging.py +4 -3
- natural_pdf/utils/text_extraction.py +15 -12
- natural_pdf/utils/visualization.py +385 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -52
- optimization/memory_comparison.py +1 -1
- optimization/pdf_analyzer.py +2 -2
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
natural_pdf/core/page_collection.py (new file)
@@ -0,0 +1,1249 @@
import hashlib
import logging
from collections.abc import MutableSequence, Sequence
from pathlib import Path
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
    Generic,
    Iterable,
    Iterator,
    List,
    Literal,
    Optional,
    Sequence,
    Tuple,
    Type,
    TypeVar,
    Union,
    overload,
)

from pdfplumber.utils.geometry import objects_to_bbox

# New Imports
from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
from PIL import Image, ImageDraw, ImageFont
from tqdm.auto import tqdm

from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
from natural_pdf.classification.manager import ClassificationManager
from natural_pdf.classification.mixin import ClassificationMixin
from natural_pdf.collections.mixins import ApplyMixin, DirectionalCollectionMixin
from natural_pdf.core.pdf import PDF
from natural_pdf.core.render_spec import RenderSpec, Visualizable
from natural_pdf.describe.mixin import DescribeMixin, InspectMixin
from natural_pdf.elements.base import Element
from natural_pdf.elements.element_collection import ElementCollection
from natural_pdf.elements.region import Region
from natural_pdf.elements.text import TextElement
from natural_pdf.export.mixin import ExportMixin
from natural_pdf.ocr import OCROptions
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
from natural_pdf.text_mixin import TextMixin

# Potentially lazy imports for optional dependencies needed in save_pdf
try:
    import pikepdf
except ImportError:
    pikepdf = None

try:
    from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
except ImportError:
    create_searchable_pdf = None

# ---> ADDED Import for the new exporter
try:
    from natural_pdf.exporters.original_pdf import create_original_pdf
except ImportError:
    create_original_pdf = None
# <--- END ADDED

logger = logging.getLogger(__name__)

if TYPE_CHECKING:
    from natural_pdf.core.page import Page
    from natural_pdf.core.pdf import PDF  # ---> ADDED PDF type hint
    from natural_pdf.elements.region import Region
    from natural_pdf.elements.text import TextElement  # Ensure TextElement is imported
    from natural_pdf.flows.flow import Flow

T = TypeVar("T")
P = TypeVar("P", bound="Page")

class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Visualizable):
    """
    Represents a collection of Page objects, often from a single PDF document.
    Provides methods for batch operations on these pages.
    """

    def __init__(self, pages: Union[List[P], Sequence[P]]):
        """
        Initialize a page collection.

        Args:
            pages: List or sequence of Page objects (can be lazy)
        """
        # Store the sequence as-is to preserve lazy behavior
        # Only convert to list if we need list-specific operations
        if hasattr(pages, "__iter__") and hasattr(pages, "__len__"):
            self.pages = pages
        else:
            # Fallback for non-sequence types
            self.pages = list(pages)

    def __len__(self) -> int:
        """Return the number of pages in the collection."""
        return len(self.pages)

    def __getitem__(self, idx) -> Union[P, "PageCollection[P]"]:
        """Support indexing and slicing."""
        if isinstance(idx, slice):
            return PageCollection(self.pages[idx])
        return self.pages[idx]

    def __iter__(self) -> Iterator[P]:
        """Support iteration."""
        return iter(self.pages)

    def __repr__(self) -> str:
        """Return a string representation showing the page count."""
        return f"<PageCollection(count={len(self)})>"

    def _get_items_for_apply(self) -> Iterator[P]:
        """
        Override ApplyMixin's _get_items_for_apply to preserve lazy behavior.

        Returns an iterator that yields pages on-demand rather than materializing
        all pages at once, maintaining the lazy loading behavior.
        """
        return iter(self.pages)

    def _get_page_indices(self) -> List[int]:
        """
        Get page indices without forcing materialization of pages.

        Returns:
            List of page indices for the pages in this collection.
        """
        # Handle different types of page sequences efficiently
        if hasattr(self.pages, "_indices"):
            # If it's a _LazyPageList (or slice), get indices directly
            return list(self.pages._indices)
        else:
            # Fallback: if pages are already materialized, get indices normally
            # This will force materialization but only if pages aren't lazy
            return [p.index for p in self.pages]

    def extract_text(
        self,
        keep_blank_chars: bool = True,
        apply_exclusions: bool = True,
        strip: Optional[bool] = None,
        **kwargs,
    ) -> str:
        """
        Extract text from all pages in the collection.

        Args:
            keep_blank_chars: Whether to keep blank characters (default: True)
            apply_exclusions: Whether to apply exclusion regions (default: True)
            strip: Whether to strip whitespace from the extracted text.
            **kwargs: Additional extraction parameters

        Returns:
            Combined text from all pages
        """
        texts = []
        for page in self.pages:
            text = page.extract_text(
                keep_blank_chars=keep_blank_chars,
                apply_exclusions=apply_exclusions,
                **kwargs,
            )
            texts.append(text)

        combined = "\n".join(texts)

        # Default strip behaviour: if the caller specifies `strip`, honour it;
        # otherwise respect the `layout` flag passed via kwargs.
        use_layout = kwargs.get("layout", False)
        strip_final = strip if strip is not None else (not use_layout)

        if strip_final:
            combined = "\n".join(line.rstrip() for line in combined.splitlines()).strip()

        return combined

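Since slicing yields a new `PageCollection` (see `__getitem__` above) and `extract_text` joins per-page text with newlines, collection-level extraction reads naturally. A minimal sketch; the file name is hypothetical:

```python
import natural_pdf as npdf

pdf = npdf.PDF("report.pdf")  # hypothetical input file

first_three = pdf.pages[:3]        # slicing yields a PageCollection
text = first_three.extract_text()  # per-page text joined with newlines
print(text[:200])
```
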
    def apply_ocr(
        self,
        engine: Optional[str] = None,
        # --- Common OCR Parameters (Direct Arguments) ---
        languages: Optional[List[str]] = None,
        min_confidence: Optional[float] = None,  # Min confidence threshold
        device: Optional[str] = None,
        resolution: Optional[int] = None,  # DPI for rendering
        apply_exclusions: bool = True,  # New parameter
        replace: bool = True,  # Whether to replace existing OCR elements
        # --- Engine-Specific Options ---
        options: Optional[Any] = None,  # e.g., EasyOCROptions(...)
    ) -> "PageCollection[P]":
        """
        Applies OCR to all pages within this collection using batch processing.

        This delegates the work to the parent PDF object's `apply_ocr` method.

        Args:
            engine: Name of the OCR engine (e.g., 'easyocr', 'paddleocr').
            languages: List of language codes (e.g., ['en', 'fr'], ['en', 'ch']).
                **Must be codes understood by the specific selected engine.**
                No mapping is performed.
            min_confidence: Minimum confidence threshold for detected text (0.0 to 1.0).
            device: Device to run OCR on (e.g., 'cpu', 'cuda', 'mps').
            resolution: DPI resolution to render page images before OCR (e.g., 150, 300).
            apply_exclusions: If True (default), render page images for OCR with
                              excluded areas masked (whited out). If False, OCR
                              the raw page images without masking exclusions.
            replace: If True (default), remove any existing OCR elements before
                     adding new ones. If False, add new OCR elements to existing ones.
            options: An engine-specific options object (e.g., EasyOCROptions) or dict.

        Returns:
            Self for method chaining.

        Raises:
            RuntimeError: If pages lack a parent PDF or parent lacks `apply_ocr`.
            (Propagates exceptions from PDF.apply_ocr)
        """
        if not self.pages:
            logger.warning("Cannot apply OCR to an empty PageCollection.")
            return self

        # Assume all pages share the same parent PDF object
        first_page = self.pages[0]
        if not hasattr(first_page, "_parent") or not first_page._parent:
            raise RuntimeError("Pages in this collection do not have a parent PDF reference.")

        parent_pdf = first_page._parent

        if not hasattr(parent_pdf, "apply_ocr") or not callable(parent_pdf.apply_ocr):
            raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")

        # Get the 0-based indices of the pages in this collection
        page_indices = self._get_page_indices()

        logger.info(f"Applying OCR via parent PDF to page indices: {page_indices} in collection.")

        # Delegate the batch call to the parent PDF object, passing direct args and apply_exclusions
        parent_pdf.apply_ocr(
            pages=page_indices,
            engine=engine,
            languages=languages,
            min_confidence=min_confidence,  # Pass the renamed parameter
            device=device,
            resolution=resolution,
            apply_exclusions=apply_exclusions,  # Pass down
            replace=replace,  # Pass the replace parameter
            options=options,
        )
        # The PDF method modifies the Page objects directly by adding elements.

        return self  # Return self for chaining

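Because `apply_ocr` delegates to the parent `PDF` and returns `self`, OCR can target just a subset of pages and chain into further calls. A sketch using engine and language values from the docstring (file name hypothetical):

```python
import natural_pdf as npdf

pdf = npdf.PDF("scanned.pdf")  # hypothetical scanned document

# OCR elements are added to the pages in place; the collection is returned.
text = (
    pdf.pages[10:20]
    .apply_ocr(engine="easyocr", languages=["en"], min_confidence=0.5, resolution=300)
    .extract_text()
)
```
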
    @overload
    def find(
        self,
        *,
        text: str,
        contains: str = "all",
        apply_exclusions: bool = True,
        regex: bool = False,
        case: bool = True,
        **kwargs,
    ) -> Optional[T]: ...

    @overload
    def find(
        self,
        selector: str,
        *,
        contains: str = "all",
        apply_exclusions: bool = True,
        regex: bool = False,
        case: bool = True,
        **kwargs,
    ) -> Optional[T]: ...

    def find(
        self,
        selector: Optional[str] = None,
        *,
        text: Optional[str] = None,
        contains: str = "all",
        apply_exclusions: bool = True,
        regex: bool = False,
        case: bool = True,
        **kwargs,
    ) -> Optional[T]:
        """
        Find the first element matching the selector OR text across all pages in the collection.

        Provide EITHER `selector` OR `text`, but not both.

        Args:
            selector: CSS-like selector string.
            text: Text content to search for (equivalent to 'text:contains(...)').
            contains: How to determine if elements are inside: 'all' (fully inside),
                      'any' (any overlap), or 'center' (center point inside).
                      (default: "all")
            apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
            **kwargs: Additional filter parameters.

        Returns:
            First matching element or None.
        """
        # Input validation happens within page.find
        for page in self.pages:
            element = page.find(
                selector=selector,
                text=text,
                contains=contains,
                apply_exclusions=apply_exclusions,
                regex=regex,
                case=case,
                **kwargs,
            )
            if element:
                return element
        return None

    @overload
    def find_all(
        self,
        *,
        text: str,
        contains: str = "all",
        apply_exclusions: bool = True,
        regex: bool = False,
        case: bool = True,
        **kwargs,
    ) -> "ElementCollection": ...

    @overload
    def find_all(
        self,
        selector: str,
        *,
        contains: str = "all",
        apply_exclusions: bool = True,
        regex: bool = False,
        case: bool = True,
        **kwargs,
    ) -> "ElementCollection": ...

    def find_all(
        self,
        selector: Optional[str] = None,
        *,
        text: Optional[str] = None,
        contains: str = "all",
        apply_exclusions: bool = True,
        regex: bool = False,
        case: bool = True,
        **kwargs,
    ) -> "ElementCollection":
        """
        Find all elements matching the selector OR text across all pages in the collection.

        Provide EITHER `selector` OR `text`, but not both.

        Args:
            selector: CSS-like selector string.
            text: Text content to search for (equivalent to 'text:contains(...)').
            contains: How to determine if elements are inside: 'all' (fully inside),
                      'any' (any overlap), or 'center' (center point inside).
                      (default: "all")
            apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
            **kwargs: Additional filter parameters.

        Returns:
            ElementCollection with matching elements from all pages.
        """
        all_elements = []
        # Input validation happens within page.find_all
        for page in self.pages:
            elements = page.find_all(
                selector=selector,
                text=text,
                contains=contains,
                apply_exclusions=apply_exclusions,
                regex=regex,
                case=case,
                **kwargs,
            )
            if elements:
                all_elements.extend(elements.elements)

        return ElementCollection(all_elements)

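`find` returns the first hit while walking pages in order, whereas `find_all` pools matches from every page into one `ElementCollection`. A sketch using selector forms that appear elsewhere in this diff (file name hypothetical):

```python
import natural_pdf as npdf

pages = npdf.PDF("report.pdf").pages[:5]  # hypothetical file

title = pages.find("text[size>14]:bold")      # first large bold run, or None
bold = pages.find_all("text:bold")            # every bold element on five pages
total = pages.find(text="Total", case=False)  # keyword form: text:contains(...)
```
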
    def update_text(
        self,
        transform: Callable[[Any], Optional[str]],
        selector: str = "text",
        max_workers: Optional[int] = None,
    ) -> "PageCollection[P]":
        """
        Applies corrections to text elements across all pages
        in this collection using a user-provided callback function, executed
        in parallel if `max_workers` is specified.

        This method delegates to the parent PDF's `update_text` method,
        targeting all pages within this collection.

        Args:
            transform: A function that accepts a single argument (an element
                       object) and returns `Optional[str]` (new text or None).
            selector: The attribute name to update. Default is 'text'.
            max_workers: The maximum number of worker threads to use for parallel
                         correction on each page. If None, defaults are used.

        Returns:
            Self for method chaining.

        Raises:
            RuntimeError: If the collection is empty, pages lack a parent PDF reference,
                          or the parent PDF lacks the `update_text` method.
        """
        if not self.pages:
            logger.warning("Cannot update text for an empty PageCollection.")
            # Return self even if empty to maintain chaining consistency
            return self

        # Assume all pages share the same parent PDF object
        parent_pdf = self.pages[0]._parent
        if (
            not parent_pdf
            or not hasattr(parent_pdf, "update_text")
            or not callable(parent_pdf.update_text)
        ):
            raise RuntimeError(
                "Parent PDF reference not found or parent PDF lacks the required 'update_text' method."
            )

        page_indices = self._get_page_indices()
        logger.info(
            f"PageCollection: Delegating text update to parent PDF for page indices: {page_indices} with max_workers={max_workers} and selector='{selector}'."
        )

        # Delegate the call to the parent PDF object for the relevant pages
        # Pass the max_workers parameter down
        parent_pdf.update_text(
            transform=transform,
            pages=page_indices,
            selector=selector,
            max_workers=max_workers,
        )

        return self

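The `transform` callback receives one element and returns either replacement text or `None` to leave it untouched, so a correction pass is just a small function. A minimal sketch; the substitution rule is purely illustrative:

```python
import natural_pdf as npdf

pdf = npdf.PDF("scanned.pdf")  # hypothetical file

def fix_common_confusion(element):
    """Return corrected text, or None to keep the element unchanged."""
    text = element.text or ""
    return text.replace("l0", "10") if "l0" in text else None

pdf.pages[:10].update_text(transform=fix_common_confusion, max_workers=4)
```
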
    def get_sections(
        self,
        start_elements=None,
        end_elements=None,
        new_section_on_page_break=False,
        include_boundaries="both",
    ) -> "ElementCollection[Region]":
        """
        Extract sections from a page collection based on start/end elements.

        Args:
            start_elements: Elements or selector string that mark the start of sections (optional)
            end_elements: Elements or selector string that mark the end of sections (optional)
            new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
            include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')

        Returns:
            An ElementCollection of Region objects representing the extracted sections

        Note:
            You can provide only start_elements, only end_elements, or both.
            - With only start_elements: sections go from each start to the next start (or end of page)
            - With only end_elements: sections go from beginning of document/page to each end
            - With both: sections go from each start to the corresponding end
        """
        # Find start and end elements across all pages
        if isinstance(start_elements, str):
            start_elements = self.find_all(start_elements).elements

        if isinstance(end_elements, str):
            end_elements = self.find_all(end_elements).elements

        # If no start elements and no end elements, return an empty collection
        # (matching the declared return type rather than a bare list)
        if not start_elements and not end_elements:
            return ElementCollection([])

        # If there are page break boundaries, we'll need to add them
        if new_section_on_page_break:
            # Appending boundary markers requires concrete lists on both sides
            if start_elements is None:
                start_elements = []
            if end_elements is None:
                end_elements = []

            # For each page boundary, create virtual "end" and "start" elements
            for i in range(len(self.pages) - 1):
                # Add a virtual "end" element at the bottom of the current page
                page = self.pages[i]

                # Create a region at the bottom of the page as an artificial end marker
                from natural_pdf.elements.region import Region

                bottom_region = Region(page, (0, page.height - 1, page.width, page.height))
                bottom_region.is_page_boundary = True  # Mark it as a special boundary
                end_elements.append(bottom_region)

                # Add a virtual "start" element at the top of the next page
                next_page = self.pages[i + 1]
                top_region = Region(next_page, (0, 0, next_page.width, 1))
                top_region.is_page_boundary = True  # Mark it as a special boundary
                start_elements.append(top_region)

        # Get all elements from all pages and sort them in document order
        all_elements = []
        for page in self.pages:
            elements = page.get_elements()
            all_elements.extend(elements)

        # Sort by page index, then vertical position, then horizontal position
        all_elements.sort(key=lambda e: (e.page.index, e.top, e.x0))

        # If we only have end_elements (no start_elements), create implicit start elements
        if not start_elements and end_elements:
            from natural_pdf.elements.region import Region

            start_elements = []

            # Add implicit start at the beginning of the first page
            first_page = self.pages[0]
            first_start = Region(first_page, (0, 0, first_page.width, 1))
            first_start.is_implicit_start = True
            start_elements.append(first_start)

            # For each end element (except the last), add an implicit start after it
            sorted_end_elements = sorted(end_elements, key=lambda e: (e.page.index, e.top, e.x0))
            for i, end_elem in enumerate(sorted_end_elements[:-1]):  # Exclude last end element
                # Create implicit start element right after this end element
                implicit_start = Region(
                    end_elem.page, (0, end_elem.bottom, end_elem.page.width, end_elem.bottom + 1)
                )
                implicit_start.is_implicit_start = True
                start_elements.append(implicit_start)

        # Mark section boundaries
        section_boundaries = []

        # Add start element boundaries
        for element in start_elements:
            if element in all_elements:
                idx = all_elements.index(element)
                section_boundaries.append(
                    {
                        "index": idx,
                        "element": element,
                        "type": "start",
                        "page_idx": element.page.index,
                    }
                )
            elif hasattr(element, "is_page_boundary") and element.is_page_boundary:
                # This is a virtual page boundary element
                section_boundaries.append(
                    {
                        "index": -1,  # Special index for page boundaries
                        "element": element,
                        "type": "start",
                        "page_idx": element.page.index,
                    }
                )
            elif hasattr(element, "is_implicit_start") and element.is_implicit_start:
                # This is an implicit start element
                section_boundaries.append(
                    {
                        "index": -2,  # Special index for implicit starts
                        "element": element,
                        "type": "start",
                        "page_idx": element.page.index,
                    }
                )

        # Add end element boundaries if provided
        if end_elements:
            for element in end_elements:
                if element in all_elements:
                    idx = all_elements.index(element)
                    section_boundaries.append(
                        {
                            "index": idx,
                            "element": element,
                            "type": "end",
                            "page_idx": element.page.index,
                        }
                    )
                elif hasattr(element, "is_page_boundary") and element.is_page_boundary:
                    # This is a virtual page boundary element
                    section_boundaries.append(
                        {
                            "index": -1,  # Special index for page boundaries
                            "element": element,
                            "type": "end",
                            "page_idx": element.page.index,
                        }
                    )

        # Sort boundaries by page index, then by actual document position
        def _sort_key(boundary):
            """Sort boundaries by (page_idx, vertical_top, priority)."""
            page_idx = boundary["page_idx"]
            element = boundary["element"]

            # Vertical position on the page
            y_pos = getattr(element, "top", 0.0)

            # Ensure starts come before ends at the same coordinate
            priority = 0 if boundary["type"] == "start" else 1

            return (page_idx, y_pos, priority)

        section_boundaries.sort(key=_sort_key)

        # Generate sections
        sections = []

        # --- Helper: build a FlowRegion spanning multiple pages ---
        def _build_flow_region(start_el, end_el):
            """Return a FlowRegion that covers from *start_el* to *end_el* (inclusive).
            If *end_el* is None, the region continues to the bottom of the last
            page in this PageCollection."""
            # Local imports to avoid top-level cycles
            from natural_pdf.elements.region import Region
            from natural_pdf.flows.element import FlowElement
            from natural_pdf.flows.flow import Flow
            from natural_pdf.flows.region import FlowRegion

            start_pg = start_el.page
            end_pg = end_el.page if end_el is not None else self.pages[-1]

            parts: list[Region] = []

            # Use the actual top of the start element (for implicit starts this is
            # the bottom of the previous end element) instead of forcing to 0.
            start_top = start_el.top

            # Slice of first page beginning at *start_top*
            parts.append(Region(start_pg, (0, start_top, start_pg.width, start_pg.height)))

            # Full middle pages
            for pg_idx in range(start_pg.index + 1, end_pg.index):
                mid_pg = self.pages[pg_idx]
                parts.append(Region(mid_pg, (0, 0, mid_pg.width, mid_pg.height)))

            # Slice of last page (if distinct)
            if end_pg is not start_pg:
                bottom = end_el.bottom if end_el is not None else end_pg.height
                parts.append(Region(end_pg, (0, 0, end_pg.width, bottom)))

            flow = Flow(segments=parts, arrangement="vertical")
            src_fe = FlowElement(physical_object=start_el, flow=flow)
            return FlowRegion(
                flow=flow,
                constituent_regions=parts,
                source_flow_element=src_fe,
                boundary_element_found=end_el,
            )

        # ------------------------------------------------------------------

        current_start = None

        for i, boundary in enumerate(section_boundaries):
            # If it's a start boundary and we don't have a current start
            if boundary["type"] == "start" and current_start is None:
                current_start = boundary

            # If it's an end boundary and we have a current start
            elif boundary["type"] == "end" and current_start is not None:
                # Create a section from current_start to this boundary
                start_element = current_start["element"]
                end_element = boundary["element"]

                # If both elements are on the same page, use the page's get_section_between
                if start_element.page == end_element.page:
                    # For implicit start elements, create a region from the top of the page
                    if hasattr(start_element, "is_implicit_start"):
                        from natural_pdf.elements.region import Region

                        section = Region(
                            start_element.page,
                            (0, start_element.top, start_element.page.width, end_element.bottom),
                        )
                        section.start_element = start_element
                        section.boundary_element_found = end_element
                    else:
                        section = start_element.page.get_section_between(
                            start_element, end_element, include_boundaries
                        )
                    sections.append(section)
                else:
                    # Create FlowRegion spanning pages
                    flow_region = _build_flow_region(start_element, end_element)
                    sections.append(flow_region)

                current_start = None

            # If it's another start boundary and we have a current start (for splitting by starts only)
            elif boundary["type"] == "start" and current_start is not None and not end_elements:
                # Create a section from current_start to just before this boundary
                start_element = current_start["element"]

                # Find the last element before this boundary on the same page
                if start_element.page == boundary["element"].page:
                    # Find elements on this page
                    page_elements = [e for e in all_elements if e.page == start_element.page]
                    # Sort by position
                    page_elements.sort(key=lambda e: (e.top, e.x0))

                    # Find the last element before the boundary
                    end_idx = (
                        page_elements.index(boundary["element"]) - 1
                        if boundary["element"] in page_elements
                        else -1
                    )
                    end_element = page_elements[end_idx] if end_idx >= 0 else None

                    # Create the section
                    section = start_element.page.get_section_between(
                        start_element, end_element, include_boundaries
                    )
                    sections.append(section)
                else:
                    # Cross-page section - create from current_start to the end of its page
                    from natural_pdf.elements.region import Region

                    start_page = start_element.page

                    # Handle implicit start elements
                    start_top = start_element.top
                    region = Region(start_page, (0, start_top, start_page.width, start_page.height))
                    region.start_element = start_element
                    sections.append(region)

                current_start = boundary

        # Handle the last section if we have a current start
        if current_start is not None:
            start_element = current_start["element"]
            start_page = start_element.page

            if end_elements:
                # With end_elements, we need an explicit end - use the last element
                # on the last page of the collection
                last_page = self.pages[-1]
                last_page_elements = [e for e in all_elements if e.page == last_page]
                last_page_elements.sort(key=lambda e: (e.top, e.x0))
                end_element = last_page_elements[-1] if last_page_elements else None

                # Create FlowRegion spanning multiple pages using helper
                flow_region = _build_flow_region(start_element, end_element)
                sections.append(flow_region)
            else:
                # With start_elements only, create a section to the end of the current page
                from natural_pdf.elements.region import Region

                # Handle implicit start elements
                start_top = start_element.top
                region = Region(start_page, (0, start_top, start_page.width, start_page.height))
                region.start_element = start_element
                sections.append(region)

        return ElementCollection(sections)

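Sectioning is selector-driven: sections that stay on one page come back as `Region`s, while sections that straddle a page break come back as the `FlowRegion`s built by the helper above, and both can be iterated uniformly. A sketch, assuming headings are large bold text:

```python
import natural_pdf as npdf

pdf = npdf.PDF("report.pdf")  # hypothetical file

sections = pdf.pages.get_sections(
    start_elements="text[size>14]:bold",  # each heading opens a section
    include_boundaries="start",
)
for section in sections:
    print(type(section).__name__, section.extract_text()[:60])
```
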
    def _gather_analysis_data(
        self,
        analysis_keys: List[str],
        include_content: bool,
        include_images: bool,
        image_dir: Optional[Path],
        image_format: str,
        image_resolution: int,
    ) -> List[Dict[str, Any]]:
        """
        Gather analysis data from all pages in the collection.

        Args:
            analysis_keys: Keys in the analyses dictionary to export
            include_content: Whether to include extracted text
            include_images: Whether to export images
            image_dir: Directory to save images
            image_format: Format to save images
            image_resolution: Resolution for exported images

        Returns:
            List of dictionaries containing analysis data
        """
        if not self.pages:
            logger.warning("No pages found in collection")
            return []

        all_data = []

        for page in self.pages:
            # Basic page information
            page_data = {
                "page_number": page.number,
                "page_index": page.index,
                "width": page.width,
                "height": page.height,
            }

            # Add PDF information if available
            if hasattr(page, "pdf") and page.pdf:
                page_data["pdf_path"] = page.pdf.path
                page_data["pdf_filename"] = Path(page.pdf.path).name

            # Include extracted text if requested
            if include_content:
                try:
                    page_data["content"] = page.extract_text(preserve_whitespace=True)
                except Exception as e:
                    logger.error(f"Error extracting text from page {page.number}: {e}")
                    page_data["content"] = ""

            # Save image if requested
            if include_images:
                try:
                    # Create image filename
                    pdf_name = "unknown"
                    if hasattr(page, "pdf") and page.pdf:
                        pdf_name = Path(page.pdf.path).stem

                    image_filename = f"{pdf_name}_page_{page.number}.{image_format}"
                    image_path = image_dir / image_filename

                    # Save image
                    page.save_image(
                        str(image_path), resolution=image_resolution, include_highlights=True
                    )

                    # Add relative path to data
                    page_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
                except Exception as e:
                    logger.error(f"Error saving image for page {page.number}: {e}")
                    page_data["image_path"] = None

            # Add analyses data
            if hasattr(page, "analyses") and page.analyses:
                for key in analysis_keys:
                    if key not in page.analyses:
                        raise KeyError(f"Analysis key '{key}' not found in page {page.number}")

                    # Get the analysis result
                    analysis_result = page.analyses[key]

                    # If the result has a to_dict method, use it
                    if hasattr(analysis_result, "to_dict"):
                        analysis_data = analysis_result.to_dict()
                    else:
                        # Otherwise, use the result directly if it's dict-like
                        try:
                            analysis_data = dict(analysis_result)
                        except (TypeError, ValueError):
                            # Last resort: convert to string
                            analysis_data = {"raw_result": str(analysis_result)}

                    # Add analysis data to page data with the key as prefix
                    for k, v in analysis_data.items():
                        page_data[f"{key}.{k}"] = v

            all_data.append(page_data)

        return all_data

    # --- Deskew Method --- #

    def deskew(
        self,
        resolution: int = 300,
        detection_resolution: int = 72,
        force_overwrite: bool = False,
        **deskew_kwargs,
    ) -> "PDF":  # Changed return type
        """
        Creates a new, in-memory PDF object containing deskewed versions of the pages
        in this collection.

        This method delegates the actual processing to the parent PDF object's
        `deskew` method.

        Important: The returned PDF is image-based. Any existing text, OCR results,
        annotations, or other elements from the original pages will *not* be carried over.

        Args:
            resolution: DPI resolution for rendering the output deskewed pages.
            detection_resolution: DPI resolution used for skew detection if angles are not
                                  already cached on the page objects.
            force_overwrite: If False (default), raises a ValueError if any target page
                             already contains processed elements (text, OCR, regions) to
                             prevent accidental data loss. Set to True to proceed anyway.
            **deskew_kwargs: Additional keyword arguments passed to `deskew.determine_skew`
                             during automatic detection (e.g., `max_angle`, `num_peaks`).

        Returns:
            A new PDF object representing the deskewed document.

        Raises:
            ImportError: If 'deskew' or 'img2pdf' libraries are not installed (raised by PDF.deskew).
            ValueError: If `force_overwrite` is False and target pages contain elements (raised by PDF.deskew),
                        or if the collection is empty.
            RuntimeError: If pages lack a parent PDF reference, or the parent PDF lacks the `deskew` method.
        """
        if not self.pages:
            logger.warning("Cannot deskew an empty PageCollection.")
            raise ValueError("Cannot deskew an empty PageCollection.")

        # Assume all pages share the same parent PDF object
        # Need to hint the type of _parent for type checkers
        if TYPE_CHECKING:
            parent_pdf: "natural_pdf.core.pdf.PDF" = self.pages[0]._parent
        else:
            parent_pdf = self.pages[0]._parent

        if not parent_pdf or not hasattr(parent_pdf, "deskew") or not callable(parent_pdf.deskew):
            raise RuntimeError(
                "Parent PDF reference not found or parent PDF lacks the required 'deskew' method."
            )

        # Get the 0-based indices of the pages in this collection
        page_indices = self._get_page_indices()
        logger.info(
            f"PageCollection: Delegating deskew to parent PDF for page indices: {page_indices}"
        )

        # Delegate the call to the parent PDF object for the relevant pages
        # Pass all relevant arguments through (no output_path anymore)
        return parent_pdf.deskew(
            pages=page_indices,
            resolution=resolution,
            detection_resolution=detection_resolution,
            force_overwrite=force_overwrite,
            **deskew_kwargs,
        )

    # --- End Deskew Method --- #

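Per the docstring, `deskew` returns a fresh, image-only `PDF` rather than mutating the collection, so the natural pattern is to deskew and then rebuild a text layer on the returned object. A sketch under those assumptions (file name hypothetical; requires the `deskew` and `img2pdf` libraries noted above):

```python
import natural_pdf as npdf

pdf = npdf.PDF("tilted_scan.pdf")  # hypothetical file

straightened = pdf.pages.deskew(resolution=300)  # new in-memory, image-based PDF
straightened.apply_ocr(engine="easyocr", languages=["en"])  # text must be re-OCR'd
```
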
    def _get_render_specs(
        self,
        mode: Literal["show", "render"] = "show",
        color: Optional[Union[str, Tuple[int, int, int]]] = None,
        highlights: Optional[List[Dict[str, Any]]] = None,
        crop: Union[bool, Literal["content"]] = False,
        crop_bbox: Optional[Tuple[float, float, float, float]] = None,
        **kwargs,
    ) -> List[RenderSpec]:
        """Get render specifications for this page collection.

        For page collections, we return specs for all pages that will be
        rendered into a grid layout.

        Args:
            mode: Rendering mode - 'show' includes highlights, 'render' is clean
            color: Color for highlighting pages in show mode
            highlights: Additional highlight groups to show
            crop: Whether to crop pages
            crop_bbox: Explicit crop bounds
            **kwargs: Additional parameters

        Returns:
            List of RenderSpec objects, one per page
        """
        specs = []

        # Get max pages from kwargs if specified
        max_pages = kwargs.get("max_pages")
        pages_to_render = self.pages[:max_pages] if max_pages else self.pages

        for page in pages_to_render:
            if hasattr(page, "_get_render_specs"):
                # Page has the new unified rendering
                page_specs = page._get_render_specs(
                    mode=mode,
                    color=color,
                    highlights=highlights,
                    crop=crop,
                    crop_bbox=crop_bbox,
                    **kwargs,
                )
                specs.extend(page_specs)
            else:
                # Fallback for pages without unified rendering
                spec = RenderSpec(page=page)
                if crop_bbox:
                    spec.crop_bbox = crop_bbox
                specs.append(spec)

        return specs

    def save_pdf(
        self,
        output_path: Union[str, Path],
        ocr: bool = False,
        original: bool = False,
        dpi: int = 300,
    ):
        """
        Saves the pages in this collection to a new PDF file.

        Choose one saving mode:
        - `ocr=True`: Creates a new, image-based PDF using OCR results. This
          makes the text generated during the natural-pdf session searchable,
          but loses original vector content. Requires 'ocr-export' extras.
        - `original=True`: Extracts the original pages from the source PDF,
          preserving all vector content, fonts, and annotations. OCR results
          from the natural-pdf session are NOT included. Requires 'ocr-export' extras.

        Args:
            output_path: Path to save the new PDF file.
            ocr: If True, save as a searchable, image-based PDF using OCR data.
            original: If True, save the original, vector-based pages.
            dpi: Resolution (dots per inch) used only when ocr=True for
                 rendering page images and aligning the text layer.

        Raises:
            ValueError: If the collection is empty, if neither or both 'ocr'
                        and 'original' are True, or if 'original=True' and
                        pages originate from different PDFs.
            ImportError: If required libraries ('pikepdf', 'Pillow')
                         are not installed for the chosen mode.
            RuntimeError: If an unexpected error occurs during saving.
        """
        if not self.pages:
            raise ValueError("Cannot save an empty PageCollection.")

        if not (ocr ^ original):  # XOR: exactly one must be true
            raise ValueError("Exactly one of 'ocr' or 'original' must be True.")

        output_path_obj = Path(output_path)
        output_path_str = str(output_path_obj)

        if ocr:
            if create_searchable_pdf is None:
                raise ImportError(
                    "Saving with ocr=True requires 'pikepdf' and 'Pillow'. "
                    'Install with: pip install "natural-pdf[ocr-export]"'
                )

            # Check for non-OCR vector elements (provide a warning)
            has_vector_elements = False
            for page in self.pages:
                # Simplified check for common vector types or non-OCR chars/words
                if (
                    hasattr(page, "rects")
                    and page.rects
                    or hasattr(page, "lines")
                    and page.lines
                    or hasattr(page, "curves")
                    and page.curves
                    or (
                        hasattr(page, "chars")
                        and any(getattr(el, "source", None) != "ocr" for el in page.chars)
                    )
                    or (
                        hasattr(page, "words")
                        and any(getattr(el, "source", None) != "ocr" for el in page.words)
                    )
                ):
                    has_vector_elements = True
                    break
            if has_vector_elements:
                logger.warning(
                    "Warning: Saving with ocr=True creates an image-based PDF. "
                    "Original vector elements (rects, lines, non-OCR text/chars) "
                    "on selected pages will not be preserved in the output file."
                )

            logger.info(f"Saving searchable PDF (OCR text layer) to: {output_path_str}")
            try:
                # Delegate to the searchable PDF exporter function
                # Pass `self` (the PageCollection instance) as the source
                create_searchable_pdf(self, output_path_str, dpi=dpi)
                # Success log is now inside create_searchable_pdf if needed
                # logger.info(f"Successfully saved searchable PDF to: {output_path_str}")
            except Exception as e:
                logger.error(f"Failed to create searchable PDF: {e}", exc_info=True)
                # Re-raise as RuntimeError for consistency, potentially handled in exporter too
                raise RuntimeError(f"Failed to create searchable PDF: {e}") from e

        elif original:
            # ---> MODIFIED: Call the new exporter
            if create_original_pdf is None:
                raise ImportError(
                    "Saving with original=True requires 'pikepdf'. "
                    'Install with: pip install "natural-pdf[ocr-export]"'
                )

            # Check for OCR elements (provide a warning) - keep this check here
            has_ocr_elements = False
            for page in self.pages:
                # Use find_all which returns a collection; check if it's non-empty
                if hasattr(page, "find_all"):
                    ocr_text_elements = page.find_all("text[source=ocr]")
                    if ocr_text_elements:  # Check truthiness of collection
                        has_ocr_elements = True
                        break
                elif hasattr(page, "words"):  # Fallback check if find_all isn't present
                    if any(getattr(el, "source", None) == "ocr" for el in page.words):
                        has_ocr_elements = True
                        break

            if has_ocr_elements:
                logger.warning(
                    "Warning: Saving with original=True preserves original page content. "
                    "OCR text generated in this session will not be included in the saved file."
                )

            logger.info(f"Saving original pages PDF to: {output_path_str}")
            try:
                # Delegate to the original PDF exporter function
                # Pass `self` (the PageCollection instance) as the source
                create_original_pdf(self, output_path_str)
                # Success log is now inside create_original_pdf
                # logger.info(f"Successfully saved original pages PDF to: {output_path_str}")
            except Exception as e:
                # Error logging is handled within create_original_pdf
                # Re-raise the exception caught from the exporter
                raise e  # Keep the original exception type (ValueError, RuntimeError, etc.)
        # <--- END MODIFIED

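The XOR guard means each call writes exactly one flavor of output, so keeping both a searchable copy and a faithful copy takes two calls. A minimal sketch with hypothetical file names:

```python
import natural_pdf as npdf

pages = npdf.PDF("scanned.pdf").pages[:5]  # hypothetical file

pages.save_pdf("searchable.pdf", ocr=True, dpi=300)  # image-based + OCR text layer
pages.save_pdf("original.pdf", original=True)        # original vector pages, no OCR
```
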
    def to_flow(
        self,
        arrangement: Literal["vertical", "horizontal"] = "vertical",
        alignment: Literal["start", "center", "end", "top", "left", "bottom", "right"] = "start",
        segment_gap: float = 0.0,
    ) -> "Flow":
        """
        Convert this PageCollection to a Flow for cross-page operations.

        This enables treating multiple pages as a continuous logical document
        structure, useful for multi-page tables, articles spanning columns,
        or any content requiring reading order across page boundaries.

        Args:
            arrangement: Primary flow direction ('vertical' or 'horizontal').
                         'vertical' stacks pages top-to-bottom (most common).
                         'horizontal' arranges pages left-to-right.
            alignment: Cross-axis alignment for pages of different sizes:
                       For vertical: 'left'/'start', 'center', 'right'/'end'
                       For horizontal: 'top'/'start', 'center', 'bottom'/'end'
            segment_gap: Virtual gap between pages in PDF points (default: 0.0).

        Returns:
            Flow object that can perform operations across all pages in sequence.

        Example:
            Multi-page table extraction:
            ```python
            pdf = npdf.PDF("multi_page_report.pdf")

            # Create flow for pages 2-4 containing a table
            table_flow = pdf.pages[1:4].to_flow()

            # Extract table as if it were continuous
            table_data = table_flow.extract_table()
            df = table_data.df
            ```

            Cross-page element search:
            ```python
            # Find all headers across multiple pages
            headers = pdf.pages[5:10].to_flow().find_all('text[size>12]:bold')

            # Analyze layout across pages
            regions = pdf.pages.to_flow().analyze_layout(engine='yolo')
            ```
        """
        from natural_pdf.flows.flow import Flow

        return Flow(
            segments=self,  # Flow constructor now handles PageCollection
            arrangement=arrangement,
            alignment=alignment,
            segment_gap=segment_gap,
        )

    def analyze_layout(self, *args, **kwargs) -> "ElementCollection[Region]":
        """
        Analyzes the layout of each page in the collection.

        This method iterates through each page, calls its analyze_layout method,
        and returns a single ElementCollection containing all the detected layout
        regions from all pages.

        Args:
            *args: Positional arguments to pass to each page's analyze_layout method.
            **kwargs: Keyword arguments to pass to each page's analyze_layout method.
                      A 'show_progress' kwarg can be included to show a progress bar.

        Returns:
            An ElementCollection of all detected Region objects.
        """
        all_regions = []

        show_progress = kwargs.pop("show_progress", True)

        iterator = self.pages
        if show_progress:
            try:
                from tqdm.auto import tqdm

                iterator = tqdm(self.pages, desc="Analyzing layout")
            except ImportError:
                pass  # tqdm not installed

        for page in iterator:
            # Each page's analyze_layout method returns an ElementCollection
            regions_collection = page.analyze_layout(*args, **kwargs)
            if regions_collection:
                all_regions.extend(regions_collection.elements)

        return ElementCollection(all_regions)

    def highlights(self, show: bool = False) -> "HighlightContext":
        """
        Create a highlight context for accumulating highlights.

        This allows for clean syntax to show multiple highlight groups:

        Example:
            with pages.highlights() as h:
                h.add(pages.find_all('table'), label='tables', color='blue')
                h.add(pages.find_all('text:bold'), label='bold text', color='red')
                h.show()

        Or with automatic display:
            with pages.highlights(show=True) as h:
                h.add(pages.find_all('table'), label='tables')
                h.add(pages.find_all('text:bold'), label='bold')
                # Automatically shows when exiting the context

        Args:
            show: If True, automatically show highlights when exiting context

        Returns:
            HighlightContext for accumulating highlights
        """
        from natural_pdf.core.highlighting_service import HighlightContext

        return HighlightContext(self, show_on_exit=show)