natural-pdf 0.1.40__py3-none-any.whl → 0.2.1.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. natural_pdf/__init__.py +6 -7
  2. natural_pdf/analyzers/__init__.py +6 -1
  3. natural_pdf/analyzers/guides.py +354 -258
  4. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -4
  6. natural_pdf/analyzers/layout/paddle.py +11 -0
  7. natural_pdf/analyzers/layout/surya.py +2 -3
  8. natural_pdf/analyzers/shape_detection_mixin.py +25 -34
  9. natural_pdf/analyzers/text_structure.py +2 -2
  10. natural_pdf/classification/manager.py +1 -1
  11. natural_pdf/collections/mixins.py +3 -2
  12. natural_pdf/core/highlighting_service.py +743 -32
  13. natural_pdf/core/page.py +236 -383
  14. natural_pdf/core/page_collection.py +1249 -0
  15. natural_pdf/core/pdf.py +172 -83
  16. natural_pdf/{collections → core}/pdf_collection.py +18 -11
  17. natural_pdf/core/render_spec.py +335 -0
  18. natural_pdf/describe/base.py +1 -1
  19. natural_pdf/elements/__init__.py +1 -0
  20. natural_pdf/elements/base.py +108 -83
  21. natural_pdf/elements/{collections.py → element_collection.py} +566 -1487
  22. natural_pdf/elements/line.py +0 -1
  23. natural_pdf/elements/rect.py +0 -1
  24. natural_pdf/elements/region.py +318 -243
  25. natural_pdf/elements/text.py +9 -7
  26. natural_pdf/exporters/base.py +2 -2
  27. natural_pdf/exporters/original_pdf.py +1 -1
  28. natural_pdf/exporters/paddleocr.py +2 -4
  29. natural_pdf/exporters/searchable_pdf.py +3 -2
  30. natural_pdf/extraction/mixin.py +1 -3
  31. natural_pdf/flows/collections.py +1 -69
  32. natural_pdf/flows/element.py +4 -4
  33. natural_pdf/flows/flow.py +1200 -243
  34. natural_pdf/flows/region.py +707 -261
  35. natural_pdf/ocr/ocr_options.py +0 -2
  36. natural_pdf/ocr/utils.py +2 -1
  37. natural_pdf/qa/document_qa.py +21 -5
  38. natural_pdf/search/search_service_protocol.py +1 -1
  39. natural_pdf/selectors/parser.py +2 -2
  40. natural_pdf/tables/result.py +35 -1
  41. natural_pdf/text_mixin.py +7 -3
  42. natural_pdf/utils/debug.py +2 -1
  43. natural_pdf/utils/highlighting.py +1 -0
  44. natural_pdf/utils/layout.py +2 -2
  45. natural_pdf/utils/packaging.py +4 -3
  46. natural_pdf/utils/text_extraction.py +15 -12
  47. natural_pdf/utils/visualization.py +385 -0
  48. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/METADATA +7 -3
  49. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/RECORD +55 -53
  50. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/top_level.txt +0 -2
  51. optimization/memory_comparison.py +1 -1
  52. optimization/pdf_analyzer.py +2 -2
  53. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/WHEEL +0 -0
  54. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/entry_points.txt +0 -0
  55. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1249 @@
1
+ import hashlib
2
+ import logging
3
+ from collections.abc import MutableSequence, Sequence
4
+ from pathlib import Path
5
+ from typing import (
6
+ TYPE_CHECKING,
7
+ Any,
8
+ Callable,
9
+ Dict,
10
+ Generic,
11
+ Iterable,
12
+ Iterator,
13
+ List,
14
+ Literal,
15
+ Optional,
16
+ Sequence,
17
+ Tuple,
18
+ Type,
19
+ TypeVar,
20
+ Union,
21
+ overload,
22
+ )
23
+
24
+ from pdfplumber.utils.geometry import objects_to_bbox
25
+
26
+ # New Imports
27
+ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
28
+ from PIL import Image, ImageDraw, ImageFont
29
+ from tqdm.auto import tqdm
30
+
31
+ from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
32
+ from natural_pdf.classification.manager import ClassificationManager
33
+ from natural_pdf.classification.mixin import ClassificationMixin
34
+ from natural_pdf.collections.mixins import ApplyMixin, DirectionalCollectionMixin
35
+ from natural_pdf.core.pdf import PDF
36
+ from natural_pdf.core.render_spec import RenderSpec, Visualizable
37
+ from natural_pdf.describe.mixin import DescribeMixin, InspectMixin
38
+ from natural_pdf.elements.base import Element
39
+ from natural_pdf.elements.element_collection import ElementCollection
40
+ from natural_pdf.elements.region import Region
41
+ from natural_pdf.elements.text import TextElement
42
+ from natural_pdf.export.mixin import ExportMixin
43
+ from natural_pdf.ocr import OCROptions
44
+ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
45
+ from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
46
+ from natural_pdf.text_mixin import TextMixin
47
+
48
+ # Potentially lazy imports for optional dependencies needed in save_pdf
49
+ try:
50
+ import pikepdf
51
+ except ImportError:
52
+ pikepdf = None
53
+
54
+ try:
55
+ from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
56
+ except ImportError:
57
+ create_searchable_pdf = None
58
+
59
+ # ---> ADDED Import for the new exporter
60
+ try:
61
+ from natural_pdf.exporters.original_pdf import create_original_pdf
62
+ except ImportError:
63
+ create_original_pdf = None
64
+ # <--- END ADDED
65
+
66
+ logger = logging.getLogger(__name__)
67
+
68
+ if TYPE_CHECKING:
69
+ from natural_pdf.core.page import Page
70
+ from natural_pdf.core.pdf import PDF # ---> ADDED PDF type hint
71
+ from natural_pdf.elements.region import Region
72
+ from natural_pdf.elements.text import TextElement # Ensure TextElement is imported
73
+ from natural_pdf.flows.flow import Flow
74
+
75
+ T = TypeVar("T")
76
+ P = TypeVar("P", bound="Page")
77
+
78
+
79
class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Visualizable):
    """
    Represents a collection of Page objects, often from a single PDF document.
    Provides methods for batch operations on these pages.
    """

    def __init__(self, pages: Union[List[P], Sequence[P]]):
        """
        Initialize a page collection.

        Args:
            pages: List or sequence of Page objects (can be lazy)
        """
        # Keep the incoming sequence untouched when it already behaves like
        # one, so lazy page lists are not materialized prematurely; anything
        # else is eagerly converted to a plain list.
        is_sequence_like = hasattr(pages, "__iter__") and hasattr(pages, "__len__")
        self.pages = pages if is_sequence_like else list(pages)
99
+
100
+ def __len__(self) -> int:
101
+ """Return the number of pages in the collection."""
102
+ return len(self.pages)
103
+
104
+ def __getitem__(self, idx) -> Union[P, "PageCollection[P]"]:
105
+ """Support indexing and slicing."""
106
+ if isinstance(idx, slice):
107
+ return PageCollection(self.pages[idx])
108
+ return self.pages[idx]
109
+
110
+ def __iter__(self) -> Iterator[P]:
111
+ """Support iteration."""
112
+ return iter(self.pages)
113
+
114
+ def __repr__(self) -> str:
115
+ """Return a string representation showing the page count."""
116
+ return f"<PageCollection(count={len(self)})>"
117
+
118
+ def _get_items_for_apply(self) -> Iterator[P]:
119
+ """
120
+ Override ApplyMixin's _get_items_for_apply to preserve lazy behavior.
121
+
122
+ Returns an iterator that yields pages on-demand rather than materializing
123
+ all pages at once, maintaining the lazy loading behavior.
124
+ """
125
+ return iter(self.pages)
126
+
127
+ def _get_page_indices(self) -> List[int]:
128
+ """
129
+ Get page indices without forcing materialization of pages.
130
+
131
+ Returns:
132
+ List of page indices for the pages in this collection.
133
+ """
134
+ # Handle different types of page sequences efficiently
135
+ if hasattr(self.pages, "_indices"):
136
+ # If it's a _LazyPageList (or slice), get indices directly
137
+ return list(self.pages._indices)
138
+ else:
139
+ # Fallback: if pages are already materialized, get indices normally
140
+ # This will force materialization but only if pages aren't lazy
141
+ return [p.index for p in self.pages]
142
+
143
+ def extract_text(
144
+ self,
145
+ keep_blank_chars: bool = True,
146
+ apply_exclusions: bool = True,
147
+ strip: Optional[bool] = None,
148
+ **kwargs,
149
+ ) -> str:
150
+ """
151
+ Extract text from all pages in the collection.
152
+
153
+ Args:
154
+ keep_blank_chars: Whether to keep blank characters (default: True)
155
+ apply_exclusions: Whether to apply exclusion regions (default: True)
156
+ strip: Whether to strip whitespace from the extracted text.
157
+ **kwargs: Additional extraction parameters
158
+
159
+ Returns:
160
+ Combined text from all pages
161
+ """
162
+ texts = []
163
+ for page in self.pages:
164
+ text = page.extract_text(
165
+ keep_blank_chars=keep_blank_chars,
166
+ apply_exclusions=apply_exclusions,
167
+ **kwargs,
168
+ )
169
+ texts.append(text)
170
+
171
+ combined = "\n".join(texts)
172
+
173
+ # Default strip behaviour: if caller picks, honour; else respect layout flag passed via kwargs.
174
+ use_layout = kwargs.get("layout", False)
175
+ strip_final = strip if strip is not None else (not use_layout)
176
+
177
+ if strip_final:
178
+ combined = "\n".join(line.rstrip() for line in combined.splitlines()).strip()
179
+
180
+ return combined
181
+
182
+ def apply_ocr(
183
+ self,
184
+ engine: Optional[str] = None,
185
+ # --- Common OCR Parameters (Direct Arguments) ---
186
+ languages: Optional[List[str]] = None,
187
+ min_confidence: Optional[float] = None, # Min confidence threshold
188
+ device: Optional[str] = None,
189
+ resolution: Optional[int] = None, # DPI for rendering
190
+ apply_exclusions: bool = True, # New parameter
191
+ replace: bool = True, # Whether to replace existing OCR elements
192
+ # --- Engine-Specific Options ---
193
+ options: Optional[Any] = None, # e.g., EasyOCROptions(...)
194
+ ) -> "PageCollection[P]":
195
+ """
196
+ Applies OCR to all pages within this collection using batch processing.
197
+
198
+ This delegates the work to the parent PDF object's `apply_ocr` method.
199
+
200
+ Args:
201
+ engine: Name of the OCR engine (e.g., 'easyocr', 'paddleocr').
202
+ languages: List of language codes (e.g., ['en', 'fr'], ['en', 'ch']).
203
+ **Must be codes understood by the specific selected engine.**
204
+ No mapping is performed.
205
+ min_confidence: Minimum confidence threshold for detected text (0.0 to 1.0).
206
+ device: Device to run OCR on (e.g., 'cpu', 'cuda', 'mps').
207
+ resolution: DPI resolution to render page images before OCR (e.g., 150, 300).
208
+ apply_exclusions: If True (default), render page images for OCR with
209
+ excluded areas masked (whited out). If False, OCR
210
+ the raw page images without masking exclusions.
211
+ replace: If True (default), remove any existing OCR elements before
212
+ adding new ones. If False, add new OCR elements to existing ones.
213
+ options: An engine-specific options object (e.g., EasyOCROptions) or dict.
214
+
215
+ Returns:
216
+ Self for method chaining.
217
+
218
+ Raises:
219
+ RuntimeError: If pages lack a parent PDF or parent lacks `apply_ocr`.
220
+ (Propagates exceptions from PDF.apply_ocr)
221
+ """
222
+ if not self.pages:
223
+ logger.warning("Cannot apply OCR to an empty PageCollection.")
224
+ return self
225
+
226
+ # Assume all pages share the same parent PDF object
227
+ first_page = self.pages[0]
228
+ if not hasattr(first_page, "_parent") or not first_page._parent:
229
+ raise RuntimeError("Pages in this collection do not have a parent PDF reference.")
230
+
231
+ parent_pdf = first_page._parent
232
+
233
+ if not hasattr(parent_pdf, "apply_ocr") or not callable(parent_pdf.apply_ocr):
234
+ raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")
235
+
236
+ # Get the 0-based indices of the pages in this collection
237
+ page_indices = self._get_page_indices()
238
+
239
+ logger.info(f"Applying OCR via parent PDF to page indices: {page_indices} in collection.")
240
+
241
+ # Delegate the batch call to the parent PDF object, passing direct args and apply_exclusions
242
+ parent_pdf.apply_ocr(
243
+ pages=page_indices,
244
+ engine=engine,
245
+ languages=languages,
246
+ min_confidence=min_confidence, # Pass the renamed parameter
247
+ device=device,
248
+ resolution=resolution,
249
+ apply_exclusions=apply_exclusions, # Pass down
250
+ replace=replace, # Pass the replace parameter
251
+ options=options,
252
+ )
253
+ # The PDF method modifies the Page objects directly by adding elements.
254
+
255
+ return self # Return self for chaining
256
+
257
+ @overload
258
+ def find(
259
+ self,
260
+ *,
261
+ text: str,
262
+ contains: str = "all",
263
+ apply_exclusions: bool = True,
264
+ regex: bool = False,
265
+ case: bool = True,
266
+ **kwargs,
267
+ ) -> Optional[T]: ...
268
+
269
+ @overload
270
+ def find(
271
+ self,
272
+ selector: str,
273
+ *,
274
+ contains: str = "all",
275
+ apply_exclusions: bool = True,
276
+ regex: bool = False,
277
+ case: bool = True,
278
+ **kwargs,
279
+ ) -> Optional[T]: ...
280
+
281
+ def find(
282
+ self,
283
+ selector: Optional[str] = None,
284
+ *,
285
+ text: Optional[str] = None,
286
+ contains: str = "all",
287
+ apply_exclusions: bool = True,
288
+ regex: bool = False,
289
+ case: bool = True,
290
+ **kwargs,
291
+ ) -> Optional[T]:
292
+ """
293
+ Find the first element matching the selector OR text across all pages in the collection.
294
+
295
+ Provide EITHER `selector` OR `text`, but not both.
296
+
297
+ Args:
298
+ selector: CSS-like selector string.
299
+ text: Text content to search for (equivalent to 'text:contains(...)').
300
+ contains: How to determine if elements are inside: 'all' (fully inside),
301
+ 'any' (any overlap), or 'center' (center point inside).
302
+ (default: "all")
303
+ apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
304
+ regex: Whether to use regex for text search (`selector` or `text`) (default: False).
305
+ case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
306
+ **kwargs: Additional filter parameters.
307
+
308
+ Returns:
309
+ First matching element or None.
310
+ """
311
+ # Input validation happens within page.find
312
+ for page in self.pages:
313
+ element = page.find(
314
+ selector=selector,
315
+ text=text,
316
+ contains=contains,
317
+ apply_exclusions=apply_exclusions,
318
+ regex=regex,
319
+ case=case,
320
+ **kwargs,
321
+ )
322
+ if element:
323
+ return element
324
+ return None
325
+
326
+ @overload
327
+ def find_all(
328
+ self,
329
+ *,
330
+ text: str,
331
+ contains: str = "all",
332
+ apply_exclusions: bool = True,
333
+ regex: bool = False,
334
+ case: bool = True,
335
+ **kwargs,
336
+ ) -> "ElementCollection": ...
337
+
338
+ @overload
339
+ def find_all(
340
+ self,
341
+ selector: str,
342
+ *,
343
+ contains: str = "all",
344
+ apply_exclusions: bool = True,
345
+ regex: bool = False,
346
+ case: bool = True,
347
+ **kwargs,
348
+ ) -> "ElementCollection": ...
349
+
350
+ def find_all(
351
+ self,
352
+ selector: Optional[str] = None,
353
+ *,
354
+ text: Optional[str] = None,
355
+ contains: str = "all",
356
+ apply_exclusions: bool = True,
357
+ regex: bool = False,
358
+ case: bool = True,
359
+ **kwargs,
360
+ ) -> "ElementCollection":
361
+ """
362
+ Find all elements matching the selector OR text across all pages in the collection.
363
+
364
+ Provide EITHER `selector` OR `text`, but not both.
365
+
366
+ Args:
367
+ selector: CSS-like selector string.
368
+ text: Text content to search for (equivalent to 'text:contains(...)').
369
+ contains: How to determine if elements are inside: 'all' (fully inside),
370
+ 'any' (any overlap), or 'center' (center point inside).
371
+ (default: "all")
372
+ apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
373
+ regex: Whether to use regex for text search (`selector` or `text`) (default: False).
374
+ case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
375
+ **kwargs: Additional filter parameters.
376
+
377
+ Returns:
378
+ ElementCollection with matching elements from all pages.
379
+ """
380
+ all_elements = []
381
+ # Input validation happens within page.find_all
382
+ for page in self.pages:
383
+ elements = page.find_all(
384
+ selector=selector,
385
+ text=text,
386
+ contains=contains,
387
+ apply_exclusions=apply_exclusions,
388
+ regex=regex,
389
+ case=case,
390
+ **kwargs,
391
+ )
392
+ if elements:
393
+ all_elements.extend(elements.elements)
394
+
395
+ return ElementCollection(all_elements)
396
+
397
+ def update_text(
398
+ self,
399
+ transform: Callable[[Any], Optional[str]],
400
+ selector: str = "text",
401
+ max_workers: Optional[int] = None,
402
+ ) -> "PageCollection[P]":
403
+ """
404
+ Applies corrections to text elements across all pages
405
+ in this collection using a user-provided callback function, executed
406
+ in parallel if `max_workers` is specified.
407
+
408
+ This method delegates to the parent PDF's `update_text` method,
409
+ targeting all pages within this collection.
410
+
411
+ Args:
412
+ transform: A function that accepts a single argument (an element
413
+ object) and returns `Optional[str]` (new text or None).
414
+ selector: The attribute name to update. Default is 'text'.
415
+ max_workers: The maximum number of worker threads to use for parallel
416
+ correction on each page. If None, defaults are used.
417
+
418
+ Returns:
419
+ Self for method chaining.
420
+
421
+ Raises:
422
+ RuntimeError: If the collection is empty, pages lack a parent PDF reference,
423
+ or the parent PDF lacks the `update_text` method.
424
+ """
425
+ if not self.pages:
426
+ logger.warning("Cannot update text for an empty PageCollection.")
427
+ # Return self even if empty to maintain chaining consistency
428
+ return self
429
+
430
+ # Assume all pages share the same parent PDF object
431
+ parent_pdf = self.pages[0]._parent
432
+ if (
433
+ not parent_pdf
434
+ or not hasattr(parent_pdf, "update_text")
435
+ or not callable(parent_pdf.update_text)
436
+ ):
437
+ raise RuntimeError(
438
+ "Parent PDF reference not found or parent PDF lacks the required 'update_text' method."
439
+ )
440
+
441
+ page_indices = self._get_page_indices()
442
+ logger.info(
443
+ f"PageCollection: Delegating text update to parent PDF for page indices: {page_indices} with max_workers={max_workers} and selector='{selector}'."
444
+ )
445
+
446
+ # Delegate the call to the parent PDF object for the relevant pages
447
+ # Pass the max_workers parameter down
448
+ parent_pdf.update_text(
449
+ transform=transform,
450
+ pages=page_indices,
451
+ selector=selector,
452
+ max_workers=max_workers,
453
+ )
454
+
455
+ return self
456
+
457
+ def get_sections(
458
+ self,
459
+ start_elements=None,
460
+ end_elements=None,
461
+ new_section_on_page_break=False,
462
+ include_boundaries="both",
463
+ ) -> "ElementCollection[Region]":
464
+ """
465
+ Extract sections from a page collection based on start/end elements.
466
+
467
+ Args:
468
+ start_elements: Elements or selector string that mark the start of sections (optional)
469
+ end_elements: Elements or selector string that mark the end of sections (optional)
470
+ new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
471
+ include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
472
+
473
+ Returns:
474
+ List of Region objects representing the extracted sections
475
+
476
+ Note:
477
+ You can provide only start_elements, only end_elements, or both.
478
+ - With only start_elements: sections go from each start to the next start (or end of page)
479
+ - With only end_elements: sections go from beginning of document/page to each end
480
+ - With both: sections go from each start to the corresponding end
481
+ """
482
+ # Find start and end elements across all pages
483
+ if isinstance(start_elements, str):
484
+ start_elements = self.find_all(start_elements).elements
485
+
486
+ if isinstance(end_elements, str):
487
+ end_elements = self.find_all(end_elements).elements
488
+
489
+ # If no start elements and no end elements, return empty list
490
+ if not start_elements and not end_elements:
491
+ return []
492
+
493
+ # If there are page break boundaries, we'll need to add them
494
+ if new_section_on_page_break:
495
+ # For each page boundary, create virtual "end" and "start" elements
496
+ for i in range(len(self.pages) - 1):
497
+ # Add a virtual "end" element at the bottom of the current page
498
+ page = self.pages[i]
499
+ # If end_elements is None, initialize it as an empty list
500
+ if end_elements is None:
501
+ end_elements = []
502
+
503
+ # Create a region at the bottom of the page as an artificial end marker
504
+ from natural_pdf.elements.region import Region
505
+
506
+ bottom_region = Region(page, (0, page.height - 1, page.width, page.height))
507
+ bottom_region.is_page_boundary = True # Mark it as a special boundary
508
+ end_elements.append(bottom_region)
509
+
510
+ # Add a virtual "start" element at the top of the next page
511
+ next_page = self.pages[i + 1]
512
+ top_region = Region(next_page, (0, 0, next_page.width, 1))
513
+ top_region.is_page_boundary = True # Mark it as a special boundary
514
+ start_elements.append(top_region)
515
+
516
+ # Get all elements from all pages and sort them in document order
517
+ all_elements = []
518
+ for page in self.pages:
519
+ elements = page.get_elements()
520
+ all_elements.extend(elements)
521
+
522
+ # Sort by page index, then vertical position, then horizontal position
523
+ all_elements.sort(key=lambda e: (e.page.index, e.top, e.x0))
524
+
525
+ # If we only have end_elements (no start_elements), create implicit start elements
526
+ if not start_elements and end_elements:
527
+ from natural_pdf.elements.region import Region
528
+
529
+ start_elements = []
530
+
531
+ # Add implicit start at the beginning of the first page
532
+ first_page = self.pages[0]
533
+ first_start = Region(first_page, (0, 0, first_page.width, 1))
534
+ first_start.is_implicit_start = True
535
+ start_elements.append(first_start)
536
+
537
+ # For each end element (except the last), add an implicit start after it
538
+ sorted_end_elements = sorted(end_elements, key=lambda e: (e.page.index, e.top, e.x0))
539
+ for i, end_elem in enumerate(sorted_end_elements[:-1]): # Exclude last end element
540
+ # Create implicit start element right after this end element
541
+ implicit_start = Region(
542
+ end_elem.page, (0, end_elem.bottom, end_elem.page.width, end_elem.bottom + 1)
543
+ )
544
+ implicit_start.is_implicit_start = True
545
+ start_elements.append(implicit_start)
546
+
547
+ # Mark section boundaries
548
+ section_boundaries = []
549
+
550
+ # Add start element boundaries
551
+ for element in start_elements:
552
+ if element in all_elements:
553
+ idx = all_elements.index(element)
554
+ section_boundaries.append(
555
+ {
556
+ "index": idx,
557
+ "element": element,
558
+ "type": "start",
559
+ "page_idx": element.page.index,
560
+ }
561
+ )
562
+ elif hasattr(element, "is_page_boundary") and element.is_page_boundary:
563
+ # This is a virtual page boundary element
564
+ section_boundaries.append(
565
+ {
566
+ "index": -1, # Special index for page boundaries
567
+ "element": element,
568
+ "type": "start",
569
+ "page_idx": element.page.index,
570
+ }
571
+ )
572
+ elif hasattr(element, "is_implicit_start") and element.is_implicit_start:
573
+ # This is an implicit start element
574
+ section_boundaries.append(
575
+ {
576
+ "index": -2, # Special index for implicit starts
577
+ "element": element,
578
+ "type": "start",
579
+ "page_idx": element.page.index,
580
+ }
581
+ )
582
+
583
+ # Add end element boundaries if provided
584
+ if end_elements:
585
+ for element in end_elements:
586
+ if element in all_elements:
587
+ idx = all_elements.index(element)
588
+ section_boundaries.append(
589
+ {
590
+ "index": idx,
591
+ "element": element,
592
+ "type": "end",
593
+ "page_idx": element.page.index,
594
+ }
595
+ )
596
+ elif hasattr(element, "is_page_boundary") and element.is_page_boundary:
597
+ # This is a virtual page boundary element
598
+ section_boundaries.append(
599
+ {
600
+ "index": -1, # Special index for page boundaries
601
+ "element": element,
602
+ "type": "end",
603
+ "page_idx": element.page.index,
604
+ }
605
+ )
606
+
607
+ # Sort boundaries by page index, then by actual document position
608
+ def _sort_key(boundary):
609
+ """Sort boundaries by (page_idx, vertical_top, priority)."""
610
+ page_idx = boundary["page_idx"]
611
+ element = boundary["element"]
612
+
613
+ # Vertical position on the page
614
+ y_pos = getattr(element, "top", 0.0)
615
+
616
+ # Ensure starts come before ends at the same coordinate
617
+ priority = 0 if boundary["type"] == "start" else 1
618
+
619
+ return (page_idx, y_pos, priority)
620
+
621
+ section_boundaries.sort(key=_sort_key)
622
+
623
+ # Generate sections
624
+ sections = []
625
+
626
+ # --- Helper: build a FlowRegion spanning multiple pages ---
627
+ def _build_flow_region(start_el, end_el):
628
+ """Return a FlowRegion that covers from *start_el* to *end_el* (inclusive).
629
+ If *end_el* is None, the region continues to the bottom of the last
630
+ page in this PageCollection."""
631
+ # Local imports to avoid top-level cycles
632
+ from natural_pdf.elements.region import Region
633
+ from natural_pdf.flows.element import FlowElement
634
+ from natural_pdf.flows.flow import Flow
635
+ from natural_pdf.flows.region import FlowRegion
636
+
637
+ start_pg = start_el.page
638
+ end_pg = end_el.page if end_el is not None else self.pages[-1]
639
+
640
+ parts: list[Region] = []
641
+
642
+ # Use the actual top of the start element (for implicit starts this is
643
+ # the bottom of the previous end element) instead of forcing to 0.
644
+ start_top = start_el.top
645
+
646
+ # Slice of first page beginning at *start_top*
647
+ parts.append(Region(start_pg, (0, start_top, start_pg.width, start_pg.height)))
648
+
649
+ # Full middle pages
650
+ for pg_idx in range(start_pg.index + 1, end_pg.index):
651
+ mid_pg = self.pages[pg_idx]
652
+ parts.append(Region(mid_pg, (0, 0, mid_pg.width, mid_pg.height)))
653
+
654
+ # Slice of last page (if distinct)
655
+ if end_pg is not start_pg:
656
+ bottom = end_el.bottom if end_el is not None else end_pg.height
657
+ parts.append(Region(end_pg, (0, 0, end_pg.width, bottom)))
658
+
659
+ flow = Flow(segments=parts, arrangement="vertical")
660
+ src_fe = FlowElement(physical_object=start_el, flow=flow)
661
+ return FlowRegion(
662
+ flow=flow,
663
+ constituent_regions=parts,
664
+ source_flow_element=src_fe,
665
+ boundary_element_found=end_el,
666
+ )
667
+
668
+ # ------------------------------------------------------------------
669
+
670
+ current_start = None
671
+
672
+ for i, boundary in enumerate(section_boundaries):
673
+ # If it's a start boundary and we don't have a current start
674
+ if boundary["type"] == "start" and current_start is None:
675
+ current_start = boundary
676
+
677
+ # If it's an end boundary and we have a current start
678
+ elif boundary["type"] == "end" and current_start is not None:
679
+ # Create a section from current_start to this boundary
680
+ start_element = current_start["element"]
681
+ end_element = boundary["element"]
682
+
683
+ # If both elements are on the same page, use the page's get_section_between
684
+ if start_element.page == end_element.page:
685
+ # For implicit start elements, create a region from the top of the page
686
+ if hasattr(start_element, "is_implicit_start"):
687
+ from natural_pdf.elements.region import Region
688
+
689
+ section = Region(
690
+ start_element.page,
691
+ (0, start_element.top, start_element.page.width, end_element.bottom),
692
+ )
693
+ section.start_element = start_element
694
+ section.boundary_element_found = end_element
695
+ else:
696
+ section = start_element.page.get_section_between(
697
+ start_element, end_element, include_boundaries
698
+ )
699
+ sections.append(section)
700
+ else:
701
+ # Create FlowRegion spanning pages
702
+ flow_region = _build_flow_region(start_element, end_element)
703
+ sections.append(flow_region)
704
+
705
+ current_start = None
706
+
707
+ # If it's another start boundary and we have a current start (for splitting by starts only)
708
+ elif boundary["type"] == "start" and current_start is not None and not end_elements:
709
+ # Create a section from current_start to just before this boundary
710
+ start_element = current_start["element"]
711
+
712
+ # Find the last element before this boundary on the same page
713
+ if start_element.page == boundary["element"].page:
714
+ # Find elements on this page
715
+ page_elements = [e for e in all_elements if e.page == start_element.page]
716
+ # Sort by position
717
+ page_elements.sort(key=lambda e: (e.top, e.x0))
718
+
719
+ # Find the last element before the boundary
720
+ end_idx = (
721
+ page_elements.index(boundary["element"]) - 1
722
+ if boundary["element"] in page_elements
723
+ else -1
724
+ )
725
+ end_element = page_elements[end_idx] if end_idx >= 0 else None
726
+
727
+ # Create the section
728
+ section = start_element.page.get_section_between(
729
+ start_element, end_element, include_boundaries
730
+ )
731
+ sections.append(section)
732
+ else:
733
+ # Cross-page section - create from current_start to the end of its page
734
+ from natural_pdf.elements.region import Region
735
+
736
+ start_page = start_element.page
737
+
738
+ # Handle implicit start elements
739
+ start_top = start_element.top
740
+ region = Region(start_page, (0, start_top, start_page.width, start_page.height))
741
+ region.start_element = start_element
742
+ sections.append(region)
743
+
744
+ current_start = boundary
745
+
746
+ # Handle the last section if we have a current start
747
+ if current_start is not None:
748
+ start_element = current_start["element"]
749
+ start_page = start_element.page
750
+
751
+ if end_elements:
752
+ # With end_elements, we need an explicit end - use the last element
753
+ # on the last page of the collection
754
+ last_page = self.pages[-1]
755
+ last_page_elements = [e for e in all_elements if e.page == last_page]
756
+ last_page_elements.sort(key=lambda e: (e.top, e.x0))
757
+ end_element = last_page_elements[-1] if last_page_elements else None
758
+
759
+ # Create FlowRegion spanning multiple pages using helper
760
+ flow_region = _build_flow_region(start_element, end_element)
761
+ sections.append(flow_region)
762
+ else:
763
+ # With start_elements only, create a section to the end of the current page
764
+ from natural_pdf.elements.region import Region
765
+
766
+ # Handle implicit start elements
767
+ start_top = start_element.top
768
+ region = Region(start_page, (0, start_top, start_page.width, start_page.height))
769
+ region.start_element = start_element
770
+ sections.append(region)
771
+
772
+ return ElementCollection(sections)
773
+
774
+ def _gather_analysis_data(
775
+ self,
776
+ analysis_keys: List[str],
777
+ include_content: bool,
778
+ include_images: bool,
779
+ image_dir: Optional[Path],
780
+ image_format: str,
781
+ image_resolution: int,
782
+ ) -> List[Dict[str, Any]]:
783
+ """
784
+ Gather analysis data from all pages in the collection.
785
+
786
+ Args:
787
+ analysis_keys: Keys in the analyses dictionary to export
788
+ include_content: Whether to include extracted text
789
+ include_images: Whether to export images
790
+ image_dir: Directory to save images
791
+ image_format: Format to save images
792
+ image_resolution: Resolution for exported images
793
+
794
+ Returns:
795
+ List of dictionaries containing analysis data
796
+ """
797
+ if not self.elements:
798
+ logger.warning("No pages found in collection")
799
+ return []
800
+
801
+ all_data = []
802
+
803
+ for page in self.elements:
804
+ # Basic page information
805
+ page_data = {
806
+ "page_number": page.number,
807
+ "page_index": page.index,
808
+ "width": page.width,
809
+ "height": page.height,
810
+ }
811
+
812
+ # Add PDF information if available
813
+ if hasattr(page, "pdf") and page.pdf:
814
+ page_data["pdf_path"] = page.pdf.path
815
+ page_data["pdf_filename"] = Path(page.pdf.path).name
816
+
817
+ # Include extracted text if requested
818
+ if include_content:
819
+ try:
820
+ page_data["content"] = page.extract_text(preserve_whitespace=True)
821
+ except Exception as e:
822
+ logger.error(f"Error extracting text from page {page.number}: {e}")
823
+ page_data["content"] = ""
824
+
825
+ # Save image if requested
826
+ if include_images:
827
+ try:
828
+ # Create image filename
829
+ pdf_name = "unknown"
830
+ if hasattr(page, "pdf") and page.pdf:
831
+ pdf_name = Path(page.pdf.path).stem
832
+
833
+ image_filename = f"{pdf_name}_page_{page.number}.{image_format}"
834
+ image_path = image_dir / image_filename
835
+
836
+ # Save image
837
+ page.save_image(
838
+ str(image_path), resolution=image_resolution, include_highlights=True
839
+ )
840
+
841
+ # Add relative path to data
842
+ page_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
843
+ except Exception as e:
844
+ logger.error(f"Error saving image for page {page.number}: {e}")
845
+ page_data["image_path"] = None
846
+
847
+ # Add analyses data
848
+ if hasattr(page, "analyses") and page.analyses:
849
+ for key in analysis_keys:
850
+ if key not in page.analyses:
851
+ raise KeyError(f"Analysis key '{key}' not found in page {page.number}")
852
+
853
+ # Get the analysis result
854
+ analysis_result = page.analyses[key]
855
+
856
+ # If the result has a to_dict method, use it
857
+ if hasattr(analysis_result, "to_dict"):
858
+ analysis_data = analysis_result.to_dict()
859
+ else:
860
+ # Otherwise, use the result directly if it's dict-like
861
+ try:
862
+ analysis_data = dict(analysis_result)
863
+ except (TypeError, ValueError):
864
+ # Last resort: convert to string
865
+ analysis_data = {"raw_result": str(analysis_result)}
866
+
867
+ # Add analysis data to page data with the key as prefix
868
+ for k, v in analysis_data.items():
869
+ page_data[f"{key}.{k}"] = v
870
+
871
+ all_data.append(page_data)
872
+
873
+ return all_data
874
+
875
+ # --- Deskew Method --- #
876
+
877
def deskew(
    self,
    resolution: int = 300,
    detection_resolution: int = 72,
    force_overwrite: bool = False,
    **deskew_kwargs,
) -> "PDF":
    """Return a new, in-memory PDF containing deskewed copies of these pages.

    The actual processing is delegated to the parent PDF object's
    ``deskew`` method; this wrapper only resolves the parent and the
    0-based page indices to process.

    Important: the resulting PDF is image-based. Text, OCR results,
    annotations and other elements from the original pages are *not*
    carried over.

    Args:
        resolution: DPI for rendering the output deskewed pages.
        detection_resolution: DPI used for skew detection when angles are
            not already cached on the page objects.
        force_overwrite: If False (default), the parent raises ValueError
            when a target page already holds processed elements; set True
            to proceed anyway.
        **deskew_kwargs: Extra options forwarded to ``deskew.determine_skew``
            (e.g. ``max_angle``, ``num_peaks``).

    Returns:
        A new PDF object representing the deskewed document.

    Raises:
        ValueError: If the collection is empty, or (from the parent) when
            ``force_overwrite`` is False and pages contain elements.
        ImportError: If 'deskew' or 'img2pdf' are missing (raised by PDF.deskew).
        RuntimeError: If no parent PDF with a callable ``deskew`` is found.
    """
    if not self.pages:
        logger.warning("Cannot deskew an empty PageCollection.")
        raise ValueError("Cannot deskew an empty PageCollection.")

    # All pages are assumed to share the same parent PDF object.
    parent_pdf = self.pages[0]._parent

    deskew_method = getattr(parent_pdf, "deskew", None) if parent_pdf else None
    if not callable(deskew_method):
        raise RuntimeError(
            "Parent PDF reference not found or parent PDF lacks the required 'deskew' method."
        )

    # Resolve which pages (0-based indices) the parent should process.
    page_indices = self._get_page_indices()
    logger.info(
        f"PageCollection: Delegating deskew to parent PDF for page indices: {page_indices}"
    )

    # Hand everything off to the parent PDF; it builds the new document.
    return deskew_method(
        pages=page_indices,
        resolution=resolution,
        detection_resolution=detection_resolution,
        force_overwrite=force_overwrite,
        **deskew_kwargs,
    )
944
+
945
+ # --- End Deskew Method --- #
946
+
947
def _get_render_specs(
    self,
    mode: Literal["show", "render"] = "show",
    color: Optional[Union[str, Tuple[int, int, int]]] = None,
    highlights: Optional[List[Dict[str, Any]]] = None,
    crop: Union[bool, Literal["content"]] = False,
    crop_bbox: Optional[Tuple[float, float, float, float]] = None,
    **kwargs,
) -> List[RenderSpec]:
    """Build render specifications for every page in this collection.

    Each page contributes one or more specs that are later composed into
    a grid layout.

    Args:
        mode: 'show' includes highlights, 'render' produces clean output.
        color: Highlight color used for pages in show mode.
        highlights: Extra highlight groups to draw.
        crop: Whether (or how) to crop pages.
        crop_bbox: Explicit crop bounds.
        **kwargs: Extra options; ``max_pages`` limits how many pages render.

    Returns:
        A list of RenderSpec objects, one or more per page.
    """
    # Honor an optional page cap supplied via kwargs.
    limit = kwargs.get("max_pages")
    selected = self.pages[:limit] if limit else self.pages

    specs: List[RenderSpec] = []
    for pg in selected:
        renderer = getattr(pg, "_get_render_specs", None)
        if renderer is not None:
            # Page supports the unified rendering pipeline; delegate.
            specs.extend(
                renderer(
                    mode=mode,
                    color=color,
                    highlights=highlights,
                    crop=crop,
                    crop_bbox=crop_bbox,
                    **kwargs,
                )
            )
        else:
            # Legacy page: wrap it in a bare spec, carrying crop bounds.
            fallback = RenderSpec(page=pg)
            if crop_bbox:
                fallback.crop_bbox = crop_bbox
            specs.append(fallback)

    return specs
998
+
999
def save_pdf(
    self,
    output_path: Union[str, Path],
    ocr: bool = False,
    original: bool = False,
    dpi: int = 300,
):
    """
    Saves the pages in this collection to a new PDF file.

    Choose one saving mode:
    - `ocr=True`: Creates a new, image-based PDF using OCR results. This
      makes the text generated during the natural-pdf session searchable,
      but loses original vector content. Requires 'ocr-export' extras.
    - `original=True`: Extracts the original pages from the source PDF,
      preserving all vector content, fonts, and annotations. OCR results
      from the natural-pdf session are NOT included. Requires 'ocr-export' extras.

    Args:
        output_path: Path to save the new PDF file.
        ocr: If True, save as a searchable, image-based PDF using OCR data.
        original: If True, save the original, vector-based pages.
        dpi: Resolution (dots per inch) used only when ocr=True for
            rendering page images and aligning the text layer.

    Raises:
        ValueError: If the collection is empty, if neither or both 'ocr'
                    and 'original' are True, or if 'original=True' and
                    pages originate from different PDFs.
        ImportError: If required libraries ('pikepdf', 'Pillow')
                     are not installed for the chosen mode.
        RuntimeError: If an unexpected error occurs during saving.
    """
    if not self.pages:
        raise ValueError("Cannot save an empty PageCollection.")

    # Exactly one of the two mutually exclusive modes must be selected.
    if not (ocr ^ original):  # XOR: exactly one must be true
        raise ValueError("Exactly one of 'ocr' or 'original' must be True.")

    output_path_obj = Path(output_path)
    output_path_str = str(output_path_obj)

    if ocr:
        # create_searchable_pdf is a module-level optional import; None means
        # the 'ocr-export' extras are not installed.
        if create_searchable_pdf is None:
            raise ImportError(
                "Saving with ocr=True requires 'pikepdf' and 'Pillow'. "
                'Install with: pip install \\"natural-pdf[ocr-export]\\"'  # Escaped quotes
            )

        # Check for non-OCR vector elements (provide a warning)
        has_vector_elements = False
        for page in self.pages:
            # Simplified check for common vector types or non-OCR chars/words.
            # Any rects/lines/curves, or any char/word whose source is not
            # "ocr", means original vector content exists and will be lost.
            if (
                hasattr(page, "rects")
                and page.rects
                or hasattr(page, "lines")
                and page.lines
                or hasattr(page, "curves")
                and page.curves
                or (
                    hasattr(page, "chars")
                    and any(getattr(el, "source", None) != "ocr" for el in page.chars)
                )
                or (
                    hasattr(page, "words")
                    and any(getattr(el, "source", None) != "ocr" for el in page.words)
                )
            ):
                has_vector_elements = True
                break
        if has_vector_elements:
            logger.warning(
                "Warning: Saving with ocr=True creates an image-based PDF. "
                "Original vector elements (rects, lines, non-OCR text/chars) "
                "on selected pages will not be preserved in the output file."
            )

        logger.info(f"Saving searchable PDF (OCR text layer) to: {output_path_str}")
        try:
            # Delegate to the searchable PDF exporter function
            # Pass `self` (the PageCollection instance) as the source
            create_searchable_pdf(self, output_path_str, dpi=dpi)
            # Success log is now inside create_searchable_pdf if needed, or keep here
            # logger.info(f"Successfully saved searchable PDF to: {output_path_str}")
        except Exception as e:
            logger.error(f"Failed to create searchable PDF: {e}", exc_info=True)
            # Re-raise as RuntimeError for consistency, potentially handled in exporter too
            raise RuntimeError(f"Failed to create searchable PDF: {e}") from e

    elif original:
        # ---> MODIFIED: Call the new exporter
        # create_original_pdf is a module-level optional import; None means
        # the 'ocr-export' extras are not installed.
        if create_original_pdf is None:
            raise ImportError(
                "Saving with original=True requires 'pikepdf'. "
                'Install with: pip install \\"natural-pdf[ocr-export]\\"'  # Escaped quotes
            )

        # Check for OCR elements (provide a warning) - keep this check here
        has_ocr_elements = False
        for page in self.pages:
            # Use find_all which returns a collection; check if it's non-empty
            if hasattr(page, "find_all"):
                ocr_text_elements = page.find_all("text[source=ocr]")
                if ocr_text_elements:  # Check truthiness of collection
                    has_ocr_elements = True
                    break
            elif hasattr(page, "words"):  # Fallback check if find_all isn't present?
                if any(getattr(el, "source", None) == "ocr" for el in page.words):
                    has_ocr_elements = True
                    break

        if has_ocr_elements:
            logger.warning(
                "Warning: Saving with original=True preserves original page content. "
                "OCR text generated in this session will not be included in the saved file."
            )

        logger.info(f"Saving original pages PDF to: {output_path_str}")
        try:
            # Delegate to the original PDF exporter function
            # Pass `self` (the PageCollection instance) as the source
            create_original_pdf(self, output_path_str)
            # Success log is now inside create_original_pdf
            # logger.info(f"Successfully saved original pages PDF to: {output_path_str}")
        except Exception as e:
            # Error logging is handled within create_original_pdf
            # Re-raise the exception caught from the exporter
            raise e  # Keep the original exception type (ValueError, RuntimeError, etc.)
        # <--- END MODIFIED
1129
+
1130
def to_flow(
    self,
    arrangement: Literal["vertical", "horizontal"] = "vertical",
    alignment: Literal["start", "center", "end", "top", "left", "bottom", "right"] = "start",
    segment_gap: float = 0.0,
) -> "Flow":
    """Wrap this PageCollection in a Flow for cross-page operations.

    A Flow treats multiple pages as one continuous logical document,
    which is useful for multi-page tables, articles spanning columns, or
    any content whose reading order crosses page boundaries.

    Args:
        arrangement: Primary flow direction ('vertical' or 'horizontal').
            'vertical' stacks pages top-to-bottom (the common case);
            'horizontal' arranges them left-to-right.
        alignment: Cross-axis alignment when pages differ in size:
            for vertical flows 'left'/'start', 'center', 'right'/'end';
            for horizontal flows 'top'/'start', 'center', 'bottom'/'end'.
        segment_gap: Virtual gap between pages in PDF points (default 0.0).

    Returns:
        Flow object operating across all pages in sequence.

    Example:
        Multi-page table extraction:
        ```python
        pdf = npdf.PDF("multi_page_report.pdf")
        table_flow = pdf.pages[1:4].to_flow()
        table_data = table_flow.extract_table()
        df = table_data.df
        ```

        Cross-page element search:
        ```python
        headers = pdf.pages[5:10].to_flow().find_all('text[size>12]:bold')
        regions = pdf.pages.to_flow().analyze_layout(engine='yolo')
        ```
    """
    # Imported lazily to avoid a circular dependency at module load time.
    from natural_pdf.flows.flow import Flow

    # Flow's constructor accepts a PageCollection directly as segments.
    flow_options = {
        "segments": self,
        "arrangement": arrangement,
        "alignment": alignment,
        "segment_gap": segment_gap,
    }
    return Flow(**flow_options)
1185
+
1186
def analyze_layout(self, *args, **kwargs) -> "ElementCollection[Region]":
    """Run layout analysis on every page and merge the detected regions.

    Each page's ``analyze_layout`` is invoked in turn, and the regions
    from all pages are combined into a single collection.

    Args:
        *args: Positional arguments forwarded to each page's analyze_layout.
        **kwargs: Keyword arguments forwarded likewise. The special
            ``show_progress`` flag (default True) toggles a tqdm progress bar.

    Returns:
        An ElementCollection of all detected Region objects.
    """
    # Pop our own option so it is not forwarded to the pages.
    show_progress = kwargs.pop("show_progress", True)

    page_iter = self.pages
    if show_progress:
        try:
            from tqdm.auto import tqdm

            page_iter = tqdm(self.pages, desc="Analyzing layout")
        except ImportError:
            # tqdm is optional; silently fall back to a plain loop.
            pass

    merged = []
    for pg in page_iter:
        # Each page returns an ElementCollection of regions (possibly empty).
        result = pg.analyze_layout(*args, **kwargs)
        if result:
            merged.extend(result.elements)

    return ElementCollection(merged)
1222
+
1223
def highlights(self, show: bool = False) -> "HighlightContext":
    """Open a highlight-accumulation context for these pages.

    Lets callers group several highlight sets with clean syntax:

        with pages.highlights() as h:
            h.add(pages.find_all('table'), label='tables', color='blue')
            h.add(pages.find_all('text:bold'), label='bold text', color='red')
            h.show()

    Or display automatically on exit:

        with pages.highlights(show=True) as h:
            h.add(pages.find_all('table'), label='tables')
            h.add(pages.find_all('text:bold'), label='bold')
            # Automatically shows when exiting the context

    Args:
        show: If True, automatically show highlights when the context exits.

    Returns:
        A HighlightContext bound to this collection.
    """
    # Imported lazily to avoid a circular dependency at module load time.
    from natural_pdf.core.highlighting_service import HighlightContext

    return HighlightContext(self, show_on_exit=show)