natural-pdf 25.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. examples/__init__.py +3 -0
  2. examples/another_exclusion_example.py +20 -0
  3. examples/basic_usage.py +190 -0
  4. examples/boundary_exclusion_test.py +137 -0
  5. examples/boundary_inclusion_fix_test.py +157 -0
  6. examples/chainable_layout_example.py +70 -0
  7. examples/color_basic_test.py +49 -0
  8. examples/color_name_example.py +71 -0
  9. examples/color_test.py +62 -0
  10. examples/debug_ocr.py +91 -0
  11. examples/direct_ocr_test.py +148 -0
  12. examples/direct_paddle_test.py +99 -0
  13. examples/direct_qa_example.py +165 -0
  14. examples/document_layout_analysis.py +123 -0
  15. examples/document_qa_example.py +185 -0
  16. examples/exclusion_count_debug.py +128 -0
  17. examples/exclusion_debug.py +107 -0
  18. examples/exclusion_example.py +150 -0
  19. examples/exclusion_optimization_example.py +190 -0
  20. examples/extract_text_test.py +128 -0
  21. examples/font_aware_example.py +101 -0
  22. examples/font_variant_example.py +124 -0
  23. examples/footer_overlap_test.py +124 -0
  24. examples/highlight_all_example.py +82 -0
  25. examples/highlight_attributes_test.py +114 -0
  26. examples/highlight_confidence_display.py +122 -0
  27. examples/highlight_demo.py +110 -0
  28. examples/highlight_float_test.py +71 -0
  29. examples/highlight_test.py +147 -0
  30. examples/highlighting_example.py +123 -0
  31. examples/image_width_example.py +84 -0
  32. examples/improved_api_example.py +128 -0
  33. examples/layout_confidence_display_test.py +65 -0
  34. examples/layout_confidence_test.py +82 -0
  35. examples/layout_coordinate_debug.py +258 -0
  36. examples/layout_highlight_test.py +77 -0
  37. examples/logging_example.py +70 -0
  38. examples/ocr_comprehensive.py +193 -0
  39. examples/ocr_debug_example.py +87 -0
  40. examples/ocr_default_test.py +97 -0
  41. examples/ocr_engine_comparison.py +235 -0
  42. examples/ocr_example.py +89 -0
  43. examples/ocr_simplified_params.py +79 -0
  44. examples/ocr_visualization.py +102 -0
  45. examples/ocr_visualization_test.py +121 -0
  46. examples/paddle_layout_example.py +315 -0
  47. examples/paddle_layout_simple.py +74 -0
  48. examples/paddleocr_example.py +224 -0
  49. examples/page_collection_example.py +103 -0
  50. examples/polygon_highlight_example.py +83 -0
  51. examples/position_methods_example.py +134 -0
  52. examples/region_boundary_test.py +73 -0
  53. examples/region_exclusion_test.py +149 -0
  54. examples/region_expand_example.py +109 -0
  55. examples/region_image_example.py +116 -0
  56. examples/region_ocr_test.py +119 -0
  57. examples/region_sections_example.py +115 -0
  58. examples/school_books.py +49 -0
  59. examples/school_books_all.py +52 -0
  60. examples/scouring.py +36 -0
  61. examples/section_extraction_example.py +232 -0
  62. examples/simple_document_qa.py +97 -0
  63. examples/spatial_navigation_example.py +108 -0
  64. examples/table_extraction_example.py +135 -0
  65. examples/table_structure_detection.py +155 -0
  66. examples/tatr_cells_test.py +56 -0
  67. examples/tatr_ocr_table_test.py +94 -0
  68. examples/text_search_example.py +122 -0
  69. examples/text_style_example.py +110 -0
  70. examples/tiny-text.py +61 -0
  71. examples/until_boundaries_example.py +156 -0
  72. examples/until_example.py +112 -0
  73. examples/very_basics.py +15 -0
  74. natural_pdf/__init__.py +55 -0
  75. natural_pdf/analyzers/__init__.py +9 -0
  76. natural_pdf/analyzers/document_layout.py +736 -0
  77. natural_pdf/analyzers/text_structure.py +153 -0
  78. natural_pdf/core/__init__.py +3 -0
  79. natural_pdf/core/page.py +2376 -0
  80. natural_pdf/core/pdf.py +572 -0
  81. natural_pdf/elements/__init__.py +3 -0
  82. natural_pdf/elements/base.py +553 -0
  83. natural_pdf/elements/collections.py +770 -0
  84. natural_pdf/elements/line.py +124 -0
  85. natural_pdf/elements/rect.py +122 -0
  86. natural_pdf/elements/region.py +1366 -0
  87. natural_pdf/elements/text.py +304 -0
  88. natural_pdf/ocr/__init__.py +62 -0
  89. natural_pdf/ocr/easyocr_engine.py +254 -0
  90. natural_pdf/ocr/engine.py +158 -0
  91. natural_pdf/ocr/paddleocr_engine.py +263 -0
  92. natural_pdf/qa/__init__.py +3 -0
  93. natural_pdf/qa/document_qa.py +405 -0
  94. natural_pdf/selectors/__init__.py +4 -0
  95. natural_pdf/selectors/parser.py +360 -0
  96. natural_pdf/templates/__init__.py +1 -0
  97. natural_pdf/templates/ocr_debug.html +517 -0
  98. natural_pdf/utils/__init__.py +4 -0
  99. natural_pdf/utils/highlighting.py +605 -0
  100. natural_pdf/utils/ocr.py +515 -0
  101. natural_pdf/utils/reading_order.py +227 -0
  102. natural_pdf/utils/visualization.py +151 -0
  103. natural_pdf-25.3.16.dist-info/LICENSE +21 -0
  104. natural_pdf-25.3.16.dist-info/METADATA +268 -0
  105. natural_pdf-25.3.16.dist-info/RECORD +109 -0
  106. natural_pdf-25.3.16.dist-info/WHEEL +5 -0
  107. natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
  108. tests/__init__.py +3 -0
  109. tests/test_pdf.py +39 -0
@@ -0,0 +1,770 @@
1
+ from typing import List, Optional, Dict, Any, Union, Callable, TypeVar, Generic, Iterator, Tuple, TYPE_CHECKING
2
+
3
+ if TYPE_CHECKING:
4
+ from natural_pdf.core.page import Page
5
+ from natural_pdf.elements.region import Region
6
+
7
+ T = TypeVar('T')
8
+ P = TypeVar('P', bound='Page')
9
+
10
+ class ElementCollection(Generic[T]):
11
+ """
12
+ Collection of PDF elements with batch operations.
13
+ """
14
+
15
def __init__(self, elements: List[T]):
    """
    Create a collection that wraps the given elements.

    Args:
        elements: List of Element objects. A falsy value (``None`` or an
            empty list) results in a fresh empty list being stored.
    """
    self._elements = elements if elements else []
23
+
24
def __len__(self) -> int:
    """Return how many elements the collection holds."""
    items = self._elements
    return len(items)
27
+
28
def __getitem__(self, index: int) -> 'Element':
    """Return the element stored at ``index`` in the underlying list."""
    items = self._elements
    return items[index]
31
+
32
def __iter__(self):
    """Return an iterator over the collection's elements."""
    items = self._elements
    return iter(items)
35
+
36
@property
def elements(self) -> List['Element']:
    """Return the underlying list of elements (the same list object, not a copy)."""
    items = self._elements
    return items
40
+
41
@property
def first(self) -> Optional['Element']:
    """Return the first element, or None when the collection is empty."""
    if not self._elements:
        return None
    return self._elements[0]
45
+
46
@property
def last(self) -> Optional['Element']:
    """Return the last element, or None when the collection is empty."""
    if not self._elements:
        return None
    return self._elements[-1]
50
+
51
def highest(self) -> Optional['Element']:
    """
    Return the element closest to the top of the page (smallest ``top``).

    Returns:
        Element with the smallest top value, or None if the collection is empty

    Raises:
        ValueError: If the elements are spread over more than one page
    """
    candidates = self._elements
    if not candidates:
        return None

    # Vertical positions are only comparable within a single page
    if self._are_on_multiple_pages():
        raise ValueError("Cannot determine highest element across multiple pages")

    def _top_of(element):
        return element.top

    return min(candidates, key=_top_of)
69
+
70
def lowest(self) -> Optional['Element']:
    """
    Return the element closest to the bottom of the page (largest ``bottom``).

    Returns:
        Element with the largest bottom value, or None if the collection is empty

    Raises:
        ValueError: If the elements are spread over more than one page
    """
    candidates = self._elements
    if not candidates:
        return None

    # Vertical positions are only comparable within a single page
    if self._are_on_multiple_pages():
        raise ValueError("Cannot determine lowest element across multiple pages")

    def _bottom_of(element):
        return element.bottom

    return max(candidates, key=_bottom_of)
88
+
89
def leftmost(self) -> Optional['Element']:
    """
    Return the element furthest to the left on the page (smallest ``x0``).

    Returns:
        Element with the smallest x0 value, or None if the collection is empty

    Raises:
        ValueError: If the elements are spread over more than one page
    """
    candidates = self._elements
    if not candidates:
        return None

    # Horizontal positions are only compared within a single page
    if self._are_on_multiple_pages():
        raise ValueError("Cannot determine leftmost element across multiple pages")

    def _left_edge(element):
        return element.x0

    return min(candidates, key=_left_edge)
107
+
108
def rightmost(self) -> Optional['Element']:
    """
    Return the element furthest to the right on the page (largest ``x1``).

    Returns:
        Element with the largest x1 value, or None if the collection is empty

    Raises:
        ValueError: If the elements are spread over more than one page
    """
    candidates = self._elements
    if not candidates:
        return None

    # Horizontal positions are only compared within a single page
    if self._are_on_multiple_pages():
        raise ValueError("Cannot determine rightmost element across multiple pages")

    def _right_edge(element):
        return element.x1

    return max(candidates, key=_right_edge)
126
+
127
def _are_on_multiple_pages(self) -> bool:
    """
    Tell whether the collection's elements live on more than one page.

    The first element's ``page.index`` is the reference; elements without a
    ``page`` attribute are ignored. If the first element itself has no
    ``page`` attribute the answer is always False.

    Returns:
        True if any element is on a different page than the first, False otherwise
    """
    items = self._elements
    if not items:
        return False

    anchor = items[0]
    # Without a page on the first element there is nothing to compare against
    if not hasattr(anchor, 'page'):
        return False

    reference_idx = anchor.page.index
    for item in items:
        if hasattr(item, 'page') and item.page.index != reference_idx:
            return True
    return False
145
+
146
def exclude_regions(self, regions: List['Region']) -> 'ElementCollection':
    """
    Remove elements that are within any of the specified regions.

    The returned collection always owns its own list, so sorting it in
    place does not reorder this collection (and vice versa).

    Args:
        regions: List of Region objects to exclude; a falsy value keeps
            every element

    Returns:
        New ElementCollection with filtered elements
    """
    if not regions:
        # Copy so the "new collection" does not share list state with self
        return ElementCollection(list(self._elements))

    kept = [
        element for element in self._elements
        if not any(region._is_element_in_region(element) for region in regions)
    ]
    return ElementCollection(kept)
170
+
171
def extract_text(self, preserve_whitespace=True, use_exclusions=True, **kwargs) -> str:
    """
    Extract text from all elements in the collection.

    Elements without an ``extract_text`` attribute are skipped. The rest are
    visited in reading order: by page index first (so a collection spanning
    several pages does not interleave their text), then top-to-bottom, then
    left-to-right. Elements without a ``page`` sort as if on page 0.

    Args:
        preserve_whitespace: Whether to keep blank characters (default: True)
        use_exclusions: Whether to apply exclusion regions (default: True)
        **kwargs: Additional extraction parameters forwarded to each element

    Returns:
        Non-empty element texts joined with single spaces
    """
    text_elements = [e for e in self._elements if hasattr(e, 'extract_text')]

    def _reading_order(element):
        page_index = getattr(getattr(element, 'page', None), 'index', None)
        if page_index is None:
            page_index = 0
        return (page_index, element.top, element.x0)

    texts = []
    for element in sorted(text_elements, key=_reading_order):
        text = element.extract_text(
            preserve_whitespace=preserve_whitespace,
            use_exclusions=use_exclusions,
            **kwargs
        )
        # Skip empty results so the joined output has no doubled separators
        if text:
            texts.append(text)

    return " ".join(texts)
199
+
200
def filter(self, func: Callable[['Element'], bool]) -> 'ElementCollection':
    """
    Build a new collection holding only the elements accepted by ``func``.

    Args:
        func: Predicate called with each element; a truthy result keeps it

    Returns:
        New ElementCollection with the kept elements, in their original order
    """
    kept = []
    for element in self._elements:
        if func(element):
            kept.append(element)
    return ElementCollection(kept)
211
+
212
def sort(self, key=None, reverse=False) -> 'ElementCollection':
    """
    Sort the collection's elements in place.

    Args:
        key: Function producing the sort key for an element
        reverse: Whether to sort in descending order

    Returns:
        Self for method chaining
    """
    items = self._elements
    # In-place sort: the underlying list object is reordered, not replaced
    items.sort(key=key, reverse=reverse)
    return self
225
+
226
def highlight(self,
              label: Optional[str] = None,
              color: Optional[tuple] = None,
              use_color_cycling: bool = False,
              cycle_colors: bool = False,
              include_attrs: Optional[List[str]] = None,
              existing: str = 'append') -> 'ElementCollection':
    """
    Highlight all elements in the collection.

    Each element is registered with the highlight manager of its own page
    (``element.page._highlight_mgr``), so a collection spanning several pages
    highlights every element on the page it belongs to. Elements whose page
    has no manager fall back to the first element's manager; if none is
    available the element is skipped.

    Args:
        label: Optional label for the highlight
        color: Optional color for the highlight (RGBA tuple)
        use_color_cycling: Force color cycling even with no label (default: False)
        cycle_colors: Alias for use_color_cycling (deprecated, for backward compatibility)
        include_attrs: List of attribute names to display on the highlight (e.g., ['confidence', 'type'])
        existing: How to handle existing highlights - 'append' (default) or 'replace'.
            Passed to the first highlight added to each manager; later
            highlights on the same manager always use 'append'.

    Returns:
        Self for method chaining
    """
    # Use cycle_colors if provided (backward compatibility)
    color_cycle = use_color_cycling or cycle_colors

    if not self._elements:
        return self

    fallback_mgr = getattr(getattr(self._elements[0], 'page', None), '_highlight_mgr', None)
    used_managers = set()

    for element in self._elements:
        highlight_mgr = getattr(getattr(element, 'page', None), '_highlight_mgr', None)
        if highlight_mgr is None:
            highlight_mgr = fallback_mgr
        if highlight_mgr is None:
            continue

        # Only the first highlight per manager may apply a non-append mode;
        # otherwise 'replace' would wipe the highlights added just before.
        mode = 'append' if id(highlight_mgr) in used_managers else existing
        used_managers.add(id(highlight_mgr))

        if getattr(element, 'has_polygon', False):
            highlight_mgr.add_polygon_highlight(
                element.polygon,
                color,
                label,
                color_cycle,
                element=element,
                include_attrs=include_attrs,
                existing=mode
            )
        else:
            bbox = (element.x0, element.top, element.x1, element.bottom)
            highlight_mgr.add_highlight(
                bbox,
                color,
                label,
                color_cycle,
                element=element,
                include_attrs=include_attrs,
                existing=mode
            )

    return self
285
+
286
def show(self,
         scale: float = 2.0,
         width: Optional[int] = None,
         labels: bool = True,
         legend_position: str = 'right',
         render_ocr: bool = False) -> 'Image.Image':
    """
    Render the page that holds this collection's elements.

    Thin wrapper that forwards every option to ``to_image`` without a
    ``path``, and returns whatever ``to_image`` returns.

    Args:
        scale: Scale factor for rendering
        width: Optional width for the output image in pixels
        labels: Whether to include a legend for labels
        legend_position: Position of the legend
        render_ocr: Whether to render OCR text with white background boxes

    Returns:
        The result of ``to_image`` for the given options
    """
    render_options = {
        'scale': scale,
        'width': width,
        'labels': labels,
        'legend_position': legend_position,
        'render_ocr': render_ocr,
    }
    return self.to_image(**render_options)
313
+
314
def save(self,
         filename: str,
         scale: float = 2.0,
         width: Optional[int] = None,
         labels: bool = True,
         legend_position: str = 'right',
         render_ocr: bool = False) -> 'ElementCollection':
    """
    Write an image of the page holding this collection's elements to a file.

    Delegates to ``to_image`` with ``path=filename``; the image it returns
    is discarded.

    Args:
        filename: Path to save the image to
        scale: Scale factor for rendering
        width: Optional width for the output image in pixels
        labels: Whether to include a legend for labels
        legend_position: Position of the legend
        render_ocr: Whether to render OCR text with white background boxes

    Returns:
        Self for method chaining
    """
    render_options = {
        'path': filename,
        'scale': scale,
        'width': width,
        'labels': labels,
        'legend_position': legend_position,
        'render_ocr': render_ocr,
    }
    self.to_image(**render_options)
    return self
345
+
346
def to_image(self,
             path: Optional[str] = None,
             scale: float = 2.0,
             width: Optional[int] = None,
             labels: bool = True,
             legend_position: str = 'right',
             render_ocr: bool = False) -> Optional['Image.Image']:
    """
    Render the page of the collection's first element, optionally saving it.

    All options are passed straight to that page's ``to_image``.

    Args:
        path: Optional path to save the image to
        scale: Scale factor for rendering
        width: Optional width for the output image in pixels (height calculated to maintain aspect ratio)
        labels: Whether to include a legend for labels
        legend_position: Position of the legend
        render_ocr: Whether to render OCR text with white background boxes

    Returns:
        Whatever the page's ``to_image`` returns, or None if the collection
        is empty or its first element has no ``page`` attribute
    """
    items = self._elements
    # Guard: rendering needs a page, taken from the first element
    if not items or not hasattr(items[0], 'page'):
        return None

    target_page = items[0].page
    return target_page.to_image(
        path=path,
        scale=scale,
        width=width,
        labels=labels,
        legend_position=legend_position,
        render_ocr=render_ocr
    )
381
+
382
+ class PageCollection(Generic[P]):
383
+ """
384
+ A collection of PDF pages with cross-page operations.
385
+
386
+ This class provides methods for working with multiple pages, such as finding
387
+ elements across pages, extracting text from page ranges, and more.
388
+ """
389
+
390
def __init__(self, pages: List[P]):
    """
    Create a page collection around the given pages.

    Args:
        pages: List of Page objects; stored as-is (the list is not copied)
    """
    self.pages = pages
398
+
399
def __len__(self) -> int:
    """Return how many pages the collection holds."""
    page_list = self.pages
    return len(page_list)
402
+
403
def __getitem__(self, idx) -> Union[P, 'PageCollection[P]']:
    """
    Support indexing and slicing.

    A slice yields a new PageCollection over the selected pages; any other
    index is passed straight to the underlying page list.
    """
    if not isinstance(idx, slice):
        return self.pages[idx]
    return PageCollection(self.pages[idx])
408
+
409
def __iter__(self) -> Iterator[P]:
    """Return an iterator over the pages, in collection order."""
    page_list = self.pages
    return iter(page_list)
412
+
413
def extract_text(self, keep_blank_chars=True, apply_exclusions=True, **kwargs) -> str:
    """
    Extract text from every page and join the results with newlines.

    Every page contributes one entry, even when its text is empty.

    Args:
        keep_blank_chars: Whether to keep blank characters (default: True)
        apply_exclusions: Whether to apply exclusion regions (default: True)
        **kwargs: Additional extraction parameters forwarded to each page

    Returns:
        Combined text from all pages
    """
    page_texts = [
        page.extract_text(
            keep_blank_chars=keep_blank_chars,
            apply_exclusions=apply_exclusions,
            **kwargs
        )
        for page in self.pages
    ]
    return "\n".join(page_texts)
435
+
436
def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional['Element']:
    """
    Find the first element matching the selector across all pages.

    Pages are searched in collection order and the search stops at the
    first page that returns a match. A match is any result that is not
    None, so an element object that happens to evaluate as falsy is still
    returned.

    Args:
        selector: CSS-like selector string
        apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
        **kwargs: Additional filter parameters forwarded to each page's ``find``

    Returns:
        First matching element or None
    """
    for page in self.pages:
        element = page.find(selector, apply_exclusions=apply_exclusions, **kwargs)
        # Identity check: "no match" is None, not merely a falsy object
        if element is not None:
            return element
    return None
453
+
454
def find_all(self, selector: str, apply_exclusions=True, **kwargs) -> ElementCollection:
    """
    Collect every element matching the selector from all pages.

    Pages are queried in collection order and their matches are appended in
    that order; pages whose result is falsy (no matches) contribute nothing.

    Args:
        selector: CSS-like selector string
        apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
        **kwargs: Additional filter parameters forwarded to each page's ``find_all``

    Returns:
        ElementCollection with matching elements from all pages
    """
    collected = []
    for page in self.pages:
        page_matches = page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
        if not page_matches:
            continue
        collected += page_matches.elements

    return ElementCollection(collected)
473
+
474
def debug_ocr(self, output_path):
    """
    Generate an interactive HTML debug report for OCR results.

    The work is delegated to ``natural_pdf.utils.ocr.debug_ocr_to_html``,
    called with this collection's pages. The report is intended to be a
    single HTML file offering:
    - Side-by-side view of image regions and OCR text
    - Confidence scores with color coding
    - Editable correction fields
    - Filtering and sorting options
    - Export functionality for corrected text

    Args:
        output_path: Path to save the HTML report

    Returns:
        Path to the generated HTML file
    """
    # Imported lazily, at call time rather than at module import
    from natural_pdf.utils.ocr import debug_ocr_to_html

    report_path = debug_ocr_to_html(self.pages, output_path)
    return report_path
493
+
494
def get_sections(self,
                 start_elements=None,
                 end_elements=None,
                 new_section_on_page_break=False,
                 boundary_inclusion='both') -> List['Region']:
    """
    Extract sections from a page collection based on start/end elements.

    Boundaries are ordered by page index and document position, then paired:
    a start opens a section and the next end closes it. When no end elements
    exist, each following start closes the previous section instead. A
    section that is still open at the end runs to the last element of the
    last page (with end elements) or to the bottom of its own page (without).

    Same-page sections come from the page's ``get_section_between``.
    Multi-page sections are a Region on the start page carrying
    ``_spans_pages``, ``_page_range``, ``start_element``, ``end_element`` and
    ``_multi_page_elements``.

    The containers passed as ``start_elements`` / ``end_elements`` are never
    modified; virtual page-break boundaries are added to private copies.

    Args:
        start_elements: Elements or selector string that mark the start of sections
        end_elements: Elements or selector string that mark the end of sections
        new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
        boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')

    Returns:
        List of Region objects representing the extracted sections
    """
    # Resolve selector strings across all pages
    if isinstance(start_elements, str):
        start_elements = self.find_all(start_elements).elements
    if isinstance(end_elements, str):
        end_elements = self.find_all(end_elements).elements

    # Work on copies: accepts any iterable and leaves the caller's data alone
    start_elements = list(start_elements) if start_elements is not None else []
    end_elements = list(end_elements) if end_elements is not None else []

    if not start_elements:
        return []

    if new_section_on_page_break and len(self.pages) > 1:
        from natural_pdf.elements.region import Region

        # For each page boundary add a virtual "end" strip at the bottom of
        # the page and a virtual "start" strip at the top of the next one
        for page, next_page in zip(self.pages, self.pages[1:]):
            bottom_region = Region(page, (0, page.height - 1, page.width, page.height))
            bottom_region.is_page_boundary = True
            end_elements.append(bottom_region)

            top_region = Region(next_page, (0, 0, next_page.width, 1))
            top_region.is_page_boundary = True
            start_elements.append(top_region)

    # All elements from all pages, in document order
    all_elements = []
    for page in self.pages:
        all_elements.extend(page.get_elements())
    all_elements.sort(key=lambda e: (e.page.index, e.top, e.x0))

    def _boundaries(elements, kind):
        """Describe each usable boundary element; unknown elements are dropped."""
        found = []
        for element in elements:
            try:
                idx = all_elements.index(element)
            except ValueError:
                if not getattr(element, 'is_page_boundary', False):
                    continue
                idx = -1  # Special index for virtual page boundaries
            found.append({
                'index': idx,
                'element': element,
                'type': kind,
                'page_idx': element.page.index
            })
        return found

    def _multi_page_section(start_element, end_element, end_page):
        """Build the Region for a section running from start_element to end_page."""
        from natural_pdf.elements.region import Region

        start_page = start_element.page
        combined_region = Region(
            start_page,
            (0, start_element.top, start_page.width, start_page.height)
        )
        combined_region._spans_pages = True
        combined_region._page_range = (start_page.index, end_page.index)
        combined_region.start_element = start_element
        combined_region.end_element = end_element

        # First page: everything from the start element downwards
        members = [e for e in all_elements
                   if e.page == start_page and e.top >= start_element.top]
        # Middle pages (if any): everything
        for page_idx in range(start_page.index + 1, end_page.index):
            members.extend(e for e in all_elements if e.page.index == page_idx)
        # Last page: everything down to the end element (all of it if there is none)
        members.extend(e for e in all_elements
                       if e.page == end_page
                       and (end_element is None or e.bottom <= end_element.bottom))

        combined_region._multi_page_elements = members
        return combined_region

    def _rest_of_page(start_element):
        """Build the Region from start_element to the bottom of its page."""
        from natural_pdf.elements.region import Region

        start_page = start_element.page
        region = Region(
            start_page,
            (0, start_element.top, start_page.width, start_page.height)
        )
        region.start_element = start_element
        return region

    section_boundaries = _boundaries(start_elements, 'start') + _boundaries(end_elements, 'end')

    # Sort by page, then document position; virtual starts sort first on
    # their page and virtual ends last
    section_boundaries.sort(key=lambda x: (x['page_idx'],
                                           x['index'] if x['index'] != -1 else
                                           (0 if x['type'] == 'start' else float('inf'))))

    sections = []
    current_start = None

    for boundary in section_boundaries:
        kind = boundary['type']

        if kind == 'start' and current_start is None:
            current_start = boundary

        elif kind == 'end' and current_start is not None:
            start_element = current_start['element']
            end_element = boundary['element']

            if start_element.page == end_element.page:
                sections.append(start_element.page.get_section_between(
                    start_element,
                    end_element,
                    boundary_inclusion
                ))
            else:
                sections.append(_multi_page_section(start_element, end_element, end_element.page))

            current_start = None

        elif kind == 'start' and current_start is not None and not end_elements:
            # Splitting by starts only: this start closes the previous section
            start_element = current_start['element']
            next_start = boundary['element']

            if start_element.page == next_start.page:
                page_elements = [e for e in all_elements if e.page == start_element.page]
                page_elements.sort(key=lambda e: (e.top, e.x0))

                # The section ends at the element just before the next start
                end_idx = page_elements.index(next_start) - 1 if next_start in page_elements else -1
                end_element = page_elements[end_idx] if end_idx >= 0 else None

                sections.append(start_element.page.get_section_between(
                    start_element,
                    end_element,
                    boundary_inclusion
                ))
            else:
                # Next start is on another page: run to the end of this page
                sections.append(_rest_of_page(start_element))

            current_start = boundary

    # Close a section that is still open after the last boundary
    if current_start is not None:
        start_element = current_start['element']
        start_page = start_element.page

        if end_elements:
            # No explicit end left: use the last element on the last page
            last_page = self.pages[-1]
            last_page_elements = [e for e in all_elements if e.page == last_page]
            last_page_elements.sort(key=lambda e: (e.top, e.x0))
            end_element = last_page_elements[-1] if last_page_elements else None

            if start_page == last_page:
                sections.append(start_page.get_section_between(
                    start_element,
                    end_element,
                    boundary_inclusion
                ))
            else:
                sections.append(_multi_page_section(start_element, end_element, last_page))
        else:
            # With start_elements only, run to the end of the current page
            sections.append(_rest_of_page(start_element))

    return sections