natural-pdf 0.1.37__py3-none-any.whl → 0.1.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +6 -0
- natural_pdf/core/page.py +90 -22
- natural_pdf/core/pdf.py +183 -59
- natural_pdf/elements/collections.py +202 -47
- natural_pdf/elements/region.py +176 -56
- natural_pdf/flows/element.py +25 -0
- natural_pdf/flows/flow.py +702 -20
- natural_pdf/flows/region.py +52 -4
- natural_pdf/selectors/parser.py +34 -1
- natural_pdf/text_mixin.py +97 -0
- {natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/RECORD +16 -15
- {natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.37.dist-info → natural_pdf-0.1.40.dist-info}/top_level.txt +0 -0
All hunks shown below are from `natural_pdf/elements/collections.py`, where `ElementCollection` and `PageCollection` are defined.

```diff
@@ -1,6 +1,6 @@
 import hashlib
 import logging
-from collections.abc import MutableSequence
+from collections.abc import MutableSequence, Sequence
 from pathlib import Path
 from typing import (
     TYPE_CHECKING,
@@ -11,6 +11,7 @@ from typing import (
     Iterable,
     Iterator,
     List,
+    Literal,
     Optional,
     Sequence,
     Tuple,
@@ -40,6 +41,7 @@ from natural_pdf.export.mixin import ExportMixin
 from natural_pdf.ocr import OCROptions
 from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
 from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
+from natural_pdf.text_mixin import TextMixin
 
 # Potentially lazy imports for optional dependencies needed in save_pdf
 try:
@@ -66,6 +68,7 @@ if TYPE_CHECKING:
     from natural_pdf.core.pdf import PDF  # ---> ADDED PDF type hint
     from natural_pdf.elements.region import Region
     from natural_pdf.elements.text import TextElement  # Ensure TextElement is imported
+    from natural_pdf.flows.flow import Flow
 
 T = TypeVar("T")
 P = TypeVar("P", bound="Page")
```
```diff
@@ -1416,7 +1419,7 @@ class ElementCollection(
 
     def correct_ocr(
         self,
-        correction_callback: Callable[[Any], Optional[str]],
+        transform: Callable[[Any], Optional[str]],
        max_workers: Optional[int] = None,
     ) -> "ElementCollection":
         """
@@ -1425,10 +1428,10 @@ class ElementCollection(
         in parallel if `max_workers` is specified.
 
         Iterates through elements currently in the collection. If an element's
-        'source' attribute starts with 'ocr', it calls the `correction_callback`
+        'source' attribute starts with 'ocr', it calls the `transform`
         for that element, passing the element itself.
 
-        The `correction_callback` should contain the logic to:
+        The `transform` should contain the logic to:
         1. Determine if the element needs correction.
         2. Perform the correction (e.g., call an LLM).
         3. Return the new text (`str`) or `None`.
@@ -1438,8 +1441,8 @@ class ElementCollection(
         Elements without a source starting with 'ocr' are skipped.
 
         Args:
-            correction_callback: A function accepting an element and returning
-                `Optional[str]` (new text or None).
+            transform: A function accepting an element and returning
+                `Optional[str]` (new text or None).
             max_workers: The maximum number of worker threads to use for parallel
                 correction on each page. If None, defaults are used.
 
@@ -1449,7 +1452,7 @@ class ElementCollection(
         # Delegate to the utility function
         _apply_ocr_correction_to_elements(
             elements=self._elements,
-            correction_callback=correction_callback,
+            correction_callback=transform,
             caller_info=f"ElementCollection(len={len(self._elements)})",  # Pass caller info
             max_workers=max_workers,
         )
```
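The rename above keeps the callback contract: `correct_ocr` still hands each OCR-sourced element to the callable and applies whatever non-`None` string it returns. A hedged usage sketch, where the helper name, the sample file, and the `.text` attribute access are illustrative assumptions rather than anything stated in the diff:

```python
import natural_pdf as npdf

pdf = npdf.PDF("scanned.pdf")

def fix_common_ocr_errors(element):
    """Return corrected text for an OCR element, or None to leave it unchanged."""
    text = element.text or ""  # assumed attribute; adjust to however the element exposes its text
    cleaned = text.replace("|", "I").replace("0f", "Of")
    return cleaned if cleaned != text else None

# Only elements whose 'source' starts with 'ocr' are handed to the callback.
pdf.pages[0].find_all("text").correct_ocr(transform=fix_common_ocr_errors, max_workers=4)
```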
```diff
@@ -2045,20 +2048,26 @@ class ElementCollection(
     # ------------------------------------------------------------------
 
 
-class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
+class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin):
     """
     Represents a collection of Page objects, often from a single PDF document.
     Provides methods for batch operations on these pages.
     """
 
-    def __init__(self, pages: List[P]):
+    def __init__(self, pages: Union[List[P], Sequence[P]]):
         """
         Initialize a page collection.
 
         Args:
-            pages: List of Page objects
+            pages: List or sequence of Page objects (can be lazy)
         """
-        self.pages = pages
+        # Store the sequence as-is to preserve lazy behavior
+        # Only convert to list if we need list-specific operations
+        if hasattr(pages, '__iter__') and hasattr(pages, '__len__'):
+            self.pages = pages
+        else:
+            # Fallback for non-sequence types
+            self.pages = list(pages)
 
     def __len__(self) -> int:
         """Return the number of pages in the collection."""
```
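The duck-typed check above means anything that looks like a sequence (`__iter__` plus `__len__`) is stored untouched, so natural-pdf's lazy page list is never forced into memory, while bare iterators fall through to `list()`. A minimal sketch, assuming direct construction of `PageCollection` purely for illustration; in normal use `pdf.pages` already behaves like a collection:

```python
import natural_pdf as npdf
from natural_pdf.elements.collections import PageCollection

pdf = npdf.PDF("report.pdf")

lazy = PageCollection(pdf.pages)         # sequence-like: stored as-is, pages stay lazy
eager = PageCollection(iter(pdf.pages))  # plain iterator: no __len__, so list() materializes it
```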
```diff
@@ -2078,6 +2087,31 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         """Return a string representation showing the page count."""
         return f"<PageCollection(count={len(self)})>"
 
+    def _get_items_for_apply(self) -> Iterator[P]:
+        """
+        Override ApplyMixin's _get_items_for_apply to preserve lazy behavior.
+
+        Returns an iterator that yields pages on-demand rather than materializing
+        all pages at once, maintaining the lazy loading behavior.
+        """
+        return iter(self.pages)
+
+    def _get_page_indices(self) -> List[int]:
+        """
+        Get page indices without forcing materialization of pages.
+
+        Returns:
+            List of page indices for the pages in this collection.
+        """
+        # Handle different types of page sequences efficiently
+        if hasattr(self.pages, '_indices'):
+            # If it's a _LazyPageList (or slice), get indices directly
+            return list(self.pages._indices)
+        else:
+            # Fallback: if pages are already materialized, get indices normally
+            # This will force materialization but only if pages aren't lazy
+            return [p.index for p in self.pages]
+
     def extract_text(
         self,
         keep_blank_chars: bool = True,
```
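`_get_page_indices` prefers the `_indices` attribute carried by the lazy page list and only falls back to touching each page. A behavioral sketch of that rule; `_DemoLazyPages` is a hypothetical stand-in for the library's internal lazy list, not a real class:

```python
class _DemoLazyPages:
    """Stand-in for a lazy page list that knows its indices without loading pages."""

    def __init__(self, indices):
        self._indices = list(indices)

    def __len__(self):
        return len(self._indices)

    def __iter__(self):
        raise RuntimeError("iterating would materialize pages")

def get_page_indices(pages):
    if hasattr(pages, "_indices"):
        return list(pages._indices)   # fast path: no page objects created
    return [p.index for p in pages]   # fallback: forces materialization

print(get_page_indices(_DemoLazyPages(range(3, 7))))  # [3, 4, 5, 6]
```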
```diff
@@ -2172,7 +2206,7 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
             raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")
 
         # Get the 0-based indices of the pages in this collection
-        page_indices = [p.index for p in self.pages]
+        page_indices = self._get_page_indices()
 
         logger.info(f"Applying OCR via parent PDF to page indices: {page_indices} in collection.")
 
```
```diff
@@ -2332,22 +2366,24 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
 
         return ElementCollection(all_elements)
 
-    def correct_ocr(
+    def update_text(
         self,
-        correction_callback: Callable[[Any], Optional[str]],
+        transform: Callable[[Any], Optional[str]],
+        selector: str = "text",
         max_workers: Optional[int] = None,
     ) -> "PageCollection[P]":
         """
-        Applies corrections to OCR-generated text elements across all pages
+        Applies corrections to text elements across all pages
         in this collection using a user-provided callback function, executed
         in parallel if `max_workers` is specified.
 
-        This method delegates to the parent PDF's `correct_ocr` method,
+        This method delegates to the parent PDF's `update_text` method,
         targeting all pages within this collection.
 
         Args:
-            correction_callback: A function that accepts a single argument (an element
-                object) and returns `Optional[str]` (new text or None).
+            transform: A function that accepts a single argument (an element
+                object) and returns `Optional[str]` (new text or None).
+            selector: The attribute name to update. Default is 'text'.
             max_workers: The maximum number of worker threads to use for parallel
                 correction on each page. If None, defaults are used.
 
@@ -2356,10 +2392,10 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
 
         Raises:
             RuntimeError: If the collection is empty, pages lack a parent PDF reference,
-                or the parent PDF lacks the `correct_ocr` method.
+                or the parent PDF lacks the `update_text` method.
         """
         if not self.pages:
-            logger.warning("Cannot correct OCR for an empty PageCollection.")
+            logger.warning("Cannot update text for an empty PageCollection.")
             # Return self even if empty to maintain chaining consistency
             return self
 
@@ -2367,24 +2403,25 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         parent_pdf = self.pages[0]._parent
         if (
             not parent_pdf
-            or not hasattr(parent_pdf, "correct_ocr")
-            or not callable(parent_pdf.correct_ocr)
+            or not hasattr(parent_pdf, "update_text")
+            or not callable(parent_pdf.update_text)
         ):
             raise RuntimeError(
-                "Parent PDF reference not found or parent PDF lacks the required 'correct_ocr' method."
+                "Parent PDF reference not found or parent PDF lacks the required 'update_text' method."
             )
 
-        page_indices = [p.index for p in self.pages]
+        page_indices = self._get_page_indices()
         logger.info(
-            f"PageCollection: Delegating OCR correction to parent PDF for page indices: {page_indices} with max_workers={max_workers}."
+            f"PageCollection: Delegating text update to parent PDF for page indices: {page_indices} with max_workers={max_workers} and selector='{selector}'."
         )
 
         # Delegate the call to the parent PDF object for the relevant pages
         # Pass the max_workers parameter down
-        parent_pdf.correct_ocr(
-            correction_callback=correction_callback,
+        parent_pdf.update_text(
+            transform=transform,
             pages=page_indices,
-            max_workers=max_workers,
+            selector=selector,
+            max_workers=max_workers,
         )
 
         return self
```
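Together with the new `selector` parameter, the rename turns the old OCR-only correction into a general text rewrite that the collection delegates to its parent PDF. A hedged sketch; the helper name and the `.text` attribute access are assumptions, not part of the diff:

```python
import natural_pdf as npdf

pdf = npdf.PDF("report.pdf")

def normalize_whitespace(element):
    text = element.text  # assumed attribute on text elements
    if not text:
        return None
    cleaned = " ".join(text.split())
    return cleaned if cleaned != text else None

# Rewrites matching text elements on the first ten pages only; returns the
# collection so calls can be chained.
pdf.pages[0:10].update_text(transform=normalize_whitespace, selector="text", max_workers=4)
```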
```diff
@@ -2400,13 +2437,19 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         Extract sections from a page collection based on start/end elements.
 
         Args:
-            start_elements: Elements or selector string that mark the start of sections
-            end_elements: Elements or selector string that mark the end of sections
+            start_elements: Elements or selector string that mark the start of sections (optional)
+            end_elements: Elements or selector string that mark the end of sections (optional)
             new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
             boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
 
         Returns:
             List of Region objects representing the extracted sections
+
+        Note:
+            You can provide only start_elements, only end_elements, or both.
+            - With only start_elements: sections go from each start to the next start (or end of page)
+            - With only end_elements: sections go from beginning of document/page to each end
+            - With both: sections go from each start to the corresponding end
         """
         # Find start and end elements across all pages
         if isinstance(start_elements, str):
```
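With the relaxed contract documented above, `get_sections` accepts start-only, end-only, or paired boundaries. A hedged usage sketch; the selector strings assume the document actually contains such markers:

```python
import natural_pdf as npdf

pdf = npdf.PDF("handbook.pdf")

# Start-only: each section runs from one heading to the next (or the end of the page).
chapters = pdf.pages.get_sections(start_elements='text:contains("Chapter")')

# End-only: each section runs from the top of the document/page down to the marker,
# via the implicit start elements introduced later in this diff.
notices = pdf.pages.get_sections(end_elements='text:contains("End of notice")')

for section in chapters:
    print(section.extract_text()[:80])
```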
```diff
@@ -2415,8 +2458,8 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         if isinstance(end_elements, str):
             end_elements = self.find_all(end_elements).elements
 
-        # If no start elements, return empty list
-        if not start_elements:
+        # If no start elements and no end elements, return empty list
+        if not start_elements and not end_elements:
             return []
 
         # If there are page break boundaries, we'll need to add them
```
```diff
@@ -2451,6 +2494,26 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         # Sort by page index, then vertical position, then horizontal position
         all_elements.sort(key=lambda e: (e.page.index, e.top, e.x0))
 
+        # If we only have end_elements (no start_elements), create implicit start elements
+        if not start_elements and end_elements:
+            from natural_pdf.elements.region import Region
+
+            start_elements = []
+
+            # Add implicit start at the beginning of the first page
+            first_page = self.pages[0]
+            first_start = Region(first_page, (0, 0, first_page.width, 1))
+            first_start.is_implicit_start = True
+            start_elements.append(first_start)
+
+            # For each end element (except the last), add an implicit start after it
+            sorted_end_elements = sorted(end_elements, key=lambda e: (e.page.index, e.top, e.x0))
+            for i, end_elem in enumerate(sorted_end_elements[:-1]):  # Exclude last end element
+                # Create implicit start element right after this end element
+                implicit_start = Region(end_elem.page, (0, end_elem.bottom, end_elem.page.width, end_elem.bottom + 1))
+                implicit_start.is_implicit_start = True
+                start_elements.append(implicit_start)
+
         # Mark section boundaries
         section_boundaries = []
 
```
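The implicit-start rule above synthesizes thin `Region` markers: one at the very top of the first page, then one just below every end element except the last, each flagged with `is_implicit_start`. A pure-Python sketch of where those starts land; `FakeEnd` only mimics the fields the rule reads and is not a library type:

```python
from dataclasses import dataclass

@dataclass
class FakeEnd:
    page_index: int
    top: float
    bottom: float

ends = [FakeEnd(0, 180.0, 195.0), FakeEnd(0, 480.0, 495.0), FakeEnd(1, 300.0, 315.0)]

implicit_starts = [(0, 0.0)]  # top of the first page
for end in sorted(ends, key=lambda e: (e.page_index, e.top))[:-1]:
    implicit_starts.append((end.page_index, end.bottom))  # just below each earlier end

print(implicit_starts)  # [(0, 0.0), (0, 195.0), (0, 495.0)]
```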
```diff
@@ -2476,6 +2539,16 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                         "page_idx": element.page.index,
                     }
                 )
+            elif hasattr(element, "is_implicit_start") and element.is_implicit_start:
+                # This is an implicit start element
+                section_boundaries.append(
+                    {
+                        "index": -2,  # Special index for implicit starts
+                        "element": element,
+                        "type": "start",
+                        "page_idx": element.page.index,
+                    }
+                )
 
         # Add end element boundaries if provided
         if end_elements:
```
```diff
@@ -2502,12 +2575,20 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                 )
 
         # Sort boundaries by page index, then by actual document position
-
-
-
-
-
-
+        def _sort_key(boundary):
+            """Sort boundaries by (page_idx, vertical_top, priority)."""
+            page_idx = boundary["page_idx"]
+            element = boundary["element"]
+
+            # Vertical position on the page
+            y_pos = getattr(element, "top", 0.0)
+
+            # Ensure starts come before ends at the same coordinate
+            priority = 0 if boundary["type"] == "start" else 1
+
+            return (page_idx, y_pos, priority)
+
+        section_boundaries.sort(key=_sort_key)
 
         # Generate sections
         sections = []
```
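`_sort_key` orders boundaries by page, then vertical position, with the tie-break the inline comment spells out: a "start" sorts ahead of an "end" at the same coordinate. A minimal check of that ordering, with the boundary dictionaries simplified to plain values:

```python
boundaries = [
    {"page_idx": 0, "top": 100.0, "type": "end"},
    {"page_idx": 0, "top": 100.0, "type": "start"},
    {"page_idx": 0, "top": 40.0, "type": "start"},
]
boundaries.sort(key=lambda b: (b["page_idx"], b["top"], 0 if b["type"] == "start" else 1))
print([(b["top"], b["type"]) for b in boundaries])
# [(40.0, 'start'), (100.0, 'start'), (100.0, 'end')]
```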
```diff
@@ -2527,8 +2608,13 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
             end_pg = end_el.page if end_el is not None else self.pages[-1]
 
             parts: list[Region] = []
-
-
+
+            # Use the actual top of the start element (for implicit starts this is
+            # the bottom of the previous end element) instead of forcing to 0.
+            start_top = start_el.top
+
+            # Slice of first page beginning at *start_top*
+            parts.append(Region(start_pg, (0, start_top, start_pg.width, start_pg.height)))
 
             # Full middle pages
             for pg_idx in range(start_pg.index + 1, end_pg.index):
```
```diff
@@ -2566,9 +2652,19 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
 
             # If both elements are on the same page, use the page's get_section_between
             if start_element.page == end_element.page:
-                section = start_element.page.get_section_between(
-                    start_element, end_element, boundary_inclusion
-                )
+                # For implicit start elements, create a region from the top of the page
+                if hasattr(start_element, "is_implicit_start"):
+                    from natural_pdf.elements.region import Region
+                    section = Region(
+                        start_element.page,
+                        (0, start_element.top, start_element.page.width, end_element.bottom)
+                    )
+                    section.start_element = start_element
+                    section.boundary_element_found = end_element
+                else:
+                    section = start_element.page.get_section_between(
+                        start_element, end_element, boundary_inclusion
+                    )
                 sections.append(section)
             else:
                 # Create FlowRegion spanning pages
@@ -2607,9 +2703,11 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
             from natural_pdf.elements.region import Region
 
             start_page = start_element.page
-
+
+            # Handle implicit start elements
+            start_top = start_element.top
             region = Region(
-                start_page, (0,
+                start_page, (0, start_top, start_page.width, start_page.height)
             )
             region.start_element = start_element
             sections.append(region)
@@ -2636,8 +2734,10 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
             # With start_elements only, create a section to the end of the current page
             from natural_pdf.elements.region import Region
 
+            # Handle implicit start elements
+            start_top = start_element.top
             region = Region(
-                start_page, (0,
+                start_page, (0, start_top, start_page.width, start_page.height)
             )
             region.start_element = start_element
             sections.append(region)
```
```diff
@@ -2800,7 +2900,7 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
             )
 
         # Get the 0-based indices of the pages in this collection
-        page_indices = [p.index for p in self.pages]
+        page_indices = self._get_page_indices()
         logger.info(
             f"PageCollection: Delegating deskew to parent PDF for page indices: {page_indices}"
         )
```
````diff
@@ -3150,6 +3250,61 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                 raise e  # Keep the original exception type (ValueError, RuntimeError, etc.)
         # <--- END MODIFIED
 
+    def to_flow(
+        self,
+        arrangement: Literal["vertical", "horizontal"] = "vertical",
+        alignment: Literal["start", "center", "end", "top", "left", "bottom", "right"] = "start",
+        segment_gap: float = 0.0,
+    ) -> "Flow":
+        """
+        Convert this PageCollection to a Flow for cross-page operations.
+
+        This enables treating multiple pages as a continuous logical document
+        structure, useful for multi-page tables, articles spanning columns,
+        or any content requiring reading order across page boundaries.
+
+        Args:
+            arrangement: Primary flow direction ('vertical' or 'horizontal').
+                'vertical' stacks pages top-to-bottom (most common).
+                'horizontal' arranges pages left-to-right.
+            alignment: Cross-axis alignment for pages of different sizes:
+                For vertical: 'left'/'start', 'center', 'right'/'end'
+                For horizontal: 'top'/'start', 'center', 'bottom'/'end'
+            segment_gap: Virtual gap between pages in PDF points (default: 0.0).
+
+        Returns:
+            Flow object that can perform operations across all pages in sequence.
+
+        Example:
+            Multi-page table extraction:
+            ```python
+            pdf = npdf.PDF("multi_page_report.pdf")
+
+            # Create flow for pages 2-4 containing a table
+            table_flow = pdf.pages[1:4].to_flow()
+
+            # Extract table as if it were continuous
+            table_data = table_flow.extract_table()
+            df = table_data.df
+            ```
+
+            Cross-page element search:
+            ```python
+            # Find all headers across multiple pages
+            headers = pdf.pages[5:10].to_flow().find_all('text[size>12]:bold')
+
+            # Analyze layout across pages
+            regions = pdf.pages.to_flow().analyze_layout(engine='yolo')
+            ```
+        """
+        from natural_pdf.flows.flow import Flow
+        return Flow(
+            segments=self,  # Flow constructor now handles PageCollection
+            arrangement=arrangement,
+            alignment=alignment,
+            segment_gap=segment_gap,
+        )
+
     # Alias .to_image() to .show() for convenience
     def show(
         self,
````