natural-pdf 0.1.37__py3-none-any.whl → 0.1.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  import hashlib
2
2
  import logging
3
- from collections.abc import MutableSequence
3
+ from collections.abc import MutableSequence, Sequence
4
4
  from pathlib import Path
5
5
  from typing import (
6
6
  TYPE_CHECKING,
@@ -11,6 +11,7 @@ from typing import (
11
11
  Iterable,
12
12
  Iterator,
13
13
  List,
14
+ Literal,
14
15
  Optional,
15
16
  Sequence,
16
17
  Tuple,
@@ -40,6 +41,7 @@ from natural_pdf.export.mixin import ExportMixin
40
41
  from natural_pdf.ocr import OCROptions
41
42
  from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
42
43
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
44
+ from natural_pdf.text_mixin import TextMixin
43
45
 
44
46
  # Potentially lazy imports for optional dependencies needed in save_pdf
45
47
  try:
@@ -66,6 +68,7 @@ if TYPE_CHECKING:
66
68
  from natural_pdf.core.pdf import PDF # ---> ADDED PDF type hint
67
69
  from natural_pdf.elements.region import Region
68
70
  from natural_pdf.elements.text import TextElement # Ensure TextElement is imported
71
+ from natural_pdf.flows.flow import Flow
69
72
 
70
73
  T = TypeVar("T")
71
74
  P = TypeVar("P", bound="Page")
@@ -1416,7 +1419,7 @@ class ElementCollection(
1416
1419
 
1417
1420
  def correct_ocr(
1418
1421
  self,
1419
- correction_callback: Callable[[Any], Optional[str]],
1422
+ transform: Callable[[Any], Optional[str]],
1420
1423
  max_workers: Optional[int] = None,
1421
1424
  ) -> "ElementCollection":
1422
1425
  """
@@ -1425,10 +1428,10 @@ class ElementCollection(
1425
1428
  in parallel if `max_workers` is specified.
1426
1429
 
1427
1430
  Iterates through elements currently in the collection. If an element's
1428
- 'source' attribute starts with 'ocr', it calls the `correction_callback`
1431
+ 'source' attribute starts with 'ocr', it calls the `transform`
1429
1432
  for that element, passing the element itself.
1430
1433
 
1431
- The `correction_callback` should contain the logic to:
1434
+ The `transform` should contain the logic to:
1432
1435
  1. Determine if the element needs correction.
1433
1436
  2. Perform the correction (e.g., call an LLM).
1434
1437
  3. Return the new text (`str`) or `None`.
@@ -1438,8 +1441,8 @@ class ElementCollection(
1438
1441
  Elements without a source starting with 'ocr' are skipped.
1439
1442
 
1440
1443
  Args:
1441
- correction_callback: A function accepting an element and returning
1442
- `Optional[str]` (new text or None).
1444
+ transform: A function accepting an element and returning
1445
+ `Optional[str]` (new text or None).
1443
1446
  max_workers: The maximum number of worker threads to use for parallel
1444
1447
  correction on each page. If None, defaults are used.
1445
1448
 
@@ -1449,7 +1452,7 @@ class ElementCollection(
1449
1452
  # Delegate to the utility function
1450
1453
  _apply_ocr_correction_to_elements(
1451
1454
  elements=self._elements,
1452
- correction_callback=correction_callback,
1455
+ correction_callback=transform,
1453
1456
  caller_info=f"ElementCollection(len={len(self._elements)})", # Pass caller info
1454
1457
  max_workers=max_workers,
1455
1458
  )
@@ -2045,20 +2048,26 @@ class ElementCollection(
2045
2048
  # ------------------------------------------------------------------
2046
2049
 
2047
2050
 
2048
- class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2051
+ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin):
2049
2052
  """
2050
2053
  Represents a collection of Page objects, often from a single PDF document.
2051
2054
  Provides methods for batch operations on these pages.
2052
2055
  """
2053
2056
 
2054
- def __init__(self, pages: List[P]):
2057
+ def __init__(self, pages: Union[List[P], Sequence[P]]):
2055
2058
  """
2056
2059
  Initialize a page collection.
2057
2060
 
2058
2061
  Args:
2059
- pages: List of Page objects
2062
+ pages: List or sequence of Page objects (can be lazy)
2060
2063
  """
2061
- self.pages = pages
2064
+ # Store the sequence as-is to preserve lazy behavior
2065
+ # Only convert to a list when the input lacks `__iter__` or `__len__` (e.g. a generator)
2066
+ if hasattr(pages, '__iter__') and hasattr(pages, '__len__'):
2067
+ self.pages = pages
2068
+ else:
2069
+ # Fallback for non-sequence types
2070
+ self.pages = list(pages)
2062
2071
 
2063
2072
  def __len__(self) -> int:
2064
2073
  """Return the number of pages in the collection."""
@@ -2078,6 +2087,31 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2078
2087
  """Return a string representation showing the page count."""
2079
2088
  return f"<PageCollection(count={len(self)})>"
2080
2089
 
2090
+ def _get_items_for_apply(self) -> Iterator[P]:
2091
+ """
2092
+ Override ApplyMixin's _get_items_for_apply to preserve lazy behavior.
2093
+
2094
+ Returns an iterator that yields pages on-demand rather than materializing
2095
+ all pages at once, maintaining the lazy loading behavior.
2096
+ """
2097
+ return iter(self.pages)
2098
+
2099
+ def _get_page_indices(self) -> List[int]:
2100
+ """
2101
+ Get page indices without forcing materialization of pages.
2102
+
2103
+ Returns:
2104
+ List of page indices for the pages in this collection.
2105
+ """
2106
+ # Handle different types of page sequences efficiently
2107
+ if hasattr(self.pages, '_indices'):
2108
+ # If it's a _LazyPageList (or slice), get indices directly
2109
+ return list(self.pages._indices)
2110
+ else:
2111
+ # Fallback: the sequence exposes no `_indices`, so read `.index` from each page.
2112
+ # Iterating accesses every page, which only costs extra work if the sequence is lazy.
2113
+ return [p.index for p in self.pages]
2114
+
2081
2115
  def extract_text(
2082
2116
  self,
2083
2117
  keep_blank_chars: bool = True,
@@ -2172,7 +2206,7 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2172
2206
  raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")
2173
2207
 
2174
2208
  # Get the 0-based indices of the pages in this collection
2175
- page_indices = [p.index for p in self.pages]
2209
+ page_indices = self._get_page_indices()
2176
2210
 
2177
2211
  logger.info(f"Applying OCR via parent PDF to page indices: {page_indices} in collection.")
2178
2212
 
@@ -2332,22 +2366,24 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2332
2366
 
2333
2367
  return ElementCollection(all_elements)
2334
2368
 
2335
- def correct_ocr(
2369
+ def update_text(
2336
2370
  self,
2337
- correction_callback: Callable[[Any], Optional[str]],
2371
+ transform: Callable[[Any], Optional[str]],
2372
+ selector: str = "text",
2338
2373
  max_workers: Optional[int] = None,
2339
2374
  ) -> "PageCollection[P]":
2340
2375
  """
2341
- Applies corrections to OCR-generated text elements across all pages
2376
+ Applies corrections to text elements across all pages
2342
2377
  in this collection using a user-provided callback function, executed
2343
2378
  in parallel if `max_workers` is specified.
2344
2379
 
2345
- This method delegates to the parent PDF's `correct_ocr` method,
2380
+ This method delegates to the parent PDF's `update_text` method,
2346
2381
  targeting all pages within this collection.
2347
2382
 
2348
2383
  Args:
2349
- correction_callback: A function that accepts a single argument (an element
2350
- object) and returns `Optional[str]` (new text or None).
2384
+ transform: A function that accepts a single argument (an element
2385
+ object) and returns `Optional[str]` (new text or None).
2386
+ selector: Selector string forwarded to the parent PDF's `update_text` to choose which elements are updated. Default is 'text'.
2351
2387
  max_workers: The maximum number of worker threads to use for parallel
2352
2388
  correction on each page. If None, defaults are used.
2353
2389
 
@@ -2356,10 +2392,10 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2356
2392
 
2357
2393
  Raises:
2358
2394
  RuntimeError: If the collection is empty, pages lack a parent PDF reference,
2359
- or the parent PDF lacks the `correct_ocr` method.
2395
+ or the parent PDF lacks the `update_text` method.
2360
2396
  """
2361
2397
  if not self.pages:
2362
- logger.warning("Cannot correct OCR for an empty PageCollection.")
2398
+ logger.warning("Cannot update text for an empty PageCollection.")
2363
2399
  # Return self even if empty to maintain chaining consistency
2364
2400
  return self
2365
2401
 
@@ -2367,24 +2403,25 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2367
2403
  parent_pdf = self.pages[0]._parent
2368
2404
  if (
2369
2405
  not parent_pdf
2370
- or not hasattr(parent_pdf, "correct_ocr")
2371
- or not callable(parent_pdf.correct_ocr)
2406
+ or not hasattr(parent_pdf, "update_text")
2407
+ or not callable(parent_pdf.update_text)
2372
2408
  ):
2373
2409
  raise RuntimeError(
2374
- "Parent PDF reference not found or parent PDF lacks the required 'correct_ocr' method."
2410
+ "Parent PDF reference not found or parent PDF lacks the required 'update_text' method."
2375
2411
  )
2376
2412
 
2377
- page_indices = [p.index for p in self.pages]
2413
+ page_indices = self._get_page_indices()
2378
2414
  logger.info(
2379
- f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices} with max_workers={max_workers}."
2415
+ f"PageCollection: Delegating text update to parent PDF for page indices: {page_indices} with max_workers={max_workers} and selector='{selector}'."
2380
2416
  )
2381
2417
 
2382
2418
  # Delegate the call to the parent PDF object for the relevant pages
2383
2419
  # Pass the max_workers parameter down
2384
- parent_pdf.correct_ocr(
2385
- correction_callback=correction_callback,
2420
+ parent_pdf.update_text(
2421
+ transform=transform,
2386
2422
  pages=page_indices,
2387
- max_workers=max_workers, # Pass it here
2423
+ selector=selector,
2424
+ max_workers=max_workers,
2388
2425
  )
2389
2426
 
2390
2427
  return self
@@ -2400,13 +2437,19 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2400
2437
  Extract sections from a page collection based on start/end elements.
2401
2438
 
2402
2439
  Args:
2403
- start_elements: Elements or selector string that mark the start of sections
2404
- end_elements: Elements or selector string that mark the end of sections
2440
+ start_elements: Elements or selector string that mark the start of sections (optional)
2441
+ end_elements: Elements or selector string that mark the end of sections (optional)
2405
2442
  new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
2406
2443
  boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
2407
2444
 
2408
2445
  Returns:
2409
2446
  List of Region objects representing the extracted sections
2447
+
2448
+ Note:
2449
+ You can provide only start_elements, only end_elements, or both.
2450
+ - With only start_elements: sections go from each start to the next start (or end of page)
2451
+ - With only end_elements: sections go from beginning of document/page to each end
2452
+ - With both: sections go from each start to the corresponding end
2410
2453
  """
2411
2454
  # Find start and end elements across all pages
2412
2455
  if isinstance(start_elements, str):
@@ -2415,8 +2458,8 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2415
2458
  if isinstance(end_elements, str):
2416
2459
  end_elements = self.find_all(end_elements).elements
2417
2460
 
2418
- # If no start elements, return empty list
2419
- if not start_elements:
2461
+ # If no start elements and no end elements, return empty list
2462
+ if not start_elements and not end_elements:
2420
2463
  return []
2421
2464
 
2422
2465
  # If there are page break boundaries, we'll need to add them
@@ -2451,6 +2494,26 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2451
2494
  # Sort by page index, then vertical position, then horizontal position
2452
2495
  all_elements.sort(key=lambda e: (e.page.index, e.top, e.x0))
2453
2496
 
2497
+ # If we only have end_elements (no start_elements), create implicit start elements
2498
+ if not start_elements and end_elements:
2499
+ from natural_pdf.elements.region import Region
2500
+
2501
+ start_elements = []
2502
+
2503
+ # Add implicit start at the beginning of the first page
2504
+ first_page = self.pages[0]
2505
+ first_start = Region(first_page, (0, 0, first_page.width, 1))
2506
+ first_start.is_implicit_start = True
2507
+ start_elements.append(first_start)
2508
+
2509
+ # For each end element (except the last), add an implicit start after it
2510
+ sorted_end_elements = sorted(end_elements, key=lambda e: (e.page.index, e.top, e.x0))
2511
+ for i, end_elem in enumerate(sorted_end_elements[:-1]): # Exclude last end element
2512
+ # Create implicit start element right after this end element
2513
+ implicit_start = Region(end_elem.page, (0, end_elem.bottom, end_elem.page.width, end_elem.bottom + 1))
2514
+ implicit_start.is_implicit_start = True
2515
+ start_elements.append(implicit_start)
2516
+
2454
2517
  # Mark section boundaries
2455
2518
  section_boundaries = []
2456
2519
 
@@ -2476,6 +2539,16 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2476
2539
  "page_idx": element.page.index,
2477
2540
  }
2478
2541
  )
2542
+ elif hasattr(element, "is_implicit_start") and element.is_implicit_start:
2543
+ # This is an implicit start element
2544
+ section_boundaries.append(
2545
+ {
2546
+ "index": -2, # Special index for implicit starts
2547
+ "element": element,
2548
+ "type": "start",
2549
+ "page_idx": element.page.index,
2550
+ }
2551
+ )
2479
2552
 
2480
2553
  # Add end element boundaries if provided
2481
2554
  if end_elements:
@@ -2502,12 +2575,20 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2502
2575
  )
2503
2576
 
2504
2577
  # Sort boundaries by page index, then by actual document position
2505
- section_boundaries.sort(
2506
- key=lambda x: (
2507
- x["page_idx"],
2508
- x["index"] if x["index"] != -1 else (0 if x["type"] == "start" else float("inf")),
2509
- )
2510
- )
2578
+ def _sort_key(boundary):
2579
+ """Sort boundaries by (page_idx, vertical_top, priority)."""
2580
+ page_idx = boundary["page_idx"]
2581
+ element = boundary["element"]
2582
+
2583
+ # Vertical position on the page
2584
+ y_pos = getattr(element, "top", 0.0)
2585
+
2586
+ # Ensure starts come before ends at the same coordinate
2587
+ priority = 0 if boundary["type"] == "start" else 1
2588
+
2589
+ return (page_idx, y_pos, priority)
2590
+
2591
+ section_boundaries.sort(key=_sort_key)
2511
2592
 
2512
2593
  # Generate sections
2513
2594
  sections = []
@@ -2527,8 +2608,13 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2527
2608
  end_pg = end_el.page if end_el is not None else self.pages[-1]
2528
2609
 
2529
2610
  parts: list[Region] = []
2530
- # Slice of first page
2531
- parts.append(Region(start_pg, (0, start_el.top, start_pg.width, start_pg.height)))
2611
+
2612
+ # Use the actual top of the start element (for implicit starts this is
2613
+ # the bottom of the previous end element) instead of forcing to 0.
2614
+ start_top = start_el.top
2615
+
2616
+ # Slice of first page beginning at *start_top*
2617
+ parts.append(Region(start_pg, (0, start_top, start_pg.width, start_pg.height)))
2532
2618
 
2533
2619
  # Full middle pages
2534
2620
  for pg_idx in range(start_pg.index + 1, end_pg.index):
@@ -2566,9 +2652,19 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2566
2652
 
2567
2653
  # If both elements are on the same page, use the page's get_section_between
2568
2654
  if start_element.page == end_element.page:
2569
- section = start_element.page.get_section_between(
2570
- start_element, end_element, boundary_inclusion
2571
- )
2655
+ # For implicit start elements, build the region directly: from the implicit start's top down to the end element's bottom
2656
+ if hasattr(start_element, "is_implicit_start"):
2657
+ from natural_pdf.elements.region import Region
2658
+ section = Region(
2659
+ start_element.page,
2660
+ (0, start_element.top, start_element.page.width, end_element.bottom)
2661
+ )
2662
+ section.start_element = start_element
2663
+ section.boundary_element_found = end_element
2664
+ else:
2665
+ section = start_element.page.get_section_between(
2666
+ start_element, end_element, boundary_inclusion
2667
+ )
2572
2668
  sections.append(section)
2573
2669
  else:
2574
2670
  # Create FlowRegion spanning pages
@@ -2607,9 +2703,11 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2607
2703
  from natural_pdf.elements.region import Region
2608
2704
 
2609
2705
  start_page = start_element.page
2610
-
2706
+
2707
+ # Implicit starts need no special case: their `top` already marks where the section begins
2708
+ start_top = start_element.top
2611
2709
  region = Region(
2612
- start_page, (0, start_element.top, start_page.width, start_page.height)
2710
+ start_page, (0, start_top, start_page.width, start_page.height)
2613
2711
  )
2614
2712
  region.start_element = start_element
2615
2713
  sections.append(region)
@@ -2636,8 +2734,10 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2636
2734
  # With start_elements only, create a section to the end of the current page
2637
2735
  from natural_pdf.elements.region import Region
2638
2736
 
2737
+ # Implicit starts need no special case: their `top` already marks where the section begins
2738
+ start_top = start_element.top
2639
2739
  region = Region(
2640
- start_page, (0, start_element.top, start_page.width, start_page.height)
2740
+ start_page, (0, start_top, start_page.width, start_page.height)
2641
2741
  )
2642
2742
  region.start_element = start_element
2643
2743
  sections.append(region)
@@ -2800,7 +2900,7 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2800
2900
  )
2801
2901
 
2802
2902
  # Get the 0-based indices of the pages in this collection
2803
- page_indices = [p.index for p in self.pages]
2903
+ page_indices = self._get_page_indices()
2804
2904
  logger.info(
2805
2905
  f"PageCollection: Delegating deskew to parent PDF for page indices: {page_indices}"
2806
2906
  )
@@ -3150,6 +3250,61 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
3150
3250
  raise e # Keep the original exception type (ValueError, RuntimeError, etc.)
3151
3251
  # <--- END MODIFIED
3152
3252
 
3253
+ def to_flow(
3254
+ self,
3255
+ arrangement: Literal["vertical", "horizontal"] = "vertical",
3256
+ alignment: Literal["start", "center", "end", "top", "left", "bottom", "right"] = "start",
3257
+ segment_gap: float = 0.0,
3258
+ ) -> "Flow":
3259
+ """
3260
+ Convert this PageCollection to a Flow for cross-page operations.
3261
+
3262
+ This enables treating multiple pages as a continuous logical document
3263
+ structure, useful for multi-page tables, articles spanning columns,
3264
+ or any content requiring reading order across page boundaries.
3265
+
3266
+ Args:
3267
+ arrangement: Primary flow direction ('vertical' or 'horizontal').
3268
+ 'vertical' stacks pages top-to-bottom (most common).
3269
+ 'horizontal' arranges pages left-to-right.
3270
+ alignment: Cross-axis alignment for pages of different sizes:
3271
+ For vertical: 'left'/'start', 'center', 'right'/'end'
3272
+ For horizontal: 'top'/'start', 'center', 'bottom'/'end'
3273
+ segment_gap: Virtual gap between pages in PDF points (default: 0.0).
3274
+
3275
+ Returns:
3276
+ Flow object that can perform operations across all pages in sequence.
3277
+
3278
+ Example:
3279
+ Multi-page table extraction:
3280
+ ```python
3281
+ pdf = npdf.PDF("multi_page_report.pdf")
3282
+
3283
+ # Create flow for pages 2-4 containing a table
3284
+ table_flow = pdf.pages[1:4].to_flow()
3285
+
3286
+ # Extract table as if it were continuous
3287
+ table_data = table_flow.extract_table()
3288
+ df = table_data.df
3289
+ ```
3290
+
3291
+ Cross-page element search:
3292
+ ```python
3293
+ # Find all headers across multiple pages
3294
+ headers = pdf.pages[5:10].to_flow().find_all('text[size>12]:bold')
3295
+
3296
+ # Analyze layout across pages
3297
+ regions = pdf.pages.to_flow().analyze_layout(engine='yolo')
3298
+ ```
3299
+ """
3300
+ from natural_pdf.flows.flow import Flow
3301
+ return Flow(
3302
+ segments=self, # Flow constructor now handles PageCollection
3303
+ arrangement=arrangement,
3304
+ alignment=alignment,
3305
+ segment_gap=segment_gap,
3306
+ )
3307
+
3153
3308
  # Alias .to_image() to .show() for convenience
3154
3309
  def show(
3155
3310
  self,