natural-pdf 0.1.38__py3-none-any.whl → 0.1.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,6 +11,7 @@ from typing import (
  Iterable,
  Iterator,
  List,
+ Literal,
  Optional,
  Sequence,
  Tuple,
@@ -40,6 +41,7 @@ from natural_pdf.export.mixin import ExportMixin
  from natural_pdf.ocr import OCROptions
  from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
+ from natural_pdf.text_mixin import TextMixin

  # Potentially lazy imports for optional dependencies needed in save_pdf
  try:
@@ -66,6 +68,7 @@ if TYPE_CHECKING:
  from natural_pdf.core.pdf import PDF # ---> ADDED PDF type hint
  from natural_pdf.elements.region import Region
  from natural_pdf.elements.text import TextElement # Ensure TextElement is imported
+ from natural_pdf.flows.flow import Flow

  T = TypeVar("T")
  P = TypeVar("P", bound="Page")
@@ -1416,7 +1419,7 @@ class ElementCollection(

  def correct_ocr(
  self,
- correction_callback: Callable[[Any], Optional[str]],
+ transform: Callable[[Any], Optional[str]],
  max_workers: Optional[int] = None,
  ) -> "ElementCollection":
  """
@@ -1425,10 +1428,10 @@ class ElementCollection(
  in parallel if `max_workers` is specified.

  Iterates through elements currently in the collection. If an element's
- 'source' attribute starts with 'ocr', it calls the `correction_callback`
+ 'source' attribute starts with 'ocr', it calls the `transform`
  for that element, passing the element itself.

- The `correction_callback` should contain the logic to:
+ The `transform` should contain the logic to:
  1. Determine if the element needs correction.
  2. Perform the correction (e.g., call an LLM).
  3. Return the new text (`str`) or `None`.
@@ -1438,8 +1441,8 @@ class ElementCollection(
  Elements without a source starting with 'ocr' are skipped.

  Args:
- correction_callback: A function accepting an element and returning
- `Optional[str]` (new text or None).
+ transform: A function accepting an element and returning
+ `Optional[str]` (new text or None).
  max_workers: The maximum number of worker threads to use for parallel
  correction on each page. If None, defaults are used.

@@ -1449,7 +1452,7 @@ class ElementCollection(
  # Delegate to the utility function
  _apply_ocr_correction_to_elements(
  elements=self._elements,
- correction_callback=correction_callback,
+ correction_callback=transform,
  caller_info=f"ElementCollection(len={len(self._elements)})", # Pass caller info
  max_workers=max_workers,
  )
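For reference, a minimal usage sketch of the renamed callback parameter: `ElementCollection.correct_ocr` now accepts `transform` in place of `correction_callback`. The file name and the correction logic below are illustrative assumptions, not part of the diff.

```python
import natural_pdf as npdf

pdf = npdf.PDF("scanned_report.pdf")  # hypothetical file

def transform(element):
    # Illustrative fix-up: return corrected text, or None to leave the element as-is.
    return element.text.replace("0ffice", "Office") if "0ffice" in element.text else None

# Only elements whose 'source' starts with 'ocr' are passed to the callback.
pdf.pages[0].find_all("text[source=ocr]").correct_ocr(transform, max_workers=4)
```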
@@ -2045,7 +2048,7 @@ class ElementCollection(
  # ------------------------------------------------------------------


- class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
+ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin):
  """
  Represents a collection of Page objects, often from a single PDF document.
  Provides methods for batch operations on these pages.
@@ -2363,22 +2366,24 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):

  return ElementCollection(all_elements)

- def correct_ocr(
+ def update_text(
  self,
- correction_callback: Callable[[Any], Optional[str]],
+ transform: Callable[[Any], Optional[str]],
+ selector: str = "text",
  max_workers: Optional[int] = None,
  ) -> "PageCollection[P]":
  """
- Applies corrections to OCR-generated text elements across all pages
+ Applies corrections to text elements across all pages
  in this collection using a user-provided callback function, executed
  in parallel if `max_workers` is specified.

- This method delegates to the parent PDF's `correct_ocr` method,
+ This method delegates to the parent PDF's `update_text` method,
  targeting all pages within this collection.

  Args:
- correction_callback: A function that accepts a single argument (an element
- object) and returns `Optional[str]` (new text or None).
+ transform: A function that accepts a single argument (an element
+ object) and returns `Optional[str]` (new text or None).
+ selector: The attribute name to update. Default is 'text'.
  max_workers: The maximum number of worker threads to use for parallel
  correction on each page. If None, defaults are used.

@@ -2387,10 +2392,10 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):

  Raises:
  RuntimeError: If the collection is empty, pages lack a parent PDF reference,
- or the parent PDF lacks the `correct_ocr` method.
+ or the parent PDF lacks the `update_text` method.
  """
  if not self.pages:
- logger.warning("Cannot correct OCR for an empty PageCollection.")
+ logger.warning("Cannot update text for an empty PageCollection.")
  # Return self even if empty to maintain chaining consistency
  return self

@@ -2398,24 +2403,25 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
  parent_pdf = self.pages[0]._parent
  if (
  not parent_pdf
- or not hasattr(parent_pdf, "correct_ocr")
- or not callable(parent_pdf.correct_ocr)
+ or not hasattr(parent_pdf, "update_text")
+ or not callable(parent_pdf.update_text)
  ):
  raise RuntimeError(
- "Parent PDF reference not found or parent PDF lacks the required 'correct_ocr' method."
+ "Parent PDF reference not found or parent PDF lacks the required 'update_text' method."
  )

  page_indices = self._get_page_indices()
  logger.info(
- f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices} with max_workers={max_workers}."
+ f"PageCollection: Delegating text update to parent PDF for page indices: {page_indices} with max_workers={max_workers} and selector='{selector}'."
  )

  # Delegate the call to the parent PDF object for the relevant pages
  # Pass the max_workers parameter down
- parent_pdf.correct_ocr(
- correction_callback=correction_callback,
+ parent_pdf.update_text(
+ transform=transform,
  pages=page_indices,
- max_workers=max_workers, # Pass it here
+ selector=selector,
+ max_workers=max_workers,
  )

  return self
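A minimal usage sketch for the new `PageCollection.update_text`, using the `npdf` alias that the library's own docstring examples use. The file name, selector, and transform logic are illustrative assumptions.

```python
import natural_pdf as npdf

pdf = npdf.PDF("annual_report.pdf")  # hypothetical file

def transform(element):
    # Illustrative cleanup: collapse runs of whitespace; returning None keeps the text.
    cleaned = " ".join(element.text.split())
    return cleaned if cleaned != element.text else None

# Slicing pdf.pages yields a PageCollection; restrict the update to OCR text
# on the first three pages and let the parent PDF fan the work out.
pdf.pages[0:3].update_text(transform, selector="text[source=ocr]", max_workers=4)
```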
@@ -2431,13 +2437,19 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
  Extract sections from a page collection based on start/end elements.

  Args:
- start_elements: Elements or selector string that mark the start of sections
- end_elements: Elements or selector string that mark the end of sections
+ start_elements: Elements or selector string that mark the start of sections (optional)
+ end_elements: Elements or selector string that mark the end of sections (optional)
  new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
  boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')

  Returns:
  List of Region objects representing the extracted sections
+
+ Note:
+ You can provide only start_elements, only end_elements, or both.
+ - With only start_elements: sections go from each start to the next start (or end of page)
+ - With only end_elements: sections go from beginning of document/page to each end
+ - With both: sections go from each start to the corresponding end
  """
  # Find start and end elements across all pages
  if isinstance(start_elements, str):
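Building on the Note above, a hedged sketch of the end-markers-only case. It assumes this docstring belongs to `PageCollection.get_sections`; the file name and selector are illustrative.

```python
import natural_pdf as npdf

pdf = npdf.PDF("meeting_minutes.pdf")  # hypothetical file

# With only end_elements, each section runs from the start of the document
# (or from just below the previous end marker) down to the end marker.
sections = pdf.pages.get_sections(
    end_elements='text:contains("Adjourned")',
    boundary_inclusion="end",
)
for section in sections:
    print(section.extract_text()[:80])
```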
@@ -2446,8 +2458,8 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
  if isinstance(end_elements, str):
  end_elements = self.find_all(end_elements).elements

- # If no start elements, return empty list
- if not start_elements:
+ # If no start elements and no end elements, return empty list
+ if not start_elements and not end_elements:
  return []

  # If there are page break boundaries, we'll need to add them
@@ -2482,6 +2494,26 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
  # Sort by page index, then vertical position, then horizontal position
  all_elements.sort(key=lambda e: (e.page.index, e.top, e.x0))

+ # If we only have end_elements (no start_elements), create implicit start elements
+ if not start_elements and end_elements:
+ from natural_pdf.elements.region import Region
+
+ start_elements = []
+
+ # Add implicit start at the beginning of the first page
+ first_page = self.pages[0]
+ first_start = Region(first_page, (0, 0, first_page.width, 1))
+ first_start.is_implicit_start = True
+ start_elements.append(first_start)
+
+ # For each end element (except the last), add an implicit start after it
+ sorted_end_elements = sorted(end_elements, key=lambda e: (e.page.index, e.top, e.x0))
+ for i, end_elem in enumerate(sorted_end_elements[:-1]): # Exclude last end element
+ # Create implicit start element right after this end element
+ implicit_start = Region(end_elem.page, (0, end_elem.bottom, end_elem.page.width, end_elem.bottom + 1))
+ implicit_start.is_implicit_start = True
+ start_elements.append(implicit_start)
+
  # Mark section boundaries
  section_boundaries = []

@@ -2507,6 +2539,16 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
  "page_idx": element.page.index,
  }
  )
+ elif hasattr(element, "is_implicit_start") and element.is_implicit_start:
+ # This is an implicit start element
+ section_boundaries.append(
+ {
+ "index": -2, # Special index for implicit starts
+ "element": element,
+ "type": "start",
+ "page_idx": element.page.index,
+ }
+ )

  # Add end element boundaries if provided
  if end_elements:
@@ -2533,12 +2575,20 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
  )

  # Sort boundaries by page index, then by actual document position
- section_boundaries.sort(
- key=lambda x: (
- x["page_idx"],
- x["index"] if x["index"] != -1 else (0 if x["type"] == "start" else float("inf")),
- )
- )
+ def _sort_key(boundary):
+ """Sort boundaries by (page_idx, vertical_top, priority)."""
+ page_idx = boundary["page_idx"]
+ element = boundary["element"]
+
+ # Vertical position on the page
+ y_pos = getattr(element, "top", 0.0)
+
+ # Ensure starts come before ends at the same coordinate
+ priority = 0 if boundary["type"] == "start" else 1
+
+ return (page_idx, y_pos, priority)
+
+ section_boundaries.sort(key=_sort_key)

  # Generate sections
  sections = []
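A standalone sketch of the ordering `_sort_key` produces, simplified to plain dictionaries with a flat `top` value instead of real elements: boundaries sort by page, then vertical position, with starts before ends at the same coordinate.

```python
boundaries = [
    {"type": "end",   "page_idx": 0, "top": 500.0},
    {"type": "start", "page_idx": 0, "top": 500.0},
    {"type": "start", "page_idx": 0, "top": 120.0},
    {"type": "end",   "page_idx": 1, "top": 300.0},
]

def sort_key(b):
    # Starts (0) sort ahead of ends (1) when page and top are equal.
    return (b["page_idx"], b["top"], 0 if b["type"] == "start" else 1)

boundaries.sort(key=sort_key)
print([(b["page_idx"], b["top"], b["type"]) for b in boundaries])
# [(0, 120.0, 'start'), (0, 500.0, 'start'), (0, 500.0, 'end'), (1, 300.0, 'end')]
```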
@@ -2558,8 +2608,13 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
  end_pg = end_el.page if end_el is not None else self.pages[-1]

  parts: list[Region] = []
- # Slice of first page
- parts.append(Region(start_pg, (0, start_el.top, start_pg.width, start_pg.height)))
+
+ # Use the actual top of the start element (for implicit starts this is
+ # the bottom of the previous end element) instead of forcing to 0.
+ start_top = start_el.top
+
+ # Slice of first page beginning at *start_top*
+ parts.append(Region(start_pg, (0, start_top, start_pg.width, start_pg.height)))

  # Full middle pages
  for pg_idx in range(start_pg.index + 1, end_pg.index):
@@ -2597,9 +2652,19 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):

  # If both elements are on the same page, use the page's get_section_between
  if start_element.page == end_element.page:
- section = start_element.page.get_section_between(
- start_element, end_element, boundary_inclusion
- )
+ # For implicit start elements, create a region from the top of the page
+ if hasattr(start_element, "is_implicit_start"):
+ from natural_pdf.elements.region import Region
+ section = Region(
+ start_element.page,
+ (0, start_element.top, start_element.page.width, end_element.bottom)
+ )
+ section.start_element = start_element
+ section.boundary_element_found = end_element
+ else:
+ section = start_element.page.get_section_between(
+ start_element, end_element, boundary_inclusion
+ )
  sections.append(section)
  else:
  # Create FlowRegion spanning pages
@@ -2638,9 +2703,11 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
  from natural_pdf.elements.region import Region

  start_page = start_element.page
-
+
+ # Handle implicit start elements
+ start_top = start_element.top
  region = Region(
- start_page, (0, start_element.top, start_page.width, start_page.height)
+ start_page, (0, start_top, start_page.width, start_page.height)
  )
  region.start_element = start_element
  sections.append(region)
@@ -2667,8 +2734,10 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
  # With start_elements only, create a section to the end of the current page
  from natural_pdf.elements.region import Region

+ # Handle implicit start elements
+ start_top = start_element.top
  region = Region(
- start_page, (0, start_element.top, start_page.width, start_page.height)
+ start_page, (0, start_top, start_page.width, start_page.height)
  )
  region.start_element = start_element
  sections.append(region)
@@ -3181,6 +3250,61 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
  raise e # Keep the original exception type (ValueError, RuntimeError, etc.)
  # <--- END MODIFIED

+ def to_flow(
+ self,
+ arrangement: Literal["vertical", "horizontal"] = "vertical",
+ alignment: Literal["start", "center", "end", "top", "left", "bottom", "right"] = "start",
+ segment_gap: float = 0.0,
+ ) -> "Flow":
+ """
+ Convert this PageCollection to a Flow for cross-page operations.
+
+ This enables treating multiple pages as a continuous logical document
+ structure, useful for multi-page tables, articles spanning columns,
+ or any content requiring reading order across page boundaries.
+
+ Args:
+ arrangement: Primary flow direction ('vertical' or 'horizontal').
+ 'vertical' stacks pages top-to-bottom (most common).
+ 'horizontal' arranges pages left-to-right.
+ alignment: Cross-axis alignment for pages of different sizes:
+ For vertical: 'left'/'start', 'center', 'right'/'end'
+ For horizontal: 'top'/'start', 'center', 'bottom'/'end'
+ segment_gap: Virtual gap between pages in PDF points (default: 0.0).
+
+ Returns:
+ Flow object that can perform operations across all pages in sequence.
+
+ Example:
+ Multi-page table extraction:
+ ```python
+ pdf = npdf.PDF("multi_page_report.pdf")
+
+ # Create flow for pages 2-4 containing a table
+ table_flow = pdf.pages[1:4].to_flow()
+
+ # Extract table as if it were continuous
+ table_data = table_flow.extract_table()
+ df = table_data.df
+ ```
+
+ Cross-page element search:
+ ```python
+ # Find all headers across multiple pages
+ headers = pdf.pages[5:10].to_flow().find_all('text[size>12]:bold')
+
+ # Analyze layout across pages
+ regions = pdf.pages.to_flow().analyze_layout(engine='yolo')
+ ```
+ """
+ from natural_pdf.flows.flow import Flow
+ return Flow(
+ segments=self, # Flow constructor now handles PageCollection
+ arrangement=arrangement,
+ alignment=alignment,
+ segment_gap=segment_gap,
+ )
+
  # Alias .to_image() to .show() for convenience
  def show(
  self,
@@ -21,6 +21,7 @@ from natural_pdf.elements.text import TextElement # ADDED IMPORT
  from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
  from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
+ from natural_pdf.text_mixin import TextMixin

  # ------------------------------------------------------------------
  # Table utilities
@@ -56,7 +57,12 @@ logger = logging.getLogger(__name__)


  class Region(
- DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin
+ TextMixin,
+ DirectionalMixin,
+ ClassificationMixin,
+ ExtractionMixin,
+ ShapeDetectionMixin,
+ DescribeMixin,
  ):
  """Represents a rectangular region on a page.

@@ -1610,8 +1616,47 @@ class Region(
  table_settings.setdefault("join_x_tolerance", join)
  table_settings.setdefault("join_y_tolerance", join)

- # Create a crop of the page for this region
- cropped = self.page._page.crop(self.bbox)
+ # -------------------------------------------------------------
+ # Apply char-level exclusion filtering, if any exclusions are
+ # defined on the parent Page. We create a lightweight
+ # pdfplumber.Page copy whose .chars list omits characters that
+ # fall inside any exclusion Region. Other object types are
+ # left untouched for now ("chars-only" strategy).
+ # -------------------------------------------------------------
+ base_plumber_page = self.page._page
+
+ if getattr(self.page, "_exclusions", None):
+ # Resolve exclusion Regions (callables already evaluated)
+ exclusion_regions = self.page._get_exclusion_regions(include_callable=True)
+
+ def _keep_char(obj):
+ """Return True if pdfplumber obj should be kept."""
+ if obj.get("object_type") != "char":
+ # Keep non-char objects unchanged – lattice grids etc.
+ return True
+
+ # Compute character centre point
+ cx = (obj["x0"] + obj["x1"]) / 2.0
+ cy = (obj["top"] + obj["bottom"]) / 2.0
+
+ # Reject if the centre lies inside ANY exclusion Region
+ for reg in exclusion_regions:
+ if reg.x0 <= cx <= reg.x1 and reg.top <= cy <= reg.bottom:
+ return False
+ return True
+
+ try:
+ filtered_page = base_plumber_page.filter(_keep_char)
+ except Exception as _filter_err:
+ # Fallback – if filtering fails, log and proceed unfiltered
+ logger.warning(
+ f"Region {self.bbox}: Failed to filter pdfplumber chars for exclusions: {_filter_err}"
+ )
+ filtered_page = base_plumber_page
+ else:
+ filtered_page = base_plumber_page
+
+ cropped = filtered_page.crop(self.bbox)

  # Extract all tables from the cropped area
  tables = cropped.extract_tables(table_settings)
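The chars-only strategy above rests on pdfplumber's public `Page.filter` API. Below is a standalone sketch of the same idea outside natural-pdf; the file name and the 60-point header band are assumptions for illustration.

```python
import pdfplumber

with pdfplumber.open("report.pdf") as pdf:  # hypothetical file
    page = pdf.pages[0]
    header_bottom = 60  # pretend everything above y=60 is a running header

    def keep(obj):
        # Drop only characters whose centre falls in the header band;
        # keep rects/lines so lattice table detection still works.
        if obj.get("object_type") != "char":
            return True
        cy = (obj["top"] + obj["bottom"]) / 2.0
        return cy > header_bottom

    filtered = page.filter(keep)        # returns a filtered view of the page
    tables = filtered.extract_tables()  # header text no longer leaks into cells
    print(len(tables))
```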
@@ -1672,8 +1717,38 @@ class Region(
  if y_tol is not None:
  table_settings.setdefault("text_y_tolerance", y_tol)

- # Create a crop of the page for this region
- cropped = self.page._page.crop(self.bbox)
+ # -------------------------------------------------------------
+ # Apply char-level exclusion filtering (chars only) just like in
+ # _extract_tables_plumber so header/footer text does not appear
+ # in extracted tables.
+ # -------------------------------------------------------------
+ base_plumber_page = self.page._page
+
+ if getattr(self.page, "_exclusions", None):
+ exclusion_regions = self.page._get_exclusion_regions(include_callable=True)
+
+ def _keep_char(obj):
+ if obj.get("object_type") != "char":
+ return True
+ cx = (obj["x0"] + obj["x1"]) / 2.0
+ cy = (obj["top"] + obj["bottom"]) / 2.0
+ for reg in exclusion_regions:
+ if reg.x0 <= cx <= reg.x1 and reg.top <= cy <= reg.bottom:
+ return False
+ return True
+
+ try:
+ filtered_page = base_plumber_page.filter(_keep_char)
+ except Exception as _filter_err:
+ logger.warning(
+ f"Region {self.bbox}: Failed to filter pdfplumber chars for exclusions (single table): {_filter_err}"
+ )
+ filtered_page = base_plumber_page
+ else:
+ filtered_page = base_plumber_page
+
+ # Now crop the (possibly filtered) page to the region bbox
+ cropped = filtered_page.crop(self.bbox)

  # Extract the single largest table from the cropped area
  table = cropped.extract_table(table_settings)
@@ -3007,45 +3082,20 @@ class Region(
  source_info = f" source='{self.source}'" if self.source else ""
  return f"<Region{name_info}{type_info}{source_info} bbox={self.bbox}{poly_info}>"

- def correct_ocr(
+ def update_text(
  self,
- correction_callback: Callable[[Any], Optional[str]],
- ) -> "Region": # Return self for chaining
- """
- Applies corrections to OCR-generated text elements within this region
- using a user-provided callback function.
-
- Finds text elements within this region whose 'source' attribute starts
- with 'ocr' and calls the `correction_callback` for each, passing the
- element itself.
-
- The `correction_callback` should contain the logic to:
- 1. Determine if the element needs correction.
- 2. Perform the correction (e.g., call an LLM).
- 3. Return the new text (`str`) or `None`.
-
- If the callback returns a string, the element's `.text` is updated.
- Metadata updates (source, confidence, etc.) should happen within the callback.
-
- Args:
- correction_callback: A function accepting an element and returning
- `Optional[str]` (new text or None).
+ transform: Callable[[Any], Optional[str]],
+ *,
+ selector: str = "text",
+ apply_exclusions: bool = False,
+ ) -> "Region":
+ """Apply *transform* to every text element matched by *selector* inside this region.

- Returns:
- Self for method chaining.
+ The heavy lifting is delegated to :py:meth:`TextMixin.update_text`; this
+ override simply ensures the search is scoped to the region.
  """
- # Find OCR elements specifically within this region
- # Note: We typically want to correct even if the element falls in an excluded area
- target_elements = self.find_all(selector="text[source=ocr]", apply_exclusions=False)

- # Delegate to the utility function
- _apply_ocr_correction_to_elements(
- elements=target_elements, # Pass the ElementCollection directly
- correction_callback=correction_callback,
- caller_info=f"Region({self.bbox})", # Pass caller info
- )
-
- return self # Return self for chaining
+ return TextMixin.update_text(self, transform, selector=selector, apply_exclusions=apply_exclusions)

  # --- Classification Mixin Implementation --- #
  def _get_classification_manager(self) -> "ClassificationManager":
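A minimal usage sketch of the slimmed-down `Region.update_text`, constructing the region directly as the diff itself does (`Region(page, bbox)`); the file name, bounding box, and transform are illustrative assumptions.

```python
import natural_pdf as npdf
from natural_pdf.elements.region import Region

pdf = npdf.PDF("invoice.pdf")  # hypothetical file
page = pdf.pages[0]
region = Region(page, (0, 0, page.width, 200))  # hypothetical top strip of the page

def transform(element):
    # Illustrative correction: collapse runs of whitespace; returning the
    # same string (or None) leaves the element effectively unchanged.
    return " ".join(element.text.split())

# Restrict the update to OCR-sourced text inside the region.
region.update_text(transform, selector="text[source=ocr]")
```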
@@ -73,6 +73,31 @@ class FlowElement:
  """Returns the physical page of the underlying element."""
  return getattr(self.physical_object, "page", None)

+ def __getattr__(self, name: str) -> Any:
+ """
+ Delegate unknown attribute access to the physical_object.
+
+ This ensures that attributes like 'type', 'region_type', 'source', 'model', etc.
+ from the physical element are accessible on the FlowElement wrapper.
+
+ Args:
+ name: The attribute name being accessed
+
+ Returns:
+ The attribute value from physical_object
+
+ Raises:
+ AttributeError: If the attribute doesn't exist on physical_object either
+ """
+ try:
+ return getattr(self.physical_object, name)
+ except AttributeError:
+ # Provide a helpful error message that mentions both FlowElement and physical_object
+ raise AttributeError(
+ f"'{type(self).__name__}' object has no attribute '{name}' "
+ f"(also not found on underlying {type(self.physical_object).__name__})"
+ )
+
  def _flow_direction(
  self,
  direction: str, # "above", "below", "left", "right"
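The delegation added to `FlowElement` is the standard `__getattr__` fallback pattern: Python only calls `__getattr__` when normal lookup on the wrapper fails, so wrapper-defined attributes still win. A toy, natural-pdf-independent illustration:

```python
class Wrapper:
    def __init__(self, physical_object):
        self.physical_object = physical_object

    def __getattr__(self, name):
        # Invoked only when the attribute is not found on the wrapper itself.
        try:
            return getattr(self.physical_object, name)
        except AttributeError:
            raise AttributeError(
                f"'{type(self).__name__}' object has no attribute '{name}' "
                f"(also not found on underlying {type(self.physical_object).__name__})"
            )

class Element:
    region_type = "table"

print(Wrapper(Element()).region_type)  # "table", read through to the wrapped element
```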