natural-pdf 0.1.38__py3-none-any.whl → 0.1.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +6 -0
- natural_pdf/core/page.py +21 -21
- natural_pdf/core/pdf.py +77 -24
- natural_pdf/elements/collections.py +164 -40
- natural_pdf/elements/region.py +90 -40
- natural_pdf/flows/element.py +25 -0
- natural_pdf/flows/flow.py +702 -20
- natural_pdf/flows/region.py +52 -4
- natural_pdf/selectors/parser.py +34 -1
- natural_pdf/text_mixin.py +97 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/RECORD +16 -15
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/top_level.txt +0 -0
natural_pdf/elements/collections.py
CHANGED
@@ -11,6 +11,7 @@ from typing import (
     Iterable,
     Iterator,
     List,
+    Literal,
     Optional,
     Sequence,
     Tuple,
@@ -40,6 +41,7 @@ from natural_pdf.export.mixin import ExportMixin
 from natural_pdf.ocr import OCROptions
 from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
 from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
+from natural_pdf.text_mixin import TextMixin
 
 # Potentially lazy imports for optional dependencies needed in save_pdf
 try:
@@ -66,6 +68,7 @@ if TYPE_CHECKING:
     from natural_pdf.core.pdf import PDF # ---> ADDED PDF type hint
     from natural_pdf.elements.region import Region
     from natural_pdf.elements.text import TextElement # Ensure TextElement is imported
+    from natural_pdf.flows.flow import Flow
 
 T = TypeVar("T")
 P = TypeVar("P", bound="Page")
@@ -1416,7 +1419,7 @@ class ElementCollection(
 
     def correct_ocr(
         self,
-
+        transform: Callable[[Any], Optional[str]],
         max_workers: Optional[int] = None,
     ) -> "ElementCollection":
         """
@@ -1425,10 +1428,10 @@ class ElementCollection(
         in parallel if `max_workers` is specified.
 
         Iterates through elements currently in the collection. If an element's
-        'source' attribute starts with 'ocr', it calls the `
+        'source' attribute starts with 'ocr', it calls the `transform`
         for that element, passing the element itself.
 
-        The `
+        The `transform` should contain the logic to:
         1. Determine if the element needs correction.
         2. Perform the correction (e.g., call an LLM).
         3. Return the new text (`str`) or `None`.
@@ -1438,8 +1441,8 @@ class ElementCollection(
         Elements without a source starting with 'ocr' are skipped.
 
         Args:
-
-
+            transform: A function accepting an element and returning
+                `Optional[str]` (new text or None).
             max_workers: The maximum number of worker threads to use for parallel
                 correction on each page. If None, defaults are used.
 
@@ -1449,7 +1452,7 @@ class ElementCollection(
         # Delegate to the utility function
         _apply_ocr_correction_to_elements(
             elements=self._elements,
-            correction_callback=
+            correction_callback=transform,
             caller_info=f"ElementCollection(len={len(self._elements)})", # Pass caller info
             max_workers=max_workers,
         )
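In 0.1.40 the callback parameter of `ElementCollection.correct_ocr` is named `transform` (0.1.38 expected `correction_callback`). A minimal usage sketch; the file name and the clean-up rule are illustrative, not part of the release:

```python
import natural_pdf as npdf

pdf = npdf.PDF("scanned_report.pdf")  # hypothetical input file
page = pdf.pages[0]
page.apply_ocr()  # populate text elements whose source starts with 'ocr'

def fix_common_ocr_errors(element):
    """Return corrected text, or None to leave the element unchanged."""
    text = element.text or ""
    cleaned = text.replace("|", "I")
    return cleaned if cleaned != text else None

# New keyword name in 0.1.40; returns the collection for chaining.
page.find_all("text[source=ocr]").correct_ocr(transform=fix_common_ocr_errors, max_workers=4)
```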
@@ -2045,7 +2048,7 @@ class ElementCollection(
     # ------------------------------------------------------------------
 
 
-class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
+class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin):
     """
     Represents a collection of Page objects, often from a single PDF document.
     Provides methods for batch operations on these pages.
@@ -2363,22 +2366,24 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
 
         return ElementCollection(all_elements)
 
-    def
+    def update_text(
         self,
-
+        transform: Callable[[Any], Optional[str]],
+        selector: str = "text",
         max_workers: Optional[int] = None,
     ) -> "PageCollection[P]":
         """
-        Applies corrections to
+        Applies corrections to text elements across all pages
         in this collection using a user-provided callback function, executed
         in parallel if `max_workers` is specified.
 
-        This method delegates to the parent PDF's `
+        This method delegates to the parent PDF's `update_text` method,
         targeting all pages within this collection.
 
         Args:
-
-
+            transform: A function that accepts a single argument (an element
+                object) and returns `Optional[str]` (new text or None).
+            selector: The attribute name to update. Default is 'text'.
             max_workers: The maximum number of worker threads to use for parallel
                 correction on each page. If None, defaults are used.
 
@@ -2387,10 +2392,10 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
 
         Raises:
             RuntimeError: If the collection is empty, pages lack a parent PDF reference,
-                or the parent PDF lacks the `
+                or the parent PDF lacks the `update_text` method.
         """
         if not self.pages:
-            logger.warning("Cannot
+            logger.warning("Cannot update text for an empty PageCollection.")
            # Return self even if empty to maintain chaining consistency
            return self
 
@@ -2398,24 +2403,25 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         parent_pdf = self.pages[0]._parent
         if (
             not parent_pdf
-            or not hasattr(parent_pdf, "
-            or not callable(parent_pdf.
+            or not hasattr(parent_pdf, "update_text")
+            or not callable(parent_pdf.update_text)
         ):
             raise RuntimeError(
-                "Parent PDF reference not found or parent PDF lacks the required '
+                "Parent PDF reference not found or parent PDF lacks the required 'update_text' method."
             )
 
         page_indices = self._get_page_indices()
         logger.info(
-            f"PageCollection: Delegating
+            f"PageCollection: Delegating text update to parent PDF for page indices: {page_indices} with max_workers={max_workers} and selector='{selector}'."
         )
 
         # Delegate the call to the parent PDF object for the relevant pages
         # Pass the max_workers parameter down
-        parent_pdf.
-
+        parent_pdf.update_text(
+            transform=transform,
             pages=page_indices,
-
+            selector=selector,
+            max_workers=max_workers,
         )
 
         return self
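A sketch of the new `PageCollection.update_text` call, which forwards `transform`, `selector`, and `max_workers` to the parent PDF's `update_text`; the document name and the normalisation rule are illustrative:

```python
from typing import Optional

import natural_pdf as npdf

pdf = npdf.PDF("scanned_report.pdf")  # hypothetical input file

def normalize_whitespace(element) -> Optional[str]:
    """Collapse runs of whitespace; return None to leave the element unchanged."""
    if not element.text:
        return None
    collapsed = " ".join(element.text.split())
    return collapsed if collapsed != element.text else None

# Delegates to parent_pdf.update_text(transform=..., pages=..., selector=..., max_workers=...)
pdf.pages[0:5].update_text(transform=normalize_whitespace, selector="text", max_workers=2)
```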
@@ -2431,13 +2437,19 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         Extract sections from a page collection based on start/end elements.
 
         Args:
-            start_elements: Elements or selector string that mark the start of sections
-            end_elements: Elements or selector string that mark the end of sections
+            start_elements: Elements or selector string that mark the start of sections (optional)
+            end_elements: Elements or selector string that mark the end of sections (optional)
             new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
             boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
 
         Returns:
             List of Region objects representing the extracted sections
+
+        Note:
+            You can provide only start_elements, only end_elements, or both.
+            - With only start_elements: sections go from each start to the next start (or end of page)
+            - With only end_elements: sections go from beginning of document/page to each end
+            - With both: sections go from each start to the corresponding end
         """
         # Find start and end elements across all pages
         if isinstance(start_elements, str):
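As the new docstring Note describes, `get_sections` now accepts either boundary on its own. A hedged sketch of the end-only case; the document and selector strings are illustrative:

```python
import natural_pdf as npdf

pdf = npdf.PDF("minutes.pdf")  # hypothetical document

# End-only: each section runs from the start of the document (or the previous
# end marker) down to the matching end marker.
sections = pdf.pages.get_sections(
    end_elements='text:contains("Adjourned")',
    boundary_inclusion="end",
)

# Start and end together, as before.
sections = pdf.pages.get_sections(
    start_elements='text:contains("Call to order")',
    end_elements='text:contains("Adjourned")',
)
```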
@@ -2446,8 +2458,8 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         if isinstance(end_elements, str):
             end_elements = self.find_all(end_elements).elements
 
-        # If no start elements, return empty list
-        if not start_elements:
+        # If no start elements and no end elements, return empty list
+        if not start_elements and not end_elements:
             return []
 
         # If there are page break boundaries, we'll need to add them
@@ -2482,6 +2494,26 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         # Sort by page index, then vertical position, then horizontal position
         all_elements.sort(key=lambda e: (e.page.index, e.top, e.x0))
 
+        # If we only have end_elements (no start_elements), create implicit start elements
+        if not start_elements and end_elements:
+            from natural_pdf.elements.region import Region
+
+            start_elements = []
+
+            # Add implicit start at the beginning of the first page
+            first_page = self.pages[0]
+            first_start = Region(first_page, (0, 0, first_page.width, 1))
+            first_start.is_implicit_start = True
+            start_elements.append(first_start)
+
+            # For each end element (except the last), add an implicit start after it
+            sorted_end_elements = sorted(end_elements, key=lambda e: (e.page.index, e.top, e.x0))
+            for i, end_elem in enumerate(sorted_end_elements[:-1]): # Exclude last end element
+                # Create implicit start element right after this end element
+                implicit_start = Region(end_elem.page, (0, end_elem.bottom, end_elem.page.width, end_elem.bottom + 1))
+                implicit_start.is_implicit_start = True
+                start_elements.append(implicit_start)
+
         # Mark section boundaries
         section_boundaries = []
 
@@ -2507,6 +2539,16 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                         "page_idx": element.page.index,
                     }
                 )
+            elif hasattr(element, "is_implicit_start") and element.is_implicit_start:
+                # This is an implicit start element
+                section_boundaries.append(
+                    {
+                        "index": -2, # Special index for implicit starts
+                        "element": element,
+                        "type": "start",
+                        "page_idx": element.page.index,
+                    }
+                )
 
         # Add end element boundaries if provided
         if end_elements:
@@ -2533,12 +2575,20 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                 )
 
         # Sort boundaries by page index, then by actual document position
-
-
-
-
-
-
+        def _sort_key(boundary):
+            """Sort boundaries by (page_idx, vertical_top, priority)."""
+            page_idx = boundary["page_idx"]
+            element = boundary["element"]
+
+            # Vertical position on the page
+            y_pos = getattr(element, "top", 0.0)
+
+            # Ensure starts come before ends at the same coordinate
+            priority = 0 if boundary["type"] == "start" else 1
+
+            return (page_idx, y_pos, priority)
+
+        section_boundaries.sort(key=_sort_key)
 
         # Generate sections
         sections = []
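The replacement `_sort_key` orders boundaries by page, then vertical position, with starts winning ties against ends at the same coordinate. A standalone illustration of that ordering with hand-made boundary dicts (not library code):

```python
def _sort_key(boundary):
    priority = 0 if boundary["type"] == "start" else 1
    return (boundary["page_idx"], getattr(boundary["element"], "top", 0.0), priority)

class Stub:
    def __init__(self, top):
        self.top = top

boundaries = [
    {"type": "end", "page_idx": 0, "element": Stub(120.0)},
    {"type": "start", "page_idx": 0, "element": Stub(120.0)},
    {"type": "start", "page_idx": 1, "element": Stub(40.0)},
]

boundaries.sort(key=_sort_key)
print([b["type"] for b in boundaries])  # ['start', 'end', 'start']: the start wins the tie at y=120
```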
@@ -2558,8 +2608,13 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
             end_pg = end_el.page if end_el is not None else self.pages[-1]
 
             parts: list[Region] = []
-
-
+
+            # Use the actual top of the start element (for implicit starts this is
+            # the bottom of the previous end element) instead of forcing to 0.
+            start_top = start_el.top
+
+            # Slice of first page beginning at *start_top*
+            parts.append(Region(start_pg, (0, start_top, start_pg.width, start_pg.height)))
 
             # Full middle pages
             for pg_idx in range(start_pg.index + 1, end_pg.index):
@@ -2597,9 +2652,19 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
 
             # If both elements are on the same page, use the page's get_section_between
             if start_element.page == end_element.page:
-
-
-
+                # For implicit start elements, create a region from the top of the page
+                if hasattr(start_element, "is_implicit_start"):
+                    from natural_pdf.elements.region import Region
+                    section = Region(
+                        start_element.page,
+                        (0, start_element.top, start_element.page.width, end_element.bottom)
+                    )
+                    section.start_element = start_element
+                    section.boundary_element_found = end_element
+                else:
+                    section = start_element.page.get_section_between(
+                        start_element, end_element, boundary_inclusion
+                    )
                 sections.append(section)
             else:
                 # Create FlowRegion spanning pages
@@ -2638,9 +2703,11 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                 from natural_pdf.elements.region import Region
 
                 start_page = start_element.page
-
+
+                # Handle implicit start elements
+                start_top = start_element.top
                 region = Region(
-                    start_page, (0,
+                    start_page, (0, start_top, start_page.width, start_page.height)
                 )
                 region.start_element = start_element
                 sections.append(region)
@@ -2667,8 +2734,10 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
             # With start_elements only, create a section to the end of the current page
             from natural_pdf.elements.region import Region
 
+            # Handle implicit start elements
+            start_top = start_element.top
             region = Region(
-                start_page, (0,
+                start_page, (0, start_top, start_page.width, start_page.height)
             )
             region.start_element = start_element
             sections.append(region)
@@ -3181,6 +3250,61 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
             raise e # Keep the original exception type (ValueError, RuntimeError, etc.)
         # <--- END MODIFIED
 
+    def to_flow(
+        self,
+        arrangement: Literal["vertical", "horizontal"] = "vertical",
+        alignment: Literal["start", "center", "end", "top", "left", "bottom", "right"] = "start",
+        segment_gap: float = 0.0,
+    ) -> "Flow":
+        """
+        Convert this PageCollection to a Flow for cross-page operations.
+
+        This enables treating multiple pages as a continuous logical document
+        structure, useful for multi-page tables, articles spanning columns,
+        or any content requiring reading order across page boundaries.
+
+        Args:
+            arrangement: Primary flow direction ('vertical' or 'horizontal').
+                'vertical' stacks pages top-to-bottom (most common).
+                'horizontal' arranges pages left-to-right.
+            alignment: Cross-axis alignment for pages of different sizes:
+                For vertical: 'left'/'start', 'center', 'right'/'end'
+                For horizontal: 'top'/'start', 'center', 'bottom'/'end'
+            segment_gap: Virtual gap between pages in PDF points (default: 0.0).
+
+        Returns:
+            Flow object that can perform operations across all pages in sequence.
+
+        Example:
+            Multi-page table extraction:
+            ```python
+            pdf = npdf.PDF("multi_page_report.pdf")
+
+            # Create flow for pages 2-4 containing a table
+            table_flow = pdf.pages[1:4].to_flow()
+
+            # Extract table as if it were continuous
+            table_data = table_flow.extract_table()
+            df = table_data.df
+            ```
+
+            Cross-page element search:
+            ```python
+            # Find all headers across multiple pages
+            headers = pdf.pages[5:10].to_flow().find_all('text[size>12]:bold')
+
+            # Analyze layout across pages
+            regions = pdf.pages.to_flow().analyze_layout(engine='yolo')
+            ```
+        """
+        from natural_pdf.flows.flow import Flow
+        return Flow(
+            segments=self, # Flow constructor now handles PageCollection
+            arrangement=arrangement,
+            alignment=alignment,
+            segment_gap=segment_gap,
+        )
+
     # Alias .to_image() to .show() for convenience
     def show(
         self,
natural_pdf/elements/region.py
CHANGED
@@ -21,6 +21,7 @@ from natural_pdf.elements.text import TextElement # ADDED IMPORT
 from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
 from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
 from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
+from natural_pdf.text_mixin import TextMixin
 
 # ------------------------------------------------------------------
 # Table utilities
@@ -56,7 +57,12 @@ logger = logging.getLogger(__name__)
 
 
 class Region(
-
+    TextMixin,
+    DirectionalMixin,
+    ClassificationMixin,
+    ExtractionMixin,
+    ShapeDetectionMixin,
+    DescribeMixin,
 ):
     """Represents a rectangular region on a page.
 
@@ -1610,8 +1616,47 @@ class Region(
         table_settings.setdefault("join_x_tolerance", join)
         table_settings.setdefault("join_y_tolerance", join)
 
-        #
-
+        # -------------------------------------------------------------
+        # Apply char-level exclusion filtering, if any exclusions are
+        # defined on the parent Page. We create a lightweight
+        # pdfplumber.Page copy whose .chars list omits characters that
+        # fall inside any exclusion Region. Other object types are
+        # left untouched for now ("chars-only" strategy).
+        # -------------------------------------------------------------
+        base_plumber_page = self.page._page
+
+        if getattr(self.page, "_exclusions", None):
+            # Resolve exclusion Regions (callables already evaluated)
+            exclusion_regions = self.page._get_exclusion_regions(include_callable=True)
+
+            def _keep_char(obj):
+                """Return True if pdfplumber obj should be kept."""
+                if obj.get("object_type") != "char":
+                    # Keep non-char objects unchanged – lattice grids etc.
+                    return True
+
+                # Compute character centre point
+                cx = (obj["x0"] + obj["x1"]) / 2.0
+                cy = (obj["top"] + obj["bottom"]) / 2.0
+
+                # Reject if the centre lies inside ANY exclusion Region
+                for reg in exclusion_regions:
+                    if reg.x0 <= cx <= reg.x1 and reg.top <= cy <= reg.bottom:
+                        return False
+                return True
+
+            try:
+                filtered_page = base_plumber_page.filter(_keep_char)
+            except Exception as _filter_err:
+                # Fallback – if filtering fails, log and proceed unfiltered
+                logger.warning(
+                    f"Region {self.bbox}: Failed to filter pdfplumber chars for exclusions: {_filter_err}"
+                )
+                filtered_page = base_plumber_page
+        else:
+            filtered_page = base_plumber_page
+
+        cropped = filtered_page.crop(self.bbox)
 
         # Extract all tables from the cropped area
         tables = cropped.extract_tables(table_settings)
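From the caller's side, this means characters inside page exclusions no longer leak into tables extracted from a region. A hedged sketch assuming the usual `add_exclusion` / `region` helpers; the file name, coordinates, and exclusion choice are illustrative:

```python
import natural_pdf as npdf

pdf = npdf.PDF("report_with_footers.pdf")  # hypothetical input
page = pdf.pages[0]

# Exclude a repeating footer band (exclusions may also be callables evaluated per page).
page.add_exclusion(page.region(left=0, top=page.height - 40, right=page.width, bottom=page.height))

# Table extraction scoped to a region now crops a char-filtered copy of the
# underlying pdfplumber page, so footer characters cannot end up in the cells.
table_region = page.region(left=0, top=100, right=page.width, bottom=page.height - 60)
rows = table_region.extract_table()
```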
@@ -1672,8 +1717,38 @@ class Region(
         if y_tol is not None:
             table_settings.setdefault("text_y_tolerance", y_tol)
 
-        #
-
+        # -------------------------------------------------------------
+        # Apply char-level exclusion filtering (chars only) just like in
+        # _extract_tables_plumber so header/footer text does not appear
+        # in extracted tables.
+        # -------------------------------------------------------------
+        base_plumber_page = self.page._page
+
+        if getattr(self.page, "_exclusions", None):
+            exclusion_regions = self.page._get_exclusion_regions(include_callable=True)
+
+            def _keep_char(obj):
+                if obj.get("object_type") != "char":
+                    return True
+                cx = (obj["x0"] + obj["x1"]) / 2.0
+                cy = (obj["top"] + obj["bottom"]) / 2.0
+                for reg in exclusion_regions:
+                    if reg.x0 <= cx <= reg.x1 and reg.top <= cy <= reg.bottom:
+                        return False
+                return True
+
+            try:
+                filtered_page = base_plumber_page.filter(_keep_char)
+            except Exception as _filter_err:
+                logger.warning(
+                    f"Region {self.bbox}: Failed to filter pdfplumber chars for exclusions (single table): {_filter_err}"
+                )
+                filtered_page = base_plumber_page
+        else:
+            filtered_page = base_plumber_page
+
+        # Now crop the (possibly filtered) page to the region bbox
+        cropped = filtered_page.crop(self.bbox)
 
         # Extract the single largest table from the cropped area
         table = cropped.extract_table(table_settings)
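The chars-only predicate used in both code paths, shown in isolation: every non-char object passes through, and a char is dropped only when its centre falls inside an exclusion box. Standalone illustration with hand-made objects, not the library's own test code:

```python
# Exclusion boxes as (x0, top, x1, bottom) tuples for the illustration.
exclusions = [(0.0, 750.0, 612.0, 792.0)]  # a footer band on a US Letter page

def keep(obj: dict) -> bool:
    if obj.get("object_type") != "char":
        return True  # lines/rects used for lattice detection pass through untouched
    cx = (obj["x0"] + obj["x1"]) / 2.0
    cy = (obj["top"] + obj["bottom"]) / 2.0
    return not any(x0 <= cx <= x1 and top <= cy <= bottom for x0, top, x1, bottom in exclusions)

body_char = {"object_type": "char", "x0": 100, "x1": 106, "top": 300, "bottom": 310}
footer_char = {"object_type": "char", "x0": 100, "x1": 106, "top": 760, "bottom": 770}
rect = {"object_type": "rect", "x0": 0, "x1": 612, "top": 0, "bottom": 792}

print(keep(body_char), keep(footer_char), keep(rect))  # True False True
```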
@@ -3007,45 +3082,20 @@ class Region(
         source_info = f" source='{self.source}'" if self.source else ""
         return f"<Region{name_info}{type_info}{source_info} bbox={self.bbox}{poly_info}>"
 
-    def
+    def update_text(
         self,
-
-
-        ""
-
-
-
-        Finds text elements within this region whose 'source' attribute starts
-        with 'ocr' and calls the `correction_callback` for each, passing the
-        element itself.
-
-        The `correction_callback` should contain the logic to:
-        1. Determine if the element needs correction.
-        2. Perform the correction (e.g., call an LLM).
-        3. Return the new text (`str`) or `None`.
-
-        If the callback returns a string, the element's `.text` is updated.
-        Metadata updates (source, confidence, etc.) should happen within the callback.
-
-        Args:
-            correction_callback: A function accepting an element and returning
-                `Optional[str]` (new text or None).
+        transform: Callable[[Any], Optional[str]],
+        *,
+        selector: str = "text",
+        apply_exclusions: bool = False,
+    ) -> "Region":
+        """Apply *transform* to every text element matched by *selector* inside this region.
 
-
-
+        The heavy lifting is delegated to :py:meth:`TextMixin.update_text`; this
+        override simply ensures the search is scoped to the region.
         """
-        # Find OCR elements specifically within this region
-        # Note: We typically want to correct even if the element falls in an excluded area
-        target_elements = self.find_all(selector="text[source=ocr]", apply_exclusions=False)
 
-
-        _apply_ocr_correction_to_elements(
-            elements=target_elements, # Pass the ElementCollection directly
-            correction_callback=correction_callback,
-            caller_info=f"Region({self.bbox})", # Pass caller info
-        )
-
-        return self # Return self for chaining
+        return TextMixin.update_text(self, transform, selector=selector, apply_exclusions=apply_exclusions)
 
     # --- Classification Mixin Implementation --- #
     def _get_classification_manager(self) -> "ClassificationManager":
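`Region.update_text` now shares the `TextMixin` signature (`transform`, keyword-only `selector` and `apply_exclusions`). A usage sketch; the region coordinates and the transform are illustrative:

```python
import natural_pdf as npdf

pdf = npdf.PDF("scanned_report.pdf")  # hypothetical input file
page = pdf.pages[0]

def strip_trailing_hyphens(element):
    """Drop a trailing hyphen left by OCR line breaks; return None to skip."""
    text = element.text or ""
    return text.rstrip("-") if text.endswith("-") else None

header_region = page.region(left=0, top=0, right=page.width, bottom=150)

# selector defaults to "text"; apply_exclusions=False keeps excluded text reachable,
# matching the old correct_ocr behaviour of fixing text even inside exclusions.
header_region.update_text(strip_trailing_hyphens, selector="text", apply_exclusions=False)
```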
natural_pdf/flows/element.py
CHANGED
@@ -73,6 +73,31 @@ class FlowElement:
         """Returns the physical page of the underlying element."""
         return getattr(self.physical_object, "page", None)
 
+    def __getattr__(self, name: str) -> Any:
+        """
+        Delegate unknown attribute access to the physical_object.
+
+        This ensures that attributes like 'type', 'region_type', 'source', 'model', etc.
+        from the physical element are accessible on the FlowElement wrapper.
+
+        Args:
+            name: The attribute name being accessed
+
+        Returns:
+            The attribute value from physical_object
+
+        Raises:
+            AttributeError: If the attribute doesn't exist on physical_object either
+        """
+        try:
+            return getattr(self.physical_object, name)
+        except AttributeError:
+            # Provide a helpful error message that mentions both FlowElement and physical_object
+            raise AttributeError(
+                f"'{type(self).__name__}' object has no attribute '{name}' "
+                f"(also not found on underlying {type(self.physical_object).__name__})"
+            )
+
     def _flow_direction(
         self,
         direction: str, # "above", "below", "left", "right"