natural-pdf 0.1.38__py3-none-any.whl → 0.1.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +6 -0
- natural_pdf/core/page.py +21 -21
- natural_pdf/core/pdf.py +77 -24
- natural_pdf/elements/collections.py +164 -40
- natural_pdf/elements/region.py +90 -40
- natural_pdf/flows/element.py +25 -0
- natural_pdf/flows/flow.py +702 -20
- natural_pdf/flows/region.py +52 -4
- natural_pdf/selectors/parser.py +34 -1
- natural_pdf/text_mixin.py +97 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/RECORD +16 -15
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/top_level.txt +0 -0
natural_pdf/flows/flow.py
CHANGED
````diff
@@ -1,15 +1,21 @@
 import logging
-from typing import TYPE_CHECKING, Any, List, Literal, Optional, Union
+from typing import TYPE_CHECKING, Any, List, Literal, Optional, Union, Tuple, Callable, overload
 
 if TYPE_CHECKING:
     from natural_pdf.core.page import Page
     from natural_pdf.elements.base import Element as PhysicalElement
-    from natural_pdf.elements.collections import ElementCollection as PhysicalElementCollection
+    from natural_pdf.elements.collections import ElementCollection as PhysicalElementCollection, PageCollection
     from natural_pdf.elements.region import Region as PhysicalRegion
+    from PIL.Image import Image as PIL_Image
 
 from .collections import FlowElementCollection
 from .element import FlowElement
 
+# Import required classes for the new methods
+from natural_pdf.tables import TableResult
+# For runtime image manipulation
+from PIL import Image as PIL_Image_Runtime
+
 logger = logging.getLogger(__name__)
 
 
````
````diff
@@ -81,7 +87,7 @@ class Flow:
 
     def __init__(
         self,
-        segments: List[Union["Page", "PhysicalRegion"]],
+        segments: Union[List[Union["Page", "PhysicalRegion"]], "PageCollection"],
         arrangement: Literal["vertical", "horizontal"],
         alignment: Literal["start", "center", "end", "top", "left", "bottom", "right"] = "start",
         segment_gap: float = 0.0,
@@ -91,7 +97,8 @@ class Flow:
 
         Args:
             segments: An ordered list of natural_pdf.core.page.Page or
-                natural_pdf.elements.region.Region objects that constitute the flow
+                natural_pdf.elements.region.Region objects that constitute the flow,
+                or a PageCollection containing pages.
            arrangement: The primary direction of the flow.
                - "vertical": Segments are stacked top-to-bottom.
                - "horizontal": Segments are arranged left-to-right.
@@ -106,6 +113,10 @@ class Flow:
                - "bottom" (or "end"): Align bottom edges.
            segment_gap: The virtual gap (in PDF points) between segments.
        """
+        # Handle PageCollection input
+        if hasattr(segments, 'pages'):  # It's a PageCollection
+            segments = list(segments.pages)
+
        if not segments:
            raise ValueError("Flow segments cannot be empty.")
        if arrangement not in ["vertical", "horizontal"]:
````
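The `PageCollection` handling above is duck-typed: anything exposing a `.pages` attribute is unwrapped into a plain list before validation. A minimal usage sketch (the file name is hypothetical):

```python
import natural_pdf as npdf
from natural_pdf.flows.flow import Flow

pdf = npdf.PDF("report.pdf")  # hypothetical document

# Previously, segments had to be an explicit list of Page/Region objects;
# as of 0.1.40 a PageCollection such as pdf.pages can be passed directly.
flow = Flow(segments=pdf.pages, arrangement="vertical")
```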
````diff
@@ -213,21 +224,48 @@
     ) -> "FlowElementCollection":
         """
         Finds all elements within the flow that match the given selector or text criteria.
-
-
+
+        This method efficiently groups segments by their parent pages, searches at the page level,
+        then filters results appropriately for each segment. This ensures elements that intersect
+        with flow segments (but aren't fully contained) are still found.
+
         Elements found are wrapped as FlowElement objects, anchored to this Flow,
         and returned in a FlowElementCollection.
         """
         from .collections import FlowElementCollection
         from .element import FlowElement
 
+        # Step 1: Group segments by their parent pages (like in analyze_layout)
+        segments_by_page = {}  # Dict[Page, List[Segment]]
+
+        for i, segment in enumerate(self.segments):
+            # Determine the page for this segment - fix type detection
+            if hasattr(segment, 'page') and hasattr(segment.page, 'find_all'):
+                # It's a Region object (has a parent page)
+                page_obj = segment.page
+                segment_type = "region"
+            elif hasattr(segment, 'find_all') and hasattr(segment, 'width') and hasattr(segment, 'height') and not hasattr(segment, 'page'):
+                # It's a Page object (has find_all but no parent page)
+                page_obj = segment
+                segment_type = "page"
+            else:
+                logger.warning(f"Segment {i+1} does not support find_all, skipping")
+                continue
+
+            if page_obj not in segments_by_page:
+                segments_by_page[page_obj] = []
+            segments_by_page[page_obj].append((segment, segment_type))
+
+        if not segments_by_page:
+            logger.warning("No segments with searchable pages found")
+            return FlowElementCollection([])
+
+        # Step 2: Search each unique page only once
         all_flow_elements: List["FlowElement"] = []
 
-
-
-
-        # Region.find_all() should return elements in local reading order.
-        matches_in_segment: "PhysicalElementCollection" = physical_segment.find_all(
+        for page_obj, page_segments in segments_by_page.items():
+            # Find all matching elements on this page
+            page_matches = page_obj.find_all(
                 selector=selector,
                 text=text,
                 apply_exclusions=apply_exclusions,
@@ -235,16 +273,46 @@
                 case=case,
                 **kwargs,
             )
-
-
-
-
-
-
-
-
+
+            if not page_matches:
+                continue
+
+            # Step 3: For each segment on this page, collect relevant elements
+            for segment, segment_type in page_segments:
+                if segment_type == "page":
+                    # Full page segment: include all elements
+                    for phys_elem in page_matches.elements:
+                        all_flow_elements.append(FlowElement(physical_object=phys_elem, flow=self))
+
+                elif segment_type == "region":
+                    # Region segment: filter to only intersecting elements
+                    for phys_elem in page_matches.elements:
+                        try:
+                            # Check if element intersects with this flow segment
+                            if segment.intersects(phys_elem):
+                                all_flow_elements.append(FlowElement(physical_object=phys_elem, flow=self))
+                        except Exception as intersect_error:
+                            logger.debug(f"Error checking intersection for element: {intersect_error}")
+                            # Include the element anyway if intersection check fails
+                            all_flow_elements.append(FlowElement(physical_object=phys_elem, flow=self))
+
+        # Step 4: Remove duplicates (can happen if multiple segments intersect the same element)
+        unique_flow_elements = []
+        seen_element_ids = set()
+
+        for flow_elem in all_flow_elements:
+            # Create a unique identifier for the underlying physical element
+            phys_elem = flow_elem.physical_object
+            elem_id = (
+                getattr(phys_elem.page, 'index', id(phys_elem.page)) if hasattr(phys_elem, 'page') else id(phys_elem),
+                phys_elem.bbox if hasattr(phys_elem, 'bbox') else id(phys_elem)
+            )
+
+            if elem_id not in seen_element_ids:
+                unique_flow_elements.append(flow_elem)
+                seen_element_ids.add(elem_id)
 
-        return FlowElementCollection(
+        return FlowElementCollection(unique_flow_elements)
 
     def __repr__(self) -> str:
         return (
````
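With this rework, a single `find_all` call searches each distinct page once, fans results back out to the segments, and de-duplicates. A sketch of the call site, assuming `FlowElementCollection` is iterable like `ElementCollection` (document and selector are illustrative):

```python
import natural_pdf as npdf
from natural_pdf.flows.flow import Flow

pdf = npdf.PDF("document.pdf")  # illustrative
flow = Flow(segments=[pdf.pages[0], pdf.pages[1]], arrangement="vertical")

# One page-level search per unique page; hits are filtered per segment
# and de-duplicated before being wrapped as FlowElements.
matches = flow.find_all('text:contains("Total")')
for flow_elem in matches:
    print(flow_elem.physical_object.bbox)  # underlying physical element
```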
````diff
@@ -252,6 +320,620 @@ class Flow:
             f"arrangement='{self.arrangement}', alignment='{self.alignment}', gap={self.segment_gap}>"
         )
 
+    @overload
+    def extract_table(
+        self,
+        method: Optional[str] = None,
+        table_settings: Optional[dict] = None,
+        use_ocr: bool = False,
+        ocr_config: Optional[dict] = None,
+        text_options: Optional[dict] = None,
+        cell_extraction_func: Optional[Any] = None,
+        show_progress: bool = False,
+        content_filter: Optional[Any] = None,
+        stitch_rows: Callable[[List[Optional[str]]], bool] = None,
+    ) -> TableResult: ...
+
+    @overload
+    def extract_table(
+        self,
+        method: Optional[str] = None,
+        table_settings: Optional[dict] = None,
+        use_ocr: bool = False,
+        ocr_config: Optional[dict] = None,
+        text_options: Optional[dict] = None,
+        cell_extraction_func: Optional[Any] = None,
+        show_progress: bool = False,
+        content_filter: Optional[Any] = None,
+        stitch_rows: Callable[
+            [List[Optional[str]], List[Optional[str]], int, Union["Page", "PhysicalRegion"]],
+            bool,
+        ] = None,
+    ) -> TableResult: ...
+
+    def extract_table(
+        self,
+        method: Optional[str] = None,
+        table_settings: Optional[dict] = None,
+        use_ocr: bool = False,
+        ocr_config: Optional[dict] = None,
+        text_options: Optional[dict] = None,
+        cell_extraction_func: Optional[Any] = None,
+        show_progress: bool = False,
+        content_filter: Optional[Any] = None,
+        stitch_rows: Optional[Callable] = None,
+    ) -> TableResult:
+        """
+        Extract table data from all segments in the flow, combining results sequentially.
+
+        This method extracts table data from each segment in flow order and combines
+        the results into a single logical table. This is particularly useful for
+        multi-page tables or tables that span across columns.
+
+        Args:
+            method: Method to use: 'tatr', 'pdfplumber', 'text', 'stream', 'lattice', or None (auto-detect).
+            table_settings: Settings for pdfplumber table extraction.
+            use_ocr: Whether to use OCR for text extraction (currently only applicable with 'tatr' method).
+            ocr_config: OCR configuration parameters.
+            text_options: Dictionary of options for the 'text' method.
+            cell_extraction_func: Optional callable function that takes a cell Region object
+                and returns its string content. For 'text' method only.
+            show_progress: If True, display a progress bar during cell text extraction for the 'text' method.
+            content_filter: Optional content filter to apply during cell text extraction.
+            stitch_rows: Optional callable to determine when rows should be merged across
+                segment boundaries. Two overloaded signatures are supported:
+
+                • func(current_row) -> bool
+                    Called only on the first row of each segment (after the first).
+                    Return True to merge this first row with the last row from
+                    the previous segment.
+
+                • func(prev_row, current_row, row_index, segment) -> bool
+                    Called for every row. Return True to merge current_row with
+                    the previous row in the aggregated results.
+
+                When True is returned, rows are concatenated cell-by-cell.
+                This is useful for handling table rows split across page
+                boundaries or segments. If None, rows are never merged.
+
+        Returns:
+            TableResult object containing the aggregated table data from all segments.
+
+        Example:
+            Multi-page table extraction:
+            ```python
+            pdf = npdf.PDF("multi_page_table.pdf")
+
+            # Create flow for table spanning pages 2-4
+            table_flow = Flow(
+                segments=[pdf.pages[1], pdf.pages[2], pdf.pages[3]],
+                arrangement='vertical'
+            )
+
+            # Extract table as if it were continuous
+            table_data = table_flow.extract_table()
+            df = table_data.df  # Convert to pandas DataFrame
+
+            # Custom row stitching - single parameter (simple case)
+            table_data = table_flow.extract_table(
+                stitch_rows=lambda row: row and not (row[0] or "").strip()
+            )
+
+            # Custom row stitching - full parameters (advanced case)
+            table_data = table_flow.extract_table(
+                stitch_rows=lambda prev, curr, idx, seg: idx == 0 and curr and not (curr[0] or "").strip()
+            )
+            ```
+        """
+        logger.info(f"Extracting table from Flow with {len(self.segments)} segments (method: {method or 'auto'})")
+
+        if not self.segments:
+            logger.warning("Flow has no segments, returning empty table")
+            return TableResult([])
+
+        # Resolve predicate and determine its signature
+        predicate: Optional[Callable] = None
+        predicate_type: str = "none"
+
+        if callable(stitch_rows):
+            import inspect
+            sig = inspect.signature(stitch_rows)
+            param_count = len(sig.parameters)
+
+            if param_count == 1:
+                predicate = stitch_rows
+                predicate_type = "single_param"
+            elif param_count == 4:
+                predicate = stitch_rows
+                predicate_type = "full_params"
+            else:
+                logger.warning(f"stitch_rows function has {param_count} parameters, expected 1 or 4. Ignoring.")
+                predicate = None
+                predicate_type = "none"
+
+        def _default_merge(prev_row: List[Optional[str]], cur_row: List[Optional[str]]) -> List[Optional[str]]:
+            from itertools import zip_longest
+            merged: List[Optional[str]] = []
+            for p, c in zip_longest(prev_row, cur_row, fillvalue=""):
+                if (p or "").strip() and (c or "").strip():
+                    merged.append(f"{p} {c}".strip())
+                else:
+                    merged.append((p or "") + (c or ""))
+            return merged
+
+        aggregated_rows: List[List[Optional[str]]] = []
+        processed_segments = 0
+
+        for seg_idx, segment in enumerate(self.segments):
+            try:
+                logger.debug(f"  Extracting table from segment {seg_idx+1}/{len(self.segments)}")
+
+                segment_result = segment.extract_table(
+                    method=method,
+                    table_settings=table_settings.copy() if table_settings else None,
+                    use_ocr=use_ocr,
+                    ocr_config=ocr_config,
+                    text_options=text_options.copy() if text_options else None,
+                    cell_extraction_func=cell_extraction_func,
+                    show_progress=show_progress,
+                    content_filter=content_filter,
+                )
+
+                if not segment_result:
+                    continue
+
+                if hasattr(segment_result, "_rows"):
+                    segment_rows = list(segment_result._rows)
+                else:
+                    segment_rows = list(segment_result)
+
+                if not segment_rows:
+                    logger.debug(f"  No table data found in segment {seg_idx+1}")
+                    continue
+
+                for row_idx, row in enumerate(segment_rows):
+                    should_merge = False
+
+                    if predicate is not None and aggregated_rows:
+                        if predicate_type == "single_param":
+                            # For single param: only call on first row of segment (row_idx == 0)
+                            # and pass the current row
+                            if row_idx == 0:
+                                should_merge = predicate(row)
+                        elif predicate_type == "full_params":
+                            # For full params: call with all arguments
+                            should_merge = predicate(aggregated_rows[-1], row, row_idx, segment)
+
+                    if should_merge:
+                        aggregated_rows[-1] = _default_merge(aggregated_rows[-1], row)
+                    else:
+                        aggregated_rows.append(row)
+
+                processed_segments += 1
+                logger.debug(f"  Added {len(segment_rows)} rows (post-merge) from segment {seg_idx+1}")
+
+            except Exception as e:
+                logger.error(f"Error extracting table from segment {seg_idx+1}: {e}", exc_info=True)
+                continue
+
+        logger.info(
+            f"Flow table extraction complete: {len(aggregated_rows)} total rows from {processed_segments}/{len(self.segments)} segments"
+        )
+        return TableResult(aggregated_rows)
+
````
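The docstring's lambda examples can also be written as named predicates. A sketch of the single-parameter form (the PDF and its table layout are assumed):

```python
import natural_pdf as npdf
from natural_pdf.flows.flow import Flow

pdf = npdf.PDF("multi_page_table.pdf")  # assumed document
flow = Flow(segments=[pdf.pages[1], pdf.pages[2]], arrangement="vertical")

def continues_previous(row):
    # Single-parameter form: invoked only on the first row of each segment
    # after the first; an empty leading cell suggests a carried-over row.
    return bool(row) and not (row[0] or "").strip()

result = flow.extract_table(stitch_rows=continues_previous)
# Merged rows are concatenated cell-by-cell (see _default_merge in the diff).
print(result.df.head())  # TableResult exposes the data as a DataFrame
```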
````diff
+    def analyze_layout(
+        self,
+        engine: Optional[str] = None,
+        options: Optional[Any] = None,
+        confidence: Optional[float] = None,
+        classes: Optional[List[str]] = None,
+        exclude_classes: Optional[List[str]] = None,
+        device: Optional[str] = None,
+        existing: str = "replace",
+        model_name: Optional[str] = None,
+        client: Optional[Any] = None,
+    ) -> "PhysicalElementCollection":
+        """
+        Analyze layout across all segments in the flow.
+
+        This method efficiently groups segments by their parent pages, runs layout analysis
+        only once per unique page, then filters results appropriately for each segment.
+        This avoids redundant analysis when multiple flow segments come from the same page.
+
+        Args:
+            engine: Name of the layout engine (e.g., 'yolo', 'tatr'). Uses manager's default if None.
+            options: Specific LayoutOptions object for advanced configuration.
+            confidence: Minimum confidence threshold.
+            classes: Specific classes to detect.
+            exclude_classes: Classes to exclude.
+            device: Device for inference.
+            existing: How to handle existing detected regions: 'replace' (default) or 'append'.
+            model_name: Optional model name for the engine.
+            client: Optional client for API-based engines.
+
+        Returns:
+            ElementCollection containing all detected Region objects from all segments.
+
+        Example:
+            Multi-page layout analysis:
+            ```python
+            pdf = npdf.PDF("document.pdf")
+
+            # Create flow for first 3 pages
+            page_flow = Flow(
+                segments=pdf.pages[:3],
+                arrangement='vertical'
+            )
+
+            # Analyze layout across all pages (efficiently)
+            all_regions = page_flow.analyze_layout(engine='yolo')
+
+            # Find all tables across the flow
+            tables = all_regions.filter('region[type=table]')
+            ```
+        """
+        from natural_pdf.elements.collections import ElementCollection
+
+        logger.info(f"Analyzing layout across Flow with {len(self.segments)} segments (engine: {engine or 'default'})")
+
+        if not self.segments:
+            logger.warning("Flow has no segments, returning empty collection")
+            return ElementCollection([])
+
+        # Step 1: Group segments by their parent pages to avoid redundant analysis
+        segments_by_page = {}  # Dict[Page, List[Segment]]
+
+        for i, segment in enumerate(self.segments):
+            # Determine the page for this segment
+            if hasattr(segment, 'analyze_layout'):
+                # It's a Page object
+                page_obj = segment
+                segment_type = "page"
+            elif hasattr(segment, 'page') and hasattr(segment.page, 'analyze_layout'):
+                # It's a Region object
+                page_obj = segment.page
+                segment_type = "region"
+            else:
+                logger.warning(f"Segment {i+1} does not support layout analysis, skipping")
+                continue
+
+            if page_obj not in segments_by_page:
+                segments_by_page[page_obj] = []
+            segments_by_page[page_obj].append((segment, segment_type))
+
+        if not segments_by_page:
+            logger.warning("No segments with analyzable pages found")
+            return ElementCollection([])
+
+        logger.debug(f"  Grouped {len(self.segments)} segments into {len(segments_by_page)} unique pages")
+
+        # Step 2: Analyze each unique page only once
+        all_detected_regions: List["PhysicalRegion"] = []
+        processed_pages = 0
+
+        for page_obj, page_segments in segments_by_page.items():
+            try:
+                logger.debug(f"  Analyzing layout for page {getattr(page_obj, 'number', '?')} with {len(page_segments)} segments")
+
+                # Run layout analysis once for this page
+                page_results = page_obj.analyze_layout(
+                    engine=engine,
+                    options=options,
+                    confidence=confidence,
+                    classes=classes,
+                    exclude_classes=exclude_classes,
+                    device=device,
+                    existing=existing,
+                    model_name=model_name,
+                    client=client,
+                )
+
+                # Extract regions from results
+                if hasattr(page_results, 'elements'):
+                    # It's an ElementCollection
+                    page_regions = page_results.elements
+                elif isinstance(page_results, list):
+                    # It's a list of regions
+                    page_regions = page_results
+                else:
+                    logger.warning(f"Page {getattr(page_obj, 'number', '?')} returned unexpected layout analysis result type: {type(page_results)}")
+                    continue
+
+                if not page_regions:
+                    logger.debug(f"  No layout regions found on page {getattr(page_obj, 'number', '?')}")
+                    continue
+
+                # Step 3: For each segment on this page, collect relevant regions
+                segments_processed_on_page = 0
+                for segment, segment_type in page_segments:
+                    if segment_type == "page":
+                        # Full page segment: include all detected regions
+                        all_detected_regions.extend(page_regions)
+                        segments_processed_on_page += 1
+                        logger.debug(f"  Added {len(page_regions)} regions for full-page segment")
+
+                    elif segment_type == "region":
+                        # Region segment: filter to only intersecting regions
+                        intersecting_regions = []
+                        for region in page_regions:
+                            try:
+                                if segment.intersects(region):
+                                    intersecting_regions.append(region)
+                            except Exception as intersect_error:
+                                logger.debug(f"Error checking intersection for region: {intersect_error}")
+                                # Include the region anyway if intersection check fails
+                                intersecting_regions.append(region)
+
+                        all_detected_regions.extend(intersecting_regions)
+                        segments_processed_on_page += 1
+                        logger.debug(f"  Added {len(intersecting_regions)} intersecting regions for region segment {segment.bbox}")
+
+                processed_pages += 1
+                logger.debug(f"  Processed {segments_processed_on_page} segments on page {getattr(page_obj, 'number', '?')}")
+
+            except Exception as e:
+                logger.error(f"Error analyzing layout for page {getattr(page_obj, 'number', '?')}: {e}", exc_info=True)
+                continue
+
+        # Step 4: Remove duplicates (can happen if multiple segments intersect the same region)
+        unique_regions = []
+        seen_region_ids = set()
+
+        for region in all_detected_regions:
+            # Create a unique identifier for this region (page + bbox)
+            region_id = (
+                getattr(region.page, 'index', id(region.page)),
+                region.bbox if hasattr(region, 'bbox') else id(region)
+            )
+
+            if region_id not in seen_region_ids:
+                unique_regions.append(region)
+                seen_region_ids.add(region_id)
+
+        dedupe_removed = len(all_detected_regions) - len(unique_regions)
+        if dedupe_removed > 0:
+            logger.debug(f"  Removed {dedupe_removed} duplicate regions")
+
+        logger.info(f"Flow layout analysis complete: {len(unique_regions)} unique regions from {processed_pages} pages")
+        return ElementCollection(unique_regions)
+
````
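Because segments are grouped by parent page, several region segments on one page cost a single analysis pass. A sketch with two column regions (the page geometry is assumed; `page.region(...)` takes positional bounds as used elsewhere in the diff):

```python
import natural_pdf as npdf
from natural_pdf.flows.flow import Flow

pdf = npdf.PDF("two_column.pdf")  # assumed two-column document
page = pdf.pages[0]

# Both segments share one parent page, so analyze_layout runs once on it;
# detected regions are then filtered to those intersecting each segment.
left = page.region(0, 0, page.width / 2, page.height)
right = page.region(page.width / 2, 0, page.width, page.height)
flow = Flow(segments=[left, right], arrangement="vertical")

regions = flow.analyze_layout(engine="yolo")
tables = regions.filter("region[type=table]")
```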
````diff
+    def show(
+        self,
+        resolution: Optional[float] = None,
+        labels: bool = True,
+        legend_position: str = "right",
+        color: Optional[Union[Tuple, str]] = "blue",
+        label_prefix: Optional[str] = "FlowSegment",
+        width: Optional[int] = None,
+        stack_direction: str = "vertical",
+        stack_gap: int = 5,
+        stack_background_color: Tuple[int, int, int] = (255, 255, 255),
+        crop: bool = False,
+        **kwargs,
+    ) -> Optional["PIL_Image"]:
+        """
+        Generates and returns a PIL Image showing all segments in the flow with highlights.
+
+        This method visualizes the entire flow by highlighting each segment on its respective
+        page and combining the results into a single image. If multiple pages are involved,
+        they are stacked according to the flow's arrangement.
+
+        Args:
+            resolution: Resolution in DPI for page rendering. If None, uses global setting or defaults to 144 DPI.
+            labels: Whether to include a legend for highlights.
+            legend_position: Position of the legend ('right', 'bottom', 'top', 'left').
+            color: Color for highlighting the flow segments.
+            label_prefix: Prefix for segment labels (e.g., 'FlowSegment').
+            width: Optional width for the output image (overrides resolution).
+            stack_direction: Direction to stack multiple pages ('vertical' or 'horizontal').
+            stack_gap: Gap in pixels between stacked pages.
+            stack_background_color: RGB background color for the stacked image.
+            crop: If True, crop each rendered page to the bounding box of segments on that page.
+            **kwargs: Additional arguments passed to the underlying rendering methods.
+
+        Returns:
+            PIL Image of the rendered pages with highlighted flow segments, or None if rendering fails.
+
+        Example:
+            Visualizing a multi-page flow:
+            ```python
+            pdf = npdf.PDF("document.pdf")
+
+            # Create flow across multiple pages
+            page_flow = Flow(
+                segments=[pdf.pages[0], pdf.pages[1], pdf.pages[2]],
+                arrangement='vertical'
+            )
+
+            # Show the entire flow
+            flow_image = page_flow.show(color="green", labels=True)
+            ```
+        """
+        logger.info(f"Rendering Flow with {len(self.segments)} segments")
+
+        if not self.segments:
+            logger.warning("Flow has no segments to show")
+            return None
+
+        # Apply global options as defaults for resolution
+        import natural_pdf
+        if resolution is None:
+            if natural_pdf.options.image.resolution is not None:
+                resolution = natural_pdf.options.image.resolution
+            else:
+                resolution = 144  # Default resolution
+
+        # 1. Group segments by their physical pages
+        segments_by_page = {}  # Dict[Page, List[PhysicalRegion]]
+
+        for i, segment in enumerate(self.segments):
+            # Get the page for this segment
+            if hasattr(segment, 'page') and segment.page is not None:
+                # It's a Region, use its page
+                page_obj = segment.page
+                if page_obj not in segments_by_page:
+                    segments_by_page[page_obj] = []
+                segments_by_page[page_obj].append(segment)
+            elif hasattr(segment, 'index') and hasattr(segment, 'width') and hasattr(segment, 'height'):
+                # It's a full Page object, create a full-page region for it
+                page_obj = segment
+                full_page_region = segment.region(0, 0, segment.width, segment.height)
+                if page_obj not in segments_by_page:
+                    segments_by_page[page_obj] = []
+                segments_by_page[page_obj].append(full_page_region)
+            else:
+                logger.warning(f"Segment {i+1} has no identifiable page, skipping")
+                continue
+
+        if not segments_by_page:
+            logger.warning("No segments with identifiable pages found")
+            return None
+
+        # 2. Get a highlighter service from the first page
+        first_page = next(iter(segments_by_page.keys()))
+        if not hasattr(first_page, '_highlighter'):
+            logger.error("Cannot get highlighter service for Flow.show(). Page missing highlighter.")
+            return None
+
+        highlighter_service = first_page._highlighter
+        output_page_images: List["PIL_Image_Runtime"] = []
+
+        # Sort pages by index for consistent output order
+        sorted_pages = sorted(
+            segments_by_page.keys(),
+            key=lambda p: p.index if hasattr(p, "index") else getattr(p, "page_number", 0),
+        )
+
+        # 3. Render each page with its relevant segments highlighted
+        for page_idx, page_obj in enumerate(sorted_pages):
+            segments_on_this_page = segments_by_page[page_obj]
+            if not segments_on_this_page:
+                continue
+
+            temp_highlights_for_page = []
+            for i, segment in enumerate(segments_on_this_page):
+                segment_label = None
+                if labels and label_prefix:
+                    # Create label for this segment
+                    global_segment_idx = None
+                    try:
+                        # Find the global index of this segment in the original flow
+                        global_segment_idx = self.segments.index(segment)
+                    except ValueError:
+                        # If it's a generated full-page region, find its source page
+                        for idx, orig_segment in enumerate(self.segments):
+                            if (hasattr(orig_segment, 'index') and hasattr(segment, 'page')
+                                    and orig_segment.index == segment.page.index):
+                                global_segment_idx = idx
+                                break
+
+                    if global_segment_idx is not None:
+                        segment_label = f"{label_prefix}_{global_segment_idx + 1}"
+                    else:
+                        segment_label = f"{label_prefix}_p{page_idx + 1}s{i + 1}"
+
+                temp_highlights_for_page.append(
+                    {
+                        "page_index": (
+                            page_obj.index
+                            if hasattr(page_obj, "index")
+                            else getattr(page_obj, "page_number", 1) - 1
+                        ),
+                        "bbox": segment.bbox,
+                        "polygon": segment.polygon if hasattr(segment, 'polygon') and hasattr(segment, 'has_polygon') and segment.has_polygon else None,
+                        "color": color,
+                        "label": segment_label,
+                        "use_color_cycling": False,  # Keep specific color
+                    }
+                )
+
+            if not temp_highlights_for_page:
+                continue
+
+            # Calculate crop bbox if cropping is enabled
+            crop_bbox = None
+            if crop and segments_on_this_page:
+                # Calculate the bounding box that encompasses all segments on this page
+                min_x0 = min(segment.bbox[0] for segment in segments_on_this_page)
+                min_y0 = min(segment.bbox[1] for segment in segments_on_this_page)
+                max_x1 = max(segment.bbox[2] for segment in segments_on_this_page)
+                max_y1 = max(segment.bbox[3] for segment in segments_on_this_page)
+                crop_bbox = (min_x0, min_y0, max_x1, max_y1)
+
+            # Render this page with highlights
+            page_image = highlighter_service.render_preview(
+                page_index=(
+                    page_obj.index
+                    if hasattr(page_obj, "index")
+                    else getattr(page_obj, "page_number", 1) - 1
+                ),
+                temporary_highlights=temp_highlights_for_page,
+                resolution=resolution,
+                width=width,
+                labels=labels,
+                legend_position=legend_position,
+                crop_bbox=crop_bbox,
+                **kwargs,
+            )
+            if page_image:
+                output_page_images.append(page_image)
+
+        # 4. Stack the generated page images if multiple
+        if not output_page_images:
+            logger.warning("Flow.show() produced no page images")
+            return None
+
+        if len(output_page_images) == 1:
+            return output_page_images[0]
+
+        # Determine stacking direction (default to flow arrangement, but allow override)
+        final_stack_direction = stack_direction
+        if stack_direction == "auto":
+            final_stack_direction = self.arrangement
+
+        # Stack multiple page images
+        if final_stack_direction == "vertical":
+            final_width = max(img.width for img in output_page_images)
+            final_height = (
+                sum(img.height for img in output_page_images)
+                + (len(output_page_images) - 1) * stack_gap
+            )
+            if final_width == 0 or final_height == 0:
+                raise ValueError("Cannot create concatenated image with zero width or height.")
+
+            concatenated_image = PIL_Image_Runtime.new(
+                "RGB", (final_width, final_height), stack_background_color
+            )
+            current_y = 0
+            for img in output_page_images:
+                paste_x = (final_width - img.width) // 2
+                concatenated_image.paste(img, (paste_x, current_y))
+                current_y += img.height + stack_gap
+            return concatenated_image
+
+        elif final_stack_direction == "horizontal":
+            final_width = (
+                sum(img.width for img in output_page_images)
+                + (len(output_page_images) - 1) * stack_gap
+            )
+            final_height = max(img.height for img in output_page_images)
+            if final_width == 0 or final_height == 0:
+                raise ValueError("Cannot create concatenated image with zero width or height.")
+
+            concatenated_image = PIL_Image_Runtime.new(
+                "RGB", (final_width, final_height), stack_background_color
+            )
+            current_x = 0
+            for img in output_page_images:
+                paste_y = (final_height - img.height) // 2
+                concatenated_image.paste(img, (current_x, paste_y))
+                current_x += img.width + stack_gap
+            return concatenated_image
+        else:
+            raise ValueError(
+                f"Invalid stack_direction '{final_stack_direction}' for Flow.show(). Must be 'vertical' or 'horizontal'."
+            )
 
     # --- Helper methods for coordinate transformations and segment iteration ---
     # These will be crucial for FlowElement's directional methods.
 
````
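A sketch of the new `show()` entry point; per the code above, passing `stack_direction="auto"` defers to the flow's own arrangement, and `crop=True` trims each page render to its segments (document name is illustrative):

```python
import natural_pdf as npdf
from natural_pdf.flows.flow import Flow

pdf = npdf.PDF("document.pdf")  # illustrative
flow = Flow(segments=[pdf.pages[0], pdf.pages[1]], arrangement="vertical")

# Each page is rendered with its segments highlighted, then the renders
# are stacked (with a gap) into a single preview image.
img = flow.show(color="green", labels=True, stack_direction="auto", crop=True)
if img is not None:  # show() returns None when rendering fails
    img.save("flow_preview.png")
```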