natural-pdf 0.1.40__py3-none-any.whl → 0.2.1.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +6 -7
- natural_pdf/analyzers/__init__.py +6 -1
- natural_pdf/analyzers/guides.py +354 -258
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +18 -4
- natural_pdf/analyzers/layout/paddle.py +11 -0
- natural_pdf/analyzers/layout/surya.py +2 -3
- natural_pdf/analyzers/shape_detection_mixin.py +25 -34
- natural_pdf/analyzers/text_structure.py +2 -2
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/collections/mixins.py +3 -2
- natural_pdf/core/highlighting_service.py +743 -32
- natural_pdf/core/page.py +236 -383
- natural_pdf/core/page_collection.py +1249 -0
- natural_pdf/core/pdf.py +172 -83
- natural_pdf/{collections → core}/pdf_collection.py +18 -11
- natural_pdf/core/render_spec.py +335 -0
- natural_pdf/describe/base.py +1 -1
- natural_pdf/elements/__init__.py +1 -0
- natural_pdf/elements/base.py +108 -83
- natural_pdf/elements/{collections.py → element_collection.py} +566 -1487
- natural_pdf/elements/line.py +0 -1
- natural_pdf/elements/rect.py +0 -1
- natural_pdf/elements/region.py +318 -243
- natural_pdf/elements/text.py +9 -7
- natural_pdf/exporters/base.py +2 -2
- natural_pdf/exporters/original_pdf.py +1 -1
- natural_pdf/exporters/paddleocr.py +2 -4
- natural_pdf/exporters/searchable_pdf.py +3 -2
- natural_pdf/extraction/mixin.py +1 -3
- natural_pdf/flows/collections.py +1 -69
- natural_pdf/flows/element.py +4 -4
- natural_pdf/flows/flow.py +1200 -243
- natural_pdf/flows/region.py +707 -261
- natural_pdf/ocr/ocr_options.py +0 -2
- natural_pdf/ocr/utils.py +2 -1
- natural_pdf/qa/document_qa.py +21 -5
- natural_pdf/search/search_service_protocol.py +1 -1
- natural_pdf/selectors/parser.py +2 -2
- natural_pdf/tables/result.py +35 -1
- natural_pdf/text_mixin.py +7 -3
- natural_pdf/utils/debug.py +2 -1
- natural_pdf/utils/highlighting.py +1 -0
- natural_pdf/utils/layout.py +2 -2
- natural_pdf/utils/packaging.py +4 -3
- natural_pdf/utils/text_extraction.py +15 -12
- natural_pdf/utils/visualization.py +385 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/METADATA +7 -3
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/RECORD +55 -53
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/top_level.txt +0 -2
- optimization/memory_comparison.py +1 -1
- optimization/pdf_analyzer.py +2 -2
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/licenses/LICENSE +0 -0
natural_pdf/flows/flow.py
CHANGED
@@ -1,25 +1,43 @@
 import logging
-
+import warnings
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    Union,
+    overload,
+)
 
 if TYPE_CHECKING:
+    from PIL.Image import Image as PIL_Image
+
     from natural_pdf.core.page import Page
+    from natural_pdf.core.page_collection import PageCollection
     from natural_pdf.elements.base import Element as PhysicalElement
-    from natural_pdf.elements.
+    from natural_pdf.elements.element_collection import (
+        ElementCollection as PhysicalElementCollection,
+    )
     from natural_pdf.elements.region import Region as PhysicalRegion
-    from PIL.Image import Image as PIL_Image
 
 from .collections import FlowElementCollection
 from .element import FlowElement
 
 # Import required classes for the new methods
-from natural_pdf.tables import TableResult
 # For runtime image manipulation
 from PIL import Image as PIL_Image_Runtime
 
+from natural_pdf.core.render_spec import RenderSpec, Visualizable
+from natural_pdf.tables import TableResult
+
 logger = logging.getLogger(__name__)
 
 
-class Flow:
+class Flow(Visualizable):
     """Defines a logical flow or sequence of physical Page or Region objects.
 
     A Flow represents a continuous logical document structure that spans across
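The headline change in this hunk is `Flow` subclassing `Visualizable`, which supplies the shared `show()`/`render()` pipeline that the new `RenderSpec` objects feed (see the `_get_render_specs` hunk further down). A minimal sketch of the caller-facing effect, assuming the 0.2.x import paths shown in this diff and a placeholder file name:

```python
from natural_pdf import PDF
from natural_pdf.flows.flow import Flow

pdf = PDF("document.pdf")  # placeholder path
flow = Flow(segments=list(pdf.pages[:2]), arrangement="vertical")

# Flow now inherits show()/render() plumbing from Visualizable, so a plain
# preview no longer needs Flow-specific PIL stitching code.
image = flow.show(color="green", labels=True)
```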
@@ -114,9 +132,9 @@ class Flow:
             segment_gap: The virtual gap (in PDF points) between segments.
         """
         # Handle PageCollection input
-        if hasattr(segments,
+        if hasattr(segments, "pages"):  # It's a PageCollection
            segments = list(segments.pages)
-
+
        if not segments:
            raise ValueError("Flow segments cannot be empty.")
        if arrangement not in ["vertical", "horizontal"]:
@@ -176,6 +194,103 @@ class Flow:
                 f"Valid options are: {valid_alignments[self.arrangement]}"
             )
 
+    def _get_highlighter(self):
+        """Get the highlighting service from the first segment."""
+        if not self.segments:
+            raise RuntimeError("Flow has no segments to get highlighter from")
+
+        # Get highlighter from first segment
+        first_segment = self.segments[0]
+        if hasattr(first_segment, "_highlighter"):
+            return first_segment._highlighter
+        elif hasattr(first_segment, "page") and hasattr(first_segment.page, "_highlighter"):
+            return first_segment.page._highlighter
+        else:
+            raise RuntimeError(
+                f"Cannot find HighlightingService from Flow segments. "
+                f"First segment type: {type(first_segment).__name__}"
+            )
+
+    def show(
+        self,
+        *,
+        # Basic rendering options
+        resolution: Optional[float] = None,
+        width: Optional[int] = None,
+        # Highlight options
+        color: Optional[Union[str, Tuple[int, int, int]]] = None,
+        labels: bool = True,
+        label_format: Optional[str] = None,
+        highlights: Optional[List[Dict[str, Any]]] = None,
+        # Layout options for multi-page/region
+        layout: Literal["stack", "grid", "single"] = "stack",
+        stack_direction: Literal["vertical", "horizontal"] = "vertical",
+        gap: int = 5,
+        columns: Optional[int] = None,  # For grid layout
+        # Cropping options
+        crop: Union[bool, Literal["content"]] = False,
+        crop_bbox: Optional[Tuple[float, float, float, float]] = None,
+        # Flow-specific options
+        in_context: bool = False,
+        separator_color: Optional[Tuple[int, int, int]] = None,
+        separator_thickness: int = 2,
+        **kwargs,
+    ) -> Optional["PIL_Image"]:
+        """Generate a preview image with highlights.
+
+        If in_context=True, shows segments as cropped images stacked together
+        with separators between segments.
+
+        Args:
+            resolution: DPI for rendering (default from global settings)
+            width: Target width in pixels (overrides resolution)
+            color: Default highlight color
+            labels: Whether to show labels for highlights
+            label_format: Format string for labels
+            highlights: Additional highlight groups to show
+            layout: How to arrange multiple pages/regions
+            stack_direction: Direction for stack layout
+            gap: Pixels between stacked images
+            columns: Number of columns for grid layout
+            crop: Whether to crop
+            crop_bbox: Explicit crop bounds
+            in_context: If True, use special Flow visualization with separators
+            separator_color: RGB color for separator lines (default: red)
+            separator_thickness: Thickness of separator lines
+            **kwargs: Additional parameters passed to rendering
+
+        Returns:
+            PIL Image object or None if nothing to render
+        """
+        if in_context:
+            # Use the special in_context visualization
+            return self._show_in_context(
+                resolution=resolution or 150,
+                width=width,
+                stack_direction=stack_direction,
+                stack_gap=gap,
+                separator_color=separator_color or (255, 0, 0),
+                separator_thickness=separator_thickness,
+                **kwargs,
+            )
+
+        # Otherwise use the standard show method
+        return super().show(
+            resolution=resolution,
+            width=width,
+            color=color,
+            labels=labels,
+            label_format=label_format,
+            highlights=highlights,
+            layout=layout,
+            stack_direction=stack_direction,
+            gap=gap,
+            columns=columns,
+            crop=crop,
+            crop_bbox=crop_bbox,
+            **kwargs,
+        )
+
     def find(
         self,
         selector: Optional[str] = None,
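Taken together, the two branches above give `show()` a dual personality; a usage sketch (file name is a placeholder, behavior inferred from the signature in this hunk):

```python
from natural_pdf import PDF
from natural_pdf.flows.flow import Flow

pdf = PDF("report.pdf")  # placeholder path
flow = Flow(segments=[pdf.pages[0], pdf.pages[1]], arrangement="vertical")

# Default path: delegates to Visualizable.show() via super().show(...)
overview = flow.show(color="blue", labels=True)

# Flow-specific path: cropped segments stacked with separator lines,
# routed to _show_in_context(); resolution falls back to 150 on this path.
stacked = flow.show(in_context=True, gap=10, separator_color=(255, 0, 0))
```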
@@ -224,11 +339,11 @@ class Flow:
     ) -> "FlowElementCollection":
         """
         Finds all elements within the flow that match the given selector or text criteria.
-
+
         This method efficiently groups segments by their parent pages, searches at the page level,
         then filters results appropriately for each segment. This ensures elements that intersect
         with flow segments (but aren't fully contained) are still found.
-
+
         Elements found are wrapped as FlowElement objects, anchored to this Flow,
         and returned in a FlowElementCollection.
         """
@@ -237,21 +352,26 @@ class Flow:
 
         # Step 1: Group segments by their parent pages (like in analyze_layout)
         segments_by_page = {}  # Dict[Page, List[Segment]]
-
+
         for i, segment in enumerate(self.segments):
             # Determine the page for this segment - fix type detection
-            if hasattr(segment,
+            if hasattr(segment, "page") and hasattr(segment.page, "find_all"):
                 # It's a Region object (has a parent page)
                 page_obj = segment.page
                 segment_type = "region"
-            elif
+            elif (
+                hasattr(segment, "find_all")
+                and hasattr(segment, "width")
+                and hasattr(segment, "height")
+                and not hasattr(segment, "page")
+            ):
                 # It's a Page object (has find_all but no parent page)
                 page_obj = segment
                 segment_type = "page"
             else:
                 logger.warning(f"Segment {i+1} does not support find_all, skipping")
                 continue
-
+
             if page_obj not in segments_by_page:
                 segments_by_page[page_obj] = []
             segments_by_page[page_obj].append((segment, segment_type))
@@ -273,7 +393,7 @@ class Flow:
                 case=case,
                 **kwargs,
             )
-
+
             if not page_matches:
                 continue
 
@@ -283,31 +403,41 @@ class Flow:
                     # Full page segment: include all elements
                     for phys_elem in page_matches.elements:
                         all_flow_elements.append(FlowElement(physical_object=phys_elem, flow=self))
-
+
                 elif segment_type == "region":
                     # Region segment: filter to only intersecting elements
                     for phys_elem in page_matches.elements:
                         try:
                             # Check if element intersects with this flow segment
                             if segment.intersects(phys_elem):
-                                all_flow_elements.append(
+                                all_flow_elements.append(
+                                    FlowElement(physical_object=phys_elem, flow=self)
+                                )
                         except Exception as intersect_error:
-                            logger.debug(
+                            logger.debug(
+                                f"Error checking intersection for element: {intersect_error}"
+                            )
                             # Include the element anyway if intersection check fails
-                            all_flow_elements.append(
+                            all_flow_elements.append(
+                                FlowElement(physical_object=phys_elem, flow=self)
+                            )
 
         # Step 4: Remove duplicates (can happen if multiple segments intersect the same element)
         unique_flow_elements = []
         seen_element_ids = set()
-
+
         for flow_elem in all_flow_elements:
             # Create a unique identifier for the underlying physical element
             phys_elem = flow_elem.physical_object
             elem_id = (
-
-
+                (
+                    getattr(phys_elem.page, "index", id(phys_elem.page))
+                    if hasattr(phys_elem, "page")
+                    else id(phys_elem)
+                ),
                 phys_elem.bbox if hasattr(phys_elem, "bbox") else id(phys_elem),
             )
-
+
             if elem_id not in seen_element_ids:
                 unique_flow_elements.append(flow_elem)
                 seen_element_ids.add(elem_id)
@@ -362,6 +492,7 @@ class Flow:
         show_progress: bool = False,
         content_filter: Optional[Any] = None,
         stitch_rows: Optional[Callable] = None,
+        merge_headers: Optional[bool] = None,
     ) -> TableResult:
         """
         Extract table data from all segments in the flow, combining results sequentially.
@@ -380,18 +511,24 @@ class Flow:
                 and returns its string content. For 'text' method only.
             show_progress: If True, display a progress bar during cell text extraction for the 'text' method.
             content_filter: Optional content filter to apply during cell text extraction.
+            merge_headers: Whether to merge tables by removing repeated headers from subsequent
+                segments. If None (default), auto-detects by checking if the first row
+                of each segment matches the first row of the first segment. If segments have
+                inconsistent header patterns (some repeat, others don't), raises ValueError.
+                Useful for multi-page tables where headers repeat on each page.
             stitch_rows: Optional callable to determine when rows should be merged across
-                segment boundaries.
-
+                segment boundaries. Applied AFTER header removal if merge_headers
+                is enabled. Two overloaded signatures are supported:
+
                 • func(current_row) -> bool
                     Called only on the first row of each segment (after the first).
                     Return True to merge this first row with the last row from
                     the previous segment.
-
+
                 • func(prev_row, current_row, row_index, segment) -> bool
                     Called for every row. Return True to merge current_row with
                     the previous row in the aggregated results.
-
+
                 When True is returned, rows are concatenated cell-by-cell.
                 This is useful for handling table rows split across page
                 boundaries or segments. If None, rows are never merged.
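A short sketch of how the new `merge_headers` parameter combines with the existing options (it mirrors the docstring example in the next hunk; the file name is a placeholder):

```python
from natural_pdf import PDF
from natural_pdf.flows.flow import Flow

pdf = PDF("multi_page_table.pdf")  # placeholder path
table_flow = Flow(
    segments=[pdf.pages[1], pdf.pages[2], pdf.pages[3]], arrangement="vertical"
)

# Default: merge_headers=None auto-detects a repeated first row and warns once.
result = table_flow.extract_table()

# Explicit: always drop a first row that equals the first segment's header.
result = table_flow.extract_table(merge_headers=True)
df = result.df
```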
@@ -403,30 +540,32 @@ class Flow:
             Multi-page table extraction:
             ```python
             pdf = npdf.PDF("multi_page_table.pdf")
-
+
             # Create flow for table spanning pages 2-4
             table_flow = Flow(
                 segments=[pdf.pages[1], pdf.pages[2], pdf.pages[3]],
                 arrangement='vertical'
             )
-
+
             # Extract table as if it were continuous
             table_data = table_flow.extract_table()
             df = table_data.df  # Convert to pandas DataFrame
-
+
             # Custom row stitching - single parameter (simple case)
             table_data = table_flow.extract_table(
                 stitch_rows=lambda row: row and not (row[0] or "").strip()
             )
-
+
             # Custom row stitching - full parameters (advanced case)
             table_data = table_flow.extract_table(
                 stitch_rows=lambda prev, curr, idx, seg: idx == 0 and curr and not (curr[0] or "").strip()
             )
             ```
         """
-        logger.info(
-
+        logger.info(
+            f"Extracting table from Flow with {len(self.segments)} segments (method: {method or 'auto'})"
+        )
+
         if not self.segments:
             logger.warning("Flow has no segments, returning empty table")
             return TableResult([])
@@ -434,12 +573,13 @@ class Flow:
         # Resolve predicate and determine its signature
         predicate: Optional[Callable] = None
         predicate_type: str = "none"
-
+
         if callable(stitch_rows):
             import inspect
+
             sig = inspect.signature(stitch_rows)
             param_count = len(sig.parameters)
-
+
             if param_count == 1:
                 predicate = stitch_rows
                 predicate_type = "single_param"
@@ -447,12 +587,17 @@ class Flow:
                 predicate = stitch_rows
                 predicate_type = "full_params"
             else:
-                logger.warning(
+                logger.warning(
+                    f"stitch_rows function has {param_count} parameters, expected 1 or 4. Ignoring."
+                )
                 predicate = None
                 predicate_type = "none"
 
-        def _default_merge(
+        def _default_merge(
+            prev_row: List[Optional[str]], cur_row: List[Optional[str]]
+        ) -> List[Optional[str]]:
             from itertools import zip_longest
+
             merged: List[Optional[str]] = []
             for p, c in zip_longest(prev_row, cur_row, fillvalue=""):
                 if (p or "").strip() and (c or "").strip():
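The resolver above dispatches on the callable's arity via `inspect.signature`. A standalone illustration of the same mechanism (independent of natural-pdf):

```python
import inspect

def arity(fn) -> int:
    """Count declared parameters, as the stitch_rows resolver does."""
    return len(inspect.signature(fn).parameters)

assert arity(lambda row: bool(row)) == 1                  # "single_param" path
assert arity(lambda prev, curr, idx, seg: idx == 0) == 4  # "full_params" path
# Any other arity is logged and ignored, so e.g. a 2-argument lambda
# silently disables stitching instead of raising.
```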
@@ -463,6 +608,10 @@ class Flow:
 
         aggregated_rows: List[List[Optional[str]]] = []
         processed_segments = 0
+        header_row: Optional[List[Optional[str]]] = None
+        merge_headers_enabled = False
+        headers_warned = False  # Track if we've already warned about dropping headers
+        segment_has_repeated_header = []  # Track which segments have repeated headers
 
         for seg_idx, segment in enumerate(self.segments):
             try:
@@ -491,9 +640,67 @@ class Flow:
                     logger.debug(f"  No table data found in segment {seg_idx+1}")
                     continue
 
+                # Handle header detection and merging for multi-page tables
+                if seg_idx == 0:
+                    # First segment: capture potential header row
+                    if segment_rows:
+                        header_row = segment_rows[0]
+                        # Determine if we should merge headers
+                        if merge_headers is None:
+                            # Auto-detect: we'll check all subsequent segments
+                            merge_headers_enabled = False  # Will be determined later
+                        else:
+                            merge_headers_enabled = merge_headers
+                    # Track that first segment exists (for consistency checking)
+                    segment_has_repeated_header.append(False)  # First segment doesn't "repeat"
+                elif seg_idx == 1 and merge_headers is None:
+                    # Auto-detection: check if first row of second segment matches header
+                    has_header = segment_rows and header_row and segment_rows[0] == header_row
+                    segment_has_repeated_header.append(has_header)
+
+                    if has_header:
+                        merge_headers_enabled = True
+                        # Remove the detected repeated header from this segment
+                        segment_rows = segment_rows[1:]
+                        logger.debug(
+                            f"  Auto-detected repeated header in segment {seg_idx+1}, removed"
+                        )
+                        if not headers_warned:
+                            warnings.warn(
+                                "Detected repeated headers in multi-page table. Merging by removing "
+                                "repeated headers from subsequent pages.",
+                                UserWarning,
+                                stacklevel=2,
+                            )
+                            headers_warned = True
+                    else:
+                        merge_headers_enabled = False
+                        logger.debug(f"  No repeated header detected in segment {seg_idx+1}")
+                elif seg_idx > 1:
+                    # Check consistency: all segments should have same pattern
+                    has_header = segment_rows and header_row and segment_rows[0] == header_row
+                    segment_has_repeated_header.append(has_header)
+
+                    # Remove header if merging is enabled and header is present
+                    if merge_headers_enabled and has_header:
+                        segment_rows = segment_rows[1:]
+                        logger.debug(f"  Removed repeated header from segment {seg_idx+1}")
+                elif seg_idx > 0 and merge_headers_enabled:
+                    # Explicit merge_headers=True: remove headers from subsequent segments
+                    if segment_rows and header_row and segment_rows[0] == header_row:
+                        segment_rows = segment_rows[1:]
+                        logger.debug(f"  Removed repeated header from segment {seg_idx+1}")
+                        if not headers_warned:
+                            warnings.warn(
+                                "Removing repeated headers from multi-page table during merge.",
+                                UserWarning,
+                                stacklevel=2,
+                            )
+                            headers_warned = True
+
                 for row_idx, row in enumerate(segment_rows):
                     should_merge = False
-
+
                     if predicate is not None and aggregated_rows:
                         if predicate_type == "single_param":
                             # For single param: only call on first row of segment (row_idx == 0)
@@ -503,19 +710,41 @@ class Flow:
                         elif predicate_type == "full_params":
                             # For full params: call with all arguments
                             should_merge = predicate(aggregated_rows[-1], row, row_idx, segment)
-
+
                     if should_merge:
                         aggregated_rows[-1] = _default_merge(aggregated_rows[-1], row)
                     else:
                         aggregated_rows.append(row)
 
                 processed_segments += 1
-                logger.debug(
+                logger.debug(
+                    f"  Added {len(segment_rows)} rows (post-merge) from segment {seg_idx+1}"
+                )
 
             except Exception as e:
                 logger.error(f"Error extracting table from segment {seg_idx+1}: {e}", exc_info=True)
                 continue
 
+        # Check for inconsistent header patterns after processing all segments
+        if merge_headers is None and len(segment_has_repeated_header) > 2:
+            # During auto-detection, check for consistency across all segments
+            expected_pattern = segment_has_repeated_header[1]  # Pattern from second segment
+            for seg_idx, has_header in enumerate(segment_has_repeated_header[2:], 2):
+                if has_header != expected_pattern:
+                    # Inconsistent pattern detected
+                    segments_with_headers = [
+                        i for i, has_h in enumerate(segment_has_repeated_header[1:], 1) if has_h
+                    ]
+                    segments_without_headers = [
+                        i for i, has_h in enumerate(segment_has_repeated_header[1:], 1) if not has_h
+                    ]
+                    raise ValueError(
+                        f"Inconsistent header pattern in multi-page table: "
+                        f"segments {segments_with_headers} have repeated headers, "
+                        f"but segments {segments_without_headers} do not. "
+                        f"All segments must have the same header pattern for reliable merging."
+                    )
+
         logger.info(
             f"Flow table extraction complete: {len(aggregated_rows)} total rows from {processed_segments}/{len(self.segments)} segments"
         )
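To make the auto-detection contract concrete: header repetition is recorded per segment, index 1 fixes the expected pattern, and indices 2+ must match it. A standalone restatement of the guard above:

```python
from typing import List

def check_header_consistency(repeated: List[bool]) -> None:
    # Mirrors the guard above: repeated[0] is the first segment (never a
    # "repeat"), repeated[1] sets the pattern, repeated[2:] must match it.
    if len(repeated) <= 2:
        return
    expected = repeated[1]
    if any(h != expected for h in repeated[2:]):
        with_h = [i for i, h in enumerate(repeated[1:], 1) if h]
        without_h = [i for i, h in enumerate(repeated[1:], 1) if not h]
        raise ValueError(
            f"Inconsistent header pattern: segments {with_h} repeat headers, "
            f"segments {without_h} do not."
        )

check_header_consistency([False, True, True])    # consistent: no error
# check_header_consistency([False, True, False]) # inconsistent: raises
```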
@@ -558,45 +787,47 @@ class Flow:
             Multi-page layout analysis:
             ```python
             pdf = npdf.PDF("document.pdf")
-
+
             # Create flow for first 3 pages
             page_flow = Flow(
                 segments=pdf.pages[:3],
                 arrangement='vertical'
             )
-
+
             # Analyze layout across all pages (efficiently)
             all_regions = page_flow.analyze_layout(engine='yolo')
-
+
             # Find all tables across the flow
             tables = all_regions.filter('region[type=table]')
             ```
         """
-        from natural_pdf.elements.
-
-        logger.info(
-
+        from natural_pdf.elements.element_collection import ElementCollection
+
+        logger.info(
+            f"Analyzing layout across Flow with {len(self.segments)} segments (engine: {engine or 'default'})"
+        )
+
         if not self.segments:
             logger.warning("Flow has no segments, returning empty collection")
             return ElementCollection([])
 
         # Step 1: Group segments by their parent pages to avoid redundant analysis
         segments_by_page = {}  # Dict[Page, List[Segment]]
-
+
         for i, segment in enumerate(self.segments):
             # Determine the page for this segment
-            if hasattr(segment,
+            if hasattr(segment, "analyze_layout"):
                 # It's a Page object
                 page_obj = segment
                 segment_type = "page"
-            elif hasattr(segment,
+            elif hasattr(segment, "page") and hasattr(segment.page, "analyze_layout"):
                 # It's a Region object
                 page_obj = segment.page
                 segment_type = "region"
             else:
                 logger.warning(f"Segment {i+1} does not support layout analysis, skipping")
                 continue
-
+
             if page_obj not in segments_by_page:
                 segments_by_page[page_obj] = []
             segments_by_page[page_obj].append((segment, segment_type))
@@ -605,7 +836,9 @@ class Flow:
             logger.warning("No segments with analyzable pages found")
             return ElementCollection([])
 
-        logger.debug(
+        logger.debug(
+            f"  Grouped {len(self.segments)} segments into {len(segments_by_page)} unique pages"
+        )
 
         # Step 2: Analyze each unique page only once
         all_detected_regions: List["PhysicalRegion"] = []
@@ -613,8 +846,10 @@ class Flow:
 
         for page_obj, page_segments in segments_by_page.items():
             try:
-                logger.debug(
-
+                logger.debug(
+                    f"  Analyzing layout for page {getattr(page_obj, 'number', '?')} with {len(page_segments)} segments"
+                )
+
                 # Run layout analysis once for this page
                 page_results = page_obj.analyze_layout(
                     engine=engine,
@@ -629,18 +864,22 @@ class Flow:
                 )
 
                 # Extract regions from results
-                if hasattr(page_results,
+                if hasattr(page_results, "elements"):
                     # It's an ElementCollection
                     page_regions = page_results.elements
                 elif isinstance(page_results, list):
                     # It's a list of regions
                     page_regions = page_results
                 else:
-                    logger.warning(
+                    logger.warning(
+                        f"Page {getattr(page_obj, 'number', '?')} returned unexpected layout analysis result type: {type(page_results)}"
+                    )
                     continue
 
                 if not page_regions:
-                    logger.debug(
+                    logger.debug(
+                        f"  No layout regions found on page {getattr(page_obj, 'number', '?')}"
+                    )
                     continue
 
                 # Step 3: For each segment on this page, collect relevant regions
@@ -651,7 +890,7 @@ class Flow:
                         all_detected_regions.extend(page_regions)
                         segments_processed_on_page += 1
                         logger.debug(f"  Added {len(page_regions)} regions for full-page segment")
-
+
                     elif segment_type == "region":
                         # Region segment: filter to only intersecting regions
                         intersecting_regions = []
@@ -660,32 +899,41 @@ class Flow:
                                 if segment.intersects(region):
                                     intersecting_regions.append(region)
                             except Exception as intersect_error:
-                                logger.debug(
+                                logger.debug(
+                                    f"Error checking intersection for region: {intersect_error}"
+                                )
                                 # Include the region anyway if intersection check fails
                                 intersecting_regions.append(region)
-
+
                         all_detected_regions.extend(intersecting_regions)
                         segments_processed_on_page += 1
-                        logger.debug(
+                        logger.debug(
+                            f"  Added {len(intersecting_regions)} intersecting regions for region segment {segment.bbox}"
+                        )
 
                 processed_pages += 1
-                logger.debug(
+                logger.debug(
+                    f"  Processed {segments_processed_on_page} segments on page {getattr(page_obj, 'number', '?')}"
+                )
 
             except Exception as e:
-                logger.error(
+                logger.error(
+                    f"Error analyzing layout for page {getattr(page_obj, 'number', '?')}: {e}",
+                    exc_info=True,
+                )
                 continue
 
         # Step 4: Remove duplicates (can happen if multiple segments intersect the same region)
         unique_regions = []
        seen_region_ids = set()
-
+
        for region in all_detected_regions:
            # Create a unique identifier for this region (page + bbox)
            region_id = (
-                getattr(region.page,
-                region.bbox if hasattr(region,
+                getattr(region.page, "index", id(region.page)),
+                region.bbox if hasattr(region, "bbox") else id(region),
            )
-
+
            if region_id not in seen_region_ids:
                unique_regions.append(region)
                seen_region_ids.add(region_id)
@@ -694,87 +942,54 @@ class Flow:
         if dedupe_removed > 0:
             logger.debug(f"  Removed {dedupe_removed} duplicate regions")
 
-        logger.info(
+        logger.info(
+            f"Flow layout analysis complete: {len(unique_regions)} unique regions from {processed_pages} pages"
+        )
         return ElementCollection(unique_regions)
 
-    def
+    def _get_render_specs(
         self,
-
-
-
-
+        mode: Literal["show", "render"] = "show",
+        color: Optional[Union[str, Tuple[int, int, int]]] = None,
+        highlights: Optional[List[Dict[str, Any]]] = None,
+        crop: Union[bool, Literal["content"]] = False,
+        crop_bbox: Optional[Tuple[float, float, float, float]] = None,
         label_prefix: Optional[str] = "FlowSegment",
-        width: Optional[int] = None,
-        stack_direction: str = "vertical",
-        stack_gap: int = 5,
-        stack_background_color: Tuple[int, int, int] = (255, 255, 255),
-        crop: bool = False,
         **kwargs,
-    ) ->
-        """
-        Generates and returns a PIL Image showing all segments in the flow with highlights.
-
-        This method visualizes the entire flow by highlighting each segment on its respective
-        page and combining the results into a single image. If multiple pages are involved,
-        they are stacked according to the flow's arrangement.
+    ) -> List[RenderSpec]:
+        """Get render specifications for this flow.
 
         Args:
-
-
-
-
-
-
-
-            stack_gap: Gap in pixels between stacked pages.
-            stack_background_color: RGB background color for the stacked image.
-            crop: If True, crop each rendered page to the bounding box of segments on that page.
-            **kwargs: Additional arguments passed to the underlying rendering methods.
+            mode: Rendering mode - 'show' includes highlights, 'render' is clean
+            color: Color for highlighting segments in show mode
+            highlights: Additional highlight groups to show
+            crop: Whether to crop to segments
+            crop_bbox: Explicit crop bounds
+            label_prefix: Prefix for segment labels
+            **kwargs: Additional parameters
 
         Returns:
-
-
-        Example:
-            Visualizing a multi-page flow:
-            ```python
-            pdf = npdf.PDF("document.pdf")
-
-            # Create flow across multiple pages
-            page_flow = Flow(
-                segments=[pdf.pages[0], pdf.pages[1], pdf.pages[2]],
-                arrangement='vertical'
-            )
-
-            # Show the entire flow
-            flow_image = page_flow.show(color="green", labels=True)
-            ```
+            List of RenderSpec objects, one per page with segments
         """
-        logger.info(f"Rendering Flow with {len(self.segments)} segments")
-
         if not self.segments:
-
-            return None
-
-        # Apply global options as defaults for resolution
-        import natural_pdf
-        if resolution is None:
-            if natural_pdf.options.image.resolution is not None:
-                resolution = natural_pdf.options.image.resolution
-            else:
-                resolution = 144  # Default resolution
+            return []
 
-        #
+        # Group segments by their physical pages
         segments_by_page = {}  # Dict[Page, List[PhysicalRegion]]
-
+
         for i, segment in enumerate(self.segments):
             # Get the page for this segment
-            if hasattr(segment,
+            if hasattr(segment, "page") and segment.page is not None:
                 # It's a Region, use its page
                 page_obj = segment.page
                 if page_obj not in segments_by_page:
                     segments_by_page[page_obj] = []
                 segments_by_page[page_obj].append(segment)
-            elif
+            elif (
+                hasattr(segment, "index")
+                and hasattr(segment, "width")
+                and hasattr(segment, "height")
+            ):
                 # It's a full Page object, create a full-page region for it
                 page_obj = segment
                 full_page_region = segment.region(0, 0, segment.width, segment.height)
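This hunk is the heart of the refactor: the old imperative `show()` body (which called `highlighter_service.render_preview` per page) becomes a declarative `_get_render_specs()` that the `Visualizable` base class consumes. A hedged sketch of the implied `RenderSpec` surface, using only the attributes and calls visible in this diff (the file path is a placeholder, and how `Visualizable` consumes the specs is inferred, not shown here):

```python
from natural_pdf import PDF
from natural_pdf.core.render_spec import RenderSpec

page = PDF("document.pdf").pages[0]  # placeholder path

# One spec per physical page, as _get_render_specs() builds them:
spec = RenderSpec(page=page)
spec.crop_bbox = (0, 0, page.width, 200)  # optional crop, in PDF points
spec.add_highlight(bbox=(10, 10, 200, 40), color="blue", label="FlowSegment_1")
# Visualizable.show()/render() then turns a list of such specs into images.
```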
@@ -786,17 +1001,10 @@ class Flow:
                 continue
 
         if not segments_by_page:
-
-            return None
+            return []
 
-        #
-
-        if not hasattr(first_page, '_highlighter'):
-            logger.error("Cannot get highlighter service for Flow.show(). Page missing highlighter.")
-            return None
-
-        highlighter_service = first_page._highlighter
-        output_page_images: List["PIL_Image_Runtime"] = []
+        # Create RenderSpec for each page
+        specs = []
 
         # Sort pages by index for consistent output order
         sorted_pages = sorted(
@@ -804,134 +1012,243 @@ class Flow:
             key=lambda p: p.index if hasattr(p, "index") else getattr(p, "page_number", 0),
         )
 
-        # 3. Render each page with its relevant segments highlighted
         for page_idx, page_obj in enumerate(sorted_pages):
             segments_on_this_page = segments_by_page[page_obj]
             if not segments_on_this_page:
                 continue
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            spec = RenderSpec(page=page_obj)
+
+            # Handle cropping
+            if crop_bbox:
+                spec.crop_bbox = crop_bbox
+            elif crop == "content" or crop is True:
+                # Calculate bounds of segments on this page
+                x_coords = []
+                y_coords = []
+                for segment in segments_on_this_page:
+                    if hasattr(segment, "bbox") and segment.bbox:
+                        x0, y0, x1, y1 = segment.bbox
+                        x_coords.extend([x0, x1])
+                        y_coords.extend([y0, y1])
+
+                if x_coords and y_coords:
+                    spec.crop_bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
+
+            # Add highlights in show mode
+            if mode == "show":
+                # Highlight segments
+                for i, segment in enumerate(segments_on_this_page):
+                    segment_label = None
+                    if label_prefix:
+                        # Create label for this segment
+                        global_segment_idx = None
+                        try:
+                            # Find the global index of this segment in the original flow
+                            global_segment_idx = self.segments.index(segment)
+                        except ValueError:
+                            # If it's a generated full-page region, find its source page
+                            for idx, orig_segment in enumerate(self.segments):
+                                if (
+                                    hasattr(orig_segment, "index")
+                                    and hasattr(segment, "page")
+                                    and orig_segment.index == segment.page.index
+                                ):
+                                    global_segment_idx = idx
+                                    break
+
+                        if global_segment_idx is not None:
+                            segment_label = f"{label_prefix}_{global_segment_idx + 1}"
+                        else:
+                            segment_label = f"{label_prefix}_p{page_idx + 1}s{i + 1}"
+
+                    spec.add_highlight(
+                        bbox=segment.bbox,
+                        polygon=segment.polygon if segment.has_polygon else None,
+                        color=color or "blue",
+                        label=segment_label,
+                    )
 
-
-
+                # Add additional highlight groups if provided
+                if highlights:
+                    for group in highlights:
+                        group_elements = group.get("elements", [])
+                        group_color = group.get("color", color)
+                        group_label = group.get("label")
 
-
-
-
-
-
-
-            max_x1 = max(segment.bbox[2] for segment in segments_on_this_page)
-            max_y1 = max(segment.bbox[3] for segment in segments_on_this_page)
-            crop_bbox = (min_x0, min_y0, max_x1, max_y1)
-
-            # Render this page with highlights
-            page_image = highlighter_service.render_preview(
-                page_index=(
-                    page_obj.index
-                    if hasattr(page_obj, "index")
-                    else getattr(page_obj, "page_number", 1) - 1
-                ),
-                temporary_highlights=temp_highlights_for_page,
-                resolution=resolution,
-                width=width,
-                labels=labels,
-                legend_position=legend_position,
-                crop_bbox=crop_bbox,
-                **kwargs,
-            )
-            if page_image:
-                output_page_images.append(page_image)
+                        for elem in group_elements:
+                            # Only add if element is on this page
+                            if hasattr(elem, "page") and elem.page == page_obj:
+                                spec.add_highlight(
+                                    element=elem, color=group_color, label=group_label
+                                )
 
-
-
-
-
+            specs.append(spec)
+
+        return specs
+
+    def _show_in_context(
+        self,
+        resolution: float,
+        width: Optional[int] = None,
+        stack_direction: str = "vertical",
+        stack_gap: int = 5,
+        stack_background_color: Tuple[int, int, int] = (255, 255, 255),
+        separator_color: Tuple[int, int, int] = (255, 0, 0),
+        separator_thickness: int = 2,
+        **kwargs,
+    ) -> Optional["PIL_Image"]:
+        """
+        Show segments as cropped images stacked together with separators between segments.
+
+        Args:
+            resolution: Resolution in DPI for rendering segment images
+            width: Optional width for segment images
+            stack_direction: Direction to stack segments ('vertical' or 'horizontal')
+            stack_gap: Gap in pixels between segments
+            stack_background_color: RGB background color for the final image
+            separator_color: RGB color for separator lines between segments
+            separator_thickness: Thickness in pixels of separator lines
+            **kwargs: Additional arguments passed to segment rendering
 
-
-
+        Returns:
+            PIL Image with all segments stacked together
+        """
+        from PIL import Image, ImageDraw
 
-
+        segment_images = []
+        segment_pages = []
+
+        # Determine stacking direction
         final_stack_direction = stack_direction
         if stack_direction == "auto":
             final_stack_direction = self.arrangement
 
-        #
+        # Get cropped images for each segment
+        for i, segment in enumerate(self.segments):
+            # Get the page reference for this segment
+            if hasattr(segment, "page") and segment.page is not None:
+                segment_page = segment.page
+                # Get cropped image of the segment
+                # Use render() for clean image without highlights
+                segment_image = segment.render(
+                    resolution=resolution,
+                    crop=True,
+                    width=width,
+                    **kwargs,
+                )
+
+            elif (
+                hasattr(segment, "index")
+                and hasattr(segment, "width")
+                and hasattr(segment, "height")
+            ):
+                # It's a full Page object
+                segment_page = segment
+                # Use render() for clean image without highlights
+                segment_image = segment.render(resolution=resolution, width=width, **kwargs)
+            else:
+                raise ValueError(
+                    f"Segment {i+1} has no identifiable page. Segment type: {type(segment)}, attributes: {dir(segment)}"
+                )
+
+            if segment_image is not None:
+                segment_images.append(segment_image)
+                segment_pages.append(segment_page)
+            else:
+                logger.warning(f"Segment {i+1} render() returned None, skipping")
+
+        # Check if we have any valid images
+        if not segment_images:
+            logger.error("No valid segment images could be rendered")
+            return None
+
+        # We should have at least one segment image by now (or an exception would have been raised)
+        if len(segment_images) == 1:
+            return segment_images[0]
+
+        # Calculate dimensions for the final stacked image
         if final_stack_direction == "vertical":
-
-
-
-
-            )
-
-
+            # Stack vertically
+            final_width = max(img.width for img in segment_images)
+
+            # Calculate total height including gaps and separators
+            total_height = sum(img.height for img in segment_images)
+            total_height += (len(segment_images) - 1) * stack_gap
+
+            # Add separator thickness between all segments
+            num_separators = len(segment_images) - 1 if len(segment_images) > 1 else 0
+            total_height += num_separators * separator_thickness
+
+            # Create the final image
+            final_image = Image.new("RGB", (final_width, total_height), stack_background_color)
+            draw = ImageDraw.Draw(final_image)
 
-            concatenated_image = PIL_Image_Runtime.new(
-                "RGB", (final_width, final_height), stack_background_color
-            )
             current_y = 0
-
-
-
-
-
-
+
+            for i, img in enumerate(segment_images):
+                # Add separator line before each segment (except the first one)
+                if i > 0:
+                    # Draw separator line
+                    draw.rectangle(
+                        [(0, current_y), (final_width, current_y + separator_thickness)],
+                        fill=separator_color,
+                    )
+                    current_y += separator_thickness
+
+                # Paste the segment image
+                paste_x = (final_width - img.width) // 2  # Center horizontally
+                final_image.paste(img, (paste_x, current_y))
+                current_y += img.height
+
+                # Add gap after segment (except for the last one)
+                if i < len(segment_images) - 1:
+                    current_y += stack_gap
+
+            return final_image
+
         elif final_stack_direction == "horizontal":
-
-
-
-
-
-
-
+            # Stack horizontally
+            final_height = max(img.height for img in segment_images)
+
+            # Calculate total width including gaps and separators
+            total_width = sum(img.width for img in segment_images)
+            total_width += (len(segment_images) - 1) * stack_gap
+
+            # Add separator thickness between all segments
+            num_separators = len(segment_images) - 1 if len(segment_images) > 1 else 0
+            total_width += num_separators * separator_thickness
+
+            # Create the final image
+            final_image = Image.new("RGB", (total_width, final_height), stack_background_color)
+            draw = ImageDraw.Draw(final_image)
 
-            concatenated_image = PIL_Image_Runtime.new(
-                "RGB", (final_width, final_height), stack_background_color
-            )
             current_x = 0
-
-
-
-
-
+
+            for i, img in enumerate(segment_images):
+                # Add separator line before each segment (except the first one)
+                if i > 0:
+                    # Draw separator line
+                    draw.rectangle(
+                        [(current_x, 0), (current_x + separator_thickness, final_height)],
+                        fill=separator_color,
+                    )
+                    current_x += separator_thickness
+
+                # Paste the segment image
+                paste_y = (final_height - img.height) // 2  # Center vertically
+                final_image.paste(img, (current_x, paste_y))
+                current_x += img.width
+
+                # Add gap after segment (except for the last one)
+                if i < len(segment_images) - 1:
+                    current_x += stack_gap
+
+            return final_image
+
         else:
             raise ValueError(
-                f"Invalid stack_direction '{final_stack_direction}' for
+                f"Invalid stack_direction '{final_stack_direction}' for in_context. Must be 'vertical' or 'horizontal'."
             )
 
     # --- Helper methods for coordinate transformations and segment iteration ---
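The canvas arithmetic in `_show_in_context` is easy to restate: for n segment images there are n-1 gaps and n-1 separator lines, so, for the vertical case:

```python
# total_height = sum(img.height) + (n - 1) * stack_gap + (n - 1) * separator_thickness
heights = [800, 750, 820]  # three rendered segments (illustrative numbers)
stack_gap, separator_thickness = 5, 2
total = sum(heights) + (len(heights) - 1) * (stack_gap + separator_thickness)
assert total == 2384  # 2370 px of content + 2 * (5 + 2) px of chrome
```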
@@ -972,3 +1289,643 @@ class Flow:
|
|
972
1289
|
raise NotImplementedError(
|
973
1290
|
"Translating element coordinates to a unified flow coordinate system is not yet implemented."
|
974
1291
|
)
|
1292
|
+
|
1293
|
+
def get_sections(
|
1294
|
+
self,
|
1295
|
+
start_elements=None,
|
1296
|
+
end_elements=None,
|
1297
|
+
new_section_on_page_break: bool = False,
|
1298
|
+
include_boundaries: str = "both",
|
1299
|
+
) -> "ElementCollection":
|
1300
|
+
"""
|
1301
|
+
Extract logical sections from the Flow based on *start* and *end* boundary
|
1302
|
+
elements, mirroring the behaviour of PDF/PageCollection.get_sections().
|
1303
|
+
|
1304
|
+
This implementation is a thin wrapper that converts the Flow into a
|
1305
|
+
temporary PageCollection (constructed from the unique pages that the
|
1306
|
+
Flow spans) and then delegates the heavy‐lifting to that existing
|
1307
|
+
implementation. Any FlowElement / FlowElementCollection inputs are
|
1308
|
+
automatically unwrapped to their underlying physical elements so that
|
1309
|
+
PageCollection can work with them directly.
|
1310
|
+
|
1311
|
+
Args:
|
1312
|
+
start_elements: Elements or selector string that mark the start of
|
1313
|
+
sections (optional).
|
1314
|
+
end_elements: Elements or selector string that mark the end of
|
1315
|
+
sections (optional).
|
1316
|
+
new_section_on_page_break: Whether to start a new section at page
|
1317
|
+
boundaries (default: False).
|
1318
|
+
include_boundaries: How to include boundary elements: 'start',
|
1319
|
+
'end', 'both', or 'none' (default: 'both').
|
1320
|
+
|
1321
|
+
Returns:
|
1322
|
+
ElementCollection of Region/FlowRegion objects representing the
|
1323
|
+
extracted sections.
|
1324
|
+
"""
|
1325
|
+
# ------------------------------------------------------------------
|
1326
|
+
# Unwrap FlowElement(-Collection) inputs and selector strings so we
|
1327
|
+
# can reason about them generically.
|
1328
|
+
# ------------------------------------------------------------------
|
1329
|
+
from natural_pdf.flows.collections import FlowElementCollection
|
1330
|
+
from natural_pdf.flows.element import FlowElement
|
1331
|
+
|
1332
|
+
def _unwrap(obj):
|
1333
|
+
"""Convert Flow-specific wrappers to their underlying physical objects.
|
1334
|
+
|
1335
|
+
Keeps selector strings as-is; converts FlowElement to its physical
|
1336
|
+
element; converts FlowElementCollection to list of physical
|
1337
|
+
elements; passes through ElementCollection by taking .elements.
|
1338
|
+
"""
|
1339
|
+
|
1340
|
+
if obj is None or isinstance(obj, str):
|
1341
|
+
return obj
|
1342
|
+
|
1343
|
+
if isinstance(obj, FlowElement):
|
1344
|
+
return obj.physical_object
|
1345
|
+
|
1346
|
+
if isinstance(obj, FlowElementCollection):
|
1347
|
+
return [fe.physical_object for fe in obj.flow_elements]
|
1348
|
+
|
1349
|
+
if hasattr(obj, "elements"):
|
1350
|
+
return obj.elements
|
1351
|
+
|
1352
|
+
if isinstance(obj, (list, tuple, set)):
|
1353
|
+
out = []
|
1354
|
+
for item in obj:
|
1355
|
+
if isinstance(item, FlowElement):
|
1356
|
+
out.append(item.physical_object)
|
1357
|
+
else:
|
1358
|
+
out.append(item)
|
1359
|
+
return out
|
1360
|
+
|
1361
|
+
return obj # Fallback – unknown type
|
1362
|
+
|
1363
|
+
start_elements_unwrapped = _unwrap(start_elements)
|
1364
|
+
end_elements_unwrapped = _unwrap(end_elements)
|
1365
|
+
|
1366
|
+
# ------------------------------------------------------------------
|
1367
|
+
# PRIMARY IMPLEMENTATION – operate on each Flow **segment region**
|
1368
|
+
# independently so that sectioning happens *per-region*, not per page.
|
1369
|
+
# ------------------------------------------------------------------
|
1370
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
1371
|
+
|
1372
|
+
aggregated_sections = []
|
1373
|
+
|
1374
|
+
# Helper to decide if an element lies inside a segment (Region)
|
1375
|
+
def _element_in_segment(elem, segment_region):
|
1376
|
+
try:
|
1377
|
+
return segment_region.intersects(elem) # Region method – robust
|
1378
|
+
except Exception:
|
1379
|
+
# Fallback to bounding-box containment checks
|
1380
|
+
if not hasattr(elem, "bbox"):
|
1381
|
+
return False
|
1382
|
+
ex0, etop, ex1, ebottom = elem.bbox
|
1383
|
+
sx0, stop, sx1, sbottom = segment_region.bbox
|
1384
|
+
return not (ex1 < sx0 or ex0 > sx1 or ebottom < stop or etop > sbottom)
|
1385
|
+
|
1386
|
+
for seg in self.segments:
|
1387
|
+
# Each *seg* is guaranteed to be a Region (see _normalize_segments)
|
1388
|
+
|
1389
|
+
# Resolve segment-specific boundary arguments
|
1390
|
+
seg_start_elems = None
|
1391
|
+
seg_end_elems = None
|
1392
|
+
|
1393
|
+
# --- Handle selector strings ---
|
1394
|
+
if isinstance(start_elements_unwrapped, str):
|
1395
|
+
seg_start_elems = seg.find_all(start_elements_unwrapped).elements
|
1396
|
+
elif start_elements_unwrapped is not None:
|
1397
|
+
seg_start_elems = [
|
1398
|
+
e for e in start_elements_unwrapped if _element_in_segment(e, seg)
|
1399
|
+
]
|
1400
|
+
|
1401
|
+
if isinstance(end_elements_unwrapped, str):
|
1402
|
+
seg_end_elems = seg.find_all(end_elements_unwrapped).elements
|
1403
|
+
elif end_elements_unwrapped is not None:
|
1404
|
+
seg_end_elems = [e for e in end_elements_unwrapped if _element_in_segment(e, seg)]
|
1405
|
+
|
1406
|
+
# Call Region.get_sections – this returns ElementCollection[Region]
|
1407
|
+
seg_sections = seg.get_sections(
|
1408
|
+
start_elements=seg_start_elems,
|
1409
|
+
end_elements=seg_end_elems,
|
1410
|
+
include_boundaries=include_boundaries,
|
1411
|
+
)
|
1412
|
+
|
1413
|
+
if seg_sections:
|
1414
|
+
aggregated_sections.extend(seg_sections.elements)
|
1415
|
+
|
1416
|
+
# Optionally, handle new_section_on_page_break – interpreted here as
|
1417
|
+
# *new_section_on_segment_break*: if True and there were *no* explicit
|
1418
|
+
# boundaries, treat the entire segment as a single section.
|
1419
|
+
if (
|
1420
|
+
new_section_on_page_break
|
1421
|
+
and not seg_sections
|
1422
|
+
and start_elements_unwrapped is None
|
1423
|
+
and end_elements_unwrapped is None
|
1424
|
+
):
|
1425
|
+
aggregated_sections.append(seg)
|
1426
|
+
|
1427
|
+
# ------------------------------------------------------------------
|
1428
|
+
# CROSS-SEGMENT SECTION DETECTION: Check if we have boundaries that
|
1429
|
+
# span multiple segments and create FlowRegions for those cases.
|
1430
|
+
# ------------------------------------------------------------------
|
1431
|
+
|
1432
|
+
# If we have explicit start/end elements, check for cross-segment sections
|
1433
|
+
+        if start_elements_unwrapped is not None and end_elements_unwrapped is not None:
+            # Find all start and end elements across all segments
+            all_start_elements = []
+            all_end_elements = []
+
+            # Map elements to their segments for tracking
+            element_to_segment = {}
+
+            for seg_idx, seg in enumerate(self.segments):
+                if isinstance(start_elements_unwrapped, str):
+                    seg_starts = seg.find_all(start_elements_unwrapped).elements
+                else:
+                    seg_starts = [
+                        e for e in start_elements_unwrapped if _element_in_segment(e, seg)
+                    ]
+
+                if isinstance(end_elements_unwrapped, str):
+                    seg_ends = seg.find_all(end_elements_unwrapped).elements
+                else:
+                    seg_ends = [e for e in end_elements_unwrapped if _element_in_segment(e, seg)]
+
+                for elem in seg_starts:
+                    all_start_elements.append((elem, seg_idx))
+                    element_to_segment[id(elem)] = seg_idx
+
+                for elem in seg_ends:
+                    all_end_elements.append((elem, seg_idx))
+                    element_to_segment[id(elem)] = seg_idx
+
+            # Sort by segment index, then by position within segment
+            all_start_elements.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
+            all_end_elements.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
+
+            # Look for cross-segment pairs (start in one segment, end in another)
+            cross_segment_sections = []
+            used_starts = set()
+            used_ends = set()
+
+            for start_elem, start_seg_idx in all_start_elements:
+                if id(start_elem) in used_starts:
+                    continue
+
+                # Find the next end element that comes after this start
+                matching_end = None
+                for end_elem, end_seg_idx in all_end_elements:
+                    if id(end_elem) in used_ends:
+                        continue
+
+                    # Check if this end comes after the start (by segment order or position)
+                    if end_seg_idx > start_seg_idx or (
+                        end_seg_idx == start_seg_idx
+                        and (
+                            end_elem.top > start_elem.top
+                            or (end_elem.top == start_elem.top and end_elem.x0 >= start_elem.x0)
+                        )
+                    ):
+                        matching_end = (end_elem, end_seg_idx)
+                        break
+
+                if matching_end is not None:
+                    end_elem, end_seg_idx = matching_end
+
+                    # If start and end are in different segments, create FlowRegion
+                    if start_seg_idx != end_seg_idx:
+                        cross_segment_sections.append(
+                            (start_elem, start_seg_idx, end_elem, end_seg_idx)
+                        )
+                        used_starts.add(id(start_elem))
+                        used_ends.add(id(end_elem))
+
+            # Create FlowRegions for cross-segment sections
+            from natural_pdf.elements.region import Region
+            from natural_pdf.flows.element import FlowElement
+            from natural_pdf.flows.region import FlowRegion
+
+            for start_elem, start_seg_idx, end_elem, end_seg_idx in cross_segment_sections:
+                # Build constituent regions spanning from start segment to end segment
+                constituent_regions = []
+
+                # First segment: from start element to bottom
+                start_seg = self.segments[start_seg_idx]
+                first_region = Region(
+                    start_seg.page, (start_seg.x0, start_elem.top, start_seg.x1, start_seg.bottom)
+                )
+                constituent_regions.append(first_region)
+
+                # Middle segments: full segments
+                for seg_idx in range(start_seg_idx + 1, end_seg_idx):
+                    constituent_regions.append(self.segments[seg_idx])
+
+                # Last segment: from top to end element
+                if end_seg_idx != start_seg_idx:
+                    end_seg = self.segments[end_seg_idx]
+                    last_region = Region(
+                        end_seg.page, (end_seg.x0, end_seg.top, end_seg.x1, end_elem.bottom)
+                    )
+                    constituent_regions.append(last_region)
+
+                # Create FlowRegion
+                flow_element = FlowElement(physical_object=start_elem, flow=self)
+                flow_region = FlowRegion(
+                    flow=self,
+                    constituent_regions=constituent_regions,
+                    source_flow_element=flow_element,
+                    boundary_element_found=end_elem,
+                )
+
+                # Remove any single-segment sections that are now covered by this FlowRegion
+                # This prevents duplication of content
+                aggregated_sections = [
+                    s
+                    for s in aggregated_sections
+                    if not any(
+                        cr.intersects(s)
+                        for cr in constituent_regions
+                        if hasattr(cr, "intersects") and hasattr(s, "intersects")
+                    )
+                ]
+
+                aggregated_sections.append(flow_region)
+
+        # ------------------------------------------------------------------
+        # NEW APPROACH: First collect ALL boundary elements across all segments,
+        # then pair them up to create sections (either single-segment Regions
+        # or multi-segment FlowRegions).
+        # ------------------------------------------------------------------
+        from natural_pdf.elements.element_collection import ElementCollection
+        from natural_pdf.elements.region import Region
+        from natural_pdf.flows.element import FlowElement
+        from natural_pdf.flows.region import FlowRegion
+
+        # Helper to decide if an element lies inside a segment (Region)
+        def _element_in_segment(elem, segment_region):
+            try:
+                return segment_region.intersects(elem)  # Region method – robust
+            except Exception:
+                # Fallback to bounding-box containment checks
+                if not hasattr(elem, "bbox"):
+                    return False
+                ex0, etop, ex1, ebottom = elem.bbox
+                sx0, stop, sx1, sbottom = segment_region.bbox
+                return not (ex1 < sx0 or ex0 > sx1 or ebottom < stop or etop > sbottom)
+
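The fallback branch above is the standard axis-aligned bounding-box overlap test: two boxes are disjoint only when one lies entirely to the left, right, above, or below the other. A minimal standalone sketch of the same test, with hypothetical coordinates (natural-pdf uses pdfplumber-style (x0, top, x1, bottom) boxes, where top increases toward the bottom of the page):

def bboxes_overlap(a, b):
    # a and b are (x0, top, x1, bottom) tuples, with top < bottom
    ax0, atop, ax1, abottom = a
    bx0, btop, bx1, bbottom = b
    # Disjoint only when a is fully left/right of b, or fully above/below it
    return not (ax1 < bx0 or ax0 > bx1 or abottom < btop or atop > bbottom)

assert bboxes_overlap((0, 0, 10, 10), (5, 5, 15, 15))       # partial overlap
assert not bboxes_overlap((0, 0, 10, 10), (20, 0, 30, 10))  # fully to the right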
+        # Collect ALL boundary elements across all segments with their segment indices
+        all_start_elements = []
+        all_end_elements = []
+
+        for seg_idx, seg in enumerate(self.segments):
+            # Find start elements in this segment
+            if isinstance(start_elements_unwrapped, str):
+                seg_starts = seg.find_all(start_elements_unwrapped).elements
+            elif start_elements_unwrapped is not None:
+                seg_starts = [e for e in start_elements_unwrapped if _element_in_segment(e, seg)]
+            else:
+                seg_starts = []
+
+            logger.debug(f"\n=== Processing segment {seg_idx} ===")
+            logger.debug(f"Segment bbox: {seg.bbox}")
+            logger.debug(
+                f"Segment page: {seg.page.number if hasattr(seg.page, 'number') else 'unknown'}"
+            )
+
+            logger.debug(f"Found {len(seg_starts)} start elements in segment {seg_idx}")
+            for i, elem in enumerate(seg_starts):
+                logger.debug(
+                    f" Start {i}: bbox={elem.bbox}, text='{getattr(elem, 'text', 'N/A')[:50]}...'"
+                )
+
+            # Find end elements in this segment
+            if isinstance(end_elements_unwrapped, str):
+                seg_ends = seg.find_all(end_elements_unwrapped).elements
+            elif end_elements_unwrapped is not None:
+                seg_ends = [e for e in end_elements_unwrapped if _element_in_segment(e, seg)]
+            else:
+                seg_ends = []
+
+            logger.debug(f"Found {len(seg_ends)} end elements in segment {seg_idx}")
+            for i, elem in enumerate(seg_ends):
+                logger.debug(
+                    f" End {i}: bbox={elem.bbox}, text='{getattr(elem, 'text', 'N/A')[:50]}...'"
+                )
+
+            # Add to global lists with segment index
+            for elem in seg_starts:
+                all_start_elements.append((elem, seg_idx))
+            for elem in seg_ends:
+                all_end_elements.append((elem, seg_idx))
+
+        # Sort by flow order: segment index first, then position within segment
+        all_start_elements.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
+        all_end_elements.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
+
+        logger.debug(f"\n=== Total boundary elements found ===")
+        logger.debug(f"Total start elements: {len(all_start_elements)}")
+        logger.debug(f"Total end elements: {len(all_end_elements)}")
+
+        # Pair up start and end elements to create sections
+        sections = []
+        used_starts = set()
+        used_ends = set()
+
+        for start_elem, start_seg_idx in all_start_elements:
+            if id(start_elem) in used_starts:
+                continue
+
+            logger.debug(f"\n--- Pairing start element from segment {start_seg_idx} ---")
+            logger.debug(
+                f"Start: bbox={start_elem.bbox}, text='{getattr(start_elem, 'text', 'N/A')[:30]}...'"
+            )
+
+            # Find the next unused end element that comes after this start
+            matching_end = None
+            for end_elem, end_seg_idx in all_end_elements:
+                if id(end_elem) in used_ends:
+                    continue
+
+                # Check if this end comes after the start in flow order
+                if end_seg_idx > start_seg_idx or (
+                    end_seg_idx == start_seg_idx
+                    and (
+                        end_elem.top > start_elem.top
+                        or (end_elem.top == start_elem.top and end_elem.x0 >= start_elem.x0)
+                    )
+                ):
+                    matching_end = (end_elem, end_seg_idx)
+                    break
+
+            if matching_end is not None:
+                end_elem, end_seg_idx = matching_end
+                used_starts.add(id(start_elem))
+                used_ends.add(id(end_elem))
+
+                logger.debug(f" Matched! Start seg={start_seg_idx}, End seg={end_seg_idx}")
+
+                # Create section based on whether it spans segments
+                if start_seg_idx == end_seg_idx:
+                    # Single segment section - use Region.get_section_between
+                    seg = self.segments[start_seg_idx]
+                    section = seg.get_section_between(start_elem, end_elem, include_boundaries)
+                    sections.append(section)
+                    logger.debug(f" Created single-segment Region")
+                else:
+                    # Multi-segment section - create FlowRegion
+                    logger.debug(
+                        f" Creating multi-segment FlowRegion spanning segments {start_seg_idx} to {end_seg_idx}"
+                    )
+                    constituent_regions = []
+
+                    # First segment: from start element to bottom
+                    start_seg = self.segments[start_seg_idx]
+                    if include_boundaries in ["start", "both"]:
+                        first_top = start_elem.top
+                    else:
+                        first_top = start_elem.bottom
+                    first_region = Region(
+                        start_seg.page, (start_seg.x0, first_top, start_seg.x1, start_seg.bottom)
+                    )
+                    constituent_regions.append(first_region)
+
+                    # Middle segments: full segments
+                    for seg_idx in range(start_seg_idx + 1, end_seg_idx):
+                        constituent_regions.append(self.segments[seg_idx])
+
+                    # Last segment: from top to end element
+                    end_seg = self.segments[end_seg_idx]
+                    if include_boundaries in ["end", "both"]:
+                        last_bottom = end_elem.bottom
+                    else:
+                        last_bottom = end_elem.top
+                    last_region = Region(
+                        end_seg.page, (end_seg.x0, end_seg.top, end_seg.x1, last_bottom)
+                    )
+                    constituent_regions.append(last_region)
+
+                    # Create FlowRegion
+                    flow_element = FlowElement(physical_object=start_elem, flow=self)
+                    flow_region = FlowRegion(
+                        flow=self,
+                        constituent_regions=constituent_regions,
+                        source_flow_element=flow_element,
+                        boundary_element_found=end_elem,
+                    )
+                    sections.append(flow_region)
+
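The pairing loop above is a greedy sweep: both boundary lists are sorted by (segment index, top, x0), and each start claims the first unused end that follows it in flow order. A minimal sketch of the same strategy on plain tuples, with made-up data and no natural-pdf types:

# Each boundary is (segment_index, top, x0); tuple comparison mirrors the
# flow-order check above: later segment first, then lower on the page,
# then same-or-greater x0 as the tie-break.
starts = sorted([(0, 100, 10), (1, 50, 10)])
ends = sorted([(0, 300, 10), (2, 80, 10)])

pairs = []
used = [False] * len(ends)
for s in starts:
    for i, e in enumerate(ends):
        if not used[i] and e >= s:
            used[i] = True
            pairs.append((s, e))
            break

print(pairs)  # [((0, 100, 10), (0, 300, 10)), ((1, 50, 10), (2, 80, 10))]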
+        # Handle special cases when only start or only end elements are provided
+        if start_elements_unwrapped is not None and end_elements_unwrapped is None:
+            logger.debug(f"\n=== Handling start-only elements (no end elements provided) ===")
+            for i, (start_elem, start_seg_idx) in enumerate(all_start_elements):
+                if id(start_elem) in used_starts:
+                    continue
+
+                # Find next start element
+                next_start = None
+                if i + 1 < len(all_start_elements):
+                    next_start_elem, next_start_seg_idx = all_start_elements[i + 1]
+                    # Create section from this start to just before next start
+                    if start_seg_idx == next_start_seg_idx:
+                        # Same segment
+                        seg = self.segments[start_seg_idx]
+                        # Find element just before next start
+                        all_elems = seg.get_elements()
+                        all_elems.sort(key=lambda e: (e.top, e.x0))
+                        try:
+                            next_idx = all_elems.index(next_start_elem)
+                            if next_idx > 0:
+                                end_elem = all_elems[next_idx - 1]
+                                section = seg.get_section_between(
+                                    start_elem, end_elem, include_boundaries
+                                )
+                                sections.append(section)
+                        except ValueError:
+                            pass
+                    elif next_start_seg_idx == start_seg_idx + 1:
+                        # Next start is in the immediately following segment in the flow
+                        # Create a FlowRegion that spans from current start to just before next start
+                        logger.debug(f" Next start is in next flow segment - creating FlowRegion")
+
+                        constituent_regions = []
+
+                        # First segment: from start element to bottom
+                        start_seg = self.segments[start_seg_idx]
+                        if include_boundaries in ["start", "both"]:
+                            first_top = start_elem.top
+                        else:
+                            first_top = start_elem.bottom
+                        first_region = Region(
+                            start_seg.page,
+                            (start_seg.x0, first_top, start_seg.x1, start_seg.bottom),
+                        )
+                        constituent_regions.append(first_region)
+
+                        # Next segment: from top to just before next start
+                        next_seg = self.segments[next_start_seg_idx]
+                        # Find element just before next start in the next segment
+                        next_seg_elems = next_seg.get_elements()
+                        next_seg_elems.sort(key=lambda e: (e.top, e.x0))
+
+                        last_bottom = next_start_elem.top  # Default to just before the next start
+                        try:
+                            next_idx = next_seg_elems.index(next_start_elem)
+                            if next_idx > 0:
+                                # Use the bottom of the element before next start
+                                prev_elem = next_seg_elems[next_idx - 1]
+                                last_bottom = prev_elem.bottom
+                        except ValueError:
+                            pass
+
+                        last_region = Region(
+                            next_seg.page, (next_seg.x0, next_seg.top, next_seg.x1, last_bottom)
+                        )
+                        constituent_regions.append(last_region)
+
+                        # Create FlowRegion
+                        flow_element = FlowElement(physical_object=start_elem, flow=self)
+                        flow_region = FlowRegion(
+                            flow=self,
+                            constituent_regions=constituent_regions,
+                            source_flow_element=flow_element,
+                            boundary_element_found=None,
+                        )
+                        sections.append(flow_region)
+                        logger.debug(
+                            f" Created FlowRegion with {len(constituent_regions)} constituent regions"
+                        )
+                    else:
+                        # Next start is more than one segment away - just end at current segment
+                        start_seg = self.segments[start_seg_idx]
+                        if include_boundaries in ["start", "both"]:
+                            region_top = start_elem.top
+                        else:
+                            region_top = start_elem.bottom
+                        section = Region(
+                            start_seg.page,
+                            (start_seg.x0, region_top, start_seg.x1, start_seg.bottom),
+                        )
+                        sections.append(section)
+                        logger.debug(
+                            f" Next start is {next_start_seg_idx - start_seg_idx} segments away - ending at current segment"
+                        )
+                else:
+                    # Last start element: section goes to end of flow
+                    # This could span multiple segments
+                    if start_seg_idx == len(self.segments) - 1:
+                        # Only in last segment
+                        seg = self.segments[start_seg_idx]
+                        if include_boundaries in ["start", "both"]:
+                            region_top = start_elem.top
+                        else:
+                            region_top = start_elem.bottom
+                        section = Region(seg.page, (seg.x0, region_top, seg.x1, seg.bottom))
+                        sections.append(section)
+                    else:
+                        # Spans to end of flow - create FlowRegion
+                        constituent_regions = []
+
+                        # First segment
+                        start_seg = self.segments[start_seg_idx]
+                        if include_boundaries in ["start", "both"]:
+                            first_top = start_elem.top
+                        else:
+                            first_top = start_elem.bottom
+                        first_region = Region(
+                            start_seg.page,
+                            (start_seg.x0, first_top, start_seg.x1, start_seg.bottom),
+                        )
+                        constituent_regions.append(first_region)
+
+                        # Remaining segments
+                        for seg_idx in range(start_seg_idx + 1, len(self.segments)):
+                            constituent_regions.append(self.segments[seg_idx])
+
+                        flow_element = FlowElement(physical_object=start_elem, flow=self)
+                        flow_region = FlowRegion(
+                            flow=self,
+                            constituent_regions=constituent_regions,
+                            source_flow_element=flow_element,
+                            boundary_element_found=None,
+                        )
+                        sections.append(flow_region)
+
+        # Handle new_section_on_page_break when no explicit boundaries
+        if (
+            new_section_on_page_break
+            and start_elements_unwrapped is None
+            and end_elements_unwrapped is None
+        ):
+            # Each segment becomes its own section
+            sections = list(self.segments)
+
+        # Sort sections by their position in the flow
+        def _section_sort_key(section):
+            if hasattr(section, "constituent_regions"):
+                # FlowRegion - use first constituent region
+                first_region = (
+                    section.constituent_regions[0] if section.constituent_regions else None
+                )
+                if first_region:
+                    # Find which segment this region belongs to
+                    for idx, seg in enumerate(self.segments):
+                        try:
+                            if seg.intersects(first_region):
+                                return (
+                                    idx,
+                                    getattr(first_region, "top", 0),
+                                    getattr(first_region, "x0", 0),
+                                )
+                        except:
+                            pass
+            else:
+                # Regular Region
+                for idx, seg in enumerate(self.segments):
+                    try:
+                        if seg.intersects(section):
+                            return (idx, getattr(section, "top", 0), getattr(section, "x0", 0))
+                    except:
+                        pass
+            return (float("inf"), 0, 0)
+
+        sections.sort(key=_section_sort_key)
+
+        logger.debug(f"\n=== Section creation complete ===")
+        logger.debug(f"Total sections created: {len(sections)}")
+        for i, section in enumerate(sections):
+            if hasattr(section, "constituent_regions"):
+                logger.debug(
+                    f"Section {i}: FlowRegion with {len(section.constituent_regions)} constituent regions"
+                )
+            else:
+                logger.debug(f"Section {i}: Region with bbox={section.bbox}")
+
+        return ElementCollection(sections)
+
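Taken together, this body yields plain Regions for sections contained in a single segment and FlowRegions for sections that straddle segment boundaries. A usage sketch, assuming this is the body of Flow.get_sections, that the public parameters are named start_elements/end_elements, that Flow is importable from natural_pdf.flows, and that pages can be passed directly as segments (the file name and selector are illustrative):

from natural_pdf import PDF
from natural_pdf.flows import Flow

pdf = PDF("report.pdf")  # hypothetical document
flow = Flow(segments=pdf.pages, arrangement="vertical")

# One section per bold heading; a section that begins near the bottom of one
# page and continues onto the next comes back as a multi-segment FlowRegion.
sections = flow.get_sections(start_elements="text:bold")
for section in sections:
    print(type(section).__name__, section.extract_text()[:60])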
+    def highlights(self, show: bool = False) -> "HighlightContext":
+        """
+        Create a highlight context for accumulating highlights.
+
+        This allows for clean syntax to show multiple highlight groups:
+
+        Example:
+            with flow.highlights() as h:
+                h.add(flow.find_all('table'), label='tables', color='blue')
+                h.add(flow.find_all('text:bold'), label='bold text', color='red')
+                h.show()
+
+        Or with automatic display:
+            with flow.highlights(show=True) as h:
+                h.add(flow.find_all('table'), label='tables')
+                h.add(flow.find_all('text:bold'), label='bold')
+                # Automatically shows when exiting the context
+
+        Args:
+            show: If True, automatically show highlights when exiting context
+
+        Returns:
+            HighlightContext for accumulating highlights
+        """
+        from natural_pdf.core.highlighting_service import HighlightContext
+
+        return HighlightContext(self, show_on_exit=show)
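The context manager composes naturally with the section logic above. A short sketch reusing the docstring's own API, under the same get_sections assumptions as before and assuming h.add accepts the collection that get_sections returns:

with flow.highlights(show=True) as h:
    h.add(flow.find_all('text:bold'), label='headings', color='red')
    h.add(flow.get_sections(start_elements='text:bold'), label='sections', color='green')
# Highlights are rendered automatically when the context exits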