natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +11 -6
- natural_pdf/analyzers/__init__.py +6 -1
- natural_pdf/analyzers/guides.py +354 -258
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +18 -4
- natural_pdf/analyzers/layout/paddle.py +11 -0
- natural_pdf/analyzers/layout/surya.py +2 -3
- natural_pdf/analyzers/shape_detection_mixin.py +25 -34
- natural_pdf/analyzers/text_structure.py +2 -2
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/collections/mixins.py +3 -2
- natural_pdf/core/highlighting_service.py +743 -32
- natural_pdf/core/page.py +252 -399
- natural_pdf/core/page_collection.py +1249 -0
- natural_pdf/core/pdf.py +231 -89
- natural_pdf/{collections → core}/pdf_collection.py +18 -11
- natural_pdf/core/render_spec.py +335 -0
- natural_pdf/describe/base.py +1 -1
- natural_pdf/elements/__init__.py +1 -0
- natural_pdf/elements/base.py +108 -83
- natural_pdf/elements/{collections.py → element_collection.py} +575 -1372
- natural_pdf/elements/line.py +0 -1
- natural_pdf/elements/rect.py +0 -1
- natural_pdf/elements/region.py +405 -280
- natural_pdf/elements/text.py +9 -7
- natural_pdf/exporters/base.py +2 -2
- natural_pdf/exporters/original_pdf.py +1 -1
- natural_pdf/exporters/paddleocr.py +2 -4
- natural_pdf/exporters/searchable_pdf.py +3 -2
- natural_pdf/extraction/mixin.py +1 -3
- natural_pdf/flows/collections.py +1 -69
- natural_pdf/flows/element.py +25 -0
- natural_pdf/flows/flow.py +1658 -19
- natural_pdf/flows/region.py +757 -263
- natural_pdf/ocr/ocr_options.py +0 -2
- natural_pdf/ocr/utils.py +2 -1
- natural_pdf/qa/document_qa.py +21 -5
- natural_pdf/search/search_service_protocol.py +1 -1
- natural_pdf/selectors/parser.py +35 -2
- natural_pdf/tables/result.py +35 -1
- natural_pdf/text_mixin.py +101 -0
- natural_pdf/utils/debug.py +2 -1
- natural_pdf/utils/highlighting.py +1 -0
- natural_pdf/utils/layout.py +2 -2
- natural_pdf/utils/packaging.py +4 -3
- natural_pdf/utils/text_extraction.py +15 -12
- natural_pdf/utils/visualization.py +385 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -52
- optimization/memory_comparison.py +1 -1
- optimization/pdf_analyzer.py +2 -2
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
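The headline change in this release is the expansion of `natural_pdf/flows/flow.py` (+1658 −19), shown in full below: `Flow` now subclasses `Visualizable` (gaining `show()` via the new `core/render_spec.py`), accepts a `PageCollection` as its `segments` argument, and adds cross-segment `extract_table()`, `analyze_layout()`, and `get_sections()` methods. A minimal usage sketch, inferred from the signatures and docstrings in the diff below; the import path and sample file name are assumptions, not taken from the package's documentation:

```python
# Sketch only: inferred from the 0.2.0 flow.py diff below.
# `from natural_pdf.flows import Flow` and the file name are assumptions.
import natural_pdf as npdf
from natural_pdf.flows import Flow

pdf = npdf.PDF("multi_page_table.pdf")

# 0.2.0 accepts a PageCollection (e.g. a pdf.pages slice) as well as a
# plain list of Page/Region segments.
flow = Flow(segments=pdf.pages[1:4], arrangement="vertical")

# Flow is now Visualizable; in_context=True stacks cropped segment images
# with separator lines drawn between them.
flow.show(in_context=True, separator_color=(255, 0, 0))

# Extract one logical table across pages; merge_headers=None (the default)
# auto-detects repeated headers, and stitch_rows merges rows split across
# page breaks.
result = flow.extract_table(
    stitch_rows=lambda row: row and not (row[0] or "").strip(),
)
df = result.df  # TableResult -> pandas DataFrame
```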
natural_pdf/flows/flow.py
CHANGED
@@ -1,19 +1,43 @@
 import logging
-
+import warnings
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    Union,
+    overload,
+)
 
 if TYPE_CHECKING:
+    from PIL.Image import Image as PIL_Image
+
     from natural_pdf.core.page import Page
+    from natural_pdf.core.page_collection import PageCollection
     from natural_pdf.elements.base import Element as PhysicalElement
-    from natural_pdf.elements.
+    from natural_pdf.elements.element_collection import (
+        ElementCollection as PhysicalElementCollection,
+    )
     from natural_pdf.elements.region import Region as PhysicalRegion
 
     from .collections import FlowElementCollection
     from .element import FlowElement
 
+# Import required classes for the new methods
+# For runtime image manipulation
+from PIL import Image as PIL_Image_Runtime
+
+from natural_pdf.core.render_spec import RenderSpec, Visualizable
+from natural_pdf.tables import TableResult
+
 logger = logging.getLogger(__name__)
 
 
-class Flow:
+class Flow(Visualizable):
     """Defines a logical flow or sequence of physical Page or Region objects.
 
     A Flow represents a continuous logical document structure that spans across
@@ -81,7 +105,7 @@ class Flow:
 
     def __init__(
         self,
-        segments: List[Union["Page", "PhysicalRegion"]],
+        segments: Union[List[Union["Page", "PhysicalRegion"]], "PageCollection"],
         arrangement: Literal["vertical", "horizontal"],
         alignment: Literal["start", "center", "end", "top", "left", "bottom", "right"] = "start",
         segment_gap: float = 0.0,
@@ -91,7 +115,8 @@ class Flow:
 
         Args:
             segments: An ordered list of natural_pdf.core.page.Page or
-                natural_pdf.elements.region.Region objects that constitute the flow
+                natural_pdf.elements.region.Region objects that constitute the flow,
+                or a PageCollection containing pages.
             arrangement: The primary direction of the flow.
                 - "vertical": Segments are stacked top-to-bottom.
                 - "horizontal": Segments are arranged left-to-right.
@@ -106,6 +131,10 @@ class Flow:
                 - "bottom" (or "end"): Align bottom edges.
             segment_gap: The virtual gap (in PDF points) between segments.
         """
+        # Handle PageCollection input
+        if hasattr(segments, "pages"):  # It's a PageCollection
+            segments = list(segments.pages)
+
         if not segments:
             raise ValueError("Flow segments cannot be empty.")
         if arrangement not in ["vertical", "horizontal"]:
@@ -165,6 +194,103 @@ class Flow:
                 f"Valid options are: {valid_alignments[self.arrangement]}"
             )
 
+    def _get_highlighter(self):
+        """Get the highlighting service from the first segment."""
+        if not self.segments:
+            raise RuntimeError("Flow has no segments to get highlighter from")
+
+        # Get highlighter from first segment
+        first_segment = self.segments[0]
+        if hasattr(first_segment, "_highlighter"):
+            return first_segment._highlighter
+        elif hasattr(first_segment, "page") and hasattr(first_segment.page, "_highlighter"):
+            return first_segment.page._highlighter
+        else:
+            raise RuntimeError(
+                f"Cannot find HighlightingService from Flow segments. "
+                f"First segment type: {type(first_segment).__name__}"
+            )
+
+    def show(
+        self,
+        *,
+        # Basic rendering options
+        resolution: Optional[float] = None,
+        width: Optional[int] = None,
+        # Highlight options
+        color: Optional[Union[str, Tuple[int, int, int]]] = None,
+        labels: bool = True,
+        label_format: Optional[str] = None,
+        highlights: Optional[List[Dict[str, Any]]] = None,
+        # Layout options for multi-page/region
+        layout: Literal["stack", "grid", "single"] = "stack",
+        stack_direction: Literal["vertical", "horizontal"] = "vertical",
+        gap: int = 5,
+        columns: Optional[int] = None,  # For grid layout
+        # Cropping options
+        crop: Union[bool, Literal["content"]] = False,
+        crop_bbox: Optional[Tuple[float, float, float, float]] = None,
+        # Flow-specific options
+        in_context: bool = False,
+        separator_color: Optional[Tuple[int, int, int]] = None,
+        separator_thickness: int = 2,
+        **kwargs,
+    ) -> Optional["PIL_Image"]:
+        """Generate a preview image with highlights.
+
+        If in_context=True, shows segments as cropped images stacked together
+        with separators between segments.
+
+        Args:
+            resolution: DPI for rendering (default from global settings)
+            width: Target width in pixels (overrides resolution)
+            color: Default highlight color
+            labels: Whether to show labels for highlights
+            label_format: Format string for labels
+            highlights: Additional highlight groups to show
+            layout: How to arrange multiple pages/regions
+            stack_direction: Direction for stack layout
+            gap: Pixels between stacked images
+            columns: Number of columns for grid layout
+            crop: Whether to crop
+            crop_bbox: Explicit crop bounds
+            in_context: If True, use special Flow visualization with separators
+            separator_color: RGB color for separator lines (default: red)
+            separator_thickness: Thickness of separator lines
+            **kwargs: Additional parameters passed to rendering
+
+        Returns:
+            PIL Image object or None if nothing to render
+        """
+        if in_context:
+            # Use the special in_context visualization
+            return self._show_in_context(
+                resolution=resolution or 150,
+                width=width,
+                stack_direction=stack_direction,
+                stack_gap=gap,
+                separator_color=separator_color or (255, 0, 0),
+                separator_thickness=separator_thickness,
+                **kwargs,
+            )
+
+        # Otherwise use the standard show method
+        return super().show(
+            resolution=resolution,
+            width=width,
+            color=color,
+            labels=labels,
+            label_format=label_format,
+            highlights=highlights,
+            layout=layout,
+            stack_direction=stack_direction,
+            gap=gap,
+            columns=columns,
+            crop=crop,
+            crop_bbox=crop_bbox,
+            **kwargs,
+        )
+
     def find(
         self,
         selector: Optional[str] = None,
@@ -213,7 +339,10 @@ class Flow:
     ) -> "FlowElementCollection":
         """
         Finds all elements within the flow that match the given selector or text criteria.
-
+
+        This method efficiently groups segments by their parent pages, searches at the page level,
+        then filters results appropriately for each segment. This ensures elements that intersect
+        with flow segments (but aren't fully contained) are still found.
 
         Elements found are wrapped as FlowElement objects, anchored to this Flow,
         and returned in a FlowElementCollection.
@@ -221,13 +350,42 @@ class Flow:
         from .collections import FlowElementCollection
         from .element import FlowElement
 
+        # Step 1: Group segments by their parent pages (like in analyze_layout)
+        segments_by_page = {}  # Dict[Page, List[Segment]]
+
+        for i, segment in enumerate(self.segments):
+            # Determine the page for this segment - fix type detection
+            if hasattr(segment, "page") and hasattr(segment.page, "find_all"):
+                # It's a Region object (has a parent page)
+                page_obj = segment.page
+                segment_type = "region"
+            elif (
+                hasattr(segment, "find_all")
+                and hasattr(segment, "width")
+                and hasattr(segment, "height")
+                and not hasattr(segment, "page")
+            ):
+                # It's a Page object (has find_all but no parent page)
+                page_obj = segment
+                segment_type = "page"
+            else:
+                logger.warning(f"Segment {i+1} does not support find_all, skipping")
+                continue
+
+            if page_obj not in segments_by_page:
+                segments_by_page[page_obj] = []
+            segments_by_page[page_obj].append((segment, segment_type))
+
+        if not segments_by_page:
+            logger.warning("No segments with searchable pages found")
+            return FlowElementCollection([])
+
+        # Step 2: Search each unique page only once
         all_flow_elements: List["FlowElement"] = []
 
-
-
-
-            # Region.find_all() should return elements in local reading order.
-            matches_in_segment: "PhysicalElementCollection" = physical_segment.find_all(
+        for page_obj, page_segments in segments_by_page.items():
+            # Find all matching elements on this page
+            page_matches = page_obj.find_all(
                 selector=selector,
                 text=text,
                 apply_exclusions=apply_exclusions,
@@ -235,16 +393,56 @@ class Flow:
                 case=case,
                 **kwargs,
             )
-            if matches_in_segment:
-                # Wrap each found physical element as a FlowElement and add to the list
-                # This preserves the order from matches_in_segment.elements
-                for phys_elem in matches_in_segment.elements:
-                    all_flow_elements.append(FlowElement(physical_object=phys_elem, flow=self))
 
-
-
+            if not page_matches:
+                continue
+
+            # Step 3: For each segment on this page, collect relevant elements
+            for segment, segment_type in page_segments:
+                if segment_type == "page":
+                    # Full page segment: include all elements
+                    for phys_elem in page_matches.elements:
+                        all_flow_elements.append(FlowElement(physical_object=phys_elem, flow=self))
+
+                elif segment_type == "region":
+                    # Region segment: filter to only intersecting elements
+                    for phys_elem in page_matches.elements:
+                        try:
+                            # Check if element intersects with this flow segment
+                            if segment.intersects(phys_elem):
+                                all_flow_elements.append(
+                                    FlowElement(physical_object=phys_elem, flow=self)
+                                )
+                        except Exception as intersect_error:
+                            logger.debug(
+                                f"Error checking intersection for element: {intersect_error}"
+                            )
+                            # Include the element anyway if intersection check fails
+                            all_flow_elements.append(
+                                FlowElement(physical_object=phys_elem, flow=self)
+                            )
+
+        # Step 4: Remove duplicates (can happen if multiple segments intersect the same element)
+        unique_flow_elements = []
+        seen_element_ids = set()
+
+        for flow_elem in all_flow_elements:
+            # Create a unique identifier for the underlying physical element
+            phys_elem = flow_elem.physical_object
+            elem_id = (
+                (
+                    getattr(phys_elem.page, "index", id(phys_elem.page))
+                    if hasattr(phys_elem, "page")
+                    else id(phys_elem)
+                ),
+                phys_elem.bbox if hasattr(phys_elem, "bbox") else id(phys_elem),
+            )
+
+            if elem_id not in seen_element_ids:
+                unique_flow_elements.append(flow_elem)
+                seen_element_ids.add(elem_id)
 
-        return FlowElementCollection(
+        return FlowElementCollection(unique_flow_elements)
 
     def __repr__(self) -> str:
         return (
@@ -252,6 +450,807 @@ class Flow:
             f"arrangement='{self.arrangement}', alignment='{self.alignment}', gap={self.segment_gap}>"
         )
 
+    @overload
+    def extract_table(
+        self,
+        method: Optional[str] = None,
+        table_settings: Optional[dict] = None,
+        use_ocr: bool = False,
+        ocr_config: Optional[dict] = None,
+        text_options: Optional[dict] = None,
+        cell_extraction_func: Optional[Any] = None,
+        show_progress: bool = False,
+        content_filter: Optional[Any] = None,
+        stitch_rows: Callable[[List[Optional[str]]], bool] = None,
+    ) -> TableResult: ...
+
+    @overload
+    def extract_table(
+        self,
+        method: Optional[str] = None,
+        table_settings: Optional[dict] = None,
+        use_ocr: bool = False,
+        ocr_config: Optional[dict] = None,
+        text_options: Optional[dict] = None,
+        cell_extraction_func: Optional[Any] = None,
+        show_progress: bool = False,
+        content_filter: Optional[Any] = None,
+        stitch_rows: Callable[
+            [List[Optional[str]], List[Optional[str]], int, Union["Page", "PhysicalRegion"]],
+            bool,
+        ] = None,
+    ) -> TableResult: ...
+
+    def extract_table(
+        self,
+        method: Optional[str] = None,
+        table_settings: Optional[dict] = None,
+        use_ocr: bool = False,
+        ocr_config: Optional[dict] = None,
+        text_options: Optional[dict] = None,
+        cell_extraction_func: Optional[Any] = None,
+        show_progress: bool = False,
+        content_filter: Optional[Any] = None,
+        stitch_rows: Optional[Callable] = None,
+        merge_headers: Optional[bool] = None,
+    ) -> TableResult:
+        """
+        Extract table data from all segments in the flow, combining results sequentially.
+
+        This method extracts table data from each segment in flow order and combines
+        the results into a single logical table. This is particularly useful for
+        multi-page tables or tables that span across columns.
+
+        Args:
+            method: Method to use: 'tatr', 'pdfplumber', 'text', 'stream', 'lattice', or None (auto-detect).
+            table_settings: Settings for pdfplumber table extraction.
+            use_ocr: Whether to use OCR for text extraction (currently only applicable with 'tatr' method).
+            ocr_config: OCR configuration parameters.
+            text_options: Dictionary of options for the 'text' method.
+            cell_extraction_func: Optional callable function that takes a cell Region object
+                and returns its string content. For 'text' method only.
+            show_progress: If True, display a progress bar during cell text extraction for the 'text' method.
+            content_filter: Optional content filter to apply during cell text extraction.
+            merge_headers: Whether to merge tables by removing repeated headers from subsequent
+                segments. If None (default), auto-detects by checking if the first row
+                of each segment matches the first row of the first segment. If segments have
+                inconsistent header patterns (some repeat, others don't), raises ValueError.
+                Useful for multi-page tables where headers repeat on each page.
+            stitch_rows: Optional callable to determine when rows should be merged across
+                segment boundaries. Applied AFTER header removal if merge_headers
+                is enabled. Two overloaded signatures are supported:
+
+                • func(current_row) -> bool
+                    Called only on the first row of each segment (after the first).
+                    Return True to merge this first row with the last row from
+                    the previous segment.
+
+                • func(prev_row, current_row, row_index, segment) -> bool
+                    Called for every row. Return True to merge current_row with
+                    the previous row in the aggregated results.
+
+                When True is returned, rows are concatenated cell-by-cell.
+                This is useful for handling table rows split across page
+                boundaries or segments. If None, rows are never merged.
+
+        Returns:
+            TableResult object containing the aggregated table data from all segments.
+
+        Example:
+            Multi-page table extraction:
+            ```python
+            pdf = npdf.PDF("multi_page_table.pdf")
+
+            # Create flow for table spanning pages 2-4
+            table_flow = Flow(
+                segments=[pdf.pages[1], pdf.pages[2], pdf.pages[3]],
+                arrangement='vertical'
+            )
+
+            # Extract table as if it were continuous
+            table_data = table_flow.extract_table()
+            df = table_data.df  # Convert to pandas DataFrame
+
+            # Custom row stitching - single parameter (simple case)
+            table_data = table_flow.extract_table(
+                stitch_rows=lambda row: row and not (row[0] or "").strip()
+            )
+
+            # Custom row stitching - full parameters (advanced case)
+            table_data = table_flow.extract_table(
+                stitch_rows=lambda prev, curr, idx, seg: idx == 0 and curr and not (curr[0] or "").strip()
+            )
+            ```
+        """
+        logger.info(
+            f"Extracting table from Flow with {len(self.segments)} segments (method: {method or 'auto'})"
+        )
+
+        if not self.segments:
+            logger.warning("Flow has no segments, returning empty table")
+            return TableResult([])
+
+        # Resolve predicate and determine its signature
+        predicate: Optional[Callable] = None
+        predicate_type: str = "none"
+
+        if callable(stitch_rows):
+            import inspect
+
+            sig = inspect.signature(stitch_rows)
+            param_count = len(sig.parameters)
+
+            if param_count == 1:
+                predicate = stitch_rows
+                predicate_type = "single_param"
+            elif param_count == 4:
+                predicate = stitch_rows
+                predicate_type = "full_params"
+            else:
+                logger.warning(
+                    f"stitch_rows function has {param_count} parameters, expected 1 or 4. Ignoring."
+                )
+                predicate = None
+                predicate_type = "none"
+
+        def _default_merge(
+            prev_row: List[Optional[str]], cur_row: List[Optional[str]]
+        ) -> List[Optional[str]]:
+            from itertools import zip_longest
+
+            merged: List[Optional[str]] = []
+            for p, c in zip_longest(prev_row, cur_row, fillvalue=""):
+                if (p or "").strip() and (c or "").strip():
+                    merged.append(f"{p} {c}".strip())
+                else:
+                    merged.append((p or "") + (c or ""))
+            return merged
+
+        aggregated_rows: List[List[Optional[str]]] = []
+        processed_segments = 0
+        header_row: Optional[List[Optional[str]]] = None
+        merge_headers_enabled = False
+        headers_warned = False  # Track if we've already warned about dropping headers
+        segment_has_repeated_header = []  # Track which segments have repeated headers
+
+        for seg_idx, segment in enumerate(self.segments):
+            try:
+                logger.debug(f" Extracting table from segment {seg_idx+1}/{len(self.segments)}")
+
+                segment_result = segment.extract_table(
+                    method=method,
+                    table_settings=table_settings.copy() if table_settings else None,
+                    use_ocr=use_ocr,
+                    ocr_config=ocr_config,
+                    text_options=text_options.copy() if text_options else None,
+                    cell_extraction_func=cell_extraction_func,
+                    show_progress=show_progress,
+                    content_filter=content_filter,
+                )
+
+                if not segment_result:
+                    continue
+
+                if hasattr(segment_result, "_rows"):
+                    segment_rows = list(segment_result._rows)
+                else:
+                    segment_rows = list(segment_result)
+
+                if not segment_rows:
+                    logger.debug(f" No table data found in segment {seg_idx+1}")
+                    continue
+
+                # Handle header detection and merging for multi-page tables
+                if seg_idx == 0:
+                    # First segment: capture potential header row
+                    if segment_rows:
+                        header_row = segment_rows[0]
+                        # Determine if we should merge headers
+                        if merge_headers is None:
+                            # Auto-detect: we'll check all subsequent segments
+                            merge_headers_enabled = False  # Will be determined later
+                        else:
+                            merge_headers_enabled = merge_headers
+                        # Track that first segment exists (for consistency checking)
+                        segment_has_repeated_header.append(False)  # First segment doesn't "repeat"
+                elif seg_idx == 1 and merge_headers is None:
+                    # Auto-detection: check if first row of second segment matches header
+                    has_header = segment_rows and header_row and segment_rows[0] == header_row
+                    segment_has_repeated_header.append(has_header)
+
+                    if has_header:
+                        merge_headers_enabled = True
+                        # Remove the detected repeated header from this segment
+                        segment_rows = segment_rows[1:]
+                        logger.debug(
+                            f" Auto-detected repeated header in segment {seg_idx+1}, removed"
+                        )
+                        if not headers_warned:
+                            warnings.warn(
+                                "Detected repeated headers in multi-page table. Merging by removing "
+                                "repeated headers from subsequent pages.",
+                                UserWarning,
+                                stacklevel=2,
+                            )
+                            headers_warned = True
+                    else:
+                        merge_headers_enabled = False
+                        logger.debug(f" No repeated header detected in segment {seg_idx+1}")
+                elif seg_idx > 1:
+                    # Check consistency: all segments should have same pattern
+                    has_header = segment_rows and header_row and segment_rows[0] == header_row
+                    segment_has_repeated_header.append(has_header)
+
+                    # Remove header if merging is enabled and header is present
+                    if merge_headers_enabled and has_header:
+                        segment_rows = segment_rows[1:]
+                        logger.debug(f" Removed repeated header from segment {seg_idx+1}")
+                elif seg_idx > 0 and merge_headers_enabled:
+                    # Explicit merge_headers=True: remove headers from subsequent segments
+                    if segment_rows and header_row and segment_rows[0] == header_row:
+                        segment_rows = segment_rows[1:]
+                        logger.debug(f" Removed repeated header from segment {seg_idx+1}")
+                        if not headers_warned:
+                            warnings.warn(
+                                "Removing repeated headers from multi-page table during merge.",
+                                UserWarning,
+                                stacklevel=2,
+                            )
+                            headers_warned = True
+
+                for row_idx, row in enumerate(segment_rows):
+                    should_merge = False
+
+                    if predicate is not None and aggregated_rows:
+                        if predicate_type == "single_param":
+                            # For single param: only call on first row of segment (row_idx == 0)
+                            # and pass the current row
+                            if row_idx == 0:
+                                should_merge = predicate(row)
+                        elif predicate_type == "full_params":
+                            # For full params: call with all arguments
+                            should_merge = predicate(aggregated_rows[-1], row, row_idx, segment)
+
+                    if should_merge:
+                        aggregated_rows[-1] = _default_merge(aggregated_rows[-1], row)
+                    else:
+                        aggregated_rows.append(row)
+
+                processed_segments += 1
+                logger.debug(
+                    f" Added {len(segment_rows)} rows (post-merge) from segment {seg_idx+1}"
+                )
+
+            except Exception as e:
+                logger.error(f"Error extracting table from segment {seg_idx+1}: {e}", exc_info=True)
+                continue
+
+        # Check for inconsistent header patterns after processing all segments
+        if merge_headers is None and len(segment_has_repeated_header) > 2:
+            # During auto-detection, check for consistency across all segments
+            expected_pattern = segment_has_repeated_header[1]  # Pattern from second segment
+            for seg_idx, has_header in enumerate(segment_has_repeated_header[2:], 2):
+                if has_header != expected_pattern:
+                    # Inconsistent pattern detected
+                    segments_with_headers = [
+                        i for i, has_h in enumerate(segment_has_repeated_header[1:], 1) if has_h
+                    ]
+                    segments_without_headers = [
+                        i for i, has_h in enumerate(segment_has_repeated_header[1:], 1) if not has_h
+                    ]
+                    raise ValueError(
+                        f"Inconsistent header pattern in multi-page table: "
+                        f"segments {segments_with_headers} have repeated headers, "
+                        f"but segments {segments_without_headers} do not. "
+                        f"All segments must have the same header pattern for reliable merging."
+                    )
+
+        logger.info(
+            f"Flow table extraction complete: {len(aggregated_rows)} total rows from {processed_segments}/{len(self.segments)} segments"
+        )
+        return TableResult(aggregated_rows)
+
+    def analyze_layout(
+        self,
+        engine: Optional[str] = None,
+        options: Optional[Any] = None,
+        confidence: Optional[float] = None,
+        classes: Optional[List[str]] = None,
+        exclude_classes: Optional[List[str]] = None,
+        device: Optional[str] = None,
+        existing: str = "replace",
+        model_name: Optional[str] = None,
+        client: Optional[Any] = None,
+    ) -> "PhysicalElementCollection":
+        """
+        Analyze layout across all segments in the flow.
+
+        This method efficiently groups segments by their parent pages, runs layout analysis
+        only once per unique page, then filters results appropriately for each segment.
+        This avoids redundant analysis when multiple flow segments come from the same page.
+
+        Args:
+            engine: Name of the layout engine (e.g., 'yolo', 'tatr'). Uses manager's default if None.
+            options: Specific LayoutOptions object for advanced configuration.
+            confidence: Minimum confidence threshold.
+            classes: Specific classes to detect.
+            exclude_classes: Classes to exclude.
+            device: Device for inference.
+            existing: How to handle existing detected regions: 'replace' (default) or 'append'.
+            model_name: Optional model name for the engine.
+            client: Optional client for API-based engines.
+
+        Returns:
+            ElementCollection containing all detected Region objects from all segments.
+
+        Example:
+            Multi-page layout analysis:
+            ```python
+            pdf = npdf.PDF("document.pdf")
+
+            # Create flow for first 3 pages
+            page_flow = Flow(
+                segments=pdf.pages[:3],
+                arrangement='vertical'
+            )
+
+            # Analyze layout across all pages (efficiently)
+            all_regions = page_flow.analyze_layout(engine='yolo')
+
+            # Find all tables across the flow
+            tables = all_regions.filter('region[type=table]')
+            ```
+        """
+        from natural_pdf.elements.element_collection import ElementCollection
+
+        logger.info(
+            f"Analyzing layout across Flow with {len(self.segments)} segments (engine: {engine or 'default'})"
+        )
+
+        if not self.segments:
+            logger.warning("Flow has no segments, returning empty collection")
+            return ElementCollection([])
+
+        # Step 1: Group segments by their parent pages to avoid redundant analysis
+        segments_by_page = {}  # Dict[Page, List[Segment]]
+
+        for i, segment in enumerate(self.segments):
+            # Determine the page for this segment
+            if hasattr(segment, "analyze_layout"):
+                # It's a Page object
+                page_obj = segment
+                segment_type = "page"
+            elif hasattr(segment, "page") and hasattr(segment.page, "analyze_layout"):
+                # It's a Region object
+                page_obj = segment.page
+                segment_type = "region"
+            else:
+                logger.warning(f"Segment {i+1} does not support layout analysis, skipping")
+                continue
+
+            if page_obj not in segments_by_page:
+                segments_by_page[page_obj] = []
+            segments_by_page[page_obj].append((segment, segment_type))
+
+        if not segments_by_page:
+            logger.warning("No segments with analyzable pages found")
+            return ElementCollection([])
+
+        logger.debug(
+            f" Grouped {len(self.segments)} segments into {len(segments_by_page)} unique pages"
+        )
+
+        # Step 2: Analyze each unique page only once
+        all_detected_regions: List["PhysicalRegion"] = []
+        processed_pages = 0
+
+        for page_obj, page_segments in segments_by_page.items():
+            try:
+                logger.debug(
+                    f" Analyzing layout for page {getattr(page_obj, 'number', '?')} with {len(page_segments)} segments"
+                )
+
+                # Run layout analysis once for this page
+                page_results = page_obj.analyze_layout(
+                    engine=engine,
+                    options=options,
+                    confidence=confidence,
+                    classes=classes,
+                    exclude_classes=exclude_classes,
+                    device=device,
+                    existing=existing,
+                    model_name=model_name,
+                    client=client,
+                )
+
+                # Extract regions from results
+                if hasattr(page_results, "elements"):
+                    # It's an ElementCollection
+                    page_regions = page_results.elements
+                elif isinstance(page_results, list):
+                    # It's a list of regions
+                    page_regions = page_results
+                else:
+                    logger.warning(
+                        f"Page {getattr(page_obj, 'number', '?')} returned unexpected layout analysis result type: {type(page_results)}"
+                    )
+                    continue
+
+                if not page_regions:
+                    logger.debug(
+                        f" No layout regions found on page {getattr(page_obj, 'number', '?')}"
+                    )
+                    continue
+
+                # Step 3: For each segment on this page, collect relevant regions
+                segments_processed_on_page = 0
+                for segment, segment_type in page_segments:
+                    if segment_type == "page":
+                        # Full page segment: include all detected regions
+                        all_detected_regions.extend(page_regions)
+                        segments_processed_on_page += 1
+                        logger.debug(f" Added {len(page_regions)} regions for full-page segment")
+
+                    elif segment_type == "region":
+                        # Region segment: filter to only intersecting regions
+                        intersecting_regions = []
+                        for region in page_regions:
+                            try:
+                                if segment.intersects(region):
+                                    intersecting_regions.append(region)
+                            except Exception as intersect_error:
+                                logger.debug(
+                                    f"Error checking intersection for region: {intersect_error}"
+                                )
+                                # Include the region anyway if intersection check fails
+                                intersecting_regions.append(region)
+
+                        all_detected_regions.extend(intersecting_regions)
+                        segments_processed_on_page += 1
+                        logger.debug(
+                            f" Added {len(intersecting_regions)} intersecting regions for region segment {segment.bbox}"
+                        )
+
+                processed_pages += 1
+                logger.debug(
+                    f" Processed {segments_processed_on_page} segments on page {getattr(page_obj, 'number', '?')}"
+                )
+
+            except Exception as e:
+                logger.error(
+                    f"Error analyzing layout for page {getattr(page_obj, 'number', '?')}: {e}",
+                    exc_info=True,
+                )
+                continue
+
+        # Step 4: Remove duplicates (can happen if multiple segments intersect the same region)
+        unique_regions = []
+        seen_region_ids = set()
+
+        for region in all_detected_regions:
+            # Create a unique identifier for this region (page + bbox)
+            region_id = (
+                getattr(region.page, "index", id(region.page)),
+                region.bbox if hasattr(region, "bbox") else id(region),
+            )
+
+            if region_id not in seen_region_ids:
+                unique_regions.append(region)
+                seen_region_ids.add(region_id)
+
+        dedupe_removed = len(all_detected_regions) - len(unique_regions)
+        if dedupe_removed > 0:
+            logger.debug(f" Removed {dedupe_removed} duplicate regions")
+
+        logger.info(
+            f"Flow layout analysis complete: {len(unique_regions)} unique regions from {processed_pages} pages"
+        )
+        return ElementCollection(unique_regions)
+
+    def _get_render_specs(
+        self,
+        mode: Literal["show", "render"] = "show",
+        color: Optional[Union[str, Tuple[int, int, int]]] = None,
+        highlights: Optional[List[Dict[str, Any]]] = None,
+        crop: Union[bool, Literal["content"]] = False,
+        crop_bbox: Optional[Tuple[float, float, float, float]] = None,
+        label_prefix: Optional[str] = "FlowSegment",
+        **kwargs,
+    ) -> List[RenderSpec]:
+        """Get render specifications for this flow.
+
+        Args:
+            mode: Rendering mode - 'show' includes highlights, 'render' is clean
+            color: Color for highlighting segments in show mode
+            highlights: Additional highlight groups to show
+            crop: Whether to crop to segments
+            crop_bbox: Explicit crop bounds
+            label_prefix: Prefix for segment labels
+            **kwargs: Additional parameters
+
+        Returns:
+            List of RenderSpec objects, one per page with segments
+        """
+        if not self.segments:
+            return []
+
+        # Group segments by their physical pages
+        segments_by_page = {}  # Dict[Page, List[PhysicalRegion]]
+
+        for i, segment in enumerate(self.segments):
+            # Get the page for this segment
+            if hasattr(segment, "page") and segment.page is not None:
+                # It's a Region, use its page
+                page_obj = segment.page
+                if page_obj not in segments_by_page:
+                    segments_by_page[page_obj] = []
+                segments_by_page[page_obj].append(segment)
+            elif (
+                hasattr(segment, "index")
+                and hasattr(segment, "width")
+                and hasattr(segment, "height")
+            ):
+                # It's a full Page object, create a full-page region for it
+                page_obj = segment
+                full_page_region = segment.region(0, 0, segment.width, segment.height)
+                if page_obj not in segments_by_page:
+                    segments_by_page[page_obj] = []
+                segments_by_page[page_obj].append(full_page_region)
+            else:
+                logger.warning(f"Segment {i+1} has no identifiable page, skipping")
+                continue
+
+        if not segments_by_page:
+            return []
+
+        # Create RenderSpec for each page
+        specs = []
+
+        # Sort pages by index for consistent output order
+        sorted_pages = sorted(
+            segments_by_page.keys(),
+            key=lambda p: p.index if hasattr(p, "index") else getattr(p, "page_number", 0),
+        )
+
+        for page_idx, page_obj in enumerate(sorted_pages):
+            segments_on_this_page = segments_by_page[page_obj]
+            if not segments_on_this_page:
+                continue
+
+            spec = RenderSpec(page=page_obj)
+
+            # Handle cropping
+            if crop_bbox:
+                spec.crop_bbox = crop_bbox
+            elif crop == "content" or crop is True:
+                # Calculate bounds of segments on this page
+                x_coords = []
+                y_coords = []
+                for segment in segments_on_this_page:
+                    if hasattr(segment, "bbox") and segment.bbox:
+                        x0, y0, x1, y1 = segment.bbox
+                        x_coords.extend([x0, x1])
+                        y_coords.extend([y0, y1])
+
+                if x_coords and y_coords:
+                    spec.crop_bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
+
+            # Add highlights in show mode
+            if mode == "show":
+                # Highlight segments
+                for i, segment in enumerate(segments_on_this_page):
+                    segment_label = None
+                    if label_prefix:
+                        # Create label for this segment
+                        global_segment_idx = None
+                        try:
+                            # Find the global index of this segment in the original flow
+                            global_segment_idx = self.segments.index(segment)
+                        except ValueError:
+                            # If it's a generated full-page region, find its source page
+                            for idx, orig_segment in enumerate(self.segments):
+                                if (
+                                    hasattr(orig_segment, "index")
+                                    and hasattr(segment, "page")
+                                    and orig_segment.index == segment.page.index
+                                ):
+                                    global_segment_idx = idx
+                                    break
+
+                        if global_segment_idx is not None:
+                            segment_label = f"{label_prefix}_{global_segment_idx + 1}"
+                        else:
+                            segment_label = f"{label_prefix}_p{page_idx + 1}s{i + 1}"
+
+                    spec.add_highlight(
+                        bbox=segment.bbox,
+                        polygon=segment.polygon if segment.has_polygon else None,
+                        color=color or "blue",
+                        label=segment_label,
+                    )
+
+                # Add additional highlight groups if provided
+                if highlights:
+                    for group in highlights:
+                        group_elements = group.get("elements", [])
+                        group_color = group.get("color", color)
+                        group_label = group.get("label")
+
+                        for elem in group_elements:
+                            # Only add if element is on this page
+                            if hasattr(elem, "page") and elem.page == page_obj:
+                                spec.add_highlight(
+                                    element=elem, color=group_color, label=group_label
+                                )
+
+            specs.append(spec)
+
+        return specs
+
+    def _show_in_context(
+        self,
+        resolution: float,
+        width: Optional[int] = None,
+        stack_direction: str = "vertical",
+        stack_gap: int = 5,
+        stack_background_color: Tuple[int, int, int] = (255, 255, 255),
+        separator_color: Tuple[int, int, int] = (255, 0, 0),
+        separator_thickness: int = 2,
+        **kwargs,
+    ) -> Optional["PIL_Image"]:
+        """
+        Show segments as cropped images stacked together with separators between segments.
+
+        Args:
+            resolution: Resolution in DPI for rendering segment images
+            width: Optional width for segment images
+            stack_direction: Direction to stack segments ('vertical' or 'horizontal')
+            stack_gap: Gap in pixels between segments
+            stack_background_color: RGB background color for the final image
+            separator_color: RGB color for separator lines between segments
+            separator_thickness: Thickness in pixels of separator lines
+            **kwargs: Additional arguments passed to segment rendering
+
+        Returns:
+            PIL Image with all segments stacked together
+        """
+        from PIL import Image, ImageDraw
+
+        segment_images = []
+        segment_pages = []
+
+        # Determine stacking direction
+        final_stack_direction = stack_direction
+        if stack_direction == "auto":
+            final_stack_direction = self.arrangement
+
+        # Get cropped images for each segment
+        for i, segment in enumerate(self.segments):
+            # Get the page reference for this segment
+            if hasattr(segment, "page") and segment.page is not None:
+                segment_page = segment.page
+                # Get cropped image of the segment
+                # Use render() for clean image without highlights
+                segment_image = segment.render(
+                    resolution=resolution,
+                    crop=True,
+                    width=width,
+                    **kwargs,
+                )
+
+            elif (
+                hasattr(segment, "index")
+                and hasattr(segment, "width")
+                and hasattr(segment, "height")
+            ):
+                # It's a full Page object
+                segment_page = segment
+                # Use render() for clean image without highlights
+                segment_image = segment.render(resolution=resolution, width=width, **kwargs)
+            else:
+                raise ValueError(
+                    f"Segment {i+1} has no identifiable page. Segment type: {type(segment)}, attributes: {dir(segment)}"
+                )
+
+            if segment_image is not None:
+                segment_images.append(segment_image)
+                segment_pages.append(segment_page)
+            else:
+                logger.warning(f"Segment {i+1} render() returned None, skipping")
+
+        # Check if we have any valid images
+        if not segment_images:
+            logger.error("No valid segment images could be rendered")
+            return None
+
+        # We should have at least one segment image by now (or an exception would have been raised)
+        if len(segment_images) == 1:
+            return segment_images[0]
+
+        # Calculate dimensions for the final stacked image
+        if final_stack_direction == "vertical":
+            # Stack vertically
+            final_width = max(img.width for img in segment_images)
+
+            # Calculate total height including gaps and separators
+            total_height = sum(img.height for img in segment_images)
+            total_height += (len(segment_images) - 1) * stack_gap
+
+            # Add separator thickness between all segments
+            num_separators = len(segment_images) - 1 if len(segment_images) > 1 else 0
+            total_height += num_separators * separator_thickness
+
+            # Create the final image
+            final_image = Image.new("RGB", (final_width, total_height), stack_background_color)
+            draw = ImageDraw.Draw(final_image)
+
+            current_y = 0
+
+            for i, img in enumerate(segment_images):
+                # Add separator line before each segment (except the first one)
+                if i > 0:
+                    # Draw separator line
+                    draw.rectangle(
+                        [(0, current_y), (final_width, current_y + separator_thickness)],
+                        fill=separator_color,
+                    )
+                    current_y += separator_thickness
+
+                # Paste the segment image
+                paste_x = (final_width - img.width) // 2  # Center horizontally
+                final_image.paste(img, (paste_x, current_y))
+                current_y += img.height
+
+                # Add gap after segment (except for the last one)
+                if i < len(segment_images) - 1:
+                    current_y += stack_gap
+
+            return final_image
+
+        elif final_stack_direction == "horizontal":
+            # Stack horizontally
+            final_height = max(img.height for img in segment_images)
+
+            # Calculate total width including gaps and separators
+            total_width = sum(img.width for img in segment_images)
+            total_width += (len(segment_images) - 1) * stack_gap
+
+            # Add separator thickness between all segments
+            num_separators = len(segment_images) - 1 if len(segment_images) > 1 else 0
+            total_width += num_separators * separator_thickness
+
+            # Create the final image
+            final_image = Image.new("RGB", (total_width, final_height), stack_background_color)
+            draw = ImageDraw.Draw(final_image)
+
+            current_x = 0
+
+            for i, img in enumerate(segment_images):
+                # Add separator line before each segment (except the first one)
+                if i > 0:
+                    # Draw separator line
+                    draw.rectangle(
+                        [(current_x, 0), (current_x + separator_thickness, final_height)],
+                        fill=separator_color,
+                    )
+                    current_x += separator_thickness
+
+                # Paste the segment image
+                paste_y = (final_height - img.height) // 2  # Center vertically
+                final_image.paste(img, (current_x, paste_y))
+                current_x += img.width
+
+                # Add gap after segment (except for the last one)
+                if i < len(segment_images) - 1:
+                    current_x += stack_gap
+
+            return final_image
+
+        else:
+            raise ValueError(
+                f"Invalid stack_direction '{final_stack_direction}' for in_context. Must be 'vertical' or 'horizontal'."
+            )
+
     # --- Helper methods for coordinate transformations and segment iteration ---
     # These will be crucial for FlowElement's directional methods.
 
@@ -290,3 +1289,643 @@ class Flow:
|
|
290
1289
|
raise NotImplementedError(
|
291
1290
|
"Translating element coordinates to a unified flow coordinate system is not yet implemented."
|
292
1291
|
)
|
1292
|
+
|
1293
|
+
def get_sections(
|
1294
|
+
self,
|
1295
|
+
start_elements=None,
|
1296
|
+
end_elements=None,
|
1297
|
+
new_section_on_page_break: bool = False,
|
1298
|
+
include_boundaries: str = "both",
|
1299
|
+
) -> "ElementCollection":
|
1300
|
+
"""
|
1301
|
+
Extract logical sections from the Flow based on *start* and *end* boundary
|
1302
|
+
elements, mirroring the behaviour of PDF/PageCollection.get_sections().
|
1303
|
+
|
1304
|
+
This implementation is a thin wrapper that converts the Flow into a
|
1305
|
+
temporary PageCollection (constructed from the unique pages that the
|
1306
|
+
Flow spans) and then delegates the heavy‐lifting to that existing
|
1307
|
+
implementation. Any FlowElement / FlowElementCollection inputs are
|
1308
|
+
automatically unwrapped to their underlying physical elements so that
|
1309
|
+
PageCollection can work with them directly.
|
1310
|
+
|
1311
|
+
Args:
|
1312
|
+
start_elements: Elements or selector string that mark the start of
|
1313
|
+
sections (optional).
|
1314
|
+
end_elements: Elements or selector string that mark the end of
|
1315
|
+
sections (optional).
|
1316
|
+
new_section_on_page_break: Whether to start a new section at page
|
1317
|
+
boundaries (default: False).
|
1318
|
+
include_boundaries: How to include boundary elements: 'start',
|
1319
|
+
'end', 'both', or 'none' (default: 'both').
|
1320
|
+
|
1321
|
+
Returns:
|
1322
|
+
ElementCollection of Region/FlowRegion objects representing the
|
1323
|
+
extracted sections.
|
1324
|
+
"""
|
1325
|
+
# ------------------------------------------------------------------
|
1326
|
+
# Unwrap FlowElement(-Collection) inputs and selector strings so we
|
1327
|
+
# can reason about them generically.
|
1328
|
+
# ------------------------------------------------------------------
|
1329
|
+
from natural_pdf.flows.collections import FlowElementCollection
|
1330
|
+
from natural_pdf.flows.element import FlowElement
|
1331
|
+
|
1332
|
+
def _unwrap(obj):
|
1333
|
+
"""Convert Flow-specific wrappers to their underlying physical objects.
|
1334
|
+
|
1335
|
+
Keeps selector strings as-is; converts FlowElement to its physical
|
1336
|
+
element; converts FlowElementCollection to list of physical
|
1337
|
+
elements; passes through ElementCollection by taking .elements.
|
1338
|
+
"""
|
1339
|
+
|
1340
|
+
if obj is None or isinstance(obj, str):
|
1341
|
+
return obj
|
1342
|
+
|
1343
|
+
if isinstance(obj, FlowElement):
|
1344
|
+
return obj.physical_object
|
1345
|
+
|
1346
|
+
if isinstance(obj, FlowElementCollection):
|
1347
|
+
return [fe.physical_object for fe in obj.flow_elements]
|
1348
|
+
|
1349
|
+
if hasattr(obj, "elements"):
|
1350
|
+
return obj.elements
|
1351
|
+
|
1352
|
+
if isinstance(obj, (list, tuple, set)):
|
1353
|
+
out = []
|
1354
|
+
for item in obj:
|
1355
|
+
if isinstance(item, FlowElement):
|
1356
|
+
out.append(item.physical_object)
|
1357
|
+
else:
|
1358
|
+
out.append(item)
|
1359
|
+
return out
|
1360
|
+
|
1361
|
+
return obj # Fallback – unknown type
|
1362
|
+
|
1363
|
+
start_elements_unwrapped = _unwrap(start_elements)
|
1364
|
+
end_elements_unwrapped = _unwrap(end_elements)
|
1365
|
+
|
1366
|
+
# ------------------------------------------------------------------
|
1367
|
+
# PRIMARY IMPLEMENTATION – operate on each Flow **segment region**
|
1368
|
+
# independently so that sectioning happens *per-region*, not per page.
|
1369
|
+
# ------------------------------------------------------------------
|
1370
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
1371
|
+
|
1372
|
+
aggregated_sections = []
|
1373
|
+
|
1374
|
+
# Helper to decide if an element lies inside a segment (Region)
|
1375
|
+
def _element_in_segment(elem, segment_region):
|
1376
|
+
try:
|
1377
|
+
return segment_region.intersects(elem) # Region method – robust
|
1378
|
+
except Exception:
|
1379
|
+
# Fallback to bounding-box containment checks
|
1380
|
+
if not hasattr(elem, "bbox"):
|
1381
|
+
return False
|
1382
|
+
ex0, etop, ex1, ebottom = elem.bbox
|
1383
|
+
sx0, stop, sx1, sbottom = segment_region.bbox
|
1384
|
+
return not (ex1 < sx0 or ex0 > sx1 or ebottom < stop or etop > sbottom)
|
1385
|
+
|
1386
|
+
for seg in self.segments:
|
1387
|
+
# Each *seg* is guaranteed to be a Region (see _normalize_segments)
|
1388
|
+
|
1389
|
+
# Resolve segment-specific boundary arguments
|
1390
|
+
seg_start_elems = None
|
1391
|
+
seg_end_elems = None
|
1392
|
+
|
1393
|
+
# --- Handle selector strings ---
|
1394
|
+
if isinstance(start_elements_unwrapped, str):
|
1395
|
+
seg_start_elems = seg.find_all(start_elements_unwrapped).elements
|
1396
|
+
elif start_elements_unwrapped is not None:
|
1397
|
+
seg_start_elems = [
|
1398
|
+
e for e in start_elements_unwrapped if _element_in_segment(e, seg)
|
1399
|
+
]
|
1400
|
+
|
1401
|
+
if isinstance(end_elements_unwrapped, str):
|
1402
|
+
seg_end_elems = seg.find_all(end_elements_unwrapped).elements
|
1403
|
+
elif end_elements_unwrapped is not None:
|
1404
|
+
seg_end_elems = [e for e in end_elements_unwrapped if _element_in_segment(e, seg)]
|
1405
|
+
|
1406
|
+
# Call Region.get_sections – this returns ElementCollection[Region]
|
1407
|
+
seg_sections = seg.get_sections(
|
1408
|
+
start_elements=seg_start_elems,
|
1409
|
+
end_elements=seg_end_elems,
|
1410
|
+
include_boundaries=include_boundaries,
|
1411
|
+
)
|
1412
|
+
|
1413
|
+
if seg_sections:
|
1414
|
+
aggregated_sections.extend(seg_sections.elements)
|
1415
|
+
|
1416
|
+
# Optionally, handle new_section_on_page_break – interpreted here as
|
1417
|
+
# *new_section_on_segment_break*: if True and there were *no* explicit
|
1418
|
+
# boundaries, treat the entire segment as a single section.
|
1419
|
+
if (
|
1420
|
+
new_section_on_page_break
|
1421
|
+
and not seg_sections
|
1422
|
+
and start_elements_unwrapped is None
|
1423
|
+
and end_elements_unwrapped is None
|
1424
|
+
):
|
1425
|
+
aggregated_sections.append(seg)
|
1426
|
+
|
+        # ------------------------------------------------------------------
+        # CROSS-SEGMENT SECTION DETECTION: Check if we have boundaries that
+        # span multiple segments and create FlowRegions for those cases.
+        # ------------------------------------------------------------------
+
+        # If we have explicit start/end elements, check for cross-segment sections
+        if start_elements_unwrapped is not None and end_elements_unwrapped is not None:
+            # Find all start and end elements across all segments
+            all_start_elements = []
+            all_end_elements = []
+
+            # Map elements to their segments for tracking
+            element_to_segment = {}
+
+            for seg_idx, seg in enumerate(self.segments):
+                if isinstance(start_elements_unwrapped, str):
+                    seg_starts = seg.find_all(start_elements_unwrapped).elements
+                else:
+                    seg_starts = [
+                        e for e in start_elements_unwrapped if _element_in_segment(e, seg)
+                    ]
+
+                if isinstance(end_elements_unwrapped, str):
+                    seg_ends = seg.find_all(end_elements_unwrapped).elements
+                else:
+                    seg_ends = [e for e in end_elements_unwrapped if _element_in_segment(e, seg)]
+
+                for elem in seg_starts:
+                    all_start_elements.append((elem, seg_idx))
+                    element_to_segment[id(elem)] = seg_idx
+
+                for elem in seg_ends:
+                    all_end_elements.append((elem, seg_idx))
+                    element_to_segment[id(elem)] = seg_idx
+
+            # Sort by segment index, then by position within segment
+            all_start_elements.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
+            all_end_elements.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
+
+            # Look for cross-segment pairs (start in one segment, end in another)
+            cross_segment_sections = []
+            used_starts = set()
+            used_ends = set()
+
+            for start_elem, start_seg_idx in all_start_elements:
+                if id(start_elem) in used_starts:
+                    continue
+
+                # Find the next end element that comes after this start
+                matching_end = None
+                for end_elem, end_seg_idx in all_end_elements:
+                    if id(end_elem) in used_ends:
+                        continue
+
+                    # Check if this end comes after the start (by segment order or position)
+                    if end_seg_idx > start_seg_idx or (
+                        end_seg_idx == start_seg_idx
+                        and (
+                            end_elem.top > start_elem.top
+                            or (end_elem.top == start_elem.top and end_elem.x0 >= start_elem.x0)
+                        )
+                    ):
+                        matching_end = (end_elem, end_seg_idx)
+                        break
+
+                if matching_end is not None:
+                    end_elem, end_seg_idx = matching_end
+
+                    # If start and end are in different segments, create FlowRegion
+                    if start_seg_idx != end_seg_idx:
+                        cross_segment_sections.append(
+                            (start_elem, start_seg_idx, end_elem, end_seg_idx)
+                        )
+                        used_starts.add(id(start_elem))
+                        used_ends.add(id(end_elem))
+
+            # Create FlowRegions for cross-segment sections
+            from natural_pdf.elements.region import Region
+            from natural_pdf.flows.element import FlowElement
+            from natural_pdf.flows.region import FlowRegion
+
+            for start_elem, start_seg_idx, end_elem, end_seg_idx in cross_segment_sections:
+                # Build constituent regions spanning from start segment to end segment
+                constituent_regions = []
+
+                # First segment: from start element to bottom
+                start_seg = self.segments[start_seg_idx]
+                first_region = Region(
+                    start_seg.page, (start_seg.x0, start_elem.top, start_seg.x1, start_seg.bottom)
+                )
+                constituent_regions.append(first_region)
+
+                # Middle segments: full segments
+                for seg_idx in range(start_seg_idx + 1, end_seg_idx):
+                    constituent_regions.append(self.segments[seg_idx])
+
+                # Last segment: from top to end element
+                if end_seg_idx != start_seg_idx:
+                    end_seg = self.segments[end_seg_idx]
+                    last_region = Region(
+                        end_seg.page, (end_seg.x0, end_seg.top, end_seg.x1, end_elem.bottom)
+                    )
+                    constituent_regions.append(last_region)
+
+                # Create FlowRegion
+                flow_element = FlowElement(physical_object=start_elem, flow=self)
+                flow_region = FlowRegion(
+                    flow=self,
+                    constituent_regions=constituent_regions,
+                    source_flow_element=flow_element,
+                    boundary_element_found=end_elem,
+                )
+
+                # Remove any single-segment sections that are now covered by this FlowRegion
+                # This prevents duplication of content
+                aggregated_sections = [
+                    s
+                    for s in aggregated_sections
+                    if not any(
+                        cr.intersects(s)
+                        for cr in constituent_regions
+                        if hasattr(cr, "intersects") and hasattr(s, "intersects")
+                    )
+                ]
+
+                aggregated_sections.append(flow_region)
+
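The constituent-region construction above follows a fixed recipe: slice the first segment from the start element down to the segment bottom, keep any intermediate segments whole, and slice the last segment from its top down to the end element. The same geometry in plain (x0, top, x1, bottom) tuples, with no natural-pdf objects involved (a self-contained sketch with illustrative coordinates):

    def slice_constituents(segments, start_seg_idx, start_top, end_seg_idx, end_bottom):
        parts = []
        sx0, stop, sx1, sbottom = segments[start_seg_idx]
        parts.append((sx0, start_top, sx1, sbottom))  # first: start element to bottom
        parts.extend(segments[start_seg_idx + 1 : end_seg_idx])  # middle: whole segments
        ex0, etop, ex1, ebottom = segments[end_seg_idx]
        parts.append((ex0, etop, ex1, end_bottom))  # last: segment top to end element
        return parts

    pages = [(0, 0, 612, 792), (0, 0, 612, 792)]  # two US-letter pages
    print(slice_constituents(pages, 0, 400.0, 1, 150.0))
    # [(0, 400.0, 612, 792), (0, 0, 612, 150.0)]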
+        # ------------------------------------------------------------------
+        # NEW APPROACH: First collect ALL boundary elements across all segments,
+        # then pair them up to create sections (either single-segment Regions
+        # or multi-segment FlowRegions).
+        # ------------------------------------------------------------------
+        from natural_pdf.elements.element_collection import ElementCollection
+        from natural_pdf.elements.region import Region
+        from natural_pdf.flows.element import FlowElement
+        from natural_pdf.flows.region import FlowRegion
+
+        # Helper to decide if an element lies inside a segment (Region)
+        def _element_in_segment(elem, segment_region):
+            try:
+                return segment_region.intersects(elem)  # Region method – robust
+            except Exception:
+                # Fallback to bounding-box containment checks
+                if not hasattr(elem, "bbox"):
+                    return False
+                ex0, etop, ex1, ebottom = elem.bbox
+                sx0, stop, sx1, sbottom = segment_region.bbox
+                return not (ex1 < sx0 or ex0 > sx1 or ebottom < stop or etop > sbottom)
+
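The except branch of _element_in_segment is the standard axis-aligned bounding-box overlap test: two boxes are disjoint exactly when one lies entirely to the left of, to the right of, above, or below the other, so negating that disjunction yields intersection. The same test standalone, using PDF-style coordinates where top < bottom:

    def bboxes_overlap(a, b):
        ax0, atop, ax1, abottom = a
        bx0, btop, bx1, bbottom = b
        # Disjoint iff a is fully left/right of b, or fully above/below it.
        return not (ax1 < bx0 or ax0 > bx1 or abottom < btop or atop > bbottom)

    assert bboxes_overlap((0, 0, 10, 10), (5, 5, 15, 15))       # partial overlap
    assert not bboxes_overlap((0, 0, 10, 10), (11, 0, 20, 10))  # fully to the right
    assert bboxes_overlap((0, 0, 10, 10), (10, 0, 20, 10))      # shared edge counts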
+        # Collect ALL boundary elements across all segments with their segment indices
+        all_start_elements = []
+        all_end_elements = []
+
+        for seg_idx, seg in enumerate(self.segments):
+            # Find start elements in this segment
+            if isinstance(start_elements_unwrapped, str):
+                seg_starts = seg.find_all(start_elements_unwrapped).elements
+            elif start_elements_unwrapped is not None:
+                seg_starts = [e for e in start_elements_unwrapped if _element_in_segment(e, seg)]
+            else:
+                seg_starts = []
+
+            logger.debug(f"\n=== Processing segment {seg_idx} ===")
+            logger.debug(f"Segment bbox: {seg.bbox}")
+            logger.debug(
+                f"Segment page: {seg.page.number if hasattr(seg.page, 'number') else 'unknown'}"
+            )
+
+            logger.debug(f"Found {len(seg_starts)} start elements in segment {seg_idx}")
+            for i, elem in enumerate(seg_starts):
+                logger.debug(
+                    f"  Start {i}: bbox={elem.bbox}, text='{getattr(elem, 'text', 'N/A')[:50]}...'"
+                )
+
+            # Find end elements in this segment
+            if isinstance(end_elements_unwrapped, str):
+                seg_ends = seg.find_all(end_elements_unwrapped).elements
+            elif end_elements_unwrapped is not None:
+                seg_ends = [e for e in end_elements_unwrapped if _element_in_segment(e, seg)]
+            else:
+                seg_ends = []
+
+            logger.debug(f"Found {len(seg_ends)} end elements in segment {seg_idx}")
+            for i, elem in enumerate(seg_ends):
+                logger.debug(
+                    f"  End {i}: bbox={elem.bbox}, text='{getattr(elem, 'text', 'N/A')[:50]}...'"
+                )
+
+            # Add to global lists with segment index
+            for elem in seg_starts:
+                all_start_elements.append((elem, seg_idx))
+            for elem in seg_ends:
+                all_end_elements.append((elem, seg_idx))
+
+        # Sort by flow order: segment index first, then position within segment
+        all_start_elements.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
+        all_end_elements.sort(key=lambda x: (x[1], x[0].top, x[0].x0))
+
+        logger.debug(f"\n=== Total boundary elements found ===")
+        logger.debug(f"Total start elements: {len(all_start_elements)}")
+        logger.debug(f"Total end elements: {len(all_end_elements)}")
+
+        # Pair up start and end elements to create sections
+        sections = []
+        used_starts = set()
+        used_ends = set()
+
+        for start_elem, start_seg_idx in all_start_elements:
+            if id(start_elem) in used_starts:
+                continue
+
+            logger.debug(f"\n--- Pairing start element from segment {start_seg_idx} ---")
+            logger.debug(
+                f"Start: bbox={start_elem.bbox}, text='{getattr(start_elem, 'text', 'N/A')[:30]}...'"
+            )
+
+            # Find the next unused end element that comes after this start
+            matching_end = None
+            for end_elem, end_seg_idx in all_end_elements:
+                if id(end_elem) in used_ends:
+                    continue
+
+                # Check if this end comes after the start in flow order
+                if end_seg_idx > start_seg_idx or (
+                    end_seg_idx == start_seg_idx
+                    and (
+                        end_elem.top > start_elem.top
+                        or (end_elem.top == start_elem.top and end_elem.x0 >= start_elem.x0)
+                    )
+                ):
+                    matching_end = (end_elem, end_seg_idx)
+                    break
+
+            if matching_end is not None:
+                end_elem, end_seg_idx = matching_end
+                used_starts.add(id(start_elem))
+                used_ends.add(id(end_elem))
+
+                logger.debug(f"  Matched! Start seg={start_seg_idx}, End seg={end_seg_idx}")
+
+                # Create section based on whether it spans segments
+                if start_seg_idx == end_seg_idx:
+                    # Single segment section - use Region.get_section_between
+                    seg = self.segments[start_seg_idx]
+                    section = seg.get_section_between(start_elem, end_elem, include_boundaries)
+                    sections.append(section)
+                    logger.debug(f"  Created single-segment Region")
+                else:
+                    # Multi-segment section - create FlowRegion
+                    logger.debug(
+                        f"  Creating multi-segment FlowRegion spanning segments {start_seg_idx} to {end_seg_idx}"
+                    )
+                    constituent_regions = []
+
+                    # First segment: from start element to bottom
+                    start_seg = self.segments[start_seg_idx]
+                    if include_boundaries in ["start", "both"]:
+                        first_top = start_elem.top
+                    else:
+                        first_top = start_elem.bottom
+                    first_region = Region(
+                        start_seg.page, (start_seg.x0, first_top, start_seg.x1, start_seg.bottom)
+                    )
+                    constituent_regions.append(first_region)
+
+                    # Middle segments: full segments
+                    for seg_idx in range(start_seg_idx + 1, end_seg_idx):
+                        constituent_regions.append(self.segments[seg_idx])
+
+                    # Last segment: from top to end element
+                    end_seg = self.segments[end_seg_idx]
+                    if include_boundaries in ["end", "both"]:
+                        last_bottom = end_elem.bottom
+                    else:
+                        last_bottom = end_elem.top
+                    last_region = Region(
+                        end_seg.page, (end_seg.x0, end_seg.top, end_seg.x1, last_bottom)
+                    )
+                    constituent_regions.append(last_region)
+
+                    # Create FlowRegion
+                    flow_element = FlowElement(physical_object=start_elem, flow=self)
+                    flow_region = FlowRegion(
+                        flow=self,
+                        constituent_regions=constituent_regions,
+                        source_flow_element=flow_element,
+                        boundary_element_found=end_elem,
+                    )
+                    sections.append(flow_region)
+
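The pairing loop above is a greedy first-fit matcher: starts are visited in flow order (segment index, then top, then x0), and each start claims the earliest unused end that does not precede it. Reduced to sortable key tuples, the same algorithm looks like this (a sketch for clarity, not the library's API):

    def pair_boundaries(starts, ends):
        # starts/ends are (segment_index, top, x0) tuples; tuple comparison
        # reproduces the flow-order test used in the loop above.
        starts, ends = sorted(starts), sorted(ends)
        used, pairs = set(), []
        for s in starts:
            for j, e in enumerate(ends):
                if j in used:
                    continue
                if e >= s:  # end is at or after the start in flow order
                    pairs.append((s, e))
                    used.add(j)
                    break
        return pairs

    starts = [(0, 100, 0), (1, 50, 0)]
    ends = [(0, 400, 0), (2, 300, 0)]
    print(pair_boundaries(starts, ends))
    # [((0, 100, 0), (0, 400, 0)), ((1, 50, 0), (2, 300, 0))]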
+        # Handle special cases when only start or only end elements are provided
+        if start_elements_unwrapped is not None and end_elements_unwrapped is None:
+            logger.debug(f"\n=== Handling start-only elements (no end elements provided) ===")
+            for i, (start_elem, start_seg_idx) in enumerate(all_start_elements):
+                if id(start_elem) in used_starts:
+                    continue
+
+                # Find next start element
+                next_start = None
+                if i + 1 < len(all_start_elements):
+                    next_start_elem, next_start_seg_idx = all_start_elements[i + 1]
+                    # Create section from this start to just before next start
+                    if start_seg_idx == next_start_seg_idx:
+                        # Same segment
+                        seg = self.segments[start_seg_idx]
+                        # Find element just before next start
+                        all_elems = seg.get_elements()
+                        all_elems.sort(key=lambda e: (e.top, e.x0))
+                        try:
+                            next_idx = all_elems.index(next_start_elem)
+                            if next_idx > 0:
+                                end_elem = all_elems[next_idx - 1]
+                                section = seg.get_section_between(
+                                    start_elem, end_elem, include_boundaries
+                                )
+                                sections.append(section)
+                        except ValueError:
+                            pass
+                    elif next_start_seg_idx == start_seg_idx + 1:
+                        # Next start is in the immediately following segment in the flow
+                        # Create a FlowRegion that spans from current start to just before next start
+                        logger.debug(f"  Next start is in next flow segment - creating FlowRegion")
+
+                        constituent_regions = []
+
+                        # First segment: from start element to bottom
+                        start_seg = self.segments[start_seg_idx]
+                        if include_boundaries in ["start", "both"]:
+                            first_top = start_elem.top
+                        else:
+                            first_top = start_elem.bottom
+                        first_region = Region(
+                            start_seg.page,
+                            (start_seg.x0, first_top, start_seg.x1, start_seg.bottom),
+                        )
+                        constituent_regions.append(first_region)
+
+                        # Next segment: from top to just before next start
+                        next_seg = self.segments[next_start_seg_idx]
+                        # Find element just before next start in the next segment
+                        next_seg_elems = next_seg.get_elements()
+                        next_seg_elems.sort(key=lambda e: (e.top, e.x0))
+
+                        last_bottom = next_start_elem.top  # Default to just before the next start
+                        try:
+                            next_idx = next_seg_elems.index(next_start_elem)
+                            if next_idx > 0:
+                                # Use the bottom of the element before next start
+                                prev_elem = next_seg_elems[next_idx - 1]
+                                last_bottom = prev_elem.bottom
+                        except ValueError:
+                            pass
+
+                        last_region = Region(
+                            next_seg.page, (next_seg.x0, next_seg.top, next_seg.x1, last_bottom)
+                        )
+                        constituent_regions.append(last_region)
+
+                        # Create FlowRegion
+                        flow_element = FlowElement(physical_object=start_elem, flow=self)
+                        flow_region = FlowRegion(
+                            flow=self,
+                            constituent_regions=constituent_regions,
+                            source_flow_element=flow_element,
+                            boundary_element_found=None,
+                        )
+                        sections.append(flow_region)
+                        logger.debug(
+                            f"  Created FlowRegion with {len(constituent_regions)} constituent regions"
+                        )
+                    else:
+                        # Next start is more than one segment away - just end at current segment
+                        start_seg = self.segments[start_seg_idx]
+                        if include_boundaries in ["start", "both"]:
+                            region_top = start_elem.top
+                        else:
+                            region_top = start_elem.bottom
+                        section = Region(
+                            start_seg.page,
+                            (start_seg.x0, region_top, start_seg.x1, start_seg.bottom),
+                        )
+                        sections.append(section)
+                        logger.debug(
+                            f"  Next start is {next_start_seg_idx - start_seg_idx} segments away - ending at current segment"
+                        )
+                else:
+                    # Last start element: section goes to end of flow
+                    # This could span multiple segments
+                    if start_seg_idx == len(self.segments) - 1:
+                        # Only in last segment
+                        seg = self.segments[start_seg_idx]
+                        if include_boundaries in ["start", "both"]:
+                            region_top = start_elem.top
+                        else:
+                            region_top = start_elem.bottom
+                        section = Region(seg.page, (seg.x0, region_top, seg.x1, seg.bottom))
+                        sections.append(section)
+                    else:
+                        # Spans to end of flow - create FlowRegion
+                        constituent_regions = []
+
+                        # First segment
+                        start_seg = self.segments[start_seg_idx]
+                        if include_boundaries in ["start", "both"]:
+                            first_top = start_elem.top
+                        else:
+                            first_top = start_elem.bottom
+                        first_region = Region(
+                            start_seg.page,
+                            (start_seg.x0, first_top, start_seg.x1, start_seg.bottom),
+                        )
+                        constituent_regions.append(first_region)
+
+                        # Remaining segments
+                        for seg_idx in range(start_seg_idx + 1, len(self.segments)):
+                            constituent_regions.append(self.segments[seg_idx])
+
+                        flow_element = FlowElement(physical_object=start_elem, flow=self)
+                        flow_region = FlowRegion(
+                            flow=self,
+                            constituent_regions=constituent_regions,
+                            source_flow_element=flow_element,
+                            boundary_element_found=None,
+                        )
+                        sections.append(flow_region)
+
+        # Handle new_section_on_page_break when no explicit boundaries
+        if (
+            new_section_on_page_break
+            and start_elements_unwrapped is None
+            and end_elements_unwrapped is None
+        ):
+            # Each segment becomes its own section
+            sections = list(self.segments)
+
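With new_section_on_page_break=True and no boundary elements, every segment of the flow simply becomes its own section. Reusing the flow object from the earlier sketch (the argument name comes from this diff):

    # One section per segment, typically one per page
    per_page = flow.get_sections(new_section_on_page_break=True)
    print(len(per_page))  # equals the number of segments in the flow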
+        # Sort sections by their position in the flow
+        def _section_sort_key(section):
+            if hasattr(section, "constituent_regions"):
+                # FlowRegion - use first constituent region
+                first_region = (
+                    section.constituent_regions[0] if section.constituent_regions else None
+                )
+                if first_region:
+                    # Find which segment this region belongs to
+                    for idx, seg in enumerate(self.segments):
+                        try:
+                            if seg.intersects(first_region):
+                                return (
+                                    idx,
+                                    getattr(first_region, "top", 0),
+                                    getattr(first_region, "x0", 0),
+                                )
+                        except:
+                            pass
+            else:
+                # Regular Region
+                for idx, seg in enumerate(self.segments):
+                    try:
+                        if seg.intersects(section):
+                            return (idx, getattr(section, "top", 0), getattr(section, "x0", 0))
+                    except:
+                        pass
+            return (float("inf"), 0, 0)
+
+        sections.sort(key=_section_sort_key)
+
+        logger.debug(f"\n=== Section creation complete ===")
+        logger.debug(f"Total sections created: {len(sections)}")
+        for i, section in enumerate(sections):
+            if hasattr(section, "constituent_regions"):
+                logger.debug(
+                    f"Section {i}: FlowRegion with {len(section.constituent_regions)} constituent regions"
+                )
+            else:
+                logger.debug(f"Section {i}: Region with bbox={section.bbox}")
+
+        return ElementCollection(sections)
+
+    def highlights(self, show: bool = False) -> "HighlightContext":
+        """
+        Create a highlight context for accumulating highlights.
+
+        This allows for clean syntax to show multiple highlight groups:
+
+        Example:
+            with flow.highlights() as h:
+                h.add(flow.find_all('table'), label='tables', color='blue')
+                h.add(flow.find_all('text:bold'), label='bold text', color='red')
+                h.show()
+
+        Or with automatic display:
+            with flow.highlights(show=True) as h:
+                h.add(flow.find_all('table'), label='tables')
+                h.add(flow.find_all('text:bold'), label='bold')
+                # Automatically shows when exiting the context
+
+        Args:
+            show: If True, automatically show highlights when exiting context
+
+        Returns:
+            HighlightContext for accumulating highlights
+        """
+        from natural_pdf.core.highlighting_service import HighlightContext
+
+        return HighlightContext(self, show_on_exit=show)