natural-pdf 0.1.40__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +6 -7
- natural_pdf/analyzers/__init__.py +6 -1
- natural_pdf/analyzers/guides.py +354 -258
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +18 -4
- natural_pdf/analyzers/layout/paddle.py +11 -0
- natural_pdf/analyzers/layout/surya.py +2 -3
- natural_pdf/analyzers/shape_detection_mixin.py +25 -34
- natural_pdf/analyzers/text_structure.py +2 -2
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/collections/mixins.py +3 -2
- natural_pdf/core/highlighting_service.py +743 -32
- natural_pdf/core/page.py +236 -383
- natural_pdf/core/page_collection.py +1249 -0
- natural_pdf/core/pdf.py +172 -83
- natural_pdf/{collections → core}/pdf_collection.py +18 -11
- natural_pdf/core/render_spec.py +335 -0
- natural_pdf/describe/base.py +1 -1
- natural_pdf/elements/__init__.py +1 -0
- natural_pdf/elements/base.py +108 -83
- natural_pdf/elements/{collections.py → element_collection.py} +566 -1487
- natural_pdf/elements/line.py +0 -1
- natural_pdf/elements/rect.py +0 -1
- natural_pdf/elements/region.py +318 -243
- natural_pdf/elements/text.py +9 -7
- natural_pdf/exporters/base.py +2 -2
- natural_pdf/exporters/original_pdf.py +1 -1
- natural_pdf/exporters/paddleocr.py +2 -4
- natural_pdf/exporters/searchable_pdf.py +3 -2
- natural_pdf/extraction/mixin.py +1 -3
- natural_pdf/flows/collections.py +1 -69
- natural_pdf/flows/element.py +4 -4
- natural_pdf/flows/flow.py +1200 -243
- natural_pdf/flows/region.py +707 -261
- natural_pdf/ocr/ocr_options.py +0 -2
- natural_pdf/ocr/utils.py +2 -1
- natural_pdf/qa/document_qa.py +21 -5
- natural_pdf/search/search_service_protocol.py +1 -1
- natural_pdf/selectors/parser.py +2 -2
- natural_pdf/tables/result.py +35 -1
- natural_pdf/text_mixin.py +7 -3
- natural_pdf/utils/debug.py +2 -1
- natural_pdf/utils/highlighting.py +1 -0
- natural_pdf/utils/layout.py +2 -2
- natural_pdf/utils/packaging.py +4 -3
- natural_pdf/utils/text_extraction.py +15 -12
- natural_pdf/utils/visualization.py +385 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -53
- optimization/memory_comparison.py +1 -1
- optimization/pdf_analyzer.py +2 -2
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
natural_pdf/flows/region.py
CHANGED
```diff
@@ -1,11 +1,13 @@
 import logging
-
+import warnings
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Tuple, Union

 from pdfplumber.utils.geometry import merge_bboxes  # Import merge_bboxes directly

 # For runtime image manipulation
 from PIL import Image as PIL_Image_Runtime

+from natural_pdf.core.render_spec import RenderSpec, Visualizable
 from natural_pdf.tables import TableResult

 if TYPE_CHECKING:
@@ -13,7 +15,7 @@ if TYPE_CHECKING:

     from natural_pdf.core.page import Page as PhysicalPage
     from natural_pdf.elements.base import Element as PhysicalElement
-    from natural_pdf.elements.
+    from natural_pdf.elements.element_collection import ElementCollection
     from natural_pdf.elements.region import Region as PhysicalRegion

     from .element import FlowElement
@@ -22,7 +24,7 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)


-class FlowRegion:
+class FlowRegion(Visualizable):
     """
     Represents a selected area within a Flow, potentially composed of multiple
     physical Region objects (constituent_regions) that might span across
@@ -65,17 +67,156 @@ class FlowRegion:
         self._cached_elements: Optional["ElementCollection"] = None  # Stringized
         self._cached_bbox: Optional[Tuple[float, float, float, float]] = None

+    def _get_highlighter(self):
+        """Get the highlighting service from constituent regions."""
+        if not self.constituent_regions:
+            raise RuntimeError("FlowRegion has no constituent regions to get highlighter from")
+
+        # Get highlighter from first constituent region
+        first_region = self.constituent_regions[0]
+        if hasattr(first_region, "_highlighter"):
+            return first_region._highlighter
+        elif hasattr(first_region, "page") and hasattr(first_region.page, "_highlighter"):
+            return first_region.page._highlighter
+        else:
+            raise RuntimeError(
+                f"Cannot find HighlightingService from FlowRegion constituent regions. "
+                f"First region type: {type(first_region).__name__}"
+            )
+
+    def _get_render_specs(
+        self,
+        mode: Literal["show", "render"] = "show",
+        color: Optional[Union[str, Tuple[int, int, int]]] = None,
+        highlights: Optional[List[Dict[str, Any]]] = None,
+        crop: Union[bool, Literal["content"]] = False,
+        crop_bbox: Optional[Tuple[float, float, float, float]] = None,
+        **kwargs,
+    ) -> List[RenderSpec]:
+        """Get render specifications for this flow region.
+
+        Args:
+            mode: Rendering mode - 'show' includes highlights, 'render' is clean
+            color: Color for highlighting this region in show mode
+            highlights: Additional highlight groups to show
+            crop: Whether to crop to constituent regions
+            crop_bbox: Explicit crop bounds
+            **kwargs: Additional parameters
+
+        Returns:
+            List of RenderSpec objects, one per page with constituent regions
+        """
+        if not self.constituent_regions:
+            return []
+
+        # Group constituent regions by page
+        regions_by_page = {}
+        for region in self.constituent_regions:
+            if hasattr(region, "page") and region.page:
+                page = region.page
+                if page not in regions_by_page:
+                    regions_by_page[page] = []
+                regions_by_page[page].append(region)
+
+        if not regions_by_page:
+            return []
+
+        # Create RenderSpec for each page
+        specs = []
+        for page, page_regions in regions_by_page.items():
+            spec = RenderSpec(page=page)
+
+            # Handle cropping
+            if crop_bbox:
+                spec.crop_bbox = crop_bbox
+            elif crop == "content" or crop is True:
+                # Calculate bounds of regions on this page
+                x_coords = []
+                y_coords = []
+                for region in page_regions:
+                    if hasattr(region, "bbox") and region.bbox:
+                        x0, y0, x1, y1 = region.bbox
+                        x_coords.extend([x0, x1])
+                        y_coords.extend([y0, y1])
+
+                if x_coords and y_coords:
+                    spec.crop_bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
+
+            # Add highlights in show mode
+            if mode == "show":
+                # Highlight constituent regions
+                for i, region in enumerate(page_regions):
+                    # Label each part if multiple regions
+                    label = None
+                    if len(self.constituent_regions) > 1:
+                        # Find global index
+                        try:
+                            global_idx = self.constituent_regions.index(region)
+                            label = f"FlowPart_{global_idx + 1}"
+                        except ValueError:
+                            label = f"FlowPart_{i + 1}"
+                    else:
+                        label = "FlowRegion"
+
+                    spec.add_highlight(
+                        bbox=region.bbox,
+                        polygon=region.polygon if region.has_polygon else None,
+                        color=color or "fuchsia",
+                        label=label,
+                    )
+
+                # Add additional highlight groups if provided
+                if highlights:
+                    for group in highlights:
+                        group_elements = group.get("elements", [])
+                        group_color = group.get("color", color)
+                        group_label = group.get("label")
+
+                        for elem in group_elements:
+                            # Only add if element is on this page
+                            if hasattr(elem, "page") and elem.page == page:
+                                spec.add_highlight(
+                                    element=elem, color=group_color, label=group_label
+                                )
+
+            specs.append(spec)
+
+        return specs
+
     def __getattr__(self, name: str) -> Any:
         """
-        Dynamically proxy attribute access to the source FlowElement
-
+        Dynamically proxy attribute access to the source FlowElement for safe attributes only.
+        Spatial methods (above, below, left, right) are explicitly implemented to prevent
+        silent failures and incorrect behavior.
         """
         if name in self.__dict__:
             return self.__dict__[name]
-
-
-
-
+
+        # List of methods that should NOT be proxied - they need proper FlowRegion implementation
+        spatial_methods = {"above", "below", "left", "right", "to_region"}
+
+        if name in spatial_methods:
+            raise AttributeError(
+                f"'{self.__class__.__name__}' object has no attribute '{name}'. "
+                f"This method requires proper FlowRegion implementation to handle spatial relationships correctly."
+            )
+
+        # Only proxy safe attributes and methods
+        if self.source_flow_element is not None:
+            try:
+                attr = getattr(self.source_flow_element, name)
+                # Only proxy non-callable attributes and explicitly safe methods
+                if not callable(attr) or name in {"page", "document"}:  # Add safe methods as needed
+                    return attr
+                else:
+                    raise AttributeError(
+                        f"Method '{name}' cannot be safely proxied from FlowElement to FlowRegion. "
+                        f"It may need explicit implementation."
+                    )
+            except AttributeError:
+                pass
+
+        raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")

     @property
     def bbox(self) -> Optional[Tuple[float, float, float, float]]:
```
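`FlowRegion` now subclasses `Visualizable` and implements `_get_render_specs`, which groups the constituent regions by page, optionally computes a per-page crop box, and attaches one labelled highlight per part. The standalone sketch below (not part of the package) re-creates that grouping and the `crop="content"` bound calculation with stand-in `FakePage`/`FakeRegion` types, to make the per-page behaviour concrete.

```python
# Illustrative sketch of the per-page grouping used by _get_render_specs above.
# FakePage and FakeRegion are stand-ins for this example, not natural-pdf classes.
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List, Tuple


@dataclass(frozen=True)
class FakePage:
    index: int


@dataclass
class FakeRegion:
    page: FakePage
    bbox: Tuple[float, float, float, float]  # (x0, top, x1, bottom)


def group_and_crop(regions: List[FakeRegion]) -> Dict[FakePage, Tuple[float, float, float, float]]:
    """Group regions by page and compute the union bbox per page (the crop='content' case)."""
    by_page: Dict[FakePage, List[FakeRegion]] = defaultdict(list)
    for region in regions:
        by_page[region.page].append(region)

    crops = {}
    for page, page_regions in by_page.items():
        xs = [c for r in page_regions for c in (r.bbox[0], r.bbox[2])]
        ys = [c for r in page_regions for c in (r.bbox[1], r.bbox[3])]
        crops[page] = (min(xs), min(ys), max(xs), max(ys))
    return crops


parts = [
    FakeRegion(FakePage(0), (50, 400, 550, 750)),  # tail of the flow on page 1
    FakeRegion(FakePage(1), (50, 50, 550, 300)),   # head of the flow on page 2
]
print(group_and_crop(parts))
# {FakePage(index=0): (50, 400, 550, 750), FakePage(index=1): (50, 50, 550, 300)}
```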
```diff
@@ -90,10 +231,12 @@ class FlowRegion:

         # Use merge_bboxes from pdfplumber.utils.geometry to merge bboxes
         # Extract bbox tuples from regions first
-        region_bboxes = [
+        region_bboxes = [
+            region.bbox for region in self.constituent_regions if hasattr(region, "bbox")
+        ]
         if not region_bboxes:
             return None
-
+
         self._cached_bbox = merge_bboxes(region_bboxes)
         return self._cached_bbox

```
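The cached `bbox` property collects each constituent region's bbox and delegates the union to pdfplumber's `merge_bboxes`, imported at the top of this module. A minimal sketch with made-up coordinates:

```python
# Minimal sketch: union of constituent bboxes, as FlowRegion.bbox does.
# Coordinates are made up; the constituent parts may live on different pages,
# so the merged bbox is only meaningful within each page's coordinate space.
from pdfplumber.utils.geometry import merge_bboxes

region_bboxes = [
    (50, 400, 550, 750),  # part on page 1
    (50, 50, 550, 300),   # part on page 2 (page-local coordinates)
]
print(merge_bboxes(region_bboxes))  # (50, 50, 550, 750)
```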
```diff
@@ -171,7 +314,7 @@ class FlowRegion:
         Returns:
             An ElementCollection containing all unique elements.
         """
-        from natural_pdf.elements.
+        from natural_pdf.elements.element_collection import (
             ElementCollection as RuntimeElementCollection,  # Local import
         )

@@ -257,7 +400,7 @@ class FlowRegion:
         chains each region's native ``find_all`` call and concatenates their
         results into a single ElementCollection while preserving flow order.
         """
-        from natural_pdf.elements.
+        from natural_pdf.elements.element_collection import (
             ElementCollection as RuntimeElementCollection,
         )

@@ -268,9 +411,7 @@ class FlowRegion:

         for region in self.constituent_regions:
             try:
-                region_matches = region.find_all(
-                    selector=selector, text=text, **kwargs
-                )
+                region_matches = region.find_all(selector=selector, text=text, **kwargs)
                 if region_matches:
                     # ``region_matches`` is an ElementCollection – extend with its
                     # underlying list so we don't create nested collections.
@@ -312,200 +453,33 @@ class FlowRegion:
             region.highlight(label=current_label, color=color, **kwargs)
         return self

-    def show
-        self,
-        resolution: Optional[float] = None,
-        labels: bool = True,
-        legend_position: str = "right",
-        color: Optional[Union[Tuple, str]] = "fuchsia",
-        label_prefix: Optional[str] = "FlowPart",
-        width: Optional[int] = None,
-        stack_direction: str = "vertical",
-        stack_gap: int = 5,
-        stack_background_color: Tuple[int, int, int] = (255, 255, 255),
-        crop: bool = False,
-        **kwargs,
-    ) -> Optional["PIL_Image"]:
+    def highlights(self, show: bool = False) -> "HighlightContext":
         """
-
-        If multiple pages are involved, they are stacked into a single image.
+        Create a highlight context for accumulating highlights.

-
-            resolution: Resolution in DPI for page rendering. If None, uses global setting or defaults to 144 DPI.
-            labels: Whether to include a legend for highlights.
-            legend_position: Position of the legend ('right', 'bottom', 'top', 'left').
-            color: Color for highlighting the constituent regions.
-            label_prefix: Prefix for region labels (e.g., 'FlowPart').
-            width: Optional width for the output image (overrides resolution).
-            stack_direction: Direction to stack multiple pages ('vertical' or 'horizontal').
-            stack_gap: Gap in pixels between stacked pages.
-            stack_background_color: RGB background color for the stacked image.
-            crop: If True, crop each rendered page to the bounding box of constituent regions on that page.
-            **kwargs: Additional arguments passed to the underlying rendering methods.
+        This allows for clean syntax to show multiple highlight groups:

-
-
-
-
-
-                return None
+        Example:
+            with flow_region.highlights() as h:
+                h.add(flow_region.find_all('table'), label='tables', color='blue')
+                h.add(flow_region.find_all('text:bold'), label='bold text', color='red')
+                h.show()

-
-
-
-
-
-                    regions_by_page[region.page] = []
-                regions_by_page[region.page].append(region)
-            else:
-                raise ValueError(f"Constituent region {region.bbox} has no page.")
-
-        if not regions_by_page:
-            logger.info("FlowRegion.show() found no constituent regions with associated pages.")
-            return None
-
-        # 2. Get a highlighter service (e.g., from the first page involved)
-        first_page_with_regions = next(iter(regions_by_page.keys()), None)
-        highlighter_service = None
-        if first_page_with_regions and hasattr(first_page_with_regions, "_highlighter"):
-            highlighter_service = first_page_with_regions._highlighter
+        Or with automatic display:
+            with flow_region.highlights(show=True) as h:
+                h.add(flow_region.find_all('table'), label='tables')
+                h.add(flow_region.find_all('text:bold'), label='bold')
+                # Automatically shows when exiting the context

-
-
-                "Cannot get highlighter service for FlowRegion.show(). "
-                "Ensure constituent regions' pages are initialized with a highlighter."
-            )
-
-        output_page_images: List["PIL_Image_Runtime"] = []
-
-        # Sort pages by index for consistent output order
-        sorted_pages = sorted(
-            regions_by_page.keys(),
-            key=lambda p: p.index if hasattr(p, "index") else getattr(p, "page_number", 0),
-        )
-
-        # 3. Render each page with its relevant constituent regions highlighted
-        for page_idx, page_obj in enumerate(sorted_pages):
-            constituent_regions_on_this_page = regions_by_page[page_obj]
-            if not constituent_regions_on_this_page:
-                continue
-
-            temp_highlights_for_page = []
-            for i, region_part in enumerate(constituent_regions_on_this_page):
-                part_label = None
-                if labels and label_prefix:  # Ensure labels is True for label_prefix to apply
-                    # If FlowRegion consists of multiple parts on this page, or overall
-                    count_indicator = ""
-                    if (
-                        len(self.constituent_regions) > 1
-                    ):  # If flow region has multiple parts overall
-                        # Find global index of this region_part in self.constituent_regions
-                        try:
-                            global_idx = self.constituent_regions.index(region_part)
-                            count_indicator = f"_{global_idx + 1}"
-                        except ValueError:  # Should not happen if region_part is from the list
-                            count_indicator = f"_p{page_idx}i{i+1}"  # fallback local index
-                    elif (
-                        len(constituent_regions_on_this_page) > 1
-                    ):  # If multiple parts on *this* page, but FR is single part overall
-                        count_indicator = f"_{i+1}"
-
-                    part_label = f"{label_prefix}{count_indicator}" if label_prefix else None
-
-                temp_highlights_for_page.append(
-                    {
-                        "page_index": (
-                            page_obj.index
-                            if hasattr(page_obj, "index")
-                            else getattr(page_obj, "page_number", 1) - 1
-                        ),
-                        "bbox": region_part.bbox,
-                        "polygon": region_part.polygon if region_part.has_polygon else None,
-                        "color": color,  # Use the passed color
-                        "label": part_label,
-                        "use_color_cycling": False,  # Keep specific color
-                    }
-                )
-
-            if not temp_highlights_for_page:
-                continue
-
-            # Calculate crop bbox if cropping is enabled
-            crop_bbox = None
-            if crop and constituent_regions_on_this_page:
-                # Calculate the bounding box that encompasses all constituent regions on this page
-                min_x0 = min(region.bbox[0] for region in constituent_regions_on_this_page)
-                min_y0 = min(region.bbox[1] for region in constituent_regions_on_this_page)
-                max_x1 = max(region.bbox[2] for region in constituent_regions_on_this_page)
-                max_y1 = max(region.bbox[3] for region in constituent_regions_on_this_page)
-                crop_bbox = (min_x0, min_y0, max_x1, max_y1)
-
-            page_image = highlighter_service.render_preview(
-                page_index=(
-                    page_obj.index
-                    if hasattr(page_obj, "index")
-                    else getattr(page_obj, "page_number", 1) - 1
-                ),
-                temporary_highlights=temp_highlights_for_page,
-                resolution=resolution,
-                width=width,
-                labels=labels,  # Pass through labels
-                legend_position=legend_position,
-                crop_bbox=crop_bbox,
-                **kwargs,
-            )
-            if page_image:
-                output_page_images.append(page_image)
-
-        # 4. Stack the generated page images if multiple
-        if not output_page_images:
-            logger.info("FlowRegion.show() produced no page images to concatenate.")
-            return None
-
-        if len(output_page_images) == 1:
-            return output_page_images[0]
-
-        # Stacking logic (same as in FlowRegionCollection.show)
-        if stack_direction == "vertical":
-            final_width = max(img.width for img in output_page_images)
-            final_height = (
-                sum(img.height for img in output_page_images)
-                + (len(output_page_images) - 1) * stack_gap
-            )
-            if final_width == 0 or final_height == 0:
-                raise ValueError("Cannot create concatenated image with zero width or height.")
+        Args:
+            show: If True, automatically show highlights when exiting context

-
-
-
-
-            for img in output_page_images:
-                paste_x = (final_width - img.width) // 2
-                concatenated_image.paste(img, (paste_x, current_y))
-                current_y += img.height + stack_gap
-            return concatenated_image
-        elif stack_direction == "horizontal":
-            final_width = (
-                sum(img.width for img in output_page_images)
-                + (len(output_page_images) - 1) * stack_gap
-            )
-            final_height = max(img.height for img in output_page_images)
-            if final_width == 0 or final_height == 0:
-                raise ValueError("Cannot create concatenated image with zero width or height.")
+        Returns:
+            HighlightContext for accumulating highlights
+        """
+        from natural_pdf.core.highlighting_service import HighlightContext

-
-                "RGB", (final_width, final_height), stack_background_color
-            )
-            current_x = 0
-            for img in output_page_images:
-                paste_y = (final_height - img.height) // 2
-                concatenated_image.paste(img, (current_x, paste_y))
-                current_x += img.width + stack_gap
-            return concatenated_image
-        else:
-            raise ValueError(
-                f"Invalid stack_direction '{stack_direction}' for FlowRegion.show(). Must be 'vertical' or 'horizontal'."
-            )
+        return HighlightContext(self, show_on_exit=show)

     def to_images(
         self,
@@ -523,9 +497,8 @@ class FlowRegion:
         cropped_images: List["PIL_Image"] = []
         for region_part in self.constituent_regions:
             try:
-
-
-                )
+                # Use render() for clean image without highlights
+                img = region_part.render(resolution=resolution, crop=True, **kwargs)
                 if img:
                     cropped_images.append(img)
             except Exception as e:
@@ -536,73 +509,424 @@ class FlowRegion:

         return cropped_images

-    def
+    def __repr__(self) -> str:
+        return (
+            f"<FlowRegion constituents={len(self.constituent_regions)}, flow={self.flow}, "
+            f"source_bbox={self.source_flow_element.bbox if self.source_flow_element else 'N/A'}>"
+        )
+
+    def expand(
+        self,
+        left: float = 0,
+        right: float = 0,
+        top: float = 0,
+        bottom: float = 0,
+        width_factor: float = 1.0,
+        height_factor: float = 1.0,
+    ) -> "FlowRegion":
         """
-
-        Stacking direction is based on the Flow's arrangement.
-        Individual region images are obtained by calling to_images(**kwargs).
+        Create a new FlowRegion with all constituent regions expanded.

         Args:
-
-
-
+            left: Amount to expand left edge (positive value expands leftwards)
+            right: Amount to expand right edge (positive value expands rightwards)
+            top: Amount to expand top edge (positive value expands upwards)
+            bottom: Amount to expand bottom edge (positive value expands downwards)
+            width_factor: Factor to multiply width by (applied after absolute expansion)
+            height_factor: Factor to multiply height by (applied after absolute expansion)

         Returns:
-
+            New FlowRegion with expanded constituent regions
         """
-
-
-
-
-
-
+        if not self.constituent_regions:
+            return FlowRegion(
+                flow=self.flow,
+                constituent_regions=[],
+                source_flow_element=self.source_flow_element,
+                boundary_element_found=self.boundary_element_found,
+            )
+
+        expanded_regions = []
+        for idx, region in enumerate(self.constituent_regions):
+            # Determine which adjustments to apply based on flow arrangement
+            apply_left = left
+            apply_right = right
+            apply_top = top
+            apply_bottom = bottom
+
+            if self.flow.arrangement == "vertical":
+                # In a vertical flow, only the *first* region should react to `top`
+                # and only the *last* region should react to `bottom`. This keeps
+                # the virtual contiguous area intact while allowing users to nudge
+                # the flow boundaries.
+                if idx != 0:
+                    apply_top = 0
+                if idx != len(self.constituent_regions) - 1:
+                    apply_bottom = 0
+                # left/right apply to every region (same column width change)
+            else:  # horizontal flow
+                # In a horizontal flow, only the first region reacts to `left`
+                # and only the last region reacts to `right`.
+                if idx != 0:
+                    apply_left = 0
+                if idx != len(self.constituent_regions) - 1:
+                    apply_right = 0
+                # top/bottom apply to every region in horizontal flows
+
+            # Skip no-op expansion to avoid extra Region objects
+            needs_expansion = (
+                any(
+                    v not in (0, 1.0)  # compare width/height factor logically later
+                    for v in (apply_left, apply_right, apply_top, apply_bottom)
+                )
+                or width_factor != 1.0
+                or height_factor != 1.0
+            )
+
+            try:
+                expanded_region = (
+                    region.expand(
+                        left=apply_left,
+                        right=apply_right,
+                        top=apply_top,
+                        bottom=apply_bottom,
+                        width_factor=width_factor,
+                        height_factor=height_factor,
+                    )
+                    if needs_expansion
+                    else region
+                )
+                expanded_regions.append(expanded_region)
+            except Exception as e:
+                logger.warning(
+                    f"FlowRegion.expand: Error expanding constituent region {region.bbox}: {e}",
+                    exc_info=False,
+                )
+                expanded_regions.append(region)
+
+        # Create new FlowRegion with expanded constituent regions
+        new_flow_region = FlowRegion(
+            flow=self.flow,
+            constituent_regions=expanded_regions,
+            source_flow_element=self.source_flow_element,
+            boundary_element_found=self.boundary_element_found,
+        )
+
+        # Copy metadata
+        new_flow_region.source = self.source
+        new_flow_region.region_type = self.region_type
+        new_flow_region.metadata = self.metadata.copy()
+
+        # Clear caches since the regions have changed
+        new_flow_region._cached_text = None
+        new_flow_region._cached_elements = None
+        new_flow_region._cached_bbox = None
+
+        return new_flow_region
+
+    def above(
+        self,
+        height: Optional[float] = None,
+        width: str = "full",
+        include_source: bool = False,
+        until: Optional[str] = None,
+        include_endpoint: bool = True,
+        **kwargs,
+    ) -> "FlowRegion":
+        """
+        Create a FlowRegion with regions above this FlowRegion.
+
+        For vertical flows: Only expands the topmost constituent region upward.
+        For horizontal flows: Expands all constituent regions upward.
+
+        Args:
+            height: Height of the region above, in points
+            width: Width mode - "full" for full page width or "element" for element width
+            include_source: Whether to include this FlowRegion in the result
+            until: Optional selector string to specify an upper boundary element
+            include_endpoint: Whether to include the boundary element in the region
+            **kwargs: Additional parameters
+
+        Returns:
+            New FlowRegion with regions above
+        """
+        if not self.constituent_regions:
+            return FlowRegion(
+                flow=self.flow,
+                constituent_regions=[],
+                source_flow_element=self.source_flow_element,
+                boundary_element_found=self.boundary_element_found,
+            )
+
+        new_regions = []

         if self.flow.arrangement == "vertical":
-            #
-
-
-
-
-
-
-
+            # For vertical flow, use FLOW ORDER (index 0 is earliest). Only expand the
+            # first constituent region in that order.
+            first_region = self.constituent_regions[0]
+            for idx, region in enumerate(self.constituent_regions):
+                if idx == 0:  # Only expand the first region (earliest in flow)
+                    above_region = region.above(
+                        height=height,
+                        width="element",  # Keep original column width
+                        include_source=include_source,
+                        until=until,
+                        include_endpoint=include_endpoint,
+                        **kwargs,
+                    )
+                    new_regions.append(above_region)
+                elif include_source:
+                    new_regions.append(region)
+        else:  # horizontal flow
+            # For horizontal flow, expand all regions upward
+            for region in self.constituent_regions:
+                above_region = region.above(
+                    height=height,
+                    width=width,
+                    include_source=include_source,
+                    until=until,
+                    include_endpoint=include_endpoint,
+                    **kwargs,
+                )
+                new_regions.append(above_region)
+
+        return FlowRegion(
+            flow=self.flow,
+            constituent_regions=new_regions,
+            source_flow_element=self.source_flow_element,
+            boundary_element_found=self.boundary_element_found,
+        )
+
+    def below(
+        self,
+        height: Optional[float] = None,
+        width: str = "full",
+        include_source: bool = False,
+        until: Optional[str] = None,
+        include_endpoint: bool = True,
+        **kwargs,
+    ) -> "FlowRegion":
+        """
+        Create a FlowRegion with regions below this FlowRegion.
+
+        For vertical flows: Only expands the bottommost constituent region downward.
+        For horizontal flows: Expands all constituent regions downward.
+
+        Args:
+            height: Height of the region below, in points
+            width: Width mode - "full" for full page width or "element" for element width
+            include_source: Whether to include this FlowRegion in the result
+            until: Optional selector string to specify a lower boundary element
+            include_endpoint: Whether to include the boundary element in the region
+            **kwargs: Additional parameters
+
+        Returns:
+            New FlowRegion with regions below
+        """
+        if not self.constituent_regions:
+            return FlowRegion(
+                flow=self.flow,
+                constituent_regions=[],
+                source_flow_element=self.source_flow_element,
+                boundary_element_found=self.boundary_element_found,
             )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        new_regions = []
+
+        if self.flow.arrangement == "vertical":
+            # For vertical flow, expand only the LAST constituent region in flow order.
+            last_idx = len(self.constituent_regions) - 1
+            for idx, region in enumerate(self.constituent_regions):
+                if idx == last_idx:
+                    below_region = region.below(
+                        height=height,
+                        width="element",
+                        include_source=include_source,
+                        until=until,
+                        include_endpoint=include_endpoint,
+                        **kwargs,
+                    )
+                    new_regions.append(below_region)
+                elif include_source:
+                    new_regions.append(region)
+        else:  # horizontal flow
+            # For horizontal flow, expand all regions downward
+            for region in self.constituent_regions:
+                below_region = region.below(
+                    height=height,
+                    width=width,
+                    include_source=include_source,
+                    until=until,
+                    include_endpoint=include_endpoint,
+                    **kwargs,
+                )
+                new_regions.append(below_region)
+
+        return FlowRegion(
+            flow=self.flow,
+            constituent_regions=new_regions,
+            source_flow_element=self.source_flow_element,
+            boundary_element_found=self.boundary_element_found,
+        )
+
+    def left(
+        self,
+        width: Optional[float] = None,
+        height: str = "full",
+        include_source: bool = False,
+        until: Optional[str] = None,
+        include_endpoint: bool = True,
+        **kwargs,
+    ) -> "FlowRegion":
+        """
+        Create a FlowRegion with regions to the left of this FlowRegion.
+
+        For vertical flows: Expands all constituent regions leftward.
+        For horizontal flows: Only expands the leftmost constituent region leftward.
+
+        Args:
+            width: Width of the region to the left, in points
+            height: Height mode - "full" for full page height or "element" for element height
+            include_source: Whether to include this FlowRegion in the result
+            until: Optional selector string to specify a left boundary element
+            include_endpoint: Whether to include the boundary element in the region
+            **kwargs: Additional parameters
+
+        Returns:
+            New FlowRegion with regions to the left
+        """
+        if not self.constituent_regions:
+            return FlowRegion(
+                flow=self.flow,
+                constituent_regions=[],
+                source_flow_element=self.source_flow_element,
+                boundary_element_found=self.boundary_element_found,
             )
-
-
-
-
-
-
-
-
-
-
+
+        new_regions = []
+
+        if self.flow.arrangement == "vertical":
+            # For vertical flow, expand all regions leftward
+            for region in self.constituent_regions:
+                left_region = region.left(
+                    width=width,
+                    height="element",
+                    include_source=include_source,
+                    until=until,
+                    include_endpoint=include_endpoint,
+                    **kwargs,
+                )
+                new_regions.append(left_region)
+        else:  # horizontal flow
+            # For horizontal flow, only expand the leftmost region leftward
+            leftmost_region = min(self.constituent_regions, key=lambda r: r.x0)
+            for region in self.constituent_regions:
+                if region == leftmost_region:
+                    # Expand this region leftward
+                    left_region = region.left(
+                        width=width,
+                        height="element",
+                        include_source=include_source,
+                        until=until,
+                        include_endpoint=include_endpoint,
+                        **kwargs,
+                    )
+                    new_regions.append(left_region)
+                elif include_source:
+                    # Include other regions unchanged if include_source is True
+                    new_regions.append(region)
+
+        return FlowRegion(
+            flow=self.flow,
+            constituent_regions=new_regions,
+            source_flow_element=self.source_flow_element,
+            boundary_element_found=self.boundary_element_found,
+        )
+
+    def right(
+        self,
+        width: Optional[float] = None,
+        height: str = "full",
+        include_source: bool = False,
+        until: Optional[str] = None,
+        include_endpoint: bool = True,
+        **kwargs,
+    ) -> "FlowRegion":
+        """
+        Create a FlowRegion with regions to the right of this FlowRegion.
+
+        For vertical flows: Expands all constituent regions rightward.
+        For horizontal flows: Only expands the rightmost constituent region rightward.
+
+        Args:
+            width: Width of the region to the right, in points
+            height: Height mode - "full" for full page height or "element" for element height
+            include_source: Whether to include this FlowRegion in the result
+            until: Optional selector string to specify a right boundary element
+            include_endpoint: Whether to include the boundary element in the region
+            **kwargs: Additional parameters
+
+        Returns:
+            New FlowRegion with regions to the right
+        """
+        if not self.constituent_regions:
+            return FlowRegion(
+                flow=self.flow,
+                constituent_regions=[],
+                source_flow_element=self.source_flow_element,
+                boundary_element_found=self.boundary_element_found,
             )
-        return None

-
-
-
-
+        new_regions = []
+
+        if self.flow.arrangement == "vertical":
+            # For vertical flow, expand all regions rightward
+            for region in self.constituent_regions:
+                right_region = region.right(
+                    width=width,
+                    height="element",
+                    include_source=include_source,
+                    until=until,
+                    include_endpoint=include_endpoint,
+                    **kwargs,
+                )
+                new_regions.append(right_region)
+        else:  # horizontal flow
+            # For horizontal flow, only expand the rightmost region rightward
+            rightmost_region = max(self.constituent_regions, key=lambda r: r.x1)
+            for region in self.constituent_regions:
+                if region == rightmost_region:
+                    # Expand this region rightward
+                    right_region = region.right(
+                        width=width,
+                        height="element",
+                        include_source=include_source,
+                        until=until,
+                        include_endpoint=include_endpoint,
+                        **kwargs,
+                    )
+                    new_regions.append(right_region)
+                elif include_source:
+                    # Include other regions unchanged if include_source is True
+                    new_regions.append(region)
+
+        return FlowRegion(
+            flow=self.flow,
+            constituent_regions=new_regions,
+            source_flow_element=self.source_flow_element,
+            boundary_element_found=self.boundary_element_found,
         )

+    def to_region(self) -> "FlowRegion":
+        """
+        Convert this FlowRegion to a region (returns a copy).
+        This is equivalent to calling expand() with no arguments.
+
+        Returns:
+            Copy of this FlowRegion
+        """
+        return self.expand()
+
     @property
     def is_empty(self) -> bool:
         """Checks if the FlowRegion contains no constituent regions or if all are empty."""
```
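The new directional helpers are deliberately asymmetric: in a vertical flow, `above()` grows only the first constituent part and `below()` only the last, while `expand()` applies `top`/`bottom` only at the flow ends and `left`/`right` to every part (with the mirror-image rule for horizontal flows). The sketch below isolates that rule on plain bbox tuples; the helper function and coordinates are illustrative, not natural-pdf API.

```python
# Sketch of the end-only growth rule used by FlowRegion.expand()/above()/below()
# for a vertical flow: `top` affects only the first part, `bottom` only the last.
from typing import List, Tuple

Bbox = Tuple[float, float, float, float]  # (x0, top, x1, bottom), y grows downward


def grow_vertical_flow(parts: List[Bbox], top: float = 0, bottom: float = 0) -> List[Bbox]:
    grown = []
    for idx, (x0, y0, x1, y1) in enumerate(parts):
        t = top if idx == 0 else 0                  # only the first part grows upward
        b = bottom if idx == len(parts) - 1 else 0  # only the last part grows downward
        grown.append((x0, y0 - t, x1, y1 + b))
    return grown


parts = [(50, 400, 550, 750), (50, 50, 550, 300)]  # page-1 tail, page-2 head
print(grow_vertical_flow(parts, top=20, bottom=20))
# [(50, 380, 550, 750), (50, 50, 550, 320)]
```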
```diff
@@ -637,6 +961,7 @@ class FlowRegion:
         stitch_rows: Optional[
             Callable[[List[Optional[str]], List[Optional[str]], int, "PhysicalRegion"], bool]
         ] = None,
+        merge_headers: Optional[bool] = None,
         **kwargs,
     ) -> TableResult:
         """Extracts a single logical table from the FlowRegion.
@@ -650,6 +975,11 @@ class FlowRegion:
             method, table_settings, use_ocr, ocr_config, text_options, cell_extraction_func, show_progress:
                 Same as in :pymeth:`Region.extract_table` and are forwarded as-is
                 to each physical region.
+            merge_headers: Whether to merge tables by removing repeated headers from subsequent
+                pages/segments. If None (default), auto-detects by checking if the first row
+                of each segment matches the first row of the first segment. If segments have
+                inconsistent header patterns (some repeat, others don't), raises ValueError.
+                Useful for multi-page tables where headers repeat on each page.
             **kwargs: Additional keyword arguments forwarded to the underlying
                 ``Region.extract_table`` implementation.

@@ -661,6 +991,7 @@ class FlowRegion:
         stitch_rows parameter:
             Controls whether the first rows of subsequent segments/regions should be merged
             into the previous row (to handle spill-over across page breaks).
+            Applied AFTER header removal if merge_headers is enabled.

             • None (default) – no merging (behaviour identical to previous versions).
             • Callable – custom predicate taking
@@ -679,9 +1010,11 @@ class FlowRegion:
         # Resolve stitch_rows predicate -------------------------------------------------------
         predicate: Optional[
             Callable[[List[Optional[str]], List[Optional[str]], int, "PhysicalRegion"], bool]
-        ] = stitch_rows if callable(stitch_rows) else None
+        ] = (stitch_rows if callable(stitch_rows) else None)

-        def _default_merge(
+        def _default_merge(
+            prev_row: List[Optional[str]], cur_row: List[Optional[str]]
+        ) -> List[Optional[str]]:
             """Column-wise merge – concatenates non-empty strings with a space."""
             from itertools import zip_longest

@@ -694,6 +1027,10 @@ class FlowRegion:
             return merged

         aggregated_rows: List[List[Optional[str]]] = []
+        header_row: Optional[List[Optional[str]]] = None
+        merge_headers_enabled = False
+        headers_warned = False  # Track if we've already warned about dropping headers
+        segment_has_repeated_header = []  # Track which segments have repeated headers

         for region_idx, region in enumerate(self.constituent_regions):
             try:
@@ -717,6 +1054,59 @@ class FlowRegion:
                 else:
                     segment_rows = list(region_result)

+                # Handle header detection and merging for multi-page tables
+                if region_idx == 0:
+                    # First segment: capture potential header row
+                    if segment_rows:
+                        header_row = segment_rows[0]
+                        # Determine if we should merge headers
+                        if merge_headers is None:
+                            # Auto-detect: we'll check all subsequent segments
+                            merge_headers_enabled = False  # Will be determined later
+                        else:
+                            merge_headers_enabled = merge_headers
+                        # Track that first segment exists (for consistency checking)
+                        segment_has_repeated_header.append(False)  # First segment doesn't "repeat"
+                elif region_idx == 1 and merge_headers is None:
+                    # Auto-detection: check if first row of second segment matches header
+                    has_header = segment_rows and header_row and segment_rows[0] == header_row
+                    segment_has_repeated_header.append(has_header)
+
+                    if has_header:
+                        merge_headers_enabled = True
+                        # Remove the detected repeated header from this segment
+                        segment_rows = segment_rows[1:]
+                        if not headers_warned:
+                            warnings.warn(
+                                "Detected repeated headers in multi-page table. Merging by removing "
+                                "repeated headers from subsequent pages.",
+                                UserWarning,
+                                stacklevel=2,
+                            )
+                            headers_warned = True
+                    else:
+                        merge_headers_enabled = False
+                elif region_idx > 1:
+                    # Check consistency: all segments should have same pattern
+                    has_header = segment_rows and header_row and segment_rows[0] == header_row
+                    segment_has_repeated_header.append(has_header)
+
+                    # Remove header if merging is enabled and header is present
+                    if merge_headers_enabled and has_header:
+                        segment_rows = segment_rows[1:]
+                elif region_idx > 0 and merge_headers_enabled:
+                    # Explicit merge_headers=True: remove headers from subsequent segments
+                    if segment_rows and header_row and segment_rows[0] == header_row:
+                        segment_rows = segment_rows[1:]
+                        if not headers_warned:
+                            warnings.warn(
+                                "Removing repeated headers from multi-page table during merge.",
+                                UserWarning,
+                                stacklevel=2,
+                            )
+                            headers_warned = True
+
+                # Process remaining rows with stitch_rows logic
                 for row_idx, row in enumerate(segment_rows):
                     if (
                         predicate is not None
@@ -733,6 +1123,26 @@ class FlowRegion:
                     exc_info=True,
                 )

+        # Check for inconsistent header patterns after processing all segments
+        if merge_headers is None and len(segment_has_repeated_header) > 2:
+            # During auto-detection, check for consistency across all segments
+            expected_pattern = segment_has_repeated_header[1]  # Pattern from second segment
+            for seg_idx, has_header in enumerate(segment_has_repeated_header[2:], 2):
+                if has_header != expected_pattern:
+                    # Inconsistent pattern detected
+                    segments_with_headers = [
+                        i for i, has_h in enumerate(segment_has_repeated_header[1:], 1) if has_h
+                    ]
+                    segments_without_headers = [
+                        i for i, has_h in enumerate(segment_has_repeated_header[1:], 1) if not has_h
+                    ]
+                    raise ValueError(
+                        f"Inconsistent header pattern in multi-page table: "
+                        f"segments {segments_with_headers} have repeated headers, "
+                        f"but segments {segments_without_headers} do not. "
+                        f"All segments must have the same header pattern for reliable merging."
+                    )
+
         return TableResult(aggregated_rows)

     def extract_tables(
```
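`extract_table` gains a `merge_headers` option for multi-page tables: with the default `None` it auto-detects a repeated header by comparing each segment's first row to the first segment's first row, drops the repeats, and raises `ValueError` when only some segments repeat the header. The sketch below mirrors that behaviour on already-extracted per-segment row lists; it is plain Python for illustration, not the natural-pdf implementation.

```python
# Illustrative re-statement of the merge_headers auto-detection on plain row lists.
from typing import List, Optional

Row = List[Optional[str]]


def merge_segments(segments: List[List[Row]]) -> List[Row]:
    if not segments or not segments[0]:
        return [row for seg in segments for row in seg]
    header = segments[0][0]
    repeated = [bool(seg) and seg[0] == header for seg in segments[1:]]
    if any(repeated) and not all(repeated):
        raise ValueError("Inconsistent header pattern across table segments")
    merged: List[Row] = list(segments[0])
    for seg, has_header in zip(segments[1:], repeated):
        merged.extend(seg[1:] if has_header else seg)
    return merged


page1 = [["Name", "Amount"], ["Alice", "10"], ["Bob", "7"]]
page2 = [["Name", "Amount"], ["Cara", "3"]]  # repeated header gets dropped
print(merge_segments([page1, page2]))
# [['Name', 'Amount'], ['Alice', '10'], ['Bob', '7'], ['Cara', '3']]
```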
```diff
@@ -799,3 +1209,39 @@ class FlowRegion:
         This is an alias for normalized_type.
         """
         return self.normalized_type
+
+    def get_highlight_specs(self) -> List[Dict[str, Any]]:
+        """
+        Get highlight specifications for all constituent regions.
+
+        This implements the highlighting protocol for FlowRegions, returning
+        specs for each constituent region so they can be highlighted on their
+        respective pages.
+
+        Returns:
+            List of highlight specification dictionaries, one for each
+            constituent region.
+        """
+        specs = []
+
+        for region in self.constituent_regions:
+            if not hasattr(region, "page") or region.page is None:
+                continue
+
+            if not hasattr(region, "bbox") or region.bbox is None:
+                continue
+
+            spec = {
+                "page": region.page,
+                "page_index": region.page.index if hasattr(region.page, "index") else 0,
+                "bbox": region.bbox,
+                "element": region,  # Reference to the constituent region
+            }
+
+            # Add polygon if available
+            if hasattr(region, "polygon") and hasattr(region, "has_polygon") and region.has_polygon:
+                spec["polygon"] = region.polygon
+
+            specs.append(spec)
+
+        return specs
```