natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. natural_pdf/__init__.py +11 -6
  2. natural_pdf/analyzers/__init__.py +6 -1
  3. natural_pdf/analyzers/guides.py +354 -258
  4. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -4
  6. natural_pdf/analyzers/layout/paddle.py +11 -0
  7. natural_pdf/analyzers/layout/surya.py +2 -3
  8. natural_pdf/analyzers/shape_detection_mixin.py +25 -34
  9. natural_pdf/analyzers/text_structure.py +2 -2
  10. natural_pdf/classification/manager.py +1 -1
  11. natural_pdf/collections/mixins.py +3 -2
  12. natural_pdf/core/highlighting_service.py +743 -32
  13. natural_pdf/core/page.py +252 -399
  14. natural_pdf/core/page_collection.py +1249 -0
  15. natural_pdf/core/pdf.py +231 -89
  16. natural_pdf/{collections → core}/pdf_collection.py +18 -11
  17. natural_pdf/core/render_spec.py +335 -0
  18. natural_pdf/describe/base.py +1 -1
  19. natural_pdf/elements/__init__.py +1 -0
  20. natural_pdf/elements/base.py +108 -83
  21. natural_pdf/elements/{collections.py → element_collection.py} +575 -1372
  22. natural_pdf/elements/line.py +0 -1
  23. natural_pdf/elements/rect.py +0 -1
  24. natural_pdf/elements/region.py +405 -280
  25. natural_pdf/elements/text.py +9 -7
  26. natural_pdf/exporters/base.py +2 -2
  27. natural_pdf/exporters/original_pdf.py +1 -1
  28. natural_pdf/exporters/paddleocr.py +2 -4
  29. natural_pdf/exporters/searchable_pdf.py +3 -2
  30. natural_pdf/extraction/mixin.py +1 -3
  31. natural_pdf/flows/collections.py +1 -69
  32. natural_pdf/flows/element.py +25 -0
  33. natural_pdf/flows/flow.py +1658 -19
  34. natural_pdf/flows/region.py +757 -263
  35. natural_pdf/ocr/ocr_options.py +0 -2
  36. natural_pdf/ocr/utils.py +2 -1
  37. natural_pdf/qa/document_qa.py +21 -5
  38. natural_pdf/search/search_service_protocol.py +1 -1
  39. natural_pdf/selectors/parser.py +35 -2
  40. natural_pdf/tables/result.py +35 -1
  41. natural_pdf/text_mixin.py +101 -0
  42. natural_pdf/utils/debug.py +2 -1
  43. natural_pdf/utils/highlighting.py +1 -0
  44. natural_pdf/utils/layout.py +2 -2
  45. natural_pdf/utils/packaging.py +4 -3
  46. natural_pdf/utils/text_extraction.py +15 -12
  47. natural_pdf/utils/visualization.py +385 -0
  48. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
  49. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -52
  50. optimization/memory_comparison.py +1 -1
  51. optimization/pdf_analyzer.py +2 -2
  52. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
  53. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
  54. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
  55. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
@@ -1,11 +1,13 @@
1
1
  import logging
2
- from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
2
+ import warnings
3
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Tuple, Union
3
4
 
4
5
  from pdfplumber.utils.geometry import merge_bboxes # Import merge_bboxes directly
5
6
 
6
7
  # For runtime image manipulation
7
8
  from PIL import Image as PIL_Image_Runtime
8
9
 
10
+ from natural_pdf.core.render_spec import RenderSpec, Visualizable
9
11
  from natural_pdf.tables import TableResult
10
12
 
11
13
  if TYPE_CHECKING:
@@ -13,7 +15,7 @@ if TYPE_CHECKING:
13
15
 
14
16
  from natural_pdf.core.page import Page as PhysicalPage
15
17
  from natural_pdf.elements.base import Element as PhysicalElement
16
- from natural_pdf.elements.collections import ElementCollection
18
+ from natural_pdf.elements.element_collection import ElementCollection
17
19
  from natural_pdf.elements.region import Region as PhysicalRegion
18
20
 
19
21
  from .element import FlowElement
@@ -22,7 +24,7 @@ if TYPE_CHECKING:
22
24
  logger = logging.getLogger(__name__)
23
25
 
24
26
 
25
- class FlowRegion:
27
+ class FlowRegion(Visualizable):
26
28
  """
27
29
  Represents a selected area within a Flow, potentially composed of multiple
28
30
  physical Region objects (constituent_regions) that might span across
@@ -65,17 +67,156 @@ class FlowRegion:
65
67
  self._cached_elements: Optional["ElementCollection"] = None # Stringized
66
68
  self._cached_bbox: Optional[Tuple[float, float, float, float]] = None
67
69
 
70
+ def _get_highlighter(self):
71
+ """Get the highlighting service from constituent regions."""
72
+ if not self.constituent_regions:
73
+ raise RuntimeError("FlowRegion has no constituent regions to get highlighter from")
74
+
75
+ # Get highlighter from first constituent region
76
+ first_region = self.constituent_regions[0]
77
+ if hasattr(first_region, "_highlighter"):
78
+ return first_region._highlighter
79
+ elif hasattr(first_region, "page") and hasattr(first_region.page, "_highlighter"):
80
+ return first_region.page._highlighter
81
+ else:
82
+ raise RuntimeError(
83
+ f"Cannot find HighlightingService from FlowRegion constituent regions. "
84
+ f"First region type: {type(first_region).__name__}"
85
+ )
86
+
87
+ def _get_render_specs(
88
+ self,
89
+ mode: Literal["show", "render"] = "show",
90
+ color: Optional[Union[str, Tuple[int, int, int]]] = None,
91
+ highlights: Optional[List[Dict[str, Any]]] = None,
92
+ crop: Union[bool, Literal["content"]] = False,
93
+ crop_bbox: Optional[Tuple[float, float, float, float]] = None,
94
+ **kwargs,
95
+ ) -> List[RenderSpec]:
96
+ """Get render specifications for this flow region.
97
+
98
+ Args:
99
+ mode: Rendering mode - 'show' includes highlights, 'render' is clean
100
+ color: Color for highlighting this region in show mode
101
+ highlights: Additional highlight groups to show
102
+ crop: Whether to crop to constituent regions
103
+ crop_bbox: Explicit crop bounds
104
+ **kwargs: Additional parameters
105
+
106
+ Returns:
107
+ List of RenderSpec objects, one per page with constituent regions
108
+ """
109
+ if not self.constituent_regions:
110
+ return []
111
+
112
+ # Group constituent regions by page
113
+ regions_by_page = {}
114
+ for region in self.constituent_regions:
115
+ if hasattr(region, "page") and region.page:
116
+ page = region.page
117
+ if page not in regions_by_page:
118
+ regions_by_page[page] = []
119
+ regions_by_page[page].append(region)
120
+
121
+ if not regions_by_page:
122
+ return []
123
+
124
+ # Create RenderSpec for each page
125
+ specs = []
126
+ for page, page_regions in regions_by_page.items():
127
+ spec = RenderSpec(page=page)
128
+
129
+ # Handle cropping
130
+ if crop_bbox:
131
+ spec.crop_bbox = crop_bbox
132
+ elif crop == "content" or crop is True:
133
+ # Calculate bounds of regions on this page
134
+ x_coords = []
135
+ y_coords = []
136
+ for region in page_regions:
137
+ if hasattr(region, "bbox") and region.bbox:
138
+ x0, y0, x1, y1 = region.bbox
139
+ x_coords.extend([x0, x1])
140
+ y_coords.extend([y0, y1])
141
+
142
+ if x_coords and y_coords:
143
+ spec.crop_bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
144
+
145
+ # Add highlights in show mode
146
+ if mode == "show":
147
+ # Highlight constituent regions
148
+ for i, region in enumerate(page_regions):
149
+ # Label each part if multiple regions
150
+ label = None
151
+ if len(self.constituent_regions) > 1:
152
+ # Find global index
153
+ try:
154
+ global_idx = self.constituent_regions.index(region)
155
+ label = f"FlowPart_{global_idx + 1}"
156
+ except ValueError:
157
+ label = f"FlowPart_{i + 1}"
158
+ else:
159
+ label = "FlowRegion"
160
+
161
+ spec.add_highlight(
162
+ bbox=region.bbox,
163
+ polygon=region.polygon if region.has_polygon else None,
164
+ color=color or "fuchsia",
165
+ label=label,
166
+ )
167
+
168
+ # Add additional highlight groups if provided
169
+ if highlights:
170
+ for group in highlights:
171
+ group_elements = group.get("elements", [])
172
+ group_color = group.get("color", color)
173
+ group_label = group.get("label")
174
+
175
+ for elem in group_elements:
176
+ # Only add if element is on this page
177
+ if hasattr(elem, "page") and elem.page == page:
178
+ spec.add_highlight(
179
+ element=elem, color=group_color, label=group_label
180
+ )
181
+
182
+ specs.append(spec)
183
+
184
+ return specs
185
+
68
186
  def __getattr__(self, name: str) -> Any:
69
187
  """
70
- Dynamically proxy attribute access to the source FlowElement if the
71
- attribute is not found in this instance.
188
+ Dynamically proxy attribute access to the source FlowElement for safe attributes only.
189
+ Spatial methods (above, below, left, right) are explicitly implemented to prevent
190
+ silent failures and incorrect behavior.
72
191
  """
73
192
  if name in self.__dict__:
74
193
  return self.__dict__[name]
75
- elif self.source_flow_element is not None:
76
- return getattr(self.source_flow_element, name)
77
- else:
78
- raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
194
+
195
+ # List of methods that should NOT be proxied - they need proper FlowRegion implementation
196
+ spatial_methods = {"above", "below", "left", "right", "to_region"}
197
+
198
+ if name in spatial_methods:
199
+ raise AttributeError(
200
+ f"'{self.__class__.__name__}' object has no attribute '{name}'. "
201
+ f"This method requires proper FlowRegion implementation to handle spatial relationships correctly."
202
+ )
203
+
204
+ # Only proxy safe attributes and methods
205
+ if self.source_flow_element is not None:
206
+ try:
207
+ attr = getattr(self.source_flow_element, name)
208
+ # Only proxy non-callable attributes and explicitly safe methods
209
+ if not callable(attr) or name in {"page", "document"}: # Add safe methods as needed
210
+ return attr
211
+ else:
212
+ raise AttributeError(
213
+ f"Method '{name}' cannot be safely proxied from FlowElement to FlowRegion. "
214
+ f"It may need explicit implementation."
215
+ )
216
+ except AttributeError:
217
+ pass
218
+
219
+ raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
79
220
 
80
221
  @property
81
222
  def bbox(self) -> Optional[Tuple[float, float, float, float]]:
@@ -90,10 +231,12 @@ class FlowRegion:
90
231
 
91
232
  # Use merge_bboxes from pdfplumber.utils.geometry to merge bboxes
92
233
  # Extract bbox tuples from regions first
93
- region_bboxes = [region.bbox for region in self.constituent_regions if hasattr(region, "bbox")]
234
+ region_bboxes = [
235
+ region.bbox for region in self.constituent_regions if hasattr(region, "bbox")
236
+ ]
94
237
  if not region_bboxes:
95
238
  return None
96
-
239
+
97
240
  self._cached_bbox = merge_bboxes(region_bboxes)
98
241
  return self._cached_bbox
99
242
 
@@ -171,7 +314,7 @@ class FlowRegion:
171
314
  Returns:
172
315
  An ElementCollection containing all unique elements.
173
316
  """
174
- from natural_pdf.elements.collections import (
317
+ from natural_pdf.elements.element_collection import (
175
318
  ElementCollection as RuntimeElementCollection, # Local import
176
319
  )
177
320
 
@@ -257,7 +400,7 @@ class FlowRegion:
257
400
  chains each region's native ``find_all`` call and concatenates their
258
401
  results into a single ElementCollection while preserving flow order.
259
402
  """
260
- from natural_pdf.elements.collections import (
403
+ from natural_pdf.elements.element_collection import (
261
404
  ElementCollection as RuntimeElementCollection,
262
405
  )
263
406
 
@@ -268,9 +411,7 @@ class FlowRegion:
268
411
 
269
412
  for region in self.constituent_regions:
270
413
  try:
271
- region_matches = region.find_all(
272
- selector=selector, text=text, **kwargs
273
- )
414
+ region_matches = region.find_all(selector=selector, text=text, **kwargs)
274
415
  if region_matches:
275
416
  # ``region_matches`` is an ElementCollection – extend with its
276
417
  # underlying list so we don't create nested collections.
@@ -312,200 +453,33 @@ class FlowRegion:
312
453
  region.highlight(label=current_label, color=color, **kwargs)
313
454
  return self
314
455
 
315
- def show(
316
- self,
317
- resolution: Optional[float] = None,
318
- labels: bool = True,
319
- legend_position: str = "right",
320
- color: Optional[Union[Tuple, str]] = "fuchsia",
321
- label_prefix: Optional[str] = "FlowPart",
322
- width: Optional[int] = None,
323
- stack_direction: str = "vertical",
324
- stack_gap: int = 5,
325
- stack_background_color: Tuple[int, int, int] = (255, 255, 255),
326
- crop: bool = False,
327
- **kwargs,
328
- ) -> Optional["PIL_Image"]:
329
- """
330
- Generates and returns a PIL Image of relevant pages with constituent regions highlighted.
331
- If multiple pages are involved, they are stacked into a single image.
332
-
333
- Args:
334
- resolution: Resolution in DPI for page rendering. If None, uses global setting or defaults to 144 DPI.
335
- labels: Whether to include a legend for highlights.
336
- legend_position: Position of the legend ('right', 'bottom', 'top', 'left').
337
- color: Color for highlighting the constituent regions.
338
- label_prefix: Prefix for region labels (e.g., 'FlowPart').
339
- width: Optional width for the output image (overrides resolution).
340
- stack_direction: Direction to stack multiple pages ('vertical' or 'horizontal').
341
- stack_gap: Gap in pixels between stacked pages.
342
- stack_background_color: RGB background color for the stacked image.
343
- crop: If True, crop each rendered page to the bounding box of constituent regions on that page.
344
- **kwargs: Additional arguments passed to the underlying rendering methods.
345
-
346
- Returns:
347
- PIL Image of the rendered pages with highlighted regions, or None if rendering fails.
456
+ def highlights(self, show: bool = False) -> "HighlightContext":
348
457
  """
349
- if not self.constituent_regions:
350
- logger.info("FlowRegion.show() called with no constituent regions.")
351
- return None
458
+ Create a highlight context for accumulating highlights.
352
459
 
353
- # 1. Group constituent regions by their physical page
354
- regions_by_page: Dict["PhysicalPage", List["PhysicalRegion"]] = {}
355
- for region in self.constituent_regions:
356
- if region.page:
357
- if region.page not in regions_by_page:
358
- regions_by_page[region.page] = []
359
- regions_by_page[region.page].append(region)
360
- else:
361
- raise ValueError(f"Constituent region {region.bbox} has no page.")
460
+ This allows for clean syntax to show multiple highlight groups:
362
461
 
363
- if not regions_by_page:
364
- logger.info("FlowRegion.show() found no constituent regions with associated pages.")
365
- return None
366
-
367
- # 2. Get a highlighter service (e.g., from the first page involved)
368
- first_page_with_regions = next(iter(regions_by_page.keys()), None)
369
- highlighter_service = None
370
- if first_page_with_regions and hasattr(first_page_with_regions, "_highlighter"):
371
- highlighter_service = first_page_with_regions._highlighter
462
+ Example:
463
+ with flow_region.highlights() as h:
464
+ h.add(flow_region.find_all('table'), label='tables', color='blue')
465
+ h.add(flow_region.find_all('text:bold'), label='bold text', color='red')
466
+ h.show()
372
467
 
373
- if not highlighter_service:
374
- raise ValueError(
375
- "Cannot get highlighter service for FlowRegion.show(). "
376
- "Ensure constituent regions' pages are initialized with a highlighter."
377
- )
378
-
379
- output_page_images: List["PIL_Image_Runtime"] = []
468
+ Or with automatic display:
469
+ with flow_region.highlights(show=True) as h:
470
+ h.add(flow_region.find_all('table'), label='tables')
471
+ h.add(flow_region.find_all('text:bold'), label='bold')
472
+ # Automatically shows when exiting the context
380
473
 
381
- # Sort pages by index for consistent output order
382
- sorted_pages = sorted(
383
- regions_by_page.keys(),
384
- key=lambda p: p.index if hasattr(p, "index") else getattr(p, "page_number", 0),
385
- )
386
-
387
- # 3. Render each page with its relevant constituent regions highlighted
388
- for page_idx, page_obj in enumerate(sorted_pages):
389
- constituent_regions_on_this_page = regions_by_page[page_obj]
390
- if not constituent_regions_on_this_page:
391
- continue
392
-
393
- temp_highlights_for_page = []
394
- for i, region_part in enumerate(constituent_regions_on_this_page):
395
- part_label = None
396
- if labels and label_prefix: # Ensure labels is True for label_prefix to apply
397
- # If FlowRegion consists of multiple parts on this page, or overall
398
- count_indicator = ""
399
- if (
400
- len(self.constituent_regions) > 1
401
- ): # If flow region has multiple parts overall
402
- # Find global index of this region_part in self.constituent_regions
403
- try:
404
- global_idx = self.constituent_regions.index(region_part)
405
- count_indicator = f"_{global_idx + 1}"
406
- except ValueError: # Should not happen if region_part is from the list
407
- count_indicator = f"_p{page_idx}i{i+1}" # fallback local index
408
- elif (
409
- len(constituent_regions_on_this_page) > 1
410
- ): # If multiple parts on *this* page, but FR is single part overall
411
- count_indicator = f"_{i+1}"
412
-
413
- part_label = f"{label_prefix}{count_indicator}" if label_prefix else None
414
-
415
- temp_highlights_for_page.append(
416
- {
417
- "page_index": (
418
- page_obj.index
419
- if hasattr(page_obj, "index")
420
- else getattr(page_obj, "page_number", 1) - 1
421
- ),
422
- "bbox": region_part.bbox,
423
- "polygon": region_part.polygon if region_part.has_polygon else None,
424
- "color": color, # Use the passed color
425
- "label": part_label,
426
- "use_color_cycling": False, # Keep specific color
427
- }
428
- )
429
-
430
- if not temp_highlights_for_page:
431
- continue
432
-
433
- # Calculate crop bbox if cropping is enabled
434
- crop_bbox = None
435
- if crop and constituent_regions_on_this_page:
436
- # Calculate the bounding box that encompasses all constituent regions on this page
437
- min_x0 = min(region.bbox[0] for region in constituent_regions_on_this_page)
438
- min_y0 = min(region.bbox[1] for region in constituent_regions_on_this_page)
439
- max_x1 = max(region.bbox[2] for region in constituent_regions_on_this_page)
440
- max_y1 = max(region.bbox[3] for region in constituent_regions_on_this_page)
441
- crop_bbox = (min_x0, min_y0, max_x1, max_y1)
442
-
443
- page_image = highlighter_service.render_preview(
444
- page_index=(
445
- page_obj.index
446
- if hasattr(page_obj, "index")
447
- else getattr(page_obj, "page_number", 1) - 1
448
- ),
449
- temporary_highlights=temp_highlights_for_page,
450
- resolution=resolution,
451
- width=width,
452
- labels=labels, # Pass through labels
453
- legend_position=legend_position,
454
- crop_bbox=crop_bbox,
455
- **kwargs,
456
- )
457
- if page_image:
458
- output_page_images.append(page_image)
459
-
460
- # 4. Stack the generated page images if multiple
461
- if not output_page_images:
462
- logger.info("FlowRegion.show() produced no page images to concatenate.")
463
- return None
464
-
465
- if len(output_page_images) == 1:
466
- return output_page_images[0]
474
+ Args:
475
+ show: If True, automatically show highlights when exiting context
467
476
 
468
- # Stacking logic (same as in FlowRegionCollection.show)
469
- if stack_direction == "vertical":
470
- final_width = max(img.width for img in output_page_images)
471
- final_height = (
472
- sum(img.height for img in output_page_images)
473
- + (len(output_page_images) - 1) * stack_gap
474
- )
475
- if final_width == 0 or final_height == 0:
476
- raise ValueError("Cannot create concatenated image with zero width or height.")
477
+ Returns:
478
+ HighlightContext for accumulating highlights
479
+ """
480
+ from natural_pdf.core.highlighting_service import HighlightContext
477
481
 
478
- concatenated_image = PIL_Image_Runtime.new(
479
- "RGB", (final_width, final_height), stack_background_color
480
- )
481
- current_y = 0
482
- for img in output_page_images:
483
- paste_x = (final_width - img.width) // 2
484
- concatenated_image.paste(img, (paste_x, current_y))
485
- current_y += img.height + stack_gap
486
- return concatenated_image
487
- elif stack_direction == "horizontal":
488
- final_width = (
489
- sum(img.width for img in output_page_images)
490
- + (len(output_page_images) - 1) * stack_gap
491
- )
492
- final_height = max(img.height for img in output_page_images)
493
- if final_width == 0 or final_height == 0:
494
- raise ValueError("Cannot create concatenated image with zero width or height.")
495
-
496
- concatenated_image = PIL_Image_Runtime.new(
497
- "RGB", (final_width, final_height), stack_background_color
498
- )
499
- current_x = 0
500
- for img in output_page_images:
501
- paste_y = (final_height - img.height) // 2
502
- concatenated_image.paste(img, (current_x, paste_y))
503
- current_x += img.width + stack_gap
504
- return concatenated_image
505
- else:
506
- raise ValueError(
507
- f"Invalid stack_direction '{stack_direction}' for FlowRegion.show(). Must be 'vertical' or 'horizontal'."
508
- )
482
+ return HighlightContext(self, show_on_exit=show)
509
483
 
510
484
  def to_images(
511
485
  self,
@@ -523,9 +497,8 @@ class FlowRegion:
523
497
  cropped_images: List["PIL_Image"] = []
524
498
  for region_part in self.constituent_regions:
525
499
  try:
526
- img = region_part.to_image(
527
- resolution=resolution, crop=True, include_highlights=False, **kwargs
528
- )
500
+ # Use render() for clean image without highlights
501
+ img = region_part.render(resolution=resolution, crop=True, **kwargs)
529
502
  if img:
530
503
  cropped_images.append(img)
531
504
  except Exception as e:
@@ -536,73 +509,424 @@ class FlowRegion:
536
509
 
537
510
  return cropped_images
538
511
 
539
- def to_image(self, background_color=(255, 255, 255), **kwargs) -> Optional["PIL_Image"]:
512
+ def __repr__(self) -> str:
513
+ return (
514
+ f"<FlowRegion constituents={len(self.constituent_regions)}, flow={self.flow}, "
515
+ f"source_bbox={self.source_flow_element.bbox if self.source_flow_element else 'N/A'}>"
516
+ )
517
+
518
+ def expand(
519
+ self,
520
+ left: float = 0,
521
+ right: float = 0,
522
+ top: float = 0,
523
+ bottom: float = 0,
524
+ width_factor: float = 1.0,
525
+ height_factor: float = 1.0,
526
+ ) -> "FlowRegion":
540
527
  """
541
- Creates a single composite image by stacking the images of its constituent regions.
542
- Stacking direction is based on the Flow's arrangement.
543
- Individual region images are obtained by calling to_images(**kwargs).
528
+ Create a new FlowRegion with all constituent regions expanded.
544
529
 
545
530
  Args:
546
- background_color: Tuple for RGB background color of the composite image.
547
- **kwargs: Additional arguments passed to to_images() for rendering individual parts
548
- (e.g., resolution).
531
+ left: Amount to expand left edge (positive value expands leftwards)
532
+ right: Amount to expand right edge (positive value expands rightwards)
533
+ top: Amount to expand top edge (positive value expands upwards)
534
+ bottom: Amount to expand bottom edge (positive value expands downwards)
535
+ width_factor: Factor to multiply width by (applied after absolute expansion)
536
+ height_factor: Factor to multiply height by (applied after absolute expansion)
549
537
 
550
538
  Returns:
551
- A single PIL.Image.Image object, or None if no constituent images.
539
+ New FlowRegion with expanded constituent regions
552
540
  """
553
- # Use PIL_Image_Runtime for creating new images at runtime
554
- images = self.to_images(**kwargs)
555
- if not images:
556
- return None
557
- if len(images) == 1:
558
- return images[0]
541
+ if not self.constituent_regions:
542
+ return FlowRegion(
543
+ flow=self.flow,
544
+ constituent_regions=[],
545
+ source_flow_element=self.source_flow_element,
546
+ boundary_element_found=self.boundary_element_found,
547
+ )
548
+
549
+ expanded_regions = []
550
+ for idx, region in enumerate(self.constituent_regions):
551
+ # Determine which adjustments to apply based on flow arrangement
552
+ apply_left = left
553
+ apply_right = right
554
+ apply_top = top
555
+ apply_bottom = bottom
556
+
557
+ if self.flow.arrangement == "vertical":
558
+ # In a vertical flow, only the *first* region should react to `top`
559
+ # and only the *last* region should react to `bottom`. This keeps
560
+ # the virtual contiguous area intact while allowing users to nudge
561
+ # the flow boundaries.
562
+ if idx != 0:
563
+ apply_top = 0
564
+ if idx != len(self.constituent_regions) - 1:
565
+ apply_bottom = 0
566
+ # left/right apply to every region (same column width change)
567
+ else: # horizontal flow
568
+ # In a horizontal flow, only the first region reacts to `left`
569
+ # and only the last region reacts to `right`.
570
+ if idx != 0:
571
+ apply_left = 0
572
+ if idx != len(self.constituent_regions) - 1:
573
+ apply_right = 0
574
+ # top/bottom apply to every region in horizontal flows
575
+
576
+ # Skip no-op expansion to avoid extra Region objects
577
+ needs_expansion = (
578
+ any(
579
+ v not in (0, 1.0) # compare width/height factor logically later
580
+ for v in (apply_left, apply_right, apply_top, apply_bottom)
581
+ )
582
+ or width_factor != 1.0
583
+ or height_factor != 1.0
584
+ )
585
+
586
+ try:
587
+ expanded_region = (
588
+ region.expand(
589
+ left=apply_left,
590
+ right=apply_right,
591
+ top=apply_top,
592
+ bottom=apply_bottom,
593
+ width_factor=width_factor,
594
+ height_factor=height_factor,
595
+ )
596
+ if needs_expansion
597
+ else region
598
+ )
599
+ expanded_regions.append(expanded_region)
600
+ except Exception as e:
601
+ logger.warning(
602
+ f"FlowRegion.expand: Error expanding constituent region {region.bbox}: {e}",
603
+ exc_info=False,
604
+ )
605
+ expanded_regions.append(region)
606
+
607
+ # Create new FlowRegion with expanded constituent regions
608
+ new_flow_region = FlowRegion(
609
+ flow=self.flow,
610
+ constituent_regions=expanded_regions,
611
+ source_flow_element=self.source_flow_element,
612
+ boundary_element_found=self.boundary_element_found,
613
+ )
614
+
615
+ # Copy metadata
616
+ new_flow_region.source = self.source
617
+ new_flow_region.region_type = self.region_type
618
+ new_flow_region.metadata = self.metadata.copy()
619
+
620
+ # Clear caches since the regions have changed
621
+ new_flow_region._cached_text = None
622
+ new_flow_region._cached_elements = None
623
+ new_flow_region._cached_bbox = None
624
+
625
+ return new_flow_region
626
+
627
+ def above(
628
+ self,
629
+ height: Optional[float] = None,
630
+ width: str = "full",
631
+ include_source: bool = False,
632
+ until: Optional[str] = None,
633
+ include_endpoint: bool = True,
634
+ **kwargs,
635
+ ) -> "FlowRegion":
636
+ """
637
+ Create a FlowRegion with regions above this FlowRegion.
638
+
639
+ For vertical flows: Only expands the topmost constituent region upward.
640
+ For horizontal flows: Expands all constituent regions upward.
641
+
642
+ Args:
643
+ height: Height of the region above, in points
644
+ width: Width mode - "full" for full page width or "element" for element width
645
+ include_source: Whether to include this FlowRegion in the result
646
+ until: Optional selector string to specify an upper boundary element
647
+ include_endpoint: Whether to include the boundary element in the region
648
+ **kwargs: Additional parameters
649
+
650
+ Returns:
651
+ New FlowRegion with regions above
652
+ """
653
+ if not self.constituent_regions:
654
+ return FlowRegion(
655
+ flow=self.flow,
656
+ constituent_regions=[],
657
+ source_flow_element=self.source_flow_element,
658
+ boundary_element_found=self.boundary_element_found,
659
+ )
660
+
661
+ new_regions = []
559
662
 
560
663
  if self.flow.arrangement == "vertical":
561
- # Stack vertically
562
- composite_width = max(img.width for img in images)
563
- composite_height = sum(img.height for img in images)
564
- if composite_width == 0 or composite_height == 0:
565
- return None # Avoid zero-size image
566
-
567
- new_image = PIL_Image_Runtime.new(
568
- "RGB", (composite_width, composite_height), background_color
664
+ # For vertical flow, use FLOW ORDER (index 0 is earliest). Only expand the
665
+ # first constituent region in that order.
666
+ first_region = self.constituent_regions[0]
667
+ for idx, region in enumerate(self.constituent_regions):
668
+ if idx == 0: # Only expand the first region (earliest in flow)
669
+ above_region = region.above(
670
+ height=height,
671
+ width="element", # Keep original column width
672
+ include_source=include_source,
673
+ until=until,
674
+ include_endpoint=include_endpoint,
675
+ **kwargs,
676
+ )
677
+ new_regions.append(above_region)
678
+ elif include_source:
679
+ new_regions.append(region)
680
+ else: # horizontal flow
681
+ # For horizontal flow, expand all regions upward
682
+ for region in self.constituent_regions:
683
+ above_region = region.above(
684
+ height=height,
685
+ width=width,
686
+ include_source=include_source,
687
+ until=until,
688
+ include_endpoint=include_endpoint,
689
+ **kwargs,
690
+ )
691
+ new_regions.append(above_region)
692
+
693
+ return FlowRegion(
694
+ flow=self.flow,
695
+ constituent_regions=new_regions,
696
+ source_flow_element=self.source_flow_element,
697
+ boundary_element_found=self.boundary_element_found,
698
+ )
699
+
700
+ def below(
701
+ self,
702
+ height: Optional[float] = None,
703
+ width: str = "full",
704
+ include_source: bool = False,
705
+ until: Optional[str] = None,
706
+ include_endpoint: bool = True,
707
+ **kwargs,
708
+ ) -> "FlowRegion":
709
+ """
710
+ Create a FlowRegion with regions below this FlowRegion.
711
+
712
+ For vertical flows: Only expands the bottommost constituent region downward.
713
+ For horizontal flows: Expands all constituent regions downward.
714
+
715
+ Args:
716
+ height: Height of the region below, in points
717
+ width: Width mode - "full" for full page width or "element" for element width
718
+ include_source: Whether to include this FlowRegion in the result
719
+ until: Optional selector string to specify a lower boundary element
720
+ include_endpoint: Whether to include the boundary element in the region
721
+ **kwargs: Additional parameters
722
+
723
+ Returns:
724
+ New FlowRegion with regions below
725
+ """
726
+ if not self.constituent_regions:
727
+ return FlowRegion(
728
+ flow=self.flow,
729
+ constituent_regions=[],
730
+ source_flow_element=self.source_flow_element,
731
+ boundary_element_found=self.boundary_element_found,
569
732
  )
570
- current_y = 0
571
- for img in images:
572
- # Default to left alignment for vertical stacking
573
- new_image.paste(img, (0, current_y))
574
- current_y += img.height
575
- return new_image
576
-
577
- elif self.flow.arrangement == "horizontal":
578
- # Stack horizontally
579
- composite_width = sum(img.width for img in images)
580
- composite_height = max(img.height for img in images)
581
- if composite_width == 0 or composite_height == 0:
582
- return None
583
-
584
- new_image = PIL_Image_Runtime.new(
585
- "RGB", (composite_width, composite_height), background_color
733
+
734
+ new_regions = []
735
+
736
+ if self.flow.arrangement == "vertical":
737
+ # For vertical flow, expand only the LAST constituent region in flow order.
738
+ last_idx = len(self.constituent_regions) - 1
739
+ for idx, region in enumerate(self.constituent_regions):
740
+ if idx == last_idx:
741
+ below_region = region.below(
742
+ height=height,
743
+ width="element",
744
+ include_source=include_source,
745
+ until=until,
746
+ include_endpoint=include_endpoint,
747
+ **kwargs,
748
+ )
749
+ new_regions.append(below_region)
750
+ elif include_source:
751
+ new_regions.append(region)
752
+ else: # horizontal flow
753
+ # For horizontal flow, expand all regions downward
754
+ for region in self.constituent_regions:
755
+ below_region = region.below(
756
+ height=height,
757
+ width=width,
758
+ include_source=include_source,
759
+ until=until,
760
+ include_endpoint=include_endpoint,
761
+ **kwargs,
762
+ )
763
+ new_regions.append(below_region)
764
+
765
+ return FlowRegion(
766
+ flow=self.flow,
767
+ constituent_regions=new_regions,
768
+ source_flow_element=self.source_flow_element,
769
+ boundary_element_found=self.boundary_element_found,
770
+ )
771
+
772
def left(
    self,
    width: Optional[float] = None,
    height: str = "full",
    include_source: bool = False,
    until: Optional[str] = None,
    include_endpoint: bool = True,
    **kwargs,
) -> "FlowRegion":
    """
    Create a FlowRegion with regions to the left of this FlowRegion.

    For vertical flows: Expands all constituent regions leftward.
    For horizontal flows: Only expands the leftmost constituent region
    (smallest ``x0``; the first one in flow order wins on ties) leftward.

    Args:
        width: Width of the region to the left, in points
        height: Height mode. NOTE: this value is currently not forwarded;
            every constituent region is expanded with ``height="element"``.
        include_source: Whether to include this FlowRegion in the result.
            For horizontal flows this also keeps the non-expanded
            constituent regions unchanged in the result.
        until: Optional selector string to specify a left boundary element
        include_endpoint: Whether to include the boundary element in the region
        **kwargs: Additional parameters forwarded to each region's ``left()``

    Returns:
        New FlowRegion with regions to the left. It shares this FlowRegion's
        flow, source_flow_element and boundary_element_found values.
    """
    regions = self.constituent_regions

    def _expand_left(region):
        # Single place for the per-region call so both flow arrangements
        # forward exactly the same arguments.
        return region.left(
            width=width,
            height="element",
            include_source=include_source,
            until=until,
            include_endpoint=include_endpoint,
            **kwargs,
        )

    new_regions = []

    if regions:
        if self.flow.arrangement == "vertical":
            # For vertical flow, expand all regions leftward
            new_regions = [_expand_left(region) for region in regions]
        else:  # horizontal flow
            # Pick the leftmost region by position in the list rather than by
            # `==`, so that regions which merely compare equal are never
            # expanded more than once.
            leftmost_idx = min(range(len(regions)), key=lambda i: regions[i].x0)
            for idx, region in enumerate(regions):
                if idx == leftmost_idx:
                    new_regions.append(_expand_left(region))
                elif include_source:
                    # Include other regions unchanged if include_source is True
                    new_regions.append(region)

    return FlowRegion(
        flow=self.flow,
        constituent_regions=new_regions,
        source_flow_element=self.source_flow_element,
        boundary_element_found=self.boundary_element_found,
    )
845
+
846
def right(
    self,
    width: Optional[float] = None,
    height: str = "full",
    include_source: bool = False,
    until: Optional[str] = None,
    include_endpoint: bool = True,
    **kwargs,
) -> "FlowRegion":
    """
    Create a FlowRegion with regions to the right of this FlowRegion.

    For vertical flows: Expands all constituent regions rightward.
    For horizontal flows: Only expands the rightmost constituent region
    (largest ``x1``; the first one in flow order wins on ties) rightward.

    Args:
        width: Width of the region to the right, in points
        height: Height mode. NOTE: this value is currently not forwarded;
            every constituent region is expanded with ``height="element"``.
        include_source: Whether to include this FlowRegion in the result.
            For horizontal flows this also keeps the non-expanded
            constituent regions unchanged in the result.
        until: Optional selector string to specify a right boundary element
        include_endpoint: Whether to include the boundary element in the region
        **kwargs: Additional parameters forwarded to each region's ``right()``

    Returns:
        New FlowRegion with regions to the right. It shares this FlowRegion's
        flow, source_flow_element and boundary_element_found values.
    """
    regions = self.constituent_regions

    def _expand_right(region):
        # Single place for the per-region call so both flow arrangements
        # forward exactly the same arguments.
        return region.right(
            width=width,
            height="element",
            include_source=include_source,
            until=until,
            include_endpoint=include_endpoint,
            **kwargs,
        )

    new_regions = []

    if regions:
        if self.flow.arrangement == "vertical":
            # For vertical flow, expand all regions rightward
            new_regions = [_expand_right(region) for region in regions]
        else:  # horizontal flow
            # Pick the rightmost region by position in the list rather than by
            # `==`, so that regions which merely compare equal are never
            # expanded more than once.
            rightmost_idx = max(range(len(regions)), key=lambda i: regions[i].x1)
            for idx, region in enumerate(regions):
                if idx == rightmost_idx:
                    new_regions.append(_expand_right(region))
                elif include_source:
                    # Include other regions unchanged if include_source is True
                    new_regions.append(region)

    return FlowRegion(
        flow=self.flow,
        constituent_regions=new_regions,
        source_flow_element=self.source_flow_element,
        boundary_element_found=self.boundary_element_found,
    )
605
919
 
920
def to_region(self) -> "FlowRegion":
    """
    Return this FlowRegion as a region.

    Delegates to ``expand()`` called with no arguments and hands back
    whatever it produces (intended to be a copy of this FlowRegion).

    Returns:
        Result of ``self.expand()``
    """
    region_copy = self.expand()
    return region_copy
929
+
606
930
  @property
607
931
  def is_empty(self) -> bool:
608
932
  """Checks if the FlowRegion contains no constituent regions or if all are empty."""
@@ -631,6 +955,13 @@ class FlowRegion:
631
955
  text_options: Optional[Dict] = None,
632
956
  cell_extraction_func: Optional[Callable[["PhysicalRegion"], Optional[str]]] = None,
633
957
  show_progress: bool = False,
958
+ # Optional row-level merge predicate. If provided, it decides whether
959
+ # the current row (first row of a segment/page) should be merged with
960
+ # the previous one (to handle multi-page spill-overs).
961
+ stitch_rows: Optional[
962
+ Callable[[List[Optional[str]], List[Optional[str]], int, "PhysicalRegion"], bool]
963
+ ] = None,
964
+ merge_headers: Optional[bool] = None,
634
965
  **kwargs,
635
966
  ) -> TableResult:
636
967
  """Extracts a single logical table from the FlowRegion.
@@ -644,6 +975,11 @@ class FlowRegion:
644
975
  method, table_settings, use_ocr, ocr_config, text_options, cell_extraction_func, show_progress:
645
976
  Same as in :pymeth:`Region.extract_table` and are forwarded as-is
646
977
  to each physical region.
978
+ merge_headers: Whether to merge tables by removing repeated headers from subsequent
979
+ pages/segments. If None (default), auto-detects by checking if the first row
980
+ of each segment matches the first row of the first segment. If segments have
981
+ inconsistent header patterns (some repeat, others don't), raises ValueError.
982
+ Useful for multi-page tables where headers repeat on each page.
647
983
  **kwargs: Additional keyword arguments forwarded to the underlying
648
984
  ``Region.extract_table`` implementation.
649
985
 
@@ -651,6 +987,16 @@ class FlowRegion:
651
987
  A TableResult object containing the aggregated table data. Rows returned from
652
988
  consecutive constituent regions are appended in document order. If
653
989
  no tables are detected in any region, an empty TableResult is returned.
990
+
991
+ stitch_rows parameter:
992
+ Controls whether the first rows of subsequent segments/regions should be merged
993
+ into the previous row (to handle spill-over across page breaks).
994
+ Applied AFTER header removal if merge_headers is enabled.
995
+
996
+ • None (default) – no merging (behaviour identical to previous versions).
997
+ • Callable – custom predicate taking
998
+ (prev_row, cur_row, row_idx_in_segment, segment_object) → bool.
999
+ Return True to merge `cur_row` into `prev_row` (default column-wise merge is used).
654
1000
  """
655
1001
 
656
1002
  if table_settings is None:
@@ -661,9 +1007,32 @@ class FlowRegion:
661
1007
  if not self.constituent_regions:
662
1008
  return TableResult([])
663
1009
 
1010
+ # Resolve stitch_rows predicate -------------------------------------------------------
1011
+ predicate: Optional[
1012
+ Callable[[List[Optional[str]], List[Optional[str]], int, "PhysicalRegion"], bool]
1013
+ ] = (stitch_rows if callable(stitch_rows) else None)
1014
+
1015
+ def _default_merge(
1016
+ prev_row: List[Optional[str]], cur_row: List[Optional[str]]
1017
+ ) -> List[Optional[str]]:
1018
+ """Column-wise merge – concatenates non-empty strings with a space."""
1019
+ from itertools import zip_longest
1020
+
1021
+ merged: List[Optional[str]] = []
1022
+ for p, c in zip_longest(prev_row, cur_row, fillvalue=""):
1023
+ if (p or "").strip() and (c or "").strip():
1024
+ merged.append(f"{p} {c}".strip())
1025
+ else:
1026
+ merged.append((p or "") + (c or ""))
1027
+ return merged
1028
+
664
1029
  aggregated_rows: List[List[Optional[str]]] = []
1030
+ header_row: Optional[List[Optional[str]]] = None
1031
+ merge_headers_enabled = False
1032
+ headers_warned = False # Track if we've already warned about dropping headers
1033
+ segment_has_repeated_header = [] # Track which segments have repeated headers
665
1034
 
666
- for region in self.constituent_regions:
1035
+ for region_idx, region in enumerate(self.constituent_regions):
667
1036
  try:
668
1037
  region_result = region.extract_table(
669
1038
  method=method,
@@ -676,15 +1045,104 @@ class FlowRegion:
676
1045
  **kwargs,
677
1046
  )
678
1047
 
679
- # region_result is now a TableResult object, extract the rows
680
- if region_result:
681
- aggregated_rows.extend(region_result)
1048
+ # Convert result to list of rows
1049
+ if not region_result:
1050
+ continue
1051
+
1052
+ if isinstance(region_result, TableResult):
1053
+ segment_rows = list(region_result)
1054
+ else:
1055
+ segment_rows = list(region_result)
1056
+
1057
+ # Handle header detection and merging for multi-page tables
1058
+ if region_idx == 0:
1059
+ # First segment: capture potential header row
1060
+ if segment_rows:
1061
+ header_row = segment_rows[0]
1062
+ # Determine if we should merge headers
1063
+ if merge_headers is None:
1064
+ # Auto-detect: we'll check all subsequent segments
1065
+ merge_headers_enabled = False # Will be determined later
1066
+ else:
1067
+ merge_headers_enabled = merge_headers
1068
+ # Track that first segment exists (for consistency checking)
1069
+ segment_has_repeated_header.append(False) # First segment doesn't "repeat"
1070
+ elif region_idx == 1 and merge_headers is None:
1071
+ # Auto-detection: check if first row of second segment matches header
1072
+ has_header = segment_rows and header_row and segment_rows[0] == header_row
1073
+ segment_has_repeated_header.append(has_header)
1074
+
1075
+ if has_header:
1076
+ merge_headers_enabled = True
1077
+ # Remove the detected repeated header from this segment
1078
+ segment_rows = segment_rows[1:]
1079
+ if not headers_warned:
1080
+ warnings.warn(
1081
+ "Detected repeated headers in multi-page table. Merging by removing "
1082
+ "repeated headers from subsequent pages.",
1083
+ UserWarning,
1084
+ stacklevel=2,
1085
+ )
1086
+ headers_warned = True
1087
+ else:
1088
+ merge_headers_enabled = False
1089
+ elif region_idx > 1:
1090
+ # Check consistency: all segments should have same pattern
1091
+ has_header = segment_rows and header_row and segment_rows[0] == header_row
1092
+ segment_has_repeated_header.append(has_header)
1093
+
1094
+ # Remove header if merging is enabled and header is present
1095
+ if merge_headers_enabled and has_header:
1096
+ segment_rows = segment_rows[1:]
1097
+ elif region_idx > 0 and merge_headers_enabled:
1098
+ # Explicit merge_headers=True: remove headers from subsequent segments
1099
+ if segment_rows and header_row and segment_rows[0] == header_row:
1100
+ segment_rows = segment_rows[1:]
1101
+ if not headers_warned:
1102
+ warnings.warn(
1103
+ "Removing repeated headers from multi-page table during merge.",
1104
+ UserWarning,
1105
+ stacklevel=2,
1106
+ )
1107
+ headers_warned = True
1108
+
1109
+ # Process remaining rows with stitch_rows logic
1110
+ for row_idx, row in enumerate(segment_rows):
1111
+ if (
1112
+ predicate is not None
1113
+ and aggregated_rows
1114
+ and predicate(aggregated_rows[-1], row, row_idx, region)
1115
+ ):
1116
+ # Merge with previous row
1117
+ aggregated_rows[-1] = _default_merge(aggregated_rows[-1], row)
1118
+ else:
1119
+ aggregated_rows.append(row)
682
1120
  except Exception as e:
683
1121
  logger.error(
684
1122
  f"FlowRegion.extract_table: Error extracting table from constituent region {region}: {e}",
685
1123
  exc_info=True,
686
1124
  )
687
1125
 
1126
+ # Check for inconsistent header patterns after processing all segments
1127
+ if merge_headers is None and len(segment_has_repeated_header) > 2:
1128
+ # During auto-detection, check for consistency across all segments
1129
+ expected_pattern = segment_has_repeated_header[1] # Pattern from second segment
1130
+ for seg_idx, has_header in enumerate(segment_has_repeated_header[2:], 2):
1131
+ if has_header != expected_pattern:
1132
+ # Inconsistent pattern detected
1133
+ segments_with_headers = [
1134
+ i for i, has_h in enumerate(segment_has_repeated_header[1:], 1) if has_h
1135
+ ]
1136
+ segments_without_headers = [
1137
+ i for i, has_h in enumerate(segment_has_repeated_header[1:], 1) if not has_h
1138
+ ]
1139
+ raise ValueError(
1140
+ f"Inconsistent header pattern in multi-page table: "
1141
+ f"segments {segments_with_headers} have repeated headers, "
1142
+ f"but segments {segments_without_headers} do not. "
1143
+ f"All segments must have the same header pattern for reliable merging."
1144
+ )
1145
+
688
1146
  return TableResult(aggregated_rows)
689
1147
 
690
1148
  def extract_tables(
@@ -751,3 +1209,39 @@ class FlowRegion:
751
1209
  This is an alias for normalized_type.
752
1210
  """
753
1211
  return self.normalized_type
1212
+
1213
def get_highlight_specs(self) -> List[Dict[str, Any]]:
    """
    Build highlight specifications for the constituent regions.

    Implements the highlighting protocol for FlowRegions: each constituent
    region that has both a page and a bbox contributes one spec, so it can
    be highlighted on its own page. Regions missing either are skipped.

    Returns:
        List of highlight specification dictionaries with the keys
        ``page``, ``page_index`` (0 when the page has no ``index``),
        ``bbox`` and ``element``, plus ``polygon`` when the region
        reports a polygon.
    """
    collected: List[Dict[str, Any]] = []

    for constituent in self.constituent_regions:
        page = getattr(constituent, "page", None)
        if page is None:
            continue

        bbox = getattr(constituent, "bbox", None)
        if bbox is None:
            continue

        entry: Dict[str, Any] = {
            "page": page,
            "page_index": getattr(page, "index", 0),
            "bbox": bbox,
            "element": constituent,  # Reference to the constituent region
        }

        # Attach the polygon only when the region exposes one and flags it as set
        if hasattr(constituent, "polygon") and getattr(constituent, "has_polygon", False):
            entry["polygon"] = constituent.polygon

        collected.append(entry)

    return collected