natural-pdf 0.1.40__py3-none-any.whl → 0.2.1.dev0__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. natural_pdf/__init__.py +6 -7
  2. natural_pdf/analyzers/__init__.py +6 -1
  3. natural_pdf/analyzers/guides.py +354 -258
  4. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -4
  6. natural_pdf/analyzers/layout/paddle.py +11 -0
  7. natural_pdf/analyzers/layout/surya.py +2 -3
  8. natural_pdf/analyzers/shape_detection_mixin.py +25 -34
  9. natural_pdf/analyzers/text_structure.py +2 -2
  10. natural_pdf/classification/manager.py +1 -1
  11. natural_pdf/collections/mixins.py +3 -2
  12. natural_pdf/core/highlighting_service.py +743 -32
  13. natural_pdf/core/page.py +236 -383
  14. natural_pdf/core/page_collection.py +1249 -0
  15. natural_pdf/core/pdf.py +172 -83
  16. natural_pdf/{collections → core}/pdf_collection.py +18 -11
  17. natural_pdf/core/render_spec.py +335 -0
  18. natural_pdf/describe/base.py +1 -1
  19. natural_pdf/elements/__init__.py +1 -0
  20. natural_pdf/elements/base.py +108 -83
  21. natural_pdf/elements/{collections.py → element_collection.py} +566 -1487
  22. natural_pdf/elements/line.py +0 -1
  23. natural_pdf/elements/rect.py +0 -1
  24. natural_pdf/elements/region.py +318 -243
  25. natural_pdf/elements/text.py +9 -7
  26. natural_pdf/exporters/base.py +2 -2
  27. natural_pdf/exporters/original_pdf.py +1 -1
  28. natural_pdf/exporters/paddleocr.py +2 -4
  29. natural_pdf/exporters/searchable_pdf.py +3 -2
  30. natural_pdf/extraction/mixin.py +1 -3
  31. natural_pdf/flows/collections.py +1 -69
  32. natural_pdf/flows/element.py +4 -4
  33. natural_pdf/flows/flow.py +1200 -243
  34. natural_pdf/flows/region.py +707 -261
  35. natural_pdf/ocr/ocr_options.py +0 -2
  36. natural_pdf/ocr/utils.py +2 -1
  37. natural_pdf/qa/document_qa.py +21 -5
  38. natural_pdf/search/search_service_protocol.py +1 -1
  39. natural_pdf/selectors/parser.py +2 -2
  40. natural_pdf/tables/result.py +35 -1
  41. natural_pdf/text_mixin.py +7 -3
  42. natural_pdf/utils/debug.py +2 -1
  43. natural_pdf/utils/highlighting.py +1 -0
  44. natural_pdf/utils/layout.py +2 -2
  45. natural_pdf/utils/packaging.py +4 -3
  46. natural_pdf/utils/text_extraction.py +15 -12
  47. natural_pdf/utils/visualization.py +385 -0
  48. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/METADATA +7 -3
  49. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/RECORD +55 -53
  50. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/top_level.txt +0 -2
  51. optimization/memory_comparison.py +1 -1
  52. optimization/pdf_analyzer.py +2 -2
  53. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/WHEEL +0 -0
  54. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/entry_points.txt +0 -0
  55. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/licenses/LICENSE +0 -0
@@ -1,11 +1,13 @@
1
1
  import logging
2
- from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
2
+ import warnings
3
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Tuple, Union
3
4
 
4
5
  from pdfplumber.utils.geometry import merge_bboxes # Import merge_bboxes directly
5
6
 
6
7
  # For runtime image manipulation
7
8
  from PIL import Image as PIL_Image_Runtime
8
9
 
10
+ from natural_pdf.core.render_spec import RenderSpec, Visualizable
9
11
  from natural_pdf.tables import TableResult
10
12
 
11
13
  if TYPE_CHECKING:
@@ -13,7 +15,7 @@ if TYPE_CHECKING:
13
15
 
14
16
  from natural_pdf.core.page import Page as PhysicalPage
15
17
  from natural_pdf.elements.base import Element as PhysicalElement
16
- from natural_pdf.elements.collections import ElementCollection
18
+ from natural_pdf.elements.element_collection import ElementCollection
17
19
  from natural_pdf.elements.region import Region as PhysicalRegion
18
20
 
19
21
  from .element import FlowElement
@@ -22,7 +24,7 @@ if TYPE_CHECKING:
22
24
  logger = logging.getLogger(__name__)
23
25
 
24
26
 
25
- class FlowRegion:
27
+ class FlowRegion(Visualizable):
26
28
  """
27
29
  Represents a selected area within a Flow, potentially composed of multiple
28
30
  physical Region objects (constituent_regions) that might span across
@@ -65,17 +67,156 @@ class FlowRegion:
65
67
  self._cached_elements: Optional["ElementCollection"] = None # Stringized
66
68
  self._cached_bbox: Optional[Tuple[float, float, float, float]] = None
67
69
 
70
+ def _get_highlighter(self):
71
+ """Get the highlighting service from constituent regions."""
72
+ if not self.constituent_regions:
73
+ raise RuntimeError("FlowRegion has no constituent regions to get highlighter from")
74
+
75
+ # Get highlighter from first constituent region
76
+ first_region = self.constituent_regions[0]
77
+ if hasattr(first_region, "_highlighter"):
78
+ return first_region._highlighter
79
+ elif hasattr(first_region, "page") and hasattr(first_region.page, "_highlighter"):
80
+ return first_region.page._highlighter
81
+ else:
82
+ raise RuntimeError(
83
+ f"Cannot find HighlightingService from FlowRegion constituent regions. "
84
+ f"First region type: {type(first_region).__name__}"
85
+ )
86
+
87
+ def _get_render_specs(
88
+ self,
89
+ mode: Literal["show", "render"] = "show",
90
+ color: Optional[Union[str, Tuple[int, int, int]]] = None,
91
+ highlights: Optional[List[Dict[str, Any]]] = None,
92
+ crop: Union[bool, Literal["content"]] = False,
93
+ crop_bbox: Optional[Tuple[float, float, float, float]] = None,
94
+ **kwargs,
95
+ ) -> List[RenderSpec]:
96
+ """Get render specifications for this flow region.
97
+
98
+ Args:
99
+ mode: Rendering mode - 'show' includes highlights, 'render' is clean
100
+ color: Color for highlighting this region in show mode
101
+ highlights: Additional highlight groups to show
102
+ crop: Whether to crop to constituent regions
103
+ crop_bbox: Explicit crop bounds
104
+ **kwargs: Additional parameters
105
+
106
+ Returns:
107
+ List of RenderSpec objects, one per page with constituent regions
108
+ """
109
+ if not self.constituent_regions:
110
+ return []
111
+
112
+ # Group constituent regions by page
113
+ regions_by_page = {}
114
+ for region in self.constituent_regions:
115
+ if hasattr(region, "page") and region.page:
116
+ page = region.page
117
+ if page not in regions_by_page:
118
+ regions_by_page[page] = []
119
+ regions_by_page[page].append(region)
120
+
121
+ if not regions_by_page:
122
+ return []
123
+
124
+ # Create RenderSpec for each page
125
+ specs = []
126
+ for page, page_regions in regions_by_page.items():
127
+ spec = RenderSpec(page=page)
128
+
129
+ # Handle cropping
130
+ if crop_bbox:
131
+ spec.crop_bbox = crop_bbox
132
+ elif crop == "content" or crop is True:
133
+ # Calculate bounds of regions on this page
134
+ x_coords = []
135
+ y_coords = []
136
+ for region in page_regions:
137
+ if hasattr(region, "bbox") and region.bbox:
138
+ x0, y0, x1, y1 = region.bbox
139
+ x_coords.extend([x0, x1])
140
+ y_coords.extend([y0, y1])
141
+
142
+ if x_coords and y_coords:
143
+ spec.crop_bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
144
+
145
+ # Add highlights in show mode
146
+ if mode == "show":
147
+ # Highlight constituent regions
148
+ for i, region in enumerate(page_regions):
149
+ # Label each part if multiple regions
150
+ label = None
151
+ if len(self.constituent_regions) > 1:
152
+ # Find global index
153
+ try:
154
+ global_idx = self.constituent_regions.index(region)
155
+ label = f"FlowPart_{global_idx + 1}"
156
+ except ValueError:
157
+ label = f"FlowPart_{i + 1}"
158
+ else:
159
+ label = "FlowRegion"
160
+
161
+ spec.add_highlight(
162
+ bbox=region.bbox,
163
+ polygon=region.polygon if region.has_polygon else None,
164
+ color=color or "fuchsia",
165
+ label=label,
166
+ )
167
+
168
+ # Add additional highlight groups if provided
169
+ if highlights:
170
+ for group in highlights:
171
+ group_elements = group.get("elements", [])
172
+ group_color = group.get("color", color)
173
+ group_label = group.get("label")
174
+
175
+ for elem in group_elements:
176
+ # Only add if element is on this page
177
+ if hasattr(elem, "page") and elem.page == page:
178
+ spec.add_highlight(
179
+ element=elem, color=group_color, label=group_label
180
+ )
181
+
182
+ specs.append(spec)
183
+
184
+ return specs
185
+
68
186
  def __getattr__(self, name: str) -> Any:
69
187
  """
70
- Dynamically proxy attribute access to the source FlowElement if the
71
- attribute is not found in this instance.
188
+ Dynamically proxy attribute access to the source FlowElement for safe attributes only.
189
+ Spatial methods (above, below, left, right) are explicitly implemented to prevent
190
+ silent failures and incorrect behavior.
72
191
  """
73
192
  if name in self.__dict__:
74
193
  return self.__dict__[name]
75
- elif self.source_flow_element is not None:
76
- return getattr(self.source_flow_element, name)
77
- else:
78
- raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
194
+
195
+ # List of methods that should NOT be proxied - they need proper FlowRegion implementation
196
+ spatial_methods = {"above", "below", "left", "right", "to_region"}
197
+
198
+ if name in spatial_methods:
199
+ raise AttributeError(
200
+ f"'{self.__class__.__name__}' object has no attribute '{name}'. "
201
+ f"This method requires proper FlowRegion implementation to handle spatial relationships correctly."
202
+ )
203
+
204
+ # Only proxy safe attributes and methods
205
+ if self.source_flow_element is not None:
206
+ try:
207
+ attr = getattr(self.source_flow_element, name)
208
+ # Only proxy non-callable attributes and explicitly safe methods
209
+ if not callable(attr) or name in {"page", "document"}: # Add safe methods as needed
210
+ return attr
211
+ else:
212
+ raise AttributeError(
213
+ f"Method '{name}' cannot be safely proxied from FlowElement to FlowRegion. "
214
+ f"It may need explicit implementation."
215
+ )
216
+ except AttributeError:
217
+ pass
218
+
219
+ raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
79
220
 
80
221
  @property
81
222
  def bbox(self) -> Optional[Tuple[float, float, float, float]]:
@@ -90,10 +231,12 @@ class FlowRegion:
90
231
 
91
232
  # Use merge_bboxes from pdfplumber.utils.geometry to merge bboxes
92
233
  # Extract bbox tuples from regions first
93
- region_bboxes = [region.bbox for region in self.constituent_regions if hasattr(region, "bbox")]
234
+ region_bboxes = [
235
+ region.bbox for region in self.constituent_regions if hasattr(region, "bbox")
236
+ ]
94
237
  if not region_bboxes:
95
238
  return None
96
-
239
+
97
240
  self._cached_bbox = merge_bboxes(region_bboxes)
98
241
  return self._cached_bbox
99
242
 
@@ -171,7 +314,7 @@ class FlowRegion:
171
314
  Returns:
172
315
  An ElementCollection containing all unique elements.
173
316
  """
174
- from natural_pdf.elements.collections import (
317
+ from natural_pdf.elements.element_collection import (
175
318
  ElementCollection as RuntimeElementCollection, # Local import
176
319
  )
177
320
 
@@ -257,7 +400,7 @@ class FlowRegion:
257
400
  chains each region's native ``find_all`` call and concatenates their
258
401
  results into a single ElementCollection while preserving flow order.
259
402
  """
260
- from natural_pdf.elements.collections import (
403
+ from natural_pdf.elements.element_collection import (
261
404
  ElementCollection as RuntimeElementCollection,
262
405
  )
263
406
 
@@ -268,9 +411,7 @@ class FlowRegion:
268
411
 
269
412
  for region in self.constituent_regions:
270
413
  try:
271
- region_matches = region.find_all(
272
- selector=selector, text=text, **kwargs
273
- )
414
+ region_matches = region.find_all(selector=selector, text=text, **kwargs)
274
415
  if region_matches:
275
416
  # ``region_matches`` is an ElementCollection – extend with its
276
417
  # underlying list so we don't create nested collections.
@@ -312,200 +453,33 @@ class FlowRegion:
312
453
  region.highlight(label=current_label, color=color, **kwargs)
313
454
  return self
314
455
 
315
- def show(
316
- self,
317
- resolution: Optional[float] = None,
318
- labels: bool = True,
319
- legend_position: str = "right",
320
- color: Optional[Union[Tuple, str]] = "fuchsia",
321
- label_prefix: Optional[str] = "FlowPart",
322
- width: Optional[int] = None,
323
- stack_direction: str = "vertical",
324
- stack_gap: int = 5,
325
- stack_background_color: Tuple[int, int, int] = (255, 255, 255),
326
- crop: bool = False,
327
- **kwargs,
328
- ) -> Optional["PIL_Image"]:
456
+ def highlights(self, show: bool = False) -> "HighlightContext":
329
457
  """
330
- Generates and returns a PIL Image of relevant pages with constituent regions highlighted.
331
- If multiple pages are involved, they are stacked into a single image.
458
+ Create a highlight context for accumulating highlights.
332
459
 
333
- Args:
334
- resolution: Resolution in DPI for page rendering. If None, uses global setting or defaults to 144 DPI.
335
- labels: Whether to include a legend for highlights.
336
- legend_position: Position of the legend ('right', 'bottom', 'top', 'left').
337
- color: Color for highlighting the constituent regions.
338
- label_prefix: Prefix for region labels (e.g., 'FlowPart').
339
- width: Optional width for the output image (overrides resolution).
340
- stack_direction: Direction to stack multiple pages ('vertical' or 'horizontal').
341
- stack_gap: Gap in pixels between stacked pages.
342
- stack_background_color: RGB background color for the stacked image.
343
- crop: If True, crop each rendered page to the bounding box of constituent regions on that page.
344
- **kwargs: Additional arguments passed to the underlying rendering methods.
460
+ This allows for clean syntax to show multiple highlight groups:
345
461
 
346
- Returns:
347
- PIL Image of the rendered pages with highlighted regions, or None if rendering fails.
348
- """
349
- if not self.constituent_regions:
350
- logger.info("FlowRegion.show() called with no constituent regions.")
351
- return None
462
+ Example:
463
+ with flow_region.highlights() as h:
464
+ h.add(flow_region.find_all('table'), label='tables', color='blue')
465
+ h.add(flow_region.find_all('text:bold'), label='bold text', color='red')
466
+ h.show()
352
467
 
353
- # 1. Group constituent regions by their physical page
354
- regions_by_page: Dict["PhysicalPage", List["PhysicalRegion"]] = {}
355
- for region in self.constituent_regions:
356
- if region.page:
357
- if region.page not in regions_by_page:
358
- regions_by_page[region.page] = []
359
- regions_by_page[region.page].append(region)
360
- else:
361
- raise ValueError(f"Constituent region {region.bbox} has no page.")
362
-
363
- if not regions_by_page:
364
- logger.info("FlowRegion.show() found no constituent regions with associated pages.")
365
- return None
366
-
367
- # 2. Get a highlighter service (e.g., from the first page involved)
368
- first_page_with_regions = next(iter(regions_by_page.keys()), None)
369
- highlighter_service = None
370
- if first_page_with_regions and hasattr(first_page_with_regions, "_highlighter"):
371
- highlighter_service = first_page_with_regions._highlighter
468
+ Or with automatic display:
469
+ with flow_region.highlights(show=True) as h:
470
+ h.add(flow_region.find_all('table'), label='tables')
471
+ h.add(flow_region.find_all('text:bold'), label='bold')
472
+ # Automatically shows when exiting the context
372
473
 
373
- if not highlighter_service:
374
- raise ValueError(
375
- "Cannot get highlighter service for FlowRegion.show(). "
376
- "Ensure constituent regions' pages are initialized with a highlighter."
377
- )
378
-
379
- output_page_images: List["PIL_Image_Runtime"] = []
380
-
381
- # Sort pages by index for consistent output order
382
- sorted_pages = sorted(
383
- regions_by_page.keys(),
384
- key=lambda p: p.index if hasattr(p, "index") else getattr(p, "page_number", 0),
385
- )
386
-
387
- # 3. Render each page with its relevant constituent regions highlighted
388
- for page_idx, page_obj in enumerate(sorted_pages):
389
- constituent_regions_on_this_page = regions_by_page[page_obj]
390
- if not constituent_regions_on_this_page:
391
- continue
392
-
393
- temp_highlights_for_page = []
394
- for i, region_part in enumerate(constituent_regions_on_this_page):
395
- part_label = None
396
- if labels and label_prefix: # Ensure labels is True for label_prefix to apply
397
- # If FlowRegion consists of multiple parts on this page, or overall
398
- count_indicator = ""
399
- if (
400
- len(self.constituent_regions) > 1
401
- ): # If flow region has multiple parts overall
402
- # Find global index of this region_part in self.constituent_regions
403
- try:
404
- global_idx = self.constituent_regions.index(region_part)
405
- count_indicator = f"_{global_idx + 1}"
406
- except ValueError: # Should not happen if region_part is from the list
407
- count_indicator = f"_p{page_idx}i{i+1}" # fallback local index
408
- elif (
409
- len(constituent_regions_on_this_page) > 1
410
- ): # If multiple parts on *this* page, but FR is single part overall
411
- count_indicator = f"_{i+1}"
412
-
413
- part_label = f"{label_prefix}{count_indicator}" if label_prefix else None
414
-
415
- temp_highlights_for_page.append(
416
- {
417
- "page_index": (
418
- page_obj.index
419
- if hasattr(page_obj, "index")
420
- else getattr(page_obj, "page_number", 1) - 1
421
- ),
422
- "bbox": region_part.bbox,
423
- "polygon": region_part.polygon if region_part.has_polygon else None,
424
- "color": color, # Use the passed color
425
- "label": part_label,
426
- "use_color_cycling": False, # Keep specific color
427
- }
428
- )
429
-
430
- if not temp_highlights_for_page:
431
- continue
432
-
433
- # Calculate crop bbox if cropping is enabled
434
- crop_bbox = None
435
- if crop and constituent_regions_on_this_page:
436
- # Calculate the bounding box that encompasses all constituent regions on this page
437
- min_x0 = min(region.bbox[0] for region in constituent_regions_on_this_page)
438
- min_y0 = min(region.bbox[1] for region in constituent_regions_on_this_page)
439
- max_x1 = max(region.bbox[2] for region in constituent_regions_on_this_page)
440
- max_y1 = max(region.bbox[3] for region in constituent_regions_on_this_page)
441
- crop_bbox = (min_x0, min_y0, max_x1, max_y1)
442
-
443
- page_image = highlighter_service.render_preview(
444
- page_index=(
445
- page_obj.index
446
- if hasattr(page_obj, "index")
447
- else getattr(page_obj, "page_number", 1) - 1
448
- ),
449
- temporary_highlights=temp_highlights_for_page,
450
- resolution=resolution,
451
- width=width,
452
- labels=labels, # Pass through labels
453
- legend_position=legend_position,
454
- crop_bbox=crop_bbox,
455
- **kwargs,
456
- )
457
- if page_image:
458
- output_page_images.append(page_image)
459
-
460
- # 4. Stack the generated page images if multiple
461
- if not output_page_images:
462
- logger.info("FlowRegion.show() produced no page images to concatenate.")
463
- return None
464
-
465
- if len(output_page_images) == 1:
466
- return output_page_images[0]
467
-
468
- # Stacking logic (same as in FlowRegionCollection.show)
469
- if stack_direction == "vertical":
470
- final_width = max(img.width for img in output_page_images)
471
- final_height = (
472
- sum(img.height for img in output_page_images)
473
- + (len(output_page_images) - 1) * stack_gap
474
- )
475
- if final_width == 0 or final_height == 0:
476
- raise ValueError("Cannot create concatenated image with zero width or height.")
474
+ Args:
475
+ show: If True, automatically show highlights when exiting context
477
476
 
478
- concatenated_image = PIL_Image_Runtime.new(
479
- "RGB", (final_width, final_height), stack_background_color
480
- )
481
- current_y = 0
482
- for img in output_page_images:
483
- paste_x = (final_width - img.width) // 2
484
- concatenated_image.paste(img, (paste_x, current_y))
485
- current_y += img.height + stack_gap
486
- return concatenated_image
487
- elif stack_direction == "horizontal":
488
- final_width = (
489
- sum(img.width for img in output_page_images)
490
- + (len(output_page_images) - 1) * stack_gap
491
- )
492
- final_height = max(img.height for img in output_page_images)
493
- if final_width == 0 or final_height == 0:
494
- raise ValueError("Cannot create concatenated image with zero width or height.")
477
+ Returns:
478
+ HighlightContext for accumulating highlights
479
+ """
480
+ from natural_pdf.core.highlighting_service import HighlightContext
495
481
 
496
- concatenated_image = PIL_Image_Runtime.new(
497
- "RGB", (final_width, final_height), stack_background_color
498
- )
499
- current_x = 0
500
- for img in output_page_images:
501
- paste_y = (final_height - img.height) // 2
502
- concatenated_image.paste(img, (current_x, paste_y))
503
- current_x += img.width + stack_gap
504
- return concatenated_image
505
- else:
506
- raise ValueError(
507
- f"Invalid stack_direction '{stack_direction}' for FlowRegion.show(). Must be 'vertical' or 'horizontal'."
508
- )
482
+ return HighlightContext(self, show_on_exit=show)
509
483
 
510
484
  def to_images(
511
485
  self,
@@ -523,9 +497,8 @@ class FlowRegion:
523
497
  cropped_images: List["PIL_Image"] = []
524
498
  for region_part in self.constituent_regions:
525
499
  try:
526
- img = region_part.to_image(
527
- resolution=resolution, crop=True, include_highlights=False, **kwargs
528
- )
500
+ # Use render() for clean image without highlights
501
+ img = region_part.render(resolution=resolution, crop=True, **kwargs)
529
502
  if img:
530
503
  cropped_images.append(img)
531
504
  except Exception as e:
@@ -536,73 +509,424 @@ class FlowRegion:
536
509
 
537
510
  return cropped_images
538
511
 
539
- def to_image(self, background_color=(255, 255, 255), **kwargs) -> Optional["PIL_Image"]:
512
+ def __repr__(self) -> str:
513
+ return (
514
+ f"<FlowRegion constituents={len(self.constituent_regions)}, flow={self.flow}, "
515
+ f"source_bbox={self.source_flow_element.bbox if self.source_flow_element else 'N/A'}>"
516
+ )
517
+
518
+ def expand(
519
+ self,
520
+ left: float = 0,
521
+ right: float = 0,
522
+ top: float = 0,
523
+ bottom: float = 0,
524
+ width_factor: float = 1.0,
525
+ height_factor: float = 1.0,
526
+ ) -> "FlowRegion":
540
527
  """
541
- Creates a single composite image by stacking the images of its constituent regions.
542
- Stacking direction is based on the Flow's arrangement.
543
- Individual region images are obtained by calling to_images(**kwargs).
528
+ Create a new FlowRegion with all constituent regions expanded.
544
529
 
545
530
  Args:
546
- background_color: Tuple for RGB background color of the composite image.
547
- **kwargs: Additional arguments passed to to_images() for rendering individual parts
548
- (e.g., resolution).
531
+ left: Amount to expand left edge (positive value expands leftwards)
532
+ right: Amount to expand right edge (positive value expands rightwards)
533
+ top: Amount to expand top edge (positive value expands upwards)
534
+ bottom: Amount to expand bottom edge (positive value expands downwards)
535
+ width_factor: Factor to multiply width by (applied after absolute expansion)
536
+ height_factor: Factor to multiply height by (applied after absolute expansion)
549
537
 
550
538
  Returns:
551
- A single PIL.Image.Image object, or None if no constituent images.
539
+ New FlowRegion with expanded constituent regions
552
540
  """
553
- # Use PIL_Image_Runtime for creating new images at runtime
554
- images = self.to_images(**kwargs)
555
- if not images:
556
- return None
557
- if len(images) == 1:
558
- return images[0]
541
+ if not self.constituent_regions:
542
+ return FlowRegion(
543
+ flow=self.flow,
544
+ constituent_regions=[],
545
+ source_flow_element=self.source_flow_element,
546
+ boundary_element_found=self.boundary_element_found,
547
+ )
548
+
549
+ expanded_regions = []
550
+ for idx, region in enumerate(self.constituent_regions):
551
+ # Determine which adjustments to apply based on flow arrangement
552
+ apply_left = left
553
+ apply_right = right
554
+ apply_top = top
555
+ apply_bottom = bottom
556
+
557
+ if self.flow.arrangement == "vertical":
558
+ # In a vertical flow, only the *first* region should react to `top`
559
+ # and only the *last* region should react to `bottom`. This keeps
560
+ # the virtual contiguous area intact while allowing users to nudge
561
+ # the flow boundaries.
562
+ if idx != 0:
563
+ apply_top = 0
564
+ if idx != len(self.constituent_regions) - 1:
565
+ apply_bottom = 0
566
+ # left/right apply to every region (same column width change)
567
+ else: # horizontal flow
568
+ # In a horizontal flow, only the first region reacts to `left`
569
+ # and only the last region reacts to `right`.
570
+ if idx != 0:
571
+ apply_left = 0
572
+ if idx != len(self.constituent_regions) - 1:
573
+ apply_right = 0
574
+ # top/bottom apply to every region in horizontal flows
575
+
576
+ # Skip no-op expansion to avoid extra Region objects
577
+ needs_expansion = (
578
+ any(
579
+ v not in (0, 1.0) # compare width/height factor logically later
580
+ for v in (apply_left, apply_right, apply_top, apply_bottom)
581
+ )
582
+ or width_factor != 1.0
583
+ or height_factor != 1.0
584
+ )
585
+
586
+ try:
587
+ expanded_region = (
588
+ region.expand(
589
+ left=apply_left,
590
+ right=apply_right,
591
+ top=apply_top,
592
+ bottom=apply_bottom,
593
+ width_factor=width_factor,
594
+ height_factor=height_factor,
595
+ )
596
+ if needs_expansion
597
+ else region
598
+ )
599
+ expanded_regions.append(expanded_region)
600
+ except Exception as e:
601
+ logger.warning(
602
+ f"FlowRegion.expand: Error expanding constituent region {region.bbox}: {e}",
603
+ exc_info=False,
604
+ )
605
+ expanded_regions.append(region)
606
+
607
+ # Create new FlowRegion with expanded constituent regions
608
+ new_flow_region = FlowRegion(
609
+ flow=self.flow,
610
+ constituent_regions=expanded_regions,
611
+ source_flow_element=self.source_flow_element,
612
+ boundary_element_found=self.boundary_element_found,
613
+ )
614
+
615
+ # Copy metadata
616
+ new_flow_region.source = self.source
617
+ new_flow_region.region_type = self.region_type
618
+ new_flow_region.metadata = self.metadata.copy()
619
+
620
+ # Clear caches since the regions have changed
621
+ new_flow_region._cached_text = None
622
+ new_flow_region._cached_elements = None
623
+ new_flow_region._cached_bbox = None
624
+
625
+ return new_flow_region
626
+
627
+ def above(
628
+ self,
629
+ height: Optional[float] = None,
630
+ width: str = "full",
631
+ include_source: bool = False,
632
+ until: Optional[str] = None,
633
+ include_endpoint: bool = True,
634
+ **kwargs,
635
+ ) -> "FlowRegion":
636
+ """
637
+ Create a FlowRegion with regions above this FlowRegion.
638
+
639
+ For vertical flows: Only expands the topmost constituent region upward.
640
+ For horizontal flows: Expands all constituent regions upward.
641
+
642
+ Args:
643
+ height: Height of the region above, in points
644
+ width: Width mode - "full" for full page width or "element" for element width
645
+ include_source: Whether to include this FlowRegion in the result
646
+ until: Optional selector string to specify an upper boundary element
647
+ include_endpoint: Whether to include the boundary element in the region
648
+ **kwargs: Additional parameters
649
+
650
+ Returns:
651
+ New FlowRegion with regions above
652
+ """
653
+ if not self.constituent_regions:
654
+ return FlowRegion(
655
+ flow=self.flow,
656
+ constituent_regions=[],
657
+ source_flow_element=self.source_flow_element,
658
+ boundary_element_found=self.boundary_element_found,
659
+ )
660
+
661
+ new_regions = []
559
662
 
560
663
  if self.flow.arrangement == "vertical":
561
- # Stack vertically
562
- composite_width = max(img.width for img in images)
563
- composite_height = sum(img.height for img in images)
564
- if composite_width == 0 or composite_height == 0:
565
- return None # Avoid zero-size image
566
-
567
- new_image = PIL_Image_Runtime.new(
568
- "RGB", (composite_width, composite_height), background_color
664
+ # For vertical flow, use FLOW ORDER (index 0 is earliest). Only expand the
665
+ # first constituent region in that order.
666
+ first_region = self.constituent_regions[0]
667
+ for idx, region in enumerate(self.constituent_regions):
668
+ if idx == 0: # Only expand the first region (earliest in flow)
669
+ above_region = region.above(
670
+ height=height,
671
+ width="element", # Keep original column width
672
+ include_source=include_source,
673
+ until=until,
674
+ include_endpoint=include_endpoint,
675
+ **kwargs,
676
+ )
677
+ new_regions.append(above_region)
678
+ elif include_source:
679
+ new_regions.append(region)
680
+ else: # horizontal flow
681
+ # For horizontal flow, expand all regions upward
682
+ for region in self.constituent_regions:
683
+ above_region = region.above(
684
+ height=height,
685
+ width=width,
686
+ include_source=include_source,
687
+ until=until,
688
+ include_endpoint=include_endpoint,
689
+ **kwargs,
690
+ )
691
+ new_regions.append(above_region)
692
+
693
+ return FlowRegion(
694
+ flow=self.flow,
695
+ constituent_regions=new_regions,
696
+ source_flow_element=self.source_flow_element,
697
+ boundary_element_found=self.boundary_element_found,
698
+ )
699
+
700
+ def below(
701
+ self,
702
+ height: Optional[float] = None,
703
+ width: str = "full",
704
+ include_source: bool = False,
705
+ until: Optional[str] = None,
706
+ include_endpoint: bool = True,
707
+ **kwargs,
708
+ ) -> "FlowRegion":
709
+ """
710
+ Create a FlowRegion with regions below this FlowRegion.
711
+
712
+ For vertical flows: Only expands the bottommost constituent region downward.
713
+ For horizontal flows: Expands all constituent regions downward.
714
+
715
+ Args:
716
+ height: Height of the region below, in points
717
+ width: Width mode - "full" for full page width or "element" for element width
718
+ include_source: Whether to include this FlowRegion in the result
719
+ until: Optional selector string to specify a lower boundary element
720
+ include_endpoint: Whether to include the boundary element in the region
721
+ **kwargs: Additional parameters
722
+
723
+ Returns:
724
+ New FlowRegion with regions below
725
+ """
726
+ if not self.constituent_regions:
727
+ return FlowRegion(
728
+ flow=self.flow,
729
+ constituent_regions=[],
730
+ source_flow_element=self.source_flow_element,
731
+ boundary_element_found=self.boundary_element_found,
569
732
  )
570
- current_y = 0
571
- for img in images:
572
- # Default to left alignment for vertical stacking
573
- new_image.paste(img, (0, current_y))
574
- current_y += img.height
575
- return new_image
576
-
577
- elif self.flow.arrangement == "horizontal":
578
- # Stack horizontally
579
- composite_width = sum(img.width for img in images)
580
- composite_height = max(img.height for img in images)
581
- if composite_width == 0 or composite_height == 0:
582
- return None
583
-
584
- new_image = PIL_Image_Runtime.new(
585
- "RGB", (composite_width, composite_height), background_color
733
+
734
+ new_regions = []
735
+
736
+ if self.flow.arrangement == "vertical":
737
+ # For vertical flow, expand only the LAST constituent region in flow order.
738
+ last_idx = len(self.constituent_regions) - 1
739
+ for idx, region in enumerate(self.constituent_regions):
740
+ if idx == last_idx:
741
+ below_region = region.below(
742
+ height=height,
743
+ width="element",
744
+ include_source=include_source,
745
+ until=until,
746
+ include_endpoint=include_endpoint,
747
+ **kwargs,
748
+ )
749
+ new_regions.append(below_region)
750
+ elif include_source:
751
+ new_regions.append(region)
752
+ else: # horizontal flow
753
+ # For horizontal flow, expand all regions downward
754
+ for region in self.constituent_regions:
755
+ below_region = region.below(
756
+ height=height,
757
+ width=width,
758
+ include_source=include_source,
759
+ until=until,
760
+ include_endpoint=include_endpoint,
761
+ **kwargs,
762
+ )
763
+ new_regions.append(below_region)
764
+
765
+ return FlowRegion(
766
+ flow=self.flow,
767
+ constituent_regions=new_regions,
768
+ source_flow_element=self.source_flow_element,
769
+ boundary_element_found=self.boundary_element_found,
770
+ )
771
+
772
def left(
    self,
    width: Optional[float] = None,
    height: str = "full",
    include_source: bool = False,
    until: Optional[str] = None,
    include_endpoint: bool = True,
    **kwargs,
) -> "FlowRegion":
    """
    Create a FlowRegion with regions to the left of this FlowRegion.

    For vertical flows: Expands all constituent regions leftward.
    For horizontal flows: Only expands the leftmost constituent region
    (smallest ``x0``; the first one in flow order wins on ties) leftward.

    Args:
        width: Width of the region to the left, in points. Forwarded to
            each constituent region's ``left()`` call.
        height: Height mode ("full" or "element"). NOTE: this value is
            currently NOT forwarded; every constituent region is expanded
            with ``height="element"`` regardless of what is passed here.
        include_source: Forwarded to each ``left()`` call. For horizontal
            flows it additionally decides whether the constituent regions
            that are not expanded are carried over unchanged.
        until: Optional selector string to specify a left boundary element
        include_endpoint: Whether to include the boundary element in the region
        **kwargs: Additional parameters forwarded to each ``left()`` call

    Returns:
        New FlowRegion with regions to the left. It reuses this
        FlowRegion's ``flow``, ``source_flow_element`` and
        ``boundary_element_found``. If there are no constituent regions
        the result has none either.
    """
    regions = self.constituent_regions

    if not regions:
        return FlowRegion(
            flow=self.flow,
            constituent_regions=[],
            source_flow_element=self.source_flow_element,
            boundary_element_found=self.boundary_element_found,
        )

    def _expand_left(region):
        # Single place that defines how one constituent region is expanded.
        return region.left(
            width=width,
            height="element",
            include_source=include_source,
            until=until,
            include_endpoint=include_endpoint,
            **kwargs,
        )

    if self.flow.arrangement == "vertical":
        # For vertical flow, expand all regions leftward
        new_regions = [_expand_left(region) for region in regions]
    else:  # horizontal flow
        # Pick the leftmost region by position in the list rather than by
        # value equality: regions that compare equal (or a region listed
        # twice) must not all be expanded, only one of them.
        leftmost_idx = min(range(len(regions)), key=lambda i: regions[i].x0)
        new_regions = []
        for idx, region in enumerate(regions):
            if idx == leftmost_idx:
                new_regions.append(_expand_left(region))
            elif include_source:
                # Include other regions unchanged if include_source is True
                new_regions.append(region)

    return FlowRegion(
        flow=self.flow,
        constituent_regions=new_regions,
        source_flow_element=self.source_flow_element,
        boundary_element_found=self.boundary_element_found,
    )
845
+
846
def right(
    self,
    width: Optional[float] = None,
    height: str = "full",
    include_source: bool = False,
    until: Optional[str] = None,
    include_endpoint: bool = True,
    **kwargs,
) -> "FlowRegion":
    """
    Create a FlowRegion with regions to the right of this FlowRegion.

    For vertical flows: Expands all constituent regions rightward.
    For horizontal flows: Only expands the rightmost constituent region
    (largest ``x1``; the first one in flow order wins on ties) rightward.

    Args:
        width: Width of the region to the right, in points. Forwarded to
            each constituent region's ``right()`` call.
        height: Height mode ("full" or "element"). NOTE: this value is
            currently NOT forwarded; every constituent region is expanded
            with ``height="element"`` regardless of what is passed here.
        include_source: Forwarded to each ``right()`` call. For horizontal
            flows it additionally decides whether the constituent regions
            that are not expanded are carried over unchanged.
        until: Optional selector string to specify a right boundary element
        include_endpoint: Whether to include the boundary element in the region
        **kwargs: Additional parameters forwarded to each ``right()`` call

    Returns:
        New FlowRegion with regions to the right. It reuses this
        FlowRegion's ``flow``, ``source_flow_element`` and
        ``boundary_element_found``. If there are no constituent regions
        the result has none either.
    """
    regions = self.constituent_regions

    if not regions:
        return FlowRegion(
            flow=self.flow,
            constituent_regions=[],
            source_flow_element=self.source_flow_element,
            boundary_element_found=self.boundary_element_found,
        )

    def _expand_right(region):
        # Single place that defines how one constituent region is expanded.
        return region.right(
            width=width,
            height="element",
            include_source=include_source,
            until=until,
            include_endpoint=include_endpoint,
            **kwargs,
        )

    if self.flow.arrangement == "vertical":
        # For vertical flow, expand all regions rightward
        new_regions = [_expand_right(region) for region in regions]
    else:  # horizontal flow
        # Pick the rightmost region by position in the list rather than by
        # value equality: regions that compare equal (or a region listed
        # twice) must not all be expanded, only one of them.
        rightmost_idx = max(range(len(regions)), key=lambda i: regions[i].x1)
        new_regions = []
        for idx, region in enumerate(regions):
            if idx == rightmost_idx:
                new_regions.append(_expand_right(region))
            elif include_source:
                # Include other regions unchanged if include_source is True
                new_regions.append(region)

    return FlowRegion(
        flow=self.flow,
        constituent_regions=new_regions,
        source_flow_element=self.source_flow_element,
        boundary_element_found=self.boundary_element_found,
    )
605
919
 
920
def to_region(self) -> "FlowRegion":
    """
    Return this FlowRegion in region form.

    Thin convenience wrapper: it calls ``expand()`` with no arguments and
    hands back whatever that call produces (described by the original
    documentation as a copy of this FlowRegion).

    Returns:
        The result of ``self.expand()``.
    """
    region_copy = self.expand()
    return region_copy
929
+
606
930
  @property
607
931
  def is_empty(self) -> bool:
608
932
  """Checks if the FlowRegion contains no constituent regions or if all are empty."""
@@ -637,6 +961,7 @@ class FlowRegion:
637
961
  stitch_rows: Optional[
638
962
  Callable[[List[Optional[str]], List[Optional[str]], int, "PhysicalRegion"], bool]
639
963
  ] = None,
964
+ merge_headers: Optional[bool] = None,
640
965
  **kwargs,
641
966
  ) -> TableResult:
642
967
  """Extracts a single logical table from the FlowRegion.
@@ -650,6 +975,11 @@ class FlowRegion:
650
975
  method, table_settings, use_ocr, ocr_config, text_options, cell_extraction_func, show_progress:
651
976
  Same as in :pymeth:`Region.extract_table` and are forwarded as-is
652
977
  to each physical region.
978
+ merge_headers: Whether to merge tables by removing repeated headers from subsequent
979
+ pages/segments. If None (default), auto-detects by checking if the first row
980
+ of each segment matches the first row of the first segment. If segments have
981
+ inconsistent header patterns (some repeat, others don't), raises ValueError.
982
+ Useful for multi-page tables where headers repeat on each page.
653
983
  **kwargs: Additional keyword arguments forwarded to the underlying
654
984
  ``Region.extract_table`` implementation.
655
985
 
@@ -661,6 +991,7 @@ class FlowRegion:
661
991
  stitch_rows parameter:
662
992
  Controls whether the first rows of subsequent segments/regions should be merged
663
993
  into the previous row (to handle spill-over across page breaks).
994
+ Applied AFTER header removal if merge_headers is enabled.
664
995
 
665
996
  • None (default) – no merging (behaviour identical to previous versions).
666
997
  • Callable – custom predicate taking
@@ -679,9 +1010,11 @@ class FlowRegion:
679
1010
  # Resolve stitch_rows predicate -------------------------------------------------------
680
1011
  predicate: Optional[
681
1012
  Callable[[List[Optional[str]], List[Optional[str]], int, "PhysicalRegion"], bool]
682
- ] = stitch_rows if callable(stitch_rows) else None
1013
+ ] = (stitch_rows if callable(stitch_rows) else None)
683
1014
 
684
- def _default_merge(prev_row: List[Optional[str]], cur_row: List[Optional[str]]) -> List[Optional[str]]:
1015
+ def _default_merge(
1016
+ prev_row: List[Optional[str]], cur_row: List[Optional[str]]
1017
+ ) -> List[Optional[str]]:
685
1018
  """Column-wise merge – concatenates non-empty strings with a space."""
686
1019
  from itertools import zip_longest
687
1020
 
@@ -694,6 +1027,10 @@ class FlowRegion:
694
1027
  return merged
695
1028
 
696
1029
  aggregated_rows: List[List[Optional[str]]] = []
1030
+ header_row: Optional[List[Optional[str]]] = None
1031
+ merge_headers_enabled = False
1032
+ headers_warned = False # Track if we've already warned about dropping headers
1033
+ segment_has_repeated_header = [] # Track which segments have repeated headers
697
1034
 
698
1035
  for region_idx, region in enumerate(self.constituent_regions):
699
1036
  try:
@@ -717,6 +1054,59 @@ class FlowRegion:
717
1054
  else:
718
1055
  segment_rows = list(region_result)
719
1056
 
1057
+ # Handle header detection and merging for multi-page tables
1058
+ if region_idx == 0:
1059
+ # First segment: capture potential header row
1060
+ if segment_rows:
1061
+ header_row = segment_rows[0]
1062
+ # Determine if we should merge headers
1063
+ if merge_headers is None:
1064
+ # Auto-detect: we'll check all subsequent segments
1065
+ merge_headers_enabled = False # Will be determined later
1066
+ else:
1067
+ merge_headers_enabled = merge_headers
1068
+ # Track that first segment exists (for consistency checking)
1069
+ segment_has_repeated_header.append(False) # First segment doesn't "repeat"
1070
+ elif region_idx == 1 and merge_headers is None:
1071
+ # Auto-detection: check if first row of second segment matches header
1072
+ has_header = segment_rows and header_row and segment_rows[0] == header_row
1073
+ segment_has_repeated_header.append(has_header)
1074
+
1075
+ if has_header:
1076
+ merge_headers_enabled = True
1077
+ # Remove the detected repeated header from this segment
1078
+ segment_rows = segment_rows[1:]
1079
+ if not headers_warned:
1080
+ warnings.warn(
1081
+ "Detected repeated headers in multi-page table. Merging by removing "
1082
+ "repeated headers from subsequent pages.",
1083
+ UserWarning,
1084
+ stacklevel=2,
1085
+ )
1086
+ headers_warned = True
1087
+ else:
1088
+ merge_headers_enabled = False
1089
+ elif region_idx > 1:
1090
+ # Check consistency: all segments should have same pattern
1091
+ has_header = segment_rows and header_row and segment_rows[0] == header_row
1092
+ segment_has_repeated_header.append(has_header)
1093
+
1094
+ # Remove header if merging is enabled and header is present
1095
+ if merge_headers_enabled and has_header:
1096
+ segment_rows = segment_rows[1:]
1097
+ elif region_idx > 0 and merge_headers_enabled:
1098
+ # Explicit merge_headers=True: remove headers from subsequent segments
1099
+ if segment_rows and header_row and segment_rows[0] == header_row:
1100
+ segment_rows = segment_rows[1:]
1101
+ if not headers_warned:
1102
+ warnings.warn(
1103
+ "Removing repeated headers from multi-page table during merge.",
1104
+ UserWarning,
1105
+ stacklevel=2,
1106
+ )
1107
+ headers_warned = True
1108
+
1109
+ # Process remaining rows with stitch_rows logic
720
1110
  for row_idx, row in enumerate(segment_rows):
721
1111
  if (
722
1112
  predicate is not None
@@ -733,6 +1123,26 @@ class FlowRegion:
733
1123
  exc_info=True,
734
1124
  )
735
1125
 
1126
+ # Check for inconsistent header patterns after processing all segments
1127
+ if merge_headers is None and len(segment_has_repeated_header) > 2:
1128
+ # During auto-detection, check for consistency across all segments
1129
+ expected_pattern = segment_has_repeated_header[1] # Pattern from second segment
1130
+ for seg_idx, has_header in enumerate(segment_has_repeated_header[2:], 2):
1131
+ if has_header != expected_pattern:
1132
+ # Inconsistent pattern detected
1133
+ segments_with_headers = [
1134
+ i for i, has_h in enumerate(segment_has_repeated_header[1:], 1) if has_h
1135
+ ]
1136
+ segments_without_headers = [
1137
+ i for i, has_h in enumerate(segment_has_repeated_header[1:], 1) if not has_h
1138
+ ]
1139
+ raise ValueError(
1140
+ f"Inconsistent header pattern in multi-page table: "
1141
+ f"segments {segments_with_headers} have repeated headers, "
1142
+ f"but segments {segments_without_headers} do not. "
1143
+ f"All segments must have the same header pattern for reliable merging."
1144
+ )
1145
+
736
1146
  return TableResult(aggregated_rows)
737
1147
 
738
1148
  def extract_tables(
@@ -799,3 +1209,39 @@ class FlowRegion:
799
1209
  This is an alias for normalized_type.
800
1210
  """
801
1211
  return self.normalized_type
1212
+
1213
def get_highlight_specs(self) -> List[Dict[str, Any]]:
    """
    Build one highlight specification per usable constituent region.

    This is the FlowRegion side of the highlighting protocol: each
    constituent region is described separately so it can be highlighted
    on its own page.

    A constituent region is skipped when it has no ``page`` (attribute
    missing or ``None``) or no ``bbox`` (attribute missing or ``None``).

    Returns:
        List of dictionaries, in constituent order, each with the keys
        ``"page"``, ``"page_index"`` (the page's ``index`` attribute, or 0
        when the page has none), ``"bbox"`` and ``"element"`` (the
        constituent region itself). A ``"polygon"`` key is added when the
        region exposes ``polygon`` and a truthy ``has_polygon``.
    """
    no_polygon = object()  # sentinel: distinguishes "attribute absent" from None
    highlight_specs: List[Dict[str, Any]] = []

    for constituent in self.constituent_regions:
        owning_page = getattr(constituent, "page", None)
        if owning_page is None:
            continue

        bounds = getattr(constituent, "bbox", None)
        if bounds is None:
            continue

        entry: Dict[str, Any] = {
            "page": owning_page,
            "page_index": getattr(owning_page, "index", 0),
            "bbox": bounds,
            "element": constituent,  # Reference to the constituent region
        }

        # Attach the polygon only when both attributes exist and the flag is set
        outline = getattr(constituent, "polygon", no_polygon)
        if outline is not no_polygon and getattr(constituent, "has_polygon", False):
            entry["polygon"] = outline

        highlight_specs.append(entry)

    return highlight_specs