natural-pdf 0.1.30__py3-none-any.whl → 0.1.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2176 @@
1
+ """Guide system for table extraction and layout analysis."""
2
+
3
+ import json
4
+ import logging
5
+ from typing import Any, Dict, List, Literal, Optional, Tuple, Union, TYPE_CHECKING
6
+ from collections import UserList
7
+
8
+ import numpy as np
9
+ from PIL import Image, ImageDraw
10
+
11
+ if TYPE_CHECKING:
12
+ from natural_pdf.core.page import Page
13
+ from natural_pdf.elements.region import Region
14
+ from natural_pdf.elements.base import Element
15
+ from natural_pdf.elements.collections import ElementCollection
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ def _normalize_markers(
21
+ markers: Union[str, List[str], "ElementCollection", None],
22
+ obj: Union["Page", "Region"]
23
+ ) -> List[str]:
24
+ """
25
+ Normalize markers parameter to a list of text strings for guide creation.
26
+
27
+ Args:
28
+ markers: Can be:
29
+ - str: single selector or text string
30
+ - List[str]: list of selectors or text strings
31
+ - ElementCollection: collection of elements to extract text from
32
+ - None: empty list
33
+ obj: Object to search for elements if markers contains selectors
34
+
35
+ Returns:
36
+ List of text strings to search for
37
+ """
38
+ if markers is None:
39
+ return []
40
+
41
+ if isinstance(markers, str):
42
+ # Single selector or text string
43
+ if markers.startswith(('text', 'region', 'line', 'rect', 'blob', 'image')):
44
+ # It's a CSS selector, find elements and extract text
45
+ if hasattr(obj, 'find_all'):
46
+ elements = obj.find_all(markers)
47
+ return [elem.text if hasattr(elem, 'text') else str(elem) for elem in elements]
48
+ else:
49
+ logger.warning(f"Object {obj} doesn't support find_all for selector '{markers}'")
50
+ return [markers] # Treat as literal text
51
+ else:
52
+ # Treat as literal text
53
+ return [markers]
54
+
55
+ elif hasattr(markers, '__iter__') and not isinstance(markers, str):
56
+ # It might be an ElementCollection or list
57
+ if hasattr(markers, 'extract_each_text'):
58
+ # It's an ElementCollection
59
+ try:
60
+ return markers.extract_each_text()
61
+ except Exception as e:
62
+ logger.warning(f"Failed to extract text from ElementCollection: {e}")
63
+ # Fallback: try to get text from individual elements
64
+ texts = []
65
+ for elem in markers:
66
+ if hasattr(elem, 'text'):
67
+ texts.append(elem.text)
68
+ elif hasattr(elem, 'extract_text'):
69
+ texts.append(elem.extract_text())
70
+ else:
71
+ texts.append(str(elem))
72
+ return texts
73
+ else:
74
+ # It's a regular list - process each item
75
+ result = []
76
+ for marker in markers:
77
+ if isinstance(marker, str):
78
+ if marker.startswith(('text', 'region', 'line', 'rect', 'blob', 'image')):
79
+ # It's a selector
80
+ if hasattr(obj, 'find_all'):
81
+ elements = obj.find_all(marker)
82
+ result.extend([elem.text if hasattr(elem, 'text') else str(elem) for elem in elements])
83
+ else:
84
+ result.append(marker) # Treat as literal
85
+ else:
86
+ # Literal text
87
+ result.append(marker)
88
+ elif hasattr(marker, 'text'):
89
+ # It's an element object
90
+ result.append(marker.text)
91
+ elif hasattr(marker, 'extract_text'):
92
+ # It's an element that can extract text
93
+ result.append(marker.extract_text())
94
+ else:
95
+ result.append(str(marker))
96
+ return result
97
+
98
+ else:
99
+ # Unknown type, try to convert to string
100
+ return [str(markers)]
101
+
102
+
103
class GuidesList(UserList):
    """A list of guide coordinates that also provides methods for creating guides.

    Each instance holds the coordinates for one axis of a parent ``Guides``
    object. The builder and editing methods mutate this list in place and
    return the parent so calls can be chained.

    Operations that would normally build a new ``UserList`` (slicing, ``+``,
    ``copy``) return plain ``list`` objects instead: ``UserList`` creates such
    results via ``self.__class__(data)``, which cannot work here because the
    constructor also needs the parent and the axis.
    """

    def __init__(self, parent_guides: "Guides", axis: Literal["vertical", "horizontal"], data=None):
        super().__init__(data or [])
        self._parent = parent_guides
        self._axis = axis

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _resolve_target(self, obj: Optional[Union["Page", "Region"]]):
        """Return ``obj`` or the parent's context; raise ValueError if neither is set."""
        target_obj = obj or self._parent.context
        if target_obj is None:
            raise ValueError("No object provided and no context available")
        return target_obj

    def _absorb(self, new_guides: "Guides") -> None:
        """Append this axis' coordinates from ``new_guides``, dropping duplicates.

        Order is preserved: existing coordinates first, then new ones in the
        order they were produced. The list is intentionally not re-sorted.
        """
        if self._axis == 'vertical':
            incoming = new_guides.vertical
        else:
            incoming = new_guides.horizontal
        self.data = list(dict.fromkeys([*self.data, *incoming]))

    def _aligned_position(self, elem: Any, align: str) -> float:
        """Return the coordinate of ``elem`` to snap to along this axis.

        For the horizontal axis 'left' means the element's top and 'right'
        its bottom; any other value of ``align`` selects the center.
        """
        if self._axis == 'vertical':
            start_attr, end_attr = 'x0', 'x1'
        else:
            start_attr, end_attr = 'top', 'bottom'

        if align == 'left':
            return getattr(elem, start_attr)
        if align == 'right':
            return getattr(elem, end_attr)
        return (getattr(elem, start_attr) + getattr(elem, end_attr)) / 2

    # ------------------------------------------------------------------
    # Guide creation
    # ------------------------------------------------------------------

    def from_content(
        self,
        markers: Union[str, List[str], "ElementCollection", None],
        obj: Optional[Union["Page", "Region"]] = None,
        align: Literal['left', 'right', 'center', 'between'] = 'left',
        outer: bool = True,
        tolerance: float = 5
    ) -> "Guides":
        """
        Create guides from content markers and add to this axis.

        Args:
            markers: Content to search for. Can be:
                - str: single selector (e.g., 'text:contains("Name")') or literal text
                - List[str]: list of selectors or literal text strings
                - ElementCollection: collection of elements to extract text from
                - None: no markers
            obj: Page/Region to search (uses parent's context if None)
            align: How to align guides relative to found elements
            outer: Whether to add outer boundary guides
            tolerance: Tolerance for snapping to element edges

        Returns:
            Parent Guides object for chaining

        Raises:
            ValueError: If no object is given and the parent has no context
        """
        target_obj = self._resolve_target(obj)

        # Normalize markers to list of text strings
        marker_texts = _normalize_markers(markers, target_obj)

        new_guides = Guides.from_content(
            obj=target_obj,
            axis=self._axis,
            markers=marker_texts,
            align=align,
            outer=outer,
            tolerance=tolerance
        )
        self._absorb(new_guides)
        return self._parent  # Return parent for chaining

    def from_lines(
        self,
        obj: Optional[Union["Page", "Region"]] = None,
        threshold: Union[float, str] = 'auto',
        source_label: Optional[str] = None,
        max_lines: Optional[int] = None,
        outer: bool = False,
        detection_method: str = 'vector',
        resolution: int = 192,
        *,
        n: Optional[int] = None,
        min_gap: Optional[int] = None,
        **detect_kwargs
    ) -> "Guides":
        """
        Create guides from detected line elements.

        Args:
            obj: Page/Region to search (uses parent's context if None)
            threshold: Line detection threshold ('auto' or float 0.0-1.0)
            source_label: Filter lines by source label (for vector method)
            max_lines: Maximum lines to use (alias: n)
            n: Convenience alias for max_lines. If provided, overrides max_lines.
            min_gap: Minimum pixel gap enforced between detected lines. Mapped to
                ``min_gap_h`` or ``min_gap_v`` depending on axis (ignored if those
                keys are already supplied via ``detect_kwargs``).
            outer: Whether to add outer boundary guides
            detection_method: 'vector' (use existing LineElements) or 'pixels' (detect from image)
            resolution: DPI for pixel-based detection (default: 192)
            **detect_kwargs: Additional parameters for pixel-based detection
                (e.g., min_gap_h, min_gap_v, binarization_method, etc.)

        Returns:
            Parent Guides object for chaining

        Raises:
            ValueError: If no object/context is available, ``n`` is not
                positive, or ``min_gap`` is below 1
        """
        target_obj = self._resolve_target(obj)

        # Resolve max_lines via alias `n` (n takes priority)
        if n is not None:
            if n <= 0:
                raise ValueError("n must be a positive integer")
            max_lines = n

        # Only the limit for this list's own axis is forwarded
        max_lines_h = max_lines if self._axis == 'horizontal' else None
        max_lines_v = max_lines if self._axis == 'vertical' else None

        # Map generic `min_gap` to the axis-specific argument expected by detection
        if min_gap is not None:
            if min_gap < 1:
                raise ValueError("min_gap must be ≥ 1 pixel")
            axis_key = 'min_gap_h' if self._axis == 'horizontal' else 'min_gap_v'
            detect_kwargs.setdefault(axis_key, min_gap)

        new_guides = Guides.from_lines(
            obj=target_obj,
            axis=self._axis,
            threshold=threshold,
            source_label=source_label,
            max_lines_h=max_lines_h,
            max_lines_v=max_lines_v,
            outer=outer,
            detection_method=detection_method,
            resolution=resolution,
            **detect_kwargs
        )
        self._absorb(new_guides)
        return self._parent

    def from_whitespace(
        self,
        obj: Optional[Union["Page", "Region"]] = None,
        min_gap: float = 10
    ) -> "Guides":
        """
        Create guides from whitespace gaps.

        Args:
            obj: Page/Region to analyze (uses parent's context if None)
            min_gap: Minimum gap size to consider

        Returns:
            Parent Guides object for chaining

        Raises:
            ValueError: If no object is given and the parent has no context
        """
        target_obj = self._resolve_target(obj)

        new_guides = Guides.from_whitespace(
            obj=target_obj,
            axis=self._axis,
            min_gap=min_gap
        )
        self._absorb(new_guides)
        return self._parent

    def divide(self, n: int = 2, obj: Optional[Union["Page", "Region"]] = None) -> "Guides":
        """
        Divide the space evenly along this axis.

        Args:
            n: Number of divisions (creates n+1 guides, including both outer edges)
            obj: Object to divide (uses parent's context if None)

        Returns:
            Parent Guides object for chaining

        Raises:
            ValueError: If no object is given and the parent has no context
        """
        target_obj = self._resolve_target(obj)

        new_guides = Guides.divide(
            obj=target_obj,
            n=n,
            axis=self._axis
        )
        self._absorb(new_guides)
        return self._parent

    # ------------------------------------------------------------------
    # Snapping
    # ------------------------------------------------------------------

    def snap_to_whitespace(
        self,
        min_gap: float = 10.0,
        detection_method: str = 'pixels',
        threshold: Union[float, str] = 'auto',
        on_no_snap: str = 'warn',
        obj: Optional[Union["Page", "Region"]] = None
    ) -> "Guides":
        """
        Snap guides in this axis to whitespace gaps.

        Args:
            min_gap: Minimum gap size to consider
            detection_method: 'pixels' or 'text' for gap detection
            threshold: Threshold for whitespace detection (0.0-1.0) or 'auto'
            on_no_snap: What to do when snapping fails ('warn', 'raise', 'ignore')
            obj: Only checked for availability; it is not forwarded to the
                parent's ``snap_to_whitespace``, which is called without it

        Returns:
            Parent Guides object for chaining

        Raises:
            ValueError: If no object is given and the parent has no context
        """
        # Validation only: fail early when there is nothing to analyze.
        self._resolve_target(obj)

        if self._axis == 'vertical':
            own_axis, other_axis = 'vertical', 'horizontal'
        else:
            own_axis, other_axis = 'horizontal', 'vertical'

        # Temporarily empty the other axis so the parent only touches this one
        saved_other = getattr(self._parent, other_axis).data.copy()
        getattr(self._parent, other_axis).data = []

        try:
            self._parent.snap_to_whitespace(
                axis=self._axis,
                min_gap=min_gap,
                detection_method=detection_method,
                threshold=threshold,
                on_no_snap=on_no_snap
            )

            # Update our data from the parent
            self.data = getattr(self._parent, own_axis).data.copy()
        finally:
            # Restore the other axis even if snapping raised
            getattr(self._parent, other_axis).data = saved_other

        return self._parent

    def snap_to_content(
        self,
        markers: Union[str, List[str], "ElementCollection", None] = 'text',
        align: Literal['left', 'right', 'center'] = 'left',
        tolerance: float = 5,
        obj: Optional[Union["Page", "Region"]] = None
    ) -> "Guides":
        """
        Snap guides in this axis to nearby text content.

        Args:
            markers: Content to snap to. Can be:
                - str: single selector or literal text (default: 'text' for all text)
                - List[str]: list of selectors or literal text strings
                - ElementCollection: collection of elements
                - None: no markers (no snapping)
            align: How to align to the found text ('left'/'right' mean
                top/bottom for the horizontal axis)
            tolerance: Maximum distance to move when snapping
            obj: Object to search (uses parent's context if None)

        Returns:
            Parent Guides object for chaining

        Raises:
            ValueError: If no object is given and the parent has no context
        """
        target_obj = self._resolve_target(obj)

        # isinstance guard: comparing array-like markers to a string with ==
        # may not yield a plain bool.
        if isinstance(markers, str) and markers == 'text':
            # Special case: snap every guide to the nearest text element
            if hasattr(target_obj, 'find_all'):
                text_elements = target_obj.find_all('text')
                if hasattr(text_elements, 'elements'):
                    text_elements = text_elements.elements

                # Element positions do not depend on the guide, so compute them once
                candidates = (
                    [self._aligned_position(elem, align) for elem in text_elements]
                    if self.data else []
                )

                for i, guide_pos in enumerate(self.data):
                    best_distance = float('inf')
                    best_pos = guide_pos

                    for elem_pos in candidates:
                        distance = abs(guide_pos - elem_pos)
                        if distance < best_distance and distance <= tolerance:
                            best_distance = distance
                            best_pos = elem_pos

                    # Update guide position if we found a good snap
                    if best_pos != guide_pos:
                        self.data[i] = best_pos
                        logger.debug(f"Snapped {self._axis} guide from {guide_pos:.1f} to {best_pos:.1f}")
            else:
                logger.warning("Object does not support find_all for text snapping")
        else:
            # Specific markers: snap the closest guide to each marker found
            marker_texts = _normalize_markers(markers, target_obj)

            if not hasattr(target_obj, 'find'):
                if marker_texts:
                    logger.warning("Object does not support find for content snapping")
            else:
                for marker in marker_texts:
                    element = target_obj.find(f'text:contains("{marker}")')
                    if not element:
                        logger.warning(f"Could not find text '{marker}' for snapping")
                        continue

                    target_pos = self._aligned_position(element, align)

                    # Find closest guide and snap if within tolerance
                    if self.data:
                        closest_idx, closest_pos = min(
                            enumerate(self.data),
                            key=lambda pair: abs(pair[1] - target_pos)
                        )
                        if abs(closest_pos - target_pos) <= tolerance:
                            self.data[closest_idx] = target_pos

        # Sort after snapping
        self.data.sort()
        return self._parent

    # ------------------------------------------------------------------
    # Manual editing
    # ------------------------------------------------------------------

    def shift(self, index: int, offset: float) -> "Guides":
        """
        Move a specific guide in this axis by an offset amount.

        Args:
            index: Index of the guide to move
            offset: Amount to move (positive = right/down)

        Returns:
            Parent Guides object for chaining
        """
        if 0 <= index < len(self.data):
            self.data[index] += offset
            self.data.sort()
        else:
            logger.warning(f"Guide index {index} out of range for {self._axis} axis")

        return self._parent

    def add(self, position: Union[float, List[float]]) -> "Guides":
        """
        Add one or more guides at the specified position(s).

        Args:
            position: Coordinate(s) to add guide(s) at. Can be:
                - float: single position
                - List[float]: multiple positions (list, tuple or another
                  GuidesList/UserList)

        Returns:
            Parent Guides object for chaining
        """
        if isinstance(position, (list, tuple, UserList)):
            self.data.extend(float(pos) for pos in position)
        else:
            self.data.append(float(position))

        self.data.sort()
        return self._parent

    def remove_at(self, index: int) -> "Guides":
        """
        Remove a guide by index.

        Args:
            index: Index of guide to remove (out-of-range indices are ignored
                with a warning)

        Returns:
            Parent Guides object for chaining
        """
        if 0 <= index < len(self.data):
            self.data.pop(index)
        else:
            logger.warning(f"Guide index {index} out of range for {self._axis} axis")
        return self._parent

    def clear_all(self) -> "Guides":
        """
        Remove all guides from this axis.

        Returns:
            Parent Guides object for chaining
        """
        self.data.clear()
        return self._parent

    # ------------------------------------------------------------------
    # Sequence protocol overrides (return plain lists, see class docstring)
    # ------------------------------------------------------------------

    def __getitem__(self, i):
        """Return one coordinate, or a plain list of coordinates for a slice."""
        return self.data[i]

    def copy(self):
        """Return a plain-list copy of the coordinates."""
        return list(self.data)

    def __add__(self, other):
        """Handle addition of GuidesList objects by returning combined data."""
        if isinstance(other, GuidesList):
            return self.data + other.data
        elif isinstance(other, list):
            return self.data + other
        else:
            return NotImplemented

    def __radd__(self, other):
        """Handle ``list + GuidesList`` by returning combined data as a list."""
        if isinstance(other, list):
            return other + self.data
        return NotImplemented
582
+
583
+
584
+ class Guides:
585
+ """
586
+ Manages vertical and horizontal guide lines for table extraction and layout analysis.
587
+
588
+ Guides are collections of coordinates that can be used to define table boundaries,
589
+ column positions, or general layout structures. They can be created through various
590
+ detection methods or manually specified.
591
+
592
+ Attributes:
593
+ vertical: GuidesList of x-coordinates for vertical guide lines (initialized from the `verticals` argument)
594
+ horizontal: GuidesList of y-coordinates for horizontal guide lines (initialized from the `horizontals` argument)
595
+ context: Optional Page/Region that these guides relate to
596
+ bounds: Optional bounding box (x0, y0, x1, y1) for relative coordinate conversion
597
+ snap_behavior: How to handle failed snapping operations ('warn', 'ignore', 'raise')
598
+ """
599
+
600
def __init__(
    self,
    verticals: Optional[Union[List[float], "Page", "Region"]] = None,
    horizontals: Optional[List[float]] = None,
    context: Optional[Union["Page", "Region"]] = None,
    bounds: Optional[Tuple[float, float, float, float]] = None,
    relative: bool = False,
    snap_behavior: Literal['raise', 'warn', 'ignore'] = 'warn'
):
    """
    Initialize a Guides object.

    Args:
        verticals: List of x-coordinates for vertical guides, or a Page/Region as context
        horizontals: List of y-coordinates for horizontal guides
        context: Page or Region object these guides were created from
        bounds: Bounding box (x0, top, x1, bottom) if context not provided
        relative: Whether coordinates are relative (0-1) or absolute
        snap_behavior: How to handle snapping conflicts ('raise', 'warn', or 'ignore')
    """
    # Handle Guides(page) shorthand: a lone first argument that is not a
    # coordinate sequence is the context. UserList is included so that a
    # GuidesList taken from another Guides object counts as coordinates
    # instead of being mistaken for a page/region.
    is_coordinate_sequence = isinstance(verticals, (list, tuple, UserList))
    if verticals is not None and not is_coordinate_sequence and horizontals is None and context is None:
        context = verticals
        verticals = None

    self.context = context
    self.bounds = bounds
    self.relative = relative
    self.snap_behavior = snap_behavior

    # Store each axis as a sorted GuidesList of floats
    self._vertical = GuidesList(self, "vertical", sorted(float(x) for x in (verticals or [])))
    self._horizontal = GuidesList(self, "horizontal", sorted(float(y) for y in (horizontals or [])))

    # Determine bounds from context if needed, using the same lookup as the
    # rest of the class (bbox, then x0/top/x1/bottom, then width/height)
    if self.bounds is None and self.context is not None:
        self.bounds = self._get_context_bounds()

    # Convert relative to absolute if needed. Without bounds the values
    # cannot be converted, so they are left untouched and `relative` stays set.
    if self.relative and self.bounds:
        x0, top, x1, bottom = self.bounds
        width = x1 - x0
        height = bottom - top

        self._vertical.data = [x0 + v * width for v in self._vertical]
        self._horizontal.data = [top + h * height for h in self._horizontal]
        self.relative = False
652
+
653
+ @property
654
+ def vertical(self) -> GuidesList:
655
+ """Get vertical guide coordinates."""
656
+ return self._vertical
657
+
658
+ @vertical.setter
659
+ def vertical(self, value: Union[List[float], "Guides", None]):
660
+ """Set vertical guides from a list of coordinates or another Guides object."""
661
+ if value is None:
662
+ self._vertical.data = []
663
+ elif isinstance(value, Guides):
664
+ # Extract vertical coordinates from another Guides object
665
+ self._vertical.data = sorted([float(x) for x in value.vertical])
666
+ elif isinstance(value, str):
667
+ # Explicitly reject strings to avoid confusing iteration over characters
668
+ raise TypeError(f"vertical cannot be a string, got '{value}'. Use a list of coordinates or Guides object.")
669
+ elif hasattr(value, '__iter__'):
670
+ # Handle list/tuple of coordinates
671
+ try:
672
+ self._vertical.data = sorted([float(x) for x in value])
673
+ except (ValueError, TypeError) as e:
674
+ raise TypeError(f"vertical must contain numeric values, got {value}: {e}")
675
+ else:
676
+ raise TypeError(f"vertical must be a list, Guides object, or None, got {type(value)}")
677
+
678
+ @property
679
+ def horizontal(self) -> GuidesList:
680
+ """Get horizontal guide coordinates."""
681
+ return self._horizontal
682
+
683
+ @horizontal.setter
684
+ def horizontal(self, value: Union[List[float], "Guides", None]):
685
+ """Set horizontal guides from a list of coordinates or another Guides object."""
686
+ if value is None:
687
+ self._horizontal.data = []
688
+ elif isinstance(value, Guides):
689
+ # Extract horizontal coordinates from another Guides object
690
+ self._horizontal.data = sorted([float(y) for y in value.horizontal])
691
+ elif isinstance(value, str):
692
+ # Explicitly reject strings
693
+ raise TypeError(f"horizontal cannot be a string, got '{value}'. Use a list of coordinates or Guides object.")
694
+ elif hasattr(value, '__iter__'):
695
+ # Handle list/tuple of coordinates
696
+ try:
697
+ self._horizontal.data = sorted([float(y) for y in value])
698
+ except (ValueError, TypeError) as e:
699
+ raise TypeError(f"horizontal must contain numeric values, got {value}: {e}")
700
+ else:
701
+ raise TypeError(f"horizontal must be a list, Guides object, or None, got {type(value)}")
702
+
703
+ def _get_context_bounds(self) -> Optional[Tuple[float, float, float, float]]:
704
+ """Get bounds from context if available."""
705
+ if self.context is None:
706
+ return None
707
+
708
+ if hasattr(self.context, 'bbox'):
709
+ return self.context.bbox
710
+ elif hasattr(self.context, 'x0') and hasattr(self.context, 'top'):
711
+ return (self.context.x0, self.context.top, self.context.x1, self.context.bottom)
712
+ elif hasattr(self.context, 'width') and hasattr(self.context, 'height'):
713
+ return (0, 0, self.context.width, self.context.height)
714
+ return None
715
+
716
+ # -------------------------------------------------------------------------
717
+ # Factory Methods
718
+ # -------------------------------------------------------------------------
719
+
720
+ @classmethod
721
+ def divide(
722
+ cls,
723
+ obj: Union["Page", "Region", Tuple[float, float, float, float]],
724
+ n: Optional[int] = None,
725
+ cols: Optional[int] = None,
726
+ rows: Optional[int] = None,
727
+ axis: Literal['vertical', 'horizontal', 'both'] = 'both'
728
+ ) -> "Guides":
729
+ """
730
+ Create guides by evenly dividing an object.
731
+
732
+ Args:
733
+ obj: Object to divide (Page, Region, or bbox tuple)
734
+ n: Number of divisions (creates n+1 guides). Used if cols/rows not specified.
735
+ cols: Number of columns (creates cols+1 vertical guides)
736
+ rows: Number of rows (creates rows+1 horizontal guides)
737
+ axis: Which axis to divide along
738
+
739
+ Returns:
740
+ New Guides object with evenly spaced lines
741
+
742
+ Examples:
743
+ # Divide into 3 columns
744
+ guides = Guides.divide(page, cols=3)
745
+
746
+ # Divide into 5 rows
747
+ guides = Guides.divide(region, rows=5)
748
+
749
+ # Divide both axes
750
+ guides = Guides.divide(page, cols=3, rows=5)
751
+ """
752
+ # Extract bounds from object
753
+ if isinstance(obj, tuple) and len(obj) == 4:
754
+ bounds = obj
755
+ context = None
756
+ else:
757
+ context = obj
758
+ if hasattr(obj, 'bbox'):
759
+ bounds = obj.bbox
760
+ elif hasattr(obj, 'x0'):
761
+ bounds = (obj.x0, obj.top, obj.x1, obj.bottom)
762
+ else:
763
+ bounds = (0, 0, obj.width, obj.height)
764
+
765
+ x0, y0, x1, y1 = bounds
766
+ verticals = []
767
+ horizontals = []
768
+
769
+ # Handle vertical guides
770
+ if axis in ('vertical', 'both'):
771
+ n_vertical = cols + 1 if cols is not None else (n + 1 if n is not None else 0)
772
+ if n_vertical > 0:
773
+ for i in range(n_vertical):
774
+ x = x0 + (x1 - x0) * i / (n_vertical - 1)
775
+ verticals.append(float(x))
776
+
777
+ # Handle horizontal guides
778
+ if axis in ('horizontal', 'both'):
779
+ n_horizontal = rows + 1 if rows is not None else (n + 1 if n is not None else 0)
780
+ if n_horizontal > 0:
781
+ for i in range(n_horizontal):
782
+ y = y0 + (y1 - y0) * i / (n_horizontal - 1)
783
+ horizontals.append(float(y))
784
+
785
+ return cls(verticals=verticals, horizontals=horizontals, context=context, bounds=bounds)
786
+
787
+ @classmethod
788
+ def from_lines(
789
+ cls,
790
+ obj: Union["Page", "Region"],
791
+ axis: Literal['vertical', 'horizontal', 'both'] = 'both',
792
+ threshold: Union[float, str] = 'auto',
793
+ source_label: Optional[str] = None,
794
+ max_lines_h: Optional[int] = None,
795
+ max_lines_v: Optional[int] = None,
796
+ outer: bool = False,
797
+ detection_method: str = 'vector',
798
+ resolution: int = 192,
799
+ **detect_kwargs
800
+ ) -> "Guides":
801
+ """
802
+ Create guides from detected line elements.
803
+
804
+ Args:
805
+ obj: Page or Region to detect lines from
806
+ axis: Which orientations to detect
807
+ threshold: Detection threshold ('auto' or float 0.0-1.0) - used for pixel detection
808
+ source_label: Filter for line source (vector method) or label for detected lines (pixel method)
809
+ max_lines_h: Maximum number of horizontal lines to keep
810
+ max_lines_v: Maximum number of vertical lines to keep
811
+ outer: Whether to add outer boundary guides
812
+ detection_method: 'vector' (use existing LineElements) or 'pixels' (detect from image)
813
+ resolution: DPI for pixel-based detection (default: 192)
814
+ **detect_kwargs: Additional parameters for pixel-based detection:
815
+ - min_gap_h: Minimum gap between horizontal lines (pixels)
816
+ - min_gap_v: Minimum gap between vertical lines (pixels)
817
+ - binarization_method: 'adaptive' or 'otsu'
818
+ - morph_op_h/v: Morphological operations ('open', 'close', 'none')
819
+ - smoothing_sigma_h/v: Gaussian smoothing sigma
820
+ - method: 'projection' (default) or 'lsd' (requires opencv)
821
+
822
+ Returns:
823
+ New Guides object with detected line positions
824
+ """
825
+ # Get bounds for potential outer guides
826
+ if hasattr(obj, 'bbox'):
827
+ bounds = obj.bbox
828
+ elif hasattr(obj, 'x0'):
829
+ bounds = (obj.x0, obj.top, obj.x1, obj.bottom)
830
+ elif hasattr(obj, 'width'):
831
+ bounds = (0, 0, obj.width, obj.height)
832
+ else:
833
+ bounds = None
834
+
835
+ verticals = []
836
+ horizontals = []
837
+
838
+ if detection_method == 'pixels':
839
+ # Use pixel-based line detection
840
+ if not hasattr(obj, 'detect_lines'):
841
+ raise ValueError(f"Object {obj} does not support pixel-based line detection")
842
+
843
+ # Set up detection parameters
844
+ detect_params = {
845
+ 'resolution': resolution,
846
+ 'source_label': source_label or 'guides_detection',
847
+ 'horizontal': axis in ('horizontal', 'both'),
848
+ 'vertical': axis in ('vertical', 'both'),
849
+ 'replace': True, # Replace any existing lines with this source
850
+ 'method': detect_kwargs.get('method', 'projection'),
851
+ }
852
+
853
+ # Handle threshold parameter
854
+ if threshold == 'auto':
855
+ # Auto mode: use very low thresholds with max_lines constraints
856
+ detect_params['peak_threshold_h'] = 0.0
857
+ detect_params['peak_threshold_v'] = 0.0
858
+ detect_params['max_lines_h'] = max_lines_h
859
+ detect_params['max_lines_v'] = max_lines_v
860
+ else:
861
+ # Fixed threshold mode
862
+ detect_params['peak_threshold_h'] = float(threshold) if axis in ('horizontal', 'both') else 1.0
863
+ detect_params['peak_threshold_v'] = float(threshold) if axis in ('vertical', 'both') else 1.0
864
+ detect_params['max_lines_h'] = max_lines_h
865
+ detect_params['max_lines_v'] = max_lines_v
866
+
867
+ # Add any additional detection parameters
868
+ for key in ['min_gap_h', 'min_gap_v', 'binarization_method',
869
+ 'adaptive_thresh_block_size', 'adaptive_thresh_C_val',
870
+ 'morph_op_h', 'morph_kernel_h', 'morph_op_v', 'morph_kernel_v',
871
+ 'smoothing_sigma_h', 'smoothing_sigma_v', 'peak_width_rel_height']:
872
+ if key in detect_kwargs:
873
+ detect_params[key] = detect_kwargs[key]
874
+
875
+ # Perform the detection
876
+ obj.detect_lines(**detect_params)
877
+
878
+ # Now get the detected lines and use them
879
+ if hasattr(obj, 'lines'):
880
+ lines = obj.lines
881
+ elif hasattr(obj, 'find_all'):
882
+ lines = obj.find_all('line')
883
+ else:
884
+ lines = []
885
+
886
+ # Filter by the source we just used
887
+ lines = [l for l in lines if getattr(l, 'source', None) == detect_params['source_label']]
888
+
889
+ else: # detection_method == 'vector' (default)
890
+ # Get existing lines from the object
891
+ if hasattr(obj, 'lines'):
892
+ lines = obj.lines
893
+ elif hasattr(obj, 'find_all'):
894
+ lines = obj.find_all('line')
895
+ else:
896
+ logger.warning(f"Object {obj} has no lines or find_all method")
897
+ lines = []
898
+
899
+ # Filter by source if specified
900
+ if source_label:
901
+ lines = [l for l in lines if getattr(l, 'source', None) == source_label]
902
+
903
+ # Process lines (same logic for both methods)
904
+ # Separate lines by orientation and collect with metadata for ranking
905
+ h_line_data = [] # (y_coord, length, line_obj)
906
+ v_line_data = [] # (x_coord, length, line_obj)
907
+
908
+ for line in lines:
909
+ if hasattr(line, 'is_horizontal') and hasattr(line, 'is_vertical'):
910
+ if line.is_horizontal and axis in ('horizontal', 'both'):
911
+ # Use the midpoint y-coordinate for horizontal lines
912
+ y = (line.top + line.bottom) / 2
913
+ # Calculate line length for ranking
914
+ length = getattr(line, 'width', abs(getattr(line, 'x1', 0) - getattr(line, 'x0', 0)))
915
+ h_line_data.append((y, length, line))
916
+ elif line.is_vertical and axis in ('vertical', 'both'):
917
+ # Use the midpoint x-coordinate for vertical lines
918
+ x = (line.x0 + line.x1) / 2
919
+ # Calculate line length for ranking
920
+ length = getattr(line, 'height', abs(getattr(line, 'bottom', 0) - getattr(line, 'top', 0)))
921
+ v_line_data.append((x, length, line))
922
+
923
+ # Process horizontal lines
924
+ if max_lines_h is not None and h_line_data:
925
+ # Sort by length (longer lines are typically more significant)
926
+ h_line_data.sort(key=lambda x: x[1], reverse=True)
927
+ # Take the top N by length
928
+ selected_h = h_line_data[:max_lines_h]
929
+ # Extract just the coordinates and sort by position
930
+ horizontals = sorted([coord for coord, _, _ in selected_h])
931
+ logger.debug(f"Selected {len(horizontals)} horizontal lines from {len(h_line_data)} candidates")
932
+ else:
933
+ # Use all horizontal lines (original behavior)
934
+ horizontals = [coord for coord, _, _ in h_line_data]
935
+ horizontals = sorted(list(set(horizontals)))
936
+
937
+ # Process vertical lines
938
+ if max_lines_v is not None and v_line_data:
939
+ # Sort by length (longer lines are typically more significant)
940
+ v_line_data.sort(key=lambda x: x[1], reverse=True)
941
+ # Take the top N by length
942
+ selected_v = v_line_data[:max_lines_v]
943
+ # Extract just the coordinates and sort by position
944
+ verticals = sorted([coord for coord, _, _ in selected_v])
945
+ logger.debug(f"Selected {len(verticals)} vertical lines from {len(v_line_data)} candidates")
946
+ else:
947
+ # Use all vertical lines (original behavior)
948
+ verticals = [coord for coord, _, _ in v_line_data]
949
+ verticals = sorted(list(set(verticals)))
950
+
951
+ # Add outer guides if requested
952
+ if outer and bounds:
953
+ if axis in ('vertical', 'both'):
954
+ if not verticals or verticals[0] > bounds[0]:
955
+ verticals.insert(0, bounds[0]) # x0
956
+ if not verticals or verticals[-1] < bounds[2]:
957
+ verticals.append(bounds[2]) # x1
958
+ if axis in ('horizontal', 'both'):
959
+ if not horizontals or horizontals[0] > bounds[1]:
960
+ horizontals.insert(0, bounds[1]) # y0
961
+ if not horizontals or horizontals[-1] < bounds[3]:
962
+ horizontals.append(bounds[3]) # y1
963
+
964
+ # Remove duplicates and sort again
965
+ verticals = sorted(list(set(verticals)))
966
+ horizontals = sorted(list(set(horizontals)))
967
+
968
+ return cls(verticals=verticals, horizontals=horizontals, context=obj, bounds=bounds)
969
+
970
@classmethod
def from_content(
    cls,
    obj: Union["Page", "Region"],
    axis: Literal['vertical', 'horizontal'] = 'vertical',
    markers: Union[str, List[str], "ElementCollection", None] = None,
    align: Literal['left', 'right', 'center', 'between'] = 'left',
    outer: bool = True,
    tolerance: float = 5
) -> "Guides":
    """
    Create guides based on text content positions.

    Each marker is looked up once with a ``text:contains("...")`` selector
    and the matched element's extent along the requested axis is recorded.
    The guide coordinates are then derived from those extents.

    Args:
        obj: Page or Region to search for content
        axis: Whether to create vertical or horizontal guides
        markers: Content to search for. Can be:
            - str: single selector (e.g., 'text:contains("Name")') or literal text
            - List[str]: list of selectors or literal text strings
            - ElementCollection: collection of elements to extract text from
            - None: no markers
        align: Where to place guides relative to found text. For horizontal
            guides 'left' means the top edge and 'right' the bottom edge.
            'between' places guides at the midpoint of the space separating
            adjacent markers; if fewer than two markers are found it falls
            back to the leading edge of whatever was found.
        outer: Whether to add guides at the boundaries
        tolerance: Accepted for API compatibility; not read by this method.

    Returns:
        New Guides object aligned to text content
    """
    # Get bounds from object
    bounds = None
    if hasattr(obj, 'bbox'):
        bounds = obj.bbox
    elif hasattr(obj, 'x0'):
        bounds = (obj.x0, obj.top, obj.x1, obj.bottom)
    elif hasattr(obj, 'width'):
        bounds = (0, 0, obj.width, obj.height)

    # Normalize markers to list of text strings
    marker_texts = _normalize_markers(markers, obj)

    # Single pass: record (leading edge, trailing edge) for every marker found.
    # Doing this once avoids searching the page a second time for 'between'.
    spans = []
    if hasattr(obj, 'find'):
        for marker in marker_texts:
            element = obj.find(f'text:contains("{marker}")')
            if element:
                if axis == 'vertical':
                    spans.append((element.x0, element.x1))
                else:  # horizontal
                    spans.append((element.top, element.bottom))

    if align == 'between' and len(spans) >= 2:
        # Midpoint between the trailing edge of one marker and the leading
        # edge of the next, with markers ordered by their leading edge.
        ordered = sorted(spans, key=lambda span: span[0])
        guides_coords = [
            (current[1] + following[0]) / 2
            for current, following in zip(ordered, ordered[1:])
        ]
    elif align in ('left', 'between'):
        guides_coords = [start for start, _ in spans]
    elif align == 'right':
        guides_coords = [end for _, end in spans]
    elif align == 'center':
        guides_coords = [(start + end) / 2 for start, end in spans]
    else:
        # Unrecognized alignment contributes no marker guides
        guides_coords = []

    # Add outer guides if requested
    if outer and bounds:
        if axis == 'vertical':
            guides_coords.insert(0, bounds[0])  # x0
            guides_coords.append(bounds[2])  # x1
        else:
            guides_coords.insert(0, bounds[1])  # y0
            guides_coords.append(bounds[3])  # y1

    # Remove duplicates and sort
    guides_coords = sorted(set(guides_coords))

    # Create guides object
    if axis == 'vertical':
        return cls(verticals=guides_coords, context=obj, bounds=bounds)
    return cls(horizontals=guides_coords, context=obj, bounds=bounds)
1082
+
1083
@classmethod
def from_whitespace(
    cls,
    obj: Union["Page", "Region"],
    axis: Literal['vertical', 'horizontal', 'both'] = 'both',
    min_gap: float = 10
) -> "Guides":
    """
    Create guides by detecting whitespace gaps.

    Gap detection is not implemented yet: this logs a notice and returns
    the result of ``cls.divide(obj, n=3, axis=axis)`` instead.

    Args:
        obj: Page or Region to analyze
        axis: Which axes to produce guides for
        min_gap: Minimum gap size to consider as whitespace (currently
            not used by the placeholder implementation)

    Returns:
        New Guides object produced by the divide fallback
    """
    # Placeholder until real gap detection exists
    logger.info("Whitespace detection not yet implemented, using divide instead")
    fallback_guides = cls.divide(obj, n=3, axis=axis)
    return fallback_guides
1104
+
1105
@classmethod
def new(
    cls,
    context: Optional[Union["Page", "Region"]] = None
) -> "Guides":
    """
    Build an empty Guides object, optionally bound to a context.

    Intended as the starting point of a chained build, e.g.:
        guides = Guides.new(page).add_content(axis='vertical', markers=[...])

    Args:
        context: Optional Page or Region to use as default context for operations

    Returns:
        New Guides object with no vertical and no horizontal guides
    """
    empty_guides = cls(verticals=[], horizontals=[], context=context)
    return empty_guides
1123
+
1124
+ # -------------------------------------------------------------------------
1125
+ # Manipulation Methods
1126
+ # -------------------------------------------------------------------------
1127
+
1128
def snap_to_whitespace(
    self,
    axis: str = 'vertical',
    min_gap: float = 10.0,
    detection_method: str = 'pixels',  # 'pixels' or 'text'
    threshold: Union[float, str] = 'auto',  # threshold for what counts as a trough (0.0-1.0) or 'auto'
    on_no_snap: str = 'warn'
) -> "Guides":
    """
    Snap guides to nearby whitespace gaps (troughs).
    Modifies this Guides object in place.

    Args:
        axis: Direction to snap ('vertical' or 'horizontal')
        min_gap: Minimum gap size to consider as a valid trough
        detection_method: Method for detecting troughs:
            'pixels' - use density analysis of text bounding boxes (default)
            'text' - use text element spacing analysis
            Any other value is treated as 'pixels'.
        threshold: Threshold for what counts as a trough ('pixels' only):
            - float (0.0-1.0): areas with this fraction or less of max density count as troughs
            - 'auto': automatically find threshold that creates enough troughs for guides
        on_no_snap: Action when snapping fails ('warn', 'ignore', 'raise').
            Stored on the instance so _handle_snap_failure can honor it.

    Returns:
        Self for method chaining.

    Raises:
        ValueError: If axis is not 'vertical' or 'horizontal', or if no gaps
            are found and on_no_snap is 'raise'.
    """
    # _handle_snap_failure reads this attribute to decide how to report failures
    self.on_no_snap = on_no_snap

    if not self.context:
        logger.warning("No context available for whitespace detection")
        return self

    # Get elements for trough detection
    text_elements = self._get_text_elements()
    if not text_elements:
        logger.warning("No text elements found for whitespace detection")
        return self

    use_text_spacing = detection_method == 'text'

    if axis == 'vertical':
        if use_text_spacing:
            gaps = self._find_vertical_element_gaps(text_elements, min_gap)
        else:
            gaps = self._find_vertical_whitespace_gaps(text_elements, min_gap, threshold)
    elif axis == 'horizontal':
        if use_text_spacing:
            gaps = self._find_horizontal_element_gaps(text_elements, min_gap)
        else:
            gaps = self._find_horizontal_whitespace_gaps(text_elements, min_gap, threshold)
    else:
        raise ValueError("axis must be 'vertical' or 'horizontal'")

    if gaps:
        guide_list = self.vertical if axis == 'vertical' else self.horizontal
        self._snap_guides_to_gaps(guide_list.data, gaps, axis)
    else:
        # Nothing to snap to: report according to the caller's on_no_snap policy
        self._handle_snap_failure(f"No whitespace gaps found for {axis} guides")

    # Ensure all coordinates are Python floats (not numpy types)
    self.vertical.data[:] = [float(x) for x in self.vertical.data]
    self.horizontal.data[:] = [float(y) for y in self.horizontal.data]

    return self
1180
+
1181
def shift(
    self,
    index: int,
    offset: float,
    axis: Literal['vertical', 'horizontal'] = 'vertical'
) -> "Guides":
    """
    Move one guide by the given offset and keep the guide list sorted.

    Args:
        index: Index of the guide to move (must be 0 <= index < number of guides)
        offset: Amount to move (positive = right/down)
        axis: Which guide list to modify; any value other than 'vertical'
            selects the horizontal guides

    Returns:
        Self for method chaining
    """
    on_vertical = axis == 'vertical'
    guides = self.vertical if on_vertical else self.horizontal

    if not (0 <= index < len(guides)):
        # Out-of-range indexes are reported, not raised
        if on_vertical:
            logger.warning(f"Vertical guide index {index} out of range")
        else:
            logger.warning(f"Horizontal guide index {index} out of range")
        return self

    guides[index] += offset
    # Moving a guide may change its rank, so re-sort through the attribute
    if on_vertical:
        self.vertical = sorted(guides)
    else:
        self.horizontal = sorted(guides)
    return self
1212
+
1213
def add_vertical(self, x: float) -> "Guides":
    """Insert a vertical guide at x and re-sort the vertical guides.

    Returns:
        Self for method chaining
    """
    columns = self.vertical
    columns.append(x)
    self.vertical = sorted(columns)
    return self
1218
+
1219
def add_horizontal(self, y: float) -> "Guides":
    """Insert a horizontal guide at y and re-sort the horizontal guides.

    Returns:
        Self for method chaining
    """
    rows = self.horizontal
    rows.append(y)
    self.horizontal = sorted(rows)
    return self
1224
+
1225
def remove_vertical(self, index: int) -> "Guides":
    """Drop the vertical guide at index; out-of-range indexes are ignored.

    Returns:
        Self for method chaining
    """
    if not (0 <= index < len(self.vertical)):
        return self
    self.vertical.pop(index)
    return self
1230
+
1231
def remove_horizontal(self, index: int) -> "Guides":
    """Drop the horizontal guide at index; out-of-range indexes are ignored.

    Returns:
        Self for method chaining
    """
    if not (0 <= index < len(self.horizontal)):
        return self
    self.horizontal.pop(index)
    return self
1236
+
1237
+ # -------------------------------------------------------------------------
1238
+ # Operations
1239
+ # -------------------------------------------------------------------------
1240
+
1241
def __add__(self, other: "Guides") -> "Guides":
    """
    Merge this guide set with another one.

    Coordinates from both operands are de-duplicated, converted to Python
    floats and sorted. Context and bounds are taken from self when truthy,
    otherwise from other.

    Returns:
        New Guides object with combined coordinates
    """
    unique_x = set(self.vertical + other.vertical)
    unique_y = set(self.horizontal + other.horizontal)

    return Guides(
        verticals=sorted(float(x) for x in unique_x),
        horizontals=sorted(float(y) for y in unique_y),
        context=self.context or other.context,
        bounds=self.bounds or other.bounds
    )
1259
+
1260
def show(self, on=None, **kwargs):
    """
    Display the guides overlaid on a page or region.

    Vertical guides are drawn in blue and horizontal guides in red on a
    copy of the base image. Guides lying exactly on the right or bottom
    boundary are clamped to the last pixel row/column so they stay visible;
    guides outside the image are skipped.

    Args:
        on: Page, Region, PIL Image, or string to display guides on.
            If None, uses self.context (the object guides were created from).
            If string 'page', uses the page from self.context.
        **kwargs: Additional arguments passed to to_image() if applicable.

    Returns:
        PIL Image with guides drawn on it.

    Raises:
        ValueError: If no target can be resolved, the string target is not
            'page', the target cannot produce an image, or image generation
            returns None.
    """
    # Determine what to display guides on
    target = on if on is not None else self.context

    # Handle string shortcuts
    if isinstance(target, str):
        if target != 'page':
            raise ValueError(f"Unknown string target: {target}. Only 'page' is supported.")
        if hasattr(self.context, 'page'):
            target = self.context.page
        elif hasattr(self.context, '_page'):
            target = self.context._page
        else:
            raise ValueError("Cannot resolve 'page' - context has no page attribute")

    if target is None:
        raise ValueError("No target specified and no context available for guides display")

    # Objects exposing both bbox and page are treated as regions;
    # objects exposing mode and size are treated as PIL images.
    is_region = hasattr(target, 'bbox') and hasattr(target, 'page')
    is_image = hasattr(target, 'mode') and hasattr(target, 'size')

    # Prepare kwargs for image generation
    image_kwargs = kwargs.copy()
    # Always turn off highlights to avoid visual clutter
    image_kwargs['include_highlights'] = False
    if is_region:
        # Crop to just that region
        image_kwargs['crop'] = True

    # Get base image
    if hasattr(target, 'to_image'):
        img = target.to_image(**image_kwargs)
    elif is_image:
        img = target
    else:
        raise ValueError(f"Object {target} does not support to_image() and is not a PIL Image")

    if img is None:
        raise ValueError("Failed to generate base image")

    # Create a copy to draw on
    img = img.copy()
    draw = ImageDraw.Draw(img)

    # Determine scale factor and origin for coordinate conversion
    if hasattr(target, 'width') and hasattr(target, 'height') and not is_image:
        # target is a PDF object (Page/Region) with PDF coordinates
        scale_x = img.width / target.width
        scale_y = img.height / target.height
        if is_region:
            # Guide coordinates must be made relative to the region's origin
            origin_x, origin_y = target.x0, target.top
        else:
            origin_x, origin_y = 0, 0
    else:
        # target is already an image, no scaling needed
        scale_x = 1.0
        scale_y = 1.0
        origin_x, origin_y = 0, 0

    def _to_pixel(coord, origin, scale, extent):
        """Map a guide coordinate to a drawable pixel index, or None if off-image."""
        pixel = (coord - origin) * scale
        # Accept the far boundary (pixel == extent) and clamp it to the last
        # pixel; previously such guides were silently dropped.
        if 0 <= pixel <= extent:
            return int(min(pixel, extent - 1))
        return None

    # Draw vertical guides (blue)
    for x_coord in self.vertical:
        x_pixel = _to_pixel(x_coord, origin_x, scale_x, img.width)
        if x_pixel is not None:
            draw.line([(x_pixel, 0), (x_pixel, img.height - 1)], fill=(0, 0, 255, 200), width=2)

    # Draw horizontal guides (red)
    for y_coord in self.horizontal:
        y_pixel = _to_pixel(y_coord, origin_y, scale_y, img.height)
        if y_pixel is not None:
            draw.line([(0, y_pixel), (img.width - 1, y_pixel)], fill=(255, 0, 0, 200), width=2)

    return img
1359
+
1360
+ # -------------------------------------------------------------------------
1361
+ # Utility Methods
1362
+ # -------------------------------------------------------------------------
1363
+
1364
def get_cells(self) -> List[Tuple[float, float, float, float]]:
    """
    Build the bounding box of every cell formed by adjacent guides.

    Cells are ordered column by column: all rows of the first pair of
    vertical guides, then all rows of the next pair, and so on.

    Returns:
        List of (x0, y0, x1, y1) tuples for each cell
    """
    xs = list(self.vertical)
    ys = list(self.horizontal)
    return [
        (left, top, right, bottom)
        for left, right in zip(xs, xs[1:])
        for top, bottom in zip(ys, ys[1:])
    ]
1383
+
1384
def to_dict(self) -> Dict[str, Any]:
    """
    Export the guides as a dictionary suitable for pdfplumber table_settings.

    The guide containers themselves are placed in the dictionary (no copy
    is made).

    Returns:
        Dictionary with explicit_vertical_lines and explicit_horizontal_lines
    """
    settings = {}
    settings['explicit_vertical_lines'] = self.vertical
    settings['explicit_horizontal_lines'] = self.horizontal
    return settings
1395
+
1396
def to_relative(self) -> "Guides":
    """
    Convert absolute coordinates to relative (0-1) coordinates.

    Each coordinate is expressed as a fraction of the bounds' width
    (vertical guides) or height (horizontal guides).

    Returns:
        This object unchanged if it is already relative, otherwise a new
        Guides object with relative coordinates and bounds (0, 0, 1, 1)

    Raises:
        ValueError: If bounds are missing, or if the bounds have zero width
            (height) while vertical (horizontal) guides need converting.
    """
    if self.relative:
        return self  # Already relative

    if not self.bounds:
        raise ValueError("Cannot convert to relative without bounds")

    x0, y0, x1, y1 = self.bounds
    width = x1 - x0
    height = y1 - y0

    # A degenerate axis only matters when there is something to divide;
    # report it clearly instead of surfacing a bare ZeroDivisionError.
    if width == 0 and len(self.vertical) > 0:
        raise ValueError("Cannot convert vertical guides to relative: bounds have zero width")
    if height == 0 and len(self.horizontal) > 0:
        raise ValueError("Cannot convert horizontal guides to relative: bounds have zero height")

    rel_verticals = [(x - x0) / width for x in self.vertical]
    rel_horizontals = [(y - y0) / height for y in self.horizontal]

    return Guides(
        verticals=rel_verticals,
        horizontals=rel_horizontals,
        context=self.context,
        bounds=(0, 0, 1, 1),
        relative=True
    )
1423
+
1424
def to_absolute(self, bounds: Tuple[float, float, float, float]) -> "Guides":
    """
    Convert relative coordinates to absolute coordinates.

    Each relative value is scaled by the target width/height and offset
    by the target origin.

    Args:
        bounds: Target bounding box (x0, y0, x1, y1)

    Returns:
        This object unchanged if it is already absolute, otherwise a new
        Guides object with absolute coordinates inside bounds
    """
    if not self.relative:
        return self  # Already absolute

    left, top, right, bottom = bounds
    span_x = right - left
    span_y = bottom - top

    return Guides(
        verticals=[left + fraction * span_x for fraction in self.vertical],
        horizontals=[top + fraction * span_y for fraction in self.horizontal],
        context=self.context,
        bounds=bounds,
        relative=False
    )
1451
+
1452
@property
def n_rows(self) -> int:
    """Number of rows defined by horizontal guides (one fewer than the guides, never negative)."""
    guide_count = len(self.horizontal)
    return guide_count - 1 if guide_count > 1 else 0
1456
+
1457
@property
def n_cols(self) -> int:
    """Number of columns defined by vertical guides (one fewer than the guides, never negative)."""
    guide_count = len(self.vertical)
    return guide_count - 1 if guide_count > 1 else 0
1461
+
1462
+ def _handle_snap_failure(self, message: str):
1463
+ """Handle cases where snapping cannot be performed."""
1464
+ if hasattr(self, 'on_no_snap'):
1465
+ if self.on_no_snap == 'warn':
1466
+ logger.warning(message)
1467
+ elif self.on_no_snap == 'raise':
1468
+ raise ValueError(message)
1469
+ # 'ignore' case: do nothing
1470
+ else:
1471
+ logger.warning(message) # Default behavior
1472
+
1473
+ def _find_vertical_whitespace_gaps(self, text_elements, min_gap: float, threshold: Union[float, str] = 'auto') -> List[Tuple[float, float]]:
1474
+ """
1475
+ Find vertical whitespace gaps using bbox-based density analysis.
1476
+ Returns list of (start, end) tuples representing trough ranges.
1477
+ """
1478
+ if not self.bounds:
1479
+ return []
1480
+
1481
+ x0, _, x1, _ = self.bounds
1482
+ width_pixels = int(x1 - x0)
1483
+
1484
+ if width_pixels <= 0:
1485
+ return []
1486
+
1487
+ # Create density histogram: count bbox overlaps per x-coordinate
1488
+ density = np.zeros(width_pixels)
1489
+
1490
+ for element in text_elements:
1491
+ if not hasattr(element, 'x0') or not hasattr(element, 'x1'):
1492
+ continue
1493
+
1494
+ # Clip coordinates to bounds
1495
+ elem_x0 = max(x0, element.x0) - x0
1496
+ elem_x1 = min(x1, element.x1) - x0
1497
+
1498
+ if elem_x1 > elem_x0:
1499
+ start_px = int(elem_x0)
1500
+ end_px = int(elem_x1)
1501
+ density[start_px:end_px] += 1
1502
+
1503
+ if density.max() == 0:
1504
+ return []
1505
+
1506
+ # Determine the threshold value
1507
+ if threshold == 'auto':
1508
+ # Auto mode: try different thresholds with step 0.05 until we have enough troughs
1509
+ guides_needing_troughs = len([g for i, g in enumerate(self.vertical) if 0 < i < len(self.vertical) - 1])
1510
+ if guides_needing_troughs == 0:
1511
+ threshold_val = 0.5 # Default when no guides need placement
1512
+ else:
1513
+ threshold_val = None
1514
+ for test_threshold in np.arange(0.1, 1.0, 0.05):
1515
+ test_gaps = self._find_gaps_with_threshold(density, test_threshold, min_gap, x0)
1516
+ if len(test_gaps) >= guides_needing_troughs:
1517
+ threshold_val = test_threshold
1518
+ logger.debug(f"Auto threshold found: {test_threshold:.2f} (found {len(test_gaps)} troughs for {guides_needing_troughs} guides)")
1519
+ break
1520
+
1521
+ if threshold_val is None:
1522
+ threshold_val = 0.8 # Fallback to permissive threshold
1523
+ logger.debug(f"Auto threshold fallback to {threshold_val}")
1524
+ else:
1525
+ # Fixed threshold mode
1526
+ if not isinstance(threshold, (int, float)) or not (0.0 <= threshold <= 1.0):
1527
+ raise ValueError("threshold must be a number between 0.0 and 1.0, or 'auto'")
1528
+ threshold_val = float(threshold)
1529
+
1530
+ return self._find_gaps_with_threshold(density, threshold_val, min_gap, x0)
1531
+
1532
+ def _find_gaps_with_threshold(self, density, threshold_val, min_gap, x0):
1533
+ """Helper method to find gaps given a specific threshold value."""
1534
+ max_density = density.max()
1535
+ threshold_density = threshold_val * max_density
1536
+
1537
+ # Smooth the density for better trough detection
1538
+ from scipy.ndimage import gaussian_filter1d
1539
+ smoothed_density = gaussian_filter1d(density.astype(float), sigma=1.0)
1540
+
1541
+ # Find regions below threshold
1542
+ below_threshold = smoothed_density <= threshold_density
1543
+
1544
+ # Find contiguous regions
1545
+ from scipy.ndimage import label as nd_label
1546
+ labeled_regions, num_regions = nd_label(below_threshold)
1547
+
1548
+ gaps = []
1549
+ for region_id in range(1, num_regions + 1):
1550
+ region_mask = labeled_regions == region_id
1551
+ region_indices = np.where(region_mask)[0]
1552
+
1553
+ if len(region_indices) == 0:
1554
+ continue
1555
+
1556
+ start_px = region_indices[0]
1557
+ end_px = region_indices[-1] + 1
1558
+
1559
+ # Convert back to PDF coordinates
1560
+ start_pdf = x0 + start_px
1561
+ end_pdf = x0 + end_px
1562
+
1563
+ # Check minimum gap size
1564
+ if end_pdf - start_pdf >= min_gap:
1565
+ gaps.append((start_pdf, end_pdf))
1566
+
1567
+ return gaps
1568
+
1569
+ def _find_horizontal_whitespace_gaps(self, text_elements, min_gap: float, threshold: Union[float, str] = 'auto') -> List[Tuple[float, float]]:
1570
+ """
1571
+ Find horizontal whitespace gaps using bbox-based density analysis.
1572
+ Returns list of (start, end) tuples representing trough ranges.
1573
+ """
1574
+ if not self.bounds:
1575
+ return []
1576
+
1577
+ _, y0, _, y1 = self.bounds
1578
+ height_pixels = int(y1 - y0)
1579
+
1580
+ if height_pixels <= 0:
1581
+ return []
1582
+
1583
+ # Create density histogram: count bbox overlaps per y-coordinate
1584
+ density = np.zeros(height_pixels)
1585
+
1586
+ for element in text_elements:
1587
+ if not hasattr(element, 'top') or not hasattr(element, 'bottom'):
1588
+ continue
1589
+
1590
+ # Clip coordinates to bounds
1591
+ elem_top = max(y0, element.top) - y0
1592
+ elem_bottom = min(y1, element.bottom) - y0
1593
+
1594
+ if elem_bottom > elem_top:
1595
+ start_px = int(elem_top)
1596
+ end_px = int(elem_bottom)
1597
+ density[start_px:end_px] += 1
1598
+
1599
+ if density.max() == 0:
1600
+ return []
1601
+
1602
+ # Determine the threshold value (same logic as vertical)
1603
+ if threshold == 'auto':
1604
+ guides_needing_troughs = len([g for i, g in enumerate(self.horizontal) if 0 < i < len(self.horizontal) - 1])
1605
+ if guides_needing_troughs == 0:
1606
+ threshold_val = 0.5 # Default when no guides need placement
1607
+ else:
1608
+ threshold_val = None
1609
+ for test_threshold in np.arange(0.1, 1.0, 0.05):
1610
+ test_gaps = self._find_gaps_with_threshold_horizontal(density, test_threshold, min_gap, y0)
1611
+ if len(test_gaps) >= guides_needing_troughs:
1612
+ threshold_val = test_threshold
1613
+ logger.debug(f"Auto threshold found: {test_threshold:.2f} (found {len(test_gaps)} troughs for {guides_needing_troughs} guides)")
1614
+ break
1615
+
1616
+ if threshold_val is None:
1617
+ threshold_val = 0.8 # Fallback to permissive threshold
1618
+ logger.debug(f"Auto threshold fallback to {threshold_val}")
1619
+ else:
1620
+ # Fixed threshold mode
1621
+ if not isinstance(threshold, (int, float)) or not (0.0 <= threshold <= 1.0):
1622
+ raise ValueError("threshold must be a number between 0.0 and 1.0, or 'auto'")
1623
+ threshold_val = float(threshold)
1624
+
1625
+ return self._find_gaps_with_threshold_horizontal(density, threshold_val, min_gap, y0)
1626
+
1627
+ def _find_gaps_with_threshold_horizontal(self, density, threshold_val, min_gap, y0):
1628
+ """Helper method to find horizontal gaps given a specific threshold value."""
1629
+ max_density = density.max()
1630
+ threshold_density = threshold_val * max_density
1631
+
1632
+ # Smooth the density for better trough detection
1633
+ from scipy.ndimage import gaussian_filter1d
1634
+ smoothed_density = gaussian_filter1d(density.astype(float), sigma=1.0)
1635
+
1636
+ # Find regions below threshold
1637
+ below_threshold = smoothed_density <= threshold_density
1638
+
1639
+ # Find contiguous regions
1640
+ from scipy.ndimage import label as nd_label
1641
+ labeled_regions, num_regions = nd_label(below_threshold)
1642
+
1643
+ gaps = []
1644
+ for region_id in range(1, num_regions + 1):
1645
+ region_mask = labeled_regions == region_id
1646
+ region_indices = np.where(region_mask)[0]
1647
+
1648
+ if len(region_indices) == 0:
1649
+ continue
1650
+
1651
+ start_px = region_indices[0]
1652
+ end_px = region_indices[-1] + 1
1653
+
1654
+ # Convert back to PDF coordinates
1655
+ start_pdf = y0 + start_px
1656
+ end_pdf = y0 + end_px
1657
+
1658
+ # Check minimum gap size
1659
+ if end_pdf - start_pdf >= min_gap:
1660
+ gaps.append((start_pdf, end_pdf))
1661
+
1662
+ return gaps
1663
+
1664
+ def _find_vertical_element_gaps(self, text_elements, min_gap: float) -> List[Tuple[float, float]]:
1665
+ """
1666
+ Find vertical whitespace gaps using text element spacing analysis.
1667
+ Returns list of (start, end) tuples representing trough ranges.
1668
+ """
1669
+ if not self.bounds or not text_elements:
1670
+ return []
1671
+
1672
+ x0, _, x1, _ = self.bounds
1673
+
1674
+ # Get all element right and left edges
1675
+ element_edges = []
1676
+ for element in text_elements:
1677
+ if not hasattr(element, 'x0') or not hasattr(element, 'x1'):
1678
+ continue
1679
+ # Only include elements that overlap vertically with our bounds
1680
+ if hasattr(element, 'top') and hasattr(element, 'bottom'):
1681
+ if element.bottom < self.bounds[1] or element.top > self.bounds[3]:
1682
+ continue
1683
+ element_edges.extend([element.x0, element.x1])
1684
+
1685
+ if not element_edges:
1686
+ return []
1687
+
1688
+ # Sort edges and find gaps
1689
+ element_edges = sorted(set(element_edges))
1690
+
1691
+ trough_ranges = []
1692
+ for i in range(len(element_edges) - 1):
1693
+ gap_start = element_edges[i]
1694
+ gap_end = element_edges[i + 1]
1695
+ gap_width = gap_end - gap_start
1696
+
1697
+ if gap_width >= min_gap:
1698
+ # Check if this gap actually contains no text (is empty space)
1699
+ gap_has_text = False
1700
+ for element in text_elements:
1701
+ if (hasattr(element, 'x0') and hasattr(element, 'x1') and
1702
+ element.x0 < gap_end and element.x1 > gap_start):
1703
+ gap_has_text = True
1704
+ break
1705
+
1706
+ if not gap_has_text:
1707
+ trough_ranges.append((gap_start, gap_end))
1708
+
1709
+ return trough_ranges
1710
+
1711
+ def _find_horizontal_element_gaps(self, text_elements, min_gap: float) -> List[Tuple[float, float]]:
1712
+ """
1713
+ Find horizontal whitespace gaps using text element spacing analysis.
1714
+ Returns list of (start, end) tuples representing trough ranges.
1715
+ """
1716
+ if not self.bounds or not text_elements:
1717
+ return []
1718
+
1719
+ _, y0, _, y1 = self.bounds
1720
+
1721
+ # Get all element top and bottom edges
1722
+ element_edges = []
1723
+ for element in text_elements:
1724
+ if not hasattr(element, 'top') or not hasattr(element, 'bottom'):
1725
+ continue
1726
+ # Only include elements that overlap horizontally with our bounds
1727
+ if hasattr(element, 'x0') and hasattr(element, 'x1'):
1728
+ if element.x1 < self.bounds[0] or element.x0 > self.bounds[2]:
1729
+ continue
1730
+ element_edges.extend([element.top, element.bottom])
1731
+
1732
+ if not element_edges:
1733
+ return []
1734
+
1735
+ # Sort edges and find gaps
1736
+ element_edges = sorted(set(element_edges))
1737
+
1738
+ trough_ranges = []
1739
+ for i in range(len(element_edges) - 1):
1740
+ gap_start = element_edges[i]
1741
+ gap_end = element_edges[i + 1]
1742
+ gap_width = gap_end - gap_start
1743
+
1744
+ if gap_width >= min_gap:
1745
+ # Check if this gap actually contains no text (is empty space)
1746
+ gap_has_text = False
1747
+ for element in text_elements:
1748
+ if (hasattr(element, 'top') and hasattr(element, 'bottom') and
1749
+ element.top < gap_end and element.bottom > gap_start):
1750
+ gap_has_text = True
1751
+ break
1752
+
1753
+ if not gap_has_text:
1754
+ trough_ranges.append((gap_start, gap_end))
1755
+
1756
+ return trough_ranges
1757
+
1758
def _optimal_guide_assignment(self, guides: List[float], trough_ranges: List[Tuple[float, float]]) -> Dict[int, int]:
    """
    Decide which guides should be relocated, and into which trough.

    Rules:
    - A guide whose position falls inside a trough (inclusive) stays put and
      marks the first such trough as occupied.
    - Every other guide competes for the unoccupied troughs.
    - Candidate (guide, trough) pairs are taken greedily in order of the
      distance between the guide and the trough centre; each guide and
      each trough is used at most once.

    Args:
        guides: Guide positions along one axis.
        trough_ranges: (start, end) ranges of empty space along that axis.

    Returns:
        Mapping of guide index -> trough index for the guides that should
        move. Guides that stay, or that found no free trough, are absent.
    """
    if not guides or not trough_ranges:
        return {}

    # Single pass: note guides that already sit in a trough and the first
    # trough each of them occupies.
    settled = set()
    taken = set()
    for g_idx, position in enumerate(guides):
        for t_idx, (lo, hi) in enumerate(trough_ranges):
            if lo <= position <= hi:
                settled.add(g_idx)
                taken.add(t_idx)
                logger.debug(f"Guide {g_idx} (pos {position:.1f}) is already in trough ({lo:.1f}-{hi:.1f}), keeping in place")
                break

    # Guides outside every trough are the ones that may be moved.
    movers = []
    for g_idx, position in enumerate(guides):
        if g_idx in settled:
            continue
        movers.append(g_idx)
        logger.debug(f"Guide {g_idx} (pos {position:.1f}) is NOT in any trough, needs reassignment")

    # Troughs nobody is sitting in are up for grabs.
    free = []
    for t_idx, (lo, hi) in enumerate(trough_ranges):
        if t_idx in taken:
            continue
        free.append(t_idx)
        logger.debug(f"Trough {t_idx} ({lo:.1f}-{hi:.1f}) is available")

    # Every (distance, guide, trough) candidate, nearest first; ties fall
    # back to the index ordering of the tuple.
    candidates = sorted(
        (abs(guides[g_idx] - (trough_ranges[t_idx][0] + trough_ranges[t_idx][1]) / 2), g_idx, t_idx)
        for g_idx in movers
        for t_idx in free
    )

    chosen = {}
    claimed = set()
    for gap, g_idx, t_idx in candidates:
        if g_idx in chosen or t_idx in claimed:
            continue
        chosen[g_idx] = t_idx
        claimed.add(t_idx)
        logger.debug(f"Assigned guide {g_idx} (pos {guides[g_idx]:.1f}) to trough {t_idx} (distance: {gap:.1f})")

    logger.debug(f"Final assignments: {chosen}")
    return chosen
1826
+
1827
def _snap_guides_to_gaps(self, guides: List[float], gaps: List[Tuple[float, float]], axis: str):
    """
    Move stray guides onto the centres of nearby gaps.

    The decision of which guide goes to which gap is delegated to
    ``self._optimal_guide_assignment``; this method only applies it.
    Guides that receive no assignment keep their position.

    Args:
        guides: Guide positions; updated in place.
        gaps: (start, end) trough ranges to snap to.
        axis: Axis label, used only in log messages.

    Returns:
        None. ``guides`` is mutated directly.
    """
    if not (guides and gaps):
        return

    logger.debug(f"Snapping {len(guides)} {axis} guides to {len(gaps)} trough ranges")
    for idx, (lo, hi) in enumerate(gaps):
        midpoint = (lo + hi) / 2
        logger.debug(f" Trough {idx}: {lo:.1f} to {hi:.1f} (center: {midpoint:.1f})")

    plan = self._optimal_guide_assignment(guides, gaps)

    # Overwrite each reassigned guide with the centre of its new trough.
    for g_idx, t_idx in plan.items():
        lo, hi = gaps[t_idx]
        previous = guides[g_idx]
        snapped = (lo + hi) / 2
        guides[g_idx] = snapped
        logger.info(f"Snapped {axis} guide from {previous:.1f} to {snapped:.1f}")
1850
+
1851
def build_grid(
    self,
    target: Optional[Union["Page", "Region"]] = None,
    source: str = "guides",
    cell_padding: float = 0.5,
    include_outer_boundaries: bool = False
) -> Dict[str, int]:
    """
    Create table structure (table, rows, columns, cells) from guide coordinates.

    Regions are created with ``page.create_region`` and registered with the
    page's element manager under ``element_type="regions"``, in this order:
    the table, every row, every column, then every cell (row-major).

    Args:
        target: Page or Region to create regions on (uses self.context if None)
        source: Source label for created regions (for identification)
        cell_padding: Internal padding for cell regions in points
        include_outer_boundaries: Whether to add boundaries at edges if missing

    Returns:
        Dictionary with counts: {'table': 1, 'rows': N, 'columns': M, 'cells': K}.
        ``K`` is at most N*M: cells that collapse to zero or negative size
        once ``cell_padding`` is applied are skipped. All counts are 0 when
        fewer than two row or two column boundaries are available.

    Raises:
        ValueError: If neither ``target`` nor ``self.context`` is usable, or
            the target looks like neither a Page nor a Region.
    """
    # Determine target object
    target_obj = target or self.context
    if not target_obj:
        raise ValueError("No target object available. Provide target parameter or context.")

    # Get the page for creating regions
    if hasattr(target_obj, 'x0') and hasattr(target_obj, 'top'):  # Region (has bbox coordinates)
        page = target_obj._page
        origin_x, origin_y = target_obj.x0, target_obj.top
        context_width, context_height = target_obj.width, target_obj.height
    elif hasattr(target_obj, '_element_mgr') or hasattr(target_obj, 'width'):  # Page
        page = target_obj
        origin_x, origin_y = 0.0, 0.0
        context_width, context_height = page.width, page.height
    else:
        raise ValueError(f"Target object {target_obj} is not a Page or Region")

    element_manager = page._element_mgr

    def _with_outer_edges(boundaries, low_edge, high_edge):
        """Add low/high edges to an ascending list unless already reached."""
        if not boundaries or boundaries[0] > low_edge:
            boundaries.insert(0, low_edge)
        if boundaries[-1] < high_edge:
            boundaries.append(high_edge)
        return boundaries

    def _register(x0, top, x1, bottom, region_type, metadata):
        """Create one region, tag it, and hand it to the element manager."""
        region = page.create_region(x0, top, x1, bottom)
        region.source = source
        region.region_type = region_type
        region.normalized_type = region_type
        region.metadata.update(metadata)
        element_manager.add_element(region, element_type="regions")

    # Sort and de-duplicate BEFORE looking at the first/last entries: the
    # guide lists are not guaranteed to be ordered, and testing the first
    # element of an unordered list made the outcome depend on list order.
    row_boundaries = sorted(set(self.horizontal))
    col_boundaries = sorted(set(self.vertical))

    # Add outer boundaries if requested and missing
    if include_outer_boundaries:
        row_boundaries = _with_outer_edges(row_boundaries, origin_y, origin_y + context_height)
        col_boundaries = _with_outer_edges(col_boundaries, origin_x, origin_x + context_width)

    logger.debug(f"Building grid with {len(row_boundaries)} row and {len(col_boundaries)} col boundaries")

    # Track creation counts
    counts = {'table': 0, 'rows': 0, 'columns': 0, 'cells': 0}

    # A grid needs at least one row and one column, i.e. two boundaries each.
    if len(row_boundaries) >= 2 and len(col_boundaries) >= 2:
        table_left, table_right = col_boundaries[0], col_boundaries[-1]
        table_top, table_bottom = row_boundaries[0], row_boundaries[-1]
        num_rows = len(row_boundaries) - 1
        num_cols = len(col_boundaries) - 1

        # Overall table region
        _register(table_left, table_top, table_right, table_bottom, "table", {
            "source_guides": True,
            "num_rows": num_rows,
            "num_cols": num_cols,
            "boundaries": {"rows": row_boundaries, "cols": col_boundaries}
        })
        counts['table'] = 1

        # Row regions span the full table width
        for i in range(num_rows):
            _register(table_left, row_boundaries[i], table_right, row_boundaries[i + 1], "table_row", {
                "row_index": i,
                "source_guides": True
            })
            counts['rows'] += 1

        # Column regions span the full table height
        for j in range(num_cols):
            _register(col_boundaries[j], table_top, col_boundaries[j + 1], table_bottom, "table_column", {
                "col_index": j,
                "source_guides": True
            })
            counts['columns'] += 1

        # Cell regions, shrunk by cell_padding on every side
        for i in range(num_rows):
            for j in range(num_cols):
                cell_x0 = col_boundaries[j] + cell_padding
                cell_top = row_boundaries[i] + cell_padding
                cell_x1 = col_boundaries[j + 1] - cell_padding
                cell_bottom = row_boundaries[i + 1] - cell_padding

                # Skip cells that the padding has collapsed
                if cell_x1 <= cell_x0 or cell_bottom <= cell_top:
                    continue

                _register(cell_x0, cell_top, cell_x1, cell_bottom, "table_cell", {
                    "row_index": i,
                    "col_index": j,
                    "source_guides": True,
                    "original_boundaries": {
                        "left": col_boundaries[j],
                        "top": row_boundaries[i],
                        "right": col_boundaries[j + 1],
                        "bottom": row_boundaries[i + 1]
                    }
                })
                counts['cells'] += 1

    logger.info(f"Created {counts['table']} table, {counts['rows']} rows, "
                f"{counts['columns']} columns, and {counts['cells']} cells from guides")

    return counts
2002
+
2003
def __repr__(self) -> str:
    """Summarise the guides: count per axis plus the number of cells from ``get_cells()``."""
    n_vertical = len(self.vertical)
    n_horizontal = len(self.horizontal)
    n_cells = len(self.get_cells())
    return f"Guides(verticals={n_vertical}, horizontals={n_horizontal}, cells={n_cells})"
2008
+
2009
def _get_text_elements(self):
    """
    Fetch the text elements of ``self.context`` on a best-effort basis.

    Runs ``context.find_all('text', apply_exclusions=False)`` and unwraps
    the result's ``elements`` attribute when it has one.

    Returns:
        The found elements, or an empty list when there is no context, the
        context has no ``find_all``, or the search raises (the last two
        cases are logged as warnings).
    """
    if not self.context:
        return []

    if not hasattr(self.context, 'find_all'):
        logger.warning("Context does not support text element search")
        return []

    try:
        found = self.context.find_all('text', apply_exclusions=False)
        # Collections expose their items via `.elements`; anything else is
        # handed back untouched.
        return getattr(found, 'elements', found)
    except Exception as e:
        # Deliberately broad: a failed lookup must not abort guide analysis.
        logger.warning(f"Error getting text elements: {e}")
        return []
2025
+
2026
+ # -------------------------------------------------------------------------
2027
+ # Instance methods for fluent chaining (avoid name conflicts with class methods)
2028
+ # -------------------------------------------------------------------------
2029
+
2030
def add_content(
    self,
    axis: Literal['vertical', 'horizontal'] = 'vertical',
    markers: Union[str, List[str], "ElementCollection", None] = None,
    obj: Optional[Union["Page", "Region"]] = None,
    align: Literal['left', 'right', 'center', 'between'] = 'left',
    outer: bool = True,
    tolerance: float = 5
) -> "Guides":
    """
    Instance method: Add guides from content, allowing chaining.
    This allows: Guides.new(page).add_content(axis='vertical', markers=[...])

    Delegates to ``Guides.from_content`` and merges the resulting
    coordinates for ``axis`` into this object. The merged coordinates are
    de-duplicated and stored in ascending order.

    Args:
        axis: Which axis to create guides for
        markers: Content to search for. Can be:
            - str: single selector or literal text
            - List[str]: list of selectors or literal text strings
            - ElementCollection: collection of elements to extract text from
            - None: no markers
        obj: Page or Region to search (uses self.context if None)
        align: How to align guides relative to found elements
        outer: Whether to add outer boundary guides
        tolerance: Tolerance for snapping to element edges

    Returns:
        Self for method chaining

    Raises:
        ValueError: If no ``obj`` is given and there is no stored context.
    """
    # Use provided object or fall back to stored context
    target_obj = obj or self.context
    if target_obj is None:
        raise ValueError("No object provided and no context available")

    # Create new guides using the class method
    new_guides = Guides.from_content(
        obj=target_obj,
        axis=axis,
        markers=markers,
        align=align,
        outer=outer,
        tolerance=tolerance
    )

    # Merge into this object. sorted() rather than list(): a set has no
    # defined iteration order, so list(set(...)) left the guides shuffled.
    if axis == 'vertical':
        self.vertical = sorted(set(self.vertical + new_guides.vertical))
    else:
        self.horizontal = sorted(set(self.horizontal + new_guides.horizontal))

    return self
2080
+
2081
def add_lines(
    self,
    axis: Literal['vertical', 'horizontal', 'both'] = 'both',
    obj: Optional[Union["Page", "Region"]] = None,
    threshold: Union[float, str] = 'auto',
    source_label: Optional[str] = None,
    max_lines_h: Optional[int] = None,
    max_lines_v: Optional[int] = None,
    outer: bool = False,
    detection_method: str = 'vector',
    resolution: int = 192,
    **detect_kwargs
) -> "Guides":
    """
    Instance method: Add guides from lines, allowing chaining.
    This allows: Guides.new(page).add_lines(axis='horizontal')

    Delegates to ``Guides.from_lines`` and merges the resulting coordinates
    for the requested axis (or both) into this object. The merged
    coordinates are de-duplicated and stored in ascending order.

    Args:
        axis: Which axis to detect lines for
        obj: Page or Region to search (uses self.context if None)
        threshold: Line detection threshold ('auto' or float 0.0-1.0)
        source_label: Filter lines by source label (vector) or label for detected lines (pixels)
        max_lines_h: Maximum horizontal lines to use
        max_lines_v: Maximum vertical lines to use
        outer: Whether to add outer boundary guides
        detection_method: 'vector' (use existing LineElements) or 'pixels' (detect from image)
        resolution: DPI for pixel-based detection (default: 192)
        **detect_kwargs: Additional parameters for pixel detection (see from_lines)

    Returns:
        Self for method chaining

    Raises:
        ValueError: If no ``obj`` is given and there is no stored context.
    """
    # Use provided object or fall back to stored context
    target_obj = obj or self.context
    if target_obj is None:
        raise ValueError("No object provided and no context available")

    # Create new guides using the class method
    new_guides = Guides.from_lines(
        obj=target_obj,
        axis=axis,
        threshold=threshold,
        source_label=source_label,
        max_lines_h=max_lines_h,
        max_lines_v=max_lines_v,
        outer=outer,
        detection_method=detection_method,
        resolution=resolution,
        **detect_kwargs
    )

    # Merge into this object. sorted() rather than list(): a set has no
    # defined iteration order, so list(set(...)) left the guides shuffled.
    if axis in ('vertical', 'both'):
        self.vertical = sorted(set(self.vertical + new_guides.vertical))
    if axis in ('horizontal', 'both'):
        self.horizontal = sorted(set(self.horizontal + new_guides.horizontal))

    return self
2139
+
2140
def add_whitespace(
    self,
    axis: Literal['vertical', 'horizontal', 'both'] = 'both',
    obj: Optional[Union["Page", "Region"]] = None,
    min_gap: float = 10
) -> "Guides":
    """
    Instance method: Add guides from whitespace, allowing chaining.
    This allows: Guides.new(page).add_whitespace(axis='both')

    Delegates to ``Guides.from_whitespace`` and merges the resulting
    coordinates for the requested axis (or both) into this object. The
    merged coordinates are de-duplicated and stored in ascending order.

    Args:
        axis: Which axis to create guides for
        obj: Page or Region to search (uses self.context if None)
        min_gap: Minimum gap size to consider

    Returns:
        Self for method chaining

    Raises:
        ValueError: If no ``obj`` is given and there is no stored context.
    """
    # Use provided object or fall back to stored context
    target_obj = obj or self.context
    if target_obj is None:
        raise ValueError("No object provided and no context available")

    # Create new guides using the class method
    new_guides = Guides.from_whitespace(
        obj=target_obj,
        axis=axis,
        min_gap=min_gap
    )

    # Merge into this object. sorted() rather than list(): a set has no
    # defined iteration order, so list(set(...)) left the guides shuffled.
    if axis in ('vertical', 'both'):
        self.vertical = sorted(set(self.vertical + new_guides.vertical))
    if axis in ('horizontal', 'both'):
        self.horizontal = sorted(set(self.horizontal + new_guides.horizontal))

    return self