natural-pdf 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. natural_pdf/analyzers/__init__.py +2 -2
  2. natural_pdf/analyzers/guides.py +751 -607
  3. natural_pdf/analyzers/layout/base.py +53 -6
  4. natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -14
  6. natural_pdf/analyzers/layout/layout_options.py +1 -0
  7. natural_pdf/analyzers/layout/paddle.py +102 -64
  8. natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
  9. natural_pdf/analyzers/layout/yolo.py +2 -6
  10. natural_pdf/analyzers/shape_detection_mixin.py +15 -6
  11. natural_pdf/classification/manager.py +92 -77
  12. natural_pdf/classification/mixin.py +49 -5
  13. natural_pdf/classification/results.py +1 -1
  14. natural_pdf/cli.py +7 -3
  15. natural_pdf/collections/pdf_collection.py +96 -101
  16. natural_pdf/core/element_manager.py +131 -45
  17. natural_pdf/core/highlighting_service.py +5 -6
  18. natural_pdf/core/page.py +120 -23
  19. natural_pdf/core/pdf.py +477 -75
  20. natural_pdf/describe/__init__.py +18 -12
  21. natural_pdf/describe/base.py +179 -172
  22. natural_pdf/describe/elements.py +155 -155
  23. natural_pdf/describe/mixin.py +27 -19
  24. natural_pdf/describe/summary.py +44 -55
  25. natural_pdf/elements/base.py +134 -18
  26. natural_pdf/elements/collections.py +90 -18
  27. natural_pdf/elements/image.py +2 -1
  28. natural_pdf/elements/line.py +0 -31
  29. natural_pdf/elements/rect.py +0 -14
  30. natural_pdf/elements/region.py +222 -108
  31. natural_pdf/elements/text.py +18 -12
  32. natural_pdf/exporters/__init__.py +4 -1
  33. natural_pdf/exporters/original_pdf.py +12 -4
  34. natural_pdf/extraction/mixin.py +66 -10
  35. natural_pdf/extraction/result.py +1 -1
  36. natural_pdf/flows/flow.py +63 -4
  37. natural_pdf/flows/region.py +4 -4
  38. natural_pdf/ocr/engine.py +83 -2
  39. natural_pdf/ocr/engine_paddle.py +5 -5
  40. natural_pdf/ocr/ocr_factory.py +2 -1
  41. natural_pdf/ocr/ocr_manager.py +24 -13
  42. natural_pdf/ocr/ocr_options.py +3 -10
  43. natural_pdf/qa/document_qa.py +21 -8
  44. natural_pdf/qa/qa_result.py +3 -7
  45. natural_pdf/search/__init__.py +3 -2
  46. natural_pdf/search/lancedb_search_service.py +5 -6
  47. natural_pdf/search/numpy_search_service.py +5 -2
  48. natural_pdf/selectors/parser.py +51 -6
  49. natural_pdf/tables/__init__.py +2 -2
  50. natural_pdf/tables/result.py +7 -6
  51. natural_pdf/utils/bidi_mirror.py +2 -1
  52. natural_pdf/utils/reading_order.py +3 -2
  53. natural_pdf/utils/visualization.py +3 -3
  54. natural_pdf/widgets/viewer.py +0 -1
  55. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/METADATA +1 -1
  56. natural_pdf-0.1.35.dist-info/RECORD +121 -0
  57. optimization/memory_comparison.py +73 -58
  58. optimization/pdf_analyzer.py +141 -96
  59. optimization/performance_analysis.py +111 -110
  60. optimization/test_cleanup_methods.py +47 -36
  61. optimization/test_memory_fix.py +40 -39
  62. tools/bad_pdf_eval/__init__.py +0 -1
  63. tools/bad_pdf_eval/analyser.py +35 -18
  64. tools/bad_pdf_eval/collate_summaries.py +22 -18
  65. tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
  66. tools/bad_pdf_eval/eval_suite.py +21 -9
  67. tools/bad_pdf_eval/evaluate_quality.py +198 -0
  68. tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
  69. tools/bad_pdf_eval/llm_enrich.py +71 -39
  70. tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
  71. tools/bad_pdf_eval/reporter.py +1 -1
  72. tools/bad_pdf_eval/utils.py +7 -4
  73. natural_pdf-0.1.33.dist-info/RECORD +0 -118
  74. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/WHEEL +0 -0
  75. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/entry_points.txt +0 -0
  76. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/licenses/LICENSE +0 -0
  77. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/top_level.txt +0 -0
@@ -2,59 +2,58 @@
2
2
 
3
3
  import json
4
4
  import logging
5
- from typing import Any, Dict, List, Literal, Optional, Tuple, Union, TYPE_CHECKING
6
5
  from collections import UserList
6
+ from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
7
7
 
8
8
  import numpy as np
9
9
  from PIL import Image, ImageDraw
10
10
 
11
11
  if TYPE_CHECKING:
12
12
  from natural_pdf.core.page import Page
13
- from natural_pdf.elements.region import Region
14
13
  from natural_pdf.elements.base import Element
15
14
  from natural_pdf.elements.collections import ElementCollection
15
+ from natural_pdf.elements.region import Region
16
16
 
17
17
  logger = logging.getLogger(__name__)
18
18
 
19
19
 
20
20
  def _normalize_markers(
21
- markers: Union[str, List[str], "ElementCollection", None],
22
- obj: Union["Page", "Region"]
21
+ markers: Union[str, List[str], "ElementCollection", None], obj: Union["Page", "Region"]
23
22
  ) -> List[str]:
24
23
  """
25
24
  Normalize markers parameter to a list of text strings for guide creation.
26
-
25
+
27
26
  Args:
28
27
  markers: Can be:
29
28
  - str: single selector or text string
30
- - List[str]: list of selectors or text strings
29
+ - List[str]: list of selectors or text strings
31
30
  - ElementCollection: collection of elements to extract text from
32
31
  - None: empty list
33
32
  obj: Object to search for elements if markers contains selectors
34
-
33
+
35
34
  Returns:
36
35
  List of text strings to search for
37
36
  """
38
37
  if markers is None:
39
38
  return []
40
-
39
+
41
40
  if isinstance(markers, str):
42
41
  # Single selector or text string
43
- if markers.startswith(('text', 'region', 'line', 'rect', 'blob', 'image')):
42
+ if markers.startswith(("text", "region", "line", "rect", "blob", "image")):
44
43
  # It's a CSS selector, find elements and extract text
45
- if hasattr(obj, 'find_all'):
44
+ if hasattr(obj, "find_all"):
46
45
  elements = obj.find_all(markers)
47
- return [elem.text if hasattr(elem, 'text') else str(elem) for elem in elements]
46
+ return [elem.text if hasattr(elem, "text") else str(elem) for elem in elements]
48
47
  else:
49
48
  logger.warning(f"Object {obj} doesn't support find_all for selector '{markers}'")
50
49
  return [markers] # Treat as literal text
51
50
  else:
52
51
  # Treat as literal text
53
52
  return [markers]
54
-
55
- elif hasattr(markers, '__iter__') and not isinstance(markers, str):
53
+
54
+ elif hasattr(markers, "__iter__") and not isinstance(markers, str):
56
55
  # It might be an ElementCollection or list
57
- if hasattr(markers, 'extract_each_text'):
56
+ if hasattr(markers, "extract_each_text"):
58
57
  # It's an ElementCollection
59
58
  try:
60
59
  return markers.extract_each_text()
@@ -63,9 +62,9 @@ def _normalize_markers(
63
62
  # Fallback: try to get text from individual elements
64
63
  texts = []
65
64
  for elem in markers:
66
- if hasattr(elem, 'text'):
65
+ if hasattr(elem, "text"):
67
66
  texts.append(elem.text)
68
- elif hasattr(elem, 'extract_text'):
67
+ elif hasattr(elem, "extract_text"):
69
68
  texts.append(elem.extract_text())
70
69
  else:
71
70
  texts.append(str(elem))
@@ -75,26 +74,31 @@ def _normalize_markers(
75
74
  result = []
76
75
  for marker in markers:
77
76
  if isinstance(marker, str):
78
- if marker.startswith(('text', 'region', 'line', 'rect', 'blob', 'image')):
77
+ if marker.startswith(("text", "region", "line", "rect", "blob", "image")):
79
78
  # It's a selector
80
- if hasattr(obj, 'find_all'):
79
+ if hasattr(obj, "find_all"):
81
80
  elements = obj.find_all(marker)
82
- result.extend([elem.text if hasattr(elem, 'text') else str(elem) for elem in elements])
81
+ result.extend(
82
+ [
83
+ elem.text if hasattr(elem, "text") else str(elem)
84
+ for elem in elements
85
+ ]
86
+ )
83
87
  else:
84
88
  result.append(marker) # Treat as literal
85
89
  else:
86
90
  # Literal text
87
91
  result.append(marker)
88
- elif hasattr(marker, 'text'):
92
+ elif hasattr(marker, "text"):
89
93
  # It's an element object
90
94
  result.append(marker.text)
91
- elif hasattr(marker, 'extract_text'):
95
+ elif hasattr(marker, "extract_text"):
92
96
  # It's an element that can extract text
93
97
  result.append(marker.extract_text())
94
98
  else:
95
99
  result.append(str(marker))
96
100
  return result
97
-
101
+
98
102
  else:
99
103
  # Unknown type, try to convert to string
100
104
  return [str(markers)]
@@ -102,44 +106,46 @@ def _normalize_markers(
102
106
 
103
107
  class GuidesList(UserList):
104
108
  """A list of guide coordinates that also provides methods for creating guides."""
105
-
109
+
106
110
  def __init__(self, parent_guides: "Guides", axis: Literal["vertical", "horizontal"], data=None):
107
111
  super().__init__(data or [])
108
112
  self._parent = parent_guides
109
113
  self._axis = axis
110
-
114
+
111
115
  def from_content(
112
116
  self,
113
117
  markers: Union[str, List[str], "ElementCollection", None],
114
118
  obj: Optional[Union["Page", "Region"]] = None,
115
- align: Literal['left', 'right', 'center', 'between'] = 'left',
119
+ align: Literal["left", "right", "center", "between"] = "left",
116
120
  outer: bool = True,
117
- tolerance: float = 5
121
+ tolerance: float = 5,
122
+ *,
123
+ append: bool = False,
118
124
  ) -> "Guides":
119
125
  """
120
126
  Create guides from content markers and add to this axis.
121
-
127
+
122
128
  Args:
123
129
  markers: Content to search for. Can be:
124
130
  - str: single selector (e.g., 'text:contains("Name")') or literal text
125
- - List[str]: list of selectors or literal text strings
131
+ - List[str]: list of selectors or literal text strings
126
132
  - ElementCollection: collection of elements to extract text from
127
133
  - None: no markers
128
134
  obj: Page/Region to search (uses parent's context if None)
129
135
  align: How to align guides relative to found elements
130
136
  outer: Whether to add outer boundary guides
131
137
  tolerance: Tolerance for snapping to element edges
132
-
138
+
133
139
  Returns:
134
140
  Parent Guides object for chaining
135
141
  """
136
142
  target_obj = obj or self._parent.context
137
143
  if target_obj is None:
138
144
  raise ValueError("No object provided and no context available")
139
-
145
+
140
146
  # Normalize markers to list of text strings
141
147
  marker_texts = _normalize_markers(markers, target_obj)
142
-
148
+
143
149
  # Create guides for this axis
144
150
  new_guides = Guides.from_content(
145
151
  obj=target_obj,
@@ -147,15 +153,21 @@ class GuidesList(UserList):
147
153
  markers=marker_texts,
148
154
  align=align,
149
155
  outer=outer,
150
- tolerance=tolerance
156
+ tolerance=tolerance,
151
157
  )
152
-
153
- # Add to our list
154
- if self._axis == 'vertical':
155
- self.extend(new_guides.vertical)
158
+
159
+ # Replace or append based on parameter
160
+ if append:
161
+ if self._axis == "vertical":
162
+ self.extend(new_guides.vertical)
163
+ else:
164
+ self.extend(new_guides.horizontal)
156
165
  else:
157
- self.extend(new_guides.horizontal)
158
-
166
+ if self._axis == "vertical":
167
+ self.data = list(new_guides.vertical)
168
+ else:
169
+ self.data = list(new_guides.horizontal)
170
+
159
171
  # Remove duplicates while preserving order
160
172
  seen = set()
161
173
  unique = []
@@ -164,26 +176,27 @@ class GuidesList(UserList):
164
176
  seen.add(x)
165
177
  unique.append(x)
166
178
  self.data = unique
167
-
179
+
168
180
  return self._parent # Return parent for chaining
169
-
181
+
170
182
  def from_lines(
171
183
  self,
172
184
  obj: Optional[Union["Page", "Region"]] = None,
173
- threshold: Union[float, str] = 'auto',
185
+ threshold: Union[float, str] = "auto",
174
186
  source_label: Optional[str] = None,
175
187
  max_lines: Optional[int] = None,
176
188
  outer: bool = False,
177
- detection_method: str = 'vector',
189
+ detection_method: str = "vector",
178
190
  resolution: int = 192,
179
191
  *,
180
192
  n: Optional[int] = None,
181
193
  min_gap: Optional[int] = None,
182
- **detect_kwargs
194
+ append: bool = False,
195
+ **detect_kwargs,
183
196
  ) -> "Guides":
184
197
  """
185
198
  Create guides from detected line elements.
186
-
199
+
187
200
  Args:
188
201
  obj: Page/Region to search (uses parent's context if None)
189
202
  threshold: Line detection threshold ('auto' or float 0.0-1.0)
@@ -198,14 +211,14 @@ class GuidesList(UserList):
198
211
  resolution: DPI for pixel-based detection (default: 192)
199
212
  **detect_kwargs: Additional parameters for pixel-based detection
200
213
  (e.g., min_gap_h, min_gap_v, binarization_method, etc.)
201
-
214
+
202
215
  Returns:
203
216
  Parent Guides object for chaining
204
217
  """
205
218
  target_obj = obj or self._parent.context
206
219
  if target_obj is None:
207
220
  raise ValueError("No object provided and no context available")
208
-
221
+
209
222
  # Resolve max_lines via alias `n` (n takes priority)
210
223
  if n is not None:
211
224
  if n <= 0:
@@ -213,16 +226,16 @@ class GuidesList(UserList):
213
226
  max_lines = n
214
227
 
215
228
  # Set appropriate max_lines parameter for underlying API
216
- max_lines_h = max_lines if self._axis == 'horizontal' else None
217
- max_lines_v = max_lines if self._axis == 'vertical' else None
218
-
229
+ max_lines_h = max_lines if self._axis == "horizontal" else None
230
+ max_lines_v = max_lines if self._axis == "vertical" else None
231
+
219
232
  # Map generic `min_gap` to axis-specific argument expected by detection
220
233
  if min_gap is not None:
221
234
  if min_gap < 1:
222
235
  raise ValueError("min_gap must be ≥ 1 pixel")
223
- axis_key = 'min_gap_h' if self._axis == 'horizontal' else 'min_gap_v'
236
+ axis_key = "min_gap_h" if self._axis == "horizontal" else "min_gap_v"
224
237
  detect_kwargs.setdefault(axis_key, min_gap)
225
-
238
+
226
239
  # Create guides for this axis
227
240
  new_guides = Guides.from_lines(
228
241
  obj=target_obj,
@@ -234,15 +247,21 @@ class GuidesList(UserList):
234
247
  outer=outer,
235
248
  detection_method=detection_method,
236
249
  resolution=resolution,
237
- **detect_kwargs
250
+ **detect_kwargs,
238
251
  )
239
-
240
- # Add to our list
241
- if self._axis == 'vertical':
242
- self.extend(new_guides.vertical)
252
+
253
+ # Replace or append based on parameter
254
+ if append:
255
+ if self._axis == "vertical":
256
+ self.extend(new_guides.vertical)
257
+ else:
258
+ self.extend(new_guides.horizontal)
243
259
  else:
244
- self.extend(new_guides.horizontal)
245
-
260
+ if self._axis == "vertical":
261
+ self.data = list(new_guides.vertical)
262
+ else:
263
+ self.data = list(new_guides.horizontal)
264
+
246
265
  # Remove duplicates
247
266
  seen = set()
248
267
  unique = []
@@ -251,41 +270,42 @@ class GuidesList(UserList):
251
270
  seen.add(x)
252
271
  unique.append(x)
253
272
  self.data = unique
254
-
273
+
255
274
  return self._parent
256
-
275
+
257
276
  def from_whitespace(
258
- self,
259
- obj: Optional[Union["Page", "Region"]] = None,
260
- min_gap: float = 10
277
+ self, obj: Optional[Union["Page", "Region"]] = None, min_gap: float = 10,
278
+ *, append: bool = False
261
279
  ) -> "Guides":
262
280
  """
263
281
  Create guides from whitespace gaps.
264
-
282
+
265
283
  Args:
266
284
  obj: Page/Region to analyze (uses parent's context if None)
267
285
  min_gap: Minimum gap size to consider
268
-
286
+
269
287
  Returns:
270
288
  Parent Guides object for chaining
271
289
  """
272
290
  target_obj = obj or self._parent.context
273
291
  if target_obj is None:
274
292
  raise ValueError("No object provided and no context available")
275
-
293
+
276
294
  # Create guides for this axis
277
- new_guides = Guides.from_whitespace(
278
- obj=target_obj,
279
- axis=self._axis,
280
- min_gap=min_gap
281
- )
282
-
283
- # Add to our list
284
- if self._axis == 'vertical':
285
- self.extend(new_guides.vertical)
295
+ new_guides = Guides.from_whitespace(obj=target_obj, axis=self._axis, min_gap=min_gap)
296
+
297
+ # Replace or append
298
+ if append:
299
+ if self._axis == "vertical":
300
+ self.extend(new_guides.vertical)
301
+ else:
302
+ self.extend(new_guides.horizontal)
286
303
  else:
287
- self.extend(new_guides.horizontal)
288
-
304
+ if self._axis == "vertical":
305
+ self.data = list(new_guides.vertical)
306
+ else:
307
+ self.data = list(new_guides.horizontal)
308
+
289
309
  # Remove duplicates
290
310
  seen = set()
291
311
  unique = []
@@ -294,37 +314,33 @@ class GuidesList(UserList):
294
314
  seen.add(x)
295
315
  unique.append(x)
296
316
  self.data = unique
297
-
317
+
298
318
  return self._parent
299
-
319
+
300
320
  def divide(self, n: int = 2, obj: Optional[Union["Page", "Region"]] = None) -> "Guides":
301
321
  """
302
322
  Divide the space evenly along this axis.
303
-
323
+
304
324
  Args:
305
325
  n: Number of divisions (creates n-1 guides)
306
326
  obj: Object to divide (uses parent's context if None)
307
-
327
+
308
328
  Returns:
309
329
  Parent Guides object for chaining
310
330
  """
311
331
  target_obj = obj or self._parent.context
312
332
  if target_obj is None:
313
333
  raise ValueError("No object provided and no context available")
314
-
334
+
315
335
  # Create guides using divide
316
- new_guides = Guides.divide(
317
- obj=target_obj,
318
- n=n,
319
- axis=self._axis
320
- )
321
-
322
- # Add to our list
323
- if self._axis == 'vertical':
324
- self.extend(new_guides.vertical)
336
+ new_guides = Guides.divide(obj=target_obj, n=n, axis=self._axis)
337
+
338
+ # Replace existing guides instead of extending (no append option here)
339
+ if self._axis == "vertical":
340
+ self.data = list(new_guides.vertical)
325
341
  else:
326
- self.extend(new_guides.horizontal)
327
-
342
+ self.data = list(new_guides.horizontal)
343
+
328
344
  # Remove duplicates
329
345
  seen = set()
330
346
  unique = []
@@ -333,45 +349,45 @@ class GuidesList(UserList):
333
349
  seen.add(x)
334
350
  unique.append(x)
335
351
  self.data = unique
336
-
352
+
337
353
  return self._parent
338
-
354
+
339
355
  def snap_to_whitespace(
340
356
  self,
341
357
  min_gap: float = 10.0,
342
- detection_method: str = 'pixels',
343
- threshold: Union[float, str] = 'auto',
344
- on_no_snap: str = 'warn',
345
- obj: Optional[Union["Page", "Region"]] = None
358
+ detection_method: str = "pixels",
359
+ threshold: Union[float, str] = "auto",
360
+ on_no_snap: str = "warn",
361
+ obj: Optional[Union["Page", "Region"]] = None,
346
362
  ) -> "Guides":
347
363
  """
348
364
  Snap guides in this axis to whitespace gaps.
349
-
365
+
350
366
  Args:
351
367
  min_gap: Minimum gap size to consider
352
368
  detection_method: 'pixels' or 'text' for gap detection
353
369
  threshold: Threshold for whitespace detection (0.0-1.0) or 'auto'
354
370
  on_no_snap: What to do when snapping fails ('warn', 'raise', 'ignore')
355
371
  obj: Object to analyze (uses parent's context if None)
356
-
372
+
357
373
  Returns:
358
374
  Parent Guides object for chaining
359
375
  """
360
376
  target_obj = obj or self._parent.context
361
377
  if target_obj is None:
362
378
  raise ValueError("No object provided and no context available")
363
-
379
+
364
380
  # Use the parent's snap_to_whitespace but only for this axis
365
381
  original_guides = self.data.copy()
366
-
382
+
367
383
  # Temporarily set the parent's guides to only this axis
368
- if self._axis == 'vertical':
384
+ if self._axis == "vertical":
369
385
  original_horizontal = self._parent.horizontal.data.copy()
370
386
  self._parent.horizontal.data = []
371
387
  else:
372
388
  original_vertical = self._parent.vertical.data.copy()
373
389
  self._parent.vertical.data = []
374
-
390
+
375
391
  try:
376
392
  # Call the parent's method
377
393
  self._parent.snap_to_whitespace(
@@ -379,140 +395,143 @@ class GuidesList(UserList):
379
395
  min_gap=min_gap,
380
396
  detection_method=detection_method,
381
397
  threshold=threshold,
382
- on_no_snap=on_no_snap
398
+ on_no_snap=on_no_snap,
383
399
  )
384
-
400
+
385
401
  # Update our data from the parent
386
- if self._axis == 'vertical':
402
+ if self._axis == "vertical":
387
403
  self.data = self._parent.vertical.data.copy()
388
404
  else:
389
405
  self.data = self._parent.horizontal.data.copy()
390
-
406
+
391
407
  finally:
392
408
  # Restore the other axis
393
- if self._axis == 'vertical':
409
+ if self._axis == "vertical":
394
410
  self._parent.horizontal.data = original_horizontal
395
411
  else:
396
412
  self._parent.vertical.data = original_vertical
397
-
413
+
398
414
  return self._parent
399
-
415
+
400
416
  def snap_to_content(
401
417
  self,
402
- markers: Union[str, List[str], "ElementCollection", None] = 'text',
403
- align: Literal['left', 'right', 'center'] = 'left',
418
+ markers: Union[str, List[str], "ElementCollection", None] = "text",
419
+ align: Literal["left", "right", "center"] = "left",
404
420
  tolerance: float = 5,
405
- obj: Optional[Union["Page", "Region"]] = None
421
+ obj: Optional[Union["Page", "Region"]] = None,
406
422
  ) -> "Guides":
407
423
  """
408
424
  Snap guides in this axis to nearby text content.
409
-
425
+
410
426
  Args:
411
427
  markers: Content to snap to. Can be:
412
428
  - str: single selector or literal text (default: 'text' for all text)
413
- - List[str]: list of selectors or literal text strings
429
+ - List[str]: list of selectors or literal text strings
414
430
  - ElementCollection: collection of elements
415
431
  - None: no markers (no snapping)
416
432
  align: How to align to the found text
417
433
  tolerance: Maximum distance to move when snapping
418
434
  obj: Object to search (uses parent's context if None)
419
-
435
+
420
436
  Returns:
421
437
  Parent Guides object for chaining
422
438
  """
423
439
  target_obj = obj or self._parent.context
424
440
  if target_obj is None:
425
441
  raise ValueError("No object provided and no context available")
426
-
442
+
427
443
  # Handle special case of 'text' as a selector for all text
428
- if markers == 'text':
444
+ if markers == "text":
429
445
  # Get all text elements
430
- if hasattr(target_obj, 'find_all'):
431
- text_elements = target_obj.find_all('text')
432
- if hasattr(text_elements, 'elements'):
446
+ if hasattr(target_obj, "find_all"):
447
+ text_elements = target_obj.find_all("text")
448
+ if hasattr(text_elements, "elements"):
433
449
  text_elements = text_elements.elements
434
-
450
+
435
451
  # Snap each guide to the nearest text element
436
452
  for i, guide_pos in enumerate(self.data):
437
- best_distance = float('inf')
453
+ best_distance = float("inf")
438
454
  best_pos = guide_pos
439
-
455
+
440
456
  for elem in text_elements:
441
457
  # Calculate target position based on alignment
442
- if self._axis == 'vertical':
443
- if align == 'left':
458
+ if self._axis == "vertical":
459
+ if align == "left":
444
460
  elem_pos = elem.x0
445
- elif align == 'right':
461
+ elif align == "right":
446
462
  elem_pos = elem.x1
447
463
  else: # center
448
464
  elem_pos = (elem.x0 + elem.x1) / 2
449
465
  else: # horizontal
450
- if align == 'left': # top for horizontal
466
+ if align == "left": # top for horizontal
451
467
  elem_pos = elem.top
452
- elif align == 'right': # bottom for horizontal
468
+ elif align == "right": # bottom for horizontal
453
469
  elem_pos = elem.bottom
454
470
  else: # center
455
471
  elem_pos = (elem.top + elem.bottom) / 2
456
-
472
+
457
473
  # Check if this is closer than current best
458
474
  distance = abs(guide_pos - elem_pos)
459
475
  if distance < best_distance and distance <= tolerance:
460
476
  best_distance = distance
461
477
  best_pos = elem_pos
462
-
478
+
463
479
  # Update guide position if we found a good snap
464
480
  if best_pos != guide_pos:
465
481
  self.data[i] = best_pos
466
- logger.debug(f"Snapped {self._axis} guide from {guide_pos:.1f} to {best_pos:.1f}")
482
+ logger.debug(
483
+ f"Snapped {self._axis} guide from {guide_pos:.1f} to {best_pos:.1f}"
484
+ )
467
485
  else:
468
486
  logger.warning("Object does not support find_all for text snapping")
469
487
  else:
470
488
  # Original behavior for specific markers
471
489
  marker_texts = _normalize_markers(markers, target_obj)
472
-
490
+
473
491
  # Find each marker and snap guides
474
492
  for marker in marker_texts:
475
- if hasattr(target_obj, 'find'):
493
+ if hasattr(target_obj, "find"):
476
494
  element = target_obj.find(f'text:contains("{marker}")')
477
495
  if not element:
478
496
  logger.warning(f"Could not find text '{marker}' for snapping")
479
497
  continue
480
-
498
+
481
499
  # Determine target position based on alignment
482
- if self._axis == 'vertical':
483
- if align == 'left':
500
+ if self._axis == "vertical":
501
+ if align == "left":
484
502
  target_pos = element.x0
485
- elif align == 'right':
503
+ elif align == "right":
486
504
  target_pos = element.x1
487
505
  else: # center
488
506
  target_pos = (element.x0 + element.x1) / 2
489
507
  else: # horizontal
490
- if align == 'left': # top for horizontal
508
+ if align == "left": # top for horizontal
491
509
  target_pos = element.top
492
- elif align == 'right': # bottom for horizontal
510
+ elif align == "right": # bottom for horizontal
493
511
  target_pos = element.bottom
494
512
  else: # center
495
513
  target_pos = (element.top + element.bottom) / 2
496
-
514
+
497
515
  # Find closest guide and snap if within tolerance
498
516
  if self.data:
499
- closest_idx = min(range(len(self.data)),
500
- key=lambda i: abs(self.data[i] - target_pos))
517
+ closest_idx = min(
518
+ range(len(self.data)), key=lambda i: abs(self.data[i] - target_pos)
519
+ )
501
520
  if abs(self.data[closest_idx] - target_pos) <= tolerance:
502
521
  self.data[closest_idx] = target_pos
503
-
522
+
504
523
  # Sort after snapping
505
524
  self.data.sort()
506
525
  return self._parent
507
-
526
+
508
527
  def shift(self, index: int, offset: float) -> "Guides":
509
528
  """
510
529
  Move a specific guide in this axis by a offset amount.
511
-
530
+
512
531
  Args:
513
532
  index: Index of the guide to move
514
533
  offset: Amount to move (positive = right/down)
515
-
534
+
516
535
  Returns:
517
536
  Parent Guides object for chaining
518
537
  """
@@ -521,18 +540,18 @@ class GuidesList(UserList):
521
540
  self.data.sort()
522
541
  else:
523
542
  logger.warning(f"Guide index {index} out of range for {self._axis} axis")
524
-
543
+
525
544
  return self._parent
526
-
545
+
527
546
  def add(self, position: Union[float, List[float]]) -> "Guides":
528
547
  """
529
548
  Add one or more guides at the specified position(s).
530
-
549
+
531
550
  Args:
532
551
  position: Coordinate(s) to add guide(s) at. Can be:
533
552
  - float: single position
534
553
  - List[float]: multiple positions
535
-
554
+
536
555
  Returns:
537
556
  Parent Guides object for chaining
538
557
  """
@@ -543,34 +562,34 @@ class GuidesList(UserList):
543
562
  else:
544
563
  # Add single position
545
564
  self.append(float(position))
546
-
565
+
547
566
  self.data.sort()
548
567
  return self._parent
549
-
568
+
550
569
  def remove_at(self, index: int) -> "Guides":
551
570
  """
552
571
  Remove a guide by index.
553
-
572
+
554
573
  Args:
555
574
  index: Index of guide to remove
556
-
575
+
557
576
  Returns:
558
577
  Parent Guides object for chaining
559
578
  """
560
579
  if 0 <= index < len(self.data):
561
580
  self.data.pop(index)
562
581
  return self._parent
563
-
582
+
564
583
  def clear_all(self) -> "Guides":
565
584
  """
566
585
  Remove all guides from this axis.
567
-
586
+
568
587
  Returns:
569
588
  Parent Guides object for chaining
570
589
  """
571
590
  self.data.clear()
572
591
  return self._parent
573
-
592
+
574
593
  def __add__(self, other):
575
594
  """Handle addition of GuidesList objects by returning combined data."""
576
595
  if isinstance(other, GuidesList):
@@ -584,11 +603,11 @@ class GuidesList(UserList):
584
603
  class Guides:
585
604
  """
586
605
  Manages vertical and horizontal guide lines for table extraction and layout analysis.
587
-
606
+
588
607
  Guides are collections of coordinates that can be used to define table boundaries,
589
608
  column positions, or general layout structures. They can be created through various
590
609
  detection methods or manually specified.
591
-
610
+
592
611
  Attributes:
593
612
  verticals: List of x-coordinates for vertical guide lines
594
613
  horizontals: List of y-coordinates for horizontal guide lines
@@ -596,7 +615,7 @@ class Guides:
596
615
  bounds: Optional bounding box (x0, y0, x1, y1) for relative coordinate conversion
597
616
  snap_behavior: How to handle failed snapping operations ('warn', 'ignore', 'raise')
598
617
  """
599
-
618
+
600
619
  def __init__(
601
620
  self,
602
621
  verticals: Optional[Union[List[float], "Page", "Region"]] = None,
@@ -604,52 +623,63 @@ class Guides:
604
623
  context: Optional[Union["Page", "Region"]] = None,
605
624
  bounds: Optional[Tuple[float, float, float, float]] = None,
606
625
  relative: bool = False,
607
- snap_behavior: Literal['raise', 'warn', 'ignore'] = 'warn'
626
+ snap_behavior: Literal["raise", "warn", "ignore"] = "warn",
608
627
  ):
609
628
  """
610
629
  Initialize a Guides object.
611
-
630
+
612
631
  Args:
613
632
  verticals: List of x-coordinates for vertical guides, or a Page/Region as context
614
- horizontals: List of y-coordinates for horizontal guides
633
+ horizontals: List of y-coordinates for horizontal guides
615
634
  context: Page or Region object these guides were created from
616
635
  bounds: Bounding box (x0, top, x1, bottom) if context not provided
617
636
  relative: Whether coordinates are relative (0-1) or absolute
618
637
  snap_behavior: How to handle snapping conflicts ('raise', 'warn', or 'ignore')
619
638
  """
620
639
  # Handle Guides(page) shorthand
621
- if verticals is not None and not isinstance(verticals, (list, tuple)) and horizontals is None and context is None:
640
+ if (
641
+ verticals is not None
642
+ and not isinstance(verticals, (list, tuple))
643
+ and horizontals is None
644
+ and context is None
645
+ ):
622
646
  # First argument is a page/region, not coordinates
623
647
  context = verticals
624
648
  verticals = None
625
-
649
+
626
650
  self.context = context
627
651
  self.bounds = bounds
628
652
  self.relative = relative
629
653
  self.snap_behavior = snap_behavior
630
-
654
+
631
655
  # Initialize with GuidesList instances
632
656
  self._vertical = GuidesList(self, "vertical", sorted([float(x) for x in (verticals or [])]))
633
- self._horizontal = GuidesList(self, "horizontal", sorted([float(y) for y in (horizontals or [])]))
634
-
657
+ self._horizontal = GuidesList(
658
+ self, "horizontal", sorted([float(y) for y in (horizontals or [])])
659
+ )
660
+
635
661
  # Determine bounds from context if needed
636
662
  if self.bounds is None and self.context is not None:
637
- if hasattr(self.context, 'bbox'):
663
+ if hasattr(self.context, "bbox"):
638
664
  self.bounds = self.context.bbox
639
- elif hasattr(self.context, 'x0'):
640
- self.bounds = (self.context.x0, self.context.top,
641
- self.context.x1, self.context.bottom)
642
-
665
+ elif hasattr(self.context, "x0"):
666
+ self.bounds = (
667
+ self.context.x0,
668
+ self.context.top,
669
+ self.context.x1,
670
+ self.context.bottom,
671
+ )
672
+
643
673
  # Convert relative to absolute if needed
644
674
  if self.relative and self.bounds:
645
675
  x0, top, x1, bottom = self.bounds
646
676
  width = x1 - x0
647
677
  height = bottom - top
648
-
678
+
649
679
  self._vertical.data = [x0 + v * width for v in self._vertical]
650
680
  self._horizontal.data = [top + h * height for h in self._horizontal]
651
681
  self.relative = False
652
-
682
+
653
683
  @property
654
684
  def vertical(self) -> GuidesList:
655
685
  """Get vertical guide coordinates."""
@@ -665,8 +695,10 @@ class Guides:
665
695
  self._vertical.data = sorted([float(x) for x in value.vertical])
666
696
  elif isinstance(value, str):
667
697
  # Explicitly reject strings to avoid confusing iteration over characters
668
- raise TypeError(f"vertical cannot be a string, got '{value}'. Use a list of coordinates or Guides object.")
669
- elif hasattr(value, '__iter__'):
698
+ raise TypeError(
699
+ f"vertical cannot be a string, got '{value}'. Use a list of coordinates or Guides object."
700
+ )
701
+ elif hasattr(value, "__iter__"):
670
702
  # Handle list/tuple of coordinates
671
703
  try:
672
704
  self._vertical.data = sorted([float(x) for x in value])
@@ -690,8 +722,10 @@ class Guides:
690
722
  self._horizontal.data = sorted([float(y) for y in value.horizontal])
691
723
  elif isinstance(value, str):
692
724
  # Explicitly reject strings
693
- raise TypeError(f"horizontal cannot be a string, got '{value}'. Use a list of coordinates or Guides object.")
694
- elif hasattr(value, '__iter__'):
725
+ raise TypeError(
726
+ f"horizontal cannot be a string, got '{value}'. Use a list of coordinates or Guides object."
727
+ )
728
+ elif hasattr(value, "__iter__"):
695
729
  # Handle list/tuple of coordinates
696
730
  try:
697
731
  self._horizontal.data = sorted([float(y) for y in value])
@@ -699,24 +733,24 @@ class Guides:
699
733
  raise TypeError(f"horizontal must contain numeric values, got {value}: {e}")
700
734
  else:
701
735
  raise TypeError(f"horizontal must be a list, Guides object, or None, got {type(value)}")
702
-
736
+
703
737
  def _get_context_bounds(self) -> Optional[Tuple[float, float, float, float]]:
704
738
  """Get bounds from context if available."""
705
739
  if self.context is None:
706
740
  return None
707
-
708
- if hasattr(self.context, 'bbox'):
741
+
742
+ if hasattr(self.context, "bbox"):
709
743
  return self.context.bbox
710
- elif hasattr(self.context, 'x0') and hasattr(self.context, 'top'):
744
+ elif hasattr(self.context, "x0") and hasattr(self.context, "top"):
711
745
  return (self.context.x0, self.context.top, self.context.x1, self.context.bottom)
712
- elif hasattr(self.context, 'width') and hasattr(self.context, 'height'):
746
+ elif hasattr(self.context, "width") and hasattr(self.context, "height"):
713
747
  return (0, 0, self.context.width, self.context.height)
714
748
  return None
715
-
749
+
716
750
  # -------------------------------------------------------------------------
717
751
  # Factory Methods
718
752
  # -------------------------------------------------------------------------
719
-
753
+
720
754
  @classmethod
721
755
  def divide(
722
756
  cls,
@@ -724,28 +758,28 @@ class Guides:
724
758
  n: Optional[int] = None,
725
759
  cols: Optional[int] = None,
726
760
  rows: Optional[int] = None,
727
- axis: Literal['vertical', 'horizontal', 'both'] = 'both'
761
+ axis: Literal["vertical", "horizontal", "both"] = "both",
728
762
  ) -> "Guides":
729
763
  """
730
764
  Create guides by evenly dividing an object.
731
-
765
+
732
766
  Args:
733
767
  obj: Object to divide (Page, Region, or bbox tuple)
734
768
  n: Number of divisions (creates n+1 guides). Used if cols/rows not specified.
735
769
  cols: Number of columns (creates cols+1 vertical guides)
736
770
  rows: Number of rows (creates rows+1 horizontal guides)
737
771
  axis: Which axis to divide along
738
-
772
+
739
773
  Returns:
740
774
  New Guides object with evenly spaced lines
741
-
775
+
742
776
  Examples:
743
777
  # Divide into 3 columns
744
778
  guides = Guides.divide(page, cols=3)
745
-
779
+
746
780
  # Divide into 5 rows
747
781
  guides = Guides.divide(region, rows=5)
748
-
782
+
749
783
  # Divide both axes
750
784
  guides = Guides.divide(page, cols=3, rows=5)
751
785
  """
@@ -755,52 +789,52 @@ class Guides:
755
789
  context = None
756
790
  else:
757
791
  context = obj
758
- if hasattr(obj, 'bbox'):
792
+ if hasattr(obj, "bbox"):
759
793
  bounds = obj.bbox
760
- elif hasattr(obj, 'x0'):
794
+ elif hasattr(obj, "x0"):
761
795
  bounds = (obj.x0, obj.top, obj.x1, obj.bottom)
762
796
  else:
763
797
  bounds = (0, 0, obj.width, obj.height)
764
-
798
+
765
799
  x0, y0, x1, y1 = bounds
766
800
  verticals = []
767
801
  horizontals = []
768
-
802
+
769
803
  # Handle vertical guides
770
- if axis in ('vertical', 'both'):
804
+ if axis in ("vertical", "both"):
771
805
  n_vertical = cols + 1 if cols is not None else (n + 1 if n is not None else 0)
772
806
  if n_vertical > 0:
773
807
  for i in range(n_vertical):
774
808
  x = x0 + (x1 - x0) * i / (n_vertical - 1)
775
809
  verticals.append(float(x))
776
-
810
+
777
811
  # Handle horizontal guides
778
- if axis in ('horizontal', 'both'):
812
+ if axis in ("horizontal", "both"):
779
813
  n_horizontal = rows + 1 if rows is not None else (n + 1 if n is not None else 0)
780
814
  if n_horizontal > 0:
781
815
  for i in range(n_horizontal):
782
816
  y = y0 + (y1 - y0) * i / (n_horizontal - 1)
783
817
  horizontals.append(float(y))
784
-
818
+
785
819
  return cls(verticals=verticals, horizontals=horizontals, context=context, bounds=bounds)
786
-
820
+
787
821
  @classmethod
788
822
  def from_lines(
789
823
  cls,
790
824
  obj: Union["Page", "Region"],
791
- axis: Literal['vertical', 'horizontal', 'both'] = 'both',
792
- threshold: Union[float, str] = 'auto',
825
+ axis: Literal["vertical", "horizontal", "both"] = "both",
826
+ threshold: Union[float, str] = "auto",
793
827
  source_label: Optional[str] = None,
794
828
  max_lines_h: Optional[int] = None,
795
829
  max_lines_v: Optional[int] = None,
796
830
  outer: bool = False,
797
- detection_method: str = 'vector',
831
+ detection_method: str = "vector",
798
832
  resolution: int = 192,
799
- **detect_kwargs
833
+ **detect_kwargs,
800
834
  ) -> "Guides":
801
835
  """
802
836
  Create guides from detected line elements.
803
-
837
+
804
838
  Args:
805
839
  obj: Page or Region to detect lines from
806
840
  axis: Which orientations to detect
@@ -818,108 +852,128 @@ class Guides:
818
852
  - morph_op_h/v: Morphological operations ('open', 'close', 'none')
819
853
  - smoothing_sigma_h/v: Gaussian smoothing sigma
820
854
  - method: 'projection' (default) or 'lsd' (requires opencv)
821
-
855
+
822
856
  Returns:
823
857
  New Guides object with detected line positions
824
858
  """
825
859
  # Get bounds for potential outer guides
826
- if hasattr(obj, 'bbox'):
860
+ if hasattr(obj, "bbox"):
827
861
  bounds = obj.bbox
828
- elif hasattr(obj, 'x0'):
862
+ elif hasattr(obj, "x0"):
829
863
  bounds = (obj.x0, obj.top, obj.x1, obj.bottom)
830
- elif hasattr(obj, 'width'):
864
+ elif hasattr(obj, "width"):
831
865
  bounds = (0, 0, obj.width, obj.height)
832
866
  else:
833
867
  bounds = None
834
-
868
+
835
869
  verticals = []
836
870
  horizontals = []
837
-
838
- if detection_method == 'pixels':
871
+
872
+ if detection_method == "pixels":
839
873
  # Use pixel-based line detection
840
- if not hasattr(obj, 'detect_lines'):
874
+ if not hasattr(obj, "detect_lines"):
841
875
  raise ValueError(f"Object {obj} does not support pixel-based line detection")
842
-
876
+
843
877
  # Set up detection parameters
844
878
  detect_params = {
845
- 'resolution': resolution,
846
- 'source_label': source_label or 'guides_detection',
847
- 'horizontal': axis in ('horizontal', 'both'),
848
- 'vertical': axis in ('vertical', 'both'),
849
- 'replace': True, # Replace any existing lines with this source
850
- 'method': detect_kwargs.get('method', 'projection'),
879
+ "resolution": resolution,
880
+ "source_label": source_label or "guides_detection",
881
+ "horizontal": axis in ("horizontal", "both"),
882
+ "vertical": axis in ("vertical", "both"),
883
+ "replace": True, # Replace any existing lines with this source
884
+ "method": detect_kwargs.get("method", "projection"),
851
885
  }
852
-
886
+
853
887
  # Handle threshold parameter
854
- if threshold == 'auto':
888
+ if threshold == "auto":
855
889
  # Auto mode: use very low thresholds with max_lines constraints
856
- detect_params['peak_threshold_h'] = 0.0
857
- detect_params['peak_threshold_v'] = 0.0
858
- detect_params['max_lines_h'] = max_lines_h
859
- detect_params['max_lines_v'] = max_lines_v
890
+ detect_params["peak_threshold_h"] = 0.0
891
+ detect_params["peak_threshold_v"] = 0.0
892
+ detect_params["max_lines_h"] = max_lines_h
893
+ detect_params["max_lines_v"] = max_lines_v
860
894
  else:
861
895
  # Fixed threshold mode
862
- detect_params['peak_threshold_h'] = float(threshold) if axis in ('horizontal', 'both') else 1.0
863
- detect_params['peak_threshold_v'] = float(threshold) if axis in ('vertical', 'both') else 1.0
864
- detect_params['max_lines_h'] = max_lines_h
865
- detect_params['max_lines_v'] = max_lines_v
866
-
896
+ detect_params["peak_threshold_h"] = (
897
+ float(threshold) if axis in ("horizontal", "both") else 1.0
898
+ )
899
+ detect_params["peak_threshold_v"] = (
900
+ float(threshold) if axis in ("vertical", "both") else 1.0
901
+ )
902
+ detect_params["max_lines_h"] = max_lines_h
903
+ detect_params["max_lines_v"] = max_lines_v
904
+
867
905
  # Add any additional detection parameters
868
- for key in ['min_gap_h', 'min_gap_v', 'binarization_method',
869
- 'adaptive_thresh_block_size', 'adaptive_thresh_C_val',
870
- 'morph_op_h', 'morph_kernel_h', 'morph_op_v', 'morph_kernel_v',
871
- 'smoothing_sigma_h', 'smoothing_sigma_v', 'peak_width_rel_height']:
906
+ for key in [
907
+ "min_gap_h",
908
+ "min_gap_v",
909
+ "binarization_method",
910
+ "adaptive_thresh_block_size",
911
+ "adaptive_thresh_C_val",
912
+ "morph_op_h",
913
+ "morph_kernel_h",
914
+ "morph_op_v",
915
+ "morph_kernel_v",
916
+ "smoothing_sigma_h",
917
+ "smoothing_sigma_v",
918
+ "peak_width_rel_height",
919
+ ]:
872
920
  if key in detect_kwargs:
873
921
  detect_params[key] = detect_kwargs[key]
874
-
922
+
875
923
  # Perform the detection
876
924
  obj.detect_lines(**detect_params)
877
-
925
+
878
926
  # Now get the detected lines and use them
879
- if hasattr(obj, 'lines'):
927
+ if hasattr(obj, "lines"):
880
928
  lines = obj.lines
881
- elif hasattr(obj, 'find_all'):
882
- lines = obj.find_all('line')
929
+ elif hasattr(obj, "find_all"):
930
+ lines = obj.find_all("line")
883
931
  else:
884
932
  lines = []
885
-
933
+
886
934
  # Filter by the source we just used
887
- lines = [l for l in lines if getattr(l, 'source', None) == detect_params['source_label']]
888
-
935
+ lines = [
936
+ l for l in lines if getattr(l, "source", None) == detect_params["source_label"]
937
+ ]
938
+
889
939
  else: # detection_method == 'vector' (default)
890
940
  # Get existing lines from the object
891
- if hasattr(obj, 'lines'):
941
+ if hasattr(obj, "lines"):
892
942
  lines = obj.lines
893
- elif hasattr(obj, 'find_all'):
894
- lines = obj.find_all('line')
943
+ elif hasattr(obj, "find_all"):
944
+ lines = obj.find_all("line")
895
945
  else:
896
946
  logger.warning(f"Object {obj} has no lines or find_all method")
897
947
  lines = []
898
-
948
+
899
949
  # Filter by source if specified
900
950
  if source_label:
901
- lines = [l for l in lines if getattr(l, 'source', None) == source_label]
902
-
951
+ lines = [l for l in lines if getattr(l, "source", None) == source_label]
952
+
903
953
  # Process lines (same logic for both methods)
904
954
  # Separate lines by orientation and collect with metadata for ranking
905
955
  h_line_data = [] # (y_coord, length, line_obj)
906
956
  v_line_data = [] # (x_coord, length, line_obj)
907
-
957
+
908
958
  for line in lines:
909
- if hasattr(line, 'is_horizontal') and hasattr(line, 'is_vertical'):
910
- if line.is_horizontal and axis in ('horizontal', 'both'):
959
+ if hasattr(line, "is_horizontal") and hasattr(line, "is_vertical"):
960
+ if line.is_horizontal and axis in ("horizontal", "both"):
911
961
  # Use the midpoint y-coordinate for horizontal lines
912
962
  y = (line.top + line.bottom) / 2
913
963
  # Calculate line length for ranking
914
- length = getattr(line, 'width', abs(getattr(line, 'x1', 0) - getattr(line, 'x0', 0)))
964
+ length = getattr(
965
+ line, "width", abs(getattr(line, "x1", 0) - getattr(line, "x0", 0))
966
+ )
915
967
  h_line_data.append((y, length, line))
916
- elif line.is_vertical and axis in ('vertical', 'both'):
968
+ elif line.is_vertical and axis in ("vertical", "both"):
917
969
  # Use the midpoint x-coordinate for vertical lines
918
970
  x = (line.x0 + line.x1) / 2
919
971
  # Calculate line length for ranking
920
- length = getattr(line, 'height', abs(getattr(line, 'bottom', 0) - getattr(line, 'top', 0)))
972
+ length = getattr(
973
+ line, "height", abs(getattr(line, "bottom", 0) - getattr(line, "top", 0))
974
+ )
921
975
  v_line_data.append((x, length, line))
922
-
976
+
923
977
  # Process horizontal lines
924
978
  if max_lines_h is not None and h_line_data:
925
979
  # Sort by length (longer lines are typically more significant)
@@ -928,12 +982,14 @@ class Guides:
928
982
  selected_h = h_line_data[:max_lines_h]
929
983
  # Extract just the coordinates and sort by position
930
984
  horizontals = sorted([coord for coord, _, _ in selected_h])
931
- logger.debug(f"Selected {len(horizontals)} horizontal lines from {len(h_line_data)} candidates")
985
+ logger.debug(
986
+ f"Selected {len(horizontals)} horizontal lines from {len(h_line_data)} candidates"
987
+ )
932
988
  else:
933
989
  # Use all horizontal lines (original behavior)
934
990
  horizontals = [coord for coord, _, _ in h_line_data]
935
991
  horizontals = sorted(list(set(horizontals)))
936
-
992
+
937
993
  # Process vertical lines
938
994
  if max_lines_v is not None and v_line_data:
939
995
  # Sort by length (longer lines are typically more significant)
@@ -942,115 +998,117 @@ class Guides:
942
998
  selected_v = v_line_data[:max_lines_v]
943
999
  # Extract just the coordinates and sort by position
944
1000
  verticals = sorted([coord for coord, _, _ in selected_v])
945
- logger.debug(f"Selected {len(verticals)} vertical lines from {len(v_line_data)} candidates")
1001
+ logger.debug(
1002
+ f"Selected {len(verticals)} vertical lines from {len(v_line_data)} candidates"
1003
+ )
946
1004
  else:
947
1005
  # Use all vertical lines (original behavior)
948
1006
  verticals = [coord for coord, _, _ in v_line_data]
949
1007
  verticals = sorted(list(set(verticals)))
950
-
1008
+
951
1009
  # Add outer guides if requested
952
1010
  if outer and bounds:
953
- if axis in ('vertical', 'both'):
1011
+ if axis in ("vertical", "both"):
954
1012
  if not verticals or verticals[0] > bounds[0]:
955
1013
  verticals.insert(0, bounds[0]) # x0
956
1014
  if not verticals or verticals[-1] < bounds[2]:
957
1015
  verticals.append(bounds[2]) # x1
958
- if axis in ('horizontal', 'both'):
1016
+ if axis in ("horizontal", "both"):
959
1017
  if not horizontals or horizontals[0] > bounds[1]:
960
1018
  horizontals.insert(0, bounds[1]) # y0
961
1019
  if not horizontals or horizontals[-1] < bounds[3]:
962
1020
  horizontals.append(bounds[3]) # y1
963
-
1021
+
964
1022
  # Remove duplicates and sort again
965
1023
  verticals = sorted(list(set(verticals)))
966
1024
  horizontals = sorted(list(set(horizontals)))
967
-
1025
+
968
1026
  return cls(verticals=verticals, horizontals=horizontals, context=obj, bounds=bounds)
969
-
1027
+
970
1028
  @classmethod
971
1029
  def from_content(
972
1030
  cls,
973
1031
  obj: Union["Page", "Region"],
974
- axis: Literal['vertical', 'horizontal'] = 'vertical',
1032
+ axis: Literal["vertical", "horizontal"] = "vertical",
975
1033
  markers: Union[str, List[str], "ElementCollection", None] = None,
976
- align: Literal['left', 'right', 'center', 'between'] = 'left',
1034
+ align: Literal["left", "right", "center", "between"] = "left",
977
1035
  outer: bool = True,
978
- tolerance: float = 5
1036
+ tolerance: float = 5,
979
1037
  ) -> "Guides":
980
1038
  """
981
1039
  Create guides based on text content positions.
982
-
1040
+
983
1041
  Args:
984
1042
  obj: Page or Region to search for content
985
1043
  axis: Whether to create vertical or horizontal guides
986
1044
  markers: Content to search for. Can be:
987
1045
  - str: single selector (e.g., 'text:contains("Name")') or literal text
988
- - List[str]: list of selectors or literal text strings
1046
+ - List[str]: list of selectors or literal text strings
989
1047
  - ElementCollection: collection of elements to extract text from
990
1048
  - None: no markers
991
1049
  align: Where to place guides relative to found text
992
1050
  outer: Whether to add guides at the boundaries
993
1051
  tolerance: Maximum distance to search for text
994
-
1052
+
995
1053
  Returns:
996
1054
  New Guides object aligned to text content
997
1055
  """
998
1056
  guides_coords = []
999
1057
  bounds = None
1000
-
1058
+
1001
1059
  # Get bounds from object
1002
- if hasattr(obj, 'bbox'):
1060
+ if hasattr(obj, "bbox"):
1003
1061
  bounds = obj.bbox
1004
- elif hasattr(obj, 'x0'):
1062
+ elif hasattr(obj, "x0"):
1005
1063
  bounds = (obj.x0, obj.top, obj.x1, obj.bottom)
1006
- elif hasattr(obj, 'width'):
1064
+ elif hasattr(obj, "width"):
1007
1065
  bounds = (0, 0, obj.width, obj.height)
1008
-
1066
+
1009
1067
  # Normalize markers to list of text strings
1010
1068
  marker_texts = _normalize_markers(markers, obj)
1011
-
1069
+
1012
1070
  # Find each marker and determine guide position
1013
1071
  for marker in marker_texts:
1014
- if hasattr(obj, 'find'):
1072
+ if hasattr(obj, "find"):
1015
1073
  element = obj.find(f'text:contains("{marker}")')
1016
1074
  if element:
1017
- if axis == 'vertical':
1018
- if align == 'left':
1075
+ if axis == "vertical":
1076
+ if align == "left":
1019
1077
  guides_coords.append(element.x0)
1020
- elif align == 'right':
1078
+ elif align == "right":
1021
1079
  guides_coords.append(element.x1)
1022
- elif align == 'center':
1080
+ elif align == "center":
1023
1081
  guides_coords.append((element.x0 + element.x1) / 2)
1024
- elif align == 'between':
1082
+ elif align == "between":
1025
1083
  # For between, collect left edges for processing later
1026
1084
  guides_coords.append(element.x0)
1027
1085
  else: # horizontal
1028
- if align == 'left': # top for horizontal
1086
+ if align == "left": # top for horizontal
1029
1087
  guides_coords.append(element.top)
1030
- elif align == 'right': # bottom for horizontal
1088
+ elif align == "right": # bottom for horizontal
1031
1089
  guides_coords.append(element.bottom)
1032
- elif align == 'center':
1090
+ elif align == "center":
1033
1091
  guides_coords.append((element.top + element.bottom) / 2)
1034
- elif align == 'between':
1092
+ elif align == "between":
1035
1093
  # For between, collect top edges for processing later
1036
1094
  guides_coords.append(element.top)
1037
-
1095
+
1038
1096
  # Handle 'between' alignment - find midpoints between adjacent markers
1039
- if align == 'between' and len(guides_coords) >= 2:
1097
+ if align == "between" and len(guides_coords) >= 2:
1040
1098
  # We need to get the right and left edges of each marker
1041
1099
  marker_bounds = []
1042
1100
  for marker in marker_texts:
1043
- if hasattr(obj, 'find'):
1101
+ if hasattr(obj, "find"):
1044
1102
  element = obj.find(f'text:contains("{marker}")')
1045
1103
  if element:
1046
- if axis == 'vertical':
1104
+ if axis == "vertical":
1047
1105
  marker_bounds.append((element.x0, element.x1))
1048
1106
  else: # horizontal
1049
1107
  marker_bounds.append((element.top, element.bottom))
1050
-
1108
+
1051
1109
  # Sort markers by their left edge (or top edge for horizontal)
1052
1110
  marker_bounds.sort(key=lambda x: x[0])
1053
-
1111
+
1054
1112
  # Create guides at midpoints between adjacent markers
1055
1113
  between_coords = []
1056
1114
  for i in range(len(marker_bounds) - 1):
@@ -1059,79 +1117,78 @@ class Guides:
1059
1117
  left_edge_next = marker_bounds[i + 1][0]
1060
1118
  midpoint = (right_edge_current + left_edge_next) / 2
1061
1119
  between_coords.append(midpoint)
1062
-
1120
+
1063
1121
  guides_coords = between_coords
1064
-
1122
+
1065
1123
  # Add outer guides if requested
1066
1124
  if outer and bounds:
1067
- if axis == 'vertical':
1125
+ if axis == "vertical":
1068
1126
  guides_coords.insert(0, bounds[0]) # x0
1069
- guides_coords.append(bounds[2]) # x1
1127
+ guides_coords.append(bounds[2]) # x1
1070
1128
  else:
1071
1129
  guides_coords.insert(0, bounds[1]) # y0
1072
- guides_coords.append(bounds[3]) # y1
1073
-
1130
+ guides_coords.append(bounds[3]) # y1
1131
+
1074
1132
  # Remove duplicates and sort
1075
1133
  guides_coords = sorted(list(set(guides_coords)))
1076
-
1134
+
1077
1135
  # Create guides object
1078
- if axis == 'vertical':
1136
+ if axis == "vertical":
1079
1137
  return cls(verticals=guides_coords, context=obj, bounds=bounds)
1080
1138
  else:
1081
1139
  return cls(horizontals=guides_coords, context=obj, bounds=bounds)
1082
-
1140
+
1083
1141
  @classmethod
1084
1142
  def from_whitespace(
1085
1143
  cls,
1086
1144
  obj: Union["Page", "Region"],
1087
- axis: Literal['vertical', 'horizontal', 'both'] = 'both',
1088
- min_gap: float = 10
1145
+ axis: Literal["vertical", "horizontal", "both"] = "both",
1146
+ min_gap: float = 10,
1089
1147
  ) -> "Guides":
1090
1148
  """
1091
1149
  Create guides by detecting whitespace gaps.
1092
-
1150
+
1093
1151
  Args:
1094
1152
  obj: Page or Region to analyze
1095
1153
  min_gap: Minimum gap size to consider as whitespace
1096
1154
  axis: Which axes to analyze for gaps
1097
-
1155
+
1098
1156
  Returns:
1099
1157
  New Guides object positioned at whitespace gaps
1100
1158
  """
1101
1159
  # This is a placeholder - would need sophisticated gap detection
1102
1160
  logger.info("Whitespace detection not yet implemented, using divide instead")
1103
1161
  return cls.divide(obj, n=3, axis=axis)
1104
-
1162
+
1105
1163
  @classmethod
1106
- def new(
1107
- cls,
1108
- context: Optional[Union["Page", "Region"]] = None
1109
- ) -> "Guides":
1164
+ def new(cls, context: Optional[Union["Page", "Region"]] = None) -> "Guides":
1110
1165
  """
1111
1166
  Create a new empty Guides object, optionally with a context.
1112
-
1167
+
1113
1168
  This provides a clean way to start building guides through chaining:
1114
1169
  guides = Guides.new(page).add_content(axis='vertical', markers=[...])
1115
-
1170
+
1116
1171
  Args:
1117
1172
  context: Optional Page or Region to use as default context for operations
1118
-
1173
+
1119
1174
  Returns:
1120
1175
  New empty Guides object
1121
1176
  """
1122
1177
  return cls(verticals=[], horizontals=[], context=context)
1123
-
1178
+
1124
1179
  # -------------------------------------------------------------------------
1125
1180
  # Manipulation Methods
1126
1181
  # -------------------------------------------------------------------------
1127
-
1182
+
1128
1183
  def snap_to_whitespace(
1129
1184
  self,
1130
- axis: str = 'vertical',
1185
+ axis: str = "vertical",
1131
1186
  min_gap: float = 10.0,
1132
- detection_method: str = 'pixels', # 'pixels' or 'text'
1133
- threshold: Union[float, str] = 'auto', # threshold for what counts as a trough (0.0-1.0) or 'auto'
1134
- on_no_snap: str = 'warn'
1187
+ detection_method: str = "pixels", # 'pixels' or 'text'
1188
+ threshold: Union[
1189
+ float, str
1190
+ ] = "auto", # threshold for what counts as a trough (0.0-1.0) or 'auto'
1191
+ on_no_snap: str = "warn",
1135
1192
  ) -> "Guides":
1136
1193
  """
1137
1194
  Snap guides to nearby whitespace gaps (troughs) using optimal assignment.
@@ -1161,11 +1218,11 @@ class Guides:
1161
1218
  logger.warning("No text elements found for whitespace detection")
1162
1219
  return self
1163
1220
 
1164
- if axis == 'vertical':
1221
+ if axis == "vertical":
1165
1222
  gaps = self._find_vertical_whitespace_gaps(text_elements, min_gap, threshold)
1166
1223
  if gaps:
1167
1224
  self._snap_guides_to_gaps(self.vertical.data, gaps, axis)
1168
- elif axis == 'horizontal':
1225
+ elif axis == "horizontal":
1169
1226
  gaps = self._find_horizontal_whitespace_gaps(text_elements, min_gap, threshold)
1170
1227
  if gaps:
1171
1228
  self._snap_guides_to_gaps(self.horizontal.data, gaps, axis)
@@ -1177,25 +1234,22 @@ class Guides:
1177
1234
  self.horizontal.data[:] = [float(y) for y in self.horizontal.data]
1178
1235
 
1179
1236
  return self
1180
-
1237
+
1181
1238
  def shift(
1182
- self,
1183
- index: int,
1184
- offset: float,
1185
- axis: Literal['vertical', 'horizontal'] = 'vertical'
1239
+ self, index: int, offset: float, axis: Literal["vertical", "horizontal"] = "vertical"
1186
1240
  ) -> "Guides":
1187
1241
  """
1188
1242
  Move a specific guide by a offset amount.
1189
-
1243
+
1190
1244
  Args:
1191
1245
  index: Index of the guide to move
1192
1246
  offset: Amount to move (positive = right/down)
1193
1247
  axis: Which guide list to modify
1194
-
1248
+
1195
1249
  Returns:
1196
1250
  Self for method chaining
1197
1251
  """
1198
- if axis == 'vertical':
1252
+ if axis == "vertical":
1199
1253
  if 0 <= index < len(self.vertical):
1200
1254
  self.vertical[index] += offset
1201
1255
  self.vertical = sorted(self.vertical)
@@ -1207,123 +1261,127 @@ class Guides:
1207
1261
  self.horizontal = sorted(self.horizontal)
1208
1262
  else:
1209
1263
  logger.warning(f"Horizontal guide index {index} out of range")
1210
-
1264
+
1211
1265
  return self
1212
-
1266
+
1213
1267
  def add_vertical(self, x: float) -> "Guides":
1214
1268
  """Add a vertical guide at the specified x-coordinate."""
1215
1269
  self.vertical.append(x)
1216
1270
  self.vertical = sorted(self.vertical)
1217
1271
  return self
1218
-
1272
+
1219
1273
  def add_horizontal(self, y: float) -> "Guides":
1220
1274
  """Add a horizontal guide at the specified y-coordinate."""
1221
1275
  self.horizontal.append(y)
1222
1276
  self.horizontal = sorted(self.horizontal)
1223
1277
  return self
1224
-
1278
+
1225
1279
  def remove_vertical(self, index: int) -> "Guides":
1226
1280
  """Remove a vertical guide by index."""
1227
1281
  if 0 <= index < len(self.vertical):
1228
1282
  self.vertical.pop(index)
1229
1283
  return self
1230
-
1284
+
1231
1285
  def remove_horizontal(self, index: int) -> "Guides":
1232
1286
  """Remove a horizontal guide by index."""
1233
1287
  if 0 <= index < len(self.horizontal):
1234
1288
  self.horizontal.pop(index)
1235
1289
  return self
1236
-
1290
+
1237
1291
  # -------------------------------------------------------------------------
1238
1292
  # Operations
1239
1293
  # -------------------------------------------------------------------------
1240
-
1294
+
1241
1295
  def __add__(self, other: "Guides") -> "Guides":
1242
1296
  """
1243
1297
  Combine two guide sets.
1244
-
1298
+
1245
1299
  Returns:
1246
1300
  New Guides object with combined coordinates
1247
1301
  """
1248
1302
  # Combine and deduplicate coordinates, ensuring Python floats
1249
1303
  combined_verticals = sorted([float(x) for x in set(self.vertical + other.vertical)])
1250
1304
  combined_horizontals = sorted([float(y) for y in set(self.horizontal + other.horizontal)])
1251
-
1305
+
1252
1306
  # Use context from self if available
1253
1307
  return Guides(
1254
1308
  verticals=combined_verticals,
1255
1309
  horizontals=combined_horizontals,
1256
1310
  context=self.context or other.context,
1257
- bounds=self.bounds or other.bounds
1311
+ bounds=self.bounds or other.bounds,
1258
1312
  )
1259
-
1313
+
1260
1314
  def show(self, on=None, **kwargs):
1261
1315
  """
1262
1316
  Display the guides overlaid on a page or region.
1263
-
1317
+
1264
1318
  Args:
1265
1319
  on: Page, Region, PIL Image, or string to display guides on.
1266
1320
  If None, uses self.context (the object guides were created from).
1267
1321
  If string 'page', uses the page from self.context.
1268
1322
  **kwargs: Additional arguments passed to to_image() if applicable.
1269
-
1323
+
1270
1324
  Returns:
1271
1325
  PIL Image with guides drawn on it.
1272
1326
  """
1273
1327
  # Determine what to display guides on
1274
1328
  target = on if on is not None else self.context
1275
-
1329
+
1276
1330
  # Handle string shortcuts
1277
1331
  if isinstance(target, str):
1278
- if target == 'page':
1279
- if hasattr(self.context, 'page'):
1332
+ if target == "page":
1333
+ if hasattr(self.context, "page"):
1280
1334
  target = self.context.page
1281
- elif hasattr(self.context, '_page'):
1335
+ elif hasattr(self.context, "_page"):
1282
1336
  target = self.context._page
1283
1337
  else:
1284
1338
  raise ValueError("Cannot resolve 'page' - context has no page attribute")
1285
1339
  else:
1286
1340
  raise ValueError(f"Unknown string target: {target}. Only 'page' is supported.")
1287
-
1341
+
1288
1342
  if target is None:
1289
1343
  raise ValueError("No target specified and no context available for guides display")
1290
-
1344
+
1291
1345
  # Prepare kwargs for image generation
1292
1346
  image_kwargs = kwargs.copy()
1293
-
1347
+
1294
1348
  # Always turn off highlights to avoid visual clutter
1295
- image_kwargs['include_highlights'] = False
1296
-
1349
+ image_kwargs["include_highlights"] = False
1350
+
1297
1351
  # If target is a region-like object, crop to just that region
1298
- if hasattr(target, 'bbox') and hasattr(target, 'page'):
1352
+ if hasattr(target, "bbox") and hasattr(target, "page"):
1299
1353
  # This is likely a Region
1300
- image_kwargs['crop'] = True
1301
-
1354
+ image_kwargs["crop"] = True
1355
+
1302
1356
  # Get base image
1303
- if hasattr(target, 'to_image'):
1357
+ if hasattr(target, "to_image"):
1304
1358
  img = target.to_image(**image_kwargs)
1305
- elif hasattr(target, 'mode') and hasattr(target, 'size'):
1359
+ elif hasattr(target, "mode") and hasattr(target, "size"):
1306
1360
  # It's already a PIL Image
1307
1361
  img = target
1308
1362
  else:
1309
1363
  raise ValueError(f"Object {target} does not support to_image() and is not a PIL Image")
1310
-
1364
+
1311
1365
  if img is None:
1312
1366
  raise ValueError("Failed to generate base image")
1313
-
1367
+
1314
1368
  # Create a copy to draw on
1315
1369
  img = img.copy()
1316
1370
  draw = ImageDraw.Draw(img)
1317
-
1371
+
1318
1372
  # Determine scale factor for coordinate conversion
1319
- if hasattr(target, 'width') and hasattr(target, 'height') and not (hasattr(target, 'mode') and hasattr(target, 'size')):
1373
+ if (
1374
+ hasattr(target, "width")
1375
+ and hasattr(target, "height")
1376
+ and not (hasattr(target, "mode") and hasattr(target, "size"))
1377
+ ):
1320
1378
  # target is a PDF object (Page/Region) with PDF coordinates
1321
1379
  scale_x = img.width / target.width
1322
1380
  scale_y = img.height / target.height
1323
-
1381
+
1324
1382
  # If we're showing guides on a region, we need to adjust coordinates
1325
1383
  # to be relative to the region's origin
1326
- if hasattr(target, 'bbox') and hasattr(target, 'page'):
1384
+ if hasattr(target, "bbox") and hasattr(target, "page"):
1327
1385
  # This is a Region - adjust guide coordinates to be relative to region
1328
1386
  region_x0, region_top = target.x0, target.top
1329
1387
  else:
@@ -1334,7 +1392,7 @@ class Guides:
1334
1392
  scale_x = 1.0
1335
1393
  scale_y = 1.0
1336
1394
  region_x0, region_top = 0, 0
1337
-
1395
+
1338
1396
  # Draw vertical guides (blue)
1339
1397
  for x_coord in self.vertical:
1340
1398
  # Adjust coordinate if we're showing on a region
@@ -1344,8 +1402,8 @@ class Guides:
1344
1402
  if 0 <= pixel_x <= img.width - 1:
1345
1403
  x_pixel = int(min(pixel_x, img.width - 1))
1346
1404
  draw.line([(x_pixel, 0), (x_pixel, img.height - 1)], fill=(0, 0, 255, 200), width=2)
1347
-
1348
- # Draw horizontal guides (red)
1405
+
1406
+ # Draw horizontal guides (red)
1349
1407
  for y_coord in self.horizontal:
1350
1408
  # Adjust coordinate if we're showing on a region
1351
1409
  adjusted_y = y_coord - region_top
@@ -1354,22 +1412,22 @@ class Guides:
1354
1412
  if 0 <= pixel_y <= img.height - 1:
1355
1413
  y_pixel = int(min(pixel_y, img.height - 1))
1356
1414
  draw.line([(0, y_pixel), (img.width - 1, y_pixel)], fill=(255, 0, 0, 200), width=2)
1357
-
1415
+
1358
1416
  return img
1359
-
1417
+
1360
1418
  # -------------------------------------------------------------------------
1361
1419
  # Utility Methods
1362
1420
  # -------------------------------------------------------------------------
1363
-
1421
+
1364
1422
  def get_cells(self) -> List[Tuple[float, float, float, float]]:
1365
1423
  """
1366
1424
  Get all cell bounding boxes from guide intersections.
1367
-
1425
+
1368
1426
  Returns:
1369
1427
  List of (x0, y0, x1, y1) tuples for each cell
1370
1428
  """
1371
1429
  cells = []
1372
-
1430
+
1373
1431
  # Create cells from guide intersections
1374
1432
  for i in range(len(self.vertical) - 1):
1375
1433
  for j in range(len(self.horizontal) - 1):
@@ -1378,135 +1436,139 @@ class Guides:
1378
1436
  y0 = self.horizontal[j]
1379
1437
  y1 = self.horizontal[j + 1]
1380
1438
  cells.append((x0, y0, x1, y1))
1381
-
1439
+
1382
1440
  return cells
1383
-
1441
+
1384
1442
  def to_dict(self) -> Dict[str, Any]:
1385
1443
  """
1386
1444
  Convert to dictionary format suitable for pdfplumber table_settings.
1387
-
1445
+
1388
1446
  Returns:
1389
1447
  Dictionary with explicit_vertical_lines and explicit_horizontal_lines
1390
1448
  """
1391
1449
  return {
1392
- 'explicit_vertical_lines': self.vertical,
1393
- 'explicit_horizontal_lines': self.horizontal
1450
+ "explicit_vertical_lines": self.vertical,
1451
+ "explicit_horizontal_lines": self.horizontal,
1394
1452
  }
1395
-
1453
+
1396
1454
  def to_relative(self) -> "Guides":
1397
1455
  """
1398
1456
  Convert absolute coordinates to relative (0-1) coordinates.
1399
-
1457
+
1400
1458
  Returns:
1401
1459
  New Guides object with relative coordinates
1402
1460
  """
1403
1461
  if self.relative:
1404
1462
  return self # Already relative
1405
-
1463
+
1406
1464
  if not self.bounds:
1407
1465
  raise ValueError("Cannot convert to relative without bounds")
1408
-
1466
+
1409
1467
  x0, y0, x1, y1 = self.bounds
1410
1468
  width = x1 - x0
1411
1469
  height = y1 - y0
1412
-
1470
+
1413
1471
  rel_verticals = [(x - x0) / width for x in self.vertical]
1414
1472
  rel_horizontals = [(y - y0) / height for y in self.horizontal]
1415
-
1473
+
1416
1474
  return Guides(
1417
1475
  verticals=rel_verticals,
1418
1476
  horizontals=rel_horizontals,
1419
1477
  context=self.context,
1420
1478
  bounds=(0, 0, 1, 1),
1421
- relative=True
1479
+ relative=True,
1422
1480
  )
1423
-
1481
+
1424
1482
  def to_absolute(self, bounds: Tuple[float, float, float, float]) -> "Guides":
1425
1483
  """
1426
1484
  Convert relative coordinates to absolute coordinates.
1427
-
1485
+
1428
1486
  Args:
1429
1487
  bounds: Target bounding box (x0, y0, x1, y1)
1430
-
1488
+
1431
1489
  Returns:
1432
1490
  New Guides object with absolute coordinates
1433
1491
  """
1434
1492
  if not self.relative:
1435
1493
  return self # Already absolute
1436
-
1494
+
1437
1495
  x0, y0, x1, y1 = bounds
1438
1496
  width = x1 - x0
1439
1497
  height = y1 - y0
1440
-
1498
+
1441
1499
  abs_verticals = [x0 + x * width for x in self.vertical]
1442
1500
  abs_horizontals = [y0 + y * height for y in self.horizontal]
1443
-
1501
+
1444
1502
  return Guides(
1445
1503
  verticals=abs_verticals,
1446
1504
  horizontals=abs_horizontals,
1447
1505
  context=self.context,
1448
1506
  bounds=bounds,
1449
- relative=False
1507
+ relative=False,
1450
1508
  )
1451
-
1509
+
1452
1510
  @property
1453
1511
  def n_rows(self) -> int:
1454
1512
  """Number of rows defined by horizontal guides."""
1455
1513
  return max(0, len(self.horizontal) - 1)
1456
-
1514
+
1457
1515
  @property
1458
1516
  def n_cols(self) -> int:
1459
1517
  """Number of columns defined by vertical guides."""
1460
1518
  return max(0, len(self.vertical) - 1)
1461
-
1519
+
1462
1520
  def _handle_snap_failure(self, message: str):
1463
1521
  """Handle cases where snapping cannot be performed."""
1464
- if hasattr(self, 'on_no_snap'):
1465
- if self.on_no_snap == 'warn':
1522
+ if hasattr(self, "on_no_snap"):
1523
+ if self.on_no_snap == "warn":
1466
1524
  logger.warning(message)
1467
- elif self.on_no_snap == 'raise':
1525
+ elif self.on_no_snap == "raise":
1468
1526
  raise ValueError(message)
1469
1527
  # 'ignore' case: do nothing
1470
1528
  else:
1471
1529
  logger.warning(message) # Default behavior
1472
1530
 
1473
- def _find_vertical_whitespace_gaps(self, text_elements, min_gap: float, threshold: Union[float, str] = 'auto') -> List[Tuple[float, float]]:
1531
+ def _find_vertical_whitespace_gaps(
1532
+ self, text_elements, min_gap: float, threshold: Union[float, str] = "auto"
1533
+ ) -> List[Tuple[float, float]]:
1474
1534
  """
1475
1535
  Find vertical whitespace gaps using bbox-based density analysis.
1476
1536
  Returns list of (start, end) tuples representing trough ranges.
1477
1537
  """
1478
1538
  if not self.bounds:
1479
1539
  return []
1480
-
1540
+
1481
1541
  x0, _, x1, _ = self.bounds
1482
1542
  width_pixels = int(x1 - x0)
1483
-
1543
+
1484
1544
  if width_pixels <= 0:
1485
1545
  return []
1486
-
1546
+
1487
1547
  # Create density histogram: count bbox overlaps per x-coordinate
1488
1548
  density = np.zeros(width_pixels)
1489
-
1549
+
1490
1550
  for element in text_elements:
1491
- if not hasattr(element, 'x0') or not hasattr(element, 'x1'):
1551
+ if not hasattr(element, "x0") or not hasattr(element, "x1"):
1492
1552
  continue
1493
-
1553
+
1494
1554
  # Clip coordinates to bounds
1495
1555
  elem_x0 = max(x0, element.x0) - x0
1496
1556
  elem_x1 = min(x1, element.x1) - x0
1497
-
1557
+
1498
1558
  if elem_x1 > elem_x0:
1499
1559
  start_px = int(elem_x0)
1500
1560
  end_px = int(elem_x1)
1501
1561
  density[start_px:end_px] += 1
1502
-
1562
+
1503
1563
  if density.max() == 0:
1504
1564
  return []
1505
-
1565
+
1506
1566
  # Determine the threshold value
1507
- if threshold == 'auto':
1567
+ if threshold == "auto":
1508
1568
  # Auto mode: try different thresholds with step 0.05 until we have enough troughs
1509
- guides_needing_troughs = len([g for i, g in enumerate(self.vertical) if 0 < i < len(self.vertical) - 1])
1569
+ guides_needing_troughs = len(
1570
+ [g for i, g in enumerate(self.vertical) if 0 < i < len(self.vertical) - 1]
1571
+ )
1510
1572
  if guides_needing_troughs == 0:
1511
1573
  threshold_val = 0.5 # Default when no guides need placement
1512
1574
  else:
@@ -1515,9 +1577,11 @@ class Guides:
1515
1577
  test_gaps = self._find_gaps_with_threshold(density, test_threshold, min_gap, x0)
1516
1578
  if len(test_gaps) >= guides_needing_troughs:
1517
1579
  threshold_val = test_threshold
1518
- logger.debug(f"Auto threshold found: {test_threshold:.2f} (found {len(test_gaps)} troughs for {guides_needing_troughs} guides)")
1580
+ logger.debug(
1581
+ f"Auto threshold found: {test_threshold:.2f} (found {len(test_gaps)} troughs for {guides_needing_troughs} guides)"
1582
+ )
1519
1583
  break
1520
-
1584
+
1521
1585
  if threshold_val is None:
1522
1586
  threshold_val = 0.8 # Fallback to permissive threshold
1523
1587
  logger.debug(f"Auto threshold fallback to {threshold_val}")
@@ -1526,93 +1590,103 @@ class Guides:
1526
1590
  if not isinstance(threshold, (int, float)) or not (0.0 <= threshold <= 1.0):
1527
1591
  raise ValueError("threshold must be a number between 0.0 and 1.0, or 'auto'")
1528
1592
  threshold_val = float(threshold)
1529
-
1593
+
1530
1594
  return self._find_gaps_with_threshold(density, threshold_val, min_gap, x0)
1531
-
1595
+
1532
1596
  def _find_gaps_with_threshold(self, density, threshold_val, min_gap, x0):
1533
1597
  """Helper method to find gaps given a specific threshold value."""
1534
1598
  max_density = density.max()
1535
1599
  threshold_density = threshold_val * max_density
1536
-
1600
+
1537
1601
  # Smooth the density for better trough detection
1538
1602
  from scipy.ndimage import gaussian_filter1d
1603
+
1539
1604
  smoothed_density = gaussian_filter1d(density.astype(float), sigma=1.0)
1540
-
1605
+
1541
1606
  # Find regions below threshold
1542
1607
  below_threshold = smoothed_density <= threshold_density
1543
-
1608
+
1544
1609
  # Find contiguous regions
1545
1610
  from scipy.ndimage import label as nd_label
1611
+
1546
1612
  labeled_regions, num_regions = nd_label(below_threshold)
1547
-
1613
+
1548
1614
  gaps = []
1549
1615
  for region_id in range(1, num_regions + 1):
1550
1616
  region_mask = labeled_regions == region_id
1551
1617
  region_indices = np.where(region_mask)[0]
1552
-
1618
+
1553
1619
  if len(region_indices) == 0:
1554
1620
  continue
1555
-
1621
+
1556
1622
  start_px = region_indices[0]
1557
1623
  end_px = region_indices[-1] + 1
1558
-
1624
+
1559
1625
  # Convert back to PDF coordinates
1560
1626
  start_pdf = x0 + start_px
1561
1627
  end_pdf = x0 + end_px
1562
-
1628
+
1563
1629
  # Check minimum gap size
1564
1630
  if end_pdf - start_pdf >= min_gap:
1565
1631
  gaps.append((start_pdf, end_pdf))
1566
-
1632
+
1567
1633
  return gaps
1568
1634
 
1569
- def _find_horizontal_whitespace_gaps(self, text_elements, min_gap: float, threshold: Union[float, str] = 'auto') -> List[Tuple[float, float]]:
1635
+ def _find_horizontal_whitespace_gaps(
1636
+ self, text_elements, min_gap: float, threshold: Union[float, str] = "auto"
1637
+ ) -> List[Tuple[float, float]]:
1570
1638
  """
1571
1639
  Find horizontal whitespace gaps using bbox-based density analysis.
1572
1640
  Returns list of (start, end) tuples representing trough ranges.
1573
1641
  """
1574
1642
  if not self.bounds:
1575
1643
  return []
1576
-
1644
+
1577
1645
  _, y0, _, y1 = self.bounds
1578
1646
  height_pixels = int(y1 - y0)
1579
-
1647
+
1580
1648
  if height_pixels <= 0:
1581
1649
  return []
1582
-
1583
- # Create density histogram: count bbox overlaps per y-coordinate
1650
+
1651
+ # Create density histogram: count bbox overlaps per y-coordinate
1584
1652
  density = np.zeros(height_pixels)
1585
-
1653
+
1586
1654
  for element in text_elements:
1587
- if not hasattr(element, 'top') or not hasattr(element, 'bottom'):
1655
+ if not hasattr(element, "top") or not hasattr(element, "bottom"):
1588
1656
  continue
1589
-
1657
+
1590
1658
  # Clip coordinates to bounds
1591
1659
  elem_top = max(y0, element.top) - y0
1592
1660
  elem_bottom = min(y1, element.bottom) - y0
1593
-
1661
+
1594
1662
  if elem_bottom > elem_top:
1595
1663
  start_px = int(elem_top)
1596
1664
  end_px = int(elem_bottom)
1597
1665
  density[start_px:end_px] += 1
1598
-
1666
+
1599
1667
  if density.max() == 0:
1600
1668
  return []
1601
-
1669
+
1602
1670
  # Determine the threshold value (same logic as vertical)
1603
- if threshold == 'auto':
1604
- guides_needing_troughs = len([g for i, g in enumerate(self.horizontal) if 0 < i < len(self.horizontal) - 1])
1671
+ if threshold == "auto":
1672
+ guides_needing_troughs = len(
1673
+ [g for i, g in enumerate(self.horizontal) if 0 < i < len(self.horizontal) - 1]
1674
+ )
1605
1675
  if guides_needing_troughs == 0:
1606
1676
  threshold_val = 0.5 # Default when no guides need placement
1607
1677
  else:
1608
1678
  threshold_val = None
1609
1679
  for test_threshold in np.arange(0.1, 1.0, 0.05):
1610
- test_gaps = self._find_gaps_with_threshold_horizontal(density, test_threshold, min_gap, y0)
1680
+ test_gaps = self._find_gaps_with_threshold_horizontal(
1681
+ density, test_threshold, min_gap, y0
1682
+ )
1611
1683
  if len(test_gaps) >= guides_needing_troughs:
1612
1684
  threshold_val = test_threshold
1613
- logger.debug(f"Auto threshold found: {test_threshold:.2f} (found {len(test_gaps)} troughs for {guides_needing_troughs} guides)")
1685
+ logger.debug(
1686
+ f"Auto threshold found: {test_threshold:.2f} (found {len(test_gaps)} troughs for {guides_needing_troughs} guides)"
1687
+ )
1614
1688
  break
1615
-
1689
+
1616
1690
  if threshold_val is None:
1617
1691
  threshold_val = 0.8 # Fallback to permissive threshold
1618
1692
  logger.debug(f"Auto threshold fallback to {threshold_val}")
@@ -1621,141 +1695,157 @@ class Guides:
1621
1695
  if not isinstance(threshold, (int, float)) or not (0.0 <= threshold <= 1.0):
1622
1696
  raise ValueError("threshold must be a number between 0.0 and 1.0, or 'auto'")
1623
1697
  threshold_val = float(threshold)
1624
-
1698
+
1625
1699
  return self._find_gaps_with_threshold_horizontal(density, threshold_val, min_gap, y0)
1626
-
1700
+
1627
1701
  def _find_gaps_with_threshold_horizontal(self, density, threshold_val, min_gap, y0):
1628
1702
  """Helper method to find horizontal gaps given a specific threshold value."""
1629
1703
  max_density = density.max()
1630
1704
  threshold_density = threshold_val * max_density
1631
-
1705
+
1632
1706
  # Smooth the density for better trough detection
1633
1707
  from scipy.ndimage import gaussian_filter1d
1708
+
1634
1709
  smoothed_density = gaussian_filter1d(density.astype(float), sigma=1.0)
1635
-
1710
+
1636
1711
  # Find regions below threshold
1637
1712
  below_threshold = smoothed_density <= threshold_density
1638
-
1713
+
1639
1714
  # Find contiguous regions
1640
1715
  from scipy.ndimage import label as nd_label
1716
+
1641
1717
  labeled_regions, num_regions = nd_label(below_threshold)
1642
-
1718
+
1643
1719
  gaps = []
1644
1720
  for region_id in range(1, num_regions + 1):
1645
1721
  region_mask = labeled_regions == region_id
1646
1722
  region_indices = np.where(region_mask)[0]
1647
-
1723
+
1648
1724
  if len(region_indices) == 0:
1649
1725
  continue
1650
-
1726
+
1651
1727
  start_px = region_indices[0]
1652
1728
  end_px = region_indices[-1] + 1
1653
-
1729
+
1654
1730
  # Convert back to PDF coordinates
1655
1731
  start_pdf = y0 + start_px
1656
1732
  end_pdf = y0 + end_px
1657
-
1733
+
1658
1734
  # Check minimum gap size
1659
1735
  if end_pdf - start_pdf >= min_gap:
1660
1736
  gaps.append((start_pdf, end_pdf))
1661
-
1737
+
1662
1738
  return gaps
1663
-
1664
- def _find_vertical_element_gaps(self, text_elements, min_gap: float) -> List[Tuple[float, float]]:
1739
+
1740
+ def _find_vertical_element_gaps(
1741
+ self, text_elements, min_gap: float
1742
+ ) -> List[Tuple[float, float]]:
1665
1743
  """
1666
1744
  Find vertical whitespace gaps using text element spacing analysis.
1667
1745
  Returns list of (start, end) tuples representing trough ranges.
1668
1746
  """
1669
1747
  if not self.bounds or not text_elements:
1670
1748
  return []
1671
-
1749
+
1672
1750
  x0, _, x1, _ = self.bounds
1673
-
1751
+
1674
1752
  # Get all element right and left edges
1675
1753
  element_edges = []
1676
1754
  for element in text_elements:
1677
- if not hasattr(element, 'x0') or not hasattr(element, 'x1'):
1755
+ if not hasattr(element, "x0") or not hasattr(element, "x1"):
1678
1756
  continue
1679
1757
  # Only include elements that overlap vertically with our bounds
1680
- if hasattr(element, 'top') and hasattr(element, 'bottom'):
1758
+ if hasattr(element, "top") and hasattr(element, "bottom"):
1681
1759
  if element.bottom < self.bounds[1] or element.top > self.bounds[3]:
1682
1760
  continue
1683
1761
  element_edges.extend([element.x0, element.x1])
1684
-
1762
+
1685
1763
  if not element_edges:
1686
1764
  return []
1687
-
1765
+
1688
1766
  # Sort edges and find gaps
1689
1767
  element_edges = sorted(set(element_edges))
1690
-
1768
+
1691
1769
  trough_ranges = []
1692
1770
  for i in range(len(element_edges) - 1):
1693
1771
  gap_start = element_edges[i]
1694
1772
  gap_end = element_edges[i + 1]
1695
1773
  gap_width = gap_end - gap_start
1696
-
1774
+
1697
1775
  if gap_width >= min_gap:
1698
1776
  # Check if this gap actually contains no text (is empty space)
1699
1777
  gap_has_text = False
1700
1778
  for element in text_elements:
1701
- if (hasattr(element, 'x0') and hasattr(element, 'x1') and
1702
- element.x0 < gap_end and element.x1 > gap_start):
1779
+ if (
1780
+ hasattr(element, "x0")
1781
+ and hasattr(element, "x1")
1782
+ and element.x0 < gap_end
1783
+ and element.x1 > gap_start
1784
+ ):
1703
1785
  gap_has_text = True
1704
1786
  break
1705
-
1787
+
1706
1788
  if not gap_has_text:
1707
1789
  trough_ranges.append((gap_start, gap_end))
1708
-
1790
+
1709
1791
  return trough_ranges
1710
-
1711
- def _find_horizontal_element_gaps(self, text_elements, min_gap: float) -> List[Tuple[float, float]]:
1792
+
1793
+ def _find_horizontal_element_gaps(
1794
+ self, text_elements, min_gap: float
1795
+ ) -> List[Tuple[float, float]]:
1712
1796
  """
1713
1797
  Find horizontal whitespace gaps using text element spacing analysis.
1714
1798
  Returns list of (start, end) tuples representing trough ranges.
1715
1799
  """
1716
1800
  if not self.bounds or not text_elements:
1717
1801
  return []
1718
-
1802
+
1719
1803
  _, y0, _, y1 = self.bounds
1720
-
1804
+
1721
1805
  # Get all element top and bottom edges
1722
1806
  element_edges = []
1723
1807
  for element in text_elements:
1724
- if not hasattr(element, 'top') or not hasattr(element, 'bottom'):
1808
+ if not hasattr(element, "top") or not hasattr(element, "bottom"):
1725
1809
  continue
1726
1810
  # Only include elements that overlap horizontally with our bounds
1727
- if hasattr(element, 'x0') and hasattr(element, 'x1'):
1811
+ if hasattr(element, "x0") and hasattr(element, "x1"):
1728
1812
  if element.x1 < self.bounds[0] or element.x0 > self.bounds[2]:
1729
1813
  continue
1730
1814
  element_edges.extend([element.top, element.bottom])
1731
-
1815
+
1732
1816
  if not element_edges:
1733
1817
  return []
1734
-
1818
+
1735
1819
  # Sort edges and find gaps
1736
1820
  element_edges = sorted(set(element_edges))
1737
-
1821
+
1738
1822
  trough_ranges = []
1739
1823
  for i in range(len(element_edges) - 1):
1740
1824
  gap_start = element_edges[i]
1741
1825
  gap_end = element_edges[i + 1]
1742
1826
  gap_width = gap_end - gap_start
1743
-
1827
+
1744
1828
  if gap_width >= min_gap:
1745
1829
  # Check if this gap actually contains no text (is empty space)
1746
1830
  gap_has_text = False
1747
1831
  for element in text_elements:
1748
- if (hasattr(element, 'top') and hasattr(element, 'bottom') and
1749
- element.top < gap_end and element.bottom > gap_start):
1832
+ if (
1833
+ hasattr(element, "top")
1834
+ and hasattr(element, "bottom")
1835
+ and element.top < gap_end
1836
+ and element.bottom > gap_start
1837
+ ):
1750
1838
  gap_has_text = True
1751
1839
  break
1752
-
1840
+
1753
1841
  if not gap_has_text:
1754
1842
  trough_ranges.append((gap_start, gap_end))
1755
-
1843
+
1756
1844
  return trough_ranges
1757
-
1758
- def _optimal_guide_assignment(self, guides: List[float], trough_ranges: List[Tuple[float, float]]) -> Dict[int, int]:
1845
+
1846
+ def _optimal_guide_assignment(
1847
+ self, guides: List[float], trough_ranges: List[Tuple[float, float]]
1848
+ ) -> Dict[int, int]:
1759
1849
  """
1760
1850
  Assign guides to trough ranges using the user's desired logic:
1761
1851
  - Guides already in a trough stay put
@@ -1764,18 +1854,20 @@ class Guides:
1764
1854
  """
1765
1855
  if not guides or not trough_ranges:
1766
1856
  return {}
1767
-
1857
+
1768
1858
  assignments = {}
1769
-
1859
+
1770
1860
  # Step 1: Identify which guides are already in troughs
1771
1861
  guides_in_troughs = set()
1772
1862
  for i, guide_pos in enumerate(guides):
1773
1863
  for trough_start, trough_end in trough_ranges:
1774
1864
  if trough_start <= guide_pos <= trough_end:
1775
1865
  guides_in_troughs.add(i)
1776
- logger.debug(f"Guide {i} (pos {guide_pos:.1f}) is already in trough ({trough_start:.1f}-{trough_end:.1f}), keeping in place")
1866
+ logger.debug(
1867
+ f"Guide {i} (pos {guide_pos:.1f}) is already in trough ({trough_start:.1f}-{trough_end:.1f}), keeping in place"
1868
+ )
1777
1869
  break
1778
-
1870
+
1779
1871
  # Step 2: Identify which troughs are already occupied
1780
1872
  occupied_troughs = set()
1781
1873
  for i in guides_in_troughs:
@@ -1784,21 +1876,23 @@ class Guides:
1784
1876
  if trough_start <= guide_pos <= trough_end:
1785
1877
  occupied_troughs.add(j)
1786
1878
  break
1787
-
1879
+
1788
1880
  # Step 3: Find guides that need reassignment (not in any trough)
1789
1881
  guides_to_move = []
1790
1882
  for i, guide_pos in enumerate(guides):
1791
1883
  if i not in guides_in_troughs:
1792
1884
  guides_to_move.append(i)
1793
- logger.debug(f"Guide {i} (pos {guide_pos:.1f}) is NOT in any trough, needs reassignment")
1794
-
1885
+ logger.debug(
1886
+ f"Guide {i} (pos {guide_pos:.1f}) is NOT in any trough, needs reassignment"
1887
+ )
1888
+
1795
1889
  # Step 4: Find available troughs (not occupied by existing guides)
1796
1890
  available_troughs = []
1797
1891
  for j, (trough_start, trough_end) in enumerate(trough_ranges):
1798
1892
  if j not in occupied_troughs:
1799
1893
  available_troughs.append(j)
1800
1894
  logger.debug(f"Trough {j} ({trough_start:.1f}-{trough_end:.1f}) is available")
1801
-
1895
+
1802
1896
  # Step 5: Assign guides to move to closest available troughs
1803
1897
  if guides_to_move and available_troughs:
1804
1898
  # Calculate distances for all combinations
@@ -1810,20 +1904,22 @@ class Guides:
1810
1904
  trough_center = (trough_start + trough_end) / 2
1811
1905
  distance = abs(guide_pos - trough_center)
1812
1906
  distances.append((distance, guide_idx, trough_idx))
1813
-
1907
+
1814
1908
  # Sort by distance and assign greedily
1815
1909
  distances.sort()
1816
1910
  used_troughs = set()
1817
-
1911
+
1818
1912
  for distance, guide_idx, trough_idx in distances:
1819
1913
  if guide_idx not in assignments and trough_idx not in used_troughs:
1820
1914
  assignments[guide_idx] = trough_idx
1821
1915
  used_troughs.add(trough_idx)
1822
- logger.debug(f"Assigned guide {guide_idx} (pos {guides[guide_idx]:.1f}) to trough {trough_idx} (distance: {distance:.1f})")
1823
-
1916
+ logger.debug(
1917
+ f"Assigned guide {guide_idx} (pos {guides[guide_idx]:.1f}) to trough {trough_idx} (distance: {distance:.1f})"
1918
+ )
1919
+
1824
1920
  logger.debug(f"Final assignments: {assignments}")
1825
1921
  return assignments
1826
-
1922
+
1827
1923
  def _snap_guides_to_gaps(self, guides: List[float], gaps: List[Tuple[float, float]], axis: str):
1828
1924
  """
1829
1925
  Snap guides to nearby gaps using optimal assignment.
@@ -1831,15 +1927,15 @@ class Guides:
1831
1927
  """
1832
1928
  if not guides or not gaps:
1833
1929
  return
1834
-
1930
+
1835
1931
  logger.debug(f"Snapping {len(guides)} {axis} guides to {len(gaps)} trough ranges")
1836
1932
  for i, (start, end) in enumerate(gaps):
1837
1933
  center = (start + end) / 2
1838
1934
  logger.debug(f" Trough {i}: {start:.1f} to {end:.1f} (center: {center:.1f})")
1839
-
1935
+
1840
1936
  # Get optimal assignments
1841
1937
  assignments = self._optimal_guide_assignment(guides, gaps)
1842
-
1938
+
1843
1939
  # Apply assignments (modify guides list in-place)
1844
1940
  for guide_idx, trough_idx in assignments.items():
1845
1941
  trough_start, trough_end = gaps[trough_idx]
@@ -1847,23 +1943,23 @@ class Guides:
1847
1943
  old_pos = guides[guide_idx]
1848
1944
  guides[guide_idx] = new_pos
1849
1945
  logger.info(f"Snapped {axis} guide from {old_pos:.1f} to {new_pos:.1f}")
1850
-
1946
+
1851
1947
  def build_grid(
1852
1948
  self,
1853
1949
  target: Optional[Union["Page", "Region"]] = None,
1854
1950
  source: str = "guides",
1855
1951
  cell_padding: float = 0.5,
1856
- include_outer_boundaries: bool = False
1952
+ include_outer_boundaries: bool = False,
1857
1953
  ) -> Dict[str, int]:
1858
1954
  """
1859
1955
  Create table structure (table, rows, columns, cells) from guide coordinates.
1860
-
1956
+
1861
1957
  Args:
1862
1958
  target: Page or Region to create regions on (uses self.context if None)
1863
1959
  source: Source label for created regions (for identification)
1864
1960
  cell_padding: Internal padding for cell regions in points
1865
1961
  include_outer_boundaries: Whether to add boundaries at edges if missing
1866
-
1962
+
1867
1963
  Returns:
1868
1964
  Dictionary with counts: {'table': 1, 'rows': N, 'columns': M, 'cells': N*M}
1869
1965
  """
@@ -1871,98 +1967,142 @@ class Guides:
1871
1967
  target_obj = target or self.context
1872
1968
  if not target_obj:
1873
1969
  raise ValueError("No target object available. Provide target parameter or context.")
1874
-
1970
+
1875
1971
  # Get the page for creating regions
1876
- if hasattr(target_obj, 'x0') and hasattr(target_obj, 'top'): # Region (has bbox coordinates)
1972
+ if hasattr(target_obj, "x0") and hasattr(
1973
+ target_obj, "top"
1974
+ ): # Region (has bbox coordinates)
1877
1975
  page = target_obj._page
1878
1976
  origin_x, origin_y = target_obj.x0, target_obj.top
1879
1977
  context_width, context_height = target_obj.width, target_obj.height
1880
- elif hasattr(target_obj, '_element_mgr') or hasattr(target_obj, 'width'): # Page
1978
+ elif hasattr(target_obj, "_element_mgr") or hasattr(target_obj, "width"): # Page
1881
1979
  page = target_obj
1882
1980
  origin_x, origin_y = 0.0, 0.0
1883
1981
  context_width, context_height = page.width, page.height
1884
1982
  else:
1885
1983
  raise ValueError(f"Target object {target_obj} is not a Page or Region")
1886
-
1984
+
1887
1985
  element_manager = page._element_mgr
1888
-
1986
+
1889
1987
  # Setup boundaries
1890
1988
  row_boundaries = list(self.horizontal)
1891
1989
  col_boundaries = list(self.vertical)
1892
-
1990
+
1893
1991
  # Add outer boundaries if requested and missing
1894
1992
  if include_outer_boundaries:
1895
1993
  if not row_boundaries or row_boundaries[0] > origin_y:
1896
1994
  row_boundaries.insert(0, origin_y)
1897
1995
  if not row_boundaries or row_boundaries[-1] < origin_y + context_height:
1898
1996
  row_boundaries.append(origin_y + context_height)
1899
-
1997
+
1900
1998
  if not col_boundaries or col_boundaries[0] > origin_x:
1901
1999
  col_boundaries.insert(0, origin_x)
1902
2000
  if not col_boundaries or col_boundaries[-1] < origin_x + context_width:
1903
2001
  col_boundaries.append(origin_x + context_width)
1904
-
2002
+
1905
2003
  # Remove duplicates and sort
1906
2004
  row_boundaries = sorted(list(set(row_boundaries)))
1907
2005
  col_boundaries = sorted(list(set(col_boundaries)))
1908
-
1909
- logger.debug(f"Building grid with {len(row_boundaries)} row and {len(col_boundaries)} col boundaries")
1910
-
2006
+
2007
+ # ------------------------------------------------------------------
2008
+ # Clean-up: remove any previously created grid regions (table, rows,
2009
+ # columns, cells) that were generated by the same `source` label and
2010
+ # overlap the area we are about to populate. This prevents the page's
2011
+ # `ElementManager` from accumulating stale/duplicate regions when the
2012
+ # user rebuilds the grid multiple times.
2013
+ # ------------------------------------------------------------------
2014
+ try:
2015
+ # Bounding box of the grid we are about to create
2016
+ if row_boundaries and col_boundaries:
2017
+ grid_bbox = (
2018
+ col_boundaries[0], # x0
2019
+ row_boundaries[0], # top
2020
+ col_boundaries[-1], # x1
2021
+ row_boundaries[-1], # bottom
2022
+ )
2023
+
2024
+ def _bbox_overlap(b1, b2):
2025
+ """Return True if two (x0, top, x1, bottom) bboxes overlap."""
2026
+ return not (
2027
+ b1[2] <= b2[0] # b1 right ≤ b2 left
2028
+ or b1[0] >= b2[2] # b1 left ≥ b2 right
2029
+ or b1[3] <= b2[1] # b1 bottom ≤ b2 top
2030
+ or b1[1] >= b2[3] # b1 top ≥ b2 bottom
2031
+ )
2032
+
2033
+ # Collect existing regions that match the source & region types
2034
+ regions_to_remove = [
2035
+ r
2036
+ for r in element_manager.regions
2037
+ if getattr(r, "source", None) == source
2038
+ and getattr(r, "region_type", None)
2039
+ in {"table", "table_row", "table_column", "table_cell"}
2040
+ and hasattr(r, "bbox")
2041
+ and _bbox_overlap(r.bbox, grid_bbox)
2042
+ ]
2043
+
2044
+ for r in regions_to_remove:
2045
+ element_manager.remove_element(r, element_type="regions")
2046
+
2047
+ if regions_to_remove:
2048
+ logger.debug(
2049
+ f"Removed {len(regions_to_remove)} existing grid region(s) prior to rebuild"
2050
+ )
2051
+ except Exception as cleanup_err: # pragma: no cover – cleanup must never crash
2052
+ logger.warning(f"Grid cleanup failed: {cleanup_err}")
2053
+
2054
+ logger.debug(
2055
+ f"Building grid with {len(row_boundaries)} row and {len(col_boundaries)} col boundaries"
2056
+ )
2057
+
1911
2058
  # Track creation counts
1912
- counts = {'table': 0, 'rows': 0, 'columns': 0, 'cells': 0}
1913
-
2059
+ counts = {"table": 0, "rows": 0, "columns": 0, "cells": 0}
2060
+
1914
2061
  # Create overall table region
1915
2062
  if len(row_boundaries) >= 2 and len(col_boundaries) >= 2:
1916
2063
  table_region = page.create_region(
1917
- col_boundaries[0], row_boundaries[0],
1918
- col_boundaries[-1], row_boundaries[-1]
2064
+ col_boundaries[0], row_boundaries[0], col_boundaries[-1], row_boundaries[-1]
1919
2065
  )
1920
2066
  table_region.source = source
1921
2067
  table_region.region_type = "table"
1922
2068
  table_region.normalized_type = "table"
1923
- table_region.metadata.update({
1924
- "source_guides": True,
1925
- "num_rows": len(row_boundaries) - 1,
1926
- "num_cols": len(col_boundaries) - 1,
1927
- "boundaries": {"rows": row_boundaries, "cols": col_boundaries}
1928
- })
2069
+ table_region.metadata.update(
2070
+ {
2071
+ "source_guides": True,
2072
+ "num_rows": len(row_boundaries) - 1,
2073
+ "num_cols": len(col_boundaries) - 1,
2074
+ "boundaries": {"rows": row_boundaries, "cols": col_boundaries},
2075
+ }
2076
+ )
1929
2077
  element_manager.add_element(table_region, element_type="regions")
1930
- counts['table'] = 1
1931
-
2078
+ counts["table"] = 1
2079
+
1932
2080
  # Create row regions
1933
2081
  if len(row_boundaries) >= 2 and len(col_boundaries) >= 2:
1934
2082
  for i in range(len(row_boundaries) - 1):
1935
2083
  row_region = page.create_region(
1936
- col_boundaries[0], row_boundaries[i],
1937
- col_boundaries[-1], row_boundaries[i + 1]
2084
+ col_boundaries[0], row_boundaries[i], col_boundaries[-1], row_boundaries[i + 1]
1938
2085
  )
1939
2086
  row_region.source = source
1940
2087
  row_region.region_type = "table_row"
1941
2088
  row_region.normalized_type = "table_row"
1942
- row_region.metadata.update({
1943
- "row_index": i,
1944
- "source_guides": True
1945
- })
2089
+ row_region.metadata.update({"row_index": i, "source_guides": True})
1946
2090
  element_manager.add_element(row_region, element_type="regions")
1947
- counts['rows'] += 1
1948
-
2091
+ counts["rows"] += 1
2092
+
1949
2093
  # Create column regions
1950
2094
  if len(col_boundaries) >= 2 and len(row_boundaries) >= 2:
1951
2095
  for j in range(len(col_boundaries) - 1):
1952
2096
  col_region = page.create_region(
1953
- col_boundaries[j], row_boundaries[0],
1954
- col_boundaries[j + 1], row_boundaries[-1]
2097
+ col_boundaries[j], row_boundaries[0], col_boundaries[j + 1], row_boundaries[-1]
1955
2098
  )
1956
2099
  col_region.source = source
1957
2100
  col_region.region_type = "table_column"
1958
2101
  col_region.normalized_type = "table_column"
1959
- col_region.metadata.update({
1960
- "col_index": j,
1961
- "source_guides": True
1962
- })
2102
+ col_region.metadata.update({"col_index": j, "source_guides": True})
1963
2103
  element_manager.add_element(col_region, element_type="regions")
1964
- counts['columns'] += 1
1965
-
2104
+ counts["columns"] += 1
2105
+
1966
2106
  # Create cell regions
1967
2107
  if len(row_boundaries) >= 2 and len(col_boundaries) >= 2:
1968
2108
  for i in range(len(row_boundaries) - 1):
@@ -1972,50 +2112,58 @@ class Guides:
1972
2112
  cell_top = row_boundaries[i] + cell_padding
1973
2113
  cell_x1 = col_boundaries[j + 1] - cell_padding
1974
2114
  cell_bottom = row_boundaries[i + 1] - cell_padding
1975
-
2115
+
1976
2116
  # Skip invalid cells
1977
2117
  if cell_x1 <= cell_x0 or cell_bottom <= cell_top:
1978
2118
  continue
1979
-
2119
+
1980
2120
  cell_region = page.create_region(cell_x0, cell_top, cell_x1, cell_bottom)
1981
2121
  cell_region.source = source
1982
2122
  cell_region.region_type = "table_cell"
1983
2123
  cell_region.normalized_type = "table_cell"
1984
- cell_region.metadata.update({
1985
- "row_index": i,
1986
- "col_index": j,
1987
- "source_guides": True,
1988
- "original_boundaries": {
1989
- "left": col_boundaries[j],
1990
- "top": row_boundaries[i],
1991
- "right": col_boundaries[j + 1],
1992
- "bottom": row_boundaries[i + 1]
2124
+ cell_region.metadata.update(
2125
+ {
2126
+ "row_index": i,
2127
+ "col_index": j,
2128
+ "source_guides": True,
2129
+ "original_boundaries": {
2130
+ "left": col_boundaries[j],
2131
+ "top": row_boundaries[i],
2132
+ "right": col_boundaries[j + 1],
2133
+ "bottom": row_boundaries[i + 1],
2134
+ },
1993
2135
  }
1994
- })
2136
+ )
1995
2137
  element_manager.add_element(cell_region, element_type="regions")
1996
- counts['cells'] += 1
1997
-
1998
- logger.info(f"Created {counts['table']} table, {counts['rows']} rows, "
1999
- f"{counts['columns']} columns, and {counts['cells']} cells from guides")
2000
-
2138
+ counts["cells"] += 1
2139
+
2140
+ logger.info(
2141
+ f"Created {counts['table']} table, {counts['rows']} rows, "
2142
+ f"{counts['columns']} columns, and {counts['cells']} cells from guides"
2143
+ )
2144
+
2001
2145
  return counts
2002
2146
 
2003
2147
  def __repr__(self) -> str:
2004
2148
  """String representation of the guides."""
2005
- return (f"Guides(verticals={len(self.vertical)}, "
2006
- f"horizontals={len(self.horizontal)}, "
2007
- f"cells={len(self.get_cells())})")
2149
+ return (
2150
+ f"Guides(verticals={len(self.vertical)}, "
2151
+ f"horizontals={len(self.horizontal)}, "
2152
+ f"cells={len(self.get_cells())})"
2153
+ )
2008
2154
 
2009
2155
  def _get_text_elements(self):
2010
2156
  """Get text elements from the context."""
2011
2157
  if not self.context:
2012
2158
  return []
2013
-
2159
+
2014
2160
  # Get text elements from the context
2015
- if hasattr(self.context, 'find_all'):
2161
+ if hasattr(self.context, "find_all"):
2016
2162
  try:
2017
- text_elements = self.context.find_all('text', apply_exclusions=False)
2018
- return text_elements.elements if hasattr(text_elements, 'elements') else text_elements
2163
+ text_elements = self.context.find_all("text", apply_exclusions=False)
2164
+ return (
2165
+ text_elements.elements if hasattr(text_elements, "elements") else text_elements
2166
+ )
2019
2167
  except Exception as e:
2020
2168
  logger.warning(f"Error getting text elements: {e}")
2021
2169
  return []
@@ -2026,32 +2174,32 @@ class Guides:
2026
2174
  # -------------------------------------------------------------------------
2027
2175
  # Instance methods for fluent chaining (avoid name conflicts with class methods)
2028
2176
  # -------------------------------------------------------------------------
2029
-
2177
+
2030
2178
  def add_content(
2031
2179
  self,
2032
- axis: Literal['vertical', 'horizontal'] = 'vertical',
2180
+ axis: Literal["vertical", "horizontal"] = "vertical",
2033
2181
  markers: Union[str, List[str], "ElementCollection", None] = None,
2034
2182
  obj: Optional[Union["Page", "Region"]] = None,
2035
- align: Literal['left', 'right', 'center', 'between'] = 'left',
2183
+ align: Literal["left", "right", "center", "between"] = "left",
2036
2184
  outer: bool = True,
2037
- tolerance: float = 5
2185
+ tolerance: float = 5,
2038
2186
  ) -> "Guides":
2039
2187
  """
2040
2188
  Instance method: Add guides from content, allowing chaining.
2041
2189
  This allows: Guides.new(page).add_content(axis='vertical', markers=[...])
2042
-
2190
+
2043
2191
  Args:
2044
2192
  axis: Which axis to create guides for
2045
2193
  markers: Content to search for. Can be:
2046
2194
  - str: single selector or literal text
2047
- - List[str]: list of selectors or literal text strings
2195
+ - List[str]: list of selectors or literal text strings
2048
2196
  - ElementCollection: collection of elements to extract text from
2049
2197
  - None: no markers
2050
2198
  obj: Page or Region to search (uses self.context if None)
2051
2199
  align: How to align guides relative to found elements
2052
2200
  outer: Whether to add outer boundary guides
2053
2201
  tolerance: Tolerance for snapping to element edges
2054
-
2202
+
2055
2203
  Returns:
2056
2204
  Self for method chaining
2057
2205
  """
@@ -2059,7 +2207,7 @@ class Guides:
2059
2207
  target_obj = obj or self.context
2060
2208
  if target_obj is None:
2061
2209
  raise ValueError("No object provided and no context available")
2062
-
2210
+
2063
2211
  # Create new guides using the class method
2064
2212
  new_guides = Guides.from_content(
2065
2213
  obj=target_obj,
@@ -2067,34 +2215,34 @@ class Guides:
2067
2215
  markers=markers,
2068
2216
  align=align,
2069
2217
  outer=outer,
2070
- tolerance=tolerance
2218
+ tolerance=tolerance,
2071
2219
  )
2072
-
2220
+
2073
2221
  # Add the appropriate coordinates to this object
2074
- if axis == 'vertical':
2222
+ if axis == "vertical":
2075
2223
  self.vertical = list(set(self.vertical + new_guides.vertical))
2076
2224
  else:
2077
2225
  self.horizontal = list(set(self.horizontal + new_guides.horizontal))
2078
-
2226
+
2079
2227
  return self
2080
-
2228
+
2081
2229
  def add_lines(
2082
2230
  self,
2083
- axis: Literal['vertical', 'horizontal', 'both'] = 'both',
2084
- obj: Optional[Union["Page", "Region"]] = None,
2085
- threshold: Union[float, str] = 'auto',
2231
+ axis: Literal["vertical", "horizontal", "both"] = "both",
2232
+ obj: Optional[Union["Page", "Region"]] = None,
2233
+ threshold: Union[float, str] = "auto",
2086
2234
  source_label: Optional[str] = None,
2087
2235
  max_lines_h: Optional[int] = None,
2088
2236
  max_lines_v: Optional[int] = None,
2089
2237
  outer: bool = False,
2090
- detection_method: str = 'vector',
2238
+ detection_method: str = "vector",
2091
2239
  resolution: int = 192,
2092
- **detect_kwargs
2240
+ **detect_kwargs,
2093
2241
  ) -> "Guides":
2094
2242
  """
2095
2243
  Instance method: Add guides from lines, allowing chaining.
2096
2244
  This allows: Guides.new(page).add_lines(axis='horizontal')
2097
-
2245
+
2098
2246
  Args:
2099
2247
  axis: Which axis to detect lines for
2100
2248
  obj: Page or Region to search (uses self.context if None)
@@ -2106,7 +2254,7 @@ class Guides:
2106
2254
  detection_method: 'vector' (use existing LineElements) or 'pixels' (detect from image)
2107
2255
  resolution: DPI for pixel-based detection (default: 192)
2108
2256
  **detect_kwargs: Additional parameters for pixel detection (see from_lines)
2109
-
2257
+
2110
2258
  Returns:
2111
2259
  Self for method chaining
2112
2260
  """
@@ -2114,7 +2262,7 @@ class Guides:
2114
2262
  target_obj = obj or self.context
2115
2263
  if target_obj is None:
2116
2264
  raise ValueError("No object provided and no context available")
2117
-
2265
+
2118
2266
  # Create new guides using the class method
2119
2267
  new_guides = Guides.from_lines(
2120
2268
  obj=target_obj,
@@ -2126,32 +2274,32 @@ class Guides:
2126
2274
  outer=outer,
2127
2275
  detection_method=detection_method,
2128
2276
  resolution=resolution,
2129
- **detect_kwargs
2277
+ **detect_kwargs,
2130
2278
  )
2131
-
2279
+
2132
2280
  # Add the appropriate coordinates to this object
2133
- if axis in ('vertical', 'both'):
2281
+ if axis in ("vertical", "both"):
2134
2282
  self.vertical = list(set(self.vertical + new_guides.vertical))
2135
- if axis in ('horizontal', 'both'):
2283
+ if axis in ("horizontal", "both"):
2136
2284
  self.horizontal = list(set(self.horizontal + new_guides.horizontal))
2137
-
2285
+
2138
2286
  return self
2139
-
2287
+
2140
2288
  def add_whitespace(
2141
2289
  self,
2142
- axis: Literal['vertical', 'horizontal', 'both'] = 'both',
2290
+ axis: Literal["vertical", "horizontal", "both"] = "both",
2143
2291
  obj: Optional[Union["Page", "Region"]] = None,
2144
- min_gap: float = 10
2292
+ min_gap: float = 10,
2145
2293
  ) -> "Guides":
2146
2294
  """
2147
2295
  Instance method: Add guides from whitespace, allowing chaining.
2148
2296
  This allows: Guides.new(page).add_whitespace(axis='both')
2149
-
2297
+
2150
2298
  Args:
2151
2299
  axis: Which axis to create guides for
2152
2300
  obj: Page or Region to search (uses self.context if None)
2153
2301
  min_gap: Minimum gap size to consider
2154
-
2302
+
2155
2303
  Returns:
2156
2304
  Self for method chaining
2157
2305
  """
@@ -2159,18 +2307,14 @@ class Guides:
2159
2307
  target_obj = obj or self.context
2160
2308
  if target_obj is None:
2161
2309
  raise ValueError("No object provided and no context available")
2162
-
2310
+
2163
2311
  # Create new guides using the class method
2164
- new_guides = Guides.from_whitespace(
2165
- obj=target_obj,
2166
- axis=axis,
2167
- min_gap=min_gap
2168
- )
2169
-
2312
+ new_guides = Guides.from_whitespace(obj=target_obj, axis=axis, min_gap=min_gap)
2313
+
2170
2314
  # Add the appropriate coordinates to this object
2171
- if axis in ('vertical', 'both'):
2315
+ if axis in ("vertical", "both"):
2172
2316
  self.vertical = list(set(self.vertical + new_guides.vertical))
2173
- if axis in ('horizontal', 'both'):
2317
+ if axis in ("horizontal", "both"):
2174
2318
  self.horizontal = list(set(self.horizontal + new_guides.horizontal))
2175
-
2176
- return self
2319
+
2320
+ return self