natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. natural_pdf/analyzers/__init__.py +2 -2
  2. natural_pdf/analyzers/guides.py +670 -595
  3. natural_pdf/analyzers/layout/base.py +53 -6
  4. natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -14
  6. natural_pdf/analyzers/layout/layout_options.py +1 -0
  7. natural_pdf/analyzers/layout/paddle.py +102 -64
  8. natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
  9. natural_pdf/analyzers/layout/yolo.py +2 -6
  10. natural_pdf/analyzers/shape_detection_mixin.py +15 -6
  11. natural_pdf/classification/manager.py +92 -77
  12. natural_pdf/classification/mixin.py +49 -5
  13. natural_pdf/classification/results.py +1 -1
  14. natural_pdf/cli.py +7 -3
  15. natural_pdf/collections/pdf_collection.py +96 -101
  16. natural_pdf/core/element_manager.py +188 -82
  17. natural_pdf/core/highlighting_service.py +5 -6
  18. natural_pdf/core/page.py +132 -16
  19. natural_pdf/core/pdf.py +486 -71
  20. natural_pdf/describe/__init__.py +18 -12
  21. natural_pdf/describe/base.py +179 -172
  22. natural_pdf/describe/elements.py +155 -155
  23. natural_pdf/describe/mixin.py +27 -19
  24. natural_pdf/describe/summary.py +44 -55
  25. natural_pdf/elements/base.py +134 -18
  26. natural_pdf/elements/collections.py +90 -18
  27. natural_pdf/elements/image.py +2 -1
  28. natural_pdf/elements/line.py +0 -31
  29. natural_pdf/elements/rect.py +0 -14
  30. natural_pdf/elements/region.py +238 -111
  31. natural_pdf/elements/text.py +18 -12
  32. natural_pdf/exporters/__init__.py +4 -1
  33. natural_pdf/exporters/original_pdf.py +12 -4
  34. natural_pdf/extraction/mixin.py +66 -10
  35. natural_pdf/extraction/result.py +1 -1
  36. natural_pdf/flows/flow.py +63 -4
  37. natural_pdf/flows/region.py +4 -4
  38. natural_pdf/ocr/engine.py +83 -2
  39. natural_pdf/ocr/engine_paddle.py +5 -5
  40. natural_pdf/ocr/ocr_factory.py +2 -1
  41. natural_pdf/ocr/ocr_manager.py +24 -13
  42. natural_pdf/ocr/ocr_options.py +3 -10
  43. natural_pdf/qa/document_qa.py +21 -8
  44. natural_pdf/qa/qa_result.py +3 -7
  45. natural_pdf/search/__init__.py +3 -2
  46. natural_pdf/search/lancedb_search_service.py +5 -6
  47. natural_pdf/search/numpy_search_service.py +5 -2
  48. natural_pdf/selectors/parser.py +51 -6
  49. natural_pdf/tables/__init__.py +2 -2
  50. natural_pdf/tables/result.py +7 -6
  51. natural_pdf/utils/bidi_mirror.py +2 -1
  52. natural_pdf/utils/reading_order.py +3 -2
  53. natural_pdf/utils/visualization.py +3 -3
  54. natural_pdf/widgets/viewer.py +0 -1
  55. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
  56. natural_pdf-0.1.34.dist-info/RECORD +121 -0
  57. optimization/memory_comparison.py +73 -58
  58. optimization/pdf_analyzer.py +141 -96
  59. optimization/performance_analysis.py +111 -110
  60. optimization/test_cleanup_methods.py +47 -36
  61. optimization/test_memory_fix.py +40 -39
  62. tools/bad_pdf_eval/__init__.py +0 -1
  63. tools/bad_pdf_eval/analyser.py +35 -18
  64. tools/bad_pdf_eval/collate_summaries.py +22 -18
  65. tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
  66. tools/bad_pdf_eval/eval_suite.py +21 -9
  67. tools/bad_pdf_eval/evaluate_quality.py +198 -0
  68. tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
  69. tools/bad_pdf_eval/llm_enrich.py +71 -39
  70. tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
  71. tools/bad_pdf_eval/reporter.py +1 -1
  72. tools/bad_pdf_eval/utils.py +7 -4
  73. natural_pdf-0.1.32.dist-info/RECORD +0 -118
  74. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
  75. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
  76. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
  77. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0
@@ -2,59 +2,58 @@
2
2
 
3
3
  import json
4
4
  import logging
5
- from typing import Any, Dict, List, Literal, Optional, Tuple, Union, TYPE_CHECKING
6
5
  from collections import UserList
6
+ from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
7
7
 
8
8
  import numpy as np
9
9
  from PIL import Image, ImageDraw
10
10
 
11
11
  if TYPE_CHECKING:
12
12
  from natural_pdf.core.page import Page
13
- from natural_pdf.elements.region import Region
14
13
  from natural_pdf.elements.base import Element
15
14
  from natural_pdf.elements.collections import ElementCollection
15
+ from natural_pdf.elements.region import Region
16
16
 
17
17
  logger = logging.getLogger(__name__)
18
18
 
19
19
 
20
20
  def _normalize_markers(
21
- markers: Union[str, List[str], "ElementCollection", None],
22
- obj: Union["Page", "Region"]
21
+ markers: Union[str, List[str], "ElementCollection", None], obj: Union["Page", "Region"]
23
22
  ) -> List[str]:
24
23
  """
25
24
  Normalize markers parameter to a list of text strings for guide creation.
26
-
25
+
27
26
  Args:
28
27
  markers: Can be:
29
28
  - str: single selector or text string
30
- - List[str]: list of selectors or text strings
29
+ - List[str]: list of selectors or text strings
31
30
  - ElementCollection: collection of elements to extract text from
32
31
  - None: empty list
33
32
  obj: Object to search for elements if markers contains selectors
34
-
33
+
35
34
  Returns:
36
35
  List of text strings to search for
37
36
  """
38
37
  if markers is None:
39
38
  return []
40
-
39
+
41
40
  if isinstance(markers, str):
42
41
  # Single selector or text string
43
- if markers.startswith(('text', 'region', 'line', 'rect', 'blob', 'image')):
42
+ if markers.startswith(("text", "region", "line", "rect", "blob", "image")):
44
43
  # It's a CSS selector, find elements and extract text
45
- if hasattr(obj, 'find_all'):
44
+ if hasattr(obj, "find_all"):
46
45
  elements = obj.find_all(markers)
47
- return [elem.text if hasattr(elem, 'text') else str(elem) for elem in elements]
46
+ return [elem.text if hasattr(elem, "text") else str(elem) for elem in elements]
48
47
  else:
49
48
  logger.warning(f"Object {obj} doesn't support find_all for selector '{markers}'")
50
49
  return [markers] # Treat as literal text
51
50
  else:
52
51
  # Treat as literal text
53
52
  return [markers]
54
-
55
- elif hasattr(markers, '__iter__') and not isinstance(markers, str):
53
+
54
+ elif hasattr(markers, "__iter__") and not isinstance(markers, str):
56
55
  # It might be an ElementCollection or list
57
- if hasattr(markers, 'extract_each_text'):
56
+ if hasattr(markers, "extract_each_text"):
58
57
  # It's an ElementCollection
59
58
  try:
60
59
  return markers.extract_each_text()
@@ -63,9 +62,9 @@ def _normalize_markers(
63
62
  # Fallback: try to get text from individual elements
64
63
  texts = []
65
64
  for elem in markers:
66
- if hasattr(elem, 'text'):
65
+ if hasattr(elem, "text"):
67
66
  texts.append(elem.text)
68
- elif hasattr(elem, 'extract_text'):
67
+ elif hasattr(elem, "extract_text"):
69
68
  texts.append(elem.extract_text())
70
69
  else:
71
70
  texts.append(str(elem))
@@ -75,26 +74,31 @@ def _normalize_markers(
75
74
  result = []
76
75
  for marker in markers:
77
76
  if isinstance(marker, str):
78
- if marker.startswith(('text', 'region', 'line', 'rect', 'blob', 'image')):
77
+ if marker.startswith(("text", "region", "line", "rect", "blob", "image")):
79
78
  # It's a selector
80
- if hasattr(obj, 'find_all'):
79
+ if hasattr(obj, "find_all"):
81
80
  elements = obj.find_all(marker)
82
- result.extend([elem.text if hasattr(elem, 'text') else str(elem) for elem in elements])
81
+ result.extend(
82
+ [
83
+ elem.text if hasattr(elem, "text") else str(elem)
84
+ for elem in elements
85
+ ]
86
+ )
83
87
  else:
84
88
  result.append(marker) # Treat as literal
85
89
  else:
86
90
  # Literal text
87
91
  result.append(marker)
88
- elif hasattr(marker, 'text'):
92
+ elif hasattr(marker, "text"):
89
93
  # It's an element object
90
94
  result.append(marker.text)
91
- elif hasattr(marker, 'extract_text'):
95
+ elif hasattr(marker, "extract_text"):
92
96
  # It's an element that can extract text
93
97
  result.append(marker.extract_text())
94
98
  else:
95
99
  result.append(str(marker))
96
100
  return result
97
-
101
+
98
102
  else:
99
103
  # Unknown type, try to convert to string
100
104
  return [str(markers)]
@@ -102,44 +106,44 @@ def _normalize_markers(
102
106
 
103
107
  class GuidesList(UserList):
104
108
  """A list of guide coordinates that also provides methods for creating guides."""
105
-
109
+
106
110
  def __init__(self, parent_guides: "Guides", axis: Literal["vertical", "horizontal"], data=None):
107
111
  super().__init__(data or [])
108
112
  self._parent = parent_guides
109
113
  self._axis = axis
110
-
114
+
111
115
  def from_content(
112
116
  self,
113
117
  markers: Union[str, List[str], "ElementCollection", None],
114
118
  obj: Optional[Union["Page", "Region"]] = None,
115
- align: Literal['left', 'right', 'center', 'between'] = 'left',
119
+ align: Literal["left", "right", "center", "between"] = "left",
116
120
  outer: bool = True,
117
- tolerance: float = 5
121
+ tolerance: float = 5,
118
122
  ) -> "Guides":
119
123
  """
120
124
  Create guides from content markers and add to this axis.
121
-
125
+
122
126
  Args:
123
127
  markers: Content to search for. Can be:
124
128
  - str: single selector (e.g., 'text:contains("Name")') or literal text
125
- - List[str]: list of selectors or literal text strings
129
+ - List[str]: list of selectors or literal text strings
126
130
  - ElementCollection: collection of elements to extract text from
127
131
  - None: no markers
128
132
  obj: Page/Region to search (uses parent's context if None)
129
133
  align: How to align guides relative to found elements
130
134
  outer: Whether to add outer boundary guides
131
135
  tolerance: Tolerance for snapping to element edges
132
-
136
+
133
137
  Returns:
134
138
  Parent Guides object for chaining
135
139
  """
136
140
  target_obj = obj or self._parent.context
137
141
  if target_obj is None:
138
142
  raise ValueError("No object provided and no context available")
139
-
143
+
140
144
  # Normalize markers to list of text strings
141
145
  marker_texts = _normalize_markers(markers, target_obj)
142
-
146
+
143
147
  # Create guides for this axis
144
148
  new_guides = Guides.from_content(
145
149
  obj=target_obj,
@@ -147,15 +151,15 @@ class GuidesList(UserList):
147
151
  markers=marker_texts,
148
152
  align=align,
149
153
  outer=outer,
150
- tolerance=tolerance
154
+ tolerance=tolerance,
151
155
  )
152
-
156
+
153
157
  # Add to our list
154
- if self._axis == 'vertical':
158
+ if self._axis == "vertical":
155
159
  self.extend(new_guides.vertical)
156
160
  else:
157
161
  self.extend(new_guides.horizontal)
158
-
162
+
159
163
  # Remove duplicates while preserving order
160
164
  seen = set()
161
165
  unique = []
@@ -164,26 +168,26 @@ class GuidesList(UserList):
164
168
  seen.add(x)
165
169
  unique.append(x)
166
170
  self.data = unique
167
-
171
+
168
172
  return self._parent # Return parent for chaining
169
-
173
+
170
174
  def from_lines(
171
175
  self,
172
176
  obj: Optional[Union["Page", "Region"]] = None,
173
- threshold: Union[float, str] = 'auto',
177
+ threshold: Union[float, str] = "auto",
174
178
  source_label: Optional[str] = None,
175
179
  max_lines: Optional[int] = None,
176
180
  outer: bool = False,
177
- detection_method: str = 'vector',
181
+ detection_method: str = "vector",
178
182
  resolution: int = 192,
179
183
  *,
180
184
  n: Optional[int] = None,
181
185
  min_gap: Optional[int] = None,
182
- **detect_kwargs
186
+ **detect_kwargs,
183
187
  ) -> "Guides":
184
188
  """
185
189
  Create guides from detected line elements.
186
-
190
+
187
191
  Args:
188
192
  obj: Page/Region to search (uses parent's context if None)
189
193
  threshold: Line detection threshold ('auto' or float 0.0-1.0)
@@ -198,14 +202,14 @@ class GuidesList(UserList):
198
202
  resolution: DPI for pixel-based detection (default: 192)
199
203
  **detect_kwargs: Additional parameters for pixel-based detection
200
204
  (e.g., min_gap_h, min_gap_v, binarization_method, etc.)
201
-
205
+
202
206
  Returns:
203
207
  Parent Guides object for chaining
204
208
  """
205
209
  target_obj = obj or self._parent.context
206
210
  if target_obj is None:
207
211
  raise ValueError("No object provided and no context available")
208
-
212
+
209
213
  # Resolve max_lines via alias `n` (n takes priority)
210
214
  if n is not None:
211
215
  if n <= 0:
@@ -213,16 +217,16 @@ class GuidesList(UserList):
213
217
  max_lines = n
214
218
 
215
219
  # Set appropriate max_lines parameter for underlying API
216
- max_lines_h = max_lines if self._axis == 'horizontal' else None
217
- max_lines_v = max_lines if self._axis == 'vertical' else None
218
-
220
+ max_lines_h = max_lines if self._axis == "horizontal" else None
221
+ max_lines_v = max_lines if self._axis == "vertical" else None
222
+
219
223
  # Map generic `min_gap` to axis-specific argument expected by detection
220
224
  if min_gap is not None:
221
225
  if min_gap < 1:
222
226
  raise ValueError("min_gap must be ≥ 1 pixel")
223
- axis_key = 'min_gap_h' if self._axis == 'horizontal' else 'min_gap_v'
227
+ axis_key = "min_gap_h" if self._axis == "horizontal" else "min_gap_v"
224
228
  detect_kwargs.setdefault(axis_key, min_gap)
225
-
229
+
226
230
  # Create guides for this axis
227
231
  new_guides = Guides.from_lines(
228
232
  obj=target_obj,
@@ -234,15 +238,15 @@ class GuidesList(UserList):
234
238
  outer=outer,
235
239
  detection_method=detection_method,
236
240
  resolution=resolution,
237
- **detect_kwargs
241
+ **detect_kwargs,
238
242
  )
239
-
243
+
240
244
  # Add to our list
241
- if self._axis == 'vertical':
245
+ if self._axis == "vertical":
242
246
  self.extend(new_guides.vertical)
243
247
  else:
244
248
  self.extend(new_guides.horizontal)
245
-
249
+
246
250
  # Remove duplicates
247
251
  seen = set()
248
252
  unique = []
@@ -251,41 +255,35 @@ class GuidesList(UserList):
251
255
  seen.add(x)
252
256
  unique.append(x)
253
257
  self.data = unique
254
-
258
+
255
259
  return self._parent
256
-
260
+
257
261
  def from_whitespace(
258
- self,
259
- obj: Optional[Union["Page", "Region"]] = None,
260
- min_gap: float = 10
262
+ self, obj: Optional[Union["Page", "Region"]] = None, min_gap: float = 10
261
263
  ) -> "Guides":
262
264
  """
263
265
  Create guides from whitespace gaps.
264
-
266
+
265
267
  Args:
266
268
  obj: Page/Region to analyze (uses parent's context if None)
267
269
  min_gap: Minimum gap size to consider
268
-
270
+
269
271
  Returns:
270
272
  Parent Guides object for chaining
271
273
  """
272
274
  target_obj = obj or self._parent.context
273
275
  if target_obj is None:
274
276
  raise ValueError("No object provided and no context available")
275
-
277
+
276
278
  # Create guides for this axis
277
- new_guides = Guides.from_whitespace(
278
- obj=target_obj,
279
- axis=self._axis,
280
- min_gap=min_gap
281
- )
282
-
279
+ new_guides = Guides.from_whitespace(obj=target_obj, axis=self._axis, min_gap=min_gap)
280
+
283
281
  # Add to our list
284
- if self._axis == 'vertical':
282
+ if self._axis == "vertical":
285
283
  self.extend(new_guides.vertical)
286
284
  else:
287
285
  self.extend(new_guides.horizontal)
288
-
286
+
289
287
  # Remove duplicates
290
288
  seen = set()
291
289
  unique = []
@@ -294,37 +292,33 @@ class GuidesList(UserList):
294
292
  seen.add(x)
295
293
  unique.append(x)
296
294
  self.data = unique
297
-
295
+
298
296
  return self._parent
299
-
297
+
300
298
  def divide(self, n: int = 2, obj: Optional[Union["Page", "Region"]] = None) -> "Guides":
301
299
  """
302
300
  Divide the space evenly along this axis.
303
-
301
+
304
302
  Args:
305
303
  n: Number of divisions (creates n-1 guides)
306
304
  obj: Object to divide (uses parent's context if None)
307
-
305
+
308
306
  Returns:
309
307
  Parent Guides object for chaining
310
308
  """
311
309
  target_obj = obj or self._parent.context
312
310
  if target_obj is None:
313
311
  raise ValueError("No object provided and no context available")
314
-
312
+
315
313
  # Create guides using divide
316
- new_guides = Guides.divide(
317
- obj=target_obj,
318
- n=n,
319
- axis=self._axis
320
- )
321
-
314
+ new_guides = Guides.divide(obj=target_obj, n=n, axis=self._axis)
315
+
322
316
  # Add to our list
323
- if self._axis == 'vertical':
317
+ if self._axis == "vertical":
324
318
  self.extend(new_guides.vertical)
325
319
  else:
326
320
  self.extend(new_guides.horizontal)
327
-
321
+
328
322
  # Remove duplicates
329
323
  seen = set()
330
324
  unique = []
@@ -333,45 +327,45 @@ class GuidesList(UserList):
333
327
  seen.add(x)
334
328
  unique.append(x)
335
329
  self.data = unique
336
-
330
+
337
331
  return self._parent
338
-
332
+
339
333
  def snap_to_whitespace(
340
334
  self,
341
335
  min_gap: float = 10.0,
342
- detection_method: str = 'pixels',
343
- threshold: Union[float, str] = 'auto',
344
- on_no_snap: str = 'warn',
345
- obj: Optional[Union["Page", "Region"]] = None
336
+ detection_method: str = "pixels",
337
+ threshold: Union[float, str] = "auto",
338
+ on_no_snap: str = "warn",
339
+ obj: Optional[Union["Page", "Region"]] = None,
346
340
  ) -> "Guides":
347
341
  """
348
342
  Snap guides in this axis to whitespace gaps.
349
-
343
+
350
344
  Args:
351
345
  min_gap: Minimum gap size to consider
352
346
  detection_method: 'pixels' or 'text' for gap detection
353
347
  threshold: Threshold for whitespace detection (0.0-1.0) or 'auto'
354
348
  on_no_snap: What to do when snapping fails ('warn', 'raise', 'ignore')
355
349
  obj: Object to analyze (uses parent's context if None)
356
-
350
+
357
351
  Returns:
358
352
  Parent Guides object for chaining
359
353
  """
360
354
  target_obj = obj or self._parent.context
361
355
  if target_obj is None:
362
356
  raise ValueError("No object provided and no context available")
363
-
357
+
364
358
  # Use the parent's snap_to_whitespace but only for this axis
365
359
  original_guides = self.data.copy()
366
-
360
+
367
361
  # Temporarily set the parent's guides to only this axis
368
- if self._axis == 'vertical':
362
+ if self._axis == "vertical":
369
363
  original_horizontal = self._parent.horizontal.data.copy()
370
364
  self._parent.horizontal.data = []
371
365
  else:
372
366
  original_vertical = self._parent.vertical.data.copy()
373
367
  self._parent.vertical.data = []
374
-
368
+
375
369
  try:
376
370
  # Call the parent's method
377
371
  self._parent.snap_to_whitespace(
@@ -379,140 +373,143 @@ class GuidesList(UserList):
379
373
  min_gap=min_gap,
380
374
  detection_method=detection_method,
381
375
  threshold=threshold,
382
- on_no_snap=on_no_snap
376
+ on_no_snap=on_no_snap,
383
377
  )
384
-
378
+
385
379
  # Update our data from the parent
386
- if self._axis == 'vertical':
380
+ if self._axis == "vertical":
387
381
  self.data = self._parent.vertical.data.copy()
388
382
  else:
389
383
  self.data = self._parent.horizontal.data.copy()
390
-
384
+
391
385
  finally:
392
386
  # Restore the other axis
393
- if self._axis == 'vertical':
387
+ if self._axis == "vertical":
394
388
  self._parent.horizontal.data = original_horizontal
395
389
  else:
396
390
  self._parent.vertical.data = original_vertical
397
-
391
+
398
392
  return self._parent
399
-
393
+
400
394
  def snap_to_content(
401
395
  self,
402
- markers: Union[str, List[str], "ElementCollection", None] = 'text',
403
- align: Literal['left', 'right', 'center'] = 'left',
396
+ markers: Union[str, List[str], "ElementCollection", None] = "text",
397
+ align: Literal["left", "right", "center"] = "left",
404
398
  tolerance: float = 5,
405
- obj: Optional[Union["Page", "Region"]] = None
399
+ obj: Optional[Union["Page", "Region"]] = None,
406
400
  ) -> "Guides":
407
401
  """
408
402
  Snap guides in this axis to nearby text content.
409
-
403
+
410
404
  Args:
411
405
  markers: Content to snap to. Can be:
412
406
  - str: single selector or literal text (default: 'text' for all text)
413
- - List[str]: list of selectors or literal text strings
407
+ - List[str]: list of selectors or literal text strings
414
408
  - ElementCollection: collection of elements
415
409
  - None: no markers (no snapping)
416
410
  align: How to align to the found text
417
411
  tolerance: Maximum distance to move when snapping
418
412
  obj: Object to search (uses parent's context if None)
419
-
413
+
420
414
  Returns:
421
415
  Parent Guides object for chaining
422
416
  """
423
417
  target_obj = obj or self._parent.context
424
418
  if target_obj is None:
425
419
  raise ValueError("No object provided and no context available")
426
-
420
+
427
421
  # Handle special case of 'text' as a selector for all text
428
- if markers == 'text':
422
+ if markers == "text":
429
423
  # Get all text elements
430
- if hasattr(target_obj, 'find_all'):
431
- text_elements = target_obj.find_all('text')
432
- if hasattr(text_elements, 'elements'):
424
+ if hasattr(target_obj, "find_all"):
425
+ text_elements = target_obj.find_all("text")
426
+ if hasattr(text_elements, "elements"):
433
427
  text_elements = text_elements.elements
434
-
428
+
435
429
  # Snap each guide to the nearest text element
436
430
  for i, guide_pos in enumerate(self.data):
437
- best_distance = float('inf')
431
+ best_distance = float("inf")
438
432
  best_pos = guide_pos
439
-
433
+
440
434
  for elem in text_elements:
441
435
  # Calculate target position based on alignment
442
- if self._axis == 'vertical':
443
- if align == 'left':
436
+ if self._axis == "vertical":
437
+ if align == "left":
444
438
  elem_pos = elem.x0
445
- elif align == 'right':
439
+ elif align == "right":
446
440
  elem_pos = elem.x1
447
441
  else: # center
448
442
  elem_pos = (elem.x0 + elem.x1) / 2
449
443
  else: # horizontal
450
- if align == 'left': # top for horizontal
444
+ if align == "left": # top for horizontal
451
445
  elem_pos = elem.top
452
- elif align == 'right': # bottom for horizontal
446
+ elif align == "right": # bottom for horizontal
453
447
  elem_pos = elem.bottom
454
448
  else: # center
455
449
  elem_pos = (elem.top + elem.bottom) / 2
456
-
450
+
457
451
  # Check if this is closer than current best
458
452
  distance = abs(guide_pos - elem_pos)
459
453
  if distance < best_distance and distance <= tolerance:
460
454
  best_distance = distance
461
455
  best_pos = elem_pos
462
-
456
+
463
457
  # Update guide position if we found a good snap
464
458
  if best_pos != guide_pos:
465
459
  self.data[i] = best_pos
466
- logger.debug(f"Snapped {self._axis} guide from {guide_pos:.1f} to {best_pos:.1f}")
460
+ logger.debug(
461
+ f"Snapped {self._axis} guide from {guide_pos:.1f} to {best_pos:.1f}"
462
+ )
467
463
  else:
468
464
  logger.warning("Object does not support find_all for text snapping")
469
465
  else:
470
466
  # Original behavior for specific markers
471
467
  marker_texts = _normalize_markers(markers, target_obj)
472
-
468
+
473
469
  # Find each marker and snap guides
474
470
  for marker in marker_texts:
475
- if hasattr(target_obj, 'find'):
471
+ if hasattr(target_obj, "find"):
476
472
  element = target_obj.find(f'text:contains("{marker}")')
477
473
  if not element:
478
474
  logger.warning(f"Could not find text '{marker}' for snapping")
479
475
  continue
480
-
476
+
481
477
  # Determine target position based on alignment
482
- if self._axis == 'vertical':
483
- if align == 'left':
478
+ if self._axis == "vertical":
479
+ if align == "left":
484
480
  target_pos = element.x0
485
- elif align == 'right':
481
+ elif align == "right":
486
482
  target_pos = element.x1
487
483
  else: # center
488
484
  target_pos = (element.x0 + element.x1) / 2
489
485
  else: # horizontal
490
- if align == 'left': # top for horizontal
486
+ if align == "left": # top for horizontal
491
487
  target_pos = element.top
492
- elif align == 'right': # bottom for horizontal
488
+ elif align == "right": # bottom for horizontal
493
489
  target_pos = element.bottom
494
490
  else: # center
495
491
  target_pos = (element.top + element.bottom) / 2
496
-
492
+
497
493
  # Find closest guide and snap if within tolerance
498
494
  if self.data:
499
- closest_idx = min(range(len(self.data)),
500
- key=lambda i: abs(self.data[i] - target_pos))
495
+ closest_idx = min(
496
+ range(len(self.data)), key=lambda i: abs(self.data[i] - target_pos)
497
+ )
501
498
  if abs(self.data[closest_idx] - target_pos) <= tolerance:
502
499
  self.data[closest_idx] = target_pos
503
-
500
+
504
501
  # Sort after snapping
505
502
  self.data.sort()
506
503
  return self._parent
507
-
504
+
508
505
  def shift(self, index: int, offset: float) -> "Guides":
509
506
  """
510
507
  Move a specific guide in this axis by a offset amount.
511
-
508
+
512
509
  Args:
513
510
  index: Index of the guide to move
514
511
  offset: Amount to move (positive = right/down)
515
-
512
+
516
513
  Returns:
517
514
  Parent Guides object for chaining
518
515
  """
@@ -521,18 +518,18 @@ class GuidesList(UserList):
521
518
  self.data.sort()
522
519
  else:
523
520
  logger.warning(f"Guide index {index} out of range for {self._axis} axis")
524
-
521
+
525
522
  return self._parent
526
-
523
+
527
524
  def add(self, position: Union[float, List[float]]) -> "Guides":
528
525
  """
529
526
  Add one or more guides at the specified position(s).
530
-
527
+
531
528
  Args:
532
529
  position: Coordinate(s) to add guide(s) at. Can be:
533
530
  - float: single position
534
531
  - List[float]: multiple positions
535
-
532
+
536
533
  Returns:
537
534
  Parent Guides object for chaining
538
535
  """
@@ -543,34 +540,34 @@ class GuidesList(UserList):
543
540
  else:
544
541
  # Add single position
545
542
  self.append(float(position))
546
-
543
+
547
544
  self.data.sort()
548
545
  return self._parent
549
-
546
+
550
547
  def remove_at(self, index: int) -> "Guides":
551
548
  """
552
549
  Remove a guide by index.
553
-
550
+
554
551
  Args:
555
552
  index: Index of guide to remove
556
-
553
+
557
554
  Returns:
558
555
  Parent Guides object for chaining
559
556
  """
560
557
  if 0 <= index < len(self.data):
561
558
  self.data.pop(index)
562
559
  return self._parent
563
-
560
+
564
561
  def clear_all(self) -> "Guides":
565
562
  """
566
563
  Remove all guides from this axis.
567
-
564
+
568
565
  Returns:
569
566
  Parent Guides object for chaining
570
567
  """
571
568
  self.data.clear()
572
569
  return self._parent
573
-
570
+
574
571
  def __add__(self, other):
575
572
  """Handle addition of GuidesList objects by returning combined data."""
576
573
  if isinstance(other, GuidesList):
@@ -584,11 +581,11 @@ class GuidesList(UserList):
584
581
  class Guides:
585
582
  """
586
583
  Manages vertical and horizontal guide lines for table extraction and layout analysis.
587
-
584
+
588
585
  Guides are collections of coordinates that can be used to define table boundaries,
589
586
  column positions, or general layout structures. They can be created through various
590
587
  detection methods or manually specified.
591
-
588
+
592
589
  Attributes:
593
590
  verticals: List of x-coordinates for vertical guide lines
594
591
  horizontals: List of y-coordinates for horizontal guide lines
@@ -596,7 +593,7 @@ class Guides:
596
593
  bounds: Optional bounding box (x0, y0, x1, y1) for relative coordinate conversion
597
594
  snap_behavior: How to handle failed snapping operations ('warn', 'ignore', 'raise')
598
595
  """
599
-
596
+
600
597
  def __init__(
601
598
  self,
602
599
  verticals: Optional[Union[List[float], "Page", "Region"]] = None,
@@ -604,52 +601,63 @@ class Guides:
604
601
  context: Optional[Union["Page", "Region"]] = None,
605
602
  bounds: Optional[Tuple[float, float, float, float]] = None,
606
603
  relative: bool = False,
607
- snap_behavior: Literal['raise', 'warn', 'ignore'] = 'warn'
604
+ snap_behavior: Literal["raise", "warn", "ignore"] = "warn",
608
605
  ):
609
606
  """
610
607
  Initialize a Guides object.
611
-
608
+
612
609
  Args:
613
610
  verticals: List of x-coordinates for vertical guides, or a Page/Region as context
614
- horizontals: List of y-coordinates for horizontal guides
611
+ horizontals: List of y-coordinates for horizontal guides
615
612
  context: Page or Region object these guides were created from
616
613
  bounds: Bounding box (x0, top, x1, bottom) if context not provided
617
614
  relative: Whether coordinates are relative (0-1) or absolute
618
615
  snap_behavior: How to handle snapping conflicts ('raise', 'warn', or 'ignore')
619
616
  """
620
617
  # Handle Guides(page) shorthand
621
- if verticals is not None and not isinstance(verticals, (list, tuple)) and horizontals is None and context is None:
618
+ if (
619
+ verticals is not None
620
+ and not isinstance(verticals, (list, tuple))
621
+ and horizontals is None
622
+ and context is None
623
+ ):
622
624
  # First argument is a page/region, not coordinates
623
625
  context = verticals
624
626
  verticals = None
625
-
627
+
626
628
  self.context = context
627
629
  self.bounds = bounds
628
630
  self.relative = relative
629
631
  self.snap_behavior = snap_behavior
630
-
632
+
631
633
  # Initialize with GuidesList instances
632
634
  self._vertical = GuidesList(self, "vertical", sorted([float(x) for x in (verticals or [])]))
633
- self._horizontal = GuidesList(self, "horizontal", sorted([float(y) for y in (horizontals or [])]))
634
-
635
+ self._horizontal = GuidesList(
636
+ self, "horizontal", sorted([float(y) for y in (horizontals or [])])
637
+ )
638
+
635
639
  # Determine bounds from context if needed
636
640
  if self.bounds is None and self.context is not None:
637
- if hasattr(self.context, 'bbox'):
641
+ if hasattr(self.context, "bbox"):
638
642
  self.bounds = self.context.bbox
639
- elif hasattr(self.context, 'x0'):
640
- self.bounds = (self.context.x0, self.context.top,
641
- self.context.x1, self.context.bottom)
642
-
643
+ elif hasattr(self.context, "x0"):
644
+ self.bounds = (
645
+ self.context.x0,
646
+ self.context.top,
647
+ self.context.x1,
648
+ self.context.bottom,
649
+ )
650
+
643
651
  # Convert relative to absolute if needed
644
652
  if self.relative and self.bounds:
645
653
  x0, top, x1, bottom = self.bounds
646
654
  width = x1 - x0
647
655
  height = bottom - top
648
-
656
+
649
657
  self._vertical.data = [x0 + v * width for v in self._vertical]
650
658
  self._horizontal.data = [top + h * height for h in self._horizontal]
651
659
  self.relative = False
652
-
660
+
653
661
  @property
654
662
  def vertical(self) -> GuidesList:
655
663
  """Get vertical guide coordinates."""
@@ -665,8 +673,10 @@ class Guides:
665
673
  self._vertical.data = sorted([float(x) for x in value.vertical])
666
674
  elif isinstance(value, str):
667
675
  # Explicitly reject strings to avoid confusing iteration over characters
668
- raise TypeError(f"vertical cannot be a string, got '{value}'. Use a list of coordinates or Guides object.")
669
- elif hasattr(value, '__iter__'):
676
+ raise TypeError(
677
+ f"vertical cannot be a string, got '{value}'. Use a list of coordinates or Guides object."
678
+ )
679
+ elif hasattr(value, "__iter__"):
670
680
  # Handle list/tuple of coordinates
671
681
  try:
672
682
  self._vertical.data = sorted([float(x) for x in value])
@@ -690,8 +700,10 @@ class Guides:
690
700
  self._horizontal.data = sorted([float(y) for y in value.horizontal])
691
701
  elif isinstance(value, str):
692
702
  # Explicitly reject strings
693
- raise TypeError(f"horizontal cannot be a string, got '{value}'. Use a list of coordinates or Guides object.")
694
- elif hasattr(value, '__iter__'):
703
+ raise TypeError(
704
+ f"horizontal cannot be a string, got '{value}'. Use a list of coordinates or Guides object."
705
+ )
706
+ elif hasattr(value, "__iter__"):
695
707
  # Handle list/tuple of coordinates
696
708
  try:
697
709
  self._horizontal.data = sorted([float(y) for y in value])
@@ -699,24 +711,24 @@ class Guides:
699
711
  raise TypeError(f"horizontal must contain numeric values, got {value}: {e}")
700
712
  else:
701
713
  raise TypeError(f"horizontal must be a list, Guides object, or None, got {type(value)}")
702
-
714
+
703
715
  def _get_context_bounds(self) -> Optional[Tuple[float, float, float, float]]:
704
716
  """Get bounds from context if available."""
705
717
  if self.context is None:
706
718
  return None
707
-
708
- if hasattr(self.context, 'bbox'):
719
+
720
+ if hasattr(self.context, "bbox"):
709
721
  return self.context.bbox
710
- elif hasattr(self.context, 'x0') and hasattr(self.context, 'top'):
722
+ elif hasattr(self.context, "x0") and hasattr(self.context, "top"):
711
723
  return (self.context.x0, self.context.top, self.context.x1, self.context.bottom)
712
- elif hasattr(self.context, 'width') and hasattr(self.context, 'height'):
724
+ elif hasattr(self.context, "width") and hasattr(self.context, "height"):
713
725
  return (0, 0, self.context.width, self.context.height)
714
726
  return None
715
-
727
+
716
728
  # -------------------------------------------------------------------------
717
729
  # Factory Methods
718
730
  # -------------------------------------------------------------------------
719
-
731
+
720
732
  @classmethod
721
733
  def divide(
722
734
  cls,
@@ -724,28 +736,28 @@ class Guides:
724
736
  n: Optional[int] = None,
725
737
  cols: Optional[int] = None,
726
738
  rows: Optional[int] = None,
727
- axis: Literal['vertical', 'horizontal', 'both'] = 'both'
739
+ axis: Literal["vertical", "horizontal", "both"] = "both",
728
740
  ) -> "Guides":
729
741
  """
730
742
  Create guides by evenly dividing an object.
731
-
743
+
732
744
  Args:
733
745
  obj: Object to divide (Page, Region, or bbox tuple)
734
746
  n: Number of divisions (creates n+1 guides). Used if cols/rows not specified.
735
747
  cols: Number of columns (creates cols+1 vertical guides)
736
748
  rows: Number of rows (creates rows+1 horizontal guides)
737
749
  axis: Which axis to divide along
738
-
750
+
739
751
  Returns:
740
752
  New Guides object with evenly spaced lines
741
-
753
+
742
754
  Examples:
743
755
  # Divide into 3 columns
744
756
  guides = Guides.divide(page, cols=3)
745
-
757
+
746
758
  # Divide into 5 rows
747
759
  guides = Guides.divide(region, rows=5)
748
-
760
+
749
761
  # Divide both axes
750
762
  guides = Guides.divide(page, cols=3, rows=5)
751
763
  """
@@ -755,52 +767,52 @@ class Guides:
755
767
  context = None
756
768
  else:
757
769
  context = obj
758
- if hasattr(obj, 'bbox'):
770
+ if hasattr(obj, "bbox"):
759
771
  bounds = obj.bbox
760
- elif hasattr(obj, 'x0'):
772
+ elif hasattr(obj, "x0"):
761
773
  bounds = (obj.x0, obj.top, obj.x1, obj.bottom)
762
774
  else:
763
775
  bounds = (0, 0, obj.width, obj.height)
764
-
776
+
765
777
  x0, y0, x1, y1 = bounds
766
778
  verticals = []
767
779
  horizontals = []
768
-
780
+
769
781
  # Handle vertical guides
770
- if axis in ('vertical', 'both'):
782
+ if axis in ("vertical", "both"):
771
783
  n_vertical = cols + 1 if cols is not None else (n + 1 if n is not None else 0)
772
784
  if n_vertical > 0:
773
785
  for i in range(n_vertical):
774
786
  x = x0 + (x1 - x0) * i / (n_vertical - 1)
775
787
  verticals.append(float(x))
776
-
788
+
777
789
  # Handle horizontal guides
778
- if axis in ('horizontal', 'both'):
790
+ if axis in ("horizontal", "both"):
779
791
  n_horizontal = rows + 1 if rows is not None else (n + 1 if n is not None else 0)
780
792
  if n_horizontal > 0:
781
793
  for i in range(n_horizontal):
782
794
  y = y0 + (y1 - y0) * i / (n_horizontal - 1)
783
795
  horizontals.append(float(y))
784
-
796
+
785
797
  return cls(verticals=verticals, horizontals=horizontals, context=context, bounds=bounds)
786
-
798
+
787
799
  @classmethod
788
800
  def from_lines(
789
801
  cls,
790
802
  obj: Union["Page", "Region"],
791
- axis: Literal['vertical', 'horizontal', 'both'] = 'both',
792
- threshold: Union[float, str] = 'auto',
803
+ axis: Literal["vertical", "horizontal", "both"] = "both",
804
+ threshold: Union[float, str] = "auto",
793
805
  source_label: Optional[str] = None,
794
806
  max_lines_h: Optional[int] = None,
795
807
  max_lines_v: Optional[int] = None,
796
808
  outer: bool = False,
797
- detection_method: str = 'vector',
809
+ detection_method: str = "vector",
798
810
  resolution: int = 192,
799
- **detect_kwargs
811
+ **detect_kwargs,
800
812
  ) -> "Guides":
801
813
  """
802
814
  Create guides from detected line elements.
803
-
815
+
804
816
  Args:
805
817
  obj: Page or Region to detect lines from
806
818
  axis: Which orientations to detect
@@ -818,108 +830,128 @@ class Guides:
818
830
  - morph_op_h/v: Morphological operations ('open', 'close', 'none')
819
831
  - smoothing_sigma_h/v: Gaussian smoothing sigma
820
832
  - method: 'projection' (default) or 'lsd' (requires opencv)
821
-
833
+
822
834
  Returns:
823
835
  New Guides object with detected line positions
824
836
  """
825
837
  # Get bounds for potential outer guides
826
- if hasattr(obj, 'bbox'):
838
+ if hasattr(obj, "bbox"):
827
839
  bounds = obj.bbox
828
- elif hasattr(obj, 'x0'):
840
+ elif hasattr(obj, "x0"):
829
841
  bounds = (obj.x0, obj.top, obj.x1, obj.bottom)
830
- elif hasattr(obj, 'width'):
842
+ elif hasattr(obj, "width"):
831
843
  bounds = (0, 0, obj.width, obj.height)
832
844
  else:
833
845
  bounds = None
834
-
846
+
835
847
  verticals = []
836
848
  horizontals = []
837
-
838
- if detection_method == 'pixels':
849
+
850
+ if detection_method == "pixels":
839
851
  # Use pixel-based line detection
840
- if not hasattr(obj, 'detect_lines'):
852
+ if not hasattr(obj, "detect_lines"):
841
853
  raise ValueError(f"Object {obj} does not support pixel-based line detection")
842
-
854
+
843
855
  # Set up detection parameters
844
856
  detect_params = {
845
- 'resolution': resolution,
846
- 'source_label': source_label or 'guides_detection',
847
- 'horizontal': axis in ('horizontal', 'both'),
848
- 'vertical': axis in ('vertical', 'both'),
849
- 'replace': True, # Replace any existing lines with this source
850
- 'method': detect_kwargs.get('method', 'projection'),
857
+ "resolution": resolution,
858
+ "source_label": source_label or "guides_detection",
859
+ "horizontal": axis in ("horizontal", "both"),
860
+ "vertical": axis in ("vertical", "both"),
861
+ "replace": True, # Replace any existing lines with this source
862
+ "method": detect_kwargs.get("method", "projection"),
851
863
  }
852
-
864
+
853
865
  # Handle threshold parameter
854
- if threshold == 'auto':
866
+ if threshold == "auto":
855
867
  # Auto mode: use very low thresholds with max_lines constraints
856
- detect_params['peak_threshold_h'] = 0.0
857
- detect_params['peak_threshold_v'] = 0.0
858
- detect_params['max_lines_h'] = max_lines_h
859
- detect_params['max_lines_v'] = max_lines_v
868
+ detect_params["peak_threshold_h"] = 0.0
869
+ detect_params["peak_threshold_v"] = 0.0
870
+ detect_params["max_lines_h"] = max_lines_h
871
+ detect_params["max_lines_v"] = max_lines_v
860
872
  else:
861
873
  # Fixed threshold mode
862
- detect_params['peak_threshold_h'] = float(threshold) if axis in ('horizontal', 'both') else 1.0
863
- detect_params['peak_threshold_v'] = float(threshold) if axis in ('vertical', 'both') else 1.0
864
- detect_params['max_lines_h'] = max_lines_h
865
- detect_params['max_lines_v'] = max_lines_v
866
-
874
+ detect_params["peak_threshold_h"] = (
875
+ float(threshold) if axis in ("horizontal", "both") else 1.0
876
+ )
877
+ detect_params["peak_threshold_v"] = (
878
+ float(threshold) if axis in ("vertical", "both") else 1.0
879
+ )
880
+ detect_params["max_lines_h"] = max_lines_h
881
+ detect_params["max_lines_v"] = max_lines_v
882
+
867
883
  # Add any additional detection parameters
868
- for key in ['min_gap_h', 'min_gap_v', 'binarization_method',
869
- 'adaptive_thresh_block_size', 'adaptive_thresh_C_val',
870
- 'morph_op_h', 'morph_kernel_h', 'morph_op_v', 'morph_kernel_v',
871
- 'smoothing_sigma_h', 'smoothing_sigma_v', 'peak_width_rel_height']:
884
+ for key in [
885
+ "min_gap_h",
886
+ "min_gap_v",
887
+ "binarization_method",
888
+ "adaptive_thresh_block_size",
889
+ "adaptive_thresh_C_val",
890
+ "morph_op_h",
891
+ "morph_kernel_h",
892
+ "morph_op_v",
893
+ "morph_kernel_v",
894
+ "smoothing_sigma_h",
895
+ "smoothing_sigma_v",
896
+ "peak_width_rel_height",
897
+ ]:
872
898
  if key in detect_kwargs:
873
899
  detect_params[key] = detect_kwargs[key]
874
-
900
+
875
901
  # Perform the detection
876
902
  obj.detect_lines(**detect_params)
877
-
903
+
878
904
  # Now get the detected lines and use them
879
- if hasattr(obj, 'lines'):
905
+ if hasattr(obj, "lines"):
880
906
  lines = obj.lines
881
- elif hasattr(obj, 'find_all'):
882
- lines = obj.find_all('line')
907
+ elif hasattr(obj, "find_all"):
908
+ lines = obj.find_all("line")
883
909
  else:
884
910
  lines = []
885
-
911
+
886
912
  # Filter by the source we just used
887
- lines = [l for l in lines if getattr(l, 'source', None) == detect_params['source_label']]
888
-
913
+ lines = [
914
+ l for l in lines if getattr(l, "source", None) == detect_params["source_label"]
915
+ ]
916
+
889
917
  else: # detection_method == 'vector' (default)
890
918
  # Get existing lines from the object
891
- if hasattr(obj, 'lines'):
919
+ if hasattr(obj, "lines"):
892
920
  lines = obj.lines
893
- elif hasattr(obj, 'find_all'):
894
- lines = obj.find_all('line')
921
+ elif hasattr(obj, "find_all"):
922
+ lines = obj.find_all("line")
895
923
  else:
896
924
  logger.warning(f"Object {obj} has no lines or find_all method")
897
925
  lines = []
898
-
926
+
899
927
  # Filter by source if specified
900
928
  if source_label:
901
- lines = [l for l in lines if getattr(l, 'source', None) == source_label]
902
-
929
+ lines = [l for l in lines if getattr(l, "source", None) == source_label]
930
+
903
931
  # Process lines (same logic for both methods)
904
932
  # Separate lines by orientation and collect with metadata for ranking
905
933
  h_line_data = [] # (y_coord, length, line_obj)
906
934
  v_line_data = [] # (x_coord, length, line_obj)
907
-
935
+
908
936
  for line in lines:
909
- if hasattr(line, 'is_horizontal') and hasattr(line, 'is_vertical'):
910
- if line.is_horizontal and axis in ('horizontal', 'both'):
937
+ if hasattr(line, "is_horizontal") and hasattr(line, "is_vertical"):
938
+ if line.is_horizontal and axis in ("horizontal", "both"):
911
939
  # Use the midpoint y-coordinate for horizontal lines
912
940
  y = (line.top + line.bottom) / 2
913
941
  # Calculate line length for ranking
914
- length = getattr(line, 'width', abs(getattr(line, 'x1', 0) - getattr(line, 'x0', 0)))
942
+ length = getattr(
943
+ line, "width", abs(getattr(line, "x1", 0) - getattr(line, "x0", 0))
944
+ )
915
945
  h_line_data.append((y, length, line))
916
- elif line.is_vertical and axis in ('vertical', 'both'):
946
+ elif line.is_vertical and axis in ("vertical", "both"):
917
947
  # Use the midpoint x-coordinate for vertical lines
918
948
  x = (line.x0 + line.x1) / 2
919
949
  # Calculate line length for ranking
920
- length = getattr(line, 'height', abs(getattr(line, 'bottom', 0) - getattr(line, 'top', 0)))
950
+ length = getattr(
951
+ line, "height", abs(getattr(line, "bottom", 0) - getattr(line, "top", 0))
952
+ )
921
953
  v_line_data.append((x, length, line))
922
-
954
+
923
955
  # Process horizontal lines
924
956
  if max_lines_h is not None and h_line_data:
925
957
  # Sort by length (longer lines are typically more significant)
@@ -928,12 +960,14 @@ class Guides:
928
960
  selected_h = h_line_data[:max_lines_h]
929
961
  # Extract just the coordinates and sort by position
930
962
  horizontals = sorted([coord for coord, _, _ in selected_h])
931
- logger.debug(f"Selected {len(horizontals)} horizontal lines from {len(h_line_data)} candidates")
963
+ logger.debug(
964
+ f"Selected {len(horizontals)} horizontal lines from {len(h_line_data)} candidates"
965
+ )
932
966
  else:
933
967
  # Use all horizontal lines (original behavior)
934
968
  horizontals = [coord for coord, _, _ in h_line_data]
935
969
  horizontals = sorted(list(set(horizontals)))
936
-
970
+
937
971
  # Process vertical lines
938
972
  if max_lines_v is not None and v_line_data:
939
973
  # Sort by length (longer lines are typically more significant)
@@ -942,115 +976,117 @@ class Guides:
942
976
  selected_v = v_line_data[:max_lines_v]
943
977
  # Extract just the coordinates and sort by position
944
978
  verticals = sorted([coord for coord, _, _ in selected_v])
945
- logger.debug(f"Selected {len(verticals)} vertical lines from {len(v_line_data)} candidates")
979
+ logger.debug(
980
+ f"Selected {len(verticals)} vertical lines from {len(v_line_data)} candidates"
981
+ )
946
982
  else:
947
983
  # Use all vertical lines (original behavior)
948
984
  verticals = [coord for coord, _, _ in v_line_data]
949
985
  verticals = sorted(list(set(verticals)))
950
-
986
+
951
987
  # Add outer guides if requested
952
988
  if outer and bounds:
953
- if axis in ('vertical', 'both'):
989
+ if axis in ("vertical", "both"):
954
990
  if not verticals or verticals[0] > bounds[0]:
955
991
  verticals.insert(0, bounds[0]) # x0
956
992
  if not verticals or verticals[-1] < bounds[2]:
957
993
  verticals.append(bounds[2]) # x1
958
- if axis in ('horizontal', 'both'):
994
+ if axis in ("horizontal", "both"):
959
995
  if not horizontals or horizontals[0] > bounds[1]:
960
996
  horizontals.insert(0, bounds[1]) # y0
961
997
  if not horizontals or horizontals[-1] < bounds[3]:
962
998
  horizontals.append(bounds[3]) # y1
963
-
999
+
964
1000
  # Remove duplicates and sort again
965
1001
  verticals = sorted(list(set(verticals)))
966
1002
  horizontals = sorted(list(set(horizontals)))
967
-
1003
+
968
1004
  return cls(verticals=verticals, horizontals=horizontals, context=obj, bounds=bounds)
969
-
1005
+
970
1006
  @classmethod
971
1007
  def from_content(
972
1008
  cls,
973
1009
  obj: Union["Page", "Region"],
974
- axis: Literal['vertical', 'horizontal'] = 'vertical',
1010
+ axis: Literal["vertical", "horizontal"] = "vertical",
975
1011
  markers: Union[str, List[str], "ElementCollection", None] = None,
976
- align: Literal['left', 'right', 'center', 'between'] = 'left',
1012
+ align: Literal["left", "right", "center", "between"] = "left",
977
1013
  outer: bool = True,
978
- tolerance: float = 5
1014
+ tolerance: float = 5,
979
1015
  ) -> "Guides":
980
1016
  """
981
1017
  Create guides based on text content positions.
982
-
1018
+
983
1019
  Args:
984
1020
  obj: Page or Region to search for content
985
1021
  axis: Whether to create vertical or horizontal guides
986
1022
  markers: Content to search for. Can be:
987
1023
  - str: single selector (e.g., 'text:contains("Name")') or literal text
988
- - List[str]: list of selectors or literal text strings
1024
+ - List[str]: list of selectors or literal text strings
989
1025
  - ElementCollection: collection of elements to extract text from
990
1026
  - None: no markers
991
1027
  align: Where to place guides relative to found text
992
1028
  outer: Whether to add guides at the boundaries
993
1029
  tolerance: Maximum distance to search for text
994
-
1030
+
995
1031
  Returns:
996
1032
  New Guides object aligned to text content
997
1033
  """
998
1034
  guides_coords = []
999
1035
  bounds = None
1000
-
1036
+
1001
1037
  # Get bounds from object
1002
- if hasattr(obj, 'bbox'):
1038
+ if hasattr(obj, "bbox"):
1003
1039
  bounds = obj.bbox
1004
- elif hasattr(obj, 'x0'):
1040
+ elif hasattr(obj, "x0"):
1005
1041
  bounds = (obj.x0, obj.top, obj.x1, obj.bottom)
1006
- elif hasattr(obj, 'width'):
1042
+ elif hasattr(obj, "width"):
1007
1043
  bounds = (0, 0, obj.width, obj.height)
1008
-
1044
+
1009
1045
  # Normalize markers to list of text strings
1010
1046
  marker_texts = _normalize_markers(markers, obj)
1011
-
1047
+
1012
1048
  # Find each marker and determine guide position
1013
1049
  for marker in marker_texts:
1014
- if hasattr(obj, 'find'):
1050
+ if hasattr(obj, "find"):
1015
1051
  element = obj.find(f'text:contains("{marker}")')
1016
1052
  if element:
1017
- if axis == 'vertical':
1018
- if align == 'left':
1053
+ if axis == "vertical":
1054
+ if align == "left":
1019
1055
  guides_coords.append(element.x0)
1020
- elif align == 'right':
1056
+ elif align == "right":
1021
1057
  guides_coords.append(element.x1)
1022
- elif align == 'center':
1058
+ elif align == "center":
1023
1059
  guides_coords.append((element.x0 + element.x1) / 2)
1024
- elif align == 'between':
1060
+ elif align == "between":
1025
1061
  # For between, collect left edges for processing later
1026
1062
  guides_coords.append(element.x0)
1027
1063
  else: # horizontal
1028
- if align == 'left': # top for horizontal
1064
+ if align == "left": # top for horizontal
1029
1065
  guides_coords.append(element.top)
1030
- elif align == 'right': # bottom for horizontal
1066
+ elif align == "right": # bottom for horizontal
1031
1067
  guides_coords.append(element.bottom)
1032
- elif align == 'center':
1068
+ elif align == "center":
1033
1069
  guides_coords.append((element.top + element.bottom) / 2)
1034
- elif align == 'between':
1070
+ elif align == "between":
1035
1071
  # For between, collect top edges for processing later
1036
1072
  guides_coords.append(element.top)
1037
-
1073
+
1038
1074
  # Handle 'between' alignment - find midpoints between adjacent markers
1039
- if align == 'between' and len(guides_coords) >= 2:
1075
+ if align == "between" and len(guides_coords) >= 2:
1040
1076
  # We need to get the right and left edges of each marker
1041
1077
  marker_bounds = []
1042
1078
  for marker in marker_texts:
1043
- if hasattr(obj, 'find'):
1079
+ if hasattr(obj, "find"):
1044
1080
  element = obj.find(f'text:contains("{marker}")')
1045
1081
  if element:
1046
- if axis == 'vertical':
1082
+ if axis == "vertical":
1047
1083
  marker_bounds.append((element.x0, element.x1))
1048
1084
  else: # horizontal
1049
1085
  marker_bounds.append((element.top, element.bottom))
1050
-
1086
+
1051
1087
  # Sort markers by their left edge (or top edge for horizontal)
1052
1088
  marker_bounds.sort(key=lambda x: x[0])
1053
-
1089
+
1054
1090
  # Create guides at midpoints between adjacent markers
1055
1091
  between_coords = []
1056
1092
  for i in range(len(marker_bounds) - 1):
@@ -1059,79 +1095,78 @@ class Guides:
1059
1095
  left_edge_next = marker_bounds[i + 1][0]
1060
1096
  midpoint = (right_edge_current + left_edge_next) / 2
1061
1097
  between_coords.append(midpoint)
1062
-
1098
+
1063
1099
  guides_coords = between_coords
1064
-
1100
+
1065
1101
  # Add outer guides if requested
1066
1102
  if outer and bounds:
1067
- if axis == 'vertical':
1103
+ if axis == "vertical":
1068
1104
  guides_coords.insert(0, bounds[0]) # x0
1069
- guides_coords.append(bounds[2]) # x1
1105
+ guides_coords.append(bounds[2]) # x1
1070
1106
  else:
1071
1107
  guides_coords.insert(0, bounds[1]) # y0
1072
- guides_coords.append(bounds[3]) # y1
1073
-
1108
+ guides_coords.append(bounds[3]) # y1
1109
+
1074
1110
  # Remove duplicates and sort
1075
1111
  guides_coords = sorted(list(set(guides_coords)))
1076
-
1112
+
1077
1113
  # Create guides object
1078
- if axis == 'vertical':
1114
+ if axis == "vertical":
1079
1115
  return cls(verticals=guides_coords, context=obj, bounds=bounds)
1080
1116
  else:
1081
1117
  return cls(horizontals=guides_coords, context=obj, bounds=bounds)
1082
-
1118
+
1083
1119
  @classmethod
1084
1120
  def from_whitespace(
1085
1121
  cls,
1086
1122
  obj: Union["Page", "Region"],
1087
- axis: Literal['vertical', 'horizontal', 'both'] = 'both',
1088
- min_gap: float = 10
1123
+ axis: Literal["vertical", "horizontal", "both"] = "both",
1124
+ min_gap: float = 10,
1089
1125
  ) -> "Guides":
1090
1126
  """
1091
1127
  Create guides by detecting whitespace gaps.
1092
-
1128
+
1093
1129
  Args:
1094
1130
  obj: Page or Region to analyze
1095
1131
  min_gap: Minimum gap size to consider as whitespace
1096
1132
  axis: Which axes to analyze for gaps
1097
-
1133
+
1098
1134
  Returns:
1099
1135
  New Guides object positioned at whitespace gaps
1100
1136
  """
1101
1137
  # This is a placeholder - would need sophisticated gap detection
1102
1138
  logger.info("Whitespace detection not yet implemented, using divide instead")
1103
1139
  return cls.divide(obj, n=3, axis=axis)
1104
-
1140
+
1105
1141
  @classmethod
1106
- def new(
1107
- cls,
1108
- context: Optional[Union["Page", "Region"]] = None
1109
- ) -> "Guides":
1142
+ def new(cls, context: Optional[Union["Page", "Region"]] = None) -> "Guides":
1110
1143
  """
1111
1144
  Create a new empty Guides object, optionally with a context.
1112
-
1145
+
1113
1146
  This provides a clean way to start building guides through chaining:
1114
1147
  guides = Guides.new(page).add_content(axis='vertical', markers=[...])
1115
-
1148
+
1116
1149
  Args:
1117
1150
  context: Optional Page or Region to use as default context for operations
1118
-
1151
+
1119
1152
  Returns:
1120
1153
  New empty Guides object
1121
1154
  """
1122
1155
  return cls(verticals=[], horizontals=[], context=context)
1123
-
1156
+
1124
1157
  # -------------------------------------------------------------------------
1125
1158
  # Manipulation Methods
1126
1159
  # -------------------------------------------------------------------------
1127
-
1160
+
1128
1161
  def snap_to_whitespace(
1129
1162
  self,
1130
- axis: str = 'vertical',
1163
+ axis: str = "vertical",
1131
1164
  min_gap: float = 10.0,
1132
- detection_method: str = 'pixels', # 'pixels' or 'text'
1133
- threshold: Union[float, str] = 'auto', # threshold for what counts as a trough (0.0-1.0) or 'auto'
1134
- on_no_snap: str = 'warn'
1165
+ detection_method: str = "pixels", # 'pixels' or 'text'
1166
+ threshold: Union[
1167
+ float, str
1168
+ ] = "auto", # threshold for what counts as a trough (0.0-1.0) or 'auto'
1169
+ on_no_snap: str = "warn",
1135
1170
  ) -> "Guides":
1136
1171
  """
1137
1172
  Snap guides to nearby whitespace gaps (troughs) using optimal assignment.
@@ -1161,11 +1196,11 @@ class Guides:
1161
1196
  logger.warning("No text elements found for whitespace detection")
1162
1197
  return self
1163
1198
 
1164
- if axis == 'vertical':
1199
+ if axis == "vertical":
1165
1200
  gaps = self._find_vertical_whitespace_gaps(text_elements, min_gap, threshold)
1166
1201
  if gaps:
1167
1202
  self._snap_guides_to_gaps(self.vertical.data, gaps, axis)
1168
- elif axis == 'horizontal':
1203
+ elif axis == "horizontal":
1169
1204
  gaps = self._find_horizontal_whitespace_gaps(text_elements, min_gap, threshold)
1170
1205
  if gaps:
1171
1206
  self._snap_guides_to_gaps(self.horizontal.data, gaps, axis)
@@ -1177,25 +1212,22 @@ class Guides:
1177
1212
  self.horizontal.data[:] = [float(y) for y in self.horizontal.data]
1178
1213
 
1179
1214
  return self
1180
-
1215
+
1181
1216
  def shift(
1182
- self,
1183
- index: int,
1184
- offset: float,
1185
- axis: Literal['vertical', 'horizontal'] = 'vertical'
1217
+ self, index: int, offset: float, axis: Literal["vertical", "horizontal"] = "vertical"
1186
1218
  ) -> "Guides":
1187
1219
  """
1188
1220
  Move a specific guide by a offset amount.
1189
-
1221
+
1190
1222
  Args:
1191
1223
  index: Index of the guide to move
1192
1224
  offset: Amount to move (positive = right/down)
1193
1225
  axis: Which guide list to modify
1194
-
1226
+
1195
1227
  Returns:
1196
1228
  Self for method chaining
1197
1229
  """
1198
- if axis == 'vertical':
1230
+ if axis == "vertical":
1199
1231
  if 0 <= index < len(self.vertical):
1200
1232
  self.vertical[index] += offset
1201
1233
  self.vertical = sorted(self.vertical)
@@ -1207,123 +1239,127 @@ class Guides:
1207
1239
  self.horizontal = sorted(self.horizontal)
1208
1240
  else:
1209
1241
  logger.warning(f"Horizontal guide index {index} out of range")
1210
-
1242
+
1211
1243
  return self
1212
-
1244
+
1213
1245
  def add_vertical(self, x: float) -> "Guides":
1214
1246
  """Add a vertical guide at the specified x-coordinate."""
1215
1247
  self.vertical.append(x)
1216
1248
  self.vertical = sorted(self.vertical)
1217
1249
  return self
1218
-
1250
+
1219
1251
  def add_horizontal(self, y: float) -> "Guides":
1220
1252
  """Add a horizontal guide at the specified y-coordinate."""
1221
1253
  self.horizontal.append(y)
1222
1254
  self.horizontal = sorted(self.horizontal)
1223
1255
  return self
1224
-
1256
+
1225
1257
  def remove_vertical(self, index: int) -> "Guides":
1226
1258
  """Remove a vertical guide by index."""
1227
1259
  if 0 <= index < len(self.vertical):
1228
1260
  self.vertical.pop(index)
1229
1261
  return self
1230
-
1262
+
1231
1263
  def remove_horizontal(self, index: int) -> "Guides":
1232
1264
  """Remove a horizontal guide by index."""
1233
1265
  if 0 <= index < len(self.horizontal):
1234
1266
  self.horizontal.pop(index)
1235
1267
  return self
1236
-
1268
+
1237
1269
  # -------------------------------------------------------------------------
1238
1270
  # Operations
1239
1271
  # -------------------------------------------------------------------------
1240
-
1272
+
1241
1273
  def __add__(self, other: "Guides") -> "Guides":
1242
1274
  """
1243
1275
  Combine two guide sets.
1244
-
1276
+
1245
1277
  Returns:
1246
1278
  New Guides object with combined coordinates
1247
1279
  """
1248
1280
  # Combine and deduplicate coordinates, ensuring Python floats
1249
1281
  combined_verticals = sorted([float(x) for x in set(self.vertical + other.vertical)])
1250
1282
  combined_horizontals = sorted([float(y) for y in set(self.horizontal + other.horizontal)])
1251
-
1283
+
1252
1284
  # Use context from self if available
1253
1285
  return Guides(
1254
1286
  verticals=combined_verticals,
1255
1287
  horizontals=combined_horizontals,
1256
1288
  context=self.context or other.context,
1257
- bounds=self.bounds or other.bounds
1289
+ bounds=self.bounds or other.bounds,
1258
1290
  )
1259
-
1291
+
1260
1292
  def show(self, on=None, **kwargs):
1261
1293
  """
1262
1294
  Display the guides overlaid on a page or region.
1263
-
1295
+
1264
1296
  Args:
1265
1297
  on: Page, Region, PIL Image, or string to display guides on.
1266
1298
  If None, uses self.context (the object guides were created from).
1267
1299
  If string 'page', uses the page from self.context.
1268
1300
  **kwargs: Additional arguments passed to to_image() if applicable.
1269
-
1301
+
1270
1302
  Returns:
1271
1303
  PIL Image with guides drawn on it.
1272
1304
  """
1273
1305
  # Determine what to display guides on
1274
1306
  target = on if on is not None else self.context
1275
-
1307
+
1276
1308
  # Handle string shortcuts
1277
1309
  if isinstance(target, str):
1278
- if target == 'page':
1279
- if hasattr(self.context, 'page'):
1310
+ if target == "page":
1311
+ if hasattr(self.context, "page"):
1280
1312
  target = self.context.page
1281
- elif hasattr(self.context, '_page'):
1313
+ elif hasattr(self.context, "_page"):
1282
1314
  target = self.context._page
1283
1315
  else:
1284
1316
  raise ValueError("Cannot resolve 'page' - context has no page attribute")
1285
1317
  else:
1286
1318
  raise ValueError(f"Unknown string target: {target}. Only 'page' is supported.")
1287
-
1319
+
1288
1320
  if target is None:
1289
1321
  raise ValueError("No target specified and no context available for guides display")
1290
-
1322
+
1291
1323
  # Prepare kwargs for image generation
1292
1324
  image_kwargs = kwargs.copy()
1293
-
1325
+
1294
1326
  # Always turn off highlights to avoid visual clutter
1295
- image_kwargs['include_highlights'] = False
1296
-
1327
+ image_kwargs["include_highlights"] = False
1328
+
1297
1329
  # If target is a region-like object, crop to just that region
1298
- if hasattr(target, 'bbox') and hasattr(target, 'page'):
1330
+ if hasattr(target, "bbox") and hasattr(target, "page"):
1299
1331
  # This is likely a Region
1300
- image_kwargs['crop'] = True
1301
-
1332
+ image_kwargs["crop"] = True
1333
+
1302
1334
  # Get base image
1303
- if hasattr(target, 'to_image'):
1335
+ if hasattr(target, "to_image"):
1304
1336
  img = target.to_image(**image_kwargs)
1305
- elif hasattr(target, 'mode') and hasattr(target, 'size'):
1337
+ elif hasattr(target, "mode") and hasattr(target, "size"):
1306
1338
  # It's already a PIL Image
1307
1339
  img = target
1308
1340
  else:
1309
1341
  raise ValueError(f"Object {target} does not support to_image() and is not a PIL Image")
1310
-
1342
+
1311
1343
  if img is None:
1312
1344
  raise ValueError("Failed to generate base image")
1313
-
1345
+
1314
1346
  # Create a copy to draw on
1315
1347
  img = img.copy()
1316
1348
  draw = ImageDraw.Draw(img)
1317
-
1349
+
1318
1350
  # Determine scale factor for coordinate conversion
1319
- if hasattr(target, 'width') and hasattr(target, 'height') and not (hasattr(target, 'mode') and hasattr(target, 'size')):
1351
+ if (
1352
+ hasattr(target, "width")
1353
+ and hasattr(target, "height")
1354
+ and not (hasattr(target, "mode") and hasattr(target, "size"))
1355
+ ):
1320
1356
  # target is a PDF object (Page/Region) with PDF coordinates
1321
1357
  scale_x = img.width / target.width
1322
1358
  scale_y = img.height / target.height
1323
-
1359
+
1324
1360
  # If we're showing guides on a region, we need to adjust coordinates
1325
1361
  # to be relative to the region's origin
1326
- if hasattr(target, 'bbox') and hasattr(target, 'page'):
1362
+ if hasattr(target, "bbox") and hasattr(target, "page"):
1327
1363
  # This is a Region - adjust guide coordinates to be relative to region
1328
1364
  region_x0, region_top = target.x0, target.top
1329
1365
  else:
@@ -1334,7 +1370,7 @@ class Guides:
1334
1370
  scale_x = 1.0
1335
1371
  scale_y = 1.0
1336
1372
  region_x0, region_top = 0, 0
1337
-
1373
+
1338
1374
  # Draw vertical guides (blue)
1339
1375
  for x_coord in self.vertical:
1340
1376
  # Adjust coordinate if we're showing on a region
@@ -1344,8 +1380,8 @@ class Guides:
1344
1380
  if 0 <= pixel_x <= img.width - 1:
1345
1381
  x_pixel = int(min(pixel_x, img.width - 1))
1346
1382
  draw.line([(x_pixel, 0), (x_pixel, img.height - 1)], fill=(0, 0, 255, 200), width=2)
1347
-
1348
- # Draw horizontal guides (red)
1383
+
1384
+ # Draw horizontal guides (red)
1349
1385
  for y_coord in self.horizontal:
1350
1386
  # Adjust coordinate if we're showing on a region
1351
1387
  adjusted_y = y_coord - region_top
@@ -1354,22 +1390,22 @@ class Guides:
1354
1390
  if 0 <= pixel_y <= img.height - 1:
1355
1391
  y_pixel = int(min(pixel_y, img.height - 1))
1356
1392
  draw.line([(0, y_pixel), (img.width - 1, y_pixel)], fill=(255, 0, 0, 200), width=2)
1357
-
1393
+
1358
1394
  return img
1359
-
1395
+
1360
1396
  # -------------------------------------------------------------------------
1361
1397
  # Utility Methods
1362
1398
  # -------------------------------------------------------------------------
1363
-
1399
+
1364
1400
  def get_cells(self) -> List[Tuple[float, float, float, float]]:
1365
1401
  """
1366
1402
  Get all cell bounding boxes from guide intersections.
1367
-
1403
+
1368
1404
  Returns:
1369
1405
  List of (x0, y0, x1, y1) tuples for each cell
1370
1406
  """
1371
1407
  cells = []
1372
-
1408
+
1373
1409
  # Create cells from guide intersections
1374
1410
  for i in range(len(self.vertical) - 1):
1375
1411
  for j in range(len(self.horizontal) - 1):
@@ -1378,135 +1414,139 @@ class Guides:
1378
1414
  y0 = self.horizontal[j]
1379
1415
  y1 = self.horizontal[j + 1]
1380
1416
  cells.append((x0, y0, x1, y1))
1381
-
1417
+
1382
1418
  return cells
1383
-
1419
+
1384
1420
  def to_dict(self) -> Dict[str, Any]:
1385
1421
  """
1386
1422
  Convert to dictionary format suitable for pdfplumber table_settings.
1387
-
1423
+
1388
1424
  Returns:
1389
1425
  Dictionary with explicit_vertical_lines and explicit_horizontal_lines
1390
1426
  """
1391
1427
  return {
1392
- 'explicit_vertical_lines': self.vertical,
1393
- 'explicit_horizontal_lines': self.horizontal
1428
+ "explicit_vertical_lines": self.vertical,
1429
+ "explicit_horizontal_lines": self.horizontal,
1394
1430
  }
1395
-
1431
+
1396
1432
  def to_relative(self) -> "Guides":
1397
1433
  """
1398
1434
  Convert absolute coordinates to relative (0-1) coordinates.
1399
-
1435
+
1400
1436
  Returns:
1401
1437
  New Guides object with relative coordinates
1402
1438
  """
1403
1439
  if self.relative:
1404
1440
  return self # Already relative
1405
-
1441
+
1406
1442
  if not self.bounds:
1407
1443
  raise ValueError("Cannot convert to relative without bounds")
1408
-
1444
+
1409
1445
  x0, y0, x1, y1 = self.bounds
1410
1446
  width = x1 - x0
1411
1447
  height = y1 - y0
1412
-
1448
+
1413
1449
  rel_verticals = [(x - x0) / width for x in self.vertical]
1414
1450
  rel_horizontals = [(y - y0) / height for y in self.horizontal]
1415
-
1451
+
1416
1452
  return Guides(
1417
1453
  verticals=rel_verticals,
1418
1454
  horizontals=rel_horizontals,
1419
1455
  context=self.context,
1420
1456
  bounds=(0, 0, 1, 1),
1421
- relative=True
1457
+ relative=True,
1422
1458
  )
1423
-
1459
+
1424
1460
  def to_absolute(self, bounds: Tuple[float, float, float, float]) -> "Guides":
1425
1461
  """
1426
1462
  Convert relative coordinates to absolute coordinates.
1427
-
1463
+
1428
1464
  Args:
1429
1465
  bounds: Target bounding box (x0, y0, x1, y1)
1430
-
1466
+
1431
1467
  Returns:
1432
1468
  New Guides object with absolute coordinates
1433
1469
  """
1434
1470
  if not self.relative:
1435
1471
  return self # Already absolute
1436
-
1472
+
1437
1473
  x0, y0, x1, y1 = bounds
1438
1474
  width = x1 - x0
1439
1475
  height = y1 - y0
1440
-
1476
+
1441
1477
  abs_verticals = [x0 + x * width for x in self.vertical]
1442
1478
  abs_horizontals = [y0 + y * height for y in self.horizontal]
1443
-
1479
+
1444
1480
  return Guides(
1445
1481
  verticals=abs_verticals,
1446
1482
  horizontals=abs_horizontals,
1447
1483
  context=self.context,
1448
1484
  bounds=bounds,
1449
- relative=False
1485
+ relative=False,
1450
1486
  )
1451
-
1487
+
1452
1488
  @property
1453
1489
  def n_rows(self) -> int:
1454
1490
  """Number of rows defined by horizontal guides."""
1455
1491
  return max(0, len(self.horizontal) - 1)
1456
-
1492
+
1457
1493
  @property
1458
1494
  def n_cols(self) -> int:
1459
1495
  """Number of columns defined by vertical guides."""
1460
1496
  return max(0, len(self.vertical) - 1)
1461
-
1497
+
1462
1498
  def _handle_snap_failure(self, message: str):
1463
1499
  """Handle cases where snapping cannot be performed."""
1464
- if hasattr(self, 'on_no_snap'):
1465
- if self.on_no_snap == 'warn':
1500
+ if hasattr(self, "on_no_snap"):
1501
+ if self.on_no_snap == "warn":
1466
1502
  logger.warning(message)
1467
- elif self.on_no_snap == 'raise':
1503
+ elif self.on_no_snap == "raise":
1468
1504
  raise ValueError(message)
1469
1505
  # 'ignore' case: do nothing
1470
1506
  else:
1471
1507
  logger.warning(message) # Default behavior
1472
1508
 
1473
- def _find_vertical_whitespace_gaps(self, text_elements, min_gap: float, threshold: Union[float, str] = 'auto') -> List[Tuple[float, float]]:
1509
+ def _find_vertical_whitespace_gaps(
1510
+ self, text_elements, min_gap: float, threshold: Union[float, str] = "auto"
1511
+ ) -> List[Tuple[float, float]]:
1474
1512
  """
1475
1513
  Find vertical whitespace gaps using bbox-based density analysis.
1476
1514
  Returns list of (start, end) tuples representing trough ranges.
1477
1515
  """
1478
1516
  if not self.bounds:
1479
1517
  return []
1480
-
1518
+
1481
1519
  x0, _, x1, _ = self.bounds
1482
1520
  width_pixels = int(x1 - x0)
1483
-
1521
+
1484
1522
  if width_pixels <= 0:
1485
1523
  return []
1486
-
1524
+
1487
1525
  # Create density histogram: count bbox overlaps per x-coordinate
1488
1526
  density = np.zeros(width_pixels)
1489
-
1527
+
1490
1528
  for element in text_elements:
1491
- if not hasattr(element, 'x0') or not hasattr(element, 'x1'):
1529
+ if not hasattr(element, "x0") or not hasattr(element, "x1"):
1492
1530
  continue
1493
-
1531
+
1494
1532
  # Clip coordinates to bounds
1495
1533
  elem_x0 = max(x0, element.x0) - x0
1496
1534
  elem_x1 = min(x1, element.x1) - x0
1497
-
1535
+
1498
1536
  if elem_x1 > elem_x0:
1499
1537
  start_px = int(elem_x0)
1500
1538
  end_px = int(elem_x1)
1501
1539
  density[start_px:end_px] += 1
1502
-
1540
+
1503
1541
  if density.max() == 0:
1504
1542
  return []
1505
-
1543
+
1506
1544
  # Determine the threshold value
1507
- if threshold == 'auto':
1545
+ if threshold == "auto":
1508
1546
  # Auto mode: try different thresholds with step 0.05 until we have enough troughs
1509
- guides_needing_troughs = len([g for i, g in enumerate(self.vertical) if 0 < i < len(self.vertical) - 1])
1547
+ guides_needing_troughs = len(
1548
+ [g for i, g in enumerate(self.vertical) if 0 < i < len(self.vertical) - 1]
1549
+ )
1510
1550
  if guides_needing_troughs == 0:
1511
1551
  threshold_val = 0.5 # Default when no guides need placement
1512
1552
  else:
@@ -1515,9 +1555,11 @@ class Guides:
1515
1555
  test_gaps = self._find_gaps_with_threshold(density, test_threshold, min_gap, x0)
1516
1556
  if len(test_gaps) >= guides_needing_troughs:
1517
1557
  threshold_val = test_threshold
1518
- logger.debug(f"Auto threshold found: {test_threshold:.2f} (found {len(test_gaps)} troughs for {guides_needing_troughs} guides)")
1558
+ logger.debug(
1559
+ f"Auto threshold found: {test_threshold:.2f} (found {len(test_gaps)} troughs for {guides_needing_troughs} guides)"
1560
+ )
1519
1561
  break
1520
-
1562
+
1521
1563
  if threshold_val is None:
1522
1564
  threshold_val = 0.8 # Fallback to permissive threshold
1523
1565
  logger.debug(f"Auto threshold fallback to {threshold_val}")
@@ -1526,93 +1568,103 @@ class Guides:
1526
1568
  if not isinstance(threshold, (int, float)) or not (0.0 <= threshold <= 1.0):
1527
1569
  raise ValueError("threshold must be a number between 0.0 and 1.0, or 'auto'")
1528
1570
  threshold_val = float(threshold)
1529
-
1571
+
1530
1572
  return self._find_gaps_with_threshold(density, threshold_val, min_gap, x0)
1531
-
1573
+
1532
1574
  def _find_gaps_with_threshold(self, density, threshold_val, min_gap, x0):
1533
1575
  """Helper method to find gaps given a specific threshold value."""
1534
1576
  max_density = density.max()
1535
1577
  threshold_density = threshold_val * max_density
1536
-
1578
+
1537
1579
  # Smooth the density for better trough detection
1538
1580
  from scipy.ndimage import gaussian_filter1d
1581
+
1539
1582
  smoothed_density = gaussian_filter1d(density.astype(float), sigma=1.0)
1540
-
1583
+
1541
1584
  # Find regions below threshold
1542
1585
  below_threshold = smoothed_density <= threshold_density
1543
-
1586
+
1544
1587
  # Find contiguous regions
1545
1588
  from scipy.ndimage import label as nd_label
1589
+
1546
1590
  labeled_regions, num_regions = nd_label(below_threshold)
1547
-
1591
+
1548
1592
  gaps = []
1549
1593
  for region_id in range(1, num_regions + 1):
1550
1594
  region_mask = labeled_regions == region_id
1551
1595
  region_indices = np.where(region_mask)[0]
1552
-
1596
+
1553
1597
  if len(region_indices) == 0:
1554
1598
  continue
1555
-
1599
+
1556
1600
  start_px = region_indices[0]
1557
1601
  end_px = region_indices[-1] + 1
1558
-
1602
+
1559
1603
  # Convert back to PDF coordinates
1560
1604
  start_pdf = x0 + start_px
1561
1605
  end_pdf = x0 + end_px
1562
-
1606
+
1563
1607
  # Check minimum gap size
1564
1608
  if end_pdf - start_pdf >= min_gap:
1565
1609
  gaps.append((start_pdf, end_pdf))
1566
-
1610
+
1567
1611
  return gaps
1568
1612
 
1569
- def _find_horizontal_whitespace_gaps(self, text_elements, min_gap: float, threshold: Union[float, str] = 'auto') -> List[Tuple[float, float]]:
1613
+ def _find_horizontal_whitespace_gaps(
1614
+ self, text_elements, min_gap: float, threshold: Union[float, str] = "auto"
1615
+ ) -> List[Tuple[float, float]]:
1570
1616
  """
1571
1617
  Find horizontal whitespace gaps using bbox-based density analysis.
1572
1618
  Returns list of (start, end) tuples representing trough ranges.
1573
1619
  """
1574
1620
  if not self.bounds:
1575
1621
  return []
1576
-
1622
+
1577
1623
  _, y0, _, y1 = self.bounds
1578
1624
  height_pixels = int(y1 - y0)
1579
-
1625
+
1580
1626
  if height_pixels <= 0:
1581
1627
  return []
1582
-
1583
- # Create density histogram: count bbox overlaps per y-coordinate
1628
+
1629
+ # Create density histogram: count bbox overlaps per y-coordinate
1584
1630
  density = np.zeros(height_pixels)
1585
-
1631
+
1586
1632
  for element in text_elements:
1587
- if not hasattr(element, 'top') or not hasattr(element, 'bottom'):
1633
+ if not hasattr(element, "top") or not hasattr(element, "bottom"):
1588
1634
  continue
1589
-
1635
+
1590
1636
  # Clip coordinates to bounds
1591
1637
  elem_top = max(y0, element.top) - y0
1592
1638
  elem_bottom = min(y1, element.bottom) - y0
1593
-
1639
+
1594
1640
  if elem_bottom > elem_top:
1595
1641
  start_px = int(elem_top)
1596
1642
  end_px = int(elem_bottom)
1597
1643
  density[start_px:end_px] += 1
1598
-
1644
+
1599
1645
  if density.max() == 0:
1600
1646
  return []
1601
-
1647
+
1602
1648
  # Determine the threshold value (same logic as vertical)
1603
- if threshold == 'auto':
1604
- guides_needing_troughs = len([g for i, g in enumerate(self.horizontal) if 0 < i < len(self.horizontal) - 1])
1649
+ if threshold == "auto":
1650
+ guides_needing_troughs = len(
1651
+ [g for i, g in enumerate(self.horizontal) if 0 < i < len(self.horizontal) - 1]
1652
+ )
1605
1653
  if guides_needing_troughs == 0:
1606
1654
  threshold_val = 0.5 # Default when no guides need placement
1607
1655
  else:
1608
1656
  threshold_val = None
1609
1657
  for test_threshold in np.arange(0.1, 1.0, 0.05):
1610
- test_gaps = self._find_gaps_with_threshold_horizontal(density, test_threshold, min_gap, y0)
1658
+ test_gaps = self._find_gaps_with_threshold_horizontal(
1659
+ density, test_threshold, min_gap, y0
1660
+ )
1611
1661
  if len(test_gaps) >= guides_needing_troughs:
1612
1662
  threshold_val = test_threshold
1613
- logger.debug(f"Auto threshold found: {test_threshold:.2f} (found {len(test_gaps)} troughs for {guides_needing_troughs} guides)")
1663
+ logger.debug(
1664
+ f"Auto threshold found: {test_threshold:.2f} (found {len(test_gaps)} troughs for {guides_needing_troughs} guides)"
1665
+ )
1614
1666
  break
1615
-
1667
+
1616
1668
  if threshold_val is None:
1617
1669
  threshold_val = 0.8 # Fallback to permissive threshold
1618
1670
  logger.debug(f"Auto threshold fallback to {threshold_val}")
@@ -1621,141 +1673,157 @@ class Guides:
1621
1673
  if not isinstance(threshold, (int, float)) or not (0.0 <= threshold <= 1.0):
1622
1674
  raise ValueError("threshold must be a number between 0.0 and 1.0, or 'auto'")
1623
1675
  threshold_val = float(threshold)
1624
-
1676
+
1625
1677
  return self._find_gaps_with_threshold_horizontal(density, threshold_val, min_gap, y0)
1626
-
1678
+
1627
1679
  def _find_gaps_with_threshold_horizontal(self, density, threshold_val, min_gap, y0):
1628
1680
  """Helper method to find horizontal gaps given a specific threshold value."""
1629
1681
  max_density = density.max()
1630
1682
  threshold_density = threshold_val * max_density
1631
-
1683
+
1632
1684
  # Smooth the density for better trough detection
1633
1685
  from scipy.ndimage import gaussian_filter1d
1686
+
1634
1687
  smoothed_density = gaussian_filter1d(density.astype(float), sigma=1.0)
1635
-
1688
+
1636
1689
  # Find regions below threshold
1637
1690
  below_threshold = smoothed_density <= threshold_density
1638
-
1691
+
1639
1692
  # Find contiguous regions
1640
1693
  from scipy.ndimage import label as nd_label
1694
+
1641
1695
  labeled_regions, num_regions = nd_label(below_threshold)
1642
-
1696
+
1643
1697
  gaps = []
1644
1698
  for region_id in range(1, num_regions + 1):
1645
1699
  region_mask = labeled_regions == region_id
1646
1700
  region_indices = np.where(region_mask)[0]
1647
-
1701
+
1648
1702
  if len(region_indices) == 0:
1649
1703
  continue
1650
-
1704
+
1651
1705
  start_px = region_indices[0]
1652
1706
  end_px = region_indices[-1] + 1
1653
-
1707
+
1654
1708
  # Convert back to PDF coordinates
1655
1709
  start_pdf = y0 + start_px
1656
1710
  end_pdf = y0 + end_px
1657
-
1711
+
1658
1712
  # Check minimum gap size
1659
1713
  if end_pdf - start_pdf >= min_gap:
1660
1714
  gaps.append((start_pdf, end_pdf))
1661
-
1715
+
1662
1716
  return gaps
1663
-
1664
- def _find_vertical_element_gaps(self, text_elements, min_gap: float) -> List[Tuple[float, float]]:
1717
+
1718
+ def _find_vertical_element_gaps(
1719
+ self, text_elements, min_gap: float
1720
+ ) -> List[Tuple[float, float]]:
1665
1721
  """
1666
1722
  Find vertical whitespace gaps using text element spacing analysis.
1667
1723
  Returns list of (start, end) tuples representing trough ranges.
1668
1724
  """
1669
1725
  if not self.bounds or not text_elements:
1670
1726
  return []
1671
-
1727
+
1672
1728
  x0, _, x1, _ = self.bounds
1673
-
1729
+
1674
1730
  # Get all element right and left edges
1675
1731
  element_edges = []
1676
1732
  for element in text_elements:
1677
- if not hasattr(element, 'x0') or not hasattr(element, 'x1'):
1733
+ if not hasattr(element, "x0") or not hasattr(element, "x1"):
1678
1734
  continue
1679
1735
  # Only include elements that overlap vertically with our bounds
1680
- if hasattr(element, 'top') and hasattr(element, 'bottom'):
1736
+ if hasattr(element, "top") and hasattr(element, "bottom"):
1681
1737
  if element.bottom < self.bounds[1] or element.top > self.bounds[3]:
1682
1738
  continue
1683
1739
  element_edges.extend([element.x0, element.x1])
1684
-
1740
+
1685
1741
  if not element_edges:
1686
1742
  return []
1687
-
1743
+
1688
1744
  # Sort edges and find gaps
1689
1745
  element_edges = sorted(set(element_edges))
1690
-
1746
+
1691
1747
  trough_ranges = []
1692
1748
  for i in range(len(element_edges) - 1):
1693
1749
  gap_start = element_edges[i]
1694
1750
  gap_end = element_edges[i + 1]
1695
1751
  gap_width = gap_end - gap_start
1696
-
1752
+
1697
1753
  if gap_width >= min_gap:
1698
1754
  # Check if this gap actually contains no text (is empty space)
1699
1755
  gap_has_text = False
1700
1756
  for element in text_elements:
1701
- if (hasattr(element, 'x0') and hasattr(element, 'x1') and
1702
- element.x0 < gap_end and element.x1 > gap_start):
1757
+ if (
1758
+ hasattr(element, "x0")
1759
+ and hasattr(element, "x1")
1760
+ and element.x0 < gap_end
1761
+ and element.x1 > gap_start
1762
+ ):
1703
1763
  gap_has_text = True
1704
1764
  break
1705
-
1765
+
1706
1766
  if not gap_has_text:
1707
1767
  trough_ranges.append((gap_start, gap_end))
1708
-
1768
+
1709
1769
  return trough_ranges
1710
-
1711
- def _find_horizontal_element_gaps(self, text_elements, min_gap: float) -> List[Tuple[float, float]]:
1770
+
1771
+ def _find_horizontal_element_gaps(
1772
+ self, text_elements, min_gap: float
1773
+ ) -> List[Tuple[float, float]]:
1712
1774
  """
1713
1775
  Find horizontal whitespace gaps using text element spacing analysis.
1714
1776
  Returns list of (start, end) tuples representing trough ranges.
1715
1777
  """
1716
1778
  if not self.bounds or not text_elements:
1717
1779
  return []
1718
-
1780
+
1719
1781
  _, y0, _, y1 = self.bounds
1720
-
1782
+
1721
1783
  # Get all element top and bottom edges
1722
1784
  element_edges = []
1723
1785
  for element in text_elements:
1724
- if not hasattr(element, 'top') or not hasattr(element, 'bottom'):
1786
+ if not hasattr(element, "top") or not hasattr(element, "bottom"):
1725
1787
  continue
1726
1788
  # Only include elements that overlap horizontally with our bounds
1727
- if hasattr(element, 'x0') and hasattr(element, 'x1'):
1789
+ if hasattr(element, "x0") and hasattr(element, "x1"):
1728
1790
  if element.x1 < self.bounds[0] or element.x0 > self.bounds[2]:
1729
1791
  continue
1730
1792
  element_edges.extend([element.top, element.bottom])
1731
-
1793
+
1732
1794
  if not element_edges:
1733
1795
  return []
1734
-
1796
+
1735
1797
  # Sort edges and find gaps
1736
1798
  element_edges = sorted(set(element_edges))
1737
-
1799
+
1738
1800
  trough_ranges = []
1739
1801
  for i in range(len(element_edges) - 1):
1740
1802
  gap_start = element_edges[i]
1741
1803
  gap_end = element_edges[i + 1]
1742
1804
  gap_width = gap_end - gap_start
1743
-
1805
+
1744
1806
  if gap_width >= min_gap:
1745
1807
  # Check if this gap actually contains no text (is empty space)
1746
1808
  gap_has_text = False
1747
1809
  for element in text_elements:
1748
- if (hasattr(element, 'top') and hasattr(element, 'bottom') and
1749
- element.top < gap_end and element.bottom > gap_start):
1810
+ if (
1811
+ hasattr(element, "top")
1812
+ and hasattr(element, "bottom")
1813
+ and element.top < gap_end
1814
+ and element.bottom > gap_start
1815
+ ):
1750
1816
  gap_has_text = True
1751
1817
  break
1752
-
1818
+
1753
1819
  if not gap_has_text:
1754
1820
  trough_ranges.append((gap_start, gap_end))
1755
-
1821
+
1756
1822
  return trough_ranges
1757
-
1758
- def _optimal_guide_assignment(self, guides: List[float], trough_ranges: List[Tuple[float, float]]) -> Dict[int, int]:
1823
+
1824
+ def _optimal_guide_assignment(
1825
+ self, guides: List[float], trough_ranges: List[Tuple[float, float]]
1826
+ ) -> Dict[int, int]:
1759
1827
  """
1760
1828
  Assign guides to trough ranges using the user's desired logic:
1761
1829
  - Guides already in a trough stay put
@@ -1764,18 +1832,20 @@ class Guides:
1764
1832
  """
1765
1833
  if not guides or not trough_ranges:
1766
1834
  return {}
1767
-
1835
+
1768
1836
  assignments = {}
1769
-
1837
+
1770
1838
  # Step 1: Identify which guides are already in troughs
1771
1839
  guides_in_troughs = set()
1772
1840
  for i, guide_pos in enumerate(guides):
1773
1841
  for trough_start, trough_end in trough_ranges:
1774
1842
  if trough_start <= guide_pos <= trough_end:
1775
1843
  guides_in_troughs.add(i)
1776
- logger.debug(f"Guide {i} (pos {guide_pos:.1f}) is already in trough ({trough_start:.1f}-{trough_end:.1f}), keeping in place")
1844
+ logger.debug(
1845
+ f"Guide {i} (pos {guide_pos:.1f}) is already in trough ({trough_start:.1f}-{trough_end:.1f}), keeping in place"
1846
+ )
1777
1847
  break
1778
-
1848
+
1779
1849
  # Step 2: Identify which troughs are already occupied
1780
1850
  occupied_troughs = set()
1781
1851
  for i in guides_in_troughs:
@@ -1784,21 +1854,23 @@ class Guides:
1784
1854
  if trough_start <= guide_pos <= trough_end:
1785
1855
  occupied_troughs.add(j)
1786
1856
  break
1787
-
1857
+
1788
1858
  # Step 3: Find guides that need reassignment (not in any trough)
1789
1859
  guides_to_move = []
1790
1860
  for i, guide_pos in enumerate(guides):
1791
1861
  if i not in guides_in_troughs:
1792
1862
  guides_to_move.append(i)
1793
- logger.debug(f"Guide {i} (pos {guide_pos:.1f}) is NOT in any trough, needs reassignment")
1794
-
1863
+ logger.debug(
1864
+ f"Guide {i} (pos {guide_pos:.1f}) is NOT in any trough, needs reassignment"
1865
+ )
1866
+
1795
1867
  # Step 4: Find available troughs (not occupied by existing guides)
1796
1868
  available_troughs = []
1797
1869
  for j, (trough_start, trough_end) in enumerate(trough_ranges):
1798
1870
  if j not in occupied_troughs:
1799
1871
  available_troughs.append(j)
1800
1872
  logger.debug(f"Trough {j} ({trough_start:.1f}-{trough_end:.1f}) is available")
1801
-
1873
+
1802
1874
  # Step 5: Assign guides to move to closest available troughs
1803
1875
  if guides_to_move and available_troughs:
1804
1876
  # Calculate distances for all combinations
@@ -1810,20 +1882,22 @@ class Guides:
1810
1882
  trough_center = (trough_start + trough_end) / 2
1811
1883
  distance = abs(guide_pos - trough_center)
1812
1884
  distances.append((distance, guide_idx, trough_idx))
1813
-
1885
+
1814
1886
  # Sort by distance and assign greedily
1815
1887
  distances.sort()
1816
1888
  used_troughs = set()
1817
-
1889
+
1818
1890
  for distance, guide_idx, trough_idx in distances:
1819
1891
  if guide_idx not in assignments and trough_idx not in used_troughs:
1820
1892
  assignments[guide_idx] = trough_idx
1821
1893
  used_troughs.add(trough_idx)
1822
- logger.debug(f"Assigned guide {guide_idx} (pos {guides[guide_idx]:.1f}) to trough {trough_idx} (distance: {distance:.1f})")
1823
-
1894
+ logger.debug(
1895
+ f"Assigned guide {guide_idx} (pos {guides[guide_idx]:.1f}) to trough {trough_idx} (distance: {distance:.1f})"
1896
+ )
1897
+
1824
1898
  logger.debug(f"Final assignments: {assignments}")
1825
1899
  return assignments
1826
-
1900
+
1827
1901
  def _snap_guides_to_gaps(self, guides: List[float], gaps: List[Tuple[float, float]], axis: str):
1828
1902
  """
1829
1903
  Snap guides to nearby gaps using optimal assignment.
@@ -1831,15 +1905,15 @@ class Guides:
1831
1905
  """
1832
1906
  if not guides or not gaps:
1833
1907
  return
1834
-
1908
+
1835
1909
  logger.debug(f"Snapping {len(guides)} {axis} guides to {len(gaps)} trough ranges")
1836
1910
  for i, (start, end) in enumerate(gaps):
1837
1911
  center = (start + end) / 2
1838
1912
  logger.debug(f" Trough {i}: {start:.1f} to {end:.1f} (center: {center:.1f})")
1839
-
1913
+
1840
1914
  # Get optimal assignments
1841
1915
  assignments = self._optimal_guide_assignment(guides, gaps)
1842
-
1916
+
1843
1917
  # Apply assignments (modify guides list in-place)
1844
1918
  for guide_idx, trough_idx in assignments.items():
1845
1919
  trough_start, trough_end = gaps[trough_idx]
@@ -1847,23 +1921,23 @@ class Guides:
1847
1921
  old_pos = guides[guide_idx]
1848
1922
  guides[guide_idx] = new_pos
1849
1923
  logger.info(f"Snapped {axis} guide from {old_pos:.1f} to {new_pos:.1f}")
1850
-
1924
+
1851
1925
  def build_grid(
1852
1926
  self,
1853
1927
  target: Optional[Union["Page", "Region"]] = None,
1854
1928
  source: str = "guides",
1855
1929
  cell_padding: float = 0.5,
1856
- include_outer_boundaries: bool = False
1930
+ include_outer_boundaries: bool = False,
1857
1931
  ) -> Dict[str, int]:
1858
1932
  """
1859
1933
  Create table structure (table, rows, columns, cells) from guide coordinates.
1860
-
1934
+
1861
1935
  Args:
1862
1936
  target: Page or Region to create regions on (uses self.context if None)
1863
1937
  source: Source label for created regions (for identification)
1864
1938
  cell_padding: Internal padding for cell regions in points
1865
1939
  include_outer_boundaries: Whether to add boundaries at edges if missing
1866
-
1940
+
1867
1941
  Returns:
1868
1942
  Dictionary with counts: {'table': 1, 'rows': N, 'columns': M, 'cells': N*M}
1869
1943
  """
@@ -1871,98 +1945,95 @@ class Guides:
1871
1945
  target_obj = target or self.context
1872
1946
  if not target_obj:
1873
1947
  raise ValueError("No target object available. Provide target parameter or context.")
1874
-
1948
+
1875
1949
  # Get the page for creating regions
1876
- if hasattr(target_obj, 'x0') and hasattr(target_obj, 'top'): # Region (has bbox coordinates)
1950
+ if hasattr(target_obj, "x0") and hasattr(
1951
+ target_obj, "top"
1952
+ ): # Region (has bbox coordinates)
1877
1953
  page = target_obj._page
1878
1954
  origin_x, origin_y = target_obj.x0, target_obj.top
1879
1955
  context_width, context_height = target_obj.width, target_obj.height
1880
- elif hasattr(target_obj, '_element_mgr') or hasattr(target_obj, 'width'): # Page
1956
+ elif hasattr(target_obj, "_element_mgr") or hasattr(target_obj, "width"): # Page
1881
1957
  page = target_obj
1882
1958
  origin_x, origin_y = 0.0, 0.0
1883
1959
  context_width, context_height = page.width, page.height
1884
1960
  else:
1885
1961
  raise ValueError(f"Target object {target_obj} is not a Page or Region")
1886
-
1962
+
1887
1963
  element_manager = page._element_mgr
1888
-
1964
+
1889
1965
  # Setup boundaries
1890
1966
  row_boundaries = list(self.horizontal)
1891
1967
  col_boundaries = list(self.vertical)
1892
-
1968
+
1893
1969
  # Add outer boundaries if requested and missing
1894
1970
  if include_outer_boundaries:
1895
1971
  if not row_boundaries or row_boundaries[0] > origin_y:
1896
1972
  row_boundaries.insert(0, origin_y)
1897
1973
  if not row_boundaries or row_boundaries[-1] < origin_y + context_height:
1898
1974
  row_boundaries.append(origin_y + context_height)
1899
-
1975
+
1900
1976
  if not col_boundaries or col_boundaries[0] > origin_x:
1901
1977
  col_boundaries.insert(0, origin_x)
1902
1978
  if not col_boundaries or col_boundaries[-1] < origin_x + context_width:
1903
1979
  col_boundaries.append(origin_x + context_width)
1904
-
1980
+
1905
1981
  # Remove duplicates and sort
1906
1982
  row_boundaries = sorted(list(set(row_boundaries)))
1907
1983
  col_boundaries = sorted(list(set(col_boundaries)))
1908
-
1909
- logger.debug(f"Building grid with {len(row_boundaries)} row and {len(col_boundaries)} col boundaries")
1910
-
1984
+
1985
+ logger.debug(
1986
+ f"Building grid with {len(row_boundaries)} row and {len(col_boundaries)} col boundaries"
1987
+ )
1988
+
1911
1989
  # Track creation counts
1912
- counts = {'table': 0, 'rows': 0, 'columns': 0, 'cells': 0}
1913
-
1990
+ counts = {"table": 0, "rows": 0, "columns": 0, "cells": 0}
1991
+
1914
1992
  # Create overall table region
1915
1993
  if len(row_boundaries) >= 2 and len(col_boundaries) >= 2:
1916
1994
  table_region = page.create_region(
1917
- col_boundaries[0], row_boundaries[0],
1918
- col_boundaries[-1], row_boundaries[-1]
1995
+ col_boundaries[0], row_boundaries[0], col_boundaries[-1], row_boundaries[-1]
1919
1996
  )
1920
1997
  table_region.source = source
1921
1998
  table_region.region_type = "table"
1922
1999
  table_region.normalized_type = "table"
1923
- table_region.metadata.update({
1924
- "source_guides": True,
1925
- "num_rows": len(row_boundaries) - 1,
1926
- "num_cols": len(col_boundaries) - 1,
1927
- "boundaries": {"rows": row_boundaries, "cols": col_boundaries}
1928
- })
2000
+ table_region.metadata.update(
2001
+ {
2002
+ "source_guides": True,
2003
+ "num_rows": len(row_boundaries) - 1,
2004
+ "num_cols": len(col_boundaries) - 1,
2005
+ "boundaries": {"rows": row_boundaries, "cols": col_boundaries},
2006
+ }
2007
+ )
1929
2008
  element_manager.add_element(table_region, element_type="regions")
1930
- counts['table'] = 1
1931
-
2009
+ counts["table"] = 1
2010
+
1932
2011
  # Create row regions
1933
2012
  if len(row_boundaries) >= 2 and len(col_boundaries) >= 2:
1934
2013
  for i in range(len(row_boundaries) - 1):
1935
2014
  row_region = page.create_region(
1936
- col_boundaries[0], row_boundaries[i],
1937
- col_boundaries[-1], row_boundaries[i + 1]
2015
+ col_boundaries[0], row_boundaries[i], col_boundaries[-1], row_boundaries[i + 1]
1938
2016
  )
1939
2017
  row_region.source = source
1940
2018
  row_region.region_type = "table_row"
1941
2019
  row_region.normalized_type = "table_row"
1942
- row_region.metadata.update({
1943
- "row_index": i,
1944
- "source_guides": True
1945
- })
2020
+ row_region.metadata.update({"row_index": i, "source_guides": True})
1946
2021
  element_manager.add_element(row_region, element_type="regions")
1947
- counts['rows'] += 1
1948
-
2022
+ counts["rows"] += 1
2023
+
1949
2024
  # Create column regions
1950
2025
  if len(col_boundaries) >= 2 and len(row_boundaries) >= 2:
1951
2026
  for j in range(len(col_boundaries) - 1):
1952
2027
  col_region = page.create_region(
1953
- col_boundaries[j], row_boundaries[0],
1954
- col_boundaries[j + 1], row_boundaries[-1]
2028
+ col_boundaries[j], row_boundaries[0], col_boundaries[j + 1], row_boundaries[-1]
1955
2029
  )
1956
2030
  col_region.source = source
1957
2031
  col_region.region_type = "table_column"
1958
2032
  col_region.normalized_type = "table_column"
1959
- col_region.metadata.update({
1960
- "col_index": j,
1961
- "source_guides": True
1962
- })
2033
+ col_region.metadata.update({"col_index": j, "source_guides": True})
1963
2034
  element_manager.add_element(col_region, element_type="regions")
1964
- counts['columns'] += 1
1965
-
2035
+ counts["columns"] += 1
2036
+
1966
2037
  # Create cell regions
1967
2038
  if len(row_boundaries) >= 2 and len(col_boundaries) >= 2:
1968
2039
  for i in range(len(row_boundaries) - 1):
@@ -1972,50 +2043,58 @@ class Guides:
1972
2043
  cell_top = row_boundaries[i] + cell_padding
1973
2044
  cell_x1 = col_boundaries[j + 1] - cell_padding
1974
2045
  cell_bottom = row_boundaries[i + 1] - cell_padding
1975
-
2046
+
1976
2047
  # Skip invalid cells
1977
2048
  if cell_x1 <= cell_x0 or cell_bottom <= cell_top:
1978
2049
  continue
1979
-
2050
+
1980
2051
  cell_region = page.create_region(cell_x0, cell_top, cell_x1, cell_bottom)
1981
2052
  cell_region.source = source
1982
2053
  cell_region.region_type = "table_cell"
1983
2054
  cell_region.normalized_type = "table_cell"
1984
- cell_region.metadata.update({
1985
- "row_index": i,
1986
- "col_index": j,
1987
- "source_guides": True,
1988
- "original_boundaries": {
1989
- "left": col_boundaries[j],
1990
- "top": row_boundaries[i],
1991
- "right": col_boundaries[j + 1],
1992
- "bottom": row_boundaries[i + 1]
2055
+ cell_region.metadata.update(
2056
+ {
2057
+ "row_index": i,
2058
+ "col_index": j,
2059
+ "source_guides": True,
2060
+ "original_boundaries": {
2061
+ "left": col_boundaries[j],
2062
+ "top": row_boundaries[i],
2063
+ "right": col_boundaries[j + 1],
2064
+ "bottom": row_boundaries[i + 1],
2065
+ },
1993
2066
  }
1994
- })
2067
+ )
1995
2068
  element_manager.add_element(cell_region, element_type="regions")
1996
- counts['cells'] += 1
1997
-
1998
- logger.info(f"Created {counts['table']} table, {counts['rows']} rows, "
1999
- f"{counts['columns']} columns, and {counts['cells']} cells from guides")
2000
-
2069
+ counts["cells"] += 1
2070
+
2071
+ logger.info(
2072
+ f"Created {counts['table']} table, {counts['rows']} rows, "
2073
+ f"{counts['columns']} columns, and {counts['cells']} cells from guides"
2074
+ )
2075
+
2001
2076
  return counts
2002
2077
 
2003
2078
  def __repr__(self) -> str:
2004
2079
  """String representation of the guides."""
2005
- return (f"Guides(verticals={len(self.vertical)}, "
2006
- f"horizontals={len(self.horizontal)}, "
2007
- f"cells={len(self.get_cells())})")
2080
+ return (
2081
+ f"Guides(verticals={len(self.vertical)}, "
2082
+ f"horizontals={len(self.horizontal)}, "
2083
+ f"cells={len(self.get_cells())})"
2084
+ )
2008
2085
 
2009
2086
  def _get_text_elements(self):
2010
2087
  """Get text elements from the context."""
2011
2088
  if not self.context:
2012
2089
  return []
2013
-
2090
+
2014
2091
  # Get text elements from the context
2015
- if hasattr(self.context, 'find_all'):
2092
+ if hasattr(self.context, "find_all"):
2016
2093
  try:
2017
- text_elements = self.context.find_all('text', apply_exclusions=False)
2018
- return text_elements.elements if hasattr(text_elements, 'elements') else text_elements
2094
+ text_elements = self.context.find_all("text", apply_exclusions=False)
2095
+ return (
2096
+ text_elements.elements if hasattr(text_elements, "elements") else text_elements
2097
+ )
2019
2098
  except Exception as e:
2020
2099
  logger.warning(f"Error getting text elements: {e}")
2021
2100
  return []
@@ -2026,32 +2105,32 @@ class Guides:
2026
2105
  # -------------------------------------------------------------------------
2027
2106
  # Instance methods for fluent chaining (avoid name conflicts with class methods)
2028
2107
  # -------------------------------------------------------------------------
2029
-
2108
+
2030
2109
  def add_content(
2031
2110
  self,
2032
- axis: Literal['vertical', 'horizontal'] = 'vertical',
2111
+ axis: Literal["vertical", "horizontal"] = "vertical",
2033
2112
  markers: Union[str, List[str], "ElementCollection", None] = None,
2034
2113
  obj: Optional[Union["Page", "Region"]] = None,
2035
- align: Literal['left', 'right', 'center', 'between'] = 'left',
2114
+ align: Literal["left", "right", "center", "between"] = "left",
2036
2115
  outer: bool = True,
2037
- tolerance: float = 5
2116
+ tolerance: float = 5,
2038
2117
  ) -> "Guides":
2039
2118
  """
2040
2119
  Instance method: Add guides from content, allowing chaining.
2041
2120
  This allows: Guides.new(page).add_content(axis='vertical', markers=[...])
2042
-
2121
+
2043
2122
  Args:
2044
2123
  axis: Which axis to create guides for
2045
2124
  markers: Content to search for. Can be:
2046
2125
  - str: single selector or literal text
2047
- - List[str]: list of selectors or literal text strings
2126
+ - List[str]: list of selectors or literal text strings
2048
2127
  - ElementCollection: collection of elements to extract text from
2049
2128
  - None: no markers
2050
2129
  obj: Page or Region to search (uses self.context if None)
2051
2130
  align: How to align guides relative to found elements
2052
2131
  outer: Whether to add outer boundary guides
2053
2132
  tolerance: Tolerance for snapping to element edges
2054
-
2133
+
2055
2134
  Returns:
2056
2135
  Self for method chaining
2057
2136
  """
@@ -2059,7 +2138,7 @@ class Guides:
2059
2138
  target_obj = obj or self.context
2060
2139
  if target_obj is None:
2061
2140
  raise ValueError("No object provided and no context available")
2062
-
2141
+
2063
2142
  # Create new guides using the class method
2064
2143
  new_guides = Guides.from_content(
2065
2144
  obj=target_obj,
@@ -2067,34 +2146,34 @@ class Guides:
2067
2146
  markers=markers,
2068
2147
  align=align,
2069
2148
  outer=outer,
2070
- tolerance=tolerance
2149
+ tolerance=tolerance,
2071
2150
  )
2072
-
2151
+
2073
2152
  # Add the appropriate coordinates to this object
2074
- if axis == 'vertical':
2153
+ if axis == "vertical":
2075
2154
  self.vertical = list(set(self.vertical + new_guides.vertical))
2076
2155
  else:
2077
2156
  self.horizontal = list(set(self.horizontal + new_guides.horizontal))
2078
-
2157
+
2079
2158
  return self
2080
-
2159
+
2081
2160
  def add_lines(
2082
2161
  self,
2083
- axis: Literal['vertical', 'horizontal', 'both'] = 'both',
2084
- obj: Optional[Union["Page", "Region"]] = None,
2085
- threshold: Union[float, str] = 'auto',
2162
+ axis: Literal["vertical", "horizontal", "both"] = "both",
2163
+ obj: Optional[Union["Page", "Region"]] = None,
2164
+ threshold: Union[float, str] = "auto",
2086
2165
  source_label: Optional[str] = None,
2087
2166
  max_lines_h: Optional[int] = None,
2088
2167
  max_lines_v: Optional[int] = None,
2089
2168
  outer: bool = False,
2090
- detection_method: str = 'vector',
2169
+ detection_method: str = "vector",
2091
2170
  resolution: int = 192,
2092
- **detect_kwargs
2171
+ **detect_kwargs,
2093
2172
  ) -> "Guides":
2094
2173
  """
2095
2174
  Instance method: Add guides from lines, allowing chaining.
2096
2175
  This allows: Guides.new(page).add_lines(axis='horizontal')
2097
-
2176
+
2098
2177
  Args:
2099
2178
  axis: Which axis to detect lines for
2100
2179
  obj: Page or Region to search (uses self.context if None)
@@ -2106,7 +2185,7 @@ class Guides:
2106
2185
  detection_method: 'vector' (use existing LineElements) or 'pixels' (detect from image)
2107
2186
  resolution: DPI for pixel-based detection (default: 192)
2108
2187
  **detect_kwargs: Additional parameters for pixel detection (see from_lines)
2109
-
2188
+
2110
2189
  Returns:
2111
2190
  Self for method chaining
2112
2191
  """
@@ -2114,7 +2193,7 @@ class Guides:
2114
2193
  target_obj = obj or self.context
2115
2194
  if target_obj is None:
2116
2195
  raise ValueError("No object provided and no context available")
2117
-
2196
+
2118
2197
  # Create new guides using the class method
2119
2198
  new_guides = Guides.from_lines(
2120
2199
  obj=target_obj,
@@ -2126,32 +2205,32 @@ class Guides:
2126
2205
  outer=outer,
2127
2206
  detection_method=detection_method,
2128
2207
  resolution=resolution,
2129
- **detect_kwargs
2208
+ **detect_kwargs,
2130
2209
  )
2131
-
2210
+
2132
2211
  # Add the appropriate coordinates to this object
2133
- if axis in ('vertical', 'both'):
2212
+ if axis in ("vertical", "both"):
2134
2213
  self.vertical = list(set(self.vertical + new_guides.vertical))
2135
- if axis in ('horizontal', 'both'):
2214
+ if axis in ("horizontal", "both"):
2136
2215
  self.horizontal = list(set(self.horizontal + new_guides.horizontal))
2137
-
2216
+
2138
2217
  return self
2139
-
2218
+
2140
2219
  def add_whitespace(
2141
2220
  self,
2142
- axis: Literal['vertical', 'horizontal', 'both'] = 'both',
2221
+ axis: Literal["vertical", "horizontal", "both"] = "both",
2143
2222
  obj: Optional[Union["Page", "Region"]] = None,
2144
- min_gap: float = 10
2223
+ min_gap: float = 10,
2145
2224
  ) -> "Guides":
2146
2225
  """
2147
2226
  Instance method: Add guides from whitespace, allowing chaining.
2148
2227
  This allows: Guides.new(page).add_whitespace(axis='both')
2149
-
2228
+
2150
2229
  Args:
2151
2230
  axis: Which axis to create guides for
2152
2231
  obj: Page or Region to search (uses self.context if None)
2153
2232
  min_gap: Minimum gap size to consider
2154
-
2233
+
2155
2234
  Returns:
2156
2235
  Self for method chaining
2157
2236
  """
@@ -2159,18 +2238,14 @@ class Guides:
2159
2238
  target_obj = obj or self.context
2160
2239
  if target_obj is None:
2161
2240
  raise ValueError("No object provided and no context available")
2162
-
2241
+
2163
2242
  # Create new guides using the class method
2164
- new_guides = Guides.from_whitespace(
2165
- obj=target_obj,
2166
- axis=axis,
2167
- min_gap=min_gap
2168
- )
2169
-
2243
+ new_guides = Guides.from_whitespace(obj=target_obj, axis=axis, min_gap=min_gap)
2244
+
2170
2245
  # Add the appropriate coordinates to this object
2171
- if axis in ('vertical', 'both'):
2246
+ if axis in ("vertical", "both"):
2172
2247
  self.vertical = list(set(self.vertical + new_guides.vertical))
2173
- if axis in ('horizontal', 'both'):
2248
+ if axis in ("horizontal", "both"):
2174
2249
  self.horizontal = list(set(self.horizontal + new_guides.horizontal))
2175
-
2176
- return self
2250
+
2251
+ return self