natural-pdf 0.1.30__py3-none-any.whl → 0.1.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/__init__.py +18 -4
- natural_pdf/analyzers/guides.py +2176 -0
- natural_pdf/analyzers/shape_detection_mixin.py +0 -650
- natural_pdf/core/element_manager.py +86 -27
- natural_pdf/core/page.py +49 -1
- natural_pdf/core/pdf.py +22 -0
- natural_pdf/elements/collections.py +61 -0
- natural_pdf/elements/region.py +257 -14
- natural_pdf/elements/text.py +29 -0
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/RECORD +15 -19
- bad_pdf_analysis/analyze_10_more.py +0 -300
- bad_pdf_analysis/analyze_final_10.py +0 -552
- bad_pdf_analysis/analyze_specific_pages.py +0 -394
- bad_pdf_analysis/analyze_specific_pages_direct.py +0 -382
- tools/rtl_smoke_test.py +0 -80
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.32.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,2176 @@
|
|
1
|
+
"""Guide system for table extraction and layout analysis."""
|
2
|
+
|
3
|
+
import json
|
4
|
+
import logging
|
5
|
+
from typing import Any, Dict, List, Literal, Optional, Tuple, Union, TYPE_CHECKING
|
6
|
+
from collections import UserList
|
7
|
+
|
8
|
+
import numpy as np
|
9
|
+
from PIL import Image, ImageDraw
|
10
|
+
|
11
|
+
if TYPE_CHECKING:
|
12
|
+
from natural_pdf.core.page import Page
|
13
|
+
from natural_pdf.elements.region import Region
|
14
|
+
from natural_pdf.elements.base import Element
|
15
|
+
from natural_pdf.elements.collections import ElementCollection
|
16
|
+
|
17
|
+
logger = logging.getLogger(__name__)
|
18
|
+
|
19
|
+
|
20
|
+
def _normalize_markers(
|
21
|
+
markers: Union[str, List[str], "ElementCollection", None],
|
22
|
+
obj: Union["Page", "Region"]
|
23
|
+
) -> List[str]:
|
24
|
+
"""
|
25
|
+
Normalize markers parameter to a list of text strings for guide creation.
|
26
|
+
|
27
|
+
Args:
|
28
|
+
markers: Can be:
|
29
|
+
- str: single selector or text string
|
30
|
+
- List[str]: list of selectors or text strings
|
31
|
+
- ElementCollection: collection of elements to extract text from
|
32
|
+
- None: empty list
|
33
|
+
obj: Object to search for elements if markers contains selectors
|
34
|
+
|
35
|
+
Returns:
|
36
|
+
List of text strings to search for
|
37
|
+
"""
|
38
|
+
if markers is None:
|
39
|
+
return []
|
40
|
+
|
41
|
+
if isinstance(markers, str):
|
42
|
+
# Single selector or text string
|
43
|
+
if markers.startswith(('text', 'region', 'line', 'rect', 'blob', 'image')):
|
44
|
+
# It's a CSS selector, find elements and extract text
|
45
|
+
if hasattr(obj, 'find_all'):
|
46
|
+
elements = obj.find_all(markers)
|
47
|
+
return [elem.text if hasattr(elem, 'text') else str(elem) for elem in elements]
|
48
|
+
else:
|
49
|
+
logger.warning(f"Object {obj} doesn't support find_all for selector '{markers}'")
|
50
|
+
return [markers] # Treat as literal text
|
51
|
+
else:
|
52
|
+
# Treat as literal text
|
53
|
+
return [markers]
|
54
|
+
|
55
|
+
elif hasattr(markers, '__iter__') and not isinstance(markers, str):
|
56
|
+
# It might be an ElementCollection or list
|
57
|
+
if hasattr(markers, 'extract_each_text'):
|
58
|
+
# It's an ElementCollection
|
59
|
+
try:
|
60
|
+
return markers.extract_each_text()
|
61
|
+
except Exception as e:
|
62
|
+
logger.warning(f"Failed to extract text from ElementCollection: {e}")
|
63
|
+
# Fallback: try to get text from individual elements
|
64
|
+
texts = []
|
65
|
+
for elem in markers:
|
66
|
+
if hasattr(elem, 'text'):
|
67
|
+
texts.append(elem.text)
|
68
|
+
elif hasattr(elem, 'extract_text'):
|
69
|
+
texts.append(elem.extract_text())
|
70
|
+
else:
|
71
|
+
texts.append(str(elem))
|
72
|
+
return texts
|
73
|
+
else:
|
74
|
+
# It's a regular list - process each item
|
75
|
+
result = []
|
76
|
+
for marker in markers:
|
77
|
+
if isinstance(marker, str):
|
78
|
+
if marker.startswith(('text', 'region', 'line', 'rect', 'blob', 'image')):
|
79
|
+
# It's a selector
|
80
|
+
if hasattr(obj, 'find_all'):
|
81
|
+
elements = obj.find_all(marker)
|
82
|
+
result.extend([elem.text if hasattr(elem, 'text') else str(elem) for elem in elements])
|
83
|
+
else:
|
84
|
+
result.append(marker) # Treat as literal
|
85
|
+
else:
|
86
|
+
# Literal text
|
87
|
+
result.append(marker)
|
88
|
+
elif hasattr(marker, 'text'):
|
89
|
+
# It's an element object
|
90
|
+
result.append(marker.text)
|
91
|
+
elif hasattr(marker, 'extract_text'):
|
92
|
+
# It's an element that can extract text
|
93
|
+
result.append(marker.extract_text())
|
94
|
+
else:
|
95
|
+
result.append(str(marker))
|
96
|
+
return result
|
97
|
+
|
98
|
+
else:
|
99
|
+
# Unknown type, try to convert to string
|
100
|
+
return [str(markers)]
|
101
|
+
|
102
|
+
|
103
|
+
class GuidesList(UserList):
    """A list of guide coordinates for one axis that can also create and edit guides.

    Every mutating helper returns the parent ``Guides`` object so calls can be
    chained. Operations that ``UserList`` would normally answer with a new
    instance of the subclass (slicing, ``+``) return plain lists instead,
    because this class cannot be constructed from data alone.
    """

    def __init__(self, parent_guides: "Guides", axis: Literal["vertical", "horizontal"], data=None):
        super().__init__(data or [])
        self._parent = parent_guides
        self._axis = axis

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _resolve_target(self, obj):
        """Return *obj*, or the parent's context when *obj* is falsy; raise if neither exists."""
        target_obj = obj or self._parent.context
        if target_obj is None:
            raise ValueError("No object provided and no context available")
        return target_obj

    def _absorb(self, new_guides: "Guides") -> "Guides":
        """Append this axis' coordinates from *new_guides*, drop duplicates, return the parent."""
        incoming = new_guides.vertical if self._axis == 'vertical' else new_guides.horizontal
        self.extend(incoming)
        # dict.fromkeys keeps the first occurrence of each value in original order.
        self.data = list(dict.fromkeys(self.data))
        return self._parent

    def _aligned_position(self, element, align: str) -> float:
        """Coordinate of *element* on this axis for the given alignment.

        On the horizontal axis 'left' means the element's top and 'right' its
        bottom; any other value selects the midpoint.
        """
        if self._axis == 'vertical':
            low, high = element.x0, element.x1
        else:
            low, high = element.top, element.bottom
        if align == 'left':
            return low
        if align == 'right':
            return high
        return (low + high) / 2

    # ------------------------------------------------------------------
    # Guide creation
    # ------------------------------------------------------------------

    def from_content(
        self,
        markers: Union[str, List[str], "ElementCollection", None],
        obj: Optional[Union["Page", "Region"]] = None,
        align: Literal['left', 'right', 'center', 'between'] = 'left',
        outer: bool = True,
        tolerance: float = 5
    ) -> "Guides":
        """
        Create guides from content markers and add to this axis.

        Args:
            markers: Content to search for. Can be:
                - str: single selector (e.g., 'text:contains("Name")') or literal text
                - List[str]: list of selectors or literal text strings
                - ElementCollection: collection of elements to extract text from
                - None: no markers
            obj: Page/Region to search (uses parent's context if None)
            align: How to align guides relative to found elements
            outer: Whether to add outer boundary guides
            tolerance: Tolerance for snapping to element edges

        Returns:
            Parent Guides object for chaining

        Raises:
            ValueError: If no object is given and the parent has no context
        """
        target_obj = self._resolve_target(obj)

        # Normalize markers to list of text strings
        marker_texts = _normalize_markers(markers, target_obj)

        new_guides = Guides.from_content(
            obj=target_obj,
            axis=self._axis,
            markers=marker_texts,
            align=align,
            outer=outer,
            tolerance=tolerance
        )
        return self._absorb(new_guides)

    def from_lines(
        self,
        obj: Optional[Union["Page", "Region"]] = None,
        threshold: Union[float, str] = 'auto',
        source_label: Optional[str] = None,
        max_lines: Optional[int] = None,
        outer: bool = False,
        detection_method: str = 'vector',
        resolution: int = 192,
        *,
        n: Optional[int] = None,
        min_gap: Optional[int] = None,
        **detect_kwargs
    ) -> "Guides":
        """
        Create guides from detected line elements.

        Args:
            obj: Page/Region to search (uses parent's context if None)
            threshold: Line detection threshold ('auto' or float 0.0-1.0)
            source_label: Filter lines by source label (for vector method)
            max_lines: Maximum lines to use (alias: n)
            n: Convenience alias for max_lines. If provided, overrides max_lines.
            min_gap: Minimum pixel gap enforced between detected lines. Mapped to
                ``min_gap_h`` or ``min_gap_v`` depending on axis (ignored if those
                keys are already supplied via ``detect_kwargs``).
            outer: Whether to add outer boundary guides
            detection_method: 'vector' (use existing LineElements) or 'pixels' (detect from image)
            resolution: DPI for pixel-based detection (default: 192)
            **detect_kwargs: Additional parameters for pixel-based detection
                (e.g., min_gap_h, min_gap_v, binarization_method, etc.)

        Returns:
            Parent Guides object for chaining

        Raises:
            ValueError: If no object/context is available, ``n`` is not positive,
                or ``min_gap`` is below 1
        """
        target_obj = self._resolve_target(obj)

        # Resolve max_lines via alias `n` (n takes priority)
        if n is not None:
            if n <= 0:
                raise ValueError("n must be a positive integer")
            max_lines = n

        # The underlying API takes one limit per orientation; only ours is set.
        horizontal_axis = self._axis == 'horizontal'
        max_lines_h = max_lines if horizontal_axis else None
        max_lines_v = max_lines if self._axis == 'vertical' else None

        # Map generic `min_gap` to the axis-specific argument expected by detection
        if min_gap is not None:
            if min_gap < 1:
                raise ValueError("min_gap must be ≥ 1 pixel")
            axis_key = 'min_gap_h' if horizontal_axis else 'min_gap_v'
            detect_kwargs.setdefault(axis_key, min_gap)

        new_guides = Guides.from_lines(
            obj=target_obj,
            axis=self._axis,
            threshold=threshold,
            source_label=source_label,
            max_lines_h=max_lines_h,
            max_lines_v=max_lines_v,
            outer=outer,
            detection_method=detection_method,
            resolution=resolution,
            **detect_kwargs
        )
        return self._absorb(new_guides)

    def from_whitespace(
        self,
        obj: Optional[Union["Page", "Region"]] = None,
        min_gap: float = 10
    ) -> "Guides":
        """
        Create guides from whitespace gaps.

        Args:
            obj: Page/Region to analyze (uses parent's context if None)
            min_gap: Minimum gap size to consider

        Returns:
            Parent Guides object for chaining

        Raises:
            ValueError: If no object is given and the parent has no context
        """
        target_obj = self._resolve_target(obj)
        new_guides = Guides.from_whitespace(
            obj=target_obj,
            axis=self._axis,
            min_gap=min_gap
        )
        return self._absorb(new_guides)

    def divide(self, n: int = 2, obj: Optional[Union["Page", "Region"]] = None) -> "Guides":
        """
        Divide the space evenly along this axis.

        Args:
            n: Number of divisions, forwarded to ``Guides.divide`` (which yields
                n+1 evenly spaced guides including both outer edges)
            obj: Object to divide (uses parent's context if None)

        Returns:
            Parent Guides object for chaining

        Raises:
            ValueError: If no object is given and the parent has no context
        """
        target_obj = self._resolve_target(obj)
        new_guides = Guides.divide(
            obj=target_obj,
            n=n,
            axis=self._axis
        )
        return self._absorb(new_guides)

    # ------------------------------------------------------------------
    # Snapping
    # ------------------------------------------------------------------

    def snap_to_whitespace(
        self,
        min_gap: float = 10.0,
        detection_method: str = 'pixels',
        threshold: Union[float, str] = 'auto',
        on_no_snap: str = 'warn',
        obj: Optional[Union["Page", "Region"]] = None
    ) -> "Guides":
        """
        Snap guides in this axis to whitespace gaps.

        Args:
            min_gap: Minimum gap size to consider
            detection_method: 'pixels' or 'text' for gap detection
            threshold: Threshold for whitespace detection (0.0-1.0) or 'auto'
            on_no_snap: What to do when snapping fails ('warn', 'raise', 'ignore')
            obj: Object to analyze (uses parent's context if None). It is only
                checked for availability here; it is not forwarded to the
                parent's ``snap_to_whitespace``.

        Returns:
            Parent Guides object for chaining

        Raises:
            ValueError: If no object is given and the parent has no context
        """
        self._resolve_target(obj)

        vertical_axis = self._axis == 'vertical'

        # Temporarily empty the other axis so the parent call only sees this one.
        other_axis = self._parent.horizontal if vertical_axis else self._parent.vertical
        saved_other = other_axis.data.copy()
        other_axis.data = []

        try:
            self._parent.snap_to_whitespace(
                axis=self._axis,
                min_gap=min_gap,
                detection_method=detection_method,
                threshold=threshold,
                on_no_snap=on_no_snap
            )

            # Update our data from the parent
            own_axis = self._parent.vertical if vertical_axis else self._parent.horizontal
            self.data = own_axis.data.copy()
        finally:
            # Restore the other axis even when snapping raised
            other_axis.data = saved_other

        return self._parent

    def snap_to_content(
        self,
        markers: Union[str, List[str], "ElementCollection", None] = 'text',
        align: Literal['left', 'right', 'center'] = 'left',
        tolerance: float = 5,
        obj: Optional[Union["Page", "Region"]] = None
    ) -> "Guides":
        """
        Snap guides in this axis to nearby text content.

        Args:
            markers: Content to snap to. Can be:
                - str: single selector or literal text (default: 'text' for all text)
                - List[str]: list of selectors or literal text strings
                - ElementCollection: collection of elements
                - None: no markers (no snapping)
            align: How to align to the found text ('left'/'right' mean
                top/bottom on the horizontal axis)
            tolerance: Maximum distance to move when snapping
            obj: Object to search (uses parent's context if None)

        Returns:
            Parent Guides object for chaining

        Raises:
            ValueError: If no object is given and the parent has no context
        """
        target_obj = self._resolve_target(obj)

        if isinstance(markers, str) and markers == 'text':
            # Special case: snap every guide to the nearest text element
            if hasattr(target_obj, 'find_all'):
                text_elements = target_obj.find_all('text')
                if hasattr(text_elements, 'elements'):
                    text_elements = text_elements.elements

                # Element positions do not depend on the guide, so compute them once.
                candidates = [self._aligned_position(elem, align) for elem in text_elements]

                for i, guide_pos in enumerate(self.data):
                    in_reach = [pos for pos in candidates if abs(guide_pos - pos) <= tolerance]
                    if not in_reach:
                        continue
                    # min() keeps the first of equally close candidates
                    best_pos = min(in_reach, key=lambda pos: abs(guide_pos - pos))
                    if best_pos != guide_pos:
                        self.data[i] = best_pos
                        logger.debug(f"Snapped {self._axis} guide from {guide_pos:.1f} to {best_pos:.1f}")
            else:
                logger.warning("Object does not support find_all for text snapping")
        else:
            # Specific markers: locate each one and pull the closest guide onto it
            marker_texts = _normalize_markers(markers, target_obj)

            if not hasattr(target_obj, 'find'):
                if marker_texts:
                    logger.warning("Object does not support find for text snapping")
                marker_texts = []

            for marker in marker_texts:
                element = target_obj.find(f'text:contains("{marker}")')
                if not element:
                    logger.warning(f"Could not find text '{marker}' for snapping")
                    continue

                target_pos = self._aligned_position(element, align)

                # Find closest guide and snap if within tolerance
                if self.data:
                    closest_idx = min(range(len(self.data)),
                                      key=lambda i: abs(self.data[i] - target_pos))
                    if abs(self.data[closest_idx] - target_pos) <= tolerance:
                        self.data[closest_idx] = target_pos

        # Sort after snapping
        self.data.sort()
        return self._parent

    # ------------------------------------------------------------------
    # Manual editing
    # ------------------------------------------------------------------

    def shift(self, index: int, offset: float) -> "Guides":
        """
        Move a specific guide in this axis by an offset amount.

        Args:
            index: Index of the guide to move
            offset: Amount to move (positive = right/down)

        Returns:
            Parent Guides object for chaining
        """
        if 0 <= index < len(self.data):
            self.data[index] += offset
            self.data.sort()
        else:
            logger.warning(f"Guide index {index} out of range for {self._axis} axis")

        return self._parent

    def add(self, position: Union[float, List[float]]) -> "Guides":
        """
        Add one or more guides at the specified position(s).

        Args:
            position: Coordinate(s) to add guide(s) at. Can be:
                - float: single position
                - List[float]: multiple positions

        Returns:
            Parent Guides object for chaining
        """
        positions = position if isinstance(position, (list, tuple)) else [position]
        for pos in positions:
            self.append(float(pos))

        self.data.sort()
        return self._parent

    def remove_at(self, index: int) -> "Guides":
        """
        Remove a guide by index.

        Args:
            index: Index of guide to remove

        Returns:
            Parent Guides object for chaining
        """
        if 0 <= index < len(self.data):
            self.data.pop(index)
        else:
            # Same reporting as shift() for an index that does not exist
            logger.warning(f"Guide index {index} out of range for {self._axis} axis")
        return self._parent

    def clear_all(self) -> "Guides":
        """
        Remove all guides from this axis.

        Returns:
            Parent Guides object for chaining
        """
        self.data.clear()
        return self._parent

    # ------------------------------------------------------------------
    # Sequence protocol overrides
    # ------------------------------------------------------------------
    # UserList implements these by calling self.__class__(data); that raises
    # TypeError here because the constructor also needs a parent and an axis.

    def __getitem__(self, index):
        """Return one coordinate, or a plain list of coordinates for a slice."""
        return self.data[index]

    def copy(self) -> "GuidesList":
        """Return a new GuidesList with the same parent, axis and a copy of the data."""
        return GuidesList(self._parent, self._axis, list(self.data))

    def __add__(self, other):
        """Handle addition of GuidesList objects by returning combined data."""
        if isinstance(other, GuidesList):
            return self.data + other.data
        elif isinstance(other, list):
            return self.data + other
        else:
            return NotImplemented

    def __radd__(self, other):
        """Handle ``list + GuidesList`` by returning a plain combined list."""
        if isinstance(other, list):
            return other + self.data
        return NotImplemented
|
582
|
+
|
583
|
+
|
584
|
+
class Guides:
|
585
|
+
"""
|
586
|
+
Manages vertical and horizontal guide lines for table extraction and layout analysis.
|
587
|
+
|
588
|
+
Guides are collections of coordinates that can be used to define table boundaries,
|
589
|
+
column positions, or general layout structures. They can be created through various
|
590
|
+
detection methods or manually specified.
|
591
|
+
|
592
|
+
Attributes:
|
593
|
+
verticals: List of x-coordinates for vertical guide lines
|
594
|
+
horizontals: List of y-coordinates for horizontal guide lines
|
595
|
+
context: Optional Page/Region that these guides relate to
|
596
|
+
bounds: Optional bounding box (x0, y0, x1, y1) for relative coordinate conversion
|
597
|
+
snap_behavior: How to handle failed snapping operations ('warn', 'ignore', 'raise')
|
598
|
+
"""
|
599
|
+
|
600
|
+
def __init__(
    self,
    verticals: Optional[Union[List[float], "Page", "Region"]] = None,
    horizontals: Optional[List[float]] = None,
    context: Optional[Union["Page", "Region"]] = None,
    bounds: Optional[Tuple[float, float, float, float]] = None,
    relative: bool = False,
    snap_behavior: Literal['raise', 'warn', 'ignore'] = 'warn'
):
    """
    Initialize a Guides object.

    Args:
        verticals: List of x-coordinates for vertical guides, or a Page/Region as context
        horizontals: List of y-coordinates for horizontal guides
        context: Page or Region object these guides were created from
        bounds: Bounding box (x0, top, x1, bottom) if context not provided
        relative: Whether coordinates are relative (0-1) or absolute. Relative
            coordinates are converted to absolute ones when bounds are known,
            after which ``self.relative`` is reset to False.
        snap_behavior: How to handle snapping conflicts ('raise', 'warn', or 'ignore')
    """
    # Handle Guides(page) shorthand: a lone first argument that is not a
    # coordinate sequence is taken to be the context. UserList is included so
    # that a GuidesList passed as `verticals` is kept as coordinates rather
    # than being mistaken for a page/region.
    coordinate_types = (list, tuple, UserList)
    is_shorthand = (
        verticals is not None
        and not isinstance(verticals, coordinate_types)
        and horizontals is None
        and context is None
    )
    if is_shorthand:
        context = verticals
        verticals = None

    self.context = context
    self.bounds = bounds
    self.relative = relative
    self.snap_behavior = snap_behavior

    # Initialize with GuidesList instances holding sorted float coordinates
    self._vertical = GuidesList(self, "vertical", sorted([float(x) for x in (verticals or [])]))
    self._horizontal = GuidesList(self, "horizontal", sorted([float(y) for y in (horizontals or [])]))

    # Determine bounds from context if needed; reuse the shared lookup so the
    # width/height fallback applies here as well.
    if self.bounds is None and self.context is not None:
        self.bounds = self._get_context_bounds()

    # Convert relative to absolute if needed
    if self.relative and self.bounds:
        x0, top, x1, bottom = self.bounds
        width = x1 - x0
        height = bottom - top

        self._vertical.data = [x0 + v * width for v in self._vertical]
        self._horizontal.data = [top + h * height for h in self._horizontal]
        self.relative = False
|
652
|
+
|
653
|
+
@property
def vertical(self) -> GuidesList:
    """The GuidesList holding the x-coordinates of the vertical guides."""
    return self._vertical

@vertical.setter
def vertical(self, value: Union[List[float], "Guides", None]):
    """Replace the vertical guides.

    Accepts None (clears the axis), another Guides object (its vertical
    coordinates are copied) or any non-string iterable of numbers. The stored
    coordinates are always sorted floats.

    Raises:
        TypeError: For strings, non-iterables, or iterables with non-numeric items
    """
    if value is None:
        self._vertical.data = []
        return

    if isinstance(value, str):
        # Explicitly reject strings to avoid confusing iteration over characters
        raise TypeError(f"vertical cannot be a string, got '{value}'. Use a list of coordinates or Guides object.")

    if isinstance(value, Guides):
        # Copy the vertical coordinates of the other Guides object
        coords = [float(x) for x in value.vertical]
        self._vertical.data = sorted(coords)
        return

    if not hasattr(value, '__iter__'):
        raise TypeError(f"vertical must be a list, Guides object, or None, got {type(value)}")

    # Generic iterable of coordinates
    try:
        coords = [float(x) for x in value]
        self._vertical.data = sorted(coords)
    except (ValueError, TypeError) as e:
        raise TypeError(f"vertical must contain numeric values, got {value}: {e}")
|
677
|
+
|
678
|
+
@property
def horizontal(self) -> GuidesList:
    """The GuidesList holding the y-coordinates of the horizontal guides."""
    return self._horizontal

@horizontal.setter
def horizontal(self, value: Union[List[float], "Guides", None]):
    """Replace the horizontal guides.

    Accepts None (clears the axis), another Guides object (its horizontal
    coordinates are copied) or any non-string iterable of numbers. The stored
    coordinates are always sorted floats.

    Raises:
        TypeError: For strings, non-iterables, or iterables with non-numeric items
    """
    if value is None:
        self._horizontal.data = []
        return

    if isinstance(value, str):
        # Explicitly reject strings
        raise TypeError(f"horizontal cannot be a string, got '{value}'. Use a list of coordinates or Guides object.")

    if isinstance(value, Guides):
        # Copy the horizontal coordinates of the other Guides object
        coords = [float(y) for y in value.horizontal]
        self._horizontal.data = sorted(coords)
        return

    if not hasattr(value, '__iter__'):
        raise TypeError(f"horizontal must be a list, Guides object, or None, got {type(value)}")

    # Generic iterable of coordinates
    try:
        coords = [float(y) for y in value]
        self._horizontal.data = sorted(coords)
    except (ValueError, TypeError) as e:
        raise TypeError(f"horizontal must contain numeric values, got {value}: {e}")
|
702
|
+
|
703
|
+
def _get_context_bounds(self) -> Optional[Tuple[float, float, float, float]]:
|
704
|
+
"""Get bounds from context if available."""
|
705
|
+
if self.context is None:
|
706
|
+
return None
|
707
|
+
|
708
|
+
if hasattr(self.context, 'bbox'):
|
709
|
+
return self.context.bbox
|
710
|
+
elif hasattr(self.context, 'x0') and hasattr(self.context, 'top'):
|
711
|
+
return (self.context.x0, self.context.top, self.context.x1, self.context.bottom)
|
712
|
+
elif hasattr(self.context, 'width') and hasattr(self.context, 'height'):
|
713
|
+
return (0, 0, self.context.width, self.context.height)
|
714
|
+
return None
|
715
|
+
|
716
|
+
# -------------------------------------------------------------------------
|
717
|
+
# Factory Methods
|
718
|
+
# -------------------------------------------------------------------------
|
719
|
+
|
720
|
+
@classmethod
def divide(
    cls,
    obj: Union["Page", "Region", Tuple[float, float, float, float]],
    n: Optional[int] = None,
    cols: Optional[int] = None,
    rows: Optional[int] = None,
    axis: Literal['vertical', 'horizontal', 'both'] = 'both'
) -> "Guides":
    """
    Create guides by evenly dividing an object.

    Args:
        obj: Object to divide (Page, Region, or bbox tuple)
        n: Number of divisions (creates n+1 guides). Used if cols/rows not specified.
        cols: Number of columns (creates cols+1 vertical guides)
        rows: Number of rows (creates rows+1 horizontal guides)
        axis: Which axis to divide along

    Returns:
        New Guides object with evenly spaced lines. An axis gets no guides
        when it is not selected by ``axis`` or when no count applies to it.

    Raises:
        ValueError: If the count used for a selected axis is less than 1
            (a count of 0 previously caused a ZeroDivisionError)

    Examples:
        # Divide into 3 columns
        guides = Guides.divide(page, cols=3)

        # Divide into 5 rows
        guides = Guides.divide(region, rows=5)

        # Divide both axes
        guides = Guides.divide(page, cols=3, rows=5)
    """
    # Extract bounds from object
    if isinstance(obj, tuple) and len(obj) == 4:
        bounds = obj
        context = None
    else:
        context = obj
        if hasattr(obj, 'bbox'):
            bounds = obj.bbox
        elif hasattr(obj, 'x0'):
            bounds = (obj.x0, obj.top, obj.x1, obj.bottom)
        else:
            bounds = (0, 0, obj.width, obj.height)

    x0, y0, x1, y1 = bounds

    def evenly_spaced(start, end, divisions, label):
        """Return divisions+1 coordinates from start to end inclusive."""
        if divisions is None:
            return []
        if divisions < 1:
            raise ValueError(f"{label} must be a positive integer, got {divisions}")
        span = end - start
        return [float(start + span * i / divisions) for i in range(divisions + 1)]

    verticals = []
    horizontals = []

    # An explicit cols/rows value takes precedence over the generic n
    if axis in ('vertical', 'both'):
        if cols is not None:
            verticals = evenly_spaced(x0, x1, cols, 'cols')
        else:
            verticals = evenly_spaced(x0, x1, n, 'n')

    if axis in ('horizontal', 'both'):
        if rows is not None:
            horizontals = evenly_spaced(y0, y1, rows, 'rows')
        else:
            horizontals = evenly_spaced(y0, y1, n, 'n')

    return cls(verticals=verticals, horizontals=horizontals, context=context, bounds=bounds)
|
786
|
+
|
787
|
+
@classmethod
def from_lines(
    cls,
    obj: Union["Page", "Region"],
    axis: Literal['vertical', 'horizontal', 'both'] = 'both',
    threshold: Union[float, str] = 'auto',
    source_label: Optional[str] = None,
    max_lines_h: Optional[int] = None,
    max_lines_v: Optional[int] = None,
    outer: bool = False,
    detection_method: str = 'vector',
    resolution: int = 192,
    **detect_kwargs
) -> "Guides":
    """
    Build guides from the line elements of a page or region.

    Args:
        obj: Page or Region whose lines are used.
        axis: Orientation(s) of lines to turn into guides.
        threshold: 'auto' or a float; only consulted for pixel detection,
            where it becomes the peak threshold passed to ``detect_lines``.
        source_label: For the vector method, keep only lines whose ``source``
            equals this label. For the pixel method, the label given to the
            detected lines (defaults to 'guides_detection').
        max_lines_h: Keep at most this many horizontal lines (longest first).
        max_lines_v: Keep at most this many vertical lines (longest first).
        outer: Also add guides on the object's bounding box edges.
        detection_method: 'pixels' runs ``obj.detect_lines`` first; any other
            value uses the lines already present on the object.
        resolution: DPI forwarded to pixel-based detection.
        **detect_kwargs: Extra pixel-detection options. Only 'method',
            'min_gap_h', 'min_gap_v', 'binarization_method',
            'adaptive_thresh_block_size', 'adaptive_thresh_C_val',
            'morph_op_h', 'morph_kernel_h', 'morph_op_v', 'morph_kernel_v',
            'smoothing_sigma_h', 'smoothing_sigma_v' and
            'peak_width_rel_height' are forwarded.

    Returns:
        New Guides object positioned on the selected lines.

    Raises:
        ValueError: If pixel detection is requested but ``obj`` has no
            ``detect_lines`` method.
    """
    want_horizontal = axis in ('horizontal', 'both')
    want_vertical = axis in ('vertical', 'both')

    # Bounding box, needed only when outer guides are requested.
    if hasattr(obj, 'bbox'):
        bounds = obj.bbox
    elif hasattr(obj, 'x0'):
        bounds = (obj.x0, obj.top, obj.x1, obj.bottom)
    elif hasattr(obj, 'width'):
        bounds = (0, 0, obj.width, obj.height)
    else:
        bounds = None

    use_pixels = detection_method == 'pixels'

    if use_pixels:
        if not hasattr(obj, 'detect_lines'):
            raise ValueError(f"Object {obj} does not support pixel-based line detection")

        if threshold == 'auto':
            # Auto mode: accept every peak and let max_lines_* do the pruning.
            peak_h = 0.0
            peak_v = 0.0
        else:
            # Fixed mode: an orientation that was not requested gets 1.0.
            peak_h = float(threshold) if want_horizontal else 1.0
            peak_v = float(threshold) if want_vertical else 1.0

        detect_params = {
            'resolution': resolution,
            'source_label': source_label or 'guides_detection',
            'horizontal': want_horizontal,
            'vertical': want_vertical,
            'replace': True,  # Replace any existing lines with this source
            'method': detect_kwargs.get('method', 'projection'),
            'peak_threshold_h': peak_h,
            'peak_threshold_v': peak_v,
            'max_lines_h': max_lines_h,
            'max_lines_v': max_lines_v,
        }

        forwarded_keys = (
            'min_gap_h', 'min_gap_v', 'binarization_method',
            'adaptive_thresh_block_size', 'adaptive_thresh_C_val',
            'morph_op_h', 'morph_kernel_h', 'morph_op_v', 'morph_kernel_v',
            'smoothing_sigma_h', 'smoothing_sigma_v', 'peak_width_rel_height',
        )
        detect_params.update(
            {name: detect_kwargs[name] for name in forwarded_keys if name in detect_kwargs}
        )

        obj.detect_lines(**detect_params)
        wanted_source = detect_params['source_label']
    else:
        wanted_source = source_label

    # Collect candidate lines from the object.
    if hasattr(obj, 'lines'):
        lines = obj.lines
    elif hasattr(obj, 'find_all'):
        lines = obj.find_all('line')
    else:
        if not use_pixels:
            logger.warning(f"Object {obj} has no lines or find_all method")
        lines = []

    # Pixel mode always filters by the label it just used; vector mode only
    # filters when the caller supplied one.
    if use_pixels or source_label:
        lines = [item for item in lines if getattr(item, 'source', None) == wanted_source]

    # Split by orientation, remembering each line's length for ranking.
    h_candidates = []  # (y_coord, length, line_obj)
    v_candidates = []  # (x_coord, length, line_obj)
    for line in lines:
        if not (hasattr(line, 'is_horizontal') and hasattr(line, 'is_vertical')):
            continue
        if line.is_horizontal and want_horizontal:
            fallback_length = abs(getattr(line, 'x1', 0) - getattr(line, 'x0', 0))
            length = getattr(line, 'width', fallback_length)
            h_candidates.append(((line.top + line.bottom) / 2, length, line))
        elif line.is_vertical and want_vertical:
            fallback_length = abs(getattr(line, 'bottom', 0) - getattr(line, 'top', 0))
            length = getattr(line, 'height', fallback_length)
            v_candidates.append(((line.x0 + line.x1) / 2, length, line))

    # Horizontal guides: optionally keep only the longest lines.
    if max_lines_h is not None and h_candidates:
        ranked_h = sorted(h_candidates, key=lambda entry: entry[1], reverse=True)
        horizontals = sorted(entry[0] for entry in ranked_h[:max_lines_h])
        logger.debug(f"Selected {len(horizontals)} horizontal lines from {len(h_candidates)} candidates")
    else:
        horizontals = sorted({entry[0] for entry in h_candidates})

    # Vertical guides: same treatment.
    if max_lines_v is not None and v_candidates:
        ranked_v = sorted(v_candidates, key=lambda entry: entry[1], reverse=True)
        verticals = sorted(entry[0] for entry in ranked_v[:max_lines_v])
        logger.debug(f"Selected {len(verticals)} vertical lines from {len(v_candidates)} candidates")
    else:
        verticals = sorted({entry[0] for entry in v_candidates})

    # Add bounding-box guides where no guide already sits on or beyond the edge.
    if outer and bounds:
        if want_vertical:
            if not verticals or verticals[0] > bounds[0]:
                verticals.insert(0, bounds[0])  # x0
            if not verticals or verticals[-1] < bounds[2]:
                verticals.append(bounds[2])  # x1
        if want_horizontal:
            if not horizontals or horizontals[0] > bounds[1]:
                horizontals.insert(0, bounds[1])  # y0
            if not horizontals or horizontals[-1] < bounds[3]:
                horizontals.append(bounds[3])  # y1

    return cls(
        verticals=sorted(set(verticals)),
        horizontals=sorted(set(horizontals)),
        context=obj,
        bounds=bounds,
    )
|
969
|
+
|
970
|
+
@classmethod
def from_content(
    cls,
    obj: Union["Page", "Region"],
    axis: Literal['vertical', 'horizontal'] = 'vertical',
    markers: Union[str, List[str], "ElementCollection", None] = None,
    align: Literal['left', 'right', 'center', 'between'] = 'left',
    outer: bool = True,
    tolerance: float = 5
) -> "Guides":
    """
    Create guides based on text content positions.

    Each marker is located once with ``obj.find('text:contains("...")')``
    and its extent along the chosen axis is recorded; the guide position
    is then derived from that extent according to ``align``.

    Args:
        obj: Page or Region to search for content
        axis: Whether to create vertical or horizontal guides
        markers: Content to search for. Can be:
            - str: single selector (e.g., 'text:contains("Name")') or literal text
            - List[str]: list of selectors or literal text strings
            - ElementCollection: collection of elements to extract text from
            - None: no markers
        align: Where to place guides relative to found text. For horizontal
            guides 'left' means the top edge and 'right' the bottom edge.
            'between' places guides midway between adjacent markers; when
            fewer than two markers are found it falls back to the markers'
            leading edges.
        outer: Whether to add guides at the boundaries
        tolerance: Accepted for API compatibility; not used by this method.

    Returns:
        New Guides object aligned to text content
    """
    # Get bounds from object
    bounds = None
    if hasattr(obj, 'bbox'):
        bounds = obj.bbox
    elif hasattr(obj, 'x0'):
        bounds = (obj.x0, obj.top, obj.x1, obj.bottom)
    elif hasattr(obj, 'width'):
        bounds = (0, 0, obj.width, obj.height)

    # Normalize markers to list of text strings
    marker_texts = _normalize_markers(markers, obj)

    # Locate every marker exactly once and remember its (start, end) extent
    # along the guide axis. All alignments, including 'between', are derived
    # from these spans, so no marker is searched for twice.
    spans = []
    if hasattr(obj, 'find'):
        for marker in marker_texts:
            element = obj.find(f'text:contains("{marker}")')
            if not element:
                continue
            if axis == 'vertical':
                spans.append((element.x0, element.x1))
            else:  # horizontal
                spans.append((element.top, element.bottom))

    if align in ('left', 'between'):
        # 'between' starts from leading edges; replaced below when >= 2 found
        guides_coords = [start for start, _ in spans]
    elif align == 'right':
        guides_coords = [end for _, end in spans]
    elif align == 'center':
        guides_coords = [(start + end) / 2 for start, end in spans]
    else:
        # Unknown alignment contributes no marker guides
        guides_coords = []

    # 'between': midpoint between the trailing edge of one marker and the
    # leading edge of the next, with markers ordered by leading edge.
    if align == 'between' and len(spans) >= 2:
        ordered = sorted(spans, key=lambda span: span[0])
        guides_coords = [
            (current[1] + following[0]) / 2
            for current, following in zip(ordered, ordered[1:])
        ]

    # Add outer guides if requested
    if outer and bounds:
        if axis == 'vertical':
            guides_coords.insert(0, bounds[0])  # x0
            guides_coords.append(bounds[2])  # x1
        else:
            guides_coords.insert(0, bounds[1])  # y0
            guides_coords.append(bounds[3])  # y1

    # Remove duplicates and sort
    guides_coords = sorted(set(guides_coords))

    # Create guides object
    if axis == 'vertical':
        return cls(verticals=guides_coords, context=obj, bounds=bounds)
    return cls(horizontals=guides_coords, context=obj, bounds=bounds)
|
1082
|
+
|
1083
|
+
@classmethod
def from_whitespace(
    cls,
    obj: Union["Page", "Region"],
    axis: Literal['vertical', 'horizontal', 'both'] = 'both',
    min_gap: float = 10
) -> "Guides":
    """
    Placeholder for whitespace-based guide creation.

    Gap detection is not implemented here: the call logs that fact and
    returns the result of ``divide`` with ``n=3`` instead.

    Args:
        obj: Page or Region to analyze
        axis: Which axes to produce guides for
        min_gap: Currently ignored

    Returns:
        New Guides object produced by ``cls.divide(obj, n=3, axis=axis)``
    """
    # This is a placeholder - would need sophisticated gap detection
    logger.info("Whitespace detection not yet implemented, using divide instead")
    fallback_guides = cls.divide(obj, n=3, axis=axis)
    return fallback_guides
|
1104
|
+
|
1105
|
+
@classmethod
def new(
    cls,
    context: Optional[Union["Page", "Region"]] = None
) -> "Guides":
    """
    Return an empty Guides object, optionally bound to a context.

    Useful as the starting point of a chain, e.g.:
        guides = Guides.new(page).add_content(axis='vertical', markers=[...])

    Args:
        context: Optional Page or Region stored as the guides' context

    Returns:
        New Guides object with no vertical and no horizontal guides
    """
    no_verticals: List[float] = []
    no_horizontals: List[float] = []
    return cls(verticals=no_verticals, horizontals=no_horizontals, context=context)
|
1123
|
+
|
1124
|
+
# -------------------------------------------------------------------------
|
1125
|
+
# Manipulation Methods
|
1126
|
+
# -------------------------------------------------------------------------
|
1127
|
+
|
1128
|
+
def snap_to_whitespace(
    self,
    axis: str = 'vertical',
    min_gap: float = 10.0,
    detection_method: str = 'pixels',  # 'pixels' or 'text'
    threshold: Union[float, str] = 'auto',  # threshold for what counts as a trough (0.0-1.0) or 'auto'
    on_no_snap: str = 'warn'
) -> "Guides":
    """
    Snap guides to nearby whitespace gaps (troughs).
    Modifies this Guides object in place.

    Args:
        axis: Direction to snap ('vertical' or 'horizontal')
        min_gap: Minimum gap size to consider as a valid trough
        detection_method: 'pixels' or 'text'. Accepted for API
            compatibility; this method does not branch on it.
        threshold: Threshold for what counts as a trough:
            - float (0.0-1.0): areas with this fraction or less of max density count as troughs
            - 'auto': automatically find threshold that creates enough troughs for guides
        on_no_snap: Action when snapping fails ('warn', 'ignore', 'raise').
            Stored on the instance as ``self.on_no_snap`` so that
            ``_handle_snap_failure`` can act on it.

    Returns:
        Self for method chaining.

    Raises:
        ValueError: If ``axis`` is neither 'vertical' nor 'horizontal'
            (only checked once a context and text elements are available).
    """
    # Record the failure policy: _handle_snap_failure reads self.on_no_snap,
    # and without this assignment the argument would have no effect.
    self.on_no_snap = on_no_snap

    if not self.context:
        logger.warning("No context available for whitespace detection")
        return self

    # Get elements for trough detection
    text_elements = self._get_text_elements()
    if not text_elements:
        logger.warning("No text elements found for whitespace detection")
        return self

    if axis == 'vertical':
        gaps = self._find_vertical_whitespace_gaps(text_elements, min_gap, threshold)
        if gaps:
            self._snap_guides_to_gaps(self.vertical.data, gaps, axis)
    elif axis == 'horizontal':
        gaps = self._find_horizontal_whitespace_gaps(text_elements, min_gap, threshold)
        if gaps:
            self._snap_guides_to_gaps(self.horizontal.data, gaps, axis)
    else:
        raise ValueError("axis must be 'vertical' or 'horizontal'")

    # Ensure all coordinates are Python floats (not numpy types)
    self.vertical.data[:] = [float(x) for x in self.vertical.data]
    self.horizontal.data[:] = [float(y) for y in self.horizontal.data]

    return self
|
1180
|
+
|
1181
|
+
def shift(
    self,
    index: int,
    offset: float,
    axis: Literal['vertical', 'horizontal'] = 'vertical'
) -> "Guides":
    """
    Move a single guide by an offset and re-sort that axis.

    Args:
        index: Zero-based position of the guide to move. Negative or
            too-large values only log a warning.
        offset: Amount to add to the guide (positive = right/down)
        axis: 'vertical' edits the vertical guides; any other value edits
            the horizontal guides

    Returns:
        Self for method chaining
    """
    on_vertical = axis == 'vertical'
    guides = self.vertical if on_vertical else self.horizontal

    if index < 0 or index >= len(guides):
        if on_vertical:
            logger.warning(f"Vertical guide index {index} out of range")
        else:
            logger.warning(f"Horizontal guide index {index} out of range")
        return self

    guides[index] += offset
    reordered = sorted(guides)
    # Assign through the attribute so the owning object sees the new order.
    if on_vertical:
        self.vertical = reordered
    else:
        self.horizontal = reordered

    return self
|
1212
|
+
|
1213
|
+
def add_vertical(self, x: float) -> "Guides":
    """Insert a vertical guide at x, keep the guides sorted, and return self."""
    guides = self.vertical
    guides.append(x)
    self.vertical = sorted(guides)
    return self
|
1218
|
+
|
1219
|
+
def add_horizontal(self, y: float) -> "Guides":
    """Insert a horizontal guide at y, keep the guides sorted, and return self."""
    guides = self.horizontal
    guides.append(y)
    self.horizontal = sorted(guides)
    return self
|
1224
|
+
|
1225
|
+
def remove_vertical(self, index: int) -> "Guides":
    """Delete the vertical guide at index; out-of-range indices are ignored."""
    guides = self.vertical
    if index < 0 or index >= len(guides):
        return self
    guides.pop(index)
    return self
|
1230
|
+
|
1231
|
+
def remove_horizontal(self, index: int) -> "Guides":
    """Delete the horizontal guide at index; out-of-range indices are ignored."""
    guides = self.horizontal
    if index < 0 or index >= len(guides):
        return self
    guides.pop(index)
    return self
|
1236
|
+
|
1237
|
+
# -------------------------------------------------------------------------
|
1238
|
+
# Operations
|
1239
|
+
# -------------------------------------------------------------------------
|
1240
|
+
|
1241
|
+
def __add__(self, other: "Guides") -> "Guides":
|
1242
|
+
"""
|
1243
|
+
Combine two guide sets.
|
1244
|
+
|
1245
|
+
Returns:
|
1246
|
+
New Guides object with combined coordinates
|
1247
|
+
"""
|
1248
|
+
# Combine and deduplicate coordinates, ensuring Python floats
|
1249
|
+
combined_verticals = sorted([float(x) for x in set(self.vertical + other.vertical)])
|
1250
|
+
combined_horizontals = sorted([float(y) for y in set(self.horizontal + other.horizontal)])
|
1251
|
+
|
1252
|
+
# Use context from self if available
|
1253
|
+
return Guides(
|
1254
|
+
verticals=combined_verticals,
|
1255
|
+
horizontals=combined_horizontals,
|
1256
|
+
context=self.context or other.context,
|
1257
|
+
bounds=self.bounds or other.bounds
|
1258
|
+
)
|
1259
|
+
|
1260
|
+
def show(self, on=None, **kwargs):
    """
    Display the guides overlaid on a page or region.

    Vertical guides are drawn in blue and horizontal guides in red. A guide
    that falls exactly on the right or bottom edge of the rendered image is
    drawn on the last pixel column/row, so outer guides are visible.

    Args:
        on: Page, Region, PIL Image, or string to display guides on.
            If None, uses self.context (the object guides were created from).
            If string 'page', uses the page from self.context.
        **kwargs: Additional arguments passed to to_image() if applicable.
            'include_highlights' is always forced to False, and 'crop' is
            forced to True for region-like targets.

    Returns:
        PIL Image with guides drawn on it.

    Raises:
        ValueError: If no target can be resolved, the string target is not
            'page', the target cannot be rendered, or rendering returns None.
    """
    # Determine what to display guides on
    target = on if on is not None else self.context

    # Handle string shortcuts
    if isinstance(target, str):
        if target != 'page':
            raise ValueError(f"Unknown string target: {target}. Only 'page' is supported.")
        if hasattr(self.context, 'page'):
            target = self.context.page
        elif hasattr(self.context, '_page'):
            target = self.context._page
        else:
            raise ValueError("Cannot resolve 'page' - context has no page attribute")

    if target is None:
        raise ValueError("No target specified and no context available for guides display")

    # Prepare kwargs for image generation
    image_kwargs = kwargs.copy()

    # Always turn off highlights to avoid visual clutter
    image_kwargs['include_highlights'] = False

    # Duck-typed classification of the target
    is_pil_image = hasattr(target, 'mode') and hasattr(target, 'size')
    is_region = hasattr(target, 'bbox') and hasattr(target, 'page')

    # If target is a region-like object, crop to just that region
    if is_region:
        image_kwargs['crop'] = True

    # Get base image
    if hasattr(target, 'to_image'):
        img = target.to_image(**image_kwargs)
    elif is_pil_image:
        # It's already a PIL Image
        img = target
    else:
        raise ValueError(f"Object {target} does not support to_image() and is not a PIL Image")

    if img is None:
        raise ValueError("Failed to generate base image")

    # Create a copy to draw on
    img = img.copy()
    draw = ImageDraw.Draw(img)

    # Determine scale factor and origin for coordinate conversion
    if hasattr(target, 'width') and hasattr(target, 'height') and not is_pil_image:
        # target is a PDF object (Page/Region) with PDF coordinates
        scale_x = img.width / target.width
        scale_y = img.height / target.height
        if is_region:
            # Guides are stored in page coordinates; make them region-relative
            origin_x, origin_y = target.x0, target.top
        else:
            origin_x, origin_y = 0, 0
    else:
        # target is already an image, no scaling needed
        scale_x = 1.0
        scale_y = 1.0
        origin_x, origin_y = 0, 0

    # Slack for float round-off when a guide sits exactly on an image edge
    edge_tolerance = 1e-6

    def _to_pixel(coord, origin, scale, extent):
        """Map a guide coordinate to a drawable pixel index, or None if outside."""
        position = (coord - origin) * scale
        if position < -edge_tolerance or position > extent + edge_tolerance:
            return None
        # Clamp so a guide at position == extent lands on the last pixel
        return int(min(max(position, 0), max(extent - 1, 0)))

    # Draw vertical guides (blue)
    for x_coord in self.vertical:
        x_pixel = _to_pixel(x_coord, origin_x, scale_x, img.width)
        if x_pixel is not None:
            draw.line([(x_pixel, 0), (x_pixel, img.height - 1)], fill=(0, 0, 255, 200), width=2)

    # Draw horizontal guides (red)
    for y_coord in self.horizontal:
        y_pixel = _to_pixel(y_coord, origin_y, scale_y, img.height)
        if y_pixel is not None:
            draw.line([(0, y_pixel), (img.width - 1, y_pixel)], fill=(255, 0, 0, 200), width=2)

    return img
|
1359
|
+
|
1360
|
+
# -------------------------------------------------------------------------
|
1361
|
+
# Utility Methods
|
1362
|
+
# -------------------------------------------------------------------------
|
1363
|
+
|
1364
|
+
def get_cells(self) -> List[Tuple[float, float, float, float]]:
    """
    Enumerate the cells formed by consecutive guides.

    Cells are produced column by column: for each pair of adjacent vertical
    guides, every pair of adjacent horizontal guides yields one cell.

    Returns:
        List of (x0, y0, x1, y1) tuples for each cell
    """
    xs = self.vertical
    ys = self.horizontal
    return [
        (xs[col], ys[row], xs[col + 1], ys[row + 1])
        for col in range(len(xs) - 1)
        for row in range(len(ys) - 1)
    ]
|
1383
|
+
|
1384
|
+
def to_dict(self) -> Dict[str, Any]:
    """
    Export the guides as pdfplumber-style table settings.

    The returned values are the guide containers themselves, not copies.

    Returns:
        Dictionary with explicit_vertical_lines and explicit_horizontal_lines
    """
    settings: Dict[str, Any] = {}
    settings['explicit_vertical_lines'] = self.vertical
    settings['explicit_horizontal_lines'] = self.horizontal
    return settings
|
1395
|
+
|
1396
|
+
def to_relative(self) -> "Guides":
    """
    Express the guides as fractions (0-1) of the stored bounds.

    Returns:
        ``self`` when the guides are already relative, otherwise a new
        Guides object with relative coordinates and bounds (0, 0, 1, 1)

    Raises:
        ValueError: If the guides are absolute and no bounds are set
    """
    if self.relative:
        return self  # Already relative

    if not self.bounds:
        raise ValueError("Cannot convert to relative without bounds")

    left, top, right, bottom = self.bounds
    span_x = right - left
    span_y = bottom - top

    fractions_x = [(coord - left) / span_x for coord in self.vertical]
    fractions_y = [(coord - top) / span_y for coord in self.horizontal]

    return Guides(
        verticals=fractions_x,
        horizontals=fractions_y,
        context=self.context,
        bounds=(0, 0, 1, 1),
        relative=True
    )
|
1423
|
+
|
1424
|
+
def to_absolute(self, bounds: Tuple[float, float, float, float]) -> "Guides":
    """
    Scale relative (0-1) guides into the given bounding box.

    Args:
        bounds: Target bounding box (x0, y0, x1, y1)

    Returns:
        ``self`` when the guides are already absolute, otherwise a new
        Guides object with absolute coordinates and the given bounds
    """
    if not self.relative:
        return self  # Already absolute

    left, top, right, bottom = bounds
    span_x = right - left
    span_y = bottom - top

    scaled_x = [left + fraction * span_x for fraction in self.vertical]
    scaled_y = [top + fraction * span_y for fraction in self.horizontal]

    return Guides(
        verticals=scaled_x,
        horizontals=scaled_y,
        context=self.context,
        bounds=bounds,
        relative=False
    )
|
1451
|
+
|
1452
|
+
@property
def n_rows(self) -> int:
    """Row count: one fewer than the number of horizontal guides, never negative."""
    guide_count = len(self.horizontal)
    return guide_count - 1 if guide_count > 0 else 0
|
1456
|
+
|
1457
|
+
@property
def n_cols(self) -> int:
    """Column count: one fewer than the number of vertical guides, never negative."""
    guide_count = len(self.vertical)
    return guide_count - 1 if guide_count > 0 else 0
|
1461
|
+
|
1462
|
+
def _handle_snap_failure(self, message: str):
|
1463
|
+
"""Handle cases where snapping cannot be performed."""
|
1464
|
+
if hasattr(self, 'on_no_snap'):
|
1465
|
+
if self.on_no_snap == 'warn':
|
1466
|
+
logger.warning(message)
|
1467
|
+
elif self.on_no_snap == 'raise':
|
1468
|
+
raise ValueError(message)
|
1469
|
+
# 'ignore' case: do nothing
|
1470
|
+
else:
|
1471
|
+
logger.warning(message) # Default behavior
|
1472
|
+
|
1473
|
+
def _find_vertical_whitespace_gaps(self, text_elements, min_gap: float, threshold: Union[float, str] = 'auto') -> List[Tuple[float, float]]:
|
1474
|
+
"""
|
1475
|
+
Find vertical whitespace gaps using bbox-based density analysis.
|
1476
|
+
Returns list of (start, end) tuples representing trough ranges.
|
1477
|
+
"""
|
1478
|
+
if not self.bounds:
|
1479
|
+
return []
|
1480
|
+
|
1481
|
+
x0, _, x1, _ = self.bounds
|
1482
|
+
width_pixels = int(x1 - x0)
|
1483
|
+
|
1484
|
+
if width_pixels <= 0:
|
1485
|
+
return []
|
1486
|
+
|
1487
|
+
# Create density histogram: count bbox overlaps per x-coordinate
|
1488
|
+
density = np.zeros(width_pixels)
|
1489
|
+
|
1490
|
+
for element in text_elements:
|
1491
|
+
if not hasattr(element, 'x0') or not hasattr(element, 'x1'):
|
1492
|
+
continue
|
1493
|
+
|
1494
|
+
# Clip coordinates to bounds
|
1495
|
+
elem_x0 = max(x0, element.x0) - x0
|
1496
|
+
elem_x1 = min(x1, element.x1) - x0
|
1497
|
+
|
1498
|
+
if elem_x1 > elem_x0:
|
1499
|
+
start_px = int(elem_x0)
|
1500
|
+
end_px = int(elem_x1)
|
1501
|
+
density[start_px:end_px] += 1
|
1502
|
+
|
1503
|
+
if density.max() == 0:
|
1504
|
+
return []
|
1505
|
+
|
1506
|
+
# Determine the threshold value
|
1507
|
+
if threshold == 'auto':
|
1508
|
+
# Auto mode: try different thresholds with step 0.05 until we have enough troughs
|
1509
|
+
guides_needing_troughs = len([g for i, g in enumerate(self.vertical) if 0 < i < len(self.vertical) - 1])
|
1510
|
+
if guides_needing_troughs == 0:
|
1511
|
+
threshold_val = 0.5 # Default when no guides need placement
|
1512
|
+
else:
|
1513
|
+
threshold_val = None
|
1514
|
+
for test_threshold in np.arange(0.1, 1.0, 0.05):
|
1515
|
+
test_gaps = self._find_gaps_with_threshold(density, test_threshold, min_gap, x0)
|
1516
|
+
if len(test_gaps) >= guides_needing_troughs:
|
1517
|
+
threshold_val = test_threshold
|
1518
|
+
logger.debug(f"Auto threshold found: {test_threshold:.2f} (found {len(test_gaps)} troughs for {guides_needing_troughs} guides)")
|
1519
|
+
break
|
1520
|
+
|
1521
|
+
if threshold_val is None:
|
1522
|
+
threshold_val = 0.8 # Fallback to permissive threshold
|
1523
|
+
logger.debug(f"Auto threshold fallback to {threshold_val}")
|
1524
|
+
else:
|
1525
|
+
# Fixed threshold mode
|
1526
|
+
if not isinstance(threshold, (int, float)) or not (0.0 <= threshold <= 1.0):
|
1527
|
+
raise ValueError("threshold must be a number between 0.0 and 1.0, or 'auto'")
|
1528
|
+
threshold_val = float(threshold)
|
1529
|
+
|
1530
|
+
return self._find_gaps_with_threshold(density, threshold_val, min_gap, x0)
|
1531
|
+
|
1532
|
+
def _find_gaps_with_threshold(self, density, threshold_val, min_gap, x0):
    """Locate low-density runs in a 1-D density profile.

    The profile is Gaussian-smoothed (sigma=1.0). Every maximal run of
    consecutive samples whose smoothed value is at or below
    ``threshold_val`` times the raw (unsmoothed) peak is a candidate gap.

    Args:
        density: 1-D numpy array; index ``i`` maps to coordinate ``x0 + i``.
        threshold_val: Fraction of ``density.max()`` used as the cutoff.
        min_gap: Minimum run length for a run to be reported.
        x0: Offset added to array indices to produce output coordinates.

    Returns:
        List of ``(start, end)`` tuples in ascending order, ``end`` being one
        past the last low-density index (plus ``x0``).
    """
    from scipy.ndimage import gaussian_filter1d

    cutoff = threshold_val * density.max()
    smoothed = gaussian_filter1d(density.astype(float), sigma=1.0)
    is_low = smoothed <= cutoff

    # Pad with False on both ends so every run has a rising and a falling edge.
    padded = np.concatenate(([False], is_low, [False])).astype(np.int8)
    steps = np.diff(padded)
    run_starts = np.flatnonzero(steps == 1)   # index of first sample in each run
    run_stops = np.flatnonzero(steps == -1)   # index one past the last sample

    found = []
    for first_idx, stop_idx in zip(run_starts, run_stops):
        lo = x0 + first_idx
        hi = x0 + stop_idx
        if hi - lo >= min_gap:
            found.append((lo, hi))
    return found
|
1568
|
+
|
1569
|
+
def _find_horizontal_whitespace_gaps(self, text_elements, min_gap: float, threshold: Union[float, str] = 'auto') -> List[Tuple[float, float]]:
    """
    Find horizontal whitespace gaps using bbox-based density analysis.

    A histogram with ``int(y1 - y0)`` entries is built over the vertical
    extent of ``self.bounds``. Each element's ``top``/``bottom`` is clipped
    to the bounds, truncated to integer offsets, and counted once per
    covered offset. Gap extraction is delegated to
    ``_find_gaps_with_threshold_horizontal``.

    Args:
        text_elements: Iterable of objects; those lacking ``top`` or
            ``bottom`` attributes are ignored.
        min_gap: Minimum gap size passed through to the gap finder.
        threshold: ``'auto'`` or a number in ``[0.0, 1.0]``. In auto mode,
            thresholds starting at 0.1 in steps of 0.05 (below 1.0) are
            tried until at least as many gaps are found as there are
            interior guides in ``self.horizontal``; if none qualifies, 0.8
            is used. With no interior guides, 0.5 is used.

    Returns:
        List of (start, end) tuples representing trough ranges; empty when
        there are no bounds, the bounds have no height, or no element
        contributes to the histogram.

    Raises:
        ValueError: If ``threshold`` is neither ``'auto'`` nor a number in
            ``[0.0, 1.0]`` (only checked once the histogram is non-empty).
    """
    if not self.bounds:
        return []

    _, y0, _, y1 = self.bounds
    height_pixels = int(y1 - y0)
    if height_pixels <= 0:
        return []

    # Create density histogram: count bbox overlaps per y-coordinate
    density = np.zeros(height_pixels)
    for element in text_elements:
        if not hasattr(element, 'top') or not hasattr(element, 'bottom'):
            continue

        # Clip coordinates to bounds and shift into histogram space
        elem_top = max(y0, element.top) - y0
        elem_bottom = min(y1, element.bottom) - y0
        if elem_bottom > elem_top:
            density[int(elem_top):int(elem_bottom)] += 1

    if density.max() == 0:
        return []

    if threshold == 'auto':
        # Only interior guides (everything except first and last) need a trough.
        guides_needing_troughs = max(len(self.horizontal) - 2, 0)
        if guides_needing_troughs == 0:
            threshold_val = 0.5  # Default when no guides need placement
        else:
            for test_threshold in np.arange(0.1, 1.0, 0.05):
                test_gaps = self._find_gaps_with_threshold_horizontal(density, test_threshold, min_gap, y0)
                if len(test_gaps) >= guides_needing_troughs:
                    logger.debug(f"Auto threshold found: {test_threshold:.2f} (found {len(test_gaps)} troughs for {guides_needing_troughs} guides)")
                    # The gaps for the accepted threshold are already in hand;
                    # return them instead of recomputing the same result.
                    return test_gaps

            threshold_val = 0.8  # Fallback to permissive threshold
            logger.debug(f"Auto threshold fallback to {threshold_val}")
    else:
        # Fixed threshold mode
        if not isinstance(threshold, (int, float)) or not (0.0 <= threshold <= 1.0):
            raise ValueError("threshold must be a number between 0.0 and 1.0, or 'auto'")
        threshold_val = float(threshold)

    return self._find_gaps_with_threshold_horizontal(density, threshold_val, min_gap, y0)
|
1626
|
+
|
1627
|
+
def _find_gaps_with_threshold_horizontal(self, density, threshold_val, min_gap, y0):
    """Helper method to find horizontal gaps given a specific threshold value.

    The 1-D gap search does not depend on the axis, so this forwards to
    ``_find_gaps_with_threshold`` with ``y0`` as the coordinate offset
    instead of keeping a second copy of the same algorithm.

    Args:
        density: 1-D numpy density profile along the y axis.
        threshold_val: Fraction of the peak density used as the cutoff.
        min_gap: Minimum gap length to report.
        y0: Offset added to array indices to produce output coordinates.

    Returns:
        Whatever ``_find_gaps_with_threshold`` returns: a list of
        ``(start, end)`` tuples.
    """
    return self._find_gaps_with_threshold(density, threshold_val, min_gap, y0)
|
1663
|
+
|
1664
|
+
def _find_vertical_element_gaps(self, text_elements, min_gap: float) -> List[Tuple[float, float]]:
    """
    Find vertical whitespace gaps using text element spacing analysis.

    Elements without ``x0``/``x1`` are ignored, as are elements that carry
    ``top``/``bottom`` and lie entirely above or below ``self.bounds``.
    The same filtered set is used both to collect candidate edges and to
    decide whether a candidate gap is occupied, so an element outside the
    bounds can no longer block a gap between in-bounds elements.

    Args:
        text_elements: Iterable of objects exposing ``x0``/``x1`` and
            optionally ``top``/``bottom``.
        min_gap: Minimum width between two consecutive edges for the span
            to be considered.

    Returns:
        List of (start, end) tuples representing trough ranges, in
        ascending order.
    """
    if not self.bounds or not text_elements:
        return []

    _, bounds_top, _, bounds_bottom = self.bounds

    # Horizontal extents of every element relevant to these bounds.
    spans = []
    for element in text_elements:
        if not hasattr(element, 'x0') or not hasattr(element, 'x1'):
            continue
        # Only include elements that overlap vertically with our bounds
        if hasattr(element, 'top') and hasattr(element, 'bottom'):
            if element.bottom < bounds_top or element.top > bounds_bottom:
                continue
        spans.append((element.x0, element.x1))

    if not spans:
        return []

    # Candidate gaps are the intervals between consecutive distinct edges.
    edges = sorted({edge for span in spans for edge in span})

    trough_ranges = []
    for gap_start, gap_end in zip(edges, edges[1:]):
        if gap_end - gap_start >= min_gap:
            # A gap is real only if no relevant element strictly overlaps it.
            occupied = any(left < gap_end and right > gap_start for left, right in spans)
            if not occupied:
                trough_ranges.append((gap_start, gap_end))

    return trough_ranges
|
1710
|
+
|
1711
|
+
def _find_horizontal_element_gaps(self, text_elements, min_gap: float) -> List[Tuple[float, float]]:
    """
    Find horizontal whitespace gaps using text element spacing analysis.

    Elements without ``top``/``bottom`` are ignored, as are elements that
    carry ``x0``/``x1`` and lie entirely left or right of ``self.bounds``.
    The same filtered set is used both to collect candidate edges and to
    decide whether a candidate gap is occupied, so an element outside the
    bounds can no longer block a gap between in-bounds elements.

    Args:
        text_elements: Iterable of objects exposing ``top``/``bottom`` and
            optionally ``x0``/``x1``.
        min_gap: Minimum height between two consecutive edges for the span
            to be considered.

    Returns:
        List of (start, end) tuples representing trough ranges, in
        ascending order.
    """
    if not self.bounds or not text_elements:
        return []

    bounds_left, _, bounds_right, _ = self.bounds

    # Vertical extents of every element relevant to these bounds.
    spans = []
    for element in text_elements:
        if not hasattr(element, 'top') or not hasattr(element, 'bottom'):
            continue
        # Only include elements that overlap horizontally with our bounds
        if hasattr(element, 'x0') and hasattr(element, 'x1'):
            if element.x1 < bounds_left or element.x0 > bounds_right:
                continue
        spans.append((element.top, element.bottom))

    if not spans:
        return []

    # Candidate gaps are the intervals between consecutive distinct edges.
    edges = sorted({edge for span in spans for edge in span})

    trough_ranges = []
    for gap_start, gap_end in zip(edges, edges[1:]):
        if gap_end - gap_start >= min_gap:
            # A gap is real only if no relevant element strictly overlaps it.
            occupied = any(upper < gap_end and lower > gap_start for upper, lower in spans)
            if not occupied:
                trough_ranges.append((gap_start, gap_end))

    return trough_ranges
|
1757
|
+
|
1758
|
+
def _optimal_guide_assignment(self, guides: List[float], trough_ranges: List[Tuple[float, float]]) -> Dict[int, int]:
    """
    Decide which guides should move, and to which trough.

    Rules implemented here:
    - A guide lying inside any trough (bounds inclusive) is left alone, and
      the first trough containing it is marked as taken.
    - Every other guide competes for the remaining troughs; pairs are
      matched greedily in order of increasing distance between the guide
      and the trough center, each guide and each trough used at most once.

    Args:
        guides: Guide positions.
        trough_ranges: ``(start, end)`` ranges of the troughs.

    Returns:
        Mapping of guide index -> trough index for the guides that should
        move. Guides that stay put, or for which no trough is left, are
        absent from the mapping.
    """
    if not guides or not trough_ranges:
        return {}

    assignments = {}

    # Single scan: note guides already sitting in a trough and which trough
    # (the first match) each of them occupies.
    guides_in_troughs = set()
    occupied_troughs = set()
    for i, guide_pos in enumerate(guides):
        for j, (trough_start, trough_end) in enumerate(trough_ranges):
            if trough_start <= guide_pos <= trough_end:
                guides_in_troughs.add(i)
                occupied_troughs.add(j)
                logger.debug(f"Guide {i} (pos {guide_pos:.1f}) is already in trough ({trough_start:.1f}-{trough_end:.1f}), keeping in place")
                break

    # Guides outside every trough are the ones to relocate.
    guides_to_move = [i for i in range(len(guides)) if i not in guides_in_troughs]
    for i in guides_to_move:
        guide_pos = guides[i]
        logger.debug(f"Guide {i} (pos {guide_pos:.1f}) is NOT in any trough, needs reassignment")

    # Troughs not claimed by a stationary guide are up for grabs.
    available_troughs = [j for j in range(len(trough_ranges)) if j not in occupied_troughs]
    for j in available_troughs:
        trough_start, trough_end = trough_ranges[j]
        logger.debug(f"Trough {j} ({trough_start:.1f}-{trough_end:.1f}) is available")

    if guides_to_move and available_troughs:
        # Every (distance, guide, trough) candidate, closest first.
        distances = sorted(
            (
                abs(guides[guide_idx] - (trough_ranges[trough_idx][0] + trough_ranges[trough_idx][1]) / 2),
                guide_idx,
                trough_idx,
            )
            for guide_idx in guides_to_move
            for trough_idx in available_troughs
        )

        used_troughs = set()
        for distance, guide_idx, trough_idx in distances:
            if guide_idx in assignments or trough_idx in used_troughs:
                continue
            assignments[guide_idx] = trough_idx
            used_troughs.add(trough_idx)
            logger.debug(f"Assigned guide {guide_idx} (pos {guides[guide_idx]:.1f}) to trough {trough_idx} (distance: {distance:.1f})")

    logger.debug(f"Final assignments: {assignments}")
    return assignments
|
1826
|
+
|
1827
|
+
def _snap_guides_to_gaps(self, guides: List[float], gaps: List[Tuple[float, float]], axis: str):
    """
    Move guides onto trough centers, mutating ``guides`` in place.

    Which guide goes to which trough is decided by
    ``_optimal_guide_assignment``; each guide it returns is replaced by the
    midpoint of its assigned trough. Nothing happens when either list is
    empty.

    Args:
        guides: Guide positions; updated in place.
        gaps: ``(start, end)`` trough ranges.
        axis: Axis label, used only in log messages.
    """
    if not (guides and gaps):
        return

    logger.debug(f"Snapping {len(guides)} {axis} guides to {len(gaps)} trough ranges")
    for i, (start, end) in enumerate(gaps):
        center = (start + end) / 2
        logger.debug(f"  Trough {i}: {start:.1f} to {end:.1f} (center: {center:.1f})")

    plan = self._optimal_guide_assignment(guides, gaps)

    for guide_idx, trough_idx in plan.items():
        trough_start, trough_end = gaps[trough_idx]
        old_pos = guides[guide_idx]
        new_pos = (trough_start + trough_end) / 2  # trough midpoint
        guides[guide_idx] = new_pos
        logger.info(f"Snapped {axis} guide from {old_pos:.1f} to {new_pos:.1f}")
|
1850
|
+
|
1851
|
+
def build_grid(
    self,
    target: Optional[Union["Page", "Region"]] = None,
    source: str = "guides",
    cell_padding: float = 0.5,
    include_outer_boundaries: bool = False
) -> Dict[str, int]:
    """
    Create table structure (table, rows, columns, cells) from guide coordinates.

    Regions are created through ``page.create_region`` and registered with
    the page's element manager under ``element_type="regions"``. Nothing is
    created unless there are at least two distinct row boundaries and two
    distinct column boundaries.

    Args:
        target: Page or Region to create regions on (uses self.context if None)
        source: Source label for created regions (for identification)
        cell_padding: Amount each cell is shrunk on every side; cells that
            end up with no width or height are skipped
        include_outer_boundaries: Whether to add boundaries at the target's
            edges when the first/last guide does not already reach them

    Returns:
        Dictionary with counts of created regions, keyed by
        'table', 'rows', 'columns', 'cells'.

    Raises:
        ValueError: If neither ``target`` nor ``self.context`` is usable, or
            the object looks like neither a Region nor a Page.
    """
    target_obj = target or self.context
    if not target_obj:
        raise ValueError("No target object available. Provide target parameter or context.")

    # Duck-typing: an object exposing x0/top is handled as a Region,
    # one exposing _element_mgr or width as a Page.
    if hasattr(target_obj, 'x0') and hasattr(target_obj, 'top'):
        page = target_obj._page
        origin_x, origin_y = target_obj.x0, target_obj.top
        context_width, context_height = target_obj.width, target_obj.height
    elif hasattr(target_obj, '_element_mgr') or hasattr(target_obj, 'width'):
        page = target_obj
        origin_x, origin_y = 0.0, 0.0
        context_width, context_height = page.width, page.height
    else:
        raise ValueError(f"Target object {target_obj} is not a Page or Region")

    element_manager = page._element_mgr

    row_boundaries = list(self.horizontal)
    col_boundaries = list(self.vertical)

    if include_outer_boundaries:
        # Same treatment for both axes: prepend the near edge / append the
        # far edge when the existing first/last boundary falls short of it.
        axes = (
            (row_boundaries, origin_y, origin_y + context_height),
            (col_boundaries, origin_x, origin_x + context_width),
        )
        for edges, near_edge, far_edge in axes:
            if not edges or edges[0] > near_edge:
                edges.insert(0, near_edge)
            if not edges or edges[-1] < far_edge:
                edges.append(far_edge)

    # Deduplicate and order the boundaries.
    row_boundaries = sorted(set(row_boundaries))
    col_boundaries = sorted(set(col_boundaries))

    logger.debug(f"Building grid with {len(row_boundaries)} row and {len(col_boundaries)} col boundaries")

    counts = {'table': 0, 'rows': 0, 'columns': 0, 'cells': 0}

    def _tag_and_register(region, kind, extra_metadata):
        # Label a freshly created region and hand it to the element manager.
        region.source = source
        region.region_type = kind
        region.normalized_type = kind
        region.metadata.update(extra_metadata)
        element_manager.add_element(region, element_type="regions")

    row_count = len(row_boundaries) - 1
    col_count = len(col_boundaries) - 1

    if row_count >= 1 and col_count >= 1:
        left_edge, right_edge = col_boundaries[0], col_boundaries[-1]
        top_edge, bottom_edge = row_boundaries[0], row_boundaries[-1]

        # Overall table region
        _tag_and_register(
            page.create_region(left_edge, top_edge, right_edge, bottom_edge),
            "table",
            {
                "source_guides": True,
                "num_rows": row_count,
                "num_cols": col_count,
                "boundaries": {"rows": row_boundaries, "cols": col_boundaries}
            },
        )
        counts['table'] = 1

        # One full-width region per row
        for i in range(row_count):
            _tag_and_register(
                page.create_region(left_edge, row_boundaries[i], right_edge, row_boundaries[i + 1]),
                "table_row",
                {"row_index": i, "source_guides": True},
            )
            counts['rows'] += 1

        # One full-height region per column
        for j in range(col_count):
            _tag_and_register(
                page.create_region(col_boundaries[j], top_edge, col_boundaries[j + 1], bottom_edge),
                "table_column",
                {"col_index": j, "source_guides": True},
            )
            counts['columns'] += 1

        # Padded cell regions, row-major
        for i in range(row_count):
            for j in range(col_count):
                cell_x0 = col_boundaries[j] + cell_padding
                cell_top = row_boundaries[i] + cell_padding
                cell_x1 = col_boundaries[j + 1] - cell_padding
                cell_bottom = row_boundaries[i + 1] - cell_padding

                # Padding consumed the whole cell: nothing to create.
                if cell_x1 <= cell_x0 or cell_bottom <= cell_top:
                    continue

                _tag_and_register(
                    page.create_region(cell_x0, cell_top, cell_x1, cell_bottom),
                    "table_cell",
                    {
                        "row_index": i,
                        "col_index": j,
                        "source_guides": True,
                        "original_boundaries": {
                            "left": col_boundaries[j],
                            "top": row_boundaries[i],
                            "right": col_boundaries[j + 1],
                            "bottom": row_boundaries[i + 1]
                        }
                    },
                )
                counts['cells'] += 1

    logger.info(f"Created {counts['table']} table, {counts['rows']} rows, "
                f"{counts['columns']} columns, and {counts['cells']} cells from guides")

    return counts
|
2002
|
+
|
2003
|
+
def __repr__(self) -> str:
    """Summarize the guides as counts of verticals, horizontals and cells.

    The cell count is the length of whatever ``self.get_cells()`` returns.
    """
    vertical_count = len(self.vertical)
    horizontal_count = len(self.horizontal)
    cell_count = len(self.get_cells())
    return f"Guides(verticals={vertical_count}, horizontals={horizontal_count}, cells={cell_count})"
|
2008
|
+
|
2009
|
+
def _get_text_elements(self):
    """Fetch text elements from ``self.context`` on a best-effort basis.

    Calls ``context.find_all('text', apply_exclusions=False)`` and unwraps
    the result's ``elements`` attribute when it has one.

    Returns:
        The found elements, or an empty list when there is no context, the
        context has no ``find_all``, or the lookup raises (the latter two
        cases are logged as warnings).
    """
    ctx = self.context
    if not ctx:
        return []

    if not hasattr(ctx, 'find_all'):
        logger.warning("Context does not support text element search")
        return []

    try:
        found = ctx.find_all('text', apply_exclusions=False)
        if hasattr(found, 'elements'):
            return found.elements
        return found
    except Exception as e:
        # Deliberately broad: a failed lookup degrades to "no text elements".
        logger.warning(f"Error getting text elements: {e}")
        return []
|
2025
|
+
|
2026
|
+
# -------------------------------------------------------------------------
|
2027
|
+
# Instance methods for fluent chaining (avoid name conflicts with class methods)
|
2028
|
+
# -------------------------------------------------------------------------
|
2029
|
+
|
2030
|
+
def add_content(
    self,
    axis: Literal['vertical', 'horizontal'] = 'vertical',
    markers: Union[str, List[str], "ElementCollection", None] = None,
    obj: Optional[Union["Page", "Region"]] = None,
    align: Literal['left', 'right', 'center', 'between'] = 'left',
    outer: bool = True,
    tolerance: float = 5
) -> "Guides":
    """
    Instance method: Add guides from content, allowing chaining.
    This allows: Guides.new(page).add_content(axis='vertical', markers=[...])

    The new guides are produced by ``Guides.from_content`` and merged into
    the matching axis of this object. The merged coordinates are
    de-duplicated and assigned in ascending order, so the result does not
    depend on set iteration order.

    Args:
        axis: Which axis to create guides for; any value other than
            'vertical' updates the horizontal guides
        markers: Content to search for. Can be:
            - str: single selector or literal text
            - List[str]: list of selectors or literal text strings
            - ElementCollection: collection of elements to extract text from
            - None: no markers
        obj: Page or Region to search (uses self.context if None)
        align: How to align guides relative to found elements
        outer: Whether to add outer boundary guides
        tolerance: Tolerance for snapping to element edges

    Returns:
        Self for method chaining

    Raises:
        ValueError: If no object is given and no context is stored.
    """
    # Use provided object or fall back to stored context
    target_obj = obj or self.context
    if target_obj is None:
        raise ValueError("No object provided and no context available")

    # Create new guides using the class method
    new_guides = Guides.from_content(
        obj=target_obj,
        axis=axis,
        markers=markers,
        align=align,
        outer=outer,
        tolerance=tolerance
    )

    # Merge into this object: unique coordinates, ascending.
    if axis == 'vertical':
        self.vertical = sorted(set(self.vertical + new_guides.vertical))
    else:
        self.horizontal = sorted(set(self.horizontal + new_guides.horizontal))

    return self
|
2080
|
+
|
2081
|
+
def add_lines(
    self,
    axis: Literal['vertical', 'horizontal', 'both'] = 'both',
    obj: Optional[Union["Page", "Region"]] = None,
    threshold: Union[float, str] = 'auto',
    source_label: Optional[str] = None,
    max_lines_h: Optional[int] = None,
    max_lines_v: Optional[int] = None,
    outer: bool = False,
    detection_method: str = 'vector',
    resolution: int = 192,
    **detect_kwargs
) -> "Guides":
    """
    Instance method: Add guides from lines, allowing chaining.
    This allows: Guides.new(page).add_lines(axis='horizontal')

    The new guides are produced by ``Guides.from_lines`` and merged into
    the requested axis (or both). The merged coordinates are de-duplicated
    and assigned in ascending order, so the result does not depend on set
    iteration order.

    Args:
        axis: Which axis to detect lines for
        obj: Page or Region to search (uses self.context if None)
        threshold: Line detection threshold ('auto' or float 0.0-1.0)
        source_label: Filter lines by source label (vector) or label for detected lines (pixels)
        max_lines_h: Maximum horizontal lines to use
        max_lines_v: Maximum vertical lines to use
        outer: Whether to add outer boundary guides
        detection_method: 'vector' (use existing LineElements) or 'pixels' (detect from image)
        resolution: DPI for pixel-based detection (default: 192)
        **detect_kwargs: Additional parameters for pixel detection (see from_lines)

    Returns:
        Self for method chaining

    Raises:
        ValueError: If no object is given and no context is stored.
    """
    # Use provided object or fall back to stored context
    target_obj = obj or self.context
    if target_obj is None:
        raise ValueError("No object provided and no context available")

    # Create new guides using the class method
    new_guides = Guides.from_lines(
        obj=target_obj,
        axis=axis,
        threshold=threshold,
        source_label=source_label,
        max_lines_h=max_lines_h,
        max_lines_v=max_lines_v,
        outer=outer,
        detection_method=detection_method,
        resolution=resolution,
        **detect_kwargs
    )

    # Merge into this object: unique coordinates, ascending.
    if axis in ('vertical', 'both'):
        self.vertical = sorted(set(self.vertical + new_guides.vertical))
    if axis in ('horizontal', 'both'):
        self.horizontal = sorted(set(self.horizontal + new_guides.horizontal))

    return self
|
2139
|
+
|
2140
|
+
def add_whitespace(
    self,
    axis: Literal['vertical', 'horizontal', 'both'] = 'both',
    obj: Optional[Union["Page", "Region"]] = None,
    min_gap: float = 10
) -> "Guides":
    """
    Instance method: Add guides from whitespace, allowing chaining.
    This allows: Guides.new(page).add_whitespace(axis='both')

    The new guides are produced by ``Guides.from_whitespace`` and merged
    into the requested axis (or both). The merged coordinates are
    de-duplicated and assigned in ascending order, so the result does not
    depend on set iteration order.

    Args:
        axis: Which axis to create guides for
        obj: Page or Region to search (uses self.context if None)
        min_gap: Minimum gap size to consider

    Returns:
        Self for method chaining

    Raises:
        ValueError: If no object is given and no context is stored.
    """
    # Use provided object or fall back to stored context
    target_obj = obj or self.context
    if target_obj is None:
        raise ValueError("No object provided and no context available")

    # Create new guides using the class method
    new_guides = Guides.from_whitespace(
        obj=target_obj,
        axis=axis,
        min_gap=min_gap
    )

    # Merge into this object: unique coordinates, ascending.
    if axis in ('vertical', 'both'):
        self.vertical = sorted(set(self.vertical + new_guides.vertical))
    if axis in ('horizontal', 'both'):
        self.horizontal = sorted(set(self.horizontal + new_guides.horizontal))

    return self
|