natural-pdf 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/__init__.py +2 -2
- natural_pdf/analyzers/guides.py +751 -607
- natural_pdf/analyzers/layout/base.py +53 -6
- natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
- natural_pdf/analyzers/layout/layout_manager.py +18 -14
- natural_pdf/analyzers/layout/layout_options.py +1 -0
- natural_pdf/analyzers/layout/paddle.py +102 -64
- natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
- natural_pdf/analyzers/layout/yolo.py +2 -6
- natural_pdf/analyzers/shape_detection_mixin.py +15 -6
- natural_pdf/classification/manager.py +92 -77
- natural_pdf/classification/mixin.py +49 -5
- natural_pdf/classification/results.py +1 -1
- natural_pdf/cli.py +7 -3
- natural_pdf/collections/pdf_collection.py +96 -101
- natural_pdf/core/element_manager.py +131 -45
- natural_pdf/core/highlighting_service.py +5 -6
- natural_pdf/core/page.py +120 -23
- natural_pdf/core/pdf.py +477 -75
- natural_pdf/describe/__init__.py +18 -12
- natural_pdf/describe/base.py +179 -172
- natural_pdf/describe/elements.py +155 -155
- natural_pdf/describe/mixin.py +27 -19
- natural_pdf/describe/summary.py +44 -55
- natural_pdf/elements/base.py +134 -18
- natural_pdf/elements/collections.py +90 -18
- natural_pdf/elements/image.py +2 -1
- natural_pdf/elements/line.py +0 -31
- natural_pdf/elements/rect.py +0 -14
- natural_pdf/elements/region.py +222 -108
- natural_pdf/elements/text.py +18 -12
- natural_pdf/exporters/__init__.py +4 -1
- natural_pdf/exporters/original_pdf.py +12 -4
- natural_pdf/extraction/mixin.py +66 -10
- natural_pdf/extraction/result.py +1 -1
- natural_pdf/flows/flow.py +63 -4
- natural_pdf/flows/region.py +4 -4
- natural_pdf/ocr/engine.py +83 -2
- natural_pdf/ocr/engine_paddle.py +5 -5
- natural_pdf/ocr/ocr_factory.py +2 -1
- natural_pdf/ocr/ocr_manager.py +24 -13
- natural_pdf/ocr/ocr_options.py +3 -10
- natural_pdf/qa/document_qa.py +21 -8
- natural_pdf/qa/qa_result.py +3 -7
- natural_pdf/search/__init__.py +3 -2
- natural_pdf/search/lancedb_search_service.py +5 -6
- natural_pdf/search/numpy_search_service.py +5 -2
- natural_pdf/selectors/parser.py +51 -6
- natural_pdf/tables/__init__.py +2 -2
- natural_pdf/tables/result.py +7 -6
- natural_pdf/utils/bidi_mirror.py +2 -1
- natural_pdf/utils/reading_order.py +3 -2
- natural_pdf/utils/visualization.py +3 -3
- natural_pdf/widgets/viewer.py +0 -1
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/METADATA +1 -1
- natural_pdf-0.1.35.dist-info/RECORD +121 -0
- optimization/memory_comparison.py +73 -58
- optimization/pdf_analyzer.py +141 -96
- optimization/performance_analysis.py +111 -110
- optimization/test_cleanup_methods.py +47 -36
- optimization/test_memory_fix.py +40 -39
- tools/bad_pdf_eval/__init__.py +0 -1
- tools/bad_pdf_eval/analyser.py +35 -18
- tools/bad_pdf_eval/collate_summaries.py +22 -18
- tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
- tools/bad_pdf_eval/eval_suite.py +21 -9
- tools/bad_pdf_eval/evaluate_quality.py +198 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
- tools/bad_pdf_eval/llm_enrich.py +71 -39
- tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
- tools/bad_pdf_eval/reporter.py +1 -1
- tools/bad_pdf_eval/utils.py +7 -4
- natural_pdf-0.1.33.dist-info/RECORD +0 -118
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/top_level.txt +0 -0
natural_pdf/analyzers/guides.py
CHANGED
@@ -2,59 +2,58 @@
|
|
2
2
|
|
3
3
|
import json
|
4
4
|
import logging
|
5
|
-
from typing import Any, Dict, List, Literal, Optional, Tuple, Union, TYPE_CHECKING
|
6
5
|
from collections import UserList
|
6
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
|
7
7
|
|
8
8
|
import numpy as np
|
9
9
|
from PIL import Image, ImageDraw
|
10
10
|
|
11
11
|
if TYPE_CHECKING:
|
12
12
|
from natural_pdf.core.page import Page
|
13
|
-
from natural_pdf.elements.region import Region
|
14
13
|
from natural_pdf.elements.base import Element
|
15
14
|
from natural_pdf.elements.collections import ElementCollection
|
15
|
+
from natural_pdf.elements.region import Region
|
16
16
|
|
17
17
|
logger = logging.getLogger(__name__)
|
18
18
|
|
19
19
|
|
20
20
|
def _normalize_markers(
|
21
|
-
markers: Union[str, List[str], "ElementCollection", None],
|
22
|
-
obj: Union["Page", "Region"]
|
21
|
+
markers: Union[str, List[str], "ElementCollection", None], obj: Union["Page", "Region"]
|
23
22
|
) -> List[str]:
|
24
23
|
"""
|
25
24
|
Normalize markers parameter to a list of text strings for guide creation.
|
26
|
-
|
25
|
+
|
27
26
|
Args:
|
28
27
|
markers: Can be:
|
29
28
|
- str: single selector or text string
|
30
|
-
- List[str]: list of selectors or text strings
|
29
|
+
- List[str]: list of selectors or text strings
|
31
30
|
- ElementCollection: collection of elements to extract text from
|
32
31
|
- None: empty list
|
33
32
|
obj: Object to search for elements if markers contains selectors
|
34
|
-
|
33
|
+
|
35
34
|
Returns:
|
36
35
|
List of text strings to search for
|
37
36
|
"""
|
38
37
|
if markers is None:
|
39
38
|
return []
|
40
|
-
|
39
|
+
|
41
40
|
if isinstance(markers, str):
|
42
41
|
# Single selector or text string
|
43
|
-
if markers.startswith((
|
42
|
+
if markers.startswith(("text", "region", "line", "rect", "blob", "image")):
|
44
43
|
# It's a CSS selector, find elements and extract text
|
45
|
-
if hasattr(obj,
|
44
|
+
if hasattr(obj, "find_all"):
|
46
45
|
elements = obj.find_all(markers)
|
47
|
-
return [elem.text if hasattr(elem,
|
46
|
+
return [elem.text if hasattr(elem, "text") else str(elem) for elem in elements]
|
48
47
|
else:
|
49
48
|
logger.warning(f"Object {obj} doesn't support find_all for selector '{markers}'")
|
50
49
|
return [markers] # Treat as literal text
|
51
50
|
else:
|
52
51
|
# Treat as literal text
|
53
52
|
return [markers]
|
54
|
-
|
55
|
-
elif hasattr(markers,
|
53
|
+
|
54
|
+
elif hasattr(markers, "__iter__") and not isinstance(markers, str):
|
56
55
|
# It might be an ElementCollection or list
|
57
|
-
if hasattr(markers,
|
56
|
+
if hasattr(markers, "extract_each_text"):
|
58
57
|
# It's an ElementCollection
|
59
58
|
try:
|
60
59
|
return markers.extract_each_text()
|
@@ -63,9 +62,9 @@ def _normalize_markers(
|
|
63
62
|
# Fallback: try to get text from individual elements
|
64
63
|
texts = []
|
65
64
|
for elem in markers:
|
66
|
-
if hasattr(elem,
|
65
|
+
if hasattr(elem, "text"):
|
67
66
|
texts.append(elem.text)
|
68
|
-
elif hasattr(elem,
|
67
|
+
elif hasattr(elem, "extract_text"):
|
69
68
|
texts.append(elem.extract_text())
|
70
69
|
else:
|
71
70
|
texts.append(str(elem))
|
@@ -75,26 +74,31 @@ def _normalize_markers(
|
|
75
74
|
result = []
|
76
75
|
for marker in markers:
|
77
76
|
if isinstance(marker, str):
|
78
|
-
if marker.startswith((
|
77
|
+
if marker.startswith(("text", "region", "line", "rect", "blob", "image")):
|
79
78
|
# It's a selector
|
80
|
-
if hasattr(obj,
|
79
|
+
if hasattr(obj, "find_all"):
|
81
80
|
elements = obj.find_all(marker)
|
82
|
-
result.extend(
|
81
|
+
result.extend(
|
82
|
+
[
|
83
|
+
elem.text if hasattr(elem, "text") else str(elem)
|
84
|
+
for elem in elements
|
85
|
+
]
|
86
|
+
)
|
83
87
|
else:
|
84
88
|
result.append(marker) # Treat as literal
|
85
89
|
else:
|
86
90
|
# Literal text
|
87
91
|
result.append(marker)
|
88
|
-
elif hasattr(marker,
|
92
|
+
elif hasattr(marker, "text"):
|
89
93
|
# It's an element object
|
90
94
|
result.append(marker.text)
|
91
|
-
elif hasattr(marker,
|
95
|
+
elif hasattr(marker, "extract_text"):
|
92
96
|
# It's an element that can extract text
|
93
97
|
result.append(marker.extract_text())
|
94
98
|
else:
|
95
99
|
result.append(str(marker))
|
96
100
|
return result
|
97
|
-
|
101
|
+
|
98
102
|
else:
|
99
103
|
# Unknown type, try to convert to string
|
100
104
|
return [str(markers)]
|
@@ -102,44 +106,46 @@ def _normalize_markers(
|
|
102
106
|
|
103
107
|
class GuidesList(UserList):
|
104
108
|
"""A list of guide coordinates that also provides methods for creating guides."""
|
105
|
-
|
109
|
+
|
106
110
|
def __init__(self, parent_guides: "Guides", axis: Literal["vertical", "horizontal"], data=None):
|
107
111
|
super().__init__(data or [])
|
108
112
|
self._parent = parent_guides
|
109
113
|
self._axis = axis
|
110
|
-
|
114
|
+
|
111
115
|
def from_content(
|
112
116
|
self,
|
113
117
|
markers: Union[str, List[str], "ElementCollection", None],
|
114
118
|
obj: Optional[Union["Page", "Region"]] = None,
|
115
|
-
align: Literal[
|
119
|
+
align: Literal["left", "right", "center", "between"] = "left",
|
116
120
|
outer: bool = True,
|
117
|
-
tolerance: float = 5
|
121
|
+
tolerance: float = 5,
|
122
|
+
*,
|
123
|
+
append: bool = False,
|
118
124
|
) -> "Guides":
|
119
125
|
"""
|
120
126
|
Create guides from content markers and add to this axis.
|
121
|
-
|
127
|
+
|
122
128
|
Args:
|
123
129
|
markers: Content to search for. Can be:
|
124
130
|
- str: single selector (e.g., 'text:contains("Name")') or literal text
|
125
|
-
- List[str]: list of selectors or literal text strings
|
131
|
+
- List[str]: list of selectors or literal text strings
|
126
132
|
- ElementCollection: collection of elements to extract text from
|
127
133
|
- None: no markers
|
128
134
|
obj: Page/Region to search (uses parent's context if None)
|
129
135
|
align: How to align guides relative to found elements
|
130
136
|
outer: Whether to add outer boundary guides
|
131
137
|
tolerance: Tolerance for snapping to element edges
|
132
|
-
|
138
|
+
|
133
139
|
Returns:
|
134
140
|
Parent Guides object for chaining
|
135
141
|
"""
|
136
142
|
target_obj = obj or self._parent.context
|
137
143
|
if target_obj is None:
|
138
144
|
raise ValueError("No object provided and no context available")
|
139
|
-
|
145
|
+
|
140
146
|
# Normalize markers to list of text strings
|
141
147
|
marker_texts = _normalize_markers(markers, target_obj)
|
142
|
-
|
148
|
+
|
143
149
|
# Create guides for this axis
|
144
150
|
new_guides = Guides.from_content(
|
145
151
|
obj=target_obj,
|
@@ -147,15 +153,21 @@ class GuidesList(UserList):
|
|
147
153
|
markers=marker_texts,
|
148
154
|
align=align,
|
149
155
|
outer=outer,
|
150
|
-
tolerance=tolerance
|
156
|
+
tolerance=tolerance,
|
151
157
|
)
|
152
|
-
|
153
|
-
#
|
154
|
-
if
|
155
|
-
self.
|
158
|
+
|
159
|
+
# Replace or append based on parameter
|
160
|
+
if append:
|
161
|
+
if self._axis == "vertical":
|
162
|
+
self.extend(new_guides.vertical)
|
163
|
+
else:
|
164
|
+
self.extend(new_guides.horizontal)
|
156
165
|
else:
|
157
|
-
self.
|
158
|
-
|
166
|
+
if self._axis == "vertical":
|
167
|
+
self.data = list(new_guides.vertical)
|
168
|
+
else:
|
169
|
+
self.data = list(new_guides.horizontal)
|
170
|
+
|
159
171
|
# Remove duplicates while preserving order
|
160
172
|
seen = set()
|
161
173
|
unique = []
|
@@ -164,26 +176,27 @@ class GuidesList(UserList):
|
|
164
176
|
seen.add(x)
|
165
177
|
unique.append(x)
|
166
178
|
self.data = unique
|
167
|
-
|
179
|
+
|
168
180
|
return self._parent # Return parent for chaining
|
169
|
-
|
181
|
+
|
170
182
|
def from_lines(
|
171
183
|
self,
|
172
184
|
obj: Optional[Union["Page", "Region"]] = None,
|
173
|
-
threshold: Union[float, str] =
|
185
|
+
threshold: Union[float, str] = "auto",
|
174
186
|
source_label: Optional[str] = None,
|
175
187
|
max_lines: Optional[int] = None,
|
176
188
|
outer: bool = False,
|
177
|
-
detection_method: str =
|
189
|
+
detection_method: str = "vector",
|
178
190
|
resolution: int = 192,
|
179
191
|
*,
|
180
192
|
n: Optional[int] = None,
|
181
193
|
min_gap: Optional[int] = None,
|
182
|
-
|
194
|
+
append: bool = False,
|
195
|
+
**detect_kwargs,
|
183
196
|
) -> "Guides":
|
184
197
|
"""
|
185
198
|
Create guides from detected line elements.
|
186
|
-
|
199
|
+
|
187
200
|
Args:
|
188
201
|
obj: Page/Region to search (uses parent's context if None)
|
189
202
|
threshold: Line detection threshold ('auto' or float 0.0-1.0)
|
@@ -198,14 +211,14 @@ class GuidesList(UserList):
|
|
198
211
|
resolution: DPI for pixel-based detection (default: 192)
|
199
212
|
**detect_kwargs: Additional parameters for pixel-based detection
|
200
213
|
(e.g., min_gap_h, min_gap_v, binarization_method, etc.)
|
201
|
-
|
214
|
+
|
202
215
|
Returns:
|
203
216
|
Parent Guides object for chaining
|
204
217
|
"""
|
205
218
|
target_obj = obj or self._parent.context
|
206
219
|
if target_obj is None:
|
207
220
|
raise ValueError("No object provided and no context available")
|
208
|
-
|
221
|
+
|
209
222
|
# Resolve max_lines via alias `n` (n takes priority)
|
210
223
|
if n is not None:
|
211
224
|
if n <= 0:
|
@@ -213,16 +226,16 @@ class GuidesList(UserList):
|
|
213
226
|
max_lines = n
|
214
227
|
|
215
228
|
# Set appropriate max_lines parameter for underlying API
|
216
|
-
max_lines_h = max_lines if self._axis ==
|
217
|
-
max_lines_v = max_lines if self._axis ==
|
218
|
-
|
229
|
+
max_lines_h = max_lines if self._axis == "horizontal" else None
|
230
|
+
max_lines_v = max_lines if self._axis == "vertical" else None
|
231
|
+
|
219
232
|
# Map generic `min_gap` to axis-specific argument expected by detection
|
220
233
|
if min_gap is not None:
|
221
234
|
if min_gap < 1:
|
222
235
|
raise ValueError("min_gap must be ≥ 1 pixel")
|
223
|
-
axis_key =
|
236
|
+
axis_key = "min_gap_h" if self._axis == "horizontal" else "min_gap_v"
|
224
237
|
detect_kwargs.setdefault(axis_key, min_gap)
|
225
|
-
|
238
|
+
|
226
239
|
# Create guides for this axis
|
227
240
|
new_guides = Guides.from_lines(
|
228
241
|
obj=target_obj,
|
@@ -234,15 +247,21 @@ class GuidesList(UserList):
|
|
234
247
|
outer=outer,
|
235
248
|
detection_method=detection_method,
|
236
249
|
resolution=resolution,
|
237
|
-
**detect_kwargs
|
250
|
+
**detect_kwargs,
|
238
251
|
)
|
239
|
-
|
240
|
-
#
|
241
|
-
if
|
242
|
-
self.
|
252
|
+
|
253
|
+
# Replace or append based on parameter
|
254
|
+
if append:
|
255
|
+
if self._axis == "vertical":
|
256
|
+
self.extend(new_guides.vertical)
|
257
|
+
else:
|
258
|
+
self.extend(new_guides.horizontal)
|
243
259
|
else:
|
244
|
-
self.
|
245
|
-
|
260
|
+
if self._axis == "vertical":
|
261
|
+
self.data = list(new_guides.vertical)
|
262
|
+
else:
|
263
|
+
self.data = list(new_guides.horizontal)
|
264
|
+
|
246
265
|
# Remove duplicates
|
247
266
|
seen = set()
|
248
267
|
unique = []
|
@@ -251,41 +270,42 @@ class GuidesList(UserList):
|
|
251
270
|
seen.add(x)
|
252
271
|
unique.append(x)
|
253
272
|
self.data = unique
|
254
|
-
|
273
|
+
|
255
274
|
return self._parent
|
256
|
-
|
275
|
+
|
257
276
|
def from_whitespace(
|
258
|
-
self,
|
259
|
-
|
260
|
-
min_gap: float = 10
|
277
|
+
self, obj: Optional[Union["Page", "Region"]] = None, min_gap: float = 10,
|
278
|
+
*, append: bool = False
|
261
279
|
) -> "Guides":
|
262
280
|
"""
|
263
281
|
Create guides from whitespace gaps.
|
264
|
-
|
282
|
+
|
265
283
|
Args:
|
266
284
|
obj: Page/Region to analyze (uses parent's context if None)
|
267
285
|
min_gap: Minimum gap size to consider
|
268
|
-
|
286
|
+
|
269
287
|
Returns:
|
270
288
|
Parent Guides object for chaining
|
271
289
|
"""
|
272
290
|
target_obj = obj or self._parent.context
|
273
291
|
if target_obj is None:
|
274
292
|
raise ValueError("No object provided and no context available")
|
275
|
-
|
293
|
+
|
276
294
|
# Create guides for this axis
|
277
|
-
new_guides = Guides.from_whitespace(
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
self.extend(new_guides.vertical)
|
295
|
+
new_guides = Guides.from_whitespace(obj=target_obj, axis=self._axis, min_gap=min_gap)
|
296
|
+
|
297
|
+
# Replace or append
|
298
|
+
if append:
|
299
|
+
if self._axis == "vertical":
|
300
|
+
self.extend(new_guides.vertical)
|
301
|
+
else:
|
302
|
+
self.extend(new_guides.horizontal)
|
286
303
|
else:
|
287
|
-
self.
|
288
|
-
|
304
|
+
if self._axis == "vertical":
|
305
|
+
self.data = list(new_guides.vertical)
|
306
|
+
else:
|
307
|
+
self.data = list(new_guides.horizontal)
|
308
|
+
|
289
309
|
# Remove duplicates
|
290
310
|
seen = set()
|
291
311
|
unique = []
|
@@ -294,37 +314,33 @@ class GuidesList(UserList):
|
|
294
314
|
seen.add(x)
|
295
315
|
unique.append(x)
|
296
316
|
self.data = unique
|
297
|
-
|
317
|
+
|
298
318
|
return self._parent
|
299
|
-
|
319
|
+
|
300
320
|
def divide(self, n: int = 2, obj: Optional[Union["Page", "Region"]] = None) -> "Guides":
|
301
321
|
"""
|
302
322
|
Divide the space evenly along this axis.
|
303
|
-
|
323
|
+
|
304
324
|
Args:
|
305
325
|
n: Number of divisions (creates n-1 guides)
|
306
326
|
obj: Object to divide (uses parent's context if None)
|
307
|
-
|
327
|
+
|
308
328
|
Returns:
|
309
329
|
Parent Guides object for chaining
|
310
330
|
"""
|
311
331
|
target_obj = obj or self._parent.context
|
312
332
|
if target_obj is None:
|
313
333
|
raise ValueError("No object provided and no context available")
|
314
|
-
|
334
|
+
|
315
335
|
# Create guides using divide
|
316
|
-
new_guides = Guides.divide(
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
# Add to our list
|
323
|
-
if self._axis == 'vertical':
|
324
|
-
self.extend(new_guides.vertical)
|
336
|
+
new_guides = Guides.divide(obj=target_obj, n=n, axis=self._axis)
|
337
|
+
|
338
|
+
# Replace existing guides instead of extending (no append option here)
|
339
|
+
if self._axis == "vertical":
|
340
|
+
self.data = list(new_guides.vertical)
|
325
341
|
else:
|
326
|
-
self.
|
327
|
-
|
342
|
+
self.data = list(new_guides.horizontal)
|
343
|
+
|
328
344
|
# Remove duplicates
|
329
345
|
seen = set()
|
330
346
|
unique = []
|
@@ -333,45 +349,45 @@ class GuidesList(UserList):
|
|
333
349
|
seen.add(x)
|
334
350
|
unique.append(x)
|
335
351
|
self.data = unique
|
336
|
-
|
352
|
+
|
337
353
|
return self._parent
|
338
|
-
|
354
|
+
|
339
355
|
def snap_to_whitespace(
|
340
356
|
self,
|
341
357
|
min_gap: float = 10.0,
|
342
|
-
detection_method: str =
|
343
|
-
threshold: Union[float, str] =
|
344
|
-
on_no_snap: str =
|
345
|
-
obj: Optional[Union["Page", "Region"]] = None
|
358
|
+
detection_method: str = "pixels",
|
359
|
+
threshold: Union[float, str] = "auto",
|
360
|
+
on_no_snap: str = "warn",
|
361
|
+
obj: Optional[Union["Page", "Region"]] = None,
|
346
362
|
) -> "Guides":
|
347
363
|
"""
|
348
364
|
Snap guides in this axis to whitespace gaps.
|
349
|
-
|
365
|
+
|
350
366
|
Args:
|
351
367
|
min_gap: Minimum gap size to consider
|
352
368
|
detection_method: 'pixels' or 'text' for gap detection
|
353
369
|
threshold: Threshold for whitespace detection (0.0-1.0) or 'auto'
|
354
370
|
on_no_snap: What to do when snapping fails ('warn', 'raise', 'ignore')
|
355
371
|
obj: Object to analyze (uses parent's context if None)
|
356
|
-
|
372
|
+
|
357
373
|
Returns:
|
358
374
|
Parent Guides object for chaining
|
359
375
|
"""
|
360
376
|
target_obj = obj or self._parent.context
|
361
377
|
if target_obj is None:
|
362
378
|
raise ValueError("No object provided and no context available")
|
363
|
-
|
379
|
+
|
364
380
|
# Use the parent's snap_to_whitespace but only for this axis
|
365
381
|
original_guides = self.data.copy()
|
366
|
-
|
382
|
+
|
367
383
|
# Temporarily set the parent's guides to only this axis
|
368
|
-
if self._axis ==
|
384
|
+
if self._axis == "vertical":
|
369
385
|
original_horizontal = self._parent.horizontal.data.copy()
|
370
386
|
self._parent.horizontal.data = []
|
371
387
|
else:
|
372
388
|
original_vertical = self._parent.vertical.data.copy()
|
373
389
|
self._parent.vertical.data = []
|
374
|
-
|
390
|
+
|
375
391
|
try:
|
376
392
|
# Call the parent's method
|
377
393
|
self._parent.snap_to_whitespace(
|
@@ -379,140 +395,143 @@ class GuidesList(UserList):
|
|
379
395
|
min_gap=min_gap,
|
380
396
|
detection_method=detection_method,
|
381
397
|
threshold=threshold,
|
382
|
-
on_no_snap=on_no_snap
|
398
|
+
on_no_snap=on_no_snap,
|
383
399
|
)
|
384
|
-
|
400
|
+
|
385
401
|
# Update our data from the parent
|
386
|
-
if self._axis ==
|
402
|
+
if self._axis == "vertical":
|
387
403
|
self.data = self._parent.vertical.data.copy()
|
388
404
|
else:
|
389
405
|
self.data = self._parent.horizontal.data.copy()
|
390
|
-
|
406
|
+
|
391
407
|
finally:
|
392
408
|
# Restore the other axis
|
393
|
-
if self._axis ==
|
409
|
+
if self._axis == "vertical":
|
394
410
|
self._parent.horizontal.data = original_horizontal
|
395
411
|
else:
|
396
412
|
self._parent.vertical.data = original_vertical
|
397
|
-
|
413
|
+
|
398
414
|
return self._parent
|
399
|
-
|
415
|
+
|
400
416
|
def snap_to_content(
|
401
417
|
self,
|
402
|
-
markers: Union[str, List[str], "ElementCollection", None] =
|
403
|
-
align: Literal[
|
418
|
+
markers: Union[str, List[str], "ElementCollection", None] = "text",
|
419
|
+
align: Literal["left", "right", "center"] = "left",
|
404
420
|
tolerance: float = 5,
|
405
|
-
obj: Optional[Union["Page", "Region"]] = None
|
421
|
+
obj: Optional[Union["Page", "Region"]] = None,
|
406
422
|
) -> "Guides":
|
407
423
|
"""
|
408
424
|
Snap guides in this axis to nearby text content.
|
409
|
-
|
425
|
+
|
410
426
|
Args:
|
411
427
|
markers: Content to snap to. Can be:
|
412
428
|
- str: single selector or literal text (default: 'text' for all text)
|
413
|
-
- List[str]: list of selectors or literal text strings
|
429
|
+
- List[str]: list of selectors or literal text strings
|
414
430
|
- ElementCollection: collection of elements
|
415
431
|
- None: no markers (no snapping)
|
416
432
|
align: How to align to the found text
|
417
433
|
tolerance: Maximum distance to move when snapping
|
418
434
|
obj: Object to search (uses parent's context if None)
|
419
|
-
|
435
|
+
|
420
436
|
Returns:
|
421
437
|
Parent Guides object for chaining
|
422
438
|
"""
|
423
439
|
target_obj = obj or self._parent.context
|
424
440
|
if target_obj is None:
|
425
441
|
raise ValueError("No object provided and no context available")
|
426
|
-
|
442
|
+
|
427
443
|
# Handle special case of 'text' as a selector for all text
|
428
|
-
if markers ==
|
444
|
+
if markers == "text":
|
429
445
|
# Get all text elements
|
430
|
-
if hasattr(target_obj,
|
431
|
-
text_elements = target_obj.find_all(
|
432
|
-
if hasattr(text_elements,
|
446
|
+
if hasattr(target_obj, "find_all"):
|
447
|
+
text_elements = target_obj.find_all("text")
|
448
|
+
if hasattr(text_elements, "elements"):
|
433
449
|
text_elements = text_elements.elements
|
434
|
-
|
450
|
+
|
435
451
|
# Snap each guide to the nearest text element
|
436
452
|
for i, guide_pos in enumerate(self.data):
|
437
|
-
best_distance = float(
|
453
|
+
best_distance = float("inf")
|
438
454
|
best_pos = guide_pos
|
439
|
-
|
455
|
+
|
440
456
|
for elem in text_elements:
|
441
457
|
# Calculate target position based on alignment
|
442
|
-
if self._axis ==
|
443
|
-
if align ==
|
458
|
+
if self._axis == "vertical":
|
459
|
+
if align == "left":
|
444
460
|
elem_pos = elem.x0
|
445
|
-
elif align ==
|
461
|
+
elif align == "right":
|
446
462
|
elem_pos = elem.x1
|
447
463
|
else: # center
|
448
464
|
elem_pos = (elem.x0 + elem.x1) / 2
|
449
465
|
else: # horizontal
|
450
|
-
if align ==
|
466
|
+
if align == "left": # top for horizontal
|
451
467
|
elem_pos = elem.top
|
452
|
-
elif align ==
|
468
|
+
elif align == "right": # bottom for horizontal
|
453
469
|
elem_pos = elem.bottom
|
454
470
|
else: # center
|
455
471
|
elem_pos = (elem.top + elem.bottom) / 2
|
456
|
-
|
472
|
+
|
457
473
|
# Check if this is closer than current best
|
458
474
|
distance = abs(guide_pos - elem_pos)
|
459
475
|
if distance < best_distance and distance <= tolerance:
|
460
476
|
best_distance = distance
|
461
477
|
best_pos = elem_pos
|
462
|
-
|
478
|
+
|
463
479
|
# Update guide position if we found a good snap
|
464
480
|
if best_pos != guide_pos:
|
465
481
|
self.data[i] = best_pos
|
466
|
-
logger.debug(
|
482
|
+
logger.debug(
|
483
|
+
f"Snapped {self._axis} guide from {guide_pos:.1f} to {best_pos:.1f}"
|
484
|
+
)
|
467
485
|
else:
|
468
486
|
logger.warning("Object does not support find_all for text snapping")
|
469
487
|
else:
|
470
488
|
# Original behavior for specific markers
|
471
489
|
marker_texts = _normalize_markers(markers, target_obj)
|
472
|
-
|
490
|
+
|
473
491
|
# Find each marker and snap guides
|
474
492
|
for marker in marker_texts:
|
475
|
-
if hasattr(target_obj,
|
493
|
+
if hasattr(target_obj, "find"):
|
476
494
|
element = target_obj.find(f'text:contains("{marker}")')
|
477
495
|
if not element:
|
478
496
|
logger.warning(f"Could not find text '{marker}' for snapping")
|
479
497
|
continue
|
480
|
-
|
498
|
+
|
481
499
|
# Determine target position based on alignment
|
482
|
-
if self._axis ==
|
483
|
-
if align ==
|
500
|
+
if self._axis == "vertical":
|
501
|
+
if align == "left":
|
484
502
|
target_pos = element.x0
|
485
|
-
elif align ==
|
503
|
+
elif align == "right":
|
486
504
|
target_pos = element.x1
|
487
505
|
else: # center
|
488
506
|
target_pos = (element.x0 + element.x1) / 2
|
489
507
|
else: # horizontal
|
490
|
-
if align ==
|
508
|
+
if align == "left": # top for horizontal
|
491
509
|
target_pos = element.top
|
492
|
-
elif align ==
|
510
|
+
elif align == "right": # bottom for horizontal
|
493
511
|
target_pos = element.bottom
|
494
512
|
else: # center
|
495
513
|
target_pos = (element.top + element.bottom) / 2
|
496
|
-
|
514
|
+
|
497
515
|
# Find closest guide and snap if within tolerance
|
498
516
|
if self.data:
|
499
|
-
closest_idx = min(
|
500
|
-
|
517
|
+
closest_idx = min(
|
518
|
+
range(len(self.data)), key=lambda i: abs(self.data[i] - target_pos)
|
519
|
+
)
|
501
520
|
if abs(self.data[closest_idx] - target_pos) <= tolerance:
|
502
521
|
self.data[closest_idx] = target_pos
|
503
|
-
|
522
|
+
|
504
523
|
# Sort after snapping
|
505
524
|
self.data.sort()
|
506
525
|
return self._parent
|
507
|
-
|
526
|
+
|
508
527
|
def shift(self, index: int, offset: float) -> "Guides":
|
509
528
|
"""
|
510
529
|
Move a specific guide in this axis by a offset amount.
|
511
|
-
|
530
|
+
|
512
531
|
Args:
|
513
532
|
index: Index of the guide to move
|
514
533
|
offset: Amount to move (positive = right/down)
|
515
|
-
|
534
|
+
|
516
535
|
Returns:
|
517
536
|
Parent Guides object for chaining
|
518
537
|
"""
|
@@ -521,18 +540,18 @@ class GuidesList(UserList):
|
|
521
540
|
self.data.sort()
|
522
541
|
else:
|
523
542
|
logger.warning(f"Guide index {index} out of range for {self._axis} axis")
|
524
|
-
|
543
|
+
|
525
544
|
return self._parent
|
526
|
-
|
545
|
+
|
527
546
|
def add(self, position: Union[float, List[float]]) -> "Guides":
|
528
547
|
"""
|
529
548
|
Add one or more guides at the specified position(s).
|
530
|
-
|
549
|
+
|
531
550
|
Args:
|
532
551
|
position: Coordinate(s) to add guide(s) at. Can be:
|
533
552
|
- float: single position
|
534
553
|
- List[float]: multiple positions
|
535
|
-
|
554
|
+
|
536
555
|
Returns:
|
537
556
|
Parent Guides object for chaining
|
538
557
|
"""
|
@@ -543,34 +562,34 @@ class GuidesList(UserList):
|
|
543
562
|
else:
|
544
563
|
# Add single position
|
545
564
|
self.append(float(position))
|
546
|
-
|
565
|
+
|
547
566
|
self.data.sort()
|
548
567
|
return self._parent
|
549
|
-
|
568
|
+
|
550
569
|
def remove_at(self, index: int) -> "Guides":
|
551
570
|
"""
|
552
571
|
Remove a guide by index.
|
553
|
-
|
572
|
+
|
554
573
|
Args:
|
555
574
|
index: Index of guide to remove
|
556
|
-
|
575
|
+
|
557
576
|
Returns:
|
558
577
|
Parent Guides object for chaining
|
559
578
|
"""
|
560
579
|
if 0 <= index < len(self.data):
|
561
580
|
self.data.pop(index)
|
562
581
|
return self._parent
|
563
|
-
|
582
|
+
|
564
583
|
def clear_all(self) -> "Guides":
|
565
584
|
"""
|
566
585
|
Remove all guides from this axis.
|
567
|
-
|
586
|
+
|
568
587
|
Returns:
|
569
588
|
Parent Guides object for chaining
|
570
589
|
"""
|
571
590
|
self.data.clear()
|
572
591
|
return self._parent
|
573
|
-
|
592
|
+
|
574
593
|
def __add__(self, other):
|
575
594
|
"""Handle addition of GuidesList objects by returning combined data."""
|
576
595
|
if isinstance(other, GuidesList):
|
@@ -584,11 +603,11 @@ class GuidesList(UserList):
|
|
584
603
|
class Guides:
|
585
604
|
"""
|
586
605
|
Manages vertical and horizontal guide lines for table extraction and layout analysis.
|
587
|
-
|
606
|
+
|
588
607
|
Guides are collections of coordinates that can be used to define table boundaries,
|
589
608
|
column positions, or general layout structures. They can be created through various
|
590
609
|
detection methods or manually specified.
|
591
|
-
|
610
|
+
|
592
611
|
Attributes:
|
593
612
|
verticals: List of x-coordinates for vertical guide lines
|
594
613
|
horizontals: List of y-coordinates for horizontal guide lines
|
@@ -596,7 +615,7 @@ class Guides:
|
|
596
615
|
bounds: Optional bounding box (x0, y0, x1, y1) for relative coordinate conversion
|
597
616
|
snap_behavior: How to handle failed snapping operations ('warn', 'ignore', 'raise')
|
598
617
|
"""
|
599
|
-
|
618
|
+
|
600
619
|
def __init__(
|
601
620
|
self,
|
602
621
|
verticals: Optional[Union[List[float], "Page", "Region"]] = None,
|
@@ -604,52 +623,63 @@ class Guides:
|
|
604
623
|
context: Optional[Union["Page", "Region"]] = None,
|
605
624
|
bounds: Optional[Tuple[float, float, float, float]] = None,
|
606
625
|
relative: bool = False,
|
607
|
-
snap_behavior: Literal[
|
626
|
+
snap_behavior: Literal["raise", "warn", "ignore"] = "warn",
|
608
627
|
):
|
609
628
|
"""
|
610
629
|
Initialize a Guides object.
|
611
|
-
|
630
|
+
|
612
631
|
Args:
|
613
632
|
verticals: List of x-coordinates for vertical guides, or a Page/Region as context
|
614
|
-
horizontals: List of y-coordinates for horizontal guides
|
633
|
+
horizontals: List of y-coordinates for horizontal guides
|
615
634
|
context: Page or Region object these guides were created from
|
616
635
|
bounds: Bounding box (x0, top, x1, bottom) if context not provided
|
617
636
|
relative: Whether coordinates are relative (0-1) or absolute
|
618
637
|
snap_behavior: How to handle snapping conflicts ('raise', 'warn', or 'ignore')
|
619
638
|
"""
|
620
639
|
# Handle Guides(page) shorthand
|
621
|
-
if
|
640
|
+
if (
|
641
|
+
verticals is not None
|
642
|
+
and not isinstance(verticals, (list, tuple))
|
643
|
+
and horizontals is None
|
644
|
+
and context is None
|
645
|
+
):
|
622
646
|
# First argument is a page/region, not coordinates
|
623
647
|
context = verticals
|
624
648
|
verticals = None
|
625
|
-
|
649
|
+
|
626
650
|
self.context = context
|
627
651
|
self.bounds = bounds
|
628
652
|
self.relative = relative
|
629
653
|
self.snap_behavior = snap_behavior
|
630
|
-
|
654
|
+
|
631
655
|
# Initialize with GuidesList instances
|
632
656
|
self._vertical = GuidesList(self, "vertical", sorted([float(x) for x in (verticals or [])]))
|
633
|
-
self._horizontal = GuidesList(
|
634
|
-
|
657
|
+
self._horizontal = GuidesList(
|
658
|
+
self, "horizontal", sorted([float(y) for y in (horizontals or [])])
|
659
|
+
)
|
660
|
+
|
635
661
|
# Determine bounds from context if needed
|
636
662
|
if self.bounds is None and self.context is not None:
|
637
|
-
if hasattr(self.context,
|
663
|
+
if hasattr(self.context, "bbox"):
|
638
664
|
self.bounds = self.context.bbox
|
639
|
-
elif hasattr(self.context,
|
640
|
-
self.bounds = (
|
641
|
-
|
642
|
-
|
665
|
+
elif hasattr(self.context, "x0"):
|
666
|
+
self.bounds = (
|
667
|
+
self.context.x0,
|
668
|
+
self.context.top,
|
669
|
+
self.context.x1,
|
670
|
+
self.context.bottom,
|
671
|
+
)
|
672
|
+
|
643
673
|
# Convert relative to absolute if needed
|
644
674
|
if self.relative and self.bounds:
|
645
675
|
x0, top, x1, bottom = self.bounds
|
646
676
|
width = x1 - x0
|
647
677
|
height = bottom - top
|
648
|
-
|
678
|
+
|
649
679
|
self._vertical.data = [x0 + v * width for v in self._vertical]
|
650
680
|
self._horizontal.data = [top + h * height for h in self._horizontal]
|
651
681
|
self.relative = False
|
652
|
-
|
682
|
+
|
653
683
|
@property
|
654
684
|
def vertical(self) -> GuidesList:
|
655
685
|
"""Get vertical guide coordinates."""
|
@@ -665,8 +695,10 @@ class Guides:
|
|
665
695
|
self._vertical.data = sorted([float(x) for x in value.vertical])
|
666
696
|
elif isinstance(value, str):
|
667
697
|
# Explicitly reject strings to avoid confusing iteration over characters
|
668
|
-
raise TypeError(
|
669
|
-
|
698
|
+
raise TypeError(
|
699
|
+
f"vertical cannot be a string, got '{value}'. Use a list of coordinates or Guides object."
|
700
|
+
)
|
701
|
+
elif hasattr(value, "__iter__"):
|
670
702
|
# Handle list/tuple of coordinates
|
671
703
|
try:
|
672
704
|
self._vertical.data = sorted([float(x) for x in value])
|
@@ -690,8 +722,10 @@ class Guides:
|
|
690
722
|
self._horizontal.data = sorted([float(y) for y in value.horizontal])
|
691
723
|
elif isinstance(value, str):
|
692
724
|
# Explicitly reject strings
|
693
|
-
raise TypeError(
|
694
|
-
|
725
|
+
raise TypeError(
|
726
|
+
f"horizontal cannot be a string, got '{value}'. Use a list of coordinates or Guides object."
|
727
|
+
)
|
728
|
+
elif hasattr(value, "__iter__"):
|
695
729
|
# Handle list/tuple of coordinates
|
696
730
|
try:
|
697
731
|
self._horizontal.data = sorted([float(y) for y in value])
|
@@ -699,24 +733,24 @@ class Guides:
|
|
699
733
|
raise TypeError(f"horizontal must contain numeric values, got {value}: {e}")
|
700
734
|
else:
|
701
735
|
raise TypeError(f"horizontal must be a list, Guides object, or None, got {type(value)}")
|
702
|
-
|
736
|
+
|
703
737
|
def _get_context_bounds(self) -> Optional[Tuple[float, float, float, float]]:
|
704
738
|
"""Get bounds from context if available."""
|
705
739
|
if self.context is None:
|
706
740
|
return None
|
707
|
-
|
708
|
-
if hasattr(self.context,
|
741
|
+
|
742
|
+
if hasattr(self.context, "bbox"):
|
709
743
|
return self.context.bbox
|
710
|
-
elif hasattr(self.context,
|
744
|
+
elif hasattr(self.context, "x0") and hasattr(self.context, "top"):
|
711
745
|
return (self.context.x0, self.context.top, self.context.x1, self.context.bottom)
|
712
|
-
elif hasattr(self.context,
|
746
|
+
elif hasattr(self.context, "width") and hasattr(self.context, "height"):
|
713
747
|
return (0, 0, self.context.width, self.context.height)
|
714
748
|
return None
|
715
|
-
|
749
|
+
|
716
750
|
# -------------------------------------------------------------------------
|
717
751
|
# Factory Methods
|
718
752
|
# -------------------------------------------------------------------------
|
719
|
-
|
753
|
+
|
720
754
|
@classmethod
|
721
755
|
def divide(
|
722
756
|
cls,
|
@@ -724,28 +758,28 @@ class Guides:
|
|
724
758
|
n: Optional[int] = None,
|
725
759
|
cols: Optional[int] = None,
|
726
760
|
rows: Optional[int] = None,
|
727
|
-
axis: Literal[
|
761
|
+
axis: Literal["vertical", "horizontal", "both"] = "both",
|
728
762
|
) -> "Guides":
|
729
763
|
"""
|
730
764
|
Create guides by evenly dividing an object.
|
731
|
-
|
765
|
+
|
732
766
|
Args:
|
733
767
|
obj: Object to divide (Page, Region, or bbox tuple)
|
734
768
|
n: Number of divisions (creates n+1 guides). Used if cols/rows not specified.
|
735
769
|
cols: Number of columns (creates cols+1 vertical guides)
|
736
770
|
rows: Number of rows (creates rows+1 horizontal guides)
|
737
771
|
axis: Which axis to divide along
|
738
|
-
|
772
|
+
|
739
773
|
Returns:
|
740
774
|
New Guides object with evenly spaced lines
|
741
|
-
|
775
|
+
|
742
776
|
Examples:
|
743
777
|
# Divide into 3 columns
|
744
778
|
guides = Guides.divide(page, cols=3)
|
745
|
-
|
779
|
+
|
746
780
|
# Divide into 5 rows
|
747
781
|
guides = Guides.divide(region, rows=5)
|
748
|
-
|
782
|
+
|
749
783
|
# Divide both axes
|
750
784
|
guides = Guides.divide(page, cols=3, rows=5)
|
751
785
|
"""
|
@@ -755,52 +789,52 @@ class Guides:
|
|
755
789
|
context = None
|
756
790
|
else:
|
757
791
|
context = obj
|
758
|
-
if hasattr(obj,
|
792
|
+
if hasattr(obj, "bbox"):
|
759
793
|
bounds = obj.bbox
|
760
|
-
elif hasattr(obj,
|
794
|
+
elif hasattr(obj, "x0"):
|
761
795
|
bounds = (obj.x0, obj.top, obj.x1, obj.bottom)
|
762
796
|
else:
|
763
797
|
bounds = (0, 0, obj.width, obj.height)
|
764
|
-
|
798
|
+
|
765
799
|
x0, y0, x1, y1 = bounds
|
766
800
|
verticals = []
|
767
801
|
horizontals = []
|
768
|
-
|
802
|
+
|
769
803
|
# Handle vertical guides
|
770
|
-
if axis in (
|
804
|
+
if axis in ("vertical", "both"):
|
771
805
|
n_vertical = cols + 1 if cols is not None else (n + 1 if n is not None else 0)
|
772
806
|
if n_vertical > 0:
|
773
807
|
for i in range(n_vertical):
|
774
808
|
x = x0 + (x1 - x0) * i / (n_vertical - 1)
|
775
809
|
verticals.append(float(x))
|
776
|
-
|
810
|
+
|
777
811
|
# Handle horizontal guides
|
778
|
-
if axis in (
|
812
|
+
if axis in ("horizontal", "both"):
|
779
813
|
n_horizontal = rows + 1 if rows is not None else (n + 1 if n is not None else 0)
|
780
814
|
if n_horizontal > 0:
|
781
815
|
for i in range(n_horizontal):
|
782
816
|
y = y0 + (y1 - y0) * i / (n_horizontal - 1)
|
783
817
|
horizontals.append(float(y))
|
784
|
-
|
818
|
+
|
785
819
|
return cls(verticals=verticals, horizontals=horizontals, context=context, bounds=bounds)
|
786
|
-
|
820
|
+
|
787
821
|
@classmethod
|
788
822
|
def from_lines(
|
789
823
|
cls,
|
790
824
|
obj: Union["Page", "Region"],
|
791
|
-
axis: Literal[
|
792
|
-
threshold: Union[float, str] =
|
825
|
+
axis: Literal["vertical", "horizontal", "both"] = "both",
|
826
|
+
threshold: Union[float, str] = "auto",
|
793
827
|
source_label: Optional[str] = None,
|
794
828
|
max_lines_h: Optional[int] = None,
|
795
829
|
max_lines_v: Optional[int] = None,
|
796
830
|
outer: bool = False,
|
797
|
-
detection_method: str =
|
831
|
+
detection_method: str = "vector",
|
798
832
|
resolution: int = 192,
|
799
|
-
**detect_kwargs
|
833
|
+
**detect_kwargs,
|
800
834
|
) -> "Guides":
|
801
835
|
"""
|
802
836
|
Create guides from detected line elements.
|
803
|
-
|
837
|
+
|
804
838
|
Args:
|
805
839
|
obj: Page or Region to detect lines from
|
806
840
|
axis: Which orientations to detect
|
@@ -818,108 +852,128 @@ class Guides:
|
|
818
852
|
- morph_op_h/v: Morphological operations ('open', 'close', 'none')
|
819
853
|
- smoothing_sigma_h/v: Gaussian smoothing sigma
|
820
854
|
- method: 'projection' (default) or 'lsd' (requires opencv)
|
821
|
-
|
855
|
+
|
822
856
|
Returns:
|
823
857
|
New Guides object with detected line positions
|
824
858
|
"""
|
825
859
|
# Get bounds for potential outer guides
|
826
|
-
if hasattr(obj,
|
860
|
+
if hasattr(obj, "bbox"):
|
827
861
|
bounds = obj.bbox
|
828
|
-
elif hasattr(obj,
|
862
|
+
elif hasattr(obj, "x0"):
|
829
863
|
bounds = (obj.x0, obj.top, obj.x1, obj.bottom)
|
830
|
-
elif hasattr(obj,
|
864
|
+
elif hasattr(obj, "width"):
|
831
865
|
bounds = (0, 0, obj.width, obj.height)
|
832
866
|
else:
|
833
867
|
bounds = None
|
834
|
-
|
868
|
+
|
835
869
|
verticals = []
|
836
870
|
horizontals = []
|
837
|
-
|
838
|
-
if detection_method ==
|
871
|
+
|
872
|
+
if detection_method == "pixels":
|
839
873
|
# Use pixel-based line detection
|
840
|
-
if not hasattr(obj,
|
874
|
+
if not hasattr(obj, "detect_lines"):
|
841
875
|
raise ValueError(f"Object {obj} does not support pixel-based line detection")
|
842
|
-
|
876
|
+
|
843
877
|
# Set up detection parameters
|
844
878
|
detect_params = {
|
845
|
-
|
846
|
-
|
847
|
-
|
848
|
-
|
849
|
-
|
850
|
-
|
879
|
+
"resolution": resolution,
|
880
|
+
"source_label": source_label or "guides_detection",
|
881
|
+
"horizontal": axis in ("horizontal", "both"),
|
882
|
+
"vertical": axis in ("vertical", "both"),
|
883
|
+
"replace": True, # Replace any existing lines with this source
|
884
|
+
"method": detect_kwargs.get("method", "projection"),
|
851
885
|
}
|
852
|
-
|
886
|
+
|
853
887
|
# Handle threshold parameter
|
854
|
-
if threshold ==
|
888
|
+
if threshold == "auto":
|
855
889
|
# Auto mode: use very low thresholds with max_lines constraints
|
856
|
-
detect_params[
|
857
|
-
detect_params[
|
858
|
-
detect_params[
|
859
|
-
detect_params[
|
890
|
+
detect_params["peak_threshold_h"] = 0.0
|
891
|
+
detect_params["peak_threshold_v"] = 0.0
|
892
|
+
detect_params["max_lines_h"] = max_lines_h
|
893
|
+
detect_params["max_lines_v"] = max_lines_v
|
860
894
|
else:
|
861
895
|
# Fixed threshold mode
|
862
|
-
detect_params[
|
863
|
-
|
864
|
-
|
865
|
-
detect_params[
|
866
|
-
|
896
|
+
detect_params["peak_threshold_h"] = (
|
897
|
+
float(threshold) if axis in ("horizontal", "both") else 1.0
|
898
|
+
)
|
899
|
+
detect_params["peak_threshold_v"] = (
|
900
|
+
float(threshold) if axis in ("vertical", "both") else 1.0
|
901
|
+
)
|
902
|
+
detect_params["max_lines_h"] = max_lines_h
|
903
|
+
detect_params["max_lines_v"] = max_lines_v
|
904
|
+
|
867
905
|
# Add any additional detection parameters
|
868
|
-
for key in [
|
869
|
-
|
870
|
-
|
871
|
-
|
906
|
+
for key in [
|
907
|
+
"min_gap_h",
|
908
|
+
"min_gap_v",
|
909
|
+
"binarization_method",
|
910
|
+
"adaptive_thresh_block_size",
|
911
|
+
"adaptive_thresh_C_val",
|
912
|
+
"morph_op_h",
|
913
|
+
"morph_kernel_h",
|
914
|
+
"morph_op_v",
|
915
|
+
"morph_kernel_v",
|
916
|
+
"smoothing_sigma_h",
|
917
|
+
"smoothing_sigma_v",
|
918
|
+
"peak_width_rel_height",
|
919
|
+
]:
|
872
920
|
if key in detect_kwargs:
|
873
921
|
detect_params[key] = detect_kwargs[key]
|
874
|
-
|
922
|
+
|
875
923
|
# Perform the detection
|
876
924
|
obj.detect_lines(**detect_params)
|
877
|
-
|
925
|
+
|
878
926
|
# Now get the detected lines and use them
|
879
|
-
if hasattr(obj,
|
927
|
+
if hasattr(obj, "lines"):
|
880
928
|
lines = obj.lines
|
881
|
-
elif hasattr(obj,
|
882
|
-
lines = obj.find_all(
|
929
|
+
elif hasattr(obj, "find_all"):
|
930
|
+
lines = obj.find_all("line")
|
883
931
|
else:
|
884
932
|
lines = []
|
885
|
-
|
933
|
+
|
886
934
|
# Filter by the source we just used
|
887
|
-
lines = [
|
888
|
-
|
935
|
+
lines = [
|
936
|
+
l for l in lines if getattr(l, "source", None) == detect_params["source_label"]
|
937
|
+
]
|
938
|
+
|
889
939
|
else: # detection_method == 'vector' (default)
|
890
940
|
# Get existing lines from the object
|
891
|
-
if hasattr(obj,
|
941
|
+
if hasattr(obj, "lines"):
|
892
942
|
lines = obj.lines
|
893
|
-
elif hasattr(obj,
|
894
|
-
lines = obj.find_all(
|
943
|
+
elif hasattr(obj, "find_all"):
|
944
|
+
lines = obj.find_all("line")
|
895
945
|
else:
|
896
946
|
logger.warning(f"Object {obj} has no lines or find_all method")
|
897
947
|
lines = []
|
898
|
-
|
948
|
+
|
899
949
|
# Filter by source if specified
|
900
950
|
if source_label:
|
901
|
-
lines = [l for l in lines if getattr(l,
|
902
|
-
|
951
|
+
lines = [l for l in lines if getattr(l, "source", None) == source_label]
|
952
|
+
|
903
953
|
# Process lines (same logic for both methods)
|
904
954
|
# Separate lines by orientation and collect with metadata for ranking
|
905
955
|
h_line_data = [] # (y_coord, length, line_obj)
|
906
956
|
v_line_data = [] # (x_coord, length, line_obj)
|
907
|
-
|
957
|
+
|
908
958
|
for line in lines:
|
909
|
-
if hasattr(line,
|
910
|
-
if line.is_horizontal and axis in (
|
959
|
+
if hasattr(line, "is_horizontal") and hasattr(line, "is_vertical"):
|
960
|
+
if line.is_horizontal and axis in ("horizontal", "both"):
|
911
961
|
# Use the midpoint y-coordinate for horizontal lines
|
912
962
|
y = (line.top + line.bottom) / 2
|
913
963
|
# Calculate line length for ranking
|
914
|
-
length = getattr(
|
964
|
+
length = getattr(
|
965
|
+
line, "width", abs(getattr(line, "x1", 0) - getattr(line, "x0", 0))
|
966
|
+
)
|
915
967
|
h_line_data.append((y, length, line))
|
916
|
-
elif line.is_vertical and axis in (
|
968
|
+
elif line.is_vertical and axis in ("vertical", "both"):
|
917
969
|
# Use the midpoint x-coordinate for vertical lines
|
918
970
|
x = (line.x0 + line.x1) / 2
|
919
971
|
# Calculate line length for ranking
|
920
|
-
length = getattr(
|
972
|
+
length = getattr(
|
973
|
+
line, "height", abs(getattr(line, "bottom", 0) - getattr(line, "top", 0))
|
974
|
+
)
|
921
975
|
v_line_data.append((x, length, line))
|
922
|
-
|
976
|
+
|
923
977
|
# Process horizontal lines
|
924
978
|
if max_lines_h is not None and h_line_data:
|
925
979
|
# Sort by length (longer lines are typically more significant)
|
@@ -928,12 +982,14 @@ class Guides:
|
|
928
982
|
selected_h = h_line_data[:max_lines_h]
|
929
983
|
# Extract just the coordinates and sort by position
|
930
984
|
horizontals = sorted([coord for coord, _, _ in selected_h])
|
931
|
-
logger.debug(
|
985
|
+
logger.debug(
|
986
|
+
f"Selected {len(horizontals)} horizontal lines from {len(h_line_data)} candidates"
|
987
|
+
)
|
932
988
|
else:
|
933
989
|
# Use all horizontal lines (original behavior)
|
934
990
|
horizontals = [coord for coord, _, _ in h_line_data]
|
935
991
|
horizontals = sorted(list(set(horizontals)))
|
936
|
-
|
992
|
+
|
937
993
|
# Process vertical lines
|
938
994
|
if max_lines_v is not None and v_line_data:
|
939
995
|
# Sort by length (longer lines are typically more significant)
|
@@ -942,115 +998,117 @@ class Guides:
|
|
942
998
|
selected_v = v_line_data[:max_lines_v]
|
943
999
|
# Extract just the coordinates and sort by position
|
944
1000
|
verticals = sorted([coord for coord, _, _ in selected_v])
|
945
|
-
logger.debug(
|
1001
|
+
logger.debug(
|
1002
|
+
f"Selected {len(verticals)} vertical lines from {len(v_line_data)} candidates"
|
1003
|
+
)
|
946
1004
|
else:
|
947
1005
|
# Use all vertical lines (original behavior)
|
948
1006
|
verticals = [coord for coord, _, _ in v_line_data]
|
949
1007
|
verticals = sorted(list(set(verticals)))
|
950
|
-
|
1008
|
+
|
951
1009
|
# Add outer guides if requested
|
952
1010
|
if outer and bounds:
|
953
|
-
if axis in (
|
1011
|
+
if axis in ("vertical", "both"):
|
954
1012
|
if not verticals or verticals[0] > bounds[0]:
|
955
1013
|
verticals.insert(0, bounds[0]) # x0
|
956
1014
|
if not verticals or verticals[-1] < bounds[2]:
|
957
1015
|
verticals.append(bounds[2]) # x1
|
958
|
-
if axis in (
|
1016
|
+
if axis in ("horizontal", "both"):
|
959
1017
|
if not horizontals or horizontals[0] > bounds[1]:
|
960
1018
|
horizontals.insert(0, bounds[1]) # y0
|
961
1019
|
if not horizontals or horizontals[-1] < bounds[3]:
|
962
1020
|
horizontals.append(bounds[3]) # y1
|
963
|
-
|
1021
|
+
|
964
1022
|
# Remove duplicates and sort again
|
965
1023
|
verticals = sorted(list(set(verticals)))
|
966
1024
|
horizontals = sorted(list(set(horizontals)))
|
967
|
-
|
1025
|
+
|
968
1026
|
return cls(verticals=verticals, horizontals=horizontals, context=obj, bounds=bounds)
|
969
|
-
|
1027
|
+
|
970
1028
|
@classmethod
|
971
1029
|
def from_content(
|
972
1030
|
cls,
|
973
1031
|
obj: Union["Page", "Region"],
|
974
|
-
axis: Literal[
|
1032
|
+
axis: Literal["vertical", "horizontal"] = "vertical",
|
975
1033
|
markers: Union[str, List[str], "ElementCollection", None] = None,
|
976
|
-
align: Literal[
|
1034
|
+
align: Literal["left", "right", "center", "between"] = "left",
|
977
1035
|
outer: bool = True,
|
978
|
-
tolerance: float = 5
|
1036
|
+
tolerance: float = 5,
|
979
1037
|
) -> "Guides":
|
980
1038
|
"""
|
981
1039
|
Create guides based on text content positions.
|
982
|
-
|
1040
|
+
|
983
1041
|
Args:
|
984
1042
|
obj: Page or Region to search for content
|
985
1043
|
axis: Whether to create vertical or horizontal guides
|
986
1044
|
markers: Content to search for. Can be:
|
987
1045
|
- str: single selector (e.g., 'text:contains("Name")') or literal text
|
988
|
-
- List[str]: list of selectors or literal text strings
|
1046
|
+
- List[str]: list of selectors or literal text strings
|
989
1047
|
- ElementCollection: collection of elements to extract text from
|
990
1048
|
- None: no markers
|
991
1049
|
align: Where to place guides relative to found text
|
992
1050
|
outer: Whether to add guides at the boundaries
|
993
1051
|
tolerance: Maximum distance to search for text
|
994
|
-
|
1052
|
+
|
995
1053
|
Returns:
|
996
1054
|
New Guides object aligned to text content
|
997
1055
|
"""
|
998
1056
|
guides_coords = []
|
999
1057
|
bounds = None
|
1000
|
-
|
1058
|
+
|
1001
1059
|
# Get bounds from object
|
1002
|
-
if hasattr(obj,
|
1060
|
+
if hasattr(obj, "bbox"):
|
1003
1061
|
bounds = obj.bbox
|
1004
|
-
elif hasattr(obj,
|
1062
|
+
elif hasattr(obj, "x0"):
|
1005
1063
|
bounds = (obj.x0, obj.top, obj.x1, obj.bottom)
|
1006
|
-
elif hasattr(obj,
|
1064
|
+
elif hasattr(obj, "width"):
|
1007
1065
|
bounds = (0, 0, obj.width, obj.height)
|
1008
|
-
|
1066
|
+
|
1009
1067
|
# Normalize markers to list of text strings
|
1010
1068
|
marker_texts = _normalize_markers(markers, obj)
|
1011
|
-
|
1069
|
+
|
1012
1070
|
# Find each marker and determine guide position
|
1013
1071
|
for marker in marker_texts:
|
1014
|
-
if hasattr(obj,
|
1072
|
+
if hasattr(obj, "find"):
|
1015
1073
|
element = obj.find(f'text:contains("{marker}")')
|
1016
1074
|
if element:
|
1017
|
-
if axis ==
|
1018
|
-
if align ==
|
1075
|
+
if axis == "vertical":
|
1076
|
+
if align == "left":
|
1019
1077
|
guides_coords.append(element.x0)
|
1020
|
-
elif align ==
|
1078
|
+
elif align == "right":
|
1021
1079
|
guides_coords.append(element.x1)
|
1022
|
-
elif align ==
|
1080
|
+
elif align == "center":
|
1023
1081
|
guides_coords.append((element.x0 + element.x1) / 2)
|
1024
|
-
elif align ==
|
1082
|
+
elif align == "between":
|
1025
1083
|
# For between, collect left edges for processing later
|
1026
1084
|
guides_coords.append(element.x0)
|
1027
1085
|
else: # horizontal
|
1028
|
-
if align ==
|
1086
|
+
if align == "left": # top for horizontal
|
1029
1087
|
guides_coords.append(element.top)
|
1030
|
-
elif align ==
|
1088
|
+
elif align == "right": # bottom for horizontal
|
1031
1089
|
guides_coords.append(element.bottom)
|
1032
|
-
elif align ==
|
1090
|
+
elif align == "center":
|
1033
1091
|
guides_coords.append((element.top + element.bottom) / 2)
|
1034
|
-
elif align ==
|
1092
|
+
elif align == "between":
|
1035
1093
|
# For between, collect top edges for processing later
|
1036
1094
|
guides_coords.append(element.top)
|
1037
|
-
|
1095
|
+
|
1038
1096
|
# Handle 'between' alignment - find midpoints between adjacent markers
|
1039
|
-
if align ==
|
1097
|
+
if align == "between" and len(guides_coords) >= 2:
|
1040
1098
|
# We need to get the right and left edges of each marker
|
1041
1099
|
marker_bounds = []
|
1042
1100
|
for marker in marker_texts:
|
1043
|
-
if hasattr(obj,
|
1101
|
+
if hasattr(obj, "find"):
|
1044
1102
|
element = obj.find(f'text:contains("{marker}")')
|
1045
1103
|
if element:
|
1046
|
-
if axis ==
|
1104
|
+
if axis == "vertical":
|
1047
1105
|
marker_bounds.append((element.x0, element.x1))
|
1048
1106
|
else: # horizontal
|
1049
1107
|
marker_bounds.append((element.top, element.bottom))
|
1050
|
-
|
1108
|
+
|
1051
1109
|
# Sort markers by their left edge (or top edge for horizontal)
|
1052
1110
|
marker_bounds.sort(key=lambda x: x[0])
|
1053
|
-
|
1111
|
+
|
1054
1112
|
# Create guides at midpoints between adjacent markers
|
1055
1113
|
between_coords = []
|
1056
1114
|
for i in range(len(marker_bounds) - 1):
|
@@ -1059,79 +1117,78 @@ class Guides:
|
|
1059
1117
|
left_edge_next = marker_bounds[i + 1][0]
|
1060
1118
|
midpoint = (right_edge_current + left_edge_next) / 2
|
1061
1119
|
between_coords.append(midpoint)
|
1062
|
-
|
1120
|
+
|
1063
1121
|
guides_coords = between_coords
|
1064
|
-
|
1122
|
+
|
1065
1123
|
# Add outer guides if requested
|
1066
1124
|
if outer and bounds:
|
1067
|
-
if axis ==
|
1125
|
+
if axis == "vertical":
|
1068
1126
|
guides_coords.insert(0, bounds[0]) # x0
|
1069
|
-
guides_coords.append(bounds[2])
|
1127
|
+
guides_coords.append(bounds[2]) # x1
|
1070
1128
|
else:
|
1071
1129
|
guides_coords.insert(0, bounds[1]) # y0
|
1072
|
-
guides_coords.append(bounds[3])
|
1073
|
-
|
1130
|
+
guides_coords.append(bounds[3]) # y1
|
1131
|
+
|
1074
1132
|
# Remove duplicates and sort
|
1075
1133
|
guides_coords = sorted(list(set(guides_coords)))
|
1076
|
-
|
1134
|
+
|
1077
1135
|
# Create guides object
|
1078
|
-
if axis ==
|
1136
|
+
if axis == "vertical":
|
1079
1137
|
return cls(verticals=guides_coords, context=obj, bounds=bounds)
|
1080
1138
|
else:
|
1081
1139
|
return cls(horizontals=guides_coords, context=obj, bounds=bounds)
|
1082
|
-
|
1140
|
+
|
1083
1141
|
@classmethod
|
1084
1142
|
def from_whitespace(
|
1085
1143
|
cls,
|
1086
1144
|
obj: Union["Page", "Region"],
|
1087
|
-
axis: Literal[
|
1088
|
-
min_gap: float = 10
|
1145
|
+
axis: Literal["vertical", "horizontal", "both"] = "both",
|
1146
|
+
min_gap: float = 10,
|
1089
1147
|
) -> "Guides":
|
1090
1148
|
"""
|
1091
1149
|
Create guides by detecting whitespace gaps.
|
1092
|
-
|
1150
|
+
|
1093
1151
|
Args:
|
1094
1152
|
obj: Page or Region to analyze
|
1095
1153
|
min_gap: Minimum gap size to consider as whitespace
|
1096
1154
|
axis: Which axes to analyze for gaps
|
1097
|
-
|
1155
|
+
|
1098
1156
|
Returns:
|
1099
1157
|
New Guides object positioned at whitespace gaps
|
1100
1158
|
"""
|
1101
1159
|
# This is a placeholder - would need sophisticated gap detection
|
1102
1160
|
logger.info("Whitespace detection not yet implemented, using divide instead")
|
1103
1161
|
return cls.divide(obj, n=3, axis=axis)
|
1104
|
-
|
1162
|
+
|
1105
1163
|
@classmethod
|
1106
|
-
def new(
|
1107
|
-
cls,
|
1108
|
-
context: Optional[Union["Page", "Region"]] = None
|
1109
|
-
) -> "Guides":
|
1164
|
+
def new(cls, context: Optional[Union["Page", "Region"]] = None) -> "Guides":
|
1110
1165
|
"""
|
1111
1166
|
Create a new empty Guides object, optionally with a context.
|
1112
|
-
|
1167
|
+
|
1113
1168
|
This provides a clean way to start building guides through chaining:
|
1114
1169
|
guides = Guides.new(page).add_content(axis='vertical', markers=[...])
|
1115
|
-
|
1170
|
+
|
1116
1171
|
Args:
|
1117
1172
|
context: Optional Page or Region to use as default context for operations
|
1118
|
-
|
1173
|
+
|
1119
1174
|
Returns:
|
1120
1175
|
New empty Guides object
|
1121
1176
|
"""
|
1122
1177
|
return cls(verticals=[], horizontals=[], context=context)
|
1123
|
-
|
1178
|
+
|
1124
1179
|
# -------------------------------------------------------------------------
|
1125
1180
|
# Manipulation Methods
|
1126
1181
|
# -------------------------------------------------------------------------
|
1127
|
-
|
1182
|
+
|
1128
1183
|
def snap_to_whitespace(
|
1129
1184
|
self,
|
1130
|
-
axis: str =
|
1185
|
+
axis: str = "vertical",
|
1131
1186
|
min_gap: float = 10.0,
|
1132
|
-
detection_method: str =
|
1133
|
-
threshold: Union[
|
1134
|
-
|
1187
|
+
detection_method: str = "pixels", # 'pixels' or 'text'
|
1188
|
+
threshold: Union[
|
1189
|
+
float, str
|
1190
|
+
] = "auto", # threshold for what counts as a trough (0.0-1.0) or 'auto'
|
1191
|
+
on_no_snap: str = "warn",
|
1135
1192
|
) -> "Guides":
|
1136
1193
|
"""
|
1137
1194
|
Snap guides to nearby whitespace gaps (troughs) using optimal assignment.
|
@@ -1161,11 +1218,11 @@ class Guides:
|
|
1161
1218
|
logger.warning("No text elements found for whitespace detection")
|
1162
1219
|
return self
|
1163
1220
|
|
1164
|
-
if axis ==
|
1221
|
+
if axis == "vertical":
|
1165
1222
|
gaps = self._find_vertical_whitespace_gaps(text_elements, min_gap, threshold)
|
1166
1223
|
if gaps:
|
1167
1224
|
self._snap_guides_to_gaps(self.vertical.data, gaps, axis)
|
1168
|
-
elif axis ==
|
1225
|
+
elif axis == "horizontal":
|
1169
1226
|
gaps = self._find_horizontal_whitespace_gaps(text_elements, min_gap, threshold)
|
1170
1227
|
if gaps:
|
1171
1228
|
self._snap_guides_to_gaps(self.horizontal.data, gaps, axis)
|
@@ -1177,25 +1234,22 @@ class Guides:
|
|
1177
1234
|
self.horizontal.data[:] = [float(y) for y in self.horizontal.data]
|
1178
1235
|
|
1179
1236
|
return self
|
1180
|
-
|
1237
|
+
|
1181
1238
|
def shift(
|
1182
|
-
self,
|
1183
|
-
index: int,
|
1184
|
-
offset: float,
|
1185
|
-
axis: Literal['vertical', 'horizontal'] = 'vertical'
|
1239
|
+
self, index: int, offset: float, axis: Literal["vertical", "horizontal"] = "vertical"
|
1186
1240
|
) -> "Guides":
|
1187
1241
|
"""
|
1188
1242
|
Move a specific guide by a offset amount.
|
1189
|
-
|
1243
|
+
|
1190
1244
|
Args:
|
1191
1245
|
index: Index of the guide to move
|
1192
1246
|
offset: Amount to move (positive = right/down)
|
1193
1247
|
axis: Which guide list to modify
|
1194
|
-
|
1248
|
+
|
1195
1249
|
Returns:
|
1196
1250
|
Self for method chaining
|
1197
1251
|
"""
|
1198
|
-
if axis ==
|
1252
|
+
if axis == "vertical":
|
1199
1253
|
if 0 <= index < len(self.vertical):
|
1200
1254
|
self.vertical[index] += offset
|
1201
1255
|
self.vertical = sorted(self.vertical)
|
@@ -1207,123 +1261,127 @@ class Guides:
|
|
1207
1261
|
self.horizontal = sorted(self.horizontal)
|
1208
1262
|
else:
|
1209
1263
|
logger.warning(f"Horizontal guide index {index} out of range")
|
1210
|
-
|
1264
|
+
|
1211
1265
|
return self
|
1212
|
-
|
1266
|
+
|
1213
1267
|
def add_vertical(self, x: float) -> "Guides":
|
1214
1268
|
"""Add a vertical guide at the specified x-coordinate."""
|
1215
1269
|
self.vertical.append(x)
|
1216
1270
|
self.vertical = sorted(self.vertical)
|
1217
1271
|
return self
|
1218
|
-
|
1272
|
+
|
1219
1273
|
def add_horizontal(self, y: float) -> "Guides":
|
1220
1274
|
"""Add a horizontal guide at the specified y-coordinate."""
|
1221
1275
|
self.horizontal.append(y)
|
1222
1276
|
self.horizontal = sorted(self.horizontal)
|
1223
1277
|
return self
|
1224
|
-
|
1278
|
+
|
1225
1279
|
def remove_vertical(self, index: int) -> "Guides":
|
1226
1280
|
"""Remove a vertical guide by index."""
|
1227
1281
|
if 0 <= index < len(self.vertical):
|
1228
1282
|
self.vertical.pop(index)
|
1229
1283
|
return self
|
1230
|
-
|
1284
|
+
|
1231
1285
|
def remove_horizontal(self, index: int) -> "Guides":
|
1232
1286
|
"""Remove a horizontal guide by index."""
|
1233
1287
|
if 0 <= index < len(self.horizontal):
|
1234
1288
|
self.horizontal.pop(index)
|
1235
1289
|
return self
|
1236
|
-
|
1290
|
+
|
1237
1291
|
# -------------------------------------------------------------------------
|
1238
1292
|
# Operations
|
1239
1293
|
# -------------------------------------------------------------------------
|
1240
|
-
|
1294
|
+
|
1241
1295
|
def __add__(self, other: "Guides") -> "Guides":
|
1242
1296
|
"""
|
1243
1297
|
Combine two guide sets.
|
1244
|
-
|
1298
|
+
|
1245
1299
|
Returns:
|
1246
1300
|
New Guides object with combined coordinates
|
1247
1301
|
"""
|
1248
1302
|
# Combine and deduplicate coordinates, ensuring Python floats
|
1249
1303
|
combined_verticals = sorted([float(x) for x in set(self.vertical + other.vertical)])
|
1250
1304
|
combined_horizontals = sorted([float(y) for y in set(self.horizontal + other.horizontal)])
|
1251
|
-
|
1305
|
+
|
1252
1306
|
# Use context from self if available
|
1253
1307
|
return Guides(
|
1254
1308
|
verticals=combined_verticals,
|
1255
1309
|
horizontals=combined_horizontals,
|
1256
1310
|
context=self.context or other.context,
|
1257
|
-
bounds=self.bounds or other.bounds
|
1311
|
+
bounds=self.bounds or other.bounds,
|
1258
1312
|
)
|
1259
|
-
|
1313
|
+
|
1260
1314
|
def show(self, on=None, **kwargs):
|
1261
1315
|
"""
|
1262
1316
|
Display the guides overlaid on a page or region.
|
1263
|
-
|
1317
|
+
|
1264
1318
|
Args:
|
1265
1319
|
on: Page, Region, PIL Image, or string to display guides on.
|
1266
1320
|
If None, uses self.context (the object guides were created from).
|
1267
1321
|
If string 'page', uses the page from self.context.
|
1268
1322
|
**kwargs: Additional arguments passed to to_image() if applicable.
|
1269
|
-
|
1323
|
+
|
1270
1324
|
Returns:
|
1271
1325
|
PIL Image with guides drawn on it.
|
1272
1326
|
"""
|
1273
1327
|
# Determine what to display guides on
|
1274
1328
|
target = on if on is not None else self.context
|
1275
|
-
|
1329
|
+
|
1276
1330
|
# Handle string shortcuts
|
1277
1331
|
if isinstance(target, str):
|
1278
|
-
if target ==
|
1279
|
-
if hasattr(self.context,
|
1332
|
+
if target == "page":
|
1333
|
+
if hasattr(self.context, "page"):
|
1280
1334
|
target = self.context.page
|
1281
|
-
elif hasattr(self.context,
|
1335
|
+
elif hasattr(self.context, "_page"):
|
1282
1336
|
target = self.context._page
|
1283
1337
|
else:
|
1284
1338
|
raise ValueError("Cannot resolve 'page' - context has no page attribute")
|
1285
1339
|
else:
|
1286
1340
|
raise ValueError(f"Unknown string target: {target}. Only 'page' is supported.")
|
1287
|
-
|
1341
|
+
|
1288
1342
|
if target is None:
|
1289
1343
|
raise ValueError("No target specified and no context available for guides display")
|
1290
|
-
|
1344
|
+
|
1291
1345
|
# Prepare kwargs for image generation
|
1292
1346
|
image_kwargs = kwargs.copy()
|
1293
|
-
|
1347
|
+
|
1294
1348
|
# Always turn off highlights to avoid visual clutter
|
1295
|
-
image_kwargs[
|
1296
|
-
|
1349
|
+
image_kwargs["include_highlights"] = False
|
1350
|
+
|
1297
1351
|
# If target is a region-like object, crop to just that region
|
1298
|
-
if hasattr(target,
|
1352
|
+
if hasattr(target, "bbox") and hasattr(target, "page"):
|
1299
1353
|
# This is likely a Region
|
1300
|
-
image_kwargs[
|
1301
|
-
|
1354
|
+
image_kwargs["crop"] = True
|
1355
|
+
|
1302
1356
|
# Get base image
|
1303
|
-
if hasattr(target,
|
1357
|
+
if hasattr(target, "to_image"):
|
1304
1358
|
img = target.to_image(**image_kwargs)
|
1305
|
-
elif hasattr(target,
|
1359
|
+
elif hasattr(target, "mode") and hasattr(target, "size"):
|
1306
1360
|
# It's already a PIL Image
|
1307
1361
|
img = target
|
1308
1362
|
else:
|
1309
1363
|
raise ValueError(f"Object {target} does not support to_image() and is not a PIL Image")
|
1310
|
-
|
1364
|
+
|
1311
1365
|
if img is None:
|
1312
1366
|
raise ValueError("Failed to generate base image")
|
1313
|
-
|
1367
|
+
|
1314
1368
|
# Create a copy to draw on
|
1315
1369
|
img = img.copy()
|
1316
1370
|
draw = ImageDraw.Draw(img)
|
1317
|
-
|
1371
|
+
|
1318
1372
|
# Determine scale factor for coordinate conversion
|
1319
|
-
if
|
1373
|
+
if (
|
1374
|
+
hasattr(target, "width")
|
1375
|
+
and hasattr(target, "height")
|
1376
|
+
and not (hasattr(target, "mode") and hasattr(target, "size"))
|
1377
|
+
):
|
1320
1378
|
# target is a PDF object (Page/Region) with PDF coordinates
|
1321
1379
|
scale_x = img.width / target.width
|
1322
1380
|
scale_y = img.height / target.height
|
1323
|
-
|
1381
|
+
|
1324
1382
|
# If we're showing guides on a region, we need to adjust coordinates
|
1325
1383
|
# to be relative to the region's origin
|
1326
|
-
if hasattr(target,
|
1384
|
+
if hasattr(target, "bbox") and hasattr(target, "page"):
|
1327
1385
|
# This is a Region - adjust guide coordinates to be relative to region
|
1328
1386
|
region_x0, region_top = target.x0, target.top
|
1329
1387
|
else:
|
@@ -1334,7 +1392,7 @@ class Guides:
|
|
1334
1392
|
scale_x = 1.0
|
1335
1393
|
scale_y = 1.0
|
1336
1394
|
region_x0, region_top = 0, 0
|
1337
|
-
|
1395
|
+
|
1338
1396
|
# Draw vertical guides (blue)
|
1339
1397
|
for x_coord in self.vertical:
|
1340
1398
|
# Adjust coordinate if we're showing on a region
|
@@ -1344,8 +1402,8 @@ class Guides:
|
|
1344
1402
|
if 0 <= pixel_x <= img.width - 1:
|
1345
1403
|
x_pixel = int(min(pixel_x, img.width - 1))
|
1346
1404
|
draw.line([(x_pixel, 0), (x_pixel, img.height - 1)], fill=(0, 0, 255, 200), width=2)
|
1347
|
-
|
1348
|
-
# Draw horizontal guides (red)
|
1405
|
+
|
1406
|
+
# Draw horizontal guides (red)
|
1349
1407
|
for y_coord in self.horizontal:
|
1350
1408
|
# Adjust coordinate if we're showing on a region
|
1351
1409
|
adjusted_y = y_coord - region_top
|
@@ -1354,22 +1412,22 @@ class Guides:
|
|
1354
1412
|
if 0 <= pixel_y <= img.height - 1:
|
1355
1413
|
y_pixel = int(min(pixel_y, img.height - 1))
|
1356
1414
|
draw.line([(0, y_pixel), (img.width - 1, y_pixel)], fill=(255, 0, 0, 200), width=2)
|
1357
|
-
|
1415
|
+
|
1358
1416
|
return img
|
1359
|
-
|
1417
|
+
|
1360
1418
|
# -------------------------------------------------------------------------
|
1361
1419
|
# Utility Methods
|
1362
1420
|
# -------------------------------------------------------------------------
|
1363
|
-
|
1421
|
+
|
1364
1422
|
def get_cells(self) -> List[Tuple[float, float, float, float]]:
|
1365
1423
|
"""
|
1366
1424
|
Get all cell bounding boxes from guide intersections.
|
1367
|
-
|
1425
|
+
|
1368
1426
|
Returns:
|
1369
1427
|
List of (x0, y0, x1, y1) tuples for each cell
|
1370
1428
|
"""
|
1371
1429
|
cells = []
|
1372
|
-
|
1430
|
+
|
1373
1431
|
# Create cells from guide intersections
|
1374
1432
|
for i in range(len(self.vertical) - 1):
|
1375
1433
|
for j in range(len(self.horizontal) - 1):
|
@@ -1378,135 +1436,139 @@ class Guides:
|
|
1378
1436
|
y0 = self.horizontal[j]
|
1379
1437
|
y1 = self.horizontal[j + 1]
|
1380
1438
|
cells.append((x0, y0, x1, y1))
|
1381
|
-
|
1439
|
+
|
1382
1440
|
return cells
|
1383
|
-
|
1441
|
+
|
1384
1442
|
def to_dict(self) -> Dict[str, Any]:
|
1385
1443
|
"""
|
1386
1444
|
Convert to dictionary format suitable for pdfplumber table_settings.
|
1387
|
-
|
1445
|
+
|
1388
1446
|
Returns:
|
1389
1447
|
Dictionary with explicit_vertical_lines and explicit_horizontal_lines
|
1390
1448
|
"""
|
1391
1449
|
return {
|
1392
|
-
|
1393
|
-
|
1450
|
+
"explicit_vertical_lines": self.vertical,
|
1451
|
+
"explicit_horizontal_lines": self.horizontal,
|
1394
1452
|
}
|
1395
|
-
|
1453
|
+
|
1396
1454
|
def to_relative(self) -> "Guides":
|
1397
1455
|
"""
|
1398
1456
|
Convert absolute coordinates to relative (0-1) coordinates.
|
1399
|
-
|
1457
|
+
|
1400
1458
|
Returns:
|
1401
1459
|
New Guides object with relative coordinates
|
1402
1460
|
"""
|
1403
1461
|
if self.relative:
|
1404
1462
|
return self # Already relative
|
1405
|
-
|
1463
|
+
|
1406
1464
|
if not self.bounds:
|
1407
1465
|
raise ValueError("Cannot convert to relative without bounds")
|
1408
|
-
|
1466
|
+
|
1409
1467
|
x0, y0, x1, y1 = self.bounds
|
1410
1468
|
width = x1 - x0
|
1411
1469
|
height = y1 - y0
|
1412
|
-
|
1470
|
+
|
1413
1471
|
rel_verticals = [(x - x0) / width for x in self.vertical]
|
1414
1472
|
rel_horizontals = [(y - y0) / height for y in self.horizontal]
|
1415
|
-
|
1473
|
+
|
1416
1474
|
return Guides(
|
1417
1475
|
verticals=rel_verticals,
|
1418
1476
|
horizontals=rel_horizontals,
|
1419
1477
|
context=self.context,
|
1420
1478
|
bounds=(0, 0, 1, 1),
|
1421
|
-
relative=True
|
1479
|
+
relative=True,
|
1422
1480
|
)
|
1423
|
-
|
1481
|
+
|
1424
1482
|
def to_absolute(self, bounds: Tuple[float, float, float, float]) -> "Guides":
|
1425
1483
|
"""
|
1426
1484
|
Convert relative coordinates to absolute coordinates.
|
1427
|
-
|
1485
|
+
|
1428
1486
|
Args:
|
1429
1487
|
bounds: Target bounding box (x0, y0, x1, y1)
|
1430
|
-
|
1488
|
+
|
1431
1489
|
Returns:
|
1432
1490
|
New Guides object with absolute coordinates
|
1433
1491
|
"""
|
1434
1492
|
if not self.relative:
|
1435
1493
|
return self # Already absolute
|
1436
|
-
|
1494
|
+
|
1437
1495
|
x0, y0, x1, y1 = bounds
|
1438
1496
|
width = x1 - x0
|
1439
1497
|
height = y1 - y0
|
1440
|
-
|
1498
|
+
|
1441
1499
|
abs_verticals = [x0 + x * width for x in self.vertical]
|
1442
1500
|
abs_horizontals = [y0 + y * height for y in self.horizontal]
|
1443
|
-
|
1501
|
+
|
1444
1502
|
return Guides(
|
1445
1503
|
verticals=abs_verticals,
|
1446
1504
|
horizontals=abs_horizontals,
|
1447
1505
|
context=self.context,
|
1448
1506
|
bounds=bounds,
|
1449
|
-
relative=False
|
1507
|
+
relative=False,
|
1450
1508
|
)
|
1451
|
-
|
1509
|
+
|
1452
1510
|
@property
|
1453
1511
|
def n_rows(self) -> int:
|
1454
1512
|
"""Number of rows defined by horizontal guides."""
|
1455
1513
|
return max(0, len(self.horizontal) - 1)
|
1456
|
-
|
1514
|
+
|
1457
1515
|
@property
|
1458
1516
|
def n_cols(self) -> int:
|
1459
1517
|
"""Number of columns defined by vertical guides."""
|
1460
1518
|
return max(0, len(self.vertical) - 1)
|
1461
|
-
|
1519
|
+
|
1462
1520
|
def _handle_snap_failure(self, message: str):
|
1463
1521
|
"""Handle cases where snapping cannot be performed."""
|
1464
|
-
if hasattr(self,
|
1465
|
-
if self.on_no_snap ==
|
1522
|
+
if hasattr(self, "on_no_snap"):
|
1523
|
+
if self.on_no_snap == "warn":
|
1466
1524
|
logger.warning(message)
|
1467
|
-
elif self.on_no_snap ==
|
1525
|
+
elif self.on_no_snap == "raise":
|
1468
1526
|
raise ValueError(message)
|
1469
1527
|
# 'ignore' case: do nothing
|
1470
1528
|
else:
|
1471
1529
|
logger.warning(message) # Default behavior
|
1472
1530
|
|
1473
|
-
def _find_vertical_whitespace_gaps(
|
1531
|
+
def _find_vertical_whitespace_gaps(
|
1532
|
+
self, text_elements, min_gap: float, threshold: Union[float, str] = "auto"
|
1533
|
+
) -> List[Tuple[float, float]]:
|
1474
1534
|
"""
|
1475
1535
|
Find vertical whitespace gaps using bbox-based density analysis.
|
1476
1536
|
Returns list of (start, end) tuples representing trough ranges.
|
1477
1537
|
"""
|
1478
1538
|
if not self.bounds:
|
1479
1539
|
return []
|
1480
|
-
|
1540
|
+
|
1481
1541
|
x0, _, x1, _ = self.bounds
|
1482
1542
|
width_pixels = int(x1 - x0)
|
1483
|
-
|
1543
|
+
|
1484
1544
|
if width_pixels <= 0:
|
1485
1545
|
return []
|
1486
|
-
|
1546
|
+
|
1487
1547
|
# Create density histogram: count bbox overlaps per x-coordinate
|
1488
1548
|
density = np.zeros(width_pixels)
|
1489
|
-
|
1549
|
+
|
1490
1550
|
for element in text_elements:
|
1491
|
-
if not hasattr(element,
|
1551
|
+
if not hasattr(element, "x0") or not hasattr(element, "x1"):
|
1492
1552
|
continue
|
1493
|
-
|
1553
|
+
|
1494
1554
|
# Clip coordinates to bounds
|
1495
1555
|
elem_x0 = max(x0, element.x0) - x0
|
1496
1556
|
elem_x1 = min(x1, element.x1) - x0
|
1497
|
-
|
1557
|
+
|
1498
1558
|
if elem_x1 > elem_x0:
|
1499
1559
|
start_px = int(elem_x0)
|
1500
1560
|
end_px = int(elem_x1)
|
1501
1561
|
density[start_px:end_px] += 1
|
1502
|
-
|
1562
|
+
|
1503
1563
|
if density.max() == 0:
|
1504
1564
|
return []
|
1505
|
-
|
1565
|
+
|
1506
1566
|
# Determine the threshold value
|
1507
|
-
if threshold ==
|
1567
|
+
if threshold == "auto":
|
1508
1568
|
# Auto mode: try different thresholds with step 0.05 until we have enough troughs
|
1509
|
-
guides_needing_troughs = len(
|
1569
|
+
guides_needing_troughs = len(
|
1570
|
+
[g for i, g in enumerate(self.vertical) if 0 < i < len(self.vertical) - 1]
|
1571
|
+
)
|
1510
1572
|
if guides_needing_troughs == 0:
|
1511
1573
|
threshold_val = 0.5 # Default when no guides need placement
|
1512
1574
|
else:
|
@@ -1515,9 +1577,11 @@ class Guides:
|
|
1515
1577
|
test_gaps = self._find_gaps_with_threshold(density, test_threshold, min_gap, x0)
|
1516
1578
|
if len(test_gaps) >= guides_needing_troughs:
|
1517
1579
|
threshold_val = test_threshold
|
1518
|
-
logger.debug(
|
1580
|
+
logger.debug(
|
1581
|
+
f"Auto threshold found: {test_threshold:.2f} (found {len(test_gaps)} troughs for {guides_needing_troughs} guides)"
|
1582
|
+
)
|
1519
1583
|
break
|
1520
|
-
|
1584
|
+
|
1521
1585
|
if threshold_val is None:
|
1522
1586
|
threshold_val = 0.8 # Fallback to permissive threshold
|
1523
1587
|
logger.debug(f"Auto threshold fallback to {threshold_val}")
|
@@ -1526,93 +1590,103 @@ class Guides:
|
|
1526
1590
|
if not isinstance(threshold, (int, float)) or not (0.0 <= threshold <= 1.0):
|
1527
1591
|
raise ValueError("threshold must be a number between 0.0 and 1.0, or 'auto'")
|
1528
1592
|
threshold_val = float(threshold)
|
1529
|
-
|
1593
|
+
|
1530
1594
|
return self._find_gaps_with_threshold(density, threshold_val, min_gap, x0)
|
1531
|
-
|
1595
|
+
|
1532
1596
|
def _find_gaps_with_threshold(self, density, threshold_val, min_gap, x0):
|
1533
1597
|
"""Helper method to find gaps given a specific threshold value."""
|
1534
1598
|
max_density = density.max()
|
1535
1599
|
threshold_density = threshold_val * max_density
|
1536
|
-
|
1600
|
+
|
1537
1601
|
# Smooth the density for better trough detection
|
1538
1602
|
from scipy.ndimage import gaussian_filter1d
|
1603
|
+
|
1539
1604
|
smoothed_density = gaussian_filter1d(density.astype(float), sigma=1.0)
|
1540
|
-
|
1605
|
+
|
1541
1606
|
# Find regions below threshold
|
1542
1607
|
below_threshold = smoothed_density <= threshold_density
|
1543
|
-
|
1608
|
+
|
1544
1609
|
# Find contiguous regions
|
1545
1610
|
from scipy.ndimage import label as nd_label
|
1611
|
+
|
1546
1612
|
labeled_regions, num_regions = nd_label(below_threshold)
|
1547
|
-
|
1613
|
+
|
1548
1614
|
gaps = []
|
1549
1615
|
for region_id in range(1, num_regions + 1):
|
1550
1616
|
region_mask = labeled_regions == region_id
|
1551
1617
|
region_indices = np.where(region_mask)[0]
|
1552
|
-
|
1618
|
+
|
1553
1619
|
if len(region_indices) == 0:
|
1554
1620
|
continue
|
1555
|
-
|
1621
|
+
|
1556
1622
|
start_px = region_indices[0]
|
1557
1623
|
end_px = region_indices[-1] + 1
|
1558
|
-
|
1624
|
+
|
1559
1625
|
# Convert back to PDF coordinates
|
1560
1626
|
start_pdf = x0 + start_px
|
1561
1627
|
end_pdf = x0 + end_px
|
1562
|
-
|
1628
|
+
|
1563
1629
|
# Check minimum gap size
|
1564
1630
|
if end_pdf - start_pdf >= min_gap:
|
1565
1631
|
gaps.append((start_pdf, end_pdf))
|
1566
|
-
|
1632
|
+
|
1567
1633
|
return gaps
|
1568
1634
|
|
1569
|
-
def _find_horizontal_whitespace_gaps(
|
1635
|
+
def _find_horizontal_whitespace_gaps(
|
1636
|
+
self, text_elements, min_gap: float, threshold: Union[float, str] = "auto"
|
1637
|
+
) -> List[Tuple[float, float]]:
|
1570
1638
|
"""
|
1571
1639
|
Find horizontal whitespace gaps using bbox-based density analysis.
|
1572
1640
|
Returns list of (start, end) tuples representing trough ranges.
|
1573
1641
|
"""
|
1574
1642
|
if not self.bounds:
|
1575
1643
|
return []
|
1576
|
-
|
1644
|
+
|
1577
1645
|
_, y0, _, y1 = self.bounds
|
1578
1646
|
height_pixels = int(y1 - y0)
|
1579
|
-
|
1647
|
+
|
1580
1648
|
if height_pixels <= 0:
|
1581
1649
|
return []
|
1582
|
-
|
1583
|
-
# Create density histogram: count bbox overlaps per y-coordinate
|
1650
|
+
|
1651
|
+
# Create density histogram: count bbox overlaps per y-coordinate
|
1584
1652
|
density = np.zeros(height_pixels)
|
1585
|
-
|
1653
|
+
|
1586
1654
|
for element in text_elements:
|
1587
|
-
if not hasattr(element,
|
1655
|
+
if not hasattr(element, "top") or not hasattr(element, "bottom"):
|
1588
1656
|
continue
|
1589
|
-
|
1657
|
+
|
1590
1658
|
# Clip coordinates to bounds
|
1591
1659
|
elem_top = max(y0, element.top) - y0
|
1592
1660
|
elem_bottom = min(y1, element.bottom) - y0
|
1593
|
-
|
1661
|
+
|
1594
1662
|
if elem_bottom > elem_top:
|
1595
1663
|
start_px = int(elem_top)
|
1596
1664
|
end_px = int(elem_bottom)
|
1597
1665
|
density[start_px:end_px] += 1
|
1598
|
-
|
1666
|
+
|
1599
1667
|
if density.max() == 0:
|
1600
1668
|
return []
|
1601
|
-
|
1669
|
+
|
1602
1670
|
# Determine the threshold value (same logic as vertical)
|
1603
|
-
if threshold ==
|
1604
|
-
guides_needing_troughs = len(
|
1671
|
+
if threshold == "auto":
|
1672
|
+
guides_needing_troughs = len(
|
1673
|
+
[g for i, g in enumerate(self.horizontal) if 0 < i < len(self.horizontal) - 1]
|
1674
|
+
)
|
1605
1675
|
if guides_needing_troughs == 0:
|
1606
1676
|
threshold_val = 0.5 # Default when no guides need placement
|
1607
1677
|
else:
|
1608
1678
|
threshold_val = None
|
1609
1679
|
for test_threshold in np.arange(0.1, 1.0, 0.05):
|
1610
|
-
test_gaps = self._find_gaps_with_threshold_horizontal(
|
1680
|
+
test_gaps = self._find_gaps_with_threshold_horizontal(
|
1681
|
+
density, test_threshold, min_gap, y0
|
1682
|
+
)
|
1611
1683
|
if len(test_gaps) >= guides_needing_troughs:
|
1612
1684
|
threshold_val = test_threshold
|
1613
|
-
logger.debug(
|
1685
|
+
logger.debug(
|
1686
|
+
f"Auto threshold found: {test_threshold:.2f} (found {len(test_gaps)} troughs for {guides_needing_troughs} guides)"
|
1687
|
+
)
|
1614
1688
|
break
|
1615
|
-
|
1689
|
+
|
1616
1690
|
if threshold_val is None:
|
1617
1691
|
threshold_val = 0.8 # Fallback to permissive threshold
|
1618
1692
|
logger.debug(f"Auto threshold fallback to {threshold_val}")
|
@@ -1621,141 +1695,157 @@ class Guides:
|
|
1621
1695
|
if not isinstance(threshold, (int, float)) or not (0.0 <= threshold <= 1.0):
|
1622
1696
|
raise ValueError("threshold must be a number between 0.0 and 1.0, or 'auto'")
|
1623
1697
|
threshold_val = float(threshold)
|
1624
|
-
|
1698
|
+
|
1625
1699
|
return self._find_gaps_with_threshold_horizontal(density, threshold_val, min_gap, y0)
|
1626
|
-
|
1700
|
+
|
1627
1701
|
def _find_gaps_with_threshold_horizontal(self, density, threshold_val, min_gap, y0):
|
1628
1702
|
"""Helper method to find horizontal gaps given a specific threshold value."""
|
1629
1703
|
max_density = density.max()
|
1630
1704
|
threshold_density = threshold_val * max_density
|
1631
|
-
|
1705
|
+
|
1632
1706
|
# Smooth the density for better trough detection
|
1633
1707
|
from scipy.ndimage import gaussian_filter1d
|
1708
|
+
|
1634
1709
|
smoothed_density = gaussian_filter1d(density.astype(float), sigma=1.0)
|
1635
|
-
|
1710
|
+
|
1636
1711
|
# Find regions below threshold
|
1637
1712
|
below_threshold = smoothed_density <= threshold_density
|
1638
|
-
|
1713
|
+
|
1639
1714
|
# Find contiguous regions
|
1640
1715
|
from scipy.ndimage import label as nd_label
|
1716
|
+
|
1641
1717
|
labeled_regions, num_regions = nd_label(below_threshold)
|
1642
|
-
|
1718
|
+
|
1643
1719
|
gaps = []
|
1644
1720
|
for region_id in range(1, num_regions + 1):
|
1645
1721
|
region_mask = labeled_regions == region_id
|
1646
1722
|
region_indices = np.where(region_mask)[0]
|
1647
|
-
|
1723
|
+
|
1648
1724
|
if len(region_indices) == 0:
|
1649
1725
|
continue
|
1650
|
-
|
1726
|
+
|
1651
1727
|
start_px = region_indices[0]
|
1652
1728
|
end_px = region_indices[-1] + 1
|
1653
|
-
|
1729
|
+
|
1654
1730
|
# Convert back to PDF coordinates
|
1655
1731
|
start_pdf = y0 + start_px
|
1656
1732
|
end_pdf = y0 + end_px
|
1657
|
-
|
1733
|
+
|
1658
1734
|
# Check minimum gap size
|
1659
1735
|
if end_pdf - start_pdf >= min_gap:
|
1660
1736
|
gaps.append((start_pdf, end_pdf))
|
1661
|
-
|
1737
|
+
|
1662
1738
|
return gaps
|
1663
|
-
|
1664
|
-
def _find_vertical_element_gaps(
|
1739
|
+
|
1740
|
+
def _find_vertical_element_gaps(
|
1741
|
+
self, text_elements, min_gap: float
|
1742
|
+
) -> List[Tuple[float, float]]:
|
1665
1743
|
"""
|
1666
1744
|
Find vertical whitespace gaps using text element spacing analysis.
|
1667
1745
|
Returns list of (start, end) tuples representing trough ranges.
|
1668
1746
|
"""
|
1669
1747
|
if not self.bounds or not text_elements:
|
1670
1748
|
return []
|
1671
|
-
|
1749
|
+
|
1672
1750
|
x0, _, x1, _ = self.bounds
|
1673
|
-
|
1751
|
+
|
1674
1752
|
# Get all element right and left edges
|
1675
1753
|
element_edges = []
|
1676
1754
|
for element in text_elements:
|
1677
|
-
if not hasattr(element,
|
1755
|
+
if not hasattr(element, "x0") or not hasattr(element, "x1"):
|
1678
1756
|
continue
|
1679
1757
|
# Only include elements that overlap vertically with our bounds
|
1680
|
-
if hasattr(element,
|
1758
|
+
if hasattr(element, "top") and hasattr(element, "bottom"):
|
1681
1759
|
if element.bottom < self.bounds[1] or element.top > self.bounds[3]:
|
1682
1760
|
continue
|
1683
1761
|
element_edges.extend([element.x0, element.x1])
|
1684
|
-
|
1762
|
+
|
1685
1763
|
if not element_edges:
|
1686
1764
|
return []
|
1687
|
-
|
1765
|
+
|
1688
1766
|
# Sort edges and find gaps
|
1689
1767
|
element_edges = sorted(set(element_edges))
|
1690
|
-
|
1768
|
+
|
1691
1769
|
trough_ranges = []
|
1692
1770
|
for i in range(len(element_edges) - 1):
|
1693
1771
|
gap_start = element_edges[i]
|
1694
1772
|
gap_end = element_edges[i + 1]
|
1695
1773
|
gap_width = gap_end - gap_start
|
1696
|
-
|
1774
|
+
|
1697
1775
|
if gap_width >= min_gap:
|
1698
1776
|
# Check if this gap actually contains no text (is empty space)
|
1699
1777
|
gap_has_text = False
|
1700
1778
|
for element in text_elements:
|
1701
|
-
if (
|
1702
|
-
element
|
1779
|
+
if (
|
1780
|
+
hasattr(element, "x0")
|
1781
|
+
and hasattr(element, "x1")
|
1782
|
+
and element.x0 < gap_end
|
1783
|
+
and element.x1 > gap_start
|
1784
|
+
):
|
1703
1785
|
gap_has_text = True
|
1704
1786
|
break
|
1705
|
-
|
1787
|
+
|
1706
1788
|
if not gap_has_text:
|
1707
1789
|
trough_ranges.append((gap_start, gap_end))
|
1708
|
-
|
1790
|
+
|
1709
1791
|
return trough_ranges
|
1710
|
-
|
1711
|
-
def _find_horizontal_element_gaps(
|
1792
|
+
|
1793
|
+
def _find_horizontal_element_gaps(
|
1794
|
+
self, text_elements, min_gap: float
|
1795
|
+
) -> List[Tuple[float, float]]:
|
1712
1796
|
"""
|
1713
1797
|
Find horizontal whitespace gaps using text element spacing analysis.
|
1714
1798
|
Returns list of (start, end) tuples representing trough ranges.
|
1715
1799
|
"""
|
1716
1800
|
if not self.bounds or not text_elements:
|
1717
1801
|
return []
|
1718
|
-
|
1802
|
+
|
1719
1803
|
_, y0, _, y1 = self.bounds
|
1720
|
-
|
1804
|
+
|
1721
1805
|
# Get all element top and bottom edges
|
1722
1806
|
element_edges = []
|
1723
1807
|
for element in text_elements:
|
1724
|
-
if not hasattr(element,
|
1808
|
+
if not hasattr(element, "top") or not hasattr(element, "bottom"):
|
1725
1809
|
continue
|
1726
1810
|
# Only include elements that overlap horizontally with our bounds
|
1727
|
-
if hasattr(element,
|
1811
|
+
if hasattr(element, "x0") and hasattr(element, "x1"):
|
1728
1812
|
if element.x1 < self.bounds[0] or element.x0 > self.bounds[2]:
|
1729
1813
|
continue
|
1730
1814
|
element_edges.extend([element.top, element.bottom])
|
1731
|
-
|
1815
|
+
|
1732
1816
|
if not element_edges:
|
1733
1817
|
return []
|
1734
|
-
|
1818
|
+
|
1735
1819
|
# Sort edges and find gaps
|
1736
1820
|
element_edges = sorted(set(element_edges))
|
1737
|
-
|
1821
|
+
|
1738
1822
|
trough_ranges = []
|
1739
1823
|
for i in range(len(element_edges) - 1):
|
1740
1824
|
gap_start = element_edges[i]
|
1741
1825
|
gap_end = element_edges[i + 1]
|
1742
1826
|
gap_width = gap_end - gap_start
|
1743
|
-
|
1827
|
+
|
1744
1828
|
if gap_width >= min_gap:
|
1745
1829
|
# Check if this gap actually contains no text (is empty space)
|
1746
1830
|
gap_has_text = False
|
1747
1831
|
for element in text_elements:
|
1748
|
-
if (
|
1749
|
-
element
|
1832
|
+
if (
|
1833
|
+
hasattr(element, "top")
|
1834
|
+
and hasattr(element, "bottom")
|
1835
|
+
and element.top < gap_end
|
1836
|
+
and element.bottom > gap_start
|
1837
|
+
):
|
1750
1838
|
gap_has_text = True
|
1751
1839
|
break
|
1752
|
-
|
1840
|
+
|
1753
1841
|
if not gap_has_text:
|
1754
1842
|
trough_ranges.append((gap_start, gap_end))
|
1755
|
-
|
1843
|
+
|
1756
1844
|
return trough_ranges
|
1757
|
-
|
1758
|
-
def _optimal_guide_assignment(
|
1845
|
+
|
1846
|
+
def _optimal_guide_assignment(
|
1847
|
+
self, guides: List[float], trough_ranges: List[Tuple[float, float]]
|
1848
|
+
) -> Dict[int, int]:
|
1759
1849
|
"""
|
1760
1850
|
Assign guides to trough ranges using the user's desired logic:
|
1761
1851
|
- Guides already in a trough stay put
|
@@ -1764,18 +1854,20 @@ class Guides:
|
|
1764
1854
|
"""
|
1765
1855
|
if not guides or not trough_ranges:
|
1766
1856
|
return {}
|
1767
|
-
|
1857
|
+
|
1768
1858
|
assignments = {}
|
1769
|
-
|
1859
|
+
|
1770
1860
|
# Step 1: Identify which guides are already in troughs
|
1771
1861
|
guides_in_troughs = set()
|
1772
1862
|
for i, guide_pos in enumerate(guides):
|
1773
1863
|
for trough_start, trough_end in trough_ranges:
|
1774
1864
|
if trough_start <= guide_pos <= trough_end:
|
1775
1865
|
guides_in_troughs.add(i)
|
1776
|
-
logger.debug(
|
1866
|
+
logger.debug(
|
1867
|
+
f"Guide {i} (pos {guide_pos:.1f}) is already in trough ({trough_start:.1f}-{trough_end:.1f}), keeping in place"
|
1868
|
+
)
|
1777
1869
|
break
|
1778
|
-
|
1870
|
+
|
1779
1871
|
# Step 2: Identify which troughs are already occupied
|
1780
1872
|
occupied_troughs = set()
|
1781
1873
|
for i in guides_in_troughs:
|
@@ -1784,21 +1876,23 @@ class Guides:
|
|
1784
1876
|
if trough_start <= guide_pos <= trough_end:
|
1785
1877
|
occupied_troughs.add(j)
|
1786
1878
|
break
|
1787
|
-
|
1879
|
+
|
1788
1880
|
# Step 3: Find guides that need reassignment (not in any trough)
|
1789
1881
|
guides_to_move = []
|
1790
1882
|
for i, guide_pos in enumerate(guides):
|
1791
1883
|
if i not in guides_in_troughs:
|
1792
1884
|
guides_to_move.append(i)
|
1793
|
-
logger.debug(
|
1794
|
-
|
1885
|
+
logger.debug(
|
1886
|
+
f"Guide {i} (pos {guide_pos:.1f}) is NOT in any trough, needs reassignment"
|
1887
|
+
)
|
1888
|
+
|
1795
1889
|
# Step 4: Find available troughs (not occupied by existing guides)
|
1796
1890
|
available_troughs = []
|
1797
1891
|
for j, (trough_start, trough_end) in enumerate(trough_ranges):
|
1798
1892
|
if j not in occupied_troughs:
|
1799
1893
|
available_troughs.append(j)
|
1800
1894
|
logger.debug(f"Trough {j} ({trough_start:.1f}-{trough_end:.1f}) is available")
|
1801
|
-
|
1895
|
+
|
1802
1896
|
# Step 5: Assign guides to move to closest available troughs
|
1803
1897
|
if guides_to_move and available_troughs:
|
1804
1898
|
# Calculate distances for all combinations
|
@@ -1810,20 +1904,22 @@ class Guides:
|
|
1810
1904
|
trough_center = (trough_start + trough_end) / 2
|
1811
1905
|
distance = abs(guide_pos - trough_center)
|
1812
1906
|
distances.append((distance, guide_idx, trough_idx))
|
1813
|
-
|
1907
|
+
|
1814
1908
|
# Sort by distance and assign greedily
|
1815
1909
|
distances.sort()
|
1816
1910
|
used_troughs = set()
|
1817
|
-
|
1911
|
+
|
1818
1912
|
for distance, guide_idx, trough_idx in distances:
|
1819
1913
|
if guide_idx not in assignments and trough_idx not in used_troughs:
|
1820
1914
|
assignments[guide_idx] = trough_idx
|
1821
1915
|
used_troughs.add(trough_idx)
|
1822
|
-
logger.debug(
|
1823
|
-
|
1916
|
+
logger.debug(
|
1917
|
+
f"Assigned guide {guide_idx} (pos {guides[guide_idx]:.1f}) to trough {trough_idx} (distance: {distance:.1f})"
|
1918
|
+
)
|
1919
|
+
|
1824
1920
|
logger.debug(f"Final assignments: {assignments}")
|
1825
1921
|
return assignments
|
1826
|
-
|
1922
|
+
|
1827
1923
|
def _snap_guides_to_gaps(self, guides: List[float], gaps: List[Tuple[float, float]], axis: str):
|
1828
1924
|
"""
|
1829
1925
|
Snap guides to nearby gaps using optimal assignment.
|
@@ -1831,15 +1927,15 @@ class Guides:
|
|
1831
1927
|
"""
|
1832
1928
|
if not guides or not gaps:
|
1833
1929
|
return
|
1834
|
-
|
1930
|
+
|
1835
1931
|
logger.debug(f"Snapping {len(guides)} {axis} guides to {len(gaps)} trough ranges")
|
1836
1932
|
for i, (start, end) in enumerate(gaps):
|
1837
1933
|
center = (start + end) / 2
|
1838
1934
|
logger.debug(f" Trough {i}: {start:.1f} to {end:.1f} (center: {center:.1f})")
|
1839
|
-
|
1935
|
+
|
1840
1936
|
# Get optimal assignments
|
1841
1937
|
assignments = self._optimal_guide_assignment(guides, gaps)
|
1842
|
-
|
1938
|
+
|
1843
1939
|
# Apply assignments (modify guides list in-place)
|
1844
1940
|
for guide_idx, trough_idx in assignments.items():
|
1845
1941
|
trough_start, trough_end = gaps[trough_idx]
|
@@ -1847,23 +1943,23 @@ class Guides:
|
|
1847
1943
|
old_pos = guides[guide_idx]
|
1848
1944
|
guides[guide_idx] = new_pos
|
1849
1945
|
logger.info(f"Snapped {axis} guide from {old_pos:.1f} to {new_pos:.1f}")
|
1850
|
-
|
1946
|
+
|
1851
1947
|
def build_grid(
|
1852
1948
|
self,
|
1853
1949
|
target: Optional[Union["Page", "Region"]] = None,
|
1854
1950
|
source: str = "guides",
|
1855
1951
|
cell_padding: float = 0.5,
|
1856
|
-
include_outer_boundaries: bool = False
|
1952
|
+
include_outer_boundaries: bool = False,
|
1857
1953
|
) -> Dict[str, int]:
|
1858
1954
|
"""
|
1859
1955
|
Create table structure (table, rows, columns, cells) from guide coordinates.
|
1860
|
-
|
1956
|
+
|
1861
1957
|
Args:
|
1862
1958
|
target: Page or Region to create regions on (uses self.context if None)
|
1863
1959
|
source: Source label for created regions (for identification)
|
1864
1960
|
cell_padding: Internal padding for cell regions in points
|
1865
1961
|
include_outer_boundaries: Whether to add boundaries at edges if missing
|
1866
|
-
|
1962
|
+
|
1867
1963
|
Returns:
|
1868
1964
|
Dictionary with counts: {'table': 1, 'rows': N, 'columns': M, 'cells': N*M}
|
1869
1965
|
"""
|
@@ -1871,98 +1967,142 @@ class Guides:
|
|
1871
1967
|
target_obj = target or self.context
|
1872
1968
|
if not target_obj:
|
1873
1969
|
raise ValueError("No target object available. Provide target parameter or context.")
|
1874
|
-
|
1970
|
+
|
1875
1971
|
# Get the page for creating regions
|
1876
|
-
if hasattr(target_obj,
|
1972
|
+
if hasattr(target_obj, "x0") and hasattr(
|
1973
|
+
target_obj, "top"
|
1974
|
+
): # Region (has bbox coordinates)
|
1877
1975
|
page = target_obj._page
|
1878
1976
|
origin_x, origin_y = target_obj.x0, target_obj.top
|
1879
1977
|
context_width, context_height = target_obj.width, target_obj.height
|
1880
|
-
elif hasattr(target_obj,
|
1978
|
+
elif hasattr(target_obj, "_element_mgr") or hasattr(target_obj, "width"): # Page
|
1881
1979
|
page = target_obj
|
1882
1980
|
origin_x, origin_y = 0.0, 0.0
|
1883
1981
|
context_width, context_height = page.width, page.height
|
1884
1982
|
else:
|
1885
1983
|
raise ValueError(f"Target object {target_obj} is not a Page or Region")
|
1886
|
-
|
1984
|
+
|
1887
1985
|
element_manager = page._element_mgr
|
1888
|
-
|
1986
|
+
|
1889
1987
|
# Setup boundaries
|
1890
1988
|
row_boundaries = list(self.horizontal)
|
1891
1989
|
col_boundaries = list(self.vertical)
|
1892
|
-
|
1990
|
+
|
1893
1991
|
# Add outer boundaries if requested and missing
|
1894
1992
|
if include_outer_boundaries:
|
1895
1993
|
if not row_boundaries or row_boundaries[0] > origin_y:
|
1896
1994
|
row_boundaries.insert(0, origin_y)
|
1897
1995
|
if not row_boundaries or row_boundaries[-1] < origin_y + context_height:
|
1898
1996
|
row_boundaries.append(origin_y + context_height)
|
1899
|
-
|
1997
|
+
|
1900
1998
|
if not col_boundaries or col_boundaries[0] > origin_x:
|
1901
1999
|
col_boundaries.insert(0, origin_x)
|
1902
2000
|
if not col_boundaries or col_boundaries[-1] < origin_x + context_width:
|
1903
2001
|
col_boundaries.append(origin_x + context_width)
|
1904
|
-
|
2002
|
+
|
1905
2003
|
# Remove duplicates and sort
|
1906
2004
|
row_boundaries = sorted(list(set(row_boundaries)))
|
1907
2005
|
col_boundaries = sorted(list(set(col_boundaries)))
|
1908
|
-
|
1909
|
-
|
1910
|
-
|
2006
|
+
|
2007
|
+
# ------------------------------------------------------------------
|
2008
|
+
# Clean-up: remove any previously created grid regions (table, rows,
|
2009
|
+
# columns, cells) that were generated by the same `source` label and
|
2010
|
+
# overlap the area we are about to populate. This prevents the page's
|
2011
|
+
# `ElementManager` from accumulating stale/duplicate regions when the
|
2012
|
+
# user rebuilds the grid multiple times.
|
2013
|
+
# ------------------------------------------------------------------
|
2014
|
+
try:
|
2015
|
+
# Bounding box of the grid we are about to create
|
2016
|
+
if row_boundaries and col_boundaries:
|
2017
|
+
grid_bbox = (
|
2018
|
+
col_boundaries[0], # x0
|
2019
|
+
row_boundaries[0], # top
|
2020
|
+
col_boundaries[-1], # x1
|
2021
|
+
row_boundaries[-1], # bottom
|
2022
|
+
)
|
2023
|
+
|
2024
|
+
def _bbox_overlap(b1, b2):
|
2025
|
+
"""Return True if two (x0, top, x1, bottom) bboxes overlap."""
|
2026
|
+
return not (
|
2027
|
+
b1[2] <= b2[0] # b1 right ≤ b2 left
|
2028
|
+
or b1[0] >= b2[2] # b1 left ≥ b2 right
|
2029
|
+
or b1[3] <= b2[1] # b1 bottom ≤ b2 top
|
2030
|
+
or b1[1] >= b2[3] # b1 top ≥ b2 bottom
|
2031
|
+
)
|
2032
|
+
|
2033
|
+
# Collect existing regions that match the source & region types
|
2034
|
+
regions_to_remove = [
|
2035
|
+
r
|
2036
|
+
for r in element_manager.regions
|
2037
|
+
if getattr(r, "source", None) == source
|
2038
|
+
and getattr(r, "region_type", None)
|
2039
|
+
in {"table", "table_row", "table_column", "table_cell"}
|
2040
|
+
and hasattr(r, "bbox")
|
2041
|
+
and _bbox_overlap(r.bbox, grid_bbox)
|
2042
|
+
]
|
2043
|
+
|
2044
|
+
for r in regions_to_remove:
|
2045
|
+
element_manager.remove_element(r, element_type="regions")
|
2046
|
+
|
2047
|
+
if regions_to_remove:
|
2048
|
+
logger.debug(
|
2049
|
+
f"Removed {len(regions_to_remove)} existing grid region(s) prior to rebuild"
|
2050
|
+
)
|
2051
|
+
except Exception as cleanup_err: # pragma: no cover – cleanup must never crash
|
2052
|
+
logger.warning(f"Grid cleanup failed: {cleanup_err}")
|
2053
|
+
|
2054
|
+
logger.debug(
|
2055
|
+
f"Building grid with {len(row_boundaries)} row and {len(col_boundaries)} col boundaries"
|
2056
|
+
)
|
2057
|
+
|
1911
2058
|
# Track creation counts
|
1912
|
-
counts = {
|
1913
|
-
|
2059
|
+
counts = {"table": 0, "rows": 0, "columns": 0, "cells": 0}
|
2060
|
+
|
1914
2061
|
# Create overall table region
|
1915
2062
|
if len(row_boundaries) >= 2 and len(col_boundaries) >= 2:
|
1916
2063
|
table_region = page.create_region(
|
1917
|
-
col_boundaries[0], row_boundaries[0],
|
1918
|
-
col_boundaries[-1], row_boundaries[-1]
|
2064
|
+
col_boundaries[0], row_boundaries[0], col_boundaries[-1], row_boundaries[-1]
|
1919
2065
|
)
|
1920
2066
|
table_region.source = source
|
1921
2067
|
table_region.region_type = "table"
|
1922
2068
|
table_region.normalized_type = "table"
|
1923
|
-
table_region.metadata.update(
|
1924
|
-
|
1925
|
-
|
1926
|
-
|
1927
|
-
|
1928
|
-
|
2069
|
+
table_region.metadata.update(
|
2070
|
+
{
|
2071
|
+
"source_guides": True,
|
2072
|
+
"num_rows": len(row_boundaries) - 1,
|
2073
|
+
"num_cols": len(col_boundaries) - 1,
|
2074
|
+
"boundaries": {"rows": row_boundaries, "cols": col_boundaries},
|
2075
|
+
}
|
2076
|
+
)
|
1929
2077
|
element_manager.add_element(table_region, element_type="regions")
|
1930
|
-
counts[
|
1931
|
-
|
2078
|
+
counts["table"] = 1
|
2079
|
+
|
1932
2080
|
# Create row regions
|
1933
2081
|
if len(row_boundaries) >= 2 and len(col_boundaries) >= 2:
|
1934
2082
|
for i in range(len(row_boundaries) - 1):
|
1935
2083
|
row_region = page.create_region(
|
1936
|
-
col_boundaries[0], row_boundaries[i],
|
1937
|
-
col_boundaries[-1], row_boundaries[i + 1]
|
2084
|
+
col_boundaries[0], row_boundaries[i], col_boundaries[-1], row_boundaries[i + 1]
|
1938
2085
|
)
|
1939
2086
|
row_region.source = source
|
1940
2087
|
row_region.region_type = "table_row"
|
1941
2088
|
row_region.normalized_type = "table_row"
|
1942
|
-
row_region.metadata.update({
|
1943
|
-
"row_index": i,
|
1944
|
-
"source_guides": True
|
1945
|
-
})
|
2089
|
+
row_region.metadata.update({"row_index": i, "source_guides": True})
|
1946
2090
|
element_manager.add_element(row_region, element_type="regions")
|
1947
|
-
counts[
|
1948
|
-
|
2091
|
+
counts["rows"] += 1
|
2092
|
+
|
1949
2093
|
# Create column regions
|
1950
2094
|
if len(col_boundaries) >= 2 and len(row_boundaries) >= 2:
|
1951
2095
|
for j in range(len(col_boundaries) - 1):
|
1952
2096
|
col_region = page.create_region(
|
1953
|
-
col_boundaries[j], row_boundaries[0],
|
1954
|
-
col_boundaries[j + 1], row_boundaries[-1]
|
2097
|
+
col_boundaries[j], row_boundaries[0], col_boundaries[j + 1], row_boundaries[-1]
|
1955
2098
|
)
|
1956
2099
|
col_region.source = source
|
1957
2100
|
col_region.region_type = "table_column"
|
1958
2101
|
col_region.normalized_type = "table_column"
|
1959
|
-
col_region.metadata.update({
|
1960
|
-
"col_index": j,
|
1961
|
-
"source_guides": True
|
1962
|
-
})
|
2102
|
+
col_region.metadata.update({"col_index": j, "source_guides": True})
|
1963
2103
|
element_manager.add_element(col_region, element_type="regions")
|
1964
|
-
counts[
|
1965
|
-
|
2104
|
+
counts["columns"] += 1
|
2105
|
+
|
1966
2106
|
# Create cell regions
|
1967
2107
|
if len(row_boundaries) >= 2 and len(col_boundaries) >= 2:
|
1968
2108
|
for i in range(len(row_boundaries) - 1):
|
@@ -1972,50 +2112,58 @@ class Guides:
|
|
1972
2112
|
cell_top = row_boundaries[i] + cell_padding
|
1973
2113
|
cell_x1 = col_boundaries[j + 1] - cell_padding
|
1974
2114
|
cell_bottom = row_boundaries[i + 1] - cell_padding
|
1975
|
-
|
2115
|
+
|
1976
2116
|
# Skip invalid cells
|
1977
2117
|
if cell_x1 <= cell_x0 or cell_bottom <= cell_top:
|
1978
2118
|
continue
|
1979
|
-
|
2119
|
+
|
1980
2120
|
cell_region = page.create_region(cell_x0, cell_top, cell_x1, cell_bottom)
|
1981
2121
|
cell_region.source = source
|
1982
2122
|
cell_region.region_type = "table_cell"
|
1983
2123
|
cell_region.normalized_type = "table_cell"
|
1984
|
-
cell_region.metadata.update(
|
1985
|
-
|
1986
|
-
|
1987
|
-
|
1988
|
-
|
1989
|
-
"
|
1990
|
-
|
1991
|
-
|
1992
|
-
|
2124
|
+
cell_region.metadata.update(
|
2125
|
+
{
|
2126
|
+
"row_index": i,
|
2127
|
+
"col_index": j,
|
2128
|
+
"source_guides": True,
|
2129
|
+
"original_boundaries": {
|
2130
|
+
"left": col_boundaries[j],
|
2131
|
+
"top": row_boundaries[i],
|
2132
|
+
"right": col_boundaries[j + 1],
|
2133
|
+
"bottom": row_boundaries[i + 1],
|
2134
|
+
},
|
1993
2135
|
}
|
1994
|
-
|
2136
|
+
)
|
1995
2137
|
element_manager.add_element(cell_region, element_type="regions")
|
1996
|
-
counts[
|
1997
|
-
|
1998
|
-
logger.info(
|
1999
|
-
|
2000
|
-
|
2138
|
+
counts["cells"] += 1
|
2139
|
+
|
2140
|
+
logger.info(
|
2141
|
+
f"Created {counts['table']} table, {counts['rows']} rows, "
|
2142
|
+
f"{counts['columns']} columns, and {counts['cells']} cells from guides"
|
2143
|
+
)
|
2144
|
+
|
2001
2145
|
return counts
|
2002
2146
|
|
2003
2147
|
def __repr__(self) -> str:
|
2004
2148
|
"""String representation of the guides."""
|
2005
|
-
return (
|
2006
|
-
|
2007
|
-
|
2149
|
+
return (
|
2150
|
+
f"Guides(verticals={len(self.vertical)}, "
|
2151
|
+
f"horizontals={len(self.horizontal)}, "
|
2152
|
+
f"cells={len(self.get_cells())})"
|
2153
|
+
)
|
2008
2154
|
|
2009
2155
|
def _get_text_elements(self):
|
2010
2156
|
"""Get text elements from the context."""
|
2011
2157
|
if not self.context:
|
2012
2158
|
return []
|
2013
|
-
|
2159
|
+
|
2014
2160
|
# Get text elements from the context
|
2015
|
-
if hasattr(self.context,
|
2161
|
+
if hasattr(self.context, "find_all"):
|
2016
2162
|
try:
|
2017
|
-
text_elements = self.context.find_all(
|
2018
|
-
return
|
2163
|
+
text_elements = self.context.find_all("text", apply_exclusions=False)
|
2164
|
+
return (
|
2165
|
+
text_elements.elements if hasattr(text_elements, "elements") else text_elements
|
2166
|
+
)
|
2019
2167
|
except Exception as e:
|
2020
2168
|
logger.warning(f"Error getting text elements: {e}")
|
2021
2169
|
return []
|
@@ -2026,32 +2174,32 @@ class Guides:
|
|
2026
2174
|
# -------------------------------------------------------------------------
|
2027
2175
|
# Instance methods for fluent chaining (avoid name conflicts with class methods)
|
2028
2176
|
# -------------------------------------------------------------------------
|
2029
|
-
|
2177
|
+
|
2030
2178
|
def add_content(
|
2031
2179
|
self,
|
2032
|
-
axis: Literal[
|
2180
|
+
axis: Literal["vertical", "horizontal"] = "vertical",
|
2033
2181
|
markers: Union[str, List[str], "ElementCollection", None] = None,
|
2034
2182
|
obj: Optional[Union["Page", "Region"]] = None,
|
2035
|
-
align: Literal[
|
2183
|
+
align: Literal["left", "right", "center", "between"] = "left",
|
2036
2184
|
outer: bool = True,
|
2037
|
-
tolerance: float = 5
|
2185
|
+
tolerance: float = 5,
|
2038
2186
|
) -> "Guides":
|
2039
2187
|
"""
|
2040
2188
|
Instance method: Add guides from content, allowing chaining.
|
2041
2189
|
This allows: Guides.new(page).add_content(axis='vertical', markers=[...])
|
2042
|
-
|
2190
|
+
|
2043
2191
|
Args:
|
2044
2192
|
axis: Which axis to create guides for
|
2045
2193
|
markers: Content to search for. Can be:
|
2046
2194
|
- str: single selector or literal text
|
2047
|
-
- List[str]: list of selectors or literal text strings
|
2195
|
+
- List[str]: list of selectors or literal text strings
|
2048
2196
|
- ElementCollection: collection of elements to extract text from
|
2049
2197
|
- None: no markers
|
2050
2198
|
obj: Page or Region to search (uses self.context if None)
|
2051
2199
|
align: How to align guides relative to found elements
|
2052
2200
|
outer: Whether to add outer boundary guides
|
2053
2201
|
tolerance: Tolerance for snapping to element edges
|
2054
|
-
|
2202
|
+
|
2055
2203
|
Returns:
|
2056
2204
|
Self for method chaining
|
2057
2205
|
"""
|
@@ -2059,7 +2207,7 @@ class Guides:
|
|
2059
2207
|
target_obj = obj or self.context
|
2060
2208
|
if target_obj is None:
|
2061
2209
|
raise ValueError("No object provided and no context available")
|
2062
|
-
|
2210
|
+
|
2063
2211
|
# Create new guides using the class method
|
2064
2212
|
new_guides = Guides.from_content(
|
2065
2213
|
obj=target_obj,
|
@@ -2067,34 +2215,34 @@ class Guides:
|
|
2067
2215
|
markers=markers,
|
2068
2216
|
align=align,
|
2069
2217
|
outer=outer,
|
2070
|
-
tolerance=tolerance
|
2218
|
+
tolerance=tolerance,
|
2071
2219
|
)
|
2072
|
-
|
2220
|
+
|
2073
2221
|
# Add the appropriate coordinates to this object
|
2074
|
-
if axis ==
|
2222
|
+
if axis == "vertical":
|
2075
2223
|
self.vertical = list(set(self.vertical + new_guides.vertical))
|
2076
2224
|
else:
|
2077
2225
|
self.horizontal = list(set(self.horizontal + new_guides.horizontal))
|
2078
|
-
|
2226
|
+
|
2079
2227
|
return self
|
2080
|
-
|
2228
|
+
|
2081
2229
|
def add_lines(
|
2082
2230
|
self,
|
2083
|
-
axis: Literal[
|
2084
|
-
obj: Optional[Union["Page", "Region"]] = None,
|
2085
|
-
threshold: Union[float, str] =
|
2231
|
+
axis: Literal["vertical", "horizontal", "both"] = "both",
|
2232
|
+
obj: Optional[Union["Page", "Region"]] = None,
|
2233
|
+
threshold: Union[float, str] = "auto",
|
2086
2234
|
source_label: Optional[str] = None,
|
2087
2235
|
max_lines_h: Optional[int] = None,
|
2088
2236
|
max_lines_v: Optional[int] = None,
|
2089
2237
|
outer: bool = False,
|
2090
|
-
detection_method: str =
|
2238
|
+
detection_method: str = "vector",
|
2091
2239
|
resolution: int = 192,
|
2092
|
-
**detect_kwargs
|
2240
|
+
**detect_kwargs,
|
2093
2241
|
) -> "Guides":
|
2094
2242
|
"""
|
2095
2243
|
Instance method: Add guides from lines, allowing chaining.
|
2096
2244
|
This allows: Guides.new(page).add_lines(axis='horizontal')
|
2097
|
-
|
2245
|
+
|
2098
2246
|
Args:
|
2099
2247
|
axis: Which axis to detect lines for
|
2100
2248
|
obj: Page or Region to search (uses self.context if None)
|
@@ -2106,7 +2254,7 @@ class Guides:
|
|
2106
2254
|
detection_method: 'vector' (use existing LineElements) or 'pixels' (detect from image)
|
2107
2255
|
resolution: DPI for pixel-based detection (default: 192)
|
2108
2256
|
**detect_kwargs: Additional parameters for pixel detection (see from_lines)
|
2109
|
-
|
2257
|
+
|
2110
2258
|
Returns:
|
2111
2259
|
Self for method chaining
|
2112
2260
|
"""
|
@@ -2114,7 +2262,7 @@ class Guides:
|
|
2114
2262
|
target_obj = obj or self.context
|
2115
2263
|
if target_obj is None:
|
2116
2264
|
raise ValueError("No object provided and no context available")
|
2117
|
-
|
2265
|
+
|
2118
2266
|
# Create new guides using the class method
|
2119
2267
|
new_guides = Guides.from_lines(
|
2120
2268
|
obj=target_obj,
|
@@ -2126,32 +2274,32 @@ class Guides:
|
|
2126
2274
|
outer=outer,
|
2127
2275
|
detection_method=detection_method,
|
2128
2276
|
resolution=resolution,
|
2129
|
-
**detect_kwargs
|
2277
|
+
**detect_kwargs,
|
2130
2278
|
)
|
2131
|
-
|
2279
|
+
|
2132
2280
|
# Add the appropriate coordinates to this object
|
2133
|
-
if axis in (
|
2281
|
+
if axis in ("vertical", "both"):
|
2134
2282
|
self.vertical = list(set(self.vertical + new_guides.vertical))
|
2135
|
-
if axis in (
|
2283
|
+
if axis in ("horizontal", "both"):
|
2136
2284
|
self.horizontal = list(set(self.horizontal + new_guides.horizontal))
|
2137
|
-
|
2285
|
+
|
2138
2286
|
return self
|
2139
|
-
|
2287
|
+
|
2140
2288
|
def add_whitespace(
|
2141
2289
|
self,
|
2142
|
-
axis: Literal[
|
2290
|
+
axis: Literal["vertical", "horizontal", "both"] = "both",
|
2143
2291
|
obj: Optional[Union["Page", "Region"]] = None,
|
2144
|
-
min_gap: float = 10
|
2292
|
+
min_gap: float = 10,
|
2145
2293
|
) -> "Guides":
|
2146
2294
|
"""
|
2147
2295
|
Instance method: Add guides from whitespace, allowing chaining.
|
2148
2296
|
This allows: Guides.new(page).add_whitespace(axis='both')
|
2149
|
-
|
2297
|
+
|
2150
2298
|
Args:
|
2151
2299
|
axis: Which axis to create guides for
|
2152
2300
|
obj: Page or Region to search (uses self.context if None)
|
2153
2301
|
min_gap: Minimum gap size to consider
|
2154
|
-
|
2302
|
+
|
2155
2303
|
Returns:
|
2156
2304
|
Self for method chaining
|
2157
2305
|
"""
|
@@ -2159,18 +2307,14 @@ class Guides:
|
|
2159
2307
|
target_obj = obj or self.context
|
2160
2308
|
if target_obj is None:
|
2161
2309
|
raise ValueError("No object provided and no context available")
|
2162
|
-
|
2310
|
+
|
2163
2311
|
# Create new guides using the class method
|
2164
|
-
new_guides = Guides.from_whitespace(
|
2165
|
-
|
2166
|
-
axis=axis,
|
2167
|
-
min_gap=min_gap
|
2168
|
-
)
|
2169
|
-
|
2312
|
+
new_guides = Guides.from_whitespace(obj=target_obj, axis=axis, min_gap=min_gap)
|
2313
|
+
|
2170
2314
|
# Add the appropriate coordinates to this object
|
2171
|
-
if axis in (
|
2315
|
+
if axis in ("vertical", "both"):
|
2172
2316
|
self.vertical = list(set(self.vertical + new_guides.vertical))
|
2173
|
-
if axis in (
|
2317
|
+
if axis in ("horizontal", "both"):
|
2174
2318
|
self.horizontal = list(set(self.horizontal + new_guides.horizontal))
|
2175
|
-
|
2176
|
-
return self
|
2319
|
+
|
2320
|
+
return self
|