span-aligner 0.1.0__py3-none-any.whl
span_aligner/__init__.py
ADDED
span_aligner/aligner.py
ADDED
@@ -0,0 +1,1126 @@
"""
SpanAligner Module
==================

A utility module for aligning and mapping text spans between different text representations,
particularly useful for Label Studio annotation compatibility.

This module provides functionality to:
- Sanitize span boundaries to avoid special characters
- Find exact and fuzzy matches of text segments in original documents
- Map spans from one text representation to another
- Rebuild tagged text with nested annotations
- Merge result objects containing span annotations

Typical use case: when text has been modified (e.g., cleaned, translated) and annotations
need to be realigned to the original or modified text.
"""

from __future__ import annotations

import re
from difflib import SequenceMatcher
from typing import Dict, Any, List, Tuple, Optional, Union

from rapidfuzz import fuzz

# Characters that span boundaries must not land on (Label Studio compatibility)
SPECIAL_CHARS = {"\n", "\r", "\t", " "}


class SpanAligner:
    """
    A utility class for aligning text spans between different text representations.

    This class provides static methods for:
    - Sanitizing span boundaries
    - Finding exact and fuzzy text matches
    - Mapping spans from extracted/modified text back to original text
    - Rebuilding tagged text with proper nesting
    - Merging annotation result objects

    All methods are static; the class serves as a namespace for related functionality.

    Example Usage:
        >>> original = "Hello, World!"
        >>> result_obj = {
        ...     "spans": [{"start": 0, "end": 5, "text": "Hello", "labels": ["greeting"]}],
        ...     "entities": [],
        ...     "task": {"data": {"text": ""}}
        ... }
        >>> success, mapped = SpanAligner.map_spans_to_original(original, result_obj)
    """

    @staticmethod
    def sanitize_span(text: str, start: int, end: int) -> tuple[int, int]:
        """
        Adjust start/end indices so they do not land on special characters.

        Moves start forward and end backward to avoid whitespace and control characters
        at span boundaries, which is important for Label Studio compatibility.

        Args:
            text: The text containing the span.
            start: The starting index of the span (inclusive).
            end: The ending index of the span (exclusive).

        Returns:
            tuple[int, int]: A tuple of (sanitized_start, sanitized_end) indices.
                Both values are clamped to [0, len(text)] and guaranteed to satisfy
                start <= end.

        Example:
            >>> SpanAligner.sanitize_span("  Hello  ", 0, 9)
            (2, 7)  # Trims the leading/trailing spaces
        """
        n = len(text)
        s = max(0, min(start, n))
        e = max(0, min(end, n))

        # Move start forward while on a special char and s < e
        while s < e and s < n and text[s] in SPECIAL_CHARS:
            s += 1
        # Move end backward while on a special char and s < e
        while s < e and e > 0 and text[e - 1] in SPECIAL_CHARS:
            e -= 1

        return s, e

    @staticmethod
    def _sequence_similarity(a: str, b: str) -> float:
        """
        Calculate the similarity ratio between two strings using SequenceMatcher.

        Args:
            a: First string to compare.
            b: Second string to compare.

        Returns:
            float: Similarity ratio between 0.0 and 1.0, where 1.0 means identical
                strings. Returns 0.0 if either string is empty.
        """
        if not a or not b:
            return 0.0
        return SequenceMatcher(None, a, b).ratio()

    @staticmethod
    def _find_exact(original_text: str, segment: str) -> List[int]:
        """
        Find all exact occurrences of a segment within the original text.

        Args:
            original_text: The text to search within.
            segment: The exact substring to find.

        Returns:
            List[int]: A list of starting indices where the segment was found.
                An empty list if no matches are found.

        Example:
            >>> SpanAligner._find_exact("hello world hello", "hello")
            [0, 12]
        """
        indices = []
        start = 0
        while True:
            idx = original_text.find(segment, start)
            if idx == -1:
                break
            indices.append(idx)
            start = idx + 1
        return indices

    @staticmethod
    def _best_fuzzy_in_window(
        original_text: str,
        segment: str,
        start_hint: Optional[int],
        max_search_slack: int = 20
    ) -> Tuple[Optional[int], Optional[int], float]:
        """
        Find the best fuzzy match for a segment within a window around a hint position.

        Uses RapidFuzz for performance. Candidate substrings may differ from the
        segment in length by up to 20% (at least 5 characters).

        Args:
            original_text: The text to search within.
            segment: The text segment to match approximately.
            start_hint: Approximate starting position; if None, the whole text is searched.
            max_search_slack: Maximum distance around the hint to search. Default is 20.

        Returns:
            Tuple[Optional[int], Optional[int], float]: (start, end, ratio) of the best
                match, or (None, None, 0.0) if no candidate was found.
        """
        if not segment:
            return None, None, 0.0

        # Calculate window bounds
        if start_hint is None:
            left = 0
            right = len(original_text)
        else:
            left = max(0, start_hint - max_search_slack)
            # Add the segment length plus slack to the window end
            right = min(len(original_text), start_hint + len(segment) + 2 * max_search_slack)

        window = original_text[left:right]

        if not window:
            return None, None, 0.0

        best_ratio = 0.0
        best_start = None
        best_end = None

        seg_len = len(segment)
        # Allow length variation (slack):
        # candidates may be +/- 20% of the segment length, but at least +/- 5 chars
        slack_len = max(5, int(seg_len * 0.2))
        min_len = max(1, seg_len - slack_len)
        max_len = seg_len + slack_len

        len_window = len(window)

        # Sliding window search
        for i in range(len_window):
            # Optimization: if the remaining window is shorter than min_len, stop
            if i + min_len > len_window:
                break

            # Limit the candidate end to avoid checking excessively long strings
            end_limit = min(i + max_len, len_window) + 1

            for j in range(i + min_len, end_limit):
                candidate = window[i:j]

                # Use the rapidfuzz ratio (0-100), normalized to 0.0-1.0
                ratio = fuzz.ratio(segment, candidate) / 100.0

                if ratio > best_ratio:
                    best_ratio = ratio
                    best_start = left + i
                    best_end = left + j

                if best_ratio == 1.0:
                    break
            if best_ratio == 1.0:
                break

        if best_start is None:
            return None, None, 0.0

        # Sanitize using the SpanAligner boundary helper
        best_start, best_end = SpanAligner.sanitize_span(original_text, best_start, best_end)

        return best_start, best_end, best_ratio

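    # Illustrative sketch (not executed; exact ratios depend on the installed
    # rapidfuzz version): how the fuzzy window behaves on a small input.
    #
    #   SpanAligner._best_fuzzy_in_window("the quick brown fox", "quick brwn", start_hint=4)
    #   -> approximately (4, 15, 0.95): the window around position 4 is scanned
    #      and "quick brown" is the closest candidate to "quick brwn".
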
    @staticmethod
    def _regex_word_sequence(
        original_text: str,
        segment: str,
        start_hint: Optional[int] = None,
        max_search_slack: int = 20
    ) -> Tuple[Optional[int], Optional[int]]:
        """
        Find a segment in the original text using regex-based word sequence matching.

        This method tokenizes the segment into words and punctuation, then builds a
        tolerant regex pattern that allows for varying whitespace/separators between
        tokens. This is useful for matching text that may have different formatting.

        Args:
            original_text: The text to search within.
            segment: The text segment to find (will be tokenized).
            start_hint: Approximate starting position to prioritize searching around.
                If provided, searches near this position first before falling back to
                a full-text search.
            max_search_slack: Maximum distance from start_hint to search.
                Default is 20 characters.

        Returns:
            Tuple[Optional[int], Optional[int]]: A tuple of (start, end) indices if
                found, or (None, None) if no match was found.

        Example:
            >>> # Matches "hello world" even with different whitespace
            >>> SpanAligner._regex_word_sequence("hello   world", "hello world")
            (0, 13)
        """
        # Tokenize into words and punctuation, keeping punctuation tokens.
        # Words: one or more word chars; Punct: any single non-word, non-space char
        tokens = re.findall(r"\w+|[^\w\s]", segment)
        if not tokens:
            return None, None

        # Build a tolerant pattern that matches tokens in order, allowing non-word
        # separators/newlines after each. Punctuation characters stay in the pattern.
        escaped = list(map(re.escape, tokens))
        pattern = r"(?s)" + r"\W*".join(escaped)
        try:
            regex = re.compile(pattern)
        except re.error:
            return None, None

        # If we have a hint, first search in a bounded region around it
        if isinstance(start_hint, int):
            left = max(0, start_hint - max_search_slack)
            # Allow extra room to the right in case of many separators
            right = min(len(original_text), start_hint + max(max_search_slack, len(segment) * 2))
            subset = original_text[left:right]
            m = regex.search(subset)
            if m:
                return left + m.start(), left + m.end()

        # Fallback: search the entire text
        m = regex.search(original_text)

        if not m:
            return None, None
        return m.start(), m.end()

    @staticmethod
    def map_spans_to_original(
        original_text: str,
        result_obj: Dict[str, Any],
        min_ratio: float = 0.90,
        logging: bool = False,
        max_dist: int = 20,
        enable_fuzzy: bool = False,
    ) -> Tuple[bool, Dict[str, Any]]:
        """
        Map spans from a result object to their positions in the original text.

        This is the main alignment method that attempts to find the correct positions
        of annotated spans in the original text. It uses multiple strategies:
        1. Exact matching (fastest, most reliable)
        2. Regex-based word sequence matching (handles whitespace variations)
        3. Fuzzy matching (optional, for handling minor text differences)

        Args:
            original_text: The original/target text to map spans onto.
            result_obj: A dictionary containing annotation data with the following structure:
                {
                    "spans": [{
                        "start": int,    # Approximate start position
                        "end": int,      # Approximate end position
                        "text": str,     # The text content of the span
                        "labels": [str]  # List of label names
                    }, ...],
                    "entities": [...],   # Same structure as spans
                    "task": {
                        "data": {"text": str}  # Will be updated with original_text
                    }
                }
            min_ratio: Minimum similarity ratio (0.0-1.0) for fuzzy matching.
                Default is 0.90.
            logging: If True, prints debug information during alignment.
                Default is False.
            max_dist: Maximum allowed distance between approximate and actual
                start positions. Matches further away than this are rejected.
                Default is 20 characters.
            enable_fuzzy: If True, falls back to fuzzy matching when exact and
                regex matching both fail. Default is False.

        Returns:
            Tuple[bool, Dict[str, Any]]: A tuple of:
                - bool: True if all spans were successfully aligned, False otherwise.
                - Dict: Updated result_obj with mapped spans. Each span now includes:
                    - start/end: Mapped positions (or None if unmatched)
                    - text: Matched text from the original (or None if unmatched)
                    - status: "exact", "regex", "fuzzy", or "unmatched"
                    - similarity: Match similarity score (0.0-1.0)
                    - detected: The cleaned segment text that was searched for
                    - approx_start: Original approximate start position

        Example:
            >>> original = "Hello, World!"
            >>> result = {
            ...     "spans": [{"start": 0, "end": 5, "text": "Hello", "labels": ["greeting"]}],
            ...     "entities": [],
            ...     "task": {"data": {"text": ""}}
            ... }
            >>> success, mapped = SpanAligner.map_spans_to_original(original, result)
            >>> success
            True
            >>> mapped["spans"][0]["status"]
            'exact'
        """
        input_spans: List[Dict[str, Any]] = result_obj.get("spans", [])
        input_entities: List[Dict[str, Any]] = result_obj.get("entities", [])

        def realign(items: List[Dict[str, Any]], enable_fuzzy: bool = False) -> Tuple[bool, List[Dict[str, Any]]]:
            mapped: List[Dict[str, Any]] = []
            all_aligned = True
            for span in items:
                approx_start = span.get("start", 0)
                segment = span.get("text", "") or ""
                labels = span.get("labels", [])
                clean_segment = segment.strip()
                chosen_end = None

                if logging:
                    print("\n\n\n=======NEW SPAN==============")
                    print(f"label: {labels}")
                    print(f"cleaned span: '{clean_segment}' from original segment: '{segment}'")

                exact_indices = SpanAligner._find_exact(original_text, clean_segment)
                chosen_start = None
                similarity = 1.0 if exact_indices else 0.0
                status = "unmatched"

                # First try an exact match
                if exact_indices:
                    chosen_start = min(exact_indices, key=lambda i: abs(i - approx_start))
                    chosen_end = chosen_start + len(clean_segment)
                    status = "exact"

                # Next try a regex-based word sequence match (search near approx_start for all words in order)
                else:
                    step = 20
                    slacks = sorted(set(list(range(0, max_dist + 1, step)) + [max_dist]))

                    # 1. Try regex with progressively wider windows
                    for current_slack in slacks:
                        regex_start, regex_end = SpanAligner._regex_word_sequence(
                            original_text, clean_segment,
                            start_hint=approx_start, max_search_slack=current_slack
                        )

                        if regex_start is not None:
                            chosen_start = regex_start
                            chosen_end = regex_end
                            # Similarity compared against the fully matched span
                            similarity = SpanAligner._sequence_similarity(clean_segment, original_text[regex_start:regex_end])
                            status = "regex"
                            break

                    # 2. Try fuzzy matching if regex failed
                    if status == "unmatched" and enable_fuzzy:
                        for current_slack in slacks:
                            fuzzy_start, fuzzy_end, fuzzy_ratio = SpanAligner._best_fuzzy_in_window(
                                original_text, clean_segment,
                                start_hint=approx_start, max_search_slack=current_slack
                            )

                            if fuzzy_start is not None and fuzzy_ratio >= min_ratio:
                                chosen_start = fuzzy_start
                                chosen_end = fuzzy_end
                                similarity = fuzzy_ratio
                                status = "fuzzy"
                                break

                # Check the distance threshold
                if chosen_start is not None and abs(chosen_start - approx_start) > max_dist:
                    if logging:
                        print(f"Match rejected due to distance: {abs(chosen_start - approx_start)} > {max_dist}")
                    chosen_start = None
                    chosen_end = None
                    status = "unmatched"

                if chosen_start is not None:
                    # Sanitize the mapped span to avoid leading/trailing special characters
                    chosen_start, chosen_end = SpanAligner.sanitize_span(original_text, chosen_start, chosen_end)
                    matched_text = original_text[chosen_start:chosen_end]
                else:
                    if logging:
                        print("No match found")

                    matched_text = None
                    all_aligned = False

                if logging:
                    print("=====================")
                    print(f"span: {span} segment: {clean_segment}")
                    print(f"status: {status} similarity: {similarity}")
                    print(f"pre-sanitize: {(chosen_start, chosen_end, len(clean_segment))}" if chosen_start is not None else "pre-sanitize: None")
                    if chosen_start is not None:
                        print(f"updated positions: (start: {chosen_start}; end: {chosen_end})")
                        print(f"Extracted in original: '{original_text[chosen_start:chosen_end]}'")

                mapped.append({
                    "start": chosen_start,
                    "end": chosen_end,
                    "text": matched_text,
                    "labels": labels,
                    "status": status,
                    "similarity": round(similarity, 4),
                    "detected": clean_segment,
                    "approx_start": approx_start,
                })
            return all_aligned, mapped

        updated = dict(result_obj)
        all_spans_aligned, updated["spans"] = realign(input_spans, enable_fuzzy)
        all_entities_aligned, updated["entities"] = realign(input_entities, enable_fuzzy)
        updated["task"]["data"]["text"] = original_text
        return all_spans_aligned and all_entities_aligned, updated

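    # Hedged usage sketch (not executed): the fuzzy fallback for slightly altered
    # text. Here the annotated text "colour" does not occur verbatim in the
    # original, so only the fuzzy pass (enable_fuzzy=True) can place it on "color".
    #
    #   original = "A color swatch."
    #   result = {"spans": [{"start": 2, "end": 8, "text": "colour", "labels": ["noun"]}],
    #             "entities": [], "task": {"data": {"text": ""}}}
    #   ok, mapped = SpanAligner.map_spans_to_original(original, result,
    #                                                  min_ratio=0.8, enable_fuzzy=True)
    #   # mapped["spans"][0]["status"] == "fuzzy"; start/end now point at "color"
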
    @staticmethod
    def merge_result_objects(
        base: Dict[str, Any],
        addition: Dict[str, Any],
        span_from_name: str,
        ner_from_name: str
    ) -> Dict[str, Any]:
        """
        Merge two result objects by combining their span and entity lists.

        Creates a new dictionary based on the base object, then appends spans and
        entities from the addition object.

        Args:
            base: The base result object to merge into.
            addition: The result object to merge from.
            span_from_name: The key name for span annotations (e.g., "spans", "segmentation").
            ner_from_name: The key name for NER/entity annotations (e.g., "entities").

        Returns:
            Dict[str, Any]: A new merged dictionary with combined spans and entities.
                The base object is shallow-copied, so nested objects may still be shared.

        Example:
            >>> base = {"spans": [{"text": "A"}], "entities": []}
            >>> addition = {"spans": [{"text": "B"}], "entities": [{"text": "C"}]}
            >>> merged = SpanAligner.merge_result_objects(base, addition, "spans", "entities")
            >>> len(merged["spans"])
            2
        """
        merged = dict(base)

        addition_spans = addition.get(span_from_name, [])
        base_spans = merged.get(span_from_name, [])
        merged[span_from_name] = base_spans + addition_spans

        addition_ner = addition.get(ner_from_name, [])
        base_ner = merged.get(ner_from_name, [])
        merged[ner_from_name] = base_ner + addition_ner
        return merged

    @staticmethod
    def _invert_label_map(tag_to_label: Dict[str, str]) -> Dict[str, str]:
        """
        Invert a tag-to-label mapping to create a label-to-tag mapping.

        Args:
            tag_to_label: A dictionary mapping tag names to label names.
                Can be None, in which case an empty dict is returned.

        Returns:
            Dict[str, str]: A dictionary mapping label names to tag names.

        Example:
            >>> SpanAligner._invert_label_map({"loc": "Location", "per": "Person"})
            {'Location': 'loc', 'Person': 'per'}
        """
        return {v: k for k, v in (tag_to_label or {}).items()}

    @staticmethod
    def _sanitize_label_to_tag(label: str) -> str:
        """
        Convert a human-readable label to a sanitized XML-safe tag name.

        Converts the label to lowercase, replaces spaces with underscores,
        and removes any characters that are not alphanumeric or underscores.

        Args:
            label: The human-readable label to convert.

        Returns:
            str: A sanitized tag name suitable for use in XML/HTML tags.
                Returns "span" if the result would be empty.

        Example:
            >>> SpanAligner._sanitize_label_to_tag("My Label (Special)")
            'my_label_special'
        """
        # Fallback: convert a human label to a tag-like form
        tag = label.strip().lower().replace(" ", "_")
        tag = re.sub(r"[^a-z0-9_]+", "_", tag).strip("_")
        return tag or "span"

    @staticmethod
    def _format_annotations(task: Any) -> Dict[str, Any]:
        """
        Extract and format annotations from a Label Studio task object.

        Parses the first annotation from the task and categorizes the results
        into classification choices, entities, and segmentation spans.
        Falls back to predictions if no annotations are available.

        Args:
            task: A Label Studio task object with an `annotations` attribute.
                Expected structure:
                task.annotations = [{
                    "result": [
                        {"type": "choices", "from_name": "type", "value": {"choices": [...]}},
                        {"type": "labels", "from_name": "entities", "value": {...}},
                        {"type": "labels", "from_name": "segmentation", "value": {...}}
                    ]
                }]
                If annotations are empty, predictions with the same structure
                will be used as a fallback.

        Returns:
            Dict[str, Any]: A dictionary with three keys:
                - "classification": List of classification choices
                - "entities": List of entity annotation values
                - "segmentation": List of segmentation span values
        """
        # Try annotations first, fall back to predictions if empty
        results = []
        if task.annotations:
            results = task.annotations[0].get("result", [])

        # If there are no annotations, try predictions
        if not results and hasattr(task, 'predictions') and task.predictions:
            results = task.predictions[0].result or []

        classification = []
        entities = []
        spans = []

        for ann in results:
            ann_type = ann.get("type")
            from_name = ann.get("from_name")
            value = ann.get("value", {})

            if ann_type == "choices" and from_name == "type":
                if choices := value.get("choices"):
                    classification = choices
            elif ann_type == "labels":
                if from_name == "entities":
                    entities.append(value)
                elif from_name == "segmentation":
                    spans.append(value)

        return {
            "classification": classification,
            "entities": entities,
            "segmentation": spans
        }

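    # Illustrative sketch (not executed) of the returned shape, assuming a
    # hypothetical task whose first annotation holds one choice, one entity and
    # one segment (the concrete values below are made up for illustration):
    #
    #   SpanAligner._format_annotations(task)
    #   -> {"classification": ["invoice"],
    #       "entities": [{"start": 0, "end": 4, "text": "Acme", "labels": ["org"]}],
    #       "segmentation": [{"start": 0, "end": 20, "text": "...", "labels": ["header"]}]}
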
    @staticmethod
    def update_mapped_with_rebuilt(
        original_text: str,
        mapped: Dict[str, Any],
        span_label_mapping: Optional[Dict[str, str]] = None,
        ner_label_mapping: Optional[Dict[str, str]] = None,
        overwrite: bool = True
    ) -> Dict[str, Any]:
        """
        Update a mapped result object with rebuilt tagged text.

        Takes a mapped result object (output from map_spans_to_original) and
        generates tagged text from its spans and entities, storing the result
        in the task data.

        Args:
            original_text: The original text to use for rebuilding tags.
            mapped: A mapped result object containing:
                - "spans": List of span annotations
                - "entities": List of entity annotations
                - "task": {"data": {...}} - Task data to update
            span_label_mapping: Optional tag-to-label mapping for spans.
                Will be inverted to create a label-to-tag mapping.
            ner_label_mapping: Optional tag-to-label mapping for NER entities.
                Will be inverted to create a label-to-tag mapping.
            overwrite: If True, overwrites "tagged_text" in the task data.
                If False, stores the result in "tagged_text_unified" instead.
                When overwriting, the original tagged_text is preserved in
                "tagged_text_original" if it exists.

        Returns:
            Dict[str, Any]: The same mapped object (modified in place) with:
                - task.data.tagged_text (or tagged_text_unified): The rebuilt tagged text
                - task.data.tagged_text_original: The original tagged_text if overwritten
                - task.data.rebuild_stats: Statistics from the rebuild operation

        Example:
            >>> mapped = {"spans": [...], "entities": [...], "task": {"data": {}}}
            >>> updated = SpanAligner.update_mapped_with_rebuilt("Hello World", mapped)
            >>> "tagged_text" in updated["task"]["data"]
            True
        """
        data = mapped.get("task", {}).get("data", {})
        label_to_tag = {}
        label_to_tag.update(SpanAligner._invert_label_map(span_label_mapping or {}))
        label_to_tag.update(SpanAligner._invert_label_map(ner_label_mapping or {}))

        rebuilt, stats = SpanAligner.rebuild_tagged_text(
            original_text,
            mapped.get("spans", []),
            mapped.get("entities", []),
            label_to_tag=label_to_tag,
        )

        # Preserve the original and write the unified version
        if "tagged_text" in data and not data.get("tagged_text_original"):
            data["tagged_text_original"] = data.get("tagged_text")
        if overwrite:
            data["tagged_text"] = rebuilt
        else:
            data["tagged_text_unified"] = rebuilt
        data["rebuild_stats"] = stats
        return mapped

    #### From tagged text to task
    @staticmethod
    def get_annotations_from_tagged_text(
        result: Union[dict, str],
        *,
        include_attachments: bool = True,
        span_map: Optional[Dict[str, str]] = None,
        ner_map: Optional[Dict[str, str]] = None,
        class_map: Optional[Dict[str, str]] = None,
        allowed_tags: Optional[List[str]] = None,
    ) -> dict:
        """
        Convert a tagged result (with inline XML-like tags) into structured annotations.

        Extracts spans and entities from tagged text by removing tags and tracking
        character offsets in the resulting plain text. Supports nested tags and
        custom tag-to-label mappings.

        Args:
            result: Input dictionary with a 'tagged_text' key, or the tagged text string itself.
            include_attachments: Whether to include text content inside <attachment> tags.
            span_map: Dictionary mapping tag names to span labels.
            ner_map: Dictionary mapping tag names to entity labels.
            class_map: Dictionary mapping document classifications to labels.
            allowed_tags: List of tag names to process. If None, derived from the map keys.

        Returns:
            dict: A dictionary containing:
                - spans: List of span (segmentation) objects
                - entities: List of entity (NER) objects
                - plain_text: The text content with tags removed
                - tagged_text: The original tagged text used
                - document_classification: The classification from the input result (if any)

        Notes:
            - Spans are derived by removing tags while tracking character offsets in the
              plain text. Nested tags are supported; spans may overlap.
            - `span_map` lets you rename tags to match your LS label config.
            - `allowed_tags` limits which tags are turned into spans. If None, it uses the
              tag set defined in your prompts.
        """
        # Resolve the tagged_text input
        tagged_text = ""
        doc_class = None
        if isinstance(result, dict):
            tagged_text = result.get("tagged_text", "")
            doc_class = result.get("document_classification")
        else:
            tagged_text = str(result or "")

        if not tagged_text:
            raise ValueError("No tagged_text found in input result.")

        # Default allowed tags (from your SYSTEM/USER prompts). If neither an
        # explicit tag list nor any maps are given, accept any tag, so that
        # callers like map_tags_to_original can transpose arbitrary tags.
        allow_all_tags = allowed_tags is None and not span_map and not ner_map
        if allowed_tags is None:
            # Safely handle None maps
            s_map = span_map or {}
            n_map = ner_map or {}
            allowed_tags = list(n_map.keys()) + list(s_map.keys())

        # Merge span_map and ner_map safely into annotation_map
        annotation_map = {}
        for mapping in (span_map, ner_map):
            if mapping:
                annotation_map.update(mapping)

        # If annotation_map ends up empty, initialize it with an identity mapping
        if not annotation_map:
            annotation_map = {t: t for t in allowed_tags}

        # Regex to capture bare tags like <tag> or </tag>
        tag_re = re.compile(r"<(/?)([a-zA-Z_][a-zA-Z0-9_-]*)>")

        plain_parts: List[str] = []
        spans: List[dict] = []
        entities: List[dict] = []

        stack: List[Tuple[str, int]] = []  # (tag_name_lower, start_offset_in_plain)

        pos_in = 0   # position in tagged_text
        pos_out = 0  # position in the plain text we are building

        def emit_text(s: str):
            nonlocal pos_out
            if not s:
                return
            plain_parts.append(s)
            pos_out += len(s)

        # Attachment handling: when skipping attachments, text inside
        # <attachments>/<attachment> is not emitted
        inside_attachments_level = 0

        for m in tag_re.finditer(tagged_text):
            # Emit any literal text before this tag
            literal = tagged_text[pos_in:m.start()]
            currently_inside_attachment = inside_attachments_level > 0

            if include_attachments or not currently_inside_attachment:
                emit_text(literal)

            is_closing = bool(m.group(1))
            tag_name = m.group(2).lower()

            # Track attachment nesting regardless of allowed_tags so we can drop
            # their content when requested
            if tag_name in ("attachments", "attachment"):
                if not is_closing:
                    inside_attachments_level += 1
                else:
                    inside_attachments_level = max(0, inside_attachments_level - 1)

            # Handle the span stack only for allowed tags (attachment containers
            # are structural and never become spans in allow-all mode)
            is_allowed = tag_name in allowed_tags or (
                allow_all_tags and tag_name not in ("attachments", "attachment")
            )
            if is_allowed:
                if not is_closing:
                    # Opening tag
                    stack.append((tag_name, pos_out))
                else:
                    # Closing tag: iterate backwards to find the matching opening tag
                    found_open = False
                    for i in range(len(stack) - 1, -1, -1):
                        open_tag, start_off = stack[i]
                        if open_tag == tag_name:
                            # Pop all tags above the matching one (handles mismatched nesting)
                            stack = stack[:i]
                            end_off = pos_out

                            # Create a span only if it has positive length
                            if end_off > start_off:
                                full_span_text = ("".join(plain_parts))[start_off:end_off]

                                # Adjust start to skip leading newlines
                                adjusted_start = start_off
                                span_text = full_span_text

                                while span_text.startswith('\n'):
                                    adjusted_start += 1
                                    span_text = span_text[1:]

                                # Adjust end to skip trailing newlines
                                adjusted_end = end_off
                                while span_text.endswith('\n'):
                                    adjusted_end -= 1
                                    span_text = span_text[:-1]

                                # Only create a span if there is content after trimming
                                if adjusted_end > adjusted_start:
                                    annotation_entry = {
                                        "start": adjusted_start,
                                        "end": adjusted_end,
                                        "text": span_text,
                                        "labels": [annotation_map.get(tag_name, tag_name)]
                                    }

                                    if ner_map and tag_name in ner_map:
                                        entities.append(annotation_entry)
                                    else:
                                        spans.append(annotation_entry)
                            found_open = True
                            break
                    # If no matching opening tag is found, ignore gracefully

            pos_in = m.end()

        # Emit any remaining tail text (unless attachment content is being
        # skipped and the tail is still inside an attachment)
        tail = tagged_text[pos_in:]
        if include_attachments or inside_attachments_level == 0:
            emit_text(tail)

        plain_text = "".join(plain_parts)

        return {
            "spans": spans,
            "entities": entities,
            "plain_text": plain_text,
            "tagged_text": tagged_text,
            "document_classification": doc_class
        }

    @staticmethod
    def tagged_text_to_task(
        result: Union[dict, str],
        *,
        include_attachments: bool = True,
        span_map: Optional[Dict[str, str]] = None,
        ner_map: Optional[Dict[str, str]] = None,
        class_map: Optional[Dict[str, str]] = None,
        allowed_tags: Optional[List[str]] = None,
    ) -> dict:
        """
        Convert a tagged result into an uploader-ready Label Studio task.

        Uses `get_annotations_from_tagged_text` to parse the input and formats
        the output as expected by the Label Studio uploader class.

        Args:
            result: Input dictionary with a 'tagged_text' key, or the tagged text string.
            include_attachments: Whether to include text content inside <attachment> tags.
            span_map: Dictionary mapping tag names to span labels.
            ner_map: Dictionary mapping tag names to entity labels.
            class_map: Dictionary mapping document classifications to labels.
            allowed_tags: List of tag names to process. If None, derived from the map keys.

        Returns:
            dict: A dictionary ready for Label Studio import, containing:
                - task: Task data including text and metadata
                - spans: Extracted spans
                - entities: Extracted entities
                - labels: Classification labels (if applicable)
        """
        # Parse annotations using the shared logic
        parsed = SpanAligner.get_annotations_from_tagged_text(
            result,
            include_attachments=include_attachments,
            span_map=span_map,
            ner_map=ner_map,
            class_map=class_map,
            allowed_tags=allowed_tags
        )

        spans = parsed["spans"]
        entities = parsed["entities"]
        plain_text = parsed["plain_text"]
        tagged_text = parsed["tagged_text"]
        doc_class = parsed["document_classification"]

        # Handle the classification mapping
        classification_labels = []
        if doc_class and class_map and doc_class in class_map:
            classification_labels = [class_map[doc_class]]

        content = {
            "task": {
                "data": {
                    "text": plain_text,
                    "tagged_text": tagged_text,
                    "meta": {
                        "segments": len(spans),
                        "labels_present": sorted({(s.get("labels") or [""])[0] for s in spans}),
                        "include_attachments": include_attachments,
                        "document_classification": doc_class or ""
                    }
                }
            },
            "spans": spans,
            "labels": classification_labels,
            "entities": entities
        }

        return content

    #### From task to tagged text
    @staticmethod
    def rebuild_tagged_text(
        original_text: str,
        spans: List[Dict[str, Any]],
        entities: List[Dict[str, Any]],
        label_to_tag: Optional[Dict[str, str]] = None
    ) -> Tuple[str, Dict[str, int]]:
        """
        Rebuild text with nested XML-style tags from span and entity annotations.

        Creates properly nested tags from annotations, handling overlapping spans
        by skipping crossing (non-nested) annotations to maintain a valid XML structure.

        Args:
            original_text: The source text to add tags to.
            spans: List of span annotations, each with:
                - "start": int - Starting character index
                - "end": int - Ending character index (exclusive)
                - "labels": List[str] - Label names (the first one is used)
            entities: List of entity annotations (same structure as spans).
            label_to_tag: Optional mapping from label names to tag names.
                If a label is not in the mapping, it will be sanitized to
                create a valid tag name.

        Returns:
            Tuple[str, Dict[str, int]]: A tuple of:
                - str: The text with XML tags inserted (e.g., "<tag>text</tag>")
                - Dict with statistics:
                    - "total": Total number of valid annotations processed
                    - "skipped_crossing": Number of annotations skipped due to
                      crossing (non-nested) overlaps

        Note:
            - Annotations with invalid positions (negative start, end <= start,
              or end exceeding the text length) are silently skipped.
            - For overlapping annotations, outer (longer) spans are preferred.
            - Crossing annotations that would create invalid XML are skipped.

        Example:
            >>> text = "Hello World"
            >>> spans = [{"start": 0, "end": 11, "labels": ["sentence"]}]
            >>> entities = [{"start": 0, "end": 5, "labels": ["greeting"]}]
            >>> result, stats = SpanAligner.rebuild_tagged_text(text, spans, entities)
            >>> result
            '<sentence><greeting>Hello</greeting> World</sentence>'
        """
        annotations: List[Dict[str, Any]] = []

        def to_tag(lbls: List[str]) -> Optional[str]:
            if not lbls:
                return None
            lbl = lbls[0]
            if label_to_tag and lbl in label_to_tag:
                return label_to_tag[lbl]
            return SpanAligner._sanitize_label_to_tag(lbl)

        def add_items(items: List[Dict[str, Any]]):
            for it in items or []:
                s = it.get("start")
                e = it.get("end")
                if not isinstance(s, int) or not isinstance(e, int) or s < 0 or e <= s or e > len(original_text):
                    continue
                tag = to_tag(it.get("labels") or [])
                if not tag:
                    continue
                annotations.append({
                    "start": s,
                    "end": e,
                    "tag": str(tag),
                    "length": e - s,
                })

        add_items(spans)
        add_items(entities)

        # Sort by start ascending, longer first (end descending), so outer tags open before inner ones
        annotations.sort(key=lambda a: (a["start"], -a["length"]))

        # Index starts and ends
        starts: Dict[int, List[Dict[str, Any]]] = {}
        for a in annotations:
            starts.setdefault(a["start"], []).append(a)
        for pos in starts:
            starts[pos].sort(key=lambda a: -a["length"])  # longer first

        ends: Dict[int, List[Dict[str, Any]]] = {}
        for a in annotations:
            ends.setdefault(a["end"], []).append(a)

        event_positions = sorted({0, len(original_text), *starts.keys(), *ends.keys()})

        pieces: List[str] = []
        stack: List[Dict[str, Any]] = []
        last = 0
        skipped_cross = 0

        for pos in event_positions:
            if pos > last:
                pieces.append(original_text[last:pos])

            # Close all tags that end here (LIFO)
            while stack and stack[-1]["end"] == pos:
                top = stack.pop()
                pieces.append(f"</{top['tag']}>")

            # Open tags that start here (outer first)
            for ann in starts.get(pos, []):
                # Crossing check: if an open tag ends before ann does (not nested), skip ann
                if stack and ann["end"] > stack[-1]["end"]:
                    skipped_cross += 1
                    continue
                pieces.append(f"<{ann['tag']}>")
                stack.append(ann)

            last = pos

        # Tail
        pieces.append(original_text[last:])

        # Close any still-open tags (best-effort)
        while stack:
            top = stack.pop()
            pieces.append(f"</{top['tag']}>")

        return "".join(pieces), {"total": len(annotations), "skipped_crossing": skipped_cross}

    @staticmethod
    def rebuild_tagged_text_from_task(task: Any, mapping: Dict[str, str]) -> str:
        """
        Generate tagged text from a Label Studio task's annotations.

        Extracts annotations from the task and rebuilds the text with XML-style
        tags around the annotated spans.

        Args:
            task: A Label Studio task object with:
                - task.annotations: List of annotation objects
                - task.data: Dict containing a "text" key with the source text
            mapping: A dictionary mapping label names to the tag names to use in
                the output. Labels not in the mapping will be sanitized to
                create tag names.

        Returns:
            str: The text with XML-style tags inserted around annotated spans.

        Example:
            >>> # Returns something like: "<greeting>Hello</greeting>, World!"
        """
        extracted = SpanAligner._format_annotations(task)
        text = task.data.get("text", "")

        retagged, _ = SpanAligner.rebuild_tagged_text(
            text,
            spans=extracted["segmentation"],
            entities=extracted["entities"],
            label_to_tag=mapping
        )

        return retagged

    #### Transpose tags back to original text
    @staticmethod
    def map_tags_to_original(
        original_text: str,
        tagged_text: str,
        min_ratio: float = 0.8,
        max_dist: int = 20,
        logging: bool = False
    ) -> str:
        """
        Map spans from tagged text back to their positions in the original text.

        Takes tagged text with XML-style tags and aligns the annotated spans
        back to their positions in the provided original text. Uses exact,
        regex-based, and fuzzy matching to find the best alignment.

        Args:
            original_text: The original untagged text.
            tagged_text: The text with XML-style tags indicating spans.
            min_ratio: Minimum similarity ratio (0.0-1.0) for fuzzy matching.
                Defaults to 0.8.
            max_dist: Maximum character distance from the approximate position
                to consider a match valid. Defaults to 20.
            logging: If True, prints detailed debug information during mapping.
                Defaults to False.

        Returns:
            str: The original text with the tags transposed onto it.
        """
        # First, extract spans/entities from tagged_text
        temp_content = SpanAligner.tagged_text_to_task(
            tagged_text,
            include_attachments=True,
            allowed_tags=None  # allow all tags
        )

        result_obj = {
            "spans": temp_content.get("spans", []),
            "entities": temp_content.get("entities", []),
            "task": {
                "data": {
                    "text": ""  # will be filled later
                }
            }
        }

        # Now map spans/entities back to original_text; fuzzy matching is enabled
        # so that min_ratio takes effect, as documented above
        success, mapped = SpanAligner.map_spans_to_original(
            original_text,
            result_obj,
            min_ratio=min_ratio,
            max_dist=max_dist,
            logging=logging,
            enable_fuzzy=True
        )

        # Rebuild the tagged text on the original so the mapped spans become inline tags
        mapped = SpanAligner.update_mapped_with_rebuilt(original_text, mapped)

        return mapped["task"]["data"].get("tagged_text", "")

span_aligner-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,122 @@
Metadata-Version: 2.4
Name: span-aligner
Version: 0.1.0
Summary: A utility for aligning and mapping text spans between different text representations.
License: MIT
Requires-Python: >=3.8
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: rapidfuzz>=3.0.0
Provides-Extra: dev
Requires-Dist: pytest>=7.0.0; extra == "dev"
Dynamic: license-file

# Span Aligner

A utility for aligning and mapping text spans between different text representations, particularly useful for Label Studio annotation compatibility.

## Features

- Sanitize span boundaries to avoid special characters.
- Find exact and fuzzy matches of text segments in original documents.
- Map spans from one text representation to another.
- Rebuild tagged text with nested annotations.
- Merge result objects containing span annotations.

## Installation

Install from source:

```bash
pip install .
```

For development:

```bash
pip install -e ".[dev]"
```

## Usage

```python
from span_aligner import SpanAligner

original = "Hello, World!"
result_obj = {
    "spans": [{"start": 0, "end": 5, "text": "Hello", "labels": ["greeting"]}],
    "entities": [],
    "task": {"data": {"text": ""}}
}

success, mapped = SpanAligner.map_spans_to_original(original, result_obj)
print(mapped)
```
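
### Sanitize Span Boundaries

Clamp a span's indices and trim whitespace at its edges. A minimal sketch of the expected behavior, matching the `sanitize_span` docstring:

```python
start, end = SpanAligner.sanitize_span("  Hello  ", 0, 9)
print(start, end)
# Output: 2 7  (leading/trailing spaces are excluded from the span)
```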

### Map Tags to Original

Align annotated spans from a tagged string back to their positions in the original text, preserving the original text exactly as written (including any mistakes).

```python
original_text = "The quick brown fox jumps\n\n over the dog."
# Imagine the text was slightly modified or translated, but tags are present
tagged_text = "The <adj>quick</adj> brown fox jumps over the <animal>dog</animal>."

mapped_tagged_text = SpanAligner.map_tags_to_original(
    original_text=original_text,
    tagged_text=tagged_text,
    min_ratio=0.8
)
print(mapped_tagged_text)
# Output might look like: "The <adj>quick</adj> brown fox jumps\n\n over the <animal>dog</animal>."
# (If the original text differed slightly, tags would be placed on the best matching spans)
```

### Rebuild Tagged Text

Reconstruct a string with XML-like tags from raw text and span/entity lists.

```python
text = "Hello World"
spans = [{"start": 0, "end": 11, "labels": ["sentence"]}]
entities = [{"start": 6, "end": 11, "labels": ["location"]}]

tagged, stats = SpanAligner.rebuild_tagged_text(text, spans, entities)
print(tagged)
# Output: <sentence>Hello <location>World</location></sentence>
```
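
### Merge Result Objects

Combine the span and entity lists of two result objects (a shallow-copy merge, as documented on `merge_result_objects`). A minimal sketch:

```python
base = {"spans": [{"text": "A"}], "entities": []}
addition = {"spans": [{"text": "B"}], "entities": [{"text": "C"}]}

merged = SpanAligner.merge_result_objects(base, addition, "spans", "entities")
print(len(merged["spans"]), len(merged["entities"]))
# Output: 2 1
```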

### Rebuild Tagged Text from Task

Generate tagged text directly from a Label Studio task object.

```python
# Assuming 'task' is a Label Studio task object (or similar structure)
# with .data['text'] and .annotations attributes
mapping = {"Location": "loc", "Person": "per"}

tagged_output = SpanAligner.rebuild_tagged_text_from_task(task, mapping)
print(tagged_output)
```
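
### Update a Mapped Result with Rebuilt Tags

After `map_spans_to_original`, the rebuilt tagged text can be written back into the task data. A minimal sketch, reusing the `mapped` object from the Usage example above:

```python
updated = SpanAligner.update_mapped_with_rebuilt("Hello, World!", mapped)
print(updated["task"]["data"]["tagged_text"])
# Output: <greeting>Hello</greeting>, World!
```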

### Get Annotations from Tagged Text

Extract structured spans and entities from a string with inline tags.

```python
tagged_input = "Visit <loc>Paris</loc> and see the <landmark>Eiffel Tower</landmark>."

annotations = SpanAligner.get_annotations_from_tagged_text(
    tagged_input,
    ner_map={"loc": "Location", "landmark": "Location"}
)

print(annotations["entities"])
# Output:
# [
#     {"start": 6, "end": 11, "text": "Paris", "labels": ["Location"]},
#     {"start": 24, "end": 36, "text": "Eiffel Tower", "labels": ["Location"]}
# ]
print(annotations["plain_text"])
# Output: "Visit Paris and see the Eiffel Tower."
```
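
### Convert Tagged Text to a Task

Wrap the extracted annotations in an uploader-ready Label Studio task structure. A minimal sketch, reusing `tagged_input` from the previous example:

```python
content = SpanAligner.tagged_text_to_task(
    tagged_input,
    ner_map={"loc": "Location", "landmark": "Location"}
)

print(content["task"]["data"]["text"])  # plain text with the tags removed
print(len(content["entities"]))         # 2
```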

span_aligner-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
span_aligner/__init__.py,sha256=ERLPBS6aad_17IPcoMDkVqi7lrSJxuHgqrdh69EN9xI,63
span_aligner/aligner.py,sha256=HYo5CvmC9WrnsJo7bFFcUHoJaZhiNf4aZ7pOEzd2MlA,47658
span_aligner-0.1.0.dist-info/licenses/LICENSE,sha256=TqCZNrAXPrgWq9k95te7bOOyXztjxjxWQWnsHSqT8SM,1096
span_aligner-0.1.0.dist-info/METADATA,sha256=nqV2rIVPmFCR2aiZ3KanmFx_ZlYfShz6JMp1SVRHld8,3583
span_aligner-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
span_aligner-0.1.0.dist-info/top_level.txt,sha256=syADug30Z0JSDJXnan6CIBWuI4mCdpghyDa48kj69VY,13
span_aligner-0.1.0.dist-info/RECORD,,

span_aligner-0.1.0.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2026 Stefaan Vercoutere

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

span_aligner-0.1.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
span_aligner