span-aligner 0.1.2__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1178 @@
+ """
+ SpanAligner Module
+ ==================
+
+ A utility module for aligning and mapping text spans between different text representations,
+ particularly useful for Label Studio annotation compatibility.
+
+ This module provides functionality to:
+ - Sanitize span boundaries to avoid special characters
+ - Find exact and fuzzy matches of text segments in original documents
+ - Map spans from one text representation to another
+ - Rebuild tagged text with nested annotations
+ - Merge result objects containing span annotations
+
+ Typical use case: When text has been modified (e.g., cleaned, translated) and annotations
+ need to be realigned to the original or modified text.
+ """
+
+ from __future__ import annotations
+
+ import re
+ from difflib import SequenceMatcher
+ from typing import Dict, Any, List, Tuple, Optional, Union
+ from rapidfuzz import fuzz
+
+ # Span sanitization helper for Label Studio compatibility
+ SPECIAL_CHARS = {"\n", "\r", "\t", " "}
+
+
+ class SpanAligner:
+     """
+     A utility class for aligning text spans between different text representations.
+
+     This class provides static methods for:
+     - Sanitizing span boundaries
+     - Finding exact and fuzzy text matches
+     - Mapping spans from extracted/modified text back to original text
+     - Rebuilding tagged text with proper nesting
+     - Merging annotation result objects
+
+     All methods are static and the class serves as a namespace for related functionality.
+
+     Example Usage:
+         >>> original = "Hello, World!"
+         >>> result_obj = {
+         ...     "spans": [{"start": 0, "end": 5, "text": "Hello", "labels": ["greeting"]}],
+         ...     "entities": [],
+         ...     "task": {"data": {"text": ""}}
+         ... }
+         >>> success, mapped = SpanAligner.map_spans_to_original(original, result_obj)
+     """
+
+     @staticmethod
+     def project_spans(
+         src_text: str,
+         tgt_text: str,
+         src_spans: List[Dict[str, Any]],
+         **kwargs
+     ) -> List[Dict[str, Any]]:
+         """
+         Project spans from source to target text using fuzzy matching.
+
+         Args:
+             src_text: Source text (mostly ignored as spans are expected to contain 'text').
+             tgt_text: Target text to align to.
+             src_spans: List of spans with 'start', 'end', 'text', 'labels'.
+             kwargs: Extra arguments:
+                 - min_ratio: Minimum similarity ratio for fuzzy matching.
+                 - max_dist: Maximum allowed distance deviation.
+                 - enable_fuzzy: Whether to use fuzzy matching.
+                 - logging: Enable debug logging.
+
+         Returns:
+             List of projected spans aligned to target text.
+         """
+         # Construct result object expected by map_spans_to_original.
+         # We assume src_spans have correct text relative to what we want to find in tgt_text.
+         result_obj = {
+             "spans": src_spans,
+             "entities": [],
+             "task": {"data": {"text": ""}}
+         }
+
+         min_ratio = kwargs.get('min_ratio', 0.90)
+         max_dist = kwargs.get('max_dist', 20)
+         enable_fuzzy = kwargs.get('enable_fuzzy', False)
+         logging = kwargs.get('logging', False)
+
+         _, mapped = SpanAligner.map_spans_to_original(
+             tgt_text,
+             result_obj,
+             min_ratio=min_ratio,
+             max_dist=max_dist,
+             enable_fuzzy=enable_fuzzy,
+             logging=logging
+         )
+
+         return mapped.get("spans", [])
+
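+     # Illustrative sketch (not from the original package): projecting a span from a
+     # cleaned text onto a raw original that still contains extra whitespace. The
+     # texts and labels below are made up for demonstration.
+     #
+     #     raw = "Invoice   no. 42"
+     #     clean = "Invoice no. 42"
+     #     spans = [{"start": 0, "end": 14, "text": "Invoice no. 14", "labels": ["header"]}]
+     #     spans[0]["text"] = "Invoice no. 42"
+     #     SpanAligner.project_spans(clean, raw, spans)
+     #     # -> one span with status "regex", start 0, end 16, text "Invoice   no. 42"
+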
+     @staticmethod
+     def sanitize_span(text: str, start: int, end: int) -> tuple[int, int]:
+         """
+         Adjust start/end indices so they do not land on special characters.
+
+         Moves start forward and end backward to avoid whitespace and control characters
+         at span boundaries, which is important for Label Studio compatibility.
+
+         Args:
+             text: The text containing the span.
+             start: The starting index of the span (inclusive).
+             end: The ending index of the span (exclusive).
+
+         Returns:
+             tuple[int, int]: A tuple of (sanitized_start, sanitized_end) indices.
+                 Both values are clamped to [0, len(text)] and guaranteed to satisfy start <= end.
+
+         Example:
+             >>> SpanAligner.sanitize_span("  Hello  ", 0, 9)
+             (2, 7)  # Removes leading/trailing spaces
+         """
+         n = len(text)
+         s = max(0, min(start, n))
+         e = max(0, min(end, n))
+
+         # Move start forward while on a special char and s < e
+         while s < e and s < n and text[s] in SPECIAL_CHARS:
+             s += 1
+         # Move end backward while on a special char and s < e
+         while s < e and e > 0 and text[e - 1] in SPECIAL_CHARS:
+             e -= 1
+
+         return s, e
+
+     @staticmethod
+     def _sequence_similarity(a: str, b: str) -> float:
+         """
+         Calculate the similarity ratio between two strings using SequenceMatcher.
+
+         Args:
+             a: First string to compare.
+             b: Second string to compare.
+
+         Returns:
+             float: Similarity ratio between 0.0 and 1.0, where 1.0 means identical strings.
+                 Returns 0.0 if either string is empty.
+         """
+         if not a or not b:
+             return 0.0
+         return SequenceMatcher(None, a, b).ratio()
+
+     @staticmethod
+     def _find_exact(original_text: str, segment: str) -> List[int]:
+         """
+         Find all exact occurrences of a segment within the original text.
+
+         Args:
+             original_text: The text to search within.
+             segment: The exact substring to find.
+
+         Returns:
+             List[int]: A list of starting indices where the segment was found.
+                 Empty list if no matches found.
+
+         Example:
+             >>> SpanAligner._find_exact("hello world hello", "hello")
+             [0, 12]
+         """
+         indices = []
+         start = 0
+         while True:
+             idx = original_text.find(segment, start)
+             if idx == -1:
+                 break
+             indices.append(idx)
+             start = idx + 1
+         return indices
+
+     @staticmethod
+     def _best_fuzzy_in_window(
+         original_text: str,
+         segment: str,
+         start_hint: Optional[int],
+         max_search_slack: int = 20
+     ) -> Tuple[Optional[int], Optional[int], float]:
+         """
+         Find the best fuzzy match for a segment within a window around a hint position.
+         Uses RapidFuzz for performance and better fuzzy matching.
+         """
+         if not segment:
+             return None, None, 0.0
+
+         # Calculate window bounds
+         if start_hint is None:
+             left = 0
+             right = len(original_text)
+         else:
+             left = max(0, start_hint - max_search_slack)
+             # Add segment length + slack to the window end
+             right = min(len(original_text), start_hint + len(segment) + 2 * max_search_slack)
+
+         window = original_text[left:right]
+
+         if not window:
+             return None, None, 0.0
+
+         best_ratio = 0.0
+         best_start = None
+         best_end = None
+
+         seg_len = len(segment)
+         # Allow length variation (slack):
+         # candidates may be +/- 20% of the segment length, but at least +/- 5 chars
+         slack_len = max(5, int(seg_len * 0.2))
+         min_len = max(1, seg_len - slack_len)
+         max_len = seg_len + slack_len
+
+         len_window = len(window)
+
+         # Sliding window search
+         for i in range(len_window):
+             # Optimization: if the remaining window is shorter than min_len, stop
+             if i + min_len > len_window:
+                 break
+
+             # Limit candidate end to avoid checking excessively long strings
+             end_limit = min(i + max_len, len_window) + 1
+
+             for j in range(i + min_len, end_limit):
+                 candidate = window[i:j]
+
+                 # Use rapidfuzz ratio
+                 ratio = fuzz.ratio(segment, candidate) / 100.0
+
+                 if ratio > best_ratio:
+                     best_ratio = ratio
+                     best_start = left + i
+                     best_end = left + j
+
+                 if best_ratio == 1.0:
+                     break
+             if best_ratio == 1.0:
+                 break
+
+         if best_start is None:
+             return None, None, 0.0
+
+         # Sanitize using the shared SpanAligner helper
+         best_start, best_end = SpanAligner.sanitize_span(original_text, best_start, best_end)
+
+         return best_start, best_end, best_ratio
+
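+     # Illustrative sketch (not from the original package): fuzzy lookup of a segment
+     # with a small typo, searching a window around the expected position. The text
+     # is made up and the ratio from rapidfuzz is shown approximately.
+     #
+     #     text = "The qick brown fox jumps over the lazy dog"
+     #     SpanAligner._best_fuzzy_in_window(text, "quick brown fox", start_hint=4)
+     #     # -> (4, 18, ~0.97): "qick brown fox" is the best candidate in the window
+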
+     @staticmethod
+     def _regex_word_sequence(
+         original_text: str,
+         segment: str,
+         start_hint: Optional[int] = None,
+         max_search_slack: int = 20
+     ) -> Tuple[Optional[int], Optional[int]]:
+         """
+         Find a segment in the original text using regex-based word sequence matching.
+
+         This method tokenizes the segment into words and punctuation, then builds a
+         tolerant regex pattern that allows for varying whitespace/separators between
+         tokens. This is useful for matching text that may have different formatting.
+
+         Args:
+             original_text: The text to search within.
+             segment: The text segment to find (will be tokenized).
+             start_hint: Approximate starting position to prioritize searching around.
+                 If provided, searches near this position first before falling back to
+                 full text search.
+             max_search_slack: Maximum distance from start_hint to search.
+                 Default is 20 characters.
+
+         Returns:
+             Tuple[Optional[int], Optional[int]]: A tuple of (start, end) indices if found,
+                 or (None, None) if no match was found.
+
+         Example:
+             >>> # Matches "hello world" even with different whitespace
+             >>> SpanAligner._regex_word_sequence("hello   world", "hello world")
+             (0, 13)
+         """
+         # Tokenize into words and punctuation, keeping punctuation tokens
+         # Words: one or more word chars; Punct: any single non-word, non-space char
+         tokens = re.findall(r"\w+|[^\w\s]", segment)
+         if not tokens:
+             return None, None
+
+         # Build a tolerant pattern that matches tokens in order, allowing non-word separators/newlines after each
+         # Keep punctuation characters explicitly in the pattern
+         escaped = list(map(re.escape, tokens))
+         pattern = r"(?s)" + r"\W*".join(escaped)
+         try:
+             regex = re.compile(pattern)
+         except re.error:
+             return None, None
+
+         # If we have a hint, first search in a bounded region around it
+         if isinstance(start_hint, int):
+             left = max(0, start_hint - max_search_slack)
+             # allow for extra room to the right in case of many separators
+             right = min(len(original_text), start_hint + max(max_search_slack, len(segment) * 2))
+             subset = original_text[left:right]
+             m = regex.search(subset)
+             if m:
+                 return left + m.start(), left + m.end()
+
+         # Fallback: search the entire text
+         m = regex.search(original_text)
+
+         if not m:
+             return None, None
+         return m.start(), m.end()
+
+     @staticmethod
+     def map_spans_to_original(
+         original_text: str,
+         result_obj: Dict[str, Any],
+         min_ratio: float = 0.90,
+         logging: bool = False,
+         max_dist: int = 20,
+         enable_fuzzy: bool = False,
+     ) -> Tuple[bool, Dict[str, Any]]:
+         """
+         Map spans from a result object to their positions in the original text.
+
+         This is the main alignment method that attempts to find the correct positions
+         of annotated spans in the original text. It uses multiple strategies:
+         1. Exact matching (fastest, most reliable)
+         2. Regex-based word sequence matching (handles whitespace variations)
+         3. Fuzzy matching (optional, for handling minor text differences)
+
+         Args:
+             original_text: The original/target text to map spans onto.
+             result_obj: A dictionary containing annotation data with the following structure:
+                 {
+                     "spans": [{
+                         "start": int,    # Approximate start position
+                         "end": int,      # Approximate end position
+                         "text": str,     # The text content of the span
+                         "labels": [str]  # List of label names
+                     }, ...],
+                     "entities": [...],  # Same structure as spans
+                     "task": {
+                         "data": {"text": str}  # Will be updated with original_text
+                     }
+                 }
+             min_ratio: Minimum similarity ratio (0.0-1.0) for fuzzy matching.
+                 Default is 0.90.
+             logging: If True, prints debug information during alignment.
+                 Default is False.
+             max_dist: Maximum allowed distance between approximate and actual
+                 start positions. Matches further than this are rejected.
+                 Default is 20 characters.
+             enable_fuzzy: If True, falls back to fuzzy matching when exact and
+                 regex matching fail. Default is False.
+
+         Returns:
+             Tuple[bool, Dict[str, Any]]: A tuple of:
+                 - bool: True if all spans were successfully aligned, False otherwise.
+                 - Dict: Updated result_obj with mapped spans. Each span now includes:
+                     - start/end: Mapped positions (or None if unmatched)
+                     - text: Matched text from original (or None if unmatched)
+                     - status: "exact", "regex", "fuzzy", or "unmatched"
+                     - similarity: Match similarity score (0.0-1.0)
+                     - detected: The cleaned segment text that was searched for
+                     - approx_start: Original approximate start position
+
+         Example:
+             >>> original = "Hello, World!"
+             >>> result = {
+             ...     "spans": [{"start": 0, "end": 5, "text": "Hello", "labels": ["greeting"]}],
+             ...     "entities": [],
+             ...     "task": {"data": {"text": ""}}
+             ... }
+             >>> success, mapped = SpanAligner.map_spans_to_original(original, result)
+             >>> success
+             True
+             >>> mapped["spans"][0]["status"]
+             'exact'
+         """
+         input_spans: List[Dict[str, Any]] = result_obj.get("spans", [])
+         input_entities: List[Dict[str, Any]] = result_obj.get("entities", [])
+
+         def realign(items: List[Dict[str, Any]], enable_fuzzy: bool = False) -> Tuple[bool, List[Dict[str, Any]]]:
+             mapped: List[Dict[str, Any]] = []
+             all_aligned = True
+             for span in items:
+                 approx_start = span.get("start", 0)
+                 segment = span.get("text", "") or ""
+                 labels = span.get("labels", [])
+                 clean_segment = segment.strip()
+                 chosen_end = None
+
+                 if logging:
+                     print("\n\n\n=======NEW SPAN==============")
+                     print(f"label: {labels}")
+                     print(f"cleaned span: '{clean_segment}' from original segment: '{segment}'")
+
+                 exact_indices = SpanAligner._find_exact(original_text, clean_segment)
+                 chosen_start = None
+                 similarity = 1.0 if exact_indices else 0.0
+                 status = "unmatched"
+
+                 # First try exact match
+                 if exact_indices:
+                     chosen_start = min(exact_indices, key=lambda i: abs(i - approx_start))
+                     chosen_end = chosen_start + len(clean_segment)
+                     status = "exact"
+
+                 # Next try regex-based word sequence match (search near approx_start for all words in order)
+                 else:
+                     step = 20
+                     slacks = sorted(set(range(0, max_dist + 1, step)) | {max_dist})
+
+                     # 1. Try regex progressively
+                     for current_slack in slacks:
+                         regex_start, regex_end = SpanAligner._regex_word_sequence(
+                             original_text, clean_segment,
+                             start_hint=approx_start, max_search_slack=current_slack
+                         )
+
+                         if regex_start is not None:
+                             chosen_start = regex_start
+                             chosen_end = regex_end
+                             # similarity compared against the fully matched span
+                             similarity = SpanAligner._sequence_similarity(clean_segment, original_text[regex_start:regex_end])
+                             status = "regex"
+                             break
+
+                     # 2. Try fuzzy matching if regex failed
+                     if status == "unmatched" and enable_fuzzy:
+                         for current_slack in slacks:
+                             fuzzy_start, fuzzy_end, fuzzy_ratio = SpanAligner._best_fuzzy_in_window(
+                                 original_text, clean_segment,
+                                 start_hint=approx_start, max_search_slack=current_slack
+                             )
+
+                             if fuzzy_start is not None and fuzzy_ratio >= min_ratio:
+                                 chosen_start = fuzzy_start
+                                 chosen_end = fuzzy_end
+                                 similarity = fuzzy_ratio
+                                 status = "fuzzy"
+                                 break
+
+                 # Check distance threshold
+                 if chosen_start is not None and abs(chosen_start - approx_start) > max_dist:
+                     if logging:
+                         print(f"Match rejected due to distance: {abs(chosen_start - approx_start)} > {max_dist}")
+                     chosen_start = None
+                     chosen_end = None
+                     status = "unmatched"
+
+                 if chosen_start is not None:
+                     # Sanitize the mapped span to avoid leading/trailing special characters
+                     chosen_start, chosen_end = SpanAligner.sanitize_span(original_text, chosen_start, chosen_end)
+                     matched_text = original_text[chosen_start:chosen_end]
+                 else:
+                     if logging:
+                         print("No match found")
+                     matched_text = None
+                     all_aligned = False
+
+                 if logging:
+                     print("=====================")
+                     print(f"span: {span} segment: {clean_segment}")
+                     print(f"status: {status} similarity: {similarity}")
+                     print(f"pre sanit: {(chosen_start, chosen_end, len(clean_segment))}" if chosen_start is not None else "pre sanit: None")
+                     if chosen_start is not None:
+                         print(f"updated positions: (start: {chosen_start}; end: {chosen_end})")
+                         print(f"Extracted in original: '{original_text[chosen_start:chosen_end]}'")
+
+                 mapped.append({
+                     "start": chosen_start,
+                     "end": chosen_end,
+                     "text": matched_text,
+                     "labels": labels,
+                     "status": status,
+                     "similarity": round(similarity, 4),
+                     "detected": clean_segment,
+                     "approx_start": approx_start,
+                 })
+             return all_aligned, mapped
+
+         updated = dict(result_obj)
+         all_spans_aligned, updated["spans"] = realign(input_spans, enable_fuzzy)
+         all_entities_aligned, updated["entities"] = realign(input_entities, enable_fuzzy)
+         updated["task"]["data"]["text"] = original_text
+         # Return in the documented order: (all_aligned, updated_result_obj)
+         return all_spans_aligned and all_entities_aligned, updated
+
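+     # Illustrative sketch (not from the original package): the strategies in order.
+     # An exact hit wins; otherwise the regex pass absorbs whitespace differences.
+     # The inputs below are made up.
+     #
+     #     original = "Total:\n  100 EUR"
+     #     obj = {"spans": [{"start": 0, "end": 14, "text": "Total: 100 EUR",
+     #                       "labels": ["amount"]}],
+     #            "entities": [], "task": {"data": {"text": ""}}}
+     #     ok, mapped = SpanAligner.map_spans_to_original(original, obj)
+     #     # ok is True; mapped["spans"][0] has status "regex" and its text is
+     #     # "Total:\n  100 EUR", re-aligned to the original string
+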
+     @staticmethod
+     def merge_result_objects(
+         base: Dict[str, Any],
+         addition: Dict[str, Any],
+         span_from_name: str,
+         ner_from_name: str
+     ) -> Dict[str, Any]:
+         """
+         Merge two result objects by combining their span and entity lists.
+
+         Creates a new dictionary based on the base object, then appends spans and
+         entities from the addition object.
+
+         Args:
+             base: The base result object to merge into.
+             addition: The result object to merge from.
+             span_from_name: The key name for span annotations (e.g., "spans", "segmentation").
+             ner_from_name: The key name for NER/entity annotations (e.g., "entities").
+
+         Returns:
+             Dict[str, Any]: A new merged dictionary with combined spans and entities.
+                 The base object is shallow-copied, so nested objects may still be shared.
+
+         Example:
+             >>> base = {"spans": [{"text": "A"}], "entities": []}
+             >>> addition = {"spans": [{"text": "B"}], "entities": [{"text": "C"}]}
+             >>> merged = SpanAligner.merge_result_objects(base, addition, "spans", "entities")
+             >>> len(merged["spans"])
+             2
+         """
+         merged = dict(base)
+
+         addition_spans = addition.get(span_from_name, [])
+         base_spans = merged.get(span_from_name, [])
+         merged[span_from_name] = base_spans + addition_spans
+
+         addition_ner = addition.get(ner_from_name, [])
+         base_ner = merged.get(ner_from_name, [])
+         merged[ner_from_name] = base_ner + addition_ner
+         return merged
+
+     @staticmethod
+     def _invert_label_map(tag_to_label: Dict[str, str]) -> Dict[str, str]:
+         """
+         Invert a tag-to-label mapping to create a label-to-tag mapping.
+
+         Args:
+             tag_to_label: A dictionary mapping tag names to label names.
+                 Can be None, in which case an empty dict is returned.
+
+         Returns:
+             Dict[str, str]: A dictionary mapping label names to tag names.
+
+         Example:
+             >>> SpanAligner._invert_label_map({"loc": "Location", "per": "Person"})
+             {'Location': 'loc', 'Person': 'per'}
+         """
+         return {v: k for k, v in (tag_to_label or {}).items()}
+
+     @staticmethod
+     def _sanitize_label_to_tag(label: str) -> str:
+         """
+         Convert a human-readable label to a sanitized XML-safe tag name.
+
+         Converts the label to lowercase and collapses any run of characters that
+         are not alphanumeric (spaces, punctuation) into a single underscore.
+
+         Args:
+             label: The human-readable label to convert.
+
+         Returns:
+             str: A sanitized tag name suitable for use in XML/HTML tags.
+                 Returns "span" if the result would be empty.
+
+         Example:
+             >>> SpanAligner._sanitize_label_to_tag("My Label (Special)")
+             'my_label_special'
+         """
+         # Fallback: convert human label to tag-like form.
+         # Collapsing each non-alphanumeric run into one underscore avoids the
+         # double underscores that "My Label (Special)" would otherwise produce.
+         tag = re.sub(r"[^a-z0-9]+", "_", label.strip().lower()).strip("_")
+         return tag or "span"
+
+     @staticmethod
+     def _format_annotations(task: Any) -> Dict[str, Any]:
+         """
+         Extract and format annotations from a Label Studio task object.
+
+         Parses the first annotation from the task and categorizes the results
+         into classification choices, entities, and segmentation spans.
+         Falls back to predictions if no annotations are available.
+
+         Args:
+             task: A Label Studio task object with an `annotations` attribute.
+                 Expected structure:
+                     task.annotations = [{
+                         "result": [
+                             {"type": "choices", "from_name": "type", "value": {"choices": [...]}},
+                             {"type": "labels", "from_name": "entities", "value": {...}},
+                             {"type": "labels", "from_name": "segmentation", "value": {...}}
+                         ]
+                     }]
+                 If annotations are empty, predictions with the same structure
+                 will be used as a fallback.
+
+         Returns:
+             Dict[str, Any]: A dictionary with three keys:
+                 - "classification": List of classification choices
+                 - "entities": List of entity annotation values
+                 - "segmentation": List of segmentation span values
+         """
+         # Try annotations first, fall back to predictions if empty
+         results = []
+         if task.annotations:
+             results = task.annotations[0].get("result", [])
+
+         # If no annotations, try predictions
+         if not results and hasattr(task, 'predictions') and task.predictions:
+             results = task.predictions[0].result or []
+
+         classification = []
+         entities = []
+         spans = []
+
+         for ann in results:
+             ann_type = ann.get("type")
+             from_name = ann.get("from_name")
+             value = ann.get("value", {})
+
+             if ann_type == "choices" and from_name == "type":
+                 if choices := value.get("choices"):
+                     classification = choices
+             elif ann_type == "labels":
+                 if from_name == "entities":
+                     entities.append(value)
+                 elif from_name == "segmentation":
+                     spans.append(value)
+
+         return {
+             "classification": classification,
+             "entities": entities,
+             "segmentation": spans
+         }
+
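+     # Illustrative sketch (not from the original package): the task shape this
+     # helper expects. `SimpleTask` is a hypothetical stand-in for a Label Studio
+     # SDK task object.
+     #
+     #     class SimpleTask:
+     #         annotations = [{"result": [
+     #             {"type": "choices", "from_name": "type",
+     #              "value": {"choices": ["email"]}},
+     #             {"type": "labels", "from_name": "segmentation",
+     #              "value": {"start": 0, "end": 5, "labels": ["greeting"]}},
+     #         ]}]
+     #     SpanAligner._format_annotations(SimpleTask())
+     #     # -> {"classification": ["email"], "entities": [],
+     #     #     "segmentation": [{"start": 0, "end": 5, "labels": ["greeting"]}]}
+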
+     @staticmethod
+     def update_mapped_with_rebuilt(
+         original_text: str,
+         mapped: Dict[str, Any],
+         span_label_mapping: Optional[Dict[str, str]] = None,
+         ner_label_mapping: Optional[Dict[str, str]] = None,
+         overwrite: bool = True
+     ) -> Dict[str, Any]:
+         """
+         Update a mapped result object with rebuilt tagged text.
+
+         Takes a mapped result object (output from map_spans_to_original) and
+         generates tagged text from its spans and entities, storing the result
+         in the task data.
+
+         Args:
+             original_text: The original text to use for rebuilding tags.
+             mapped: A mapped result object containing:
+                 - "spans": List of span annotations
+                 - "entities": List of entity annotations
+                 - "task": {"data": {...}} - Task data to update
+             span_label_mapping: Optional tag-to-label mapping for spans.
+                 Will be inverted to create a label-to-tag mapping.
+             ner_label_mapping: Optional tag-to-label mapping for NER entities.
+                 Will be inverted to create a label-to-tag mapping.
+             overwrite: If True, overwrites "tagged_text" in task data.
+                 If False, stores the result in "tagged_text_unified" instead.
+                 When overwriting, the original tagged_text is preserved in
+                 "tagged_text_original" if it exists.
+
+         Returns:
+             Dict[str, Any]: The same mapped object (modified in place) with:
+                 - task.data.tagged_text (or tagged_text_unified): The rebuilt tagged text
+                 - task.data.tagged_text_original: Original tagged_text if overwritten
+                 - task.data.rebuild_stats: Statistics from the rebuild operation
+
+         Example:
+             >>> mapped = {"spans": [...], "entities": [...], "task": {"data": {}}}
+             >>> updated = SpanAligner.update_mapped_with_rebuilt("Hello World", mapped)
+             >>> "tagged_text" in updated["task"]["data"]
+             True
+         """
+         data = mapped.get("task", {}).get("data", {})
+         label_to_tag = {}
+         label_to_tag.update(SpanAligner._invert_label_map(span_label_mapping or {}))
+         label_to_tag.update(SpanAligner._invert_label_map(ner_label_mapping or {}))
+
+         rebuilt, stats = SpanAligner.rebuild_tagged_text(
+             original_text,
+             mapped.get("spans", []),
+             mapped.get("entities", []),
+             label_to_tag=label_to_tag,
+         )
+
+         # Preserve the original tagged text and write the unified version
+         if "tagged_text" in data and not data.get("tagged_text_original"):
+             data["tagged_text_original"] = data.get("tagged_text")
+         if overwrite:
+             data["tagged_text"] = rebuilt
+         else:
+             data["tagged_text_unified"] = rebuilt
+         data["rebuild_stats"] = stats
+         return mapped
+
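+     # Illustrative sketch (not from the original package): regenerating tagged text
+     # after alignment. The mapped object below is hand-written for demonstration.
+     #
+     #     mapped = {
+     #         "spans": [{"start": 0, "end": 5, "labels": ["Greeting"]}],
+     #         "entities": [],
+     #         "task": {"data": {"text": "Hello World"}},
+     #     }
+     #     out = SpanAligner.update_mapped_with_rebuilt(
+     #         "Hello World", mapped, span_label_mapping={"greeting": "Greeting"}
+     #     )
+     #     # out["task"]["data"]["tagged_text"] == "<greeting>Hello</greeting> World"
+     #     # out["task"]["data"]["rebuild_stats"] == {"total": 1, "skipped_crossing": 0}
+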
+     #### From tagged text to task
+     @staticmethod
+     def get_annotations_from_tagged_text(
+         result: Union[dict, str],
+         *,
+         include_attachments: bool = True,
+         span_map: Optional[Dict[str, str]] = None,
+         ner_map: Optional[Dict[str, str]] = None,
+         class_map: Optional[Dict[str, str]] = None,
+         allowed_tags: Optional[List[str]] = None,
+     ) -> dict:
+         """
+         Convert a tagged result (with inline XML-like tags) into structured annotations.
+
+         Extracts spans and entities from tagged text by removing tags and tracking
+         character offsets in the resulting plain text. Supports nested tags and
+         custom tag-to-label mappings.
+
+         Args:
+             result: Input dictionary with a 'tagged_text' key, or the tagged text string itself.
+             include_attachments: Whether to include text content inside <attachment> tags.
+             span_map: Dictionary mapping tag names to span labels.
+             ner_map: Dictionary mapping tag names to entity labels.
+             class_map: Dictionary mapping document classifications to labels
+                 (unused here; applied in tagged_text_to_task).
+             allowed_tags: List of tag names to process. If None, derived from map keys.
+
+         Returns:
+             dict: A dictionary containing:
+                 - spans: List of span (segmentation) objects
+                 - entities: List of entity (NER) objects
+                 - plain_text: The text content with tags removed
+                 - tagged_text: The original tagged text used
+                 - document_classification: The classification from the input result (if any)
+
+         Notes:
+             - Spans are derived by removing tags while tracking character offsets in the
+               plain text. Nested tags are supported; spans may overlap.
+             - `span_map` lets you rename tags to match your LS label config.
+             - `allowed_tags` limits which tags are turned into spans. If None and no
+               maps are given, all tags are processed.
+         """
+         # Resolve tagged_text input
+         tagged_text = ""
+         doc_class = None
+         if isinstance(result, dict):
+             tagged_text = result.get("tagged_text", "")
+             doc_class = result.get("document_classification")
+         else:
+             tagged_text = str(result or "")
+
+         if not tagged_text:
+             raise ValueError("No tagged_text found in input result.")
+
+         # Default allowed tags: derive them from the provided maps
+         if allowed_tags is None and (span_map or ner_map):
+             # Safely handle None maps
+             s_map = span_map or {}
+             n_map = ner_map or {}
+             allowed_tags = list(n_map.keys()) + list(s_map.keys())
+
+         # Merge span_map and ner_map safely into annotation_map
+         annotation_map = {}
+         for mapping in (span_map, ner_map):
+             if mapping:
+                 annotation_map.update(mapping)
+
+         # Regex to capture bare tags like <tag> or </tag>
+         tag_re = re.compile(r"<(/?)([a-zA-Z_][a-zA-Z0-9_-]*)>")
+
+         plain_parts: List[str] = []
+         spans: List[dict] = []
+         entities: List[dict] = []
+
+         stack: List[Tuple[str, int]] = []  # (tag_name_lower, start_offset_in_plain)
+
+         pos_in = 0   # position in tagged_text
+         pos_out = 0  # position in the plain text we are building
+
+         def emit_text(s: str):
+             nonlocal pos_out
+             if not s:
+                 return
+             plain_parts.append(s)
+             pos_out += len(s)
+
+         # Attachment handling: when attachments are excluded, text inside
+         # <attachments>/<attachment> tags is not emitted
+         inside_attachments_level = 0
+
+         for m in tag_re.finditer(tagged_text):
+             # Emit any literal text before this tag
+             literal = tagged_text[pos_in:m.start()]
+             currently_inside_attachment = inside_attachments_level > 0
+
+             if include_attachments or not currently_inside_attachment:
+                 emit_text(literal)
+
+             is_closing = bool(m.group(1))
+             tag_name = m.group(2).lower()
+
+             # Track attachments nesting regardless of allowed_tags so we can drop their content when requested
+             if tag_name in ("attachments", "attachment"):
+                 if not is_closing:
+                     inside_attachments_level += 1
+                 else:
+                     inside_attachments_level = max(0, inside_attachments_level - 1)
+
+             # Handle the span stack only for allowed tags
+             if allowed_tags is None or tag_name in allowed_tags:
+                 if not is_closing:
+                     # Opening tag
+                     stack.append((tag_name, pos_out))
+                 else:
+                     # Closing tag: iterate backwards to find the last matching opening tag
+                     for i in range(len(stack) - 1, -1, -1):
+                         open_tag, start_off = stack[i]
+                         if open_tag == tag_name:
+                             # Pop all tags above the matching one (handle mismatched nesting)
+                             stack = stack[:i]
+                             end_off = pos_out
+
+                             # Create a span only if it has positive length
+                             if end_off > start_off:
+                                 full_span_text = ("".join(plain_parts))[start_off:end_off]
+
+                                 # Adjust start to skip leading newlines
+                                 adjusted_start = start_off
+                                 span_text = full_span_text
+
+                                 while span_text.startswith('\n'):
+                                     adjusted_start += 1
+                                     span_text = span_text[1:]
+
+                                 # Adjust end to skip trailing newlines
+                                 adjusted_end = end_off
+                                 while span_text.endswith('\n'):
+                                     adjusted_end -= 1
+                                     span_text = span_text[:-1]
+
+                                 # Only create a span if there's content after trimming
+                                 if adjusted_end > adjusted_start:
+                                     annotation_entry = {
+                                         "start": adjusted_start,
+                                         "end": adjusted_end,
+                                         "text": span_text,
+                                         "labels": [annotation_map.get(tag_name, tag_name) if annotation_map else tag_name]
+                                     }
+
+                                     if ner_map and tag_name in ner_map:
+                                         entities.append(annotation_entry)
+                                     else:
+                                         spans.append(annotation_entry)
+
+                             break
+                     # If no matching opening tag is found, ignore gracefully
+
+             pos_in = m.end()
+
+         # Emit any remaining tail text (unless it belongs to an excluded attachment)
+         tail = tagged_text[pos_in:]
+         if include_attachments or inside_attachments_level == 0:
+             emit_text(tail)
+
+         plain_text = "".join(plain_parts)
+
+         return {
+             "spans": spans,
+             "entities": entities,
+             "plain_text": plain_text,
+             "tagged_text": tagged_text,
+             "document_classification": doc_class
+         }
+
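+     # Illustrative sketch (not from the original package): extracting offsets from
+     # inline tags. The tag names and text below are made up.
+     #
+     #     out = SpanAligner.get_annotations_from_tagged_text(
+     #         "<greeting>Hello</greeting>, <name>Ada</name>!",
+     #         span_map={"greeting": "Greeting"},
+     #         ner_map={"name": "Person"},
+     #     )
+     #     # out["plain_text"] == "Hello, Ada!"
+     #     # out["spans"]    -> [{"start": 0, "end": 5, "text": "Hello", "labels": ["Greeting"]}]
+     #     # out["entities"] -> [{"start": 7, "end": 10, "text": "Ada", "labels": ["Person"]}]
+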
+     @staticmethod
+     def tagged_text_to_task(
+         result: Union[dict, str],
+         *,
+         include_attachments: bool = True,
+         span_map: Optional[Dict[str, str]] = None,
+         ner_map: Optional[Dict[str, str]] = None,
+         class_map: Optional[Dict[str, str]] = None,
+         allowed_tags: Optional[List[str]] = None,
+     ) -> dict:
+         """
+         Convert a tagged result into an uploader-ready Label Studio task.
+
+         Uses `get_annotations_from_tagged_text` to parse the input and formats
+         the output as expected by the Label Studio uploader class.
+
+         Args:
+             result: Input dictionary with a 'tagged_text' key, or the tagged text string.
+             include_attachments: Whether to include text content inside <attachment> tags.
+             span_map: Dictionary mapping tag names to span labels.
+             ner_map: Dictionary mapping tag names to entity labels.
+             class_map: Dictionary mapping document classifications to labels.
+             allowed_tags: List of tag names to process. If None, derived from map keys.
+
+         Returns:
+             dict: A dictionary ready for Label Studio import, containing:
+                 - task: Task data including text and metadata
+                 - spans: Extracted spans
+                 - entities: Extracted entities
+                 - labels: Classification labels (if applicable)
+         """
+         # Parse annotations using shared logic
+         parsed = SpanAligner.get_annotations_from_tagged_text(
+             result,
+             include_attachments=include_attachments,
+             span_map=span_map,
+             ner_map=ner_map,
+             class_map=class_map,
+             allowed_tags=allowed_tags
+         )
+
+         spans = parsed["spans"]
+         entities = parsed["entities"]
+         plain_text = parsed["plain_text"]
+         tagged_text = parsed["tagged_text"]
+         doc_class = parsed["document_classification"]
+
+         # Handle classification mapping
+         classification_labels = []
+         if doc_class and class_map and doc_class in class_map:
+             classification_labels = [class_map[doc_class]]
+
+         content = {
+             "task": {
+                 "data": {
+                     "text": plain_text,
+                     "tagged_text": tagged_text,
+                     "meta": {
+                         "segments": len(spans),
+                         "labels_present": sorted({(s.get("labels") or [""])[0] for s in spans}),
+                         "include_attachments": include_attachments,
+                         "document_classification": doc_class or ""
+                     }
+                 }
+             },
+             "spans": spans,
+             "labels": classification_labels,
+             "entities": entities
+         }
+
+         return content
+
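+     # Illustrative sketch (not from the original package): turning tagged model
+     # output into an uploader-ready task. The inputs below are made up.
+     #
+     #     content = SpanAligner.tagged_text_to_task(
+     #         {"tagged_text": "<greeting>Hi</greeting> all", "document_classification": "email"},
+     #         span_map={"greeting": "Greeting"},
+     #         class_map={"email": "Email"},
+     #     )
+     #     # content["task"]["data"]["text"] == "Hi all"
+     #     # content["labels"] == ["Email"]; content["spans"] holds one "Greeting" span
+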
+     #### From task to tagged text
+     @staticmethod
+     def rebuild_tagged_text(
+         original_text: str,
+         spans: Optional[List[Dict[str, Any]]] = None,
+         entities: Optional[List[Dict[str, Any]]] = None,
+         label_to_tag: Optional[Dict[str, str]] = None
+     ) -> Tuple[str, Dict[str, int]]:
+         """
+         Rebuild text with nested XML-style tags from span and entity annotations.
+
+         Creates properly nested tags from annotations, handling overlapping spans
+         by skipping crossing (non-nested) annotations to maintain valid XML structure.
+
+         Args:
+             original_text: The source text to add tags to.
+             spans: List of span annotations, each with:
+                 - "start": int - Starting character index
+                 - "end": int - Ending character index (exclusive)
+                 - "labels": List[str] - Label names (first one is used)
+             entities: List of entity annotations (same structure as spans).
+             label_to_tag: Optional mapping from label names to tag names.
+                 If a label is not in the mapping, it will be sanitized to
+                 create a valid tag name.
+
+         Returns:
+             Tuple[str, Dict[str, int]]: A tuple of:
+                 - str: The text with XML tags inserted (e.g., "<tag>text</tag>")
+                 - Dict with statistics:
+                     - "total": Total number of valid annotations processed
+                     - "skipped_crossing": Number of annotations skipped due to
+                       crossing (non-nested) overlaps
+
+         Note:
+             - Annotations with invalid positions (negative, overlapping bounds,
+               or exceeding text length) are silently skipped.
+             - For overlapping annotations, outer (longer) spans are preferred.
+             - Crossing annotations that would create invalid XML are skipped.
+
+         Example:
+             >>> text = "Hello World"
+             >>> spans = [{"start": 0, "end": 11, "labels": ["sentence"]}]
+             >>> entities = [{"start": 0, "end": 5, "labels": ["greeting"]}]
+             >>> result, stats = SpanAligner.rebuild_tagged_text(text, spans, entities)
+             >>> result
+             '<sentence><greeting>Hello</greeting> World</sentence>'
+         """
+         annotations: List[Dict[str, Any]] = []
+
+         def to_tag(lbls: List[str]) -> Optional[str]:
+             if not lbls:
+                 return None
+             lbl = lbls[0]
+             if label_to_tag and lbl in label_to_tag:
+                 return label_to_tag[lbl]
+             return SpanAligner._sanitize_label_to_tag(lbl)
+
+         def add_items(items: Optional[List[Dict[str, Any]]]):
+             for it in items or []:
+                 s = it.get("start")
+                 e = it.get("end")
+                 if not isinstance(s, int) or not isinstance(e, int) or s < 0 or e <= s or e > len(original_text):
+                     continue
+                 tag = to_tag(it.get("labels") or [])
+                 if not tag:
+                     continue
+                 annotations.append({
+                     "start": s,
+                     "end": e,
+                     "tag": str(tag),
+                     "length": e - s,
+                 })
+
+         add_items(spans)
+         add_items(entities)
+
+         # Sort by start asc, longer first (end desc) to open outer tags before inner ones
+         annotations.sort(key=lambda a: (a["start"], -a["length"]))
+
+         # Index starts and ends
+         starts: Dict[int, List[Dict[str, Any]]] = {}
+         for a in annotations:
+             starts.setdefault(a["start"], []).append(a)
+         for pos in starts:
+             starts[pos].sort(key=lambda a: -a["length"])  # longer first
+
+         ends: Dict[int, List[Dict[str, Any]]] = {}
+         for a in annotations:
+             ends.setdefault(a["end"], []).append(a)
+
+         event_positions = sorted({0, len(original_text), *starts.keys(), *ends.keys()})
+
+         pieces: List[str] = []
+         stack: List[Dict[str, Any]] = []
+         last = 0
+         skipped_cross = 0
+
+         for pos in event_positions:
+             if pos > last:
+                 pieces.append(original_text[last:pos])
+
+             # Close all tags that end here (LIFO)
+             while stack and stack[-1]["end"] == pos:
+                 top = stack.pop()
+                 pieces.append(f"</{top['tag']}>")
+
+             # Open tags that start here (outer first)
+             for ann in starts.get(pos, []):
+                 # Crossing check: if an open tag ends before ann does (not nested), skip ann
+                 if stack and ann["end"] > stack[-1]["end"]:
+                     skipped_cross += 1
+                     continue
+                 pieces.append(f"<{ann['tag']}>")
+                 stack.append(ann)
+
+             last = pos
+
+         # Tail
+         pieces.append(original_text[last:])
+
+         # Close any still-open tags (best-effort)
+         while stack:
+             top = stack.pop()
+             pieces.append(f"</{top['tag']}>")
+
+         return "".join(pieces), {"total": len(annotations), "skipped_crossing": skipped_cross}
+
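+     # Illustrative sketch (not from the original package): crossing (non-nested)
+     # annotations are dropped to keep the output well-formed.
+     #
+     #     text = "abcdef"
+     #     spans = [{"start": 0, "end": 4, "labels": ["x"]},
+     #              {"start": 2, "end": 6, "labels": ["y"]}]
+     #     SpanAligner.rebuild_tagged_text(text, spans)
+     #     # -> ('<x>abcd</x>ef', {'total': 2, 'skipped_crossing': 1})
+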
+     @staticmethod
+     def rebuild_tagged_text_from_task(task: Any, mapping: Dict[str, str]) -> str:
+         """
+         Generate tagged text from a Label Studio task's annotations.
+
+         Extracts annotations from the task and rebuilds the text with XML-style
+         tags around annotated spans.
+
+         Args:
+             task: A Label Studio task object with:
+                 - task.annotations: List of annotation objects
+                 - task.data: Dict containing a "text" key with the source text
+             mapping: A dictionary mapping label names to tag names to use in
+                 the output. Labels not in the mapping will be sanitized to
+                 create tag names.
+
+         Returns:
+             str: The text with XML-style tags inserted around annotated spans.
+
+         Example:
+             >>> # Returns something like: "<greeting>Hello</greeting>, World!"
+         """
+         extracted = SpanAligner._format_annotations(task)
+         text = task.data.get("text", "")
+
+         retagged, _ = SpanAligner.rebuild_tagged_text(
+             text,
+             spans=extracted["segmentation"],
+             entities=extracted["entities"],
+             label_to_tag=mapping
+         )
+
+         return retagged
+
+     #### Transpose tags back to original text
+     @staticmethod
+     def map_tags_to_original(
+         original_text: str,
+         tagged_text: str,
+         min_ratio: float = 0.8,
+         max_dist: int = 20,
+         enable_fuzzy: bool = False,
+         logging: bool = False
+     ) -> str:
+         """
+         Map spans from tagged text back to their positions in the original text.
+
+         Takes tagged text with XML-style tags and aligns the annotated spans
+         back to their positions in the provided original text. Uses exact,
+         regex-based, and fuzzy matching to find the best alignment.
+
+         Args:
+             original_text: The original untagged text.
+             tagged_text: The text with XML-style tags indicating spans.
+             min_ratio: Minimum similarity ratio (0.0-1.0) for fuzzy matching.
+                 Defaults to 0.8.
+             max_dist: Maximum character distance from the approximate position
+                 to consider a match valid. Defaults to 20.
+             enable_fuzzy: If True, falls back to fuzzy matching when exact and
+                 regex matching fail. Defaults to False.
+             logging: If True, prints detailed debug information during mapping.
+                 Defaults to False.
+
+         Returns:
+             str: The original text with tags re-applied at the aligned positions.
+         """
+         # First, extract spans/entities from tagged_text
+         temp_content = SpanAligner.tagged_text_to_task(
+             tagged_text,
+             include_attachments=True,
+             allowed_tags=None  # allow all tags
+         )
+
+         result_obj = {
+             "spans": temp_content.get("spans", []),
+             "entities": temp_content.get("entities", []),
+             "task": {
+                 "data": {
+                     "text": ""  # will be filled later
+                 }
+             }
+         }
+
+         # Now map spans/entities back to original_text
+         _, mapped = SpanAligner.map_spans_to_original(
+             original_text,
+             result_obj,
+             min_ratio=min_ratio,
+             max_dist=max_dist,
+             enable_fuzzy=enable_fuzzy,
+             logging=logging,
+         )
+
+         original_text_tagged, _ = SpanAligner.rebuild_tagged_text(
+             original_text,
+             spans=mapped.get("spans", []),
+             entities=mapped.get("entities", [])
+         )
+         return original_text_tagged
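+
+
+ if __name__ == "__main__":
+     # Illustrative smoke test (not part of the published API): round-trip a tagged
+     # snippet back onto an original text that differs only in whitespace.
+     demo_original = "Dear team,\n  please review the report."
+     demo_tagged = "<greeting>Dear team,</greeting> please review the report."
+     print(SpanAligner.map_tags_to_original(demo_original, demo_tagged))
+     # Expected output: "<greeting>Dear team,</greeting>\n  please review the report."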