span-aligner 0.1.0__py3-none-any.whl

@@ -0,0 +1,3 @@
1
+ from .aligner import SpanAligner
2
+
3
+ __all__ = ["SpanAligner"]
@@ -0,0 +1,1126 @@
1
+ """
2
+ SpanAligner Module
3
+ ==================
4
+
5
+ A utility module for aligning and mapping text spans between different text representations,
6
+ particularly useful for Label Studio annotation compatibility.
7
+
8
+ This module provides functionality to:
9
+ - Sanitize span boundaries to avoid special characters
10
+ - Find exact and fuzzy matches of text segments in original documents
11
+ - Map spans from one text representation to another
12
+ - Rebuild tagged text with nested annotations
13
+ - Merge result objects containing span annotations
14
+
15
+ Typical use case: When text has been modified (e.g., cleaned, translated) and annotations
16
+ need to be realigned to the original or modified text.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import re
22
+ from difflib import SequenceMatcher
23
+ from typing import Dict, Any, List, Tuple, Optional, Union
24
+ from rapidfuzz import fuzz
25
+
26
+ # Span sanitization helper for Label Studio compatibility
27
+ SPECIAL_CHARS = {"\n", "\r", "\t", " "}
28
+
29
+
30
+ class SpanAligner:
31
+ """
32
+ A utility class for aligning text spans between different text representations.
33
+
34
+ This class provides static methods for:
35
+ - Sanitizing span boundaries
36
+ - Finding exact and fuzzy text matches
37
+ - Mapping spans from extracted/modified text back to original text
38
+ - Rebuilding tagged text with proper nesting
39
+ - Merging annotation result objects
40
+
41
+ All methods are static and the class serves as a namespace for related functionality.
42
+
43
+ Example Usage:
44
+ >>> original = "Hello, World!"
45
+ >>> result_obj = {
46
+ ... "spans": [{"start": 0, "end": 5, "text": "Hello", "labels": ["greeting"]}],
47
+ ... "entities": [],
48
+ ... "task": {"data": {"text": ""}}
49
+ ... }
50
+ >>> success, mapped = SpanAligner.map_spans_to_original(original, result_obj)
51
+ """
52
+ @staticmethod
53
+ def sanitize_span(text: str, start: int, end: int) -> tuple[int, int]:
54
+ """
55
+ Adjust start/end indices so they do not land on special characters.
56
+
57
+ Moves start forward and end backward to avoid whitespace and control characters
58
+ at span boundaries, which is important for Label Studio compatibility.
59
+
60
+ Args:
61
+ text: The text containing the span.
62
+ start: The starting index of the span (inclusive).
63
+ end: The ending index of the span (exclusive).
64
+
65
+ Returns:
66
+ tuple[int, int]: A tuple of (sanitized_start, sanitized_end) indices.
67
+ Both values are clamped to [0, len(text)] and guaranteed to satisfy start <= end.
68
+
69
+ Example:
70
+ >>> SpanAligner.sanitize_span(" Hello ", 0, 9)
71
+ (2, 7) # Removes leading/trailing spaces
72
+ """
73
+ n = len(text)
74
+ s = max(0, min(start, n))
75
+ e = max(0, min(end, n))
76
+
77
+ # Move start forward while on a special char and s < e
78
+ while s < e and s < n and text[s] in SPECIAL_CHARS:
79
+ s += 1
80
+ # Move end backward while on a special char and s < e
81
+ while s < e and e > 0 and text[e-1] in SPECIAL_CHARS:
82
+ e -= 1
83
+
84
+ return s, e
85
+
86
+ @staticmethod
87
+ def _sequence_similarity(a: str, b: str) -> float:
88
+ """
89
+ Calculate the similarity ratio between two strings using SequenceMatcher.
90
+
91
+ Args:
92
+ a: First string to compare.
93
+ b: Second string to compare.
94
+
95
+ Returns:
96
+ float: Similarity ratio between 0.0 and 1.0, where 1.0 means identical strings.
97
+ Returns 0.0 if either string is empty.
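+
+ Example:
+ >>> SpanAligner._sequence_similarity("kitten", "sitten")
+ 0.8333333333333334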
98
+ """
99
+ if not a or not b:
100
+ return 0.0
101
+ return SequenceMatcher(None, a, b).ratio()
102
+
103
+
104
+ @staticmethod
105
+ def _find_exact(original_text: str, segment: str) -> List[int]:
106
+ """
107
+ Find all exact occurrences of a segment within the original text.
108
+
109
+ Args:
110
+ original_text: The text to search within.
111
+ segment: The exact substring to find.
112
+
113
+ Returns:
114
+ List[int]: A list of starting indices where the segment was found.
115
+ Empty list if no matches found.
116
+
117
+ Example:
118
+ >>> SpanAligner._find_exact("hello world hello", "hello")
119
+ [0, 12]
120
+ """
121
+ indices = []
122
+ start = 0
123
+ while True:
124
+ idx = original_text.find(segment, start)
125
+ if idx == -1:
126
+ break
127
+ indices.append(idx)
128
+ start = idx + 1
129
+ return indices
130
+
131
+ @staticmethod
132
+ def _best_fuzzy_in_window(
133
+ original_text: str,
134
+ segment: str,
135
+ start_hint: Optional[int],
136
+ max_search_slack: int = 20
137
+ ) -> Tuple[Optional[int], Optional[int], float]:
138
+ """
139
+ Find the best fuzzy match for a segment within a window around a hint position.
140
+ Uses RapidFuzz for scoring. Candidate substrings may differ from the segment
+ length by up to 20 percent (at least 5 characters). Returns a tuple of
+ (best_start, best_end, best_ratio), or (None, None, 0.0) if the segment is
+ empty or the search window contains no text.
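+
+ Example:
+ >>> SpanAligner._best_fuzzy_in_window("hello world", "wurld", start_hint=6)
+ (6, 11, 0.8)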
141
+ """
142
+ if not segment:
143
+ return None, None, 0.0
144
+
145
+ # Calculate window bounds
146
+ if start_hint is None:
147
+ left = 0
148
+ right = len(original_text)
149
+ else:
150
+ left = max(0, start_hint - max_search_slack)
151
+ # Add segment length + slack to the window end
152
+ right = min(len(original_text), start_hint + len(segment) + 2 * max_search_slack)
153
+
154
+ window = original_text[left:right]
155
+
156
+ if not window:
157
+ return None, None, 0.0
158
+
159
+ best_ratio = 0.0
160
+ best_start = None
161
+ best_end = None
162
+
163
+ seg_len = len(segment)
164
+ # Allow length variation (slack)
165
+ # We allow candidates to be +/- 20% length of segment, but at least +/- 5 chars
166
+ slack_len = max(5, int(seg_len * 0.2))
167
+ min_len = max(1, seg_len - slack_len)
168
+ max_len = seg_len + slack_len
169
+
170
+ len_window = len(window)
171
+
172
+ # Sliding window search
173
+ for i in range(len_window):
174
+ # Optimization: If the remaining window is shorter than min_len, stop
175
+ if i + min_len > len_window:
176
+ break
177
+
178
+ # Limit candidate end to avoid checking excessively long strings
179
+ end_limit = min(i + max_len, len_window) + 1
180
+
181
+ for j in range(i + min_len, end_limit):
182
+ candidate = window[i:j]
183
+
184
+ # Use rapidfuzz ratio
185
+ ratio = fuzz.ratio(segment, candidate) / 100.0
186
+
187
+ if ratio > best_ratio:
188
+ best_ratio = ratio
189
+ best_start = left + i
190
+ best_end = left + j
191
+
192
+ if best_ratio == 1.0:
193
+ break
194
+ if best_ratio == 1.0:
195
+ break
196
+
197
+ if best_start is None:
198
+ return None, None, 0.0
199
+
200
+ # Sanitize using the original SpanAligner helper
201
+ best_start, best_end = SpanAligner.sanitize_span(original_text, best_start, best_end)
202
+
203
+ return best_start, best_end, best_ratio
204
+
205
+ @staticmethod
206
+ def _regex_word_sequence(
207
+ original_text: str,
208
+ segment: str,
209
+ start_hint: Optional[int] = None,
210
+ max_search_slack: int = 20
211
+ ) -> Tuple[Optional[int], Optional[int]]:
212
+ """
213
+ Find a segment in the original text using regex-based word sequence matching.
214
+
215
+ This method tokenizes the segment into words and punctuation, then builds a
216
+ tolerant regex pattern that allows for varying whitespace/separators between
217
+ tokens. This is useful for matching text that may have different formatting.
218
+
219
+ Args:
220
+ original_text: The text to search within.
221
+ segment: The text segment to find (will be tokenized).
222
+ start_hint: Approximate starting position to prioritize searching around.
223
+ If provided, searches near this position first before falling back to
224
+ full text search.
225
+ max_search_slack: Maximum distance from start_hint to search.
226
+ Default is 20 characters.
227
+
228
+ Returns:
229
+ Tuple[Optional[int], Optional[int]]: A tuple of (start, end) indices if found,
230
+ or (None, None) if no match found.
231
+
232
+ Example:
233
+ >>> # Matches "hello world" even with different whitespace
234
+ >>> SpanAligner._regex_word_sequence("hello   world", "hello world")
235
+ (0, 13)
236
+ """
237
+ # Tokenize into words and punctuation, keeping punctuation tokens
238
+ # Words: one or more word chars; Punct: any single non-word, non-space char
239
+ tokens = re.findall(r"\w+|[^\w\s]", segment)
240
+ if not tokens:
241
+ return None, None
242
+
243
+ # Build a tolerant pattern that matches tokens in order, allowing non-word separators/newlines after each
244
+ # Keep punctuation characters explicitly in the pattern
245
+ escaped = list(map(re.escape, tokens))
246
+ pattern = r"(?s)" + r"\W*".join(escaped)
247
+ try:
248
+ regex = re.compile(pattern)
249
+ except re.error:
250
+ return None, None
251
+
252
+ # If we have a hint, first search in a bounded region around it
253
+ if isinstance(start_hint, int):
254
+ left = max(0, start_hint - max_search_slack)
255
+ # allow for extra room to the right in case of many separators
256
+ right = min(len(original_text), start_hint + max(max_search_slack, len(segment) * 2))
257
+ subset = original_text[left:right]
258
+ m = regex.search(subset)
259
+ if m:
260
+ return left + m.start(), left + m.end()
261
+
262
+ # Fallback: search entire text
263
+ m = regex.search(original_text)
264
+
265
+ if not m:
266
+ return None, None
267
+ return m.start(), m.end()
268
+
269
+ @staticmethod
270
+ def map_spans_to_original(
271
+ original_text: str,
272
+ result_obj: Dict[str, Any],
273
+ min_ratio: float = 0.90,
274
+ logging: bool = False,
275
+ max_dist: int = 20,
276
+ enable_fuzzy: bool = False,
277
+ ) -> Tuple[bool, Dict[str, Any]]:
278
+ """
279
+ Map spans from a result object to their positions in the original text.
280
+
281
+ This is the main alignment method that attempts to find the correct positions
282
+ of annotated spans in the original text. It uses multiple strategies:
283
+ 1. Exact matching (fastest, most reliable)
284
+ 2. Regex-based word sequence matching (handles whitespace variations)
285
+ 3. Fuzzy matching (optional, for handling minor text differences)
286
+
287
+ Args:
288
+ original_text: The original/target text to map spans onto.
289
+ result_obj: A dictionary containing annotation data with the following structure:
290
+ {
291
+ "spans": [{
292
+ "start": int, # Approximate start position
293
+ "end": int, # Approximate end position
294
+ "text": str, # The text content of the span
295
+ "labels": [str] # List of label names
296
+ }, ...],
297
+ "entities": [...], # Same structure as spans
298
+ "task": {
299
+ "data": {"text": str} # Will be updated with original_text
300
+ }
301
+ }
302
+ min_ratio: Minimum similarity ratio (0.0-1.0) for fuzzy matching.
303
+ Default is 0.90.
304
+ logging: If True, prints debug information during alignment.
305
+ Default is False.
306
+ max_dist: Maximum allowed distance between approximate and actual
307
+ start positions. Matches further than this are rejected.
308
+ Default is 20 characters.
+ enable_fuzzy: If True, fall back to sliding-window fuzzy matching when
+ exact and regex matching both fail. Default is False.
309
+
310
+ Returns:
311
+ Tuple[bool, Dict[str, Any]]: A tuple of:
312
+ - bool: True if all spans were successfully aligned, False otherwise.
313
+ - Dict: Updated result_obj with mapped spans. Each span now includes:
314
+ - start/end: Mapped positions (or None if unmatched)
315
+ - text: Matched text from original (or None if unmatched)
316
+ - status: "exact", "regex", "fuzzy", or "unmatched"
317
+ - similarity: Match similarity score (0.0-1.0)
318
+ - detected: The cleaned segment text that was searched for
319
+ - approx_start: Original approximate start position
320
+
321
+ Example:
322
+ >>> original = "Hello, World!"
323
+ >>> result = {
324
+ ... "spans": [{"start": 0, "end": 5, "text": "Hello", "labels": ["greeting"]}],
325
+ ... "entities": [],
326
+ ... "task": {"data": {"text": ""}}
327
+ ... }
328
+ >>> success, mapped = SpanAligner.map_spans_to_original(original, result)
329
+ >>> success
330
+ True
331
+ >>> mapped["spans"][0]["status"]
332
+ 'exact'
333
+ """
334
+ input_spans: List[Dict[str, Any]] = result_obj.get("spans", [])
335
+ input_entities: List[Dict[str, Any]] = result_obj.get("entities", [])
336
+
337
+
338
+ def realign(items: List[Dict[str, Any]], enable_fuzzy: bool = False) -> Tuple[bool, List[Dict[str, Any]]]:
339
+ mapped: List[Dict[str, Any]] = []
340
+ all_aligned = True
341
+ for span in items:
342
+
343
+ approx_start = span.get("start", 0)
344
+ segment = span.get("text", "") or ""
345
+ labels = span.get("labels", [])
346
+ clean_segment = segment.strip()
347
+ chosen_end = None
348
+
349
+ if logging:
350
+ print("\n\n\n=======NEW SPAN==============")
351
+ print(f"label: {labels}")
352
+ print(f"cleaned span: '{clean_segment}' from original segment: '{segment}'")
353
+
354
+
355
+ exact_indices = SpanAligner._find_exact(original_text, clean_segment)
356
+ chosen_start = None
357
+ similarity = 1.0 if exact_indices else 0.0
358
+ status = "unmatched"
359
+
360
+ # First try exact match
361
+ if exact_indices:
362
+ chosen_start = min(exact_indices, key=lambda i: abs(i - approx_start))
363
+ chosen_end = chosen_start + len(clean_segment)
364
+ status = "exact"
365
+
366
+ # Next try regex-based word sequence match (search near approx_start for all words in order)
367
+ else:
368
+ step = 20
369
+ slacks = sorted(set(range(0, max_dist + 1, step)) | {max_dist})
370
+
371
+ # 1. Try Regex progressively
372
+ for current_slack in slacks:
373
+ regex_start, regex_end = SpanAligner._regex_word_sequence(original_text, clean_segment, start_hint=approx_start, max_search_slack=current_slack)
374
+
375
+ if regex_start is not None:
376
+ chosen_start = regex_start
377
+ chosen_end = regex_end
378
+ # similarity compared against the fully matched span
379
+ similarity = SpanAligner._sequence_similarity(clean_segment, original_text[regex_start:regex_end])
380
+ status = "regex"
381
+ break
382
+
383
+ # 2. Try fuzzy matching if regex failed
384
+ if status == "unmatched" and enable_fuzzy:
385
+
386
+ step = 20
387
+ slacks = sorted(set(range(0, max_dist + 1, step)) | {max_dist})
388
+
389
+ for current_slack in slacks:
390
+ fuzzy_start, fuzzy_end, fuzzy_ratio = SpanAligner._best_fuzzy_in_window(original_text, clean_segment, start_hint=approx_start, max_search_slack=current_slack)
391
+
392
+ if fuzzy_start is not None and fuzzy_ratio >= min_ratio:
393
+ chosen_start = fuzzy_start
394
+ chosen_end = fuzzy_end
395
+ similarity = fuzzy_ratio
396
+ status = "fuzzy"
397
+ break
398
+
399
+
400
+
401
+ # Check distance threshold
402
+ if chosen_start is not None and abs(chosen_start - approx_start) > max_dist:
403
+ if logging:
404
+ print(f"Match rejected due to distance: {abs(chosen_start - approx_start)} > {max_dist}")
405
+ chosen_start = None
406
+ chosen_end = None
407
+ status = "unmatched"
408
+
409
+ if chosen_start is not None:
410
+ # Sanitize the mapped span to avoid leading/trailing special characters
411
+ chosen_start, chosen_end = SpanAligner.sanitize_span(original_text, chosen_start, chosen_end)
412
+ matched_text = original_text[chosen_start:chosen_end]
413
+ else:
414
+ if logging:
415
+ print("No match found")
416
+
417
+ matched_text = None
418
+ all_aligned = False
419
+
420
+ if logging:
421
+ print("=====================")
422
+ print(f"span: {span} segment: {clean_segment}")
423
+ print(f"status:{status} similarity: {similarity}")
424
+ print(f"pre sanit: {(chosen_start,chosen_end, len(clean_segment))}" if chosen_start is not None else "pre sanit: None")
425
+ if chosen_start is not None:
426
+ print(f"updated positions: (start: {chosen_start}; end: {chosen_end})")
427
+ print(f"Extracted in original: '{original_text[chosen_start:chosen_end]}'")
428
+
429
+ mapped.append({
430
+ "start": chosen_start,
431
+ "end": chosen_end,
432
+ "text": matched_text,
433
+ "labels": labels,
434
+ "status": status,
435
+ "similarity": round(similarity, 4),
436
+ "detected": clean_segment,
437
+ "approx_start": approx_start,
438
+ })
439
+ return all_aligned, mapped
440
+
441
+ updated = dict(result_obj)
442
+ all_spans_aligned, updated["spans"] = realign(input_spans, enable_fuzzy)
443
+ all_entities_aligned, updated["entities"] = realign(input_entities, enable_fuzzy)
444
+ updated["task"]["data"]["text"] = original_text
445
+ return all_spans_aligned and all_entities_aligned, updated
446
+
447
+
448
+ @staticmethod
449
+ def merge_result_objects(
450
+ base: Dict[str, Any],
451
+ addition: Dict[str, Any],
452
+ span_from_name: str,
453
+ ner_from_name: str
454
+ ) -> Dict[str, Any]:
455
+ """
456
+ Merge two result objects by combining their span and entity lists.
457
+
458
+ Creates a new dictionary based on the base object, then appends spans and
459
+ entities from the addition object.
460
+
461
+ Args:
462
+ base: The base result object to merge into.
463
+ addition: The result object to merge from.
464
+ span_from_name: The key name for span annotations (e.g., "spans", "segmentation").
465
+ ner_from_name: The key name for NER/entity annotations (e.g., "entities").
466
+
467
+ Returns:
468
+ Dict[str, Any]: A new merged dictionary with combined spans and entities.
469
+ The base object is shallow-copied, so nested objects may still be shared.
470
+
471
+ Example:
472
+ >>> base = {"spans": [{"text": "A"}], "entities": []}
473
+ >>> addition = {"spans": [{"text": "B"}], "entities": [{"text": "C"}]}
474
+ >>> merged = SpanAligner.merge_result_objects(base, addition, "spans", "entities")
475
+ >>> len(merged["spans"])
476
+ 2
477
+ """
478
+ merged = dict(base)
479
+
480
+ addition_spans = addition.get(span_from_name, [])
481
+ base_spans = merged.get(span_from_name, [])
482
+ merged[span_from_name] = base_spans + addition_spans
483
+
484
+ addition_ner = addition.get(ner_from_name, [])
485
+ base_ner = merged.get(ner_from_name, [])
486
+ merged[ner_from_name] = base_ner + addition_ner
487
+ return merged
488
+
489
+ @staticmethod
490
+ def _invert_label_map(tag_to_label: Dict[str, str]) -> Dict[str, str]:
491
+ """
492
+ Invert a tag-to-label mapping to create a label-to-tag mapping.
493
+
494
+ Args:
495
+ tag_to_label: A dictionary mapping tag names to label names.
496
+ Can be None, in which case an empty dict is returned.
497
+
498
+ Returns:
499
+ Dict[str, str]: A dictionary mapping label names to tag names.
500
+
501
+ Example:
502
+ >>> SpanAligner._invert_label_map({"loc": "Location", "per": "Person"})
503
+ {'Location': 'loc', 'Person': 'per'}
504
+ """
505
+ return {v: k for k, v in (tag_to_label or {}).items()}
506
+
507
+ @staticmethod
508
+ def _sanitize_label_to_tag(label: str) -> str:
509
+ """
510
+ Convert a human-readable label to a sanitized XML-safe tag name.
511
+
512
+ Lowercases the label and replaces each run of non-alphanumeric
513
+ characters (including spaces) with a single underscore.
514
+
515
+ Args:
516
+ label: The human-readable label to convert.
517
+
518
+ Returns:
519
+ str: A sanitized tag name suitable for use in XML/HTML tags.
520
+ Returns "span" if the result would be empty.
521
+
522
+ Example:
523
+ >>> SpanAligner._sanitize_label_to_tag("My Label (Special)")
524
+ 'my_label_special'
525
+ """
526
+ # Fallback: convert human label to tag-like form
527
+ tag = label.strip().lower()
528
+ tag = re.sub(r"[^a-z0-9]+", "_", tag).strip("_")
529
+ return tag or "span"
530
+
531
+
532
+ @staticmethod
533
+ def _format_annotations(task: Any) -> Dict[str, Any]:
534
+ """
535
+ Extract and format annotations from a Label Studio task object.
536
+
537
+ Parses the first annotation from the task and categorizes the results
538
+ into classification choices, entities, and segmentation spans.
539
+ Falls back to predictions if no annotations are available.
540
+
541
+ Args:
542
+ task: A Label Studio task object with an `annotations` attribute.
543
+ Expected structure:
544
+ task.annotations = [{
545
+ "result": [
546
+ {"type": "choices", "from_name": "type", "value": {"choices": [...]}},
547
+ {"type": "labels", "from_name": "entities", "value": {...}},
548
+ {"type": "labels", "from_name": "segmentation", "value": {...}}
549
+ ]
550
+ }]
551
+ If annotations are empty, predictions with the same structure
552
+ will be used as a fallback.
553
+
554
+ Returns:
555
+ Dict[str, Any]: A dictionary with three keys:
556
+ - "classification": List of classification choices
557
+ - "entities": List of entity annotation values
558
+ - "segmentation": List of segmentation span values
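+
+ Example (using a simple stand-in for the task object):
+ >>> from types import SimpleNamespace
+ >>> task = SimpleNamespace(annotations=[{"result": [
+ ... {"type": "labels", "from_name": "entities", "value": {"start": 0, "end": 5}}
+ ... ]}], predictions=[])
+ >>> SpanAligner._format_annotations(task)["entities"]
+ [{'start': 0, 'end': 5}]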
559
+ """
560
+ # Try annotations first, fall back to predictions if empty
561
+ results = []
562
+ if task.annotations:
563
+ results = task.annotations[0].get("result", [])
564
+
565
+ # If no annotations, try predictions
566
+ if not results and hasattr(task, 'predictions') and task.predictions:
567
+ results = task.predictions[0].result or []
568
+
569
+ classification = []
570
+ entities = []
571
+ spans = []
572
+
573
+ for ann in results:
574
+ ann_type = ann.get("type")
575
+ from_name = ann.get("from_name")
576
+ value = ann.get("value", {})
577
+
578
+ if ann_type == "choices" and from_name == "type":
579
+ if choices := value.get("choices"):
580
+ classification = choices
581
+ elif ann_type == "labels":
582
+ if from_name == "entities":
583
+ entities.append(value)
584
+ elif from_name == "segmentation":
585
+ spans.append(value)
586
+
587
+ return {
588
+ "classification": classification,
589
+ "entities": entities,
590
+ "segmentation": spans
591
+ }
592
+
593
+
594
+ @staticmethod
595
+ def update_mapped_with_rebuilt(
596
+ original_text: str,
597
+ mapped: Dict[str, Any],
598
+ span_label_mapping: Optional[Dict[str, str]] = None,
599
+ ner_label_mapping: Optional[Dict[str, str]] = None,
600
+ overwrite: bool = True
601
+ ) -> Dict[str, Any]:
602
+ """
603
+ Update a mapped result object with rebuilt tagged text.
604
+
605
+ Takes a mapped result object (output from map_spans_to_original) and
606
+ generates tagged text from its spans and entities, storing the result
607
+ in the task data.
608
+
609
+ Args:
610
+ original_text: The original text to use for rebuilding tags.
611
+ mapped: A mapped result object containing:
612
+ - "spans": List of span annotations
613
+ - "entities": List of entity annotations
614
+ - "task": {"data": {...}} - Task data to update
615
+ span_label_mapping: Optional tag-to-label mapping for spans.
616
+ Will be inverted to create label-to-tag mapping.
617
+ ner_label_mapping: Optional tag-to-label mapping for NER entities.
618
+ Will be inverted to create label-to-tag mapping.
619
+ overwrite: If True, overwrites "tagged_text" in task data.
620
+ If False, stores result in "tagged_text_unified" instead.
621
+ When overwriting, the original tagged_text is preserved in
622
+ "tagged_text_original" if it exists.
623
+
624
+ Returns:
625
+ Dict[str, Any]: The same mapped object (modified in place) with:
626
+ - task.data.tagged_text (or tagged_text_unified): The rebuilt tagged text
627
+ - task.data.tagged_text_original: Original tagged_text if overwritten
628
+ - task.data.rebuild_stats: Statistics from rebuild operation
629
+
630
+ Example:
631
+ >>> mapped = {"spans": [...], "entities": [...], "task": {"data": {}}}
632
+ >>> updated = SpanAligner.update_mapped_with_rebuilt("Hello World", mapped)
633
+ >>> "tagged_text" in updated["task"]["data"]
634
+ True
635
+ """
636
+ data = mapped.get("task", {}).get("data", {})
637
+ # text = data.get("text", "")
638
+ label_to_tag = {}
639
+ label_to_tag.update(SpanAligner._invert_label_map(span_label_mapping or {}))
640
+ label_to_tag.update(SpanAligner._invert_label_map(ner_label_mapping or {}))
641
+
642
+ rebuilt, stats = SpanAligner.rebuild_tagged_text(
643
+ original_text,
644
+ mapped.get("spans", []),
645
+ mapped.get("entities", []),
646
+ label_to_tag=label_to_tag,
647
+ )
648
+
649
+ # Preserve original and write unified
650
+ if "tagged_text" in data and not data.get("tagged_text_original"):
651
+ data["tagged_text_original"] = data.get("tagged_text")
652
+ if overwrite:
653
+ data["tagged_text"] = rebuilt
654
+ else:
655
+ data["tagged_text_unified"] = rebuilt
656
+ data["rebuild_stats"] = stats
657
+ return mapped
658
+
659
+
660
+
661
+ #### From tagged text to task
662
+ @staticmethod
663
+ def get_annotations_from_tagged_text(
664
+ result: Union[dict, str],
665
+ *,
666
+ include_attachments: bool = True,
667
+ span_map: Optional[Dict[str, str]] = None,
668
+ ner_map: Optional[Dict[str, str]] = None,
669
+ class_map: Optional[Dict[str, str]] = None,
670
+ allowed_tags: Optional[List[str]] = None,
671
+ ) -> dict:
672
+ """
673
+ Convert a tagged result (with inline XML-like tags) into structured annotations.
674
+
675
+ Extracts spans and entities from tagged text by removing tags and tracking
676
+ character offsets in the resulting plain text. Supports nested tags and
677
+ custom tag-to-label mappings.
678
+
679
+ Args:
680
+ result: Input dictionary with 'tagged_text' key, or the tagged text string itself.
681
+ include_attachments: Whether to include text content inside <attachment> tags.
682
+ span_map: Dictionary mapping tag names to span labels.
683
+ ner_map: Dictionary mapping tag names to entity labels.
684
+ class_map: Dictionary mapping document classifications to labels.
685
+ allowed_tags: List of tag names to process. If None, derived from map keys.
686
+
687
+ Returns:
688
+ dict: A dictionary containing:
689
+ - spans: List of span (segmentation) objects
690
+ - entities: List of entity (NER) objects
691
+ - plain_text: The text content with tags removed
692
+ - tagged_text: The original tagged text used
693
+ - document_classification: The classification from input result (if any)
694
+
695
+ Notes:
696
+ - Spans are derived by removing tags while tracking character offsets in the
697
+ plain text. Nested tags are supported; spans may overlap.
698
+ - `span_map` lets you rename tags to match your LS label config.
699
+ - `allowed_tags` limits which tags are turned into spans. If None, uses the
700
+ combined keys of span_map and ner_map.
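+
+ Example:
+ >>> out = SpanAligner.get_annotations_from_tagged_text(
+ ... "Hi <b>there</b>!", span_map={"b": "Bold"})
+ >>> out["plain_text"]
+ 'Hi there!'
+ >>> out["spans"]
+ [{'start': 3, 'end': 8, 'text': 'there', 'labels': ['Bold']}]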
701
+ """
702
+
703
+ # Resolve tagged_text input
704
+ tagged_text = ""
705
+ doc_class = None
706
+ if isinstance(result, dict):
707
+ tagged_text = result.get("tagged_text", "")
708
+ doc_class = result.get("document_classification")
709
+ else:
710
+ tagged_text = str(result or "")
711
+
712
+ if not tagged_text:
713
+ raise ValueError("No tagged_text found in input result.")
714
+
715
+ # Default allowed tags: derived from the provided tag maps
716
+ if allowed_tags is None:
717
+ # Safely handle None maps
718
+ s_map = span_map or {}
719
+ n_map = ner_map or {}
720
+ allowed_tags = list(n_map.keys()) + list(s_map.keys())
721
+
722
+ # Merge span_map and ner_map safely into annotation_map
723
+ annotation_map = {}
724
+ for mapping in (span_map, ner_map):
725
+ if mapping:
726
+ annotation_map.update(mapping)
727
+
728
+ # If annotation_map ends up empty, initialize with identity mapping
729
+ if not annotation_map:
730
+ annotation_map = {t: t for t in allowed_tags}
731
+
732
+ # Regex to capture bare tags like <tag> or </tag>
733
+ tag_re = re.compile(r"<(/?)([a-zA-Z_][a-zA-Z0-9_-]*)>")
734
+
735
+ plain_parts: List[str] = []
736
+ spans: List[dict] = []
737
+ entities: List[dict] = []
738
+
739
+ stack: List[Tuple[str, int]] = [] # (tag_name_lower, start_offset_in_plain)
740
+
741
+ pos_in = 0 # position in tagged_text
742
+ pos_out = 0 # position in plain text we are building
743
+
744
+ def emit_text(s: str):
745
+ nonlocal pos_out
746
+ if not s:
747
+ return
748
+ plain_parts.append(s)
749
+ pos_out += len(s)
750
+
751
+ # Attachment handling: when skipping attachments, text inside <attachments>/<attachment> is not emitted
752
+ inside_attachments_level = 0
753
+
754
+ for m in tag_re.finditer(tagged_text):
755
+ # Emit any literal text before this tag
756
+ literal = tagged_text[pos_in:m.start()]
757
+ inside_attachment = inside_attachments_level > 0
758
+
759
+ if include_attachments or not inside_attachment:
760
+ emit_text(literal)
761
+
762
+ is_closing = bool(m.group(1))
763
+ tag_name = m.group(2).lower()
764
+
765
+ # Track attachments nesting regardless of allowed_tags so we can drop their content when requested
766
+ if tag_name in ("attachments", "attachment"):
767
+ if not is_closing:
768
+ inside_attachments_level += 1
769
+ else:
770
+ inside_attachments_level = max(0, inside_attachments_level - 1)
771
+
772
+ # Handle span stack only for allowed tags
773
+ if tag_name in allowed_tags:
774
+ if not is_closing:
775
+ # Opening tag
776
+ stack.append((tag_name, pos_out))
777
+ else:
778
+ # Closing tag — find the last matching opening tag
779
+ # Iterate backwards to find the matching opening tag
780
+ found_open = False
781
+ for i in range(len(stack) - 1, -1, -1):
782
+ open_tag, start_off = stack[i]
783
+ if open_tag == tag_name:
784
+ # Pop all tags above the matching one (handle mismatched nesting)
785
+ stack = stack[:i]
786
+ end_off = pos_out
787
+
788
+ # Create a span only if it has positive length
789
+ if end_off > start_off:
790
+ full_span_text = ("".join(plain_parts))[start_off:end_off]
791
+
792
+ # Adjust start to skip leading newlines
793
+ adjusted_start = start_off
794
+ span_text = full_span_text
795
+
796
+ while span_text.startswith('\n'):
797
+ adjusted_start += 1
798
+ span_text = span_text[1:]
799
+
800
+ # Adjust end to skip trailing newlines
801
+ adjusted_end = end_off
802
+ while span_text.endswith('\n'):
803
+ adjusted_end -= 1
804
+ span_text = span_text[:-1]
805
+
806
+ # Only create span if there's content after trimming
807
+ if adjusted_end > adjusted_start:
808
+ annotation_entry = {
809
+ "start": adjusted_start,
810
+ "end": adjusted_end,
811
+ "text": span_text,
812
+ "labels": [annotation_map.get(tag_name, tag_name)]
813
+ }
814
+
815
+ if ner_map and tag_name in ner_map:
816
+ entities.append(annotation_entry)
817
+ else:
818
+ spans.append(annotation_entry)
819
+ found_open = True
820
+ break
821
+ # If no matching opening tag found, ignore gracefully
822
+
823
+ pos_in = m.end()
824
+
825
+ # Emit remaining tail text
826
+ tail = tagged_text[pos_in:]
827
+ if include_attachments or inside_attachments_level == 0:
828
+ emit_text(tail)
829
+
830
+ plain_text = "".join(plain_parts)
831
+
832
+ return {
833
+ "spans": spans,
834
+ "entities": entities,
835
+ "plain_text": plain_text,
836
+ "tagged_text": tagged_text,
837
+ "document_classification": doc_class
838
+ }
839
+
840
+ @staticmethod
841
+ def tagged_text_to_task(
842
+ result: Union[dict, str],
843
+ *,
844
+ include_attachments: bool = True,
845
+ span_map: Optional[Dict[str, str]] = None,
846
+ ner_map: Optional[Dict[str, str]] = None,
847
+ class_map: Optional[Dict[str, str]] = None,
848
+ allowed_tags: Optional[List[str]] = None,
849
+ ) -> dict:
850
+ """
851
+ Convert a tagged result into an uploader-ready Label Studio task.
852
+
853
+ Uses `get_annotations_from_tagged_text` to parse the input and formats
854
+ the output as expected by the Label Studio uploader class.
855
+
856
+ Args:
857
+ result: Input dictionary with 'tagged_text' key, or the tagged text string.
858
+ include_attachments: Whether to include text content inside <attachment> tags.
859
+ span_map: Dictionary mapping tag names to span labels.
860
+ ner_map: Dictionary mapping tag names to entity labels.
861
+ class_map: Dictionary mapping document classifications to labels.
862
+ allowed_tags: List of tag names to process. If None, derived from map keys.
863
+
864
+ Returns:
865
+ dict: A dictionary ready for Label Studio import, containing:
866
+ - task: Task data including text and metadata
867
+ - spans: Extracted spans
868
+ - entities: Extracted entities
869
+ - labels: Classification labels (if applicable)
870
+ """
871
+ # Parse annotations using shared logic
872
+ parsed = SpanAligner.get_annotations_from_tagged_text(
873
+ result,
874
+ include_attachments=include_attachments,
875
+ span_map=span_map,
876
+ ner_map=ner_map,
877
+ class_map=class_map,
878
+ allowed_tags=allowed_tags
879
+ )
880
+
881
+ spans = parsed["spans"]
882
+ entities = parsed["entities"]
883
+ plain_text = parsed["plain_text"]
884
+ tagged_text = parsed["tagged_text"]
885
+ doc_class = parsed["document_classification"]
886
+
887
+ # Handle classification mapping
888
+ classification_labels = []
889
+ if doc_class and class_map and doc_class in class_map:
890
+ classification_labels = [class_map[doc_class]]
891
+
892
+ content = {
893
+ "task": {
894
+ "data": {
895
+ "text": plain_text,
896
+ "tagged_text": tagged_text,
897
+ "meta": {
898
+ "segments": len(spans),
899
+ "labels_present": sorted({(s.get("labels") or [""])[0] for s in spans}),
900
+ "include_attachments": include_attachments,
901
+ "document_classification": doc_class or ""
902
+ }
903
+ }
904
+ },
905
+ "spans": spans,
906
+ "labels": classification_labels,
907
+ "entities": entities
908
+ }
909
+
910
+ return content
911
+
912
+
913
+ #### From task to tagged text
914
+ @staticmethod
915
+ def rebuild_tagged_text(
916
+ original_text: str,
917
+ spans: List[Dict[str, Any]],
918
+ entities: List[Dict[str, Any]],
919
+ label_to_tag: Optional[Dict[str, str]] = None
920
+ ) -> Tuple[str, Dict[str, int]]:
921
+ """
922
+ Rebuild text with nested XML-style tags from span and entity annotations.
923
+
924
+ Creates properly nested tags from annotations, handling overlapping spans
925
+ by skipping crossing (non-nested) annotations to maintain valid XML structure.
926
+
927
+ Args:
928
+ original_text: The source text to add tags to.
929
+ spans: List of span annotations, each with:
930
+ - "start": int - Starting character index
931
+ - "end": int - Ending character index (exclusive)
932
+ - "labels": List[str] - Label names (first one is used)
933
+ entities: List of entity annotations (same structure as spans).
934
+ label_to_tag: Optional mapping from label names to tag names.
935
+ If a label is not in the mapping, it will be sanitized to
936
+ create a valid tag name.
937
+
938
+ Returns:
939
+ Tuple[str, Dict[str, int]]: A tuple of:
940
+ - str: The text with XML tags inserted (e.g., "<tag>text</tag>")
941
+ - Dict with statistics:
942
+ - "total": Total number of valid annotations processed
943
+ - "skipped_crossing": Number of annotations skipped due to
944
+ crossing (non-nested) overlaps
945
+
946
+ Note:
947
+ - Annotations with invalid positions (negative, overlapping bounds,
948
+ or exceeding text length) are silently skipped.
949
+ - For overlapping annotations, outer (longer) spans are preferred.
950
+ - Crossing annotations that would create invalid XML are skipped.
951
+
952
+ Example:
953
+ >>> text = "Hello World"
954
+ >>> spans = [{"start": 0, "end": 11, "labels": ["sentence"]}]
955
+ >>> entities = [{"start": 0, "end": 5, "labels": ["greeting"]}]
956
+ >>> result, stats = SpanAligner.rebuild_tagged_text(text, spans, entities)
957
+ >>> result
958
+ '<sentence><greeting>Hello</greeting> World</sentence>'
959
+ """
960
+ annotations: List[Dict[str, Any]] = []
961
+
962
+ def to_tag(lbls: List[str]) -> Optional[str]:
963
+ if not lbls:
964
+ return None
965
+ lbl = lbls[0]
966
+ if label_to_tag and lbl in label_to_tag:
967
+ return label_to_tag[lbl]
968
+ return SpanAligner._sanitize_label_to_tag(lbl)
969
+
970
+ def add_items(items: List[Dict[str, Any]]):
971
+ for it in items or []:
972
+ s = it.get("start")
973
+ e = it.get("end")
974
+ if not isinstance(s, int) or not isinstance(e, int) or s < 0 or e <= s or e > len(original_text):
975
+ continue
976
+ tag = to_tag(it.get("labels") or [])
977
+ if not tag:
978
+ continue
979
+ annotations.append({
980
+ "start": s,
981
+ "end": e,
982
+ "tag": str(tag),
983
+ "length": e - s,
984
+ })
985
+
986
+ add_items(spans)
987
+ add_items(entities)
988
+
989
+ # Sort: by start asc, longer first (end desc) to open outers before inners
990
+ annotations.sort(key=lambda a: (a["start"], -a["length"]))
991
+
992
+ # Index starts and ends
993
+ starts: Dict[int, List[Dict[str, Any]]] = {}
994
+ for a in annotations:
995
+ starts.setdefault(a["start"], []).append(a)
996
+ for pos in starts:
997
+ starts[pos].sort(key=lambda a: -a["length"]) # longer first
998
+
999
+ ends: Dict[int, List[Dict[str, Any]]] = {}
1000
+ for a in annotations:
1001
+ ends.setdefault(a["end"], []).append(a)
1002
+
1003
+ event_positions = sorted({0, len(original_text), *starts.keys(), *ends.keys()})
1004
+
1005
+ pieces: List[str] = []
1006
+ stack: List[Dict[str, Any]] = []
1007
+ last = 0
1008
+ skipped_cross = 0
1009
+
1010
+ for pos in event_positions:
1011
+ if pos > last:
1012
+ pieces.append(original_text[last:pos])
1013
+
1014
+ # Close all tags that end here (LIFO)
1015
+ while stack and stack[-1]["end"] == pos:
1016
+ top = stack.pop()
1017
+ pieces.append(f"</{top['tag']}>")
1018
+
1019
+ # Open tags that start here (outer first)
1020
+ for ann in starts.get(pos, []):
1021
+ # Crossing check: if an open tag exists with end < ann.end (not nested), skip ann
1022
+ if stack and ann["end"] > stack[-1]["end"]:
1023
+ skipped_cross += 1
1024
+ continue
1025
+ pieces.append(f"<{ann['tag']}>")
1026
+ stack.append(ann)
1027
+
1028
+ last = pos
1029
+
1030
+ # Tail
1031
+ pieces.append(original_text[last:])
1032
+
1033
+ # Close any still-open tags (best-effort)
1034
+ while stack:
1035
+ top = stack.pop()
1036
+ pieces.append(f"</{top['tag']}>")
1037
+
1038
+ return "".join(pieces), {"total": len(annotations), "skipped_crossing": skipped_cross}
1039
+
1040
+ @staticmethod
1041
+ def rebuild_tagged_text_from_task(task: Any, mapping: Dict[str, str]) -> str:
1042
+ """
1043
+ Generate tagged text from a Label Studio task's annotations.
1044
+
1045
+ Extracts annotations from the task and rebuilds the text with XML-style
1046
+ tags around annotated spans.
1047
+
1048
+ Args:
1049
+ task: A Label Studio task object with:
1050
+ - task.annotations: List of annotation objects
1051
+ - task.data: Dict containing "text" key with the source text
1052
+ mapping: A dictionary mapping label names to tag names to use in
1053
+ the output. Labels not in the mapping will be sanitized to
1054
+ create tag names.
1055
+
1056
+ Returns:
1057
+ str: The text with XML-style tags inserted around annotated spans.
1058
+
1059
+ Example:
1060
+ >>> # Returns something like: "<greeting>Hello</greeting>, World!"
1061
+ """
1062
+ extracted = SpanAligner._format_annotations(task)
1063
+ text = task.data.get("text", "")
1064
+
1065
+ retagged, _ = SpanAligner.rebuild_tagged_text(
1066
+ text,
1067
+ spans=extracted["segmentation"],
1068
+ entities=extracted["entities"],
1069
+ label_to_tag=mapping
1070
+ )
1071
+
1072
+ return retagged
1073
+
1074
+
1075
+ #### Transpose tags back to original text
1076
+ @staticmethod
1077
+ def map_tags_to_original(
1078
+ original_text: str,
1079
+ tagged_text: str,
1080
+ min_ratio: float = 0.8,
1081
+ max_dist: int = 20,
1082
+ logging: bool = False
1083
+ ) -> str:
1084
+ """
1085
+ Map spans from tagged text back to their positions in the original text.
1086
+
1087
+ Takes tagged text with XML-style tags and aligns the annotated spans
1088
+ back to their positions in the provided original text. Uses exact,
1089
+ regex-based, and fuzzy matching to find the best alignment.
1090
+
1091
+ Args:
1092
+ original_text: The original untagged text.
1093
+ tagged_text: The text with XML-style tags indicating spans.
1094
+ min_ratio: Minimum similarity ratio (0.0-1.0) for fuzzy matching.
1095
+ Defaults to 0.8.
1096
+ max_dist: Maximum character distance from approximate position
1097
+ to consider a match valid. Defaults to 20.
1098
+ logging: If True, prints detailed debug information during mapping.
1099
+ Defaults to False.
1100
+
+ Returns:
+ str: The original text with tags re-inserted at the aligned span positions.
+ """
+ # First, extract spans/entities from tagged_text
1101
+ temp_content = SpanAligner.tagged_text_to_task(
1102
+ tagged_text,
1103
+ include_attachments=True,
1104
+ # allowed_tags=None would derive an empty tag set here (no maps are given),
+ # so collect every tag name present in the tagged text instead
+ allowed_tags=sorted({t.lower() for t in re.findall(r"</?([a-zA-Z_][a-zA-Z0-9_-]*)>", tagged_text)})
1105
+ )
1106
+
1107
+ result_obj = {
1108
+ "spans": temp_content.get("spans", []),
1109
+ "entities": temp_content.get("entities", []),
1110
+ "task": {
1111
+ "data": {
1112
+ "text": "" # will be filled later
1113
+ }
1114
+ }
1115
+ }
1116
+
1117
+ # Now map spans/entities back to original_text
1118
+ success, mapped = SpanAligner.map_spans_to_original(
1119
+ original_text,
1120
+ result_obj,
1121
+ min_ratio=min_ratio,
1122
+ max_dist=max_dist,
1123
+ logging=logging
1124
+ )
1125
+
1126
+ # Rebuild tagged text from the mapped spans/entities; unmatched items
+ # (with start/end of None) are skipped by rebuild_tagged_text
+ rebuilt, _ = SpanAligner.rebuild_tagged_text(
+ original_text,
+ mapped.get("spans", []),
+ mapped.get("entities", []),
+ )
+ return rebuilt
@@ -0,0 +1,122 @@
1
+ Metadata-Version: 2.4
2
+ Name: span-aligner
3
+ Version: 0.1.0
4
+ Summary: A utility for aligning and mapping text spans between different text representations.
5
+ License: MIT
6
+ Requires-Python: >=3.8
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: rapidfuzz>=3.0.0
10
+ Provides-Extra: dev
11
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
12
+ Dynamic: license-file
13
+
14
+ # Span Aligner
15
+
16
+ A utility for aligning and mapping text spans between different text representations, particularly useful for Label Studio annotation compatibility.
17
+
18
+ ## Features
19
+
20
+ - Sanitize span boundaries to avoid special characters.
21
+ - Find exact and fuzzy matches of text segments in original documents.
22
+ - Map spans from one text representation to another.
23
+ - Rebuild tagged text with nested annotations.
24
+ - Merge result objects containing span annotations.
25
+
26
+ ## Installation
27
+
28
+ Install from source:
29
+
30
+ ```bash
31
+ pip install .
32
+ ```
33
+
34
+ For development:
35
+
36
+ ```bash
37
+ pip install -e ".[dev]"
38
+ ```
39
+
40
+ ## Usage
41
+
42
+ ```python
43
+ from span_aligner import SpanAligner
44
+
45
+ original = "Hello, World!"
46
+ result_obj = {
47
+ "spans": [{"start": 0, "end": 5, "text": "Hello", "labels": ["greeting"]}],
48
+ "entities": [],
49
+ "task": {"data": {"text": ""}}
50
+ }
51
+
52
+ success, mapped = SpanAligner.map_spans_to_original(original, result_obj)
53
+ print(mapped)
54
+ ```
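+
+ ### Sanitize Span Boundaries
+
+ Trim a span so it does not start or end on whitespace or control characters (useful before sending spans to Label Studio). For example:
+
+ ```python
+ text = "  Hello  "
+ start, end = SpanAligner.sanitize_span(text, 0, len(text))
+ print((start, end), repr(text[start:end]))
+ # Output: (2, 7) 'Hello'
+ ```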
55
+
56
+ ### Map Tags to Original
57
+
58
+ Align annotated spans from a tagged string back to their positions in the original text, preserving the original text exactly as written (including its whitespace and any mistakes).
59
+
60
+ ```python
61
+ original_text = "The quick brown fox jumps\n\n over the dog."
62
+ # Imagine the text was slightly modified or translated, but tags are present
63
+ tagged_text = "The <adj>quick</adj> brown fox jumps over the <animal>dog</animal>."
64
+
65
+ mapped_tagged_text = SpanAligner.map_tags_to_original(
66
+ original_text=original_text,
67
+ tagged_text=tagged_text,
68
+ min_ratio=0.8
69
+ )
70
+ print(mapped_tagged_text)
71
+ # Output might look like: "The <adj>quick</adj> brown fox jumps\n\n over the <animal>dog</animal>."
72
+ # (If original text differed slightly, tags would be placed on best matching spans)
73
+ ```
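+
+ ### Merge Result Objects
+
+ Combine the span and entity lists of two result objects. The base object is shallow-copied, so nested objects may still be shared:
+
+ ```python
+ base = {"spans": [{"text": "A"}], "entities": []}
+ addition = {"spans": [{"text": "B"}], "entities": [{"text": "C"}]}
+
+ merged = SpanAligner.merge_result_objects(base, addition, "spans", "entities")
+ print(len(merged["spans"]), len(merged["entities"]))
+ # Output: 2 1
+ ```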
74
+
75
+ ### Rebuild Tagged Text
76
+
77
+ Reconstruct a string with XML-like tags from raw text and span/entity lists.
78
+
79
+ ```python
80
+ text = "Hello World"
81
+ spans = [{"start": 0, "end": 11, "labels": ["sentence"]}]
82
+ entities = [{"start": 6, "end": 11, "labels": ["location"]}]
83
+
84
+ tagged, stats = SpanAligner.rebuild_tagged_text(text, spans, entities)
85
+ print(tagged)
86
+ # Output: <sentence>Hello <location>World</location></sentence>
87
+ ```
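+
+ Overlapping annotations that cross rather than nest would produce invalid XML, so they are skipped and counted in the returned stats. A small illustration:
+
+ ```python
+ text = "Hello World"
+ spans = [{"start": 0, "end": 8, "labels": ["a"]}]
+ entities = [{"start": 6, "end": 11, "labels": ["b"]}]
+
+ tagged, stats = SpanAligner.rebuild_tagged_text(text, spans, entities)
+ print(tagged)
+ # Output: <a>Hello Wo</a>rld
+ print(stats)
+ # Output: {'total': 2, 'skipped_crossing': 1}
+ ```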
88
+
89
+ ### Rebuild Tagged Text from Task
90
+
91
+ Generate tagged text directly from a Label Studio task object.
92
+
93
+ ```python
94
+ # Assuming 'task' is a Label Studio task object (or similar structure)
95
+ # with .data['text'] and .annotations attributes
96
+ mapping = {"Location": "loc", "Person": "per"}
97
+
98
+ tagged_output = SpanAligner.rebuild_tagged_text_from_task(task, mapping)
99
+ print(tagged_output)
100
+ ```
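+
+ ### Update a Mapped Result with Rebuilt Tagged Text
+
+ After mapping spans onto the original text, you can regenerate the tagged text and store it (together with rebuild statistics) in the task data. Continuing from the basic usage example above:
+
+ ```python
+ updated = SpanAligner.update_mapped_with_rebuilt(original, mapped)
+ print(updated["task"]["data"]["tagged_text"])
+ # Output: <greeting>Hello</greeting>, World!
+ print(updated["task"]["data"]["rebuild_stats"])
+ # Output: {'total': 1, 'skipped_crossing': 0}
+ ```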
101
+
102
+ ### Get Annotations from Tagged Text
103
+
104
+ Extract structured spans and entities from a string with inline tags.
105
+
106
+ ```python
107
+ tagged_input = "Visit <loc>Paris</loc> and see the <landmark>Eiffel Tower</landmark>."
108
+
109
+ annotations = SpanAligner.get_annotations_from_tagged_text(
110
+ tagged_input,
111
+ ner_map={"loc": "Location", "landmark": "Location"}
112
+ )
113
+
114
+ print(annotations["entities"])
115
+ # Output:
116
+ # [
117
+ # {"start": 6, "end": 11, "text": "Paris", "labels": ["Location"]},
118
+ # {"start": 24, "end": 36, "text": "Eiffel Tower", "labels": ["Location"]}
119
+ # ]
120
+ print(annotations["plain_text"])
121
+ # Output: "Visit Paris and see the Eiffel Tower."
122
+ ```
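+
+ ### Tagged Text to Task
+
+ Convert tagged text directly into an uploader-ready Label Studio task structure (plain text, metadata, spans, and entities):
+
+ ```python
+ content = SpanAligner.tagged_text_to_task(
+ "Visit <loc>Paris</loc>.",
+ span_map={"loc": "Location"}
+ )
+
+ print(content["task"]["data"]["text"])
+ # Output: Visit Paris.
+ print(content["spans"])
+ # Output: [{'start': 6, 'end': 11, 'text': 'Paris', 'labels': ['Location']}]
+ ```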
@@ -0,0 +1,7 @@
1
+ span_aligner/__init__.py,sha256=ERLPBS6aad_17IPcoMDkVqi7lrSJxuHgqrdh69EN9xI,63
2
+ span_aligner/aligner.py,sha256=HYo5CvmC9WrnsJo7bFFcUHoJaZhiNf4aZ7pOEzd2MlA,47658
3
+ span_aligner-0.1.0.dist-info/licenses/LICENSE,sha256=TqCZNrAXPrgWq9k95te7bOOyXztjxjxWQWnsHSqT8SM,1096
4
+ span_aligner-0.1.0.dist-info/METADATA,sha256=nqV2rIVPmFCR2aiZ3KanmFx_ZlYfShz6JMp1SVRHld8,3583
5
+ span_aligner-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
6
+ span_aligner-0.1.0.dist-info/top_level.txt,sha256=syADug30Z0JSDJXnan6CIBWuI4mCdpghyDa48kj69VY,13
7
+ span_aligner-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Stefaan Vercoutere
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ span_aligner