span-aligner 0.1.0__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- span_aligner-0.1.2/PKG-INFO +169 -0
- span_aligner-0.1.2/README.md +156 -0
- {span_aligner-0.1.0 → span_aligner-0.1.2}/pyproject.toml +1 -1
- {span_aligner-0.1.0 → span_aligner-0.1.2}/span_aligner/aligner.py +19 -15
- span_aligner-0.1.2/span_aligner.egg-info/PKG-INFO +169 -0
- span_aligner-0.1.0/PKG-INFO +0 -122
- span_aligner-0.1.0/README.md +0 -109
- span_aligner-0.1.0/span_aligner.egg-info/PKG-INFO +0 -122
- {span_aligner-0.1.0 → span_aligner-0.1.2}/LICENSE +0 -0
- {span_aligner-0.1.0 → span_aligner-0.1.2}/setup.cfg +0 -0
- {span_aligner-0.1.0 → span_aligner-0.1.2}/span_aligner/__init__.py +0 -0
- {span_aligner-0.1.0 → span_aligner-0.1.2}/span_aligner.egg-info/SOURCES.txt +0 -0
- {span_aligner-0.1.0 → span_aligner-0.1.2}/span_aligner.egg-info/dependency_links.txt +0 -0
- {span_aligner-0.1.0 → span_aligner-0.1.2}/span_aligner.egg-info/requires.txt +0 -0
- {span_aligner-0.1.0 → span_aligner-0.1.2}/span_aligner.egg-info/top_level.txt +0 -0
- {span_aligner-0.1.0 → span_aligner-0.1.2}/tests/test_span_aligner.py +0 -0
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: span-aligner
|
|
3
|
+
Version: 0.1.2
|
|
4
|
+
Summary: A utility for aligning and mapping text spans between different text representations.
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.8
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Dist: rapidfuzz>=3.0.0
|
|
10
|
+
Provides-Extra: dev
|
|
11
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
12
|
+
Dynamic: license-file
|
|
13
|
+
|
|
14
|
+
# Span Aligner
|
|
15
|
+
|
|
16
|
+
A utility for aligning and mapping text spans between different text representations, particularly useful for Label Studio annotation compatibility.
|
|
17
|
+
|
|
18
|
+
## Features
|
|
19
|
+
|
|
20
|
+
- Sanitize span boundaries to avoid special characters.
|
|
21
|
+
- Find exact and fuzzy matches of text segments in original documents.
|
|
22
|
+
- Map spans from one text representation to another.
|
|
23
|
+
- Rebuild tagged text with nested annotations.
|
|
24
|
+
- Merge result objects containing span annotations.
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
Install from PyPI:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install span-aligner
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
## Usage
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
### Get Annotations from Tagged Text
|
|
39
|
+
|
|
40
|
+
Extract structured spans and entities from a string with inline tags.
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
tagged_input = "<motivation><administrative_body>Environmental Committee</administrative_body> discussed the <impact_location>central park</impact_location> renovation on <publication_date>2025-12-15</publication_date>.</motivation>"
|
|
44
|
+
|
|
45
|
+
ner_map = {
|
|
46
|
+
"administrative_body": "ADMINISTRATIVE BODY",
|
|
47
|
+
"publication_date": "PUBLICATION DATE",
|
|
48
|
+
"impact_location": "PRIMARY LOCATION"
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
span_map ={
|
|
52
|
+
"motivation" : "MOTIVATION"
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
annotations = SpanAligner.get_annotations_from_tagged_text(
|
|
56
|
+
tagged_input,
|
|
57
|
+
ner_map=ner_map,
|
|
58
|
+
span_map=span_map
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
print(annotations["entities"])
|
|
62
|
+
# Output:
|
|
63
|
+
#[
|
|
64
|
+
# {'start': 0, 'end': 23, 'text': 'Environmental Committee', 'labels': ['ADMINISTRATIVE BODY']},
|
|
65
|
+
# {'start': 38, 'end': 50, 'text': 'central park', 'labels': ['PRIMARY LOCATION']},
|
|
66
|
+
# {'start': 65, 'end': 75, 'text': '2025-12-15', 'labels': ['PUBLICATION DATE']}
|
|
67
|
+
#]
|
|
68
|
+
|
|
69
|
+
print(annotations["spans"])
|
|
70
|
+
# Output:
|
|
71
|
+
#[
|
|
72
|
+
# {'start': 0, 'end': 76, 'text': 'Environmental Committee discussed the central park renovation on 2025-12-15.', 'labels': ['MOTIVATION']}
|
|
73
|
+
#]
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
print(annotations["plain_text"])
|
|
77
|
+
# Output: "Environmental Committee discussed the central park renovation on 2025-12-15."
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### Rebuild Tagged Text
|
|
81
|
+
|
|
82
|
+
Reconstruct a string with XML-like tags from raw text and span/entity lists.
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
text = "On 2026-01-12, the Budget Committee finalized the annual report."
|
|
86
|
+
# Spans corresponding to 'MOTIVATION' label, mapped to 'motivation' tag
|
|
87
|
+
spans = [{"start": 0, "end": 64, "labels": ["motivation"]}]
|
|
88
|
+
# Entities corresponding to 'ADMINISTRATIVE BODY' label, mapped to 'administrative_body' tag
|
|
89
|
+
entities = [{"start": 19, "end": 35, "labels": ["administrative_body"]}]
|
|
90
|
+
|
|
91
|
+
tagged, stats = SpanAligner.rebuild_tagged_text(text, spans, entities)
|
|
92
|
+
print(tagged)
|
|
93
|
+
# Output: <motivation>On 2026-01-12, the <administrative_body>Budget Committee</administrative_body> finalized the annual report.</motivation>
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Rebuild Tagged Text from Task
|
|
97
|
+
|
|
98
|
+
Generate tagged text directly from a Label Studio task object.
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
# Assuming 'task' is a Label Studio task object (or similar structure)
|
|
102
|
+
# with .data['text'] and .annotations attributes
|
|
103
|
+
mapping = {
|
|
104
|
+
"DECISION": "decision",
|
|
105
|
+
"LEGAL FRAMEWORK": "legal_framework",
|
|
106
|
+
"EXPIRATION DATE": "expiry_date"
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
tagged_output = SpanAligner.rebuild_tagged_text_from_task(task, mapping)
|
|
110
|
+
print(tagged_output)
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### Map Tags to Original
|
|
114
|
+
|
|
115
|
+
Align annotated spans from a tagged string back to their positions in the original text, keeping the mistakes and text as written in the original.
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
original_text = "Budget Budget Committee met on 2026-01-12 to view\n\n the central park prject."
|
|
119
|
+
# Imagine the text was slightly modified or translated, but tags are present
|
|
120
|
+
tagged_text = "<administrative_body>Budget Committee</administrative_body> met on <publication_date>2026-01-12</publication_date> to review the <impact_location>central park</impact_location> project."
|
|
121
|
+
|
|
122
|
+
mapped_tagged_text = SpanAligner.map_tags_to_original(
|
|
123
|
+
original_text=original_text,
|
|
124
|
+
tagged_text=tagged_text,
|
|
125
|
+
min_ratio=0.7
|
|
126
|
+
)
|
|
127
|
+
print(mapped_tagged_text)
|
|
128
|
+
# Output might look like: "Budget <administrative_body>Budget Committee</administrative_body> met on <publication_date>2026-01-12</publication_date> to view\n\n the <impact_location>central park</impact_location> prject."
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
### Map Tags to Original and Get Positions
|
|
134
|
+
|
|
135
|
+
Map the tags onto the original text, then extract the entities with the correct labels and positions.
|
|
136
|
+
|
|
137
|
+
```python
|
|
138
|
+
original_text = "Legal basis: Art. 5. The Env. Committee met on 2026-01-12."
|
|
139
|
+
tagged_text = "Legal basis: <article>Art. 5</article>. The <administrative_body>Environmental Committee</administrative_body> met on <session_date>2026-01-12</session_date>."
|
|
140
|
+
|
|
141
|
+
# 1. Map tags to the noisy original text
|
|
142
|
+
mapped_tagged_text = SpanAligner.map_tags_to_original(
|
|
143
|
+
original_text=original_text,
|
|
144
|
+
tagged_text=tagged_text,
|
|
145
|
+
min_ratio=0.7
|
|
146
|
+
)
|
|
147
|
+
|
|
148
|
+
# 2. Extract annotations using the mapping
|
|
149
|
+
ner_label_mapping = {
|
|
150
|
+
"administrative_body": "ADMINISTRATIVE BODY",
|
|
151
|
+
"session_date": "SESSION DATE",
|
|
152
|
+
"article": "ARTICLE"
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
annotations = SpanAligner.get_annotations_from_tagged_text(
|
|
156
|
+
mapped_tagged_text,
|
|
157
|
+
ner_map=ner_label_mapping
|
|
158
|
+
)
|
|
159
|
+
|
|
160
|
+
print(annotations["entities"])
|
|
161
|
+
# Output:
|
|
162
|
+
# [
|
|
163
|
+
# {'start': 13, 'end': 19, 'text': 'Art. 5', 'labels': ['ARTICLE']},
|
|
164
|
+
# {'start': 47, 'end': 57, 'text': '2026-01-12', 'labels': ['SESSION DATE']}
|
|
165
|
+
# ]
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
# Span Aligner
|
|
2
|
+
|
|
3
|
+
A utility for aligning and mapping text spans between different text representations, particularly useful for Label Studio annotation compatibility.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- Sanitize span boundaries to avoid special characters.
|
|
8
|
+
- Find exact and fuzzy matches of text segments in original documents.
|
|
9
|
+
- Map spans from one text representation to another.
|
|
10
|
+
- Rebuild tagged text with nested annotations.
|
|
11
|
+
- Merge result objects containing span annotations.
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
Install from PyPI:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install span-aligner
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
## Usage
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
### Get Annotations from Tagged Text
|
|
26
|
+
|
|
27
|
+
Extract structured spans and entities from a string with inline tags.
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
tagged_input = "<motivation><administrative_body>Environmental Committee</administrative_body> discussed the <impact_location>central park</impact_location> renovation on <publication_date>2025-12-15</publication_date>.</motivation>"
|
|
31
|
+
|
|
32
|
+
ner_map = {
|
|
33
|
+
"administrative_body": "ADMINISTRATIVE BODY",
|
|
34
|
+
"publication_date": "PUBLICATION DATE",
|
|
35
|
+
"impact_location": "PRIMARY LOCATION"
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
span_map ={
|
|
39
|
+
"motivation" : "MOTIVATION"
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
annotations = SpanAligner.get_annotations_from_tagged_text(
|
|
43
|
+
tagged_input,
|
|
44
|
+
ner_map=ner_map,
|
|
45
|
+
span_map=span_map
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
print(annotations["entities"])
|
|
49
|
+
# Output:
|
|
50
|
+
#[
|
|
51
|
+
# {'start': 0, 'end': 23, 'text': 'Environmental Committee', 'labels': ['ADMINISTRATIVE BODY']},
|
|
52
|
+
# {'start': 38, 'end': 50, 'text': 'central park', 'labels': ['PRIMARY LOCATION']},
|
|
53
|
+
# {'start': 65, 'end': 75, 'text': '2025-12-15', 'labels': ['PUBLICATION DATE']}
|
|
54
|
+
#]
|
|
55
|
+
|
|
56
|
+
print(annotations["spans"])
|
|
57
|
+
# Output:
|
|
58
|
+
#[
|
|
59
|
+
# {'start': 0, 'end': 76, 'text': 'Environmental Committee discussed the central park renovation on 2025-12-15.', 'labels': ['MOTIVATION']}
|
|
60
|
+
#]
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
print(annotations["plain_text"])
|
|
64
|
+
# Output: "Environmental Committee discussed the central park renovation on 2025-12-15."
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Rebuild Tagged Text
|
|
68
|
+
|
|
69
|
+
Reconstruct a string with XML-like tags from raw text and span/entity lists.
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
text = "On 2026-01-12, the Budget Committee finalized the annual report."
|
|
73
|
+
# Spans corresponding to 'MOTIVATION' label, mapped to 'motivation' tag
|
|
74
|
+
spans = [{"start": 0, "end": 64, "labels": ["motivation"]}]
|
|
75
|
+
# Entities corresponding to 'ADMINISTRATIVE BODY' label, mapped to 'administrative_body' tag
|
|
76
|
+
entities = [{"start": 19, "end": 35, "labels": ["administrative_body"]}]
|
|
77
|
+
|
|
78
|
+
tagged, stats = SpanAligner.rebuild_tagged_text(text, spans, entities)
|
|
79
|
+
print(tagged)
|
|
80
|
+
# Output: <motivation>On 2026-01-12, the <administrative_body>Budget Committee</administrative_body> finalized the annual report.</motivation>
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### Rebuild Tagged Text from Task
|
|
84
|
+
|
|
85
|
+
Generate tagged text directly from a Label Studio task object.
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
# Assuming 'task' is a Label Studio task object (or similar structure)
|
|
89
|
+
# with .data['text'] and .annotations attributes
|
|
90
|
+
mapping = {
|
|
91
|
+
"DECISION": "decision",
|
|
92
|
+
"LEGAL FRAMEWORK": "legal_framework",
|
|
93
|
+
"EXPIRATION DATE": "expiry_date"
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
tagged_output = SpanAligner.rebuild_tagged_text_from_task(task, mapping)
|
|
97
|
+
print(tagged_output)
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### Map Tags to Original
|
|
101
|
+
|
|
102
|
+
Align annotated spans from a tagged string back to their positions in the original text, keeping the mistakes and text as written in the original.
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
original_text = "Budget Budget Committee met on 2026-01-12 to view\n\n the central park prject."
|
|
106
|
+
# Imagine the text was slightly modified or translated, but tags are present
|
|
107
|
+
tagged_text = "<administrative_body>Budget Committee</administrative_body> met on <publication_date>2026-01-12</publication_date> to review the <impact_location>central park</impact_location> project."
|
|
108
|
+
|
|
109
|
+
mapped_tagged_text = SpanAligner.map_tags_to_original(
|
|
110
|
+
original_text=original_text,
|
|
111
|
+
tagged_text=tagged_text,
|
|
112
|
+
min_ratio=0.7
|
|
113
|
+
)
|
|
114
|
+
print(mapped_tagged_text)
|
|
115
|
+
# Output might look like: "Budget <administrative_body>Budget Committee</administrative_body> met on <publication_date>2026-01-12</publication_date> to view\n\n the <impact_location>central park</impact_location> prject."
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
### Map Tags to Original and Get Positions
|
|
121
|
+
|
|
122
|
+
Map the tags onto the original text, then extract the entities with the correct labels and positions.
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
original_text = "Legal basis: Art. 5. The Env. Committee met on 2026-01-12."
|
|
126
|
+
tagged_text = "Legal basis: <article>Art. 5</article>. The <administrative_body>Environmental Committee</administrative_body> met on <session_date>2026-01-12</session_date>."
|
|
127
|
+
|
|
128
|
+
# 1. Map tags to the noisy original text
|
|
129
|
+
mapped_tagged_text = SpanAligner.map_tags_to_original(
|
|
130
|
+
original_text=original_text,
|
|
131
|
+
tagged_text=tagged_text,
|
|
132
|
+
min_ratio=0.7
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
# 2. Extract annotations using the mapping
|
|
136
|
+
ner_label_mapping = {
|
|
137
|
+
"administrative_body": "ADMINISTRATIVE BODY",
|
|
138
|
+
"session_date": "SESSION DATE",
|
|
139
|
+
"article": "ARTICLE"
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
annotations = SpanAligner.get_annotations_from_tagged_text(
|
|
143
|
+
mapped_tagged_text,
|
|
144
|
+
ner_map=ner_label_mapping
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
print(annotations["entities"])
|
|
148
|
+
# Output:
|
|
149
|
+
# [
|
|
150
|
+
# {'start': 13, 'end': 19, 'text': 'Art. 5', 'labels': ['ARTICLE']},
|
|
151
|
+
# {'start': 47, 'end': 57, 'text': '2026-01-12', 'labels': ['SESSION DATE']}
|
|
152
|
+
# ]
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "span-aligner"
|
|
7
|
-
version = "0.1.0"
|
|
7
|
+
version = "0.1.2"
|
|
8
8
|
description = "A utility for aligning and mapping text spans between different text representations."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -442,7 +442,7 @@ class SpanAligner:
|
|
|
442
442
|
all_spans_aligned, updated["spans"] = realign(input_spans, enable_fuzzy)
|
|
443
443
|
all_entities_aligned, updated["entities"] = realign(input_entities, enable_fuzzy)
|
|
444
444
|
updated["task"]["data"]["text"] = original_text
|
|
445
|
-
return all_spans_aligned and all_entities_aligned
|
|
445
|
+
return updated, all_spans_aligned and all_entities_aligned
|
|
446
446
|
|
|
447
447
|
|
|
448
448
|
@staticmethod
|
|
@@ -713,7 +713,7 @@ class SpanAligner:
|
|
|
713
713
|
raise ValueError("No tagged_text found in input result.")
|
|
714
714
|
|
|
715
715
|
# Default allowed tags (from your SYSTEM/USER prompts)
|
|
716
|
-
if allowed_tags is None:
|
|
716
|
+
if allowed_tags is None and (span_map or ner_map):
|
|
717
717
|
# Safely handle None maps
|
|
718
718
|
s_map = span_map or {}
|
|
719
719
|
n_map = ner_map or {}
|
|
@@ -725,10 +725,6 @@ class SpanAligner:
|
|
|
725
725
|
if mapping:
|
|
726
726
|
annotation_map.update(mapping)
|
|
727
727
|
|
|
728
|
-
# If annotation_map ends up empty, initialize with identity mapping
|
|
729
|
-
if not annotation_map:
|
|
730
|
-
annotation_map = {t: t for t in allowed_tags}
|
|
731
|
-
|
|
732
728
|
# Regex to capture bare tags like <tag> or </tag>
|
|
733
729
|
tag_re = re.compile(r"<(/?)([a-zA-Z_][a-zA-Z0-9_-]*)>")
|
|
734
730
|
|
|
@@ -770,7 +766,7 @@ class SpanAligner:
|
|
|
770
766
|
inside_attachments_level = max(0, inside_attachments_level - 1)
|
|
771
767
|
|
|
772
768
|
# Handle span stack only for allowed tags
|
|
773
|
-
if tag_name in allowed_tags:
|
|
769
|
+
if allowed_tags is None or tag_name in allowed_tags:
|
|
774
770
|
if not is_closing:
|
|
775
771
|
# Opening tag
|
|
776
772
|
stack.append((tag_name, pos_out))
|
|
@@ -809,13 +805,14 @@ class SpanAligner:
|
|
|
809
805
|
"start": adjusted_start,
|
|
810
806
|
"end": adjusted_end,
|
|
811
807
|
"text": span_text,
|
|
812
|
-
"labels": [annotation_map.get(tag_name, tag_name)]
|
|
808
|
+
"labels": [annotation_map.get(tag_name, tag_name) if annotation_map else tag_name]
|
|
813
809
|
}
|
|
814
810
|
|
|
815
811
|
if ner_map and tag_name in ner_map:
|
|
816
812
|
entities.append(annotation_entry)
|
|
817
813
|
else:
|
|
818
814
|
spans.append(annotation_entry)
|
|
815
|
+
|
|
819
816
|
found_open = True
|
|
820
817
|
break
|
|
821
818
|
# If no matching opening tag found, ignore gracefully
|
|
@@ -914,8 +911,8 @@ class SpanAligner:
|
|
|
914
911
|
@staticmethod
|
|
915
912
|
def rebuild_tagged_text(
|
|
916
913
|
original_text: str,
|
|
917
|
-
spans: List[Dict[str, Any]],
|
|
918
|
-
entities: List[Dict[str, Any]],
|
|
914
|
+
spans: List[Dict[str, Any]] = None,
|
|
915
|
+
entities: List[Dict[str, Any]] = None,
|
|
919
916
|
label_to_tag: Optional[Dict[str, str]] = None
|
|
920
917
|
) -> Tuple[str, Dict[str, int]]:
|
|
921
918
|
"""
|
|
@@ -983,8 +980,10 @@ class SpanAligner:
|
|
|
983
980
|
"length": e - s,
|
|
984
981
|
})
|
|
985
982
|
|
|
986
|
-
|
|
987
|
-
|
|
983
|
+
if spans and len(spans)>0:
|
|
984
|
+
add_items(spans)
|
|
985
|
+
if entities and len(entities)>0:
|
|
986
|
+
add_items(entities)
|
|
988
987
|
|
|
989
988
|
# Sort: by start asc, longer first (end desc) to open outers before inners
|
|
990
989
|
annotations.sort(key=lambda a: (a["start"], -a["length"]))
|
|
@@ -1079,7 +1078,9 @@ class SpanAligner:
|
|
|
1079
1078
|
tagged_text: str,
|
|
1080
1079
|
min_ratio: float = 0.8,
|
|
1081
1080
|
max_dist: int = 20,
|
|
1081
|
+
enable_fuzzy: bool = False,
|
|
1082
1082
|
logging: bool = False
|
|
1083
|
+
|
|
1083
1084
|
) -> str:
|
|
1084
1085
|
"""
|
|
1085
1086
|
Map spans from tagged text back to their positions in the original text.
|
|
@@ -1115,12 +1116,15 @@ class SpanAligner:
|
|
|
1115
1116
|
}
|
|
1116
1117
|
|
|
1117
1118
|
# Now map spans/entities back to original_text
|
|
1118
|
-
|
|
1119
|
+
mapped, _ = SpanAligner.map_spans_to_original(
|
|
1119
1120
|
original_text,
|
|
1120
1121
|
result_obj,
|
|
1121
1122
|
min_ratio=min_ratio,
|
|
1122
1123
|
max_dist=max_dist,
|
|
1123
|
-
|
|
1124
|
+
enable_fuzzy = enable_fuzzy,
|
|
1125
|
+
logging=logging,
|
|
1126
|
+
|
|
1124
1127
|
)
|
|
1125
1128
|
|
|
1126
|
-
|
|
1129
|
+
original_text_tagged, _ = SpanAligner.rebuild_tagged_text(original_text, spans = mapped.get("spans", []))
|
|
1130
|
+
return original_text_tagged
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: span-aligner
|
|
3
|
+
Version: 0.1.2
|
|
4
|
+
Summary: A utility for aligning and mapping text spans between different text representations.
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.8
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Dist: rapidfuzz>=3.0.0
|
|
10
|
+
Provides-Extra: dev
|
|
11
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
12
|
+
Dynamic: license-file
|
|
13
|
+
|
|
14
|
+
# Span Aligner
|
|
15
|
+
|
|
16
|
+
A utility for aligning and mapping text spans between different text representations, particularly useful for Label Studio annotation compatibility.
|
|
17
|
+
|
|
18
|
+
## Features
|
|
19
|
+
|
|
20
|
+
- Sanitize span boundaries to avoid special characters.
|
|
21
|
+
- Find exact and fuzzy matches of text segments in original documents.
|
|
22
|
+
- Map spans from one text representation to another.
|
|
23
|
+
- Rebuild tagged text with nested annotations.
|
|
24
|
+
- Merge result objects containing span annotations.
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
Install from PyPI:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install span-aligner
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
## Usage
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
### Get Annotations from Tagged Text
|
|
39
|
+
|
|
40
|
+
Extract structured spans and entities from a string with inline tags.
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
tagged_input = "<motivation><administrative_body>Environmental Committee</administrative_body> discussed the <impact_location>central park</impact_location> renovation on <publication_date>2025-12-15</publication_date>.</motivation>"
|
|
44
|
+
|
|
45
|
+
ner_map = {
|
|
46
|
+
"administrative_body": "ADMINISTRATIVE BODY",
|
|
47
|
+
"publication_date": "PUBLICATION DATE",
|
|
48
|
+
"impact_location": "PRIMARY LOCATION"
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
span_map ={
|
|
52
|
+
"motivation" : "MOTIVATION"
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
annotations = SpanAligner.get_annotations_from_tagged_text(
|
|
56
|
+
tagged_input,
|
|
57
|
+
ner_map=ner_map,
|
|
58
|
+
span_map=span_map
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
print(annotations["entities"])
|
|
62
|
+
# Output:
|
|
63
|
+
#[
|
|
64
|
+
# {'start': 0, 'end': 23, 'text': 'Environmental Committee', 'labels': ['ADMINISTRATIVE BODY']},
|
|
65
|
+
# {'start': 38, 'end': 50, 'text': 'central park', 'labels': ['PRIMARY LOCATION']},
|
|
66
|
+
# {'start': 65, 'end': 75, 'text': '2025-12-15', 'labels': ['PUBLICATION DATE']}
|
|
67
|
+
#]
|
|
68
|
+
|
|
69
|
+
print(annotations["spans"])
|
|
70
|
+
# Output:
|
|
71
|
+
#[
|
|
72
|
+
# {'start': 0, 'end': 76, 'text': 'Environmental Committee discussed the central park renovation on 2025-12-15.', 'labels': ['MOTIVATION']}
|
|
73
|
+
#]
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
print(annotations["plain_text"])
|
|
77
|
+
# Output: "Environmental Committee discussed the central park renovation on 2025-12-15."
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### Rebuild Tagged Text
|
|
81
|
+
|
|
82
|
+
Reconstruct a string with XML-like tags from raw text and span/entity lists.
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
text = "On 2026-01-12, the Budget Committee finalized the annual report."
|
|
86
|
+
# Spans corresponding to 'MOTIVATION' label, mapped to 'motivation' tag
|
|
87
|
+
spans = [{"start": 0, "end": 64, "labels": ["motivation"]}]
|
|
88
|
+
# Entities corresponding to 'ADMINISTRATIVE BODY' label, mapped to 'administrative_body' tag
|
|
89
|
+
entities = [{"start": 19, "end": 35, "labels": ["administrative_body"]}]
|
|
90
|
+
|
|
91
|
+
tagged, stats = SpanAligner.rebuild_tagged_text(text, spans, entities)
|
|
92
|
+
print(tagged)
|
|
93
|
+
# Output: <motivation>On 2026-01-12, the <administrative_body>Budget Committee</administrative_body> finalized the annual report.</motivation>
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Rebuild Tagged Text from Task
|
|
97
|
+
|
|
98
|
+
Generate tagged text directly from a Label Studio task object.
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
# Assuming 'task' is a Label Studio task object (or similar structure)
|
|
102
|
+
# with .data['text'] and .annotations attributes
|
|
103
|
+
mapping = {
|
|
104
|
+
"DECISION": "decision",
|
|
105
|
+
"LEGAL FRAMEWORK": "legal_framework",
|
|
106
|
+
"EXPIRATION DATE": "expiry_date"
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
tagged_output = SpanAligner.rebuild_tagged_text_from_task(task, mapping)
|
|
110
|
+
print(tagged_output)
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### Map Tags to Original
|
|
114
|
+
|
|
115
|
+
Align annotated spans from a tagged string back to their positions in the original text, keeping the mistakes and text as written in the original.
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
original_text = "Budget Budget Committee met on 2026-01-12 to view\n\n the central park prject."
|
|
119
|
+
# Imagine the text was slightly modified or translated, but tags are present
|
|
120
|
+
tagged_text = "<administrative_body>Budget Committee</administrative_body> met on <publication_date>2026-01-12</publication_date> to review the <impact_location>central park</impact_location> project."
|
|
121
|
+
|
|
122
|
+
mapped_tagged_text = SpanAligner.map_tags_to_original(
|
|
123
|
+
original_text=original_text,
|
|
124
|
+
tagged_text=tagged_text,
|
|
125
|
+
min_ratio=0.7
|
|
126
|
+
)
|
|
127
|
+
print(mapped_tagged_text)
|
|
128
|
+
# Output might look like: "Budget <administrative_body>Budget Committee</administrative_body> met on <publication_date>2026-01-12</publication_date> to view\n\n the <impact_location>central park</impact_location> prject."
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
### Map Tags to Original and Get Positions
|
|
134
|
+
|
|
135
|
+
Map the tags onto the original text, then extract the entities with the correct labels and positions.
|
|
136
|
+
|
|
137
|
+
```python
|
|
138
|
+
original_text = "Legal basis: Art. 5. The Env. Committee met on 2026-01-12."
|
|
139
|
+
tagged_text = "Legal basis: <article>Art. 5</article>. The <administrative_body>Environmental Committee</administrative_body> met on <session_date>2026-01-12</session_date>."
|
|
140
|
+
|
|
141
|
+
# 1. Map tags to the noisy original text
|
|
142
|
+
mapped_tagged_text = SpanAligner.map_tags_to_original(
|
|
143
|
+
original_text=original_text,
|
|
144
|
+
tagged_text=tagged_text,
|
|
145
|
+
min_ratio=0.7
|
|
146
|
+
)
|
|
147
|
+
|
|
148
|
+
# 2. Extract annotations using the mapping
|
|
149
|
+
ner_label_mapping = {
|
|
150
|
+
"administrative_body": "ADMINISTRATIVE BODY",
|
|
151
|
+
"session_date": "SESSION DATE",
|
|
152
|
+
"article": "ARTICLE"
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
annotations = SpanAligner.get_annotations_from_tagged_text(
|
|
156
|
+
mapped_tagged_text,
|
|
157
|
+
ner_map=ner_label_mapping
|
|
158
|
+
)
|
|
159
|
+
|
|
160
|
+
print(annotations["entities"])
|
|
161
|
+
# Output:
|
|
162
|
+
# [
|
|
163
|
+
# {'start': 13, 'end': 19, 'text': 'Art. 5', 'labels': ['ARTICLE']},
|
|
164
|
+
# {'start': 47, 'end': 57, 'text': '2026-01-12', 'labels': ['SESSION DATE']}
|
|
165
|
+
# ]
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
|
span_aligner-0.1.0/PKG-INFO
DELETED
|
@@ -1,122 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: span-aligner
|
|
3
|
-
Version: 0.1.0
|
|
4
|
-
Summary: A utility for aligning and mapping text spans between different text representations.
|
|
5
|
-
License: MIT
|
|
6
|
-
Requires-Python: >=3.8
|
|
7
|
-
Description-Content-Type: text/markdown
|
|
8
|
-
License-File: LICENSE
|
|
9
|
-
Requires-Dist: rapidfuzz>=3.0.0
|
|
10
|
-
Provides-Extra: dev
|
|
11
|
-
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
12
|
-
Dynamic: license-file
|
|
13
|
-
|
|
14
|
-
# Span Aligner
|
|
15
|
-
|
|
16
|
-
A utility for aligning and mapping text spans between different text representations, particularly useful for Label Studio annotation compatibility.
|
|
17
|
-
|
|
18
|
-
## Features
|
|
19
|
-
|
|
20
|
-
- Sanitize span boundaries to avoid special characters.
|
|
21
|
-
- Find exact and fuzzy matches of text segments in original documents.
|
|
22
|
-
- Map spans from one text representation to another.
|
|
23
|
-
- Rebuild tagged text with nested annotations.
|
|
24
|
-
- Merge result objects containing span annotations.
|
|
25
|
-
|
|
26
|
-
## Installation
|
|
27
|
-
|
|
28
|
-
Install from source:
|
|
29
|
-
|
|
30
|
-
```bash
|
|
31
|
-
pip install .
|
|
32
|
-
```
|
|
33
|
-
|
|
34
|
-
For development:
|
|
35
|
-
|
|
36
|
-
```bash
|
|
37
|
-
pip install -e ".[dev]"
|
|
38
|
-
```
|
|
39
|
-
|
|
40
|
-
## Usage
|
|
41
|
-
|
|
42
|
-
```python
|
|
43
|
-
from span_aligner import SpanAligner
|
|
44
|
-
|
|
45
|
-
original = "Hello, World!"
|
|
46
|
-
result_obj = {
|
|
47
|
-
"spans": [{"start": 0, "end": 5, "text": "Hello", "labels": ["greeting"]}],
|
|
48
|
-
"entities": [],
|
|
49
|
-
"task": {"data": {"text": ""}}
|
|
50
|
-
}
|
|
51
|
-
|
|
52
|
-
success, mapped = SpanAligner.map_spans_to_original(original, result_obj)
|
|
53
|
-
print(mapped)
|
|
54
|
-
```
|
|
55
|
-
|
|
56
|
-
### Map Tags to Original
|
|
57
|
-
|
|
58
|
-
Align annotated spans from a tagged string back to their positions in the original text, keeping the mistakes and original text as written in the original.
|
|
59
|
-
|
|
60
|
-
```python
|
|
61
|
-
original_text = "The quick brown fox jumps\n\n over the dog."
|
|
62
|
-
# Imagine the text was slightly modified or translated, but tags are present
|
|
63
|
-
tagged_text = "The <adj>quick</adj> brown fox jumps over the <animal>dog</animal>."
|
|
64
|
-
|
|
65
|
-
mapped_tagged_text = SpanAligner.map_tags_to_original(
|
|
66
|
-
original_text=original_text,
|
|
67
|
-
tagged_text=tagged_text,
|
|
68
|
-
min_ratio=0.8
|
|
69
|
-
)
|
|
70
|
-
print(mapped_tagged_text)
|
|
71
|
-
# Output might look like: "The <adj>quick</adj> brown fox jumps\n\n over the <animal>dog</animal>."
|
|
72
|
-
# (If original text differed slightly, tags would be placed on best matching spans)
|
|
73
|
-
```
|
|
74
|
-
|
|
75
|
-
### Rebuild Tagged Text
|
|
76
|
-
|
|
77
|
-
Reconstruct a string with XML-like tags from raw text and span/entity lists.
|
|
78
|
-
|
|
79
|
-
```python
|
|
80
|
-
text = "Hello World"
|
|
81
|
-
spans = [{"start": 0, "end": 11, "labels": ["sentence"]}]
|
|
82
|
-
entities = [{"start": 6, "end": 11, "labels": ["location"]}]
|
|
83
|
-
|
|
84
|
-
tagged, stats = SpanAligner.rebuild_tagged_text(text, spans, entities)
|
|
85
|
-
print(tagged)
|
|
86
|
-
# Output: <sentence>Hello <location>World</location></sentence>
|
|
87
|
-
```
|
|
88
|
-
|
|
89
|
-
### Rebuild Tagged Text from Task
|
|
90
|
-
|
|
91
|
-
Generate tagged text directly from a Label Studio task object.
|
|
92
|
-
|
|
93
|
-
```python
|
|
94
|
-
# Assuming 'task' is a Label Studio task object (or similar structure)
|
|
95
|
-
# with .data['text'] and .annotations attributes
|
|
96
|
-
mapping = {"Location": "loc", "Person": "per"}
|
|
97
|
-
|
|
98
|
-
tagged_output = SpanAligner.rebuild_tagged_text_from_task(task, mapping)
|
|
99
|
-
print(tagged_output)
|
|
100
|
-
```
|
|
101
|
-
|
|
102
|
-
### Get Annotations from Tagged Text
|
|
103
|
-
|
|
104
|
-
Extract structured spans and entities from a string with inline tags.
|
|
105
|
-
|
|
106
|
-
```python
|
|
107
|
-
tagged_input = "Visit <loc>Paris</loc> and see the <landmark>Eiffel Tower</landmark>."
|
|
108
|
-
|
|
109
|
-
annotations = SpanAligner.get_annotations_from_tagged_text(
|
|
110
|
-
tagged_input,
|
|
111
|
-
ner_map={"loc": "Location", "landmark": "Location"}
|
|
112
|
-
)
|
|
113
|
-
|
|
114
|
-
print(annotations["entities"])
|
|
115
|
-
# Output:
|
|
116
|
-
# [
|
|
117
|
-
# {"start": 6, "end": 11, "text": "Paris", "labels": ["Location"]},
|
|
118
|
-
# {"start": 24, "end": 36, "text": "Eiffel Tower", "labels": ["Location"]}
|
|
119
|
-
# ]
|
|
120
|
-
print(annotations["plain_text"])
|
|
121
|
-
# Output: "Visit Paris and see the Eiffel Tower."
|
|
122
|
-
```
|
span_aligner-0.1.0/README.md
DELETED
|
@@ -1,109 +0,0 @@
|
|
|
1
|
-
# Span Aligner
|
|
2
|
-
|
|
3
|
-
A utility for aligning and mapping text spans between different text representations, particularly useful for Label Studio annotation compatibility.
|
|
4
|
-
|
|
5
|
-
## Features
|
|
6
|
-
|
|
7
|
-
- Sanitize span boundaries to avoid special characters.
|
|
8
|
-
- Find exact and fuzzy matches of text segments in original documents.
|
|
9
|
-
- Map spans from one text representation to another.
|
|
10
|
-
- Rebuild tagged text with nested annotations.
|
|
11
|
-
- Merge result objects containing span annotations.
|
|
12
|
-
|
|
13
|
-
## Installation
|
|
14
|
-
|
|
15
|
-
Install from source:
|
|
16
|
-
|
|
17
|
-
```bash
|
|
18
|
-
pip install .
|
|
19
|
-
```
|
|
20
|
-
|
|
21
|
-
For development:
|
|
22
|
-
|
|
23
|
-
```bash
|
|
24
|
-
pip install -e ".[dev]"
|
|
25
|
-
```
|
|
26
|
-
|
|
27
|
-
## Usage
|
|
28
|
-
|
|
29
|
-
```python
|
|
30
|
-
from span_aligner import SpanAligner
|
|
31
|
-
|
|
32
|
-
original = "Hello, World!"
|
|
33
|
-
result_obj = {
|
|
34
|
-
"spans": [{"start": 0, "end": 5, "text": "Hello", "labels": ["greeting"]}],
|
|
35
|
-
"entities": [],
|
|
36
|
-
"task": {"data": {"text": ""}}
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
success, mapped = SpanAligner.map_spans_to_original(original, result_obj)
|
|
40
|
-
print(mapped)
|
|
41
|
-
```
|
|
42
|
-
|
|
43
|
-
### Map Tags to Original
|
|
44
|
-
|
|
45
|
-
Align annotated spans from a tagged string back to their positions in the original text, preserving the original text exactly as written, including any mistakes it contains.
|
|
46
|
-
|
|
47
|
-
```python
|
|
48
|
-
original_text = "The quick brown fox jumps\n\n over the dog."
|
|
49
|
-
# Imagine the text was slightly modified or translated, but tags are present
|
|
50
|
-
tagged_text = "The <adj>quick</adj> brown fox jumps over the <animal>dog</animal>."
|
|
51
|
-
|
|
52
|
-
mapped_tagged_text = SpanAligner.map_tags_to_original(
|
|
53
|
-
original_text=original_text,
|
|
54
|
-
tagged_text=tagged_text,
|
|
55
|
-
min_ratio=0.8
|
|
56
|
-
)
|
|
57
|
-
print(mapped_tagged_text)
|
|
58
|
-
# Output might look like: "The <adj>quick</adj> brown fox jumps\n\n over the <animal>dog</animal>."
|
|
59
|
-
# (If original text differed slightly, tags would be placed on best matching spans)
|
|
60
|
-
```
|
|
61
|
-
|
|
62
|
-
### Rebuild Tagged Text
|
|
63
|
-
|
|
64
|
-
Reconstruct a string with XML-like tags from raw text and span/entity lists.
|
|
65
|
-
|
|
66
|
-
```python
|
|
67
|
-
text = "Hello World"
|
|
68
|
-
spans = [{"start": 0, "end": 11, "labels": ["sentence"]}]
|
|
69
|
-
entities = [{"start": 6, "end": 11, "labels": ["location"]}]
|
|
70
|
-
|
|
71
|
-
tagged, stats = SpanAligner.rebuild_tagged_text(text, spans, entities)
|
|
72
|
-
print(tagged)
|
|
73
|
-
# Output: <sentence>Hello <location>World</location></sentence>
|
|
74
|
-
```
|
|
75
|
-
|
|
76
|
-
### Rebuild Tagged Text from Task
|
|
77
|
-
|
|
78
|
-
Generate tagged text directly from a Label Studio task object.
|
|
79
|
-
|
|
80
|
-
```python
|
|
81
|
-
# Assuming 'task' is a Label Studio task object (or similar structure)
|
|
82
|
-
# with .data['text'] and .annotations attributes
|
|
83
|
-
mapping = {"Location": "loc", "Person": "per"}
|
|
84
|
-
|
|
85
|
-
tagged_output = SpanAligner.rebuild_tagged_text_from_task(task, mapping)
|
|
86
|
-
print(tagged_output)
|
|
87
|
-
```
|
|
88
|
-
|
|
89
|
-
### Get Annotations from Tagged Text
|
|
90
|
-
|
|
91
|
-
Extract structured spans and entities from a string with inline tags.
|
|
92
|
-
|
|
93
|
-
```python
|
|
94
|
-
tagged_input = "Visit <loc>Paris</loc> and see the <landmark>Eiffel Tower</landmark>."
|
|
95
|
-
|
|
96
|
-
annotations = SpanAligner.get_annotations_from_tagged_text(
|
|
97
|
-
tagged_input,
|
|
98
|
-
ner_map={"loc": "Location", "landmark": "Location"}
|
|
99
|
-
)
|
|
100
|
-
|
|
101
|
-
print(annotations["entities"])
|
|
102
|
-
# Output:
|
|
103
|
-
# [
|
|
104
|
-
# {"start": 6, "end": 11, "text": "Paris", "labels": ["Location"]},
|
|
105
|
-
# {"start": 24, "end": 36, "text": "Eiffel Tower", "labels": ["Location"]}
|
|
106
|
-
# ]
|
|
107
|
-
print(annotations["plain_text"])
|
|
108
|
-
# Output: "Visit Paris and see the Eiffel Tower."
|
|
109
|
-
```
|
|
@@ -1,122 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: span-aligner
|
|
3
|
-
Version: 0.1.0
|
|
4
|
-
Summary: A utility for aligning and mapping text spans between different text representations.
|
|
5
|
-
License: MIT
|
|
6
|
-
Requires-Python: >=3.8
|
|
7
|
-
Description-Content-Type: text/markdown
|
|
8
|
-
License-File: LICENSE
|
|
9
|
-
Requires-Dist: rapidfuzz>=3.0.0
|
|
10
|
-
Provides-Extra: dev
|
|
11
|
-
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
12
|
-
Dynamic: license-file
|
|
13
|
-
|
|
14
|
-
# Span Aligner
|
|
15
|
-
|
|
16
|
-
A utility for aligning and mapping text spans between different text representations, particularly useful for Label Studio annotation compatibility.
|
|
17
|
-
|
|
18
|
-
## Features
|
|
19
|
-
|
|
20
|
-
- Sanitize span boundaries to avoid special characters.
|
|
21
|
-
- Find exact and fuzzy matches of text segments in original documents.
|
|
22
|
-
- Map spans from one text representation to another.
|
|
23
|
-
- Rebuild tagged text with nested annotations.
|
|
24
|
-
- Merge result objects containing span annotations.
|
|
25
|
-
|
|
26
|
-
## Installation
|
|
27
|
-
|
|
28
|
-
Install from source:
|
|
29
|
-
|
|
30
|
-
```bash
|
|
31
|
-
pip install .
|
|
32
|
-
```
|
|
33
|
-
|
|
34
|
-
For development:
|
|
35
|
-
|
|
36
|
-
```bash
|
|
37
|
-
pip install -e ".[dev]"
|
|
38
|
-
```
|
|
39
|
-
|
|
40
|
-
## Usage
|
|
41
|
-
|
|
42
|
-
```python
|
|
43
|
-
from span_aligner import SpanAligner
|
|
44
|
-
|
|
45
|
-
original = "Hello, World!"
|
|
46
|
-
result_obj = {
|
|
47
|
-
"spans": [{"start": 0, "end": 5, "text": "Hello", "labels": ["greeting"]}],
|
|
48
|
-
"entities": [],
|
|
49
|
-
"task": {"data": {"text": ""}}
|
|
50
|
-
}
|
|
51
|
-
|
|
52
|
-
success, mapped = SpanAligner.map_spans_to_original(original, result_obj)
|
|
53
|
-
print(mapped)
|
|
54
|
-
```
|
|
55
|
-
|
|
56
|
-
### Map Tags to Original
|
|
57
|
-
|
|
58
|
-
Align annotated spans from a tagged string back to their positions in the original text, preserving the original text exactly as written, including any mistakes it contains.
|
|
59
|
-
|
|
60
|
-
```python
|
|
61
|
-
original_text = "The quick brown fox jumps\n\n over the dog."
|
|
62
|
-
# Imagine the text was slightly modified or translated, but tags are present
|
|
63
|
-
tagged_text = "The <adj>quick</adj> brown fox jumps over the <animal>dog</animal>."
|
|
64
|
-
|
|
65
|
-
mapped_tagged_text = SpanAligner.map_tags_to_original(
|
|
66
|
-
original_text=original_text,
|
|
67
|
-
tagged_text=tagged_text,
|
|
68
|
-
min_ratio=0.8
|
|
69
|
-
)
|
|
70
|
-
print(mapped_tagged_text)
|
|
71
|
-
# Output might look like: "The <adj>quick</adj> brown fox jumps\n\n over the <animal>dog</animal>."
|
|
72
|
-
# (If original text differed slightly, tags would be placed on best matching spans)
|
|
73
|
-
```
|
|
74
|
-
|
|
75
|
-
### Rebuild Tagged Text
|
|
76
|
-
|
|
77
|
-
Reconstruct a string with XML-like tags from raw text and span/entity lists.
|
|
78
|
-
|
|
79
|
-
```python
|
|
80
|
-
text = "Hello World"
|
|
81
|
-
spans = [{"start": 0, "end": 11, "labels": ["sentence"]}]
|
|
82
|
-
entities = [{"start": 6, "end": 11, "labels": ["location"]}]
|
|
83
|
-
|
|
84
|
-
tagged, stats = SpanAligner.rebuild_tagged_text(text, spans, entities)
|
|
85
|
-
print(tagged)
|
|
86
|
-
# Output: <sentence>Hello <location>World</location></sentence>
|
|
87
|
-
```
|
|
88
|
-
|
|
89
|
-
### Rebuild Tagged Text from Task
|
|
90
|
-
|
|
91
|
-
Generate tagged text directly from a Label Studio task object.
|
|
92
|
-
|
|
93
|
-
```python
|
|
94
|
-
# Assuming 'task' is a Label Studio task object (or similar structure)
|
|
95
|
-
# with .data['text'] and .annotations attributes
|
|
96
|
-
mapping = {"Location": "loc", "Person": "per"}
|
|
97
|
-
|
|
98
|
-
tagged_output = SpanAligner.rebuild_tagged_text_from_task(task, mapping)
|
|
99
|
-
print(tagged_output)
|
|
100
|
-
```
|
|
101
|
-
|
|
102
|
-
### Get Annotations from Tagged Text
|
|
103
|
-
|
|
104
|
-
Extract structured spans and entities from a string with inline tags.
|
|
105
|
-
|
|
106
|
-
```python
|
|
107
|
-
tagged_input = "Visit <loc>Paris</loc> and see the <landmark>Eiffel Tower</landmark>."
|
|
108
|
-
|
|
109
|
-
annotations = SpanAligner.get_annotations_from_tagged_text(
|
|
110
|
-
tagged_input,
|
|
111
|
-
ner_map={"loc": "Location", "landmark": "Location"}
|
|
112
|
-
)
|
|
113
|
-
|
|
114
|
-
print(annotations["entities"])
|
|
115
|
-
# Output:
|
|
116
|
-
# [
|
|
117
|
-
# {"start": 6, "end": 11, "text": "Paris", "labels": ["Location"]},
|
|
118
|
-
# {"start": 24, "end": 36, "text": "Eiffel Tower", "labels": ["Location"]}
|
|
119
|
-
# ]
|
|
120
|
-
print(annotations["plain_text"])
|
|
121
|
-
# Output: "Visit Paris and see the Eiffel Tower."
|
|
122
|
-
```
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|