span-aligner 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Stefaan Vercoutere
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,122 @@
1
+ Metadata-Version: 2.4
2
+ Name: span-aligner
3
+ Version: 0.1.0
4
+ Summary: A utility for aligning and mapping text spans between different text representations.
5
+ License: MIT
6
+ Requires-Python: >=3.8
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: rapidfuzz>=3.0.0
10
+ Provides-Extra: dev
11
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
12
+ Dynamic: license-file
13
+
14
+ # Span Aligner
15
+
16
+ A utility for aligning and mapping text spans between different text representations, particularly useful for Label Studio annotation compatibility.
17
+
18
+ ## Features
19
+
20
+ - Sanitize span boundaries to avoid special characters.
21
+ - Find exact and fuzzy matches of text segments in original documents.
22
+ - Map spans from one text representation to another.
23
+ - Rebuild tagged text with nested annotations.
24
+ - Merge result objects containing span annotations.
25
+
26
+ ## Installation
27
+
28
+ Install from source:
29
+
30
+ ```bash
31
+ pip install .
32
+ ```
33
+
34
+ For development:
35
+
36
+ ```bash
37
+ pip install -e ".[dev]"
38
+ ```
39
+
40
+ ## Usage
41
+
42
+ ```python
43
+ from span_aligner import SpanAligner
44
+
45
+ original = "Hello, World!"
46
+ result_obj = {
47
+ "spans": [{"start": 0, "end": 5, "text": "Hello", "labels": ["greeting"]}],
48
+ "entities": [],
49
+ "task": {"data": {"text": ""}}
50
+ }
51
+
52
+ success, mapped = SpanAligner.map_spans_to_original(original, result_obj)
53
+ print(mapped)
54
+ ```
55
+
56
+ ### Map Tags to Original
57
+
58
+ Align annotated spans from a tagged string back to their positions in the original text, preserving the original text exactly as written (including any mistakes it contains).
59
+
60
+ ```python
61
+ original_text = "The quick brown fox jumps\n\n over the dog."
62
+ # Imagine the text was slightly modified or translated, but tags are present
63
+ tagged_text = "The <adj>quick</adj> brown fox jumps over the <animal>dog</animal>."
64
+
65
+ mapped_tagged_text = SpanAligner.map_tags_to_original(
66
+ original_text=original_text,
67
+ tagged_text=tagged_text,
68
+ min_ratio=0.8
69
+ )
70
+ print(mapped_tagged_text)
71
+ # Output might look like: "The <adj>quick</adj> brown fox jumps\n\n over the <animal>dog</animal>."
72
+ # (If original text differed slightly, tags would be placed on best matching spans)
73
+ ```
74
+
75
+ ### Rebuild Tagged Text
76
+
77
+ Reconstruct a string with XML-like tags from raw text and span/entity lists.
78
+
79
+ ```python
80
+ text = "Hello World"
81
+ spans = [{"start": 0, "end": 11, "labels": ["sentence"]}]
82
+ entities = [{"start": 6, "end": 11, "labels": ["location"]}]
83
+
84
+ tagged, stats = SpanAligner.rebuild_tagged_text(text, spans, entities)
85
+ print(tagged)
86
+ # Output: <sentence>Hello <location>World</location></sentence>
87
+ ```
88
+
89
+ ### Rebuild Tagged Text from Task
90
+
91
+ Generate tagged text directly from a Label Studio task object.
92
+
93
+ ```python
94
+ # Assuming 'task' is a Label Studio task object (or similar structure)
95
+ # with .data['text'] and .annotations attributes
96
+ mapping = {"Location": "loc", "Person": "per"}
97
+
98
+ tagged_output = SpanAligner.rebuild_tagged_text_from_task(task, mapping)
99
+ print(tagged_output)
100
+ ```
101
+
102
+ ### Get Annotations from Tagged Text
103
+
104
+ Extract structured spans and entities from a string with inline tags.
105
+
106
+ ```python
107
+ tagged_input = "Visit <loc>Paris</loc> and see the <landmark>Eiffel Tower</landmark>."
108
+
109
+ annotations = SpanAligner.get_annotations_from_tagged_text(
110
+ tagged_input,
111
+ ner_map={"loc": "Location", "landmark": "Location"}
112
+ )
113
+
114
+ print(annotations["entities"])
115
+ # Output:
116
+ # [
117
+ # {"start": 6, "end": 11, "text": "Paris", "labels": ["Location"]},
118
+ # {"start": 24, "end": 36, "text": "Eiffel Tower", "labels": ["Location"]}
119
+ # ]
120
+ print(annotations["plain_text"])
121
+ # Output: "Visit Paris and see the Eiffel Tower."
122
+ ```
@@ -0,0 +1,109 @@
1
+ # Span Aligner
2
+
3
+ A utility for aligning and mapping text spans between different text representations, particularly useful for Label Studio annotation compatibility.
4
+
5
+ ## Features
6
+
7
+ - Sanitize span boundaries to avoid special characters.
8
+ - Find exact and fuzzy matches of text segments in original documents.
9
+ - Map spans from one text representation to another.
10
+ - Rebuild tagged text with nested annotations.
11
+ - Merge result objects containing span annotations.
12
+
13
+ ## Installation
14
+
15
+ Install from source:
16
+
17
+ ```bash
18
+ pip install .
19
+ ```
20
+
21
+ For development:
22
+
23
+ ```bash
24
+ pip install -e ".[dev]"
25
+ ```
26
+
27
+ ## Usage
28
+
29
+ ```python
30
+ from span_aligner import SpanAligner
31
+
32
+ original = "Hello, World!"
33
+ result_obj = {
34
+ "spans": [{"start": 0, "end": 5, "text": "Hello", "labels": ["greeting"]}],
35
+ "entities": [],
36
+ "task": {"data": {"text": ""}}
37
+ }
38
+
39
+ success, mapped = SpanAligner.map_spans_to_original(original, result_obj)
40
+ print(mapped)
41
+ ```
42
+
43
+ ### Map Tags to Original
44
+
45
+ Align annotated spans from a tagged string back to their positions in the original text, preserving the original text exactly as written (including any mistakes it contains).
46
+
47
+ ```python
48
+ original_text = "The quick brown fox jumps\n\n over the dog."
49
+ # Imagine the text was slightly modified or translated, but tags are present
50
+ tagged_text = "The <adj>quick</adj> brown fox jumps over the <animal>dog</animal>."
51
+
52
+ mapped_tagged_text = SpanAligner.map_tags_to_original(
53
+ original_text=original_text,
54
+ tagged_text=tagged_text,
55
+ min_ratio=0.8
56
+ )
57
+ print(mapped_tagged_text)
58
+ # Output might look like: "The <adj>quick</adj> brown fox jumps\n\n over the <animal>dog</animal>."
59
+ # (If original text differed slightly, tags would be placed on best matching spans)
60
+ ```
61
+
62
+ ### Rebuild Tagged Text
63
+
64
+ Reconstruct a string with XML-like tags from raw text and span/entity lists.
65
+
66
+ ```python
67
+ text = "Hello World"
68
+ spans = [{"start": 0, "end": 11, "labels": ["sentence"]}]
69
+ entities = [{"start": 6, "end": 11, "labels": ["location"]}]
70
+
71
+ tagged, stats = SpanAligner.rebuild_tagged_text(text, spans, entities)
72
+ print(tagged)
73
+ # Output: <sentence>Hello <location>World</location></sentence>
74
+ ```
75
+
76
+ ### Rebuild Tagged Text from Task
77
+
78
+ Generate tagged text directly from a Label Studio task object.
79
+
80
+ ```python
81
+ # Assuming 'task' is a Label Studio task object (or similar structure)
82
+ # with .data['text'] and .annotations attributes
83
+ mapping = {"Location": "loc", "Person": "per"}
84
+
85
+ tagged_output = SpanAligner.rebuild_tagged_text_from_task(task, mapping)
86
+ print(tagged_output)
87
+ ```
88
+
89
+ ### Get Annotations from Tagged Text
90
+
91
+ Extract structured spans and entities from a string with inline tags.
92
+
93
+ ```python
94
+ tagged_input = "Visit <loc>Paris</loc> and see the <landmark>Eiffel Tower</landmark>."
95
+
96
+ annotations = SpanAligner.get_annotations_from_tagged_text(
97
+ tagged_input,
98
+ ner_map={"loc": "Location", "landmark": "Location"}
99
+ )
100
+
101
+ print(annotations["entities"])
102
+ # Output:
103
+ # [
104
+ # {"start": 6, "end": 11, "text": "Paris", "labels": ["Location"]},
105
+ # {"start": 24, "end": 36, "text": "Eiffel Tower", "labels": ["Location"]}
106
+ # ]
107
+ print(annotations["plain_text"])
108
+ # Output: "Visit Paris and see the Eiffel Tower."
109
+ ```
@@ -0,0 +1,23 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "span-aligner"
7
+ version = "0.1.0"
8
+ description = "A utility for aligning and mapping text spans between different text representations."
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+ license = {text = "MIT"}
12
+ dependencies = [
13
+ "rapidfuzz>=3.0.0",
14
+ ]
15
+
16
+ [project.optional-dependencies]
17
+ dev = [
18
+ "pytest>=7.0.0",
19
+ ]
20
+
21
+ [tool.setuptools.packages.find]
22
+ where = ["."]
23
+ include = ["span_aligner*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,3 @@
1
+ from .aligner import SpanAligner
2
+
3
+ __all__ = ["SpanAligner"]