iob2labels 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- iob2labels-0.1.0/.github/workflows/publish.yml +25 -0
- iob2labels-0.1.0/.gitignore +11 -0
- iob2labels-0.1.0/.python-version +1 -0
- iob2labels-0.1.0/LICENSE +21 -0
- iob2labels-0.1.0/PKG-INFO +163 -0
- iob2labels-0.1.0/README.md +139 -0
- iob2labels-0.1.0/pyproject.toml +40 -0
- iob2labels-0.1.0/src/iob2labels/__init__.py +15 -0
- iob2labels-0.1.0/src/iob2labels/annotations.py +87 -0
- iob2labels-0.1.0/src/iob2labels/checker.py +71 -0
- iob2labels-0.1.0/src/iob2labels/encoder.py +212 -0
- iob2labels-0.1.0/src/iob2labels/labels.py +61 -0
- iob2labels-0.1.0/src/iob2labels/py.typed +0 -0
- iob2labels-0.1.0/src/iob2labels/tokenizers.py +62 -0
- iob2labels-0.1.0/src/iob2tensor/__init__.py +5 -0
- iob2labels-0.1.0/src/iob2tensor/annotations.py +87 -0
- iob2labels-0.1.0/src/iob2tensor/checker.py +69 -0
- iob2labels-0.1.0/src/iob2tensor/convert.py +70 -0
- iob2labels-0.1.0/src/iob2tensor/labels.py +61 -0
- iob2labels-0.1.0/src/iob2tensor/tokenizers.py +14 -0
- iob2labels-0.1.0/tests/conftest.py +73 -0
- iob2labels-0.1.0/tests/test_checker.py +68 -0
- iob2labels-0.1.0/tests/test_create_map.py +28 -0
- iob2labels-0.1.0/tests/test_encoder.py +151 -0
- iob2labels-0.1.0/tests/test_get_entity_index_ranges.py +35 -0
- iob2labels-0.1.0/tests/test_tokenizer_matrix.py +66 -0
- iob2labels-0.1.0/uv.lock +1165 -0
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# .github/workflows/publish.yml
|
|
2
|
+
name: Publish to PyPI
|
|
3
|
+
|
|
4
|
+
on:
|
|
5
|
+
release:
|
|
6
|
+
types: [published]
|
|
7
|
+
|
|
8
|
+
permissions:
|
|
9
|
+
id-token: write
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
publish:
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
environment: pypi
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
|
|
18
|
+
- name: Install uv
|
|
19
|
+
uses: astral-sh/setup-uv@v4
|
|
20
|
+
|
|
21
|
+
- name: Build package
|
|
22
|
+
run: uv build
|
|
23
|
+
|
|
24
|
+
- name: Publish to PyPI
|
|
25
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.10
|
iob2labels-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: iob2labels
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Convert IOB2 NER span annotations into integer label sequences aligned to any HuggingFace-compatible tokenizer.
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Keywords: iob2,named-entity-recognition,ner,nlp,token-classification,tokenizers
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
19
|
+
Classifier: Typing :: Typed
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Requires-Dist: pydantic>=2.0
|
|
22
|
+
Requires-Dist: tokenizers>=0.15
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
|
|
25
|
+
# iob2labels
|
|
26
|
+
|
|
27
|
+
Convert [IOB2-format](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)) NER span annotations into integer label sequences for Transformer-based token classification tasks.
|
|
28
|
+
|
|
29
|
+
If you use annotation tools like [Prodigy](https://prodi.gy/docs), [Label Studio](https://labelstud.io/), or [Doccano](https://github.com/doccano/doccano) to annotate NER data, this library converts those character-offset span annotations into the label arrays you need for training.
|
|
30
|
+
|
|
31
|
+
## Installation
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
uv add iob2labels
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Dependencies: `tokenizers` (HuggingFace Rust-backed tokenizer) and `pydantic`. No `torch` or `transformers` required.
|
|
38
|
+
|
|
39
|
+
## Quick Start
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from iob2labels import IOB2Encoder
|
|
43
|
+
|
|
44
|
+
encoder = IOB2Encoder(
|
|
45
|
+
labels=["actor", "character", "plot"],
|
|
46
|
+
tokenizer="bert-base-uncased",
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
labels = encoder(
|
|
50
|
+
text="Did Dame Judy Dench star in a British film about Queen Elizabeth?",
|
|
51
|
+
spans=[
|
|
52
|
+
{"label": "actor", "start": 4, "end": 19},
|
|
53
|
+
{"label": "plot", "start": 30, "end": 37},
|
|
54
|
+
{"label": "character", "start": 49, "end": 64},
|
|
55
|
+
]
|
|
56
|
+
)
|
|
57
|
+
# >>> [-100, 0, 1, 2, 2, 2, 0, 0, 0, 5, 0, 0, 3, 4, 0, -100]
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
> Example pulled from the [MITMovie](https://groups.csail.mit.edu/sls/downloads/movie/) dataset.
|
|
61
|
+
|
|
62
|
+
The output is a `list[int]` aligned to the tokenizer's output. Convert to a tensor or array as needed:
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
import torch
|
|
66
|
+
x = torch.tensor(labels)
|
|
67
|
+
|
|
68
|
+
# or with numpy
|
|
69
|
+
import numpy as np
|
|
70
|
+
x = np.array(labels)
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## How It Works
|
|
74
|
+
|
|
75
|
+
The IOB2 format assigns each token one of three tag types:
|
|
76
|
+
|
|
77
|
+
- **O** (Outside) - not part of any entity
|
|
78
|
+
- **B-LABEL** (Beginning) - first token of an entity
|
|
79
|
+
- **I-LABEL** (Inside) - continuation of an entity
|
|
80
|
+
|
|
81
|
+
Each entity class generates 2 labels (B + I), plus the O class, so the total label count is always `(n * 2) + 1`:
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
encoder.label_map
|
|
85
|
+
# >>> {'O': 0, 'B-ACTOR': 1, 'I-ACTOR': 2, 'B-CHARACTER': 3, 'I-CHARACTER': 4, 'B-PLOT': 5, 'I-PLOT': 6}
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Special tokens (e.g., `[CLS]`, `[SEP]`) receive the ignore value `-100`, which PyTorch's `CrossEntropyLoss` skips by default.
|
|
89
|
+
|
|
90
|
+
## Tokenizer Input
|
|
91
|
+
|
|
92
|
+
The `tokenizer` argument accepts three forms:
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
# 1. checkpoint name (downloads from HuggingFace Hub)
|
|
96
|
+
encoder = IOB2Encoder(labels=labels, tokenizer="bert-base-uncased")
|
|
97
|
+
|
|
98
|
+
# 2. standalone tokenizers.Tokenizer instance
|
|
99
|
+
from tokenizers import Tokenizer
|
|
100
|
+
tok = Tokenizer.from_pretrained("bert-base-uncased")
|
|
101
|
+
encoder = IOB2Encoder(labels=labels, tokenizer=tok)
|
|
102
|
+
|
|
103
|
+
# 3. transformers PreTrainedTokenizerFast (unwrapped automatically)
|
|
104
|
+
from transformers import AutoTokenizer
|
|
105
|
+
tok = AutoTokenizer.from_pretrained("bert-base-uncased")
|
|
106
|
+
encoder = IOB2Encoder(labels=labels, tokenizer=tok)
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Batch Encoding
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
annotations = [
|
|
113
|
+
{"text": "Did Dame Judy Dench star?", "spans": [{"label": "actor", "start": 4, "end": 19}]},
|
|
114
|
+
{"text": "Matt Damon was Jason Bourne.", "spans": [{"label": "actor", "start": 0, "end": 10}]},
|
|
115
|
+
]
|
|
116
|
+
|
|
117
|
+
results = encoder.batch(annotations)
|
|
118
|
+
# >>> [[-100, 0, 1, 2, 2, 2, 0, 0, -100], [-100, 1, 2, 0, 0, 0, 0, -100]]
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
The batch path uses the Rust-backed `encode_batch()` for parallelized tokenization. It returns a `list[list[int]]` with no padding; use HuggingFace's `DataCollatorForTokenClassification` or your own padding for training.
|
|
122
|
+
|
|
123
|
+
## Custom Field Names
|
|
124
|
+
|
|
125
|
+
If your annotation data uses non-standard field names, configure them at construction:
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
# BioMed-NER dataset uses "entities" and "class" instead of "spans" and "label"
|
|
129
|
+
encoder = IOB2Encoder(
|
|
130
|
+
labels=["organism", "chemicals"],
|
|
131
|
+
tokenizer="bert-base-uncased",
|
|
132
|
+
spans_field="entities",
|
|
133
|
+
label_field="class",
|
|
134
|
+
)
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## Built-in Conversion Check
|
|
138
|
+
|
|
139
|
+
By default, every encoding is verified by recovering the entity text from the produced labels and comparing it to the original annotation. This catches misalignment bugs early. Disable it for performance in production:
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
encoder = IOB2Encoder(labels=labels, tokenizer=tok, conversion_check=False)
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
## Supported Tokenizers
|
|
146
|
+
|
|
147
|
+
Tested across three tokenizer families:
|
|
148
|
+
|
|
149
|
+
| Family | Checkpoints |
|
|
150
|
+
|---|---|
|
|
151
|
+
| WordPiece | `bert-base-cased`, `bert-base-uncased`, `bert-large-cased`, `bert-large-uncased`, `distilbert-base-cased`, `distilbert-base-uncased`, `google/electra-base-discriminator` |
|
|
152
|
+
| BPE | `roberta-base`, `roberta-large` |
|
|
153
|
+
| SentencePiece | `albert-base-v2`, `xlnet-base-cased`, `t5-small` |
|
|
154
|
+
|
|
155
|
+
Other HuggingFace-compatible tokenizers should work as well. The built-in conversion check will flag any issues.
|
|
156
|
+
|
|
157
|
+
## Tests
|
|
158
|
+
|
|
159
|
+
```bash
|
|
160
|
+
uv run pytest tests/ -v
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
The test suite includes unit tests for label map construction, entity range detection, and the conversion checker, plus a parametrized matrix of 12 tokenizer checkpoints across multiple annotation edge cases (entities at text boundaries, adjacent entities, punctuation, etc.).
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
# iob2labels
|
|
2
|
+
|
|
3
|
+
Convert [IOB2-format](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)) NER span annotations into integer label sequences for Transformer-based token classification tasks.
|
|
4
|
+
|
|
5
|
+
If you use annotation tools like [Prodigy](https://prodi.gy/docs), [Label Studio](https://labelstud.io/), or [Doccano](https://github.com/doccano/doccano) to annotate NER data, this library converts those character-offset span annotations into the label arrays you need for training.
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
uv add iob2labels
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
Dependencies: `tokenizers` (HuggingFace Rust-backed tokenizer) and `pydantic`. No `torch` or `transformers` required.
|
|
14
|
+
|
|
15
|
+
## Quick Start
|
|
16
|
+
|
|
17
|
+
```python
|
|
18
|
+
from iob2labels import IOB2Encoder
|
|
19
|
+
|
|
20
|
+
encoder = IOB2Encoder(
|
|
21
|
+
labels=["actor", "character", "plot"],
|
|
22
|
+
tokenizer="bert-base-uncased",
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
labels = encoder(
|
|
26
|
+
text="Did Dame Judy Dench star in a British film about Queen Elizabeth?",
|
|
27
|
+
spans=[
|
|
28
|
+
{"label": "actor", "start": 4, "end": 19},
|
|
29
|
+
{"label": "plot", "start": 30, "end": 37},
|
|
30
|
+
{"label": "character", "start": 49, "end": 64},
|
|
31
|
+
]
|
|
32
|
+
)
|
|
33
|
+
# >>> [-100, 0, 1, 2, 2, 2, 0, 0, 0, 5, 0, 0, 3, 4, 0, -100]
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
> Example pulled from the [MITMovie](https://groups.csail.mit.edu/sls/downloads/movie/) dataset.
|
|
37
|
+
|
|
38
|
+
The output is a `list[int]` aligned to the tokenizer's output. Convert to a tensor or array as needed:
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
import torch
|
|
42
|
+
x = torch.tensor(labels)
|
|
43
|
+
|
|
44
|
+
# or with numpy
|
|
45
|
+
import numpy as np
|
|
46
|
+
x = np.array(labels)
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## How It Works
|
|
50
|
+
|
|
51
|
+
The IOB2 format assigns each token one of three tag types:
|
|
52
|
+
|
|
53
|
+
- **O** (Outside) - not part of any entity
|
|
54
|
+
- **B-LABEL** (Beginning) - first token of an entity
|
|
55
|
+
- **I-LABEL** (Inside) - continuation of an entity
|
|
56
|
+
|
|
57
|
+
Each entity class generates 2 labels (B + I), plus the O class, so the total label count is always `(n * 2) + 1`:
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
encoder.label_map
|
|
61
|
+
# >>> {'O': 0, 'B-ACTOR': 1, 'I-ACTOR': 2, 'B-CHARACTER': 3, 'I-CHARACTER': 4, 'B-PLOT': 5, 'I-PLOT': 6}
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Special tokens (e.g., `[CLS]`, `[SEP]`) receive the ignore value `-100`, which PyTorch's `CrossEntropyLoss` skips by default.
|
|
65
|
+
|
|
66
|
+
## Tokenizer Input
|
|
67
|
+
|
|
68
|
+
The `tokenizer` argument accepts three forms:
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
# 1. checkpoint name (downloads from HuggingFace Hub)
|
|
72
|
+
encoder = IOB2Encoder(labels=labels, tokenizer="bert-base-uncased")
|
|
73
|
+
|
|
74
|
+
# 2. standalone tokenizers.Tokenizer instance
|
|
75
|
+
from tokenizers import Tokenizer
|
|
76
|
+
tok = Tokenizer.from_pretrained("bert-base-uncased")
|
|
77
|
+
encoder = IOB2Encoder(labels=labels, tokenizer=tok)
|
|
78
|
+
|
|
79
|
+
# 3. transformers PreTrainedTokenizerFast (unwrapped automatically)
|
|
80
|
+
from transformers import AutoTokenizer
|
|
81
|
+
tok = AutoTokenizer.from_pretrained("bert-base-uncased")
|
|
82
|
+
encoder = IOB2Encoder(labels=labels, tokenizer=tok)
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Batch Encoding
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
annotations = [
|
|
89
|
+
{"text": "Did Dame Judy Dench star?", "spans": [{"label": "actor", "start": 4, "end": 19}]},
|
|
90
|
+
{"text": "Matt Damon was Jason Bourne.", "spans": [{"label": "actor", "start": 0, "end": 10}]},
|
|
91
|
+
]
|
|
92
|
+
|
|
93
|
+
results = encoder.batch(annotations)
|
|
94
|
+
# >>> [[-100, 0, 1, 2, 2, 2, 0, 0, -100], [-100, 1, 2, 0, 0, 0, 0, -100]]
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
The batch path uses the Rust-backed `encode_batch()` for parallelized tokenization. It returns a `list[list[int]]` with no padding; use HuggingFace's `DataCollatorForTokenClassification` or your own padding for training.
|
|
98
|
+
|
|
99
|
+
## Custom Field Names
|
|
100
|
+
|
|
101
|
+
If your annotation data uses non-standard field names, configure them at construction:
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
# BioMed-NER dataset uses "entities" and "class" instead of "spans" and "label"
|
|
105
|
+
encoder = IOB2Encoder(
|
|
106
|
+
labels=["organism", "chemicals"],
|
|
107
|
+
tokenizer="bert-base-uncased",
|
|
108
|
+
spans_field="entities",
|
|
109
|
+
label_field="class",
|
|
110
|
+
)
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Built-in Conversion Check
|
|
114
|
+
|
|
115
|
+
By default, every encoding is verified by recovering the entity text from the produced labels and comparing it to the original annotation. This catches misalignment bugs early. Disable it for performance in production:
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
encoder = IOB2Encoder(labels=labels, tokenizer=tok, conversion_check=False)
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## Supported Tokenizers
|
|
122
|
+
|
|
123
|
+
Tested across three tokenizer families:
|
|
124
|
+
|
|
125
|
+
| Family | Checkpoints |
|
|
126
|
+
|---|---|
|
|
127
|
+
| WordPiece | `bert-base-cased`, `bert-base-uncased`, `bert-large-cased`, `bert-large-uncased`, `distilbert-base-cased`, `distilbert-base-uncased`, `google/electra-base-discriminator` |
|
|
128
|
+
| BPE | `roberta-base`, `roberta-large` |
|
|
129
|
+
| SentencePiece | `albert-base-v2`, `xlnet-base-cased`, `t5-small` |
|
|
130
|
+
|
|
131
|
+
Other HuggingFace-compatible tokenizers should work as well. The built-in conversion check will flag any issues.
|
|
132
|
+
|
|
133
|
+
## Tests
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
uv run pytest tests/ -v
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
The test suite includes unit tests for label map construction, entity range detection, and the conversion checker, plus a parametrized matrix of 12 tokenizer checkpoints across multiple annotation edge cases (entities at text boundaries, adjacent entities, punctuation, etc.).
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "iob2labels"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Convert IOB2 NER span annotations into integer label sequences aligned to any HuggingFace-compatible tokenizer."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = "MIT"
|
|
7
|
+
requires-python = ">=3.10"
|
|
8
|
+
keywords = ["ner", "named-entity-recognition", "iob2", "token-classification", "nlp", "tokenizers"]
|
|
9
|
+
classifiers = [
|
|
10
|
+
"Development Status :: 3 - Alpha",
|
|
11
|
+
"Intended Audience :: Developers",
|
|
12
|
+
"Intended Audience :: Science/Research",
|
|
13
|
+
"License :: OSI Approved :: MIT License",
|
|
14
|
+
"Programming Language :: Python :: 3",
|
|
15
|
+
"Programming Language :: Python :: 3.10",
|
|
16
|
+
"Programming Language :: Python :: 3.11",
|
|
17
|
+
"Programming Language :: Python :: 3.12",
|
|
18
|
+
"Programming Language :: Python :: 3.13",
|
|
19
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
20
|
+
"Topic :: Text Processing :: Linguistic",
|
|
21
|
+
"Typing :: Typed",
|
|
22
|
+
]
|
|
23
|
+
dependencies = [
|
|
24
|
+
"pydantic>=2.0",
|
|
25
|
+
"tokenizers>=0.15",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
[tool.hatch.build.targets.wheel]
|
|
29
|
+
packages = ["src/iob2labels"]
|
|
30
|
+
|
|
31
|
+
[build-system]
|
|
32
|
+
requires = ["hatchling"]
|
|
33
|
+
build-backend = "hatchling.build"
|
|
34
|
+
|
|
35
|
+
[dependency-groups]
|
|
36
|
+
dev = [
|
|
37
|
+
"ipython>=8.37.0",
|
|
38
|
+
"pytest>=8.4.2",
|
|
39
|
+
"transformers>=4.40",
|
|
40
|
+
]
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
## -- primary interface
|
|
2
|
+
from .encoder import IOB2Encoder as IOB2Encoder
|
|
3
|
+
|
|
4
|
+
## -- types
|
|
5
|
+
from .annotations import Annotation as Annotation
|
|
6
|
+
from .annotations import Span as Span
|
|
7
|
+
|
|
8
|
+
## -- utilities
|
|
9
|
+
from .labels import create_label_map as create_label_map
|
|
10
|
+
from .labels import format_entity_label as format_entity_label
|
|
11
|
+
from .annotations import preprocessing as preprocessing
|
|
12
|
+
|
|
13
|
+
## -- checker
|
|
14
|
+
from .checker import check_iob_conversion as check_iob_conversion
|
|
15
|
+
from .checker import get_entity_index_ranges as get_entity_index_ranges
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
from typing_extensions import TypedDict
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel, Field, ValidationError
|
|
5
|
+
from pydantic import AfterValidator, StrictStr, StrictInt
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class DefaultFields:
    """Default key names used to read annotation dictionaries.

    These constants are the default values of the ``*_field`` parameters of
    ``preprocessing`` and ``validate_batch``, and they are also the keys of
    the validated ``Annotation`` / ``Span`` typed dicts.
    """

    # key of the raw text string in an annotation record
    TEXT = "text"
    # key of the list of entity spans in an annotation record
    SPANS = "spans"
    # key of a span's starting character offset
    START = "start"
    # key of a span's ending character offset
    END = "end"
    # key of a span's entity label name
    LABEL = "label"
|
|
14
|
+
|
|
15
|
+
## -- Pydantic models to perform validation of input annotation data
|
|
16
|
+
## -- and map custom field names onto the standard ones ahead of IOB-label conversion.
|
|
17
|
+
|
|
18
|
+
class _SpanFormat(BaseModel):
    """Pydantic model validating a single annotated entity span.

    All three fields use pydantic's strict types, and all three are required
    (``...`` as the default marks the field as mandatory).
    """

    # character offset where the entity starts in the text
    start: StrictInt = Field(..., description="Index of entity starting character in text string.")
    # character offset where the entity ends; the conversion checker slices
    # the text as ``text[start:end]``, i.e. it treats this bound as exclusive
    end: StrictInt = Field(..., description="Index of entity ending character in text string.")
    # entity class name; the closing parenthesis was missing from this description
    label: StrictStr = Field(..., description="Label name for annotated entity (e.g., PERSON, PRODUCT, LOCATION, etc.).")
|
|
22
|
+
|
|
23
|
+
class _AnnotationFormat(BaseModel):
    """Pydantic model validating one annotation record: a text plus its spans."""

    # raw text the span offsets refer to (strict string)
    text: StrictStr
    # entity spans, each validated as a ``_SpanFormat``
    spans: list[_SpanFormat]
|
|
26
|
+
|
|
27
|
+
def convert_to_validated_format(
    text: str,
    spans: list[dict],
    start_field: str,
    end_field: str,
    label_field: str
) -> _AnnotationFormat:
    """Build validated pydantic models from raw annotation data.

    Each span dict is read through the caller-supplied field names and
    re-keyed onto the standard ``start`` / ``end`` / ``label`` fields.

    Args:
        text: The annotated text.
        spans: Raw span dictionaries.
        start_field: Key holding a span's start offset.
        end_field: Key holding a span's end offset.
        label_field: Key holding a span's label name.

    Returns:
        The validated annotation model.

    Raises:
        KeyError: If a span dict lacks one of the requested keys.
        pydantic.ValidationError: If a value fails model validation.
    """
    validated_spans = []
    for raw_span in spans:
        validated_spans.append(
            _SpanFormat(
                start=raw_span[start_field],
                end=raw_span[end_field],
                label=raw_span[label_field],
            )
        )
    return _AnnotationFormat(text=text, spans=validated_spans)
|
|
45
|
+
|
|
46
|
+
## -- typed-dicts are returned from validation step as they are _lightly_ typed
|
|
47
|
+
## -- but benefit from intermediate pydantic validation and preprocessing.
|
|
48
|
+
class Span(TypedDict):
    """Typed-dict form of one validated entity span."""

    # starting character offset of the entity
    start: int
    # ending character offset of the entity
    end: int
    # entity label name
    label: str
|
|
52
|
+
|
|
53
|
+
class Annotation(TypedDict):
    """Typed-dict form of one validated annotation record."""

    # text the span offsets refer to
    text: str
    # validated entity spans for this text
    spans: list[Span]
|
|
56
|
+
|
|
57
|
+
def preprocessing(
    text: str,
    spans: list[dict],
    start_field: str = DefaultFields.START,
    end_field: str = DefaultFields.END,
    label_field: str = DefaultFields.LABEL
) -> Annotation:
    """Validate one annotation and return it as a plain ``Annotation`` dict.

    The input is first passed through the pydantic models (which validate it
    and re-key custom field names), then dumped back to a dictionary.

    Args:
        text: The annotated text.
        spans: Raw span dictionaries.
        start_field: Key holding a span's start offset.
        end_field: Key holding a span's end offset.
        label_field: Key holding a span's label name.

    Returns:
        The validated annotation with standard ``text`` / ``spans`` keys.
    """
    model = convert_to_validated_format(text, spans, start_field, end_field, label_field)
    dumped = model.model_dump()
    return Annotation(**dumped)
|
|
68
|
+
|
|
69
|
+
def validate_batch(
    annotations: list[dict],
    text_field: str = DefaultFields.TEXT,
    spans_field: str = DefaultFields.SPANS,
    start_field: str = DefaultFields.START,
    end_field: str = DefaultFields.END,
    label_field: str = DefaultFields.LABEL
) -> list[Annotation]:
    """Validate a list of annotation records.

    Args:
        annotations: Annotation dictionaries, each holding a text and a list
            of spans under ``text_field`` / ``spans_field``.
        text_field: Key holding the text of each record.
        spans_field: Key holding the span list of each record.
        start_field: Key holding a span's start offset.
        end_field: Key holding a span's end offset.
        label_field: Key holding a span's label name.

    Returns:
        One validated ``Annotation`` per input record, in input order.

    Raises:
        AssertionError: If ``annotations`` is not a list of dicts.
        KeyError: If a record lacks ``text_field`` or ``spans_field``.
    """
    # Raised explicitly rather than via `assert` so the check still runs
    # under `python -O`; the exception type is kept for existing callers.
    if not (isinstance(annotations, list) and all(isinstance(ann, dict) for ann in annotations)):
        raise AssertionError("Input for annotations is not a list of dicts.")
    return [
        preprocessing(
            text=ann[text_field],
            spans=ann[spans_field],
            start_field=start_field,
            end_field=end_field,
            label_field=label_field,
        )
        for ann in annotations
    ]
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
from itertools import takewhile
|
|
2
|
+
|
|
3
|
+
from tokenizers import Tokenizer
|
|
4
|
+
|
|
5
|
+
from iob2labels.labels import IobPrefixes, IGNORE_TOKEN
|
|
6
|
+
from iob2labels.annotations import Annotation, DefaultFields
|
|
7
|
+
|
|
8
|
+
def invert_label_map(label_map: dict[str, int]) -> dict[int, str]:
    """Return the reverse mapping of ``label_map`` (label index -> label name).

    If several names share an index, the name that appears last wins.
    """
    return dict(zip(label_map.values(), label_map.keys()))
|
|
10
|
+
|
|
11
|
+
def get_iob_type_by_iob_label(label_map: dict[str, int], iob_label: int) -> str:
    """Return the first character of the tag that ``iob_label`` stands for.

    The ignore value is treated as the outside tag; every other index is
    looked up in the inverted label map.

    Raises:
        KeyError: If ``iob_label`` is neither the ignore value nor an index
            present in ``label_map``.
    """
    if iob_label == IGNORE_TOKEN:
        return IobPrefixes.OUTSIDE[0]
    names_by_index = invert_label_map(label_map)
    return names_by_index[iob_label][0]
|
|
18
|
+
|
|
19
|
+
def is_beginning_tag(label_map: dict[str, int], iob_label: int) -> bool:
    """Tell whether the label index maps to a beginning tag."""
    tag_type = get_iob_type_by_iob_label(label_map, iob_label)
    return tag_type == IobPrefixes.BEGINNING
|
|
22
|
+
|
|
23
|
+
def is_inside_tag(label_map: dict[str, int], iob_label: int) -> bool:
    """Tell whether the label index maps to an inside tag."""
    tag_type = get_iob_type_by_iob_label(label_map, iob_label)
    return tag_type == IobPrefixes.INSIDE
|
|
26
|
+
|
|
27
|
+
def is_outside_tag(label_map: dict[str, int], iob_label: int) -> bool:
    """Tell whether the label index maps to an outside tag (or the ignore value)."""
    tag_type = get_iob_type_by_iob_label(label_map, iob_label)
    return tag_type == IobPrefixes.OUTSIDE
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def get_entity_sequence_length(label_map: dict[str, int], iob_labels: list[int]) -> int:
    """Count the inside tags at the front of ``iob_labels``.

    Counting stops at the first label that is not an inside tag. Only the
    tag type is inspected, not the entity class the tag belongs to.
    """
    run_length = 0
    for label in iob_labels:
        if not is_inside_tag(label_map, label):
            break
        run_length += 1
    return run_length
|
|
35
|
+
|
|
36
|
+
def get_entity_index_ranges(label_map: dict[str, int], iob_labels: list[int]) -> list[tuple[int, int]]:
    """Locate every entity in a label sequence.

    Returns:
        One ``(first, last)`` pair of label indices per beginning tag, in
        sequence order. ``last`` is inclusive: it is ``first`` plus the
        number of inside tags that directly follow the beginning tag.
    """
    ranges = []
    for position, label in enumerate(iob_labels):
        if not is_beginning_tag(label_map, label):
            continue
        following = iob_labels[(position + 1):]
        ranges.append((position, position + get_entity_sequence_length(label_map, following)))
    return ranges
|
|
41
|
+
|
|
42
|
+
def check_iob_conversion(
    iob_labels: list[int],
    label_map: dict[str, int],
    tokenizer: Tokenizer,
    input_ids: list[int],
    annotation: Annotation,
    debug: bool = False,
    strict: bool = True,
) -> None:
    """Verify that the IOB labels recover the annotated entities.

    Each entity range found in ``iob_labels`` is decoded from ``input_ids``
    and compared with the decoded form of the corresponding annotated span.

    Args:
        iob_labels: Integer labels, one per token.
        label_map: Mapping of label name to label index.
        tokenizer: Tokenizer used to decode token ids and re-encode entities.
        input_ids: Token ids that ``iob_labels`` is aligned to.
        annotation: Validated annotation the labels were produced from.
        debug: If true, print a line for every entity that matches.
        strict: If true, require equality; otherwise only require the
            expected entity to be contained in the recovered one.

    Raises:
        AssertionError: If the number of recovered entities differs from the
            number of spans, or a recovered entity does not match its span.
    """
    match_ranges = get_entity_index_ranges(label_map, iob_labels)
    # Ranges come out in token order, so pair them with spans ordered by
    # start offset rather than relying on the annotation's own ordering.
    spans = sorted(annotation[DefaultFields.SPANS], key=lambda item: item[DefaultFields.START])
    num_ranges, num_spans = len(match_ranges), len(spans)
    # Raised explicitly (not via `assert`) so the check is not stripped
    # under `python -O`; exception type and messages are unchanged.
    if num_ranges != num_spans:
        raise AssertionError(f"Test found {num_ranges} matches but annotation includes {num_spans} entities.")

    for _range, span in zip(match_ranges, spans):
        # recover entity from IOB positive label indices (range end is inclusive)
        entity_input_ids = input_ids[_range[0]: (_range[1] + 1)]
        # some tokenizers (e.g., Roberta-Base) can include leading whitespace in the decoded entity
        recovered_entity = tokenizer.decode(entity_input_ids).strip()

        # encode/decode the annotated entity so both sides go through the tokenizer
        annotated_entity = annotation[DefaultFields.TEXT][span[DefaultFields.START]:span[DefaultFields.END]]
        expected_entity = tokenizer.decode(
            # standalone tokenizers: encode() returns an Encoding; .ids holds the token ids
            tokenizer.encode(annotated_entity, add_special_tokens=False).ids
        )
        if strict:
            result = expected_entity == recovered_entity
        else:
            result = expected_entity in recovered_entity
        if not result:
            raise AssertionError(
                f"Recovered entity (via IOB labels) '{recovered_entity}' does not match expected entity '{annotated_entity}'. Decoded form is '{expected_entity}'."
            )

        if debug:
            print(f"| -> recovered entity '{recovered_entity}' for IOB labels at indices ({_range[0]}, {_range[1]}), which matches annotated entity '{annotated_entity}'.")
|