iob2labels 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,25 @@
1
+ # .github/workflows/publish.yml
2
+ name: Publish to PyPI
3
+
4
+ on:
5
+ release:
6
+ types: [published]
7
+
8
+ permissions:
9
+ id-token: write
10
+
11
+ jobs:
12
+ publish:
13
+ runs-on: ubuntu-latest
14
+ environment: pypi
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+
18
+ - name: Install uv
19
+ uses: astral-sh/setup-uv@v4
20
+
21
+ - name: Build package
22
+ run: uv build
23
+
24
+ - name: Publish to PyPI
25
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,11 @@
1
+ .venv
2
+ .env
3
+ .DS_Store
4
+ pyrightconfig.json
5
+
6
+ __pycache__
7
+ .pytest_cache
8
+
9
+ data/
10
+ dist/
11
+ *.egg-info/
@@ -0,0 +1 @@
1
+ 3.10
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,163 @@
1
+ Metadata-Version: 2.4
2
+ Name: iob2labels
3
+ Version: 0.1.0
4
+ Summary: Convert IOB2 NER span annotations into integer label sequences aligned to any HuggingFace-compatible tokenizer.
5
+ License-Expression: MIT
6
+ License-File: LICENSE
7
+ Keywords: iob2,named-entity-recognition,ner,nlp,token-classification,tokenizers
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Classifier: Topic :: Text Processing :: Linguistic
19
+ Classifier: Typing :: Typed
20
+ Requires-Python: >=3.10
21
+ Requires-Dist: pydantic>=2.0
22
+ Requires-Dist: tokenizers>=0.15
23
+ Description-Content-Type: text/markdown
24
+
25
+ # iob2labels
26
+
27
+ Convert [IOB2-format](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)) NER span annotations into integer label sequences for Transformer-based token classification tasks.
28
+
29
+ If you use annotation tools like [Prodigy](https://prodi.gy/docs), [Label Studio](https://labelstud.io/), or [Doccano](https://github.com/doccano/doccano) to annotate NER data, this library converts those character-offset span annotations into the label arrays you need for training.
30
+
31
+ ## Installation
32
+
33
+ ```bash
34
+ uv add iob2labels
35
+ ```
36
+
37
+ Dependencies: `tokenizers` (HuggingFace Rust-backed tokenizer) and `pydantic`. No `torch` or `transformers` required.
38
+
39
+ ## Quick Start
40
+
41
+ ```python
42
+ from iob2labels import IOB2Encoder
43
+
44
+ encoder = IOB2Encoder(
45
+ labels=["actor", "character", "plot"],
46
+ tokenizer="bert-base-uncased",
47
+ )
48
+
49
+ labels = encoder(
50
+ text="Did Dame Judy Dench star in a British film about Queen Elizabeth?",
51
+ spans=[
52
+ {"label": "actor", "start": 4, "end": 19},
53
+ {"label": "plot", "start": 30, "end": 37},
54
+ {"label": "character", "start": 49, "end": 64},
55
+ ]
56
+ )
57
+ # >>> [-100, 0, 1, 2, 2, 2, 0, 0, 0, 5, 0, 0, 3, 4, 0, -100]
58
+ ```
59
+
60
+ > Example pulled from the [MITMovie](https://groups.csail.mit.edu/sls/downloads/movie/) dataset.
61
+
62
+ The output is a `list[int]` aligned to the tokenizer's output. Convert to a tensor or array as needed:
63
+
64
+ ```python
65
+ import torch
66
+ x = torch.tensor(labels)
67
+
68
+ # or with numpy
69
+ import numpy as np
70
+ x = np.array(labels)
71
+ ```
72
+
73
+ ## How It Works
74
+
75
+ The IOB2 format assigns each token one of three tag types:
76
+
77
+ - **O** (Outside) - not part of any entity
78
+ - **B-LABEL** (Beginning) - first token of an entity
79
+ - **I-LABEL** (Inside) - continuation of an entity
80
+
81
+ Each entity class generates 2 labels (B + I), plus the O class, so the total label count is always `(n * 2) + 1`:
82
+
83
+ ```python
84
+ encoder.label_map
85
+ # >>> {'O': 0, 'B-ACTOR': 1, 'I-ACTOR': 2, 'B-CHARACTER': 3, 'I-CHARACTER': 4, 'B-PLOT': 5, 'I-PLOT': 6}
86
+ ```
87
+
88
+ Special tokens (e.g., `[CLS]`, `[SEP]`) receive the ignore value `-100`, which PyTorch's `CrossEntropyLoss` skips by default.
89
+
90
+ ## Tokenizer Input
91
+
92
+ The `tokenizer` argument accepts three forms:
93
+
94
+ ```python
95
+ # 1. checkpoint name (downloads from HuggingFace Hub)
96
+ encoder = IOB2Encoder(labels=labels, tokenizer="bert-base-uncased")
97
+
98
+ # 2. standalone tokenizers.Tokenizer instance
99
+ from tokenizers import Tokenizer
100
+ tok = Tokenizer.from_pretrained("bert-base-uncased")
101
+ encoder = IOB2Encoder(labels=labels, tokenizer=tok)
102
+
103
+ # 3. transformers PreTrainedTokenizerFast (unwrapped automatically)
104
+ from transformers import AutoTokenizer
105
+ tok = AutoTokenizer.from_pretrained("bert-base-uncased")
106
+ encoder = IOB2Encoder(labels=labels, tokenizer=tok)
107
+ ```
108
+
109
+ ## Batch Encoding
110
+
111
+ ```python
112
+ annotations = [
113
+ {"text": "Did Dame Judy Dench star?", "spans": [{"label": "actor", "start": 4, "end": 19}]},
114
+ {"text": "Matt Damon was Jason Bourne.", "spans": [{"label": "actor", "start": 0, "end": 10}]},
115
+ ]
116
+
117
+ results = encoder.batch(annotations)
118
+ # >>> [[-100, 0, 1, 2, 2, 2, 0, 0, -100], [-100, 1, 2, 0, 0, 0, 0, -100]]
119
+ ```
120
+
121
+ The batch path uses the Rust-backed `encode_batch()` for parallelized tokenization. Returns `list[list[int]]` with no padding; use HuggingFace's `DataCollatorForTokenClassification` or your own padding for training.
122
+
123
+ ## Custom Field Names
124
+
125
+ If your annotation data uses non-standard field names, configure them at construction:
126
+
127
+ ```python
128
+ # BioMed-NER dataset uses "entities" and "class" instead of "spans" and "label"
129
+ encoder = IOB2Encoder(
130
+ labels=["organism", "chemicals"],
131
+ tokenizer="bert-base-uncased",
132
+ spans_field="entities",
133
+ label_field="class",
134
+ )
135
+ ```
136
+
137
+ ## Built-in Conversion Check
138
+
139
+ By default, every encoding is verified by recovering the entity text from the produced labels and comparing it to the original annotation. This catches misalignment bugs early. Disable it for performance in production:
140
+
141
+ ```python
142
+ encoder = IOB2Encoder(labels=labels, tokenizer=tok, conversion_check=False)
143
+ ```
144
+
145
+ ## Supported Tokenizers
146
+
147
+ Tested across three tokenizer families:
148
+
149
+ | Family | Checkpoints |
150
+ |---|---|
151
+ | WordPiece | `bert-base-cased`, `bert-base-uncased`, `bert-large-cased`, `bert-large-uncased`, `distilbert-base-cased`, `distilbert-base-uncased`, `google/electra-base-discriminator` |
152
+ | BPE | `roberta-base`, `roberta-large` |
153
+ | SentencePiece | `albert-base-v2`, `xlnet-base-cased`, `t5-small` |
154
+
155
+ Other HuggingFace-compatible tokenizers should work as well. The built-in conversion check will flag any issues.
156
+
157
+ ## Tests
158
+
159
+ ```bash
160
+ uv run pytest tests/ -v
161
+ ```
162
+
163
+ The test suite includes unit tests for label map construction, entity range detection, and the conversion checker, plus a parametrized matrix of 12 tokenizer checkpoints across multiple annotation edge cases (entities at text boundaries, adjacent entities, punctuation, etc.).
@@ -0,0 +1,139 @@
1
+ # iob2labels
2
+
3
+ Convert [IOB2-format](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)) NER span annotations into integer label sequences for Transformer-based token classification tasks.
4
+
5
+ If you use annotation tools like [Prodigy](https://prodi.gy/docs), [Label Studio](https://labelstud.io/), or [Doccano](https://github.com/doccano/doccano) to annotate NER data, this library converts those character-offset span annotations into the label arrays you need for training.
6
+
7
+ ## Installation
8
+
9
+ ```bash
10
+ uv add iob2labels
11
+ ```
12
+
13
+ Dependencies: `tokenizers` (HuggingFace Rust-backed tokenizer) and `pydantic`. No `torch` or `transformers` required.
14
+
15
+ ## Quick Start
16
+
17
+ ```python
18
+ from iob2labels import IOB2Encoder
19
+
20
+ encoder = IOB2Encoder(
21
+ labels=["actor", "character", "plot"],
22
+ tokenizer="bert-base-uncased",
23
+ )
24
+
25
+ labels = encoder(
26
+ text="Did Dame Judy Dench star in a British film about Queen Elizabeth?",
27
+ spans=[
28
+ {"label": "actor", "start": 4, "end": 19},
29
+ {"label": "plot", "start": 30, "end": 37},
30
+ {"label": "character", "start": 49, "end": 64},
31
+ ]
32
+ )
33
+ # >>> [-100, 0, 1, 2, 2, 2, 0, 0, 0, 5, 0, 0, 3, 4, 0, -100]
34
+ ```
35
+
36
+ > Example pulled from the [MITMovie](https://groups.csail.mit.edu/sls/downloads/movie/) dataset.
37
+
38
+ The output is a `list[int]` aligned to the tokenizer's output. Convert to a tensor or array as needed:
39
+
40
+ ```python
41
+ import torch
42
+ x = torch.tensor(labels)
43
+
44
+ # or with numpy
45
+ import numpy as np
46
+ x = np.array(labels)
47
+ ```
48
+
49
+ ## How It Works
50
+
51
+ The IOB2 format assigns each token one of three tag types:
52
+
53
+ - **O** (Outside) - not part of any entity
54
+ - **B-LABEL** (Beginning) - first token of an entity
55
+ - **I-LABEL** (Inside) - continuation of an entity
56
+
57
+ Each entity class generates 2 labels (B + I), plus the O class, so the total label count is always `(n * 2) + 1`:
58
+
59
+ ```python
60
+ encoder.label_map
61
+ # >>> {'O': 0, 'B-ACTOR': 1, 'I-ACTOR': 2, 'B-CHARACTER': 3, 'I-CHARACTER': 4, 'B-PLOT': 5, 'I-PLOT': 6}
62
+ ```
63
+
64
+ Special tokens (e.g., `[CLS]`, `[SEP]`) receive the ignore value `-100`, which PyTorch's `CrossEntropyLoss` skips by default.
65
+
66
+ ## Tokenizer Input
67
+
68
+ The `tokenizer` argument accepts three forms:
69
+
70
+ ```python
71
+ # 1. checkpoint name (downloads from HuggingFace Hub)
72
+ encoder = IOB2Encoder(labels=labels, tokenizer="bert-base-uncased")
73
+
74
+ # 2. standalone tokenizers.Tokenizer instance
75
+ from tokenizers import Tokenizer
76
+ tok = Tokenizer.from_pretrained("bert-base-uncased")
77
+ encoder = IOB2Encoder(labels=labels, tokenizer=tok)
78
+
79
+ # 3. transformers PreTrainedTokenizerFast (unwrapped automatically)
80
+ from transformers import AutoTokenizer
81
+ tok = AutoTokenizer.from_pretrained("bert-base-uncased")
82
+ encoder = IOB2Encoder(labels=labels, tokenizer=tok)
83
+ ```
84
+
85
+ ## Batch Encoding
86
+
87
+ ```python
88
+ annotations = [
89
+ {"text": "Did Dame Judy Dench star?", "spans": [{"label": "actor", "start": 4, "end": 19}]},
90
+ {"text": "Matt Damon was Jason Bourne.", "spans": [{"label": "actor", "start": 0, "end": 10}]},
91
+ ]
92
+
93
+ results = encoder.batch(annotations)
94
+ # >>> [[-100, 0, 1, 2, 2, 2, 0, 0, -100], [-100, 1, 2, 0, 0, 0, 0, -100]]
95
+ ```
96
+
97
+ The batch path uses the Rust-backed `encode_batch()` for parallelized tokenization. Returns `list[list[int]]` with no padding; use HuggingFace's `DataCollatorForTokenClassification` or your own padding for training.
98
+
99
+ ## Custom Field Names
100
+
101
+ If your annotation data uses non-standard field names, configure them at construction:
102
+
103
+ ```python
104
+ # BioMed-NER dataset uses "entities" and "class" instead of "spans" and "label"
105
+ encoder = IOB2Encoder(
106
+ labels=["organism", "chemicals"],
107
+ tokenizer="bert-base-uncased",
108
+ spans_field="entities",
109
+ label_field="class",
110
+ )
111
+ ```
112
+
113
+ ## Built-in Conversion Check
114
+
115
+ By default, every encoding is verified by recovering the entity text from the produced labels and comparing it to the original annotation. This catches misalignment bugs early. Disable it for performance in production:
116
+
117
+ ```python
118
+ encoder = IOB2Encoder(labels=labels, tokenizer=tok, conversion_check=False)
119
+ ```
120
+
121
+ ## Supported Tokenizers
122
+
123
+ Tested across three tokenizer families:
124
+
125
+ | Family | Checkpoints |
126
+ |---|---|
127
+ | WordPiece | `bert-base-cased`, `bert-base-uncased`, `bert-large-cased`, `bert-large-uncased`, `distilbert-base-cased`, `distilbert-base-uncased`, `google/electra-base-discriminator` |
128
+ | BPE | `roberta-base`, `roberta-large` |
129
+ | SentencePiece | `albert-base-v2`, `xlnet-base-cased`, `t5-small` |
130
+
131
+ Other HuggingFace-compatible tokenizers should work as well. The built-in conversion check will flag any issues.
132
+
133
+ ## Tests
134
+
135
+ ```bash
136
+ uv run pytest tests/ -v
137
+ ```
138
+
139
+ The test suite includes unit tests for label map construction, entity range detection, and the conversion checker, plus a parametrized matrix of 12 tokenizer checkpoints across multiple annotation edge cases (entities at text boundaries, adjacent entities, punctuation, etc.).
@@ -0,0 +1,40 @@
1
+ [project]
2
+ name = "iob2labels"
3
+ version = "0.1.0"
4
+ description = "Convert IOB2 NER span annotations into integer label sequences aligned to any HuggingFace-compatible tokenizer."
5
+ readme = "README.md"
6
+ license = "MIT"
7
+ requires-python = ">=3.10"
8
+ keywords = ["ner", "named-entity-recognition", "iob2", "token-classification", "nlp", "tokenizers"]
9
+ classifiers = [
10
+ "Development Status :: 3 - Alpha",
11
+ "Intended Audience :: Developers",
12
+ "Intended Audience :: Science/Research",
13
+ "License :: OSI Approved :: MIT License",
14
+ "Programming Language :: Python :: 3",
15
+ "Programming Language :: Python :: 3.10",
16
+ "Programming Language :: Python :: 3.11",
17
+ "Programming Language :: Python :: 3.12",
18
+ "Programming Language :: Python :: 3.13",
19
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
20
+ "Topic :: Text Processing :: Linguistic",
21
+ "Typing :: Typed",
22
+ ]
23
+ dependencies = [
24
+ "pydantic>=2.0",
25
+ "tokenizers>=0.15",
26
+ ]
27
+
28
+ [tool.hatch.build.targets.wheel]
29
+ packages = ["src/iob2labels"]
30
+
31
+ [build-system]
32
+ requires = ["hatchling"]
33
+ build-backend = "hatchling.build"
34
+
35
+ [dependency-groups]
36
+ dev = [
37
+ "ipython>=8.37.0",
38
+ "pytest>=8.4.2",
39
+ "transformers>=4.40",
40
+ ]
@@ -0,0 +1,15 @@
1
+ ## -- primary interface
2
+ from .encoder import IOB2Encoder as IOB2Encoder
3
+
4
+ ## -- types
5
+ from .annotations import Annotation as Annotation
6
+ from .annotations import Span as Span
7
+
8
+ ## -- utilities
9
+ from .labels import create_label_map as create_label_map
10
+ from .labels import format_entity_label as format_entity_label
11
+ from .annotations import preprocessing as preprocessing
12
+
13
+ ## -- checker
14
+ from .checker import check_iob_conversion as check_iob_conversion
15
+ from .checker import get_entity_index_ranges as get_entity_index_ranges
@@ -0,0 +1,87 @@
1
+ from typing import Any
2
+ from typing_extensions import TypedDict
3
+
4
+ from pydantic import BaseModel, Field, ValidationError
5
+ from pydantic import AfterValidator, StrictStr, StrictInt
6
+
7
+
8
class DefaultFields:
    """Default dictionary keys used when reading annotation data."""

    # key holding the raw text of an annotation
    TEXT = "text"
    # key holding the list of entity spans of an annotation
    SPANS = "spans"
    # keys read from each individual span
    START = "start"
    END = "end"
    LABEL = "label"
14
+
15
+ ## -- Pydantic models to perform validation of input annotation data
16
+ ## -- and support conversion for iob-label conversion.
17
+
18
class _SpanFormat(BaseModel):
    """Pydantic model used to validate a single entity span.

    All three fields are required and use pydantic's strict types
    (`StrictInt` / `StrictStr`).
    """

    start: StrictInt = Field(..., description="Index of entity starting character in text string.")
    end: StrictInt = Field(..., description="Index of entity ending character in text string.")
    # description previously lacked its closing parenthesis
    label: StrictStr = Field(..., description="Label name for annotated entity (e.g., PERSON, PRODUCT, LOCATION, etc.).")
22
+
23
class _AnnotationFormat(BaseModel):
    """Pydantic model for one annotation: a text string plus its validated spans."""

    # raw annotated text (strict string)
    text: StrictStr
    # entity spans, each validated as a `_SpanFormat`
    spans: list[_SpanFormat]
26
+
27
def convert_to_validated_format(
    text: str,
    spans: list[dict],
    start_field: str,
    end_field: str,
    label_field: str
) -> _AnnotationFormat:
    """Build validated pydantic models from raw annotation data.

    Each span dict is read through the caller-supplied field names and mapped
    onto the fixed `start` / `end` / `label` fields of `_SpanFormat`.

    Args:
        text: Raw annotation text.
        spans: Span dictionaries keyed by the given field names.
        start_field: Key holding the span start index.
        end_field: Key holding the span end index.
        label_field: Key holding the span label name.

    Returns:
        An `_AnnotationFormat` wrapping the text and the validated spans.

    Raises:
        KeyError: If a span dict lacks one of the requested keys.
    """
    def _to_model(raw_span: dict) -> _SpanFormat:
        # translate custom field names into the model's fixed field names
        return _SpanFormat(
            start=raw_span[start_field],
            end=raw_span[end_field],
            label=raw_span[label_field],
        )

    validated_spans = [_to_model(raw_span) for raw_span in spans]
    return _AnnotationFormat(text=text, spans=validated_spans)
45
+
46
+ ## -- typed-dicts are returned from validation step as they are _lightly_ typed
47
+ ## -- but benefit from intermediate pydantic validation and preprocessing.
48
class Span(TypedDict):
    """Validated entity span expressed with the standard field names."""

    # index of the entity's starting character in the text
    start: int
    # index of the entity's ending character in the text
    end: int
    # label name of the annotated entity
    label: str
52
+
53
class Annotation(TypedDict):
    """Validated annotation: the text together with its entity spans."""

    # annotated text
    text: str
    # entity spans found in the text
    spans: list[Span]
56
+
57
def preprocessing(
    text: str,
    spans: list[dict],
    start_field: str = DefaultFields.START,
    end_field: str = DefaultFields.END,
    label_field: str = DefaultFields.LABEL
) -> Annotation:
    """Validate one annotation and return it as an `Annotation` typed dict.

    Args:
        text: Raw annotation text.
        spans: Span dictionaries keyed by the given field names.
        start_field: Key holding the span start index.
        end_field: Key holding the span end index.
        label_field: Key holding the span label name.

    Returns:
        The validated annotation, with spans re-keyed to `start`, `end`
        and `label`.
    """
    # run the data through the pydantic models for validation and re-keying
    model = convert_to_validated_format(
        text,
        spans,
        start_field,
        end_field,
        label_field,
    )
    # dump back to plain dictionaries and hand out the typed-dict view
    payload = model.model_dump()
    return Annotation(**payload)
68
+
69
def validate_batch(
    annotations: list[dict],
    text_field: str = DefaultFields.TEXT,
    spans_field: str = DefaultFields.SPANS,
    start_field: str = DefaultFields.START,
    end_field: str = DefaultFields.END,
    label_field: str = DefaultFields.LABEL
) -> list[Annotation]:
    """Validate a list of annotation dictionaries.

    Args:
        annotations: Annotation dictionaries, each holding a text and a list of spans.
        text_field: Key holding the text in each annotation.
        spans_field: Key holding the span list in each annotation.
        start_field: Key holding the start index in each span.
        end_field: Key holding the end index in each span.
        label_field: Key holding the label name in each span.

    Returns:
        One validated `Annotation` per input item, in input order.

    Raises:
        AssertionError: If `annotations` is not a list of dicts.
    """
    is_list_of_dicts = isinstance(annotations, list) and all(
        isinstance(item, dict) for item in annotations
    )
    assert is_list_of_dicts, "Input for annotations is not a list of dicts."

    validated: list[Annotation] = []
    for item in annotations:
        validated.append(
            preprocessing(
                text=item[text_field],
                spans=item[spans_field],
                start_field=start_field,
                end_field=end_field,
                label_field=label_field,
            )
        )
    return validated
@@ -0,0 +1,71 @@
1
+ from itertools import takewhile
2
+
3
+ from tokenizers import Tokenizer
4
+
5
+ from iob2labels.labels import IobPrefixes, IGNORE_TOKEN
6
+ from iob2labels.annotations import Annotation, DefaultFields
7
+
8
def invert_label_map(label_map: dict[str, int]) -> dict[int, str]:
    """Return the reverse lookup of `label_map`: integer label -> label name."""
    names = label_map.keys()
    indices = label_map.values()
    # keys() and values() iterate in matching order, so zip pairs them correctly
    return dict(zip(indices, names))
10
+
11
def get_iob_type_by_iob_label(label_map: dict[str, int], iob_label: int) -> str:
    """Return the first character of the label name for an integer label.

    The ignore value (`IGNORE_TOKEN`) is treated as an outside tag.

    Raises:
        KeyError: If `iob_label` is neither the ignore value nor a value of `label_map`.
    """
    if iob_label == IGNORE_TOKEN:
        return IobPrefixes.OUTSIDE[0]
    # look the label name up through the reversed map, then keep its prefix character
    label_name = invert_label_map(label_map)[iob_label]
    return label_name[0]
18
+
19
def is_beginning_tag(label_map: dict[str, int], iob_label: int) -> bool:
    """Return True when the integer label maps to a beginning (B) tag."""
    tag_type = get_iob_type_by_iob_label(label_map, iob_label)
    return tag_type == IobPrefixes.BEGINNING
22
+
23
def is_inside_tag(label_map: dict[str, int], iob_label: int) -> bool:
    """Return True when the integer label maps to an inside (I) tag."""
    tag_type = get_iob_type_by_iob_label(label_map, iob_label)
    return tag_type == IobPrefixes.INSIDE
26
+
27
def is_outside_tag(label_map: dict[str, int], iob_label: int) -> bool:
    """Return True when the integer label maps to an outside (O) tag."""
    tag_type = get_iob_type_by_iob_label(label_map, iob_label)
    return tag_type == IobPrefixes.OUTSIDE
30
+
31
+
32
+
33
def get_entity_sequence_length(label_map: dict[str, int], iob_labels: list[int]) -> int:
    """Count the inside tags at the start of `iob_labels`.

    Counting stops at the first label that is not an inside tag.
    """
    length = 0
    for label in iob_labels:
        if not is_inside_tag(label_map, label):
            break
        length += 1
    return length
35
+
36
def get_entity_index_ranges(label_map: dict[str, int], iob_labels: list[int]) -> list[tuple[int, int]]:
    """Locate every entity in a label sequence.

    Returns:
        One `(first, last)` tuple per beginning tag, in sequence order. `first`
        is the index of the beginning tag and `last` is the index of the final
        inside tag directly following it (equal to `first` when none follow).
    """
    ranges: list[tuple[int, int]] = []
    for position, label in enumerate(iob_labels):
        if is_beginning_tag(label_map, label):
            # the entity extends over the inside tags directly after the beginning tag
            tail = iob_labels[(position + 1):]
            ranges.append((position, position + get_entity_sequence_length(label_map, tail)))
    return ranges
41
+
42
def check_iob_conversion(
    iob_labels: list[int],
    label_map: dict[str, int],
    tokenizer: Tokenizer,
    input_ids: list[int],
    annotation: Annotation,
    debug: bool = False,
    strict: bool = True,
) -> None:
    """Check that the IOB labels recover the annotated entities.

    Each entity is decoded from the token ids covered by its labels and compared
    with the annotated text slice after the same encode/decode round trip.

    Args:
        iob_labels: Integer labels, one per token.
        label_map: Mapping of label name to integer label.
        tokenizer: Tokenizer used to encode and decode entity text.
        input_ids: Token ids of the encoded text, aligned with `iob_labels`.
        annotation: Validated annotation holding the text and its spans.
        debug: Print a line for every successfully recovered entity.
        strict: Require equality; when False the expected text only has to be
            contained in the recovered text.

    Raises:
        AssertionError: If the number of labelled entities differs from the
            number of spans, or a recovered entity does not match.
    """
    match_ranges = get_entity_index_ranges(label_map, iob_labels)

    # Entity ranges are produced in token order, so pair them with the spans
    # ordered by start offset instead of by their position in the annotation.
    # Otherwise a correct encoding of unsorted spans is reported as a mismatch.
    spans = sorted(
        annotation[DefaultFields.SPANS],
        key=lambda span: span[DefaultFields.START],
    )

    num_ranges, num_spans = len(match_ranges), len(spans)
    assert num_ranges == num_spans, f"Test found {num_ranges} matches but annotation includes {num_spans} entities."

    text = annotation[DefaultFields.TEXT]
    for (first, last), span in zip(match_ranges, spans):
        # recover entity from IOB positive label indices
        entity_input_ids = input_ids[first:(last + 1)]
        # some tokenizers (e.g., Roberta-Base) can include leading whitespace in the decoded entity
        recovered_entity = tokenizer.decode(entity_input_ids).strip()

        # encode/decode annotated entity so both sides share the tokenizer's normalization
        annotated_entity = text[span[DefaultFields.START]:span[DefaultFields.END]]
        expected_entity = tokenizer.decode(
            # standalone tokenizers: encode() returns an Encoding, access .ids for token ids
            tokenizer.encode(annotated_entity, add_special_tokens=False).ids
        )

        if strict:
            result = expected_entity == recovered_entity
        else:
            result = expected_entity in recovered_entity
        assert result, f"Recovered entity (via IOB labels) '{recovered_entity}' does not match expected entity '{annotated_entity}'. Decoded form is '{expected_entity}'."

        if debug:
            print(f"| -> recovered entity '{recovered_entity}' for IOB labels at indices ({first}, {last}), which matches annotated entity '{annotated_entity}'.")