glitchlings-0.10.2-cp312-cp312-macosx_11_0_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- glitchlings/__init__.py +99 -0
- glitchlings/__main__.py +8 -0
- glitchlings/_zoo_rust/__init__.py +12 -0
- glitchlings/_zoo_rust.cpython-312-darwin.so +0 -0
- glitchlings/assets/__init__.py +180 -0
- glitchlings/assets/apostrofae_pairs.json +32 -0
- glitchlings/assets/ekkokin_homophones.json +2014 -0
- glitchlings/assets/hokey_assets.json +193 -0
- glitchlings/assets/lexemes/academic.json +1049 -0
- glitchlings/assets/lexemes/colors.json +1333 -0
- glitchlings/assets/lexemes/corporate.json +716 -0
- glitchlings/assets/lexemes/cyberpunk.json +22 -0
- glitchlings/assets/lexemes/lovecraftian.json +23 -0
- glitchlings/assets/lexemes/synonyms.json +3354 -0
- glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
- glitchlings/assets/ocr_confusions.tsv +30 -0
- glitchlings/assets/pipeline_assets.json +29 -0
- glitchlings/attack/__init__.py +147 -0
- glitchlings/attack/analysis.py +1321 -0
- glitchlings/attack/core.py +493 -0
- glitchlings/attack/core_execution.py +367 -0
- glitchlings/attack/core_planning.py +612 -0
- glitchlings/attack/encode.py +114 -0
- glitchlings/attack/metrics.py +218 -0
- glitchlings/attack/metrics_dispatch.py +70 -0
- glitchlings/attack/tokenization.py +227 -0
- glitchlings/auggie.py +284 -0
- glitchlings/compat/__init__.py +9 -0
- glitchlings/compat/loaders.py +355 -0
- glitchlings/compat/types.py +41 -0
- glitchlings/conf/__init__.py +41 -0
- glitchlings/conf/loaders.py +331 -0
- glitchlings/conf/schema.py +156 -0
- glitchlings/conf/types.py +72 -0
- glitchlings/config.toml +2 -0
- glitchlings/constants.py +59 -0
- glitchlings/dev/__init__.py +3 -0
- glitchlings/dev/docs.py +45 -0
- glitchlings/dlc/__init__.py +19 -0
- glitchlings/dlc/_shared.py +296 -0
- glitchlings/dlc/gutenberg.py +400 -0
- glitchlings/dlc/huggingface.py +68 -0
- glitchlings/dlc/prime.py +215 -0
- glitchlings/dlc/pytorch.py +98 -0
- glitchlings/dlc/pytorch_lightning.py +173 -0
- glitchlings/internal/__init__.py +16 -0
- glitchlings/internal/rust.py +159 -0
- glitchlings/internal/rust_ffi.py +490 -0
- glitchlings/main.py +426 -0
- glitchlings/protocols.py +91 -0
- glitchlings/runtime_config.py +24 -0
- glitchlings/util/__init__.py +27 -0
- glitchlings/util/adapters.py +65 -0
- glitchlings/util/keyboards.py +356 -0
- glitchlings/util/transcripts.py +108 -0
- glitchlings/zoo/__init__.py +161 -0
- glitchlings/zoo/assets/__init__.py +29 -0
- glitchlings/zoo/core.py +678 -0
- glitchlings/zoo/core_execution.py +154 -0
- glitchlings/zoo/core_planning.py +451 -0
- glitchlings/zoo/corrupt_dispatch.py +295 -0
- glitchlings/zoo/hokey.py +139 -0
- glitchlings/zoo/jargoyle.py +243 -0
- glitchlings/zoo/mim1c.py +148 -0
- glitchlings/zoo/pedant/__init__.py +109 -0
- glitchlings/zoo/pedant/core.py +105 -0
- glitchlings/zoo/pedant/forms.py +74 -0
- glitchlings/zoo/pedant/stones.py +74 -0
- glitchlings/zoo/redactyl.py +97 -0
- glitchlings/zoo/rng.py +259 -0
- glitchlings/zoo/rushmore.py +416 -0
- glitchlings/zoo/scannequin.py +66 -0
- glitchlings/zoo/transforms.py +346 -0
- glitchlings/zoo/typogre.py +128 -0
- glitchlings/zoo/validation.py +477 -0
- glitchlings/zoo/wherewolf.py +120 -0
- glitchlings/zoo/zeedub.py +93 -0
- glitchlings-0.10.2.dist-info/METADATA +337 -0
- glitchlings-0.10.2.dist-info/RECORD +83 -0
- glitchlings-0.10.2.dist-info/WHEEL +5 -0
- glitchlings-0.10.2.dist-info/entry_points.txt +3 -0
- glitchlings-0.10.2.dist-info/licenses/LICENSE +201 -0
- glitchlings-0.10.2.dist-info/top_level.txt +1 -0
glitchlings/attack/encode.py
@@ -0,0 +1,114 @@
+"""Pure encoding utilities for tokenization.
+
+This module contains pure functions for encoding text using tokenizers.
+The functions here do not resolve tokenizers or perform IO - they operate
+on already-resolved Tokenizer instances.
+
+Pure guarantees:
+- No import side effects beyond stdlib
+- No file IO or network calls
+- No environment variable access
+- Deterministic output for given inputs
+
+The impure tokenizer resolution lives in tokenization.py.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Sequence
+
+if TYPE_CHECKING:  # pragma: no cover - typing only
+    from .tokenization import Tokenizer
+
+
+def encode_single(
+    tokenizer: "Tokenizer",
+    text: str,
+) -> tuple[list[str], list[int]]:
+    """Encode a single text string into tokens and IDs.
+
+    This is a thin wrapper that ensures list output types.
+
+    Args:
+        tokenizer: A resolved tokenizer instance.
+        text: Text to encode.
+
+    Returns:
+        Tuple of (tokens, token_ids) as lists.
+    """
+    tokens, ids = tokenizer.encode(text)
+    return list(tokens), list(ids)
+
+
+def encode_batch(
+    tokenizer: "Tokenizer",
+    texts: Sequence[str],
+) -> tuple[list[list[str]], list[list[int]]]:
+    """Encode multiple texts into batched tokens and IDs.
+
+    Attempts to use the tokenizer's encode_batch method if available,
+    otherwise falls back to per-item encoding.
+
+    Args:
+        tokenizer: A resolved tokenizer instance.
+        texts: Sequence of texts to encode.
+
+    Returns:
+        Tuple of (token_batches, id_batches) as nested lists.
+    """
+    # Try batch encoding if available
+    batch_encode = getattr(tokenizer, "encode_batch", None)
+    if callable(batch_encode):
+        encoded = batch_encode(texts)
+        token_batches: list[list[str]] = []
+        id_batches: list[list[int]] = []
+        for tokens, ids in encoded:
+            token_batches.append(list(tokens))
+            id_batches.append(list(ids))
+        return token_batches, id_batches
+
+    # Fallback: encode each text individually
+    token_batches_fallback: list[list[str]] = []
+    id_batches_fallback: list[list[int]] = []
+    for entry in texts:
+        tokens, ids = encode_single(tokenizer, entry)
+        token_batches_fallback.append(tokens)
+        id_batches_fallback.append(ids)
+    return token_batches_fallback, id_batches_fallback
+
+
+def describe_tokenizer(
+    tokenizer: "Tokenizer",
+    raw_spec: "str | Tokenizer | None",
+) -> str:
+    """Generate a human-readable description of a tokenizer.
+
+    Args:
+        tokenizer: The resolved tokenizer instance.
+        raw_spec: The original specification used to create/resolve the tokenizer.
+
+    Returns:
+        A descriptive string identifying the tokenizer.
+    """
+    # If the raw spec was a string, use it directly
+    if isinstance(raw_spec, str):
+        return raw_spec
+
+    # Try to get a name attribute
+    name = getattr(tokenizer, "name", None)
+    if isinstance(name, str) and name:
+        return name
+
+    # For None spec, use the class name
+    if raw_spec is None:
+        return tokenizer.__class__.__name__
+
+    # Fallback to string representation
+    return str(raw_spec)
+
+
+__all__ = [
+    "describe_tokenizer",
+    "encode_batch",
+    "encode_single",
+]
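Usage note (not part of the package diff): a minimal sketch of how the helpers above compose, assuming the wheel exposes this module as glitchlings.attack.encode (per the file list) and pairing it with the dependency-free WhitespaceTokenizer defined in glitchlings/attack/tokenization.py further down.

# Sketch only: module paths follow the file list at the top of this diff.
from glitchlings.attack.encode import describe_tokenizer, encode_batch, encode_single
from glitchlings.attack.tokenization import WhitespaceTokenizer

tokenizer = WhitespaceTokenizer()

tokens, ids = encode_single(tokenizer, "the quick brown fox")
print(tokens)                               # ['the', 'quick', 'brown', 'fox']
print(describe_tokenizer(tokenizer, None))  # 'WhitespaceTokenizer' (no name attribute, spec is None)

token_batches, id_batches = encode_batch(tokenizer, ["hello world", "again"])
print(token_batches)                        # [['hello', 'world'], ['again']]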
glitchlings/attack/metrics.py
@@ -0,0 +1,218 @@
+from __future__ import annotations
+
+import importlib
+from enum import Enum
+from typing import TYPE_CHECKING, Any, Protocol, cast
+
+from .metrics_dispatch import TokenBatch, TokenSequence, is_batch, validate_batch_consistency
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+
+class Metric(Protocol):
+    def __call__(
+        self,
+        original_tokens: TokenSequence | TokenBatch,
+        corrupted_tokens: TokenSequence | TokenBatch,
+    ) -> float | list[float]: ...
+
+
+class BatchMetric(Protocol):
+    def __call__(self, inputs: TokenBatch, outputs: TokenBatch) -> list[float]: ...
+
+
+try:
+    _rust: Any = importlib.import_module("glitchlings._zoo_rust")
+except ModuleNotFoundError as exc:  # pragma: no cover - runtime guard
+    raise ImportError(
+        "Could not import compiled Rust extension. "
+        "Please ensure the project is installed with the Rust extension built."
+    ) from exc
+
+_single_jsd = cast(Metric, getattr(_rust, "jensen_shannon_divergence"))
+_single_ned = cast(Metric, getattr(_rust, "normalized_edit_distance"))
+_single_sr = cast(Metric, getattr(_rust, "subsequence_retention"))
+_single_ed = cast(Metric, getattr(_rust, "entropy_delta"))
+_single_msi = cast(Metric, getattr(_rust, "merge_split_index"))
+_batch_jsd = cast(BatchMetric, getattr(_rust, "batch_jensen_shannon_divergence"))
+_batch_ned = cast(BatchMetric, getattr(_rust, "batch_normalized_edit_distance"))
+_batch_sr = cast(BatchMetric, getattr(_rust, "batch_subsequence_retention"))
+_batch_ed = cast(BatchMetric, getattr(_rust, "batch_entropy_delta"))
+_batch_msi = cast(BatchMetric, getattr(_rust, "batch_merge_split_index"))
+
+
+def _dispatch_metric(
+    original: TokenSequence | TokenBatch,
+    corrupted: TokenSequence | TokenBatch,
+    *,
+    single: Metric,
+    batch: BatchMetric,
+    name: str,
+) -> float | list[float]:
+    """Dispatch metric computation to single or batch implementation.
+
+    Uses the pure is_batch function to determine which implementation to call.
+    """
+    validate_batch_consistency(original, corrupted, name)
+
+    if is_batch(original):
+        return batch(original, corrupted)
+
+    return single(original, corrupted)
+
+
+def jensen_shannon_divergence(
+    original_tokens: TokenSequence | TokenBatch,
+    corrupted_tokens: TokenSequence | TokenBatch,
+) -> float | list[float]:
+    return _dispatch_metric(
+        original_tokens,
+        corrupted_tokens,
+        single=_single_jsd,
+        batch=_batch_jsd,
+        name="jensen_shannon_divergence",
+    )
+
+
+def normalized_edit_distance(
+    original_tokens: TokenSequence | TokenBatch,
+    corrupted_tokens: TokenSequence | TokenBatch,
+) -> float | list[float]:
+    return _dispatch_metric(
+        original_tokens,
+        corrupted_tokens,
+        single=_single_ned,
+        batch=_batch_ned,
+        name="normalized_edit_distance",
+    )
+
+
+def subsequence_retention(
+    original_tokens: TokenSequence | TokenBatch,
+    corrupted_tokens: TokenSequence | TokenBatch,
+) -> float | list[float]:
+    return _dispatch_metric(
+        original_tokens,
+        corrupted_tokens,
+        single=_single_sr,
+        batch=_batch_sr,
+        name="subsequence_retention",
+    )
+
+
+def entropy_delta(
+    original_tokens: TokenSequence | TokenBatch,
+    corrupted_tokens: TokenSequence | TokenBatch,
+) -> float | list[float]:
+    """Compute normalized entropy delta between original and corrupted tokens.
+
+    Measures the change in token distribution entropy:
+    ΔH = H(corrupted) - H(original), normalized to [-1, 1].
+
+    Positive values indicate the corrupted text has higher entropy
+    (more uniform/diverse token distribution). Negative values indicate
+    lower entropy (more concentrated distribution).
+
+    Args:
+        original_tokens: Original token sequence(s).
+        corrupted_tokens: Corrupted token sequence(s).
+
+    Returns:
+        Normalized entropy delta in [-1, 1], or list for batches.
+    """
+    return _dispatch_metric(
+        original_tokens,
+        corrupted_tokens,
+        single=_single_ed,
+        batch=_batch_ed,
+        name="entropy_delta",
+    )
+
+
+def merge_split_index(
+    original_tokens: TokenSequence | TokenBatch,
+    corrupted_tokens: TokenSequence | TokenBatch,
+) -> float | list[float]:
+    """Compute merge-split index measuring subword restructuring.
+
+    Estimates 1→k (split) and k→1 (merge) token events from alignment.
+    Higher values indicate more dramatic tokenization changes.
+
+    MSI = (splits + merges) / max(m, n) ∈ [0, 1]
+
+    Args:
+        original_tokens: Original token sequence(s).
+        corrupted_tokens: Corrupted token sequence(s).
+
+    Returns:
+        Merge-split index in [0, 1], or list for batches.
+    """
+    return _dispatch_metric(
+        original_tokens,
+        corrupted_tokens,
+        single=_single_msi,
+        batch=_batch_msi,
+        name="merge_split_index",
+    )
+
+
+# ---------------------------------------------------------------------------
+# MetricName Enum
+# ---------------------------------------------------------------------------
+
+
+class MetricName(str, Enum):
+    """Built-in metric names.
+
+    Use these instead of string literals to avoid typos and enable IDE completion.
+
+    Example:
+        >>> attack = Attack(Typogre(), metrics={MetricName.NED: normalized_edit_distance})
+        >>> # or get all defaults:
+        >>> attack = Attack(Typogre(), metrics=MetricName.defaults())
+    """
+
+    JSD = "jensen_shannon_divergence"
+    NED = "normalized_edit_distance"
+    SR = "subsequence_retention"
+    HD = "entropy_delta"
+    MSI = "merge_split_index"
+
+    @property
+    def func(self) -> "Callable[..., float | list[float]]":
+        """Get the metric function for this name."""
+        return _METRIC_FUNCTIONS[self]
+
+    @classmethod
+    def defaults(cls) -> dict[str, "Callable[..., float | list[float]]"]:
+        """Get all built-in metrics as a dictionary.
+
+        Returns:
+            Dictionary mapping metric names to functions.
+        """
+        return {m.value: m.func for m in cls}
+
+
+# Mapping from enum to function - populated after functions are defined
+_METRIC_FUNCTIONS: dict[MetricName, "Callable[..., float | list[float]]"] = {
+    MetricName.JSD: jensen_shannon_divergence,
+    MetricName.NED: normalized_edit_distance,
+    MetricName.SR: subsequence_retention,
+    MetricName.HD: entropy_delta,
+    MetricName.MSI: merge_split_index,
+}
+
+
+__all__ = [
+    "Metric",
+    "BatchMetric",
+    "MetricName",
+    "TokenBatch",
+    "TokenSequence",
+    "jensen_shannon_divergence",
+    "normalized_edit_distance",
+    "subsequence_retention",
+    "entropy_delta",
+    "merge_split_index",
+]
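Usage note (not part of the package diff): a hedged sketch of the metric entry points above. It assumes the bundled glitchlings._zoo_rust extension imports cleanly (metrics.py raises ImportError at import time otherwise), and the actual score values come from the Rust implementations.

# Sketch only: module path per the file list; scores depend on the Rust extension.
from glitchlings.attack.metrics import MetricName, normalized_edit_distance

original = ["the", "quick", "brown", "fox"]
corrupted = ["teh", "quick", "brwon", "fox"]

single_score = normalized_edit_distance(original, corrupted)      # float
batch_scores = normalized_edit_distance([original], [corrupted])  # list[float] of length 1

# MetricName.defaults() maps each built-in metric name to its dispatching function.
for name, metric in MetricName.defaults().items():
    print(name, metric(original, corrupted))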
glitchlings/attack/metrics_dispatch.py
@@ -0,0 +1,70 @@
+"""Pure metric dispatch functions.
+
+This module contains pure functions for dispatching metric computations.
+It does not import Rust FFI or perform any IO - it operates on already-
+resolved metric functions.
+
+Pure guarantees:
+- No import side effects beyond stdlib
+- No Rust FFI loading
+- Deterministic dispatch logic
+
+The impure Rust metric loading lives in metrics.py.
+"""
+
+from __future__ import annotations
+
+from typing import Sequence, TypeGuard
+
+TokenSequence = Sequence[str]
+TokenBatch = Sequence[TokenSequence]
+
+
+def is_batch(tokens: TokenSequence | TokenBatch) -> TypeGuard[TokenBatch]:
+    """Determine if tokens represent a batch of sequences.
+
+    An empty list is treated as an empty batch (returning True) so that
+    ``metric([], [])`` returns ``[]`` rather than ``0.0``. This matches
+    the behavior of :meth:`Attack.run` when processing empty transcripts.
+
+    Args:
+        tokens: Either a sequence of token strings or a batch of such sequences.
+
+    Returns:
+        True if tokens is a batch (list of lists), False if a single sequence.
+    """
+    if not tokens:
+        return True  # Empty list is an empty batch
+
+    first = tokens[0]
+    return isinstance(first, Sequence) and not isinstance(first, (str, bytes))
+
+
+def validate_batch_consistency(
+    original: TokenSequence | TokenBatch,
+    corrupted: TokenSequence | TokenBatch,
+    metric_name: str,
+) -> None:
+    """Validate that both inputs are consistently batched or single.
+
+    Args:
+        original: Original token sequence or batch.
+        corrupted: Corrupted token sequence or batch.
+        metric_name: Name of the metric (for error messages).
+
+    Raises:
+        TypeError: If one input is batched and the other isn't.
+    """
+    original_is_batch = is_batch(original)
+    corrupted_is_batch = is_batch(corrupted)
+
+    if original_is_batch != corrupted_is_batch:
+        raise TypeError(f"{metric_name} expects either both batch inputs or both single sequences")
+
+
+__all__ = [
+    "TokenBatch",
+    "TokenSequence",
+    "is_batch",
+    "validate_batch_consistency",
+]
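Usage note (not part of the package diff): the dispatch helpers above are pure Python, so their single-vs-batch behavior can be illustrated directly; the module path is assumed from the file list.

# Pure-Python illustration of the dispatch helpers above.
from glitchlings.attack.metrics_dispatch import is_batch, validate_batch_consistency

print(is_batch(["a", "b", "c"]))      # False: a single token sequence
print(is_batch([["a", "b"], ["c"]]))  # True: a batch of sequences
print(is_batch([]))                   # True: empty input counts as an empty batch

validate_batch_consistency(["a"], ["b"], "example")        # both single: no error
try:
    validate_batch_consistency([["a"]], ["b"], "example")  # mixed shapes
except TypeError as err:
    print(err)  # "example expects either both batch inputs or both single sequences"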
glitchlings/attack/tokenization.py
@@ -0,0 +1,227 @@
+from __future__ import annotations
+
+import importlib.util
+import zlib
+from typing import Any, Protocol, Sequence
+
+DEFAULT_TIKTOKEN_ENCODINGS = ("o200k_base", "cl100k_base")
+
+
+class Tokenizer(Protocol):
+    def encode(self, text: str) -> tuple[list[str], list[int]]: ...
+
+    def decode(self, tokens: Sequence[str]) -> str: ...
+
+
+class WhitespaceTokenizer:
+    def encode(self, text: str) -> tuple[list[str], list[int]]:
+        tokens = text.split()
+        # Synthetic IDs based on adler32 hash for stability
+        ids = [zlib.adler32(t.encode("utf-8")) & 0xFFFFFFFF for t in tokens]
+        return tokens, ids
+
+    def decode(self, tokens: Sequence[str]) -> str:
+        return " ".join(tokens)
+
+    def encode_batch(self, texts: Sequence[str]) -> list[tuple[list[str], list[int]]]:
+        return [self.encode(text) for text in texts]
+
+
+class TiktokenTokenizer:
+    def __init__(self, model_name: str):
+        import tiktoken
+
+        self.name = model_name
+        try:
+            self.enc = tiktoken.get_encoding(model_name)
+        except ValueError:
+            self.enc = tiktoken.encoding_for_model(model_name)
+
+    def encode(self, text: str) -> tuple[list[str], list[int]]:
+        ids = self.enc.encode(text)
+        tokens = [
+            self.enc.decode_single_token_bytes(i).decode("utf-8", errors="replace") for i in ids
+        ]
+        return tokens, ids
+
+    def decode(self, tokens: Sequence[str], sep: str = "") -> str:
+        return sep.join(tokens)
+
+    def encode_batch(self, texts: Sequence[str]) -> list[tuple[list[str], list[int]]]:
+        id_batches = [list(batch) for batch in self.enc.encode_batch(list(texts))]
+        token_batches: list[list[str]] = []
+        for ids in id_batches:
+            token_batches.append(
+                [
+                    self.enc.decode_single_token_bytes(i).decode("utf-8", errors="replace")
+                    for i in ids
+                ]
+            )
+        return list(zip(token_batches, id_batches))
+
+
+class HuggingFaceTokenizerWrapper:
+    def __init__(self, tokenizer_obj: Any, *, unknown_token: str = "[UNK]"):
+        self.tokenizer = tokenizer_obj
+        self.unknown_token = unknown_token
+
+    def encode(self, text: str) -> tuple[list[str], list[int]]:
+        # tokenizers.Tokenizer.encode returns an Encoding object
+        encoding = self.tokenizer.encode(text)
+        return encoding.tokens, encoding.ids
+
+    def decode(self, tokens: Sequence[str]) -> str:
+        # Use the tokenizer's decode method to properly handle model-specific
+        # artifacts (e.g., "##" for WordPiece, "Ġ" for BPE).
+        # Convert tokens to IDs first, then decode.
+        try:
+            token_ids = [self.tokenizer.token_to_id(token) for token in tokens]
+            # Filter out None values (tokens not in vocabulary)
+            valid_ids = [tid for tid in token_ids if tid is not None]
+            if valid_ids:
+                result: str = self.tokenizer.decode(valid_ids)
+                return result
+        except (AttributeError, TypeError):
+            pass
+        # Fallback: decode each token individually to handle artifacts properly
+        decoded_tokens = []
+        for token in tokens:
+            token_id = None
+            try:
+                token_id = self.tokenizer.token_to_id(token)
+            except (AttributeError, TypeError):
+                pass
+            if token_id is None:
+                decoded_tokens.append(self.unknown_token)
+            else:
+                # Decode the single token ID to properly handle artifacts
+                try:
+                    decoded = self.tokenizer.decode([token_id])
+                    decoded_tokens.append(decoded)
+                except (AttributeError, TypeError):
+                    # Last resort: strip common prefixes and use token as-is
+                    clean_token = token.lstrip("Ġ").lstrip("##").lstrip("▁")
+                    decoded_tokens.append(clean_token if clean_token else token)
+        return " ".join(decoded_tokens)
+
+    def encode_batch(self, texts: Sequence[str]) -> list[tuple[list[str], list[int]]]:
+        encodings = self.tokenizer.encode_batch(list(texts))
+        return [(encoding.tokens, encoding.ids) for encoding in encodings]
+
+
+def list_available_tokenizers() -> list[str]:
+    """List tokenizer names that can be resolved.
+
+    Returns a list of known tokenizer names including:
+    - Tiktoken encodings (if tiktoken is installed)
+    - A note about HuggingFace tokenizers (if tokenizers is installed)
+    - 'whitespace' (always available)
+
+    Returns:
+        List of available tokenizer names/descriptions.
+    """
+    available: list[str] = []
+
+    if importlib.util.find_spec("tiktoken"):
+        import tiktoken
+
+        # Add known tiktoken encodings
+        for encoding in DEFAULT_TIKTOKEN_ENCODINGS:
+            try:
+                tiktoken.get_encoding(encoding)
+                available.append(encoding)
+            except ValueError:
+                pass
+        # Add common model names
+        available.extend(["gpt-4", "gpt-4o", "gpt-3.5-turbo"])
+
+    if importlib.util.find_spec("tokenizers"):
+        available.append("<any HuggingFace tokenizer name>")
+
+    available.append("whitespace")
+    return available
+
+
+def resolve_tokenizer(tokenizer: str | Tokenizer | None) -> Tokenizer:
+    """Resolve a tokenizer specification to a Tokenizer instance.
+
+    Args:
+        tokenizer: One of:
+            - None: Use default tokenizer (tiktoken o200k_base, or whitespace)
+            - str: Tokenizer name (tiktoken encoding, model name, or HF tokenizer)
+            - Tokenizer: Pass through as-is
+
+    Returns:
+        A Tokenizer instance.
+
+    Raises:
+        ValueError: If string tokenizer cannot be resolved.
+    """
+    if tokenizer is None:
+        return _default_tokenizer()
+
+    if isinstance(tokenizer, str):
+        if importlib.util.find_spec("tiktoken"):
+            import tiktoken
+
+            try:
+                # Check if valid tiktoken encoding/model
+                try:
+                    tiktoken.get_encoding(tokenizer)
+                    return TiktokenTokenizer(tokenizer)
+                except ValueError:
+                    try:
+                        tiktoken.encoding_for_model(tokenizer)
+                        return TiktokenTokenizer(tokenizer)
+                    except (ValueError, KeyError):
+                        pass
+            except ImportError:
+                pass
+
+        if importlib.util.find_spec("tokenizers"):
+            from tokenizers import Tokenizer
+
+            try:
+                return HuggingFaceTokenizerWrapper(Tokenizer.from_pretrained(tokenizer))
+            except Exception:
+                pass
+
+        available = list_available_tokenizers()
+        raise ValueError(
+            f"Could not resolve tokenizer: {tokenizer!r}. Available: {', '.join(available)}"
+        )
+
+    # Check if it is a HuggingFace tokenizer object
+    if importlib.util.find_spec("tokenizers"):
+        from tokenizers import Tokenizer as HFTokenizer
+
+        if isinstance(tokenizer, HFTokenizer):
+            return HuggingFaceTokenizerWrapper(tokenizer)
+
+    return tokenizer
+
+
+def _default_tokenizer() -> Tokenizer:
+    """Select a modern, lightweight tokenizer with graceful fallbacks."""
+    if importlib.util.find_spec("tiktoken"):
+        import tiktoken
+
+        for encoding in DEFAULT_TIKTOKEN_ENCODINGS:
+            try:
+                tiktoken.get_encoding(encoding)
+                return TiktokenTokenizer(encoding)
+            except ValueError:
+                continue
+
+    return WhitespaceTokenizer()
+
+
+__all__ = [
+    "DEFAULT_TIKTOKEN_ENCODINGS",
+    "HuggingFaceTokenizerWrapper",
+    "TiktokenTokenizer",
+    "Tokenizer",
+    "WhitespaceTokenizer",
+    "list_available_tokenizers",
+    "resolve_tokenizer",
+]
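Usage note (not part of the package diff): a sketch of the resolution flow above, assuming the glitchlings.attack.tokenization module path from the file list. With tiktoken installed, resolve_tokenizer(None) prefers the first working entry of DEFAULT_TIKTOKEN_ENCODINGS; otherwise it falls back to WhitespaceTokenizer, and describe_tokenizer (from encode.py earlier in this diff) reports the resolved name.

# Sketch only: behavior depends on which optional backends are installed.
from glitchlings.attack.encode import describe_tokenizer
from glitchlings.attack.tokenization import WhitespaceTokenizer, resolve_tokenizer

tok = resolve_tokenizer(None)         # TiktokenTokenizer("o200k_base") if tiktoken is present,
                                      # otherwise WhitespaceTokenizer
print(describe_tokenizer(tok, None))  # "o200k_base" or "WhitespaceTokenizer"

tokens, ids = tok.encode("glitchlings corrupt text")
print(tokens[:5], ids[:5])

# Objects that already satisfy the Tokenizer protocol pass straight through.
assert isinstance(resolve_tokenizer(WhitespaceTokenizer()), WhitespaceTokenizer)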