glitchlings-0.2.5-cp312-cp312-win_amd64.whl → glitchlings-0.9.3-cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings/__init__.py +36 -17
- glitchlings/__main__.py +0 -1
- glitchlings/_zoo_rust/__init__.py +12 -0
- glitchlings/_zoo_rust.cp312-win_amd64.pyd +0 -0
- glitchlings/assets/__init__.py +180 -0
- glitchlings/assets/apostrofae_pairs.json +32 -0
- glitchlings/assets/ekkokin_homophones.json +2014 -0
- glitchlings/assets/hokey_assets.json +193 -0
- glitchlings/assets/lexemes/academic.json +1049 -0
- glitchlings/assets/lexemes/colors.json +1333 -0
- glitchlings/assets/lexemes/corporate.json +716 -0
- glitchlings/assets/lexemes/cyberpunk.json +22 -0
- glitchlings/assets/lexemes/lovecraftian.json +23 -0
- glitchlings/assets/lexemes/synonyms.json +3354 -0
- glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
- glitchlings/assets/pipeline_assets.json +29 -0
- glitchlings/attack/__init__.py +53 -0
- glitchlings/attack/compose.py +299 -0
- glitchlings/attack/core.py +465 -0
- glitchlings/attack/encode.py +114 -0
- glitchlings/attack/metrics.py +104 -0
- glitchlings/attack/metrics_dispatch.py +70 -0
- glitchlings/attack/tokenization.py +157 -0
- glitchlings/auggie.py +283 -0
- glitchlings/compat/__init__.py +9 -0
- glitchlings/compat/loaders.py +355 -0
- glitchlings/compat/types.py +41 -0
- glitchlings/conf/__init__.py +41 -0
- glitchlings/conf/loaders.py +331 -0
- glitchlings/conf/schema.py +156 -0
- glitchlings/conf/types.py +72 -0
- glitchlings/config.toml +2 -0
- glitchlings/constants.py +59 -0
- glitchlings/dev/__init__.py +3 -0
- glitchlings/dev/docs.py +45 -0
- glitchlings/dlc/__init__.py +17 -3
- glitchlings/dlc/_shared.py +296 -0
- glitchlings/dlc/gutenberg.py +400 -0
- glitchlings/dlc/huggingface.py +37 -65
- glitchlings/dlc/prime.py +55 -114
- glitchlings/dlc/pytorch.py +98 -0
- glitchlings/dlc/pytorch_lightning.py +173 -0
- glitchlings/internal/__init__.py +16 -0
- glitchlings/internal/rust.py +159 -0
- glitchlings/internal/rust_ffi.py +432 -0
- glitchlings/main.py +123 -32
- glitchlings/runtime_config.py +24 -0
- glitchlings/util/__init__.py +29 -176
- glitchlings/util/adapters.py +65 -0
- glitchlings/util/keyboards.py +311 -0
- glitchlings/util/transcripts.py +108 -0
- glitchlings/zoo/__init__.py +47 -24
- glitchlings/zoo/assets/__init__.py +29 -0
- glitchlings/zoo/core.py +301 -167
- glitchlings/zoo/core_execution.py +98 -0
- glitchlings/zoo/core_planning.py +451 -0
- glitchlings/zoo/corrupt_dispatch.py +295 -0
- glitchlings/zoo/ekkokin.py +118 -0
- glitchlings/zoo/hokey.py +137 -0
- glitchlings/zoo/jargoyle.py +179 -274
- glitchlings/zoo/mim1c.py +106 -68
- glitchlings/zoo/pedant/__init__.py +107 -0
- glitchlings/zoo/pedant/core.py +105 -0
- glitchlings/zoo/pedant/forms.py +74 -0
- glitchlings/zoo/pedant/stones.py +74 -0
- glitchlings/zoo/redactyl.py +44 -175
- glitchlings/zoo/rng.py +259 -0
- glitchlings/zoo/rushmore.py +359 -116
- glitchlings/zoo/scannequin.py +18 -125
- glitchlings/zoo/transforms.py +386 -0
- glitchlings/zoo/typogre.py +76 -162
- glitchlings/zoo/validation.py +477 -0
- glitchlings/zoo/zeedub.py +33 -86
- glitchlings-0.9.3.dist-info/METADATA +334 -0
- glitchlings-0.9.3.dist-info/RECORD +80 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/entry_points.txt +1 -0
- glitchlings/zoo/_ocr_confusions.py +0 -34
- glitchlings/zoo/_rate.py +0 -21
- glitchlings/zoo/reduple.py +0 -169
- glitchlings-0.2.5.dist-info/METADATA +0 -490
- glitchlings-0.2.5.dist-info/RECORD +0 -27
- glitchlings/{zoo → assets}/ocr_confusions.tsv +0 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/WHEEL +0 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/top_level.txt +0 -0
glitchlings/attack/core.py
@@ -0,0 +1,465 @@
from __future__ import annotations

from collections.abc import Iterable, Mapping, Sequence
from dataclasses import dataclass
from typing import Any, TypeGuard, cast

from ..conf import DEFAULT_ATTACK_SEED
from ..util.adapters import coerce_gaggle
from ..util.transcripts import Transcript, TranscriptTarget, is_transcript
from ..zoo.core import Glitchling
from .compose import (
    build_batch_result,
    build_empty_result,
    build_single_result,
    extract_transcript_contents,
)
from .encode import describe_tokenizer, encode_batch
from .metrics import (
    Metric,
    jensen_shannon_divergence,
    normalized_edit_distance,
    subsequence_retention,
)
from .tokenization import Tokenizer, resolve_tokenizer


def _is_string_batch(value: Any) -> TypeGuard[Sequence[str]]:
    if isinstance(value, (str, bytes)):
        return False
    if not isinstance(value, Sequence):
        return False
    return all(isinstance(item, str) for item in value)


@dataclass
class AttackResult:
    original: str | Transcript | Sequence[str]
    corrupted: str | Transcript | Sequence[str]
    input_tokens: list[str] | list[list[str]]
    output_tokens: list[str] | list[list[str]]
    input_token_ids: list[int] | list[list[int]]
    output_token_ids: list[int] | list[list[int]]
    tokenizer_info: str
    metrics: dict[str, float | list[float]]

    def _tokens_are_batched(self) -> bool:
        tokens = self.input_tokens
        if tokens and isinstance(tokens[0], list):
            return True
        return isinstance(self.original, list) or isinstance(self.corrupted, list)

    def _token_batches(self) -> tuple[list[list[str]], list[list[str]]]:
        if self._tokens_are_batched():
            return (
                cast(list[list[str]], self.input_tokens),
                cast(list[list[str]], self.output_tokens),
            )

        return (
            [cast(list[str], self.input_tokens)],
            [cast(list[str], self.output_tokens)],
        )

    def _token_counts(self) -> tuple[list[int], list[int]]:
        inputs, outputs = self._token_batches()
        return [len(tokens) for tokens in inputs], [len(tokens) for tokens in outputs]

    @staticmethod
    def _format_metric_value(value: float | list[float]) -> str:
        if isinstance(value, list):
            if not value:
                return "[]"
            if len(value) <= 4:
                rendered = ", ".join(f"{entry:.3f}" for entry in value)
                return f"[{rendered}]"
            total = sum(value)
            minimum = min(value)
            maximum = max(value)
            mean = total / len(value)
            return f"avg={mean:.3f} min={minimum:.3f} max={maximum:.3f}"

        return f"{value:.3f}"

    @staticmethod
    def _format_token(token: str, *, max_length: int) -> str:
        clean = token.replace("\n", "\\n")
        if len(clean) > max_length:
            return clean[: max_length - 3] + "..."
        return clean

    def to_report(self) -> dict[str, object]:
        input_counts, output_counts = self._token_counts()
        return {
            "tokenizer": self.tokenizer_info,
            "original": self.original,
            "corrupted": self.corrupted,
            "input_tokens": self.input_tokens,
            "output_tokens": self.output_tokens,
            "input_token_ids": self.input_token_ids,
            "output_token_ids": self.output_token_ids,
            "token_counts": {
                "input": {"per_sample": input_counts, "total": sum(input_counts)},
                "output": {"per_sample": output_counts, "total": sum(output_counts)},
            },
            "metrics": self.metrics,
        }

    def summary(self, *, max_rows: int = 8, max_token_length: int = 24) -> str:
        input_batches, output_batches = self._token_batches()
        input_counts, output_counts = self._token_counts()
        is_batch = self._tokens_are_batched()

        lines: list[str] = [f"Tokenizer: {self.tokenizer_info}"]
        if is_batch:
            lines.append(f"Samples: {len(input_batches)}")

        lines.append("Token counts:")
        for index, (input_count, output_count) in enumerate(
            zip(input_counts, output_counts), start=1
        ):
            prefix = f"#{index} " if is_batch else ""
            delta = output_count - input_count
            lines.append(f" {prefix}{input_count} -> {output_count} ({delta:+d})")
            if index >= max_rows and len(input_batches) > max_rows:
                remaining = len(input_batches) - max_rows
                lines.append(f" ... {remaining} more samples")
                break

        lines.append("Metrics:")
        for name, value in self.metrics.items():
            lines.append(f" {name}: {self._format_metric_value(value)}")

        if input_batches:
            focus_index = 0
            if is_batch and len(input_batches) > 1:
                lines.append("Token drift (first sample):")
            else:
                lines.append("Token drift:")
            input_tokens = input_batches[focus_index]
            output_tokens = output_batches[focus_index]
            rows = max(len(input_tokens), len(output_tokens))
            display_rows = min(rows, max_rows)
            for idx in range(display_rows):
                left = (
                    self._format_token(input_tokens[idx], max_length=max_token_length)
                    if idx < len(input_tokens)
                    else ""
                )
                right = (
                    self._format_token(output_tokens[idx], max_length=max_token_length)
                    if idx < len(output_tokens)
                    else ""
                )
                if idx >= len(input_tokens):
                    marker = "+"
                elif idx >= len(output_tokens):
                    marker = "-"
                elif input_tokens[idx] == output_tokens[idx]:
                    marker = "="
                else:
                    marker = "!"
                lines.append(f" {idx + 1:>3}{marker} {left} -> {right}")
            if rows > display_rows:
                lines.append(f" ... {rows - display_rows} more tokens")
        else:
            lines.append("Token drift: (empty input)")

        return "\n".join(lines)


@dataclass
class MultiAttackResult:
    results: dict[str, AttackResult]
    order: list[str]

    @property
    def primary(self) -> AttackResult:
        return self.results[self.order[0]]

    def to_report(self) -> dict[str, object]:
        return {
            "tokenizers": list(self.order),
            "results": {name: self.results[name].to_report() for name in self.order},
        }

    def summary(self, *, max_rows: int = 6, max_token_length: int = 24) -> str:
        lines: list[str] = []
        for index, name in enumerate(self.order, start=1):
            lines.append(f"{index}. {name}")
            nested = self.results[name].summary(
                max_rows=max_rows,
                max_token_length=max_token_length,
            )
            lines.extend(f" {line}" for line in nested.splitlines())
        return "\n".join(lines)


class Attack:
    """Orchestrator for applying glitchling corruptions and measuring impact.

    Attack is a thin orchestrator that coordinates:
    - Glitchling invocation (impure: may use Rust FFI)
    - Tokenization (impure: resolves tokenizers)
    - Metric computation (impure: calls Rust metrics)
    - Result composition (delegated to pure compose.py helpers)

    The class validates inputs at construction time (boundary layer)
    and delegates pure operations to compose.py and encode.py modules.
    """

    def __init__(
        self,
        glitchlings: Glitchling | str | Iterable[str | Glitchling],
        tokenizer: str | Tokenizer | None = None,
        metrics: Mapping[str, Metric] | None = None,
        *,
        seed: int | None = None,
        transcript_target: TranscriptTarget | None = None,
    ) -> None:
        """Initialize an Attack.

        Args:
            glitchlings: A single Glitchling (including Gaggle), a string specification
                (e.g. 'Typogre(rate=0.05)'), or an iterable of glitchlings/specs.
            tokenizer: Tokenizer name (e.g. 'cl100k_base', 'bert-base-uncased'),
                Tokenizer object, or None (defaults to whitespace).
            metrics: Dictionary of metric functions. If None, defaults are used.
            seed: Optional master seed used when building a Gaggle. When a Gaggle
                instance is provided directly, the seed is applied to that instance
                to keep runs deterministic. Instances are cloned before seeding to
                avoid mutating caller-owned objects.
            transcript_target: Which transcript turns to corrupt. When None (default),
                uses the Gaggle default ("last"). Accepts:
                - "last": corrupt only the last turn (default)
                - "all": corrupt all turns
                - "assistant": corrupt only assistant turns
                - "user": corrupt only user turns
                - int: corrupt a specific index (negative indexing supported)
                - Sequence[int]: corrupt specific indices
        """
        # Boundary validation and resolution (impure)
        gaggle_seed = seed if seed is not None else DEFAULT_ATTACK_SEED
        cloned_glitchlings = self._clone_glitchling_specs(glitchlings)
        self.glitchlings = coerce_gaggle(
            cloned_glitchlings,
            seed=gaggle_seed,
            apply_seed_to_existing=True,
            transcript_target=transcript_target,
        )

        # Impure tokenizer resolution
        self.tokenizer = resolve_tokenizer(tokenizer)
        self.tokenizer_info = describe_tokenizer(self.tokenizer, tokenizer)

        # Metrics setup
        if metrics is None:
            self.metrics: dict[str, Metric] = {
                "jensen_shannon_divergence": jensen_shannon_divergence,
                "normalized_edit_distance": normalized_edit_distance,
                "subsequence_retention": subsequence_retention,
            }
        else:
            self.metrics = dict(metrics)

    @staticmethod
    def _clone_glitchling_specs(
        glitchlings: Glitchling | str | Iterable[str | Glitchling],
    ) -> Glitchling | str | list[str | Glitchling]:
        """Return cloned glitchling specs so Attack ownership never mutates inputs."""
        if isinstance(glitchlings, Glitchling):
            return glitchlings.clone()

        if isinstance(glitchlings, str):
            return glitchlings

        if isinstance(glitchlings, Iterable):
            cloned_specs: list[str | Glitchling] = []
            for entry in glitchlings:
                if isinstance(entry, Glitchling):
                    cloned_specs.append(entry.clone())
                else:
                    cloned_specs.append(entry)
            return cloned_specs

        return glitchlings

    def run(self, text: str | Transcript | Sequence[str]) -> AttackResult:
        """Apply corruptions and calculate metrics.

        Supports single strings, batches of strings, and chat transcripts. For
        batched inputs (transcripts or lists of strings) metrics are computed
        per entry and returned as lists.

        Args:
            text: Input text, transcript, or batch of plain strings to corrupt.

        Returns:
            AttackResult containing original, corrupted, tokens, and metrics.
        """
        if _is_string_batch(text):
            original_batch = list(text)
            corrupted_batch: list[str] = []
            for entry in original_batch:
                corrupted = self.glitchlings.corrupt(entry)
                if not isinstance(corrupted, str):
                    raise TypeError("Attack expected string output when given a batch of strings.")
                corrupted_batch.append(corrupted)

            return self._compose_result(
                original_container=original_batch,
                corrupted_container=corrupted_batch,
                original_contents=original_batch,
                corrupted_contents=corrupted_batch,
                is_batch=True,
            )

        if is_transcript(text):
            original_transcript = text
            corrupted_transcript = self.glitchlings.corrupt(original_transcript)
            if not is_transcript(corrupted_transcript):
                raise ValueError("Attack expected output type to mirror input type.")

            original_contents = extract_transcript_contents(original_transcript)
            corrupted_contents = extract_transcript_contents(corrupted_transcript)

            return self._compose_result(
                original_container=original_transcript,
                corrupted_container=corrupted_transcript,
                original_contents=original_contents,
                corrupted_contents=corrupted_contents,
                is_batch=True,
            )

        if not isinstance(text, str):
            message = (
                "Attack.run expected string, transcript, or list of strings, "
                f"got {type(text).__name__}"
            )
            raise TypeError(message)

        corrupted = self.glitchlings.corrupt(text)
        if not isinstance(corrupted, str):
            raise TypeError("Attack expected output type to mirror input type.")

        return self._compose_result(
            original_container=text,
            corrupted_container=corrupted,
            original_contents=[text],
            corrupted_contents=[corrupted],
            is_batch=False,
        )

    def _compose_result(
        self,
        *,
        original_container: str | Transcript | Sequence[str],
        corrupted_container: str | Transcript | Sequence[str],
        original_contents: list[str],
        corrupted_contents: list[str],
        is_batch: bool,
    ) -> AttackResult:
        if len(original_contents) != len(corrupted_contents):
            raise ValueError("Inputs and outputs must contain the same number of entries.")

        if not original_contents:
            fields = build_empty_result(
                original_container,
                corrupted_container,
                self.tokenizer_info,
                list(self.metrics.keys()),
            )
            return AttackResult(**fields)  # type: ignore[arg-type]

        batched_input_tokens, batched_input_token_ids = encode_batch(
            self.tokenizer, original_contents
        )
        batched_output_tokens, batched_output_token_ids = encode_batch(
            self.tokenizer, corrupted_contents
        )

        metric_inputs: list[str] | list[list[str]]
        metric_outputs: list[str] | list[list[str]]
        if is_batch:
            metric_inputs = batched_input_tokens
            metric_outputs = batched_output_tokens
        else:
            metric_inputs = batched_input_tokens[0]
            metric_outputs = batched_output_tokens[0]

        computed_metrics: dict[str, float | list[float]] = {}
        for name, metric_fn in self.metrics.items():
            computed_metrics[name] = metric_fn(metric_inputs, metric_outputs)

        if not is_batch:
            fields = build_single_result(
                original=cast(str, original_container),
                corrupted=cast(str, corrupted_container),
                input_tokens=batched_input_tokens[0],
                input_token_ids=batched_input_token_ids[0],
                output_tokens=batched_output_tokens[0],
                output_token_ids=batched_output_token_ids[0],
                tokenizer_info=self.tokenizer_info,
                metrics=computed_metrics,
            )
            return AttackResult(**fields)  # type: ignore[arg-type]

        fields = build_batch_result(
            original=original_container,
            corrupted=corrupted_container,
            input_tokens=batched_input_tokens,
            input_token_ids=batched_input_token_ids,
            output_tokens=batched_output_tokens,
            output_token_ids=batched_output_token_ids,
            tokenizer_info=self.tokenizer_info,
            metrics=computed_metrics,
        )
        return AttackResult(**fields)  # type: ignore[arg-type]

    def compare(
        self,
        text: str | Transcript | Sequence[str],
        *,
        tokenizers: Sequence[str | Tokenizer],
        include_self: bool = True,
    ) -> MultiAttackResult:
        """Run the attack across multiple tokenizers for side-by-side comparison."""
        if not tokenizers and not include_self:
            raise ValueError("At least one tokenizer must be provided for comparison.")

        results: dict[str, AttackResult] = {}
        order: list[str] = []
        seen: set[str] = set()

        def record(result: AttackResult) -> None:
            if result.tokenizer_info in seen:
                return
            seen.add(result.tokenizer_info)
            order.append(result.tokenizer_info)
            results[result.tokenizer_info] = result

        runner_seed = self.glitchlings.seed
        transcript_target = getattr(self.glitchlings, "transcript_target", None)

        if include_self:
            baseline = Attack(
                self.glitchlings,
                tokenizer=self.tokenizer,
                metrics=self.metrics,
                seed=runner_seed,
                transcript_target=transcript_target,
            ).run(text)
            record(baseline)

        for spec in tokenizers:
            resolved_tokenizer = resolve_tokenizer(spec)
            comparator = Attack(
                self.glitchlings,
                tokenizer=resolved_tokenizer,
                metrics=self.metrics,
                seed=runner_seed,
                transcript_target=transcript_target,
            )
            record(comparator.run(text))

        return MultiAttackResult(results=results, order=order)
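For orientation, here is a minimal usage sketch of the new Attack API, written against the definitions in the hunk above rather than any documented examples. The glitchling spec string 'Typogre(rate=0.05)' and tokenizer names are those given in the __init__ docstring; the sample sentence and seed value are arbitrary, and running it requires the installed 0.9.3 package.

# Sketch: construct an Attack from a spec string, run it, and inspect results.
from glitchlings.attack.core import Attack

attack = Attack("Typogre(rate=0.05)", tokenizer="cl100k_base", seed=42)
result = attack.run("The quick brown fox jumps over the lazy dog.")

print(result.summary())                      # tokenizer, token counts, metrics, token drift
print(result.to_report()["token_counts"])    # JSON-serializable per-sample/total counts

# Side-by-side tokenizer comparison; include_self keeps the cl100k_base baseline.
multi = attack.compare(
    "The quick brown fox jumps over the lazy dog.",
    tokenizers=["bert-base-uncased"],
)
print(multi.primary.tokenizer_info)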
glitchlings/attack/encode.py
@@ -0,0 +1,114 @@
"""Pure encoding utilities for tokenization.

This module contains pure functions for encoding text using tokenizers.
The functions here do not resolve tokenizers or perform IO - they operate
on already-resolved Tokenizer instances.

Pure guarantees:
- No import side effects beyond stdlib
- No file IO or network calls
- No environment variable access
- Deterministic output for given inputs

The impure tokenizer resolution lives in tokenization.py.
"""

from __future__ import annotations

from typing import TYPE_CHECKING, Sequence

if TYPE_CHECKING:  # pragma: no cover - typing only
    from .tokenization import Tokenizer


def encode_single(
    tokenizer: "Tokenizer",
    text: str,
) -> tuple[list[str], list[int]]:
    """Encode a single text string into tokens and IDs.

    This is a thin wrapper that ensures list output types.

    Args:
        tokenizer: A resolved tokenizer instance.
        text: Text to encode.

    Returns:
        Tuple of (tokens, token_ids) as lists.
    """
    tokens, ids = tokenizer.encode(text)
    return list(tokens), list(ids)


def encode_batch(
    tokenizer: "Tokenizer",
    texts: Sequence[str],
) -> tuple[list[list[str]], list[list[int]]]:
    """Encode multiple texts into batched tokens and IDs.

    Attempts to use the tokenizer's batch_encode method if available,
    otherwise falls back to per-item encoding.

    Args:
        tokenizer: A resolved tokenizer instance.
        texts: Sequence of texts to encode.

    Returns:
        Tuple of (token_batches, id_batches) as nested lists.
    """
    # Try batch encoding if available
    batch_encode = getattr(tokenizer, "encode_batch", None)
    if callable(batch_encode):
        encoded = batch_encode(texts)
        token_batches: list[list[str]] = []
        id_batches: list[list[int]] = []
        for tokens, ids in encoded:
            token_batches.append(list(tokens))
            id_batches.append(list(ids))
        return token_batches, id_batches

    # Fallback: encode each text individually
    token_batches_fallback: list[list[str]] = []
    id_batches_fallback: list[list[int]] = []
    for entry in texts:
        tokens, ids = encode_single(tokenizer, entry)
        token_batches_fallback.append(tokens)
        id_batches_fallback.append(ids)
    return token_batches_fallback, id_batches_fallback


def describe_tokenizer(
    tokenizer: "Tokenizer",
    raw_spec: "str | Tokenizer | None",
) -> str:
    """Generate a human-readable description of a tokenizer.

    Args:
        tokenizer: The resolved tokenizer instance.
        raw_spec: The original specification used to create/resolve the tokenizer.

    Returns:
        A descriptive string identifying the tokenizer.
    """
    # If the raw spec was a string, use it directly
    if isinstance(raw_spec, str):
        return raw_spec

    # Try to get a name attribute
    name = getattr(tokenizer, "name", None)
    if isinstance(name, str) and name:
        return name

    # For None spec, use the class name
    if raw_spec is None:
        return tokenizer.__class__.__name__

    # Fallback to string representation
    return str(raw_spec)


__all__ = [
    "describe_tokenizer",
    "encode_batch",
    "encode_single",
]
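As a quick illustration of the duck-typed contract these helpers rely on: any object whose encode(text) returns a (tokens, ids) pair works. The WhitespaceTokenizer below is a hypothetical stand-in (not part of the package); since it defines no encode_batch method, it exercises the per-item fallback path in encode_batch.

# Sketch assuming a hypothetical whitespace tokenizer that satisfies the
# encode(text) -> (tokens, ids) contract used by encode_single/encode_batch.
from glitchlings.attack.encode import encode_batch, encode_single

class WhitespaceTokenizer:
    name = "whitespace"  # picked up by describe_tokenizer via the name attribute

    def encode(self, text: str) -> tuple[list[str], list[int]]:
        tokens = text.split()
        return tokens, list(range(len(tokens)))

tokens, ids = encode_single(WhitespaceTokenizer(), "a b c")
# tokens == ["a", "b", "c"], ids == [0, 1, 2]

token_batches, id_batches = encode_batch(WhitespaceTokenizer(), ["a b c", "d e"])
# No encode_batch attribute on the tokenizer, so the fallback loop runs:
# token_batches == [["a", "b", "c"], ["d", "e"]]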
glitchlings/attack/metrics.py
@@ -0,0 +1,104 @@
from __future__ import annotations

import importlib
from typing import Any, Protocol, cast

from .metrics_dispatch import TokenBatch, TokenSequence, is_batch, validate_batch_consistency


class Metric(Protocol):
    def __call__(
        self,
        original_tokens: TokenSequence | TokenBatch,
        corrupted_tokens: TokenSequence | TokenBatch,
    ) -> float | list[float]: ...


class BatchMetric(Protocol):
    def __call__(self, inputs: TokenBatch, outputs: TokenBatch) -> list[float]: ...


try:
    _rust: Any = importlib.import_module("glitchlings._zoo_rust")
except ModuleNotFoundError as exc:  # pragma: no cover - runtime guard
    raise ImportError(
        "Could not import compiled Rust extension. "
        "Please ensure the project is installed with the Rust extension built."
    ) from exc

_single_jsd = cast(Metric, getattr(_rust, "jensen_shannon_divergence"))
_single_ned = cast(Metric, getattr(_rust, "normalized_edit_distance"))
_single_sr = cast(Metric, getattr(_rust, "subsequence_retention"))
_batch_jsd = cast(BatchMetric, getattr(_rust, "batch_jensen_shannon_divergence"))
_batch_ned = cast(BatchMetric, getattr(_rust, "batch_normalized_edit_distance"))
_batch_sr = cast(BatchMetric, getattr(_rust, "batch_subsequence_retention"))


def _dispatch_metric(
    original: TokenSequence | TokenBatch,
    corrupted: TokenSequence | TokenBatch,
    *,
    single: Metric,
    batch: BatchMetric,
    name: str,
) -> float | list[float]:
    """Dispatch metric computation to single or batch implementation.

    Uses the pure is_batch function to determine which implementation to call.
    """
    validate_batch_consistency(original, corrupted, name)

    if is_batch(original):
        return batch(original, corrupted)

    return single(original, corrupted)


def jensen_shannon_divergence(
    original_tokens: TokenSequence | TokenBatch,
    corrupted_tokens: TokenSequence | TokenBatch,
) -> float | list[float]:
    return _dispatch_metric(
        original_tokens,
        corrupted_tokens,
        single=_single_jsd,
        batch=_batch_jsd,
        name="jensen_shannon_divergence",
    )


def normalized_edit_distance(
    original_tokens: TokenSequence | TokenBatch,
    corrupted_tokens: TokenSequence | TokenBatch,
) -> float | list[float]:
    return _dispatch_metric(
        original_tokens,
        corrupted_tokens,
        single=_single_ned,
        batch=_batch_ned,
        name="normalized_edit_distance",
    )


def subsequence_retention(
    original_tokens: TokenSequence | TokenBatch,
    corrupted_tokens: TokenSequence | TokenBatch,
) -> float | list[float]:
    return _dispatch_metric(
        original_tokens,
        corrupted_tokens,
        single=_single_sr,
        batch=_batch_sr,
        name="subsequence_retention",
    )


__all__ = [
    "Metric",
    "BatchMetric",
    "TokenBatch",
    "TokenSequence",
    "jensen_shannon_divergence",
    "normalized_edit_distance",
    "subsequence_retention",
]
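A sketch of the single/batch dispatch contract above: per the way attack/core.py calls these metrics, a flat token list yields a single float while nested lists yield one float per sample (routed to the batch_* Rust functions). Importing the module requires the compiled glitchlings._zoo_rust extension; the token lists here are arbitrary examples.

# Sketch: same metric function handles both single and batched inputs.
from glitchlings.attack.metrics import normalized_edit_distance

score = normalized_edit_distance(["the", "cat", "sat"], ["the", "bat", "sat"])
# -> float for one pair of token sequences

scores = normalized_edit_distance(
    [["the", "cat"], ["a", "dog"]],
    [["the", "bat"], ["a", "d0g"]],
)
# -> list[float], one entry per sample, via the batch implementation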