glitchlings 0.2.5__cp312-cp312-win_amd64.whl → 0.9.3__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. glitchlings/__init__.py +36 -17
  2. glitchlings/__main__.py +0 -1
  3. glitchlings/_zoo_rust/__init__.py +12 -0
  4. glitchlings/_zoo_rust.cp312-win_amd64.pyd +0 -0
  5. glitchlings/assets/__init__.py +180 -0
  6. glitchlings/assets/apostrofae_pairs.json +32 -0
  7. glitchlings/assets/ekkokin_homophones.json +2014 -0
  8. glitchlings/assets/hokey_assets.json +193 -0
  9. glitchlings/assets/lexemes/academic.json +1049 -0
  10. glitchlings/assets/lexemes/colors.json +1333 -0
  11. glitchlings/assets/lexemes/corporate.json +716 -0
  12. glitchlings/assets/lexemes/cyberpunk.json +22 -0
  13. glitchlings/assets/lexemes/lovecraftian.json +23 -0
  14. glitchlings/assets/lexemes/synonyms.json +3354 -0
  15. glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
  16. glitchlings/assets/pipeline_assets.json +29 -0
  17. glitchlings/attack/__init__.py +53 -0
  18. glitchlings/attack/compose.py +299 -0
  19. glitchlings/attack/core.py +465 -0
  20. glitchlings/attack/encode.py +114 -0
  21. glitchlings/attack/metrics.py +104 -0
  22. glitchlings/attack/metrics_dispatch.py +70 -0
  23. glitchlings/attack/tokenization.py +157 -0
  24. glitchlings/auggie.py +283 -0
  25. glitchlings/compat/__init__.py +9 -0
  26. glitchlings/compat/loaders.py +355 -0
  27. glitchlings/compat/types.py +41 -0
  28. glitchlings/conf/__init__.py +41 -0
  29. glitchlings/conf/loaders.py +331 -0
  30. glitchlings/conf/schema.py +156 -0
  31. glitchlings/conf/types.py +72 -0
  32. glitchlings/config.toml +2 -0
  33. glitchlings/constants.py +59 -0
  34. glitchlings/dev/__init__.py +3 -0
  35. glitchlings/dev/docs.py +45 -0
  36. glitchlings/dlc/__init__.py +17 -3
  37. glitchlings/dlc/_shared.py +296 -0
  38. glitchlings/dlc/gutenberg.py +400 -0
  39. glitchlings/dlc/huggingface.py +37 -65
  40. glitchlings/dlc/prime.py +55 -114
  41. glitchlings/dlc/pytorch.py +98 -0
  42. glitchlings/dlc/pytorch_lightning.py +173 -0
  43. glitchlings/internal/__init__.py +16 -0
  44. glitchlings/internal/rust.py +159 -0
  45. glitchlings/internal/rust_ffi.py +432 -0
  46. glitchlings/main.py +123 -32
  47. glitchlings/runtime_config.py +24 -0
  48. glitchlings/util/__init__.py +29 -176
  49. glitchlings/util/adapters.py +65 -0
  50. glitchlings/util/keyboards.py +311 -0
  51. glitchlings/util/transcripts.py +108 -0
  52. glitchlings/zoo/__init__.py +47 -24
  53. glitchlings/zoo/assets/__init__.py +29 -0
  54. glitchlings/zoo/core.py +301 -167
  55. glitchlings/zoo/core_execution.py +98 -0
  56. glitchlings/zoo/core_planning.py +451 -0
  57. glitchlings/zoo/corrupt_dispatch.py +295 -0
  58. glitchlings/zoo/ekkokin.py +118 -0
  59. glitchlings/zoo/hokey.py +137 -0
  60. glitchlings/zoo/jargoyle.py +179 -274
  61. glitchlings/zoo/mim1c.py +106 -68
  62. glitchlings/zoo/pedant/__init__.py +107 -0
  63. glitchlings/zoo/pedant/core.py +105 -0
  64. glitchlings/zoo/pedant/forms.py +74 -0
  65. glitchlings/zoo/pedant/stones.py +74 -0
  66. glitchlings/zoo/redactyl.py +44 -175
  67. glitchlings/zoo/rng.py +259 -0
  68. glitchlings/zoo/rushmore.py +359 -116
  69. glitchlings/zoo/scannequin.py +18 -125
  70. glitchlings/zoo/transforms.py +386 -0
  71. glitchlings/zoo/typogre.py +76 -162
  72. glitchlings/zoo/validation.py +477 -0
  73. glitchlings/zoo/zeedub.py +33 -86
  74. glitchlings-0.9.3.dist-info/METADATA +334 -0
  75. glitchlings-0.9.3.dist-info/RECORD +80 -0
  76. {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/entry_points.txt +1 -0
  77. glitchlings/zoo/_ocr_confusions.py +0 -34
  78. glitchlings/zoo/_rate.py +0 -21
  79. glitchlings/zoo/reduple.py +0 -169
  80. glitchlings-0.2.5.dist-info/METADATA +0 -490
  81. glitchlings-0.2.5.dist-info/RECORD +0 -27
  82. /glitchlings/{zoo → assets}/ocr_confusions.tsv +0 -0
  83. {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/WHEEL +0 -0
  84. {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/licenses/LICENSE +0 -0
  85. {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,465 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Iterable, Mapping, Sequence
4
+ from dataclasses import dataclass
5
+ from typing import Any, TypeGuard, cast
6
+
7
+ from ..conf import DEFAULT_ATTACK_SEED
8
+ from ..util.adapters import coerce_gaggle
9
+ from ..util.transcripts import Transcript, TranscriptTarget, is_transcript
10
+ from ..zoo.core import Glitchling
11
+ from .compose import (
12
+ build_batch_result,
13
+ build_empty_result,
14
+ build_single_result,
15
+ extract_transcript_contents,
16
+ )
17
+ from .encode import describe_tokenizer, encode_batch
18
+ from .metrics import (
19
+ Metric,
20
+ jensen_shannon_divergence,
21
+ normalized_edit_distance,
22
+ subsequence_retention,
23
+ )
24
+ from .tokenization import Tokenizer, resolve_tokenizer
25
+
26
+
27
+ def _is_string_batch(value: Any) -> TypeGuard[Sequence[str]]:
28
+ if isinstance(value, (str, bytes)):
29
+ return False
30
+ if not isinstance(value, Sequence):
31
+ return False
32
+ return all(isinstance(item, str) for item in value)
33
+
34
+
35
@dataclass
class AttackResult:
    """Outcome of one attack run: the texts, their tokenizations and the metrics.

    The token fields are either flat (one sample) or nested (one inner list
    per sample); the helper methods below normalise both shapes so reporting
    code does not need to care which one it received.
    """

    # Input exactly as handed to the attack (string, transcript, or string batch).
    original: str | Transcript | Sequence[str]
    # Corrupted counterpart of ``original``.
    corrupted: str | Transcript | Sequence[str]
    # Token strings for the original / corrupted text(s).
    input_tokens: list[str] | list[list[str]]
    output_tokens: list[str] | list[list[str]]
    # Token ids for the original / corrupted text(s).
    input_token_ids: list[int] | list[list[int]]
    output_token_ids: list[int] | list[list[int]]
    # Human-readable label for the tokenizer that produced the tokens.
    tokenizer_info: str
    # Metric name -> single value, or one value per sample.
    metrics: dict[str, float | list[float]]

    def _tokens_are_batched(self) -> bool:
        """Return True when the token fields hold one list per sample."""
        tokens = self.input_tokens
        if tokens and isinstance(tokens[0], list):
            return True
        # An empty token list carries no shape information, so fall back to
        # the shape of the text containers.
        return isinstance(self.original, list) or isinstance(self.corrupted, list)

    def _token_batches(self) -> tuple[list[list[str]], list[list[str]]]:
        """Return (input, output) tokens as lists of per-sample token lists."""
        if self._tokens_are_batched():
            return (
                cast(list[list[str]], self.input_tokens),
                cast(list[list[str]], self.output_tokens),
            )

        # Single sample: wrap it so callers can always iterate over samples.
        return (
            [cast(list[str], self.input_tokens)],
            [cast(list[str], self.output_tokens)],
        )

    def _token_counts(self) -> tuple[list[int], list[int]]:
        """Return per-sample token counts for the inputs and the outputs."""
        inputs, outputs = self._token_batches()
        return [len(tokens) for tokens in inputs], [len(tokens) for tokens in outputs]

    @staticmethod
    def _format_metric_value(value: float | list[float]) -> str:
        """Render a metric value with three decimals.

        Lists of up to four entries are printed in full; longer lists are
        condensed to their average, minimum and maximum.
        """
        if isinstance(value, list):
            if not value:
                return "[]"
            if len(value) <= 4:
                rendered = ", ".join(f"{entry:.3f}" for entry in value)
                return f"[{rendered}]"
            mean = sum(value) / len(value)
            return f"avg={mean:.3f} min={min(value):.3f} max={max(value):.3f}"

        return f"{value:.3f}"

    @staticmethod
    def _format_token(token: str, *, max_length: int) -> str:
        """Escape newlines in *token* and cap it at *max_length* characters.

        Tokens over the limit end in ``...``. When the limit is below three
        there is no room for the ellipsis, so the token is cut hard instead;
        a plain ``max_length - 3`` slice would go negative there and return
        a string longer than the limit.
        """
        clean = token.replace("\n", "\\n")
        if len(clean) <= max_length:
            return clean
        if max_length < 3:
            return clean[: max(max_length, 0)]
        return clean[: max_length - 3] + "..."

    def to_report(self) -> dict[str, object]:
        """Return every field plus per-sample and total token counts as a dict."""
        input_counts, output_counts = self._token_counts()
        return {
            "tokenizer": self.tokenizer_info,
            "original": self.original,
            "corrupted": self.corrupted,
            "input_tokens": self.input_tokens,
            "output_tokens": self.output_tokens,
            "input_token_ids": self.input_token_ids,
            "output_token_ids": self.output_token_ids,
            "token_counts": {
                "input": {"per_sample": input_counts, "total": sum(input_counts)},
                "output": {"per_sample": output_counts, "total": sum(output_counts)},
            },
            "metrics": self.metrics,
        }

    def summary(self, *, max_rows: int = 8, max_token_length: int = 24) -> str:
        """Build a multi-line, human-readable digest of the result.

        Args:
            max_rows: Upper bound on the sample rows and on the token-drift
                rows that are printed. Values below zero are treated as zero.
            max_token_length: Longest token text shown before truncation.

        Returns:
            Newline-joined text with the tokenizer label, token counts,
            metrics, and a token-by-token comparison of the first sample.
        """
        # A negative limit would make the "... N more" counters overshoot.
        max_rows = max(max_rows, 0)

        input_batches, output_batches = self._token_batches()
        input_counts, output_counts = self._token_counts()
        is_batch = self._tokens_are_batched()

        lines: list[str] = [f"Tokenizer: {self.tokenizer_info}"]
        if is_batch:
            lines.append(f"Samples: {len(input_batches)}")

        lines.append("Token counts:")
        visible_counts = list(zip(input_counts, output_counts))[:max_rows]
        for index, (input_count, output_count) in enumerate(visible_counts, start=1):
            prefix = f"#{index} " if is_batch else ""
            delta = output_count - input_count
            lines.append(f" {prefix}{input_count} -> {output_count} ({delta:+d})")
        if len(input_batches) > max_rows:
            # Report only the samples that were really left out.
            lines.append(f" ... {len(input_batches) - max_rows} more samples")

        lines.append("Metrics:")
        for name, value in self.metrics.items():
            lines.append(f" {name}: {self._format_metric_value(value)}")

        if input_batches:
            # Only the first sample is compared token by token.
            focus_index = 0
            if is_batch and len(input_batches) > 1:
                lines.append("Token drift (first sample):")
            else:
                lines.append("Token drift:")
            input_tokens = input_batches[focus_index]
            output_tokens = output_batches[focus_index]
            rows = max(len(input_tokens), len(output_tokens))
            display_rows = min(rows, max_rows)
            for idx in range(display_rows):
                left = (
                    self._format_token(input_tokens[idx], max_length=max_token_length)
                    if idx < len(input_tokens)
                    else ""
                )
                right = (
                    self._format_token(output_tokens[idx], max_length=max_token_length)
                    if idx < len(output_tokens)
                    else ""
                )
                # "+" output-only position, "-" input-only position,
                # "=" identical tokens, "!" differing tokens.
                if idx >= len(input_tokens):
                    marker = "+"
                elif idx >= len(output_tokens):
                    marker = "-"
                elif input_tokens[idx] == output_tokens[idx]:
                    marker = "="
                else:
                    marker = "!"
                lines.append(f" {idx + 1:>3}{marker} {left} -> {right}")
            if rows > display_rows:
                lines.append(f" ... {rows - display_rows} more tokens")
        else:
            lines.append("Token drift: (empty input)")

        return "\n".join(lines)
169
+
170
+
171
@dataclass
class MultiAttackResult:
    """Attack results for several tokenizers, kept in a fixed display order."""

    # Tokenizer label -> result obtained with that tokenizer.
    results: dict[str, AttackResult]
    # Tokenizer labels in the order they should be reported.
    order: list[str]

    @property
    def primary(self) -> AttackResult:
        """Result belonging to the first label in ``order``."""
        first_label = self.order[0]
        return self.results[first_label]

    def to_report(self) -> dict[str, object]:
        """Return the ordered labels and each result's own report as a dict."""
        per_tokenizer: dict[str, object] = {}
        for label in self.order:
            per_tokenizer[label] = self.results[label].to_report()
        return {"tokenizers": list(self.order), "results": per_tokenizer}

    def summary(self, *, max_rows: int = 6, max_token_length: int = 24) -> str:
        """Concatenate the numbered, indented summaries of every result.

        Args:
            max_rows: Forwarded to each result's ``summary``.
            max_token_length: Forwarded to each result's ``summary``.
        """
        output: list[str] = []
        position = 0
        for label in self.order:
            position += 1
            output.append(f"{position}. {label}")
            body = self.results[label].summary(
                max_rows=max_rows,
                max_token_length=max_token_length,
            )
            for row in body.splitlines():
                output.append(f" {row}")
        return "\n".join(output)
196
+
197
+
198
class Attack:
    """Coordinate glitchling corruption, tokenization and metric scoring.

    The class itself stays thin and wires together:
    - glitchling invocation (impure: may use Rust FFI)
    - tokenizer resolution (impure)
    - metric computation (impure: calls Rust metrics)
    - result assembly, which is handed to the pure helpers in compose.py

    Inputs are validated when the instance is built (boundary layer); pure
    work is delegated to the compose.py and encode.py modules.
    """

    def __init__(
        self,
        glitchlings: Glitchling | str | Iterable[str | Glitchling],
        tokenizer: str | Tokenizer | None = None,
        metrics: Mapping[str, Metric] | None = None,
        *,
        seed: int | None = None,
        transcript_target: TranscriptTarget | None = None,
    ) -> None:
        """Set up the corruption pipeline, tokenizer and metrics.

        Args:
            glitchlings: One Glitchling (a Gaggle counts), a string spec such
                as 'Typogre(rate=0.05)', or an iterable mixing both.
            tokenizer: Tokenizer name (e.g. 'cl100k_base', 'bert-base-uncased'),
                a Tokenizer object, or None (defaults to whitespace).
            metrics: Mapping of metric name to metric function. None selects
                the three built-in metrics.
            seed: Optional master seed for the Gaggle; DEFAULT_ATTACK_SEED is
                used when omitted. The seed is also applied to a Gaggle that
                was passed in directly so runs stay deterministic. Instances
                are cloned before seeding, so caller-owned objects are never
                mutated.
            transcript_target: Which transcript turns to corrupt. None keeps
                the Gaggle default ("last"). Accepts:
                - "last": corrupt only the last turn (default)
                - "all": corrupt all turns
                - "assistant": corrupt only assistant turns
                - "user": corrupt only user turns
                - int: corrupt a specific index (negative indexing supported)
                - Sequence[int]: corrupt specific indices
        """
        # Work on private copies so seeding below cannot leak to the caller.
        owned_specs = self._clone_glitchling_specs(glitchlings)
        effective_seed = DEFAULT_ATTACK_SEED if seed is None else seed
        self.glitchlings = coerce_gaggle(
            owned_specs,
            seed=effective_seed,
            apply_seed_to_existing=True,
            transcript_target=transcript_target,
        )

        # Keep both the resolved tokenizer and a printable label for it.
        resolved = resolve_tokenizer(tokenizer)
        self.tokenizer = resolved
        self.tokenizer_info = describe_tokenizer(resolved, tokenizer)

        # Copy caller-supplied metrics; otherwise fall back to the defaults.
        self.metrics: dict[str, Metric] = (
            dict(metrics)
            if metrics is not None
            else {
                "jensen_shannon_divergence": jensen_shannon_divergence,
                "normalized_edit_distance": normalized_edit_distance,
                "subsequence_retention": subsequence_retention,
            }
        )

    @staticmethod
    def _clone_glitchling_specs(
        glitchlings: Glitchling | str | Iterable[str | Glitchling],
    ) -> Glitchling | str | list[str | Glitchling]:
        """Copy every Glitchling in the spec so the Attack owns what it seeds.

        String specs are passed through untouched. An iterable is
        materialised into a list whose Glitchling entries are clones.
        """
        if isinstance(glitchlings, str):
            return glitchlings

        if isinstance(glitchlings, Glitchling):
            return glitchlings.clone()

        if not isinstance(glitchlings, Iterable):
            # Unknown shape: hand it on unchanged.
            return glitchlings

        return [
            entry.clone() if isinstance(entry, Glitchling) else entry
            for entry in glitchlings
        ]

    def run(self, text: str | Transcript | Sequence[str]) -> AttackResult:
        """Corrupt *text* and score the damage.

        Handles a single string, a batch of strings, or a chat transcript.
        For batched inputs (transcripts or lists of strings) metrics are
        computed per entry and returned as lists.

        Args:
            text: Input text, transcript, or batch of plain strings to corrupt.

        Returns:
            AttackResult containing original, corrupted, tokens, and metrics.

        Raises:
            TypeError: If *text* is none of the supported shapes, or a string
                input does not yield a string output.
            ValueError: If a transcript input does not yield a transcript.
        """
        # The batch check comes first, so an empty list is handled as a batch.
        if _is_string_batch(text):
            sources = list(text)
            mutated_batch: list[str] = []
            for source in sources:
                mutated = self.glitchlings.corrupt(source)
                if isinstance(mutated, str):
                    mutated_batch.append(mutated)
                    continue
                raise TypeError("Attack expected string output when given a batch of strings.")

            return self._compose_result(
                original_container=sources,
                corrupted_container=mutated_batch,
                original_contents=sources,
                corrupted_contents=mutated_batch,
                is_batch=True,
            )

        if is_transcript(text):
            mutated_transcript = self.glitchlings.corrupt(text)
            if not is_transcript(mutated_transcript):
                raise ValueError("Attack expected output type to mirror input type.")

            return self._compose_result(
                original_container=text,
                corrupted_container=mutated_transcript,
                original_contents=extract_transcript_contents(text),
                corrupted_contents=extract_transcript_contents(mutated_transcript),
                is_batch=True,
            )

        if not isinstance(text, str):
            message = (
                "Attack.run expected string, transcript, or list of strings, "
                f"got {type(text).__name__}"
            )
            raise TypeError(message)

        mutated_text = self.glitchlings.corrupt(text)
        if not isinstance(mutated_text, str):
            raise TypeError("Attack expected output type to mirror input type.")

        return self._compose_result(
            original_container=text,
            corrupted_container=mutated_text,
            original_contents=[text],
            corrupted_contents=[mutated_text],
            is_batch=False,
        )

    def _compose_result(
        self,
        *,
        original_container: str | Transcript | Sequence[str],
        corrupted_container: str | Transcript | Sequence[str],
        original_contents: list[str],
        corrupted_contents: list[str],
        is_batch: bool,
    ) -> AttackResult:
        """Tokenize both sides, evaluate every metric and build the result.

        Args:
            original_container: Input in the shape the caller supplied.
            corrupted_container: Corrupted output in that same shape.
            original_contents: Plain strings taken from the input.
            corrupted_contents: Plain strings taken from the output.
            is_batch: Whether metrics and tokens are reported per entry.

        Raises:
            ValueError: If the two content lists differ in length.
        """
        if len(original_contents) != len(corrupted_contents):
            raise ValueError("Inputs and outputs must contain the same number of entries.")

        if len(original_contents) == 0:
            # Nothing to tokenize or score; compose.py builds the empty shape.
            empty_fields = build_empty_result(
                original_container,
                corrupted_container,
                self.tokenizer_info,
                list(self.metrics),
            )
            return AttackResult(**empty_fields)  # type: ignore[arg-type]

        input_tokens, input_ids = encode_batch(self.tokenizer, original_contents)
        output_tokens, output_ids = encode_batch(self.tokenizer, corrupted_contents)

        # Metrics get nested lists for batches and flat lists for one sample.
        metric_inputs: list[str] | list[list[str]]
        metric_outputs: list[str] | list[list[str]]
        if is_batch:
            metric_inputs, metric_outputs = input_tokens, output_tokens
        else:
            metric_inputs, metric_outputs = input_tokens[0], output_tokens[0]

        computed_metrics: dict[str, float | list[float]] = {
            name: metric_fn(metric_inputs, metric_outputs)
            for name, metric_fn in self.metrics.items()
        }

        if is_batch:
            fields = build_batch_result(
                original=original_container,
                corrupted=corrupted_container,
                input_tokens=input_tokens,
                input_token_ids=input_ids,
                output_tokens=output_tokens,
                output_token_ids=output_ids,
                tokenizer_info=self.tokenizer_info,
                metrics=computed_metrics,
            )
        else:
            fields = build_single_result(
                original=cast(str, original_container),
                corrupted=cast(str, corrupted_container),
                input_tokens=input_tokens[0],
                input_token_ids=input_ids[0],
                output_tokens=output_tokens[0],
                output_token_ids=output_ids[0],
                tokenizer_info=self.tokenizer_info,
                metrics=computed_metrics,
            )
        return AttackResult(**fields)  # type: ignore[arg-type]

    def compare(
        self,
        text: str | Transcript | Sequence[str],
        *,
        tokenizers: Sequence[str | Tokenizer],
        include_self: bool = True,
    ) -> MultiAttackResult:
        """Run the same attack under several tokenizers for side-by-side review.

        Args:
            text: Input forwarded to ``run`` for every tokenizer.
            tokenizers: Extra tokenizers (names or objects) to evaluate.
            include_self: Also evaluate this instance's own tokenizer, first.

        Returns:
            MultiAttackResult keyed by tokenizer label; when two runs report
            the same label only the first one is kept.

        Raises:
            ValueError: If no tokenizer would be evaluated at all.
        """
        if not tokenizers and not include_self:
            raise ValueError("At least one tokenizer must be provided for comparison.")

        results: dict[str, AttackResult] = {}
        order: list[str] = []

        def record(result: AttackResult) -> None:
            # First result for a given label wins; later duplicates are dropped.
            label = result.tokenizer_info
            if label in results:
                return
            results[label] = result
            order.append(label)

        runner_seed = self.glitchlings.seed
        transcript_target = getattr(self.glitchlings, "transcript_target", None)

        def spawn(tokenizer_spec: str | Tokenizer) -> Attack:
            # Every run gets a fresh Attack built with the same seed.
            return Attack(
                self.glitchlings,
                tokenizer=tokenizer_spec,
                metrics=self.metrics,
                seed=runner_seed,
                transcript_target=transcript_target,
            )

        if include_self:
            record(spawn(self.tokenizer).run(text))

        for spec in tokenizers:
            resolved_tokenizer = resolve_tokenizer(spec)
            record(spawn(resolved_tokenizer).run(text))

        return MultiAttackResult(results=results, order=order)
@@ -0,0 +1,114 @@
1
+ """Pure encoding utilities for tokenization.
2
+
3
+ This module contains pure functions for encoding text using tokenizers.
4
+ The functions here do not resolve tokenizers or perform IO - they operate
5
+ on already-resolved Tokenizer instances.
6
+
7
+ Pure guarantees:
8
+ - No import side effects beyond stdlib
9
+ - No file IO or network calls
10
+ - No environment variable access
11
+ - Deterministic output for given inputs
12
+
13
+ The impure tokenizer resolution lives in tokenization.py.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from typing import TYPE_CHECKING, Sequence
19
+
20
+ if TYPE_CHECKING: # pragma: no cover - typing only
21
+ from .tokenization import Tokenizer
22
+
23
+
24
def encode_single(
    tokenizer: "Tokenizer",
    text: str,
) -> tuple[list[str], list[int]]:
    """Tokenize one string and return its tokens and ids as plain lists.

    Whatever iterable types ``tokenizer.encode`` hands back are copied into
    lists, so callers can rely on list semantics.

    Args:
        tokenizer: A resolved tokenizer instance.
        text: Text to encode.

    Returns:
        Tuple of (tokens, token_ids), both lists.
    """
    raw_tokens, raw_ids = tokenizer.encode(text)
    return [*raw_tokens], [*raw_ids]
41
+
42
+
43
def encode_batch(
    tokenizer: "Tokenizer",
    texts: Sequence[str],
) -> tuple[list[list[str]], list[list[int]]]:
    """Encode multiple texts into batched tokens and IDs.

    Uses the tokenizer's ``encode_batch`` method when it exists and is
    callable; otherwise each text is encoded on its own through
    ``tokenizer.encode``. Either way every per-text result is copied into
    plain lists.

    Args:
        tokenizer: A resolved tokenizer instance.
        texts: Sequence of texts to encode.

    Returns:
        Tuple of (token_batches, id_batches) as nested lists, one inner
        list per input text.
    """
    bulk_encoder = getattr(tokenizer, "encode_batch", None)
    if callable(bulk_encoder):
        encoded = bulk_encoder(texts)
    else:
        # Fallback: lazily encode one text at a time.
        encoded = (tokenizer.encode(entry) for entry in texts)

    token_batches: list[list[str]] = []
    id_batches: list[list[int]] = []
    for tokens, ids in encoded:
        token_batches.append(list(tokens))
        id_batches.append(list(ids))
    return token_batches, id_batches
78
+
79
+
80
def describe_tokenizer(
    tokenizer: "Tokenizer",
    raw_spec: "str | Tokenizer | None",
) -> str:
    """Produce a printable label for a tokenizer.

    The label is chosen in this order: the original string spec, the
    tokenizer's non-empty string ``name`` attribute, the tokenizer's class
    name (only when no spec was given), and finally ``str(raw_spec)``.

    Args:
        tokenizer: The resolved tokenizer instance.
        raw_spec: The original specification used to create/resolve the tokenizer.

    Returns:
        A descriptive string identifying the tokenizer.
    """
    # A string spec is already the most recognisable label.
    if isinstance(raw_spec, str):
        return raw_spec

    label = getattr(tokenizer, "name", None)
    if isinstance(label, str) and label != "":
        return label

    # No usable name: class name for a defaulted tokenizer, else the spec text.
    return tokenizer.__class__.__name__ if raw_spec is None else str(raw_spec)
108
+
109
+
110
+ __all__ = [
111
+ "describe_tokenizer",
112
+ "encode_batch",
113
+ "encode_single",
114
+ ]
@@ -0,0 +1,104 @@
1
+ from __future__ import annotations
2
+
3
+ import importlib
4
+ from typing import Any, Protocol, cast
5
+
6
+ from .metrics_dispatch import TokenBatch, TokenSequence, is_batch, validate_batch_consistency
7
+
8
+
9
class Metric(Protocol):
    """Structural type for a metric callable.

    A metric takes the original and the corrupted tokens, each either one
    token sequence or a batch of them, and returns one float or a list of
    floats.
    """

    def __call__(
        self,
        original_tokens: TokenSequence | TokenBatch,
        corrupted_tokens: TokenSequence | TokenBatch,
    ) -> float | list[float]:
        """Score *corrupted_tokens* against *original_tokens*."""
        ...
15
+
16
+
17
class BatchMetric(Protocol):
    """Structural type for a metric callable that only handles batches."""

    def __call__(
        self,
        inputs: TokenBatch,
        outputs: TokenBatch,
    ) -> list[float]:
        """Return a list of scores for the given input and output batches."""
        ...
19
+
20
+
21
# Load the compiled extension by name at import time. A missing module is
# re-raised as ImportError with a hint about how to fix the installation.
try:
    _rust: Any = importlib.import_module("glitchlings._zoo_rust")
except ModuleNotFoundError as exc:  # pragma: no cover - runtime guard
    raise ImportError(
        "Could not import compiled Rust extension. "
        "Please ensure the project is installed with the Rust extension built."
    ) from exc

# Bind the extension's metric entry points once; the casts only serve the
# type checker. The ``batch_`` names are the variants typed as BatchMetric.
_single_jsd = cast(Metric, _rust.jensen_shannon_divergence)
_single_ned = cast(Metric, _rust.normalized_edit_distance)
_single_sr = cast(Metric, _rust.subsequence_retention)
_batch_jsd = cast(BatchMetric, _rust.batch_jensen_shannon_divergence)
_batch_ned = cast(BatchMetric, _rust.batch_normalized_edit_distance)
_batch_sr = cast(BatchMetric, _rust.batch_subsequence_retention)
35
+
36
+
37
def _dispatch_metric(
    original: TokenSequence | TokenBatch,
    corrupted: TokenSequence | TokenBatch,
    *,
    single: Metric,
    batch: BatchMetric,
    name: str,
) -> float | list[float]:
    """Route a metric call to its single-sample or batch implementation.

    Both arguments are first passed to ``validate_batch_consistency``
    together with the metric *name*; ``is_batch(original)`` then decides
    which implementation is called.

    Args:
        original: Tokens of the uncorrupted text(s).
        corrupted: Tokens of the corrupted text(s).
        single: Implementation used when *original* is not a batch.
        batch: Implementation used when *original* is a batch.
        name: Metric name handed to the consistency check.

    Returns:
        Whatever the selected implementation returns.
    """
    validate_batch_consistency(original, corrupted, name)
    implementation = batch if is_batch(original) else single
    return implementation(original, corrupted)
55
+
56
+
57
def jensen_shannon_divergence(
    original_tokens: TokenSequence | TokenBatch,
    corrupted_tokens: TokenSequence | TokenBatch,
) -> float | list[float]:
    """Jensen-Shannon divergence metric backed by the compiled extension.

    Forwards to ``_dispatch_metric``, which picks the extension's single or
    batch implementation according to the shape of *original_tokens*.
    """
    score = _dispatch_metric(
        original_tokens,
        corrupted_tokens,
        name="jensen_shannon_divergence",
        single=_single_jsd,
        batch=_batch_jsd,
    )
    return score
68
+
69
+
70
def normalized_edit_distance(
    original_tokens: TokenSequence | TokenBatch,
    corrupted_tokens: TokenSequence | TokenBatch,
) -> float | list[float]:
    """Normalized edit distance metric backed by the compiled extension.

    Forwards to ``_dispatch_metric``, which picks the extension's single or
    batch implementation according to the shape of *original_tokens*.
    """
    score = _dispatch_metric(
        original_tokens,
        corrupted_tokens,
        name="normalized_edit_distance",
        single=_single_ned,
        batch=_batch_ned,
    )
    return score
81
+
82
+
83
def subsequence_retention(
    original_tokens: TokenSequence | TokenBatch,
    corrupted_tokens: TokenSequence | TokenBatch,
) -> float | list[float]:
    """Subsequence retention metric backed by the compiled extension.

    Forwards to ``_dispatch_metric``, which picks the extension's single or
    batch implementation according to the shape of *original_tokens*.
    """
    score = _dispatch_metric(
        original_tokens,
        corrupted_tokens,
        name="subsequence_retention",
        single=_single_sr,
        batch=_batch_sr,
    )
    return score
94
+
95
+
96
+ __all__ = [
97
+ "Metric",
98
+ "BatchMetric",
99
+ "TokenBatch",
100
+ "TokenSequence",
101
+ "jensen_shannon_divergence",
102
+ "normalized_edit_distance",
103
+ "subsequence_retention",
104
+ ]