glitchlings-1.0.0-cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. glitchlings/__init__.py +101 -0
  2. glitchlings/__main__.py +8 -0
  3. glitchlings/_corruption_engine/__init__.py +12 -0
  4. glitchlings/_corruption_engine.cp313-win_amd64.pyd +0 -0
  5. glitchlings/assets/__init__.py +180 -0
  6. glitchlings/assets/apostrofae_pairs.json +32 -0
  7. glitchlings/assets/ekkokin_homophones.json +2014 -0
  8. glitchlings/assets/hokey_assets.json +193 -0
  9. glitchlings/assets/lexemes/academic.json +1049 -0
  10. glitchlings/assets/lexemes/colors.json +1333 -0
  11. glitchlings/assets/lexemes/corporate.json +716 -0
  12. glitchlings/assets/lexemes/cyberpunk.json +22 -0
  13. glitchlings/assets/lexemes/lovecraftian.json +23 -0
  14. glitchlings/assets/lexemes/synonyms.json +3354 -0
  15. glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
  16. glitchlings/assets/ocr_confusions.tsv +30 -0
  17. glitchlings/assets/pipeline_assets.json +29 -0
  18. glitchlings/attack/__init__.py +184 -0
  19. glitchlings/attack/analysis.py +1321 -0
  20. glitchlings/attack/core.py +819 -0
  21. glitchlings/attack/core_execution.py +378 -0
  22. glitchlings/attack/core_planning.py +612 -0
  23. glitchlings/attack/encode.py +114 -0
  24. glitchlings/attack/metrics.py +211 -0
  25. glitchlings/attack/metrics_dispatch.py +70 -0
  26. glitchlings/attack/tokenization.py +338 -0
  27. glitchlings/attack/tokenizer_metrics.py +373 -0
  28. glitchlings/auggie.py +285 -0
  29. glitchlings/compat/__init__.py +9 -0
  30. glitchlings/compat/loaders.py +355 -0
  31. glitchlings/compat/types.py +41 -0
  32. glitchlings/conf/__init__.py +39 -0
  33. glitchlings/conf/loaders.py +331 -0
  34. glitchlings/conf/schema.py +156 -0
  35. glitchlings/conf/types.py +72 -0
  36. glitchlings/config.toml +2 -0
  37. glitchlings/constants.py +139 -0
  38. glitchlings/dev/__init__.py +3 -0
  39. glitchlings/dev/docs.py +45 -0
  40. glitchlings/dlc/__init__.py +21 -0
  41. glitchlings/dlc/_shared.py +300 -0
  42. glitchlings/dlc/gutenberg.py +400 -0
  43. glitchlings/dlc/huggingface.py +68 -0
  44. glitchlings/dlc/langchain.py +147 -0
  45. glitchlings/dlc/nemo.py +283 -0
  46. glitchlings/dlc/prime.py +215 -0
  47. glitchlings/dlc/pytorch.py +98 -0
  48. glitchlings/dlc/pytorch_lightning.py +173 -0
  49. glitchlings/internal/__init__.py +16 -0
  50. glitchlings/internal/rust.py +159 -0
  51. glitchlings/internal/rust_ffi.py +599 -0
  52. glitchlings/main.py +426 -0
  53. glitchlings/protocols.py +91 -0
  54. glitchlings/runtime_config.py +24 -0
  55. glitchlings/util/__init__.py +41 -0
  56. glitchlings/util/adapters.py +65 -0
  57. glitchlings/util/keyboards.py +508 -0
  58. glitchlings/util/transcripts.py +108 -0
  59. glitchlings/zoo/__init__.py +161 -0
  60. glitchlings/zoo/assets/__init__.py +29 -0
  61. glitchlings/zoo/core.py +852 -0
  62. glitchlings/zoo/core_execution.py +154 -0
  63. glitchlings/zoo/core_planning.py +451 -0
  64. glitchlings/zoo/corrupt_dispatch.py +291 -0
  65. glitchlings/zoo/hokey.py +139 -0
  66. glitchlings/zoo/jargoyle.py +301 -0
  67. glitchlings/zoo/mim1c.py +269 -0
  68. glitchlings/zoo/pedant/__init__.py +109 -0
  69. glitchlings/zoo/pedant/core.py +99 -0
  70. glitchlings/zoo/pedant/forms.py +50 -0
  71. glitchlings/zoo/pedant/stones.py +83 -0
  72. glitchlings/zoo/redactyl.py +94 -0
  73. glitchlings/zoo/rng.py +280 -0
  74. glitchlings/zoo/rushmore.py +416 -0
  75. glitchlings/zoo/scannequin.py +370 -0
  76. glitchlings/zoo/transforms.py +331 -0
  77. glitchlings/zoo/typogre.py +194 -0
  78. glitchlings/zoo/validation.py +643 -0
  79. glitchlings/zoo/wherewolf.py +120 -0
  80. glitchlings/zoo/zeedub.py +165 -0
  81. glitchlings-1.0.0.dist-info/METADATA +404 -0
  82. glitchlings-1.0.0.dist-info/RECORD +86 -0
  83. glitchlings-1.0.0.dist-info/WHEEL +5 -0
  84. glitchlings-1.0.0.dist-info/entry_points.txt +3 -0
  85. glitchlings-1.0.0.dist-info/licenses/LICENSE +201 -0
  86. glitchlings-1.0.0.dist-info/top_level.txt +1 -0
glitchlings/attack/tokenization.py
@@ -0,0 +1,338 @@
+ from __future__ import annotations
+
+ import importlib.util
+ import threading
+ import zlib
+ from typing import Any, Protocol, Sequence
+
+ DEFAULT_TIKTOKEN_ENCODINGS = ("o200k_base", "cl100k_base")
+
+ # ---------------------------------------------------------------------------
+ # Tokenizer Cache
+ # ---------------------------------------------------------------------------
+
+ _TOKENIZER_CACHE: dict[str, "Tokenizer"] = {}
+ _TOKENIZER_CACHE_LOCK = threading.Lock()
+ _TOKENIZER_CACHE_MAX_SIZE = 16
+
+ # Sentinel for default tokenizer cache key (avoids collision with user names)
+ _DEFAULT_TOKENIZER_KEY = object()
+
+
+ def _get_cache_key(key: str | object) -> str:
+     """Convert cache key to string, handling sentinel."""
+     return "__default__" if key is _DEFAULT_TOKENIZER_KEY else str(key)
+
+
+ def _get_cached_tokenizer(key: str | object) -> "Tokenizer | None":
+     """Thread-safe lookup of cached tokenizer with LRU refresh."""
+     cache_key = _get_cache_key(key)
+     with _TOKENIZER_CACHE_LOCK:
+         if cache_key in _TOKENIZER_CACHE:
+             # Move to end to mark as recently used (true LRU)
+             tokenizer = _TOKENIZER_CACHE.pop(cache_key)
+             _TOKENIZER_CACHE[cache_key] = tokenizer
+             return tokenizer
+         return None
+
+
+ def _cache_tokenizer(key: str | object, tokenizer: "Tokenizer") -> "Tokenizer":
+     """Thread-safe caching of tokenizer with LRU eviction."""
+     cache_key = _get_cache_key(key)
+     with _TOKENIZER_CACHE_LOCK:
+         # Remove if already exists (will re-add at end)
+         if cache_key in _TOKENIZER_CACHE:
+             del _TOKENIZER_CACHE[cache_key]
+         # Evict oldest if at capacity
+         elif len(_TOKENIZER_CACHE) >= _TOKENIZER_CACHE_MAX_SIZE:
+             oldest_key = next(iter(_TOKENIZER_CACHE))
+             del _TOKENIZER_CACHE[oldest_key]
+         _TOKENIZER_CACHE[cache_key] = tokenizer
+         return tokenizer
+
+
+ def clear_tokenizer_cache() -> int:
+     """Clear the tokenizer cache and return the number of entries cleared.
+
+     Useful for testing or when memory is constrained.
+
+     Returns:
+         Number of cached tokenizers that were cleared.
+     """
+     with _TOKENIZER_CACHE_LOCK:
+         count = len(_TOKENIZER_CACHE)
+         _TOKENIZER_CACHE.clear()
+         return count
+
+
+ def get_tokenizer_cache_info() -> dict[str, Any]:
+     """Get information about the tokenizer cache.
+
+     Returns:
+         Dictionary with cache stats: size, max_size, cached_keys.
+     """
+     with _TOKENIZER_CACHE_LOCK:
+         return {
+             "size": len(_TOKENIZER_CACHE),
+             "max_size": _TOKENIZER_CACHE_MAX_SIZE,
+             "cached_keys": list(_TOKENIZER_CACHE.keys()),
+         }
+
+
+ class Tokenizer(Protocol):
+     def encode(self, text: str) -> tuple[list[str], list[int]]: ...
+
+     def decode(self, tokens: Sequence[str]) -> str: ...
+
+
+ class WhitespaceTokenizer:
+     def encode(self, text: str) -> tuple[list[str], list[int]]:
+         tokens = text.split()
+         # Synthetic IDs based on adler32 hash for stability
+         ids = [zlib.adler32(t.encode("utf-8")) & 0xFFFFFFFF for t in tokens]
+         return tokens, ids
+
+     def decode(self, tokens: Sequence[str]) -> str:
+         return " ".join(tokens)
+
+     def encode_batch(self, texts: Sequence[str]) -> list[tuple[list[str], list[int]]]:
+         return [self.encode(text) for text in texts]
+
+
+ class TiktokenTokenizer:
+     def __init__(self, model_name: str):
+         import tiktoken
+
+         self.name = model_name
+         try:
+             self.enc = tiktoken.get_encoding(model_name)
+         except ValueError:
+             self.enc = tiktoken.encoding_for_model(model_name)
+
+     def encode(self, text: str) -> tuple[list[str], list[int]]:
+         ids = self.enc.encode(text)
+         tokens = [
+             self.enc.decode_single_token_bytes(i).decode("utf-8", errors="replace") for i in ids
+         ]
+         return tokens, ids
+
+     def decode(self, tokens: Sequence[str], sep: str = "") -> str:
+         return sep.join(tokens)
+
+     def encode_batch(self, texts: Sequence[str]) -> list[tuple[list[str], list[int]]]:
+         id_batches = [list(batch) for batch in self.enc.encode_batch(list(texts))]
+         token_batches: list[list[str]] = []
+         for ids in id_batches:
+             token_batches.append(
+                 [
+                     self.enc.decode_single_token_bytes(i).decode("utf-8", errors="replace")
+                     for i in ids
+                 ]
+             )
+         return list(zip(token_batches, id_batches))
+
+
+ class HuggingFaceTokenizerWrapper:
+     def __init__(self, tokenizer_obj: Any, *, unknown_token: str = "[UNK]"):
+         self.tokenizer = tokenizer_obj
+         self.unknown_token = unknown_token
+
+     def encode(self, text: str) -> tuple[list[str], list[int]]:
+         # tokenizers.Tokenizer.encode returns an Encoding object
+         encoding = self.tokenizer.encode(text)
+         return encoding.tokens, encoding.ids
+
+     def decode(self, tokens: Sequence[str]) -> str:
+         # Use the tokenizer's decode method to properly handle model-specific
+         # artifacts (e.g., "##" for WordPiece, "Ġ" for BPE).
+         # Convert tokens to IDs first, then decode.
+         try:
+             token_ids = [self.tokenizer.token_to_id(token) for token in tokens]
+             # Filter out None values (tokens not in vocabulary)
+             valid_ids = [tid for tid in token_ids if tid is not None]
+             if valid_ids:
+                 result: str = self.tokenizer.decode(valid_ids)
+                 return result
+         except (AttributeError, TypeError):
+             pass
+         # Fallback: decode each token individually to handle artifacts properly
+         decoded_tokens = []
+         for token in tokens:
+             token_id = None
+             try:
+                 token_id = self.tokenizer.token_to_id(token)
+             except (AttributeError, TypeError):
+                 pass
+             if token_id is None:
+                 decoded_tokens.append(self.unknown_token)
+             else:
+                 # Decode the single token ID to properly handle artifacts
+                 try:
+                     decoded = self.tokenizer.decode([token_id])
+                     decoded_tokens.append(decoded)
+                 except (AttributeError, TypeError):
+                     # Last resort: strip common prefixes and use token as-is
+                     clean_token = token.lstrip("Ġ").lstrip("##").lstrip("▁")
+                     decoded_tokens.append(clean_token if clean_token else token)
+         return " ".join(decoded_tokens)
+
+     def encode_batch(self, texts: Sequence[str]) -> list[tuple[list[str], list[int]]]:
+         encodings = self.tokenizer.encode_batch(list(texts))
+         return [(encoding.tokens, encoding.ids) for encoding in encodings]
+
+
+ def list_available_tokenizers() -> list[str]:
+     """List tokenizer names that can be resolved.
+
+     Returns a list of known tokenizer names including:
+     - Tiktoken encodings (if tiktoken is installed)
+     - A note about HuggingFace tokenizers (if tokenizers is installed)
+     - 'whitespace' (always available)
+
+     Returns:
+         List of available tokenizer names/descriptions.
+     """
+     available: list[str] = []
+
+     if importlib.util.find_spec("tiktoken"):
+         import tiktoken
+
+         # Add known tiktoken encodings
+         for encoding in DEFAULT_TIKTOKEN_ENCODINGS:
+             try:
+                 tiktoken.get_encoding(encoding)
+                 available.append(encoding)
+             except ValueError:
+                 pass
+         # Add common model names
+         available.extend(["gpt-4", "gpt-4o", "gpt-3.5-turbo"])
+
+     if importlib.util.find_spec("tokenizers"):
+         available.append("<any HuggingFace tokenizer name>")
+
+     available.append("whitespace")
+     return available
+
+
+ def resolve_tokenizer(
+     tokenizer: str | Tokenizer | None,
+     *,
+     use_cache: bool = True,
+ ) -> Tokenizer:
+     """Resolve a tokenizer specification to a Tokenizer instance.
+
+     Tokenizers resolved from string specifications are cached by default for
+     efficient reuse across multiple Attack instances.
+
+     Args:
+         tokenizer: One of:
+             - None: Use default tokenizer (tiktoken o200k_base, or whitespace)
+             - str: Tokenizer name (tiktoken encoding, model name, or HF tokenizer)
+             - Tokenizer: Pass through as-is
+         use_cache: Whether to use the tokenizer cache for string specs.
+             Defaults to True. Set to False to always create fresh instances.
+
+     Returns:
+         A Tokenizer instance.
+
+     Raises:
+         ValueError: If string tokenizer cannot be resolved.
+     """
+     if tokenizer is None:
+         # Default tokenizer resolution is also cached
+         return _resolve_default_tokenizer(use_cache=use_cache)
+
+     if isinstance(tokenizer, str):
+         # Check cache first
+         if use_cache:
+             cached = _get_cached_tokenizer(tokenizer)
+             if cached is not None:
+                 return cached
+
+         resolved = _resolve_string_tokenizer(tokenizer)
+         if use_cache:
+             return _cache_tokenizer(tokenizer, resolved)
+         return resolved
+
+     # Check if it is a HuggingFace tokenizer object
+     if importlib.util.find_spec("tokenizers"):
+         from tokenizers import Tokenizer as HFTokenizer
+
+         if isinstance(tokenizer, HFTokenizer):
+             return HuggingFaceTokenizerWrapper(tokenizer)
+
+     return tokenizer
+
+
+ def _resolve_string_tokenizer(tokenizer: str) -> Tokenizer:
+     """Resolve a string tokenizer specification (no caching)."""
+     if importlib.util.find_spec("tiktoken"):
+         import tiktoken
+
+         try:
+             # Check if valid tiktoken encoding/model
+             try:
+                 tiktoken.get_encoding(tokenizer)
+                 return TiktokenTokenizer(tokenizer)
+             except ValueError:
+                 try:
+                     tiktoken.encoding_for_model(tokenizer)
+                     return TiktokenTokenizer(tokenizer)
+                 except (ValueError, KeyError):
+                     pass
+         except ImportError:
+             pass
+
+     if importlib.util.find_spec("tokenizers"):
+         from tokenizers import Tokenizer
+
+         try:
+             return HuggingFaceTokenizerWrapper(Tokenizer.from_pretrained(tokenizer))
+         except Exception:
+             pass
+
+     available = list_available_tokenizers()
+     raise ValueError(
+         f"Could not resolve tokenizer: {tokenizer!r}. Available: {', '.join(available)}"
+     )
+
+
+ def _resolve_default_tokenizer(*, use_cache: bool = True) -> Tokenizer:
+     """Resolve the default tokenizer with optional caching."""
+     if use_cache:
+         cached = _get_cached_tokenizer(_DEFAULT_TOKENIZER_KEY)
+         if cached is not None:
+             return cached
+
+     resolved = _default_tokenizer()
+     if use_cache:
+         return _cache_tokenizer(_DEFAULT_TOKENIZER_KEY, resolved)
+     return resolved
+
+
+ def _default_tokenizer() -> Tokenizer:
+     """Select a modern, lightweight tokenizer with graceful fallbacks."""
+     if importlib.util.find_spec("tiktoken"):
+         import tiktoken
+
+         for encoding in DEFAULT_TIKTOKEN_ENCODINGS:
+             try:
+                 tiktoken.get_encoding(encoding)
+                 return TiktokenTokenizer(encoding)
+             except ValueError:
+                 continue
+
+     return WhitespaceTokenizer()
+
+
+ __all__ = [
+     "DEFAULT_TIKTOKEN_ENCODINGS",
+     "HuggingFaceTokenizerWrapper",
+     "TiktokenTokenizer",
+     "Tokenizer",
+     "WhitespaceTokenizer",
+     "clear_tokenizer_cache",
+     "get_tokenizer_cache_info",
+     "list_available_tokenizers",
+     "resolve_tokenizer",
+ ]
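
For orientation, a minimal usage sketch of the tokenization module above. It is illustrative only and not part of the published wheel; it assumes the wheel is installed and that tiktoken is available (with no string spec and no tiktoken, resolve_tokenizer(None) falls back to the always-available WhitespaceTokenizer).

    # Hypothetical usage sketch; assumes tiktoken is installed.
    from glitchlings.attack.tokenization import (
        clear_tokenizer_cache,
        get_tokenizer_cache_info,
        resolve_tokenizer,
    )

    tokenizer = resolve_tokenizer("o200k_base")  # resolved via tiktoken, then cached
    tokens, ids = tokenizer.encode("Hello, world!")

    again = resolve_tokenizer("o200k_base")  # served from the LRU cache (max 16 entries)
    assert again is tokenizer

    print(get_tokenizer_cache_info())  # e.g. {'size': 1, 'max_size': 16, 'cached_keys': ['o200k_base']}
    print(clear_tokenizer_cache())     # number of entries cleared, e.g. 1

Caching string specs behind a lock keeps repeated Attack setups cheap while staying safe under concurrent resolution; pass use_cache=False to force a fresh instance.
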
glitchlings/attack/tokenizer_metrics.py
@@ -0,0 +1,373 @@
+ """Tokenizer analysis metrics for evaluating tokenizer behavior.
+
+ This module provides functions for analyzing how a tokenizer encodes text.
+ Unlike the corruption metrics in metrics.py which compare before/after token
+ sequences, these metrics evaluate the tokenizer's encoding of a single text.
+
+ These metrics are implemented in Rust for performance. The functions here
+ provide a Python API with documentation and type hints.
+
+ Example:
+     >>> from glitchlings.attack.tokenizer_metrics import compression_ratio
+     >>> from glitchlings.attack.tokenization import resolve_tokenizer
+     >>> tokenizer = resolve_tokenizer("o200k_base")
+     >>> text = "Hello, world!"
+     >>> tokens, token_ids = tokenizer.encode(text)
+     >>> ratio = compression_ratio(text, tokens)
+     >>> print(f"Bytes per token: {ratio:.2f}")
+ """
+
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING, Sequence, cast
+
+ from ..internal.rust import get_rust_operation
+
+ if TYPE_CHECKING:
+     from .tokenization import Tokenizer
+
+ # Rust function references (loaded on first use via get_rust_operation)
+ _compression_ratio = get_rust_operation("compression_ratio")
+ _batch_compression_ratio = get_rust_operation("batch_compression_ratio")
+ _characters_per_token = get_rust_operation("characters_per_token")
+ _batch_characters_per_token = get_rust_operation("batch_characters_per_token")
+ _token_entropy = get_rust_operation("token_entropy")
+ _batch_token_entropy = get_rust_operation("batch_token_entropy")
+ _vocabulary_utilization = get_rust_operation("vocabulary_utilization")
+ _batch_vocabulary_utilization = get_rust_operation("batch_vocabulary_utilization")
+ _unknown_token_rate = get_rust_operation("unknown_token_rate")
+ _batch_unknown_token_rate = get_rust_operation("batch_unknown_token_rate")
+
+
+ # ---------------------------------------------------------------------------
+ # Compression Metrics
+ # ---------------------------------------------------------------------------
+
+
+ def compression_ratio(text: str, tokens: Sequence[str]) -> float:
+     """Compute bytes per token - measures encoding efficiency.
+
+     Lower values indicate the tokenizer represents the text more compactly.
+     Useful for comparing tokenizer suitability across domains.
+
+     Args:
+         text: Input text to measure.
+         tokens: Token strings from encoding the text.
+
+     Returns:
+         Ratio of UTF-8 bytes to token count. Returns inf for empty output.
+
+     Example:
+         >>> text = "Hello, world!"
+         >>> tokens, _ = tokenizer.encode(text)
+         >>> ratio = compression_ratio(text, tokens)
+     """
+     return cast(float, _compression_ratio(text, list(tokens)))
+
+
+ def batch_compression_ratio(
+     texts: Sequence[str],
+     token_batches: Sequence[Sequence[str]],
+ ) -> list[float]:
+     """Compute compression ratios for a batch of texts.
+
+     Args:
+         texts: Input texts to measure.
+         token_batches: Token sequences from encoding each text.
+
+     Returns:
+         List of compression ratios for each text.
+     """
+     return cast(
+         list[float],
+         _batch_compression_ratio(list(texts), [list(tokens) for tokens in token_batches]),
+     )
+
+
+ def characters_per_token(text: str, tokens: Sequence[str]) -> float:
+     """Compute average characters per token - simpler efficiency measure.
+
+     Higher values mean fewer tokens needed. Unlike compression_ratio,
+     this ignores UTF-8 encoding costs, so it's more intuitive for
+     ASCII-heavy text but less accurate for multilingual content.
+
+     Args:
+         text: Input text to measure.
+         tokens: Token strings from encoding the text.
+
+     Returns:
+         Ratio of character count to token count. Returns inf for empty output.
+     """
+     return cast(float, _characters_per_token(text, list(tokens)))
+
+
+ def batch_characters_per_token(
+     texts: Sequence[str],
+     token_batches: Sequence[Sequence[str]],
+ ) -> list[float]:
+     """Compute characters per token for a batch of texts.
+
+     Args:
+         texts: Input texts to measure.
+         token_batches: Token sequences from encoding each text.
+
+     Returns:
+         List of characters-per-token ratios for each text.
+     """
+     return cast(
+         list[float],
+         _batch_characters_per_token(list(texts), [list(tokens) for tokens in token_batches]),
+     )
+
+
+ # ---------------------------------------------------------------------------
+ # Token Distribution Metrics
+ # ---------------------------------------------------------------------------
+
+
+ def token_entropy(tokens: Sequence[str]) -> float:
+     """Compute Shannon entropy of token distribution.
+
+     Higher entropy means more uniform token usage (less repetition).
+     Useful for understanding how "spread out" the vocabulary usage is.
+
+     Args:
+         tokens: Token sequence to analyze.
+
+     Returns:
+         Entropy in bits. Returns 0.0 for empty input.
+
+     Example:
+         >>> tokens = ["the", "cat", "sat", "on", "the", "mat"]
+         >>> entropy = token_entropy(tokens)
+     """
+     return cast(float, _token_entropy(list(tokens)))
+
+
+ def batch_token_entropy(token_batches: Sequence[Sequence[str]]) -> list[float]:
+     """Compute token entropy for a batch of token sequences.
+
+     Args:
+         token_batches: Token sequences to analyze.
+
+     Returns:
+         List of entropy values for each sequence.
+     """
+     return cast(
+         list[float],
+         _batch_token_entropy([list(tokens) for tokens in token_batches]),
+     )
+
+
+ # ---------------------------------------------------------------------------
+ # Vocabulary Analysis
+ # ---------------------------------------------------------------------------
+
+
+ def vocabulary_utilization(
+     tokens: Sequence[str],
+     token_ids: Sequence[int],
+ ) -> dict[str, float]:
+     """Analyze vocabulary usage patterns.
+
+     Provides insights into how the tokenizer uses its vocabulary for a
+     given text. Useful for identifying domain mismatches where the
+     tokenizer may be using unusual or sparse regions of its vocabulary.
+
+     Args:
+         tokens: Token strings from encoding.
+         token_ids: Corresponding token IDs.
+
+     Returns:
+         Dictionary with:
+         - unique_ratio: fraction of tokens that are unique (type/token ratio)
+         - repetition_rate: 1 - unique_ratio (how much token reuse)
+         - max_id: highest token ID used (hints at vocabulary region)
+         - id_spread: stddev of IDs (are we using clustered or spread vocab?)
+
+     Example:
+         >>> tokens, ids = tokenizer.encode("The quick brown fox")
+         >>> stats = vocabulary_utilization(tokens, ids)
+         >>> print(f"Unique ratio: {stats['unique_ratio']:.2%}")
+     """
+     result = _vocabulary_utilization(list(tokens), list(token_ids))
+     return dict(result)
+
+
+ def batch_vocabulary_utilization(
+     token_batches: Sequence[Sequence[str]],
+     token_id_batches: Sequence[Sequence[int]],
+ ) -> list[dict[str, float]]:
+     """Analyze vocabulary usage patterns for a batch of token sequences.
+
+     Args:
+         token_batches: Token string sequences from encoding multiple texts.
+         token_id_batches: Corresponding token ID sequences.
+
+     Returns:
+         List of dictionaries, each with:
+         - unique_ratio: fraction of tokens that are unique
+         - repetition_rate: 1 - unique_ratio
+         - max_id: highest token ID used
+         - id_spread: stddev of IDs
+     """
+     results = _batch_vocabulary_utilization(
+         [list(tokens) for tokens in token_batches],
+         [list(ids) for ids in token_id_batches],
+     )
+     return [dict(r) for r in results]
+
+
+ # ---------------------------------------------------------------------------
+ # Unknown Token Detection
+ # ---------------------------------------------------------------------------
+
+
+ DEFAULT_UNKNOWN_MARKERS = ("[UNK]", "<unk>", "�", "\ufffd")
+
+
+ def unknown_token_rate(
+     tokens: Sequence[str],
+     *,
+     unknown_markers: tuple[str, ...] | None = None,
+ ) -> float:
+     """Compute fraction of tokens that appear to be unknown/fallback tokens.
+
+     Different tokenizers use different markers for OOV (out-of-vocabulary)
+     handling. High rates suggest the tokenizer's vocabulary doesn't cover
+     this domain well.
+
+     Also detects byte fallback tokens (e.g., "<0xFF>") which indicate
+     characters that couldn't be represented by the vocabulary.
+
+     Args:
+         tokens: Token sequence to analyze.
+         unknown_markers: Tuple of strings that indicate unknown tokens.
+             Defaults to common markers like "[UNK]", "<unk>", "�".
+
+     Returns:
+         Fraction of tokens that are unknown/fallback tokens.
+
+     Example:
+         >>> tokens, _ = tokenizer.encode("日本語テスト")
+         >>> rate = unknown_token_rate(tokens)
+         >>> if rate > 0.1:
+         ...     print("Warning: high unknown token rate")
+     """
+     markers = list(unknown_markers) if unknown_markers is not None else None
+     return cast(float, _unknown_token_rate(list(tokens), markers))
+
+
+ def batch_unknown_token_rate(
+     token_batches: Sequence[Sequence[str]],
+     *,
+     unknown_markers: tuple[str, ...] | None = None,
+ ) -> list[float]:
+     """Compute unknown token rates for a batch of token sequences.
+
+     Args:
+         token_batches: Token sequences to analyze.
+         unknown_markers: Tuple of strings that indicate unknown tokens.
+
+     Returns:
+         List of unknown token rates for each sequence.
+     """
+     markers = list(unknown_markers) if unknown_markers is not None else None
+     return cast(
+         list[float],
+         _batch_unknown_token_rate([list(tokens) for tokens in token_batches], markers),
+     )
+
+
+ # ---------------------------------------------------------------------------
+ # Convenience Functions (using Tokenizer directly)
+ # ---------------------------------------------------------------------------
+
+
+ def analyze_tokenizer(
+     text: str,
+     tokenizer: "Tokenizer",
+     *,
+     unknown_markers: tuple[str, ...] | None = None,
+ ) -> dict[str, float]:
+     """Comprehensive tokenizer analysis for a text.
+
+     Convenience function that encodes the text and computes all tokenizer
+     metrics at once.
+
+     Args:
+         text: Input text to analyze.
+         tokenizer: Tokenizer to evaluate.
+         unknown_markers: Tuple of strings that indicate unknown tokens.
+
+     Returns:
+         Dictionary with all tokenizer metrics:
+         - compression_ratio: bytes per token
+         - characters_per_token: chars per token
+         - token_entropy: Shannon entropy of token distribution
+         - unknown_token_rate: fraction of unknown tokens
+         - unique_ratio: type/token ratio
+         - repetition_rate: 1 - unique_ratio
+         - max_id: highest token ID
+         - id_spread: standard deviation of token IDs
+         - token_count: total number of tokens
+
+     Example:
+         >>> from glitchlings.attack.tokenization import resolve_tokenizer
+         >>> tokenizer = resolve_tokenizer("o200k_base")
+         >>> stats = analyze_tokenizer("Hello, world!", tokenizer)
+         >>> for key, value in stats.items():
+         ...     print(f"{key}: {value:.4f}")
+     """
+     if not text:
+         return {
+             "compression_ratio": 0.0,
+             "characters_per_token": 0.0,
+             "token_entropy": 0.0,
+             "unknown_token_rate": 0.0,
+             "unique_ratio": 0.0,
+             "repetition_rate": 0.0,
+             "max_id": 0.0,
+             "id_spread": 0.0,
+             "token_count": 0.0,
+         }
+
+     tokens, token_ids = tokenizer.encode(text)
+
+     # Compute all metrics
+     comp_ratio = compression_ratio(text, tokens)
+     chars_per_token = characters_per_token(text, tokens)
+     entropy = token_entropy(tokens)
+     unk_rate = unknown_token_rate(tokens, unknown_markers=unknown_markers)
+     vocab_stats = vocabulary_utilization(tokens, token_ids)
+
+     return {
+         "compression_ratio": comp_ratio,
+         "characters_per_token": chars_per_token,
+         "token_entropy": entropy,
+         "unknown_token_rate": unk_rate,
+         "unique_ratio": vocab_stats["unique_ratio"],
+         "repetition_rate": vocab_stats["repetition_rate"],
+         "max_id": vocab_stats["max_id"],
+         "id_spread": vocab_stats["id_spread"],
+         "token_count": float(len(tokens)),
+     }
+
+
+ __all__ = [
+     # Core metrics
+     "compression_ratio",
+     "batch_compression_ratio",
+     "characters_per_token",
+     "batch_characters_per_token",
+     "token_entropy",
+     "batch_token_entropy",
+     "vocabulary_utilization",
+     "batch_vocabulary_utilization",
+     "unknown_token_rate",
+     "batch_unknown_token_rate",
+     # Convenience
+     "analyze_tokenizer",
+     # Constants
+     "DEFAULT_UNKNOWN_MARKERS",
+ ]
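
A corresponding sketch for the tokenizer metrics module, mirroring its own docstring examples. It is illustrative only; it relies on the bundled Rust extension (loaded through glitchlings.internal.rust) and on a resolvable tokenizer such as tiktoken's o200k_base.

    # Hypothetical usage sketch; requires the compiled extension and tiktoken.
    from glitchlings.attack.tokenization import resolve_tokenizer
    from glitchlings.attack.tokenizer_metrics import analyze_tokenizer, compression_ratio

    tokenizer = resolve_tokenizer("o200k_base")
    text = "The quick brown fox jumps over the lazy dog."

    tokens, token_ids = tokenizer.encode(text)
    print(f"bytes per token: {compression_ratio(text, tokens):.2f}")

    stats = analyze_tokenizer(text, tokenizer)  # all metrics in one dict
    for key, value in sorted(stats.items()):
        print(f"{key}: {value:.4f}")
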