glitchlings-0.10.2-cp312-cp312-macosx_11_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of glitchlings might be problematic.

Files changed (83)
  1. glitchlings/__init__.py +99 -0
  2. glitchlings/__main__.py +8 -0
  3. glitchlings/_zoo_rust/__init__.py +12 -0
  4. glitchlings/_zoo_rust.cpython-312-darwin.so +0 -0
  5. glitchlings/assets/__init__.py +180 -0
  6. glitchlings/assets/apostrofae_pairs.json +32 -0
  7. glitchlings/assets/ekkokin_homophones.json +2014 -0
  8. glitchlings/assets/hokey_assets.json +193 -0
  9. glitchlings/assets/lexemes/academic.json +1049 -0
  10. glitchlings/assets/lexemes/colors.json +1333 -0
  11. glitchlings/assets/lexemes/corporate.json +716 -0
  12. glitchlings/assets/lexemes/cyberpunk.json +22 -0
  13. glitchlings/assets/lexemes/lovecraftian.json +23 -0
  14. glitchlings/assets/lexemes/synonyms.json +3354 -0
  15. glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
  16. glitchlings/assets/ocr_confusions.tsv +30 -0
  17. glitchlings/assets/pipeline_assets.json +29 -0
  18. glitchlings/attack/__init__.py +147 -0
  19. glitchlings/attack/analysis.py +1321 -0
  20. glitchlings/attack/core.py +493 -0
  21. glitchlings/attack/core_execution.py +367 -0
  22. glitchlings/attack/core_planning.py +612 -0
  23. glitchlings/attack/encode.py +114 -0
  24. glitchlings/attack/metrics.py +218 -0
  25. glitchlings/attack/metrics_dispatch.py +70 -0
  26. glitchlings/attack/tokenization.py +227 -0
  27. glitchlings/auggie.py +284 -0
  28. glitchlings/compat/__init__.py +9 -0
  29. glitchlings/compat/loaders.py +355 -0
  30. glitchlings/compat/types.py +41 -0
  31. glitchlings/conf/__init__.py +41 -0
  32. glitchlings/conf/loaders.py +331 -0
  33. glitchlings/conf/schema.py +156 -0
  34. glitchlings/conf/types.py +72 -0
  35. glitchlings/config.toml +2 -0
  36. glitchlings/constants.py +59 -0
  37. glitchlings/dev/__init__.py +3 -0
  38. glitchlings/dev/docs.py +45 -0
  39. glitchlings/dlc/__init__.py +19 -0
  40. glitchlings/dlc/_shared.py +296 -0
  41. glitchlings/dlc/gutenberg.py +400 -0
  42. glitchlings/dlc/huggingface.py +68 -0
  43. glitchlings/dlc/prime.py +215 -0
  44. glitchlings/dlc/pytorch.py +98 -0
  45. glitchlings/dlc/pytorch_lightning.py +173 -0
  46. glitchlings/internal/__init__.py +16 -0
  47. glitchlings/internal/rust.py +159 -0
  48. glitchlings/internal/rust_ffi.py +490 -0
  49. glitchlings/main.py +426 -0
  50. glitchlings/protocols.py +91 -0
  51. glitchlings/runtime_config.py +24 -0
  52. glitchlings/util/__init__.py +27 -0
  53. glitchlings/util/adapters.py +65 -0
  54. glitchlings/util/keyboards.py +356 -0
  55. glitchlings/util/transcripts.py +108 -0
  56. glitchlings/zoo/__init__.py +161 -0
  57. glitchlings/zoo/assets/__init__.py +29 -0
  58. glitchlings/zoo/core.py +678 -0
  59. glitchlings/zoo/core_execution.py +154 -0
  60. glitchlings/zoo/core_planning.py +451 -0
  61. glitchlings/zoo/corrupt_dispatch.py +295 -0
  62. glitchlings/zoo/hokey.py +139 -0
  63. glitchlings/zoo/jargoyle.py +243 -0
  64. glitchlings/zoo/mim1c.py +148 -0
  65. glitchlings/zoo/pedant/__init__.py +109 -0
  66. glitchlings/zoo/pedant/core.py +105 -0
  67. glitchlings/zoo/pedant/forms.py +74 -0
  68. glitchlings/zoo/pedant/stones.py +74 -0
  69. glitchlings/zoo/redactyl.py +97 -0
  70. glitchlings/zoo/rng.py +259 -0
  71. glitchlings/zoo/rushmore.py +416 -0
  72. glitchlings/zoo/scannequin.py +66 -0
  73. glitchlings/zoo/transforms.py +346 -0
  74. glitchlings/zoo/typogre.py +128 -0
  75. glitchlings/zoo/validation.py +477 -0
  76. glitchlings/zoo/wherewolf.py +120 -0
  77. glitchlings/zoo/zeedub.py +93 -0
  78. glitchlings-0.10.2.dist-info/METADATA +337 -0
  79. glitchlings-0.10.2.dist-info/RECORD +83 -0
  80. glitchlings-0.10.2.dist-info/WHEEL +5 -0
  81. glitchlings-0.10.2.dist-info/entry_points.txt +3 -0
  82. glitchlings-0.10.2.dist-info/licenses/LICENSE +201 -0
  83. glitchlings-0.10.2.dist-info/top_level.txt +1 -0
glitchlings/attack/encode.py
@@ -0,0 +1,114 @@
+ """Pure encoding utilities for tokenization.
+
+ This module contains pure functions for encoding text using tokenizers.
+ The functions here do not resolve tokenizers or perform IO - they operate
+ on already-resolved Tokenizer instances.
+
+ Pure guarantees:
+ - No import side effects beyond stdlib
+ - No file IO or network calls
+ - No environment variable access
+ - Deterministic output for given inputs
+
+ The impure tokenizer resolution lives in tokenization.py.
+ """
+
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING, Sequence
+
+ if TYPE_CHECKING:  # pragma: no cover - typing only
+     from .tokenization import Tokenizer
+
+
+ def encode_single(
+     tokenizer: "Tokenizer",
+     text: str,
+ ) -> tuple[list[str], list[int]]:
+     """Encode a single text string into tokens and IDs.
+
+     This is a thin wrapper that ensures list output types.
+
+     Args:
+         tokenizer: A resolved tokenizer instance.
+         text: Text to encode.
+
+     Returns:
+         Tuple of (tokens, token_ids) as lists.
+     """
+     tokens, ids = tokenizer.encode(text)
+     return list(tokens), list(ids)
+
+
+ def encode_batch(
+     tokenizer: "Tokenizer",
+     texts: Sequence[str],
+ ) -> tuple[list[list[str]], list[list[int]]]:
+     """Encode multiple texts into batched tokens and IDs.
+
+     Attempts to use the tokenizer's encode_batch method if available,
+     otherwise falls back to per-item encoding.
+
+     Args:
+         tokenizer: A resolved tokenizer instance.
+         texts: Sequence of texts to encode.
+
+     Returns:
+         Tuple of (token_batches, id_batches) as nested lists.
+     """
+     # Try batch encoding if available
+     batch_encode = getattr(tokenizer, "encode_batch", None)
+     if callable(batch_encode):
+         encoded = batch_encode(texts)
+         token_batches: list[list[str]] = []
+         id_batches: list[list[int]] = []
+         for tokens, ids in encoded:
+             token_batches.append(list(tokens))
+             id_batches.append(list(ids))
+         return token_batches, id_batches
+
+     # Fallback: encode each text individually
+     token_batches_fallback: list[list[str]] = []
+     id_batches_fallback: list[list[int]] = []
+     for entry in texts:
+         tokens, ids = encode_single(tokenizer, entry)
+         token_batches_fallback.append(tokens)
+         id_batches_fallback.append(ids)
+     return token_batches_fallback, id_batches_fallback
+
+
+ def describe_tokenizer(
+     tokenizer: "Tokenizer",
+     raw_spec: "str | Tokenizer | None",
+ ) -> str:
+     """Generate a human-readable description of a tokenizer.
+
+     Args:
+         tokenizer: The resolved tokenizer instance.
+         raw_spec: The original specification used to create/resolve the tokenizer.
+
+     Returns:
+         A descriptive string identifying the tokenizer.
+     """
+     # If the raw spec was a string, use it directly
+     if isinstance(raw_spec, str):
+         return raw_spec
+
+     # Try to get a name attribute
+     name = getattr(tokenizer, "name", None)
+     if isinstance(name, str) and name:
+         return name
+
+     # For None spec, use the class name
+     if raw_spec is None:
+         return tokenizer.__class__.__name__
+
+     # Fallback to string representation
+     return str(raw_spec)
+
+
+ __all__ = [
+     "describe_tokenizer",
+     "encode_batch",
+     "encode_single",
+ ]
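
The helpers in glitchlings/attack/encode.py above are shape-normalizing wrappers: encode_single coerces a tokenizer's output to plain lists, encode_batch prefers an encode_batch method and otherwise falls back to per-item encoding, and describe_tokenizer prefers the original string spec, then a name attribute, then the class name. A minimal usage sketch; ToyTokenizer below is a hypothetical stand-in, not part of the package:

from glitchlings.attack.encode import describe_tokenizer, encode_batch, encode_single


class ToyTokenizer:
    # Hypothetical tokenizer with only encode/decode, no encode_batch method.
    name = "toy-whitespace"

    def encode(self, text: str) -> tuple[list[str], list[int]]:
        tokens = text.split()
        return tokens, [len(t) for t in tokens]

    def decode(self, tokens):
        return " ".join(tokens)


tok = ToyTokenizer()
print(encode_single(tok, "hello world"))   # (['hello', 'world'], [5, 5])
print(encode_batch(tok, ["a b", "c"]))     # per-item fallback: ([['a', 'b'], ['c']], [[1, 1], [1]])
print(describe_tokenizer(tok, None))       # 'toy-whitespace' (the name attribute wins over the class name)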
glitchlings/attack/metrics.py
@@ -0,0 +1,218 @@
+ from __future__ import annotations
+
+ import importlib
+ from enum import Enum
+ from typing import TYPE_CHECKING, Any, Protocol, cast
+
+ from .metrics_dispatch import TokenBatch, TokenSequence, is_batch, validate_batch_consistency
+
+ if TYPE_CHECKING:
+     from collections.abc import Callable
+
+
+ class Metric(Protocol):
+     def __call__(
+         self,
+         original_tokens: TokenSequence | TokenBatch,
+         corrupted_tokens: TokenSequence | TokenBatch,
+     ) -> float | list[float]: ...
+
+
+ class BatchMetric(Protocol):
+     def __call__(self, inputs: TokenBatch, outputs: TokenBatch) -> list[float]: ...
+
+
+ try:
+     _rust: Any = importlib.import_module("glitchlings._zoo_rust")
+ except ModuleNotFoundError as exc:  # pragma: no cover - runtime guard
+     raise ImportError(
+         "Could not import compiled Rust extension. "
+         "Please ensure the project is installed with the Rust extension built."
+     ) from exc
+
+ _single_jsd = cast(Metric, getattr(_rust, "jensen_shannon_divergence"))
+ _single_ned = cast(Metric, getattr(_rust, "normalized_edit_distance"))
+ _single_sr = cast(Metric, getattr(_rust, "subsequence_retention"))
+ _single_ed = cast(Metric, getattr(_rust, "entropy_delta"))
+ _single_msi = cast(Metric, getattr(_rust, "merge_split_index"))
+ _batch_jsd = cast(BatchMetric, getattr(_rust, "batch_jensen_shannon_divergence"))
+ _batch_ned = cast(BatchMetric, getattr(_rust, "batch_normalized_edit_distance"))
+ _batch_sr = cast(BatchMetric, getattr(_rust, "batch_subsequence_retention"))
+ _batch_ed = cast(BatchMetric, getattr(_rust, "batch_entropy_delta"))
+ _batch_msi = cast(BatchMetric, getattr(_rust, "batch_merge_split_index"))
+
+
+ def _dispatch_metric(
+     original: TokenSequence | TokenBatch,
+     corrupted: TokenSequence | TokenBatch,
+     *,
+     single: Metric,
+     batch: BatchMetric,
+     name: str,
+ ) -> float | list[float]:
+     """Dispatch metric computation to single or batch implementation.
+
+     Uses the pure is_batch function to determine which implementation to call.
+     """
+     validate_batch_consistency(original, corrupted, name)
+
+     if is_batch(original):
+         return batch(original, corrupted)
+
+     return single(original, corrupted)
+
+
+ def jensen_shannon_divergence(
+     original_tokens: TokenSequence | TokenBatch,
+     corrupted_tokens: TokenSequence | TokenBatch,
+ ) -> float | list[float]:
+     return _dispatch_metric(
+         original_tokens,
+         corrupted_tokens,
+         single=_single_jsd,
+         batch=_batch_jsd,
+         name="jensen_shannon_divergence",
+     )
+
+
+ def normalized_edit_distance(
+     original_tokens: TokenSequence | TokenBatch,
+     corrupted_tokens: TokenSequence | TokenBatch,
+ ) -> float | list[float]:
+     return _dispatch_metric(
+         original_tokens,
+         corrupted_tokens,
+         single=_single_ned,
+         batch=_batch_ned,
+         name="normalized_edit_distance",
+     )
+
+
+ def subsequence_retention(
+     original_tokens: TokenSequence | TokenBatch,
+     corrupted_tokens: TokenSequence | TokenBatch,
+ ) -> float | list[float]:
+     return _dispatch_metric(
+         original_tokens,
+         corrupted_tokens,
+         single=_single_sr,
+         batch=_batch_sr,
+         name="subsequence_retention",
+     )
+
+
+ def entropy_delta(
+     original_tokens: TokenSequence | TokenBatch,
+     corrupted_tokens: TokenSequence | TokenBatch,
+ ) -> float | list[float]:
+     """Compute normalized entropy delta between original and corrupted tokens.
+
+     Measures the change in token distribution entropy:
+     ΔH = H(corrupted) - H(original), normalized to [-1, 1].
+
+     Positive values indicate the corrupted text has higher entropy
+     (more uniform/diverse token distribution). Negative values indicate
+     lower entropy (more concentrated distribution).
+
+     Args:
+         original_tokens: Original token sequence(s).
+         corrupted_tokens: Corrupted token sequence(s).
+
+     Returns:
+         Normalized entropy delta in [-1, 1], or list for batches.
+     """
+     return _dispatch_metric(
+         original_tokens,
+         corrupted_tokens,
+         single=_single_ed,
+         batch=_batch_ed,
+         name="entropy_delta",
+     )
+
+
+ def merge_split_index(
+     original_tokens: TokenSequence | TokenBatch,
+     corrupted_tokens: TokenSequence | TokenBatch,
+ ) -> float | list[float]:
+     """Compute merge-split index measuring subword restructuring.
+
+     Estimates 1→k (split) and k→1 (merge) token events from alignment.
+     Higher values indicate more dramatic tokenization changes.
+
+     MSI = (splits + merges) / max(m, n) ∈ [0, 1]
+
+     Args:
+         original_tokens: Original token sequence(s).
+         corrupted_tokens: Corrupted token sequence(s).
+
+     Returns:
+         Merge-split index in [0, 1], or list for batches.
+     """
+     return _dispatch_metric(
+         original_tokens,
+         corrupted_tokens,
+         single=_single_msi,
+         batch=_batch_msi,
+         name="merge_split_index",
+     )
+
+
+ # ---------------------------------------------------------------------------
+ # MetricName Enum
+ # ---------------------------------------------------------------------------
+
+
+ class MetricName(str, Enum):
+     """Built-in metric names.
+
+     Use these instead of string literals to avoid typos and enable IDE completion.
+
+     Example:
+         >>> attack = Attack(Typogre(), metrics={MetricName.NED: normalized_edit_distance})
+         >>> # or get all defaults:
+         >>> attack = Attack(Typogre(), metrics=MetricName.defaults())
+     """
+
+     JSD = "jensen_shannon_divergence"
+     NED = "normalized_edit_distance"
+     SR = "subsequence_retention"
+     HD = "entropy_delta"
+     MSI = "merge_split_index"
+
+     @property
+     def func(self) -> "Callable[..., float | list[float]]":
+         """Get the metric function for this name."""
+         return _METRIC_FUNCTIONS[self]
+
+     @classmethod
+     def defaults(cls) -> dict[str, "Callable[..., float | list[float]]"]:
+         """Get all built-in metrics as a dictionary.
+
+         Returns:
+             Dictionary mapping metric names to functions.
+         """
+         return {m.value: m.func for m in cls}
+
+
+ # Mapping from enum to function - populated after functions are defined
+ _METRIC_FUNCTIONS: dict[MetricName, "Callable[..., float | list[float]]"] = {
+     MetricName.JSD: jensen_shannon_divergence,
+     MetricName.NED: normalized_edit_distance,
+     MetricName.SR: subsequence_retention,
+     MetricName.HD: entropy_delta,
+     MetricName.MSI: merge_split_index,
+ }
+
+
+ __all__ = [
+     "Metric",
+     "BatchMetric",
+     "MetricName",
+     "TokenBatch",
+     "TokenSequence",
+     "jensen_shannon_divergence",
+     "normalized_edit_distance",
+     "subsequence_retention",
+     "entropy_delta",
+     "merge_split_index",
+ ]
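
Each public metric in glitchlings/attack/metrics.py dispatches on input shape: a single pair of token sequences yields a float, a pair of batches yields a list of floats, and mixed shapes raise TypeError via validate_batch_consistency. A short sketch, assuming the compiled glitchlings._zoo_rust extension is installed (the module raises ImportError at import time without it); the numeric values come from the Rust implementation, so only shapes are shown:

from glitchlings.attack.metrics import MetricName, normalized_edit_distance

single = normalized_edit_distance(["the", "cat"], ["teh", "cat"])      # float
batch = normalized_edit_distance([["the", "cat"]], [["teh", "cat"]])   # list[float] with one entry

# MetricName.defaults() maps each metric's string value to its function,
# e.g. "entropy_delta" -> entropy_delta.
metrics = MetricName.defaults()
assert set(metrics) == {m.value for m in MetricName}
assert metrics[MetricName.HD.value] is MetricName.HD.func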
glitchlings/attack/metrics_dispatch.py
@@ -0,0 +1,70 @@
+ """Pure metric dispatch functions.
+
+ This module contains pure functions for dispatching metric computations.
+ It does not import Rust FFI or perform any IO - it operates on already-
+ resolved metric functions.
+
+ Pure guarantees:
+ - No import side effects beyond stdlib
+ - No Rust FFI loading
+ - Deterministic dispatch logic
+
+ The impure Rust metric loading lives in metrics.py.
+ """
+
+ from __future__ import annotations
+
+ from typing import Sequence, TypeGuard
+
+ TokenSequence = Sequence[str]
+ TokenBatch = Sequence[TokenSequence]
+
+
+ def is_batch(tokens: TokenSequence | TokenBatch) -> TypeGuard[TokenBatch]:
+     """Determine if tokens represent a batch of sequences.
+
+     An empty list is treated as an empty batch (returning True) so that
+     ``metric([], [])`` returns ``[]`` rather than ``0.0``. This matches
+     the behavior of :meth:`Attack.run` when processing empty transcripts.
+
+     Args:
+         tokens: Either a sequence of token strings or a batch of such sequences.
+
+     Returns:
+         True if tokens is a batch (list of lists), False if a single sequence.
+     """
+     if not tokens:
+         return True  # Empty list is an empty batch
+
+     first = tokens[0]
+     return isinstance(first, Sequence) and not isinstance(first, (str, bytes))
+
+
+ def validate_batch_consistency(
+     original: TokenSequence | TokenBatch,
+     corrupted: TokenSequence | TokenBatch,
+     metric_name: str,
+ ) -> None:
+     """Validate that both inputs are consistently batched or single.
+
+     Args:
+         original: Original token sequence or batch.
+         corrupted: Corrupted token sequence or batch.
+         metric_name: Name of the metric (for error messages).
+
+     Raises:
+         TypeError: If one input is batched and the other isn't.
+     """
+     original_is_batch = is_batch(original)
+     corrupted_is_batch = is_batch(corrupted)
+
+     if original_is_batch != corrupted_is_batch:
+         raise TypeError(f"{metric_name} expects either both batch inputs or both single sequences")
+
+
+ __all__ = [
+     "TokenBatch",
+     "TokenSequence",
+     "is_batch",
+     "validate_batch_consistency",
+ ]
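
The pure helpers in glitchlings/attack/metrics_dispatch.py can be exercised without the Rust extension. A small sketch of the documented behavior; the metric name "demo" is only illustrative:

from glitchlings.attack.metrics_dispatch import is_batch, validate_batch_consistency

assert is_batch([["a", "b"], ["c"]]) is True   # sequence of sequences -> batch
assert is_batch(["a", "b"]) is False           # sequence of strings -> single sequence
assert is_batch([]) is True                    # empty input counts as an empty batch

validate_batch_consistency(["a"], ["a", "b"], "demo")   # both single: no error
try:
    validate_batch_consistency(["a"], [["a"]], "demo")  # single vs. batch
except TypeError as err:
    print(err)  # demo expects either both batch inputs or both single sequences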
glitchlings/attack/tokenization.py
@@ -0,0 +1,227 @@
+ from __future__ import annotations
+
+ import importlib.util
+ import zlib
+ from typing import Any, Protocol, Sequence
+
+ DEFAULT_TIKTOKEN_ENCODINGS = ("o200k_base", "cl100k_base")
+
+
+ class Tokenizer(Protocol):
+     def encode(self, text: str) -> tuple[list[str], list[int]]: ...
+
+     def decode(self, tokens: Sequence[str]) -> str: ...
+
+
+ class WhitespaceTokenizer:
+     def encode(self, text: str) -> tuple[list[str], list[int]]:
+         tokens = text.split()
+         # Synthetic IDs based on adler32 hash for stability
+         ids = [zlib.adler32(t.encode("utf-8")) & 0xFFFFFFFF for t in tokens]
+         return tokens, ids
+
+     def decode(self, tokens: Sequence[str]) -> str:
+         return " ".join(tokens)
+
+     def encode_batch(self, texts: Sequence[str]) -> list[tuple[list[str], list[int]]]:
+         return [self.encode(text) for text in texts]
+
+
+ class TiktokenTokenizer:
+     def __init__(self, model_name: str):
+         import tiktoken
+
+         self.name = model_name
+         try:
+             self.enc = tiktoken.get_encoding(model_name)
+         except ValueError:
+             self.enc = tiktoken.encoding_for_model(model_name)
+
+     def encode(self, text: str) -> tuple[list[str], list[int]]:
+         ids = self.enc.encode(text)
+         tokens = [
+             self.enc.decode_single_token_bytes(i).decode("utf-8", errors="replace") for i in ids
+         ]
+         return tokens, ids
+
+     def decode(self, tokens: Sequence[str], sep: str = "") -> str:
+         return sep.join(tokens)
+
+     def encode_batch(self, texts: Sequence[str]) -> list[tuple[list[str], list[int]]]:
+         id_batches = [list(batch) for batch in self.enc.encode_batch(list(texts))]
+         token_batches: list[list[str]] = []
+         for ids in id_batches:
+             token_batches.append(
+                 [
+                     self.enc.decode_single_token_bytes(i).decode("utf-8", errors="replace")
+                     for i in ids
+                 ]
+             )
+         return list(zip(token_batches, id_batches))
+
+
+ class HuggingFaceTokenizerWrapper:
+     def __init__(self, tokenizer_obj: Any, *, unknown_token: str = "[UNK]"):
+         self.tokenizer = tokenizer_obj
+         self.unknown_token = unknown_token
+
+     def encode(self, text: str) -> tuple[list[str], list[int]]:
+         # tokenizers.Tokenizer.encode returns an Encoding object
+         encoding = self.tokenizer.encode(text)
+         return encoding.tokens, encoding.ids
+
+     def decode(self, tokens: Sequence[str]) -> str:
+         # Use the tokenizer's decode method to properly handle model-specific
+         # artifacts (e.g., "##" for WordPiece, "Ġ" for BPE).
+         # Convert tokens to IDs first, then decode.
+         try:
+             token_ids = [self.tokenizer.token_to_id(token) for token in tokens]
+             # Filter out None values (tokens not in vocabulary)
+             valid_ids = [tid for tid in token_ids if tid is not None]
+             if valid_ids:
+                 result: str = self.tokenizer.decode(valid_ids)
+                 return result
+         except (AttributeError, TypeError):
+             pass
+         # Fallback: decode each token individually to handle artifacts properly
+         decoded_tokens = []
+         for token in tokens:
+             token_id = None
+             try:
+                 token_id = self.tokenizer.token_to_id(token)
+             except (AttributeError, TypeError):
+                 pass
+             if token_id is None:
+                 decoded_tokens.append(self.unknown_token)
+             else:
+                 # Decode the single token ID to properly handle artifacts
+                 try:
+                     decoded = self.tokenizer.decode([token_id])
+                     decoded_tokens.append(decoded)
+                 except (AttributeError, TypeError):
+                     # Last resort: strip common prefixes and use token as-is
+                     clean_token = token.lstrip("Ġ").lstrip("##").lstrip("▁")
+                     decoded_tokens.append(clean_token if clean_token else token)
+         return " ".join(decoded_tokens)
+
+     def encode_batch(self, texts: Sequence[str]) -> list[tuple[list[str], list[int]]]:
+         encodings = self.tokenizer.encode_batch(list(texts))
+         return [(encoding.tokens, encoding.ids) for encoding in encodings]
+
+
+ def list_available_tokenizers() -> list[str]:
+     """List tokenizer names that can be resolved.
+
+     Returns a list of known tokenizer names including:
+     - Tiktoken encodings (if tiktoken is installed)
+     - A note about HuggingFace tokenizers (if tokenizers is installed)
+     - 'whitespace' (always available)
+
+     Returns:
+         List of available tokenizer names/descriptions.
+     """
+     available: list[str] = []
+
+     if importlib.util.find_spec("tiktoken"):
+         import tiktoken
+
+         # Add known tiktoken encodings
+         for encoding in DEFAULT_TIKTOKEN_ENCODINGS:
+             try:
+                 tiktoken.get_encoding(encoding)
+                 available.append(encoding)
+             except ValueError:
+                 pass
+         # Add common model names
+         available.extend(["gpt-4", "gpt-4o", "gpt-3.5-turbo"])
+
+     if importlib.util.find_spec("tokenizers"):
+         available.append("<any HuggingFace tokenizer name>")
+
+     available.append("whitespace")
+     return available
+
+
+ def resolve_tokenizer(tokenizer: str | Tokenizer | None) -> Tokenizer:
+     """Resolve a tokenizer specification to a Tokenizer instance.
+
+     Args:
+         tokenizer: One of:
+             - None: Use default tokenizer (tiktoken o200k_base, or whitespace)
+             - str: Tokenizer name (tiktoken encoding, model name, or HF tokenizer)
+             - Tokenizer: Pass through as-is
+
+     Returns:
+         A Tokenizer instance.
+
+     Raises:
+         ValueError: If string tokenizer cannot be resolved.
+     """
+     if tokenizer is None:
+         return _default_tokenizer()
+
+     if isinstance(tokenizer, str):
+         if importlib.util.find_spec("tiktoken"):
+             import tiktoken
+
+             try:
+                 # Check if valid tiktoken encoding/model
+                 try:
+                     tiktoken.get_encoding(tokenizer)
+                     return TiktokenTokenizer(tokenizer)
+                 except ValueError:
+                     try:
+                         tiktoken.encoding_for_model(tokenizer)
+                         return TiktokenTokenizer(tokenizer)
+                     except (ValueError, KeyError):
+                         pass
+             except ImportError:
+                 pass
+
+         if importlib.util.find_spec("tokenizers"):
+             from tokenizers import Tokenizer
+
+             try:
+                 return HuggingFaceTokenizerWrapper(Tokenizer.from_pretrained(tokenizer))
+             except Exception:
+                 pass
+
+         available = list_available_tokenizers()
+         raise ValueError(
+             f"Could not resolve tokenizer: {tokenizer!r}. Available: {', '.join(available)}"
+         )
+
+     # Check if it is a HuggingFace tokenizer object
+     if importlib.util.find_spec("tokenizers"):
+         from tokenizers import Tokenizer as HFTokenizer
+
+         if isinstance(tokenizer, HFTokenizer):
+             return HuggingFaceTokenizerWrapper(tokenizer)
+
+     return tokenizer
+
+
+ def _default_tokenizer() -> Tokenizer:
+     """Select a modern, lightweight tokenizer with graceful fallbacks."""
+     if importlib.util.find_spec("tiktoken"):
+         import tiktoken
+
+         for encoding in DEFAULT_TIKTOKEN_ENCODINGS:
+             try:
+                 tiktoken.get_encoding(encoding)
+                 return TiktokenTokenizer(encoding)
+             except ValueError:
+                 continue
+
+     return WhitespaceTokenizer()
+
+
+ __all__ = [
+     "DEFAULT_TIKTOKEN_ENCODINGS",
+     "HuggingFaceTokenizerWrapper",
+     "TiktokenTokenizer",
+     "Tokenizer",
+     "WhitespaceTokenizer",
+     "list_available_tokenizers",
+     "resolve_tokenizer",
+ ]
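
Resolution order in glitchlings/attack/tokenization.py: None selects the first available default tiktoken encoding (o200k_base, then cl100k_base) and otherwise WhitespaceTokenizer; a string is tried as a tiktoken encoding, then as a tiktoken model name, then as a Hugging Face tokenizers name, raising ValueError if none match; an already-constructed tokenizer passes through (raw tokenizers.Tokenizer objects get wrapped). A brief sketch assuming only what the module above defines; the tiktoken and Hugging Face branches apply only when those optional packages are installed:

from glitchlings.attack.tokenization import WhitespaceTokenizer, resolve_tokenizer

default = resolve_tokenizer(None)   # TiktokenTokenizer("o200k_base") if tiktoken is present, else WhitespaceTokenizer()
tokens, ids = default.encode("Hello, glitchlings!")

custom = WhitespaceTokenizer()
assert resolve_tokenizer(custom) is custom   # already-resolved tokenizers pass through unchanged

try:
    resolve_tokenizer("definitely-not-a-real-tokenizer")   # hypothetical unresolvable name
except ValueError as err:
    print(err)   # lists the names from list_available_tokenizers()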