glitchlings 1.0.0__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86):
  1. glitchlings/__init__.py +101 -0
  2. glitchlings/__main__.py +8 -0
  3. glitchlings/_corruption_engine/__init__.py +12 -0
  4. glitchlings/_corruption_engine.cp313-win_amd64.pyd +0 -0
  5. glitchlings/assets/__init__.py +180 -0
  6. glitchlings/assets/apostrofae_pairs.json +32 -0
  7. glitchlings/assets/ekkokin_homophones.json +2014 -0
  8. glitchlings/assets/hokey_assets.json +193 -0
  9. glitchlings/assets/lexemes/academic.json +1049 -0
  10. glitchlings/assets/lexemes/colors.json +1333 -0
  11. glitchlings/assets/lexemes/corporate.json +716 -0
  12. glitchlings/assets/lexemes/cyberpunk.json +22 -0
  13. glitchlings/assets/lexemes/lovecraftian.json +23 -0
  14. glitchlings/assets/lexemes/synonyms.json +3354 -0
  15. glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
  16. glitchlings/assets/ocr_confusions.tsv +30 -0
  17. glitchlings/assets/pipeline_assets.json +29 -0
  18. glitchlings/attack/__init__.py +184 -0
  19. glitchlings/attack/analysis.py +1321 -0
  20. glitchlings/attack/core.py +819 -0
  21. glitchlings/attack/core_execution.py +378 -0
  22. glitchlings/attack/core_planning.py +612 -0
  23. glitchlings/attack/encode.py +114 -0
  24. glitchlings/attack/metrics.py +211 -0
  25. glitchlings/attack/metrics_dispatch.py +70 -0
  26. glitchlings/attack/tokenization.py +338 -0
  27. glitchlings/attack/tokenizer_metrics.py +373 -0
  28. glitchlings/auggie.py +285 -0
  29. glitchlings/compat/__init__.py +9 -0
  30. glitchlings/compat/loaders.py +355 -0
  31. glitchlings/compat/types.py +41 -0
  32. glitchlings/conf/__init__.py +39 -0
  33. glitchlings/conf/loaders.py +331 -0
  34. glitchlings/conf/schema.py +156 -0
  35. glitchlings/conf/types.py +72 -0
  36. glitchlings/config.toml +2 -0
  37. glitchlings/constants.py +139 -0
  38. glitchlings/dev/__init__.py +3 -0
  39. glitchlings/dev/docs.py +45 -0
  40. glitchlings/dlc/__init__.py +21 -0
  41. glitchlings/dlc/_shared.py +300 -0
  42. glitchlings/dlc/gutenberg.py +400 -0
  43. glitchlings/dlc/huggingface.py +68 -0
  44. glitchlings/dlc/langchain.py +147 -0
  45. glitchlings/dlc/nemo.py +283 -0
  46. glitchlings/dlc/prime.py +215 -0
  47. glitchlings/dlc/pytorch.py +98 -0
  48. glitchlings/dlc/pytorch_lightning.py +173 -0
  49. glitchlings/internal/__init__.py +16 -0
  50. glitchlings/internal/rust.py +159 -0
  51. glitchlings/internal/rust_ffi.py +599 -0
  52. glitchlings/main.py +426 -0
  53. glitchlings/protocols.py +91 -0
  54. glitchlings/runtime_config.py +24 -0
  55. glitchlings/util/__init__.py +41 -0
  56. glitchlings/util/adapters.py +65 -0
  57. glitchlings/util/keyboards.py +508 -0
  58. glitchlings/util/transcripts.py +108 -0
  59. glitchlings/zoo/__init__.py +161 -0
  60. glitchlings/zoo/assets/__init__.py +29 -0
  61. glitchlings/zoo/core.py +852 -0
  62. glitchlings/zoo/core_execution.py +154 -0
  63. glitchlings/zoo/core_planning.py +451 -0
  64. glitchlings/zoo/corrupt_dispatch.py +291 -0
  65. glitchlings/zoo/hokey.py +139 -0
  66. glitchlings/zoo/jargoyle.py +301 -0
  67. glitchlings/zoo/mim1c.py +269 -0
  68. glitchlings/zoo/pedant/__init__.py +109 -0
  69. glitchlings/zoo/pedant/core.py +99 -0
  70. glitchlings/zoo/pedant/forms.py +50 -0
  71. glitchlings/zoo/pedant/stones.py +83 -0
  72. glitchlings/zoo/redactyl.py +94 -0
  73. glitchlings/zoo/rng.py +280 -0
  74. glitchlings/zoo/rushmore.py +416 -0
  75. glitchlings/zoo/scannequin.py +370 -0
  76. glitchlings/zoo/transforms.py +331 -0
  77. glitchlings/zoo/typogre.py +194 -0
  78. glitchlings/zoo/validation.py +643 -0
  79. glitchlings/zoo/wherewolf.py +120 -0
  80. glitchlings/zoo/zeedub.py +165 -0
  81. glitchlings-1.0.0.dist-info/METADATA +404 -0
  82. glitchlings-1.0.0.dist-info/RECORD +86 -0
  83. glitchlings-1.0.0.dist-info/WHEEL +5 -0
  84. glitchlings-1.0.0.dist-info/entry_points.txt +3 -0
  85. glitchlings-1.0.0.dist-info/licenses/LICENSE +201 -0
  86. glitchlings-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,114 @@
1
+ """Pure encoding utilities for tokenization.
2
+
3
+ This module contains pure functions for encoding text using tokenizers.
4
+ The functions here do not resolve tokenizers or perform IO - they operate
5
+ on already-resolved Tokenizer instances.
6
+
7
+ Pure guarantees:
8
+ - No import side effects beyond stdlib
9
+ - No file IO or network calls
10
+ - No environment variable access
11
+ - Deterministic output for given inputs
12
+
13
+ The impure tokenizer resolution lives in tokenization.py.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from typing import TYPE_CHECKING, Sequence
19
+
20
+ if TYPE_CHECKING: # pragma: no cover - typing only
21
+ from .tokenization import Tokenizer
22
+
23
+
24
def encode_single(
    tokenizer: "Tokenizer",
    text: str,
) -> tuple[list[str], list[int]]:
    """Encode one text with *tokenizer*, normalising the result to lists.

    Args:
        tokenizer: An already-resolved tokenizer instance.
        text: The string to encode.

    Returns:
        A ``(tokens, token_ids)`` pair, both materialised as lists.
    """
    encoded_tokens, encoded_ids = tokenizer.encode(text)
    return list(encoded_tokens), list(encoded_ids)
41
+
42
+
43
def encode_batch(
    tokenizer: "Tokenizer",
    texts: Sequence[str],
) -> tuple[list[list[str]], list[list[int]]]:
    """Encode several texts, preferring the tokenizer's native batch path.

    When the tokenizer exposes a callable ``encode_batch`` attribute it is
    invoked once for the whole batch; otherwise each text is encoded
    individually via ``tokenizer.encode``.

    Args:
        tokenizer: An already-resolved tokenizer instance.
        texts: The strings to encode.

    Returns:
        A ``(token_batches, id_batches)`` pair of nested lists, parallel
        to ``texts``.
    """
    token_batches: list[list[str]] = []
    id_batches: list[list[int]] = []

    native = getattr(tokenizer, "encode_batch", None)
    if callable(native):
        # Native batch path: normalise every entry to list types.
        for tokens, ids in native(texts):
            token_batches.append(list(tokens))
            id_batches.append(list(ids))
        return token_batches, id_batches

    # Fallback path: one encode call per text.
    for text in texts:
        tokens, ids = tokenizer.encode(text)
        token_batches.append(list(tokens))
        id_batches.append(list(ids))
    return token_batches, id_batches
78
+
79
+
80
def describe_tokenizer(
    tokenizer: "Tokenizer",
    raw_spec: "str | Tokenizer | None",
) -> str:
    """Produce a human-readable label for a resolved tokenizer.

    Preference order: a string spec is returned as-is; otherwise a
    non-empty ``name`` attribute on the tokenizer; otherwise the class
    name when no spec was given; finally ``str(raw_spec)``.

    Args:
        tokenizer: The resolved tokenizer instance.
        raw_spec: The original specification the tokenizer came from.

    Returns:
        A descriptive string identifying the tokenizer.
    """
    if isinstance(raw_spec, str):
        # A string spec is already the best description available.
        return raw_spec

    tokenizer_name = getattr(tokenizer, "name", None)
    if isinstance(tokenizer_name, str) and tokenizer_name:
        return tokenizer_name

    if raw_spec is None:
        return type(tokenizer).__name__

    return str(raw_spec)
108
+
109
+
110
# Public API of this module, alphabetically ordered.
__all__ = [
    "describe_tokenizer",
    "encode_batch",
    "encode_single",
]
@@ -0,0 +1,211 @@
1
+ from __future__ import annotations
2
+
3
+ from enum import Enum
4
+ from typing import TYPE_CHECKING, Protocol, cast
5
+
6
+ from ..internal.rust import get_rust_operation
7
+ from .metrics_dispatch import TokenBatch, TokenSequence, is_batch, validate_batch_consistency
8
+
9
+ if TYPE_CHECKING:
10
+ from collections.abc import Callable
11
+
12
+
13
class Metric(Protocol):
    """Structural type for a metric callable.

    Implementations accept either two single token sequences (returning a
    ``float``) or two batches of sequences (returning a ``list[float]``).
    """

    def __call__(
        self,
        original_tokens: TokenSequence | TokenBatch,
        corrupted_tokens: TokenSequence | TokenBatch,
    ) -> float | list[float]: ...
19
+
20
+
21
class BatchMetric(Protocol):
    """Structural type for a batch-only metric: one score per input pair."""

    def __call__(self, inputs: TokenBatch, outputs: TokenBatch) -> list[float]: ...
23
+
24
+
25
# Rust-backed metric callables resolved through get_rust_operation when this
# module is imported.  The cast() calls only narrow the static type for the
# type checker; they are no-ops at runtime.
# NOTE(review): an earlier comment said these are "loaded on first use", but
# the get_rust_operation calls execute at import time — any laziness must
# live inside get_rust_operation itself; confirm against internal.rust.
_single_jsd = cast(Metric, get_rust_operation("jensen_shannon_divergence"))
_single_ned = cast(Metric, get_rust_operation("normalized_edit_distance"))
_single_sr = cast(Metric, get_rust_operation("subsequence_retention"))
_single_ed = cast(Metric, get_rust_operation("entropy_delta"))
_single_msi = cast(Metric, get_rust_operation("merge_split_index"))
_batch_jsd = cast(BatchMetric, get_rust_operation("batch_jensen_shannon_divergence"))
_batch_ned = cast(BatchMetric, get_rust_operation("batch_normalized_edit_distance"))
_batch_sr = cast(BatchMetric, get_rust_operation("batch_subsequence_retention"))
_batch_ed = cast(BatchMetric, get_rust_operation("batch_entropy_delta"))
_batch_msi = cast(BatchMetric, get_rust_operation("batch_merge_split_index"))
36
+
37
+
38
+ def _dispatch_metric(
39
+ original: TokenSequence | TokenBatch,
40
+ corrupted: TokenSequence | TokenBatch,
41
+ *,
42
+ single: Metric,
43
+ batch: BatchMetric,
44
+ name: str,
45
+ ) -> float | list[float]:
46
+ """Dispatch metric computation to single or batch implementation.
47
+
48
+ Uses the pure is_batch function to determine which implementation to call.
49
+ """
50
+ validate_batch_consistency(original, corrupted, name)
51
+
52
+ if is_batch(original):
53
+ return batch(original, corrupted)
54
+
55
+ return single(original, corrupted)
56
+
57
+
58
+ def jensen_shannon_divergence(
59
+ original_tokens: TokenSequence | TokenBatch,
60
+ corrupted_tokens: TokenSequence | TokenBatch,
61
+ ) -> float | list[float]:
62
+ return _dispatch_metric(
63
+ original_tokens,
64
+ corrupted_tokens,
65
+ single=_single_jsd,
66
+ batch=_batch_jsd,
67
+ name="jensen_shannon_divergence",
68
+ )
69
+
70
+
71
+ def normalized_edit_distance(
72
+ original_tokens: TokenSequence | TokenBatch,
73
+ corrupted_tokens: TokenSequence | TokenBatch,
74
+ ) -> float | list[float]:
75
+ return _dispatch_metric(
76
+ original_tokens,
77
+ corrupted_tokens,
78
+ single=_single_ned,
79
+ batch=_batch_ned,
80
+ name="normalized_edit_distance",
81
+ )
82
+
83
+
84
+ def subsequence_retention(
85
+ original_tokens: TokenSequence | TokenBatch,
86
+ corrupted_tokens: TokenSequence | TokenBatch,
87
+ ) -> float | list[float]:
88
+ return _dispatch_metric(
89
+ original_tokens,
90
+ corrupted_tokens,
91
+ single=_single_sr,
92
+ batch=_batch_sr,
93
+ name="subsequence_retention",
94
+ )
95
+
96
+
97
+ def entropy_delta(
98
+ original_tokens: TokenSequence | TokenBatch,
99
+ corrupted_tokens: TokenSequence | TokenBatch,
100
+ ) -> float | list[float]:
101
+ """Compute normalized entropy delta between original and corrupted tokens.
102
+
103
+ Measures the change in token distribution entropy:
104
+ ΔH = H(corrupted) - H(original), normalized to [-1, 1].
105
+
106
+ Positive values indicate the corrupted text has higher entropy
107
+ (more uniform/diverse token distribution). Negative values indicate
108
+ lower entropy (more concentrated distribution).
109
+
110
+ Args:
111
+ original_tokens: Original token sequence(s).
112
+ corrupted_tokens: Corrupted token sequence(s).
113
+
114
+ Returns:
115
+ Normalized entropy delta in [-1, 1], or list for batches.
116
+ """
117
+ return _dispatch_metric(
118
+ original_tokens,
119
+ corrupted_tokens,
120
+ single=_single_ed,
121
+ batch=_batch_ed,
122
+ name="entropy_delta",
123
+ )
124
+
125
+
126
+ def merge_split_index(
127
+ original_tokens: TokenSequence | TokenBatch,
128
+ corrupted_tokens: TokenSequence | TokenBatch,
129
+ ) -> float | list[float]:
130
+ """Compute merge-split index measuring subword restructuring.
131
+
132
+ Estimates 1→k (split) and k→1 (merge) token events from alignment.
133
+ Higher values indicate more dramatic tokenization changes.
134
+
135
+ MSI = (splits + merges) / max(m, n) ∈ [0, 1]
136
+
137
+ Args:
138
+ original_tokens: Original token sequence(s).
139
+ corrupted_tokens: Corrupted token sequence(s).
140
+
141
+ Returns:
142
+ Merge-split index in [0, 1], or list for batches.
143
+ """
144
+ return _dispatch_metric(
145
+ original_tokens,
146
+ corrupted_tokens,
147
+ single=_single_msi,
148
+ batch=_batch_msi,
149
+ name="merge_split_index",
150
+ )
151
+
152
+
153
+ # ---------------------------------------------------------------------------
154
+ # MetricName Enum
155
+ # ---------------------------------------------------------------------------
156
+
157
+
158
+ class MetricName(str, Enum):
159
+ """Built-in metric names.
160
+
161
+ Use these instead of string literals to avoid typos and enable IDE completion.
162
+
163
+ Example:
164
+ >>> attack = Attack(Typogre(), metrics={MetricName.NED: normalized_edit_distance})
165
+ >>> # or get all defaults:
166
+ >>> attack = Attack(Typogre(), metrics=MetricName.defaults())
167
+ """
168
+
169
+ JSD = "jensen_shannon_divergence"
170
+ NED = "normalized_edit_distance"
171
+ SR = "subsequence_retention"
172
+ HD = "entropy_delta"
173
+ MSI = "merge_split_index"
174
+
175
+ @property
176
+ def func(self) -> "Callable[..., float | list[float]]":
177
+ """Get the metric function for this name."""
178
+ return _METRIC_FUNCTIONS[self]
179
+
180
+ @classmethod
181
+ def defaults(cls) -> dict[str, "Callable[..., float | list[float]]"]:
182
+ """Get all built-in metrics as a dictionary.
183
+
184
+ Returns:
185
+ Dictionary mapping metric names to functions.
186
+ """
187
+ return {m.value: m.func for m in cls}
188
+
189
+
190
# Mapping from enum member to implementation.  Defined after the metric
# functions so every value is already bound; MetricName.func and
# MetricName.defaults() read from this table.
_METRIC_FUNCTIONS: dict[MetricName, "Callable[..., float | list[float]]"] = {
    MetricName.JSD: jensen_shannon_divergence,
    MetricName.NED: normalized_edit_distance,
    MetricName.SR: subsequence_retention,
    MetricName.HD: entropy_delta,
    MetricName.MSI: merge_split_index,
}
198
+
199
+
200
# Public API: protocols and enum first, then the metric functions.
__all__ = [
    "Metric",
    "BatchMetric",
    "MetricName",
    "TokenBatch",
    "TokenSequence",
    "jensen_shannon_divergence",
    "normalized_edit_distance",
    "subsequence_retention",
    "entropy_delta",
    "merge_split_index",
]
@@ -0,0 +1,70 @@
1
+ """Pure metric dispatch functions.
2
+
3
+ This module contains pure functions for dispatching metric computations.
4
+ It does not import Rust FFI or perform any IO - it operates on already-
5
+ resolved metric functions.
6
+
7
+ Pure guarantees:
8
+ - No import side effects beyond stdlib
9
+ - No Rust FFI loading
10
+ - Deterministic dispatch logic
11
+
12
+ The impure Rust metric loading lives in metrics.py.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from typing import Sequence, TypeGuard
18
+
19
# Type aliases shared by the metric dispatch helpers.
TokenSequence = Sequence[str]  # one tokenized text: an ordered run of token strings
TokenBatch = Sequence[TokenSequence]  # several token sequences processed together
21
+
22
+
23
+ def is_batch(tokens: TokenSequence | TokenBatch) -> TypeGuard[TokenBatch]:
24
+ """Determine if tokens represent a batch of sequences.
25
+
26
+ An empty list is treated as an empty batch (returning True) so that
27
+ ``metric([], [])`` returns ``[]`` rather than ``0.0``. This matches
28
+ the behavior of :meth:`Attack.run` when processing empty transcripts.
29
+
30
+ Args:
31
+ tokens: Either a sequence of token strings or a batch of such sequences.
32
+
33
+ Returns:
34
+ True if tokens is a batch (list of lists), False if a single sequence.
35
+ """
36
+ if not tokens:
37
+ return True # Empty list is an empty batch
38
+
39
+ first = tokens[0]
40
+ return isinstance(first, Sequence) and not isinstance(first, (str, bytes))
41
+
42
+
43
def validate_batch_consistency(
    original: TokenSequence | TokenBatch,
    corrupted: TokenSequence | TokenBatch,
    metric_name: str,
) -> None:
    """Ensure both inputs are batched, or both are single sequences.

    Args:
        original: Original token sequence or batch.
        corrupted: Corrupted token sequence or batch.
        metric_name: Metric name interpolated into the error message.

    Raises:
        TypeError: When exactly one of the two inputs is batched.
    """
    if is_batch(original) != is_batch(corrupted):
        raise TypeError(f"{metric_name} expects either both batch inputs or both single sequences")
63
+
64
+
65
# Public API: type aliases first, then the dispatch helpers.
__all__ = [
    "TokenBatch",
    "TokenSequence",
    "is_batch",
    "validate_batch_consistency",
]