glitchlings 1.0.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings/__init__.py +101 -0
- glitchlings/__main__.py +8 -0
- glitchlings/_corruption_engine/__init__.py +12 -0
- glitchlings/_corruption_engine.cp313-win_amd64.pyd +0 -0
- glitchlings/assets/__init__.py +180 -0
- glitchlings/assets/apostrofae_pairs.json +32 -0
- glitchlings/assets/ekkokin_homophones.json +2014 -0
- glitchlings/assets/hokey_assets.json +193 -0
- glitchlings/assets/lexemes/academic.json +1049 -0
- glitchlings/assets/lexemes/colors.json +1333 -0
- glitchlings/assets/lexemes/corporate.json +716 -0
- glitchlings/assets/lexemes/cyberpunk.json +22 -0
- glitchlings/assets/lexemes/lovecraftian.json +23 -0
- glitchlings/assets/lexemes/synonyms.json +3354 -0
- glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
- glitchlings/assets/ocr_confusions.tsv +30 -0
- glitchlings/assets/pipeline_assets.json +29 -0
- glitchlings/attack/__init__.py +184 -0
- glitchlings/attack/analysis.py +1321 -0
- glitchlings/attack/core.py +819 -0
- glitchlings/attack/core_execution.py +378 -0
- glitchlings/attack/core_planning.py +612 -0
- glitchlings/attack/encode.py +114 -0
- glitchlings/attack/metrics.py +211 -0
- glitchlings/attack/metrics_dispatch.py +70 -0
- glitchlings/attack/tokenization.py +338 -0
- glitchlings/attack/tokenizer_metrics.py +373 -0
- glitchlings/auggie.py +285 -0
- glitchlings/compat/__init__.py +9 -0
- glitchlings/compat/loaders.py +355 -0
- glitchlings/compat/types.py +41 -0
- glitchlings/conf/__init__.py +39 -0
- glitchlings/conf/loaders.py +331 -0
- glitchlings/conf/schema.py +156 -0
- glitchlings/conf/types.py +72 -0
- glitchlings/config.toml +2 -0
- glitchlings/constants.py +139 -0
- glitchlings/dev/__init__.py +3 -0
- glitchlings/dev/docs.py +45 -0
- glitchlings/dlc/__init__.py +21 -0
- glitchlings/dlc/_shared.py +300 -0
- glitchlings/dlc/gutenberg.py +400 -0
- glitchlings/dlc/huggingface.py +68 -0
- glitchlings/dlc/langchain.py +147 -0
- glitchlings/dlc/nemo.py +283 -0
- glitchlings/dlc/prime.py +215 -0
- glitchlings/dlc/pytorch.py +98 -0
- glitchlings/dlc/pytorch_lightning.py +173 -0
- glitchlings/internal/__init__.py +16 -0
- glitchlings/internal/rust.py +159 -0
- glitchlings/internal/rust_ffi.py +599 -0
- glitchlings/main.py +426 -0
- glitchlings/protocols.py +91 -0
- glitchlings/runtime_config.py +24 -0
- glitchlings/util/__init__.py +41 -0
- glitchlings/util/adapters.py +65 -0
- glitchlings/util/keyboards.py +508 -0
- glitchlings/util/transcripts.py +108 -0
- glitchlings/zoo/__init__.py +161 -0
- glitchlings/zoo/assets/__init__.py +29 -0
- glitchlings/zoo/core.py +852 -0
- glitchlings/zoo/core_execution.py +154 -0
- glitchlings/zoo/core_planning.py +451 -0
- glitchlings/zoo/corrupt_dispatch.py +291 -0
- glitchlings/zoo/hokey.py +139 -0
- glitchlings/zoo/jargoyle.py +301 -0
- glitchlings/zoo/mim1c.py +269 -0
- glitchlings/zoo/pedant/__init__.py +109 -0
- glitchlings/zoo/pedant/core.py +99 -0
- glitchlings/zoo/pedant/forms.py +50 -0
- glitchlings/zoo/pedant/stones.py +83 -0
- glitchlings/zoo/redactyl.py +94 -0
- glitchlings/zoo/rng.py +280 -0
- glitchlings/zoo/rushmore.py +416 -0
- glitchlings/zoo/scannequin.py +370 -0
- glitchlings/zoo/transforms.py +331 -0
- glitchlings/zoo/typogre.py +194 -0
- glitchlings/zoo/validation.py +643 -0
- glitchlings/zoo/wherewolf.py +120 -0
- glitchlings/zoo/zeedub.py +165 -0
- glitchlings-1.0.0.dist-info/METADATA +404 -0
- glitchlings-1.0.0.dist-info/RECORD +86 -0
- glitchlings-1.0.0.dist-info/WHEEL +5 -0
- glitchlings-1.0.0.dist-info/entry_points.txt +3 -0
- glitchlings-1.0.0.dist-info/licenses/LICENSE +201 -0
- glitchlings-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
"""Jargoyle glitchling: Dictionary-based word drift.
|
|
2
|
+
|
|
3
|
+
Jargoyle swaps words with alternatives from bundled lexeme dictionaries.
|
|
4
|
+
Multiple dictionaries are supported:
|
|
5
|
+
- "colors": Color term swapping
|
|
6
|
+
- "synonyms": General synonym substitution
|
|
7
|
+
- "corporate": Business jargon alternatives
|
|
8
|
+
- "academic": Scholarly word substitutions
|
|
9
|
+
- "cyberpunk": Neon cyberpunk slang and gadgetry
|
|
10
|
+
- "lovecraftian": Cosmic horror terminology
|
|
11
|
+
You can also drop additional dictionaries into ``assets/lexemes`` to make
|
|
12
|
+
them available without modifying the code. The backend discovers any
|
|
13
|
+
``*.json`` file in that directory at runtime.
|
|
14
|
+
|
|
15
|
+
Two modes are available:
|
|
16
|
+
- "literal": First entry in each word's alternatives (deterministic mapping)
|
|
17
|
+
- "drift": Random selection from alternatives (probabilistic)
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import os
|
|
23
|
+
import random
|
|
24
|
+
from importlib import resources
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
from typing import Any, Literal, cast
|
|
27
|
+
|
|
28
|
+
from glitchlings.constants import DEFAULT_JARGOYLE_RATE
|
|
29
|
+
from glitchlings.internal.rust_ffi import (
|
|
30
|
+
is_bundled_lexeme_rust,
|
|
31
|
+
list_bundled_lexeme_dictionaries_rust,
|
|
32
|
+
list_lexeme_dictionaries_rust,
|
|
33
|
+
resolve_seed,
|
|
34
|
+
substitute_lexeme_rust,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
from .core import AttackOrder, AttackWave, Glitchling, PipelineOperationPayload
|
|
38
|
+
|
|
39
|
+
_LEXEME_ENV_VAR = "GLITCHLINGS_LEXEME_DIR"
|
|
40
|
+
_lexeme_directory_configured = False
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _configure_lexeme_directory() -> Path | None:
    """Publish the packaged lexeme directory through ``GLITCHLINGS_LEXEME_DIR``.

    The work is attempted at most once per process: once the module-level
    ``_lexeme_directory_configured`` flag is set, every later call returns
    ``None`` immediately.

    Returns:
        The resolved directory when this call located it on disk, otherwise
        ``None`` (already configured, package missing, or not a directory).
    """
    global _lexeme_directory_configured
    if _lexeme_directory_configured:
        return None

    located: Path | None = None

    try:
        package_root = resources.files("glitchlings.assets.lexemes")
    except (ModuleNotFoundError, AttributeError):
        package_root = None

    if package_root is not None:
        try:
            with resources.as_file(package_root) as materialised:
                candidate = Path(materialised)
        except FileNotFoundError:
            candidate = None
        if candidate is not None and candidate.is_dir():
            located = candidate

    if located is not None:
        # setdefault keeps any value the user already exported.
        os.environ.setdefault(_LEXEME_ENV_VAR, str(located))

    _lexeme_directory_configured = True
    return located
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# NOTE: We intentionally do NOT call _configure_lexeme_directory() at module load.
|
|
77
|
+
# Built-in lexemes are embedded in the Rust binary and require no file I/O.
|
|
78
|
+
# The directory configuration is only needed for custom lexeme discovery.
|
|
79
|
+
|
|
80
|
+
DEFAULT_LEXEMES = "synonyms"
|
|
81
|
+
|
|
82
|
+
# Valid modes
|
|
83
|
+
JargoyleMode = Literal["literal", "drift"]
|
|
84
|
+
VALID_MODES = ("literal", "drift")
|
|
85
|
+
DEFAULT_MODE: JargoyleMode = "drift"
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _bundled_lexemes() -> list[str]:
    """Return the bundled dictionary names, lower-cased, de-duplicated and sorted."""
    unique_names = set()
    for raw_name in list_bundled_lexeme_dictionaries_rust():
        unique_names.add(raw_name.lower())
    return sorted(unique_names)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _available_lexemes() -> list[str]:
    """Return every dictionary name the backend reports, lower-cased, unique and sorted."""
    unique_names = set()
    for raw_name in list_lexeme_dictionaries_rust():
        unique_names.add(raw_name.lower())
    return sorted(unique_names)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _validate_lexemes(name: str) -> str:
    """Lower-case ``name`` and confirm it refers to a known lexeme dictionary.

    Bundled names are accepted straight away. Any other name triggers the
    one-time lexeme directory configuration and is then checked against the
    full list reported by the backend.

    Raises:
        ValueError: If the name matches no available dictionary.
    """
    key = name.lower()

    if not is_bundled_lexeme_rust(key):
        # Not bundled: make custom dictionaries discoverable, then look again.
        _configure_lexeme_directory()
        known = _available_lexemes()
        if key not in known:
            options = ", ".join(known)
            raise ValueError(f"Invalid lexemes '{name}'. Must be one of: {options}")

    return key
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _validate_mode(mode: JargoyleMode | str) -> JargoyleMode:
    """Lower-case ``mode`` and ensure it is one of ``VALID_MODES``.

    Raises:
        ValueError: If the lower-cased value is not a valid mode.
    """
    candidate = mode.lower()
    if candidate in VALID_MODES:
        return cast(JargoyleMode, candidate)
    choices = ", ".join(VALID_MODES)
    raise ValueError(f"Invalid mode '{mode}'. Must be one of: {choices}")
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
VALID_LEXEMES = tuple(_bundled_lexemes())
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def list_lexeme_dictionaries() -> list[str]:
    """List every lexeme dictionary usable with Jargoyle.

    The lexeme directory is configured first so that custom dictionaries are
    visible to the backend alongside the bundled ones.

    Returns:
        Sorted, lower-cased dictionary names.
    """
    _configure_lexeme_directory()
    names = _available_lexemes()
    return names
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def list_bundled_lexeme_dictionaries() -> list[str]:
    """List only the built-in lexeme dictionaries.

    Unlike :func:`list_lexeme_dictionaries`, this never configures the lexeme
    directory; it simply asks the backend for its bundled names.

    Returns:
        Sorted, lower-cased names of the bundled dictionaries.
    """
    names = _bundled_lexemes()
    return names
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def jargoyle_drift(
    text: str,
    *,
    lexemes: str = DEFAULT_LEXEMES,
    mode: JargoyleMode = DEFAULT_MODE,
    rate: float | None = None,
    seed: int | None = None,
    rng: random.Random | None = None,
) -> str:
    """Substitute words in ``text`` using a lexeme dictionary.

    Args:
        text: Text to transform.
        lexemes: Dictionary name; validated and lower-cased before use.
        mode: ``"literal"`` or ``"drift"``; validated and lower-cased.
        rate: Per-word substitution probability. ``None`` selects
            ``DEFAULT_JARGOYLE_RATE``; any other value is converted to float.
        seed: Seed forwarded to ``resolve_seed`` in drift mode only.
        rng: Random generator forwarded to ``resolve_seed`` in drift mode only.

    Returns:
        The text produced by the Rust substitution routine.

    Raises:
        ValueError: If ``lexemes`` or ``mode`` fails validation.
    """
    dictionary = _validate_lexemes(lexemes)
    chosen_mode = _validate_mode(mode)

    if rate is None:
        probability = DEFAULT_JARGOYLE_RATE
    else:
        probability = float(rate)

    # Only drift mode resolves a seed; literal mode passes None to the backend.
    if chosen_mode == "drift":
        engine_seed = resolve_seed(seed, rng)
    else:
        engine_seed = None

    return substitute_lexeme_rust(text, dictionary, chosen_mode, probability, engine_seed)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
class Jargoyle(Glitchling):
    """Word-level glitchling that substitutes words via lexeme dictionaries.

    Construction validates the dictionary name and the mode, then registers
    :func:`jargoyle_drift` as the corruption function with the base class.

    Dictionaries named by this module's documentation: ``colors``,
    ``synonyms``, ``corporate``, ``academic``, ``cyberpunk`` and
    ``lovecraftian``, plus any custom ``*.json`` dictionary discovered in the
    lexeme directory.

    Modes:

    - ``"literal"``: use the first entry of each word's alternatives.
    - ``"drift"``: choose randomly among the alternatives.

    Typical usage::

        from glitchlings import Jargoyle

        swapper = Jargoyle(lexemes="colors", mode="literal")
        swapper("The red balloon floated away.")

    The resulting text depends on the selected dictionary's contents (and,
    in drift mode, on the seed).
    """

    flavor = "Oh no... The worst person you know just bought a thesaurus..."

    def __init__(
        self,
        *,
        lexemes: str = DEFAULT_LEXEMES,
        mode: JargoyleMode = DEFAULT_MODE,
        rate: float | None = None,
        seed: int | None = None,
        **kwargs: Any,
    ) -> None:
        """Validate the configuration and register it with the base class.

        Args:
            lexemes: Dictionary name. ``list_lexeme_dictionaries()`` returns
                the names currently available, custom files included.
            mode: ``"literal"`` or ``"drift"``.
            rate: Per-word substitution probability; ``None`` selects
                ``DEFAULT_JARGOYLE_RATE``.
            seed: Seed for deterministic randomness.
            **kwargs: Extra keyword arguments passed through to ``Glitchling``.

        Raises:
            ValueError: If ``lexemes`` or ``mode`` fails validation.
        """
        dictionary = _validate_lexemes(lexemes)
        chosen_mode = _validate_mode(mode)

        if rate is None:
            probability = DEFAULT_JARGOYLE_RATE
        else:
            probability = rate

        super().__init__(
            name="Jargoyle",
            corruption_function=jargoyle_drift,
            scope=AttackWave.WORD,
            order=AttackOrder.NORMAL,
            seed=seed,
            lexemes=dictionary,
            mode=chosen_mode,
            rate=probability,
            **kwargs,
        )
        # The base class keeps the seed separately, but jargoyle_drift takes
        # it as a keyword argument, so mirror it into kwargs as well.
        self.kwargs["seed"] = seed

    def pipeline_operation(self) -> PipelineOperationPayload:
        """Describe this glitchling as a ``"jargoyle"`` pipeline operation.

        Values missing from ``self.kwargs`` fall back to the module defaults.
        """
        stored_lexemes = self.kwargs.get("lexemes", DEFAULT_LEXEMES)
        stored_mode = self.kwargs.get("mode", DEFAULT_MODE)
        stored_rate = self.kwargs.get("rate", DEFAULT_JARGOYLE_RATE)

        descriptor = {
            "type": "jargoyle",
            "lexemes": str(stored_lexemes),
            "mode": str(stored_mode),
            "rate": float(stored_rate),
        }
        return cast(PipelineOperationPayload, descriptor)
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
# Module-level singleton for convenience
|
|
287
|
+
jargoyle = Jargoyle()
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
__all__ = [
|
|
291
|
+
"DEFAULT_LEXEMES",
|
|
292
|
+
"DEFAULT_MODE",
|
|
293
|
+
"Jargoyle",
|
|
294
|
+
"JargoyleMode",
|
|
295
|
+
"VALID_LEXEMES",
|
|
296
|
+
"VALID_MODES",
|
|
297
|
+
"jargoyle",
|
|
298
|
+
"jargoyle_drift",
|
|
299
|
+
"list_bundled_lexeme_dictionaries",
|
|
300
|
+
"list_lexeme_dictionaries",
|
|
301
|
+
]
|
glitchlings/zoo/mim1c.py
ADDED
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
"""Rust-backed Mim1c glitchling that swaps characters for homoglyphs.
|
|
2
|
+
|
|
3
|
+
The Mim1c glitchling replaces characters with visually similar confusable
|
|
4
|
+
characters (homoglyphs) based on Unicode Technical Standard #39.
|
|
5
|
+
|
|
6
|
+
## Modes
|
|
7
|
+
|
|
8
|
+
- **single_script** (safest): Only substitute within the same script
|
|
9
|
+
(Latin→Latin variants). Minimal visual disruption.
|
|
10
|
+
- **mixed_script** (default): Allow visually similar cross-script substitutions
|
|
11
|
+
(Latin↔Cyrillic↔Greek). Maximum visual similarity with some mixed scripts.
|
|
12
|
+
- **compatibility**: Include Unicode compatibility variants
|
|
13
|
+
(fullwidth, math alphanumerics). Wider range of substitutions.
|
|
14
|
+
- **aggressive**: All of the above combined. Most aggressive substitution.
|
|
15
|
+
|
|
16
|
+
## Locality Control
|
|
17
|
+
|
|
18
|
+
`max_consecutive` limits how many adjacent characters can be substituted,
|
|
19
|
+
preventing the "ransom note" effect where every character is from a different
|
|
20
|
+
script. Default is 3.
|
|
21
|
+
|
|
22
|
+
## Data Source
|
|
23
|
+
|
|
24
|
+
Confusable mappings derived from Unicode Technical Standard #39 (confusables.txt).
|
|
25
|
+
|
|
26
|
+
## References
|
|
27
|
+
|
|
28
|
+
- **Unicode Technical Standard #39**: Unicode Security Mechanisms
|
|
29
|
+
- https://www.unicode.org/reports/tr39/
|
|
30
|
+
- **confusables.txt**: Official confusable character mappings
|
|
31
|
+
- https://www.unicode.org/Public/security/latest/confusables.txt
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
from __future__ import annotations
|
|
35
|
+
|
|
36
|
+
import random
|
|
37
|
+
from collections.abc import Collection, Iterable
|
|
38
|
+
from typing import Any, Literal, cast
|
|
39
|
+
|
|
40
|
+
from glitchlings.constants import (
|
|
41
|
+
DEFAULT_MIM1C_MAX_CONSECUTIVE,
|
|
42
|
+
DEFAULT_MIM1C_MODE,
|
|
43
|
+
DEFAULT_MIM1C_RATE,
|
|
44
|
+
MIM1C_DEFAULT_CLASSES,
|
|
45
|
+
)
|
|
46
|
+
from glitchlings.internal.rust_ffi import resolve_seed, swap_homoglyphs_rust
|
|
47
|
+
|
|
48
|
+
from .core import AttackOrder, AttackWave, Glitchling, PipelineOperationPayload
|
|
49
|
+
from .validation import normalize_mim1c_max_consecutive, normalize_mim1c_mode
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _normalise_classes(
|
|
53
|
+
value: object,
|
|
54
|
+
) -> tuple[str, ...] | Literal["all"] | None:
|
|
55
|
+
if value is None:
|
|
56
|
+
return None
|
|
57
|
+
if isinstance(value, str):
|
|
58
|
+
if value.lower() == "all":
|
|
59
|
+
return "all"
|
|
60
|
+
return (value,)
|
|
61
|
+
if isinstance(value, Iterable):
|
|
62
|
+
return tuple(str(item) for item in value)
|
|
63
|
+
raise TypeError("classes must be an iterable of strings or 'all'")
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _normalise_banned(value: object) -> tuple[str, ...] | None:
|
|
67
|
+
if value is None:
|
|
68
|
+
return None
|
|
69
|
+
if isinstance(value, str):
|
|
70
|
+
return tuple(value)
|
|
71
|
+
if isinstance(value, Iterable):
|
|
72
|
+
return tuple(str(item) for item in value)
|
|
73
|
+
raise TypeError("banned_characters must be an iterable of strings")
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _serialise_classes(
|
|
77
|
+
value: tuple[str, ...] | Literal["all"] | None,
|
|
78
|
+
) -> list[str] | Literal["all"] | None:
|
|
79
|
+
if value is None:
|
|
80
|
+
return None
|
|
81
|
+
if value == "all":
|
|
82
|
+
return "all"
|
|
83
|
+
return list(value)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _serialise_banned(value: tuple[str, ...] | None) -> list[str] | None:
|
|
87
|
+
if value is None:
|
|
88
|
+
return None
|
|
89
|
+
return list(value)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
HomoglyphMode = Literal["single_script", "mixed_script", "compatibility", "aggressive"]
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def swap_homoglyphs(
    text: str,
    rate: float | None = None,
    classes: list[str] | Literal["all"] | None = None,
    banned_characters: Collection[str] | None = None,
    seed: int | None = None,
    rng: random.Random | None = None,
    mode: HomoglyphMode | None = None,
    max_consecutive: int | None = None,
) -> str:
    """Swap characters for visually confusable homoglyphs via the Rust engine.

    Args:
        text: Text to transform.
        rate: Per-character substitution probability; ``None`` selects
            ``DEFAULT_MIM1C_RATE``.
        classes: Script classes to draw from, or ``"all"``; ``None`` selects
            ``MIM1C_DEFAULT_CLASSES``.
        banned_characters: Characters that must never be used as substitutes.
        seed: Random seed for deterministic behaviour.
        rng: ``random.Random`` instance, as an alternative to ``seed``.
        mode: One of ``"single_script"``, ``"mixed_script"``,
            ``"compatibility"`` or ``"aggressive"``; normalised against
            ``DEFAULT_MIM1C_MODE``.
        max_consecutive: Cap on adjacent substitutions, normalised against
            ``DEFAULT_MIM1C_MAX_CONSECUTIVE``. Documented upstream as 0
            meaning unlimited; that is handled by the backend, not here.

    Returns:
        The text returned by the Rust homoglyph routine.
    """
    if rate is None:
        probability = DEFAULT_MIM1C_RATE
    else:
        probability = rate
    chosen_mode = normalize_mim1c_mode(mode, DEFAULT_MIM1C_MODE)
    run_limit = normalize_mim1c_max_consecutive(max_consecutive, DEFAULT_MIM1C_MAX_CONSECUTIVE)

    class_selection = _normalise_classes(classes)
    banned_selection = _normalise_banned(banned_characters)

    class_payload: list[str] | Literal["all"] | None
    if class_selection is not None:
        class_payload = _serialise_classes(class_selection)
    else:
        # No explicit classes: fall back to the package-wide default set.
        class_payload = list(MIM1C_DEFAULT_CLASSES)
    banned_payload = _serialise_banned(banned_selection)

    return swap_homoglyphs_rust(
        text,
        probability,
        class_payload,
        banned_payload,
        resolve_seed(seed, rng),
        chosen_mode,
        run_limit,
    )
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class Mim1c(Glitchling):
    """Character-level glitchling that swaps characters for homoglyphs.

    Construction normalises every option and registers
    :func:`swap_homoglyphs` as the corruption function, scheduled with
    ``AttackOrder.LAST``.

    Modes (as described by this module's documentation):

    - ``single_script``: substitutions stay within one script.
    - ``mixed_script``: cross-script look-alikes (Latin, Cyrillic, Greek).
    - ``compatibility``: Unicode compatibility variants such as fullwidth
      forms and math alphanumerics.
    - ``aggressive``: all of the above combined.

    Args:
        rate: Per-character substitution probability; ``None`` selects
            ``DEFAULT_MIM1C_RATE``.
        classes: Script classes to include, or ``"all"``.
        banned_characters: Characters never to use as substitutes.
        mode: Substitution mode; normalised against ``DEFAULT_MIM1C_MODE``.
        max_consecutive: Cap on adjacent substitutions; normalised against
            ``DEFAULT_MIM1C_MAX_CONSECUTIVE``.
        seed: Random seed for deterministic behaviour.
    """

    flavor = (
        "Breaks your parser by replacing some characters in strings with "
        "doppelgangers. Don't worry, this text is clean. ;)"
    )

    def __init__(
        self,
        *,
        rate: float | None = None,
        classes: list[str] | Literal["all"] | None = None,
        banned_characters: Collection[str] | None = None,
        mode: HomoglyphMode | None = None,
        max_consecutive: int | None = None,
        seed: int | None = None,
        **kwargs: Any,
    ) -> None:
        """Normalise the options and hand them to the ``Glitchling`` base."""
        if rate is None:
            probability = DEFAULT_MIM1C_RATE
        else:
            probability = rate
        chosen_mode = normalize_mim1c_mode(mode, DEFAULT_MIM1C_MODE)
        run_limit = normalize_mim1c_max_consecutive(
            max_consecutive, DEFAULT_MIM1C_MAX_CONSECUTIVE
        )
        class_selection = _normalise_classes(classes)
        banned_selection = _normalise_banned(banned_characters)

        super().__init__(
            name="Mim1c",
            corruption_function=swap_homoglyphs,
            scope=AttackWave.CHARACTER,
            order=AttackOrder.LAST,
            seed=seed,
            rate=probability,
            classes=class_selection,
            banned_characters=banned_selection,
            mode=chosen_mode,
            max_consecutive=run_limit,
            **kwargs,
        )

    def pipeline_operation(self) -> PipelineOperationPayload:
        """Describe this glitchling as a ``"mimic"`` pipeline operation.

        Optional keys are emitted only when a value is stored; an empty
        banned-character list is omitted as well.
        """
        stored_rate = self.kwargs.get("rate")
        payload: dict[str, object] = {
            "type": "mimic",
            "rate": DEFAULT_MIM1C_RATE if stored_rate is None else float(stored_rate),
        }

        class_payload = _serialise_classes(self.kwargs.get("classes"))
        if class_payload is not None:
            payload["classes"] = class_payload

        banned_payload = _serialise_banned(self.kwargs.get("banned_characters"))
        if banned_payload:
            payload["banned_characters"] = banned_payload

        stored_mode = self.kwargs.get("mode")
        if stored_mode is not None:
            payload["mode"] = str(stored_mode)

        stored_limit = self.kwargs.get("max_consecutive")
        if stored_limit is not None:
            payload["max_consecutive"] = int(stored_limit)

        return cast(PipelineOperationPayload, payload)

    def set_param(self, key: str, value: object) -> None:
        """Set a parameter, normalising the Mim1c-specific ones first.

        ``classes``, ``banned_characters``, ``mode`` and ``max_consecutive``
        go through the same normalisers used at construction; every other key
        is forwarded to the base class unchanged.
        """
        coerced: object
        if key == "classes":
            coerced = _normalise_classes(value)
        elif key == "banned_characters":
            coerced = _normalise_banned(value)
        elif key == "mode":
            # Falsy values (None, "") are passed to the normaliser as None.
            coerced = normalize_mim1c_mode(str(value) if value else None)
        elif key == "max_consecutive":
            limit: int | None = None if value is None else int(cast(Any, value))
            coerced = normalize_mim1c_max_consecutive(limit)
        else:
            coerced = value
        super().set_param(key, coerced)
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
mim1c = Mim1c()
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
__all__ = ["Mim1c", "mim1c", "swap_homoglyphs", "HomoglyphMode"]
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""Pedant glitchling integrating grammar evolutions with Rust acceleration."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import random
|
|
6
|
+
from typing import Any, cast
|
|
7
|
+
|
|
8
|
+
from glitchlings.internal.rust_ffi import resolve_seed
|
|
9
|
+
|
|
10
|
+
from ..core import AttackOrder, AttackWave, Glitchling, PipelineOperationPayload
|
|
11
|
+
from .core import EVOLUTIONS, PedantBase, apply_pedant
|
|
12
|
+
from .stones import STONES, PedantStone
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _coerce_stone(value: Any) -> PedantStone:
    """Convert ``value`` to a :class:`PedantStone` via ``PedantStone.from_value``."""
    stone = PedantStone.from_value(value)
    return stone
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def pedant_transform(
    text: str,
    *,
    stone: PedantStone | str = PedantStone.HYPERCORRECTITE,
    seed: int | None = None,
    rng: random.Random | None = None,
) -> str:
    """Run the pedant evolution selected by ``stone`` over ``text``.

    Args:
        text: Text to transform.
        stone: Stone (or a value coercible to one) choosing the evolution.
        seed: Seed forwarded to ``resolve_seed``.
        rng: Random generator forwarded to ``resolve_seed``.

    Returns:
        The text returned by ``apply_pedant``.

    Raises:
        ValueError: If the coerced stone has no entry in ``EVOLUTIONS``.
    """
    resolved_stone = _coerce_stone(stone)
    if resolved_stone in EVOLUTIONS:
        return apply_pedant(
            text,
            stone=resolved_stone,
            seed=resolve_seed(seed, rng),
        )
    raise ValueError(f"Unknown pedant stone: {stone!r}")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _build_pipeline_descriptor(glitch: Glitchling) -> PipelineOperationPayload:
    """Build the ``"pedant"`` pipeline descriptor from a glitchling's kwargs.

    Raises:
        RuntimeError: If the glitchling has no ``stone`` configured.
    """
    configured = glitch.kwargs.get("stone")
    if configured is None:
        raise RuntimeError("Pedant requires a stone to build the pipeline descriptor")

    descriptor = {"type": "pedant", "stone": _coerce_stone(configured).label}
    return cast(PipelineOperationPayload, descriptor)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class Pedant(Glitchling):
    """Word-level glitchling that applies the pedant evolution chosen by a stone.

    :func:`pedant_transform` is registered as the corruption function and the
    glitchling is scheduled with ``AttackOrder.LATE``.
    """

    # Alternative parameter names that refer to the ``stone`` parameter.
    _param_aliases = {
        "form": "stone",
        "stone_name": "stone",
    }

    def __init__(
        self,
        *,
        stone: PedantStone | str = PedantStone.HYPERCORRECTITE,
        seed: int | None = None,
        **kwargs: Any,
    ) -> None:
        """Coerce ``stone`` and register the configuration with the base class.

        Args:
            stone: Stone (or a value coercible to one) selecting the evolution.
            seed: Optional seed; when given it is also stored as a parameter.
            **kwargs: Extra keyword arguments passed through to ``Glitchling``.
        """
        resolved_stone = _coerce_stone(stone)
        super().__init__(
            name="Pedant",
            corruption_function=pedant_transform,
            scope=AttackWave.WORD,
            order=AttackOrder.LATE,
            seed=seed,
            pipeline_operation=_build_pipeline_descriptor,
            stone=resolved_stone,
            **kwargs,
        )
        if seed is None:
            return
        self.set_param("seed", int(seed))

    def set_param(self, key: str, value: object) -> None:
        """Set a parameter, coercing stone values (and their aliases) first."""
        is_stone_key = key in {"stone", "form", "stone_name"}
        super().set_param(key, _coerce_stone(value) if is_stone_key else value)

    def reset_rng(self, seed: int | None = None) -> None:
        """Reset the RNG, then keep ``kwargs["seed"]`` in step with ``self.seed``."""
        super().reset_rng(seed)
        if self.seed is not None:
            self.kwargs["seed"] = int(self.seed)
        else:
            # No seed in effect: drop any stale stored value.
            self.kwargs.pop("seed", None)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
pedant = Pedant()
|
|
100
|
+
|
|
101
|
+
__all__ = [
|
|
102
|
+
"PedantBase",
|
|
103
|
+
"Pedant",
|
|
104
|
+
"pedant",
|
|
105
|
+
"pedant_transform",
|
|
106
|
+
"EVOLUTIONS",
|
|
107
|
+
"STONES",
|
|
108
|
+
"PedantStone",
|
|
109
|
+
]
|