glitchlings 0.2.5__cp312-cp312-win_amd64.whl → 0.9.3__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings/__init__.py +36 -17
- glitchlings/__main__.py +0 -1
- glitchlings/_zoo_rust/__init__.py +12 -0
- glitchlings/_zoo_rust.cp312-win_amd64.pyd +0 -0
- glitchlings/assets/__init__.py +180 -0
- glitchlings/assets/apostrofae_pairs.json +32 -0
- glitchlings/assets/ekkokin_homophones.json +2014 -0
- glitchlings/assets/hokey_assets.json +193 -0
- glitchlings/assets/lexemes/academic.json +1049 -0
- glitchlings/assets/lexemes/colors.json +1333 -0
- glitchlings/assets/lexemes/corporate.json +716 -0
- glitchlings/assets/lexemes/cyberpunk.json +22 -0
- glitchlings/assets/lexemes/lovecraftian.json +23 -0
- glitchlings/assets/lexemes/synonyms.json +3354 -0
- glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
- glitchlings/assets/pipeline_assets.json +29 -0
- glitchlings/attack/__init__.py +53 -0
- glitchlings/attack/compose.py +299 -0
- glitchlings/attack/core.py +465 -0
- glitchlings/attack/encode.py +114 -0
- glitchlings/attack/metrics.py +104 -0
- glitchlings/attack/metrics_dispatch.py +70 -0
- glitchlings/attack/tokenization.py +157 -0
- glitchlings/auggie.py +283 -0
- glitchlings/compat/__init__.py +9 -0
- glitchlings/compat/loaders.py +355 -0
- glitchlings/compat/types.py +41 -0
- glitchlings/conf/__init__.py +41 -0
- glitchlings/conf/loaders.py +331 -0
- glitchlings/conf/schema.py +156 -0
- glitchlings/conf/types.py +72 -0
- glitchlings/config.toml +2 -0
- glitchlings/constants.py +59 -0
- glitchlings/dev/__init__.py +3 -0
- glitchlings/dev/docs.py +45 -0
- glitchlings/dlc/__init__.py +17 -3
- glitchlings/dlc/_shared.py +296 -0
- glitchlings/dlc/gutenberg.py +400 -0
- glitchlings/dlc/huggingface.py +37 -65
- glitchlings/dlc/prime.py +55 -114
- glitchlings/dlc/pytorch.py +98 -0
- glitchlings/dlc/pytorch_lightning.py +173 -0
- glitchlings/internal/__init__.py +16 -0
- glitchlings/internal/rust.py +159 -0
- glitchlings/internal/rust_ffi.py +432 -0
- glitchlings/main.py +123 -32
- glitchlings/runtime_config.py +24 -0
- glitchlings/util/__init__.py +29 -176
- glitchlings/util/adapters.py +65 -0
- glitchlings/util/keyboards.py +311 -0
- glitchlings/util/transcripts.py +108 -0
- glitchlings/zoo/__init__.py +47 -24
- glitchlings/zoo/assets/__init__.py +29 -0
- glitchlings/zoo/core.py +301 -167
- glitchlings/zoo/core_execution.py +98 -0
- glitchlings/zoo/core_planning.py +451 -0
- glitchlings/zoo/corrupt_dispatch.py +295 -0
- glitchlings/zoo/ekkokin.py +118 -0
- glitchlings/zoo/hokey.py +137 -0
- glitchlings/zoo/jargoyle.py +179 -274
- glitchlings/zoo/mim1c.py +106 -68
- glitchlings/zoo/pedant/__init__.py +107 -0
- glitchlings/zoo/pedant/core.py +105 -0
- glitchlings/zoo/pedant/forms.py +74 -0
- glitchlings/zoo/pedant/stones.py +74 -0
- glitchlings/zoo/redactyl.py +44 -175
- glitchlings/zoo/rng.py +259 -0
- glitchlings/zoo/rushmore.py +359 -116
- glitchlings/zoo/scannequin.py +18 -125
- glitchlings/zoo/transforms.py +386 -0
- glitchlings/zoo/typogre.py +76 -162
- glitchlings/zoo/validation.py +477 -0
- glitchlings/zoo/zeedub.py +33 -86
- glitchlings-0.9.3.dist-info/METADATA +334 -0
- glitchlings-0.9.3.dist-info/RECORD +80 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/entry_points.txt +1 -0
- glitchlings/zoo/_ocr_confusions.py +0 -34
- glitchlings/zoo/_rate.py +0 -21
- glitchlings/zoo/reduple.py +0 -169
- glitchlings-0.2.5.dist-info/METADATA +0 -490
- glitchlings-0.2.5.dist-info/RECORD +0 -27
- /glitchlings/{zoo → assets}/ocr_confusions.tsv +0 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/WHEEL +0 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/top_level.txt +0 -0
glitchlings/zoo/jargoyle.py
CHANGED
|
@@ -1,336 +1,241 @@
|
|
|
1
|
-
|
|
2
|
-
import re
|
|
3
|
-
from collections.abc import Iterable
|
|
4
|
-
from dataclasses import dataclass
|
|
5
|
-
from typing import TYPE_CHECKING, Any, Literal, cast
|
|
6
|
-
|
|
7
|
-
try: # pragma: no cover - exercised in environments with NLTK installed
|
|
8
|
-
import nltk # type: ignore[import]
|
|
9
|
-
except ModuleNotFoundError as exc: # pragma: no cover - triggered when NLTK missing
|
|
10
|
-
nltk = None # type: ignore[assignment]
|
|
11
|
-
find = None # type: ignore[assignment]
|
|
12
|
-
_NLTK_IMPORT_ERROR = exc
|
|
13
|
-
else: # pragma: no cover - executed when NLTK is available
|
|
14
|
-
from nltk.corpus.reader import WordNetCorpusReader as _WordNetCorpusReader # type: ignore[import]
|
|
15
|
-
from nltk.data import find as _nltk_find # type: ignore[import]
|
|
16
|
-
|
|
17
|
-
find = _nltk_find
|
|
18
|
-
_NLTK_IMPORT_ERROR = None
|
|
19
|
-
|
|
20
|
-
if TYPE_CHECKING: # pragma: no cover - typing aid only
|
|
21
|
-
from nltk.corpus.reader import WordNetCorpusReader # type: ignore[import]
|
|
22
|
-
else: # Use ``Any`` at runtime to avoid hard dependency when NLTK missing
|
|
23
|
-
WordNetCorpusReader = Any
|
|
24
|
-
|
|
25
|
-
if nltk is not None: # pragma: no cover - guarded by import success
|
|
26
|
-
try:
|
|
27
|
-
from nltk.corpus import wordnet as _WORDNET_MODULE # type: ignore[import]
|
|
28
|
-
except ModuleNotFoundError: # pragma: no cover - only hit on namespace packages
|
|
29
|
-
_WORDNET_MODULE = None
|
|
30
|
-
else:
|
|
31
|
-
WordNetCorpusReader = _WordNetCorpusReader # type: ignore[assignment]
|
|
32
|
-
else:
|
|
33
|
-
_WORDNET_MODULE = None
|
|
34
|
-
|
|
35
|
-
from .core import AttackWave, Glitchling
|
|
36
|
-
from ._rate import resolve_rate
|
|
37
|
-
|
|
38
|
-
_WORDNET_HANDLE: WordNetCorpusReader | Any | None = _WORDNET_MODULE
|
|
1
|
+
"""Jargoyle glitchling: Dictionary-based word drift.
|
|
39
2
|
|
|
40
|
-
|
|
3
|
+
Jargoyle swaps words with alternatives from bundled lexeme dictionaries.
|
|
4
|
+
Multiple dictionaries are supported:
|
|
5
|
+
- "colors": Color term swapping
|
|
6
|
+
- "synonyms": General synonym substitution
|
|
7
|
+
- "corporate": Business jargon alternatives
|
|
8
|
+
- "academic": Scholarly word substitutions
|
|
9
|
+
- "cyberpunk": Neon cyberpunk slang and gadgetry
|
|
10
|
+
- "lovecraftian": Cosmic horror terminology
|
|
11
|
+
You can also drop additional dictionaries into ``assets/lexemes`` to make
|
|
12
|
+
them available without modifying the code. The backend discovers any
|
|
13
|
+
``*.json`` file in that directory at runtime.
|
|
41
14
|
|
|
15
|
+
Two modes are available:
|
|
16
|
+
- "literal": First entry in each word's alternatives (deterministic mapping)
|
|
17
|
+
- "drift": Random selection from alternatives (probabilistic)
|
|
18
|
+
"""
|
|
42
19
|
|
|
43
|
-
|
|
44
|
-
"""Ensure the NLTK dependency is present before continuing."""
|
|
45
|
-
|
|
46
|
-
if nltk is None or find is None:
|
|
47
|
-
message = (
|
|
48
|
-
"The NLTK package is required for the jargoyle glitchling; install "
|
|
49
|
-
"the 'wordnet' extra via `pip install glitchlings[wordnet]`."
|
|
50
|
-
)
|
|
51
|
-
if '_NLTK_IMPORT_ERROR' in globals() and _NLTK_IMPORT_ERROR is not None:
|
|
52
|
-
raise RuntimeError(message) from _NLTK_IMPORT_ERROR
|
|
53
|
-
raise RuntimeError(message)
|
|
20
|
+
from __future__ import annotations
|
|
54
21
|
|
|
22
|
+
import os
|
|
23
|
+
from importlib import resources
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
from typing import Literal, cast
|
|
55
26
|
|
|
56
|
-
|
|
57
|
-
|
|
27
|
+
from glitchlings.constants import DEFAULT_JARGOYLE_RATE
|
|
28
|
+
from glitchlings.internal.rust_ffi import (
|
|
29
|
+
jargoyle_drift_rust,
|
|
30
|
+
list_lexeme_dictionaries_rust,
|
|
31
|
+
resolve_seed,
|
|
32
|
+
)
|
|
58
33
|
|
|
59
|
-
|
|
34
|
+
from .core import AttackOrder, AttackWave, Glitchling, PipelineOperationPayload
|
|
60
35
|
|
|
36
|
+
_LEXEME_ENV_VAR = "GLITCHLINGS_LEXEME_DIR"
|
|
61
37
|
|
|
62
|
-
def _load_wordnet_reader() -> WordNetCorpusReader:
|
|
63
|
-
"""Return a WordNet corpus reader from the downloaded corpus files."""
|
|
64
38
|
|
|
65
|
-
|
|
39
|
+
def _configure_lexeme_directory() -> Path | None:
|
|
40
|
+
"""Expose the bundled lexeme directory to the Rust backend via an env var."""
|
|
66
41
|
|
|
67
42
|
try:
|
|
68
|
-
|
|
69
|
-
except
|
|
70
|
-
|
|
71
|
-
zip_root = find("corpora/wordnet.zip")
|
|
72
|
-
except LookupError as exc:
|
|
73
|
-
raise RuntimeError(
|
|
74
|
-
"The NLTK WordNet corpus is not installed; run `nltk.download('wordnet')`."
|
|
75
|
-
) from exc
|
|
76
|
-
root = zip_root.join("wordnet/")
|
|
77
|
-
|
|
78
|
-
return WordNetCorpusReader(root, None)
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
def _wordnet(force_refresh: bool = False) -> WordNetCorpusReader | Any:
|
|
82
|
-
"""Retrieve the active WordNet handle, rebuilding it on demand."""
|
|
83
|
-
|
|
84
|
-
global _WORDNET_HANDLE
|
|
85
|
-
|
|
86
|
-
if force_refresh:
|
|
87
|
-
_WORDNET_HANDLE = _WORDNET_MODULE
|
|
88
|
-
|
|
89
|
-
if _WORDNET_HANDLE is not None:
|
|
90
|
-
return _WORDNET_HANDLE
|
|
91
|
-
|
|
92
|
-
_WORDNET_HANDLE = _load_wordnet_reader()
|
|
93
|
-
return _WORDNET_HANDLE
|
|
43
|
+
lexeme_root = resources.files("glitchlings.assets.lexemes")
|
|
44
|
+
except (ModuleNotFoundError, AttributeError):
|
|
45
|
+
return None
|
|
94
46
|
|
|
47
|
+
try:
|
|
48
|
+
with resources.as_file(lexeme_root) as resolved:
|
|
49
|
+
path = Path(resolved)
|
|
50
|
+
except FileNotFoundError:
|
|
51
|
+
return None
|
|
95
52
|
|
|
96
|
-
|
|
97
|
-
|
|
53
|
+
if not path.is_dir():
|
|
54
|
+
return None
|
|
98
55
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
return
|
|
56
|
+
os.environ.setdefault(_LEXEME_ENV_VAR, str(path))
|
|
57
|
+
return path
|
|
102
58
|
|
|
103
|
-
_require_nltk()
|
|
104
59
|
|
|
105
|
-
|
|
60
|
+
_configure_lexeme_directory()
|
|
106
61
|
|
|
107
|
-
|
|
108
|
-
resource.ensure_loaded()
|
|
109
|
-
except LookupError:
|
|
110
|
-
nltk.download("wordnet", quiet=True)
|
|
111
|
-
try:
|
|
112
|
-
resource = _wordnet(force_refresh=True)
|
|
113
|
-
resource.ensure_loaded()
|
|
114
|
-
except LookupError as exc: # pragma: no cover - only triggered when download fails
|
|
115
|
-
raise RuntimeError(
|
|
116
|
-
"Unable to load NLTK WordNet corpus for the jargoyle glitchling."
|
|
117
|
-
) from exc
|
|
62
|
+
DEFAULT_LEXEMES = "synonyms"
|
|
118
63
|
|
|
119
|
-
|
|
64
|
+
# Valid modes
|
|
65
|
+
JargoyleMode = Literal["literal", "drift"]
|
|
66
|
+
VALID_MODES = ("literal", "drift")
|
|
67
|
+
DEFAULT_MODE: JargoyleMode = "drift"
|
|
120
68
|
|
|
121
69
|
|
|
122
|
-
|
|
123
|
-
|
|
70
|
+
def _available_lexemes() -> list[str]:
|
|
71
|
+
return sorted({name.lower() for name in list_lexeme_dictionaries_rust()})
|
|
124
72
|
|
|
125
73
|
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
74
|
+
def _validate_lexemes(name: str) -> str:
|
|
75
|
+
normalized = name.lower()
|
|
76
|
+
available = _available_lexemes()
|
|
77
|
+
if normalized not in available:
|
|
78
|
+
raise ValueError(f"Invalid lexemes '{name}'. Must be one of: {', '.join(available)}")
|
|
79
|
+
return normalized
|
|
129
80
|
|
|
130
|
-
_VALID_POS: tuple[PartOfSpeech, ...] = ("n", "v", "a", "r")
|
|
131
81
|
|
|
82
|
+
def _validate_mode(mode: JargoyleMode | str) -> JargoyleMode:
|
|
83
|
+
normalized = mode.lower()
|
|
84
|
+
if normalized not in VALID_MODES:
|
|
85
|
+
raise ValueError(f"Invalid mode '{mode}'. Must be one of: {', '.join(VALID_MODES)}")
|
|
86
|
+
return cast(JargoyleMode, normalized)
|
|
132
87
|
|
|
133
|
-
def _split_token(token: str) -> tuple[str, str, str]:
|
|
134
|
-
"""Split a token into leading punctuation, core word, and trailing punctuation."""
|
|
135
88
|
|
|
136
|
-
|
|
137
|
-
if not match:
|
|
138
|
-
return "", token, ""
|
|
139
|
-
prefix, core, suffix = match.groups()
|
|
140
|
-
return prefix, core, suffix
|
|
89
|
+
VALID_LEXEMES = tuple(_available_lexemes())
|
|
141
90
|
|
|
142
91
|
|
|
143
|
-
def
|
|
144
|
-
"""
|
|
92
|
+
def list_lexeme_dictionaries() -> list[str]:
|
|
93
|
+
"""Return the list of available lexeme dictionaries.
|
|
145
94
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
if lowered not in _VALID_POS:
|
|
151
|
-
raise ValueError(
|
|
152
|
-
"part_of_speech must be one of 'n', 'v', 'a', 'r', or 'any'"
|
|
153
|
-
)
|
|
154
|
-
return (cast(PartOfSpeech, lowered),)
|
|
155
|
-
|
|
156
|
-
normalized: list[PartOfSpeech] = []
|
|
157
|
-
for pos in part_of_speech:
|
|
158
|
-
if pos not in _VALID_POS:
|
|
159
|
-
raise ValueError(
|
|
160
|
-
"part_of_speech entries must be one of 'n', 'v', 'a', or 'r'"
|
|
161
|
-
)
|
|
162
|
-
if pos not in normalized:
|
|
163
|
-
normalized.append(pos)
|
|
164
|
-
if not normalized:
|
|
165
|
-
raise ValueError("part_of_speech iterable may not be empty")
|
|
166
|
-
return tuple(normalized)
|
|
95
|
+
Returns:
|
|
96
|
+
List of dictionary names that can be used with Jargoyle.
|
|
97
|
+
"""
|
|
98
|
+
return _available_lexemes()
|
|
167
99
|
|
|
168
100
|
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
101
|
+
def jargoyle_drift(
|
|
102
|
+
text: str,
|
|
103
|
+
*,
|
|
104
|
+
lexemes: str = DEFAULT_LEXEMES,
|
|
105
|
+
mode: JargoyleMode = DEFAULT_MODE,
|
|
106
|
+
rate: float | None = None,
|
|
107
|
+
seed: int | None = None,
|
|
108
|
+
) -> str:
|
|
109
|
+
"""Apply dictionary-based word drift to text.
|
|
172
110
|
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
111
|
+
Args:
|
|
112
|
+
text: Input text to transform.
|
|
113
|
+
lexemes: Name of the dictionary to use.
|
|
114
|
+
mode: "literal" for deterministic first-entry swaps,
|
|
115
|
+
"drift" for random selection from alternatives.
|
|
116
|
+
rate: Probability of transforming each matching word (0.0 to 1.0).
|
|
117
|
+
seed: Seed for deterministic randomness (only used in "drift" mode).
|
|
177
118
|
|
|
119
|
+
Returns:
|
|
120
|
+
Text with word substitutions applied.
|
|
178
121
|
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
122
|
+
Raises:
|
|
123
|
+
ValueError: If lexemes or mode is invalid.
|
|
124
|
+
"""
|
|
125
|
+
normalized_lexemes = _validate_lexemes(lexemes)
|
|
126
|
+
normalized_mode = _validate_mode(mode)
|
|
127
|
+
|
|
128
|
+
effective_rate = DEFAULT_JARGOYLE_RATE if rate is None else float(rate)
|
|
129
|
+
resolved_seed = resolve_seed(seed, None) if normalized_mode == "drift" else None
|
|
130
|
+
|
|
131
|
+
return jargoyle_drift_rust(
|
|
132
|
+
text,
|
|
133
|
+
normalized_lexemes,
|
|
134
|
+
normalized_mode,
|
|
135
|
+
effective_rate,
|
|
136
|
+
resolved_seed,
|
|
137
|
+
)
|
|
183
138
|
|
|
184
|
-
normalized_word = word.lower()
|
|
185
|
-
wordnet = _wordnet()
|
|
186
|
-
synonyms: set[str] = set()
|
|
187
|
-
for pos_tag in parts_of_speech:
|
|
188
|
-
synsets = wordnet.synsets(word, pos=pos_tag)
|
|
189
|
-
if not synsets:
|
|
190
|
-
continue
|
|
191
139
|
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
if not lemmas_list:
|
|
195
|
-
continue
|
|
140
|
+
class Jargoyle(Glitchling):
|
|
141
|
+
"""Glitchling that swaps words using bundled lexeme dictionaries.
|
|
196
142
|
|
|
197
|
-
|
|
198
|
-
for lemma_str in lemmas_list:
|
|
199
|
-
cleaned = lemma_str.replace("_", " ")
|
|
200
|
-
if cleaned.lower() != normalized_word:
|
|
201
|
-
filtered.append(cleaned)
|
|
143
|
+
Jargoyle replaces words with alternatives from one of several dictionaries:
|
|
202
144
|
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
145
|
+
- **colors**: Swap color terms (e.g., "red" -> "blue").
|
|
146
|
+
- **synonyms**: General synonym substitution (e.g., "fast" -> "rapid").
|
|
147
|
+
- **corporate**: Business jargon alternatives.
|
|
148
|
+
- **academic**: Scholarly word substitutions.
|
|
149
|
+
- **cyberpunk**: Neon cyberpunk slang and gadgetry.
|
|
150
|
+
- **lovecraftian**: Cosmic horror terminology.
|
|
151
|
+
- **custom**: Any ``*.json`` dictionary placed in ``assets/lexemes``.
|
|
206
152
|
|
|
207
|
-
|
|
208
|
-
break
|
|
153
|
+
Two modes are supported:
|
|
209
154
|
|
|
210
|
-
|
|
155
|
+
- **literal**: Use the first (canonical) entry for each word.
|
|
156
|
+
- **drift**: Randomly select from available alternatives.
|
|
211
157
|
|
|
158
|
+
Example:
|
|
159
|
+
>>> from glitchlings import Jargoyle
|
|
160
|
+
>>> jargoyle = Jargoyle(lexemes="colors", mode="literal")
|
|
161
|
+
>>> jargoyle("The red balloon floated away.")
|
|
162
|
+
'The blue balloon floated away.'
|
|
212
163
|
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
part_of_speech: PartOfSpeechInput = "n",
|
|
217
|
-
seed: int | None = None,
|
|
218
|
-
rng: random.Random | None = None,
|
|
219
|
-
*,
|
|
220
|
-
replacement_rate: float | None = None,
|
|
221
|
-
) -> str:
|
|
222
|
-
"""Replace words with random WordNet synonyms.
|
|
223
|
-
|
|
224
|
-
Parameters
|
|
225
|
-
- text: Input text.
|
|
226
|
-
- rate: Max proportion of candidate words to replace (default 0.1).
|
|
227
|
-
- part_of_speech: WordNet POS tag(s) to target. Accepts "n", "v", "a", "r",
|
|
228
|
-
any iterable of those tags, or "any" to include all four.
|
|
229
|
-
- rng: Optional RNG instance used for deterministic sampling.
|
|
230
|
-
- seed: Optional seed if `rng` not provided.
|
|
231
|
-
|
|
232
|
-
Determinism
|
|
233
|
-
- Candidates collected in left-to-right order; no set() reordering.
|
|
234
|
-
- Replacement positions chosen via rng.sample.
|
|
235
|
-
- Synonyms sorted before rng.choice to fix ordering.
|
|
236
|
-
- For each POS, the first synset containing alternate lemmas is used for stability.
|
|
164
|
+
>>> jargoyle = Jargoyle(lexemes="synonyms", mode="drift", rate=0.5, seed=42)
|
|
165
|
+
>>> jargoyle("The quick fox jumps fast.")
|
|
166
|
+
'The swift fox jumps rapid.'
|
|
237
167
|
"""
|
|
238
|
-
effective_rate = resolve_rate(
|
|
239
|
-
rate=rate,
|
|
240
|
-
legacy_value=replacement_rate,
|
|
241
|
-
default=0.1,
|
|
242
|
-
legacy_name="replacement_rate",
|
|
243
|
-
)
|
|
244
168
|
|
|
245
|
-
|
|
246
|
-
wordnet = _wordnet()
|
|
247
|
-
|
|
248
|
-
active_rng: random.Random
|
|
249
|
-
if rng is not None:
|
|
250
|
-
active_rng = rng
|
|
251
|
-
else:
|
|
252
|
-
active_rng = random.Random(seed)
|
|
253
|
-
|
|
254
|
-
target_pos = _normalize_parts_of_speech(part_of_speech)
|
|
255
|
-
|
|
256
|
-
# Split but keep whitespace separators so we can rebuild easily
|
|
257
|
-
tokens = re.split(r"(\s+)", text)
|
|
258
|
-
|
|
259
|
-
# Collect indices of candidate tokens (even positions 0,2,.. are words given our split design)
|
|
260
|
-
candidate_indices: list[int] = []
|
|
261
|
-
candidate_metadata: dict[int, CandidateInfo] = {}
|
|
262
|
-
for idx, tok in enumerate(tokens):
|
|
263
|
-
if idx % 2 == 0 and tok and not tok.isspace():
|
|
264
|
-
prefix, core_word, suffix = _split_token(tok)
|
|
265
|
-
if not core_word:
|
|
266
|
-
continue
|
|
267
|
-
|
|
268
|
-
available_pos: NormalizedPartsOfSpeech = tuple(
|
|
269
|
-
pos for pos in target_pos if wordnet.synsets(core_word, pos=pos)
|
|
270
|
-
)
|
|
271
|
-
if available_pos:
|
|
272
|
-
candidate_indices.append(idx)
|
|
273
|
-
candidate_metadata[idx] = CandidateInfo(
|
|
274
|
-
prefix=prefix,
|
|
275
|
-
core_word=core_word,
|
|
276
|
-
suffix=suffix,
|
|
277
|
-
parts_of_speech=available_pos,
|
|
278
|
-
)
|
|
279
|
-
|
|
280
|
-
if not candidate_indices:
|
|
281
|
-
return text
|
|
282
|
-
|
|
283
|
-
clamped_rate = max(0.0, effective_rate)
|
|
284
|
-
max_replacements = int(len(candidate_indices) * clamped_rate)
|
|
285
|
-
if max_replacements <= 0:
|
|
286
|
-
return text
|
|
287
|
-
|
|
288
|
-
# Choose which positions to replace deterministically via rng.sample
|
|
289
|
-
replace_positions = active_rng.sample(candidate_indices, k=max_replacements)
|
|
290
|
-
# Process in ascending order to avoid affecting later indices
|
|
291
|
-
replace_positions.sort()
|
|
292
|
-
|
|
293
|
-
for pos in replace_positions:
|
|
294
|
-
metadata = candidate_metadata[pos]
|
|
295
|
-
synonyms = _collect_synonyms(metadata.core_word, metadata.parts_of_speech)
|
|
296
|
-
if not synonyms:
|
|
297
|
-
continue
|
|
298
|
-
|
|
299
|
-
replacement = active_rng.choice(synonyms)
|
|
300
|
-
tokens[pos] = f"{metadata.prefix}{replacement}{metadata.suffix}"
|
|
301
|
-
|
|
302
|
-
return "".join(tokens)
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
class Jargoyle(Glitchling):
|
|
306
|
-
"""Glitchling that swaps words with random WordNet synonyms."""
|
|
169
|
+
flavor = "Oh no... The worst person you know just bought a thesaurus..."
|
|
307
170
|
|
|
308
171
|
def __init__(
|
|
309
172
|
self,
|
|
310
173
|
*,
|
|
174
|
+
lexemes: str = DEFAULT_LEXEMES,
|
|
175
|
+
mode: JargoyleMode = DEFAULT_MODE,
|
|
311
176
|
rate: float | None = None,
|
|
312
|
-
replacement_rate: float | None = None,
|
|
313
|
-
part_of_speech: PartOfSpeechInput = "n",
|
|
314
177
|
seed: int | None = None,
|
|
315
178
|
) -> None:
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
179
|
+
"""Initialize Jargoyle with the specified dictionary and mode.
|
|
180
|
+
|
|
181
|
+
Args:
|
|
182
|
+
lexemes: Name of the dictionary to use. See ``list_lexeme_dictionaries()``
|
|
183
|
+
for the full, dynamic list (including any custom ``*.json`` files).
|
|
184
|
+
mode: Transformation mode. "literal" for deterministic swaps,
|
|
185
|
+
"drift" for random selection.
|
|
186
|
+
rate: Probability of transforming each matching word (0.0 to 1.0).
|
|
187
|
+
Defaults to 0.01.
|
|
188
|
+
seed: Seed for deterministic randomness.
|
|
189
|
+
"""
|
|
190
|
+
# Validate inputs
|
|
191
|
+
normalized_lexemes = _validate_lexemes(lexemes)
|
|
192
|
+
normalized_mode = _validate_mode(mode)
|
|
193
|
+
|
|
194
|
+
effective_rate = DEFAULT_JARGOYLE_RATE if rate is None else rate
|
|
195
|
+
|
|
323
196
|
super().__init__(
|
|
324
197
|
name="Jargoyle",
|
|
325
|
-
corruption_function=
|
|
198
|
+
corruption_function=jargoyle_drift,
|
|
326
199
|
scope=AttackWave.WORD,
|
|
200
|
+
order=AttackOrder.NORMAL,
|
|
327
201
|
seed=seed,
|
|
202
|
+
lexemes=normalized_lexemes,
|
|
203
|
+
mode=normalized_mode,
|
|
328
204
|
rate=effective_rate,
|
|
329
|
-
|
|
205
|
+
# Pass seed explicitly to kwargs so corruption_function receives it
|
|
206
|
+
# (seed is stored separately in base class but needed by jargoyle_drift)
|
|
207
|
+
)
|
|
208
|
+
# Ensure seed is in kwargs for the corruption function
|
|
209
|
+
self.kwargs["seed"] = seed
|
|
210
|
+
|
|
211
|
+
def pipeline_operation(self) -> PipelineOperationPayload:
|
|
212
|
+
"""Return the pipeline descriptor for the Rust backend."""
|
|
213
|
+
lexemes = self.kwargs.get("lexemes", DEFAULT_LEXEMES)
|
|
214
|
+
mode = self.kwargs.get("mode", DEFAULT_MODE)
|
|
215
|
+
rate = self.kwargs.get("rate", DEFAULT_JARGOYLE_RATE)
|
|
216
|
+
return cast(
|
|
217
|
+
PipelineOperationPayload,
|
|
218
|
+
{
|
|
219
|
+
"type": "jargoyle",
|
|
220
|
+
"lexemes": str(lexemes),
|
|
221
|
+
"mode": str(mode),
|
|
222
|
+
"rate": float(rate),
|
|
223
|
+
},
|
|
330
224
|
)
|
|
331
225
|
|
|
332
226
|
|
|
227
|
+
# Module-level singleton for convenience
|
|
333
228
|
jargoyle = Jargoyle()
|
|
334
229
|
|
|
335
230
|
|
|
336
|
-
__all__ = [
|
|
231
|
+
__all__ = [
|
|
232
|
+
"DEFAULT_LEXEMES",
|
|
233
|
+
"DEFAULT_MODE",
|
|
234
|
+
"Jargoyle",
|
|
235
|
+
"JargoyleMode",
|
|
236
|
+
"VALID_LEXEMES",
|
|
237
|
+
"VALID_MODES",
|
|
238
|
+
"jargoyle",
|
|
239
|
+
"jargoyle_drift",
|
|
240
|
+
"list_lexeme_dictionaries",
|
|
241
|
+
]
|