glitchlings 0.4.4__cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of glitchlings might be problematic. Click here for more details.
- glitchlings/__init__.py +67 -0
- glitchlings/__main__.py +8 -0
- glitchlings/_zoo_rust.cp310-win_amd64.pyd +0 -0
- glitchlings/compat.py +284 -0
- glitchlings/config.py +388 -0
- glitchlings/config.toml +3 -0
- glitchlings/dlc/__init__.py +7 -0
- glitchlings/dlc/_shared.py +153 -0
- glitchlings/dlc/huggingface.py +81 -0
- glitchlings/dlc/prime.py +254 -0
- glitchlings/dlc/pytorch.py +166 -0
- glitchlings/dlc/pytorch_lightning.py +215 -0
- glitchlings/lexicon/__init__.py +192 -0
- glitchlings/lexicon/_cache.py +110 -0
- glitchlings/lexicon/data/default_vector_cache.json +82 -0
- glitchlings/lexicon/metrics.py +162 -0
- glitchlings/lexicon/vector.py +651 -0
- glitchlings/lexicon/wordnet.py +232 -0
- glitchlings/main.py +364 -0
- glitchlings/util/__init__.py +195 -0
- glitchlings/util/adapters.py +27 -0
- glitchlings/zoo/__init__.py +168 -0
- glitchlings/zoo/_ocr_confusions.py +32 -0
- glitchlings/zoo/_rate.py +131 -0
- glitchlings/zoo/_rust_extensions.py +143 -0
- glitchlings/zoo/_sampling.py +54 -0
- glitchlings/zoo/_text_utils.py +100 -0
- glitchlings/zoo/adjax.py +128 -0
- glitchlings/zoo/apostrofae.py +127 -0
- glitchlings/zoo/assets/__init__.py +0 -0
- glitchlings/zoo/assets/apostrofae_pairs.json +32 -0
- glitchlings/zoo/core.py +582 -0
- glitchlings/zoo/jargoyle.py +335 -0
- glitchlings/zoo/mim1c.py +109 -0
- glitchlings/zoo/ocr_confusions.tsv +30 -0
- glitchlings/zoo/redactyl.py +193 -0
- glitchlings/zoo/reduple.py +148 -0
- glitchlings/zoo/rushmore.py +153 -0
- glitchlings/zoo/scannequin.py +171 -0
- glitchlings/zoo/typogre.py +231 -0
- glitchlings/zoo/zeedub.py +185 -0
- glitchlings-0.4.4.dist-info/METADATA +627 -0
- glitchlings-0.4.4.dist-info/RECORD +47 -0
- glitchlings-0.4.4.dist-info/WHEEL +5 -0
- glitchlings-0.4.4.dist-info/entry_points.txt +2 -0
- glitchlings-0.4.4.dist-info/licenses/LICENSE +201 -0
- glitchlings-0.4.4.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,335 @@
|
|
|
1
|
+
import random
|
|
2
|
+
import re
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from types import ModuleType
|
|
6
|
+
from typing import Any, Literal, cast
|
|
7
|
+
|
|
8
|
+
from glitchlings.lexicon import Lexicon, get_default_lexicon
|
|
9
|
+
|
|
10
|
+
from ._rate import resolve_rate
|
|
11
|
+
from .core import AttackWave, Glitchling
|
|
12
|
+
|
|
13
|
+
# Optional WordNet backend: resolved once at import time so the rest of the
# module can treat "is WordNet available?" as a simple None check.
_wordnet_module: ModuleType | None

try:  # pragma: no cover - optional WordNet dependency
    import glitchlings.lexicon.wordnet as _wordnet_module
except (
    ImportError,
    ModuleNotFoundError,
    AttributeError,
):  # pragma: no cover - triggered when nltk unavailable
    _wordnet_module = None

# Snapshot of the import result under a second name.
# NOTE(review): presumably kept separate so tests can monkeypatch the runtime
# binding without touching the import result — confirm with maintainers.
_wordnet_runtime: ModuleType | None = _wordnet_module

WordNetLexicon: type[Lexicon] | None
if _wordnet_runtime is None:
    # Stub implementations used when the WordNet backend is unavailable.

    def _lexicon_dependencies_available() -> bool:
        # Without the wordnet module the legacy backend can never be ready.
        return False

    def _lexicon_ensure_wordnet() -> None:
        # Fail loudly: the caller explicitly asked for a backend that is absent.
        raise RuntimeError(
            "The WordNet backend is no longer bundled by default. Install NLTK "
            "and download its WordNet corpus manually if you need legacy synonyms."
        )

    WordNetLexicon = None
else:
    # Re-export the real implementations from the optional module.
    WordNetLexicon = cast(type[Lexicon], _wordnet_runtime.WordNetLexicon)
    _lexicon_dependencies_available = _wordnet_runtime.dependencies_available
    _lexicon_ensure_wordnet = _wordnet_runtime.ensure_wordnet


# Public alias: either the real ensure_wordnet or the raising stub above.
ensure_wordnet = _lexicon_ensure_wordnet
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def dependencies_available() -> bool:
    """Return ``True`` when a synonym backend is accessible."""
    if _lexicon_dependencies_available():
        return True

    # No WordNet: probe the configured default lexicon (typically the bundled
    # vector cache). Any failure to build it means no backend is usable.
    try:
        get_default_lexicon(seed=None)
    except (RuntimeError, ImportError, ModuleNotFoundError, AttributeError):
        return False
    else:
        return True


# Backwards compatibility for callers relying on the previous private helper name.
_ensure_wordnet = ensure_wordnet
|
63
|
+
|
|
64
|
+
|
|
65
|
+
PartOfSpeech = Literal["n", "v", "a", "r"]
|
|
66
|
+
PartOfSpeechInput = PartOfSpeech | Iterable[PartOfSpeech] | Literal["any"]
|
|
67
|
+
NormalizedPartsOfSpeech = tuple[PartOfSpeech, ...]
|
|
68
|
+
|
|
69
|
+
_VALID_POS: tuple[PartOfSpeech, ...] = ("n", "v", "a", "r")
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _split_token(token: str) -> tuple[str, str, str]:
|
|
73
|
+
"""Split a token into leading punctuation, core word, and trailing punctuation."""
|
|
74
|
+
match = re.match(r"^(\W*)(.*?)(\W*)$", token)
|
|
75
|
+
if not match:
|
|
76
|
+
return "", token, ""
|
|
77
|
+
prefix, core, suffix = match.groups()
|
|
78
|
+
return prefix, core, suffix
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _normalize_parts_of_speech(
|
|
82
|
+
part_of_speech: PartOfSpeechInput,
|
|
83
|
+
) -> NormalizedPartsOfSpeech:
|
|
84
|
+
"""Coerce user input into a tuple of valid WordNet POS tags."""
|
|
85
|
+
if isinstance(part_of_speech, str):
|
|
86
|
+
lowered = part_of_speech.lower()
|
|
87
|
+
if lowered == "any":
|
|
88
|
+
return _VALID_POS
|
|
89
|
+
if lowered not in _VALID_POS:
|
|
90
|
+
raise ValueError("part_of_speech must be one of 'n', 'v', 'a', 'r', or 'any'")
|
|
91
|
+
return (cast(PartOfSpeech, lowered),)
|
|
92
|
+
|
|
93
|
+
normalized: list[PartOfSpeech] = []
|
|
94
|
+
for pos in part_of_speech:
|
|
95
|
+
if pos not in _VALID_POS:
|
|
96
|
+
raise ValueError("part_of_speech entries must be one of 'n', 'v', 'a', or 'r'")
|
|
97
|
+
if pos not in normalized:
|
|
98
|
+
normalized.append(pos)
|
|
99
|
+
if not normalized:
|
|
100
|
+
raise ValueError("part_of_speech iterable may not be empty")
|
|
101
|
+
return tuple(normalized)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
@dataclass(frozen=True)
class CandidateInfo:
    """Metadata for a candidate token that may be replaced."""

    # Punctuation stripped from the front of the original token.
    prefix: str
    # The word itself, with surrounding punctuation removed.
    core_word: str
    # Punctuation stripped from the end of the original token.
    suffix: str
    # POS tag that produced the synonyms, or None when the POS-agnostic
    # fallback supplied them.
    part_of_speech: str | None
    # Candidate replacements; list order is preserved so rng.choice over it
    # stays deterministic for a fixed seed.
    synonyms: list[str]
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def substitute_random_synonyms(
    text: str,
    rate: float | None = None,
    part_of_speech: PartOfSpeechInput = "n",
    seed: int | None = None,
    rng: random.Random | None = None,
    *,
    replacement_rate: float | None = None,
    lexicon: Lexicon | None = None,
) -> str:
    """Replace words with random lexicon-driven synonyms.

    Parameters
    ----------
    - text: Input text.
    - rate: Max proportion of candidate words to replace (default 0.1 here;
      note the ``Jargoyle`` glitchling constructs itself with a 0.01 default).
    - part_of_speech: WordNet POS tag(s) to target. Accepts "n", "v", "a", "r",
      any iterable of those tags, or "any" to include all four. Backends that do
      not differentiate parts of speech simply ignore the setting.
    - rng: Optional RNG instance used for deterministic sampling.
    - seed: Optional seed if `rng` not provided.
    - replacement_rate: Deprecated alias for ``rate``.
    - lexicon: Optional :class:`~glitchlings.lexicon.Lexicon` implementation to
      supply synonyms. Defaults to the configured lexicon priority, typically the
      packaged vector cache.

    Determinism
    - Candidates collected in left-to-right order; no set() reordering.
    - Replacement positions chosen via rng.sample.
    - Synonyms sourced through the lexicon; the default backend derives
      deterministic subsets per word and part-of-speech using the active seed.

    """
    effective_rate = resolve_rate(
        rate=rate,
        legacy_value=replacement_rate,
        default=0.1,
        legacy_name="replacement_rate",
    )

    active_rng: random.Random
    if rng is not None:
        active_rng = rng
    else:
        active_rng = random.Random(seed)

    active_lexicon: Lexicon
    # Track whether a caller-supplied lexicon was temporarily reseeded so the
    # original seed can be restored in the ``finally`` block below.
    restore_lexicon_seed = False
    original_lexicon_seed: int | None = None

    if lexicon is None:
        active_lexicon = get_default_lexicon(seed=seed)
    else:
        active_lexicon = lexicon
        if seed is not None:
            original_lexicon_seed = active_lexicon.seed
            if original_lexicon_seed != seed:
                active_lexicon.reseed(seed)
                restore_lexicon_seed = True

    try:
        target_pos = _normalize_parts_of_speech(part_of_speech)

        # Split but keep whitespace separators so we can rebuild easily
        tokens = re.split(r"(\s+)", text)

        # Collect candidate word indices (even positions are words because separators are kept)
        candidate_indices: list[int] = []
        candidate_metadata: dict[int, CandidateInfo] = {}
        for idx, tok in enumerate(tokens):
            if idx % 2 != 0 or not tok or tok.isspace():
                continue

            prefix, core_word, suffix = _split_token(tok)
            if not core_word:
                continue

            chosen_pos: str | None = None
            synonyms: list[str] = []

            # First tag (in caller-specified order) that yields synonyms wins.
            for tag in target_pos:
                if not active_lexicon.supports_pos(tag):
                    continue
                synonyms = active_lexicon.get_synonyms(core_word, pos=tag)
                if synonyms:
                    chosen_pos = tag
                    break

            # POS-agnostic fallback for backends that ignore parts of speech.
            if not synonyms and active_lexicon.supports_pos(None):
                synonyms = active_lexicon.get_synonyms(core_word, pos=None)

            if synonyms:
                candidate_indices.append(idx)
                candidate_metadata[idx] = CandidateInfo(
                    prefix=prefix,
                    core_word=core_word,
                    suffix=suffix,
                    part_of_speech=chosen_pos,
                    synonyms=synonyms,
                )

        if not candidate_indices:
            return text

        clamped_rate = max(0.0, effective_rate)
        if clamped_rate == 0.0:
            return text

        # Stochastic rounding: the fractional part of the expected count becomes
        # the probability of one extra replacement.
        population = len(candidate_indices)
        effective_fraction = min(clamped_rate, 1.0)
        expected_replacements = population * effective_fraction
        max_replacements = int(expected_replacements)
        remainder = expected_replacements - max_replacements
        if remainder > 0.0 and active_rng.random() < remainder:
            max_replacements += 1
        if clamped_rate >= 1.0:
            max_replacements = population
        max_replacements = min(population, max_replacements)
        if max_replacements <= 0:
            return text

        # Choose which positions to replace deterministically via rng.sample
        replace_positions = active_rng.sample(candidate_indices, k=max_replacements)
        # Process in ascending order to avoid affecting later indices
        replace_positions.sort()

        for pos in replace_positions:
            metadata = candidate_metadata[pos]
            if not metadata.synonyms:
                continue

            replacement = active_rng.choice(metadata.synonyms)
            tokens[pos] = f"{metadata.prefix}{replacement}{metadata.suffix}"

        return "".join(tokens)
    finally:
        # Put a caller-supplied lexicon back on the seed it arrived with.
        if restore_lexicon_seed:
            active_lexicon.reseed(original_lexicon_seed)
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
class Jargoyle(Glitchling):
    """Glitchling that swaps words with lexicon-driven synonyms."""

    def __init__(
        self,
        *,
        rate: float | None = None,
        replacement_rate: float | None = None,
        part_of_speech: PartOfSpeechInput = "n",
        seed: int | None = None,
        lexicon: Lexicon | None = None,
    ) -> None:
        # "replacement_rate" is the deprecated alias for "rate".
        self._param_aliases = {"replacement_rate": "rate"}
        # True when this instance built the lexicon itself and may reseed it freely.
        self._owns_lexicon = lexicon is None
        # Seed a caller-supplied lexicon arrived with, so set_param can restore it.
        self._external_lexicon_original_seed = (
            lexicon.seed if isinstance(lexicon, Lexicon) else None
        )
        # Guard so set_param can tell construction-time assignment from later updates.
        self._initializing = True
        effective_rate = resolve_rate(
            rate=rate,
            legacy_value=replacement_rate,
            default=0.01,
            legacy_name="replacement_rate",
        )
        prepared_lexicon = lexicon or get_default_lexicon(seed=seed)
        if lexicon is not None and seed is not None:
            prepared_lexicon.reseed(seed)
        try:
            super().__init__(
                name="Jargoyle",
                corruption_function=substitute_random_synonyms,
                scope=AttackWave.WORD,
                seed=seed,
                rate=effective_rate,
                part_of_speech=part_of_speech,
                lexicon=prepared_lexicon,
            )
        finally:
            self._initializing = False

    def set_param(self, key: str, value: Any) -> None:
        """Set a parameter, keeping the lexicon's seed in sync.

        Reseeding rules: an owned lexicon always follows ``self.seed``; an
        external lexicon follows ``self.seed`` only when that is not None,
        otherwise it is restored to the seed it arrived with.
        """
        super().set_param(key, value)

        aliases = getattr(self, "_param_aliases", {})
        canonical = aliases.get(key, key)

        if canonical == "seed":
            current_lexicon = getattr(self, "lexicon", None)
            if isinstance(current_lexicon, Lexicon):
                if getattr(self, "_owns_lexicon", False):
                    current_lexicon.reseed(self.seed)
                else:
                    if self.seed is not None:
                        current_lexicon.reseed(self.seed)
                    else:
                        if hasattr(self, "_external_lexicon_original_seed"):
                            original_seed = getattr(self, "_external_lexicon_original_seed", None)
                            current_lexicon.reseed(original_seed)
        elif canonical == "lexicon" and isinstance(value, Lexicon):
            if getattr(self, "_initializing", False):
                # During __init__ the base class assigns the lexicon through
                # set_param; only align seeds here, do not change ownership.
                if getattr(self, "_owns_lexicon", False):
                    if self.seed is not None:
                        value.reseed(self.seed)
                else:
                    if getattr(self, "_external_lexicon_original_seed", None) is None:
                        self._external_lexicon_original_seed = value.seed
                    if self.seed is not None:
                        value.reseed(self.seed)
                return

            # A lexicon assigned after construction is treated as external.
            self._owns_lexicon = False
            self._external_lexicon_original_seed = value.seed
            if self.seed is not None:
                value.reseed(self.seed)
            elif value.seed != self._external_lexicon_original_seed:
                # NOTE(review): this branch can never fire — the attribute was
                # assigned value.seed two statements above. Confirm the intent.
                value.reseed(self._external_lexicon_original_seed)
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
# Default, shared Jargoyle instance created at import time.
jargoyle = Jargoyle()


__all__ = ["Jargoyle", "dependencies_available", "ensure_wordnet", "jargoyle"]
|
glitchlings/zoo/mim1c.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
import random
|
|
2
|
+
from collections.abc import Collection
|
|
3
|
+
from typing import Literal
|
|
4
|
+
|
|
5
|
+
from confusable_homoglyphs import confusables
|
|
6
|
+
|
|
7
|
+
from ._rate import resolve_rate
|
|
8
|
+
from .core import AttackOrder, AttackWave, Glitchling
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def swap_homoglyphs(
    text: str,
    rate: float | None = None,
    classes: list[str] | Literal["all"] | None = None,
    banned_characters: Collection[str] | None = None,
    seed: int | None = None,
    rng: random.Random | None = None,
    *,
    replacement_rate: float | None = None,
) -> str:
    """Replace characters with visually confusable homoglyphs.

    Parameters
    ----------
    - text: Input text.
    - rate: Max proportion of eligible characters to replace (default 0.02).
    - classes: Restrict replacements to these Unicode script classes (default
      ["LATIN", "GREEK", "CYRILLIC"]). Use "all" to allow any.
    - banned_characters: Characters that must never appear as replacements.
    - seed: Optional seed if `rng` not provided.
    - rng: Optional RNG; overrides seed.
    - replacement_rate: Deprecated alias for ``rate``.

    Notes
    -----
    - Only replaces characters present in ``confusables.confusables_data`` with
      single-codepoint alternatives.
    - Maintains determinism by shuffling candidates and sampling via the provided RNG.
    - NOTE(review): ``str.replace(char, ..., 1)`` swaps the first occurrence of
      ``char`` in the current text, which may not be the shuffled position that
      was drawn — confirm this is the intended sampling behavior.

    """
    effective_rate = resolve_rate(
        rate=rate,
        legacy_value=replacement_rate,
        default=0.02,
        legacy_name="replacement_rate",
    )

    if rng is None:
        rng = random.Random(seed)

    if classes is None:
        classes = ["LATIN", "GREEK", "CYRILLIC"]

    # Alphanumeric characters that have an entry in the confusables table.
    target_chars = [char for char in text if char.isalnum()]
    confusable_chars = [char for char in target_chars if char in confusables.confusables_data]
    clamped_rate = max(0.0, effective_rate)
    # Truncation means short inputs at low rates may see zero replacements.
    num_replacements = int(len(confusable_chars) * clamped_rate)
    done = 0
    # Shuffle so which characters get replaced varies deterministically with the RNG.
    rng.shuffle(confusable_chars)
    banned_set = set(banned_characters or ())
    for char in confusable_chars:
        if done >= num_replacements:
            break
        # Only single-codepoint substitutes keep the text length stable.
        options = [o["c"] for o in confusables.confusables_data[char] if len(o["c"]) == 1]
        if classes != "all":
            options = [opt for opt in options if confusables.alias(opt) in classes]
        if banned_set:
            options = [opt for opt in options if opt not in banned_set]
        if not options:
            continue
        text = text.replace(char, rng.choice(options), 1)
        done += 1
    return text
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class Mim1c(Glitchling):
    """Glitchling that swaps characters for visually similar homoglyphs."""

    def __init__(
        self,
        *,
        rate: float | None = None,
        replacement_rate: float | None = None,
        classes: list[str] | Literal["all"] | None = None,
        banned_characters: Collection[str] | None = None,
        seed: int | None = None,
    ) -> None:
        # "replacement_rate" is the deprecated alias for "rate".
        self._param_aliases = {"replacement_rate": "rate"}
        resolved_rate = resolve_rate(
            rate=rate,
            legacy_value=replacement_rate,
            default=0.02,
            legacy_name="replacement_rate",
        )
        # Character-scope attack, scheduled in the LAST attack-order slot.
        super().__init__(
            name="Mim1c",
            corruption_function=swap_homoglyphs,
            scope=AttackWave.CHARACTER,
            order=AttackOrder.LAST,
            seed=seed,
            rate=resolved_rate,
            classes=classes,
            banned_characters=banned_characters,
        )
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
# Default, shared Mim1c instance created at import time.
mim1c = Mim1c()


__all__ = ["Mim1c", "mim1c"]
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# Source Replacements (space-separated)
|
|
2
|
+
li h
|
|
3
|
+
h li
|
|
4
|
+
rn m
|
|
5
|
+
m rn
|
|
6
|
+
cl d
|
|
7
|
+
d cl
|
|
8
|
+
I l
|
|
9
|
+
l I 1
|
|
10
|
+
1 l I
|
|
11
|
+
0 O
|
|
12
|
+
O 0
|
|
13
|
+
B 8
|
|
14
|
+
8 B
|
|
15
|
+
S 5
|
|
16
|
+
5 S
|
|
17
|
+
Z 2
|
|
18
|
+
2 Z
|
|
19
|
+
G 6
|
|
20
|
+
6 G
|
|
21
|
+
“ "
|
|
22
|
+
” "
|
|
23
|
+
‘ '
|
|
24
|
+
’ '
|
|
25
|
+
— -
|
|
26
|
+
– -
|
|
27
|
+
vv w
|
|
28
|
+
w vv
|
|
29
|
+
ri n
|
|
30
|
+
n ri
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
import random
|
|
2
|
+
import re
|
|
3
|
+
from typing import Any, cast
|
|
4
|
+
|
|
5
|
+
from ._rate import resolve_rate
|
|
6
|
+
from ._rust_extensions import get_rust_operation
|
|
7
|
+
from ._sampling import weighted_sample_without_replacement
|
|
8
|
+
from ._text_utils import (
|
|
9
|
+
WordToken,
|
|
10
|
+
collect_word_tokens,
|
|
11
|
+
split_preserving_whitespace,
|
|
12
|
+
)
|
|
13
|
+
from .core import AttackWave, Glitchling
|
|
14
|
+
|
|
15
|
+
# U+2588 FULL BLOCK — default glyph used to overwrite redacted characters.
FULL_BLOCK = "█"

# Load Rust-accelerated operation if available (None when the extension is absent).
_redact_words_rust = get_rust_operation("redact_words")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _python_redact_words(
    text: str,
    *,
    replacement_char: str,
    rate: float,
    merge_adjacent: bool,
    rng: random.Random,
    unweighted: bool = False,
) -> str:
    """Redact random words by replacing their characters.

    Pure-Python fallback for the Rust-accelerated ``redact_words`` operation.

    Parameters
    ----------
    - text: Input text.
    - replacement_char: The character used to overwrite each redacted word.
    - rate: Proportion of words to redact, clamped to [0, 1]; any positive
      rate redacts at least one word.
    - merge_adjacent: If True, merges adjacent redactions across intervening
      non-word characters into one run of ``replacement_char``.
    - rng: RNG used for sampling decisions.
    - unweighted: When True, sample words uniformly instead of by length.

    Raises
    ------
    ValueError
        If the input text contains no redactable words.
    """
    tokens = split_preserving_whitespace(text)
    word_tokens = collect_word_tokens(tokens)
    if not word_tokens:
        raise ValueError("Cannot redact words because the input text contains no redactable words.")

    population = [token.index for token in word_tokens]
    # Longer words are favoured unless uniform sampling was requested.
    weights = [1.0 if unweighted else float(token.core_length) for token in word_tokens]

    clamped_rate = max(0.0, min(rate, 1.0))
    raw_quota = len(population) * clamped_rate
    num_to_redact = int(raw_quota)
    if clamped_rate > 0.0:
        # A positive rate always redacts at least one word.
        num_to_redact = max(1, num_to_redact)
    num_to_redact = min(num_to_redact, len(population))
    if num_to_redact <= 0:
        return "".join(tokens)

    indices_to_redact = weighted_sample_without_replacement(
        population,
        weights,
        k=num_to_redact,
        rng=rng,
    )
    # Ascending order keeps replacement deterministic and left-to-right.
    indices_to_redact.sort()

    token_by_index: dict[int, WordToken] = {token.index: token for token in word_tokens}

    for i in indices_to_redact:
        if i >= len(tokens):
            break

        token = token_by_index.get(i)
        if token is None:
            continue

        # Overwrite only the core word; surrounding punctuation is preserved.
        prefix, core, suffix = token.prefix, token.core, token.suffix
        tokens[i] = f"{prefix}{replacement_char * len(core)}{suffix}"

    text = "".join(tokens)

    if merge_adjacent:
        # BUG FIX: escape replacement_char so regex metacharacters (e.g. "*"
        # or ".") cannot corrupt the pattern or match arbitrary characters.
        block = re.escape(replacement_char)
        text = re.sub(
            rf"{block}\W+{block}",
            lambda m: replacement_char * (len(m.group(0)) - 1),
            text,
        )

    return text
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def redact_words(
    text: str,
    replacement_char: str = FULL_BLOCK,
    rate: float | None = None,
    merge_adjacent: bool = False,
    seed: int = 151,
    rng: random.Random | None = None,
    *,
    redaction_rate: float | None = None,
    unweighted: bool = False,
) -> str:
    """Redact random words by replacing their characters.

    Dispatches to the Rust-accelerated implementation when available and
    otherwise falls back to :func:`_python_redact_words`. ``redaction_rate``
    is the deprecated alias for ``rate`` (effective default 0.025).
    """
    effective_rate = resolve_rate(
        rate=rate,
        legacy_value=redaction_rate,
        default=0.025,
        legacy_name="redaction_rate",
    )

    if rng is None:
        rng = random.Random(seed)

    clamped_rate = max(0.0, min(effective_rate, 1.0))
    unweighted_flag = bool(unweighted)

    # NOTE(review): the isinstance check presumably rejects non-bool
    # merge_adjacent values that the Rust signature cannot accept — confirm.
    use_rust = _redact_words_rust is not None and isinstance(merge_adjacent, bool)

    if use_rust:
        assert _redact_words_rust is not None  # Type narrowing for mypy
        # Positional argument order must match the Rust extension's signature.
        return cast(
            str,
            _redact_words_rust(
                text,
                replacement_char,
                clamped_rate,
                merge_adjacent,
                unweighted_flag,
                rng,
            ),
        )

    return _python_redact_words(
        text,
        replacement_char=replacement_char,
        rate=clamped_rate,
        merge_adjacent=merge_adjacent,
        rng=rng,
        unweighted=unweighted_flag,
    )
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
class Redactyl(Glitchling):
    """Glitchling that redacts words with block characters."""

    def __init__(
        self,
        *,
        replacement_char: str = FULL_BLOCK,
        rate: float | None = None,
        redaction_rate: float | None = None,
        merge_adjacent: bool = False,
        seed: int = 151,
        unweighted: bool = False,
    ) -> None:
        # "redaction_rate" is the deprecated alias for "rate".
        self._param_aliases = {"redaction_rate": "rate"}
        resolved_rate = resolve_rate(
            rate=rate,
            legacy_value=redaction_rate,
            default=0.025,
            legacy_name="redaction_rate",
        )
        super().__init__(
            name="Redactyl",
            corruption_function=redact_words,
            scope=AttackWave.WORD,
            seed=seed,
            replacement_char=replacement_char,
            rate=resolved_rate,
            merge_adjacent=merge_adjacent,
            unweighted=unweighted,
        )

    def pipeline_operation(self) -> dict[str, Any] | None:
        """Describe this glitchling for the accelerated pipeline, or ``None``
        when any required parameter is missing from ``self.kwargs``."""
        params = self.kwargs
        char = params.get("replacement_char")
        proportion = params.get("rate")
        merge = params.get("merge_adjacent")
        if char is None or proportion is None or merge is None:
            return None
        return {
            "type": "redact",
            "replacement_char": str(char),
            "redaction_rate": float(proportion),
            "merge_adjacent": bool(merge),
            "unweighted": bool(params.get("unweighted", False)),
        }
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
# Default, shared Redactyl instance created at import time.
redactyl = Redactyl()


__all__ = ["Redactyl", "redactyl"]
|