glitchlings 0.2.3__cp310-cp310-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings/__init__.py +42 -0
- glitchlings/__main__.py +9 -0
- glitchlings/_zoo_rust.cpython-310-x86_64-linux-gnu.so +0 -0
- glitchlings/dlc/__init__.py +5 -0
- glitchlings/dlc/huggingface.py +96 -0
- glitchlings/dlc/prime.py +274 -0
- glitchlings/main.py +218 -0
- glitchlings/util/__init__.py +181 -0
- glitchlings/zoo/__init__.py +134 -0
- glitchlings/zoo/_ocr_confusions.py +34 -0
- glitchlings/zoo/_rate.py +21 -0
- glitchlings/zoo/core.py +405 -0
- glitchlings/zoo/jargoyle.py +336 -0
- glitchlings/zoo/mim1c.py +108 -0
- glitchlings/zoo/ocr_confusions.tsv +30 -0
- glitchlings/zoo/redactyl.py +165 -0
- glitchlings/zoo/reduple.py +128 -0
- glitchlings/zoo/rushmore.py +136 -0
- glitchlings/zoo/scannequin.py +171 -0
- glitchlings/zoo/typogre.py +212 -0
- glitchlings-0.2.3.dist-info/METADATA +478 -0
- glitchlings-0.2.3.dist-info/RECORD +26 -0
- glitchlings-0.2.3.dist-info/WHEEL +5 -0
- glitchlings-0.2.3.dist-info/entry_points.txt +2 -0
- glitchlings-0.2.3.dist-info/licenses/LICENSE +201 -0
- glitchlings-0.2.3.dist-info/top_level.txt +1 -0
@@ -0,0 +1,336 @@
|
|
1
|
+
import random
|
2
|
+
import re
|
3
|
+
from collections.abc import Iterable
|
4
|
+
from dataclasses import dataclass
|
5
|
+
from typing import TYPE_CHECKING, Any, Literal, cast
|
6
|
+
|
7
|
+
try: # pragma: no cover - exercised in environments with NLTK installed
|
8
|
+
import nltk # type: ignore[import]
|
9
|
+
except ModuleNotFoundError as exc: # pragma: no cover - triggered when NLTK missing
|
10
|
+
nltk = None # type: ignore[assignment]
|
11
|
+
find = None # type: ignore[assignment]
|
12
|
+
_NLTK_IMPORT_ERROR = exc
|
13
|
+
else: # pragma: no cover - executed when NLTK is available
|
14
|
+
from nltk.corpus.reader import WordNetCorpusReader as _WordNetCorpusReader # type: ignore[import]
|
15
|
+
from nltk.data import find as _nltk_find # type: ignore[import]
|
16
|
+
|
17
|
+
find = _nltk_find
|
18
|
+
_NLTK_IMPORT_ERROR = None
|
19
|
+
|
20
|
+
if TYPE_CHECKING: # pragma: no cover - typing aid only
|
21
|
+
from nltk.corpus.reader import WordNetCorpusReader # type: ignore[import]
|
22
|
+
else: # Use ``Any`` at runtime to avoid hard dependency when NLTK missing
|
23
|
+
WordNetCorpusReader = Any
|
24
|
+
|
25
|
+
if nltk is not None: # pragma: no cover - guarded by import success
|
26
|
+
try:
|
27
|
+
from nltk.corpus import wordnet as _WORDNET_MODULE # type: ignore[import]
|
28
|
+
except ModuleNotFoundError: # pragma: no cover - only hit on namespace packages
|
29
|
+
_WORDNET_MODULE = None
|
30
|
+
else:
|
31
|
+
WordNetCorpusReader = _WordNetCorpusReader # type: ignore[assignment]
|
32
|
+
else:
|
33
|
+
_WORDNET_MODULE = None
|
34
|
+
|
35
|
+
from .core import AttackWave, Glitchling
|
36
|
+
from ._rate import resolve_rate
|
37
|
+
|
38
|
+
_WORDNET_HANDLE: WordNetCorpusReader | Any | None = _WORDNET_MODULE
|
39
|
+
|
40
|
+
_wordnet_ready = False
|
41
|
+
|
42
|
+
|
43
|
+
def _require_nltk() -> None:
    """Raise ``RuntimeError`` unless the optional NLTK dependency imported."""

    if nltk is not None and find is not None:
        return

    error_text = (
        "The NLTK package is required for the jargoyle glitchling; install "
        "the 'wordnet' extra via `pip install glitchlings[wordnet]`."
    )
    # Chain the original ImportError when the module-level import recorded one.
    cause = globals().get("_NLTK_IMPORT_ERROR")
    if cause is not None:
        raise RuntimeError(error_text) from cause
    raise RuntimeError(error_text)
|
54
|
+
|
55
|
+
|
56
|
+
def dependencies_available() -> bool:
    """Report whether the optional NLTK runtime dependency imported cleanly."""

    return not (nltk is None or find is None)
|
60
|
+
|
61
|
+
|
62
|
+
def _load_wordnet_reader() -> WordNetCorpusReader:
    """Build a ``WordNetCorpusReader`` over the locally installed corpus.

    Prefers the unpacked ``corpora/wordnet`` directory and falls back to the
    zipped distribution; raises ``RuntimeError`` when neither can be located.
    """

    _require_nltk()

    try:
        corpus_root = find("corpora/wordnet")
    except LookupError:
        # Unpacked corpus missing; try the zip archive NLTK also supports.
        try:
            archive_root = find("corpora/wordnet.zip")
        except LookupError as exc:
            raise RuntimeError(
                "The NLTK WordNet corpus is not installed; run `nltk.download('wordnet')`."
            ) from exc
        corpus_root = archive_root.join("wordnet/")

    return WordNetCorpusReader(corpus_root, None)
|
79
|
+
|
80
|
+
|
81
|
+
def _wordnet(force_refresh: bool = False) -> WordNetCorpusReader | Any:
    """Return the cached WordNet handle, lazily building it when absent.

    ``force_refresh=True`` resets the cache back to the module-level import
    (which may be ``None``) before the lazy initialisation runs.
    """

    global _WORDNET_HANDLE

    if force_refresh:
        _WORDNET_HANDLE = _WORDNET_MODULE
    if _WORDNET_HANDLE is None:
        _WORDNET_HANDLE = _load_wordnet_reader()
    return _WORDNET_HANDLE
|
94
|
+
|
95
|
+
|
96
|
+
def ensure_wordnet() -> None:
    """Ensure the WordNet corpus is available before use.

    Memoized via the module-level ``_wordnet_ready`` flag so the lookup and
    potential download run at most once per process.

    Raises
    - RuntimeError: If NLTK is missing, or the corpus cannot be loaded even
      after attempting ``nltk.download("wordnet")``.
    """

    global _wordnet_ready
    if _wordnet_ready:
        return

    _require_nltk()

    resource = _wordnet()

    try:
        resource.ensure_loaded()
    except LookupError:
        # Corpus files absent: download quietly, then rebuild the handle so it
        # points at the freshly installed data before retrying.
        nltk.download("wordnet", quiet=True)
        try:
            resource = _wordnet(force_refresh=True)
            resource.ensure_loaded()
        except LookupError as exc:  # pragma: no cover - only triggered when download fails
            raise RuntimeError(
                "Unable to load NLTK WordNet corpus for the jargoyle glitchling."
            ) from exc

    _wordnet_ready = True
|
120
|
+
|
121
|
+
|
122
|
+
# Backwards compatibility for callers relying on the previous private helper name.
|
123
|
+
_ensure_wordnet = ensure_wordnet
|
124
|
+
|
125
|
+
|
126
|
+
PartOfSpeech = Literal["n", "v", "a", "r"]
|
127
|
+
PartOfSpeechInput = PartOfSpeech | Iterable[PartOfSpeech] | Literal["any"]
|
128
|
+
NormalizedPartsOfSpeech = tuple[PartOfSpeech, ...]
|
129
|
+
|
130
|
+
_VALID_POS: tuple[PartOfSpeech, ...] = ("n", "v", "a", "r")
|
131
|
+
|
132
|
+
|
133
|
+
def _split_token(token: str) -> tuple[str, str, str]:
|
134
|
+
"""Split a token into leading punctuation, core word, and trailing punctuation."""
|
135
|
+
|
136
|
+
match = re.match(r"^(\W*)(.*?)(\W*)$", token)
|
137
|
+
if not match:
|
138
|
+
return "", token, ""
|
139
|
+
prefix, core, suffix = match.groups()
|
140
|
+
return prefix, core, suffix
|
141
|
+
|
142
|
+
|
143
|
+
def _normalize_parts_of_speech(part_of_speech: PartOfSpeechInput) -> NormalizedPartsOfSpeech:
    """Validate *part_of_speech* and return it as a tuple of WordNet POS tags.

    Accepts a single tag (case-insensitive), the literal ``"any"`` for all
    four tags, or an iterable of tags; iterable duplicates are dropped while
    preserving first-seen order.
    """

    if isinstance(part_of_speech, str):
        tag = part_of_speech.lower()
        if tag == "any":
            return _VALID_POS
        if tag in _VALID_POS:
            return (cast(PartOfSpeech, tag),)
        raise ValueError(
            "part_of_speech must be one of 'n', 'v', 'a', 'r', or 'any'"
        )

    # dict preserves insertion order, giving ordered de-duplication for free.
    unique: dict[PartOfSpeech, None] = {}
    for tag in part_of_speech:
        if tag not in _VALID_POS:
            raise ValueError(
                "part_of_speech entries must be one of 'n', 'v', 'a', or 'r'"
            )
        unique.setdefault(tag, None)
    if not unique:
        raise ValueError("part_of_speech iterable may not be empty")
    return tuple(unique)
|
167
|
+
|
168
|
+
|
169
|
+
@dataclass(frozen=True)
class CandidateInfo:
    """Metadata for a candidate token that may be replaced."""

    # Punctuation preceding the core word, restored verbatim around any
    # replacement.
    prefix: str
    # The bare word that is looked up in WordNet.
    core_word: str
    # Punctuation following the core word.
    suffix: str
    # Subset of the requested POS tags for which the word has synsets.
    parts_of_speech: NormalizedPartsOfSpeech
|
177
|
+
|
178
|
+
|
179
|
+
def _collect_synonyms(
    word: str, parts_of_speech: NormalizedPartsOfSpeech
) -> list[str]:
    """Gather deterministic synonym candidates for the supplied word.

    POS tags are tried in order.  Within a tag, the first synset offering any
    lemma other than the word itself supplies the candidates, and the search
    stops at the first tag that yields results.  The output is sorted so that
    downstream RNG choices stay reproducible.
    """

    word_lower = word.lower()
    reader = _wordnet()
    found: set[str] = set()

    for tag in parts_of_speech:
        for synset in reader.synsets(word, pos=tag):
            alternates = [
                lemma.name().replace("_", " ")
                for lemma in cast(Any, synset).lemmas()
            ]
            keep = [alt for alt in alternates if alt.lower() != word_lower]
            if keep:
                # First productive synset for this tag wins, for stability.
                found.update(keep)
                break
        if found:
            break

    return sorted(found)
|
211
|
+
|
212
|
+
|
213
|
+
def substitute_random_synonyms(
    text: str,
    rate: float | None = None,
    part_of_speech: PartOfSpeechInput = "n",
    seed: int | None = None,
    rng: random.Random | None = None,
    *,
    replacement_rate: float | None = None,
) -> str:
    """Replace words with random WordNet synonyms.

    Parameters
    - text: Input text.
    - rate: Max proportion of candidate words to replace (default 0.1).
    - part_of_speech: WordNet POS tag(s) to target. Accepts "n", "v", "a", "r",
      any iterable of those tags, or "any" to include all four.
    - rng: Optional RNG instance used for deterministic sampling.
    - seed: Optional seed if `rng` not provided.
    - replacement_rate: Deprecated keyword-only alias for ``rate``.

    Determinism
    - Candidates collected in left-to-right order; no set() reordering.
    - Replacement positions chosen via rng.sample.
    - Synonyms sorted before rng.choice to fix ordering.
    - For each POS, the first synset containing alternate lemmas is used for stability.
    """
    effective_rate = resolve_rate(
        rate=rate,
        legacy_value=replacement_rate,
        default=0.1,
        legacy_name="replacement_rate",
    )

    ensure_wordnet()
    wordnet = _wordnet()

    active_rng: random.Random
    if rng is not None:
        active_rng = rng
    else:
        active_rng = random.Random(seed)

    target_pos = _normalize_parts_of_speech(part_of_speech)

    # Split but keep whitespace separators so we can rebuild easily
    tokens = re.split(r"(\s+)", text)

    # Collect indices of candidate tokens (even positions 0,2,.. are words given our split design)
    candidate_indices: list[int] = []
    candidate_metadata: dict[int, CandidateInfo] = {}
    for idx, tok in enumerate(tokens):
        if idx % 2 == 0 and tok and not tok.isspace():
            prefix, core_word, suffix = _split_token(tok)
            if not core_word:
                continue

            # Only keep the POS tags under which this word actually has
            # synsets; an empty tuple disqualifies the token entirely.
            available_pos: NormalizedPartsOfSpeech = tuple(
                pos for pos in target_pos if wordnet.synsets(core_word, pos=pos)
            )
            if available_pos:
                candidate_indices.append(idx)
                candidate_metadata[idx] = CandidateInfo(
                    prefix=prefix,
                    core_word=core_word,
                    suffix=suffix,
                    parts_of_speech=available_pos,
                )

    if not candidate_indices:
        return text

    # Negative rates are treated as zero; int() truncation means a small rate
    # over few candidates can legitimately produce zero replacements.
    clamped_rate = max(0.0, effective_rate)
    max_replacements = int(len(candidate_indices) * clamped_rate)
    if max_replacements <= 0:
        return text

    # Choose which positions to replace deterministically via rng.sample
    replace_positions = active_rng.sample(candidate_indices, k=max_replacements)
    # Process in ascending order to avoid affecting later indices
    replace_positions.sort()

    for pos in replace_positions:
        metadata = candidate_metadata[pos]
        synonyms = _collect_synonyms(metadata.core_word, metadata.parts_of_speech)
        if not synonyms:
            continue

        replacement = active_rng.choice(synonyms)
        # Re-attach the original surrounding punctuation.
        tokens[pos] = f"{metadata.prefix}{replacement}{metadata.suffix}"

    return "".join(tokens)
|
303
|
+
|
304
|
+
|
305
|
+
class Jargoyle(Glitchling):
    """Glitchling that swaps words with random WordNet synonyms."""

    def __init__(
        self,
        *,
        rate: float | None = None,
        replacement_rate: float | None = None,
        part_of_speech: PartOfSpeechInput = "n",
        seed: int | None = None,
    ) -> None:
        """Configure the glitchling.

        ``replacement_rate`` is the deprecated alias for ``rate``; both are
        funneled through :func:`resolve_rate` (default 0.1).
        """
        # Alias map consumed by the Glitchling base class for legacy kwargs.
        self._param_aliases = {"replacement_rate": "rate"}
        effective_rate = resolve_rate(
            rate=rate,
            legacy_value=replacement_rate,
            default=0.1,
            legacy_name="replacement_rate",
        )
        super().__init__(
            name="Jargoyle",
            corruption_function=substitute_random_synonyms,
            scope=AttackWave.WORD,
            seed=seed,
            rate=effective_rate,
            part_of_speech=part_of_speech,
        )
|
331
|
+
|
332
|
+
|
333
|
+
jargoyle = Jargoyle()
|
334
|
+
|
335
|
+
|
336
|
+
__all__ = ["Jargoyle", "dependencies_available", "ensure_wordnet", "jargoyle"]
|
glitchlings/zoo/mim1c.py
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
from collections.abc import Collection
|
2
|
+
import random
|
3
|
+
from typing import Literal
|
4
|
+
|
5
|
+
from confusable_homoglyphs import confusables
|
6
|
+
|
7
|
+
from .core import AttackOrder, AttackWave, Glitchling
|
8
|
+
from ._rate import resolve_rate
|
9
|
+
|
10
|
+
|
11
|
+
def swap_homoglyphs(
    text: str,
    rate: float | None = None,
    classes: list[str] | Literal["all"] | None = None,
    banned_characters: Collection[str] | None = None,
    seed: int | None = None,
    rng: random.Random | None = None,
    *,
    replacement_rate: float | None = None,
) -> str:
    """Replace characters with visually confusable homoglyphs.

    Parameters
    - text: Input text.
    - rate: Max proportion of eligible characters to replace (default 0.02).
    - classes: Restrict replacements to these Unicode script classes (default ["LATIN","GREEK","CYRILLIC"]). Use "all" to allow any.
    - banned_characters: Characters that must never appear as replacements.
    - seed: Optional seed if `rng` not provided.
    - rng: Optional RNG; overrides seed.
    - replacement_rate: Deprecated keyword-only alias for ``rate``.

    Notes
    - Only replaces characters present in confusables.confusables_data with single-codepoint alternatives.
    - Maintains determinism by shuffling candidates and sampling via the provided RNG.
    """
    effective_rate = resolve_rate(
        rate=rate,
        legacy_value=replacement_rate,
        default=0.02,
        legacy_name="replacement_rate",
    )

    if rng is None:
        rng = random.Random(seed)

    if classes is None:
        classes = ["LATIN", "GREEK", "CYRILLIC"]

    # Candidates are character *values* (with multiplicity), not positions.
    target_chars = [char for char in text if char.isalnum()]
    confusable_chars = [
        char for char in target_chars if char in confusables.confusables_data
    ]
    clamped_rate = max(0.0, effective_rate)
    num_replacements = int(len(confusable_chars) * clamped_rate)
    done = 0
    rng.shuffle(confusable_chars)
    banned_set = set(banned_characters or ())
    for char in confusable_chars:
        if done >= num_replacements:
            break
        # Only single-codepoint confusables preserve the text length.
        options = [
            o["c"] for o in confusables.confusables_data[char] if len(o["c"]) == 1
        ]
        if classes != "all":
            options = [opt for opt in options if confusables.alias(opt) in classes]
        if banned_set:
            options = [opt for opt in options if opt not in banned_set]
        if not options:
            continue
        # NOTE(review): this replaces the first remaining occurrence of
        # ``char`` in the text, not a specific sampled position — duplicates
        # of the same character are therefore swapped left-to-right. Confirm
        # this is the intended sampling semantics.
        text = text.replace(char, rng.choice(options), 1)
        done += 1
    return text
|
72
|
+
|
73
|
+
|
74
|
+
class Mim1c(Glitchling):
    """Glitchling that swaps characters for visually similar homoglyphs."""

    def __init__(
        self,
        *,
        rate: float | None = None,
        replacement_rate: float | None = None,
        classes: list[str] | Literal["all"] | None = None,
        banned_characters: Collection[str] | None = None,
        seed: int | None = None,
    ) -> None:
        """Configure the glitchling.

        ``replacement_rate`` is the deprecated alias for ``rate``; both are
        funneled through :func:`resolve_rate` (default 0.02).
        """
        # Alias map consumed by the Glitchling base class for legacy kwargs.
        self._param_aliases = {"replacement_rate": "rate"}
        effective_rate = resolve_rate(
            rate=rate,
            legacy_value=replacement_rate,
            default=0.02,
            legacy_name="replacement_rate",
        )
        super().__init__(
            name="Mim1c",
            corruption_function=swap_homoglyphs,
            scope=AttackWave.CHARACTER,
            # Runs last so other glitchlings see unmodified codepoints first.
            order=AttackOrder.LAST,
            seed=seed,
            rate=effective_rate,
            classes=classes,
            banned_characters=banned_characters,
        )
|
103
|
+
|
104
|
+
|
105
|
+
mim1c = Mim1c()
|
106
|
+
|
107
|
+
|
108
|
+
__all__ = ["Mim1c", "mim1c"]
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# Source Replacements (space-separated)
|
2
|
+
li h
|
3
|
+
h li
|
4
|
+
rn m
|
5
|
+
m rn
|
6
|
+
cl d
|
7
|
+
d cl
|
8
|
+
I l
|
9
|
+
l I 1
|
10
|
+
1 l I
|
11
|
+
0 O
|
12
|
+
O 0
|
13
|
+
B 8
|
14
|
+
8 B
|
15
|
+
S 5
|
16
|
+
5 S
|
17
|
+
Z 2
|
18
|
+
2 Z
|
19
|
+
G 6
|
20
|
+
6 G
|
21
|
+
“ "
|
22
|
+
” "
|
23
|
+
‘ '
|
24
|
+
’ '
|
25
|
+
— -
|
26
|
+
– -
|
27
|
+
vv w
|
28
|
+
w vv
|
29
|
+
ri n
|
30
|
+
n ri
|
@@ -0,0 +1,165 @@
|
|
1
|
+
import re
|
2
|
+
import random
|
3
|
+
from typing import Any
|
4
|
+
|
5
|
+
from .core import Glitchling, AttackWave
|
6
|
+
from ._rate import resolve_rate
|
7
|
+
|
8
|
+
FULL_BLOCK = "█"
|
9
|
+
|
10
|
+
|
11
|
+
try:
|
12
|
+
from glitchlings._zoo_rust import redact_words as _redact_words_rust
|
13
|
+
except ImportError: # pragma: no cover - compiled extension not present
|
14
|
+
_redact_words_rust = None
|
15
|
+
|
16
|
+
|
17
|
+
def _python_redact_words(
|
18
|
+
text: str,
|
19
|
+
*,
|
20
|
+
replacement_char: str,
|
21
|
+
rate: float,
|
22
|
+
merge_adjacent: bool,
|
23
|
+
rng: random.Random,
|
24
|
+
) -> str:
|
25
|
+
"""Redact random words by replacing their characters.
|
26
|
+
|
27
|
+
Parameters
|
28
|
+
- text: Input text.
|
29
|
+
- replacement_char: The character to use for redaction (default FULL_BLOCK).
|
30
|
+
- rate: Max proportion of words to redact (default 0.05).
|
31
|
+
- merge_adjacent: If True, merges adjacent redactions across intervening non-word chars.
|
32
|
+
- seed: Seed used if `rng` not provided (default 151).
|
33
|
+
- rng: Optional RNG; overrides seed.
|
34
|
+
"""
|
35
|
+
# Preserve exact spacing and punctuation by using regex
|
36
|
+
tokens = re.split(r"(\s+)", text)
|
37
|
+
word_indices = [i for i, token in enumerate(tokens) if i % 2 == 0 and token.strip()]
|
38
|
+
if not word_indices:
|
39
|
+
raise ValueError("Cannot redact words because the input text contains no redactable words.")
|
40
|
+
num_to_redact = max(1, int(len(word_indices) * rate))
|
41
|
+
|
42
|
+
# Sample from the indices of actual words
|
43
|
+
indices_to_redact = rng.sample(word_indices, k=num_to_redact)
|
44
|
+
indices_to_redact.sort()
|
45
|
+
|
46
|
+
for i in indices_to_redact:
|
47
|
+
if i >= len(tokens):
|
48
|
+
break
|
49
|
+
|
50
|
+
word = tokens[i]
|
51
|
+
if not word or word.isspace(): # Skip empty or whitespace
|
52
|
+
continue
|
53
|
+
|
54
|
+
# Check if word has trailing punctuation
|
55
|
+
match = re.match(r"^(\W*)(.*?)(\W*)$", word)
|
56
|
+
if match:
|
57
|
+
prefix, core, suffix = match.groups()
|
58
|
+
tokens[i] = f"{prefix}{replacement_char * len(core)}{suffix}"
|
59
|
+
else:
|
60
|
+
tokens[i] = f"{replacement_char * len(word)}"
|
61
|
+
|
62
|
+
text = "".join(tokens)
|
63
|
+
|
64
|
+
if merge_adjacent:
|
65
|
+
text = re.sub(
|
66
|
+
rf"{replacement_char}\W+{replacement_char}",
|
67
|
+
lambda m: replacement_char * (len(m.group(0)) - 1),
|
68
|
+
text,
|
69
|
+
)
|
70
|
+
|
71
|
+
return text
|
72
|
+
|
73
|
+
|
74
|
+
def redact_words(
    text: str,
    replacement_char: str = FULL_BLOCK,
    rate: float | None = None,
    merge_adjacent: bool = False,
    seed: int = 151,
    rng: random.Random | None = None,
    *,
    redaction_rate: float | None = None,
) -> str:
    """Redact random words, delegating to the Rust extension when available.

    ``redaction_rate`` is the deprecated alias for ``rate`` (default 0.05);
    the effective value is clamped to be non-negative before dispatch.
    """

    effective_rate = resolve_rate(
        rate=rate,
        legacy_value=redaction_rate,
        default=0.05,
        legacy_name="redaction_rate",
    )

    active_rng = rng if rng is not None else random.Random(seed)
    clamped = max(0.0, effective_rate)

    # The compiled path expects a strict bool for ``merge_adjacent``.
    if _redact_words_rust is not None and isinstance(merge_adjacent, bool):
        return _redact_words_rust(
            text,
            replacement_char,
            clamped,
            merge_adjacent,
            active_rng,
        )

    return _python_redact_words(
        text,
        replacement_char=replacement_char,
        rate=clamped,
        merge_adjacent=merge_adjacent,
        rng=active_rng,
    )
|
116
|
+
|
117
|
+
|
118
|
+
class Redactyl(Glitchling):
    """Glitchling that redacts words with block characters."""

    def __init__(
        self,
        *,
        replacement_char: str = FULL_BLOCK,
        rate: float | None = None,
        redaction_rate: float | None = None,
        merge_adjacent: bool = False,
        seed: int = 151,
    ) -> None:
        """Configure the glitchling.

        ``redaction_rate`` is the deprecated alias for ``rate``; both are
        funneled through :func:`resolve_rate` (default 0.05).
        """
        # Alias map consumed by the Glitchling base class for legacy kwargs.
        self._param_aliases = {"redaction_rate": "rate"}
        effective_rate = resolve_rate(
            rate=rate,
            legacy_value=redaction_rate,
            default=0.05,
            legacy_name="redaction_rate",
        )
        super().__init__(
            name="Redactyl",
            corruption_function=redact_words,
            scope=AttackWave.WORD,
            seed=seed,
            replacement_char=replacement_char,
            rate=effective_rate,
            merge_adjacent=merge_adjacent,
        )

    def pipeline_operation(self) -> dict[str, Any] | None:
        """Describe this glitchling as a serializable "redact" operation.

        Returns ``None`` when any required kwarg is missing — presumably so
        callers skip the fused pipeline path; confirm against core.
        """
        replacement_char = self.kwargs.get("replacement_char")
        rate = self.kwargs.get("rate")
        merge_adjacent = self.kwargs.get("merge_adjacent")
        if replacement_char is None or rate is None or merge_adjacent is None:
            return None
        # Note: the serialized key is the legacy name "redaction_rate".
        return {
            "type": "redact",
            "replacement_char": str(replacement_char),
            "redaction_rate": float(rate),
            "merge_adjacent": bool(merge_adjacent),
        }
|
159
|
+
|
160
|
+
|
161
|
+
|
162
|
+
redactyl = Redactyl()
|
163
|
+
|
164
|
+
|
165
|
+
__all__ = ["Redactyl", "redactyl"]
|