glitchlings 0.2.1__cp312-cp312-win_amd64.whl → 0.2.3__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings/_zoo_rust.cp312-win_amd64.pyd +0 -0
- glitchlings/dlc/prime.py +44 -22
- glitchlings/main.py +17 -39
- glitchlings/util/__init__.py +30 -0
- glitchlings/zoo/__init__.py +96 -19
- glitchlings/zoo/_ocr_confusions.py +34 -0
- glitchlings/zoo/_rate.py +21 -0
- glitchlings/zoo/core.py +56 -52
- glitchlings/zoo/jargoyle.py +77 -16
- glitchlings/zoo/mim1c.py +24 -5
- glitchlings/zoo/ocr_confusions.tsv +30 -0
- glitchlings/zoo/redactyl.py +46 -9
- glitchlings/zoo/reduple.py +36 -8
- glitchlings/zoo/rushmore.py +40 -8
- glitchlings/zoo/scannequin.py +42 -37
- glitchlings/zoo/typogre.py +36 -8
- {glitchlings-0.2.1.dist-info → glitchlings-0.2.3.dist-info}/METADATA +28 -61
- glitchlings-0.2.3.dist-info/RECORD +26 -0
- glitchlings/_typogre_rust.cp312-win_amd64.pyd +0 -0
- glitchlings-0.2.1.dist-info/RECORD +0 -24
- {glitchlings-0.2.1.dist-info → glitchlings-0.2.3.dist-info}/WHEEL +0 -0
- {glitchlings-0.2.1.dist-info → glitchlings-0.2.3.dist-info}/entry_points.txt +0 -0
- {glitchlings-0.2.1.dist-info → glitchlings-0.2.3.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.2.1.dist-info → glitchlings-0.2.3.dist-info}/top_level.txt +0 -0
glitchlings/zoo/jargoyle.py
CHANGED
@@ -2,27 +2,68 @@ import random
|
|
2
2
|
import re
|
3
3
|
from collections.abc import Iterable
|
4
4
|
from dataclasses import dataclass
|
5
|
-
from typing import Any, Literal, cast
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
5
|
+
from typing import TYPE_CHECKING, Any, Literal, cast
|
6
|
+
|
7
|
+
try: # pragma: no cover - exercised in environments with NLTK installed
|
8
|
+
import nltk # type: ignore[import]
|
9
|
+
except ModuleNotFoundError as exc: # pragma: no cover - triggered when NLTK missing
|
10
|
+
nltk = None # type: ignore[assignment]
|
11
|
+
find = None # type: ignore[assignment]
|
12
|
+
_NLTK_IMPORT_ERROR = exc
|
13
|
+
else: # pragma: no cover - executed when NLTK is available
|
14
|
+
from nltk.corpus.reader import WordNetCorpusReader as _WordNetCorpusReader # type: ignore[import]
|
15
|
+
from nltk.data import find as _nltk_find # type: ignore[import]
|
16
|
+
|
17
|
+
find = _nltk_find
|
18
|
+
_NLTK_IMPORT_ERROR = None
|
19
|
+
|
20
|
+
if TYPE_CHECKING: # pragma: no cover - typing aid only
|
21
|
+
from nltk.corpus.reader import WordNetCorpusReader # type: ignore[import]
|
22
|
+
else: # Use ``Any`` at runtime to avoid hard dependency when NLTK missing
|
23
|
+
WordNetCorpusReader = Any
|
24
|
+
|
25
|
+
if nltk is not None: # pragma: no cover - guarded by import success
|
26
|
+
try:
|
27
|
+
from nltk.corpus import wordnet as _WORDNET_MODULE # type: ignore[import]
|
28
|
+
except ModuleNotFoundError: # pragma: no cover - only hit on namespace packages
|
29
|
+
_WORDNET_MODULE = None
|
30
|
+
else:
|
31
|
+
WordNetCorpusReader = _WordNetCorpusReader # type: ignore[assignment]
|
32
|
+
else:
|
33
|
+
_WORDNET_MODULE = None
|
10
34
|
|
11
35
|
from .core import AttackWave, Glitchling
|
12
|
-
|
13
|
-
try: # pragma: no cover - exercised when the namespace package is present
|
14
|
-
from nltk.corpus import wordnet as _WORDNET_MODULE
|
15
|
-
except ModuleNotFoundError: # pragma: no cover - triggered on modern NLTK installs
|
16
|
-
_WORDNET_MODULE = None
|
36
|
+
from ._rate import resolve_rate
|
17
37
|
|
18
38
|
_WORDNET_HANDLE: WordNetCorpusReader | Any | None = _WORDNET_MODULE
|
19
39
|
|
20
40
|
_wordnet_ready = False
|
21
41
|
|
22
42
|
|
43
|
+
def _require_nltk() -> None:
|
44
|
+
"""Ensure the NLTK dependency is present before continuing."""
|
45
|
+
|
46
|
+
if nltk is None or find is None:
|
47
|
+
message = (
|
48
|
+
"The NLTK package is required for the jargoyle glitchling; install "
|
49
|
+
"the 'wordnet' extra via `pip install glitchlings[wordnet]`."
|
50
|
+
)
|
51
|
+
if '_NLTK_IMPORT_ERROR' in globals() and _NLTK_IMPORT_ERROR is not None:
|
52
|
+
raise RuntimeError(message) from _NLTK_IMPORT_ERROR
|
53
|
+
raise RuntimeError(message)
|
54
|
+
|
55
|
+
|
56
|
+
def dependencies_available() -> bool:
|
57
|
+
"""Return ``True`` when the runtime NLTK dependency is present."""
|
58
|
+
|
59
|
+
return nltk is not None and find is not None
|
60
|
+
|
61
|
+
|
23
62
|
def _load_wordnet_reader() -> WordNetCorpusReader:
|
24
63
|
"""Return a WordNet corpus reader from the downloaded corpus files."""
|
25
64
|
|
65
|
+
_require_nltk()
|
66
|
+
|
26
67
|
try:
|
27
68
|
root = find("corpora/wordnet")
|
28
69
|
except LookupError:
|
@@ -59,6 +100,8 @@ def ensure_wordnet() -> None:
|
|
59
100
|
if _wordnet_ready:
|
60
101
|
return
|
61
102
|
|
103
|
+
_require_nltk()
|
104
|
+
|
62
105
|
resource = _wordnet()
|
63
106
|
|
64
107
|
try:
|
@@ -169,16 +212,18 @@ def _collect_synonyms(
|
|
169
212
|
|
170
213
|
def substitute_random_synonyms(
|
171
214
|
text: str,
|
172
|
-
|
215
|
+
rate: float | None = None,
|
173
216
|
part_of_speech: PartOfSpeechInput = "n",
|
174
217
|
seed: int | None = None,
|
175
218
|
rng: random.Random | None = None,
|
219
|
+
*,
|
220
|
+
replacement_rate: float | None = None,
|
176
221
|
) -> str:
|
177
222
|
"""Replace words with random WordNet synonyms.
|
178
223
|
|
179
224
|
Parameters
|
180
225
|
- text: Input text.
|
181
|
-
-
|
226
|
+
- rate: Max proportion of candidate words to replace (default 0.1).
|
182
227
|
- part_of_speech: WordNet POS tag(s) to target. Accepts "n", "v", "a", "r",
|
183
228
|
any iterable of those tags, or "any" to include all four.
|
184
229
|
- rng: Optional RNG instance used for deterministic sampling.
|
@@ -190,6 +235,13 @@ def substitute_random_synonyms(
|
|
190
235
|
- Synonyms sorted before rng.choice to fix ordering.
|
191
236
|
- For each POS, the first synset containing alternate lemmas is used for stability.
|
192
237
|
"""
|
238
|
+
effective_rate = resolve_rate(
|
239
|
+
rate=rate,
|
240
|
+
legacy_value=replacement_rate,
|
241
|
+
default=0.1,
|
242
|
+
legacy_name="replacement_rate",
|
243
|
+
)
|
244
|
+
|
193
245
|
ensure_wordnet()
|
194
246
|
wordnet = _wordnet()
|
195
247
|
|
@@ -228,7 +280,8 @@ def substitute_random_synonyms(
|
|
228
280
|
if not candidate_indices:
|
229
281
|
return text
|
230
282
|
|
231
|
-
|
283
|
+
clamped_rate = max(0.0, effective_rate)
|
284
|
+
max_replacements = int(len(candidate_indices) * clamped_rate)
|
232
285
|
if max_replacements <= 0:
|
233
286
|
return text
|
234
287
|
|
@@ -255,16 +308,24 @@ class Jargoyle(Glitchling):
|
|
255
308
|
def __init__(
|
256
309
|
self,
|
257
310
|
*,
|
258
|
-
|
311
|
+
rate: float | None = None,
|
312
|
+
replacement_rate: float | None = None,
|
259
313
|
part_of_speech: PartOfSpeechInput = "n",
|
260
314
|
seed: int | None = None,
|
261
315
|
) -> None:
|
316
|
+
self._param_aliases = {"replacement_rate": "rate"}
|
317
|
+
effective_rate = resolve_rate(
|
318
|
+
rate=rate,
|
319
|
+
legacy_value=replacement_rate,
|
320
|
+
default=0.1,
|
321
|
+
legacy_name="replacement_rate",
|
322
|
+
)
|
262
323
|
super().__init__(
|
263
324
|
name="Jargoyle",
|
264
325
|
corruption_function=substitute_random_synonyms,
|
265
326
|
scope=AttackWave.WORD,
|
266
327
|
seed=seed,
|
267
|
-
|
328
|
+
rate=effective_rate,
|
268
329
|
part_of_speech=part_of_speech,
|
269
330
|
)
|
270
331
|
|
@@ -272,4 +333,4 @@ class Jargoyle(Glitchling):
|
|
272
333
|
jargoyle = Jargoyle()
|
273
334
|
|
274
335
|
|
275
|
-
__all__ = ["Jargoyle", "ensure_wordnet", "jargoyle"]
|
336
|
+
__all__ = ["Jargoyle", "dependencies_available", "ensure_wordnet", "jargoyle"]
|
glitchlings/zoo/mim1c.py
CHANGED
@@ -5,21 +5,24 @@ from typing import Literal
|
|
5
5
|
from confusable_homoglyphs import confusables
|
6
6
|
|
7
7
|
from .core import AttackOrder, AttackWave, Glitchling
|
8
|
+
from ._rate import resolve_rate
|
8
9
|
|
9
10
|
|
10
11
|
def swap_homoglyphs(
|
11
12
|
text: str,
|
12
|
-
|
13
|
+
rate: float | None = None,
|
13
14
|
classes: list[str] | Literal["all"] | None = None,
|
14
15
|
banned_characters: Collection[str] | None = None,
|
15
16
|
seed: int | None = None,
|
16
17
|
rng: random.Random | None = None,
|
18
|
+
*,
|
19
|
+
replacement_rate: float | None = None,
|
17
20
|
) -> str:
|
18
21
|
"""Replace characters with visually confusable homoglyphs.
|
19
22
|
|
20
23
|
Parameters
|
21
24
|
- text: Input text.
|
22
|
-
-
|
25
|
+
- rate: Max proportion of eligible characters to replace (default 0.02).
|
23
26
|
- classes: Restrict replacements to these Unicode script classes (default ["LATIN","GREEK","CYRILLIC"]). Use "all" to allow any.
|
24
27
|
- banned_characters: Characters that must never appear as replacements.
|
25
28
|
- seed: Optional seed if `rng` not provided.
|
@@ -29,6 +32,13 @@ def swap_homoglyphs(
|
|
29
32
|
- Only replaces characters present in confusables.confusables_data with single-codepoint alternatives.
|
30
33
|
- Maintains determinism by shuffling candidates and sampling via the provided RNG.
|
31
34
|
"""
|
35
|
+
effective_rate = resolve_rate(
|
36
|
+
rate=rate,
|
37
|
+
legacy_value=replacement_rate,
|
38
|
+
default=0.02,
|
39
|
+
legacy_name="replacement_rate",
|
40
|
+
)
|
41
|
+
|
32
42
|
if rng is None:
|
33
43
|
rng = random.Random(seed)
|
34
44
|
|
@@ -39,7 +49,8 @@ def swap_homoglyphs(
|
|
39
49
|
confusable_chars = [
|
40
50
|
char for char in target_chars if char in confusables.confusables_data
|
41
51
|
]
|
42
|
-
|
52
|
+
clamped_rate = max(0.0, effective_rate)
|
53
|
+
num_replacements = int(len(confusable_chars) * clamped_rate)
|
43
54
|
done = 0
|
44
55
|
rng.shuffle(confusable_chars)
|
45
56
|
banned_set = set(banned_characters or ())
|
@@ -66,18 +77,26 @@ class Mim1c(Glitchling):
|
|
66
77
|
def __init__(
|
67
78
|
self,
|
68
79
|
*,
|
69
|
-
|
80
|
+
rate: float | None = None,
|
81
|
+
replacement_rate: float | None = None,
|
70
82
|
classes: list[str] | Literal["all"] | None = None,
|
71
83
|
banned_characters: Collection[str] | None = None,
|
72
84
|
seed: int | None = None,
|
73
85
|
) -> None:
|
86
|
+
self._param_aliases = {"replacement_rate": "rate"}
|
87
|
+
effective_rate = resolve_rate(
|
88
|
+
rate=rate,
|
89
|
+
legacy_value=replacement_rate,
|
90
|
+
default=0.02,
|
91
|
+
legacy_name="replacement_rate",
|
92
|
+
)
|
74
93
|
super().__init__(
|
75
94
|
name="Mim1c",
|
76
95
|
corruption_function=swap_homoglyphs,
|
77
96
|
scope=AttackWave.CHARACTER,
|
78
97
|
order=AttackOrder.LAST,
|
79
98
|
seed=seed,
|
80
|
-
|
99
|
+
rate=effective_rate,
|
81
100
|
classes=classes,
|
82
101
|
banned_characters=banned_characters,
|
83
102
|
)
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# Source Replacements (space-separated)
|
2
|
+
li h
|
3
|
+
h li
|
4
|
+
rn m
|
5
|
+
m rn
|
6
|
+
cl d
|
7
|
+
d cl
|
8
|
+
I l
|
9
|
+
l I 1
|
10
|
+
1 l I
|
11
|
+
0 O
|
12
|
+
O 0
|
13
|
+
B 8
|
14
|
+
8 B
|
15
|
+
S 5
|
16
|
+
5 S
|
17
|
+
Z 2
|
18
|
+
2 Z
|
19
|
+
G 6
|
20
|
+
6 G
|
21
|
+
“ "
|
22
|
+
” "
|
23
|
+
‘ '
|
24
|
+
’ '
|
25
|
+
— -
|
26
|
+
– -
|
27
|
+
vv w
|
28
|
+
w vv
|
29
|
+
ri n
|
30
|
+
n ri
|
glitchlings/zoo/redactyl.py
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
import re
|
2
2
|
import random
|
3
|
+
from typing import Any
|
3
4
|
|
4
5
|
from .core import Glitchling, AttackWave
|
6
|
+
from ._rate import resolve_rate
|
5
7
|
|
6
8
|
FULL_BLOCK = "█"
|
7
9
|
|
@@ -16,7 +18,7 @@ def _python_redact_words(
|
|
16
18
|
text: str,
|
17
19
|
*,
|
18
20
|
replacement_char: str,
|
19
|
-
|
21
|
+
rate: float,
|
20
22
|
merge_adjacent: bool,
|
21
23
|
rng: random.Random,
|
22
24
|
) -> str:
|
@@ -25,7 +27,7 @@ def _python_redact_words(
|
|
25
27
|
Parameters
|
26
28
|
- text: Input text.
|
27
29
|
- replacement_char: The character to use for redaction (default FULL_BLOCK).
|
28
|
-
-
|
30
|
+
- rate: Max proportion of words to redact (default 0.05).
|
29
31
|
- merge_adjacent: If True, merges adjacent redactions across intervening non-word chars.
|
30
32
|
- seed: Seed used if `rng` not provided (default 151).
|
31
33
|
- rng: Optional RNG; overrides seed.
|
@@ -35,7 +37,7 @@ def _python_redact_words(
|
|
35
37
|
word_indices = [i for i, token in enumerate(tokens) if i % 2 == 0 and token.strip()]
|
36
38
|
if not word_indices:
|
37
39
|
raise ValueError("Cannot redact words because the input text contains no redactable words.")
|
38
|
-
num_to_redact = max(1, int(len(word_indices) *
|
40
|
+
num_to_redact = max(1, int(len(word_indices) * rate))
|
39
41
|
|
40
42
|
# Sample from the indices of actual words
|
41
43
|
indices_to_redact = rng.sample(word_indices, k=num_to_redact)
|
@@ -72,21 +74,34 @@ def _python_redact_words(
|
|
72
74
|
def redact_words(
|
73
75
|
text: str,
|
74
76
|
replacement_char: str = FULL_BLOCK,
|
75
|
-
|
77
|
+
rate: float | None = None,
|
76
78
|
merge_adjacent: bool = False,
|
77
79
|
seed: int = 151,
|
78
80
|
rng: random.Random | None = None,
|
81
|
+
*,
|
82
|
+
redaction_rate: float | None = None,
|
79
83
|
) -> str:
|
80
84
|
"""Redact random words by replacing their characters."""
|
81
85
|
|
86
|
+
effective_rate = resolve_rate(
|
87
|
+
rate=rate,
|
88
|
+
legacy_value=redaction_rate,
|
89
|
+
default=0.05,
|
90
|
+
legacy_name="redaction_rate",
|
91
|
+
)
|
92
|
+
|
82
93
|
if rng is None:
|
83
94
|
rng = random.Random(seed)
|
84
95
|
|
85
|
-
|
96
|
+
clamped_rate = max(0.0, effective_rate)
|
97
|
+
|
98
|
+
use_rust = _redact_words_rust is not None and isinstance(merge_adjacent, bool)
|
99
|
+
|
100
|
+
if use_rust:
|
86
101
|
return _redact_words_rust(
|
87
102
|
text,
|
88
103
|
replacement_char,
|
89
|
-
|
104
|
+
clamped_rate,
|
90
105
|
merge_adjacent,
|
91
106
|
rng,
|
92
107
|
)
|
@@ -94,7 +109,7 @@ def redact_words(
|
|
94
109
|
return _python_redact_words(
|
95
110
|
text,
|
96
111
|
replacement_char=replacement_char,
|
97
|
-
|
112
|
+
rate=clamped_rate,
|
98
113
|
merge_adjacent=merge_adjacent,
|
99
114
|
rng=rng,
|
100
115
|
)
|
@@ -107,20 +122,42 @@ class Redactyl(Glitchling):
|
|
107
122
|
self,
|
108
123
|
*,
|
109
124
|
replacement_char: str = FULL_BLOCK,
|
110
|
-
|
125
|
+
rate: float | None = None,
|
126
|
+
redaction_rate: float | None = None,
|
111
127
|
merge_adjacent: bool = False,
|
112
128
|
seed: int = 151,
|
113
129
|
) -> None:
|
130
|
+
self._param_aliases = {"redaction_rate": "rate"}
|
131
|
+
effective_rate = resolve_rate(
|
132
|
+
rate=rate,
|
133
|
+
legacy_value=redaction_rate,
|
134
|
+
default=0.05,
|
135
|
+
legacy_name="redaction_rate",
|
136
|
+
)
|
114
137
|
super().__init__(
|
115
138
|
name="Redactyl",
|
116
139
|
corruption_function=redact_words,
|
117
140
|
scope=AttackWave.WORD,
|
118
141
|
seed=seed,
|
119
142
|
replacement_char=replacement_char,
|
120
|
-
|
143
|
+
rate=effective_rate,
|
121
144
|
merge_adjacent=merge_adjacent,
|
122
145
|
)
|
123
146
|
|
147
|
+
def pipeline_operation(self) -> dict[str, Any] | None:
|
148
|
+
replacement_char = self.kwargs.get("replacement_char")
|
149
|
+
rate = self.kwargs.get("rate")
|
150
|
+
merge_adjacent = self.kwargs.get("merge_adjacent")
|
151
|
+
if replacement_char is None or rate is None or merge_adjacent is None:
|
152
|
+
return None
|
153
|
+
return {
|
154
|
+
"type": "redact",
|
155
|
+
"replacement_char": str(replacement_char),
|
156
|
+
"redaction_rate": float(rate),
|
157
|
+
"merge_adjacent": bool(merge_adjacent),
|
158
|
+
}
|
159
|
+
|
160
|
+
|
124
161
|
|
125
162
|
redactyl = Redactyl()
|
126
163
|
|
glitchlings/zoo/reduple.py
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
import re
|
2
2
|
import random
|
3
|
+
from typing import Any
|
3
4
|
|
4
5
|
from .core import Glitchling, AttackWave
|
6
|
+
from ._rate import resolve_rate
|
5
7
|
|
6
8
|
try:
|
7
9
|
from glitchlings._zoo_rust import reduplicate_words as _reduplicate_words_rust
|
@@ -12,14 +14,14 @@ except ImportError: # pragma: no cover - compiled extension not present
|
|
12
14
|
def _python_reduplicate_words(
|
13
15
|
text: str,
|
14
16
|
*,
|
15
|
-
|
17
|
+
rate: float,
|
16
18
|
rng: random.Random,
|
17
19
|
) -> str:
|
18
20
|
"""Randomly reduplicate words in the text.
|
19
21
|
|
20
22
|
Parameters
|
21
23
|
- text: Input text.
|
22
|
-
-
|
24
|
+
- rate: Max proportion of words to reduplicate (default 0.05).
|
23
25
|
- seed: Optional seed if `rng` not provided.
|
24
26
|
- rng: Optional RNG; overrides seed.
|
25
27
|
|
@@ -39,7 +41,7 @@ def _python_reduplicate_words(
|
|
39
41
|
continue
|
40
42
|
|
41
43
|
# Only consider actual words for reduplication
|
42
|
-
if rng.random() <
|
44
|
+
if rng.random() < rate:
|
43
45
|
# Check if word has trailing punctuation
|
44
46
|
match = re.match(r"^(\W*)(.*?)(\W*)$", word)
|
45
47
|
if match:
|
@@ -53,9 +55,11 @@ def _python_reduplicate_words(
|
|
53
55
|
|
54
56
|
def reduplicate_words(
|
55
57
|
text: str,
|
56
|
-
|
58
|
+
rate: float | None = None,
|
57
59
|
seed: int | None = None,
|
58
60
|
rng: random.Random | None = None,
|
61
|
+
*,
|
62
|
+
reduplication_rate: float | None = None,
|
59
63
|
) -> str:
|
60
64
|
"""Randomly reduplicate words in the text.
|
61
65
|
|
@@ -63,15 +67,24 @@ def reduplicate_words(
|
|
63
67
|
extension is unavailable.
|
64
68
|
"""
|
65
69
|
|
70
|
+
effective_rate = resolve_rate(
|
71
|
+
rate=rate,
|
72
|
+
legacy_value=reduplication_rate,
|
73
|
+
default=0.05,
|
74
|
+
legacy_name="reduplication_rate",
|
75
|
+
)
|
76
|
+
|
66
77
|
if rng is None:
|
67
78
|
rng = random.Random(seed)
|
68
79
|
|
80
|
+
clamped_rate = max(0.0, effective_rate)
|
81
|
+
|
69
82
|
if _reduplicate_words_rust is not None:
|
70
|
-
return _reduplicate_words_rust(text,
|
83
|
+
return _reduplicate_words_rust(text, clamped_rate, rng)
|
71
84
|
|
72
85
|
return _python_reduplicate_words(
|
73
86
|
text,
|
74
|
-
|
87
|
+
rate=clamped_rate,
|
75
88
|
rng=rng,
|
76
89
|
)
|
77
90
|
|
@@ -82,17 +95,32 @@ class Reduple(Glitchling):
|
|
82
95
|
def __init__(
|
83
96
|
self,
|
84
97
|
*,
|
85
|
-
|
98
|
+
rate: float | None = None,
|
99
|
+
reduplication_rate: float | None = None,
|
86
100
|
seed: int | None = None,
|
87
101
|
) -> None:
|
102
|
+
self._param_aliases = {"reduplication_rate": "rate"}
|
103
|
+
effective_rate = resolve_rate(
|
104
|
+
rate=rate,
|
105
|
+
legacy_value=reduplication_rate,
|
106
|
+
default=0.05,
|
107
|
+
legacy_name="reduplication_rate",
|
108
|
+
)
|
88
109
|
super().__init__(
|
89
110
|
name="Reduple",
|
90
111
|
corruption_function=reduplicate_words,
|
91
112
|
scope=AttackWave.WORD,
|
92
113
|
seed=seed,
|
93
|
-
|
114
|
+
rate=effective_rate,
|
94
115
|
)
|
95
116
|
|
117
|
+
def pipeline_operation(self) -> dict[str, Any] | None:
|
118
|
+
rate = self.kwargs.get("rate")
|
119
|
+
if rate is None:
|
120
|
+
return None
|
121
|
+
return {"type": "reduplicate", "reduplication_rate": float(rate)}
|
122
|
+
|
123
|
+
|
96
124
|
|
97
125
|
reduple = Reduple()
|
98
126
|
|
glitchlings/zoo/rushmore.py
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
import math
|
2
2
|
import random
|
3
3
|
import re
|
4
|
+
from typing import Any
|
4
5
|
|
5
6
|
from .core import Glitchling, AttackWave
|
7
|
+
from ._rate import resolve_rate
|
6
8
|
|
7
9
|
try:
|
8
10
|
from glitchlings._zoo_rust import delete_random_words as _delete_random_words_rust
|
@@ -13,11 +15,14 @@ except ImportError: # pragma: no cover - compiled extension not present
|
|
13
15
|
def _python_delete_random_words(
|
14
16
|
text: str,
|
15
17
|
*,
|
16
|
-
|
18
|
+
rate: float,
|
17
19
|
rng: random.Random,
|
18
20
|
) -> str:
|
19
21
|
"""Delete random words from the input text while preserving whitespace."""
|
20
22
|
|
23
|
+
if rate <= 0.0:
|
24
|
+
return text
|
25
|
+
|
21
26
|
tokens = re.split(r"(\s+)", text) # Split but keep separators for later rejoin
|
22
27
|
|
23
28
|
candidate_indices: list[int] = []
|
@@ -29,14 +34,14 @@ def _python_delete_random_words(
|
|
29
34
|
candidate_indices.append(i)
|
30
35
|
|
31
36
|
allowed_deletions = min(
|
32
|
-
len(candidate_indices), math.floor(len(candidate_indices) *
|
37
|
+
len(candidate_indices), math.floor(len(candidate_indices) * rate)
|
33
38
|
)
|
34
39
|
if allowed_deletions <= 0:
|
35
40
|
return text
|
36
41
|
|
37
42
|
deletions = 0
|
38
43
|
for i in candidate_indices:
|
39
|
-
if rng.random() <
|
44
|
+
if rng.random() < rate:
|
40
45
|
word = tokens[i]
|
41
46
|
match = re.match(r"^(\W*)(.*?)(\W*)$", word)
|
42
47
|
if match:
|
@@ -58,24 +63,35 @@ def _python_delete_random_words(
|
|
58
63
|
|
59
64
|
def delete_random_words(
|
60
65
|
text: str,
|
61
|
-
|
66
|
+
rate: float | None = None,
|
62
67
|
seed: int | None = None,
|
63
68
|
rng: random.Random | None = None,
|
69
|
+
*,
|
70
|
+
max_deletion_rate: float | None = None,
|
64
71
|
) -> str:
|
65
72
|
"""Delete random words from the input text.
|
66
73
|
|
67
74
|
Uses the optional Rust implementation when available.
|
68
75
|
"""
|
69
76
|
|
77
|
+
effective_rate = resolve_rate(
|
78
|
+
rate=rate,
|
79
|
+
legacy_value=max_deletion_rate,
|
80
|
+
default=0.01,
|
81
|
+
legacy_name="max_deletion_rate",
|
82
|
+
)
|
83
|
+
|
70
84
|
if rng is None:
|
71
85
|
rng = random.Random(seed)
|
72
86
|
|
87
|
+
clamped_rate = max(0.0, effective_rate)
|
88
|
+
|
73
89
|
if _delete_random_words_rust is not None:
|
74
|
-
return _delete_random_words_rust(text,
|
90
|
+
return _delete_random_words_rust(text, clamped_rate, rng)
|
75
91
|
|
76
92
|
return _python_delete_random_words(
|
77
93
|
text,
|
78
|
-
|
94
|
+
rate=clamped_rate,
|
79
95
|
rng=rng,
|
80
96
|
)
|
81
97
|
|
@@ -86,17 +102,33 @@ class Rushmore(Glitchling):
|
|
86
102
|
def __init__(
|
87
103
|
self,
|
88
104
|
*,
|
89
|
-
|
105
|
+
rate: float | None = None,
|
106
|
+
max_deletion_rate: float | None = None,
|
90
107
|
seed: int | None = None,
|
91
108
|
) -> None:
|
109
|
+
self._param_aliases = {"max_deletion_rate": "rate"}
|
110
|
+
effective_rate = resolve_rate(
|
111
|
+
rate=rate,
|
112
|
+
legacy_value=max_deletion_rate,
|
113
|
+
default=0.01,
|
114
|
+
legacy_name="max_deletion_rate",
|
115
|
+
)
|
92
116
|
super().__init__(
|
93
117
|
name="Rushmore",
|
94
118
|
corruption_function=delete_random_words,
|
95
119
|
scope=AttackWave.WORD,
|
96
120
|
seed=seed,
|
97
|
-
|
121
|
+
rate=effective_rate,
|
98
122
|
)
|
99
123
|
|
124
|
+
def pipeline_operation(self) -> dict[str, Any] | None:
|
125
|
+
rate = self.kwargs.get("rate")
|
126
|
+
if rate is None:
|
127
|
+
rate = self.kwargs.get("max_deletion_rate")
|
128
|
+
if rate is None:
|
129
|
+
return None
|
130
|
+
return {"type": "delete", "max_deletion_rate": float(rate)}
|
131
|
+
|
100
132
|
|
101
133
|
rushmore = Rushmore()
|
102
134
|
|