glitchlings 0.2.2__cp312-cp312-win_amd64.whl → 0.2.3__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings/_zoo_rust.cp312-win_amd64.pyd +0 -0
- glitchlings/dlc/prime.py +44 -22
- glitchlings/main.py +1 -1
- glitchlings/zoo/_rate.py +21 -0
- glitchlings/zoo/core.py +56 -52
- glitchlings/zoo/jargoyle.py +24 -5
- glitchlings/zoo/mim1c.py +24 -5
- glitchlings/zoo/redactyl.py +43 -8
- glitchlings/zoo/reduple.py +36 -8
- glitchlings/zoo/rushmore.py +40 -8
- glitchlings/zoo/scannequin.py +38 -8
- glitchlings/zoo/typogre.py +29 -9
- {glitchlings-0.2.2.dist-info → glitchlings-0.2.3.dist-info}/METADATA +21 -48
- glitchlings-0.2.3.dist-info/RECORD +26 -0
- glitchlings-0.2.2.dist-info/RECORD +0 -25
- {glitchlings-0.2.2.dist-info → glitchlings-0.2.3.dist-info}/WHEEL +0 -0
- {glitchlings-0.2.2.dist-info → glitchlings-0.2.3.dist-info}/entry_points.txt +0 -0
- {glitchlings-0.2.2.dist-info → glitchlings-0.2.3.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.2.2.dist-info → glitchlings-0.2.3.dist-info}/top_level.txt +0 -0
Binary file
|
glitchlings/dlc/prime.py
CHANGED
@@ -79,8 +79,8 @@ def tutorial_level(
|
|
79
79
|
) -> vf.Environment:
|
80
80
|
"""Create a low-corruption environment using tuned defaults."""
|
81
81
|
|
82
|
-
tuned_mim1c = Mim1c(
|
83
|
-
tuned_typogre = Typogre(
|
82
|
+
tuned_mim1c = Mim1c(rate=0.01 * difficulty.value)
|
83
|
+
tuned_typogre = Typogre(rate=0.025 * difficulty.value)
|
84
84
|
|
85
85
|
return load_environment(
|
86
86
|
env,
|
@@ -220,32 +220,54 @@ def echo_chamber(
|
|
220
220
|
"Specify which split to use when the dataset loads as a DatasetDict."
|
221
221
|
)
|
222
222
|
|
223
|
-
|
224
|
-
|
223
|
+
filtered_dataset = hf_dataset.filter(
|
224
|
+
lambda row: row.get(column) is not None,
|
225
|
+
load_from_cache_file=False,
|
226
|
+
)
|
225
227
|
|
226
|
-
|
227
|
-
value = row.get(column)
|
228
|
-
if value is None:
|
229
|
-
continue
|
228
|
+
source_column_names = list(filtered_dataset.column_names)
|
230
229
|
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
answers.append(text)
|
230
|
+
def _build_prompt(row: dict[str, Any]) -> dict[str, Any]:
|
231
|
+
text = str(row[column])
|
232
|
+
prompt = [
|
233
|
+
{"role": "system", "content": instructions},
|
234
|
+
{"role": "user", "content": f"Corrupted text:\n{text}"},
|
235
|
+
]
|
236
|
+
return {"prompt": prompt, "answer": text}
|
239
237
|
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
238
|
+
base_dataset = filtered_dataset.map(
|
239
|
+
_build_prompt,
|
240
|
+
remove_columns=source_column_names,
|
241
|
+
load_from_cache_file=False,
|
242
|
+
)
|
244
243
|
|
245
|
-
|
244
|
+
try:
|
245
|
+
dataset_length = len(base_dataset) # type: ignore[arg-type]
|
246
|
+
except TypeError:
|
247
|
+
preview_rows: list[dict[str, Any]]
|
248
|
+
take_fn = getattr(base_dataset, "take", None)
|
249
|
+
if callable(take_fn):
|
250
|
+
preview_rows = list(take_fn(1))
|
251
|
+
else:
|
252
|
+
iterator = iter(base_dataset)
|
253
|
+
try:
|
254
|
+
first_row = next(iterator)
|
255
|
+
except StopIteration:
|
256
|
+
preview_rows = []
|
257
|
+
else:
|
258
|
+
preview_rows = [first_row]
|
259
|
+
if not preview_rows:
|
260
|
+
raise ValueError(
|
261
|
+
f"Column '{column}' did not yield any textual entries in dataset '{dataset_id}'."
|
262
|
+
)
|
263
|
+
else:
|
264
|
+
if dataset_length == 0:
|
265
|
+
raise ValueError(
|
266
|
+
f"Column '{column}' did not yield any textual entries in dataset '{dataset_id}'."
|
267
|
+
)
|
246
268
|
|
247
269
|
gaggle = _as_gaggle(glitchlings, seed=seed)
|
248
|
-
glitched_dataset = gaggle.corrupt_dataset(
|
270
|
+
glitched_dataset = gaggle.corrupt_dataset(base_dataset, ["prompt"])
|
249
271
|
|
250
272
|
rubric_func = reward_function or symmetric_damerau_levenshtein_similarity
|
251
273
|
rubric = vf.Rubric(funcs=[rubric_func], weights=[1.0])
|
glitchlings/main.py
CHANGED
@@ -46,7 +46,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
46
46
|
metavar="SPEC",
|
47
47
|
help=(
|
48
48
|
"Glitchling to apply, optionally with parameters like "
|
49
|
-
"Typogre(
|
49
|
+
"Typogre(rate=0.05). Repeat for multiples; defaults to all built-ins."
|
50
50
|
),
|
51
51
|
)
|
52
52
|
parser.add_argument(
|
glitchlings/zoo/_rate.py
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
|
4
|
+
def resolve_rate(
|
5
|
+
*,
|
6
|
+
rate: float | None,
|
7
|
+
legacy_value: float | None,
|
8
|
+
default: float,
|
9
|
+
legacy_name: str,
|
10
|
+
) -> float:
|
11
|
+
"""Return the effective rate while enforcing mutual exclusivity."""
|
12
|
+
|
13
|
+
if rate is not None and legacy_value is not None:
|
14
|
+
raise ValueError(
|
15
|
+
f"Specify either 'rate' or '{legacy_name}', not both."
|
16
|
+
)
|
17
|
+
if rate is not None:
|
18
|
+
return rate
|
19
|
+
if legacy_value is not None:
|
20
|
+
return legacy_value
|
21
|
+
return default
|
glitchlings/zoo/core.py
CHANGED
@@ -107,6 +107,7 @@ class Glitchling:
|
|
107
107
|
scope: AttackWave,
|
108
108
|
order: AttackOrder = AttackOrder.NORMAL,
|
109
109
|
seed: int | None = None,
|
110
|
+
pipeline_operation: Callable[["Glitchling"], dict[str, Any] | None] | None = None,
|
110
111
|
**kwargs: Any,
|
111
112
|
) -> None:
|
112
113
|
"""Initialize a glitchling.
|
@@ -128,31 +129,76 @@ class Glitchling:
|
|
128
129
|
self.corruption_function: CorruptionCallable = corruption_function
|
129
130
|
self.level: AttackWave = scope
|
130
131
|
self.order: AttackOrder = order
|
132
|
+
self._pipeline_descriptor_factory = pipeline_operation
|
131
133
|
self.kwargs: dict[str, Any] = {}
|
134
|
+
self._cached_rng_callable: CorruptionCallable | None = None
|
135
|
+
self._cached_rng_expectation: bool | None = None
|
132
136
|
for kw, val in kwargs.items():
|
133
137
|
self.set_param(kw, val)
|
134
138
|
|
135
139
|
def set_param(self, key: str, value: Any) -> None:
|
136
140
|
"""Persist a parameter for use by the corruption callable."""
|
137
141
|
|
138
|
-
|
139
|
-
|
140
|
-
|
142
|
+
aliases = getattr(self, "_param_aliases", {})
|
143
|
+
canonical = aliases.get(key, key)
|
144
|
+
|
145
|
+
# Drop stale alias keys so we only forward canonical kwargs.
|
146
|
+
self.kwargs.pop(key, None)
|
147
|
+
for alias, target in aliases.items():
|
148
|
+
if target == canonical:
|
149
|
+
self.kwargs.pop(alias, None)
|
150
|
+
|
151
|
+
self.kwargs[canonical] = value
|
152
|
+
setattr(self, canonical, value)
|
153
|
+
|
154
|
+
if canonical == "seed":
|
141
155
|
self.reset_rng(value)
|
142
156
|
|
143
|
-
|
144
|
-
|
157
|
+
for alias, target in aliases.items():
|
158
|
+
if target == canonical:
|
159
|
+
setattr(self, alias, value)
|
145
160
|
|
146
|
-
|
161
|
+
def pipeline_operation(self) -> dict[str, Any] | None:
|
162
|
+
"""Return the Rust pipeline operation descriptor for this glitchling."""
|
163
|
+
|
164
|
+
factory = self._pipeline_descriptor_factory
|
165
|
+
if factory is None:
|
166
|
+
return None
|
167
|
+
|
168
|
+
return factory(self)
|
169
|
+
|
170
|
+
def _corruption_expects_rng(self) -> bool:
|
171
|
+
"""Return `True` when the corruption function accepts an rng keyword."""
|
172
|
+
|
173
|
+
cached_callable = self._cached_rng_callable
|
174
|
+
cached_expectation = self._cached_rng_expectation
|
175
|
+
corruption_function = self.corruption_function
|
176
|
+
|
177
|
+
if (
|
178
|
+
cached_callable is corruption_function
|
179
|
+
and cached_expectation is not None
|
180
|
+
):
|
181
|
+
return cached_expectation
|
182
|
+
|
183
|
+
expects_rng = False
|
147
184
|
try:
|
148
|
-
signature = inspect.signature(
|
185
|
+
signature = inspect.signature(corruption_function)
|
149
186
|
except (TypeError, ValueError):
|
150
187
|
signature = None
|
151
188
|
|
152
|
-
expects_rng = False
|
153
189
|
if signature is not None:
|
154
190
|
expects_rng = "rng" in signature.parameters
|
155
191
|
|
192
|
+
self._cached_rng_callable = corruption_function
|
193
|
+
self._cached_rng_expectation = expects_rng
|
194
|
+
return expects_rng
|
195
|
+
|
196
|
+
def __corrupt(self, text: str, *args: Any, **kwargs: Any) -> str:
|
197
|
+
"""Execute the corruption callable, injecting the RNG when required."""
|
198
|
+
|
199
|
+
# Pass rng to underlying corruption function if it expects it.
|
200
|
+
expects_rng = self._corruption_expects_rng()
|
201
|
+
|
156
202
|
if expects_rng:
|
157
203
|
corrupted = self.corruption_function(text, *args, rng=self.rng, **kwargs)
|
158
204
|
else:
|
@@ -231,53 +277,14 @@ class Glitchling:
|
|
231
277
|
self.corruption_function,
|
232
278
|
self.level,
|
233
279
|
self.order,
|
280
|
+
pipeline_operation=self._pipeline_descriptor_factory,
|
234
281
|
**filtered_kwargs,
|
235
282
|
)
|
236
283
|
|
237
284
|
return cls(**filtered_kwargs)
|
238
285
|
|
239
286
|
|
240
|
-
def _pipeline_operation_reduplicate(glitchling: "Glitchling") -> dict[str, Any] | None:
|
241
|
-
rate = glitchling.kwargs.get("reduplication_rate")
|
242
|
-
if rate is None:
|
243
|
-
return None
|
244
|
-
return {"type": "reduplicate", "reduplication_rate": float(rate)}
|
245
|
-
|
246
287
|
|
247
|
-
def _pipeline_operation_delete(glitchling: "Glitchling") -> dict[str, Any] | None:
|
248
|
-
rate = glitchling.kwargs.get("max_deletion_rate")
|
249
|
-
if rate is None:
|
250
|
-
return None
|
251
|
-
return {"type": "delete", "max_deletion_rate": float(rate)}
|
252
|
-
|
253
|
-
|
254
|
-
def _pipeline_operation_redact(glitchling: "Glitchling") -> dict[str, Any] | None:
|
255
|
-
replacement_char = glitchling.kwargs.get("replacement_char")
|
256
|
-
redaction_rate = glitchling.kwargs.get("redaction_rate")
|
257
|
-
merge_adjacent = glitchling.kwargs.get("merge_adjacent")
|
258
|
-
if replacement_char is None or redaction_rate is None or merge_adjacent is None:
|
259
|
-
return None
|
260
|
-
return {
|
261
|
-
"type": "redact",
|
262
|
-
"replacement_char": str(replacement_char),
|
263
|
-
"redaction_rate": float(redaction_rate),
|
264
|
-
"merge_adjacent": bool(merge_adjacent),
|
265
|
-
}
|
266
|
-
|
267
|
-
|
268
|
-
def _pipeline_operation_ocr(glitchling: "Glitchling") -> dict[str, Any] | None:
|
269
|
-
error_rate = glitchling.kwargs.get("error_rate")
|
270
|
-
if error_rate is None:
|
271
|
-
return None
|
272
|
-
return {"type": "ocr", "error_rate": float(error_rate)}
|
273
|
-
|
274
|
-
|
275
|
-
_PIPELINE_OPERATION_BUILDERS: dict[str, Callable[["Glitchling"], dict[str, Any] | None]] = {
|
276
|
-
"Reduple": _pipeline_operation_reduplicate,
|
277
|
-
"Rushmore": _pipeline_operation_delete,
|
278
|
-
"Redactyl": _pipeline_operation_redact,
|
279
|
-
"Scannequin": _pipeline_operation_ocr,
|
280
|
-
}
|
281
288
|
|
282
289
|
|
283
290
|
class Gaggle(Glitchling):
|
@@ -359,10 +366,7 @@ class Gaggle(Glitchling):
|
|
359
366
|
|
360
367
|
descriptors: list[dict[str, Any]] = []
|
361
368
|
for glitchling in self.apply_order:
|
362
|
-
|
363
|
-
if builder is None:
|
364
|
-
return None
|
365
|
-
operation = builder(glitchling)
|
369
|
+
operation = glitchling.pipeline_operation()
|
366
370
|
if operation is None:
|
367
371
|
return None
|
368
372
|
|
glitchlings/zoo/jargoyle.py
CHANGED
@@ -33,6 +33,7 @@ else:
|
|
33
33
|
_WORDNET_MODULE = None
|
34
34
|
|
35
35
|
from .core import AttackWave, Glitchling
|
36
|
+
from ._rate import resolve_rate
|
36
37
|
|
37
38
|
_WORDNET_HANDLE: WordNetCorpusReader | Any | None = _WORDNET_MODULE
|
38
39
|
|
@@ -211,16 +212,18 @@ def _collect_synonyms(
|
|
211
212
|
|
212
213
|
def substitute_random_synonyms(
|
213
214
|
text: str,
|
214
|
-
|
215
|
+
rate: float | None = None,
|
215
216
|
part_of_speech: PartOfSpeechInput = "n",
|
216
217
|
seed: int | None = None,
|
217
218
|
rng: random.Random | None = None,
|
219
|
+
*,
|
220
|
+
replacement_rate: float | None = None,
|
218
221
|
) -> str:
|
219
222
|
"""Replace words with random WordNet synonyms.
|
220
223
|
|
221
224
|
Parameters
|
222
225
|
- text: Input text.
|
223
|
-
-
|
226
|
+
- rate: Max proportion of candidate words to replace (default 0.1).
|
224
227
|
- part_of_speech: WordNet POS tag(s) to target. Accepts "n", "v", "a", "r",
|
225
228
|
any iterable of those tags, or "any" to include all four.
|
226
229
|
- rng: Optional RNG instance used for deterministic sampling.
|
@@ -232,6 +235,13 @@ def substitute_random_synonyms(
|
|
232
235
|
- Synonyms sorted before rng.choice to fix ordering.
|
233
236
|
- For each POS, the first synset containing alternate lemmas is used for stability.
|
234
237
|
"""
|
238
|
+
effective_rate = resolve_rate(
|
239
|
+
rate=rate,
|
240
|
+
legacy_value=replacement_rate,
|
241
|
+
default=0.1,
|
242
|
+
legacy_name="replacement_rate",
|
243
|
+
)
|
244
|
+
|
235
245
|
ensure_wordnet()
|
236
246
|
wordnet = _wordnet()
|
237
247
|
|
@@ -270,7 +280,8 @@ def substitute_random_synonyms(
|
|
270
280
|
if not candidate_indices:
|
271
281
|
return text
|
272
282
|
|
273
|
-
|
283
|
+
clamped_rate = max(0.0, effective_rate)
|
284
|
+
max_replacements = int(len(candidate_indices) * clamped_rate)
|
274
285
|
if max_replacements <= 0:
|
275
286
|
return text
|
276
287
|
|
@@ -297,16 +308,24 @@ class Jargoyle(Glitchling):
|
|
297
308
|
def __init__(
|
298
309
|
self,
|
299
310
|
*,
|
300
|
-
|
311
|
+
rate: float | None = None,
|
312
|
+
replacement_rate: float | None = None,
|
301
313
|
part_of_speech: PartOfSpeechInput = "n",
|
302
314
|
seed: int | None = None,
|
303
315
|
) -> None:
|
316
|
+
self._param_aliases = {"replacement_rate": "rate"}
|
317
|
+
effective_rate = resolve_rate(
|
318
|
+
rate=rate,
|
319
|
+
legacy_value=replacement_rate,
|
320
|
+
default=0.1,
|
321
|
+
legacy_name="replacement_rate",
|
322
|
+
)
|
304
323
|
super().__init__(
|
305
324
|
name="Jargoyle",
|
306
325
|
corruption_function=substitute_random_synonyms,
|
307
326
|
scope=AttackWave.WORD,
|
308
327
|
seed=seed,
|
309
|
-
|
328
|
+
rate=effective_rate,
|
310
329
|
part_of_speech=part_of_speech,
|
311
330
|
)
|
312
331
|
|
glitchlings/zoo/mim1c.py
CHANGED
@@ -5,21 +5,24 @@ from typing import Literal
|
|
5
5
|
from confusable_homoglyphs import confusables
|
6
6
|
|
7
7
|
from .core import AttackOrder, AttackWave, Glitchling
|
8
|
+
from ._rate import resolve_rate
|
8
9
|
|
9
10
|
|
10
11
|
def swap_homoglyphs(
|
11
12
|
text: str,
|
12
|
-
|
13
|
+
rate: float | None = None,
|
13
14
|
classes: list[str] | Literal["all"] | None = None,
|
14
15
|
banned_characters: Collection[str] | None = None,
|
15
16
|
seed: int | None = None,
|
16
17
|
rng: random.Random | None = None,
|
18
|
+
*,
|
19
|
+
replacement_rate: float | None = None,
|
17
20
|
) -> str:
|
18
21
|
"""Replace characters with visually confusable homoglyphs.
|
19
22
|
|
20
23
|
Parameters
|
21
24
|
- text: Input text.
|
22
|
-
-
|
25
|
+
- rate: Max proportion of eligible characters to replace (default 0.02).
|
23
26
|
- classes: Restrict replacements to these Unicode script classes (default ["LATIN","GREEK","CYRILLIC"]). Use "all" to allow any.
|
24
27
|
- banned_characters: Characters that must never appear as replacements.
|
25
28
|
- seed: Optional seed if `rng` not provided.
|
@@ -29,6 +32,13 @@ def swap_homoglyphs(
|
|
29
32
|
- Only replaces characters present in confusables.confusables_data with single-codepoint alternatives.
|
30
33
|
- Maintains determinism by shuffling candidates and sampling via the provided RNG.
|
31
34
|
"""
|
35
|
+
effective_rate = resolve_rate(
|
36
|
+
rate=rate,
|
37
|
+
legacy_value=replacement_rate,
|
38
|
+
default=0.02,
|
39
|
+
legacy_name="replacement_rate",
|
40
|
+
)
|
41
|
+
|
32
42
|
if rng is None:
|
33
43
|
rng = random.Random(seed)
|
34
44
|
|
@@ -39,7 +49,8 @@ def swap_homoglyphs(
|
|
39
49
|
confusable_chars = [
|
40
50
|
char for char in target_chars if char in confusables.confusables_data
|
41
51
|
]
|
42
|
-
|
52
|
+
clamped_rate = max(0.0, effective_rate)
|
53
|
+
num_replacements = int(len(confusable_chars) * clamped_rate)
|
43
54
|
done = 0
|
44
55
|
rng.shuffle(confusable_chars)
|
45
56
|
banned_set = set(banned_characters or ())
|
@@ -66,18 +77,26 @@ class Mim1c(Glitchling):
|
|
66
77
|
def __init__(
|
67
78
|
self,
|
68
79
|
*,
|
69
|
-
|
80
|
+
rate: float | None = None,
|
81
|
+
replacement_rate: float | None = None,
|
70
82
|
classes: list[str] | Literal["all"] | None = None,
|
71
83
|
banned_characters: Collection[str] | None = None,
|
72
84
|
seed: int | None = None,
|
73
85
|
) -> None:
|
86
|
+
self._param_aliases = {"replacement_rate": "rate"}
|
87
|
+
effective_rate = resolve_rate(
|
88
|
+
rate=rate,
|
89
|
+
legacy_value=replacement_rate,
|
90
|
+
default=0.02,
|
91
|
+
legacy_name="replacement_rate",
|
92
|
+
)
|
74
93
|
super().__init__(
|
75
94
|
name="Mim1c",
|
76
95
|
corruption_function=swap_homoglyphs,
|
77
96
|
scope=AttackWave.CHARACTER,
|
78
97
|
order=AttackOrder.LAST,
|
79
98
|
seed=seed,
|
80
|
-
|
99
|
+
rate=effective_rate,
|
81
100
|
classes=classes,
|
82
101
|
banned_characters=banned_characters,
|
83
102
|
)
|
glitchlings/zoo/redactyl.py
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
import re
|
2
2
|
import random
|
3
|
+
from typing import Any
|
3
4
|
|
4
5
|
from .core import Glitchling, AttackWave
|
6
|
+
from ._rate import resolve_rate
|
5
7
|
|
6
8
|
FULL_BLOCK = "█"
|
7
9
|
|
@@ -16,7 +18,7 @@ def _python_redact_words(
|
|
16
18
|
text: str,
|
17
19
|
*,
|
18
20
|
replacement_char: str,
|
19
|
-
|
21
|
+
rate: float,
|
20
22
|
merge_adjacent: bool,
|
21
23
|
rng: random.Random,
|
22
24
|
) -> str:
|
@@ -25,7 +27,7 @@ def _python_redact_words(
|
|
25
27
|
Parameters
|
26
28
|
- text: Input text.
|
27
29
|
- replacement_char: The character to use for redaction (default FULL_BLOCK).
|
28
|
-
-
|
30
|
+
- rate: Max proportion of words to redact (default 0.05).
|
29
31
|
- merge_adjacent: If True, merges adjacent redactions across intervening non-word chars.
|
30
32
|
- seed: Seed used if `rng` not provided (default 151).
|
31
33
|
- rng: Optional RNG; overrides seed.
|
@@ -35,7 +37,7 @@ def _python_redact_words(
|
|
35
37
|
word_indices = [i for i, token in enumerate(tokens) if i % 2 == 0 and token.strip()]
|
36
38
|
if not word_indices:
|
37
39
|
raise ValueError("Cannot redact words because the input text contains no redactable words.")
|
38
|
-
num_to_redact = max(1, int(len(word_indices) *
|
40
|
+
num_to_redact = max(1, int(len(word_indices) * rate))
|
39
41
|
|
40
42
|
# Sample from the indices of actual words
|
41
43
|
indices_to_redact = rng.sample(word_indices, k=num_to_redact)
|
@@ -72,23 +74,34 @@ def _python_redact_words(
|
|
72
74
|
def redact_words(
|
73
75
|
text: str,
|
74
76
|
replacement_char: str = FULL_BLOCK,
|
75
|
-
|
77
|
+
rate: float | None = None,
|
76
78
|
merge_adjacent: bool = False,
|
77
79
|
seed: int = 151,
|
78
80
|
rng: random.Random | None = None,
|
81
|
+
*,
|
82
|
+
redaction_rate: float | None = None,
|
79
83
|
) -> str:
|
80
84
|
"""Redact random words by replacing their characters."""
|
81
85
|
|
86
|
+
effective_rate = resolve_rate(
|
87
|
+
rate=rate,
|
88
|
+
legacy_value=redaction_rate,
|
89
|
+
default=0.05,
|
90
|
+
legacy_name="redaction_rate",
|
91
|
+
)
|
92
|
+
|
82
93
|
if rng is None:
|
83
94
|
rng = random.Random(seed)
|
84
95
|
|
96
|
+
clamped_rate = max(0.0, effective_rate)
|
97
|
+
|
85
98
|
use_rust = _redact_words_rust is not None and isinstance(merge_adjacent, bool)
|
86
99
|
|
87
100
|
if use_rust:
|
88
101
|
return _redact_words_rust(
|
89
102
|
text,
|
90
103
|
replacement_char,
|
91
|
-
|
104
|
+
clamped_rate,
|
92
105
|
merge_adjacent,
|
93
106
|
rng,
|
94
107
|
)
|
@@ -96,7 +109,7 @@ def redact_words(
|
|
96
109
|
return _python_redact_words(
|
97
110
|
text,
|
98
111
|
replacement_char=replacement_char,
|
99
|
-
|
112
|
+
rate=clamped_rate,
|
100
113
|
merge_adjacent=merge_adjacent,
|
101
114
|
rng=rng,
|
102
115
|
)
|
@@ -109,20 +122,42 @@ class Redactyl(Glitchling):
|
|
109
122
|
self,
|
110
123
|
*,
|
111
124
|
replacement_char: str = FULL_BLOCK,
|
112
|
-
|
125
|
+
rate: float | None = None,
|
126
|
+
redaction_rate: float | None = None,
|
113
127
|
merge_adjacent: bool = False,
|
114
128
|
seed: int = 151,
|
115
129
|
) -> None:
|
130
|
+
self._param_aliases = {"redaction_rate": "rate"}
|
131
|
+
effective_rate = resolve_rate(
|
132
|
+
rate=rate,
|
133
|
+
legacy_value=redaction_rate,
|
134
|
+
default=0.05,
|
135
|
+
legacy_name="redaction_rate",
|
136
|
+
)
|
116
137
|
super().__init__(
|
117
138
|
name="Redactyl",
|
118
139
|
corruption_function=redact_words,
|
119
140
|
scope=AttackWave.WORD,
|
120
141
|
seed=seed,
|
121
142
|
replacement_char=replacement_char,
|
122
|
-
|
143
|
+
rate=effective_rate,
|
123
144
|
merge_adjacent=merge_adjacent,
|
124
145
|
)
|
125
146
|
|
147
|
+
def pipeline_operation(self) -> dict[str, Any] | None:
|
148
|
+
replacement_char = self.kwargs.get("replacement_char")
|
149
|
+
rate = self.kwargs.get("rate")
|
150
|
+
merge_adjacent = self.kwargs.get("merge_adjacent")
|
151
|
+
if replacement_char is None or rate is None or merge_adjacent is None:
|
152
|
+
return None
|
153
|
+
return {
|
154
|
+
"type": "redact",
|
155
|
+
"replacement_char": str(replacement_char),
|
156
|
+
"redaction_rate": float(rate),
|
157
|
+
"merge_adjacent": bool(merge_adjacent),
|
158
|
+
}
|
159
|
+
|
160
|
+
|
126
161
|
|
127
162
|
redactyl = Redactyl()
|
128
163
|
|
glitchlings/zoo/reduple.py
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
import re
|
2
2
|
import random
|
3
|
+
from typing import Any
|
3
4
|
|
4
5
|
from .core import Glitchling, AttackWave
|
6
|
+
from ._rate import resolve_rate
|
5
7
|
|
6
8
|
try:
|
7
9
|
from glitchlings._zoo_rust import reduplicate_words as _reduplicate_words_rust
|
@@ -12,14 +14,14 @@ except ImportError: # pragma: no cover - compiled extension not present
|
|
12
14
|
def _python_reduplicate_words(
|
13
15
|
text: str,
|
14
16
|
*,
|
15
|
-
|
17
|
+
rate: float,
|
16
18
|
rng: random.Random,
|
17
19
|
) -> str:
|
18
20
|
"""Randomly reduplicate words in the text.
|
19
21
|
|
20
22
|
Parameters
|
21
23
|
- text: Input text.
|
22
|
-
-
|
24
|
+
- rate: Max proportion of words to reduplicate (default 0.05).
|
23
25
|
- seed: Optional seed if `rng` not provided.
|
24
26
|
- rng: Optional RNG; overrides seed.
|
25
27
|
|
@@ -39,7 +41,7 @@ def _python_reduplicate_words(
|
|
39
41
|
continue
|
40
42
|
|
41
43
|
# Only consider actual words for reduplication
|
42
|
-
if rng.random() <
|
44
|
+
if rng.random() < rate:
|
43
45
|
# Check if word has trailing punctuation
|
44
46
|
match = re.match(r"^(\W*)(.*?)(\W*)$", word)
|
45
47
|
if match:
|
@@ -53,9 +55,11 @@ def _python_reduplicate_words(
|
|
53
55
|
|
54
56
|
def reduplicate_words(
|
55
57
|
text: str,
|
56
|
-
|
58
|
+
rate: float | None = None,
|
57
59
|
seed: int | None = None,
|
58
60
|
rng: random.Random | None = None,
|
61
|
+
*,
|
62
|
+
reduplication_rate: float | None = None,
|
59
63
|
) -> str:
|
60
64
|
"""Randomly reduplicate words in the text.
|
61
65
|
|
@@ -63,15 +67,24 @@ def reduplicate_words(
|
|
63
67
|
extension is unavailable.
|
64
68
|
"""
|
65
69
|
|
70
|
+
effective_rate = resolve_rate(
|
71
|
+
rate=rate,
|
72
|
+
legacy_value=reduplication_rate,
|
73
|
+
default=0.05,
|
74
|
+
legacy_name="reduplication_rate",
|
75
|
+
)
|
76
|
+
|
66
77
|
if rng is None:
|
67
78
|
rng = random.Random(seed)
|
68
79
|
|
80
|
+
clamped_rate = max(0.0, effective_rate)
|
81
|
+
|
69
82
|
if _reduplicate_words_rust is not None:
|
70
|
-
return _reduplicate_words_rust(text,
|
83
|
+
return _reduplicate_words_rust(text, clamped_rate, rng)
|
71
84
|
|
72
85
|
return _python_reduplicate_words(
|
73
86
|
text,
|
74
|
-
|
87
|
+
rate=clamped_rate,
|
75
88
|
rng=rng,
|
76
89
|
)
|
77
90
|
|
@@ -82,17 +95,32 @@ class Reduple(Glitchling):
|
|
82
95
|
def __init__(
|
83
96
|
self,
|
84
97
|
*,
|
85
|
-
|
98
|
+
rate: float | None = None,
|
99
|
+
reduplication_rate: float | None = None,
|
86
100
|
seed: int | None = None,
|
87
101
|
) -> None:
|
102
|
+
self._param_aliases = {"reduplication_rate": "rate"}
|
103
|
+
effective_rate = resolve_rate(
|
104
|
+
rate=rate,
|
105
|
+
legacy_value=reduplication_rate,
|
106
|
+
default=0.05,
|
107
|
+
legacy_name="reduplication_rate",
|
108
|
+
)
|
88
109
|
super().__init__(
|
89
110
|
name="Reduple",
|
90
111
|
corruption_function=reduplicate_words,
|
91
112
|
scope=AttackWave.WORD,
|
92
113
|
seed=seed,
|
93
|
-
|
114
|
+
rate=effective_rate,
|
94
115
|
)
|
95
116
|
|
117
|
+
def pipeline_operation(self) -> dict[str, Any] | None:
|
118
|
+
rate = self.kwargs.get("rate")
|
119
|
+
if rate is None:
|
120
|
+
return None
|
121
|
+
return {"type": "reduplicate", "reduplication_rate": float(rate)}
|
122
|
+
|
123
|
+
|
96
124
|
|
97
125
|
reduple = Reduple()
|
98
126
|
|
glitchlings/zoo/rushmore.py
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
import math
|
2
2
|
import random
|
3
3
|
import re
|
4
|
+
from typing import Any
|
4
5
|
|
5
6
|
from .core import Glitchling, AttackWave
|
7
|
+
from ._rate import resolve_rate
|
6
8
|
|
7
9
|
try:
|
8
10
|
from glitchlings._zoo_rust import delete_random_words as _delete_random_words_rust
|
@@ -13,11 +15,14 @@ except ImportError: # pragma: no cover - compiled extension not present
|
|
13
15
|
def _python_delete_random_words(
|
14
16
|
text: str,
|
15
17
|
*,
|
16
|
-
|
18
|
+
rate: float,
|
17
19
|
rng: random.Random,
|
18
20
|
) -> str:
|
19
21
|
"""Delete random words from the input text while preserving whitespace."""
|
20
22
|
|
23
|
+
if rate <= 0.0:
|
24
|
+
return text
|
25
|
+
|
21
26
|
tokens = re.split(r"(\s+)", text) # Split but keep separators for later rejoin
|
22
27
|
|
23
28
|
candidate_indices: list[int] = []
|
@@ -29,14 +34,14 @@ def _python_delete_random_words(
|
|
29
34
|
candidate_indices.append(i)
|
30
35
|
|
31
36
|
allowed_deletions = min(
|
32
|
-
len(candidate_indices), math.floor(len(candidate_indices) *
|
37
|
+
len(candidate_indices), math.floor(len(candidate_indices) * rate)
|
33
38
|
)
|
34
39
|
if allowed_deletions <= 0:
|
35
40
|
return text
|
36
41
|
|
37
42
|
deletions = 0
|
38
43
|
for i in candidate_indices:
|
39
|
-
if rng.random() <
|
44
|
+
if rng.random() < rate:
|
40
45
|
word = tokens[i]
|
41
46
|
match = re.match(r"^(\W*)(.*?)(\W*)$", word)
|
42
47
|
if match:
|
@@ -58,24 +63,35 @@ def _python_delete_random_words(
|
|
58
63
|
|
59
64
|
def delete_random_words(
|
60
65
|
text: str,
|
61
|
-
|
66
|
+
rate: float | None = None,
|
62
67
|
seed: int | None = None,
|
63
68
|
rng: random.Random | None = None,
|
69
|
+
*,
|
70
|
+
max_deletion_rate: float | None = None,
|
64
71
|
) -> str:
|
65
72
|
"""Delete random words from the input text.
|
66
73
|
|
67
74
|
Uses the optional Rust implementation when available.
|
68
75
|
"""
|
69
76
|
|
77
|
+
effective_rate = resolve_rate(
|
78
|
+
rate=rate,
|
79
|
+
legacy_value=max_deletion_rate,
|
80
|
+
default=0.01,
|
81
|
+
legacy_name="max_deletion_rate",
|
82
|
+
)
|
83
|
+
|
70
84
|
if rng is None:
|
71
85
|
rng = random.Random(seed)
|
72
86
|
|
87
|
+
clamped_rate = max(0.0, effective_rate)
|
88
|
+
|
73
89
|
if _delete_random_words_rust is not None:
|
74
|
-
return _delete_random_words_rust(text,
|
90
|
+
return _delete_random_words_rust(text, clamped_rate, rng)
|
75
91
|
|
76
92
|
return _python_delete_random_words(
|
77
93
|
text,
|
78
|
-
|
94
|
+
rate=clamped_rate,
|
79
95
|
rng=rng,
|
80
96
|
)
|
81
97
|
|
@@ -86,17 +102,33 @@ class Rushmore(Glitchling):
|
|
86
102
|
def __init__(
|
87
103
|
self,
|
88
104
|
*,
|
89
|
-
|
105
|
+
rate: float | None = None,
|
106
|
+
max_deletion_rate: float | None = None,
|
90
107
|
seed: int | None = None,
|
91
108
|
) -> None:
|
109
|
+
self._param_aliases = {"max_deletion_rate": "rate"}
|
110
|
+
effective_rate = resolve_rate(
|
111
|
+
rate=rate,
|
112
|
+
legacy_value=max_deletion_rate,
|
113
|
+
default=0.01,
|
114
|
+
legacy_name="max_deletion_rate",
|
115
|
+
)
|
92
116
|
super().__init__(
|
93
117
|
name="Rushmore",
|
94
118
|
corruption_function=delete_random_words,
|
95
119
|
scope=AttackWave.WORD,
|
96
120
|
seed=seed,
|
97
|
-
|
121
|
+
rate=effective_rate,
|
98
122
|
)
|
99
123
|
|
124
|
+
def pipeline_operation(self) -> dict[str, Any] | None:
|
125
|
+
rate = self.kwargs.get("rate")
|
126
|
+
if rate is None:
|
127
|
+
rate = self.kwargs.get("max_deletion_rate")
|
128
|
+
if rate is None:
|
129
|
+
return None
|
130
|
+
return {"type": "delete", "max_deletion_rate": float(rate)}
|
131
|
+
|
100
132
|
|
101
133
|
rushmore = Rushmore()
|
102
134
|
|
glitchlings/zoo/scannequin.py
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
import re
|
2
2
|
import random
|
3
|
+
from typing import Any
|
3
4
|
|
4
5
|
from ._ocr_confusions import load_confusion_table
|
5
6
|
from .core import Glitchling, AttackWave, AttackOrder
|
7
|
+
from ._rate import resolve_rate
|
6
8
|
|
7
9
|
try:
|
8
10
|
from glitchlings._zoo_rust import ocr_artifacts as _ocr_artifacts_rust
|
@@ -13,14 +15,14 @@ except ImportError: # pragma: no cover - compiled extension not present
|
|
13
15
|
def _python_ocr_artifacts(
|
14
16
|
text: str,
|
15
17
|
*,
|
16
|
-
|
18
|
+
rate: float,
|
17
19
|
rng: random.Random,
|
18
20
|
) -> str:
|
19
21
|
"""Introduce OCR-like artifacts into text.
|
20
22
|
|
21
23
|
Parameters
|
22
24
|
- text: Input text to corrupt.
|
23
|
-
-
|
25
|
+
- rate: Max proportion of eligible confusion matches to replace (default 0.02).
|
24
26
|
- seed: Optional seed if `rng` not provided.
|
25
27
|
- rng: Optional RNG; overrides seed.
|
26
28
|
|
@@ -53,7 +55,7 @@ def _python_ocr_artifacts(
|
|
53
55
|
return text
|
54
56
|
|
55
57
|
# Decide how many to replace
|
56
|
-
k = int(len(candidates) *
|
58
|
+
k = int(len(candidates) * rate)
|
57
59
|
if k <= 0:
|
58
60
|
return text
|
59
61
|
|
@@ -95,9 +97,11 @@ def _python_ocr_artifacts(
|
|
95
97
|
|
96
98
|
def ocr_artifacts(
|
97
99
|
text: str,
|
98
|
-
|
100
|
+
rate: float | None = None,
|
99
101
|
seed: int | None = None,
|
100
102
|
rng: random.Random | None = None,
|
103
|
+
*,
|
104
|
+
error_rate: float | None = None,
|
101
105
|
) -> str:
|
102
106
|
"""Introduce OCR-like artifacts into text.
|
103
107
|
|
@@ -107,13 +111,22 @@ def ocr_artifacts(
|
|
107
111
|
if not text:
|
108
112
|
return text
|
109
113
|
|
114
|
+
effective_rate = resolve_rate(
|
115
|
+
rate=rate,
|
116
|
+
legacy_value=error_rate,
|
117
|
+
default=0.02,
|
118
|
+
legacy_name="error_rate",
|
119
|
+
)
|
120
|
+
|
110
121
|
if rng is None:
|
111
122
|
rng = random.Random(seed)
|
112
123
|
|
124
|
+
clamped_rate = max(0.0, effective_rate)
|
125
|
+
|
113
126
|
if _ocr_artifacts_rust is not None:
|
114
|
-
return _ocr_artifacts_rust(text,
|
127
|
+
return _ocr_artifacts_rust(text, clamped_rate, rng)
|
115
128
|
|
116
|
-
return _python_ocr_artifacts(text,
|
129
|
+
return _python_ocr_artifacts(text, rate=clamped_rate, rng=rng)
|
117
130
|
|
118
131
|
|
119
132
|
class Scannequin(Glitchling):
|
@@ -122,18 +135,35 @@ class Scannequin(Glitchling):
|
|
122
135
|
def __init__(
|
123
136
|
self,
|
124
137
|
*,
|
125
|
-
|
138
|
+
rate: float | None = None,
|
139
|
+
error_rate: float | None = None,
|
126
140
|
seed: int | None = None,
|
127
141
|
) -> None:
|
142
|
+
self._param_aliases = {"error_rate": "rate"}
|
143
|
+
effective_rate = resolve_rate(
|
144
|
+
rate=rate,
|
145
|
+
legacy_value=error_rate,
|
146
|
+
default=0.02,
|
147
|
+
legacy_name="error_rate",
|
148
|
+
)
|
128
149
|
super().__init__(
|
129
150
|
name="Scannequin",
|
130
151
|
corruption_function=ocr_artifacts,
|
131
152
|
scope=AttackWave.CHARACTER,
|
132
153
|
order=AttackOrder.LATE,
|
133
154
|
seed=seed,
|
134
|
-
|
155
|
+
rate=effective_rate,
|
135
156
|
)
|
136
157
|
|
158
|
+
def pipeline_operation(self) -> dict[str, Any] | None:
|
159
|
+
rate = self.kwargs.get("rate")
|
160
|
+
if rate is None:
|
161
|
+
rate = self.kwargs.get("error_rate")
|
162
|
+
if rate is None:
|
163
|
+
return None
|
164
|
+
return {"type": "ocr", "error_rate": float(rate)}
|
165
|
+
|
166
|
+
|
137
167
|
|
138
168
|
scannequin = Scannequin()
|
139
169
|
|
glitchlings/zoo/typogre.py
CHANGED
@@ -5,6 +5,7 @@ import random
|
|
5
5
|
from typing import Optional
|
6
6
|
|
7
7
|
from .core import Glitchling, AttackWave, AttackOrder
|
8
|
+
from ._rate import resolve_rate
|
8
9
|
from ..util import KEYNEIGHBORS
|
9
10
|
|
10
11
|
try:
|
@@ -88,11 +89,13 @@ def _python_draw_eligible_index(
|
|
88
89
|
def _fatfinger_python(
|
89
90
|
text: str,
|
90
91
|
*,
|
91
|
-
|
92
|
+
rate: float,
|
92
93
|
layout: dict[str, list[str]],
|
93
94
|
rng: random.Random,
|
94
95
|
) -> str:
|
95
|
-
rate
|
96
|
+
if rate <= 0.0:
|
97
|
+
return text
|
98
|
+
|
96
99
|
s = text
|
97
100
|
max_changes = math.ceil(len(s) * rate)
|
98
101
|
if max_changes == 0:
|
@@ -140,28 +143,37 @@ def _fatfinger_python(
|
|
140
143
|
|
141
144
|
def fatfinger(
|
142
145
|
text: str,
|
143
|
-
|
146
|
+
rate: float | None = None,
|
144
147
|
keyboard: str = "CURATOR_QWERTY",
|
145
148
|
seed: int | None = None,
|
146
149
|
rng: random.Random | None = None,
|
150
|
+
*,
|
151
|
+
max_change_rate: float | None = None,
|
147
152
|
) -> str:
|
148
153
|
"""Introduce character-level "fat finger" edits with a Rust fast path."""
|
149
154
|
|
155
|
+
effective_rate = resolve_rate(
|
156
|
+
rate=rate,
|
157
|
+
legacy_value=max_change_rate,
|
158
|
+
default=0.02,
|
159
|
+
legacy_name="max_change_rate",
|
160
|
+
)
|
161
|
+
|
150
162
|
if rng is None:
|
151
163
|
rng = random.Random(seed)
|
152
164
|
if not text:
|
153
165
|
return ""
|
154
166
|
|
155
|
-
|
156
|
-
if
|
167
|
+
clamped_rate = max(0.0, effective_rate)
|
168
|
+
if clamped_rate == 0.0:
|
157
169
|
return text
|
158
170
|
|
159
171
|
layout = getattr(KEYNEIGHBORS, keyboard)
|
160
172
|
|
161
173
|
if _fatfinger_rust is not None:
|
162
|
-
return _fatfinger_rust(text, max_change_rate=
|
174
|
+
return _fatfinger_rust(text, max_change_rate=clamped_rate, layout=layout, rng=rng)
|
163
175
|
|
164
|
-
return _fatfinger_python(text,
|
176
|
+
return _fatfinger_python(text, rate=clamped_rate, layout=layout, rng=rng)
|
165
177
|
|
166
178
|
|
167
179
|
class Typogre(Glitchling):
|
@@ -170,17 +182,25 @@ class Typogre(Glitchling):
|
|
170
182
|
def __init__(
|
171
183
|
self,
|
172
184
|
*,
|
173
|
-
|
185
|
+
rate: float | None = None,
|
186
|
+
max_change_rate: float | None = None,
|
174
187
|
keyboard: str = "CURATOR_QWERTY",
|
175
188
|
seed: int | None = None,
|
176
189
|
) -> None:
|
190
|
+
self._param_aliases = {"max_change_rate": "rate"}
|
191
|
+
effective_rate = resolve_rate(
|
192
|
+
rate=rate,
|
193
|
+
legacy_value=max_change_rate,
|
194
|
+
default=0.02,
|
195
|
+
legacy_name="max_change_rate",
|
196
|
+
)
|
177
197
|
super().__init__(
|
178
198
|
name="Typogre",
|
179
199
|
corruption_function=fatfinger,
|
180
200
|
scope=AttackWave.CHARACTER,
|
181
201
|
order=AttackOrder.EARLY,
|
182
202
|
seed=seed,
|
183
|
-
|
203
|
+
rate=effective_rate,
|
184
204
|
keyboard=keyboard,
|
185
205
|
)
|
186
206
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: glitchlings
|
3
|
-
Version: 0.2.
|
3
|
+
Version: 0.2.3
|
4
4
|
Summary: Monsters for your language games.
|
5
5
|
Author: osoleve
|
6
6
|
License: Apache License
|
@@ -215,6 +215,8 @@ Classifier: Intended Audience :: Developers
|
|
215
215
|
Classifier: License :: OSI Approved :: Apache Software License
|
216
216
|
Classifier: Programming Language :: Python
|
217
217
|
Classifier: Programming Language :: Python :: 3
|
218
|
+
Classifier: Programming Language :: Python :: 3.10
|
219
|
+
Classifier: Programming Language :: Python :: 3.11
|
218
220
|
Classifier: Programming Language :: Python :: 3.12
|
219
221
|
Classifier: Programming Language :: Rust
|
220
222
|
Classifier: Operating System :: MacOS :: MacOS X
|
@@ -223,7 +225,7 @@ Classifier: Operating System :: POSIX :: Linux
|
|
223
225
|
Classifier: Operating System :: OS Independent
|
224
226
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
225
227
|
Classifier: Topic :: Software Development :: Testing
|
226
|
-
Requires-Python: >=3.
|
228
|
+
Requires-Python: >=3.10
|
227
229
|
Description-Content-Type: text/markdown
|
228
230
|
License-File: LICENSE
|
229
231
|
Requires-Dist: confusable-homoglyphs>=3.3.1
|
@@ -280,14 +282,16 @@ After all, what good is general intelligence if it can't handle a little chaos?
|
|
280
282
|
pip install -U glitchlings
|
281
283
|
```
|
282
284
|
|
285
|
+
> Glitchlings requires Python 3.10 or newer.
|
286
|
+
|
283
287
|
```python
|
284
288
|
from glitchlings import Gaggle, SAMPLE_TEXT, Typogre, Mim1c, Reduple, Rushmore
|
285
289
|
|
286
290
|
gaggle = Gaggle([
|
287
|
-
Typogre(
|
288
|
-
Mim1c(
|
291
|
+
Typogre(rate=0.03),
|
292
|
+
Mim1c(rate=0.02),
|
289
293
|
Reduple(seed=404),
|
290
|
-
Rushmore(
|
294
|
+
Rushmore(rate=0.02),
|
291
295
|
])
|
292
296
|
|
293
297
|
print(gaggle(SAMPLE_TEXT))
|
@@ -295,41 +299,10 @@ print(gaggle(SAMPLE_TEXT))
|
|
295
299
|
|
296
300
|
> Onҽ mھrning, wһen Gregor Samƽa woke from trouble𝐝 𝑑reams, he found himself transformed in his bed into a horrible vermin٠ He l lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightlh domed and divided by arches ino stiff sections. The bedding was adly able to cover it and and seemed ready to slide off any moment. His many legxs, pitifully thin compared with the size of the the rest of him, waved about helplessly ashe looked looked.
|
297
301
|
|
298
|
-
|
299
|
-
|
300
|
-
Need detailed usage patterns, dataset workflows, or tips for enabling the
|
301
|
-
Rust accelerator? Consult the [Glitchlings Usage Guide](docs/index.md)
|
302
|
-
for end-to-end instructions spanning the Python API, CLI, Hugging Face
|
302
|
+
Consult the [Glitchlings Usage Guide](docs/index.md)
|
303
|
+
for end-to-end instructions spanning the Python API, CLI, Hugging Face and Prime Intellect
|
303
304
|
integrations, and the feature-flagged Rust pipeline.
|
304
305
|
|
305
|
-
### Prime Intellect environments
|
306
|
-
|
307
|
-
After `pip install -e .[prime]`, the `glitchlings.dlc.prime.load_environment` helper mirrors `verifiers.load_environment` for Prime Intellect scenarios while optionally applying glitchlings before returning the environment:
|
308
|
-
|
309
|
-
```python
|
310
|
-
from glitchlings import Mim1c, Typogre
|
311
|
-
from glitchlings.dlc.prime import echo_chamber, load_environment
|
312
|
-
|
313
|
-
env = load_environment(
|
314
|
-
"osoleve/syllabify-en",
|
315
|
-
glitchlings=[Mim1c(replacement_rate=0.01), Typogre(max_change_rate=0.02)],
|
316
|
-
seed=404,
|
317
|
-
)
|
318
|
-
|
319
|
-
# Spin up an echo chamber that corrupts a dataset column and
|
320
|
-
# rewards models for perfectly restoring it
|
321
|
-
practice_env = echo_chamber(
|
322
|
-
"osoleve/clean-room",
|
323
|
-
column="text",
|
324
|
-
glitchlings=["Typogre", "Mim1c"],
|
325
|
-
reward_function=lambda prompt, completion, answer: float(completion == answer),
|
326
|
-
)
|
327
|
-
```
|
328
|
-
|
329
|
-
Skip the `glitchlings` argument to receive an untouched verifier dataset, and
|
330
|
-
override `reward_function` when you want to evaluate completions with a custom
|
331
|
-
scoring routine.
|
332
|
-
|
333
306
|
## Motivation
|
334
307
|
|
335
308
|
If your model performs well on a particular task, but not when `Glitchling`s are present, it's a sign that it hasn't actually generalized to the problem.
|
@@ -344,8 +317,8 @@ Glitchlings are standard Python classes, so you can instantiate them with whatev
|
|
344
317
|
```python
|
345
318
|
from glitchlings import Gaggle, Typogre, Mim1c
|
346
319
|
|
347
|
-
custom_typogre = Typogre(
|
348
|
-
selective_mimic = Mim1c(
|
320
|
+
custom_typogre = Typogre(rate=0.1)
|
321
|
+
selective_mimic = Mim1c(rate=0.05, classes=["LATIN", "GREEK"])
|
349
322
|
|
350
323
|
gaggle = Gaggle([custom_typogre, selective_mimic], seed=99)
|
351
324
|
print(gaggle("Summoned heroes do not fear the glitch."))
|
@@ -376,7 +349,7 @@ glitchlings --list
|
|
376
349
|
glitchlings -g typogre --file documents/report.txt --diff
|
377
350
|
|
378
351
|
# Configure glitchlings inline by passing keyword arguments.
|
379
|
-
glitchlings -g "Typogre(
|
352
|
+
glitchlings -g "Typogre(rate=0.05)" "Ghouls just wanna have fun"
|
380
353
|
|
381
354
|
# Pipe text straight into the CLI for an on-the-fly corruption.
|
382
355
|
echo "Beware LLM-written flavor-text" | glitchlings -g mim1c
|
@@ -400,7 +373,7 @@ _What a nice word, would be a shame if something happened to it._
|
|
400
373
|
>
|
401
374
|
> Args
|
402
375
|
>
|
403
|
-
> - `
|
376
|
+
> - `rate (float)`: The maximum number of edits to make, as a proportion of the text length (default: 0.02, 2%).
|
404
377
|
> - `keyboard (str)`: Keyboard layout key-neighbor map to use (default: "CURATOR_QWERTY"; also accepts "QWERTY", "DVORAK", "COLEMAK", and "AZERTY").
|
405
378
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
406
379
|
|
@@ -412,7 +385,7 @@ _Wait, was that...?_
|
|
412
385
|
>
|
413
386
|
> Args
|
414
387
|
>
|
415
|
-
> - `
|
388
|
+
> - `rate (float)`: The maximum proportion of characters to replace (default: 0.02, 2%).
|
416
389
|
> - `classes (list[str] | "all")`: Restrict replacements to these Unicode script classes (default: ["LATIN", "GREEK", "CYRILLIC"]).
|
417
390
|
> - `banned_characters (Collection[str])`: Characters that must never appear as replacements (default: none).
|
418
391
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
@@ -425,7 +398,7 @@ _How can a computer need reading glasses?_
|
|
425
398
|
>
|
426
399
|
> Args
|
427
400
|
>
|
428
|
-
> - `
|
401
|
+
> - `rate (float)`: The maximum proportion of eligible confusion spans to replace (default: 0.02, 2%).
|
429
402
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
430
403
|
|
431
404
|
### Jargoyle
|
@@ -436,7 +409,7 @@ _Uh oh. The worst person you know just bought a thesaurus._
|
|
436
409
|
>
|
437
410
|
> Args
|
438
411
|
>
|
439
|
-
> - `
|
412
|
+
> - `rate (float)`: The maximum proportion of words to replace (default: 0.1, 10%).
|
440
413
|
> - `part_of_speech`: The WordNet part(s) of speech to target (default: nouns). Accepts `wn.NOUN`, `wn.VERB`, `wn.ADJ`, `wn.ADV`, any iterable of those tags, or the string `"any"` to include them all.
|
441
414
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
442
415
|
|
@@ -448,7 +421,7 @@ _Did you say that or did I?_
|
|
448
421
|
>
|
449
422
|
> Args
|
450
423
|
>
|
451
|
-
> - `
|
424
|
+
> - `rate (float)`: The maximum proportion of words to reduplicate (default: 0.05, 5%).
|
452
425
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
453
426
|
|
454
427
|
### Rushmore
|
@@ -459,7 +432,7 @@ _I accidentally an entire word._
|
|
459
432
|
>
|
460
433
|
> Args
|
461
434
|
>
|
462
|
-
> - `
|
435
|
+
> - `rate (float)`: The maximum proportion of words to delete (default: 0.01, 1%).
|
463
436
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
464
437
|
|
465
438
|
### Redactyl
|
@@ -471,7 +444,7 @@ _Oops, that was my black highlighter._
|
|
471
444
|
> ### Args
|
472
445
|
>
|
473
446
|
> - `replacement_char (str)`: The character to use for redaction (default: █).
|
474
|
-
> - `
|
447
|
+
> - `rate (float)`: The maximum proportion of words to redact (default: 0.05, 5%).
|
475
448
|
> - `merge_adjacent (bool)`: Whether to redact the space between adjacent redacted words (default: False).
|
476
449
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
477
450
|
|
@@ -0,0 +1,26 @@
|
|
1
|
+
glitchlings/__init__.py,sha256=w8heFqUejrXM_9NNlM9CQnIGkmGUyBV29acg3WsocXA,622
|
2
|
+
glitchlings/__main__.py,sha256=pqNe1C9hMf8pap4oh6x6yo2h4Nsa2RFSaMWHfGtNXj0,130
|
3
|
+
glitchlings/_zoo_rust.cp312-win_amd64.pyd,sha256=08xnERw5xVGeKk4DT4g1_NWmyHwiDlqt8UtLxR1jk9k,1989632
|
4
|
+
glitchlings/main.py,sha256=QrSSLWcKh1_NDfJDGh-3UVKdI7AkzfMy6Jz1ouxIgnE,6149
|
5
|
+
glitchlings/dlc/__init__.py,sha256=IHD-GGhVFb7SVzErvf2YCJkOR4wGo0nFHXkn_daMvS8,146
|
6
|
+
glitchlings/dlc/huggingface.py,sha256=PIesnDIEvyJxj1IuLw2P9nVPTr4Nv81XM7w2axfyhkA,3029
|
7
|
+
glitchlings/dlc/prime.py,sha256=hySyYBncUM-49j6JtrHYO6c3HpbG2vTt2EYZnOJ85C0,8972
|
8
|
+
glitchlings/util/__init__.py,sha256=GoyQuHTfGRkHzuZwJji6QWSiGd_LHa9QiyjjEpBFW7E,4679
|
9
|
+
glitchlings/zoo/__init__.py,sha256=kYKKlNvEwKtrD26E1hfde33rkN83CMf_h5AQFGjQyBQ,4312
|
10
|
+
glitchlings/zoo/_ocr_confusions.py,sha256=W59Aa5MBDwRF65f8GV-6XwGAmlR5Uk7pa5qvHvhIYdY,1252
|
11
|
+
glitchlings/zoo/_rate.py,sha256=EYUWXYyR2IK0zYBWyBOlnUjDxU32JE9mZTZeodVx5CA,548
|
12
|
+
glitchlings/zoo/core.py,sha256=QKHmzmONNkiA3RdfgLdNx-FPFwoH4Bm-Tkc3vSCHNpc,14412
|
13
|
+
glitchlings/zoo/jargoyle.py,sha256=1fnL_8bv1Y-T2h1C6NRzIylYyOuAUI-BiMReFewqh00,11002
|
14
|
+
glitchlings/zoo/mim1c.py,sha256=3ddNOzWgLABuEOh5T98Xk439ejx-YHGI7ErXET03Crc,3537
|
15
|
+
glitchlings/zoo/ocr_confusions.tsv,sha256=S-IJEYCIXYKT1Uu7Id8Lnvg5pw528yNigTtWUdnMv9k,213
|
16
|
+
glitchlings/zoo/redactyl.py,sha256=dM3W59xLhuiS8t5jXETc_L8EEhRN1CpLazBnVPiSknk,4834
|
17
|
+
glitchlings/zoo/reduple.py,sha256=9jid6tCvCaiSxWSPMNuHWZitd7et60RRFYeek3S0ElU,3641
|
18
|
+
glitchlings/zoo/rushmore.py,sha256=pJy3g_H1z8PNoHitvD3-HsytAuE0U6FOdsdaKZy6OqY,3680
|
19
|
+
glitchlings/zoo/scannequin.py,sha256=TJyNYTTIB7rxZH3XKIETy0YVf4EjsMgGWYmYaxH9jxU,5030
|
20
|
+
glitchlings/zoo/typogre.py,sha256=olTTXDmFkVQ3r-T1vxm2mLomRvIDXHrNHfgin316wzE,6221
|
21
|
+
glitchlings-0.2.3.dist-info/licenses/LICENSE,sha256=EFEP1evBfHaxsMTBjxm0sZVRp2wct8QLvHE1saII5FI,11538
|
22
|
+
glitchlings-0.2.3.dist-info/METADATA,sha256=oiBG6ir6cxTdmOHfNJ4A3FXoTGJRnXk2Qebs2OMX7ZY,26696
|
23
|
+
glitchlings-0.2.3.dist-info/WHEEL,sha256=8UP9x9puWI0P1V_d7K2oMTBqfeLNm21CTzZ_Ptr0NXU,101
|
24
|
+
glitchlings-0.2.3.dist-info/entry_points.txt,sha256=kGOwuAsjFDLtztLisaXtOouq9wFVMOJg5FzaAkg-Hto,54
|
25
|
+
glitchlings-0.2.3.dist-info/top_level.txt,sha256=VHFNBrLjtDwPCYXbGKi6o17Eueedi81eNbR3hBOoST0,12
|
26
|
+
glitchlings-0.2.3.dist-info/RECORD,,
|
@@ -1,25 +0,0 @@
|
|
1
|
-
glitchlings/__init__.py,sha256=w8heFqUejrXM_9NNlM9CQnIGkmGUyBV29acg3WsocXA,622
|
2
|
-
glitchlings/__main__.py,sha256=pqNe1C9hMf8pap4oh6x6yo2h4Nsa2RFSaMWHfGtNXj0,130
|
3
|
-
glitchlings/_zoo_rust.cp312-win_amd64.pyd,sha256=Eh4tD2b4ym3zX0KWxVWCFRpmPsZFnyeOiFWr_qQGg5A,1989632
|
4
|
-
glitchlings/main.py,sha256=krujz3GBrdP6FU3O6Z9f3rvc444rT79Hm69zAPG3b-U,6160
|
5
|
-
glitchlings/dlc/__init__.py,sha256=IHD-GGhVFb7SVzErvf2YCJkOR4wGo0nFHXkn_daMvS8,146
|
6
|
-
glitchlings/dlc/huggingface.py,sha256=PIesnDIEvyJxj1IuLw2P9nVPTr4Nv81XM7w2axfyhkA,3029
|
7
|
-
glitchlings/dlc/prime.py,sha256=oKVAVWSD-aa-LqDsctSLXzq0JW2RaIc1l2859ogr4lY,8107
|
8
|
-
glitchlings/util/__init__.py,sha256=GoyQuHTfGRkHzuZwJji6QWSiGd_LHa9QiyjjEpBFW7E,4679
|
9
|
-
glitchlings/zoo/__init__.py,sha256=kYKKlNvEwKtrD26E1hfde33rkN83CMf_h5AQFGjQyBQ,4312
|
10
|
-
glitchlings/zoo/_ocr_confusions.py,sha256=W59Aa5MBDwRF65f8GV-6XwGAmlR5Uk7pa5qvHvhIYdY,1252
|
11
|
-
glitchlings/zoo/core.py,sha256=aGGc0M97QeKM5rsQjTZs3fhIVac0g8A72mW4u72YnD0,14373
|
12
|
-
glitchlings/zoo/jargoyle.py,sha256=TBzt9CFL5GBP_DjqKqUY54DFsX2VAU4LnBNMDIg7P-Y,10444
|
13
|
-
glitchlings/zoo/mim1c.py,sha256=YHFELu3fpY_9VxRavYfCoAWZYp-HZBXdiLk4DTKdqcY,2979
|
14
|
-
glitchlings/zoo/ocr_confusions.tsv,sha256=S-IJEYCIXYKT1Uu7Id8Lnvg5pw528yNigTtWUdnMv9k,213
|
15
|
-
glitchlings/zoo/redactyl.py,sha256=VV2mPE2WQ41Sl874TjaHu9ShhYlFNLI7embQqKM5_ZE,3738
|
16
|
-
glitchlings/zoo/reduple.py,sha256=WuMpmuZrf5x7JneiRjDF2Y0beEAn7j1DPCV2BuuTuRY,2873
|
17
|
-
glitchlings/zoo/rushmore.py,sha256=dAiv53B_6Zg-zNG5aW8YobJevyBV586HtJVlZqgcGR8,2790
|
18
|
-
glitchlings/zoo/scannequin.py,sha256=BLJ8VFNTrXxv6mKjTMPUHOqziXO-NLpKNQNPbxG7jLI,4178
|
19
|
-
glitchlings/zoo/typogre.py,sha256=CISk0aqI8y5SdZXibqhfP0cu5MZ7TkiOQ7kftqW9RtI,5680
|
20
|
-
glitchlings-0.2.2.dist-info/licenses/LICENSE,sha256=EFEP1evBfHaxsMTBjxm0sZVRp2wct8QLvHE1saII5FI,11538
|
21
|
-
glitchlings-0.2.2.dist-info/METADATA,sha256=mRSQQoNoQAPmmVzfUn6ZZLHL1I6n5wxr45o3DyWsSMw,27811
|
22
|
-
glitchlings-0.2.2.dist-info/WHEEL,sha256=8UP9x9puWI0P1V_d7K2oMTBqfeLNm21CTzZ_Ptr0NXU,101
|
23
|
-
glitchlings-0.2.2.dist-info/entry_points.txt,sha256=kGOwuAsjFDLtztLisaXtOouq9wFVMOJg5FzaAkg-Hto,54
|
24
|
-
glitchlings-0.2.2.dist-info/top_level.txt,sha256=VHFNBrLjtDwPCYXbGKi6o17Eueedi81eNbR3hBOoST0,12
|
25
|
-
glitchlings-0.2.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|