glitchlings 0.2.2__cp312-cp312-win_amd64.whl → 0.2.4__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings/_zoo_rust.cp312-win_amd64.pyd +0 -0
- glitchlings/dlc/prime.py +44 -22
- glitchlings/main.py +1 -1
- glitchlings/zoo/_rate.py +21 -0
- glitchlings/zoo/core.py +56 -52
- glitchlings/zoo/jargoyle.py +24 -5
- glitchlings/zoo/mim1c.py +24 -5
- glitchlings/zoo/redactyl.py +43 -8
- glitchlings/zoo/reduple.py +36 -8
- glitchlings/zoo/rushmore.py +40 -8
- glitchlings/zoo/scannequin.py +38 -8
- glitchlings/zoo/typogre.py +29 -9
- {glitchlings-0.2.2.dist-info → glitchlings-0.2.4.dist-info}/METADATA +23 -55
- glitchlings-0.2.4.dist-info/RECORD +26 -0
- glitchlings-0.2.2.dist-info/RECORD +0 -25
- {glitchlings-0.2.2.dist-info → glitchlings-0.2.4.dist-info}/WHEEL +0 -0
- {glitchlings-0.2.2.dist-info → glitchlings-0.2.4.dist-info}/entry_points.txt +0 -0
- {glitchlings-0.2.2.dist-info → glitchlings-0.2.4.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.2.2.dist-info → glitchlings-0.2.4.dist-info}/top_level.txt +0 -0
Binary file
|
glitchlings/dlc/prime.py
CHANGED
@@ -79,8 +79,8 @@ def tutorial_level(
|
|
79
79
|
) -> vf.Environment:
|
80
80
|
"""Create a low-corruption environment using tuned defaults."""
|
81
81
|
|
82
|
-
tuned_mim1c = Mim1c(
|
83
|
-
tuned_typogre = Typogre(
|
82
|
+
tuned_mim1c = Mim1c(rate=0.01 * difficulty.value)
|
83
|
+
tuned_typogre = Typogre(rate=0.025 * difficulty.value)
|
84
84
|
|
85
85
|
return load_environment(
|
86
86
|
env,
|
@@ -220,32 +220,54 @@ def echo_chamber(
|
|
220
220
|
"Specify which split to use when the dataset loads as a DatasetDict."
|
221
221
|
)
|
222
222
|
|
223
|
-
|
224
|
-
|
223
|
+
filtered_dataset = hf_dataset.filter(
|
224
|
+
lambda row: row.get(column) is not None,
|
225
|
+
load_from_cache_file=False,
|
226
|
+
)
|
225
227
|
|
226
|
-
|
227
|
-
value = row.get(column)
|
228
|
-
if value is None:
|
229
|
-
continue
|
228
|
+
source_column_names = list(filtered_dataset.column_names)
|
230
229
|
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
answers.append(text)
|
230
|
+
def _build_prompt(row: dict[str, Any]) -> dict[str, Any]:
|
231
|
+
text = str(row[column])
|
232
|
+
prompt = [
|
233
|
+
{"role": "system", "content": instructions},
|
234
|
+
{"role": "user", "content": f"Corrupted text:\n{text}"},
|
235
|
+
]
|
236
|
+
return {"prompt": prompt, "answer": text}
|
239
237
|
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
238
|
+
base_dataset = filtered_dataset.map(
|
239
|
+
_build_prompt,
|
240
|
+
remove_columns=source_column_names,
|
241
|
+
load_from_cache_file=False,
|
242
|
+
)
|
244
243
|
|
245
|
-
|
244
|
+
try:
|
245
|
+
dataset_length = len(base_dataset) # type: ignore[arg-type]
|
246
|
+
except TypeError:
|
247
|
+
preview_rows: list[dict[str, Any]]
|
248
|
+
take_fn = getattr(base_dataset, "take", None)
|
249
|
+
if callable(take_fn):
|
250
|
+
preview_rows = list(take_fn(1))
|
251
|
+
else:
|
252
|
+
iterator = iter(base_dataset)
|
253
|
+
try:
|
254
|
+
first_row = next(iterator)
|
255
|
+
except StopIteration:
|
256
|
+
preview_rows = []
|
257
|
+
else:
|
258
|
+
preview_rows = [first_row]
|
259
|
+
if not preview_rows:
|
260
|
+
raise ValueError(
|
261
|
+
f"Column '{column}' did not yield any textual entries in dataset '{dataset_id}'."
|
262
|
+
)
|
263
|
+
else:
|
264
|
+
if dataset_length == 0:
|
265
|
+
raise ValueError(
|
266
|
+
f"Column '{column}' did not yield any textual entries in dataset '{dataset_id}'."
|
267
|
+
)
|
246
268
|
|
247
269
|
gaggle = _as_gaggle(glitchlings, seed=seed)
|
248
|
-
glitched_dataset = gaggle.corrupt_dataset(
|
270
|
+
glitched_dataset = gaggle.corrupt_dataset(base_dataset, ["prompt"])
|
249
271
|
|
250
272
|
rubric_func = reward_function or symmetric_damerau_levenshtein_similarity
|
251
273
|
rubric = vf.Rubric(funcs=[rubric_func], weights=[1.0])
|
glitchlings/main.py
CHANGED
@@ -46,7 +46,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
46
46
|
metavar="SPEC",
|
47
47
|
help=(
|
48
48
|
"Glitchling to apply, optionally with parameters like "
|
49
|
-
"Typogre(
|
49
|
+
"Typogre(rate=0.05). Repeat for multiples; defaults to all built-ins."
|
50
50
|
),
|
51
51
|
)
|
52
52
|
parser.add_argument(
|
glitchlings/zoo/_rate.py
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
|
4
|
+
def resolve_rate(
|
5
|
+
*,
|
6
|
+
rate: float | None,
|
7
|
+
legacy_value: float | None,
|
8
|
+
default: float,
|
9
|
+
legacy_name: str,
|
10
|
+
) -> float:
|
11
|
+
"""Return the effective rate while enforcing mutual exclusivity."""
|
12
|
+
|
13
|
+
if rate is not None and legacy_value is not None:
|
14
|
+
raise ValueError(
|
15
|
+
f"Specify either 'rate' or '{legacy_name}', not both."
|
16
|
+
)
|
17
|
+
if rate is not None:
|
18
|
+
return rate
|
19
|
+
if legacy_value is not None:
|
20
|
+
return legacy_value
|
21
|
+
return default
|
glitchlings/zoo/core.py
CHANGED
@@ -107,6 +107,7 @@ class Glitchling:
|
|
107
107
|
scope: AttackWave,
|
108
108
|
order: AttackOrder = AttackOrder.NORMAL,
|
109
109
|
seed: int | None = None,
|
110
|
+
pipeline_operation: Callable[["Glitchling"], dict[str, Any] | None] | None = None,
|
110
111
|
**kwargs: Any,
|
111
112
|
) -> None:
|
112
113
|
"""Initialize a glitchling.
|
@@ -128,31 +129,76 @@ class Glitchling:
|
|
128
129
|
self.corruption_function: CorruptionCallable = corruption_function
|
129
130
|
self.level: AttackWave = scope
|
130
131
|
self.order: AttackOrder = order
|
132
|
+
self._pipeline_descriptor_factory = pipeline_operation
|
131
133
|
self.kwargs: dict[str, Any] = {}
|
134
|
+
self._cached_rng_callable: CorruptionCallable | None = None
|
135
|
+
self._cached_rng_expectation: bool | None = None
|
132
136
|
for kw, val in kwargs.items():
|
133
137
|
self.set_param(kw, val)
|
134
138
|
|
135
139
|
def set_param(self, key: str, value: Any) -> None:
|
136
140
|
"""Persist a parameter for use by the corruption callable."""
|
137
141
|
|
138
|
-
|
139
|
-
|
140
|
-
|
142
|
+
aliases = getattr(self, "_param_aliases", {})
|
143
|
+
canonical = aliases.get(key, key)
|
144
|
+
|
145
|
+
# Drop stale alias keys so we only forward canonical kwargs.
|
146
|
+
self.kwargs.pop(key, None)
|
147
|
+
for alias, target in aliases.items():
|
148
|
+
if target == canonical:
|
149
|
+
self.kwargs.pop(alias, None)
|
150
|
+
|
151
|
+
self.kwargs[canonical] = value
|
152
|
+
setattr(self, canonical, value)
|
153
|
+
|
154
|
+
if canonical == "seed":
|
141
155
|
self.reset_rng(value)
|
142
156
|
|
143
|
-
|
144
|
-
|
157
|
+
for alias, target in aliases.items():
|
158
|
+
if target == canonical:
|
159
|
+
setattr(self, alias, value)
|
145
160
|
|
146
|
-
|
161
|
+
def pipeline_operation(self) -> dict[str, Any] | None:
|
162
|
+
"""Return the Rust pipeline operation descriptor for this glitchling."""
|
163
|
+
|
164
|
+
factory = self._pipeline_descriptor_factory
|
165
|
+
if factory is None:
|
166
|
+
return None
|
167
|
+
|
168
|
+
return factory(self)
|
169
|
+
|
170
|
+
def _corruption_expects_rng(self) -> bool:
|
171
|
+
"""Return `True` when the corruption function accepts an rng keyword."""
|
172
|
+
|
173
|
+
cached_callable = self._cached_rng_callable
|
174
|
+
cached_expectation = self._cached_rng_expectation
|
175
|
+
corruption_function = self.corruption_function
|
176
|
+
|
177
|
+
if (
|
178
|
+
cached_callable is corruption_function
|
179
|
+
and cached_expectation is not None
|
180
|
+
):
|
181
|
+
return cached_expectation
|
182
|
+
|
183
|
+
expects_rng = False
|
147
184
|
try:
|
148
|
-
signature = inspect.signature(
|
185
|
+
signature = inspect.signature(corruption_function)
|
149
186
|
except (TypeError, ValueError):
|
150
187
|
signature = None
|
151
188
|
|
152
|
-
expects_rng = False
|
153
189
|
if signature is not None:
|
154
190
|
expects_rng = "rng" in signature.parameters
|
155
191
|
|
192
|
+
self._cached_rng_callable = corruption_function
|
193
|
+
self._cached_rng_expectation = expects_rng
|
194
|
+
return expects_rng
|
195
|
+
|
196
|
+
def __corrupt(self, text: str, *args: Any, **kwargs: Any) -> str:
|
197
|
+
"""Execute the corruption callable, injecting the RNG when required."""
|
198
|
+
|
199
|
+
# Pass rng to underlying corruption function if it expects it.
|
200
|
+
expects_rng = self._corruption_expects_rng()
|
201
|
+
|
156
202
|
if expects_rng:
|
157
203
|
corrupted = self.corruption_function(text, *args, rng=self.rng, **kwargs)
|
158
204
|
else:
|
@@ -231,53 +277,14 @@ class Glitchling:
|
|
231
277
|
self.corruption_function,
|
232
278
|
self.level,
|
233
279
|
self.order,
|
280
|
+
pipeline_operation=self._pipeline_descriptor_factory,
|
234
281
|
**filtered_kwargs,
|
235
282
|
)
|
236
283
|
|
237
284
|
return cls(**filtered_kwargs)
|
238
285
|
|
239
286
|
|
240
|
-
def _pipeline_operation_reduplicate(glitchling: "Glitchling") -> dict[str, Any] | None:
|
241
|
-
rate = glitchling.kwargs.get("reduplication_rate")
|
242
|
-
if rate is None:
|
243
|
-
return None
|
244
|
-
return {"type": "reduplicate", "reduplication_rate": float(rate)}
|
245
|
-
|
246
287
|
|
247
|
-
def _pipeline_operation_delete(glitchling: "Glitchling") -> dict[str, Any] | None:
|
248
|
-
rate = glitchling.kwargs.get("max_deletion_rate")
|
249
|
-
if rate is None:
|
250
|
-
return None
|
251
|
-
return {"type": "delete", "max_deletion_rate": float(rate)}
|
252
|
-
|
253
|
-
|
254
|
-
def _pipeline_operation_redact(glitchling: "Glitchling") -> dict[str, Any] | None:
|
255
|
-
replacement_char = glitchling.kwargs.get("replacement_char")
|
256
|
-
redaction_rate = glitchling.kwargs.get("redaction_rate")
|
257
|
-
merge_adjacent = glitchling.kwargs.get("merge_adjacent")
|
258
|
-
if replacement_char is None or redaction_rate is None or merge_adjacent is None:
|
259
|
-
return None
|
260
|
-
return {
|
261
|
-
"type": "redact",
|
262
|
-
"replacement_char": str(replacement_char),
|
263
|
-
"redaction_rate": float(redaction_rate),
|
264
|
-
"merge_adjacent": bool(merge_adjacent),
|
265
|
-
}
|
266
|
-
|
267
|
-
|
268
|
-
def _pipeline_operation_ocr(glitchling: "Glitchling") -> dict[str, Any] | None:
|
269
|
-
error_rate = glitchling.kwargs.get("error_rate")
|
270
|
-
if error_rate is None:
|
271
|
-
return None
|
272
|
-
return {"type": "ocr", "error_rate": float(error_rate)}
|
273
|
-
|
274
|
-
|
275
|
-
_PIPELINE_OPERATION_BUILDERS: dict[str, Callable[["Glitchling"], dict[str, Any] | None]] = {
|
276
|
-
"Reduple": _pipeline_operation_reduplicate,
|
277
|
-
"Rushmore": _pipeline_operation_delete,
|
278
|
-
"Redactyl": _pipeline_operation_redact,
|
279
|
-
"Scannequin": _pipeline_operation_ocr,
|
280
|
-
}
|
281
288
|
|
282
289
|
|
283
290
|
class Gaggle(Glitchling):
|
@@ -359,10 +366,7 @@ class Gaggle(Glitchling):
|
|
359
366
|
|
360
367
|
descriptors: list[dict[str, Any]] = []
|
361
368
|
for glitchling in self.apply_order:
|
362
|
-
|
363
|
-
if builder is None:
|
364
|
-
return None
|
365
|
-
operation = builder(glitchling)
|
369
|
+
operation = glitchling.pipeline_operation()
|
366
370
|
if operation is None:
|
367
371
|
return None
|
368
372
|
|
glitchlings/zoo/jargoyle.py
CHANGED
@@ -33,6 +33,7 @@ else:
|
|
33
33
|
_WORDNET_MODULE = None
|
34
34
|
|
35
35
|
from .core import AttackWave, Glitchling
|
36
|
+
from ._rate import resolve_rate
|
36
37
|
|
37
38
|
_WORDNET_HANDLE: WordNetCorpusReader | Any | None = _WORDNET_MODULE
|
38
39
|
|
@@ -211,16 +212,18 @@ def _collect_synonyms(
|
|
211
212
|
|
212
213
|
def substitute_random_synonyms(
|
213
214
|
text: str,
|
214
|
-
|
215
|
+
rate: float | None = None,
|
215
216
|
part_of_speech: PartOfSpeechInput = "n",
|
216
217
|
seed: int | None = None,
|
217
218
|
rng: random.Random | None = None,
|
219
|
+
*,
|
220
|
+
replacement_rate: float | None = None,
|
218
221
|
) -> str:
|
219
222
|
"""Replace words with random WordNet synonyms.
|
220
223
|
|
221
224
|
Parameters
|
222
225
|
- text: Input text.
|
223
|
-
-
|
226
|
+
- rate: Max proportion of candidate words to replace (default 0.1).
|
224
227
|
- part_of_speech: WordNet POS tag(s) to target. Accepts "n", "v", "a", "r",
|
225
228
|
any iterable of those tags, or "any" to include all four.
|
226
229
|
- rng: Optional RNG instance used for deterministic sampling.
|
@@ -232,6 +235,13 @@ def substitute_random_synonyms(
|
|
232
235
|
- Synonyms sorted before rng.choice to fix ordering.
|
233
236
|
- For each POS, the first synset containing alternate lemmas is used for stability.
|
234
237
|
"""
|
238
|
+
effective_rate = resolve_rate(
|
239
|
+
rate=rate,
|
240
|
+
legacy_value=replacement_rate,
|
241
|
+
default=0.1,
|
242
|
+
legacy_name="replacement_rate",
|
243
|
+
)
|
244
|
+
|
235
245
|
ensure_wordnet()
|
236
246
|
wordnet = _wordnet()
|
237
247
|
|
@@ -270,7 +280,8 @@ def substitute_random_synonyms(
|
|
270
280
|
if not candidate_indices:
|
271
281
|
return text
|
272
282
|
|
273
|
-
|
283
|
+
clamped_rate = max(0.0, effective_rate)
|
284
|
+
max_replacements = int(len(candidate_indices) * clamped_rate)
|
274
285
|
if max_replacements <= 0:
|
275
286
|
return text
|
276
287
|
|
@@ -297,16 +308,24 @@ class Jargoyle(Glitchling):
|
|
297
308
|
def __init__(
|
298
309
|
self,
|
299
310
|
*,
|
300
|
-
|
311
|
+
rate: float | None = None,
|
312
|
+
replacement_rate: float | None = None,
|
301
313
|
part_of_speech: PartOfSpeechInput = "n",
|
302
314
|
seed: int | None = None,
|
303
315
|
) -> None:
|
316
|
+
self._param_aliases = {"replacement_rate": "rate"}
|
317
|
+
effective_rate = resolve_rate(
|
318
|
+
rate=rate,
|
319
|
+
legacy_value=replacement_rate,
|
320
|
+
default=0.1,
|
321
|
+
legacy_name="replacement_rate",
|
322
|
+
)
|
304
323
|
super().__init__(
|
305
324
|
name="Jargoyle",
|
306
325
|
corruption_function=substitute_random_synonyms,
|
307
326
|
scope=AttackWave.WORD,
|
308
327
|
seed=seed,
|
309
|
-
|
328
|
+
rate=effective_rate,
|
310
329
|
part_of_speech=part_of_speech,
|
311
330
|
)
|
312
331
|
|
glitchlings/zoo/mim1c.py
CHANGED
@@ -5,21 +5,24 @@ from typing import Literal
|
|
5
5
|
from confusable_homoglyphs import confusables
|
6
6
|
|
7
7
|
from .core import AttackOrder, AttackWave, Glitchling
|
8
|
+
from ._rate import resolve_rate
|
8
9
|
|
9
10
|
|
10
11
|
def swap_homoglyphs(
|
11
12
|
text: str,
|
12
|
-
|
13
|
+
rate: float | None = None,
|
13
14
|
classes: list[str] | Literal["all"] | None = None,
|
14
15
|
banned_characters: Collection[str] | None = None,
|
15
16
|
seed: int | None = None,
|
16
17
|
rng: random.Random | None = None,
|
18
|
+
*,
|
19
|
+
replacement_rate: float | None = None,
|
17
20
|
) -> str:
|
18
21
|
"""Replace characters with visually confusable homoglyphs.
|
19
22
|
|
20
23
|
Parameters
|
21
24
|
- text: Input text.
|
22
|
-
-
|
25
|
+
- rate: Max proportion of eligible characters to replace (default 0.02).
|
23
26
|
- classes: Restrict replacements to these Unicode script classes (default ["LATIN","GREEK","CYRILLIC"]). Use "all" to allow any.
|
24
27
|
- banned_characters: Characters that must never appear as replacements.
|
25
28
|
- seed: Optional seed if `rng` not provided.
|
@@ -29,6 +32,13 @@ def swap_homoglyphs(
|
|
29
32
|
- Only replaces characters present in confusables.confusables_data with single-codepoint alternatives.
|
30
33
|
- Maintains determinism by shuffling candidates and sampling via the provided RNG.
|
31
34
|
"""
|
35
|
+
effective_rate = resolve_rate(
|
36
|
+
rate=rate,
|
37
|
+
legacy_value=replacement_rate,
|
38
|
+
default=0.02,
|
39
|
+
legacy_name="replacement_rate",
|
40
|
+
)
|
41
|
+
|
32
42
|
if rng is None:
|
33
43
|
rng = random.Random(seed)
|
34
44
|
|
@@ -39,7 +49,8 @@ def swap_homoglyphs(
|
|
39
49
|
confusable_chars = [
|
40
50
|
char for char in target_chars if char in confusables.confusables_data
|
41
51
|
]
|
42
|
-
|
52
|
+
clamped_rate = max(0.0, effective_rate)
|
53
|
+
num_replacements = int(len(confusable_chars) * clamped_rate)
|
43
54
|
done = 0
|
44
55
|
rng.shuffle(confusable_chars)
|
45
56
|
banned_set = set(banned_characters or ())
|
@@ -66,18 +77,26 @@ class Mim1c(Glitchling):
|
|
66
77
|
def __init__(
|
67
78
|
self,
|
68
79
|
*,
|
69
|
-
|
80
|
+
rate: float | None = None,
|
81
|
+
replacement_rate: float | None = None,
|
70
82
|
classes: list[str] | Literal["all"] | None = None,
|
71
83
|
banned_characters: Collection[str] | None = None,
|
72
84
|
seed: int | None = None,
|
73
85
|
) -> None:
|
86
|
+
self._param_aliases = {"replacement_rate": "rate"}
|
87
|
+
effective_rate = resolve_rate(
|
88
|
+
rate=rate,
|
89
|
+
legacy_value=replacement_rate,
|
90
|
+
default=0.02,
|
91
|
+
legacy_name="replacement_rate",
|
92
|
+
)
|
74
93
|
super().__init__(
|
75
94
|
name="Mim1c",
|
76
95
|
corruption_function=swap_homoglyphs,
|
77
96
|
scope=AttackWave.CHARACTER,
|
78
97
|
order=AttackOrder.LAST,
|
79
98
|
seed=seed,
|
80
|
-
|
99
|
+
rate=effective_rate,
|
81
100
|
classes=classes,
|
82
101
|
banned_characters=banned_characters,
|
83
102
|
)
|
glitchlings/zoo/redactyl.py
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
import re
|
2
2
|
import random
|
3
|
+
from typing import Any
|
3
4
|
|
4
5
|
from .core import Glitchling, AttackWave
|
6
|
+
from ._rate import resolve_rate
|
5
7
|
|
6
8
|
FULL_BLOCK = "█"
|
7
9
|
|
@@ -16,7 +18,7 @@ def _python_redact_words(
|
|
16
18
|
text: str,
|
17
19
|
*,
|
18
20
|
replacement_char: str,
|
19
|
-
|
21
|
+
rate: float,
|
20
22
|
merge_adjacent: bool,
|
21
23
|
rng: random.Random,
|
22
24
|
) -> str:
|
@@ -25,7 +27,7 @@ def _python_redact_words(
|
|
25
27
|
Parameters
|
26
28
|
- text: Input text.
|
27
29
|
- replacement_char: The character to use for redaction (default FULL_BLOCK).
|
28
|
-
-
|
30
|
+
- rate: Max proportion of words to redact (default 0.05).
|
29
31
|
- merge_adjacent: If True, merges adjacent redactions across intervening non-word chars.
|
30
32
|
- seed: Seed used if `rng` not provided (default 151).
|
31
33
|
- rng: Optional RNG; overrides seed.
|
@@ -35,7 +37,7 @@ def _python_redact_words(
|
|
35
37
|
word_indices = [i for i, token in enumerate(tokens) if i % 2 == 0 and token.strip()]
|
36
38
|
if not word_indices:
|
37
39
|
raise ValueError("Cannot redact words because the input text contains no redactable words.")
|
38
|
-
num_to_redact = max(1, int(len(word_indices) *
|
40
|
+
num_to_redact = max(1, int(len(word_indices) * rate))
|
39
41
|
|
40
42
|
# Sample from the indices of actual words
|
41
43
|
indices_to_redact = rng.sample(word_indices, k=num_to_redact)
|
@@ -72,23 +74,34 @@ def _python_redact_words(
|
|
72
74
|
def redact_words(
|
73
75
|
text: str,
|
74
76
|
replacement_char: str = FULL_BLOCK,
|
75
|
-
|
77
|
+
rate: float | None = None,
|
76
78
|
merge_adjacent: bool = False,
|
77
79
|
seed: int = 151,
|
78
80
|
rng: random.Random | None = None,
|
81
|
+
*,
|
82
|
+
redaction_rate: float | None = None,
|
79
83
|
) -> str:
|
80
84
|
"""Redact random words by replacing their characters."""
|
81
85
|
|
86
|
+
effective_rate = resolve_rate(
|
87
|
+
rate=rate,
|
88
|
+
legacy_value=redaction_rate,
|
89
|
+
default=0.05,
|
90
|
+
legacy_name="redaction_rate",
|
91
|
+
)
|
92
|
+
|
82
93
|
if rng is None:
|
83
94
|
rng = random.Random(seed)
|
84
95
|
|
96
|
+
clamped_rate = max(0.0, effective_rate)
|
97
|
+
|
85
98
|
use_rust = _redact_words_rust is not None and isinstance(merge_adjacent, bool)
|
86
99
|
|
87
100
|
if use_rust:
|
88
101
|
return _redact_words_rust(
|
89
102
|
text,
|
90
103
|
replacement_char,
|
91
|
-
|
104
|
+
clamped_rate,
|
92
105
|
merge_adjacent,
|
93
106
|
rng,
|
94
107
|
)
|
@@ -96,7 +109,7 @@ def redact_words(
|
|
96
109
|
return _python_redact_words(
|
97
110
|
text,
|
98
111
|
replacement_char=replacement_char,
|
99
|
-
|
112
|
+
rate=clamped_rate,
|
100
113
|
merge_adjacent=merge_adjacent,
|
101
114
|
rng=rng,
|
102
115
|
)
|
@@ -109,20 +122,42 @@ class Redactyl(Glitchling):
|
|
109
122
|
self,
|
110
123
|
*,
|
111
124
|
replacement_char: str = FULL_BLOCK,
|
112
|
-
|
125
|
+
rate: float | None = None,
|
126
|
+
redaction_rate: float | None = None,
|
113
127
|
merge_adjacent: bool = False,
|
114
128
|
seed: int = 151,
|
115
129
|
) -> None:
|
130
|
+
self._param_aliases = {"redaction_rate": "rate"}
|
131
|
+
effective_rate = resolve_rate(
|
132
|
+
rate=rate,
|
133
|
+
legacy_value=redaction_rate,
|
134
|
+
default=0.05,
|
135
|
+
legacy_name="redaction_rate",
|
136
|
+
)
|
116
137
|
super().__init__(
|
117
138
|
name="Redactyl",
|
118
139
|
corruption_function=redact_words,
|
119
140
|
scope=AttackWave.WORD,
|
120
141
|
seed=seed,
|
121
142
|
replacement_char=replacement_char,
|
122
|
-
|
143
|
+
rate=effective_rate,
|
123
144
|
merge_adjacent=merge_adjacent,
|
124
145
|
)
|
125
146
|
|
147
|
+
def pipeline_operation(self) -> dict[str, Any] | None:
|
148
|
+
replacement_char = self.kwargs.get("replacement_char")
|
149
|
+
rate = self.kwargs.get("rate")
|
150
|
+
merge_adjacent = self.kwargs.get("merge_adjacent")
|
151
|
+
if replacement_char is None or rate is None or merge_adjacent is None:
|
152
|
+
return None
|
153
|
+
return {
|
154
|
+
"type": "redact",
|
155
|
+
"replacement_char": str(replacement_char),
|
156
|
+
"redaction_rate": float(rate),
|
157
|
+
"merge_adjacent": bool(merge_adjacent),
|
158
|
+
}
|
159
|
+
|
160
|
+
|
126
161
|
|
127
162
|
redactyl = Redactyl()
|
128
163
|
|
glitchlings/zoo/reduple.py
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
import re
|
2
2
|
import random
|
3
|
+
from typing import Any
|
3
4
|
|
4
5
|
from .core import Glitchling, AttackWave
|
6
|
+
from ._rate import resolve_rate
|
5
7
|
|
6
8
|
try:
|
7
9
|
from glitchlings._zoo_rust import reduplicate_words as _reduplicate_words_rust
|
@@ -12,14 +14,14 @@ except ImportError: # pragma: no cover - compiled extension not present
|
|
12
14
|
def _python_reduplicate_words(
|
13
15
|
text: str,
|
14
16
|
*,
|
15
|
-
|
17
|
+
rate: float,
|
16
18
|
rng: random.Random,
|
17
19
|
) -> str:
|
18
20
|
"""Randomly reduplicate words in the text.
|
19
21
|
|
20
22
|
Parameters
|
21
23
|
- text: Input text.
|
22
|
-
-
|
24
|
+
- rate: Max proportion of words to reduplicate (default 0.05).
|
23
25
|
- seed: Optional seed if `rng` not provided.
|
24
26
|
- rng: Optional RNG; overrides seed.
|
25
27
|
|
@@ -39,7 +41,7 @@ def _python_reduplicate_words(
|
|
39
41
|
continue
|
40
42
|
|
41
43
|
# Only consider actual words for reduplication
|
42
|
-
if rng.random() <
|
44
|
+
if rng.random() < rate:
|
43
45
|
# Check if word has trailing punctuation
|
44
46
|
match = re.match(r"^(\W*)(.*?)(\W*)$", word)
|
45
47
|
if match:
|
@@ -53,9 +55,11 @@ def _python_reduplicate_words(
|
|
53
55
|
|
54
56
|
def reduplicate_words(
|
55
57
|
text: str,
|
56
|
-
|
58
|
+
rate: float | None = None,
|
57
59
|
seed: int | None = None,
|
58
60
|
rng: random.Random | None = None,
|
61
|
+
*,
|
62
|
+
reduplication_rate: float | None = None,
|
59
63
|
) -> str:
|
60
64
|
"""Randomly reduplicate words in the text.
|
61
65
|
|
@@ -63,15 +67,24 @@ def reduplicate_words(
|
|
63
67
|
extension is unavailable.
|
64
68
|
"""
|
65
69
|
|
70
|
+
effective_rate = resolve_rate(
|
71
|
+
rate=rate,
|
72
|
+
legacy_value=reduplication_rate,
|
73
|
+
default=0.05,
|
74
|
+
legacy_name="reduplication_rate",
|
75
|
+
)
|
76
|
+
|
66
77
|
if rng is None:
|
67
78
|
rng = random.Random(seed)
|
68
79
|
|
80
|
+
clamped_rate = max(0.0, effective_rate)
|
81
|
+
|
69
82
|
if _reduplicate_words_rust is not None:
|
70
|
-
return _reduplicate_words_rust(text,
|
83
|
+
return _reduplicate_words_rust(text, clamped_rate, rng)
|
71
84
|
|
72
85
|
return _python_reduplicate_words(
|
73
86
|
text,
|
74
|
-
|
87
|
+
rate=clamped_rate,
|
75
88
|
rng=rng,
|
76
89
|
)
|
77
90
|
|
@@ -82,17 +95,32 @@ class Reduple(Glitchling):
|
|
82
95
|
def __init__(
|
83
96
|
self,
|
84
97
|
*,
|
85
|
-
|
98
|
+
rate: float | None = None,
|
99
|
+
reduplication_rate: float | None = None,
|
86
100
|
seed: int | None = None,
|
87
101
|
) -> None:
|
102
|
+
self._param_aliases = {"reduplication_rate": "rate"}
|
103
|
+
effective_rate = resolve_rate(
|
104
|
+
rate=rate,
|
105
|
+
legacy_value=reduplication_rate,
|
106
|
+
default=0.05,
|
107
|
+
legacy_name="reduplication_rate",
|
108
|
+
)
|
88
109
|
super().__init__(
|
89
110
|
name="Reduple",
|
90
111
|
corruption_function=reduplicate_words,
|
91
112
|
scope=AttackWave.WORD,
|
92
113
|
seed=seed,
|
93
|
-
|
114
|
+
rate=effective_rate,
|
94
115
|
)
|
95
116
|
|
117
|
+
def pipeline_operation(self) -> dict[str, Any] | None:
|
118
|
+
rate = self.kwargs.get("rate")
|
119
|
+
if rate is None:
|
120
|
+
return None
|
121
|
+
return {"type": "reduplicate", "reduplication_rate": float(rate)}
|
122
|
+
|
123
|
+
|
96
124
|
|
97
125
|
reduple = Reduple()
|
98
126
|
|
glitchlings/zoo/rushmore.py
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
import math
|
2
2
|
import random
|
3
3
|
import re
|
4
|
+
from typing import Any
|
4
5
|
|
5
6
|
from .core import Glitchling, AttackWave
|
7
|
+
from ._rate import resolve_rate
|
6
8
|
|
7
9
|
try:
|
8
10
|
from glitchlings._zoo_rust import delete_random_words as _delete_random_words_rust
|
@@ -13,11 +15,14 @@ except ImportError: # pragma: no cover - compiled extension not present
|
|
13
15
|
def _python_delete_random_words(
|
14
16
|
text: str,
|
15
17
|
*,
|
16
|
-
|
18
|
+
rate: float,
|
17
19
|
rng: random.Random,
|
18
20
|
) -> str:
|
19
21
|
"""Delete random words from the input text while preserving whitespace."""
|
20
22
|
|
23
|
+
if rate <= 0.0:
|
24
|
+
return text
|
25
|
+
|
21
26
|
tokens = re.split(r"(\s+)", text) # Split but keep separators for later rejoin
|
22
27
|
|
23
28
|
candidate_indices: list[int] = []
|
@@ -29,14 +34,14 @@ def _python_delete_random_words(
|
|
29
34
|
candidate_indices.append(i)
|
30
35
|
|
31
36
|
allowed_deletions = min(
|
32
|
-
len(candidate_indices), math.floor(len(candidate_indices) *
|
37
|
+
len(candidate_indices), math.floor(len(candidate_indices) * rate)
|
33
38
|
)
|
34
39
|
if allowed_deletions <= 0:
|
35
40
|
return text
|
36
41
|
|
37
42
|
deletions = 0
|
38
43
|
for i in candidate_indices:
|
39
|
-
if rng.random() <
|
44
|
+
if rng.random() < rate:
|
40
45
|
word = tokens[i]
|
41
46
|
match = re.match(r"^(\W*)(.*?)(\W*)$", word)
|
42
47
|
if match:
|
@@ -58,24 +63,35 @@ def _python_delete_random_words(
|
|
58
63
|
|
59
64
|
def delete_random_words(
|
60
65
|
text: str,
|
61
|
-
|
66
|
+
rate: float | None = None,
|
62
67
|
seed: int | None = None,
|
63
68
|
rng: random.Random | None = None,
|
69
|
+
*,
|
70
|
+
max_deletion_rate: float | None = None,
|
64
71
|
) -> str:
|
65
72
|
"""Delete random words from the input text.
|
66
73
|
|
67
74
|
Uses the optional Rust implementation when available.
|
68
75
|
"""
|
69
76
|
|
77
|
+
effective_rate = resolve_rate(
|
78
|
+
rate=rate,
|
79
|
+
legacy_value=max_deletion_rate,
|
80
|
+
default=0.01,
|
81
|
+
legacy_name="max_deletion_rate",
|
82
|
+
)
|
83
|
+
|
70
84
|
if rng is None:
|
71
85
|
rng = random.Random(seed)
|
72
86
|
|
87
|
+
clamped_rate = max(0.0, effective_rate)
|
88
|
+
|
73
89
|
if _delete_random_words_rust is not None:
|
74
|
-
return _delete_random_words_rust(text,
|
90
|
+
return _delete_random_words_rust(text, clamped_rate, rng)
|
75
91
|
|
76
92
|
return _python_delete_random_words(
|
77
93
|
text,
|
78
|
-
|
94
|
+
rate=clamped_rate,
|
79
95
|
rng=rng,
|
80
96
|
)
|
81
97
|
|
@@ -86,17 +102,33 @@ class Rushmore(Glitchling):
|
|
86
102
|
def __init__(
|
87
103
|
self,
|
88
104
|
*,
|
89
|
-
|
105
|
+
rate: float | None = None,
|
106
|
+
max_deletion_rate: float | None = None,
|
90
107
|
seed: int | None = None,
|
91
108
|
) -> None:
|
109
|
+
self._param_aliases = {"max_deletion_rate": "rate"}
|
110
|
+
effective_rate = resolve_rate(
|
111
|
+
rate=rate,
|
112
|
+
legacy_value=max_deletion_rate,
|
113
|
+
default=0.01,
|
114
|
+
legacy_name="max_deletion_rate",
|
115
|
+
)
|
92
116
|
super().__init__(
|
93
117
|
name="Rushmore",
|
94
118
|
corruption_function=delete_random_words,
|
95
119
|
scope=AttackWave.WORD,
|
96
120
|
seed=seed,
|
97
|
-
|
121
|
+
rate=effective_rate,
|
98
122
|
)
|
99
123
|
|
124
|
+
def pipeline_operation(self) -> dict[str, Any] | None:
|
125
|
+
rate = self.kwargs.get("rate")
|
126
|
+
if rate is None:
|
127
|
+
rate = self.kwargs.get("max_deletion_rate")
|
128
|
+
if rate is None:
|
129
|
+
return None
|
130
|
+
return {"type": "delete", "max_deletion_rate": float(rate)}
|
131
|
+
|
100
132
|
|
101
133
|
rushmore = Rushmore()
|
102
134
|
|
glitchlings/zoo/scannequin.py
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
import re
|
2
2
|
import random
|
3
|
+
from typing import Any
|
3
4
|
|
4
5
|
from ._ocr_confusions import load_confusion_table
|
5
6
|
from .core import Glitchling, AttackWave, AttackOrder
|
7
|
+
from ._rate import resolve_rate
|
6
8
|
|
7
9
|
try:
|
8
10
|
from glitchlings._zoo_rust import ocr_artifacts as _ocr_artifacts_rust
|
@@ -13,14 +15,14 @@ except ImportError: # pragma: no cover - compiled extension not present
|
|
13
15
|
def _python_ocr_artifacts(
|
14
16
|
text: str,
|
15
17
|
*,
|
16
|
-
|
18
|
+
rate: float,
|
17
19
|
rng: random.Random,
|
18
20
|
) -> str:
|
19
21
|
"""Introduce OCR-like artifacts into text.
|
20
22
|
|
21
23
|
Parameters
|
22
24
|
- text: Input text to corrupt.
|
23
|
-
-
|
25
|
+
- rate: Max proportion of eligible confusion matches to replace (default 0.02).
|
24
26
|
- seed: Optional seed if `rng` not provided.
|
25
27
|
- rng: Optional RNG; overrides seed.
|
26
28
|
|
@@ -53,7 +55,7 @@ def _python_ocr_artifacts(
|
|
53
55
|
return text
|
54
56
|
|
55
57
|
# Decide how many to replace
|
56
|
-
k = int(len(candidates) *
|
58
|
+
k = int(len(candidates) * rate)
|
57
59
|
if k <= 0:
|
58
60
|
return text
|
59
61
|
|
@@ -95,9 +97,11 @@ def _python_ocr_artifacts(
|
|
95
97
|
|
96
98
|
def ocr_artifacts(
|
97
99
|
text: str,
|
98
|
-
|
100
|
+
rate: float | None = None,
|
99
101
|
seed: int | None = None,
|
100
102
|
rng: random.Random | None = None,
|
103
|
+
*,
|
104
|
+
error_rate: float | None = None,
|
101
105
|
) -> str:
|
102
106
|
"""Introduce OCR-like artifacts into text.
|
103
107
|
|
@@ -107,13 +111,22 @@ def ocr_artifacts(
|
|
107
111
|
if not text:
|
108
112
|
return text
|
109
113
|
|
114
|
+
effective_rate = resolve_rate(
|
115
|
+
rate=rate,
|
116
|
+
legacy_value=error_rate,
|
117
|
+
default=0.02,
|
118
|
+
legacy_name="error_rate",
|
119
|
+
)
|
120
|
+
|
110
121
|
if rng is None:
|
111
122
|
rng = random.Random(seed)
|
112
123
|
|
124
|
+
clamped_rate = max(0.0, effective_rate)
|
125
|
+
|
113
126
|
if _ocr_artifacts_rust is not None:
|
114
|
-
return _ocr_artifacts_rust(text,
|
127
|
+
return _ocr_artifacts_rust(text, clamped_rate, rng)
|
115
128
|
|
116
|
-
return _python_ocr_artifacts(text,
|
129
|
+
return _python_ocr_artifacts(text, rate=clamped_rate, rng=rng)
|
117
130
|
|
118
131
|
|
119
132
|
class Scannequin(Glitchling):
|
@@ -122,18 +135,35 @@ class Scannequin(Glitchling):
|
|
122
135
|
def __init__(
|
123
136
|
self,
|
124
137
|
*,
|
125
|
-
|
138
|
+
rate: float | None = None,
|
139
|
+
error_rate: float | None = None,
|
126
140
|
seed: int | None = None,
|
127
141
|
) -> None:
|
142
|
+
self._param_aliases = {"error_rate": "rate"}
|
143
|
+
effective_rate = resolve_rate(
|
144
|
+
rate=rate,
|
145
|
+
legacy_value=error_rate,
|
146
|
+
default=0.02,
|
147
|
+
legacy_name="error_rate",
|
148
|
+
)
|
128
149
|
super().__init__(
|
129
150
|
name="Scannequin",
|
130
151
|
corruption_function=ocr_artifacts,
|
131
152
|
scope=AttackWave.CHARACTER,
|
132
153
|
order=AttackOrder.LATE,
|
133
154
|
seed=seed,
|
134
|
-
|
155
|
+
rate=effective_rate,
|
135
156
|
)
|
136
157
|
|
158
|
+
def pipeline_operation(self) -> dict[str, Any] | None:
|
159
|
+
rate = self.kwargs.get("rate")
|
160
|
+
if rate is None:
|
161
|
+
rate = self.kwargs.get("error_rate")
|
162
|
+
if rate is None:
|
163
|
+
return None
|
164
|
+
return {"type": "ocr", "error_rate": float(rate)}
|
165
|
+
|
166
|
+
|
137
167
|
|
138
168
|
scannequin = Scannequin()
|
139
169
|
|
glitchlings/zoo/typogre.py
CHANGED
@@ -5,6 +5,7 @@ import random
|
|
5
5
|
from typing import Optional
|
6
6
|
|
7
7
|
from .core import Glitchling, AttackWave, AttackOrder
|
8
|
+
from ._rate import resolve_rate
|
8
9
|
from ..util import KEYNEIGHBORS
|
9
10
|
|
10
11
|
try:
|
@@ -88,11 +89,13 @@ def _python_draw_eligible_index(
|
|
88
89
|
def _fatfinger_python(
|
89
90
|
text: str,
|
90
91
|
*,
|
91
|
-
|
92
|
+
rate: float,
|
92
93
|
layout: dict[str, list[str]],
|
93
94
|
rng: random.Random,
|
94
95
|
) -> str:
|
95
|
-
rate
|
96
|
+
if rate <= 0.0:
|
97
|
+
return text
|
98
|
+
|
96
99
|
s = text
|
97
100
|
max_changes = math.ceil(len(s) * rate)
|
98
101
|
if max_changes == 0:
|
@@ -140,28 +143,37 @@ def _fatfinger_python(
|
|
140
143
|
|
141
144
|
def fatfinger(
|
142
145
|
text: str,
|
143
|
-
|
146
|
+
rate: float | None = None,
|
144
147
|
keyboard: str = "CURATOR_QWERTY",
|
145
148
|
seed: int | None = None,
|
146
149
|
rng: random.Random | None = None,
|
150
|
+
*,
|
151
|
+
max_change_rate: float | None = None,
|
147
152
|
) -> str:
|
148
153
|
"""Introduce character-level "fat finger" edits with a Rust fast path."""
|
149
154
|
|
155
|
+
effective_rate = resolve_rate(
|
156
|
+
rate=rate,
|
157
|
+
legacy_value=max_change_rate,
|
158
|
+
default=0.02,
|
159
|
+
legacy_name="max_change_rate",
|
160
|
+
)
|
161
|
+
|
150
162
|
if rng is None:
|
151
163
|
rng = random.Random(seed)
|
152
164
|
if not text:
|
153
165
|
return ""
|
154
166
|
|
155
|
-
|
156
|
-
if
|
167
|
+
clamped_rate = max(0.0, effective_rate)
|
168
|
+
if clamped_rate == 0.0:
|
157
169
|
return text
|
158
170
|
|
159
171
|
layout = getattr(KEYNEIGHBORS, keyboard)
|
160
172
|
|
161
173
|
if _fatfinger_rust is not None:
|
162
|
-
return _fatfinger_rust(text, max_change_rate=
|
174
|
+
return _fatfinger_rust(text, max_change_rate=clamped_rate, layout=layout, rng=rng)
|
163
175
|
|
164
|
-
return _fatfinger_python(text,
|
176
|
+
return _fatfinger_python(text, rate=clamped_rate, layout=layout, rng=rng)
|
165
177
|
|
166
178
|
|
167
179
|
class Typogre(Glitchling):
|
@@ -170,17 +182,25 @@ class Typogre(Glitchling):
|
|
170
182
|
def __init__(
|
171
183
|
self,
|
172
184
|
*,
|
173
|
-
|
185
|
+
rate: float | None = None,
|
186
|
+
max_change_rate: float | None = None,
|
174
187
|
keyboard: str = "CURATOR_QWERTY",
|
175
188
|
seed: int | None = None,
|
176
189
|
) -> None:
|
190
|
+
self._param_aliases = {"max_change_rate": "rate"}
|
191
|
+
effective_rate = resolve_rate(
|
192
|
+
rate=rate,
|
193
|
+
legacy_value=max_change_rate,
|
194
|
+
default=0.02,
|
195
|
+
legacy_name="max_change_rate",
|
196
|
+
)
|
177
197
|
super().__init__(
|
178
198
|
name="Typogre",
|
179
199
|
corruption_function=fatfinger,
|
180
200
|
scope=AttackWave.CHARACTER,
|
181
201
|
order=AttackOrder.EARLY,
|
182
202
|
seed=seed,
|
183
|
-
|
203
|
+
rate=effective_rate,
|
184
204
|
keyboard=keyboard,
|
185
205
|
)
|
186
206
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: glitchlings
|
3
|
-
Version: 0.2.
|
3
|
+
Version: 0.2.4
|
4
4
|
Summary: Monsters for your language games.
|
5
5
|
Author: osoleve
|
6
6
|
License: Apache License
|
@@ -209,25 +209,21 @@ Project-URL: Homepage, https://github.com/osoleve/glitchlings
|
|
209
209
|
Project-URL: Repository, https://github.com/osoleve/glitchlings.git
|
210
210
|
Project-URL: Issues, https://github.com/osoleve/glitchlings/issues
|
211
211
|
Project-URL: Changelog, https://github.com/osoleve/glitchlings/releases
|
212
|
-
Keywords: nlp,text,adversarial augmentation,text augmentation
|
212
|
+
Keywords: nlp,text,adversarial augmentation,text augmentation,large language models,llms,data augmentation,confusables,typo,
|
213
213
|
Classifier: Development Status :: 3 - Alpha
|
214
214
|
Classifier: Intended Audience :: Developers
|
215
|
-
Classifier: License :: OSI Approved :: Apache Software License
|
216
215
|
Classifier: Programming Language :: Python
|
217
216
|
Classifier: Programming Language :: Python :: 3
|
217
|
+
Classifier: Programming Language :: Python :: 3.10
|
218
|
+
Classifier: Programming Language :: Python :: 3.11
|
218
219
|
Classifier: Programming Language :: Python :: 3.12
|
219
220
|
Classifier: Programming Language :: Rust
|
220
|
-
Classifier: Operating System :: MacOS :: MacOS X
|
221
|
-
Classifier: Operating System :: Microsoft :: Windows
|
222
|
-
Classifier: Operating System :: POSIX :: Linux
|
223
|
-
Classifier: Operating System :: OS Independent
|
224
221
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
225
222
|
Classifier: Topic :: Software Development :: Testing
|
226
|
-
Requires-Python: >=3.
|
223
|
+
Requires-Python: >=3.10
|
227
224
|
Description-Content-Type: text/markdown
|
228
225
|
License-File: LICENSE
|
229
226
|
Requires-Dist: confusable-homoglyphs>=3.3.1
|
230
|
-
Requires-Dist: jellyfish>=1.2.0
|
231
227
|
Provides-Extra: hf
|
232
228
|
Requires-Dist: datasets>=4.0.0; extra == "hf"
|
233
229
|
Provides-Extra: wordnet
|
@@ -235,6 +231,7 @@ Requires-Dist: nltk>=3.9.1; extra == "wordnet"
|
|
235
231
|
Requires-Dist: numpy<=2.0,>=1.24; extra == "wordnet"
|
236
232
|
Provides-Extra: prime
|
237
233
|
Requires-Dist: verifiers>=0.1.3.post0; extra == "prime"
|
234
|
+
Requires-Dist: jellyfish>=1.2.0; extra == "prime"
|
238
235
|
Provides-Extra: dev
|
239
236
|
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
240
237
|
Requires-Dist: hypothesis>=6.140.0; extra == "dev"
|
@@ -280,14 +277,16 @@ After all, what good is general intelligence if it can't handle a little chaos?
|
|
280
277
|
pip install -U glitchlings
|
281
278
|
```
|
282
279
|
|
280
|
+
> Glitchlings requires Python 3.10 or newer.
|
281
|
+
|
283
282
|
```python
|
284
283
|
from glitchlings import Gaggle, SAMPLE_TEXT, Typogre, Mim1c, Reduple, Rushmore
|
285
284
|
|
286
285
|
gaggle = Gaggle([
|
287
|
-
Typogre(
|
288
|
-
Mim1c(
|
286
|
+
Typogre(rate=0.03),
|
287
|
+
Mim1c(rate=0.02),
|
289
288
|
Reduple(seed=404),
|
290
|
-
Rushmore(
|
289
|
+
Rushmore(rate=0.02),
|
291
290
|
])
|
292
291
|
|
293
292
|
print(gaggle(SAMPLE_TEXT))
|
@@ -295,41 +294,10 @@ print(gaggle(SAMPLE_TEXT))
|
|
295
294
|
|
296
295
|
> Onҽ mھrning, wһen Gregor Samƽa woke from trouble𝐝 𝑑reams, he found himself transformed in his bed into a horrible vermin٠ He l lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightlh domed and divided by arches ino stiff sections. The bedding was adly able to cover it and and seemed ready to slide off any moment. His many legxs, pitifully thin compared with the size of the the rest of him, waved about helplessly ashe looked looked.
|
297
296
|
|
298
|
-
|
299
|
-
|
300
|
-
Need detailed usage patterns, dataset workflows, or tips for enabling the
|
301
|
-
Rust accelerator? Consult the [Glitchlings Usage Guide](docs/index.md)
|
302
|
-
for end-to-end instructions spanning the Python API, CLI, Hugging Face
|
297
|
+
Consult the [Glitchlings Usage Guide](docs/index.md)
|
298
|
+
for end-to-end instructions spanning the Python API, CLI, HuggingFace and Prime Intellect
|
303
299
|
integrations, and the feature-flagged Rust pipeline.
|
304
300
|
|
305
|
-
### Prime Intellect environments
|
306
|
-
|
307
|
-
After `pip install -e .[prime]`, the `glitchlings.dlc.prime.load_environment` helper mirrors `verifiers.load_environment` for Prime Intellect scenarios while optionally applying glitchlings before returning the environment:
|
308
|
-
|
309
|
-
```python
|
310
|
-
from glitchlings import Mim1c, Typogre
|
311
|
-
from glitchlings.dlc.prime import echo_chamber, load_environment
|
312
|
-
|
313
|
-
env = load_environment(
|
314
|
-
"osoleve/syllabify-en",
|
315
|
-
glitchlings=[Mim1c(replacement_rate=0.01), Typogre(max_change_rate=0.02)],
|
316
|
-
seed=404,
|
317
|
-
)
|
318
|
-
|
319
|
-
# Spin up an echo chamber that corrupts a dataset column and
|
320
|
-
# rewards models for perfectly restoring it
|
321
|
-
practice_env = echo_chamber(
|
322
|
-
"osoleve/clean-room",
|
323
|
-
column="text",
|
324
|
-
glitchlings=["Typogre", "Mim1c"],
|
325
|
-
reward_function=lambda prompt, completion, answer: float(completion == answer),
|
326
|
-
)
|
327
|
-
```
|
328
|
-
|
329
|
-
Skip the `glitchlings` argument to receive an untouched verifier dataset, and
|
330
|
-
override `reward_function` when you want to evaluate completions with a custom
|
331
|
-
scoring routine.
|
332
|
-
|
333
301
|
## Motivation
|
334
302
|
|
335
303
|
If your model performs well on a particular task, but not when `Glitchling`s are present, it's a sign that it hasn't actually generalized to the problem.
|
@@ -344,8 +312,8 @@ Glitchlings are standard Python classes, so you can instantiate them with whatev
|
|
344
312
|
```python
|
345
313
|
from glitchlings import Gaggle, Typogre, Mim1c
|
346
314
|
|
347
|
-
custom_typogre = Typogre(
|
348
|
-
selective_mimic = Mim1c(
|
315
|
+
custom_typogre = Typogre(rate=0.1)
|
316
|
+
selective_mimic = Mim1c(rate=0.05, classes=["LATIN", "GREEK"])
|
349
317
|
|
350
318
|
gaggle = Gaggle([custom_typogre, selective_mimic], seed=99)
|
351
319
|
print(gaggle("Summoned heroes do not fear the glitch."))
|
@@ -376,7 +344,7 @@ glitchlings --list
|
|
376
344
|
glitchlings -g typogre --file documents/report.txt --diff
|
377
345
|
|
378
346
|
# Configure glitchlings inline by passing keyword arguments.
|
379
|
-
glitchlings -g "Typogre(
|
347
|
+
glitchlings -g "Typogre(rate=0.05)" "Ghouls just wanna have fun"
|
380
348
|
|
381
349
|
# Pipe text straight into the CLI for an on-the-fly corruption.
|
382
350
|
echo "Beware LLM-written flavor-text" | glitchlings -g mim1c
|
@@ -400,7 +368,7 @@ _What a nice word, would be a shame if something happened to it._
|
|
400
368
|
>
|
401
369
|
> Args
|
402
370
|
>
|
403
|
-
> - `
|
371
|
+
> - `rate (float)`: The maximum number of edits to make as a percentage of the length (default: 0.02, 2%).
|
404
372
|
> - `keyboard (str)`: Keyboard layout key-neighbor map to use (default: "CURATOR_QWERTY"; also accepts "QWERTY", "DVORAK", "COLEMAK", and "AZERTY").
|
405
373
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
406
374
|
|
@@ -412,7 +380,7 @@ _Wait, was that...?_
|
|
412
380
|
>
|
413
381
|
> Args
|
414
382
|
>
|
415
|
-
> - `
|
383
|
+
> - `rate (float)`: The maximum proportion of characters to replace (default: 0.02, 2%).
|
416
384
|
> - `classes (list[str] | "all")`: Restrict replacements to these Unicode script classes (default: ["LATIN", "GREEK", "CYRILLIC"]).
|
417
385
|
> - `banned_characters (Collection[str])`: Characters that must never appear as replacements (default: none).
|
418
386
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
@@ -425,7 +393,7 @@ _How can a computer need reading glasses?_
|
|
425
393
|
>
|
426
394
|
> Args
|
427
395
|
>
|
428
|
-
> - `
|
396
|
+
> - `rate (float)`: The maximum proportion of eligible confusion spans to replace (default: 0.02, 2%).
|
429
397
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
430
398
|
|
431
399
|
### Jargoyle
|
@@ -436,7 +404,7 @@ _Uh oh. The worst person you know just bought a thesaurus._
|
|
436
404
|
>
|
437
405
|
> Args
|
438
406
|
>
|
439
|
-
> - `
|
407
|
+
> - `rate (float)`: The maximum proportion of words to replace (default: 0.1, 10%).
|
440
408
|
> - `part_of_speech`: The WordNet part(s) of speech to target (default: nouns). Accepts `wn.NOUN`, `wn.VERB`, `wn.ADJ`, `wn.ADV`, any iterable of those tags, or the string `"any"` to include them all.
|
441
409
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
442
410
|
|
@@ -448,7 +416,7 @@ _Did you say that or did I?_
|
|
448
416
|
>
|
449
417
|
> Args
|
450
418
|
>
|
451
|
-
> - `
|
419
|
+
> - `rate (float)`: The maximum proportion of words to reduplicate (default: 0.05, 5%).
|
452
420
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
453
421
|
|
454
422
|
### Rushmore
|
@@ -459,7 +427,7 @@ _I accidentally an entire word._
|
|
459
427
|
>
|
460
428
|
> Args
|
461
429
|
>
|
462
|
-
> - `
|
430
|
+
> - `rate (float)`: The maximum proportion of words to delete (default: 0.01, 1%).
|
463
431
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
464
432
|
|
465
433
|
### Redactyl
|
@@ -471,7 +439,7 @@ _Oops, that was my black highlighter._
|
|
471
439
|
> ### Args
|
472
440
|
>
|
473
441
|
> - `replacement_char (str)`: The character to use for redaction (default: █).
|
474
|
-
> - `
|
442
|
+
> - `rate (float)`: The maximum proportion of words to redact (default: 0.05, 5%).
|
475
443
|
> - `merge_adjacent (bool)`: Whether to redact the space between adjacent redacted words (default: False).
|
476
444
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
477
445
|
|
@@ -0,0 +1,26 @@
|
|
1
|
+
glitchlings/__init__.py,sha256=w8heFqUejrXM_9NNlM9CQnIGkmGUyBV29acg3WsocXA,622
|
2
|
+
glitchlings/__main__.py,sha256=pqNe1C9hMf8pap4oh6x6yo2h4Nsa2RFSaMWHfGtNXj0,130
|
3
|
+
glitchlings/_zoo_rust.cp312-win_amd64.pyd,sha256=qHk8hPmRrzJTwOyhcBNr-2qhXBaEBUy__7_SMFhzWSc,1989632
|
4
|
+
glitchlings/main.py,sha256=QrSSLWcKh1_NDfJDGh-3UVKdI7AkzfMy6Jz1ouxIgnE,6149
|
5
|
+
glitchlings/dlc/__init__.py,sha256=IHD-GGhVFb7SVzErvf2YCJkOR4wGo0nFHXkn_daMvS8,146
|
6
|
+
glitchlings/dlc/huggingface.py,sha256=PIesnDIEvyJxj1IuLw2P9nVPTr4Nv81XM7w2axfyhkA,3029
|
7
|
+
glitchlings/dlc/prime.py,sha256=hySyYBncUM-49j6JtrHYO6c3HpbG2vTt2EYZnOJ85C0,8972
|
8
|
+
glitchlings/util/__init__.py,sha256=GoyQuHTfGRkHzuZwJji6QWSiGd_LHa9QiyjjEpBFW7E,4679
|
9
|
+
glitchlings/zoo/__init__.py,sha256=kYKKlNvEwKtrD26E1hfde33rkN83CMf_h5AQFGjQyBQ,4312
|
10
|
+
glitchlings/zoo/_ocr_confusions.py,sha256=W59Aa5MBDwRF65f8GV-6XwGAmlR5Uk7pa5qvHvhIYdY,1252
|
11
|
+
glitchlings/zoo/_rate.py,sha256=EYUWXYyR2IK0zYBWyBOlnUjDxU32JE9mZTZeodVx5CA,548
|
12
|
+
glitchlings/zoo/core.py,sha256=QKHmzmONNkiA3RdfgLdNx-FPFwoH4Bm-Tkc3vSCHNpc,14412
|
13
|
+
glitchlings/zoo/jargoyle.py,sha256=1fnL_8bv1Y-T2h1C6NRzIylYyOuAUI-BiMReFewqh00,11002
|
14
|
+
glitchlings/zoo/mim1c.py,sha256=3ddNOzWgLABuEOh5T98Xk439ejx-YHGI7ErXET03Crc,3537
|
15
|
+
glitchlings/zoo/ocr_confusions.tsv,sha256=S-IJEYCIXYKT1Uu7Id8Lnvg5pw528yNigTtWUdnMv9k,213
|
16
|
+
glitchlings/zoo/redactyl.py,sha256=dM3W59xLhuiS8t5jXETc_L8EEhRN1CpLazBnVPiSknk,4834
|
17
|
+
glitchlings/zoo/reduple.py,sha256=9jid6tCvCaiSxWSPMNuHWZitd7et60RRFYeek3S0ElU,3641
|
18
|
+
glitchlings/zoo/rushmore.py,sha256=pJy3g_H1z8PNoHitvD3-HsytAuE0U6FOdsdaKZy6OqY,3680
|
19
|
+
glitchlings/zoo/scannequin.py,sha256=TJyNYTTIB7rxZH3XKIETy0YVf4EjsMgGWYmYaxH9jxU,5030
|
20
|
+
glitchlings/zoo/typogre.py,sha256=olTTXDmFkVQ3r-T1vxm2mLomRvIDXHrNHfgin316wzE,6221
|
21
|
+
glitchlings-0.2.4.dist-info/licenses/LICENSE,sha256=EFEP1evBfHaxsMTBjxm0sZVRp2wct8QLvHE1saII5FI,11538
|
22
|
+
glitchlings-0.2.4.dist-info/METADATA,sha256=mGKlfmodtLjWsfrz6O0cLk4DDPFeUO5vt6LKgw-uu-M,26513
|
23
|
+
glitchlings-0.2.4.dist-info/WHEEL,sha256=8UP9x9puWI0P1V_d7K2oMTBqfeLNm21CTzZ_Ptr0NXU,101
|
24
|
+
glitchlings-0.2.4.dist-info/entry_points.txt,sha256=kGOwuAsjFDLtztLisaXtOouq9wFVMOJg5FzaAkg-Hto,54
|
25
|
+
glitchlings-0.2.4.dist-info/top_level.txt,sha256=VHFNBrLjtDwPCYXbGKi6o17Eueedi81eNbR3hBOoST0,12
|
26
|
+
glitchlings-0.2.4.dist-info/RECORD,,
|
@@ -1,25 +0,0 @@
|
|
1
|
-
glitchlings/__init__.py,sha256=w8heFqUejrXM_9NNlM9CQnIGkmGUyBV29acg3WsocXA,622
|
2
|
-
glitchlings/__main__.py,sha256=pqNe1C9hMf8pap4oh6x6yo2h4Nsa2RFSaMWHfGtNXj0,130
|
3
|
-
glitchlings/_zoo_rust.cp312-win_amd64.pyd,sha256=Eh4tD2b4ym3zX0KWxVWCFRpmPsZFnyeOiFWr_qQGg5A,1989632
|
4
|
-
glitchlings/main.py,sha256=krujz3GBrdP6FU3O6Z9f3rvc444rT79Hm69zAPG3b-U,6160
|
5
|
-
glitchlings/dlc/__init__.py,sha256=IHD-GGhVFb7SVzErvf2YCJkOR4wGo0nFHXkn_daMvS8,146
|
6
|
-
glitchlings/dlc/huggingface.py,sha256=PIesnDIEvyJxj1IuLw2P9nVPTr4Nv81XM7w2axfyhkA,3029
|
7
|
-
glitchlings/dlc/prime.py,sha256=oKVAVWSD-aa-LqDsctSLXzq0JW2RaIc1l2859ogr4lY,8107
|
8
|
-
glitchlings/util/__init__.py,sha256=GoyQuHTfGRkHzuZwJji6QWSiGd_LHa9QiyjjEpBFW7E,4679
|
9
|
-
glitchlings/zoo/__init__.py,sha256=kYKKlNvEwKtrD26E1hfde33rkN83CMf_h5AQFGjQyBQ,4312
|
10
|
-
glitchlings/zoo/_ocr_confusions.py,sha256=W59Aa5MBDwRF65f8GV-6XwGAmlR5Uk7pa5qvHvhIYdY,1252
|
11
|
-
glitchlings/zoo/core.py,sha256=aGGc0M97QeKM5rsQjTZs3fhIVac0g8A72mW4u72YnD0,14373
|
12
|
-
glitchlings/zoo/jargoyle.py,sha256=TBzt9CFL5GBP_DjqKqUY54DFsX2VAU4LnBNMDIg7P-Y,10444
|
13
|
-
glitchlings/zoo/mim1c.py,sha256=YHFELu3fpY_9VxRavYfCoAWZYp-HZBXdiLk4DTKdqcY,2979
|
14
|
-
glitchlings/zoo/ocr_confusions.tsv,sha256=S-IJEYCIXYKT1Uu7Id8Lnvg5pw528yNigTtWUdnMv9k,213
|
15
|
-
glitchlings/zoo/redactyl.py,sha256=VV2mPE2WQ41Sl874TjaHu9ShhYlFNLI7embQqKM5_ZE,3738
|
16
|
-
glitchlings/zoo/reduple.py,sha256=WuMpmuZrf5x7JneiRjDF2Y0beEAn7j1DPCV2BuuTuRY,2873
|
17
|
-
glitchlings/zoo/rushmore.py,sha256=dAiv53B_6Zg-zNG5aW8YobJevyBV586HtJVlZqgcGR8,2790
|
18
|
-
glitchlings/zoo/scannequin.py,sha256=BLJ8VFNTrXxv6mKjTMPUHOqziXO-NLpKNQNPbxG7jLI,4178
|
19
|
-
glitchlings/zoo/typogre.py,sha256=CISk0aqI8y5SdZXibqhfP0cu5MZ7TkiOQ7kftqW9RtI,5680
|
20
|
-
glitchlings-0.2.2.dist-info/licenses/LICENSE,sha256=EFEP1evBfHaxsMTBjxm0sZVRp2wct8QLvHE1saII5FI,11538
|
21
|
-
glitchlings-0.2.2.dist-info/METADATA,sha256=mRSQQoNoQAPmmVzfUn6ZZLHL1I6n5wxr45o3DyWsSMw,27811
|
22
|
-
glitchlings-0.2.2.dist-info/WHEEL,sha256=8UP9x9puWI0P1V_d7K2oMTBqfeLNm21CTzZ_Ptr0NXU,101
|
23
|
-
glitchlings-0.2.2.dist-info/entry_points.txt,sha256=kGOwuAsjFDLtztLisaXtOouq9wFVMOJg5FzaAkg-Hto,54
|
24
|
-
glitchlings-0.2.2.dist-info/top_level.txt,sha256=VHFNBrLjtDwPCYXbGKi6o17Eueedi81eNbR3hBOoST0,12
|
25
|
-
glitchlings-0.2.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|