glitchlings 0.2.2__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {glitchlings-0.2.2 → glitchlings-0.2.3}/PKG-INFO +21 -48
- {glitchlings-0.2.2 → glitchlings-0.2.3}/README.md +17 -46
- {glitchlings-0.2.2 → glitchlings-0.2.3}/pyproject.toml +4 -2
- {glitchlings-0.2.2 → glitchlings-0.2.3}/rust/zoo/build.rs +7 -1
- {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/dlc/prime.py +44 -22
- {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/main.py +1 -1
- glitchlings-0.2.3/src/glitchlings/zoo/_rate.py +21 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/zoo/core.py +56 -52
- {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/zoo/jargoyle.py +24 -5
- {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/zoo/mim1c.py +24 -5
- {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/zoo/redactyl.py +43 -8
- {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/zoo/reduple.py +36 -8
- {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/zoo/rushmore.py +40 -8
- {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/zoo/scannequin.py +38 -8
- {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/zoo/typogre.py +29 -9
- {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings.egg-info/PKG-INFO +21 -48
- {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings.egg-info/SOURCES.txt +1 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/tests/test_cli.py +2 -2
- {glitchlings-0.2.2 → glitchlings-0.2.3}/tests/test_gaggle.py +2 -1
- glitchlings-0.2.3/tests/test_glitchling_core.py +68 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/tests/test_glitchlings_determinism.py +7 -7
- {glitchlings-0.2.2 → glitchlings-0.2.3}/tests/test_jargoyle.py +2 -2
- {glitchlings-0.2.2 → glitchlings-0.2.3}/tests/test_parameter_effects.py +10 -10
- {glitchlings-0.2.2 → glitchlings-0.2.3}/tests/test_prime_echo_chamber.py +93 -4
- {glitchlings-0.2.2 → glitchlings-0.2.3}/tests/test_property_based.py +2 -2
- {glitchlings-0.2.2 → glitchlings-0.2.3}/tests/test_rust_backed_glitchlings.py +30 -30
- glitchlings-0.2.2/tests/test_glitchling_core.py +0 -24
- {glitchlings-0.2.2 → glitchlings-0.2.3}/LICENSE +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/MANIFEST.in +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/rust/Cargo.lock +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/rust/Cargo.toml +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/rust/zoo/Cargo.toml +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/rust/zoo/assets/ocr_confusions.tsv +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/rust/zoo/src/glitch_ops.rs +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/rust/zoo/src/lib.rs +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/rust/zoo/src/pipeline.rs +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/rust/zoo/src/resources.rs +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/rust/zoo/src/rng.rs +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/rust/zoo/src/text_buffer.rs +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/rust/zoo/src/typogre.rs +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/setup.cfg +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/__init__.py +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/__main__.py +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/dlc/__init__.py +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/dlc/huggingface.py +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/util/__init__.py +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/zoo/__init__.py +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/zoo/_ocr_confusions.py +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/zoo/ocr_confusions.tsv +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings.egg-info/dependency_links.txt +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings.egg-info/entry_points.txt +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings.egg-info/requires.txt +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings.egg-info/top_level.txt +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/tests/test_dataset_corruption.py +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/tests/test_huggingface_dlc.py +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/tests/test_keyboard_layouts.py +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.3}/tests/test_util.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: glitchlings
|
3
|
-
Version: 0.2.
|
3
|
+
Version: 0.2.3
|
4
4
|
Summary: Monsters for your language games.
|
5
5
|
Author: osoleve
|
6
6
|
License: Apache License
|
@@ -215,6 +215,8 @@ Classifier: Intended Audience :: Developers
|
|
215
215
|
Classifier: License :: OSI Approved :: Apache Software License
|
216
216
|
Classifier: Programming Language :: Python
|
217
217
|
Classifier: Programming Language :: Python :: 3
|
218
|
+
Classifier: Programming Language :: Python :: 3.10
|
219
|
+
Classifier: Programming Language :: Python :: 3.11
|
218
220
|
Classifier: Programming Language :: Python :: 3.12
|
219
221
|
Classifier: Programming Language :: Rust
|
220
222
|
Classifier: Operating System :: MacOS :: MacOS X
|
@@ -223,7 +225,7 @@ Classifier: Operating System :: POSIX :: Linux
|
|
223
225
|
Classifier: Operating System :: OS Independent
|
224
226
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
225
227
|
Classifier: Topic :: Software Development :: Testing
|
226
|
-
Requires-Python: >=3.
|
228
|
+
Requires-Python: >=3.10
|
227
229
|
Description-Content-Type: text/markdown
|
228
230
|
License-File: LICENSE
|
229
231
|
Requires-Dist: confusable-homoglyphs>=3.3.1
|
@@ -280,14 +282,16 @@ After all, what good is general intelligence if it can't handle a little chaos?
|
|
280
282
|
pip install -U glitchlings
|
281
283
|
```
|
282
284
|
|
285
|
+
> Glitchlings requires Python 3.10 or newer.
|
286
|
+
|
283
287
|
```python
|
284
288
|
from glitchlings import Gaggle, SAMPLE_TEXT, Typogre, Mim1c, Reduple, Rushmore
|
285
289
|
|
286
290
|
gaggle = Gaggle([
|
287
|
-
Typogre(
|
288
|
-
Mim1c(
|
291
|
+
Typogre(rate=0.03),
|
292
|
+
Mim1c(rate=0.02),
|
289
293
|
Reduple(seed=404),
|
290
|
-
Rushmore(
|
294
|
+
Rushmore(rate=0.02),
|
291
295
|
])
|
292
296
|
|
293
297
|
print(gaggle(SAMPLE_TEXT))
|
@@ -295,41 +299,10 @@ print(gaggle(SAMPLE_TEXT))
|
|
295
299
|
|
296
300
|
> Onҽ mھrning, wһen Gregor Samƽa woke from trouble𝐝 𝑑reams, he found himself transformed in his bed into a horrible vermin٠ He l lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightlh domed and divided by arches ino stiff sections. The bedding was adly able to cover it and and seemed ready to slide off any moment. His many legxs, pitifully thin compared with the size of the the rest of him, waved about helplessly ashe looked looked.
|
297
301
|
|
298
|
-
|
299
|
-
|
300
|
-
Need detailed usage patterns, dataset workflows, or tips for enabling the
|
301
|
-
Rust accelerator? Consult the [Glitchlings Usage Guide](docs/index.md)
|
302
|
-
for end-to-end instructions spanning the Python API, CLI, Hugging Face
|
302
|
+
Consult the [Glitchlings Usage Guide](docs/index.md)
|
303
|
+
for end-to-end instructions spanning the Python API, CLI, Hugging Face and Prime Intellect
|
303
304
|
integrations, and the feature-flagged Rust pipeline.
|
304
305
|
|
305
|
-
### Prime Intellect environments
|
306
|
-
|
307
|
-
After `pip install -e .[prime]`, the `glitchlings.dlc.prime.load_environment` helper mirrors `verifiers.load_environment` for Prime Intellect scenarios while optionally applying glitchlings before returning the environment:
|
308
|
-
|
309
|
-
```python
|
310
|
-
from glitchlings import Mim1c, Typogre
|
311
|
-
from glitchlings.dlc.prime import echo_chamber, load_environment
|
312
|
-
|
313
|
-
env = load_environment(
|
314
|
-
"osoleve/syllabify-en",
|
315
|
-
glitchlings=[Mim1c(replacement_rate=0.01), Typogre(max_change_rate=0.02)],
|
316
|
-
seed=404,
|
317
|
-
)
|
318
|
-
|
319
|
-
# Spin up an echo chamber that corrupts a dataset column and
|
320
|
-
# rewards models for perfectly restoring it
|
321
|
-
practice_env = echo_chamber(
|
322
|
-
"osoleve/clean-room",
|
323
|
-
column="text",
|
324
|
-
glitchlings=["Typogre", "Mim1c"],
|
325
|
-
reward_function=lambda prompt, completion, answer: float(completion == answer),
|
326
|
-
)
|
327
|
-
```
|
328
|
-
|
329
|
-
Skip the `glitchlings` argument to receive an untouched verifier dataset, and
|
330
|
-
override `reward_function` when you want to evaluate completions with a custom
|
331
|
-
scoring routine.
|
332
|
-
|
333
306
|
## Motivation
|
334
307
|
|
335
308
|
If your model performs well on a particular task, but not when `Glitchling`s are present, it's a sign that it hasn't actually generalized to the problem.
|
@@ -344,8 +317,8 @@ Glitchlings are standard Python classes, so you can instantiate them with whatev
|
|
344
317
|
```python
|
345
318
|
from glitchlings import Gaggle, Typogre, Mim1c
|
346
319
|
|
347
|
-
custom_typogre = Typogre(
|
348
|
-
selective_mimic = Mim1c(
|
320
|
+
custom_typogre = Typogre(rate=0.1)
|
321
|
+
selective_mimic = Mim1c(rate=0.05, classes=["LATIN", "GREEK"])
|
349
322
|
|
350
323
|
gaggle = Gaggle([custom_typogre, selective_mimic], seed=99)
|
351
324
|
print(gaggle("Summoned heroes do not fear the glitch."))
|
@@ -376,7 +349,7 @@ glitchlings --list
|
|
376
349
|
glitchlings -g typogre --file documents/report.txt --diff
|
377
350
|
|
378
351
|
# Configure glitchlings inline by passing keyword arguments.
|
379
|
-
glitchlings -g "Typogre(
|
352
|
+
glitchlings -g "Typogre(rate=0.05)" "Ghouls just wanna have fun"
|
380
353
|
|
381
354
|
# Pipe text straight into the CLI for an on-the-fly corruption.
|
382
355
|
echo "Beware LLM-written flavor-text" | glitchlings -g mim1c
|
@@ -400,7 +373,7 @@ _What a nice word, would be a shame if something happened to it._
|
|
400
373
|
>
|
401
374
|
> Args
|
402
375
|
>
|
403
|
-
> - `
|
376
|
+
> - `rate (float)`: The maximum number of edits to make as a percentage of the length (default: 0.02, 2%).
|
404
377
|
> - `keyboard (str)`: Keyboard layout key-neighbor map to use (default: "CURATOR_QWERTY"; also accepts "QWERTY", "DVORAK", "COLEMAK", and "AZERTY").
|
405
378
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
406
379
|
|
@@ -412,7 +385,7 @@ _Wait, was that...?_
|
|
412
385
|
>
|
413
386
|
> Args
|
414
387
|
>
|
415
|
-
> - `
|
388
|
+
> - `rate (float)`: The maximum proportion of characters to replace (default: 0.02, 2%).
|
416
389
|
> - `classes (list[str] | "all")`: Restrict replacements to these Unicode script classes (default: ["LATIN", "GREEK", "CYRILLIC"]).
|
417
390
|
> - `banned_characters (Collection[str])`: Characters that must never appear as replacements (default: none).
|
418
391
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
@@ -425,7 +398,7 @@ _How can a computer need reading glasses?_
|
|
425
398
|
>
|
426
399
|
> Args
|
427
400
|
>
|
428
|
-
> - `
|
401
|
+
> - `rate (float)`: The maximum proportion of eligible confusion spans to replace (default: 0.02, 2%).
|
429
402
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
430
403
|
|
431
404
|
### Jargoyle
|
@@ -436,7 +409,7 @@ _Uh oh. The worst person you know just bought a thesaurus._
|
|
436
409
|
>
|
437
410
|
> Args
|
438
411
|
>
|
439
|
-
> - `
|
412
|
+
> - `rate (float)`: The maximum proportion of words to replace (default: 0.1, 10%).
|
440
413
|
> - `part_of_speech`: The WordNet part(s) of speech to target (default: nouns). Accepts `wn.NOUN`, `wn.VERB`, `wn.ADJ`, `wn.ADV`, any iterable of those tags, or the string `"any"` to include them all.
|
441
414
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
442
415
|
|
@@ -448,7 +421,7 @@ _Did you say that or did I?_
|
|
448
421
|
>
|
449
422
|
> Args
|
450
423
|
>
|
451
|
-
> - `
|
424
|
+
> - `rate (float)`: The maximum proportion of words to reduplicate (default: 0.05, 5%).
|
452
425
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
453
426
|
|
454
427
|
### Rushmore
|
@@ -459,7 +432,7 @@ _I accidentally an entire word._
|
|
459
432
|
>
|
460
433
|
> Args
|
461
434
|
>
|
462
|
-
> - `
|
435
|
+
> - `rate (float)`: The maximum proportion of words to delete (default: 0.01, 1%).
|
463
436
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
464
437
|
|
465
438
|
### Redactyl
|
@@ -471,7 +444,7 @@ _Oops, that was my black highlighter._
|
|
471
444
|
> ### Args
|
472
445
|
>
|
473
446
|
> - `replacement_char (str)`: The character to use for redaction (default: █).
|
474
|
-
> - `
|
447
|
+
> - `rate (float)`: The maximum proportion of words to redact (default: 0.05, 5%).
|
475
448
|
> - `merge_adjacent (bool)`: Whether to redact the space between adjacent redacted words (default: False).
|
476
449
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
477
450
|
|
@@ -36,14 +36,16 @@ After all, what good is general intelligence if it can't handle a little chaos?
|
|
36
36
|
pip install -U glitchlings
|
37
37
|
```
|
38
38
|
|
39
|
+
> Glitchlings requires Python 3.10 or newer.
|
40
|
+
|
39
41
|
```python
|
40
42
|
from glitchlings import Gaggle, SAMPLE_TEXT, Typogre, Mim1c, Reduple, Rushmore
|
41
43
|
|
42
44
|
gaggle = Gaggle([
|
43
|
-
Typogre(
|
44
|
-
Mim1c(
|
45
|
+
Typogre(rate=0.03),
|
46
|
+
Mim1c(rate=0.02),
|
45
47
|
Reduple(seed=404),
|
46
|
-
Rushmore(
|
48
|
+
Rushmore(rate=0.02),
|
47
49
|
])
|
48
50
|
|
49
51
|
print(gaggle(SAMPLE_TEXT))
|
@@ -51,41 +53,10 @@ print(gaggle(SAMPLE_TEXT))
|
|
51
53
|
|
52
54
|
> Onҽ mھrning, wһen Gregor Samƽa woke from trouble𝐝 𝑑reams, he found himself transformed in his bed into a horrible vermin٠ He l lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightlh domed and divided by arches ino stiff sections. The bedding was adly able to cover it and and seemed ready to slide off any moment. His many legxs, pitifully thin compared with the size of the the rest of him, waved about helplessly ashe looked looked.
|
53
55
|
|
54
|
-
|
55
|
-
|
56
|
-
Need detailed usage patterns, dataset workflows, or tips for enabling the
|
57
|
-
Rust accelerator? Consult the [Glitchlings Usage Guide](docs/index.md)
|
58
|
-
for end-to-end instructions spanning the Python API, CLI, Hugging Face
|
56
|
+
Consult the [Glitchlings Usage Guide](docs/index.md)
|
57
|
+
for end-to-end instructions spanning the Python API, CLI, Hugging Face and Prime Intellect
|
59
58
|
integrations, and the feature-flagged Rust pipeline.
|
60
59
|
|
61
|
-
### Prime Intellect environments
|
62
|
-
|
63
|
-
After `pip install -e .[prime]`, the `glitchlings.dlc.prime.load_environment` helper mirrors `verifiers.load_environment` for Prime Intellect scenarios while optionally applying glitchlings before returning the environment:
|
64
|
-
|
65
|
-
```python
|
66
|
-
from glitchlings import Mim1c, Typogre
|
67
|
-
from glitchlings.dlc.prime import echo_chamber, load_environment
|
68
|
-
|
69
|
-
env = load_environment(
|
70
|
-
"osoleve/syllabify-en",
|
71
|
-
glitchlings=[Mim1c(replacement_rate=0.01), Typogre(max_change_rate=0.02)],
|
72
|
-
seed=404,
|
73
|
-
)
|
74
|
-
|
75
|
-
# Spin up an echo chamber that corrupts a dataset column and
|
76
|
-
# rewards models for perfectly restoring it
|
77
|
-
practice_env = echo_chamber(
|
78
|
-
"osoleve/clean-room",
|
79
|
-
column="text",
|
80
|
-
glitchlings=["Typogre", "Mim1c"],
|
81
|
-
reward_function=lambda prompt, completion, answer: float(completion == answer),
|
82
|
-
)
|
83
|
-
```
|
84
|
-
|
85
|
-
Skip the `glitchlings` argument to receive an untouched verifier dataset, and
|
86
|
-
override `reward_function` when you want to evaluate completions with a custom
|
87
|
-
scoring routine.
|
88
|
-
|
89
60
|
## Motivation
|
90
61
|
|
91
62
|
If your model performs well on a particular task, but not when `Glitchling`s are present, it's a sign that it hasn't actually generalized to the problem.
|
@@ -100,8 +71,8 @@ Glitchlings are standard Python classes, so you can instantiate them with whatev
|
|
100
71
|
```python
|
101
72
|
from glitchlings import Gaggle, Typogre, Mim1c
|
102
73
|
|
103
|
-
custom_typogre = Typogre(
|
104
|
-
selective_mimic = Mim1c(
|
74
|
+
custom_typogre = Typogre(rate=0.1)
|
75
|
+
selective_mimic = Mim1c(rate=0.05, classes=["LATIN", "GREEK"])
|
105
76
|
|
106
77
|
gaggle = Gaggle([custom_typogre, selective_mimic], seed=99)
|
107
78
|
print(gaggle("Summoned heroes do not fear the glitch."))
|
@@ -132,7 +103,7 @@ glitchlings --list
|
|
132
103
|
glitchlings -g typogre --file documents/report.txt --diff
|
133
104
|
|
134
105
|
# Configure glitchlings inline by passing keyword arguments.
|
135
|
-
glitchlings -g "Typogre(
|
106
|
+
glitchlings -g "Typogre(rate=0.05)" "Ghouls just wanna have fun"
|
136
107
|
|
137
108
|
# Pipe text straight into the CLI for an on-the-fly corruption.
|
138
109
|
echo "Beware LLM-written flavor-text" | glitchlings -g mim1c
|
@@ -156,7 +127,7 @@ _What a nice word, would be a shame if something happened to it._
|
|
156
127
|
>
|
157
128
|
> Args
|
158
129
|
>
|
159
|
-
> - `
|
130
|
+
> - `rate (float)`: The maximum number of edits to make as a percentage of the length (default: 0.02, 2%).
|
160
131
|
> - `keyboard (str)`: Keyboard layout key-neighbor map to use (default: "CURATOR_QWERTY"; also accepts "QWERTY", "DVORAK", "COLEMAK", and "AZERTY").
|
161
132
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
162
133
|
|
@@ -168,7 +139,7 @@ _Wait, was that...?_
|
|
168
139
|
>
|
169
140
|
> Args
|
170
141
|
>
|
171
|
-
> - `
|
142
|
+
> - `rate (float)`: The maximum proportion of characters to replace (default: 0.02, 2%).
|
172
143
|
> - `classes (list[str] | "all")`: Restrict replacements to these Unicode script classes (default: ["LATIN", "GREEK", "CYRILLIC"]).
|
173
144
|
> - `banned_characters (Collection[str])`: Characters that must never appear as replacements (default: none).
|
174
145
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
@@ -181,7 +152,7 @@ _How can a computer need reading glasses?_
|
|
181
152
|
>
|
182
153
|
> Args
|
183
154
|
>
|
184
|
-
> - `
|
155
|
+
> - `rate (float)`: The maximum proportion of eligible confusion spans to replace (default: 0.02, 2%).
|
185
156
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
186
157
|
|
187
158
|
### Jargoyle
|
@@ -192,7 +163,7 @@ _Uh oh. The worst person you know just bought a thesaurus._
|
|
192
163
|
>
|
193
164
|
> Args
|
194
165
|
>
|
195
|
-
> - `
|
166
|
+
> - `rate (float)`: The maximum proportion of words to replace (default: 0.1, 10%).
|
196
167
|
> - `part_of_speech`: The WordNet part(s) of speech to target (default: nouns). Accepts `wn.NOUN`, `wn.VERB`, `wn.ADJ`, `wn.ADV`, any iterable of those tags, or the string `"any"` to include them all.
|
197
168
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
198
169
|
|
@@ -204,7 +175,7 @@ _Did you say that or did I?_
|
|
204
175
|
>
|
205
176
|
> Args
|
206
177
|
>
|
207
|
-
> - `
|
178
|
+
> - `rate (float)`: The maximum proportion of words to reduplicate (default: 0.05, 5%).
|
208
179
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
209
180
|
|
210
181
|
### Rushmore
|
@@ -215,7 +186,7 @@ _I accidentally an entire word._
|
|
215
186
|
>
|
216
187
|
> Args
|
217
188
|
>
|
218
|
-
> - `
|
189
|
+
> - `rate (float)`: The maximum proportion of words to delete (default: 0.01, 1%).
|
219
190
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
220
191
|
|
221
192
|
### Redactyl
|
@@ -227,7 +198,7 @@ _Oops, that was my black highlighter._
|
|
227
198
|
> ### Args
|
228
199
|
>
|
229
200
|
> - `replacement_char (str)`: The character to use for redaction (default: █).
|
230
|
-
> - `
|
201
|
+
> - `rate (float)`: The maximum proportion of words to redact (default: 0.05, 5%).
|
231
202
|
> - `merge_adjacent (bool)`: Whether to redact the space between adjacent redacted words (default: False).
|
232
203
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
233
204
|
|
@@ -1,9 +1,9 @@
|
|
1
1
|
[project]
|
2
2
|
name = "glitchlings"
|
3
|
-
version = "0.2.
|
3
|
+
version = "0.2.3"
|
4
4
|
description = "Monsters for your language games."
|
5
5
|
readme = "README.md"
|
6
|
-
requires-python = ">=3.
|
6
|
+
requires-python = ">=3.10"
|
7
7
|
|
8
8
|
dependencies = [
|
9
9
|
"confusable-homoglyphs>=3.3.1",
|
@@ -22,6 +22,8 @@ classifiers = [
|
|
22
22
|
"License :: OSI Approved :: Apache Software License",
|
23
23
|
"Programming Language :: Python",
|
24
24
|
"Programming Language :: Python :: 3",
|
25
|
+
"Programming Language :: Python :: 3.10",
|
26
|
+
"Programming Language :: Python :: 3.11",
|
25
27
|
"Programming Language :: Python :: 3.12",
|
26
28
|
"Programming Language :: Rust",
|
27
29
|
"Operating System :: MacOS :: MacOS X",
|
@@ -23,7 +23,13 @@ fn configured_python() -> Option<OsString> {
|
|
23
23
|
}
|
24
24
|
|
25
25
|
fn detect_python() -> Option<OsString> {
|
26
|
-
const CANDIDATES: &[&str] = &[
|
26
|
+
const CANDIDATES: &[&str] = &[
|
27
|
+
"python3.12",
|
28
|
+
"python3.11",
|
29
|
+
"python3.10",
|
30
|
+
"python3",
|
31
|
+
"python",
|
32
|
+
];
|
27
33
|
|
28
34
|
for candidate in CANDIDATES {
|
29
35
|
let status = Command::new(candidate)
|
@@ -79,8 +79,8 @@ def tutorial_level(
|
|
79
79
|
) -> vf.Environment:
|
80
80
|
"""Create a low-corruption environment using tuned defaults."""
|
81
81
|
|
82
|
-
tuned_mim1c = Mim1c(
|
83
|
-
tuned_typogre = Typogre(
|
82
|
+
tuned_mim1c = Mim1c(rate=0.01 * difficulty.value)
|
83
|
+
tuned_typogre = Typogre(rate=0.025 * difficulty.value)
|
84
84
|
|
85
85
|
return load_environment(
|
86
86
|
env,
|
@@ -220,32 +220,54 @@ def echo_chamber(
|
|
220
220
|
"Specify which split to use when the dataset loads as a DatasetDict."
|
221
221
|
)
|
222
222
|
|
223
|
-
|
224
|
-
|
223
|
+
filtered_dataset = hf_dataset.filter(
|
224
|
+
lambda row: row.get(column) is not None,
|
225
|
+
load_from_cache_file=False,
|
226
|
+
)
|
225
227
|
|
226
|
-
|
227
|
-
value = row.get(column)
|
228
|
-
if value is None:
|
229
|
-
continue
|
228
|
+
source_column_names = list(filtered_dataset.column_names)
|
230
229
|
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
answers.append(text)
|
230
|
+
def _build_prompt(row: dict[str, Any]) -> dict[str, Any]:
|
231
|
+
text = str(row[column])
|
232
|
+
prompt = [
|
233
|
+
{"role": "system", "content": instructions},
|
234
|
+
{"role": "user", "content": f"Corrupted text:\n{text}"},
|
235
|
+
]
|
236
|
+
return {"prompt": prompt, "answer": text}
|
239
237
|
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
238
|
+
base_dataset = filtered_dataset.map(
|
239
|
+
_build_prompt,
|
240
|
+
remove_columns=source_column_names,
|
241
|
+
load_from_cache_file=False,
|
242
|
+
)
|
244
243
|
|
245
|
-
|
244
|
+
try:
|
245
|
+
dataset_length = len(base_dataset) # type: ignore[arg-type]
|
246
|
+
except TypeError:
|
247
|
+
preview_rows: list[dict[str, Any]]
|
248
|
+
take_fn = getattr(base_dataset, "take", None)
|
249
|
+
if callable(take_fn):
|
250
|
+
preview_rows = list(take_fn(1))
|
251
|
+
else:
|
252
|
+
iterator = iter(base_dataset)
|
253
|
+
try:
|
254
|
+
first_row = next(iterator)
|
255
|
+
except StopIteration:
|
256
|
+
preview_rows = []
|
257
|
+
else:
|
258
|
+
preview_rows = [first_row]
|
259
|
+
if not preview_rows:
|
260
|
+
raise ValueError(
|
261
|
+
f"Column '{column}' did not yield any textual entries in dataset '{dataset_id}'."
|
262
|
+
)
|
263
|
+
else:
|
264
|
+
if dataset_length == 0:
|
265
|
+
raise ValueError(
|
266
|
+
f"Column '{column}' did not yield any textual entries in dataset '{dataset_id}'."
|
267
|
+
)
|
246
268
|
|
247
269
|
gaggle = _as_gaggle(glitchlings, seed=seed)
|
248
|
-
glitched_dataset = gaggle.corrupt_dataset(
|
270
|
+
glitched_dataset = gaggle.corrupt_dataset(base_dataset, ["prompt"])
|
249
271
|
|
250
272
|
rubric_func = reward_function or symmetric_damerau_levenshtein_similarity
|
251
273
|
rubric = vf.Rubric(funcs=[rubric_func], weights=[1.0])
|
@@ -46,7 +46,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
46
46
|
metavar="SPEC",
|
47
47
|
help=(
|
48
48
|
"Glitchling to apply, optionally with parameters like "
|
49
|
-
"Typogre(
|
49
|
+
"Typogre(rate=0.05). Repeat for multiples; defaults to all built-ins."
|
50
50
|
),
|
51
51
|
)
|
52
52
|
parser.add_argument(
|
@@ -0,0 +1,21 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
|
4
|
+
def resolve_rate(
|
5
|
+
*,
|
6
|
+
rate: float | None,
|
7
|
+
legacy_value: float | None,
|
8
|
+
default: float,
|
9
|
+
legacy_name: str,
|
10
|
+
) -> float:
|
11
|
+
"""Return the effective rate while enforcing mutual exclusivity."""
|
12
|
+
|
13
|
+
if rate is not None and legacy_value is not None:
|
14
|
+
raise ValueError(
|
15
|
+
f"Specify either 'rate' or '{legacy_name}', not both."
|
16
|
+
)
|
17
|
+
if rate is not None:
|
18
|
+
return rate
|
19
|
+
if legacy_value is not None:
|
20
|
+
return legacy_value
|
21
|
+
return default
|
@@ -107,6 +107,7 @@ class Glitchling:
|
|
107
107
|
scope: AttackWave,
|
108
108
|
order: AttackOrder = AttackOrder.NORMAL,
|
109
109
|
seed: int | None = None,
|
110
|
+
pipeline_operation: Callable[["Glitchling"], dict[str, Any] | None] | None = None,
|
110
111
|
**kwargs: Any,
|
111
112
|
) -> None:
|
112
113
|
"""Initialize a glitchling.
|
@@ -128,31 +129,76 @@ class Glitchling:
|
|
128
129
|
self.corruption_function: CorruptionCallable = corruption_function
|
129
130
|
self.level: AttackWave = scope
|
130
131
|
self.order: AttackOrder = order
|
132
|
+
self._pipeline_descriptor_factory = pipeline_operation
|
131
133
|
self.kwargs: dict[str, Any] = {}
|
134
|
+
self._cached_rng_callable: CorruptionCallable | None = None
|
135
|
+
self._cached_rng_expectation: bool | None = None
|
132
136
|
for kw, val in kwargs.items():
|
133
137
|
self.set_param(kw, val)
|
134
138
|
|
135
139
|
def set_param(self, key: str, value: Any) -> None:
|
136
140
|
"""Persist a parameter for use by the corruption callable."""
|
137
141
|
|
138
|
-
|
139
|
-
|
140
|
-
|
142
|
+
aliases = getattr(self, "_param_aliases", {})
|
143
|
+
canonical = aliases.get(key, key)
|
144
|
+
|
145
|
+
# Drop stale alias keys so we only forward canonical kwargs.
|
146
|
+
self.kwargs.pop(key, None)
|
147
|
+
for alias, target in aliases.items():
|
148
|
+
if target == canonical:
|
149
|
+
self.kwargs.pop(alias, None)
|
150
|
+
|
151
|
+
self.kwargs[canonical] = value
|
152
|
+
setattr(self, canonical, value)
|
153
|
+
|
154
|
+
if canonical == "seed":
|
141
155
|
self.reset_rng(value)
|
142
156
|
|
143
|
-
|
144
|
-
|
157
|
+
for alias, target in aliases.items():
|
158
|
+
if target == canonical:
|
159
|
+
setattr(self, alias, value)
|
145
160
|
|
146
|
-
|
161
|
+
def pipeline_operation(self) -> dict[str, Any] | None:
|
162
|
+
"""Return the Rust pipeline operation descriptor for this glitchling."""
|
163
|
+
|
164
|
+
factory = self._pipeline_descriptor_factory
|
165
|
+
if factory is None:
|
166
|
+
return None
|
167
|
+
|
168
|
+
return factory(self)
|
169
|
+
|
170
|
+
def _corruption_expects_rng(self) -> bool:
|
171
|
+
"""Return `True` when the corruption function accepts an rng keyword."""
|
172
|
+
|
173
|
+
cached_callable = self._cached_rng_callable
|
174
|
+
cached_expectation = self._cached_rng_expectation
|
175
|
+
corruption_function = self.corruption_function
|
176
|
+
|
177
|
+
if (
|
178
|
+
cached_callable is corruption_function
|
179
|
+
and cached_expectation is not None
|
180
|
+
):
|
181
|
+
return cached_expectation
|
182
|
+
|
183
|
+
expects_rng = False
|
147
184
|
try:
|
148
|
-
signature = inspect.signature(
|
185
|
+
signature = inspect.signature(corruption_function)
|
149
186
|
except (TypeError, ValueError):
|
150
187
|
signature = None
|
151
188
|
|
152
|
-
expects_rng = False
|
153
189
|
if signature is not None:
|
154
190
|
expects_rng = "rng" in signature.parameters
|
155
191
|
|
192
|
+
self._cached_rng_callable = corruption_function
|
193
|
+
self._cached_rng_expectation = expects_rng
|
194
|
+
return expects_rng
|
195
|
+
|
196
|
+
def __corrupt(self, text: str, *args: Any, **kwargs: Any) -> str:
|
197
|
+
"""Execute the corruption callable, injecting the RNG when required."""
|
198
|
+
|
199
|
+
# Pass rng to underlying corruption function if it expects it.
|
200
|
+
expects_rng = self._corruption_expects_rng()
|
201
|
+
|
156
202
|
if expects_rng:
|
157
203
|
corrupted = self.corruption_function(text, *args, rng=self.rng, **kwargs)
|
158
204
|
else:
|
@@ -231,53 +277,14 @@ class Glitchling:
|
|
231
277
|
self.corruption_function,
|
232
278
|
self.level,
|
233
279
|
self.order,
|
280
|
+
pipeline_operation=self._pipeline_descriptor_factory,
|
234
281
|
**filtered_kwargs,
|
235
282
|
)
|
236
283
|
|
237
284
|
return cls(**filtered_kwargs)
|
238
285
|
|
239
286
|
|
240
|
-
def _pipeline_operation_reduplicate(glitchling: "Glitchling") -> dict[str, Any] | None:
|
241
|
-
rate = glitchling.kwargs.get("reduplication_rate")
|
242
|
-
if rate is None:
|
243
|
-
return None
|
244
|
-
return {"type": "reduplicate", "reduplication_rate": float(rate)}
|
245
|
-
|
246
287
|
|
247
|
-
def _pipeline_operation_delete(glitchling: "Glitchling") -> dict[str, Any] | None:
|
248
|
-
rate = glitchling.kwargs.get("max_deletion_rate")
|
249
|
-
if rate is None:
|
250
|
-
return None
|
251
|
-
return {"type": "delete", "max_deletion_rate": float(rate)}
|
252
|
-
|
253
|
-
|
254
|
-
def _pipeline_operation_redact(glitchling: "Glitchling") -> dict[str, Any] | None:
|
255
|
-
replacement_char = glitchling.kwargs.get("replacement_char")
|
256
|
-
redaction_rate = glitchling.kwargs.get("redaction_rate")
|
257
|
-
merge_adjacent = glitchling.kwargs.get("merge_adjacent")
|
258
|
-
if replacement_char is None or redaction_rate is None or merge_adjacent is None:
|
259
|
-
return None
|
260
|
-
return {
|
261
|
-
"type": "redact",
|
262
|
-
"replacement_char": str(replacement_char),
|
263
|
-
"redaction_rate": float(redaction_rate),
|
264
|
-
"merge_adjacent": bool(merge_adjacent),
|
265
|
-
}
|
266
|
-
|
267
|
-
|
268
|
-
def _pipeline_operation_ocr(glitchling: "Glitchling") -> dict[str, Any] | None:
|
269
|
-
error_rate = glitchling.kwargs.get("error_rate")
|
270
|
-
if error_rate is None:
|
271
|
-
return None
|
272
|
-
return {"type": "ocr", "error_rate": float(error_rate)}
|
273
|
-
|
274
|
-
|
275
|
-
_PIPELINE_OPERATION_BUILDERS: dict[str, Callable[["Glitchling"], dict[str, Any] | None]] = {
|
276
|
-
"Reduple": _pipeline_operation_reduplicate,
|
277
|
-
"Rushmore": _pipeline_operation_delete,
|
278
|
-
"Redactyl": _pipeline_operation_redact,
|
279
|
-
"Scannequin": _pipeline_operation_ocr,
|
280
|
-
}
|
281
288
|
|
282
289
|
|
283
290
|
class Gaggle(Glitchling):
|
@@ -359,10 +366,7 @@ class Gaggle(Glitchling):
|
|
359
366
|
|
360
367
|
descriptors: list[dict[str, Any]] = []
|
361
368
|
for glitchling in self.apply_order:
|
362
|
-
|
363
|
-
if builder is None:
|
364
|
-
return None
|
365
|
-
operation = builder(glitchling)
|
369
|
+
operation = glitchling.pipeline_operation()
|
366
370
|
if operation is None:
|
367
371
|
return None
|
368
372
|
|