glitchlings 0.2.2__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {glitchlings-0.2.2 → glitchlings-0.2.4}/PKG-INFO +23 -55
- {glitchlings-0.2.2 → glitchlings-0.2.4}/README.md +17 -46
- {glitchlings-0.2.2 → glitchlings-0.2.4}/pyproject.toml +6 -10
- {glitchlings-0.2.2 → glitchlings-0.2.4}/rust/zoo/build.rs +17 -5
- {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/dlc/prime.py +44 -22
- {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/main.py +1 -1
- glitchlings-0.2.4/src/glitchlings/zoo/_rate.py +21 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/zoo/core.py +56 -52
- {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/zoo/jargoyle.py +24 -5
- {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/zoo/mim1c.py +24 -5
- {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/zoo/redactyl.py +43 -8
- {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/zoo/reduple.py +36 -8
- {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/zoo/rushmore.py +40 -8
- {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/zoo/scannequin.py +38 -8
- {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/zoo/typogre.py +29 -9
- {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings.egg-info/PKG-INFO +23 -55
- {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings.egg-info/SOURCES.txt +1 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/tests/test_cli.py +2 -2
- {glitchlings-0.2.2 → glitchlings-0.2.4}/tests/test_gaggle.py +2 -1
- glitchlings-0.2.4/tests/test_glitchling_core.py +68 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/tests/test_glitchlings_determinism.py +7 -7
- {glitchlings-0.2.2 → glitchlings-0.2.4}/tests/test_jargoyle.py +2 -2
- {glitchlings-0.2.2 → glitchlings-0.2.4}/tests/test_parameter_effects.py +10 -10
- {glitchlings-0.2.2 → glitchlings-0.2.4}/tests/test_prime_echo_chamber.py +93 -4
- {glitchlings-0.2.2 → glitchlings-0.2.4}/tests/test_property_based.py +2 -2
- {glitchlings-0.2.2 → glitchlings-0.2.4}/tests/test_rust_backed_glitchlings.py +30 -30
- glitchlings-0.2.2/tests/test_glitchling_core.py +0 -24
- {glitchlings-0.2.2 → glitchlings-0.2.4}/LICENSE +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/MANIFEST.in +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/rust/Cargo.lock +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/rust/Cargo.toml +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/rust/zoo/Cargo.toml +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/rust/zoo/assets/ocr_confusions.tsv +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/rust/zoo/src/glitch_ops.rs +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/rust/zoo/src/lib.rs +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/rust/zoo/src/pipeline.rs +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/rust/zoo/src/resources.rs +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/rust/zoo/src/rng.rs +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/rust/zoo/src/text_buffer.rs +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/rust/zoo/src/typogre.rs +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/setup.cfg +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/__init__.py +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/__main__.py +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/dlc/__init__.py +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/dlc/huggingface.py +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/util/__init__.py +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/zoo/__init__.py +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/zoo/_ocr_confusions.py +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/zoo/ocr_confusions.tsv +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings.egg-info/dependency_links.txt +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings.egg-info/entry_points.txt +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings.egg-info/requires.txt +1 -1
- {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings.egg-info/top_level.txt +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/tests/test_dataset_corruption.py +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/tests/test_huggingface_dlc.py +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/tests/test_keyboard_layouts.py +0 -0
- {glitchlings-0.2.2 → glitchlings-0.2.4}/tests/test_util.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: glitchlings
|
3
|
-
Version: 0.2.2
|
3
|
+
Version: 0.2.4
|
4
4
|
Summary: Monsters for your language games.
|
5
5
|
Author: osoleve
|
6
6
|
License: Apache License
|
@@ -209,25 +209,21 @@ Project-URL: Homepage, https://github.com/osoleve/glitchlings
|
|
209
209
|
Project-URL: Repository, https://github.com/osoleve/glitchlings.git
|
210
210
|
Project-URL: Issues, https://github.com/osoleve/glitchlings/issues
|
211
211
|
Project-URL: Changelog, https://github.com/osoleve/glitchlings/releases
|
212
|
-
Keywords: nlp,text,adversarial augmentation,text augmentation
|
212
|
+
Keywords: nlp,text,adversarial augmentation,text augmentation,large language models,llms,data augmentation,confusables,typo,
|
213
213
|
Classifier: Development Status :: 3 - Alpha
|
214
214
|
Classifier: Intended Audience :: Developers
|
215
|
-
Classifier: License :: OSI Approved :: Apache Software License
|
216
215
|
Classifier: Programming Language :: Python
|
217
216
|
Classifier: Programming Language :: Python :: 3
|
217
|
+
Classifier: Programming Language :: Python :: 3.10
|
218
|
+
Classifier: Programming Language :: Python :: 3.11
|
218
219
|
Classifier: Programming Language :: Python :: 3.12
|
219
220
|
Classifier: Programming Language :: Rust
|
220
|
-
Classifier: Operating System :: MacOS :: MacOS X
|
221
|
-
Classifier: Operating System :: Microsoft :: Windows
|
222
|
-
Classifier: Operating System :: POSIX :: Linux
|
223
|
-
Classifier: Operating System :: OS Independent
|
224
221
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
225
222
|
Classifier: Topic :: Software Development :: Testing
|
226
|
-
Requires-Python: >=3.
|
223
|
+
Requires-Python: >=3.10
|
227
224
|
Description-Content-Type: text/markdown
|
228
225
|
License-File: LICENSE
|
229
226
|
Requires-Dist: confusable-homoglyphs>=3.3.1
|
230
|
-
Requires-Dist: jellyfish>=1.2.0
|
231
227
|
Provides-Extra: hf
|
232
228
|
Requires-Dist: datasets>=4.0.0; extra == "hf"
|
233
229
|
Provides-Extra: wordnet
|
@@ -235,6 +231,7 @@ Requires-Dist: nltk>=3.9.1; extra == "wordnet"
|
|
235
231
|
Requires-Dist: numpy<=2.0,>=1.24; extra == "wordnet"
|
236
232
|
Provides-Extra: prime
|
237
233
|
Requires-Dist: verifiers>=0.1.3.post0; extra == "prime"
|
234
|
+
Requires-Dist: jellyfish>=1.2.0; extra == "prime"
|
238
235
|
Provides-Extra: dev
|
239
236
|
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
240
237
|
Requires-Dist: hypothesis>=6.140.0; extra == "dev"
|
@@ -280,14 +277,16 @@ After all, what good is general intelligence if it can't handle a little chaos?
|
|
280
277
|
pip install -U glitchlings
|
281
278
|
```
|
282
279
|
|
280
|
+
> Glitchlings requires Python 3.10 or newer.
|
281
|
+
|
283
282
|
```python
|
284
283
|
from glitchlings import Gaggle, SAMPLE_TEXT, Typogre, Mim1c, Reduple, Rushmore
|
285
284
|
|
286
285
|
gaggle = Gaggle([
|
287
|
-
Typogre(
|
288
|
-
Mim1c(
|
286
|
+
Typogre(rate=0.03),
|
287
|
+
Mim1c(rate=0.02),
|
289
288
|
Reduple(seed=404),
|
290
|
-
Rushmore(
|
289
|
+
Rushmore(rate=0.02),
|
291
290
|
])
|
292
291
|
|
293
292
|
print(gaggle(SAMPLE_TEXT))
|
@@ -295,41 +294,10 @@ print(gaggle(SAMPLE_TEXT))
|
|
295
294
|
|
296
295
|
> Onҽ mھrning, wһen Gregor Samƽa woke from trouble𝐝 𝑑reams, he found himself transformed in his bed into a horrible vermin٠ He l lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightlh domed and divided by arches ino stiff sections. The bedding was adly able to cover it and and seemed ready to slide off any moment. His many legxs, pitifully thin compared with the size of the the rest of him, waved about helplessly ashe looked looked.
|
297
296
|
|
298
|
-
|
299
|
-
|
300
|
-
Need detailed usage patterns, dataset workflows, or tips for enabling the
|
301
|
-
Rust accelerator? Consult the [Glitchlings Usage Guide](docs/index.md)
|
302
|
-
for end-to-end instructions spanning the Python API, CLI, Hugging Face
|
297
|
+
Consult the [Glitchlings Usage Guide](docs/index.md)
|
298
|
+
for end-to-end instructions spanning the Python API, CLI, HuggingFace and Prime Intellect
|
303
299
|
integrations, and the feature-flagged Rust pipeline.
|
304
300
|
|
305
|
-
### Prime Intellect environments
|
306
|
-
|
307
|
-
After `pip install -e .[prime]`, the `glitchlings.dlc.prime.load_environment` helper mirrors `verifiers.load_environment` for Prime Intellect scenarios while optionally applying glitchlings before returning the environment:
|
308
|
-
|
309
|
-
```python
|
310
|
-
from glitchlings import Mim1c, Typogre
|
311
|
-
from glitchlings.dlc.prime import echo_chamber, load_environment
|
312
|
-
|
313
|
-
env = load_environment(
|
314
|
-
"osoleve/syllabify-en",
|
315
|
-
glitchlings=[Mim1c(replacement_rate=0.01), Typogre(max_change_rate=0.02)],
|
316
|
-
seed=404,
|
317
|
-
)
|
318
|
-
|
319
|
-
# Spin up an echo chamber that corrupts a dataset column and
|
320
|
-
# rewards models for perfectly restoring it
|
321
|
-
practice_env = echo_chamber(
|
322
|
-
"osoleve/clean-room",
|
323
|
-
column="text",
|
324
|
-
glitchlings=["Typogre", "Mim1c"],
|
325
|
-
reward_function=lambda prompt, completion, answer: float(completion == answer),
|
326
|
-
)
|
327
|
-
```
|
328
|
-
|
329
|
-
Skip the `glitchlings` argument to receive an untouched verifier dataset, and
|
330
|
-
override `reward_function` when you want to evaluate completions with a custom
|
331
|
-
scoring routine.
|
332
|
-
|
333
301
|
## Motivation
|
334
302
|
|
335
303
|
If your model performs well on a particular task, but not when `Glitchling`s are present, it's a sign that it hasn't actually generalized to the problem.
|
@@ -344,8 +312,8 @@ Glitchlings are standard Python classes, so you can instantiate them with whatev
|
|
344
312
|
```python
|
345
313
|
from glitchlings import Gaggle, Typogre, Mim1c
|
346
314
|
|
347
|
-
custom_typogre = Typogre(
|
348
|
-
selective_mimic = Mim1c(
|
315
|
+
custom_typogre = Typogre(rate=0.1)
|
316
|
+
selective_mimic = Mim1c(rate=0.05, classes=["LATIN", "GREEK"])
|
349
317
|
|
350
318
|
gaggle = Gaggle([custom_typogre, selective_mimic], seed=99)
|
351
319
|
print(gaggle("Summoned heroes do not fear the glitch."))
|
@@ -376,7 +344,7 @@ glitchlings --list
|
|
376
344
|
glitchlings -g typogre --file documents/report.txt --diff
|
377
345
|
|
378
346
|
# Configure glitchlings inline by passing keyword arguments.
|
379
|
-
glitchlings -g "Typogre(
|
347
|
+
glitchlings -g "Typogre(rate=0.05)" "Ghouls just wanna have fun"
|
380
348
|
|
381
349
|
# Pipe text straight into the CLI for an on-the-fly corruption.
|
382
350
|
echo "Beware LLM-written flavor-text" | glitchlings -g mim1c
|
@@ -400,7 +368,7 @@ _What a nice word, would be a shame if something happened to it._
|
|
400
368
|
>
|
401
369
|
> Args
|
402
370
|
>
|
403
|
-
> - `
|
371
|
+
> - `rate (float)`: The maximum number of edits to make as a percentage of the length (default: 0.02, 2%).
|
404
372
|
> - `keyboard (str)`: Keyboard layout key-neighbor map to use (default: "CURATOR_QWERTY"; also accepts "QWERTY", "DVORAK", "COLEMAK", and "AZERTY").
|
405
373
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
406
374
|
|
@@ -412,7 +380,7 @@ _Wait, was that...?_
|
|
412
380
|
>
|
413
381
|
> Args
|
414
382
|
>
|
415
|
-
> - `
|
383
|
+
> - `rate (float)`: The maximum proportion of characters to replace (default: 0.02, 2%).
|
416
384
|
> - `classes (list[str] | "all")`: Restrict replacements to these Unicode script classes (default: ["LATIN", "GREEK", "CYRILLIC"]).
|
417
385
|
> - `banned_characters (Collection[str])`: Characters that must never appear as replacements (default: none).
|
418
386
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
@@ -425,7 +393,7 @@ _How can a computer need reading glasses?_
|
|
425
393
|
>
|
426
394
|
> Args
|
427
395
|
>
|
428
|
-
> - `
|
396
|
+
> - `rate (float)`: The maximum proportion of eligible confusion spans to replace (default: 0.02, 2%).
|
429
397
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
430
398
|
|
431
399
|
### Jargoyle
|
@@ -436,7 +404,7 @@ _Uh oh. The worst person you know just bought a thesaurus._
|
|
436
404
|
>
|
437
405
|
> Args
|
438
406
|
>
|
439
|
-
> - `
|
407
|
+
> - `rate (float)`: The maximum proportion of words to replace (default: 0.1, 10%).
|
440
408
|
> - `part_of_speech`: The WordNet part(s) of speech to target (default: nouns). Accepts `wn.NOUN`, `wn.VERB`, `wn.ADJ`, `wn.ADV`, any iterable of those tags, or the string `"any"` to include them all.
|
441
409
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
442
410
|
|
@@ -448,7 +416,7 @@ _Did you say that or did I?_
|
|
448
416
|
>
|
449
417
|
> Args
|
450
418
|
>
|
451
|
-
> - `
|
419
|
+
> - `rate (float)`: The maximum proportion of words to reduplicate (default: 0.05, 5%).
|
452
420
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
453
421
|
|
454
422
|
### Rushmore
|
@@ -459,7 +427,7 @@ _I accidentally an entire word._
|
|
459
427
|
>
|
460
428
|
> Args
|
461
429
|
>
|
462
|
-
> - `
|
430
|
+
> - `rate (float)`: The maximum proportion of words to delete (default: 0.01, 1%).
|
463
431
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
464
432
|
|
465
433
|
### Redactyl
|
@@ -471,7 +439,7 @@ _Oops, that was my black highlighter._
|
|
471
439
|
> ### Args
|
472
440
|
>
|
473
441
|
> - `replacement_char (str)`: The character to use for redaction (default: █).
|
474
|
-
> - `
|
442
|
+
> - `rate (float)`: The maximum proportion of words to redact (default: 0.05, 5%).
|
475
443
|
> - `merge_adjacent (bool)`: Whether to redact the space between adjacent redacted words (default: False).
|
476
444
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
477
445
|
|
@@ -36,14 +36,16 @@ After all, what good is general intelligence if it can't handle a little chaos?
|
|
36
36
|
pip install -U glitchlings
|
37
37
|
```
|
38
38
|
|
39
|
+
> Glitchlings requires Python 3.10 or newer.
|
40
|
+
|
39
41
|
```python
|
40
42
|
from glitchlings import Gaggle, SAMPLE_TEXT, Typogre, Mim1c, Reduple, Rushmore
|
41
43
|
|
42
44
|
gaggle = Gaggle([
|
43
|
-
Typogre(
|
44
|
-
Mim1c(
|
45
|
+
Typogre(rate=0.03),
|
46
|
+
Mim1c(rate=0.02),
|
45
47
|
Reduple(seed=404),
|
46
|
-
Rushmore(
|
48
|
+
Rushmore(rate=0.02),
|
47
49
|
])
|
48
50
|
|
49
51
|
print(gaggle(SAMPLE_TEXT))
|
@@ -51,41 +53,10 @@ print(gaggle(SAMPLE_TEXT))
|
|
51
53
|
|
52
54
|
> Onҽ mھrning, wһen Gregor Samƽa woke from trouble𝐝 𝑑reams, he found himself transformed in his bed into a horrible vermin٠ He l lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightlh domed and divided by arches ino stiff sections. The bedding was adly able to cover it and and seemed ready to slide off any moment. His many legxs, pitifully thin compared with the size of the the rest of him, waved about helplessly ashe looked looked.
|
53
55
|
|
54
|
-
|
55
|
-
|
56
|
-
Need detailed usage patterns, dataset workflows, or tips for enabling the
|
57
|
-
Rust accelerator? Consult the [Glitchlings Usage Guide](docs/index.md)
|
58
|
-
for end-to-end instructions spanning the Python API, CLI, Hugging Face
|
56
|
+
Consult the [Glitchlings Usage Guide](docs/index.md)
|
57
|
+
for end-to-end instructions spanning the Python API, CLI, HuggingFace and Prime Intellect
|
59
58
|
integrations, and the feature-flagged Rust pipeline.
|
60
59
|
|
61
|
-
### Prime Intellect environments
|
62
|
-
|
63
|
-
After `pip install -e .[prime]`, the `glitchlings.dlc.prime.load_environment` helper mirrors `verifiers.load_environment` for Prime Intellect scenarios while optionally applying glitchlings before returning the environment:
|
64
|
-
|
65
|
-
```python
|
66
|
-
from glitchlings import Mim1c, Typogre
|
67
|
-
from glitchlings.dlc.prime import echo_chamber, load_environment
|
68
|
-
|
69
|
-
env = load_environment(
|
70
|
-
"osoleve/syllabify-en",
|
71
|
-
glitchlings=[Mim1c(replacement_rate=0.01), Typogre(max_change_rate=0.02)],
|
72
|
-
seed=404,
|
73
|
-
)
|
74
|
-
|
75
|
-
# Spin up an echo chamber that corrupts a dataset column and
|
76
|
-
# rewards models for perfectly restoring it
|
77
|
-
practice_env = echo_chamber(
|
78
|
-
"osoleve/clean-room",
|
79
|
-
column="text",
|
80
|
-
glitchlings=["Typogre", "Mim1c"],
|
81
|
-
reward_function=lambda prompt, completion, answer: float(completion == answer),
|
82
|
-
)
|
83
|
-
```
|
84
|
-
|
85
|
-
Skip the `glitchlings` argument to receive an untouched verifier dataset, and
|
86
|
-
override `reward_function` when you want to evaluate completions with a custom
|
87
|
-
scoring routine.
|
88
|
-
|
89
60
|
## Motivation
|
90
61
|
|
91
62
|
If your model performs well on a particular task, but not when `Glitchling`s are present, it's a sign that it hasn't actually generalized to the problem.
|
@@ -100,8 +71,8 @@ Glitchlings are standard Python classes, so you can instantiate them with whatev
|
|
100
71
|
```python
|
101
72
|
from glitchlings import Gaggle, Typogre, Mim1c
|
102
73
|
|
103
|
-
custom_typogre = Typogre(
|
104
|
-
selective_mimic = Mim1c(
|
74
|
+
custom_typogre = Typogre(rate=0.1)
|
75
|
+
selective_mimic = Mim1c(rate=0.05, classes=["LATIN", "GREEK"])
|
105
76
|
|
106
77
|
gaggle = Gaggle([custom_typogre, selective_mimic], seed=99)
|
107
78
|
print(gaggle("Summoned heroes do not fear the glitch."))
|
@@ -132,7 +103,7 @@ glitchlings --list
|
|
132
103
|
glitchlings -g typogre --file documents/report.txt --diff
|
133
104
|
|
134
105
|
# Configure glitchlings inline by passing keyword arguments.
|
135
|
-
glitchlings -g "Typogre(
|
106
|
+
glitchlings -g "Typogre(rate=0.05)" "Ghouls just wanna have fun"
|
136
107
|
|
137
108
|
# Pipe text straight into the CLI for an on-the-fly corruption.
|
138
109
|
echo "Beware LLM-written flavor-text" | glitchlings -g mim1c
|
@@ -156,7 +127,7 @@ _What a nice word, would be a shame if something happened to it._
|
|
156
127
|
>
|
157
128
|
> Args
|
158
129
|
>
|
159
|
-
> - `
|
130
|
+
> - `rate (float)`: The maximum number of edits to make as a percentage of the length (default: 0.02, 2%).
|
160
131
|
> - `keyboard (str)`: Keyboard layout key-neighbor map to use (default: "CURATOR_QWERTY"; also accepts "QWERTY", "DVORAK", "COLEMAK", and "AZERTY").
|
161
132
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
162
133
|
|
@@ -168,7 +139,7 @@ _Wait, was that...?_
|
|
168
139
|
>
|
169
140
|
> Args
|
170
141
|
>
|
171
|
-
> - `
|
142
|
+
> - `rate (float)`: The maximum proportion of characters to replace (default: 0.02, 2%).
|
172
143
|
> - `classes (list[str] | "all")`: Restrict replacements to these Unicode script classes (default: ["LATIN", "GREEK", "CYRILLIC"]).
|
173
144
|
> - `banned_characters (Collection[str])`: Characters that must never appear as replacements (default: none).
|
174
145
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
@@ -181,7 +152,7 @@ _How can a computer need reading glasses?_
|
|
181
152
|
>
|
182
153
|
> Args
|
183
154
|
>
|
184
|
-
> - `
|
155
|
+
> - `rate (float)`: The maximum proportion of eligible confusion spans to replace (default: 0.02, 2%).
|
185
156
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
186
157
|
|
187
158
|
### Jargoyle
|
@@ -192,7 +163,7 @@ _Uh oh. The worst person you know just bought a thesaurus._
|
|
192
163
|
>
|
193
164
|
> Args
|
194
165
|
>
|
195
|
-
> - `
|
166
|
+
> - `rate (float)`: The maximum proportion of words to replace (default: 0.1, 10%).
|
196
167
|
> - `part_of_speech`: The WordNet part(s) of speech to target (default: nouns). Accepts `wn.NOUN`, `wn.VERB`, `wn.ADJ`, `wn.ADV`, any iterable of those tags, or the string `"any"` to include them all.
|
197
168
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
198
169
|
|
@@ -204,7 +175,7 @@ _Did you say that or did I?_
|
|
204
175
|
>
|
205
176
|
> Args
|
206
177
|
>
|
207
|
-
> - `
|
178
|
+
> - `rate (float)`: The maximum proportion of words to reduplicate (default: 0.05, 5%).
|
208
179
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
209
180
|
|
210
181
|
### Rushmore
|
@@ -215,7 +186,7 @@ _I accidentally an entire word._
|
|
215
186
|
>
|
216
187
|
> Args
|
217
188
|
>
|
218
|
-
> - `
|
189
|
+
> - `rate (float)`: The maximum proportion of words to delete (default: 0.01, 1%).
|
219
190
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
220
191
|
|
221
192
|
### Redactyl
|
@@ -227,7 +198,7 @@ _Oops, that was my black highlighter._
|
|
227
198
|
> ### Args
|
228
199
|
>
|
229
200
|
> - `replacement_char (str)`: The character to use for redaction (default: █).
|
230
|
-
> - `
|
201
|
+
> - `rate (float)`: The maximum proportion of words to redact (default: 0.05, 5%).
|
231
202
|
> - `merge_adjacent (bool)`: Whether to redact the space between adjacent redacted words (default: False).
|
232
203
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
233
204
|
|
@@ -1,33 +1,29 @@
|
|
1
1
|
[project]
|
2
2
|
name = "glitchlings"
|
3
|
-
version = "0.2.2"
|
3
|
+
version = "0.2.4"
|
4
4
|
description = "Monsters for your language games."
|
5
5
|
readme = "README.md"
|
6
|
-
requires-python = ">=3.
|
6
|
+
requires-python = ">=3.10"
|
7
7
|
|
8
8
|
dependencies = [
|
9
9
|
"confusable-homoglyphs>=3.3.1",
|
10
|
-
"jellyfish>=1.2.0",
|
11
10
|
]
|
12
11
|
|
13
12
|
authors = [
|
14
13
|
{ name = "osoleve" }
|
15
14
|
]
|
16
15
|
|
17
|
-
keywords = ["nlp", "text", "adversarial augmentation", "text augmentation"]
|
16
|
+
keywords = ["nlp", "text", "adversarial augmentation", "text augmentation", "large language models", "llms", "data augmentation", "confusables", "typo", ""]
|
18
17
|
|
19
18
|
classifiers = [
|
20
19
|
"Development Status :: 3 - Alpha",
|
21
20
|
"Intended Audience :: Developers",
|
22
|
-
"License :: OSI Approved :: Apache Software License",
|
23
21
|
"Programming Language :: Python",
|
24
22
|
"Programming Language :: Python :: 3",
|
23
|
+
"Programming Language :: Python :: 3.10",
|
24
|
+
"Programming Language :: Python :: 3.11",
|
25
25
|
"Programming Language :: Python :: 3.12",
|
26
26
|
"Programming Language :: Rust",
|
27
|
-
"Operating System :: MacOS :: MacOS X",
|
28
|
-
"Operating System :: Microsoft :: Windows",
|
29
|
-
"Operating System :: POSIX :: Linux",
|
30
|
-
"Operating System :: OS Independent",
|
31
27
|
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
32
28
|
"Topic :: Software Development :: Testing",
|
33
29
|
]
|
@@ -47,7 +43,7 @@ glitchlings = "glitchlings.main:main"
|
|
47
43
|
[project.optional-dependencies]
|
48
44
|
hf = ["datasets>=4.0.0"]
|
49
45
|
wordnet = ["nltk>=3.9.1", "numpy>=1.24,<=2.0"]
|
50
|
-
prime = ["verifiers>=0.1.3.post0"]
|
46
|
+
prime = ["verifiers>=0.1.3.post0", "jellyfish>=1.2.0"]
|
51
47
|
dev = [
|
52
48
|
"pytest>=8.0.0",
|
53
49
|
"hypothesis>=6.140.0",
|
@@ -9,10 +9,15 @@ fn main() {
|
|
9
9
|
prepare_confusion_table().expect("failed to stage OCR confusion table for compilation");
|
10
10
|
pyo3_build_config::add_extension_module_link_args();
|
11
11
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
12
|
+
// Only perform custom Python linking on non-Linux platforms.
|
13
|
+
// On Linux, manylinux wheels must NOT link against libpython to ensure portability.
|
14
|
+
// PyO3's add_extension_module_link_args() already handles this correctly by default.
|
15
|
+
if cfg!(not(target_os = "linux")) {
|
16
|
+
if let Some(python) = configured_python() {
|
17
|
+
link_python(&python);
|
18
|
+
} else if let Some(python) = detect_python() {
|
19
|
+
link_python(&python);
|
20
|
+
}
|
16
21
|
}
|
17
22
|
}
|
18
23
|
|
@@ -23,7 +28,13 @@ fn configured_python() -> Option<OsString> {
|
|
23
28
|
}
|
24
29
|
|
25
30
|
fn detect_python() -> Option<OsString> {
|
26
|
-
const CANDIDATES: &[&str] = &[
|
31
|
+
const CANDIDATES: &[&str] = &[
|
32
|
+
"python3.12",
|
33
|
+
"python3.11",
|
34
|
+
"python3.10",
|
35
|
+
"python3",
|
36
|
+
"python",
|
37
|
+
];
|
27
38
|
|
28
39
|
for candidate in CANDIDATES {
|
29
40
|
let status = Command::new(candidate)
|
@@ -71,6 +82,7 @@ fn link_python(python: &OsStr) {
|
|
71
82
|
let stem = stripped
|
72
83
|
.strip_suffix(".so")
|
73
84
|
.or_else(|| stripped.strip_suffix(".a"))
|
85
|
+
.or_else(|| stripped.strip_suffix(".dylib"))
|
74
86
|
.unwrap_or(stripped);
|
75
87
|
if !stem.is_empty() {
|
76
88
|
println!("cargo:rustc-link-lib={stem}");
|
@@ -79,8 +79,8 @@ def tutorial_level(
|
|
79
79
|
) -> vf.Environment:
|
80
80
|
"""Create a low-corruption environment using tuned defaults."""
|
81
81
|
|
82
|
-
tuned_mim1c = Mim1c(
|
83
|
-
tuned_typogre = Typogre(
|
82
|
+
tuned_mim1c = Mim1c(rate=0.01 * difficulty.value)
|
83
|
+
tuned_typogre = Typogre(rate=0.025 * difficulty.value)
|
84
84
|
|
85
85
|
return load_environment(
|
86
86
|
env,
|
@@ -220,32 +220,54 @@ def echo_chamber(
|
|
220
220
|
"Specify which split to use when the dataset loads as a DatasetDict."
|
221
221
|
)
|
222
222
|
|
223
|
-
|
224
|
-
|
223
|
+
filtered_dataset = hf_dataset.filter(
|
224
|
+
lambda row: row.get(column) is not None,
|
225
|
+
load_from_cache_file=False,
|
226
|
+
)
|
225
227
|
|
226
|
-
|
227
|
-
value = row.get(column)
|
228
|
-
if value is None:
|
229
|
-
continue
|
228
|
+
source_column_names = list(filtered_dataset.column_names)
|
230
229
|
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
answers.append(text)
|
230
|
+
def _build_prompt(row: dict[str, Any]) -> dict[str, Any]:
|
231
|
+
text = str(row[column])
|
232
|
+
prompt = [
|
233
|
+
{"role": "system", "content": instructions},
|
234
|
+
{"role": "user", "content": f"Corrupted text:\n{text}"},
|
235
|
+
]
|
236
|
+
return {"prompt": prompt, "answer": text}
|
239
237
|
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
238
|
+
base_dataset = filtered_dataset.map(
|
239
|
+
_build_prompt,
|
240
|
+
remove_columns=source_column_names,
|
241
|
+
load_from_cache_file=False,
|
242
|
+
)
|
244
243
|
|
245
|
-
|
244
|
+
try:
|
245
|
+
dataset_length = len(base_dataset) # type: ignore[arg-type]
|
246
|
+
except TypeError:
|
247
|
+
preview_rows: list[dict[str, Any]]
|
248
|
+
take_fn = getattr(base_dataset, "take", None)
|
249
|
+
if callable(take_fn):
|
250
|
+
preview_rows = list(take_fn(1))
|
251
|
+
else:
|
252
|
+
iterator = iter(base_dataset)
|
253
|
+
try:
|
254
|
+
first_row = next(iterator)
|
255
|
+
except StopIteration:
|
256
|
+
preview_rows = []
|
257
|
+
else:
|
258
|
+
preview_rows = [first_row]
|
259
|
+
if not preview_rows:
|
260
|
+
raise ValueError(
|
261
|
+
f"Column '{column}' did not yield any textual entries in dataset '{dataset_id}'."
|
262
|
+
)
|
263
|
+
else:
|
264
|
+
if dataset_length == 0:
|
265
|
+
raise ValueError(
|
266
|
+
f"Column '{column}' did not yield any textual entries in dataset '{dataset_id}'."
|
267
|
+
)
|
246
268
|
|
247
269
|
gaggle = _as_gaggle(glitchlings, seed=seed)
|
248
|
-
glitched_dataset = gaggle.corrupt_dataset(
|
270
|
+
glitched_dataset = gaggle.corrupt_dataset(base_dataset, ["prompt"])
|
249
271
|
|
250
272
|
rubric_func = reward_function or symmetric_damerau_levenshtein_similarity
|
251
273
|
rubric = vf.Rubric(funcs=[rubric_func], weights=[1.0])
|
@@ -46,7 +46,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
46
46
|
metavar="SPEC",
|
47
47
|
help=(
|
48
48
|
"Glitchling to apply, optionally with parameters like "
|
49
|
-
"Typogre(
|
49
|
+
"Typogre(rate=0.05). Repeat for multiples; defaults to all built-ins."
|
50
50
|
),
|
51
51
|
)
|
52
52
|
parser.add_argument(
|
@@ -0,0 +1,21 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
|
4
|
+
def resolve_rate(
|
5
|
+
*,
|
6
|
+
rate: float | None,
|
7
|
+
legacy_value: float | None,
|
8
|
+
default: float,
|
9
|
+
legacy_name: str,
|
10
|
+
) -> float:
|
11
|
+
"""Return the effective rate while enforcing mutual exclusivity."""
|
12
|
+
|
13
|
+
if rate is not None and legacy_value is not None:
|
14
|
+
raise ValueError(
|
15
|
+
f"Specify either 'rate' or '{legacy_name}', not both."
|
16
|
+
)
|
17
|
+
if rate is not None:
|
18
|
+
return rate
|
19
|
+
if legacy_value is not None:
|
20
|
+
return legacy_value
|
21
|
+
return default
|