glitchlings 0.2.2__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. {glitchlings-0.2.2 → glitchlings-0.2.4}/PKG-INFO +23 -55
  2. {glitchlings-0.2.2 → glitchlings-0.2.4}/README.md +17 -46
  3. {glitchlings-0.2.2 → glitchlings-0.2.4}/pyproject.toml +6 -10
  4. {glitchlings-0.2.2 → glitchlings-0.2.4}/rust/zoo/build.rs +17 -5
  5. {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/dlc/prime.py +44 -22
  6. {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/main.py +1 -1
  7. glitchlings-0.2.4/src/glitchlings/zoo/_rate.py +21 -0
  8. {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/zoo/core.py +56 -52
  9. {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/zoo/jargoyle.py +24 -5
  10. {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/zoo/mim1c.py +24 -5
  11. {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/zoo/redactyl.py +43 -8
  12. {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/zoo/reduple.py +36 -8
  13. {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/zoo/rushmore.py +40 -8
  14. {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/zoo/scannequin.py +38 -8
  15. {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/zoo/typogre.py +29 -9
  16. {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings.egg-info/PKG-INFO +23 -55
  17. {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings.egg-info/SOURCES.txt +1 -0
  18. {glitchlings-0.2.2 → glitchlings-0.2.4}/tests/test_cli.py +2 -2
  19. {glitchlings-0.2.2 → glitchlings-0.2.4}/tests/test_gaggle.py +2 -1
  20. glitchlings-0.2.4/tests/test_glitchling_core.py +68 -0
  21. {glitchlings-0.2.2 → glitchlings-0.2.4}/tests/test_glitchlings_determinism.py +7 -7
  22. {glitchlings-0.2.2 → glitchlings-0.2.4}/tests/test_jargoyle.py +2 -2
  23. {glitchlings-0.2.2 → glitchlings-0.2.4}/tests/test_parameter_effects.py +10 -10
  24. {glitchlings-0.2.2 → glitchlings-0.2.4}/tests/test_prime_echo_chamber.py +93 -4
  25. {glitchlings-0.2.2 → glitchlings-0.2.4}/tests/test_property_based.py +2 -2
  26. {glitchlings-0.2.2 → glitchlings-0.2.4}/tests/test_rust_backed_glitchlings.py +30 -30
  27. glitchlings-0.2.2/tests/test_glitchling_core.py +0 -24
  28. {glitchlings-0.2.2 → glitchlings-0.2.4}/LICENSE +0 -0
  29. {glitchlings-0.2.2 → glitchlings-0.2.4}/MANIFEST.in +0 -0
  30. {glitchlings-0.2.2 → glitchlings-0.2.4}/rust/Cargo.lock +0 -0
  31. {glitchlings-0.2.2 → glitchlings-0.2.4}/rust/Cargo.toml +0 -0
  32. {glitchlings-0.2.2 → glitchlings-0.2.4}/rust/zoo/Cargo.toml +0 -0
  33. {glitchlings-0.2.2 → glitchlings-0.2.4}/rust/zoo/assets/ocr_confusions.tsv +0 -0
  34. {glitchlings-0.2.2 → glitchlings-0.2.4}/rust/zoo/src/glitch_ops.rs +0 -0
  35. {glitchlings-0.2.2 → glitchlings-0.2.4}/rust/zoo/src/lib.rs +0 -0
  36. {glitchlings-0.2.2 → glitchlings-0.2.4}/rust/zoo/src/pipeline.rs +0 -0
  37. {glitchlings-0.2.2 → glitchlings-0.2.4}/rust/zoo/src/resources.rs +0 -0
  38. {glitchlings-0.2.2 → glitchlings-0.2.4}/rust/zoo/src/rng.rs +0 -0
  39. {glitchlings-0.2.2 → glitchlings-0.2.4}/rust/zoo/src/text_buffer.rs +0 -0
  40. {glitchlings-0.2.2 → glitchlings-0.2.4}/rust/zoo/src/typogre.rs +0 -0
  41. {glitchlings-0.2.2 → glitchlings-0.2.4}/setup.cfg +0 -0
  42. {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/__init__.py +0 -0
  43. {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/__main__.py +0 -0
  44. {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/dlc/__init__.py +0 -0
  45. {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/dlc/huggingface.py +0 -0
  46. {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/util/__init__.py +0 -0
  47. {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/zoo/__init__.py +0 -0
  48. {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/zoo/_ocr_confusions.py +0 -0
  49. {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings/zoo/ocr_confusions.tsv +0 -0
  50. {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings.egg-info/dependency_links.txt +0 -0
  51. {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings.egg-info/entry_points.txt +0 -0
  52. {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings.egg-info/requires.txt +1 -1
  53. {glitchlings-0.2.2 → glitchlings-0.2.4}/src/glitchlings.egg-info/top_level.txt +0 -0
  54. {glitchlings-0.2.2 → glitchlings-0.2.4}/tests/test_dataset_corruption.py +0 -0
  55. {glitchlings-0.2.2 → glitchlings-0.2.4}/tests/test_huggingface_dlc.py +0 -0
  56. {glitchlings-0.2.2 → glitchlings-0.2.4}/tests/test_keyboard_layouts.py +0 -0
  57. {glitchlings-0.2.2 → glitchlings-0.2.4}/tests/test_util.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: glitchlings
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: Monsters for your language games.
5
5
  Author: osoleve
6
6
  License: Apache License
@@ -209,25 +209,21 @@ Project-URL: Homepage, https://github.com/osoleve/glitchlings
209
209
  Project-URL: Repository, https://github.com/osoleve/glitchlings.git
210
210
  Project-URL: Issues, https://github.com/osoleve/glitchlings/issues
211
211
  Project-URL: Changelog, https://github.com/osoleve/glitchlings/releases
212
- Keywords: nlp,text,adversarial augmentation,text augmentation
212
+ Keywords: nlp,text,adversarial augmentation,text augmentation,large language models,llms,data augmentation,confusables,typo
213
213
  Classifier: Development Status :: 3 - Alpha
214
214
  Classifier: Intended Audience :: Developers
215
- Classifier: License :: OSI Approved :: Apache Software License
216
215
  Classifier: Programming Language :: Python
217
216
  Classifier: Programming Language :: Python :: 3
217
+ Classifier: Programming Language :: Python :: 3.10
218
+ Classifier: Programming Language :: Python :: 3.11
218
219
  Classifier: Programming Language :: Python :: 3.12
219
220
  Classifier: Programming Language :: Rust
220
- Classifier: Operating System :: MacOS :: MacOS X
221
- Classifier: Operating System :: Microsoft :: Windows
222
- Classifier: Operating System :: POSIX :: Linux
223
- Classifier: Operating System :: OS Independent
224
221
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
225
222
  Classifier: Topic :: Software Development :: Testing
226
- Requires-Python: >=3.12
223
+ Requires-Python: >=3.10
227
224
  Description-Content-Type: text/markdown
228
225
  License-File: LICENSE
229
226
  Requires-Dist: confusable-homoglyphs>=3.3.1
230
- Requires-Dist: jellyfish>=1.2.0
231
227
  Provides-Extra: hf
232
228
  Requires-Dist: datasets>=4.0.0; extra == "hf"
233
229
  Provides-Extra: wordnet
@@ -235,6 +231,7 @@ Requires-Dist: nltk>=3.9.1; extra == "wordnet"
235
231
  Requires-Dist: numpy<=2.0,>=1.24; extra == "wordnet"
236
232
  Provides-Extra: prime
237
233
  Requires-Dist: verifiers>=0.1.3.post0; extra == "prime"
234
+ Requires-Dist: jellyfish>=1.2.0; extra == "prime"
238
235
  Provides-Extra: dev
239
236
  Requires-Dist: pytest>=8.0.0; extra == "dev"
240
237
  Requires-Dist: hypothesis>=6.140.0; extra == "dev"
@@ -280,14 +277,16 @@ After all, what good is general intelligence if it can't handle a little chaos?
280
277
  pip install -U glitchlings
281
278
  ```
282
279
 
280
+ > Glitchlings requires Python 3.10 or newer.
281
+
283
282
  ```python
284
283
  from glitchlings import Gaggle, SAMPLE_TEXT, Typogre, Mim1c, Reduple, Rushmore
285
284
 
286
285
  gaggle = Gaggle([
287
- Typogre(max_change_rate=0.03),
288
- Mim1c(replacement_rate=0.02),
286
+ Typogre(rate=0.03),
287
+ Mim1c(rate=0.02),
289
288
  Reduple(seed=404),
290
- Rushmore(max_deletion_rate=0.02),
289
+ Rushmore(rate=0.02),
291
290
  ])
292
291
 
293
292
  print(gaggle(SAMPLE_TEXT))
@@ -295,41 +294,10 @@ print(gaggle(SAMPLE_TEXT))
295
294
 
296
295
  > Onҽ m‎ھ‎rning, wһen Gregor Samƽa woke from trouble𝐝 𝑑reams, he found himself transformed in his bed into a horrible vermin‎٠‎ He l lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightlh domed and divided by arches ino stiff sections. The bedding was adly able to cover it and and seemed ready to slide off any moment. His many legxs, pitifully thin compared with the size of the the rest of him, waved about helplessly ashe looked looked.
297
296
 
298
- ## Usage
299
-
300
- Need detailed usage patterns, dataset workflows, or tips for enabling the
301
- Rust accelerator? Consult the [Glitchlings Usage Guide](docs/index.md)
302
- for end-to-end instructions spanning the Python API, CLI, Hugging Face
297
+ Consult the [Glitchlings Usage Guide](docs/index.md)
298
+ for end-to-end instructions spanning the Python API, CLI, Hugging Face and Prime Intellect
303
299
  integrations, and the feature-flagged Rust pipeline.
304
300
 
305
- ### Prime Intellect environments
306
-
307
- After `pip install -e .[prime]`, the `glitchlings.dlc.prime.load_environment` helper mirrors `verifiers.load_environment` for Prime Intellect scenarios while optionally applying glitchlings before returning the environment:
308
-
309
- ```python
310
- from glitchlings import Mim1c, Typogre
311
- from glitchlings.dlc.prime import echo_chamber, load_environment
312
-
313
- env = load_environment(
314
- "osoleve/syllabify-en",
315
- glitchlings=[Mim1c(replacement_rate=0.01), Typogre(max_change_rate=0.02)],
316
- seed=404,
317
- )
318
-
319
- # Spin up an echo chamber that corrupts a dataset column and
320
- # rewards models for perfectly restoring it
321
- practice_env = echo_chamber(
322
- "osoleve/clean-room",
323
- column="text",
324
- glitchlings=["Typogre", "Mim1c"],
325
- reward_function=lambda prompt, completion, answer: float(completion == answer),
326
- )
327
- ```
328
-
329
- Skip the `glitchlings` argument to receive an untouched verifier dataset, and
330
- override `reward_function` when you want to evaluate completions with a custom
331
- scoring routine.
332
-
333
301
  ## Motivation
334
302
 
335
303
  If your model performs well on a particular task, but not when `Glitchling`s are present, it's a sign that it hasn't actually generalized to the problem.
@@ -344,8 +312,8 @@ Glitchlings are standard Python classes, so you can instantiate them with whatev
344
312
  ```python
345
313
  from glitchlings import Gaggle, Typogre, Mim1c
346
314
 
347
- custom_typogre = Typogre(max_change_rate=0.1)
348
- selective_mimic = Mim1c(replacement_rate=0.05, classes=["LATIN", "GREEK"])
315
+ custom_typogre = Typogre(rate=0.1)
316
+ selective_mimic = Mim1c(rate=0.05, classes=["LATIN", "GREEK"])
349
317
 
350
318
  gaggle = Gaggle([custom_typogre, selective_mimic], seed=99)
351
319
  print(gaggle("Summoned heroes do not fear the glitch."))
@@ -376,7 +344,7 @@ glitchlings --list
376
344
  glitchlings -g typogre --file documents/report.txt --diff
377
345
 
378
346
  # Configure glitchlings inline by passing keyword arguments.
379
- glitchlings -g "Typogre(max_change_rate=0.05)" "Ghouls just wanna have fun"
347
+ glitchlings -g "Typogre(rate=0.05)" "Ghouls just wanna have fun"
380
348
 
381
349
  # Pipe text straight into the CLI for an on-the-fly corruption.
382
350
  echo "Beware LLM-written flavor-text" | glitchlings -g mim1c
@@ -400,7 +368,7 @@ _What a nice word, would be a shame if something happened to it._
400
368
  >
401
369
  > Args
402
370
  >
403
- > - `max_change_rate (float)`: The maximum number of edits to make as a percentage of the length (default: 0.02, 2%).
371
+ > - `rate (float)`: The maximum number of edits to make, as a fraction of the text length (default: 0.02, 2%).
404
372
  > - `keyboard (str)`: Keyboard layout key-neighbor map to use (default: "CURATOR_QWERTY"; also accepts "QWERTY", "DVORAK", "COLEMAK", and "AZERTY").
405
373
  > - `seed (int)`: The random seed for reproducibility (default: 151).
406
374
 
@@ -412,7 +380,7 @@ _Wait, was that...?_
412
380
  >
413
381
  > Args
414
382
  >
415
- > - `replacement_rate (float)`: The maximum proportion of characters to replace (default: 0.02, 2%).
383
+ > - `rate (float)`: The maximum proportion of characters to replace (default: 0.02, 2%).
416
384
  > - `classes (list[str] | "all")`: Restrict replacements to these Unicode script classes (default: ["LATIN", "GREEK", "CYRILLIC"]).
417
385
  > - `banned_characters (Collection[str])`: Characters that must never appear as replacements (default: none).
418
386
  > - `seed (int)`: The random seed for reproducibility (default: 151).
@@ -425,7 +393,7 @@ _How can a computer need reading glasses?_
425
393
  >
426
394
  > Args
427
395
  >
428
- > - `error_rate (float)`: The maximum proportion of eligible confusion spans to replace (default: 0.02, 2%).
396
+ > - `rate (float)`: The maximum proportion of eligible confusion spans to replace (default: 0.02, 2%).
429
397
  > - `seed (int)`: The random seed for reproducibility (default: 151).
430
398
 
431
399
  ### Jargoyle
@@ -436,7 +404,7 @@ _Uh oh. The worst person you know just bought a thesaurus._
436
404
  >
437
405
  > Args
438
406
  >
439
- > - `replacement_rate (float)`: The maximum proportion of words to replace (default: 0.1, 10%).
407
+ > - `rate (float)`: The maximum proportion of words to replace (default: 0.1, 10%).
440
408
  > - `part_of_speech`: The WordNet part(s) of speech to target (default: nouns). Accepts `wn.NOUN`, `wn.VERB`, `wn.ADJ`, `wn.ADV`, any iterable of those tags, or the string `"any"` to include them all.
441
409
  > - `seed (int)`: The random seed for reproducibility (default: 151).
442
410
 
@@ -448,7 +416,7 @@ _Did you say that or did I?_
448
416
  >
449
417
  > Args
450
418
  >
451
- > - `reduplication_rate (float)`: The maximum proportion of words to reduplicate (default: 0.05, 5%).
419
+ > - `rate (float)`: The maximum proportion of words to reduplicate (default: 0.05, 5%).
452
420
  > - `seed (int)`: The random seed for reproducibility (default: 151).
453
421
 
454
422
  ### Rushmore
@@ -459,7 +427,7 @@ _I accidentally an entire word._
459
427
  >
460
428
  > Args
461
429
  >
462
- > - `max_deletion_rate (float)`: The maximum proportion of words to delete (default: 0.01, 1%).
430
+ > - `rate (float)`: The maximum proportion of words to delete (default: 0.01, 1%).
463
431
  > - `seed (int)`: The random seed for reproducibility (default: 151).
464
432
 
465
433
  ### Redactyl
@@ -471,7 +439,7 @@ _Oops, that was my black highlighter._
471
439
  > ### Args
472
440
  >
473
441
  > - `replacement_char (str)`: The character to use for redaction (default: █).
474
- > - `redaction_rate (float)`: The maximum proportion of words to redact (default: 0.05, 5%).
442
+ > - `rate (float)`: The maximum proportion of words to redact (default: 0.05, 5%).
475
443
  > - `merge_adjacent (bool)`: Whether to redact the space between adjacent redacted words (default: False).
476
444
  > - `seed (int)`: The random seed for reproducibility (default: 151).
477
445
 
@@ -36,14 +36,16 @@ After all, what good is general intelligence if it can't handle a little chaos?
36
36
  pip install -U glitchlings
37
37
  ```
38
38
 
39
+ > Glitchlings requires Python 3.10 or newer.
40
+
39
41
  ```python
40
42
  from glitchlings import Gaggle, SAMPLE_TEXT, Typogre, Mim1c, Reduple, Rushmore
41
43
 
42
44
  gaggle = Gaggle([
43
- Typogre(max_change_rate=0.03),
44
- Mim1c(replacement_rate=0.02),
45
+ Typogre(rate=0.03),
46
+ Mim1c(rate=0.02),
45
47
  Reduple(seed=404),
46
- Rushmore(max_deletion_rate=0.02),
48
+ Rushmore(rate=0.02),
47
49
  ])
48
50
 
49
51
  print(gaggle(SAMPLE_TEXT))
@@ -51,41 +53,10 @@ print(gaggle(SAMPLE_TEXT))
51
53
 
52
54
  > Onҽ m‎ھ‎rning, wһen Gregor Samƽa woke from trouble𝐝 𝑑reams, he found himself transformed in his bed into a horrible vermin‎٠‎ He l lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightlh domed and divided by arches ino stiff sections. The bedding was adly able to cover it and and seemed ready to slide off any moment. His many legxs, pitifully thin compared with the size of the the rest of him, waved about helplessly ashe looked looked.
53
55
 
54
- ## Usage
55
-
56
- Need detailed usage patterns, dataset workflows, or tips for enabling the
57
- Rust accelerator? Consult the [Glitchlings Usage Guide](docs/index.md)
58
- for end-to-end instructions spanning the Python API, CLI, Hugging Face
56
+ Consult the [Glitchlings Usage Guide](docs/index.md)
57
+ for end-to-end instructions spanning the Python API, CLI, Hugging Face and Prime Intellect
59
58
  integrations, and the feature-flagged Rust pipeline.
60
59
 
61
- ### Prime Intellect environments
62
-
63
- After `pip install -e .[prime]`, the `glitchlings.dlc.prime.load_environment` helper mirrors `verifiers.load_environment` for Prime Intellect scenarios while optionally applying glitchlings before returning the environment:
64
-
65
- ```python
66
- from glitchlings import Mim1c, Typogre
67
- from glitchlings.dlc.prime import echo_chamber, load_environment
68
-
69
- env = load_environment(
70
- "osoleve/syllabify-en",
71
- glitchlings=[Mim1c(replacement_rate=0.01), Typogre(max_change_rate=0.02)],
72
- seed=404,
73
- )
74
-
75
- # Spin up an echo chamber that corrupts a dataset column and
76
- # rewards models for perfectly restoring it
77
- practice_env = echo_chamber(
78
- "osoleve/clean-room",
79
- column="text",
80
- glitchlings=["Typogre", "Mim1c"],
81
- reward_function=lambda prompt, completion, answer: float(completion == answer),
82
- )
83
- ```
84
-
85
- Skip the `glitchlings` argument to receive an untouched verifier dataset, and
86
- override `reward_function` when you want to evaluate completions with a custom
87
- scoring routine.
88
-
89
60
  ## Motivation
90
61
 
91
62
  If your model performs well on a particular task, but not when `Glitchling`s are present, it's a sign that it hasn't actually generalized to the problem.
@@ -100,8 +71,8 @@ Glitchlings are standard Python classes, so you can instantiate them with whatev
100
71
  ```python
101
72
  from glitchlings import Gaggle, Typogre, Mim1c
102
73
 
103
- custom_typogre = Typogre(max_change_rate=0.1)
104
- selective_mimic = Mim1c(replacement_rate=0.05, classes=["LATIN", "GREEK"])
74
+ custom_typogre = Typogre(rate=0.1)
75
+ selective_mimic = Mim1c(rate=0.05, classes=["LATIN", "GREEK"])
105
76
 
106
77
  gaggle = Gaggle([custom_typogre, selective_mimic], seed=99)
107
78
  print(gaggle("Summoned heroes do not fear the glitch."))
@@ -132,7 +103,7 @@ glitchlings --list
132
103
  glitchlings -g typogre --file documents/report.txt --diff
133
104
 
134
105
  # Configure glitchlings inline by passing keyword arguments.
135
- glitchlings -g "Typogre(max_change_rate=0.05)" "Ghouls just wanna have fun"
106
+ glitchlings -g "Typogre(rate=0.05)" "Ghouls just wanna have fun"
136
107
 
137
108
  # Pipe text straight into the CLI for an on-the-fly corruption.
138
109
  echo "Beware LLM-written flavor-text" | glitchlings -g mim1c
@@ -156,7 +127,7 @@ _What a nice word, would be a shame if something happened to it._
156
127
  >
157
128
  > Args
158
129
  >
159
- > - `max_change_rate (float)`: The maximum number of edits to make as a percentage of the length (default: 0.02, 2%).
130
+ > - `rate (float)`: The maximum number of edits to make, as a fraction of the text length (default: 0.02, 2%).
160
131
  > - `keyboard (str)`: Keyboard layout key-neighbor map to use (default: "CURATOR_QWERTY"; also accepts "QWERTY", "DVORAK", "COLEMAK", and "AZERTY").
161
132
  > - `seed (int)`: The random seed for reproducibility (default: 151).
162
133
 
@@ -168,7 +139,7 @@ _Wait, was that...?_
168
139
  >
169
140
  > Args
170
141
  >
171
- > - `replacement_rate (float)`: The maximum proportion of characters to replace (default: 0.02, 2%).
142
+ > - `rate (float)`: The maximum proportion of characters to replace (default: 0.02, 2%).
172
143
  > - `classes (list[str] | "all")`: Restrict replacements to these Unicode script classes (default: ["LATIN", "GREEK", "CYRILLIC"]).
173
144
  > - `banned_characters (Collection[str])`: Characters that must never appear as replacements (default: none).
174
145
  > - `seed (int)`: The random seed for reproducibility (default: 151).
@@ -181,7 +152,7 @@ _How can a computer need reading glasses?_
181
152
  >
182
153
  > Args
183
154
  >
184
- > - `error_rate (float)`: The maximum proportion of eligible confusion spans to replace (default: 0.02, 2%).
155
+ > - `rate (float)`: The maximum proportion of eligible confusion spans to replace (default: 0.02, 2%).
185
156
  > - `seed (int)`: The random seed for reproducibility (default: 151).
186
157
 
187
158
  ### Jargoyle
@@ -192,7 +163,7 @@ _Uh oh. The worst person you know just bought a thesaurus._
192
163
  >
193
164
  > Args
194
165
  >
195
- > - `replacement_rate (float)`: The maximum proportion of words to replace (default: 0.1, 10%).
166
+ > - `rate (float)`: The maximum proportion of words to replace (default: 0.1, 10%).
196
167
  > - `part_of_speech`: The WordNet part(s) of speech to target (default: nouns). Accepts `wn.NOUN`, `wn.VERB`, `wn.ADJ`, `wn.ADV`, any iterable of those tags, or the string `"any"` to include them all.
197
168
  > - `seed (int)`: The random seed for reproducibility (default: 151).
198
169
 
@@ -204,7 +175,7 @@ _Did you say that or did I?_
204
175
  >
205
176
  > Args
206
177
  >
207
- > - `reduplication_rate (float)`: The maximum proportion of words to reduplicate (default: 0.05, 5%).
178
+ > - `rate (float)`: The maximum proportion of words to reduplicate (default: 0.05, 5%).
208
179
  > - `seed (int)`: The random seed for reproducibility (default: 151).
209
180
 
210
181
  ### Rushmore
@@ -215,7 +186,7 @@ _I accidentally an entire word._
215
186
  >
216
187
  > Args
217
188
  >
218
- > - `max_deletion_rate (float)`: The maximum proportion of words to delete (default: 0.01, 1%).
189
+ > - `rate (float)`: The maximum proportion of words to delete (default: 0.01, 1%).
219
190
  > - `seed (int)`: The random seed for reproducibility (default: 151).
220
191
 
221
192
  ### Redactyl
@@ -227,7 +198,7 @@ _Oops, that was my black highlighter._
227
198
  > ### Args
228
199
  >
229
200
  > - `replacement_char (str)`: The character to use for redaction (default: █).
230
- > - `redaction_rate (float)`: The maximum proportion of words to redact (default: 0.05, 5%).
201
+ > - `rate (float)`: The maximum proportion of words to redact (default: 0.05, 5%).
231
202
  > - `merge_adjacent (bool)`: Whether to redact the space between adjacent redacted words (default: False).
232
203
  > - `seed (int)`: The random seed for reproducibility (default: 151).
233
204
 
@@ -1,33 +1,29 @@
1
1
  [project]
2
2
  name = "glitchlings"
3
- version = "0.2.2"
3
+ version = "0.2.4"
4
4
  description = "Monsters for your language games."
5
5
  readme = "README.md"
6
- requires-python = ">=3.12"
6
+ requires-python = ">=3.10"
7
7
 
8
8
  dependencies = [
9
9
  "confusable-homoglyphs>=3.3.1",
10
- "jellyfish>=1.2.0",
11
10
  ]
12
11
 
13
12
  authors = [
14
13
  { name = "osoleve" }
15
14
  ]
16
15
 
17
- keywords = ["nlp", "text", "adversarial augmentation", "text augmentation"]
16
+ keywords = ["nlp", "text", "adversarial augmentation", "text augmentation", "large language models", "llms", "data augmentation", "confusables", "typo"]
18
17
 
19
18
  classifiers = [
20
19
  "Development Status :: 3 - Alpha",
21
20
  "Intended Audience :: Developers",
22
- "License :: OSI Approved :: Apache Software License",
23
21
  "Programming Language :: Python",
24
22
  "Programming Language :: Python :: 3",
23
+ "Programming Language :: Python :: 3.10",
24
+ "Programming Language :: Python :: 3.11",
25
25
  "Programming Language :: Python :: 3.12",
26
26
  "Programming Language :: Rust",
27
- "Operating System :: MacOS :: MacOS X",
28
- "Operating System :: Microsoft :: Windows",
29
- "Operating System :: POSIX :: Linux",
30
- "Operating System :: OS Independent",
31
27
  "Topic :: Scientific/Engineering :: Artificial Intelligence",
32
28
  "Topic :: Software Development :: Testing",
33
29
  ]
@@ -47,7 +43,7 @@ glitchlings = "glitchlings.main:main"
47
43
  [project.optional-dependencies]
48
44
  hf = ["datasets>=4.0.0"]
49
45
  wordnet = ["nltk>=3.9.1", "numpy>=1.24,<=2.0"]
50
- prime = ["verifiers>=0.1.3.post0"]
46
+ prime = ["verifiers>=0.1.3.post0", "jellyfish>=1.2.0"]
51
47
  dev = [
52
48
  "pytest>=8.0.0",
53
49
  "hypothesis>=6.140.0",
@@ -9,10 +9,15 @@ fn main() {
9
9
  prepare_confusion_table().expect("failed to stage OCR confusion table for compilation");
10
10
  pyo3_build_config::add_extension_module_link_args();
11
11
 
12
- if let Some(python) = configured_python() {
13
- link_python(&python);
14
- } else if let Some(python) = detect_python() {
15
- link_python(&python);
12
+ // Only perform custom Python linking on non-Linux platforms.
13
+ // On Linux, manylinux wheels must NOT link against libpython to ensure portability.
14
+ // PyO3's add_extension_module_link_args() already handles this correctly by default.
15
+ if cfg!(not(target_os = "linux")) {
16
+ if let Some(python) = configured_python() {
17
+ link_python(&python);
18
+ } else if let Some(python) = detect_python() {
19
+ link_python(&python);
20
+ }
16
21
  }
17
22
  }
18
23
 
@@ -23,7 +28,13 @@ fn configured_python() -> Option<OsString> {
23
28
  }
24
29
 
25
30
  fn detect_python() -> Option<OsString> {
26
- const CANDIDATES: &[&str] = &["python3.12", "python3", "python"];
31
+ const CANDIDATES: &[&str] = &[
32
+ "python3.12",
33
+ "python3.11",
34
+ "python3.10",
35
+ "python3",
36
+ "python",
37
+ ];
27
38
 
28
39
  for candidate in CANDIDATES {
29
40
  let status = Command::new(candidate)
@@ -71,6 +82,7 @@ fn link_python(python: &OsStr) {
71
82
  let stem = stripped
72
83
  .strip_suffix(".so")
73
84
  .or_else(|| stripped.strip_suffix(".a"))
85
+ .or_else(|| stripped.strip_suffix(".dylib"))
74
86
  .unwrap_or(stripped);
75
87
  if !stem.is_empty() {
76
88
  println!("cargo:rustc-link-lib={stem}");
@@ -79,8 +79,8 @@ def tutorial_level(
79
79
  ) -> vf.Environment:
80
80
  """Create a low-corruption environment using tuned defaults."""
81
81
 
82
- tuned_mim1c = Mim1c(replacement_rate=0.01 * difficulty.value)
83
- tuned_typogre = Typogre(max_change_rate=0.025 * difficulty.value)
82
+ tuned_mim1c = Mim1c(rate=0.01 * difficulty.value)
83
+ tuned_typogre = Typogre(rate=0.025 * difficulty.value)
84
84
 
85
85
  return load_environment(
86
86
  env,
@@ -220,32 +220,54 @@ def echo_chamber(
220
220
  "Specify which split to use when the dataset loads as a DatasetDict."
221
221
  )
222
222
 
223
- prompts: list[list[dict[str, str]]] = []
224
- answers: list[str] = []
223
+ filtered_dataset = hf_dataset.filter(
224
+ lambda row: row.get(column) is not None,
225
+ load_from_cache_file=False,
226
+ )
225
227
 
226
- for row in hf_dataset:
227
- value = row.get(column)
228
- if value is None:
229
- continue
228
+ source_column_names = list(filtered_dataset.column_names)
230
229
 
231
- text = str(value)
232
- prompts.append(
233
- [
234
- {"role": "system", "content": instructions},
235
- {"role": "user", "content": f"Corrupted text:\n{text}"},
236
- ]
237
- )
238
- answers.append(text)
230
+ def _build_prompt(row: dict[str, Any]) -> dict[str, Any]:
231
+ text = str(row[column])
232
+ prompt = [
233
+ {"role": "system", "content": instructions},
234
+ {"role": "user", "content": f"Corrupted text:\n{text}"},
235
+ ]
236
+ return {"prompt": prompt, "answer": text}
239
237
 
240
- if not prompts:
241
- raise ValueError(
242
- f"Column '{column}' did not yield any textual entries in dataset '{dataset_id}'."
243
- )
238
+ base_dataset = filtered_dataset.map(
239
+ _build_prompt,
240
+ remove_columns=source_column_names,
241
+ load_from_cache_file=False,
242
+ )
244
243
 
245
- dataset = HFDataset.from_dict({"prompt": prompts, "answer": answers})
244
+ try:
245
+ dataset_length = len(base_dataset) # type: ignore[arg-type]
246
+ except TypeError:
247
+ preview_rows: list[dict[str, Any]]
248
+ take_fn = getattr(base_dataset, "take", None)
249
+ if callable(take_fn):
250
+ preview_rows = list(take_fn(1))
251
+ else:
252
+ iterator = iter(base_dataset)
253
+ try:
254
+ first_row = next(iterator)
255
+ except StopIteration:
256
+ preview_rows = []
257
+ else:
258
+ preview_rows = [first_row]
259
+ if not preview_rows:
260
+ raise ValueError(
261
+ f"Column '{column}' did not yield any textual entries in dataset '{dataset_id}'."
262
+ )
263
+ else:
264
+ if dataset_length == 0:
265
+ raise ValueError(
266
+ f"Column '{column}' did not yield any textual entries in dataset '{dataset_id}'."
267
+ )
246
268
 
247
269
  gaggle = _as_gaggle(glitchlings, seed=seed)
248
- glitched_dataset = gaggle.corrupt_dataset(dataset, ["prompt"])
270
+ glitched_dataset = gaggle.corrupt_dataset(base_dataset, ["prompt"])
249
271
 
250
272
  rubric_func = reward_function or symmetric_damerau_levenshtein_similarity
251
273
  rubric = vf.Rubric(funcs=[rubric_func], weights=[1.0])
@@ -46,7 +46,7 @@ def build_parser() -> argparse.ArgumentParser:
46
46
  metavar="SPEC",
47
47
  help=(
48
48
  "Glitchling to apply, optionally with parameters like "
49
- "Typogre(max_change_rate=0.05). Repeat for multiples; defaults to all built-ins."
49
+ "Typogre(rate=0.05). Repeat for multiples; defaults to all built-ins."
50
50
  ),
51
51
  )
52
52
  parser.add_argument(
@@ -0,0 +1,21 @@
1
+ from __future__ import annotations
2
+
3
+
4
+ def resolve_rate(
5
+ *,
6
+ rate: float | None,
7
+ legacy_value: float | None,
8
+ default: float,
9
+ legacy_name: str,
10
+ ) -> float:
11
+ """Return the effective rate while enforcing mutual exclusivity."""
12
+
13
+ if rate is not None and legacy_value is not None:
14
+ raise ValueError(
15
+ f"Specify either 'rate' or '{legacy_name}', not both."
16
+ )
17
+ if rate is not None:
18
+ return rate
19
+ if legacy_value is not None:
20
+ return legacy_value
21
+ return default