glitchlings 0.2.2__tar.gz → 0.2.3__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. {glitchlings-0.2.2 → glitchlings-0.2.3}/PKG-INFO +21 -48
  2. {glitchlings-0.2.2 → glitchlings-0.2.3}/README.md +17 -46
  3. {glitchlings-0.2.2 → glitchlings-0.2.3}/pyproject.toml +4 -2
  4. {glitchlings-0.2.2 → glitchlings-0.2.3}/rust/zoo/build.rs +7 -1
  5. {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/dlc/prime.py +44 -22
  6. {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/main.py +1 -1
  7. glitchlings-0.2.3/src/glitchlings/zoo/_rate.py +21 -0
  8. {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/zoo/core.py +56 -52
  9. {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/zoo/jargoyle.py +24 -5
  10. {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/zoo/mim1c.py +24 -5
  11. {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/zoo/redactyl.py +43 -8
  12. {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/zoo/reduple.py +36 -8
  13. {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/zoo/rushmore.py +40 -8
  14. {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/zoo/scannequin.py +38 -8
  15. {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/zoo/typogre.py +29 -9
  16. {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings.egg-info/PKG-INFO +21 -48
  17. {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings.egg-info/SOURCES.txt +1 -0
  18. {glitchlings-0.2.2 → glitchlings-0.2.3}/tests/test_cli.py +2 -2
  19. {glitchlings-0.2.2 → glitchlings-0.2.3}/tests/test_gaggle.py +2 -1
  20. glitchlings-0.2.3/tests/test_glitchling_core.py +68 -0
  21. {glitchlings-0.2.2 → glitchlings-0.2.3}/tests/test_glitchlings_determinism.py +7 -7
  22. {glitchlings-0.2.2 → glitchlings-0.2.3}/tests/test_jargoyle.py +2 -2
  23. {glitchlings-0.2.2 → glitchlings-0.2.3}/tests/test_parameter_effects.py +10 -10
  24. {glitchlings-0.2.2 → glitchlings-0.2.3}/tests/test_prime_echo_chamber.py +93 -4
  25. {glitchlings-0.2.2 → glitchlings-0.2.3}/tests/test_property_based.py +2 -2
  26. {glitchlings-0.2.2 → glitchlings-0.2.3}/tests/test_rust_backed_glitchlings.py +30 -30
  27. glitchlings-0.2.2/tests/test_glitchling_core.py +0 -24
  28. {glitchlings-0.2.2 → glitchlings-0.2.3}/LICENSE +0 -0
  29. {glitchlings-0.2.2 → glitchlings-0.2.3}/MANIFEST.in +0 -0
  30. {glitchlings-0.2.2 → glitchlings-0.2.3}/rust/Cargo.lock +0 -0
  31. {glitchlings-0.2.2 → glitchlings-0.2.3}/rust/Cargo.toml +0 -0
  32. {glitchlings-0.2.2 → glitchlings-0.2.3}/rust/zoo/Cargo.toml +0 -0
  33. {glitchlings-0.2.2 → glitchlings-0.2.3}/rust/zoo/assets/ocr_confusions.tsv +0 -0
  34. {glitchlings-0.2.2 → glitchlings-0.2.3}/rust/zoo/src/glitch_ops.rs +0 -0
  35. {glitchlings-0.2.2 → glitchlings-0.2.3}/rust/zoo/src/lib.rs +0 -0
  36. {glitchlings-0.2.2 → glitchlings-0.2.3}/rust/zoo/src/pipeline.rs +0 -0
  37. {glitchlings-0.2.2 → glitchlings-0.2.3}/rust/zoo/src/resources.rs +0 -0
  38. {glitchlings-0.2.2 → glitchlings-0.2.3}/rust/zoo/src/rng.rs +0 -0
  39. {glitchlings-0.2.2 → glitchlings-0.2.3}/rust/zoo/src/text_buffer.rs +0 -0
  40. {glitchlings-0.2.2 → glitchlings-0.2.3}/rust/zoo/src/typogre.rs +0 -0
  41. {glitchlings-0.2.2 → glitchlings-0.2.3}/setup.cfg +0 -0
  42. {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/__init__.py +0 -0
  43. {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/__main__.py +0 -0
  44. {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/dlc/__init__.py +0 -0
  45. {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/dlc/huggingface.py +0 -0
  46. {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/util/__init__.py +0 -0
  47. {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/zoo/__init__.py +0 -0
  48. {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/zoo/_ocr_confusions.py +0 -0
  49. {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings/zoo/ocr_confusions.tsv +0 -0
  50. {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings.egg-info/dependency_links.txt +0 -0
  51. {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings.egg-info/entry_points.txt +0 -0
  52. {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings.egg-info/requires.txt +0 -0
  53. {glitchlings-0.2.2 → glitchlings-0.2.3}/src/glitchlings.egg-info/top_level.txt +0 -0
  54. {glitchlings-0.2.2 → glitchlings-0.2.3}/tests/test_dataset_corruption.py +0 -0
  55. {glitchlings-0.2.2 → glitchlings-0.2.3}/tests/test_huggingface_dlc.py +0 -0
  56. {glitchlings-0.2.2 → glitchlings-0.2.3}/tests/test_keyboard_layouts.py +0 -0
  57. {glitchlings-0.2.2 → glitchlings-0.2.3}/tests/test_util.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: glitchlings
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: Monsters for your language games.
5
5
  Author: osoleve
6
6
  License: Apache License
@@ -215,6 +215,8 @@ Classifier: Intended Audience :: Developers
215
215
  Classifier: License :: OSI Approved :: Apache Software License
216
216
  Classifier: Programming Language :: Python
217
217
  Classifier: Programming Language :: Python :: 3
218
+ Classifier: Programming Language :: Python :: 3.10
219
+ Classifier: Programming Language :: Python :: 3.11
218
220
  Classifier: Programming Language :: Python :: 3.12
219
221
  Classifier: Programming Language :: Rust
220
222
  Classifier: Operating System :: MacOS :: MacOS X
@@ -223,7 +225,7 @@ Classifier: Operating System :: POSIX :: Linux
223
225
  Classifier: Operating System :: OS Independent
224
226
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
225
227
  Classifier: Topic :: Software Development :: Testing
226
- Requires-Python: >=3.12
228
+ Requires-Python: >=3.10
227
229
  Description-Content-Type: text/markdown
228
230
  License-File: LICENSE
229
231
  Requires-Dist: confusable-homoglyphs>=3.3.1
@@ -280,14 +282,16 @@ After all, what good is general intelligence if it can't handle a little chaos?
280
282
  pip install -U glitchlings
281
283
  ```
282
284
 
285
+ > Glitchlings requires Python 3.10 or newer.
286
+
283
287
  ```python
284
288
  from glitchlings import Gaggle, SAMPLE_TEXT, Typogre, Mim1c, Reduple, Rushmore
285
289
 
286
290
  gaggle = Gaggle([
287
- Typogre(max_change_rate=0.03),
288
- Mim1c(replacement_rate=0.02),
291
+ Typogre(rate=0.03),
292
+ Mim1c(rate=0.02),
289
293
  Reduple(seed=404),
290
- Rushmore(max_deletion_rate=0.02),
294
+ Rushmore(rate=0.02),
291
295
  ])
292
296
 
293
297
  print(gaggle(SAMPLE_TEXT))
@@ -295,41 +299,10 @@ print(gaggle(SAMPLE_TEXT))
295
299
 
296
300
  > Onҽ m‎ھ‎rning, wһen Gregor Samƽa woke from trouble𝐝 𝑑reams, he found himself transformed in his bed into a horrible vermin‎٠‎ He l lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightlh domed and divided by arches ino stiff sections. The bedding was adly able to cover it and and seemed ready to slide off any moment. His many legxs, pitifully thin compared with the size of the the rest of him, waved about helplessly ashe looked looked.
297
301
 
298
- ## Usage
299
-
300
- Need detailed usage patterns, dataset workflows, or tips for enabling the
301
- Rust accelerator? Consult the [Glitchlings Usage Guide](docs/index.md)
302
- for end-to-end instructions spanning the Python API, CLI, Hugging Face
302
+ Consult the [Glitchlings Usage Guide](docs/index.md)
303
+ for end-to-end instructions spanning the Python API, CLI, HuggingFace and Prime Intellect
303
304
  integrations, and the feature-flagged Rust pipeline.
304
305
 
305
- ### Prime Intellect environments
306
-
307
- After `pip install -e .[prime]`, the `glitchlings.dlc.prime.load_environment` helper mirrors `verifiers.load_environment` for Prime Intellect scenarios while optionally applying glitchlings before returning the environment:
308
-
309
- ```python
310
- from glitchlings import Mim1c, Typogre
311
- from glitchlings.dlc.prime import echo_chamber, load_environment
312
-
313
- env = load_environment(
314
- "osoleve/syllabify-en",
315
- glitchlings=[Mim1c(replacement_rate=0.01), Typogre(max_change_rate=0.02)],
316
- seed=404,
317
- )
318
-
319
- # Spin up an echo chamber that corrupts a dataset column and
320
- # rewards models for perfectly restoring it
321
- practice_env = echo_chamber(
322
- "osoleve/clean-room",
323
- column="text",
324
- glitchlings=["Typogre", "Mim1c"],
325
- reward_function=lambda prompt, completion, answer: float(completion == answer),
326
- )
327
- ```
328
-
329
- Skip the `glitchlings` argument to receive an untouched verifier dataset, and
330
- override `reward_function` when you want to evaluate completions with a custom
331
- scoring routine.
332
-
333
306
  ## Motivation
334
307
 
335
308
  If your model performs well on a particular task, but not when `Glitchling`s are present, it's a sign that it hasn't actually generalized to the problem.
@@ -344,8 +317,8 @@ Glitchlings are standard Python classes, so you can instantiate them with whatev
344
317
  ```python
345
318
  from glitchlings import Gaggle, Typogre, Mim1c
346
319
 
347
- custom_typogre = Typogre(max_change_rate=0.1)
348
- selective_mimic = Mim1c(replacement_rate=0.05, classes=["LATIN", "GREEK"])
320
+ custom_typogre = Typogre(rate=0.1)
321
+ selective_mimic = Mim1c(rate=0.05, classes=["LATIN", "GREEK"])
349
322
 
350
323
  gaggle = Gaggle([custom_typogre, selective_mimic], seed=99)
351
324
  print(gaggle("Summoned heroes do not fear the glitch."))
@@ -376,7 +349,7 @@ glitchlings --list
376
349
  glitchlings -g typogre --file documents/report.txt --diff
377
350
 
378
351
  # Configure glitchlings inline by passing keyword arguments.
379
- glitchlings -g "Typogre(max_change_rate=0.05)" "Ghouls just wanna have fun"
352
+ glitchlings -g "Typogre(rate=0.05)" "Ghouls just wanna have fun"
380
353
 
381
354
  # Pipe text straight into the CLI for an on-the-fly corruption.
382
355
  echo "Beware LLM-written flavor-text" | glitchlings -g mim1c
@@ -400,7 +373,7 @@ _What a nice word, would be a shame if something happened to it._
400
373
  >
401
374
  > Args
402
375
  >
403
- > - `max_change_rate (float)`: The maximum number of edits to make as a percentage of the length (default: 0.02, 2%).
376
+ > - `rate (float)`: The maximum number of edits to make as a percentage of the length (default: 0.02, 2%).
404
377
  > - `keyboard (str)`: Keyboard layout key-neighbor map to use (default: "CURATOR_QWERTY"; also accepts "QWERTY", "DVORAK", "COLEMAK", and "AZERTY").
405
378
  > - `seed (int)`: The random seed for reproducibility (default: 151).
406
379
 
@@ -412,7 +385,7 @@ _Wait, was that...?_
412
385
  >
413
386
  > Args
414
387
  >
415
- > - `replacement_rate (float)`: The maximum proportion of characters to replace (default: 0.02, 2%).
388
+ > - `rate (float)`: The maximum proportion of characters to replace (default: 0.02, 2%).
416
389
  > - `classes (list[str] | "all")`: Restrict replacements to these Unicode script classes (default: ["LATIN", "GREEK", "CYRILLIC"]).
417
390
  > - `banned_characters (Collection[str])`: Characters that must never appear as replacements (default: none).
418
391
  > - `seed (int)`: The random seed for reproducibility (default: 151).
@@ -425,7 +398,7 @@ _How can a computer need reading glasses?_
425
398
  >
426
399
  > Args
427
400
  >
428
- > - `error_rate (float)`: The maximum proportion of eligible confusion spans to replace (default: 0.02, 2%).
401
+ > - `rate (float)`: The maximum proportion of eligible confusion spans to replace (default: 0.02, 2%).
429
402
  > - `seed (int)`: The random seed for reproducibility (default: 151).
430
403
 
431
404
  ### Jargoyle
@@ -436,7 +409,7 @@ _Uh oh. The worst person you know just bought a thesaurus._
436
409
  >
437
410
  > Args
438
411
  >
439
- > - `replacement_rate (float)`: The maximum proportion of words to replace (default: 0.1, 10%).
412
+ > - `rate (float)`: The maximum proportion of words to replace (default: 0.1, 10%).
440
413
  > - `part_of_speech`: The WordNet part(s) of speech to target (default: nouns). Accepts `wn.NOUN`, `wn.VERB`, `wn.ADJ`, `wn.ADV`, any iterable of those tags, or the string `"any"` to include them all.
441
414
  > - `seed (int)`: The random seed for reproducibility (default: 151).
442
415
 
@@ -448,7 +421,7 @@ _Did you say that or did I?_
448
421
  >
449
422
  > Args
450
423
  >
451
- > - `reduplication_rate (float)`: The maximum proportion of words to reduplicate (default: 0.05, 5%).
424
+ > - `rate (float)`: The maximum proportion of words to reduplicate (default: 0.05, 5%).
452
425
  > - `seed (int)`: The random seed for reproducibility (default: 151).
453
426
 
454
427
  ### Rushmore
@@ -459,7 +432,7 @@ _I accidentally an entire word._
459
432
  >
460
433
  > Args
461
434
  >
462
- > - `max_deletion_rate (float)`: The maximum proportion of words to delete (default: 0.01, 1%).
435
+ > - `rate (float)`: The maximum proportion of words to delete (default: 0.01, 1%).
463
436
  > - `seed (int)`: The random seed for reproducibility (default: 151).
464
437
 
465
438
  ### Redactyl
@@ -471,7 +444,7 @@ _Oops, that was my black highlighter._
471
444
  > ### Args
472
445
  >
473
446
  > - `replacement_char (str)`: The character to use for redaction (default: █).
474
- > - `redaction_rate (float)`: The maximum proportion of words to redact (default: 0.05, 5%).
447
+ > - `rate (float)`: The maximum proportion of words to redact (default: 0.05, 5%).
475
448
  > - `merge_adjacent (bool)`: Whether to redact the space between adjacent redacted words (default: False).
476
449
  > - `seed (int)`: The random seed for reproducibility (default: 151).
477
450
 
@@ -36,14 +36,16 @@ After all, what good is general intelligence if it can't handle a little chaos?
36
36
  pip install -U glitchlings
37
37
  ```
38
38
 
39
+ > Glitchlings requires Python 3.10 or newer.
40
+
39
41
  ```python
40
42
  from glitchlings import Gaggle, SAMPLE_TEXT, Typogre, Mim1c, Reduple, Rushmore
41
43
 
42
44
  gaggle = Gaggle([
43
- Typogre(max_change_rate=0.03),
44
- Mim1c(replacement_rate=0.02),
45
+ Typogre(rate=0.03),
46
+ Mim1c(rate=0.02),
45
47
  Reduple(seed=404),
46
- Rushmore(max_deletion_rate=0.02),
48
+ Rushmore(rate=0.02),
47
49
  ])
48
50
 
49
51
  print(gaggle(SAMPLE_TEXT))
@@ -51,41 +53,10 @@ print(gaggle(SAMPLE_TEXT))
51
53
 
52
54
  > Onҽ m‎ھ‎rning, wһen Gregor Samƽa woke from trouble𝐝 𝑑reams, he found himself transformed in his bed into a horrible vermin‎٠‎ He l lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightlh domed and divided by arches ino stiff sections. The bedding was adly able to cover it and and seemed ready to slide off any moment. His many legxs, pitifully thin compared with the size of the the rest of him, waved about helplessly ashe looked looked.
53
55
 
54
- ## Usage
55
-
56
- Need detailed usage patterns, dataset workflows, or tips for enabling the
57
- Rust accelerator? Consult the [Glitchlings Usage Guide](docs/index.md)
58
- for end-to-end instructions spanning the Python API, CLI, Hugging Face
56
+ Consult the [Glitchlings Usage Guide](docs/index.md)
57
+ for end-to-end instructions spanning the Python API, CLI, HuggingFace and Prime Intellect
59
58
  integrations, and the feature-flagged Rust pipeline.
60
59
 
61
- ### Prime Intellect environments
62
-
63
- After `pip install -e .[prime]`, the `glitchlings.dlc.prime.load_environment` helper mirrors `verifiers.load_environment` for Prime Intellect scenarios while optionally applying glitchlings before returning the environment:
64
-
65
- ```python
66
- from glitchlings import Mim1c, Typogre
67
- from glitchlings.dlc.prime import echo_chamber, load_environment
68
-
69
- env = load_environment(
70
- "osoleve/syllabify-en",
71
- glitchlings=[Mim1c(replacement_rate=0.01), Typogre(max_change_rate=0.02)],
72
- seed=404,
73
- )
74
-
75
- # Spin up an echo chamber that corrupts a dataset column and
76
- # rewards models for perfectly restoring it
77
- practice_env = echo_chamber(
78
- "osoleve/clean-room",
79
- column="text",
80
- glitchlings=["Typogre", "Mim1c"],
81
- reward_function=lambda prompt, completion, answer: float(completion == answer),
82
- )
83
- ```
84
-
85
- Skip the `glitchlings` argument to receive an untouched verifier dataset, and
86
- override `reward_function` when you want to evaluate completions with a custom
87
- scoring routine.
88
-
89
60
  ## Motivation
90
61
 
91
62
  If your model performs well on a particular task, but not when `Glitchling`s are present, it's a sign that it hasn't actually generalized to the problem.
@@ -100,8 +71,8 @@ Glitchlings are standard Python classes, so you can instantiate them with whatev
100
71
  ```python
101
72
  from glitchlings import Gaggle, Typogre, Mim1c
102
73
 
103
- custom_typogre = Typogre(max_change_rate=0.1)
104
- selective_mimic = Mim1c(replacement_rate=0.05, classes=["LATIN", "GREEK"])
74
+ custom_typogre = Typogre(rate=0.1)
75
+ selective_mimic = Mim1c(rate=0.05, classes=["LATIN", "GREEK"])
105
76
 
106
77
  gaggle = Gaggle([custom_typogre, selective_mimic], seed=99)
107
78
  print(gaggle("Summoned heroes do not fear the glitch."))
@@ -132,7 +103,7 @@ glitchlings --list
132
103
  glitchlings -g typogre --file documents/report.txt --diff
133
104
 
134
105
  # Configure glitchlings inline by passing keyword arguments.
135
- glitchlings -g "Typogre(max_change_rate=0.05)" "Ghouls just wanna have fun"
106
+ glitchlings -g "Typogre(rate=0.05)" "Ghouls just wanna have fun"
136
107
 
137
108
  # Pipe text straight into the CLI for an on-the-fly corruption.
138
109
  echo "Beware LLM-written flavor-text" | glitchlings -g mim1c
@@ -156,7 +127,7 @@ _What a nice word, would be a shame if something happened to it._
156
127
  >
157
128
  > Args
158
129
  >
159
- > - `max_change_rate (float)`: The maximum number of edits to make as a percentage of the length (default: 0.02, 2%).
130
+ > - `rate (float)`: The maximum number of edits to make as a percentage of the length (default: 0.02, 2%).
160
131
  > - `keyboard (str)`: Keyboard layout key-neighbor map to use (default: "CURATOR_QWERTY"; also accepts "QWERTY", "DVORAK", "COLEMAK", and "AZERTY").
161
132
  > - `seed (int)`: The random seed for reproducibility (default: 151).
162
133
 
@@ -168,7 +139,7 @@ _Wait, was that...?_
168
139
  >
169
140
  > Args
170
141
  >
171
- > - `replacement_rate (float)`: The maximum proportion of characters to replace (default: 0.02, 2%).
142
+ > - `rate (float)`: The maximum proportion of characters to replace (default: 0.02, 2%).
172
143
  > - `classes (list[str] | "all")`: Restrict replacements to these Unicode script classes (default: ["LATIN", "GREEK", "CYRILLIC"]).
173
144
  > - `banned_characters (Collection[str])`: Characters that must never appear as replacements (default: none).
174
145
  > - `seed (int)`: The random seed for reproducibility (default: 151).
@@ -181,7 +152,7 @@ _How can a computer need reading glasses?_
181
152
  >
182
153
  > Args
183
154
  >
184
- > - `error_rate (float)`: The maximum proportion of eligible confusion spans to replace (default: 0.02, 2%).
155
+ > - `rate (float)`: The maximum proportion of eligible confusion spans to replace (default: 0.02, 2%).
185
156
  > - `seed (int)`: The random seed for reproducibility (default: 151).
186
157
 
187
158
  ### Jargoyle
@@ -192,7 +163,7 @@ _Uh oh. The worst person you know just bought a thesaurus._
192
163
  >
193
164
  > Args
194
165
  >
195
- > - `replacement_rate (float)`: The maximum proportion of words to replace (default: 0.1, 10%).
166
+ > - `rate (float)`: The maximum proportion of words to replace (default: 0.1, 10%).
196
167
  > - `part_of_speech`: The WordNet part(s) of speech to target (default: nouns). Accepts `wn.NOUN`, `wn.VERB`, `wn.ADJ`, `wn.ADV`, any iterable of those tags, or the string `"any"` to include them all.
197
168
  > - `seed (int)`: The random seed for reproducibility (default: 151).
198
169
 
@@ -204,7 +175,7 @@ _Did you say that or did I?_
204
175
  >
205
176
  > Args
206
177
  >
207
- > - `reduplication_rate (float)`: The maximum proportion of words to reduplicate (default: 0.05, 5%).
178
+ > - `rate (float)`: The maximum proportion of words to reduplicate (default: 0.05, 5%).
208
179
  > - `seed (int)`: The random seed for reproducibility (default: 151).
209
180
 
210
181
  ### Rushmore
@@ -215,7 +186,7 @@ _I accidentally an entire word._
215
186
  >
216
187
  > Args
217
188
  >
218
- > - `max_deletion_rate (float)`: The maximum proportion of words to delete (default: 0.01, 1%).
189
+ > - `rate (float)`: The maximum proportion of words to delete (default: 0.01, 1%).
219
190
  > - `seed (int)`: The random seed for reproducibility (default: 151).
220
191
 
221
192
  ### Redactyl
@@ -227,7 +198,7 @@ _Oops, that was my black highlighter._
227
198
  > ### Args
228
199
  >
229
200
  > - `replacement_char (str)`: The character to use for redaction (default: █).
230
- > - `redaction_rate (float)`: The maximum proportion of words to redact (default: 0.05, 5%).
201
+ > - `rate (float)`: The maximum proportion of words to redact (default: 0.05, 5%).
231
202
  > - `merge_adjacent (bool)`: Whether to redact the space between adjacent redacted words (default: False).
232
203
  > - `seed (int)`: The random seed for reproducibility (default: 151).
233
204
 
@@ -1,9 +1,9 @@
1
1
  [project]
2
2
  name = "glitchlings"
3
- version = "0.2.2"
3
+ version = "0.2.3"
4
4
  description = "Monsters for your language games."
5
5
  readme = "README.md"
6
- requires-python = ">=3.12"
6
+ requires-python = ">=3.10"
7
7
 
8
8
  dependencies = [
9
9
  "confusable-homoglyphs>=3.3.1",
@@ -22,6 +22,8 @@ classifiers = [
22
22
  "License :: OSI Approved :: Apache Software License",
23
23
  "Programming Language :: Python",
24
24
  "Programming Language :: Python :: 3",
25
+ "Programming Language :: Python :: 3.10",
26
+ "Programming Language :: Python :: 3.11",
25
27
  "Programming Language :: Python :: 3.12",
26
28
  "Programming Language :: Rust",
27
29
  "Operating System :: MacOS :: MacOS X",
@@ -23,7 +23,13 @@ fn configured_python() -> Option<OsString> {
23
23
  }
24
24
 
25
25
  fn detect_python() -> Option<OsString> {
26
- const CANDIDATES: &[&str] = &["python3.12", "python3", "python"];
26
+ const CANDIDATES: &[&str] = &[
27
+ "python3.12",
28
+ "python3.11",
29
+ "python3.10",
30
+ "python3",
31
+ "python",
32
+ ];
27
33
 
28
34
  for candidate in CANDIDATES {
29
35
  let status = Command::new(candidate)
@@ -79,8 +79,8 @@ def tutorial_level(
79
79
  ) -> vf.Environment:
80
80
  """Create a low-corruption environment using tuned defaults."""
81
81
 
82
- tuned_mim1c = Mim1c(replacement_rate=0.01 * difficulty.value)
83
- tuned_typogre = Typogre(max_change_rate=0.025 * difficulty.value)
82
+ tuned_mim1c = Mim1c(rate=0.01 * difficulty.value)
83
+ tuned_typogre = Typogre(rate=0.025 * difficulty.value)
84
84
 
85
85
  return load_environment(
86
86
  env,
@@ -220,32 +220,54 @@ def echo_chamber(
220
220
  "Specify which split to use when the dataset loads as a DatasetDict."
221
221
  )
222
222
 
223
- prompts: list[list[dict[str, str]]] = []
224
- answers: list[str] = []
223
+ filtered_dataset = hf_dataset.filter(
224
+ lambda row: row.get(column) is not None,
225
+ load_from_cache_file=False,
226
+ )
225
227
 
226
- for row in hf_dataset:
227
- value = row.get(column)
228
- if value is None:
229
- continue
228
+ source_column_names = list(filtered_dataset.column_names)
230
229
 
231
- text = str(value)
232
- prompts.append(
233
- [
234
- {"role": "system", "content": instructions},
235
- {"role": "user", "content": f"Corrupted text:\n{text}"},
236
- ]
237
- )
238
- answers.append(text)
230
+ def _build_prompt(row: dict[str, Any]) -> dict[str, Any]:
231
+ text = str(row[column])
232
+ prompt = [
233
+ {"role": "system", "content": instructions},
234
+ {"role": "user", "content": f"Corrupted text:\n{text}"},
235
+ ]
236
+ return {"prompt": prompt, "answer": text}
239
237
 
240
- if not prompts:
241
- raise ValueError(
242
- f"Column '{column}' did not yield any textual entries in dataset '{dataset_id}'."
243
- )
238
+ base_dataset = filtered_dataset.map(
239
+ _build_prompt,
240
+ remove_columns=source_column_names,
241
+ load_from_cache_file=False,
242
+ )
244
243
 
245
- dataset = HFDataset.from_dict({"prompt": prompts, "answer": answers})
244
+ try:
245
+ dataset_length = len(base_dataset) # type: ignore[arg-type]
246
+ except TypeError:
247
+ preview_rows: list[dict[str, Any]]
248
+ take_fn = getattr(base_dataset, "take", None)
249
+ if callable(take_fn):
250
+ preview_rows = list(take_fn(1))
251
+ else:
252
+ iterator = iter(base_dataset)
253
+ try:
254
+ first_row = next(iterator)
255
+ except StopIteration:
256
+ preview_rows = []
257
+ else:
258
+ preview_rows = [first_row]
259
+ if not preview_rows:
260
+ raise ValueError(
261
+ f"Column '{column}' did not yield any textual entries in dataset '{dataset_id}'."
262
+ )
263
+ else:
264
+ if dataset_length == 0:
265
+ raise ValueError(
266
+ f"Column '{column}' did not yield any textual entries in dataset '{dataset_id}'."
267
+ )
246
268
 
247
269
  gaggle = _as_gaggle(glitchlings, seed=seed)
248
- glitched_dataset = gaggle.corrupt_dataset(dataset, ["prompt"])
270
+ glitched_dataset = gaggle.corrupt_dataset(base_dataset, ["prompt"])
249
271
 
250
272
  rubric_func = reward_function or symmetric_damerau_levenshtein_similarity
251
273
  rubric = vf.Rubric(funcs=[rubric_func], weights=[1.0])
@@ -46,7 +46,7 @@ def build_parser() -> argparse.ArgumentParser:
46
46
  metavar="SPEC",
47
47
  help=(
48
48
  "Glitchling to apply, optionally with parameters like "
49
- "Typogre(max_change_rate=0.05). Repeat for multiples; defaults to all built-ins."
49
+ "Typogre(rate=0.05). Repeat for multiples; defaults to all built-ins."
50
50
  ),
51
51
  )
52
52
  parser.add_argument(
@@ -0,0 +1,21 @@
1
+ from __future__ import annotations
2
+
3
+
4
+ def resolve_rate(
5
+ *,
6
+ rate: float | None,
7
+ legacy_value: float | None,
8
+ default: float,
9
+ legacy_name: str,
10
+ ) -> float:
11
+ """Return the effective rate while enforcing mutual exclusivity."""
12
+
13
+ if rate is not None and legacy_value is not None:
14
+ raise ValueError(
15
+ f"Specify either 'rate' or '{legacy_name}', not both."
16
+ )
17
+ if rate is not None:
18
+ return rate
19
+ if legacy_value is not None:
20
+ return legacy_value
21
+ return default
@@ -107,6 +107,7 @@ class Glitchling:
107
107
  scope: AttackWave,
108
108
  order: AttackOrder = AttackOrder.NORMAL,
109
109
  seed: int | None = None,
110
+ pipeline_operation: Callable[["Glitchling"], dict[str, Any] | None] | None = None,
110
111
  **kwargs: Any,
111
112
  ) -> None:
112
113
  """Initialize a glitchling.
@@ -128,31 +129,76 @@ class Glitchling:
128
129
  self.corruption_function: CorruptionCallable = corruption_function
129
130
  self.level: AttackWave = scope
130
131
  self.order: AttackOrder = order
132
+ self._pipeline_descriptor_factory = pipeline_operation
131
133
  self.kwargs: dict[str, Any] = {}
134
+ self._cached_rng_callable: CorruptionCallable | None = None
135
+ self._cached_rng_expectation: bool | None = None
132
136
  for kw, val in kwargs.items():
133
137
  self.set_param(kw, val)
134
138
 
135
139
  def set_param(self, key: str, value: Any) -> None:
136
140
  """Persist a parameter for use by the corruption callable."""
137
141
 
138
- setattr(self, key, value)
139
- self.kwargs[key] = value
140
- if key == "seed":
142
+ aliases = getattr(self, "_param_aliases", {})
143
+ canonical = aliases.get(key, key)
144
+
145
+ # Drop stale alias keys so we only forward canonical kwargs.
146
+ self.kwargs.pop(key, None)
147
+ for alias, target in aliases.items():
148
+ if target == canonical:
149
+ self.kwargs.pop(alias, None)
150
+
151
+ self.kwargs[canonical] = value
152
+ setattr(self, canonical, value)
153
+
154
+ if canonical == "seed":
141
155
  self.reset_rng(value)
142
156
 
143
- def __corrupt(self, text: str, *args: Any, **kwargs: Any) -> str:
144
- """Execute the corruption callable, injecting the RNG when required."""
157
+ for alias, target in aliases.items():
158
+ if target == canonical:
159
+ setattr(self, alias, value)
145
160
 
146
- # Pass rng to underlying corruption function if it expects it.
161
+ def pipeline_operation(self) -> dict[str, Any] | None:
162
+ """Return the Rust pipeline operation descriptor for this glitchling."""
163
+
164
+ factory = self._pipeline_descriptor_factory
165
+ if factory is None:
166
+ return None
167
+
168
+ return factory(self)
169
+
170
+ def _corruption_expects_rng(self) -> bool:
171
+ """Return `True` when the corruption function accepts an rng keyword."""
172
+
173
+ cached_callable = self._cached_rng_callable
174
+ cached_expectation = self._cached_rng_expectation
175
+ corruption_function = self.corruption_function
176
+
177
+ if (
178
+ cached_callable is corruption_function
179
+ and cached_expectation is not None
180
+ ):
181
+ return cached_expectation
182
+
183
+ expects_rng = False
147
184
  try:
148
- signature = inspect.signature(self.corruption_function)
185
+ signature = inspect.signature(corruption_function)
149
186
  except (TypeError, ValueError):
150
187
  signature = None
151
188
 
152
- expects_rng = False
153
189
  if signature is not None:
154
190
  expects_rng = "rng" in signature.parameters
155
191
 
192
+ self._cached_rng_callable = corruption_function
193
+ self._cached_rng_expectation = expects_rng
194
+ return expects_rng
195
+
196
+ def __corrupt(self, text: str, *args: Any, **kwargs: Any) -> str:
197
+ """Execute the corruption callable, injecting the RNG when required."""
198
+
199
+ # Pass rng to underlying corruption function if it expects it.
200
+ expects_rng = self._corruption_expects_rng()
201
+
156
202
  if expects_rng:
157
203
  corrupted = self.corruption_function(text, *args, rng=self.rng, **kwargs)
158
204
  else:
@@ -231,53 +277,14 @@ class Glitchling:
231
277
  self.corruption_function,
232
278
  self.level,
233
279
  self.order,
280
+ pipeline_operation=self._pipeline_descriptor_factory,
234
281
  **filtered_kwargs,
235
282
  )
236
283
 
237
284
  return cls(**filtered_kwargs)
238
285
 
239
286
 
240
- def _pipeline_operation_reduplicate(glitchling: "Glitchling") -> dict[str, Any] | None:
241
- rate = glitchling.kwargs.get("reduplication_rate")
242
- if rate is None:
243
- return None
244
- return {"type": "reduplicate", "reduplication_rate": float(rate)}
245
-
246
287
 
247
- def _pipeline_operation_delete(glitchling: "Glitchling") -> dict[str, Any] | None:
248
- rate = glitchling.kwargs.get("max_deletion_rate")
249
- if rate is None:
250
- return None
251
- return {"type": "delete", "max_deletion_rate": float(rate)}
252
-
253
-
254
- def _pipeline_operation_redact(glitchling: "Glitchling") -> dict[str, Any] | None:
255
- replacement_char = glitchling.kwargs.get("replacement_char")
256
- redaction_rate = glitchling.kwargs.get("redaction_rate")
257
- merge_adjacent = glitchling.kwargs.get("merge_adjacent")
258
- if replacement_char is None or redaction_rate is None or merge_adjacent is None:
259
- return None
260
- return {
261
- "type": "redact",
262
- "replacement_char": str(replacement_char),
263
- "redaction_rate": float(redaction_rate),
264
- "merge_adjacent": bool(merge_adjacent),
265
- }
266
-
267
-
268
- def _pipeline_operation_ocr(glitchling: "Glitchling") -> dict[str, Any] | None:
269
- error_rate = glitchling.kwargs.get("error_rate")
270
- if error_rate is None:
271
- return None
272
- return {"type": "ocr", "error_rate": float(error_rate)}
273
-
274
-
275
- _PIPELINE_OPERATION_BUILDERS: dict[str, Callable[["Glitchling"], dict[str, Any] | None]] = {
276
- "Reduple": _pipeline_operation_reduplicate,
277
- "Rushmore": _pipeline_operation_delete,
278
- "Redactyl": _pipeline_operation_redact,
279
- "Scannequin": _pipeline_operation_ocr,
280
- }
281
288
 
282
289
 
283
290
  class Gaggle(Glitchling):
@@ -359,10 +366,7 @@ class Gaggle(Glitchling):
359
366
 
360
367
  descriptors: list[dict[str, Any]] = []
361
368
  for glitchling in self.apply_order:
362
- builder = _PIPELINE_OPERATION_BUILDERS.get(glitchling.name)
363
- if builder is None:
364
- return None
365
- operation = builder(glitchling)
369
+ operation = glitchling.pipeline_operation()
366
370
  if operation is None:
367
371
  return None
368
372