glitchlings 0.4.1__tar.gz → 0.4.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of glitchlings might be problematic. Click here for more details.

Files changed (92)
  1. {glitchlings-0.4.1 → glitchlings-0.4.3}/MANIFEST.in +1 -1
  2. {glitchlings-0.4.1 → glitchlings-0.4.3}/PKG-INFO +101 -4
  3. {glitchlings-0.4.1 → glitchlings-0.4.3}/README.md +68 -3
  4. glitchlings-0.4.3/pyproject.toml +186 -0
  5. {glitchlings-0.4.1 → glitchlings-0.4.3}/rust/Cargo.lock +55 -0
  6. {glitchlings-0.4.1 → glitchlings-0.4.3}/rust/zoo/Cargo.toml +1 -0
  7. {glitchlings-0.4.1 → glitchlings-0.4.3}/rust/zoo/build.rs +47 -31
  8. {glitchlings-0.4.1 → glitchlings-0.4.3}/rust/zoo/src/glitch_ops.rs +154 -2
  9. {glitchlings-0.4.1 → glitchlings-0.4.3}/rust/zoo/src/lib.rs +14 -2
  10. {glitchlings-0.4.1 → glitchlings-0.4.3}/rust/zoo/src/resources.rs +35 -1
  11. {glitchlings-0.4.1 → glitchlings-0.4.3}/rust/zoo/src/text_buffer.rs +2 -8
  12. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/__init__.py +30 -17
  13. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/__main__.py +0 -1
  14. glitchlings-0.4.3/src/glitchlings/compat.py +284 -0
  15. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/config.py +164 -34
  16. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/config.toml +1 -1
  17. glitchlings-0.4.3/src/glitchlings/dlc/__init__.py +7 -0
  18. glitchlings-0.4.3/src/glitchlings/dlc/_shared.py +68 -0
  19. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/dlc/huggingface.py +26 -41
  20. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/dlc/prime.py +64 -101
  21. glitchlings-0.4.3/src/glitchlings/dlc/pytorch.py +216 -0
  22. glitchlings-0.4.3/src/glitchlings/dlc/pytorch_lightning.py +233 -0
  23. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/lexicon/__init__.py +12 -33
  24. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/lexicon/_cache.py +21 -22
  25. glitchlings-0.4.3/src/glitchlings/lexicon/data/default_vector_cache.json +82 -0
  26. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/lexicon/metrics.py +1 -8
  27. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/lexicon/vector.py +109 -49
  28. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/lexicon/wordnet.py +89 -49
  29. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/main.py +30 -24
  30. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/util/__init__.py +18 -4
  31. glitchlings-0.4.3/src/glitchlings/util/adapters.py +27 -0
  32. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/zoo/__init__.py +26 -15
  33. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/zoo/_ocr_confusions.py +1 -3
  34. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/zoo/_rate.py +1 -4
  35. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/zoo/_sampling.py +0 -1
  36. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/zoo/_text_utils.py +1 -5
  37. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/zoo/adjax.py +2 -4
  38. glitchlings-0.4.3/src/glitchlings/zoo/apostrofae.py +128 -0
  39. glitchlings-0.4.3/src/glitchlings/zoo/assets/__init__.py +0 -0
  40. glitchlings-0.4.3/src/glitchlings/zoo/assets/apostrofae_pairs.json +32 -0
  41. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/zoo/core.py +152 -87
  42. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/zoo/jargoyle.py +50 -45
  43. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/zoo/mim1c.py +11 -10
  44. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/zoo/redactyl.py +16 -16
  45. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/zoo/reduple.py +5 -3
  46. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/zoo/rushmore.py +4 -10
  47. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/zoo/scannequin.py +7 -6
  48. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/zoo/typogre.py +8 -9
  49. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/zoo/zeedub.py +6 -3
  50. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings.egg-info/PKG-INFO +101 -4
  51. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings.egg-info/SOURCES.txt +8 -24
  52. glitchlings-0.4.3/src/glitchlings.egg-info/requires.txt +58 -0
  53. glitchlings-0.4.1/pyproject.toml +0 -87
  54. glitchlings-0.4.1/src/glitchlings/dlc/__init__.py +0 -5
  55. glitchlings-0.4.1/src/glitchlings/lexicon/data/default_vector_cache.json +0 -16
  56. glitchlings-0.4.1/src/glitchlings/lexicon/graph.py +0 -290
  57. glitchlings-0.4.1/src/glitchlings.egg-info/requires.txt +0 -22
  58. glitchlings-0.4.1/tests/test_benchmarks.py +0 -137
  59. glitchlings-0.4.1/tests/test_cli.py +0 -369
  60. glitchlings-0.4.1/tests/test_config.py +0 -196
  61. glitchlings-0.4.1/tests/test_dataset_corruption.py +0 -128
  62. glitchlings-0.4.1/tests/test_gaggle.py +0 -68
  63. glitchlings-0.4.1/tests/test_glitchling_core.py +0 -68
  64. glitchlings-0.4.1/tests/test_glitchlings_determinism.py +0 -103
  65. glitchlings-0.4.1/tests/test_graph_lexicon.py +0 -81
  66. glitchlings-0.4.1/tests/test_huggingface_dlc.py +0 -78
  67. glitchlings-0.4.1/tests/test_jargoyle.py +0 -209
  68. glitchlings-0.4.1/tests/test_keyboard_layouts.py +0 -42
  69. glitchlings-0.4.1/tests/test_lexicon_backends.py +0 -85
  70. glitchlings-0.4.1/tests/test_lexicon_config.py +0 -56
  71. glitchlings-0.4.1/tests/test_lexicon_metrics.py +0 -120
  72. glitchlings-0.4.1/tests/test_parameter_effects.py +0 -281
  73. glitchlings-0.4.1/tests/test_pipeline_operations.py +0 -95
  74. glitchlings-0.4.1/tests/test_prime_echo_chamber.py +0 -390
  75. glitchlings-0.4.1/tests/test_property_based.py +0 -150
  76. glitchlings-0.4.1/tests/test_rate_and_sampling.py +0 -51
  77. glitchlings-0.4.1/tests/test_rust_backed_glitchlings.py +0 -931
  78. glitchlings-0.4.1/tests/test_text_utils.py +0 -37
  79. glitchlings-0.4.1/tests/test_util.py +0 -35
  80. glitchlings-0.4.1/tests/test_vector_lexicon.py +0 -438
  81. {glitchlings-0.4.1 → glitchlings-0.4.3}/LICENSE +0 -0
  82. {glitchlings-0.4.1 → glitchlings-0.4.3}/rust/Cargo.toml +0 -0
  83. {glitchlings-0.4.1 → glitchlings-0.4.3}/rust/zoo/assets/ocr_confusions.tsv +0 -0
  84. {glitchlings-0.4.1 → glitchlings-0.4.3}/rust/zoo/src/pipeline.rs +0 -0
  85. {glitchlings-0.4.1 → glitchlings-0.4.3}/rust/zoo/src/rng.rs +0 -0
  86. {glitchlings-0.4.1 → glitchlings-0.4.3}/rust/zoo/src/typogre.rs +0 -0
  87. {glitchlings-0.4.1 → glitchlings-0.4.3}/rust/zoo/src/zeedub.rs +0 -0
  88. {glitchlings-0.4.1 → glitchlings-0.4.3}/setup.cfg +0 -0
  89. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings/zoo/ocr_confusions.tsv +0 -0
  90. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings.egg-info/dependency_links.txt +0 -0
  91. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings.egg-info/entry_points.txt +0 -0
  92. {glitchlings-0.4.1 → glitchlings-0.4.3}/src/glitchlings.egg-info/top_level.txt +0 -0
@@ -1,5 +1,5 @@
1
1
  recursive-include rust *.rs *.toml *.lock *.tsv
2
- recursive-include src/glitchlings/zoo *.tsv
2
+ recursive-include src/glitchlings/zoo *.tsv *.json
3
3
  recursive-include src/glitchlings/lexicon/data *.json
4
4
  include src/glitchlings/config.toml
5
5
  prune rust/target
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: glitchlings
3
- Version: 0.4.1
3
+ Version: 0.4.3
4
4
  Summary: Monsters for your language games.
5
5
  Author: osoleve
6
6
  License: Apache License
@@ -226,19 +226,51 @@ License-File: LICENSE
226
226
  Requires-Dist: confusable-homoglyphs>=3.3.1
227
227
  Requires-Dist: tomli>=2.0.1; python_version < "3.11"
228
228
  Requires-Dist: pyyaml>=6.0.0
229
+ Provides-Extra: all
230
+ Requires-Dist: black>=24.4.0; extra == "all"
231
+ Requires-Dist: hypothesis>=6.140.0; extra == "all"
232
+ Requires-Dist: interrogate>=1.5.0; extra == "all"
233
+ Requires-Dist: jellyfish>=1.2.0; extra == "all"
234
+ Requires-Dist: isort>=5.13.0; extra == "all"
235
+ Requires-Dist: mkdocs>=1.6.0; extra == "all"
236
+ Requires-Dist: mkdocs-material>=9.5.0; extra == "all"
237
+ Requires-Dist: mkdocstrings[python]>=0.24.0; extra == "all"
238
+ Requires-Dist: mkdocstrings-python>=1.10.0; extra == "all"
239
+ Requires-Dist: mypy>=1.8.0; extra == "all"
240
+ Requires-Dist: numpy<=2.0,>=1.24; extra == "all"
241
+ Requires-Dist: pre-commit>=3.8.0; extra == "all"
242
+ Requires-Dist: pytest>=8.0.0; extra == "all"
243
+ Requires-Dist: ruff>=0.6.0; extra == "all"
244
+ Requires-Dist: verifiers>=0.1.3.post0; extra == "all"
229
245
  Provides-Extra: hf
230
246
  Requires-Dist: datasets>=4.0.0; extra == "hf"
247
+ Provides-Extra: lightning
248
+ Requires-Dist: pytorch_lightning>=2.0.0; extra == "lightning"
231
249
  Provides-Extra: vectors
232
250
  Requires-Dist: numpy<=2.0,>=1.24; extra == "vectors"
233
251
  Requires-Dist: spacy>=3.7.2; extra == "vectors"
234
252
  Requires-Dist: gensim>=4.3.2; extra == "vectors"
253
+ Provides-Extra: st
254
+ Requires-Dist: sentence-transformers>=3.0.0; extra == "st"
235
255
  Provides-Extra: prime
236
256
  Requires-Dist: verifiers>=0.1.3.post0; extra == "prime"
237
257
  Requires-Dist: jellyfish>=1.2.0; extra == "prime"
258
+ Provides-Extra: torch
259
+ Requires-Dist: torch>=2.0.0; extra == "torch"
238
260
  Provides-Extra: dev
239
261
  Requires-Dist: pytest>=8.0.0; extra == "dev"
240
262
  Requires-Dist: hypothesis>=6.140.0; extra == "dev"
241
263
  Requires-Dist: numpy<=2.0,>=1.24; extra == "dev"
264
+ Requires-Dist: mkdocs>=1.6.0; extra == "dev"
265
+ Requires-Dist: mkdocstrings[python]>=0.24.0; extra == "dev"
266
+ Requires-Dist: mkdocs-material>=9.5.0; extra == "dev"
267
+ Requires-Dist: mkdocstrings-python>=1.10.0; extra == "dev"
268
+ Requires-Dist: interrogate>=1.5.0; extra == "dev"
269
+ Requires-Dist: black>=24.4.0; extra == "dev"
270
+ Requires-Dist: isort>=5.13.0; extra == "dev"
271
+ Requires-Dist: ruff>=0.6.0; extra == "dev"
272
+ Requires-Dist: mypy>=1.8.0; extra == "dev"
273
+ Requires-Dist: pre-commit>=3.8.0; extra == "dev"
242
274
  Dynamic: license-file
243
275
 
244
276
  #
@@ -297,7 +329,7 @@ print(gaggle(SAMPLE_TEXT))
297
329
  > Onҽ m‎ھ‎rning, wһen Gregor Samƽa woke from trouble𝐝 𝑑reams, he found himself transformed in his bed into a horrible vermin‎٠‎ He l lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightlh domed and divided by arches ino stiff sections. The bedding was adly able to cover it and and seemed ready to slide off any moment. His many legxs, pitifully thin compared with the size of the the rest of him, waved about helplessly ashe looked looked.
298
330
 
299
331
  Consult the [Glitchlings Usage Guide](docs/index.md)
300
- for end-to-end instructions spanning the Python API, CLI, HuggingFace and Prime Intellect
332
+ for end-to-end instructions spanning the Python API, CLI, HuggingFace, PyTorch, and Prime Intellect
301
333
  integrations, and the autodetected Rust pipeline (enabled whenever the extension is present).
302
334
 
303
335
  ## Motivation
@@ -338,10 +370,67 @@ They're horrible little gremlins, but they're not _unreasonable_.
338
370
 
339
371
  Keyboard warriors can challenge them directly via the `glitchlings` command:
340
372
 
373
+ <!-- BEGIN: CLI_USAGE -->
341
374
  ```bash
342
375
  # Discover which glitchlings are currently on the loose.
343
376
  glitchlings --list
377
+ ```
378
+
379
+ ```text
380
+ Typogre — scope: Character, order: early
381
+ Apostrofae — scope: Character, order: normal
382
+ Mim1c — scope: Character, order: last
383
+ Jargoyle — scope: Word, order: normal
384
+ Adjax — scope: Word, order: normal
385
+ Reduple — scope: Word, order: normal
386
+ Rushmore — scope: Word, order: normal
387
+ Redactyl — scope: Word, order: normal
388
+ Scannequin — scope: Character, order: late
389
+ Zeedub — scope: Character, order: last
390
+ ```
391
+
392
+ ```bash
393
+ # Review the full CLI contract.
394
+ glitchlings --help
395
+ ```
396
+
397
+ ```text
398
+ usage: glitchlings [-h] [-g SPEC] [-s SEED] [-f FILE] [--sample] [--diff]
399
+ [--list] [-c CONFIG]
400
+ [text]
401
+
402
+ Summon glitchlings to corrupt text. Provide input text as an argument, via
403
+ --file, or pipe it on stdin.
404
+
405
+ positional arguments:
406
+ text Text to corrupt. If omitted, stdin is used or --sample
407
+ provides fallback text.
408
+
409
+ options:
410
+ -h, --help show this help message and exit
411
+ -g SPEC, --glitchling SPEC
412
+ Glitchling to apply, optionally with parameters like
413
+ Typogre(rate=0.05). Repeat for multiples; defaults to
414
+ all built-ins.
415
+ -s SEED, --seed SEED Seed controlling deterministic corruption order
416
+ (default: 151).
417
+ -f FILE, --file FILE Read input text from a file instead of the command
418
+ line argument.
419
+ --sample Use the included SAMPLE_TEXT when no other input is
420
+ provided.
421
+ --diff Show a unified diff between the original and corrupted
422
+ text.
423
+ --list List available glitchlings and exit.
424
+ -c CONFIG, --config CONFIG
425
+ Load glitchlings from a YAML configuration file.
426
+ ```
427
+ <!-- END: CLI_USAGE -->
428
+
429
+ Run `python docs/build_cli_reference.py` whenever you tweak the CLI so the README stays in sync with the actual output. The script executes the commands above and replaces the block between the markers automatically.
344
430
 
431
+ Prefer inline tweaks? You can still configure glitchlings directly in the shell:
432
+
433
+ ```bash
345
434
  # Run Typogre against the contents of a file and inspect the diff.
346
435
  glitchlings -g typogre --file documents/report.txt --diff
347
436
 
@@ -355,8 +444,6 @@ echo "Beware LLM-written flavor-text" | glitchlings -g mim1c
355
444
  glitchlings --config experiments/chaos.yaml "Let slips the glitchlings of war"
356
445
  ```
357
446
 
358
- Use `--help` for a complete breakdown of available options, including support for parameterised glitchlings via `-g "Name(arg=value, ...)"` to mirror the Python API.
359
-
360
447
  Attack configurations live in plain YAML files so you can version-control experiments without touching code:
361
448
 
362
449
  ```yaml
@@ -394,6 +481,16 @@ _What a nice word, would be a shame if something happened to it._
394
481
  > - `keyboard (str)`: Keyboard layout key-neighbor map to use (default: "CURATOR_QWERTY"; also accepts "QWERTY", "DVORAK", "COLEMAK", and "AZERTY").
395
482
  > - `seed (int)`: The random seed for reproducibility (default: 151).
396
483
 
484
+ ### Apostrofae
485
+
486
+ _It looks like you're trying to paste some text. Can I help?_
487
+
488
+ > _**Paperclip Manager.**_ Apostrofae scans for balanced runs of straight quotes, apostrophes, and backticks before replacing them with randomly sampled smart-quote pairs from a curated lookup table. The swap happens in-place so contractions and unpaired glyphs remain untouched.
489
+ >
490
+ > Args
491
+ >
492
+ > - `seed (int)`: Optional seed controlling the deterministic smart-quote sampling (default: 151).
493
+
397
494
  ### Mim1c
398
495
 
399
496
  _Wait, was that...?_
@@ -54,7 +54,7 @@ print(gaggle(SAMPLE_TEXT))
54
54
  > Onҽ m‎ھ‎rning, wһen Gregor Samƽa woke from trouble𝐝 𝑑reams, he found himself transformed in his bed into a horrible vermin‎٠‎ He l lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightlh domed and divided by arches ino stiff sections. The bedding was adly able to cover it and and seemed ready to slide off any moment. His many legxs, pitifully thin compared with the size of the the rest of him, waved about helplessly ashe looked looked.
55
55
 
56
56
  Consult the [Glitchlings Usage Guide](docs/index.md)
57
- for end-to-end instructions spanning the Python API, CLI, HuggingFace and Prime Intellect
57
+ for end-to-end instructions spanning the Python API, CLI, HuggingFace, PyTorch, and Prime Intellect
58
58
  integrations, and the autodetected Rust pipeline (enabled whenever the extension is present).
59
59
 
60
60
  ## Motivation
@@ -95,10 +95,67 @@ They're horrible little gremlins, but they're not _unreasonable_.
95
95
 
96
96
  Keyboard warriors can challenge them directly via the `glitchlings` command:
97
97
 
98
+ <!-- BEGIN: CLI_USAGE -->
98
99
  ```bash
99
100
  # Discover which glitchlings are currently on the loose.
100
101
  glitchlings --list
102
+ ```
103
+
104
+ ```text
105
+ Typogre — scope: Character, order: early
106
+ Apostrofae — scope: Character, order: normal
107
+ Mim1c — scope: Character, order: last
108
+ Jargoyle — scope: Word, order: normal
109
+ Adjax — scope: Word, order: normal
110
+ Reduple — scope: Word, order: normal
111
+ Rushmore — scope: Word, order: normal
112
+ Redactyl — scope: Word, order: normal
113
+ Scannequin — scope: Character, order: late
114
+ Zeedub — scope: Character, order: last
115
+ ```
116
+
117
+ ```bash
118
+ # Review the full CLI contract.
119
+ glitchlings --help
120
+ ```
121
+
122
+ ```text
123
+ usage: glitchlings [-h] [-g SPEC] [-s SEED] [-f FILE] [--sample] [--diff]
124
+ [--list] [-c CONFIG]
125
+ [text]
126
+
127
+ Summon glitchlings to corrupt text. Provide input text as an argument, via
128
+ --file, or pipe it on stdin.
129
+
130
+ positional arguments:
131
+ text Text to corrupt. If omitted, stdin is used or --sample
132
+ provides fallback text.
133
+
134
+ options:
135
+ -h, --help show this help message and exit
136
+ -g SPEC, --glitchling SPEC
137
+ Glitchling to apply, optionally with parameters like
138
+ Typogre(rate=0.05). Repeat for multiples; defaults to
139
+ all built-ins.
140
+ -s SEED, --seed SEED Seed controlling deterministic corruption order
141
+ (default: 151).
142
+ -f FILE, --file FILE Read input text from a file instead of the command
143
+ line argument.
144
+ --sample Use the included SAMPLE_TEXT when no other input is
145
+ provided.
146
+ --diff Show a unified diff between the original and corrupted
147
+ text.
148
+ --list List available glitchlings and exit.
149
+ -c CONFIG, --config CONFIG
150
+ Load glitchlings from a YAML configuration file.
151
+ ```
152
+ <!-- END: CLI_USAGE -->
153
+
154
+ Run `python docs/build_cli_reference.py` whenever you tweak the CLI so the README stays in sync with the actual output. The script executes the commands above and replaces the block between the markers automatically.
101
155
 
156
+ Prefer inline tweaks? You can still configure glitchlings directly in the shell:
157
+
158
+ ```bash
102
159
  # Run Typogre against the contents of a file and inspect the diff.
103
160
  glitchlings -g typogre --file documents/report.txt --diff
104
161
 
@@ -112,8 +169,6 @@ echo "Beware LLM-written flavor-text" | glitchlings -g mim1c
112
169
  glitchlings --config experiments/chaos.yaml "Let slips the glitchlings of war"
113
170
  ```
114
171
 
115
- Use `--help` for a complete breakdown of available options, including support for parameterised glitchlings via `-g "Name(arg=value, ...)"` to mirror the Python API.
116
-
117
172
  Attack configurations live in plain YAML files so you can version-control experiments without touching code:
118
173
 
119
174
  ```yaml
@@ -151,6 +206,16 @@ _What a nice word, would be a shame if something happened to it._
151
206
  > - `keyboard (str)`: Keyboard layout key-neighbor map to use (default: "CURATOR_QWERTY"; also accepts "QWERTY", "DVORAK", "COLEMAK", and "AZERTY").
152
207
  > - `seed (int)`: The random seed for reproducibility (default: 151).
153
208
 
209
+ ### Apostrofae
210
+
211
+ _It looks like you're trying to paste some text. Can I help?_
212
+
213
+ > _**Paperclip Manager.**_ Apostrofae scans for balanced runs of straight quotes, apostrophes, and backticks before replacing them with randomly sampled smart-quote pairs from a curated lookup table. The swap happens in-place so contractions and unpaired glyphs remain untouched.
214
+ >
215
+ > Args
216
+ >
217
+ > - `seed (int)`: Optional seed controlling the deterministic smart-quote sampling (default: 151).
218
+
154
219
  ### Mim1c
155
220
 
156
221
  _Wait, was that...?_
@@ -0,0 +1,186 @@
1
+ [project]
2
+ name = "glitchlings"
3
+ version = "0.4.3"
4
+ description = "Monsters for your language games."
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+
8
+ dependencies = [
9
+ "confusable-homoglyphs>=3.3.1",
10
+ "tomli>=2.0.1; python_version < '3.11'",
11
+ "pyyaml>=6.0.0",
12
+ ]
13
+
14
+ authors = [
15
+ { name = "osoleve" }
16
+ ]
17
+
18
+ keywords = [
19
+ "nlp",
20
+ "adversarial augmentation",
21
+ "text augmentation",
22
+ "data augmentation",
23
+ "domain randomization"
24
+ ]
25
+
26
+ classifiers = [
27
+ "Development Status :: 3 - Alpha",
28
+ "Intended Audience :: Developers",
29
+ "Programming Language :: Python",
30
+ "Programming Language :: Python :: 3",
31
+ "Programming Language :: Python :: 3.10",
32
+ "Programming Language :: Python :: 3.11",
33
+ "Programming Language :: Python :: 3.12",
34
+ "Programming Language :: Rust",
35
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
36
+ "Topic :: Software Development :: Testing",
37
+ ]
38
+
39
+ [project.license]
40
+ file = "LICENSE"
41
+
42
+ [project.urls]
43
+ Homepage = "https://github.com/osoleve/glitchlings"
44
+ Repository = "https://github.com/osoleve/glitchlings.git"
45
+ Issues = "https://github.com/osoleve/glitchlings/issues"
46
+ Changelog = "https://github.com/osoleve/glitchlings/releases"
47
+
48
+ [project.scripts]
49
+ glitchlings = "glitchlings.main:main"
50
+
51
+ [project.optional-dependencies]
52
+ all = [
53
+ "black>=24.4.0",
54
+ "hypothesis>=6.140.0",
55
+ "interrogate>=1.5.0",
56
+ "jellyfish>=1.2.0",
57
+ "isort>=5.13.0",
58
+ "mkdocs>=1.6.0",
59
+ "mkdocs-material>=9.5.0",
60
+ "mkdocstrings[python]>=0.24.0",
61
+ "mkdocstrings-python>=1.10.0",
62
+ "mypy>=1.8.0",
63
+ "numpy>=1.24,<=2.0",
64
+ "pre-commit>=3.8.0",
65
+ "pytest>=8.0.0",
66
+ "ruff>=0.6.0",
67
+ "verifiers>=0.1.3.post0",
68
+ ]
69
+ hf = ["datasets>=4.0.0"]
70
+ lightning = ["pytorch_lightning>=2.0.0"]
71
+ vectors = ["numpy>=1.24,<=2.0", "spacy>=3.7.2", "gensim>=4.3.2"]
72
+ st = ["sentence-transformers>=3.0.0"]
73
+ prime = ["verifiers>=0.1.3.post0", "jellyfish>=1.2.0"]
74
+ torch = ["torch>=2.0.0"]
75
+ dev = [
76
+ "pytest>=8.0.0",
77
+ "hypothesis>=6.140.0",
78
+ "numpy>=1.24,<=2.0",
79
+ "mkdocs>=1.6.0",
80
+ "mkdocstrings[python]>=0.24.0",
81
+ "mkdocs-material>=9.5.0",
82
+ "mkdocstrings-python>=1.10.0",
83
+ "interrogate>=1.5.0",
84
+ "black>=24.4.0",
85
+ "isort>=5.13.0",
86
+ "ruff>=0.6.0",
87
+ "mypy>=1.8.0",
88
+ "pre-commit>=3.8.0",
89
+ ]
90
+
91
+ [build-system]
92
+ requires = ["setuptools>=69", "wheel", "setuptools-rust>=1.8.0"]
93
+ build-backend = "setuptools.build_meta"
94
+
95
+ [tool.setuptools]
96
+ package-dir = {"" = "src"}
97
+ include-package-data = true
98
+
99
+ [tool.setuptools.package-data]
100
+ "glitchlings" = ["config.toml"]
101
+ "glitchlings.lexicon" = ["data/*.json"]
102
+ "glitchlings.zoo" = ["ocr_confusions.tsv"]
103
+ "glitchlings.zoo.assets" = ["apostrofae_pairs.json"]
104
+
105
+ [tool.setuptools.packages.find]
106
+ where = ["src"]
107
+
108
+ [[tool.setuptools-rust.ext-modules]]
109
+ target = "glitchlings._zoo_rust"
110
+ path = "rust/zoo/Cargo.toml"
111
+ binding = "PyO3"
112
+ debug = false
113
+
114
+
115
+ [tool.pytest.ini_options]
116
+ pythonpath = [
117
+ "src",
118
+ ]
119
+
120
+ [tool.interrogate]
121
+ config = true
122
+ fail-under = 80
123
+ ignore-init-module = true
124
+ ignore-module = true
125
+ ignore-nested-functions = true
126
+ ignore-private = true
127
+ ignore-semiprivate = true
128
+ ignore-magic = true
129
+ ignore-property-decorators = false
130
+ color = true
131
+ quiet = false
132
+ exclude = [
133
+ "tests",
134
+ "docs",
135
+ "rust",
136
+ "benchmarks",
137
+ ]
138
+
139
+ [tool.black]
140
+ line-length = 100
141
+ target-version = ["py310"]
142
+
143
+ [tool.isort]
144
+ profile = "black"
145
+ line_length = 100
146
+
147
+ [tool.ruff]
148
+ target-version = "py310"
149
+ line-length = 100
150
+
151
+ [tool.ruff.lint]
152
+ select = ["E", "F", "I"]
153
+
154
+ [tool.mypy]
155
+ python_version = "3.10"
156
+ follow_imports = "skip"
157
+ ignore_missing_imports = true
158
+ enable_error_code = ["ignore-without-code"]
159
+
160
+ [[tool.mypy.overrides]]
161
+ module = [
162
+ "glitchlings.util.adapters",
163
+ "glitchlings.dlc._shared",
164
+ "glitchlings.dlc.huggingface",
165
+ "glitchlings.dlc.prime",
166
+ ]
167
+ strict = true
168
+
169
+ [[tool.mypy.overrides]]
170
+ module = [
171
+ "glitchlings.compat",
172
+ "glitchlings.config",
173
+ "glitchlings.lexicon",
174
+ "glitchlings.lexicon.*",
175
+ ]
176
+ strict = true
177
+
178
+ [[tool.mypy.overrides]]
179
+ module = [
180
+ "glitchlings.main",
181
+ "glitchlings.__main__",
182
+ "glitchlings.__init__",
183
+ "glitchlings.zoo",
184
+ "glitchlings.zoo.*",
185
+ ]
186
+ strict = true
@@ -90,6 +90,12 @@ version = "2.0.6"
90
90
  source = "registry+https://github.com/rust-lang/crates.io-index"
91
91
  checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd"
92
92
 
93
+ [[package]]
94
+ name = "itoa"
95
+ version = "1.0.15"
96
+ source = "registry+https://github.com/rust-lang/crates.io-index"
97
+ checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
98
+
93
99
  [[package]]
94
100
  name = "libc"
95
101
  version = "0.2.176"
@@ -275,12 +281,60 @@ version = "0.8.6"
275
281
  source = "registry+https://github.com/rust-lang/crates.io-index"
276
282
  checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001"
277
283
 
284
+ [[package]]
285
+ name = "ryu"
286
+ version = "1.0.20"
287
+ source = "registry+https://github.com/rust-lang/crates.io-index"
288
+ checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
289
+
278
290
  [[package]]
279
291
  name = "scopeguard"
280
292
  version = "1.2.0"
281
293
  source = "registry+https://github.com/rust-lang/crates.io-index"
282
294
  checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
283
295
 
296
+ [[package]]
297
+ name = "serde"
298
+ version = "1.0.228"
299
+ source = "registry+https://github.com/rust-lang/crates.io-index"
300
+ checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
301
+ dependencies = [
302
+ "serde_core",
303
+ ]
304
+
305
+ [[package]]
306
+ name = "serde_core"
307
+ version = "1.0.228"
308
+ source = "registry+https://github.com/rust-lang/crates.io-index"
309
+ checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
310
+ dependencies = [
311
+ "serde_derive",
312
+ ]
313
+
314
+ [[package]]
315
+ name = "serde_derive"
316
+ version = "1.0.228"
317
+ source = "registry+https://github.com/rust-lang/crates.io-index"
318
+ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
319
+ dependencies = [
320
+ "proc-macro2",
321
+ "quote",
322
+ "syn",
323
+ ]
324
+
325
+ [[package]]
326
+ name = "serde_json"
327
+ version = "1.0.145"
328
+ source = "registry+https://github.com/rust-lang/crates.io-index"
329
+ checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c"
330
+ dependencies = [
331
+ "itoa",
332
+ "memchr",
333
+ "ryu",
334
+ "serde",
335
+ "serde_core",
336
+ ]
337
+
284
338
  [[package]]
285
339
  name = "smallvec"
286
340
  version = "1.15.1"
@@ -407,5 +461,6 @@ dependencies = [
407
461
  "pyo3",
408
462
  "pyo3-build-config",
409
463
  "regex",
464
+ "serde_json",
410
465
  "smallvec",
411
466
  ]
@@ -14,6 +14,7 @@ regex = { workspace = true }
14
14
  once_cell = { workspace = true }
15
15
  blake2 = { workspace = true }
16
16
  smallvec = "1"
17
+ serde_json = "1"
17
18
 
18
19
  [package.metadata.maturin]
19
20
  module-name = "glitchlings._zoo_rust"
@@ -6,7 +6,9 @@ use std::path::PathBuf;
6
6
  use std::process::Command;
7
7
 
8
8
  fn main() {
9
- prepare_confusion_table().expect("failed to stage OCR confusion table for compilation");
9
+ stage_asset("ocr_confusions.tsv").expect("failed to stage OCR confusion table for compilation");
10
+ stage_asset("apostrofae_pairs.json")
11
+ .expect("failed to stage Apostrofae replacement table for compilation");
10
12
  pyo3_build_config::add_extension_module_link_args();
11
13
 
12
14
  // Only perform custom Python linking on non-Linux platforms.
@@ -97,46 +99,60 @@ fn query_python(python: &OsStr, command: &str) -> Option<String> {
97
99
  Some(value)
98
100
  }
99
101
 
100
- fn prepare_confusion_table() -> io::Result<()> {
102
+ fn stage_asset(asset_name: &str) -> io::Result<()> {
101
103
  let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").expect("missing manifest dir"));
102
104
  let out_dir = PathBuf::from(env::var("OUT_DIR").expect("missing OUT_DIR"));
103
105
 
104
- let repo_path = manifest_dir.join("../../src/glitchlings/zoo/ocr_confusions.tsv");
105
- let packaged_path = manifest_dir.join("assets/ocr_confusions.tsv");
106
+ let repo_candidates = [
107
+ manifest_dir
108
+ .join("../../src/glitchlings/zoo/assets")
109
+ .join(asset_name),
110
+ manifest_dir
111
+ .join("../../src/glitchlings/zoo")
112
+ .join(asset_name),
113
+ ];
114
+ let packaged_path = manifest_dir.join("assets").join(asset_name);
106
115
  println!("cargo:rerun-if-changed={}", packaged_path.display());
107
116
 
108
- let source_path = if repo_path.exists() {
109
- println!("cargo:rerun-if-changed={}", repo_path.display());
110
- if packaged_path.exists() {
111
- let repo_bytes = fs::read(&repo_path)?;
112
- let packaged_bytes = fs::read(&packaged_path)?;
113
- if repo_bytes != packaged_bytes {
114
- return Err(io::Error::new(
115
- ErrorKind::Other,
116
- format!(
117
- "OCR confusion table at {} is out of sync with {}",
118
- packaged_path.display(),
119
- repo_path.display()
120
- ),
121
- ));
117
+ let mut source_path: Option<PathBuf> = None;
118
+ for candidate in &repo_candidates {
119
+ if candidate.exists() {
120
+ println!("cargo:rerun-if-changed={}", candidate.display());
121
+ if packaged_path.exists() {
122
+ let repo_bytes = fs::read(candidate)?;
123
+ let packaged_bytes = fs::read(&packaged_path)?;
124
+ if repo_bytes != packaged_bytes {
125
+ return Err(io::Error::new(
126
+ ErrorKind::Other,
127
+ format!(
128
+ "asset {} is out of sync with {}",
129
+ packaged_path.display(),
130
+ candidate.display()
131
+ ),
132
+ ));
133
+ }
122
134
  }
135
+ source_path = Some(candidate.clone());
136
+ break;
123
137
  }
124
- repo_path
125
- } else {
126
- if !packaged_path.exists() {
127
- return Err(io::Error::new(
128
- ErrorKind::NotFound,
129
- format!(
130
- "missing OCR confusion table; looked for {} and {}",
131
- repo_path.display(),
132
- packaged_path.display()
133
- ),
134
- ));
135
- }
138
+ }
139
+
140
+ let source_path = if let Some(path) = source_path {
141
+ path
142
+ } else if packaged_path.exists() {
136
143
  packaged_path
144
+ } else {
145
+ return Err(io::Error::new(
146
+ ErrorKind::NotFound,
147
+ format!(
148
+ "missing asset {asset_name}; looked for {} and {}",
149
+ repo_candidates[0].display(),
150
+ packaged_path.display()
151
+ ),
152
+ ));
137
153
  };
138
154
 
139
155
  fs::create_dir_all(&out_dir)?;
140
- fs::copy(&source_path, out_dir.join("ocr_confusions.tsv"))?;
156
+ fs::copy(&source_path, out_dir.join(asset_name))?;
141
157
  Ok(())
142
158
  }