glitchlings 0.4.2__tar.gz → 0.4.3__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of glitchlings might be problematic. More details are linked from the original diff page.

Files changed (67)
  1. {glitchlings-0.4.2 → glitchlings-0.4.3}/MANIFEST.in +1 -1
  2. {glitchlings-0.4.2 → glitchlings-0.4.3}/PKG-INFO +35 -2
  3. {glitchlings-0.4.2 → glitchlings-0.4.3}/README.md +12 -1
  4. {glitchlings-0.4.2 → glitchlings-0.4.3}/pyproject.toml +30 -4
  5. {glitchlings-0.4.2 → glitchlings-0.4.3}/rust/Cargo.lock +55 -0
  6. {glitchlings-0.4.2 → glitchlings-0.4.3}/rust/zoo/Cargo.toml +1 -0
  7. {glitchlings-0.4.2 → glitchlings-0.4.3}/rust/zoo/build.rs +47 -31
  8. {glitchlings-0.4.2 → glitchlings-0.4.3}/rust/zoo/src/glitch_ops.rs +154 -2
  9. {glitchlings-0.4.2 → glitchlings-0.4.3}/rust/zoo/src/lib.rs +14 -2
  10. {glitchlings-0.4.2 → glitchlings-0.4.3}/rust/zoo/src/resources.rs +35 -1
  11. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/__init__.py +4 -0
  12. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/compat.py +80 -11
  13. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/config.py +32 -19
  14. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/config.toml +1 -1
  15. glitchlings-0.4.3/src/glitchlings/dlc/__init__.py +7 -0
  16. glitchlings-0.4.3/src/glitchlings/dlc/pytorch.py +216 -0
  17. glitchlings-0.4.3/src/glitchlings/dlc/pytorch_lightning.py +233 -0
  18. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/lexicon/__init__.py +5 -15
  19. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/lexicon/_cache.py +21 -15
  20. glitchlings-0.4.3/src/glitchlings/lexicon/data/default_vector_cache.json +82 -0
  21. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/lexicon/vector.py +94 -15
  22. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/lexicon/wordnet.py +66 -25
  23. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/main.py +21 -11
  24. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/__init__.py +5 -1
  25. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/adjax.py +2 -2
  26. glitchlings-0.4.3/src/glitchlings/zoo/apostrofae.py +128 -0
  27. glitchlings-0.4.3/src/glitchlings/zoo/assets/__init__.py +0 -0
  28. glitchlings-0.4.3/src/glitchlings/zoo/assets/apostrofae_pairs.json +32 -0
  29. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/core.py +40 -14
  30. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/jargoyle.py +44 -34
  31. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/redactyl.py +11 -8
  32. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/reduple.py +2 -2
  33. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/rushmore.py +2 -2
  34. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/scannequin.py +2 -2
  35. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/typogre.py +5 -2
  36. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/zeedub.py +5 -2
  37. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings.egg-info/PKG-INFO +35 -2
  38. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings.egg-info/SOURCES.txt +6 -2
  39. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings.egg-info/requires.txt +26 -0
  40. glitchlings-0.4.2/src/glitchlings/dlc/__init__.py +0 -5
  41. glitchlings-0.4.2/src/glitchlings/lexicon/data/default_vector_cache.json +0 -16
  42. glitchlings-0.4.2/src/glitchlings/lexicon/graph.py +0 -282
  43. {glitchlings-0.4.2 → glitchlings-0.4.3}/LICENSE +0 -0
  44. {glitchlings-0.4.2 → glitchlings-0.4.3}/rust/Cargo.toml +0 -0
  45. {glitchlings-0.4.2 → glitchlings-0.4.3}/rust/zoo/assets/ocr_confusions.tsv +0 -0
  46. {glitchlings-0.4.2 → glitchlings-0.4.3}/rust/zoo/src/pipeline.rs +0 -0
  47. {glitchlings-0.4.2 → glitchlings-0.4.3}/rust/zoo/src/rng.rs +0 -0
  48. {glitchlings-0.4.2 → glitchlings-0.4.3}/rust/zoo/src/text_buffer.rs +0 -0
  49. {glitchlings-0.4.2 → glitchlings-0.4.3}/rust/zoo/src/typogre.rs +0 -0
  50. {glitchlings-0.4.2 → glitchlings-0.4.3}/rust/zoo/src/zeedub.rs +0 -0
  51. {glitchlings-0.4.2 → glitchlings-0.4.3}/setup.cfg +0 -0
  52. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/__main__.py +0 -0
  53. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/dlc/_shared.py +0 -0
  54. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/dlc/huggingface.py +0 -0
  55. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/dlc/prime.py +0 -0
  56. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/lexicon/metrics.py +0 -0
  57. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/util/__init__.py +0 -0
  58. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/util/adapters.py +0 -0
  59. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/_ocr_confusions.py +0 -0
  60. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/_rate.py +0 -0
  61. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/_sampling.py +0 -0
  62. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/_text_utils.py +0 -0
  63. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/mim1c.py +0 -0
  64. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/ocr_confusions.tsv +0 -0
  65. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings.egg-info/dependency_links.txt +0 -0
  66. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings.egg-info/entry_points.txt +0 -0
  67. {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings.egg-info/top_level.txt +0 -0
@@ -1,5 +1,5 @@
1
1
  recursive-include rust *.rs *.toml *.lock *.tsv
2
- recursive-include src/glitchlings/zoo *.tsv
2
+ recursive-include src/glitchlings/zoo *.tsv *.json
3
3
  recursive-include src/glitchlings/lexicon/data *.json
4
4
  include src/glitchlings/config.toml
5
5
  prune rust/target
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: glitchlings
3
- Version: 0.4.2
3
+ Version: 0.4.3
4
4
  Summary: Monsters for your language games.
5
5
  Author: osoleve
6
6
  License: Apache License
@@ -226,15 +226,37 @@ License-File: LICENSE
226
226
  Requires-Dist: confusable-homoglyphs>=3.3.1
227
227
  Requires-Dist: tomli>=2.0.1; python_version < "3.11"
228
228
  Requires-Dist: pyyaml>=6.0.0
229
+ Provides-Extra: all
230
+ Requires-Dist: black>=24.4.0; extra == "all"
231
+ Requires-Dist: hypothesis>=6.140.0; extra == "all"
232
+ Requires-Dist: interrogate>=1.5.0; extra == "all"
233
+ Requires-Dist: jellyfish>=1.2.0; extra == "all"
234
+ Requires-Dist: isort>=5.13.0; extra == "all"
235
+ Requires-Dist: mkdocs>=1.6.0; extra == "all"
236
+ Requires-Dist: mkdocs-material>=9.5.0; extra == "all"
237
+ Requires-Dist: mkdocstrings[python]>=0.24.0; extra == "all"
238
+ Requires-Dist: mkdocstrings-python>=1.10.0; extra == "all"
239
+ Requires-Dist: mypy>=1.8.0; extra == "all"
240
+ Requires-Dist: numpy<=2.0,>=1.24; extra == "all"
241
+ Requires-Dist: pre-commit>=3.8.0; extra == "all"
242
+ Requires-Dist: pytest>=8.0.0; extra == "all"
243
+ Requires-Dist: ruff>=0.6.0; extra == "all"
244
+ Requires-Dist: verifiers>=0.1.3.post0; extra == "all"
229
245
  Provides-Extra: hf
230
246
  Requires-Dist: datasets>=4.0.0; extra == "hf"
247
+ Provides-Extra: lightning
248
+ Requires-Dist: pytorch_lightning>=2.0.0; extra == "lightning"
231
249
  Provides-Extra: vectors
232
250
  Requires-Dist: numpy<=2.0,>=1.24; extra == "vectors"
233
251
  Requires-Dist: spacy>=3.7.2; extra == "vectors"
234
252
  Requires-Dist: gensim>=4.3.2; extra == "vectors"
253
+ Provides-Extra: st
254
+ Requires-Dist: sentence-transformers>=3.0.0; extra == "st"
235
255
  Provides-Extra: prime
236
256
  Requires-Dist: verifiers>=0.1.3.post0; extra == "prime"
237
257
  Requires-Dist: jellyfish>=1.2.0; extra == "prime"
258
+ Provides-Extra: torch
259
+ Requires-Dist: torch>=2.0.0; extra == "torch"
238
260
  Provides-Extra: dev
239
261
  Requires-Dist: pytest>=8.0.0; extra == "dev"
240
262
  Requires-Dist: hypothesis>=6.140.0; extra == "dev"
@@ -307,7 +329,7 @@ print(gaggle(SAMPLE_TEXT))
307
329
  > Onҽ m‎ھ‎rning, wһen Gregor Samƽa woke from trouble𝐝 𝑑reams, he found himself transformed in his bed into a horrible vermin‎٠‎ He l lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightlh domed and divided by arches ino stiff sections. The bedding was adly able to cover it and and seemed ready to slide off any moment. His many legxs, pitifully thin compared with the size of the the rest of him, waved about helplessly ashe looked looked.
308
330
 
309
331
  Consult the [Glitchlings Usage Guide](docs/index.md)
310
- for end-to-end instructions spanning the Python API, CLI, HuggingFace and Prime Intellect
332
+ for end-to-end instructions spanning the Python API, CLI, HuggingFace, PyTorch, and Prime Intellect
311
333
  integrations, and the autodetected Rust pipeline (enabled whenever the extension is present).
312
334
 
313
335
  ## Motivation
@@ -356,6 +378,7 @@ glitchlings --list
356
378
 
357
379
  ```text
358
380
  Typogre — scope: Character, order: early
381
+ Apostrofae — scope: Character, order: normal
359
382
  Mim1c — scope: Character, order: last
360
383
  Jargoyle — scope: Word, order: normal
361
384
  Adjax — scope: Word, order: normal
@@ -458,6 +481,16 @@ _What a nice word, would be a shame if something happened to it._
458
481
  > - `keyboard (str)`: Keyboard layout key-neighbor map to use (default: "CURATOR_QWERTY"; also accepts "QWERTY", "DVORAK", "COLEMAK", and "AZERTY").
459
482
  > - `seed (int)`: The random seed for reproducibility (default: 151).
460
483
 
484
+ ### Apostrofae
485
+
486
+ _It looks like you're trying to paste some text. Can I help?_
487
+
488
+ > _**Paperclip Manager.**_ Apostrofae scans for balanced runs of straight quotes, apostrophes, and backticks before replacing them with randomly sampled smart-quote pairs from a curated lookup table. The swap happens in-place so contractions and unpaired glyphs remain untouched.
489
+ >
490
+ > Args
491
+ >
492
+ > - `seed (int)`: Optional seed controlling the deterministic smart-quote sampling (default: 151).
493
+
461
494
  ### Mim1c
462
495
 
463
496
  _Wait, was that...?_
@@ -54,7 +54,7 @@ print(gaggle(SAMPLE_TEXT))
54
54
  > Onҽ m‎ھ‎rning, wһen Gregor Samƽa woke from trouble𝐝 𝑑reams, he found himself transformed in his bed into a horrible vermin‎٠‎ He l lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightlh domed and divided by arches ino stiff sections. The bedding was adly able to cover it and and seemed ready to slide off any moment. His many legxs, pitifully thin compared with the size of the the rest of him, waved about helplessly ashe looked looked.
55
55
 
56
56
  Consult the [Glitchlings Usage Guide](docs/index.md)
57
- for end-to-end instructions spanning the Python API, CLI, HuggingFace and Prime Intellect
57
+ for end-to-end instructions spanning the Python API, CLI, HuggingFace, PyTorch, and Prime Intellect
58
58
  integrations, and the autodetected Rust pipeline (enabled whenever the extension is present).
59
59
 
60
60
  ## Motivation
@@ -103,6 +103,7 @@ glitchlings --list
103
103
 
104
104
  ```text
105
105
  Typogre — scope: Character, order: early
106
+ Apostrofae — scope: Character, order: normal
106
107
  Mim1c — scope: Character, order: last
107
108
  Jargoyle — scope: Word, order: normal
108
109
  Adjax — scope: Word, order: normal
@@ -205,6 +206,16 @@ _What a nice word, would be a shame if something happened to it._
205
206
  > - `keyboard (str)`: Keyboard layout key-neighbor map to use (default: "CURATOR_QWERTY"; also accepts "QWERTY", "DVORAK", "COLEMAK", and "AZERTY").
206
207
  > - `seed (int)`: The random seed for reproducibility (default: 151).
207
208
 
209
+ ### Apostrofae
210
+
211
+ _It looks like you're trying to paste some text. Can I help?_
212
+
213
+ > _**Paperclip Manager.**_ Apostrofae scans for balanced runs of straight quotes, apostrophes, and backticks before replacing them with randomly sampled smart-quote pairs from a curated lookup table. The swap happens in-place so contractions and unpaired glyphs remain untouched.
214
+ >
215
+ > Args
216
+ >
217
+ > - `seed (int)`: Optional seed controlling the deterministic smart-quote sampling (default: 151).
218
+
208
219
  ### Mim1c
209
220
 
210
221
  _Wait, was that...?_
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "glitchlings"
3
- version = "0.4.2"
3
+ version = "0.4.3"
4
4
  description = "Monsters for your language games."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
@@ -49,9 +49,29 @@ Changelog = "https://github.com/osoleve/glitchlings/releases"
49
49
  glitchlings = "glitchlings.main:main"
50
50
 
51
51
  [project.optional-dependencies]
52
+ all = [
53
+ "black>=24.4.0",
54
+ "hypothesis>=6.140.0",
55
+ "interrogate>=1.5.0",
56
+ "jellyfish>=1.2.0",
57
+ "isort>=5.13.0",
58
+ "mkdocs>=1.6.0",
59
+ "mkdocs-material>=9.5.0",
60
+ "mkdocstrings[python]>=0.24.0",
61
+ "mkdocstrings-python>=1.10.0",
62
+ "mypy>=1.8.0",
63
+ "numpy>=1.24,<=2.0",
64
+ "pre-commit>=3.8.0",
65
+ "pytest>=8.0.0",
66
+ "ruff>=0.6.0",
67
+ "verifiers>=0.1.3.post0",
68
+ ]
52
69
  hf = ["datasets>=4.0.0"]
70
+ lightning = ["pytorch_lightning>=2.0.0"]
53
71
  vectors = ["numpy>=1.24,<=2.0", "spacy>=3.7.2", "gensim>=4.3.2"]
72
+ st = ["sentence-transformers>=3.0.0"]
54
73
  prime = ["verifiers>=0.1.3.post0", "jellyfish>=1.2.0"]
74
+ torch = ["torch>=2.0.0"]
55
75
  dev = [
56
76
  "pytest>=8.0.0",
57
77
  "hypothesis>=6.140.0",
@@ -80,6 +100,7 @@ include-package-data = true
80
100
  "glitchlings" = ["config.toml"]
81
101
  "glitchlings.lexicon" = ["data/*.json"]
82
102
  "glitchlings.zoo" = ["ocr_confusions.tsv"]
103
+ "glitchlings.zoo.assets" = ["apostrofae_pairs.json"]
83
104
 
84
105
  [tool.setuptools.packages.find]
85
106
  where = ["src"]
@@ -151,10 +172,15 @@ module = [
151
172
  "glitchlings.config",
152
173
  "glitchlings.lexicon",
153
174
  "glitchlings.lexicon.*",
154
- "glitchlings.zoo",
155
- "glitchlings.zoo.*",
175
+ ]
176
+ strict = true
177
+
178
+ [[tool.mypy.overrides]]
179
+ module = [
156
180
  "glitchlings.main",
157
181
  "glitchlings.__main__",
158
182
  "glitchlings.__init__",
183
+ "glitchlings.zoo",
184
+ "glitchlings.zoo.*",
159
185
  ]
160
- ignore_errors = true
186
+ strict = true
@@ -90,6 +90,12 @@ version = "2.0.6"
90
90
  source = "registry+https://github.com/rust-lang/crates.io-index"
91
91
  checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd"
92
92
 
93
+ [[package]]
94
+ name = "itoa"
95
+ version = "1.0.15"
96
+ source = "registry+https://github.com/rust-lang/crates.io-index"
97
+ checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
98
+
93
99
  [[package]]
94
100
  name = "libc"
95
101
  version = "0.2.176"
@@ -275,12 +281,60 @@ version = "0.8.6"
275
281
  source = "registry+https://github.com/rust-lang/crates.io-index"
276
282
  checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001"
277
283
 
284
+ [[package]]
285
+ name = "ryu"
286
+ version = "1.0.20"
287
+ source = "registry+https://github.com/rust-lang/crates.io-index"
288
+ checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
289
+
278
290
  [[package]]
279
291
  name = "scopeguard"
280
292
  version = "1.2.0"
281
293
  source = "registry+https://github.com/rust-lang/crates.io-index"
282
294
  checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
283
295
 
296
+ [[package]]
297
+ name = "serde"
298
+ version = "1.0.228"
299
+ source = "registry+https://github.com/rust-lang/crates.io-index"
300
+ checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
301
+ dependencies = [
302
+ "serde_core",
303
+ ]
304
+
305
+ [[package]]
306
+ name = "serde_core"
307
+ version = "1.0.228"
308
+ source = "registry+https://github.com/rust-lang/crates.io-index"
309
+ checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
310
+ dependencies = [
311
+ "serde_derive",
312
+ ]
313
+
314
+ [[package]]
315
+ name = "serde_derive"
316
+ version = "1.0.228"
317
+ source = "registry+https://github.com/rust-lang/crates.io-index"
318
+ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
319
+ dependencies = [
320
+ "proc-macro2",
321
+ "quote",
322
+ "syn",
323
+ ]
324
+
325
+ [[package]]
326
+ name = "serde_json"
327
+ version = "1.0.145"
328
+ source = "registry+https://github.com/rust-lang/crates.io-index"
329
+ checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c"
330
+ dependencies = [
331
+ "itoa",
332
+ "memchr",
333
+ "ryu",
334
+ "serde",
335
+ "serde_core",
336
+ ]
337
+
284
338
  [[package]]
285
339
  name = "smallvec"
286
340
  version = "1.15.1"
@@ -407,5 +461,6 @@ dependencies = [
407
461
  "pyo3",
408
462
  "pyo3-build-config",
409
463
  "regex",
464
+ "serde_json",
410
465
  "smallvec",
411
466
  ]
@@ -14,6 +14,7 @@ regex = { workspace = true }
14
14
  once_cell = { workspace = true }
15
15
  blake2 = { workspace = true }
16
16
  smallvec = "1"
17
+ serde_json = "1"
17
18
 
18
19
  [package.metadata.maturin]
19
20
  module-name = "glitchlings._zoo_rust"
@@ -6,7 +6,9 @@ use std::path::PathBuf;
6
6
  use std::process::Command;
7
7
 
8
8
  fn main() {
9
- prepare_confusion_table().expect("failed to stage OCR confusion table for compilation");
9
+ stage_asset("ocr_confusions.tsv").expect("failed to stage OCR confusion table for compilation");
10
+ stage_asset("apostrofae_pairs.json")
11
+ .expect("failed to stage Apostrofae replacement table for compilation");
10
12
  pyo3_build_config::add_extension_module_link_args();
11
13
 
12
14
  // Only perform custom Python linking on non-Linux platforms.
@@ -97,46 +99,60 @@ fn query_python(python: &OsStr, command: &str) -> Option<String> {
97
99
  Some(value)
98
100
  }
99
101
 
100
- fn prepare_confusion_table() -> io::Result<()> {
102
+ fn stage_asset(asset_name: &str) -> io::Result<()> {
101
103
  let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").expect("missing manifest dir"));
102
104
  let out_dir = PathBuf::from(env::var("OUT_DIR").expect("missing OUT_DIR"));
103
105
 
104
- let repo_path = manifest_dir.join("../../src/glitchlings/zoo/ocr_confusions.tsv");
105
- let packaged_path = manifest_dir.join("assets/ocr_confusions.tsv");
106
+ let repo_candidates = [
107
+ manifest_dir
108
+ .join("../../src/glitchlings/zoo/assets")
109
+ .join(asset_name),
110
+ manifest_dir
111
+ .join("../../src/glitchlings/zoo")
112
+ .join(asset_name),
113
+ ];
114
+ let packaged_path = manifest_dir.join("assets").join(asset_name);
106
115
  println!("cargo:rerun-if-changed={}", packaged_path.display());
107
116
 
108
- let source_path = if repo_path.exists() {
109
- println!("cargo:rerun-if-changed={}", repo_path.display());
110
- if packaged_path.exists() {
111
- let repo_bytes = fs::read(&repo_path)?;
112
- let packaged_bytes = fs::read(&packaged_path)?;
113
- if repo_bytes != packaged_bytes {
114
- return Err(io::Error::new(
115
- ErrorKind::Other,
116
- format!(
117
- "OCR confusion table at {} is out of sync with {}",
118
- packaged_path.display(),
119
- repo_path.display()
120
- ),
121
- ));
117
+ let mut source_path: Option<PathBuf> = None;
118
+ for candidate in &repo_candidates {
119
+ if candidate.exists() {
120
+ println!("cargo:rerun-if-changed={}", candidate.display());
121
+ if packaged_path.exists() {
122
+ let repo_bytes = fs::read(candidate)?;
123
+ let packaged_bytes = fs::read(&packaged_path)?;
124
+ if repo_bytes != packaged_bytes {
125
+ return Err(io::Error::new(
126
+ ErrorKind::Other,
127
+ format!(
128
+ "asset {} is out of sync with {}",
129
+ packaged_path.display(),
130
+ candidate.display()
131
+ ),
132
+ ));
133
+ }
122
134
  }
135
+ source_path = Some(candidate.clone());
136
+ break;
123
137
  }
124
- repo_path
125
- } else {
126
- if !packaged_path.exists() {
127
- return Err(io::Error::new(
128
- ErrorKind::NotFound,
129
- format!(
130
- "missing OCR confusion table; looked for {} and {}",
131
- repo_path.display(),
132
- packaged_path.display()
133
- ),
134
- ));
135
- }
138
+ }
139
+
140
+ let source_path = if let Some(path) = source_path {
141
+ path
142
+ } else if packaged_path.exists() {
136
143
  packaged_path
144
+ } else {
145
+ return Err(io::Error::new(
146
+ ErrorKind::NotFound,
147
+ format!(
148
+ "missing asset {asset_name}; looked for {} and {}",
149
+ repo_candidates[0].display(),
150
+ packaged_path.display()
151
+ ),
152
+ ));
137
153
  };
138
154
 
139
155
  fs::create_dir_all(&out_dir)?;
140
- fs::copy(&source_path, out_dir.join("ocr_confusions.tsv"))?;
156
+ fs::copy(&source_path, out_dir.join(asset_name))?;
141
157
  Ok(())
142
158
  }
@@ -6,8 +6,8 @@ use std::collections::HashMap;
6
6
  use std::sync::{Mutex, OnceLock};
7
7
 
8
8
  use crate::resources::{
9
- affix_bounds, confusion_table, is_whitespace_only, split_affixes, MULTIPLE_WHITESPACE,
10
- SPACE_BEFORE_PUNCTUATION,
9
+ affix_bounds, apostrofae_pairs, confusion_table, is_whitespace_only, split_affixes,
10
+ MULTIPLE_WHITESPACE, SPACE_BEFORE_PUNCTUATION,
11
11
  };
12
12
  use crate::rng::{PyRng, PyRngError};
13
13
  use crate::text_buffer::{SegmentKind, TextBuffer, TextBufferError};
@@ -988,6 +988,156 @@ impl GlitchOp for TypoOp {
988
988
  }
989
989
  }
990
990
 
991
+ #[derive(Clone, Copy, Debug)]
992
+ enum QuoteKind {
993
+ Double,
994
+ Single,
995
+ Backtick,
996
+ }
997
+
998
+ impl QuoteKind {
999
+ fn from_char(ch: char) -> Option<Self> {
1000
+ match ch {
1001
+ '"' => Some(Self::Double),
1002
+ '\'' => Some(Self::Single),
1003
+ '`' => Some(Self::Backtick),
1004
+ _ => None,
1005
+ }
1006
+ }
1007
+
1008
+ fn as_char(self) -> char {
1009
+ match self {
1010
+ Self::Double => '"',
1011
+ Self::Single => '\'',
1012
+ Self::Backtick => '`',
1013
+ }
1014
+ }
1015
+
1016
+ fn index(self) -> usize {
1017
+ match self {
1018
+ Self::Double => 0,
1019
+ Self::Single => 1,
1020
+ Self::Backtick => 2,
1021
+ }
1022
+ }
1023
+ }
1024
+
1025
+ #[derive(Debug, Clone, Copy)]
1026
+ struct QuotePair {
1027
+ start: usize,
1028
+ end: usize,
1029
+ kind: QuoteKind,
1030
+ }
1031
+
1032
+ #[derive(Debug)]
1033
+ struct Replacement {
1034
+ start: usize,
1035
+ end: usize,
1036
+ value: String,
1037
+ }
1038
+
1039
+ #[derive(Debug, Default, Clone, Copy)]
1040
+ pub struct QuotePairsOp;
1041
+
1042
+ impl QuotePairsOp {
1043
+ fn collect_pairs(text: &str) -> Vec<QuotePair> {
1044
+ let mut pairs: Vec<QuotePair> = Vec::new();
1045
+ let mut stack: [Option<usize>; 3] = [None, None, None];
1046
+
1047
+ for (idx, ch) in text.char_indices() {
1048
+ if let Some(kind) = QuoteKind::from_char(ch) {
1049
+ let slot = kind.index();
1050
+ if let Some(start) = stack[slot] {
1051
+ pairs.push(QuotePair {
1052
+ start,
1053
+ end: idx,
1054
+ kind,
1055
+ });
1056
+ stack[slot] = None;
1057
+ } else {
1058
+ stack[slot] = Some(idx);
1059
+ }
1060
+ }
1061
+ }
1062
+
1063
+ pairs
1064
+ }
1065
+ }
1066
+
1067
+ impl GlitchOp for QuotePairsOp {
1068
+ fn apply(&self, buffer: &mut TextBuffer, rng: &mut dyn GlitchRng) -> Result<(), GlitchOpError> {
1069
+ let text = buffer.to_string();
1070
+ if text.is_empty() {
1071
+ return Ok(());
1072
+ }
1073
+
1074
+ let pairs = Self::collect_pairs(&text);
1075
+ if pairs.is_empty() {
1076
+ return Ok(());
1077
+ }
1078
+
1079
+ let table = apostrofae_pairs();
1080
+ if table.is_empty() {
1081
+ return Ok(());
1082
+ }
1083
+
1084
+ let mut replacements: Vec<Replacement> = Vec::with_capacity(pairs.len() * 2);
1085
+
1086
+ for pair in pairs {
1087
+ let key = pair.kind.as_char();
1088
+ let Some(options) = table.get(&key) else {
1089
+ continue;
1090
+ };
1091
+ if options.is_empty() {
1092
+ continue;
1093
+ }
1094
+ let choice = rng.rand_index(options.len())?;
1095
+ let (left, right) = &options[choice];
1096
+ let glyph_len = pair.kind.as_char().len_utf8();
1097
+ replacements.push(Replacement {
1098
+ start: pair.start,
1099
+ end: pair.start + glyph_len,
1100
+ value: left.clone(),
1101
+ });
1102
+ replacements.push(Replacement {
1103
+ start: pair.end,
1104
+ end: pair.end + glyph_len,
1105
+ value: right.clone(),
1106
+ });
1107
+ }
1108
+
1109
+ if replacements.is_empty() {
1110
+ return Ok(());
1111
+ }
1112
+
1113
+ replacements.sort_by_key(|replacement| replacement.start);
1114
+ let mut extra_capacity = 0usize;
1115
+ for replacement in &replacements {
1116
+ let span = replacement.end - replacement.start;
1117
+ if replacement.value.len() > span {
1118
+ extra_capacity += replacement.value.len() - span;
1119
+ }
1120
+ }
1121
+
1122
+ let mut result = String::with_capacity(text.len() + extra_capacity);
1123
+ let mut cursor = 0usize;
1124
+
1125
+ for replacement in replacements {
1126
+ if cursor < replacement.start {
1127
+ result.push_str(&text[cursor..replacement.start]);
1128
+ }
1129
+ result.push_str(&replacement.value);
1130
+ cursor = replacement.end;
1131
+ }
1132
+ if cursor < text.len() {
1133
+ result.push_str(&text[cursor..]);
1134
+ }
1135
+
1136
+ *buffer = TextBuffer::from_owned(result);
1137
+ Ok(())
1138
+ }
1139
+ }
1140
+
991
1141
  /// Type-erased glitchling operation for pipeline sequencing.
992
1142
  #[derive(Debug, Clone)]
993
1143
  pub enum GlitchOperation {
@@ -998,6 +1148,7 @@ pub enum GlitchOperation {
998
1148
  Ocr(OcrArtifactsOp),
999
1149
  Typo(TypoOp),
1000
1150
  ZeroWidth(ZeroWidthOp),
1151
+ QuotePairs(QuotePairsOp),
1001
1152
  }
1002
1153
 
1003
1154
  impl GlitchOp for GlitchOperation {
@@ -1010,6 +1161,7 @@ impl GlitchOp for GlitchOperation {
1010
1161
  GlitchOperation::Ocr(op) => op.apply(buffer, rng),
1011
1162
  GlitchOperation::Typo(op) => op.apply(buffer, rng),
1012
1163
  GlitchOperation::ZeroWidth(op) => op.apply(buffer, rng),
1164
+ GlitchOperation::QuotePairs(op) => op.apply(buffer, rng),
1013
1165
  }
1014
1166
  }
1015
1167
  }
@@ -15,8 +15,8 @@ use std::collections::HashMap;
15
15
  use std::sync::{Arc, OnceLock, RwLock};
16
16
 
17
17
  pub use glitch_ops::{
18
- DeleteRandomWordsOp, GlitchOpError, GlitchOperation, OcrArtifactsOp, RedactWordsOp,
19
- ReduplicateWordsOp, SwapAdjacentWordsOp, TypoOp, ZeroWidthOp,
18
+ DeleteRandomWordsOp, GlitchOpError, GlitchOperation, OcrArtifactsOp, QuotePairsOp,
19
+ RedactWordsOp, ReduplicateWordsOp, SwapAdjacentWordsOp, TypoOp, ZeroWidthOp,
20
20
  };
21
21
  pub use pipeline::{derive_seed, GlitchDescriptor, Pipeline, PipelineError};
22
22
  pub use rng::{PyRng, PyRngError};
@@ -193,6 +193,7 @@ enum PyGlitchOperation {
193
193
  rate: f64,
194
194
  characters: Vec<String>,
195
195
  },
196
+ QuotePairs,
196
197
  }
197
198
 
198
199
  impl<'py> FromPyObject<'py> for PyGlitchOperation {
@@ -308,6 +309,7 @@ impl<'py> FromPyObject<'py> for PyGlitchOperation {
308
309
  .unwrap_or_default();
309
310
  Ok(PyGlitchOperation::ZeroWidth { rate, characters })
310
311
  }
312
+ "apostrofae" | "quote_pairs" => Ok(PyGlitchOperation::QuotePairs),
311
313
  other => Err(PyValueError::new_err(format!(
312
314
  "unsupported operation type: {other}"
313
315
  ))),
@@ -363,6 +365,12 @@ fn swap_adjacent_words(text: &str, swap_rate: f64, rng: &Bound<'_, PyAny>) -> Py
363
365
  apply_operation(text, op, rng).map_err(glitch_ops::GlitchOpError::into_pyerr)
364
366
  }
365
367
 
368
+ #[pyfunction]
369
+ fn apostrofae(text: &str, rng: &Bound<'_, PyAny>) -> PyResult<String> {
370
+ let op = QuotePairsOp::default();
371
+ apply_operation(text, op, rng).map_err(glitch_ops::GlitchOpError::into_pyerr)
372
+ }
373
+
366
374
  #[pyfunction]
367
375
  fn ocr_artifacts(text: &str, error_rate: f64, rng: &Bound<'_, PyAny>) -> PyResult<String> {
368
376
  let op = OcrArtifactsOp { error_rate };
@@ -463,6 +471,9 @@ fn compose_glitchlings(
463
471
  PyGlitchOperation::ZeroWidth { rate, characters } => {
464
472
  GlitchOperation::ZeroWidth(glitch_ops::ZeroWidthOp { rate, characters })
465
473
  }
474
+ PyGlitchOperation::QuotePairs => {
475
+ GlitchOperation::QuotePairs(glitch_ops::QuotePairsOp::default())
476
+ }
466
477
  };
467
478
  Ok(GlitchDescriptor {
468
479
  name: descriptor.name,
@@ -481,6 +492,7 @@ fn _zoo_rust(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
481
492
  m.add_function(wrap_pyfunction!(reduplicate_words, m)?)?;
482
493
  m.add_function(wrap_pyfunction!(delete_random_words, m)?)?;
483
494
  m.add_function(wrap_pyfunction!(swap_adjacent_words, m)?)?;
495
+ m.add_function(wrap_pyfunction!(apostrofae, m)?)?;
484
496
  m.add_function(wrap_pyfunction!(ocr_artifacts, m)?)?;
485
497
  m.add_function(wrap_pyfunction!(redact_words, m)?)?;
486
498
  m.add_function(wrap_pyfunction!(plan_glitchlings, m)?)?;
@@ -1,5 +1,8 @@
1
1
  use once_cell::sync::Lazy;
2
2
  use regex::Regex;
3
+ use std::collections::HashMap;
4
+
5
+ const RAW_APOSTROFAE_PAIRS: &str = include_str!(concat!(env!("OUT_DIR"), "/apostrofae_pairs.json"));
3
6
 
4
7
  const RAW_OCR_CONFUSIONS: &str = include_str!(concat!(env!("OUT_DIR"), "/ocr_confusions.tsv"));
5
8
 
@@ -11,6 +14,23 @@ pub static SPACE_BEFORE_PUNCTUATION: Lazy<Regex> =
11
14
  pub static MULTIPLE_WHITESPACE: Lazy<Regex> =
12
15
  Lazy::new(|| Regex::new(r"\s{2,}").expect("valid multi-whitespace regex"));
13
16
 
17
+ /// Replacement pairs used by the Apostrofae glitchling.
18
+ pub static APOSTROFAE_PAIR_TABLE: Lazy<HashMap<char, Vec<(String, String)>>> = Lazy::new(|| {
19
+ let raw: HashMap<String, Vec<[String; 2]>> = serde_json::from_str(RAW_APOSTROFAE_PAIRS)
20
+ .expect("apostrofae pair table should be valid JSON");
21
+ let mut table: HashMap<char, Vec<(String, String)>> = HashMap::new();
22
+ for (key, pairs) in raw {
23
+ if let Some(ch) = key.chars().next() {
24
+ let entries: Vec<(String, String)> = pairs
25
+ .into_iter()
26
+ .map(|pair| (pair[0].to_string(), pair[1].to_string()))
27
+ .collect();
28
+ table.insert(ch, entries);
29
+ }
30
+ }
31
+ table
32
+ });
33
+
14
34
  /// Sorted confusion pairs reused by glitchling implementations.
15
35
  pub static OCR_CONFUSION_TABLE: Lazy<Vec<(&'static str, &'static [&'static str])>> =
16
36
  Lazy::new(|| {
@@ -50,6 +70,11 @@ pub fn confusion_table() -> &'static [(&'static str, &'static [&'static str])] {
50
70
  OCR_CONFUSION_TABLE.as_slice()
51
71
  }
52
72
 
73
+ /// Returns the Apostrofae replacement pairs keyed by the straight glyph.
74
+ pub fn apostrofae_pairs() -> &'static HashMap<char, Vec<(String, String)>> {
75
+ &APOSTROFAE_PAIR_TABLE
76
+ }
77
+
53
78
  #[inline]
54
79
  pub fn is_whitespace_only(s: &str) -> bool {
55
80
  s.chars().all(char::is_whitespace)
@@ -126,7 +151,7 @@ pub fn split_affixes(word: &str) -> (String, String, String) {
126
151
 
127
152
  #[cfg(test)]
128
153
  mod tests {
129
- use super::{confusion_table, split_affixes, split_with_separators};
154
+ use super::{apostrofae_pairs, confusion_table, split_affixes, split_with_separators};
130
155
 
131
156
  #[test]
132
157
  fn split_with_separators_matches_expected_boundaries() {
@@ -162,4 +187,13 @@ mod tests {
162
187
  a_src.len() >= b_src.len()
163
188
  }));
164
189
  }
190
+
191
+ #[test]
192
+ fn apostrofae_pairs_loaded_from_asset() {
193
+ let table = apostrofae_pairs();
194
+ assert!(table.contains_key(&'"'));
195
+ assert!(table.contains_key(&'\''));
196
+ assert!(table.contains_key(&'`'));
197
+ assert!(table.values().all(|entries| !entries.is_empty()));
198
+ }
165
199
  }