glitchlings 0.4.2__tar.gz → 0.4.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of glitchlings might be problematic. Click here for more details.
- {glitchlings-0.4.2 → glitchlings-0.4.3}/MANIFEST.in +1 -1
- {glitchlings-0.4.2 → glitchlings-0.4.3}/PKG-INFO +35 -2
- {glitchlings-0.4.2 → glitchlings-0.4.3}/README.md +12 -1
- {glitchlings-0.4.2 → glitchlings-0.4.3}/pyproject.toml +30 -4
- {glitchlings-0.4.2 → glitchlings-0.4.3}/rust/Cargo.lock +55 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/rust/zoo/Cargo.toml +1 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/rust/zoo/build.rs +47 -31
- {glitchlings-0.4.2 → glitchlings-0.4.3}/rust/zoo/src/glitch_ops.rs +154 -2
- {glitchlings-0.4.2 → glitchlings-0.4.3}/rust/zoo/src/lib.rs +14 -2
- {glitchlings-0.4.2 → glitchlings-0.4.3}/rust/zoo/src/resources.rs +35 -1
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/__init__.py +4 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/compat.py +80 -11
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/config.py +32 -19
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/config.toml +1 -1
- glitchlings-0.4.3/src/glitchlings/dlc/__init__.py +7 -0
- glitchlings-0.4.3/src/glitchlings/dlc/pytorch.py +216 -0
- glitchlings-0.4.3/src/glitchlings/dlc/pytorch_lightning.py +233 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/lexicon/__init__.py +5 -15
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/lexicon/_cache.py +21 -15
- glitchlings-0.4.3/src/glitchlings/lexicon/data/default_vector_cache.json +82 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/lexicon/vector.py +94 -15
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/lexicon/wordnet.py +66 -25
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/main.py +21 -11
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/__init__.py +5 -1
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/adjax.py +2 -2
- glitchlings-0.4.3/src/glitchlings/zoo/apostrofae.py +128 -0
- glitchlings-0.4.3/src/glitchlings/zoo/assets/__init__.py +0 -0
- glitchlings-0.4.3/src/glitchlings/zoo/assets/apostrofae_pairs.json +32 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/core.py +40 -14
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/jargoyle.py +44 -34
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/redactyl.py +11 -8
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/reduple.py +2 -2
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/rushmore.py +2 -2
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/scannequin.py +2 -2
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/typogre.py +5 -2
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/zeedub.py +5 -2
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings.egg-info/PKG-INFO +35 -2
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings.egg-info/SOURCES.txt +6 -2
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings.egg-info/requires.txt +26 -0
- glitchlings-0.4.2/src/glitchlings/dlc/__init__.py +0 -5
- glitchlings-0.4.2/src/glitchlings/lexicon/data/default_vector_cache.json +0 -16
- glitchlings-0.4.2/src/glitchlings/lexicon/graph.py +0 -282
- {glitchlings-0.4.2 → glitchlings-0.4.3}/LICENSE +0 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/rust/Cargo.toml +0 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/rust/zoo/assets/ocr_confusions.tsv +0 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/rust/zoo/src/pipeline.rs +0 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/rust/zoo/src/rng.rs +0 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/rust/zoo/src/text_buffer.rs +0 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/rust/zoo/src/typogre.rs +0 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/rust/zoo/src/zeedub.rs +0 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/setup.cfg +0 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/__main__.py +0 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/dlc/_shared.py +0 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/dlc/huggingface.py +0 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/dlc/prime.py +0 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/lexicon/metrics.py +0 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/util/__init__.py +0 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/util/adapters.py +0 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/_ocr_confusions.py +0 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/_rate.py +0 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/_sampling.py +0 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/_text_utils.py +0 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/mim1c.py +0 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings/zoo/ocr_confusions.tsv +0 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings.egg-info/dependency_links.txt +0 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings.egg-info/entry_points.txt +0 -0
- {glitchlings-0.4.2 → glitchlings-0.4.3}/src/glitchlings.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: glitchlings
|
|
3
|
-
Version: 0.4.2
|
|
3
|
+
Version: 0.4.3
|
|
4
4
|
Summary: Monsters for your language games.
|
|
5
5
|
Author: osoleve
|
|
6
6
|
License: Apache License
|
|
@@ -226,15 +226,37 @@ License-File: LICENSE
|
|
|
226
226
|
Requires-Dist: confusable-homoglyphs>=3.3.1
|
|
227
227
|
Requires-Dist: tomli>=2.0.1; python_version < "3.11"
|
|
228
228
|
Requires-Dist: pyyaml>=6.0.0
|
|
229
|
+
Provides-Extra: all
|
|
230
|
+
Requires-Dist: black>=24.4.0; extra == "all"
|
|
231
|
+
Requires-Dist: hypothesis>=6.140.0; extra == "all"
|
|
232
|
+
Requires-Dist: interrogate>=1.5.0; extra == "all"
|
|
233
|
+
Requires-Dist: jellyfish>=1.2.0; extra == "all"
|
|
234
|
+
Requires-Dist: isort>=5.13.0; extra == "all"
|
|
235
|
+
Requires-Dist: mkdocs>=1.6.0; extra == "all"
|
|
236
|
+
Requires-Dist: mkdocs-material>=9.5.0; extra == "all"
|
|
237
|
+
Requires-Dist: mkdocstrings[python]>=0.24.0; extra == "all"
|
|
238
|
+
Requires-Dist: mkdocstrings-python>=1.10.0; extra == "all"
|
|
239
|
+
Requires-Dist: mypy>=1.8.0; extra == "all"
|
|
240
|
+
Requires-Dist: numpy<=2.0,>=1.24; extra == "all"
|
|
241
|
+
Requires-Dist: pre-commit>=3.8.0; extra == "all"
|
|
242
|
+
Requires-Dist: pytest>=8.0.0; extra == "all"
|
|
243
|
+
Requires-Dist: ruff>=0.6.0; extra == "all"
|
|
244
|
+
Requires-Dist: verifiers>=0.1.3.post0; extra == "all"
|
|
229
245
|
Provides-Extra: hf
|
|
230
246
|
Requires-Dist: datasets>=4.0.0; extra == "hf"
|
|
247
|
+
Provides-Extra: lightning
|
|
248
|
+
Requires-Dist: pytorch_lightning>=2.0.0; extra == "lightning"
|
|
231
249
|
Provides-Extra: vectors
|
|
232
250
|
Requires-Dist: numpy<=2.0,>=1.24; extra == "vectors"
|
|
233
251
|
Requires-Dist: spacy>=3.7.2; extra == "vectors"
|
|
234
252
|
Requires-Dist: gensim>=4.3.2; extra == "vectors"
|
|
253
|
+
Provides-Extra: st
|
|
254
|
+
Requires-Dist: sentence-transformers>=3.0.0; extra == "st"
|
|
235
255
|
Provides-Extra: prime
|
|
236
256
|
Requires-Dist: verifiers>=0.1.3.post0; extra == "prime"
|
|
237
257
|
Requires-Dist: jellyfish>=1.2.0; extra == "prime"
|
|
258
|
+
Provides-Extra: torch
|
|
259
|
+
Requires-Dist: torch>=2.0.0; extra == "torch"
|
|
238
260
|
Provides-Extra: dev
|
|
239
261
|
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
240
262
|
Requires-Dist: hypothesis>=6.140.0; extra == "dev"
|
|
@@ -307,7 +329,7 @@ print(gaggle(SAMPLE_TEXT))
|
|
|
307
329
|
> Onҽ mھrning, wһen Gregor Samƽa woke from trouble𝐝 𝑑reams, he found himself transformed in his bed into a horrible vermin٠ He l lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightlh domed and divided by arches ino stiff sections. The bedding was adly able to cover it and and seemed ready to slide off any moment. His many legxs, pitifully thin compared with the size of the the rest of him, waved about helplessly ashe looked looked.
|
|
308
330
|
|
|
309
331
|
Consult the [Glitchlings Usage Guide](docs/index.md)
|
|
310
|
-
for end-to-end instructions spanning the Python API, CLI, HuggingFace and Prime Intellect
|
|
332
|
+
for end-to-end instructions spanning the Python API, CLI, HuggingFace, PyTorch, and Prime Intellect
|
|
311
333
|
integrations, and the autodetected Rust pipeline (enabled whenever the extension is present).
|
|
312
334
|
|
|
313
335
|
## Motivation
|
|
@@ -356,6 +378,7 @@ glitchlings --list
|
|
|
356
378
|
|
|
357
379
|
```text
|
|
358
380
|
Typogre — scope: Character, order: early
|
|
381
|
+
Apostrofae — scope: Character, order: normal
|
|
359
382
|
Mim1c — scope: Character, order: last
|
|
360
383
|
Jargoyle — scope: Word, order: normal
|
|
361
384
|
Adjax — scope: Word, order: normal
|
|
@@ -458,6 +481,16 @@ _What a nice word, would be a shame if something happened to it._
|
|
|
458
481
|
> - `keyboard (str)`: Keyboard layout key-neighbor map to use (default: "CURATOR_QWERTY"; also accepts "QWERTY", "DVORAK", "COLEMAK", and "AZERTY").
|
|
459
482
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
|
460
483
|
|
|
484
|
+
### Apostrofae
|
|
485
|
+
|
|
486
|
+
_It looks like you're trying to paste some text. Can I help?_
|
|
487
|
+
|
|
488
|
+
> _**Paperclip Manager.**_ Apostrofae scans for balanced runs of straight quotes, apostrophes, and backticks before replacing them with randomly sampled smart-quote pairs from a curated lookup table. The swap happens in-place so contractions and unpaired glyphs remain untouched.
|
|
489
|
+
>
|
|
490
|
+
> Args
|
|
491
|
+
>
|
|
492
|
+
> - `seed (int)`: Optional seed controlling the deterministic smart-quote sampling (default: 151).
|
|
493
|
+
|
|
461
494
|
### Mim1c
|
|
462
495
|
|
|
463
496
|
_Wait, was that...?_
|
|
@@ -54,7 +54,7 @@ print(gaggle(SAMPLE_TEXT))
|
|
|
54
54
|
> Onҽ mھrning, wһen Gregor Samƽa woke from trouble𝐝 𝑑reams, he found himself transformed in his bed into a horrible vermin٠ He l lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightlh domed and divided by arches ino stiff sections. The bedding was adly able to cover it and and seemed ready to slide off any moment. His many legxs, pitifully thin compared with the size of the the rest of him, waved about helplessly ashe looked looked.
|
|
55
55
|
|
|
56
56
|
Consult the [Glitchlings Usage Guide](docs/index.md)
|
|
57
|
-
for end-to-end instructions spanning the Python API, CLI, HuggingFace and Prime Intellect
|
|
57
|
+
for end-to-end instructions spanning the Python API, CLI, HuggingFace, PyTorch, and Prime Intellect
|
|
58
58
|
integrations, and the autodetected Rust pipeline (enabled whenever the extension is present).
|
|
59
59
|
|
|
60
60
|
## Motivation
|
|
@@ -103,6 +103,7 @@ glitchlings --list
|
|
|
103
103
|
|
|
104
104
|
```text
|
|
105
105
|
Typogre — scope: Character, order: early
|
|
106
|
+
Apostrofae — scope: Character, order: normal
|
|
106
107
|
Mim1c — scope: Character, order: last
|
|
107
108
|
Jargoyle — scope: Word, order: normal
|
|
108
109
|
Adjax — scope: Word, order: normal
|
|
@@ -205,6 +206,16 @@ _What a nice word, would be a shame if something happened to it._
|
|
|
205
206
|
> - `keyboard (str)`: Keyboard layout key-neighbor map to use (default: "CURATOR_QWERTY"; also accepts "QWERTY", "DVORAK", "COLEMAK", and "AZERTY").
|
|
206
207
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
|
207
208
|
|
|
209
|
+
### Apostrofae
|
|
210
|
+
|
|
211
|
+
_It looks like you're trying to paste some text. Can I help?_
|
|
212
|
+
|
|
213
|
+
> _**Paperclip Manager.**_ Apostrofae scans for balanced runs of straight quotes, apostrophes, and backticks before replacing them with randomly sampled smart-quote pairs from a curated lookup table. The swap happens in-place so contractions and unpaired glyphs remain untouched.
|
|
214
|
+
>
|
|
215
|
+
> Args
|
|
216
|
+
>
|
|
217
|
+
> - `seed (int)`: Optional seed controlling the deterministic smart-quote sampling (default: 151).
|
|
218
|
+
|
|
208
219
|
### Mim1c
|
|
209
220
|
|
|
210
221
|
_Wait, was that...?_
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "glitchlings"
|
|
3
|
-
version = "0.4.2"
|
|
3
|
+
version = "0.4.3"
|
|
4
4
|
description = "Monsters for your language games."
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.10"
|
|
@@ -49,9 +49,29 @@ Changelog = "https://github.com/osoleve/glitchlings/releases"
|
|
|
49
49
|
glitchlings = "glitchlings.main:main"
|
|
50
50
|
|
|
51
51
|
[project.optional-dependencies]
|
|
52
|
+
all = [
|
|
53
|
+
"black>=24.4.0",
|
|
54
|
+
"hypothesis>=6.140.0",
|
|
55
|
+
"interrogate>=1.5.0",
|
|
56
|
+
"jellyfish>=1.2.0",
|
|
57
|
+
"isort>=5.13.0",
|
|
58
|
+
"mkdocs>=1.6.0",
|
|
59
|
+
"mkdocs-material>=9.5.0",
|
|
60
|
+
"mkdocstrings[python]>=0.24.0",
|
|
61
|
+
"mkdocstrings-python>=1.10.0",
|
|
62
|
+
"mypy>=1.8.0",
|
|
63
|
+
"numpy>=1.24,<=2.0",
|
|
64
|
+
"pre-commit>=3.8.0",
|
|
65
|
+
"pytest>=8.0.0",
|
|
66
|
+
"ruff>=0.6.0",
|
|
67
|
+
"verifiers>=0.1.3.post0",
|
|
68
|
+
]
|
|
52
69
|
hf = ["datasets>=4.0.0"]
|
|
70
|
+
lightning = ["pytorch_lightning>=2.0.0"]
|
|
53
71
|
vectors = ["numpy>=1.24,<=2.0", "spacy>=3.7.2", "gensim>=4.3.2"]
|
|
72
|
+
st = ["sentence-transformers>=3.0.0"]
|
|
54
73
|
prime = ["verifiers>=0.1.3.post0", "jellyfish>=1.2.0"]
|
|
74
|
+
torch = ["torch>=2.0.0"]
|
|
55
75
|
dev = [
|
|
56
76
|
"pytest>=8.0.0",
|
|
57
77
|
"hypothesis>=6.140.0",
|
|
@@ -80,6 +100,7 @@ include-package-data = true
|
|
|
80
100
|
"glitchlings" = ["config.toml"]
|
|
81
101
|
"glitchlings.lexicon" = ["data/*.json"]
|
|
82
102
|
"glitchlings.zoo" = ["ocr_confusions.tsv"]
|
|
103
|
+
"glitchlings.zoo.assets" = ["apostrofae_pairs.json"]
|
|
83
104
|
|
|
84
105
|
[tool.setuptools.packages.find]
|
|
85
106
|
where = ["src"]
|
|
@@ -151,10 +172,15 @@ module = [
|
|
|
151
172
|
"glitchlings.config",
|
|
152
173
|
"glitchlings.lexicon",
|
|
153
174
|
"glitchlings.lexicon.*",
|
|
154
|
-
|
|
155
|
-
|
|
175
|
+
]
|
|
176
|
+
strict = true
|
|
177
|
+
|
|
178
|
+
[[tool.mypy.overrides]]
|
|
179
|
+
module = [
|
|
156
180
|
"glitchlings.main",
|
|
157
181
|
"glitchlings.__main__",
|
|
158
182
|
"glitchlings.__init__",
|
|
183
|
+
"glitchlings.zoo",
|
|
184
|
+
"glitchlings.zoo.*",
|
|
159
185
|
]
|
|
160
|
-
|
|
186
|
+
strict = true
|
|
@@ -90,6 +90,12 @@ version = "2.0.6"
|
|
|
90
90
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
91
91
|
checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd"
|
|
92
92
|
|
|
93
|
+
[[package]]
|
|
94
|
+
name = "itoa"
|
|
95
|
+
version = "1.0.15"
|
|
96
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
97
|
+
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
|
|
98
|
+
|
|
93
99
|
[[package]]
|
|
94
100
|
name = "libc"
|
|
95
101
|
version = "0.2.176"
|
|
@@ -275,12 +281,60 @@ version = "0.8.6"
|
|
|
275
281
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
276
282
|
checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001"
|
|
277
283
|
|
|
284
|
+
[[package]]
|
|
285
|
+
name = "ryu"
|
|
286
|
+
version = "1.0.20"
|
|
287
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
288
|
+
checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
|
|
289
|
+
|
|
278
290
|
[[package]]
|
|
279
291
|
name = "scopeguard"
|
|
280
292
|
version = "1.2.0"
|
|
281
293
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
282
294
|
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
|
|
283
295
|
|
|
296
|
+
[[package]]
|
|
297
|
+
name = "serde"
|
|
298
|
+
version = "1.0.228"
|
|
299
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
300
|
+
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
|
|
301
|
+
dependencies = [
|
|
302
|
+
"serde_core",
|
|
303
|
+
]
|
|
304
|
+
|
|
305
|
+
[[package]]
|
|
306
|
+
name = "serde_core"
|
|
307
|
+
version = "1.0.228"
|
|
308
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
309
|
+
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
|
|
310
|
+
dependencies = [
|
|
311
|
+
"serde_derive",
|
|
312
|
+
]
|
|
313
|
+
|
|
314
|
+
[[package]]
|
|
315
|
+
name = "serde_derive"
|
|
316
|
+
version = "1.0.228"
|
|
317
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
318
|
+
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
|
|
319
|
+
dependencies = [
|
|
320
|
+
"proc-macro2",
|
|
321
|
+
"quote",
|
|
322
|
+
"syn",
|
|
323
|
+
]
|
|
324
|
+
|
|
325
|
+
[[package]]
|
|
326
|
+
name = "serde_json"
|
|
327
|
+
version = "1.0.145"
|
|
328
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
329
|
+
checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c"
|
|
330
|
+
dependencies = [
|
|
331
|
+
"itoa",
|
|
332
|
+
"memchr",
|
|
333
|
+
"ryu",
|
|
334
|
+
"serde",
|
|
335
|
+
"serde_core",
|
|
336
|
+
]
|
|
337
|
+
|
|
284
338
|
[[package]]
|
|
285
339
|
name = "smallvec"
|
|
286
340
|
version = "1.15.1"
|
|
@@ -407,5 +461,6 @@ dependencies = [
|
|
|
407
461
|
"pyo3",
|
|
408
462
|
"pyo3-build-config",
|
|
409
463
|
"regex",
|
|
464
|
+
"serde_json",
|
|
410
465
|
"smallvec",
|
|
411
466
|
]
|
|
@@ -6,7 +6,9 @@ use std::path::PathBuf;
|
|
|
6
6
|
use std::process::Command;
|
|
7
7
|
|
|
8
8
|
fn main() {
|
|
9
|
-
|
|
9
|
+
stage_asset("ocr_confusions.tsv").expect("failed to stage OCR confusion table for compilation");
|
|
10
|
+
stage_asset("apostrofae_pairs.json")
|
|
11
|
+
.expect("failed to stage Apostrofae replacement table for compilation");
|
|
10
12
|
pyo3_build_config::add_extension_module_link_args();
|
|
11
13
|
|
|
12
14
|
// Only perform custom Python linking on non-Linux platforms.
|
|
@@ -97,46 +99,60 @@ fn query_python(python: &OsStr, command: &str) -> Option<String> {
|
|
|
97
99
|
Some(value)
|
|
98
100
|
}
|
|
99
101
|
|
|
100
|
-
fn
|
|
102
|
+
fn stage_asset(asset_name: &str) -> io::Result<()> {
|
|
101
103
|
let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").expect("missing manifest dir"));
|
|
102
104
|
let out_dir = PathBuf::from(env::var("OUT_DIR").expect("missing OUT_DIR"));
|
|
103
105
|
|
|
104
|
-
let
|
|
105
|
-
|
|
106
|
+
let repo_candidates = [
|
|
107
|
+
manifest_dir
|
|
108
|
+
.join("../../src/glitchlings/zoo/assets")
|
|
109
|
+
.join(asset_name),
|
|
110
|
+
manifest_dir
|
|
111
|
+
.join("../../src/glitchlings/zoo")
|
|
112
|
+
.join(asset_name),
|
|
113
|
+
];
|
|
114
|
+
let packaged_path = manifest_dir.join("assets").join(asset_name);
|
|
106
115
|
println!("cargo:rerun-if-changed={}", packaged_path.display());
|
|
107
116
|
|
|
108
|
-
let source_path =
|
|
109
|
-
|
|
110
|
-
if
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
117
|
+
let mut source_path: Option<PathBuf> = None;
|
|
118
|
+
for candidate in &repo_candidates {
|
|
119
|
+
if candidate.exists() {
|
|
120
|
+
println!("cargo:rerun-if-changed={}", candidate.display());
|
|
121
|
+
if packaged_path.exists() {
|
|
122
|
+
let repo_bytes = fs::read(candidate)?;
|
|
123
|
+
let packaged_bytes = fs::read(&packaged_path)?;
|
|
124
|
+
if repo_bytes != packaged_bytes {
|
|
125
|
+
return Err(io::Error::new(
|
|
126
|
+
ErrorKind::Other,
|
|
127
|
+
format!(
|
|
128
|
+
"asset {} is out of sync with {}",
|
|
129
|
+
packaged_path.display(),
|
|
130
|
+
candidate.display()
|
|
131
|
+
),
|
|
132
|
+
));
|
|
133
|
+
}
|
|
122
134
|
}
|
|
135
|
+
source_path = Some(candidate.clone());
|
|
136
|
+
break;
|
|
123
137
|
}
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
format!(
|
|
130
|
-
"missing OCR confusion table; looked for {} and {}",
|
|
131
|
-
repo_path.display(),
|
|
132
|
-
packaged_path.display()
|
|
133
|
-
),
|
|
134
|
-
));
|
|
135
|
-
}
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
let source_path = if let Some(path) = source_path {
|
|
141
|
+
path
|
|
142
|
+
} else if packaged_path.exists() {
|
|
136
143
|
packaged_path
|
|
144
|
+
} else {
|
|
145
|
+
return Err(io::Error::new(
|
|
146
|
+
ErrorKind::NotFound,
|
|
147
|
+
format!(
|
|
148
|
+
"missing asset {asset_name}; looked for {} and {}",
|
|
149
|
+
repo_candidates[0].display(),
|
|
150
|
+
packaged_path.display()
|
|
151
|
+
),
|
|
152
|
+
));
|
|
137
153
|
};
|
|
138
154
|
|
|
139
155
|
fs::create_dir_all(&out_dir)?;
|
|
140
|
-
fs::copy(&source_path, out_dir.join(
|
|
156
|
+
fs::copy(&source_path, out_dir.join(asset_name))?;
|
|
141
157
|
Ok(())
|
|
142
158
|
}
|
|
@@ -6,8 +6,8 @@ use std::collections::HashMap;
|
|
|
6
6
|
use std::sync::{Mutex, OnceLock};
|
|
7
7
|
|
|
8
8
|
use crate::resources::{
|
|
9
|
-
affix_bounds, confusion_table, is_whitespace_only, split_affixes,
|
|
10
|
-
SPACE_BEFORE_PUNCTUATION,
|
|
9
|
+
affix_bounds, apostrofae_pairs, confusion_table, is_whitespace_only, split_affixes,
|
|
10
|
+
MULTIPLE_WHITESPACE, SPACE_BEFORE_PUNCTUATION,
|
|
11
11
|
};
|
|
12
12
|
use crate::rng::{PyRng, PyRngError};
|
|
13
13
|
use crate::text_buffer::{SegmentKind, TextBuffer, TextBufferError};
|
|
@@ -988,6 +988,156 @@ impl GlitchOp for TypoOp {
|
|
|
988
988
|
}
|
|
989
989
|
}
|
|
990
990
|
|
|
991
|
+
#[derive(Clone, Copy, Debug)]
|
|
992
|
+
enum QuoteKind {
|
|
993
|
+
Double,
|
|
994
|
+
Single,
|
|
995
|
+
Backtick,
|
|
996
|
+
}
|
|
997
|
+
|
|
998
|
+
impl QuoteKind {
|
|
999
|
+
fn from_char(ch: char) -> Option<Self> {
|
|
1000
|
+
match ch {
|
|
1001
|
+
'"' => Some(Self::Double),
|
|
1002
|
+
'\'' => Some(Self::Single),
|
|
1003
|
+
'`' => Some(Self::Backtick),
|
|
1004
|
+
_ => None,
|
|
1005
|
+
}
|
|
1006
|
+
}
|
|
1007
|
+
|
|
1008
|
+
fn as_char(self) -> char {
|
|
1009
|
+
match self {
|
|
1010
|
+
Self::Double => '"',
|
|
1011
|
+
Self::Single => '\'',
|
|
1012
|
+
Self::Backtick => '`',
|
|
1013
|
+
}
|
|
1014
|
+
}
|
|
1015
|
+
|
|
1016
|
+
fn index(self) -> usize {
|
|
1017
|
+
match self {
|
|
1018
|
+
Self::Double => 0,
|
|
1019
|
+
Self::Single => 1,
|
|
1020
|
+
Self::Backtick => 2,
|
|
1021
|
+
}
|
|
1022
|
+
}
|
|
1023
|
+
}
|
|
1024
|
+
|
|
1025
|
+
#[derive(Debug, Clone, Copy)]
|
|
1026
|
+
struct QuotePair {
|
|
1027
|
+
start: usize,
|
|
1028
|
+
end: usize,
|
|
1029
|
+
kind: QuoteKind,
|
|
1030
|
+
}
|
|
1031
|
+
|
|
1032
|
+
#[derive(Debug)]
|
|
1033
|
+
struct Replacement {
|
|
1034
|
+
start: usize,
|
|
1035
|
+
end: usize,
|
|
1036
|
+
value: String,
|
|
1037
|
+
}
|
|
1038
|
+
|
|
1039
|
+
#[derive(Debug, Default, Clone, Copy)]
|
|
1040
|
+
pub struct QuotePairsOp;
|
|
1041
|
+
|
|
1042
|
+
impl QuotePairsOp {
|
|
1043
|
+
fn collect_pairs(text: &str) -> Vec<QuotePair> {
|
|
1044
|
+
let mut pairs: Vec<QuotePair> = Vec::new();
|
|
1045
|
+
let mut stack: [Option<usize>; 3] = [None, None, None];
|
|
1046
|
+
|
|
1047
|
+
for (idx, ch) in text.char_indices() {
|
|
1048
|
+
if let Some(kind) = QuoteKind::from_char(ch) {
|
|
1049
|
+
let slot = kind.index();
|
|
1050
|
+
if let Some(start) = stack[slot] {
|
|
1051
|
+
pairs.push(QuotePair {
|
|
1052
|
+
start,
|
|
1053
|
+
end: idx,
|
|
1054
|
+
kind,
|
|
1055
|
+
});
|
|
1056
|
+
stack[slot] = None;
|
|
1057
|
+
} else {
|
|
1058
|
+
stack[slot] = Some(idx);
|
|
1059
|
+
}
|
|
1060
|
+
}
|
|
1061
|
+
}
|
|
1062
|
+
|
|
1063
|
+
pairs
|
|
1064
|
+
}
|
|
1065
|
+
}
|
|
1066
|
+
|
|
1067
|
+
impl GlitchOp for QuotePairsOp {
|
|
1068
|
+
fn apply(&self, buffer: &mut TextBuffer, rng: &mut dyn GlitchRng) -> Result<(), GlitchOpError> {
|
|
1069
|
+
let text = buffer.to_string();
|
|
1070
|
+
if text.is_empty() {
|
|
1071
|
+
return Ok(());
|
|
1072
|
+
}
|
|
1073
|
+
|
|
1074
|
+
let pairs = Self::collect_pairs(&text);
|
|
1075
|
+
if pairs.is_empty() {
|
|
1076
|
+
return Ok(());
|
|
1077
|
+
}
|
|
1078
|
+
|
|
1079
|
+
let table = apostrofae_pairs();
|
|
1080
|
+
if table.is_empty() {
|
|
1081
|
+
return Ok(());
|
|
1082
|
+
}
|
|
1083
|
+
|
|
1084
|
+
let mut replacements: Vec<Replacement> = Vec::with_capacity(pairs.len() * 2);
|
|
1085
|
+
|
|
1086
|
+
for pair in pairs {
|
|
1087
|
+
let key = pair.kind.as_char();
|
|
1088
|
+
let Some(options) = table.get(&key) else {
|
|
1089
|
+
continue;
|
|
1090
|
+
};
|
|
1091
|
+
if options.is_empty() {
|
|
1092
|
+
continue;
|
|
1093
|
+
}
|
|
1094
|
+
let choice = rng.rand_index(options.len())?;
|
|
1095
|
+
let (left, right) = &options[choice];
|
|
1096
|
+
let glyph_len = pair.kind.as_char().len_utf8();
|
|
1097
|
+
replacements.push(Replacement {
|
|
1098
|
+
start: pair.start,
|
|
1099
|
+
end: pair.start + glyph_len,
|
|
1100
|
+
value: left.clone(),
|
|
1101
|
+
});
|
|
1102
|
+
replacements.push(Replacement {
|
|
1103
|
+
start: pair.end,
|
|
1104
|
+
end: pair.end + glyph_len,
|
|
1105
|
+
value: right.clone(),
|
|
1106
|
+
});
|
|
1107
|
+
}
|
|
1108
|
+
|
|
1109
|
+
if replacements.is_empty() {
|
|
1110
|
+
return Ok(());
|
|
1111
|
+
}
|
|
1112
|
+
|
|
1113
|
+
replacements.sort_by_key(|replacement| replacement.start);
|
|
1114
|
+
let mut extra_capacity = 0usize;
|
|
1115
|
+
for replacement in &replacements {
|
|
1116
|
+
let span = replacement.end - replacement.start;
|
|
1117
|
+
if replacement.value.len() > span {
|
|
1118
|
+
extra_capacity += replacement.value.len() - span;
|
|
1119
|
+
}
|
|
1120
|
+
}
|
|
1121
|
+
|
|
1122
|
+
let mut result = String::with_capacity(text.len() + extra_capacity);
|
|
1123
|
+
let mut cursor = 0usize;
|
|
1124
|
+
|
|
1125
|
+
for replacement in replacements {
|
|
1126
|
+
if cursor < replacement.start {
|
|
1127
|
+
result.push_str(&text[cursor..replacement.start]);
|
|
1128
|
+
}
|
|
1129
|
+
result.push_str(&replacement.value);
|
|
1130
|
+
cursor = replacement.end;
|
|
1131
|
+
}
|
|
1132
|
+
if cursor < text.len() {
|
|
1133
|
+
result.push_str(&text[cursor..]);
|
|
1134
|
+
}
|
|
1135
|
+
|
|
1136
|
+
*buffer = TextBuffer::from_owned(result);
|
|
1137
|
+
Ok(())
|
|
1138
|
+
}
|
|
1139
|
+
}
|
|
1140
|
+
|
|
991
1141
|
/// Type-erased glitchling operation for pipeline sequencing.
|
|
992
1142
|
#[derive(Debug, Clone)]
|
|
993
1143
|
pub enum GlitchOperation {
|
|
@@ -998,6 +1148,7 @@ pub enum GlitchOperation {
|
|
|
998
1148
|
Ocr(OcrArtifactsOp),
|
|
999
1149
|
Typo(TypoOp),
|
|
1000
1150
|
ZeroWidth(ZeroWidthOp),
|
|
1151
|
+
QuotePairs(QuotePairsOp),
|
|
1001
1152
|
}
|
|
1002
1153
|
|
|
1003
1154
|
impl GlitchOp for GlitchOperation {
|
|
@@ -1010,6 +1161,7 @@ impl GlitchOp for GlitchOperation {
|
|
|
1010
1161
|
GlitchOperation::Ocr(op) => op.apply(buffer, rng),
|
|
1011
1162
|
GlitchOperation::Typo(op) => op.apply(buffer, rng),
|
|
1012
1163
|
GlitchOperation::ZeroWidth(op) => op.apply(buffer, rng),
|
|
1164
|
+
GlitchOperation::QuotePairs(op) => op.apply(buffer, rng),
|
|
1013
1165
|
}
|
|
1014
1166
|
}
|
|
1015
1167
|
}
|
|
@@ -15,8 +15,8 @@ use std::collections::HashMap;
|
|
|
15
15
|
use std::sync::{Arc, OnceLock, RwLock};
|
|
16
16
|
|
|
17
17
|
pub use glitch_ops::{
|
|
18
|
-
DeleteRandomWordsOp, GlitchOpError, GlitchOperation, OcrArtifactsOp,
|
|
19
|
-
ReduplicateWordsOp, SwapAdjacentWordsOp, TypoOp, ZeroWidthOp,
|
|
18
|
+
DeleteRandomWordsOp, GlitchOpError, GlitchOperation, OcrArtifactsOp, QuotePairsOp,
|
|
19
|
+
RedactWordsOp, ReduplicateWordsOp, SwapAdjacentWordsOp, TypoOp, ZeroWidthOp,
|
|
20
20
|
};
|
|
21
21
|
pub use pipeline::{derive_seed, GlitchDescriptor, Pipeline, PipelineError};
|
|
22
22
|
pub use rng::{PyRng, PyRngError};
|
|
@@ -193,6 +193,7 @@ enum PyGlitchOperation {
|
|
|
193
193
|
rate: f64,
|
|
194
194
|
characters: Vec<String>,
|
|
195
195
|
},
|
|
196
|
+
QuotePairs,
|
|
196
197
|
}
|
|
197
198
|
|
|
198
199
|
impl<'py> FromPyObject<'py> for PyGlitchOperation {
|
|
@@ -308,6 +309,7 @@ impl<'py> FromPyObject<'py> for PyGlitchOperation {
|
|
|
308
309
|
.unwrap_or_default();
|
|
309
310
|
Ok(PyGlitchOperation::ZeroWidth { rate, characters })
|
|
310
311
|
}
|
|
312
|
+
"apostrofae" | "quote_pairs" => Ok(PyGlitchOperation::QuotePairs),
|
|
311
313
|
other => Err(PyValueError::new_err(format!(
|
|
312
314
|
"unsupported operation type: {other}"
|
|
313
315
|
))),
|
|
@@ -363,6 +365,12 @@ fn swap_adjacent_words(text: &str, swap_rate: f64, rng: &Bound<'_, PyAny>) -> Py
|
|
|
363
365
|
apply_operation(text, op, rng).map_err(glitch_ops::GlitchOpError::into_pyerr)
|
|
364
366
|
}
|
|
365
367
|
|
|
368
|
+
#[pyfunction]
|
|
369
|
+
fn apostrofae(text: &str, rng: &Bound<'_, PyAny>) -> PyResult<String> {
|
|
370
|
+
let op = QuotePairsOp::default();
|
|
371
|
+
apply_operation(text, op, rng).map_err(glitch_ops::GlitchOpError::into_pyerr)
|
|
372
|
+
}
|
|
373
|
+
|
|
366
374
|
#[pyfunction]
|
|
367
375
|
fn ocr_artifacts(text: &str, error_rate: f64, rng: &Bound<'_, PyAny>) -> PyResult<String> {
|
|
368
376
|
let op = OcrArtifactsOp { error_rate };
|
|
@@ -463,6 +471,9 @@ fn compose_glitchlings(
|
|
|
463
471
|
PyGlitchOperation::ZeroWidth { rate, characters } => {
|
|
464
472
|
GlitchOperation::ZeroWidth(glitch_ops::ZeroWidthOp { rate, characters })
|
|
465
473
|
}
|
|
474
|
+
PyGlitchOperation::QuotePairs => {
|
|
475
|
+
GlitchOperation::QuotePairs(glitch_ops::QuotePairsOp::default())
|
|
476
|
+
}
|
|
466
477
|
};
|
|
467
478
|
Ok(GlitchDescriptor {
|
|
468
479
|
name: descriptor.name,
|
|
@@ -481,6 +492,7 @@ fn _zoo_rust(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
|
|
|
481
492
|
m.add_function(wrap_pyfunction!(reduplicate_words, m)?)?;
|
|
482
493
|
m.add_function(wrap_pyfunction!(delete_random_words, m)?)?;
|
|
483
494
|
m.add_function(wrap_pyfunction!(swap_adjacent_words, m)?)?;
|
|
495
|
+
m.add_function(wrap_pyfunction!(apostrofae, m)?)?;
|
|
484
496
|
m.add_function(wrap_pyfunction!(ocr_artifacts, m)?)?;
|
|
485
497
|
m.add_function(wrap_pyfunction!(redact_words, m)?)?;
|
|
486
498
|
m.add_function(wrap_pyfunction!(plan_glitchlings, m)?)?;
|
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
use once_cell::sync::Lazy;
|
|
2
2
|
use regex::Regex;
|
|
3
|
+
use std::collections::HashMap;
|
|
4
|
+
|
|
5
|
+
const RAW_APOSTROFAE_PAIRS: &str = include_str!(concat!(env!("OUT_DIR"), "/apostrofae_pairs.json"));
|
|
3
6
|
|
|
4
7
|
const RAW_OCR_CONFUSIONS: &str = include_str!(concat!(env!("OUT_DIR"), "/ocr_confusions.tsv"));
|
|
5
8
|
|
|
@@ -11,6 +14,23 @@ pub static SPACE_BEFORE_PUNCTUATION: Lazy<Regex> =
|
|
|
11
14
|
pub static MULTIPLE_WHITESPACE: Lazy<Regex> =
|
|
12
15
|
Lazy::new(|| Regex::new(r"\s{2,}").expect("valid multi-whitespace regex"));
|
|
13
16
|
|
|
17
|
+
/// Replacement pairs used by the Apostrofae glitchling.
|
|
18
|
+
pub static APOSTROFAE_PAIR_TABLE: Lazy<HashMap<char, Vec<(String, String)>>> = Lazy::new(|| {
|
|
19
|
+
let raw: HashMap<String, Vec<[String; 2]>> = serde_json::from_str(RAW_APOSTROFAE_PAIRS)
|
|
20
|
+
.expect("apostrofae pair table should be valid JSON");
|
|
21
|
+
let mut table: HashMap<char, Vec<(String, String)>> = HashMap::new();
|
|
22
|
+
for (key, pairs) in raw {
|
|
23
|
+
if let Some(ch) = key.chars().next() {
|
|
24
|
+
let entries: Vec<(String, String)> = pairs
|
|
25
|
+
.into_iter()
|
|
26
|
+
.map(|pair| (pair[0].to_string(), pair[1].to_string()))
|
|
27
|
+
.collect();
|
|
28
|
+
table.insert(ch, entries);
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
table
|
|
32
|
+
});
|
|
33
|
+
|
|
14
34
|
/// Sorted confusion pairs reused by glitchling implementations.
|
|
15
35
|
pub static OCR_CONFUSION_TABLE: Lazy<Vec<(&'static str, &'static [&'static str])>> =
|
|
16
36
|
Lazy::new(|| {
|
|
@@ -50,6 +70,11 @@ pub fn confusion_table() -> &'static [(&'static str, &'static [&'static str])] {
|
|
|
50
70
|
OCR_CONFUSION_TABLE.as_slice()
|
|
51
71
|
}
|
|
52
72
|
|
|
73
|
+
/// Returns the Apostrofae replacement pairs keyed by the straight glyph.
|
|
74
|
+
pub fn apostrofae_pairs() -> &'static HashMap<char, Vec<(String, String)>> {
|
|
75
|
+
&APOSTROFAE_PAIR_TABLE
|
|
76
|
+
}
|
|
77
|
+
|
|
53
78
|
#[inline]
|
|
54
79
|
pub fn is_whitespace_only(s: &str) -> bool {
|
|
55
80
|
s.chars().all(char::is_whitespace)
|
|
@@ -126,7 +151,7 @@ pub fn split_affixes(word: &str) -> (String, String, String) {
|
|
|
126
151
|
|
|
127
152
|
#[cfg(test)]
|
|
128
153
|
mod tests {
|
|
129
|
-
use super::{confusion_table, split_affixes, split_with_separators};
|
|
154
|
+
use super::{apostrofae_pairs, confusion_table, split_affixes, split_with_separators};
|
|
130
155
|
|
|
131
156
|
#[test]
|
|
132
157
|
fn split_with_separators_matches_expected_boundaries() {
|
|
@@ -162,4 +187,13 @@ mod tests {
|
|
|
162
187
|
a_src.len() >= b_src.len()
|
|
163
188
|
}));
|
|
164
189
|
}
|
|
190
|
+
|
|
191
|
+
#[test]
|
|
192
|
+
fn apostrofae_pairs_loaded_from_asset() {
|
|
193
|
+
let table = apostrofae_pairs();
|
|
194
|
+
assert!(table.contains_key(&'"'));
|
|
195
|
+
assert!(table.contains_key(&'\''));
|
|
196
|
+
assert!(table.contains_key(&'`'));
|
|
197
|
+
assert!(table.values().all(|entries| !entries.is_empty()));
|
|
198
|
+
}
|
|
165
199
|
}
|