glitchlings 0.2.1__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings-0.2.2/MANIFEST.in +4 -0
- {glitchlings-0.2.1 → glitchlings-0.2.2}/PKG-INFO +12 -18
- {glitchlings-0.2.1 → glitchlings-0.2.2}/README.md +8 -17
- {glitchlings-0.2.1 → glitchlings-0.2.2}/pyproject.toml +8 -7
- {glitchlings-0.2.1 → glitchlings-0.2.2}/rust/Cargo.lock +0 -7
- {glitchlings-0.2.1 → glitchlings-0.2.2}/rust/Cargo.toml +0 -1
- glitchlings-0.2.2/rust/zoo/assets/ocr_confusions.tsv +30 -0
- glitchlings-0.2.2/rust/zoo/build.rs +134 -0
- {glitchlings-0.2.1 → glitchlings-0.2.2}/rust/zoo/src/glitch_ops.rs +1 -1
- {glitchlings-0.2.1 → glitchlings-0.2.2}/rust/zoo/src/lib.rs +2 -1
- {glitchlings-0.2.1 → glitchlings-0.2.2}/rust/zoo/src/resources.rs +24 -34
- glitchlings-0.2.1/rust/typogre/src/lib.rs → glitchlings-0.2.2/rust/zoo/src/typogre.rs +3 -9
- {glitchlings-0.2.1 → glitchlings-0.2.2}/src/glitchlings/main.py +17 -39
- {glitchlings-0.2.1 → glitchlings-0.2.2}/src/glitchlings/util/__init__.py +30 -0
- glitchlings-0.2.2/src/glitchlings/zoo/__init__.py +134 -0
- glitchlings-0.2.2/src/glitchlings/zoo/_ocr_confusions.py +34 -0
- {glitchlings-0.2.1 → glitchlings-0.2.2}/src/glitchlings/zoo/jargoyle.py +53 -11
- glitchlings-0.2.2/src/glitchlings/zoo/ocr_confusions.tsv +30 -0
- {glitchlings-0.2.1 → glitchlings-0.2.2}/src/glitchlings/zoo/redactyl.py +3 -1
- {glitchlings-0.2.1 → glitchlings-0.2.2}/src/glitchlings/zoo/scannequin.py +4 -29
- {glitchlings-0.2.1 → glitchlings-0.2.2}/src/glitchlings/zoo/typogre.py +12 -4
- {glitchlings-0.2.1 → glitchlings-0.2.2}/src/glitchlings.egg-info/PKG-INFO +12 -18
- {glitchlings-0.2.1 → glitchlings-0.2.2}/src/glitchlings.egg-info/SOURCES.txt +4 -2
- {glitchlings-0.2.1 → glitchlings-0.2.2}/src/glitchlings.egg-info/requires.txt +3 -0
- {glitchlings-0.2.1 → glitchlings-0.2.2}/tests/test_cli.py +29 -1
- {glitchlings-0.2.1 → glitchlings-0.2.2}/tests/test_gaggle.py +19 -6
- {glitchlings-0.2.1 → glitchlings-0.2.2}/tests/test_glitchlings_determinism.py +0 -11
- {glitchlings-0.2.1 → glitchlings-0.2.2}/tests/test_jargoyle.py +1 -11
- {glitchlings-0.2.1 → glitchlings-0.2.2}/tests/test_keyboard_layouts.py +18 -1
- {glitchlings-0.2.1 → glitchlings-0.2.2}/tests/test_parameter_effects.py +6 -1
- glitchlings-0.2.2/tests/test_prime_echo_chamber.py +205 -0
- {glitchlings-0.2.1 → glitchlings-0.2.2}/tests/test_property_based.py +1 -1
- {glitchlings-0.2.1 → glitchlings-0.2.2}/tests/test_rust_backed_glitchlings.py +41 -1
- glitchlings-0.2.1/MANIFEST.in +0 -4
- glitchlings-0.2.1/rust/typogre/Cargo.toml +0 -14
- glitchlings-0.2.1/rust/zoo/build.rs +0 -60
- glitchlings-0.2.1/src/glitchlings/zoo/__init__.py +0 -57
- glitchlings-0.2.1/tests/test_prime_echo_chamber.py +0 -99
- {glitchlings-0.2.1 → glitchlings-0.2.2}/LICENSE +0 -0
- {glitchlings-0.2.1 → glitchlings-0.2.2}/rust/zoo/Cargo.toml +0 -0
- {glitchlings-0.2.1 → glitchlings-0.2.2}/rust/zoo/src/pipeline.rs +0 -0
- {glitchlings-0.2.1 → glitchlings-0.2.2}/rust/zoo/src/rng.rs +0 -0
- {glitchlings-0.2.1 → glitchlings-0.2.2}/rust/zoo/src/text_buffer.rs +0 -0
- {glitchlings-0.2.1 → glitchlings-0.2.2}/setup.cfg +0 -0
- {glitchlings-0.2.1 → glitchlings-0.2.2}/src/glitchlings/__init__.py +0 -0
- {glitchlings-0.2.1 → glitchlings-0.2.2}/src/glitchlings/__main__.py +0 -0
- {glitchlings-0.2.1 → glitchlings-0.2.2}/src/glitchlings/dlc/__init__.py +0 -0
- {glitchlings-0.2.1 → glitchlings-0.2.2}/src/glitchlings/dlc/huggingface.py +0 -0
- {glitchlings-0.2.1 → glitchlings-0.2.2}/src/glitchlings/dlc/prime.py +0 -0
- {glitchlings-0.2.1 → glitchlings-0.2.2}/src/glitchlings/zoo/core.py +0 -0
- {glitchlings-0.2.1 → glitchlings-0.2.2}/src/glitchlings/zoo/mim1c.py +0 -0
- {glitchlings-0.2.1 → glitchlings-0.2.2}/src/glitchlings/zoo/reduple.py +0 -0
- {glitchlings-0.2.1 → glitchlings-0.2.2}/src/glitchlings/zoo/rushmore.py +0 -0
- {glitchlings-0.2.1 → glitchlings-0.2.2}/src/glitchlings.egg-info/dependency_links.txt +0 -0
- {glitchlings-0.2.1 → glitchlings-0.2.2}/src/glitchlings.egg-info/entry_points.txt +0 -0
- {glitchlings-0.2.1 → glitchlings-0.2.2}/src/glitchlings.egg-info/top_level.txt +0 -0
- {glitchlings-0.2.1 → glitchlings-0.2.2}/tests/test_dataset_corruption.py +0 -0
- {glitchlings-0.2.1 → glitchlings-0.2.2}/tests/test_glitchling_core.py +0 -0
- {glitchlings-0.2.1 → glitchlings-0.2.2}/tests/test_huggingface_dlc.py +0 -0
- {glitchlings-0.2.1 → glitchlings-0.2.2}/tests/test_util.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: glitchlings
|
3
|
-
Version: 0.2.1
|
3
|
+
Version: 0.2.2
|
4
4
|
Summary: Monsters for your language games.
|
5
5
|
Author: osoleve
|
6
6
|
License: Apache License
|
@@ -232,11 +232,14 @@ Provides-Extra: hf
|
|
232
232
|
Requires-Dist: datasets>=4.0.0; extra == "hf"
|
233
233
|
Provides-Extra: wordnet
|
234
234
|
Requires-Dist: nltk>=3.9.1; extra == "wordnet"
|
235
|
+
Requires-Dist: numpy<=2.0,>=1.24; extra == "wordnet"
|
235
236
|
Provides-Extra: prime
|
236
237
|
Requires-Dist: verifiers>=0.1.3.post0; extra == "prime"
|
237
238
|
Provides-Extra: dev
|
238
239
|
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
239
240
|
Requires-Dist: hypothesis>=6.140.0; extra == "dev"
|
241
|
+
Requires-Dist: nltk>=3.9.1; extra == "dev"
|
242
|
+
Requires-Dist: numpy<=2.0,>=1.24; extra == "dev"
|
240
243
|
Dynamic: license-file
|
241
244
|
|
242
245
|
#
|
@@ -294,22 +297,10 @@ print(gaggle(SAMPLE_TEXT))
|
|
294
297
|
|
295
298
|
## Usage
|
296
299
|
|
297
|
-
|
298
|
-
|
299
|
-
-
|
300
|
-
|
301
|
-
|
302
|
-
### Rust pipeline acceleration (opt-in)
|
303
|
-
|
304
|
-
The refactored Rust pipeline can execute multiple glitchlings without
|
305
|
-
bouncing back through Python, but it is gated behind a feature flag so
|
306
|
-
teams can roll it out gradually. After compiling the Rust extension
|
307
|
-
(`python -m cibuildwheel --output-dir dist`) set
|
308
|
-
`GLITCHLINGS_RUST_PIPELINE=1` (or `true`, `yes`, `on`) before importing
|
309
|
-
`glitchlings`. When the flag is set and the extension is available,
|
310
|
-
`Gaggle` automatically batches compatible glitchlings into the Rust
|
311
|
-
pipeline; otherwise it transparently falls back to the legacy Python
|
312
|
-
loop.
|
300
|
+
Need detailed usage patterns, dataset workflows, or tips for enabling the
|
301
|
+
Rust accelerator? Consult the [Glitchlings Usage Guide](docs/index.md)
|
302
|
+
for end-to-end instructions spanning the Python API, CLI, Hugging Face
|
303
|
+
integrations, and the feature-flagged Rust pipeline.
|
313
304
|
|
314
305
|
### Prime Intellect environments
|
315
306
|
|
@@ -384,11 +375,14 @@ glitchlings --list
|
|
384
375
|
# Run Typogre against the contents of a file and inspect the diff.
|
385
376
|
glitchlings -g typogre --file documents/report.txt --diff
|
386
377
|
|
378
|
+
# Configure glitchlings inline by passing keyword arguments.
|
379
|
+
glitchlings -g "Typogre(max_change_rate=0.05)" "Ghouls just wanna have fun"
|
380
|
+
|
387
381
|
# Pipe text straight into the CLI for an on-the-fly corruption.
|
388
382
|
echo "Beware LLM-written flavor-text" | glitchlings -g mim1c
|
389
383
|
```
|
390
384
|
|
391
|
-
Use `--help` for a complete breakdown of available options.
|
385
|
+
Use `--help` for a complete breakdown of available options, including support for parameterised glitchlings via `-g "Name(arg=value, ...)"` to mirror the Python API.
|
392
386
|
|
393
387
|
## Development
|
394
388
|
|
@@ -53,22 +53,10 @@ print(gaggle(SAMPLE_TEXT))
|
|
53
53
|
|
54
54
|
## Usage
|
55
55
|
|
56
|
-
|
57
|
-
|
58
|
-
-
|
59
|
-
|
60
|
-
|
61
|
-
### Rust pipeline acceleration (opt-in)
|
62
|
-
|
63
|
-
The refactored Rust pipeline can execute multiple glitchlings without
|
64
|
-
bouncing back through Python, but it is gated behind a feature flag so
|
65
|
-
teams can roll it out gradually. After compiling the Rust extension
|
66
|
-
(`python -m cibuildwheel --output-dir dist`) set
|
67
|
-
`GLITCHLINGS_RUST_PIPELINE=1` (or `true`, `yes`, `on`) before importing
|
68
|
-
`glitchlings`. When the flag is set and the extension is available,
|
69
|
-
`Gaggle` automatically batches compatible glitchlings into the Rust
|
70
|
-
pipeline; otherwise it transparently falls back to the legacy Python
|
71
|
-
loop.
|
56
|
+
Need detailed usage patterns, dataset workflows, or tips for enabling the
|
57
|
+
Rust accelerator? Consult the [Glitchlings Usage Guide](docs/index.md)
|
58
|
+
for end-to-end instructions spanning the Python API, CLI, Hugging Face
|
59
|
+
integrations, and the feature-flagged Rust pipeline.
|
72
60
|
|
73
61
|
### Prime Intellect environments
|
74
62
|
|
@@ -143,11 +131,14 @@ glitchlings --list
|
|
143
131
|
# Run Typogre against the contents of a file and inspect the diff.
|
144
132
|
glitchlings -g typogre --file documents/report.txt --diff
|
145
133
|
|
134
|
+
# Configure glitchlings inline by passing keyword arguments.
|
135
|
+
glitchlings -g "Typogre(max_change_rate=0.05)" "Ghouls just wanna have fun"
|
136
|
+
|
146
137
|
# Pipe text straight into the CLI for an on-the-fly corruption.
|
147
138
|
echo "Beware LLM-written flavor-text" | glitchlings -g mim1c
|
148
139
|
```
|
149
140
|
|
150
|
-
Use `--help` for a complete breakdown of available options.
|
141
|
+
Use `--help` for a complete breakdown of available options, including support for parameterised glitchlings via `-g "Name(arg=value, ...)"` to mirror the Python API.
|
151
142
|
|
152
143
|
## Development
|
153
144
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[project]
|
2
2
|
name = "glitchlings"
|
3
|
-
version = "0.2.1"
|
3
|
+
version = "0.2.2"
|
4
4
|
description = "Monsters for your language games."
|
5
5
|
readme = "README.md"
|
6
6
|
requires-python = ">=3.12"
|
@@ -46,11 +46,13 @@ glitchlings = "glitchlings.main:main"
|
|
46
46
|
|
47
47
|
[project.optional-dependencies]
|
48
48
|
hf = ["datasets>=4.0.0"]
|
49
|
-
wordnet = ["nltk>=3.9.1"]
|
49
|
+
wordnet = ["nltk>=3.9.1", "numpy>=1.24,<=2.0"]
|
50
50
|
prime = ["verifiers>=0.1.3.post0"]
|
51
51
|
dev = [
|
52
52
|
"pytest>=8.0.0",
|
53
53
|
"hypothesis>=6.140.0",
|
54
|
+
"nltk>=3.9.1",
|
55
|
+
"numpy>=1.24,<=2.0",
|
54
56
|
]
|
55
57
|
|
56
58
|
[build-system]
|
@@ -59,6 +61,10 @@ build-backend = "setuptools.build_meta"
|
|
59
61
|
|
60
62
|
[tool.setuptools]
|
61
63
|
package-dir = {"" = "src"}
|
64
|
+
include-package-data = true
|
65
|
+
|
66
|
+
[tool.setuptools.package-data]
|
67
|
+
"glitchlings.zoo" = ["ocr_confusions.tsv"]
|
62
68
|
|
63
69
|
[tool.setuptools.packages.find]
|
64
70
|
where = ["src"]
|
@@ -69,11 +75,6 @@ path = "rust/zoo/Cargo.toml"
|
|
69
75
|
binding = "PyO3"
|
70
76
|
debug = false
|
71
77
|
|
72
|
-
[[tool.setuptools-rust.ext-modules]]
|
73
|
-
target = "glitchlings._typogre_rust"
|
74
|
-
path = "rust/typogre/Cargo.toml"
|
75
|
-
binding = "PyO3"
|
76
|
-
debug = false
|
77
78
|
|
78
79
|
[tool.pytest.ini_options]
|
79
80
|
pythonpath = [
|
@@ -316,13 +316,6 @@ version = "1.19.0"
|
|
316
316
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
317
317
|
checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb"
|
318
318
|
|
319
|
-
[[package]]
|
320
|
-
name = "typogre_rust"
|
321
|
-
version = "0.1.0"
|
322
|
-
dependencies = [
|
323
|
-
"pyo3",
|
324
|
-
]
|
325
|
-
|
326
319
|
[[package]]
|
327
320
|
name = "unicode-ident"
|
328
321
|
version = "1.0.19"
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# Source Replacements (space-separated)
|
2
|
+
li h
|
3
|
+
h li
|
4
|
+
rn m
|
5
|
+
m rn
|
6
|
+
cl d
|
7
|
+
d cl
|
8
|
+
I l
|
9
|
+
l I 1
|
10
|
+
1 l I
|
11
|
+
0 O
|
12
|
+
O 0
|
13
|
+
B 8
|
14
|
+
8 B
|
15
|
+
S 5
|
16
|
+
5 S
|
17
|
+
Z 2
|
18
|
+
2 Z
|
19
|
+
G 6
|
20
|
+
6 G
|
21
|
+
“ "
|
22
|
+
” "
|
23
|
+
‘ '
|
24
|
+
’ '
|
25
|
+
— -
|
26
|
+
– -
|
27
|
+
vv w
|
28
|
+
w vv
|
29
|
+
ri n
|
30
|
+
n ri
|
@@ -0,0 +1,134 @@
|
|
1
|
+
use std::env;
|
2
|
+
use std::ffi::{OsStr, OsString};
|
3
|
+
use std::fs;
|
4
|
+
use std::io::{self, ErrorKind};
|
5
|
+
use std::path::PathBuf;
|
6
|
+
use std::process::Command;
|
7
|
+
|
8
|
+
fn main() {
|
9
|
+
prepare_confusion_table().expect("failed to stage OCR confusion table for compilation");
|
10
|
+
pyo3_build_config::add_extension_module_link_args();
|
11
|
+
|
12
|
+
if let Some(python) = configured_python() {
|
13
|
+
link_python(&python);
|
14
|
+
} else if let Some(python) = detect_python() {
|
15
|
+
link_python(&python);
|
16
|
+
}
|
17
|
+
}
|
18
|
+
|
19
|
+
fn configured_python() -> Option<OsString> {
|
20
|
+
std::env::var_os("PYO3_PYTHON")
|
21
|
+
.or_else(|| std::env::var_os("PYTHON"))
|
22
|
+
.filter(|path| !path.is_empty())
|
23
|
+
}
|
24
|
+
|
25
|
+
fn detect_python() -> Option<OsString> {
|
26
|
+
const CANDIDATES: &[&str] = &["python3.12", "python3", "python"];
|
27
|
+
|
28
|
+
for candidate in CANDIDATES {
|
29
|
+
let status = Command::new(candidate)
|
30
|
+
.arg("-c")
|
31
|
+
.arg("import sys")
|
32
|
+
.output();
|
33
|
+
|
34
|
+
if let Ok(output) = status {
|
35
|
+
if output.status.success() {
|
36
|
+
return Some(OsString::from(candidate));
|
37
|
+
}
|
38
|
+
}
|
39
|
+
}
|
40
|
+
|
41
|
+
None
|
42
|
+
}
|
43
|
+
|
44
|
+
fn link_python(python: &OsStr) {
|
45
|
+
if let Some(path) = query_python(
|
46
|
+
python,
|
47
|
+
"import sysconfig; print(sysconfig.get_config_var('LIBDIR') or '')",
|
48
|
+
) {
|
49
|
+
let trimmed = path.trim();
|
50
|
+
if !trimmed.is_empty() {
|
51
|
+
println!("cargo:rustc-link-search=native={trimmed}");
|
52
|
+
}
|
53
|
+
}
|
54
|
+
|
55
|
+
if let Some(path) = query_python(
|
56
|
+
python,
|
57
|
+
"import sysconfig; print(sysconfig.get_config_var('LIBPL') or '')",
|
58
|
+
) {
|
59
|
+
let trimmed = path.trim();
|
60
|
+
if !trimmed.is_empty() {
|
61
|
+
println!("cargo:rustc-link-search=native={trimmed}");
|
62
|
+
}
|
63
|
+
}
|
64
|
+
|
65
|
+
if let Some(library) = query_python(
|
66
|
+
python,
|
67
|
+
"import sysconfig; print(sysconfig.get_config_var('LDLIBRARY') or '')",
|
68
|
+
) {
|
69
|
+
let name = library.trim();
|
70
|
+
if let Some(stripped) = name.strip_prefix("lib") {
|
71
|
+
let stem = stripped
|
72
|
+
.strip_suffix(".so")
|
73
|
+
.or_else(|| stripped.strip_suffix(".a"))
|
74
|
+
.unwrap_or(stripped);
|
75
|
+
if !stem.is_empty() {
|
76
|
+
println!("cargo:rustc-link-lib={stem}");
|
77
|
+
}
|
78
|
+
}
|
79
|
+
}
|
80
|
+
}
|
81
|
+
|
82
|
+
fn query_python(python: &OsStr, command: &str) -> Option<String> {
|
83
|
+
let output = Command::new(python).arg("-c").arg(command).output().ok()?;
|
84
|
+
if !output.status.success() {
|
85
|
+
return None;
|
86
|
+
}
|
87
|
+
let value = String::from_utf8(output.stdout).ok()?;
|
88
|
+
Some(value)
|
89
|
+
}
|
90
|
+
|
91
|
+
fn prepare_confusion_table() -> io::Result<()> {
|
92
|
+
let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").expect("missing manifest dir"));
|
93
|
+
let out_dir = PathBuf::from(env::var("OUT_DIR").expect("missing OUT_DIR"));
|
94
|
+
|
95
|
+
let repo_path = manifest_dir.join("../../src/glitchlings/zoo/ocr_confusions.tsv");
|
96
|
+
let packaged_path = manifest_dir.join("assets/ocr_confusions.tsv");
|
97
|
+
println!("cargo:rerun-if-changed={}", packaged_path.display());
|
98
|
+
|
99
|
+
let source_path = if repo_path.exists() {
|
100
|
+
println!("cargo:rerun-if-changed={}", repo_path.display());
|
101
|
+
if packaged_path.exists() {
|
102
|
+
let repo_bytes = fs::read(&repo_path)?;
|
103
|
+
let packaged_bytes = fs::read(&packaged_path)?;
|
104
|
+
if repo_bytes != packaged_bytes {
|
105
|
+
return Err(io::Error::new(
|
106
|
+
ErrorKind::Other,
|
107
|
+
format!(
|
108
|
+
"OCR confusion table at {} is out of sync with {}",
|
109
|
+
packaged_path.display(),
|
110
|
+
repo_path.display()
|
111
|
+
),
|
112
|
+
));
|
113
|
+
}
|
114
|
+
}
|
115
|
+
repo_path
|
116
|
+
} else {
|
117
|
+
if !packaged_path.exists() {
|
118
|
+
return Err(io::Error::new(
|
119
|
+
ErrorKind::NotFound,
|
120
|
+
format!(
|
121
|
+
"missing OCR confusion table; looked for {} and {}",
|
122
|
+
repo_path.display(),
|
123
|
+
packaged_path.display()
|
124
|
+
),
|
125
|
+
));
|
126
|
+
}
|
127
|
+
packaged_path
|
128
|
+
};
|
129
|
+
|
130
|
+
fs::create_dir_all(&out_dir)?;
|
131
|
+
fs::copy(&source_path, out_dir.join("ocr_confusions.tsv"))?;
|
132
|
+
Ok(())
|
133
|
+
}
|
134
|
+
|
@@ -500,6 +500,6 @@ mod tests {
|
|
500
500
|
let mut rng = PyRng::new(1);
|
501
501
|
let op = OcrArtifactsOp { error_rate: 1.0 };
|
502
502
|
op.apply(&mut buffer, &mut rng).expect("ocr succeeds");
|
503
|
-
assert_eq!(buffer.to_string(), "Tlie rn
|
503
|
+
assert_eq!(buffer.to_string(), "Tlie rn rri");
|
504
504
|
}
|
505
505
|
}
|
@@ -3,6 +3,7 @@ mod pipeline;
|
|
3
3
|
mod resources;
|
4
4
|
mod rng;
|
5
5
|
mod text_buffer;
|
6
|
+
mod typogre;
|
6
7
|
|
7
8
|
use glitch_ops::{GlitchOp, GlitchRng};
|
8
9
|
use pyo3::prelude::*;
|
@@ -17,7 +18,6 @@ pub use glitch_ops::{
|
|
17
18
|
pub use pipeline::{derive_seed, GlitchDescriptor, Pipeline, PipelineError};
|
18
19
|
pub use rng::{PyRng, PyRngError};
|
19
20
|
pub use text_buffer::{SegmentKind, TextBuffer, TextBufferError, TextSegment, TextSpan};
|
20
|
-
|
21
21
|
struct PythonRngAdapter<'py> {
|
22
22
|
rng: Bound<'py, PyAny>,
|
23
23
|
}
|
@@ -279,5 +279,6 @@ fn _zoo_rust(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
|
|
279
279
|
m.add_function(wrap_pyfunction!(ocr_artifacts, m)?)?;
|
280
280
|
m.add_function(wrap_pyfunction!(redact_words, m)?)?;
|
281
281
|
m.add_function(wrap_pyfunction!(compose_glitchlings, m)?)?;
|
282
|
+
m.add_function(wrap_pyfunction!(typogre::fatfinger, m)?)?;
|
282
283
|
Ok(())
|
283
284
|
}
|
@@ -1,6 +1,8 @@
|
|
1
1
|
use once_cell::sync::Lazy;
|
2
2
|
use regex::Regex;
|
3
3
|
|
4
|
+
const RAW_OCR_CONFUSIONS: &str = include_str!(concat!(env!("OUT_DIR"), "/ocr_confusions.tsv"));
|
5
|
+
|
4
6
|
/// Precompiled regex removing spaces before punctuation characters.
|
5
7
|
pub static SPACE_BEFORE_PUNCTUATION: Lazy<Regex> =
|
6
8
|
Lazy::new(|| Regex::new(r"\s+([.,;:])").expect("valid punctuation regex"));
|
@@ -9,43 +11,30 @@ pub static SPACE_BEFORE_PUNCTUATION: Lazy<Regex> =
|
|
9
11
|
pub static MULTIPLE_WHITESPACE: Lazy<Regex> =
|
10
12
|
Lazy::new(|| Regex::new(r"\s{2,}").expect("valid multi-whitespace regex"));
|
11
13
|
|
12
|
-
static BASE_CONFUSION_TABLE: &[(&str, &[&str])] = &[
|
13
|
-
("li", &["h"]),
|
14
|
-
("h", &["li"]),
|
15
|
-
("rn", &["m"]),
|
16
|
-
("m", &["rn"]),
|
17
|
-
("cl", &["d"]),
|
18
|
-
("d", &["cl"]),
|
19
|
-
("I", &["l"]),
|
20
|
-
("l", &["I", "1"]),
|
21
|
-
("1", &["l", "I"]),
|
22
|
-
("0", &["O"]),
|
23
|
-
("O", &["0"]),
|
24
|
-
("B", &["8"]),
|
25
|
-
("8", &["B"]),
|
26
|
-
("S", &["5"]),
|
27
|
-
("5", &["S"]),
|
28
|
-
("Z", &["2"]),
|
29
|
-
("2", &["Z"]),
|
30
|
-
("G", &["6"]),
|
31
|
-
("6", &["G"]),
|
32
|
-
("“", &["\""]),
|
33
|
-
("”", &["\""]),
|
34
|
-
("‘", &["'"]),
|
35
|
-
("’", &["'"]),
|
36
|
-
("—", &["-"]),
|
37
|
-
("–", &["-"]),
|
38
|
-
];
|
39
|
-
|
40
14
|
/// Sorted confusion pairs reused by glitchling implementations.
|
41
15
|
pub static OCR_CONFUSION_TABLE: Lazy<Vec<(&'static str, &'static [&'static str])>> =
|
42
16
|
Lazy::new(|| {
|
43
|
-
let mut entries: Vec<(usize, (&'static str, &'static [&'static str]))> =
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
17
|
+
let mut entries: Vec<(usize, (&'static str, &'static [&'static str]))> = Vec::new();
|
18
|
+
|
19
|
+
for (line_number, line) in RAW_OCR_CONFUSIONS.lines().enumerate() {
|
20
|
+
let trimmed = line.trim();
|
21
|
+
if trimmed.is_empty() || trimmed.starts_with('#') {
|
22
|
+
continue;
|
23
|
+
}
|
24
|
+
|
25
|
+
let mut parts = trimmed.split_whitespace();
|
26
|
+
let Some(source) = parts.next() else {
|
27
|
+
continue;
|
28
|
+
};
|
29
|
+
let replacements: Vec<&'static str> = parts.collect();
|
30
|
+
if replacements.is_empty() {
|
31
|
+
continue;
|
32
|
+
}
|
33
|
+
|
34
|
+
let leaked: &'static [&'static str] = Box::leak(replacements.into_boxed_slice());
|
35
|
+
entries.push((line_number, (source, leaked)));
|
36
|
+
}
|
37
|
+
|
49
38
|
entries.sort_by(|a, b| {
|
50
39
|
let a_len = a.1 .0.len();
|
51
40
|
let b_len = b.1 .0.len();
|
@@ -53,6 +42,7 @@ pub static OCR_CONFUSION_TABLE: Lazy<Vec<(&'static str, &'static [&'static str])
|
|
53
42
|
.cmp(&a_len)
|
54
43
|
.then_with(|| a.0.cmp(&b.0))
|
55
44
|
});
|
45
|
+
|
56
46
|
entries.into_iter().map(|(_, pair)| pair).collect()
|
57
47
|
});
|
58
48
|
|
@@ -1,5 +1,5 @@
|
|
1
1
|
use pyo3::prelude::*;
|
2
|
-
use pyo3::types::{PyAny, PyDict, PyList, PyModule};
|
2
|
+
use pyo3::types::{PyAny, PyDict, PyList};
|
3
3
|
use pyo3::Bound;
|
4
4
|
use std::collections::HashMap;
|
5
5
|
|
@@ -205,7 +205,7 @@ fn global_action(rng: &Bound<'_, PyAny>, action: &str, chars: &mut Vec<char>) ->
|
|
205
205
|
}
|
206
206
|
|
207
207
|
#[pyfunction]
|
208
|
-
fn fatfinger(
|
208
|
+
pub(crate) fn fatfinger(
|
209
209
|
text: &str,
|
210
210
|
max_change_rate: f64,
|
211
211
|
layout: &Bound<'_, PyDict>,
|
@@ -224,7 +224,7 @@ fn fatfinger(
|
|
224
224
|
}
|
225
225
|
|
226
226
|
let length = chars.len();
|
227
|
-
let mut max_changes = (length as f64 * max_change_rate).
|
227
|
+
let mut max_changes = (length as f64 * max_change_rate).ceil() as usize;
|
228
228
|
if max_changes < 1 {
|
229
229
|
max_changes = 1;
|
230
230
|
}
|
@@ -252,9 +252,3 @@ fn fatfinger(
|
|
252
252
|
|
253
253
|
Ok(chars.into_iter().collect())
|
254
254
|
}
|
255
|
-
|
256
|
-
#[pymodule]
|
257
|
-
fn _typogre_rust(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
|
258
|
-
m.add_function(wrap_pyfunction!(fatfinger, m)?)?;
|
259
|
-
Ok(())
|
260
|
-
}
|
@@ -11,31 +11,12 @@ from . import SAMPLE_TEXT
|
|
11
11
|
from .zoo import (
|
12
12
|
Glitchling,
|
13
13
|
Gaggle,
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
reduple,
|
18
|
-
rushmore,
|
19
|
-
redactyl,
|
20
|
-
scannequin,
|
14
|
+
BUILTIN_GLITCHLINGS,
|
15
|
+
DEFAULT_GLITCHLING_NAMES,
|
16
|
+
parse_glitchling_spec,
|
21
17
|
summon,
|
22
18
|
)
|
23
19
|
|
24
|
-
|
25
|
-
BUILTIN_GLITCHLINGS: dict[str, Glitchling] = {
|
26
|
-
g.name.lower(): g
|
27
|
-
for g in [
|
28
|
-
typogre,
|
29
|
-
mim1c,
|
30
|
-
jargoyle,
|
31
|
-
reduple,
|
32
|
-
rushmore,
|
33
|
-
redactyl,
|
34
|
-
scannequin,
|
35
|
-
]
|
36
|
-
}
|
37
|
-
|
38
|
-
DEFAULT_GLITCHLING_NAMES: list[str] = list(BUILTIN_GLITCHLINGS.keys())
|
39
20
|
MAX_NAME_WIDTH = max(len(glitchling.name) for glitchling in BUILTIN_GLITCHLINGS.values())
|
40
21
|
|
41
22
|
|
@@ -62,8 +43,11 @@ def build_parser() -> argparse.ArgumentParser:
|
|
62
43
|
"--glitchling",
|
63
44
|
dest="glitchlings",
|
64
45
|
action="append",
|
65
|
-
metavar="
|
66
|
-
help=
|
46
|
+
metavar="SPEC",
|
47
|
+
help=(
|
48
|
+
"Glitchling to apply, optionally with parameters like "
|
49
|
+
"Typogre(max_change_rate=0.05). Repeat for multiples; defaults to all built-ins."
|
50
|
+
),
|
67
51
|
)
|
68
52
|
parser.add_argument(
|
69
53
|
"-s",
|
@@ -147,23 +131,16 @@ def read_text(args: argparse.Namespace, parser: argparse.ArgumentParser) -> str:
|
|
147
131
|
def summon_glitchlings(
|
148
132
|
names: list[str] | None, parser: argparse.ArgumentParser, seed: int
|
149
133
|
) -> Gaggle:
|
150
|
-
"""Instantiate the requested glitchlings and bundle them in a ``Gaggle``.
|
151
|
-
|
152
|
-
Args:
|
153
|
-
names: Optional list of glitchling names provided by the user.
|
154
|
-
parser: The argument parser used for emitting user-facing errors.
|
155
|
-
seed: Master seed controlling deterministic corruption order.
|
156
|
-
|
157
|
-
Returns:
|
158
|
-
Gaggle: A ready-to-use collection of glitchlings.
|
159
|
-
|
160
|
-
Raises:
|
161
|
-
SystemExit: Raised indirectly via ``parser.error`` when a provided glitchling
|
162
|
-
name is invalid.
|
163
|
-
"""
|
134
|
+
"""Instantiate the requested glitchlings and bundle them in a ``Gaggle``."""
|
164
135
|
|
165
136
|
if names:
|
166
|
-
normalized
|
137
|
+
normalized: list[str | Glitchling] = []
|
138
|
+
for specification in names:
|
139
|
+
try:
|
140
|
+
normalized.append(parse_glitchling_spec(specification))
|
141
|
+
except ValueError as exc:
|
142
|
+
parser.error(str(exc))
|
143
|
+
raise AssertionError("parser.error should exit")
|
167
144
|
else:
|
168
145
|
normalized = DEFAULT_GLITCHLING_NAMES
|
169
146
|
|
@@ -174,6 +151,7 @@ def summon_glitchlings(
|
|
174
151
|
raise AssertionError("parser.error should exit")
|
175
152
|
|
176
153
|
|
154
|
+
|
177
155
|
def show_diff(original: str, corrupted: str) -> None:
|
178
156
|
"""Display a unified diff between the original and corrupted text."""
|
179
157
|
|
@@ -141,6 +141,36 @@ _register_layout(
|
|
141
141
|
),
|
142
142
|
)
|
143
143
|
|
144
|
+
_register_layout(
|
145
|
+
"QWERTZ",
|
146
|
+
(
|
147
|
+
"^1234567890ß´",
|
148
|
+
" qwertzuiopü+",
|
149
|
+
" asdfghjklöä#",
|
150
|
+
" yxcvbnm,.-",
|
151
|
+
),
|
152
|
+
)
|
153
|
+
|
154
|
+
_register_layout(
|
155
|
+
"SPANISH_QWERTY",
|
156
|
+
(
|
157
|
+
"º1234567890'¡",
|
158
|
+
" qwertyuiop´+",
|
159
|
+
" asdfghjklñ´",
|
160
|
+
" <zxcvbnm,.-",
|
161
|
+
),
|
162
|
+
)
|
163
|
+
|
164
|
+
_register_layout(
|
165
|
+
"SWEDISH_QWERTY",
|
166
|
+
(
|
167
|
+
"§1234567890+´",
|
168
|
+
" qwertyuiopå¨",
|
169
|
+
" asdfghjklöä'",
|
170
|
+
" <zxcvbnm,.-",
|
171
|
+
),
|
172
|
+
)
|
173
|
+
|
144
174
|
|
145
175
|
class KeyNeighbors:
|
146
176
|
def __init__(self) -> None:
|