glitchlings 0.2.3__tar.gz → 0.2.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {glitchlings-0.2.3 → glitchlings-0.2.5}/PKG-INFO +15 -8
- {glitchlings-0.2.3 → glitchlings-0.2.5}/README.md +12 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/pyproject.toml +3 -9
- {glitchlings-0.2.3 → glitchlings-0.2.5}/rust/zoo/build.rs +11 -9
- {glitchlings-0.2.3 → glitchlings-0.2.5}/rust/zoo/src/glitch_ops.rs +269 -63
- {glitchlings-0.2.3 → glitchlings-0.2.5}/rust/zoo/src/lib.rs +56 -11
- {glitchlings-0.2.3 → glitchlings-0.2.5}/rust/zoo/src/pipeline.rs +8 -2
- {glitchlings-0.2.3 → glitchlings-0.2.5}/rust/zoo/src/resources.rs +1 -3
- {glitchlings-0.2.3 → glitchlings-0.2.5}/rust/zoo/src/typogre.rs +0 -2
- glitchlings-0.2.5/rust/zoo/src/zeedub.rs +89 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/__init__.py +4 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/zoo/__init__.py +5 -1
- {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/zoo/redactyl.py +70 -9
- {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/zoo/reduple.py +57 -16
- {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/zoo/rushmore.py +52 -17
- glitchlings-0.2.5/src/glitchlings/zoo/zeedub.py +144 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings.egg-info/PKG-INFO +15 -8
- {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings.egg-info/SOURCES.txt +2 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/tests/test_glitchlings_determinism.py +8 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/tests/test_parameter_effects.py +108 -1
- {glitchlings-0.2.3 → glitchlings-0.2.5}/tests/test_rust_backed_glitchlings.py +109 -8
- {glitchlings-0.2.3 → glitchlings-0.2.5}/LICENSE +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/MANIFEST.in +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/rust/Cargo.lock +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/rust/Cargo.toml +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/rust/zoo/Cargo.toml +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/rust/zoo/assets/ocr_confusions.tsv +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/rust/zoo/src/rng.rs +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/rust/zoo/src/text_buffer.rs +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/setup.cfg +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/__main__.py +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/dlc/__init__.py +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/dlc/huggingface.py +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/dlc/prime.py +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/main.py +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/util/__init__.py +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/zoo/_ocr_confusions.py +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/zoo/_rate.py +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/zoo/core.py +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/zoo/jargoyle.py +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/zoo/mim1c.py +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/zoo/ocr_confusions.tsv +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/zoo/scannequin.py +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/zoo/typogre.py +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings.egg-info/dependency_links.txt +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings.egg-info/entry_points.txt +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings.egg-info/requires.txt +1 -1
- {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings.egg-info/top_level.txt +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/tests/test_cli.py +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/tests/test_dataset_corruption.py +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/tests/test_gaggle.py +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/tests/test_glitchling_core.py +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/tests/test_huggingface_dlc.py +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/tests/test_jargoyle.py +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/tests/test_keyboard_layouts.py +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/tests/test_prime_echo_chamber.py +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/tests/test_property_based.py +0 -0
- {glitchlings-0.2.3 → glitchlings-0.2.5}/tests/test_util.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: glitchlings
|
3
|
-
Version: 0.2.3
|
3
|
+
Version: 0.2.5
|
4
4
|
Summary: Monsters for your language games.
|
5
5
|
Author: osoleve
|
6
6
|
License: Apache License
|
@@ -209,27 +209,21 @@ Project-URL: Homepage, https://github.com/osoleve/glitchlings
|
|
209
209
|
Project-URL: Repository, https://github.com/osoleve/glitchlings.git
|
210
210
|
Project-URL: Issues, https://github.com/osoleve/glitchlings/issues
|
211
211
|
Project-URL: Changelog, https://github.com/osoleve/glitchlings/releases
|
212
|
-
Keywords: nlp,text,adversarial augmentation,text augmentation
|
212
|
+
Keywords: nlp,text,adversarial augmentation,text augmentation,large language models,llms,data augmentation,confusables,typo,
|
213
213
|
Classifier: Development Status :: 3 - Alpha
|
214
214
|
Classifier: Intended Audience :: Developers
|
215
|
-
Classifier: License :: OSI Approved :: Apache Software License
|
216
215
|
Classifier: Programming Language :: Python
|
217
216
|
Classifier: Programming Language :: Python :: 3
|
218
217
|
Classifier: Programming Language :: Python :: 3.10
|
219
218
|
Classifier: Programming Language :: Python :: 3.11
|
220
219
|
Classifier: Programming Language :: Python :: 3.12
|
221
220
|
Classifier: Programming Language :: Rust
|
222
|
-
Classifier: Operating System :: MacOS :: MacOS X
|
223
|
-
Classifier: Operating System :: Microsoft :: Windows
|
224
|
-
Classifier: Operating System :: POSIX :: Linux
|
225
|
-
Classifier: Operating System :: OS Independent
|
226
221
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
227
222
|
Classifier: Topic :: Software Development :: Testing
|
228
223
|
Requires-Python: >=3.10
|
229
224
|
Description-Content-Type: text/markdown
|
230
225
|
License-File: LICENSE
|
231
226
|
Requires-Dist: confusable-homoglyphs>=3.3.1
|
232
|
-
Requires-Dist: jellyfish>=1.2.0
|
233
227
|
Provides-Extra: hf
|
234
228
|
Requires-Dist: datasets>=4.0.0; extra == "hf"
|
235
229
|
Provides-Extra: wordnet
|
@@ -237,6 +231,7 @@ Requires-Dist: nltk>=3.9.1; extra == "wordnet"
|
|
237
231
|
Requires-Dist: numpy<=2.0,>=1.24; extra == "wordnet"
|
238
232
|
Provides-Extra: prime
|
239
233
|
Requires-Dist: verifiers>=0.1.3.post0; extra == "prime"
|
234
|
+
Requires-Dist: jellyfish>=1.2.0; extra == "prime"
|
240
235
|
Provides-Extra: dev
|
241
236
|
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
242
237
|
Requires-Dist: hypothesis>=6.140.0; extra == "dev"
|
@@ -401,6 +396,18 @@ _How can a computer need reading glasses?_
|
|
401
396
|
> - `rate (float)`: The maximum proportion of eligible confusion spans to replace (default: 0.02, 2%).
|
402
397
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
403
398
|
|
399
|
+
### Zeedub
|
400
|
+
|
401
|
+
_A whispering glyph parasite that lives in the interstices of codepoints, marking territory with invisible traces._
|
402
|
+
|
403
|
+
> _**Invisible Ink.**_ Zeedub slips zero-width codepoints between non-space character pairs, forcing models to reason about text whose visible form masks hidden glyphs.
|
404
|
+
>
|
405
|
+
> Args
|
406
|
+
>
|
407
|
+
> - `rate (float)`: Expected number of zero-width insertions as a proportion of eligible bigrams (default: 0.02, 2%).
|
408
|
+
> - `characters (Sequence[str])`: Optional override for the pool of zero-width strings to inject (default: curated invisibles such as U+200B, U+200C, U+200D, U+FEFF, U+2060).
|
409
|
+
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
410
|
+
|
404
411
|
### Jargoyle
|
405
412
|
|
406
413
|
_Uh oh. The worst person you know just bought a thesaurus._
|
@@ -155,6 +155,18 @@ _How can a computer need reading glasses?_
|
|
155
155
|
> - `rate (float)`: The maximum proportion of eligible confusion spans to replace (default: 0.02, 2%).
|
156
156
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
157
157
|
|
158
|
+
### Zeedub
|
159
|
+
|
160
|
+
_A whispering glyph parasite that lives in the interstices of codepoints, marking territory with invisible traces._
|
161
|
+
|
162
|
+
> _**Invisible Ink.**_ Zeedub slips zero-width codepoints between non-space character pairs, forcing models to reason about text whose visible form masks hidden glyphs.
|
163
|
+
>
|
164
|
+
> Args
|
165
|
+
>
|
166
|
+
> - `rate (float)`: Expected number of zero-width insertions as a proportion of eligible bigrams (default: 0.02, 2%).
|
167
|
+
> - `characters (Sequence[str])`: Optional override for the pool of zero-width strings to inject (default: curated invisibles such as U+200B, U+200C, U+200D, U+FEFF, U+2060).
|
168
|
+
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
169
|
+
|
158
170
|
### Jargoyle
|
159
171
|
|
160
172
|
_Uh oh. The worst person you know just bought a thesaurus._
|
@@ -1,35 +1,29 @@
|
|
1
1
|
[project]
|
2
2
|
name = "glitchlings"
|
3
|
-
version = "0.2.3"
|
3
|
+
version = "0.2.5"
|
4
4
|
description = "Monsters for your language games."
|
5
5
|
readme = "README.md"
|
6
6
|
requires-python = ">=3.10"
|
7
7
|
|
8
8
|
dependencies = [
|
9
9
|
"confusable-homoglyphs>=3.3.1",
|
10
|
-
"jellyfish>=1.2.0",
|
11
10
|
]
|
12
11
|
|
13
12
|
authors = [
|
14
13
|
{ name = "osoleve" }
|
15
14
|
]
|
16
15
|
|
17
|
-
keywords = ["nlp", "text", "adversarial augmentation", "text augmentation"]
|
16
|
+
keywords = ["nlp", "text", "adversarial augmentation", "text augmentation", "large language models", "llms", "data augmentation", "confusables", "typo", ""]
|
18
17
|
|
19
18
|
classifiers = [
|
20
19
|
"Development Status :: 3 - Alpha",
|
21
20
|
"Intended Audience :: Developers",
|
22
|
-
"License :: OSI Approved :: Apache Software License",
|
23
21
|
"Programming Language :: Python",
|
24
22
|
"Programming Language :: Python :: 3",
|
25
23
|
"Programming Language :: Python :: 3.10",
|
26
24
|
"Programming Language :: Python :: 3.11",
|
27
25
|
"Programming Language :: Python :: 3.12",
|
28
26
|
"Programming Language :: Rust",
|
29
|
-
"Operating System :: MacOS :: MacOS X",
|
30
|
-
"Operating System :: Microsoft :: Windows",
|
31
|
-
"Operating System :: POSIX :: Linux",
|
32
|
-
"Operating System :: OS Independent",
|
33
27
|
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
34
28
|
"Topic :: Software Development :: Testing",
|
35
29
|
]
|
@@ -49,7 +43,7 @@ glitchlings = "glitchlings.main:main"
|
|
49
43
|
[project.optional-dependencies]
|
50
44
|
hf = ["datasets>=4.0.0"]
|
51
45
|
wordnet = ["nltk>=3.9.1", "numpy>=1.24,<=2.0"]
|
52
|
-
prime = ["verifiers>=0.1.3.post0"]
|
46
|
+
prime = ["verifiers>=0.1.3.post0", "jellyfish>=1.2.0"]
|
53
47
|
dev = [
|
54
48
|
"pytest>=8.0.0",
|
55
49
|
"hypothesis>=6.140.0",
|
@@ -9,10 +9,15 @@ fn main() {
|
|
9
9
|
prepare_confusion_table().expect("failed to stage OCR confusion table for compilation");
|
10
10
|
pyo3_build_config::add_extension_module_link_args();
|
11
11
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
12
|
+
// Only perform custom Python linking on non-Linux platforms.
|
13
|
+
// On Linux, manylinux wheels must NOT link against libpython to ensure portability.
|
14
|
+
// PyO3's add_extension_module_link_args() already handles this correctly by default.
|
15
|
+
if cfg!(not(target_os = "linux")) {
|
16
|
+
if let Some(python) = configured_python() {
|
17
|
+
link_python(&python);
|
18
|
+
} else if let Some(python) = detect_python() {
|
19
|
+
link_python(&python);
|
20
|
+
}
|
16
21
|
}
|
17
22
|
}
|
18
23
|
|
@@ -32,10 +37,7 @@ fn detect_python() -> Option<OsString> {
|
|
32
37
|
];
|
33
38
|
|
34
39
|
for candidate in CANDIDATES {
|
35
|
-
let status = Command::new(candidate)
|
36
|
-
.arg("-c")
|
37
|
-
.arg("import sys")
|
38
|
-
.output();
|
40
|
+
let status = Command::new(candidate).arg("-c").arg("import sys").output();
|
39
41
|
|
40
42
|
if let Ok(output) = status {
|
41
43
|
if output.status.success() {
|
@@ -77,6 +79,7 @@ fn link_python(python: &OsStr) {
|
|
77
79
|
let stem = stripped
|
78
80
|
.strip_suffix(".so")
|
79
81
|
.or_else(|| stripped.strip_suffix(".a"))
|
82
|
+
.or_else(|| stripped.strip_suffix(".dylib"))
|
80
83
|
.unwrap_or(stripped);
|
81
84
|
if !stem.is_empty() {
|
82
85
|
println!("cargo:rustc-link-lib={stem}");
|
@@ -137,4 +140,3 @@ fn prepare_confusion_table() -> io::Result<()> {
|
|
137
140
|
fs::copy(&source_path, out_dir.join("ocr_confusions.tsv"))?;
|
138
141
|
Ok(())
|
139
142
|
}
|
140
|
-
|
@@ -58,6 +58,7 @@ impl From<PyRngError> for GlitchOpError {
|
|
58
58
|
pub trait GlitchRng {
|
59
59
|
fn random(&mut self) -> Result<f64, GlitchOpError>;
|
60
60
|
fn rand_index(&mut self, upper: usize) -> Result<usize, GlitchOpError>;
|
61
|
+
#[allow(dead_code)]
|
61
62
|
fn sample_indices(&mut self, population: usize, k: usize) -> Result<Vec<usize>, GlitchOpError>;
|
62
63
|
}
|
63
64
|
|
@@ -71,11 +72,115 @@ impl GlitchRng for PyRng {
|
|
71
72
|
Ok(value as usize)
|
72
73
|
}
|
73
74
|
|
75
|
+
#[allow(dead_code)]
|
74
76
|
fn sample_indices(&mut self, population: usize, k: usize) -> Result<Vec<usize>, GlitchOpError> {
|
75
77
|
PyRng::sample_indices(self, population, k).map_err(GlitchOpError::from)
|
76
78
|
}
|
77
79
|
}
|
78
80
|
|
81
|
+
fn core_length_for_weight(core: &str, original: &str) -> usize {
|
82
|
+
let mut length = if !core.is_empty() {
|
83
|
+
core.chars().count()
|
84
|
+
} else {
|
85
|
+
original.chars().count()
|
86
|
+
};
|
87
|
+
if length == 0 {
|
88
|
+
let trimmed = original.trim();
|
89
|
+
length = if trimmed.is_empty() {
|
90
|
+
original.chars().count()
|
91
|
+
} else {
|
92
|
+
trimmed.chars().count()
|
93
|
+
};
|
94
|
+
}
|
95
|
+
if length == 0 {
|
96
|
+
length = 1;
|
97
|
+
}
|
98
|
+
length
|
99
|
+
}
|
100
|
+
|
101
|
+
fn inverse_length_weight(core: &str, original: &str) -> f64 {
|
102
|
+
1.0 / (core_length_for_weight(core, original) as f64)
|
103
|
+
}
|
104
|
+
|
105
|
+
fn direct_length_weight(core: &str, original: &str) -> f64 {
|
106
|
+
core_length_for_weight(core, original) as f64
|
107
|
+
}
|
108
|
+
|
109
|
+
#[derive(Debug)]
|
110
|
+
struct ReduplicateCandidate {
|
111
|
+
index: usize,
|
112
|
+
prefix: String,
|
113
|
+
core: String,
|
114
|
+
suffix: String,
|
115
|
+
weight: f64,
|
116
|
+
}
|
117
|
+
|
118
|
+
#[derive(Debug)]
|
119
|
+
struct DeleteCandidate {
|
120
|
+
index: usize,
|
121
|
+
prefix: String,
|
122
|
+
suffix: String,
|
123
|
+
weight: f64,
|
124
|
+
}
|
125
|
+
|
126
|
+
#[derive(Debug)]
|
127
|
+
struct RedactCandidate {
|
128
|
+
index: usize,
|
129
|
+
prefix: String,
|
130
|
+
suffix: String,
|
131
|
+
repeat: usize,
|
132
|
+
weight: f64,
|
133
|
+
}
|
134
|
+
|
135
|
+
fn weighted_sample_without_replacement(
|
136
|
+
rng: &mut dyn GlitchRng,
|
137
|
+
items: &[(usize, f64)],
|
138
|
+
k: usize,
|
139
|
+
) -> Result<Vec<usize>, GlitchOpError> {
|
140
|
+
if k == 0 || items.is_empty() {
|
141
|
+
return Ok(Vec::new());
|
142
|
+
}
|
143
|
+
|
144
|
+
let mut pool: Vec<(usize, f64)> = items
|
145
|
+
.iter()
|
146
|
+
.map(|(index, weight)| (*index, *weight))
|
147
|
+
.collect();
|
148
|
+
|
149
|
+
if k > pool.len() {
|
150
|
+
return Err(GlitchOpError::ExcessiveRedaction {
|
151
|
+
requested: k,
|
152
|
+
available: pool.len(),
|
153
|
+
});
|
154
|
+
}
|
155
|
+
|
156
|
+
let mut selections: Vec<usize> = Vec::with_capacity(k);
|
157
|
+
for _ in 0..k {
|
158
|
+
if pool.is_empty() {
|
159
|
+
break;
|
160
|
+
}
|
161
|
+
let total_weight: f64 = pool.iter().map(|(_, weight)| weight.max(0.0)).sum();
|
162
|
+
let chosen_index = if total_weight <= f64::EPSILON {
|
163
|
+
rng.rand_index(pool.len())?
|
164
|
+
} else {
|
165
|
+
let threshold = rng.random()? * total_weight;
|
166
|
+
let mut cumulative = 0.0;
|
167
|
+
let mut selected = pool.len() - 1;
|
168
|
+
for (idx, (_, weight)) in pool.iter().enumerate() {
|
169
|
+
cumulative += weight.max(0.0);
|
170
|
+
if cumulative >= threshold {
|
171
|
+
selected = idx;
|
172
|
+
break;
|
173
|
+
}
|
174
|
+
}
|
175
|
+
selected
|
176
|
+
};
|
177
|
+
let (value, _) = pool.remove(chosen_index);
|
178
|
+
selections.push(value);
|
179
|
+
}
|
180
|
+
|
181
|
+
Ok(selections)
|
182
|
+
}
|
183
|
+
|
79
184
|
/// Trait implemented by each glitchling mutation so they can be sequenced by
|
80
185
|
/// the pipeline.
|
81
186
|
pub trait GlitchOp {
|
@@ -86,6 +191,7 @@ pub trait GlitchOp {
|
|
86
191
|
#[derive(Debug, Clone, Copy)]
|
87
192
|
pub struct ReduplicateWordsOp {
|
88
193
|
pub reduplication_rate: f64,
|
194
|
+
pub unweighted: bool,
|
89
195
|
}
|
90
196
|
|
91
197
|
impl GlitchOp for ReduplicateWordsOp {
|
@@ -94,35 +200,68 @@ impl GlitchOp for ReduplicateWordsOp {
|
|
94
200
|
return Ok(());
|
95
201
|
}
|
96
202
|
|
97
|
-
let
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
let original = segment.text().to_string();
|
107
|
-
if original.trim().is_empty() {
|
108
|
-
word_index += 1;
|
109
|
-
continue;
|
110
|
-
}
|
111
|
-
|
112
|
-
if rng.random()? < self.reduplication_rate {
|
113
|
-
let (prefix, core, suffix) = split_affixes(&original);
|
114
|
-
if core.is_empty() {
|
115
|
-
word_index += 1;
|
203
|
+
let total_words = buffer.word_count();
|
204
|
+
let mut candidates: Vec<ReduplicateCandidate> = Vec::new();
|
205
|
+
for idx in 0..total_words {
|
206
|
+
if let Some(segment) = buffer.word_segment(idx) {
|
207
|
+
if matches!(segment.kind(), SegmentKind::Separator) {
|
208
|
+
continue;
|
209
|
+
}
|
210
|
+
let original = segment.text().to_string();
|
211
|
+
if original.trim().is_empty() {
|
116
212
|
continue;
|
117
213
|
}
|
118
|
-
let
|
119
|
-
let
|
120
|
-
|
121
|
-
|
122
|
-
|
214
|
+
let (prefix, core, suffix) = split_affixes(&original);
|
215
|
+
let weight = if self.unweighted {
|
216
|
+
1.0
|
217
|
+
} else {
|
218
|
+
inverse_length_weight(&core, &original)
|
219
|
+
};
|
220
|
+
candidates.push(ReduplicateCandidate {
|
221
|
+
index: idx,
|
222
|
+
prefix,
|
223
|
+
core,
|
224
|
+
suffix,
|
225
|
+
weight,
|
226
|
+
});
|
227
|
+
}
|
228
|
+
}
|
229
|
+
|
230
|
+
if candidates.is_empty() {
|
231
|
+
return Ok(());
|
232
|
+
}
|
233
|
+
|
234
|
+
let effective_rate = self.reduplication_rate.max(0.0);
|
235
|
+
if effective_rate <= 0.0 {
|
236
|
+
return Ok(());
|
237
|
+
}
|
238
|
+
|
239
|
+
let mean_weight = candidates
|
240
|
+
.iter()
|
241
|
+
.map(|candidate| candidate.weight)
|
242
|
+
.sum::<f64>()
|
243
|
+
/ (candidates.len() as f64);
|
244
|
+
|
245
|
+
let mut offset = 0usize;
|
246
|
+
for candidate in candidates.into_iter() {
|
247
|
+
let probability = if effective_rate >= 1.0 {
|
248
|
+
1.0
|
249
|
+
} else if mean_weight <= f64::EPSILON {
|
250
|
+
effective_rate
|
123
251
|
} else {
|
124
|
-
|
252
|
+
(effective_rate * (candidate.weight / mean_weight)).min(1.0)
|
253
|
+
};
|
254
|
+
|
255
|
+
if rng.random()? >= probability {
|
256
|
+
continue;
|
125
257
|
}
|
258
|
+
|
259
|
+
let target = candidate.index + offset;
|
260
|
+
let first = format!("{}{}", candidate.prefix, candidate.core);
|
261
|
+
let second = format!("{}{}", candidate.core, candidate.suffix);
|
262
|
+
buffer.replace_word(target, &first)?;
|
263
|
+
buffer.insert_word_after(target, &second, Some(" "))?;
|
264
|
+
offset += 1;
|
126
265
|
}
|
127
266
|
|
128
267
|
Ok(())
|
@@ -133,6 +272,7 @@ impl GlitchOp for ReduplicateWordsOp {
|
|
133
272
|
#[derive(Debug, Clone, Copy)]
|
134
273
|
pub struct DeleteRandomWordsOp {
|
135
274
|
pub max_deletion_rate: f64,
|
275
|
+
pub unweighted: bool,
|
136
276
|
}
|
137
277
|
|
138
278
|
impl GlitchOp for DeleteRandomWordsOp {
|
@@ -141,13 +281,27 @@ impl GlitchOp for DeleteRandomWordsOp {
|
|
141
281
|
return Ok(());
|
142
282
|
}
|
143
283
|
|
144
|
-
let
|
145
|
-
|
284
|
+
let total_words = buffer.word_count();
|
285
|
+
let mut candidates: Vec<DeleteCandidate> = Vec::new();
|
286
|
+
for idx in 1..total_words {
|
146
287
|
if let Some(segment) = buffer.word_segment(idx) {
|
147
288
|
let text = segment.text();
|
148
|
-
if
|
149
|
-
|
289
|
+
if text.is_empty() || is_whitespace_only(text) {
|
290
|
+
continue;
|
150
291
|
}
|
292
|
+
let original = text.to_string();
|
293
|
+
let (prefix, core, suffix) = split_affixes(&original);
|
294
|
+
let weight = if self.unweighted {
|
295
|
+
1.0
|
296
|
+
} else {
|
297
|
+
inverse_length_weight(&core, &original)
|
298
|
+
};
|
299
|
+
candidates.push(DeleteCandidate {
|
300
|
+
index: idx,
|
301
|
+
prefix,
|
302
|
+
suffix,
|
303
|
+
weight,
|
304
|
+
});
|
151
305
|
}
|
152
306
|
}
|
153
307
|
|
@@ -155,23 +309,43 @@ impl GlitchOp for DeleteRandomWordsOp {
|
|
155
309
|
return Ok(());
|
156
310
|
}
|
157
311
|
|
158
|
-
let
|
312
|
+
let effective_rate = self.max_deletion_rate.max(0.0);
|
313
|
+
if effective_rate <= 0.0 {
|
314
|
+
return Ok(());
|
315
|
+
}
|
316
|
+
|
317
|
+
let allowed = ((candidates.len() as f64) * effective_rate).floor() as usize;
|
159
318
|
if allowed == 0 {
|
160
319
|
return Ok(());
|
161
320
|
}
|
162
321
|
|
322
|
+
let mean_weight = candidates
|
323
|
+
.iter()
|
324
|
+
.map(|candidate| candidate.weight)
|
325
|
+
.sum::<f64>()
|
326
|
+
/ (candidates.len() as f64);
|
327
|
+
|
163
328
|
let mut deletions = 0usize;
|
164
|
-
for
|
329
|
+
for candidate in candidates.into_iter() {
|
165
330
|
if deletions >= allowed {
|
166
331
|
break;
|
167
332
|
}
|
168
333
|
|
169
|
-
if
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
334
|
+
let probability = if effective_rate >= 1.0 {
|
335
|
+
1.0
|
336
|
+
} else if mean_weight <= f64::EPSILON {
|
337
|
+
effective_rate
|
338
|
+
} else {
|
339
|
+
(effective_rate * (candidate.weight / mean_weight)).min(1.0)
|
340
|
+
};
|
341
|
+
|
342
|
+
if rng.random()? >= probability {
|
343
|
+
continue;
|
174
344
|
}
|
345
|
+
|
346
|
+
let replacement = format!("{}{}", candidate.prefix.trim(), candidate.suffix.trim());
|
347
|
+
buffer.replace_word(candidate.index, &replacement)?;
|
348
|
+
deletions += 1;
|
175
349
|
}
|
176
350
|
|
177
351
|
let mut joined = buffer.to_string();
|
@@ -191,6 +365,7 @@ pub struct RedactWordsOp {
|
|
191
365
|
pub replacement_char: String,
|
192
366
|
pub redaction_rate: f64,
|
193
367
|
pub merge_adjacent: bool,
|
368
|
+
pub unweighted: bool,
|
194
369
|
}
|
195
370
|
|
196
371
|
impl GlitchOp for RedactWordsOp {
|
@@ -199,51 +374,77 @@ impl GlitchOp for RedactWordsOp {
|
|
199
374
|
return Err(GlitchOpError::NoRedactableWords);
|
200
375
|
}
|
201
376
|
|
202
|
-
let
|
203
|
-
|
377
|
+
let total_words = buffer.word_count();
|
378
|
+
let mut candidates: Vec<RedactCandidate> = Vec::new();
|
379
|
+
for idx in 0..total_words {
|
204
380
|
if let Some(segment) = buffer.word_segment(idx) {
|
205
381
|
let text = segment.text();
|
206
|
-
if
|
207
|
-
|
382
|
+
if text.trim().is_empty() {
|
383
|
+
continue;
|
208
384
|
}
|
385
|
+
let original = text.to_string();
|
386
|
+
let (prefix, core, suffix) = split_affixes(&original);
|
387
|
+
if core.is_empty() {
|
388
|
+
continue;
|
389
|
+
}
|
390
|
+
let repeat = core.chars().count();
|
391
|
+
if repeat == 0 {
|
392
|
+
continue;
|
393
|
+
}
|
394
|
+
let weight = if self.unweighted {
|
395
|
+
1.0
|
396
|
+
} else {
|
397
|
+
direct_length_weight(&core, &original)
|
398
|
+
};
|
399
|
+
candidates.push(RedactCandidate {
|
400
|
+
index: idx,
|
401
|
+
prefix,
|
402
|
+
suffix,
|
403
|
+
repeat,
|
404
|
+
weight,
|
405
|
+
});
|
209
406
|
}
|
210
407
|
}
|
211
408
|
|
212
|
-
if
|
409
|
+
if candidates.is_empty() {
|
213
410
|
return Err(GlitchOpError::NoRedactableWords);
|
214
411
|
}
|
215
412
|
|
216
|
-
let
|
217
|
-
|
413
|
+
let effective_rate = self.redaction_rate.max(0.0);
|
414
|
+
let mut num_to_redact = ((candidates.len() as f64) * effective_rate).floor() as usize;
|
218
415
|
if num_to_redact < 1 {
|
219
416
|
num_to_redact = 1;
|
220
417
|
}
|
221
|
-
if num_to_redact >
|
418
|
+
if num_to_redact > candidates.len() {
|
222
419
|
return Err(GlitchOpError::ExcessiveRedaction {
|
223
420
|
requested: num_to_redact,
|
224
|
-
available:
|
421
|
+
available: candidates.len(),
|
225
422
|
});
|
226
423
|
}
|
227
424
|
|
228
|
-
let
|
229
|
-
|
425
|
+
let weighted_indices: Vec<(usize, f64)> = candidates
|
426
|
+
.iter()
|
427
|
+
.enumerate()
|
428
|
+
.map(|(idx, candidate)| (idx, candidate.weight))
|
429
|
+
.collect();
|
430
|
+
|
431
|
+
let mut selections =
|
432
|
+
weighted_sample_without_replacement(rng, &weighted_indices, num_to_redact)?;
|
433
|
+
selections.sort_unstable_by_key(|candidate_idx| candidates[*candidate_idx].index);
|
230
434
|
|
231
435
|
for selection in selections {
|
232
|
-
let
|
233
|
-
let (prefix, core, suffix) = split_affixes(original);
|
234
|
-
if core.is_empty() {
|
235
|
-
continue;
|
236
|
-
}
|
237
|
-
let repeat = core.chars().count();
|
436
|
+
let candidate = &candidates[selection];
|
238
437
|
let mut replacement = String::with_capacity(
|
239
|
-
prefix.len()
|
438
|
+
candidate.prefix.len()
|
439
|
+
+ candidate.suffix.len()
|
440
|
+
+ self.replacement_char.len() * candidate.repeat,
|
240
441
|
);
|
241
|
-
replacement.push_str(&prefix);
|
242
|
-
for _ in 0..repeat {
|
442
|
+
replacement.push_str(&candidate.prefix);
|
443
|
+
for _ in 0..candidate.repeat {
|
243
444
|
replacement.push_str(&self.replacement_char);
|
244
445
|
}
|
245
|
-
replacement.push_str(&suffix);
|
246
|
-
buffer.replace_word(
|
446
|
+
replacement.push_str(&candidate.suffix);
|
447
|
+
buffer.replace_word(candidate.index, &replacement)?;
|
247
448
|
}
|
248
449
|
|
249
450
|
if self.merge_adjacent {
|
@@ -384,6 +585,7 @@ mod tests {
|
|
384
585
|
let mut rng = PyRng::new(151);
|
385
586
|
let op = ReduplicateWordsOp {
|
386
587
|
reduplication_rate: 1.0,
|
588
|
+
unweighted: false,
|
387
589
|
};
|
388
590
|
op.apply(&mut buffer, &mut rng)
|
389
591
|
.expect("reduplication works");
|
@@ -396,6 +598,7 @@ mod tests {
|
|
396
598
|
let mut rng = PyRng::new(151);
|
397
599
|
let op = DeleteRandomWordsOp {
|
398
600
|
max_deletion_rate: 0.75,
|
601
|
+
unweighted: false,
|
399
602
|
};
|
400
603
|
op.apply(&mut buffer, &mut rng).expect("deletion works");
|
401
604
|
assert_eq!(buffer.to_string(), "One three four");
|
@@ -409,6 +612,7 @@ mod tests {
|
|
409
612
|
replacement_char: "█".to_string(),
|
410
613
|
redaction_rate: 0.8,
|
411
614
|
merge_adjacent: true,
|
615
|
+
unweighted: false,
|
412
616
|
};
|
413
617
|
op.apply(&mut buffer, &mut rng).expect("redaction works");
|
414
618
|
let result = buffer.to_string();
|
@@ -423,6 +627,7 @@ mod tests {
|
|
423
627
|
replacement_char: "█".to_string(),
|
424
628
|
redaction_rate: 0.5,
|
425
629
|
merge_adjacent: false,
|
630
|
+
unweighted: false,
|
426
631
|
};
|
427
632
|
let error = op.apply(&mut buffer, &mut rng).unwrap_err();
|
428
633
|
match error {
|
@@ -448,13 +653,11 @@ mod tests {
|
|
448
653
|
let mut rng = PyRng::new(123);
|
449
654
|
let op = ReduplicateWordsOp {
|
450
655
|
reduplication_rate: 0.5,
|
656
|
+
unweighted: false,
|
451
657
|
};
|
452
658
|
op.apply(&mut buffer, &mut rng)
|
453
659
|
.expect("reduplication succeeds");
|
454
|
-
assert_eq!(
|
455
|
-
buffer.to_string(),
|
456
|
-
"The The quick quick brown brown fox fox"
|
457
|
-
);
|
660
|
+
assert_eq!(buffer.to_string(), "The The quick quick brown fox fox");
|
458
661
|
}
|
459
662
|
|
460
663
|
#[test]
|
@@ -463,6 +666,7 @@ mod tests {
|
|
463
666
|
let mut rng = PyRng::new(123);
|
464
667
|
let op = DeleteRandomWordsOp {
|
465
668
|
max_deletion_rate: 0.5,
|
669
|
+
unweighted: false,
|
466
670
|
};
|
467
671
|
op.apply(&mut buffer, &mut rng).expect("deletion succeeds");
|
468
672
|
assert_eq!(buffer.to_string(), "The over the lazy dog.");
|
@@ -476,9 +680,10 @@ mod tests {
|
|
476
680
|
replacement_char: "█".to_string(),
|
477
681
|
redaction_rate: 0.5,
|
478
682
|
merge_adjacent: false,
|
683
|
+
unweighted: false,
|
479
684
|
};
|
480
685
|
op.apply(&mut buffer, &mut rng).expect("redaction succeeds");
|
481
|
-
assert_eq!(buffer.to_string(), "████ these
|
686
|
+
assert_eq!(buffer.to_string(), "████ these █████ please");
|
482
687
|
}
|
483
688
|
|
484
689
|
#[test]
|
@@ -489,6 +694,7 @@ mod tests {
|
|
489
694
|
replacement_char: "█".to_string(),
|
490
695
|
redaction_rate: 1.0,
|
491
696
|
merge_adjacent: true,
|
697
|
+
unweighted: false,
|
492
698
|
};
|
493
699
|
op.apply(&mut buffer, &mut rng).expect("redaction succeeds");
|
494
700
|
assert_eq!(buffer.to_string(), "█████████████████");
|