glitchlings 0.2.3__tar.gz → 0.2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. {glitchlings-0.2.3 → glitchlings-0.2.5}/PKG-INFO +15 -8
  2. {glitchlings-0.2.3 → glitchlings-0.2.5}/README.md +12 -0
  3. {glitchlings-0.2.3 → glitchlings-0.2.5}/pyproject.toml +3 -9
  4. {glitchlings-0.2.3 → glitchlings-0.2.5}/rust/zoo/build.rs +11 -9
  5. {glitchlings-0.2.3 → glitchlings-0.2.5}/rust/zoo/src/glitch_ops.rs +269 -63
  6. {glitchlings-0.2.3 → glitchlings-0.2.5}/rust/zoo/src/lib.rs +56 -11
  7. {glitchlings-0.2.3 → glitchlings-0.2.5}/rust/zoo/src/pipeline.rs +8 -2
  8. {glitchlings-0.2.3 → glitchlings-0.2.5}/rust/zoo/src/resources.rs +1 -3
  9. {glitchlings-0.2.3 → glitchlings-0.2.5}/rust/zoo/src/typogre.rs +0 -2
  10. glitchlings-0.2.5/rust/zoo/src/zeedub.rs +89 -0
  11. {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/__init__.py +4 -0
  12. {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/zoo/__init__.py +5 -1
  13. {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/zoo/redactyl.py +70 -9
  14. {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/zoo/reduple.py +57 -16
  15. {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/zoo/rushmore.py +52 -17
  16. glitchlings-0.2.5/src/glitchlings/zoo/zeedub.py +144 -0
  17. {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings.egg-info/PKG-INFO +15 -8
  18. {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings.egg-info/SOURCES.txt +2 -0
  19. {glitchlings-0.2.3 → glitchlings-0.2.5}/tests/test_glitchlings_determinism.py +8 -0
  20. {glitchlings-0.2.3 → glitchlings-0.2.5}/tests/test_parameter_effects.py +108 -1
  21. {glitchlings-0.2.3 → glitchlings-0.2.5}/tests/test_rust_backed_glitchlings.py +109 -8
  22. {glitchlings-0.2.3 → glitchlings-0.2.5}/LICENSE +0 -0
  23. {glitchlings-0.2.3 → glitchlings-0.2.5}/MANIFEST.in +0 -0
  24. {glitchlings-0.2.3 → glitchlings-0.2.5}/rust/Cargo.lock +0 -0
  25. {glitchlings-0.2.3 → glitchlings-0.2.5}/rust/Cargo.toml +0 -0
  26. {glitchlings-0.2.3 → glitchlings-0.2.5}/rust/zoo/Cargo.toml +0 -0
  27. {glitchlings-0.2.3 → glitchlings-0.2.5}/rust/zoo/assets/ocr_confusions.tsv +0 -0
  28. {glitchlings-0.2.3 → glitchlings-0.2.5}/rust/zoo/src/rng.rs +0 -0
  29. {glitchlings-0.2.3 → glitchlings-0.2.5}/rust/zoo/src/text_buffer.rs +0 -0
  30. {glitchlings-0.2.3 → glitchlings-0.2.5}/setup.cfg +0 -0
  31. {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/__main__.py +0 -0
  32. {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/dlc/__init__.py +0 -0
  33. {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/dlc/huggingface.py +0 -0
  34. {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/dlc/prime.py +0 -0
  35. {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/main.py +0 -0
  36. {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/util/__init__.py +0 -0
  37. {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/zoo/_ocr_confusions.py +0 -0
  38. {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/zoo/_rate.py +0 -0
  39. {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/zoo/core.py +0 -0
  40. {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/zoo/jargoyle.py +0 -0
  41. {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/zoo/mim1c.py +0 -0
  42. {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/zoo/ocr_confusions.tsv +0 -0
  43. {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/zoo/scannequin.py +0 -0
  44. {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings/zoo/typogre.py +0 -0
  45. {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings.egg-info/dependency_links.txt +0 -0
  46. {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings.egg-info/entry_points.txt +0 -0
  47. {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings.egg-info/requires.txt +1 -1
  48. {glitchlings-0.2.3 → glitchlings-0.2.5}/src/glitchlings.egg-info/top_level.txt +0 -0
  49. {glitchlings-0.2.3 → glitchlings-0.2.5}/tests/test_cli.py +0 -0
  50. {glitchlings-0.2.3 → glitchlings-0.2.5}/tests/test_dataset_corruption.py +0 -0
  51. {glitchlings-0.2.3 → glitchlings-0.2.5}/tests/test_gaggle.py +0 -0
  52. {glitchlings-0.2.3 → glitchlings-0.2.5}/tests/test_glitchling_core.py +0 -0
  53. {glitchlings-0.2.3 → glitchlings-0.2.5}/tests/test_huggingface_dlc.py +0 -0
  54. {glitchlings-0.2.3 → glitchlings-0.2.5}/tests/test_jargoyle.py +0 -0
  55. {glitchlings-0.2.3 → glitchlings-0.2.5}/tests/test_keyboard_layouts.py +0 -0
  56. {glitchlings-0.2.3 → glitchlings-0.2.5}/tests/test_prime_echo_chamber.py +0 -0
  57. {glitchlings-0.2.3 → glitchlings-0.2.5}/tests/test_property_based.py +0 -0
  58. {glitchlings-0.2.3 → glitchlings-0.2.5}/tests/test_util.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: glitchlings
3
- Version: 0.2.3
3
+ Version: 0.2.5
4
4
  Summary: Monsters for your language games.
5
5
  Author: osoleve
6
6
  License: Apache License
@@ -209,27 +209,21 @@ Project-URL: Homepage, https://github.com/osoleve/glitchlings
209
209
  Project-URL: Repository, https://github.com/osoleve/glitchlings.git
210
210
  Project-URL: Issues, https://github.com/osoleve/glitchlings/issues
211
211
  Project-URL: Changelog, https://github.com/osoleve/glitchlings/releases
212
- Keywords: nlp,text,adversarial augmentation,text augmentation
212
+ Keywords: nlp,text,adversarial augmentation,text augmentation,large language models,llms,data augmentation,confusables,typo,
213
213
  Classifier: Development Status :: 3 - Alpha
214
214
  Classifier: Intended Audience :: Developers
215
- Classifier: License :: OSI Approved :: Apache Software License
216
215
  Classifier: Programming Language :: Python
217
216
  Classifier: Programming Language :: Python :: 3
218
217
  Classifier: Programming Language :: Python :: 3.10
219
218
  Classifier: Programming Language :: Python :: 3.11
220
219
  Classifier: Programming Language :: Python :: 3.12
221
220
  Classifier: Programming Language :: Rust
222
- Classifier: Operating System :: MacOS :: MacOS X
223
- Classifier: Operating System :: Microsoft :: Windows
224
- Classifier: Operating System :: POSIX :: Linux
225
- Classifier: Operating System :: OS Independent
226
221
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
227
222
  Classifier: Topic :: Software Development :: Testing
228
223
  Requires-Python: >=3.10
229
224
  Description-Content-Type: text/markdown
230
225
  License-File: LICENSE
231
226
  Requires-Dist: confusable-homoglyphs>=3.3.1
232
- Requires-Dist: jellyfish>=1.2.0
233
227
  Provides-Extra: hf
234
228
  Requires-Dist: datasets>=4.0.0; extra == "hf"
235
229
  Provides-Extra: wordnet
@@ -237,6 +231,7 @@ Requires-Dist: nltk>=3.9.1; extra == "wordnet"
237
231
  Requires-Dist: numpy<=2.0,>=1.24; extra == "wordnet"
238
232
  Provides-Extra: prime
239
233
  Requires-Dist: verifiers>=0.1.3.post0; extra == "prime"
234
+ Requires-Dist: jellyfish>=1.2.0; extra == "prime"
240
235
  Provides-Extra: dev
241
236
  Requires-Dist: pytest>=8.0.0; extra == "dev"
242
237
  Requires-Dist: hypothesis>=6.140.0; extra == "dev"
@@ -401,6 +396,18 @@ _How can a computer need reading glasses?_
401
396
  > - `rate (float)`: The maximum proportion of eligible confusion spans to replace (default: 0.02, 2%).
402
397
  > - `seed (int)`: The random seed for reproducibility (default: 151).
403
398
 
399
+ ### Zeedub
400
+
401
+ _A whispering glyph parasite that lives in the interstices of codepoints, marking territory with invisible traces._
402
+
403
+ > _**Invisible Ink.**_ Zeedub slips zero-width codepoints between non-space character pairs, forcing models to reason about text whose visible form masks hidden glyphs.
404
+ >
405
+ > Args
406
+ >
407
+ > - `rate (float)`: Expected number of zero-width insertions as a proportion of eligible bigrams (default: 0.02, 2%).
408
+ > - `characters (Sequence[str])`: Optional override for the pool of zero-width strings to inject (default: curated invisibles such as U+200B, U+200C, U+200D, U+FEFF, U+2060).
409
+ > - `seed (int)`: The random seed for reproducibility (default: 151).
410
+
404
411
  ### Jargoyle
405
412
 
406
413
  _Uh oh. The worst person you know just bought a thesaurus._
@@ -155,6 +155,18 @@ _How can a computer need reading glasses?_
155
155
  > - `rate (float)`: The maximum proportion of eligible confusion spans to replace (default: 0.02, 2%).
156
156
  > - `seed (int)`: The random seed for reproducibility (default: 151).
157
157
 
158
+ ### Zeedub
159
+
160
+ _A whispering glyph parasite that lives in the interstices of codepoints, marking territory with invisible traces._
161
+
162
+ > _**Invisible Ink.**_ Zeedub slips zero-width codepoints between non-space character pairs, forcing models to reason about text whose visible form masks hidden glyphs.
163
+ >
164
+ > Args
165
+ >
166
+ > - `rate (float)`: Expected number of zero-width insertions as a proportion of eligible bigrams (default: 0.02, 2%).
167
+ > - `characters (Sequence[str])`: Optional override for the pool of zero-width strings to inject (default: curated invisibles such as U+200B, U+200C, U+200D, U+FEFF, U+2060).
168
+ > - `seed (int)`: The random seed for reproducibility (default: 151).
169
+
158
170
  ### Jargoyle
159
171
 
160
172
  _Uh oh. The worst person you know just bought a thesaurus._
@@ -1,35 +1,29 @@
1
1
  [project]
2
2
  name = "glitchlings"
3
- version = "0.2.3"
3
+ version = "0.2.5"
4
4
  description = "Monsters for your language games."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
7
7
 
8
8
  dependencies = [
9
9
  "confusable-homoglyphs>=3.3.1",
10
- "jellyfish>=1.2.0",
11
10
  ]
12
11
 
13
12
  authors = [
14
13
  { name = "osoleve" }
15
14
  ]
16
15
 
17
- keywords = ["nlp", "text", "adversarial augmentation", "text augmentation"]
16
+ keywords = ["nlp", "text", "adversarial augmentation", "text augmentation", "large language models", "llms", "data augmentation", "confusables", "typo", ""]
18
17
 
19
18
  classifiers = [
20
19
  "Development Status :: 3 - Alpha",
21
20
  "Intended Audience :: Developers",
22
- "License :: OSI Approved :: Apache Software License",
23
21
  "Programming Language :: Python",
24
22
  "Programming Language :: Python :: 3",
25
23
  "Programming Language :: Python :: 3.10",
26
24
  "Programming Language :: Python :: 3.11",
27
25
  "Programming Language :: Python :: 3.12",
28
26
  "Programming Language :: Rust",
29
- "Operating System :: MacOS :: MacOS X",
30
- "Operating System :: Microsoft :: Windows",
31
- "Operating System :: POSIX :: Linux",
32
- "Operating System :: OS Independent",
33
27
  "Topic :: Scientific/Engineering :: Artificial Intelligence",
34
28
  "Topic :: Software Development :: Testing",
35
29
  ]
@@ -49,7 +43,7 @@ glitchlings = "glitchlings.main:main"
49
43
  [project.optional-dependencies]
50
44
  hf = ["datasets>=4.0.0"]
51
45
  wordnet = ["nltk>=3.9.1", "numpy>=1.24,<=2.0"]
52
- prime = ["verifiers>=0.1.3.post0"]
46
+ prime = ["verifiers>=0.1.3.post0", "jellyfish>=1.2.0"]
53
47
  dev = [
54
48
  "pytest>=8.0.0",
55
49
  "hypothesis>=6.140.0",
@@ -9,10 +9,15 @@ fn main() {
9
9
  prepare_confusion_table().expect("failed to stage OCR confusion table for compilation");
10
10
  pyo3_build_config::add_extension_module_link_args();
11
11
 
12
- if let Some(python) = configured_python() {
13
- link_python(&python);
14
- } else if let Some(python) = detect_python() {
15
- link_python(&python);
12
+ // Only perform custom Python linking on non-Linux platforms.
13
+ // On Linux, manylinux wheels must NOT link against libpython to ensure portability.
14
+ // PyO3's add_extension_module_link_args() already handles this correctly by default.
15
+ if cfg!(not(target_os = "linux")) {
16
+ if let Some(python) = configured_python() {
17
+ link_python(&python);
18
+ } else if let Some(python) = detect_python() {
19
+ link_python(&python);
20
+ }
16
21
  }
17
22
  }
18
23
 
@@ -32,10 +37,7 @@ fn detect_python() -> Option<OsString> {
32
37
  ];
33
38
 
34
39
  for candidate in CANDIDATES {
35
- let status = Command::new(candidate)
36
- .arg("-c")
37
- .arg("import sys")
38
- .output();
40
+ let status = Command::new(candidate).arg("-c").arg("import sys").output();
39
41
 
40
42
  if let Ok(output) = status {
41
43
  if output.status.success() {
@@ -77,6 +79,7 @@ fn link_python(python: &OsStr) {
77
79
  let stem = stripped
78
80
  .strip_suffix(".so")
79
81
  .or_else(|| stripped.strip_suffix(".a"))
82
+ .or_else(|| stripped.strip_suffix(".dylib"))
80
83
  .unwrap_or(stripped);
81
84
  if !stem.is_empty() {
82
85
  println!("cargo:rustc-link-lib={stem}");
@@ -137,4 +140,3 @@ fn prepare_confusion_table() -> io::Result<()> {
137
140
  fs::copy(&source_path, out_dir.join("ocr_confusions.tsv"))?;
138
141
  Ok(())
139
142
  }
140
-
@@ -58,6 +58,7 @@ impl From<PyRngError> for GlitchOpError {
58
58
  pub trait GlitchRng {
59
59
  fn random(&mut self) -> Result<f64, GlitchOpError>;
60
60
  fn rand_index(&mut self, upper: usize) -> Result<usize, GlitchOpError>;
61
+ #[allow(dead_code)]
61
62
  fn sample_indices(&mut self, population: usize, k: usize) -> Result<Vec<usize>, GlitchOpError>;
62
63
  }
63
64
 
@@ -71,11 +72,115 @@ impl GlitchRng for PyRng {
71
72
  Ok(value as usize)
72
73
  }
73
74
 
75
+ #[allow(dead_code)]
74
76
  fn sample_indices(&mut self, population: usize, k: usize) -> Result<Vec<usize>, GlitchOpError> {
75
77
  PyRng::sample_indices(self, population, k).map_err(GlitchOpError::from)
76
78
  }
77
79
  }
78
80
 
81
+ fn core_length_for_weight(core: &str, original: &str) -> usize {
82
+ let mut length = if !core.is_empty() {
83
+ core.chars().count()
84
+ } else {
85
+ original.chars().count()
86
+ };
87
+ if length == 0 {
88
+ let trimmed = original.trim();
89
+ length = if trimmed.is_empty() {
90
+ original.chars().count()
91
+ } else {
92
+ trimmed.chars().count()
93
+ };
94
+ }
95
+ if length == 0 {
96
+ length = 1;
97
+ }
98
+ length
99
+ }
100
+
101
+ fn inverse_length_weight(core: &str, original: &str) -> f64 {
102
+ 1.0 / (core_length_for_weight(core, original) as f64)
103
+ }
104
+
105
+ fn direct_length_weight(core: &str, original: &str) -> f64 {
106
+ core_length_for_weight(core, original) as f64
107
+ }
108
+
109
+ #[derive(Debug)]
110
+ struct ReduplicateCandidate {
111
+ index: usize,
112
+ prefix: String,
113
+ core: String,
114
+ suffix: String,
115
+ weight: f64,
116
+ }
117
+
118
+ #[derive(Debug)]
119
+ struct DeleteCandidate {
120
+ index: usize,
121
+ prefix: String,
122
+ suffix: String,
123
+ weight: f64,
124
+ }
125
+
126
+ #[derive(Debug)]
127
+ struct RedactCandidate {
128
+ index: usize,
129
+ prefix: String,
130
+ suffix: String,
131
+ repeat: usize,
132
+ weight: f64,
133
+ }
134
+
135
+ fn weighted_sample_without_replacement(
136
+ rng: &mut dyn GlitchRng,
137
+ items: &[(usize, f64)],
138
+ k: usize,
139
+ ) -> Result<Vec<usize>, GlitchOpError> {
140
+ if k == 0 || items.is_empty() {
141
+ return Ok(Vec::new());
142
+ }
143
+
144
+ let mut pool: Vec<(usize, f64)> = items
145
+ .iter()
146
+ .map(|(index, weight)| (*index, *weight))
147
+ .collect();
148
+
149
+ if k > pool.len() {
150
+ return Err(GlitchOpError::ExcessiveRedaction {
151
+ requested: k,
152
+ available: pool.len(),
153
+ });
154
+ }
155
+
156
+ let mut selections: Vec<usize> = Vec::with_capacity(k);
157
+ for _ in 0..k {
158
+ if pool.is_empty() {
159
+ break;
160
+ }
161
+ let total_weight: f64 = pool.iter().map(|(_, weight)| weight.max(0.0)).sum();
162
+ let chosen_index = if total_weight <= f64::EPSILON {
163
+ rng.rand_index(pool.len())?
164
+ } else {
165
+ let threshold = rng.random()? * total_weight;
166
+ let mut cumulative = 0.0;
167
+ let mut selected = pool.len() - 1;
168
+ for (idx, (_, weight)) in pool.iter().enumerate() {
169
+ cumulative += weight.max(0.0);
170
+ if cumulative >= threshold {
171
+ selected = idx;
172
+ break;
173
+ }
174
+ }
175
+ selected
176
+ };
177
+ let (value, _) = pool.remove(chosen_index);
178
+ selections.push(value);
179
+ }
180
+
181
+ Ok(selections)
182
+ }
183
+
79
184
  /// Trait implemented by each glitchling mutation so they can be sequenced by
80
185
  /// the pipeline.
81
186
  pub trait GlitchOp {
@@ -86,6 +191,7 @@ pub trait GlitchOp {
86
191
  #[derive(Debug, Clone, Copy)]
87
192
  pub struct ReduplicateWordsOp {
88
193
  pub reduplication_rate: f64,
194
+ pub unweighted: bool,
89
195
  }
90
196
 
91
197
  impl GlitchOp for ReduplicateWordsOp {
@@ -94,35 +200,68 @@ impl GlitchOp for ReduplicateWordsOp {
94
200
  return Ok(());
95
201
  }
96
202
 
97
- let mut word_index = 0;
98
- while word_index < buffer.word_count() {
99
- let Some(segment) = buffer.word_segment(word_index) else {
100
- break;
101
- };
102
- if matches!(segment.kind(), SegmentKind::Separator) {
103
- word_index += 1;
104
- continue;
105
- }
106
- let original = segment.text().to_string();
107
- if original.trim().is_empty() {
108
- word_index += 1;
109
- continue;
110
- }
111
-
112
- if rng.random()? < self.reduplication_rate {
113
- let (prefix, core, suffix) = split_affixes(&original);
114
- if core.is_empty() {
115
- word_index += 1;
203
+ let total_words = buffer.word_count();
204
+ let mut candidates: Vec<ReduplicateCandidate> = Vec::new();
205
+ for idx in 0..total_words {
206
+ if let Some(segment) = buffer.word_segment(idx) {
207
+ if matches!(segment.kind(), SegmentKind::Separator) {
208
+ continue;
209
+ }
210
+ let original = segment.text().to_string();
211
+ if original.trim().is_empty() {
116
212
  continue;
117
213
  }
118
- let first = format!("{prefix}{core}");
119
- let second = format!("{core}{suffix}");
120
- buffer.replace_word(word_index, &first)?;
121
- buffer.insert_word_after(word_index, &second, Some(" "))?;
122
- word_index += 2;
214
+ let (prefix, core, suffix) = split_affixes(&original);
215
+ let weight = if self.unweighted {
216
+ 1.0
217
+ } else {
218
+ inverse_length_weight(&core, &original)
219
+ };
220
+ candidates.push(ReduplicateCandidate {
221
+ index: idx,
222
+ prefix,
223
+ core,
224
+ suffix,
225
+ weight,
226
+ });
227
+ }
228
+ }
229
+
230
+ if candidates.is_empty() {
231
+ return Ok(());
232
+ }
233
+
234
+ let effective_rate = self.reduplication_rate.max(0.0);
235
+ if effective_rate <= 0.0 {
236
+ return Ok(());
237
+ }
238
+
239
+ let mean_weight = candidates
240
+ .iter()
241
+ .map(|candidate| candidate.weight)
242
+ .sum::<f64>()
243
+ / (candidates.len() as f64);
244
+
245
+ let mut offset = 0usize;
246
+ for candidate in candidates.into_iter() {
247
+ let probability = if effective_rate >= 1.0 {
248
+ 1.0
249
+ } else if mean_weight <= f64::EPSILON {
250
+ effective_rate
123
251
  } else {
124
- word_index += 1;
252
+ (effective_rate * (candidate.weight / mean_weight)).min(1.0)
253
+ };
254
+
255
+ if rng.random()? >= probability {
256
+ continue;
125
257
  }
258
+
259
+ let target = candidate.index + offset;
260
+ let first = format!("{}{}", candidate.prefix, candidate.core);
261
+ let second = format!("{}{}", candidate.core, candidate.suffix);
262
+ buffer.replace_word(target, &first)?;
263
+ buffer.insert_word_after(target, &second, Some(" "))?;
264
+ offset += 1;
126
265
  }
127
266
 
128
267
  Ok(())
@@ -133,6 +272,7 @@ impl GlitchOp for ReduplicateWordsOp {
133
272
  #[derive(Debug, Clone, Copy)]
134
273
  pub struct DeleteRandomWordsOp {
135
274
  pub max_deletion_rate: f64,
275
+ pub unweighted: bool,
136
276
  }
137
277
 
138
278
  impl GlitchOp for DeleteRandomWordsOp {
@@ -141,13 +281,27 @@ impl GlitchOp for DeleteRandomWordsOp {
141
281
  return Ok(());
142
282
  }
143
283
 
144
- let mut candidates: Vec<(usize, String)> = Vec::new();
145
- for idx in 1..buffer.word_count() {
284
+ let total_words = buffer.word_count();
285
+ let mut candidates: Vec<DeleteCandidate> = Vec::new();
286
+ for idx in 1..total_words {
146
287
  if let Some(segment) = buffer.word_segment(idx) {
147
288
  let text = segment.text();
148
- if !text.is_empty() && !is_whitespace_only(text) {
149
- candidates.push((idx, text.to_string()));
289
+ if text.is_empty() || is_whitespace_only(text) {
290
+ continue;
150
291
  }
292
+ let original = text.to_string();
293
+ let (prefix, core, suffix) = split_affixes(&original);
294
+ let weight = if self.unweighted {
295
+ 1.0
296
+ } else {
297
+ inverse_length_weight(&core, &original)
298
+ };
299
+ candidates.push(DeleteCandidate {
300
+ index: idx,
301
+ prefix,
302
+ suffix,
303
+ weight,
304
+ });
151
305
  }
152
306
  }
153
307
 
@@ -155,23 +309,43 @@ impl GlitchOp for DeleteRandomWordsOp {
155
309
  return Ok(());
156
310
  }
157
311
 
158
- let allowed = ((candidates.len() as f64) * self.max_deletion_rate).floor() as usize;
312
+ let effective_rate = self.max_deletion_rate.max(0.0);
313
+ if effective_rate <= 0.0 {
314
+ return Ok(());
315
+ }
316
+
317
+ let allowed = ((candidates.len() as f64) * effective_rate).floor() as usize;
159
318
  if allowed == 0 {
160
319
  return Ok(());
161
320
  }
162
321
 
322
+ let mean_weight = candidates
323
+ .iter()
324
+ .map(|candidate| candidate.weight)
325
+ .sum::<f64>()
326
+ / (candidates.len() as f64);
327
+
163
328
  let mut deletions = 0usize;
164
- for (word_index, original) in candidates {
329
+ for candidate in candidates.into_iter() {
165
330
  if deletions >= allowed {
166
331
  break;
167
332
  }
168
333
 
169
- if rng.random()? < self.max_deletion_rate {
170
- let (prefix, _, suffix) = split_affixes(&original);
171
- let replacement = format!("{}{}", prefix.trim(), suffix.trim());
172
- buffer.replace_word(word_index, &replacement)?;
173
- deletions += 1;
334
+ let probability = if effective_rate >= 1.0 {
335
+ 1.0
336
+ } else if mean_weight <= f64::EPSILON {
337
+ effective_rate
338
+ } else {
339
+ (effective_rate * (candidate.weight / mean_weight)).min(1.0)
340
+ };
341
+
342
+ if rng.random()? >= probability {
343
+ continue;
174
344
  }
345
+
346
+ let replacement = format!("{}{}", candidate.prefix.trim(), candidate.suffix.trim());
347
+ buffer.replace_word(candidate.index, &replacement)?;
348
+ deletions += 1;
175
349
  }
176
350
 
177
351
  let mut joined = buffer.to_string();
@@ -191,6 +365,7 @@ pub struct RedactWordsOp {
191
365
  pub replacement_char: String,
192
366
  pub redaction_rate: f64,
193
367
  pub merge_adjacent: bool,
368
+ pub unweighted: bool,
194
369
  }
195
370
 
196
371
  impl GlitchOp for RedactWordsOp {
@@ -199,51 +374,77 @@ impl GlitchOp for RedactWordsOp {
199
374
  return Err(GlitchOpError::NoRedactableWords);
200
375
  }
201
376
 
202
- let mut word_indices: Vec<(usize, String)> = Vec::new();
203
- for idx in 0..buffer.word_count() {
377
+ let total_words = buffer.word_count();
378
+ let mut candidates: Vec<RedactCandidate> = Vec::new();
379
+ for idx in 0..total_words {
204
380
  if let Some(segment) = buffer.word_segment(idx) {
205
381
  let text = segment.text();
206
- if !text.trim().is_empty() {
207
- word_indices.push((idx, text.to_string()));
382
+ if text.trim().is_empty() {
383
+ continue;
208
384
  }
385
+ let original = text.to_string();
386
+ let (prefix, core, suffix) = split_affixes(&original);
387
+ if core.is_empty() {
388
+ continue;
389
+ }
390
+ let repeat = core.chars().count();
391
+ if repeat == 0 {
392
+ continue;
393
+ }
394
+ let weight = if self.unweighted {
395
+ 1.0
396
+ } else {
397
+ direct_length_weight(&core, &original)
398
+ };
399
+ candidates.push(RedactCandidate {
400
+ index: idx,
401
+ prefix,
402
+ suffix,
403
+ repeat,
404
+ weight,
405
+ });
209
406
  }
210
407
  }
211
408
 
212
- if word_indices.is_empty() {
409
+ if candidates.is_empty() {
213
410
  return Err(GlitchOpError::NoRedactableWords);
214
411
  }
215
412
 
216
- let mut num_to_redact =
217
- ((word_indices.len() as f64) * self.redaction_rate).floor() as usize;
413
+ let effective_rate = self.redaction_rate.max(0.0);
414
+ let mut num_to_redact = ((candidates.len() as f64) * effective_rate).floor() as usize;
218
415
  if num_to_redact < 1 {
219
416
  num_to_redact = 1;
220
417
  }
221
- if num_to_redact > word_indices.len() {
418
+ if num_to_redact > candidates.len() {
222
419
  return Err(GlitchOpError::ExcessiveRedaction {
223
420
  requested: num_to_redact,
224
- available: word_indices.len(),
421
+ available: candidates.len(),
225
422
  });
226
423
  }
227
424
 
228
- let mut selections = rng.sample_indices(word_indices.len(), num_to_redact)?;
229
- selections.sort_unstable();
425
+ let weighted_indices: Vec<(usize, f64)> = candidates
426
+ .iter()
427
+ .enumerate()
428
+ .map(|(idx, candidate)| (idx, candidate.weight))
429
+ .collect();
430
+
431
+ let mut selections =
432
+ weighted_sample_without_replacement(rng, &weighted_indices, num_to_redact)?;
433
+ selections.sort_unstable_by_key(|candidate_idx| candidates[*candidate_idx].index);
230
434
 
231
435
  for selection in selections {
232
- let (word_index, original) = &word_indices[selection];
233
- let (prefix, core, suffix) = split_affixes(original);
234
- if core.is_empty() {
235
- continue;
236
- }
237
- let repeat = core.chars().count();
436
+ let candidate = &candidates[selection];
238
437
  let mut replacement = String::with_capacity(
239
- prefix.len() + suffix.len() + self.replacement_char.len() * repeat,
438
+ candidate.prefix.len()
439
+ + candidate.suffix.len()
440
+ + self.replacement_char.len() * candidate.repeat,
240
441
  );
241
- replacement.push_str(&prefix);
242
- for _ in 0..repeat {
442
+ replacement.push_str(&candidate.prefix);
443
+ for _ in 0..candidate.repeat {
243
444
  replacement.push_str(&self.replacement_char);
244
445
  }
245
- replacement.push_str(&suffix);
246
- buffer.replace_word(*word_index, &replacement)?;
446
+ replacement.push_str(&candidate.suffix);
447
+ buffer.replace_word(candidate.index, &replacement)?;
247
448
  }
248
449
 
249
450
  if self.merge_adjacent {
@@ -384,6 +585,7 @@ mod tests {
384
585
  let mut rng = PyRng::new(151);
385
586
  let op = ReduplicateWordsOp {
386
587
  reduplication_rate: 1.0,
588
+ unweighted: false,
387
589
  };
388
590
  op.apply(&mut buffer, &mut rng)
389
591
  .expect("reduplication works");
@@ -396,6 +598,7 @@ mod tests {
396
598
  let mut rng = PyRng::new(151);
397
599
  let op = DeleteRandomWordsOp {
398
600
  max_deletion_rate: 0.75,
601
+ unweighted: false,
399
602
  };
400
603
  op.apply(&mut buffer, &mut rng).expect("deletion works");
401
604
  assert_eq!(buffer.to_string(), "One three four");
@@ -409,6 +612,7 @@ mod tests {
409
612
  replacement_char: "█".to_string(),
410
613
  redaction_rate: 0.8,
411
614
  merge_adjacent: true,
615
+ unweighted: false,
412
616
  };
413
617
  op.apply(&mut buffer, &mut rng).expect("redaction works");
414
618
  let result = buffer.to_string();
@@ -423,6 +627,7 @@ mod tests {
423
627
  replacement_char: "█".to_string(),
424
628
  redaction_rate: 0.5,
425
629
  merge_adjacent: false,
630
+ unweighted: false,
426
631
  };
427
632
  let error = op.apply(&mut buffer, &mut rng).unwrap_err();
428
633
  match error {
@@ -448,13 +653,11 @@ mod tests {
448
653
  let mut rng = PyRng::new(123);
449
654
  let op = ReduplicateWordsOp {
450
655
  reduplication_rate: 0.5,
656
+ unweighted: false,
451
657
  };
452
658
  op.apply(&mut buffer, &mut rng)
453
659
  .expect("reduplication succeeds");
454
- assert_eq!(
455
- buffer.to_string(),
456
- "The The quick quick brown brown fox fox"
457
- );
660
+ assert_eq!(buffer.to_string(), "The The quick quick brown fox fox");
458
661
  }
459
662
 
460
663
  #[test]
@@ -463,6 +666,7 @@ mod tests {
463
666
  let mut rng = PyRng::new(123);
464
667
  let op = DeleteRandomWordsOp {
465
668
  max_deletion_rate: 0.5,
669
+ unweighted: false,
466
670
  };
467
671
  op.apply(&mut buffer, &mut rng).expect("deletion succeeds");
468
672
  assert_eq!(buffer.to_string(), "The over the lazy dog.");
@@ -476,9 +680,10 @@ mod tests {
476
680
  replacement_char: "█".to_string(),
477
681
  redaction_rate: 0.5,
478
682
  merge_adjacent: false,
683
+ unweighted: false,
479
684
  };
480
685
  op.apply(&mut buffer, &mut rng).expect("redaction succeeds");
481
- assert_eq!(buffer.to_string(), "████ these words ██████");
686
+ assert_eq!(buffer.to_string(), "████ these █████ please");
482
687
  }
483
688
 
484
689
  #[test]
@@ -489,6 +694,7 @@ mod tests {
489
694
  replacement_char: "█".to_string(),
490
695
  redaction_rate: 1.0,
491
696
  merge_adjacent: true,
697
+ unweighted: false,
492
698
  };
493
699
  op.apply(&mut buffer, &mut rng).expect("redaction succeeds");
494
700
  assert_eq!(buffer.to_string(), "█████████████████");