glitchlings 0.4.0.tar.gz → 0.4.1.tar.gz
This diff shows the changes between two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of glitchlings might be problematic.
- {glitchlings-0.4.0 → glitchlings-0.4.1}/PKG-INFO +2 -2
- {glitchlings-0.4.0 → glitchlings-0.4.1}/README.md +1 -1
- {glitchlings-0.4.0 → glitchlings-0.4.1}/pyproject.toml +1 -1
- {glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/src/glitch_ops.rs +7 -2
- {glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/src/lib.rs +66 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/src/pipeline.rs +105 -1
- {glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/src/rng.rs +19 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/src/text_buffer.rs +45 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/lexicon/__init__.py +18 -0
- glitchlings-0.4.1/src/glitchlings/lexicon/_cache.py +111 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/lexicon/graph.py +16 -29
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/lexicon/vector.py +16 -35
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/lexicon/wordnet.py +12 -2
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/core.py +103 -13
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings.egg-info/PKG-INFO +2 -2
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings.egg-info/SOURCES.txt +4 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_benchmarks.py +49 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_cli.py +63 -0
- glitchlings-0.4.1/tests/test_config.py +196 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_gaggle.py +22 -0
- glitchlings-0.4.1/tests/test_graph_lexicon.py +81 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_huggingface_dlc.py +20 -0
- glitchlings-0.4.1/tests/test_lexicon_backends.py +85 -0
- glitchlings-0.4.1/tests/test_pipeline_operations.py +95 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_prime_echo_chamber.py +82 -0
- glitchlings-0.4.1/tests/test_rate_and_sampling.py +51 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_rust_backed_glitchlings.py +262 -10
- glitchlings-0.4.1/tests/test_vector_lexicon.py +438 -0
- glitchlings-0.4.0/tests/test_config.py +0 -59
- glitchlings-0.4.0/tests/test_graph_lexicon.py +0 -70
- glitchlings-0.4.0/tests/test_vector_lexicon.py +0 -193
- {glitchlings-0.4.0 → glitchlings-0.4.1}/LICENSE +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/MANIFEST.in +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/rust/Cargo.lock +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/rust/Cargo.toml +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/Cargo.toml +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/assets/ocr_confusions.tsv +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/build.rs +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/src/resources.rs +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/src/typogre.rs +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/src/zeedub.rs +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/setup.cfg +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/__init__.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/__main__.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/config.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/config.toml +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/dlc/__init__.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/dlc/huggingface.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/dlc/prime.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/lexicon/data/default_vector_cache.json +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/lexicon/metrics.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/main.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/util/__init__.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/__init__.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/_ocr_confusions.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/_rate.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/_sampling.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/_text_utils.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/adjax.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/jargoyle.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/mim1c.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/ocr_confusions.tsv +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/redactyl.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/reduple.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/rushmore.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/scannequin.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/typogre.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/zeedub.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings.egg-info/dependency_links.txt +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings.egg-info/entry_points.txt +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings.egg-info/requires.txt +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings.egg-info/top_level.txt +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_dataset_corruption.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_glitchling_core.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_glitchlings_determinism.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_jargoyle.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_keyboard_layouts.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_lexicon_config.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_lexicon_metrics.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_parameter_effects.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_property_based.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_text_utils.py +0 -0
- {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_util.py +0 -0
{glitchlings-0.4.0 → glitchlings-0.4.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: glitchlings
-Version: 0.4.0
+Version: 0.4.1
 Summary: Monsters for your language games.
 Author: osoleve
 License: Apache License
@@ -420,7 +420,7 @@ _How can a computer need reading glasses?_
 
 ### Zeedub
 
-
+_Watch your step around here._
 
 > _**Invisible Ink.**_ Zeedub slips zero-width codepoints between non-space character pairs, forcing models to reason about text whose visible form masks hidden glyphs.
 >
{glitchlings-0.4.0 → glitchlings-0.4.1}/README.md

@@ -177,7 +177,7 @@ _How can a computer need reading glasses?_
 
 ### Zeedub
 
-
+_Watch your step around here._
 
 > _**Invisible Ink.**_ Zeedub slips zero-width codepoints between non-space character pairs, forcing models to reason about text whose visible form masks hidden glyphs.
 >
{glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/src/glitch_ops.rs

@@ -398,6 +398,7 @@ impl GlitchOp for SwapAdjacentWordsOp {
         }
 
         let mut index = 0usize;
+        let mut replacements: SmallVec<[(usize, String); 8]> = SmallVec::new();
         while index + 1 < total_words {
             let left_segment = match buffer.word_segment(index) {
                 Some(segment) => segment,
@@ -423,13 +424,17 @@ impl GlitchOp for SwapAdjacentWordsOp {
             if should_swap {
                 let left_replacement = format!("{left_prefix}{right_core}{left_suffix}");
                 let right_replacement = format!("{right_prefix}{left_core}{right_suffix}");
-
-
+                replacements.push((index, left_replacement));
+                replacements.push((index + 1, right_replacement));
             }
 
             index += 2;
         }
 
+        if !replacements.is_empty() {
+            buffer.replace_words_bulk(replacements.into_iter())?;
+        }
+
         Ok(())
     }
 }
{glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/src/lib.rs

@@ -122,6 +122,47 @@ fn cached_layout_vec(layout_dict: &PyDict) -> PyResult<Arc<Vec<(String, Vec<String>)>>> {
     Ok(entry.clone())
 }
 
+#[derive(Debug)]
+struct PyGagglePlanInput {
+    name: String,
+    scope: i32,
+    order: i32,
+}
+
+impl<'py> FromPyObject<'py> for PyGagglePlanInput {
+    fn extract(obj: &'py PyAny) -> PyResult<Self> {
+        if let Ok(dict) = obj.downcast::<PyDict>() {
+            let name: String = dict
+                .get_item("name")?
+                .ok_or_else(|| PyValueError::new_err("plan input missing 'name' field"))?
+                .extract()?;
+            let scope: i32 = dict
+                .get_item("scope")?
+                .ok_or_else(|| PyValueError::new_err("plan input missing 'scope' field"))?
+                .extract()?;
+            let order: i32 = dict
+                .get_item("order")?
+                .ok_or_else(|| PyValueError::new_err("plan input missing 'order' field"))?
+                .extract()?;
+            return Ok(Self { name, scope, order });
+        }
+
+        let name = obj
+            .getattr("name")
+            .map_err(|_| PyValueError::new_err("plan input missing attribute 'name'"))?
+            .extract()?;
+        let scope = obj
+            .getattr("scope")
+            .map_err(|_| PyValueError::new_err("plan input missing attribute 'scope'"))?
+            .extract()?;
+        let order = obj
+            .getattr("order")
+            .map_err(|_| PyValueError::new_err("plan input missing attribute 'order'"))?
+            .extract()?;
+        Ok(Self { name, scope, order })
+    }
+}
+
 #[derive(Debug)]
 enum PyGlitchOperation {
     Reduplicate {
@@ -346,6 +387,30 @@ fn redact_words(
     apply_operation(text, op, rng).map_err(glitch_ops::GlitchOpError::into_pyerr)
 }
 
+#[pyfunction]
+fn plan_glitchlings(
+    glitchlings: Vec<PyGagglePlanInput>,
+    master_seed: i128,
+) -> PyResult<Vec<(usize, u64)>> {
+    let plan = pipeline::plan_gaggle(
+        glitchlings
+            .into_iter()
+            .enumerate()
+            .map(|(index, input)| pipeline::GagglePlanInput {
+                index,
+                name: input.name,
+                scope: input.scope,
+                order: input.order,
+            })
+            .collect(),
+        master_seed,
+    );
+    Ok(plan
+        .into_iter()
+        .map(|entry| (entry.index, entry.seed))
+        .collect())
+}
+
 #[pyfunction]
 fn compose_glitchlings(
     text: &str,
@@ -418,6 +483,7 @@ fn _zoo_rust(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_function(wrap_pyfunction!(swap_adjacent_words, m)?)?;
     m.add_function(wrap_pyfunction!(ocr_artifacts, m)?)?;
     m.add_function(wrap_pyfunction!(redact_words, m)?)?;
+    m.add_function(wrap_pyfunction!(plan_glitchlings, m)?)?;
    m.add_function(wrap_pyfunction!(compose_glitchlings, m)?)?;
    m.add_function(wrap_pyfunction!(typogre::fatfinger, m)?)?;
    m.add_function(wrap_pyfunction!(zeedub::inject_zero_widths, m)?)?;
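Once the extension is built, the new binding is callable from Python. A minimal sketch, assuming the compiled module is importable as glitchlings._zoo_rust (the #[pymodule] initializer above fixes the module name, but the package-relative import path is an assumption):

    # Sketch: driving the new plan_glitchlings binding from Python.
    # Import path assumed; only the module name _zoo_rust is confirmed by the diff.
    from glitchlings._zoo_rust import plan_glitchlings

    plan = plan_glitchlings(
        [
            {"name": "Typogre", "scope": 5, "order": 3},
            {"name": "Rushmore", "scope": 4, "order": 2},
        ],
        5151,
    )
    # Returns (original_index, derived_seed) pairs in execution order;
    # here Rushmore (lower scope) is planned before Typogre.
    print(plan)

Per the FromPyObject impl, each input may be either a dict with "name", "scope", and "order" keys or any object exposing those attributes.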
{glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/src/pipeline.rs

@@ -68,6 +68,57 @@ impl Pipeline {
     }
 }
 
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct GagglePlanEntry {
+    pub index: usize,
+    pub seed: u64,
+}
+
+#[derive(Debug, Clone)]
+pub struct GagglePlanInput {
+    pub index: usize,
+    pub name: String,
+    pub scope: i32,
+    pub order: i32,
+}
+
+struct PlannedGlitchling {
+    index: usize,
+    name: String,
+    scope: i32,
+    order: i32,
+    seed: u64,
+}
+
+pub fn plan_gaggle(inputs: Vec<GagglePlanInput>, master_seed: i128) -> Vec<GagglePlanEntry> {
+    let mut planned: Vec<PlannedGlitchling> = inputs
+        .into_iter()
+        .map(|input| PlannedGlitchling {
+            seed: derive_seed(master_seed, &input.name, input.index as i128),
+            index: input.index,
+            name: input.name,
+            scope: input.scope,
+            order: input.order,
+        })
+        .collect();
+
+    planned.sort_by(|left, right| {
+        left.scope
+            .cmp(&right.scope)
+            .then(left.order.cmp(&right.order))
+            .then(left.name.cmp(&right.name))
+            .then(left.index.cmp(&right.index))
+    });
+
+    planned
+        .into_iter()
+        .map(|item| GagglePlanEntry {
+            index: item.index,
+            seed: item.seed,
+        })
+        .collect()
+}
+
 pub fn derive_seed(master_seed: i128, glitchling_name: &str, index: i128) -> u64 {
     let mut hasher = Blake2s::<U8>::new();
     Digest::update(&mut hasher, int_to_bytes(master_seed));
@@ -109,7 +160,9 @@ fn int_to_bytes(value: i128) -> Vec<u8> {
 
 #[cfg(test)]
 mod tests {
-    use super::{
+    use super::{
+        derive_seed, plan_gaggle, GagglePlanEntry, GagglePlanInput, GlitchDescriptor, Pipeline,
+    };
     use crate::glitch_ops::{
         DeleteRandomWordsOp, GlitchOperation, OcrArtifactsOp, RedactWordsOp, ReduplicateWordsOp,
         SwapAdjacentWordsOp,
@@ -222,4 +275,55 @@ mod tests {
             .expect("pipeline succeeds");
         assert_eq!(output, "this Echo please line");
     }
+
+    #[test]
+    fn plan_gaggle_orders_by_scope_order_and_name() {
+        let master_seed = 5151i128;
+        let inputs = vec![
+            GagglePlanInput {
+                index: 0,
+                name: "Typogre".to_string(),
+                scope: 5,
+                order: 3,
+            },
+            GagglePlanInput {
+                index: 1,
+                name: "Reduple".to_string(),
+                scope: 4,
+                order: 3,
+            },
+            GagglePlanInput {
+                index: 2,
+                name: "Rushmore".to_string(),
+                scope: 4,
+                order: 2,
+            },
+            GagglePlanInput {
+                index: 3,
+                name: "Mim1c".to_string(),
+                scope: 5,
+                order: 2,
+            },
+        ];
+        let plan = plan_gaggle(inputs, master_seed);
+        let expected = vec![
+            GagglePlanEntry {
+                index: 2,
+                seed: derive_seed(master_seed, "Rushmore", 2),
+            },
+            GagglePlanEntry {
+                index: 1,
+                seed: derive_seed(master_seed, "Reduple", 1),
+            },
+            GagglePlanEntry {
+                index: 3,
+                seed: derive_seed(master_seed, "Mim1c", 3),
+            },
+            GagglePlanEntry {
+                index: 0,
+                seed: derive_seed(master_seed, "Typogre", 0),
+            },
+        ];
+        assert_eq!(plan, expected);
+    }
 }
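The planning contract is easy to restate: each glitchling's seed is derived from the master seed, its name, and its original position before sorting, so reordering can never change a seed, and execution order is a stable total order over (scope, order, name, index). A Python analogue of plan_gaggle, with a stand-in seed derivation (the real derive_seed uses an 8-byte BLAKE2s over a specific integer encoding that this sketch does not reproduce):

    from hashlib import blake2s

    def derive_seed_sketch(master_seed: int, name: str, index: int) -> int:
        # Illustrative stand-in; not bit-compatible with derive_seed above.
        digest = blake2s(f"{master_seed}:{name}:{index}".encode(), digest_size=8)
        return int.from_bytes(digest.digest(), "little")

    def plan_gaggle_sketch(inputs: list[dict], master_seed: int) -> list[tuple[int, int]]:
        # Seeds are fixed by (master_seed, name, original index) before sorting.
        planned = [
            {**item, "seed": derive_seed_sketch(master_seed, item["name"], item["index"])}
            for item in inputs
        ]
        # Stable total order: scope, then order, then name, then original index.
        planned.sort(key=lambda item: (item["scope"], item["order"], item["name"], item["index"]))
        return [(item["index"], item["seed"]) for item in planned]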
{glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/src/rng.rs

@@ -323,6 +323,25 @@ mod tests {
         }
     }
 
+    #[test]
+    fn random_matches_python_for_additional_seed() {
+        let mut rng = PyRng::new(3815924951222172525);
+        let expected = [
+            0.18518006574496737,
+            0.5841689581060610,
+            0.3699113163178772,
+            0.7394349068470196,
+            0.6855497906317899,
+        ];
+        for value in expected {
+            let actual = rng.random();
+            assert!(
+                (actual - value).abs() < 1e-15,
+                "expected {value}, got {actual}"
+            );
+        }
+    }
+
     #[test]
     fn randrange_supports_default_arguments() {
         let mut rng = PyRng::new(151);
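The expected constants mirror CPython's Mersenne Twister, which PyRng tracks; assuming full parity for this seed, they can be regenerated with the standard library:

    # Regenerate the reference values asserted in the Rust test above.
    import random

    rng = random.Random(3815924951222172525)
    print([rng.random() for _ in range(5)])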
{glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/src/text_buffer.rs

@@ -171,6 +171,35 @@ impl TextBuffer {
         Ok(())
     }
 
+    /// Replace multiple words in a single pass, avoiding repeated reindexing.
+    pub fn replace_words_bulk<I>(
+        &mut self,
+        replacements: I,
+    ) -> Result<(), TextBufferError>
+    where
+        I: IntoIterator<Item = (usize, String)>,
+    {
+        let mut applied_any = false;
+        for (word_index, replacement) in replacements {
+            let segment_index = self
+                .word_segment_indices
+                .get(word_index)
+                .copied()
+                .ok_or(TextBufferError::InvalidWordIndex { index: word_index })?;
+            let segment = self
+                .segments
+                .get_mut(segment_index)
+                .ok_or(TextBufferError::InvalidWordIndex { index: word_index })?;
+            segment.set_text(replacement, SegmentKind::Word);
+            applied_any = true;
+        }
+
+        if applied_any {
+            self.reindex();
+        }
+        Ok(())
+    }
+
     /// Deletes the word at the requested index.
     pub fn delete_word(&mut self, word_index: usize) -> Result<(), TextBufferError> {
         let segment_index = self
@@ -402,6 +431,22 @@ mod tests {
         assert_eq!(buffer.spans().len(), 5);
     }
 
+    #[test]
+    fn bulk_replace_words_updates_multiple_entries() {
+        let mut buffer = TextBuffer::from_str("alpha beta gamma delta");
+        buffer
+            .replace_words_bulk(vec![
+                (0, "delta".to_string()),
+                (3, "alpha".to_string()),
+            ])
+            .expect("bulk replace succeeds");
+        assert_eq!(buffer.to_string(), "delta beta gamma alpha");
+        let spans = buffer.spans();
+        assert_eq!(spans[0].char_range, 0..5);
+        assert_eq!(spans.len(), 7);
+        assert_eq!(spans.last().unwrap().char_range, 17..22);
+    }
+
     #[test]
     fn replace_char_range_handles_multisegment_updates() {
         let mut buffer = TextBuffer::from_str("Hello world");
{glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/lexicon/__init__.py

@@ -4,10 +4,12 @@ from __future__ import annotations
 
 from abc import ABC, abstractmethod
 from hashlib import blake2s
+from pathlib import Path
 import random
 from typing import Callable, Iterable
 
 from glitchlings.config import get_config
+from ._cache import CacheEntries, CacheSnapshot
 
 
 class Lexicon(ABC):
@@ -79,6 +81,21 @@ class Lexicon(ABC):
         return f"{self.__class__.__name__}(seed={self._seed!r})"
 
 
+class LexiconBackend(Lexicon):
+    """Extended lexicon interface that supports cache persistence."""
+
+    Cache = CacheEntries
+
+    @classmethod
+    @abstractmethod
+    def load_cache(cls, path: str | Path) -> CacheSnapshot:
+        """Return a validated cache snapshot loaded from ``path``."""
+
+    @abstractmethod
+    def save_cache(self, path: str | Path | None = None) -> Path | None:
+        """Persist the backend cache to ``path`` and return the destination."""
+
+
 from .graph import GraphLexicon
 from .metrics import (
     compare_lexicons,
@@ -176,6 +193,7 @@ def get_default_lexicon(seed: int | None = None) -> Lexicon:
 
 __all__ = [
     "Lexicon",
+    "LexiconBackend",
     "VectorLexicon",
     "GraphLexicon",
     "WordNetLexicon",
glitchlings-0.4.1/src/glitchlings/lexicon/_cache.py (new file)

@@ -0,0 +1,111 @@
+"""Shared cache helpers for lexicon backends."""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+from hashlib import blake2s
+from pathlib import Path
+from typing import Mapping, Sequence
+
+
+CacheEntries = dict[str, list[str]]
+
+
+@dataclass(frozen=True)
+class CacheSnapshot:
+    """Materialised cache data and its integrity checksum."""
+
+    entries: CacheEntries
+    checksum: str | None = None
+
+
+def _normalise_entries(payload: Mapping[str, Sequence[str]]) -> CacheEntries:
+    """Convert raw cache payloads into canonical mapping form."""
+
+    entries: CacheEntries = {}
+    for key, values in payload.items():
+        if not isinstance(key, str):
+            raise RuntimeError("Synonym cache keys must be strings.")
+        if not isinstance(values, Sequence):
+            raise RuntimeError("Synonym cache values must be sequences of strings.")
+        entries[key] = [str(value) for value in values]
+    return entries
+
+
+def _canonical_json(entries: Mapping[str, Sequence[str]]) -> str:
+    """Return a deterministic JSON serialisation for ``entries``."""
+
+    serialisable = {key: list(values) for key, values in sorted(entries.items())}
+    return json.dumps(serialisable, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
+
+
+def compute_checksum(entries: Mapping[str, Sequence[str]]) -> str:
+    """Return a BLAKE2s checksum for ``entries``."""
+
+    digest = blake2s(_canonical_json(entries).encode("utf8"), digest_size=16)
+    return digest.hexdigest()
+
+
+def load_cache(path: Path) -> CacheSnapshot:
+    """Load a cache from ``path`` and verify its checksum if present."""
+
+    if not path.exists():
+        return CacheSnapshot(entries={}, checksum=None)
+
+    with path.open("r", encoding="utf8") as handle:
+        payload = json.load(handle)
+
+    checksum: str | None = None
+    entries_payload: Mapping[str, Sequence[str]]
+
+    if isinstance(payload, Mapping) and "__meta__" in payload and "entries" in payload:
+        meta = payload["__meta__"]
+        entries_payload = payload["entries"]  # type: ignore[assignment]
+        if not isinstance(entries_payload, Mapping):
+            raise RuntimeError("Synonym cache entries must be stored as a mapping.")
+        if isinstance(meta, Mapping):
+            raw_checksum = meta.get("checksum")
+            if raw_checksum is not None and not isinstance(raw_checksum, str):
+                raise RuntimeError("Synonym cache checksum must be a string when provided.")
+            checksum = raw_checksum
+        else:
+            raise RuntimeError("Synonym cache metadata must be a mapping.")
+    elif isinstance(payload, Mapping):
+        entries_payload = payload  # legacy format without metadata
+    else:
+        raise RuntimeError("Synonym cache payload must be a mapping of strings to lists.")
+
+    entries = _normalise_entries(entries_payload)
+    if checksum is not None:
+        expected = compute_checksum(entries)
+        if checksum != expected:
+            raise RuntimeError(
+                "Synonym cache checksum mismatch; the cache file appears to be corrupted."
+            )
+
+    return CacheSnapshot(entries=entries, checksum=checksum)
+
+
+def write_cache(path: Path, entries: Mapping[str, Sequence[str]]) -> CacheSnapshot:
+    """Persist ``entries`` to ``path`` with checksum metadata."""
+
+    serialisable = {key: list(values) for key, values in sorted(entries.items())}
+    checksum = compute_checksum(serialisable)
+    payload = {
+        "__meta__": {
+            "checksum": checksum,
+            "entries": len(serialisable),
+        },
+        "entries": serialisable,
+    }
+    path.parent.mkdir(parents=True, exist_ok=True)
+
+    with path.open("w", encoding="utf8") as handle:
+        json.dump(payload, handle, ensure_ascii=False, indent=2, sort_keys=True)
+
+    return CacheSnapshot(entries=serialisable, checksum=checksum)
+
+
+__all__ = ["CacheEntries", "CacheSnapshot", "compute_checksum", "load_cache", "write_cache"]
+
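These helpers give every backend one on-disk format: a mapping with a __meta__ block carrying a BLAKE2s checksum and entry count, plus the sorted entries; legacy files that are a bare mapping still load, just without verification. A minimal round trip using the functions above (file name illustrative):

    from pathlib import Path

    from glitchlings.lexicon._cache import compute_checksum, load_cache, write_cache

    entries = {"cat": ["feline", "kitty"], "dog": ["canine"]}
    snapshot = write_cache(Path("synonyms.json"), entries)
    assert snapshot.checksum == compute_checksum(entries)

    # load_cache re-verifies the stored checksum and raises RuntimeError
    # if the file has been tampered with or corrupted.
    reloaded = load_cache(Path("synonyms.json"))
    assert reloaded.entries == entries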
{glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/lexicon/graph.py

@@ -2,12 +2,12 @@
 
 from __future__ import annotations
 
-import json
 import re
 from pathlib import Path
 from typing import Iterable, Mapping, MutableMapping, Sequence
 
-from . import Lexicon
+from . import LexiconBackend
+from ._cache import CacheSnapshot, load_cache as _load_cache_file, write_cache as _write_cache_file
 from .vector import VectorLexicon
 
 
@@ -140,30 +140,7 @@ def _load_numberbatch(path: Path, *, languages: set[str]) -> Mapping[str, list[float]]:
     return embeddings
 
 
-
-    if not path.exists():
-        return {}
-    with path.open("r", encoding="utf8") as handle:
-        payload = json.load(handle)
-    if not isinstance(payload, Mapping):
-        raise RuntimeError("Graph lexicon cache must be a mapping of strings to lists.")
-    cache: dict[str, list[str]] = {}
-    for key, values in payload.items():
-        if not isinstance(key, str):
-            raise RuntimeError("Graph lexicon cache keys must be strings.")
-        if not isinstance(values, Sequence):
-            raise RuntimeError("Graph lexicon cache values must be sequences of strings.")
-        cache[key] = [str(value) for value in values]
-    return cache
-
-
-def _write_cache(path: Path, cache: Mapping[str, Sequence[str]]) -> None:
-    serialisable = {key: list(values) for key, values in sorted(cache.items())}
-    with path.open("w", encoding="utf8") as handle:
-        json.dump(serialisable, handle, ensure_ascii=False, indent=2, sort_keys=True)
-
-
-class GraphLexicon(Lexicon):
+class GraphLexicon(LexiconBackend):
     """Lexicon backed by ConceptNet/Numberbatch embeddings."""
 
     def __init__(
@@ -184,9 +161,12 @@ class GraphLexicon(Lexicon):
         self._max_neighbors = max(1, max_neighbors)
         self._min_similarity = min_similarity
         self._cache: MutableMapping[str, list[str]] = {}
-        self._cache_path = Path(cache_path) if cache_path is not None else None
+        self._cache_path: Path | None = Path(cache_path) if cache_path is not None else None
+        self._cache_checksum: str | None = None
         if self._cache_path is not None:
-
+            snapshot = _load_cache_file(self._cache_path)
+            self._cache.update(snapshot.entries)
+            self._cache_checksum = snapshot.checksum
         if cache is not None:
             for key, values in cache.items():
                 self._cache[str(key)] = [str(value) for value in values]
@@ -278,6 +258,12 @@ class GraphLexicon(Lexicon):
     def export_cache(self) -> dict[str, list[str]]:
         return {key: list(values) for key, values in self._cache.items()}
 
+    @classmethod
+    def load_cache(cls, path: str | Path) -> CacheSnapshot:
+        """Load and validate a persisted ConceptNet cache file."""
+
+        return _load_cache_file(Path(path))
+
     def save_cache(self, path: str | Path | None = None) -> Path:
         if path is None:
             if self._cache_path is None:
@@ -286,7 +272,8 @@ class GraphLexicon(Lexicon):
         else:
             target = Path(path)
         self._cache_path = target
-
+        snapshot = _write_cache_file(target, self._cache)
+        self._cache_checksum = snapshot.checksum
         self._cache_dirty = False
         return target
 
{glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/lexicon/vector.py

@@ -10,7 +10,8 @@ from pathlib import Path
 import sys
 from typing import Any, Callable, Iterable, Iterator, Mapping, MutableMapping, Sequence
 
-from . import Lexicon
+from . import LexiconBackend
+from ._cache import CacheSnapshot, load_cache as _load_cache_file, write_cache as _write_cache_file
 
 
 def _cosine_similarity(vector_a: Sequence[float], vector_b: Sequence[float]) -> float:
@@ -241,38 +242,7 @@ def _resolve_source(source: Any | None) -> _Adapter | None:
     raise RuntimeError("Unsupported vector source supplied to VectorLexicon.")
 
 
-
-    """Load a synonym cache from ``path`` if it exists."""
-
-    if not path.exists():
-        return {}
-
-    with path.open("r", encoding="utf8") as handle:
-        payload = json.load(handle)
-
-    if not isinstance(payload, Mapping):
-        raise RuntimeError("Synonym cache must be a JSON mapping of strings to lists.")
-
-    cache: dict[str, list[str]] = {}
-    for key, values in payload.items():
-        if not isinstance(key, str):
-            raise RuntimeError("Synonym cache keys must be strings.")
-        if not isinstance(values, Sequence):
-            raise RuntimeError("Synonym cache values must be lists of strings.")
-        cache[key] = [str(value) for value in values]
-
-    return cache
-
-
-def _write_cache(path: Path, cache: Mapping[str, Sequence[str]]) -> None:
-    """Write ``cache`` to ``path`` deterministically."""
-
-    serialisable = {key: list(values) for key, values in sorted(cache.items())}
-    with path.open("w", encoding="utf8") as handle:
-        json.dump(serialisable, handle, ensure_ascii=False, indent=2, sort_keys=True)
-
-
-class VectorLexicon(Lexicon):
+class VectorLexicon(LexiconBackend):
     """Lexicon implementation backed by dense word embeddings."""
 
     def __init__(
@@ -292,9 +262,13 @@ class VectorLexicon(Lexicon):
         self._max_neighbors = max(1, max_neighbors)
         self._min_similarity = min_similarity
         self._cache: MutableMapping[str, list[str]] = {}
+        self._cache_path: Path | None
+        self._cache_checksum: str | None = None
         if cache_path is not None:
             path = Path(cache_path)
-
+            snapshot = _load_cache_file(path)
+            self._cache.update(snapshot.entries)
+            self._cache_checksum = snapshot.checksum
             self._cache_path = path
         else:
             self._cache_path = None
@@ -411,6 +385,12 @@ class VectorLexicon(Lexicon):
 
         return {key: list(values) for key, values in self._cache.items()}
 
+    @classmethod
+    def load_cache(cls, path: str | Path) -> CacheSnapshot:
+        """Load and validate a cache file for reuse."""
+
+        return _load_cache_file(Path(path))
+
     def save_cache(self, path: str | Path | None = None) -> Path:
         """Persist the current cache to disk, returning the path used."""
 
@@ -422,7 +402,8 @@ class VectorLexicon(Lexicon):
         target = Path(path)
         self._cache_path = target
 
-
+        snapshot = _write_cache_file(target, self._cache)
+        self._cache_checksum = snapshot.checksum
         self._cache_dirty = False
         return target
 
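With both GraphLexicon and VectorLexicon now subclassing LexiconBackend, a persisted cache can be validated without constructing a lexicon at all (file name illustrative):

    from glitchlings.lexicon import VectorLexicon

    # New in 0.4.1: classmethod that loads and checksum-verifies a cache file.
    snapshot = VectorLexicon.load_cache("synonyms.json")
    print(snapshot.checksum, len(snapshot.entries))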