glitchlings 0.4.0__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of glitchlings might be problematic.

Files changed (83)
  1. {glitchlings-0.4.0 → glitchlings-0.4.1}/PKG-INFO +2 -2
  2. {glitchlings-0.4.0 → glitchlings-0.4.1}/README.md +1 -1
  3. {glitchlings-0.4.0 → glitchlings-0.4.1}/pyproject.toml +1 -1
  4. {glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/src/glitch_ops.rs +7 -2
  5. {glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/src/lib.rs +66 -0
  6. {glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/src/pipeline.rs +105 -1
  7. {glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/src/rng.rs +19 -0
  8. {glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/src/text_buffer.rs +45 -0
  9. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/lexicon/__init__.py +18 -0
  10. glitchlings-0.4.1/src/glitchlings/lexicon/_cache.py +111 -0
  11. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/lexicon/graph.py +16 -29
  12. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/lexicon/vector.py +16 -35
  13. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/lexicon/wordnet.py +12 -2
  14. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/core.py +103 -13
  15. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings.egg-info/PKG-INFO +2 -2
  16. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings.egg-info/SOURCES.txt +4 -0
  17. {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_benchmarks.py +49 -0
  18. {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_cli.py +63 -0
  19. glitchlings-0.4.1/tests/test_config.py +196 -0
  20. {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_gaggle.py +22 -0
  21. glitchlings-0.4.1/tests/test_graph_lexicon.py +81 -0
  22. {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_huggingface_dlc.py +20 -0
  23. glitchlings-0.4.1/tests/test_lexicon_backends.py +85 -0
  24. glitchlings-0.4.1/tests/test_pipeline_operations.py +95 -0
  25. {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_prime_echo_chamber.py +82 -0
  26. glitchlings-0.4.1/tests/test_rate_and_sampling.py +51 -0
  27. {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_rust_backed_glitchlings.py +262 -10
  28. glitchlings-0.4.1/tests/test_vector_lexicon.py +438 -0
  29. glitchlings-0.4.0/tests/test_config.py +0 -59
  30. glitchlings-0.4.0/tests/test_graph_lexicon.py +0 -70
  31. glitchlings-0.4.0/tests/test_vector_lexicon.py +0 -193
  32. {glitchlings-0.4.0 → glitchlings-0.4.1}/LICENSE +0 -0
  33. {glitchlings-0.4.0 → glitchlings-0.4.1}/MANIFEST.in +0 -0
  34. {glitchlings-0.4.0 → glitchlings-0.4.1}/rust/Cargo.lock +0 -0
  35. {glitchlings-0.4.0 → glitchlings-0.4.1}/rust/Cargo.toml +0 -0
  36. {glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/Cargo.toml +0 -0
  37. {glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/assets/ocr_confusions.tsv +0 -0
  38. {glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/build.rs +0 -0
  39. {glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/src/resources.rs +0 -0
  40. {glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/src/typogre.rs +0 -0
  41. {glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/src/zeedub.rs +0 -0
  42. {glitchlings-0.4.0 → glitchlings-0.4.1}/setup.cfg +0 -0
  43. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/__init__.py +0 -0
  44. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/__main__.py +0 -0
  45. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/config.py +0 -0
  46. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/config.toml +0 -0
  47. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/dlc/__init__.py +0 -0
  48. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/dlc/huggingface.py +0 -0
  49. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/dlc/prime.py +0 -0
  50. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/lexicon/data/default_vector_cache.json +0 -0
  51. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/lexicon/metrics.py +0 -0
  52. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/main.py +0 -0
  53. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/util/__init__.py +0 -0
  54. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/__init__.py +0 -0
  55. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/_ocr_confusions.py +0 -0
  56. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/_rate.py +0 -0
  57. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/_sampling.py +0 -0
  58. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/_text_utils.py +0 -0
  59. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/adjax.py +0 -0
  60. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/jargoyle.py +0 -0
  61. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/mim1c.py +0 -0
  62. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/ocr_confusions.tsv +0 -0
  63. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/redactyl.py +0 -0
  64. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/reduple.py +0 -0
  65. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/rushmore.py +0 -0
  66. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/scannequin.py +0 -0
  67. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/typogre.py +0 -0
  68. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/zoo/zeedub.py +0 -0
  69. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings.egg-info/dependency_links.txt +0 -0
  70. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings.egg-info/entry_points.txt +0 -0
  71. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings.egg-info/requires.txt +0 -0
  72. {glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings.egg-info/top_level.txt +0 -0
  73. {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_dataset_corruption.py +0 -0
  74. {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_glitchling_core.py +0 -0
  75. {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_glitchlings_determinism.py +0 -0
  76. {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_jargoyle.py +0 -0
  77. {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_keyboard_layouts.py +0 -0
  78. {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_lexicon_config.py +0 -0
  79. {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_lexicon_metrics.py +0 -0
  80. {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_parameter_effects.py +0 -0
  81. {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_property_based.py +0 -0
  82. {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_text_utils.py +0 -0
  83. {glitchlings-0.4.0 → glitchlings-0.4.1}/tests/test_util.py +0 -0
{glitchlings-0.4.0 → glitchlings-0.4.1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: glitchlings
-Version: 0.4.0
+Version: 0.4.1
 Summary: Monsters for your language games.
 Author: osoleve
 License: Apache License
@@ -420,7 +420,7 @@ _How can a computer need reading glasses?_
 
 ### Zeedub
 
-_A whispering glyph parasite that lives in the interstices of codepoints, marking territory with invisible traces._
+_Watch your step around here._
 
 > _**Invisible Ink.**_ Zeedub slips zero-width codepoints between non-space character pairs, forcing models to reason about text whose visible form masks hidden glyphs.
 >
{glitchlings-0.4.0 → glitchlings-0.4.1}/README.md
@@ -177,7 +177,7 @@ _How can a computer need reading glasses?_
 
 ### Zeedub
 
-_A whispering glyph parasite that lives in the interstices of codepoints, marking territory with invisible traces._
+_Watch your step around here._
 
 > _**Invisible Ink.**_ Zeedub slips zero-width codepoints between non-space character pairs, forcing models to reason about text whose visible form masks hidden glyphs.
 >
{glitchlings-0.4.0 → glitchlings-0.4.1}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "glitchlings"
-version = "0.4.0"
+version = "0.4.1"
 description = "Monsters for your language games."
 readme = "README.md"
 requires-python = ">=3.10"
{glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/src/glitch_ops.rs
@@ -398,6 +398,7 @@ impl GlitchOp for SwapAdjacentWordsOp {
         }
 
         let mut index = 0usize;
+        let mut replacements: SmallVec<[(usize, String); 8]> = SmallVec::new();
         while index + 1 < total_words {
             let left_segment = match buffer.word_segment(index) {
                 Some(segment) => segment,
@@ -423,13 +424,17 @@ impl GlitchOp for SwapAdjacentWordsOp {
             if should_swap {
                 let left_replacement = format!("{left_prefix}{right_core}{left_suffix}");
                 let right_replacement = format!("{right_prefix}{left_core}{right_suffix}");
-                buffer.replace_word(index, &left_replacement)?;
-                buffer.replace_word(index + 1, &right_replacement)?;
+                replacements.push((index, left_replacement));
+                replacements.push((index + 1, right_replacement));
             }
 
             index += 2;
         }
 
+        if !replacements.is_empty() {
+            buffer.replace_words_bulk(replacements.into_iter())?;
+        }
+
         Ok(())
     }
 }
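The interesting change here is the batching: instead of reindexing the buffer after every replace_word call, the op now queues swaps and applies them once through replace_words_bulk. A rough Python rendering of the pattern, with a plain list standing in for the Rust TextBuffer (these names are illustrative, not part of the package):

# Batch-then-apply sketch of the SwapAdjacentWordsOp change above.
def swap_adjacent(words, should_swap):
    replacements = []
    index = 0
    while index + 1 < len(words):
        if should_swap(index):
            # Queue both halves of the swap instead of mutating immediately.
            replacements.append((index, words[index + 1]))
            replacements.append((index + 1, words[index]))
        index += 2  # non-overlapping pairs, mirroring the Rust loop
    for position, text in replacements:  # one bulk application at the end
        words[position] = text
    return words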
{glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/src/lib.rs
@@ -122,6 +122,47 @@ fn cached_layout_vec(layout_dict: &PyDict) -> PyResult<Arc<Vec<(String, Vec<Stri
     Ok(entry.clone())
 }
 
+#[derive(Debug)]
+struct PyGagglePlanInput {
+    name: String,
+    scope: i32,
+    order: i32,
+}
+
+impl<'py> FromPyObject<'py> for PyGagglePlanInput {
+    fn extract(obj: &'py PyAny) -> PyResult<Self> {
+        if let Ok(dict) = obj.downcast::<PyDict>() {
+            let name: String = dict
+                .get_item("name")?
+                .ok_or_else(|| PyValueError::new_err("plan input missing 'name' field"))?
+                .extract()?;
+            let scope: i32 = dict
+                .get_item("scope")?
+                .ok_or_else(|| PyValueError::new_err("plan input missing 'scope' field"))?
+                .extract()?;
+            let order: i32 = dict
+                .get_item("order")?
+                .ok_or_else(|| PyValueError::new_err("plan input missing 'order' field"))?
+                .extract()?;
+            return Ok(Self { name, scope, order });
+        }
+
+        let name = obj
+            .getattr("name")
+            .map_err(|_| PyValueError::new_err("plan input missing attribute 'name'"))?
+            .extract()?;
+        let scope = obj
+            .getattr("scope")
+            .map_err(|_| PyValueError::new_err("plan input missing attribute 'scope'"))?
+            .extract()?;
+        let order = obj
+            .getattr("order")
+            .map_err(|_| PyValueError::new_err("plan input missing attribute 'order'"))?
+            .extract()?;
+        Ok(Self { name, scope, order })
+    }
+}
+
 #[derive(Debug)]
 enum PyGlitchOperation {
     Reduplicate {
@@ -346,6 +387,30 @@ fn redact_words(
     apply_operation(text, op, rng).map_err(glitch_ops::GlitchOpError::into_pyerr)
 }
 
+#[pyfunction]
+fn plan_glitchlings(
+    glitchlings: Vec<PyGagglePlanInput>,
+    master_seed: i128,
+) -> PyResult<Vec<(usize, u64)>> {
+    let plan = pipeline::plan_gaggle(
+        glitchlings
+            .into_iter()
+            .enumerate()
+            .map(|(index, input)| pipeline::GagglePlanInput {
+                index,
+                name: input.name,
+                scope: input.scope,
+                order: input.order,
+            })
+            .collect(),
+        master_seed,
+    );
+    Ok(plan
+        .into_iter()
+        .map(|entry| (entry.index, entry.seed))
+        .collect())
+}
+
 #[pyfunction]
 fn compose_glitchlings(
     text: &str,
@@ -418,6 +483,7 @@ fn _zoo_rust(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_function(wrap_pyfunction!(swap_adjacent_words, m)?)?;
     m.add_function(wrap_pyfunction!(ocr_artifacts, m)?)?;
     m.add_function(wrap_pyfunction!(redact_words, m)?)?;
+    m.add_function(wrap_pyfunction!(plan_glitchlings, m)?)?;
     m.add_function(wrap_pyfunction!(compose_glitchlings, m)?)?;
     m.add_function(wrap_pyfunction!(typogre::fatfinger, m)?)?;
     m.add_function(wrap_pyfunction!(zeedub::inject_zero_widths, m)?)?;
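Once registered, the planner is callable from Python. A hedged sketch of its use; the glitchlings._zoo_rust import path is an assumption based on the module name above, and inputs may be dicts or objects exposing name, scope, and order:

from glitchlings._zoo_rust import plan_glitchlings  # assumed import path

plan = plan_glitchlings(
    [
        {"name": "Typogre", "scope": 5, "order": 3},
        {"name": "Rushmore", "scope": 4, "order": 2},
    ],
    5151,  # master_seed
)
# Entries pair each glitchling's original list index with its derived seed,
# in execution order; Rushmore's lower scope puts it first here.
for original_index, seed in plan:
    print(original_index, seed)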
{glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/src/pipeline.rs
@@ -68,6 +68,57 @@ impl Pipeline {
     }
 }
 
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct GagglePlanEntry {
+    pub index: usize,
+    pub seed: u64,
+}
+
+#[derive(Debug, Clone)]
+pub struct GagglePlanInput {
+    pub index: usize,
+    pub name: String,
+    pub scope: i32,
+    pub order: i32,
+}
+
+struct PlannedGlitchling {
+    index: usize,
+    name: String,
+    scope: i32,
+    order: i32,
+    seed: u64,
+}
+
+pub fn plan_gaggle(inputs: Vec<GagglePlanInput>, master_seed: i128) -> Vec<GagglePlanEntry> {
+    let mut planned: Vec<PlannedGlitchling> = inputs
+        .into_iter()
+        .map(|input| PlannedGlitchling {
+            seed: derive_seed(master_seed, &input.name, input.index as i128),
+            index: input.index,
+            name: input.name,
+            scope: input.scope,
+            order: input.order,
+        })
+        .collect();
+
+    planned.sort_by(|left, right| {
+        left.scope
+            .cmp(&right.scope)
+            .then(left.order.cmp(&right.order))
+            .then(left.name.cmp(&right.name))
+            .then(left.index.cmp(&right.index))
+    });
+
+    planned
+        .into_iter()
+        .map(|item| GagglePlanEntry {
+            index: item.index,
+            seed: item.seed,
+        })
+        .collect()
+}
+
 pub fn derive_seed(master_seed: i128, glitchling_name: &str, index: i128) -> u64 {
     let mut hasher = Blake2s::<U8>::new();
     Digest::update(&mut hasher, int_to_bytes(master_seed));
@@ -109,7 +160,9 @@ fn int_to_bytes(value: i128) -> Vec<u8> {
 
 #[cfg(test)]
 mod tests {
-    use super::{derive_seed, GlitchDescriptor, Pipeline};
+    use super::{
+        derive_seed, plan_gaggle, GagglePlanEntry, GagglePlanInput, GlitchDescriptor, Pipeline,
+    };
     use crate::glitch_ops::{
         DeleteRandomWordsOp, GlitchOperation, OcrArtifactsOp, RedactWordsOp, ReduplicateWordsOp,
         SwapAdjacentWordsOp,
@@ -222,4 +275,55 @@ mod tests {
             .expect("pipeline succeeds");
         assert_eq!(output, "this Echo please line");
     }
+
+    #[test]
+    fn plan_gaggle_orders_by_scope_order_and_name() {
+        let master_seed = 5151i128;
+        let inputs = vec![
+            GagglePlanInput {
+                index: 0,
+                name: "Typogre".to_string(),
+                scope: 5,
+                order: 3,
+            },
+            GagglePlanInput {
+                index: 1,
+                name: "Reduple".to_string(),
+                scope: 4,
+                order: 3,
+            },
+            GagglePlanInput {
+                index: 2,
+                name: "Rushmore".to_string(),
+                scope: 4,
+                order: 2,
+            },
+            GagglePlanInput {
+                index: 3,
+                name: "Mim1c".to_string(),
+                scope: 5,
+                order: 2,
+            },
+        ];
+        let plan = plan_gaggle(inputs, master_seed);
+        let expected = vec![
+            GagglePlanEntry {
+                index: 2,
+                seed: derive_seed(master_seed, "Rushmore", 2),
+            },
+            GagglePlanEntry {
+                index: 1,
+                seed: derive_seed(master_seed, "Reduple", 1),
+            },
+            GagglePlanEntry {
+                index: 3,
+                seed: derive_seed(master_seed, "Mim1c", 3),
+            },
+            GagglePlanEntry {
+                index: 0,
+                seed: derive_seed(master_seed, "Typogre", 0),
+            },
+        ];
+        assert_eq!(plan, expected);
+    }
 }
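The new test pins down the planner's contract: glitchlings execute sorted by (scope, order, name, original index), while each seed is derived from the original index, so reordering never changes a glitchling's seed. The ordering rule restated in Python:

# Pure-Python restatement of the ordering plan_gaggle applies.
inputs = [
    (0, "Typogre", 5, 3),   # (index, name, scope, order)
    (1, "Reduple", 4, 3),
    (2, "Rushmore", 4, 2),
    (3, "Mim1c", 5, 2),
]
ordered = sorted(inputs, key=lambda g: (g[2], g[3], g[1], g[0]))
assert [g[0] for g in ordered] == [2, 1, 3, 0]  # matches the expected plan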
{glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/src/rng.rs
@@ -323,6 +323,25 @@ mod tests {
         }
     }
 
+    #[test]
+    fn random_matches_python_for_additional_seed() {
+        let mut rng = PyRng::new(3815924951222172525);
+        let expected = [
+            0.18518006574496737,
+            0.5841689581060610,
+            0.3699113163178772,
+            0.7394349068470196,
+            0.6855497906317899,
+        ];
+        for value in expected {
+            let actual = rng.random();
+            assert!(
+                (actual - value).abs() < 1e-15,
+                "expected {value}, got {actual}"
+            );
+        }
+    }
+
     #[test]
     fn randrange_supports_default_arguments() {
         let mut rng = PyRng::new(151);
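The test name implies the fixture values come from CPython's Mersenne Twister. If so, they are reproducible directly with the standard library (a sanity check, not part of the package):

import random

rng = random.Random(3815924951222172525)  # same seed as PyRng::new above
for expected in (0.18518006574496737, 0.5841689581060610, 0.3699113163178772):
    assert abs(rng.random() - expected) < 1e-15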
{glitchlings-0.4.0 → glitchlings-0.4.1}/rust/zoo/src/text_buffer.rs
@@ -171,6 +171,35 @@ impl TextBuffer {
         Ok(())
     }
 
+    /// Replace multiple words in a single pass, avoiding repeated reindexing.
+    pub fn replace_words_bulk<I>(
+        &mut self,
+        replacements: I,
+    ) -> Result<(), TextBufferError>
+    where
+        I: IntoIterator<Item = (usize, String)>,
+    {
+        let mut applied_any = false;
+        for (word_index, replacement) in replacements {
+            let segment_index = self
+                .word_segment_indices
+                .get(word_index)
+                .copied()
+                .ok_or(TextBufferError::InvalidWordIndex { index: word_index })?;
+            let segment = self
+                .segments
+                .get_mut(segment_index)
+                .ok_or(TextBufferError::InvalidWordIndex { index: word_index })?;
+            segment.set_text(replacement, SegmentKind::Word);
+            applied_any = true;
+        }
+
+        if applied_any {
+            self.reindex();
+        }
+        Ok(())
+    }
+
     /// Deletes the word at the requested index.
     pub fn delete_word(&mut self, word_index: usize) -> Result<(), TextBufferError> {
         let segment_index = self
@@ -402,6 +431,22 @@ mod tests {
         assert_eq!(buffer.spans().len(), 5);
     }
 
+    #[test]
+    fn bulk_replace_words_updates_multiple_entries() {
+        let mut buffer = TextBuffer::from_str("alpha beta gamma delta");
+        buffer
+            .replace_words_bulk(vec![
+                (0, "delta".to_string()),
+                (3, "alpha".to_string()),
+            ])
+            .expect("bulk replace succeeds");
+        assert_eq!(buffer.to_string(), "delta beta gamma alpha");
+        let spans = buffer.spans();
+        assert_eq!(spans[0].char_range, 0..5);
+        assert_eq!(spans.len(), 7);
+        assert_eq!(spans.last().unwrap().char_range, 17..22);
+    }
+
     #[test]
     fn replace_char_range_handles_multisegment_updates() {
         let mut buffer = TextBuffer::from_str("Hello world");
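A whitespace-only Python model reproduces the test's expectations and shows why a single reindex suffices; TextBuffer's real segment bookkeeping is reduced to split/join here:

# Toy model of replace_words_bulk: validate indices, apply every edit,
# then "reindex" exactly once via join.
def replace_words_bulk(text, replacements):
    words = text.split(" ")
    for index, replacement in replacements:
        if index >= len(words):
            raise IndexError(index)  # stands in for InvalidWordIndex
        words[index] = replacement
    return " ".join(words)

out = replace_words_bulk("alpha beta gamma delta", [(0, "delta"), (3, "alpha")])
assert out == "delta beta gamma alpha"
assert out.index("alpha") == 17  # the final span's 17..22 char_range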
{glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/lexicon/__init__.py
@@ -4,10 +4,12 @@ from __future__ import annotations
 
 from abc import ABC, abstractmethod
 from hashlib import blake2s
+from pathlib import Path
 import random
 from typing import Callable, Iterable
 
 from glitchlings.config import get_config
+from ._cache import CacheEntries, CacheSnapshot
 
 
 class Lexicon(ABC):
@@ -79,6 +81,21 @@ class Lexicon(ABC):
         return f"{self.__class__.__name__}(seed={self._seed!r})"
 
 
+class LexiconBackend(Lexicon):
+    """Extended lexicon interface that supports cache persistence."""
+
+    Cache = CacheEntries
+
+    @classmethod
+    @abstractmethod
+    def load_cache(cls, path: str | Path) -> CacheSnapshot:
+        """Return a validated cache snapshot loaded from ``path``."""
+
+    @abstractmethod
+    def save_cache(self, path: str | Path | None = None) -> Path | None:
+        """Persist the backend cache to ``path`` and return the destination."""
+
+
 from .graph import GraphLexicon
 from .metrics import (
     compare_lexicons,
@@ -176,6 +193,7 @@ def get_default_lexicon(seed: int | None = None) -> Lexicon:
 
 __all__ = [
     "Lexicon",
+    "LexiconBackend",
     "VectorLexicon",
     "GraphLexicon",
     "WordNetLexicon",
glitchlings-0.4.1/src/glitchlings/lexicon/_cache.py
@@ -0,0 +1,111 @@
+"""Shared cache helpers for lexicon backends."""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+from hashlib import blake2s
+from pathlib import Path
+from typing import Mapping, Sequence
+
+
+CacheEntries = dict[str, list[str]]
+
+
+@dataclass(frozen=True)
+class CacheSnapshot:
+    """Materialised cache data and its integrity checksum."""
+
+    entries: CacheEntries
+    checksum: str | None = None
+
+
+def _normalise_entries(payload: Mapping[str, Sequence[str]]) -> CacheEntries:
+    """Convert raw cache payloads into canonical mapping form."""
+
+    entries: CacheEntries = {}
+    for key, values in payload.items():
+        if not isinstance(key, str):
+            raise RuntimeError("Synonym cache keys must be strings.")
+        if not isinstance(values, Sequence):
+            raise RuntimeError("Synonym cache values must be sequences of strings.")
+        entries[key] = [str(value) for value in values]
+    return entries
+
+
+def _canonical_json(entries: Mapping[str, Sequence[str]]) -> str:
+    """Return a deterministic JSON serialisation for ``entries``."""
+
+    serialisable = {key: list(values) for key, values in sorted(entries.items())}
+    return json.dumps(serialisable, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
+
+
+def compute_checksum(entries: Mapping[str, Sequence[str]]) -> str:
+    """Return a BLAKE2s checksum for ``entries``."""
+
+    digest = blake2s(_canonical_json(entries).encode("utf8"), digest_size=16)
+    return digest.hexdigest()
+
+
+def load_cache(path: Path) -> CacheSnapshot:
+    """Load a cache from ``path`` and verify its checksum if present."""
+
+    if not path.exists():
+        return CacheSnapshot(entries={}, checksum=None)
+
+    with path.open("r", encoding="utf8") as handle:
+        payload = json.load(handle)
+
+    checksum: str | None = None
+    entries_payload: Mapping[str, Sequence[str]]
+
+    if isinstance(payload, Mapping) and "__meta__" in payload and "entries" in payload:
+        meta = payload["__meta__"]
+        entries_payload = payload["entries"]  # type: ignore[assignment]
+        if not isinstance(entries_payload, Mapping):
+            raise RuntimeError("Synonym cache entries must be stored as a mapping.")
+        if isinstance(meta, Mapping):
+            raw_checksum = meta.get("checksum")
+            if raw_checksum is not None and not isinstance(raw_checksum, str):
+                raise RuntimeError("Synonym cache checksum must be a string when provided.")
+            checksum = raw_checksum
+        else:
+            raise RuntimeError("Synonym cache metadata must be a mapping.")
+    elif isinstance(payload, Mapping):
+        entries_payload = payload  # legacy format without metadata
+    else:
+        raise RuntimeError("Synonym cache payload must be a mapping of strings to lists.")
+
+    entries = _normalise_entries(entries_payload)
+    if checksum is not None:
+        expected = compute_checksum(entries)
+        if checksum != expected:
+            raise RuntimeError(
+                "Synonym cache checksum mismatch; the cache file appears to be corrupted."
+            )
+
+    return CacheSnapshot(entries=entries, checksum=checksum)
+
+
+def write_cache(path: Path, entries: Mapping[str, Sequence[str]]) -> CacheSnapshot:
+    """Persist ``entries`` to ``path`` with checksum metadata."""
+
+    serialisable = {key: list(values) for key, values in sorted(entries.items())}
+    checksum = compute_checksum(serialisable)
+    payload = {
+        "__meta__": {
+            "checksum": checksum,
+            "entries": len(serialisable),
+        },
+        "entries": serialisable,
+    }
+    path.parent.mkdir(parents=True, exist_ok=True)
+
+    with path.open("w", encoding="utf8") as handle:
+        json.dump(payload, handle, ensure_ascii=False, indent=2, sort_keys=True)
+
+    return CacheSnapshot(entries=serialisable, checksum=checksum)
+
+
+__all__ = ["CacheEntries", "CacheSnapshot", "compute_checksum", "load_cache", "write_cache"]
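Together these helpers give a checksum-guarded round trip. Example usage with a placeholder path:

from pathlib import Path

from glitchlings.lexicon._cache import compute_checksum, load_cache, write_cache

path = Path("synonyms.json")  # placeholder location
snapshot = write_cache(path, {"cat": ["feline", "kitty"]})
assert snapshot.checksum == compute_checksum(snapshot.entries)

reloaded = load_cache(path)  # re-verifies the stored checksum on load
assert reloaded.entries == {"cat": ["feline", "kitty"]}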
{glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/lexicon/graph.py
@@ -2,12 +2,12 @@
 
 from __future__ import annotations
 
-import json
 import re
 from pathlib import Path
 from typing import Iterable, Mapping, MutableMapping, Sequence
 
-from . import Lexicon
+from . import LexiconBackend
+from ._cache import CacheSnapshot, load_cache as _load_cache_file, write_cache as _write_cache_file
 from .vector import VectorLexicon
 
 
@@ -140,30 +140,7 @@ def _load_numberbatch(path: Path, *, languages: set[str]) -> Mapping[str, list[f
     return embeddings
 
 
-def _load_cache(path: Path) -> dict[str, list[str]]:
-    if not path.exists():
-        return {}
-    with path.open("r", encoding="utf8") as handle:
-        payload = json.load(handle)
-    if not isinstance(payload, Mapping):
-        raise RuntimeError("Graph lexicon cache must be a mapping of strings to lists.")
-    cache: dict[str, list[str]] = {}
-    for key, values in payload.items():
-        if not isinstance(key, str):
-            raise RuntimeError("Graph lexicon cache keys must be strings.")
-        if not isinstance(values, Sequence):
-            raise RuntimeError("Graph lexicon cache values must be sequences of strings.")
-        cache[key] = [str(value) for value in values]
-    return cache
-
-
-def _write_cache(path: Path, cache: Mapping[str, Sequence[str]]) -> None:
-    serialisable = {key: list(values) for key, values in sorted(cache.items())}
-    with path.open("w", encoding="utf8") as handle:
-        json.dump(serialisable, handle, ensure_ascii=False, indent=2, sort_keys=True)
-
-
-class GraphLexicon(Lexicon):
+class GraphLexicon(LexiconBackend):
     """Lexicon backed by ConceptNet/Numberbatch embeddings."""
 
     def __init__(
@@ -184,9 +161,12 @@ class GraphLexicon(Lexicon):
         self._max_neighbors = max(1, max_neighbors)
         self._min_similarity = min_similarity
         self._cache: MutableMapping[str, list[str]] = {}
-        self._cache_path = Path(cache_path) if cache_path is not None else None
+        self._cache_path: Path | None = Path(cache_path) if cache_path is not None else None
+        self._cache_checksum: str | None = None
         if self._cache_path is not None:
-            self._cache.update(_load_cache(self._cache_path))
+            snapshot = _load_cache_file(self._cache_path)
+            self._cache.update(snapshot.entries)
+            self._cache_checksum = snapshot.checksum
         if cache is not None:
             for key, values in cache.items():
                 self._cache[str(key)] = [str(value) for value in values]
@@ -278,6 +258,12 @@ class GraphLexicon(Lexicon):
     def export_cache(self) -> dict[str, list[str]]:
         return {key: list(values) for key, values in self._cache.items()}
 
+    @classmethod
+    def load_cache(cls, path: str | Path) -> CacheSnapshot:
+        """Load and validate a persisted ConceptNet cache file."""
+
+        return _load_cache_file(Path(path))
+
     def save_cache(self, path: str | Path | None = None) -> Path:
         if path is None:
             if self._cache_path is None:
@@ -286,7 +272,8 @@ class GraphLexicon(Lexicon):
         else:
             target = Path(path)
         self._cache_path = target
-        _write_cache(target, self._cache)
+        snapshot = _write_cache_file(target, self._cache)
+        self._cache_checksum = snapshot.checksum
         self._cache_dirty = False
         return target
{glitchlings-0.4.0 → glitchlings-0.4.1}/src/glitchlings/lexicon/vector.py
@@ -10,7 +10,8 @@ from pathlib import Path
 import sys
 from typing import Any, Callable, Iterable, Iterator, Mapping, MutableMapping, Sequence
 
-from . import Lexicon
+from . import LexiconBackend
+from ._cache import CacheSnapshot, load_cache as _load_cache_file, write_cache as _write_cache_file
 
 
 def _cosine_similarity(vector_a: Sequence[float], vector_b: Sequence[float]) -> float:
@@ -241,38 +242,7 @@ def _resolve_source(source: Any | None) -> _Adapter | None:
     raise RuntimeError("Unsupported vector source supplied to VectorLexicon.")
 
 
-def _load_cache(path: Path) -> dict[str, list[str]]:
-    """Load a synonym cache from ``path`` if it exists."""
-
-    if not path.exists():
-        return {}
-
-    with path.open("r", encoding="utf8") as handle:
-        payload = json.load(handle)
-
-    if not isinstance(payload, Mapping):
-        raise RuntimeError("Synonym cache must be a JSON mapping of strings to lists.")
-
-    cache: dict[str, list[str]] = {}
-    for key, values in payload.items():
-        if not isinstance(key, str):
-            raise RuntimeError("Synonym cache keys must be strings.")
-        if not isinstance(values, Sequence):
-            raise RuntimeError("Synonym cache values must be lists of strings.")
-        cache[key] = [str(value) for value in values]
-
-    return cache
-
-
-def _write_cache(path: Path, cache: Mapping[str, Sequence[str]]) -> None:
-    """Write ``cache`` to ``path`` deterministically."""
-
-    serialisable = {key: list(values) for key, values in sorted(cache.items())}
-    with path.open("w", encoding="utf8") as handle:
-        json.dump(serialisable, handle, ensure_ascii=False, indent=2, sort_keys=True)
-
-
-class VectorLexicon(Lexicon):
+class VectorLexicon(LexiconBackend):
     """Lexicon implementation backed by dense word embeddings."""
 
     def __init__(
@@ -292,9 +262,13 @@ class VectorLexicon(Lexicon):
         self._max_neighbors = max(1, max_neighbors)
         self._min_similarity = min_similarity
         self._cache: MutableMapping[str, list[str]] = {}
+        self._cache_path: Path | None
+        self._cache_checksum: str | None = None
         if cache_path is not None:
             path = Path(cache_path)
-            self._cache.update(_load_cache(path))
+            snapshot = _load_cache_file(path)
+            self._cache.update(snapshot.entries)
+            self._cache_checksum = snapshot.checksum
             self._cache_path = path
         else:
             self._cache_path = None
@@ -411,6 +385,12 @@ class VectorLexicon(Lexicon):
 
         return {key: list(values) for key, values in self._cache.items()}
 
+    @classmethod
+    def load_cache(cls, path: str | Path) -> CacheSnapshot:
+        """Load and validate a cache file for reuse."""
+
+        return _load_cache_file(Path(path))
+
     def save_cache(self, path: str | Path | None = None) -> Path:
         """Persist the current cache to disk, returning the path used."""
 
@@ -422,7 +402,8 @@ class VectorLexicon(Lexicon):
         target = Path(path)
         self._cache_path = target
 
-        _write_cache(target, self._cache)
+        snapshot = _write_cache_file(target, self._cache)
+        self._cache_checksum = snapshot.checksum
         self._cache_dirty = False
         return target
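VectorLexicon gets the same treatment: save_cache now writes checksum metadata that load_cache verifies later. A sketch using only the constructor argument visible in this hunk; any other parameters are deliberately omitted and the filename is a placeholder:

from glitchlings.lexicon import VectorLexicon

lexicon = VectorLexicon(cache_path="vectors_cache.json")
target = lexicon.save_cache()                # writes entries plus __meta__ checksum
snapshot = VectorLexicon.load_cache(target)  # raises on checksum mismatch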