glitchlings 0.2.5__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. {glitchlings-0.2.5 → glitchlings-0.3.0}/PKG-INFO +21 -6
  2. {glitchlings-0.2.5 → glitchlings-0.3.0}/README.md +19 -4
  3. {glitchlings-0.2.5 → glitchlings-0.3.0}/pyproject.toml +2 -2
  4. {glitchlings-0.2.5 → glitchlings-0.3.0}/rust/zoo/src/glitch_ops.rs +79 -1
  5. {glitchlings-0.2.5 → glitchlings-0.3.0}/rust/zoo/src/lib.rs +29 -1
  6. {glitchlings-0.2.5 → glitchlings-0.3.0}/rust/zoo/src/pipeline.rs +15 -0
  7. {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/__init__.py +4 -0
  8. {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/dlc/prime.py +18 -1
  9. {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/zoo/__init__.py +5 -1
  10. glitchlings-0.3.0/src/glitchlings/zoo/_text_utils.py +42 -0
  11. glitchlings-0.3.0/src/glitchlings/zoo/adjax.py +131 -0
  12. {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/zoo/core.py +28 -18
  13. {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/zoo/redactyl.py +16 -20
  14. {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/zoo/reduple.py +13 -24
  15. {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/zoo/rushmore.py +12 -17
  16. {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings.egg-info/PKG-INFO +21 -6
  17. {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings.egg-info/SOURCES.txt +4 -0
  18. glitchlings-0.3.0/tests/test_benchmarks.py +88 -0
  19. {glitchlings-0.2.5 → glitchlings-0.3.0}/tests/test_glitchlings_determinism.py +8 -0
  20. {glitchlings-0.2.5 → glitchlings-0.3.0}/tests/test_parameter_effects.py +31 -1
  21. {glitchlings-0.2.5 → glitchlings-0.3.0}/tests/test_prime_echo_chamber.py +24 -0
  22. {glitchlings-0.2.5 → glitchlings-0.3.0}/tests/test_property_based.py +1 -0
  23. {glitchlings-0.2.5 → glitchlings-0.3.0}/tests/test_rust_backed_glitchlings.py +71 -3
  24. glitchlings-0.3.0/tests/test_text_utils.py +37 -0
  25. {glitchlings-0.2.5 → glitchlings-0.3.0}/LICENSE +0 -0
  26. {glitchlings-0.2.5 → glitchlings-0.3.0}/MANIFEST.in +0 -0
  27. {glitchlings-0.2.5 → glitchlings-0.3.0}/rust/Cargo.lock +0 -0
  28. {glitchlings-0.2.5 → glitchlings-0.3.0}/rust/Cargo.toml +0 -0
  29. {glitchlings-0.2.5 → glitchlings-0.3.0}/rust/zoo/Cargo.toml +0 -0
  30. {glitchlings-0.2.5 → glitchlings-0.3.0}/rust/zoo/assets/ocr_confusions.tsv +0 -0
  31. {glitchlings-0.2.5 → glitchlings-0.3.0}/rust/zoo/build.rs +0 -0
  32. {glitchlings-0.2.5 → glitchlings-0.3.0}/rust/zoo/src/resources.rs +0 -0
  33. {glitchlings-0.2.5 → glitchlings-0.3.0}/rust/zoo/src/rng.rs +0 -0
  34. {glitchlings-0.2.5 → glitchlings-0.3.0}/rust/zoo/src/text_buffer.rs +0 -0
  35. {glitchlings-0.2.5 → glitchlings-0.3.0}/rust/zoo/src/typogre.rs +0 -0
  36. {glitchlings-0.2.5 → glitchlings-0.3.0}/rust/zoo/src/zeedub.rs +0 -0
  37. {glitchlings-0.2.5 → glitchlings-0.3.0}/setup.cfg +0 -0
  38. {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/__main__.py +0 -0
  39. {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/dlc/__init__.py +0 -0
  40. {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/dlc/huggingface.py +0 -0
  41. {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/main.py +0 -0
  42. {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/util/__init__.py +0 -0
  43. {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/zoo/_ocr_confusions.py +0 -0
  44. {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/zoo/_rate.py +0 -0
  45. {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/zoo/jargoyle.py +0 -0
  46. {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/zoo/mim1c.py +0 -0
  47. {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/zoo/ocr_confusions.tsv +0 -0
  48. {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/zoo/scannequin.py +0 -0
  49. {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/zoo/typogre.py +0 -0
  50. {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/zoo/zeedub.py +0 -0
  51. {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings.egg-info/dependency_links.txt +0 -0
  52. {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings.egg-info/entry_points.txt +0 -0
  53. {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings.egg-info/requires.txt +0 -0
  54. {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings.egg-info/top_level.txt +0 -0
  55. {glitchlings-0.2.5 → glitchlings-0.3.0}/tests/test_cli.py +0 -0
  56. {glitchlings-0.2.5 → glitchlings-0.3.0}/tests/test_dataset_corruption.py +0 -0
  57. {glitchlings-0.2.5 → glitchlings-0.3.0}/tests/test_gaggle.py +0 -0
  58. {glitchlings-0.2.5 → glitchlings-0.3.0}/tests/test_glitchling_core.py +0 -0
  59. {glitchlings-0.2.5 → glitchlings-0.3.0}/tests/test_huggingface_dlc.py +0 -0
  60. {glitchlings-0.2.5 → glitchlings-0.3.0}/tests/test_jargoyle.py +0 -0
  61. {glitchlings-0.2.5 → glitchlings-0.3.0}/tests/test_keyboard_layouts.py +0 -0
  62. {glitchlings-0.2.5 → glitchlings-0.3.0}/tests/test_util.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: glitchlings
3
- Version: 0.2.5
3
+ Version: 0.3.0
4
4
  Summary: Monsters for your language games.
5
5
  Author: osoleve
6
6
  License: Apache License
@@ -209,7 +209,7 @@ Project-URL: Homepage, https://github.com/osoleve/glitchlings
209
209
  Project-URL: Repository, https://github.com/osoleve/glitchlings.git
210
210
  Project-URL: Issues, https://github.com/osoleve/glitchlings/issues
211
211
  Project-URL: Changelog, https://github.com/osoleve/glitchlings/releases
212
- Keywords: nlp,text,adversarial augmentation,text augmentation,large language models,llms,data augmentation,confusables,typo,
212
+ Keywords: nlp,text,adversarial augmentation,text augmentation,large language models,llms,data augmentation,rlvr
213
213
  Classifier: Development Status :: 3 - Alpha
214
214
  Classifier: Intended Audience :: Developers
215
215
  Classifier: Programming Language :: Python
@@ -296,7 +296,7 @@ print(gaggle(SAMPLE_TEXT))
296
296
 
297
297
  Consult the [Glitchlings Usage Guide](docs/index.md)
298
298
  for end-to-end instructions spanning the Python API, CLI, HuggingFace and Prime Intellect
299
- integrations, and the feature-flagged Rust pipeline.
299
+ integrations, and the autodetected Rust pipeline (enabled whenever the extension is present).
300
300
 
301
301
  ## Motivation
302
302
 
@@ -428,7 +428,8 @@ _Did you say that or did I?_
428
428
  >
429
429
  > Args
430
430
  >
431
- > - `rate (float)`: The maximum proportion of words to reduplicate (default: 0.05, 5%).
431
+ > - `rate (float)`: The maximum proportion of words to reduplicate (default: 0.01, 1%).
432
+ > - `unweighted (bool)`: Sample words uniformly instead of favouring shorter tokens (default: False).
432
433
  > - `seed (int)`: The random seed for reproducibility (default: 151).
433
434
 
434
435
  ### Rushmore
@@ -440,6 +441,19 @@ _I accidentally an entire word._
440
441
  > Args
441
442
  >
442
443
  > - `rate (float)`: The maximum proportion of words to delete (default: 0.01, 1%).
444
+ > - `unweighted (bool)`: Sample words uniformly instead of favouring shorter tokens (default: False).
445
+ > - `seed (int)`: The random seed for reproducibility (default: 151).
446
+
447
+ ### Adjax
448
+
449
+ _Keep your hands and punctuation where I can see them._
450
+
451
+ > _**Perfect Shuffle.**_ Adjax trades the cores of neighbouring words while leaving punctuation, casing, and surrounding whitespace untouched, turning fluent prose into locally scrambled tongue-twisters.
452
+ >
453
+ > Args
454
+ >
455
+ > - `rate (float)`: Probability that each adjacent pair swaps cores (default: 0.5, 50%).
456
+ > - `swap_rate (float)`: Alias for `rate`, retained for backward compatibility.
443
457
  > - `seed (int)`: The random seed for reproducibility (default: 151).
444
458
 
445
459
  ### Redactyl
@@ -450,9 +464,10 @@ _Oops, that was my black highlighter._
450
464
  >
451
465
  > ### Args
452
466
  >
453
- > - `replacement_char (str)`: The character to use for redaction (default: ).
454
- > - `rate (float)`: The maximum proportion of words to redact (default: 0.05, 5%).
467
+ > - `replacement_char (str)`: The character to use for redaction (default: FULL_BLOCK).
468
+ > - `rate (float)`: The maximum proportion of words to redact (default: 0.025, 2.5%).
455
469
  > - `merge_adjacent (bool)`: Whether to redact the space between adjacent redacted words (default: False).
470
+ > - `unweighted (bool)`: Sample words uniformly instead of biasing toward longer tokens (default: False).
456
471
  > - `seed (int)`: The random seed for reproducibility (default: 151).
457
472
 
458
473
  ## Field Report: Uncontained Specimens
@@ -55,7 +55,7 @@ print(gaggle(SAMPLE_TEXT))
55
55
 
56
56
  Consult the [Glitchlings Usage Guide](docs/index.md)
57
57
  for end-to-end instructions spanning the Python API, CLI, HuggingFace and Prime Intellect
58
- integrations, and the feature-flagged Rust pipeline.
58
+ integrations, and the autodetected Rust pipeline (enabled whenever the extension is present).
59
59
 
60
60
  ## Motivation
61
61
 
@@ -187,7 +187,8 @@ _Did you say that or did I?_
187
187
  >
188
188
  > Args
189
189
  >
190
- > - `rate (float)`: The maximum proportion of words to reduplicate (default: 0.05, 5%).
190
+ > - `rate (float)`: The maximum proportion of words to reduplicate (default: 0.01, 1%).
191
+ > - `unweighted (bool)`: Sample words uniformly instead of favouring shorter tokens (default: False).
191
192
  > - `seed (int)`: The random seed for reproducibility (default: 151).
192
193
 
193
194
  ### Rushmore
@@ -199,6 +200,19 @@ _I accidentally an entire word._
199
200
  > Args
200
201
  >
201
202
  > - `rate (float)`: The maximum proportion of words to delete (default: 0.01, 1%).
203
+ > - `unweighted (bool)`: Sample words uniformly instead of favouring shorter tokens (default: False).
204
+ > - `seed (int)`: The random seed for reproducibility (default: 151).
205
+
206
+ ### Adjax
207
+
208
+ _Keep your hands and punctuation where I can see them._
209
+
210
+ > _**Perfect Shuffle.**_ Adjax trades the cores of neighbouring words while leaving punctuation, casing, and surrounding whitespace untouched, turning fluent prose into locally scrambled tongue-twisters.
211
+ >
212
+ > Args
213
+ >
214
+ > - `rate (float)`: Probability that each adjacent pair swaps cores (default: 0.5, 50%).
215
+ > - `swap_rate (float)`: Alias for `rate`, retained for backward compatibility.
202
216
  > - `seed (int)`: The random seed for reproducibility (default: 151).
203
217
 
204
218
  ### Redactyl
@@ -209,9 +223,10 @@ _Oops, that was my black highlighter._
209
223
  >
210
224
  > ### Args
211
225
  >
212
- > - `replacement_char (str)`: The character to use for redaction (default: ).
213
- > - `rate (float)`: The maximum proportion of words to redact (default: 0.05, 5%).
226
+ > - `replacement_char (str)`: The character to use for redaction (default: FULL_BLOCK).
227
+ > - `rate (float)`: The maximum proportion of words to redact (default: 0.025, 2.5%).
214
228
  > - `merge_adjacent (bool)`: Whether to redact the space between adjacent redacted words (default: False).
229
+ > - `unweighted (bool)`: Sample words uniformly instead of biasing toward longer tokens (default: False).
215
230
  > - `seed (int)`: The random seed for reproducibility (default: 151).
216
231
 
217
232
  ## Field Report: Uncontained Specimens
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "glitchlings"
3
- version = "0.2.5"
3
+ version = "0.3.0"
4
4
  description = "Monsters for your language games."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
@@ -13,7 +13,7 @@ authors = [
13
13
  { name = "osoleve" }
14
14
  ]
15
15
 
16
- keywords = ["nlp", "text", "adversarial augmentation", "text augmentation", "large language models", "llms", "data augmentation", "confusables", "typo", ""]
16
+ keywords = ["nlp", "text", "adversarial augmentation", "text augmentation", "large language models", "llms", "data augmentation", "rlvr"]
17
17
 
18
18
  classifiers = [
19
19
  "Development Status :: 3 - Alpha",
@@ -359,6 +359,61 @@ impl GlitchOp for DeleteRandomWordsOp {
359
359
  }
360
360
  }
361
361
 
362
+ /// Swaps adjacent word cores while keeping punctuation and spacing intact.
363
+ #[derive(Debug, Clone, Copy)]
364
+ pub struct SwapAdjacentWordsOp {
365
+ pub swap_rate: f64,
366
+ }
367
+
368
+ impl GlitchOp for SwapAdjacentWordsOp {
369
+ fn apply(&self, buffer: &mut TextBuffer, rng: &mut dyn GlitchRng) -> Result<(), GlitchOpError> {
370
+ let total_words = buffer.word_count();
371
+ if total_words < 2 {
372
+ return Ok(());
373
+ }
374
+
375
+ let clamped = self.swap_rate.max(0.0).min(1.0);
376
+ if clamped <= 0.0 {
377
+ return Ok(());
378
+ }
379
+
380
+ let mut index = 0usize;
381
+ while index + 1 < total_words {
382
+ let left_segment = match buffer.word_segment(index) {
383
+ Some(segment) => segment,
384
+ None => break,
385
+ };
386
+ let right_segment = match buffer.word_segment(index + 1) {
387
+ Some(segment) => segment,
388
+ None => break,
389
+ };
390
+
391
+ let left_original = left_segment.text().to_string();
392
+ let right_original = right_segment.text().to_string();
393
+
394
+ let (left_prefix, left_core, left_suffix) = split_affixes(&left_original);
395
+ let (right_prefix, right_core, right_suffix) = split_affixes(&right_original);
396
+
397
+ if left_core.is_empty() || right_core.is_empty() {
398
+ index += 2;
399
+ continue;
400
+ }
401
+
402
+ let should_swap = clamped >= 1.0 || rng.random()? < clamped;
403
+ if should_swap {
404
+ let left_replacement = format!("{left_prefix}{right_core}{left_suffix}");
405
+ let right_replacement = format!("{right_prefix}{left_core}{right_suffix}");
406
+ buffer.replace_word(index, &left_replacement)?;
407
+ buffer.replace_word(index + 1, &right_replacement)?;
408
+ }
409
+
410
+ index += 2;
411
+ }
412
+
413
+ Ok(())
414
+ }
415
+ }
416
+
362
417
  /// Redacts words by replacing core characters with a replacement token.
363
418
  #[derive(Debug, Clone)]
364
419
  pub struct RedactWordsOp {
@@ -555,6 +610,7 @@ impl GlitchOp for OcrArtifactsOp {
555
610
  pub enum GlitchOperation {
556
611
  Reduplicate(ReduplicateWordsOp),
557
612
  Delete(DeleteRandomWordsOp),
613
+ SwapAdjacent(SwapAdjacentWordsOp),
558
614
  Redact(RedactWordsOp),
559
615
  Ocr(OcrArtifactsOp),
560
616
  }
@@ -564,6 +620,7 @@ impl GlitchOp for GlitchOperation {
564
620
  match self {
565
621
  GlitchOperation::Reduplicate(op) => op.apply(buffer, rng),
566
622
  GlitchOperation::Delete(op) => op.apply(buffer, rng),
623
+ GlitchOperation::SwapAdjacent(op) => op.apply(buffer, rng),
567
624
  GlitchOperation::Redact(op) => op.apply(buffer, rng),
568
625
  GlitchOperation::Ocr(op) => op.apply(buffer, rng),
569
626
  }
@@ -574,7 +631,7 @@ impl GlitchOp for GlitchOperation {
574
631
  mod tests {
575
632
  use super::{
576
633
  DeleteRandomWordsOp, GlitchOp, GlitchOpError, OcrArtifactsOp, RedactWordsOp,
577
- ReduplicateWordsOp,
634
+ ReduplicateWordsOp, SwapAdjacentWordsOp,
578
635
  };
579
636
  use crate::rng::PyRng;
580
637
  use crate::text_buffer::TextBuffer;
@@ -592,6 +649,27 @@ mod tests {
592
649
  assert_eq!(buffer.to_string(), "Hello Hello world world");
593
650
  }
594
651
 
652
+ #[test]
653
+ fn swap_adjacent_words_swaps_cores() {
654
+ let mut buffer = TextBuffer::from_str("Alpha, beta! Gamma delta");
655
+ let mut rng = PyRng::new(7);
656
+ let op = SwapAdjacentWordsOp { swap_rate: 1.0 };
657
+ op.apply(&mut buffer, &mut rng)
658
+ .expect("swap operation succeeds");
659
+ assert_eq!(buffer.to_string(), "beta, Alpha! delta Gamma");
660
+ }
661
+
662
+ #[test]
663
+ fn swap_adjacent_words_respects_zero_rate() {
664
+ let original = "Do not move these words";
665
+ let mut buffer = TextBuffer::from_str(original);
666
+ let mut rng = PyRng::new(42);
667
+ let op = SwapAdjacentWordsOp { swap_rate: 0.0 };
668
+ op.apply(&mut buffer, &mut rng)
669
+ .expect("swap operation succeeds");
670
+ assert_eq!(buffer.to_string(), original);
671
+ }
672
+
595
673
  #[test]
596
674
  fn delete_random_words_cleans_up_spacing() {
597
675
  let mut buffer = TextBuffer::from_str("One two three four five");
@@ -14,7 +14,7 @@ use pyo3::{exceptions::PyValueError, FromPyObject};
14
14
 
15
15
  pub use glitch_ops::{
16
16
  DeleteRandomWordsOp, GlitchOpError, GlitchOperation, OcrArtifactsOp, RedactWordsOp,
17
- ReduplicateWordsOp,
17
+ ReduplicateWordsOp, SwapAdjacentWordsOp,
18
18
  };
19
19
  pub use pipeline::{derive_seed, GlitchDescriptor, Pipeline, PipelineError};
20
20
  pub use rng::{PyRng, PyRngError};
@@ -101,6 +101,9 @@ enum PyGlitchOperation {
101
101
  max_deletion_rate: f64,
102
102
  unweighted: bool,
103
103
  },
104
+ SwapAdjacent {
105
+ swap_rate: f64,
106
+ },
104
107
  Redact {
105
108
  replacement_char: String,
106
109
  redaction_rate: f64,
@@ -154,6 +157,15 @@ impl<'py> FromPyObject<'py> for PyGlitchOperation {
154
157
  unweighted,
155
158
  })
156
159
  }
160
+ "swap_adjacent" => {
161
+ let rate = dict
162
+ .get_item("swap_rate")?
163
+ .ok_or_else(|| {
164
+ PyValueError::new_err("swap_adjacent operation missing 'swap_rate'")
165
+ })?
166
+ .extract()?;
167
+ Ok(PyGlitchOperation::SwapAdjacent { swap_rate: rate })
168
+ }
157
169
  "redact" => {
158
170
  let replacement_char = dict
159
171
  .get_item("replacement_char")?
@@ -241,6 +253,16 @@ fn delete_random_words(
241
253
  apply_operation(text, op, rng).map_err(glitch_ops::GlitchOpError::into_pyerr)
242
254
  }
243
255
 
256
+ #[pyfunction]
257
+ fn swap_adjacent_words(
258
+ text: &str,
259
+ swap_rate: f64,
260
+ rng: &Bound<'_, PyAny>,
261
+ ) -> PyResult<String> {
262
+ let op = SwapAdjacentWordsOp { swap_rate };
263
+ apply_operation(text, op, rng).map_err(glitch_ops::GlitchOpError::into_pyerr)
264
+ }
265
+
244
266
  #[pyfunction]
245
267
  fn ocr_artifacts(text: &str, error_rate: f64, rng: &Bound<'_, PyAny>) -> PyResult<String> {
246
268
  let op = OcrArtifactsOp { error_rate };
@@ -289,6 +311,11 @@ fn compose_glitchlings(
289
311
  max_deletion_rate,
290
312
  unweighted,
291
313
  }),
314
+ PyGlitchOperation::SwapAdjacent { swap_rate } => {
315
+ GlitchOperation::SwapAdjacent(glitch_ops::SwapAdjacentWordsOp {
316
+ swap_rate,
317
+ })
318
+ }
292
319
  PyGlitchOperation::Redact {
293
320
  replacement_char,
294
321
  redaction_rate,
@@ -320,6 +347,7 @@ fn compose_glitchlings(
320
347
  fn _zoo_rust(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
321
348
  m.add_function(wrap_pyfunction!(reduplicate_words, m)?)?;
322
349
  m.add_function(wrap_pyfunction!(delete_random_words, m)?)?;
350
+ m.add_function(wrap_pyfunction!(swap_adjacent_words, m)?)?;
323
351
  m.add_function(wrap_pyfunction!(ocr_artifacts, m)?)?;
324
352
  m.add_function(wrap_pyfunction!(redact_words, m)?)?;
325
353
  m.add_function(wrap_pyfunction!(compose_glitchlings, m)?)?;
@@ -112,6 +112,7 @@ mod tests {
112
112
  use super::{derive_seed, GlitchDescriptor, Pipeline};
113
113
  use crate::glitch_ops::{
114
114
  DeleteRandomWordsOp, GlitchOperation, OcrArtifactsOp, RedactWordsOp, ReduplicateWordsOp,
115
+ SwapAdjacentWordsOp,
115
116
  };
116
117
 
117
118
  #[test]
@@ -207,4 +208,18 @@ mod tests {
207
208
  .expect("pipeline run succeeds");
208
209
  assert_eq!(output, "Guard the ██ at ██████████");
209
210
  }
211
+ #[test]
212
+ fn pipeline_swaps_adjacent_words() {
213
+ let master_seed = 2025i128;
214
+ let descriptors = vec![GlitchDescriptor {
215
+ name: "Adjax".to_string(),
216
+ seed: derive_seed(master_seed, "Adjax", 0),
217
+ operation: GlitchOperation::SwapAdjacent(SwapAdjacentWordsOp { swap_rate: 1.0 }),
218
+ }];
219
+ let pipeline = Pipeline::new(master_seed, descriptors);
220
+ let output = pipeline
221
+ .run("Echo this line please")
222
+ .expect("pipeline succeeds");
223
+ assert_eq!(output, "this Echo please line");
224
+ }
210
225
  }
@@ -5,6 +5,8 @@ from .zoo import (
5
5
  mim1c,
6
6
  Jargoyle,
7
7
  jargoyle,
8
+ Adjax,
9
+ adjax,
8
10
  Redactyl,
9
11
  redactyl,
10
12
  Reduple,
@@ -29,6 +31,8 @@ __all__ = [
29
31
  "mim1c",
30
32
  "Jargoyle",
31
33
  "jargoyle",
34
+ "Adjax",
35
+ "adjax",
32
36
  "Redactyl",
33
37
  "redactyl",
34
38
  "Reduple",
@@ -49,7 +49,24 @@ def _resolve_columns(dataset: Dataset, columns: Sequence[str] | None) -> list[st
49
49
  if candidate in available:
50
50
  return [candidate]
51
51
 
52
- sample = dataset[0] if len(dataset) else {}
52
+ try:
53
+ dataset_length = len(dataset) # type: ignore[arg-type]
54
+ except TypeError:
55
+ preview_rows: list[dict[str, Any]]
56
+ take_fn = getattr(dataset, "take", None)
57
+ if callable(take_fn):
58
+ preview_rows = list(take_fn(1))
59
+ else:
60
+ iterator = iter(dataset)
61
+ try:
62
+ first_row = next(iterator)
63
+ except StopIteration:
64
+ preview_rows = []
65
+ else:
66
+ preview_rows = [first_row]
67
+ sample = dict(preview_rows[0]) if preview_rows else {}
68
+ else:
69
+ sample = dataset[0] if dataset_length else {}
53
70
  inferred = [
54
71
  name
55
72
  for name in dataset.column_names
@@ -6,6 +6,7 @@ from typing import Any
6
6
  from .typogre import Typogre, typogre
7
7
  from .mim1c import Mim1c, mim1c
8
8
  from .jargoyle import Jargoyle, jargoyle, dependencies_available as _jargoyle_available
9
+ from .adjax import Adjax, adjax
9
10
  from .reduple import Reduple, reduple
10
11
  from .rushmore import Rushmore, rushmore
11
12
  from .redactyl import Redactyl, redactyl
@@ -20,6 +21,8 @@ __all__ = [
20
21
  "mim1c",
21
22
  "Jargoyle",
22
23
  "jargoyle",
24
+ "Adjax",
25
+ "adjax",
23
26
  "Reduple",
24
27
  "reduple",
25
28
  "Rushmore",
@@ -43,7 +46,7 @@ _HAS_JARGOYLE = _jargoyle_available()
43
46
  _BUILTIN_GLITCHLING_LIST: list[Glitchling] = [typogre, mim1c]
44
47
  if _HAS_JARGOYLE:
45
48
  _BUILTIN_GLITCHLING_LIST.append(jargoyle)
46
- _BUILTIN_GLITCHLING_LIST.extend([reduple, rushmore, redactyl, scannequin, zeedub])
49
+ _BUILTIN_GLITCHLING_LIST.extend([adjax, reduple, rushmore, redactyl, scannequin, zeedub])
47
50
 
48
51
  BUILTIN_GLITCHLINGS: dict[str, Glitchling] = {
49
52
  glitchling.name.lower(): glitchling for glitchling in _BUILTIN_GLITCHLING_LIST
@@ -52,6 +55,7 @@ BUILTIN_GLITCHLINGS: dict[str, Glitchling] = {
52
55
  _BUILTIN_GLITCHLING_TYPES: dict[str, type[Glitchling]] = {
53
56
  typogre.name.lower(): Typogre,
54
57
  mim1c.name.lower(): Mim1c,
58
+ adjax.name.lower(): Adjax,
55
59
  reduple.name.lower(): Reduple,
56
60
  rushmore.name.lower(): Rushmore,
57
61
  redactyl.name.lower(): Redactyl,
@@ -0,0 +1,42 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
+ _WORD_SPLIT_PATTERN = re.compile(r"(\s+)")
6
+ _TOKEN_EDGES_PATTERN = re.compile(r"^(\W*)(.*?)(\W*)$")
7
+
8
+
9
+ def split_preserving_whitespace(text: str) -> list[str]:
10
+ """Split text while keeping whitespace tokens for stable reconstruction."""
11
+
12
+ return _WORD_SPLIT_PATTERN.split(text)
13
+
14
+
15
+ def split_token_edges(token: str) -> tuple[str, str, str]:
16
+ """Return leading, core, and trailing segments for a token."""
17
+
18
+ match = _TOKEN_EDGES_PATTERN.match(token)
19
+ if match is None:
20
+ return "", token, ""
21
+ return match.group(1), match.group(2), match.group(3)
22
+
23
+
24
+ def token_core_length(token: str) -> int:
25
+ """Return the length of the main word characters for weighting heuristics."""
26
+
27
+ _, core, _ = split_token_edges(token)
28
+ candidate = core if core else token
29
+ length = len(candidate)
30
+ if length <= 0:
31
+ stripped = token.strip()
32
+ length = len(stripped) if stripped else len(token)
33
+ if length <= 0:
34
+ length = 1
35
+ return length
36
+
37
+
38
+ __all__ = [
39
+ "split_preserving_whitespace",
40
+ "split_token_edges",
41
+ "token_core_length",
42
+ ]
@@ -0,0 +1,131 @@
1
+ from __future__ import annotations
2
+
3
+ import random
4
+ from typing import Any
5
+
6
+ from ._rate import resolve_rate
7
+ from ._text_utils import split_preserving_whitespace, split_token_edges
8
+ from .core import AttackWave, Glitchling
9
+
10
+ try:
11
+ from glitchlings._zoo_rust import swap_adjacent_words as _swap_adjacent_words_rust
12
+ except ImportError: # pragma: no cover - optional acceleration
13
+ _swap_adjacent_words_rust = None
14
+
15
+
16
+ def _python_swap_adjacent_words(
17
+ text: str,
18
+ *,
19
+ rate: float,
20
+ rng: random.Random,
21
+ ) -> str:
22
+ """Swap the cores of adjacent words while keeping affixes and spacing intact."""
23
+
24
+ tokens = split_preserving_whitespace(text)
25
+ if len(tokens) < 2:
26
+ return text
27
+
28
+ word_indices: list[int] = []
29
+ for index in range(len(tokens)):
30
+ token = tokens[index]
31
+ if not token or token.isspace():
32
+ continue
33
+ if index % 2 == 0:
34
+ word_indices.append(index)
35
+
36
+ if len(word_indices) < 2:
37
+ return text
38
+
39
+ clamped = max(0.0, min(rate, 1.0))
40
+ if clamped <= 0.0:
41
+ return text
42
+
43
+ for cursor in range(0, len(word_indices) - 1, 2):
44
+ left_index = word_indices[cursor]
45
+ right_index = word_indices[cursor + 1]
46
+
47
+ left_token = tokens[left_index]
48
+ right_token = tokens[right_index]
49
+
50
+ left_prefix, left_core, left_suffix = split_token_edges(left_token)
51
+ right_prefix, right_core, right_suffix = split_token_edges(right_token)
52
+
53
+ if not left_core or not right_core:
54
+ continue
55
+
56
+ should_swap = clamped >= 1.0 or rng.random() < clamped
57
+ if not should_swap:
58
+ continue
59
+
60
+ tokens[left_index] = f"{left_prefix}{right_core}{left_suffix}"
61
+ tokens[right_index] = f"{right_prefix}{left_core}{right_suffix}"
62
+
63
+ return "".join(tokens)
64
+
65
+
66
+ def swap_adjacent_words(
67
+ text: str,
68
+ rate: float | None = None,
69
+ seed: int | None = None,
70
+ rng: random.Random | None = None,
71
+ *,
72
+ swap_rate: float | None = None,
73
+ ) -> str:
74
+ """Swap adjacent word cores while preserving spacing and punctuation."""
75
+
76
+ effective_rate = resolve_rate(
77
+ rate=rate,
78
+ legacy_value=swap_rate,
79
+ default=0.5,
80
+ legacy_name="swap_rate",
81
+ )
82
+ clamped_rate = max(0.0, min(effective_rate, 1.0))
83
+
84
+ if rng is None:
85
+ rng = random.Random(seed)
86
+
87
+ if _swap_adjacent_words_rust is not None:
88
+ return _swap_adjacent_words_rust(text, clamped_rate, rng)
89
+
90
+ return _python_swap_adjacent_words(text, rate=clamped_rate, rng=rng)
91
+
92
+
93
+ class Adjax(Glitchling):
94
+ """Glitchling that swaps adjacent words to scramble local semantics."""
95
+
96
+ def __init__(
97
+ self,
98
+ *,
99
+ rate: float | None = None,
100
+ swap_rate: float | None = None,
101
+ seed: int | None = None,
102
+ ) -> None:
103
+ self._param_aliases = {"swap_rate": "rate"}
104
+ effective_rate = resolve_rate(
105
+ rate=rate,
106
+ legacy_value=swap_rate,
107
+ default=0.5,
108
+ legacy_name="swap_rate",
109
+ )
110
+ super().__init__(
111
+ name="Adjax",
112
+ corruption_function=swap_adjacent_words,
113
+ scope=AttackWave.WORD,
114
+ seed=seed,
115
+ rate=effective_rate,
116
+ )
117
+
118
+ def pipeline_operation(self) -> dict[str, Any] | None:
119
+ rate = self.kwargs.get("rate")
120
+ if rate is None:
121
+ return None
122
+ return {
123
+ "type": "swap_adjacent",
124
+ "swap_rate": float(rate),
125
+ }
126
+
127
+
128
+ adjax = Adjax()
129
+
130
+
131
+ __all__ = ["Adjax", "adjax", "swap_adjacent_words"]