glitchlings 0.2.5__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {glitchlings-0.2.5 → glitchlings-0.3.0}/PKG-INFO +21 -6
- {glitchlings-0.2.5 → glitchlings-0.3.0}/README.md +19 -4
- {glitchlings-0.2.5 → glitchlings-0.3.0}/pyproject.toml +2 -2
- {glitchlings-0.2.5 → glitchlings-0.3.0}/rust/zoo/src/glitch_ops.rs +79 -1
- {glitchlings-0.2.5 → glitchlings-0.3.0}/rust/zoo/src/lib.rs +29 -1
- {glitchlings-0.2.5 → glitchlings-0.3.0}/rust/zoo/src/pipeline.rs +15 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/__init__.py +4 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/dlc/prime.py +18 -1
- {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/zoo/__init__.py +5 -1
- glitchlings-0.3.0/src/glitchlings/zoo/_text_utils.py +42 -0
- glitchlings-0.3.0/src/glitchlings/zoo/adjax.py +131 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/zoo/core.py +28 -18
- {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/zoo/redactyl.py +16 -20
- {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/zoo/reduple.py +13 -24
- {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/zoo/rushmore.py +12 -17
- {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings.egg-info/PKG-INFO +21 -6
- {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings.egg-info/SOURCES.txt +4 -0
- glitchlings-0.3.0/tests/test_benchmarks.py +88 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/tests/test_glitchlings_determinism.py +8 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/tests/test_parameter_effects.py +31 -1
- {glitchlings-0.2.5 → glitchlings-0.3.0}/tests/test_prime_echo_chamber.py +24 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/tests/test_property_based.py +1 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/tests/test_rust_backed_glitchlings.py +71 -3
- glitchlings-0.3.0/tests/test_text_utils.py +37 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/LICENSE +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/MANIFEST.in +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/rust/Cargo.lock +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/rust/Cargo.toml +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/rust/zoo/Cargo.toml +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/rust/zoo/assets/ocr_confusions.tsv +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/rust/zoo/build.rs +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/rust/zoo/src/resources.rs +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/rust/zoo/src/rng.rs +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/rust/zoo/src/text_buffer.rs +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/rust/zoo/src/typogre.rs +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/rust/zoo/src/zeedub.rs +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/setup.cfg +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/__main__.py +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/dlc/__init__.py +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/dlc/huggingface.py +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/main.py +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/util/__init__.py +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/zoo/_ocr_confusions.py +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/zoo/_rate.py +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/zoo/jargoyle.py +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/zoo/mim1c.py +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/zoo/ocr_confusions.tsv +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/zoo/scannequin.py +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/zoo/typogre.py +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings/zoo/zeedub.py +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings.egg-info/dependency_links.txt +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings.egg-info/entry_points.txt +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings.egg-info/requires.txt +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/src/glitchlings.egg-info/top_level.txt +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/tests/test_cli.py +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/tests/test_dataset_corruption.py +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/tests/test_gaggle.py +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/tests/test_glitchling_core.py +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/tests/test_huggingface_dlc.py +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/tests/test_jargoyle.py +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/tests/test_keyboard_layouts.py +0 -0
- {glitchlings-0.2.5 → glitchlings-0.3.0}/tests/test_util.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: glitchlings
|
3
|
-
Version: 0.2.5
|
3
|
+
Version: 0.3.0
|
4
4
|
Summary: Monsters for your language games.
|
5
5
|
Author: osoleve
|
6
6
|
License: Apache License
|
@@ -209,7 +209,7 @@ Project-URL: Homepage, https://github.com/osoleve/glitchlings
|
|
209
209
|
Project-URL: Repository, https://github.com/osoleve/glitchlings.git
|
210
210
|
Project-URL: Issues, https://github.com/osoleve/glitchlings/issues
|
211
211
|
Project-URL: Changelog, https://github.com/osoleve/glitchlings/releases
|
212
|
-
Keywords: nlp,text,adversarial augmentation,text augmentation,large language models,llms,data augmentation,
|
212
|
+
Keywords: nlp,text,adversarial augmentation,text augmentation,large language models,llms,data augmentation,rlvr
|
213
213
|
Classifier: Development Status :: 3 - Alpha
|
214
214
|
Classifier: Intended Audience :: Developers
|
215
215
|
Classifier: Programming Language :: Python
|
@@ -296,7 +296,7 @@ print(gaggle(SAMPLE_TEXT))
|
|
296
296
|
|
297
297
|
Consult the [Glitchlings Usage Guide](docs/index.md)
|
298
298
|
for end-to-end instructions spanning the Python API, CLI, HuggingFace and Prime Intellect
|
299
|
-
integrations, and the
|
299
|
+
integrations, and the autodetected Rust pipeline (enabled whenever the extension is present).
|
300
300
|
|
301
301
|
## Motivation
|
302
302
|
|
@@ -428,7 +428,8 @@ _Did you say that or did I?_
|
|
428
428
|
>
|
429
429
|
> Args
|
430
430
|
>
|
431
|
-
> - `rate (float)`: The maximum proportion of words to reduplicate (default: 0.
|
431
|
+
> - `rate (float)`: The maximum proportion of words to reduplicate (default: 0.01, 1%).
|
432
|
+
> - `unweighted (bool)`: Sample words uniformly instead of favouring shorter tokens (default: False).
|
432
433
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
433
434
|
|
434
435
|
### Rushmore
|
@@ -440,6 +441,19 @@ _I accidentally an entire word._
|
|
440
441
|
> Args
|
441
442
|
>
|
442
443
|
> - `rate (float)`: The maximum proportion of words to delete (default: 0.01, 1%).
|
444
|
+
> - `unweighted (bool)`: Sample words uniformly instead of favouring shorter tokens (default: False).
|
445
|
+
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
446
|
+
|
447
|
+
### Adjax
|
448
|
+
|
449
|
+
_Keep your hands and punctuation where I can see them._
|
450
|
+
|
451
|
+
> _**Perfect Shuffle.**_ Adjax trades the cores of neighbouring words while leaving punctuation, casing, and surrounding whitespace untouched, turning fluent prose into locally scrambled tongue-twisters.
|
452
|
+
>
|
453
|
+
> Args
|
454
|
+
>
|
455
|
+
> - `rate (float)`: Probability that each adjacent pair swaps cores (default: 0.5, 50%).
|
456
|
+
> - `swap_rate (float)`: Alias for `rate`, retained for backward compatibility.
|
443
457
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
444
458
|
|
445
459
|
### Redactyl
|
@@ -450,9 +464,10 @@ _Oops, that was my black highlighter._
|
|
450
464
|
>
|
451
465
|
> ### Args
|
452
466
|
>
|
453
|
-
> - `replacement_char (str)`: The character to use for redaction (default:
|
454
|
-
> - `rate (float)`: The maximum proportion of words to redact (default: 0.
|
467
|
+
> - `replacement_char (str)`: The character to use for redaction (default: FULL_BLOCK).
|
468
|
+
> - `rate (float)`: The maximum proportion of words to redact (default: 0.025, 2.5%).
|
455
469
|
> - `merge_adjacent (bool)`: Whether to redact the space between adjacent redacted words (default: False).
|
470
|
+
> - `unweighted (bool)`: Sample words uniformly instead of biasing toward longer tokens (default: False).
|
456
471
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
457
472
|
|
458
473
|
## Field Report: Uncontained Specimens
|
@@ -55,7 +55,7 @@ print(gaggle(SAMPLE_TEXT))
|
|
55
55
|
|
56
56
|
Consult the [Glitchlings Usage Guide](docs/index.md)
|
57
57
|
for end-to-end instructions spanning the Python API, CLI, HuggingFace and Prime Intellect
|
58
|
-
integrations, and the
|
58
|
+
integrations, and the autodetected Rust pipeline (enabled whenever the extension is present).
|
59
59
|
|
60
60
|
## Motivation
|
61
61
|
|
@@ -187,7 +187,8 @@ _Did you say that or did I?_
|
|
187
187
|
>
|
188
188
|
> Args
|
189
189
|
>
|
190
|
-
> - `rate (float)`: The maximum proportion of words to reduplicate (default: 0.
|
190
|
+
> - `rate (float)`: The maximum proportion of words to reduplicate (default: 0.01, 1%).
|
191
|
+
> - `unweighted (bool)`: Sample words uniformly instead of favouring shorter tokens (default: False).
|
191
192
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
192
193
|
|
193
194
|
### Rushmore
|
@@ -199,6 +200,19 @@ _I accidentally an entire word._
|
|
199
200
|
> Args
|
200
201
|
>
|
201
202
|
> - `rate (float)`: The maximum proportion of words to delete (default: 0.01, 1%).
|
203
|
+
> - `unweighted (bool)`: Sample words uniformly instead of favouring shorter tokens (default: False).
|
204
|
+
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
205
|
+
|
206
|
+
### Adjax
|
207
|
+
|
208
|
+
_Keep your hands and punctuation where I can see them._
|
209
|
+
|
210
|
+
> _**Perfect Shuffle.**_ Adjax trades the cores of neighbouring words while leaving punctuation, casing, and surrounding whitespace untouched, turning fluent prose into locally scrambled tongue-twisters.
|
211
|
+
>
|
212
|
+
> Args
|
213
|
+
>
|
214
|
+
> - `rate (float)`: Probability that each adjacent pair swaps cores (default: 0.5, 50%).
|
215
|
+
> - `swap_rate (float)`: Alias for `rate`, retained for backward compatibility.
|
202
216
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
203
217
|
|
204
218
|
### Redactyl
|
@@ -209,9 +223,10 @@ _Oops, that was my black highlighter._
|
|
209
223
|
>
|
210
224
|
> ### Args
|
211
225
|
>
|
212
|
-
> - `replacement_char (str)`: The character to use for redaction (default:
|
213
|
-
> - `rate (float)`: The maximum proportion of words to redact (default: 0.
|
226
|
+
> - `replacement_char (str)`: The character to use for redaction (default: FULL_BLOCK).
|
227
|
+
> - `rate (float)`: The maximum proportion of words to redact (default: 0.025, 2.5%).
|
214
228
|
> - `merge_adjacent (bool)`: Whether to redact the space between adjacent redacted words (default: False).
|
229
|
+
> - `unweighted (bool)`: Sample words uniformly instead of biasing toward longer tokens (default: False).
|
215
230
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
216
231
|
|
217
232
|
## Field Report: Uncontained Specimens
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[project]
|
2
2
|
name = "glitchlings"
|
3
|
-
version = "0.2.5"
|
3
|
+
version = "0.3.0"
|
4
4
|
description = "Monsters for your language games."
|
5
5
|
readme = "README.md"
|
6
6
|
requires-python = ">=3.10"
|
@@ -13,7 +13,7 @@ authors = [
|
|
13
13
|
{ name = "osoleve" }
|
14
14
|
]
|
15
15
|
|
16
|
-
keywords = ["nlp", "text", "adversarial augmentation", "text augmentation", "large language models", "llms", "data augmentation", "
|
16
|
+
keywords = ["nlp", "text", "adversarial augmentation", "text augmentation", "large language models", "llms", "data augmentation", "rlvr"]
|
17
17
|
|
18
18
|
classifiers = [
|
19
19
|
"Development Status :: 3 - Alpha",
|
@@ -359,6 +359,61 @@ impl GlitchOp for DeleteRandomWordsOp {
|
|
359
359
|
}
|
360
360
|
}
|
361
361
|
|
362
|
+
/// Swaps adjacent word cores while keeping punctuation and spacing intact.
|
363
|
+
#[derive(Debug, Clone, Copy)]
|
364
|
+
pub struct SwapAdjacentWordsOp {
|
365
|
+
pub swap_rate: f64,
|
366
|
+
}
|
367
|
+
|
368
|
+
impl GlitchOp for SwapAdjacentWordsOp {
|
369
|
+
fn apply(&self, buffer: &mut TextBuffer, rng: &mut dyn GlitchRng) -> Result<(), GlitchOpError> {
|
370
|
+
let total_words = buffer.word_count();
|
371
|
+
if total_words < 2 {
|
372
|
+
return Ok(());
|
373
|
+
}
|
374
|
+
|
375
|
+
let clamped = self.swap_rate.max(0.0).min(1.0);
|
376
|
+
if clamped <= 0.0 {
|
377
|
+
return Ok(());
|
378
|
+
}
|
379
|
+
|
380
|
+
let mut index = 0usize;
|
381
|
+
while index + 1 < total_words {
|
382
|
+
let left_segment = match buffer.word_segment(index) {
|
383
|
+
Some(segment) => segment,
|
384
|
+
None => break,
|
385
|
+
};
|
386
|
+
let right_segment = match buffer.word_segment(index + 1) {
|
387
|
+
Some(segment) => segment,
|
388
|
+
None => break,
|
389
|
+
};
|
390
|
+
|
391
|
+
let left_original = left_segment.text().to_string();
|
392
|
+
let right_original = right_segment.text().to_string();
|
393
|
+
|
394
|
+
let (left_prefix, left_core, left_suffix) = split_affixes(&left_original);
|
395
|
+
let (right_prefix, right_core, right_suffix) = split_affixes(&right_original);
|
396
|
+
|
397
|
+
if left_core.is_empty() || right_core.is_empty() {
|
398
|
+
index += 2;
|
399
|
+
continue;
|
400
|
+
}
|
401
|
+
|
402
|
+
let should_swap = clamped >= 1.0 || rng.random()? < clamped;
|
403
|
+
if should_swap {
|
404
|
+
let left_replacement = format!("{left_prefix}{right_core}{left_suffix}");
|
405
|
+
let right_replacement = format!("{right_prefix}{left_core}{right_suffix}");
|
406
|
+
buffer.replace_word(index, &left_replacement)?;
|
407
|
+
buffer.replace_word(index + 1, &right_replacement)?;
|
408
|
+
}
|
409
|
+
|
410
|
+
index += 2;
|
411
|
+
}
|
412
|
+
|
413
|
+
Ok(())
|
414
|
+
}
|
415
|
+
}
|
416
|
+
|
362
417
|
/// Redacts words by replacing core characters with a replacement token.
|
363
418
|
#[derive(Debug, Clone)]
|
364
419
|
pub struct RedactWordsOp {
|
@@ -555,6 +610,7 @@ impl GlitchOp for OcrArtifactsOp {
|
|
555
610
|
pub enum GlitchOperation {
|
556
611
|
Reduplicate(ReduplicateWordsOp),
|
557
612
|
Delete(DeleteRandomWordsOp),
|
613
|
+
SwapAdjacent(SwapAdjacentWordsOp),
|
558
614
|
Redact(RedactWordsOp),
|
559
615
|
Ocr(OcrArtifactsOp),
|
560
616
|
}
|
@@ -564,6 +620,7 @@ impl GlitchOp for GlitchOperation {
|
|
564
620
|
match self {
|
565
621
|
GlitchOperation::Reduplicate(op) => op.apply(buffer, rng),
|
566
622
|
GlitchOperation::Delete(op) => op.apply(buffer, rng),
|
623
|
+
GlitchOperation::SwapAdjacent(op) => op.apply(buffer, rng),
|
567
624
|
GlitchOperation::Redact(op) => op.apply(buffer, rng),
|
568
625
|
GlitchOperation::Ocr(op) => op.apply(buffer, rng),
|
569
626
|
}
|
@@ -574,7 +631,7 @@ impl GlitchOp for GlitchOperation {
|
|
574
631
|
mod tests {
|
575
632
|
use super::{
|
576
633
|
DeleteRandomWordsOp, GlitchOp, GlitchOpError, OcrArtifactsOp, RedactWordsOp,
|
577
|
-
ReduplicateWordsOp,
|
634
|
+
ReduplicateWordsOp, SwapAdjacentWordsOp,
|
578
635
|
};
|
579
636
|
use crate::rng::PyRng;
|
580
637
|
use crate::text_buffer::TextBuffer;
|
@@ -592,6 +649,27 @@ mod tests {
|
|
592
649
|
assert_eq!(buffer.to_string(), "Hello Hello world world");
|
593
650
|
}
|
594
651
|
|
652
|
+
#[test]
|
653
|
+
fn swap_adjacent_words_swaps_cores() {
|
654
|
+
let mut buffer = TextBuffer::from_str("Alpha, beta! Gamma delta");
|
655
|
+
let mut rng = PyRng::new(7);
|
656
|
+
let op = SwapAdjacentWordsOp { swap_rate: 1.0 };
|
657
|
+
op.apply(&mut buffer, &mut rng)
|
658
|
+
.expect("swap operation succeeds");
|
659
|
+
assert_eq!(buffer.to_string(), "beta, Alpha! delta Gamma");
|
660
|
+
}
|
661
|
+
|
662
|
+
#[test]
|
663
|
+
fn swap_adjacent_words_respects_zero_rate() {
|
664
|
+
let original = "Do not move these words";
|
665
|
+
let mut buffer = TextBuffer::from_str(original);
|
666
|
+
let mut rng = PyRng::new(42);
|
667
|
+
let op = SwapAdjacentWordsOp { swap_rate: 0.0 };
|
668
|
+
op.apply(&mut buffer, &mut rng)
|
669
|
+
.expect("swap operation succeeds");
|
670
|
+
assert_eq!(buffer.to_string(), original);
|
671
|
+
}
|
672
|
+
|
595
673
|
#[test]
|
596
674
|
fn delete_random_words_cleans_up_spacing() {
|
597
675
|
let mut buffer = TextBuffer::from_str("One two three four five");
|
@@ -14,7 +14,7 @@ use pyo3::{exceptions::PyValueError, FromPyObject};
|
|
14
14
|
|
15
15
|
pub use glitch_ops::{
|
16
16
|
DeleteRandomWordsOp, GlitchOpError, GlitchOperation, OcrArtifactsOp, RedactWordsOp,
|
17
|
-
ReduplicateWordsOp,
|
17
|
+
ReduplicateWordsOp, SwapAdjacentWordsOp,
|
18
18
|
};
|
19
19
|
pub use pipeline::{derive_seed, GlitchDescriptor, Pipeline, PipelineError};
|
20
20
|
pub use rng::{PyRng, PyRngError};
|
@@ -101,6 +101,9 @@ enum PyGlitchOperation {
|
|
101
101
|
max_deletion_rate: f64,
|
102
102
|
unweighted: bool,
|
103
103
|
},
|
104
|
+
SwapAdjacent {
|
105
|
+
swap_rate: f64,
|
106
|
+
},
|
104
107
|
Redact {
|
105
108
|
replacement_char: String,
|
106
109
|
redaction_rate: f64,
|
@@ -154,6 +157,15 @@ impl<'py> FromPyObject<'py> for PyGlitchOperation {
|
|
154
157
|
unweighted,
|
155
158
|
})
|
156
159
|
}
|
160
|
+
"swap_adjacent" => {
|
161
|
+
let rate = dict
|
162
|
+
.get_item("swap_rate")?
|
163
|
+
.ok_or_else(|| {
|
164
|
+
PyValueError::new_err("swap_adjacent operation missing 'swap_rate'")
|
165
|
+
})?
|
166
|
+
.extract()?;
|
167
|
+
Ok(PyGlitchOperation::SwapAdjacent { swap_rate: rate })
|
168
|
+
}
|
157
169
|
"redact" => {
|
158
170
|
let replacement_char = dict
|
159
171
|
.get_item("replacement_char")?
|
@@ -241,6 +253,16 @@ fn delete_random_words(
|
|
241
253
|
apply_operation(text, op, rng).map_err(glitch_ops::GlitchOpError::into_pyerr)
|
242
254
|
}
|
243
255
|
|
256
|
+
#[pyfunction]
|
257
|
+
fn swap_adjacent_words(
|
258
|
+
text: &str,
|
259
|
+
swap_rate: f64,
|
260
|
+
rng: &Bound<'_, PyAny>,
|
261
|
+
) -> PyResult<String> {
|
262
|
+
let op = SwapAdjacentWordsOp { swap_rate };
|
263
|
+
apply_operation(text, op, rng).map_err(glitch_ops::GlitchOpError::into_pyerr)
|
264
|
+
}
|
265
|
+
|
244
266
|
#[pyfunction]
|
245
267
|
fn ocr_artifacts(text: &str, error_rate: f64, rng: &Bound<'_, PyAny>) -> PyResult<String> {
|
246
268
|
let op = OcrArtifactsOp { error_rate };
|
@@ -289,6 +311,11 @@ fn compose_glitchlings(
|
|
289
311
|
max_deletion_rate,
|
290
312
|
unweighted,
|
291
313
|
}),
|
314
|
+
PyGlitchOperation::SwapAdjacent { swap_rate } => {
|
315
|
+
GlitchOperation::SwapAdjacent(glitch_ops::SwapAdjacentWordsOp {
|
316
|
+
swap_rate,
|
317
|
+
})
|
318
|
+
}
|
292
319
|
PyGlitchOperation::Redact {
|
293
320
|
replacement_char,
|
294
321
|
redaction_rate,
|
@@ -320,6 +347,7 @@ fn compose_glitchlings(
|
|
320
347
|
fn _zoo_rust(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
|
321
348
|
m.add_function(wrap_pyfunction!(reduplicate_words, m)?)?;
|
322
349
|
m.add_function(wrap_pyfunction!(delete_random_words, m)?)?;
|
350
|
+
m.add_function(wrap_pyfunction!(swap_adjacent_words, m)?)?;
|
323
351
|
m.add_function(wrap_pyfunction!(ocr_artifacts, m)?)?;
|
324
352
|
m.add_function(wrap_pyfunction!(redact_words, m)?)?;
|
325
353
|
m.add_function(wrap_pyfunction!(compose_glitchlings, m)?)?;
|
@@ -112,6 +112,7 @@ mod tests {
|
|
112
112
|
use super::{derive_seed, GlitchDescriptor, Pipeline};
|
113
113
|
use crate::glitch_ops::{
|
114
114
|
DeleteRandomWordsOp, GlitchOperation, OcrArtifactsOp, RedactWordsOp, ReduplicateWordsOp,
|
115
|
+
SwapAdjacentWordsOp,
|
115
116
|
};
|
116
117
|
|
117
118
|
#[test]
|
@@ -207,4 +208,18 @@ mod tests {
|
|
207
208
|
.expect("pipeline run succeeds");
|
208
209
|
assert_eq!(output, "Guard the ██ at ██████████");
|
209
210
|
}
|
211
|
+
#[test]
|
212
|
+
fn pipeline_swaps_adjacent_words() {
|
213
|
+
let master_seed = 2025i128;
|
214
|
+
let descriptors = vec![GlitchDescriptor {
|
215
|
+
name: "Adjax".to_string(),
|
216
|
+
seed: derive_seed(master_seed, "Adjax", 0),
|
217
|
+
operation: GlitchOperation::SwapAdjacent(SwapAdjacentWordsOp { swap_rate: 1.0 }),
|
218
|
+
}];
|
219
|
+
let pipeline = Pipeline::new(master_seed, descriptors);
|
220
|
+
let output = pipeline
|
221
|
+
.run("Echo this line please")
|
222
|
+
.expect("pipeline succeeds");
|
223
|
+
assert_eq!(output, "this Echo please line");
|
224
|
+
}
|
210
225
|
}
|
@@ -5,6 +5,8 @@ from .zoo import (
|
|
5
5
|
mim1c,
|
6
6
|
Jargoyle,
|
7
7
|
jargoyle,
|
8
|
+
Adjax,
|
9
|
+
adjax,
|
8
10
|
Redactyl,
|
9
11
|
redactyl,
|
10
12
|
Reduple,
|
@@ -29,6 +31,8 @@ __all__ = [
|
|
29
31
|
"mim1c",
|
30
32
|
"Jargoyle",
|
31
33
|
"jargoyle",
|
34
|
+
"Adjax",
|
35
|
+
"adjax",
|
32
36
|
"Redactyl",
|
33
37
|
"redactyl",
|
34
38
|
"Reduple",
|
@@ -49,7 +49,24 @@ def _resolve_columns(dataset: Dataset, columns: Sequence[str] | None) -> list[st
|
|
49
49
|
if candidate in available:
|
50
50
|
return [candidate]
|
51
51
|
|
52
|
-
|
52
|
+
try:
|
53
|
+
dataset_length = len(dataset) # type: ignore[arg-type]
|
54
|
+
except TypeError:
|
55
|
+
preview_rows: list[dict[str, Any]]
|
56
|
+
take_fn = getattr(dataset, "take", None)
|
57
|
+
if callable(take_fn):
|
58
|
+
preview_rows = list(take_fn(1))
|
59
|
+
else:
|
60
|
+
iterator = iter(dataset)
|
61
|
+
try:
|
62
|
+
first_row = next(iterator)
|
63
|
+
except StopIteration:
|
64
|
+
preview_rows = []
|
65
|
+
else:
|
66
|
+
preview_rows = [first_row]
|
67
|
+
sample = dict(preview_rows[0]) if preview_rows else {}
|
68
|
+
else:
|
69
|
+
sample = dataset[0] if dataset_length else {}
|
53
70
|
inferred = [
|
54
71
|
name
|
55
72
|
for name in dataset.column_names
|
@@ -6,6 +6,7 @@ from typing import Any
|
|
6
6
|
from .typogre import Typogre, typogre
|
7
7
|
from .mim1c import Mim1c, mim1c
|
8
8
|
from .jargoyle import Jargoyle, jargoyle, dependencies_available as _jargoyle_available
|
9
|
+
from .adjax import Adjax, adjax
|
9
10
|
from .reduple import Reduple, reduple
|
10
11
|
from .rushmore import Rushmore, rushmore
|
11
12
|
from .redactyl import Redactyl, redactyl
|
@@ -20,6 +21,8 @@ __all__ = [
|
|
20
21
|
"mim1c",
|
21
22
|
"Jargoyle",
|
22
23
|
"jargoyle",
|
24
|
+
"Adjax",
|
25
|
+
"adjax",
|
23
26
|
"Reduple",
|
24
27
|
"reduple",
|
25
28
|
"Rushmore",
|
@@ -43,7 +46,7 @@ _HAS_JARGOYLE = _jargoyle_available()
|
|
43
46
|
_BUILTIN_GLITCHLING_LIST: list[Glitchling] = [typogre, mim1c]
|
44
47
|
if _HAS_JARGOYLE:
|
45
48
|
_BUILTIN_GLITCHLING_LIST.append(jargoyle)
|
46
|
-
_BUILTIN_GLITCHLING_LIST.extend([reduple, rushmore, redactyl, scannequin, zeedub])
|
49
|
+
_BUILTIN_GLITCHLING_LIST.extend([adjax, reduple, rushmore, redactyl, scannequin, zeedub])
|
47
50
|
|
48
51
|
BUILTIN_GLITCHLINGS: dict[str, Glitchling] = {
|
49
52
|
glitchling.name.lower(): glitchling for glitchling in _BUILTIN_GLITCHLING_LIST
|
@@ -52,6 +55,7 @@ BUILTIN_GLITCHLINGS: dict[str, Glitchling] = {
|
|
52
55
|
_BUILTIN_GLITCHLING_TYPES: dict[str, type[Glitchling]] = {
|
53
56
|
typogre.name.lower(): Typogre,
|
54
57
|
mim1c.name.lower(): Mim1c,
|
58
|
+
adjax.name.lower(): Adjax,
|
55
59
|
reduple.name.lower(): Reduple,
|
56
60
|
rushmore.name.lower(): Rushmore,
|
57
61
|
redactyl.name.lower(): Redactyl,
|
@@ -0,0 +1,42 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import re
|
4
|
+
|
5
|
+
_WORD_SPLIT_PATTERN = re.compile(r"(\s+)")
|
6
|
+
_TOKEN_EDGES_PATTERN = re.compile(r"^(\W*)(.*?)(\W*)$")
|
7
|
+
|
8
|
+
|
9
|
+
def split_preserving_whitespace(text: str) -> list[str]:
|
10
|
+
"""Split text while keeping whitespace tokens for stable reconstruction."""
|
11
|
+
|
12
|
+
return _WORD_SPLIT_PATTERN.split(text)
|
13
|
+
|
14
|
+
|
15
|
+
def split_token_edges(token: str) -> tuple[str, str, str]:
|
16
|
+
"""Return leading, core, and trailing segments for a token."""
|
17
|
+
|
18
|
+
match = _TOKEN_EDGES_PATTERN.match(token)
|
19
|
+
if match is None:
|
20
|
+
return "", token, ""
|
21
|
+
return match.group(1), match.group(2), match.group(3)
|
22
|
+
|
23
|
+
|
24
|
+
def token_core_length(token: str) -> int:
|
25
|
+
"""Return the length of the main word characters for weighting heuristics."""
|
26
|
+
|
27
|
+
_, core, _ = split_token_edges(token)
|
28
|
+
candidate = core if core else token
|
29
|
+
length = len(candidate)
|
30
|
+
if length <= 0:
|
31
|
+
stripped = token.strip()
|
32
|
+
length = len(stripped) if stripped else len(token)
|
33
|
+
if length <= 0:
|
34
|
+
length = 1
|
35
|
+
return length
|
36
|
+
|
37
|
+
|
38
|
+
__all__ = [
|
39
|
+
"split_preserving_whitespace",
|
40
|
+
"split_token_edges",
|
41
|
+
"token_core_length",
|
42
|
+
]
|
@@ -0,0 +1,131 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import random
|
4
|
+
from typing import Any
|
5
|
+
|
6
|
+
from ._rate import resolve_rate
|
7
|
+
from ._text_utils import split_preserving_whitespace, split_token_edges
|
8
|
+
from .core import AttackWave, Glitchling
|
9
|
+
|
10
|
+
try:
|
11
|
+
from glitchlings._zoo_rust import swap_adjacent_words as _swap_adjacent_words_rust
|
12
|
+
except ImportError: # pragma: no cover - optional acceleration
|
13
|
+
_swap_adjacent_words_rust = None
|
14
|
+
|
15
|
+
|
16
|
+
def _python_swap_adjacent_words(
|
17
|
+
text: str,
|
18
|
+
*,
|
19
|
+
rate: float,
|
20
|
+
rng: random.Random,
|
21
|
+
) -> str:
|
22
|
+
"""Swap the cores of adjacent words while keeping affixes and spacing intact."""
|
23
|
+
|
24
|
+
tokens = split_preserving_whitespace(text)
|
25
|
+
if len(tokens) < 2:
|
26
|
+
return text
|
27
|
+
|
28
|
+
word_indices: list[int] = []
|
29
|
+
for index in range(len(tokens)):
|
30
|
+
token = tokens[index]
|
31
|
+
if not token or token.isspace():
|
32
|
+
continue
|
33
|
+
if index % 2 == 0:
|
34
|
+
word_indices.append(index)
|
35
|
+
|
36
|
+
if len(word_indices) < 2:
|
37
|
+
return text
|
38
|
+
|
39
|
+
clamped = max(0.0, min(rate, 1.0))
|
40
|
+
if clamped <= 0.0:
|
41
|
+
return text
|
42
|
+
|
43
|
+
for cursor in range(0, len(word_indices) - 1, 2):
|
44
|
+
left_index = word_indices[cursor]
|
45
|
+
right_index = word_indices[cursor + 1]
|
46
|
+
|
47
|
+
left_token = tokens[left_index]
|
48
|
+
right_token = tokens[right_index]
|
49
|
+
|
50
|
+
left_prefix, left_core, left_suffix = split_token_edges(left_token)
|
51
|
+
right_prefix, right_core, right_suffix = split_token_edges(right_token)
|
52
|
+
|
53
|
+
if not left_core or not right_core:
|
54
|
+
continue
|
55
|
+
|
56
|
+
should_swap = clamped >= 1.0 or rng.random() < clamped
|
57
|
+
if not should_swap:
|
58
|
+
continue
|
59
|
+
|
60
|
+
tokens[left_index] = f"{left_prefix}{right_core}{left_suffix}"
|
61
|
+
tokens[right_index] = f"{right_prefix}{left_core}{right_suffix}"
|
62
|
+
|
63
|
+
return "".join(tokens)
|
64
|
+
|
65
|
+
|
66
|
+
def swap_adjacent_words(
|
67
|
+
text: str,
|
68
|
+
rate: float | None = None,
|
69
|
+
seed: int | None = None,
|
70
|
+
rng: random.Random | None = None,
|
71
|
+
*,
|
72
|
+
swap_rate: float | None = None,
|
73
|
+
) -> str:
|
74
|
+
"""Swap adjacent word cores while preserving spacing and punctuation."""
|
75
|
+
|
76
|
+
effective_rate = resolve_rate(
|
77
|
+
rate=rate,
|
78
|
+
legacy_value=swap_rate,
|
79
|
+
default=0.5,
|
80
|
+
legacy_name="swap_rate",
|
81
|
+
)
|
82
|
+
clamped_rate = max(0.0, min(effective_rate, 1.0))
|
83
|
+
|
84
|
+
if rng is None:
|
85
|
+
rng = random.Random(seed)
|
86
|
+
|
87
|
+
if _swap_adjacent_words_rust is not None:
|
88
|
+
return _swap_adjacent_words_rust(text, clamped_rate, rng)
|
89
|
+
|
90
|
+
return _python_swap_adjacent_words(text, rate=clamped_rate, rng=rng)
|
91
|
+
|
92
|
+
|
93
|
+
class Adjax(Glitchling):
|
94
|
+
"""Glitchling that swaps adjacent words to scramble local semantics."""
|
95
|
+
|
96
|
+
def __init__(
|
97
|
+
self,
|
98
|
+
*,
|
99
|
+
rate: float | None = None,
|
100
|
+
swap_rate: float | None = None,
|
101
|
+
seed: int | None = None,
|
102
|
+
) -> None:
|
103
|
+
self._param_aliases = {"swap_rate": "rate"}
|
104
|
+
effective_rate = resolve_rate(
|
105
|
+
rate=rate,
|
106
|
+
legacy_value=swap_rate,
|
107
|
+
default=0.5,
|
108
|
+
legacy_name="swap_rate",
|
109
|
+
)
|
110
|
+
super().__init__(
|
111
|
+
name="Adjax",
|
112
|
+
corruption_function=swap_adjacent_words,
|
113
|
+
scope=AttackWave.WORD,
|
114
|
+
seed=seed,
|
115
|
+
rate=effective_rate,
|
116
|
+
)
|
117
|
+
|
118
|
+
def pipeline_operation(self) -> dict[str, Any] | None:
|
119
|
+
rate = self.kwargs.get("rate")
|
120
|
+
if rate is None:
|
121
|
+
return None
|
122
|
+
return {
|
123
|
+
"type": "swap_adjacent",
|
124
|
+
"swap_rate": float(rate),
|
125
|
+
}
|
126
|
+
|
127
|
+
|
128
|
+
adjax = Adjax()
|
129
|
+
|
130
|
+
|
131
|
+
__all__ = ["Adjax", "adjax", "swap_adjacent_words"]
|