glitchlings 0.1.2__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {glitchlings-0.1.2 → glitchlings-0.1.4}/.gitignore +3 -0
- glitchlings-0.1.4/AGENTS.md +55 -0
- {glitchlings-0.1.2 → glitchlings-0.1.4}/PKG-INFO +26 -1
- {glitchlings-0.1.2 → glitchlings-0.1.4}/README.md +24 -0
- glitchlings-0.1.4/docs/index.md +253 -0
- {glitchlings-0.1.2 → glitchlings-0.1.4}/pyproject.toml +2 -1
- glitchlings-0.1.4/rust/typogre/Cargo.lock +295 -0
- glitchlings-0.1.4/rust/typogre/Cargo.toml +14 -0
- glitchlings-0.1.4/rust/typogre/src/lib.rs +260 -0
- glitchlings-0.1.4/rust/zoo/Cargo.lock +340 -0
- glitchlings-0.1.4/rust/zoo/Cargo.toml +15 -0
- glitchlings-0.1.4/rust/zoo/src/lib.rs +367 -0
- glitchlings-0.1.4/src/glitchlings/dlc/prime.py +113 -0
- {glitchlings-0.1.2 → glitchlings-0.1.4}/src/glitchlings/zoo/core.py +45 -5
- {glitchlings-0.1.2 → glitchlings-0.1.4}/src/glitchlings/zoo/redactyl.py +46 -9
- {glitchlings-0.1.2 → glitchlings-0.1.4}/src/glitchlings/zoo/reduple.py +35 -8
- {glitchlings-0.1.2 → glitchlings-0.1.4}/src/glitchlings/zoo/rushmore.py +48 -25
- {glitchlings-0.1.2 → glitchlings-0.1.4}/src/glitchlings/zoo/scannequin.py +33 -7
- glitchlings-0.1.4/src/glitchlings/zoo/typogre.py +184 -0
- glitchlings-0.1.4/tests/test_cli.py +150 -0
- glitchlings-0.1.4/tests/test_dataset_corruption.py +51 -0
- {glitchlings-0.1.2 → glitchlings-0.1.4}/tests/test_gaggle.py +6 -0
- glitchlings-0.1.4/tests/test_glitchling_core.py +24 -0
- {glitchlings-0.1.2 → glitchlings-0.1.4}/tests/test_glitchlings_determinism.py +18 -0
- {glitchlings-0.1.2 → glitchlings-0.1.4}/tests/test_parameter_effects.py +16 -0
- glitchlings-0.1.4/tests/test_property_based.py +93 -0
- glitchlings-0.1.4/tests/test_rust_backed_glitchlings.py +110 -0
- glitchlings-0.1.4/tests/test_util.py +35 -0
- glitchlings-0.1.2/AGENTS.md +0 -42
- glitchlings-0.1.2/src/glitchlings/dlc/prime.py +0 -52
- glitchlings-0.1.2/src/glitchlings/zoo/typogre.py +0 -231
- {glitchlings-0.1.2 → glitchlings-0.1.4}/.github/workflows/publish.yml +0 -0
- {glitchlings-0.1.2 → glitchlings-0.1.4}/LICENSE +0 -0
- {glitchlings-0.1.2 → glitchlings-0.1.4}/MONSTER_MANUAL.md +0 -0
- {glitchlings-0.1.2 → glitchlings-0.1.4}/src/glitchlings/__init__.py +0 -0
- {glitchlings-0.1.2 → glitchlings-0.1.4}/src/glitchlings/__main__.py +0 -0
- {glitchlings-0.1.2 → glitchlings-0.1.4}/src/glitchlings/dlc/__init__.py +0 -0
- {glitchlings-0.1.2 → glitchlings-0.1.4}/src/glitchlings/main.py +0 -0
- {glitchlings-0.1.2 → glitchlings-0.1.4}/src/glitchlings/util/__init__.py +0 -0
- {glitchlings-0.1.2 → glitchlings-0.1.4}/src/glitchlings/zoo/__init__.py +0 -0
- {glitchlings-0.1.2 → glitchlings-0.1.4}/src/glitchlings/zoo/jargoyle.py +0 -0
- {glitchlings-0.1.2 → glitchlings-0.1.4}/src/glitchlings/zoo/mim1c.py +0 -0
- {glitchlings-0.1.2 → glitchlings-0.1.4}/tests/conftest.py +0 -0
- {glitchlings-0.1.2 → glitchlings-0.1.4}/tests/test_jargoyle.py +0 -0
- {glitchlings-0.1.2 → glitchlings-0.1.4}/tests/test_keyboard_layouts.py +0 -0
@@ -0,0 +1,55 @@
|
|
1
|
+
# Glitchlings – Agent Handbook
|
2
|
+
|
3
|
+
Welcome! This repository corrals a roster of deterministic text-corruption "glitchlings" plus a CLI for orchestrating them.
|
4
|
+
Treat this handbook as the default guidance for any work in the repo.
|
5
|
+
|
6
|
+
## Repository Tour
|
7
|
+
- **`src/glitchlings/`** – Installable Python package.
|
8
|
+
- `__init__.py` exposes the public API (glitchlings, `Gaggle`, `summon`, `SAMPLE_TEXT`).
|
9
|
+
- `__main__.py` wires `python -m glitchlings` to the CLI entry point in `main.py`.
|
10
|
+
- `main.py` implements the CLI: parser construction, text sourcing, glitchling summoning, and optional diff output.
|
11
|
+
- **`src/glitchlings/zoo/`** – Core glitchling implementations.
|
12
|
+
- `core.py` defines the `Glitchling` base class, `AttackWave`/`AttackOrder` enums, and the `Gaggle` orchestrator.
|
13
|
+
- `typogre.py`, `mim1c.py`, `reduple.py`, `rushmore.py`, `redactyl.py`, `jargoyle.py`, and `scannequin.py` provide concrete glitchlings.
|
14
|
+
Each module offers a pure-Python implementation and, when available, dispatches to an optional Rust acceleration layer.
|
15
|
+
- **`src/glitchlings/util/__init__.py`** – Shared helpers including `SAMPLE_TEXT`, keyboard-neighbour layouts, and diff utilities.
|
16
|
+
- **`src/glitchlings/dlc/prime.py`** – Optional DLC integration with the `verifiers` environments (install via `pip install -e .[prime]`).
|
17
|
+
- **`rust/`** – PyO3 crates backing the optional Rust extensions.
|
18
|
+
- `rust/zoo/` builds `glitchlings._zoo_rust` (used by Reduple, Rushmore, Redactyl, and Scannequin).
|
19
|
+
- `rust/typogre/` builds `glitchlings._typogre_rust` (Typogre's fast path).
|
20
|
+
- Use `maturin develop` (or `maturin build`) from each crate directory to compile the wheels when you need the acceleration paths.
|
21
|
+
- **`tests/`** – Pytest suite covering determinism, keyboard layouts, CLI behaviour, and parity between Python and Rust implementations.
|
22
|
+
- `test_glitchlings_determinism.py`, `test_parameter_effects.py`, and `test_gaggle.py` validate orchestration and RNG guarantees.
|
23
|
+
- `test_rust_backed_glitchlings.py` ensures Rust fast paths match the Python fallbacks.
|
24
|
+
- **Top-level docs** – `README.md` introduces the project and CLI; `MONSTER_MANUAL.md` serves as the glitchling bestiary.
|
25
|
+
|
26
|
+
## Coding Conventions
|
27
|
+
- Target **Python 3.12+** (see `pyproject.toml`).
|
28
|
+
- Follow the import order used in the package: standard library, third-party, then local modules.
|
29
|
+
- Every new glitchling must:
|
30
|
+
- Subclass `Glitchling`, setting `scope` and `order` via `AttackWave` / `AttackOrder` from `core.py`.
|
31
|
+
- Accept keyword-only parameters in `__init__`, forwarding them through `super().__init__` so they are tracked by `set_param`.
|
32
|
+
- Drive all randomness through the instance's `rng` (do not rely on module-level RNG state) to keep `Gaggle` runs deterministic.
|
33
|
+
- Keep helper functions small and well-scoped; include docstrings that describe behaviour and note any determinism considerations.
|
34
|
+
- When mutating token sequences, preserve whitespace and punctuation via separator-preserving regex splits (see `reduple.py`, `rushmore.py`, `redactyl.py`).
|
35
|
+
- CLI work should continue the existing UX: validate inputs with `ArgumentParser.error`, keep deterministic output ordering, and gate optional behaviours behind explicit flags.
|
36
|
+
- Rust fast paths must remain optional: guard imports with `try`/`except ImportError`, surface identical signatures, and fall back to the Python implementation when the extension is absent.
|
37
|
+
|
38
|
+
## Testing & Tooling
|
39
|
+
- Run the full suite with `pytest` from the repository root.
|
40
|
+
- Some tests rely on the NLTK WordNet corpus; if it is missing they skip automatically. Install it via `python -c "import nltk; nltk.download('wordnet')"` to exercise Jargoyle thoroughly.
|
41
|
+
- If you modify Rust-backed modules, rerun `pytest tests/test_rust_backed_glitchlings.py` with and without the compiled extensions to keep both code paths healthy.
|
42
|
+
- Optional extras (e.g., DLC) depend on `verifiers`. Install the `prime` extra (`pip install -e .[prime]`) when working in `src/glitchlings/dlc/`.
|
43
|
+
|
44
|
+
## Determinism Checklist
|
45
|
+
- Expose configurable parameters via `set_param` so fixtures in `tests/test_glitchlings_determinism.py` can reset seeds predictably.
|
46
|
+
- Derive RNGs from the enclosing context (`Gaggle.derive_seed`) instead of using global state.
|
47
|
+
- When sampling subsets (e.g., replacements or deletions), stabilise candidate ordering before selecting to keep results reproducible.
|
48
|
+
- Preserve signature parity between Python and Rust implementations so switching paths does not alter behaviour.
|
49
|
+
|
50
|
+
## Workflow Tips
|
51
|
+
- Use `summon([...], seed=...)` for programmatic orchestration when reproducing tests or crafting examples.
|
52
|
+
- The CLI lists built-in glitchlings (`glitchlings --list`) and can show diffs; update `BUILTIN_GLITCHLINGS` and help text when introducing new creatures.
|
53
|
+
- Keep documentation synchronized: update both `README.md` and `MONSTER_MANUAL.md` when adding or altering glitchlings or behaviours.
|
54
|
+
- When editing keyboard layouts or homoglyph mappings, ensure downstream consumers continue to work with lowercase keys (`util.KEYNEIGHBORS`).
|
55
|
+
- Rust builds are optional—keep the project functional when extensions are absent (e.g., in CI or user installs without `maturin`).
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: glitchlings
|
3
|
-
Version: 0.1.2
|
3
|
+
Version: 0.1.4
|
4
4
|
Summary: Monsters for your language games.
|
5
5
|
Project-URL: Homepage, https://github.com/osoleve/glitchlings
|
6
6
|
Project-URL: Repository, https://github.com/osoleve/glitchlings.git
|
@@ -225,6 +225,7 @@ Requires-Dist: datasets>=4.0.0
|
|
225
225
|
Requires-Dist: jellyfish>=1.2.0
|
226
226
|
Requires-Dist: nltk>=3.9.1
|
227
227
|
Provides-Extra: dev
|
228
|
+
Requires-Dist: hypothesis>=6.100.0; extra == 'dev'
|
228
229
|
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
229
230
|
Provides-Extra: prime
|
230
231
|
Requires-Dist: verifiers>=0.1.3.post0; extra == 'prime'
|
@@ -283,6 +284,30 @@ print(gaggle(SAMPLE_TEXT))
|
|
283
284
|
|
284
285
|
> Onҽ mھrning, wһen Gregor Samƽa woke from trouble𝐝 𝑑reams, he found himself transformed in his bed into a horrible vermin٠ He l lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightlh domed and divided by arches ino stiff sections. The bedding was adly able to cover it and and seemed ready to slide off any moment. His many legxs, pitifully thin compared with the size of the the rest of him, waved about helplessly ashe looked looked.
|
285
286
|
|
287
|
+
## Usage
|
288
|
+
|
289
|
+
Glitchlings slot into evaluation pipelines just as easily as they corrupt stray strings.
|
290
|
+
|
291
|
+
- **Direct invocation** – Instantiate a glitchling (or `Gaggle`) and call it on strings, iterables, or datasets. Keep the seed stable to make every run deterministic.
|
292
|
+
- **Dataset corruption** – Use a `Gaggle`'s `.corrupt_dataset` helper to perturb a Hugging Face `datasets.Dataset` and return a corrupted copy for training or evaluation.
|
293
|
+
|
294
|
+
### Prime Intellect environments
|
295
|
+
|
296
|
+
After `pip install -e .[prime]`, the `glitchlings.dlc.prime.load_environment` helper mirrors `verifiers.load_environment` for Prime Intellect scenarios while optionally applying glitchlings before returning the environment:
|
297
|
+
|
298
|
+
```python
|
299
|
+
from glitchlings import Mim1c, Typogre
|
300
|
+
from glitchlings.dlc.prime import load_environment
|
301
|
+
|
302
|
+
env = load_environment(
|
303
|
+
"osoleve/syllabify-en",
|
304
|
+
glitchlings=[Mim1c(replacement_rate=0.01), Typogre(max_change_rate=0.02)],
|
305
|
+
seed=404,
|
306
|
+
)
|
307
|
+
```
|
308
|
+
|
309
|
+
Skip the `glitchlings` argument to receive an untouched verifier dataset.
|
310
|
+
|
286
311
|
## Motivation
|
287
312
|
|
288
313
|
If your model performs well on a particular task, but not when `Glitchling`s are present, it's a sign that it hasn't actually generalized to the problem.
|
@@ -51,6 +51,30 @@ print(gaggle(SAMPLE_TEXT))
|
|
51
51
|
|
52
52
|
> Onҽ mھrning, wһen Gregor Samƽa woke from trouble𝐝 𝑑reams, he found himself transformed in his bed into a horrible vermin٠ He l lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightlh domed and divided by arches ino stiff sections. The bedding was adly able to cover it and and seemed ready to slide off any moment. His many legxs, pitifully thin compared with the size of the the rest of him, waved about helplessly ashe looked looked.
|
53
53
|
|
54
|
+
## Usage
|
55
|
+
|
56
|
+
Glitchlings slot into evaluation pipelines just as easily as they corrupt stray strings.
|
57
|
+
|
58
|
+
- **Direct invocation** – Instantiate a glitchling (or `Gaggle`) and call it on strings, iterables, or datasets. Keep the seed stable to make every run deterministic.
|
59
|
+
- **Dataset corruption** – Use a `Gaggle`'s `.corrupt_dataset` helper to perturb a Hugging Face `datasets.Dataset` and return a corrupted copy for training or evaluation.
|
60
|
+
|
61
|
+
### Prime Intellect environments
|
62
|
+
|
63
|
+
After `pip install -e .[prime]`, the `glitchlings.dlc.prime.load_environment` helper mirrors `verifiers.load_environment` for Prime Intellect scenarios while optionally applying glitchlings before returning the environment:
|
64
|
+
|
65
|
+
```python
|
66
|
+
from glitchlings import Mim1c, Typogre
|
67
|
+
from glitchlings.dlc.prime import load_environment
|
68
|
+
|
69
|
+
env = load_environment(
|
70
|
+
"osoleve/syllabify-en",
|
71
|
+
glitchlings=[Mim1c(replacement_rate=0.01), Typogre(max_change_rate=0.02)],
|
72
|
+
seed=404,
|
73
|
+
)
|
74
|
+
```
|
75
|
+
|
76
|
+
Skip the `glitchlings` argument to receive an untouched verifier dataset.
|
77
|
+
|
54
78
|
## Motivation
|
55
79
|
|
56
80
|
If your model performs well on a particular task, but not when `Glitchling`s are present, it's a sign that it hasn't actually generalized to the problem.
|
@@ -0,0 +1,253 @@
|
|
1
|
+
# Glitchlings Usage Guide
|
2
|
+
|
3
|
+
Welcome to the Glitchlings field manual! This GitHub Pages-ready guide explains how to install the toolkit, orchestrate chaos with the `Gaggle`, and wield every individual glitchling (Typogre, Mim1c, Reduple, Rushmore, Redactyl, Jargoyle, and Scannequin). It closes with deep coverage of the optional Prime Intellect integration so you can perturb verifier datasets with confidence.
|
4
|
+
|
5
|
+
## Table of contents
|
6
|
+
|
7
|
+
1. [Installation](#installation)
|
8
|
+
2. [Quickstart](#quickstart)
|
9
|
+
3. [The Gaggle orchestrator](#the-gaggle-orchestrator)
|
10
|
+
4. [Glitchling reference](#glitchling-reference)
|
11
|
+
- [Typogre](#typogre)
|
12
|
+
- [Mim1c](#mim1c)
|
13
|
+
- [Reduple](#reduple)
|
14
|
+
- [Rushmore](#rushmore)
|
15
|
+
- [Redactyl](#redactyl)
|
16
|
+
- [Jargoyle](#jargoyle)
|
17
|
+
- [Scannequin](#scannequin)
|
18
|
+
5. [Dataset workflows](#dataset-workflows)
|
19
|
+
6. [Prime Intellect integration](#prime-intellect-integration)
|
20
|
+
7. [Ensuring determinism](#ensuring-determinism)
|
21
|
+
8. [Testing checklist](#testing-checklist)
|
22
|
+
9. [Additional resources](#additional-resources)
|
23
|
+
|
24
|
+
## Installation
|
25
|
+
|
26
|
+
Install the latest release directly from PyPI:
|
27
|
+
|
28
|
+
```bash
|
29
|
+
pip install -U glitchlings
|
30
|
+
```
|
31
|
+
|
32
|
+
Need the optional Prime Intellect loader or the NLTK-powered Jargoyle ready to go? Install the `prime` extra and download the WordNet corpus:
|
33
|
+
|
34
|
+
```bash
|
35
|
+
# Prime Intellect DLC + verifiers dependency
|
36
|
+
pip install -U 'glitchlings[prime]'
|
37
|
+
|
38
|
+
# NLTK WordNet corpora for Jargoyle synonym swaps
|
39
|
+
python -m nltk.downloader wordnet
|
40
|
+
```
|
41
|
+
|
42
|
+
### Source install
|
43
|
+
|
44
|
+
When working from a local clone, install in editable mode so your changes take effect immediately:
|
45
|
+
|
46
|
+
```bash
|
47
|
+
pip install -e .
|
48
|
+
```
|
49
|
+
|
50
|
+
If you plan to experiment with the PyO3 acceleration crates, install `maturin` and run `maturin develop` from each crate directory inside the `rust/` folder to compile the optional Rust fast paths.
|
51
|
+
|
52
|
+
## Quickstart
|
53
|
+
|
54
|
+
Glitchlings are callable objects that accept strings (and string-like iterables) and return corrupted copies. Summon a single glitchling or gather multiple into a `Gaggle` to orchestrate compound effects:
|
55
|
+
|
56
|
+
```python
|
57
|
+
from glitchlings import Gaggle, SAMPLE_TEXT, Typogre, Mim1c, Reduple, Rushmore
|
58
|
+
|
59
|
+
gaggle = Gaggle([
|
60
|
+
Typogre(max_change_rate=0.03),
|
61
|
+
Mim1c(replacement_rate=0.02),
|
62
|
+
Reduple(seed=404),
|
63
|
+
Rushmore(max_deletion_rate=0.02),
|
64
|
+
], seed=1234)
|
65
|
+
|
66
|
+
print(gaggle(SAMPLE_TEXT))
|
67
|
+
```
|
68
|
+
|
69
|
+
All glitchlings are deterministic: pass a `seed` during construction (or on the enclosing `Gaggle`) to make the chaos reproducible.
|
70
|
+
|
71
|
+
### Command line interface
|
72
|
+
|
73
|
+
Prefer not to touch Python? The `glitchlings` CLI exposes the same functionality:
|
74
|
+
|
75
|
+
```bash
|
76
|
+
# Discover all built-in glitchlings.
|
77
|
+
glitchlings --list
|
78
|
+
|
79
|
+
# Glitch an entire file with Typogre and inspect the unified diff.
|
80
|
+
glitchlings -g typogre --file documents/report.txt --diff
|
81
|
+
|
82
|
+
# Pipe text through Mim1c for on-the-fly homoglyph swaps.
|
83
|
+
echo "Beware LLM-written flavor-text" | glitchlings -g mim1c
|
84
|
+
```
|
85
|
+
|
86
|
+
Append `--diff` to render a unified diff comparing the original and corrupted outputs. Combine it with `--color=always` in terminals that support ANSI colours to highlight changes more clearly.
|
87
|
+
|
88
|
+
## The Gaggle orchestrator
|
89
|
+
|
90
|
+
The `Gaggle` class coordinates multiple glitchlings with deterministic sequencing and shared seeding:
|
91
|
+
|
92
|
+
- **Seed derivation** – pass `seed=` to `Gaggle(...)` and it will derive per-glitchling seeds via `derive_seed`, keeping results stable across runs while giving each glitchling its own seed instead of reusing a single one.
|
93
|
+
- **Attack scopes & order** – glitchlings declare a scope (`document`, `sentence`, `word`, `character`) and attack order (`early`, `late`, etc.). By default the gaggle sorts by scope, then by order, so character-level edits (Typogre, Mim1c, Scannequin) happen after word-level operations (Reduple, Rushmore, Redactyl, Jargoyle). Override this via `Gaggle([...], attack_order=[...])` when you need bespoke choreography.
|
94
|
+
- **Dynamic configuration** – use `gaggle.set_param("Typogre", "max_change_rate", 0.05)` to tweak nested glitchling parameters without rebuilding the ensemble.
|
95
|
+
- **Dataset utilities** – call `gaggle.corrupt_dataset(dataset, columns=[...])` to clone and perturb Hugging Face datasets while leaving the original untouched. Column inference automatically targets `text`, `prompt`, or similar string columns when none are provided.
|
96
|
+
- **Summoning from shorthand** – `glitchlings.summon` lets you build a gaggle from names or partially-configured objects (`summon(["typogre", Mim1c(replacement_rate=0.01)], seed=404)`).
|
97
|
+
|
98
|
+
## Glitchling reference
|
99
|
+
|
100
|
+
Each glitchling subclasses the shared `Glitchling` base class and exposes the same interface: call the instance with text, adjust parameters via `set_param`, and rely on deterministic seeds. This section summarises every built-in creature, its defaults, and practical usage notes.
|
101
|
+
|
102
|
+
### Typogre
|
103
|
+
|
104
|
+
- **Scope**: character level (early attack order, so it acts before the other character-level glitchlings).
|
105
|
+
- **Signature**: `Typogre(max_change_rate=0.02, keyboard="CURATOR_QWERTY", seed=None)`.
|
106
|
+
- **Behaviour**: simulates fat-finger typing by swapping neighbouring keys, dropping spaces, inserting doubles, or choosing layout-adjacent characters. Keyboard layouts map through `glitchlings.util.KEYNEIGHBORS` and include curated QWERTY, DVORAK, and custom research boards.
|
107
|
+
- **Usage tips**:
|
108
|
+
- Lower `max_change_rate` (0.005–0.01) for gentle noise; raise it for more chaotic misspellings.
|
109
|
+
- Swap to `keyboard="DVORAK"` or supply a custom adjacency dict to model alternative hardware.
|
110
|
+
- Combine with Rushmore deletions to simulate hurried note-taking.
|
111
|
+
|
112
|
+
### Mim1c
|
113
|
+
|
114
|
+
- **Scope**: character level (late attack order so it acts after insertions/deletions).
|
115
|
+
- **Signature**: `Mim1c(replacement_rate=0.02, classes=None, seed=None)`.
|
116
|
+
- **Behaviour**: replaces alphanumeric characters with visually confusable Unicode homoglyphs via `confusable_homoglyphs` (e.g., `A → Α`, `e → е`). When `classes` is omitted it targets Latin, Greek, and Cyrillic scripts; pass `classes="all"` to consider every alias.
|
117
|
+
- **Usage tips**:
|
118
|
+
- Restrict `classes` (e.g., `classes=["LATIN"]`) when evaluation pipelines reject non-Latin scripts.
|
119
|
+
- Keep `replacement_rate` below 0.03 for legible perturbations; higher values can break tokenisers that expect ASCII.
|
120
|
+
- Pairs well with Typogre for keyboard + homoglyph chaos.
|
121
|
+
|
122
|
+
### Reduple
|
123
|
+
|
124
|
+
- **Scope**: word level.
|
125
|
+
- **Signature**: `Reduple(reduplication_rate=0.05, seed=None)`.
|
126
|
+
- **Behaviour**: randomly repeats words (“reduplication”) to mimic stuttering transcripts or speech disfluencies while preserving whitespace and punctuation.
|
127
|
+
- **Usage tips**:
|
128
|
+
- Use `reduplication_rate=0.01` to emulate occasional hesitations; bump to ≥0.08 for heavy repetition stress tests.
|
129
|
+
- Because edits preserve separators, downstream whitespace-sensitive parsers remain stable.
|
130
|
+
- Combine with Jargoyle to mix synonym swaps and repeated words for lexical drift.
|
131
|
+
|
132
|
+
### Rushmore
|
133
|
+
|
134
|
+
- **Scope**: word level.
|
135
|
+
- **Signature**: `Rushmore(max_deletion_rate=0.01, seed=None)`.
|
136
|
+
- **Behaviour**: deletes randomly selected words (skipping the first to preserve context) and tidies double spaces/punctuation afterwards.
|
137
|
+
- **Usage tips**:
|
138
|
+
- Keep `max_deletion_rate` conservative (<0.03) to avoid stripping sentences bare.
|
139
|
+
- Because the first word is preserved, prepend short context sentences when you need deletions deeper in the passage.
|
140
|
+
- Sandwich between Reduple and Redactyl to test summarisation robustness under missing context.
|
141
|
+
|
142
|
+
### Redactyl
|
143
|
+
|
144
|
+
- **Scope**: word level.
|
145
|
+
- **Signature**: `Redactyl(replacement_char="█", redaction_rate=0.05, merge_adjacent=False, seed=151)`.
|
146
|
+
- **Behaviour**: replaces the core characters of selected words with a replacement glyph (default FULL BLOCK) to simulate document redaction. Optionally merges adjacent redaction blocks across punctuation.
|
147
|
+
- **Usage tips**:
|
148
|
+
- Switch `replacement_char` to `_` or `*` when terminals struggle with block glyphs.
|
149
|
+
- Enable `merge_adjacent=True` to form continuous bars when redacting phrases.
|
150
|
+
- When no redactable words exist, the underlying implementation raises a `ValueError`—wrap calls with try/except in automated pipelines.
|
151
|
+
|
152
|
+
### Jargoyle
|
153
|
+
|
154
|
+
- **Scope**: word level.
|
155
|
+
- **Signature**: `Jargoyle(replacement_rate=0.1, part_of_speech="n", seed=None)`.
|
156
|
+
- **Behaviour**: swaps nouns/verbs/adjectives/adverbs with WordNet synonyms. Downloads the WordNet corpus on demand when missing and maintains deterministic sampling by sorting candidate lemmas.
|
157
|
+
- **Usage tips**:
|
158
|
+
- Target specific POS tags (e.g., `part_of_speech=("n", "v")`) to limit changes to content words.
|
159
|
+
- Lower `replacement_rate` (0.02–0.05) for subtle lexical variety; higher rates explore paraphrasing extremes.
|
160
|
+
- Ensure your environment has the WordNet data pre-cached to avoid first-run download delays.
|
161
|
+
|
162
|
+
### Scannequin
|
163
|
+
|
164
|
+
- **Scope**: character level (late order).
|
165
|
+
- **Signature**: `Scannequin(error_rate=0.02, seed=None)`.
|
166
|
+
- **Behaviour**: introduces OCR-style confusion pairs (rn↔m, cl↔d, O↔0, curly quotes to ASCII, etc.) using deterministic span selection. Supports a Rust acceleration path when compiled.
|
167
|
+
- **Usage tips**:
|
168
|
+
- Bump `error_rate` for scanned-document stress tests or reduce it for light OCR noise.
|
169
|
+
- Because replacements can change token length, run Scannequin after word-level glitchlings to avoid offset drift.
|
170
|
+
- Combine with Redactyl to mimic heavily redacted, poorly scanned archives.
|
171
|
+
|
172
|
+
## Dataset workflows
|
173
|
+
|
174
|
+
Leverage the Hugging Face integration to perturb large corpora reproducibly:
|
175
|
+
|
176
|
+
```python
|
177
|
+
from datasets import load_dataset
|
178
|
+
from glitchlings import Gaggle, Typogre, Mim1c
|
179
|
+
|
180
|
+
dataset = load_dataset("ag_news")
|
181
|
+
gaggle = Gaggle([Typogre(max_change_rate=0.02), Mim1c(replacement_rate=0.01)], seed=404)
|
182
|
+
|
183
|
+
corrupted = gaggle.corrupt_dataset(
|
184
|
+
dataset,
|
185
|
+
columns=["text"],
|
186
|
+
description="ag_news with typographic noise",
|
187
|
+
)
|
188
|
+
```
|
189
|
+
|
190
|
+
Key points:
|
191
|
+
|
192
|
+
- When `columns` is omitted, Glitchlings infers targets (`prompt`, `question`, or all string columns) using `_resolve_columns` semantics from the Prime loader.
|
193
|
+
- The returned dataset is a shallow copy containing both clean and corrupted columns—persist it with `corrupted.push_to_hub(...)` or `corrupted.save_to_disk(...)`.
|
194
|
+
- Use dataset-level seeds (`seed=` on the gaggle) so repeated corruptions are stable across machines.
|
195
|
+
|
196
|
+
## Prime Intellect integration
|
197
|
+
|
198
|
+
Installing the `prime` extra exposes `glitchlings.dlc.prime.load_environment`, a convenience wrapper around `verifiers.load_environment` that lets you pre-inject glitchlings into benchmark datasets.
|
199
|
+
|
200
|
+
```python
|
201
|
+
from glitchlings import Mim1c, Typogre
|
202
|
+
from glitchlings.dlc.prime import load_environment, tutorial_level, Difficulty
|
203
|
+
|
204
|
+
# Load an existing environment and apply custom corruption
|
205
|
+
custom_env = load_environment(
|
206
|
+
"osoleve/syllabify-en",
|
207
|
+
glitchlings=[Mim1c(replacement_rate=0.01), Typogre(max_change_rate=0.02)],
|
208
|
+
seed=404,
|
209
|
+
columns=["prompt"], # optional; inferred when omitted
|
210
|
+
)
|
211
|
+
|
212
|
+
# Or bootstrap a difficulty-scaled tutorial environment
|
213
|
+
practice_env = tutorial_level(
|
214
|
+
"osoleve/syllabify-en",
|
215
|
+
difficulty=Difficulty.Hard,
|
216
|
+
)
|
217
|
+
```
|
218
|
+
|
219
|
+
Capabilities at a glance:
|
220
|
+
|
221
|
+
- **Flexible inputs** – pass a string environment slug, an instantiated `verifiers.Environment`, a single glitchling, a list of glitchlings or names, or a pre-built `Gaggle`.
|
222
|
+
- **Column inference** – when `columns` is `None`, the loader looks for `prompt`/`question` columns and, if none are found, falls back to all string-valued columns. Explicitly list columns to target subsets (e.g., prompts but not references).
|
223
|
+
- **Deterministic summoning** – non-`Gaggle` inputs are normalised via `summon(...)` with the provided `seed`, so repeated calls produce matching corruption ensembles.
|
224
|
+
- **Tutorial difficulty scaling** – `tutorial_level` wires in tuned Mim1c/Typogre parameters multiplied by the selected `Difficulty` enum. Use `Difficulty.Easy` for gentle practice or `Difficulty.Extreme` to hammer robustness.
|
225
|
+
- **Dataset mutation** – environments are returned with their dataset replaced by the corrupted clone. Skip the `glitchlings` argument to leave the dataset untouched.
|
226
|
+
|
227
|
+
## Ensuring determinism
|
228
|
+
|
229
|
+
- Derive seeds from the surrounding context (`Gaggle.derive_seed`) when spawning new RNGs.
|
230
|
+
- Stabilise candidate order before sampling subsets to keep runs reproducible.
|
231
|
+
- Use `set_param` to expose tunable values so they can be reset between tests.
|
232
|
+
- When writing new glitchlings, route randomness through the instance RNG rather than module-level state.
|
233
|
+
|
234
|
+
## Testing checklist
|
235
|
+
|
236
|
+
Before publishing changes or documenting new glitchlings, run the Pytest suite from the repository root:
|
237
|
+
|
238
|
+
```bash
|
239
|
+
pytest
|
240
|
+
```
|
241
|
+
|
242
|
+
Some tests require the NLTK WordNet corpus. If you see skips mentioning WordNet, install it with:
|
243
|
+
|
244
|
+
```bash
|
245
|
+
python -c "import nltk; nltk.download('wordnet')"
|
246
|
+
```
|
247
|
+
|
248
|
+
## Additional resources
|
249
|
+
|
250
|
+
- [Monster Manual](../MONSTER_MANUAL.md) – complete bestiary with flavour text.
|
251
|
+
- [Repository README](../README.md) – project overview and ASCII ambience.
|
252
|
+
|
253
|
+
Once the `/docs` folder is published through GitHub Pages, this guide becomes the landing site for your glitchling adventures.
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[project]
|
2
2
|
name = "glitchlings"
|
3
|
-
version = "0.1.2"
|
3
|
+
version = "0.1.4"
|
4
4
|
description = "Monsters for your language games."
|
5
5
|
readme = "README.md"
|
6
6
|
requires-python = ">=3.12"
|
@@ -48,6 +48,7 @@ prime = [
|
|
48
48
|
]
|
49
49
|
dev = [
|
50
50
|
"pytest>=8.0.0",
|
51
|
+
"hypothesis>=6.100.0",
|
51
52
|
]
|
52
53
|
|
53
54
|
[build-system]
|