glitchlings 0.1.3__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings-0.2.0/MANIFEST.in +1 -0
- {glitchlings-0.1.3 → glitchlings-0.2.0}/PKG-INFO +50 -12
- {glitchlings-0.1.3 → glitchlings-0.2.0}/README.md +35 -0
- {glitchlings-0.1.3 → glitchlings-0.2.0}/pyproject.toml +19 -24
- {glitchlings-0.1.3/rust/zoo → glitchlings-0.2.0/rust}/Cargo.lock +7 -0
- glitchlings-0.2.0/rust/Cargo.toml +10 -0
- {glitchlings-0.1.3 → glitchlings-0.2.0}/rust/typogre/Cargo.toml +1 -1
- {glitchlings-0.1.3 → glitchlings-0.2.0}/rust/zoo/Cargo.toml +2 -2
- {glitchlings-0.1.3 → glitchlings-0.2.0}/rust/zoo/src/lib.rs +22 -7
- glitchlings-0.2.0/setup.cfg +7 -0
- glitchlings-0.2.0/src/glitchlings/dlc/__init__.py +5 -0
- glitchlings-0.2.0/src/glitchlings/dlc/huggingface.py +96 -0
- glitchlings-0.2.0/src/glitchlings/dlc/prime.py +252 -0
- {glitchlings-0.1.3 → glitchlings-0.2.0}/src/glitchlings/main.py +4 -2
- {glitchlings-0.1.3 → glitchlings-0.2.0}/src/glitchlings/zoo/core.py +60 -8
- {glitchlings-0.1.3 → glitchlings-0.2.0}/src/glitchlings/zoo/rushmore.py +8 -1
- glitchlings-0.2.0/src/glitchlings.egg-info/PKG-INFO +493 -0
- glitchlings-0.2.0/src/glitchlings.egg-info/SOURCES.txt +46 -0
- glitchlings-0.2.0/src/glitchlings.egg-info/dependency_links.txt +1 -0
- glitchlings-0.2.0/src/glitchlings.egg-info/entry_points.txt +2 -0
- glitchlings-0.2.0/src/glitchlings.egg-info/requires.txt +11 -0
- glitchlings-0.2.0/src/glitchlings.egg-info/top_level.txt +1 -0
- {glitchlings-0.1.3 → glitchlings-0.2.0}/tests/test_cli.py +18 -0
- glitchlings-0.2.0/tests/test_dataset_corruption.py +91 -0
- glitchlings-0.2.0/tests/test_glitchling_core.py +24 -0
- glitchlings-0.2.0/tests/test_huggingface_dlc.py +56 -0
- glitchlings-0.2.0/tests/test_prime_echo_chamber.py +35 -0
- glitchlings-0.2.0/tests/test_property_based.py +93 -0
- {glitchlings-0.1.3 → glitchlings-0.2.0}/tests/test_rust_backed_glitchlings.py +1 -1
- glitchlings-0.2.0/tests/test_util.py +35 -0
- glitchlings-0.1.3/.github/workflows/publish.yml +0 -41
- glitchlings-0.1.3/.gitignore +0 -17
- glitchlings-0.1.3/AGENTS.md +0 -55
- glitchlings-0.1.3/MONSTER_MANUAL.md +0 -271
- glitchlings-0.1.3/rust/typogre/Cargo.lock +0 -295
- glitchlings-0.1.3/src/glitchlings/dlc/__init__.py +0 -0
- glitchlings-0.1.3/src/glitchlings/dlc/prime.py +0 -52
- glitchlings-0.1.3/tests/conftest.py +0 -8
- glitchlings-0.1.3/tests/test_dataset_corruption.py +0 -51
- {glitchlings-0.1.3 → glitchlings-0.2.0}/LICENSE +0 -0
- {glitchlings-0.1.3 → glitchlings-0.2.0}/rust/typogre/src/lib.rs +0 -0
- {glitchlings-0.1.3 → glitchlings-0.2.0}/src/glitchlings/__init__.py +0 -0
- {glitchlings-0.1.3 → glitchlings-0.2.0}/src/glitchlings/__main__.py +0 -0
- {glitchlings-0.1.3 → glitchlings-0.2.0}/src/glitchlings/util/__init__.py +0 -0
- {glitchlings-0.1.3 → glitchlings-0.2.0}/src/glitchlings/zoo/__init__.py +0 -0
- {glitchlings-0.1.3 → glitchlings-0.2.0}/src/glitchlings/zoo/jargoyle.py +0 -0
- {glitchlings-0.1.3 → glitchlings-0.2.0}/src/glitchlings/zoo/mim1c.py +0 -0
- {glitchlings-0.1.3 → glitchlings-0.2.0}/src/glitchlings/zoo/redactyl.py +0 -0
- {glitchlings-0.1.3 → glitchlings-0.2.0}/src/glitchlings/zoo/reduple.py +0 -0
- {glitchlings-0.1.3 → glitchlings-0.2.0}/src/glitchlings/zoo/scannequin.py +0 -0
- {glitchlings-0.1.3 → glitchlings-0.2.0}/src/glitchlings/zoo/typogre.py +0 -0
- {glitchlings-0.1.3 → glitchlings-0.2.0}/tests/test_gaggle.py +0 -0
- {glitchlings-0.1.3 → glitchlings-0.2.0}/tests/test_glitchlings_determinism.py +0 -0
- {glitchlings-0.1.3 → glitchlings-0.2.0}/tests/test_jargoyle.py +0 -0
- {glitchlings-0.1.3 → glitchlings-0.2.0}/tests/test_keyboard_layouts.py +0 -0
- {glitchlings-0.1.3 → glitchlings-0.2.0}/tests/test_parameter_effects.py +0 -0
@@ -0,0 +1 @@
|
|
1
|
+
recursive-include rust *
|
@@ -1,11 +1,7 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: glitchlings
|
3
|
-
Version: 0.1.3
|
3
|
+
Version: 0.2.0
|
4
4
|
Summary: Monsters for your language games.
|
5
|
-
Project-URL: Homepage, https://github.com/osoleve/glitchlings
|
6
|
-
Project-URL: Repository, https://github.com/osoleve/glitchlings.git
|
7
|
-
Project-URL: Issues, https://github.com/osoleve/glitchlings/issues
|
8
|
-
Project-URL: Changelog, https://github.com/osoleve/glitchlings/releases
|
9
5
|
Author: osoleve
|
10
6
|
License: Apache License
|
11
7
|
Version 2.0, January 2004
|
@@ -208,27 +204,34 @@ License: Apache License
|
|
208
204
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
209
205
|
See the License for the specific language governing permissions and
|
210
206
|
limitations under the License.
|
211
|
-
|
212
|
-
|
207
|
+
|
208
|
+
Project-URL: Homepage, https://github.com/osoleve/glitchlings
|
209
|
+
Project-URL: Repository, https://github.com/osoleve/glitchlings.git
|
210
|
+
Project-URL: Issues, https://github.com/osoleve/glitchlings/issues
|
211
|
+
Project-URL: Changelog, https://github.com/osoleve/glitchlings/releases
|
212
|
+
Keywords: nlp,text,adversarial augmentation,text augmentation
|
213
213
|
Classifier: Development Status :: 3 - Alpha
|
214
214
|
Classifier: Intended Audience :: Developers
|
215
215
|
Classifier: License :: OSI Approved :: Apache Software License
|
216
|
-
Classifier: Operating System :: OS Independent
|
217
216
|
Classifier: Programming Language :: Python
|
218
217
|
Classifier: Programming Language :: Python :: 3
|
219
218
|
Classifier: Programming Language :: Python :: 3.12
|
219
|
+
Classifier: Operating System :: OS Independent
|
220
220
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
221
221
|
Classifier: Topic :: Software Development :: Testing
|
222
222
|
Requires-Python: >=3.12
|
223
|
+
Description-Content-Type: text/markdown
|
224
|
+
License-File: LICENSE
|
223
225
|
Requires-Dist: confusable-homoglyphs>=3.3.1
|
224
226
|
Requires-Dist: datasets>=4.0.0
|
225
227
|
Requires-Dist: jellyfish>=1.2.0
|
226
228
|
Requires-Dist: nltk>=3.9.1
|
227
|
-
Provides-Extra: dev
|
228
|
-
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
229
229
|
Provides-Extra: prime
|
230
|
-
Requires-Dist: verifiers>=0.1.3.post0; extra == 'prime'
|
231
|
-
|
230
|
+
Requires-Dist: verifiers>=0.1.3.post0; extra == "prime"
|
231
|
+
Provides-Extra: dev
|
232
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
233
|
+
Requires-Dist: hypothesis>=6.140.0; extra == "dev"
|
234
|
+
Dynamic: license-file
|
232
235
|
|
233
236
|
#
|
234
237
|
|
@@ -283,6 +286,41 @@ print(gaggle(SAMPLE_TEXT))
|
|
283
286
|
|
284
287
|
> Onҽ mھrning, wһen Gregor Samƽa woke from trouble𝐝 𝑑reams, he found himself transformed in his bed into a horrible vermin٠ He l lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightlh domed and divided by arches ino stiff sections. The bedding was adly able to cover it and and seemed ready to slide off any moment. His many legxs, pitifully thin compared with the size of the the rest of him, waved about helplessly ashe looked looked.
|
285
288
|
|
289
|
+
## Usage
|
290
|
+
|
291
|
+
Glitchlings slot into evaluation pipelines just as easily as they corrupt stray strings.
|
292
|
+
|
293
|
+
- **Direct invocation** – Instantiate a glitchling (or `Gaggle`) and call it on strings, iterables, or datasets. Keep the seed stable to make every run deterministic.
|
294
|
+
- **Dataset corruption** – After ``import glitchlings.dlc.huggingface``, call ``Dataset.glitch(...)`` (or a `Gaggle`'s `.corrupt_dataset`) to perturb a Hugging Face `datasets.Dataset` and return a corrupted copy for training or evaluation.
|
295
|
+
|
296
|
+
### Prime Intellect environments
|
297
|
+
|
298
|
+
After `pip install -e .[prime]`, the `glitchlings.dlc.prime.load_environment` helper mirrors `verifiers.load_environment` for Prime Intellect scenarios while optionally applying glitchlings before returning the environment:
|
299
|
+
|
300
|
+
```python
|
301
|
+
from glitchlings import Mim1c, Typogre
|
302
|
+
from glitchlings.dlc.prime import echo_chamber, load_environment
|
303
|
+
|
304
|
+
env = load_environment(
|
305
|
+
"osoleve/syllabify-en",
|
306
|
+
glitchlings=[Mim1c(replacement_rate=0.01), Typogre(max_change_rate=0.02)],
|
307
|
+
seed=404,
|
308
|
+
)
|
309
|
+
|
310
|
+
# Spin up an echo chamber that corrupts a dataset column and
|
311
|
+
# rewards models for perfectly restoring it
|
312
|
+
practice_env = echo_chamber(
|
313
|
+
"osoleve/clean-room",
|
314
|
+
column="text",
|
315
|
+
glitchlings=["Typogre", "Mim1c"],
|
316
|
+
reward_function=lambda prompt, completion, answer: float(completion == answer),
|
317
|
+
)
|
318
|
+
```
|
319
|
+
|
320
|
+
Skip the `glitchlings` argument to receive an untouched verifier dataset, and
|
321
|
+
override `reward_function` when you want to evaluate completions with a custom
|
322
|
+
scoring routine.
|
323
|
+
|
286
324
|
## Motivation
|
287
325
|
|
288
326
|
If your model performs well on a particular task, but not when `Glitchling`s are present, it's a sign that it hasn't actually generalized to the problem.
|
@@ -51,6 +51,41 @@ print(gaggle(SAMPLE_TEXT))
|
|
51
51
|
|
52
52
|
> Onҽ mھrning, wһen Gregor Samƽa woke from trouble𝐝 𝑑reams, he found himself transformed in his bed into a horrible vermin٠ He l lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightlh domed and divided by arches ino stiff sections. The bedding was adly able to cover it and and seemed ready to slide off any moment. His many legxs, pitifully thin compared with the size of the the rest of him, waved about helplessly ashe looked looked.
|
53
53
|
|
54
|
+
## Usage
|
55
|
+
|
56
|
+
Glitchlings slot into evaluation pipelines just as easily as they corrupt stray strings.
|
57
|
+
|
58
|
+
- **Direct invocation** – Instantiate a glitchling (or `Gaggle`) and call it on strings, iterables, or datasets. Keep the seed stable to make every run deterministic.
|
59
|
+
- **Dataset corruption** – After ``import glitchlings.dlc.huggingface``, call ``Dataset.glitch(...)`` (or a `Gaggle`'s `.corrupt_dataset`) to perturb a Hugging Face `datasets.Dataset` and return a corrupted copy for training or evaluation.
|
60
|
+
|
61
|
+
### Prime Intellect environments
|
62
|
+
|
63
|
+
After `pip install -e .[prime]`, the `glitchlings.dlc.prime.load_environment` helper mirrors `verifiers.load_environment` for Prime Intellect scenarios while optionally applying glitchlings before returning the environment:
|
64
|
+
|
65
|
+
```python
|
66
|
+
from glitchlings import Mim1c, Typogre
|
67
|
+
from glitchlings.dlc.prime import echo_chamber, load_environment
|
68
|
+
|
69
|
+
env = load_environment(
|
70
|
+
"osoleve/syllabify-en",
|
71
|
+
glitchlings=[Mim1c(replacement_rate=0.01), Typogre(max_change_rate=0.02)],
|
72
|
+
seed=404,
|
73
|
+
)
|
74
|
+
|
75
|
+
# Spin up an echo chamber that corrupts a dataset column and
|
76
|
+
# rewards models for perfectly restoring it
|
77
|
+
practice_env = echo_chamber(
|
78
|
+
"osoleve/clean-room",
|
79
|
+
column="text",
|
80
|
+
glitchlings=["Typogre", "Mim1c"],
|
81
|
+
reward_function=lambda prompt, completion, answer: float(completion == answer),
|
82
|
+
)
|
83
|
+
```
|
84
|
+
|
85
|
+
Skip the `glitchlings` argument to receive an untouched verifier dataset, and
|
86
|
+
override `reward_function` when you want to evaluate completions with a custom
|
87
|
+
scoring routine.
|
88
|
+
|
54
89
|
## Motivation
|
55
90
|
|
56
91
|
If your model performs well on a particular task, but not when `Glitchling`s are present, it's a sign that it hasn't actually generalized to the problem.
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[project]
|
2
2
|
name = "glitchlings"
|
3
|
-
version = "0.1.3"
|
3
|
+
version = "0.2.0"
|
4
4
|
description = "Monsters for your language games."
|
5
5
|
readme = "README.md"
|
6
6
|
requires-python = ">=3.12"
|
@@ -48,35 +48,30 @@ prime = [
|
|
48
48
|
]
|
49
49
|
dev = [
|
50
50
|
"pytest>=8.0.0",
|
51
|
+
"hypothesis>=6.140.0",
|
51
52
|
]
|
52
53
|
|
53
54
|
[build-system]
|
54
|
-
requires = ["
|
55
|
-
build-backend = "
|
55
|
+
requires = ["setuptools>=64", "wheel", "setuptools-rust>=1.8"]
|
56
|
+
build-backend = "setuptools.build_meta"
|
56
57
|
|
57
|
-
[tool.
|
58
|
-
|
59
|
-
"**/__pycache__/**",
|
60
|
-
".git/**",
|
61
|
-
".venv/**",
|
62
|
-
]
|
58
|
+
[tool.setuptools]
|
59
|
+
package-dir = {"" = "src"}
|
63
60
|
|
64
|
-
[tool.
|
65
|
-
|
66
|
-
"**/__pycache__/**",
|
67
|
-
".git/**",
|
68
|
-
".venv/**",
|
69
|
-
]
|
61
|
+
[tool.setuptools.packages.find]
|
62
|
+
where = ["src"]
|
70
63
|
|
71
|
-
[tool.
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
64
|
+
[[tool.setuptools-rust.ext-modules]]
|
65
|
+
target = "glitchlings._zoo_rust"
|
66
|
+
path = "rust/zoo/Cargo.toml"
|
67
|
+
binding = "PyO3"
|
68
|
+
debug = false
|
69
|
+
|
70
|
+
[[tool.setuptools-rust.ext-modules]]
|
71
|
+
target = "glitchlings._typogre_rust"
|
72
|
+
path = "rust/typogre/Cargo.toml"
|
73
|
+
binding = "PyO3"
|
74
|
+
debug = false
|
80
75
|
|
81
76
|
[tool.pytest.ini_options]
|
82
77
|
pythonpath = [
|
@@ -255,6 +255,13 @@ version = "0.12.16"
|
|
255
255
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
256
256
|
checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
|
257
257
|
|
258
|
+
[[package]]
|
259
|
+
name = "typogre_rust"
|
260
|
+
version = "0.1.0"
|
261
|
+
dependencies = [
|
262
|
+
"pyo3",
|
263
|
+
]
|
264
|
+
|
258
265
|
[[package]]
|
259
266
|
name = "unicode-ident"
|
260
267
|
version = "1.0.19"
|
@@ -8,8 +8,8 @@ name = "_zoo_rust"
|
|
8
8
|
crate-type = ["cdylib"]
|
9
9
|
|
10
10
|
[dependencies]
|
11
|
-
pyo3 = {
|
12
|
-
regex =
|
11
|
+
pyo3 = { workspace = true }
|
12
|
+
regex = { workspace = true }
|
13
13
|
|
14
14
|
[package.metadata.maturin]
|
15
15
|
module-name = "glitchlings._zoo_rust"
|
@@ -116,20 +116,35 @@ fn delete_random_words(
|
|
116
116
|
}
|
117
117
|
|
118
118
|
let mut tokens = split_with_separators(text);
|
119
|
+
let mut candidate_indices: Vec<usize> = Vec::new();
|
119
120
|
let mut i = 2;
|
120
121
|
while i < tokens.len() {
|
121
|
-
let word = tokens[i]
|
122
|
-
if word.is_empty()
|
123
|
-
i
|
124
|
-
|
122
|
+
let word = &tokens[i];
|
123
|
+
if !word.is_empty() && !is_whitespace_only(word) {
|
124
|
+
candidate_indices.push(i);
|
125
|
+
}
|
126
|
+
i += 2;
|
127
|
+
}
|
128
|
+
|
129
|
+
let allowed = ((candidate_indices.len() as f64) * max_deletion_rate).floor() as usize;
|
130
|
+
if allowed == 0 {
|
131
|
+
return Ok(text.to_string());
|
132
|
+
}
|
133
|
+
|
134
|
+
let mut deletions = 0;
|
135
|
+
for idx in candidate_indices {
|
136
|
+
if deletions >= allowed {
|
137
|
+
break;
|
125
138
|
}
|
139
|
+
|
140
|
+
let word = tokens[idx].clone();
|
126
141
|
if random_unit(rng)? < max_deletion_rate {
|
127
142
|
let (prefix, _, suffix) = split_affixes(&word);
|
128
143
|
let trimmed_prefix = prefix.trim();
|
129
144
|
let trimmed_suffix = suffix.trim();
|
130
|
-
tokens[
|
145
|
+
tokens[idx] = format!("{trimmed_prefix}{trimmed_suffix}");
|
146
|
+
deletions += 1;
|
131
147
|
}
|
132
|
-
i += 2;
|
133
148
|
}
|
134
149
|
|
135
150
|
let mut joined = tokens.concat();
|
@@ -283,7 +298,7 @@ fn redact_words(
|
|
283
298
|
|
284
299
|
if word_indices.is_empty() {
|
285
300
|
return Err(pyo3::exceptions::PyValueError::new_err(
|
286
|
-
"
|
301
|
+
"Cannot redact words because the input text contains no redactable words.",
|
287
302
|
));
|
288
303
|
}
|
289
304
|
|
@@ -0,0 +1,96 @@
|
|
1
|
+
"""Integration helpers for the Hugging Face datasets library."""
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
from collections.abc import Iterable, Sequence
|
6
|
+
from typing import Any
|
7
|
+
|
8
|
+
try: # pragma: no cover - optional dependency is required at runtime
|
9
|
+
from datasets import Dataset as _DatasetsDataset
|
10
|
+
except ModuleNotFoundError as _datasets_error: # pragma: no cover - optional dependency
|
11
|
+
_DatasetsDataset = None # type: ignore[assignment]
|
12
|
+
else:
|
13
|
+
_datasets_error = None
|
14
|
+
|
15
|
+
from ..zoo import Gaggle, Glitchling, summon
|
16
|
+
|
17
|
+
|
18
|
+
def _normalise_columns(column: str | Sequence[str]) -> list[str]:
|
19
|
+
"""Normalise a column specification to a list."""
|
20
|
+
|
21
|
+
if isinstance(column, str):
|
22
|
+
return [column]
|
23
|
+
|
24
|
+
normalised = list(column)
|
25
|
+
if not normalised:
|
26
|
+
raise ValueError("At least one column must be specified")
|
27
|
+
return normalised
|
28
|
+
|
29
|
+
|
30
|
+
def _as_gaggle(
    glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling],
    seed: int,
) -> Gaggle:
    """Build a :class:`Gaggle` from any accepted glitchling specification.

    An existing ``Gaggle`` is returned unchanged, so ``seed`` is not applied
    to it. A single ``Glitchling`` or name string is wrapped in a list and any
    other iterable is materialised into a list; that list is handed to
    :func:`summon` together with ``seed``.
    """
    if isinstance(glitchlings, Gaggle):
        return glitchlings

    is_single = isinstance(glitchlings, (Glitchling, str))
    members = [glitchlings] if is_single else list(glitchlings)
    return summon(members, seed=seed)
|
42
|
+
|
43
|
+
|
44
|
+
def _glitch_dataset(
    dataset: Any,
    glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling],
    column: str | Sequence[str],
    *,
    seed: int = 151,
) -> Any:
    """Corrupt the named column(s) of ``dataset``; backs :meth:`Dataset.glitch`.

    The column specification is normalised to a list first, then the
    glitchling specification is coerced to a :class:`Gaggle` using ``seed``.
    The value returned is whatever that gaggle's ``corrupt_dataset`` returns.
    """
    # Validate the columns before building the gaggle, so column errors surface first.
    target_columns = _normalise_columns(column)
    return _as_gaggle(glitchlings, seed=seed).corrupt_dataset(dataset, target_columns)
|
56
|
+
|
57
|
+
|
58
|
+
def _ensure_dataset_class() -> Any:
    """Return the Hugging Face :class:`~datasets.Dataset`, adding ``.glitch`` if absent.

    Raises:
        ModuleNotFoundError: If ``datasets`` could not be imported; the
            original import error is chained as the cause.
    """
    if _DatasetsDataset is None:  # pragma: no cover - datasets is an install-time dependency
        raise ModuleNotFoundError("datasets is not installed") from _datasets_error

    # An existing non-None ``glitch`` attribute is left alone, so repeated calls are no-ops.
    if getattr(_DatasetsDataset, "glitch", None) is not None:
        return _DatasetsDataset

    def glitch(  # type: ignore[override]
        self: Any,
        glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling],
        *,
        column: str | Sequence[str],
        seed: int = 151,
        **_: Any,
    ) -> Any:
        """Corrupt ``column`` of this dataset with ``glitchlings`` and return the result.

        Any additional keyword arguments are accepted and ignored.
        """
        return _glitch_dataset(self, glitchlings, column, seed=seed)

    setattr(_DatasetsDataset, "glitch", glitch)
    return _DatasetsDataset
|
82
|
+
|
83
|
+
|
84
|
+
def install() -> None:
    """Make sure :class:`~datasets.Dataset` carries the ``.glitch`` method.

    Delegates to :func:`_ensure_dataset_class` and discards the class it
    returns; that helper raises ``ModuleNotFoundError`` when ``datasets`` is
    not importable.
    """
    _ = _ensure_dataset_class()
|
88
|
+
|
89
|
+
|
90
|
+
# Patch at import time: importing this module is enough to make ``Dataset.glitch``
# available. ``Dataset`` is ``None`` when the ``datasets`` import failed.
Dataset = (
    _ensure_dataset_class()
    if _DatasetsDataset is not None
    else None  # pragma: no cover - datasets is an install-time dependency
)


__all__ = ["Dataset", "install"]
|
@@ -0,0 +1,252 @@
|
|
1
|
+
"""Integration helpers for the optional verifiers prime DLC."""
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
from collections.abc import Iterable, Sequence
|
6
|
+
from enum import Enum
|
7
|
+
from typing import Any, Callable
|
8
|
+
|
9
|
+
import verifiers as vf
|
10
|
+
|
11
|
+
from jellyfish import damerau_levenshtein_distance
|
12
|
+
|
13
|
+
try:
|
14
|
+
from .huggingface import Dataset
|
15
|
+
except ModuleNotFoundError: # pragma: no cover - optional dependency
|
16
|
+
Dataset = object # type: ignore[assignment]
|
17
|
+
else:
|
18
|
+
if Dataset is None: # pragma: no cover - optional dependency
|
19
|
+
Dataset = object # type: ignore[assignment]
|
20
|
+
|
21
|
+
from ..zoo import Gaggle, Glitchling, Mim1c, Typogre, summon
|
22
|
+
|
23
|
+
|
24
|
+
def _resolve_environment(env: str | vf.Environment) -> vf.Environment:
    """Return ``env`` as a verifier environment, loading it first when given as a string.

    Raises:
        TypeError: If the resulting object is not a ``vf.Environment``.
    """
    candidate = vf.load_environment(env) if isinstance(env, str) else env
    if isinstance(candidate, vf.Environment):
        return candidate
    raise TypeError("Invalid environment type")
|
34
|
+
|
35
|
+
|
36
|
+
def _resolve_columns(dataset: Dataset, columns: Sequence[str] | None) -> list[str]:
|
37
|
+
"""Identify which dataset columns should be corrupted."""
|
38
|
+
|
39
|
+
available = set(dataset.column_names)
|
40
|
+
|
41
|
+
if columns is not None:
|
42
|
+
missing = sorted(set(columns) - available)
|
43
|
+
if missing:
|
44
|
+
missing_str = ", ".join(missing)
|
45
|
+
raise ValueError(f"Columns not found in dataset: {missing_str}")
|
46
|
+
return list(columns)
|
47
|
+
|
48
|
+
for candidate in ("prompt", "question"):
|
49
|
+
if candidate in available:
|
50
|
+
return [candidate]
|
51
|
+
|
52
|
+
sample = dataset[0] if len(dataset) else {}
|
53
|
+
inferred = [
|
54
|
+
name
|
55
|
+
for name in dataset.column_names
|
56
|
+
if isinstance(sample.get(name), str)
|
57
|
+
]
|
58
|
+
|
59
|
+
if inferred:
|
60
|
+
return inferred
|
61
|
+
|
62
|
+
raise ValueError("Unable to determine which dataset columns to corrupt.")
|
63
|
+
|
64
|
+
|
65
|
+
class Difficulty(Enum):
    """Corruption-rate multipliers for tutorial environments.

    ``tutorial_level`` multiplies its base Mim1c and Typogre rates by the
    selected member's value.
    """

    Easy = 0.25
    Normal = 1.0  # leaves the base rates unchanged
    Hard = 1.75
    Extreme = 3
    Impossible = 9
|
73
|
+
|
74
|
+
|
75
|
+
def tutorial_level(
    env: vf.Environment | str,
    seed: int = 151,
    difficulty: Difficulty = Difficulty.Normal,
) -> vf.Environment:
    """Load ``env`` corrupted by a Mim1c/Typogre pair scaled by ``difficulty``.

    The base rates are 0.01 for ``Mim1c.replacement_rate`` and 0.025 for
    ``Typogre.max_change_rate``; each is multiplied by ``difficulty.value``.
    The pair and ``seed`` are forwarded to :func:`load_environment`.
    """
    scale = difficulty.value
    return load_environment(
        env,
        glitchlings=[
            Mim1c(replacement_rate=0.01 * scale),
            Typogre(max_change_rate=0.025 * scale),
        ],
        seed=seed,
    )
|
90
|
+
|
91
|
+
|
92
|
+
def load_environment(
    env: str | vf.Environment,
    glitchlings: Iterable[str | Glitchling] | Glitchling | str | Gaggle | None = None,
    *,
    seed: int = 151,
    columns: Sequence[str] | None = None,
) -> vf.Environment:
    """Load an environment and optionally corrupt it with glitchlings.

    Args:
        env: Environment instance, or an identifier passed to
            ``vf.load_environment``.
        glitchlings: Glitchling specification to apply. ``None`` returns the
            environment untouched.
        seed: Seed forwarded to :func:`summon` when a gaggle has to be built;
            an existing ``Gaggle`` is used as-is.
        columns: Dataset columns to corrupt. ``None`` lets
            :func:`_resolve_columns` infer them.

    Returns:
        The resolved environment. When glitchlings are supplied its
        ``dataset`` attribute is replaced with the corrupted dataset, so an
        environment object passed in by the caller is modified in place.

    Raises:
        TypeError: If ``env`` does not resolve to a ``vf.Environment``.
        ValueError: If the environment has no dataset, or the columns cannot
            be resolved.
    """
    environment = _resolve_environment(env)
    if glitchlings is None:
        return environment

    # Reuse the module's shared coercion helper instead of repeating its branches here.
    gaggle = _as_gaggle(glitchlings, seed=seed)

    dataset = environment.dataset
    if dataset is None:
        # Fail with a clear message rather than an AttributeError from column resolution.
        raise ValueError("Environment has no dataset to corrupt.")

    environment.dataset = gaggle.corrupt_dataset(dataset, _resolve_columns(dataset, columns))
    return environment
|
120
|
+
|
121
|
+
|
122
|
+
def _as_gaggle(
    glitchlings: Iterable[str | Glitchling] | Glitchling | str | Gaggle,
    *,
    seed: int,
) -> Gaggle:
    """Build a :class:`Gaggle` from any accepted glitchling specification.

    An existing ``Gaggle`` is returned unchanged, so ``seed`` is not applied
    to it. A single ``Glitchling`` or name string is wrapped in a list and any
    other iterable is materialised into a list; that list is handed to
    :func:`summon` together with ``seed``.
    """
    if isinstance(glitchlings, Gaggle):
        return glitchlings

    is_single = isinstance(glitchlings, (Glitchling, str))
    members = [glitchlings] if is_single else list(glitchlings)
    return summon(members, seed=seed)
|
138
|
+
|
139
|
+
|
140
|
+
def _extract_completion_text(completion: Any) -> str:
|
141
|
+
"""Normalise a completion payload into a plain string."""
|
142
|
+
|
143
|
+
if isinstance(completion, str):
|
144
|
+
return completion
|
145
|
+
|
146
|
+
if isinstance(completion, list) and completion:
|
147
|
+
first = completion[0]
|
148
|
+
if isinstance(first, dict) and "content" in first:
|
149
|
+
return str(first["content"])
|
150
|
+
return str(first)
|
151
|
+
|
152
|
+
return str(completion)
|
153
|
+
|
154
|
+
|
155
|
+
def symmetric_damerau_levenshtein_similarity(
    _: Any,
    completion: Any,
    answer: str,
) -> float:
    """Score ``completion`` against ``answer`` as ``1 - distance / max_len``, clamped to [0, 1].

    The first positional argument is ignored. ``distance`` is the
    Damerau-Levenshtein distance between the extracted completion text and the
    answer (a falsy answer counts as ``""``); ``max_len`` is the longer of the
    two lengths, with a floor of 1 so the division is always defined.
    """
    candidate = _extract_completion_text(completion)
    reference = answer if answer else ""
    longest = max(len(candidate), len(reference), 1)
    edits = damerau_levenshtein_distance(candidate, reference)
    capped = min(1.0, 1.0 - (edits / longest))
    return max(0.0, capped)
|
168
|
+
|
169
|
+
|
170
|
+
# Default system message for ``echo_chamber`` prompts when ``instructions`` is not supplied.
DEFAULT_CLEANUP_INSTRUCTIONS = (
    "You are a meticulous copy editor. "
    "Restore the provided text to its original form."
)
|
173
|
+
|
174
|
+
|
175
|
+
def echo_chamber(
    dataset_id: str,
    column: str,
    glitchlings: Iterable[str | Glitchling] | Glitchling | str | Gaggle,
    *,
    seed: int = 151,
    instructions: str = DEFAULT_CLEANUP_INSTRUCTIONS,
    reward_function: Callable[..., float] | None = None,
    split: str | None = None,
    **load_dataset_kwargs: Any,
) -> vf.Environment:
    """Build a single-turn text-restoration environment from one Hugging Face dataset column.

    Every non-``None`` value in ``column`` is stringified and used twice: as
    the answer, and wrapped in a system/user message pair as the prompt. The
    ``prompt`` column is then passed through the gaggle's ``corrupt_dataset``.

    Args:
        dataset_id: Identifier of the Hugging Face dataset to load.
        column: Name of the column whose text should be glitched.
        glitchlings: Glitchling specifiers that will corrupt the prompts.
        seed: RNG seed forwarded to :func:`summon`.
        instructions: System instructions supplied to the environment prompts.
        reward_function: Optional callable used to score completions. Defaults to
            :func:`symmetric_damerau_levenshtein_similarity` when omitted.
        split: Optional dataset split to load. When omitted and the dataset
            loads as a ``DatasetDict``, its first split is used.
        **load_dataset_kwargs: Extra keyword arguments forwarded to
            :func:`datasets.load_dataset`.

    Returns:
        A ``vf.SingleTurnEnv`` over the glitched dataset with a single-function rubric.

    Raises:
        ModuleNotFoundError: If ``datasets`` is not installed.
        ValueError: If the dataset has no splits, still loads as a
            ``DatasetDict`` after a split was requested, or ``column`` yields
            no values.
    """
    try:
        from datasets import Dataset as HFDataset, DatasetDict, load_dataset
    except ModuleNotFoundError as exc:  # pragma: no cover - optional dependency
        raise ModuleNotFoundError("datasets is required to build an echo chamber") from exc

    hf_dataset: HFDataset | DatasetDict
    if split is not None:
        hf_dataset = load_dataset(dataset_id, split=split, **load_dataset_kwargs)
    else:
        hf_dataset = load_dataset(dataset_id, **load_dataset_kwargs)
        if isinstance(hf_dataset, DatasetDict):
            # No split requested: fall back to the first split in the dict.
            try:
                hf_dataset = next(iter(hf_dataset.values()))
            except StopIteration as exc:  # pragma: no cover - defensive
                raise ValueError("The specified dataset does not contain any splits") from exc

    if isinstance(hf_dataset, DatasetDict):
        raise ValueError(
            "Specify which split to use when the dataset loads as a DatasetDict."
        )

    # Rows without a value for ``column`` are skipped entirely.
    answers: list[str] = [
        str(value)
        for value in (row.get(column) for row in hf_dataset)
        if value is not None
    ]
    if not answers:
        raise ValueError(
            f"Column '{column}' did not yield any textual entries in dataset '{dataset_id}'."
        )

    prompts: list[list[dict[str, str]]] = [
        [
            {"role": "system", "content": instructions},
            {"role": "user", "content": f"Corrupted text:\n{text}"},
        ]
        for text in answers
    ]

    dataset = HFDataset.from_dict({"prompt": prompts, "answer": answers})
    glitched_dataset = _as_gaggle(glitchlings, seed=seed).corrupt_dataset(dataset, ["prompt"])

    scorer = reward_function or symmetric_damerau_levenshtein_similarity
    return vf.SingleTurnEnv(
        dataset=glitched_dataset,
        rubric=vf.Rubric(funcs=[scorer], weights=[1.0]),
    )
|
@@ -124,8 +124,10 @@ def read_text(args: argparse.Namespace, parser: argparse.ArgumentParser) -> str:
|
|
124
124
|
if args.file is not None:
|
125
125
|
try:
|
126
126
|
return args.file.read_text(encoding="utf-8")
|
127
|
-
except OSError as exc:
|
128
|
-
|
127
|
+
except OSError as exc:
|
128
|
+
filename = getattr(exc, "filename", None) or args.file
|
129
|
+
reason = exc.strerror or str(exc)
|
130
|
+
parser.error(f"Failed to read file {filename}: {reason}")
|
129
131
|
|
130
132
|
if args.text:
|
131
133
|
return args.text
|