glitchlings 0.1.3__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. glitchlings-0.2.0/MANIFEST.in +1 -0
  2. {glitchlings-0.1.3 → glitchlings-0.2.0}/PKG-INFO +50 -12
  3. {glitchlings-0.1.3 → glitchlings-0.2.0}/README.md +35 -0
  4. {glitchlings-0.1.3 → glitchlings-0.2.0}/pyproject.toml +19 -24
  5. {glitchlings-0.1.3/rust/zoo → glitchlings-0.2.0/rust}/Cargo.lock +7 -0
  6. glitchlings-0.2.0/rust/Cargo.toml +10 -0
  7. {glitchlings-0.1.3 → glitchlings-0.2.0}/rust/typogre/Cargo.toml +1 -1
  8. {glitchlings-0.1.3 → glitchlings-0.2.0}/rust/zoo/Cargo.toml +2 -2
  9. {glitchlings-0.1.3 → glitchlings-0.2.0}/rust/zoo/src/lib.rs +22 -7
  10. glitchlings-0.2.0/setup.cfg +7 -0
  11. glitchlings-0.2.0/src/glitchlings/dlc/__init__.py +5 -0
  12. glitchlings-0.2.0/src/glitchlings/dlc/huggingface.py +96 -0
  13. glitchlings-0.2.0/src/glitchlings/dlc/prime.py +252 -0
  14. {glitchlings-0.1.3 → glitchlings-0.2.0}/src/glitchlings/main.py +4 -2
  15. {glitchlings-0.1.3 → glitchlings-0.2.0}/src/glitchlings/zoo/core.py +60 -8
  16. {glitchlings-0.1.3 → glitchlings-0.2.0}/src/glitchlings/zoo/rushmore.py +8 -1
  17. glitchlings-0.2.0/src/glitchlings.egg-info/PKG-INFO +493 -0
  18. glitchlings-0.2.0/src/glitchlings.egg-info/SOURCES.txt +46 -0
  19. glitchlings-0.2.0/src/glitchlings.egg-info/dependency_links.txt +1 -0
  20. glitchlings-0.2.0/src/glitchlings.egg-info/entry_points.txt +2 -0
  21. glitchlings-0.2.0/src/glitchlings.egg-info/requires.txt +11 -0
  22. glitchlings-0.2.0/src/glitchlings.egg-info/top_level.txt +1 -0
  23. {glitchlings-0.1.3 → glitchlings-0.2.0}/tests/test_cli.py +18 -0
  24. glitchlings-0.2.0/tests/test_dataset_corruption.py +91 -0
  25. glitchlings-0.2.0/tests/test_glitchling_core.py +24 -0
  26. glitchlings-0.2.0/tests/test_huggingface_dlc.py +56 -0
  27. glitchlings-0.2.0/tests/test_prime_echo_chamber.py +35 -0
  28. glitchlings-0.2.0/tests/test_property_based.py +93 -0
  29. {glitchlings-0.1.3 → glitchlings-0.2.0}/tests/test_rust_backed_glitchlings.py +1 -1
  30. glitchlings-0.2.0/tests/test_util.py +35 -0
  31. glitchlings-0.1.3/.github/workflows/publish.yml +0 -41
  32. glitchlings-0.1.3/.gitignore +0 -17
  33. glitchlings-0.1.3/AGENTS.md +0 -55
  34. glitchlings-0.1.3/MONSTER_MANUAL.md +0 -271
  35. glitchlings-0.1.3/rust/typogre/Cargo.lock +0 -295
  36. glitchlings-0.1.3/src/glitchlings/dlc/__init__.py +0 -0
  37. glitchlings-0.1.3/src/glitchlings/dlc/prime.py +0 -52
  38. glitchlings-0.1.3/tests/conftest.py +0 -8
  39. glitchlings-0.1.3/tests/test_dataset_corruption.py +0 -51
  40. {glitchlings-0.1.3 → glitchlings-0.2.0}/LICENSE +0 -0
  41. {glitchlings-0.1.3 → glitchlings-0.2.0}/rust/typogre/src/lib.rs +0 -0
  42. {glitchlings-0.1.3 → glitchlings-0.2.0}/src/glitchlings/__init__.py +0 -0
  43. {glitchlings-0.1.3 → glitchlings-0.2.0}/src/glitchlings/__main__.py +0 -0
  44. {glitchlings-0.1.3 → glitchlings-0.2.0}/src/glitchlings/util/__init__.py +0 -0
  45. {glitchlings-0.1.3 → glitchlings-0.2.0}/src/glitchlings/zoo/__init__.py +0 -0
  46. {glitchlings-0.1.3 → glitchlings-0.2.0}/src/glitchlings/zoo/jargoyle.py +0 -0
  47. {glitchlings-0.1.3 → glitchlings-0.2.0}/src/glitchlings/zoo/mim1c.py +0 -0
  48. {glitchlings-0.1.3 → glitchlings-0.2.0}/src/glitchlings/zoo/redactyl.py +0 -0
  49. {glitchlings-0.1.3 → glitchlings-0.2.0}/src/glitchlings/zoo/reduple.py +0 -0
  50. {glitchlings-0.1.3 → glitchlings-0.2.0}/src/glitchlings/zoo/scannequin.py +0 -0
  51. {glitchlings-0.1.3 → glitchlings-0.2.0}/src/glitchlings/zoo/typogre.py +0 -0
  52. {glitchlings-0.1.3 → glitchlings-0.2.0}/tests/test_gaggle.py +0 -0
  53. {glitchlings-0.1.3 → glitchlings-0.2.0}/tests/test_glitchlings_determinism.py +0 -0
  54. {glitchlings-0.1.3 → glitchlings-0.2.0}/tests/test_jargoyle.py +0 -0
  55. {glitchlings-0.1.3 → glitchlings-0.2.0}/tests/test_keyboard_layouts.py +0 -0
  56. {glitchlings-0.1.3 → glitchlings-0.2.0}/tests/test_parameter_effects.py +0 -0
@@ -0,0 +1 @@
1
+ recursive-include rust *
@@ -1,11 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: glitchlings
3
- Version: 0.1.3
3
+ Version: 0.2.0
4
4
  Summary: Monsters for your language games.
5
- Project-URL: Homepage, https://github.com/osoleve/glitchlings
6
- Project-URL: Repository, https://github.com/osoleve/glitchlings.git
7
- Project-URL: Issues, https://github.com/osoleve/glitchlings/issues
8
- Project-URL: Changelog, https://github.com/osoleve/glitchlings/releases
9
5
  Author: osoleve
10
6
  License: Apache License
11
7
  Version 2.0, January 2004
@@ -208,27 +204,34 @@ License: Apache License
208
204
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
209
205
  See the License for the specific language governing permissions and
210
206
  limitations under the License.
211
- License-File: LICENSE
212
- Keywords: adversarial augmentation,nlp,text,text augmentation
207
+
208
+ Project-URL: Homepage, https://github.com/osoleve/glitchlings
209
+ Project-URL: Repository, https://github.com/osoleve/glitchlings.git
210
+ Project-URL: Issues, https://github.com/osoleve/glitchlings/issues
211
+ Project-URL: Changelog, https://github.com/osoleve/glitchlings/releases
212
+ Keywords: nlp,text,adversarial augmentation,text augmentation
213
213
  Classifier: Development Status :: 3 - Alpha
214
214
  Classifier: Intended Audience :: Developers
215
215
  Classifier: License :: OSI Approved :: Apache Software License
216
- Classifier: Operating System :: OS Independent
217
216
  Classifier: Programming Language :: Python
218
217
  Classifier: Programming Language :: Python :: 3
219
218
  Classifier: Programming Language :: Python :: 3.12
219
+ Classifier: Operating System :: OS Independent
220
220
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
221
221
  Classifier: Topic :: Software Development :: Testing
222
222
  Requires-Python: >=3.12
223
+ Description-Content-Type: text/markdown
224
+ License-File: LICENSE
223
225
  Requires-Dist: confusable-homoglyphs>=3.3.1
224
226
  Requires-Dist: datasets>=4.0.0
225
227
  Requires-Dist: jellyfish>=1.2.0
226
228
  Requires-Dist: nltk>=3.9.1
227
- Provides-Extra: dev
228
- Requires-Dist: pytest>=8.0.0; extra == 'dev'
229
229
  Provides-Extra: prime
230
- Requires-Dist: verifiers>=0.1.3.post0; extra == 'prime'
231
- Description-Content-Type: text/markdown
230
+ Requires-Dist: verifiers>=0.1.3.post0; extra == "prime"
231
+ Provides-Extra: dev
232
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
233
+ Requires-Dist: hypothesis>=6.140.0; extra == "dev"
234
+ Dynamic: license-file
232
235
 
233
236
  #
234
237
 
@@ -283,6 +286,41 @@ print(gaggle(SAMPLE_TEXT))
283
286
 
284
287
  > Onҽ m‎ھ‎rning, wһen Gregor Samƽa woke from trouble𝐝 𝑑reams, he found himself transformed in his bed into a horrible vermin‎٠‎ He l lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightlh domed and divided by arches ino stiff sections. The bedding was adly able to cover it and and seemed ready to slide off any moment. His many legxs, pitifully thin compared with the size of the the rest of him, waved about helplessly ashe looked looked.
285
288
 
289
+ ## Usage
290
+
291
+ Glitchlings slot into evaluation pipelines just as easily as they corrupt stray strings.
292
+
293
+ - **Direct invocation** – Instantiate a glitchling (or `Gaggle`) and call it on strings, iterables, or datasets. Keep the seed stable to make every run deterministic.
294
+ - **Dataset corruption** – After ``import glitchlings.dlc.huggingface``, call ``Dataset.glitch(...)`` (or a `Gaggle`'s `.corrupt_dataset`) to perturb a Hugging Face `datasets.Dataset` and return a corrupted copy for training or evaluation.
295
+
296
+ ### Prime Intellect environments
297
+
298
+ After `pip install -e .[prime]`, the `glitchlings.dlc.prime.load_environment` helper mirrors `verifiers.load_environment` for Prime Intellect scenarios while optionally applying glitchlings before returning the environment:
299
+
300
+ ```python
301
+ from glitchlings import Mim1c, Typogre
302
+ from glitchlings.dlc.prime import echo_chamber, load_environment
303
+
304
+ env = load_environment(
305
+ "osoleve/syllabify-en",
306
+ glitchlings=[Mim1c(replacement_rate=0.01), Typogre(max_change_rate=0.02)],
307
+ seed=404,
308
+ )
309
+
310
+ # Spin up an echo chamber that corrupts a dataset column and
311
+ # rewards models for perfectly restoring it
312
+ practice_env = echo_chamber(
313
+ "osoleve/clean-room",
314
+ column="text",
315
+ glitchlings=["Typogre", "Mim1c"],
316
+ reward_function=lambda prompt, completion, answer: float(completion == answer),
317
+ )
318
+ ```
319
+
320
+ Skip the `glitchlings` argument to receive an untouched verifier dataset, and
321
+ override `reward_function` when you want to evaluate completions with a custom
322
+ scoring routine.
323
+
286
324
  ## Motivation
287
325
 
288
326
  If your model performs well on a particular task, but not when `Glitchling`s are present, it's a sign that it hasn't actually generalized to the problem.
@@ -51,6 +51,41 @@ print(gaggle(SAMPLE_TEXT))
51
51
 
52
52
  > Onҽ m‎ھ‎rning, wһen Gregor Samƽa woke from trouble𝐝 𝑑reams, he found himself transformed in his bed into a horrible vermin‎٠‎ He l lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightlh domed and divided by arches ino stiff sections. The bedding was adly able to cover it and and seemed ready to slide off any moment. His many legxs, pitifully thin compared with the size of the the rest of him, waved about helplessly ashe looked looked.
53
53
 
54
+ ## Usage
55
+
56
+ Glitchlings slot into evaluation pipelines just as easily as they corrupt stray strings.
57
+
58
+ - **Direct invocation** – Instantiate a glitchling (or `Gaggle`) and call it on strings, iterables, or datasets. Keep the seed stable to make every run deterministic.
59
+ - **Dataset corruption** – After ``import glitchlings.dlc.huggingface``, call ``Dataset.glitch(...)`` (or a `Gaggle`'s `.corrupt_dataset`) to perturb a Hugging Face `datasets.Dataset` and return a corrupted copy for training or evaluation.
60
+
61
+ ### Prime Intellect environments
62
+
63
+ After `pip install -e .[prime]`, the `glitchlings.dlc.prime.load_environment` helper mirrors `verifiers.load_environment` for Prime Intellect scenarios while optionally applying glitchlings before returning the environment:
64
+
65
+ ```python
66
+ from glitchlings import Mim1c, Typogre
67
+ from glitchlings.dlc.prime import echo_chamber, load_environment
68
+
69
+ env = load_environment(
70
+ "osoleve/syllabify-en",
71
+ glitchlings=[Mim1c(replacement_rate=0.01), Typogre(max_change_rate=0.02)],
72
+ seed=404,
73
+ )
74
+
75
+ # Spin up an echo chamber that corrupts a dataset column and
76
+ # rewards models for perfectly restoring it
77
+ practice_env = echo_chamber(
78
+ "osoleve/clean-room",
79
+ column="text",
80
+ glitchlings=["Typogre", "Mim1c"],
81
+ reward_function=lambda prompt, completion, answer: float(completion == answer),
82
+ )
83
+ ```
84
+
85
+ Skip the `glitchlings` argument to receive an untouched verifier dataset, and
86
+ override `reward_function` when you want to evaluate completions with a custom
87
+ scoring routine.
88
+
54
89
  ## Motivation
55
90
 
56
91
  If your model performs well on a particular task, but not when `Glitchling`s are present, it's a sign that it hasn't actually generalized to the problem.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "glitchlings"
3
- version = "0.1.3"
3
+ version = "0.2.0"
4
4
  description = "Monsters for your language games."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -48,35 +48,30 @@ prime = [
48
48
  ]
49
49
  dev = [
50
50
  "pytest>=8.0.0",
51
+ "hypothesis>=6.140.0",
51
52
  ]
52
53
 
53
54
  [build-system]
54
- requires = ["hatchling>=1.18"]
55
- build-backend = "hatchling.build"
55
+ requires = ["setuptools>=64", "wheel", "setuptools-rust>=1.8"]
56
+ build-backend = "setuptools.build_meta"
56
57
 
57
- [tool.hatch.build]
58
- exclude = [
59
- "**/__pycache__/**",
60
- ".git/**",
61
- ".venv/**",
62
- ]
58
+ [tool.setuptools]
59
+ package-dir = {"" = "src"}
63
60
 
64
- [tool.hatch.build.targets.sdist]
65
- exclude = [
66
- "**/__pycache__/**",
67
- ".git/**",
68
- ".venv/**",
69
- ]
61
+ [tool.setuptools.packages.find]
62
+ where = ["src"]
70
63
 
71
- [tool.hatch.build.targets.wheel]
72
- packages = [
73
- "src/glitchlings"
74
- ]
75
- exclude = [
76
- "tests/**",
77
- "tests",
78
- "**/__pycache__/**",
79
- ]
64
+ [[tool.setuptools-rust.ext-modules]]
65
+ target = "glitchlings._zoo_rust"
66
+ path = "rust/zoo/Cargo.toml"
67
+ binding = "PyO3"
68
+ debug = false
69
+
70
+ [[tool.setuptools-rust.ext-modules]]
71
+ target = "glitchlings._typogre_rust"
72
+ path = "rust/typogre/Cargo.toml"
73
+ binding = "PyO3"
74
+ debug = false
80
75
 
81
76
  [tool.pytest.ini_options]
82
77
  pythonpath = [
@@ -255,6 +255,13 @@ version = "0.12.16"
255
255
  source = "registry+https://github.com/rust-lang/crates.io-index"
256
256
  checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
257
257
 
258
+ [[package]]
259
+ name = "typogre_rust"
260
+ version = "0.1.0"
261
+ dependencies = [
262
+ "pyo3",
263
+ ]
264
+
258
265
  [[package]]
259
266
  name = "unicode-ident"
260
267
  version = "1.0.19"
@@ -0,0 +1,10 @@
1
+ [workspace]
2
+ members = [
3
+ "typogre",
4
+ "zoo",
5
+ ]
6
+ resolver = "2"
7
+
8
+ [workspace.dependencies]
9
+ pyo3 = { version = "0.21", features = ["extension-module"] }
10
+ regex = "1"
@@ -8,7 +8,7 @@ name = "_typogre_rust"
8
8
  crate-type = ["cdylib"]
9
9
 
10
10
  [dependencies]
11
- pyo3 = { version = "0.21", features = ["extension-module"] }
11
+ pyo3 = { workspace = true }
12
12
 
13
13
  [package.metadata.maturin]
14
14
  module-name = "glitchlings._typogre_rust"
@@ -8,8 +8,8 @@ name = "_zoo_rust"
8
8
  crate-type = ["cdylib"]
9
9
 
10
10
  [dependencies]
11
- pyo3 = { version = "0.21", features = ["extension-module"] }
12
- regex = "1"
11
+ pyo3 = { workspace = true }
12
+ regex = { workspace = true }
13
13
 
14
14
  [package.metadata.maturin]
15
15
  module-name = "glitchlings._zoo_rust"
@@ -116,20 +116,35 @@ fn delete_random_words(
116
116
  }
117
117
 
118
118
  let mut tokens = split_with_separators(text);
119
+ let mut candidate_indices: Vec<usize> = Vec::new();
119
120
  let mut i = 2;
120
121
  while i < tokens.len() {
121
- let word = tokens[i].clone();
122
- if word.is_empty() || is_whitespace_only(&word) {
123
- i += 2;
124
- continue;
122
+ let word = &tokens[i];
123
+ if !word.is_empty() && !is_whitespace_only(word) {
124
+ candidate_indices.push(i);
125
+ }
126
+ i += 2;
127
+ }
128
+
129
+ let allowed = ((candidate_indices.len() as f64) * max_deletion_rate).floor() as usize;
130
+ if allowed == 0 {
131
+ return Ok(text.to_string());
132
+ }
133
+
134
+ let mut deletions = 0;
135
+ for idx in candidate_indices {
136
+ if deletions >= allowed {
137
+ break;
125
138
  }
139
+
140
+ let word = tokens[idx].clone();
126
141
  if random_unit(rng)? < max_deletion_rate {
127
142
  let (prefix, _, suffix) = split_affixes(&word);
128
143
  let trimmed_prefix = prefix.trim();
129
144
  let trimmed_suffix = suffix.trim();
130
- tokens[i] = format!("{trimmed_prefix}{trimmed_suffix}");
145
+ tokens[idx] = format!("{trimmed_prefix}{trimmed_suffix}");
146
+ deletions += 1;
131
147
  }
132
- i += 2;
133
148
  }
134
149
 
135
150
  let mut joined = tokens.concat();
@@ -283,7 +298,7 @@ fn redact_words(
283
298
 
284
299
  if word_indices.is_empty() {
285
300
  return Err(pyo3::exceptions::PyValueError::new_err(
286
- "No words found to redact",
301
+ "Cannot redact words because the input text contains no redactable words.",
287
302
  ));
288
303
  }
289
304
 
@@ -0,0 +1,7 @@
1
+ [bdist_wheel]
2
+ plat_name = manylinux_2_17_x86_64
3
+
4
+ [egg_info]
5
+ tag_build =
6
+ tag_date = 0
7
+
@@ -0,0 +1,5 @@
1
+ """Optional DLC integrations for Glitchlings."""
2
+
3
+ from .huggingface import install as install_huggingface
4
+
5
+ __all__ = ["install_huggingface"]
@@ -0,0 +1,96 @@
1
+ """Integration helpers for the Hugging Face datasets library."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Iterable, Sequence
6
+ from typing import Any
7
+
8
+ try: # pragma: no cover - optional dependency is required at runtime
9
+ from datasets import Dataset as _DatasetsDataset
10
+ except ModuleNotFoundError as _datasets_error: # pragma: no cover - optional dependency
11
+ _DatasetsDataset = None # type: ignore[assignment]
12
+ else:
13
+ _datasets_error = None
14
+
15
+ from ..zoo import Gaggle, Glitchling, summon
16
+
17
+
18
+ def _normalise_columns(column: str | Sequence[str]) -> list[str]:
19
+ """Normalise a column specification to a list."""
20
+
21
+ if isinstance(column, str):
22
+ return [column]
23
+
24
+ normalised = list(column)
25
+ if not normalised:
26
+ raise ValueError("At least one column must be specified")
27
+ return normalised
28
+
29
+
30
+ def _as_gaggle(glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling], seed: int) -> Gaggle:
31
+ """Coerce any supported glitchling specification into a :class:`Gaggle`."""
32
+
33
+ if isinstance(glitchlings, Gaggle):
34
+ return glitchlings
35
+
36
+ if isinstance(glitchlings, (Glitchling, str)):
37
+ resolved: Iterable[str | Glitchling] = [glitchlings]
38
+ else:
39
+ resolved = glitchlings
40
+
41
+ return summon(list(resolved), seed=seed)
42
+
43
+
44
+ def _glitch_dataset(
45
+ dataset: Any,
46
+ glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling],
47
+ column: str | Sequence[str],
48
+ *,
49
+ seed: int = 151,
50
+ ) -> Any:
51
+ """Internal helper implementing :meth:`Dataset.glitch`."""
52
+
53
+ columns = _normalise_columns(column)
54
+ gaggle = _as_gaggle(glitchlings, seed=seed)
55
+ return gaggle.corrupt_dataset(dataset, columns)
56
+
57
+
58
+ def _ensure_dataset_class() -> Any:
59
+ """Return the Hugging Face :class:`~datasets.Dataset` patched with ``.glitch``."""
60
+
61
+ if _DatasetsDataset is None: # pragma: no cover - datasets is an install-time dependency
62
+ message = "datasets is not installed"
63
+ raise ModuleNotFoundError(message) from _datasets_error
64
+
65
+ if getattr(_DatasetsDataset, "glitch", None) is None:
66
+
67
+ def glitch( # type: ignore[override]
68
+ self: Any,
69
+ glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling],
70
+ *,
71
+ column: str | Sequence[str],
72
+ seed: int = 151,
73
+ **_: Any,
74
+ ) -> Any:
75
+ """Return a lazily corrupted copy of the dataset."""
76
+
77
+ return _glitch_dataset(self, glitchlings, column, seed=seed)
78
+
79
+ setattr(_DatasetsDataset, "glitch", glitch)
80
+
81
+ return _DatasetsDataset
82
+
83
+
84
+ def install() -> None:
85
+ """Monkeypatch the Hugging Face :class:`~datasets.Dataset` with ``.glitch``."""
86
+
87
+ _ensure_dataset_class()
88
+
89
+
90
+ if _DatasetsDataset is not None:
91
+ Dataset = _ensure_dataset_class()
92
+ else: # pragma: no cover - datasets is an install-time dependency
93
+ Dataset = None # type: ignore[assignment]
94
+
95
+
96
+ __all__ = ["Dataset", "install"]
@@ -0,0 +1,252 @@
1
+ """Integration helpers for the optional verifiers prime DLC."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Iterable, Sequence
6
+ from enum import Enum
7
+ from typing import Any, Callable
8
+
9
+ import verifiers as vf
10
+
11
+ from jellyfish import damerau_levenshtein_distance
12
+
13
+ try:
14
+ from .huggingface import Dataset
15
+ except ModuleNotFoundError: # pragma: no cover - optional dependency
16
+ Dataset = object # type: ignore[assignment]
17
+ else:
18
+ if Dataset is None: # pragma: no cover - optional dependency
19
+ Dataset = object # type: ignore[assignment]
20
+
21
+ from ..zoo import Gaggle, Glitchling, Mim1c, Typogre, summon
22
+
23
+
24
+ def _resolve_environment(env: str | vf.Environment) -> vf.Environment:
25
+ """Return a fully-instantiated verifier environment."""
26
+
27
+ if isinstance(env, str):
28
+ env = vf.load_environment(env)
29
+
30
+ if not isinstance(env, vf.Environment):
31
+ raise TypeError("Invalid environment type")
32
+
33
+ return env
34
+
35
+
36
+ def _resolve_columns(dataset: Dataset, columns: Sequence[str] | None) -> list[str]:
37
+ """Identify which dataset columns should be corrupted."""
38
+
39
+ available = set(dataset.column_names)
40
+
41
+ if columns is not None:
42
+ missing = sorted(set(columns) - available)
43
+ if missing:
44
+ missing_str = ", ".join(missing)
45
+ raise ValueError(f"Columns not found in dataset: {missing_str}")
46
+ return list(columns)
47
+
48
+ for candidate in ("prompt", "question"):
49
+ if candidate in available:
50
+ return [candidate]
51
+
52
+ sample = dataset[0] if len(dataset) else {}
53
+ inferred = [
54
+ name
55
+ for name in dataset.column_names
56
+ if isinstance(sample.get(name), str)
57
+ ]
58
+
59
+ if inferred:
60
+ return inferred
61
+
62
+ raise ValueError("Unable to determine which dataset columns to corrupt.")
63
+
64
+
65
+ class Difficulty(Enum):
66
+ """Difficulty levels for tutorial environments."""
67
+
68
+ Easy = 0.25
69
+ Normal = 1.0
70
+ Hard = 1.75
71
+ Extreme = 3
72
+ Impossible = 9
73
+
74
+
75
+ def tutorial_level(
76
+ env: vf.Environment | str,
77
+ seed: int = 151,
78
+ difficulty: Difficulty = Difficulty.Normal,
79
+ ) -> vf.Environment:
80
+ """Create a low-corruption environment using tuned defaults."""
81
+
82
+ tuned_mim1c = Mim1c(replacement_rate=0.01 * difficulty.value)
83
+ tuned_typogre = Typogre(max_change_rate=0.025 * difficulty.value)
84
+
85
+ return load_environment(
86
+ env,
87
+ glitchlings=[tuned_mim1c, tuned_typogre],
88
+ seed=seed,
89
+ )
90
+
91
+
92
+ def load_environment(
93
+ env: str | vf.Environment,
94
+ glitchlings: Iterable[str | Glitchling] | Glitchling | str | Gaggle | None = None,
95
+ *,
96
+ seed: int = 151,
97
+ columns: Sequence[str] | None = None,
98
+ ) -> vf.Environment:
99
+ """Load an environment and optionally corrupt it with glitchlings."""
100
+
101
+ environment = _resolve_environment(env)
102
+
103
+ if glitchlings is None:
104
+ return environment
105
+
106
+ if isinstance(glitchlings, Gaggle):
107
+ gaggle = glitchlings
108
+ else:
109
+ if isinstance(glitchlings, (Glitchling, str)):
110
+ resolved = [glitchlings]
111
+ else:
112
+ resolved = list(glitchlings)
113
+
114
+ gaggle = summon(resolved, seed=seed)
115
+
116
+ dataset = environment.dataset
117
+ corrupt_columns = _resolve_columns(dataset, columns)
118
+ environment.dataset = gaggle.corrupt_dataset(dataset, corrupt_columns)
119
+ return environment
120
+
121
+
122
+ def _as_gaggle(
123
+ glitchlings: Iterable[str | Glitchling] | Glitchling | str | Gaggle,
124
+ *,
125
+ seed: int,
126
+ ) -> Gaggle:
127
+ """Coerce any supported glitchling specification into a :class:`Gaggle`."""
128
+
129
+ if isinstance(glitchlings, Gaggle):
130
+ return glitchlings
131
+
132
+ if isinstance(glitchlings, (Glitchling, str)):
133
+ resolved: Iterable[str | Glitchling] = [glitchlings]
134
+ else:
135
+ resolved = glitchlings
136
+
137
+ return summon(list(resolved), seed=seed)
138
+
139
+
140
+ def _extract_completion_text(completion: Any) -> str:
141
+ """Normalise a completion payload into a plain string."""
142
+
143
+ if isinstance(completion, str):
144
+ return completion
145
+
146
+ if isinstance(completion, list) and completion:
147
+ first = completion[0]
148
+ if isinstance(first, dict) and "content" in first:
149
+ return str(first["content"])
150
+ return str(first)
151
+
152
+ return str(completion)
153
+
154
+
155
+ def symmetric_damerau_levenshtein_similarity(
156
+ _: Any,
157
+ completion: Any,
158
+ answer: str,
159
+ ) -> float:
160
+ """Return ``1 - (distance / max_len)`` using Damerau-Levenshtein distance."""
161
+
162
+ completion_text = _extract_completion_text(completion)
163
+ target = answer or ""
164
+ denominator = max(len(completion_text), len(target), 1)
165
+ distance = damerau_levenshtein_distance(completion_text, target)
166
+ score = 1.0 - (distance / denominator)
167
+ return max(0.0, min(1.0, score))
168
+
169
+
170
+ DEFAULT_CLEANUP_INSTRUCTIONS = (
171
+ "You are a meticulous copy editor. Restore the provided text to its original form."
172
+ )
173
+
174
+
175
+ def echo_chamber(
176
+ dataset_id: str,
177
+ column: str,
178
+ glitchlings: Iterable[str | Glitchling] | Glitchling | str | Gaggle,
179
+ *,
180
+ seed: int = 151,
181
+ instructions: str = DEFAULT_CLEANUP_INSTRUCTIONS,
182
+ reward_function: Callable[..., float] | None = None,
183
+ split: str | None = None,
184
+ **load_dataset_kwargs: Any,
185
+ ) -> vf.Environment:
186
+ """Create an Echo Chamber Prime environment from a Hugging Face dataset column.
187
+
188
+ Args:
189
+ dataset_id: Identifier of the Hugging Face dataset to load.
190
+ column: Name of the column whose text should be glitched.
191
+ glitchlings: Glitchling specifiers that will corrupt the prompts.
192
+ seed: RNG seed forwarded to :func:`summon`.
193
+ instructions: System instructions supplied to the environment prompts.
194
+ reward_function: Optional callable used to score completions. Defaults to
195
+ :func:`symmetric_damerau_levenshtein_similarity` when omitted.
196
+ split: Optional dataset split to load.
197
+ **load_dataset_kwargs: Extra keyword arguments forwarded to
198
+ :func:`datasets.load_dataset`.
199
+ """
200
+
201
+ try:
202
+ from datasets import Dataset as HFDataset, DatasetDict, load_dataset
203
+ except ModuleNotFoundError as exc: # pragma: no cover - optional dependency
204
+ message = "datasets is required to build an echo chamber"
205
+ raise ModuleNotFoundError(message) from exc
206
+
207
+ hf_dataset: HFDataset | DatasetDict
208
+ if split is None:
209
+ hf_dataset = load_dataset(dataset_id, **load_dataset_kwargs)
210
+ if isinstance(hf_dataset, DatasetDict):
211
+ try:
212
+ hf_dataset = next(iter(hf_dataset.values()))
213
+ except StopIteration as exc: # pragma: no cover - defensive
214
+ raise ValueError("The specified dataset does not contain any splits") from exc
215
+ else:
216
+ hf_dataset = load_dataset(dataset_id, split=split, **load_dataset_kwargs)
217
+
218
+ if isinstance(hf_dataset, DatasetDict):
219
+ raise ValueError(
220
+ "Specify which split to use when the dataset loads as a DatasetDict."
221
+ )
222
+
223
+ prompts: list[list[dict[str, str]]] = []
224
+ answers: list[str] = []
225
+
226
+ for row in hf_dataset:
227
+ value = row.get(column)
228
+ if value is None:
229
+ continue
230
+
231
+ text = str(value)
232
+ prompts.append(
233
+ [
234
+ {"role": "system", "content": instructions},
235
+ {"role": "user", "content": f"Corrupted text:\n{text}"},
236
+ ]
237
+ )
238
+ answers.append(text)
239
+
240
+ if not prompts:
241
+ raise ValueError(
242
+ f"Column '{column}' did not yield any textual entries in dataset '{dataset_id}'."
243
+ )
244
+
245
+ dataset = HFDataset.from_dict({"prompt": prompts, "answer": answers})
246
+
247
+ gaggle = _as_gaggle(glitchlings, seed=seed)
248
+ glitched_dataset = gaggle.corrupt_dataset(dataset, ["prompt"])
249
+
250
+ rubric_func = reward_function or symmetric_damerau_levenshtein_similarity
251
+ rubric = vf.Rubric(funcs=[rubric_func], weights=[1.0])
252
+ return vf.SingleTurnEnv(dataset=glitched_dataset, rubric=rubric)
@@ -124,8 +124,10 @@ def read_text(args: argparse.Namespace, parser: argparse.ArgumentParser) -> str:
124
124
  if args.file is not None:
125
125
  try:
126
126
  return args.file.read_text(encoding="utf-8")
127
- except OSError as exc: # pragma: no cover - exercised via CLI
128
- parser.error(str(exc))
127
+ except OSError as exc:
128
+ filename = getattr(exc, "filename", None) or args.file
129
+ reason = exc.strerror or str(exc)
130
+ parser.error(f"Failed to read file {filename}: {reason}")
129
131
 
130
132
  if args.text:
131
133
  return args.text