glitchlings 0.2.3__cp312-cp312-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,42 @@
1
+ from .zoo import (
2
+ Typogre,
3
+ typogre,
4
+ Mim1c,
5
+ mim1c,
6
+ Jargoyle,
7
+ jargoyle,
8
+ Redactyl,
9
+ redactyl,
10
+ Reduple,
11
+ reduple,
12
+ Rushmore,
13
+ rushmore,
14
+ Scannequin,
15
+ scannequin,
16
+ Glitchling,
17
+ Gaggle,
18
+ summon,
19
+ )
20
+ from .util import SAMPLE_TEXT
21
+
22
+
23
+ __all__ = [
24
+ "Typogre",
25
+ "typogre",
26
+ "Mim1c",
27
+ "mim1c",
28
+ "Jargoyle",
29
+ "jargoyle",
30
+ "Redactyl",
31
+ "redactyl",
32
+ "Reduple",
33
+ "reduple",
34
+ "Rushmore",
35
+ "rushmore",
36
+ "Scannequin",
37
+ "scannequin",
38
+ "summon",
39
+ "Glitchling",
40
+ "Gaggle",
41
+ "SAMPLE_TEXT",
42
+ ]
@@ -0,0 +1,9 @@
1
+ from __future__ import annotations
2
+
3
+ import sys
4
+
5
+ from .main import main
6
+
7
+
8
+ if __name__ == "__main__":
9
+ sys.exit(main())
@@ -0,0 +1,5 @@
1
+ """Optional DLC integrations for Glitchlings."""
2
+
3
+ from .huggingface import install as install_huggingface
4
+
5
+ __all__ = ["install_huggingface"]
@@ -0,0 +1,96 @@
1
+ """Integration helpers for the Hugging Face datasets library."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Iterable, Sequence
6
+ from typing import Any
7
+
8
# Optional dependency: remember both the class (or ``None``) and the import
# failure so a helpful error can be raised later, when the integration is used.
try:  # pragma: no cover - optional dependency is required at runtime
    from datasets import Dataset as _DatasetsDataset
except ModuleNotFoundError as _import_failure:  # pragma: no cover - optional dependency
    _DatasetsDataset = None  # type: ignore[assignment]
    # Python unbinds the ``as`` target when the handler exits, so the exception
    # must be copied to a different module-level name to stay available for
    # ``raise ... from _datasets_error`` further down.
    _datasets_error = _import_failure
else:
    _datasets_error = None
14
+
15
+ from ..zoo import Gaggle, Glitchling, summon
16
+
17
+
18
+ def _normalise_columns(column: str | Sequence[str]) -> list[str]:
19
+ """Normalise a column specification to a list."""
20
+
21
+ if isinstance(column, str):
22
+ return [column]
23
+
24
+ normalised = list(column)
25
+ if not normalised:
26
+ raise ValueError("At least one column must be specified")
27
+ return normalised
28
+
29
+
30
def _as_gaggle(glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling], seed: int) -> Gaggle:
    """Turn a glitchling specification into a :class:`Gaggle`.

    An existing ``Gaggle`` is returned unchanged (``seed`` is not applied to
    it). A single ``Glitchling`` or name is wrapped in a list, any other
    iterable is materialised into a list, and the list is handed to
    :func:`summon` together with ``seed``.
    """

    if isinstance(glitchlings, Gaggle):
        return glitchlings

    is_single = isinstance(glitchlings, (Glitchling, str))
    members = [glitchlings] if is_single else list(glitchlings)
    return summon(members, seed=seed)
42
+
43
+
44
def _glitch_dataset(
    dataset: Any,
    glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling],
    column: str | Sequence[str],
    *,
    seed: int = 151,
) -> Any:
    """Corrupt the requested column(s) of ``dataset`` and return the result.

    The column specification is validated first, then the glitchlings are
    coerced into a ``Gaggle`` whose ``corrupt_dataset`` does the actual work.
    """

    target_columns = _normalise_columns(column)
    return _as_gaggle(glitchlings, seed=seed).corrupt_dataset(dataset, target_columns)
56
+
57
+
58
def _ensure_dataset_class() -> Any:
    """Return the Hugging Face ``Dataset`` class, adding ``.glitch`` if it is absent.

    Raises:
        ModuleNotFoundError: If ``datasets`` was not importable when this
            module was loaded.
    """

    if _DatasetsDataset is None:  # pragma: no cover - datasets is an install-time dependency
        raise ModuleNotFoundError("datasets is not installed") from _datasets_error

    # A non-None ``glitch`` attribute means the class is already patched (or
    # defines its own), so it is returned untouched.
    if getattr(_DatasetsDataset, "glitch", None) is not None:
        return _DatasetsDataset

    def glitch(  # type: ignore[override]
        self: Any,
        glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling],
        *,
        column: str | Sequence[str],
        seed: int = 151,
        **_: Any,
    ) -> Any:
        """Corrupt ``column`` of this dataset; extra keyword arguments are ignored."""

        return _glitch_dataset(self, glitchlings, column, seed=seed)

    _DatasetsDataset.glitch = glitch
    return _DatasetsDataset
82
+
83
+
84
def install() -> None:
    """Ensure the Hugging Face ``Dataset`` class exposes ``.glitch``.

    Delegates to :func:`_ensure_dataset_class` and discards the class it returns.
    """

    _ensure_dataset_class()
    return None
88
+
89
+
90
# Public alias: the patched Dataset class when ``datasets`` imported, else ``None``.
Dataset = (
    _ensure_dataset_class()
    if _DatasetsDataset is not None
    else None  # pragma: no cover - datasets is an install-time dependency
)
94
+
95
+
96
+ __all__ = ["Dataset", "install"]
@@ -0,0 +1,274 @@
1
+ """Integration helpers for the optional verifiers prime DLC."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Iterable, Sequence
6
+ from enum import Enum
7
+ from typing import Any, Callable
8
+
9
+ import verifiers as vf
10
+
11
+ from jellyfish import damerau_levenshtein_distance
12
+
13
# ``Dataset`` is only used in annotations here; fall back to ``object`` when
# the Hugging Face integration is missing or exported ``None``.
try:
    from .huggingface import Dataset
except ModuleNotFoundError:  # pragma: no cover - optional dependency
    Dataset = None  # type: ignore[assignment]

if Dataset is None:  # pragma: no cover - optional dependency
    Dataset = object  # type: ignore[assignment]
20
+
21
+ from ..zoo import Gaggle, Glitchling, Mim1c, Typogre, summon
22
+
23
+
24
def _resolve_environment(env: str | vf.Environment) -> vf.Environment:
    """Return ``env`` as a ``vf.Environment``, loading it by name when given a string.

    Raises:
        TypeError: If the value (after loading) is not a ``vf.Environment``.
    """

    candidate = vf.load_environment(env) if isinstance(env, str) else env
    if isinstance(candidate, vf.Environment):
        return candidate
    raise TypeError("Invalid environment type")
34
+
35
+
36
+ def _resolve_columns(dataset: Dataset, columns: Sequence[str] | None) -> list[str]:
37
+ """Identify which dataset columns should be corrupted."""
38
+
39
+ available = set(dataset.column_names)
40
+
41
+ if columns is not None:
42
+ missing = sorted(set(columns) - available)
43
+ if missing:
44
+ missing_str = ", ".join(missing)
45
+ raise ValueError(f"Columns not found in dataset: {missing_str}")
46
+ return list(columns)
47
+
48
+ for candidate in ("prompt", "question"):
49
+ if candidate in available:
50
+ return [candidate]
51
+
52
+ sample = dataset[0] if len(dataset) else {}
53
+ inferred = [
54
+ name
55
+ for name in dataset.column_names
56
+ if isinstance(sample.get(name), str)
57
+ ]
58
+
59
+ if inferred:
60
+ return inferred
61
+
62
+ raise ValueError("Unable to determine which dataset columns to corrupt.")
63
+
64
+
65
class Difficulty(Enum):
    """Named multipliers applied to the tutorial corruption rates.

    :func:`tutorial_level` multiplies its base glitchling rates by ``value``.
    """

    # Below-baseline corruption.
    Easy = 0.25
    # Baseline: rates are used as-is.
    Normal = 1.0
    Hard = 1.75
    # The two highest levels are stored as plain integers.
    Extreme = 3
    Impossible = 9
73
+
74
+
75
def tutorial_level(
    env: vf.Environment | str,
    seed: int = 151,
    difficulty: Difficulty = Difficulty.Normal,
) -> vf.Environment:
    """Build an environment corrupted by ``Mim1c`` and ``Typogre`` at tuned rates.

    The base rates (0.01 for ``Mim1c``, 0.025 for ``Typogre``) are multiplied
    by ``difficulty.value`` before the environment is loaded.
    """

    scale = difficulty.value
    tuned = [
        Mim1c(rate=0.01 * scale),
        Typogre(rate=0.025 * scale),
    ]
    return load_environment(env, glitchlings=tuned, seed=seed)
90
+
91
+
92
def load_environment(
    env: str | vf.Environment,
    glitchlings: Iterable[str | Glitchling] | Glitchling | str | Gaggle | None = None,
    *,
    seed: int = 151,
    columns: Sequence[str] | None = None,
) -> vf.Environment:
    """Resolve ``env`` and, when glitchlings are supplied, corrupt its dataset.

    With ``glitchlings=None`` the resolved environment is returned untouched.
    Otherwise ``environment.dataset`` is replaced with the corrupted dataset
    and the same environment object is returned.
    """

    environment = _resolve_environment(env)
    if glitchlings is None:
        return environment

    # Same coercion rules as the sibling helper: a Gaggle passes through,
    # everything else is listed and summoned with ``seed``.
    gaggle = _as_gaggle(glitchlings, seed=seed)

    original_dataset = environment.dataset
    targets = _resolve_columns(original_dataset, columns)
    environment.dataset = gaggle.corrupt_dataset(original_dataset, targets)
    return environment
120
+
121
+
122
def _as_gaggle(
    glitchlings: Iterable[str | Glitchling] | Glitchling | str | Gaggle,
    *,
    seed: int,
) -> Gaggle:
    """Turn a glitchling specification into a :class:`Gaggle`.

    A ``Gaggle`` is returned unchanged; a lone ``Glitchling`` or name is
    wrapped in a list; any other iterable is copied to a list. The list is
    passed to :func:`summon` with ``seed``.
    """

    if isinstance(glitchlings, Gaggle):
        return glitchlings

    if isinstance(glitchlings, (Glitchling, str)):
        members = [glitchlings]
    else:
        members = list(glitchlings)
    return summon(members, seed=seed)
138
+
139
+
140
+ def _extract_completion_text(completion: Any) -> str:
141
+ """Normalise a completion payload into a plain string."""
142
+
143
+ if isinstance(completion, str):
144
+ return completion
145
+
146
+ if isinstance(completion, list) and completion:
147
+ first = completion[0]
148
+ if isinstance(first, dict) and "content" in first:
149
+ return str(first["content"])
150
+ return str(first)
151
+
152
+ return str(completion)
153
+
154
+
155
def symmetric_damerau_levenshtein_similarity(
    _: Any,
    completion: Any,
    answer: str,
) -> float:
    """Score a completion against ``answer`` as ``1 - distance / max_len``.

    ``distance`` is the Damerau-Levenshtein distance between the extracted
    completion text and the answer (a falsy answer counts as ``""``).
    ``max_len`` is the longer of the two lengths, floored at 1 so two empty
    strings do not divide by zero. The result is clamped to ``[0.0, 1.0]``.
    The first positional argument is ignored.
    """

    predicted = _extract_completion_text(completion)
    expected = answer or ""
    longest = max(len(predicted), len(expected), 1)
    edits = damerau_levenshtein_distance(predicted, expected)
    similarity = 1.0 - (edits / longest)
    return max(0.0, min(1.0, similarity))
168
+
169
+
170
# Default system prompt for echo chamber environments.
DEFAULT_CLEANUP_INSTRUCTIONS = (
    "You are a meticulous copy editor. "
    "Restore the provided text to its original form."
)
173
+
174
+
175
def echo_chamber(
    dataset_id: str,
    column: str,
    glitchlings: Iterable[str | Glitchling] | Glitchling | str | Gaggle,
    *,
    seed: int = 151,
    instructions: str = DEFAULT_CLEANUP_INSTRUCTIONS,
    reward_function: Callable[..., float] | None = None,
    split: str | None = None,
    **load_dataset_kwargs: Any,
) -> vf.Environment:
    """Create an Echo Chamber Prime environment from a Hugging Face dataset column.

    Each row with a non-null ``column`` becomes a system/user prompt pair plus
    an ``answer`` holding the original text; the ``prompt`` column is then
    corrupted by the glitchlings.

    Args:
        dataset_id: Identifier of the Hugging Face dataset to load.
        column: Name of the column whose text should be glitched.
        glitchlings: Glitchling specifiers that will corrupt the prompts.
        seed: RNG seed forwarded to :func:`summon`.
        instructions: System instructions supplied to the environment prompts.
        reward_function: Optional callable used to score completions. Defaults to
            :func:`symmetric_damerau_levenshtein_similarity` when omitted.
        split: Optional dataset split to load. When omitted and the dataset
            loads as a ``DatasetDict``, its first split is used.
        **load_dataset_kwargs: Extra keyword arguments forwarded to
            :func:`datasets.load_dataset`.

    Raises:
        ModuleNotFoundError: If ``datasets`` is not installed.
        ValueError: If no split can be selected or no rows survive filtering.
    """

    try:
        from datasets import Dataset as HFDataset, DatasetDict, load_dataset
    except ModuleNotFoundError as exc:  # pragma: no cover - optional dependency
        raise ModuleNotFoundError("datasets is required to build an echo chamber") from exc

    loaded: HFDataset | DatasetDict
    if split is not None:
        loaded = load_dataset(dataset_id, split=split, **load_dataset_kwargs)
    else:
        loaded = load_dataset(dataset_id, **load_dataset_kwargs)
        if isinstance(loaded, DatasetDict):
            # No split requested: fall back to whichever split comes first.
            try:
                loaded = next(iter(loaded.values()))
            except StopIteration as exc:  # pragma: no cover - defensive
                raise ValueError("The specified dataset does not contain any splits") from exc

    if isinstance(loaded, DatasetDict):
        raise ValueError(
            "Specify which split to use when the dataset loads as a DatasetDict."
        )

    def _has_text(row: dict[str, Any]) -> bool:
        return row.get(column) is not None

    populated = loaded.filter(_has_text, load_from_cache_file=False)

    def _to_example(row: dict[str, Any]) -> dict[str, Any]:
        original = str(row[column])
        messages = [
            {"role": "system", "content": instructions},
            {"role": "user", "content": f"Corrupted text:\n{original}"},
        ]
        return {"prompt": messages, "answer": original}

    # Drop every source column so only ``prompt`` and ``answer`` remain.
    base_dataset = populated.map(
        _to_example,
        remove_columns=list(populated.column_names),
        load_from_cache_file=False,
    )

    empty_message = (
        f"Column '{column}' did not yield any textual entries in dataset '{dataset_id}'."
    )
    try:
        row_count = len(base_dataset)  # type: ignore[arg-type]
    except TypeError:
        # No ``len()`` support: peek at one row instead, via ``take`` when present.
        take_fn = getattr(base_dataset, "take", None)
        if callable(take_fn):
            preview = list(take_fn(1))
        else:
            nothing = object()
            first_row = next(iter(base_dataset), nothing)
            preview = [] if first_row is nothing else [first_row]
        if not preview:
            raise ValueError(empty_message)
    else:
        if row_count == 0:
            raise ValueError(empty_message)

    gaggle = _as_gaggle(glitchlings, seed=seed)
    glitched_dataset = gaggle.corrupt_dataset(base_dataset, ["prompt"])

    scorer = reward_function or symmetric_damerau_levenshtein_similarity
    rubric = vf.Rubric(funcs=[scorer], weights=[1.0])
    return vf.SingleTurnEnv(dataset=glitched_dataset, rubric=rubric)
glitchlings/main.py ADDED
@@ -0,0 +1,218 @@
1
+ """Command line interface for summoning and running glitchlings."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import difflib
7
+ from pathlib import Path
8
+ import sys
9
+
10
+ from . import SAMPLE_TEXT
11
+ from .zoo import (
12
+ Glitchling,
13
+ Gaggle,
14
+ BUILTIN_GLITCHLINGS,
15
+ DEFAULT_GLITCHLING_NAMES,
16
+ parse_glitchling_spec,
17
+ summon,
18
+ )
19
+
20
# Width of the longest built-in glitchling name; used to right-align names in
# the ``--list`` output. ``default=0`` keeps module import from failing with
# ``ValueError`` should the registry ever be empty.
MAX_NAME_WIDTH = max(
    (len(glitchling.name) for glitchling in BUILTIN_GLITCHLINGS.values()),
    default=0,
)
21
+
22
+
23
def build_parser() -> argparse.ArgumentParser:
    """Create and configure the CLI argument parser.

    Defines the optional positional ``text``, the repeatable
    ``-g/--glitchling``, ``-s/--seed`` (default 151), ``-f/--file`` and the
    boolean switches ``--sample``, ``--diff`` and ``--list``.

    Returns:
        argparse.ArgumentParser: The configured argument parser instance.
    """

    description = (
        "Summon glitchlings to corrupt text. Provide input text as an argument, "
        "via --file, or pipe it on stdin."
    )
    parser = argparse.ArgumentParser(description=description)
    add = parser.add_argument

    add(
        "text",
        nargs="?",
        help="Text to corrupt. If omitted, stdin is used or --sample provides fallback text.",
    )
    add(
        "-g",
        "--glitchling",
        dest="glitchlings",
        action="append",
        metavar="SPEC",
        help=(
            "Glitchling to apply, optionally with parameters like "
            "Typogre(rate=0.05). Repeat for multiples; defaults to all built-ins."
        ),
    )
    add(
        "-s",
        "--seed",
        type=int,
        default=151,
        help="Seed controlling deterministic corruption order (default: 151).",
    )
    add(
        "-f",
        "--file",
        type=Path,
        help="Read input text from a file instead of the command line argument.",
    )

    # Plain on/off switches, registered in the order they appear in --help.
    switches = (
        ("--sample", "Use the included SAMPLE_TEXT when no other input is provided."),
        ("--diff", "Show a unified diff between the original and corrupted text."),
        ("--list", "List available glitchlings and exit."),
    )
    for flag, help_text in switches:
        add(flag, action="store_true", help=help_text)

    return parser
81
+
82
+
83
def list_glitchlings() -> None:
    """Print one line per default glitchling: right-aligned name, scope and order."""

    entries = (BUILTIN_GLITCHLINGS[key] for key in DEFAULT_GLITCHLING_NAMES)
    for entry in entries:
        scope = entry.level.name.title()
        order = entry.order.name.lower()
        print(f"{entry.name:>{MAX_NAME_WIDTH}} — scope: {scope}, order: {order}")
92
+
93
+
94
def read_text(args: argparse.Namespace, parser: argparse.ArgumentParser) -> str:
    """Resolve the input text based on CLI arguments.

    Sources are tried in order: ``--file``, the positional ``text`` argument,
    piped stdin, then ``--sample``. When ``--sample`` is set and stdin is not
    a TTY but yields nothing, the sample text is used instead of an empty
    string, matching the flag's "when no other input is provided" contract.

    Args:
        args: Parsed arguments from the CLI.
        parser: The argument parser used for emitting user-facing errors.

    Returns:
        str: The text to corrupt.

    Raises:
        SystemExit: Raised indirectly via ``parser.error`` on failure.
    """

    if args.file is not None:
        try:
            return args.file.read_text(encoding="utf-8")
        except OSError as exc:
            filename = getattr(exc, "filename", None) or args.file
            reason = exc.strerror or str(exc)
            parser.error(f"Failed to read file {filename}: {reason}")
        except UnicodeDecodeError as exc:
            # Not an OSError: report non-UTF-8 input as a usage error, not a traceback.
            parser.error(f"Failed to read file {args.file}: {exc}")
        raise AssertionError("parser.error should exit")

    if args.text:
        return args.text

    if not sys.stdin.isatty():
        piped = sys.stdin.read()
        # Empty piped input only defers to --sample; without the flag the
        # (possibly empty) stdin content is still returned.
        if piped or not args.sample:
            return piped

    if args.sample:
        return SAMPLE_TEXT

    parser.error(
        "No input text provided. Supply text as an argument, use --file, pipe input, or pass --sample."
    )
    raise AssertionError("parser.error should exit")
129
+
130
+
131
def summon_glitchlings(
    names: list[str] | None, parser: argparse.ArgumentParser, seed: int
) -> Gaggle:
    """Parse the requested glitchling specs and summon them as a ``Gaggle``.

    With no names, ``DEFAULT_GLITCHLING_NAMES`` is summoned instead. Any
    ``ValueError`` raised while parsing or summoning is reported through
    ``parser.error``.
    """

    specs: list[str | Glitchling]
    if not names:
        specs = DEFAULT_GLITCHLING_NAMES
    else:
        specs = []
        for raw_spec in names:
            try:
                parsed = parse_glitchling_spec(raw_spec)
            except ValueError as exc:
                parser.error(str(exc))
                raise AssertionError("parser.error should exit")
            specs.append(parsed)

    try:
        gaggle = summon(specs, seed=seed)
    except ValueError as exc:
        parser.error(str(exc))
        raise AssertionError("parser.error should exit")
    return gaggle
152
+
153
+
154
+
155
def show_diff(original: str, corrupted: str) -> None:
    """Display a unified diff between the original and corrupted text.

    Prints ``No changes detected.`` when the two texts are identical.

    Args:
        original: Text before corruption.
        corrupted: Text after corruption.
    """

    # Line endings are kept for the comparison so newline-only changes are
    # still detected.
    diff_lines = list(
        difflib.unified_diff(
            original.splitlines(keepends=True),
            corrupted.splitlines(keepends=True),
            fromfile="original",
            tofile="corrupted",
            lineterm="",
        )
    )
    if not diff_lines:
        print("No changes detected.")
        return

    for line in diff_lines:
        # Header lines carry no terminator (lineterm="") while content lines
        # keep theirs; strip it so ``print`` emits exactly one newline per line
        # instead of double-spacing the hunk body.
        print(line.rstrip("\r\n"))
172
+
173
+
174
def run_cli(args: argparse.Namespace, parser: argparse.ArgumentParser) -> int:
    """Execute the CLI workflow using the provided arguments.

    ``--list`` short-circuits everything else. Otherwise the input text is
    resolved, the glitchlings are summoned and applied, and either a diff or
    the corrupted text is printed.

    Args:
        args: Parsed CLI arguments.
        parser: Argument parser used for error reporting.

    Returns:
        int: Exit code for the process (``0`` on success).
    """

    if args.list:
        list_glitchlings()
        return 0

    source_text = read_text(args, parser)
    corrupt = summon_glitchlings(args.glitchlings, parser, args.seed)
    result = corrupt(source_text)

    if not args.diff:
        print(result)
        return 0

    show_diff(source_text, result)
    return 0
200
+
201
+
202
def main(argv: list[str] | None = None) -> int:
    """Entry point for the ``glitchlings`` command line interface.

    Args:
        argv: Optional list of command line arguments. Defaults to ``sys.argv``.

    Returns:
        int: Exit code suitable for use with ``sys.exit``.
    """

    cli_parser = build_parser()
    parsed = cli_parser.parse_args(argv)
    return run_cli(parsed, cli_parser)
215
+
216
+
217
+ if __name__ == "__main__":
218
+ sys.exit(main())