glitchlings 0.10.2__cp312-cp312-macosx_11_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of glitchlings might be problematic. Click here for more details.

Files changed (83)
  1. glitchlings/__init__.py +99 -0
  2. glitchlings/__main__.py +8 -0
  3. glitchlings/_zoo_rust/__init__.py +12 -0
  4. glitchlings/_zoo_rust.cpython-312-darwin.so +0 -0
  5. glitchlings/assets/__init__.py +180 -0
  6. glitchlings/assets/apostrofae_pairs.json +32 -0
  7. glitchlings/assets/ekkokin_homophones.json +2014 -0
  8. glitchlings/assets/hokey_assets.json +193 -0
  9. glitchlings/assets/lexemes/academic.json +1049 -0
  10. glitchlings/assets/lexemes/colors.json +1333 -0
  11. glitchlings/assets/lexemes/corporate.json +716 -0
  12. glitchlings/assets/lexemes/cyberpunk.json +22 -0
  13. glitchlings/assets/lexemes/lovecraftian.json +23 -0
  14. glitchlings/assets/lexemes/synonyms.json +3354 -0
  15. glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
  16. glitchlings/assets/ocr_confusions.tsv +30 -0
  17. glitchlings/assets/pipeline_assets.json +29 -0
  18. glitchlings/attack/__init__.py +147 -0
  19. glitchlings/attack/analysis.py +1321 -0
  20. glitchlings/attack/core.py +493 -0
  21. glitchlings/attack/core_execution.py +367 -0
  22. glitchlings/attack/core_planning.py +612 -0
  23. glitchlings/attack/encode.py +114 -0
  24. glitchlings/attack/metrics.py +218 -0
  25. glitchlings/attack/metrics_dispatch.py +70 -0
  26. glitchlings/attack/tokenization.py +227 -0
  27. glitchlings/auggie.py +284 -0
  28. glitchlings/compat/__init__.py +9 -0
  29. glitchlings/compat/loaders.py +355 -0
  30. glitchlings/compat/types.py +41 -0
  31. glitchlings/conf/__init__.py +41 -0
  32. glitchlings/conf/loaders.py +331 -0
  33. glitchlings/conf/schema.py +156 -0
  34. glitchlings/conf/types.py +72 -0
  35. glitchlings/config.toml +2 -0
  36. glitchlings/constants.py +59 -0
  37. glitchlings/dev/__init__.py +3 -0
  38. glitchlings/dev/docs.py +45 -0
  39. glitchlings/dlc/__init__.py +19 -0
  40. glitchlings/dlc/_shared.py +296 -0
  41. glitchlings/dlc/gutenberg.py +400 -0
  42. glitchlings/dlc/huggingface.py +68 -0
  43. glitchlings/dlc/prime.py +215 -0
  44. glitchlings/dlc/pytorch.py +98 -0
  45. glitchlings/dlc/pytorch_lightning.py +173 -0
  46. glitchlings/internal/__init__.py +16 -0
  47. glitchlings/internal/rust.py +159 -0
  48. glitchlings/internal/rust_ffi.py +490 -0
  49. glitchlings/main.py +426 -0
  50. glitchlings/protocols.py +91 -0
  51. glitchlings/runtime_config.py +24 -0
  52. glitchlings/util/__init__.py +27 -0
  53. glitchlings/util/adapters.py +65 -0
  54. glitchlings/util/keyboards.py +356 -0
  55. glitchlings/util/transcripts.py +108 -0
  56. glitchlings/zoo/__init__.py +161 -0
  57. glitchlings/zoo/assets/__init__.py +29 -0
  58. glitchlings/zoo/core.py +678 -0
  59. glitchlings/zoo/core_execution.py +154 -0
  60. glitchlings/zoo/core_planning.py +451 -0
  61. glitchlings/zoo/corrupt_dispatch.py +295 -0
  62. glitchlings/zoo/hokey.py +139 -0
  63. glitchlings/zoo/jargoyle.py +243 -0
  64. glitchlings/zoo/mim1c.py +148 -0
  65. glitchlings/zoo/pedant/__init__.py +109 -0
  66. glitchlings/zoo/pedant/core.py +105 -0
  67. glitchlings/zoo/pedant/forms.py +74 -0
  68. glitchlings/zoo/pedant/stones.py +74 -0
  69. glitchlings/zoo/redactyl.py +97 -0
  70. glitchlings/zoo/rng.py +259 -0
  71. glitchlings/zoo/rushmore.py +416 -0
  72. glitchlings/zoo/scannequin.py +66 -0
  73. glitchlings/zoo/transforms.py +346 -0
  74. glitchlings/zoo/typogre.py +128 -0
  75. glitchlings/zoo/validation.py +477 -0
  76. glitchlings/zoo/wherewolf.py +120 -0
  77. glitchlings/zoo/zeedub.py +93 -0
  78. glitchlings-0.10.2.dist-info/METADATA +337 -0
  79. glitchlings-0.10.2.dist-info/RECORD +83 -0
  80. glitchlings-0.10.2.dist-info/WHEEL +5 -0
  81. glitchlings-0.10.2.dist-info/entry_points.txt +3 -0
  82. glitchlings-0.10.2.dist-info/licenses/LICENSE +201 -0
  83. glitchlings-0.10.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,400 @@
1
+ """Integration helpers for the py-gutenberg library.
2
+
3
+ This module provides a wrapper around the GutenbergAPI that applies
4
+ glitchlings to book text as it's fetched.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from collections.abc import Iterable
10
+ from dataclasses import dataclass, field
11
+ from functools import cached_property
12
+ from typing import Any, Protocol, TypeAlias, cast
13
+
14
+ from ..util.adapters import coerce_gaggle
15
+ from ..zoo import Gaggle, Glitchling
16
+ from ._shared import corrupt_text_value
17
+
18
+ #: Default Gutendex API instance URL (public instance hosted at gutendex.com).
19
+ DEFAULT_GUTENDEX_URL = "https://gutendex.com"
20
+
21
+
22
class PersonProtocol(Protocol):
    """Minimal interface for py-gutenberg Person objects.

    Only the ``name`` attribute is declared here; it is the single field of a
    person that this module reads (when building ``GlitchedBook.__repr__``).
    """

    # Display name of the author or translator.
    name: str
26
+
27
+
28
class BookProtocol(Protocol):
    """Minimal interface for py-gutenberg Book objects.

    Declares the metadata attributes that ``GlitchedBook.from_book`` copies
    and the ``get_text`` method used to fetch the book body.
    """

    # Metadata attributes copied verbatim onto GlitchedBook instances.
    id: int
    title: str
    authors: list[PersonProtocol]
    translators: list[PersonProtocol]
    subjects: list[str]
    bookshelves: list[str]
    languages: list[str]
    copyright: bool
    media_type: str
    formats: dict[str, str]
    download_count: int

    # Returns the full text of the book as a single string.
    def get_text(self) -> str: ...
44
+
45
+
46
class GutenbergAPIProtocol(Protocol):
    """Subset of the py-gutenberg API we rely on.

    Each method declared here has a same-named wrapper on ``GlitchenbergAPI``
    that forwards the call and corrupts the returned book(s).
    """

    # Base URL of the Gutendex instance the client talks to.
    instance_url: str

    # --- Queries returning collections of books ---------------------------

    def get_all_books(self) -> Iterable[BookProtocol]: ...

    def get_public_domain_books(self) -> Iterable[BookProtocol]: ...

    def get_copyrighted_books(self) -> Iterable[BookProtocol]: ...

    def get_books_by_author(self, author: str) -> Iterable[BookProtocol]: ...

    def get_books_by_ids(self, ids: list[int]) -> Iterable[BookProtocol]: ...

    def get_books_by_language(self, languages: list[str]) -> Iterable[BookProtocol]: ...

    def get_books_by_search(self, query: str) -> Iterable[BookProtocol]: ...

    def get_books_by_mime_type(self, mime_type: str) -> Iterable[BookProtocol]: ...

    def get_books_ascending(self) -> Iterable[BookProtocol]: ...

    def get_oldest(self) -> Iterable[BookProtocol]: ...

    def get_latest(self, topic: str = "recent") -> Iterable[BookProtocol]: ...

    # --- Queries returning a single book ----------------------------------

    def get_book(self, book_id: int) -> BookProtocol: ...

    def get_book_metadata(self, book_id: int) -> BookProtocol: ...

    # NOTE(review): declared as returning a book object, not a str; confirm
    # against the py-gutenberg API.
    def get_book_text(self, book_id: int) -> BookProtocol: ...
78
+
79
+
80
+ Person: TypeAlias = PersonProtocol
81
+ Book: TypeAlias = BookProtocol
82
+ GutenbergAPI: TypeAlias = GutenbergAPIProtocol
83
+
84
+
85
@dataclass
class GlitchedBook:
    """A Book wrapper that corrupts text content via glitchlings.

    The wrapper copies the metadata of a py-gutenberg Book, stores a corrupted
    version of its title, and corrupts the full text on demand. Attributes
    that are not defined on the wrapper are looked up on the original book.

    Attributes:
        id: The Gutenberg book ID.
        title: The corrupted book title.
        original_title: The original (uncorrupted) book title.
        authors: List of book authors.
        translators: List of book translators.
        subjects: List of subject categories.
        bookshelves: List of bookshelf categories.
        languages: List of language codes.
        copyright: Whether the book is under copyright.
        media_type: The media type of the book.
        formats: Dictionary mapping MIME types to download URLs.
        download_count: Number of times the book has been downloaded.
    """

    id: int
    title: str
    original_title: str
    authors: list[Person]
    translators: list[Person]
    subjects: list[str]
    bookshelves: list[str]
    languages: list[str]
    copyright: bool
    media_type: str
    formats: dict[str, str]
    download_count: int
    # Wrapped book and corruption pipeline; hidden from the generated repr.
    _original_book: Book = field(repr=False)
    _gaggle: Gaggle = field(repr=False)

    @classmethod
    def from_book(cls, book: Book, gaggle: Gaggle) -> GlitchedBook:
        """Create a GlitchedBook from a py-gutenberg Book.

        The title is corrupted immediately; the body text is only fetched and
        corrupted when :meth:`get_text` is first called.

        Args:
            book: The original Book object from py-gutenberg.
            gaggle: The gaggle of glitchlings to apply to text.

        Returns:
            A GlitchedBook that corrupts text with the provided gaggle.
        """
        # Use shared utility for consistent corruption; cast tells mypy this is str
        corrupted_title = cast(str, corrupt_text_value(book.title, gaggle))
        return cls(
            id=book.id,
            title=corrupted_title,
            original_title=book.title,
            authors=book.authors,
            translators=book.translators,
            subjects=book.subjects,
            bookshelves=book.bookshelves,
            languages=book.languages,
            copyright=book.copyright,
            media_type=book.media_type,
            formats=book.formats,
            download_count=book.download_count,
            _original_book=book,
            _gaggle=gaggle,
        )

    @cached_property
    def _text_content(self) -> str:
        """Lazily fetch and corrupt the full text content of the book."""
        original_text: str = self._original_book.get_text()
        return cast(str, corrupt_text_value(original_text, self._gaggle))

    def get_text(self) -> str:
        """Fetch and corrupt the full text content of the book.

        The text is fetched from the wrapped book and corrupted on the first
        call; the result is cached and returned unchanged on later calls.

        Returns:
            The corrupted full text of the book.

        Raises:
            AttributeError: If the underlying Book doesn't support get_text().
        """
        return self._text_content

    def __repr__(self) -> str:
        """Return a concise representation of the GlitchedBook."""
        return (
            f"GlitchedBook(id={self.id}, title={self.title!r}, "
            f"authors={[a.name for a in self.authors]!r})"
        )

    def __getattr__(self, name: str) -> Any:
        """Delegate attribute access to the original book.

        Python calls this hook only after normal attribute lookup has failed.

        Raises:
            AttributeError: If the wrapped book is not set yet, or if it does
                not provide ``name`` either.
        """
        # If ``_original_book`` itself is missing (e.g. on the blank instance
        # that copy/pickle create before restoring state), reading it below
        # would re-enter this method forever. Fail with AttributeError so
        # hasattr()/getattr() probes behave normally.
        if name == "_original_book":
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            )
        return getattr(self._original_book, name)
183
+
184
+
185
class GlitchenbergAPI:
    """A wrapper around GutenbergAPI that corrupts book text with glitchlings.

    This class provides the same interface as GutenbergAPI but wraps every
    returned book in a :class:`GlitchedBook`. Attributes not defined here are
    looked up on the underlying API object.

    Example:
        >>> from glitchlings.dlc.gutenberg import GlitchenbergAPI
        >>> from glitchlings import Typogre
        >>> api = GlitchenbergAPI(Typogre(rate=0.05), seed=42)
        >>> book = api.get_book(1342)  # Pride and Prejudice
        >>> print(book.title)  # Title will have typos applied
    """

    def __init__(
        self,
        glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling],
        *,
        seed: int = 151,
        instance_url: str = DEFAULT_GUTENDEX_URL,
    ) -> None:
        """Initialize the GlitchenbergAPI.

        Args:
            glitchlings: A glitchling, gaggle, or specification of glitchlings to apply.
            seed: RNG seed for deterministic corruption (default: 151).
            instance_url: The Gutendex instance URL to use for API requests.
                Defaults to the public instance at gutendex.com. For production use,
                consider self-hosting Gutendex.

        Raises:
            ImportError: If py-gutenberg is not installed.
        """
        self._gaggle = coerce_gaggle(glitchlings, seed=seed)
        self._api = _get_gutenberg_api(instance_url)

    @property
    def instance_url(self) -> str:
        """Return the Gutendex instance URL."""
        return str(self._api.instance_url)

    @property
    def gaggle(self) -> Gaggle:
        """Return the gaggle used for corruption."""
        return self._gaggle

    def _corrupt_book(self, book: Book) -> GlitchedBook:
        """Apply glitchlings to a Book object."""
        return GlitchedBook.from_book(book, self._gaggle)

    def _corrupt_books(self, books: Iterable[Book]) -> list[GlitchedBook]:
        """Apply glitchlings to every Book in an iterable, returning a list."""
        return [self._corrupt_book(book) for book in books]

    def corrupt_books(self, books: Iterable[Book]) -> list[GlitchedBook]:
        """Apply glitchlings to a collection of Book objects.

        This method allows batch corruption of books fetched from other sources
        or the underlying API.

        Args:
            books: Iterable of py-gutenberg Book objects to corrupt. Any
                iterable is accepted, including the result of a call on the
                underlying API; it is consumed once.

        Returns:
            List of GlitchedBook objects with corrupted text.

        Example:
            >>> # Fetch from underlying API and corrupt separately
            >>> raw_books = api._api.get_books_by_author("Austen")
            >>> glitched = api.corrupt_books(raw_books)
        """
        return self._corrupt_books(books)

    # Methods that return lists of books
    def get_all_books(self) -> list[GlitchedBook]:
        """Get all books with glitchling corruption applied."""
        return self._corrupt_books(self._api.get_all_books())

    def get_public_domain_books(self) -> list[GlitchedBook]:
        """Get public domain books with glitchling corruption applied."""
        return self._corrupt_books(self._api.get_public_domain_books())

    def get_copyrighted_books(self) -> list[GlitchedBook]:
        """Get copyrighted books with glitchling corruption applied."""
        return self._corrupt_books(self._api.get_copyrighted_books())

    def get_books_by_author(self, author: str) -> list[GlitchedBook]:
        """Get books by author with glitchling corruption applied.

        Args:
            author: Author name to search for.

        Returns:
            List of GlitchedBook objects with corrupted text.
        """
        return self._corrupt_books(self._api.get_books_by_author(author))

    def get_books_by_ids(self, ids: list[int]) -> list[GlitchedBook]:
        """Get books by IDs with glitchling corruption applied.

        Args:
            ids: List of Gutenberg book IDs to retrieve.

        Returns:
            List of GlitchedBook objects with corrupted text.
        """
        return self._corrupt_books(self._api.get_books_by_ids(ids))

    def get_books_by_language(self, languages: list[str]) -> list[GlitchedBook]:
        """Get books by language with glitchling corruption applied.

        Args:
            languages: List of language codes (e.g., ["en", "fr"]).

        Returns:
            List of GlitchedBook objects with corrupted text.
        """
        return self._corrupt_books(self._api.get_books_by_language(languages))

    def get_books_by_search(self, query: str) -> list[GlitchedBook]:
        """Search for books with glitchling corruption applied.

        Args:
            query: Search query string.

        Returns:
            List of GlitchedBook objects with corrupted text.
        """
        return self._corrupt_books(self._api.get_books_by_search(query))

    def get_books_by_mime_type(self, mime_type: str) -> list[GlitchedBook]:
        """Get books by MIME type with glitchling corruption applied.

        Args:
            mime_type: MIME type filter (e.g., "text/plain").

        Returns:
            List of GlitchedBook objects with corrupted text.
        """
        return self._corrupt_books(self._api.get_books_by_mime_type(mime_type))

    def get_books_ascending(self) -> list[GlitchedBook]:
        """Get books sorted ascending with glitchling corruption applied."""
        return self._corrupt_books(self._api.get_books_ascending())

    def get_oldest(self) -> list[GlitchedBook]:
        """Get oldest books with glitchling corruption applied."""
        return self._corrupt_books(self._api.get_oldest())

    def get_latest(self, topic: str = "recent") -> list[GlitchedBook]:
        """Get latest books by topic with glitchling corruption applied.

        Args:
            topic: Topic string to filter books by (e.g., "fiction", "science").
                Defaults to "recent".

        Returns:
            List of GlitchedBook objects with corrupted text.
        """
        return self._corrupt_books(self._api.get_latest(topic))

    # Methods that return single books
    def get_book(self, book_id: int) -> GlitchedBook:
        """Get a book by ID with glitchling corruption applied.

        Args:
            book_id: Gutenberg book ID.

        Returns:
            GlitchedBook with corrupted text.
        """
        return self._corrupt_book(self._api.get_book(book_id))

    def get_book_metadata(self, book_id: int) -> GlitchedBook:
        """Get book metadata by ID with glitchling corruption applied.

        Args:
            book_id: Gutenberg book ID.

        Returns:
            GlitchedBook with corrupted metadata.
        """
        return self._corrupt_book(self._api.get_book_metadata(book_id))

    def get_book_text(self, book_id: int) -> GlitchedBook:
        """Get book text by ID with glitchling corruption applied.

        Args:
            book_id: Gutenberg book ID.

        Returns:
            GlitchedBook with corrupted text.
        """
        return self._corrupt_book(self._api.get_book_text(book_id))

    def __getattr__(self, name: str) -> Any:
        """Delegate attribute access to the underlying API.

        Python calls this hook only after normal attribute lookup has failed.

        Raises:
            AttributeError: If the underlying API is not set yet, or if it
                does not provide ``name`` either.
        """
        # If ``_api`` itself is missing (``__init__`` failed part-way, or the
        # instance was created via ``__new__`` by copy/pickle), reading it
        # below would re-enter this method forever. Fail with AttributeError.
        if name == "_api":
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            )
        return getattr(self._api, name)
380
+
381
+
382
+ def _get_gutenberg_api(instance_url: str) -> GutenbergAPI:
383
+ """Import and return a GutenbergAPI instance.
384
+
385
+ Raises:
386
+ ImportError: If py-gutenberg is not installed.
387
+ """
388
+ try:
389
+ from gutenberg import GutenbergAPI
390
+ except ImportError as exc:
391
+ raise ImportError(
392
+ "py-gutenberg is required for the GlitchenbergAPI integration. "
393
+ "Install it with: pip install py-gutenberg"
394
+ ) from exc
395
+
396
+ api = GutenbergAPI(instance_url=instance_url)
397
+ return cast(GutenbergAPIProtocol, api)
398
+
399
+
400
+ __all__ = ["DEFAULT_GUTENDEX_URL", "GlitchenbergAPI", "GlitchedBook"]
@@ -0,0 +1,68 @@
1
+ """Integration helpers for the Hugging Face datasets library."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Iterable, Sequence
6
+ from typing import Any
7
+
8
+ from ..util.adapters import coerce_gaggle
9
+ from ..zoo import Gaggle, Glitchling
10
+
11
+
12
+ def _normalize_columns(column: str | Sequence[str]) -> list[str]:
13
+ """Normalize a column specification to a list."""
14
+ if isinstance(column, str):
15
+ return [column]
16
+
17
+ normalized = list(column)
18
+ if not normalized:
19
+ raise ValueError("At least one column must be specified")
20
+ return normalized
21
+
22
+
23
def _glitch_dataset(
    dataset: Any,
    glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling],
    column: str | Sequence[str],
    *,
    seed: int = 151,
) -> Any:
    """Corrupt the requested columns of ``dataset`` with the given glitchlings.

    Args:
        dataset: Dataset object passed through to ``Gaggle.corrupt_dataset``.
        glitchlings: A glitchling, gaggle, or specification of glitchlings.
        column: One column name or a sequence of column names.
        seed: RNG seed forwarded to ``coerce_gaggle``.

    Returns:
        Whatever ``Gaggle.corrupt_dataset`` returns for the dataset.

    Raises:
        ValueError: If ``column`` is an empty sequence.
    """
    # Validate the column spec first so a bad spec fails before any gaggle
    # is constructed.
    target_columns = _normalize_columns(column)
    pipeline = coerce_gaggle(glitchlings, seed=seed)
    return pipeline.corrupt_dataset(dataset, target_columns)
34
+
35
+
36
def GlitchedDataset(
    dataset: Any,
    glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling],
    *,
    column: str | Sequence[str],
    seed: int = 151,
) -> Any:
    """Return a corrupted copy of a Hugging Face dataset.

    Public entry point of this module: it forwards its arguments to the
    private ``_glitch_dataset`` helper, which normalizes the column
    specification, builds the gaggle and calls ``Gaggle.corrupt_dataset``.
    NOTE(review): whether corruption is applied lazily on access depends on
    ``Gaggle.corrupt_dataset``; confirm there.

    Args:
        dataset: The Hugging Face Dataset to corrupt.
        glitchlings: A glitchling, gaggle, or specification of glitchlings to apply.
        column: The column name (string) or names (sequence of strings) to corrupt.
        seed: RNG seed for deterministic corruption (default: 151).

    Returns:
        A new dataset with the specified columns corrupted by the glitchlings.

    Raises:
        ValueError: If ``column`` is an empty sequence.

    Example:
        >>> from datasets import Dataset
        >>> from glitchlings.dlc.huggingface import GlitchedDataset
        >>> dataset = Dataset.from_dict({"text": ["hello", "world"]})
        >>> corrupted = GlitchedDataset(dataset, "typogre", column="text")
        >>> list(corrupted)
        [{'text': 'helo'}, {'text': 'wrold'}]
    """
    return _glitch_dataset(dataset, glitchlings, column=column, seed=seed)
66
+
67
+
68
+ __all__ = ["GlitchedDataset"]
@@ -0,0 +1,215 @@
1
+ """Integration helpers for the optional verifiers prime DLC."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Iterable, Sequence
6
+ from typing import Any, Callable, Protocol, cast
7
+
8
+ from ..compat.loaders import require_datasets, require_jellyfish, require_verifiers
9
+ from ..util.adapters import coerce_gaggle
10
+ from ..zoo import Gaggle, Glitchling, Mim1c, Typogre # noqa: F401
11
+ from ._shared import resolve_columns as _resolve_columns_shared
12
+
13
+
14
class VerifierEnvironment(Protocol):
    """Minimal interface for verifiers environments.

    Only the ``dataset`` attribute is declared; ``load_environment`` reads it
    and assigns the corrupted dataset back to it.
    """

    # Dataset attached to the environment; left untyped (Any).
    dataset: Any
18
+
19
+
20
class VerifierSingleTurnEnv(Protocol):
    """Minimal interface for single-turn verifier environments.

    Used as the declared return type of ``echo_chamber``.
    """

    # Dataset the environment serves.
    dataset: Any
    # Scoring rubric attached to the environment.
    rubric: Any
25
+
26
+
27
+ vf = require_verifiers("verifiers is not installed; install glitchlings[prime]")
28
+ _jellyfish = require_jellyfish("jellyfish is not installed; install glitchlings[prime]")
29
+ levenshtein_distance = _jellyfish.levenshtein_distance
30
+
31
+
32
def _resolve_environment(env: str | VerifierEnvironment) -> VerifierEnvironment:
    """Turn an environment name or object into a verified environment object.

    Args:
        env: An environment identifier, which is loaded through
            ``vf.load_environment``, or an already constructed environment.

    Returns:
        The environment object, typed as ``VerifierEnvironment``.

    Raises:
        TypeError: If the resulting object is not an instance of
            ``vf.Environment``.
    """
    candidate = vf.load_environment(env) if isinstance(env, str) else env

    # ``vf`` is loaded dynamically, so the cast gives isinstance() a class
    # type the type checker accepts.
    environment_cls = cast(type[Any], vf.Environment)
    if isinstance(candidate, environment_cls):
        return cast(VerifierEnvironment, candidate)

    raise TypeError("Invalid environment type")
41
+
42
+
43
def _resolve_columns(dataset: Any, columns: Sequence[str] | None) -> list[str]:
    """Identify which dataset columns should be corrupted.

    Thin module-local wrapper: forwards both arguments unchanged to the
    shared ``resolve_columns`` helper imported from ``._shared`` and returns
    its result.
    """
    return _resolve_columns_shared(dataset, columns)
46
+
47
+
48
def load_environment(
    env: str | VerifierEnvironment,
    glitchlings: Iterable[str | Glitchling] | Glitchling | str | Gaggle | None = None,
    *,
    seed: int = 151,
    columns: Sequence[str] | None = None,
) -> VerifierEnvironment:
    """Load an environment and optionally corrupt its dataset with glitchlings.

    Args:
        env: Environment identifier or an already constructed environment.
        glitchlings: Glitchling specification; when ``None`` the environment
            is returned without touching its dataset.
        seed: RNG seed forwarded to ``coerce_gaggle``.
        columns: Columns to corrupt, or ``None`` to let ``_resolve_columns``
            choose them.

    Returns:
        The resolved environment. When glitchlings are supplied, its
        ``dataset`` attribute is replaced in place by the corrupted dataset.

    Raises:
        TypeError: If ``env`` does not resolve to a verifiers environment.
    """
    environment = _resolve_environment(env)

    if glitchlings is not None:
        gaggle = coerce_gaggle(glitchlings, seed=seed)
        source_dataset = environment.dataset
        target_columns = _resolve_columns(source_dataset, columns)
        # The same environment object is returned; only its dataset changes.
        environment.dataset = gaggle.corrupt_dataset(source_dataset, target_columns)

    return environment
67
+
68
+
69
def _as_gaggle(
    glitchlings: Iterable[str | Glitchling] | Glitchling | str | Gaggle,
    *,
    seed: int,
) -> Gaggle:
    """Coerce any supported glitchling specification into a :class:`Gaggle`.

    Thin wrapper that forwards both arguments to ``coerce_gaggle``; unlike
    the public entry points in this module, ``seed`` has no default here.
    """
    return coerce_gaggle(glitchlings, seed=seed)
76
+
77
+
78
+ def _extract_completion_text(completion: Any) -> str:
79
+ """Normalize a completion payload into a plain string."""
80
+ if isinstance(completion, str):
81
+ return completion
82
+
83
+ if isinstance(completion, list) and completion:
84
+ first = completion[0]
85
+ if isinstance(first, dict) and "content" in first:
86
+ return str(first["content"])
87
+ return str(first)
88
+
89
+ return str(completion)
90
+
91
+
92
def normalized_edit_distance(
    _: Any,
    completion: Any,
    answer: str,
) -> float:
    """Return ``1 - (distance / max_len)`` using Levenshtein distance.

    Args:
        _: Ignored positional argument; present so the function matches the
            calling convention used for reward functions.
        completion: Completion payload; reduced to text with
            ``_extract_completion_text``.
        answer: Reference text. A falsy value is treated as ``""``.

    Returns:
        A similarity score clamped to the range ``[0.0, 1.0]``.
    """
    predicted = _extract_completion_text(completion)
    expected = answer if answer else ""

    # The floor of 1 avoids dividing by zero when both strings are empty.
    longest = max(len(predicted), len(expected), 1)
    edits = cast(int, levenshtein_distance(predicted, expected))

    similarity = 1.0 - edits / longest
    return min(1.0, max(0.0, similarity))
104
+
105
+
106
+ symmetric_levenshtein_similarity = normalized_edit_distance
107
+
108
+ DEFAULT_CLEANUP_INSTRUCTIONS = (
109
+ "You are a meticulous copy editor. Restore the provided text to its original form."
110
+ )
111
+
112
+
113
def echo_chamber(
    dataset_id: str,
    column: str,
    glitchlings: Iterable[str | Glitchling] | Glitchling | str | Gaggle,
    *,
    seed: int = 151,
    instructions: str = DEFAULT_CLEANUP_INSTRUCTIONS,
    reward_function: Callable[..., float] | None = None,
    split: str | None = None,
    **load_dataset_kwargs: Any,
) -> VerifierSingleTurnEnv:
    """Create an Echo Chamber Prime environment from a Hugging Face dataset column.

    The dataset is loaded, rows whose ``column`` value is ``None`` are
    dropped, and every remaining row is turned into a two-message prompt
    (system instructions plus the row text) with the row text as the answer.
    The ``"prompt"`` column is then corrupted by the glitchlings and the
    result is wrapped in a ``vf.SingleTurnEnv`` with a single-function rubric.

    Args:
        dataset_id: Identifier of the Hugging Face dataset to load.
        column: Name of the column whose text should be glitched.
        glitchlings: Glitchling specifiers that will corrupt the prompts.
        seed: RNG seed forwarded to :func:`glitchlings.util.adapters.coerce_gaggle`.
        instructions: System instructions supplied to the environment prompts.
        reward_function: Optional callable used to score completions. Defaults to
            :func:`symmetric_levenshtein_similarity` when omitted.
        split: Optional dataset split to load. When omitted and the dataset
            loads as a dataset dict, the first split in iteration order is used.
        **load_dataset_kwargs: Extra keyword arguments forwarded to
            :func:`datasets.load_dataset`.

    Returns:
        A single-turn verifier environment over the glitched dataset.

    Raises:
        ModuleNotFoundError: If the datasets module exposes no ``load_dataset``.
        ValueError: If the dataset has no splits, still resolves to a dataset
            dict, or yields no rows with a non-``None`` value in ``column``.
    """
    datasets_module = require_datasets("datasets is required to build an echo chamber")
    load_dataset = getattr(datasets_module, "load_dataset", None)
    if load_dataset is None:  # pragma: no cover - defensive
        message = "datasets is required to build an echo chamber"
        raise ModuleNotFoundError(message)

    # Falls back to plain ``dict`` when the module has no DatasetDict class.
    dataset_dict_cls = getattr(datasets_module, "DatasetDict", dict)

    hf_dataset: Any
    if split is None:
        hf_dataset = load_dataset(dataset_id, **load_dataset_kwargs)
        if isinstance(hf_dataset, dataset_dict_cls):
            # No split requested: take the first split in iteration order.
            try:
                hf_dataset = next(iter(hf_dataset.values()))
            except StopIteration as exc:  # pragma: no cover - defensive
                raise ValueError("The specified dataset does not contain any splits") from exc
    else:
        hf_dataset = load_dataset(dataset_id, split=split, **load_dataset_kwargs)

    # NOTE(review): looks like a defensive guard for a load that still
    # resolves to a dataset dict after the split handling above; confirm.
    if isinstance(hf_dataset, dataset_dict_cls):
        raise ValueError("Specify which split to use when the dataset loads as a DatasetDict.")

    # Drop rows that have no value in the requested column.
    filtered_dataset = hf_dataset.filter(
        lambda row: row.get(column) is not None,
        load_from_cache_file=False,
    )

    # Captured so that map() below can remove every original column.
    source_column_names = list(filtered_dataset.column_names)

    def _build_prompt(row: dict[str, Any]) -> dict[str, Any]:
        # The same text is used for the user message and the expected answer;
        # only the "prompt" column is corrupted later.
        text = str(row[column])
        prompt = [
            {"role": "system", "content": instructions},
            {"role": "user", "content": f"Corrupted text:\n{text}"},
        ]
        return {"prompt": prompt, "answer": text}

    base_dataset = filtered_dataset.map(
        _build_prompt,
        remove_columns=source_column_names,
        load_from_cache_file=False,
    )

    # Emptiness check: prefer len(); for objects without a length, peek at
    # the first row via take(1) when available, otherwise via iteration.
    try:
        dataset_length = len(base_dataset)
    except TypeError:
        preview_rows: list[dict[str, Any]]
        take_fn = getattr(base_dataset, "take", None)
        if callable(take_fn):
            preview_rows = list(take_fn(1))
        else:
            iterator = iter(base_dataset)
            try:
                first_row = next(iterator)
            except StopIteration:
                preview_rows = []
            else:
                preview_rows = [first_row]
        if not preview_rows:
            raise ValueError(
                f"Column '{column}' did not yield any textual entries in dataset '{dataset_id}'."
            )
    else:
        if dataset_length == 0:
            raise ValueError(
                f"Column '{column}' did not yield any textual entries in dataset '{dataset_id}'."
            )

    gaggle = _as_gaggle(glitchlings, seed=seed)
    # NOTE(review): "prompt" holds a list of message dicts; how
    # corrupt_dataset treats each message is defined elsewhere - confirm.
    glitched_dataset = gaggle.corrupt_dataset(base_dataset, ["prompt"])

    rubric_func = reward_function or normalized_edit_distance
    rubric = vf.Rubric(funcs=[rubric_func], weights=[1.0])
    return cast(
        VerifierSingleTurnEnv,
        vf.SingleTurnEnv(dataset=glitched_dataset, rubric=rubric),
    )