glitchlings 1.0.0__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. glitchlings/__init__.py +101 -0
  2. glitchlings/__main__.py +8 -0
  3. glitchlings/_corruption_engine/__init__.py +12 -0
  4. glitchlings/_corruption_engine.cp313-win_amd64.pyd +0 -0
  5. glitchlings/assets/__init__.py +180 -0
  6. glitchlings/assets/apostrofae_pairs.json +32 -0
  7. glitchlings/assets/ekkokin_homophones.json +2014 -0
  8. glitchlings/assets/hokey_assets.json +193 -0
  9. glitchlings/assets/lexemes/academic.json +1049 -0
  10. glitchlings/assets/lexemes/colors.json +1333 -0
  11. glitchlings/assets/lexemes/corporate.json +716 -0
  12. glitchlings/assets/lexemes/cyberpunk.json +22 -0
  13. glitchlings/assets/lexemes/lovecraftian.json +23 -0
  14. glitchlings/assets/lexemes/synonyms.json +3354 -0
  15. glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
  16. glitchlings/assets/ocr_confusions.tsv +30 -0
  17. glitchlings/assets/pipeline_assets.json +29 -0
  18. glitchlings/attack/__init__.py +184 -0
  19. glitchlings/attack/analysis.py +1321 -0
  20. glitchlings/attack/core.py +819 -0
  21. glitchlings/attack/core_execution.py +378 -0
  22. glitchlings/attack/core_planning.py +612 -0
  23. glitchlings/attack/encode.py +114 -0
  24. glitchlings/attack/metrics.py +211 -0
  25. glitchlings/attack/metrics_dispatch.py +70 -0
  26. glitchlings/attack/tokenization.py +338 -0
  27. glitchlings/attack/tokenizer_metrics.py +373 -0
  28. glitchlings/auggie.py +285 -0
  29. glitchlings/compat/__init__.py +9 -0
  30. glitchlings/compat/loaders.py +355 -0
  31. glitchlings/compat/types.py +41 -0
  32. glitchlings/conf/__init__.py +39 -0
  33. glitchlings/conf/loaders.py +331 -0
  34. glitchlings/conf/schema.py +156 -0
  35. glitchlings/conf/types.py +72 -0
  36. glitchlings/config.toml +2 -0
  37. glitchlings/constants.py +139 -0
  38. glitchlings/dev/__init__.py +3 -0
  39. glitchlings/dev/docs.py +45 -0
  40. glitchlings/dlc/__init__.py +21 -0
  41. glitchlings/dlc/_shared.py +300 -0
  42. glitchlings/dlc/gutenberg.py +400 -0
  43. glitchlings/dlc/huggingface.py +68 -0
  44. glitchlings/dlc/langchain.py +147 -0
  45. glitchlings/dlc/nemo.py +283 -0
  46. glitchlings/dlc/prime.py +215 -0
  47. glitchlings/dlc/pytorch.py +98 -0
  48. glitchlings/dlc/pytorch_lightning.py +173 -0
  49. glitchlings/internal/__init__.py +16 -0
  50. glitchlings/internal/rust.py +159 -0
  51. glitchlings/internal/rust_ffi.py +599 -0
  52. glitchlings/main.py +426 -0
  53. glitchlings/protocols.py +91 -0
  54. glitchlings/runtime_config.py +24 -0
  55. glitchlings/util/__init__.py +41 -0
  56. glitchlings/util/adapters.py +65 -0
  57. glitchlings/util/keyboards.py +508 -0
  58. glitchlings/util/transcripts.py +108 -0
  59. glitchlings/zoo/__init__.py +161 -0
  60. glitchlings/zoo/assets/__init__.py +29 -0
  61. glitchlings/zoo/core.py +852 -0
  62. glitchlings/zoo/core_execution.py +154 -0
  63. glitchlings/zoo/core_planning.py +451 -0
  64. glitchlings/zoo/corrupt_dispatch.py +291 -0
  65. glitchlings/zoo/hokey.py +139 -0
  66. glitchlings/zoo/jargoyle.py +301 -0
  67. glitchlings/zoo/mim1c.py +269 -0
  68. glitchlings/zoo/pedant/__init__.py +109 -0
  69. glitchlings/zoo/pedant/core.py +99 -0
  70. glitchlings/zoo/pedant/forms.py +50 -0
  71. glitchlings/zoo/pedant/stones.py +83 -0
  72. glitchlings/zoo/redactyl.py +94 -0
  73. glitchlings/zoo/rng.py +280 -0
  74. glitchlings/zoo/rushmore.py +416 -0
  75. glitchlings/zoo/scannequin.py +370 -0
  76. glitchlings/zoo/transforms.py +331 -0
  77. glitchlings/zoo/typogre.py +194 -0
  78. glitchlings/zoo/validation.py +643 -0
  79. glitchlings/zoo/wherewolf.py +120 -0
  80. glitchlings/zoo/zeedub.py +165 -0
  81. glitchlings-1.0.0.dist-info/METADATA +404 -0
  82. glitchlings-1.0.0.dist-info/RECORD +86 -0
  83. glitchlings-1.0.0.dist-info/WHEEL +5 -0
  84. glitchlings-1.0.0.dist-info/entry_points.txt +3 -0
  85. glitchlings-1.0.0.dist-info/licenses/LICENSE +201 -0
  86. glitchlings-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,283 @@
1
+ """NVIDIA NeMo DataDesigner plugin for Glitchlings text corruption.
2
+
3
+ This module provides a DataDesigner column generator that applies Glitchlings
4
+ transformations to text columns, enabling deterministic text corruption for
5
+ model robustness testing and adversarial augmentation.
6
+
7
+ The plugin integrates with NeMo's experimental plugin system, exposing
8
+ Glitchlings as a discoverable column generator.
9
+
10
+ Example:
11
+ >>> from glitchlings.dlc.nemo import GlitchlingColumnConfig
12
+ >>> from data_designer import DataDesignerConfigBuilder
13
+ >>> builder = DataDesignerConfigBuilder()
14
+ >>> builder.add_column(
15
+ ... GlitchlingColumnConfig(
16
+ ... name="corrupted_prompt",
17
+ ... source_column="prompt",
18
+ ... glitchlings=["Typogre(rate=0.02)", "Mim1c(rate=0.01)"],
19
+ ... seed=404,
20
+ ... )
21
+ ... )
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ from collections.abc import Sequence
27
+ from pathlib import Path
28
+ from typing import TYPE_CHECKING, Any, Literal, Union
29
+
30
+ from ..auggie import Auggie
31
+ from ..conf.loaders import load_attack_config
32
+ from ..util.adapters import coerce_gaggle
33
+ from ..zoo.core import Gaggle, Glitchling
34
+
35
+ if TYPE_CHECKING: # pragma: no cover - typing only
36
+ import pandas as pd
37
+
38
+ # Type alias for flexible glitchling specification
39
+ GlitchlingSpec = Union[
40
+ Gaggle,
41
+ Auggie,
42
+ Glitchling,
43
+ str,
44
+ Sequence[Union[str, Glitchling]],
45
+ Path,
46
+ ]
47
+
48
+
49
def _resolve_gaggle(
    spec: GlitchlingSpec,
    *,
    seed: int,
) -> Gaggle:
    """Turn any supported glitchling specification into a seeded ``Gaggle``.

    Args:
        spec: What to build the gaggle from. Accepted forms:
            - a ``Gaggle`` or ``Auggie`` instance (cloned with ``seed``)
            - a ``Path`` pointing at an attack config file
            - a string ending in ``.yaml`` / ``.yml`` (treated as a config path)
            - anything else (a ``Glitchling``, a name string, or a sequence
              of those), which is handed to ``coerce_gaggle``
        seed: Seed applied to the resulting gaggle.

    Returns:
        The gaggle produced by ``clone``, ``build_gaggle`` or ``coerce_gaggle``.

    Raises:
        FileNotFoundError: If ``spec`` is a ``.yaml``/``.yml`` string and no
            file exists at that location.

    Errors raised by ``load_attack_config``, ``build_gaggle`` or
    ``coerce_gaggle`` are not intercepted and propagate to the caller.
    """
    # Existing gaggles (including Auggie builders) are never reused directly;
    # a clone carrying the requested seed is returned instead.
    if isinstance(spec, (Auggie, Gaggle)):
        return spec.clone(seed=seed)

    config_path = None
    if isinstance(spec, Path):
        config_path = spec
    elif isinstance(spec, str) and spec.endswith((".yaml", ".yml")):
        # Only string specs get the explicit existence check, so that a
        # mistyped glitchling name with a YAML suffix yields a helpful hint.
        config_path = Path(spec)
        if not config_path.exists():
            raise FileNotFoundError(
                f"Glitchling config file not found: {spec!r}. "
                "If this was intended as a glitchling name, remove the .yaml/.yml extension."
            )

    if config_path is None:
        # Glitchling instance, plain name, or a sequence of either.
        return coerce_gaggle(spec, seed=seed)

    config = load_attack_config(config_path)
    from ..conf.loaders import build_gaggle

    return build_gaggle(config, seed_override=seed)
102
+
103
+
104
+ def _apply_corruption(
105
+ series: "pd.Series[str]",
106
+ gaggle: Gaggle,
107
+ ) -> "pd.Series[str]":
108
+ """Apply gaggle corruption to a pandas Series.
109
+
110
+ Uses batch corruption for efficiency when possible.
111
+
112
+ Args:
113
+ series: Pandas Series of strings to corrupt.
114
+ gaggle: Configured Gaggle to apply.
115
+
116
+ Returns:
117
+ Series with corrupted strings.
118
+ """
119
+ # Use batch corruption for better performance via Rust pipeline
120
+ values = series.tolist()
121
+ corrupted = gaggle.corrupt_batch(values)
122
+ return series.__class__(corrupted, index=series.index, name=series.name)
123
+
124
+
125
+ # ---------------------------------------------------------------------------
126
+ # DataDesigner Integration Classes
127
+ # ---------------------------------------------------------------------------
128
+ # These classes follow the NeMo DataDesigner plugin interface.
129
+ # They are defined conditionally to avoid hard dependency on data-designer.
130
+
131
+
132
+ def _create_plugin_classes() -> tuple[type, type, Any] | None:
133
+ """Create plugin classes if data-designer is available.
134
+
135
+ Returns:
136
+ Tuple of (config_class, generator_class, plugin_object) or None.
137
+ """
138
+ try:
139
+ from data_designer.config.base import SingleColumnConfig
140
+ from data_designer.engine.column_generators.generators.base import (
141
+ ColumnGenerator,
142
+ GenerationStrategy,
143
+ GeneratorMetadata,
144
+ )
145
+ from data_designer.plugins import Plugin, PluginType
146
+ except ImportError:
147
+ return None
148
+
149
+ class GlitchlingColumnConfig(SingleColumnConfig): # type: ignore[misc]
150
+ """Configuration for Glitchlings text corruption column generator.
151
+
152
+ Attributes:
153
+ name: Output column name.
154
+ column_type: Discriminator field for DataDesigner plugin discovery.
155
+ glitchlings: Glitchling specification. Can be:
156
+ - A string glitchling name: ``"typogre"``
157
+ - A spec with parameters: ``"Typogre(rate=0.02)"``
158
+ - A list of specs: ``["Typogre(rate=0.02)", "Mim1c(rate=0.01)"]``
159
+ - A path to YAML config: ``"configs/chaos.yaml"``
160
+ source_column: Column to corrupt. If None, corrupts the column
161
+ specified by ``name`` (in-place style).
162
+ seed: RNG seed for deterministic corruption. If None, uses
163
+ a default seed for reproducibility.
164
+ """
165
+
166
+ column_type: Literal["glitchlings"] = "glitchlings"
167
+ glitchlings: str | list[str] = "typogre"
168
+ source_column: str | None = None
169
+ seed: int | None = None
170
+
171
+ class GlitchlingColumnGenerator(ColumnGenerator): # type: ignore[misc]
172
+ """Column generator that applies Glitchlings text corruption.
173
+
174
+ This generator corrupts text in the source column using the configured
175
+ glitchlings and writes the result to the output column.
176
+ """
177
+
178
+ config: GlitchlingColumnConfig
179
+
180
+ @staticmethod
181
+ def metadata() -> GeneratorMetadata:
182
+ """Return metadata describing this generator."""
183
+ return GeneratorMetadata(
184
+ name="glitchlings",
185
+ description=(
186
+ "Apply deterministic, linguistically-principled text corruption "
187
+ "via Glitchlings for model robustness testing and adversarial augmentation."
188
+ ),
189
+ generation_strategy=GenerationStrategy.FULL_COLUMN,
190
+ required_resources=None,
191
+ )
192
+
193
+ def generate(self, data: "pd.DataFrame") -> "pd.DataFrame":
194
+ """Generate corrupted text column.
195
+
196
+ Args:
197
+ data: Input DataFrame.
198
+
199
+ Returns:
200
+ DataFrame with the corrupted column added/updated.
201
+ """
202
+ source = self.config.source_column or self.config.name
203
+ seed = self.config.seed if self.config.seed is not None else 151
204
+
205
+ # Resolve glitchlings specification
206
+ spec: GlitchlingSpec = self.config.glitchlings
207
+
208
+ gaggle = _resolve_gaggle(spec, seed=seed)
209
+
210
+ # Apply corruption
211
+ data[self.config.name] = _apply_corruption(data[source], gaggle)
212
+ return data
213
+
214
+ plugin = Plugin(
215
+ task_cls=GlitchlingColumnGenerator,
216
+ config_cls=GlitchlingColumnConfig,
217
+ plugin_type=PluginType.COLUMN_GENERATOR,
218
+ emoji="👾",
219
+ )
220
+
221
+ return GlitchlingColumnConfig, GlitchlingColumnGenerator, plugin
222
+
223
+
224
# Build the plugin objects once, at import time.
_plugin_result = _create_plugin_classes()

if _plugin_result is None:
    # data-designer could not be imported: bind the public names to ``None``
    # placeholders so that importing them from this module still succeeds.
    GlitchlingColumnConfig = None  # type: ignore[assignment]
    GlitchlingColumnGenerator = None  # type: ignore[assignment]
    plugin = None
else:
    GlitchlingColumnConfig, GlitchlingColumnGenerator, plugin = _plugin_result
234
+
235
+
236
+ # ---------------------------------------------------------------------------
237
+ # Standalone Functions (usable without DataDesigner)
238
+ # ---------------------------------------------------------------------------
239
+
240
+
241
def corrupt_dataframe(
    df: "pd.DataFrame",
    glitchlings: GlitchlingSpec,
    *,
    column: str,
    output_column: str | None = None,
    seed: int = 151,
) -> "pd.DataFrame":
    """Return a copy of ``df`` with one column corrupted by Glitchlings.

    Works without the DataDesigner plugin machinery.

    Args:
        df: Input DataFrame. It is copied; the caller's object is not
            assigned to.
        glitchlings: Glitchling specification (see ``GlitchlingSpec``).
        column: Name of the column to read.
        output_column: Name of the column to write. ``None`` means the
            source column is overwritten in the copy.
        seed: RNG seed passed on when resolving the gaggle.

    Returns:
        The copied DataFrame containing the corrupted column.

    Example:
        >>> import pandas as pd
        >>> from glitchlings.dlc.nemo import corrupt_dataframe
        >>> df = pd.DataFrame({"text": ["Hello world", "Test input"]})
        >>> result = corrupt_dataframe(df, "typogre", column="text", seed=42)
    """
    gaggle = _resolve_gaggle(glitchlings, seed=seed)
    destination = column if output_column is None else output_column
    result = df.copy()
    result[destination] = _apply_corruption(result[column], gaggle)
    return result
275
+
276
+
277
+ __all__ = [
278
+ "GlitchlingColumnConfig",
279
+ "GlitchlingColumnGenerator",
280
+ "GlitchlingSpec",
281
+ "corrupt_dataframe",
282
+ "plugin",
283
+ ]
@@ -0,0 +1,215 @@
1
+ """Integration helpers for the optional verifiers prime DLC."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Iterable, Sequence
6
+ from typing import Any, Callable, Protocol, cast
7
+
8
+ from ..compat.loaders import require_datasets, require_jellyfish, require_verifiers
9
+ from ..util.adapters import coerce_gaggle
10
+ from ..zoo import Gaggle, Glitchling, Mim1c, Typogre # noqa: F401
11
+ from ._shared import resolve_columns as _resolve_columns_shared
12
+
13
+
14
class VerifierEnvironment(Protocol):
    """Structural type for the part of a verifiers environment used here.

    Only a ``dataset`` attribute is required.
    """

    dataset: Any
18
+
19
+
20
class VerifierSingleTurnEnv(Protocol):
    """Structural type for a single-turn verifier environment.

    Requires a ``dataset`` attribute and a ``rubric`` attribute.
    """

    dataset: Any
    rubric: Any
25
+
26
+
27
+ vf = require_verifiers("verifiers is not installed; install glitchlings[prime]")
28
+ _jellyfish = require_jellyfish("jellyfish is not installed; install glitchlings[prime]")
29
+ levenshtein_distance = _jellyfish.levenshtein_distance
30
+
31
+
32
def _resolve_environment(env: str | VerifierEnvironment) -> VerifierEnvironment:
    """Return an instantiated verifier environment.

    Args:
        env: Either an environment object or a string, which is passed to
            ``vf.load_environment`` to obtain one.

    Returns:
        The environment object.

    Raises:
        TypeError: If the resulting object is not an instance of
            ``vf.Environment``.
    """
    candidate = vf.load_environment(env) if isinstance(env, str) else env

    environment_type = cast(type[Any], vf.Environment)
    if isinstance(candidate, environment_type):
        return cast(VerifierEnvironment, candidate)

    raise TypeError("Invalid environment type")
41
+
42
+
43
def _resolve_columns(dataset: Any, columns: Sequence[str] | None) -> list[str]:
    """Pick the dataset columns to corrupt.

    Thin wrapper that forwards both arguments unchanged to the shared
    ``resolve_columns`` helper and returns its result.
    """
    resolved = _resolve_columns_shared(dataset, columns)
    return resolved
46
+
47
+
48
def load_environment(
    env: str | VerifierEnvironment,
    glitchlings: Iterable[str | Glitchling] | Glitchling | str | Gaggle | None = None,
    *,
    seed: int = 151,
    columns: Sequence[str] | None = None,
) -> VerifierEnvironment:
    """Load an environment and, if requested, corrupt its dataset.

    Args:
        env: Environment object or a string resolved by
            ``_resolve_environment``.
        glitchlings: Glitchling specification. ``None`` leaves the
            environment's dataset untouched.
        seed: RNG seed passed to ``coerce_gaggle``.
        columns: Columns to corrupt; forwarded to ``_resolve_columns``.

    Returns:
        The environment. When ``glitchlings`` is given, its ``dataset``
        attribute has been replaced by the corrupted dataset.
    """
    environment = _resolve_environment(env)

    if glitchlings is not None:
        gaggle = coerce_gaggle(glitchlings, seed=seed)
        source_dataset = environment.dataset
        targets = _resolve_columns(source_dataset, columns)
        environment.dataset = gaggle.corrupt_dataset(source_dataset, targets)

    return environment
67
+
68
+
69
def _as_gaggle(
    glitchlings: Iterable[str | Glitchling] | Glitchling | str | Gaggle,
    *,
    seed: int,
) -> Gaggle:
    """Convert a supported glitchling specification into a :class:`Gaggle`.

    Forwards both arguments unchanged to ``coerce_gaggle``.
    """
    gaggle = coerce_gaggle(glitchlings, seed=seed)
    return gaggle
76
+
77
+
78
+ def _extract_completion_text(completion: Any) -> str:
79
+ """Normalize a completion payload into a plain string."""
80
+ if isinstance(completion, str):
81
+ return completion
82
+
83
+ if isinstance(completion, list) and completion:
84
+ first = completion[0]
85
+ if isinstance(first, dict) and "content" in first:
86
+ return str(first["content"])
87
+ return str(first)
88
+
89
+ return str(completion)
90
+
91
+
92
def normalized_edit_distance(
    _: Any,
    completion: Any,
    answer: str,
) -> float:
    """Score a completion as ``1 - (distance / max_len)`` with Levenshtein distance.

    Args:
        _: Ignored first positional argument.
        completion: Completion payload; reduced to text with
            ``_extract_completion_text``.
        answer: Reference text. A falsy value is treated as ``""``.

    Returns:
        A float clamped to the range ``[0.0, 1.0]``.
    """
    produced = _extract_completion_text(completion)
    expected = answer if answer else ""
    # The floor of 1 keeps the division defined when both texts are empty.
    longest = max(len(produced), len(expected), 1)
    edits = cast(int, levenshtein_distance(produced, expected))
    similarity = 1.0 - (edits / longest)
    return max(0.0, min(1.0, similarity))


# Second public name bound to the same function object.
symmetric_levenshtein_similarity = normalized_edit_distance
107
+
108
+ DEFAULT_CLEANUP_INSTRUCTIONS = (
109
+ "You are a meticulous copy editor. Restore the provided text to its original form."
110
+ )
111
+
112
+
113
def echo_chamber(
    dataset_id: str,
    column: str,
    glitchlings: Iterable[str | Glitchling] | Glitchling | str | Gaggle,
    *,
    seed: int = 151,
    instructions: str = DEFAULT_CLEANUP_INSTRUCTIONS,
    reward_function: Callable[..., float] | None = None,
    split: str | None = None,
    **load_dataset_kwargs: Any,
) -> VerifierSingleTurnEnv:
    """Create an Echo Chamber Prime environment from a Hugging Face dataset column.

    Rows whose ``column`` value is ``None`` are dropped. Each remaining row
    becomes a ``prompt`` (system + user messages) and an ``answer`` holding
    the original text; the ``prompt`` column is then passed through the
    gaggle's ``corrupt_dataset``.

    Args:
        dataset_id: Identifier of the Hugging Face dataset to load.
        column: Name of the column whose text should be glitched.
        glitchlings: Glitchling specifiers that will corrupt the prompts.
        seed: RNG seed forwarded to :func:`glitchlings.util.adapters.coerce_gaggle`.
        instructions: System instructions supplied to the environment prompts.
        reward_function: Optional callable used to score completions. Defaults to
            :func:`normalized_edit_distance` when omitted or falsy.
        split: Optional dataset split to load. When omitted and the dataset
            loads as a ``DatasetDict``, its first split is used.
        **load_dataset_kwargs: Extra keyword arguments forwarded to
            :func:`datasets.load_dataset`.

    Returns:
        A ``vf.SingleTurnEnv`` built from the glitched dataset and a
        single-function rubric.

    Raises:
        ModuleNotFoundError: If the datasets module exposes no ``load_dataset``.
        ValueError: If the dataset has no splits, still loads as a
            ``DatasetDict`` after split selection, or yields no rows for
            ``column``.
    """
    datasets_module = require_datasets("datasets is required to build an echo chamber")
    load_dataset = getattr(datasets_module, "load_dataset", None)
    if load_dataset is None:  # pragma: no cover - defensive
        message = "datasets is required to build an echo chamber"
        raise ModuleNotFoundError(message)

    dataset_dict_cls = getattr(datasets_module, "DatasetDict", dict)

    hf_dataset: Any
    if split is not None:
        hf_dataset = load_dataset(dataset_id, split=split, **load_dataset_kwargs)
    else:
        hf_dataset = load_dataset(dataset_id, **load_dataset_kwargs)
        if isinstance(hf_dataset, dataset_dict_cls):
            # No split requested: fall back to the first one available.
            try:
                hf_dataset = next(iter(hf_dataset.values()))
            except StopIteration as exc:  # pragma: no cover - defensive
                raise ValueError("The specified dataset does not contain any splits") from exc

    if isinstance(hf_dataset, dataset_dict_cls):
        raise ValueError("Specify which split to use when the dataset loads as a DatasetDict.")

    def _has_text(row: Any) -> bool:
        return row.get(column) is not None

    non_null_dataset = hf_dataset.filter(_has_text, load_from_cache_file=False)
    original_columns = list(non_null_dataset.column_names)

    def _to_example(row: dict[str, Any]) -> dict[str, Any]:
        text = str(row[column])
        messages = [
            {"role": "system", "content": instructions},
            {"role": "user", "content": f"Corrupted text:\n{text}"},
        ]
        return {"prompt": messages, "answer": text}

    base_dataset = non_null_dataset.map(
        _to_example,
        remove_columns=original_columns,
        load_from_cache_file=False,
    )

    empty_message = (
        f"Column '{column}' did not yield any textual entries in dataset '{dataset_id}'."
    )
    try:
        row_count = len(base_dataset)
    except TypeError:
        # The dataset has no length; probe for a first row instead.
        take_fn = getattr(base_dataset, "take", None)
        if callable(take_fn):
            has_rows = bool(list(take_fn(1)))
        else:
            no_row = object()
            has_rows = next(iter(base_dataset), no_row) is not no_row
        if not has_rows:
            raise ValueError(empty_message)
    else:
        if row_count == 0:
            raise ValueError(empty_message)

    gaggle = _as_gaggle(glitchlings, seed=seed)
    glitched_dataset = gaggle.corrupt_dataset(base_dataset, ["prompt"])

    scorer = reward_function if reward_function else normalized_edit_distance
    rubric = vf.Rubric(funcs=[scorer], weights=[1.0])
    environment = vf.SingleTurnEnv(dataset=glitched_dataset, rubric=rubric)
    return cast(VerifierSingleTurnEnv, environment)
@@ -0,0 +1,98 @@
1
+ """Integration helpers for PyTorch data loaders."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Iterable, Iterator, Sequence
6
+ from typing import Any, cast
7
+
8
+ from ..util.adapters import coerce_gaggle
9
+ from ..zoo import Gaggle, Glitchling
10
+ from ._shared import corrupt_batch, infer_batch_targets, normalize_column_spec
11
+
12
+
13
+ class _GlitchedDataLoader(Iterable[Any]):
14
+ """Wrapper that applies glitchlings lazily to each batch from a data loader."""
15
+
16
+ def __init__(
17
+ self,
18
+ dataloader: Any,
19
+ gaggle: Gaggle,
20
+ *,
21
+ columns: list[str | int] | None,
22
+ ) -> None:
23
+ self._dataloader = dataloader
24
+ self._gaggle = gaggle
25
+ self._explicit_columns = columns
26
+ self._inferred_columns: list[str | int] | None | _Sentinel = _UNINITIALISED
27
+
28
+ def __iter__(self) -> Iterator[Any]:
29
+ # Reset all glitchling RNGs before each fresh pass for determinism.
30
+ self._gaggle.sort_glitchlings()
31
+ for batch in self._dataloader:
32
+ targets = self._resolve_columns(batch)
33
+ yield corrupt_batch(batch, targets, self._gaggle)
34
+
35
+ def __len__(self) -> int:
36
+ return len(self._dataloader)
37
+
38
+ def __getattr__(self, attribute: str) -> Any:
39
+ return getattr(self._dataloader, attribute)
40
+
41
+ def _resolve_columns(self, batch: Any) -> list[str | int] | None:
42
+ if self._explicit_columns is not None:
43
+ return self._explicit_columns
44
+
45
+ if self._inferred_columns is _UNINITIALISED:
46
+ self._inferred_columns = infer_batch_targets(batch)
47
+
48
+ return cast(list[str | int] | None, self._inferred_columns)
49
+
50
+
51
+ class _Sentinel:
52
+ """Sentinel type for deferred column inference."""
53
+
54
+
55
+ _UNINITIALISED = _Sentinel()
56
+
57
+
58
def GlitchedDataLoader(
    dataloader: Any,
    glitchlings: Iterable[str | Glitchling] | Glitchling | str | Gaggle,
    *,
    columns: str | int | Sequence[str | int] | None = None,
    seed: int = 151,
) -> _GlitchedDataLoader:
    """Wrap a PyTorch DataLoader so that its batches are glitched lazily.

    The glitchling specification is coerced to a gaggle and the column
    specification is normalised once, up front; corruption itself happens
    as batches are yielded by the returned wrapper.

    Args:
        dataloader: The PyTorch DataLoader to wrap.
        glitchlings: A glitchling, gaggle, or specification of glitchlings to apply.
        columns: Column name(s) or index/indices to corrupt. Can be:
            - A single string column name (for dict-like batches)
            - A single integer index (for sequence-like batches)
            - A sequence of column names or indices
            - None to auto-infer text columns (default)
        seed: RNG seed for deterministic corruption (default: 151).

    Returns:
        A wrapped dataloader that yields corrupted batches.

    Example:
        >>> from torch.utils.data import DataLoader
        >>> from glitchlings.dlc.pytorch import GlitchedDataLoader
        >>> dataset = [{"text": "hello", "label": 0}]
        >>> loader = DataLoader(dataset)
        >>> glitched = GlitchedDataLoader(loader, "typogre", columns="text")
        >>> for batch in glitched:  # doctest: +SKIP
        ...     print(batch)
    """
    return _GlitchedDataLoader(
        dataloader,
        coerce_gaggle(glitchlings, seed=seed),
        columns=normalize_column_spec(columns),
    )
96
+
97
+
98
+ __all__ = ["GlitchedDataLoader"]