glitchlings 1.0.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings/__init__.py +101 -0
- glitchlings/__main__.py +8 -0
- glitchlings/_corruption_engine/__init__.py +12 -0
- glitchlings/_corruption_engine.cp313-win_amd64.pyd +0 -0
- glitchlings/assets/__init__.py +180 -0
- glitchlings/assets/apostrofae_pairs.json +32 -0
- glitchlings/assets/ekkokin_homophones.json +2014 -0
- glitchlings/assets/hokey_assets.json +193 -0
- glitchlings/assets/lexemes/academic.json +1049 -0
- glitchlings/assets/lexemes/colors.json +1333 -0
- glitchlings/assets/lexemes/corporate.json +716 -0
- glitchlings/assets/lexemes/cyberpunk.json +22 -0
- glitchlings/assets/lexemes/lovecraftian.json +23 -0
- glitchlings/assets/lexemes/synonyms.json +3354 -0
- glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
- glitchlings/assets/ocr_confusions.tsv +30 -0
- glitchlings/assets/pipeline_assets.json +29 -0
- glitchlings/attack/__init__.py +184 -0
- glitchlings/attack/analysis.py +1321 -0
- glitchlings/attack/core.py +819 -0
- glitchlings/attack/core_execution.py +378 -0
- glitchlings/attack/core_planning.py +612 -0
- glitchlings/attack/encode.py +114 -0
- glitchlings/attack/metrics.py +211 -0
- glitchlings/attack/metrics_dispatch.py +70 -0
- glitchlings/attack/tokenization.py +338 -0
- glitchlings/attack/tokenizer_metrics.py +373 -0
- glitchlings/auggie.py +285 -0
- glitchlings/compat/__init__.py +9 -0
- glitchlings/compat/loaders.py +355 -0
- glitchlings/compat/types.py +41 -0
- glitchlings/conf/__init__.py +39 -0
- glitchlings/conf/loaders.py +331 -0
- glitchlings/conf/schema.py +156 -0
- glitchlings/conf/types.py +72 -0
- glitchlings/config.toml +2 -0
- glitchlings/constants.py +139 -0
- glitchlings/dev/__init__.py +3 -0
- glitchlings/dev/docs.py +45 -0
- glitchlings/dlc/__init__.py +21 -0
- glitchlings/dlc/_shared.py +300 -0
- glitchlings/dlc/gutenberg.py +400 -0
- glitchlings/dlc/huggingface.py +68 -0
- glitchlings/dlc/langchain.py +147 -0
- glitchlings/dlc/nemo.py +283 -0
- glitchlings/dlc/prime.py +215 -0
- glitchlings/dlc/pytorch.py +98 -0
- glitchlings/dlc/pytorch_lightning.py +173 -0
- glitchlings/internal/__init__.py +16 -0
- glitchlings/internal/rust.py +159 -0
- glitchlings/internal/rust_ffi.py +599 -0
- glitchlings/main.py +426 -0
- glitchlings/protocols.py +91 -0
- glitchlings/runtime_config.py +24 -0
- glitchlings/util/__init__.py +41 -0
- glitchlings/util/adapters.py +65 -0
- glitchlings/util/keyboards.py +508 -0
- glitchlings/util/transcripts.py +108 -0
- glitchlings/zoo/__init__.py +161 -0
- glitchlings/zoo/assets/__init__.py +29 -0
- glitchlings/zoo/core.py +852 -0
- glitchlings/zoo/core_execution.py +154 -0
- glitchlings/zoo/core_planning.py +451 -0
- glitchlings/zoo/corrupt_dispatch.py +291 -0
- glitchlings/zoo/hokey.py +139 -0
- glitchlings/zoo/jargoyle.py +301 -0
- glitchlings/zoo/mim1c.py +269 -0
- glitchlings/zoo/pedant/__init__.py +109 -0
- glitchlings/zoo/pedant/core.py +99 -0
- glitchlings/zoo/pedant/forms.py +50 -0
- glitchlings/zoo/pedant/stones.py +83 -0
- glitchlings/zoo/redactyl.py +94 -0
- glitchlings/zoo/rng.py +280 -0
- glitchlings/zoo/rushmore.py +416 -0
- glitchlings/zoo/scannequin.py +370 -0
- glitchlings/zoo/transforms.py +331 -0
- glitchlings/zoo/typogre.py +194 -0
- glitchlings/zoo/validation.py +643 -0
- glitchlings/zoo/wherewolf.py +120 -0
- glitchlings/zoo/zeedub.py +165 -0
- glitchlings-1.0.0.dist-info/METADATA +404 -0
- glitchlings-1.0.0.dist-info/RECORD +86 -0
- glitchlings-1.0.0.dist-info/WHEEL +5 -0
- glitchlings-1.0.0.dist-info/entry_points.txt +3 -0
- glitchlings-1.0.0.dist-info/licenses/LICENSE +201 -0
- glitchlings-1.0.0.dist-info/top_level.txt +1 -0
glitchlings/dlc/nemo.py
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
"""NVIDIA NeMo DataDesigner plugin for Glitchlings text corruption.
|
|
2
|
+
|
|
3
|
+
This module provides a DataDesigner column generator that applies Glitchlings
|
|
4
|
+
transformations to text columns, enabling deterministic text corruption for
|
|
5
|
+
model robustness testing and adversarial augmentation.
|
|
6
|
+
|
|
7
|
+
The plugin integrates with NeMo's experimental plugin system, exposing
|
|
8
|
+
Glitchlings as a discoverable column generator.
|
|
9
|
+
|
|
10
|
+
Example:
|
|
11
|
+
>>> from glitchlings.dlc.nemo import GlitchlingColumnConfig
|
|
12
|
+
>>> from data_designer import DataDesignerConfigBuilder
|
|
13
|
+
>>> builder = DataDesignerConfigBuilder()
|
|
14
|
+
>>> builder.add_column(
|
|
15
|
+
... GlitchlingColumnConfig(
|
|
16
|
+
... name="corrupted_prompt",
|
|
17
|
+
... source_column="prompt",
|
|
18
|
+
... glitchlings=["Typogre(rate=0.02)", "Mim1c(rate=0.01)"],
|
|
19
|
+
... seed=404,
|
|
20
|
+
... )
|
|
21
|
+
... )
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
from collections.abc import Sequence
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
from typing import TYPE_CHECKING, Any, Literal, Union
|
|
29
|
+
|
|
30
|
+
from ..auggie import Auggie
|
|
31
|
+
from ..conf.loaders import load_attack_config
|
|
32
|
+
from ..util.adapters import coerce_gaggle
|
|
33
|
+
from ..zoo.core import Gaggle, Glitchling
|
|
34
|
+
|
|
35
|
+
if TYPE_CHECKING: # pragma: no cover - typing only
|
|
36
|
+
import pandas as pd
|
|
37
|
+
|
|
38
|
+
# Type alias for flexible glitchling specification.
# Every member is a form that ``_resolve_gaggle`` in this module accepts.
GlitchlingSpec = Union[
    Gaggle,  # cloned with the requested seed
    Auggie,  # cloned with the requested seed
    Glitchling,  # handed to ``coerce_gaggle``
    str,  # glitchling name/spec, or a path ending in ``.yaml``/``.yml``
    Sequence[Union[str, Glitchling]],  # handed to ``coerce_gaggle``
    Path,  # loaded as an attack config file
]
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _resolve_gaggle(
    spec: GlitchlingSpec,
    *,
    seed: int,
) -> Gaggle:
    """Resolve a flexible glitchling specification into a Gaggle.

    Args:
        spec: One of:
            - A pre-constructed ``Gaggle`` or ``Auggie``; it is cloned with
              ``seed`` rather than returned as-is.
            - A ``Path`` to a config file, loaded via ``load_attack_config``.
            - A string ending in ``.yaml`` or ``.yml``, treated as a config
              file path (the file must exist).
            - Any other string, a single ``Glitchling``, or a sequence of
              names/instances; these are passed to ``coerce_gaggle``.
        seed: Seed for deterministic corruption.

    Returns:
        A configured Gaggle ready for corruption.

    Raises:
        FileNotFoundError: If ``spec`` is a ``.yaml``/``.yml`` string that
            does not name an existing file.

    Errors raised by ``load_attack_config``, ``build_gaggle`` or
    ``coerce_gaggle`` propagate unchanged.
    """
    # Auggie and Gaggle are handled identically: clone with the new seed.
    if isinstance(spec, (Auggie, Gaggle)):
        return spec.clone(seed=seed)

    config_path: Path | None = None
    if isinstance(spec, Path):
        # Explicit Path objects are passed straight to the loader; no
        # existence check is made here.
        config_path = spec
    elif isinstance(spec, str) and spec.endswith((".yaml", ".yml")):
        # A string that looks like a YAML file name is treated as a path.
        config_path = Path(spec)
        if not config_path.exists():
            raise FileNotFoundError(
                f"Glitchling config file not found: {spec!r}. "
                "If this was intended as a glitchling name, remove the .yaml/.yml extension."
            )

    if config_path is not None:
        config = load_attack_config(config_path)
        # Kept as a function-scope import, exactly as the original code did.
        from ..conf.loaders import build_gaggle

        return build_gaggle(config, seed_override=seed)

    # Single glitchling, plain string, or sequence.
    return coerce_gaggle(spec, seed=seed)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _apply_corruption(
|
|
105
|
+
series: "pd.Series[str]",
|
|
106
|
+
gaggle: Gaggle,
|
|
107
|
+
) -> "pd.Series[str]":
|
|
108
|
+
"""Apply gaggle corruption to a pandas Series.
|
|
109
|
+
|
|
110
|
+
Uses batch corruption for efficiency when possible.
|
|
111
|
+
|
|
112
|
+
Args:
|
|
113
|
+
series: Pandas Series of strings to corrupt.
|
|
114
|
+
gaggle: Configured Gaggle to apply.
|
|
115
|
+
|
|
116
|
+
Returns:
|
|
117
|
+
Series with corrupted strings.
|
|
118
|
+
"""
|
|
119
|
+
# Use batch corruption for better performance via Rust pipeline
|
|
120
|
+
values = series.tolist()
|
|
121
|
+
corrupted = gaggle.corrupt_batch(values)
|
|
122
|
+
return series.__class__(corrupted, index=series.index, name=series.name)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
# ---------------------------------------------------------------------------
|
|
126
|
+
# DataDesigner Integration Classes
|
|
127
|
+
# ---------------------------------------------------------------------------
|
|
128
|
+
# These classes follow the NeMo DataDesigner plugin interface.
|
|
129
|
+
# They are defined conditionally to avoid hard dependency on data-designer.
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _create_plugin_classes() -> tuple[type, type, Any] | None:
    """Create plugin classes if data-designer is available.

    The ``data_designer`` base classes are imported inside this function, so
    importing the module itself does not require data-designer.

    Returns:
        Tuple of (config_class, generator_class, plugin_object), or None when
        any of the ``data_designer`` imports raises ``ImportError``.
    """
    try:
        from data_designer.config.base import SingleColumnConfig
        from data_designer.engine.column_generators.generators.base import (
            ColumnGenerator,
            GenerationStrategy,
            GeneratorMetadata,
        )
        from data_designer.plugins import Plugin, PluginType
    except ImportError:
        # Optional dependency missing: the caller falls back to None placeholders.
        return None

    class GlitchlingColumnConfig(SingleColumnConfig):  # type: ignore[misc]
        """Configuration for Glitchlings text corruption column generator.

        Attributes:
            name: Output column name.
            column_type: Discriminator field for DataDesigner plugin discovery.
            glitchlings: Glitchling specification. Can be:
                - A string glitchling name: ``"typogre"``
                - A spec with parameters: ``"Typogre(rate=0.02)"``
                - A list of specs: ``["Typogre(rate=0.02)", "Mim1c(rate=0.01)"]``
                - A path to YAML config: ``"configs/chaos.yaml"``
            source_column: Column to corrupt. If None, corrupts the column
                specified by ``name`` (in-place style).
            seed: RNG seed for deterministic corruption. If None, the
                generator falls back to the fixed seed 151.
        """

        column_type: Literal["glitchlings"] = "glitchlings"
        glitchlings: str | list[str] = "typogre"
        source_column: str | None = None
        seed: int | None = None

    class GlitchlingColumnGenerator(ColumnGenerator):  # type: ignore[misc]
        """Column generator that applies Glitchlings text corruption.

        This generator corrupts text in the source column using the configured
        glitchlings and writes the result to the output column.
        """

        config: GlitchlingColumnConfig

        @staticmethod
        def metadata() -> GeneratorMetadata:
            """Return metadata describing this generator."""
            return GeneratorMetadata(
                name="glitchlings",
                description=(
                    "Apply deterministic, linguistically-principled text corruption "
                    "via Glitchlings for model robustness testing and adversarial augmentation."
                ),
                generation_strategy=GenerationStrategy.FULL_COLUMN,
                required_resources=None,
            )

        def generate(self, data: "pd.DataFrame") -> "pd.DataFrame":
            """Generate corrupted text column.

            Args:
                data: Input DataFrame. It is modified in place; no copy is
                    made here.

            Returns:
                The same DataFrame object with the corrupted column
                added/updated.
            """
            # A falsy source_column (None or "") means "corrupt the output column itself".
            source = self.config.source_column or self.config.name
            # Fixed fallback seed keeps runs reproducible when none is configured.
            seed = self.config.seed if self.config.seed is not None else 151

            # Resolve glitchlings specification
            spec: GlitchlingSpec = self.config.glitchlings

            gaggle = _resolve_gaggle(spec, seed=seed)

            # Apply corruption
            data[self.config.name] = _apply_corruption(data[source], gaggle)
            return data

    # Bundle generator and config for DataDesigner's plugin mechanism.
    plugin = Plugin(
        task_cls=GlitchlingColumnGenerator,
        config_cls=GlitchlingColumnConfig,
        plugin_type=PluginType.COLUMN_GENERATOR,
        emoji="👾",
    )

    return GlitchlingColumnConfig, GlitchlingColumnGenerator, plugin
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
# Build the DataDesigner plugin classes once, at import time.
_plugin_result = _create_plugin_classes()

if _plugin_result is None:
    # data-designer could not be imported: keep the public names defined,
    # bound to None placeholders.
    GlitchlingColumnConfig = None  # type: ignore[assignment]
    GlitchlingColumnGenerator = None  # type: ignore[assignment]
    plugin = None
else:
    GlitchlingColumnConfig, GlitchlingColumnGenerator, plugin = _plugin_result
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
# ---------------------------------------------------------------------------
|
|
237
|
+
# Standalone Functions (usable without DataDesigner)
|
|
238
|
+
# ---------------------------------------------------------------------------
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def corrupt_dataframe(
    df: "pd.DataFrame",
    glitchlings: GlitchlingSpec,
    *,
    column: str,
    output_column: str | None = None,
    seed: int = 151,
) -> "pd.DataFrame":
    """Return a copy of ``df`` with one column corrupted by Glitchlings.

    Works without the DataDesigner plugin infrastructure. The input frame is
    left untouched; the result is built on ``df.copy()``.

    Args:
        df: Input DataFrame.
        glitchlings: Glitchling specification (see ``GlitchlingSpec``).
        column: Source column to corrupt.
        output_column: Column that receives the corrupted text. When None,
            the source column is overwritten in the returned copy.
        seed: RNG seed for deterministic corruption.

    Returns:
        A new DataFrame containing the corrupted column.

    Example:
        >>> import pandas as pd
        >>> from glitchlings.dlc.nemo import corrupt_dataframe
        >>> df = pd.DataFrame({"text": ["Hello world", "Test input"]})
        >>> result = corrupt_dataframe(df, "typogre", column="text", seed=42)
    """
    resolved = _resolve_gaggle(glitchlings, seed=seed)
    destination = column if output_column is None else output_column
    result = df.copy()
    result[destination] = _apply_corruption(result[column], resolved)
    return result
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
# Public API of this module. ``GlitchlingColumnConfig``,
# ``GlitchlingColumnGenerator`` and ``plugin`` are bound to None when the
# data_designer imports in ``_create_plugin_classes`` fail.
__all__ = [
    "GlitchlingColumnConfig",
    "GlitchlingColumnGenerator",
    "GlitchlingSpec",
    "corrupt_dataframe",
    "plugin",
]
|
glitchlings/dlc/prime.py
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
"""Integration helpers for the optional verifiers prime DLC."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Iterable, Sequence
|
|
6
|
+
from typing import Any, Callable, Protocol, cast
|
|
7
|
+
|
|
8
|
+
from ..compat.loaders import require_datasets, require_jellyfish, require_verifiers
|
|
9
|
+
from ..util.adapters import coerce_gaggle
|
|
10
|
+
from ..zoo import Gaggle, Glitchling, Mim1c, Typogre # noqa: F401
|
|
11
|
+
from ._shared import resolve_columns as _resolve_columns_shared
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class VerifierEnvironment(Protocol):
    """Structural type for a verifiers environment: anything with a ``dataset``."""

    # The dataset attached to the environment; its concrete type is not
    # constrained here.
    dataset: Any
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class VerifierSingleTurnEnv(Protocol):
    """Structural type for a single-turn verifiers environment."""

    # Dataset served by the environment.
    dataset: Any
    # Scoring rubric attached to the environment.
    rubric: Any
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# Optional dependencies are resolved once, at import time. The string passed
# to each ``require_*`` helper is the message it is given for the
# missing-package case.
# NOTE(review): presumably these helpers raise when the package is absent -
# confirm in ``compat.loaders``.
vf = require_verifiers("verifiers is not installed; install glitchlings[prime]")
_jellyfish = require_jellyfish("jellyfish is not installed; install glitchlings[prime]")
# Module-level alias; ``normalized_edit_distance`` calls it by this name.
levenshtein_distance = _jellyfish.levenshtein_distance
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _resolve_environment(env: str | VerifierEnvironment) -> VerifierEnvironment:
    """Turn an environment name or instance into a validated environment.

    Args:
        env: Either an environment identifier, which is loaded through
            ``vf.load_environment``, or an already-built environment.

    Returns:
        The environment instance.

    Raises:
        TypeError: If the resulting object is not a ``vf.Environment``.
    """
    candidate = vf.load_environment(env) if isinstance(env, str) else env

    environment_cls = cast(type[Any], vf.Environment)
    if isinstance(candidate, environment_cls):
        return cast(VerifierEnvironment, candidate)

    raise TypeError("Invalid environment type")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _resolve_columns(dataset: Any, columns: Sequence[str] | None) -> list[str]:
    """Pick the dataset columns to corrupt.

    Thin wrapper that forwards both arguments to the shared
    ``resolve_columns`` helper from ``._shared``.
    """
    selected = _resolve_columns_shared(dataset, columns)
    return selected
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def load_environment(
    env: str | VerifierEnvironment,
    glitchlings: Iterable[str | Glitchling] | Glitchling | str | Gaggle | None = None,
    *,
    seed: int = 151,
    columns: Sequence[str] | None = None,
) -> VerifierEnvironment:
    """Load a verifiers environment, optionally corrupting its dataset.

    Args:
        env: Environment identifier or instance.
        glitchlings: Glitchling specification. When None, the environment is
            returned without touching its dataset.
        seed: RNG seed passed to ``coerce_gaggle``.
        columns: Columns to corrupt; forwarded to ``_resolve_columns``.

    Returns:
        The environment. When glitchlings are given, its ``dataset`` attribute
        is replaced with the corrupted dataset.
    """
    environment = _resolve_environment(env)

    if glitchlings is not None:
        gaggle = coerce_gaggle(glitchlings, seed=seed)
        source_dataset = environment.dataset
        target_columns = _resolve_columns(source_dataset, columns)
        # The environment object itself is updated, not copied.
        environment.dataset = gaggle.corrupt_dataset(source_dataset, target_columns)

    return environment
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _as_gaggle(
    glitchlings: Iterable[str | Glitchling] | Glitchling | str | Gaggle,
    *,
    seed: int,
) -> Gaggle:
    """Build a :class:`Gaggle` from any supported glitchling specification.

    Delegates to ``coerce_gaggle`` with the given seed.
    """
    gaggle = coerce_gaggle(glitchlings, seed=seed)
    return gaggle
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _extract_completion_text(completion: Any) -> str:
|
|
79
|
+
"""Normalize a completion payload into a plain string."""
|
|
80
|
+
if isinstance(completion, str):
|
|
81
|
+
return completion
|
|
82
|
+
|
|
83
|
+
if isinstance(completion, list) and completion:
|
|
84
|
+
first = completion[0]
|
|
85
|
+
if isinstance(first, dict) and "content" in first:
|
|
86
|
+
return str(first["content"])
|
|
87
|
+
return str(first)
|
|
88
|
+
|
|
89
|
+
return str(completion)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def normalized_edit_distance(
    _: Any,
    completion: Any,
    answer: str,
) -> float:
    """Score a completion as ``1 - (distance / max_len)`` with Levenshtein distance.

    Args:
        _: Unused first positional argument.
        completion: Completion payload; reduced to text with
            ``_extract_completion_text``.
        answer: Reference text. A falsy value is treated as ``""``.

    Returns:
        A value clamped to ``[0.0, 1.0]``; ``1.0`` means the texts are equal.
    """
    candidate = _extract_completion_text(completion)
    reference = answer or ""
    edits = cast(int, levenshtein_distance(candidate, reference))
    # The floor of 1 avoids dividing by zero when both texts are empty.
    longest = max(len(candidate), len(reference), 1)
    similarity = 1.0 - (edits / longest)
    return min(1.0, max(0.0, similarity))
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
# Second public name bound to the same function object as
# ``normalized_edit_distance``.
symmetric_levenshtein_similarity = normalized_edit_distance

# Default value of the ``instructions`` parameter of ``echo_chamber``, which
# uses it as the system message of each prompt.
DEFAULT_CLEANUP_INSTRUCTIONS = (
    "You are a meticulous copy editor. Restore the provided text to its original form."
)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def echo_chamber(
    dataset_id: str,
    column: str,
    glitchlings: Iterable[str | Glitchling] | Glitchling | str | Gaggle,
    *,
    seed: int = 151,
    instructions: str = DEFAULT_CLEANUP_INSTRUCTIONS,
    reward_function: Callable[..., float] | None = None,
    split: str | None = None,
    **load_dataset_kwargs: Any,
) -> VerifierSingleTurnEnv:
    """Build an Echo Chamber Prime environment from a Hugging Face dataset column.

    Each row with a non-None ``column`` value becomes a two-message prompt
    (system instructions plus the text) with the text as the answer. The
    ``prompt`` column is then corrupted by the gaggle, and the result is
    wrapped in a ``vf.SingleTurnEnv`` with a single-function rubric.

    Args:
        dataset_id: Identifier of the Hugging Face dataset to load.
        column: Name of the column whose text should be glitched.
        glitchlings: Glitchling specifiers that will corrupt the prompts.
        seed: RNG seed forwarded to :func:`glitchlings.util.adapters.coerce_gaggle`.
        instructions: System instructions supplied to the environment prompts.
        reward_function: Optional callable used to score completions. Defaults to
            :func:`normalized_edit_distance` (also exported as
            ``symmetric_levenshtein_similarity``) when omitted.
        split: Optional dataset split to load. When omitted and the dataset
            loads as a ``DatasetDict``, the first split is used.
        **load_dataset_kwargs: Extra keyword arguments forwarded to
            :func:`datasets.load_dataset`.

    Returns:
        The single-turn environment built from the glitched dataset.

    Raises:
        ModuleNotFoundError: If the datasets module has no ``load_dataset``.
        ValueError: If the dataset has no splits, still loads as a
            ``DatasetDict`` after split selection, or yields no rows for
            ``column``.
    """
    datasets_module = require_datasets("datasets is required to build an echo chamber")
    load_dataset = getattr(datasets_module, "load_dataset", None)
    if load_dataset is None:  # pragma: no cover - defensive
        raise ModuleNotFoundError("datasets is required to build an echo chamber")

    dataset_dict_cls = getattr(datasets_module, "DatasetDict", dict)

    hf_dataset: Any
    if split is not None:
        hf_dataset = load_dataset(dataset_id, split=split, **load_dataset_kwargs)
    else:
        hf_dataset = load_dataset(dataset_id, **load_dataset_kwargs)
        if isinstance(hf_dataset, dataset_dict_cls):
            # No split requested: fall back to the first one available.
            try:
                hf_dataset = next(iter(hf_dataset.values()))
            except StopIteration as exc:  # pragma: no cover - defensive
                raise ValueError("The specified dataset does not contain any splits") from exc

    if isinstance(hf_dataset, dataset_dict_cls):
        raise ValueError("Specify which split to use when the dataset loads as a DatasetDict.")

    def _has_value(row: dict[str, Any]) -> bool:
        return row.get(column) is not None

    def _to_example(row: dict[str, Any]) -> dict[str, Any]:
        text = str(row[column])
        messages = [
            {"role": "system", "content": instructions},
            {"role": "user", "content": f"Corrupted text:\n{text}"},
        ]
        return {"prompt": messages, "answer": text}

    usable_rows = hf_dataset.filter(_has_value, load_from_cache_file=False)
    # Drop every original column so only "prompt" and "answer" remain.
    original_columns = list(usable_rows.column_names)
    base_dataset = usable_rows.map(
        _to_example,
        remove_columns=original_columns,
        load_from_cache_file=False,
    )

    empty_message = (
        f"Column '{column}' did not yield any textual entries in dataset '{dataset_id}'."
    )
    try:
        row_count = len(base_dataset)
    except TypeError:
        # Unsized dataset: look at the first row to find out whether it is empty.
        take_fn = getattr(base_dataset, "take", None)
        if callable(take_fn):
            preview_rows = list(take_fn(1))
        else:
            missing = object()
            first_row = next(iter(base_dataset), missing)
            preview_rows = [] if first_row is missing else [first_row]
        if not preview_rows:
            raise ValueError(empty_message)
    else:
        if row_count == 0:
            raise ValueError(empty_message)

    gaggle = _as_gaggle(glitchlings, seed=seed)
    glitched_dataset = gaggle.corrupt_dataset(base_dataset, ["prompt"])

    scorer = reward_function or normalized_edit_distance
    rubric = vf.Rubric(funcs=[scorer], weights=[1.0])
    environment = vf.SingleTurnEnv(dataset=glitched_dataset, rubric=rubric)
    return cast(VerifierSingleTurnEnv, environment)
|
|
glitchlings/dlc/pytorch.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""Integration helpers for PyTorch data loaders."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Iterable, Iterator, Sequence
|
|
6
|
+
from typing import Any, cast
|
|
7
|
+
|
|
8
|
+
from ..util.adapters import coerce_gaggle
|
|
9
|
+
from ..zoo import Gaggle, Glitchling
|
|
10
|
+
from ._shared import corrupt_batch, infer_batch_targets, normalize_column_spec
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class _GlitchedDataLoader(Iterable[Any]):
|
|
14
|
+
"""Wrapper that applies glitchlings lazily to each batch from a data loader."""
|
|
15
|
+
|
|
16
|
+
def __init__(
|
|
17
|
+
self,
|
|
18
|
+
dataloader: Any,
|
|
19
|
+
gaggle: Gaggle,
|
|
20
|
+
*,
|
|
21
|
+
columns: list[str | int] | None,
|
|
22
|
+
) -> None:
|
|
23
|
+
self._dataloader = dataloader
|
|
24
|
+
self._gaggle = gaggle
|
|
25
|
+
self._explicit_columns = columns
|
|
26
|
+
self._inferred_columns: list[str | int] | None | _Sentinel = _UNINITIALISED
|
|
27
|
+
|
|
28
|
+
def __iter__(self) -> Iterator[Any]:
|
|
29
|
+
# Reset all glitchling RNGs before each fresh pass for determinism.
|
|
30
|
+
self._gaggle.sort_glitchlings()
|
|
31
|
+
for batch in self._dataloader:
|
|
32
|
+
targets = self._resolve_columns(batch)
|
|
33
|
+
yield corrupt_batch(batch, targets, self._gaggle)
|
|
34
|
+
|
|
35
|
+
def __len__(self) -> int:
|
|
36
|
+
return len(self._dataloader)
|
|
37
|
+
|
|
38
|
+
def __getattr__(self, attribute: str) -> Any:
|
|
39
|
+
return getattr(self._dataloader, attribute)
|
|
40
|
+
|
|
41
|
+
def _resolve_columns(self, batch: Any) -> list[str | int] | None:
|
|
42
|
+
if self._explicit_columns is not None:
|
|
43
|
+
return self._explicit_columns
|
|
44
|
+
|
|
45
|
+
if self._inferred_columns is _UNINITIALISED:
|
|
46
|
+
self._inferred_columns = infer_batch_targets(batch)
|
|
47
|
+
|
|
48
|
+
return cast(list[str | int] | None, self._inferred_columns)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class _Sentinel:
|
|
52
|
+
"""Sentinel type for deferred column inference."""
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
_UNINITIALISED = _Sentinel()
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def GlitchedDataLoader(
    dataloader: Any,
    glitchlings: Iterable[str | Glitchling] | Glitchling | str | Gaggle,
    *,
    columns: str | int | Sequence[str | int] | None = None,
    seed: int = 151,
) -> _GlitchedDataLoader:
    """Wrap a PyTorch DataLoader so that its batches are glitched lazily.

    Corruption happens batch by batch as the returned wrapper is iterated;
    nothing is corrupted at construction time.

    Args:
        dataloader: The PyTorch DataLoader to wrap.
        glitchlings: A glitchling, gaggle, or specification of glitchlings to apply.
        columns: Column name(s) or index/indices to corrupt. Can be:
            - A single string column name (for dict-like batches)
            - A single integer index (for sequence-like batches)
            - A sequence of column names or indices
            - None to auto-infer text columns (default)
        seed: RNG seed for deterministic corruption (default: 151).

    Returns:
        A wrapped dataloader that yields corrupted batches.

    Example:
        >>> from torch.utils.data import DataLoader
        >>> from glitchlings.dlc.pytorch import GlitchedDataLoader
        >>> dataset = [{"text": "hello", "label": 0}]
        >>> loader = DataLoader(dataset)
        >>> glitched = GlitchedDataLoader(loader, "typogre", columns="text")
        >>> for batch in glitched:  # doctest: +SKIP
        ...     print(batch)

        The printed batch is omitted here: it presumably depends on the
        loader's collate function and on the seed.
    """
    return _GlitchedDataLoader(
        dataloader,
        coerce_gaggle(glitchlings, seed=seed),
        columns=normalize_column_spec(columns),
    )
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
# Only the factory function is public; the wrapper class and sentinel are private.
__all__ = ["GlitchedDataLoader"]
|