glitchlings 0.2.3__cp310-cp310-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings/__init__.py +42 -0
- glitchlings/__main__.py +9 -0
- glitchlings/_zoo_rust.cpython-310-x86_64-linux-gnu.so +0 -0
- glitchlings/dlc/__init__.py +5 -0
- glitchlings/dlc/huggingface.py +96 -0
- glitchlings/dlc/prime.py +274 -0
- glitchlings/main.py +218 -0
- glitchlings/util/__init__.py +181 -0
- glitchlings/zoo/__init__.py +134 -0
- glitchlings/zoo/_ocr_confusions.py +34 -0
- glitchlings/zoo/_rate.py +21 -0
- glitchlings/zoo/core.py +405 -0
- glitchlings/zoo/jargoyle.py +336 -0
- glitchlings/zoo/mim1c.py +108 -0
- glitchlings/zoo/ocr_confusions.tsv +30 -0
- glitchlings/zoo/redactyl.py +165 -0
- glitchlings/zoo/reduple.py +128 -0
- glitchlings/zoo/rushmore.py +136 -0
- glitchlings/zoo/scannequin.py +171 -0
- glitchlings/zoo/typogre.py +212 -0
- glitchlings-0.2.3.dist-info/METADATA +478 -0
- glitchlings-0.2.3.dist-info/RECORD +26 -0
- glitchlings-0.2.3.dist-info/WHEEL +5 -0
- glitchlings-0.2.3.dist-info/entry_points.txt +2 -0
- glitchlings-0.2.3.dist-info/licenses/LICENSE +201 -0
- glitchlings-0.2.3.dist-info/top_level.txt +1 -0
glitchlings/zoo/core.py
ADDED
@@ -0,0 +1,405 @@
|
|
1
|
+
"""Core data structures used to model glitchlings and their interactions."""
|
2
|
+
|
3
|
+
import inspect
|
4
|
+
import logging
|
5
|
+
import os
|
6
|
+
import random
|
7
|
+
from enum import IntEnum, auto
|
8
|
+
from hashlib import blake2s
|
9
|
+
from typing import TYPE_CHECKING, Any, Callable, Protocol
|
10
|
+
|
11
|
+
_datasets_error: ModuleNotFoundError | None = None
|
12
|
+
try: # pragma: no cover - optional dependency
|
13
|
+
from datasets import Dataset as _DatasetsDataset
|
14
|
+
except ModuleNotFoundError as error: # pragma: no cover - optional dependency
|
15
|
+
_DatasetsDataset = None # type: ignore[assignment]
|
16
|
+
_datasets_error = error
|
17
|
+
else:
|
18
|
+
_datasets_error = None
|
19
|
+
|
20
|
+
try: # pragma: no cover - optional dependency
|
21
|
+
from glitchlings._zoo_rust import compose_glitchlings as _compose_glitchlings_rust
|
22
|
+
except ImportError: # pragma: no cover - compiled extension not present
|
23
|
+
_compose_glitchlings_rust = None
|
24
|
+
|
25
|
+
|
26
|
+
log = logging.getLogger(__name__)
|
27
|
+
|
28
|
+
|
29
|
+
_PIPELINE_FEATURE_FLAG_ENV = "GLITCHLINGS_RUST_PIPELINE"


def _pipeline_feature_flag_enabled() -> bool:
    """Report whether the environment opts into the Rust pipeline.

    The flag is read from the ``GLITCHLINGS_RUST_PIPELINE`` environment
    variable. It counts as enabled only when the value, after stripping
    surrounding whitespace and lower-casing, is one of ``1``, ``true``,
    ``yes`` or ``on``. An unset variable or any other value disables it.
    """

    raw = os.environ.get(_PIPELINE_FEATURE_FLAG_ENV)
    if raw is not None and raw.strip().lower() in {"1", "true", "yes", "on"}:
        return True
    return False
|
41
|
+
|
42
|
+
# Resolve the ``Dataset`` name used in annotations further down: the real
# class for type checkers, the imported class at runtime when the optional
# dependency is installed, and a small structural stub otherwise.
if TYPE_CHECKING:  # pragma: no cover - typing only
    from datasets import Dataset  # type: ignore
elif _DatasetsDataset is None:

    class Dataset(Protocol):  # type: ignore[no-redef]
        """Structural stand-in for the Hugging Face dataset interface used here."""

        def with_transform(self, function: Any) -> "Dataset":
            """Mirror the ``with_transform`` hook that this module calls."""

else:
    Dataset = _DatasetsDataset
|
52
|
+
|
53
|
+
|
54
|
+
def _is_transcript(value: Any) -> bool:
    """Decide whether ``value`` looks like a chat transcript.

    A transcript is a list whose items are all dicts and whose final item has
    a ``"content"`` key. An empty list also qualifies; anything that is not a
    list does not.
    """

    if not isinstance(value, list):
        return False

    # Every turn has to be a mapping; a single stray item disqualifies the list.
    for turn in value:
        if not isinstance(turn, dict):
            return False

    # Only the final turn is required to carry content (empty lists pass).
    return not value or "content" in value[-1]
|
67
|
+
|
68
|
+
|
69
|
+
class CorruptionCallable(Protocol):
    """Structural type for callables that take text and return corrupted text."""

    def __call__(
        self,
        text: str,
        *args: Any,
        **kwargs: Any,
    ) -> str:
        """Corrupt ``text`` using any extra arguments and return the result."""
|
73
|
+
|
74
|
+
|
75
|
+
# Text levels for glitchlings, to enforce a sort order
|
76
|
+
# Work from highest level down, because e.g.
|
77
|
+
# duplicating a word then adding a typo is potentially different than
|
78
|
+
# adding a typo then duplicating a word
|
79
|
+
class AttackWave(IntEnum):
    """Text granularity targeted by a glitchling, from coarsest to finest.

    Members are numbered in declaration order starting at 1, so sorting by
    value puts document-level glitchlings first and character-level ones last.
    """

    DOCUMENT = 1
    PARAGRAPH = 2
    SENTENCE = 3
    WORD = 4
    CHARACTER = 5
|
87
|
+
|
88
|
+
|
89
|
+
# Modifier for within the same attack wave
|
90
|
+
class AttackOrder(IntEnum):
    """Tie-breaker deciding which glitchling runs first within one wave.

    Members are numbered in declaration order starting at 1, so a lower value
    sorts (and therefore runs) earlier.
    """

    FIRST = 1
    EARLY = 2
    NORMAL = 3
    LATE = 4
    LAST = 5
|
98
|
+
|
99
|
+
|
100
|
+
class Glitchling:
    """A named text-corruption agent that owns its own seeded RNG."""

    def __init__(
        self,
        name: str,
        corruption_function: CorruptionCallable,
        scope: AttackWave,
        order: AttackOrder = AttackOrder.NORMAL,
        seed: int | None = None,
        pipeline_operation: Callable[["Glitchling"], dict[str, Any] | None] | None = None,
        **kwargs: Any,
    ) -> None:
        """Set up the glitchling and record its corruption parameters.

        Args:
            name: Human readable glitchling name.
            corruption_function: Callable that transforms the text.
            scope: Text granularity the glitchling works on (stored as ``level``).
            order: Relative position among glitchlings of the same scope.
            seed: Optional seed for the private RNG.
            pipeline_operation: Optional factory that receives this glitchling
                and returns its Rust pipeline descriptor, or ``None``.
            **kwargs: Extra parameters, stored through :meth:`set_param` and
                forwarded to the corruption callable on every call.
        """

        # The RNG is private to this instance so that glitchlings do not
        # disturb one another; an unseeded RNG uses Python's default entropy.
        self.seed = seed
        self.rng: random.Random = random.Random(seed)
        self.name: str = name
        self.corruption_function: CorruptionCallable = corruption_function
        self.level: AttackWave = scope
        self.order: AttackOrder = order
        self._pipeline_descriptor_factory = pipeline_operation
        self.kwargs: dict[str, Any] = {}
        # Memo for _corruption_expects_rng(), keyed on the callable's identity.
        self._cached_rng_callable: CorruptionCallable | None = None
        self._cached_rng_expectation: bool | None = None
        for param_name, param_value in kwargs.items():
            self.set_param(param_name, param_value)

    def set_param(self, key: str, value: Any) -> None:
        """Store a parameter under its canonical name and mirror it as attributes.

        ``key`` is translated through the optional ``_param_aliases`` mapping
        (alias -> canonical name). Only the canonical name is kept in
        ``self.kwargs``, while the canonical name and every alias of it are
        set as instance attributes. Setting ``seed`` also re-seeds the RNG.
        """

        aliases = getattr(self, "_param_aliases", {})
        canonical = aliases.get(key, key)
        sibling_aliases = [
            alias for alias, target in aliases.items() if target == canonical
        ]

        # Purge anything previously stored under the raw key or an alias so
        # that only the canonical keyword is forwarded to the callable.
        self.kwargs.pop(key, None)
        for alias in sibling_aliases:
            self.kwargs.pop(alias, None)

        self.kwargs[canonical] = value
        setattr(self, canonical, value)

        if canonical == "seed":
            self.reset_rng(value)

        for alias in sibling_aliases:
            setattr(self, alias, value)

    def pipeline_operation(self) -> dict[str, Any] | None:
        """Build this glitchling's Rust pipeline descriptor, if it has a factory."""

        factory = self._pipeline_descriptor_factory
        return None if factory is None else factory(self)

    def _corruption_expects_rng(self) -> bool:
        """Tell whether the corruption function declares an ``rng`` parameter.

        The answer is memoised per callable object and recomputed when
        ``corruption_function`` is replaced. Callables whose signature cannot
        be inspected are treated as not accepting ``rng``.
        """

        function = self.corruption_function
        memo = self._cached_rng_expectation
        if memo is not None and self._cached_rng_callable is function:
            return memo

        try:
            parameters = inspect.signature(function).parameters
        except (TypeError, ValueError):
            expects_rng = False
        else:
            expects_rng = "rng" in parameters

        self._cached_rng_callable = function
        self._cached_rng_expectation = expects_rng
        return expects_rng

    def __corrupt(self, text: str, *args: Any, **kwargs: Any) -> str:
        """Run the corruption callable, supplying ``self.rng`` when it asks for one."""

        if self._corruption_expects_rng():
            return self.corruption_function(text, *args, rng=self.rng, **kwargs)
        return self.corruption_function(text, *args, **kwargs)

    def corrupt(self, text: str | list[dict[str, Any]]) -> str | list[dict[str, Any]]:
        """Corrupt plain text, or the final turn of a chat transcript.

        For a transcript, a new list of shallow-copied turns is returned and
        only the ``"content"`` of the last turn is corrupted; the input is not
        modified. The stored parameters are forwarded as keyword arguments.
        """

        if not _is_transcript(text):
            return self.__corrupt(text, **self.kwargs)

        transcript = [dict(turn) for turn in text]
        if transcript:
            final_turn = transcript[-1]
            final_turn["content"] = self.__corrupt(final_turn["content"], **self.kwargs)
        return transcript

    def corrupt_dataset(self, dataset: Dataset, columns: list[str]) -> Dataset:
        """Attach a transform to ``dataset`` that corrupts the given columns.

        Args:
            dataset: Dataset exposing ``with_transform``.
            columns: Names of the columns to corrupt.

        Returns:
            The result of ``dataset.with_transform`` with the corrupting transform.

        Raises:
            ModuleNotFoundError: If the optional ``datasets`` package is missing.
        """

        if _DatasetsDataset is None:
            message = "datasets is not installed"
            raise ModuleNotFoundError(message) from _datasets_error

        def _looks_like_transcript(value: Any) -> bool:
            """Stricter local check: non-empty list of dicts that all have ``content``."""

            if isinstance(value, list) and value:
                return all(
                    isinstance(turn, dict) and "content" in turn for turn in value
                )
            return False

        def _corrupt_row(row: dict[str, Any]) -> dict[str, Any]:
            """Return a copy of ``row`` with the selected columns corrupted."""

            corrupted_row = dict(row)
            for column in columns:
                value = corrupted_row[column]
                if isinstance(value, list) and not _looks_like_transcript(value):
                    # A plain list is corrupted item by item.
                    corrupted_row[column] = [self.corrupt(item) for item in value]
                else:
                    # Transcripts and scalar values go through corrupt() whole.
                    corrupted_row[column] = self.corrupt(value)
            return corrupted_row

        return dataset.with_transform(_corrupt_row)

    def __call__(self, text: str, *args: Any, **kwds: Any) -> str | list[dict[str, Any]]:
        """Invoke the glitchling like a function by delegating to :meth:`corrupt`."""

        return self.corrupt(text, *args, **kwds)

    def reset_rng(self, seed: int | None = None) -> None:
        """Re-create the RNG from the stored seed, optionally replacing that seed.

        When ``seed`` is given it becomes the stored seed first. If no seed is
        stored at all, the existing RNG is left untouched.
        """

        if seed is not None:
            self.seed = seed

        active_seed = self.seed
        if active_seed is None:
            return
        self.rng = random.Random(active_seed)

    def clone(self, seed: int | None = None) -> "Glitchling":
        """Build a copy of this glitchling, optionally with a different seed.

        A plain ``Glitchling`` is rebuilt from its name, callable, scope, order
        and pipeline factory; a subclass is rebuilt from its stored keyword
        parameters alone. The seed is passed on only when one is available.
        """

        clone_seed = self.seed if seed is None else seed
        params = dict(self.kwargs)
        params.pop("seed", None)
        if clone_seed is not None:
            params["seed"] = clone_seed

        cls = self.__class__
        if cls is not Glitchling:
            return cls(**params)

        return Glitchling(
            self.name,
            self.corruption_function,
            self.level,
            self.order,
            pipeline_operation=self._pipeline_descriptor_factory,
            **params,
        )
|
285
|
+
|
286
|
+
|
287
|
+
|
288
|
+
|
289
|
+
|
290
|
+
class Gaggle(Glitchling):
    """A collection of glitchlings executed in a deterministic order."""

    def __init__(self, glitchlings: list[Glitchling], seed: int = 151):
        """Initialize the gaggle and derive per-glitchling RNG seeds.

        Each supplied glitchling is cloned, so the originals are never
        mutated. Every clone is re-seeded with a value derived from the master
        seed, its name and its position in ``glitchlings``.

        Args:
            glitchlings: Glitchlings to orchestrate.
            seed: Master seed used to derive per-glitchling seeds.
        """

        super().__init__("Gaggle", self.corrupt, AttackWave.DOCUMENT, seed=seed)
        self.glitchlings: dict[AttackWave, list[Glitchling]] = {
            level: [] for level in AttackWave
        }
        self.apply_order: list[Glitchling] = []
        # Derive deterministic per-glitchling seeds from the master seed.
        for idx, g in enumerate(glitchlings):
            _g = g.clone()
            derived_seed = Gaggle.derive_seed(seed, _g.name, idx)
            _g.reset_rng(derived_seed)
            # Remember the original position; it is an input to derive_seed.
            setattr(_g, "_gaggle_index", idx)
            self.glitchlings[g.level].append(_g)
        self.sort_glitchlings()

    def clone(self, seed: int | None = None) -> "Gaggle":
        """Create an independent copy of this gaggle, optionally re-seeded.

        The inherited implementation rebuilds subclasses from keyword
        parameters only, which cannot supply the required ``glitchlings``
        argument and therefore raises ``TypeError`` for a gaggle. This
        override rebuilds the gaggle from its members instead, in their
        original positions so that the derived seeds line up.

        Args:
            seed: Master seed for the copy; defaults to this gaggle's seed.

        Returns:
            A new gaggle whose members are clones of this gaggle's members.
        """

        members = sorted(
            (member for group in self.glitchlings.values() for member in group),
            key=lambda member: getattr(member, "_gaggle_index", 0),
        )
        clone_seed = self.seed if seed is None else seed
        if clone_seed is None:
            # No master seed is stored; let the constructor apply its default.
            return self.__class__(members)
        return self.__class__(members, seed=clone_seed)

    @staticmethod
    def derive_seed(master_seed: int, glitchling_name: str, index: int) -> int:
        """Derive a deterministic seed for a glitchling based on the master seed.

        The master seed, the UTF-8 encoded name and the index are fed, each
        separated by a NUL byte, into an 8-byte BLAKE2s digest; the digest is
        returned as a big-endian unsigned integer.
        """

        def _int_to_bytes(value: int) -> bytes:
            """Encode ``value`` big-endian in the fewest bytes that can hold it."""

            if value == 0:
                return b"\x00"

            abs_value = abs(value)
            length = max(1, (abs_value.bit_length() + 7) // 8)

            if value < 0:
                # The two's-complement form may need one more byte than the
                # magnitude does, so grow the buffer until the value fits.
                while True:
                    try:
                        return value.to_bytes(length, "big", signed=True)
                    except OverflowError:
                        length += 1

            return abs_value.to_bytes(length, "big", signed=False)

        hasher = blake2s(digest_size=8)
        hasher.update(_int_to_bytes(master_seed))
        hasher.update(b"\x00")
        hasher.update(glitchling_name.encode("utf-8"))
        hasher.update(b"\x00")
        hasher.update(_int_to_bytes(index))
        return int.from_bytes(hasher.digest(), "big")

    def sort_glitchlings(self) -> None:
        """Sort glitchlings by wave then order to produce application order.

        Within a wave, ties on ``order`` are broken by glitchling name.
        """

        self.apply_order = [
            g
            for _, glitchlings in sorted(self.glitchlings.items())
            for g in sorted(glitchlings, key=lambda x: (x.order, x.name))
        ]

    @staticmethod
    def rust_pipeline_supported() -> bool:
        """Return ``True`` when the compiled Rust pipeline is importable."""

        return _compose_glitchlings_rust is not None

    @staticmethod
    def rust_pipeline_enabled() -> bool:
        """Return ``True`` when the Rust pipeline is available and opted in."""

        return Gaggle.rust_pipeline_supported() and _pipeline_feature_flag_enabled()

    def _pipeline_descriptors(self) -> list[dict[str, Any]] | None:
        """Collect Rust pipeline descriptors for every member, in apply order.

        Returns ``None`` when the Rust pipeline is disabled, when any member
        has no pipeline operation, or when a member's seed can be neither
        read nor derived. In each case the caller uses the Python path.
        """

        if not self.rust_pipeline_enabled():
            return None

        descriptors: list[dict[str, Any]] = []
        for glitchling in self.apply_order:
            operation = glitchling.pipeline_operation()
            if operation is None:
                # One unsupported member rules out the Rust pipeline entirely.
                return None

            seed = glitchling.seed
            if seed is None:
                # Fall back to the seed this member would have been given.
                index = getattr(glitchling, "_gaggle_index", None)
                master_seed = self.seed
                if index is None or master_seed is None:
                    return None
                seed = Gaggle.derive_seed(master_seed, glitchling.name, index)

            descriptors.append(
                {
                    "name": glitchling.name,
                    "operation": operation,
                    "seed": int(seed),
                }
            )

        return descriptors

    def corrupt(self, text: str) -> str:
        """Apply each glitchling to the provided text sequentially.

        The Rust pipeline is tried first when descriptors are available and a
        master seed is set. If it raises, the failure is logged at debug level
        and the members are applied one by one in Python instead.
        """

        master_seed = self.seed
        descriptors = self._pipeline_descriptors()
        if master_seed is not None and descriptors is not None:
            try:
                return _compose_glitchlings_rust(text, descriptors, master_seed)
            except Exception:  # pragma: no cover - fall back to Python execution
                log.debug("Rust pipeline failed; falling back", exc_info=True)

        corrupted = text
        for glitchling in self.apply_order:
            corrupted = glitchling(corrupted)
        return corrupted
|