glitchlings 0.2.1__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,151 @@
1
+ import difflib
2
+ from collections.abc import Iterable
3
+
4
+ SAMPLE_TEXT = "One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin. He lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightly domed and divided by arches into stiff sections. The bedding was hardly able to cover it and seemed ready to slide off any moment. His many legs, pitifully thin compared with the size of the rest of him, waved about helplessly as he looked."
5
+
6
+
7
def string_diffs(a: str, b: str) -> list[list[tuple[str, str, str]]]:
    """Collect the non-matching regions between two strings.

    The strings are aligned with ``difflib.SequenceMatcher``. Consecutive
    opcodes whose tag is not ``"equal"`` are gathered into one group; an
    ``"equal"`` opcode closes the group currently being built.

    Args:
        a: The first string.
        b: The second string.

    Returns:
        A list of groups. Each group is a list of ``(tag, a_text, b_text)``
        tuples, where ``a_text`` and ``b_text`` are the slices of ``a`` and
        ``b`` covered by that opcode. Identical inputs yield ``[]``.
    """
    matcher = difflib.SequenceMatcher(None, a, b)
    groups: list[list[tuple[str, str, str]]] = []
    current: list[tuple[str, str, str]] = []

    for tag, a_start, a_end, b_start, b_end in matcher.get_opcodes():
        if tag != "equal":
            current.append((tag, a[a_start:a_end], b[b_start:b_end]))
        elif current:
            # A matching run ends the group collected so far.
            groups.append(current)
            current = []

    # The inputs may end on a difference, leaving one group unflushed.
    if current:
        groups.append(current)

    return groups
34
+
35
+
36
# Maps a key character to the list of characters adjacent to it.
KeyNeighborMap = dict[str, list[str]]
# Maps a layout name to that layout's neighbour map.
KeyboardLayouts = dict[str, KeyNeighborMap]


def _build_neighbor_map(rows: Iterable[str]) -> KeyNeighborMap:
    """Derive 8-neighbour adjacency lists from keyboard layout rows.

    Every non-space character is placed on a grid at ``(column, row)`` and
    lower-cased. A space leaves its cell empty, so it can be used to offset
    a row. Two keys are neighbours when their cells touch horizontally,
    vertically or diagonally.

    Args:
        rows: The layout, one string per keyboard row, top row first.

    Returns:
        A mapping from each key to its neighbours. Neighbours are listed
        row by row (above, same row, below), left to right, with
        duplicates removed. If a character occurs more than once in the
        layout, the entry from its last occurrence wins.
    """
    positions: dict[tuple[int, int], str] = {}
    for row_index, row in enumerate(rows):
        for col_index, key in enumerate(row):
            if key != " ":
                positions[(col_index, row_index)] = key.lower()

    # Scan order matters for the output: dy is the outer axis, dx the inner.
    offsets = [
        (dx, dy)
        for dy in (-1, 0, 1)
        for dx in (-1, 0, 1)
        if (dx, dy) != (0, 0)
    ]

    adjacency: KeyNeighborMap = {}
    for (col, row), key in positions.items():
        nearby = [
            positions[(col + dx, row + dy)]
            for dx, dy in offsets
            if (col + dx, row + dy) in positions
        ]
        # dict.fromkeys keeps first-seen order while dropping repeats.
        adjacency[key] = list(dict.fromkeys(nearby))

    return adjacency
66
+
67
+
68
# Registry of keyboard layouts, keyed by layout name. It starts with one
# explicitly written table; each key maps to a list of single-character
# neighbours (some of the lists include the space character).
_KEYNEIGHBORS: KeyboardLayouts = {
    "CURATOR_QWERTY": {
        "a": list("qwsz"),
        "b": list("vghn "),
        "c": list("xdfv "),
        "d": list("serfcx"),
        "e": list("wsdrf34"),
        "f": list("drtgvc"),
        "g": list("ftyhbv"),
        "h": list("gyujnb"),
        "i": list("ujko89"),
        "j": list("huikmn"),
        "k": list("jilom,"),
        "l": list("kop;.,"),
        "m": list("njk, "),
        "n": list("bhjm "),
        "o": list("iklp90"),
        "p": list("o0-[;l"),
        "q": list("was 12"),
        "r": list("edft45"),
        "s": list("awedxz"),
        "t": list("r56ygf"),
        "u": list("y78ijh"),
        "v": list("cfgb "),
        "w": list("q23esa"),
        "x": list("zsdc "),
        "y": list("t67uhg"),
        "z": list("asx"),
    }
}
98
+
99
+
100
def _register_layout(name: str, rows: Iterable[str]) -> None:
    """Compute the neighbour map for ``rows`` and store it under ``name``.

    An existing ``_KEYNEIGHBORS`` entry with the same name is replaced.
    """
    layout = _build_neighbor_map(rows)
    _KEYNEIGHBORS[name] = layout
102
+
103
+
104
+ _register_layout(
105
+ "DVORAK",
106
+ (
107
+ "`1234567890[]\\",
108
+ " ',.pyfgcrl/=\\",
109
+ " aoeuidhtns-",
110
+ " ;qjkxbmwvz",
111
+ ),
112
+ )
113
+
114
+ _register_layout(
115
+ "COLEMAK",
116
+ (
117
+ "`1234567890-=",
118
+ " qwfpgjluy;[]\\",
119
+ " arstdhneio'",
120
+ " zxcvbkm,./",
121
+ ),
122
+ )
123
+
124
+ _register_layout(
125
+ "QWERTY",
126
+ (
127
+ "`1234567890-=",
128
+ " qwertyuiop[]\\",
129
+ " asdfghjkl;'",
130
+ " zxcvbnm,./",
131
+ ),
132
+ )
133
+
134
+ _register_layout(
135
+ "AZERTY",
136
+ (
137
+ "²&é\"'(-è_çà)=",
138
+ " azertyuiop^$",
139
+ " qsdfghjklmù*",
140
+ " <wxcvbn,;:!",
141
+ ),
142
+ )
143
+
144
+
145
class KeyNeighbors:
    """Attribute-style access to the layouts held in ``_KEYNEIGHBORS``."""

    def __init__(self) -> None:
        # One instance attribute per layout name present in the registry at
        # construction time; layouts registered afterwards are not added.
        vars(self).update(_KEYNEIGHBORS)
149
+
150
+
151
+ KEYNEIGHBORS: KeyNeighbors = KeyNeighbors()
@@ -0,0 +1,57 @@
1
+ from .typogre import Typogre, typogre
2
+ from .mim1c import Mim1c, mim1c
3
+ from .jargoyle import Jargoyle, jargoyle
4
+ from .reduple import Reduple, reduple
5
+ from .rushmore import Rushmore, rushmore
6
+ from .redactyl import Redactyl, redactyl
7
+ from .scannequin import Scannequin, scannequin
8
+ from .core import Glitchling, Gaggle
9
+
10
+ __all__ = [
11
+ "Typogre",
12
+ "typogre",
13
+ "Mim1c",
14
+ "mim1c",
15
+ "Jargoyle",
16
+ "jargoyle",
17
+ "Reduple",
18
+ "reduple",
19
+ "Rushmore",
20
+ "rushmore",
21
+ "Redactyl",
22
+ "redactyl",
23
+ "Scannequin",
24
+ "scannequin",
25
+ "Glitchling",
26
+ "Gaggle",
27
+ "summon",
28
+ ]
29
+
30
+
31
def summon(glitchlings: list[str | Glitchling], seed: int = 151) -> Gaggle:
    """Assemble a ``Gaggle`` from glitchling names and/or instances.

    Args:
        glitchlings: Entries to include. A ``Glitchling`` instance is used
            as given; any other entry is treated as a name and matched
            case-insensitively against the module's default glitchlings.
        seed: Seed handed to the ``Gaggle``.

    Returns:
        A ``Gaggle`` built from the resolved glitchlings, in input order.

    Raises:
        ValueError: If a name does not match any default glitchling.
    """
    defaults = (typogre, mim1c, jargoyle, reduple, rushmore, redactyl, scannequin)
    by_name = {}
    for default in defaults:
        by_name[default.name.lower()] = default

    roster = []
    for entry in glitchlings:
        if not isinstance(entry, Glitchling):
            match = by_name.get(entry.lower())
            if not match:
                raise ValueError(f"Glitchling '{entry}' not found.")
            entry = match
        roster.append(entry)

    return Gaggle(roster, seed=seed)
@@ -0,0 +1,401 @@
1
+ """Core data structures used to model glitchlings and their interactions."""
2
+
3
+ import inspect
4
+ import logging
5
+ import os
6
+ import random
7
+ from enum import IntEnum, auto
8
+ from hashlib import blake2s
9
+ from typing import TYPE_CHECKING, Any, Callable, Protocol
10
+
11
+ _datasets_error: ModuleNotFoundError | None = None
12
+ try: # pragma: no cover - optional dependency
13
+ from datasets import Dataset as _DatasetsDataset
14
+ except ModuleNotFoundError as error: # pragma: no cover - optional dependency
15
+ _DatasetsDataset = None # type: ignore[assignment]
16
+ _datasets_error = error
17
+ else:
18
+ _datasets_error = None
19
+
20
+ try: # pragma: no cover - optional dependency
21
+ from glitchlings._zoo_rust import compose_glitchlings as _compose_glitchlings_rust
22
+ except ImportError: # pragma: no cover - compiled extension not present
23
+ _compose_glitchlings_rust = None
24
+
25
+
26
+ log = logging.getLogger(__name__)
27
+
28
+
29
# Name of the environment variable that opts into the Rust pipeline.
_PIPELINE_FEATURE_FLAG_ENV = "GLITCHLINGS_RUST_PIPELINE"


def _pipeline_feature_flag_enabled() -> bool:
    """Return ``True`` when the environment explicitly opts into the Rust pipeline.

    The variable is read on every call. It counts as enabled only when its
    value, stripped of surrounding whitespace and lower-cased, is one of
    ``1``, ``true``, ``yes`` or ``on``; an unset variable is disabled.
    """
    raw = os.environ.get(_PIPELINE_FEATURE_FLAG_ENV)
    return raw is not None and raw.strip().lower() in {"1", "true", "yes", "on"}
41
+
42
+ if TYPE_CHECKING: # pragma: no cover - typing only
43
+ from datasets import Dataset # type: ignore
44
+ elif _DatasetsDataset is not None:
45
+ Dataset = _DatasetsDataset
46
+ else:
47
+
48
+ class Dataset(Protocol): # type: ignore[no-redef]
49
+ """Typed stub mirroring the Hugging Face dataset interface used here."""
50
+
51
+ def with_transform(self, function: Any) -> "Dataset": ...
52
+
53
+
54
def _is_transcript(value: Any) -> bool:
    """Return True when the value resembles a chat transcript.

    A transcript here is a ``list`` whose items are all ``dict`` objects
    and whose final item has a ``"content"`` key. Only the final turn is
    checked for that key. An empty list is accepted as a transcript.
    """
    if isinstance(value, list):
        if not value:
            return True
        for turn in value:
            if not isinstance(turn, dict):
                return False
        return "content" in value[-1]
    return False
67
+
68
+
69
class CorruptionCallable(Protocol):
    """Structural type for callables that corrupt text.

    Any callable taking the text as its first positional argument,
    optionally followed by further positional or keyword arguments, and
    returning a string satisfies this protocol.
    """

    def __call__(self, text: str, *args: Any, **kwargs: Any) -> str:
        """Return the corrupted form of ``text``."""
        ...
73
+
74
+
75
+ # Text levels for glitchlings, to enforce a sort order
76
+ # Work from highest level down, because e.g.
77
+ # duplicating a word then adding a typo is potentially different than
78
+ # adding a typo then duplicating a word
79
class AttackWave(IntEnum):
    """Granularity of text that a glitchling corrupts.

    Members are numbered from the coarsest unit to the finest, so sorting
    by wave puts document-level glitchlings first and character-level
    glitchlings last.
    """

    DOCUMENT = auto()   # coarsest unit, sorts first
    PARAGRAPH = auto()
    SENTENCE = auto()
    WORD = auto()
    CHARACTER = auto()  # finest unit, sorts last
87
+
88
+
89
+ # Modifier for within the same attack wave
90
class AttackOrder(IntEnum):
    """Relative execution order for glitchlings within the same wave.

    Members are numbered in ascending order, so sorting by this value
    places ``FIRST`` ahead of ``LAST``.
    """

    FIRST = auto()   # sorts earliest
    EARLY = auto()
    NORMAL = auto()
    LATE = auto()
    LAST = auto()    # sorts latest
98
+
99
+
100
class Glitchling:
    """A single text corruption agent with deterministic behaviour.

    A glitchling couples a corruption callable with the keyword parameters
    to call it with, a private ``random.Random`` instance, and the
    wave/order values used to sort it against other glitchlings.
    """

    def __init__(
        self,
        name: str,
        corruption_function: CorruptionCallable,
        scope: AttackWave,
        order: AttackOrder = AttackOrder.NORMAL,
        seed: int | None = None,
        **kwargs: Any,
    ) -> None:
        """Initialize a glitchling.

        Args:
            name: Human readable glitchling name.
            corruption_function: Callable used to transform text.
            scope: Text granularity on which the glitchling operates.
            order: Relative ordering within the same scope.
            seed: Optional seed for deterministic random behaviour.
            **kwargs: Additional parameters forwarded to the corruption callable.
        """

        # Each Glitchling maintains its own RNG for deterministic yet isolated behavior.
        # If no seed is supplied, we fall back to Python's default entropy.
        self.seed = seed
        self.rng: random.Random = random.Random(seed)
        self.name: str = name
        self.corruption_function: CorruptionCallable = corruption_function
        self.level: AttackWave = scope
        self.order: AttackOrder = order
        self.kwargs: dict[str, Any] = {}
        for kw, val in kwargs.items():
            self.set_param(kw, val)

    def set_param(self, key: str, value: Any) -> None:
        """Persist a parameter for use by the corruption callable.

        The value is stored both as an instance attribute and in
        ``self.kwargs``. Setting ``"seed"`` also re-seeds the RNG.
        """

        setattr(self, key, value)
        self.kwargs[key] = value
        if key == "seed":
            self.reset_rng(value)

    def __corrupt(self, text: str, *args: Any, **kwargs: Any) -> str:
        """Execute the corruption callable, injecting the RNG when required."""

        # Pass rng to underlying corruption function if it expects it.
        try:
            signature = inspect.signature(self.corruption_function)
        except (TypeError, ValueError):
            # Some callables cannot be introspected; call them without rng.
            signature = None

        if signature is not None and "rng" in signature.parameters:
            # setdefault rather than an explicit ``rng=`` keyword: an ``rng``
            # already present in kwargs would otherwise raise "got multiple
            # values for keyword argument 'rng'". A supplied rng wins.
            kwargs.setdefault("rng", self.rng)

        return self.corruption_function(text, *args, **kwargs)

    def corrupt(
        self,
        text: str | list[dict[str, Any]],
        *args: Any,
        **kwargs: Any,
    ) -> str | list[dict[str, Any]]:
        """Apply the corruption function to text or conversational transcripts.

        Args:
            text: A string, or a transcript (list of turn dicts). For a
                transcript only the ``"content"`` of the final turn is
                corrupted, and shallow copies of the turns are returned so
                the caller's dicts are not modified.
            *args: Extra positional arguments for the corruption callable.
            **kwargs: Per-call keyword arguments. They are merged over the
                stored parameters for this call only.

        Returns:
            The corrupted string, or the copied transcript.
        """

        # ``__call__`` forwards extra arguments here, so they must be
        # accepted; stored parameters act as defaults for the call.
        call_kwargs = {**self.kwargs, **kwargs}

        if _is_transcript(text):
            transcript = [dict(turn) for turn in text]
            if transcript:
                transcript[-1]["content"] = self.__corrupt(
                    transcript[-1]["content"], *args, **call_kwargs
                )
            return transcript

        return self.__corrupt(text, *args, **call_kwargs)

    def corrupt_dataset(self, dataset: Dataset, columns: list[str]) -> Dataset:
        """Apply corruption lazily across dataset columns.

        Args:
            dataset: Dataset to wrap; its ``with_transform`` is used.
            columns: Names of the columns whose values are corrupted.

        Returns:
            The object returned by ``dataset.with_transform``.

        Raises:
            ModuleNotFoundError: If the ``datasets`` package is not installed.
        """

        if _DatasetsDataset is None:
            message = "datasets is not installed"
            raise ModuleNotFoundError(message) from _datasets_error

        def _is_nonempty_transcript(value: Any) -> bool:
            """Return ``True`` for a non-empty list of dicts that all have "content"."""

            # Deliberately stricter than the module-level transcript check:
            # an empty list and lists of plain strings take the list branch.
            if not isinstance(value, list) or not value:
                return False

            return all(
                isinstance(turn, dict) and "content" in turn for turn in value
            )

        def _corrupt_row(row: dict[str, Any]) -> dict[str, Any]:
            row = dict(row)
            for column in columns:
                value = row[column]
                if _is_nonempty_transcript(value):
                    row[column] = self.corrupt(value)
                elif isinstance(value, list):
                    row[column] = [self.corrupt(item) for item in value]
                else:
                    row[column] = self.corrupt(value)
            return row

        return dataset.with_transform(_corrupt_row)

    def __call__(
        self,
        text: str | list[dict[str, Any]],
        *args: Any,
        **kwds: Any,
    ) -> str | list[dict[str, Any]]:
        """Allow a glitchling to be invoked directly like a callable."""

        return self.corrupt(text, *args, **kwds)

    def reset_rng(self, seed: int | None = None) -> None:
        """Re-create the RNG from a seed.

        Args:
            seed: When given, it replaces the stored seed first.

        The RNG is rebuilt from the stored seed. If no seed is stored and
        none is given, the RNG is left untouched.
        """

        if seed is not None:
            self.seed = seed
        if self.seed is not None:
            self.rng = random.Random(self.seed)

    def clone(self, seed: int | None = None) -> "Glitchling":
        """Create a copy of this glitchling, optionally with a new seed.

        A plain ``Glitchling`` is rebuilt from its name, callable, wave,
        order and parameters. A subclass is rebuilt by calling its
        constructor with the stored parameters (plus the seed) as keywords.
        """

        cls = self.__class__
        filtered_kwargs = {k: v for k, v in self.kwargs.items() if k != "seed"}
        clone_seed = seed if seed is not None else self.seed
        if clone_seed is not None:
            filtered_kwargs["seed"] = clone_seed

        if cls is Glitchling:
            return Glitchling(
                self.name,
                self.corruption_function,
                self.level,
                self.order,
                **filtered_kwargs,
            )

        return cls(**filtered_kwargs)
238
+
239
+
240
+ def _pipeline_operation_reduplicate(glitchling: "Glitchling") -> dict[str, Any] | None:
241
+ rate = glitchling.kwargs.get("reduplication_rate")
242
+ if rate is None:
243
+ return None
244
+ return {"type": "reduplicate", "reduplication_rate": float(rate)}
245
+
246
+
247
+ def _pipeline_operation_delete(glitchling: "Glitchling") -> dict[str, Any] | None:
248
+ rate = glitchling.kwargs.get("max_deletion_rate")
249
+ if rate is None:
250
+ return None
251
+ return {"type": "delete", "max_deletion_rate": float(rate)}
252
+
253
+
254
+ def _pipeline_operation_redact(glitchling: "Glitchling") -> dict[str, Any] | None:
255
+ replacement_char = glitchling.kwargs.get("replacement_char")
256
+ redaction_rate = glitchling.kwargs.get("redaction_rate")
257
+ merge_adjacent = glitchling.kwargs.get("merge_adjacent")
258
+ if replacement_char is None or redaction_rate is None or merge_adjacent is None:
259
+ return None
260
+ return {
261
+ "type": "redact",
262
+ "replacement_char": str(replacement_char),
263
+ "redaction_rate": float(redaction_rate),
264
+ "merge_adjacent": bool(merge_adjacent),
265
+ }
266
+
267
+
268
+ def _pipeline_operation_ocr(glitchling: "Glitchling") -> dict[str, Any] | None:
269
+ error_rate = glitchling.kwargs.get("error_rate")
270
+ if error_rate is None:
271
+ return None
272
+ return {"type": "ocr", "error_rate": float(error_rate)}
273
+
274
+
275
+ _PIPELINE_OPERATION_BUILDERS: dict[str, Callable[["Glitchling"], dict[str, Any] | None]] = {
276
+ "Reduple": _pipeline_operation_reduplicate,
277
+ "Rushmore": _pipeline_operation_delete,
278
+ "Redactyl": _pipeline_operation_redact,
279
+ "Scannequin": _pipeline_operation_ocr,
280
+ }
281
+
282
+
283
class Gaggle(Glitchling):
    """A collection of glitchlings executed in a deterministic order."""

    def __init__(self, glitchlings: list[Glitchling], seed: int = 151):
        """Initialize the gaggle and derive per-glitchling RNG seeds.

        Each input glitchling is cloned, so the caller's instances are not
        modified. Every clone is re-seeded from the master seed, its name
        and its position in ``glitchlings``.

        Args:
            glitchlings: Glitchlings to orchestrate.
            seed: Master seed used to derive per-glitchling seeds.
        """

        super().__init__("Gaggle", self.corrupt, AttackWave.DOCUMENT, seed=seed)
        self.glitchlings: dict[AttackWave, list[Glitchling]] = {
            level: [] for level in AttackWave
        }
        self.apply_order: list[Glitchling] = []
        # Derive deterministic per-glitchling seeds from the master seed.
        for idx, g in enumerate(glitchlings):
            _g = g.clone()
            derived_seed = Gaggle.derive_seed(seed, _g.name, idx)
            _g.reset_rng(derived_seed)
            # Remember the input position; it is part of the seed derivation
            # and is read back by _pipeline_descriptors() and clone().
            setattr(_g, "_gaggle_index", idx)
            self.glitchlings[g.level].append(_g)
        self.sort_glitchlings()

    def clone(self, seed: int | None = None) -> "Gaggle":
        """Create a copy of this gaggle, optionally with a new master seed.

        The inherited implementation rebuilds subclasses by calling the
        constructor with keyword parameters only, which cannot work here
        because the constructor needs the member list. The members are
        therefore handed back to the constructor in their original input
        order, so the derived seeds match when the master seed is unchanged.

        Args:
            seed: Master seed for the copy; defaults to this gaggle's seed.

        Returns:
            A new gaggle whose members are clones of this one's members.
        """

        members = [g for wave in self.glitchlings.values() for g in wave]
        # Computed before sorting: a list must not be inspected from its own
        # sort key. Members lacking an index are kept last, in stable order.
        fallback_index = len(members)
        members.sort(key=lambda g: getattr(g, "_gaggle_index", fallback_index))
        clone_seed = self.seed if seed is None else seed
        return type(self)(members, seed=clone_seed)

    @staticmethod
    def derive_seed(master_seed: int, glitchling_name: str, index: int) -> int:
        """Derive a deterministic seed for a glitchling based on the master seed.

        The master seed, the UTF-8 name and the index are fed, separated by
        NUL bytes, into an 8-byte BLAKE2s digest. The digest is returned as
        a big-endian unsigned integer.
        """

        def _int_to_bytes(value: int) -> bytes:
            """Encode an integer as big-endian bytes (two's complement if negative)."""
            if value == 0:
                return b"\x00"

            abs_value = abs(value)
            length = max(1, (abs_value.bit_length() + 7) // 8)

            if value < 0:
                # The sign bit may need one more byte than the magnitude;
                # widen until the signed encoding fits.
                while True:
                    try:
                        return value.to_bytes(length, "big", signed=True)
                    except OverflowError:
                        length += 1

            return abs_value.to_bytes(length, "big", signed=False)

        hasher = blake2s(digest_size=8)
        hasher.update(_int_to_bytes(master_seed))
        hasher.update(b"\x00")
        hasher.update(glitchling_name.encode("utf-8"))
        hasher.update(b"\x00")
        hasher.update(_int_to_bytes(index))
        return int.from_bytes(hasher.digest(), "big")

    def sort_glitchlings(self) -> None:
        """Sort glitchlings by wave then order to produce application order.

        Within a wave, glitchlings are ordered by ``(order, name)``.
        """

        self.apply_order = [
            g
            for _, glitchlings in sorted(self.glitchlings.items())
            for g in sorted(glitchlings, key=lambda x: (x.order, x.name))
        ]

    @staticmethod
    def rust_pipeline_supported() -> bool:
        """Return ``True`` when the compiled Rust pipeline is importable."""

        return _compose_glitchlings_rust is not None

    @staticmethod
    def rust_pipeline_enabled() -> bool:
        """Return ``True`` when the Rust pipeline is available and opted in."""

        return Gaggle.rust_pipeline_supported() and _pipeline_feature_flag_enabled()

    def _pipeline_descriptors(self) -> list[dict[str, Any]] | None:
        """Describe the application order for the Rust pipeline.

        Returns:
            One ``{"name", "operation", "seed"}`` dict per glitchling, or
            ``None`` when the Rust pipeline is not enabled, when any
            glitchling has no registered operation builder, when a builder
            returns ``None``, or when a seed cannot be determined.
        """

        if not self.rust_pipeline_enabled():
            return None

        descriptors: list[dict[str, Any]] = []
        for glitchling in self.apply_order:
            builder = _PIPELINE_OPERATION_BUILDERS.get(glitchling.name)
            if builder is None:
                return None
            operation = builder(glitchling)
            if operation is None:
                return None

            seed = glitchling.seed
            if seed is None:
                # Fall back to re-deriving the seed from the master seed.
                index = getattr(glitchling, "_gaggle_index", None)
                master_seed = self.seed
                if index is None or master_seed is None:
                    return None
                seed = Gaggle.derive_seed(master_seed, glitchling.name, index)

            descriptors.append(
                {
                    "name": glitchling.name,
                    "operation": operation,
                    "seed": int(seed),
                }
            )

        return descriptors

    def corrupt(self, text: str) -> str:
        """Apply each glitchling to the provided text sequentially.

        When pipeline descriptors are available the whole sequence is handed
        to the Rust implementation. If that call raises, the failure is
        logged at debug level and the Python glitchlings are applied instead.
        """

        master_seed = self.seed
        descriptors = self._pipeline_descriptors()
        if master_seed is not None and descriptors is not None:
            try:
                return _compose_glitchlings_rust(text, descriptors, master_seed)
            except Exception:  # pragma: no cover - fall back to Python execution
                log.debug("Rust pipeline failed; falling back", exc_info=True)

        corrupted = text
        for glitchling in self.apply_order:
            corrupted = glitchling(corrupted)
        return corrupted