glitchlings 0.2.0__cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,151 @@
1
+ import difflib
2
+ from collections.abc import Iterable
3
+
4
# Demo passage (opening of Kafka's "Metamorphosis") used as sample input text.
SAMPLE_TEXT = (
    "One morning, when Gregor Samsa woke from troubled dreams, he found himself "
    "transformed in his bed into a horrible vermin. "
    "He lay on his armour-like back, and if he lifted his head a little he could "
    "see his brown belly, slightly domed and divided by arches into stiff sections. "
    "The bedding was hardly able to cover it and seemed ready to slide off any moment. "
    "His many legs, pitifully thin compared with the size of the rest of him, "
    "waved about helplessly as he looked."
)
5
+
6
+
7
def string_diffs(a: str, b: str) -> list[list[tuple[str, str, str]]]:
    """
    Diff two strings with ``difflib.SequenceMatcher`` and group the changes.

    Opcodes tagged ``'equal'`` are dropped; every run of consecutive
    non-equal opcodes becomes one group in the returned list.

    Each group member is a tuple: (tag, a_text, b_text), where ``a_text`` and
    ``b_text`` are the slices of ``a`` and ``b`` covered by that opcode.
    """
    matcher = difflib.SequenceMatcher(None, a, b)
    groups: list[list[tuple[str, str, str]]] = []
    current: list[tuple[str, str, str]] = []

    for tag, a_start, a_end, b_start, b_end in matcher.get_opcodes():
        if tag != "equal":
            # still inside a run of changes: keep collecting
            current.append((tag, a[a_start:a_end], b[b_start:b_end]))
        elif current:
            # an unchanged stretch closes the run collected so far
            groups.append(current)
            current = []

    # a run that reaches the end of the strings has not been closed yet
    if current:
        groups.append(current)

    return groups
34
+
35
+
36
+ KeyNeighborMap = dict[str, list[str]]
37
+ KeyboardLayouts = dict[str, KeyNeighborMap]
38
+
39
+
40
+ def _build_neighbor_map(rows: Iterable[str]) -> KeyNeighborMap:
41
+ """Derive 8-neighbour adjacency lists from keyboard layout rows."""
42
+
43
+ grid: dict[tuple[int, int], str] = {}
44
+ for y, row in enumerate(rows):
45
+ for x, char in enumerate(row):
46
+ if char == " ":
47
+ continue
48
+ grid[(x, y)] = char.lower()
49
+
50
+ neighbors: KeyNeighborMap = {}
51
+ for (x, y), char in grid.items():
52
+ seen: list[str] = []
53
+ for dy in (-1, 0, 1):
54
+ for dx in (-1, 0, 1):
55
+ if dx == 0 and dy == 0:
56
+ continue
57
+ candidate = grid.get((x + dx, y + dy))
58
+ if candidate is None:
59
+ continue
60
+ seen.append(candidate)
61
+ # Preserve encounter order but drop duplicates for determinism
62
+ deduped = list(dict.fromkeys(seen))
63
+ neighbors[char] = deduped
64
+
65
+ return neighbors
66
+
67
+
68
# Registry of keyboard layouts, keyed by layout name. "CURATOR_QWERTY" is a
# hand-written neighbour table; each string below is expanded into a list of
# single-character neighbours (a space inside a string is itself a neighbour).
_KEYNEIGHBORS: KeyboardLayouts = {
    "CURATOR_QWERTY": {
        key: list(adjacent)
        for key, adjacent in (
            ("a", "qwsz"),
            ("b", "vghn "),
            ("c", "xdfv "),
            ("d", "serfcx"),
            ("e", "wsdrf34"),
            ("f", "drtgvc"),
            ("g", "ftyhbv"),
            ("h", "gyujnb"),
            ("i", "ujko89"),
            ("j", "huikmn"),
            ("k", "jilom,"),
            ("l", "kop;.,"),
            ("m", "njk, "),
            ("n", "bhjm "),
            ("o", "iklp90"),
            ("p", "o0-[;l"),
            ("q", "was 12"),
            ("r", "edft45"),
            ("s", "awedxz"),
            ("t", "r56ygf"),
            ("u", "y78ijh"),
            ("v", "cfgb "),
            ("w", "q23esa"),
            ("x", "zsdc "),
            ("y", "t67uhg"),
            ("z", "asx"),
        )
    }
}
98
+
99
+
100
def _register_layout(name: str, rows: Iterable[str]) -> None:
    """Build the neighbour map for ``rows`` and store it under ``name``.

    An existing entry with the same name in ``_KEYNEIGHBORS`` is replaced.
    """
    neighbor_map = _build_neighbor_map(rows)
    _KEYNEIGHBORS[name] = neighbor_map
102
+
103
+
104
# Grid-derived layouts. Each row string is one keyboard row, top to bottom;
# a space is an empty grid cell.

# NOTE(review): the backslash ends both the first and the second DVORAK row,
# so that key occupies two grid cells; confirm whether the first row should
# stop at "]".
_register_layout(
    name="DVORAK",
    rows=(
        "`1234567890[]\\",
        " ',.pyfgcrl/=\\",
        " aoeuidhtns-",
        " ;qjkxbmwvz",
    ),
)

_register_layout(
    name="COLEMAK",
    rows=(
        "`1234567890-=",
        " qwfpgjluy;[]\\",
        " arstdhneio'",
        " zxcvbkm,./",
    ),
)

_register_layout(
    name="QWERTY",
    rows=(
        "`1234567890-=",
        " qwertyuiop[]\\",
        " asdfghjkl;'",
        " zxcvbnm,./",
    ),
)

_register_layout(
    name="AZERTY",
    rows=(
        "²&é\"'(-è_çà)=",
        " azertyuiop^$",
        " qsdfghjklmù*",
        " <wxcvbn,;:!",
    ),
)
143
+
144
+
145
class KeyNeighbors:
    """Attribute-style view over the registered keyboard layouts.

    Every layout present in ``_KEYNEIGHBORS`` at construction time becomes an
    instance attribute named after the layout and holding its neighbour map.
    """

    def __init__(self) -> None:
        for layout_name in _KEYNEIGHBORS:
            setattr(self, layout_name, _KEYNEIGHBORS[layout_name])


# Shared module-level instance exposing the layouts registered above.
KEYNEIGHBORS: KeyNeighbors = KeyNeighbors()
@@ -0,0 +1,57 @@
1
+ from .typogre import Typogre, typogre
2
+ from .mim1c import Mim1c, mim1c
3
+ from .jargoyle import Jargoyle, jargoyle
4
+ from .reduple import Reduple, reduple
5
+ from .rushmore import Rushmore, rushmore
6
+ from .redactyl import Redactyl, redactyl
7
+ from .scannequin import Scannequin, scannequin
8
+ from .core import Glitchling, Gaggle
9
+
10
# Public API: each glitchling class next to its ready-made instance, then the
# core types and the ``summon`` helper.
__all__ = [
    "Typogre", "typogre",
    "Mim1c", "mim1c",
    "Jargoyle", "jargoyle",
    "Reduple", "reduple",
    "Rushmore", "rushmore",
    "Redactyl", "redactyl",
    "Scannequin", "scannequin",
    "Glitchling", "Gaggle",
    "summon",
]
29
+
30
+
31
def summon(glitchlings: list[str | Glitchling], seed: int = 151) -> Gaggle:
    """Assemble a Gaggle from glitchling names and/or instances.

    Args:
        glitchlings: Entries to summon. A ``Glitchling`` instance is used as
            given; any other entry is lower-cased and looked up among the
            built-in default glitchlings by name.
        seed: Master seed handed to the resulting ``Gaggle``.

    Returns:
        A ``Gaggle`` containing the resolved glitchlings in the given order.

    Raises:
        ValueError: If a name does not match any built-in glitchling.
    """
    defaults = (
        typogre,
        mim1c,
        jargoyle,
        reduple,
        rushmore,
        redactyl,
        scannequin,
    )
    registry = {}
    for default in defaults:
        registry[default.name.lower()] = default

    roster = []
    for entry in glitchlings:
        if not isinstance(entry, Glitchling):
            # Not an instance, so treat the entry as a case-insensitive name.
            match = registry.get(entry.lower())
            if not match:
                raise ValueError(f"Glitchling '{entry}' not found.")
            entry = match
        roster.append(entry)

    return Gaggle(roster, seed=seed)
@@ -0,0 +1,282 @@
1
+ """Core data structures used to model glitchlings and their interactions."""
2
+
3
+ import inspect
4
+ import random
5
+ from enum import IntEnum, auto
6
+ from hashlib import blake2s
7
+ from typing import TYPE_CHECKING, Any, Protocol
8
+
9
+ _datasets_error: ModuleNotFoundError | None = None
10
+ try: # pragma: no cover - optional dependency
11
+ from datasets import Dataset as _DatasetsDataset
12
+ except ModuleNotFoundError as error: # pragma: no cover - optional dependency
13
+ _DatasetsDataset = None # type: ignore[assignment]
14
+ _datasets_error = error
15
+ else:
16
+ _datasets_error = None
17
+
18
+ if TYPE_CHECKING: # pragma: no cover - typing only
19
+ from datasets import Dataset # type: ignore
20
+ elif _DatasetsDataset is not None:
21
+ Dataset = _DatasetsDataset
22
+ else:
23
+
24
+ class Dataset(Protocol): # type: ignore[no-redef]
25
+ """Typed stub mirroring the Hugging Face dataset interface used here."""
26
+
27
+ def with_transform(self, function: Any) -> "Dataset": ...
28
+
29
+
30
+ def _is_transcript(value: Any) -> bool:
31
+ """Return True when the value resembles a chat transcript."""
32
+
33
+ if not isinstance(value, list):
34
+ return False
35
+
36
+ if not value:
37
+ return True
38
+
39
+ if not all(isinstance(turn, dict) for turn in value):
40
+ return False
41
+
42
+ return "content" in value[-1]
43
+
44
+
45
class CorruptionCallable(Protocol):
    """Structural type for corruption functions.

    A conforming callable takes the text as its first positional argument,
    may accept further positional and keyword arguments, and returns the
    corrupted text.
    """

    def __call__(self, text: str, *args: Any, **kwargs: Any) -> str: ...
49
+
50
+
51
# Text granularity levels, used to impose a sort order on glitchlings.
# Coarser levels carry smaller values and therefore sort first: e.g.
# duplicating a word and then adding a typo can give a different result than
# adding the typo first and duplicating afterwards.
class AttackWave(IntEnum):
    """Granularity of text that a glitchling corrupts (coarsest first)."""

    DOCUMENT = 1
    PARAGRAPH = 2
    SENTENCE = 3
    WORD = 4
    CHARACTER = 5
63
+
64
+
65
# Tie-breaker applied among glitchlings that share an attack wave.
class AttackOrder(IntEnum):
    """Relative execution order for glitchlings within the same wave."""

    FIRST = 1
    EARLY = 2
    NORMAL = 3
    LATE = 4
    LAST = 5
74
+
75
+
76
class Glitchling:
    """A text corruption agent that owns its corruption callable, parameters and RNG."""

    def __init__(
        self,
        name: str,
        corruption_function: CorruptionCallable,
        scope: AttackWave,
        order: AttackOrder = AttackOrder.NORMAL,
        seed: int | None = None,
        **kwargs: Any,
    ) -> None:
        """Set up the glitchling.

        Args:
            name: Human readable glitchling name.
            corruption_function: Callable that transforms the text.
            scope: Text granularity the glitchling works at (stored as ``level``).
            order: Position relative to other glitchlings of the same scope.
            seed: Seed for the private RNG; ``None`` leaves seeding to
                ``random.Random``'s default.
            **kwargs: Extra parameters, stored through ``set_param`` and later
                forwarded to the corruption callable.
        """

        # Every glitchling carries a private RNG so that runs are
        # reproducible and independent of other glitchlings.
        self.seed = seed
        self.rng: random.Random = random.Random(seed)
        self.name: str = name
        self.corruption_function: CorruptionCallable = corruption_function
        self.level: AttackWave = scope
        self.order: AttackOrder = order
        self.kwargs: dict[str, Any] = {}
        for param_name, param_value in kwargs.items():
            self.set_param(param_name, param_value)

    def set_param(self, key: str, value: Any) -> None:
        """Store ``value`` both as an attribute and as a forwarded keyword argument.

        Setting ``"seed"`` additionally re-seeds the RNG.
        """

        setattr(self, key, value)
        self.kwargs[key] = value
        if key == "seed":
            self.reset_rng(value)

    def __corrupt(self, text: str, *args: Any, **kwargs: Any) -> str:
        """Call the corruption callable, passing ``rng`` when its signature names it."""

        try:
            parameters = inspect.signature(self.corruption_function).parameters
        except (TypeError, ValueError):
            # Signature could not be introspected: call without an RNG.
            parameters = {}

        if "rng" in parameters:
            return self.corruption_function(text, *args, rng=self.rng, **kwargs)
        return self.corruption_function(text, *args, **kwargs)

    def corrupt(self, text: str | list[dict[str, Any]]) -> str | list[dict[str, Any]]:
        """Corrupt plain text, or the last turn of a chat transcript.

        For a transcript, every turn is shallow-copied and only the
        ``"content"`` of the final turn is corrupted; the input list and its
        dicts are left untouched.
        """

        if not _is_transcript(text):
            return self.__corrupt(text, **self.kwargs)

        copied_turns = [dict(turn) for turn in text]
        if copied_turns:
            final_turn = copied_turns[-1]
            final_turn["content"] = self.__corrupt(
                final_turn["content"], **self.kwargs
            )
        return copied_turns

    def corrupt_dataset(self, dataset: Dataset, columns: list[str]) -> Dataset:
        """Attach a transform to ``dataset`` that corrupts the given columns.

        The work is registered through ``dataset.with_transform`` rather than
        applied here.

        Raises:
            ModuleNotFoundError: If the ``datasets`` package is not installed.
        """

        if _DatasetsDataset is None:
            raise ModuleNotFoundError("datasets is not installed") from _datasets_error

        def _every_turn_has_content(value: Any) -> bool:
            """Return ``True`` for a non-empty list of dicts that all hold "content"."""

            return (
                isinstance(value, list)
                and bool(value)
                and all(isinstance(turn, dict) and "content" in turn for turn in value)
            )

        def _transform_row(row: dict[str, Any]) -> dict[str, Any]:
            updated = dict(row)
            for column in columns:
                cell = updated[column]
                if isinstance(cell, list) and not _every_turn_has_content(cell):
                    # A plain list: corrupt its items one at a time.
                    updated[column] = [self.corrupt(item) for item in cell]
                else:
                    # A transcript or a single value goes through as a whole.
                    updated[column] = self.corrupt(cell)
            return updated

        return dataset.with_transform(_transform_row)

    def __call__(self, text: str, *args: Any, **kwds: Any) -> str | list[dict[str, Any]]:
        """Make the glitchling callable; delegates to ``corrupt``.

        NOTE(review): extra positional/keyword arguments are forwarded to
        ``corrupt``, which in this class accepts only ``text`` — confirm the
        intended behaviour for calls that pass them.
        """

        return self.corrupt(text, *args, **kwds)

    def reset_rng(self, seed: int | None = None) -> None:
        """Re-create the RNG from the stored seed, first replacing it if ``seed`` is given.

        Nothing happens when neither a new nor a stored seed is available.
        """

        if seed is not None:
            self.seed = seed
        if self.seed is None:
            return
        self.rng = random.Random(self.seed)

    def clone(self, seed: int | None = None) -> "Glitchling":
        """Return a copy of this glitchling, optionally with a different seed.

        Subclasses are rebuilt from the stored keyword parameters alone; a
        plain ``Glitchling`` is rebuilt from its name, callable, level, order
        and parameters.
        """

        params = {key: value for key, value in self.kwargs.items() if key != "seed"}
        effective_seed = self.seed if seed is None else seed
        if effective_seed is not None:
            params["seed"] = effective_seed

        kind = self.__class__
        if kind is not Glitchling:
            return kind(**params)

        return Glitchling(
            self.name,
            self.corruption_function,
            self.level,
            self.order,
            **params,
        )
214
+
215
+
216
class Gaggle(Glitchling):
    """A collection of glitchlings executed in a deterministic order.

    Members are cloned on entry, re-seeded from the master seed, grouped by
    attack wave, and applied wave by wave (then by order, then by name).
    """

    def __init__(self, glitchlings: list[Glitchling], seed: int = 151):
        """Initialize the gaggle and derive per-glitchling RNG seeds.

        Args:
            glitchlings: Glitchlings to orchestrate. Each one is cloned, so
                the caller's instances are not modified.
            seed: Master seed used to derive per-glitchling seeds.
        """

        super().__init__("Gaggle", self.corrupt, AttackWave.DOCUMENT, seed=seed)
        self.glitchlings: dict[AttackWave, list[Glitchling]] = {
            level: [] for level in AttackWave
        }
        self.apply_order: list[Glitchling] = []
        # Clones in the caller's original order. The position is an input to
        # derive_seed, so clone() needs this order to rebuild the same gaggle.
        self._members: list[Glitchling] = []
        for idx, g in enumerate(glitchlings):
            _g = g.clone()
            derived_seed = Gaggle.derive_seed(seed, _g.name, idx)
            _g.reset_rng(derived_seed)
            self.glitchlings[g.level].append(_g)
            self._members.append(_g)
        self.sort_glitchlings()

    @staticmethod
    def derive_seed(master_seed: int, glitchling_name: str, index: int) -> int:
        """Derive a deterministic seed for a glitchling based on the master seed.

        The master seed, the glitchling name and the index are hashed with
        BLAKE2s (8-byte digest), separated by NUL bytes; the digest is read
        as a big-endian unsigned integer.
        """

        def _int_to_bytes(value: int) -> bytes:
            """Encode an int big-endian in as few bytes as hold it (signed if negative)."""
            if value == 0:
                return b"\x00"

            abs_value = abs(value)
            length = max(1, (abs_value.bit_length() + 7) // 8)

            if value < 0:
                # The magnitude-based width can be one byte short for a
                # two's-complement encoding; widen until the value fits.
                while True:
                    try:
                        return value.to_bytes(length, "big", signed=True)
                    except OverflowError:
                        length += 1

            return abs_value.to_bytes(length, "big", signed=False)

        hasher = blake2s(digest_size=8)
        hasher.update(_int_to_bytes(master_seed))
        hasher.update(b"\x00")
        hasher.update(glitchling_name.encode("utf-8"))
        hasher.update(b"\x00")
        hasher.update(_int_to_bytes(index))
        return int.from_bytes(hasher.digest(), "big")

    def sort_glitchlings(self) -> None:
        """Sort glitchlings by wave then order to produce application order."""

        self.apply_order = [
            g
            for _, glitchlings in sorted(self.glitchlings.items())
            for g in sorted(glitchlings, key=lambda x: (x.order, x.name))
        ]

    def corrupt(self, text: str | list[dict[str, Any]]) -> str | list[dict[str, Any]]:
        """Apply each glitchling to the provided text sequentially."""

        corrupted = text
        for glitchling in self.apply_order:
            corrupted = glitchling(corrupted)
        return corrupted

    def clone(self, seed: int | None = None) -> "Gaggle":
        """Create a copy of this gaggle, optionally with a new master seed.

        The inherited implementation rebuilds subclasses from keyword
        parameters only, which cannot satisfy this constructor's required
        ``glitchlings`` argument. The copy is instead built from the members
        captured at construction time, in their original order, so the same
        seed yields the same derived member seeds. Members added later by
        mutating ``self.glitchlings`` directly are not included.

        Args:
            seed: Master seed for the copy; defaults to this gaggle's seed.
        """

        clone_seed = seed if seed is not None else self.seed
        return self.__class__(list(self._members), seed=clone_seed)