glitchlings 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,230 @@
1
+ """Core data structures used to model glitchlings and their interactions."""
2
+
3
+ import inspect
4
+ import random
5
+ from enum import IntEnum, auto
6
+ from hashlib import blake2s
7
+ from typing import Any, Protocol
8
+
9
+ from datasets import Dataset
10
+
11
+
12
class CorruptionCallable(Protocol):
    """Structural type for any callable that turns text into corrupted text.

    The text to corrupt is the first positional argument; implementations may
    accept arbitrary additional positional and keyword arguments.
    """

    def __call__(self, text: str, *args: Any, **kwargs: Any) -> str:
        """Return the corrupted form of ``text``."""
        ...
16
+
17
+
18
# Waves give glitchlings a coarse sort order. Members are numbered from the
# largest unit of text down to the smallest because application order matters:
# duplicating a word and then adding a typo can produce a different result
# than adding the typo first and duplicating afterwards.
class AttackWave(IntEnum):
    """Granularity of text that a glitchling corrupts.

    Lower values sort first, so document-level glitchlings are ordered ahead
    of character-level ones.
    """

    DOCUMENT = 1
    PARAGRAPH = 2
    SENTENCE = 3
    WORD = 4
    CHARACTER = 5
30
+
31
+
32
# Fine-grained tie-breaker applied between glitchlings that share a wave.
class AttackOrder(IntEnum):
    """Relative execution order for glitchlings within the same wave.

    Lower values sort first: ``FIRST`` precedes ``NORMAL``, which precedes
    ``LAST``.
    """

    FIRST = 1
    EARLY = 2
    NORMAL = 3
    LATE = 4
    LAST = 5
41
+
42
+
43
class Glitchling:
    """A single text corruption agent with deterministic behaviour.

    A glitchling bundles a corruption callable with the keyword parameters it
    is invoked with, a private ``random.Random`` instance, and the wave/order
    values used to sort it relative to other glitchlings.
    """

    def __init__(
        self,
        name: str,
        corruption_function: CorruptionCallable,
        scope: AttackWave,
        order: AttackOrder = AttackOrder.NORMAL,
        seed: int | None = None,
        **kwargs: Any,
    ) -> None:
        """Initialize a glitchling.

        Args:
            name: Human readable glitchling name.
            corruption_function: Callable used to transform text.
            scope: Text granularity on which the glitchling operates.
            order: Relative ordering within the same scope.
            seed: Optional seed for deterministic random behaviour.
            **kwargs: Additional parameters forwarded to the corruption callable.
        """

        # Each Glitchling maintains its own RNG for deterministic yet isolated behavior.
        # If no seed is supplied, we fall back to Python's default entropy.
        self.seed = seed
        self.rng: random.Random = random.Random(seed)
        self.name: str = name
        self.corruption_function: CorruptionCallable = corruption_function
        self.level: AttackWave = scope
        self.order: AttackOrder = order
        self.kwargs: dict[str, Any] = {}
        # Route every extra parameter through set_param so it is both stored
        # in self.kwargs and mirrored as an instance attribute.
        for kw, val in kwargs.items():
            self.set_param(kw, val)

    def set_param(self, key: str, value: Any) -> None:
        """Persist a parameter for use by the corruption callable.

        The value is exposed as an attribute of the same name and stored in
        ``self.kwargs``. Setting ``"seed"`` additionally reseeds the RNG.
        """

        setattr(self, key, value)
        self.kwargs[key] = value
        if key == "seed":
            self.reset_rng(value)

    def __corrupt(self, text: str, *args: Any, **kwargs: Any) -> str:
        """Execute the corruption callable, injecting the RNG when required.

        The glitchling's RNG is passed as ``rng=`` only when the callable's
        signature declares a parameter with that name.
        """

        try:
            signature = inspect.signature(self.corruption_function)
        except (TypeError, ValueError):
            # No introspectable signature: call without injecting the RNG.
            signature = None

        expects_rng = signature is not None and "rng" in signature.parameters
        if expects_rng:
            return self.corruption_function(text, *args, rng=self.rng, **kwargs)
        return self.corruption_function(text, *args, **kwargs)

    def corrupt(self, text: str | list[dict[str, Any]]) -> str | list[dict[str, Any]]:
        """Apply the corruption function to text or conversational transcripts.

        Args:
            text: Either a plain string, or a transcript given as a list of
                message dicts. For a transcript only the ``"content"`` of the
                final message is corrupted.

        Returns:
            The corrupted string, or a new transcript list whose final message
            is a corrupted copy. The caller's list and message dicts are left
            unmodified. An empty transcript yields a new empty list.
        """

        if isinstance(text, list):
            if not text:
                # Nothing to corrupt; avoid indexing an empty transcript.
                return []
            # Copy the list and the final turn so the input is never mutated.
            transcript = list(text)
            final_turn = dict(transcript[-1])
            final_turn["content"] = self.__corrupt(final_turn["content"], **self.kwargs)
            transcript[-1] = final_turn
            return transcript

        return self.__corrupt(text, **self.kwargs)

    def corrupt_dataset(self, dataset: Dataset, columns: list[str]) -> Dataset:
        """Apply corruption lazily across dataset columns.

        Args:
            dataset: Dataset to wrap.
            columns: Names of the columns whose values should be corrupted.

        Returns:
            The result of ``dataset.with_transform`` with the corrupting
            transform attached.
        """

        def _corrupt_row(row: dict[str, Any]) -> dict[str, Any]:
            # Shallow-copy so the mapping handed to the transform is not modified.
            row = dict(row)
            for column in columns:
                value = row[column]
                if isinstance(value, list):
                    # A list value is corrupted element by element.
                    row[column] = [self.corrupt(item) for item in value]
                else:
                    row[column] = self.corrupt(value)
            return row

        return dataset.with_transform(_corrupt_row)

    def __call__(self, text: str, *args: Any, **kwds: Any) -> str | list[dict[str, Any]]:
        """Allow a glitchling to be invoked directly like a callable.

        Any extra arguments are forwarded unchanged to ``corrupt``.
        """

        return self.corrupt(text, *args, **kwds)

    def reset_rng(self, seed: int | None = None) -> None:
        """Reset the glitchling's RNG to its initial seed.

        Args:
            seed: When given, replaces the stored seed before resetting. When
                no seed is given and none is stored, the RNG is left as is.
        """

        if seed is not None:
            self.seed = seed
        if self.seed is not None:
            self.rng = random.Random(self.seed)

    def clone(self, seed: int | None = None) -> "Glitchling":
        """Create a copy of this glitchling, optionally with a new seed.

        Args:
            seed: Seed for the copy; defaults to this glitchling's seed.

        Returns:
            A new instance of the same class. Plain ``Glitchling`` objects are
            rebuilt from name, callable, level and order; subclasses are
            rebuilt by calling their constructor with the stored keyword
            parameters (plus ``seed`` when one is available).
        """

        cls = self.__class__
        # "seed" is handled separately so an explicit override always wins.
        filtered_kwargs = {k: v for k, v in self.kwargs.items() if k != "seed"}
        clone_seed = seed if seed is not None else self.seed
        if clone_seed is not None:
            filtered_kwargs["seed"] = clone_seed

        if cls is Glitchling:
            return Glitchling(
                self.name,
                self.corruption_function,
                self.level,
                self.order,
                **filtered_kwargs,
            )

        return cls(**filtered_kwargs)
162
+
163
+
164
class Gaggle(Glitchling):
    """A collection of glitchlings executed in a deterministic order."""

    def __init__(self, glitchlings: list[Glitchling], seed: int = 151):
        """Initialize the gaggle and derive per-glitchling RNG seeds.

        Args:
            glitchlings: Glitchlings to orchestrate. Each one is cloned, so the
                objects passed in are not reseeded.
            seed: Master seed used to derive per-glitchling seeds.
        """

        super().__init__("Gaggle", self.corrupt, AttackWave.DOCUMENT, seed=seed)
        self.glitchlings: dict[AttackWave, list[Glitchling]] = {
            level: [] for level in AttackWave
        }
        self.apply_order: list[Glitchling] = []
        # Members in the order they were supplied; clone() needs this order
        # because the derived seed depends on each member's index.
        self._members: list[Glitchling] = []
        # Derive deterministic per-glitchling seeds from the master seed.
        for idx, g in enumerate(glitchlings):
            _g = g.clone()
            derived_seed = Gaggle.derive_seed(seed, _g.name, idx)
            _g.reset_rng(derived_seed)
            self.glitchlings[g.level].append(_g)
            self._members.append(_g)
        self.sort_glitchlings()

    @staticmethod
    def derive_seed(master_seed: int, glitchling_name: str, index: int) -> int:
        """Derive a deterministic seed for a glitchling based on the master seed.

        The master seed, the glitchling name and its index are hashed with
        BLAKE2s (8-byte digest), separated by NUL bytes, and the digest is
        returned as a big-endian integer.
        """

        def _int_to_bytes(value: int) -> bytes:
            # Minimal big-endian encoding; negatives use two's complement.
            if value == 0:
                return b"\x00"

            abs_value = abs(value)
            length = max(1, (abs_value.bit_length() + 7) // 8)

            if value < 0:
                # The magnitude-based length can be one byte short for a
                # signed encoding, so grow until the value fits.
                while True:
                    try:
                        return value.to_bytes(length, "big", signed=True)
                    except OverflowError:
                        length += 1

            return abs_value.to_bytes(length, "big", signed=False)

        hasher = blake2s(digest_size=8)
        hasher.update(_int_to_bytes(master_seed))
        hasher.update(b"\x00")
        hasher.update(glitchling_name.encode("utf-8"))
        hasher.update(b"\x00")
        hasher.update(_int_to_bytes(index))
        return int.from_bytes(hasher.digest(), "big")

    def sort_glitchlings(self) -> None:
        """Sort glitchlings by wave then order to produce application order.

        Within a wave, ties on order are broken by glitchling name.
        """

        self.apply_order = [
            g
            for _, glitchlings in sorted(self.glitchlings.items())
            for g in sorted(glitchlings, key=lambda x: (x.order, x.name))
        ]

    def corrupt(self, text: str) -> str:
        """Apply each glitchling to the provided text sequentially."""

        corrupted = text
        for glitchling in self.apply_order:
            corrupted = glitchling(corrupted)
        return corrupted

    def clone(self, seed: int | None = None) -> "Gaggle":
        """Create a copy of this gaggle, optionally with a new master seed.

        The inherited implementation rebuilds subclasses from keyword
        parameters only and therefore cannot supply the required
        ``glitchlings`` argument; this override passes the members explicitly.

        Args:
            seed: Master seed for the copy; defaults to this gaggle's seed.

        Returns:
            A new gaggle holding fresh clones of the same members, in the
            order they were originally supplied.
        """

        clone_seed = seed if seed is not None else self.seed
        members = list(self._members)
        if clone_seed is None:
            return self.__class__(members)
        return self.__class__(members, seed=clone_seed)
@@ -0,0 +1,225 @@
1
+ import random
2
+ import re
3
+ from collections.abc import Iterable
4
+ from dataclasses import dataclass
5
+ from typing import Any, Literal, cast
6
+
7
+ import nltk
8
+ from nltk.corpus import wordnet as wn
9
+ from .core import Glitchling, AttackWave
10
+
11
# Flipped to True by _ensure_wordnet() after the corpus loads successfully, so
# later calls return immediately without touching the corpus again.
_wordnet_ready = False
12
+
13
+
14
def _ensure_wordnet() -> None:
    """Make sure the WordNet corpus can be loaded, downloading it if needed.

    The first successful call records that fact in the module-level
    ``_wordnet_ready`` flag; subsequent calls return without doing anything.

    Raises:
        RuntimeError: If the corpus still cannot be loaded after a download
            attempt.
    """

    global _wordnet_ready
    if not _wordnet_ready:
        try:
            wn.ensure_loaded()
        except LookupError:
            # Corpus missing locally: fetch it once and retry the load.
            nltk.download("wordnet", quiet=True)
            try:
                wn.ensure_loaded()
            except LookupError as exc:  # pragma: no cover - only triggered when download fails
                raise RuntimeError(
                    "Unable to load NLTK WordNet corpus for the jargoyle glitchling."
                ) from exc

        _wordnet_ready = True
33
+
34
+
35
# A single WordNet part-of-speech tag accepted by this module.
PartOfSpeech = Literal["n", "v", "a", "r"]
# What callers may pass: one tag, an iterable of tags, or "any" for all tags.
PartOfSpeechInput = PartOfSpeech | Iterable[PartOfSpeech] | Literal["any"]
# Canonical internal form produced by _normalize_parts_of_speech().
NormalizedPartsOfSpeech = tuple[PartOfSpeech, ...]

# Every supported tag; also the value returned when the caller asks for "any".
_VALID_POS: tuple[PartOfSpeech, ...] = ("n", "v", "a", "r")
40
+
41
+
42
def _split_token(token: str) -> tuple[str, str, str]:
    """Break a token into ``(prefix, core, suffix)``.

    ``prefix`` and ``suffix`` are the runs of non-word characters at the start
    and end of the token; ``core`` is whatever lies between them. If the
    pattern does not match, the whole token is returned as the core with empty
    prefix and suffix.
    """

    parts = re.match(r"^(\W*)(.*?)(\W*)$", token)
    if parts is None:
        return "", token, ""
    return parts.group(1, 2, 3)
50
+
51
+
52
def _normalize_parts_of_speech(part_of_speech: PartOfSpeechInput) -> NormalizedPartsOfSpeech:
    """Coerce user input into a tuple of valid WordNet POS tags.

    Args:
        part_of_speech: A single tag, the string ``"any"``, or an iterable of
            tags. String tags are matched case-insensitively in both forms.

    Returns:
        A tuple of lowercase tags with duplicates removed, in first-seen
        order. ``"any"`` expands to every supported tag.

    Raises:
        ValueError: If a tag is not one of the supported values, or if an
            iterable contains no tags at all.
    """

    if isinstance(part_of_speech, str):
        lowered = part_of_speech.lower()
        if lowered == "any":
            return _VALID_POS
        if lowered not in _VALID_POS:
            raise ValueError(
                "part_of_speech must be one of 'n', 'v', 'a', 'r', or 'any'"
            )
        return (cast(PartOfSpeech, lowered),)

    normalized: list[PartOfSpeech] = []
    for pos in part_of_speech:
        # Lowercase string entries so iterables accept the same spellings as
        # the single-string form; non-strings fall through to the check below.
        tag = pos.lower() if isinstance(pos, str) else pos
        if tag not in _VALID_POS:
            raise ValueError(
                "part_of_speech entries must be one of 'n', 'v', 'a', or 'r'"
            )
        if tag not in normalized:
            normalized.append(cast(PartOfSpeech, tag))
    if not normalized:
        raise ValueError("part_of_speech iterable may not be empty")
    return tuple(normalized)
76
+
77
+
78
@dataclass(frozen=True)
class CandidateInfo:
    """Metadata for a candidate token that may be replaced.

    Instances are immutable (frozen dataclass).
    """

    # Non-word characters found at the start of the token.
    prefix: str
    # The token with its leading and trailing non-word characters removed.
    core_word: str
    # Non-word characters found at the end of the token.
    suffix: str
    # Requested POS tags for which a synset lookup on core_word returned results.
    parts_of_speech: NormalizedPartsOfSpeech
86
+
87
+
88
def _collect_synonyms(
    word: str, parts_of_speech: NormalizedPartsOfSpeech
) -> list[str]:
    """Return sorted synonym candidates for ``word``.

    Parts of speech are tried in the order given, and synsets in the order
    the lookup returns them. The first synset that offers at least one lemma
    different from ``word`` (compared case-insensitively, with underscores
    turned into spaces) supplies the result; nothing after it is consulted.

    Returns:
        The de-duplicated, sorted alternates from that synset, or an empty
        list when no synset offers one.
    """

    lowered_word = word.lower()
    for tag in parts_of_speech:
        for synset in wn.synsets(word, pos=tag):
            lemma_names = (lemma.name() for lemma in cast(Any, synset).lemmas())
            spaced_names = (name.replace("_", " ") for name in lemma_names)
            alternates = {
                candidate
                for candidate in spaced_names
                if candidate.lower() != lowered_word
            }
            if alternates:
                # Stop at the first productive synset to keep results stable.
                return sorted(alternates)

    return []
119
+
120
+
121
def substitute_random_synonyms(
    text: str,
    replacement_rate: float = 0.1,
    part_of_speech: PartOfSpeechInput = "n",
    seed: int | None = None,
    rng: random.Random | None = None,
) -> str:
    """Replace words with random WordNet synonyms.

    Parameters
    - text: Input text.
    - replacement_rate: Max proportion of candidate words to replace (default 0.1).
      Rates above 1.0 are treated as "every candidate"; rates that round down
      to zero replacements leave the text unchanged.
    - part_of_speech: WordNet POS tag(s) to target. Accepts "n", "v", "a", "r",
      any iterable of those tags, or "any" to include all four.
    - seed: Optional seed if `rng` not provided.
    - rng: Optional RNG instance used for deterministic sampling; overrides seed.

    Returns
    - The text with selected words replaced; whitespace and the punctuation
      around each replaced word are preserved.

    Determinism
    - Candidates collected in left-to-right order; no set() reordering.
    - Replacement positions chosen via rng.sample.
    - Synonyms sorted before rng.choice to fix ordering.
    - For each POS, the first synset containing alternate lemmas is used for stability.
    """
    _ensure_wordnet()

    active_rng: random.Random = rng if rng is not None else random.Random(seed)

    target_pos = _normalize_parts_of_speech(part_of_speech)

    # Split but keep whitespace separators so we can rebuild easily
    tokens = re.split(r"(\s+)", text)

    # Collect indices of candidate tokens (even positions 0,2,.. are words given our split design)
    candidate_indices: list[int] = []
    candidate_metadata: dict[int, CandidateInfo] = {}
    for idx, tok in enumerate(tokens):
        if idx % 2 == 0 and tok and not tok.isspace():
            prefix, core_word, suffix = _split_token(tok)
            if not core_word:
                # Token was punctuation only; nothing to look up.
                continue

            available_pos: NormalizedPartsOfSpeech = tuple(
                pos for pos in target_pos if wn.synsets(core_word, pos=pos)
            )
            if available_pos:
                candidate_indices.append(idx)
                candidate_metadata[idx] = CandidateInfo(
                    prefix=prefix,
                    core_word=core_word,
                    suffix=suffix,
                    parts_of_speech=available_pos,
                )

    if not candidate_indices:
        return text

    # Clamp to the population size: rng.sample raises ValueError when asked
    # for more items than exist, which a replacement_rate above 1.0 would do.
    max_replacements = min(
        len(candidate_indices),
        int(len(candidate_indices) * replacement_rate),
    )
    if max_replacements <= 0:
        return text

    # Choose which positions to replace deterministically via rng.sample
    replace_positions = active_rng.sample(candidate_indices, k=max_replacements)
    # Process in ascending order so RNG draws follow the text left to right
    replace_positions.sort()

    for pos in replace_positions:
        metadata = candidate_metadata[pos]
        synonyms = _collect_synonyms(metadata.core_word, metadata.parts_of_speech)
        if not synonyms:
            # The word has synsets but none offers an alternate lemma.
            continue

        replacement = active_rng.choice(synonyms)
        tokens[pos] = f"{metadata.prefix}{replacement}{metadata.suffix}"

    return "".join(tokens)
200
+
201
+
202
class Jargoyle(Glitchling):
    """Word-level glitchling that swaps words with random WordNet synonyms."""

    def __init__(
        self,
        *,
        replacement_rate: float = 0.1,
        part_of_speech: PartOfSpeechInput = "n",
        seed: int | None = None,
    ) -> None:
        """Configure the synonym-swapping glitchling.

        Args:
            replacement_rate: Forwarded to ``substitute_random_synonyms``.
            part_of_speech: Forwarded to ``substitute_random_synonyms``.
            seed: Optional seed for this glitchling's RNG.
        """

        super().__init__(
            "Jargoyle",
            substitute_random_synonyms,
            AttackWave.WORD,
            seed=seed,
            replacement_rate=replacement_rate,
            part_of_speech=part_of_speech,
        )
220
+
221
+
222
# Ready-made instance built with the default parameters and no seed.
jargoyle = Jargoyle()


# Public API of this module: the class and its default instance.
__all__ = ["Jargoyle", "jargoyle"]
@@ -1,62 +1,79 @@
1
- from typing import Literal
2
- from .core import Glitchling, AttackWave, AttackOrder
3
- import random
4
- from confusable_homoglyphs import confusables
5
-
6
-
7
- def swap_homoglyphs(
8
- text: str,
9
- replacement_rate: float = 0.02,
10
- classes: list[str] | Literal["all"] | None = None,
11
- seed: int | None = None,
12
- rng: random.Random | None = None,
13
- ) -> str:
14
- """Replace characters with visually confusable homoglyphs.
15
-
16
- Parameters
17
- - text: Input text.
18
- - replacement_rate: Max proportion of eligible characters to replace (default 0.02).
19
- - classes: Restrict replacements to these Unicode script classes (default ["LATIN","GREEK","CYRILLIC"]). Use "all" to allow any.
20
- - seed: Optional seed if `rng` not provided.
21
- - rng: Optional RNG; overrides seed.
22
-
23
- Notes
24
- - Only replaces characters present in confusables.confusables_data with single-codepoint alternatives.
25
- - Maintains determinism by shuffling candidates and sampling via the provided RNG.
26
- """
27
- if rng is None:
28
- rng = random.Random(seed)
29
-
30
- if classes is None:
31
- classes = ["LATIN", "GREEK", "CYRILLIC"]
32
-
33
- target_chars = [char for char in text if char.isalnum()]
34
- confusable_chars = [
35
- char for char in target_chars if char in confusables.confusables_data
36
- ]
37
- num_replacements = int(len(confusable_chars) * replacement_rate)
38
- done = 0
39
- rng.shuffle(confusable_chars)
40
- for char in confusable_chars:
41
- if done >= num_replacements:
42
- break
43
- options = [
44
- o["c"] for o in confusables.confusables_data[char] if len(o["c"]) == 1
45
- ]
46
- if classes != "all":
47
- options = [opt for opt in options if confusables.alias(opt) in classes]
48
- if not options:
49
- continue
50
- text = text.replace(char, rng.choice(options), 1)
51
- done += 1
52
- return text
53
-
54
-
55
# Ready-made character-level glitchling: runs last within its wave and swaps
# up to 2% of eligible characters for Latin, Greek or Cyrillic homoglyphs.
mim1c = Glitchling(
    name="Mim1c",
    corruption_function=swap_homoglyphs,
    scope=AttackWave.CHARACTER,
    order=AttackOrder.LAST,
    replacement_rate=0.02,
    classes=["LATIN", "GREEK", "CYRILLIC"],
)
1
+ from typing import Literal
2
+ from .core import Glitchling, AttackWave, AttackOrder
3
+ import random
4
+ from confusable_homoglyphs import confusables
5
+
6
+
7
+ def swap_homoglyphs(
8
+ text: str,
9
+ replacement_rate: float = 0.02,
10
+ classes: list[str] | Literal["all"] | None = None,
11
+ seed: int | None = None,
12
+ rng: random.Random | None = None,
13
+ ) -> str:
14
+ """Replace characters with visually confusable homoglyphs.
15
+
16
+ Parameters
17
+ - text: Input text.
18
+ - replacement_rate: Max proportion of eligible characters to replace (default 0.02).
19
+ - classes: Restrict replacements to these Unicode script classes (default ["LATIN","GREEK","CYRILLIC"]). Use "all" to allow any.
20
+ - seed: Optional seed if `rng` not provided.
21
+ - rng: Optional RNG; overrides seed.
22
+
23
+ Notes
24
+ - Only replaces characters present in confusables.confusables_data with single-codepoint alternatives.
25
+ - Maintains determinism by shuffling candidates and sampling via the provided RNG.
26
+ """
27
+ if rng is None:
28
+ rng = random.Random(seed)
29
+
30
+ if classes is None:
31
+ classes = ["LATIN", "GREEK", "CYRILLIC"]
32
+
33
+ target_chars = [char for char in text if char.isalnum()]
34
+ confusable_chars = [
35
+ char for char in target_chars if char in confusables.confusables_data
36
+ ]
37
+ num_replacements = int(len(confusable_chars) * replacement_rate)
38
+ done = 0
39
+ rng.shuffle(confusable_chars)
40
+ for char in confusable_chars:
41
+ if done >= num_replacements:
42
+ break
43
+ options = [
44
+ o["c"] for o in confusables.confusables_data[char] if len(o["c"]) == 1
45
+ ]
46
+ if classes != "all":
47
+ options = [opt for opt in options if confusables.alias(opt) in classes]
48
+ if not options:
49
+ continue
50
+ text = text.replace(char, rng.choice(options), 1)
51
+ done += 1
52
+ return text
53
+
54
+
55
class Mim1c(Glitchling):
    """Character-level glitchling that swaps characters for look-alike homoglyphs."""

    def __init__(
        self,
        *,
        replacement_rate: float = 0.02,
        classes: list[str] | Literal["all"] | None = None,
        seed: int | None = None,
    ) -> None:
        """Configure the homoglyph-swapping glitchling.

        Args:
            replacement_rate: Forwarded to ``swap_homoglyphs``.
            classes: Forwarded to ``swap_homoglyphs``.
            seed: Optional seed for this glitchling's RNG.
        """

        super().__init__(
            "Mim1c",
            swap_homoglyphs,
            AttackWave.CHARACTER,
            AttackOrder.LAST,
            seed=seed,
            replacement_rate=replacement_rate,
            classes=classes,
        )
74
+
75
+
76
# Ready-made instance built with the default parameters and no seed.
mim1c = Mim1c()


# Public API of this module: the class and its default instance.
__all__ = ["Mim1c", "mim1c"]