glitchlings 0.2.3__cp310-cp310-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,128 @@
1
+ import re
2
+ import random
3
+ from typing import Any
4
+
5
+ from .core import Glitchling, AttackWave
6
+ from ._rate import resolve_rate
7
+
8
+ try:
9
+ from glitchlings._zoo_rust import reduplicate_words as _reduplicate_words_rust
10
+ except ImportError: # pragma: no cover - compiled extension not present
11
+ _reduplicate_words_rust = None
12
+
13
+
14
+ def _python_reduplicate_words(
15
+ text: str,
16
+ *,
17
+ rate: float,
18
+ rng: random.Random,
19
+ ) -> str:
20
+ """Randomly reduplicate words in the text.
21
+
22
+ Parameters
23
+ - text: Input text.
24
+ - rate: Max proportion of words to reduplicate (default 0.05).
25
+ - seed: Optional seed if `rng` not provided.
26
+ - rng: Optional RNG; overrides seed.
27
+
28
+ Notes
29
+ - Preserves spacing and punctuation by tokenizing with separators.
30
+ - Deterministic when run with a fixed seed or via Gaggle.
31
+ """
32
+ # Preserve exact spacing and punctuation by using regex
33
+ tokens = re.split(r"(\s+)", text) # Split but keep separators
34
+
35
+ for i in range(0, len(tokens), 2): # Every other token is a word
36
+ if i >= len(tokens):
37
+ break
38
+
39
+ word = tokens[i]
40
+ if not word or word.isspace(): # Skip empty or whitespace
41
+ continue
42
+
43
+ # Only consider actual words for reduplication
44
+ if rng.random() < rate:
45
+ # Check if word has trailing punctuation
46
+ match = re.match(r"^(\W*)(.*?)(\W*)$", word)
47
+ if match:
48
+ prefix, core, suffix = match.groups()
49
+ # Reduplicate with a space: "word" -> "word word"
50
+ tokens[i] = f"{prefix}{core} {core}{suffix}"
51
+ else:
52
+ tokens[i] = f"{word} {word}"
53
+ return "".join(tokens)
54
+
55
+
56
def reduplicate_words(
    text: str,
    rate: float | None = None,
    seed: int | None = None,
    rng: random.Random | None = None,
    *,
    reduplication_rate: float | None = None,
) -> str:
    """Randomly reduplicate words in the text.

    ``rate`` and the legacy ``reduplication_rate`` keyword are reconciled by
    ``resolve_rate`` (default 0.05) and negative results are raised to 0.0.
    When ``rng`` is omitted a ``random.Random(seed)`` is created. The Rust
    extension handles the work if it was imported; otherwise the Python
    implementation is used.
    """
    resolved = resolve_rate(
        rate=rate,
        legacy_value=reduplication_rate,
        default=0.05,
        legacy_name="reduplication_rate",
    )
    generator = random.Random(seed) if rng is None else rng
    probability = max(0.0, resolved)

    if _reduplicate_words_rust is None:
        return _python_reduplicate_words(text, rate=probability, rng=generator)
    return _reduplicate_words_rust(text, probability, generator)
90
+
91
+
92
class Reduple(Glitchling):
    """Glitchling that repeats words to simulate stuttering speech."""

    def __init__(
        self,
        *,
        rate: float | None = None,
        reduplication_rate: float | None = None,
        seed: int | None = None,
    ) -> None:
        """Set up the glitchling.

        ``reduplication_rate`` is the legacy spelling of ``rate``; the pair is
        reconciled by ``resolve_rate`` with a default of 0.05.
        """
        # Assigned before the base initialiser runs; presumably Glitchling
        # consults the alias map, so keep this ordering.
        self._param_aliases = {"reduplication_rate": "rate"}
        super().__init__(
            name="Reduple",
            corruption_function=reduplicate_words,
            scope=AttackWave.WORD,
            seed=seed,
            rate=resolve_rate(
                rate=rate,
                legacy_value=reduplication_rate,
                default=0.05,
                legacy_name="reduplication_rate",
            ),
        )

    def pipeline_operation(self) -> dict[str, Any] | None:
        """Return the pipeline descriptor, or ``None`` when no rate is stored."""
        stored = self.kwargs.get("rate")
        if stored is not None:
            return {"type": "reduplicate", "reduplication_rate": float(stored)}
        return None
122
+
123
+
124
+
125
+ reduple = Reduple()
126
+
127
+
128
+ __all__ = ["Reduple", "reduple"]
@@ -0,0 +1,136 @@
1
+ import math
2
+ import random
3
+ import re
4
+ from typing import Any
5
+
6
+ from .core import Glitchling, AttackWave
7
+ from ._rate import resolve_rate
8
+
9
+ try:
10
+ from glitchlings._zoo_rust import delete_random_words as _delete_random_words_rust
11
+ except ImportError: # pragma: no cover - compiled extension not present
12
+ _delete_random_words_rust = None
13
+
14
+
15
+ def _python_delete_random_words(
16
+ text: str,
17
+ *,
18
+ rate: float,
19
+ rng: random.Random,
20
+ ) -> str:
21
+ """Delete random words from the input text while preserving whitespace."""
22
+
23
+ if rate <= 0.0:
24
+ return text
25
+
26
+ tokens = re.split(r"(\s+)", text) # Split but keep separators for later rejoin
27
+
28
+ candidate_indices: list[int] = []
29
+ for i in range(2, len(tokens), 2): # Every other token is a word, skip the first word
30
+ word = tokens[i]
31
+ if not word or word.isspace():
32
+ continue
33
+
34
+ candidate_indices.append(i)
35
+
36
+ allowed_deletions = min(
37
+ len(candidate_indices), math.floor(len(candidate_indices) * rate)
38
+ )
39
+ if allowed_deletions <= 0:
40
+ return text
41
+
42
+ deletions = 0
43
+ for i in candidate_indices:
44
+ if rng.random() < rate:
45
+ word = tokens[i]
46
+ match = re.match(r"^(\W*)(.*?)(\W*)$", word)
47
+ if match:
48
+ prefix, _, suffix = match.groups()
49
+ tokens[i] = f"{prefix.strip()}{suffix.strip()}"
50
+ else:
51
+ tokens[i] = ""
52
+
53
+ deletions += 1
54
+ if deletions >= allowed_deletions:
55
+ break
56
+
57
+ text = "".join(tokens)
58
+ text = re.sub(r"\s+([.,;:])", r"\1", text)
59
+ text = re.sub(r"\s{2,}", " ", text).strip()
60
+
61
+ return text
62
+
63
+
64
def delete_random_words(
    text: str,
    rate: float | None = None,
    seed: int | None = None,
    rng: random.Random | None = None,
    *,
    max_deletion_rate: float | None = None,
) -> str:
    """Delete random words from the input text.

    ``rate`` and the legacy ``max_deletion_rate`` keyword are reconciled by
    ``resolve_rate`` (default 0.01) and negative results are raised to 0.0.
    When ``rng`` is omitted a ``random.Random(seed)`` is created. The Rust
    extension is used if it was imported, otherwise the Python fallback.
    """
    resolved = resolve_rate(
        rate=rate,
        legacy_value=max_deletion_rate,
        default=0.01,
        legacy_name="max_deletion_rate",
    )
    generator = random.Random(seed) if rng is None else rng
    probability = max(0.0, resolved)

    if _delete_random_words_rust is None:
        return _python_delete_random_words(text, rate=probability, rng=generator)
    return _delete_random_words_rust(text, probability, generator)
97
+
98
+
99
class Rushmore(Glitchling):
    """Glitchling that deletes words to simulate missing information."""

    def __init__(
        self,
        *,
        rate: float | None = None,
        max_deletion_rate: float | None = None,
        seed: int | None = None,
    ) -> None:
        """Set up the glitchling.

        ``max_deletion_rate`` is the legacy spelling of ``rate``; the pair is
        reconciled by ``resolve_rate`` with a default of 0.01.
        """
        # Assigned before the base initialiser runs; presumably Glitchling
        # consults the alias map, so keep this ordering.
        self._param_aliases = {"max_deletion_rate": "rate"}
        super().__init__(
            name="Rushmore",
            corruption_function=delete_random_words,
            scope=AttackWave.WORD,
            seed=seed,
            rate=resolve_rate(
                rate=rate,
                legacy_value=max_deletion_rate,
                default=0.01,
                legacy_name="max_deletion_rate",
            ),
        )

    def pipeline_operation(self) -> dict[str, Any] | None:
        """Return the pipeline descriptor, or ``None`` when no rate is stored.

        The stored ``rate`` wins; the legacy ``max_deletion_rate`` key is only
        consulted when ``rate`` is absent or ``None``.
        """
        for key in ("rate", "max_deletion_rate"):
            stored = self.kwargs.get(key)
            if stored is not None:
                return {"type": "delete", "max_deletion_rate": float(stored)}
        return None
131
+
132
+
133
+ rushmore = Rushmore()
134
+
135
+
136
+ __all__ = ["Rushmore", "rushmore"]
@@ -0,0 +1,171 @@
1
+ import re
2
+ import random
3
+ from typing import Any
4
+
5
+ from ._ocr_confusions import load_confusion_table
6
+ from .core import Glitchling, AttackWave, AttackOrder
7
+ from ._rate import resolve_rate
8
+
9
+ try:
10
+ from glitchlings._zoo_rust import ocr_artifacts as _ocr_artifacts_rust
11
+ except ImportError: # pragma: no cover - compiled extension not present
12
+ _ocr_artifacts_rust = None
13
+
14
+
15
def _python_ocr_artifacts(
    text: str,
    *,
    rate: float,
    rng: random.Random,
) -> str:
    """Introduce OCR-like artifacts into text.

    Parameters
    - text: Input text to corrupt.
    - rate: Fraction of candidate matches to replace; the quota is
      ``int(candidates * rate)``.
    - rng: Random number generator used to shuffle the candidates and to pick
      each replacement.

    Notes
    - The confusion table comes from ``load_confusion_table()`` and is consumed
      as ``(source, replacements)`` pairs.
    - Candidates are gathered longest source first, then shuffled; spans that
      overlap an already chosen span are skipped.
    - Replacements may change length, so the result is rebuilt left to right
      from slices of the original text.
    """
    if not text:
        return text

    # Shared data file, so the Python and Rust implementations stay in sync.
    table = load_confusion_table()

    # The gathering order feeds the shuffle below and therefore affects which
    # spans a given RNG state selects - keep it stable.
    spans: list[tuple[int, int, list[str]]] = []
    for source, options in sorted(table, key=lambda entry: -len(entry[0])):
        spans.extend(
            (hit.start(), hit.end(), options)
            for hit in re.finditer(re.escape(source), text)
        )

    if not spans:
        return text

    quota = int(len(spans) * rate)
    if quota <= 0:
        return text

    rng.shuffle(spans)
    picked: list[tuple[int, int, str]] = []
    for start, end, options in spans:
        if len(picked) >= quota:
            break
        # Two half-open spans collide when each starts before the other ends.
        if any(start < taken_end and taken_start < end for taken_start, taken_end, _ in picked):
            continue
        picked.append((start, end, rng.choice(options)))

    if not picked:
        return text

    # Stitch untouched slices and replacements together in reading order.
    picked.sort(key=lambda item: item[0])
    pieces: list[str] = []
    cursor = 0
    for start, end, replacement in picked:
        pieces.append(text[cursor:start])
        pieces.append(replacement)
        cursor = end
    pieces.append(text[cursor:])

    return "".join(pieces)
96
+
97
+
98
def ocr_artifacts(
    text: str,
    rate: float | None = None,
    seed: int | None = None,
    rng: random.Random | None = None,
    *,
    error_rate: float | None = None,
) -> str:
    """Introduce OCR-like artifacts into text.

    Empty input is returned immediately, before any rate handling. Otherwise
    ``rate`` and the legacy ``error_rate`` keyword are reconciled by
    ``resolve_rate`` (default 0.02), negative results are raised to 0.0, and a
    ``random.Random(seed)`` is created when ``rng`` is omitted. The Rust
    extension is preferred when it was imported.
    """
    if not text:
        return text

    resolved = resolve_rate(
        rate=rate,
        legacy_value=error_rate,
        default=0.02,
        legacy_name="error_rate",
    )
    generator = random.Random(seed) if rng is None else rng
    probability = max(0.0, resolved)

    if _ocr_artifacts_rust is None:
        return _python_ocr_artifacts(text, rate=probability, rng=generator)
    return _ocr_artifacts_rust(text, probability, generator)
130
+
131
+
132
class Scannequin(Glitchling):
    """Glitchling that simulates OCR artifacts using common confusions."""

    def __init__(
        self,
        *,
        rate: float | None = None,
        error_rate: float | None = None,
        seed: int | None = None,
    ) -> None:
        """Set up the glitchling.

        ``error_rate`` is the legacy spelling of ``rate``; the pair is
        reconciled by ``resolve_rate`` with a default of 0.02.
        """
        # Assigned before the base initialiser runs; presumably Glitchling
        # consults the alias map, so keep this ordering.
        self._param_aliases = {"error_rate": "rate"}
        super().__init__(
            name="Scannequin",
            corruption_function=ocr_artifacts,
            scope=AttackWave.CHARACTER,
            order=AttackOrder.LATE,
            seed=seed,
            rate=resolve_rate(
                rate=rate,
                legacy_value=error_rate,
                default=0.02,
                legacy_name="error_rate",
            ),
        )

    def pipeline_operation(self) -> dict[str, Any] | None:
        """Return the pipeline descriptor, or ``None`` when no rate is stored.

        The stored ``rate`` wins; the legacy ``error_rate`` key is only
        consulted when ``rate`` is absent or ``None``.
        """
        for key in ("rate", "error_rate"):
            stored = self.kwargs.get(key)
            if stored is not None:
                return {"type": "ocr", "error_rate": float(stored)}
        return None
165
+
166
+
167
+
168
+ scannequin = Scannequin()
169
+
170
+
171
+ __all__ = ["Scannequin", "scannequin"]
@@ -0,0 +1,212 @@
1
+ from __future__ import annotations
2
+
3
+ import math
4
+ import random
5
+ from typing import Optional
6
+
7
+ from .core import Glitchling, AttackWave, AttackOrder
8
+ from ._rate import resolve_rate
9
+ from ..util import KEYNEIGHBORS
10
+
11
+ try:
12
+ from glitchlings._zoo_rust import fatfinger as _fatfinger_rust
13
+ except ImportError: # pragma: no cover - compiled extension not present
14
+ _fatfinger_rust = None
15
+
16
+
17
+ def _python_unichar(text: str, rng: random.Random) -> str:
18
+ """Collapse one random doubled letter (like 'ee' in 'seed') to a single occurrence."""
19
+ import re
20
+
21
+ matches = list(re.finditer(r"((.)\2)(?=\w)", text))
22
+ if not matches:
23
+ return text
24
+ start, end = rng.choice(matches).span(1)
25
+ return text[:start] + text[start] + text[end:]
26
+
27
+
28
+ def _python_skipped_space(text: str, rng: random.Random) -> str:
29
+ import re
30
+
31
+ space_positions = [m.start() for m in re.finditer(r" ", text)]
32
+ if not space_positions:
33
+ return text
34
+ idx = rng.choice(space_positions)
35
+ return text[:idx] + text[idx + 1 :]
36
+
37
+
38
+ def _python_random_space(text: str, rng: random.Random) -> str:
39
+ if len(text) < 2:
40
+ return text
41
+ idx = rng.randrange(1, len(text))
42
+ return text[:idx] + " " + text[idx:]
43
+
44
+
45
+ def _python_repeated_char(text: str, rng: random.Random) -> str:
46
+ positions = [i for i, c in enumerate(text) if not c.isspace()]
47
+ if not positions:
48
+ return text
49
+ i = rng.choice(positions)
50
+ return text[:i] + text[i] + text[i:]
51
+
52
+
53
+ def _python_is_word_char(c: str) -> bool:
54
+ return c.isalnum() or c == "_"
55
+
56
+
57
def _python_eligible_idx(s: str, i: int) -> bool:
    """Return True when ``s[i]`` and both of its neighbours are word characters.

    Out-of-range indices and the first and last positions are never eligible,
    because they lack a neighbour on one side.
    """
    # Only strictly interior positions have a character on both sides.
    if not 0 < i < len(s) - 1:
        return False
    return all(_python_is_word_char(s[j]) for j in (i, i - 1, i + 1))
65
+
66
+
67
def _python_draw_eligible_index(
    rng: random.Random, s: str, max_tries: int = 16
) -> Optional[int]:
    """Pick an index of ``s`` that satisfies ``_python_eligible_idx``.

    Up to ``max_tries`` uniformly random indices are tested first. If none is
    eligible, one more random starting point is drawn and every position is
    scanned once from there, wrapping around the end. Returns ``None`` for
    empty input or when no position is eligible.
    """
    length = len(s)
    if length == 0:
        return None

    for _ in range(max_tries):
        guess = rng.randrange(length)
        if _python_eligible_idx(s, guess):
            return guess

    # Exhaustive fallback: one full lap starting at a random origin.
    origin = rng.randrange(length)
    for offset in range(length):
        probe = (origin + offset) % length
        if _python_eligible_idx(s, probe):
            return probe
    return None
87
+
88
+
89
def _fatfinger_python(
    text: str,
    *,
    rate: float,
    layout: dict[str, list[str]],
    rng: random.Random,
) -> str:
    """Apply up to ``ceil(len(text) * rate)`` random typing errors to ``text``.

    Parameters
    - text: Input text.
    - rate: Edit budget as a fraction of the text length; ``<= 0`` is a no-op.
    - layout: Maps a lowercase character to its neighbouring keys.
    - rng: Random number generator driving every choice.

    Notes
    - All actions are drawn up front, then applied in order to the evolving
      string.
    - Positional actions need an index whose character and both neighbours are
      word characters; when none exists the action is skipped.
    """
    if rate <= 0.0:
        return text

    budget = math.ceil(len(text) * rate)
    if budget == 0:
        return text

    positional_actions = ("char_swap", "missing_char", "extra_char", "nearby_char")
    # Edits that pick their own position anywhere in the string.
    whole_text_edits = {
        "skipped_space": _python_skipped_space,
        "random_space": _python_random_space,
        "unichar": _python_unichar,
        "repeated_char": _python_repeated_char,
    }
    all_actions = positional_actions + tuple(whole_text_edits)

    # Draw the whole plan before editing so the RNG stream does not depend on
    # how earlier edits changed the string.
    plan = [rng.choice(all_actions) for _ in range(budget)]

    result = text
    for action in plan:
        edit = whole_text_edits.get(action)
        if edit is not None:
            result = edit(result, rng)
            continue

        idx = _python_draw_eligible_index(rng, result)
        if idx is None:
            continue

        ch = result[idx]
        if action == "char_swap":
            # Eligibility guarantees a right-hand neighbour to swap with.
            result = result[:idx] + result[idx + 1] + ch + result[idx + 2 :]
        elif action == "missing_char":
            result = result[:idx] + result[idx + 1 :]
        elif action == "extra_char":
            # Unknown keys (or empty neighbour lists) fall back to the key itself.
            options = layout.get(ch.lower(), []) or [ch]
            inserted = rng.choice(options) or ch
            result = result[:idx] + inserted + result[idx:]
        elif action == "nearby_char":
            options = layout.get(ch.lower(), [])
            if options:
                result = result[:idx] + rng.choice(options) + result[idx + 1 :]
    return result
142
+
143
+
144
def fatfinger(
    text: str,
    rate: float | None = None,
    keyboard: str = "CURATOR_QWERTY",
    seed: int | None = None,
    rng: random.Random | None = None,
    *,
    max_change_rate: float | None = None,
) -> str:
    """Introduce character-level "fat finger" edits with a Rust fast path.

    ``rate`` and the legacy ``max_change_rate`` keyword are reconciled by
    ``resolve_rate`` (default 0.02). Empty text yields ``""`` and a rate that
    clamps to 0.0 returns the text untouched. ``keyboard`` names the attribute
    of ``KEYNEIGHBORS`` that supplies the neighbour layout.
    """
    resolved = resolve_rate(
        rate=rate,
        legacy_value=max_change_rate,
        default=0.02,
        legacy_name="max_change_rate",
    )
    generator = random.Random(seed) if rng is None else rng

    if not text:
        return ""

    probability = max(0.0, resolved)
    if probability == 0.0:
        return text

    neighbours = getattr(KEYNEIGHBORS, keyboard)

    if _fatfinger_rust is None:
        return _fatfinger_python(text, rate=probability, layout=neighbours, rng=generator)
    return _fatfinger_rust(text, max_change_rate=probability, layout=neighbours, rng=generator)
177
+
178
+
179
class Typogre(Glitchling):
    """Glitchling that introduces deterministic keyboard-typing errors."""

    def __init__(
        self,
        *,
        rate: float | None = None,
        max_change_rate: float | None = None,
        keyboard: str = "CURATOR_QWERTY",
        seed: int | None = None,
    ) -> None:
        """Set up the glitchling.

        ``max_change_rate`` is the legacy spelling of ``rate``; the pair is
        reconciled by ``resolve_rate`` with a default of 0.02. ``keyboard`` is
        forwarded unchanged to the corruption function.
        """
        # Assigned before the base initialiser runs; presumably Glitchling
        # consults the alias map, so keep this ordering.
        self._param_aliases = {"max_change_rate": "rate"}
        super().__init__(
            name="Typogre",
            corruption_function=fatfinger,
            scope=AttackWave.CHARACTER,
            order=AttackOrder.EARLY,
            seed=seed,
            rate=resolve_rate(
                rate=rate,
                legacy_value=max_change_rate,
                default=0.02,
                legacy_name="max_change_rate",
            ),
            keyboard=keyboard,
        )
206
+
207
+
208
+ typogre = Typogre()
209
+
210
+
211
+ __all__ = ["Typogre", "typogre"]
212
+