glitchlings 0.2.0__cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,166 @@
1
+ import re
2
+ import random
3
+
4
+ from .core import Glitchling, AttackWave, AttackOrder
5
+
6
+ try:
7
+ from glitchlings._zoo_rust import ocr_artifacts as _ocr_artifacts_rust
8
+ except ImportError: # pragma: no cover - compiled extension not present
9
+ _ocr_artifacts_rust = None
10
+
11
+
12
+ def _python_ocr_artifacts(
13
+ text: str,
14
+ *,
15
+ error_rate: float,
16
+ rng: random.Random,
17
+ ) -> str:
18
+ """Introduce OCR-like artifacts into text.
19
+
20
+ Parameters
21
+ - text: Input text to corrupt.
22
+ - error_rate: Max proportion of eligible confusion matches to replace (default 0.02).
23
+ - seed: Optional seed if `rng` not provided.
24
+ - rng: Optional RNG; overrides seed.
25
+
26
+ Notes
27
+ - Uses a curated set of common OCR confusions (rn↔m, cl↔d, O↔0, l/I/1, etc.).
28
+ - Collects all non-overlapping candidate spans in reading order, then samples
29
+ a subset deterministically with the provided RNG.
30
+ - Replacements can change length (e.g., m→rn), so edits are applied from left
31
+ to right using precomputed spans to avoid index drift.
32
+ """
33
+ if not text:
34
+ return text
35
+
36
+ # map: source -> list of possible replacements
37
+ # Keep patterns small and specific; longer patterns first avoid overmatching
38
+ confusion_table: list[tuple[str, list[str]]] = [
39
+ ("li", ["h"]),
40
+ ("h", ["li"]),
41
+ ("rn", ["m"]),
42
+ ("m", ["rn"]),
43
+ ("cl", ["d"]),
44
+ ("d", ["cl"]),
45
+ ("I", ["l"]),
46
+ ("l", ["I", "1"]),
47
+ ("1", ["l", "I"]),
48
+ ("0", ["O"]),
49
+ ("O", ["0"]),
50
+ ("B", ["8"]),
51
+ ("8", ["B"]),
52
+ ("S", ["5"]),
53
+ ("5", ["S"]),
54
+ ("Z", ["2"]),
55
+ ("2", ["Z"]),
56
+ ("G", ["6"]),
57
+ ("6", ["G"]),
58
+ ("“", ['"']),
59
+ ("”", ['"']),
60
+ ("‘", ["'"]),
61
+ ("’", ["'"]),
62
+ ("—", ["-"]), # em dash -> hyphen
63
+ ("–", ["-"]), # en dash -> hyphen
64
+ ]
65
+
66
+ # Build candidate matches as (start, end, choices)
67
+ candidates: list[tuple[int, int, list[str]]] = []
68
+
69
+ # To avoid double-counting overlapping patterns (like 'l' inside 'li'),
70
+ # we will scan longer patterns first by sorting by len(src) desc.
71
+ for src, choices in sorted(confusion_table, key=lambda p: -len(p[0])):
72
+ pattern = re.escape(src)
73
+ for m in re.finditer(pattern, text):
74
+ start, end = m.span()
75
+ candidates.append((start, end, choices))
76
+
77
+ if not candidates:
78
+ return text
79
+
80
+ # Decide how many to replace
81
+ k = int(len(candidates) * error_rate)
82
+ if k <= 0:
83
+ return text
84
+
85
+ # Shuffle deterministically and select non-overlapping k spans
86
+ rng.shuffle(candidates)
87
+ chosen: list[tuple[int, int, str]] = []
88
+ occupied: list[tuple[int, int]] = []
89
+
90
+ def overlaps(a: tuple[int, int], b: tuple[int, int]) -> bool:
91
+ return not (a[1] <= b[0] or b[1] <= a[0])
92
+
93
+ for start, end, choices in candidates:
94
+ if len(chosen) >= k:
95
+ break
96
+ span = (start, end)
97
+ if any(overlaps(span, occ) for occ in occupied):
98
+ continue
99
+ replacement = rng.choice(choices)
100
+ chosen.append((start, end, replacement))
101
+ occupied.append(span)
102
+
103
+ if not chosen:
104
+ return text
105
+
106
+ # Apply edits from left to right
107
+ chosen.sort(key=lambda t: t[0])
108
+ out_parts = []
109
+ cursor = 0
110
+ for start, end, rep in chosen:
111
+ if cursor < start:
112
+ out_parts.append(text[cursor:start])
113
+ out_parts.append(rep)
114
+ cursor = end
115
+ if cursor < len(text):
116
+ out_parts.append(text[cursor:])
117
+
118
+ return "".join(out_parts)
119
+
120
+
121
def ocr_artifacts(
    text: str,
    error_rate: float = 0.02,
    seed: int | None = None,
    rng: random.Random | None = None,
) -> str:
    """Corrupt ``text`` with OCR-style character confusions.

    Parameters
    - text: Input text; falsy input is returned as-is.
    - error_rate: Forwarded unchanged to the selected implementation.
    - seed: Used to build a ``random.Random`` when ``rng`` is not given.
    - rng: Random number generator to use; takes precedence over ``seed``.

    The compiled Rust implementation is used when it was importable,
    otherwise the pure-Python implementation is called.
    """
    if not text:
        return text

    generator = random.Random(seed) if rng is None else rng

    rust_impl = _ocr_artifacts_rust
    if rust_impl is None:
        # No compiled extension available: fall back to the Python version.
        return _python_ocr_artifacts(text, error_rate=error_rate, rng=generator)

    # The Rust entry point is called with positional arguments.
    return rust_impl(text, error_rate, generator)
142
+
143
+
144
class Scannequin(Glitchling):
    """Glitchling that simulates OCR artifacts using common confusions.

    Wires ``ocr_artifacts`` into the ``Glitchling`` base class as the
    corruption function, with character scope and late ordering.
    """

    def __init__(self, *, error_rate: float = 0.02, seed: int | None = None) -> None:
        """Configure the glitchling.

        Parameters
        - error_rate: Passed through to the base class as a keyword argument
          (presumably forwarded to ``ocr_artifacts`` -- confirm in ``core``).
        - seed: Optional seed handed to the base class.
        """
        super().__init__(
            name="Scannequin",
            corruption_function=ocr_artifacts,
            scope=AttackWave.CHARACTER,
            order=AttackOrder.LATE,
            seed=seed,
            error_rate=error_rate,
        )
161
+
162
+
163
# Module-level instance constructed with the default arguments.
scannequin = Scannequin()


# Names exported by ``from <module> import *``.
__all__ = ["Scannequin", "scannequin"]
@@ -0,0 +1,184 @@
1
+ from __future__ import annotations
2
+
3
+ import random
4
+ from typing import Optional
5
+
6
+ from .core import Glitchling, AttackWave, AttackOrder
7
+ from ..util import KEYNEIGHBORS
8
+
9
+ try:
10
+ from glitchlings._typogre_rust import fatfinger as _fatfinger_rust
11
+ except ImportError: # pragma: no cover - compiled extension not present
12
+ _fatfinger_rust = None
13
+
14
+
15
+ def _python_unichar(text: str, rng: random.Random) -> str:
16
+ """Collapse one random doubled letter (like 'ee' in 'seed') to a single occurrence."""
17
+ import re
18
+
19
+ matches = list(re.finditer(r"((.)\2)(?=\w)", text))
20
+ if not matches:
21
+ return text
22
+ start, end = rng.choice(matches).span(1)
23
+ return text[:start] + text[start] + text[end:]
24
+
25
+
26
+ def _python_skipped_space(text: str, rng: random.Random) -> str:
27
+ import re
28
+
29
+ space_positions = [m.start() for m in re.finditer(r" ", text)]
30
+ if not space_positions:
31
+ return text
32
+ idx = rng.choice(space_positions)
33
+ return text[:idx] + text[idx + 1 :]
34
+
35
+
36
+ def _python_random_space(text: str, rng: random.Random) -> str:
37
+ if len(text) < 2:
38
+ return text
39
+ idx = rng.randrange(1, len(text))
40
+ return text[:idx] + " " + text[idx:]
41
+
42
+
43
+ def _python_repeated_char(text: str, rng: random.Random) -> str:
44
+ positions = [i for i, c in enumerate(text) if not c.isspace()]
45
+ if not positions:
46
+ return text
47
+ i = rng.choice(positions)
48
+ return text[:i] + text[i] + text[i:]
49
+
50
+
51
+ def _python_is_word_char(c: str) -> bool:
52
+ return c.isalnum() or c == "_"
53
+
54
+
55
+ def _python_eligible_idx(s: str, i: int) -> bool:
56
+ if i < 0 or i >= len(s):
57
+ return False
58
+ if not _python_is_word_char(s[i]):
59
+ return False
60
+ left_ok = i > 0 and _python_is_word_char(s[i - 1])
61
+ right_ok = i + 1 < len(s) and _python_is_word_char(s[i + 1])
62
+ return left_ok and right_ok
63
+
64
+
65
+ def _python_draw_eligible_index(
66
+ rng: random.Random, s: str, max_tries: int = 16
67
+ ) -> Optional[int]:
68
+ n = len(s)
69
+ if n == 0:
70
+ return None
71
+ for _ in range(max_tries):
72
+ i = rng.randrange(n)
73
+ if _python_eligible_idx(s, i):
74
+ return i
75
+ start = rng.randrange(n)
76
+ i = start
77
+ while True:
78
+ if _python_eligible_idx(s, i):
79
+ return i
80
+ i += 1
81
+ if i == n:
82
+ i = 0
83
+ if i == start:
84
+ return None
85
+
86
+
87
def _fatfinger_python(
    text: str,
    *,
    max_change_rate: float,
    layout: dict[str, list[str]],
    rng: random.Random,
) -> str:
    """Apply a sequence of random typing-error edits to ``text``.

    Parameters
    - text: Input text.
    - max_change_rate: The number of edit attempts is
      ``max(1, int(len(text) * max_change_rate))``, so at least one edit is
      always attempted.
    - layout: Maps a lowercase character to its list of neighboring keys.
    - rng: Random number generator used for every random decision.

    All actions are drawn up front, then applied in order. Positional actions
    operate on an index drawn by ``_python_draw_eligible_index`` and are
    skipped when no such index exists; the remaining actions operate on the
    text as a whole.
    """
    positional_actions = ("char_swap", "missing_char", "extra_char", "nearby_char")
    # Whole-text edits, keyed by action name; insertion order fixes the order
    # in which they appear in the pool of drawable actions.
    whole_text_edits = {
        "skipped_space": _python_skipped_space,
        "random_space": _python_random_space,
        "unichar": _python_unichar,
        "repeated_char": _python_repeated_char,
    }
    action_pool = positional_actions + tuple(whole_text_edits)

    budget = max(1, int(len(text) * max_change_rate))
    plan = [rng.choice(action_pool) for _ in range(budget)]

    result = text
    for action in plan:
        if action not in positional_actions:
            result = whole_text_edits[action](result, rng)
            continue

        pos = _python_draw_eligible_index(rng, result)
        if pos is None:
            continue

        current = result[pos]
        if action == "char_swap":
            # Exchange the character with its right-hand neighbor.
            result = result[:pos] + result[pos + 1] + current + result[pos + 2 :]
        elif action == "missing_char":
            if _python_eligible_idx(result, pos):
                result = result[:pos] + result[pos + 1 :]
        elif action == "extra_char":
            # Fall back to the character itself when the layout has no entry
            # for it or the drawn neighbor is empty.
            options = layout.get(current.lower(), []) or [current]
            inserted = rng.choice(options) or current
            result = result[:pos] + inserted + result[pos:]
        elif action == "nearby_char":
            options = layout.get(current.lower(), [])
            if options:
                substitute = rng.choice(options)
                result = result[:pos] + substitute + result[pos + 1 :]
    return result
135
+
136
+
137
def fatfinger(
    text: str,
    max_change_rate: float = 0.02,
    keyboard: str = "CURATOR_QWERTY",
    seed: int | None = None,
    rng: random.Random | None = None,
) -> str:
    """Introduce character-level "fat finger" edits with a Rust fast path.

    Parameters
    - text: Input text; falsy input yields an empty string.
    - max_change_rate: Forwarded unchanged to the selected implementation.
    - keyboard: Name of the attribute looked up on ``KEYNEIGHBORS`` to obtain
      the key-neighbor layout.
    - seed: Used to build a ``random.Random`` when ``rng`` is not given.
    - rng: Random number generator to use; takes precedence over ``seed``.
    """
    generator = rng if rng is not None else random.Random(seed)
    if not text:
        return ""

    neighbor_map = getattr(KEYNEIGHBORS, keyboard)

    # Use the compiled implementation when it was importable.
    engine = _fatfinger_python if _fatfinger_rust is None else _fatfinger_rust
    return engine(
        text,
        max_change_rate=max_change_rate,
        layout=neighbor_map,
        rng=generator,
    )
157
+
158
+
159
class Typogre(Glitchling):
    """Glitchling that introduces deterministic keyboard-typing errors.

    Wires ``fatfinger`` into the ``Glitchling`` base class as the corruption
    function, with character scope and early ordering.
    """

    def __init__(
        self,
        *,
        max_change_rate: float = 0.02,
        keyboard: str = "CURATOR_QWERTY",
        seed: int | None = None,
    ) -> None:
        """Configure the glitchling.

        Parameters
        - max_change_rate: Passed through to the base class as a keyword
          argument (presumably forwarded to ``fatfinger`` -- confirm in ``core``).
        - keyboard: Layout name, passed through the same way.
        - seed: Optional seed handed to the base class.
        """
        super().__init__(
            name="Typogre",
            corruption_function=fatfinger,
            scope=AttackWave.CHARACTER,
            order=AttackOrder.EARLY,
            seed=seed,
            max_change_rate=max_change_rate,
            keyboard=keyboard,
        )
178
+
179
+
180
# Module-level instance constructed with the default arguments.
typogre = Typogre()


# Names exported by ``from <module> import *``.
__all__ = ["Typogre", "typogre"]
184
+