glitchlings 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings/__init__.py +42 -0
- glitchlings/__main__.py +9 -0
- {dlc → glitchlings/dlc}/prime.py +52 -50
- glitchlings/main.py +238 -0
- glitchlings/util/__init__.py +151 -0
- {zoo → glitchlings/zoo}/__init__.py +57 -50
- {zoo → glitchlings/zoo}/core.py +190 -136
- glitchlings/zoo/jargoyle.py +225 -0
- {zoo → glitchlings/zoo}/mim1c.py +79 -62
- {zoo → glitchlings/zoo}/redactyl.py +91 -73
- {zoo → glitchlings/zoo}/reduple.py +73 -54
- {zoo → glitchlings/zoo}/rushmore.py +74 -53
- {zoo → glitchlings/zoo}/scannequin.py +140 -124
- {zoo → glitchlings/zoo}/typogre.py +231 -224
- glitchlings-0.1.2.dist-info/METADATA +455 -0
- glitchlings-0.1.2.dist-info/RECORD +20 -0
- glitchlings-0.1.2.dist-info/licenses/LICENSE +201 -0
- .github/workflows/publish.yml +0 -42
- .github/workflows/testpypi.yml +0 -38
- .gitignore +0 -12
- LICENSE +0 -21
- MONSTER_MANUAL.md +0 -272
- PKG-INFO +0 -244
- README.md +0 -192
- RELEASE.md +0 -47
- __init__.py +0 -73
- glitchlings-0.1.0.dist-info/METADATA +0 -244
- glitchlings-0.1.0.dist-info/RECORD +0 -28
- glitchlings-0.1.0.dist-info/licenses/LICENSE +0 -21
- main.py +0 -6
- pyproject.toml +0 -74
- util/__init__.py +0 -73
- zoo/jargoyle.py +0 -89
- {dlc → glitchlings/dlc}/__init__.py +0 -0
- {glitchlings-0.1.0.dist-info → glitchlings-0.1.2.dist-info}/WHEEL +0 -0
- {glitchlings-0.1.0.dist-info → glitchlings-0.1.2.dist-info}/entry_points.txt +0 -0
@@ -1,124 +1,140 @@
|
|
1
|
-
import re
|
2
|
-
import random
|
3
|
-
from .core import Glitchling, AttackWave, AttackOrder
|
4
|
-
|
5
|
-
|
6
|
-
def ocr_artifacts(
|
7
|
-
text: str,
|
8
|
-
error_rate: float = 0.02,
|
9
|
-
seed: int | None = None,
|
10
|
-
rng: random.Random | None = None,
|
11
|
-
) -> str:
|
12
|
-
"""Introduce OCR-like artifacts into text.
|
13
|
-
|
14
|
-
Parameters
|
15
|
-
- text: Input text to corrupt.
|
16
|
-
- error_rate: Max proportion of eligible confusion matches to replace (default 0.02).
|
17
|
-
- seed: Optional seed if `rng` not provided.
|
18
|
-
- rng: Optional RNG; overrides seed.
|
19
|
-
|
20
|
-
Notes
|
21
|
-
- Uses a curated set of common OCR confusions (rn↔m, cl↔d, O↔0, l/I/1, etc.).
|
22
|
-
- Collects all non-overlapping candidate spans in reading order, then samples
|
23
|
-
a subset deterministically with the provided RNG.
|
24
|
-
- Replacements can change length (e.g., m→rn), so edits are applied from left
|
25
|
-
to right using precomputed spans to avoid index drift.
|
26
|
-
"""
|
27
|
-
if not text:
|
28
|
-
return text
|
29
|
-
|
30
|
-
if rng is None:
|
31
|
-
rng = random.Random(seed)
|
32
|
-
|
33
|
-
# map: source -> list of possible replacements
|
34
|
-
# Keep patterns small and specific; longer patterns first avoid overmatching
|
35
|
-
confusion_table: list[tuple[str, list[str]]] = [
|
36
|
-
("li", ["h"]),
|
37
|
-
("h", ["li"]),
|
38
|
-
("rn", ["m"]),
|
39
|
-
("m", ["rn"]),
|
40
|
-
("cl", ["d"]),
|
41
|
-
("d", ["cl"]),
|
42
|
-
("I", ["l"]),
|
43
|
-
("l", ["I", "1"]),
|
44
|
-
("1", ["l", "I"]),
|
45
|
-
("0", ["O"]),
|
46
|
-
("O", ["0"]),
|
47
|
-
("B", ["8"]),
|
48
|
-
("8", ["B"]),
|
49
|
-
("S", ["5"]),
|
50
|
-
("5", ["S"]),
|
51
|
-
("Z", ["2"]),
|
52
|
-
("2", ["Z"]),
|
53
|
-
("G", ["6"]),
|
54
|
-
("6", ["G"]),
|
55
|
-
("“", ['"']),
|
56
|
-
("”", ['"']),
|
57
|
-
("‘", ["'"]),
|
58
|
-
("’", ["'"]),
|
59
|
-
("—", ["-"]), # em dash -> hyphen
|
60
|
-
("–", ["-"]), # en dash -> hyphen
|
61
|
-
]
|
62
|
-
|
63
|
-
# Build candidate matches as (start, end, choices)
|
64
|
-
candidates: list[tuple[int, int, list[str]]] = []
|
65
|
-
|
66
|
-
# To avoid double-counting overlapping patterns (like 'l' inside 'li'),
|
67
|
-
# we will scan longer patterns first by sorting by len(src) desc.
|
68
|
-
for src, choices in sorted(confusion_table, key=lambda p: -len(p[0])):
|
69
|
-
pattern = re.escape(src)
|
70
|
-
for m in re.finditer(pattern, text):
|
71
|
-
start, end = m.span()
|
72
|
-
candidates.append((start, end, choices))
|
73
|
-
|
74
|
-
if not candidates:
|
75
|
-
return text
|
76
|
-
|
77
|
-
# Decide how many to replace
|
78
|
-
k = int(len(candidates) * error_rate)
|
79
|
-
if k <= 0:
|
80
|
-
return text
|
81
|
-
|
82
|
-
# Shuffle deterministically and select non-overlapping k spans
|
83
|
-
rng.shuffle(candidates)
|
84
|
-
chosen: list[tuple[int, int, str]] = []
|
85
|
-
occupied: list[tuple[int, int]] = []
|
86
|
-
|
87
|
-
def overlaps(a: tuple[int, int], b: tuple[int, int]) -> bool:
|
88
|
-
return not (a[1] <= b[0] or b[1] <= a[0])
|
89
|
-
|
90
|
-
for start, end, choices in candidates:
|
91
|
-
if len(chosen) >= k:
|
92
|
-
break
|
93
|
-
span = (start, end)
|
94
|
-
if any(overlaps(span, occ) for occ in occupied):
|
95
|
-
continue
|
96
|
-
replacement = rng.choice(choices)
|
97
|
-
chosen.append((start, end, replacement))
|
98
|
-
occupied.append(span)
|
99
|
-
|
100
|
-
if not chosen:
|
101
|
-
return text
|
102
|
-
|
103
|
-
# Apply edits from left to right
|
104
|
-
chosen.sort(key=lambda t: t[0])
|
105
|
-
out_parts = []
|
106
|
-
cursor = 0
|
107
|
-
for start, end, rep in chosen:
|
108
|
-
if cursor < start:
|
109
|
-
out_parts.append(text[cursor:start])
|
110
|
-
out_parts.append(rep)
|
111
|
-
cursor = end
|
112
|
-
if cursor < len(text):
|
113
|
-
out_parts.append(text[cursor:])
|
114
|
-
|
115
|
-
return "".join(out_parts)
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
1
|
+
import re
|
2
|
+
import random
|
3
|
+
from .core import Glitchling, AttackWave, AttackOrder
|
4
|
+
|
5
|
+
|
6
|
+
def ocr_artifacts(
|
7
|
+
text: str,
|
8
|
+
error_rate: float = 0.02,
|
9
|
+
seed: int | None = None,
|
10
|
+
rng: random.Random | None = None,
|
11
|
+
) -> str:
|
12
|
+
"""Introduce OCR-like artifacts into text.
|
13
|
+
|
14
|
+
Parameters
|
15
|
+
- text: Input text to corrupt.
|
16
|
+
- error_rate: Max proportion of eligible confusion matches to replace (default 0.02).
|
17
|
+
- seed: Optional seed if `rng` not provided.
|
18
|
+
- rng: Optional RNG; overrides seed.
|
19
|
+
|
20
|
+
Notes
|
21
|
+
- Uses a curated set of common OCR confusions (rn↔m, cl↔d, O↔0, l/I/1, etc.).
|
22
|
+
- Collects all non-overlapping candidate spans in reading order, then samples
|
23
|
+
a subset deterministically with the provided RNG.
|
24
|
+
- Replacements can change length (e.g., m→rn), so edits are applied from left
|
25
|
+
to right using precomputed spans to avoid index drift.
|
26
|
+
"""
|
27
|
+
if not text:
|
28
|
+
return text
|
29
|
+
|
30
|
+
if rng is None:
|
31
|
+
rng = random.Random(seed)
|
32
|
+
|
33
|
+
# map: source -> list of possible replacements
|
34
|
+
# Keep patterns small and specific; longer patterns first avoid overmatching
|
35
|
+
confusion_table: list[tuple[str, list[str]]] = [
|
36
|
+
("li", ["h"]),
|
37
|
+
("h", ["li"]),
|
38
|
+
("rn", ["m"]),
|
39
|
+
("m", ["rn"]),
|
40
|
+
("cl", ["d"]),
|
41
|
+
("d", ["cl"]),
|
42
|
+
("I", ["l"]),
|
43
|
+
("l", ["I", "1"]),
|
44
|
+
("1", ["l", "I"]),
|
45
|
+
("0", ["O"]),
|
46
|
+
("O", ["0"]),
|
47
|
+
("B", ["8"]),
|
48
|
+
("8", ["B"]),
|
49
|
+
("S", ["5"]),
|
50
|
+
("5", ["S"]),
|
51
|
+
("Z", ["2"]),
|
52
|
+
("2", ["Z"]),
|
53
|
+
("G", ["6"]),
|
54
|
+
("6", ["G"]),
|
55
|
+
("“", ['"']),
|
56
|
+
("”", ['"']),
|
57
|
+
("‘", ["'"]),
|
58
|
+
("’", ["'"]),
|
59
|
+
("—", ["-"]), # em dash -> hyphen
|
60
|
+
("–", ["-"]), # en dash -> hyphen
|
61
|
+
]
|
62
|
+
|
63
|
+
# Build candidate matches as (start, end, choices)
|
64
|
+
candidates: list[tuple[int, int, list[str]]] = []
|
65
|
+
|
66
|
+
# To avoid double-counting overlapping patterns (like 'l' inside 'li'),
|
67
|
+
# we will scan longer patterns first by sorting by len(src) desc.
|
68
|
+
for src, choices in sorted(confusion_table, key=lambda p: -len(p[0])):
|
69
|
+
pattern = re.escape(src)
|
70
|
+
for m in re.finditer(pattern, text):
|
71
|
+
start, end = m.span()
|
72
|
+
candidates.append((start, end, choices))
|
73
|
+
|
74
|
+
if not candidates:
|
75
|
+
return text
|
76
|
+
|
77
|
+
# Decide how many to replace
|
78
|
+
k = int(len(candidates) * error_rate)
|
79
|
+
if k <= 0:
|
80
|
+
return text
|
81
|
+
|
82
|
+
# Shuffle deterministically and select non-overlapping k spans
|
83
|
+
rng.shuffle(candidates)
|
84
|
+
chosen: list[tuple[int, int, str]] = []
|
85
|
+
occupied: list[tuple[int, int]] = []
|
86
|
+
|
87
|
+
def overlaps(a: tuple[int, int], b: tuple[int, int]) -> bool:
|
88
|
+
return not (a[1] <= b[0] or b[1] <= a[0])
|
89
|
+
|
90
|
+
for start, end, choices in candidates:
|
91
|
+
if len(chosen) >= k:
|
92
|
+
break
|
93
|
+
span = (start, end)
|
94
|
+
if any(overlaps(span, occ) for occ in occupied):
|
95
|
+
continue
|
96
|
+
replacement = rng.choice(choices)
|
97
|
+
chosen.append((start, end, replacement))
|
98
|
+
occupied.append(span)
|
99
|
+
|
100
|
+
if not chosen:
|
101
|
+
return text
|
102
|
+
|
103
|
+
# Apply edits from left to right
|
104
|
+
chosen.sort(key=lambda t: t[0])
|
105
|
+
out_parts = []
|
106
|
+
cursor = 0
|
107
|
+
for start, end, rep in chosen:
|
108
|
+
if cursor < start:
|
109
|
+
out_parts.append(text[cursor:start])
|
110
|
+
out_parts.append(rep)
|
111
|
+
cursor = end
|
112
|
+
if cursor < len(text):
|
113
|
+
out_parts.append(text[cursor:])
|
114
|
+
|
115
|
+
return "".join(out_parts)
|
116
|
+
|
117
|
+
|
118
|
+
class Scannequin(Glitchling):
    """OCR-noise glitchling built on top of ``ocr_artifacts``.

    Configures the base ``Glitchling`` with the name "Scannequin",
    character-level scope and late ordering.
    """

    def __init__(
        self,
        *,
        error_rate: float = 0.02,
        seed: int | None = None,
    ) -> None:
        """Set up the glitchling.

        Parameters
        - error_rate: Forwarded to the base initializer as an extra keyword;
          presumably relayed to ``ocr_artifacts`` on each call (confirm in
          ``Glitchling``).
        - seed: Optional seed forwarded to the base initializer.
        """
        base_settings = dict(
            name="Scannequin",
            corruption_function=ocr_artifacts,
            scope=AttackWave.CHARACTER,
            order=AttackOrder.LATE,
            seed=seed,
            error_rate=error_rate,
        )
        super().__init__(**base_settings)
|
135
|
+
|
136
|
+
|
137
|
+
# Shared, ready-to-use instance created with the constructor defaults
# (error_rate=0.02, seed=None).
scannequin = Scannequin()


# Public names of this module: the class and its default instance.
__all__ = ["Scannequin", "scannequin"]
|