glitchlings 0.4.4__cp313-cp313-win_amd64.whl → 0.5.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of glitchlings might be problematic.
- glitchlings/__init__.py +4 -0
- glitchlings/_zoo_rust.cp313-win_amd64.pyd +0 -0
- glitchlings/compat.py +2 -4
- glitchlings/config.py +14 -28
- glitchlings/dev/__init__.py +5 -0
- glitchlings/dev/sync_assets.py +153 -0
- glitchlings/dlc/_shared.py +6 -6
- glitchlings/dlc/huggingface.py +6 -6
- glitchlings/dlc/prime.py +1 -1
- glitchlings/dlc/pytorch.py +3 -3
- glitchlings/dlc/pytorch_lightning.py +4 -10
- glitchlings/lexicon/_cache.py +3 -5
- glitchlings/lexicon/vector.py +6 -5
- glitchlings/lexicon/wordnet.py +4 -8
- glitchlings/util/hokey_generator.py +144 -0
- glitchlings/util/stretch_locator.py +140 -0
- glitchlings/util/stretchability.py +370 -0
- glitchlings/zoo/__init__.py +5 -1
- glitchlings/zoo/_ocr_confusions.py +3 -3
- glitchlings/zoo/_text_utils.py +10 -9
- glitchlings/zoo/adjax.py +3 -18
- glitchlings/zoo/apostrofae.py +2 -5
- glitchlings/zoo/assets/__init__.py +54 -0
- glitchlings/zoo/assets/hokey_assets.json +193 -0
- glitchlings/zoo/hokey.py +173 -0
- glitchlings/zoo/jargoyle.py +2 -16
- glitchlings/zoo/mim1c.py +2 -17
- glitchlings/zoo/redactyl.py +3 -17
- glitchlings/zoo/reduple.py +3 -17
- glitchlings/zoo/rushmore.py +3 -20
- glitchlings/zoo/scannequin.py +3 -20
- glitchlings/zoo/typogre.py +2 -19
- glitchlings/zoo/zeedub.py +2 -13
- {glitchlings-0.4.4.dist-info → glitchlings-0.5.0.dist-info}/METADATA +29 -6
- glitchlings-0.5.0.dist-info/RECORD +53 -0
- glitchlings/zoo/_rate.py +0 -131
- glitchlings-0.4.4.dist-info/RECORD +0 -47
- /glitchlings/zoo/{ocr_confusions.tsv → assets/ocr_confusions.tsv} +0 -0
- {glitchlings-0.4.4.dist-info → glitchlings-0.5.0.dist-info}/WHEEL +0 -0
- {glitchlings-0.4.4.dist-info → glitchlings-0.5.0.dist-info}/entry_points.txt +0 -0
- {glitchlings-0.4.4.dist-info → glitchlings-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.4.4.dist-info → glitchlings-0.5.0.dist-info}/top_level.txt +0 -0

glitchlings/util/stretch_locator.py
NEW
@@ -0,0 +1,140 @@
+"""Identify where expressive stretches should occur within a token."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Iterable
+
+VOWELS = set("aeiouyAEIOUY")
+SONORANTS = set("rlmnwyhRLMNWYH")
+SIBILANTS = set("sSzZxXcCjJ") | {"sh", "Sh", "sH", "SH", "zh", "Zh"}
+DIGRAPHS = {
+    "aa",
+    "ae",
+    "ai",
+    "ay",
+    "ee",
+    "ei",
+    "ey",
+    "ie",
+    "io",
+    "oa",
+    "oe",
+    "oi",
+    "oo",
+    "ou",
+    "ua",
+    "ue",
+    "ui",
+    "ya",
+    "yo",
+    "yu",
+}
+
+
+@dataclass(slots=True)
+class StretchSite:
+    """Location of a stretchable grapheme."""
+
+    start: int
+    end: int
+    category: str
+
+    def unit(self, token: str) -> str:
+        return token[self.start : self.end]
+
+
+def _alpha_indices(token: str) -> list[int]:
+    return [idx for idx, char in enumerate(token) if char.isalpha()]
+
+
+def _vowel_clusters(token: str, indices: Iterable[int]) -> list[tuple[int, int]]:
+    clusters: list[tuple[int, int]] = []
+    start: int | None = None
+    prev_idx: int | None = None
+    for idx in indices:
+        char = token[idx]
+        if char in VOWELS:
+            if start is None:
+                start = idx
+            elif prev_idx is not None and idx != prev_idx + 1:
+                clusters.append((start, prev_idx + 1))
+                start = idx
+        else:
+            if start is not None:
+                clusters.append((start, idx))
+                start = None
+        prev_idx = idx
+    if start is not None and prev_idx is not None:
+        clusters.append((start, prev_idx + 1))
+    return clusters
+
+
+def find_stretch_site(token: str) -> StretchSite | None:
+    """Return the most suitable stretch site for ``token``."""
+
+    alpha_indices = _alpha_indices(token)
+    if not alpha_indices:
+        return None
+
+    lower = token.lower()
+    clusters = _vowel_clusters(lower, alpha_indices)
+    candidates: list[tuple[int, StretchSite]] = []
+
+    # Sibilant/sonorant coda extension (yes -> yesss, hmm -> hmmmm)
+    last_idx = alpha_indices[-1]
+    last_char = lower[last_idx]
+    if len(alpha_indices) >= 2:
+        prev_char = lower[alpha_indices[-2]]
+    else:
+        prev_char = ""
+    has_multi_vowel = any(
+        (end - start >= 2) and not (lower[start] == "y" and start == 0) for start, end in clusters
+    )
+    if last_char in {"s", "z"} and prev_char in VOWELS and not has_multi_vowel:
+        candidates.append((5, StretchSite(last_idx, last_idx + 1, "coda")))
+    elif last_char in SONORANTS and prev_char in VOWELS and not has_multi_vowel:
+        candidates.append((4, StretchSite(last_idx, last_idx + 1, "coda")))
+    elif not clusters:
+        candidates.append((2, StretchSite(last_idx, last_idx + 1, "consonant")))
+
+    # CVCe pattern (cute -> cuuute)
+    if lower.endswith("e") and len(alpha_indices) >= 3:
+        final_letter = alpha_indices[-1]
+        if token[final_letter].lower() == "e":
+            c_idx = alpha_indices[-2]
+            v_idx = alpha_indices[-3]
+            if token[c_idx].lower() not in VOWELS and token[v_idx].lower() in VOWELS:
+                candidates.append((4, StretchSite(v_idx, v_idx + 1, "cvce")))
+
+    for cluster in clusters:
+        start, end = cluster
+        substring = lower[start:end]
+        category = "vowel"
+        if any(substring[i : i + 2] in DIGRAPHS for i in range(max(0, len(substring) - 1))):
+            category = "digraph"
+        priority = 3 if cluster == clusters[-1] else 2
+        candidates.append((priority, StretchSite(start, end, category)))
+
+    if not candidates:
+        return None
+
+    candidates.sort(key=lambda item: (item[0], item[1].end - item[1].start, -item[1].start))
+    return candidates[-1][1]
+
+
+def apply_stretch(token: str, site: StretchSite, repeats: int) -> str:
+    """Return ``token`` with ``repeats`` extra copies of the grapheme at ``site``."""
+
+    if repeats <= 0:
+        return token
+    chars = list(token)
+    stretched: list[str] = []
+    for idx, char in enumerate(chars):
+        stretched.append(char)
+        if site.start <= idx < site.end:
+            stretched.append(char * repeats)
+    return "".join(stretched)
+
+
+__all__ = ["StretchSite", "find_stretch_site", "apply_stretch"]
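
For orientation, a minimal usage sketch of the new locator API (assuming glitchlings 0.5.0 is installed; the sample token and repeat count are illustrative, not taken from the package docs):

```python
from glitchlings.util.stretch_locator import apply_stretch, find_stretch_site

site = find_stretch_site("cool")
if site is not None:
    # "oo" is picked up as a digraph cluster; unit() returns the matched span.
    print(site.category, site.unit("cool"))
    # Each character inside the site gains `repeats` extra copies,
    # so "cool" becomes "c" + "o" * 6 + "l" here.
    print(apply_stretch("cool", site, repeats=2))
```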

glitchlings/util/stretchability.py
NEW
@@ -0,0 +1,370 @@
+"""Stretchability scoring and candidate selection for Hokey."""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from typing import Protocol, Sequence, TypedDict, cast
+
+from glitchlings.zoo import assets
+
+# Regexes reused across the module
+TOKEN_REGEX = re.compile(r"\w+|\W+")
+ALPHA_REGEX = re.compile(r"[A-Za-z]")
+EMOJI_REGEX = re.compile(r"[\U0001F300-\U0001FAFF]")
+CLAUSE_PUNCTUATION = {".", "?", "!", ";"}
+
+
+class HokeyAssets(TypedDict):
+    lexical_prior: dict[str, float]
+    interjections: list[str]
+    intensifiers: list[str]
+    evaluatives: list[str]
+    positive_lexicon: list[str]
+    negative_lexicon: list[str]
+
+
+class RandomLike(Protocol):
+    """Interface for RNGs that expose ``random()``."""
+
+    def random(self) -> float: ...
+
+
+# Lexical prior probabilities and pragmatic lexica shared with the Rust fast path.
+def _load_assets() -> HokeyAssets:
+    data = assets.load_json("hokey_assets.json")
+    return cast(HokeyAssets, data)
+
+
+_ASSETS = _load_assets()
+LEXICAL_PRIOR: dict[str, float] = {
+    token: float(score) for token, score in _ASSETS["lexical_prior"].items()
+}
+
+# Pragmatic lexica for POS/discourse cues
+INTERJECTIONS = frozenset(_ASSETS["interjections"])
+INTENSIFIERS = frozenset(_ASSETS["intensifiers"])
+EVALUATIVES = frozenset(_ASSETS["evaluatives"])
+POSITIVE_LEXICON = frozenset(_ASSETS["positive_lexicon"])
+NEGATIVE_LEXICON = frozenset(_ASSETS["negative_lexicon"])
+
+VOWELS = set("aeiouy")
+SONORANT_CODAS = set("rlmnwyh")
+SIBILANT_CODAS = {"s", "z", "x", "c", "j", "sh", "zh"}
+DIGRAPHS = {
+    "aa",
+    "ae",
+    "ai",
+    "ay",
+    "ee",
+    "ei",
+    "ey",
+    "ie",
+    "oa",
+    "oe",
+    "oi",
+    "oo",
+    "ou",
+    "ue",
+    "ui",
+}
+
+MAX_CANDIDATES_PER_CLAUSE = 4
+MIN_SCORE_THRESHOLD = 0.18
+
+
+@dataclass(slots=True)
+class TokenInfo:
+    text: str
+    start: int
+    end: int
+    is_word: bool
+    clause_index: int
+    preceding_punct: str
+    following_punct: str
+    index: int
+
+    @property
+    def normalised(self) -> str:
+        return self.text.lower()
+
+
+@dataclass(slots=True)
+class StretchabilityFeatures:
+    lexical: float
+    pos: float
+    sentiment: float
+    phonotactic: float
+    context: float
+    sentiment_swing: float
+
+    def intensity(self) -> float:
+        """Map features to an intensity scalar in [0, 1.5]."""
+        emphasis = 0.6 * self.context + 0.4 * self.sentiment_swing
+        return max(0.0, min(1.5, 0.5 * (self.lexical + self.phonotactic) + emphasis))
+
+
+@dataclass(slots=True)
+class StretchCandidate:
+    token: TokenInfo
+    score: float
+    features: StretchabilityFeatures
+
+
+class StretchabilityAnalyzer:
+    """Compute stretchability scores and select candidates."""
+
+    def __init__(
+        self,
+        *,
+        lexical_prior: dict[str, float] | None = None,
+        weights: tuple[float, float, float, float, float] = (0.32, 0.18, 0.14, 0.22, 0.14),
+    ) -> None:
+        self.lexical_prior = lexical_prior or LEXICAL_PRIOR
+        self.weights = weights
+
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+    def tokenise(self, text: str) -> list[TokenInfo]:
+        """Tokenise text preserving separator tokens."""
+        return self._tokenise(text)
+
+    def analyse(self, text: str) -> list[StretchCandidate]:
+        if not text:
+            return []
+        tokens = self._tokenise(text)
+        return self.analyse_tokens(tokens)
+
+    def analyse_tokens(self, tokens: Sequence[TokenInfo]) -> list[StretchCandidate]:
+        candidates: list[StretchCandidate] = []
+        for idx, token in enumerate(tokens):
+            if not token.is_word:
+                continue
+            if self._excluded(token, tokens, idx):
+                continue
+
+            features = self._compute_features(token, tokens, idx)
+            score = self._composite_score(features)
+            if score < MIN_SCORE_THRESHOLD:
+                continue
+            candidates.append(StretchCandidate(token=token, score=score, features=features))
+        return candidates
+
+    def select_candidates(
+        self,
+        candidates: Sequence[StretchCandidate],
+        *,
+        rate: float,
+        rng: RandomLike,
+    ) -> list[StretchCandidate]:
+        if not candidates or rate <= 0:
+            return []
+
+        grouped: dict[int, list[StretchCandidate]] = {}
+        for candidate in candidates:
+            grouped.setdefault(candidate.token.clause_index, []).append(candidate)
+
+        selected: list[StretchCandidate] = []
+        total_expected = max(0, min(len(candidates), int(round(len(candidates) * rate))))
+
+        for clause_index in sorted(grouped):
+            clause_candidates = sorted(
+                grouped[clause_index], key=lambda c: (-c.score, c.token.start)
+            )
+            clause_candidates = clause_candidates[:MAX_CANDIDATES_PER_CLAUSE]
+            clause_quota = max(
+                0, min(len(clause_candidates), int(round(len(clause_candidates) * rate)))
+            )
+
+            provisional: list[StretchCandidate] = []
+            for candidate in clause_candidates:
+                probability = min(1.0, rate * (0.35 + 0.65 * candidate.score))
+                if rng.random() < probability:
+                    provisional.append(candidate)
+                if len(provisional) >= clause_quota:
+                    break
+
+            if len(provisional) < clause_quota:
+                leftovers = [c for c in clause_candidates if c not in provisional]
+                needed = clause_quota - len(provisional)
+                provisional.extend(leftovers[:needed])
+
+            selected.extend(provisional)
+
+        if len(selected) < total_expected:
+            remaining = [c for c in candidates if c not in selected]
+            remaining.sort(key=lambda c: (-c.score, c.token.start))
+            selected.extend(remaining[: total_expected - len(selected)])
+
+        # Keep deterministic order by position
+        selected.sort(key=lambda c: c.token.start)
+        return selected
+
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+    def _tokenise(self, text: str) -> list[TokenInfo]:
+        tokens: list[TokenInfo] = []
+        clause_index = 0
+        matches = list(TOKEN_REGEX.finditer(text))
+        for idx, match in enumerate(matches):
+            token_text = match.group(0)
+            is_word = bool(ALPHA_REGEX.search(token_text)) and token_text.strip().isalnum()
+            preceding = matches[idx - 1].group(0) if idx > 0 else ""
+            following = matches[idx + 1].group(0) if idx + 1 < len(matches) else ""
+            tokens.append(
+                TokenInfo(
+                    text=token_text,
+                    start=match.start(),
+                    end=match.end(),
+                    is_word=is_word,
+                    clause_index=clause_index,
+                    preceding_punct=preceding,
+                    following_punct=following,
+                    index=idx,
+                )
+            )
+            if any(ch in CLAUSE_PUNCTUATION for ch in token_text):
+                clause_index += 1
+        return tokens
+
+    def _excluded(self, token: TokenInfo, tokens: Sequence[TokenInfo], index: int) -> bool:
+        text = token.text
+        normalised = token.normalised
+        if sum(ch.isalpha() for ch in text) < 2:
+            return True
+        if any(ch.isdigit() for ch in text):
+            return True
+        lowered = normalised
+        if "http" in lowered or "www" in lowered or "//" in lowered:
+            return True
+        if any(symbol in text for symbol in {"#", "@", "&", "{", "}", "<", ">"}):
+            return True
+        if "_" in text:
+            return True
+        if "/" in text or "\\" in text:
+            return True
+
+        # Heuristic proper noun check: Title case mid-clause counts as proper noun
+        if text[:1].isupper() and text[1:].islower():
+            previous_clause_start = index == 0
+            if not previous_clause_start:
+                for prior in reversed(tokens[:index]):
+                    stripped = prior.text.strip()
+                    if not stripped:
+                        continue
+                    if stripped[-1] in CLAUSE_PUNCTUATION:
+                        previous_clause_start = True
+                    break
+            if not previous_clause_start:
+                return True
+        return False
+
+    def _compute_features(
+        self, token: TokenInfo, tokens: Sequence[TokenInfo], index: int
+    ) -> StretchabilityFeatures:
+        lexical = self.lexical_prior.get(token.normalised, 0.12)
+        pos_score = self._pos_score(token)
+        sentiment_score, sentiment_swing = self._sentiment(tokens, index)
+        phon_score = self._phonotactic(token.normalised)
+        context_score = self._contextual(token, tokens, index)
+        return StretchabilityFeatures(
+            lexical=lexical,
+            pos=pos_score,
+            sentiment=sentiment_score,
+            phonotactic=phon_score,
+            context=context_score,
+            sentiment_swing=sentiment_swing,
+        )
+
+    def _composite_score(self, features: StretchabilityFeatures) -> float:
+        lex_w, pos_w, sent_w, phon_w, ctx_w = self.weights
+        weighted = (
+            lex_w * features.lexical
+            + pos_w * features.pos
+            + sent_w * features.sentiment
+            + phon_w * features.phonotactic
+            + ctx_w * features.context
+        )
+        total_weight = sum(self.weights)
+        score = weighted / total_weight if total_weight else 0.0
+        return max(0.0, min(1.0, score))
+
+    # ------------------------------------------------------------------
+    # Feature helpers
+    # ------------------------------------------------------------------
+    def _pos_score(self, token: TokenInfo) -> float:
+        normalised = token.normalised
+        if normalised in INTERJECTIONS:
+            return 0.95
+        if normalised in INTENSIFIERS:
+            return 0.85
+        if normalised in EVALUATIVES:
+            return 0.7
+        if normalised.endswith("ly"):
+            return 0.55
+        if token.text.isupper() and len(token.text) > 1:
+            return 0.65
+        return 0.3
+
+    def _sentiment(self, tokens: Sequence[TokenInfo], index: int) -> tuple[float, float]:
+        window = [tok for tok in tokens[max(0, index - 2) : index + 3] if tok.is_word]
+        if not window:
+            return 0.5, 0.0
+        pos_hits = sum(1 for tok in window if tok.normalised in POSITIVE_LEXICON)
+        neg_hits = sum(1 for tok in window if tok.normalised in NEGATIVE_LEXICON)
+        total = len(window)
+        balance = (pos_hits - neg_hits) / total
+        sentiment_score = 0.5 + 0.5 * max(-1.0, min(1.0, balance))
+        swing = abs(balance)
+        return sentiment_score, swing
+
+    def _phonotactic(self, normalised: str) -> float:
+        if not any(ch in VOWELS for ch in normalised):
+            return 0.0
+        score = 0.25
+        if any(normalised.endswith(c) for c in SONORANT_CODAS):
+            score += 0.2
+        if any(normalised.endswith(c) for c in SIBILANT_CODAS):
+            score += 0.18
+        if any(digraph in normalised for digraph in DIGRAPHS):
+            score += 0.22
+        if re.search(r"[aeiouy]{2,}", normalised):
+            score += 0.22
+        if re.search(r"(.)(?!\1)(.)\1", normalised):
+            score += 0.08
+        return max(0.0, min(1.0, score))
+
+    def _contextual(self, token: TokenInfo, tokens: Sequence[TokenInfo], index: int) -> float:
+        score = 0.2
+        before = token.preceding_punct
+        after = token.following_punct
+        token_text = token.text
+        if after and after.count("!") >= 1:
+            score += 0.25
+        if after and after.count("?") >= 1:
+            score += 0.2
+        if before and before.count("!") >= 2:
+            score += 0.2
+        if after and ("!!" in after or "??" in after):
+            score += 0.15
+        if token_text.isupper() and len(token_text) > 1:
+            score += 0.25
+        if EMOJI_REGEX.search(before or "") or EMOJI_REGEX.search(after or ""):
+            score += 0.15
+        # Clause-final emphasis
+        if index + 1 < len(tokens):
+            trailing = tokens[index + 1].text
+            if any(p in trailing for p in {"!!!", "??", "?!"}):
+                score += 0.2
+        return max(0.0, min(1.0, score))
+
+
+__all__ = [
+    "StretchabilityAnalyzer",
+    "StretchCandidate",
+    "StretchabilityFeatures",
+    "TokenInfo",
+    "RandomLike",
+]
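
A short sketch of how the analyzer and selector above fit together; `random.Random` satisfies the `RandomLike` protocol because it exposes `random()`. The sample sentence and rate are illustrative only:

```python
import random

from glitchlings.util.stretchability import StretchabilityAnalyzer

analyzer = StretchabilityAnalyzer()
candidates = analyzer.analyse("wow that was so good!!")
chosen = analyzer.select_candidates(candidates, rate=0.5, rng=random.Random(0))
for candidate in chosen:
    # Each candidate keeps the token span plus the feature breakdown behind its score.
    print(candidate.token.text, round(candidate.score, 3), candidate.features.intensity())
```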

glitchlings/zoo/__init__.py
CHANGED
@@ -14,6 +14,7 @@ from .core import (
     plan_glitchling_specs,
     plan_glitchlings,
 )
+from .hokey import Hokey, hokey
 from .jargoyle import Jargoyle, jargoyle
 from .jargoyle import dependencies_available as _jargoyle_available
 from .mim1c import Mim1c, mim1c
@@ -33,6 +34,8 @@ __all__ = [
     "jargoyle",
     "Apostrofae",
     "apostrofae",
+    "Hokey",
+    "hokey",
     "Adjax",
     "adjax",
     "Reduple",
@@ -61,7 +64,7 @@ __all__ = [
 
 _HAS_JARGOYLE = _jargoyle_available()
 
-_BUILTIN_GLITCHLING_LIST: list[Glitchling] = [typogre, apostrofae, mim1c]
+_BUILTIN_GLITCHLING_LIST: list[Glitchling] = [typogre, apostrofae, hokey, mim1c]
 if _HAS_JARGOYLE:
     _BUILTIN_GLITCHLING_LIST.append(jargoyle)
 _BUILTIN_GLITCHLING_LIST.extend([adjax, reduple, rushmore, redactyl, scannequin, zeedub])
@@ -73,6 +76,7 @@ BUILTIN_GLITCHLINGS: dict[str, Glitchling] = {
 _BUILTIN_GLITCHLING_TYPES: dict[str, type[Glitchling]] = {
     typogre.name.lower(): Typogre,
     apostrofae.name.lower(): Apostrofae,
+    hokey.name.lower(): Hokey,
     mim1c.name.lower(): Mim1c,
     adjax.name.lower(): Adjax,
     reduple.name.lower(): Reduple,
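
With these registrations the new glitchling is importable alongside the existing ones; a quick sketch (assuming Hokey follows the same pattern as the other builtins, with `hokey` as the default-configured instance):

```python
from glitchlings.zoo import Hokey, hokey

# `hokey` is the module-level instance added to the builtin roster above;
# its class is registered under hokey.name.lower() in the type map.
print(hokey.name, isinstance(hokey, Hokey))
```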
glitchlings/zoo/_ocr_confusions.py
CHANGED
@@ -1,18 +1,18 @@
 from __future__ import annotations
 
-from
+from .assets import read_text
 
 _CONFUSION_TABLE: list[tuple[str, list[str]]] | None = None
 
 
 def load_confusion_table() -> list[tuple[str, list[str]]]:
     """Load the OCR confusion table shared by Python and Rust implementations."""
+
     global _CONFUSION_TABLE
     if _CONFUSION_TABLE is not None:
         return _CONFUSION_TABLE
 
-
-    text = data.read_text(encoding="utf-8")
+    text = read_text("ocr_confusions.tsv")
     indexed_entries: list[tuple[int, tuple[str, list[str]]]] = []
     for line_number, line in enumerate(text.splitlines()):
         stripped = line.strip()
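
The `read_text` helper above comes from the new `glitchlings/zoo/assets` package, whose 54-line body is not shown in this diff. A plausible minimal shape, assuming it simply wraps `importlib.resources` for files bundled with the package (everything here beyond the `read_text`/`load_json` names visible at the call sites is an assumption):

```python
# Hypothetical sketch of glitchlings/zoo/assets/__init__.py; the shipped module may differ.
from __future__ import annotations

import json
from importlib import resources
from typing import Any


def read_text(name: str) -> str:
    # Load a bundled asset such as "ocr_confusions.tsv" as UTF-8 text.
    return resources.files(__package__).joinpath(name).read_text(encoding="utf-8")


def load_json(name: str) -> Any:
    # Parse a bundled JSON asset such as "hokey_assets.json".
    return json.loads(read_text(name))
```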
glitchlings/zoo/_text_utils.py
CHANGED
@@ -21,9 +21,9 @@ def split_token_edges(token: str) -> tuple[str, str, str]:
     return match.group(1), match.group(2), match.group(3)
 
 
-def
-    """Return
-
+def _resolve_core_length(core: str, token: str) -> int:
+    """Return a stable core-length measurement used by weighting heuristics."""
+
     candidate = core if core else token
     length = len(candidate)
     if length <= 0:
@@ -34,6 +34,12 @@ def token_core_length(token: str) -> int:
     return length
 
 
+def token_core_length(token: str) -> int:
+    """Return the length of the main word characters for weighting heuristics."""
+    _, core, _ = split_token_edges(token)
+    return _resolve_core_length(core, token)
+
+
 @dataclass(frozen=True)
 class WordToken:
     """Metadata describing a non-whitespace token yielded by word splitters."""
@@ -71,12 +77,7 @@ def collect_word_tokens(
             continue
 
         prefix, core, suffix = split_token_edges(token)
-        core_length =
-        if core_length <= 0:
-            stripped = token.strip()
-            core_length = len(stripped) if stripped else len(token)
-            if core_length <= 0:
-                core_length = 1
+        core_length = _resolve_core_length(core, token)
 
         collected.append(
             WordToken(
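
The refactor shares one fallback path between `token_core_length` and `collect_word_tokens`. A tiny illustrative call (the exact core/edge split depends on `split_token_edges`, which is unchanged and not shown in full here):

```python
from glitchlings.zoo._text_utils import token_core_length

# Assuming the trailing "!" falls outside the core, this reports the length of "hello".
print(token_core_length("hello!"))
```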
glitchlings/zoo/adjax.py
CHANGED
@@ -3,7 +3,6 @@ from __future__ import annotations
 import random
 from typing import Any, cast
 
-from ._rate import resolve_rate
 from ._rust_extensions import get_rust_operation
 from ._text_utils import split_preserving_whitespace, split_token_edges
 from .core import AttackWave, Glitchling
@@ -66,16 +65,9 @@ def swap_adjacent_words(
     rate: float | None = None,
     seed: int | None = None,
     rng: random.Random | None = None,
-    *,
-    swap_rate: float | None = None,
 ) -> str:
     """Swap adjacent word cores while preserving spacing and punctuation."""
-    effective_rate =
-        rate=rate,
-        legacy_value=swap_rate,
-        default=0.5,
-        legacy_name="swap_rate",
-    )
+    effective_rate = 0.5 if rate is None else rate
     clamped_rate = max(0.0, min(effective_rate, 1.0))
 
     if rng is None:
@@ -94,16 +86,9 @@ class Adjax(Glitchling):
         self,
         *,
         rate: float | None = None,
-        swap_rate: float | None = None,
         seed: int | None = None,
     ) -> None:
-
-        effective_rate = resolve_rate(
-            rate=rate,
-            legacy_value=swap_rate,
-            default=0.5,
-            legacy_name="swap_rate",
-        )
+        effective_rate = 0.5 if rate is None else rate
         super().__init__(
             name="Adjax",
             corruption_function=swap_adjacent_words,
@@ -118,7 +103,7 @@ class Adjax(Glitchling):
             return None
         return {
             "type": "swap_adjacent",
-            "
+            "rate": float(rate),
         }
 
 
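
After this change the legacy `swap_rate` keyword is removed outright instead of being resolved through `_rate.resolve_rate`, so callers pass `rate` directly (values below are illustrative):

```python
from glitchlings.zoo.adjax import Adjax

# Only `rate` and `seed` remain; passing the removed `swap_rate` keyword now raises TypeError.
adjax = Adjax(rate=0.25, seed=1234)
```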
glitchlings/zoo/apostrofae.py
CHANGED
@@ -2,13 +2,12 @@
 
 from __future__ import annotations
 
-import json
 import random
 from functools import cache
-from importlib import resources
 from typing import Any, Sequence, cast
 
 from ._rust_extensions import get_rust_operation
+from .assets import load_json
 from .core import AttackOrder, AttackWave, Gaggle, Glitchling
 
 # Load Rust-accelerated operation if available
@@ -19,9 +18,7 @@ _apostrofae_rust = get_rust_operation("apostrofae")
 def _load_replacement_pairs() -> dict[str, list[tuple[str, str]]]:
     """Load the curated mapping of straight quotes to fancy pairs."""
 
-
-    with resource.open("r", encoding="utf-8") as handle:
-        data: dict[str, list[Sequence[str]]] = json.load(handle)
+    data: dict[str, list[Sequence[str]]] = load_json("apostrofae_pairs.json")
 
     parsed: dict[str, list[tuple[str, str]]] = {}
     for straight, replacements in data.items():