glitchlings 0.4.4-cp310-cp310-win_amd64.whl → 0.5.0-cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (42)
  1. glitchlings/__init__.py +4 -0
  2. glitchlings/_zoo_rust.cp310-win_amd64.pyd +0 -0
  3. glitchlings/compat.py +2 -4
  4. glitchlings/config.py +14 -28
  5. glitchlings/dev/__init__.py +5 -0
  6. glitchlings/dev/sync_assets.py +153 -0
  7. glitchlings/dlc/_shared.py +6 -6
  8. glitchlings/dlc/huggingface.py +6 -6
  9. glitchlings/dlc/prime.py +1 -1
  10. glitchlings/dlc/pytorch.py +3 -3
  11. glitchlings/dlc/pytorch_lightning.py +4 -10
  12. glitchlings/lexicon/_cache.py +3 -5
  13. glitchlings/lexicon/vector.py +6 -5
  14. glitchlings/lexicon/wordnet.py +4 -8
  15. glitchlings/util/hokey_generator.py +144 -0
  16. glitchlings/util/stretch_locator.py +140 -0
  17. glitchlings/util/stretchability.py +370 -0
  18. glitchlings/zoo/__init__.py +5 -1
  19. glitchlings/zoo/_ocr_confusions.py +3 -3
  20. glitchlings/zoo/_text_utils.py +10 -9
  21. glitchlings/zoo/adjax.py +3 -18
  22. glitchlings/zoo/apostrofae.py +2 -5
  23. glitchlings/zoo/assets/__init__.py +54 -0
  24. glitchlings/zoo/assets/hokey_assets.json +193 -0
  25. glitchlings/zoo/hokey.py +173 -0
  26. glitchlings/zoo/jargoyle.py +2 -16
  27. glitchlings/zoo/mim1c.py +2 -17
  28. glitchlings/zoo/redactyl.py +3 -17
  29. glitchlings/zoo/reduple.py +3 -17
  30. glitchlings/zoo/rushmore.py +3 -20
  31. glitchlings/zoo/scannequin.py +3 -20
  32. glitchlings/zoo/typogre.py +2 -19
  33. glitchlings/zoo/zeedub.py +2 -13
  34. {glitchlings-0.4.4.dist-info → glitchlings-0.5.0.dist-info}/METADATA +29 -6
  35. glitchlings-0.5.0.dist-info/RECORD +53 -0
  36. glitchlings/zoo/_rate.py +0 -131
  37. glitchlings-0.4.4.dist-info/RECORD +0 -47
  38. /glitchlings/zoo/{ocr_confusions.tsv → assets/ocr_confusions.tsv} +0 -0
  39. {glitchlings-0.4.4.dist-info → glitchlings-0.5.0.dist-info}/WHEEL +0 -0
  40. {glitchlings-0.4.4.dist-info → glitchlings-0.5.0.dist-info}/entry_points.txt +0 -0
  41. {glitchlings-0.4.4.dist-info → glitchlings-0.5.0.dist-info}/licenses/LICENSE +0 -0
  42. {glitchlings-0.4.4.dist-info → glitchlings-0.5.0.dist-info}/top_level.txt +0 -0
glitchlings/util/stretch_locator.py ADDED
@@ -0,0 +1,140 @@
+ """Identify where expressive stretches should occur within a token."""
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Iterable
+
+ VOWELS = set("aeiouyAEIOUY")
+ SONORANTS = set("rlmnwyhRLMNWYH")
+ SIBILANTS = set("sSzZxXcCjJ") | {"sh", "Sh", "sH", "SH", "zh", "Zh"}
+ DIGRAPHS = {
+     "aa",
+     "ae",
+     "ai",
+     "ay",
+     "ee",
+     "ei",
+     "ey",
+     "ie",
+     "io",
+     "oa",
+     "oe",
+     "oi",
+     "oo",
+     "ou",
+     "ua",
+     "ue",
+     "ui",
+     "ya",
+     "yo",
+     "yu",
+ }
+
+
+ @dataclass(slots=True)
+ class StretchSite:
+     """Location of a stretchable grapheme."""
+
+     start: int
+     end: int
+     category: str
+
+     def unit(self, token: str) -> str:
+         return token[self.start : self.end]
+
+
+ def _alpha_indices(token: str) -> list[int]:
+     return [idx for idx, char in enumerate(token) if char.isalpha()]
+
+
+ def _vowel_clusters(token: str, indices: Iterable[int]) -> list[tuple[int, int]]:
+     clusters: list[tuple[int, int]] = []
+     start: int | None = None
+     prev_idx: int | None = None
+     for idx in indices:
+         char = token[idx]
+         if char in VOWELS:
+             if start is None:
+                 start = idx
+             elif prev_idx is not None and idx != prev_idx + 1:
+                 clusters.append((start, prev_idx + 1))
+                 start = idx
+         else:
+             if start is not None:
+                 clusters.append((start, idx))
+                 start = None
+         prev_idx = idx
+     if start is not None and prev_idx is not None:
+         clusters.append((start, prev_idx + 1))
+     return clusters
+
+
+ def find_stretch_site(token: str) -> StretchSite | None:
+     """Return the most suitable stretch site for ``token``."""
+
+     alpha_indices = _alpha_indices(token)
+     if not alpha_indices:
+         return None
+
+     lower = token.lower()
+     clusters = _vowel_clusters(lower, alpha_indices)
+     candidates: list[tuple[int, StretchSite]] = []
+
+     # Sibilant/sonorant coda extension (yes -> yesss, hmm -> hmmmm)
+     last_idx = alpha_indices[-1]
+     last_char = lower[last_idx]
+     if len(alpha_indices) >= 2:
+         prev_char = lower[alpha_indices[-2]]
+     else:
+         prev_char = ""
+     has_multi_vowel = any(
+         (end - start >= 2) and not (lower[start] == "y" and start == 0) for start, end in clusters
+     )
+     if last_char in {"s", "z"} and prev_char in VOWELS and not has_multi_vowel:
+         candidates.append((5, StretchSite(last_idx, last_idx + 1, "coda")))
+     elif last_char in SONORANTS and prev_char in VOWELS and not has_multi_vowel:
+         candidates.append((4, StretchSite(last_idx, last_idx + 1, "coda")))
+     elif not clusters:
+         candidates.append((2, StretchSite(last_idx, last_idx + 1, "consonant")))
+
+     # CVCe pattern (cute -> cuuute)
+     if lower.endswith("e") and len(alpha_indices) >= 3:
+         final_letter = alpha_indices[-1]
+         if token[final_letter].lower() == "e":
+             c_idx = alpha_indices[-2]
+             v_idx = alpha_indices[-3]
+             if token[c_idx].lower() not in VOWELS and token[v_idx].lower() in VOWELS:
+                 candidates.append((4, StretchSite(v_idx, v_idx + 1, "cvce")))
+
+     for cluster in clusters:
+         start, end = cluster
+         substring = lower[start:end]
+         category = "vowel"
+         if any(substring[i : i + 2] in DIGRAPHS for i in range(max(0, len(substring) - 1))):
+             category = "digraph"
+         priority = 3 if cluster == clusters[-1] else 2
+         candidates.append((priority, StretchSite(start, end, category)))
+
+     if not candidates:
+         return None
+
+     candidates.sort(key=lambda item: (item[0], item[1].end - item[1].start, -item[1].start))
+     return candidates[-1][1]
+
+
+ def apply_stretch(token: str, site: StretchSite, repeats: int) -> str:
+     """Return ``token`` with ``repeats`` extra copies of the grapheme at ``site``."""
+
+     if repeats <= 0:
+         return token
+     chars = list(token)
+     stretched: list[str] = []
+     for idx, char in enumerate(chars):
+         stretched.append(char)
+         if site.start <= idx < site.end:
+             stretched.append(char * repeats)
+     return "".join(stretched)
+
+
+ __all__ = ["StretchSite", "find_stretch_site", "apply_stretch"]
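The locator is a two-step helper pair: find_stretch_site picks the grapheme span to elongate and apply_stretch duplicates it. A minimal sketch of driving it directly; the sample word and repeat count are illustrative, not taken from the package's tests:

from glitchlings.util.stretch_locator import apply_stretch, find_stretch_site

# "cute" matches the CVCe branch above, so the vowel before the final "e" is targeted.
site = find_stretch_site("cute")
if site is not None:
    print(site.category)                   # "cvce"
    print(apply_stretch("cute", site, 2))  # "cuuute" - two extra copies of the "u"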
glitchlings/util/stretchability.py ADDED
@@ -0,0 +1,370 @@
+ """Stretchability scoring and candidate selection for Hokey."""
+
+ from __future__ import annotations
+
+ import re
+ from dataclasses import dataclass
+ from typing import Protocol, Sequence, TypedDict, cast
+
+ from glitchlings.zoo import assets
+
+ # Regexes reused across the module
+ TOKEN_REGEX = re.compile(r"\w+|\W+")
+ ALPHA_REGEX = re.compile(r"[A-Za-z]")
+ EMOJI_REGEX = re.compile(r"[\U0001F300-\U0001FAFF]")
+ CLAUSE_PUNCTUATION = {".", "?", "!", ";"}
+
+
+ class HokeyAssets(TypedDict):
+     lexical_prior: dict[str, float]
+     interjections: list[str]
+     intensifiers: list[str]
+     evaluatives: list[str]
+     positive_lexicon: list[str]
+     negative_lexicon: list[str]
+
+
+ class RandomLike(Protocol):
+     """Interface for RNGs that expose ``random()``."""
+
+     def random(self) -> float: ...
+
+
+ # Lexical prior probabilities and pragmatic lexica shared with the Rust fast path.
+ def _load_assets() -> HokeyAssets:
+     data = assets.load_json("hokey_assets.json")
+     return cast(HokeyAssets, data)
+
+
+ _ASSETS = _load_assets()
+ LEXICAL_PRIOR: dict[str, float] = {
+     token: float(score) for token, score in _ASSETS["lexical_prior"].items()
+ }
+
+ # Pragmatic lexica for POS/discourse cues
+ INTERJECTIONS = frozenset(_ASSETS["interjections"])
+ INTENSIFIERS = frozenset(_ASSETS["intensifiers"])
+ EVALUATIVES = frozenset(_ASSETS["evaluatives"])
+ POSITIVE_LEXICON = frozenset(_ASSETS["positive_lexicon"])
+ NEGATIVE_LEXICON = frozenset(_ASSETS["negative_lexicon"])
+
+ VOWELS = set("aeiouy")
+ SONORANT_CODAS = set("rlmnwyh")
+ SIBILANT_CODAS = {"s", "z", "x", "c", "j", "sh", "zh"}
+ DIGRAPHS = {
+     "aa",
+     "ae",
+     "ai",
+     "ay",
+     "ee",
+     "ei",
+     "ey",
+     "ie",
+     "oa",
+     "oe",
+     "oi",
+     "oo",
+     "ou",
+     "ue",
+     "ui",
+ }
+
+ MAX_CANDIDATES_PER_CLAUSE = 4
+ MIN_SCORE_THRESHOLD = 0.18
+
+
+ @dataclass(slots=True)
+ class TokenInfo:
+     text: str
+     start: int
+     end: int
+     is_word: bool
+     clause_index: int
+     preceding_punct: str
+     following_punct: str
+     index: int
+
+     @property
+     def normalised(self) -> str:
+         return self.text.lower()
+
+
+ @dataclass(slots=True)
+ class StretchabilityFeatures:
+     lexical: float
+     pos: float
+     sentiment: float
+     phonotactic: float
+     context: float
+     sentiment_swing: float
+
+     def intensity(self) -> float:
+         """Map features to an intensity scalar in [0, 1.5]."""
+         emphasis = 0.6 * self.context + 0.4 * self.sentiment_swing
+         return max(0.0, min(1.5, 0.5 * (self.lexical + self.phonotactic) + emphasis))
+
+
+ @dataclass(slots=True)
+ class StretchCandidate:
+     token: TokenInfo
+     score: float
+     features: StretchabilityFeatures
+
+
+ class StretchabilityAnalyzer:
+     """Compute stretchability scores and select candidates."""
+
+     def __init__(
+         self,
+         *,
+         lexical_prior: dict[str, float] | None = None,
+         weights: tuple[float, float, float, float, float] = (0.32, 0.18, 0.14, 0.22, 0.14),
+     ) -> None:
+         self.lexical_prior = lexical_prior or LEXICAL_PRIOR
+         self.weights = weights
+
+     # ------------------------------------------------------------------
+     # Public API
+     # ------------------------------------------------------------------
+     def tokenise(self, text: str) -> list[TokenInfo]:
+         """Tokenise text preserving separator tokens."""
+         return self._tokenise(text)
+
+     def analyse(self, text: str) -> list[StretchCandidate]:
+         if not text:
+             return []
+         tokens = self._tokenise(text)
+         return self.analyse_tokens(tokens)
+
+     def analyse_tokens(self, tokens: Sequence[TokenInfo]) -> list[StretchCandidate]:
+         candidates: list[StretchCandidate] = []
+         for idx, token in enumerate(tokens):
+             if not token.is_word:
+                 continue
+             if self._excluded(token, tokens, idx):
+                 continue
+
+             features = self._compute_features(token, tokens, idx)
+             score = self._composite_score(features)
+             if score < MIN_SCORE_THRESHOLD:
+                 continue
+             candidates.append(StretchCandidate(token=token, score=score, features=features))
+         return candidates
+
+     def select_candidates(
+         self,
+         candidates: Sequence[StretchCandidate],
+         *,
+         rate: float,
+         rng: RandomLike,
+     ) -> list[StretchCandidate]:
+         if not candidates or rate <= 0:
+             return []
+
+         grouped: dict[int, list[StretchCandidate]] = {}
+         for candidate in candidates:
+             grouped.setdefault(candidate.token.clause_index, []).append(candidate)
+
+         selected: list[StretchCandidate] = []
+         total_expected = max(0, min(len(candidates), int(round(len(candidates) * rate))))
+
+         for clause_index in sorted(grouped):
+             clause_candidates = sorted(
+                 grouped[clause_index], key=lambda c: (-c.score, c.token.start)
+             )
+             clause_candidates = clause_candidates[:MAX_CANDIDATES_PER_CLAUSE]
+             clause_quota = max(
+                 0, min(len(clause_candidates), int(round(len(clause_candidates) * rate)))
+             )
+
+             provisional: list[StretchCandidate] = []
+             for candidate in clause_candidates:
+                 probability = min(1.0, rate * (0.35 + 0.65 * candidate.score))
+                 if rng.random() < probability:
+                     provisional.append(candidate)
+                 if len(provisional) >= clause_quota:
+                     break
+
+             if len(provisional) < clause_quota:
+                 leftovers = [c for c in clause_candidates if c not in provisional]
+                 needed = clause_quota - len(provisional)
+                 provisional.extend(leftovers[:needed])
+
+             selected.extend(provisional)
+
+         if len(selected) < total_expected:
+             remaining = [c for c in candidates if c not in selected]
+             remaining.sort(key=lambda c: (-c.score, c.token.start))
+             selected.extend(remaining[: total_expected - len(selected)])
+
+         # Keep deterministic order by position
+         selected.sort(key=lambda c: c.token.start)
+         return selected
+
+     # ------------------------------------------------------------------
+     # Internal helpers
+     # ------------------------------------------------------------------
+     def _tokenise(self, text: str) -> list[TokenInfo]:
+         tokens: list[TokenInfo] = []
+         clause_index = 0
+         matches = list(TOKEN_REGEX.finditer(text))
+         for idx, match in enumerate(matches):
+             token_text = match.group(0)
+             is_word = bool(ALPHA_REGEX.search(token_text)) and token_text.strip().isalnum()
+             preceding = matches[idx - 1].group(0) if idx > 0 else ""
+             following = matches[idx + 1].group(0) if idx + 1 < len(matches) else ""
+             tokens.append(
+                 TokenInfo(
+                     text=token_text,
+                     start=match.start(),
+                     end=match.end(),
+                     is_word=is_word,
+                     clause_index=clause_index,
+                     preceding_punct=preceding,
+                     following_punct=following,
+                     index=idx,
+                 )
+             )
+             if any(ch in CLAUSE_PUNCTUATION for ch in token_text):
+                 clause_index += 1
+         return tokens
+
+     def _excluded(self, token: TokenInfo, tokens: Sequence[TokenInfo], index: int) -> bool:
+         text = token.text
+         normalised = token.normalised
+         if sum(ch.isalpha() for ch in text) < 2:
+             return True
+         if any(ch.isdigit() for ch in text):
+             return True
+         lowered = normalised
+         if "http" in lowered or "www" in lowered or "//" in lowered:
+             return True
+         if any(symbol in text for symbol in {"#", "@", "&", "{", "}", "<", ">"}):
+             return True
+         if "_" in text:
+             return True
+         if "/" in text or "\\" in text:
+             return True
+
+         # Heuristic proper noun check: Title case mid-clause counts as proper noun
+         if text[:1].isupper() and text[1:].islower():
+             previous_clause_start = index == 0
+             if not previous_clause_start:
+                 for prior in reversed(tokens[:index]):
+                     stripped = prior.text.strip()
+                     if not stripped:
+                         continue
+                     if stripped[-1] in CLAUSE_PUNCTUATION:
+                         previous_clause_start = True
+                     break
+             if not previous_clause_start:
+                 return True
+         return False
+
+     def _compute_features(
+         self, token: TokenInfo, tokens: Sequence[TokenInfo], index: int
+     ) -> StretchabilityFeatures:
+         lexical = self.lexical_prior.get(token.normalised, 0.12)
+         pos_score = self._pos_score(token)
+         sentiment_score, sentiment_swing = self._sentiment(tokens, index)
+         phon_score = self._phonotactic(token.normalised)
+         context_score = self._contextual(token, tokens, index)
+         return StretchabilityFeatures(
+             lexical=lexical,
+             pos=pos_score,
+             sentiment=sentiment_score,
+             phonotactic=phon_score,
+             context=context_score,
+             sentiment_swing=sentiment_swing,
+         )
+
+     def _composite_score(self, features: StretchabilityFeatures) -> float:
+         lex_w, pos_w, sent_w, phon_w, ctx_w = self.weights
+         weighted = (
+             lex_w * features.lexical
+             + pos_w * features.pos
+             + sent_w * features.sentiment
+             + phon_w * features.phonotactic
+             + ctx_w * features.context
+         )
+         total_weight = sum(self.weights)
+         score = weighted / total_weight if total_weight else 0.0
+         return max(0.0, min(1.0, score))
+
+     # ------------------------------------------------------------------
+     # Feature helpers
+     # ------------------------------------------------------------------
+     def _pos_score(self, token: TokenInfo) -> float:
+         normalised = token.normalised
+         if normalised in INTERJECTIONS:
+             return 0.95
+         if normalised in INTENSIFIERS:
+             return 0.85
+         if normalised in EVALUATIVES:
+             return 0.7
+         if normalised.endswith("ly"):
+             return 0.55
+         if token.text.isupper() and len(token.text) > 1:
+             return 0.65
+         return 0.3
+
+     def _sentiment(self, tokens: Sequence[TokenInfo], index: int) -> tuple[float, float]:
+         window = [tok for tok in tokens[max(0, index - 2) : index + 3] if tok.is_word]
+         if not window:
+             return 0.5, 0.0
+         pos_hits = sum(1 for tok in window if tok.normalised in POSITIVE_LEXICON)
+         neg_hits = sum(1 for tok in window if tok.normalised in NEGATIVE_LEXICON)
+         total = len(window)
+         balance = (pos_hits - neg_hits) / total
+         sentiment_score = 0.5 + 0.5 * max(-1.0, min(1.0, balance))
+         swing = abs(balance)
+         return sentiment_score, swing
+
+     def _phonotactic(self, normalised: str) -> float:
+         if not any(ch in VOWELS for ch in normalised):
+             return 0.0
+         score = 0.25
+         if any(normalised.endswith(c) for c in SONORANT_CODAS):
+             score += 0.2
+         if any(normalised.endswith(c) for c in SIBILANT_CODAS):
+             score += 0.18
+         if any(digraph in normalised for digraph in DIGRAPHS):
+             score += 0.22
+         if re.search(r"[aeiouy]{2,}", normalised):
+             score += 0.22
+         if re.search(r"(.)(?!\1)(.)\1", normalised):
+             score += 0.08
+         return max(0.0, min(1.0, score))
+
+     def _contextual(self, token: TokenInfo, tokens: Sequence[TokenInfo], index: int) -> float:
+         score = 0.2
+         before = token.preceding_punct
+         after = token.following_punct
+         token_text = token.text
+         if after and after.count("!") >= 1:
+             score += 0.25
+         if after and after.count("?") >= 1:
+             score += 0.2
+         if before and before.count("!") >= 2:
+             score += 0.2
+         if after and ("!!" in after or "??" in after):
+             score += 0.15
+         if token_text.isupper() and len(token_text) > 1:
+             score += 0.25
+         if EMOJI_REGEX.search(before or "") or EMOJI_REGEX.search(after or ""):
+             score += 0.15
+         # Clause-final emphasis
+         if index + 1 < len(tokens):
+             trailing = tokens[index + 1].text
+             if any(p in trailing for p in {"!!!", "??", "?!"}):
+                 score += 0.2
+         return max(0.0, min(1.0, score))
+
+
+ __all__ = [
+     "StretchabilityAnalyzer",
+     "StretchCandidate",
+     "StretchabilityFeatures",
+     "TokenInfo",
+     "RandomLike",
+ ]
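Hokey's scoring pipeline can be exercised on its own: analyse tokenises and scores, select_candidates applies the per-clause quota and probability gate. A short sketch with an illustrative sentence and seed; actual scores depend on the priors bundled in hokey_assets.json:

import random

from glitchlings.util.stretchability import StretchabilityAnalyzer

analyzer = StretchabilityAnalyzer()
candidates = analyzer.analyse("wow that was so good!!")
picked = analyzer.select_candidates(candidates, rate=0.5, rng=random.Random(42))
for candidate in picked:
    # Each candidate keeps its token span, composite score, and feature breakdown.
    print(candidate.token.text, round(candidate.score, 2), round(candidate.features.intensity(), 2))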
glitchlings/zoo/__init__.py CHANGED
@@ -14,6 +14,7 @@ from .core import (
      plan_glitchling_specs,
      plan_glitchlings,
  )
+ from .hokey import Hokey, hokey
  from .jargoyle import Jargoyle, jargoyle
  from .jargoyle import dependencies_available as _jargoyle_available
  from .mim1c import Mim1c, mim1c
@@ -33,6 +34,8 @@ __all__ = [
      "jargoyle",
      "Apostrofae",
      "apostrofae",
+     "Hokey",
+     "hokey",
      "Adjax",
      "adjax",
      "Reduple",
@@ -61,7 +64,7 @@ __all__ = [

  _HAS_JARGOYLE = _jargoyle_available()

- _BUILTIN_GLITCHLING_LIST: list[Glitchling] = [typogre, apostrofae, mim1c]
+ _BUILTIN_GLITCHLING_LIST: list[Glitchling] = [typogre, apostrofae, hokey, mim1c]
  if _HAS_JARGOYLE:
      _BUILTIN_GLITCHLING_LIST.append(jargoyle)
  _BUILTIN_GLITCHLING_LIST.extend([adjax, reduple, rushmore, redactyl, scannequin, zeedub])
@@ -73,6 +76,7 @@ BUILTIN_GLITCHLINGS: dict[str, Glitchling] = {
  _BUILTIN_GLITCHLING_TYPES: dict[str, type[Glitchling]] = {
      typogre.name.lower(): Typogre,
      apostrofae.name.lower(): Apostrofae,
+     hokey.name.lower(): Hokey,
      mim1c.name.lower(): Mim1c,
      adjax.name.lower(): Adjax,
      reduple.name.lower(): Reduple,
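With these hunks Hokey joins the built-in roster: it is importable from glitchlings.zoo and registered next to typogre, apostrofae, and mim1c. A quick check, assuming (as the *.name.lower() entries above suggest) that the registry is keyed by the lower-cased glitchling name:

from glitchlings.zoo import BUILTIN_GLITCHLINGS, Hokey, hokey

print(isinstance(hokey, Hokey))        # the module-level instance exported in __all__
print("hokey" in BUILTIN_GLITCHLINGS)  # assumed lower-cased key, per the registry hunks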
glitchlings/zoo/_ocr_confusions.py CHANGED
@@ -1,18 +1,18 @@
  from __future__ import annotations

- from importlib import resources
+ from .assets import read_text

  _CONFUSION_TABLE: list[tuple[str, list[str]]] | None = None


  def load_confusion_table() -> list[tuple[str, list[str]]]:
      """Load the OCR confusion table shared by Python and Rust implementations."""
+
      global _CONFUSION_TABLE
      if _CONFUSION_TABLE is not None:
          return _CONFUSION_TABLE

-     data = resources.files(__package__) / "ocr_confusions.tsv"
-     text = data.read_text(encoding="utf-8")
+     text = read_text("ocr_confusions.tsv")
      indexed_entries: list[tuple[int, tuple[str, list[str]]]] = []
      for line_number, line in enumerate(text.splitlines()):
          stripped = line.strip()
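The loader's behaviour is unchanged: it still returns the parsed table and caches it in the module-level global; only the file access moves to the shared assets helper. An illustrative call:

from glitchlings.zoo._ocr_confusions import load_confusion_table

table = load_confusion_table()   # parsed once, then served from _CONFUSION_TABLE
source, lookalikes = table[0]    # each entry pairs a source string with its OCR confusions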
glitchlings/zoo/_text_utils.py CHANGED
@@ -21,9 +21,9 @@ def split_token_edges(token: str) -> tuple[str, str, str]:
      return match.group(1), match.group(2), match.group(3)


- def token_core_length(token: str) -> int:
-     """Return the length of the main word characters for weighting heuristics."""
-     _, core, _ = split_token_edges(token)
+ def _resolve_core_length(core: str, token: str) -> int:
+     """Return a stable core-length measurement used by weighting heuristics."""
+
      candidate = core if core else token
      length = len(candidate)
      if length <= 0:
@@ -34,6 +34,12 @@ def token_core_length(token: str) -> int:
      return length


+ def token_core_length(token: str) -> int:
+     """Return the length of the main word characters for weighting heuristics."""
+     _, core, _ = split_token_edges(token)
+     return _resolve_core_length(core, token)
+
+
  @dataclass(frozen=True)
  class WordToken:
      """Metadata describing a non-whitespace token yielded by word splitters."""
@@ -71,12 +77,7 @@ def collect_word_tokens(
              continue

          prefix, core, suffix = split_token_edges(token)
-         core_length = len(core)
-         if core_length <= 0:
-             stripped = token.strip()
-             core_length = len(stripped) if stripped else len(token)
-             if core_length <= 0:
-                 core_length = 1
+         core_length = _resolve_core_length(core, token)

          collected.append(
              WordToken(
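token_core_length keeps its public signature; the shared _resolve_core_length simply centralises the fallback that collect_word_tokens previously inlined (stripped-token length, clamped to at least 1). An illustrative call; the exact counts depend on how split_token_edges delimits the core:

from glitchlings.zoo._text_utils import token_core_length

print(token_core_length("hello!"))  # punctuation edges excluded from the core, so likely 5
print(token_core_length("..."))     # no core: falls back to the stripped token, never below 1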
glitchlings/zoo/adjax.py CHANGED
@@ -3,7 +3,6 @@ from __future__ import annotations
  import random
  from typing import Any, cast

- from ._rate import resolve_rate
  from ._rust_extensions import get_rust_operation
  from ._text_utils import split_preserving_whitespace, split_token_edges
  from .core import AttackWave, Glitchling
@@ -66,16 +65,9 @@ def swap_adjacent_words(
      rate: float | None = None,
      seed: int | None = None,
      rng: random.Random | None = None,
-     *,
-     swap_rate: float | None = None,
  ) -> str:
      """Swap adjacent word cores while preserving spacing and punctuation."""
-     effective_rate = resolve_rate(
-         rate=rate,
-         legacy_value=swap_rate,
-         default=0.5,
-         legacy_name="swap_rate",
-     )
+     effective_rate = 0.5 if rate is None else rate
      clamped_rate = max(0.0, min(effective_rate, 1.0))

      if rng is None:
@@ -94,16 +86,9 @@ class Adjax(Glitchling):
          self,
          *,
          rate: float | None = None,
-         swap_rate: float | None = None,
          seed: int | None = None,
      ) -> None:
-         self._param_aliases = {"swap_rate": "rate"}
-         effective_rate = resolve_rate(
-             rate=rate,
-             legacy_value=swap_rate,
-             default=0.5,
-             legacy_name="swap_rate",
-         )
+         effective_rate = 0.5 if rate is None else rate
          super().__init__(
              name="Adjax",
              corruption_function=swap_adjacent_words,
@@ -118,7 +103,7 @@ class Adjax(Glitchling):
              return None
          return {
              "type": "swap_adjacent",
-             "swap_rate": float(rate),
+             "rate": float(rate),
          }

glitchlings/zoo/apostrofae.py CHANGED
@@ -2,13 +2,12 @@

  from __future__ import annotations

- import json
  import random
  from functools import cache
- from importlib import resources
  from typing import Any, Sequence, cast

  from ._rust_extensions import get_rust_operation
+ from .assets import load_json
  from .core import AttackOrder, AttackWave, Gaggle, Glitchling

  # Load Rust-accelerated operation if available
@@ -19,9 +18,7 @@ _apostrofae_rust = get_rust_operation("apostrofae")
  def _load_replacement_pairs() -> dict[str, list[tuple[str, str]]]:
      """Load the curated mapping of straight quotes to fancy pairs."""

-     resource = resources.files(f"{__package__}.assets").joinpath("apostrofae_pairs.json")
-     with resource.open("r", encoding="utf-8") as handle:
-         data: dict[str, list[Sequence[str]]] = json.load(handle)
+     data: dict[str, list[Sequence[str]]] = load_json("apostrofae_pairs.json")

      parsed: dict[str, list[tuple[str, str]]] = {}
      for straight, replacements in data.items():
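Both asset helpers seen in these hunks come from the new glitchlings/zoo/assets package, which centralises access to bundled data files. A hedged sketch of the two entry points the diffs actually exercise; anything else in the assets module is not shown in this diff:

from glitchlings.zoo.assets import load_json, read_text

pairs = load_json("apostrofae_pairs.json")  # same payload _load_replacement_pairs parses
tsv = read_text("ocr_confusions.tsv")       # the relocated OCR confusion table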