glitchlings 0.4.4__cp310-cp310-win_amd64.whl → 0.4.5__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of glitchlings might be problematic.

@@ -0,0 +1,140 @@
+"""Identify where expressive stretches should occur within a token."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Iterable
+
+VOWELS = set("aeiouyAEIOUY")
+SONORANTS = set("rlmnwyhRLMNWYH")
+SIBILANTS = set("sSzZxXcCjJ") | {"sh", "Sh", "sH", "SH", "zh", "Zh"}
+DIGRAPHS = {
+    "aa",
+    "ae",
+    "ai",
+    "ay",
+    "ee",
+    "ei",
+    "ey",
+    "ie",
+    "io",
+    "oa",
+    "oe",
+    "oi",
+    "oo",
+    "ou",
+    "ua",
+    "ue",
+    "ui",
+    "ya",
+    "yo",
+    "yu",
+}
+
+
+@dataclass(slots=True)
+class StretchSite:
+    """Location of a stretchable grapheme."""
+
+    start: int
+    end: int
+    category: str
+
+    def unit(self, token: str) -> str:
+        return token[self.start : self.end]
+
+
+def _alpha_indices(token: str) -> list[int]:
+    return [idx for idx, char in enumerate(token) if char.isalpha()]
+
+
+def _vowel_clusters(token: str, indices: Iterable[int]) -> list[tuple[int, int]]:
+    clusters: list[tuple[int, int]] = []
+    start: int | None = None
+    prev_idx: int | None = None
+    for idx in indices:
+        char = token[idx]
+        if char in VOWELS:
+            if start is None:
+                start = idx
+            elif prev_idx is not None and idx != prev_idx + 1:
+                clusters.append((start, prev_idx + 1))
+                start = idx
+        else:
+            if start is not None:
+                clusters.append((start, idx))
+                start = None
+        prev_idx = idx
+    if start is not None and prev_idx is not None:
+        clusters.append((start, prev_idx + 1))
+    return clusters
+
+
+def find_stretch_site(token: str) -> StretchSite | None:
+    """Return the most suitable stretch site for ``token``."""
+
+    alpha_indices = _alpha_indices(token)
+    if not alpha_indices:
+        return None
+
+    lower = token.lower()
+    clusters = _vowel_clusters(lower, alpha_indices)
+    candidates: list[tuple[int, StretchSite]] = []
+
+    # Sibilant/sonorant coda extension (yes -> yesss, hmm -> hmmmm)
+    last_idx = alpha_indices[-1]
+    last_char = lower[last_idx]
+    if len(alpha_indices) >= 2:
+        prev_char = lower[alpha_indices[-2]]
+    else:
+        prev_char = ""
+    has_multi_vowel = any(
+        (end - start >= 2) and not (lower[start] == "y" and start == 0) for start, end in clusters
+    )
+    if last_char in {"s", "z"} and prev_char in VOWELS and not has_multi_vowel:
+        candidates.append((5, StretchSite(last_idx, last_idx + 1, "coda")))
+    elif last_char in SONORANTS and prev_char in VOWELS and not has_multi_vowel:
+        candidates.append((4, StretchSite(last_idx, last_idx + 1, "coda")))
+    elif not clusters:
+        candidates.append((2, StretchSite(last_idx, last_idx + 1, "consonant")))
+
+    # CVCe pattern (cute -> cuuute)
+    if lower.endswith("e") and len(alpha_indices) >= 3:
+        final_letter = alpha_indices[-1]
+        if token[final_letter].lower() == "e":
+            c_idx = alpha_indices[-2]
+            v_idx = alpha_indices[-3]
+            if token[c_idx].lower() not in VOWELS and token[v_idx].lower() in VOWELS:
+                candidates.append((4, StretchSite(v_idx, v_idx + 1, "cvce")))
+
+    for cluster in clusters:
+        start, end = cluster
+        substring = lower[start:end]
+        category = "vowel"
+        if any(substring[i : i + 2] in DIGRAPHS for i in range(max(0, len(substring) - 1))):
+            category = "digraph"
+        priority = 3 if cluster == clusters[-1] else 2
+        candidates.append((priority, StretchSite(start, end, category)))
+
+    if not candidates:
+        return None
+
+    candidates.sort(key=lambda item: (item[0], item[1].end - item[1].start, -item[1].start))
+    return candidates[-1][1]
+
+
+def apply_stretch(token: str, site: StretchSite, repeats: int) -> str:
+    """Return ``token`` with ``repeats`` extra copies of the grapheme at ``site``."""
+
+    if repeats <= 0:
+        return token
+    chars = list(token)
+    stretched: list[str] = []
+    for idx, char in enumerate(chars):
+        stretched.append(char)
+        if site.start <= idx < site.end:
+            stretched.append(char * repeats)
+    return "".join(stretched)
+
+
+__all__ = ["StretchSite", "find_stretch_site", "apply_stretch"]
@@ -0,0 +1,375 @@
+"""Stretchability scoring and candidate selection for Hokey."""
+
+from __future__ import annotations
+
+import json
+import re
+from dataclasses import dataclass
+from importlib import resources
+from typing import Any, Protocol, Sequence, TypedDict, cast
+
+# Regexes reused across the module
+TOKEN_REGEX = re.compile(r"\w+|\W+")
+ALPHA_REGEX = re.compile(r"[A-Za-z]")
+EMOJI_REGEX = re.compile(r"[\U0001F300-\U0001FAFF]")
+CLAUSE_PUNCTUATION = {".", "?", "!", ";"}
+
+
+class HokeyAssets(TypedDict):
+    lexical_prior: dict[str, float]
+    interjections: list[str]
+    intensifiers: list[str]
+    evaluatives: list[str]
+    positive_lexicon: list[str]
+    negative_lexicon: list[str]
+
+
+class RandomLike(Protocol):
+    """Interface for RNGs that expose ``random()``."""
+
+    def random(self) -> float: ...
+
+
+# Lexical prior probabilities and pragmatic lexica shared with the Rust fast path.
+def _load_assets() -> HokeyAssets:
+    with (
+        resources.files("glitchlings.data")
+        .joinpath("hokey_assets.json")
+        .open("r", encoding="utf-8") as payload
+    ):
+        data: Any = json.load(payload)
+    return cast(HokeyAssets, data)
+
+
+_ASSETS = _load_assets()
+LEXICAL_PRIOR: dict[str, float] = {
+    token: float(score) for token, score in _ASSETS["lexical_prior"].items()
+}
+
+# Pragmatic lexica for POS/discourse cues
+INTERJECTIONS = frozenset(_ASSETS["interjections"])
+INTENSIFIERS = frozenset(_ASSETS["intensifiers"])
+EVALUATIVES = frozenset(_ASSETS["evaluatives"])
+POSITIVE_LEXICON = frozenset(_ASSETS["positive_lexicon"])
+NEGATIVE_LEXICON = frozenset(_ASSETS["negative_lexicon"])
+
+VOWELS = set("aeiouy")
+SONORANT_CODAS = set("rlmnwyh")
+SIBILANT_CODAS = {"s", "z", "x", "c", "j", "sh", "zh"}
+DIGRAPHS = {
+    "aa",
+    "ae",
+    "ai",
+    "ay",
+    "ee",
+    "ei",
+    "ey",
+    "ie",
+    "oa",
+    "oe",
+    "oi",
+    "oo",
+    "ou",
+    "ue",
+    "ui",
+}
+
+MAX_CANDIDATES_PER_CLAUSE = 4
+MIN_SCORE_THRESHOLD = 0.18
+
+
+@dataclass(slots=True)
+class TokenInfo:
+    text: str
+    start: int
+    end: int
+    is_word: bool
+    clause_index: int
+    preceding_punct: str
+    following_punct: str
+    index: int
+
+    @property
+    def normalised(self) -> str:
+        return self.text.lower()
+
+
+@dataclass(slots=True)
+class StretchabilityFeatures:
+    lexical: float
+    pos: float
+    sentiment: float
+    phonotactic: float
+    context: float
+    sentiment_swing: float
+
+    def intensity(self) -> float:
+        """Map features to an intensity scalar in [0, 1.5]."""
+        emphasis = 0.6 * self.context + 0.4 * self.sentiment_swing
+        return max(0.0, min(1.5, 0.5 * (self.lexical + self.phonotactic) + emphasis))
+
+
+@dataclass(slots=True)
+class StretchCandidate:
+    token: TokenInfo
+    score: float
+    features: StretchabilityFeatures
+
+
+class StretchabilityAnalyzer:
+    """Compute stretchability scores and select candidates."""
+
+    def __init__(
+        self,
+        *,
+        lexical_prior: dict[str, float] | None = None,
+        weights: tuple[float, float, float, float, float] = (0.32, 0.18, 0.14, 0.22, 0.14),
+    ) -> None:
+        self.lexical_prior = lexical_prior or LEXICAL_PRIOR
+        self.weights = weights
+
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+    def tokenise(self, text: str) -> list[TokenInfo]:
+        """Tokenise text preserving separator tokens."""
+        return self._tokenise(text)
+
+    def analyse(self, text: str) -> list[StretchCandidate]:
+        if not text:
+            return []
+        tokens = self._tokenise(text)
+        return self.analyse_tokens(tokens)
+
+    def analyse_tokens(self, tokens: Sequence[TokenInfo]) -> list[StretchCandidate]:
+        candidates: list[StretchCandidate] = []
+        for idx, token in enumerate(tokens):
+            if not token.is_word:
+                continue
+            if self._excluded(token, tokens, idx):
+                continue
+
+            features = self._compute_features(token, tokens, idx)
+            score = self._composite_score(features)
+            if score < MIN_SCORE_THRESHOLD:
+                continue
+            candidates.append(StretchCandidate(token=token, score=score, features=features))
+        return candidates
+
+    def select_candidates(
+        self,
+        candidates: Sequence[StretchCandidate],
+        *,
+        rate: float,
+        rng: RandomLike,
+    ) -> list[StretchCandidate]:
+        if not candidates or rate <= 0:
+            return []
+
+        grouped: dict[int, list[StretchCandidate]] = {}
+        for candidate in candidates:
+            grouped.setdefault(candidate.token.clause_index, []).append(candidate)
+
+        selected: list[StretchCandidate] = []
+        total_expected = max(0, min(len(candidates), int(round(len(candidates) * rate))))
+
+        for clause_index in sorted(grouped):
+            clause_candidates = sorted(
+                grouped[clause_index], key=lambda c: (-c.score, c.token.start)
+            )
+            clause_candidates = clause_candidates[:MAX_CANDIDATES_PER_CLAUSE]
+            clause_quota = max(
+                0, min(len(clause_candidates), int(round(len(clause_candidates) * rate)))
+            )
+
+            provisional: list[StretchCandidate] = []
+            for candidate in clause_candidates:
+                probability = min(1.0, rate * (0.35 + 0.65 * candidate.score))
+                if rng.random() < probability:
+                    provisional.append(candidate)
+                if len(provisional) >= clause_quota:
+                    break
+
+            if len(provisional) < clause_quota:
+                leftovers = [c for c in clause_candidates if c not in provisional]
+                needed = clause_quota - len(provisional)
+                provisional.extend(leftovers[:needed])
+
+            selected.extend(provisional)
+
+        if len(selected) < total_expected:
+            remaining = [c for c in candidates if c not in selected]
+            remaining.sort(key=lambda c: (-c.score, c.token.start))
+            selected.extend(remaining[: total_expected - len(selected)])
+
+        # Keep deterministic order by position
+        selected.sort(key=lambda c: c.token.start)
+        return selected
+
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+    def _tokenise(self, text: str) -> list[TokenInfo]:
+        tokens: list[TokenInfo] = []
+        clause_index = 0
+        matches = list(TOKEN_REGEX.finditer(text))
+        for idx, match in enumerate(matches):
+            token_text = match.group(0)
+            is_word = bool(ALPHA_REGEX.search(token_text)) and token_text.strip().isalnum()
+            preceding = matches[idx - 1].group(0) if idx > 0 else ""
+            following = matches[idx + 1].group(0) if idx + 1 < len(matches) else ""
+            tokens.append(
+                TokenInfo(
+                    text=token_text,
+                    start=match.start(),
+                    end=match.end(),
+                    is_word=is_word,
+                    clause_index=clause_index,
+                    preceding_punct=preceding,
+                    following_punct=following,
+                    index=idx,
+                )
+            )
+            if any(ch in CLAUSE_PUNCTUATION for ch in token_text):
+                clause_index += 1
+        return tokens
+
+    def _excluded(self, token: TokenInfo, tokens: Sequence[TokenInfo], index: int) -> bool:
+        text = token.text
+        normalised = token.normalised
+        if sum(ch.isalpha() for ch in text) < 2:
+            return True
+        if any(ch.isdigit() for ch in text):
+            return True
+        lowered = normalised
+        if "http" in lowered or "www" in lowered or "//" in lowered:
+            return True
+        if any(symbol in text for symbol in {"#", "@", "&", "{", "}", "<", ">"}):
+            return True
+        if "_" in text:
+            return True
+        if "/" in text or "\\" in text:
+            return True
+
+        # Heuristic proper noun check: Title case mid-clause counts as proper noun
+        if text[:1].isupper() and text[1:].islower():
+            previous_clause_start = index == 0
+            if not previous_clause_start:
+                for prior in reversed(tokens[:index]):
+                    stripped = prior.text.strip()
+                    if not stripped:
+                        continue
+                    if stripped[-1] in CLAUSE_PUNCTUATION:
+                        previous_clause_start = True
+                    break
+            if not previous_clause_start:
+                return True
+        return False
+
+    def _compute_features(
+        self, token: TokenInfo, tokens: Sequence[TokenInfo], index: int
+    ) -> StretchabilityFeatures:
+        lexical = self.lexical_prior.get(token.normalised, 0.12)
+        pos_score = self._pos_score(token)
+        sentiment_score, sentiment_swing = self._sentiment(tokens, index)
+        phon_score = self._phonotactic(token.normalised)
+        context_score = self._contextual(token, tokens, index)
+        return StretchabilityFeatures(
+            lexical=lexical,
+            pos=pos_score,
+            sentiment=sentiment_score,
+            phonotactic=phon_score,
+            context=context_score,
+            sentiment_swing=sentiment_swing,
+        )
+
+    def _composite_score(self, features: StretchabilityFeatures) -> float:
+        lex_w, pos_w, sent_w, phon_w, ctx_w = self.weights
+        weighted = (
+            lex_w * features.lexical
+            + pos_w * features.pos
+            + sent_w * features.sentiment
+            + phon_w * features.phonotactic
+            + ctx_w * features.context
+        )
+        total_weight = sum(self.weights)
+        score = weighted / total_weight if total_weight else 0.0
+        return max(0.0, min(1.0, score))

+    # ------------------------------------------------------------------
+    # Feature helpers
+    # ------------------------------------------------------------------
+    def _pos_score(self, token: TokenInfo) -> float:
+        normalised = token.normalised
+        if normalised in INTERJECTIONS:
+            return 0.95
+        if normalised in INTENSIFIERS:
+            return 0.85
+        if normalised in EVALUATIVES:
+            return 0.7
+        if normalised.endswith("ly"):
+            return 0.55
+        if token.text.isupper() and len(token.text) > 1:
+            return 0.65
+        return 0.3
+
+    def _sentiment(self, tokens: Sequence[TokenInfo], index: int) -> tuple[float, float]:
+        window = [tok for tok in tokens[max(0, index - 2) : index + 3] if tok.is_word]
+        if not window:
+            return 0.5, 0.0
+        pos_hits = sum(1 for tok in window if tok.normalised in POSITIVE_LEXICON)
+        neg_hits = sum(1 for tok in window if tok.normalised in NEGATIVE_LEXICON)
+        total = len(window)
+        balance = (pos_hits - neg_hits) / total
+        sentiment_score = 0.5 + 0.5 * max(-1.0, min(1.0, balance))
+        swing = abs(balance)
+        return sentiment_score, swing
+
+    def _phonotactic(self, normalised: str) -> float:
+        if not any(ch in VOWELS for ch in normalised):
+            return 0.0
+        score = 0.25
+        if any(normalised.endswith(c) for c in SONORANT_CODAS):
+            score += 0.2
+        if any(normalised.endswith(c) for c in SIBILANT_CODAS):
+            score += 0.18
+        if any(digraph in normalised for digraph in DIGRAPHS):
+            score += 0.22
+        if re.search(r"[aeiouy]{2,}", normalised):
+            score += 0.22
+        if re.search(r"(.)(?!\1)(.)\1", normalised):
+            score += 0.08
+        return max(0.0, min(1.0, score))
+
+    def _contextual(self, token: TokenInfo, tokens: Sequence[TokenInfo], index: int) -> float:
+        score = 0.2
+        before = token.preceding_punct
+        after = token.following_punct
+        token_text = token.text
+        if after and after.count("!") >= 1:
+            score += 0.25
+        if after and after.count("?") >= 1:
+            score += 0.2
+        if before and before.count("!") >= 2:
+            score += 0.2
+        if after and ("!!" in after or "??" in after):
+            score += 0.15
+        if token_text.isupper() and len(token_text) > 1:
+            score += 0.25
+        if EMOJI_REGEX.search(before or "") or EMOJI_REGEX.search(after or ""):
+            score += 0.15
+        # Clause-final emphasis
+        if index + 1 < len(tokens):
+            trailing = tokens[index + 1].text
+            if any(p in trailing for p in {"!!!", "??", "?!"}):
+                score += 0.2
+        return max(0.0, min(1.0, score))
+
+
+__all__ = [
+    "StretchabilityAnalyzer",
+    "StretchCandidate",
+    "StretchabilityFeatures",
+    "TokenInfo",
+    "RandomLike",
+]
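
The analyzer above is the scoring half of the feature: analyse tokenises a string and scores each word, and select_candidates thins the scored list clause by clause using any RNG that exposes random(). A hedged usage sketch, assuming the module ships as glitchlings.hokey.stretchability (the import path and the 0.6 rate are illustrative, not taken from this diff):

import random

# Import path is an assumption; the diff above does not name the file on disk.
from glitchlings.hokey.stretchability import StretchabilityAnalyzer

analyzer = StretchabilityAnalyzer()
candidates = analyzer.analyse("wow that was sooo much fun!!")
# random.Random satisfies the RandomLike protocol since it exposes random().
chosen = analyzer.select_candidates(candidates, rate=0.6, rng=random.Random(42))
for candidate in chosen:
    print(candidate.token.text, round(candidate.score, 3))
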
@@ -14,6 +14,7 @@ from .core import (
     plan_glitchling_specs,
     plan_glitchlings,
 )
+from .hokey import Hokey, hokey
 from .jargoyle import Jargoyle, jargoyle
 from .jargoyle import dependencies_available as _jargoyle_available
 from .mim1c import Mim1c, mim1c
@@ -33,6 +34,8 @@ __all__ = [
     "jargoyle",
     "Apostrofae",
     "apostrofae",
+    "Hokey",
+    "hokey",
     "Adjax",
     "adjax",
     "Reduple",
@@ -61,7 +64,7 @@ __all__ = [
 
 _HAS_JARGOYLE = _jargoyle_available()
 
-_BUILTIN_GLITCHLING_LIST: list[Glitchling] = [typogre, apostrofae, mim1c]
+_BUILTIN_GLITCHLING_LIST: list[Glitchling] = [typogre, apostrofae, hokey, mim1c]
 if _HAS_JARGOYLE:
     _BUILTIN_GLITCHLING_LIST.append(jargoyle)
 _BUILTIN_GLITCHLING_LIST.extend([adjax, reduple, rushmore, redactyl, scannequin, zeedub])
@@ -73,6 +76,7 @@ BUILTIN_GLITCHLINGS: dict[str, Glitchling] = {
 _BUILTIN_GLITCHLING_TYPES: dict[str, type[Glitchling]] = {
     typogre.name.lower(): Typogre,
     apostrofae.name.lower(): Apostrofae,
+    hokey.name.lower(): Hokey,
     mim1c.name.lower(): Mim1c,
     adjax.name.lower(): Adjax,
     reduple.name.lower(): Reduple,