glitchlings-0.4.4-cp313-cp313-win_amd64.whl → glitchlings-0.4.5-cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of glitchlings might be problematic.
- glitchlings/__init__.py +4 -0
- glitchlings/_zoo_rust.cp313-win_amd64.pyd +0 -0
- glitchlings/compat.py +2 -4
- glitchlings/config.py +2 -4
- glitchlings/data/__init__.py +1 -0
- glitchlings/data/hokey_assets.json +193 -0
- glitchlings/dlc/_shared.py +6 -6
- glitchlings/dlc/huggingface.py +6 -6
- glitchlings/dlc/prime.py +1 -1
- glitchlings/dlc/pytorch.py +3 -3
- glitchlings/dlc/pytorch_lightning.py +4 -10
- glitchlings/lexicon/_cache.py +3 -5
- glitchlings/lexicon/vector.py +6 -5
- glitchlings/lexicon/wordnet.py +4 -8
- glitchlings/util/hokey_generator.py +144 -0
- glitchlings/util/stretch_locator.py +140 -0
- glitchlings/util/stretchability.py +375 -0
- glitchlings/zoo/__init__.py +5 -1
- glitchlings/zoo/hokey.py +173 -0
- {glitchlings-0.4.4.dist-info → glitchlings-0.4.5.dist-info}/METADATA +26 -5
- {glitchlings-0.4.4.dist-info → glitchlings-0.4.5.dist-info}/RECORD +25 -19
- {glitchlings-0.4.4.dist-info → glitchlings-0.4.5.dist-info}/WHEEL +0 -0
- {glitchlings-0.4.4.dist-info → glitchlings-0.4.5.dist-info}/entry_points.txt +0 -0
- {glitchlings-0.4.4.dist-info → glitchlings-0.4.5.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.4.4.dist-info → glitchlings-0.4.5.dist-info}/top_level.txt +0 -0
glitchlings/util/stretch_locator.py
ADDED
@@ -0,0 +1,140 @@
+"""Identify where expressive stretches should occur within a token."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Iterable
+
+VOWELS = set("aeiouyAEIOUY")
+SONORANTS = set("rlmnwyhRLMNWYH")
+SIBILANTS = set("sSzZxXcCjJ") | {"sh", "Sh", "sH", "SH", "zh", "Zh"}
+DIGRAPHS = {
+    "aa",
+    "ae",
+    "ai",
+    "ay",
+    "ee",
+    "ei",
+    "ey",
+    "ie",
+    "io",
+    "oa",
+    "oe",
+    "oi",
+    "oo",
+    "ou",
+    "ua",
+    "ue",
+    "ui",
+    "ya",
+    "yo",
+    "yu",
+}
+
+
+@dataclass(slots=True)
+class StretchSite:
+    """Location of a stretchable grapheme."""
+
+    start: int
+    end: int
+    category: str
+
+    def unit(self, token: str) -> str:
+        return token[self.start : self.end]
+
+
+def _alpha_indices(token: str) -> list[int]:
+    return [idx for idx, char in enumerate(token) if char.isalpha()]
+
+
+def _vowel_clusters(token: str, indices: Iterable[int]) -> list[tuple[int, int]]:
+    clusters: list[tuple[int, int]] = []
+    start: int | None = None
+    prev_idx: int | None = None
+    for idx in indices:
+        char = token[idx]
+        if char in VOWELS:
+            if start is None:
+                start = idx
+            elif prev_idx is not None and idx != prev_idx + 1:
+                clusters.append((start, prev_idx + 1))
+                start = idx
+        else:
+            if start is not None:
+                clusters.append((start, idx))
+                start = None
+        prev_idx = idx
+    if start is not None and prev_idx is not None:
+        clusters.append((start, prev_idx + 1))
+    return clusters
+
+
+def find_stretch_site(token: str) -> StretchSite | None:
+    """Return the most suitable stretch site for ``token``."""
+
+    alpha_indices = _alpha_indices(token)
+    if not alpha_indices:
+        return None
+
+    lower = token.lower()
+    clusters = _vowel_clusters(lower, alpha_indices)
+    candidates: list[tuple[int, StretchSite]] = []
+
+    # Sibilant/sonorant coda extension (yes -> yesss, hmm -> hmmmm)
+    last_idx = alpha_indices[-1]
+    last_char = lower[last_idx]
+    if len(alpha_indices) >= 2:
+        prev_char = lower[alpha_indices[-2]]
+    else:
+        prev_char = ""
+    has_multi_vowel = any(
+        (end - start >= 2) and not (lower[start] == "y" and start == 0) for start, end in clusters
+    )
+    if last_char in {"s", "z"} and prev_char in VOWELS and not has_multi_vowel:
+        candidates.append((5, StretchSite(last_idx, last_idx + 1, "coda")))
+    elif last_char in SONORANTS and prev_char in VOWELS and not has_multi_vowel:
+        candidates.append((4, StretchSite(last_idx, last_idx + 1, "coda")))
+    elif not clusters:
+        candidates.append((2, StretchSite(last_idx, last_idx + 1, "consonant")))
+
+    # CVCe pattern (cute -> cuuute)
+    if lower.endswith("e") and len(alpha_indices) >= 3:
+        final_letter = alpha_indices[-1]
+        if token[final_letter].lower() == "e":
+            c_idx = alpha_indices[-2]
+            v_idx = alpha_indices[-3]
+            if token[c_idx].lower() not in VOWELS and token[v_idx].lower() in VOWELS:
+                candidates.append((4, StretchSite(v_idx, v_idx + 1, "cvce")))
+
+    for cluster in clusters:
+        start, end = cluster
+        substring = lower[start:end]
+        category = "vowel"
+        if any(substring[i : i + 2] in DIGRAPHS for i in range(max(0, len(substring) - 1))):
+            category = "digraph"
+        priority = 3 if cluster == clusters[-1] else 2
+        candidates.append((priority, StretchSite(start, end, category)))
+
+    if not candidates:
+        return None
+
+    candidates.sort(key=lambda item: (item[0], item[1].end - item[1].start, -item[1].start))
+    return candidates[-1][1]
+
+
+def apply_stretch(token: str, site: StretchSite, repeats: int) -> str:
+    """Return ``token`` with ``repeats`` extra copies of the grapheme at ``site``."""
+
+    if repeats <= 0:
+        return token
+    chars = list(token)
+    stretched: list[str] = []
+    for idx, char in enumerate(chars):
+        stretched.append(char)
+        if site.start <= idx < site.end:
+            stretched.append(char * repeats)
+    return "".join(stretched)
+
+
+__all__ = ["StretchSite", "find_stretch_site", "apply_stretch"]
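
For orientation, the new locator exposes two helpers: find_stretch_site() picks the grapheme to elongate and apply_stretch() duplicates it. A minimal usage sketch, assuming the 0.4.5 wheel is installed and the module is importable from the path shown in the file list above:

from glitchlings.util.stretch_locator import apply_stretch, find_stretch_site

site = find_stretch_site("cute")           # the CVCe rule selects the "u"
if site is not None:
    print(site.category)                   # "cvce"
    print(apply_stretch("cute", site, 2))  # "cuuute"
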
glitchlings/util/stretchability.py
ADDED
@@ -0,0 +1,375 @@
+"""Stretchability scoring and candidate selection for Hokey."""
+
+from __future__ import annotations
+
+import json
+import re
+from dataclasses import dataclass
+from importlib import resources
+from typing import Any, Protocol, Sequence, TypedDict, cast
+
+# Regexes reused across the module
+TOKEN_REGEX = re.compile(r"\w+|\W+")
+ALPHA_REGEX = re.compile(r"[A-Za-z]")
+EMOJI_REGEX = re.compile(r"[\U0001F300-\U0001FAFF]")
+CLAUSE_PUNCTUATION = {".", "?", "!", ";"}
+
+
+class HokeyAssets(TypedDict):
+    lexical_prior: dict[str, float]
+    interjections: list[str]
+    intensifiers: list[str]
+    evaluatives: list[str]
+    positive_lexicon: list[str]
+    negative_lexicon: list[str]
+
+
+class RandomLike(Protocol):
+    """Interface for RNGs that expose ``random()``."""
+
+    def random(self) -> float: ...
+
+
+# Lexical prior probabilities and pragmatic lexica shared with the Rust fast path.
+def _load_assets() -> HokeyAssets:
+    with (
+        resources.files("glitchlings.data")
+        .joinpath("hokey_assets.json")
+        .open("r", encoding="utf-8") as payload
+    ):
+        data: Any = json.load(payload)
+    return cast(HokeyAssets, data)
+
+
+_ASSETS = _load_assets()
+LEXICAL_PRIOR: dict[str, float] = {
+    token: float(score) for token, score in _ASSETS["lexical_prior"].items()
+}
+
+# Pragmatic lexica for POS/discourse cues
+INTERJECTIONS = frozenset(_ASSETS["interjections"])
+INTENSIFIERS = frozenset(_ASSETS["intensifiers"])
+EVALUATIVES = frozenset(_ASSETS["evaluatives"])
+POSITIVE_LEXICON = frozenset(_ASSETS["positive_lexicon"])
+NEGATIVE_LEXICON = frozenset(_ASSETS["negative_lexicon"])
+
+VOWELS = set("aeiouy")
+SONORANT_CODAS = set("rlmnwyh")
+SIBILANT_CODAS = {"s", "z", "x", "c", "j", "sh", "zh"}
+DIGRAPHS = {
+    "aa",
+    "ae",
+    "ai",
+    "ay",
+    "ee",
+    "ei",
+    "ey",
+    "ie",
+    "oa",
+    "oe",
+    "oi",
+    "oo",
+    "ou",
+    "ue",
+    "ui",
+}
+
+MAX_CANDIDATES_PER_CLAUSE = 4
+MIN_SCORE_THRESHOLD = 0.18
+
+
+@dataclass(slots=True)
+class TokenInfo:
+    text: str
+    start: int
+    end: int
+    is_word: bool
+    clause_index: int
+    preceding_punct: str
+    following_punct: str
+    index: int
+
+    @property
+    def normalised(self) -> str:
+        return self.text.lower()
+
+
+@dataclass(slots=True)
+class StretchabilityFeatures:
+    lexical: float
+    pos: float
+    sentiment: float
+    phonotactic: float
+    context: float
+    sentiment_swing: float
+
+    def intensity(self) -> float:
+        """Map features to an intensity scalar in [0, 1.5]."""
+        emphasis = 0.6 * self.context + 0.4 * self.sentiment_swing
+        return max(0.0, min(1.5, 0.5 * (self.lexical + self.phonotactic) + emphasis))
+
+
+@dataclass(slots=True)
+class StretchCandidate:
+    token: TokenInfo
+    score: float
+    features: StretchabilityFeatures
+
+
+class StretchabilityAnalyzer:
+    """Compute stretchability scores and select candidates."""
+
+    def __init__(
+        self,
+        *,
+        lexical_prior: dict[str, float] | None = None,
+        weights: tuple[float, float, float, float, float] = (0.32, 0.18, 0.14, 0.22, 0.14),
+    ) -> None:
+        self.lexical_prior = lexical_prior or LEXICAL_PRIOR
+        self.weights = weights
+
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+    def tokenise(self, text: str) -> list[TokenInfo]:
+        """Tokenise text preserving separator tokens."""
+        return self._tokenise(text)
+
+    def analyse(self, text: str) -> list[StretchCandidate]:
+        if not text:
+            return []
+        tokens = self._tokenise(text)
+        return self.analyse_tokens(tokens)
+
+    def analyse_tokens(self, tokens: Sequence[TokenInfo]) -> list[StretchCandidate]:
+        candidates: list[StretchCandidate] = []
+        for idx, token in enumerate(tokens):
+            if not token.is_word:
+                continue
+            if self._excluded(token, tokens, idx):
+                continue
+
+            features = self._compute_features(token, tokens, idx)
+            score = self._composite_score(features)
+            if score < MIN_SCORE_THRESHOLD:
+                continue
+            candidates.append(StretchCandidate(token=token, score=score, features=features))
+        return candidates
+
+    def select_candidates(
+        self,
+        candidates: Sequence[StretchCandidate],
+        *,
+        rate: float,
+        rng: RandomLike,
+    ) -> list[StretchCandidate]:
+        if not candidates or rate <= 0:
+            return []
+
+        grouped: dict[int, list[StretchCandidate]] = {}
+        for candidate in candidates:
+            grouped.setdefault(candidate.token.clause_index, []).append(candidate)
+
+        selected: list[StretchCandidate] = []
+        total_expected = max(0, min(len(candidates), int(round(len(candidates) * rate))))
+
+        for clause_index in sorted(grouped):
+            clause_candidates = sorted(
+                grouped[clause_index], key=lambda c: (-c.score, c.token.start)
+            )
+            clause_candidates = clause_candidates[:MAX_CANDIDATES_PER_CLAUSE]
+            clause_quota = max(
+                0, min(len(clause_candidates), int(round(len(clause_candidates) * rate)))
+            )
+
+            provisional: list[StretchCandidate] = []
+            for candidate in clause_candidates:
+                probability = min(1.0, rate * (0.35 + 0.65 * candidate.score))
+                if rng.random() < probability:
+                    provisional.append(candidate)
+                if len(provisional) >= clause_quota:
+                    break
+
+            if len(provisional) < clause_quota:
+                leftovers = [c for c in clause_candidates if c not in provisional]
+                needed = clause_quota - len(provisional)
+                provisional.extend(leftovers[:needed])
+
+            selected.extend(provisional)
+
+        if len(selected) < total_expected:
+            remaining = [c for c in candidates if c not in selected]
+            remaining.sort(key=lambda c: (-c.score, c.token.start))
+            selected.extend(remaining[: total_expected - len(selected)])
+
+        # Keep deterministic order by position
+        selected.sort(key=lambda c: c.token.start)
+        return selected
+
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+    def _tokenise(self, text: str) -> list[TokenInfo]:
+        tokens: list[TokenInfo] = []
+        clause_index = 0
+        matches = list(TOKEN_REGEX.finditer(text))
+        for idx, match in enumerate(matches):
+            token_text = match.group(0)
+            is_word = bool(ALPHA_REGEX.search(token_text)) and token_text.strip().isalnum()
+            preceding = matches[idx - 1].group(0) if idx > 0 else ""
+            following = matches[idx + 1].group(0) if idx + 1 < len(matches) else ""
+            tokens.append(
+                TokenInfo(
+                    text=token_text,
+                    start=match.start(),
+                    end=match.end(),
+                    is_word=is_word,
+                    clause_index=clause_index,
+                    preceding_punct=preceding,
+                    following_punct=following,
+                    index=idx,
+                )
+            )
+            if any(ch in CLAUSE_PUNCTUATION for ch in token_text):
+                clause_index += 1
+        return tokens
+
+    def _excluded(self, token: TokenInfo, tokens: Sequence[TokenInfo], index: int) -> bool:
+        text = token.text
+        normalised = token.normalised
+        if sum(ch.isalpha() for ch in text) < 2:
+            return True
+        if any(ch.isdigit() for ch in text):
+            return True
+        lowered = normalised
+        if "http" in lowered or "www" in lowered or "//" in lowered:
+            return True
+        if any(symbol in text for symbol in {"#", "@", "&", "{", "}", "<", ">"}):
+            return True
+        if "_" in text:
+            return True
+        if "/" in text or "\\" in text:
+            return True
+
+        # Heuristic proper noun check: Title case mid-clause counts as proper noun
+        if text[:1].isupper() and text[1:].islower():
+            previous_clause_start = index == 0
+            if not previous_clause_start:
+                for prior in reversed(tokens[:index]):
+                    stripped = prior.text.strip()
+                    if not stripped:
+                        continue
+                    if stripped[-1] in CLAUSE_PUNCTUATION:
+                        previous_clause_start = True
+                    break
+            if not previous_clause_start:
+                return True
+        return False
+
+    def _compute_features(
+        self, token: TokenInfo, tokens: Sequence[TokenInfo], index: int
+    ) -> StretchabilityFeatures:
+        lexical = self.lexical_prior.get(token.normalised, 0.12)
+        pos_score = self._pos_score(token)
+        sentiment_score, sentiment_swing = self._sentiment(tokens, index)
+        phon_score = self._phonotactic(token.normalised)
+        context_score = self._contextual(token, tokens, index)
+        return StretchabilityFeatures(
+            lexical=lexical,
+            pos=pos_score,
+            sentiment=sentiment_score,
+            phonotactic=phon_score,
+            context=context_score,
+            sentiment_swing=sentiment_swing,
+        )
+
+    def _composite_score(self, features: StretchabilityFeatures) -> float:
+        lex_w, pos_w, sent_w, phon_w, ctx_w = self.weights
+        weighted = (
+            lex_w * features.lexical
+            + pos_w * features.pos
+            + sent_w * features.sentiment
+            + phon_w * features.phonotactic
+            + ctx_w * features.context
+        )
+        total_weight = sum(self.weights)
+        score = weighted / total_weight if total_weight else 0.0
+        return max(0.0, min(1.0, score))
+
+    # ------------------------------------------------------------------
+    # Feature helpers
+    # ------------------------------------------------------------------
+    def _pos_score(self, token: TokenInfo) -> float:
+        normalised = token.normalised
+        if normalised in INTERJECTIONS:
+            return 0.95
+        if normalised in INTENSIFIERS:
+            return 0.85
+        if normalised in EVALUATIVES:
+            return 0.7
+        if normalised.endswith("ly"):
+            return 0.55
+        if token.text.isupper() and len(token.text) > 1:
+            return 0.65
+        return 0.3
+
+    def _sentiment(self, tokens: Sequence[TokenInfo], index: int) -> tuple[float, float]:
+        window = [tok for tok in tokens[max(0, index - 2) : index + 3] if tok.is_word]
+        if not window:
+            return 0.5, 0.0
+        pos_hits = sum(1 for tok in window if tok.normalised in POSITIVE_LEXICON)
+        neg_hits = sum(1 for tok in window if tok.normalised in NEGATIVE_LEXICON)
+        total = len(window)
+        balance = (pos_hits - neg_hits) / total
+        sentiment_score = 0.5 + 0.5 * max(-1.0, min(1.0, balance))
+        swing = abs(balance)
+        return sentiment_score, swing
+
+    def _phonotactic(self, normalised: str) -> float:
+        if not any(ch in VOWELS for ch in normalised):
+            return 0.0
+        score = 0.25
+        if any(normalised.endswith(c) for c in SONORANT_CODAS):
+            score += 0.2
+        if any(normalised.endswith(c) for c in SIBILANT_CODAS):
+            score += 0.18
+        if any(digraph in normalised for digraph in DIGRAPHS):
+            score += 0.22
+        if re.search(r"[aeiouy]{2,}", normalised):
+            score += 0.22
+        if re.search(r"(.)(?!\1)(.)\1", normalised):
+            score += 0.08
+        return max(0.0, min(1.0, score))
+
+    def _contextual(self, token: TokenInfo, tokens: Sequence[TokenInfo], index: int) -> float:
+        score = 0.2
+        before = token.preceding_punct
+        after = token.following_punct
+        token_text = token.text
+        if after and after.count("!") >= 1:
+            score += 0.25
+        if after and after.count("?") >= 1:
+            score += 0.2
+        if before and before.count("!") >= 2:
+            score += 0.2
+        if after and ("!!" in after or "??" in after):
+            score += 0.15
+        if token_text.isupper() and len(token_text) > 1:
+            score += 0.25
+        if EMOJI_REGEX.search(before or "") or EMOJI_REGEX.search(after or ""):
+            score += 0.15
+        # Clause-final emphasis
+        if index + 1 < len(tokens):
+            trailing = tokens[index + 1].text
+            if any(p in trailing for p in {"!!!", "??", "?!"}):
+                score += 0.2
+        return max(0.0, min(1.0, score))
+
+
+__all__ = [
+    "StretchabilityAnalyzer",
+    "StretchCandidate",
+    "StretchabilityFeatures",
+    "TokenInfo",
+    "RandomLike",
+]
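
The analyzer is plain Python over the bundled hokey_assets.json, so it can be exercised on its own. A minimal sketch, assuming the packaged data assets are present; random.Random satisfies the RandomLike protocol because it exposes random():

import random

from glitchlings.util.stretchability import StretchabilityAnalyzer

analyzer = StretchabilityAnalyzer()
candidates = analyzer.analyse("omg that was sooo good!!")  # score word tokens per clause
chosen = analyzer.select_candidates(candidates, rate=0.5, rng=random.Random(0))
for candidate in chosen:
    # tokens Hokey could stretch, in positional order; may be empty
    # depending on the bundled priors and the MIN_SCORE_THRESHOLD cutoff
    print(candidate.token.text, round(candidate.score, 2))
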
glitchlings/zoo/__init__.py
CHANGED
@@ -14,6 +14,7 @@ from .core import (
     plan_glitchling_specs,
     plan_glitchlings,
 )
+from .hokey import Hokey, hokey
 from .jargoyle import Jargoyle, jargoyle
 from .jargoyle import dependencies_available as _jargoyle_available
 from .mim1c import Mim1c, mim1c
@@ -33,6 +34,8 @@ __all__ = [
     "jargoyle",
     "Apostrofae",
     "apostrofae",
+    "Hokey",
+    "hokey",
     "Adjax",
     "adjax",
     "Reduple",
@@ -61,7 +64,7 @@ __all__ = [
 
 _HAS_JARGOYLE = _jargoyle_available()
 
-_BUILTIN_GLITCHLING_LIST: list[Glitchling] = [typogre, apostrofae, mim1c]
+_BUILTIN_GLITCHLING_LIST: list[Glitchling] = [typogre, apostrofae, hokey, mim1c]
 if _HAS_JARGOYLE:
     _BUILTIN_GLITCHLING_LIST.append(jargoyle)
 _BUILTIN_GLITCHLING_LIST.extend([adjax, reduple, rushmore, redactyl, scannequin, zeedub])
@@ -73,6 +76,7 @@ BUILTIN_GLITCHLINGS: dict[str, Glitchling] = {
 _BUILTIN_GLITCHLING_TYPES: dict[str, type[Glitchling]] = {
     typogre.name.lower(): Typogre,
     apostrofae.name.lower(): Apostrofae,
+    hokey.name.lower(): Hokey,
     mim1c.name.lower(): Mim1c,
     adjax.name.lower(): Adjax,
     reduple.name.lower(): Reduple,