glitchlings 0.4.3-cp312-cp312-macosx_11_0_universal2.whl → 0.4.5-cp312-cp312-macosx_11_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. glitchlings/__init__.py +4 -0
  2. glitchlings/_zoo_rust.cpython-312-darwin.so +0 -0
  3. glitchlings/compat.py +2 -4
  4. glitchlings/config.py +2 -4
  5. glitchlings/data/__init__.py +1 -0
  6. glitchlings/data/hokey_assets.json +193 -0
  7. glitchlings/dlc/_shared.py +86 -1
  8. glitchlings/dlc/huggingface.py +6 -6
  9. glitchlings/dlc/prime.py +1 -1
  10. glitchlings/dlc/pytorch.py +9 -59
  11. glitchlings/dlc/pytorch_lightning.py +10 -34
  12. glitchlings/lexicon/__init__.py +5 -1
  13. glitchlings/lexicon/_cache.py +3 -5
  14. glitchlings/lexicon/vector.py +6 -5
  15. glitchlings/lexicon/wordnet.py +4 -8
  16. glitchlings/util/hokey_generator.py +144 -0
  17. glitchlings/util/stretch_locator.py +140 -0
  18. glitchlings/util/stretchability.py +375 -0
  19. glitchlings/zoo/__init__.py +5 -1
  20. glitchlings/zoo/_rate.py +114 -1
  21. glitchlings/zoo/_rust_extensions.py +143 -0
  22. glitchlings/zoo/adjax.py +3 -4
  23. glitchlings/zoo/apostrofae.py +3 -4
  24. glitchlings/zoo/core.py +21 -9
  25. glitchlings/zoo/hokey.py +173 -0
  26. glitchlings/zoo/jargoyle.py +6 -2
  27. glitchlings/zoo/redactyl.py +4 -5
  28. glitchlings/zoo/reduple.py +3 -4
  29. glitchlings/zoo/rushmore.py +3 -4
  30. glitchlings/zoo/scannequin.py +3 -4
  31. glitchlings/zoo/typogre.py +3 -4
  32. glitchlings/zoo/zeedub.py +3 -4
  33. {glitchlings-0.4.3.dist-info → glitchlings-0.4.5.dist-info}/METADATA +32 -8
  34. glitchlings-0.4.5.dist-info/RECORD +53 -0
  35. glitchlings-0.4.3.dist-info/RECORD +0 -46
  36. {glitchlings-0.4.3.dist-info → glitchlings-0.4.5.dist-info}/WHEEL +0 -0
  37. {glitchlings-0.4.3.dist-info → glitchlings-0.4.5.dist-info}/entry_points.txt +0 -0
  38. {glitchlings-0.4.3.dist-info → glitchlings-0.4.5.dist-info}/licenses/LICENSE +0 -0
  39. {glitchlings-0.4.3.dist-info → glitchlings-0.4.5.dist-info}/top_level.txt +0 -0
@@ -13,21 +13,17 @@ from ._cache import CacheSnapshot
 
 
 class _LemmaProtocol(Protocol):
-    def name(self) -> str:
-        ...
+    def name(self) -> str: ...
 
 
 class _SynsetProtocol(Protocol):
-    def lemmas(self) -> Sequence[_LemmaProtocol]:
-        ...
+    def lemmas(self) -> Sequence[_LemmaProtocol]: ...
 
 
 class _WordNetResource(Protocol):
-    def synsets(self, word: str, pos: str | None = None) -> Sequence[_SynsetProtocol]:
-        ...
+    def synsets(self, word: str, pos: str | None = None) -> Sequence[_SynsetProtocol]: ...
 
-    def ensure_loaded(self) -> None:
-        ...
+    def ensure_loaded(self) -> None: ...
 
 
 WordNetCorpusReaderFactory = Callable[[Any, Any], _WordNetResource]
@@ -0,0 +1,144 @@
+"""Hokey expressive lengthening generator."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from .stretch_locator import StretchSite, apply_stretch, find_stretch_site
+from .stretchability import RandomLike, StretchabilityAnalyzer, StretchabilityFeatures
+
+
+@dataclass(slots=True)
+class HokeyConfig:
+    rate: float = 0.3
+    extension_min: int = 2
+    extension_max: int = 5
+    base_p: float = 0.45
+    word_length_threshold: int = 6
+
+
+@dataclass(slots=True)
+class StretchEvent:
+    token_index: int
+    original: str
+    stretched: str
+    repeats: int
+    site: StretchSite
+    score: float
+    features: StretchabilityFeatures
+
+
+class NegativeBinomialSampler:
+    """Sample stretch lengths from a clipped negative binomial distribution."""
+
+    def __init__(self, base_p: float = 0.45) -> None:
+        self.base_p = base_p
+
+    def sample(
+        self,
+        rng: RandomLike,
+        *,
+        intensity: float,
+        minimum: int,
+        maximum: int,
+    ) -> int:
+        minimum = max(0, int(minimum))
+        maximum = max(minimum, int(maximum))
+        if maximum == 0:
+            return 0
+        if maximum == minimum:
+            return maximum
+
+        r = max(1, int(round(1 + 2 * intensity)))
+        adjusted_p = self.base_p / (1.0 + 0.75 * max(0.0, intensity))
+        adjusted_p = max(0.05, min(0.95, adjusted_p))
+        failures = sum(self._geometric_sample(rng, adjusted_p) for _ in range(r))
+        extra = minimum + failures
+        return max(minimum, min(maximum, extra))
+
+    @staticmethod
+    def _geometric_sample(rng: RandomLike, p: float) -> int:
+        count = 0
+        while rng.random() > p:
+            count += 1
+        return count
+
+
+class HokeyGenerator:
+    """Full expressive lengthening pipeline."""
+
+    def __init__(
+        self,
+        analyzer: StretchabilityAnalyzer | None = None,
+        sampler: NegativeBinomialSampler | None = None,
+    ) -> None:
+        self.analyzer = analyzer or StretchabilityAnalyzer()
+        self.sampler = sampler or NegativeBinomialSampler()
+
+    def generate(
+        self,
+        text: str,
+        *,
+        rng: RandomLike,
+        config: HokeyConfig,
+    ) -> tuple[str, list[StretchEvent]]:
+        if not text:
+            return text, []
+
+        if config.base_p != self.sampler.base_p:
+            self.sampler.base_p = config.base_p
+
+        tokens = self.analyzer.tokenise(text)
+        candidates = self.analyzer.analyse_tokens(tokens)
+        selected = self.analyzer.select_candidates(candidates, rate=config.rate, rng=rng)
+        if not selected:
+            return text, []
+
+        token_strings = [token.text for token in tokens]
+        events: list[StretchEvent] = []
+
+        for candidate in selected:
+            token_idx = candidate.token.index
+            original = token_strings[token_idx]
+            site = find_stretch_site(original)
+            if site is None:
+                continue
+
+            intensity = min(1.5, candidate.features.intensity() + 0.35 * candidate.score)
+            alpha_count = sum(1 for ch in original if ch.isalpha())
+            if config.word_length_threshold > 0 and alpha_count > config.word_length_threshold * 2:
+                continue
+            if config.word_length_threshold > 0 and alpha_count > config.word_length_threshold:
+                excess = alpha_count - config.word_length_threshold
+                intensity = intensity / (1.0 + 0.35 * excess)
+                if candidate.score < 0.35 and excess >= 2:
+                    continue
+                intensity = max(0.05, intensity)
+
+            repeats = self.sampler.sample(
+                rng,
+                intensity=intensity,
+                minimum=config.extension_min,
+                maximum=config.extension_max,
+            )
+            if repeats <= 0:
+                continue
+
+            stretched_word = apply_stretch(original, site, repeats)
+            token_strings[token_idx] = stretched_word
+            events.append(
+                StretchEvent(
+                    token_index=token_idx,
+                    original=original,
+                    stretched=stretched_word,
+                    repeats=repeats,
+                    site=site,
+                    score=candidate.score,
+                    features=candidate.features,
+                )
+            )
+
+        return "".join(token_strings), events
+
+
+__all__ = ["HokeyGenerator", "HokeyConfig", "StretchEvent", "NegativeBinomialSampler"]
@@ -0,0 +1,140 @@
+"""Identify where expressive stretches should occur within a token."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Iterable
+
+VOWELS = set("aeiouyAEIOUY")
+SONORANTS = set("rlmnwyhRLMNWYH")
+SIBILANTS = set("sSzZxXcCjJ") | {"sh", "Sh", "sH", "SH", "zh", "Zh"}
+DIGRAPHS = {
+    "aa",
+    "ae",
+    "ai",
+    "ay",
+    "ee",
+    "ei",
+    "ey",
+    "ie",
+    "io",
+    "oa",
+    "oe",
+    "oi",
+    "oo",
+    "ou",
+    "ua",
+    "ue",
+    "ui",
+    "ya",
+    "yo",
+    "yu",
+}
+
+
+@dataclass(slots=True)
+class StretchSite:
+    """Location of a stretchable grapheme."""
+
+    start: int
+    end: int
+    category: str
+
+    def unit(self, token: str) -> str:
+        return token[self.start : self.end]
+
+
+def _alpha_indices(token: str) -> list[int]:
+    return [idx for idx, char in enumerate(token) if char.isalpha()]
+
+
+def _vowel_clusters(token: str, indices: Iterable[int]) -> list[tuple[int, int]]:
+    clusters: list[tuple[int, int]] = []
+    start: int | None = None
+    prev_idx: int | None = None
+    for idx in indices:
+        char = token[idx]
+        if char in VOWELS:
+            if start is None:
+                start = idx
+            elif prev_idx is not None and idx != prev_idx + 1:
+                clusters.append((start, prev_idx + 1))
+                start = idx
+        else:
+            if start is not None:
+                clusters.append((start, idx))
+                start = None
+        prev_idx = idx
+    if start is not None and prev_idx is not None:
+        clusters.append((start, prev_idx + 1))
+    return clusters
+
+
+def find_stretch_site(token: str) -> StretchSite | None:
+    """Return the most suitable stretch site for ``token``."""
+
+    alpha_indices = _alpha_indices(token)
+    if not alpha_indices:
+        return None
+
+    lower = token.lower()
+    clusters = _vowel_clusters(lower, alpha_indices)
+    candidates: list[tuple[int, StretchSite]] = []
+
+    # Sibilant/sonorant coda extension (yes -> yesss, hmm -> hmmmm)
+    last_idx = alpha_indices[-1]
+    last_char = lower[last_idx]
+    if len(alpha_indices) >= 2:
+        prev_char = lower[alpha_indices[-2]]
+    else:
+        prev_char = ""
+    has_multi_vowel = any(
+        (end - start >= 2) and not (lower[start] == "y" and start == 0) for start, end in clusters
+    )
+    if last_char in {"s", "z"} and prev_char in VOWELS and not has_multi_vowel:
+        candidates.append((5, StretchSite(last_idx, last_idx + 1, "coda")))
+    elif last_char in SONORANTS and prev_char in VOWELS and not has_multi_vowel:
+        candidates.append((4, StretchSite(last_idx, last_idx + 1, "coda")))
+    elif not clusters:
+        candidates.append((2, StretchSite(last_idx, last_idx + 1, "consonant")))
+
+    # CVCe pattern (cute -> cuuute)
+    if lower.endswith("e") and len(alpha_indices) >= 3:
+        final_letter = alpha_indices[-1]
+        if token[final_letter].lower() == "e":
+            c_idx = alpha_indices[-2]
+            v_idx = alpha_indices[-3]
+            if token[c_idx].lower() not in VOWELS and token[v_idx].lower() in VOWELS:
+                candidates.append((4, StretchSite(v_idx, v_idx + 1, "cvce")))
+
+    for cluster in clusters:
+        start, end = cluster
+        substring = lower[start:end]
+        category = "vowel"
+        if any(substring[i : i + 2] in DIGRAPHS for i in range(max(0, len(substring) - 1))):
+            category = "digraph"
+        priority = 3 if cluster == clusters[-1] else 2
+        candidates.append((priority, StretchSite(start, end, category)))
+
+    if not candidates:
+        return None
+
+    candidates.sort(key=lambda item: (item[0], item[1].end - item[1].start, -item[1].start))
+    return candidates[-1][1]
+
+
+def apply_stretch(token: str, site: StretchSite, repeats: int) -> str:
+    """Return ``token`` with ``repeats`` extra copies of the grapheme at ``site``."""
+
+    if repeats <= 0:
+        return token
+    chars = list(token)
+    stretched: list[str] = []
+    for idx, char in enumerate(chars):
+        stretched.append(char)
+        if site.start <= idx < site.end:
+            stretched.append(char * repeats)
+    return "".join(stretched)
+
+
+__all__ = ["StretchSite", "find_stretch_site", "apply_stretch"]
@@ -0,0 +1,375 @@
+"""Stretchability scoring and candidate selection for Hokey."""
+
+from __future__ import annotations
+
+import json
+import re
+from dataclasses import dataclass
+from importlib import resources
+from typing import Any, Protocol, Sequence, TypedDict, cast
+
+# Regexes reused across the module
+TOKEN_REGEX = re.compile(r"\w+|\W+")
+ALPHA_REGEX = re.compile(r"[A-Za-z]")
+EMOJI_REGEX = re.compile(r"[\U0001F300-\U0001FAFF]")
+CLAUSE_PUNCTUATION = {".", "?", "!", ";"}
+
+
+class HokeyAssets(TypedDict):
+    lexical_prior: dict[str, float]
+    interjections: list[str]
+    intensifiers: list[str]
+    evaluatives: list[str]
+    positive_lexicon: list[str]
+    negative_lexicon: list[str]
+
+
+class RandomLike(Protocol):
+    """Interface for RNGs that expose ``random()``."""
+
+    def random(self) -> float: ...
+
+
+# Lexical prior probabilities and pragmatic lexica shared with the Rust fast path.
+def _load_assets() -> HokeyAssets:
+    with (
+        resources.files("glitchlings.data")
+        .joinpath("hokey_assets.json")
+        .open("r", encoding="utf-8") as payload
+    ):
+        data: Any = json.load(payload)
+    return cast(HokeyAssets, data)
+
+
+_ASSETS = _load_assets()
+LEXICAL_PRIOR: dict[str, float] = {
+    token: float(score) for token, score in _ASSETS["lexical_prior"].items()
+}
+
+# Pragmatic lexica for POS/discourse cues
+INTERJECTIONS = frozenset(_ASSETS["interjections"])
+INTENSIFIERS = frozenset(_ASSETS["intensifiers"])
+EVALUATIVES = frozenset(_ASSETS["evaluatives"])
+POSITIVE_LEXICON = frozenset(_ASSETS["positive_lexicon"])
+NEGATIVE_LEXICON = frozenset(_ASSETS["negative_lexicon"])
+
+VOWELS = set("aeiouy")
+SONORANT_CODAS = set("rlmnwyh")
+SIBILANT_CODAS = {"s", "z", "x", "c", "j", "sh", "zh"}
+DIGRAPHS = {
+    "aa",
+    "ae",
+    "ai",
+    "ay",
+    "ee",
+    "ei",
+    "ey",
+    "ie",
+    "oa",
+    "oe",
+    "oi",
+    "oo",
+    "ou",
+    "ue",
+    "ui",
+}
+
+MAX_CANDIDATES_PER_CLAUSE = 4
+MIN_SCORE_THRESHOLD = 0.18
+
+
+@dataclass(slots=True)
+class TokenInfo:
+    text: str
+    start: int
+    end: int
+    is_word: bool
+    clause_index: int
+    preceding_punct: str
+    following_punct: str
+    index: int
+
+    @property
+    def normalised(self) -> str:
+        return self.text.lower()
+
+
+@dataclass(slots=True)
+class StretchabilityFeatures:
+    lexical: float
+    pos: float
+    sentiment: float
+    phonotactic: float
+    context: float
+    sentiment_swing: float
+
+    def intensity(self) -> float:
+        """Map features to an intensity scalar in [0, 1.5]."""
+        emphasis = 0.6 * self.context + 0.4 * self.sentiment_swing
+        return max(0.0, min(1.5, 0.5 * (self.lexical + self.phonotactic) + emphasis))
+
+
+@dataclass(slots=True)
+class StretchCandidate:
+    token: TokenInfo
+    score: float
+    features: StretchabilityFeatures
+
+
+class StretchabilityAnalyzer:
+    """Compute stretchability scores and select candidates."""
+
+    def __init__(
+        self,
+        *,
+        lexical_prior: dict[str, float] | None = None,
+        weights: tuple[float, float, float, float, float] = (0.32, 0.18, 0.14, 0.22, 0.14),
+    ) -> None:
+        self.lexical_prior = lexical_prior or LEXICAL_PRIOR
+        self.weights = weights
+
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+    def tokenise(self, text: str) -> list[TokenInfo]:
+        """Tokenise text preserving separator tokens."""
+        return self._tokenise(text)
+
+    def analyse(self, text: str) -> list[StretchCandidate]:
+        if not text:
+            return []
+        tokens = self._tokenise(text)
+        return self.analyse_tokens(tokens)
+
+    def analyse_tokens(self, tokens: Sequence[TokenInfo]) -> list[StretchCandidate]:
+        candidates: list[StretchCandidate] = []
+        for idx, token in enumerate(tokens):
+            if not token.is_word:
+                continue
+            if self._excluded(token, tokens, idx):
+                continue
+
+            features = self._compute_features(token, tokens, idx)
+            score = self._composite_score(features)
+            if score < MIN_SCORE_THRESHOLD:
+                continue
+            candidates.append(StretchCandidate(token=token, score=score, features=features))
+        return candidates
+
+    def select_candidates(
+        self,
+        candidates: Sequence[StretchCandidate],
+        *,
+        rate: float,
+        rng: RandomLike,
+    ) -> list[StretchCandidate]:
+        if not candidates or rate <= 0:
+            return []
+
+        grouped: dict[int, list[StretchCandidate]] = {}
+        for candidate in candidates:
+            grouped.setdefault(candidate.token.clause_index, []).append(candidate)
+
+        selected: list[StretchCandidate] = []
+        total_expected = max(0, min(len(candidates), int(round(len(candidates) * rate))))
+
+        for clause_index in sorted(grouped):
+            clause_candidates = sorted(
+                grouped[clause_index], key=lambda c: (-c.score, c.token.start)
+            )
+            clause_candidates = clause_candidates[:MAX_CANDIDATES_PER_CLAUSE]
+            clause_quota = max(
+                0, min(len(clause_candidates), int(round(len(clause_candidates) * rate)))
+            )
+
+            provisional: list[StretchCandidate] = []
+            for candidate in clause_candidates:
+                probability = min(1.0, rate * (0.35 + 0.65 * candidate.score))
+                if rng.random() < probability:
+                    provisional.append(candidate)
+                if len(provisional) >= clause_quota:
+                    break
+
+            if len(provisional) < clause_quota:
+                leftovers = [c for c in clause_candidates if c not in provisional]
+                needed = clause_quota - len(provisional)
+                provisional.extend(leftovers[:needed])
+
+            selected.extend(provisional)
+
+        if len(selected) < total_expected:
+            remaining = [c for c in candidates if c not in selected]
+            remaining.sort(key=lambda c: (-c.score, c.token.start))
+            selected.extend(remaining[: total_expected - len(selected)])
+
+        # Keep deterministic order by position
+        selected.sort(key=lambda c: c.token.start)
+        return selected
+
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+    def _tokenise(self, text: str) -> list[TokenInfo]:
+        tokens: list[TokenInfo] = []
+        clause_index = 0
+        matches = list(TOKEN_REGEX.finditer(text))
+        for idx, match in enumerate(matches):
+            token_text = match.group(0)
+            is_word = bool(ALPHA_REGEX.search(token_text)) and token_text.strip().isalnum()
+            preceding = matches[idx - 1].group(0) if idx > 0 else ""
+            following = matches[idx + 1].group(0) if idx + 1 < len(matches) else ""
+            tokens.append(
+                TokenInfo(
+                    text=token_text,
+                    start=match.start(),
+                    end=match.end(),
+                    is_word=is_word,
+                    clause_index=clause_index,
+                    preceding_punct=preceding,
+                    following_punct=following,
+                    index=idx,
+                )
+            )
+            if any(ch in CLAUSE_PUNCTUATION for ch in token_text):
+                clause_index += 1
+        return tokens
+
+    def _excluded(self, token: TokenInfo, tokens: Sequence[TokenInfo], index: int) -> bool:
+        text = token.text
+        normalised = token.normalised
+        if sum(ch.isalpha() for ch in text) < 2:
+            return True
+        if any(ch.isdigit() for ch in text):
+            return True
+        lowered = normalised
+        if "http" in lowered or "www" in lowered or "//" in lowered:
+            return True
+        if any(symbol in text for symbol in {"#", "@", "&", "{", "}", "<", ">"}):
+            return True
+        if "_" in text:
+            return True
+        if "/" in text or "\\" in text:
+            return True
+
+        # Heuristic proper noun check: Title case mid-clause counts as proper noun
+        if text[:1].isupper() and text[1:].islower():
+            previous_clause_start = index == 0
+            if not previous_clause_start:
+                for prior in reversed(tokens[:index]):
+                    stripped = prior.text.strip()
+                    if not stripped:
+                        continue
+                    if stripped[-1] in CLAUSE_PUNCTUATION:
+                        previous_clause_start = True
+                    break
+            if not previous_clause_start:
+                return True
+        return False
+
+    def _compute_features(
+        self, token: TokenInfo, tokens: Sequence[TokenInfo], index: int
+    ) -> StretchabilityFeatures:
+        lexical = self.lexical_prior.get(token.normalised, 0.12)
+        pos_score = self._pos_score(token)
+        sentiment_score, sentiment_swing = self._sentiment(tokens, index)
+        phon_score = self._phonotactic(token.normalised)
+        context_score = self._contextual(token, tokens, index)
+        return StretchabilityFeatures(
+            lexical=lexical,
+            pos=pos_score,
+            sentiment=sentiment_score,
+            phonotactic=phon_score,
+            context=context_score,
+            sentiment_swing=sentiment_swing,
+        )
+
+    def _composite_score(self, features: StretchabilityFeatures) -> float:
+        lex_w, pos_w, sent_w, phon_w, ctx_w = self.weights
+        weighted = (
+            lex_w * features.lexical
+            + pos_w * features.pos
+            + sent_w * features.sentiment
+            + phon_w * features.phonotactic
+            + ctx_w * features.context
+        )
+        total_weight = sum(self.weights)
+        score = weighted / total_weight if total_weight else 0.0
+        return max(0.0, min(1.0, score))
+
+    # ------------------------------------------------------------------
+    # Feature helpers
+    # ------------------------------------------------------------------
+    def _pos_score(self, token: TokenInfo) -> float:
+        normalised = token.normalised
+        if normalised in INTERJECTIONS:
+            return 0.95
+        if normalised in INTENSIFIERS:
+            return 0.85
+        if normalised in EVALUATIVES:
+            return 0.7
+        if normalised.endswith("ly"):
+            return 0.55
+        if token.text.isupper() and len(token.text) > 1:
+            return 0.65
+        return 0.3
+
+    def _sentiment(self, tokens: Sequence[TokenInfo], index: int) -> tuple[float, float]:
+        window = [tok for tok in tokens[max(0, index - 2) : index + 3] if tok.is_word]
+        if not window:
+            return 0.5, 0.0
+        pos_hits = sum(1 for tok in window if tok.normalised in POSITIVE_LEXICON)
+        neg_hits = sum(1 for tok in window if tok.normalised in NEGATIVE_LEXICON)
+        total = len(window)
+        balance = (pos_hits - neg_hits) / total
+        sentiment_score = 0.5 + 0.5 * max(-1.0, min(1.0, balance))
+        swing = abs(balance)
+        return sentiment_score, swing
+
+    def _phonotactic(self, normalised: str) -> float:
+        if not any(ch in VOWELS for ch in normalised):
+            return 0.0
+        score = 0.25
+        if any(normalised.endswith(c) for c in SONORANT_CODAS):
+            score += 0.2
+        if any(normalised.endswith(c) for c in SIBILANT_CODAS):
+            score += 0.18
+        if any(digraph in normalised for digraph in DIGRAPHS):
+            score += 0.22
+        if re.search(r"[aeiouy]{2,}", normalised):
+            score += 0.22
+        if re.search(r"(.)(?!\1)(.)\1", normalised):
+            score += 0.08
+        return max(0.0, min(1.0, score))
+
+    def _contextual(self, token: TokenInfo, tokens: Sequence[TokenInfo], index: int) -> float:
+        score = 0.2
+        before = token.preceding_punct
+        after = token.following_punct
+        token_text = token.text
+        if after and after.count("!") >= 1:
+            score += 0.25
+        if after and after.count("?") >= 1:
+            score += 0.2
+        if before and before.count("!") >= 2:
+            score += 0.2
+        if after and ("!!" in after or "??" in after):
+            score += 0.15
+        if token_text.isupper() and len(token_text) > 1:
+            score += 0.25
+        if EMOJI_REGEX.search(before or "") or EMOJI_REGEX.search(after or ""):
+            score += 0.15
+        # Clause-final emphasis
+        if index + 1 < len(tokens):
+            trailing = tokens[index + 1].text
+            if any(p in trailing for p in {"!!!", "??", "?!"}):
+                score += 0.2
+        return max(0.0, min(1.0, score))
+
+
+__all__ = [
+    "StretchabilityAnalyzer",
+    "StretchCandidate",
+    "StretchabilityFeatures",
+    "TokenInfo",
+    "RandomLike",
+]
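
The analyzer is also usable standalone, which is how HokeyGenerator drives it above. A hedged sketch of the analyse/select flow (not part of the diff; the sample sentence is illustrative, and which tokens clear MIN_SCORE_THRESHOLD depends on the bundled hokey_assets.json):

import random

from glitchlings.util.stretchability import StretchabilityAnalyzer

analyzer = StretchabilityAnalyzer()
rng = random.Random(7)

candidates = analyzer.analyse("omg this is so good, I love it!!")
for candidate in candidates:
    # Composite score plus the per-feature breakdown used to compute it.
    print(candidate.token.text, round(candidate.score, 3), candidate.features)

# select_candidates enforces the per-clause cap and the overall rate.
chosen = analyzer.select_candidates(candidates, rate=0.5, rng=rng)
print([c.token.text for c in chosen])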