glitchlings 0.4.5__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of glitchlings might be problematic. Click here for more details.
- glitchlings/__init__.py +71 -0
- glitchlings/__main__.py +8 -0
- glitchlings/_zoo_rust.cp311-win_amd64.pyd +0 -0
- glitchlings/compat.py +282 -0
- glitchlings/config.py +386 -0
- glitchlings/config.toml +3 -0
- glitchlings/data/__init__.py +1 -0
- glitchlings/data/hokey_assets.json +193 -0
- glitchlings/dlc/__init__.py +7 -0
- glitchlings/dlc/_shared.py +153 -0
- glitchlings/dlc/huggingface.py +81 -0
- glitchlings/dlc/prime.py +254 -0
- glitchlings/dlc/pytorch.py +166 -0
- glitchlings/dlc/pytorch_lightning.py +209 -0
- glitchlings/lexicon/__init__.py +192 -0
- glitchlings/lexicon/_cache.py +108 -0
- glitchlings/lexicon/data/default_vector_cache.json +82 -0
- glitchlings/lexicon/metrics.py +162 -0
- glitchlings/lexicon/vector.py +652 -0
- glitchlings/lexicon/wordnet.py +228 -0
- glitchlings/main.py +364 -0
- glitchlings/util/__init__.py +195 -0
- glitchlings/util/adapters.py +27 -0
- glitchlings/util/hokey_generator.py +144 -0
- glitchlings/util/stretch_locator.py +140 -0
- glitchlings/util/stretchability.py +375 -0
- glitchlings/zoo/__init__.py +172 -0
- glitchlings/zoo/_ocr_confusions.py +32 -0
- glitchlings/zoo/_rate.py +131 -0
- glitchlings/zoo/_rust_extensions.py +143 -0
- glitchlings/zoo/_sampling.py +54 -0
- glitchlings/zoo/_text_utils.py +100 -0
- glitchlings/zoo/adjax.py +128 -0
- glitchlings/zoo/apostrofae.py +127 -0
- glitchlings/zoo/assets/__init__.py +0 -0
- glitchlings/zoo/assets/apostrofae_pairs.json +32 -0
- glitchlings/zoo/core.py +582 -0
- glitchlings/zoo/hokey.py +173 -0
- glitchlings/zoo/jargoyle.py +335 -0
- glitchlings/zoo/mim1c.py +109 -0
- glitchlings/zoo/ocr_confusions.tsv +30 -0
- glitchlings/zoo/redactyl.py +193 -0
- glitchlings/zoo/reduple.py +148 -0
- glitchlings/zoo/rushmore.py +153 -0
- glitchlings/zoo/scannequin.py +171 -0
- glitchlings/zoo/typogre.py +231 -0
- glitchlings/zoo/zeedub.py +185 -0
- glitchlings-0.4.5.dist-info/METADATA +648 -0
- glitchlings-0.4.5.dist-info/RECORD +53 -0
- glitchlings-0.4.5.dist-info/WHEEL +5 -0
- glitchlings-0.4.5.dist-info/entry_points.txt +2 -0
- glitchlings-0.4.5.dist-info/licenses/LICENSE +201 -0
- glitchlings-0.4.5.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
import difflib
|
|
2
|
+
from collections.abc import Iterable
|
|
3
|
+
|
|
4
|
+
# Public names re-exported by glitchlings.util.
__all__ = [
    "SAMPLE_TEXT",
    "string_diffs",
    "KeyNeighborMap",
    "KeyboardLayouts",
    "KeyNeighbors",
    "KEYNEIGHBORS",
]
|
|
12
|
+
|
|
13
|
+
# Canonical demo text: the opening of Kafka's "The Metamorphosis"
# (public-domain English translation), used as sample input.
SAMPLE_TEXT = (
    "One morning, when Gregor Samsa woke from troubled dreams, he found himself "
    "transformed in his bed into a horrible vermin. He lay on his armour-like back, and "
    "if he lifted his head a little he could see his brown belly, slightly domed and "
    "divided by arches into stiff sections. The bedding was hardly able to cover it and "
    "seemed ready to slide off any moment. His many legs, pitifully thin compared with "
    "the size of the rest of him, waved about helplessly as he looked."
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def string_diffs(a: str, b: str) -> list[list[tuple[str, str, str]]]:
    """Group adjacent non-``equal`` SequenceMatcher opcodes between two strings.

    Each group is a list of ``(tag, a_text, b_text)`` tuples describing
    consecutive edit operations; runs of matching text separate the groups.
    """
    matcher = difflib.SequenceMatcher(None, a, b)
    grouped: list[list[tuple[str, str, str]]] = []
    pending: list[tuple[str, str, str]] = []

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag != "equal":
            pending.append((tag, a[i1:i2], b[j1:j2]))
        elif pending:
            # A matching run ends the current group of edits.
            grouped.append(pending)
            pending = []

    # Edits that ran to the end of the strings still form a group.
    if pending:
        grouped.append(pending)
    return grouped
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
KeyNeighborMap = dict[str, list[str]]
|
|
52
|
+
KeyboardLayouts = dict[str, KeyNeighborMap]
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _build_neighbor_map(rows: Iterable[str]) -> KeyNeighborMap:
|
|
56
|
+
"""Derive 8-neighbour adjacency lists from keyboard layout rows."""
|
|
57
|
+
grid: dict[tuple[int, int], str] = {}
|
|
58
|
+
for y, row in enumerate(rows):
|
|
59
|
+
for x, char in enumerate(row):
|
|
60
|
+
if char == " ":
|
|
61
|
+
continue
|
|
62
|
+
grid[(x, y)] = char.lower()
|
|
63
|
+
|
|
64
|
+
neighbors: KeyNeighborMap = {}
|
|
65
|
+
for (x, y), char in grid.items():
|
|
66
|
+
seen: list[str] = []
|
|
67
|
+
for dy in (-1, 0, 1):
|
|
68
|
+
for dx in (-1, 0, 1):
|
|
69
|
+
if dx == 0 and dy == 0:
|
|
70
|
+
continue
|
|
71
|
+
candidate = grid.get((x + dx, y + dy))
|
|
72
|
+
if candidate is None:
|
|
73
|
+
continue
|
|
74
|
+
seen.append(candidate)
|
|
75
|
+
# Preserve encounter order but drop duplicates for determinism
|
|
76
|
+
deduped = list(dict.fromkeys(seen))
|
|
77
|
+
neighbors[char] = deduped
|
|
78
|
+
|
|
79
|
+
return neighbors
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# Hand-curated QWERTY adjacency map. Unlike the generated layouts below,
# these lists include digits and punctuation neighbours, and a literal " "
# entry where the space bar is reachable. Keys are lower-case letters only.
_KEYNEIGHBORS: KeyboardLayouts = {
    "CURATOR_QWERTY": {
        "a": [*"qwsz"],
        "b": [*"vghn "],
        "c": [*"xdfv "],
        "d": [*"serfcx"],
        "e": [*"wsdrf34"],
        "f": [*"drtgvc"],
        "g": [*"ftyhbv"],
        "h": [*"gyujnb"],
        "i": [*"ujko89"],
        "j": [*"huikmn"],
        "k": [*"jilom,"],
        "l": [*"kop;.,"],
        "m": [*"njk, "],
        "n": [*"bhjm "],
        "o": [*"iklp90"],
        "p": [*"o0-[;l"],
        "q": [*"was 12"],
        "r": [*"edft45"],
        "s": [*"awedxz"],
        "t": [*"r56ygf"],
        "u": [*"y78ijh"],
        "v": [*"cfgb "],
        "w": [*"q23esa"],
        "x": [*"zsdc "],
        "y": [*"t67uhg"],
        "z": [*"asx"],
    }
}
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _register_layout(name: str, rows: Iterable[str]) -> None:
    """Compute the adjacency map for ``rows`` and store it under ``name``."""
    _KEYNEIGHBORS[name] = _build_neighbor_map(rows)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
# Register the geometry-derived layouts. Each tuple lists physical key rows;
# leading spaces stagger the rows so vertical neighbours line up in the grid.
_register_layout(
    "DVORAK",
    (
        "`1234567890[]\\",
        " ',.pyfgcrl/=\\",
        " aoeuidhtns-",
        " ;qjkxbmwvz",
    ),
)

_register_layout(
    "COLEMAK",
    (
        "`1234567890-=",
        " qwfpgjluy;[]\\",
        " arstdhneio'",
        " zxcvbkm,./",
    ),
)

_register_layout(
    "QWERTY",
    (
        "`1234567890-=",
        " qwertyuiop[]\\",
        " asdfghjkl;'",
        " zxcvbnm,./",
    ),
)

# French layout.
_register_layout(
    "AZERTY",
    (
        "²&é\"'(-è_çà)=",
        " azertyuiop^$",
        " qsdfghjklmù*",
        " <wxcvbn,;:!",
    ),
)

# German layout.
_register_layout(
    "QWERTZ",
    (
        "^1234567890ß´",
        " qwertzuiopü+",
        " asdfghjklöä#",
        " yxcvbnm,.-",
    ),
)

_register_layout(
    "SPANISH_QWERTY",
    (
        "º1234567890'¡",
        " qwertyuiop´+",
        " asdfghjklñ´",
        " <zxcvbnm,.-",
    ),
)

_register_layout(
    "SWEDISH_QWERTY",
    (
        "§1234567890+´",
        " qwertyuiopå¨",
        " asdfghjklöä'",
        " <zxcvbnm,.-",
    ),
)
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
class KeyNeighbors:
    """Attribute-style access to every registered keyboard layout.

    Each layout name (e.g. ``QWERTY``, ``DVORAK``, ``CURATOR_QWERTY``)
    becomes an instance attribute holding that layout's adjacency map.
    """

    def __init__(self) -> None:
        # Mirror every registered layout onto this instance in one step.
        self.__dict__.update(_KEYNEIGHBORS)


# Module-level singleton; layouts registered above are available on it.
KEYNEIGHBORS: KeyNeighbors = KeyNeighbors()
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Adapter helpers shared across Python and DLC integrations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Iterable
|
|
6
|
+
|
|
7
|
+
from ..zoo import Gaggle, Glitchling, summon
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def coerce_gaggle(
    glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling],
    *,
    seed: int,
) -> Gaggle:
    """Return a :class:`Gaggle` built from any supported glitchling specifier.

    An existing :class:`Gaggle` is returned unchanged; a lone name or
    :class:`Glitchling` is wrapped in a one-element list; any other iterable
    is materialised and summoned with the given ``seed``.
    """
    if isinstance(glitchlings, Gaggle):
        return glitchlings

    if isinstance(glitchlings, (Glitchling, str)):
        spec = [glitchlings]
    else:
        spec = list(glitchlings)

    return summon(spec, seed=seed)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# Single public helper of this adapter module.
__all__ = ["coerce_gaggle"]
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""Hokey expressive lengthening generator."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
from .stretch_locator import StretchSite, apply_stretch, find_stretch_site
|
|
8
|
+
from .stretchability import RandomLike, StretchabilityAnalyzer, StretchabilityFeatures
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass(slots=True)
class HokeyConfig:
    """Tunable parameters for the expressive-lengthening pass."""

    # Fraction of candidate tokens to stretch (forwarded to the analyzer's
    # select_candidates).
    rate: float = 0.3
    # Bounds on the number of extra character repeats per stretch.
    extension_min: int = 2
    extension_max: int = 5
    # Base success probability for the negative-binomial length sampler.
    base_p: float = 0.45
    # Words with more alphabetic characters than this are dampened; words
    # with more than double are skipped (see HokeyGenerator.generate).
    word_length_threshold: int = 6
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(slots=True)
class StretchEvent:
    """Record of one applied stretch, returned for inspection/logging."""

    # Position of the affected token in the token stream.
    token_index: int
    # Token text before stretching.
    original: str
    # Token text after stretching.
    stretched: str
    # Number of extra grapheme repeats that were inserted.
    repeats: int
    # Where within the token the stretch was applied.
    site: StretchSite
    # Candidate score and analyzer features that drove the decision.
    score: float
    features: StretchabilityFeatures
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class NegativeBinomialSampler:
    """Sample stretch lengths from a clipped negative binomial distribution."""

    def __init__(self, base_p: float = 0.45) -> None:
        # Success probability of each underlying geometric draw.
        self.base_p = base_p

    def sample(
        self,
        rng: RandomLike,
        *,
        intensity: float,
        minimum: int,
        maximum: int,
    ) -> int:
        """Draw a stretch length clipped to ``[minimum, maximum]``.

        Higher ``intensity`` adds geometric components and lowers the
        success probability, both of which lengthen the result.
        """
        lo = max(0, int(minimum))
        hi = max(lo, int(maximum))
        if hi == 0:
            return 0
        if hi == lo:
            # Degenerate range: no randomness needed.
            return hi

        components = max(1, int(round(1 + 2 * intensity)))
        p = self.base_p / (1.0 + 0.75 * max(0.0, intensity))
        p = min(0.95, max(0.05, p))

        total_failures = 0
        for _ in range(components):
            total_failures += self._geometric_sample(rng, p)
        return max(lo, min(hi, lo + total_failures))

    @staticmethod
    def _geometric_sample(rng: RandomLike, p: float) -> int:
        """Count failures before the first success at probability ``p``."""
        failures = 0
        while rng.random() > p:
            failures += 1
        return failures
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class HokeyGenerator:
    """Full expressive lengthening pipeline.

    Tokenises text, scores tokens for "stretchability", selects a subset,
    and elongates one grapheme site per selected token.
    """

    def __init__(
        self,
        analyzer: StretchabilityAnalyzer | None = None,
        sampler: NegativeBinomialSampler | None = None,
    ) -> None:
        # Collaborators default to fresh instances so the generator is
        # usable with zero configuration.
        self.analyzer = analyzer or StretchabilityAnalyzer()
        self.sampler = sampler or NegativeBinomialSampler()

    def generate(
        self,
        text: str,
        *,
        rng: RandomLike,
        config: HokeyConfig,
    ) -> tuple[str, list[StretchEvent]]:
        """Stretch selected tokens in ``text``.

        Returns ``(new_text, events)`` where ``events`` records every
        applied stretch. Empty input and empty selections return the text
        unchanged with no events.
        """
        if not text:
            return text, []

        # Keep the sampler's success probability in sync with this call's
        # configuration (the sampler is shared across calls).
        if config.base_p != self.sampler.base_p:
            self.sampler.base_p = config.base_p

        tokens = self.analyzer.tokenise(text)
        candidates = self.analyzer.analyse_tokens(tokens)
        selected = self.analyzer.select_candidates(candidates, rate=config.rate, rng=rng)
        if not selected:
            return text, []

        # Mutable copy of the token texts; stretched tokens replace originals.
        token_strings = [token.text for token in tokens]
        events: list[StretchEvent] = []

        for candidate in selected:
            token_idx = candidate.token.index
            original = token_strings[token_idx]
            site = find_stretch_site(original)
            if site is None:
                # Token has no stretchable grapheme (e.g. no letters).
                continue

            intensity = min(1.5, candidate.features.intensity() + 0.35 * candidate.score)
            alpha_count = sum(1 for ch in original if ch.isalpha())
            # Words more than twice the threshold are never stretched.
            if config.word_length_threshold > 0 and alpha_count > config.word_length_threshold * 2:
                continue
            # Words above the threshold get dampened intensity, and weak
            # candidates well past the threshold are dropped outright.
            if config.word_length_threshold > 0 and alpha_count > config.word_length_threshold:
                excess = alpha_count - config.word_length_threshold
                intensity = intensity / (1.0 + 0.35 * excess)
                if candidate.score < 0.35 and excess >= 2:
                    continue
            intensity = max(0.05, intensity)

            repeats = self.sampler.sample(
                rng,
                intensity=intensity,
                minimum=config.extension_min,
                maximum=config.extension_max,
            )
            if repeats <= 0:
                continue

            stretched_word = apply_stretch(original, site, repeats)
            token_strings[token_idx] = stretched_word
            events.append(
                StretchEvent(
                    token_index=token_idx,
                    original=original,
                    stretched=stretched_word,
                    repeats=repeats,
                    site=site,
                    score=candidate.score,
                    features=candidate.features,
                )
            )

        # Tokens are assumed to cover the input exactly, so joining with no
        # separator reconstructs the text (presumably whitespace tokens are
        # included — TODO confirm against the analyzer's tokeniser).
        return "".join(token_strings), events
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# Public surface of the hokey generator module.
__all__ = ["HokeyGenerator", "HokeyConfig", "StretchEvent", "NegativeBinomialSampler"]
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
"""Identify where expressive stretches should occur within a token."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Iterable
|
|
7
|
+
|
|
8
|
+
# Letters treated as vowels for cluster detection ("y" in both cases).
VOWELS = set("aeiouyAEIOUY")
# Consonants that sound natural when elongated at a word's end.
SONORANTS = set("rlmnwyhRLMNWYH")
# NOTE(review): the two-character entries ("sh", "SH", ...) can never match
# a single-character membership test, and SIBILANTS is not referenced in
# this module — confirm intended usage at call sites.
SIBILANTS = set("sSzZxXcCjJ") | {"sh", "Sh", "sH", "SH", "zh", "Zh"}
# Vowel pairs recognised as digraphs when categorising stretch sites.
DIGRAPHS = {
    "aa",
    "ae",
    "ai",
    "ay",
    "ee",
    "ei",
    "ey",
    "ie",
    "io",
    "oa",
    "oe",
    "oi",
    "oo",
    "ou",
    "ua",
    "ue",
    "ui",
    "ya",
    "yo",
    "yu",
}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass(slots=True)
class StretchSite:
    """Location of a stretchable grapheme."""

    # Half-open [start, end) character span within the token.
    start: int
    end: int
    # Site kind assigned by find_stretch_site:
    # "coda", "consonant", "cvce", "vowel", or "digraph".
    category: str

    def unit(self, token: str) -> str:
        """Return the characters of ``token`` covered by this site."""
        return token[self.start : self.end]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _alpha_indices(token: str) -> list[int]:
|
|
48
|
+
return [idx for idx, char in enumerate(token) if char.isalpha()]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _vowel_clusters(token: str, indices: Iterable[int]) -> list[tuple[int, int]]:
    """Group vowel positions from ``indices`` into half-open spans.

    A run closes when a non-vowel index is reached (the span ends at that
    index) or when the supplied indices skip a position (the span ends just
    after the previously seen index).
    """
    spans: list[tuple[int, int]] = []
    open_start: int | None = None
    previous: int | None = None

    for position in indices:
        if token[position] in VOWELS:
            if open_start is None:
                open_start = position
            elif previous is not None and position != previous + 1:
                # Gap in the index sequence: close the run, start a new one.
                spans.append((open_start, previous + 1))
                open_start = position
        elif open_start is not None:
            spans.append((open_start, position))
            open_start = None
        previous = position

    # A run still open at the end extends through the last seen index.
    if open_start is not None and previous is not None:
        spans.append((open_start, previous + 1))
    return spans
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def find_stretch_site(token: str) -> StretchSite | None:
    """Return the most suitable stretch site for ``token``.

    Candidates are collected with integer priorities and the best one
    (highest priority, then longest span, then earliest start) wins.
    Returns ``None`` when the token has no alphabetic characters.
    """

    alpha_indices = _alpha_indices(token)
    if not alpha_indices:
        return None

    lower = token.lower()
    clusters = _vowel_clusters(lower, alpha_indices)
    candidates: list[tuple[int, StretchSite]] = []

    # Sibilant/sonorant coda extension (yes -> yesss, hmm -> hmmmm)
    last_idx = alpha_indices[-1]
    last_char = lower[last_idx]
    if len(alpha_indices) >= 2:
        prev_char = lower[alpha_indices[-2]]
    else:
        prev_char = ""
    # A multi-vowel cluster (ignoring a word-initial "y") makes vowel
    # stretching preferable to coda extension.
    has_multi_vowel = any(
        (end - start >= 2) and not (lower[start] == "y" and start == 0) for start, end in clusters
    )
    if last_char in {"s", "z"} and prev_char in VOWELS and not has_multi_vowel:
        candidates.append((5, StretchSite(last_idx, last_idx + 1, "coda")))
    elif last_char in SONORANTS and prev_char in VOWELS and not has_multi_vowel:
        candidates.append((4, StretchSite(last_idx, last_idx + 1, "coda")))
    elif not clusters:
        # No vowels at all: fall back to doubling the final consonant.
        candidates.append((2, StretchSite(last_idx, last_idx + 1, "consonant")))

    # CVCe pattern (cute -> cuuute)
    if lower.endswith("e") and len(alpha_indices) >= 3:
        final_letter = alpha_indices[-1]
        if token[final_letter].lower() == "e":
            c_idx = alpha_indices[-2]
            v_idx = alpha_indices[-3]
            if token[c_idx].lower() not in VOWELS and token[v_idx].lower() in VOWELS:
                # Stretch the vowel before the consonant, not the silent "e".
                candidates.append((4, StretchSite(v_idx, v_idx + 1, "cvce")))

    # Every vowel cluster is a candidate; the final cluster ranks higher.
    for cluster in clusters:
        start, end = cluster
        substring = lower[start:end]
        category = "vowel"
        if any(substring[i : i + 2] in DIGRAPHS for i in range(max(0, len(substring) - 1))):
            category = "digraph"
        priority = 3 if cluster == clusters[-1] else 2
        candidates.append((priority, StretchSite(start, end, category)))

    if not candidates:
        return None

    # Ascending sort; the last element is the winner: highest priority,
    # then widest span, then smallest start index (via the negated start).
    candidates.sort(key=lambda item: (item[0], item[1].end - item[1].start, -item[1].start))
    return candidates[-1][1]
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def apply_stretch(token: str, site: StretchSite, repeats: int) -> str:
    """Return ``token`` with ``repeats`` extra copies of the grapheme at ``site``.

    Characters inside the half-open ``[site.start, site.end)`` span are each
    emitted ``repeats + 1`` times; everything else is unchanged. Non-positive
    ``repeats`` returns the token as-is.
    """
    if repeats <= 0:
        return token
    return "".join(
        symbol * (repeats + 1) if site.start <= position < site.end else symbol
        for position, symbol in enumerate(token)
    )
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
# Public surface of the stretch-locator module.
__all__ = ["StretchSite", "find_stretch_site", "apply_stretch"]
|