glitchlings-0.4.5-cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of glitchlings was flagged as potentially problematic by the registry.
- glitchlings/__init__.py +71 -0
- glitchlings/__main__.py +8 -0
- glitchlings/_zoo_rust.cp311-win_amd64.pyd +0 -0
- glitchlings/compat.py +282 -0
- glitchlings/config.py +386 -0
- glitchlings/config.toml +3 -0
- glitchlings/data/__init__.py +1 -0
- glitchlings/data/hokey_assets.json +193 -0
- glitchlings/dlc/__init__.py +7 -0
- glitchlings/dlc/_shared.py +153 -0
- glitchlings/dlc/huggingface.py +81 -0
- glitchlings/dlc/prime.py +254 -0
- glitchlings/dlc/pytorch.py +166 -0
- glitchlings/dlc/pytorch_lightning.py +209 -0
- glitchlings/lexicon/__init__.py +192 -0
- glitchlings/lexicon/_cache.py +108 -0
- glitchlings/lexicon/data/default_vector_cache.json +82 -0
- glitchlings/lexicon/metrics.py +162 -0
- glitchlings/lexicon/vector.py +652 -0
- glitchlings/lexicon/wordnet.py +228 -0
- glitchlings/main.py +364 -0
- glitchlings/util/__init__.py +195 -0
- glitchlings/util/adapters.py +27 -0
- glitchlings/util/hokey_generator.py +144 -0
- glitchlings/util/stretch_locator.py +140 -0
- glitchlings/util/stretchability.py +375 -0
- glitchlings/zoo/__init__.py +172 -0
- glitchlings/zoo/_ocr_confusions.py +32 -0
- glitchlings/zoo/_rate.py +131 -0
- glitchlings/zoo/_rust_extensions.py +143 -0
- glitchlings/zoo/_sampling.py +54 -0
- glitchlings/zoo/_text_utils.py +100 -0
- glitchlings/zoo/adjax.py +128 -0
- glitchlings/zoo/apostrofae.py +127 -0
- glitchlings/zoo/assets/__init__.py +0 -0
- glitchlings/zoo/assets/apostrofae_pairs.json +32 -0
- glitchlings/zoo/core.py +582 -0
- glitchlings/zoo/hokey.py +173 -0
- glitchlings/zoo/jargoyle.py +335 -0
- glitchlings/zoo/mim1c.py +109 -0
- glitchlings/zoo/ocr_confusions.tsv +30 -0
- glitchlings/zoo/redactyl.py +193 -0
- glitchlings/zoo/reduple.py +148 -0
- glitchlings/zoo/rushmore.py +153 -0
- glitchlings/zoo/scannequin.py +171 -0
- glitchlings/zoo/typogre.py +231 -0
- glitchlings/zoo/zeedub.py +185 -0
- glitchlings-0.4.5.dist-info/METADATA +648 -0
- glitchlings-0.4.5.dist-info/RECORD +53 -0
- glitchlings-0.4.5.dist-info/WHEEL +5 -0
- glitchlings-0.4.5.dist-info/entry_points.txt +2 -0
- glitchlings-0.4.5.dist-info/licenses/LICENSE +201 -0
- glitchlings-0.4.5.dist-info/top_level.txt +1 -0
glitchlings/util/stretchability.py
ADDED

@@ -0,0 +1,375 @@
"""Stretchability scoring and candidate selection for Hokey."""

from __future__ import annotations

import json
import re
from dataclasses import dataclass
from importlib import resources
from typing import Any, Protocol, Sequence, TypedDict, cast

# Regexes reused across the module
TOKEN_REGEX = re.compile(r"\w+|\W+")
ALPHA_REGEX = re.compile(r"[A-Za-z]")
EMOJI_REGEX = re.compile(r"[\U0001F300-\U0001FAFF]")
CLAUSE_PUNCTUATION = {".", "?", "!", ";"}


class HokeyAssets(TypedDict):
    lexical_prior: dict[str, float]
    interjections: list[str]
    intensifiers: list[str]
    evaluatives: list[str]
    positive_lexicon: list[str]
    negative_lexicon: list[str]


class RandomLike(Protocol):
    """Interface for RNGs that expose ``random()``."""

    def random(self) -> float: ...


# Lexical prior probabilities and pragmatic lexica shared with the Rust fast path.
def _load_assets() -> HokeyAssets:
    with (
        resources.files("glitchlings.data")
        .joinpath("hokey_assets.json")
        .open("r", encoding="utf-8") as payload
    ):
        data: Any = json.load(payload)
    return cast(HokeyAssets, data)


_ASSETS = _load_assets()
LEXICAL_PRIOR: dict[str, float] = {
    token: float(score) for token, score in _ASSETS["lexical_prior"].items()
}

# Pragmatic lexica for POS/discourse cues
INTERJECTIONS = frozenset(_ASSETS["interjections"])
INTENSIFIERS = frozenset(_ASSETS["intensifiers"])
EVALUATIVES = frozenset(_ASSETS["evaluatives"])
POSITIVE_LEXICON = frozenset(_ASSETS["positive_lexicon"])
NEGATIVE_LEXICON = frozenset(_ASSETS["negative_lexicon"])

VOWELS = set("aeiouy")
SONORANT_CODAS = set("rlmnwyh")
SIBILANT_CODAS = {"s", "z", "x", "c", "j", "sh", "zh"}
DIGRAPHS = {
    "aa",
    "ae",
    "ai",
    "ay",
    "ee",
    "ei",
    "ey",
    "ie",
    "oa",
    "oe",
    "oi",
    "oo",
    "ou",
    "ue",
    "ui",
}

MAX_CANDIDATES_PER_CLAUSE = 4
MIN_SCORE_THRESHOLD = 0.18


@dataclass(slots=True)
class TokenInfo:
    text: str
    start: int
    end: int
    is_word: bool
    clause_index: int
    preceding_punct: str
    following_punct: str
    index: int

    @property
    def normalised(self) -> str:
        return self.text.lower()


@dataclass(slots=True)
class StretchabilityFeatures:
    lexical: float
    pos: float
    sentiment: float
    phonotactic: float
    context: float
    sentiment_swing: float

    def intensity(self) -> float:
        """Map features to an intensity scalar in [0, 1.5]."""
        emphasis = 0.6 * self.context + 0.4 * self.sentiment_swing
        return max(0.0, min(1.5, 0.5 * (self.lexical + self.phonotactic) + emphasis))


@dataclass(slots=True)
class StretchCandidate:
    token: TokenInfo
    score: float
    features: StretchabilityFeatures


class StretchabilityAnalyzer:
    """Compute stretchability scores and select candidates."""

    def __init__(
        self,
        *,
        lexical_prior: dict[str, float] | None = None,
        weights: tuple[float, float, float, float, float] = (0.32, 0.18, 0.14, 0.22, 0.14),
    ) -> None:
        self.lexical_prior = lexical_prior or LEXICAL_PRIOR
        self.weights = weights

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def tokenise(self, text: str) -> list[TokenInfo]:
        """Tokenise text preserving separator tokens."""
        return self._tokenise(text)

    def analyse(self, text: str) -> list[StretchCandidate]:
        if not text:
            return []
        tokens = self._tokenise(text)
        return self.analyse_tokens(tokens)

    def analyse_tokens(self, tokens: Sequence[TokenInfo]) -> list[StretchCandidate]:
        candidates: list[StretchCandidate] = []
        for idx, token in enumerate(tokens):
            if not token.is_word:
                continue
            if self._excluded(token, tokens, idx):
                continue

            features = self._compute_features(token, tokens, idx)
            score = self._composite_score(features)
            if score < MIN_SCORE_THRESHOLD:
                continue
            candidates.append(StretchCandidate(token=token, score=score, features=features))
        return candidates

    def select_candidates(
        self,
        candidates: Sequence[StretchCandidate],
        *,
        rate: float,
        rng: RandomLike,
    ) -> list[StretchCandidate]:
        if not candidates or rate <= 0:
            return []

        grouped: dict[int, list[StretchCandidate]] = {}
        for candidate in candidates:
            grouped.setdefault(candidate.token.clause_index, []).append(candidate)

        selected: list[StretchCandidate] = []
        total_expected = max(0, min(len(candidates), int(round(len(candidates) * rate))))

        for clause_index in sorted(grouped):
            clause_candidates = sorted(
                grouped[clause_index], key=lambda c: (-c.score, c.token.start)
            )
            clause_candidates = clause_candidates[:MAX_CANDIDATES_PER_CLAUSE]
            clause_quota = max(
                0, min(len(clause_candidates), int(round(len(clause_candidates) * rate)))
            )

            provisional: list[StretchCandidate] = []
            for candidate in clause_candidates:
                probability = min(1.0, rate * (0.35 + 0.65 * candidate.score))
                if rng.random() < probability:
                    provisional.append(candidate)
                if len(provisional) >= clause_quota:
                    break

            if len(provisional) < clause_quota:
                leftovers = [c for c in clause_candidates if c not in provisional]
                needed = clause_quota - len(provisional)
                provisional.extend(leftovers[:needed])

            selected.extend(provisional)

        if len(selected) < total_expected:
            remaining = [c for c in candidates if c not in selected]
            remaining.sort(key=lambda c: (-c.score, c.token.start))
            selected.extend(remaining[: total_expected - len(selected)])

        # Keep deterministic order by position
        selected.sort(key=lambda c: c.token.start)
        return selected

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _tokenise(self, text: str) -> list[TokenInfo]:
        tokens: list[TokenInfo] = []
        clause_index = 0
        matches = list(TOKEN_REGEX.finditer(text))
        for idx, match in enumerate(matches):
            token_text = match.group(0)
            is_word = bool(ALPHA_REGEX.search(token_text)) and token_text.strip().isalnum()
            preceding = matches[idx - 1].group(0) if idx > 0 else ""
            following = matches[idx + 1].group(0) if idx + 1 < len(matches) else ""
            tokens.append(
                TokenInfo(
                    text=token_text,
                    start=match.start(),
                    end=match.end(),
                    is_word=is_word,
                    clause_index=clause_index,
                    preceding_punct=preceding,
                    following_punct=following,
                    index=idx,
                )
            )
            if any(ch in CLAUSE_PUNCTUATION for ch in token_text):
                clause_index += 1
        return tokens

    def _excluded(self, token: TokenInfo, tokens: Sequence[TokenInfo], index: int) -> bool:
        text = token.text
        normalised = token.normalised
        if sum(ch.isalpha() for ch in text) < 2:
            return True
        if any(ch.isdigit() for ch in text):
            return True
        lowered = normalised
        if "http" in lowered or "www" in lowered or "//" in lowered:
            return True
        if any(symbol in text for symbol in {"#", "@", "&", "{", "}", "<", ">"}):
            return True
        if "_" in text:
            return True
        if "/" in text or "\\" in text:
            return True

        # Heuristic proper noun check: Title case mid-clause counts as proper noun
        if text[:1].isupper() and text[1:].islower():
            previous_clause_start = index == 0
            if not previous_clause_start:
                for prior in reversed(tokens[:index]):
                    stripped = prior.text.strip()
                    if not stripped:
                        continue
                    if stripped[-1] in CLAUSE_PUNCTUATION:
                        previous_clause_start = True
                    break
            if not previous_clause_start:
                return True
        return False

    def _compute_features(
        self, token: TokenInfo, tokens: Sequence[TokenInfo], index: int
    ) -> StretchabilityFeatures:
        lexical = self.lexical_prior.get(token.normalised, 0.12)
        pos_score = self._pos_score(token)
        sentiment_score, sentiment_swing = self._sentiment(tokens, index)
        phon_score = self._phonotactic(token.normalised)
        context_score = self._contextual(token, tokens, index)
        return StretchabilityFeatures(
            lexical=lexical,
            pos=pos_score,
            sentiment=sentiment_score,
            phonotactic=phon_score,
            context=context_score,
            sentiment_swing=sentiment_swing,
        )

    def _composite_score(self, features: StretchabilityFeatures) -> float:
        lex_w, pos_w, sent_w, phon_w, ctx_w = self.weights
        weighted = (
            lex_w * features.lexical
            + pos_w * features.pos
            + sent_w * features.sentiment
            + phon_w * features.phonotactic
            + ctx_w * features.context
        )
        total_weight = sum(self.weights)
        score = weighted / total_weight if total_weight else 0.0
        return max(0.0, min(1.0, score))

    # ------------------------------------------------------------------
    # Feature helpers
    # ------------------------------------------------------------------
    def _pos_score(self, token: TokenInfo) -> float:
        normalised = token.normalised
        if normalised in INTERJECTIONS:
            return 0.95
        if normalised in INTENSIFIERS:
            return 0.85
        if normalised in EVALUATIVES:
            return 0.7
        if normalised.endswith("ly"):
            return 0.55
        if token.text.isupper() and len(token.text) > 1:
            return 0.65
        return 0.3

    def _sentiment(self, tokens: Sequence[TokenInfo], index: int) -> tuple[float, float]:
        window = [tok for tok in tokens[max(0, index - 2) : index + 3] if tok.is_word]
        if not window:
            return 0.5, 0.0
        pos_hits = sum(1 for tok in window if tok.normalised in POSITIVE_LEXICON)
        neg_hits = sum(1 for tok in window if tok.normalised in NEGATIVE_LEXICON)
        total = len(window)
        balance = (pos_hits - neg_hits) / total
        sentiment_score = 0.5 + 0.5 * max(-1.0, min(1.0, balance))
        swing = abs(balance)
        return sentiment_score, swing

    def _phonotactic(self, normalised: str) -> float:
        if not any(ch in VOWELS for ch in normalised):
            return 0.0
        score = 0.25
        if any(normalised.endswith(c) for c in SONORANT_CODAS):
            score += 0.2
        if any(normalised.endswith(c) for c in SIBILANT_CODAS):
            score += 0.18
        if any(digraph in normalised for digraph in DIGRAPHS):
            score += 0.22
        if re.search(r"[aeiouy]{2,}", normalised):
            score += 0.22
        if re.search(r"(.)(?!\1)(.)\1", normalised):
            score += 0.08
        return max(0.0, min(1.0, score))

    def _contextual(self, token: TokenInfo, tokens: Sequence[TokenInfo], index: int) -> float:
        score = 0.2
        before = token.preceding_punct
        after = token.following_punct
        token_text = token.text
        if after and after.count("!") >= 1:
            score += 0.25
        if after and after.count("?") >= 1:
            score += 0.2
        if before and before.count("!") >= 2:
            score += 0.2
        if after and ("!!" in after or "??" in after):
            score += 0.15
        if token_text.isupper() and len(token_text) > 1:
            score += 0.25
        if EMOJI_REGEX.search(before or "") or EMOJI_REGEX.search(after or ""):
            score += 0.15
        # Clause-final emphasis
        if index + 1 < len(tokens):
            trailing = tokens[index + 1].text
            if any(p in trailing for p in {"!!!", "??", "?!"}):
                score += 0.2
        return max(0.0, min(1.0, score))


__all__ = [
    "StretchabilityAnalyzer",
    "StretchCandidate",
    "StretchabilityFeatures",
    "TokenInfo",
    "RandomLike",
]
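For orientation, a minimal usage sketch of the analyzer's public API above. This is not part of the package; it assumes the wheel is installed so the bundled hokey_assets.json loads at import time. random.Random satisfies the RandomLike protocol since it exposes random().

# Illustrative sketch (not from the package).
import random

from glitchlings.util.stretchability import StretchabilityAnalyzer

analyzer = StretchabilityAnalyzer()
candidates = analyzer.analyse("Wow, that was sooo good!!")
for cand in candidates:
    # Composite score in [0, 1]; intensity() maps features into [0, 1.5].
    print(cand.token.text, round(cand.score, 2), round(cand.features.intensity(), 2))

# Per-clause stochastic selection; a seeded RNG keeps the choice reproducible.
chosen = analyzer.select_candidates(candidates, rate=0.5, rng=random.Random(151))
print([c.token.text for c in chosen])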
glitchlings/zoo/__init__.py
ADDED

@@ -0,0 +1,172 @@
from __future__ import annotations

import ast
from typing import Any

from .adjax import Adjax, adjax
from .apostrofae import Apostrofae, apostrofae
from .core import (
    Gaggle,
    Glitchling,
    is_rust_pipeline_enabled,
    is_rust_pipeline_supported,
    pipeline_feature_flag_enabled,
    plan_glitchling_specs,
    plan_glitchlings,
)
from .hokey import Hokey, hokey
from .jargoyle import Jargoyle, jargoyle
from .jargoyle import dependencies_available as _jargoyle_available
from .mim1c import Mim1c, mim1c
from .redactyl import Redactyl, redactyl
from .reduple import Reduple, reduple
from .rushmore import Rushmore, rushmore
from .scannequin import Scannequin, scannequin
from .typogre import Typogre, typogre
from .zeedub import Zeedub, zeedub

__all__ = [
    "Typogre",
    "typogre",
    "Mim1c",
    "mim1c",
    "Jargoyle",
    "jargoyle",
    "Apostrofae",
    "apostrofae",
    "Hokey",
    "hokey",
    "Adjax",
    "adjax",
    "Reduple",
    "reduple",
    "Rushmore",
    "rushmore",
    "Redactyl",
    "redactyl",
    "Scannequin",
    "scannequin",
    "Zeedub",
    "zeedub",
    "Glitchling",
    "Gaggle",
    "plan_glitchlings",
    "plan_glitchling_specs",
    "is_rust_pipeline_enabled",
    "is_rust_pipeline_supported",
    "pipeline_feature_flag_enabled",
    "summon",
    "BUILTIN_GLITCHLINGS",
    "DEFAULT_GLITCHLING_NAMES",
    "parse_glitchling_spec",
    "get_glitchling_class",
]

_HAS_JARGOYLE = _jargoyle_available()

_BUILTIN_GLITCHLING_LIST: list[Glitchling] = [typogre, apostrofae, hokey, mim1c]
if _HAS_JARGOYLE:
    _BUILTIN_GLITCHLING_LIST.append(jargoyle)
_BUILTIN_GLITCHLING_LIST.extend([adjax, reduple, rushmore, redactyl, scannequin, zeedub])

BUILTIN_GLITCHLINGS: dict[str, Glitchling] = {
    glitchling.name.lower(): glitchling for glitchling in _BUILTIN_GLITCHLING_LIST
}

_BUILTIN_GLITCHLING_TYPES: dict[str, type[Glitchling]] = {
    typogre.name.lower(): Typogre,
    apostrofae.name.lower(): Apostrofae,
    hokey.name.lower(): Hokey,
    mim1c.name.lower(): Mim1c,
    adjax.name.lower(): Adjax,
    reduple.name.lower(): Reduple,
    rushmore.name.lower(): Rushmore,
    redactyl.name.lower(): Redactyl,
    scannequin.name.lower(): Scannequin,
    zeedub.name.lower(): Zeedub,
}
if _HAS_JARGOYLE:
    _BUILTIN_GLITCHLING_TYPES[jargoyle.name.lower()] = Jargoyle

DEFAULT_GLITCHLING_NAMES: list[str] = list(BUILTIN_GLITCHLINGS.keys())


def parse_glitchling_spec(specification: str) -> Glitchling:
    """Return a glitchling instance configured according to ``specification``."""
    text = specification.strip()
    if not text:
        raise ValueError("Glitchling specification cannot be empty.")

    if "(" not in text:
        glitchling = BUILTIN_GLITCHLINGS.get(text.lower())
        if glitchling is None:
            raise ValueError(f"Glitchling '{text}' not found.")
        return glitchling

    if not text.endswith(")"):
        raise ValueError(f"Invalid parameter syntax for glitchling '{text}'.")

    name_part, arg_source = text[:-1].split("(", 1)
    name = name_part.strip()
    if not name:
        raise ValueError(f"Invalid glitchling specification '{text}'.")

    lower_name = name.lower()
    glitchling_type = _BUILTIN_GLITCHLING_TYPES.get(lower_name)
    if glitchling_type is None:
        raise ValueError(f"Glitchling '{name}' not found.")

    try:
        call_expr = ast.parse(f"_({arg_source})", mode="eval").body
    except SyntaxError as exc:
        raise ValueError(f"Invalid parameter syntax for glitchling '{name}': {exc.msg}") from exc

    if not isinstance(call_expr, ast.Call) or call_expr.args:
        raise ValueError(f"Glitchling '{name}' parameters must be provided as keyword arguments.")

    kwargs: dict[str, Any] = {}
    for keyword in call_expr.keywords:
        if keyword.arg is None:
            raise ValueError(
                f"Glitchling '{name}' does not support unpacking arbitrary keyword arguments."
            )
        try:
            kwargs[keyword.arg] = ast.literal_eval(keyword.value)
        except (ValueError, SyntaxError) as exc:
            raise ValueError(
                f"Failed to parse value for parameter '{keyword.arg}' on glitchling '{name}': {exc}"
            ) from exc

    try:
        return glitchling_type(**kwargs)
    except TypeError as exc:
        raise ValueError(f"Failed to instantiate glitchling '{name}': {exc}") from exc


def get_glitchling_class(name: str) -> type[Glitchling]:
    """Look up the glitchling class registered under ``name``."""
    key = name.strip().lower()
    if not key:
        raise ValueError("Glitchling name cannot be empty.")

    glitchling_type = _BUILTIN_GLITCHLING_TYPES.get(key)
    if glitchling_type is None:
        raise ValueError(f"Glitchling '{name}' not found.")

    return glitchling_type


def summon(glitchlings: list[str | Glitchling], seed: int = 151) -> Gaggle:
    """Summon glitchlings by name (using defaults) or instance (to change parameters)."""
    summoned: list[Glitchling] = []
    for entry in glitchlings:
        if isinstance(entry, Glitchling):
            summoned.append(entry)
            continue

        try:
            summoned.append(parse_glitchling_spec(entry))
        except ValueError as exc:
            raise ValueError(str(exc)) from exc

    return Gaggle(summoned, seed=seed)
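The spec grammar accepted by parse_glitchling_spec is a registry name, optionally followed by a parenthesized list of keyword-only literal arguments; names are matched case-insensitively. A hedged sketch of how that looks to a caller, not taken from the package docs; the specific 'rate' keyword is an assumption based on the standardized parameter in _rate.py below, and actual constructor signatures may differ.

# Illustrative sketch (not from the package).
from glitchlings.zoo import get_glitchling_class, parse_glitchling_spec, summon

gaggle = summon(["Typogre", "hokey(rate=0.3)"], seed=151)  # 'rate' kwarg assumed
glitch = parse_glitchling_spec("typogre")                  # bare name -> default instance
cls = get_glitchling_class("Hokey")                        # class lookup, case-insensitive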
glitchlings/zoo/_ocr_confusions.py
ADDED

@@ -0,0 +1,32 @@
from __future__ import annotations

from importlib import resources

_CONFUSION_TABLE: list[tuple[str, list[str]]] | None = None


def load_confusion_table() -> list[tuple[str, list[str]]]:
    """Load the OCR confusion table shared by Python and Rust implementations."""
    global _CONFUSION_TABLE
    if _CONFUSION_TABLE is not None:
        return _CONFUSION_TABLE

    data = resources.files(__package__) / "ocr_confusions.tsv"
    text = data.read_text(encoding="utf-8")
    indexed_entries: list[tuple[int, tuple[str, list[str]]]] = []
    for line_number, line in enumerate(text.splitlines()):
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            continue
        parts = stripped.split()
        if len(parts) < 2:
            continue
        source, *replacements = parts
        indexed_entries.append((line_number, (source, replacements)))

    # Sort longer patterns first to avoid overlapping matches, mirroring the
    # behaviour of the Rust `confusion_table` helper.
    indexed_entries.sort(key=lambda item: (-len(item[1][0]), item[0]))
    entries = [entry for _, entry in indexed_entries]
    _CONFUSION_TABLE = entries
    return entries
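The sort key above orders entries by descending source-pattern length, breaking ties by original file order, so multi-character confusions like "rn" are tried before their single-character prefixes. A standalone sketch of just that ordering, with made-up rows rather than the real TSV contents:

# Illustrative sketch (not from the package); rows are invented examples.
rows = [(0, ("m", ["rn"])), (1, ("rn", ["m"])), (2, ("l", ["1", "|"]))]
rows.sort(key=lambda item: (-len(item[1][0]), item[0]))
print([entry for _, entry in rows])
# [('rn', ['m']), ('m', ['rn']), ('l', ['1', '|'])] -- longest pattern first, then file order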
glitchlings/zoo/_rate.py
ADDED

@@ -0,0 +1,131 @@
"""Utilities for handling legacy parameter names across glitchling classes."""

from __future__ import annotations

import warnings


def resolve_rate(
    *,
    rate: float | None,
    legacy_value: float | None,
    default: float,
    legacy_name: str,
) -> float:
    """Return the effective rate while enforcing mutual exclusivity.

    This function centralizes the handling of legacy parameter names, allowing
    glitchlings to maintain backwards compatibility while encouraging migration
    to the standardized 'rate' parameter.

    Parameters
    ----------
    rate : float | None
        The preferred parameter value.
    legacy_value : float | None
        The deprecated legacy parameter value.
    default : float
        Default value if neither parameter is specified.
    legacy_name : str
        Name of the legacy parameter for error/warning messages.

    Returns
    -------
    float
        The resolved rate value.

    Raises
    ------
    ValueError
        If both rate and legacy_value are specified simultaneously.

    Warnings
    --------
    DeprecationWarning
        If the legacy parameter is used, a deprecation warning is issued.

    Examples
    --------
    >>> resolve_rate(rate=0.5, legacy_value=None, default=0.1, legacy_name="old_rate")
    0.5
    >>> resolve_rate(rate=None, legacy_value=0.3, default=0.1, legacy_name="old_rate")  # issues a DeprecationWarning
    0.3
    >>> resolve_rate(rate=None, legacy_value=None, default=0.1, legacy_name="old_rate")
    0.1

    """
    if rate is not None and legacy_value is not None:
        raise ValueError(f"Specify either 'rate' or '{legacy_name}', not both.")

    if rate is not None:
        return rate

    if legacy_value is not None:
        warnings.warn(
            f"The '{legacy_name}' parameter is deprecated and will be removed in version 0.6.0. "
            f"Use 'rate' instead.",
            DeprecationWarning,
            stacklevel=3,
        )
        return legacy_value

    return default


def resolve_legacy_param(
    *,
    preferred_value: object,
    legacy_value: object,
    default: object,
    preferred_name: str,
    legacy_name: str,
) -> object:
    """Resolve a parameter that has both preferred and legacy names.

    This is a generalized version of resolve_rate() that works with any type.

    Parameters
    ----------
    preferred_value : object
        The value from the preferred parameter name.
    legacy_value : object
        The value from the legacy parameter name.
    default : object
        Default value if neither parameter is specified.
    preferred_name : str
        Name of the preferred parameter.
    legacy_name : str
        Name of the legacy parameter for warning messages.

    Returns
    -------
    object
        The resolved parameter value.

    Raises
    ------
    ValueError
        If both preferred and legacy values are specified simultaneously.

    Warnings
    --------
    DeprecationWarning
        If the legacy parameter is used.

    """
    if preferred_value is not None and legacy_value is not None:
        raise ValueError(f"Specify either '{preferred_name}' or '{legacy_name}', not both.")

    if preferred_value is not None:
        return preferred_value

    if legacy_value is not None:
        warnings.warn(
            f"The '{legacy_name}' parameter is deprecated and will be removed in version 0.6.0. "
            f"Use '{preferred_name}' instead.",
            DeprecationWarning,
            stacklevel=3,
        )
        return legacy_value

    return default
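A sketch of how a glitchling constructor might adopt resolve_rate. The Example class and its 'error_rate' legacy name are hypothetical, invented for illustration; the actual glitchling classes and their legacy parameter names are defined elsewhere in the package.

# Illustrative sketch (not from the package); Example and error_rate are hypothetical.
from glitchlings.zoo._rate import resolve_rate


class Example:
    def __init__(self, rate: float | None = None, error_rate: float | None = None) -> None:
        # Accept the old spelling for one more release, steering callers to 'rate'.
        self.rate = resolve_rate(
            rate=rate,
            legacy_value=error_rate,
            default=0.1,
            legacy_name="error_rate",
        )


Example(rate=0.25)        # preferred spelling, no warning
Example(error_rate=0.25)  # resolves to 0.25 but emits a DeprecationWarning
Example()                 # falls back to the default, 0.1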