glitchlings-0.4.4-cp313-cp313-win_amd64.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of glitchlings might be problematic.
- glitchlings/__init__.py +67 -0
- glitchlings/__main__.py +8 -0
- glitchlings/_zoo_rust.cp313-win_amd64.pyd +0 -0
- glitchlings/compat.py +284 -0
- glitchlings/config.py +388 -0
- glitchlings/config.toml +3 -0
- glitchlings/dlc/__init__.py +7 -0
- glitchlings/dlc/_shared.py +153 -0
- glitchlings/dlc/huggingface.py +81 -0
- glitchlings/dlc/prime.py +254 -0
- glitchlings/dlc/pytorch.py +166 -0
- glitchlings/dlc/pytorch_lightning.py +215 -0
- glitchlings/lexicon/__init__.py +192 -0
- glitchlings/lexicon/_cache.py +110 -0
- glitchlings/lexicon/data/default_vector_cache.json +82 -0
- glitchlings/lexicon/metrics.py +162 -0
- glitchlings/lexicon/vector.py +651 -0
- glitchlings/lexicon/wordnet.py +232 -0
- glitchlings/main.py +364 -0
- glitchlings/util/__init__.py +195 -0
- glitchlings/util/adapters.py +27 -0
- glitchlings/zoo/__init__.py +168 -0
- glitchlings/zoo/_ocr_confusions.py +32 -0
- glitchlings/zoo/_rate.py +131 -0
- glitchlings/zoo/_rust_extensions.py +143 -0
- glitchlings/zoo/_sampling.py +54 -0
- glitchlings/zoo/_text_utils.py +100 -0
- glitchlings/zoo/adjax.py +128 -0
- glitchlings/zoo/apostrofae.py +127 -0
- glitchlings/zoo/assets/__init__.py +0 -0
- glitchlings/zoo/assets/apostrofae_pairs.json +32 -0
- glitchlings/zoo/core.py +582 -0
- glitchlings/zoo/jargoyle.py +335 -0
- glitchlings/zoo/mim1c.py +109 -0
- glitchlings/zoo/ocr_confusions.tsv +30 -0
- glitchlings/zoo/redactyl.py +193 -0
- glitchlings/zoo/reduple.py +148 -0
- glitchlings/zoo/rushmore.py +153 -0
- glitchlings/zoo/scannequin.py +171 -0
- glitchlings/zoo/typogre.py +231 -0
- glitchlings/zoo/zeedub.py +185 -0
- glitchlings-0.4.4.dist-info/METADATA +627 -0
- glitchlings-0.4.4.dist-info/RECORD +47 -0
- glitchlings-0.4.4.dist-info/WHEEL +5 -0
- glitchlings-0.4.4.dist-info/entry_points.txt +2 -0
- glitchlings-0.4.4.dist-info/licenses/LICENSE +201 -0
- glitchlings-0.4.4.dist-info/top_level.txt +1 -0
glitchlings/lexicon/vector.py
@@ -0,0 +1,651 @@
+"""Vector-space lexicon implementation and cache building utilities."""
+
+from __future__ import annotations
+
+import argparse
+import importlib
+import importlib.util
+import json
+import math
+import sys
+from pathlib import Path
+from typing import Any, Callable, Iterable, Iterator, Mapping, MutableMapping, Sequence
+
+from . import LexiconBackend
+from ._cache import CacheSnapshot
+from ._cache import load_cache as _load_cache_file
+from ._cache import write_cache as _write_cache_file
+
+
+def _cosine_similarity(vector_a: Sequence[float], vector_b: Sequence[float]) -> float:
+    """Return the cosine similarity between two dense vectors."""
+    dot_product = 0.0
+    norm_a = 0.0
+    norm_b = 0.0
+    for value_a, value_b in zip(vector_a, vector_b):
+        dot_product += value_a * value_b
+        norm_a += value_a * value_a
+        norm_b += value_b * value_b
+
+    if norm_a == 0.0 or norm_b == 0.0:
+        return 0.0
+
+    magnitude = math.sqrt(norm_a) * math.sqrt(norm_b)
+    if magnitude == 0.0:
+        return 0.0
+
+    return dot_product / magnitude
+
+
+class _Adapter:
+    """Base adapter that exposes nearest-neighbour queries for embeddings."""
+
+    def contains(self, word: str) -> bool:
+        raise NotImplementedError
+
+    def nearest(self, word: str, *, limit: int) -> list[tuple[str, float]]:
+        raise NotImplementedError
+
+    def iter_keys(self) -> Iterator[str]:
+        raise NotImplementedError
+
+
+class _MappingAdapter(_Adapter):
+    """Adapter for in-memory ``Mapping[str, Sequence[float]]`` embeddings."""
+
+    def __init__(self, mapping: Mapping[str, Sequence[float]]) -> None:
+        self._mapping = mapping
+
+    def contains(self, word: str) -> bool:
+        return word in self._mapping
+
+    def nearest(self, word: str, *, limit: int) -> list[tuple[str, float]]:
+        if word not in self._mapping:
+            return []
+
+        target_vector = self._mapping[word]
+        scores: list[tuple[str, float]] = []
+        for candidate, candidate_vector in self._mapping.items():
+            if candidate == word:
+                continue
+            similarity = _cosine_similarity(target_vector, candidate_vector)
+            if similarity == 0.0:
+                continue
+            scores.append((candidate, similarity))
+
+        scores.sort(key=lambda pair: pair[1], reverse=True)
+        if limit < len(scores):
+            return scores[:limit]
+        return scores
+
+    def iter_keys(self) -> Iterator[str]:
+        return iter(self._mapping.keys())
+
+
+class _GensimAdapter(_Adapter):
+    """Adapter that proxies to ``gensim`` ``KeyedVectors`` instances."""
+
+    def __init__(self, keyed_vectors: Any) -> None:
+        self._keyed_vectors = keyed_vectors
+
+    def contains(self, word: str) -> bool:
+        return word in self._keyed_vectors.key_to_index
+
+    def nearest(self, word: str, *, limit: int) -> list[tuple[str, float]]:
+        try:
+            raw_neighbors = self._keyed_vectors.most_similar(word, topn=limit)
+        except KeyError:
+            return []
+
+        return [(candidate, float(score)) for candidate, score in raw_neighbors]
+
+    def iter_keys(self) -> Iterator[str]:
+        return iter(self._keyed_vectors.key_to_index.keys())
+
+
+class _SpaCyAdapter(_Adapter):
+    """Adapter that interacts with spaCy ``Language`` objects."""
+
+    def __init__(self, language: Any) -> None:
+        self._language = language
+        self._vectors = language.vocab.vectors
+        spec = importlib.util.find_spec("numpy")
+        if spec is None:
+            raise RuntimeError("spaCy vector lexicons require NumPy to be installed.")
+        self._numpy = importlib.import_module("numpy")
+
+    def contains(self, word: str) -> bool:
+        strings = self._language.vocab.strings
+        return word in strings and strings[word] in self._vectors
+
+    def nearest(self, word: str, *, limit: int) -> list[tuple[str, float]]:
+        strings = self._language.vocab.strings
+        if word not in strings:
+            return []
+
+        key = strings[word]
+        if key not in self._vectors:
+            return []
+
+        vector = self._vectors.get(key)
+        query = self._numpy.asarray([vector])
+        keys, scores = self._vectors.most_similar(query, n=limit)
+        candidates: list[tuple[str, float]] = []
+        for candidate_key, score in zip(keys[0], scores[0]):
+            candidate_word = strings[candidate_key]
+            if candidate_word == word:
+                continue
+            candidates.append((candidate_word, float(score)))
+        return candidates
+
+    def iter_keys(self) -> Iterator[str]:
+        strings = self._language.vocab.strings
+        for key in self._vectors.keys():
+            yield strings[key]
+
+
+def _load_json_vectors(path: Path) -> Mapping[str, Sequence[float]]:
+    """Load embeddings from a JSON mapping of token to vector list."""
+    with path.open("r", encoding="utf8") as handle:
+        payload = json.load(handle)
+
+    if not isinstance(payload, Mapping):
+        raise RuntimeError("Vector JSON payload must map tokens to dense vectors.")
+
+    validated: dict[str, list[float]] = {}
+    for token, raw_vector in payload.items():
+        if not isinstance(token, str):
+            raise RuntimeError("Vector JSON keys must be strings.")
+        if not isinstance(raw_vector, Sequence):
+            raise RuntimeError(f"Vector for '{token}' must be a sequence of floats.")
+        validated[token] = [float(value) for value in raw_vector]
+
+    return validated
+
+
+def _load_gensim_vectors(path: Path, *, binary: bool | None = None) -> Any:
+    """Load ``gensim`` vectors from ``path``."""
+    if importlib.util.find_spec("gensim") is None:
+        raise RuntimeError("The gensim package is required to load keyed vector embeddings.")
+
+    keyed_vectors_module = importlib.import_module("gensim.models.keyedvectors")
+    if binary is None:
+        binary = path.suffix in {".bin", ".gz"}
+
+    if path.suffix in {".kv", ".kv2"}:
+        return keyed_vectors_module.KeyedVectors.load(str(path), mmap="r")
+
+    return keyed_vectors_module.KeyedVectors.load_word2vec_format(str(path), binary=binary)
+
+
+def _load_spacy_language(model_name: str) -> Any:
+    """Load a spaCy language pipeline by name."""
+    if importlib.util.find_spec("spacy") is None:
+        raise RuntimeError(
+            "spaCy is required to use spaCy-backed vector lexicons; install the 'vectors' extra."
+        )
+
+    spacy_module = importlib.import_module("spacy")
+    return spacy_module.load(model_name)
+
+
+def _load_sentence_transformer(model_name: str) -> Any:
+    """Return a ``SentenceTransformer`` instance for ``model_name``."""
+
+    if importlib.util.find_spec("sentence_transformers") is None:
+        raise RuntimeError(
+            "sentence-transformers is required for this source; install the 'st' extra."
+        )
+
+    module = importlib.import_module("sentence_transformers")
+    try:
+        model_cls = getattr(module, "SentenceTransformer")
+    except AttributeError as exc:  # pragma: no cover - defensive
+        raise RuntimeError("sentence-transformers does not expose SentenceTransformer") from exc
+
+    return model_cls(model_name)
+
+
+def _build_sentence_transformer_embeddings(
+    model_name: str, tokens: Sequence[str]
+) -> Mapping[str, Sequence[float]]:
+    """Return embeddings for ``tokens`` using ``model_name``."""
+
+    if not tokens:
+        return {}
+
+    model = _load_sentence_transformer(model_name)
+
+    unique_tokens: list[str] = []
+    seen: set[str] = set()
+    for token in tokens:
+        normalized = token.strip()
+        if not normalized or normalized in seen:
+            continue
+        unique_tokens.append(normalized)
+        seen.add(normalized)
+
+    if not unique_tokens:
+        return {}
+
+    embeddings = model.encode(
+        unique_tokens,
+        batch_size=64,
+        normalize_embeddings=True,
+        convert_to_numpy=True,
+    )
+
+    return {
+        token: [float(value) for value in vector]
+        for token, vector in zip(unique_tokens, embeddings, strict=True)
+    }
+
+
+def _resolve_source(source: Any | None) -> _Adapter | None:
+    """Return an adapter instance for ``source`` if possible."""
+    if source is None:
+        return None
+
+    if isinstance(source, _Adapter):
+        return source
+
+    if isinstance(source, Mapping):
+        return _MappingAdapter(source)
+
+    module_name = type(source).__module__
+    if module_name.startswith("gensim") and hasattr(source, "most_similar"):
+        return _GensimAdapter(source)
+
+    if module_name.startswith("spacy") and hasattr(source, "vocab"):
+        return _SpaCyAdapter(source)
+
+    if isinstance(source, (str, Path)):
+        text_source = str(source)
+        if text_source.startswith("spacy:"):
+            model = text_source.split(":", 1)[1]
+            return _SpaCyAdapter(_load_spacy_language(model))
+
+        resolved_path = Path(text_source)
+        if not resolved_path.exists():
+            raise RuntimeError(f"Vector source '{text_source}' does not exist.")
+
+        suffix = resolved_path.suffix.lower()
+        if suffix == ".json":
+            return _MappingAdapter(_load_json_vectors(resolved_path))
+
+        if suffix in {".kv", ".kv2", ".bin", ".gz", ".txt", ".vec"}:
+            binary_flag = False if suffix in {".txt", ".vec"} else None
+            return _GensimAdapter(_load_gensim_vectors(resolved_path, binary=binary_flag))
+
+    if hasattr(source, "most_similar") and hasattr(source, "key_to_index"):
+        return _GensimAdapter(source)
+
+    if hasattr(source, "vocab") and hasattr(source.vocab, "vectors"):
+        return _SpaCyAdapter(source)
+
+    raise RuntimeError("Unsupported vector source supplied to VectorLexicon.")
+
+
+class VectorLexicon(LexiconBackend):
+    """Lexicon implementation backed by dense word embeddings."""
+
+    def __init__(
+        self,
+        *,
+        source: Any | None = None,
+        cache: Mapping[str, Sequence[str]] | None = None,
+        cache_path: str | Path | None = None,
+        max_neighbors: int = 50,
+        min_similarity: float = 0.0,
+        normalizer: Callable[[str], str] | None = None,
+        case_sensitive: bool = False,
+        seed: int | None = None,
+    ) -> None:
+        """Initialise the lexicon with an embedding ``source`` and optional cache."""
+        super().__init__(seed=seed)
+        self._adapter = _resolve_source(source)
+        self._max_neighbors = max(1, max_neighbors)
+        self._min_similarity = min_similarity
+        self._cache: MutableMapping[str, list[str]] = {}
+        self._cache_path: Path | None
+        self._cache_checksum: str | None = None
+        if cache_path is not None:
+            path = Path(cache_path)
+            snapshot = _load_cache_file(path)
+            self._cache.update(snapshot.entries)
+            self._cache_checksum = snapshot.checksum
+            self._cache_path = path
+        else:
+            self._cache_path = None
+        if cache is not None:
+            for key, values in cache.items():
+                self._cache[str(key)] = [str(value) for value in values]
+        self._cache_dirty = False
+        self._case_sensitive = case_sensitive
+        if normalizer is not None:
+            self._lookup_normalizer: Callable[[str], str] = normalizer
+            self._dedupe_normalizer: Callable[[str], str] = normalizer
+        elif case_sensitive:
+            self._lookup_normalizer = str.lower
+            self._dedupe_normalizer = lambda value: value
+        else:
+            self._lookup_normalizer = str.lower
+            self._dedupe_normalizer = str.lower
+
+    def _normalize_for_lookup(self, word: str) -> str:
+        return self._lookup_normalizer(word)
+
+    def _normalize_for_dedupe(self, word: str) -> str:
+        return self._dedupe_normalizer(word)
+
+    def _fetch_neighbors(
+        self, *, original: str, normalized: str, limit: int
+    ) -> list[tuple[str, float]]:
+        if self._adapter is None:
+            return []
+
+        attempts = [original]
+        if normalized != original:
+            attempts.append(normalized)
+
+        collected: list[tuple[str, float]] = []
+        seen: set[str] = set()
+        for token in attempts:
+            neighbors = self._adapter.nearest(token, limit=limit)
+            for candidate, score in neighbors:
+                if candidate in seen:
+                    continue
+                collected.append((candidate, score))
+                seen.add(candidate)
+            if len(collected) >= limit:
+                break
+
+        if len(collected) > limit:
+            return collected[:limit]
+        return collected
+
+    def _ensure_cached(
+        self, *, original: str, normalized: str, limit: int | None = None
+    ) -> list[str]:
+        cache_key = normalized if not self._case_sensitive else original
+        if cache_key in self._cache:
+            return self._cache[cache_key]
+
+        neighbor_limit = self._max_neighbors if limit is None else max(1, limit)
+        neighbors = self._fetch_neighbors(
+            original=original, normalized=normalized, limit=neighbor_limit
+        )
+        synonyms: list[str] = []
+        seen_candidates: set[str] = set()
+        original_lookup = normalized
+        original_dedupe = self._normalize_for_dedupe(original)
+        for candidate, similarity in neighbors:
+            if similarity < self._min_similarity:
+                continue
+            if self._case_sensitive:
+                if candidate == original:
+                    continue
+                dedupe_key = self._normalize_for_dedupe(candidate)
+                if dedupe_key == original_dedupe:
+                    continue
+            else:
+                candidate_lookup = self._normalize_for_lookup(candidate)
+                if candidate_lookup == original_lookup:
+                    continue
+                dedupe_key = candidate_lookup
+            if dedupe_key in seen_candidates:
+                continue
+            seen_candidates.add(dedupe_key)
+            synonyms.append(candidate)
+
+        self._cache[cache_key] = synonyms
+        if self._cache_path is not None:
+            self._cache_dirty = True
+        return synonyms
+
+    def get_synonyms(self, word: str, pos: str | None = None, n: int = 5) -> list[str]:
+        """Return up to ``n`` deterministic synonyms drawn from the embedding cache."""
+        normalized = self._normalize_for_lookup(word)
+        synonyms = self._ensure_cached(original=word, normalized=normalized)
+        return self._deterministic_sample(synonyms, limit=n, word=word, pos=pos)
+
+    def precompute(self, word: str, *, limit: int | None = None) -> list[str]:
+        """Populate the cache for ``word`` and return the stored synonyms."""
+        normalized = self._normalize_for_lookup(word)
+        return list(self._ensure_cached(original=word, normalized=normalized, limit=limit))
+
+    def iter_vocabulary(self) -> Iterator[str]:
+        """Yield vocabulary tokens from the underlying embedding source."""
+        if self._adapter is None:
+            return iter(())
+        return self._adapter.iter_keys()
+
+    def export_cache(self) -> dict[str, list[str]]:
+        """Return a copy of the in-memory synonym cache."""
+        return {key: list(values) for key, values in self._cache.items()}
+
+    @classmethod
+    def load_cache(cls, path: str | Path) -> CacheSnapshot:
+        """Load and validate a cache file for reuse."""
+        return _load_cache_file(Path(path))
+
+    def save_cache(self, path: str | Path | None = None) -> Path:
+        """Persist the current cache to disk, returning the path used."""
+        if path is None:
+            if self._cache_path is None:
+                raise RuntimeError("No cache path supplied to VectorLexicon.")
+            target = self._cache_path
+        else:
+            target = Path(path)
+            self._cache_path = target
+
+        snapshot = _write_cache_file(target, self._cache)
+        self._cache_checksum = snapshot.checksum
+        self._cache_dirty = False
+        return target
+
+    def supports_pos(self, pos: str | None) -> bool:
+        """Always return ``True`` because vector sources do not encode POS metadata."""
+        return True
+
+    def __repr__(self) -> str:  # pragma: no cover - debug helper
+        source_name = self._adapter.__class__.__name__ if self._adapter else "None"
+        return (
+            f"VectorLexicon(source={source_name}, max_neighbors={self._max_neighbors}, "
+            f"seed={self.seed!r})"
+        )
+
+
+def build_vector_cache(
+    *,
+    source: Any,
+    words: Iterable[str],
+    output_path: Path,
+    max_neighbors: int = 50,
+    min_similarity: float = 0.0,
+    case_sensitive: bool = False,
+    seed: int | None = None,
+    normalizer: Callable[[str], str] | None = None,
+) -> Path:
+    """Generate a synonym cache for ``words`` using ``source`` embeddings."""
+    lexicon = VectorLexicon(
+        source=source,
+        max_neighbors=max_neighbors,
+        min_similarity=min_similarity,
+        case_sensitive=case_sensitive,
+        normalizer=normalizer,
+        seed=seed,
+    )
+
+    for word in words:
+        lexicon.precompute(word)
+
+    return lexicon.save_cache(output_path)
+
+
+def load_vector_source(spec: str) -> Any:
+    """Resolve ``spec`` strings for the cache-building CLI."""
+    if spec.startswith("spacy:"):
+        model_name = spec.split(":", 1)[1]
+        return _load_spacy_language(model_name)
+
+    path = Path(spec).expanduser()
+    if not path.exists():
+        raise RuntimeError(f"Vector source '{spec}' does not exist.")
+
+    if path.suffix.lower() == ".json":
+        return _load_json_vectors(path)
+
+    return _load_gensim_vectors(path)
+
+
+def _parse_cli(argv: Sequence[str] | None = None) -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        prog="python -m glitchlings.lexicon.vector",
+        description="Precompute synonym caches for the vector lexicon backend.",
+    )
+    parser.add_argument(
+        "--source",
+        required=True,
+        help=(
+            "Vector source specification. Use 'spacy:<model>' for spaCy pipelines, "
+            "'sentence-transformers:<model>' for HuggingFace checkpoints (requires --tokens), "
+            "or provide a path to a gensim KeyedVectors/word2vec file."
+        ),
+    )
+    parser.add_argument(
+        "--output",
+        required=True,
+        type=Path,
+        help="Path to the JSON file that will receive the synonym cache.",
+    )
+    parser.add_argument(
+        "--tokens",
+        type=Path,
+        help="Optional newline-delimited vocabulary file to restrict generation.",
+    )
+    parser.add_argument(
+        "--max-neighbors",
+        type=int,
+        default=50,
+        help="Number of nearest neighbours to cache per token (default: 50).",
+    )
+    parser.add_argument(
+        "--min-similarity",
+        type=float,
+        default=0.0,
+        help="Minimum cosine similarity required to keep a synonym (default: 0.0).",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        help="Optional deterministic seed to bake into the resulting cache.",
+    )
+    parser.add_argument(
+        "--case-sensitive",
+        action="store_true",
+        help="Preserve original casing instead of lower-casing cache keys.",
+    )
+    parser.add_argument(
+        "--normalizer",
+        choices=["lower", "identity"],
+        default="lower",
+        help="Token normalization strategy for cache keys (default: lower).",
+    )
+    parser.add_argument(
+        "--overwrite",
+        action="store_true",
+        help="Allow overwriting an existing cache file.",
+    )
+    parser.add_argument(
+        "--limit",
+        type=int,
+        help="Optional maximum number of tokens to process.",
+    )
+    return parser.parse_args(argv)
+
+
+def _iter_tokens_from_file(path: Path) -> Iterator[str]:
+    with path.open("r", encoding="utf8") as handle:
+        for line in handle:
+            token = line.strip()
+            if token:
+                yield token
+
+
+def main(argv: Sequence[str] | None = None) -> int:
+    """Entry-point for ``python -m glitchlings.lexicon.vector``."""
+    args = _parse_cli(argv)
+
+    if args.output.exists() and not args.overwrite:
+        raise SystemExit(
+            f"Refusing to overwrite existing cache at {args.output!s}; pass --overwrite."
+        )
+
+    if args.normalizer == "lower":
+        normalizer: Callable[[str], str] | None = None if args.case_sensitive else str.lower
+    else:
+
+        def _identity(value: str) -> str:
+            return value
+
+        normalizer = _identity
+
+    tokens_from_file: list[str] | None = None
+    if args.tokens is not None:
+        tokens_from_file = list(_iter_tokens_from_file(args.tokens))
+        if args.limit is not None:
+            tokens_from_file = tokens_from_file[: args.limit]
+
+    source_spec = args.source
+    token_iter: Iterable[str]
+    if source_spec.startswith("sentence-transformers:"):
+        model_name = source_spec.split(":", 1)[1].strip()
+        if not model_name:
+            model_name = "sentence-transformers/all-mpnet-base-v2"
+        if tokens_from_file is None:
+            raise SystemExit(
+                "Sentence-transformers sources require --tokens to supply a vocabulary."
+            )
+        source = _build_sentence_transformer_embeddings(model_name, tokens_from_file)
+        token_iter = tokens_from_file
+    else:
+        source = load_vector_source(source_spec)
+        if tokens_from_file is not None:
+            token_iter = tokens_from_file
+        else:
+            lexicon = VectorLexicon(
+                source=source,
+                max_neighbors=args.max_neighbors,
+                min_similarity=args.min_similarity,
+                case_sensitive=args.case_sensitive,
+                normalizer=normalizer,
+                seed=args.seed,
+            )
+            iterator = lexicon.iter_vocabulary()
+            if args.limit is not None:
+                token_iter = (
+                    token for index, token in enumerate(iterator) if index < args.limit
+                )
+            else:
+                token_iter = iterator
+
+    build_vector_cache(
+        source=source,
+        words=token_iter,
+        output_path=args.output,
+        max_neighbors=args.max_neighbors,
+        min_similarity=args.min_similarity,
+        case_sensitive=args.case_sensitive,
+        seed=args.seed,
+        normalizer=normalizer,
+    )
+
+    return 0
+
+
+if __name__ == "__main__":  # pragma: no cover - manual CLI entry point
+    sys.exit(main())
+
+
+__all__ = ["VectorLexicon", "build_vector_cache", "load_vector_source", "main"]
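
For orientation, a minimal sketch of how the public API added above might be used; the toy vectors, values, and file names here are illustrative assumptions, not part of the package:

    from pathlib import Path

    from glitchlings.lexicon.vector import VectorLexicon, build_vector_cache

    # Any Mapping[str, Sequence[float]] is accepted as an in-memory source.
    toy_vectors = {
        "cat": [1.0, 0.0, 0.1],
        "feline": [0.9, 0.1, 0.1],
        "dog": [0.0, 1.0, 0.1],
    }

    lexicon = VectorLexicon(source=toy_vectors, max_neighbors=10, min_similarity=0.5, seed=42)
    print(lexicon.get_synonyms("cat", n=3))  # deterministic subset of cached neighbours, e.g. ['feline']

    # Persist a reusable synonym cache for a fixed vocabulary.
    build_vector_cache(
        source=toy_vectors,
        words=["cat", "dog"],
        output_path=Path("vector_cache.json"),
        min_similarity=0.5,
    )

An equivalent cache can be built from the command line, for example: python -m glitchlings.lexicon.vector --source vectors.json --output vector_cache.json --min-similarity 0.5, where vectors.json holds a token-to-vector mapping like toy_vectors above.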