glitchlings 0.4.1-cp312-cp312-win_amd64.whl → 0.4.3-cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of glitchlings might be problematic.
- glitchlings/__init__.py +30 -17
- glitchlings/__main__.py +0 -1
- glitchlings/_zoo_rust.cp312-win_amd64.pyd +0 -0
- glitchlings/compat.py +284 -0
- glitchlings/config.py +164 -34
- glitchlings/config.toml +1 -1
- glitchlings/dlc/__init__.py +3 -1
- glitchlings/dlc/_shared.py +68 -0
- glitchlings/dlc/huggingface.py +26 -41
- glitchlings/dlc/prime.py +64 -101
- glitchlings/dlc/pytorch.py +216 -0
- glitchlings/dlc/pytorch_lightning.py +233 -0
- glitchlings/lexicon/__init__.py +12 -33
- glitchlings/lexicon/_cache.py +21 -22
- glitchlings/lexicon/data/default_vector_cache.json +80 -14
- glitchlings/lexicon/metrics.py +1 -8
- glitchlings/lexicon/vector.py +109 -49
- glitchlings/lexicon/wordnet.py +89 -49
- glitchlings/main.py +30 -24
- glitchlings/util/__init__.py +18 -4
- glitchlings/util/adapters.py +27 -0
- glitchlings/zoo/__init__.py +26 -15
- glitchlings/zoo/_ocr_confusions.py +1 -3
- glitchlings/zoo/_rate.py +1 -4
- glitchlings/zoo/_sampling.py +0 -1
- glitchlings/zoo/_text_utils.py +1 -5
- glitchlings/zoo/adjax.py +2 -4
- glitchlings/zoo/apostrofae.py +128 -0
- glitchlings/zoo/assets/__init__.py +0 -0
- glitchlings/zoo/assets/apostrofae_pairs.json +32 -0
- glitchlings/zoo/core.py +152 -87
- glitchlings/zoo/jargoyle.py +50 -45
- glitchlings/zoo/mim1c.py +11 -10
- glitchlings/zoo/redactyl.py +16 -16
- glitchlings/zoo/reduple.py +5 -3
- glitchlings/zoo/rushmore.py +4 -10
- glitchlings/zoo/scannequin.py +7 -6
- glitchlings/zoo/typogre.py +8 -9
- glitchlings/zoo/zeedub.py +6 -3
- {glitchlings-0.4.1.dist-info → glitchlings-0.4.3.dist-info}/METADATA +101 -4
- glitchlings-0.4.3.dist-info/RECORD +46 -0
- glitchlings/lexicon/graph.py +0 -290
- glitchlings-0.4.1.dist-info/RECORD +0 -39
- {glitchlings-0.4.1.dist-info → glitchlings-0.4.3.dist-info}/WHEEL +0 -0
- {glitchlings-0.4.1.dist-info → glitchlings-0.4.3.dist-info}/entry_points.txt +0 -0
- {glitchlings-0.4.1.dist-info → glitchlings-0.4.3.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.4.1.dist-info → glitchlings-0.4.3.dist-info}/top_level.txt +0 -0
glitchlings/lexicon/vector.py CHANGED

@@ -4,19 +4,21 @@ from __future__ import annotations
 
 import argparse
 import importlib
+import importlib.util
 import json
 import math
-from pathlib import Path
 import sys
+from pathlib import Path
 from typing import Any, Callable, Iterable, Iterator, Mapping, MutableMapping, Sequence
 
 from . import LexiconBackend
-from ._cache import CacheSnapshot
+from ._cache import CacheSnapshot
+from ._cache import load_cache as _load_cache_file
+from ._cache import write_cache as _write_cache_file
 
 
 def _cosine_similarity(vector_a: Sequence[float], vector_b: Sequence[float]) -> float:
     """Return the cosine similarity between two dense vectors."""
-
     dot_product = 0.0
     norm_a = 0.0
     norm_b = 0.0
@@ -144,7 +146,6 @@ class _SpaCyAdapter(_Adapter):
 
 def _load_json_vectors(path: Path) -> Mapping[str, Sequence[float]]:
     """Load embeddings from a JSON mapping of token to vector list."""
-
     with path.open("r", encoding="utf8") as handle:
         payload = json.load(handle)
 
@@ -164,11 +165,8 @@ def _load_json_vectors(path: Path) -> Mapping[str, Sequence[float]]:
 
 def _load_gensim_vectors(path: Path, *, binary: bool | None = None) -> Any:
     """Load ``gensim`` vectors from ``path``."""
-
     if importlib.util.find_spec("gensim") is None:
-        raise RuntimeError(
-            "The gensim package is required to load keyed vector embeddings."
-        )
+        raise RuntimeError("The gensim package is required to load keyed vector embeddings.")
 
     keyed_vectors_module = importlib.import_module("gensim.models.keyedvectors")
     if binary is None:
@@ -177,14 +175,11 @@ def _load_gensim_vectors(path: Path, *, binary: bool | None = None) -> Any:
     if path.suffix in {".kv", ".kv2"}:
         return keyed_vectors_module.KeyedVectors.load(str(path), mmap="r")
 
-    return keyed_vectors_module.KeyedVectors.load_word2vec_format(
-        str(path), binary=binary
-    )
+    return keyed_vectors_module.KeyedVectors.load_word2vec_format(str(path), binary=binary)
 
 
 def _load_spacy_language(model_name: str) -> Any:
     """Load a spaCy language pipeline by name."""
-
     if importlib.util.find_spec("spacy") is None:
         raise RuntimeError(
             "spaCy is required to use spaCy-backed vector lexicons; install the 'vectors' extra."
@@ -194,9 +189,60 @@ def _load_spacy_language(model_name: str) -> Any:
     return spacy_module.load(model_name)
 
 
+def _load_sentence_transformer(model_name: str) -> Any:
+    """Return a ``SentenceTransformer`` instance for ``model_name``."""
+
+    if importlib.util.find_spec("sentence_transformers") is None:
+        raise RuntimeError(
+            "sentence-transformers is required for this source; install the 'st' extra."
+        )
+
+    module = importlib.import_module("sentence_transformers")
+    try:
+        model_cls = getattr(module, "SentenceTransformer")
+    except AttributeError as exc:  # pragma: no cover - defensive
+        raise RuntimeError("sentence-transformers does not expose SentenceTransformer") from exc
+
+    return model_cls(model_name)
+
+
+def _build_sentence_transformer_embeddings(
+    model_name: str, tokens: Sequence[str]
+) -> Mapping[str, Sequence[float]]:
+    """Return embeddings for ``tokens`` using ``model_name``."""
+
+    if not tokens:
+        return {}
+
+    model = _load_sentence_transformer(model_name)
+
+    unique_tokens: list[str] = []
+    seen: set[str] = set()
+    for token in tokens:
+        normalized = token.strip()
+        if not normalized or normalized in seen:
+            continue
+        unique_tokens.append(normalized)
+        seen.add(normalized)
+
+    if not unique_tokens:
+        return {}
+
+    embeddings = model.encode(
+        unique_tokens,
+        batch_size=64,
+        normalize_embeddings=True,
+        convert_to_numpy=True,
+    )
+
+    return {
+        token: [float(value) for value in vector]
+        for token, vector in zip(unique_tokens, embeddings, strict=True)
+    }
+
+
 def _resolve_source(source: Any | None) -> _Adapter | None:
     """Return an adapter instance for ``source`` if possible."""
-
     if source is None:
         return None
 
@@ -229,9 +275,7 @@ def _resolve_source(source: Any | None) -> _Adapter | None:
 
     if suffix in {".kv", ".kv2", ".bin", ".gz", ".txt", ".vec"}:
         binary_flag = False if suffix in {".txt", ".vec"} else None
-        return _GensimAdapter(
-            _load_gensim_vectors(resolved_path, binary=binary_flag)
-        )
+        return _GensimAdapter(_load_gensim_vectors(resolved_path, binary=binary_flag))
 
     if hasattr(source, "most_similar") and hasattr(source, "key_to_index"):
         return _GensimAdapter(source)
@@ -257,6 +301,7 @@ class VectorLexicon(LexiconBackend):
         case_sensitive: bool = False,
         seed: int | None = None,
     ) -> None:
+        """Initialise the lexicon with an embedding ``source`` and optional cache."""
        super().__init__(seed=seed)
        self._adapter = _resolve_source(source)
        self._max_neighbors = max(1, max_neighbors)
@@ -358,42 +403,34 @@ class VectorLexicon(LexiconBackend):
         self._cache_dirty = True
         return synonyms
 
-    def get_synonyms(
-        self, word: str, pos: str | None = None, n: int = 5
-    ) -> list[str]:
+    def get_synonyms(self, word: str, pos: str | None = None, n: int = 5) -> list[str]:
+        """Return up to ``n`` deterministic synonyms drawn from the embedding cache."""
         normalized = self._normalize_for_lookup(word)
         synonyms = self._ensure_cached(original=word, normalized=normalized)
         return self._deterministic_sample(synonyms, limit=n, word=word, pos=pos)
 
     def precompute(self, word: str, *, limit: int | None = None) -> list[str]:
         """Populate the cache for ``word`` and return the stored synonyms."""
-
         normalized = self._normalize_for_lookup(word)
-        return list(
-            self._ensure_cached(original=word, normalized=normalized, limit=limit)
-        )
+        return list(self._ensure_cached(original=word, normalized=normalized, limit=limit))
 
     def iter_vocabulary(self) -> Iterator[str]:
         """Yield vocabulary tokens from the underlying embedding source."""
-
         if self._adapter is None:
             return iter(())
         return self._adapter.iter_keys()
 
     def export_cache(self) -> dict[str, list[str]]:
         """Return a copy of the in-memory synonym cache."""
-
         return {key: list(values) for key, values in self._cache.items()}
 
     @classmethod
     def load_cache(cls, path: str | Path) -> CacheSnapshot:
         """Load and validate a cache file for reuse."""
-
         return _load_cache_file(Path(path))
 
     def save_cache(self, path: str | Path | None = None) -> Path:
         """Persist the current cache to disk, returning the path used."""
-
         if path is None:
             if self._cache_path is None:
                 raise RuntimeError("No cache path supplied to VectorLexicon.")
@@ -408,6 +445,7 @@ class VectorLexicon(LexiconBackend):
         return target
 
     def supports_pos(self, pos: str | None) -> bool:
+        """Always return ``True`` because vector sources do not encode POS metadata."""
         return True
 
     def __repr__(self) -> str:  # pragma: no cover - debug helper
@@ -430,7 +468,6 @@ def build_vector_cache(
     normalizer: Callable[[str], str] | None = None,
 ) -> Path:
     """Generate a synonym cache for ``words`` using ``source`` embeddings."""
-
     lexicon = VectorLexicon(
         source=source,
         max_neighbors=max_neighbors,
@@ -448,7 +485,6 @@ def build_vector_cache(
 
 def load_vector_source(spec: str) -> Any:
     """Resolve ``spec`` strings for the cache-building CLI."""
-
     if spec.startswith("spacy:"):
         model_name = spec.split(":", 1)[1]
         return _load_spacy_language(model_name)
@@ -472,7 +508,8 @@ def _parse_cli(argv: Sequence[str] | None = None) -> argparse.Namespace:
         "--source",
         required=True,
         help=(
-            "Vector source specification. Use 'spacy:<model>' for spaCy pipelines "
+            "Vector source specification. Use 'spacy:<model>' for spaCy pipelines, "
+            "'sentence-transformers:<model>' for HuggingFace checkpoints (requires --tokens), "
            "or provide a path to a gensim KeyedVectors/word2vec file."
        ),
    )
@@ -538,7 +575,6 @@ def _iter_tokens_from_file(path: Path) -> Iterator[str]:
 
 def main(argv: Sequence[str] | None = None) -> int:
     """Entry-point for ``python -m glitchlings.lexicon.vector``."""
-
     args = _parse_cli(argv)
 
     if args.output.exists() and not args.overwrite:
@@ -547,28 +583,52 @@ def main(argv: Sequence[str] | None = None) -> int:
         )
 
     if args.normalizer == "lower":
-        normalizer: Callable[[str], str] | None = (
-            None if args.case_sensitive else str.lower
-        )
+        normalizer: Callable[[str], str] | None = None if args.case_sensitive else str.lower
     else:
-        normalizer = lambda value: value
 
-
+        def _identity(value: str) -> str:
+            return value
+
+        normalizer = _identity
+
+    tokens_from_file: list[str] | None = None
     if args.tokens is not None:
-        … (removed line not captured in this view)
+        tokens_from_file = list(_iter_tokens_from_file(args.tokens))
+        if args.limit is not None:
+            tokens_from_file = tokens_from_file[: args.limit]
+
+    source_spec = args.source
+    token_iter: Iterable[str]
+    if source_spec.startswith("sentence-transformers:"):
+        model_name = source_spec.split(":", 1)[1].strip()
+        if not model_name:
+            model_name = "sentence-transformers/all-mpnet-base-v2"
+        if tokens_from_file is None:
+            raise SystemExit(
+                "Sentence-transformers sources require --tokens to supply a vocabulary."
+            )
+        source = _build_sentence_transformer_embeddings(model_name, tokens_from_file)
+        token_iter = tokens_from_file
     else:
-        … (12 removed lines not captured in this view)
+        source = load_vector_source(source_spec)
+        if tokens_from_file is not None:
+            token_iter = tokens_from_file
+        else:
+            lexicon = VectorLexicon(
+                source=source,
+                max_neighbors=args.max_neighbors,
+                min_similarity=args.min_similarity,
+                case_sensitive=args.case_sensitive,
+                normalizer=normalizer,
+                seed=args.seed,
+            )
+            iterator = lexicon.iter_vocabulary()
+            if args.limit is not None:
+                token_iter = (
+                    token for index, token in enumerate(iterator) if index < args.limit
+                )
+            else:
+                token_iter = iterator
 
     build_vector_cache(
         source=source,
glitchlings/lexicon/wordnet.py CHANGED

@@ -2,42 +2,76 @@
 
 from __future__ import annotations
 
-from … (19 removed lines, the previous import and setup block, not captured in this view)
+from importlib import import_module
+from pathlib import Path
+from types import ModuleType
+from typing import Any, Callable, Protocol, Sequence, cast
+
+from ..compat import nltk as _nltk_dependency
+from . import LexiconBackend
+from ._cache import CacheSnapshot
+
+
+class _LemmaProtocol(Protocol):
+    def name(self) -> str:
+        ...
+
+
+class _SynsetProtocol(Protocol):
+    def lemmas(self) -> Sequence[_LemmaProtocol]:
+        ...
+
+
+class _WordNetResource(Protocol):
+    def synsets(self, word: str, pos: str | None = None) -> Sequence[_SynsetProtocol]:
+        ...
+
+    def ensure_loaded(self) -> None:
+        ...
+
+
+WordNetCorpusReaderFactory = Callable[[Any, Any], _WordNetResource]
+
+nltk: ModuleType | None = _nltk_dependency.get()
+_NLTK_IMPORT_ERROR: ModuleNotFoundError | None = _nltk_dependency.error
+
+WordNetCorpusReader: WordNetCorpusReaderFactory | None = None
+find: Callable[[str], Any] | None = None
+_WORDNET_MODULE: _WordNetResource | None = None
 
 if nltk is not None:  # pragma: no cover - guarded by import success
     try:
-        … (removed line not captured in this view)
+        corpus_reader_module = import_module("nltk.corpus.reader")
+    except ModuleNotFoundError as exc:  # pragma: no cover - triggered when corpus missing
+        if _NLTK_IMPORT_ERROR is None:
+            _NLTK_IMPORT_ERROR = exc
+    else:
+        reader_candidate = getattr(corpus_reader_module, "WordNetCorpusReader", None)
+        if reader_candidate is not None:
+            WordNetCorpusReader = cast(WordNetCorpusReaderFactory, reader_candidate)
+
+    try:
+        data_module = import_module("nltk.data")
+    except ModuleNotFoundError as exc:  # pragma: no cover - triggered when data missing
+        if _NLTK_IMPORT_ERROR is None:
+            _NLTK_IMPORT_ERROR = exc
+    else:
+        locator = getattr(data_module, "find", None)
+        if callable(locator):
+            find = cast(Callable[[str], Any], locator)
+
+    try:
+        module_candidate = import_module("nltk.corpus.wordnet")
     except ModuleNotFoundError:  # pragma: no cover - only hit on namespace packages
         _WORDNET_MODULE = None
     else:
-        … (removed line not captured in this view)
+        _WORDNET_MODULE = cast(_WordNetResource, module_candidate)
 else:
+    nltk = None
+    find = None
     _WORDNET_MODULE = None
 
-
-
-from . import LexiconBackend
-from ._cache import CacheSnapshot
-
-_WORDNET_HANDLE: WordNetCorpusReader | Any | None = _WORDNET_MODULE
+_WORDNET_HANDLE: _WordNetResource | None = _WORDNET_MODULE
 _wordnet_ready = False
 
 _VALID_POS: tuple[str, ...] = ("n", "v", "a", "r")
@@ -45,33 +79,37 @@ _VALID_POS: tuple[str, ...] = ("n", "v", "a", "r")
 
 def _require_nltk() -> None:
     """Ensure the NLTK dependency is present before continuing."""
-
     if nltk is None or find is None:
         message = (
             "The NLTK package is required for WordNet-backed lexicons; install "
             "`nltk` and its WordNet corpus manually to enable this backend."
         )
-        if _NLTK_IMPORT_ERROR is not None:
+        if "_NLTK_IMPORT_ERROR" in globals() and _NLTK_IMPORT_ERROR is not None:
             raise RuntimeError(message) from _NLTK_IMPORT_ERROR
         raise RuntimeError(message)
 
 
 def dependencies_available() -> bool:
     """Return ``True`` when the runtime NLTK dependency is present."""
-
     return nltk is not None and find is not None
 
 
-def _load_wordnet_reader() -> WordNetCorpusReader:
+def _load_wordnet_reader() -> _WordNetResource:
     """Return a WordNet corpus reader from the downloaded corpus files."""
-
     _require_nltk()
 
+    if WordNetCorpusReader is None:
+        raise RuntimeError("The NLTK WordNet corpus reader is unavailable.")
+
+    locator = find
+    if locator is None:
+        raise RuntimeError("The NLTK data locator is unavailable.")
+
     try:
-        root = find("corpora/wordnet")
+        root = locator("corpora/wordnet")
     except LookupError:
         try:
-            zip_root = find("corpora/wordnet.zip")
+            zip_root = locator("corpora/wordnet.zip")
         except LookupError as exc:
             raise RuntimeError(
                 "The NLTK WordNet corpus is not installed; run `nltk.download('wordnet')`."
@@ -81,24 +119,24 @@ def _load_wordnet_reader() -> WordNetCorpusReader:
     return WordNetCorpusReader(root, None)
 
 
-def _wordnet(force_refresh: bool = False) -> …
+def _wordnet(force_refresh: bool = False) -> _WordNetResource:
     """Retrieve the active WordNet handle, rebuilding it on demand."""
-
     global _WORDNET_HANDLE
 
     if force_refresh:
         _WORDNET_HANDLE = _WORDNET_MODULE
 
-    … (2 removed lines not captured in this view)
+    cached = _WORDNET_HANDLE
+    if cached is not None:
+        return cached
 
-    … (2 removed lines not captured in this view)
+    resource = _load_wordnet_reader()
+    _WORDNET_HANDLE = resource
+    return resource
 
 
 def ensure_wordnet() -> None:
     """Ensure the WordNet corpus is available before use."""
-
     global _wordnet_ready
     if _wordnet_ready:
         return
@@ -106,25 +144,25 @@ def ensure_wordnet() -> None:
     _require_nltk()
 
     resource = _wordnet()
+    nltk_module = nltk
+    if nltk_module is None:
+        raise RuntimeError("The NLTK dependency is unexpectedly unavailable.")
 
     try:
         resource.ensure_loaded()
     except LookupError:
-        nltk.download("wordnet", quiet=True)
+        nltk_module.download("wordnet", quiet=True)
         try:
             resource = _wordnet(force_refresh=True)
             resource.ensure_loaded()
         except LookupError as exc:  # pragma: no cover - only triggered when download fails
-            raise RuntimeError(
-                "Unable to load NLTK WordNet corpus for synonym lookups."
-            ) from exc
+            raise RuntimeError("Unable to load NLTK WordNet corpus for synonym lookups.") from exc
 
     _wordnet_ready = True
 
 
 def _collect_synonyms(word: str, parts_of_speech: tuple[str, ...]) -> list[str]:
     """Gather deterministic synonym candidates for the supplied word."""
-
     normalized_word = word.lower()
     wordnet = _wordnet()
     synonyms: set[str] = set()
@@ -157,9 +195,8 @@ def _collect_synonyms(word: str, parts_of_speech: tuple[str, ...]) -> list[str]:
 class WordNetLexicon(LexiconBackend):
     """Lexicon that retrieves synonyms from the NLTK WordNet corpus."""
 
-    def get_synonyms(
-        self, word: str, pos: str | None = None, n: int = 5
-    ) -> list[str]:
+    def get_synonyms(self, word: str, pos: str | None = None, n: int = 5) -> list[str]:
+        """Return up to ``n`` WordNet lemmas for ``word`` filtered by ``pos`` if provided."""
         ensure_wordnet()
 
         if pos is None:
@@ -174,15 +211,18 @@ class WordNetLexicon(LexiconBackend):
         return self._deterministic_sample(synonyms, limit=n, word=word, pos=pos)
 
     def supports_pos(self, pos: str | None) -> bool:
+        """Return ``True`` when ``pos`` is unset or recognised by the WordNet corpus."""
         if pos is None:
             return True
         return pos.lower() in _VALID_POS
 
     @classmethod
     def load_cache(cls, path: str | Path) -> CacheSnapshot:
+        """WordNet lexicons do not persist caches; raising keeps the contract explicit."""
         raise RuntimeError("WordNetLexicon does not persist or load caches.")
 
     def save_cache(self, path: str | Path | None = None) -> Path | None:
+        """WordNet lexicons do not persist caches; raising keeps the contract explicit."""
        raise RuntimeError("WordNetLexicon does not persist or load caches.")
 
    def __repr__(self) -> str:  # pragma: no cover - trivial representation