glitchlings 0.4.0__cp312-cp312-macosx_11_0_universal2.whl → 0.4.2__cp312-cp312-macosx_11_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of glitchlings might be problematic.

Files changed (39)
  1. glitchlings/__init__.py +26 -17
  2. glitchlings/__main__.py +0 -1
  3. glitchlings/_zoo_rust.cpython-312-darwin.so +0 -0
  4. glitchlings/compat.py +215 -0
  5. glitchlings/config.py +136 -19
  6. glitchlings/dlc/_shared.py +68 -0
  7. glitchlings/dlc/huggingface.py +26 -41
  8. glitchlings/dlc/prime.py +64 -101
  9. glitchlings/lexicon/__init__.py +26 -19
  10. glitchlings/lexicon/_cache.py +104 -0
  11. glitchlings/lexicon/graph.py +18 -39
  12. glitchlings/lexicon/metrics.py +1 -8
  13. glitchlings/lexicon/vector.py +29 -67
  14. glitchlings/lexicon/wordnet.py +39 -30
  15. glitchlings/main.py +9 -13
  16. glitchlings/util/__init__.py +18 -4
  17. glitchlings/util/adapters.py +27 -0
  18. glitchlings/zoo/__init__.py +21 -14
  19. glitchlings/zoo/_ocr_confusions.py +1 -3
  20. glitchlings/zoo/_rate.py +1 -4
  21. glitchlings/zoo/_sampling.py +0 -1
  22. glitchlings/zoo/_text_utils.py +1 -5
  23. glitchlings/zoo/adjax.py +0 -2
  24. glitchlings/zoo/core.py +185 -56
  25. glitchlings/zoo/jargoyle.py +9 -14
  26. glitchlings/zoo/mim1c.py +11 -10
  27. glitchlings/zoo/redactyl.py +5 -8
  28. glitchlings/zoo/reduple.py +3 -1
  29. glitchlings/zoo/rushmore.py +2 -8
  30. glitchlings/zoo/scannequin.py +5 -4
  31. glitchlings/zoo/typogre.py +3 -7
  32. glitchlings/zoo/zeedub.py +2 -2
  33. {glitchlings-0.4.0.dist-info → glitchlings-0.4.2.dist-info}/METADATA +68 -4
  34. glitchlings-0.4.2.dist-info/RECORD +42 -0
  35. glitchlings-0.4.0.dist-info/RECORD +0 -38
  36. {glitchlings-0.4.0.dist-info → glitchlings-0.4.2.dist-info}/WHEEL +0 -0
  37. {glitchlings-0.4.0.dist-info → glitchlings-0.4.2.dist-info}/entry_points.txt +0 -0
  38. {glitchlings-0.4.0.dist-info → glitchlings-0.4.2.dist-info}/licenses/LICENSE +0 -0
  39. {glitchlings-0.4.0.dist-info → glitchlings-0.4.2.dist-info}/top_level.txt +0 -0
glitchlings/lexicon/graph.py CHANGED
@@ -2,22 +2,22 @@
 
 from __future__ import annotations
 
-import json
 import re
 from pathlib import Path
 from typing import Iterable, Mapping, MutableMapping, Sequence
 
-from . import Lexicon
+from . import LexiconBackend
+from ._cache import CacheSnapshot
+from ._cache import load_cache as _load_cache_file
+from ._cache import write_cache as _write_cache_file
 from .vector import VectorLexicon
 
-
 _CONCEPT_RE = re.compile(r"^/c/(?P<lang>[a-z]{2})/(?P<term>[^/]+)")
 _PUNCTUATION_RE = re.compile(r"[^\w\s-]+", re.UNICODE)
 
 
 def _lemmatize_token(token: str) -> str:
     """Return a lightweight lemma for ``token`` using heuristic rules."""
-
     irregular = {
         "children": "child",
         "mice": "mouse",
@@ -60,7 +60,6 @@ def _lemmatize_token(token: str) -> str:
 
 def _normalize_phrase(phrase: str) -> str:
     """Normalise ``phrase`` for ConceptNet lookups."""
-
     stripped = _PUNCTUATION_RE.sub(" ", phrase.lower())
     tokens = [token for token in stripped.split() if token]
     if not tokens:
@@ -71,7 +70,6 @@ def _normalize_phrase(phrase: str) -> str:
 
 def _concept_terms(normalized: str) -> list[str]:
     """Return ConceptNet term variants for ``normalized``."""
-
     collapsed = normalized.replace(" ", "_")
     if not collapsed:
         return []
@@ -83,7 +81,6 @@ def _concept_terms(normalized: str) -> list[str]:
 
 def _surface_from_concept(concept: str) -> str | None:
     """Return a human-readable surface form for ``concept``."""
-
     match = _CONCEPT_RE.match(concept)
     if match is None:
         return None
@@ -102,7 +99,6 @@ def _language_from_concept(concept: str) -> str | None:
 
 def _load_numberbatch(path: Path, *, languages: set[str]) -> Mapping[str, list[float]]:
     """Load ConceptNet Numberbatch embeddings from ``path``."""
-
     if not path.exists():
         return {}
 
@@ -140,30 +136,7 @@ def _load_numberbatch(path: Path, *, languages: set[str]) -> Mapping[str, list[f
     return embeddings
 
 
-def _load_cache(path: Path) -> dict[str, list[str]]:
-    if not path.exists():
-        return {}
-    with path.open("r", encoding="utf8") as handle:
-        payload = json.load(handle)
-    if not isinstance(payload, Mapping):
-        raise RuntimeError("Graph lexicon cache must be a mapping of strings to lists.")
-    cache: dict[str, list[str]] = {}
-    for key, values in payload.items():
-        if not isinstance(key, str):
-            raise RuntimeError("Graph lexicon cache keys must be strings.")
-        if not isinstance(values, Sequence):
-            raise RuntimeError("Graph lexicon cache values must be sequences of strings.")
-        cache[key] = [str(value) for value in values]
-    return cache
-
-
-def _write_cache(path: Path, cache: Mapping[str, Sequence[str]]) -> None:
-    serialisable = {key: list(values) for key, values in sorted(cache.items())}
-    with path.open("w", encoding="utf8") as handle:
-        json.dump(serialisable, handle, ensure_ascii=False, indent=2, sort_keys=True)
-
-
-class GraphLexicon(Lexicon):
+class GraphLexicon(LexiconBackend):
     """Lexicon backed by ConceptNet/Numberbatch embeddings."""
 
     def __init__(
@@ -184,9 +157,12 @@ class GraphLexicon(Lexicon):
         self._max_neighbors = max(1, max_neighbors)
         self._min_similarity = min_similarity
        self._cache: MutableMapping[str, list[str]] = {}
-        self._cache_path = Path(cache_path) if cache_path is not None else None
+        self._cache_path: Path | None = Path(cache_path) if cache_path is not None else None
+        self._cache_checksum: str | None = None
         if self._cache_path is not None:
-            self._cache.update(_load_cache(self._cache_path))
+            snapshot = _load_cache_file(self._cache_path)
+            self._cache.update(snapshot.entries)
+            self._cache_checksum = snapshot.checksum
         if cache is not None:
             for key, values in cache.items():
                 self._cache[str(key)] = [str(value) for value in values]
@@ -260,9 +236,7 @@ class GraphLexicon(Lexicon):
             self._cache_dirty = True
         return synonyms
 
-    def get_synonyms(
-        self, word: str, pos: str | None = None, n: int = 5
-    ) -> list[str]:
+    def get_synonyms(self, word: str, pos: str | None = None, n: int = 5) -> list[str]:
         normalized = _normalize_phrase(word)
         if not normalized:
             return []
@@ -278,6 +252,11 @@ class GraphLexicon(Lexicon):
     def export_cache(self) -> dict[str, list[str]]:
         return {key: list(values) for key, values in self._cache.items()}
 
+    @classmethod
+    def load_cache(cls, path: str | Path) -> CacheSnapshot:
+        """Load and validate a persisted ConceptNet cache file."""
+        return _load_cache_file(Path(path))
+
     def save_cache(self, path: str | Path | None = None) -> Path:
         if path is None:
             if self._cache_path is None:
@@ -286,7 +265,8 @@ class GraphLexicon(Lexicon):
         else:
             target = Path(path)
         self._cache_path = target
-        _write_cache(target, self._cache)
+        snapshot = _write_cache_file(target, self._cache)
+        self._cache_checksum = snapshot.checksum
         self._cache_dirty = False
         return target
 
@@ -300,4 +280,3 @@ class GraphLexicon(Lexicon):
             f"GraphLexicon(languages={sorted(self._languages)!r}, "
             f"max_neighbors={self._max_neighbors}, seed={self.seed!r}, state={state})"
         )
-
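Both GraphLexicon above and VectorLexicon later in this diff drop their private `_load_cache`/`_write_cache` helpers in favour of the new shared `glitchlings/lexicon/_cache` module, whose `load_cache`/`write_cache` functions return a `CacheSnapshot` exposing `entries` and `checksum`. That module's body is not part of this diff; the following is only a minimal sketch of what it plausibly contains, reusing the JSON layout and validation of the helpers removed here. The SHA-256 checksum of the serialized payload is an assumption.

```python
# Hypothetical reconstruction of glitchlings/lexicon/_cache.py (not shown in this
# diff). Only the names CacheSnapshot, load_cache, write_cache and the
# .entries/.checksum attributes come from the hunks; the checksum algorithm and
# everything else here is an assumption.
from __future__ import annotations

import hashlib
import json
from collections.abc import Mapping, Sequence
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class CacheSnapshot:
    """Validated synonym cache plus a digest of its serialized form."""

    entries: dict[str, list[str]]
    checksum: str | None


def load_cache(path: Path) -> CacheSnapshot:
    """Load and validate a JSON synonym cache from ``path``."""
    if not path.exists():
        return CacheSnapshot(entries={}, checksum=None)
    raw = path.read_text(encoding="utf8")
    payload = json.loads(raw)
    if not isinstance(payload, Mapping):
        raise RuntimeError("Synonym cache must be a JSON mapping of strings to lists.")
    entries: dict[str, list[str]] = {}
    for key, values in payload.items():
        if not isinstance(key, str):
            raise RuntimeError("Synonym cache keys must be strings.")
        if not isinstance(values, Sequence) or isinstance(values, str):
            raise RuntimeError("Synonym cache values must be lists of strings.")
        entries[key] = [str(value) for value in values]
    return CacheSnapshot(entries=entries, checksum=hashlib.sha256(raw.encode("utf8")).hexdigest())


def write_cache(path: Path, cache: Mapping[str, Sequence[str]]) -> CacheSnapshot:
    """Serialize ``cache`` deterministically and return the written snapshot."""
    entries = {key: list(values) for key, values in sorted(cache.items())}
    serialized = json.dumps(entries, ensure_ascii=False, indent=2, sort_keys=True)
    path.write_text(serialized, encoding="utf8")
    return CacheSnapshot(entries=entries, checksum=hashlib.sha256(serialized.encode("utf8")).hexdigest())
```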
glitchlings/lexicon/metrics.py CHANGED
@@ -18,7 +18,6 @@ def _unique_synonyms(
     sample_size: int,
 ) -> list[str]:
     """Return unique synonym candidates excluding the original token."""
-
     collected: list[str] = []
     seen: set[str] = set()
     source = word.lower()
@@ -41,7 +40,6 @@ def synonym_diversity(
     sample_size: int = 5,
 ) -> float:
     """Return the mean unique-synonym count for ``words`` using ``lexicon``."""
-
     totals = []
     for word in words:
         synonyms = _unique_synonyms(lexicon, word, pos=pos, sample_size=sample_size)
@@ -60,7 +58,6 @@ def coverage_ratio(
     min_synonyms: int = 3,
 ) -> float:
     """Return the fraction of ``words`` with at least ``min_synonyms`` candidates."""
-
     total = 0
     hits = 0
     for word in words:
@@ -96,7 +93,6 @@ def mean_cosine_similarity(
     sample_size: int = 5,
 ) -> float:
     """Return the mean cosine similarity between each word and its candidates."""
-
     total = 0.0
     count = 0
     for word in words:
@@ -126,11 +122,8 @@ def compare_lexicons(
     embeddings: Mapping[str, Sequence[float]] | None = None,
 ) -> dict[str, float]:
     """Return comparative coverage and diversity statistics for two lexicons."""
-
     stats = {
-        "baseline_diversity": synonym_diversity(
-            baseline, words, pos=pos, sample_size=sample_size
-        ),
+        "baseline_diversity": synonym_diversity(baseline, words, pos=pos, sample_size=sample_size),
         "candidate_diversity": synonym_diversity(
             candidate, words, pos=pos, sample_size=sample_size
         ),
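The metrics changes are cosmetic (blank-line and line-wrapping cleanups), but the hunks show the surface they touch: `synonym_diversity`, `coverage_ratio`, `mean_cosine_similarity`, and `compare_lexicons` all take one or two lexicon backends plus a word list. A hypothetical comparison call, with the keyword names taken from the hunk bodies and the backend constructors inferred from elsewhere in this diff rather than from documented signatures:

```python
# Hypothetical usage of the metrics above; keyword names (baseline, candidate,
# words, sample_size) come from the hunk bodies, while the backend constructors
# and the embeddings file name are assumptions.
from glitchlings.lexicon.metrics import compare_lexicons
from glitchlings.lexicon.vector import VectorLexicon
from glitchlings.lexicon.wordnet import WordNetLexicon

words = ["glitch", "corrupt", "signal"]
baseline = WordNetLexicon()
candidate = VectorLexicon(source="embeddings.vec")  # example path only

stats = compare_lexicons(baseline=baseline, candidate=candidate, words=words, sample_size=5)
print(stats["baseline_diversity"], stats["candidate_diversity"])
```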
glitchlings/lexicon/vector.py CHANGED
@@ -6,16 +6,18 @@ import argparse
 import importlib
 import json
 import math
-from pathlib import Path
 import sys
+from pathlib import Path
 from typing import Any, Callable, Iterable, Iterator, Mapping, MutableMapping, Sequence
 
-from . import Lexicon
+from . import LexiconBackend
+from ._cache import CacheSnapshot
+from ._cache import load_cache as _load_cache_file
+from ._cache import write_cache as _write_cache_file
 
 
 def _cosine_similarity(vector_a: Sequence[float], vector_b: Sequence[float]) -> float:
     """Return the cosine similarity between two dense vectors."""
-
     dot_product = 0.0
     norm_a = 0.0
     norm_b = 0.0
@@ -143,7 +145,6 @@ class _SpaCyAdapter(_Adapter):
 
 def _load_json_vectors(path: Path) -> Mapping[str, Sequence[float]]:
     """Load embeddings from a JSON mapping of token to vector list."""
-
     with path.open("r", encoding="utf8") as handle:
         payload = json.load(handle)
 
@@ -163,11 +164,8 @@ def _load_json_vectors(path: Path) -> Mapping[str, Sequence[float]]:
 
 def _load_gensim_vectors(path: Path, *, binary: bool | None = None) -> Any:
     """Load ``gensim`` vectors from ``path``."""
-
     if importlib.util.find_spec("gensim") is None:
-        raise RuntimeError(
-            "The gensim package is required to load keyed vector embeddings."
-        )
+        raise RuntimeError("The gensim package is required to load keyed vector embeddings.")
 
     keyed_vectors_module = importlib.import_module("gensim.models.keyedvectors")
     if binary is None:
@@ -176,14 +174,11 @@ def _load_gensim_vectors(path: Path, *, binary: bool | None = None) -> Any:
     if path.suffix in {".kv", ".kv2"}:
         return keyed_vectors_module.KeyedVectors.load(str(path), mmap="r")
 
-    return keyed_vectors_module.KeyedVectors.load_word2vec_format(
-        str(path), binary=binary
-    )
+    return keyed_vectors_module.KeyedVectors.load_word2vec_format(str(path), binary=binary)
 
 
 def _load_spacy_language(model_name: str) -> Any:
     """Load a spaCy language pipeline by name."""
-
     if importlib.util.find_spec("spacy") is None:
         raise RuntimeError(
             "spaCy is required to use spaCy-backed vector lexicons; install the 'vectors' extra."
@@ -195,7 +190,6 @@ def _load_spacy_language(model_name: str) -> Any:
 
 def _resolve_source(source: Any | None) -> _Adapter | None:
     """Return an adapter instance for ``source`` if possible."""
-
     if source is None:
         return None
 
@@ -228,9 +222,7 @@ def _resolve_source(source: Any | None) -> _Adapter | None:
 
     if suffix in {".kv", ".kv2", ".bin", ".gz", ".txt", ".vec"}:
         binary_flag = False if suffix in {".txt", ".vec"} else None
-        return _GensimAdapter(
-            _load_gensim_vectors(resolved_path, binary=binary_flag)
-        )
+        return _GensimAdapter(_load_gensim_vectors(resolved_path, binary=binary_flag))
 
     if hasattr(source, "most_similar") and hasattr(source, "key_to_index"):
         return _GensimAdapter(source)
@@ -241,38 +233,7 @@ def _resolve_source(source: Any | None) -> _Adapter | None:
     raise RuntimeError("Unsupported vector source supplied to VectorLexicon.")
 
 
-def _load_cache(path: Path) -> dict[str, list[str]]:
-    """Load a synonym cache from ``path`` if it exists."""
-
-    if not path.exists():
-        return {}
-
-    with path.open("r", encoding="utf8") as handle:
-        payload = json.load(handle)
-
-    if not isinstance(payload, Mapping):
-        raise RuntimeError("Synonym cache must be a JSON mapping of strings to lists.")
-
-    cache: dict[str, list[str]] = {}
-    for key, values in payload.items():
-        if not isinstance(key, str):
-            raise RuntimeError("Synonym cache keys must be strings.")
-        if not isinstance(values, Sequence):
-            raise RuntimeError("Synonym cache values must be lists of strings.")
-        cache[key] = [str(value) for value in values]
-
-    return cache
-
-
-def _write_cache(path: Path, cache: Mapping[str, Sequence[str]]) -> None:
-    """Write ``cache`` to ``path`` deterministically."""
-
-    serialisable = {key: list(values) for key, values in sorted(cache.items())}
-    with path.open("w", encoding="utf8") as handle:
-        json.dump(serialisable, handle, ensure_ascii=False, indent=2, sort_keys=True)
-
-
-class VectorLexicon(Lexicon):
+class VectorLexicon(LexiconBackend):
     """Lexicon implementation backed by dense word embeddings."""
 
     def __init__(
@@ -292,9 +253,13 @@ class VectorLexicon(Lexicon):
         self._max_neighbors = max(1, max_neighbors)
         self._min_similarity = min_similarity
         self._cache: MutableMapping[str, list[str]] = {}
+        self._cache_path: Path | None
+        self._cache_checksum: str | None = None
         if cache_path is not None:
             path = Path(cache_path)
-            self._cache.update(_load_cache(path))
+            snapshot = _load_cache_file(path)
+            self._cache.update(snapshot.entries)
+            self._cache_checksum = snapshot.checksum
             self._cache_path = path
         else:
             self._cache_path = None
@@ -384,36 +349,33 @@ class VectorLexicon(Lexicon):
             self._cache_dirty = True
         return synonyms
 
-    def get_synonyms(
-        self, word: str, pos: str | None = None, n: int = 5
-    ) -> list[str]:
+    def get_synonyms(self, word: str, pos: str | None = None, n: int = 5) -> list[str]:
         normalized = self._normalize_for_lookup(word)
         synonyms = self._ensure_cached(original=word, normalized=normalized)
         return self._deterministic_sample(synonyms, limit=n, word=word, pos=pos)
 
     def precompute(self, word: str, *, limit: int | None = None) -> list[str]:
         """Populate the cache for ``word`` and return the stored synonyms."""
-
         normalized = self._normalize_for_lookup(word)
-        return list(
-            self._ensure_cached(original=word, normalized=normalized, limit=limit)
-        )
+        return list(self._ensure_cached(original=word, normalized=normalized, limit=limit))
 
     def iter_vocabulary(self) -> Iterator[str]:
         """Yield vocabulary tokens from the underlying embedding source."""
-
         if self._adapter is None:
             return iter(())
         return self._adapter.iter_keys()
 
     def export_cache(self) -> dict[str, list[str]]:
         """Return a copy of the in-memory synonym cache."""
-
         return {key: list(values) for key, values in self._cache.items()}
 
+    @classmethod
+    def load_cache(cls, path: str | Path) -> CacheSnapshot:
+        """Load and validate a cache file for reuse."""
+        return _load_cache_file(Path(path))
+
     def save_cache(self, path: str | Path | None = None) -> Path:
         """Persist the current cache to disk, returning the path used."""
-
         if path is None:
             if self._cache_path is None:
                 raise RuntimeError("No cache path supplied to VectorLexicon.")
@@ -422,7 +384,8 @@ class VectorLexicon(Lexicon):
             target = Path(path)
         self._cache_path = target
 
-        _write_cache(target, self._cache)
+        snapshot = _write_cache_file(target, self._cache)
+        self._cache_checksum = snapshot.checksum
         self._cache_dirty = False
         return target
 
@@ -449,7 +412,6 @@ def build_vector_cache(
     normalizer: Callable[[str], str] | None = None,
 ) -> Path:
     """Generate a synonym cache for ``words`` using ``source`` embeddings."""
-
     lexicon = VectorLexicon(
         source=source,
         max_neighbors=max_neighbors,
@@ -467,7 +429,6 @@
 
 def load_vector_source(spec: str) -> Any:
     """Resolve ``spec`` strings for the cache-building CLI."""
-
     if spec.startswith("spacy:"):
         model_name = spec.split(":", 1)[1]
         return _load_spacy_language(model_name)
@@ -557,7 +518,6 @@ def _iter_tokens_from_file(path: Path) -> Iterator[str]:
 
 def main(argv: Sequence[str] | None = None) -> int:
     """Entry-point for ``python -m glitchlings.lexicon.vector``."""
-
     args = _parse_cli(argv)
 
     if args.output.exists() and not args.overwrite:
@@ -566,11 +526,13 @@ def main(argv: Sequence[str] | None = None) -> int:
         )
 
     if args.normalizer == "lower":
-        normalizer: Callable[[str], str] | None = (
-            None if args.case_sensitive else str.lower
-        )
+        normalizer: Callable[[str], str] | None = None if args.case_sensitive else str.lower
     else:
-        normalizer = lambda value: value
+
+        def _identity(value: str) -> str:
+            return value
+
+        normalizer = _identity
 
     source = load_vector_source(args.source)
     if args.tokens is not None:
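With this change both cache-backed lexicons expose a `load_cache` classmethod alongside `save_cache`, and both record the snapshot checksum whenever a cache file is read or written. A hypothetical round-trip through that API, where the embeddings file, cache path, and the `source=`/`cache_path=` keywords are examples inferred from the hunks rather than documented usage:

```python
# Hypothetical round-trip through the 0.4.2 cache API; file names and the
# source=/cache_path= keywords are assumptions based on the hunks above.
from pathlib import Path

from glitchlings.lexicon.vector import VectorLexicon

cache_file = Path("vector_synonyms.json")
lexicon = VectorLexicon(source="embeddings.vec", cache_path=cache_file)

lexicon.precompute("glitch")              # populate the in-memory cache
saved_to = lexicon.save_cache()           # writes JSON and stores its checksum

snapshot = VectorLexicon.load_cache(saved_to)  # classmethod added in 0.4.2
print(len(snapshot.entries), snapshot.checksum)
```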
glitchlings/lexicon/wordnet.py CHANGED
@@ -2,38 +2,50 @@
 
 from __future__ import annotations
 
+from importlib import import_module
+from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
-try:  # pragma: no cover - exercised when NLTK is available
-    import nltk  # type: ignore[import]
-except ModuleNotFoundError as exc:  # pragma: no cover - triggered when NLTK missing
-    nltk = None  # type: ignore[assignment]
-    find = None  # type: ignore[assignment]
-    _NLTK_IMPORT_ERROR = exc
-else:  # pragma: no cover - executed when NLTK is present
-    from nltk.corpus.reader import WordNetCorpusReader as _WordNetCorpusReader  # type: ignore[import]
-    from nltk.data import find as _nltk_find  # type: ignore[import]
+from ..compat import nltk as _nltk_dependency
+from . import LexiconBackend
+from ._cache import CacheSnapshot
 
-    find = _nltk_find
-    _NLTK_IMPORT_ERROR = None
+nltk = _nltk_dependency.get()  # type: ignore[assignment]
+_NLTK_IMPORT_ERROR = _nltk_dependency.error
 
 if TYPE_CHECKING:  # pragma: no cover - typing aid only
     from nltk.corpus.reader import WordNetCorpusReader  # type: ignore[import]
 else:  # pragma: no cover - runtime fallback to avoid hard dependency
     WordNetCorpusReader = Any
 
+find: Any | None = None
+_WORDNET_MODULE: Any | None = None
+
 if nltk is not None:  # pragma: no cover - guarded by import success
     try:
-        from nltk.corpus import wordnet as _WORDNET_MODULE  # type: ignore[import]
+        corpus_reader_module = import_module("nltk.corpus.reader")
+        WordNetCorpusReader = corpus_reader_module.WordNetCorpusReader  # type: ignore[assignment]
+    except ModuleNotFoundError as exc:  # pragma: no cover - triggered when corpus missing
+        if _NLTK_IMPORT_ERROR is None:
+            _NLTK_IMPORT_ERROR = exc  # type: ignore[assignment]
+    else:
+        try:
+            data_module = import_module("nltk.data")
+        except ModuleNotFoundError as exc:  # pragma: no cover - triggered when data missing
+            if _NLTK_IMPORT_ERROR is None:
+                _NLTK_IMPORT_ERROR = exc  # type: ignore[assignment]
+        else:
+            find = getattr(data_module, "find", None)
+
+    try:
+        _WORDNET_MODULE = import_module("nltk.corpus.wordnet")
     except ModuleNotFoundError:  # pragma: no cover - only hit on namespace packages
         _WORDNET_MODULE = None
-    else:
-        WordNetCorpusReader = _WordNetCorpusReader  # type: ignore[assignment]
 else:
+    nltk = None  # type: ignore[assignment]
+    find = None
     _WORDNET_MODULE = None
 
-from . import Lexicon
-
 _WORDNET_HANDLE: WordNetCorpusReader | Any | None = _WORDNET_MODULE
 _wordnet_ready = False
 
@@ -42,26 +54,23 @@ _VALID_POS: tuple[str, ...] = ("n", "v", "a", "r")
 
 def _require_nltk() -> None:
     """Ensure the NLTK dependency is present before continuing."""
-
     if nltk is None or find is None:
         message = (
             "The NLTK package is required for WordNet-backed lexicons; install "
             "`nltk` and its WordNet corpus manually to enable this backend."
         )
-        if '_NLTK_IMPORT_ERROR' in globals() and _NLTK_IMPORT_ERROR is not None:
+        if "_NLTK_IMPORT_ERROR" in globals() and _NLTK_IMPORT_ERROR is not None:
            raise RuntimeError(message) from _NLTK_IMPORT_ERROR
        raise RuntimeError(message)
 
 
 def dependencies_available() -> bool:
     """Return ``True`` when the runtime NLTK dependency is present."""
-
     return nltk is not None and find is not None
 
 
 def _load_wordnet_reader() -> WordNetCorpusReader:
     """Return a WordNet corpus reader from the downloaded corpus files."""
-
     _require_nltk()
 
     try:
@@ -80,7 +89,6 @@ def _load_wordnet_reader() -> WordNetCorpusReader:
 
 def _wordnet(force_refresh: bool = False) -> WordNetCorpusReader | Any:
     """Retrieve the active WordNet handle, rebuilding it on demand."""
-
     global _WORDNET_HANDLE
 
     if force_refresh:
@@ -95,7 +103,6 @@ def _wordnet(force_refresh: bool = False) -> WordNetCorpusReader | Any:
 
 def ensure_wordnet() -> None:
     """Ensure the WordNet corpus is available before use."""
-
     global _wordnet_ready
     if _wordnet_ready:
         return
@@ -112,16 +119,13 @@ def ensure_wordnet() -> None:
         resource = _wordnet(force_refresh=True)
         resource.ensure_loaded()
     except LookupError as exc:  # pragma: no cover - only triggered when download fails
-        raise RuntimeError(
-            "Unable to load NLTK WordNet corpus for synonym lookups."
-        ) from exc
+        raise RuntimeError("Unable to load NLTK WordNet corpus for synonym lookups.") from exc
 
     _wordnet_ready = True
 
 
 def _collect_synonyms(word: str, parts_of_speech: tuple[str, ...]) -> list[str]:
     """Gather deterministic synonym candidates for the supplied word."""
-
     normalized_word = word.lower()
     wordnet = _wordnet()
     synonyms: set[str] = set()
@@ -151,12 +155,10 @@ def _collect_synonyms(word: str, parts_of_speech: tuple[str, ...]) -> list[str]:
     return sorted(synonyms)
 
 
-class WordNetLexicon(Lexicon):
+class WordNetLexicon(LexiconBackend):
     """Lexicon that retrieves synonyms from the NLTK WordNet corpus."""
 
-    def get_synonyms(
-        self, word: str, pos: str | None = None, n: int = 5
-    ) -> list[str]:
+    def get_synonyms(self, word: str, pos: str | None = None, n: int = 5) -> list[str]:
         ensure_wordnet()
 
         if pos is None:
@@ -175,6 +177,13 @@ class WordNetLexicon(Lexicon):
             return True
         return pos.lower() in _VALID_POS
 
+    @classmethod
+    def load_cache(cls, path: str | Path) -> CacheSnapshot:
+        raise RuntimeError("WordNetLexicon does not persist or load caches.")
+
+    def save_cache(self, path: str | Path | None = None) -> Path | None:
+        raise RuntimeError("WordNetLexicon does not persist or load caches.")
+
     def __repr__(self) -> str:  # pragma: no cover - trivial representation
         return f"WordNetLexicon(seed={self.seed!r})"
 
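wordnet.py now routes its optional NLTK import through the new `glitchlings/compat.py` module (added in this release, +215 lines in the file list, body not shown here). Only the `.get()`/`.error` surface is visible in the hunk above; a minimal sketch of a lazy-dependency handle with that surface follows, with the class name and caching behaviour being assumptions.

```python
# Hypothetical sketch of the optional-dependency handle that glitchlings/compat.py
# appears to expose; only the .get()/.error surface used in wordnet.py is taken
# from the diff. Everything else is an assumption.
from __future__ import annotations

import importlib
from typing import Any


class OptionalDependency:
    """Import a module lazily, remembering the failure instead of raising."""

    def __init__(self, module_name: str) -> None:
        self._module_name = module_name
        self._module: Any | None = None
        self._attempted = False
        self.error: ModuleNotFoundError | None = None

    def get(self) -> Any | None:
        """Return the imported module, or None when the import failed."""
        if not self._attempted:
            self._attempted = True
            try:
                self._module = importlib.import_module(self._module_name)
            except ModuleNotFoundError as exc:
                self.error = exc
        return self._module


nltk = OptionalDependency("nltk")
```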
glitchlings/main.py CHANGED
@@ -4,16 +4,16 @@ from __future__ import annotations
 
 import argparse
 import difflib
-from pathlib import Path
 import sys
+from pathlib import Path
 
 from . import SAMPLE_TEXT
 from .config import DEFAULT_ATTACK_SEED, build_gaggle, load_attack_config
 from .zoo import (
-    Glitchling,
-    Gaggle,
     BUILTIN_GLITCHLINGS,
     DEFAULT_GLITCHLING_NAMES,
+    Gaggle,
+    Glitchling,
     parse_glitchling_spec,
     summon,
 )
@@ -26,8 +26,8 @@ def build_parser() -> argparse.ArgumentParser:
 
     Returns:
         argparse.ArgumentParser: The configured argument parser instance.
-    """
 
+    """
     parser = argparse.ArgumentParser(
         description=(
             "Summon glitchlings to corrupt text. Provide input text as an argument, "
@@ -157,7 +157,6 @@ def build_lexicon_parser() -> argparse.ArgumentParser:
 
 def list_glitchlings() -> None:
     """Print information about the available built-in glitchlings."""
-
     for key in DEFAULT_GLITCHLING_NAMES:
         glitchling = BUILTIN_GLITCHLINGS[key]
         display_name = glitchling.name
@@ -178,8 +177,8 @@ def read_text(args: argparse.Namespace, parser: argparse.ArgumentParser) -> str:
 
     Raises:
         SystemExit: Raised indirectly via ``parser.error`` on failure.
-    """
 
+    """
     if args.file is not None:
         try:
             return args.file.read_text(encoding="utf-8")
@@ -198,7 +197,8 @@ def read_text(args: argparse.Namespace, parser: argparse.ArgumentParser) -> str:
         return SAMPLE_TEXT
 
     parser.error(
-        "No input text provided. Supply text as an argument, use --file, pipe input, or pass --sample."
+        "No input text provided. Supply text as an argument, use --file, pipe input, or "
+        "pass --sample."
     )
     raise AssertionError("parser.error should exit")
 
@@ -211,7 +211,6 @@ def summon_glitchlings(
     config_path: Path | None = None,
 ) -> Gaggle:
     """Instantiate the requested glitchlings and bundle them in a ``Gaggle``."""
-
     if config_path is not None:
         if names:
             parser.error("Cannot combine --config with --glitchling.")
@@ -245,10 +244,8 @@ def summon_glitchlings(
     raise AssertionError("parser.error should exit")
 
 
-
 def show_diff(original: str, corrupted: str) -> None:
     """Display a unified diff between the original and corrupted text."""
-
     diff_lines = list(
         difflib.unified_diff(
             original.splitlines(keepends=True),
@@ -274,8 +271,8 @@ def run_cli(args: argparse.Namespace, parser: argparse.ArgumentParser) -> int:
 
     Returns:
         int: Exit code for the process (``0`` on success).
-    """
 
+    """
     if args.list:
         list_glitchlings()
         return 0
@@ -300,7 +297,6 @@ def run_cli(args: argparse.Namespace, parser: argparse.ArgumentParser) -> int:
 
 def run_build_lexicon(args: argparse.Namespace) -> int:
     """Delegate to the vector lexicon cache builder using CLI arguments."""
-
     from glitchlings.lexicon.vector import main as vector_main
 
     vector_args = [
@@ -337,8 +333,8 @@ def main(argv: list[str] | None = None) -> int:
 
     Returns:
         int: Exit code suitable for use with ``sys.exit``.
-    """
 
+    """
     if argv is None:
         raw_args = sys.argv[1:]
     else:
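The main.py hunks are limited to import ordering and docstring/message formatting, so the CLI behaviour they touch is unchanged. A hypothetical programmatic invocation of the entry point, with flag names inferred from `args.list` and the `--sample`/`--file` mentions in the error message rather than from the parser definition itself:

```python
# Hypothetical use of the CLI entry point; the --list and --sample flags are
# inferred from args.list and the parser.error message above and may not match
# the real argument names exactly.
from glitchlings.main import main

exit_code = main(["--list"])    # print the built-in glitchlings and exit 0
exit_code = main(["--sample"])  # corrupt the bundled SAMPLE_TEXT
print(exit_code)
```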