glitchlings 0.4.0-cp312-cp312-macosx_11_0_universal2.whl → 0.4.2-cp312-cp312-macosx_11_0_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of glitchlings has been flagged by the registry for review.
- glitchlings/__init__.py +26 -17
- glitchlings/__main__.py +0 -1
- glitchlings/_zoo_rust.cpython-312-darwin.so +0 -0
- glitchlings/compat.py +215 -0
- glitchlings/config.py +136 -19
- glitchlings/dlc/_shared.py +68 -0
- glitchlings/dlc/huggingface.py +26 -41
- glitchlings/dlc/prime.py +64 -101
- glitchlings/lexicon/__init__.py +26 -19
- glitchlings/lexicon/_cache.py +104 -0
- glitchlings/lexicon/graph.py +18 -39
- glitchlings/lexicon/metrics.py +1 -8
- glitchlings/lexicon/vector.py +29 -67
- glitchlings/lexicon/wordnet.py +39 -30
- glitchlings/main.py +9 -13
- glitchlings/util/__init__.py +18 -4
- glitchlings/util/adapters.py +27 -0
- glitchlings/zoo/__init__.py +21 -14
- glitchlings/zoo/_ocr_confusions.py +1 -3
- glitchlings/zoo/_rate.py +1 -4
- glitchlings/zoo/_sampling.py +0 -1
- glitchlings/zoo/_text_utils.py +1 -5
- glitchlings/zoo/adjax.py +0 -2
- glitchlings/zoo/core.py +185 -56
- glitchlings/zoo/jargoyle.py +9 -14
- glitchlings/zoo/mim1c.py +11 -10
- glitchlings/zoo/redactyl.py +5 -8
- glitchlings/zoo/reduple.py +3 -1
- glitchlings/zoo/rushmore.py +2 -8
- glitchlings/zoo/scannequin.py +5 -4
- glitchlings/zoo/typogre.py +3 -7
- glitchlings/zoo/zeedub.py +2 -2
- {glitchlings-0.4.0.dist-info → glitchlings-0.4.2.dist-info}/METADATA +68 -4
- glitchlings-0.4.2.dist-info/RECORD +42 -0
- glitchlings-0.4.0.dist-info/RECORD +0 -38
- {glitchlings-0.4.0.dist-info → glitchlings-0.4.2.dist-info}/WHEEL +0 -0
- {glitchlings-0.4.0.dist-info → glitchlings-0.4.2.dist-info}/entry_points.txt +0 -0
- {glitchlings-0.4.0.dist-info → glitchlings-0.4.2.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.4.0.dist-info → glitchlings-0.4.2.dist-info}/top_level.txt +0 -0
glitchlings/lexicon/graph.py
CHANGED
@@ -2,22 +2,22 @@
 
 from __future__ import annotations
 
-import json
 import re
 from pathlib import Path
 from typing import Iterable, Mapping, MutableMapping, Sequence
 
-from . import Lexicon
+from . import LexiconBackend
+from ._cache import CacheSnapshot
+from ._cache import load_cache as _load_cache_file
+from ._cache import write_cache as _write_cache_file
 from .vector import VectorLexicon
 
-
 _CONCEPT_RE = re.compile(r"^/c/(?P<lang>[a-z]{2})/(?P<term>[^/]+)")
 _PUNCTUATION_RE = re.compile(r"[^\w\s-]+", re.UNICODE)
 
 
 def _lemmatize_token(token: str) -> str:
     """Return a lightweight lemma for ``token`` using heuristic rules."""
-
     irregular = {
         "children": "child",
         "mice": "mouse",
@@ -60,7 +60,6 @@ def _lemmatize_token(token: str) -> str:
 
 def _normalize_phrase(phrase: str) -> str:
     """Normalise ``phrase`` for ConceptNet lookups."""
-
     stripped = _PUNCTUATION_RE.sub(" ", phrase.lower())
     tokens = [token for token in stripped.split() if token]
     if not tokens:
@@ -71,7 +70,6 @@ def _normalize_phrase(phrase: str) -> str:
 
 def _concept_terms(normalized: str) -> list[str]:
     """Return ConceptNet term variants for ``normalized``."""
-
     collapsed = normalized.replace(" ", "_")
     if not collapsed:
         return []
@@ -83,7 +81,6 @@ def _concept_terms(normalized: str) -> list[str]:
 
 def _surface_from_concept(concept: str) -> str | None:
     """Return a human-readable surface form for ``concept``."""
-
     match = _CONCEPT_RE.match(concept)
     if match is None:
         return None
@@ -102,7 +99,6 @@ def _language_from_concept(concept: str) -> str | None:
 
 def _load_numberbatch(path: Path, *, languages: set[str]) -> Mapping[str, list[float]]:
     """Load ConceptNet Numberbatch embeddings from ``path``."""
-
     if not path.exists():
         return {}
 
@@ -140,30 +136,7 @@ def _load_numberbatch(path: Path, *, languages: set[str]) -> Mapping[str, list[f
     return embeddings
 
 
-
-    if not path.exists():
-        return {}
-    with path.open("r", encoding="utf8") as handle:
-        payload = json.load(handle)
-    if not isinstance(payload, Mapping):
-        raise RuntimeError("Graph lexicon cache must be a mapping of strings to lists.")
-    cache: dict[str, list[str]] = {}
-    for key, values in payload.items():
-        if not isinstance(key, str):
-            raise RuntimeError("Graph lexicon cache keys must be strings.")
-        if not isinstance(values, Sequence):
-            raise RuntimeError("Graph lexicon cache values must be sequences of strings.")
-        cache[key] = [str(value) for value in values]
-    return cache
-
-
-def _write_cache(path: Path, cache: Mapping[str, Sequence[str]]) -> None:
-    serialisable = {key: list(values) for key, values in sorted(cache.items())}
-    with path.open("w", encoding="utf8") as handle:
-        json.dump(serialisable, handle, ensure_ascii=False, indent=2, sort_keys=True)
-
-
-class GraphLexicon(Lexicon):
+class GraphLexicon(LexiconBackend):
     """Lexicon backed by ConceptNet/Numberbatch embeddings."""
 
     def __init__(
@@ -184,9 +157,12 @@ class GraphLexicon(Lexicon):
         self._max_neighbors = max(1, max_neighbors)
         self._min_similarity = min_similarity
         self._cache: MutableMapping[str, list[str]] = {}
-        self._cache_path = Path(cache_path) if cache_path is not None else None
+        self._cache_path: Path | None = Path(cache_path) if cache_path is not None else None
+        self._cache_checksum: str | None = None
         if self._cache_path is not None:
-
+            snapshot = _load_cache_file(self._cache_path)
+            self._cache.update(snapshot.entries)
+            self._cache_checksum = snapshot.checksum
         if cache is not None:
             for key, values in cache.items():
                 self._cache[str(key)] = [str(value) for value in values]
@@ -260,9 +236,7 @@
         self._cache_dirty = True
         return synonyms
 
-    def get_synonyms(
-        self, word: str, pos: str | None = None, n: int = 5
-    ) -> list[str]:
+    def get_synonyms(self, word: str, pos: str | None = None, n: int = 5) -> list[str]:
         normalized = _normalize_phrase(word)
         if not normalized:
             return []
@@ -278,6 +252,11 @@
     def export_cache(self) -> dict[str, list[str]]:
         return {key: list(values) for key, values in self._cache.items()}
 
+    @classmethod
+    def load_cache(cls, path: str | Path) -> CacheSnapshot:
+        """Load and validate a persisted ConceptNet cache file."""
+        return _load_cache_file(Path(path))
+
     def save_cache(self, path: str | Path | None = None) -> Path:
         if path is None:
             if self._cache_path is None:
@@ -286,7 +265,8 @@
         else:
             target = Path(path)
         self._cache_path = target
-
+        snapshot = _write_cache_file(target, self._cache)
+        self._cache_checksum = snapshot.checksum
         self._cache_dirty = False
         return target
 
@@ -300,4 +280,3 @@
             f"GraphLexicon(languages={sorted(self._languages)!r}, "
             f"max_neighbors={self._max_neighbors}, seed={self.seed!r}, state={state})"
         )
-
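The graph.py hunks above replace the module-private JSON cache helpers with the shared glitchlings/lexicon/_cache.py module (added in this release) and rebase the class on LexiconBackend. A minimal round-trip sketch of the new cache API, based only on the signatures visible in these hunks; the constructor keyword shown is taken from the __init__ hunk, and any other required constructor arguments are assumptions:

    from glitchlings.lexicon.graph import GraphLexicon

    # Hypothetical usage; cache_path appears in the __init__ hunk above, but other
    # constructor arguments may be required in practice.
    lexicon = GraphLexicon(cache_path="conceptnet_cache.json")
    lexicon.get_synonyms("cat", n=5)             # populates the in-memory synonym cache
    saved = lexicon.save_cache()                 # writes the cache and records its checksum

    snapshot = GraphLexicon.load_cache(saved)    # new classmethod returning a CacheSnapshot
    print(snapshot.checksum)                     # checksum recorded when the cache was written
    print(sorted(snapshot.entries))              # validated mapping of word -> synonyms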
glitchlings/lexicon/metrics.py
CHANGED
@@ -18,7 +18,6 @@ def _unique_synonyms(
     sample_size: int,
 ) -> list[str]:
     """Return unique synonym candidates excluding the original token."""
-
     collected: list[str] = []
     seen: set[str] = set()
     source = word.lower()
@@ -41,7 +40,6 @@ def synonym_diversity(
     sample_size: int = 5,
 ) -> float:
     """Return the mean unique-synonym count for ``words`` using ``lexicon``."""
-
     totals = []
     for word in words:
         synonyms = _unique_synonyms(lexicon, word, pos=pos, sample_size=sample_size)
@@ -60,7 +58,6 @@ def coverage_ratio(
     min_synonyms: int = 3,
 ) -> float:
     """Return the fraction of ``words`` with at least ``min_synonyms`` candidates."""
-
     total = 0
     hits = 0
     for word in words:
@@ -96,7 +93,6 @@ def mean_cosine_similarity(
     sample_size: int = 5,
 ) -> float:
     """Return the mean cosine similarity between each word and its candidates."""
-
     total = 0.0
     count = 0
     for word in words:
@@ -126,11 +122,8 @@ def compare_lexicons(
     embeddings: Mapping[str, Sequence[float]] | None = None,
 ) -> dict[str, float]:
     """Return comparative coverage and diversity statistics for two lexicons."""
-
     stats = {
-        "baseline_diversity": synonym_diversity(
-            baseline, words, pos=pos, sample_size=sample_size
-        ),
+        "baseline_diversity": synonym_diversity(baseline, words, pos=pos, sample_size=sample_size),
         "candidate_diversity": synonym_diversity(
             candidate, words, pos=pos, sample_size=sample_size
         ),
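The metrics.py changes are cosmetic (blank lines after docstrings dropped, one call folded onto a single line), so the evaluation helpers keep their signatures. A hedged sketch of how they might be combined; the leading (lexicon, words) parameters are inferred from the hunk headers above, and the backend constructors are assumptions:

    from glitchlings.lexicon.graph import GraphLexicon
    from glitchlings.lexicon.metrics import compare_lexicons, coverage_ratio, synonym_diversity
    from glitchlings.lexicon.wordnet import WordNetLexicon

    words = ["cat", "river", "bright"]
    baseline = WordNetLexicon()                                   # assumed no-argument construction
    candidate = GraphLexicon(cache_path="conceptnet_cache.json")  # assumed constructor keyword

    print(synonym_diversity(candidate, words, sample_size=5))     # mean unique-synonym count
    print(coverage_ratio(candidate, words, min_synonyms=3))       # fraction with >= 3 candidates
    print(compare_lexicons(baseline, candidate, words))           # comparative statistics dict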
glitchlings/lexicon/vector.py
CHANGED
@@ -6,16 +6,18 @@ import argparse
 import importlib
 import json
 import math
-from pathlib import Path
 import sys
+from pathlib import Path
 from typing import Any, Callable, Iterable, Iterator, Mapping, MutableMapping, Sequence
 
-from . import Lexicon
+from . import LexiconBackend
+from ._cache import CacheSnapshot
+from ._cache import load_cache as _load_cache_file
+from ._cache import write_cache as _write_cache_file
 
 
 def _cosine_similarity(vector_a: Sequence[float], vector_b: Sequence[float]) -> float:
     """Return the cosine similarity between two dense vectors."""
-
     dot_product = 0.0
     norm_a = 0.0
     norm_b = 0.0
@@ -143,7 +145,6 @@ class _SpaCyAdapter(_Adapter):
 
 def _load_json_vectors(path: Path) -> Mapping[str, Sequence[float]]:
     """Load embeddings from a JSON mapping of token to vector list."""
-
     with path.open("r", encoding="utf8") as handle:
         payload = json.load(handle)
 
@@ -163,11 +164,8 @@
 
 def _load_gensim_vectors(path: Path, *, binary: bool | None = None) -> Any:
     """Load ``gensim`` vectors from ``path``."""
-
     if importlib.util.find_spec("gensim") is None:
-        raise RuntimeError(
-            "The gensim package is required to load keyed vector embeddings."
-        )
+        raise RuntimeError("The gensim package is required to load keyed vector embeddings.")
 
     keyed_vectors_module = importlib.import_module("gensim.models.keyedvectors")
     if binary is None:
@@ -176,14 +174,11 @@
     if path.suffix in {".kv", ".kv2"}:
         return keyed_vectors_module.KeyedVectors.load(str(path), mmap="r")
 
-    return keyed_vectors_module.KeyedVectors.load_word2vec_format(
-        str(path), binary=binary
-    )
+    return keyed_vectors_module.KeyedVectors.load_word2vec_format(str(path), binary=binary)
 
 
 def _load_spacy_language(model_name: str) -> Any:
     """Load a spaCy language pipeline by name."""
-
     if importlib.util.find_spec("spacy") is None:
         raise RuntimeError(
             "spaCy is required to use spaCy-backed vector lexicons; install the 'vectors' extra."
@@ -195,7 +190,6 @@ def _load_spacy_language(model_name: str) -> Any:
 
 def _resolve_source(source: Any | None) -> _Adapter | None:
     """Return an adapter instance for ``source`` if possible."""
-
     if source is None:
         return None
 
@@ -228,9 +222,7 @@
 
     if suffix in {".kv", ".kv2", ".bin", ".gz", ".txt", ".vec"}:
         binary_flag = False if suffix in {".txt", ".vec"} else None
-        return _GensimAdapter(
-            _load_gensim_vectors(resolved_path, binary=binary_flag)
-        )
+        return _GensimAdapter(_load_gensim_vectors(resolved_path, binary=binary_flag))
 
     if hasattr(source, "most_similar") and hasattr(source, "key_to_index"):
         return _GensimAdapter(source)
@@ -241,38 +233,7 @@
     raise RuntimeError("Unsupported vector source supplied to VectorLexicon.")
 
 
-
-    """Load a synonym cache from ``path`` if it exists."""
-
-    if not path.exists():
-        return {}
-
-    with path.open("r", encoding="utf8") as handle:
-        payload = json.load(handle)
-
-    if not isinstance(payload, Mapping):
-        raise RuntimeError("Synonym cache must be a JSON mapping of strings to lists.")
-
-    cache: dict[str, list[str]] = {}
-    for key, values in payload.items():
-        if not isinstance(key, str):
-            raise RuntimeError("Synonym cache keys must be strings.")
-        if not isinstance(values, Sequence):
-            raise RuntimeError("Synonym cache values must be lists of strings.")
-        cache[key] = [str(value) for value in values]
-
-    return cache
-
-
-def _write_cache(path: Path, cache: Mapping[str, Sequence[str]]) -> None:
-    """Write ``cache`` to ``path`` deterministically."""
-
-    serialisable = {key: list(values) for key, values in sorted(cache.items())}
-    with path.open("w", encoding="utf8") as handle:
-        json.dump(serialisable, handle, ensure_ascii=False, indent=2, sort_keys=True)
-
-
-class VectorLexicon(Lexicon):
+class VectorLexicon(LexiconBackend):
     """Lexicon implementation backed by dense word embeddings."""
 
     def __init__(
@@ -292,9 +253,13 @@ class VectorLexicon(Lexicon):
         self._max_neighbors = max(1, max_neighbors)
         self._min_similarity = min_similarity
         self._cache: MutableMapping[str, list[str]] = {}
+        self._cache_path: Path | None
+        self._cache_checksum: str | None = None
         if cache_path is not None:
             path = Path(cache_path)
-
+            snapshot = _load_cache_file(path)
+            self._cache.update(snapshot.entries)
+            self._cache_checksum = snapshot.checksum
             self._cache_path = path
         else:
             self._cache_path = None
@@ -384,36 +349,33 @@
         self._cache_dirty = True
         return synonyms
 
-    def get_synonyms(
-        self, word: str, pos: str | None = None, n: int = 5
-    ) -> list[str]:
+    def get_synonyms(self, word: str, pos: str | None = None, n: int = 5) -> list[str]:
         normalized = self._normalize_for_lookup(word)
         synonyms = self._ensure_cached(original=word, normalized=normalized)
         return self._deterministic_sample(synonyms, limit=n, word=word, pos=pos)
 
     def precompute(self, word: str, *, limit: int | None = None) -> list[str]:
         """Populate the cache for ``word`` and return the stored synonyms."""
-
         normalized = self._normalize_for_lookup(word)
-        return list(
-            self._ensure_cached(original=word, normalized=normalized, limit=limit)
-        )
+        return list(self._ensure_cached(original=word, normalized=normalized, limit=limit))
 
     def iter_vocabulary(self) -> Iterator[str]:
         """Yield vocabulary tokens from the underlying embedding source."""
-
         if self._adapter is None:
             return iter(())
         return self._adapter.iter_keys()
 
     def export_cache(self) -> dict[str, list[str]]:
         """Return a copy of the in-memory synonym cache."""
-
         return {key: list(values) for key, values in self._cache.items()}
 
+    @classmethod
+    def load_cache(cls, path: str | Path) -> CacheSnapshot:
+        """Load and validate a cache file for reuse."""
+        return _load_cache_file(Path(path))
+
     def save_cache(self, path: str | Path | None = None) -> Path:
         """Persist the current cache to disk, returning the path used."""
-
         if path is None:
             if self._cache_path is None:
                 raise RuntimeError("No cache path supplied to VectorLexicon.")
@@ -422,7 +384,8 @@
             target = Path(path)
         self._cache_path = target
 
-
+        snapshot = _write_cache_file(target, self._cache)
+        self._cache_checksum = snapshot.checksum
         self._cache_dirty = False
         return target
 
@@ -449,7 +412,6 @@ def build_vector_cache(
     normalizer: Callable[[str], str] | None = None,
 ) -> Path:
     """Generate a synonym cache for ``words`` using ``source`` embeddings."""
-
     lexicon = VectorLexicon(
         source=source,
         max_neighbors=max_neighbors,
@@ -467,7 +429,6 @@
 
 def load_vector_source(spec: str) -> Any:
     """Resolve ``spec`` strings for the cache-building CLI."""
-
     if spec.startswith("spacy:"):
         model_name = spec.split(":", 1)[1]
         return _load_spacy_language(model_name)
@@ -557,7 +518,6 @@ def _iter_tokens_from_file(path: Path) -> Iterator[str]:
 
 def main(argv: Sequence[str] | None = None) -> int:
     """Entry-point for ``python -m glitchlings.lexicon.vector``."""
-
     args = _parse_cli(argv)
 
     if args.output.exists() and not args.overwrite:
@@ -566,11 +526,13 @@
     )
 
     if args.normalizer == "lower":
-        normalizer: Callable[[str], str] | None = (
-            None if args.case_sensitive else str.lower
-        )
+        normalizer: Callable[[str], str] | None = None if args.case_sensitive else str.lower
     else:
-
+
+        def _identity(value: str) -> str:
+            return value
+
+        normalizer = _identity
 
     source = load_vector_source(args.source)
     if args.tokens is not None:
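vector.py gets the same treatment as graph.py: the private _load_cache/_write_cache JSON helpers are removed in favour of the shared _cache module, and VectorLexicon now records a cache checksum alongside the entries. A sketch of the resulting round trip, under stated assumptions; whether a plain token-to-vector mapping is accepted as ``source``, and the exact constructor keywords, are not confirmed by the diff:

    from glitchlings.lexicon.vector import VectorLexicon

    # Assumed: an in-memory mapping works as an embedding source.
    embeddings = {
        "glitch": [0.1, 0.9, 0.0],
        "bug": [0.2, 0.8, 0.1],
        "error": [0.3, 0.7, 0.2],
    }
    lexicon = VectorLexicon(source=embeddings, max_neighbors=10, cache_path="syn_cache.json")
    lexicon.precompute("glitch")                  # fill the in-memory cache for one word
    print(lexicon.get_synonyms("glitch", n=2))
    lexicon.save_cache()                          # persists the cache and records its checksum

    snapshot = VectorLexicon.load_cache("syn_cache.json")   # CacheSnapshot with entries/checksum
    print(snapshot.checksum, len(snapshot.entries))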
glitchlings/lexicon/wordnet.py
CHANGED
@@ -2,38 +2,50 @@
 
 from __future__ import annotations
 
+from importlib import import_module
+from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
-
-
-
-    nltk = None  # type: ignore[assignment]
-    find = None  # type: ignore[assignment]
-    _NLTK_IMPORT_ERROR = exc
-else:  # pragma: no cover - executed when NLTK is present
-    from nltk.corpus.reader import WordNetCorpusReader as _WordNetCorpusReader  # type: ignore[import]
-    from nltk.data import find as _nltk_find  # type: ignore[import]
+from ..compat import nltk as _nltk_dependency
+from . import LexiconBackend
+from ._cache import CacheSnapshot
 
-
-
+nltk = _nltk_dependency.get()  # type: ignore[assignment]
+_NLTK_IMPORT_ERROR = _nltk_dependency.error
 
 if TYPE_CHECKING:  # pragma: no cover - typing aid only
     from nltk.corpus.reader import WordNetCorpusReader  # type: ignore[import]
 else:  # pragma: no cover - runtime fallback to avoid hard dependency
     WordNetCorpusReader = Any
 
+find: Any | None = None
+_WORDNET_MODULE: Any | None = None
+
 if nltk is not None:  # pragma: no cover - guarded by import success
     try:
-
+        corpus_reader_module = import_module("nltk.corpus.reader")
+        WordNetCorpusReader = corpus_reader_module.WordNetCorpusReader  # type: ignore[assignment]
+    except ModuleNotFoundError as exc:  # pragma: no cover - triggered when corpus missing
+        if _NLTK_IMPORT_ERROR is None:
+            _NLTK_IMPORT_ERROR = exc  # type: ignore[assignment]
+    else:
+        try:
+            data_module = import_module("nltk.data")
+        except ModuleNotFoundError as exc:  # pragma: no cover - triggered when data missing
+            if _NLTK_IMPORT_ERROR is None:
+                _NLTK_IMPORT_ERROR = exc  # type: ignore[assignment]
+        else:
+            find = getattr(data_module, "find", None)
+
+    try:
+        _WORDNET_MODULE = import_module("nltk.corpus.wordnet")
     except ModuleNotFoundError:  # pragma: no cover - only hit on namespace packages
         _WORDNET_MODULE = None
-    else:
-        WordNetCorpusReader = _WordNetCorpusReader  # type: ignore[assignment]
 else:
+    nltk = None  # type: ignore[assignment]
+    find = None
     _WORDNET_MODULE = None
 
-from . import Lexicon
-
 _WORDNET_HANDLE: WordNetCorpusReader | Any | None = _WORDNET_MODULE
 _wordnet_ready = False
 
@@ -42,26 +54,23 @@ _VALID_POS: tuple[str, ...] = ("n", "v", "a", "r")
 
 def _require_nltk() -> None:
     """Ensure the NLTK dependency is present before continuing."""
-
     if nltk is None or find is None:
         message = (
             "The NLTK package is required for WordNet-backed lexicons; install "
             "`nltk` and its WordNet corpus manually to enable this backend."
         )
-        if _NLTK_IMPORT_ERROR is not None:
+        if "_NLTK_IMPORT_ERROR" in globals() and _NLTK_IMPORT_ERROR is not None:
             raise RuntimeError(message) from _NLTK_IMPORT_ERROR
         raise RuntimeError(message)
 
 
 def dependencies_available() -> bool:
     """Return ``True`` when the runtime NLTK dependency is present."""
-
     return nltk is not None and find is not None
 
 
 def _load_wordnet_reader() -> WordNetCorpusReader:
     """Return a WordNet corpus reader from the downloaded corpus files."""
-
     _require_nltk()
 
     try:
@@ -80,7 +89,6 @@ def _load_wordnet_reader() -> WordNetCorpusReader:
 
 def _wordnet(force_refresh: bool = False) -> WordNetCorpusReader | Any:
     """Retrieve the active WordNet handle, rebuilding it on demand."""
-
     global _WORDNET_HANDLE
 
     if force_refresh:
@@ -95,7 +103,6 @@ def _wordnet(force_refresh: bool = False) -> WordNetCorpusReader | Any:
 
 def ensure_wordnet() -> None:
     """Ensure the WordNet corpus is available before use."""
-
     global _wordnet_ready
     if _wordnet_ready:
         return
@@ -112,16 +119,13 @@ def ensure_wordnet() -> None:
         resource = _wordnet(force_refresh=True)
         resource.ensure_loaded()
     except LookupError as exc:  # pragma: no cover - only triggered when download fails
-        raise RuntimeError(
-            "Unable to load NLTK WordNet corpus for synonym lookups."
-        ) from exc
+        raise RuntimeError("Unable to load NLTK WordNet corpus for synonym lookups.") from exc
 
     _wordnet_ready = True
 
 
 def _collect_synonyms(word: str, parts_of_speech: tuple[str, ...]) -> list[str]:
     """Gather deterministic synonym candidates for the supplied word."""
-
     normalized_word = word.lower()
     wordnet = _wordnet()
     synonyms: set[str] = set()
@@ -151,12 +155,10 @@ def _collect_synonyms(word: str, parts_of_speech: tuple[str, ...]) -> list[str]:
     return sorted(synonyms)
 
 
-class WordNetLexicon(Lexicon):
+class WordNetLexicon(LexiconBackend):
     """Lexicon that retrieves synonyms from the NLTK WordNet corpus."""
 
-    def get_synonyms(
-        self, word: str, pos: str | None = None, n: int = 5
-    ) -> list[str]:
+    def get_synonyms(self, word: str, pos: str | None = None, n: int = 5) -> list[str]:
         ensure_wordnet()
 
         if pos is None:
@@ -175,6 +177,13 @@ class WordNetLexicon(Lexicon):
             return True
         return pos.lower() in _VALID_POS
 
+    @classmethod
+    def load_cache(cls, path: str | Path) -> CacheSnapshot:
+        raise RuntimeError("WordNetLexicon does not persist or load caches.")
+
+    def save_cache(self, path: str | Path | None = None) -> Path | None:
+        raise RuntimeError("WordNetLexicon does not persist or load caches.")
+
     def __repr__(self) -> str:  # pragma: no cover - trivial representation
         return f"WordNetLexicon(seed={self.seed!r})"
 
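wordnet.py now routes the optional NLTK import through the new glitchlings/compat.py module and adds stub cache hooks so the backend satisfies the LexiconBackend interface without persisting anything. A hedged usage sketch; the no-argument constructor is an assumption not shown in the diff:

    from glitchlings.lexicon import wordnet

    if wordnet.dependencies_available():
        lexicon = wordnet.WordNetLexicon()        # assumed no-argument construction
        print(lexicon.get_synonyms("quick", pos="a", n=5))
    else:
        print("Install nltk and its WordNet corpus to enable this backend.")

    # The cache hooks added in this release exist only to satisfy the interface:
    # WordNetLexicon.load_cache(...) and save_cache(...) raise RuntimeError.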
glitchlings/main.py
CHANGED
@@ -4,16 +4,16 @@ from __future__ import annotations
 
 import argparse
 import difflib
-from pathlib import Path
 import sys
+from pathlib import Path
 
 from . import SAMPLE_TEXT
 from .config import DEFAULT_ATTACK_SEED, build_gaggle, load_attack_config
 from .zoo import (
-    Glitchling,
-    Gaggle,
     BUILTIN_GLITCHLINGS,
     DEFAULT_GLITCHLING_NAMES,
+    Gaggle,
+    Glitchling,
     parse_glitchling_spec,
     summon,
 )
@@ -26,8 +26,8 @@ def build_parser() -> argparse.ArgumentParser:
 
     Returns:
         argparse.ArgumentParser: The configured argument parser instance.
-    """
 
+    """
     parser = argparse.ArgumentParser(
         description=(
             "Summon glitchlings to corrupt text. Provide input text as an argument, "
@@ -157,7 +157,6 @@ def build_lexicon_parser() -> argparse.ArgumentParser:
 
 def list_glitchlings() -> None:
     """Print information about the available built-in glitchlings."""
-
     for key in DEFAULT_GLITCHLING_NAMES:
         glitchling = BUILTIN_GLITCHLINGS[key]
         display_name = glitchling.name
@@ -178,8 +177,8 @@ def read_text(args: argparse.Namespace, parser: argparse.ArgumentParser) -> str:
 
     Raises:
         SystemExit: Raised indirectly via ``parser.error`` on failure.
-    """
 
+    """
     if args.file is not None:
         try:
             return args.file.read_text(encoding="utf-8")
@@ -198,7 +197,8 @@ def read_text(args: argparse.Namespace, parser: argparse.ArgumentParser) -> str:
         return SAMPLE_TEXT
 
     parser.error(
-        "No input text provided. Supply text as an argument, use --file, pipe input, or pass --sample."
+        "No input text provided. Supply text as an argument, use --file, pipe input, or "
+        "pass --sample."
     )
     raise AssertionError("parser.error should exit")
 
@@ -211,7 +211,6 @@ def summon_glitchlings(
     config_path: Path | None = None,
 ) -> Gaggle:
     """Instantiate the requested glitchlings and bundle them in a ``Gaggle``."""
-
     if config_path is not None:
         if names:
             parser.error("Cannot combine --config with --glitchling.")
@@ -245,10 +244,8 @@
         raise AssertionError("parser.error should exit")
 
 
-
 def show_diff(original: str, corrupted: str) -> None:
     """Display a unified diff between the original and corrupted text."""
-
     diff_lines = list(
         difflib.unified_diff(
             original.splitlines(keepends=True),
@@ -274,8 +271,8 @@ def run_cli(args: argparse.Namespace, parser: argparse.ArgumentParser) -> int:
 
     Returns:
         int: Exit code for the process (``0`` on success).
-    """
 
+    """
     if args.list:
         list_glitchlings()
         return 0
@@ -300,7 +297,6 @@
 
 def run_build_lexicon(args: argparse.Namespace) -> int:
     """Delegate to the vector lexicon cache builder using CLI arguments."""
-
    from glitchlings.lexicon.vector import main as vector_main

    vector_args = [
@@ -337,8 +333,8 @@ def main(argv: list[str] | None = None) -> int:
 
     Returns:
         int: Exit code suitable for use with ``sys.exit``.
-    """
 
+    """
     if argv is None:
         raw_args = sys.argv[1:]
     else: