glitchlings 0.4.1-cp312-cp312-win_amd64.whl → 0.4.3-cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of glitchlings might be problematic.
- glitchlings/__init__.py +30 -17
- glitchlings/__main__.py +0 -1
- glitchlings/_zoo_rust.cp312-win_amd64.pyd +0 -0
- glitchlings/compat.py +284 -0
- glitchlings/config.py +164 -34
- glitchlings/config.toml +1 -1
- glitchlings/dlc/__init__.py +3 -1
- glitchlings/dlc/_shared.py +68 -0
- glitchlings/dlc/huggingface.py +26 -41
- glitchlings/dlc/prime.py +64 -101
- glitchlings/dlc/pytorch.py +216 -0
- glitchlings/dlc/pytorch_lightning.py +233 -0
- glitchlings/lexicon/__init__.py +12 -33
- glitchlings/lexicon/_cache.py +21 -22
- glitchlings/lexicon/data/default_vector_cache.json +80 -14
- glitchlings/lexicon/metrics.py +1 -8
- glitchlings/lexicon/vector.py +109 -49
- glitchlings/lexicon/wordnet.py +89 -49
- glitchlings/main.py +30 -24
- glitchlings/util/__init__.py +18 -4
- glitchlings/util/adapters.py +27 -0
- glitchlings/zoo/__init__.py +26 -15
- glitchlings/zoo/_ocr_confusions.py +1 -3
- glitchlings/zoo/_rate.py +1 -4
- glitchlings/zoo/_sampling.py +0 -1
- glitchlings/zoo/_text_utils.py +1 -5
- glitchlings/zoo/adjax.py +2 -4
- glitchlings/zoo/apostrofae.py +128 -0
- glitchlings/zoo/assets/__init__.py +0 -0
- glitchlings/zoo/assets/apostrofae_pairs.json +32 -0
- glitchlings/zoo/core.py +152 -87
- glitchlings/zoo/jargoyle.py +50 -45
- glitchlings/zoo/mim1c.py +11 -10
- glitchlings/zoo/redactyl.py +16 -16
- glitchlings/zoo/reduple.py +5 -3
- glitchlings/zoo/rushmore.py +4 -10
- glitchlings/zoo/scannequin.py +7 -6
- glitchlings/zoo/typogre.py +8 -9
- glitchlings/zoo/zeedub.py +6 -3
- {glitchlings-0.4.1.dist-info → glitchlings-0.4.3.dist-info}/METADATA +101 -4
- glitchlings-0.4.3.dist-info/RECORD +46 -0
- glitchlings/lexicon/graph.py +0 -290
- glitchlings-0.4.1.dist-info/RECORD +0 -39
- {glitchlings-0.4.1.dist-info → glitchlings-0.4.3.dist-info}/WHEEL +0 -0
- {glitchlings-0.4.1.dist-info → glitchlings-0.4.3.dist-info}/entry_points.txt +0 -0
- {glitchlings-0.4.1.dist-info → glitchlings-0.4.3.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.4.1.dist-info → glitchlings-0.4.3.dist-info}/top_level.txt +0 -0
glitchlings/lexicon/vector.py CHANGED

@@ -4,19 +4,21 @@ from __future__ import annotations
 
 import argparse
 import importlib
+import importlib.util
 import json
 import math
-from pathlib import Path
 import sys
+from pathlib import Path
 from typing import Any, Callable, Iterable, Iterator, Mapping, MutableMapping, Sequence
 
 from . import LexiconBackend
-from ._cache import CacheSnapshot
+from ._cache import CacheSnapshot
+from ._cache import load_cache as _load_cache_file
+from ._cache import write_cache as _write_cache_file
 
 
 def _cosine_similarity(vector_a: Sequence[float], vector_b: Sequence[float]) -> float:
     """Return the cosine similarity between two dense vectors."""
-
     dot_product = 0.0
     norm_a = 0.0
     norm_b = 0.0
@@ -144,7 +146,6 @@ class _SpaCyAdapter(_Adapter):
 
 def _load_json_vectors(path: Path) -> Mapping[str, Sequence[float]]:
     """Load embeddings from a JSON mapping of token to vector list."""
-
     with path.open("r", encoding="utf8") as handle:
         payload = json.load(handle)
 
@@ -164,11 +165,8 @@ def _load_json_vectors(path: Path) -> Mapping[str, Sequence[float]]:
 
 def _load_gensim_vectors(path: Path, *, binary: bool | None = None) -> Any:
     """Load ``gensim`` vectors from ``path``."""
-
     if importlib.util.find_spec("gensim") is None:
-        raise RuntimeError(
-            "The gensim package is required to load keyed vector embeddings."
-        )
+        raise RuntimeError("The gensim package is required to load keyed vector embeddings.")
 
     keyed_vectors_module = importlib.import_module("gensim.models.keyedvectors")
     if binary is None:
@@ -177,14 +175,11 @@ def _load_gensim_vectors(path: Path, *, binary: bool | None = None) -> Any:
     if path.suffix in {".kv", ".kv2"}:
         return keyed_vectors_module.KeyedVectors.load(str(path), mmap="r")
 
-    return keyed_vectors_module.KeyedVectors.load_word2vec_format(
-        str(path), binary=binary
-    )
+    return keyed_vectors_module.KeyedVectors.load_word2vec_format(str(path), binary=binary)
 
 
 def _load_spacy_language(model_name: str) -> Any:
     """Load a spaCy language pipeline by name."""
-
     if importlib.util.find_spec("spacy") is None:
         raise RuntimeError(
             "spaCy is required to use spaCy-backed vector lexicons; install the 'vectors' extra."
@@ -194,9 +189,60 @@ def _load_spacy_language(model_name: str) -> Any:
     return spacy_module.load(model_name)
 
 
+def _load_sentence_transformer(model_name: str) -> Any:
+    """Return a ``SentenceTransformer`` instance for ``model_name``."""
+
+    if importlib.util.find_spec("sentence_transformers") is None:
+        raise RuntimeError(
+            "sentence-transformers is required for this source; install the 'st' extra."
+        )
+
+    module = importlib.import_module("sentence_transformers")
+    try:
+        model_cls = getattr(module, "SentenceTransformer")
+    except AttributeError as exc:  # pragma: no cover - defensive
+        raise RuntimeError("sentence-transformers does not expose SentenceTransformer") from exc
+
+    return model_cls(model_name)
+
+
+def _build_sentence_transformer_embeddings(
+    model_name: str, tokens: Sequence[str]
+) -> Mapping[str, Sequence[float]]:
+    """Return embeddings for ``tokens`` using ``model_name``."""
+
+    if not tokens:
+        return {}
+
+    model = _load_sentence_transformer(model_name)
+
+    unique_tokens: list[str] = []
+    seen: set[str] = set()
+    for token in tokens:
+        normalized = token.strip()
+        if not normalized or normalized in seen:
+            continue
+        unique_tokens.append(normalized)
+        seen.add(normalized)
+
+    if not unique_tokens:
+        return {}
+
+    embeddings = model.encode(
+        unique_tokens,
+        batch_size=64,
+        normalize_embeddings=True,
+        convert_to_numpy=True,
+    )
+
+    return {
+        token: [float(value) for value in vector]
+        for token, vector in zip(unique_tokens, embeddings, strict=True)
+    }
+
+
 def _resolve_source(source: Any | None) -> _Adapter | None:
     """Return an adapter instance for ``source`` if possible."""
-
     if source is None:
         return None
 
@@ -229,9 +275,7 @@ def _resolve_source(source: Any | None) -> _Adapter | None:
 
     if suffix in {".kv", ".kv2", ".bin", ".gz", ".txt", ".vec"}:
         binary_flag = False if suffix in {".txt", ".vec"} else None
-        return _GensimAdapter(
-            _load_gensim_vectors(resolved_path, binary=binary_flag)
-        )
+        return _GensimAdapter(_load_gensim_vectors(resolved_path, binary=binary_flag))
 
     if hasattr(source, "most_similar") and hasattr(source, "key_to_index"):
         return _GensimAdapter(source)
@@ -257,6 +301,7 @@ class VectorLexicon(LexiconBackend):
         case_sensitive: bool = False,
         seed: int | None = None,
     ) -> None:
+        """Initialise the lexicon with an embedding ``source`` and optional cache."""
        super().__init__(seed=seed)
        self._adapter = _resolve_source(source)
        self._max_neighbors = max(1, max_neighbors)
@@ -358,42 +403,34 @@ class VectorLexicon(LexiconBackend):
         self._cache_dirty = True
         return synonyms
 
-    def get_synonyms(
-        self, word: str, pos: str | None = None, n: int = 5
-    ) -> list[str]:
+    def get_synonyms(self, word: str, pos: str | None = None, n: int = 5) -> list[str]:
+        """Return up to ``n`` deterministic synonyms drawn from the embedding cache."""
         normalized = self._normalize_for_lookup(word)
         synonyms = self._ensure_cached(original=word, normalized=normalized)
         return self._deterministic_sample(synonyms, limit=n, word=word, pos=pos)
 
     def precompute(self, word: str, *, limit: int | None = None) -> list[str]:
         """Populate the cache for ``word`` and return the stored synonyms."""
-
         normalized = self._normalize_for_lookup(word)
-        return list(
-            self._ensure_cached(original=word, normalized=normalized, limit=limit)
-        )
+        return list(self._ensure_cached(original=word, normalized=normalized, limit=limit))
 
     def iter_vocabulary(self) -> Iterator[str]:
         """Yield vocabulary tokens from the underlying embedding source."""
-
         if self._adapter is None:
             return iter(())
         return self._adapter.iter_keys()
 
     def export_cache(self) -> dict[str, list[str]]:
         """Return a copy of the in-memory synonym cache."""
-
         return {key: list(values) for key, values in self._cache.items()}
 
     @classmethod
     def load_cache(cls, path: str | Path) -> CacheSnapshot:
         """Load and validate a cache file for reuse."""
-
         return _load_cache_file(Path(path))
 
     def save_cache(self, path: str | Path | None = None) -> Path:
         """Persist the current cache to disk, returning the path used."""
-
         if path is None:
             if self._cache_path is None:
                 raise RuntimeError("No cache path supplied to VectorLexicon.")
@@ -408,6 +445,7 @@ class VectorLexicon(LexiconBackend):
         return target
 
     def supports_pos(self, pos: str | None) -> bool:
+        """Always return ``True`` because vector sources do not encode POS metadata."""
         return True
 
     def __repr__(self) -> str:  # pragma: no cover - debug helper
@@ -430,7 +468,6 @@ def build_vector_cache(
     normalizer: Callable[[str], str] | None = None,
 ) -> Path:
     """Generate a synonym cache for ``words`` using ``source`` embeddings."""
-
     lexicon = VectorLexicon(
         source=source,
         max_neighbors=max_neighbors,
@@ -448,7 +485,6 @@ def build_vector_cache(
 
 def load_vector_source(spec: str) -> Any:
     """Resolve ``spec`` strings for the cache-building CLI."""
-
     if spec.startswith("spacy:"):
         model_name = spec.split(":", 1)[1]
         return _load_spacy_language(model_name)
@@ -472,7 +508,8 @@ def _parse_cli(argv: Sequence[str] | None = None) -> argparse.Namespace:
         "--source",
         required=True,
         help=(
-            "Vector source specification. Use 'spacy:<model>' for spaCy pipelines "
+            "Vector source specification. Use 'spacy:<model>' for spaCy pipelines, "
+            "'sentence-transformers:<model>' for HuggingFace checkpoints (requires --tokens), "
            "or provide a path to a gensim KeyedVectors/word2vec file."
        ),
    )
@@ -538,7 +575,6 @@ def _iter_tokens_from_file(path: Path) -> Iterator[str]:
 
 def main(argv: Sequence[str] | None = None) -> int:
     """Entry-point for ``python -m glitchlings.lexicon.vector``."""
-
     args = _parse_cli(argv)
 
     if args.output.exists() and not args.overwrite:
@@ -547,28 +583,52 @@ def main(argv: Sequence[str] | None = None) -> int:
         )
 
     if args.normalizer == "lower":
-        normalizer: Callable[[str], str] | None = (
-            None if args.case_sensitive else str.lower
-        )
+        normalizer: Callable[[str], str] | None = None if args.case_sensitive else str.lower
     else:
-        normalizer = lambda value: value
 
-
+        def _identity(value: str) -> str:
+            return value
+
+        normalizer = _identity
+
+    tokens_from_file: list[str] | None = None
     if args.tokens is not None:
-        … (removed line not captured in this view)
+        tokens_from_file = list(_iter_tokens_from_file(args.tokens))
+        if args.limit is not None:
+            tokens_from_file = tokens_from_file[: args.limit]
+
+    source_spec = args.source
+    token_iter: Iterable[str]
+    if source_spec.startswith("sentence-transformers:"):
+        model_name = source_spec.split(":", 1)[1].strip()
+        if not model_name:
+            model_name = "sentence-transformers/all-mpnet-base-v2"
+        if tokens_from_file is None:
+            raise SystemExit(
+                "Sentence-transformers sources require --tokens to supply a vocabulary."
+            )
+        source = _build_sentence_transformer_embeddings(model_name, tokens_from_file)
+        token_iter = tokens_from_file
     else:
-        … (12 removed lines not captured in this view)
+        source = load_vector_source(source_spec)
+        if tokens_from_file is not None:
+            token_iter = tokens_from_file
+        else:
+            lexicon = VectorLexicon(
+                source=source,
+                max_neighbors=args.max_neighbors,
+                min_similarity=args.min_similarity,
+                case_sensitive=args.case_sensitive,
+                normalizer=normalizer,
+                seed=args.seed,
+            )
+            iterator = lexicon.iter_vocabulary()
+            if args.limit is not None:
+                token_iter = (
+                    token for index, token in enumerate(iterator) if index < args.limit
+                )
+            else:
+                token_iter = iterator
 
     build_vector_cache(
         source=source,
glitchlings/lexicon/wordnet.py CHANGED

@@ -2,42 +2,76 @@
 
 from __future__ import annotations
 
-from … (19 removed lines, the previous import and setup block, not captured in this view)
+from importlib import import_module
+from pathlib import Path
+from types import ModuleType
+from typing import Any, Callable, Protocol, Sequence, cast
+
+from ..compat import nltk as _nltk_dependency
+from . import LexiconBackend
+from ._cache import CacheSnapshot
+
+
+class _LemmaProtocol(Protocol):
+    def name(self) -> str:
+        ...
+
+
+class _SynsetProtocol(Protocol):
+    def lemmas(self) -> Sequence[_LemmaProtocol]:
+        ...
+
+
+class _WordNetResource(Protocol):
+    def synsets(self, word: str, pos: str | None = None) -> Sequence[_SynsetProtocol]:
+        ...
+
+    def ensure_loaded(self) -> None:
+        ...
+
+
+WordNetCorpusReaderFactory = Callable[[Any, Any], _WordNetResource]
+
+nltk: ModuleType | None = _nltk_dependency.get()
+_NLTK_IMPORT_ERROR: ModuleNotFoundError | None = _nltk_dependency.error
+
+WordNetCorpusReader: WordNetCorpusReaderFactory | None = None
+find: Callable[[str], Any] | None = None
+_WORDNET_MODULE: _WordNetResource | None = None
 
 if nltk is not None:  # pragma: no cover - guarded by import success
     try:
-        … (removed line not captured in this view)
+        corpus_reader_module = import_module("nltk.corpus.reader")
+    except ModuleNotFoundError as exc:  # pragma: no cover - triggered when corpus missing
+        if _NLTK_IMPORT_ERROR is None:
+            _NLTK_IMPORT_ERROR = exc
+    else:
+        reader_candidate = getattr(corpus_reader_module, "WordNetCorpusReader", None)
+        if reader_candidate is not None:
+            WordNetCorpusReader = cast(WordNetCorpusReaderFactory, reader_candidate)
+
+    try:
+        data_module = import_module("nltk.data")
+    except ModuleNotFoundError as exc:  # pragma: no cover - triggered when data missing
+        if _NLTK_IMPORT_ERROR is None:
+            _NLTK_IMPORT_ERROR = exc
+    else:
+        locator = getattr(data_module, "find", None)
+        if callable(locator):
+            find = cast(Callable[[str], Any], locator)
+
+    try:
+        module_candidate = import_module("nltk.corpus.wordnet")
     except ModuleNotFoundError:  # pragma: no cover - only hit on namespace packages
         _WORDNET_MODULE = None
     else:
-        … (removed line not captured in this view)
+        _WORDNET_MODULE = cast(_WordNetResource, module_candidate)
 else:
+    nltk = None
+    find = None
     _WORDNET_MODULE = None
 
-
-
-from . import LexiconBackend
-from ._cache import CacheSnapshot
-
-_WORDNET_HANDLE: WordNetCorpusReader | Any | None = _WORDNET_MODULE
+_WORDNET_HANDLE: _WordNetResource | None = _WORDNET_MODULE
 _wordnet_ready = False
 
 _VALID_POS: tuple[str, ...] = ("n", "v", "a", "r")
@@ -45,33 +79,37 @@ _VALID_POS: tuple[str, ...] = ("n", "v", "a", "r")
 
 def _require_nltk() -> None:
     """Ensure the NLTK dependency is present before continuing."""
-
     if nltk is None or find is None:
         message = (
             "The NLTK package is required for WordNet-backed lexicons; install "
             "`nltk` and its WordNet corpus manually to enable this backend."
         )
-        if _NLTK_IMPORT_ERROR is not None:
+        if "_NLTK_IMPORT_ERROR" in globals() and _NLTK_IMPORT_ERROR is not None:
             raise RuntimeError(message) from _NLTK_IMPORT_ERROR
         raise RuntimeError(message)
 
 
 def dependencies_available() -> bool:
     """Return ``True`` when the runtime NLTK dependency is present."""
-
     return nltk is not None and find is not None
 
 
-def _load_wordnet_reader() -> WordNetCorpusReader:
+def _load_wordnet_reader() -> _WordNetResource:
     """Return a WordNet corpus reader from the downloaded corpus files."""
-
     _require_nltk()
 
+    if WordNetCorpusReader is None:
+        raise RuntimeError("The NLTK WordNet corpus reader is unavailable.")
+
+    locator = find
+    if locator is None:
+        raise RuntimeError("The NLTK data locator is unavailable.")
+
     try:
-        root = find("corpora/wordnet")
+        root = locator("corpora/wordnet")
     except LookupError:
         try:
-            zip_root = find("corpora/wordnet.zip")
+            zip_root = locator("corpora/wordnet.zip")
         except LookupError as exc:
             raise RuntimeError(
                 "The NLTK WordNet corpus is not installed; run `nltk.download('wordnet')`."
@@ -81,24 +119,24 @@ def _load_wordnet_reader() -> WordNetCorpusReader:
     return WordNetCorpusReader(root, None)
 
 
-def _wordnet(force_refresh: bool = False) -> …
+def _wordnet(force_refresh: bool = False) -> _WordNetResource:
     """Retrieve the active WordNet handle, rebuilding it on demand."""
-
     global _WORDNET_HANDLE
 
     if force_refresh:
         _WORDNET_HANDLE = _WORDNET_MODULE
 
-    … (2 removed lines not captured in this view)
+    cached = _WORDNET_HANDLE
+    if cached is not None:
+        return cached
 
-    … (2 removed lines not captured in this view)
+    resource = _load_wordnet_reader()
+    _WORDNET_HANDLE = resource
+    return resource
 
 
 def ensure_wordnet() -> None:
     """Ensure the WordNet corpus is available before use."""
-
     global _wordnet_ready
     if _wordnet_ready:
         return
@@ -106,25 +144,25 @@ def ensure_wordnet() -> None:
     _require_nltk()
 
     resource = _wordnet()
+    nltk_module = nltk
+    if nltk_module is None:
+        raise RuntimeError("The NLTK dependency is unexpectedly unavailable.")
 
     try:
         resource.ensure_loaded()
     except LookupError:
-        nltk.download("wordnet", quiet=True)
+        nltk_module.download("wordnet", quiet=True)
         try:
             resource = _wordnet(force_refresh=True)
             resource.ensure_loaded()
         except LookupError as exc:  # pragma: no cover - only triggered when download fails
-            raise RuntimeError(
-                "Unable to load NLTK WordNet corpus for synonym lookups."
-            ) from exc
+            raise RuntimeError("Unable to load NLTK WordNet corpus for synonym lookups.") from exc
 
     _wordnet_ready = True
 
 
 def _collect_synonyms(word: str, parts_of_speech: tuple[str, ...]) -> list[str]:
     """Gather deterministic synonym candidates for the supplied word."""
-
     normalized_word = word.lower()
     wordnet = _wordnet()
     synonyms: set[str] = set()
@@ -157,9 +195,8 @@ def _collect_synonyms(word: str, parts_of_speech: tuple[str, ...]) -> list[str]:
 class WordNetLexicon(LexiconBackend):
     """Lexicon that retrieves synonyms from the NLTK WordNet corpus."""
 
-    def get_synonyms(
-        self, word: str, pos: str | None = None, n: int = 5
-    ) -> list[str]:
+    def get_synonyms(self, word: str, pos: str | None = None, n: int = 5) -> list[str]:
+        """Return up to ``n`` WordNet lemmas for ``word`` filtered by ``pos`` if provided."""
         ensure_wordnet()
 
         if pos is None:
@@ -174,15 +211,18 @@ class WordNetLexicon(LexiconBackend):
         return self._deterministic_sample(synonyms, limit=n, word=word, pos=pos)
 
     def supports_pos(self, pos: str | None) -> bool:
+        """Return ``True`` when ``pos`` is unset or recognised by the WordNet corpus."""
         if pos is None:
             return True
         return pos.lower() in _VALID_POS
 
     @classmethod
     def load_cache(cls, path: str | Path) -> CacheSnapshot:
+        """WordNet lexicons do not persist caches; raising keeps the contract explicit."""
         raise RuntimeError("WordNetLexicon does not persist or load caches.")
 
     def save_cache(self, path: str | Path | None = None) -> Path | None:
+        """WordNet lexicons do not persist caches; raising keeps the contract explicit."""
        raise RuntimeError("WordNetLexicon does not persist or load caches.")
 
    def __repr__(self) -> str:  # pragma: no cover - trivial representation