PyPI - glitchlings - Versions diffs - 0.4.1__cp311-cp311-macosx_11_0_universal2.whl → 0.4.2__cp311-cp311-macosx_11_0_universal2.whl - Mend

glitchlings 0.4.1__cp311-cp311-macosx_11_0_universal2.whl → 0.4.2__cp311-cp311-macosx_11_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of glitchlings might be problematic. Click here for more details.

Files changed (39)

glitchlings/__init__.py +26 -17
glitchlings/__main__.py +0 -1
glitchlings/_zoo_rust.cpython-311-darwin.so +0 -0
glitchlings/compat.py +215 -0
glitchlings/config.py +136 -19
glitchlings/dlc/_shared.py +68 -0
glitchlings/dlc/huggingface.py +26 -41
glitchlings/dlc/prime.py +64 -101
glitchlings/lexicon/__init__.py +8 -19
glitchlings/lexicon/_cache.py +0 -7
glitchlings/lexicon/graph.py +4 -12
glitchlings/lexicon/metrics.py +1 -8
glitchlings/lexicon/vector.py +15 -34
glitchlings/lexicon/wordnet.py +31 -32
glitchlings/main.py +9 -13
glitchlings/util/__init__.py +18 -4
glitchlings/util/adapters.py +27 -0
glitchlings/zoo/__init__.py +21 -14
glitchlings/zoo/_ocr_confusions.py +1 -3
glitchlings/zoo/_rate.py +1 -4
glitchlings/zoo/_sampling.py +0 -1
glitchlings/zoo/_text_utils.py +1 -5
glitchlings/zoo/adjax.py +0 -2
glitchlings/zoo/core.py +114 -75
glitchlings/zoo/jargoyle.py +9 -14
glitchlings/zoo/mim1c.py +11 -10
glitchlings/zoo/redactyl.py +5 -8
glitchlings/zoo/reduple.py +3 -1
glitchlings/zoo/rushmore.py +2 -8
glitchlings/zoo/scannequin.py +5 -4
glitchlings/zoo/typogre.py +3 -7
glitchlings/zoo/zeedub.py +2 -2
{glitchlings-0.4.1.dist-info → glitchlings-0.4.2.dist-info}/METADATA +67 -3
glitchlings-0.4.2.dist-info/RECORD +42 -0
glitchlings-0.4.1.dist-info/RECORD +0 -39
{glitchlings-0.4.1.dist-info → glitchlings-0.4.2.dist-info}/WHEEL +0 -0
{glitchlings-0.4.1.dist-info → glitchlings-0.4.2.dist-info}/entry_points.txt +0 -0
{glitchlings-0.4.1.dist-info → glitchlings-0.4.2.dist-info}/licenses/LICENSE +0 -0
{glitchlings-0.4.1.dist-info → glitchlings-0.4.2.dist-info}/top_level.txt +0 -0

glitchlings/dlc/huggingface.py CHANGED Viewed

@@ -3,21 +3,15 @@
 from __future__ import annotations
 from collections.abc import Iterable, Sequence
-from typing import Any
+from typing import Any, cast
-try:  # pragma: no cover - optional dependency is required at runtime
-    from datasets import Dataset as _DatasetsDataset
-except ModuleNotFoundError as _datasets_error:  # pragma: no cover - optional dependency
-    _DatasetsDataset = None  # type: ignore[assignment]
-else:
-    _datasets_error = None
-from ..zoo import Gaggle, Glitchling, summon
+from ..compat import datasets, get_datasets_dataset, require_datasets
+from ..util.adapters import coerce_gaggle
+from ..zoo import Gaggle, Glitchling
 def _normalise_columns(column: str | Sequence[str]) -> list[str]:
     """Normalise a column specification to a list."""
     if isinstance(column, str):
         return [column]
@@ -27,20 +21,6 @@ def _normalise_columns(column: str | Sequence[str]) -> list[str]:
     return normalised
-def _as_gaggle(glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling], seed: int) -> Gaggle:
-    """Coerce any supported glitchling specification into a :class:`Gaggle`."""
-    if isinstance(glitchlings, Gaggle):
-        return glitchlings
-    if isinstance(glitchlings, (Glitchling, str)):
-        resolved: Iterable[str | Glitchling] = [glitchlings]
-    else:
-        resolved = glitchlings
-    return summon(list(resolved), seed=seed)
 def _glitch_dataset(
     dataset: Any,
     glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling],
@@ -48,23 +28,28 @@ def _glitch_dataset(
     *,
     seed: int = 151,
 ) -> Any:
-    """Internal helper implementing :meth:`Dataset.glitch`."""
+    """Apply glitchlings to the provided dataset columns."""
     columns = _normalise_columns(column)
-    gaggle = _as_gaggle(glitchlings, seed=seed)
+    gaggle = coerce_gaggle(glitchlings, seed=seed)
     return gaggle.corrupt_dataset(dataset, columns)
 def _ensure_dataset_class() -> Any:
     """Return the Hugging Face :class:`~datasets.Dataset` patched with ``.glitch``."""
-    if _DatasetsDataset is None:  # pragma: no cover - datasets is an install-time dependency
-        message = "datasets is not installed"
-        raise ModuleNotFoundError(message) from _datasets_error
-    if getattr(_DatasetsDataset, "glitch", None) is None:
-        def glitch(  # type: ignore[override]
+    dataset_cls = get_datasets_dataset()
+    if dataset_cls is None:  # pragma: no cover - datasets is an install-time dependency
+        require_datasets("datasets is not installed")
+        dataset_cls = get_datasets_dataset()
+        if dataset_cls is None:
+            message = "datasets is not installed"
+            error = datasets.error
+            if error is not None:
+                raise ModuleNotFoundError(message) from error
+            raise ModuleNotFoundError(message)
+    if getattr(dataset_cls, "glitch", None) is None:
+        def glitch(
             self: Any,
             glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling],
             *,
@@ -73,24 +58,24 @@ def _ensure_dataset_class() -> Any:
             **_: Any,
         ) -> Any:
             """Return a lazily corrupted copy of the dataset."""
             return _glitch_dataset(self, glitchlings, column, seed=seed)
-        setattr(_DatasetsDataset, "glitch", glitch)
+        setattr(dataset_cls, "glitch", glitch)
-    return _DatasetsDataset
+    return cast(type[Any], dataset_cls)
 def install() -> None:
     """Monkeypatch the Hugging Face :class:`~datasets.Dataset` with ``.glitch``."""
     _ensure_dataset_class()
-if _DatasetsDataset is not None:
+Dataset: type[Any] | None
+_DatasetAlias = get_datasets_dataset()
+if _DatasetAlias is not None:
     Dataset = _ensure_dataset_class()
 else:  # pragma: no cover - datasets is an install-time dependency
-    Dataset = None  # type: ignore[assignment]
+    Dataset = None
 __all__ = ["Dataset", "install"]

glitchlings/dlc/prime.py CHANGED Viewed

@@ -4,79 +4,60 @@ from __future__ import annotations
 from collections.abc import Iterable, Sequence
 from enum import Enum
-from typing import Any, Callable
+from typing import Any, Callable, Protocol, cast
-import verifiers as vf
+from ..compat import require_datasets, require_jellyfish, require_verifiers
+from ..util.adapters import coerce_gaggle
+from ..zoo import Gaggle, Glitchling, Mim1c, Typogre
+from ._shared import resolve_columns as _resolve_columns_shared
+from ._shared import resolve_environment as _resolve_environment_shared
-from jellyfish import damerau_levenshtein_distance
-try:
-    from .huggingface import Dataset
-except ModuleNotFoundError:  # pragma: no cover - optional dependency
-    Dataset = object  # type: ignore[assignment]
-else:
-    if Dataset is None:  # pragma: no cover - optional dependency
-        Dataset = object  # type: ignore[assignment]
+class VerifierEnvironment(Protocol):
+    """Minimal interface for verifiers environments."""
-from ..zoo import Gaggle, Glitchling, Mim1c, Typogre, summon
+    dataset: Any
-def _resolve_environment(env: str | vf.Environment) -> vf.Environment:
-    """Return a fully-instantiated verifier environment."""
-    if isinstance(env, str):
-        env = vf.load_environment(env)
+class VerifierSingleTurnEnv(Protocol):
+    """Minimal interface for single-turn verifier environments."""
-    if not isinstance(env, vf.Environment):
-        raise TypeError("Invalid environment type")
+    dataset: Any
+    rubric: Any
-    return env
+vf = require_verifiers("verifiers is not installed; install glitchlings[prime]")
+_jellyfish = require_jellyfish("jellyfish is not installed; install glitchlings[prime]")
+damerau_levenshtein_distance = _jellyfish.damerau_levenshtein_distance
-def _resolve_columns(dataset: Dataset, columns: Sequence[str] | None) -> list[str]:
-    """Identify which dataset columns should be corrupted."""
-    available = set(dataset.column_names)
+try:
+    from .huggingface import Dataset as _HuggingFaceDataset
+except ModuleNotFoundError:  # pragma: no cover - optional dependency
+    _HuggingFaceDataset = None
+else:
+    if _HuggingFaceDataset is None:  # pragma: no cover - optional dependency
+        _HuggingFaceDataset = None
-    if columns is not None:
-        missing = sorted(set(columns) - available)
-        if missing:
-            missing_str = ", ".join(missing)
-            raise ValueError(f"Columns not found in dataset: {missing_str}")
-        return list(columns)
+Dataset: type[Any]
+if _HuggingFaceDataset is None:
+    Dataset = object
+else:
+    Dataset = _HuggingFaceDataset
-    for candidate in ("prompt", "question"):
-        if candidate in available:
-            return [candidate]
-    try:
-        dataset_length = len(dataset)  # type: ignore[arg-type]
-    except TypeError:
-        preview_rows: list[dict[str, Any]]
-        take_fn = getattr(dataset, "take", None)
-        if callable(take_fn):
-            preview_rows = list(take_fn(1))
-        else:
-            iterator = iter(dataset)
-            try:
-                first_row = next(iterator)
-            except StopIteration:
-                preview_rows = []
-            else:
-                preview_rows = [first_row]
-        sample = dict(preview_rows[0]) if preview_rows else {}
-    else:
-        sample = dataset[0] if dataset_length else {}
-    inferred = [
-        name
-        for name in dataset.column_names
-        if isinstance(sample.get(name), str)
-    ]
+def _resolve_environment(env: str | VerifierEnvironment) -> VerifierEnvironment:
+    """Return a fully-instantiated verifier environment."""
+    resolved = _resolve_environment_shared(
+        env,
+        loader=vf.load_environment,
+        environment_type=cast(type[Any], vf.Environment),
+    )
+    return cast(VerifierEnvironment, resolved)
-    if inferred:
-        return inferred
-    raise ValueError("Unable to determine which dataset columns to corrupt.")
+def _resolve_columns(dataset: Any, columns: Sequence[str] | None) -> list[str]:
+    """Identify which dataset columns should be corrupted."""
+    return _resolve_columns_shared(dataset, columns)
 class Difficulty(Enum):
@@ -90,12 +71,11 @@ class Difficulty(Enum):
 def tutorial_level(
-    env: vf.Environment | str,
+    env: VerifierEnvironment | str,
     seed: int = 151,
     difficulty: Difficulty = Difficulty.Normal,
-) -> vf.Environment:
+) -> VerifierEnvironment:
     """Create a low-corruption environment using tuned defaults."""
     tuned_mim1c = Mim1c(rate=0.01 * difficulty.value)
     tuned_typogre = Typogre(rate=0.025 * difficulty.value)
@@ -107,28 +87,19 @@ def tutorial_level(
 def load_environment(
-    env: str | vf.Environment,
+    env: str | VerifierEnvironment,
     glitchlings: Iterable[str | Glitchling] | Glitchling | str | Gaggle | None = None,
     *,
     seed: int = 151,
     columns: Sequence[str] | None = None,
-) -> vf.Environment:
+) -> VerifierEnvironment:
     """Load an environment and optionally corrupt it with glitchlings."""
     environment = _resolve_environment(env)
     if glitchlings is None:
         return environment
-    if isinstance(glitchlings, Gaggle):
-        gaggle = glitchlings
-    else:
-        if isinstance(glitchlings, (Glitchling, str)):
-            resolved = [glitchlings]
-        else:
-            resolved = list(glitchlings)
-        gaggle = summon(resolved, seed=seed)
+    gaggle = coerce_gaggle(glitchlings, seed=seed)
     dataset = environment.dataset
     corrupt_columns = _resolve_columns(dataset, columns)
@@ -142,21 +113,11 @@ def _as_gaggle(
     seed: int,
 ) -> Gaggle:
     """Coerce any supported glitchling specification into a :class:`Gaggle`."""
-    if isinstance(glitchlings, Gaggle):
-        return glitchlings
-    if isinstance(glitchlings, (Glitchling, str)):
-        resolved: Iterable[str | Glitchling] = [glitchlings]
-    else:
-        resolved = glitchlings
-    return summon(list(resolved), seed=seed)
+    return coerce_gaggle(glitchlings, seed=seed)
 def _extract_completion_text(completion: Any) -> str:
     """Normalise a completion payload into a plain string."""
     if isinstance(completion, str):
         return completion
@@ -175,11 +136,10 @@ def symmetric_damerau_levenshtein_similarity(
     answer: str,
 ) -> float:
     """Return ``1 - (distance / max_len)`` using Damerau-Levenshtein distance."""
     completion_text = _extract_completion_text(completion)
     target = answer or ""
     denominator = max(len(completion_text), len(target), 1)
-    distance = damerau_levenshtein_distance(completion_text, target)
+    distance = cast(int, damerau_levenshtein_distance(completion_text, target))
     score = 1.0 - (distance / denominator)
     return max(0.0, min(1.0, score))
@@ -199,32 +159,34 @@ def echo_chamber(
     reward_function: Callable[..., float] | None = None,
     split: str | None = None,
     **load_dataset_kwargs: Any,
-) -> vf.Environment:
+) -> VerifierSingleTurnEnv:
     """Create an Echo Chamber Prime environment from a Hugging Face dataset column.
     Args:
         dataset_id: Identifier of the Hugging Face dataset to load.
         column: Name of the column whose text should be glitched.
         glitchlings: Glitchling specifiers that will corrupt the prompts.
-        seed: RNG seed forwarded to :func:`summon`.
+        seed: RNG seed forwarded to :func:`glitchlings.util.adapters.coerce_gaggle`.
         instructions: System instructions supplied to the environment prompts.
         reward_function: Optional callable used to score completions. Defaults to
             :func:`symmetric_damerau_levenshtein_similarity` when omitted.
         split: Optional dataset split to load.
         **load_dataset_kwargs: Extra keyword arguments forwarded to
             :func:`datasets.load_dataset`.
-    """
-    try:
-        from datasets import Dataset as HFDataset, DatasetDict, load_dataset
-    except ModuleNotFoundError as exc:  # pragma: no cover - optional dependency
+    """
+    datasets_module = require_datasets("datasets is required to build an echo chamber")
+    load_dataset = getattr(datasets_module, "load_dataset", None)
+    if load_dataset is None:  # pragma: no cover - defensive
         message = "datasets is required to build an echo chamber"
-        raise ModuleNotFoundError(message) from exc
+        raise ModuleNotFoundError(message)
-    hf_dataset: HFDataset | DatasetDict
+    dataset_dict_cls = getattr(datasets_module, "DatasetDict", dict)
+    hf_dataset: Any
     if split is None:
         hf_dataset = load_dataset(dataset_id, **load_dataset_kwargs)
-        if isinstance(hf_dataset, DatasetDict):
+        if isinstance(hf_dataset, dataset_dict_cls):
             try:
                 hf_dataset = next(iter(hf_dataset.values()))
             except StopIteration as exc:  # pragma: no cover - defensive
@@ -232,10 +194,8 @@ def echo_chamber(
     else:
         hf_dataset = load_dataset(dataset_id, split=split, **load_dataset_kwargs)
-    if isinstance(hf_dataset, DatasetDict):
-        raise ValueError(
-            "Specify which split to use when the dataset loads as a DatasetDict."
-        )
+    if isinstance(hf_dataset, dataset_dict_cls):
+        raise ValueError("Specify which split to use when the dataset loads as a DatasetDict.")
     filtered_dataset = hf_dataset.filter(
         lambda row: row.get(column) is not None,
@@ -259,7 +219,7 @@ def echo_chamber(
     )
     try:
-        dataset_length = len(base_dataset)  # type: ignore[arg-type]
+        dataset_length = len(base_dataset)
     except TypeError:
         preview_rows: list[dict[str, Any]]
         take_fn = getattr(base_dataset, "take", None)
@@ -288,4 +248,7 @@ def echo_chamber(
     rubric_func = reward_function or symmetric_damerau_levenshtein_similarity
     rubric = vf.Rubric(funcs=[rubric_func], weights=[1.0])
-    return vf.SingleTurnEnv(dataset=glitched_dataset, rubric=rubric)
+    return cast(
+        VerifierSingleTurnEnv,
+        vf.SingleTurnEnv(dataset=glitched_dataset, rubric=rubric),
+    )

glitchlings/lexicon/__init__.py CHANGED Viewed

@@ -2,13 +2,14 @@
 from __future__ import annotations
+import random
 from abc import ABC, abstractmethod
 from hashlib import blake2s
 from pathlib import Path
-import random
 from typing import Callable, Iterable
 from glitchlings.config import get_config
 from ._cache import CacheEntries, CacheSnapshot
@@ -21,6 +22,7 @@ class Lexicon(ABC):
         Optional integer used to derive deterministic random number generators
         for synonym sampling. Identical seeds guarantee reproducible results for
         the same word/part-of-speech queries.
     """
     def __init__(self, *, seed: int | None = None) -> None:
@@ -29,17 +31,14 @@ class Lexicon(ABC):
     @property
     def seed(self) -> int | None:
         """Return the current base seed used for deterministic sampling."""
         return self._seed
     def reseed(self, seed: int | None) -> None:
         """Update the base seed driving deterministic synonym sampling."""
         self._seed = seed
     def _derive_rng(self, word: str, pos: str | None) -> random.Random:
         """Return an RNG derived from the base seed, word, and POS tag."""
         seed_material = blake2s(digest_size=8)
         seed_material.update(word.lower().encode("utf8"))
         if pos is not None:
@@ -53,7 +52,6 @@ class Lexicon(ABC):
         self, values: Iterable[str], *, limit: int, word: str, pos: str | None
     ) -> list[str]:
         """Return up to ``limit`` values sampled deterministically."""
         if limit <= 0:
             return []
@@ -67,14 +65,11 @@ class Lexicon(ABC):
         return [items[index] for index in indices]
     @abstractmethod
-    def get_synonyms(
-        self, word: str, pos: str | None = None, n: int = 5
-    ) -> list[str]:
+    def get_synonyms(self, word: str, pos: str | None = None, n: int = 5) -> list[str]:
         """Return up to ``n`` synonyms for ``word`` constrained by ``pos``."""
     def supports_pos(self, pos: str | None) -> bool:
         """Return ``True`` when the backend can service ``pos`` queries."""
         return True
     def __repr__(self) -> str:  # pragma: no cover - trivial representation
@@ -96,14 +91,14 @@ class LexiconBackend(Lexicon):
         """Persist the backend cache to ``path`` and return the destination."""
-from .graph import GraphLexicon
-from .metrics import (
+from .graph import GraphLexicon  # noqa: E402
+from .metrics import (  # noqa: E402
     compare_lexicons,
     coverage_ratio,
     mean_cosine_similarity,
     synonym_diversity,
 )
-from .vector import VectorLexicon, build_vector_cache
+from .vector import VectorLexicon, build_vector_cache  # noqa: E402
 try:  # pragma: no cover - optional dependency
     from .wordnet import WordNetLexicon
@@ -114,24 +109,19 @@ except Exception:  # pragma: no cover - triggered when nltk unavailable
 _BACKEND_FACTORIES: dict[str, Callable[[int | None], Lexicon | None]] = {}
-def register_backend(
-    name: str, factory: Callable[[int | None], Lexicon | None]
-) -> None:
+def register_backend(name: str, factory: Callable[[int | None], Lexicon | None]) -> None:
     """Register ``factory`` for ``name`` so it can be selected via config."""
     normalized = name.lower()
     _BACKEND_FACTORIES[normalized] = factory
 def unregister_backend(name: str) -> None:
     """Remove a previously registered backend."""
     _BACKEND_FACTORIES.pop(name.lower(), None)
 def available_backends() -> list[str]:
     """Return the names of registered lexicon factories."""
     return sorted(_BACKEND_FACTORIES)
@@ -172,7 +162,6 @@ register_backend("wordnet", _wordnet_backend)
 def get_default_lexicon(seed: int | None = None) -> Lexicon:
     """Return the first available lexicon according to configuration priority."""
     config = get_config()
     attempts: list[str] = []
     for name in config.lexicon.priority:

glitchlings/lexicon/_cache.py CHANGED Viewed

@@ -8,7 +8,6 @@ from hashlib import blake2s
 from pathlib import Path
 from typing import Mapping, Sequence
 CacheEntries = dict[str, list[str]]
@@ -22,7 +21,6 @@ class CacheSnapshot:
 def _normalise_entries(payload: Mapping[str, Sequence[str]]) -> CacheEntries:
     """Convert raw cache payloads into canonical mapping form."""
     entries: CacheEntries = {}
     for key, values in payload.items():
         if not isinstance(key, str):
@@ -35,21 +33,18 @@ def _normalise_entries(payload: Mapping[str, Sequence[str]]) -> CacheEntries:
 def _canonical_json(entries: Mapping[str, Sequence[str]]) -> str:
     """Return a deterministic JSON serialisation for ``entries``."""
     serialisable = {key: list(values) for key, values in sorted(entries.items())}
     return json.dumps(serialisable, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
 def compute_checksum(entries: Mapping[str, Sequence[str]]) -> str:
     """Return a BLAKE2s checksum for ``entries``."""
     digest = blake2s(_canonical_json(entries).encode("utf8"), digest_size=16)
     return digest.hexdigest()
 def load_cache(path: Path) -> CacheSnapshot:
     """Load a cache from ``path`` and verify its checksum if present."""
     if not path.exists():
         return CacheSnapshot(entries={}, checksum=None)
@@ -89,7 +84,6 @@ def load_cache(path: Path) -> CacheSnapshot:
 def write_cache(path: Path, entries: Mapping[str, Sequence[str]]) -> CacheSnapshot:
     """Persist ``entries`` to ``path`` with checksum metadata."""
     serialisable = {key: list(values) for key, values in sorted(entries.items())}
     checksum = compute_checksum(serialisable)
     payload = {
@@ -108,4 +102,3 @@ def write_cache(path: Path, entries: Mapping[str, Sequence[str]]) -> CacheSnapsh
 __all__ = ["CacheEntries", "CacheSnapshot", "compute_checksum", "load_cache", "write_cache"]

glitchlings/lexicon/graph.py CHANGED Viewed

@@ -7,17 +7,17 @@ from pathlib import Path
 from typing import Iterable, Mapping, MutableMapping, Sequence
 from . import LexiconBackend
-from ._cache import CacheSnapshot, load_cache as _load_cache_file, write_cache as _write_cache_file
+from ._cache import CacheSnapshot
+from ._cache import load_cache as _load_cache_file
+from ._cache import write_cache as _write_cache_file
 from .vector import VectorLexicon
 _CONCEPT_RE = re.compile(r"^/c/(?P<lang>[a-z]{2})/(?P<term>[^/]+)")
 _PUNCTUATION_RE = re.compile(r"[^\w\s-]+", re.UNICODE)
 def _lemmatize_token(token: str) -> str:
     """Return a lightweight lemma for ``token`` using heuristic rules."""
     irregular = {
         "children": "child",
         "mice": "mouse",
@@ -60,7 +60,6 @@ def _lemmatize_token(token: str) -> str:
 def _normalize_phrase(phrase: str) -> str:
     """Normalise ``phrase`` for ConceptNet lookups."""
     stripped = _PUNCTUATION_RE.sub(" ", phrase.lower())
     tokens = [token for token in stripped.split() if token]
     if not tokens:
@@ -71,7 +70,6 @@ def _normalize_phrase(phrase: str) -> str:
 def _concept_terms(normalized: str) -> list[str]:
     """Return ConceptNet term variants for ``normalized``."""
     collapsed = normalized.replace(" ", "_")
     if not collapsed:
         return []
@@ -83,7 +81,6 @@ def _concept_terms(normalized: str) -> list[str]:
 def _surface_from_concept(concept: str) -> str | None:
     """Return a human-readable surface form for ``concept``."""
     match = _CONCEPT_RE.match(concept)
     if match is None:
         return None
@@ -102,7 +99,6 @@ def _language_from_concept(concept: str) -> str | None:
 def _load_numberbatch(path: Path, *, languages: set[str]) -> Mapping[str, list[float]]:
     """Load ConceptNet Numberbatch embeddings from ``path``."""
     if not path.exists():
         return {}
@@ -240,9 +236,7 @@ class GraphLexicon(LexiconBackend):
             self._cache_dirty = True
         return synonyms
-    def get_synonyms(
-        self, word: str, pos: str | None = None, n: int = 5
-    ) -> list[str]:
+    def get_synonyms(self, word: str, pos: str | None = None, n: int = 5) -> list[str]:
         normalized = _normalize_phrase(word)
         if not normalized:
             return []
@@ -261,7 +255,6 @@ class GraphLexicon(LexiconBackend):
     @classmethod
     def load_cache(cls, path: str | Path) -> CacheSnapshot:
         """Load and validate a persisted ConceptNet cache file."""
         return _load_cache_file(Path(path))
     def save_cache(self, path: str | Path | None = None) -> Path:
@@ -287,4 +280,3 @@ class GraphLexicon(LexiconBackend):
             f"GraphLexicon(languages={sorted(self._languages)!r}, "
             f"max_neighbors={self._max_neighbors}, seed={self.seed!r}, state={state})"
         )

glitchlings/lexicon/metrics.py CHANGED Viewed

@@ -18,7 +18,6 @@ def _unique_synonyms(
     sample_size: int,
 ) -> list[str]:
     """Return unique synonym candidates excluding the original token."""
     collected: list[str] = []
     seen: set[str] = set()
     source = word.lower()
@@ -41,7 +40,6 @@ def synonym_diversity(
     sample_size: int = 5,
 ) -> float:
     """Return the mean unique-synonym count for ``words`` using ``lexicon``."""
     totals = []
     for word in words:
         synonyms = _unique_synonyms(lexicon, word, pos=pos, sample_size=sample_size)
@@ -60,7 +58,6 @@ def coverage_ratio(
     min_synonyms: int = 3,
 ) -> float:
     """Return the fraction of ``words`` with at least ``min_synonyms`` candidates."""
     total = 0
     hits = 0
     for word in words:
@@ -96,7 +93,6 @@ def mean_cosine_similarity(
     sample_size: int = 5,
 ) -> float:
     """Return the mean cosine similarity between each word and its candidates."""
     total = 0.0
     count = 0
     for word in words:
@@ -126,11 +122,8 @@ def compare_lexicons(
     embeddings: Mapping[str, Sequence[float]] | None = None,
 ) -> dict[str, float]:
     """Return comparative coverage and diversity statistics for two lexicons."""
     stats = {
-        "baseline_diversity": synonym_diversity(
-            baseline, words, pos=pos, sample_size=sample_size
-        ),
+        "baseline_diversity": synonym_diversity(baseline, words, pos=pos, sample_size=sample_size),
         "candidate_diversity": synonym_diversity(
             candidate, words, pos=pos, sample_size=sample_size
         ),