glitchlings 0.4.1-cp311-cp311-macosx_11_0_universal2.whl → 0.4.2-cp311-cp311-macosx_11_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


Files changed (39)
  1. glitchlings/__init__.py +26 -17
  2. glitchlings/__main__.py +0 -1
  3. glitchlings/_zoo_rust.cpython-311-darwin.so +0 -0
  4. glitchlings/compat.py +215 -0
  5. glitchlings/config.py +136 -19
  6. glitchlings/dlc/_shared.py +68 -0
  7. glitchlings/dlc/huggingface.py +26 -41
  8. glitchlings/dlc/prime.py +64 -101
  9. glitchlings/lexicon/__init__.py +8 -19
  10. glitchlings/lexicon/_cache.py +0 -7
  11. glitchlings/lexicon/graph.py +4 -12
  12. glitchlings/lexicon/metrics.py +1 -8
  13. glitchlings/lexicon/vector.py +15 -34
  14. glitchlings/lexicon/wordnet.py +31 -32
  15. glitchlings/main.py +9 -13
  16. glitchlings/util/__init__.py +18 -4
  17. glitchlings/util/adapters.py +27 -0
  18. glitchlings/zoo/__init__.py +21 -14
  19. glitchlings/zoo/_ocr_confusions.py +1 -3
  20. glitchlings/zoo/_rate.py +1 -4
  21. glitchlings/zoo/_sampling.py +0 -1
  22. glitchlings/zoo/_text_utils.py +1 -5
  23. glitchlings/zoo/adjax.py +0 -2
  24. glitchlings/zoo/core.py +114 -75
  25. glitchlings/zoo/jargoyle.py +9 -14
  26. glitchlings/zoo/mim1c.py +11 -10
  27. glitchlings/zoo/redactyl.py +5 -8
  28. glitchlings/zoo/reduple.py +3 -1
  29. glitchlings/zoo/rushmore.py +2 -8
  30. glitchlings/zoo/scannequin.py +5 -4
  31. glitchlings/zoo/typogre.py +3 -7
  32. glitchlings/zoo/zeedub.py +2 -2
  33. {glitchlings-0.4.1.dist-info → glitchlings-0.4.2.dist-info}/METADATA +67 -3
  34. glitchlings-0.4.2.dist-info/RECORD +42 -0
  35. glitchlings-0.4.1.dist-info/RECORD +0 -39
  36. {glitchlings-0.4.1.dist-info → glitchlings-0.4.2.dist-info}/WHEEL +0 -0
  37. {glitchlings-0.4.1.dist-info → glitchlings-0.4.2.dist-info}/entry_points.txt +0 -0
  38. {glitchlings-0.4.1.dist-info → glitchlings-0.4.2.dist-info}/licenses/LICENSE +0 -0
  39. {glitchlings-0.4.1.dist-info → glitchlings-0.4.2.dist-info}/top_level.txt +0 -0
@@ -3,21 +3,15 @@
 from __future__ import annotations
 
 from collections.abc import Iterable, Sequence
-from typing import Any
+from typing import Any, cast
 
-try:  # pragma: no cover - optional dependency is required at runtime
-    from datasets import Dataset as _DatasetsDataset
-except ModuleNotFoundError as _datasets_error:  # pragma: no cover - optional dependency
-    _DatasetsDataset = None  # type: ignore[assignment]
-else:
-    _datasets_error = None
-
-from ..zoo import Gaggle, Glitchling, summon
+from ..compat import datasets, get_datasets_dataset, require_datasets
+from ..util.adapters import coerce_gaggle
+from ..zoo import Gaggle, Glitchling
 
 
 def _normalise_columns(column: str | Sequence[str]) -> list[str]:
     """Normalise a column specification to a list."""
-
     if isinstance(column, str):
         return [column]
 
@@ -27,20 +21,6 @@ def _normalise_columns(column: str | Sequence[str]) -> list[str]:
     return normalised
 
 
-def _as_gaggle(glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling], seed: int) -> Gaggle:
-    """Coerce any supported glitchling specification into a :class:`Gaggle`."""
-
-    if isinstance(glitchlings, Gaggle):
-        return glitchlings
-
-    if isinstance(glitchlings, (Glitchling, str)):
-        resolved: Iterable[str | Glitchling] = [glitchlings]
-    else:
-        resolved = glitchlings
-
-    return summon(list(resolved), seed=seed)
-
-
 def _glitch_dataset(
     dataset: Any,
     glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling],
@@ -48,23 +28,28 @@ def _glitch_dataset(
     *,
     seed: int = 151,
 ) -> Any:
-    """Internal helper implementing :meth:`Dataset.glitch`."""
-
+    """Apply glitchlings to the provided dataset columns."""
     columns = _normalise_columns(column)
-    gaggle = _as_gaggle(glitchlings, seed=seed)
+    gaggle = coerce_gaggle(glitchlings, seed=seed)
     return gaggle.corrupt_dataset(dataset, columns)
 
 
 def _ensure_dataset_class() -> Any:
     """Return the Hugging Face :class:`~datasets.Dataset` patched with ``.glitch``."""
-
-    if _DatasetsDataset is None:  # pragma: no cover - datasets is an install-time dependency
-        message = "datasets is not installed"
-        raise ModuleNotFoundError(message) from _datasets_error
-
-    if getattr(_DatasetsDataset, "glitch", None) is None:
-
-        def glitch(  # type: ignore[override]
+    dataset_cls = get_datasets_dataset()
+    if dataset_cls is None:  # pragma: no cover - datasets is an install-time dependency
+        require_datasets("datasets is not installed")
+        dataset_cls = get_datasets_dataset()
+    if dataset_cls is None:
+        message = "datasets is not installed"
+        error = datasets.error
+        if error is not None:
+            raise ModuleNotFoundError(message) from error
+        raise ModuleNotFoundError(message)
+
+    if getattr(dataset_cls, "glitch", None) is None:
+
+        def glitch(
             self: Any,
             glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling],
             *,
@@ -73,24 +58,24 @@ def _ensure_dataset_class() -> Any:
             **_: Any,
         ) -> Any:
             """Return a lazily corrupted copy of the dataset."""
-
             return _glitch_dataset(self, glitchlings, column, seed=seed)
 
-        setattr(_DatasetsDataset, "glitch", glitch)
+        setattr(dataset_cls, "glitch", glitch)
 
-    return _DatasetsDataset
+    return cast(type[Any], dataset_cls)
 
 
 def install() -> None:
     """Monkeypatch the Hugging Face :class:`~datasets.Dataset` with ``.glitch``."""
-
     _ensure_dataset_class()
 
 
-if _DatasetsDataset is not None:
+Dataset: type[Any] | None
+_DatasetAlias = get_datasets_dataset()
+if _DatasetAlias is not None:
     Dataset = _ensure_dataset_class()
 else:  # pragma: no cover - datasets is an install-time dependency
-    Dataset = None  # type: ignore[assignment]
+    Dataset = None
 
 
 __all__ = ["Dataset", "install"]
glitchlings/dlc/prime.py CHANGED
@@ -4,79 +4,60 @@ from __future__ import annotations
 
 from collections.abc import Iterable, Sequence
 from enum import Enum
-from typing import Any, Callable
+from typing import Any, Callable, Protocol, cast
 
-import verifiers as vf
+from ..compat import require_datasets, require_jellyfish, require_verifiers
+from ..util.adapters import coerce_gaggle
+from ..zoo import Gaggle, Glitchling, Mim1c, Typogre
+from ._shared import resolve_columns as _resolve_columns_shared
+from ._shared import resolve_environment as _resolve_environment_shared
 
-from jellyfish import damerau_levenshtein_distance
 
-try:
-    from .huggingface import Dataset
-except ModuleNotFoundError:  # pragma: no cover - optional dependency
-    Dataset = object  # type: ignore[assignment]
-else:
-    if Dataset is None:  # pragma: no cover - optional dependency
-        Dataset = object  # type: ignore[assignment]
+class VerifierEnvironment(Protocol):
+    """Minimal interface for verifiers environments."""
 
-from ..zoo import Gaggle, Glitchling, Mim1c, Typogre, summon
+    dataset: Any
 
 
-def _resolve_environment(env: str | vf.Environment) -> vf.Environment:
-    """Return a fully-instantiated verifier environment."""
-
-    if isinstance(env, str):
-        env = vf.load_environment(env)
+class VerifierSingleTurnEnv(Protocol):
+    """Minimal interface for single-turn verifier environments."""
 
-    if not isinstance(env, vf.Environment):
-        raise TypeError("Invalid environment type")
+    dataset: Any
+    rubric: Any
 
-    return env
 
+vf = require_verifiers("verifiers is not installed; install glitchlings[prime]")
+_jellyfish = require_jellyfish("jellyfish is not installed; install glitchlings[prime]")
+damerau_levenshtein_distance = _jellyfish.damerau_levenshtein_distance
 
-def _resolve_columns(dataset: Dataset, columns: Sequence[str] | None) -> list[str]:
-    """Identify which dataset columns should be corrupted."""
-
-    available = set(dataset.column_names)
+try:
+    from .huggingface import Dataset as _HuggingFaceDataset
+except ModuleNotFoundError:  # pragma: no cover - optional dependency
+    _HuggingFaceDataset = None
+else:
+    if _HuggingFaceDataset is None:  # pragma: no cover - optional dependency
+        _HuggingFaceDataset = None
 
-    if columns is not None:
-        missing = sorted(set(columns) - available)
-        if missing:
-            missing_str = ", ".join(missing)
-            raise ValueError(f"Columns not found in dataset: {missing_str}")
-        return list(columns)
+Dataset: type[Any]
+if _HuggingFaceDataset is None:
+    Dataset = object
+else:
+    Dataset = _HuggingFaceDataset
 
-    for candidate in ("prompt", "question"):
-        if candidate in available:
-            return [candidate]
 
-    try:
-        dataset_length = len(dataset)  # type: ignore[arg-type]
-    except TypeError:
-        preview_rows: list[dict[str, Any]]
-        take_fn = getattr(dataset, "take", None)
-        if callable(take_fn):
-            preview_rows = list(take_fn(1))
-        else:
-            iterator = iter(dataset)
-            try:
-                first_row = next(iterator)
-            except StopIteration:
-                preview_rows = []
-            else:
-                preview_rows = [first_row]
-        sample = dict(preview_rows[0]) if preview_rows else {}
-    else:
-        sample = dataset[0] if dataset_length else {}
-    inferred = [
-        name
-        for name in dataset.column_names
-        if isinstance(sample.get(name), str)
-    ]
+def _resolve_environment(env: str | VerifierEnvironment) -> VerifierEnvironment:
+    """Return a fully-instantiated verifier environment."""
+    resolved = _resolve_environment_shared(
+        env,
+        loader=vf.load_environment,
+        environment_type=cast(type[Any], vf.Environment),
+    )
+    return cast(VerifierEnvironment, resolved)
 
-    if inferred:
-        return inferred
 
-    raise ValueError("Unable to determine which dataset columns to corrupt.")
+def _resolve_columns(dataset: Any, columns: Sequence[str] | None) -> list[str]:
+    """Identify which dataset columns should be corrupted."""
+    return _resolve_columns_shared(dataset, columns)
 
 
 class Difficulty(Enum):
@@ -90,12 +71,11 @@ class Difficulty(Enum):
 
 
 def tutorial_level(
-    env: vf.Environment | str,
+    env: VerifierEnvironment | str,
     seed: int = 151,
     difficulty: Difficulty = Difficulty.Normal,
-) -> vf.Environment:
+) -> VerifierEnvironment:
     """Create a low-corruption environment using tuned defaults."""
-
     tuned_mim1c = Mim1c(rate=0.01 * difficulty.value)
     tuned_typogre = Typogre(rate=0.025 * difficulty.value)
 
@@ -107,28 +87,19 @@
 
 
 def load_environment(
-    env: str | vf.Environment,
+    env: str | VerifierEnvironment,
     glitchlings: Iterable[str | Glitchling] | Glitchling | str | Gaggle | None = None,
     *,
     seed: int = 151,
     columns: Sequence[str] | None = None,
-) -> vf.Environment:
+) -> VerifierEnvironment:
     """Load an environment and optionally corrupt it with glitchlings."""
-
     environment = _resolve_environment(env)
 
     if glitchlings is None:
         return environment
 
-    if isinstance(glitchlings, Gaggle):
-        gaggle = glitchlings
-    else:
-        if isinstance(glitchlings, (Glitchling, str)):
-            resolved = [glitchlings]
-        else:
-            resolved = list(glitchlings)
-
-        gaggle = summon(resolved, seed=seed)
+    gaggle = coerce_gaggle(glitchlings, seed=seed)
 
     dataset = environment.dataset
     corrupt_columns = _resolve_columns(dataset, columns)
@@ -142,21 +113,11 @@ def _as_gaggle(
     seed: int,
 ) -> Gaggle:
     """Coerce any supported glitchling specification into a :class:`Gaggle`."""
-
-    if isinstance(glitchlings, Gaggle):
-        return glitchlings
-
-    if isinstance(glitchlings, (Glitchling, str)):
-        resolved: Iterable[str | Glitchling] = [glitchlings]
-    else:
-        resolved = glitchlings
-
-    return summon(list(resolved), seed=seed)
+    return coerce_gaggle(glitchlings, seed=seed)
 
 
 def _extract_completion_text(completion: Any) -> str:
     """Normalise a completion payload into a plain string."""
-
     if isinstance(completion, str):
         return completion
 
@@ -175,11 +136,10 @@ def symmetric_damerau_levenshtein_similarity(
     answer: str,
 ) -> float:
     """Return ``1 - (distance / max_len)`` using Damerau-Levenshtein distance."""
-
     completion_text = _extract_completion_text(completion)
     target = answer or ""
     denominator = max(len(completion_text), len(target), 1)
-    distance = damerau_levenshtein_distance(completion_text, target)
+    distance = cast(int, damerau_levenshtein_distance(completion_text, target))
     score = 1.0 - (distance / denominator)
     return max(0.0, min(1.0, score))
 
@@ -199,32 +159,34 @@
     reward_function: Callable[..., float] | None = None,
     split: str | None = None,
     **load_dataset_kwargs: Any,
-) -> vf.Environment:
+) -> VerifierSingleTurnEnv:
     """Create an Echo Chamber Prime environment from a Hugging Face dataset column.
 
     Args:
         dataset_id: Identifier of the Hugging Face dataset to load.
         column: Name of the column whose text should be glitched.
         glitchlings: Glitchling specifiers that will corrupt the prompts.
-        seed: RNG seed forwarded to :func:`summon`.
+        seed: RNG seed forwarded to :func:`glitchlings.util.adapters.coerce_gaggle`.
        instructions: System instructions supplied to the environment prompts.
        reward_function: Optional callable used to score completions. Defaults to
            :func:`symmetric_damerau_levenshtein_similarity` when omitted.
        split: Optional dataset split to load.
        **load_dataset_kwargs: Extra keyword arguments forwarded to
            :func:`datasets.load_dataset`.
-    """
 
-    try:
-        from datasets import Dataset as HFDataset, DatasetDict, load_dataset
-    except ModuleNotFoundError as exc:  # pragma: no cover - optional dependency
+    """
+    datasets_module = require_datasets("datasets is required to build an echo chamber")
+    load_dataset = getattr(datasets_module, "load_dataset", None)
+    if load_dataset is None:  # pragma: no cover - defensive
         message = "datasets is required to build an echo chamber"
-        raise ModuleNotFoundError(message) from exc
+        raise ModuleNotFoundError(message)
 
-    hf_dataset: HFDataset | DatasetDict
+    dataset_dict_cls = getattr(datasets_module, "DatasetDict", dict)
+
+    hf_dataset: Any
     if split is None:
         hf_dataset = load_dataset(dataset_id, **load_dataset_kwargs)
-        if isinstance(hf_dataset, DatasetDict):
+        if isinstance(hf_dataset, dataset_dict_cls):
             try:
                 hf_dataset = next(iter(hf_dataset.values()))
             except StopIteration as exc:  # pragma: no cover - defensive
@@ -232,10 +194,8 @@ def echo_chamber(
     else:
         hf_dataset = load_dataset(dataset_id, split=split, **load_dataset_kwargs)
 
-    if isinstance(hf_dataset, DatasetDict):
-        raise ValueError(
-            "Specify which split to use when the dataset loads as a DatasetDict."
-        )
+    if isinstance(hf_dataset, dataset_dict_cls):
+        raise ValueError("Specify which split to use when the dataset loads as a DatasetDict.")
 
     filtered_dataset = hf_dataset.filter(
         lambda row: row.get(column) is not None,
@@ -259,7 +219,7 @@
     )
 
     try:
-        dataset_length = len(base_dataset)  # type: ignore[arg-type]
+        dataset_length = len(base_dataset)
     except TypeError:
         preview_rows: list[dict[str, Any]]
         take_fn = getattr(base_dataset, "take", None)
@@ -288,4 +248,7 @@
 
     rubric_func = reward_function or symmetric_damerau_levenshtein_similarity
     rubric = vf.Rubric(funcs=[rubric_func], weights=[1.0])
-    return vf.SingleTurnEnv(dataset=glitched_dataset, rubric=rubric)
+    return cast(
+        VerifierSingleTurnEnv,
+        vf.SingleTurnEnv(dataset=glitched_dataset, rubric=rubric),
+    )
@@ -2,13 +2,14 @@
 
 from __future__ import annotations
 
+import random
 from abc import ABC, abstractmethod
 from hashlib import blake2s
 from pathlib import Path
-import random
 from typing import Callable, Iterable
 
 from glitchlings.config import get_config
+
 from ._cache import CacheEntries, CacheSnapshot
 
 
@@ -21,6 +22,7 @@ class Lexicon(ABC):
        Optional integer used to derive deterministic random number generators
        for synonym sampling. Identical seeds guarantee reproducible results for
        the same word/part-of-speech queries.
+
    """

    def __init__(self, *, seed: int | None = None) -> None:
@@ -29,17 +31,14 @@ class Lexicon(ABC):
     @property
     def seed(self) -> int | None:
         """Return the current base seed used for deterministic sampling."""
-
         return self._seed
 
     def reseed(self, seed: int | None) -> None:
         """Update the base seed driving deterministic synonym sampling."""
-
         self._seed = seed
 
     def _derive_rng(self, word: str, pos: str | None) -> random.Random:
         """Return an RNG derived from the base seed, word, and POS tag."""
-
         seed_material = blake2s(digest_size=8)
         seed_material.update(word.lower().encode("utf8"))
         if pos is not None:
@@ -53,7 +52,6 @@ class Lexicon(ABC):
         self, values: Iterable[str], *, limit: int, word: str, pos: str | None
     ) -> list[str]:
         """Return up to ``limit`` values sampled deterministically."""
-
         if limit <= 0:
             return []
 
@@ -67,14 +65,11 @@ class Lexicon(ABC):
         return [items[index] for index in indices]
 
     @abstractmethod
-    def get_synonyms(
-        self, word: str, pos: str | None = None, n: int = 5
-    ) -> list[str]:
+    def get_synonyms(self, word: str, pos: str | None = None, n: int = 5) -> list[str]:
         """Return up to ``n`` synonyms for ``word`` constrained by ``pos``."""
 
     def supports_pos(self, pos: str | None) -> bool:
         """Return ``True`` when the backend can service ``pos`` queries."""
-
         return True
 
     def __repr__(self) -> str:  # pragma: no cover - trivial representation
@@ -96,14 +91,14 @@ class LexiconBackend(Lexicon):
         """Persist the backend cache to ``path`` and return the destination."""
 
 
-from .graph import GraphLexicon
-from .metrics import (
+from .graph import GraphLexicon  # noqa: E402
+from .metrics import (  # noqa: E402
     compare_lexicons,
     coverage_ratio,
     mean_cosine_similarity,
     synonym_diversity,
 )
-from .vector import VectorLexicon, build_vector_cache
+from .vector import VectorLexicon, build_vector_cache  # noqa: E402
 
 try:  # pragma: no cover - optional dependency
     from .wordnet import WordNetLexicon
@@ -114,24 +109,19 @@ except Exception: # pragma: no cover - triggered when nltk unavailable
 _BACKEND_FACTORIES: dict[str, Callable[[int | None], Lexicon | None]] = {}
 
 
-def register_backend(
-    name: str, factory: Callable[[int | None], Lexicon | None]
-) -> None:
+def register_backend(name: str, factory: Callable[[int | None], Lexicon | None]) -> None:
     """Register ``factory`` for ``name`` so it can be selected via config."""
-
     normalized = name.lower()
     _BACKEND_FACTORIES[normalized] = factory
 
 
 def unregister_backend(name: str) -> None:
     """Remove a previously registered backend."""
-
     _BACKEND_FACTORIES.pop(name.lower(), None)
 
 
 def available_backends() -> list[str]:
     """Return the names of registered lexicon factories."""
-
     return sorted(_BACKEND_FACTORIES)
 
 
@@ -172,7 +162,6 @@ register_backend("wordnet", _wordnet_backend)
 
 def get_default_lexicon(seed: int | None = None) -> Lexicon:
     """Return the first available lexicon according to configuration priority."""
-
     config = get_config()
     attempts: list[str] = []
     for name in config.lexicon.priority:
@@ -8,7 +8,6 @@ from hashlib import blake2s
 from pathlib import Path
 from typing import Mapping, Sequence
 
-
 CacheEntries = dict[str, list[str]]
 
 
@@ -22,7 +21,6 @@ class CacheSnapshot:
 
 def _normalise_entries(payload: Mapping[str, Sequence[str]]) -> CacheEntries:
     """Convert raw cache payloads into canonical mapping form."""
-
     entries: CacheEntries = {}
     for key, values in payload.items():
         if not isinstance(key, str):
@@ -35,21 +33,18 @@ def _normalise_entries(payload: Mapping[str, Sequence[str]]) -> CacheEntries:
 
 def _canonical_json(entries: Mapping[str, Sequence[str]]) -> str:
     """Return a deterministic JSON serialisation for ``entries``."""
-
     serialisable = {key: list(values) for key, values in sorted(entries.items())}
     return json.dumps(serialisable, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
 
 
 def compute_checksum(entries: Mapping[str, Sequence[str]]) -> str:
     """Return a BLAKE2s checksum for ``entries``."""
-
     digest = blake2s(_canonical_json(entries).encode("utf8"), digest_size=16)
     return digest.hexdigest()
 
 
 def load_cache(path: Path) -> CacheSnapshot:
     """Load a cache from ``path`` and verify its checksum if present."""
-
     if not path.exists():
         return CacheSnapshot(entries={}, checksum=None)
 
@@ -89,7 +84,6 @@
 
 def write_cache(path: Path, entries: Mapping[str, Sequence[str]]) -> CacheSnapshot:
     """Persist ``entries`` to ``path`` with checksum metadata."""
-
     serialisable = {key: list(values) for key, values in sorted(entries.items())}
     checksum = compute_checksum(serialisable)
     payload = {
@@ -108,4 +102,3 @@ def write_cache(path: Path, entries: Mapping[str, Sequence[str]]) -> CacheSnapsh
 
 __all__ = ["CacheEntries", "CacheSnapshot", "compute_checksum", "load_cache", "write_cache"]
 
-
@@ -7,17 +7,17 @@ from pathlib import Path
 from typing import Iterable, Mapping, MutableMapping, Sequence
 
 from . import LexiconBackend
-from ._cache import CacheSnapshot, load_cache as _load_cache_file, write_cache as _write_cache_file
+from ._cache import CacheSnapshot
+from ._cache import load_cache as _load_cache_file
+from ._cache import write_cache as _write_cache_file
 from .vector import VectorLexicon
 
-
 _CONCEPT_RE = re.compile(r"^/c/(?P<lang>[a-z]{2})/(?P<term>[^/]+)")
 _PUNCTUATION_RE = re.compile(r"[^\w\s-]+", re.UNICODE)
 
 
 def _lemmatize_token(token: str) -> str:
     """Return a lightweight lemma for ``token`` using heuristic rules."""
-
     irregular = {
         "children": "child",
         "mice": "mouse",
@@ -60,7 +60,6 @@
 
 def _normalize_phrase(phrase: str) -> str:
     """Normalise ``phrase`` for ConceptNet lookups."""
-
     stripped = _PUNCTUATION_RE.sub(" ", phrase.lower())
     tokens = [token for token in stripped.split() if token]
     if not tokens:
@@ -71,7 +70,6 @@
 
 def _concept_terms(normalized: str) -> list[str]:
     """Return ConceptNet term variants for ``normalized``."""
-
     collapsed = normalized.replace(" ", "_")
     if not collapsed:
         return []
@@ -83,7 +81,6 @@
 
 def _surface_from_concept(concept: str) -> str | None:
     """Return a human-readable surface form for ``concept``."""
-
     match = _CONCEPT_RE.match(concept)
     if match is None:
         return None
@@ -102,7 +99,6 @@ def _language_from_concept(concept: str) -> str | None:
 
 def _load_numberbatch(path: Path, *, languages: set[str]) -> Mapping[str, list[float]]:
     """Load ConceptNet Numberbatch embeddings from ``path``."""
-
     if not path.exists():
         return {}
 
@@ -240,9 +236,7 @@ class GraphLexicon(LexiconBackend):
         self._cache_dirty = True
         return synonyms
 
-    def get_synonyms(
-        self, word: str, pos: str | None = None, n: int = 5
-    ) -> list[str]:
+    def get_synonyms(self, word: str, pos: str | None = None, n: int = 5) -> list[str]:
         normalized = _normalize_phrase(word)
         if not normalized:
             return []
@@ -261,7 +255,6 @@
     @classmethod
     def load_cache(cls, path: str | Path) -> CacheSnapshot:
         """Load and validate a persisted ConceptNet cache file."""
-
         return _load_cache_file(Path(path))
 
     def save_cache(self, path: str | Path | None = None) -> Path:
@@ -287,4 +280,3 @@ class GraphLexicon(LexiconBackend):
             f"GraphLexicon(languages={sorted(self._languages)!r}, "
             f"max_neighbors={self._max_neighbors}, seed={self.seed!r}, state={state})"
         )
-
@@ -18,7 +18,6 @@ def _unique_synonyms(
     sample_size: int,
 ) -> list[str]:
     """Return unique synonym candidates excluding the original token."""
-
     collected: list[str] = []
     seen: set[str] = set()
     source = word.lower()
@@ -41,7 +40,6 @@ def synonym_diversity(
     sample_size: int = 5,
 ) -> float:
     """Return the mean unique-synonym count for ``words`` using ``lexicon``."""
-
     totals = []
     for word in words:
         synonyms = _unique_synonyms(lexicon, word, pos=pos, sample_size=sample_size)
@@ -60,7 +58,6 @@ def coverage_ratio(
     min_synonyms: int = 3,
 ) -> float:
     """Return the fraction of ``words`` with at least ``min_synonyms`` candidates."""
-
     total = 0
     hits = 0
     for word in words:
@@ -96,7 +93,6 @@ def mean_cosine_similarity(
     sample_size: int = 5,
 ) -> float:
     """Return the mean cosine similarity between each word and its candidates."""
-
     total = 0.0
     count = 0
     for word in words:
@@ -126,11 +122,8 @@ def compare_lexicons(
     embeddings: Mapping[str, Sequence[float]] | None = None,
 ) -> dict[str, float]:
     """Return comparative coverage and diversity statistics for two lexicons."""
-
     stats = {
-        "baseline_diversity": synonym_diversity(
-            baseline, words, pos=pos, sample_size=sample_size
-        ),
+        "baseline_diversity": synonym_diversity(baseline, words, pos=pos, sample_size=sample_size),
         "candidate_diversity": synonym_diversity(
             candidate, words, pos=pos, sample_size=sample_size
         ),