glitchlings-0.4.1-cp311-cp311-macosx_11_0_universal2.whl → glitchlings-0.4.3-cp311-cp311-macosx_11_0_universal2.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.

Files changed (47)
  1. glitchlings/__init__.py +30 -17
  2. glitchlings/__main__.py +0 -1
  3. glitchlings/_zoo_rust.cpython-311-darwin.so +0 -0
  4. glitchlings/compat.py +284 -0
  5. glitchlings/config.py +164 -34
  6. glitchlings/config.toml +1 -1
  7. glitchlings/dlc/__init__.py +3 -1
  8. glitchlings/dlc/_shared.py +68 -0
  9. glitchlings/dlc/huggingface.py +26 -41
  10. glitchlings/dlc/prime.py +64 -101
  11. glitchlings/dlc/pytorch.py +216 -0
  12. glitchlings/dlc/pytorch_lightning.py +233 -0
  13. glitchlings/lexicon/__init__.py +12 -33
  14. glitchlings/lexicon/_cache.py +21 -22
  15. glitchlings/lexicon/data/default_vector_cache.json +80 -14
  16. glitchlings/lexicon/metrics.py +1 -8
  17. glitchlings/lexicon/vector.py +109 -49
  18. glitchlings/lexicon/wordnet.py +89 -49
  19. glitchlings/main.py +30 -24
  20. glitchlings/util/__init__.py +18 -4
  21. glitchlings/util/adapters.py +27 -0
  22. glitchlings/zoo/__init__.py +26 -15
  23. glitchlings/zoo/_ocr_confusions.py +1 -3
  24. glitchlings/zoo/_rate.py +1 -4
  25. glitchlings/zoo/_sampling.py +0 -1
  26. glitchlings/zoo/_text_utils.py +1 -5
  27. glitchlings/zoo/adjax.py +2 -4
  28. glitchlings/zoo/apostrofae.py +128 -0
  29. glitchlings/zoo/assets/__init__.py +0 -0
  30. glitchlings/zoo/assets/apostrofae_pairs.json +32 -0
  31. glitchlings/zoo/core.py +152 -87
  32. glitchlings/zoo/jargoyle.py +50 -45
  33. glitchlings/zoo/mim1c.py +11 -10
  34. glitchlings/zoo/redactyl.py +16 -16
  35. glitchlings/zoo/reduple.py +5 -3
  36. glitchlings/zoo/rushmore.py +4 -10
  37. glitchlings/zoo/scannequin.py +7 -6
  38. glitchlings/zoo/typogre.py +8 -9
  39. glitchlings/zoo/zeedub.py +6 -3
  40. {glitchlings-0.4.1.dist-info → glitchlings-0.4.3.dist-info}/METADATA +101 -4
  41. glitchlings-0.4.3.dist-info/RECORD +46 -0
  42. glitchlings/lexicon/graph.py +0 -290
  43. glitchlings-0.4.1.dist-info/RECORD +0 -39
  44. {glitchlings-0.4.1.dist-info → glitchlings-0.4.3.dist-info}/WHEEL +0 -0
  45. {glitchlings-0.4.1.dist-info → glitchlings-0.4.3.dist-info}/entry_points.txt +0 -0
  46. {glitchlings-0.4.1.dist-info → glitchlings-0.4.3.dist-info}/licenses/LICENSE +0 -0
  47. {glitchlings-0.4.1.dist-info → glitchlings-0.4.3.dist-info}/top_level.txt +0 -0
glitchlings/dlc/pytorch_lightning.py (new file)
@@ -0,0 +1,233 @@
+"""Integration helpers for PyTorch Lightning data modules."""
+
+from __future__ import annotations
+
+from collections.abc import Iterable, Mapping, Sequence
+from typing import Any, cast
+
+from ..compat import get_pytorch_lightning_datamodule, require_pytorch_lightning
+from ..util.adapters import coerce_gaggle
+from ..zoo import Gaggle, Glitchling
+from ..zoo.core import _is_transcript
+
+
+def _normalise_columns(column: str | Sequence[str]) -> list[str]:
+    """Normalise a column specification to a list."""
+    if isinstance(column, str):
+        return [column]
+
+    normalised = list(column)
+    if not normalised:
+        raise ValueError("At least one column must be specified")
+    return normalised
+
+
+def _glitch_value(value: Any, gaggle: Gaggle) -> Any:
+    """Apply glitchlings to a value when it contains textual content."""
+    if isinstance(value, str) or _is_transcript(value, allow_empty=False, require_all_content=True):
+        return gaggle.corrupt(value)
+
+    if isinstance(value, Sequence) and value and all(isinstance(item, str) for item in value):
+        return [gaggle.corrupt(item) for item in value]
+
+    return value
+
+
+def _glitch_batch(batch: Any, columns: list[str], gaggle: Gaggle) -> Any:
+    """Apply glitchlings to the configured batch columns."""
+    if not isinstance(batch, Mapping):
+        return batch
+
+    if hasattr(batch, "copy"):
+        mutated = batch.copy()
+    else:
+        mutated = dict(batch)
+
+    missing = [column for column in columns if column not in mutated]
+    if missing:
+        missing_str = ", ".join(sorted(missing))
+        raise ValueError(f"Columns not found in batch: {missing_str}")
+
+    for column in columns:
+        mutated[column] = _glitch_value(mutated[column], gaggle)
+
+    return mutated
+
+
+def _wrap_dataloader(dataloader: Any, columns: list[str], gaggle: Gaggle) -> Any:
+    """Wrap a dataloader so yielded batches are corrupted lazily."""
+    if dataloader is None:
+        return None
+
+    if isinstance(dataloader, Mapping):
+        mapping_type = cast(type[Any], dataloader.__class__)
+        return mapping_type(
+            {
+                key: _wrap_dataloader(value, columns, gaggle)
+                for key, value in dataloader.items()
+            }
+        )
+
+    if isinstance(dataloader, list):
+        return [_wrap_dataloader(value, columns, gaggle) for value in dataloader]
+
+    if isinstance(dataloader, tuple):
+        return tuple(_wrap_dataloader(value, columns, gaggle) for value in dataloader)
+
+    if isinstance(dataloader, Sequence) and not isinstance(dataloader, (str, bytes, bytearray)):
+        sequence_type = cast(type[Any], dataloader.__class__)
+        return sequence_type(
+            _wrap_dataloader(value, columns, gaggle) for value in dataloader
+        )
+
+    return _GlitchedDataLoader(dataloader, columns, gaggle)
+
+
+class _GlitchedDataLoader:
+    """Proxy dataloader that glitches batches produced by the wrapped loader."""
+
+    def __init__(self, dataloader: Any, columns: list[str], gaggle: Gaggle) -> None:
+        self._dataloader = dataloader
+        self._columns = columns
+        self._gaggle = gaggle
+
+    def __iter__(self) -> Any:
+        for batch in self._dataloader:
+            yield _glitch_batch(batch, self._columns, self._gaggle)
+
+    def __len__(self) -> int:
+        return len(self._dataloader)
+
+    def __getattr__(self, attribute: str) -> Any:
+        return getattr(self._dataloader, attribute)
+
+
+def _glitch_datamodule(
+    datamodule: Any,
+    glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling],
+    column: str | Sequence[str],
+    *,
+    seed: int = 151,
+) -> Any:
+    """Return a proxy that applies glitchlings to batches from the datamodule."""
+
+    columns = _normalise_columns(column)
+    gaggle = coerce_gaggle(glitchlings, seed=seed)
+    return _GlitchedLightningDataModule(datamodule, columns, gaggle)
+
+
+class _GlitchedLightningDataModule:
+    """Proxy wrapper around a LightningDataModule applying glitchlings to batches."""
+
+    def __init__(self, base: Any, columns: list[str], gaggle: Gaggle) -> None:
+        object.__setattr__(self, "_glitch_base", base)
+        object.__setattr__(self, "_glitch_columns", columns)
+        object.__setattr__(self, "_glitch_gaggle", gaggle)
+
+    def __getattr__(self, attribute: str) -> Any:
+        return getattr(self._glitch_base, attribute)
+
+    def __setattr__(self, attribute: str, value: Any) -> None:
+        if attribute.startswith("_glitch_"):
+            object.__setattr__(self, attribute, value)
+        else:
+            setattr(self._glitch_base, attribute, value)
+
+    def __delattr__(self, attribute: str) -> None:
+        if attribute.startswith("_glitch_"):
+            object.__delattr__(self, attribute)
+        else:
+            delattr(self._glitch_base, attribute)
+
+    def __dir__(self) -> list[str]:
+        return sorted(set(dir(self.__class__)) | set(dir(self._glitch_base)))
+
+    # LightningDataModule API -------------------------------------------------
+    def prepare_data(self, *args: Any, **kwargs: Any) -> Any:
+        return self._glitch_base.prepare_data(*args, **kwargs)
+
+    def setup(self, *args: Any, **kwargs: Any) -> Any:
+        return self._glitch_base.setup(*args, **kwargs)
+
+    def teardown(self, *args: Any, **kwargs: Any) -> Any:
+        return self._glitch_base.teardown(*args, **kwargs)
+
+    def state_dict(self) -> Mapping[str, Any]:
+        state = self._glitch_base.state_dict()
+        return cast(Mapping[str, Any], state)
+
+    def load_state_dict(self, state_dict: Mapping[str, Any]) -> None:
+        self._glitch_base.load_state_dict(state_dict)
+
+    def transfer_batch_to_device(self, batch: Any, device: Any, dataloader_idx: int) -> Any:
+        return self._glitch_base.transfer_batch_to_device(batch, device, dataloader_idx)
+
+    def on_before_batch_transfer(self, batch: Any, dataloader_idx: int) -> Any:
+        return self._glitch_base.on_before_batch_transfer(batch, dataloader_idx)
+
+    def on_after_batch_transfer(self, batch: Any, dataloader_idx: int) -> Any:
+        return self._glitch_base.on_after_batch_transfer(batch, dataloader_idx)
+
+    def train_dataloader(self, *args: Any, **kwargs: Any) -> Any:
+        loader = self._glitch_base.train_dataloader(*args, **kwargs)
+        return _wrap_dataloader(loader, self._glitch_columns, self._glitch_gaggle)
+
+    def val_dataloader(self, *args: Any, **kwargs: Any) -> Any:
+        loader = self._glitch_base.val_dataloader(*args, **kwargs)
+        return _wrap_dataloader(loader, self._glitch_columns, self._glitch_gaggle)
+
+    def test_dataloader(self, *args: Any, **kwargs: Any) -> Any:
+        loader = self._glitch_base.test_dataloader(*args, **kwargs)
+        return _wrap_dataloader(loader, self._glitch_columns, self._glitch_gaggle)
+
+    def predict_dataloader(self, *args: Any, **kwargs: Any) -> Any:
+        loader = self._glitch_base.predict_dataloader(*args, **kwargs)
+        return _wrap_dataloader(loader, self._glitch_columns, self._glitch_gaggle)
+
+
+def _ensure_datamodule_class() -> Any:
+    """Return the Lightning ``LightningDataModule`` patched with ``.glitch``."""
+
+    datamodule_cls = get_pytorch_lightning_datamodule()
+    if datamodule_cls is None:  # pragma: no cover - dependency is optional
+        module = require_pytorch_lightning("pytorch_lightning is not installed")
+        datamodule_cls = getattr(module, "LightningDataModule", None)
+        if datamodule_cls is None:
+            raise ModuleNotFoundError("pytorch_lightning is not installed")
+
+    if getattr(datamodule_cls, "glitch", None) is None:
+
+        def glitch(
+            self: Any,
+            glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling],
+            *,
+            column: str | Sequence[str],
+            seed: int = 151,
+            **_: Any,
+        ) -> Any:
+            return _glitch_datamodule(self, glitchlings, column, seed=seed)
+
+        setattr(datamodule_cls, "glitch", glitch)
+
+    if not issubclass(_GlitchedLightningDataModule, datamodule_cls):
+        _GlitchedLightningDataModule.__bases__ = (datamodule_cls,)
+
+    return datamodule_cls
+
+
+def install() -> None:
+    """Monkeypatch ``LightningDataModule`` with ``.glitch``."""
+
+    _ensure_datamodule_class()
+
+
+LightningDataModule: type[Any] | None
+_LightningDataModuleAlias = get_pytorch_lightning_datamodule()
+if _LightningDataModuleAlias is not None:
+    LightningDataModule = _ensure_datamodule_class()
+else:  # pragma: no cover - optional dependency
+    LightningDataModule = None
+
+
+__all__ = ["LightningDataModule", "install"]
+
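Usage sketch (editor's note, not part of the diff): the new module is driven through the patched `.glitch` method. The snippet assumes pytorch_lightning and torch are installed and that "typogre" is a glitchling name resolvable by coerce_gaggle; treat those names as assumptions rather than guarantees of this diff.

import pytorch_lightning as pl
from torch.utils.data import DataLoader

from glitchlings.dlc import pytorch_lightning as glitchlings_dlc

glitchlings_dlc.install()  # patches LightningDataModule with `.glitch`


class TextDataModule(pl.LightningDataModule):
    def train_dataloader(self) -> DataLoader:
        # One dict-style batch; batch_size=None yields items unbatched.
        batches = [{"text": ["hello world", "lorem ipsum"], "label": [0, 1]}]
        return DataLoader(batches, batch_size=None)


# `.glitch` returns a proxy datamodule: the "text" column of every batch is
# corrupted lazily as it is yielded, while "label" passes through untouched.
dm = TextDataModule().glitch("typogre", column="text", seed=151)
for batch in dm.train_dataloader():
    print(batch["text"])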
glitchlings/lexicon/__init__.py
@@ -2,13 +2,14 @@
 
 from __future__ import annotations
 
+import random
 from abc import ABC, abstractmethod
 from hashlib import blake2s
 from pathlib import Path
-import random
 from typing import Callable, Iterable
 
 from glitchlings.config import get_config
+
 from ._cache import CacheEntries, CacheSnapshot
 
 
@@ -21,6 +22,7 @@ class Lexicon(ABC):
        Optional integer used to derive deterministic random number generators
        for synonym sampling. Identical seeds guarantee reproducible results for
        the same word/part-of-speech queries.
+
    """
 
    def __init__(self, *, seed: int | None = None) -> None:
@@ -29,17 +31,14 @@ class Lexicon(ABC):
     @property
     def seed(self) -> int | None:
         """Return the current base seed used for deterministic sampling."""
-
         return self._seed
 
     def reseed(self, seed: int | None) -> None:
         """Update the base seed driving deterministic synonym sampling."""
-
         self._seed = seed
 
     def _derive_rng(self, word: str, pos: str | None) -> random.Random:
         """Return an RNG derived from the base seed, word, and POS tag."""
-
         seed_material = blake2s(digest_size=8)
         seed_material.update(word.lower().encode("utf8"))
         if pos is not None:
@@ -53,7 +52,6 @@
         self, values: Iterable[str], *, limit: int, word: str, pos: str | None
     ) -> list[str]:
         """Return up to ``limit`` values sampled deterministically."""
-
         if limit <= 0:
             return []
 
@@ -67,14 +65,11 @@
         return [items[index] for index in indices]
 
     @abstractmethod
-    def get_synonyms(
-        self, word: str, pos: str | None = None, n: int = 5
-    ) -> list[str]:
+    def get_synonyms(self, word: str, pos: str | None = None, n: int = 5) -> list[str]:
         """Return up to ``n`` synonyms for ``word`` constrained by ``pos``."""
 
     def supports_pos(self, pos: str | None) -> bool:
         """Return ``True`` when the backend can service ``pos`` queries."""
-
         return True
 
     def __repr__(self) -> str:  # pragma: no cover - trivial representation
@@ -96,42 +91,39 @@ class LexiconBackend(Lexicon):
         """Persist the backend cache to ``path`` and return the destination."""
 
 
-from .graph import GraphLexicon
-from .metrics import (
+from .metrics import (  # noqa: E402
     compare_lexicons,
     coverage_ratio,
     mean_cosine_similarity,
     synonym_diversity,
 )
-from .vector import VectorLexicon, build_vector_cache
+from .vector import VectorLexicon, build_vector_cache  # noqa: E402
 
+_WordNetLexicon: type[LexiconBackend] | None
 try:  # pragma: no cover - optional dependency
-    from .wordnet import WordNetLexicon
+    from .wordnet import WordNetLexicon as _WordNetLexicon
 except Exception:  # pragma: no cover - triggered when nltk unavailable
-    WordNetLexicon = None  # type: ignore[assignment]
+    _WordNetLexicon = None
+
+WordNetLexicon: type[LexiconBackend] | None = _WordNetLexicon
 
 
 _BACKEND_FACTORIES: dict[str, Callable[[int | None], Lexicon | None]] = {}
 
 
-def register_backend(
-    name: str, factory: Callable[[int | None], Lexicon | None]
-) -> None:
+def register_backend(name: str, factory: Callable[[int | None], Lexicon | None]) -> None:
     """Register ``factory`` for ``name`` so it can be selected via config."""
-
     normalized = name.lower()
     _BACKEND_FACTORIES[normalized] = factory
 
 
 def unregister_backend(name: str) -> None:
     """Remove a previously registered backend."""
-
     _BACKEND_FACTORIES.pop(name.lower(), None)
 
 
 def available_backends() -> list[str]:
     """Return the names of registered lexicon factories."""
-
     return sorted(_BACKEND_FACTORIES)
 
 
@@ -145,16 +137,6 @@ def _vector_backend(seed: int | None) -> Lexicon | None:
     return VectorLexicon(cache_path=cache_path, seed=seed)
 
 
-def _graph_backend(seed: int | None) -> Lexicon | None:
-    config = get_config()
-    cache_path = config.lexicon.graph_cache
-    if cache_path is None:
-        return None
-    if not cache_path.exists():
-        return None
-    return GraphLexicon(cache_path=cache_path, seed=seed)
-
-
 def _wordnet_backend(seed: int | None) -> Lexicon | None:  # pragma: no cover - optional
     if WordNetLexicon is None:
         return None
@@ -166,13 +148,11 @@ def _wordnet_backend(seed: int | None) -> Lexicon | None:  # pragma: no cover - optional
 
 
 register_backend("vector", _vector_backend)
-register_backend("graph", _graph_backend)
 register_backend("wordnet", _wordnet_backend)
 
 
 def get_default_lexicon(seed: int | None = None) -> Lexicon:
     """Return the first available lexicon according to configuration priority."""
-
     config = get_config()
     attempts: list[str] = []
     for name in config.lexicon.priority:
@@ -195,7 +175,6 @@ __all__ = [
     "Lexicon",
     "LexiconBackend",
     "VectorLexicon",
-    "GraphLexicon",
     "WordNetLexicon",
     "build_vector_cache",
     "compare_lexicons",
glitchlings/lexicon/_cache.py
@@ -6,8 +6,7 @@ import json
 from dataclasses import dataclass
 from hashlib import blake2s
 from pathlib import Path
-from typing import Mapping, Sequence
-
+from typing import Mapping, Sequence, cast
 
 CacheEntries = dict[str, list[str]]
 
@@ -20,9 +19,8 @@ class CacheSnapshot:
     checksum: str | None = None
 
 
-def _normalise_entries(payload: Mapping[str, Sequence[str]]) -> CacheEntries:
+def _normalise_entries(payload: Mapping[str, object]) -> CacheEntries:
     """Convert raw cache payloads into canonical mapping form."""
-
     entries: CacheEntries = {}
     for key, values in payload.items():
         if not isinstance(key, str):
@@ -35,46 +33,47 @@ def _normalise_entries(payload: Mapping[str, Sequence[str]]) -> CacheEntries:
 
 def _canonical_json(entries: Mapping[str, Sequence[str]]) -> str:
     """Return a deterministic JSON serialisation for ``entries``."""
-
     serialisable = {key: list(values) for key, values in sorted(entries.items())}
     return json.dumps(serialisable, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
 
 
 def compute_checksum(entries: Mapping[str, Sequence[str]]) -> str:
     """Return a BLAKE2s checksum for ``entries``."""
-
     digest = blake2s(_canonical_json(entries).encode("utf8"), digest_size=16)
     return digest.hexdigest()
 
 
 def load_cache(path: Path) -> CacheSnapshot:
     """Load a cache from ``path`` and verify its checksum if present."""
-
     if not path.exists():
         return CacheSnapshot(entries={}, checksum=None)
 
     with path.open("r", encoding="utf8") as handle:
-        payload = json.load(handle)
+        payload_obj = json.load(handle)
 
     checksum: str | None = None
-    entries_payload: Mapping[str, Sequence[str]]
+    entries_payload: Mapping[str, object]
+
+    if not isinstance(payload_obj, Mapping):
+        raise RuntimeError("Synonym cache payload must be a mapping of strings to lists.")
+
+    payload = cast(Mapping[str, object], payload_obj)
 
-    if isinstance(payload, Mapping) and "__meta__" in payload and "entries" in payload:
-        meta = payload["__meta__"]
-        entries_payload = payload["entries"]  # type: ignore[assignment]
-        if not isinstance(entries_payload, Mapping):
+    if "__meta__" in payload and "entries" in payload:
+        meta_obj = payload["__meta__"]
+        entries_obj = payload["entries"]
+        if not isinstance(entries_obj, Mapping):
             raise RuntimeError("Synonym cache entries must be stored as a mapping.")
-        if isinstance(meta, Mapping):
-            raw_checksum = meta.get("checksum")
+        entries_payload = cast(Mapping[str, object], entries_obj)
+        if isinstance(meta_obj, Mapping):
+            raw_checksum = meta_obj.get("checksum")
             if raw_checksum is not None and not isinstance(raw_checksum, str):
                 raise RuntimeError("Synonym cache checksum must be a string when provided.")
-            checksum = raw_checksum
+            checksum = raw_checksum if isinstance(raw_checksum, str) else None
         else:
             raise RuntimeError("Synonym cache metadata must be a mapping.")
-    elif isinstance(payload, Mapping):
-        entries_payload = payload  # legacy format without metadata
     else:
-        raise RuntimeError("Synonym cache payload must be a mapping of strings to lists.")
+        entries_payload = payload  # legacy format without metadata
 
     entries = _normalise_entries(entries_payload)
     if checksum is not None:
@@ -89,8 +88,9 @@ def load_cache(path: Path) -> CacheSnapshot:
 
 def write_cache(path: Path, entries: Mapping[str, Sequence[str]]) -> CacheSnapshot:
     """Persist ``entries`` to ``path`` with checksum metadata."""
-
-    serialisable = {key: list(values) for key, values in sorted(entries.items())}
+    serialisable: CacheEntries = {
+        key: list(values) for key, values in sorted(entries.items())
+    }
     checksum = compute_checksum(serialisable)
     payload = {
         "__meta__": {
@@ -108,4 +108,3 @@ def write_cache(path: Path, entries: Mapping[str, Sequence[str]]) -> CacheSnapshot:
 
 
 __all__ = ["CacheEntries", "CacheSnapshot", "compute_checksum", "load_cache", "write_cache"]
-
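Editor's note: a round-trip sketch of the cache helpers after this refactor. The temporary path is illustrative, and the checksum assertion assumes write_cache records the checksum it computes on the returned snapshot, which the payload above implies but does not fully show.

from pathlib import Path
from tempfile import TemporaryDirectory

from glitchlings.lexicon._cache import compute_checksum, load_cache, write_cache

with TemporaryDirectory() as tmp:
    path = Path(tmp) / "synonyms.json"
    entries = {"fast": ["rapid", "swift"], "slow": ["sluggish"]}

    snapshot = write_cache(path, entries)  # writes {"__meta__": {...}, "entries": {...}}
    assert snapshot.checksum == compute_checksum(entries)

    loaded = load_cache(path)  # verifies the stored checksum on read
    assert loaded.entries == entries

    # A bare mapping without "__meta__" is still accepted as the legacy format.
    path.write_text('{"fast": ["rapid"]}', encoding="utf8")
    assert load_cache(path).entries == {"fast": ["rapid"]}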
glitchlings/lexicon/data/default_vector_cache.json
@@ -1,16 +1,82 @@
 {
-  "sing": ["croon", "warble", "chant", "serenade"],
-  "happy": ["cheerful", "joyful", "contented", "gleeful"],
-  "songs": ["tunes", "melodies", "ballads", "airs"],
-  "quickly": ["rapidly", "swiftly", "speedily", "promptly"],
-  "text": ["passage", "excerpt", "phrase", "content"],
-  "words": ["terms", "phrases", "lexemes", "expressions"],
-  "alpha": ["beta", "gamma", "delta"],
-  "beta": ["alpha", "gamma", "delta"],
-  "gamma": ["alpha", "beta", "delta"],
-  "delta": ["alpha", "beta", "gamma"],
-  "they": ["these people", "those folks", "those individuals"],
-  "quick": ["rapid", "swift", "brisk", "prompt"],
-  "fast": ["rapid", "swift", "quick", "speedy"],
-  "slow": ["sluggish", "lethargic", "unhurried", "deliberate"]
+  "alpha": [
+    "beta",
+    "gamma",
+    "delta"
+  ],
+  "beta": [
+    "alpha",
+    "gamma",
+    "delta"
+  ],
+  "delta": [
+    "alpha",
+    "beta",
+    "gamma"
+  ],
+  "fast": [
+    "rapid",
+    "swift",
+    "speedy",
+    "brisk"
+  ],
+  "gamma": [
+    "alpha",
+    "beta",
+    "delta"
+  ],
+  "happy": [
+    "glad",
+    "joyful",
+    "content",
+    "upbeat"
+  ],
+  "quick": [
+    "swift",
+    "rapid",
+    "speedy",
+    "nimble"
+  ],
+  "quickly": [
+    "swiftly",
+    "rapidly",
+    "promptly",
+    "speedily"
+  ],
+  "sing": [
+    "croon",
+    "serenade",
+    "vocalize",
+    "perform"
+  ],
+  "slow": [
+    "sluggish",
+    "leisurely",
+    "unhurried",
+    "gradual"
+  ],
+  "songs": [
+    "tracks",
+    "melodies",
+    "ballads",
+    "tunes"
+  ],
+  "text": [
+    "passage",
+    "copy",
+    "script",
+    "narrative"
+  ],
+  "they": [
+    "those people",
+    "those individuals",
+    "the group",
+    "those folks"
+  ],
+  "words": [
+    "terms",
+    "phrases",
+    "lexicon",
+    "vocabulary"
+  ]
 }
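Editor's note: this JSON file is the word-to-synonyms table behind the vector backend. A hedged sketch of loading it directly; the cache_path keyword mirrors _vector_backend in lexicon/__init__.py above, while the lookup behaviour and printed output are assumptions.

from pathlib import Path

from glitchlings.lexicon import VectorLexicon

# Path shown for illustration; the packaged copy lives under
# glitchlings/lexicon/data/default_vector_cache.json.
cache = Path("glitchlings/lexicon/data/default_vector_cache.json")
lexicon = VectorLexicon(cache_path=cache, seed=151)

# Deterministic for a fixed seed, per the Lexicon seeding contract above.
print(lexicon.get_synonyms("fast", n=2))  # e.g. ['swift', 'brisk']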
glitchlings/lexicon/metrics.py
@@ -18,7 +18,6 @@ def _unique_synonyms(
     sample_size: int,
 ) -> list[str]:
     """Return unique synonym candidates excluding the original token."""
-
     collected: list[str] = []
     seen: set[str] = set()
     source = word.lower()
@@ -41,7 +40,6 @@ def synonym_diversity(
     sample_size: int = 5,
 ) -> float:
     """Return the mean unique-synonym count for ``words`` using ``lexicon``."""
-
     totals = []
     for word in words:
         synonyms = _unique_synonyms(lexicon, word, pos=pos, sample_size=sample_size)
@@ -60,7 +58,6 @@ def coverage_ratio(
     min_synonyms: int = 3,
 ) -> float:
     """Return the fraction of ``words`` with at least ``min_synonyms`` candidates."""
-
     total = 0
     hits = 0
     for word in words:
@@ -96,7 +93,6 @@ def mean_cosine_similarity(
     sample_size: int = 5,
 ) -> float:
     """Return the mean cosine similarity between each word and its candidates."""
-
     total = 0.0
     count = 0
     for word in words:
@@ -126,11 +122,8 @@ def compare_lexicons(
     embeddings: Mapping[str, Sequence[float]] | None = None,
 ) -> dict[str, float]:
     """Return comparative coverage and diversity statistics for two lexicons."""
-
     stats = {
-        "baseline_diversity": synonym_diversity(
-            baseline, words, pos=pos, sample_size=sample_size
-        ),
+        "baseline_diversity": synonym_diversity(baseline, words, pos=pos, sample_size=sample_size),
         "candidate_diversity": synonym_diversity(
             candidate, words, pos=pos, sample_size=sample_size
         ),
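Editor's note: a hedged example of the metrics entry point touched above. Only the two *_diversity keys are visible in this hunk; any further keys in the returned dict, and the defaults for pos and sample_size, are assumptions.

from pathlib import Path

from glitchlings.lexicon import VectorLexicon, compare_lexicons, get_default_lexicon

words = ["fast", "happy", "sing", "slow"]
baseline = get_default_lexicon(seed=151)
candidate = VectorLexicon(cache_path=Path("my_vector_cache.json"), seed=151)  # hypothetical path

# compare_lexicons returns a flat dict[str, float]; the diversity keys below
# are confirmed by the hunk, further keys are not.
stats = compare_lexicons(baseline, candidate, words, sample_size=5)
print(stats["baseline_diversity"], stats["candidate_diversity"])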