glitchlings 0.4.5__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of glitchlings might be problematic. Click here for more details.
- glitchlings/__init__.py +71 -0
- glitchlings/__main__.py +8 -0
- glitchlings/_zoo_rust.cp311-win_amd64.pyd +0 -0
- glitchlings/compat.py +282 -0
- glitchlings/config.py +386 -0
- glitchlings/config.toml +3 -0
- glitchlings/data/__init__.py +1 -0
- glitchlings/data/hokey_assets.json +193 -0
- glitchlings/dlc/__init__.py +7 -0
- glitchlings/dlc/_shared.py +153 -0
- glitchlings/dlc/huggingface.py +81 -0
- glitchlings/dlc/prime.py +254 -0
- glitchlings/dlc/pytorch.py +166 -0
- glitchlings/dlc/pytorch_lightning.py +209 -0
- glitchlings/lexicon/__init__.py +192 -0
- glitchlings/lexicon/_cache.py +108 -0
- glitchlings/lexicon/data/default_vector_cache.json +82 -0
- glitchlings/lexicon/metrics.py +162 -0
- glitchlings/lexicon/vector.py +652 -0
- glitchlings/lexicon/wordnet.py +228 -0
- glitchlings/main.py +364 -0
- glitchlings/util/__init__.py +195 -0
- glitchlings/util/adapters.py +27 -0
- glitchlings/util/hokey_generator.py +144 -0
- glitchlings/util/stretch_locator.py +140 -0
- glitchlings/util/stretchability.py +375 -0
- glitchlings/zoo/__init__.py +172 -0
- glitchlings/zoo/_ocr_confusions.py +32 -0
- glitchlings/zoo/_rate.py +131 -0
- glitchlings/zoo/_rust_extensions.py +143 -0
- glitchlings/zoo/_sampling.py +54 -0
- glitchlings/zoo/_text_utils.py +100 -0
- glitchlings/zoo/adjax.py +128 -0
- glitchlings/zoo/apostrofae.py +127 -0
- glitchlings/zoo/assets/__init__.py +0 -0
- glitchlings/zoo/assets/apostrofae_pairs.json +32 -0
- glitchlings/zoo/core.py +582 -0
- glitchlings/zoo/hokey.py +173 -0
- glitchlings/zoo/jargoyle.py +335 -0
- glitchlings/zoo/mim1c.py +109 -0
- glitchlings/zoo/ocr_confusions.tsv +30 -0
- glitchlings/zoo/redactyl.py +193 -0
- glitchlings/zoo/reduple.py +148 -0
- glitchlings/zoo/rushmore.py +153 -0
- glitchlings/zoo/scannequin.py +171 -0
- glitchlings/zoo/typogre.py +231 -0
- glitchlings/zoo/zeedub.py +185 -0
- glitchlings-0.4.5.dist-info/METADATA +648 -0
- glitchlings-0.4.5.dist-info/RECORD +53 -0
- glitchlings-0.4.5.dist-info/WHEEL +5 -0
- glitchlings-0.4.5.dist-info/entry_points.txt +2 -0
- glitchlings-0.4.5.dist-info/licenses/LICENSE +201 -0
- glitchlings-0.4.5.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
"""Integration helpers for PyTorch Lightning data modules."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Iterable, Mapping, Sequence
|
|
6
|
+
from typing import Any, cast
|
|
7
|
+
|
|
8
|
+
from ..compat import get_pytorch_lightning_datamodule, require_pytorch_lightning
|
|
9
|
+
from ..util.adapters import coerce_gaggle
|
|
10
|
+
from ..zoo import Gaggle, Glitchling
|
|
11
|
+
from ._shared import corrupt_text_value, normalize_column_spec
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _glitch_batch(batch: Any, columns: list[str], gaggle: Gaggle) -> Any:
|
|
15
|
+
"""Apply glitchlings to the configured batch columns."""
|
|
16
|
+
if not isinstance(batch, Mapping):
|
|
17
|
+
return batch
|
|
18
|
+
|
|
19
|
+
if hasattr(batch, "copy"):
|
|
20
|
+
mutated = batch.copy()
|
|
21
|
+
else:
|
|
22
|
+
mutated = dict(batch)
|
|
23
|
+
|
|
24
|
+
missing = [column for column in columns if column not in mutated]
|
|
25
|
+
if missing:
|
|
26
|
+
missing_str = ", ".join(sorted(missing))
|
|
27
|
+
raise ValueError(f"Columns not found in batch: {missing_str}")
|
|
28
|
+
|
|
29
|
+
for column in columns:
|
|
30
|
+
mutated[column] = corrupt_text_value(mutated[column], gaggle)
|
|
31
|
+
|
|
32
|
+
return mutated
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _wrap_dataloader(dataloader: Any, columns: list[str], gaggle: Gaggle) -> Any:
|
|
36
|
+
"""Wrap a dataloader so yielded batches are corrupted lazily."""
|
|
37
|
+
if dataloader is None:
|
|
38
|
+
return None
|
|
39
|
+
|
|
40
|
+
if isinstance(dataloader, Mapping):
|
|
41
|
+
mapping_type = cast(type[Any], dataloader.__class__)
|
|
42
|
+
return mapping_type(
|
|
43
|
+
{key: _wrap_dataloader(value, columns, gaggle) for key, value in dataloader.items()}
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
if isinstance(dataloader, list):
|
|
47
|
+
return [_wrap_dataloader(value, columns, gaggle) for value in dataloader]
|
|
48
|
+
|
|
49
|
+
if isinstance(dataloader, tuple):
|
|
50
|
+
return tuple(_wrap_dataloader(value, columns, gaggle) for value in dataloader)
|
|
51
|
+
|
|
52
|
+
if isinstance(dataloader, Sequence) and not isinstance(dataloader, (str, bytes, bytearray)):
|
|
53
|
+
sequence_type = cast(type[Any], dataloader.__class__)
|
|
54
|
+
return sequence_type(_wrap_dataloader(value, columns, gaggle) for value in dataloader)
|
|
55
|
+
|
|
56
|
+
return _GlitchedDataLoader(dataloader, columns, gaggle)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class _GlitchedDataLoader:
|
|
60
|
+
"""Proxy dataloader that glitches batches produced by the wrapped loader."""
|
|
61
|
+
|
|
62
|
+
def __init__(self, dataloader: Any, columns: list[str], gaggle: Gaggle) -> None:
|
|
63
|
+
self._dataloader = dataloader
|
|
64
|
+
self._columns = columns
|
|
65
|
+
self._gaggle = gaggle
|
|
66
|
+
|
|
67
|
+
def __iter__(self) -> Any:
|
|
68
|
+
for batch in self._dataloader:
|
|
69
|
+
yield _glitch_batch(batch, self._columns, self._gaggle)
|
|
70
|
+
|
|
71
|
+
def __len__(self) -> int:
|
|
72
|
+
return len(self._dataloader)
|
|
73
|
+
|
|
74
|
+
def __getattr__(self, attribute: str) -> Any:
|
|
75
|
+
return getattr(self._dataloader, attribute)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _glitch_datamodule(
    datamodule: Any,
    glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling],
    column: str | Sequence[str],
    *,
    seed: int = 151,
) -> Any:
    """Build a proxy around ``datamodule`` whose batches are glitched.

    ``column`` is normalised with ``normalize_column_spec`` and ``glitchlings``
    is turned into a gaggle with ``coerce_gaggle`` using ``seed``.

    Raises:
        ValueError: If the normalised column specification is ``None``.
    """
    normalized = normalize_column_spec(column)
    if normalized is None:  # pragma: no cover - defensive
        raise ValueError("At least one column must be specified")

    # Lightning datamodules only support string column names (mapping keys)
    return _GlitchedLightningDataModule(
        datamodule,
        cast(list[str], normalized),
        coerce_gaggle(glitchlings, seed=seed),
    )
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class _GlitchedLightningDataModule:
|
|
97
|
+
"""Proxy wrapper around a LightningDataModule applying glitchlings to batches."""
|
|
98
|
+
|
|
99
|
+
def __init__(self, base: Any, columns: list[str], gaggle: Gaggle) -> None:
|
|
100
|
+
object.__setattr__(self, "_glitch_base", base)
|
|
101
|
+
object.__setattr__(self, "_glitch_columns", columns)
|
|
102
|
+
object.__setattr__(self, "_glitch_gaggle", gaggle)
|
|
103
|
+
|
|
104
|
+
def __getattr__(self, attribute: str) -> Any:
|
|
105
|
+
return getattr(self._glitch_base, attribute)
|
|
106
|
+
|
|
107
|
+
def __setattr__(self, attribute: str, value: Any) -> None:
|
|
108
|
+
if attribute.startswith("_glitch_"):
|
|
109
|
+
object.__setattr__(self, attribute, value)
|
|
110
|
+
else:
|
|
111
|
+
setattr(self._glitch_base, attribute, value)
|
|
112
|
+
|
|
113
|
+
def __delattr__(self, attribute: str) -> None:
|
|
114
|
+
if attribute.startswith("_glitch_"):
|
|
115
|
+
object.__delattr__(self, attribute)
|
|
116
|
+
else:
|
|
117
|
+
delattr(self._glitch_base, attribute)
|
|
118
|
+
|
|
119
|
+
def __dir__(self) -> list[str]:
|
|
120
|
+
return sorted(set(dir(self.__class__)) | set(dir(self._glitch_base)))
|
|
121
|
+
|
|
122
|
+
# LightningDataModule API -------------------------------------------------
|
|
123
|
+
def prepare_data(self, *args: Any, **kwargs: Any) -> Any:
|
|
124
|
+
return self._glitch_base.prepare_data(*args, **kwargs)
|
|
125
|
+
|
|
126
|
+
def setup(self, *args: Any, **kwargs: Any) -> Any:
|
|
127
|
+
return self._glitch_base.setup(*args, **kwargs)
|
|
128
|
+
|
|
129
|
+
def teardown(self, *args: Any, **kwargs: Any) -> Any:
|
|
130
|
+
return self._glitch_base.teardown(*args, **kwargs)
|
|
131
|
+
|
|
132
|
+
def state_dict(self) -> Mapping[str, Any]:
|
|
133
|
+
state = self._glitch_base.state_dict()
|
|
134
|
+
return cast(Mapping[str, Any], state)
|
|
135
|
+
|
|
136
|
+
def load_state_dict(self, state_dict: Mapping[str, Any]) -> None:
|
|
137
|
+
self._glitch_base.load_state_dict(state_dict)
|
|
138
|
+
|
|
139
|
+
def transfer_batch_to_device(self, batch: Any, device: Any, dataloader_idx: int) -> Any:
|
|
140
|
+
return self._glitch_base.transfer_batch_to_device(batch, device, dataloader_idx)
|
|
141
|
+
|
|
142
|
+
def on_before_batch_transfer(self, batch: Any, dataloader_idx: int) -> Any:
|
|
143
|
+
return self._glitch_base.on_before_batch_transfer(batch, dataloader_idx)
|
|
144
|
+
|
|
145
|
+
def on_after_batch_transfer(self, batch: Any, dataloader_idx: int) -> Any:
|
|
146
|
+
return self._glitch_base.on_after_batch_transfer(batch, dataloader_idx)
|
|
147
|
+
|
|
148
|
+
def train_dataloader(self, *args: Any, **kwargs: Any) -> Any:
|
|
149
|
+
loader = self._glitch_base.train_dataloader(*args, **kwargs)
|
|
150
|
+
return _wrap_dataloader(loader, self._glitch_columns, self._glitch_gaggle)
|
|
151
|
+
|
|
152
|
+
def val_dataloader(self, *args: Any, **kwargs: Any) -> Any:
|
|
153
|
+
loader = self._glitch_base.val_dataloader(*args, **kwargs)
|
|
154
|
+
return _wrap_dataloader(loader, self._glitch_columns, self._glitch_gaggle)
|
|
155
|
+
|
|
156
|
+
def test_dataloader(self, *args: Any, **kwargs: Any) -> Any:
|
|
157
|
+
loader = self._glitch_base.test_dataloader(*args, **kwargs)
|
|
158
|
+
return _wrap_dataloader(loader, self._glitch_columns, self._glitch_gaggle)
|
|
159
|
+
|
|
160
|
+
def predict_dataloader(self, *args: Any, **kwargs: Any) -> Any:
|
|
161
|
+
loader = self._glitch_base.predict_dataloader(*args, **kwargs)
|
|
162
|
+
return _wrap_dataloader(loader, self._glitch_columns, self._glitch_gaggle)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _ensure_datamodule_class() -> Any:
    """Return Lightning's ``LightningDataModule`` after attaching ``.glitch`` to it.

    The proxy class ``_GlitchedLightningDataModule`` is also re-based onto the
    resolved class when it is not already a subclass of it.

    Raises:
        ModuleNotFoundError: If no ``LightningDataModule`` class can be resolved.
    """
    resolved = get_pytorch_lightning_datamodule()
    if resolved is None:  # pragma: no cover - dependency is optional
        lightning = require_pytorch_lightning("pytorch_lightning is not installed")
        resolved = getattr(lightning, "LightningDataModule", None)
        if resolved is None:
            raise ModuleNotFoundError("pytorch_lightning is not installed")

    # Install the helper only when the class has no usable ``glitch`` yet.
    if getattr(resolved, "glitch", None) is None:

        def glitch(
            self: Any,
            glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling],
            *,
            column: str | Sequence[str],
            seed: int = 151,
            **_: Any,
        ) -> Any:
            return _glitch_datamodule(self, glitchlings, column, seed=seed)

        resolved.glitch = glitch

    already_derived = issubclass(_GlitchedLightningDataModule, resolved)
    if not already_derived:
        _GlitchedLightningDataModule.__bases__ = (resolved,)

    return resolved
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def install() -> None:
    """Patch ``LightningDataModule`` so instances expose ``.glitch``.

    Delegates to ``_ensure_datamodule_class`` and discards the class it returns.
    """
    _ = _ensure_datamodule_class()
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
LightningDataModule: type[Any] | None
|
|
202
|
+
_LightningDataModuleAlias = get_pytorch_lightning_datamodule()
|
|
203
|
+
if _LightningDataModuleAlias is not None:
|
|
204
|
+
LightningDataModule = _ensure_datamodule_class()
|
|
205
|
+
else: # pragma: no cover - optional dependency
|
|
206
|
+
LightningDataModule = None
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
__all__ = ["LightningDataModule", "install"]
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
"""Lexicon abstractions and default backend resolution helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import random
|
|
6
|
+
from abc import ABC, abstractmethod
|
|
7
|
+
from hashlib import blake2s
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Callable, Iterable
|
|
10
|
+
|
|
11
|
+
from glitchlings.config import get_config
|
|
12
|
+
|
|
13
|
+
from ._cache import CacheEntries, CacheSnapshot
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Lexicon(ABC):
    """Base class that synonym lookup backends derive from.

    Parameters
    ----------
    seed:
        Optional integer mixed into every derived random number generator.
        Equal seeds give equal samples for the same word/part-of-speech
        query.

    """

    def __init__(self, *, seed: int | None = None) -> None:
        self._seed = seed

    @property
    def seed(self) -> int | None:
        """Base seed currently used for deterministic sampling."""
        return self._seed

    def reseed(self, seed: int | None) -> None:
        """Replace the base seed used for deterministic synonym sampling."""
        self._seed = seed

    def _derive_rng(self, word: str, pos: str | None) -> random.Random:
        """Build an RNG seeded from the word, the POS tag and the base seed."""
        # Hash order matters for reproducibility: word, optional POS, seed text.
        pieces = [word.lower()]
        if pos is not None:
            pieces.append(pos.lower())
        # ``str(None)`` is "None", so a missing seed hashes as that literal text.
        pieces.append(str(self._seed))

        hasher = blake2s(digest_size=8)
        for piece in pieces:
            hasher.update(piece.encode("utf8"))
        return random.Random(int.from_bytes(hasher.digest(), "big", signed=False))

    def _deterministic_sample(
        self, values: Iterable[str], *, limit: int, word: str, pos: str | None
    ) -> list[str]:
        """Pick at most ``limit`` of ``values`` reproducibly, keeping input order."""
        if limit <= 0:
            return []

        pool = list(values)
        if len(pool) <= limit:
            return pool

        rng = self._derive_rng(word, pos)
        # Sample positions rather than items so the result keeps input order.
        chosen = sorted(rng.sample(range(len(pool)), k=limit))
        return [pool[position] for position in chosen]

    @abstractmethod
    def get_synonyms(self, word: str, pos: str | None = None, n: int = 5) -> list[str]:
        """Return at most ``n`` synonyms of ``word``, restricted by ``pos``."""

    def supports_pos(self, pos: str | None) -> bool:
        """Report whether ``pos`` queries can be served; always ``True`` here."""
        return True

    def __repr__(self) -> str:  # pragma: no cover - trivial representation
        return f"{self.__class__.__name__}(seed={self._seed!r})"
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class LexiconBackend(Lexicon):
    """Lexicon variant whose synonym cache can be loaded from and saved to disk."""

    # Alias of the cache mapping type, exposed on the class.
    Cache = CacheEntries

    @classmethod
    @abstractmethod
    def load_cache(cls, path: str | Path) -> CacheSnapshot:
        """Load the cache stored at ``path`` and return it as a validated snapshot."""

    @abstractmethod
    def save_cache(self, path: str | Path | None = None) -> Path | None:
        """Write the backend cache to ``path`` and return where it was written."""
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
from .metrics import ( # noqa: E402
|
|
95
|
+
compare_lexicons,
|
|
96
|
+
coverage_ratio,
|
|
97
|
+
mean_cosine_similarity,
|
|
98
|
+
synonym_diversity,
|
|
99
|
+
)
|
|
100
|
+
from .vector import VectorLexicon, build_vector_cache # noqa: E402
|
|
101
|
+
|
|
102
|
+
_WordNetLexicon: type[LexiconBackend] | None
|
|
103
|
+
try: # pragma: no cover - optional dependency
|
|
104
|
+
from .wordnet import WordNetLexicon as _WordNetLexicon
|
|
105
|
+
except (
|
|
106
|
+
ImportError,
|
|
107
|
+
ModuleNotFoundError,
|
|
108
|
+
AttributeError,
|
|
109
|
+
): # pragma: no cover - triggered when nltk unavailable
|
|
110
|
+
_WordNetLexicon = None
|
|
111
|
+
|
|
112
|
+
WordNetLexicon: type[LexiconBackend] | None = _WordNetLexicon
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
_BACKEND_FACTORIES: dict[str, Callable[[int | None], Lexicon | None]] = {}
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def register_backend(name: str, factory: Callable[[int | None], Lexicon | None]) -> None:
    """Store ``factory`` under the lower-cased ``name`` for selection via config.

    An existing registration with the same normalised name is replaced.
    """
    _BACKEND_FACTORIES[name.lower()] = factory
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def unregister_backend(name: str) -> None:
    """Drop the backend registered under ``name``; unknown names are ignored."""
    key = name.lower()
    if key in _BACKEND_FACTORIES:
        del _BACKEND_FACTORIES[key]
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def available_backends() -> list[str]:
    """List the registered lexicon factory names in sorted order."""
    names = list(_BACKEND_FACTORIES)
    names.sort()
    return names
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _vector_backend(seed: int | None) -> Lexicon | None:
    """Create a ``VectorLexicon`` from the configured vector cache.

    Returns ``None`` when no cache path is configured or the path does not exist.
    """
    location = get_config().lexicon.vector_cache
    if location is None or not location.exists():
        return None
    return VectorLexicon(cache_path=location, seed=seed)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _wordnet_backend(seed: int | None) -> Lexicon | None:  # pragma: no cover - optional
    """Create a ``WordNetLexicon``, or return ``None`` when that is not possible.

    ``None`` is returned when the class could not be imported or when
    constructing it raises ``RuntimeError``.
    """
    if WordNetLexicon is None:
        return None
    try:
        return WordNetLexicon(seed=seed)
    except RuntimeError:
        return None
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
register_backend("vector", _vector_backend)
|
|
155
|
+
register_backend("wordnet", _wordnet_backend)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def get_default_lexicon(seed: int | None = None) -> Lexicon:
    """Return the first lexicon that a backend in ``lexicon.priority`` can build.

    Backends are tried in configured order; names without a registered factory
    and factories that return ``None`` are skipped.

    Raises:
        RuntimeError: If no configured backend produced a lexicon. The message
            lists every attempt and why it was skipped.
    """
    failures: list[str] = []
    for name in get_config().lexicon.priority:
        factory = _BACKEND_FACTORIES.get(name.lower())
        if factory is None:
            reason = "unknown"
        else:
            candidate = factory(seed)
            if candidate is not None:
                return candidate
            reason = "unavailable"
        failures.append(f"{name} ({reason})")

    summary = ", ".join(failures) or "<none>"
    raise RuntimeError(
        "No lexicon backends available; configure lexicon.priority with at least one "
        f"working backend. Attempts: {summary}."
    )
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
__all__ = [
|
|
179
|
+
"Lexicon",
|
|
180
|
+
"LexiconBackend",
|
|
181
|
+
"VectorLexicon",
|
|
182
|
+
"WordNetLexicon",
|
|
183
|
+
"build_vector_cache",
|
|
184
|
+
"compare_lexicons",
|
|
185
|
+
"coverage_ratio",
|
|
186
|
+
"mean_cosine_similarity",
|
|
187
|
+
"synonym_diversity",
|
|
188
|
+
"get_default_lexicon",
|
|
189
|
+
"register_backend",
|
|
190
|
+
"unregister_backend",
|
|
191
|
+
"available_backends",
|
|
192
|
+
]
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""Shared cache helpers for lexicon backends."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from hashlib import blake2s
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Mapping, Sequence, cast
|
|
10
|
+
|
|
11
|
+
CacheEntries = dict[str, list[str]]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass(frozen=True)
class CacheSnapshot:
    """Immutable pairing of loaded cache entries with their checksum."""

    # Mapping from a word to its list of synonyms.
    entries: CacheEntries
    # Checksum associated with ``entries``; ``None`` when there is none.
    checksum: str | None = None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _normalize_entries(payload: Mapping[str, object]) -> CacheEntries:
|
|
23
|
+
"""Convert raw cache payloads into canonical mapping form."""
|
|
24
|
+
entries: CacheEntries = {}
|
|
25
|
+
for key, values in payload.items():
|
|
26
|
+
if not isinstance(key, str):
|
|
27
|
+
raise RuntimeError("Synonym cache keys must be strings.")
|
|
28
|
+
if not isinstance(values, Sequence):
|
|
29
|
+
raise RuntimeError("Synonym cache values must be sequences of strings.")
|
|
30
|
+
entries[key] = [str(value) for value in values]
|
|
31
|
+
return entries
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _canonical_json(entries: Mapping[str, Sequence[str]]) -> str:
|
|
35
|
+
"""Return a deterministic JSON serialisation for ``entries``."""
|
|
36
|
+
serialisable = {key: list(values) for key, values in sorted(entries.items())}
|
|
37
|
+
return json.dumps(serialisable, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def compute_checksum(entries: Mapping[str, Sequence[str]]) -> str:
    """Return the hex BLAKE2s digest (16-byte) of the canonical JSON of ``entries``."""
    canonical_bytes = _canonical_json(entries).encode("utf8")
    return blake2s(canonical_bytes, digest_size=16).hexdigest()
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def load_cache(path: Path) -> CacheSnapshot:
    """Read the cache file at ``path`` and verify its checksum when one is stored.

    A missing file yields an empty snapshot. Two layouts are accepted: a
    document with ``__meta__`` and ``entries`` keys, or the legacy layout in
    which the whole document is the entries mapping.

    Raises:
        RuntimeError: If the document, its entries or its metadata have the
            wrong shape, if the stored checksum is not a string, or if it does
            not match the checksum computed from the entries.
    """
    if not path.exists():
        return CacheSnapshot(entries={}, checksum=None)

    with path.open("r", encoding="utf8") as stream:
        document = json.load(stream)

    if not isinstance(document, Mapping):
        raise RuntimeError("Synonym cache payload must be a mapping of strings to lists.")
    payload = cast(Mapping[str, object], document)

    stored_checksum: str | None = None
    entries_source: Mapping[str, object]

    if "__meta__" in payload and "entries" in payload:
        raw_entries = payload["entries"]
        if not isinstance(raw_entries, Mapping):
            raise RuntimeError("Synonym cache entries must be stored as a mapping.")
        entries_source = cast(Mapping[str, object], raw_entries)

        metadata = payload["__meta__"]
        if not isinstance(metadata, Mapping):
            raise RuntimeError("Synonym cache metadata must be a mapping.")
        candidate = metadata.get("checksum")
        if candidate is not None and not isinstance(candidate, str):
            raise RuntimeError("Synonym cache checksum must be a string when provided.")
        stored_checksum = candidate
    else:
        entries_source = payload  # legacy format without metadata

    entries = _normalize_entries(entries_source)
    if stored_checksum is not None and stored_checksum != compute_checksum(entries):
        raise RuntimeError(
            "Synonym cache checksum mismatch; the cache file appears to be corrupted."
        )

    return CacheSnapshot(entries=entries, checksum=stored_checksum)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def write_cache(path: Path, entries: Mapping[str, Sequence[str]]) -> CacheSnapshot:
    """Write ``entries`` to ``path`` as JSON together with checksum metadata.

    Parent directories are created when missing. The returned snapshot holds
    the key-sorted entries that were written and their checksum.
    """
    ordered: CacheEntries = {key: list(entries[key]) for key in sorted(entries)}
    digest = compute_checksum(ordered)

    # ``__meta__`` records the checksum and the number of entries.
    metadata = {"checksum": digest, "entries": len(ordered)}
    document = {"__meta__": metadata, "entries": ordered}

    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf8") as stream:
        json.dump(document, stream, ensure_ascii=False, indent=2, sort_keys=True)

    return CacheSnapshot(entries=ordered, checksum=digest)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
__all__ = ["CacheEntries", "CacheSnapshot", "compute_checksum", "load_cache", "write_cache"]
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
{
|
|
2
|
+
"alpha": [
|
|
3
|
+
"beta",
|
|
4
|
+
"gamma",
|
|
5
|
+
"delta"
|
|
6
|
+
],
|
|
7
|
+
"beta": [
|
|
8
|
+
"alpha",
|
|
9
|
+
"gamma",
|
|
10
|
+
"delta"
|
|
11
|
+
],
|
|
12
|
+
"delta": [
|
|
13
|
+
"alpha",
|
|
14
|
+
"beta",
|
|
15
|
+
"gamma"
|
|
16
|
+
],
|
|
17
|
+
"fast": [
|
|
18
|
+
"rapid",
|
|
19
|
+
"swift",
|
|
20
|
+
"speedy",
|
|
21
|
+
"brisk"
|
|
22
|
+
],
|
|
23
|
+
"gamma": [
|
|
24
|
+
"alpha",
|
|
25
|
+
"beta",
|
|
26
|
+
"delta"
|
|
27
|
+
],
|
|
28
|
+
"happy": [
|
|
29
|
+
"glad",
|
|
30
|
+
"joyful",
|
|
31
|
+
"content",
|
|
32
|
+
"upbeat"
|
|
33
|
+
],
|
|
34
|
+
"quick": [
|
|
35
|
+
"swift",
|
|
36
|
+
"rapid",
|
|
37
|
+
"speedy",
|
|
38
|
+
"nimble"
|
|
39
|
+
],
|
|
40
|
+
"quickly": [
|
|
41
|
+
"swiftly",
|
|
42
|
+
"rapidly",
|
|
43
|
+
"promptly",
|
|
44
|
+
"speedily"
|
|
45
|
+
],
|
|
46
|
+
"sing": [
|
|
47
|
+
"croon",
|
|
48
|
+
"serenade",
|
|
49
|
+
"vocalize",
|
|
50
|
+
"perform"
|
|
51
|
+
],
|
|
52
|
+
"slow": [
|
|
53
|
+
"sluggish",
|
|
54
|
+
"leisurely",
|
|
55
|
+
"unhurried",
|
|
56
|
+
"gradual"
|
|
57
|
+
],
|
|
58
|
+
"songs": [
|
|
59
|
+
"tracks",
|
|
60
|
+
"melodies",
|
|
61
|
+
"ballads",
|
|
62
|
+
"tunes"
|
|
63
|
+
],
|
|
64
|
+
"text": [
|
|
65
|
+
"passage",
|
|
66
|
+
"copy",
|
|
67
|
+
"script",
|
|
68
|
+
"narrative"
|
|
69
|
+
],
|
|
70
|
+
"they": [
|
|
71
|
+
"those people",
|
|
72
|
+
"those individuals",
|
|
73
|
+
"the group",
|
|
74
|
+
"those folks"
|
|
75
|
+
],
|
|
76
|
+
"words": [
|
|
77
|
+
"terms",
|
|
78
|
+
"phrases",
|
|
79
|
+
"lexicon",
|
|
80
|
+
"vocabulary"
|
|
81
|
+
]
|
|
82
|
+
}
|