glitchlings 0.4.1__cp311-cp311-macosx_11_0_universal2.whl → 0.4.2__cp311-cp311-macosx_11_0_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of glitchlings might be problematic. Click here for more details.
- glitchlings/__init__.py +26 -17
- glitchlings/__main__.py +0 -1
- glitchlings/_zoo_rust.cpython-311-darwin.so +0 -0
- glitchlings/compat.py +215 -0
- glitchlings/config.py +136 -19
- glitchlings/dlc/_shared.py +68 -0
- glitchlings/dlc/huggingface.py +26 -41
- glitchlings/dlc/prime.py +64 -101
- glitchlings/lexicon/__init__.py +8 -19
- glitchlings/lexicon/_cache.py +0 -7
- glitchlings/lexicon/graph.py +4 -12
- glitchlings/lexicon/metrics.py +1 -8
- glitchlings/lexicon/vector.py +15 -34
- glitchlings/lexicon/wordnet.py +31 -32
- glitchlings/main.py +9 -13
- glitchlings/util/__init__.py +18 -4
- glitchlings/util/adapters.py +27 -0
- glitchlings/zoo/__init__.py +21 -14
- glitchlings/zoo/_ocr_confusions.py +1 -3
- glitchlings/zoo/_rate.py +1 -4
- glitchlings/zoo/_sampling.py +0 -1
- glitchlings/zoo/_text_utils.py +1 -5
- glitchlings/zoo/adjax.py +0 -2
- glitchlings/zoo/core.py +114 -75
- glitchlings/zoo/jargoyle.py +9 -14
- glitchlings/zoo/mim1c.py +11 -10
- glitchlings/zoo/redactyl.py +5 -8
- glitchlings/zoo/reduple.py +3 -1
- glitchlings/zoo/rushmore.py +2 -8
- glitchlings/zoo/scannequin.py +5 -4
- glitchlings/zoo/typogre.py +3 -7
- glitchlings/zoo/zeedub.py +2 -2
- {glitchlings-0.4.1.dist-info → glitchlings-0.4.2.dist-info}/METADATA +67 -3
- glitchlings-0.4.2.dist-info/RECORD +42 -0
- glitchlings-0.4.1.dist-info/RECORD +0 -39
- {glitchlings-0.4.1.dist-info → glitchlings-0.4.2.dist-info}/WHEEL +0 -0
- {glitchlings-0.4.1.dist-info → glitchlings-0.4.2.dist-info}/entry_points.txt +0 -0
- {glitchlings-0.4.1.dist-info → glitchlings-0.4.2.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.4.1.dist-info → glitchlings-0.4.2.dist-info}/top_level.txt +0 -0
glitchlings/dlc/huggingface.py
CHANGED
|
@@ -3,21 +3,15 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
from collections.abc import Iterable, Sequence
|
|
6
|
-
from typing import Any
|
|
6
|
+
from typing import Any, cast
|
|
7
7
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
_DatasetsDataset = None # type: ignore[assignment]
|
|
12
|
-
else:
|
|
13
|
-
_datasets_error = None
|
|
14
|
-
|
|
15
|
-
from ..zoo import Gaggle, Glitchling, summon
|
|
8
|
+
from ..compat import datasets, get_datasets_dataset, require_datasets
|
|
9
|
+
from ..util.adapters import coerce_gaggle
|
|
10
|
+
from ..zoo import Gaggle, Glitchling
|
|
16
11
|
|
|
17
12
|
|
|
18
13
|
def _normalise_columns(column: str | Sequence[str]) -> list[str]:
|
|
19
14
|
"""Normalise a column specification to a list."""
|
|
20
|
-
|
|
21
15
|
if isinstance(column, str):
|
|
22
16
|
return [column]
|
|
23
17
|
|
|
@@ -27,20 +21,6 @@ def _normalise_columns(column: str | Sequence[str]) -> list[str]:
|
|
|
27
21
|
return normalised
|
|
28
22
|
|
|
29
23
|
|
|
30
|
-
def _as_gaggle(glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling], seed: int) -> Gaggle:
|
|
31
|
-
"""Coerce any supported glitchling specification into a :class:`Gaggle`."""
|
|
32
|
-
|
|
33
|
-
if isinstance(glitchlings, Gaggle):
|
|
34
|
-
return glitchlings
|
|
35
|
-
|
|
36
|
-
if isinstance(glitchlings, (Glitchling, str)):
|
|
37
|
-
resolved: Iterable[str | Glitchling] = [glitchlings]
|
|
38
|
-
else:
|
|
39
|
-
resolved = glitchlings
|
|
40
|
-
|
|
41
|
-
return summon(list(resolved), seed=seed)
|
|
42
|
-
|
|
43
|
-
|
|
44
24
|
def _glitch_dataset(
|
|
45
25
|
dataset: Any,
|
|
46
26
|
glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling],
|
|
@@ -48,23 +28,28 @@ def _glitch_dataset(
|
|
|
48
28
|
*,
|
|
49
29
|
seed: int = 151,
|
|
50
30
|
) -> Any:
|
|
51
|
-
"""
|
|
52
|
-
|
|
31
|
+
"""Apply glitchlings to the provided dataset columns."""
|
|
53
32
|
columns = _normalise_columns(column)
|
|
54
|
-
gaggle =
|
|
33
|
+
gaggle = coerce_gaggle(glitchlings, seed=seed)
|
|
55
34
|
return gaggle.corrupt_dataset(dataset, columns)
|
|
56
35
|
|
|
57
36
|
|
|
58
37
|
def _ensure_dataset_class() -> Any:
|
|
59
38
|
"""Return the Hugging Face :class:`~datasets.Dataset` patched with ``.glitch``."""
|
|
60
|
-
|
|
61
|
-
if
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
39
|
+
dataset_cls = get_datasets_dataset()
|
|
40
|
+
if dataset_cls is None: # pragma: no cover - datasets is an install-time dependency
|
|
41
|
+
require_datasets("datasets is not installed")
|
|
42
|
+
dataset_cls = get_datasets_dataset()
|
|
43
|
+
if dataset_cls is None:
|
|
44
|
+
message = "datasets is not installed"
|
|
45
|
+
error = datasets.error
|
|
46
|
+
if error is not None:
|
|
47
|
+
raise ModuleNotFoundError(message) from error
|
|
48
|
+
raise ModuleNotFoundError(message)
|
|
49
|
+
|
|
50
|
+
if getattr(dataset_cls, "glitch", None) is None:
|
|
51
|
+
|
|
52
|
+
def glitch(
|
|
68
53
|
self: Any,
|
|
69
54
|
glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling],
|
|
70
55
|
*,
|
|
@@ -73,24 +58,24 @@ def _ensure_dataset_class() -> Any:
|
|
|
73
58
|
**_: Any,
|
|
74
59
|
) -> Any:
|
|
75
60
|
"""Return a lazily corrupted copy of the dataset."""
|
|
76
|
-
|
|
77
61
|
return _glitch_dataset(self, glitchlings, column, seed=seed)
|
|
78
62
|
|
|
79
|
-
setattr(
|
|
63
|
+
setattr(dataset_cls, "glitch", glitch)
|
|
80
64
|
|
|
81
|
-
return
|
|
65
|
+
return cast(type[Any], dataset_cls)
|
|
82
66
|
|
|
83
67
|
|
|
84
68
|
def install() -> None:
|
|
85
69
|
"""Monkeypatch the Hugging Face :class:`~datasets.Dataset` with ``.glitch``."""
|
|
86
|
-
|
|
87
70
|
_ensure_dataset_class()
|
|
88
71
|
|
|
89
72
|
|
|
90
|
-
|
|
73
|
+
Dataset: type[Any] | None
|
|
74
|
+
_DatasetAlias = get_datasets_dataset()
|
|
75
|
+
if _DatasetAlias is not None:
|
|
91
76
|
Dataset = _ensure_dataset_class()
|
|
92
77
|
else: # pragma: no cover - datasets is an install-time dependency
|
|
93
|
-
Dataset = None
|
|
78
|
+
Dataset = None
|
|
94
79
|
|
|
95
80
|
|
|
96
81
|
__all__ = ["Dataset", "install"]
|
glitchlings/dlc/prime.py
CHANGED
|
@@ -4,79 +4,60 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from collections.abc import Iterable, Sequence
|
|
6
6
|
from enum import Enum
|
|
7
|
-
from typing import Any, Callable
|
|
7
|
+
from typing import Any, Callable, Protocol, cast
|
|
8
8
|
|
|
9
|
-
import
|
|
9
|
+
from ..compat import require_datasets, require_jellyfish, require_verifiers
|
|
10
|
+
from ..util.adapters import coerce_gaggle
|
|
11
|
+
from ..zoo import Gaggle, Glitchling, Mim1c, Typogre
|
|
12
|
+
from ._shared import resolve_columns as _resolve_columns_shared
|
|
13
|
+
from ._shared import resolve_environment as _resolve_environment_shared
|
|
10
14
|
|
|
11
|
-
from jellyfish import damerau_levenshtein_distance
|
|
12
15
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
except ModuleNotFoundError: # pragma: no cover - optional dependency
|
|
16
|
-
Dataset = object # type: ignore[assignment]
|
|
17
|
-
else:
|
|
18
|
-
if Dataset is None: # pragma: no cover - optional dependency
|
|
19
|
-
Dataset = object # type: ignore[assignment]
|
|
16
|
+
class VerifierEnvironment(Protocol):
|
|
17
|
+
"""Minimal interface for verifiers environments."""
|
|
20
18
|
|
|
21
|
-
|
|
19
|
+
dataset: Any
|
|
22
20
|
|
|
23
21
|
|
|
24
|
-
|
|
25
|
-
"""
|
|
26
|
-
|
|
27
|
-
if isinstance(env, str):
|
|
28
|
-
env = vf.load_environment(env)
|
|
22
|
+
class VerifierSingleTurnEnv(Protocol):
|
|
23
|
+
"""Minimal interface for single-turn verifier environments."""
|
|
29
24
|
|
|
30
|
-
|
|
31
|
-
|
|
25
|
+
dataset: Any
|
|
26
|
+
rubric: Any
|
|
32
27
|
|
|
33
|
-
return env
|
|
34
28
|
|
|
29
|
+
vf = require_verifiers("verifiers is not installed; install glitchlings[prime]")
|
|
30
|
+
_jellyfish = require_jellyfish("jellyfish is not installed; install glitchlings[prime]")
|
|
31
|
+
damerau_levenshtein_distance = _jellyfish.damerau_levenshtein_distance
|
|
35
32
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
33
|
+
try:
|
|
34
|
+
from .huggingface import Dataset as _HuggingFaceDataset
|
|
35
|
+
except ModuleNotFoundError: # pragma: no cover - optional dependency
|
|
36
|
+
_HuggingFaceDataset = None
|
|
37
|
+
else:
|
|
38
|
+
if _HuggingFaceDataset is None: # pragma: no cover - optional dependency
|
|
39
|
+
_HuggingFaceDataset = None
|
|
40
40
|
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
return list(columns)
|
|
41
|
+
Dataset: type[Any]
|
|
42
|
+
if _HuggingFaceDataset is None:
|
|
43
|
+
Dataset = object
|
|
44
|
+
else:
|
|
45
|
+
Dataset = _HuggingFaceDataset
|
|
47
46
|
|
|
48
|
-
for candidate in ("prompt", "question"):
|
|
49
|
-
if candidate in available:
|
|
50
|
-
return [candidate]
|
|
51
47
|
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
iterator = iter(dataset)
|
|
61
|
-
try:
|
|
62
|
-
first_row = next(iterator)
|
|
63
|
-
except StopIteration:
|
|
64
|
-
preview_rows = []
|
|
65
|
-
else:
|
|
66
|
-
preview_rows = [first_row]
|
|
67
|
-
sample = dict(preview_rows[0]) if preview_rows else {}
|
|
68
|
-
else:
|
|
69
|
-
sample = dataset[0] if dataset_length else {}
|
|
70
|
-
inferred = [
|
|
71
|
-
name
|
|
72
|
-
for name in dataset.column_names
|
|
73
|
-
if isinstance(sample.get(name), str)
|
|
74
|
-
]
|
|
48
|
+
def _resolve_environment(env: str | VerifierEnvironment) -> VerifierEnvironment:
|
|
49
|
+
"""Return a fully-instantiated verifier environment."""
|
|
50
|
+
resolved = _resolve_environment_shared(
|
|
51
|
+
env,
|
|
52
|
+
loader=vf.load_environment,
|
|
53
|
+
environment_type=cast(type[Any], vf.Environment),
|
|
54
|
+
)
|
|
55
|
+
return cast(VerifierEnvironment, resolved)
|
|
75
56
|
|
|
76
|
-
if inferred:
|
|
77
|
-
return inferred
|
|
78
57
|
|
|
79
|
-
|
|
58
|
+
def _resolve_columns(dataset: Any, columns: Sequence[str] | None) -> list[str]:
|
|
59
|
+
"""Identify which dataset columns should be corrupted."""
|
|
60
|
+
return _resolve_columns_shared(dataset, columns)
|
|
80
61
|
|
|
81
62
|
|
|
82
63
|
class Difficulty(Enum):
|
|
@@ -90,12 +71,11 @@ class Difficulty(Enum):
|
|
|
90
71
|
|
|
91
72
|
|
|
92
73
|
def tutorial_level(
|
|
93
|
-
env:
|
|
74
|
+
env: VerifierEnvironment | str,
|
|
94
75
|
seed: int = 151,
|
|
95
76
|
difficulty: Difficulty = Difficulty.Normal,
|
|
96
|
-
) ->
|
|
77
|
+
) -> VerifierEnvironment:
|
|
97
78
|
"""Create a low-corruption environment using tuned defaults."""
|
|
98
|
-
|
|
99
79
|
tuned_mim1c = Mim1c(rate=0.01 * difficulty.value)
|
|
100
80
|
tuned_typogre = Typogre(rate=0.025 * difficulty.value)
|
|
101
81
|
|
|
@@ -107,28 +87,19 @@ def tutorial_level(
|
|
|
107
87
|
|
|
108
88
|
|
|
109
89
|
def load_environment(
|
|
110
|
-
env: str |
|
|
90
|
+
env: str | VerifierEnvironment,
|
|
111
91
|
glitchlings: Iterable[str | Glitchling] | Glitchling | str | Gaggle | None = None,
|
|
112
92
|
*,
|
|
113
93
|
seed: int = 151,
|
|
114
94
|
columns: Sequence[str] | None = None,
|
|
115
|
-
) ->
|
|
95
|
+
) -> VerifierEnvironment:
|
|
116
96
|
"""Load an environment and optionally corrupt it with glitchlings."""
|
|
117
|
-
|
|
118
97
|
environment = _resolve_environment(env)
|
|
119
98
|
|
|
120
99
|
if glitchlings is None:
|
|
121
100
|
return environment
|
|
122
101
|
|
|
123
|
-
|
|
124
|
-
gaggle = glitchlings
|
|
125
|
-
else:
|
|
126
|
-
if isinstance(glitchlings, (Glitchling, str)):
|
|
127
|
-
resolved = [glitchlings]
|
|
128
|
-
else:
|
|
129
|
-
resolved = list(glitchlings)
|
|
130
|
-
|
|
131
|
-
gaggle = summon(resolved, seed=seed)
|
|
102
|
+
gaggle = coerce_gaggle(glitchlings, seed=seed)
|
|
132
103
|
|
|
133
104
|
dataset = environment.dataset
|
|
134
105
|
corrupt_columns = _resolve_columns(dataset, columns)
|
|
@@ -142,21 +113,11 @@ def _as_gaggle(
|
|
|
142
113
|
seed: int,
|
|
143
114
|
) -> Gaggle:
|
|
144
115
|
"""Coerce any supported glitchling specification into a :class:`Gaggle`."""
|
|
145
|
-
|
|
146
|
-
if isinstance(glitchlings, Gaggle):
|
|
147
|
-
return glitchlings
|
|
148
|
-
|
|
149
|
-
if isinstance(glitchlings, (Glitchling, str)):
|
|
150
|
-
resolved: Iterable[str | Glitchling] = [glitchlings]
|
|
151
|
-
else:
|
|
152
|
-
resolved = glitchlings
|
|
153
|
-
|
|
154
|
-
return summon(list(resolved), seed=seed)
|
|
116
|
+
return coerce_gaggle(glitchlings, seed=seed)
|
|
155
117
|
|
|
156
118
|
|
|
157
119
|
def _extract_completion_text(completion: Any) -> str:
|
|
158
120
|
"""Normalise a completion payload into a plain string."""
|
|
159
|
-
|
|
160
121
|
if isinstance(completion, str):
|
|
161
122
|
return completion
|
|
162
123
|
|
|
@@ -175,11 +136,10 @@ def symmetric_damerau_levenshtein_similarity(
|
|
|
175
136
|
answer: str,
|
|
176
137
|
) -> float:
|
|
177
138
|
"""Return ``1 - (distance / max_len)`` using Damerau-Levenshtein distance."""
|
|
178
|
-
|
|
179
139
|
completion_text = _extract_completion_text(completion)
|
|
180
140
|
target = answer or ""
|
|
181
141
|
denominator = max(len(completion_text), len(target), 1)
|
|
182
|
-
distance = damerau_levenshtein_distance(completion_text, target)
|
|
142
|
+
distance = cast(int, damerau_levenshtein_distance(completion_text, target))
|
|
183
143
|
score = 1.0 - (distance / denominator)
|
|
184
144
|
return max(0.0, min(1.0, score))
|
|
185
145
|
|
|
@@ -199,32 +159,34 @@ def echo_chamber(
|
|
|
199
159
|
reward_function: Callable[..., float] | None = None,
|
|
200
160
|
split: str | None = None,
|
|
201
161
|
**load_dataset_kwargs: Any,
|
|
202
|
-
) ->
|
|
162
|
+
) -> VerifierSingleTurnEnv:
|
|
203
163
|
"""Create an Echo Chamber Prime environment from a Hugging Face dataset column.
|
|
204
164
|
|
|
205
165
|
Args:
|
|
206
166
|
dataset_id: Identifier of the Hugging Face dataset to load.
|
|
207
167
|
column: Name of the column whose text should be glitched.
|
|
208
168
|
glitchlings: Glitchling specifiers that will corrupt the prompts.
|
|
209
|
-
seed: RNG seed forwarded to :func:`
|
|
169
|
+
seed: RNG seed forwarded to :func:`glitchlings.util.adapters.coerce_gaggle`.
|
|
210
170
|
instructions: System instructions supplied to the environment prompts.
|
|
211
171
|
reward_function: Optional callable used to score completions. Defaults to
|
|
212
172
|
:func:`symmetric_damerau_levenshtein_similarity` when omitted.
|
|
213
173
|
split: Optional dataset split to load.
|
|
214
174
|
**load_dataset_kwargs: Extra keyword arguments forwarded to
|
|
215
175
|
:func:`datasets.load_dataset`.
|
|
216
|
-
"""
|
|
217
176
|
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
177
|
+
"""
|
|
178
|
+
datasets_module = require_datasets("datasets is required to build an echo chamber")
|
|
179
|
+
load_dataset = getattr(datasets_module, "load_dataset", None)
|
|
180
|
+
if load_dataset is None: # pragma: no cover - defensive
|
|
221
181
|
message = "datasets is required to build an echo chamber"
|
|
222
|
-
raise ModuleNotFoundError(message)
|
|
182
|
+
raise ModuleNotFoundError(message)
|
|
223
183
|
|
|
224
|
-
|
|
184
|
+
dataset_dict_cls = getattr(datasets_module, "DatasetDict", dict)
|
|
185
|
+
|
|
186
|
+
hf_dataset: Any
|
|
225
187
|
if split is None:
|
|
226
188
|
hf_dataset = load_dataset(dataset_id, **load_dataset_kwargs)
|
|
227
|
-
if isinstance(hf_dataset,
|
|
189
|
+
if isinstance(hf_dataset, dataset_dict_cls):
|
|
228
190
|
try:
|
|
229
191
|
hf_dataset = next(iter(hf_dataset.values()))
|
|
230
192
|
except StopIteration as exc: # pragma: no cover - defensive
|
|
@@ -232,10 +194,8 @@ def echo_chamber(
|
|
|
232
194
|
else:
|
|
233
195
|
hf_dataset = load_dataset(dataset_id, split=split, **load_dataset_kwargs)
|
|
234
196
|
|
|
235
|
-
if isinstance(hf_dataset,
|
|
236
|
-
raise ValueError(
|
|
237
|
-
"Specify which split to use when the dataset loads as a DatasetDict."
|
|
238
|
-
)
|
|
197
|
+
if isinstance(hf_dataset, dataset_dict_cls):
|
|
198
|
+
raise ValueError("Specify which split to use when the dataset loads as a DatasetDict.")
|
|
239
199
|
|
|
240
200
|
filtered_dataset = hf_dataset.filter(
|
|
241
201
|
lambda row: row.get(column) is not None,
|
|
@@ -259,7 +219,7 @@ def echo_chamber(
|
|
|
259
219
|
)
|
|
260
220
|
|
|
261
221
|
try:
|
|
262
|
-
dataset_length = len(base_dataset)
|
|
222
|
+
dataset_length = len(base_dataset)
|
|
263
223
|
except TypeError:
|
|
264
224
|
preview_rows: list[dict[str, Any]]
|
|
265
225
|
take_fn = getattr(base_dataset, "take", None)
|
|
@@ -288,4 +248,7 @@ def echo_chamber(
|
|
|
288
248
|
|
|
289
249
|
rubric_func = reward_function or symmetric_damerau_levenshtein_similarity
|
|
290
250
|
rubric = vf.Rubric(funcs=[rubric_func], weights=[1.0])
|
|
291
|
-
return
|
|
251
|
+
return cast(
|
|
252
|
+
VerifierSingleTurnEnv,
|
|
253
|
+
vf.SingleTurnEnv(dataset=glitched_dataset, rubric=rubric),
|
|
254
|
+
)
|
glitchlings/lexicon/__init__.py
CHANGED
|
@@ -2,13 +2,14 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
import random
|
|
5
6
|
from abc import ABC, abstractmethod
|
|
6
7
|
from hashlib import blake2s
|
|
7
8
|
from pathlib import Path
|
|
8
|
-
import random
|
|
9
9
|
from typing import Callable, Iterable
|
|
10
10
|
|
|
11
11
|
from glitchlings.config import get_config
|
|
12
|
+
|
|
12
13
|
from ._cache import CacheEntries, CacheSnapshot
|
|
13
14
|
|
|
14
15
|
|
|
@@ -21,6 +22,7 @@ class Lexicon(ABC):
|
|
|
21
22
|
Optional integer used to derive deterministic random number generators
|
|
22
23
|
for synonym sampling. Identical seeds guarantee reproducible results for
|
|
23
24
|
the same word/part-of-speech queries.
|
|
25
|
+
|
|
24
26
|
"""
|
|
25
27
|
|
|
26
28
|
def __init__(self, *, seed: int | None = None) -> None:
|
|
@@ -29,17 +31,14 @@ class Lexicon(ABC):
|
|
|
29
31
|
@property
|
|
30
32
|
def seed(self) -> int | None:
|
|
31
33
|
"""Return the current base seed used for deterministic sampling."""
|
|
32
|
-
|
|
33
34
|
return self._seed
|
|
34
35
|
|
|
35
36
|
def reseed(self, seed: int | None) -> None:
|
|
36
37
|
"""Update the base seed driving deterministic synonym sampling."""
|
|
37
|
-
|
|
38
38
|
self._seed = seed
|
|
39
39
|
|
|
40
40
|
def _derive_rng(self, word: str, pos: str | None) -> random.Random:
|
|
41
41
|
"""Return an RNG derived from the base seed, word, and POS tag."""
|
|
42
|
-
|
|
43
42
|
seed_material = blake2s(digest_size=8)
|
|
44
43
|
seed_material.update(word.lower().encode("utf8"))
|
|
45
44
|
if pos is not None:
|
|
@@ -53,7 +52,6 @@ class Lexicon(ABC):
|
|
|
53
52
|
self, values: Iterable[str], *, limit: int, word: str, pos: str | None
|
|
54
53
|
) -> list[str]:
|
|
55
54
|
"""Return up to ``limit`` values sampled deterministically."""
|
|
56
|
-
|
|
57
55
|
if limit <= 0:
|
|
58
56
|
return []
|
|
59
57
|
|
|
@@ -67,14 +65,11 @@ class Lexicon(ABC):
|
|
|
67
65
|
return [items[index] for index in indices]
|
|
68
66
|
|
|
69
67
|
@abstractmethod
|
|
70
|
-
def get_synonyms(
|
|
71
|
-
self, word: str, pos: str | None = None, n: int = 5
|
|
72
|
-
) -> list[str]:
|
|
68
|
+
def get_synonyms(self, word: str, pos: str | None = None, n: int = 5) -> list[str]:
|
|
73
69
|
"""Return up to ``n`` synonyms for ``word`` constrained by ``pos``."""
|
|
74
70
|
|
|
75
71
|
def supports_pos(self, pos: str | None) -> bool:
|
|
76
72
|
"""Return ``True`` when the backend can service ``pos`` queries."""
|
|
77
|
-
|
|
78
73
|
return True
|
|
79
74
|
|
|
80
75
|
def __repr__(self) -> str: # pragma: no cover - trivial representation
|
|
@@ -96,14 +91,14 @@ class LexiconBackend(Lexicon):
|
|
|
96
91
|
"""Persist the backend cache to ``path`` and return the destination."""
|
|
97
92
|
|
|
98
93
|
|
|
99
|
-
from .graph import GraphLexicon
|
|
100
|
-
from .metrics import (
|
|
94
|
+
from .graph import GraphLexicon # noqa: E402
|
|
95
|
+
from .metrics import ( # noqa: E402
|
|
101
96
|
compare_lexicons,
|
|
102
97
|
coverage_ratio,
|
|
103
98
|
mean_cosine_similarity,
|
|
104
99
|
synonym_diversity,
|
|
105
100
|
)
|
|
106
|
-
from .vector import VectorLexicon, build_vector_cache
|
|
101
|
+
from .vector import VectorLexicon, build_vector_cache # noqa: E402
|
|
107
102
|
|
|
108
103
|
try: # pragma: no cover - optional dependency
|
|
109
104
|
from .wordnet import WordNetLexicon
|
|
@@ -114,24 +109,19 @@ except Exception: # pragma: no cover - triggered when nltk unavailable
|
|
|
114
109
|
_BACKEND_FACTORIES: dict[str, Callable[[int | None], Lexicon | None]] = {}
|
|
115
110
|
|
|
116
111
|
|
|
117
|
-
def register_backend(
|
|
118
|
-
name: str, factory: Callable[[int | None], Lexicon | None]
|
|
119
|
-
) -> None:
|
|
112
|
+
def register_backend(name: str, factory: Callable[[int | None], Lexicon | None]) -> None:
|
|
120
113
|
"""Register ``factory`` for ``name`` so it can be selected via config."""
|
|
121
|
-
|
|
122
114
|
normalized = name.lower()
|
|
123
115
|
_BACKEND_FACTORIES[normalized] = factory
|
|
124
116
|
|
|
125
117
|
|
|
126
118
|
def unregister_backend(name: str) -> None:
|
|
127
119
|
"""Remove a previously registered backend."""
|
|
128
|
-
|
|
129
120
|
_BACKEND_FACTORIES.pop(name.lower(), None)
|
|
130
121
|
|
|
131
122
|
|
|
132
123
|
def available_backends() -> list[str]:
|
|
133
124
|
"""Return the names of registered lexicon factories."""
|
|
134
|
-
|
|
135
125
|
return sorted(_BACKEND_FACTORIES)
|
|
136
126
|
|
|
137
127
|
|
|
@@ -172,7 +162,6 @@ register_backend("wordnet", _wordnet_backend)
|
|
|
172
162
|
|
|
173
163
|
def get_default_lexicon(seed: int | None = None) -> Lexicon:
|
|
174
164
|
"""Return the first available lexicon according to configuration priority."""
|
|
175
|
-
|
|
176
165
|
config = get_config()
|
|
177
166
|
attempts: list[str] = []
|
|
178
167
|
for name in config.lexicon.priority:
|
glitchlings/lexicon/_cache.py
CHANGED
|
@@ -8,7 +8,6 @@ from hashlib import blake2s
|
|
|
8
8
|
from pathlib import Path
|
|
9
9
|
from typing import Mapping, Sequence
|
|
10
10
|
|
|
11
|
-
|
|
12
11
|
CacheEntries = dict[str, list[str]]
|
|
13
12
|
|
|
14
13
|
|
|
@@ -22,7 +21,6 @@ class CacheSnapshot:
|
|
|
22
21
|
|
|
23
22
|
def _normalise_entries(payload: Mapping[str, Sequence[str]]) -> CacheEntries:
|
|
24
23
|
"""Convert raw cache payloads into canonical mapping form."""
|
|
25
|
-
|
|
26
24
|
entries: CacheEntries = {}
|
|
27
25
|
for key, values in payload.items():
|
|
28
26
|
if not isinstance(key, str):
|
|
@@ -35,21 +33,18 @@ def _normalise_entries(payload: Mapping[str, Sequence[str]]) -> CacheEntries:
|
|
|
35
33
|
|
|
36
34
|
def _canonical_json(entries: Mapping[str, Sequence[str]]) -> str:
|
|
37
35
|
"""Return a deterministic JSON serialisation for ``entries``."""
|
|
38
|
-
|
|
39
36
|
serialisable = {key: list(values) for key, values in sorted(entries.items())}
|
|
40
37
|
return json.dumps(serialisable, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
|
|
41
38
|
|
|
42
39
|
|
|
43
40
|
def compute_checksum(entries: Mapping[str, Sequence[str]]) -> str:
|
|
44
41
|
"""Return a BLAKE2s checksum for ``entries``."""
|
|
45
|
-
|
|
46
42
|
digest = blake2s(_canonical_json(entries).encode("utf8"), digest_size=16)
|
|
47
43
|
return digest.hexdigest()
|
|
48
44
|
|
|
49
45
|
|
|
50
46
|
def load_cache(path: Path) -> CacheSnapshot:
|
|
51
47
|
"""Load a cache from ``path`` and verify its checksum if present."""
|
|
52
|
-
|
|
53
48
|
if not path.exists():
|
|
54
49
|
return CacheSnapshot(entries={}, checksum=None)
|
|
55
50
|
|
|
@@ -89,7 +84,6 @@ def load_cache(path: Path) -> CacheSnapshot:
|
|
|
89
84
|
|
|
90
85
|
def write_cache(path: Path, entries: Mapping[str, Sequence[str]]) -> CacheSnapshot:
|
|
91
86
|
"""Persist ``entries`` to ``path`` with checksum metadata."""
|
|
92
|
-
|
|
93
87
|
serialisable = {key: list(values) for key, values in sorted(entries.items())}
|
|
94
88
|
checksum = compute_checksum(serialisable)
|
|
95
89
|
payload = {
|
|
@@ -108,4 +102,3 @@ def write_cache(path: Path, entries: Mapping[str, Sequence[str]]) -> CacheSnapsh
|
|
|
108
102
|
|
|
109
103
|
|
|
110
104
|
__all__ = ["CacheEntries", "CacheSnapshot", "compute_checksum", "load_cache", "write_cache"]
|
|
111
|
-
|
glitchlings/lexicon/graph.py
CHANGED
|
@@ -7,17 +7,17 @@ from pathlib import Path
|
|
|
7
7
|
from typing import Iterable, Mapping, MutableMapping, Sequence
|
|
8
8
|
|
|
9
9
|
from . import LexiconBackend
|
|
10
|
-
from ._cache import CacheSnapshot
|
|
10
|
+
from ._cache import CacheSnapshot
|
|
11
|
+
from ._cache import load_cache as _load_cache_file
|
|
12
|
+
from ._cache import write_cache as _write_cache_file
|
|
11
13
|
from .vector import VectorLexicon
|
|
12
14
|
|
|
13
|
-
|
|
14
15
|
_CONCEPT_RE = re.compile(r"^/c/(?P<lang>[a-z]{2})/(?P<term>[^/]+)")
|
|
15
16
|
_PUNCTUATION_RE = re.compile(r"[^\w\s-]+", re.UNICODE)
|
|
16
17
|
|
|
17
18
|
|
|
18
19
|
def _lemmatize_token(token: str) -> str:
|
|
19
20
|
"""Return a lightweight lemma for ``token`` using heuristic rules."""
|
|
20
|
-
|
|
21
21
|
irregular = {
|
|
22
22
|
"children": "child",
|
|
23
23
|
"mice": "mouse",
|
|
@@ -60,7 +60,6 @@ def _lemmatize_token(token: str) -> str:
|
|
|
60
60
|
|
|
61
61
|
def _normalize_phrase(phrase: str) -> str:
|
|
62
62
|
"""Normalise ``phrase`` for ConceptNet lookups."""
|
|
63
|
-
|
|
64
63
|
stripped = _PUNCTUATION_RE.sub(" ", phrase.lower())
|
|
65
64
|
tokens = [token for token in stripped.split() if token]
|
|
66
65
|
if not tokens:
|
|
@@ -71,7 +70,6 @@ def _normalize_phrase(phrase: str) -> str:
|
|
|
71
70
|
|
|
72
71
|
def _concept_terms(normalized: str) -> list[str]:
|
|
73
72
|
"""Return ConceptNet term variants for ``normalized``."""
|
|
74
|
-
|
|
75
73
|
collapsed = normalized.replace(" ", "_")
|
|
76
74
|
if not collapsed:
|
|
77
75
|
return []
|
|
@@ -83,7 +81,6 @@ def _concept_terms(normalized: str) -> list[str]:
|
|
|
83
81
|
|
|
84
82
|
def _surface_from_concept(concept: str) -> str | None:
|
|
85
83
|
"""Return a human-readable surface form for ``concept``."""
|
|
86
|
-
|
|
87
84
|
match = _CONCEPT_RE.match(concept)
|
|
88
85
|
if match is None:
|
|
89
86
|
return None
|
|
@@ -102,7 +99,6 @@ def _language_from_concept(concept: str) -> str | None:
|
|
|
102
99
|
|
|
103
100
|
def _load_numberbatch(path: Path, *, languages: set[str]) -> Mapping[str, list[float]]:
|
|
104
101
|
"""Load ConceptNet Numberbatch embeddings from ``path``."""
|
|
105
|
-
|
|
106
102
|
if not path.exists():
|
|
107
103
|
return {}
|
|
108
104
|
|
|
@@ -240,9 +236,7 @@ class GraphLexicon(LexiconBackend):
|
|
|
240
236
|
self._cache_dirty = True
|
|
241
237
|
return synonyms
|
|
242
238
|
|
|
243
|
-
def get_synonyms(
|
|
244
|
-
self, word: str, pos: str | None = None, n: int = 5
|
|
245
|
-
) -> list[str]:
|
|
239
|
+
def get_synonyms(self, word: str, pos: str | None = None, n: int = 5) -> list[str]:
|
|
246
240
|
normalized = _normalize_phrase(word)
|
|
247
241
|
if not normalized:
|
|
248
242
|
return []
|
|
@@ -261,7 +255,6 @@ class GraphLexicon(LexiconBackend):
|
|
|
261
255
|
@classmethod
|
|
262
256
|
def load_cache(cls, path: str | Path) -> CacheSnapshot:
|
|
263
257
|
"""Load and validate a persisted ConceptNet cache file."""
|
|
264
|
-
|
|
265
258
|
return _load_cache_file(Path(path))
|
|
266
259
|
|
|
267
260
|
def save_cache(self, path: str | Path | None = None) -> Path:
|
|
@@ -287,4 +280,3 @@ class GraphLexicon(LexiconBackend):
|
|
|
287
280
|
f"GraphLexicon(languages={sorted(self._languages)!r}, "
|
|
288
281
|
f"max_neighbors={self._max_neighbors}, seed={self.seed!r}, state={state})"
|
|
289
282
|
)
|
|
290
|
-
|
glitchlings/lexicon/metrics.py
CHANGED
|
@@ -18,7 +18,6 @@ def _unique_synonyms(
|
|
|
18
18
|
sample_size: int,
|
|
19
19
|
) -> list[str]:
|
|
20
20
|
"""Return unique synonym candidates excluding the original token."""
|
|
21
|
-
|
|
22
21
|
collected: list[str] = []
|
|
23
22
|
seen: set[str] = set()
|
|
24
23
|
source = word.lower()
|
|
@@ -41,7 +40,6 @@ def synonym_diversity(
|
|
|
41
40
|
sample_size: int = 5,
|
|
42
41
|
) -> float:
|
|
43
42
|
"""Return the mean unique-synonym count for ``words`` using ``lexicon``."""
|
|
44
|
-
|
|
45
43
|
totals = []
|
|
46
44
|
for word in words:
|
|
47
45
|
synonyms = _unique_synonyms(lexicon, word, pos=pos, sample_size=sample_size)
|
|
@@ -60,7 +58,6 @@ def coverage_ratio(
|
|
|
60
58
|
min_synonyms: int = 3,
|
|
61
59
|
) -> float:
|
|
62
60
|
"""Return the fraction of ``words`` with at least ``min_synonyms`` candidates."""
|
|
63
|
-
|
|
64
61
|
total = 0
|
|
65
62
|
hits = 0
|
|
66
63
|
for word in words:
|
|
@@ -96,7 +93,6 @@ def mean_cosine_similarity(
|
|
|
96
93
|
sample_size: int = 5,
|
|
97
94
|
) -> float:
|
|
98
95
|
"""Return the mean cosine similarity between each word and its candidates."""
|
|
99
|
-
|
|
100
96
|
total = 0.0
|
|
101
97
|
count = 0
|
|
102
98
|
for word in words:
|
|
@@ -126,11 +122,8 @@ def compare_lexicons(
|
|
|
126
122
|
embeddings: Mapping[str, Sequence[float]] | None = None,
|
|
127
123
|
) -> dict[str, float]:
|
|
128
124
|
"""Return comparative coverage and diversity statistics for two lexicons."""
|
|
129
|
-
|
|
130
125
|
stats = {
|
|
131
|
-
"baseline_diversity": synonym_diversity(
|
|
132
|
-
baseline, words, pos=pos, sample_size=sample_size
|
|
133
|
-
),
|
|
126
|
+
"baseline_diversity": synonym_diversity(baseline, words, pos=pos, sample_size=sample_size),
|
|
134
127
|
"candidate_diversity": synonym_diversity(
|
|
135
128
|
candidate, words, pos=pos, sample_size=sample_size
|
|
136
129
|
),
|