glitchlings-0.2.6-cp311-cp311-macosx_11_0_universal2.whl → glitchlings-0.4.0-cp311-cp311-macosx_11_0_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of glitchlings might be problematic.
- glitchlings/__init__.py +8 -0
- glitchlings/_zoo_rust.cpython-311-darwin.so +0 -0
- glitchlings/config.py +258 -0
- glitchlings/config.toml +3 -0
- glitchlings/lexicon/__init__.py +191 -0
- glitchlings/lexicon/data/default_vector_cache.json +16 -0
- glitchlings/lexicon/graph.py +303 -0
- glitchlings/lexicon/metrics.py +169 -0
- glitchlings/lexicon/vector.py +610 -0
- glitchlings/lexicon/wordnet.py +182 -0
- glitchlings/main.py +145 -5
- glitchlings/zoo/__init__.py +20 -1
- glitchlings/zoo/_sampling.py +55 -0
- glitchlings/zoo/_text_utils.py +104 -0
- glitchlings/zoo/adjax.py +131 -0
- glitchlings/zoo/core.py +16 -14
- glitchlings/zoo/jargoyle.py +190 -200
- glitchlings/zoo/redactyl.py +32 -67
- glitchlings/zoo/reduple.py +13 -35
- glitchlings/zoo/rushmore.py +17 -28
- glitchlings/zoo/typogre.py +22 -1
- glitchlings/zoo/zeedub.py +40 -1
- {glitchlings-0.2.6.dist-info → glitchlings-0.4.0.dist-info}/METADATA +48 -11
- glitchlings-0.4.0.dist-info/RECORD +38 -0
- glitchlings-0.2.6.dist-info/RECORD +0 -27
- {glitchlings-0.2.6.dist-info → glitchlings-0.4.0.dist-info}/WHEEL +0 -0
- {glitchlings-0.2.6.dist-info → glitchlings-0.4.0.dist-info}/entry_points.txt +0 -0
- {glitchlings-0.2.6.dist-info → glitchlings-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.2.6.dist-info → glitchlings-0.4.0.dist-info}/top_level.txt +0 -0
glitchlings/lexicon/graph.py
@@ -0,0 +1,303 @@
+"""Graph-based lexicon backed by ConceptNet/Numberbatch embeddings."""
+
+from __future__ import annotations
+
+import json
+import re
+from pathlib import Path
+from typing import Iterable, Mapping, MutableMapping, Sequence
+
+from . import Lexicon
+from .vector import VectorLexicon
+
+
+_CONCEPT_RE = re.compile(r"^/c/(?P<lang>[a-z]{2})/(?P<term>[^/]+)")
+_PUNCTUATION_RE = re.compile(r"[^\w\s-]+", re.UNICODE)
+
+
+def _lemmatize_token(token: str) -> str:
+    """Return a lightweight lemma for ``token`` using heuristic rules."""
+
+    irregular = {
+        "children": "child",
+        "mice": "mouse",
+        "geese": "goose",
+        "feet": "foot",
+        "teeth": "tooth",
+        "men": "man",
+        "women": "woman",
+        "better": "good",
+        "worse": "bad",
+    }
+    lowered = token.lower()
+    if lowered in irregular:
+        return irregular[lowered]
+
+    if lowered.endswith("ies") and len(lowered) > 3:
+        return lowered[:-3] + "y"
+    if lowered.endswith("ves") and len(lowered) > 3:
+        return lowered[:-3] + "f"
+    if lowered.endswith("men") and len(lowered) > 3:
+        return lowered[:-3] + "man"
+    if lowered.endswith("ses") and len(lowered) > 3:
+        return lowered[:-2]
+    if lowered.endswith("es") and len(lowered) > 3:
+        return lowered[:-2]
+    if lowered.endswith("s") and len(lowered) > 2 and not lowered.endswith("ss"):
+        return lowered[:-1]
+    if lowered.endswith("ing") and len(lowered) > 4:
+        stem = lowered[:-3]
+        if len(stem) > 2 and stem[-1] == stem[-2]:
+            stem = stem[:-1]
+        return stem
+    if lowered.endswith("ed") and len(lowered) > 3:
+        stem = lowered[:-2]
+        if len(stem) > 2 and stem[-1] == stem[-2]:
+            stem = stem[:-1]
+        return stem
+    return lowered
+
+
+def _normalize_phrase(phrase: str) -> str:
+    """Normalise ``phrase`` for ConceptNet lookups."""
+
+    stripped = _PUNCTUATION_RE.sub(" ", phrase.lower())
+    tokens = [token for token in stripped.split() if token]
+    if not tokens:
+        return ""
+    lemmatised = [_lemmatize_token(token) for token in tokens]
+    return " ".join(lemmatised)
+
+
+def _concept_terms(normalized: str) -> list[str]:
+    """Return ConceptNet term variants for ``normalized``."""
+
+    collapsed = normalized.replace(" ", "_")
+    if not collapsed:
+        return []
+    variants = {collapsed}
+    variants.add(collapsed.replace("_", "-"))
+    variants.add(collapsed.replace("-", "_"))
+    return list(variants)
+
+
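Together, these three helpers turn a surface phrase into the lookup keys used against ConceptNet. A small sketch of the expected behaviour, traced by hand from the heuristics above rather than taken from the package's tests:

    from glitchlings.lexicon.graph import _concept_terms, _lemmatize_token, _normalize_phrase

    # Suffix heuristics: doubled consonants are collapsed for -ing/-ed stems.
    assert _lemmatize_token("running") == "run"
    # Irregular forms are looked up directly.
    assert _lemmatize_token("mice") == "mouse"
    # Punctuation is stripped, tokens are lemmatised, then rejoined.
    assert _normalize_phrase("Running Dogs!") == "run dog"
    # Multi-word keys are collapsed with underscores, plus a hyphenated variant.
    assert set(_concept_terms("run dog")) == {"run_dog", "run-dog"}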
+def _surface_from_concept(concept: str) -> str | None:
+    """Return a human-readable surface form for ``concept``."""
+
+    match = _CONCEPT_RE.match(concept)
+    if match is None:
+        return None
+    term = match.group("term")
+    surface = term.replace("_", " ")
+    surface = surface.replace("-", " ")
+    return " ".join(surface.split())
+
+
+def _language_from_concept(concept: str) -> str | None:
+    match = _CONCEPT_RE.match(concept)
+    if match is None:
+        return None
+    return match.group("lang")
+
+
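The inverse direction, from a `/c/<lang>/<term>` concept URI back to a surface form and language code, is handled by the two functions above. A hand-traced example with made-up concepts:

    from glitchlings.lexicon.graph import _language_from_concept, _surface_from_concept

    assert _language_from_concept("/c/en/ice_cream") == "en"
    # Underscores and hyphens both become spaces in the surface form.
    assert _surface_from_concept("/c/en/ice_cream") == "ice cream"
    assert _surface_from_concept("not-a-concept") is None  # no /c/<lang>/ prefix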
+def _load_numberbatch(path: Path, *, languages: set[str]) -> Mapping[str, list[float]]:
+    """Load ConceptNet Numberbatch embeddings from ``path``."""
+
+    if not path.exists():
+        return {}
+
+    if path.suffix == ".gz":
+        import gzip
+
+        handle = gzip.open(path, "rt", encoding="utf8")
+    else:
+        handle = path.open("r", encoding="utf8")
+
+    with handle as stream:
+        header = stream.readline()
+        try:
+            parts = header.strip().split()
+            if len(parts) >= 2:
+                int(parts[0])
+                int(parts[1])
+        except ValueError:
+            stream.seek(0)
+
+        embeddings: dict[str, list[float]] = {}
+        for line in stream:
+            tokens = line.strip().split()
+            if len(tokens) <= 2:
+                continue
+            concept = tokens[0]
+            lang = _language_from_concept(concept)
+            if lang is None or lang not in languages:
+                continue
+            try:
+                vector = [float(value) for value in tokens[1:]]
+            except ValueError:
+                continue
+            embeddings[concept] = vector
+        return embeddings
+
+
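`_load_numberbatch` reads the plain-text Numberbatch layout: an optional `<count> <dimensions>` header followed by one `/c/<lang>/<term>` concept and its vector per line, with non-requested languages skipped. A sketch using a hypothetical miniature file (the real Numberbatch releases are far larger):

    from pathlib import Path
    from glitchlings.lexicon.graph import _load_numberbatch

    sample = Path("mini_numberbatch.txt")  # hypothetical trimmed-down file
    sample.write_text(
        "2 3\n"                       # header: <entries> <dimensions>, skipped when present
        "/c/en/dog 0.1 0.2 0.3\n"
        "/c/fr/chien 0.1 0.2 0.3\n",  # dropped when languages={"en"}
        encoding="utf8",
    )
    embeddings = _load_numberbatch(sample, languages={"en"})
    assert embeddings == {"/c/en/dog": [0.1, 0.2, 0.3]}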
+def _load_cache(path: Path) -> dict[str, list[str]]:
+    if not path.exists():
+        return {}
+    with path.open("r", encoding="utf8") as handle:
+        payload = json.load(handle)
+    if not isinstance(payload, Mapping):
+        raise RuntimeError("Graph lexicon cache must be a mapping of strings to lists.")
+    cache: dict[str, list[str]] = {}
+    for key, values in payload.items():
+        if not isinstance(key, str):
+            raise RuntimeError("Graph lexicon cache keys must be strings.")
+        if not isinstance(values, Sequence):
+            raise RuntimeError("Graph lexicon cache values must be sequences of strings.")
+        cache[key] = [str(value) for value in values]
+    return cache
+
+
+def _write_cache(path: Path, cache: Mapping[str, Sequence[str]]) -> None:
+    serialisable = {key: list(values) for key, values in sorted(cache.items())}
+    with path.open("w", encoding="utf8") as handle:
+        json.dump(serialisable, handle, ensure_ascii=False, indent=2, sort_keys=True)
+
+
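The cache round-tripped by `_load_cache`/`_write_cache` is a flat JSON object mapping a normalised phrase to a list of synonym strings. A hypothetical round trip, using made-up filenames and entries:

    from pathlib import Path
    from glitchlings.lexicon.graph import _load_cache, _write_cache

    # Hypothetical cache entry: normalised phrase -> list of candidate surfaces.
    _write_cache(Path("synonyms.json"), {"dog": ["puppy", "hound"]})
    assert Path("synonyms.json").read_text(encoding="utf8").startswith("{")
    assert _load_cache(Path("synonyms.json")) == {"dog": ["puppy", "hound"]}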
+class GraphLexicon(Lexicon):
+    """Lexicon backed by ConceptNet/Numberbatch embeddings."""
+
+    def __init__(
+        self,
+        *,
+        source: Mapping[str, Sequence[float]] | str | Path | None = None,
+        cache: Mapping[str, Sequence[str]] | None = None,
+        cache_path: str | Path | None = None,
+        languages: Iterable[str] = ("en",),
+        max_neighbors: int = 50,
+        min_similarity: float = 0.0,
+        seed: int | None = None,
+    ) -> None:
+        super().__init__(seed=seed)
+        self._languages = {language.lower() for language in languages}
+        if not self._languages:
+            self._languages = {"en"}
+        self._max_neighbors = max(1, max_neighbors)
+        self._min_similarity = min_similarity
+        self._cache: MutableMapping[str, list[str]] = {}
+        self._cache_path = Path(cache_path) if cache_path is not None else None
+        if self._cache_path is not None:
+            self._cache.update(_load_cache(self._cache_path))
+        if cache is not None:
+            for key, values in cache.items():
+                self._cache[str(key)] = [str(value) for value in values]
+        self._cache_dirty = False
+
+        prepared_source = self._prepare_source(source)
+        self._backend = VectorLexicon(
+            source=prepared_source if prepared_source else None,
+            max_neighbors=self._max_neighbors,
+            min_similarity=self._min_similarity,
+            case_sensitive=True,
+            seed=seed,
+        )
+
+    def _prepare_source(
+        self, source: Mapping[str, Sequence[float]] | str | Path | None
+    ) -> Mapping[str, Sequence[float]]:
+        if source is None:
+            return {}
+        if isinstance(source, Mapping):
+            prepared: dict[str, list[float]] = {}
+            for key, vector in source.items():
+                lang = _language_from_concept(key)
+                if lang is None or lang not in self._languages:
+                    continue
+                prepared[key] = [float(value) for value in vector]
+            return prepared
+        path = Path(source)
+        embeddings = _load_numberbatch(path, languages=self._languages)
+        return embeddings
+
+    def reseed(self, seed: int | None) -> None:
+        super().reseed(seed)
+        self._backend.reseed(seed)
+
+    def _concept_candidates(self, normalized: str) -> list[str]:
+        terms = _concept_terms(normalized)
+        concepts = []
+        for language in sorted(self._languages):
+            for term in terms:
+                concepts.append(f"/c/{language}/{term}")
+        return concepts
+
+    def _collect_synonyms(self, normalized: str) -> list[str]:
+        candidates: list[str] = []
+        seen: set[str] = set()
+        for concept in self._concept_candidates(normalized):
+            neighbors = self._backend.precompute(concept, limit=self._max_neighbors)
+            for neighbor in neighbors:
+                lang = _language_from_concept(neighbor)
+                if lang is None or lang not in self._languages:
+                    continue
+                surface = _surface_from_concept(neighbor)
+                if surface is None:
+                    continue
+                surface_norm = _normalize_phrase(surface)
+                if not surface_norm or surface_norm == normalized:
+                    continue
+                if surface_norm in seen:
+                    continue
+                seen.add(surface_norm)
+                candidates.append(surface)
+        return candidates
+
+    def _ensure_cached(self, normalized: str) -> list[str]:
+        if normalized in self._cache:
+            return self._cache[normalized]
+        synonyms = self._collect_synonyms(normalized)
+        self._cache[normalized] = synonyms
+        if self._cache_path is not None:
+            self._cache_dirty = True
+        return synonyms
+
+    def get_synonyms(
+        self, word: str, pos: str | None = None, n: int = 5
+    ) -> list[str]:
+        normalized = _normalize_phrase(word)
+        if not normalized:
+            return []
+        synonyms = self._ensure_cached(normalized)
+        return self._deterministic_sample(synonyms, limit=n, word=word, pos=pos)
+
+    def precompute(self, word: str) -> list[str]:
+        normalized = _normalize_phrase(word)
+        if not normalized:
+            return []
+        return list(self._ensure_cached(normalized))
+
+    def export_cache(self) -> dict[str, list[str]]:
+        return {key: list(values) for key, values in self._cache.items()}
+
+    def save_cache(self, path: str | Path | None = None) -> Path:
+        if path is None:
+            if self._cache_path is None:
+                raise RuntimeError("No cache path supplied to GraphLexicon.")
+            target = self._cache_path
+        else:
+            target = Path(path)
+            self._cache_path = target
+        _write_cache(target, self._cache)
+        self._cache_dirty = False
+        return target
+
+    def supports_pos(self, pos: str | None) -> bool:
+        return True
+
+    def __repr__(self) -> str: # pragma: no cover - debug helper
+        adapter = getattr(self._backend, "_adapter", None)
+        state = "loaded" if adapter else "empty"
+        return (
+            f"GraphLexicon(languages={sorted(self._languages)!r}, "
+            f"max_neighbors={self._max_neighbors}, seed={self.seed!r}, state={state})"
+        )
+
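Putting the pieces together, GraphLexicon filters an embedding source down to the configured languages, asks the VectorLexicon backend (vector.py, not shown here) for up to max_neighbors similar concepts, and memoises the surfaced synonyms per normalised phrase. A rough usage sketch with a toy in-memory source; the vectors and printed results are hypothetical, and the actual neighbours depend on the VectorLexicon implementation:

    from glitchlings.lexicon.graph import GraphLexicon

    # Toy concept vectors standing in for Numberbatch; "dog" and "puppy" point the same way.
    vectors = {
        "/c/en/dog": [1.0, 0.0],
        "/c/en/puppy": [0.9, 0.1],
        "/c/en/banana": [0.0, 1.0],
    }
    lexicon = GraphLexicon(source=vectors, min_similarity=0.5, seed=42)
    print(lexicon.get_synonyms("dogs", n=2))   # e.g. ["puppy"]; "dogs" normalises to "dog"
    print(lexicon.precompute("banana"))        # neighbours above the similarity floor only
    lexicon.save_cache("graph_cache.json")     # persists the memoised lookups as JSON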
glitchlings/lexicon/metrics.py
@@ -0,0 +1,169 @@
+"""Utility helpers for evaluating lexicon coverage and quality."""
+
+from __future__ import annotations
+
+import math
+from collections.abc import Iterable, Mapping, Sequence
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING: # pragma: no cover - typing hint only
+    from . import Lexicon
+
+
+def _unique_synonyms(
+    lexicon: "Lexicon",
+    word: str,
+    *,
+    pos: str | None,
+    sample_size: int,
+) -> list[str]:
+    """Return unique synonym candidates excluding the original token."""
+
+    collected: list[str] = []
+    seen: set[str] = set()
+    source = word.lower()
+    for synonym in lexicon.get_synonyms(word, pos=pos, n=sample_size):
+        normalized = synonym.lower()
+        if normalized == source:
+            continue
+        if normalized in seen:
+            continue
+        seen.add(normalized)
+        collected.append(synonym)
+    return collected
+
+
+def synonym_diversity(
+    lexicon: "Lexicon",
+    words: Iterable[str],
+    *,
+    pos: str | None = None,
+    sample_size: int = 5,
+) -> float:
+    """Return the mean unique-synonym count for ``words`` using ``lexicon``."""
+
+    totals = []
+    for word in words:
+        synonyms = _unique_synonyms(lexicon, word, pos=pos, sample_size=sample_size)
+        totals.append(len(synonyms))
+    if not totals:
+        return 0.0
+    return sum(totals) / len(totals)
+
+
+def coverage_ratio(
+    lexicon: "Lexicon",
+    words: Iterable[str],
+    *,
+    pos: str | None = None,
+    sample_size: int = 5,
+    min_synonyms: int = 3,
+) -> float:
+    """Return the fraction of ``words`` with at least ``min_synonyms`` candidates."""
+
+    total = 0
+    hits = 0
+    for word in words:
+        total += 1
+        synonyms = _unique_synonyms(lexicon, word, pos=pos, sample_size=sample_size)
+        if len(synonyms) >= min_synonyms:
+            hits += 1
+    if total == 0:
+        return 0.0
+    return hits / total
+
+
+def _cosine_similarity(vector_a: Sequence[float], vector_b: Sequence[float]) -> float:
+    dot = 0.0
+    norm_a = 0.0
+    norm_b = 0.0
+    for value_a, value_b in zip(vector_a, vector_b):
+        dot += value_a * value_b
+        norm_a += value_a * value_a
+        norm_b += value_b * value_b
+    magnitude = math.sqrt(norm_a) * math.sqrt(norm_b)
+    if magnitude == 0.0:
+        return 0.0
+    return dot / magnitude
+
+
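`_cosine_similarity` is the usual dot product over the product of magnitudes, with a zero-magnitude guard. A quick hand check, not taken from the package's tests:

    import math
    from glitchlings.lexicon.metrics import _cosine_similarity

    # cos(45°): ([1, 0] · [1, 1]) / (1 * sqrt(2)) = 1 / sqrt(2)
    assert math.isclose(_cosine_similarity([1.0, 0.0], [1.0, 1.0]), 1 / math.sqrt(2))
    # A zero vector yields 0.0 rather than dividing by zero.
    assert _cosine_similarity([0.0, 0.0], [1.0, 1.0]) == 0.0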
+def mean_cosine_similarity(
+    lexicon: "Lexicon",
+    embeddings: Mapping[str, Sequence[float]],
+    words: Iterable[str],
+    *,
+    pos: str | None = None,
+    sample_size: int = 5,
+) -> float:
+    """Return the mean cosine similarity between each word and its candidates."""
+
+    total = 0.0
+    count = 0
+    for word in words:
+        source_vector = embeddings.get(word)
+        if source_vector is None:
+            continue
+        synonyms = _unique_synonyms(lexicon, word, pos=pos, sample_size=sample_size)
+        for synonym in synonyms:
+            synonym_vector = embeddings.get(synonym)
+            if synonym_vector is None:
+                continue
+            total += _cosine_similarity(source_vector, synonym_vector)
+            count += 1
+    if count == 0:
+        return 0.0
+    return total / count
+
+
+def compare_lexicons(
+    baseline: "Lexicon",
+    candidate: "Lexicon",
+    words: Iterable[str],
+    *,
+    pos: str | None = None,
+    sample_size: int = 5,
+    min_synonyms: int = 3,
+    embeddings: Mapping[str, Sequence[float]] | None = None,
+) -> dict[str, float]:
+    """Return comparative coverage and diversity statistics for two lexicons."""
+
+    stats = {
+        "baseline_diversity": synonym_diversity(
+            baseline, words, pos=pos, sample_size=sample_size
+        ),
+        "candidate_diversity": synonym_diversity(
+            candidate, words, pos=pos, sample_size=sample_size
+        ),
+        "baseline_coverage": coverage_ratio(
+            baseline,
+            words,
+            pos=pos,
+            sample_size=sample_size,
+            min_synonyms=min_synonyms,
+        ),
+        "candidate_coverage": coverage_ratio(
+            candidate,
+            words,
+            pos=pos,
+            sample_size=sample_size,
+            min_synonyms=min_synonyms,
+        ),
+    }
+
+    if embeddings is not None:
+        stats["baseline_similarity"] = mean_cosine_similarity(
+            baseline, embeddings, words, pos=pos, sample_size=sample_size
+        )
+        stats["candidate_similarity"] = mean_cosine_similarity(
+            candidate, embeddings, words, pos=pos, sample_size=sample_size
+        )
+
+    return stats
+
+
+__all__ = [
+    "compare_lexicons",
+    "coverage_ratio",
+    "mean_cosine_similarity",
+    "synonym_diversity",
+]
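These metrics are intended to be run over a shared word sample so two Lexicon backends can be compared, for example a baseline against the new graph-based lexicon. A rough sketch; the file names and word list here are made up, and any two Lexicon instances would work:

    from glitchlings.lexicon.graph import GraphLexicon
    from glitchlings.lexicon.metrics import compare_lexicons

    # Hypothetical comparison of two graph lexicons built from different embedding files.
    baseline = GraphLexicon(source="numberbatch-small.txt", seed=0)
    candidate = GraphLexicon(source="numberbatch-full.txt.gz", seed=0)
    words = ["dog", "banana", "run"]
    stats = compare_lexicons(baseline, candidate, words, sample_size=5, min_synonyms=3)
    # Keys: baseline_diversity, candidate_diversity, baseline_coverage, candidate_coverage
    # (plus baseline_similarity/candidate_similarity when an embeddings mapping is passed).
    print(stats)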