glitchlings 0.3.0-cp312-cp312-macosx_11_0_universal2.whl → 0.4.0-cp312-cp312-macosx_11_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,303 @@
+ """Graph-based lexicon backed by ConceptNet/Numberbatch embeddings."""
+
+ from __future__ import annotations
+
+ import json
+ import re
+ from pathlib import Path
+ from typing import Iterable, Mapping, MutableMapping, Sequence
+
+ from . import Lexicon
+ from .vector import VectorLexicon
+
+
+ _CONCEPT_RE = re.compile(r"^/c/(?P<lang>[a-z]{2})/(?P<term>[^/]+)")
+ _PUNCTUATION_RE = re.compile(r"[^\w\s-]+", re.UNICODE)
+
+
+ def _lemmatize_token(token: str) -> str:
+     """Return a lightweight lemma for ``token`` using heuristic rules."""
+
+     irregular = {
+         "children": "child",
+         "mice": "mouse",
+         "geese": "goose",
+         "feet": "foot",
+         "teeth": "tooth",
+         "men": "man",
+         "women": "woman",
+         "better": "good",
+         "worse": "bad",
+     }
+     lowered = token.lower()
+     if lowered in irregular:
+         return irregular[lowered]
+
+     if lowered.endswith("ies") and len(lowered) > 3:
+         return lowered[:-3] + "y"
+     if lowered.endswith("ves") and len(lowered) > 3:
+         return lowered[:-3] + "f"
+     if lowered.endswith("men") and len(lowered) > 3:
+         return lowered[:-3] + "man"
+     if lowered.endswith("ses") and len(lowered) > 3:
+         return lowered[:-2]
+     if lowered.endswith("es") and len(lowered) > 3:
+         return lowered[:-2]
+     if lowered.endswith("s") and len(lowered) > 2 and not lowered.endswith("ss"):
+         return lowered[:-1]
+     if lowered.endswith("ing") and len(lowered) > 4:
+         stem = lowered[:-3]
+         if len(stem) > 2 and stem[-1] == stem[-2]:
+             stem = stem[:-1]
+         return stem
+     if lowered.endswith("ed") and len(lowered) > 3:
+         stem = lowered[:-2]
+         if len(stem) > 2 and stem[-1] == stem[-2]:
+             stem = stem[:-1]
+         return stem
+     return lowered
+
+
+ def _normalize_phrase(phrase: str) -> str:
+     """Normalise ``phrase`` for ConceptNet lookups."""
+
+     stripped = _PUNCTUATION_RE.sub(" ", phrase.lower())
+     tokens = [token for token in stripped.split() if token]
+     if not tokens:
+         return ""
+     lemmatised = [_lemmatize_token(token) for token in tokens]
+     return " ".join(lemmatised)
+
+
+ def _concept_terms(normalized: str) -> list[str]:
+     """Return ConceptNet term variants for ``normalized``."""
+
+     collapsed = normalized.replace(" ", "_")
+     if not collapsed:
+         return []
+     variants = {collapsed}
+     variants.add(collapsed.replace("_", "-"))
+     variants.add(collapsed.replace("-", "_"))
+     return list(variants)
+
+
+ def _surface_from_concept(concept: str) -> str | None:
+     """Return a human-readable surface form for ``concept``."""
+
+     match = _CONCEPT_RE.match(concept)
+     if match is None:
+         return None
+     term = match.group("term")
+     surface = term.replace("_", " ")
+     surface = surface.replace("-", " ")
+     return " ".join(surface.split())
+
+
+ def _language_from_concept(concept: str) -> str | None:
+     match = _CONCEPT_RE.match(concept)
+     if match is None:
+         return None
+     return match.group("lang")
+
+
+ def _load_numberbatch(path: Path, *, languages: set[str]) -> Mapping[str, list[float]]:
+     """Load ConceptNet Numberbatch embeddings from ``path``."""
+
+     if not path.exists():
+         return {}
+
+     if path.suffix == ".gz":
+         import gzip
+
+         handle = gzip.open(path, "rt", encoding="utf8")
+     else:
+         handle = path.open("r", encoding="utf8")
+
+     with handle as stream:
+         header = stream.readline()
+         try:
+             parts = header.strip().split()
+             if len(parts) >= 2:
+                 int(parts[0])
+                 int(parts[1])
+         except ValueError:
+             stream.seek(0)
+
+         embeddings: dict[str, list[float]] = {}
+         for line in stream:
+             tokens = line.strip().split()
+             if len(tokens) <= 2:
+                 continue
+             concept = tokens[0]
+             lang = _language_from_concept(concept)
+             if lang is None or lang not in languages:
+                 continue
+             try:
+                 vector = [float(value) for value in tokens[1:]]
+             except ValueError:
+                 continue
+             embeddings[concept] = vector
+     return embeddings
+
+
+ def _load_cache(path: Path) -> dict[str, list[str]]:
+     if not path.exists():
+         return {}
+     with path.open("r", encoding="utf8") as handle:
+         payload = json.load(handle)
+     if not isinstance(payload, Mapping):
+         raise RuntimeError("Graph lexicon cache must be a mapping of strings to lists.")
+     cache: dict[str, list[str]] = {}
+     for key, values in payload.items():
+         if not isinstance(key, str):
+             raise RuntimeError("Graph lexicon cache keys must be strings.")
+         if not isinstance(values, Sequence):
+             raise RuntimeError("Graph lexicon cache values must be sequences of strings.")
+         cache[key] = [str(value) for value in values]
+     return cache
+
+
+ def _write_cache(path: Path, cache: Mapping[str, Sequence[str]]) -> None:
+     serialisable = {key: list(values) for key, values in sorted(cache.items())}
+     with path.open("w", encoding="utf8") as handle:
+         json.dump(serialisable, handle, ensure_ascii=False, indent=2, sort_keys=True)
+
+
+ class GraphLexicon(Lexicon):
+     """Lexicon backed by ConceptNet/Numberbatch embeddings."""
+
+     def __init__(
+         self,
+         *,
+         source: Mapping[str, Sequence[float]] | str | Path | None = None,
+         cache: Mapping[str, Sequence[str]] | None = None,
+         cache_path: str | Path | None = None,
+         languages: Iterable[str] = ("en",),
+         max_neighbors: int = 50,
+         min_similarity: float = 0.0,
+         seed: int | None = None,
+     ) -> None:
+         super().__init__(seed=seed)
+         self._languages = {language.lower() for language in languages}
+         if not self._languages:
+             self._languages = {"en"}
+         self._max_neighbors = max(1, max_neighbors)
+         self._min_similarity = min_similarity
+         self._cache: MutableMapping[str, list[str]] = {}
+         self._cache_path = Path(cache_path) if cache_path is not None else None
+         if self._cache_path is not None:
+             self._cache.update(_load_cache(self._cache_path))
+         if cache is not None:
+             for key, values in cache.items():
+                 self._cache[str(key)] = [str(value) for value in values]
+         self._cache_dirty = False
+
+         prepared_source = self._prepare_source(source)
+         self._backend = VectorLexicon(
+             source=prepared_source if prepared_source else None,
+             max_neighbors=self._max_neighbors,
+             min_similarity=self._min_similarity,
+             case_sensitive=True,
+             seed=seed,
+         )
+
+     def _prepare_source(
+         self, source: Mapping[str, Sequence[float]] | str | Path | None
+     ) -> Mapping[str, Sequence[float]]:
+         if source is None:
+             return {}
+         if isinstance(source, Mapping):
+             prepared: dict[str, list[float]] = {}
+             for key, vector in source.items():
+                 lang = _language_from_concept(key)
+                 if lang is None or lang not in self._languages:
+                     continue
+                 prepared[key] = [float(value) for value in vector]
+             return prepared
+         path = Path(source)
+         embeddings = _load_numberbatch(path, languages=self._languages)
+         return embeddings
+
+     def reseed(self, seed: int | None) -> None:
+         super().reseed(seed)
+         self._backend.reseed(seed)
+
+     def _concept_candidates(self, normalized: str) -> list[str]:
+         terms = _concept_terms(normalized)
+         concepts = []
+         for language in sorted(self._languages):
+             for term in terms:
+                 concepts.append(f"/c/{language}/{term}")
+         return concepts
+
+     def _collect_synonyms(self, normalized: str) -> list[str]:
+         candidates: list[str] = []
+         seen: set[str] = set()
+         for concept in self._concept_candidates(normalized):
+             neighbors = self._backend.precompute(concept, limit=self._max_neighbors)
+             for neighbor in neighbors:
+                 lang = _language_from_concept(neighbor)
+                 if lang is None or lang not in self._languages:
+                     continue
+                 surface = _surface_from_concept(neighbor)
+                 if surface is None:
+                     continue
+                 surface_norm = _normalize_phrase(surface)
+                 if not surface_norm or surface_norm == normalized:
+                     continue
+                 if surface_norm in seen:
+                     continue
+                 seen.add(surface_norm)
+                 candidates.append(surface)
+         return candidates
+
+     def _ensure_cached(self, normalized: str) -> list[str]:
+         if normalized in self._cache:
+             return self._cache[normalized]
+         synonyms = self._collect_synonyms(normalized)
+         self._cache[normalized] = synonyms
+         if self._cache_path is not None:
+             self._cache_dirty = True
+         return synonyms
+
+     def get_synonyms(
+         self, word: str, pos: str | None = None, n: int = 5
+     ) -> list[str]:
+         normalized = _normalize_phrase(word)
+         if not normalized:
+             return []
+         synonyms = self._ensure_cached(normalized)
+         return self._deterministic_sample(synonyms, limit=n, word=word, pos=pos)
+
+     def precompute(self, word: str) -> list[str]:
+         normalized = _normalize_phrase(word)
+         if not normalized:
+             return []
+         return list(self._ensure_cached(normalized))
+
+     def export_cache(self) -> dict[str, list[str]]:
+         return {key: list(values) for key, values in self._cache.items()}
+
+     def save_cache(self, path: str | Path | None = None) -> Path:
+         if path is None:
+             if self._cache_path is None:
+                 raise RuntimeError("No cache path supplied to GraphLexicon.")
+             target = self._cache_path
+         else:
+             target = Path(path)
+             self._cache_path = target
+         _write_cache(target, self._cache)
+         self._cache_dirty = False
+         return target
+
+     def supports_pos(self, pos: str | None) -> bool:
+         return True
+
+     def __repr__(self) -> str:  # pragma: no cover - debug helper
+         adapter = getattr(self._backend, "_adapter", None)
+         state = "loaded" if adapter else "empty"
+         return (
+             f"GraphLexicon(languages={sorted(self._languages)!r}, "
+             f"max_neighbors={self._max_neighbors}, seed={self.seed!r}, state={state})"
+         )
+
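
The new GraphLexicon normalises a query word, maps it onto ConceptNet concept URIs for each configured language, asks the VectorLexicon backend for nearest-neighbour concepts, and caches the resulting surface forms. A minimal usage sketch, not part of the released files: the import path, the cache file name, and the toy embedding mapping below are assumptions for illustration only.

    from glitchlings.lexicon.graph import GraphLexicon  # assumed module path

    # Toy vectors standing in for real ConceptNet Numberbatch embeddings.
    toy_embeddings = {
        "/c/en/cat": [0.9, 0.1, 0.0],
        "/c/en/kitten": [0.85, 0.15, 0.05],
        "/c/en/feline": [0.8, 0.2, 0.1],
    }

    lexicon = GraphLexicon(source=toy_embeddings, languages=("en",), seed=7)
    print(lexicon.get_synonyms("cats", n=2))  # "cats" is lemmatised to "cat" before lookup
    lexicon.save_cache("graph_cache.json")    # hypothetical path; writes the synonym cache as JSON
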
@@ -0,0 +1,169 @@
+ """Utility helpers for evaluating lexicon coverage and quality."""
+
+ from __future__ import annotations
+
+ import math
+ from collections.abc import Iterable, Mapping, Sequence
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:  # pragma: no cover - typing hint only
+     from . import Lexicon
+
+
+ def _unique_synonyms(
+     lexicon: "Lexicon",
+     word: str,
+     *,
+     pos: str | None,
+     sample_size: int,
+ ) -> list[str]:
+     """Return unique synonym candidates excluding the original token."""
+
+     collected: list[str] = []
+     seen: set[str] = set()
+     source = word.lower()
+     for synonym in lexicon.get_synonyms(word, pos=pos, n=sample_size):
+         normalized = synonym.lower()
+         if normalized == source:
+             continue
+         if normalized in seen:
+             continue
+         seen.add(normalized)
+         collected.append(synonym)
+     return collected
+
+
+ def synonym_diversity(
+     lexicon: "Lexicon",
+     words: Iterable[str],
+     *,
+     pos: str | None = None,
+     sample_size: int = 5,
+ ) -> float:
+     """Return the mean unique-synonym count for ``words`` using ``lexicon``."""
+
+     totals = []
+     for word in words:
+         synonyms = _unique_synonyms(lexicon, word, pos=pos, sample_size=sample_size)
+         totals.append(len(synonyms))
+     if not totals:
+         return 0.0
+     return sum(totals) / len(totals)
+
+
+ def coverage_ratio(
+     lexicon: "Lexicon",
+     words: Iterable[str],
+     *,
+     pos: str | None = None,
+     sample_size: int = 5,
+     min_synonyms: int = 3,
+ ) -> float:
+     """Return the fraction of ``words`` with at least ``min_synonyms`` candidates."""
+
+     total = 0
+     hits = 0
+     for word in words:
+         total += 1
+         synonyms = _unique_synonyms(lexicon, word, pos=pos, sample_size=sample_size)
+         if len(synonyms) >= min_synonyms:
+             hits += 1
+     if total == 0:
+         return 0.0
+     return hits / total
+
+
+ def _cosine_similarity(vector_a: Sequence[float], vector_b: Sequence[float]) -> float:
+     dot = 0.0
+     norm_a = 0.0
+     norm_b = 0.0
+     for value_a, value_b in zip(vector_a, vector_b):
+         dot += value_a * value_b
+         norm_a += value_a * value_a
+         norm_b += value_b * value_b
+     magnitude = math.sqrt(norm_a) * math.sqrt(norm_b)
+     if magnitude == 0.0:
+         return 0.0
+     return dot / magnitude
+
+
+ def mean_cosine_similarity(
+     lexicon: "Lexicon",
+     embeddings: Mapping[str, Sequence[float]],
+     words: Iterable[str],
+     *,
+     pos: str | None = None,
+     sample_size: int = 5,
+ ) -> float:
+     """Return the mean cosine similarity between each word and its candidates."""
+
+     total = 0.0
+     count = 0
+     for word in words:
+         source_vector = embeddings.get(word)
+         if source_vector is None:
+             continue
+         synonyms = _unique_synonyms(lexicon, word, pos=pos, sample_size=sample_size)
+         for synonym in synonyms:
+             synonym_vector = embeddings.get(synonym)
+             if synonym_vector is None:
+                 continue
+             total += _cosine_similarity(source_vector, synonym_vector)
+             count += 1
+     if count == 0:
+         return 0.0
+     return total / count
+
+
+ def compare_lexicons(
+     baseline: "Lexicon",
+     candidate: "Lexicon",
+     words: Iterable[str],
+     *,
+     pos: str | None = None,
+     sample_size: int = 5,
+     min_synonyms: int = 3,
+     embeddings: Mapping[str, Sequence[float]] | None = None,
+ ) -> dict[str, float]:
+     """Return comparative coverage and diversity statistics for two lexicons."""
+
+     stats = {
+         "baseline_diversity": synonym_diversity(
+             baseline, words, pos=pos, sample_size=sample_size
+         ),
+         "candidate_diversity": synonym_diversity(
+             candidate, words, pos=pos, sample_size=sample_size
+         ),
+         "baseline_coverage": coverage_ratio(
+             baseline,
+             words,
+             pos=pos,
+             sample_size=sample_size,
+             min_synonyms=min_synonyms,
+         ),
+         "candidate_coverage": coverage_ratio(
+             candidate,
+             words,
+             pos=pos,
+             sample_size=sample_size,
+             min_synonyms=min_synonyms,
+         ),
+     }
+
+     if embeddings is not None:
+         stats["baseline_similarity"] = mean_cosine_similarity(
+             baseline, embeddings, words, pos=pos, sample_size=sample_size
+         )
+         stats["candidate_similarity"] = mean_cosine_similarity(
+             candidate, embeddings, words, pos=pos, sample_size=sample_size
+         )
+
+     return stats
+
+
+ __all__ = [
+     "compare_lexicons",
+     "coverage_ratio",
+     "mean_cosine_similarity",
+     "synonym_diversity",
+ ]
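
These helpers accept any Lexicon implementation, so the two new modules compose. A rough sketch of how they might be combined; the module paths, the Numberbatch file name, and the word list are placeholders rather than values taken from the diff.

    from glitchlings.lexicon.graph import GraphLexicon        # assumed module path
    from glitchlings.lexicon.metrics import compare_lexicons  # assumed module path

    # Hypothetical Numberbatch dump; any Mapping[str, Sequence[float]] also works as a source.
    baseline = GraphLexicon(source="numberbatch-en.txt.gz", seed=7)
    candidate = GraphLexicon(source="numberbatch-en.txt.gz", max_neighbors=100, seed=7)

    words = ["cat", "house", "run"]
    stats = compare_lexicons(baseline, candidate, words, sample_size=5, min_synonyms=3)
    print(stats["baseline_coverage"], stats["candidate_coverage"])
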