glitchlings-0.4.1-cp312-cp312-macosx_11_0_universal2.whl → glitchlings-0.4.3-cp312-cp312-macosx_11_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of glitchlings might be problematic.

Files changed (47)
  1. glitchlings/__init__.py +30 -17
  2. glitchlings/__main__.py +0 -1
  3. glitchlings/_zoo_rust.cpython-312-darwin.so +0 -0
  4. glitchlings/compat.py +284 -0
  5. glitchlings/config.py +164 -34
  6. glitchlings/config.toml +1 -1
  7. glitchlings/dlc/__init__.py +3 -1
  8. glitchlings/dlc/_shared.py +68 -0
  9. glitchlings/dlc/huggingface.py +26 -41
  10. glitchlings/dlc/prime.py +64 -101
  11. glitchlings/dlc/pytorch.py +216 -0
  12. glitchlings/dlc/pytorch_lightning.py +233 -0
  13. glitchlings/lexicon/__init__.py +12 -33
  14. glitchlings/lexicon/_cache.py +21 -22
  15. glitchlings/lexicon/data/default_vector_cache.json +80 -14
  16. glitchlings/lexicon/metrics.py +1 -8
  17. glitchlings/lexicon/vector.py +109 -49
  18. glitchlings/lexicon/wordnet.py +89 -49
  19. glitchlings/main.py +30 -24
  20. glitchlings/util/__init__.py +18 -4
  21. glitchlings/util/adapters.py +27 -0
  22. glitchlings/zoo/__init__.py +26 -15
  23. glitchlings/zoo/_ocr_confusions.py +1 -3
  24. glitchlings/zoo/_rate.py +1 -4
  25. glitchlings/zoo/_sampling.py +0 -1
  26. glitchlings/zoo/_text_utils.py +1 -5
  27. glitchlings/zoo/adjax.py +2 -4
  28. glitchlings/zoo/apostrofae.py +128 -0
  29. glitchlings/zoo/assets/__init__.py +0 -0
  30. glitchlings/zoo/assets/apostrofae_pairs.json +32 -0
  31. glitchlings/zoo/core.py +152 -87
  32. glitchlings/zoo/jargoyle.py +50 -45
  33. glitchlings/zoo/mim1c.py +11 -10
  34. glitchlings/zoo/redactyl.py +16 -16
  35. glitchlings/zoo/reduple.py +5 -3
  36. glitchlings/zoo/rushmore.py +4 -10
  37. glitchlings/zoo/scannequin.py +7 -6
  38. glitchlings/zoo/typogre.py +8 -9
  39. glitchlings/zoo/zeedub.py +6 -3
  40. {glitchlings-0.4.1.dist-info → glitchlings-0.4.3.dist-info}/METADATA +101 -4
  41. glitchlings-0.4.3.dist-info/RECORD +46 -0
  42. glitchlings/lexicon/graph.py +0 -290
  43. glitchlings-0.4.1.dist-info/RECORD +0 -39
  44. {glitchlings-0.4.1.dist-info → glitchlings-0.4.3.dist-info}/WHEEL +0 -0
  45. {glitchlings-0.4.1.dist-info → glitchlings-0.4.3.dist-info}/entry_points.txt +0 -0
  46. {glitchlings-0.4.1.dist-info → glitchlings-0.4.3.dist-info}/licenses/LICENSE +0 -0
  47. {glitchlings-0.4.1.dist-info → glitchlings-0.4.3.dist-info}/top_level.txt +0 -0
glitchlings/lexicon/graph.py
@@ -1,290 +0,0 @@
-"""Graph-based lexicon backed by ConceptNet/Numberbatch embeddings."""
-
-from __future__ import annotations
-
-import re
-from pathlib import Path
-from typing import Iterable, Mapping, MutableMapping, Sequence
-
-from . import LexiconBackend
-from ._cache import CacheSnapshot, load_cache as _load_cache_file, write_cache as _write_cache_file
-from .vector import VectorLexicon
-
-
-_CONCEPT_RE = re.compile(r"^/c/(?P<lang>[a-z]{2})/(?P<term>[^/]+)")
-_PUNCTUATION_RE = re.compile(r"[^\w\s-]+", re.UNICODE)
-
-
-def _lemmatize_token(token: str) -> str:
-    """Return a lightweight lemma for ``token`` using heuristic rules."""
-
-    irregular = {
-        "children": "child",
-        "mice": "mouse",
-        "geese": "goose",
-        "feet": "foot",
-        "teeth": "tooth",
-        "men": "man",
-        "women": "woman",
-        "better": "good",
-        "worse": "bad",
-    }
-    lowered = token.lower()
-    if lowered in irregular:
-        return irregular[lowered]
-
-    if lowered.endswith("ies") and len(lowered) > 3:
-        return lowered[:-3] + "y"
-    if lowered.endswith("ves") and len(lowered) > 3:
-        return lowered[:-3] + "f"
-    if lowered.endswith("men") and len(lowered) > 3:
-        return lowered[:-3] + "man"
-    if lowered.endswith("ses") and len(lowered) > 3:
-        return lowered[:-2]
-    if lowered.endswith("es") and len(lowered) > 3:
-        return lowered[:-2]
-    if lowered.endswith("s") and len(lowered) > 2 and not lowered.endswith("ss"):
-        return lowered[:-1]
-    if lowered.endswith("ing") and len(lowered) > 4:
-        stem = lowered[:-3]
-        if len(stem) > 2 and stem[-1] == stem[-2]:
-            stem = stem[:-1]
-        return stem
-    if lowered.endswith("ed") and len(lowered) > 3:
-        stem = lowered[:-2]
-        if len(stem) > 2 and stem[-1] == stem[-2]:
-            stem = stem[:-1]
-        return stem
-    return lowered
-
-
-def _normalize_phrase(phrase: str) -> str:
-    """Normalise ``phrase`` for ConceptNet lookups."""
-
-    stripped = _PUNCTUATION_RE.sub(" ", phrase.lower())
-    tokens = [token for token in stripped.split() if token]
-    if not tokens:
-        return ""
-    lemmatised = [_lemmatize_token(token) for token in tokens]
-    return " ".join(lemmatised)
-
-
-def _concept_terms(normalized: str) -> list[str]:
-    """Return ConceptNet term variants for ``normalized``."""
-
-    collapsed = normalized.replace(" ", "_")
-    if not collapsed:
-        return []
-    variants = {collapsed}
-    variants.add(collapsed.replace("_", "-"))
-    variants.add(collapsed.replace("-", "_"))
-    return list(variants)
-
-
-def _surface_from_concept(concept: str) -> str | None:
-    """Return a human-readable surface form for ``concept``."""
-
-    match = _CONCEPT_RE.match(concept)
-    if match is None:
-        return None
-    term = match.group("term")
-    surface = term.replace("_", " ")
-    surface = surface.replace("-", " ")
-    return " ".join(surface.split())
-
-
-def _language_from_concept(concept: str) -> str | None:
-    match = _CONCEPT_RE.match(concept)
-    if match is None:
-        return None
-    return match.group("lang")
-
-
-def _load_numberbatch(path: Path, *, languages: set[str]) -> Mapping[str, list[float]]:
-    """Load ConceptNet Numberbatch embeddings from ``path``."""
-
-    if not path.exists():
-        return {}
-
-    if path.suffix == ".gz":
-        import gzip
-
-        handle = gzip.open(path, "rt", encoding="utf8")
-    else:
-        handle = path.open("r", encoding="utf8")
-
-    with handle as stream:
-        header = stream.readline()
-        try:
-            parts = header.strip().split()
-            if len(parts) >= 2:
-                int(parts[0])
-                int(parts[1])
-        except ValueError:
-            stream.seek(0)
-
-        embeddings: dict[str, list[float]] = {}
-        for line in stream:
-            tokens = line.strip().split()
-            if len(tokens) <= 2:
-                continue
-            concept = tokens[0]
-            lang = _language_from_concept(concept)
-            if lang is None or lang not in languages:
-                continue
-            try:
-                vector = [float(value) for value in tokens[1:]]
-            except ValueError:
-                continue
-            embeddings[concept] = vector
-        return embeddings
-
-
-class GraphLexicon(LexiconBackend):
-    """Lexicon backed by ConceptNet/Numberbatch embeddings."""
-
-    def __init__(
-        self,
-        *,
-        source: Mapping[str, Sequence[float]] | str | Path | None = None,
-        cache: Mapping[str, Sequence[str]] | None = None,
-        cache_path: str | Path | None = None,
-        languages: Iterable[str] = ("en",),
-        max_neighbors: int = 50,
-        min_similarity: float = 0.0,
-        seed: int | None = None,
-    ) -> None:
-        super().__init__(seed=seed)
-        self._languages = {language.lower() for language in languages}
-        if not self._languages:
-            self._languages = {"en"}
-        self._max_neighbors = max(1, max_neighbors)
-        self._min_similarity = min_similarity
-        self._cache: MutableMapping[str, list[str]] = {}
-        self._cache_path: Path | None = Path(cache_path) if cache_path is not None else None
-        self._cache_checksum: str | None = None
-        if self._cache_path is not None:
-            snapshot = _load_cache_file(self._cache_path)
-            self._cache.update(snapshot.entries)
-            self._cache_checksum = snapshot.checksum
-        if cache is not None:
-            for key, values in cache.items():
-                self._cache[str(key)] = [str(value) for value in values]
-        self._cache_dirty = False
-
-        prepared_source = self._prepare_source(source)
-        self._backend = VectorLexicon(
-            source=prepared_source if prepared_source else None,
-            max_neighbors=self._max_neighbors,
-            min_similarity=self._min_similarity,
-            case_sensitive=True,
-            seed=seed,
-        )
-
-    def _prepare_source(
-        self, source: Mapping[str, Sequence[float]] | str | Path | None
-    ) -> Mapping[str, Sequence[float]]:
-        if source is None:
-            return {}
-        if isinstance(source, Mapping):
-            prepared: dict[str, list[float]] = {}
-            for key, vector in source.items():
-                lang = _language_from_concept(key)
-                if lang is None or lang not in self._languages:
-                    continue
-                prepared[key] = [float(value) for value in vector]
-            return prepared
-        path = Path(source)
-        embeddings = _load_numberbatch(path, languages=self._languages)
-        return embeddings
-
-    def reseed(self, seed: int | None) -> None:
-        super().reseed(seed)
-        self._backend.reseed(seed)
-
-    def _concept_candidates(self, normalized: str) -> list[str]:
-        terms = _concept_terms(normalized)
-        concepts = []
-        for language in sorted(self._languages):
-            for term in terms:
-                concepts.append(f"/c/{language}/{term}")
-        return concepts
-
-    def _collect_synonyms(self, normalized: str) -> list[str]:
-        candidates: list[str] = []
-        seen: set[str] = set()
-        for concept in self._concept_candidates(normalized):
-            neighbors = self._backend.precompute(concept, limit=self._max_neighbors)
-            for neighbor in neighbors:
-                lang = _language_from_concept(neighbor)
-                if lang is None or lang not in self._languages:
-                    continue
-                surface = _surface_from_concept(neighbor)
-                if surface is None:
-                    continue
-                surface_norm = _normalize_phrase(surface)
-                if not surface_norm or surface_norm == normalized:
-                    continue
-                if surface_norm in seen:
-                    continue
-                seen.add(surface_norm)
-                candidates.append(surface)
-        return candidates
-
-    def _ensure_cached(self, normalized: str) -> list[str]:
-        if normalized in self._cache:
-            return self._cache[normalized]
-        synonyms = self._collect_synonyms(normalized)
-        self._cache[normalized] = synonyms
-        if self._cache_path is not None:
-            self._cache_dirty = True
-        return synonyms
-
-    def get_synonyms(
-        self, word: str, pos: str | None = None, n: int = 5
-    ) -> list[str]:
-        normalized = _normalize_phrase(word)
-        if not normalized:
-            return []
-        synonyms = self._ensure_cached(normalized)
-        return self._deterministic_sample(synonyms, limit=n, word=word, pos=pos)
-
-    def precompute(self, word: str) -> list[str]:
-        normalized = _normalize_phrase(word)
-        if not normalized:
-            return []
-        return list(self._ensure_cached(normalized))
-
-    def export_cache(self) -> dict[str, list[str]]:
-        return {key: list(values) for key, values in self._cache.items()}
-
-    @classmethod
-    def load_cache(cls, path: str | Path) -> CacheSnapshot:
-        """Load and validate a persisted ConceptNet cache file."""
-
-        return _load_cache_file(Path(path))
-
-    def save_cache(self, path: str | Path | None = None) -> Path:
-        if path is None:
-            if self._cache_path is None:
-                raise RuntimeError("No cache path supplied to GraphLexicon.")
-            target = self._cache_path
-        else:
-            target = Path(path)
-            self._cache_path = target
-        snapshot = _write_cache_file(target, self._cache)
-        self._cache_checksum = snapshot.checksum
-        self._cache_dirty = False
-        return target
-
-    def supports_pos(self, pos: str | None) -> bool:
-        return True
-
-    def __repr__(self) -> str:  # pragma: no cover - debug helper
-        adapter = getattr(self._backend, "_adapter", None)
-        state = "loaded" if adapter else "empty"
-        return (
-            f"GraphLexicon(languages={sorted(self._languages)!r}, "
-            f"max_neighbors={self._max_neighbors}, seed={self.seed!r}, state={state})"
-        )
-
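Note on item 42 above: glitchlings/lexicon/graph.py is deleted outright in 0.4.3, taking the GraphLexicon backend with it, and the file list shows no drop-in replacement module. For readers auditing the removal, a minimal sketch of how this backend was driven under the 0.4.1 API shown in the hunk; the toy embedding mapping, the predicted output, and the cache filename are illustrative assumptions, not data shipped with the package:

    # Sketch against the removed 0.4.1 module glitchlings.lexicon.graph.
    # The vectors below are a toy stand-in for a real ConceptNet Numberbatch file;
    # `source` also accepted a path to a .txt or .txt.gz Numberbatch dump.
    from glitchlings.lexicon.graph import GraphLexicon

    toy_vectors = {
        "/c/en/dog": [1.0, 0.1, 0.0],
        "/c/en/puppy": [0.9, 0.2, 0.0],
        "/c/en/cat": [0.1, 1.0, 0.0],
    }

    lexicon = GraphLexicon(source=toy_vectors, max_neighbors=10, seed=42)
    # "dogs" is lemmatised to "dog", mapped to the concept "/c/en/dog", and
    # neighbouring concepts are surfaced back to plain words, so this prints
    # something like ["puppy", "cat"] depending on VectorLexicon's ranking.
    print(lexicon.get_synonyms("dogs", n=5))
    lexicon.save_cache("conceptnet_cache.json")  # hypothetical cache path

Code pinned to this backend will need an alternative after upgrading, e.g. the VectorLexicon it wrapped or the WordNet backend that remains in the package.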
glitchlings-0.4.1.dist-info/RECORD
@@ -1,39 +0,0 @@
-glitchlings/__init__.py,sha256=hEmQ1rl3G5uZBDbfJX_W4aIUNSsPAsy_Ai5DgQHasvk,813
-glitchlings/__main__.py,sha256=EOiBgay0x6B9VlSDzSQvMuoq6bHJdSvFSgcAVGGKkd4,121
-glitchlings/_zoo_rust.cpython-312-darwin.so,sha256=Bo96FBERNP5nurDkdpXPWDzSgvY2NGERbEtMmXHhCW0,2488416
-glitchlings/config.py,sha256=hwkcMkhEvUzK8FECgG6kbf_4MpMQcopskiSgXzK5B3o,7785
-glitchlings/config.toml,sha256=MWwgbx1-KIRAY3JZmMrCVbZNxFjHgRJXbtNAVuUNcxY,108
-glitchlings/main.py,sha256=Rw9pCgNrGxwzC1rZbbng7cHUP9xlL0WWWTdjW95XiSM,10084
-glitchlings/dlc/__init__.py,sha256=eTLEEWrVWPqniXHqee4W23H1rjElI1PQ_jcqWFe9D3g,141
-glitchlings/dlc/huggingface.py,sha256=I1QWanWVxO02awgSpHDtgQEVF-9AQRLtsta2RCitWhE,2933
-glitchlings/dlc/prime.py,sha256=wpRMNtgka1vNlEzifeCjGMp1q_-QclZn3NxXczGnNpM,9278
-glitchlings/lexicon/__init__.py,sha256=e3MbtV3R_UOoZXsckR3gnThwgqCi4HXnfduaqxqYXvw,6229
-glitchlings/lexicon/_cache.py,sha256=KlcHKtOFH1yPxwhr8_HF_qgpALmUuHkGTzNfWnQ2Jb8,3955
-glitchlings/lexicon/graph.py,sha256=YYLrYnmSZ8uf8VvrNLuVF_nIVDH7OoR3RuxJ-9JMA2c,10041
-glitchlings/lexicon/metrics.py,sha256=W8TCemZaCjBOUSX8G7JdgQAbMykXXfRTfodkDSkc3aQ,4599
-glitchlings/lexicon/vector.py,sha256=oeZQwYxrK25REu4MhUUlMmaStW17Gx6RwrU1v6NooOg,19713
-glitchlings/lexicon/wordnet.py,sha256=Zv0YNHSM-DE2ucVZl_OOutTV1s0-i2xPOrfqYYdZKTU,6034
-glitchlings/lexicon/data/default_vector_cache.json,sha256=7obKHqmR3odbTfgJPWLSRFYFh4J_6uvv_CntCSe_EjI,725
-glitchlings/util/__init__.py,sha256=7KiZ0gKMjocfd34cajneZhTqYb7Hkwi_PpjltPqvkNI,4498
-glitchlings/zoo/__init__.py,sha256=eFYmaWeFDlSqfaiED51HWM-OqiTo_BOz0ASeyhOwOsw,4818
-glitchlings/zoo/_ocr_confusions.py,sha256=MkCbwk9T24SO2pD3JNPajYCfpMMlm2vQ5_sJty5GoXE,1218
-glitchlings/zoo/_rate.py,sha256=TMyfVFV7pLxSGVswPlOAtBvk25Bjtx5xXTtpb_utgik,527
-glitchlings/zoo/_sampling.py,sha256=VOSWDgYWXIiAuKxn2IckFJhpRgGotQP_KW28db8kTKI,1587
-glitchlings/zoo/_text_utils.py,sha256=nAfFT_VdXMXciCR7eQ5EAmym5wvzL6_Sdn9dvCx2s3Q,2758
-glitchlings/zoo/adjax.py,sha256=N3CzfM7m7mAYgFcQYLQkqK2VYLw_vFvEMBM2aNU--ZA,3530
-glitchlings/zoo/core.py,sha256=YymiEc66V4mW_4MbTST2038D7YdZVyRkiUZn886IV4I,17203
-glitchlings/zoo/jargoyle.py,sha256=6-DJxUFz2AjT-iQDFlK2ZG9pVwq2boDtslEzCNyI_04,11481
-glitchlings/zoo/mim1c.py,sha256=yAt1ngR3j2KXLbzc8LhrQlIWRO_KT5dFK1EE8QivMAQ,3429
-glitchlings/zoo/ocr_confusions.tsv,sha256=KhtR7vJDTITpfTSGa-I7RHr6CK7LkGi2KjdhEWipI6o,183
-glitchlings/zoo/redactyl.py,sha256=H4PwAMBCIsDw1KBOBiTR3VUbRZwynqakwwfx3wHjVp8,5457
-glitchlings/zoo/reduple.py,sha256=Q9NRCdvUgaHvvJu8A0n6zW9v_L3pdmNZbWqaJ7uycw4,4216
-glitchlings/zoo/rushmore.py,sha256=J1wd4IB7WOAR2TdntkxCMZWseWR0Yii8UQZ7ucfpWCc,4335
-glitchlings/zoo/scannequin.py,sha256=Ps8nxysKjkJV408zaL1kjVjy4jliATDBpYcNHLWbNFg,4859
-glitchlings/zoo/typogre.py,sha256=0fYaxOEiTnxiCqmsiSN1r_wl1vC1Ueaiks2e94kks70,6668
-glitchlings/zoo/zeedub.py,sha256=l51swlo556-TXhDk4nayHOm1XgHwWmfUKzQ01YMuCpE,4801
-glitchlings-0.4.1.dist-info/licenses/LICENSE,sha256=YCvGip-LoaRyu6h0nPo71q6eHEkzUpsE11psDJOIRkw,11337
-glitchlings-0.4.1.dist-info/METADATA,sha256=9HdqQt7PazdHMtPP5JpINljl3kvL8HOqTFE3Wwyrm2g,28260
-glitchlings-0.4.1.dist-info/WHEEL,sha256=o0zAoJUNILGJZxEeFPjb7OMHp_94eqIkZBeZ0gvgOpo,114
-glitchlings-0.4.1.dist-info/entry_points.txt,sha256=kGOwuAsjFDLtztLisaXtOouq9wFVMOJg5FzaAkg-Hto,54
-glitchlings-0.4.1.dist-info/top_level.txt,sha256=VHFNBrLjtDwPCYXbGKi6o17Eueedi81eNbR3hBOoST0,12
-glitchlings-0.4.1.dist-info/RECORD,,
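The deleted rows above follow the standard wheel RECORD format: CSV triples of file path, "sha256=" plus an unpadded urlsafe-base64 digest, and size in bytes, with the RECORD file listing itself with empty hash and size fields. A minimal parsing sketch, using two rows copied from the list above:

    # Parse wheel RECORD rows: CSV of (path, "sha256=<digest>", size-in-bytes).
    import csv
    import io

    record_text = (
        "glitchlings/config.toml,sha256=MWwgbx1-KIRAY3JZmMrCVbZNxFjHgRJXbtNAVuUNcxY,108\n"
        "glitchlings-0.4.1.dist-info/RECORD,,\n"
    )
    for path, digest, size in csv.reader(io.StringIO(record_text)):
        # The self-referencing RECORD row has empty hash and size fields.
        print(path, digest.removeprefix("sha256=") or None, int(size) if size else None)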