glitchlings-0.4.4-cp313-cp313-manylinux_2_28_x86_64.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.

Potentially problematic release: this version of glitchlings might be problematic.

Files changed (47)
  1. glitchlings/__init__.py +67 -0
  2. glitchlings/__main__.py +8 -0
  3. glitchlings/_zoo_rust.cpython-313-x86_64-linux-gnu.so +0 -0
  4. glitchlings/compat.py +284 -0
  5. glitchlings/config.py +388 -0
  6. glitchlings/config.toml +3 -0
  7. glitchlings/dlc/__init__.py +7 -0
  8. glitchlings/dlc/_shared.py +153 -0
  9. glitchlings/dlc/huggingface.py +81 -0
  10. glitchlings/dlc/prime.py +254 -0
  11. glitchlings/dlc/pytorch.py +166 -0
  12. glitchlings/dlc/pytorch_lightning.py +215 -0
  13. glitchlings/lexicon/__init__.py +192 -0
  14. glitchlings/lexicon/_cache.py +110 -0
  15. glitchlings/lexicon/data/default_vector_cache.json +82 -0
  16. glitchlings/lexicon/metrics.py +162 -0
  17. glitchlings/lexicon/vector.py +651 -0
  18. glitchlings/lexicon/wordnet.py +232 -0
  19. glitchlings/main.py +364 -0
  20. glitchlings/util/__init__.py +195 -0
  21. glitchlings/util/adapters.py +27 -0
  22. glitchlings/zoo/__init__.py +168 -0
  23. glitchlings/zoo/_ocr_confusions.py +32 -0
  24. glitchlings/zoo/_rate.py +131 -0
  25. glitchlings/zoo/_rust_extensions.py +143 -0
  26. glitchlings/zoo/_sampling.py +54 -0
  27. glitchlings/zoo/_text_utils.py +100 -0
  28. glitchlings/zoo/adjax.py +128 -0
  29. glitchlings/zoo/apostrofae.py +127 -0
  30. glitchlings/zoo/assets/__init__.py +0 -0
  31. glitchlings/zoo/assets/apostrofae_pairs.json +32 -0
  32. glitchlings/zoo/core.py +582 -0
  33. glitchlings/zoo/jargoyle.py +335 -0
  34. glitchlings/zoo/mim1c.py +109 -0
  35. glitchlings/zoo/ocr_confusions.tsv +30 -0
  36. glitchlings/zoo/redactyl.py +193 -0
  37. glitchlings/zoo/reduple.py +148 -0
  38. glitchlings/zoo/rushmore.py +153 -0
  39. glitchlings/zoo/scannequin.py +171 -0
  40. glitchlings/zoo/typogre.py +231 -0
  41. glitchlings/zoo/zeedub.py +185 -0
  42. glitchlings-0.4.4.dist-info/METADATA +627 -0
  43. glitchlings-0.4.4.dist-info/RECORD +47 -0
  44. glitchlings-0.4.4.dist-info/WHEEL +5 -0
  45. glitchlings-0.4.4.dist-info/entry_points.txt +2 -0
  46. glitchlings-0.4.4.dist-info/licenses/LICENSE +201 -0
  47. glitchlings-0.4.4.dist-info/top_level.txt +1 -0
glitchlings/lexicon/__init__.py
@@ -0,0 +1,192 @@
+ """Lexicon abstractions and default backend resolution helpers."""
+
+ from __future__ import annotations
+
+ import random
+ from abc import ABC, abstractmethod
+ from hashlib import blake2s
+ from pathlib import Path
+ from typing import Callable, Iterable
+
+ from glitchlings.config import get_config
+
+ from ._cache import CacheEntries, CacheSnapshot
+
+
+ class Lexicon(ABC):
+     """Abstract interface describing synonym lookup backends.
+
+     Parameters
+     ----------
+     seed:
+         Optional integer used to derive deterministic random number generators
+         for synonym sampling. Identical seeds guarantee reproducible results for
+         the same word/part-of-speech queries.
+
+     """
+
+     def __init__(self, *, seed: int | None = None) -> None:
+         self._seed = seed
+
+     @property
+     def seed(self) -> int | None:
+         """Return the current base seed used for deterministic sampling."""
+         return self._seed
+
+     def reseed(self, seed: int | None) -> None:
+         """Update the base seed driving deterministic synonym sampling."""
+         self._seed = seed
+
+     def _derive_rng(self, word: str, pos: str | None) -> random.Random:
+         """Return an RNG derived from the base seed, word, and POS tag."""
+         seed_material = blake2s(digest_size=8)
+         seed_material.update(word.lower().encode("utf8"))
+         if pos is not None:
+             seed_material.update(pos.lower().encode("utf8"))
+         seed_repr = "None" if self._seed is None else str(self._seed)
+         seed_material.update(seed_repr.encode("utf8"))
+         derived_seed = int.from_bytes(seed_material.digest(), "big", signed=False)
+         return random.Random(derived_seed)
+
+     def _deterministic_sample(
+         self, values: Iterable[str], *, limit: int, word: str, pos: str | None
+     ) -> list[str]:
+         """Return up to ``limit`` values sampled deterministically."""
+         if limit <= 0:
+             return []
+
+         items = list(values)
+         if len(items) <= limit:
+             return items
+
+         rng = self._derive_rng(word, pos)
+         indices = rng.sample(range(len(items)), k=limit)
+         indices.sort()
+         return [items[index] for index in indices]
+
+     @abstractmethod
+     def get_synonyms(self, word: str, pos: str | None = None, n: int = 5) -> list[str]:
+         """Return up to ``n`` synonyms for ``word`` constrained by ``pos``."""
+
+     def supports_pos(self, pos: str | None) -> bool:
+         """Return ``True`` when the backend can service ``pos`` queries."""
+         return True
+
+     def __repr__(self) -> str:  # pragma: no cover - trivial representation
+         return f"{self.__class__.__name__}(seed={self._seed!r})"
+
+
+ class LexiconBackend(Lexicon):
+     """Extended lexicon interface that supports cache persistence."""
+
+     Cache = CacheEntries
+
+     @classmethod
+     @abstractmethod
+     def load_cache(cls, path: str | Path) -> CacheSnapshot:
+         """Return a validated cache snapshot loaded from ``path``."""
+
+     @abstractmethod
+     def save_cache(self, path: str | Path | None = None) -> Path | None:
+         """Persist the backend cache to ``path`` and return the destination."""
+
+
+ from .metrics import (  # noqa: E402
+     compare_lexicons,
+     coverage_ratio,
+     mean_cosine_similarity,
+     synonym_diversity,
+ )
+ from .vector import VectorLexicon, build_vector_cache  # noqa: E402
+
+ _WordNetLexicon: type[LexiconBackend] | None
+ try:  # pragma: no cover - optional dependency
+     from .wordnet import WordNetLexicon as _WordNetLexicon
+ except (
+     ImportError,
+     ModuleNotFoundError,
+     AttributeError,
+ ):  # pragma: no cover - triggered when nltk unavailable
+     _WordNetLexicon = None
+
+ WordNetLexicon: type[LexiconBackend] | None = _WordNetLexicon
+
+
+ _BACKEND_FACTORIES: dict[str, Callable[[int | None], Lexicon | None]] = {}
+
+
+ def register_backend(name: str, factory: Callable[[int | None], Lexicon | None]) -> None:
+     """Register ``factory`` for ``name`` so it can be selected via config."""
+     normalized = name.lower()
+     _BACKEND_FACTORIES[normalized] = factory
+
+
+ def unregister_backend(name: str) -> None:
+     """Remove a previously registered backend."""
+     _BACKEND_FACTORIES.pop(name.lower(), None)
+
+
+ def available_backends() -> list[str]:
+     """Return the names of registered lexicon factories."""
+     return sorted(_BACKEND_FACTORIES)
+
+
+ def _vector_backend(seed: int | None) -> Lexicon | None:
+     config = get_config()
+     cache_path = config.lexicon.vector_cache
+     if cache_path is None:
+         return None
+     if not cache_path.exists():
+         return None
+     return VectorLexicon(cache_path=cache_path, seed=seed)
+
+
+ def _wordnet_backend(seed: int | None) -> Lexicon | None:  # pragma: no cover - optional
+     if WordNetLexicon is None:
+         return None
+     try:
+         lexicon = WordNetLexicon(seed=seed)
+     except RuntimeError:
+         return None
+     return lexicon
+
+
+ register_backend("vector", _vector_backend)
+ register_backend("wordnet", _wordnet_backend)
+
+
+ def get_default_lexicon(seed: int | None = None) -> Lexicon:
+     """Return the first available lexicon according to configuration priority."""
+     config = get_config()
+     attempts: list[str] = []
+     for name in config.lexicon.priority:
+         factory = _BACKEND_FACTORIES.get(name.lower())
+         if factory is None:
+             attempts.append(f"{name} (unknown)")
+             continue
+         lexicon = factory(seed)
+         if lexicon is not None:
+             return lexicon
+         attempts.append(f"{name} (unavailable)")
+     attempted = ", ".join(attempts) or "<none>"
+     raise RuntimeError(
+         "No lexicon backends available; configure lexicon.priority with at least one "
+         f"working backend. Attempts: {attempted}."
+     )
+
+
+ __all__ = [
+     "Lexicon",
+     "LexiconBackend",
+     "VectorLexicon",
+     "WordNetLexicon",
+     "build_vector_cache",
+     "compare_lexicons",
+     "coverage_ratio",
+     "mean_cosine_similarity",
+     "synonym_diversity",
+     "get_default_lexicon",
+     "register_backend",
+     "unregister_backend",
+     "available_backends",
+ ]
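
Usage sketch for the backend registry above, assuming the wheel is installed. `StaticLexicon`, its word table, and the "static" name are illustrative and not part of the package; a backend registered this way only becomes the default once its name appears in the `lexicon.priority` list consumed by `get_default_lexicon`.

    from glitchlings.lexicon import Lexicon, available_backends, register_backend


    class StaticLexicon(Lexicon):
        """Toy backend serving synonyms from a fixed in-memory table."""

        _TABLE = {"quick": ["swift", "rapid", "speedy", "nimble"]}

        def get_synonyms(self, word: str, pos: str | None = None, n: int = 5) -> list[str]:
            candidates = self._TABLE.get(word.lower(), [])
            # The base class derives an RNG from (seed, word, pos), so the same
            # seed always yields the same subset for a given query.
            return self._deterministic_sample(candidates, limit=n, word=word, pos=pos)


    # Factories receive the seed and may return None to signal "unavailable".
    register_backend("static", lambda seed: StaticLexicon(seed=seed))
    print(available_backends())                        # now includes "static"
    print(StaticLexicon(seed=42).get_synonyms("quick", n=3))
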
glitchlings/lexicon/_cache.py
@@ -0,0 +1,110 @@
+ """Shared cache helpers for lexicon backends."""
+
+ from __future__ import annotations
+
+ import json
+ from dataclasses import dataclass
+ from hashlib import blake2s
+ from pathlib import Path
+ from typing import Mapping, Sequence, cast
+
+ CacheEntries = dict[str, list[str]]
+
+
+ @dataclass(frozen=True)
+ class CacheSnapshot:
+     """Materialised cache data and its integrity checksum."""
+
+     entries: CacheEntries
+     checksum: str | None = None
+
+
+ def _normalise_entries(payload: Mapping[str, object]) -> CacheEntries:
+     """Convert raw cache payloads into canonical mapping form."""
+     entries: CacheEntries = {}
+     for key, values in payload.items():
+         if not isinstance(key, str):
+             raise RuntimeError("Synonym cache keys must be strings.")
+         if not isinstance(values, Sequence):
+             raise RuntimeError("Synonym cache values must be sequences of strings.")
+         entries[key] = [str(value) for value in values]
+     return entries
+
+
+ def _canonical_json(entries: Mapping[str, Sequence[str]]) -> str:
+     """Return a deterministic JSON serialisation for ``entries``."""
+     serialisable = {key: list(values) for key, values in sorted(entries.items())}
+     return json.dumps(serialisable, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
+
+
+ def compute_checksum(entries: Mapping[str, Sequence[str]]) -> str:
+     """Return a BLAKE2s checksum for ``entries``."""
+     digest = blake2s(_canonical_json(entries).encode("utf8"), digest_size=16)
+     return digest.hexdigest()
+
+
+ def load_cache(path: Path) -> CacheSnapshot:
+     """Load a cache from ``path`` and verify its checksum if present."""
+     if not path.exists():
+         return CacheSnapshot(entries={}, checksum=None)
+
+     with path.open("r", encoding="utf8") as handle:
+         payload_obj = json.load(handle)
+
+     checksum: str | None = None
+     entries_payload: Mapping[str, object]
+
+     if not isinstance(payload_obj, Mapping):
+         raise RuntimeError("Synonym cache payload must be a mapping of strings to lists.")
+
+     payload = cast(Mapping[str, object], payload_obj)
+
+     if "__meta__" in payload and "entries" in payload:
+         meta_obj = payload["__meta__"]
+         entries_obj = payload["entries"]
+         if not isinstance(entries_obj, Mapping):
+             raise RuntimeError("Synonym cache entries must be stored as a mapping.")
+         entries_payload = cast(Mapping[str, object], entries_obj)
+         if isinstance(meta_obj, Mapping):
+             raw_checksum = meta_obj.get("checksum")
+             if raw_checksum is not None and not isinstance(raw_checksum, str):
+                 raise RuntimeError("Synonym cache checksum must be a string when provided.")
+             checksum = raw_checksum if isinstance(raw_checksum, str) else None
+         else:
+             raise RuntimeError("Synonym cache metadata must be a mapping.")
+     else:
+         entries_payload = payload  # legacy format without metadata
+
+     entries = _normalise_entries(entries_payload)
+     if checksum is not None:
+         expected = compute_checksum(entries)
+         if checksum != expected:
+             raise RuntimeError(
+                 "Synonym cache checksum mismatch; the cache file appears to be corrupted."
+             )
+
+     return CacheSnapshot(entries=entries, checksum=checksum)
+
+
+ def write_cache(path: Path, entries: Mapping[str, Sequence[str]]) -> CacheSnapshot:
+     """Persist ``entries`` to ``path`` with checksum metadata."""
+     serialisable: CacheEntries = {
+         key: list(values) for key, values in sorted(entries.items())
+     }
+     checksum = compute_checksum(serialisable)
+     payload = {
+         "__meta__": {
+             "checksum": checksum,
+             "entries": len(serialisable),
+         },
+         "entries": serialisable,
+     }
+     path.parent.mkdir(parents=True, exist_ok=True)
+
+     with path.open("w", encoding="utf8") as handle:
+         json.dump(payload, handle, ensure_ascii=False, indent=2, sort_keys=True)
+
+     return CacheSnapshot(entries=serialisable, checksum=checksum)
+
+
+ __all__ = ["CacheEntries", "CacheSnapshot", "compute_checksum", "load_cache", "write_cache"]
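
A small round-trip sketch for the helpers above; the cache path and entries are made up for illustration. `write_cache` stores the checksum under `__meta__`, `load_cache` re-verifies it, and a flat word-to-list mapping (the legacy layout) loads with `checksum=None`.

    from pathlib import Path

    from glitchlings.lexicon._cache import compute_checksum, load_cache, write_cache

    cache_path = Path("synonyms.json")  # any writable location works

    entries = {"quick": ["swift", "rapid"], "slow": ["sluggish", "leisurely"]}
    snapshot = write_cache(cache_path, entries)
    assert snapshot.checksum == compute_checksum(snapshot.entries)

    restored = load_cache(cache_path)   # verifies the checksum stored under "__meta__"
    assert restored.entries == snapshot.entries
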
glitchlings/lexicon/data/default_vector_cache.json
@@ -0,0 +1,82 @@
+ {
+   "alpha": [
+     "beta",
+     "gamma",
+     "delta"
+   ],
+   "beta": [
+     "alpha",
+     "gamma",
+     "delta"
+   ],
+   "delta": [
+     "alpha",
+     "beta",
+     "gamma"
+   ],
+   "fast": [
+     "rapid",
+     "swift",
+     "speedy",
+     "brisk"
+   ],
+   "gamma": [
+     "alpha",
+     "beta",
+     "delta"
+   ],
+   "happy": [
+     "glad",
+     "joyful",
+     "content",
+     "upbeat"
+   ],
+   "quick": [
+     "swift",
+     "rapid",
+     "speedy",
+     "nimble"
+   ],
+   "quickly": [
+     "swiftly",
+     "rapidly",
+     "promptly",
+     "speedily"
+   ],
+   "sing": [
+     "croon",
+     "serenade",
+     "vocalize",
+     "perform"
+   ],
+   "slow": [
+     "sluggish",
+     "leisurely",
+     "unhurried",
+     "gradual"
+   ],
+   "songs": [
+     "tracks",
+     "melodies",
+     "ballads",
+     "tunes"
+   ],
+   "text": [
+     "passage",
+     "copy",
+     "script",
+     "narrative"
+   ],
+   "they": [
+     "those people",
+     "those individuals",
+     "the group",
+     "those folks"
+   ],
+   "words": [
+     "terms",
+     "phrases",
+     "lexicon",
+     "vocabulary"
+   ]
+ }
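
The bundled default cache above uses the flat legacy layout (word to synonym list, with no `__meta__` block), so `load_cache` takes the legacy branch and returns a snapshot without a checksum. A sketch, assuming the packaged data file is resolved through `importlib.resources`:

    from importlib import resources

    from glitchlings.lexicon._cache import load_cache

    data_file = resources.files("glitchlings.lexicon") / "data" / "default_vector_cache.json"
    with resources.as_file(data_file) as path:
        snapshot = load_cache(path)

    print(snapshot.checksum)          # None: the legacy format carries no checksum
    print(snapshot.entries["quick"])  # ['swift', 'rapid', 'speedy', 'nimble']
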
glitchlings/lexicon/metrics.py
@@ -0,0 +1,162 @@
+ """Utility helpers for evaluating lexicon coverage and quality."""
+
+ from __future__ import annotations
+
+ import math
+ from collections.abc import Iterable, Mapping, Sequence
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:  # pragma: no cover - typing hint only
+     from . import Lexicon
+
+
+ def _unique_synonyms(
+     lexicon: "Lexicon",
+     word: str,
+     *,
+     pos: str | None,
+     sample_size: int,
+ ) -> list[str]:
+     """Return unique synonym candidates excluding the original token."""
+     collected: list[str] = []
+     seen: set[str] = set()
+     source = word.lower()
+     for synonym in lexicon.get_synonyms(word, pos=pos, n=sample_size):
+         normalized = synonym.lower()
+         if normalized == source:
+             continue
+         if normalized in seen:
+             continue
+         seen.add(normalized)
+         collected.append(synonym)
+     return collected
+
+
+ def synonym_diversity(
+     lexicon: "Lexicon",
+     words: Iterable[str],
+     *,
+     pos: str | None = None,
+     sample_size: int = 5,
+ ) -> float:
+     """Return the mean unique-synonym count for ``words`` using ``lexicon``."""
+     totals = []
+     for word in words:
+         synonyms = _unique_synonyms(lexicon, word, pos=pos, sample_size=sample_size)
+         totals.append(len(synonyms))
+     if not totals:
+         return 0.0
+     return sum(totals) / len(totals)
+
+
+ def coverage_ratio(
+     lexicon: "Lexicon",
+     words: Iterable[str],
+     *,
+     pos: str | None = None,
+     sample_size: int = 5,
+     min_synonyms: int = 3,
+ ) -> float:
+     """Return the fraction of ``words`` with at least ``min_synonyms`` candidates."""
+     total = 0
+     hits = 0
+     for word in words:
+         total += 1
+         synonyms = _unique_synonyms(lexicon, word, pos=pos, sample_size=sample_size)
+         if len(synonyms) >= min_synonyms:
+             hits += 1
+     if total == 0:
+         return 0.0
+     return hits / total
+
+
+ def _cosine_similarity(vector_a: Sequence[float], vector_b: Sequence[float]) -> float:
+     dot = 0.0
+     norm_a = 0.0
+     norm_b = 0.0
+     for value_a, value_b in zip(vector_a, vector_b):
+         dot += value_a * value_b
+         norm_a += value_a * value_a
+         norm_b += value_b * value_b
+     magnitude = math.sqrt(norm_a) * math.sqrt(norm_b)
+     if magnitude == 0.0:
+         return 0.0
+     return dot / magnitude
+
+
+ def mean_cosine_similarity(
+     lexicon: "Lexicon",
+     embeddings: Mapping[str, Sequence[float]],
+     words: Iterable[str],
+     *,
+     pos: str | None = None,
+     sample_size: int = 5,
+ ) -> float:
+     """Return the mean cosine similarity between each word and its candidates."""
+     total = 0.0
+     count = 0
+     for word in words:
+         source_vector = embeddings.get(word)
+         if source_vector is None:
+             continue
+         synonyms = _unique_synonyms(lexicon, word, pos=pos, sample_size=sample_size)
+         for synonym in synonyms:
+             synonym_vector = embeddings.get(synonym)
+             if synonym_vector is None:
+                 continue
+             total += _cosine_similarity(source_vector, synonym_vector)
+             count += 1
+     if count == 0:
+         return 0.0
+     return total / count
+
+
+ def compare_lexicons(
+     baseline: "Lexicon",
+     candidate: "Lexicon",
+     words: Iterable[str],
+     *,
+     pos: str | None = None,
+     sample_size: int = 5,
+     min_synonyms: int = 3,
+     embeddings: Mapping[str, Sequence[float]] | None = None,
+ ) -> dict[str, float]:
+     """Return comparative coverage and diversity statistics for two lexicons."""
+     stats = {
+         "baseline_diversity": synonym_diversity(baseline, words, pos=pos, sample_size=sample_size),
+         "candidate_diversity": synonym_diversity(
+             candidate, words, pos=pos, sample_size=sample_size
+         ),
+         "baseline_coverage": coverage_ratio(
+             baseline,
+             words,
+             pos=pos,
+             sample_size=sample_size,
+             min_synonyms=min_synonyms,
+         ),
+         "candidate_coverage": coverage_ratio(
+             candidate,
+             words,
+             pos=pos,
+             sample_size=sample_size,
+             min_synonyms=min_synonyms,
+         ),
+     }
+
+     if embeddings is not None:
+         stats["baseline_similarity"] = mean_cosine_similarity(
+             baseline, embeddings, words, pos=pos, sample_size=sample_size
+         )
+         stats["candidate_similarity"] = mean_cosine_similarity(
+             candidate, embeddings, words, pos=pos, sample_size=sample_size
+         )
+
+     return stats
+
+
+ __all__ = [
+     "compare_lexicons",
+     "coverage_ratio",
+     "mean_cosine_similarity",
+     "synonym_diversity",
+ ]
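
An evaluation sketch for the metrics above, assuming at least one lexicon backend resolves (for example the bundled vector cache); the probe words and toy embeddings are illustrative. Note that `compare_lexicons` iterates `words` several times, so pass a list rather than a generator.

    from glitchlings.lexicon import get_default_lexicon
    from glitchlings.lexicon.metrics import compare_lexicons, coverage_ratio

    words = ["quick", "happy", "slow", "zebra"]
    baseline = get_default_lexicon(seed=0)
    candidate = get_default_lexicon(seed=0)  # in practice, a different backend

    # Fraction of probe words with at least three distinct synonym candidates.
    print(coverage_ratio(baseline, words, min_synonyms=3))

    # Toy 2-dimensional embeddings; words missing from the mapping are skipped.
    embeddings = {"quick": [0.9, 0.1], "swift": [0.85, 0.2], "rapid": [0.8, 0.15]}
    stats = compare_lexicons(baseline, candidate, words, embeddings=embeddings)
    print(stats["baseline_coverage"], stats["baseline_similarity"])
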