glitchlings 0.4.5__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of glitchlings might be problematic. Click here for more details.
- glitchlings/__init__.py +71 -0
- glitchlings/__main__.py +8 -0
- glitchlings/_zoo_rust.cp311-win_amd64.pyd +0 -0
- glitchlings/compat.py +282 -0
- glitchlings/config.py +386 -0
- glitchlings/config.toml +3 -0
- glitchlings/data/__init__.py +1 -0
- glitchlings/data/hokey_assets.json +193 -0
- glitchlings/dlc/__init__.py +7 -0
- glitchlings/dlc/_shared.py +153 -0
- glitchlings/dlc/huggingface.py +81 -0
- glitchlings/dlc/prime.py +254 -0
- glitchlings/dlc/pytorch.py +166 -0
- glitchlings/dlc/pytorch_lightning.py +209 -0
- glitchlings/lexicon/__init__.py +192 -0
- glitchlings/lexicon/_cache.py +108 -0
- glitchlings/lexicon/data/default_vector_cache.json +82 -0
- glitchlings/lexicon/metrics.py +162 -0
- glitchlings/lexicon/vector.py +652 -0
- glitchlings/lexicon/wordnet.py +228 -0
- glitchlings/main.py +364 -0
- glitchlings/util/__init__.py +195 -0
- glitchlings/util/adapters.py +27 -0
- glitchlings/util/hokey_generator.py +144 -0
- glitchlings/util/stretch_locator.py +140 -0
- glitchlings/util/stretchability.py +375 -0
- glitchlings/zoo/__init__.py +172 -0
- glitchlings/zoo/_ocr_confusions.py +32 -0
- glitchlings/zoo/_rate.py +131 -0
- glitchlings/zoo/_rust_extensions.py +143 -0
- glitchlings/zoo/_sampling.py +54 -0
- glitchlings/zoo/_text_utils.py +100 -0
- glitchlings/zoo/adjax.py +128 -0
- glitchlings/zoo/apostrofae.py +127 -0
- glitchlings/zoo/assets/__init__.py +0 -0
- glitchlings/zoo/assets/apostrofae_pairs.json +32 -0
- glitchlings/zoo/core.py +582 -0
- glitchlings/zoo/hokey.py +173 -0
- glitchlings/zoo/jargoyle.py +335 -0
- glitchlings/zoo/mim1c.py +109 -0
- glitchlings/zoo/ocr_confusions.tsv +30 -0
- glitchlings/zoo/redactyl.py +193 -0
- glitchlings/zoo/reduple.py +148 -0
- glitchlings/zoo/rushmore.py +153 -0
- glitchlings/zoo/scannequin.py +171 -0
- glitchlings/zoo/typogre.py +231 -0
- glitchlings/zoo/zeedub.py +185 -0
- glitchlings-0.4.5.dist-info/METADATA +648 -0
- glitchlings-0.4.5.dist-info/RECORD +53 -0
- glitchlings-0.4.5.dist-info/WHEEL +5 -0
- glitchlings-0.4.5.dist-info/entry_points.txt +2 -0
- glitchlings-0.4.5.dist-info/licenses/LICENSE +201 -0
- glitchlings-0.4.5.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
"""Utility helpers for evaluating lexicon coverage and quality."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import math
|
|
6
|
+
from collections.abc import Iterable, Mapping, Sequence
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING: # pragma: no cover - typing hint only
|
|
10
|
+
from . import Lexicon
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _unique_synonyms(
|
|
14
|
+
lexicon: "Lexicon",
|
|
15
|
+
word: str,
|
|
16
|
+
*,
|
|
17
|
+
pos: str | None,
|
|
18
|
+
sample_size: int,
|
|
19
|
+
) -> list[str]:
|
|
20
|
+
"""Return unique synonym candidates excluding the original token."""
|
|
21
|
+
collected: list[str] = []
|
|
22
|
+
seen: set[str] = set()
|
|
23
|
+
source = word.lower()
|
|
24
|
+
for synonym in lexicon.get_synonyms(word, pos=pos, n=sample_size):
|
|
25
|
+
normalized = synonym.lower()
|
|
26
|
+
if normalized == source:
|
|
27
|
+
continue
|
|
28
|
+
if normalized in seen:
|
|
29
|
+
continue
|
|
30
|
+
seen.add(normalized)
|
|
31
|
+
collected.append(synonym)
|
|
32
|
+
return collected
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def synonym_diversity(
    lexicon: "Lexicon",
    words: Iterable[str],
    *,
    pos: str | None = None,
    sample_size: int = 5,
) -> float:
    """Average the number of unique synonyms ``lexicon`` yields per word.

    Args:
        lexicon: Synonym source handed to ``_unique_synonyms``.
        words: Tokens to evaluate; iterated exactly once.
        pos: Part-of-speech value forwarded to the lexicon lookup.
        sample_size: Number of candidates requested per word.

    Returns:
        The arithmetic mean of the per-word unique-synonym counts, or ``0.0``
        when ``words`` yields nothing.
    """
    per_word_counts = [
        len(_unique_synonyms(lexicon, word, pos=pos, sample_size=sample_size))
        for word in words
    ]
    return sum(per_word_counts) / len(per_word_counts) if per_word_counts else 0.0
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def coverage_ratio(
    lexicon: "Lexicon",
    words: Iterable[str],
    *,
    pos: str | None = None,
    sample_size: int = 5,
    min_synonyms: int = 3,
) -> float:
    """Measure what share of ``words`` has enough unique synonyms.

    Args:
        lexicon: Synonym source handed to ``_unique_synonyms``.
        words: Tokens to evaluate; iterated exactly once.
        pos: Part-of-speech value forwarded to the lexicon lookup.
        sample_size: Number of candidates requested per word.
        min_synonyms: A word counts as covered when it has at least this many
            unique synonyms.

    Returns:
        Covered words divided by all words seen, or ``0.0`` when ``words``
        yields nothing.
    """
    examined = 0
    covered = 0
    # ``enumerate(..., start=1)`` leaves ``examined`` holding the number of
    # words consumed once the loop finishes (it stays 0 for empty input).
    for examined, word in enumerate(words, start=1):
        candidates = _unique_synonyms(lexicon, word, pos=pos, sample_size=sample_size)
        if len(candidates) >= min_synonyms:
            covered += 1
    return covered / examined if examined else 0.0
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _cosine_similarity(vector_a: Sequence[float], vector_b: Sequence[float]) -> float:
|
|
74
|
+
dot = 0.0
|
|
75
|
+
norm_a = 0.0
|
|
76
|
+
norm_b = 0.0
|
|
77
|
+
for value_a, value_b in zip(vector_a, vector_b):
|
|
78
|
+
dot += value_a * value_b
|
|
79
|
+
norm_a += value_a * value_a
|
|
80
|
+
norm_b += value_b * value_b
|
|
81
|
+
magnitude = math.sqrt(norm_a) * math.sqrt(norm_b)
|
|
82
|
+
if magnitude == 0.0:
|
|
83
|
+
return 0.0
|
|
84
|
+
return dot / magnitude
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def mean_cosine_similarity(
    lexicon: "Lexicon",
    embeddings: Mapping[str, Sequence[float]],
    words: Iterable[str],
    *,
    pos: str | None = None,
    sample_size: int = 5,
) -> float:
    """Average the cosine similarity over every (word, synonym) pair.

    A pair contributes only when both the word and the synonym have an entry
    in ``embeddings``; lookups use the strings exactly as given.

    Args:
        lexicon: Synonym source handed to ``_unique_synonyms``.
        embeddings: Mapping from token to its vector.
        words: Tokens to evaluate; iterated exactly once.
        pos: Part-of-speech value forwarded to the lexicon lookup.
        sample_size: Number of candidates requested per word.

    Returns:
        The mean similarity across all scored pairs, or ``0.0`` when no pair
        could be scored.
    """
    similarity_sum = 0.0
    scored_pairs = 0
    for word in words:
        anchor = embeddings.get(word)
        if anchor is not None:
            # The lexicon is only consulted for words that have a vector.
            for candidate in _unique_synonyms(
                lexicon, word, pos=pos, sample_size=sample_size
            ):
                neighbour = embeddings.get(candidate)
                if neighbour is not None:
                    similarity_sum += _cosine_similarity(anchor, neighbour)
                    scored_pairs += 1
    return similarity_sum / scored_pairs if scored_pairs else 0.0
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def compare_lexicons(
    baseline: "Lexicon",
    candidate: "Lexicon",
    words: Iterable[str],
    *,
    pos: str | None = None,
    sample_size: int = 5,
    min_synonyms: int = 3,
    embeddings: Mapping[str, Sequence[float]] | None = None,
) -> dict[str, float]:
    """Return comparative coverage and diversity statistics for two lexicons.

    Args:
        baseline: Reference lexicon.
        candidate: Lexicon being compared against ``baseline``.
        words: Tokens to evaluate. Any iterable is accepted, including
            one-shot iterators and generators; it is consumed exactly once.
        pos: Part-of-speech value forwarded to every lexicon lookup.
        sample_size: Number of candidates requested per word.
        min_synonyms: Threshold used by the coverage metric.
        embeddings: Optional mapping from token to vector. When provided, the
            similarity metrics are added to the result.

    Returns:
        A dict with ``baseline_diversity``, ``candidate_diversity``,
        ``baseline_coverage`` and ``candidate_coverage``, plus
        ``baseline_similarity`` and ``candidate_similarity`` when
        ``embeddings`` is not ``None``.
    """
    # ``words`` is only promised to be an Iterable, yet every metric below
    # walks it from the beginning. Snapshot it once so a generator or other
    # one-shot iterator is not drained by the first metric, which would leave
    # all later metrics silently reporting 0.0.
    vocabulary = tuple(words)

    stats: dict[str, float] = {
        "baseline_diversity": synonym_diversity(
            baseline, vocabulary, pos=pos, sample_size=sample_size
        ),
        "candidate_diversity": synonym_diversity(
            candidate, vocabulary, pos=pos, sample_size=sample_size
        ),
        "baseline_coverage": coverage_ratio(
            baseline,
            vocabulary,
            pos=pos,
            sample_size=sample_size,
            min_synonyms=min_synonyms,
        ),
        "candidate_coverage": coverage_ratio(
            candidate,
            vocabulary,
            pos=pos,
            sample_size=sample_size,
            min_synonyms=min_synonyms,
        ),
    }

    if embeddings is not None:
        stats["baseline_similarity"] = mean_cosine_similarity(
            baseline, embeddings, vocabulary, pos=pos, sample_size=sample_size
        )
        stats["candidate_similarity"] = mean_cosine_similarity(
            candidate, embeddings, vocabulary, pos=pos, sample_size=sample_size
        )

    return stats
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
# Names exported by ``from ... import *``; the underscore-prefixed helpers in
# this module are not listed.
__all__ = [
    "compare_lexicons",
    "coverage_ratio",
    "mean_cosine_similarity",
    "synonym_diversity",
]
|