glitchlings 0.4.2__cp312-cp312-win_amd64.whl → 0.4.3__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of glitchlings might be problematic. Click here for more details.
- glitchlings/__init__.py +4 -0
- glitchlings/_zoo_rust.cp312-win_amd64.pyd +0 -0
- glitchlings/compat.py +80 -11
- glitchlings/config.py +32 -19
- glitchlings/config.toml +1 -1
- glitchlings/dlc/__init__.py +3 -1
- glitchlings/dlc/pytorch.py +216 -0
- glitchlings/dlc/pytorch_lightning.py +233 -0
- glitchlings/lexicon/__init__.py +5 -15
- glitchlings/lexicon/_cache.py +21 -15
- glitchlings/lexicon/data/default_vector_cache.json +80 -14
- glitchlings/lexicon/vector.py +94 -15
- glitchlings/lexicon/wordnet.py +66 -25
- glitchlings/main.py +21 -11
- glitchlings/zoo/__init__.py +5 -1
- glitchlings/zoo/adjax.py +2 -2
- glitchlings/zoo/apostrofae.py +128 -0
- glitchlings/zoo/assets/__init__.py +0 -0
- glitchlings/zoo/assets/apostrofae_pairs.json +32 -0
- glitchlings/zoo/core.py +40 -14
- glitchlings/zoo/jargoyle.py +44 -34
- glitchlings/zoo/redactyl.py +11 -8
- glitchlings/zoo/reduple.py +2 -2
- glitchlings/zoo/rushmore.py +2 -2
- glitchlings/zoo/scannequin.py +2 -2
- glitchlings/zoo/typogre.py +5 -2
- glitchlings/zoo/zeedub.py +5 -2
- {glitchlings-0.4.2.dist-info → glitchlings-0.4.3.dist-info}/METADATA +35 -2
- glitchlings-0.4.3.dist-info/RECORD +46 -0
- glitchlings/lexicon/graph.py +0 -282
- glitchlings-0.4.2.dist-info/RECORD +0 -42
- {glitchlings-0.4.2.dist-info → glitchlings-0.4.3.dist-info}/WHEEL +0 -0
- {glitchlings-0.4.2.dist-info → glitchlings-0.4.3.dist-info}/entry_points.txt +0 -0
- {glitchlings-0.4.2.dist-info → glitchlings-0.4.3.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.4.2.dist-info → glitchlings-0.4.3.dist-info}/top_level.txt +0 -0
glitchlings/zoo/jargoyle.py
CHANGED
|
@@ -2,20 +2,25 @@ import random
|
|
|
2
2
|
import re
|
|
3
3
|
from collections.abc import Iterable
|
|
4
4
|
from dataclasses import dataclass
|
|
5
|
+
from types import ModuleType
|
|
5
6
|
from typing import Any, Literal, cast
|
|
6
7
|
|
|
7
8
|
from glitchlings.lexicon import Lexicon, get_default_lexicon
|
|
8
9
|
|
|
10
|
+
from ._rate import resolve_rate
|
|
11
|
+
from .core import AttackWave, Glitchling
|
|
12
|
+
|
|
13
|
+
_wordnet_module: ModuleType | None
|
|
14
|
+
|
|
9
15
|
try: # pragma: no cover - optional WordNet dependency
|
|
10
|
-
|
|
11
|
-
WordNetLexicon,
|
|
12
|
-
)
|
|
13
|
-
from glitchlings.lexicon.wordnet import (
|
|
14
|
-
dependencies_available as _lexicon_dependencies_available,
|
|
15
|
-
)
|
|
16
|
-
from glitchlings.lexicon.wordnet import ensure_wordnet as _lexicon_ensure_wordnet
|
|
16
|
+
import glitchlings.lexicon.wordnet as _wordnet_module
|
|
17
17
|
except Exception: # pragma: no cover - triggered when nltk unavailable
|
|
18
|
-
|
|
18
|
+
_wordnet_module = None
|
|
19
|
+
|
|
20
|
+
_wordnet_runtime: ModuleType | None = _wordnet_module
|
|
21
|
+
|
|
22
|
+
WordNetLexicon: type[Lexicon] | None
|
|
23
|
+
if _wordnet_runtime is None:
|
|
19
24
|
|
|
20
25
|
def _lexicon_dependencies_available() -> bool:
|
|
21
26
|
return False
|
|
@@ -26,9 +31,12 @@ except Exception: # pragma: no cover - triggered when nltk unavailable
|
|
|
26
31
|
"and download its WordNet corpus manually if you need legacy synonyms."
|
|
27
32
|
)
|
|
28
33
|
|
|
34
|
+
WordNetLexicon = None
|
|
35
|
+
else:
|
|
36
|
+
WordNetLexicon = cast(type[Lexicon], _wordnet_runtime.WordNetLexicon)
|
|
37
|
+
_lexicon_dependencies_available = _wordnet_runtime.dependencies_available
|
|
38
|
+
_lexicon_ensure_wordnet = _wordnet_runtime.ensure_wordnet
|
|
29
39
|
|
|
30
|
-
from ._rate import resolve_rate
|
|
31
|
-
from .core import AttackWave, Glitchling
|
|
32
40
|
|
|
33
41
|
ensure_wordnet = _lexicon_ensure_wordnet
|
|
34
42
|
|
|
@@ -169,34 +177,36 @@ def substitute_random_synonyms(
|
|
|
169
177
|
candidate_indices: list[int] = []
|
|
170
178
|
candidate_metadata: dict[int, CandidateInfo] = {}
|
|
171
179
|
for idx, tok in enumerate(tokens):
|
|
172
|
-
if idx % 2
|
|
173
|
-
|
|
174
|
-
if not core_word:
|
|
175
|
-
continue
|
|
176
|
-
|
|
177
|
-
chosen_pos: str | None = None
|
|
178
|
-
synonyms: list[str] = []
|
|
180
|
+
if idx % 2 != 0 or not tok or tok.isspace():
|
|
181
|
+
continue
|
|
179
182
|
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
synonyms = active_lexicon.get_synonyms(core_word, pos=pos)
|
|
184
|
-
if synonyms:
|
|
185
|
-
chosen_pos = pos
|
|
186
|
-
break
|
|
183
|
+
prefix, core_word, suffix = _split_token(tok)
|
|
184
|
+
if not core_word:
|
|
185
|
+
continue
|
|
187
186
|
|
|
188
|
-
|
|
189
|
-
|
|
187
|
+
chosen_pos: str | None = None
|
|
188
|
+
synonyms: list[str] = []
|
|
190
189
|
|
|
190
|
+
for tag in target_pos:
|
|
191
|
+
if not active_lexicon.supports_pos(tag):
|
|
192
|
+
continue
|
|
193
|
+
synonyms = active_lexicon.get_synonyms(core_word, pos=tag)
|
|
191
194
|
if synonyms:
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
195
|
+
chosen_pos = tag
|
|
196
|
+
break
|
|
197
|
+
|
|
198
|
+
if not synonyms and active_lexicon.supports_pos(None):
|
|
199
|
+
synonyms = active_lexicon.get_synonyms(core_word, pos=None)
|
|
200
|
+
|
|
201
|
+
if synonyms:
|
|
202
|
+
candidate_indices.append(idx)
|
|
203
|
+
candidate_metadata[idx] = CandidateInfo(
|
|
204
|
+
prefix=prefix,
|
|
205
|
+
core_word=core_word,
|
|
206
|
+
suffix=suffix,
|
|
207
|
+
part_of_speech=chosen_pos,
|
|
208
|
+
synonyms=synonyms,
|
|
209
|
+
)
|
|
200
210
|
|
|
201
211
|
if not candidate_indices:
|
|
202
212
|
return text
|
glitchlings/zoo/redactyl.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import random
|
|
2
2
|
import re
|
|
3
|
-
from typing import Any
|
|
3
|
+
from typing import Any, cast
|
|
4
4
|
|
|
5
5
|
from ._rate import resolve_rate
|
|
6
6
|
from ._sampling import weighted_sample_without_replacement
|
|
@@ -119,13 +119,16 @@ def redact_words(
|
|
|
119
119
|
use_rust = _redact_words_rust is not None and isinstance(merge_adjacent, bool)
|
|
120
120
|
|
|
121
121
|
if use_rust:
|
|
122
|
-
return _redact_words_rust(
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
122
|
+
return cast(
|
|
123
|
+
str,
|
|
124
|
+
_redact_words_rust(
|
|
125
|
+
text,
|
|
126
|
+
replacement_char,
|
|
127
|
+
clamped_rate,
|
|
128
|
+
merge_adjacent,
|
|
129
|
+
unweighted_flag,
|
|
130
|
+
rng,
|
|
131
|
+
),
|
|
129
132
|
)
|
|
130
133
|
|
|
131
134
|
return _python_redact_words(
|
glitchlings/zoo/reduple.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import random
|
|
2
|
-
from typing import Any
|
|
2
|
+
from typing import Any, cast
|
|
3
3
|
|
|
4
4
|
from ._rate import resolve_rate
|
|
5
5
|
from ._text_utils import WordToken, collect_word_tokens, split_preserving_whitespace
|
|
@@ -94,7 +94,7 @@ def reduplicate_words(
|
|
|
94
94
|
unweighted_flag = bool(unweighted)
|
|
95
95
|
|
|
96
96
|
if _reduplicate_words_rust is not None:
|
|
97
|
-
return _reduplicate_words_rust(text, clamped_rate, unweighted_flag, rng)
|
|
97
|
+
return cast(str, _reduplicate_words_rust(text, clamped_rate, unweighted_flag, rng))
|
|
98
98
|
|
|
99
99
|
return _python_reduplicate_words(
|
|
100
100
|
text,
|
glitchlings/zoo/rushmore.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import math
|
|
2
2
|
import random
|
|
3
3
|
import re
|
|
4
|
-
from typing import Any
|
|
4
|
+
from typing import Any, cast
|
|
5
5
|
|
|
6
6
|
from ._rate import resolve_rate
|
|
7
7
|
from ._text_utils import WordToken, collect_word_tokens, split_preserving_whitespace
|
|
@@ -97,7 +97,7 @@ def delete_random_words(
|
|
|
97
97
|
unweighted_flag = bool(unweighted)
|
|
98
98
|
|
|
99
99
|
if _delete_random_words_rust is not None:
|
|
100
|
-
return _delete_random_words_rust(text, clamped_rate, unweighted_flag, rng)
|
|
100
|
+
return cast(str, _delete_random_words_rust(text, clamped_rate, unweighted_flag, rng))
|
|
101
101
|
|
|
102
102
|
return _python_delete_random_words(
|
|
103
103
|
text,
|
glitchlings/zoo/scannequin.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import random
|
|
2
2
|
import re
|
|
3
|
-
from typing import Any
|
|
3
|
+
from typing import Any, cast
|
|
4
4
|
|
|
5
5
|
from ._ocr_confusions import load_confusion_table
|
|
6
6
|
from ._rate import resolve_rate
|
|
@@ -126,7 +126,7 @@ def ocr_artifacts(
|
|
|
126
126
|
clamped_rate = max(0.0, effective_rate)
|
|
127
127
|
|
|
128
128
|
if _ocr_artifacts_rust is not None:
|
|
129
|
-
return _ocr_artifacts_rust(text, clamped_rate, rng)
|
|
129
|
+
return cast(str, _ocr_artifacts_rust(text, clamped_rate, rng))
|
|
130
130
|
|
|
131
131
|
return _python_ocr_artifacts(text, rate=clamped_rate, rng=rng)
|
|
132
132
|
|
glitchlings/zoo/typogre.py
CHANGED
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import math
|
|
4
4
|
import random
|
|
5
|
-
from typing import Any, Optional
|
|
5
|
+
from typing import Any, Optional, cast
|
|
6
6
|
|
|
7
7
|
from ..util import KEYNEIGHBORS
|
|
8
8
|
from ._rate import resolve_rate
|
|
@@ -168,7 +168,10 @@ def fatfinger(
|
|
|
168
168
|
layout = getattr(KEYNEIGHBORS, keyboard)
|
|
169
169
|
|
|
170
170
|
if _fatfinger_rust is not None:
|
|
171
|
-
return _fatfinger_rust(text, max_change_rate=clamped_rate, layout=layout, rng=rng)
|
|
171
|
+
return cast(
|
|
172
|
+
str,
|
|
173
|
+
_fatfinger_rust(text, max_change_rate=clamped_rate, layout=layout, rng=rng),
|
|
174
|
+
)
|
|
172
175
|
|
|
173
176
|
return _fatfinger_python(text, rate=clamped_rate, layout=layout, rng=rng)
|
|
174
177
|
|
glitchlings/zoo/zeedub.py
CHANGED
|
@@ -3,7 +3,7 @@ from __future__ import annotations
|
|
|
3
3
|
import math
|
|
4
4
|
import random
|
|
5
5
|
from collections.abc import Sequence
|
|
6
|
-
from typing import Any
|
|
6
|
+
from typing import Any, cast
|
|
7
7
|
|
|
8
8
|
from ._rate import resolve_rate
|
|
9
9
|
from .core import AttackOrder, AttackWave, Glitchling
|
|
@@ -115,7 +115,10 @@ def insert_zero_widths(
|
|
|
115
115
|
if hasattr(rng, "getstate"):
|
|
116
116
|
python_state = rng.getstate()
|
|
117
117
|
rng.setstate(state)
|
|
118
|
-
rust_result = _inject_zero_widths_rust(text, clamped_rate, list(cleaned_palette), rng)
|
|
118
|
+
rust_result = cast(
|
|
119
|
+
str,
|
|
120
|
+
_inject_zero_widths_rust(text, clamped_rate, list(cleaned_palette), rng),
|
|
121
|
+
)
|
|
119
122
|
if rust_result == python_result:
|
|
120
123
|
return rust_result
|
|
121
124
|
if python_state is not None and hasattr(rng, "setstate"):
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: glitchlings
|
|
3
|
-
Version: 0.4.2
|
|
3
|
+
Version: 0.4.3
|
|
4
4
|
Summary: Monsters for your language games.
|
|
5
5
|
Author: osoleve
|
|
6
6
|
License: Apache License
|
|
@@ -226,15 +226,37 @@ License-File: LICENSE
|
|
|
226
226
|
Requires-Dist: confusable-homoglyphs>=3.3.1
|
|
227
227
|
Requires-Dist: tomli>=2.0.1; python_version < "3.11"
|
|
228
228
|
Requires-Dist: pyyaml>=6.0.0
|
|
229
|
+
Provides-Extra: all
|
|
230
|
+
Requires-Dist: black>=24.4.0; extra == "all"
|
|
231
|
+
Requires-Dist: hypothesis>=6.140.0; extra == "all"
|
|
232
|
+
Requires-Dist: interrogate>=1.5.0; extra == "all"
|
|
233
|
+
Requires-Dist: jellyfish>=1.2.0; extra == "all"
|
|
234
|
+
Requires-Dist: isort>=5.13.0; extra == "all"
|
|
235
|
+
Requires-Dist: mkdocs>=1.6.0; extra == "all"
|
|
236
|
+
Requires-Dist: mkdocs-material>=9.5.0; extra == "all"
|
|
237
|
+
Requires-Dist: mkdocstrings[python]>=0.24.0; extra == "all"
|
|
238
|
+
Requires-Dist: mkdocstrings-python>=1.10.0; extra == "all"
|
|
239
|
+
Requires-Dist: mypy>=1.8.0; extra == "all"
|
|
240
|
+
Requires-Dist: numpy<=2.0,>=1.24; extra == "all"
|
|
241
|
+
Requires-Dist: pre-commit>=3.8.0; extra == "all"
|
|
242
|
+
Requires-Dist: pytest>=8.0.0; extra == "all"
|
|
243
|
+
Requires-Dist: ruff>=0.6.0; extra == "all"
|
|
244
|
+
Requires-Dist: verifiers>=0.1.3.post0; extra == "all"
|
|
229
245
|
Provides-Extra: hf
|
|
230
246
|
Requires-Dist: datasets>=4.0.0; extra == "hf"
|
|
247
|
+
Provides-Extra: lightning
|
|
248
|
+
Requires-Dist: pytorch_lightning>=2.0.0; extra == "lightning"
|
|
231
249
|
Provides-Extra: vectors
|
|
232
250
|
Requires-Dist: numpy<=2.0,>=1.24; extra == "vectors"
|
|
233
251
|
Requires-Dist: spacy>=3.7.2; extra == "vectors"
|
|
234
252
|
Requires-Dist: gensim>=4.3.2; extra == "vectors"
|
|
253
|
+
Provides-Extra: st
|
|
254
|
+
Requires-Dist: sentence-transformers>=3.0.0; extra == "st"
|
|
235
255
|
Provides-Extra: prime
|
|
236
256
|
Requires-Dist: verifiers>=0.1.3.post0; extra == "prime"
|
|
237
257
|
Requires-Dist: jellyfish>=1.2.0; extra == "prime"
|
|
258
|
+
Provides-Extra: torch
|
|
259
|
+
Requires-Dist: torch>=2.0.0; extra == "torch"
|
|
238
260
|
Provides-Extra: dev
|
|
239
261
|
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
240
262
|
Requires-Dist: hypothesis>=6.140.0; extra == "dev"
|
|
@@ -307,7 +329,7 @@ print(gaggle(SAMPLE_TEXT))
|
|
|
307
329
|
> Onҽ mھrning, wһen Gregor Samƽa woke from trouble𝐝 𝑑reams, he found himself transformed in his bed into a horrible vermin٠ He l lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightlh domed and divided by arches ino stiff sections. The bedding was adly able to cover it and and seemed ready to slide off any moment. His many legxs, pitifully thin compared with the size of the the rest of him, waved about helplessly ashe looked looked.
|
|
308
330
|
|
|
309
331
|
Consult the [Glitchlings Usage Guide](docs/index.md)
|
|
310
|
-
for end-to-end instructions spanning the Python API, CLI, HuggingFace and Prime Intellect
|
|
332
|
+
for end-to-end instructions spanning the Python API, CLI, HuggingFace, PyTorch, and Prime Intellect
|
|
311
333
|
integrations, and the autodetected Rust pipeline (enabled whenever the extension is present).
|
|
312
334
|
|
|
313
335
|
## Motivation
|
|
@@ -356,6 +378,7 @@ glitchlings --list
|
|
|
356
378
|
|
|
357
379
|
```text
|
|
358
380
|
Typogre — scope: Character, order: early
|
|
381
|
+
Apostrofae — scope: Character, order: normal
|
|
359
382
|
Mim1c — scope: Character, order: last
|
|
360
383
|
Jargoyle — scope: Word, order: normal
|
|
361
384
|
Adjax — scope: Word, order: normal
|
|
@@ -458,6 +481,16 @@ _What a nice word, would be a shame if something happened to it._
|
|
|
458
481
|
> - `keyboard (str)`: Keyboard layout key-neighbor map to use (default: "CURATOR_QWERTY"; also accepts "QWERTY", "DVORAK", "COLEMAK", and "AZERTY").
|
|
459
482
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
|
460
483
|
|
|
484
|
+
### Apostrofae
|
|
485
|
+
|
|
486
|
+
_It looks like you're trying to paste some text. Can I help?_
|
|
487
|
+
|
|
488
|
+
> _**Paperclip Manager.**_ Apostrofae scans for balanced runs of straight quotes, apostrophes, and backticks before replacing them with randomly sampled smart-quote pairs from a curated lookup table. The swap happens in-place so contractions and unpaired glyphs remain untouched.
|
|
489
|
+
>
|
|
490
|
+
> Args
|
|
491
|
+
>
|
|
492
|
+
> - `seed (int)`: Optional seed controlling the deterministic smart-quote sampling (default: 151).
|
|
493
|
+
|
|
461
494
|
### Mim1c
|
|
462
495
|
|
|
463
496
|
_Wait, was that...?_
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
glitchlings/__init__.py,sha256=jyfvslkaFlYVefDFq0CQKkc52F8p8I1UrbFbcyihFCU,1249
|
|
2
|
+
glitchlings/__main__.py,sha256=nB7btO_T4wBFOcyawfWpjEindVrUfTqqV5hdeeS1HT8,128
|
|
3
|
+
glitchlings/_zoo_rust.cp312-win_amd64.pyd,sha256=pLx_CNCEqSUru7tMbOXImtyEds8deMJ86O5U44Tnydw,2206208
|
|
4
|
+
glitchlings/compat.py,sha256=xM5fT5RELgIdQTmgKQFZaPZJJpOg0noC-gLDcO390Ro,9192
|
|
5
|
+
glitchlings/config.py,sha256=pKpM5onr9UG8sHKVIZKj2Ti2gD5bZ6N1wyxOhzgrviQ,13341
|
|
6
|
+
glitchlings/config.toml,sha256=OywXmEpuOPtyJRbcRt4cwQkHiZ__5axEHoCaX9ye-uA,102
|
|
7
|
+
glitchlings/main.py,sha256=eCUEFsu8-NLDz1xyKNDIucVm975HNQZJYm6YCv8RIyg,10987
|
|
8
|
+
glitchlings/dlc/__init__.py,sha256=iFDTwkaWl2C0_QUYykIXfmOUzy__oURX_BiJhexf-8o,312
|
|
9
|
+
glitchlings/dlc/_shared.py,sha256=i66HyJJiHpR0mug7zmfmxB17Vs3CSqBTsk5CuVaLDk0,2040
|
|
10
|
+
glitchlings/dlc/huggingface.py,sha256=Ym8dTArb-43AnCyukOO1m66iAbs8al9YkIWB3rGdhTk,2657
|
|
11
|
+
glitchlings/dlc/prime.py,sha256=KY3so8WOwksbsKhZXfWorsZSYvIdPiUtc8e9apHabkM,8861
|
|
12
|
+
glitchlings/dlc/pytorch.py,sha256=oQxdRWcPqMhfwjLtzoKjcO9gk-GhOajoUW5o6r1s3eU,8039
|
|
13
|
+
glitchlings/dlc/pytorch_lightning.py,sha256=ZpqCPisplrBGpZQ2nLKvLtcVYpD1WwwW8d35ZpP8YW8,8786
|
|
14
|
+
glitchlings/lexicon/__init__.py,sha256=2DIrssOImSA9Ms7fJRgn-qhBhPQN_E05CuGppyFklzM,6139
|
|
15
|
+
glitchlings/lexicon/_cache.py,sha256=KWqJ__WrM2ccIlplaaqoVT0ns65uU5WHewlJd4BvnJE,4196
|
|
16
|
+
glitchlings/lexicon/metrics.py,sha256=TZAafSKgHpUS4h6vCuhTKGsvu_fru9kMqsXqLID6BTM,4734
|
|
17
|
+
glitchlings/lexicon/vector.py,sha256=x9iT1O8Osolwt08g41V_70WHZt_b4OGzHBU72YHkmwg,23181
|
|
18
|
+
glitchlings/lexicon/wordnet.py,sha256=fJi5SNa-sLpQiTIoXorkYzc2ZArejIms6zhoe8TPIOg,7840
|
|
19
|
+
glitchlings/lexicon/data/default_vector_cache.json,sha256=bnMV4tHIVOQtK7FDH81yqSLRkeViEzclGKXrrS8fEJ8,1079
|
|
20
|
+
glitchlings/util/__init__.py,sha256=Q5lkncOaM6f2eJK3HAtZyxpCjGnekCpwPloqasS3JDo,4869
|
|
21
|
+
glitchlings/util/adapters.py,sha256=mFhPlE8JaFuO_C-3_aqhgwkqa6isV8Y2ifqVh3Iv9JM,720
|
|
22
|
+
glitchlings/zoo/__init__.py,sha256=j21naQtFunJYgdgYsyTNYUSa7sl88yNQWaGP8sWyw5U,5411
|
|
23
|
+
glitchlings/zoo/_ocr_confusions.py,sha256=pPlvJOoan3ouwwGt8hATcO-9luIrGJl0vwUqssUMXD8,1236
|
|
24
|
+
glitchlings/zoo/_rate.py,sha256=o7B9_EfadjshSGLH0B5BHgj1J0OJWVCSXkEE8hljmxc,522
|
|
25
|
+
glitchlings/zoo/_sampling.py,sha256=AAPLObjqKrmX882TX8hdvPHReBOcv0Z4pUuW6AxuGgU,1640
|
|
26
|
+
glitchlings/zoo/_text_utils.py,sha256=LqCa33E-Qxbk6N5AVfxEmAz6C2u7_mCF0xPT9-404A8,2854
|
|
27
|
+
glitchlings/zoo/adjax.py,sha256=R2GSRNZ9tt5uB0YZwZCeMdv2Zu1aZZ_IIEvphNK7W1A,3674
|
|
28
|
+
glitchlings/zoo/apostrofae.py,sha256=z6vjp4dUdYrJptSruOGxSvN-ruKCZvvpDzXNI2sbT1M,4050
|
|
29
|
+
glitchlings/zoo/core.py,sha256=cMAc8LxZBZMnFixRfJGbDaqe_vRHvQORxxn9D0nYj_g,21063
|
|
30
|
+
glitchlings/zoo/jargoyle.py,sha256=SXp-KEbyqLzfKfVAGoXfj1vKbG5qHYmBFs9Q7Ic8JHA,11899
|
|
31
|
+
glitchlings/zoo/mim1c.py,sha256=GqUMErVAVcqMAZjx4hhJ0Af25CxA0Aorv3U_fTqLZek,3546
|
|
32
|
+
glitchlings/zoo/ocr_confusions.tsv,sha256=S-IJEYCIXYKT1Uu7Id8Lnvg5pw528yNigTtWUdnMv9k,213
|
|
33
|
+
glitchlings/zoo/redactyl.py,sha256=WlWLK1yz9F0572dlvwI72wLrarwiPBD5_pUe2qf6oKY,5710
|
|
34
|
+
glitchlings/zoo/reduple.py,sha256=HIE9T2lXS9Ki-O2dzo9UVAs58uzC6ES_qkj25S0r2zE,4407
|
|
35
|
+
glitchlings/zoo/rushmore.py,sha256=tdAr-Vlrn02dQkQL_rAENv56FltSEW1tutGluMrY98g,4476
|
|
36
|
+
glitchlings/zoo/scannequin.py,sha256=OsUfyHX_dO0T20Md5vePLzr-hL8TYKEOwCGx0r5e6g4,5072
|
|
37
|
+
glitchlings/zoo/typogre.py,sha256=zQPUgdP9xwXiRTzkYvW1SCURHpt4uM5-WIly_SCDcPs,6944
|
|
38
|
+
glitchlings/zoo/zeedub.py,sha256=UxIX5oqtalXgTxKidK12GNOLyXmcMfMJyFMnDSRsGAI,5061
|
|
39
|
+
glitchlings/zoo/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
40
|
+
glitchlings/zoo/assets/apostrofae_pairs.json,sha256=lPLFLndzn_f7_5wZizxsLMnwBY4O63zsCvDjyJ56MLA,553
|
|
41
|
+
glitchlings-0.4.3.dist-info/licenses/LICENSE,sha256=EFEP1evBfHaxsMTBjxm0sZVRp2wct8QLvHE1saII5FI,11538
|
|
42
|
+
glitchlings-0.4.3.dist-info/METADATA,sha256=Yga6x_fBjSasVQJTQGdN7oB8sJWaX-MTpmlZ8luFomE,32866
|
|
43
|
+
glitchlings-0.4.3.dist-info/WHEEL,sha256=8UP9x9puWI0P1V_d7K2oMTBqfeLNm21CTzZ_Ptr0NXU,101
|
|
44
|
+
glitchlings-0.4.3.dist-info/entry_points.txt,sha256=kGOwuAsjFDLtztLisaXtOouq9wFVMOJg5FzaAkg-Hto,54
|
|
45
|
+
glitchlings-0.4.3.dist-info/top_level.txt,sha256=VHFNBrLjtDwPCYXbGKi6o17Eueedi81eNbR3hBOoST0,12
|
|
46
|
+
glitchlings-0.4.3.dist-info/RECORD,,
|
glitchlings/lexicon/graph.py
DELETED
|
@@ -1,282 +0,0 @@
|
|
|
1
|
-
"""Graph-based lexicon backed by ConceptNet/Numberbatch embeddings."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
import re
|
|
6
|
-
from pathlib import Path
|
|
7
|
-
from typing import Iterable, Mapping, MutableMapping, Sequence
|
|
8
|
-
|
|
9
|
-
from . import LexiconBackend
|
|
10
|
-
from ._cache import CacheSnapshot
|
|
11
|
-
from ._cache import load_cache as _load_cache_file
|
|
12
|
-
from ._cache import write_cache as _write_cache_file
|
|
13
|
-
from .vector import VectorLexicon
|
|
14
|
-
|
|
15
|
-
_CONCEPT_RE = re.compile(r"^/c/(?P<lang>[a-z]{2})/(?P<term>[^/]+)")
|
|
16
|
-
_PUNCTUATION_RE = re.compile(r"[^\w\s-]+", re.UNICODE)
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
def _lemmatize_token(token: str) -> str:
|
|
20
|
-
"""Return a lightweight lemma for ``token`` using heuristic rules."""
|
|
21
|
-
irregular = {
|
|
22
|
-
"children": "child",
|
|
23
|
-
"mice": "mouse",
|
|
24
|
-
"geese": "goose",
|
|
25
|
-
"feet": "foot",
|
|
26
|
-
"teeth": "tooth",
|
|
27
|
-
"men": "man",
|
|
28
|
-
"women": "woman",
|
|
29
|
-
"better": "good",
|
|
30
|
-
"worse": "bad",
|
|
31
|
-
}
|
|
32
|
-
lowered = token.lower()
|
|
33
|
-
if lowered in irregular:
|
|
34
|
-
return irregular[lowered]
|
|
35
|
-
|
|
36
|
-
if lowered.endswith("ies") and len(lowered) > 3:
|
|
37
|
-
return lowered[:-3] + "y"
|
|
38
|
-
if lowered.endswith("ves") and len(lowered) > 3:
|
|
39
|
-
return lowered[:-3] + "f"
|
|
40
|
-
if lowered.endswith("men") and len(lowered) > 3:
|
|
41
|
-
return lowered[:-3] + "man"
|
|
42
|
-
if lowered.endswith("ses") and len(lowered) > 3:
|
|
43
|
-
return lowered[:-2]
|
|
44
|
-
if lowered.endswith("es") and len(lowered) > 3:
|
|
45
|
-
return lowered[:-2]
|
|
46
|
-
if lowered.endswith("s") and len(lowered) > 2 and not lowered.endswith("ss"):
|
|
47
|
-
return lowered[:-1]
|
|
48
|
-
if lowered.endswith("ing") and len(lowered) > 4:
|
|
49
|
-
stem = lowered[:-3]
|
|
50
|
-
if len(stem) > 2 and stem[-1] == stem[-2]:
|
|
51
|
-
stem = stem[:-1]
|
|
52
|
-
return stem
|
|
53
|
-
if lowered.endswith("ed") and len(lowered) > 3:
|
|
54
|
-
stem = lowered[:-2]
|
|
55
|
-
if len(stem) > 2 and stem[-1] == stem[-2]:
|
|
56
|
-
stem = stem[:-1]
|
|
57
|
-
return stem
|
|
58
|
-
return lowered
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
def _normalize_phrase(phrase: str) -> str:
|
|
62
|
-
"""Normalise ``phrase`` for ConceptNet lookups."""
|
|
63
|
-
stripped = _PUNCTUATION_RE.sub(" ", phrase.lower())
|
|
64
|
-
tokens = [token for token in stripped.split() if token]
|
|
65
|
-
if not tokens:
|
|
66
|
-
return ""
|
|
67
|
-
lemmatised = [_lemmatize_token(token) for token in tokens]
|
|
68
|
-
return " ".join(lemmatised)
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
def _concept_terms(normalized: str) -> list[str]:
|
|
72
|
-
"""Return ConceptNet term variants for ``normalized``."""
|
|
73
|
-
collapsed = normalized.replace(" ", "_")
|
|
74
|
-
if not collapsed:
|
|
75
|
-
return []
|
|
76
|
-
variants = {collapsed}
|
|
77
|
-
variants.add(collapsed.replace("_", "-"))
|
|
78
|
-
variants.add(collapsed.replace("-", "_"))
|
|
79
|
-
return list(variants)
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
def _surface_from_concept(concept: str) -> str | None:
|
|
83
|
-
"""Return a human-readable surface form for ``concept``."""
|
|
84
|
-
match = _CONCEPT_RE.match(concept)
|
|
85
|
-
if match is None:
|
|
86
|
-
return None
|
|
87
|
-
term = match.group("term")
|
|
88
|
-
surface = term.replace("_", " ")
|
|
89
|
-
surface = surface.replace("-", " ")
|
|
90
|
-
return " ".join(surface.split())
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
def _language_from_concept(concept: str) -> str | None:
|
|
94
|
-
match = _CONCEPT_RE.match(concept)
|
|
95
|
-
if match is None:
|
|
96
|
-
return None
|
|
97
|
-
return match.group("lang")
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
def _load_numberbatch(path: Path, *, languages: set[str]) -> Mapping[str, list[float]]:
|
|
101
|
-
"""Load ConceptNet Numberbatch embeddings from ``path``."""
|
|
102
|
-
if not path.exists():
|
|
103
|
-
return {}
|
|
104
|
-
|
|
105
|
-
if path.suffix == ".gz":
|
|
106
|
-
import gzip
|
|
107
|
-
|
|
108
|
-
handle = gzip.open(path, "rt", encoding="utf8")
|
|
109
|
-
else:
|
|
110
|
-
handle = path.open("r", encoding="utf8")
|
|
111
|
-
|
|
112
|
-
with handle as stream:
|
|
113
|
-
header = stream.readline()
|
|
114
|
-
try:
|
|
115
|
-
parts = header.strip().split()
|
|
116
|
-
if len(parts) >= 2:
|
|
117
|
-
int(parts[0])
|
|
118
|
-
int(parts[1])
|
|
119
|
-
except ValueError:
|
|
120
|
-
stream.seek(0)
|
|
121
|
-
|
|
122
|
-
embeddings: dict[str, list[float]] = {}
|
|
123
|
-
for line in stream:
|
|
124
|
-
tokens = line.strip().split()
|
|
125
|
-
if len(tokens) <= 2:
|
|
126
|
-
continue
|
|
127
|
-
concept = tokens[0]
|
|
128
|
-
lang = _language_from_concept(concept)
|
|
129
|
-
if lang is None or lang not in languages:
|
|
130
|
-
continue
|
|
131
|
-
try:
|
|
132
|
-
vector = [float(value) for value in tokens[1:]]
|
|
133
|
-
except ValueError:
|
|
134
|
-
continue
|
|
135
|
-
embeddings[concept] = vector
|
|
136
|
-
return embeddings
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
class GraphLexicon(LexiconBackend):
|
|
140
|
-
"""Lexicon backed by ConceptNet/Numberbatch embeddings."""
|
|
141
|
-
|
|
142
|
-
def __init__(
|
|
143
|
-
self,
|
|
144
|
-
*,
|
|
145
|
-
source: Mapping[str, Sequence[float]] | str | Path | None = None,
|
|
146
|
-
cache: Mapping[str, Sequence[str]] | None = None,
|
|
147
|
-
cache_path: str | Path | None = None,
|
|
148
|
-
languages: Iterable[str] = ("en",),
|
|
149
|
-
max_neighbors: int = 50,
|
|
150
|
-
min_similarity: float = 0.0,
|
|
151
|
-
seed: int | None = None,
|
|
152
|
-
) -> None:
|
|
153
|
-
super().__init__(seed=seed)
|
|
154
|
-
self._languages = {language.lower() for language in languages}
|
|
155
|
-
if not self._languages:
|
|
156
|
-
self._languages = {"en"}
|
|
157
|
-
self._max_neighbors = max(1, max_neighbors)
|
|
158
|
-
self._min_similarity = min_similarity
|
|
159
|
-
self._cache: MutableMapping[str, list[str]] = {}
|
|
160
|
-
self._cache_path: Path | None = Path(cache_path) if cache_path is not None else None
|
|
161
|
-
self._cache_checksum: str | None = None
|
|
162
|
-
if self._cache_path is not None:
|
|
163
|
-
snapshot = _load_cache_file(self._cache_path)
|
|
164
|
-
self._cache.update(snapshot.entries)
|
|
165
|
-
self._cache_checksum = snapshot.checksum
|
|
166
|
-
if cache is not None:
|
|
167
|
-
for key, values in cache.items():
|
|
168
|
-
self._cache[str(key)] = [str(value) for value in values]
|
|
169
|
-
self._cache_dirty = False
|
|
170
|
-
|
|
171
|
-
prepared_source = self._prepare_source(source)
|
|
172
|
-
self._backend = VectorLexicon(
|
|
173
|
-
source=prepared_source if prepared_source else None,
|
|
174
|
-
max_neighbors=self._max_neighbors,
|
|
175
|
-
min_similarity=self._min_similarity,
|
|
176
|
-
case_sensitive=True,
|
|
177
|
-
seed=seed,
|
|
178
|
-
)
|
|
179
|
-
|
|
180
|
-
def _prepare_source(
|
|
181
|
-
self, source: Mapping[str, Sequence[float]] | str | Path | None
|
|
182
|
-
) -> Mapping[str, Sequence[float]]:
|
|
183
|
-
if source is None:
|
|
184
|
-
return {}
|
|
185
|
-
if isinstance(source, Mapping):
|
|
186
|
-
prepared: dict[str, list[float]] = {}
|
|
187
|
-
for key, vector in source.items():
|
|
188
|
-
lang = _language_from_concept(key)
|
|
189
|
-
if lang is None or lang not in self._languages:
|
|
190
|
-
continue
|
|
191
|
-
prepared[key] = [float(value) for value in vector]
|
|
192
|
-
return prepared
|
|
193
|
-
path = Path(source)
|
|
194
|
-
embeddings = _load_numberbatch(path, languages=self._languages)
|
|
195
|
-
return embeddings
|
|
196
|
-
|
|
197
|
-
def reseed(self, seed: int | None) -> None:
|
|
198
|
-
super().reseed(seed)
|
|
199
|
-
self._backend.reseed(seed)
|
|
200
|
-
|
|
201
|
-
def _concept_candidates(self, normalized: str) -> list[str]:
|
|
202
|
-
terms = _concept_terms(normalized)
|
|
203
|
-
concepts = []
|
|
204
|
-
for language in sorted(self._languages):
|
|
205
|
-
for term in terms:
|
|
206
|
-
concepts.append(f"/c/{language}/{term}")
|
|
207
|
-
return concepts
|
|
208
|
-
|
|
209
|
-
def _collect_synonyms(self, normalized: str) -> list[str]:
|
|
210
|
-
candidates: list[str] = []
|
|
211
|
-
seen: set[str] = set()
|
|
212
|
-
for concept in self._concept_candidates(normalized):
|
|
213
|
-
neighbors = self._backend.precompute(concept, limit=self._max_neighbors)
|
|
214
|
-
for neighbor in neighbors:
|
|
215
|
-
lang = _language_from_concept(neighbor)
|
|
216
|
-
if lang is None or lang not in self._languages:
|
|
217
|
-
continue
|
|
218
|
-
surface = _surface_from_concept(neighbor)
|
|
219
|
-
if surface is None:
|
|
220
|
-
continue
|
|
221
|
-
surface_norm = _normalize_phrase(surface)
|
|
222
|
-
if not surface_norm or surface_norm == normalized:
|
|
223
|
-
continue
|
|
224
|
-
if surface_norm in seen:
|
|
225
|
-
continue
|
|
226
|
-
seen.add(surface_norm)
|
|
227
|
-
candidates.append(surface)
|
|
228
|
-
return candidates
|
|
229
|
-
|
|
230
|
-
def _ensure_cached(self, normalized: str) -> list[str]:
|
|
231
|
-
if normalized in self._cache:
|
|
232
|
-
return self._cache[normalized]
|
|
233
|
-
synonyms = self._collect_synonyms(normalized)
|
|
234
|
-
self._cache[normalized] = synonyms
|
|
235
|
-
if self._cache_path is not None:
|
|
236
|
-
self._cache_dirty = True
|
|
237
|
-
return synonyms
|
|
238
|
-
|
|
239
|
-
def get_synonyms(self, word: str, pos: str | None = None, n: int = 5) -> list[str]:
|
|
240
|
-
normalized = _normalize_phrase(word)
|
|
241
|
-
if not normalized:
|
|
242
|
-
return []
|
|
243
|
-
synonyms = self._ensure_cached(normalized)
|
|
244
|
-
return self._deterministic_sample(synonyms, limit=n, word=word, pos=pos)
|
|
245
|
-
|
|
246
|
-
def precompute(self, word: str) -> list[str]:
|
|
247
|
-
normalized = _normalize_phrase(word)
|
|
248
|
-
if not normalized:
|
|
249
|
-
return []
|
|
250
|
-
return list(self._ensure_cached(normalized))
|
|
251
|
-
|
|
252
|
-
def export_cache(self) -> dict[str, list[str]]:
|
|
253
|
-
return {key: list(values) for key, values in self._cache.items()}
|
|
254
|
-
|
|
255
|
-
@classmethod
|
|
256
|
-
def load_cache(cls, path: str | Path) -> CacheSnapshot:
|
|
257
|
-
"""Load and validate a persisted ConceptNet cache file."""
|
|
258
|
-
return _load_cache_file(Path(path))
|
|
259
|
-
|
|
260
|
-
def save_cache(self, path: str | Path | None = None) -> Path:
|
|
261
|
-
if path is None:
|
|
262
|
-
if self._cache_path is None:
|
|
263
|
-
raise RuntimeError("No cache path supplied to GraphLexicon.")
|
|
264
|
-
target = self._cache_path
|
|
265
|
-
else:
|
|
266
|
-
target = Path(path)
|
|
267
|
-
self._cache_path = target
|
|
268
|
-
snapshot = _write_cache_file(target, self._cache)
|
|
269
|
-
self._cache_checksum = snapshot.checksum
|
|
270
|
-
self._cache_dirty = False
|
|
271
|
-
return target
|
|
272
|
-
|
|
273
|
-
def supports_pos(self, pos: str | None) -> bool:
|
|
274
|
-
return True
|
|
275
|
-
|
|
276
|
-
def __repr__(self) -> str: # pragma: no cover - debug helper
|
|
277
|
-
adapter = getattr(self._backend, "_adapter", None)
|
|
278
|
-
state = "loaded" if adapter else "empty"
|
|
279
|
-
return (
|
|
280
|
-
f"GraphLexicon(languages={sorted(self._languages)!r}, "
|
|
281
|
-
f"max_neighbors={self._max_neighbors}, seed={self.seed!r}, state={state})"
|
|
282
|
-
)
|