glitchlings 0.4.2-cp310-cp310-macosx_11_0_universal2.whl → 0.4.3-cp310-cp310-macosx_11_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of glitchlings might be problematic.

Files changed (35)
  1. glitchlings/__init__.py +4 -0
  2. glitchlings/_zoo_rust.cpython-310-darwin.so +0 -0
  3. glitchlings/compat.py +80 -11
  4. glitchlings/config.py +32 -19
  5. glitchlings/config.toml +1 -1
  6. glitchlings/dlc/__init__.py +3 -1
  7. glitchlings/dlc/pytorch.py +216 -0
  8. glitchlings/dlc/pytorch_lightning.py +233 -0
  9. glitchlings/lexicon/__init__.py +5 -15
  10. glitchlings/lexicon/_cache.py +21 -15
  11. glitchlings/lexicon/data/default_vector_cache.json +80 -14
  12. glitchlings/lexicon/vector.py +94 -15
  13. glitchlings/lexicon/wordnet.py +66 -25
  14. glitchlings/main.py +21 -11
  15. glitchlings/zoo/__init__.py +5 -1
  16. glitchlings/zoo/adjax.py +2 -2
  17. glitchlings/zoo/apostrofae.py +128 -0
  18. glitchlings/zoo/assets/__init__.py +0 -0
  19. glitchlings/zoo/assets/apostrofae_pairs.json +32 -0
  20. glitchlings/zoo/core.py +40 -14
  21. glitchlings/zoo/jargoyle.py +44 -34
  22. glitchlings/zoo/redactyl.py +11 -8
  23. glitchlings/zoo/reduple.py +2 -2
  24. glitchlings/zoo/rushmore.py +2 -2
  25. glitchlings/zoo/scannequin.py +2 -2
  26. glitchlings/zoo/typogre.py +5 -2
  27. glitchlings/zoo/zeedub.py +5 -2
  28. {glitchlings-0.4.2.dist-info → glitchlings-0.4.3.dist-info}/METADATA +35 -2
  29. glitchlings-0.4.3.dist-info/RECORD +46 -0
  30. glitchlings/lexicon/graph.py +0 -282
  31. glitchlings-0.4.2.dist-info/RECORD +0 -42
  32. {glitchlings-0.4.2.dist-info → glitchlings-0.4.3.dist-info}/WHEEL +0 -0
  33. {glitchlings-0.4.2.dist-info → glitchlings-0.4.3.dist-info}/entry_points.txt +0 -0
  34. {glitchlings-0.4.2.dist-info → glitchlings-0.4.3.dist-info}/licenses/LICENSE +0 -0
  35. {glitchlings-0.4.2.dist-info → glitchlings-0.4.3.dist-info}/top_level.txt +0 -0
glitchlings/zoo/jargoyle.py CHANGED
@@ -2,20 +2,25 @@ import random
 import re
 from collections.abc import Iterable
 from dataclasses import dataclass
+from types import ModuleType
 from typing import Any, Literal, cast
 
 from glitchlings.lexicon import Lexicon, get_default_lexicon
 
+from ._rate import resolve_rate
+from .core import AttackWave, Glitchling
+
+_wordnet_module: ModuleType | None
+
 try:  # pragma: no cover - optional WordNet dependency
-    from glitchlings.lexicon.wordnet import (
-        WordNetLexicon,
-    )
-    from glitchlings.lexicon.wordnet import (
-        dependencies_available as _lexicon_dependencies_available,
-    )
-    from glitchlings.lexicon.wordnet import ensure_wordnet as _lexicon_ensure_wordnet
+    import glitchlings.lexicon.wordnet as _wordnet_module
 except Exception:  # pragma: no cover - triggered when nltk unavailable
-    WordNetLexicon = None  # type: ignore[assignment]
+    _wordnet_module = None
+
+_wordnet_runtime: ModuleType | None = _wordnet_module
+
+WordNetLexicon: type[Lexicon] | None
+if _wordnet_runtime is None:
 
     def _lexicon_dependencies_available() -> bool:
         return False
@@ -26,9 +31,12 @@ except Exception:  # pragma: no cover - triggered when nltk unavailable
            "and download its WordNet corpus manually if you need legacy synonyms."
        )
 
+    WordNetLexicon = None
+else:
+    WordNetLexicon = cast(type[Lexicon], _wordnet_runtime.WordNetLexicon)
+    _lexicon_dependencies_available = _wordnet_runtime.dependencies_available
+    _lexicon_ensure_wordnet = _wordnet_runtime.ensure_wordnet
 
-from ._rate import resolve_rate
-from .core import AttackWave, Glitchling
 
 ensure_wordnet = _lexicon_ensure_wordnet
 
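The two jargoyle.py hunks above swap several per-name `from glitchlings.lexicon.wordnet import ...` statements for a single optional module import gated in one place. A minimal sketch of that pattern, with hypothetical names (`nltk`, `tokenize`) standing in for the glitchlings internals:

```python
# Sketch of the optional-dependency gating above (hypothetical names).
from types import ModuleType
from typing import Callable

_nltk: ModuleType | None
try:
    import nltk as _nltk  # optional dependency; absence is not an error
except Exception:
    _nltk = None

tokenize: Callable[[str], list[str]]
if _nltk is None:
    def tokenize(text: str) -> list[str]:
        # Degraded fallback so callers never need to care about the import.
        return text.split()
else:
    tokenize = _nltk.word_tokenize
```

Importing the module once means a single `except` clause guards every exported name, instead of three separate imports that can fail half-way through.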
@@ -169,34 +177,36 @@ def substitute_random_synonyms(
     candidate_indices: list[int] = []
     candidate_metadata: dict[int, CandidateInfo] = {}
     for idx, tok in enumerate(tokens):
-        if idx % 2 == 0 and tok and not tok.isspace():
-            prefix, core_word, suffix = _split_token(tok)
-            if not core_word:
-                continue
-
-            chosen_pos: str | None = None
-            synonyms: list[str] = []
+        if idx % 2 != 0 or not tok or tok.isspace():
+            continue
 
-            for pos in target_pos:
-                if not active_lexicon.supports_pos(pos):
-                    continue
-                synonyms = active_lexicon.get_synonyms(core_word, pos=pos)
-                if synonyms:
-                    chosen_pos = pos
-                    break
+        prefix, core_word, suffix = _split_token(tok)
+        if not core_word:
+            continue
 
-            if not synonyms and active_lexicon.supports_pos(None):
-                synonyms = active_lexicon.get_synonyms(core_word, pos=None)
+        chosen_pos: str | None = None
+        synonyms: list[str] = []
 
+        for tag in target_pos:
+            if not active_lexicon.supports_pos(tag):
+                continue
+            synonyms = active_lexicon.get_synonyms(core_word, pos=tag)
             if synonyms:
-                candidate_indices.append(idx)
-                candidate_metadata[idx] = CandidateInfo(
-                    prefix=prefix,
-                    core_word=core_word,
-                    suffix=suffix,
-                    part_of_speech=chosen_pos,
-                    synonyms=synonyms,
-                )
+                chosen_pos = tag
+                break
+
+        if not synonyms and active_lexicon.supports_pos(None):
+            synonyms = active_lexicon.get_synonyms(core_word, pos=None)
+
+        if synonyms:
+            candidate_indices.append(idx)
+            candidate_metadata[idx] = CandidateInfo(
+                prefix=prefix,
+                core_word=core_word,
+                suffix=suffix,
+                part_of_speech=chosen_pos,
+                synonyms=synonyms,
+            )
 
     if not candidate_indices:
         return text
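This hunk is a behaviour-preserving guard-clause refactor: `if idx % 2 == 0 and tok and not tok.isspace():` becomes an inverted early `continue`, removing one nesting level from everything that follows. In miniature:

```python
# Miniature of the early-continue refactor (toy data, not jargoyle's tokens).
words = ["alpha", " ", "", "beta"]

selected: list[str] = []
for idx, tok in enumerate(words):
    if idx % 2 != 0 or not tok or tok.isspace():
        continue  # reject separators and empties up front; the body stays flat
    selected.append(tok.upper())

assert selected == ["ALPHA"]
```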
glitchlings/zoo/redactyl.py CHANGED
@@ -1,6 +1,6 @@
 import random
 import re
-from typing import Any
+from typing import Any, cast
 
 from ._rate import resolve_rate
 from ._sampling import weighted_sample_without_replacement
@@ -119,13 +119,16 @@ def redact_words(
     use_rust = _redact_words_rust is not None and isinstance(merge_adjacent, bool)
 
     if use_rust:
-        return _redact_words_rust(
-            text,
-            replacement_char,
-            clamped_rate,
-            merge_adjacent,
-            unweighted_flag,
-            rng,
+        return cast(
+            str,
+            _redact_words_rust(
+                text,
+                replacement_char,
+                clamped_rate,
+                merge_adjacent,
+                unweighted_flag,
+                rng,
+            ),
        )
 
     return _python_redact_words(
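This hunk, and the reduple, rushmore, scannequin, typogre, and zeedub hunks that follow, all make the same typing fix: the Rust extension functions look untyped (`Any`) to the type checker, so their results are wrapped in `typing.cast(str, ...)`. `cast` is a no-op at runtime. A self-contained sketch with a stand-in for the native call (not the real `_zoo_rust` API):

```python
from typing import Any, cast

def _redact_rust(text: str) -> Any:  # stand-in for an untyped native function
    return text.replace("secret", "█" * 6)

def redact(text: str) -> str:
    # cast() performs no conversion or check at runtime; it only narrows
    # the Any-typed native result to str so this signature type-checks.
    return cast(str, _redact_rust(text))

assert redact("a secret plan") == "a ██████ plan"
```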
glitchlings/zoo/reduple.py CHANGED
@@ -1,5 +1,5 @@
 import random
-from typing import Any
+from typing import Any, cast
 
 from ._rate import resolve_rate
 from ._text_utils import WordToken, collect_word_tokens, split_preserving_whitespace
@@ -94,7 +94,7 @@ def reduplicate_words(
     unweighted_flag = bool(unweighted)
 
     if _reduplicate_words_rust is not None:
-        return _reduplicate_words_rust(text, clamped_rate, unweighted_flag, rng)
+        return cast(str, _reduplicate_words_rust(text, clamped_rate, unweighted_flag, rng))
 
     return _python_reduplicate_words(
         text,
glitchlings/zoo/rushmore.py CHANGED
@@ -1,7 +1,7 @@
 import math
 import random
 import re
-from typing import Any
+from typing import Any, cast
 
 from ._rate import resolve_rate
 from ._text_utils import WordToken, collect_word_tokens, split_preserving_whitespace
@@ -97,7 +97,7 @@ def delete_random_words(
     unweighted_flag = bool(unweighted)
 
     if _delete_random_words_rust is not None:
-        return _delete_random_words_rust(text, clamped_rate, unweighted_flag, rng)
+        return cast(str, _delete_random_words_rust(text, clamped_rate, unweighted_flag, rng))
 
     return _python_delete_random_words(
         text,
glitchlings/zoo/scannequin.py CHANGED
@@ -1,6 +1,6 @@
 import random
 import re
-from typing import Any
+from typing import Any, cast
 
 from ._ocr_confusions import load_confusion_table
 from ._rate import resolve_rate
@@ -126,7 +126,7 @@ def ocr_artifacts(
     clamped_rate = max(0.0, effective_rate)
 
     if _ocr_artifacts_rust is not None:
-        return _ocr_artifacts_rust(text, clamped_rate, rng)
+        return cast(str, _ocr_artifacts_rust(text, clamped_rate, rng))
 
     return _python_ocr_artifacts(text, rate=clamped_rate, rng=rng)
 
glitchlings/zoo/typogre.py CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 import math
 import random
-from typing import Any, Optional
+from typing import Any, Optional, cast
 
 from ..util import KEYNEIGHBORS
 from ._rate import resolve_rate
@@ -168,7 +168,10 @@ def fatfinger(
     layout = getattr(KEYNEIGHBORS, keyboard)
 
     if _fatfinger_rust is not None:
-        return _fatfinger_rust(text, max_change_rate=clamped_rate, layout=layout, rng=rng)
+        return cast(
+            str,
+            _fatfinger_rust(text, max_change_rate=clamped_rate, layout=layout, rng=rng),
+        )
 
     return _fatfinger_python(text, rate=clamped_rate, layout=layout, rng=rng)
 
glitchlings/zoo/zeedub.py CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
 import math
 import random
 from collections.abc import Sequence
-from typing import Any
+from typing import Any, cast
 
 from ._rate import resolve_rate
 from .core import AttackOrder, AttackWave, Glitchling
@@ -115,7 +115,10 @@ def insert_zero_widths(
     if hasattr(rng, "getstate"):
         python_state = rng.getstate()
     rng.setstate(state)
-    rust_result = _inject_zero_widths_rust(text, clamped_rate, list(cleaned_palette), rng)
+    rust_result = cast(
+        str,
+        _inject_zero_widths_rust(text, clamped_rate, list(cleaned_palette), rng),
+    )
     if rust_result == python_result:
         return rust_result
     if python_state is not None and hasattr(rng, "setstate"):
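The context lines of this zeedub hunk show the parity check around the native path: the generator state is snapshotted, the Python implementation runs, the state is rewound with `rng.setstate(state)`, and the Rust result is only returned when it matches. A simplified sketch with stand-in implementations:

```python
import random

ZWSP = "\u200b"  # zero-width space

def _insert_py(text: str, rng: random.Random) -> str:
    # Stand-in for the pure-Python path: maybe add a zero-width per character.
    return "".join(c + (ZWSP if rng.random() < 0.25 else "") for c in text)

_insert_native = _insert_py  # stand-in for the Rust extension

def insert_checked(text: str, seed: int = 151) -> str:
    rng = random.Random(seed)
    state = rng.getstate()   # snapshot before the first run
    python_result = _insert_py(text, rng)
    rng.setstate(state)      # rewind so both paths consume identical draws
    rust_result = _insert_native(text, rng)
    # Fall back to the reference output if the native path ever disagrees.
    return rust_result if rust_result == python_result else python_result
```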
{glitchlings-0.4.2.dist-info → glitchlings-0.4.3.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: glitchlings
-Version: 0.4.2
+Version: 0.4.3
 Summary: Monsters for your language games.
 Author: osoleve
 License: Apache License
@@ -226,15 +226,37 @@ License-File: LICENSE
 Requires-Dist: confusable-homoglyphs>=3.3.1
 Requires-Dist: tomli>=2.0.1; python_version < "3.11"
 Requires-Dist: pyyaml>=6.0.0
+Provides-Extra: all
+Requires-Dist: black>=24.4.0; extra == "all"
+Requires-Dist: hypothesis>=6.140.0; extra == "all"
+Requires-Dist: interrogate>=1.5.0; extra == "all"
+Requires-Dist: jellyfish>=1.2.0; extra == "all"
+Requires-Dist: isort>=5.13.0; extra == "all"
+Requires-Dist: mkdocs>=1.6.0; extra == "all"
+Requires-Dist: mkdocs-material>=9.5.0; extra == "all"
+Requires-Dist: mkdocstrings[python]>=0.24.0; extra == "all"
+Requires-Dist: mkdocstrings-python>=1.10.0; extra == "all"
+Requires-Dist: mypy>=1.8.0; extra == "all"
+Requires-Dist: numpy<=2.0,>=1.24; extra == "all"
+Requires-Dist: pre-commit>=3.8.0; extra == "all"
+Requires-Dist: pytest>=8.0.0; extra == "all"
+Requires-Dist: ruff>=0.6.0; extra == "all"
+Requires-Dist: verifiers>=0.1.3.post0; extra == "all"
 Provides-Extra: hf
 Requires-Dist: datasets>=4.0.0; extra == "hf"
+Provides-Extra: lightning
+Requires-Dist: pytorch_lightning>=2.0.0; extra == "lightning"
 Provides-Extra: vectors
 Requires-Dist: numpy<=2.0,>=1.24; extra == "vectors"
 Requires-Dist: spacy>=3.7.2; extra == "vectors"
 Requires-Dist: gensim>=4.3.2; extra == "vectors"
+Provides-Extra: st
+Requires-Dist: sentence-transformers>=3.0.0; extra == "st"
 Provides-Extra: prime
 Requires-Dist: verifiers>=0.1.3.post0; extra == "prime"
 Requires-Dist: jellyfish>=1.2.0; extra == "prime"
+Provides-Extra: torch
+Requires-Dist: torch>=2.0.0; extra == "torch"
 Provides-Extra: dev
 Requires-Dist: pytest>=8.0.0; extra == "dev"
 Requires-Dist: hypothesis>=6.140.0; extra == "dev"
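The new extras install with standard pip extras syntax, for example:

```text
pip install "glitchlings[torch]"          # just the PyTorch integration
pip install "glitchlings[st,lightning]"   # sentence-transformers + Lightning
```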
@@ -307,7 +329,7 @@ print(gaggle(SAMPLE_TEXT))
 > Onҽ m‎ھ‎rning, wһen Gregor Samƽa woke from trouble𝐝 𝑑reams, he found himself transformed in his bed into a horrible vermin‎٠‎ He l lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightlh domed and divided by arches ino stiff sections. The bedding was adly able to cover it and and seemed ready to slide off any moment. His many legxs, pitifully thin compared with the size of the the rest of him, waved about helplessly ashe looked looked.
 
 Consult the [Glitchlings Usage Guide](docs/index.md)
-for end-to-end instructions spanning the Python API, CLI, HuggingFace and Prime Intellect
+for end-to-end instructions spanning the Python API, CLI, HuggingFace, PyTorch, and Prime Intellect
 integrations, and the autodetected Rust pipeline (enabled whenever the extension is present).
 
 ## Motivation
@@ -356,6 +378,7 @@ glitchlings --list
 
 ```text
 Typogre — scope: Character, order: early
+Apostrofae — scope: Character, order: normal
 Mim1c — scope: Character, order: last
 Jargoyle — scope: Word, order: normal
 Adjax — scope: Word, order: normal
@@ -458,6 +481,16 @@ _What a nice word, would be a shame if something happened to it._
 > - `keyboard (str)`: Keyboard layout key-neighbor map to use (default: "CURATOR_QWERTY"; also accepts "QWERTY", "DVORAK", "COLEMAK", and "AZERTY").
 > - `seed (int)`: The random seed for reproducibility (default: 151).
 
+### Apostrofae
+
+_It looks like you're trying to paste some text. Can I help?_
+
+> _**Paperclip Manager.**_ Apostrofae scans for balanced runs of straight quotes, apostrophes, and backticks before replacing them with randomly sampled smart-quote pairs from a curated lookup table. The swap happens in-place so contractions and unpaired glyphs remain untouched.
+>
+> Args
+>
+> - `seed (int)`: Optional seed controlling the deterministic smart-quote sampling (default: 151).
+
 ### Mim1c
 
 _Wait, was that...?_
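A rough sketch of the balanced-pair behaviour the new Apostrofae entry describes; the shipped glitchling samples from its curated `apostrofae_pairs.json` asset, whereas this toy version hard-codes two pairs and handles only balanced double quotes:

```python
import random
import re

SMART_PAIRS = [("\u201c", "\u201d"), ("\u2018", "\u2019")]  # hypothetical table

def smarten(text: str, seed: int = 151) -> str:
    rng = random.Random(seed)

    def swap(match: re.Match[str]) -> str:
        left, right = rng.choice(SMART_PAIRS)
        return f"{left}{match.group(1)}{right}"

    # Only balanced pairs match, so the apostrophe in "don't" stays untouched.
    return re.sub(r'"([^"]*)"', swap, text)
```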
glitchlings-0.4.3.dist-info/RECORD ADDED
@@ -0,0 +1,46 @@
+glitchlings/__init__.py,sha256=bkyRgzjC8ssidEO9UL9VpbYXQxTV1Hz3VAPOIqd9uMg,1182
+glitchlings/__main__.py,sha256=f-P4jiVBd7ZpS6QxRpa_6SJgOG03UhZhcWasMDRWLs8,120
+glitchlings/_zoo_rust.cpython-310-darwin.so,sha256=JiW71HlRNYLu0G2bCRnjHrYxjbQ7fvGXjyc4tAWE-5c,2587040
+glitchlings/compat.py,sha256=T_5Ia8yCzZvsMdicZ2TCcOgDO53_AjNGkSXWTR_qEnA,8908
+glitchlings/config.py,sha256=ofxDMkoMg4j51CFube54aca1Ky9y_ZeVktXpeUEdWmA,12953
+glitchlings/config.toml,sha256=04-Y_JCdQU68SRmwk2qZqrH_bbX4jEH9uh7URtxdIHA,99
+glitchlings/main.py,sha256=uw8VbDgxov1m-wYHPDl2dP5ItpLB4ZHpb0ChJXzcL0o,10623
+glitchlings/dlc/__init__.py,sha256=qlY4nuagy4AAWuPMwmuhwK2m36ktp-qkeiIxC7OXg34,305
+glitchlings/dlc/_shared.py,sha256=EFSnush3rjjaf4La5QfVaf_KEp0U_l_3-q4PKx0A6NQ,1972
+glitchlings/dlc/huggingface.py,sha256=9lW7TnTHA_bXyo4Is8pymZchrB9BIL1bMCP2p7LCMtg,2576
+glitchlings/dlc/prime.py,sha256=qGFI1d4BiOEIgQZ5v9QnlbYx4J4q-vNlh5tWZng11xs,8607
+glitchlings/dlc/pytorch.py,sha256=tfHEDsDAOUnEvImFgRMjqC7Ig_aNVO8suXKpv24C2cA,7823
+glitchlings/dlc/pytorch_lightning.py,sha256=Om45BHYx8tMoUwYOOTk5B5A5AIjNkh58V37OC2IBFxE,8553
+glitchlings/lexicon/__init__.py,sha256=PLuu63iX6GSRypGI4DxiN_U-QmqmDobk1Xb7B5IrsZg,5951
+glitchlings/lexicon/_cache.py,sha256=aWSUb5Ex162dr3HouO2Ic2O8ck3ViEFWs8-XMLKMeJ0,4086
+glitchlings/lexicon/metrics.py,sha256=VBFfFpxjiEwZtK-jS55H8xP7MTC_0OjY8lQ5zSQ9aTY,4572
+glitchlings/lexicon/vector.py,sha256=yWf-vlN2OEHnTCPu7tgDnJbhm47cmhdrTtjR0RZKkUM,22530
+glitchlings/lexicon/wordnet.py,sha256=YcOliPHuesdlekmGspwAyR4fWDDxZWR_dIt_Nsq7ag0,7608
+glitchlings/lexicon/data/default_vector_cache.json,sha256=3iVH0nX8EqMbqOkKWvORCGYtN0LKHn5G_Snlizsnm1g,997
+glitchlings/util/__init__.py,sha256=vc3EAY8ehRjbOiryFdaqvvljXcyNGtZSPiEp9ok1vVw,4674
+glitchlings/util/adapters.py,sha256=psxQFYSFmh1u7NuqtIrKwQP5FOhOrZoxZzc7X7DDi9U,693
+glitchlings/zoo/__init__.py,sha256=1dWZPCTXuh5J7WdCxHX7ZX9bNd8bakzYndxQRhF43i8,5243
+glitchlings/zoo/_ocr_confusions.py,sha256=Ju2_avXiwsr1p8zWFUTOzMxJ8vT5PpYobuGIn4L_sqI,1204
+glitchlings/zoo/_rate.py,sha256=Vb1_5HAzrqr9eAh_zzngSV-d0zI264zcYspnT3VHPkE,504
+glitchlings/zoo/_sampling.py,sha256=KrWyUSsYXghlvktS5hQBO0bPqywEEyA49A2qDWInB7Q,1586
+glitchlings/zoo/_text_utils.py,sha256=fS5L_eq-foBbBdiv4ymI8-O0D0csc3yDekHpX8bqfV4,2754
+glitchlings/zoo/adjax.py,sha256=TABKGQOwpyj_5czSoN8tPyEinwp8oZHKOBfU78ae9n0,3545
+glitchlings/zoo/apostrofae.py,sha256=m2-VPO-ahp0zAEJTHPItXMwnpD9D8bQIjVyyIRzj46k,3922
+glitchlings/zoo/core.py,sha256=3IHYEo8f2K7q4EbSZBYPb4MQXUVoMPm6B0IgsjiWNXk,20493
+glitchlings/zoo/jargoyle.py,sha256=zGXi6WFSzYA_44UXvyK0aj18CMFHIFL4eQeijEHfZl4,11568
+glitchlings/zoo/mim1c.py,sha256=-fgodKWZq--Xw8L2t1EqNbsh48bwX5jZxmiXdoaQShI,3437
+glitchlings/zoo/ocr_confusions.tsv,sha256=KhtR7vJDTITpfTSGa-I7RHr6CK7LkGi2KjdhEWipI6o,183
+glitchlings/zoo/redactyl.py,sha256=9Rtgkg87LnGt47DHKsD8XW25gtg9pv2aXvrFv46XOTQ,5516
+glitchlings/zoo/reduple.py,sha256=ttHha3Yl0SRzEyAx9SfENbJRO_WhmJYL8ow5LGKn248,4258
+glitchlings/zoo/rushmore.py,sha256=R6dgt4HSvkt31foazNmUhO4wL9PHpjh_7pzJ8vQPgO0,4322
+glitchlings/zoo/scannequin.py,sha256=AQ7JPIxLiPFy4fDV6MgO4OFo34dMShc7sipStUaCG40,4900
+glitchlings/zoo/typogre.py,sha256=AuAtx-KyWrk-zX3uuxjkvjiduLyDwGJNW7XYktnsuos,6712
+glitchlings/zoo/zeedub.py,sha256=3VneZOEeL98Ek1VnZQI4V2o1alv41vvMzZXrKc9Lt1s,4875
+glitchlings/zoo/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+glitchlings/zoo/assets/apostrofae_pairs.json,sha256=bfjSEaMTI_axGNJ93nI431KXU0IVp7ayO42gGcMgL6U,521
+glitchlings-0.4.3.dist-info/licenses/LICENSE,sha256=YCvGip-LoaRyu6h0nPo71q6eHEkzUpsE11psDJOIRkw,11337
+glitchlings-0.4.3.dist-info/METADATA,sha256=NV0-8T4jx5R2Eswhib6B29vAeoiXBdIDBFOu6KrzqdM,32242
+glitchlings-0.4.3.dist-info/WHEEL,sha256=G4cu_uTI97hAXSudQC0D9fpgNQkuavCNljtwFXiUqZM,114
+glitchlings-0.4.3.dist-info/entry_points.txt,sha256=kGOwuAsjFDLtztLisaXtOouq9wFVMOJg5FzaAkg-Hto,54
+glitchlings-0.4.3.dist-info/top_level.txt,sha256=VHFNBrLjtDwPCYXbGKi6o17Eueedi81eNbR3hBOoST0,12
+glitchlings-0.4.3.dist-info/RECORD,,
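Each RECORD row is `path,sha256=<urlsafe-base64 digest, padding stripped>,<size in bytes>`, with the RECORD file itself listed last with empty hash and size fields. A digest in that format can be recomputed like so:

```python
import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    # Rebuild "path,sha256=<digest>,<size>" exactly as RECORD encodes it.
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode()},{len(data)}"
```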
glitchlings/lexicon/graph.py DELETED
@@ -1,282 +0,0 @@
-"""Graph-based lexicon backed by ConceptNet/Numberbatch embeddings."""
-
-from __future__ import annotations
-
-import re
-from pathlib import Path
-from typing import Iterable, Mapping, MutableMapping, Sequence
-
-from . import LexiconBackend
-from ._cache import CacheSnapshot
-from ._cache import load_cache as _load_cache_file
-from ._cache import write_cache as _write_cache_file
-from .vector import VectorLexicon
-
-_CONCEPT_RE = re.compile(r"^/c/(?P<lang>[a-z]{2})/(?P<term>[^/]+)")
-_PUNCTUATION_RE = re.compile(r"[^\w\s-]+", re.UNICODE)
-
-
-def _lemmatize_token(token: str) -> str:
-    """Return a lightweight lemma for ``token`` using heuristic rules."""
-    irregular = {
-        "children": "child",
-        "mice": "mouse",
-        "geese": "goose",
-        "feet": "foot",
-        "teeth": "tooth",
-        "men": "man",
-        "women": "woman",
-        "better": "good",
-        "worse": "bad",
-    }
-    lowered = token.lower()
-    if lowered in irregular:
-        return irregular[lowered]
-
-    if lowered.endswith("ies") and len(lowered) > 3:
-        return lowered[:-3] + "y"
-    if lowered.endswith("ves") and len(lowered) > 3:
-        return lowered[:-3] + "f"
-    if lowered.endswith("men") and len(lowered) > 3:
-        return lowered[:-3] + "man"
-    if lowered.endswith("ses") and len(lowered) > 3:
-        return lowered[:-2]
-    if lowered.endswith("es") and len(lowered) > 3:
-        return lowered[:-2]
-    if lowered.endswith("s") and len(lowered) > 2 and not lowered.endswith("ss"):
-        return lowered[:-1]
-    if lowered.endswith("ing") and len(lowered) > 4:
-        stem = lowered[:-3]
-        if len(stem) > 2 and stem[-1] == stem[-2]:
-            stem = stem[:-1]
-        return stem
-    if lowered.endswith("ed") and len(lowered) > 3:
-        stem = lowered[:-2]
-        if len(stem) > 2 and stem[-1] == stem[-2]:
-            stem = stem[:-1]
-        return stem
-    return lowered
-
-
-def _normalize_phrase(phrase: str) -> str:
-    """Normalise ``phrase`` for ConceptNet lookups."""
-    stripped = _PUNCTUATION_RE.sub(" ", phrase.lower())
-    tokens = [token for token in stripped.split() if token]
-    if not tokens:
-        return ""
-    lemmatised = [_lemmatize_token(token) for token in tokens]
-    return " ".join(lemmatised)
-
-
-def _concept_terms(normalized: str) -> list[str]:
-    """Return ConceptNet term variants for ``normalized``."""
-    collapsed = normalized.replace(" ", "_")
-    if not collapsed:
-        return []
-    variants = {collapsed}
-    variants.add(collapsed.replace("_", "-"))
-    variants.add(collapsed.replace("-", "_"))
-    return list(variants)
-
-
-def _surface_from_concept(concept: str) -> str | None:
-    """Return a human-readable surface form for ``concept``."""
-    match = _CONCEPT_RE.match(concept)
-    if match is None:
-        return None
-    term = match.group("term")
-    surface = term.replace("_", " ")
-    surface = surface.replace("-", " ")
-    return " ".join(surface.split())
-
-
-def _language_from_concept(concept: str) -> str | None:
-    match = _CONCEPT_RE.match(concept)
-    if match is None:
-        return None
-    return match.group("lang")
-
-
-def _load_numberbatch(path: Path, *, languages: set[str]) -> Mapping[str, list[float]]:
-    """Load ConceptNet Numberbatch embeddings from ``path``."""
-    if not path.exists():
-        return {}
-
-    if path.suffix == ".gz":
-        import gzip
-
-        handle = gzip.open(path, "rt", encoding="utf8")
-    else:
-        handle = path.open("r", encoding="utf8")
-
-    with handle as stream:
-        header = stream.readline()
-        try:
-            parts = header.strip().split()
-            if len(parts) >= 2:
-                int(parts[0])
-                int(parts[1])
-        except ValueError:
-            stream.seek(0)
-
-        embeddings: dict[str, list[float]] = {}
-        for line in stream:
-            tokens = line.strip().split()
-            if len(tokens) <= 2:
-                continue
-            concept = tokens[0]
-            lang = _language_from_concept(concept)
-            if lang is None or lang not in languages:
-                continue
-            try:
-                vector = [float(value) for value in tokens[1:]]
-            except ValueError:
-                continue
-            embeddings[concept] = vector
-    return embeddings
-
-
-class GraphLexicon(LexiconBackend):
-    """Lexicon backed by ConceptNet/Numberbatch embeddings."""
-
-    def __init__(
-        self,
-        *,
-        source: Mapping[str, Sequence[float]] | str | Path | None = None,
-        cache: Mapping[str, Sequence[str]] | None = None,
-        cache_path: str | Path | None = None,
-        languages: Iterable[str] = ("en",),
-        max_neighbors: int = 50,
-        min_similarity: float = 0.0,
-        seed: int | None = None,
-    ) -> None:
-        super().__init__(seed=seed)
-        self._languages = {language.lower() for language in languages}
-        if not self._languages:
-            self._languages = {"en"}
-        self._max_neighbors = max(1, max_neighbors)
-        self._min_similarity = min_similarity
-        self._cache: MutableMapping[str, list[str]] = {}
-        self._cache_path: Path | None = Path(cache_path) if cache_path is not None else None
-        self._cache_checksum: str | None = None
-        if self._cache_path is not None:
-            snapshot = _load_cache_file(self._cache_path)
-            self._cache.update(snapshot.entries)
-            self._cache_checksum = snapshot.checksum
-        if cache is not None:
-            for key, values in cache.items():
-                self._cache[str(key)] = [str(value) for value in values]
-        self._cache_dirty = False
-
-        prepared_source = self._prepare_source(source)
-        self._backend = VectorLexicon(
-            source=prepared_source if prepared_source else None,
-            max_neighbors=self._max_neighbors,
-            min_similarity=self._min_similarity,
-            case_sensitive=True,
-            seed=seed,
-        )
-
-    def _prepare_source(
-        self, source: Mapping[str, Sequence[float]] | str | Path | None
-    ) -> Mapping[str, Sequence[float]]:
-        if source is None:
-            return {}
-        if isinstance(source, Mapping):
-            prepared: dict[str, list[float]] = {}
-            for key, vector in source.items():
-                lang = _language_from_concept(key)
-                if lang is None or lang not in self._languages:
-                    continue
-                prepared[key] = [float(value) for value in vector]
-            return prepared
-        path = Path(source)
-        embeddings = _load_numberbatch(path, languages=self._languages)
-        return embeddings
-
-    def reseed(self, seed: int | None) -> None:
-        super().reseed(seed)
-        self._backend.reseed(seed)
-
-    def _concept_candidates(self, normalized: str) -> list[str]:
-        terms = _concept_terms(normalized)
-        concepts = []
-        for language in sorted(self._languages):
-            for term in terms:
-                concepts.append(f"/c/{language}/{term}")
-        return concepts
-
-    def _collect_synonyms(self, normalized: str) -> list[str]:
-        candidates: list[str] = []
-        seen: set[str] = set()
-        for concept in self._concept_candidates(normalized):
-            neighbors = self._backend.precompute(concept, limit=self._max_neighbors)
-            for neighbor in neighbors:
-                lang = _language_from_concept(neighbor)
-                if lang is None or lang not in self._languages:
-                    continue
-                surface = _surface_from_concept(neighbor)
-                if surface is None:
-                    continue
-                surface_norm = _normalize_phrase(surface)
-                if not surface_norm or surface_norm == normalized:
-                    continue
-                if surface_norm in seen:
-                    continue
-                seen.add(surface_norm)
-                candidates.append(surface)
-        return candidates
-
-    def _ensure_cached(self, normalized: str) -> list[str]:
-        if normalized in self._cache:
-            return self._cache[normalized]
-        synonyms = self._collect_synonyms(normalized)
-        self._cache[normalized] = synonyms
-        if self._cache_path is not None:
-            self._cache_dirty = True
-        return synonyms
-
-    def get_synonyms(self, word: str, pos: str | None = None, n: int = 5) -> list[str]:
-        normalized = _normalize_phrase(word)
-        if not normalized:
-            return []
-        synonyms = self._ensure_cached(normalized)
-        return self._deterministic_sample(synonyms, limit=n, word=word, pos=pos)
-
-    def precompute(self, word: str) -> list[str]:
-        normalized = _normalize_phrase(word)
-        if not normalized:
-            return []
-        return list(self._ensure_cached(normalized))
-
-    def export_cache(self) -> dict[str, list[str]]:
-        return {key: list(values) for key, values in self._cache.items()}
-
-    @classmethod
-    def load_cache(cls, path: str | Path) -> CacheSnapshot:
-        """Load and validate a persisted ConceptNet cache file."""
-        return _load_cache_file(Path(path))
-
-    def save_cache(self, path: str | Path | None = None) -> Path:
-        if path is None:
-            if self._cache_path is None:
-                raise RuntimeError("No cache path supplied to GraphLexicon.")
-            target = self._cache_path
-        else:
-            target = Path(path)
-            self._cache_path = target
-        snapshot = _write_cache_file(target, self._cache)
-        self._cache_checksum = snapshot.checksum
-        self._cache_dirty = False
-        return target
-
-    def supports_pos(self, pos: str | None) -> bool:
-        return True
-
-    def __repr__(self) -> str:  # pragma: no cover - debug helper
-        adapter = getattr(self._backend, "_adapter", None)
-        state = "loaded" if adapter else "empty"
-        return (
-            f"GraphLexicon(languages={sorted(self._languages)!r}, "
-            f"max_neighbors={self._max_neighbors}, seed={self.seed!r}, state={state})"
-        )