glitchlings-0.2.6-cp310-cp310-macosx_11_0_universal2.whl → glitchlings-0.4.0-cp310-cp310-macosx_11_0_universal2.whl

This diff compares the contents of two package versions publicly released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.

Potentially problematic release.

@@ -0,0 +1,182 @@
+"""WordNet-backed lexicon implementation."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+try:  # pragma: no cover - exercised when NLTK is available
+    import nltk  # type: ignore[import]
+except ModuleNotFoundError as exc:  # pragma: no cover - triggered when NLTK missing
+    nltk = None  # type: ignore[assignment]
+    find = None  # type: ignore[assignment]
+    _NLTK_IMPORT_ERROR = exc
+else:  # pragma: no cover - executed when NLTK is present
+    from nltk.corpus.reader import WordNetCorpusReader as _WordNetCorpusReader  # type: ignore[import]
+    from nltk.data import find as _nltk_find  # type: ignore[import]
+
+    find = _nltk_find
+    _NLTK_IMPORT_ERROR = None
+
+if TYPE_CHECKING:  # pragma: no cover - typing aid only
+    from nltk.corpus.reader import WordNetCorpusReader  # type: ignore[import]
+else:  # pragma: no cover - runtime fallback to avoid hard dependency
+    WordNetCorpusReader = Any
+
+if nltk is not None:  # pragma: no cover - guarded by import success
+    try:
+        from nltk.corpus import wordnet as _WORDNET_MODULE  # type: ignore[import]
+    except ModuleNotFoundError:  # pragma: no cover - only hit on namespace packages
+        _WORDNET_MODULE = None
+    else:
+        WordNetCorpusReader = _WordNetCorpusReader  # type: ignore[assignment]
+else:
+    _WORDNET_MODULE = None
+
+from . import Lexicon
+
+_WORDNET_HANDLE: WordNetCorpusReader | Any | None = _WORDNET_MODULE
+_wordnet_ready = False
+
+_VALID_POS: tuple[str, ...] = ("n", "v", "a", "r")
+
+
+def _require_nltk() -> None:
+    """Ensure the NLTK dependency is present before continuing."""
+
+    if nltk is None or find is None:
+        message = (
+            "The NLTK package is required for WordNet-backed lexicons; install "
+            "`nltk` and its WordNet corpus manually to enable this backend."
+        )
+        if '_NLTK_IMPORT_ERROR' in globals() and _NLTK_IMPORT_ERROR is not None:
+            raise RuntimeError(message) from _NLTK_IMPORT_ERROR
+        raise RuntimeError(message)
+
+
+def dependencies_available() -> bool:
+    """Return ``True`` when the runtime NLTK dependency is present."""
+
+    return nltk is not None and find is not None
+
+
+def _load_wordnet_reader() -> WordNetCorpusReader:
+    """Return a WordNet corpus reader from the downloaded corpus files."""
+
+    _require_nltk()
+
+    try:
+        root = find("corpora/wordnet")
+    except LookupError:
+        try:
+            zip_root = find("corpora/wordnet.zip")
+        except LookupError as exc:
+            raise RuntimeError(
+                "The NLTK WordNet corpus is not installed; run `nltk.download('wordnet')`."
+            ) from exc
+        root = zip_root.join("wordnet/")
+
+    return WordNetCorpusReader(root, None)
+
+
+def _wordnet(force_refresh: bool = False) -> WordNetCorpusReader | Any:
+    """Retrieve the active WordNet handle, rebuilding it on demand."""
+
+    global _WORDNET_HANDLE
+
+    if force_refresh:
+        _WORDNET_HANDLE = _WORDNET_MODULE
+
+    if _WORDNET_HANDLE is not None:
+        return _WORDNET_HANDLE
+
+    _WORDNET_HANDLE = _load_wordnet_reader()
+    return _WORDNET_HANDLE
+
+
+def ensure_wordnet() -> None:
+    """Ensure the WordNet corpus is available before use."""
+
+    global _wordnet_ready
+    if _wordnet_ready:
+        return
+
+    _require_nltk()
+
+    resource = _wordnet()
+
+    try:
+        resource.ensure_loaded()
+    except LookupError:
+        nltk.download("wordnet", quiet=True)
+        try:
+            resource = _wordnet(force_refresh=True)
+            resource.ensure_loaded()
+        except LookupError as exc:  # pragma: no cover - only triggered when download fails
+            raise RuntimeError(
+                "Unable to load NLTK WordNet corpus for synonym lookups."
+            ) from exc
+
+    _wordnet_ready = True
+
+
+def _collect_synonyms(word: str, parts_of_speech: tuple[str, ...]) -> list[str]:
+    """Gather deterministic synonym candidates for the supplied word."""
+
+    normalized_word = word.lower()
+    wordnet = _wordnet()
+    synonyms: set[str] = set()
+    for pos_tag in parts_of_speech:
+        synsets = wordnet.synsets(word, pos=pos_tag)
+        if not synsets:
+            continue

+        for synset in synsets:
+            lemmas_list = [lemma.name() for lemma in synset.lemmas()]
+            if not lemmas_list:
+                continue
+
+            filtered = []
+            for lemma_str in lemmas_list:
+                cleaned = lemma_str.replace("_", " ")
+                if cleaned.lower() != normalized_word:
+                    filtered.append(cleaned)
+
+            if filtered:
+                synonyms.update(filtered)
+                break
+
+        if synonyms:
+            break
+
+    return sorted(synonyms)
+
+
+class WordNetLexicon(Lexicon):
+    """Lexicon that retrieves synonyms from the NLTK WordNet corpus."""
+
+    def get_synonyms(
+        self, word: str, pos: str | None = None, n: int = 5
+    ) -> list[str]:
+        ensure_wordnet()
+
+        if pos is None:
+            parts: tuple[str, ...] = _VALID_POS
+        else:
+            normalized_pos = pos.lower()
+            if normalized_pos not in _VALID_POS:
+                return []
+            parts = (normalized_pos,)
+
+        synonyms = _collect_synonyms(word, parts)
+        return self._deterministic_sample(synonyms, limit=n, word=word, pos=pos)
+
+    def supports_pos(self, pos: str | None) -> bool:
+        if pos is None:
+            return True
+        return pos.lower() in _VALID_POS
+
+    def __repr__(self) -> str:  # pragma: no cover - trivial representation
+        return f"WordNetLexicon(seed={self.seed!r})"
+
+
+__all__ = ["WordNetLexicon", "dependencies_available", "ensure_wordnet"]
glitchlings/main.py CHANGED
@@ -8,6 +8,7 @@ from pathlib import Path
 import sys
 
 from . import SAMPLE_TEXT
+from .config import DEFAULT_ATTACK_SEED, build_gaggle, load_attack_config
 from .zoo import (
     Glitchling,
     Gaggle,
@@ -53,7 +54,7 @@ def build_parser() -> argparse.ArgumentParser:
         "-s",
         "--seed",
         type=int,
-        default=151,
+        default=None,
         help="Seed controlling deterministic corruption order (default: 151).",
     )
     parser.add_argument(
@@ -77,9 +78,83 @@ def build_parser() -> argparse.ArgumentParser:
         action="store_true",
         help="List available glitchlings and exit.",
     )
+    parser.add_argument(
+        "-c",
+        "--config",
+        type=Path,
+        help="Load glitchlings from a YAML configuration file.",
+    )
     return parser
 
 
+def build_lexicon_parser() -> argparse.ArgumentParser:
+    builder = argparse.ArgumentParser(
+        prog="glitchlings build-lexicon",
+        description=(
+            "Generate deterministic synonym caches using vector embeddings so "
+            "they can be distributed without bundling large models."
+        ),
+    )
+    builder.add_argument(
+        "--source",
+        required=True,
+        help=(
+            "Vector source specification. Use 'spacy:<model>' for spaCy pipelines "
+            "or provide a path to a gensim KeyedVectors/word2vec file."
+        ),
+    )
+    builder.add_argument(
+        "--output",
+        required=True,
+        type=Path,
+        help="Path to the JSON file that will receive the synonym cache.",
+    )
+    builder.add_argument(
+        "--tokens",
+        type=Path,
+        help="Optional newline-delimited vocabulary file to restrict generation.",
+    )
+    builder.add_argument(
+        "--max-neighbors",
+        type=int,
+        default=50,
+        help="Number of nearest neighbours to cache per token (default: 50).",
+    )
+    builder.add_argument(
+        "--min-similarity",
+        type=float,
+        default=0.0,
+        help="Minimum cosine similarity required to keep a synonym (default: 0.0).",
+    )
+    builder.add_argument(
+        "--seed",
+        type=int,
+        help="Optional deterministic seed to bake into the resulting cache.",
+    )
+    builder.add_argument(
+        "--case-sensitive",
+        action="store_true",
+        help="Preserve original casing instead of lower-casing cache keys.",
+    )
+    builder.add_argument(
+        "--normalizer",
+        choices=["lower", "identity"],
+        default="lower",
+        help="Token normalization strategy for cache keys (default: lower).",
+    )
+    builder.add_argument(
+        "--limit",
+        type=int,
+        help="Optional maximum number of tokens to process.",
+    )
+    builder.add_argument(
+        "--overwrite",
+        action="store_true",
+        help="Allow overwriting an existing cache file.",
+    )
+    return builder
+
+
 def list_glitchlings() -> None:
     """Print information about the available built-in glitchlings."""
 
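The new sub-parser can be exercised in isolation; a quick sketch (the spaCy model name and file names are placeholders, and this does not touch the heavier vector backend):

```python
# Illustrative parse of the build-lexicon arguments added above.
from glitchlings.main import build_lexicon_parser

args = build_lexicon_parser().parse_args(
    [
        "--source", "spacy:en_core_web_md",   # placeholder model
        "--output", "synonyms.json",          # placeholder path
        "--max-neighbors", "25",
        "--min-similarity", "0.4",
    ]
)
print(args.source, args.max_neighbors, args.min_similarity)
# spacy:en_core_web_md 25 0.4
```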
@@ -129,10 +204,27 @@ def read_text(args: argparse.Namespace, parser: argparse.ArgumentParser) -> str:
 
 
 def summon_glitchlings(
-    names: list[str] | None, parser: argparse.ArgumentParser, seed: int
+    names: list[str] | None,
+    parser: argparse.ArgumentParser,
+    seed: int | None,
+    *,
+    config_path: Path | None = None,
 ) -> Gaggle:
     """Instantiate the requested glitchlings and bundle them in a ``Gaggle``."""
 
+    if config_path is not None:
+        if names:
+            parser.error("Cannot combine --config with --glitchling.")
+            raise AssertionError("parser.error should exit")
+
+        try:
+            config = load_attack_config(config_path)
+        except (TypeError, ValueError) as exc:
+            parser.error(str(exc))
+            raise AssertionError("parser.error should exit")
+
+        return build_gaggle(config, seed_override=seed)
+
     if names:
         normalized: list[str | Glitchling] = []
         for specification in names:
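A sketch of the call path the new `--config` branch takes. `load_attack_config` and `build_gaggle` are real imports from this release, but the YAML schema itself is not part of this diff, so the file name is a placeholder:

```python
# Hypothetical call path for the --config branch above.
from pathlib import Path

from glitchlings.config import build_gaggle, load_attack_config

config = load_attack_config(Path("attack.yaml"))   # placeholder file
gaggle = build_gaggle(config, seed_override=None)  # None presumably defers to the config
print(gaggle("The quick brown fox."))              # a Gaggle is callable, per run_cli
```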
@@ -144,8 +236,10 @@ def summon_glitchlings(
     else:
         normalized = DEFAULT_GLITCHLING_NAMES
 
+    effective_seed = seed if seed is not None else DEFAULT_ATTACK_SEED
+
     try:
-        return summon(normalized, seed=seed)
+        return summon(normalized, seed=effective_seed)
     except ValueError as exc:
         parser.error(str(exc))
         raise AssertionError("parser.error should exit")
@@ -187,7 +281,12 @@ def run_cli(args: argparse.Namespace, parser: argparse.ArgumentParser) -> int:
         return 0
 
     text = read_text(args, parser)
-    gaggle = summon_glitchlings(args.glitchlings, parser, args.seed)
+    gaggle = summon_glitchlings(
+        args.glitchlings,
+        parser,
+        args.seed,
+        config_path=args.config,
+    )
 
     corrupted = gaggle(text)
@@ -199,6 +298,37 @@ def run_cli(args: argparse.Namespace, parser: argparse.ArgumentParser) -> int:
     return 0
 
 
+def run_build_lexicon(args: argparse.Namespace) -> int:
+    """Delegate to the vector lexicon cache builder using CLI arguments."""
+
+    from glitchlings.lexicon.vector import main as vector_main
+
+    vector_args = [
+        "--source",
+        args.source,
+        "--output",
+        str(args.output),
+        "--max-neighbors",
+        str(args.max_neighbors),
+        "--min-similarity",
+        str(args.min_similarity),
+        "--normalizer",
+        args.normalizer,
+    ]
+    if args.tokens is not None:
+        vector_args.extend(["--tokens", str(args.tokens)])
+    if args.seed is not None:
+        vector_args.extend(["--seed", str(args.seed)])
+    if args.case_sensitive:
+        vector_args.append("--case-sensitive")
+    if args.limit is not None:
+        vector_args.extend(["--limit", str(args.limit)])
+    if args.overwrite:
+        vector_args.append("--overwrite")
+
+    return vector_main(vector_args)
+
+
 def main(argv: list[str] | None = None) -> int:
     """Entry point for the ``glitchlings`` command line interface.
@@ -209,8 +339,18 @@ def main(argv: list[str] | None = None) -> int:
         int: Exit code suitable for use with ``sys.exit``.
     """
 
+    if argv is None:
+        raw_args = sys.argv[1:]
+    else:
+        raw_args = list(argv)
+
+    if raw_args and raw_args[0] == "build-lexicon":
+        builder = build_lexicon_parser()
+        args = builder.parse_args(raw_args[1:])
+        return run_build_lexicon(args)
+
     parser = build_parser()
-    args = parser.parse_args(argv)
+    args = parser.parse_args(raw_args)
     return run_cli(args, parser)
 
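A minimal sketch of the resulting CLI behaviour, assuming the remaining arguments are optional (the full parser is not shown in these hunks): `build-lexicon` is intercepted before the main parser runs, and a missing `--seed` now parses as `None` rather than `151`, with `DEFAULT_ATTACK_SEED` restoring the old behaviour downstream.

```python
# Sketch of the seed fallback; assumes no required positional arguments.
from glitchlings.main import build_parser

parser = build_parser()
assert parser.parse_args([]).seed is None        # default changed from 151 to None
assert parser.parse_args(["-s", "7"]).seed == 7  # explicit seeds pass through unchanged
```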
@@ -6,6 +6,7 @@ from typing import Any
 from .typogre import Typogre, typogre
 from .mim1c import Mim1c, mim1c
 from .jargoyle import Jargoyle, jargoyle, dependencies_available as _jargoyle_available
+from .adjax import Adjax, adjax
 from .reduple import Reduple, reduple
 from .rushmore import Rushmore, rushmore
 from .redactyl import Redactyl, redactyl
@@ -20,6 +21,8 @@ __all__ = [
     "mim1c",
     "Jargoyle",
     "jargoyle",
+    "Adjax",
+    "adjax",
     "Reduple",
     "reduple",
     "Rushmore",
@@ -36,6 +39,7 @@ __all__ = [
     "BUILTIN_GLITCHLINGS",
     "DEFAULT_GLITCHLING_NAMES",
     "parse_glitchling_spec",
+    "get_glitchling_class",
 ]
 
 _HAS_JARGOYLE = _jargoyle_available()
@@ -43,7 +47,7 @@ _HAS_JARGOYLE = _jargoyle_available()
 _BUILTIN_GLITCHLING_LIST: list[Glitchling] = [typogre, mim1c]
 if _HAS_JARGOYLE:
     _BUILTIN_GLITCHLING_LIST.append(jargoyle)
-_BUILTIN_GLITCHLING_LIST.extend([reduple, rushmore, redactyl, scannequin, zeedub])
+_BUILTIN_GLITCHLING_LIST.extend([adjax, reduple, rushmore, redactyl, scannequin, zeedub])
 
 BUILTIN_GLITCHLINGS: dict[str, Glitchling] = {
     glitchling.name.lower(): glitchling for glitchling in _BUILTIN_GLITCHLING_LIST
@@ -52,6 +56,7 @@ BUILTIN_GLITCHLINGS: dict[str, Glitchling] = {
 _BUILTIN_GLITCHLING_TYPES: dict[str, type[Glitchling]] = {
     typogre.name.lower(): Typogre,
     mim1c.name.lower(): Mim1c,
+    adjax.name.lower(): Adjax,
     reduple.name.lower(): Reduple,
     rushmore.name.lower(): Rushmore,
     redactyl.name.lower(): Redactyl,
@@ -121,6 +126,20 @@ def parse_glitchling_spec(specification: str) -> Glitchling:
         raise ValueError(f"Failed to instantiate glitchling '{name}': {exc}") from exc
 
 
+def get_glitchling_class(name: str) -> type[Glitchling]:
+    """Look up the glitchling class registered under ``name``."""
+
+    key = name.strip().lower()
+    if not key:
+        raise ValueError("Glitchling name cannot be empty.")
+
+    glitchling_type = _BUILTIN_GLITCHLING_TYPES.get(key)
+    if glitchling_type is None:
+        raise ValueError(f"Glitchling '{name}' not found.")
+
+    return glitchling_type
+
+
 def summon(glitchlings: list[str | Glitchling], seed: int = 151) -> Gaggle:
     """Summon glitchlings by name (using defaults) or instance (to change parameters)."""
 
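A usage sketch for the new lookup helper; `Adjax`'s constructor arguments are not shown in this diff, so the bare call is an assumption:

```python
# Registry lookup added in this release; lookup is case-insensitive.
from glitchlings.zoo import get_glitchling_class

adjax_cls = get_glitchling_class("Adjax")
glitchling = adjax_cls()  # assumed default constructor
try:
    get_glitchling_class("unknown")
except ValueError as exc:
    print(exc)  # Glitchling 'unknown' not found.
```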
@@ -0,0 +1,55 @@
+from __future__ import annotations
+
+import random
+from typing import Sequence
+
+
+def weighted_sample_without_replacement(
+    population: Sequence[int],
+    weights: Sequence[float],
+    *,
+    k: int,
+    rng: random.Random,
+) -> list[int]:
+    """Sample ``k`` unique indices from ``population`` using ``weights``.
+
+    Mirrors the behaviour used by several glitchlings while centralising error
+    handling and RNG interactions so the Python and Rust implementations remain
+    aligned.
+    """
+
+    if k < 0:
+        raise ValueError("Sample size cannot be negative")
+
+    if len(population) != len(weights):
+        raise ValueError("Population and weight sequences must be the same length")
+
+    items = list(zip(population, weights))
+    count = len(items)
+    if k == 0 or count == 0:
+        return []
+
+    if k > count:
+        raise ValueError("Sample larger than population or is negative")
+
+    selections: list[int] = []
+    for _ in range(k):
+        total_weight = sum(weight for _, weight in items)
+        if total_weight <= 0.0:
+            chosen_index = rng.randrange(len(items))
+        else:
+            threshold = rng.random() * total_weight
+            cumulative = 0.0
+            chosen_index = len(items) - 1
+            for idx, (_, weight) in enumerate(items):
+                cumulative += weight
+                if cumulative >= threshold:
+                    chosen_index = idx
+                    break
+        value, _ = items.pop(chosen_index)
+        selections.append(value)
+
+    return selections
+
+
+__all__ = ["weighted_sample_without_replacement"]
@@ -0,0 +1,104 @@
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from typing import Sequence
+
+_WORD_SPLIT_PATTERN = re.compile(r"(\s+)")
+_TOKEN_EDGES_PATTERN = re.compile(r"^(\W*)(.*?)(\W*)$")
+
+
+def split_preserving_whitespace(text: str) -> list[str]:
+    """Split text while keeping whitespace tokens for stable reconstruction."""
+
+    return _WORD_SPLIT_PATTERN.split(text)
+
+
+def split_token_edges(token: str) -> tuple[str, str, str]:
+    """Return leading, core, and trailing segments for a token."""
+
+    match = _TOKEN_EDGES_PATTERN.match(token)
+    if match is None:
+        return "", token, ""
+    return match.group(1), match.group(2), match.group(3)
+
+
+def token_core_length(token: str) -> int:
+    """Return the length of the main word characters for weighting heuristics."""
+
+    _, core, _ = split_token_edges(token)
+    candidate = core if core else token
+    length = len(candidate)
+    if length <= 0:
+        stripped = token.strip()
+        length = len(stripped) if stripped else len(token)
+    if length <= 0:
+        length = 1
+    return length
+
+
+@dataclass(frozen=True)
+class WordToken:
+    """Metadata describing a non-whitespace token yielded by word splitters."""
+
+    index: int
+    prefix: str
+    core: str
+    suffix: str
+    core_length: int
+
+    @property
+    def has_core(self) -> bool:
+        """Return ``True`` when the token contains at least one core character."""
+
+        return bool(self.core)
+
+
+def collect_word_tokens(
+    tokens: Sequence[str],
+    *,
+    skip_first_word: bool = False,
+) -> list[WordToken]:
+    """Return structured metadata for non-whitespace tokens within ``tokens``.
+
+    Args:
+        tokens: Token sequence produced by :func:`split_preserving_whitespace`.
+        skip_first_word: Exclude the first candidate token (used by Rushmore to
+            preserve leading words).
+    """
+
+    start = 2 if skip_first_word else 0
+    collected: list[WordToken] = []
+    for index in range(start, len(tokens), 2):
+        token = tokens[index]
+        if not token or token.isspace():
+            continue
+
+        prefix, core, suffix = split_token_edges(token)
+        core_length = len(core)
+        if core_length <= 0:
+            stripped = token.strip()
+            core_length = len(stripped) if stripped else len(token)
+        if core_length <= 0:
+            core_length = 1
+
+        collected.append(
+            WordToken(
+                index=index,
+                prefix=prefix,
+                core=core,
+                suffix=suffix,
+                core_length=core_length,
+            )
+        )
+
+    return collected
+
+
+__all__ = [
+    "split_preserving_whitespace",
+    "split_token_edges",
+    "token_core_length",
+    "WordToken",
+    "collect_word_tokens",
+]
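A round-trip sketch for these utilities, assuming the helpers above are in scope (the diff omits this file's header). Because the split pattern captures the whitespace, joining the tokens reproduces the input exactly, word tokens land on even indices, and `skip_first_word=True` leaves the leading word alone, as Rushmore expects:

```python
# Round-trip and metadata demo for the splitter utilities above.
text = "Well, the quick  brown fox!"
tokens = split_preserving_whitespace(text)
assert "".join(tokens) == text  # whitespace tokens are preserved verbatim

for word in collect_word_tokens(tokens, skip_first_word=True):
    print(word.index, word.prefix or "-", word.core, word.suffix or "-")
# 2 - the -
# 4 - quick -
# 6 - brown -
# 8 - fox !
```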