glitchlings 0.4.2__cp310-cp310-macosx_11_0_universal2.whl → 0.4.3__cp310-cp310-macosx_11_0_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of glitchlings might be problematic. Click here for more details.
- glitchlings/__init__.py +4 -0
- glitchlings/_zoo_rust.cpython-310-darwin.so +0 -0
- glitchlings/compat.py +80 -11
- glitchlings/config.py +32 -19
- glitchlings/config.toml +1 -1
- glitchlings/dlc/__init__.py +3 -1
- glitchlings/dlc/pytorch.py +216 -0
- glitchlings/dlc/pytorch_lightning.py +233 -0
- glitchlings/lexicon/__init__.py +5 -15
- glitchlings/lexicon/_cache.py +21 -15
- glitchlings/lexicon/data/default_vector_cache.json +80 -14
- glitchlings/lexicon/vector.py +94 -15
- glitchlings/lexicon/wordnet.py +66 -25
- glitchlings/main.py +21 -11
- glitchlings/zoo/__init__.py +5 -1
- glitchlings/zoo/adjax.py +2 -2
- glitchlings/zoo/apostrofae.py +128 -0
- glitchlings/zoo/assets/__init__.py +0 -0
- glitchlings/zoo/assets/apostrofae_pairs.json +32 -0
- glitchlings/zoo/core.py +40 -14
- glitchlings/zoo/jargoyle.py +44 -34
- glitchlings/zoo/redactyl.py +11 -8
- glitchlings/zoo/reduple.py +2 -2
- glitchlings/zoo/rushmore.py +2 -2
- glitchlings/zoo/scannequin.py +2 -2
- glitchlings/zoo/typogre.py +5 -2
- glitchlings/zoo/zeedub.py +5 -2
- {glitchlings-0.4.2.dist-info → glitchlings-0.4.3.dist-info}/METADATA +35 -2
- glitchlings-0.4.3.dist-info/RECORD +46 -0
- glitchlings/lexicon/graph.py +0 -282
- glitchlings-0.4.2.dist-info/RECORD +0 -42
- {glitchlings-0.4.2.dist-info → glitchlings-0.4.3.dist-info}/WHEEL +0 -0
- {glitchlings-0.4.2.dist-info → glitchlings-0.4.3.dist-info}/entry_points.txt +0 -0
- {glitchlings-0.4.2.dist-info → glitchlings-0.4.3.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.4.2.dist-info → glitchlings-0.4.3.dist-info}/top_level.txt +0 -0
glitchlings/lexicon/wordnet.py
CHANGED
|
@@ -4,49 +4,74 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from importlib import import_module
|
|
6
6
|
from pathlib import Path
|
|
7
|
-
from
|
|
7
|
+
from types import ModuleType
|
|
8
|
+
from typing import Any, Callable, Protocol, Sequence, cast
|
|
8
9
|
|
|
9
10
|
from ..compat import nltk as _nltk_dependency
|
|
10
11
|
from . import LexiconBackend
|
|
11
12
|
from ._cache import CacheSnapshot
|
|
12
13
|
|
|
13
|
-
nltk = _nltk_dependency.get() # type: ignore[assignment]
|
|
14
|
-
_NLTK_IMPORT_ERROR = _nltk_dependency.error
|
|
15
14
|
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
WordNetCorpusReader = Any
|
|
15
|
+
class _LemmaProtocol(Protocol):
|
|
16
|
+
def name(self) -> str:
|
|
17
|
+
...
|
|
20
18
|
|
|
21
|
-
|
|
22
|
-
|
|
19
|
+
|
|
20
|
+
class _SynsetProtocol(Protocol):
|
|
21
|
+
def lemmas(self) -> Sequence[_LemmaProtocol]:
|
|
22
|
+
...
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class _WordNetResource(Protocol):
|
|
26
|
+
def synsets(self, word: str, pos: str | None = None) -> Sequence[_SynsetProtocol]:
|
|
27
|
+
...
|
|
28
|
+
|
|
29
|
+
def ensure_loaded(self) -> None:
|
|
30
|
+
...
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
WordNetCorpusReaderFactory = Callable[[Any, Any], _WordNetResource]
|
|
34
|
+
|
|
35
|
+
nltk: ModuleType | None = _nltk_dependency.get()
|
|
36
|
+
_NLTK_IMPORT_ERROR: ModuleNotFoundError | None = _nltk_dependency.error
|
|
37
|
+
|
|
38
|
+
WordNetCorpusReader: WordNetCorpusReaderFactory | None = None
|
|
39
|
+
find: Callable[[str], Any] | None = None
|
|
40
|
+
_WORDNET_MODULE: _WordNetResource | None = None
|
|
23
41
|
|
|
24
42
|
if nltk is not None: # pragma: no cover - guarded by import success
|
|
25
43
|
try:
|
|
26
44
|
corpus_reader_module = import_module("nltk.corpus.reader")
|
|
27
|
-
WordNetCorpusReader = corpus_reader_module.WordNetCorpusReader # type: ignore[assignment]
|
|
28
45
|
except ModuleNotFoundError as exc: # pragma: no cover - triggered when corpus missing
|
|
29
46
|
if _NLTK_IMPORT_ERROR is None:
|
|
30
|
-
_NLTK_IMPORT_ERROR = exc
|
|
47
|
+
_NLTK_IMPORT_ERROR = exc
|
|
31
48
|
else:
|
|
49
|
+
reader_candidate = getattr(corpus_reader_module, "WordNetCorpusReader", None)
|
|
50
|
+
if reader_candidate is not None:
|
|
51
|
+
WordNetCorpusReader = cast(WordNetCorpusReaderFactory, reader_candidate)
|
|
52
|
+
|
|
32
53
|
try:
|
|
33
54
|
data_module = import_module("nltk.data")
|
|
34
55
|
except ModuleNotFoundError as exc: # pragma: no cover - triggered when data missing
|
|
35
56
|
if _NLTK_IMPORT_ERROR is None:
|
|
36
|
-
_NLTK_IMPORT_ERROR = exc
|
|
57
|
+
_NLTK_IMPORT_ERROR = exc
|
|
37
58
|
else:
|
|
38
|
-
|
|
59
|
+
locator = getattr(data_module, "find", None)
|
|
60
|
+
if callable(locator):
|
|
61
|
+
find = cast(Callable[[str], Any], locator)
|
|
39
62
|
|
|
40
63
|
try:
|
|
41
|
-
|
|
64
|
+
module_candidate = import_module("nltk.corpus.wordnet")
|
|
42
65
|
except ModuleNotFoundError: # pragma: no cover - only hit on namespace packages
|
|
43
66
|
_WORDNET_MODULE = None
|
|
67
|
+
else:
|
|
68
|
+
_WORDNET_MODULE = cast(_WordNetResource, module_candidate)
|
|
44
69
|
else:
|
|
45
|
-
nltk = None
|
|
70
|
+
nltk = None
|
|
46
71
|
find = None
|
|
47
72
|
_WORDNET_MODULE = None
|
|
48
73
|
|
|
49
|
-
_WORDNET_HANDLE:
|
|
74
|
+
_WORDNET_HANDLE: _WordNetResource | None = _WORDNET_MODULE
|
|
50
75
|
_wordnet_ready = False
|
|
51
76
|
|
|
52
77
|
_VALID_POS: tuple[str, ...] = ("n", "v", "a", "r")
|
|
@@ -69,15 +94,22 @@ def dependencies_available() -> bool:
|
|
|
69
94
|
return nltk is not None and find is not None
|
|
70
95
|
|
|
71
96
|
|
|
72
|
-
def _load_wordnet_reader() ->
|
|
97
|
+
def _load_wordnet_reader() -> _WordNetResource:
|
|
73
98
|
"""Return a WordNet corpus reader from the downloaded corpus files."""
|
|
74
99
|
_require_nltk()
|
|
75
100
|
|
|
101
|
+
if WordNetCorpusReader is None:
|
|
102
|
+
raise RuntimeError("The NLTK WordNet corpus reader is unavailable.")
|
|
103
|
+
|
|
104
|
+
locator = find
|
|
105
|
+
if locator is None:
|
|
106
|
+
raise RuntimeError("The NLTK data locator is unavailable.")
|
|
107
|
+
|
|
76
108
|
try:
|
|
77
|
-
root =
|
|
109
|
+
root = locator("corpora/wordnet")
|
|
78
110
|
except LookupError:
|
|
79
111
|
try:
|
|
80
|
-
zip_root =
|
|
112
|
+
zip_root = locator("corpora/wordnet.zip")
|
|
81
113
|
except LookupError as exc:
|
|
82
114
|
raise RuntimeError(
|
|
83
115
|
"The NLTK WordNet corpus is not installed; run `nltk.download('wordnet')`."
|
|
@@ -87,18 +119,20 @@ def _load_wordnet_reader() -> WordNetCorpusReader:
|
|
|
87
119
|
return WordNetCorpusReader(root, None)
|
|
88
120
|
|
|
89
121
|
|
|
90
|
-
def _wordnet(force_refresh: bool = False) ->
|
|
122
|
+
def _wordnet(force_refresh: bool = False) -> _WordNetResource:
|
|
91
123
|
"""Retrieve the active WordNet handle, rebuilding it on demand."""
|
|
92
124
|
global _WORDNET_HANDLE
|
|
93
125
|
|
|
94
126
|
if force_refresh:
|
|
95
127
|
_WORDNET_HANDLE = _WORDNET_MODULE
|
|
96
128
|
|
|
97
|
-
|
|
98
|
-
|
|
129
|
+
cached = _WORDNET_HANDLE
|
|
130
|
+
if cached is not None:
|
|
131
|
+
return cached
|
|
99
132
|
|
|
100
|
-
|
|
101
|
-
|
|
133
|
+
resource = _load_wordnet_reader()
|
|
134
|
+
_WORDNET_HANDLE = resource
|
|
135
|
+
return resource
|
|
102
136
|
|
|
103
137
|
|
|
104
138
|
def ensure_wordnet() -> None:
|
|
@@ -110,11 +144,14 @@ def ensure_wordnet() -> None:
|
|
|
110
144
|
_require_nltk()
|
|
111
145
|
|
|
112
146
|
resource = _wordnet()
|
|
147
|
+
nltk_module = nltk
|
|
148
|
+
if nltk_module is None:
|
|
149
|
+
raise RuntimeError("The NLTK dependency is unexpectedly unavailable.")
|
|
113
150
|
|
|
114
151
|
try:
|
|
115
152
|
resource.ensure_loaded()
|
|
116
153
|
except LookupError:
|
|
117
|
-
|
|
154
|
+
nltk_module.download("wordnet", quiet=True)
|
|
118
155
|
try:
|
|
119
156
|
resource = _wordnet(force_refresh=True)
|
|
120
157
|
resource.ensure_loaded()
|
|
@@ -159,6 +196,7 @@ class WordNetLexicon(LexiconBackend):
|
|
|
159
196
|
"""Lexicon that retrieves synonyms from the NLTK WordNet corpus."""
|
|
160
197
|
|
|
161
198
|
def get_synonyms(self, word: str, pos: str | None = None, n: int = 5) -> list[str]:
|
|
199
|
+
"""Return up to ``n`` WordNet lemmas for ``word`` filtered by ``pos`` if provided."""
|
|
162
200
|
ensure_wordnet()
|
|
163
201
|
|
|
164
202
|
if pos is None:
|
|
@@ -173,15 +211,18 @@ class WordNetLexicon(LexiconBackend):
|
|
|
173
211
|
return self._deterministic_sample(synonyms, limit=n, word=word, pos=pos)
|
|
174
212
|
|
|
175
213
|
def supports_pos(self, pos: str | None) -> bool:
|
|
214
|
+
"""Return ``True`` when ``pos`` is unset or recognised by the WordNet corpus."""
|
|
176
215
|
if pos is None:
|
|
177
216
|
return True
|
|
178
217
|
return pos.lower() in _VALID_POS
|
|
179
218
|
|
|
180
219
|
@classmethod
|
|
181
220
|
def load_cache(cls, path: str | Path) -> CacheSnapshot:
|
|
221
|
+
"""WordNet lexicons do not persist caches; raising keeps the contract explicit."""
|
|
182
222
|
raise RuntimeError("WordNetLexicon does not persist or load caches.")
|
|
183
223
|
|
|
184
224
|
def save_cache(self, path: str | Path | None = None) -> Path | None:
|
|
225
|
+
"""WordNet lexicons do not persist caches; raising keeps the contract explicit."""
|
|
185
226
|
raise RuntimeError("WordNetLexicon does not persist or load caches.")
|
|
186
227
|
|
|
187
228
|
def __repr__(self) -> str: # pragma: no cover - trivial representation
|
glitchlings/main.py
CHANGED
|
@@ -5,7 +5,9 @@ from __future__ import annotations
|
|
|
5
5
|
import argparse
|
|
6
6
|
import difflib
|
|
7
7
|
import sys
|
|
8
|
+
from collections.abc import Sequence
|
|
8
9
|
from pathlib import Path
|
|
10
|
+
from typing import cast
|
|
9
11
|
|
|
10
12
|
from . import SAMPLE_TEXT
|
|
11
13
|
from .config import DEFAULT_ATTACK_SEED, build_gaggle, load_attack_config
|
|
@@ -88,6 +90,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
88
90
|
|
|
89
91
|
|
|
90
92
|
def build_lexicon_parser() -> argparse.ArgumentParser:
|
|
93
|
+
"""Create the ``build-lexicon`` subcommand parser with vector cache options."""
|
|
91
94
|
builder = argparse.ArgumentParser(
|
|
92
95
|
prog="glitchlings build-lexicon",
|
|
93
96
|
description=(
|
|
@@ -179,21 +182,23 @@ def read_text(args: argparse.Namespace, parser: argparse.ArgumentParser) -> str:
|
|
|
179
182
|
SystemExit: Raised indirectly via ``parser.error`` on failure.
|
|
180
183
|
|
|
181
184
|
"""
|
|
182
|
-
|
|
185
|
+
file_path = cast(Path | None, getattr(args, "file", None))
|
|
186
|
+
if file_path is not None:
|
|
183
187
|
try:
|
|
184
|
-
return
|
|
188
|
+
return file_path.read_text(encoding="utf-8")
|
|
185
189
|
except OSError as exc:
|
|
186
|
-
filename = getattr(exc, "filename", None) or
|
|
190
|
+
filename = getattr(exc, "filename", None) or file_path
|
|
187
191
|
reason = exc.strerror or str(exc)
|
|
188
192
|
parser.error(f"Failed to read file {filename}: {reason}")
|
|
189
193
|
|
|
190
|
-
|
|
191
|
-
|
|
194
|
+
text_argument = cast(str | None, getattr(args, "text", None))
|
|
195
|
+
if text_argument:
|
|
196
|
+
return text_argument
|
|
192
197
|
|
|
193
198
|
if not sys.stdin.isatty():
|
|
194
199
|
return sys.stdin.read()
|
|
195
200
|
|
|
196
|
-
if args
|
|
201
|
+
if bool(getattr(args, "sample", False)):
|
|
197
202
|
return SAMPLE_TEXT
|
|
198
203
|
|
|
199
204
|
parser.error(
|
|
@@ -224,21 +229,23 @@ def summon_glitchlings(
|
|
|
224
229
|
|
|
225
230
|
return build_gaggle(config, seed_override=seed)
|
|
226
231
|
|
|
232
|
+
normalized: Sequence[str | Glitchling]
|
|
227
233
|
if names:
|
|
228
|
-
|
|
234
|
+
parsed: list[str | Glitchling] = []
|
|
229
235
|
for specification in names:
|
|
230
236
|
try:
|
|
231
|
-
|
|
237
|
+
parsed.append(parse_glitchling_spec(specification))
|
|
232
238
|
except ValueError as exc:
|
|
233
239
|
parser.error(str(exc))
|
|
234
240
|
raise AssertionError("parser.error should exit")
|
|
241
|
+
normalized = parsed
|
|
235
242
|
else:
|
|
236
|
-
normalized = DEFAULT_GLITCHLING_NAMES
|
|
243
|
+
normalized = list(DEFAULT_GLITCHLING_NAMES)
|
|
237
244
|
|
|
238
245
|
effective_seed = seed if seed is not None else DEFAULT_ATTACK_SEED
|
|
239
246
|
|
|
240
247
|
try:
|
|
241
|
-
return summon(normalized, seed=effective_seed)
|
|
248
|
+
return summon(list(normalized), seed=effective_seed)
|
|
242
249
|
except ValueError as exc:
|
|
243
250
|
parser.error(str(exc))
|
|
244
251
|
raise AssertionError("parser.error should exit")
|
|
@@ -285,7 +292,10 @@ def run_cli(args: argparse.Namespace, parser: argparse.ArgumentParser) -> int:
|
|
|
285
292
|
config_path=args.config,
|
|
286
293
|
)
|
|
287
294
|
|
|
288
|
-
corrupted = gaggle(text)
|
|
295
|
+
corrupted = gaggle.corrupt(text)
|
|
296
|
+
if not isinstance(corrupted, str):
|
|
297
|
+
message = "Gaggle returned non-string output for string input"
|
|
298
|
+
raise TypeError(message)
|
|
289
299
|
|
|
290
300
|
if args.diff:
|
|
291
301
|
show_diff(text, corrupted)
|
glitchlings/zoo/__init__.py
CHANGED
|
@@ -4,6 +4,7 @@ import ast
|
|
|
4
4
|
from typing import Any
|
|
5
5
|
|
|
6
6
|
from .adjax import Adjax, adjax
|
|
7
|
+
from .apostrofae import Apostrofae, apostrofae
|
|
7
8
|
from .core import (
|
|
8
9
|
Gaggle,
|
|
9
10
|
Glitchling,
|
|
@@ -30,6 +31,8 @@ __all__ = [
|
|
|
30
31
|
"mim1c",
|
|
31
32
|
"Jargoyle",
|
|
32
33
|
"jargoyle",
|
|
34
|
+
"Apostrofae",
|
|
35
|
+
"apostrofae",
|
|
33
36
|
"Adjax",
|
|
34
37
|
"adjax",
|
|
35
38
|
"Reduple",
|
|
@@ -58,7 +61,7 @@ __all__ = [
|
|
|
58
61
|
|
|
59
62
|
_HAS_JARGOYLE = _jargoyle_available()
|
|
60
63
|
|
|
61
|
-
_BUILTIN_GLITCHLING_LIST: list[Glitchling] = [typogre, mim1c]
|
|
64
|
+
_BUILTIN_GLITCHLING_LIST: list[Glitchling] = [typogre, apostrofae, mim1c]
|
|
62
65
|
if _HAS_JARGOYLE:
|
|
63
66
|
_BUILTIN_GLITCHLING_LIST.append(jargoyle)
|
|
64
67
|
_BUILTIN_GLITCHLING_LIST.extend([adjax, reduple, rushmore, redactyl, scannequin, zeedub])
|
|
@@ -69,6 +72,7 @@ BUILTIN_GLITCHLINGS: dict[str, Glitchling] = {
|
|
|
69
72
|
|
|
70
73
|
_BUILTIN_GLITCHLING_TYPES: dict[str, type[Glitchling]] = {
|
|
71
74
|
typogre.name.lower(): Typogre,
|
|
75
|
+
apostrofae.name.lower(): Apostrofae,
|
|
72
76
|
mim1c.name.lower(): Mim1c,
|
|
73
77
|
adjax.name.lower(): Adjax,
|
|
74
78
|
reduple.name.lower(): Reduple,
|
glitchlings/zoo/adjax.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import random
|
|
4
|
-
from typing import Any
|
|
4
|
+
from typing import Any, cast
|
|
5
5
|
|
|
6
6
|
from ._rate import resolve_rate
|
|
7
7
|
from ._text_utils import split_preserving_whitespace, split_token_edges
|
|
@@ -83,7 +83,7 @@ def swap_adjacent_words(
|
|
|
83
83
|
rng = random.Random(seed)
|
|
84
84
|
|
|
85
85
|
if _swap_adjacent_words_rust is not None:
|
|
86
|
-
return _swap_adjacent_words_rust(text, clamped_rate, rng)
|
|
86
|
+
return cast(str, _swap_adjacent_words_rust(text, clamped_rate, rng))
|
|
87
87
|
|
|
88
88
|
return _python_swap_adjacent_words(text, rate=clamped_rate, rng=rng)
|
|
89
89
|
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""Smart-quote glitchling that swaps straight quotes for fancy counterparts."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import random
|
|
7
|
+
from functools import cache
|
|
8
|
+
from importlib import resources
|
|
9
|
+
from typing import Any, Sequence, cast
|
|
10
|
+
|
|
11
|
+
from .core import AttackOrder, AttackWave, Gaggle, Glitchling
|
|
12
|
+
|
|
13
|
+
try: # pragma: no cover - compiled extension not present in pure-Python envs
|
|
14
|
+
from glitchlings._zoo_rust import apostrofae as _apostrofae_rust
|
|
15
|
+
except ImportError: # pragma: no cover - compiled extension not present
|
|
16
|
+
_apostrofae_rust = None
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@cache
|
|
20
|
+
def _load_replacement_pairs() -> dict[str, list[tuple[str, str]]]:
|
|
21
|
+
"""Load the curated mapping of straight quotes to fancy pairs."""
|
|
22
|
+
|
|
23
|
+
resource = resources.files(f"{__package__}.assets").joinpath("apostrofae_pairs.json")
|
|
24
|
+
with resource.open("r", encoding="utf-8") as handle:
|
|
25
|
+
data: dict[str, list[Sequence[str]]] = json.load(handle)
|
|
26
|
+
|
|
27
|
+
parsed: dict[str, list[tuple[str, str]]] = {}
|
|
28
|
+
for straight, replacements in data.items():
|
|
29
|
+
parsed[straight] = [(pair[0], pair[1]) for pair in replacements if len(pair) == 2]
|
|
30
|
+
return parsed
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _find_quote_pairs(text: str) -> list[tuple[int, int, str]]:
|
|
34
|
+
"""Return all balanced pairs of straight quotes in ``text``.
|
|
35
|
+
|
|
36
|
+
The search walks the string once, pairing sequential occurrences of each quote
|
|
37
|
+
glyph. Unmatched openers remain untouched so contractions (e.g. ``it's``)
|
|
38
|
+
survive unmodified.
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
stacks: dict[str, int | None] = {'"': None, "'": None, "`": None}
|
|
42
|
+
pairs: list[tuple[int, int, str]] = []
|
|
43
|
+
|
|
44
|
+
for index, ch in enumerate(text):
|
|
45
|
+
if ch not in stacks:
|
|
46
|
+
continue
|
|
47
|
+
start = stacks[ch]
|
|
48
|
+
if start is None:
|
|
49
|
+
stacks[ch] = index
|
|
50
|
+
else:
|
|
51
|
+
pairs.append((start, index, ch))
|
|
52
|
+
stacks[ch] = None
|
|
53
|
+
|
|
54
|
+
return pairs
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _apostrofae_python(text: str, *, rng: random.Random) -> str:
|
|
58
|
+
"""Python fallback that replaces paired straight quotes with fancy glyphs."""
|
|
59
|
+
|
|
60
|
+
pairs = _load_replacement_pairs()
|
|
61
|
+
candidates = _find_quote_pairs(text)
|
|
62
|
+
if not candidates:
|
|
63
|
+
return text
|
|
64
|
+
|
|
65
|
+
chars = list(text)
|
|
66
|
+
for start, end, glyph in candidates:
|
|
67
|
+
options = pairs.get(glyph)
|
|
68
|
+
if not options:
|
|
69
|
+
continue
|
|
70
|
+
left, right = rng.choice(options)
|
|
71
|
+
chars[start] = left
|
|
72
|
+
chars[end] = right
|
|
73
|
+
return "".join(chars)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def smart_quotes(
|
|
77
|
+
text: str,
|
|
78
|
+
seed: int | None = None,
|
|
79
|
+
rng: random.Random | None = None,
|
|
80
|
+
) -> str:
|
|
81
|
+
"""Replace straight quotes, apostrophes, and backticks with fancy pairs."""
|
|
82
|
+
|
|
83
|
+
if not text:
|
|
84
|
+
return text
|
|
85
|
+
|
|
86
|
+
if rng is None:
|
|
87
|
+
rng = random.Random(seed)
|
|
88
|
+
|
|
89
|
+
if _apostrofae_rust is not None:
|
|
90
|
+
return cast(str, _apostrofae_rust(text, rng))
|
|
91
|
+
|
|
92
|
+
return _apostrofae_python(text, rng=rng)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class Apostrofae(Glitchling):
|
|
96
|
+
"""Glitchling that swaps straight quotes for decorative Unicode pairs."""
|
|
97
|
+
|
|
98
|
+
def __init__(self, *, seed: int | None = None) -> None:
|
|
99
|
+
self._master_seed: int | None = seed
|
|
100
|
+
super().__init__(
|
|
101
|
+
name="Apostrofae",
|
|
102
|
+
corruption_function=smart_quotes,
|
|
103
|
+
scope=AttackWave.CHARACTER,
|
|
104
|
+
order=AttackOrder.NORMAL,
|
|
105
|
+
seed=seed,
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
def pipeline_operation(self) -> dict[str, Any] | None:
|
|
109
|
+
return {"type": "apostrofae"}
|
|
110
|
+
|
|
111
|
+
def reset_rng(self, seed: int | None = None) -> None: # pragma: no cover - exercised indirectly
|
|
112
|
+
if seed is not None:
|
|
113
|
+
self._master_seed = seed
|
|
114
|
+
super().reset_rng(seed)
|
|
115
|
+
if self.seed is None:
|
|
116
|
+
return
|
|
117
|
+
derived = Gaggle.derive_seed(int(seed), self.name, 0)
|
|
118
|
+
self.seed = int(derived)
|
|
119
|
+
self.rng = random.Random(self.seed)
|
|
120
|
+
self.kwargs["seed"] = self.seed
|
|
121
|
+
else:
|
|
122
|
+
super().reset_rng(None)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
apostrofae = Apostrofae()
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
__all__ = ["Apostrofae", "apostrofae", "smart_quotes"]
|
|
File without changes
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
{
|
|
2
|
+
"\"": [
|
|
3
|
+
["“", "”"],
|
|
4
|
+
["„", "“"],
|
|
5
|
+
["«", "»"],
|
|
6
|
+
["‹", "›"],
|
|
7
|
+
["『", "』"],
|
|
8
|
+
["「", "」"],
|
|
9
|
+
["﹁", "﹂"],
|
|
10
|
+
["﹃", "﹄"],
|
|
11
|
+
["〝", "〞"],
|
|
12
|
+
["❝", "❞"]
|
|
13
|
+
],
|
|
14
|
+
"'": [
|
|
15
|
+
["‘", "’"],
|
|
16
|
+
["‚", "‘"],
|
|
17
|
+
["‹", "›"],
|
|
18
|
+
["❮", "❯"],
|
|
19
|
+
["❛", "❜"],
|
|
20
|
+
["﹇", "﹈"]
|
|
21
|
+
],
|
|
22
|
+
"`": [
|
|
23
|
+
["‵", "′"],
|
|
24
|
+
["﹁", "﹂"],
|
|
25
|
+
["﹃", "﹄"],
|
|
26
|
+
["⌈", "⌉"],
|
|
27
|
+
["⌊", "⌋"],
|
|
28
|
+
["⎡", "⎤"],
|
|
29
|
+
["⎣", "⎦"],
|
|
30
|
+
["〝", "〞"]
|
|
31
|
+
]
|
|
32
|
+
}
|
glitchlings/zoo/core.py
CHANGED
|
@@ -7,7 +7,7 @@ import random
|
|
|
7
7
|
from collections.abc import Mapping, Sequence
|
|
8
8
|
from enum import IntEnum, auto
|
|
9
9
|
from hashlib import blake2s
|
|
10
|
-
from typing import TYPE_CHECKING, Any, Callable, Protocol, TypedDict, Union
|
|
10
|
+
from typing import TYPE_CHECKING, Any, Callable, Protocol, TypedDict, TypeGuard, Union, cast
|
|
11
11
|
|
|
12
12
|
from ..compat import get_datasets_dataset, require_datasets
|
|
13
13
|
|
|
@@ -35,6 +35,9 @@ class PlanSpecification(TypedDict):
|
|
|
35
35
|
order: int
|
|
36
36
|
|
|
37
37
|
|
|
38
|
+
TranscriptTurn = dict[str, Any]
|
|
39
|
+
Transcript = list[TranscriptTurn]
|
|
40
|
+
|
|
38
41
|
PlanEntry = Union["Glitchling", Mapping[str, Any]]
|
|
39
42
|
|
|
40
43
|
|
|
@@ -186,7 +189,7 @@ def plan_glitchlings(
|
|
|
186
189
|
|
|
187
190
|
|
|
188
191
|
if TYPE_CHECKING: # pragma: no cover - typing only
|
|
189
|
-
from datasets import Dataset
|
|
192
|
+
from datasets import Dataset
|
|
190
193
|
elif _DatasetsDataset is not None:
|
|
191
194
|
Dataset = _DatasetsDataset
|
|
192
195
|
else:
|
|
@@ -202,8 +205,8 @@ def _is_transcript(
|
|
|
202
205
|
*,
|
|
203
206
|
allow_empty: bool = True,
|
|
204
207
|
require_all_content: bool = False,
|
|
205
|
-
) ->
|
|
206
|
-
"""Return
|
|
208
|
+
) -> TypeGuard[Transcript]:
|
|
209
|
+
"""Return ``True`` when ``value`` appears to be a chat transcript."""
|
|
207
210
|
if not isinstance(value, list):
|
|
208
211
|
return False
|
|
209
212
|
|
|
@@ -351,15 +354,17 @@ class Glitchling:
|
|
|
351
354
|
corrupted = self.corruption_function(text, *args, **kwargs)
|
|
352
355
|
return corrupted
|
|
353
356
|
|
|
354
|
-
def corrupt(self, text: str |
|
|
357
|
+
def corrupt(self, text: str | Transcript) -> str | Transcript:
|
|
355
358
|
"""Apply the corruption function to text or conversational transcripts."""
|
|
356
359
|
if _is_transcript(text):
|
|
357
|
-
transcript = [dict(turn) for turn in text]
|
|
360
|
+
transcript: Transcript = [dict(turn) for turn in text]
|
|
358
361
|
if transcript:
|
|
359
|
-
|
|
362
|
+
content = transcript[-1].get("content")
|
|
363
|
+
if isinstance(content, str):
|
|
364
|
+
transcript[-1]["content"] = self.__corrupt(content, **self.kwargs)
|
|
360
365
|
return transcript
|
|
361
366
|
|
|
362
|
-
return self.__corrupt(text, **self.kwargs)
|
|
367
|
+
return self.__corrupt(cast(str, text), **self.kwargs)
|
|
363
368
|
|
|
364
369
|
def corrupt_dataset(self, dataset: Dataset, columns: list[str]) -> Dataset:
|
|
365
370
|
"""Apply corruption lazily across dataset columns."""
|
|
@@ -383,7 +388,7 @@ class Glitchling:
|
|
|
383
388
|
|
|
384
389
|
return dataset.with_transform(__corrupt_row)
|
|
385
390
|
|
|
386
|
-
def __call__(self, text: str, *args: Any, **kwds: Any) -> str |
|
|
391
|
+
def __call__(self, text: str, *args: Any, **kwds: Any) -> str | Transcript:
|
|
387
392
|
"""Allow a glitchling to be invoked directly like a callable."""
|
|
388
393
|
return self.corrupt(text, *args, **kwds)
|
|
389
394
|
|
|
@@ -426,7 +431,7 @@ class Gaggle(Glitchling):
|
|
|
426
431
|
seed: Master seed used to derive per-glitchling seeds.
|
|
427
432
|
|
|
428
433
|
"""
|
|
429
|
-
super().__init__("Gaggle", self.
|
|
434
|
+
super().__init__("Gaggle", self._corrupt_text, AttackWave.DOCUMENT, seed=seed)
|
|
430
435
|
self._clones_by_index: list[Glitchling] = []
|
|
431
436
|
for idx, glitchling in enumerate(glitchlings):
|
|
432
437
|
clone = glitchling.clone()
|
|
@@ -528,17 +533,38 @@ class Gaggle(Glitchling):
|
|
|
528
533
|
|
|
529
534
|
return descriptors
|
|
530
535
|
|
|
531
|
-
def
|
|
532
|
-
"""Apply each glitchling to
|
|
536
|
+
def _corrupt_text(self, text: str) -> str:
|
|
537
|
+
"""Apply each glitchling to string input sequentially."""
|
|
533
538
|
master_seed = self.seed
|
|
534
539
|
descriptors = self._pipeline_descriptors()
|
|
535
540
|
if master_seed is not None and descriptors is not None:
|
|
536
541
|
try:
|
|
537
|
-
return _compose_glitchlings_rust(text, descriptors, master_seed)
|
|
542
|
+
return cast(str, _compose_glitchlings_rust(text, descriptors, master_seed))
|
|
538
543
|
except Exception: # pragma: no cover - fall back to Python execution
|
|
539
544
|
log.debug("Rust pipeline failed; falling back", exc_info=True)
|
|
540
545
|
|
|
541
546
|
corrupted = text
|
|
542
547
|
for glitchling in self.apply_order:
|
|
543
|
-
|
|
548
|
+
next_value = glitchling.corrupt(corrupted)
|
|
549
|
+
if not isinstance(next_value, str):
|
|
550
|
+
message = "Glitchling pipeline produced non-string output for string input"
|
|
551
|
+
raise TypeError(message)
|
|
552
|
+
corrupted = next_value
|
|
553
|
+
|
|
544
554
|
return corrupted
|
|
555
|
+
|
|
556
|
+
def corrupt(self, text: str | Transcript) -> str | Transcript:
|
|
557
|
+
"""Apply each glitchling to the provided text sequentially."""
|
|
558
|
+
if isinstance(text, str):
|
|
559
|
+
return self._corrupt_text(text)
|
|
560
|
+
|
|
561
|
+
if _is_transcript(text):
|
|
562
|
+
transcript: Transcript = [dict(turn) for turn in text]
|
|
563
|
+
if transcript and "content" in transcript[-1]:
|
|
564
|
+
content = transcript[-1]["content"]
|
|
565
|
+
if isinstance(content, str):
|
|
566
|
+
transcript[-1]["content"] = self._corrupt_text(content)
|
|
567
|
+
return transcript
|
|
568
|
+
|
|
569
|
+
message = f"Unsupported text type for Gaggle corruption: {type(text)!r}"
|
|
570
|
+
raise TypeError(message)
|