glitchlings-0.4.2-cp312-cp312-macosx_11_0_universal2.whl → glitchlings-0.4.3-cp312-cp312-macosx_11_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (35)
  1. glitchlings/__init__.py +4 -0
  2. glitchlings/_zoo_rust.cpython-312-darwin.so +0 -0
  3. glitchlings/compat.py +80 -11
  4. glitchlings/config.py +32 -19
  5. glitchlings/config.toml +1 -1
  6. glitchlings/dlc/__init__.py +3 -1
  7. glitchlings/dlc/pytorch.py +216 -0
  8. glitchlings/dlc/pytorch_lightning.py +233 -0
  9. glitchlings/lexicon/__init__.py +5 -15
  10. glitchlings/lexicon/_cache.py +21 -15
  11. glitchlings/lexicon/data/default_vector_cache.json +80 -14
  12. glitchlings/lexicon/vector.py +94 -15
  13. glitchlings/lexicon/wordnet.py +66 -25
  14. glitchlings/main.py +21 -11
  15. glitchlings/zoo/__init__.py +5 -1
  16. glitchlings/zoo/adjax.py +2 -2
  17. glitchlings/zoo/apostrofae.py +128 -0
  18. glitchlings/zoo/assets/__init__.py +0 -0
  19. glitchlings/zoo/assets/apostrofae_pairs.json +32 -0
  20. glitchlings/zoo/core.py +40 -14
  21. glitchlings/zoo/jargoyle.py +44 -34
  22. glitchlings/zoo/redactyl.py +11 -8
  23. glitchlings/zoo/reduple.py +2 -2
  24. glitchlings/zoo/rushmore.py +2 -2
  25. glitchlings/zoo/scannequin.py +2 -2
  26. glitchlings/zoo/typogre.py +5 -2
  27. glitchlings/zoo/zeedub.py +5 -2
  28. {glitchlings-0.4.2.dist-info → glitchlings-0.4.3.dist-info}/METADATA +35 -2
  29. glitchlings-0.4.3.dist-info/RECORD +46 -0
  30. glitchlings/lexicon/graph.py +0 -282
  31. glitchlings-0.4.2.dist-info/RECORD +0 -42
  32. {glitchlings-0.4.2.dist-info → glitchlings-0.4.3.dist-info}/WHEEL +0 -0
  33. {glitchlings-0.4.2.dist-info → glitchlings-0.4.3.dist-info}/entry_points.txt +0 -0
  34. {glitchlings-0.4.2.dist-info → glitchlings-0.4.3.dist-info}/licenses/LICENSE +0 -0
  35. {glitchlings-0.4.2.dist-info → glitchlings-0.4.3.dist-info}/top_level.txt +0 -0
glitchlings/lexicon/wordnet.py CHANGED
@@ -4,49 +4,74 @@ from __future__ import annotations
 
 from importlib import import_module
 from pathlib import Path
-from typing import TYPE_CHECKING, Any
+from types import ModuleType
+from typing import Any, Callable, Protocol, Sequence, cast
 
 from ..compat import nltk as _nltk_dependency
 from . import LexiconBackend
 from ._cache import CacheSnapshot
 
-nltk = _nltk_dependency.get()  # type: ignore[assignment]
-_NLTK_IMPORT_ERROR = _nltk_dependency.error
 
-if TYPE_CHECKING:  # pragma: no cover - typing aid only
-    from nltk.corpus.reader import WordNetCorpusReader  # type: ignore[import]
-else:  # pragma: no cover - runtime fallback to avoid hard dependency
-    WordNetCorpusReader = Any
+class _LemmaProtocol(Protocol):
+    def name(self) -> str:
+        ...
 
-find: Any | None = None
-_WORDNET_MODULE: Any | None = None
+
+class _SynsetProtocol(Protocol):
+    def lemmas(self) -> Sequence[_LemmaProtocol]:
+        ...
+
+
+class _WordNetResource(Protocol):
+    def synsets(self, word: str, pos: str | None = None) -> Sequence[_SynsetProtocol]:
+        ...
+
+    def ensure_loaded(self) -> None:
+        ...
+
+
+WordNetCorpusReaderFactory = Callable[[Any, Any], _WordNetResource]
+
+nltk: ModuleType | None = _nltk_dependency.get()
+_NLTK_IMPORT_ERROR: ModuleNotFoundError | None = _nltk_dependency.error
+
+WordNetCorpusReader: WordNetCorpusReaderFactory | None = None
+find: Callable[[str], Any] | None = None
+_WORDNET_MODULE: _WordNetResource | None = None
 
 if nltk is not None:  # pragma: no cover - guarded by import success
     try:
         corpus_reader_module = import_module("nltk.corpus.reader")
-        WordNetCorpusReader = corpus_reader_module.WordNetCorpusReader  # type: ignore[assignment]
     except ModuleNotFoundError as exc:  # pragma: no cover - triggered when corpus missing
         if _NLTK_IMPORT_ERROR is None:
-            _NLTK_IMPORT_ERROR = exc  # type: ignore[assignment]
+            _NLTK_IMPORT_ERROR = exc
     else:
+        reader_candidate = getattr(corpus_reader_module, "WordNetCorpusReader", None)
+        if reader_candidate is not None:
+            WordNetCorpusReader = cast(WordNetCorpusReaderFactory, reader_candidate)
+
         try:
             data_module = import_module("nltk.data")
         except ModuleNotFoundError as exc:  # pragma: no cover - triggered when data missing
             if _NLTK_IMPORT_ERROR is None:
-                _NLTK_IMPORT_ERROR = exc  # type: ignore[assignment]
+                _NLTK_IMPORT_ERROR = exc
         else:
-            find = getattr(data_module, "find", None)
+            locator = getattr(data_module, "find", None)
+            if callable(locator):
+                find = cast(Callable[[str], Any], locator)
 
         try:
-            _WORDNET_MODULE = import_module("nltk.corpus.wordnet")
+            module_candidate = import_module("nltk.corpus.wordnet")
         except ModuleNotFoundError:  # pragma: no cover - only hit on namespace packages
             _WORDNET_MODULE = None
+        else:
+            _WORDNET_MODULE = cast(_WordNetResource, module_candidate)
 else:
-    nltk = None  # type: ignore[assignment]
+    nltk = None
     find = None
     _WORDNET_MODULE = None
 
-_WORDNET_HANDLE: WordNetCorpusReader | Any | None = _WORDNET_MODULE
+_WORDNET_HANDLE: _WordNetResource | None = _WORDNET_MODULE
 _wordnet_ready = False
 
 _VALID_POS: tuple[str, ...] = ("n", "v", "a", "r")
@@ -69,15 +94,22 @@ def dependencies_available() -> bool:
     return nltk is not None and find is not None
 
 
-def _load_wordnet_reader() -> WordNetCorpusReader:
+def _load_wordnet_reader() -> _WordNetResource:
     """Return a WordNet corpus reader from the downloaded corpus files."""
     _require_nltk()
 
+    if WordNetCorpusReader is None:
+        raise RuntimeError("The NLTK WordNet corpus reader is unavailable.")
+
+    locator = find
+    if locator is None:
+        raise RuntimeError("The NLTK data locator is unavailable.")
+
     try:
-        root = find("corpora/wordnet")
+        root = locator("corpora/wordnet")
     except LookupError:
         try:
-            zip_root = find("corpora/wordnet.zip")
+            zip_root = locator("corpora/wordnet.zip")
         except LookupError as exc:
             raise RuntimeError(
                 "The NLTK WordNet corpus is not installed; run `nltk.download('wordnet')`."
@@ -87,18 +119,20 @@ def _load_wordnet_reader() -> WordNetCorpusReader:
     return WordNetCorpusReader(root, None)
 
 
-def _wordnet(force_refresh: bool = False) -> WordNetCorpusReader | Any:
+def _wordnet(force_refresh: bool = False) -> _WordNetResource:
     """Retrieve the active WordNet handle, rebuilding it on demand."""
     global _WORDNET_HANDLE
 
     if force_refresh:
         _WORDNET_HANDLE = _WORDNET_MODULE
 
-    if _WORDNET_HANDLE is not None:
-        return _WORDNET_HANDLE
+    cached = _WORDNET_HANDLE
+    if cached is not None:
+        return cached
 
-    _WORDNET_HANDLE = _load_wordnet_reader()
-    return _WORDNET_HANDLE
+    resource = _load_wordnet_reader()
+    _WORDNET_HANDLE = resource
+    return resource
 
 
 def ensure_wordnet() -> None:
@@ -110,11 +144,14 @@ def ensure_wordnet() -> None:
     _require_nltk()
 
     resource = _wordnet()
+    nltk_module = nltk
+    if nltk_module is None:
+        raise RuntimeError("The NLTK dependency is unexpectedly unavailable.")
 
     try:
         resource.ensure_loaded()
     except LookupError:
-        nltk.download("wordnet", quiet=True)
+        nltk_module.download("wordnet", quiet=True)
         try:
             resource = _wordnet(force_refresh=True)
             resource.ensure_loaded()
@@ -159,6 +196,7 @@ class WordNetLexicon(LexiconBackend):
     """Lexicon that retrieves synonyms from the NLTK WordNet corpus."""
 
     def get_synonyms(self, word: str, pos: str | None = None, n: int = 5) -> list[str]:
+        """Return up to ``n`` WordNet lemmas for ``word`` filtered by ``pos`` if provided."""
        ensure_wordnet()
 
        if pos is None:
@@ -173,15 +211,18 @@ class WordNetLexicon(LexiconBackend):
        return self._deterministic_sample(synonyms, limit=n, word=word, pos=pos)
 
    def supports_pos(self, pos: str | None) -> bool:
+        """Return ``True`` when ``pos`` is unset or recognised by the WordNet corpus."""
        if pos is None:
            return True
        return pos.lower() in _VALID_POS
 
    @classmethod
    def load_cache(cls, path: str | Path) -> CacheSnapshot:
+        """WordNet lexicons do not persist caches; raising keeps the contract explicit."""
        raise RuntimeError("WordNetLexicon does not persist or load caches.")
 
    def save_cache(self, path: str | Path | None = None) -> Path | None:
+        """WordNet lexicons do not persist caches; raising keeps the contract explicit."""
        raise RuntimeError("WordNetLexicon does not persist or load caches.")
 
    def __repr__(self) -> str:  # pragma: no cover - trivial representation
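
For context, a minimal usage sketch of the surface this file exposes after the refactor (not part of the diff; it assumes NLTK is installed, the WordNet corpus can be downloaded, and that WordNetLexicon takes no required constructor arguments):

    # Hedged sketch; WordNetLexicon() with no arguments is an assumption.
    from glitchlings.lexicon.wordnet import WordNetLexicon, ensure_wordnet

    ensure_wordnet()                      # downloads the corpus via nltk on first use
    lexicon = WordNetLexicon()
    print(lexicon.supports_pos("n"))      # True: "n" is listed in _VALID_POS
    print(lexicon.get_synonyms("quick", pos="a", n=3))
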
glitchlings/main.py CHANGED
@@ -5,7 +5,9 @@ from __future__ import annotations
 import argparse
 import difflib
 import sys
+from collections.abc import Sequence
 from pathlib import Path
+from typing import cast
 
 from . import SAMPLE_TEXT
 from .config import DEFAULT_ATTACK_SEED, build_gaggle, load_attack_config
@@ -88,6 +90,7 @@ def build_parser() -> argparse.ArgumentParser:
 
 
 def build_lexicon_parser() -> argparse.ArgumentParser:
+    """Create the ``build-lexicon`` subcommand parser with vector cache options."""
     builder = argparse.ArgumentParser(
         prog="glitchlings build-lexicon",
         description=(
@@ -179,21 +182,23 @@ def read_text(args: argparse.Namespace, parser: argparse.ArgumentParser) -> str:
         SystemExit: Raised indirectly via ``parser.error`` on failure.
 
     """
-    if args.file is not None:
+    file_path = cast(Path | None, getattr(args, "file", None))
+    if file_path is not None:
         try:
-            return args.file.read_text(encoding="utf-8")
+            return file_path.read_text(encoding="utf-8")
         except OSError as exc:
-            filename = getattr(exc, "filename", None) or args.file
+            filename = getattr(exc, "filename", None) or file_path
             reason = exc.strerror or str(exc)
             parser.error(f"Failed to read file {filename}: {reason}")
 
-    if args.text:
-        return args.text
+    text_argument = cast(str | None, getattr(args, "text", None))
+    if text_argument:
+        return text_argument
 
     if not sys.stdin.isatty():
         return sys.stdin.read()
 
-    if args.sample:
+    if bool(getattr(args, "sample", False)):
         return SAMPLE_TEXT
 
     parser.error(
@@ -224,21 +229,23 @@ def summon_glitchlings(
 
         return build_gaggle(config, seed_override=seed)
 
+    normalized: Sequence[str | Glitchling]
     if names:
-        normalized: list[str | Glitchling] = []
+        parsed: list[str | Glitchling] = []
         for specification in names:
             try:
-                normalized.append(parse_glitchling_spec(specification))
+                parsed.append(parse_glitchling_spec(specification))
             except ValueError as exc:
                 parser.error(str(exc))
                 raise AssertionError("parser.error should exit")
+        normalized = parsed
     else:
-        normalized = DEFAULT_GLITCHLING_NAMES
+        normalized = list(DEFAULT_GLITCHLING_NAMES)
 
     effective_seed = seed if seed is not None else DEFAULT_ATTACK_SEED
 
     try:
-        return summon(normalized, seed=effective_seed)
+        return summon(list(normalized), seed=effective_seed)
     except ValueError as exc:
         parser.error(str(exc))
         raise AssertionError("parser.error should exit")
@@ -285,7 +292,10 @@ def run_cli(args: argparse.Namespace, parser: argparse.ArgumentParser) -> int:
         config_path=args.config,
     )
 
-    corrupted = gaggle(text)
+    corrupted = gaggle.corrupt(text)
+    if not isinstance(corrupted, str):
+        message = "Gaggle returned non-string output for string input"
+        raise TypeError(message)
 
     if args.diff:
         show_diff(text, corrupted)
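
For context, the read_text change swaps direct attribute access for getattr plus cast, so a partially populated namespace cannot raise AttributeError. A minimal sketch of that pattern (not part of the diff; the Namespace below stands in for real parsed arguments):

    import argparse
    from pathlib import Path
    from typing import cast

    # Stand-in for parsed CLI arguments; only illustrates the defensive access pattern.
    args = argparse.Namespace(file=None, text="hello", sample=False)
    file_path = cast(Path | None, getattr(args, "file", None))
    text_argument = cast(str | None, getattr(args, "text", None))
    source = file_path.read_text(encoding="utf-8") if file_path is not None else (text_argument or "")
    print(source)  # -> hello
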
glitchlings/zoo/__init__.py CHANGED
@@ -4,6 +4,7 @@ import ast
 from typing import Any
 
 from .adjax import Adjax, adjax
+from .apostrofae import Apostrofae, apostrofae
 from .core import (
     Gaggle,
     Glitchling,
@@ -30,6 +31,8 @@ __all__ = [
     "mim1c",
     "Jargoyle",
     "jargoyle",
+    "Apostrofae",
+    "apostrofae",
     "Adjax",
     "adjax",
     "Reduple",
@@ -58,7 +61,7 @@ __all__ = [
 
 _HAS_JARGOYLE = _jargoyle_available()
 
-_BUILTIN_GLITCHLING_LIST: list[Glitchling] = [typogre, mim1c]
+_BUILTIN_GLITCHLING_LIST: list[Glitchling] = [typogre, apostrofae, mim1c]
 if _HAS_JARGOYLE:
     _BUILTIN_GLITCHLING_LIST.append(jargoyle)
 _BUILTIN_GLITCHLING_LIST.extend([adjax, reduple, rushmore, redactyl, scannequin, zeedub])
@@ -69,6 +72,7 @@ BUILTIN_GLITCHLINGS: dict[str, Glitchling] = {
 
 _BUILTIN_GLITCHLING_TYPES: dict[str, type[Glitchling]] = {
     typogre.name.lower(): Typogre,
+    apostrofae.name.lower(): Apostrofae,
     mim1c.name.lower(): Mim1c,
     adjax.name.lower(): Adjax,
     reduple.name.lower(): Reduple,
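
For context, registering the Apostrofae singleton in _BUILTIN_GLITCHLING_LIST and _BUILTIN_GLITCHLING_TYPES makes it importable and summonable like the other builtins. A hedged sketch (not part of the diff; the exact keys of BUILTIN_GLITCHLINGS are not shown above):

    from glitchlings.zoo import BUILTIN_GLITCHLINGS, apostrofae

    print(apostrofae in BUILTIN_GLITCHLINGS.values())  # expected: True after this release
    print(apostrofae('She said "hello" to me.'))       # paired straight quotes become a fancy pair
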
glitchlings/zoo/adjax.py CHANGED
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import random
-from typing import Any
+from typing import Any, cast
 
 from ._rate import resolve_rate
 from ._text_utils import split_preserving_whitespace, split_token_edges
@@ -83,7 +83,7 @@ def swap_adjacent_words(
     rng = random.Random(seed)
 
     if _swap_adjacent_words_rust is not None:
-        return _swap_adjacent_words_rust(text, clamped_rate, rng)
+        return cast(str, _swap_adjacent_words_rust(text, clamped_rate, rng))
 
     return _python_swap_adjacent_words(text, rate=clamped_rate, rng=rng)
 
glitchlings/zoo/apostrofae.py ADDED
@@ -0,0 +1,128 @@
+"""Smart-quote glitchling that swaps straight quotes for fancy counterparts."""
+
+from __future__ import annotations
+
+import json
+import random
+from functools import cache
+from importlib import resources
+from typing import Any, Sequence, cast
+
+from .core import AttackOrder, AttackWave, Gaggle, Glitchling
+
+try:  # pragma: no cover - compiled extension not present in pure-Python envs
+    from glitchlings._zoo_rust import apostrofae as _apostrofae_rust
+except ImportError:  # pragma: no cover - compiled extension not present
+    _apostrofae_rust = None
+
+
+@cache
+def _load_replacement_pairs() -> dict[str, list[tuple[str, str]]]:
+    """Load the curated mapping of straight quotes to fancy pairs."""
+
+    resource = resources.files(f"{__package__}.assets").joinpath("apostrofae_pairs.json")
+    with resource.open("r", encoding="utf-8") as handle:
+        data: dict[str, list[Sequence[str]]] = json.load(handle)
+
+    parsed: dict[str, list[tuple[str, str]]] = {}
+    for straight, replacements in data.items():
+        parsed[straight] = [(pair[0], pair[1]) for pair in replacements if len(pair) == 2]
+    return parsed
+
+
+def _find_quote_pairs(text: str) -> list[tuple[int, int, str]]:
+    """Return all balanced pairs of straight quotes in ``text``.
+
+    The search walks the string once, pairing sequential occurrences of each quote
+    glyph. Unmatched openers remain untouched so contractions (e.g. ``it's``)
+    survive unmodified.
+    """
+
+    stacks: dict[str, int | None] = {'"': None, "'": None, "`": None}
+    pairs: list[tuple[int, int, str]] = []
+
+    for index, ch in enumerate(text):
+        if ch not in stacks:
+            continue
+        start = stacks[ch]
+        if start is None:
+            stacks[ch] = index
+        else:
+            pairs.append((start, index, ch))
+            stacks[ch] = None
+
+    return pairs
+
+
+def _apostrofae_python(text: str, *, rng: random.Random) -> str:
+    """Python fallback that replaces paired straight quotes with fancy glyphs."""
+
+    pairs = _load_replacement_pairs()
+    candidates = _find_quote_pairs(text)
+    if not candidates:
+        return text
+
+    chars = list(text)
+    for start, end, glyph in candidates:
+        options = pairs.get(glyph)
+        if not options:
+            continue
+        left, right = rng.choice(options)
+        chars[start] = left
+        chars[end] = right
+    return "".join(chars)
+
+
+def smart_quotes(
+    text: str,
+    seed: int | None = None,
+    rng: random.Random | None = None,
+) -> str:
+    """Replace straight quotes, apostrophes, and backticks with fancy pairs."""
+
+    if not text:
+        return text
+
+    if rng is None:
+        rng = random.Random(seed)
+
+    if _apostrofae_rust is not None:
+        return cast(str, _apostrofae_rust(text, rng))
+
+    return _apostrofae_python(text, rng=rng)
+
+
+class Apostrofae(Glitchling):
+    """Glitchling that swaps straight quotes for decorative Unicode pairs."""
+
+    def __init__(self, *, seed: int | None = None) -> None:
+        self._master_seed: int | None = seed
+        super().__init__(
+            name="Apostrofae",
+            corruption_function=smart_quotes,
+            scope=AttackWave.CHARACTER,
+            order=AttackOrder.NORMAL,
+            seed=seed,
+        )
+
+    def pipeline_operation(self) -> dict[str, Any] | None:
+        return {"type": "apostrofae"}
+
+    def reset_rng(self, seed: int | None = None) -> None:  # pragma: no cover - exercised indirectly
+        if seed is not None:
+            self._master_seed = seed
+            super().reset_rng(seed)
+            if self.seed is None:
+                return
+            derived = Gaggle.derive_seed(int(seed), self.name, 0)
+            self.seed = int(derived)
+            self.rng = random.Random(self.seed)
+            self.kwargs["seed"] = self.seed
+        else:
+            super().reset_rng(None)
+
+
+apostrofae = Apostrofae()
+
+
+__all__ = ["Apostrofae", "apostrofae", "smart_quotes"]
glitchlings/zoo/assets/__init__.py ADDED (file without changes)
glitchlings/zoo/assets/apostrofae_pairs.json ADDED
@@ -0,0 +1,32 @@
+{
+  "\"": [
+    ["“", "”"],
+    ["„", "“"],
+    ["«", "»"],
+    ["‹", "›"],
+    ["『", "』"],
+    ["「", "」"],
+    ["﹁", "﹂"],
+    ["﹃", "﹄"],
+    ["〝", "〞"],
+    ["❝", "❞"]
+  ],
+  "'": [
+    ["‘", "’"],
+    ["‚", "‘"],
+    ["‹", "›"],
+    ["❮", "❯"],
+    ["❛", "❜"],
+    ["﹇", "﹈"]
+  ],
+  "`": [
+    ["‵", "′"],
+    ["﹁", "﹂"],
+    ["﹃", "﹄"],
+    ["⌈", "⌉"],
+    ["⌊", "⌋"],
+    ["⎡", "⎤"],
+    ["⎣", "⎦"],
+    ["〝", "〞"]
+  ]
+}
glitchlings/zoo/core.py CHANGED
@@ -7,7 +7,7 @@ import random
 from collections.abc import Mapping, Sequence
 from enum import IntEnum, auto
 from hashlib import blake2s
-from typing import TYPE_CHECKING, Any, Callable, Protocol, TypedDict, Union
+from typing import TYPE_CHECKING, Any, Callable, Protocol, TypedDict, TypeGuard, Union, cast
 
 from ..compat import get_datasets_dataset, require_datasets
 
@@ -35,6 +35,9 @@ class PlanSpecification(TypedDict):
     order: int
 
 
+TranscriptTurn = dict[str, Any]
+Transcript = list[TranscriptTurn]
+
 PlanEntry = Union["Glitchling", Mapping[str, Any]]
 
 
@@ -186,7 +189,7 @@ def plan_glitchlings(
 
 
 if TYPE_CHECKING:  # pragma: no cover - typing only
-    from datasets import Dataset  # type: ignore
+    from datasets import Dataset
 elif _DatasetsDataset is not None:
     Dataset = _DatasetsDataset
 else:
@@ -202,8 +205,8 @@ def _is_transcript(
     *,
     allow_empty: bool = True,
     require_all_content: bool = False,
-) -> bool:
-    """Return `True` when `value` appears to be a chat transcript."""
+) -> TypeGuard[Transcript]:
+    """Return ``True`` when ``value`` appears to be a chat transcript."""
     if not isinstance(value, list):
         return False
 
@@ -351,15 +354,17 @@ class Glitchling:
         corrupted = self.corruption_function(text, *args, **kwargs)
         return corrupted
 
-    def corrupt(self, text: str | list[dict[str, Any]]) -> str | list[dict[str, Any]]:
+    def corrupt(self, text: str | Transcript) -> str | Transcript:
         """Apply the corruption function to text or conversational transcripts."""
         if _is_transcript(text):
-            transcript = [dict(turn) for turn in text]
+            transcript: Transcript = [dict(turn) for turn in text]
             if transcript:
-                transcript[-1]["content"] = self.__corrupt(transcript[-1]["content"], **self.kwargs)
+                content = transcript[-1].get("content")
+                if isinstance(content, str):
+                    transcript[-1]["content"] = self.__corrupt(content, **self.kwargs)
             return transcript
 
-        return self.__corrupt(text, **self.kwargs)
+        return self.__corrupt(cast(str, text), **self.kwargs)
 
     def corrupt_dataset(self, dataset: Dataset, columns: list[str]) -> Dataset:
         """Apply corruption lazily across dataset columns."""
@@ -383,7 +388,7 @@ class Glitchling:
 
         return dataset.with_transform(__corrupt_row)
 
-    def __call__(self, text: str, *args: Any, **kwds: Any) -> str | list[dict[str, Any]]:
+    def __call__(self, text: str, *args: Any, **kwds: Any) -> str | Transcript:
         """Allow a glitchling to be invoked directly like a callable."""
         return self.corrupt(text, *args, **kwds)
 
@@ -426,7 +431,7 @@ class Gaggle(Glitchling):
             seed: Master seed used to derive per-glitchling seeds.
 
         """
-        super().__init__("Gaggle", self.corrupt, AttackWave.DOCUMENT, seed=seed)
+        super().__init__("Gaggle", self._corrupt_text, AttackWave.DOCUMENT, seed=seed)
         self._clones_by_index: list[Glitchling] = []
         for idx, glitchling in enumerate(glitchlings):
             clone = glitchling.clone()
@@ -528,17 +533,38 @@ class Gaggle(Glitchling):
 
         return descriptors
 
-    def corrupt(self, text: str) -> str:
-        """Apply each glitchling to the provided text sequentially."""
+    def _corrupt_text(self, text: str) -> str:
+        """Apply each glitchling to string input sequentially."""
         master_seed = self.seed
         descriptors = self._pipeline_descriptors()
         if master_seed is not None and descriptors is not None:
             try:
-                return _compose_glitchlings_rust(text, descriptors, master_seed)
+                return cast(str, _compose_glitchlings_rust(text, descriptors, master_seed))
             except Exception:  # pragma: no cover - fall back to Python execution
                 log.debug("Rust pipeline failed; falling back", exc_info=True)
 
         corrupted = text
         for glitchling in self.apply_order:
-            corrupted = glitchling(corrupted)
+            next_value = glitchling.corrupt(corrupted)
+            if not isinstance(next_value, str):
+                message = "Glitchling pipeline produced non-string output for string input"
+                raise TypeError(message)
+            corrupted = next_value
+
         return corrupted
+
+    def corrupt(self, text: str | Transcript) -> str | Transcript:
+        """Apply each glitchling to the provided text sequentially."""
+        if isinstance(text, str):
+            return self._corrupt_text(text)
+
+        if _is_transcript(text):
+            transcript: Transcript = [dict(turn) for turn in text]
+            if transcript and "content" in transcript[-1]:
+                content = transcript[-1]["content"]
+                if isinstance(content, str):
+                    transcript[-1]["content"] = self._corrupt_text(content)
+            return transcript
+
+        message = f"Unsupported text type for Gaggle corruption: {type(text)!r}"
+        raise TypeError(message)
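
For context, a hedged sketch of the new Gaggle.corrupt dispatch (not part of the diff; the constructor shape, a list of glitchlings plus a seed keyword, is inferred from the __init__ hunk above): strings run through the string pipeline, while transcripts only have the final turn's content corrupted.

    from glitchlings.zoo import Gaggle, apostrofae, typogre

    gaggle = Gaggle([typogre, apostrofae], seed=42)      # constructor shape is an assumption
    print(gaggle.corrupt('A "quoted" sentence.'))        # str in, str out

    transcript = [
        {"role": "user", "content": "Say something."},
        {"role": "assistant", "content": 'Here is a "reply".'},
    ]
    result = gaggle.corrupt(transcript)
    print(result[-1]["content"])                         # only the last turn is corrupted
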