glitchlings 1.0.0__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. glitchlings/__init__.py +101 -0
  2. glitchlings/__main__.py +8 -0
  3. glitchlings/_corruption_engine/__init__.py +12 -0
  4. glitchlings/_corruption_engine.cp313-win_amd64.pyd +0 -0
  5. glitchlings/assets/__init__.py +180 -0
  6. glitchlings/assets/apostrofae_pairs.json +32 -0
  7. glitchlings/assets/ekkokin_homophones.json +2014 -0
  8. glitchlings/assets/hokey_assets.json +193 -0
  9. glitchlings/assets/lexemes/academic.json +1049 -0
  10. glitchlings/assets/lexemes/colors.json +1333 -0
  11. glitchlings/assets/lexemes/corporate.json +716 -0
  12. glitchlings/assets/lexemes/cyberpunk.json +22 -0
  13. glitchlings/assets/lexemes/lovecraftian.json +23 -0
  14. glitchlings/assets/lexemes/synonyms.json +3354 -0
  15. glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
  16. glitchlings/assets/ocr_confusions.tsv +30 -0
  17. glitchlings/assets/pipeline_assets.json +29 -0
  18. glitchlings/attack/__init__.py +184 -0
  19. glitchlings/attack/analysis.py +1321 -0
  20. glitchlings/attack/core.py +819 -0
  21. glitchlings/attack/core_execution.py +378 -0
  22. glitchlings/attack/core_planning.py +612 -0
  23. glitchlings/attack/encode.py +114 -0
  24. glitchlings/attack/metrics.py +211 -0
  25. glitchlings/attack/metrics_dispatch.py +70 -0
  26. glitchlings/attack/tokenization.py +338 -0
  27. glitchlings/attack/tokenizer_metrics.py +373 -0
  28. glitchlings/auggie.py +285 -0
  29. glitchlings/compat/__init__.py +9 -0
  30. glitchlings/compat/loaders.py +355 -0
  31. glitchlings/compat/types.py +41 -0
  32. glitchlings/conf/__init__.py +39 -0
  33. glitchlings/conf/loaders.py +331 -0
  34. glitchlings/conf/schema.py +156 -0
  35. glitchlings/conf/types.py +72 -0
  36. glitchlings/config.toml +2 -0
  37. glitchlings/constants.py +139 -0
  38. glitchlings/dev/__init__.py +3 -0
  39. glitchlings/dev/docs.py +45 -0
  40. glitchlings/dlc/__init__.py +21 -0
  41. glitchlings/dlc/_shared.py +300 -0
  42. glitchlings/dlc/gutenberg.py +400 -0
  43. glitchlings/dlc/huggingface.py +68 -0
  44. glitchlings/dlc/langchain.py +147 -0
  45. glitchlings/dlc/nemo.py +283 -0
  46. glitchlings/dlc/prime.py +215 -0
  47. glitchlings/dlc/pytorch.py +98 -0
  48. glitchlings/dlc/pytorch_lightning.py +173 -0
  49. glitchlings/internal/__init__.py +16 -0
  50. glitchlings/internal/rust.py +159 -0
  51. glitchlings/internal/rust_ffi.py +599 -0
  52. glitchlings/main.py +426 -0
  53. glitchlings/protocols.py +91 -0
  54. glitchlings/runtime_config.py +24 -0
  55. glitchlings/util/__init__.py +41 -0
  56. glitchlings/util/adapters.py +65 -0
  57. glitchlings/util/keyboards.py +508 -0
  58. glitchlings/util/transcripts.py +108 -0
  59. glitchlings/zoo/__init__.py +161 -0
  60. glitchlings/zoo/assets/__init__.py +29 -0
  61. glitchlings/zoo/core.py +852 -0
  62. glitchlings/zoo/core_execution.py +154 -0
  63. glitchlings/zoo/core_planning.py +451 -0
  64. glitchlings/zoo/corrupt_dispatch.py +291 -0
  65. glitchlings/zoo/hokey.py +139 -0
  66. glitchlings/zoo/jargoyle.py +301 -0
  67. glitchlings/zoo/mim1c.py +269 -0
  68. glitchlings/zoo/pedant/__init__.py +109 -0
  69. glitchlings/zoo/pedant/core.py +99 -0
  70. glitchlings/zoo/pedant/forms.py +50 -0
  71. glitchlings/zoo/pedant/stones.py +83 -0
  72. glitchlings/zoo/redactyl.py +94 -0
  73. glitchlings/zoo/rng.py +280 -0
  74. glitchlings/zoo/rushmore.py +416 -0
  75. glitchlings/zoo/scannequin.py +370 -0
  76. glitchlings/zoo/transforms.py +331 -0
  77. glitchlings/zoo/typogre.py +194 -0
  78. glitchlings/zoo/validation.py +643 -0
  79. glitchlings/zoo/wherewolf.py +120 -0
  80. glitchlings/zoo/zeedub.py +165 -0
  81. glitchlings-1.0.0.dist-info/METADATA +404 -0
  82. glitchlings-1.0.0.dist-info/RECORD +86 -0
  83. glitchlings-1.0.0.dist-info/WHEEL +5 -0
  84. glitchlings-1.0.0.dist-info/entry_points.txt +3 -0
  85. glitchlings-1.0.0.dist-info/licenses/LICENSE +201 -0
  86. glitchlings-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,852 @@
1
+ """Core data structures used to model glitchlings and their interactions."""
2
+
3
+ import inspect
4
+ import random
5
+ from collections.abc import Mapping, Sequence
6
+ from enum import IntEnum, auto
7
+ from typing import TYPE_CHECKING, Any, Callable, Protocol, cast
8
+
9
+ from glitchlings.internal.rust_ffi import build_pipeline_rust, plan_operations_rust
10
+ from glitchlings.zoo.rng import SEED_MASK, _fnv1a_hash, _splitmix64
11
+
12
+ from ..compat.loaders import get_datasets_dataset, require_datasets
13
+ from ..compat.types import Dataset as DatasetProtocol
14
+ from ..util.transcripts import (
15
+ Transcript,
16
+ TranscriptTarget,
17
+ TranscriptTurn,
18
+ is_transcript,
19
+ resolve_transcript_indices,
20
+ )
21
+ from .core_execution import execute_plan
22
+ from .core_planning import (
23
+ PipelineDescriptor,
24
+ PipelineOperationPayload,
25
+ build_execution_plan,
26
+ build_pipeline_descriptor,
27
+ normalize_plan_entries,
28
+ )
29
+ from .core_planning import (
30
+ PlanEntry as _PlanEntry,
31
+ )
32
+
33
+ _DatasetsDataset = get_datasets_dataset()
34
+
35
+ _is_transcript = is_transcript
36
+
37
+
38
def plan_operations(
    entries: Sequence[_PlanEntry],
    master_seed: int | None,
) -> list[tuple[int, int]]:
    """Build the orchestration plan for a sequence of operation entries.

    Each entry is normalised through ``normalize_plan_entries`` and converted
    to its mapping form; the resulting list is handed, together with the
    integer master seed, to ``plan_operations_rust``.

    Args:
        entries: Operation entries to normalise and plan.
        master_seed: Seed driving the plan. ``None`` is rejected.

    Returns:
        Whatever ``plan_operations_rust`` produces: a list of ``(int, int)``
        pairs. ``Gaggle.sort_glitchlings`` unpacks each pair as
        ``(index, derived_seed)``.

    Raises:
        ValueError: If ``master_seed`` is ``None``.

    Notes
    -----
    The Rust extension is required for orchestration.
    """
    if master_seed is None:
        raise ValueError("Gaggle orchestration requires a master seed")

    # Normalise first, then coerce the seed, so failures surface in the same
    # order as they always have.
    specs = []
    for spec in normalize_plan_entries(entries):
        specs.append(spec.as_mapping())
    return plan_operations_rust(specs, int(master_seed))
55
+
56
+
57
+ if TYPE_CHECKING: # pragma: no cover - typing only
58
+ from datasets import Dataset
59
+ elif _DatasetsDataset is not None:
60
+ Dataset = _DatasetsDataset
61
+ else:
62
+ Dataset = DatasetProtocol
63
+
64
+
65
class CorruptionCallable(Protocol):
    """Structural type for callables that corrupt text.

    A conforming callable takes the text as its first argument, tolerates any
    additional positional and keyword arguments, and returns a string.
    """

    def __call__(self, text: str, *args: Any, **kwargs: Any) -> str:
        ...
69
+
70
+
71
+ # Text levels for glitchlings, to enforce a sort order
72
+ # Work from highest level down, because e.g.
73
+ # duplicating a word then adding a typo is potentially different than
74
+ # adding a typo then duplicating a word
75
class AttackWave(IntEnum):
    """Granularity of text that a glitchling corrupts.

    Members are numbered from the coarsest scope (``DOCUMENT``) to the finest
    (``CHARACTER``); being an ``IntEnum`` they compare and sort by that number.
    """

    # Explicit values equal to what ``auto()`` assigns (1, 2, 3, ...).
    DOCUMENT = 1
    PARAGRAPH = 2
    SENTENCE = 3
    WORD = 4
    CHARACTER = 5
83
+
84
+
85
+ # Modifier for within the same attack wave
86
class AttackOrder(IntEnum):
    """Relative execution order for glitchlings that share a wave.

    Members are numbered from ``FIRST`` (lowest) to ``LAST`` (highest); being
    an ``IntEnum`` they compare and sort by that number.
    """

    # Explicit values equal to what ``auto()`` assigns (1, 2, 3, ...).
    FIRST = 1
    EARLY = 2
    NORMAL = 3
    LATE = 4
    LAST = 5
94
+
95
+
96
+ class Glitchling:
97
+ """A single text corruption agent with deterministic behaviour."""
98
+
99
def __init__(
    self,
    name: str,
    corruption_function: CorruptionCallable,
    scope: AttackWave,
    order: AttackOrder = AttackOrder.NORMAL,
    seed: int | None = None,
    pipeline_operation: Callable[["Glitchling"], Mapping[str, Any] | None] | None = None,
    transcript_target: TranscriptTarget = "last",
    exclude_patterns: list[str] | None = None,
    include_only_patterns: list[str] | None = None,
    **kwargs: Any,
) -> None:
    """Set up a glitchling and register its parameters.

    Args:
        name: Human readable glitchling name.
        corruption_function: Callable used to transform text.
        scope: Text granularity on which the glitchling operates; stored
            as ``self.level``.
        order: Relative ordering within the same scope.
        seed: Optional seed for deterministic random behaviour.
        pipeline_operation: Optional factory for Rust pipeline descriptors.
        transcript_target: Which transcript turns to corrupt. Accepts:
            - ``"last"`` (default): corrupt only the last turn
            - ``"all"``: corrupt all turns
            - ``"assistant"``: corrupt only assistant turns
            - ``"user"``: corrupt only user turns
            - ``int``: corrupt a specific index (negative indexing supported)
            - ``Sequence[int]``: corrupt specific indices
        exclude_patterns: Regex patterns marking text that must not be
            modified by pipeline-backed glitchlings.
        include_only_patterns: Regex patterns restricting corruption to the
            matched regions; text outside these matches is treated as immutable.
        **kwargs: Additional parameters forwarded to the corruption callable.

    """
    # Every glitchling owns a private RNG. ``random.Random(None)`` seeds
    # itself from Python's default entropy source.
    self.seed = seed
    self.rng: random.Random = random.Random(seed)
    self.name: str = name
    self.corruption_function: CorruptionCallable = corruption_function
    self.level: AttackWave = scope
    self.order: AttackOrder = order
    self._pipeline_descriptor_factory = pipeline_operation
    self.transcript_target: TranscriptTarget = transcript_target
    self.kwargs: dict[str, Any] = {}
    self._pipeline: object | None = None

    # Register the caller's extra parameters first, then the two mask
    # settings. The masks are always recorded (as ``None`` when absent) and
    # the incoming lists are copied so later caller mutations do not leak in.
    params = dict(kwargs)
    params.setdefault(
        "exclude_patterns",
        None if exclude_patterns is None else list(exclude_patterns),
    )
    params.setdefault(
        "include_only_patterns",
        None if include_only_patterns is None else list(include_only_patterns),
    )
    for key, value in params.items():
        self.set_param(key, value)
158
+
159
def set_param(self, key: str, value: Any) -> None:
    """Store a parameter under its canonical name and mirror it as attributes.

    ``key`` is translated through the optional ``_param_aliases`` mapping
    (alias -> canonical name). Only the canonical name is kept in
    ``self.kwargs``; the value is additionally set as an attribute under the
    canonical name and under every alias that maps to it. Setting ``seed``
    also calls ``reset_rng`` with the new value.

    Args:
        key: Parameter name, canonical or alias.
        value: Value to store.
    """
    aliases = getattr(self, "_param_aliases", {})
    canonical = aliases.get(key, key)
    linked = [alias for alias, target in aliases.items() if target == canonical]

    # Purge the given key and every alias so only the canonical kwarg is
    # forwarded to the corruption callable.
    for stale in (key, *linked):
        self.kwargs.pop(stale, None)

    self.kwargs[canonical] = value
    setattr(self, canonical, value)

    if canonical == "seed":
        self.reset_rng(value)

    for alias in linked:
        setattr(self, alias, value)
179
+
180
def pipeline_operation(self) -> PipelineOperationPayload | None:
    """Return this glitchling's Rust pipeline descriptor, if it has one.

    The descriptor comes from the ``pipeline_operation`` factory supplied at
    construction time. ``None`` is returned when no factory was supplied or
    when the factory itself returns ``None``; otherwise the mapping is copied
    into a plain ``dict`` and checked for a string ``"type"`` entry.

    Returns:
        A ``dict`` copy of the descriptor, or ``None`` when unavailable.

    Raises:
        TypeError: If the factory returns something that is neither a
            mapping nor ``None``.
        RuntimeError: If the descriptor lacks a string ``"type"`` field.
    """
    build_descriptor = self._pipeline_descriptor_factory
    descriptor = None if build_descriptor is None else build_descriptor(self)
    if descriptor is None:
        return None

    if not isinstance(descriptor, Mapping):  # pragma: no cover - defensive
        raise TypeError("Pipeline descriptor factories must return a mapping or None")

    payload = dict(descriptor)
    if isinstance(payload.get("type"), str):
        return cast(PipelineOperationPayload, payload)

    raise RuntimeError(f"Pipeline descriptor for {self.name} is missing a string 'type' field")
208
+
209
+ def __corrupt(self, text: str, *args: Any, **kwargs: Any) -> str:
210
+ """Execute the corruption callable, injecting the RNG."""
211
+ return self.corruption_function(text, *args, rng=self.rng, **kwargs)
212
+
213
+ def _execute_corruption(self, text: str) -> str:
214
+ """Execute the actual corruption on a single text string.
215
+
216
+ This is the impure execution point that invokes the corruption callable.
217
+ All corruption for this glitchling flows through this single method.
218
+
219
+ Args:
220
+ text: The text to corrupt.
221
+
222
+ Returns:
223
+ The corrupted text.
224
+ """
225
+ call_kwargs = {
226
+ key: value
227
+ for key, value in self.kwargs.items()
228
+ if key not in {"exclude_patterns", "include_only_patterns"}
229
+ }
230
+ return self.__corrupt(text, **call_kwargs)
231
+
232
def corrupt(self, text: str | Transcript) -> str | Transcript:
    """Corrupt a string or the targeted turns of a transcript.

    Strings are corrupted directly. For transcripts a shallow copy of every
    turn is made and only the turns selected by ``transcript_target`` are
    rewritten, and only when their ``"content"`` is a string:

    - ``"last"``: corrupt only the last turn (default)
    - ``"all"``: corrupt all turns
    - ``"assistant"``: corrupt only turns with ``role="assistant"``
    - ``"user"``: corrupt only turns with ``role="user"``
    - ``int``: corrupt a specific turn by index
    - ``Sequence[int]``: corrupt specific turns by index

    Any other input is converted with ``str()`` and corrupted as text.
    """
    # Strings are by far the most common input, so test for them first.
    if isinstance(text, str):
        return self._execute_corruption(text)

    if not _is_transcript(text):
        # Neither a string nor a transcript: fall back to its string form.
        return self._execute_corruption(str(text))

    targets = resolve_transcript_indices(text, self.transcript_target)
    corrupted: list[TranscriptTurn] = [dict(turn) for turn in text]
    for position in targets:
        original_content = text[position].get("content")
        if isinstance(original_content, str):
            corrupted[position]["content"] = self._execute_corruption(original_content)
    return corrupted
262
+
263
def corrupt_dataset(self, dataset: Dataset, columns: list[str]) -> Dataset:
    """Attach a corrupting transform to the given dataset columns.

    The transform is registered with ``dataset.with_transform``. For each
    listed column the value is corrupted as a transcript when it qualifies as
    a non-empty transcript whose turns all carry content, element by element
    when it is any other list, and via ``corrupt`` directly otherwise.

    Args:
        dataset: Dataset to wrap.
        columns: Names of the columns to corrupt.

    Returns:
        The result of ``dataset.with_transform``.
    """
    require_datasets("datasets is not installed")

    def _corrupt_cell(value: Any) -> Any:
        if _is_transcript(value, allow_empty=False, require_all_content=True):
            return self.corrupt(value)
        if isinstance(value, list):
            return [self.corrupt(item) for item in value]
        return self.corrupt(value)

    def _corrupt_row(row: dict[str, Any]) -> dict[str, Any]:
        # Work on a copy so the incoming mapping is never mutated.
        updated = dict(row)
        for column in columns:
            updated[column] = _corrupt_cell(updated[column])
        return updated

    return dataset.with_transform(_corrupt_row)
284
+
285
def __call__(self, text: str, *args: Any, **kwds: Any) -> str | Transcript:
    """Make the glitchling callable by forwarding every argument to ``corrupt``."""
    return self.corrupt(text, *args, **kwds)
288
+
289
+ def reset_rng(self, seed: int | None = None) -> None:
290
+ """Reset the glitchling's RNG to its initial seed."""
291
+ if seed is not None:
292
+ self.seed = seed
293
+ if self.seed is not None:
294
+ self.rng = random.Random(self.seed)
295
+
296
def clone(self, seed: int | None = None) -> "Glitchling":
    """Build a new glitchling of the same class with the same parameters.

    Args:
        seed: Seed for the copy. When omitted, this glitchling's stored seed
            is used.

    Returns:
        A new instance. Plain ``Glitchling`` objects are rebuilt from their
        stored attributes; subclasses are re-instantiated from the stored
        kwargs, and the mask settings, ``seed`` and ``transcript_target`` are
        passed only when the subclass ``__init__`` can accept them.
    """
    cls = self.__class__
    clone_seed = self.seed if seed is None else seed
    init_kwargs = {key: value for key, value in self.kwargs.items() if key != "seed"}

    if cls is Glitchling:
        if clone_seed is not None:
            init_kwargs["seed"] = clone_seed
        return Glitchling(
            self.name,
            self.corruption_function,
            self.level,
            self.order,
            pipeline_operation=self._pipeline_descriptor_factory,
            transcript_target=self.transcript_target,
            **init_kwargs,
        )

    # Find out which keyword arguments the subclass constructor accepts,
    # either as explicit parameters or through a ``**kwargs`` catch-all.
    try:
        params = inspect.signature(cls.__init__).parameters
        has_var_keyword = any(
            param.kind == inspect.Parameter.VAR_KEYWORD for param in params.values()
        )
    except (TypeError, ValueError):
        # The signature cannot be introspected: pass the stored kwargs only,
        # without adding seed or transcript_target.
        return cls(**init_kwargs)

    def _accepted(parameter: str) -> bool:
        return has_var_keyword or parameter in params

    for mask_key in ("exclude_patterns", "include_only_patterns"):
        if not _accepted(mask_key):
            init_kwargs.pop(mask_key, None)

    if clone_seed is not None and _accepted("seed"):
        init_kwargs["seed"] = clone_seed

    if "transcript_target" not in init_kwargs and _accepted("transcript_target"):
        init_kwargs["transcript_target"] = self.transcript_target

    return cls(**init_kwargs)
339
+
340
+
341
+ class Gaggle(Glitchling):
342
+ """A collection of glitchlings executed in a deterministic order."""
343
+
344
def __init__(
    self,
    glitchlings: list[Glitchling],
    seed: int = 151,
    transcript_target: TranscriptTarget = "last",
    exclude_patterns: list[str] | None = None,
    include_only_patterns: list[str] | None = None,
):
    """Clone the members, merge masks, and prepare the execution order.

    Every member is cloned, given the gaggle-level mask patterns merged with
    its own, and tagged with its position as ``_gaggle_index``. The members
    are then planned via ``sort_glitchlings`` and the pipeline cache is built.

    Args:
        glitchlings: Glitchlings to orchestrate.
        seed: Master seed used to derive per-glitchling seeds.
        transcript_target: Which transcript turns to corrupt. Accepts:
            - ``"last"`` (default): corrupt only the last turn
            - ``"all"``: corrupt all turns
            - ``"assistant"``: corrupt only assistant turns
            - ``"user"``: corrupt only user turns
            - ``int``: corrupt a specific index (negative indexing supported)
            - ``Sequence[int]``: corrupt specific indices
        exclude_patterns: Regex patterns that should be treated as immutable for all members.
        include_only_patterns: Regex patterns restricting corruption to the matched regions.

    """
    super().__init__(
        "Gaggle",
        self._corrupt_text,
        AttackWave.DOCUMENT,
        seed=seed,
        transcript_target=transcript_target,
        exclude_patterns=exclude_patterns,
        include_only_patterns=include_only_patterns,
    )

    gaggle_masks = (
        ("exclude_patterns", exclude_patterns),
        ("include_only_patterns", include_only_patterns),
    )
    self._clones_by_index: list[Glitchling] = []
    for position, member in enumerate(glitchlings):
        member_clone = member.clone()
        # Merge both mask kinds before applying either one.
        combined = [
            (key, self._merge_pattern_lists(shared, member_clone.kwargs.get(key)))
            for key, shared in gaggle_masks
        ]
        for key, patterns in combined:
            if patterns is not None:
                member_clone.set_param(key, patterns)
        setattr(member_clone, "_gaggle_index", position)
        self._clones_by_index.append(member_clone)

    # Planning and pipeline state; filled in by the two calls below.
    self.glitchlings: dict[AttackWave, list[Glitchling]] = {level: [] for level in AttackWave}
    self.apply_order: list[Glitchling] = []
    self._plan: list[tuple[int, int]] = []
    self._pipeline_descriptors_cache: list[PipelineDescriptor] | None = None
    self._missing_pipeline_glitchlings: list[Glitchling] = []
    self._cached_include_patterns: list[str] = []
    self._cached_exclude_patterns: list[str] = []
    self.sort_glitchlings()
    self._initialize_pipeline_cache()
402
+
403
def clone(self, seed: int | None = None) -> "Gaggle":
    """Build a new gaggle from clones of the current members.

    Args:
        seed: Master seed for the copy. When omitted, this gaggle's stored
            seed is used, or 151 if none is stored.

    Returns:
        A new ``Gaggle`` with the same transcript target and gaggle-level
        mask patterns.
    """
    if seed is not None:
        master_seed = seed
    elif self.seed is not None:
        master_seed = self.seed
    else:
        master_seed = 151  # Default seed for Gaggle

    members = [member.clone() for member in self._clones_by_index]
    return Gaggle(
        members,
        seed=master_seed,
        transcript_target=self.transcript_target,
        exclude_patterns=self.kwargs.get("exclude_patterns"),
        include_only_patterns=self.kwargs.get("include_only_patterns"),
    )
416
+
417
@staticmethod
def derive_seed(master_seed: int, glitchling_name: str, index: int) -> int:
    """Derive a per-glitchling seed from the master seed, name, and index.

    The master seed is masked with ``SEED_MASK``, XOR-ed with the FNV-1a hash
    of the UTF-8 encoded name and passed through SplitMix64; the masked
    absolute index is then XOR-ed in and the value is mixed once more.

    Args:
        master_seed: Seed shared by the whole gaggle.
        glitchling_name: Name of the glitchling.
        index: Position of the glitchling; only its absolute value is used.

    Returns:
        The derived seed.
    """
    masked_master = master_seed & SEED_MASK
    after_name = _splitmix64(masked_master ^ _fnv1a_hash(glitchling_name.encode("utf-8")))
    return _splitmix64(after_name ^ (abs(index) & SEED_MASK))
435
+
436
def sort_glitchlings(self) -> None:
    """Plan the members and record their application order.

    ``plan_operations`` is asked for a plan over the cloned members. Members
    are then bucketed by wave in ``self.glitchlings``; each planned member
    has its RNG reset to the planned seed and is appended to
    ``self.apply_order`` in plan order.

    Raises:
        RuntimeError: If the plan does not mention every member index.
    """
    members = self._clones_by_index
    plan = plan_operations(members, self.seed)
    self._plan = plan

    self.glitchlings = {level: [] for level in AttackWave}
    for member in members:
        self.glitchlings[member.level].append(member)

    ordered: list[Glitchling] = []
    planned: set[int] = set()
    for index, derived_seed in plan:
        member = members[index]
        member.reset_rng(int(derived_seed))
        ordered.append(member)
        planned.add(index)

    # Every position must appear in the plan under its non-negative index.
    unplanned = sorted(set(range(len(members))) - planned)
    if unplanned:
        missing_indices = ", ".join(str(idx) for idx in unplanned)
        raise RuntimeError(f"Orchestration plan missing glitchlings at indices: {missing_indices}")

    self.apply_order = ordered
459
+
460
+ def _initialize_pipeline_cache(self) -> None:
461
+ self._cached_include_patterns, self._cached_exclude_patterns = (
462
+ self._collect_masking_patterns()
463
+ )
464
+ descriptors, missing = self._pipeline_descriptors()
465
+ self._pipeline_descriptors_cache = descriptors
466
+ self._missing_pipeline_glitchlings = missing
467
+ if missing:
468
+ self._pipeline = None
469
+ return
470
+
471
+ master_seed = self.seed
472
+ if master_seed is None: # pragma: no cover - defensive, should be set by __init__
473
+ message = "Gaggle orchestration requires a master seed"
474
+ raise RuntimeError(message)
475
+
476
+ self._pipeline = build_pipeline_rust(
477
+ descriptors,
478
+ int(master_seed),
479
+ include_only_patterns=self._cached_include_patterns or None,
480
+ exclude_patterns=self._cached_exclude_patterns or None,
481
+ )
482
+
483
+ def _invalidate_pipeline_cache(self) -> None:
484
+ """Clear cached pipeline state so it will be rebuilt on next use."""
485
+ self._pipeline = None
486
+ self._pipeline_descriptors_cache = None
487
+ self._missing_pipeline_glitchlings = []
488
+
489
def _pipeline_descriptors(self) -> tuple[list[PipelineDescriptor], list[Glitchling]]:
    """Build a pipeline descriptor for every member in application order.

    Returns:
        A pair ``(descriptors, missing)``: the mapping form of each
        descriptor that could be built, and the members for which
        ``build_pipeline_descriptor`` returned ``None``.
    """
    resolved: list[PipelineDescriptor] = []
    unsupported: list[Glitchling] = []
    master_seed = self.seed

    for member in self.apply_order:
        built = build_pipeline_descriptor(
            member,
            master_seed=master_seed,
            derive_seed_fn=Gaggle.derive_seed,
        )
        if built is not None:
            resolved.append(built.as_mapping())
        else:
            unsupported.append(member)

    return resolved, unsupported
506
+
507
+ def _corrupt_text(self, text: str, **kwargs: Any) -> str:
508
+ """Apply each glitchling to string input sequentially.
509
+
510
+ This method uses a batched execution strategy to minimize tokenization
511
+ overhead. Consecutive glitchlings with pipeline support are grouped and
512
+ executed together via the Rust pipeline, while glitchlings without
513
+ pipeline support are executed individually.
514
+
515
+ When glitchlings have heterogeneous masks (different include/exclude
516
+ patterns), they are grouped by mask configuration and each group is
517
+ executed with its own patterns. This ensures each glitchling respects
518
+ its intended mask semantics while still batching where possible.
519
+ """
520
+ master_seed = self.seed
521
+ if master_seed is None:
522
+ message = "Gaggle orchestration requires a master seed"
523
+ raise RuntimeError(message)
524
+
525
+ # Check for heterogeneous masks requiring per-group execution
526
+ if self._has_heterogeneous_masks():
527
+ return self._corrupt_text_heterogeneous(text, master_seed)
528
+
529
+ # Homogeneous masks: use unified pipeline
530
+ self._ensure_pipeline_ready()
531
+
532
+ if self._pipeline is not None and not self._missing_pipeline_glitchlings:
533
+ pipeline = cast(Any, self._pipeline)
534
+ return cast(str, pipeline.run(text))
535
+
536
+ # Build the pure execution plan
537
+ plan = build_execution_plan(
538
+ self.apply_order,
539
+ master_seed=master_seed,
540
+ derive_seed_fn=Gaggle.derive_seed,
541
+ )
542
+
543
+ # Execute via the impure dispatch layer
544
+ return execute_plan(
545
+ text,
546
+ plan,
547
+ master_seed,
548
+ include_only_patterns=self._cached_include_patterns,
549
+ exclude_patterns=self._cached_exclude_patterns,
550
+ )
551
+
552
+ def _corrupt_text_heterogeneous(self, text: str, master_seed: int) -> str:
553
+ """Execute glitchlings grouped by mask configuration.
554
+
555
+ This method handles the case where glitchlings have different mask
556
+ patterns. Groups consecutive glitchlings with matching masks and
557
+ executes each group with its specific patterns, chaining results.
558
+
559
+ Performance note: This path builds a pipeline per mask group rather
560
+ than one unified pipeline. For gaggles where all glitchlings share
561
+ the same masks, the unified path is preferred.
562
+ """
563
+ groups = self._group_by_masks()
564
+ result = text
565
+
566
+ for include_patterns, exclude_patterns, glitchlings in groups:
567
+ # Build execution plan for this group
568
+ plan = build_execution_plan(
569
+ glitchlings,
570
+ master_seed=master_seed,
571
+ derive_seed_fn=Gaggle.derive_seed,
572
+ )
573
+
574
+ # Execute with group-specific masks
575
+ result = execute_plan(
576
+ result,
577
+ plan,
578
+ master_seed,
579
+ include_only_patterns=include_patterns or [],
580
+ exclude_patterns=exclude_patterns or [],
581
+ )
582
+
583
+ return result
584
+
585
def corrupt(self, text: str | Transcript) -> str | Transcript:
    """Run the whole gaggle over a string or the targeted transcript turns.

    Strings go straight to ``_corrupt_text``. For transcripts a shallow copy
    of every turn is made and only the turns selected by
    ``transcript_target`` are rewritten, and only when their ``"content"`` is
    a string:

    - ``"last"``: corrupt only the last turn (default)
    - ``"all"``: corrupt all turns
    - ``"assistant"``: corrupt only turns with ``role="assistant"``
    - ``"user"``: corrupt only turns with ``role="user"``
    - ``int``: corrupt a specific turn by index
    - ``Sequence[int]``: corrupt specific turns by index

    Any other input is converted with ``str()`` and corrupted as text.
    """
    # Strings are by far the most common input, so test for them first.
    if isinstance(text, str):
        return self._corrupt_text(text)

    if not _is_transcript(text):
        # Neither a string nor a transcript: fall back to its string form.
        return self._corrupt_text(str(text))

    targets = resolve_transcript_indices(text, self.transcript_target)
    corrupted: list[TranscriptTurn] = [dict(turn) for turn in text]
    for position in targets:
        original_content = text[position].get("content")
        if isinstance(original_content, str):
            corrupted[position]["content"] = self._corrupt_text(original_content)
    return corrupted
615
+
616
def corrupt_dataset(self, dataset: Dataset, columns: list[str]) -> Dataset:
    """Corrupt dataset columns, batching through the pipeline when possible.

    When a compiled pipeline exists and no member lacks a descriptor, the
    dataset is processed with ``dataset.map(..., batched=True)``: columns
    whose batch holds only strings go through ``corrupt_batch``, and other
    batches are handled value by value. Without a usable pipeline the base
    class row-by-row implementation is used instead.

    Args:
        dataset: The HuggingFace Dataset to corrupt.
        columns: List of column names to corrupt.

    Returns:
        A new dataset with the specified columns corrupted.
    """
    require_datasets("datasets is not installed")

    self._ensure_pipeline_ready()
    if self._pipeline is None or self._missing_pipeline_glitchlings:
        # No usable pipeline: defer to the base class implementation.
        return super().corrupt_dataset(dataset, columns)

    def _corrupt_value(value: Any) -> Any:
        """Corrupt one cell of a batch that is not purely strings."""
        if _is_transcript(value, allow_empty=False, require_all_content=True):
            return self.corrupt(value)
        if isinstance(value, list) and all(isinstance(item, str) for item in value):
            return self.corrupt_batch(value)
        if isinstance(value, str):
            return self._corrupt_text(value)
        # Anything else is passed through untouched.
        return value

    def _corrupt_batch(batch: dict[str, list[Any]]) -> dict[str, list[Any]]:
        updated = dict(batch)
        for column in columns:
            values = batch[column]
            if not values:
                continue
            if all(isinstance(entry, str) for entry in values):
                # Pure string column: hand the whole batch over at once.
                updated[column] = self.corrupt_batch(values)
            else:
                updated[column] = [_corrupt_value(entry) for entry in values]
        return updated

    return dataset.map(_corrupt_batch, batched=True)
669
+
670
+ @staticmethod
671
+ def _merge_pattern_lists(base: list[str] | None, extra: list[str] | None) -> list[str] | None:
672
+ if base is None and extra is None:
673
+ return None
674
+
675
+ merged: list[str] = []
676
+ for source in (base, extra):
677
+ if source is None:
678
+ continue
679
+ for pattern in source:
680
+ if pattern not in merged:
681
+ merged.append(pattern)
682
+ return merged
683
+
684
+ def _collect_masking_patterns(self) -> tuple[list[str], list[str]]:
685
+ def _extend_unique(target: list[str], source: list[str] | None) -> None:
686
+ if not source:
687
+ return
688
+ for pattern in source:
689
+ if pattern not in target:
690
+ target.append(pattern)
691
+
692
+ include_patterns: list[str] = []
693
+ exclude_patterns: list[str] = []
694
+
695
+ _extend_unique(include_patterns, self.kwargs.get("include_only_patterns"))
696
+ _extend_unique(exclude_patterns, self.kwargs.get("exclude_patterns"))
697
+
698
+ for clone in self._clones_by_index:
699
+ _extend_unique(include_patterns, clone.kwargs.get("include_only_patterns"))
700
+ _extend_unique(exclude_patterns, clone.kwargs.get("exclude_patterns"))
701
+
702
+ return include_patterns, exclude_patterns
703
+
704
+ def _has_heterogeneous_masks(self) -> bool:
705
+ """Check if glitchlings have different individual mask configurations.
706
+
707
+ Returns True when per-glitchling masks differ, requiring sequential
708
+ execution with individual mask application rather than batched pipeline.
709
+
710
+ Gaggle-level masks are applied uniformly and don't cause heterogeneity.
711
+ Only per-glitchling differences trigger this fallback.
712
+ """
713
+ if len(self._clones_by_index) <= 1:
714
+ return False
715
+
716
+ def _normalize(patterns: list[str] | None) -> tuple[str, ...]:
717
+ if not patterns:
718
+ return ()
719
+ return tuple(sorted(patterns))
720
+
721
+ first_include = _normalize(self._clones_by_index[0].kwargs.get("include_only_patterns"))
722
+ first_exclude = _normalize(self._clones_by_index[0].kwargs.get("exclude_patterns"))
723
+
724
+ for clone in self._clones_by_index[1:]:
725
+ clone_include = _normalize(clone.kwargs.get("include_only_patterns"))
726
+ clone_exclude = _normalize(clone.kwargs.get("exclude_patterns"))
727
+ if clone_include != first_include or clone_exclude != first_exclude:
728
+ return True
729
+
730
+ return False
731
+
732
@staticmethod
def _mask_key(glitchling: Glitchling) -> tuple[tuple[str, ...], tuple[str, ...]]:
    """Build a hashable, order-insensitive key for a glitchling's masks.

    Returns:
        ``(include, exclude)`` where each element is the sorted tuple of the
        corresponding patterns, or an empty tuple when none are set.
    """
    mask_kwargs = glitchling.kwargs
    include_key = tuple(sorted(mask_kwargs.get("include_only_patterns") or ()))
    exclude_key = tuple(sorted(mask_kwargs.get("exclude_patterns") or ()))
    return include_key, exclude_key
741
+
742
+ def _group_by_masks(
743
+ self,
744
+ ) -> list[tuple[list[str] | None, list[str] | None, list[Glitchling]]]:
745
+ """Group glitchlings by their mask configuration, preserving execution order.
746
+
747
+ Returns a list of (include_patterns, exclude_patterns, glitchlings) tuples.
748
+ Consecutive glitchlings with the same mask are grouped together for batching.
749
+ """
750
+ if not self.apply_order:
751
+ return []
752
+
753
+ groups: list[tuple[list[str] | None, list[str] | None, list[Glitchling]]] = []
754
+ current_key: tuple[tuple[str, ...], tuple[str, ...]] | None = None
755
+ current_group: list[Glitchling] = []
756
+
757
+ for glitchling in self.apply_order:
758
+ key = self._mask_key(glitchling)
759
+ if key != current_key:
760
+ if current_group and current_key is not None:
761
+ include = list(current_key[0]) if current_key[0] else None
762
+ exclude = list(current_key[1]) if current_key[1] else None
763
+ groups.append((include, exclude, current_group))
764
+ current_key = key
765
+ current_group = [glitchling]
766
+ else:
767
+ current_group.append(glitchling)
768
+
769
+ if current_group and current_key is not None:
770
+ include = list(current_key[0]) if current_key[0] else None
771
+ exclude = list(current_key[1]) if current_key[1] else None
772
+ groups.append((include, exclude, current_group))
773
+
774
+ return groups
775
+
776
+ def _ensure_pipeline_ready(self) -> None:
777
+ """Ensure the pipeline cache is initialized and patterns are current."""
778
+ master_seed = self.seed
779
+ if master_seed is None:
780
+ message = "Gaggle orchestration requires a master seed"
781
+ raise RuntimeError(message)
782
+
783
+ include_patterns, exclude_patterns = self._collect_masking_patterns()
784
+ if (
785
+ include_patterns != self._cached_include_patterns
786
+ or exclude_patterns != self._cached_exclude_patterns
787
+ ):
788
+ self._cached_include_patterns = include_patterns
789
+ self._cached_exclude_patterns = exclude_patterns
790
+ self._pipeline = None
791
+ self._pipeline_descriptors_cache = None
792
+ self._missing_pipeline_glitchlings = []
793
+
794
+ if self._pipeline is None and not self._missing_pipeline_glitchlings:
795
+ self._initialize_pipeline_cache()
796
+
797
+ def _can_use_batch_pipeline(self) -> bool:
798
+ """Return True if all glitchlings support the Rust pipeline."""
799
+ self._ensure_pipeline_ready()
800
+ return self._pipeline is not None and not self._missing_pipeline_glitchlings
801
+
802
def corrupt_batch(self, texts: Sequence[str]) -> list[str]:
    """Corrupt several texts at once, in parallel via Rust when possible.

    If every glitchling supports the Rust pipeline and all of them share
    one mask configuration, the whole batch is handed to the pipeline's
    ``run_batch``, which releases the GIL and processes the texts
    concurrently using rayon. This is considerably faster for large
    batches than corrupting them one by one.

    If the glitchlings carry heterogeneous masks, or any of them needs the
    Python fallback, each text is corrupted sequentially instead.

    Args:
        texts: Sequence of text strings to corrupt.

    Returns:
        The corrupted texts, in the same order as ``texts``. An empty
        input yields an empty list.

    Example:
        >>> gaggle = Gaggle([Typogre(rate=0.05), Mim1c(rate=0.01)], seed=42)
        >>> results = gaggle.corrupt_batch(["Hello world", "How are you?"])
    """
    if not texts:
        return []

    # Differing per-glitchling masks rule out the batched pipeline, so the
    # pipeline state is only refreshed when masks are homogeneous.
    use_rust_batch = False
    if not self._has_heterogeneous_masks():
        self._ensure_pipeline_ready()
        use_rust_batch = self._pipeline is not None and not self._missing_pipeline_glitchlings

    if use_rust_batch:
        # Fast path: one call into the parallel Rust pipeline.
        rust_pipeline = cast(Any, self._pipeline)
        return cast(list[str], rust_pipeline.run_batch(list(texts)))

    # Slow path: corrupt each text on its own.
    return [self._corrupt_text(text) for text in texts]
839
+
840
+
841
# Names exposed by ``from <this module> import *``.
__all__ = [
    # Enumerations
    "AttackWave",
    "AttackOrder",
    # Core classes
    "Glitchling",
    "Gaggle",
    # Planning exports
    "plan_operations",
    "PipelineOperationPayload",
    "PipelineDescriptor",
]