glitchlings 0.10.2__cp312-cp312-macosx_11_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of glitchlings might be problematic. Click here for more details.

Files changed (83) hide show
  1. glitchlings/__init__.py +99 -0
  2. glitchlings/__main__.py +8 -0
  3. glitchlings/_zoo_rust/__init__.py +12 -0
  4. glitchlings/_zoo_rust.cpython-312-darwin.so +0 -0
  5. glitchlings/assets/__init__.py +180 -0
  6. glitchlings/assets/apostrofae_pairs.json +32 -0
  7. glitchlings/assets/ekkokin_homophones.json +2014 -0
  8. glitchlings/assets/hokey_assets.json +193 -0
  9. glitchlings/assets/lexemes/academic.json +1049 -0
  10. glitchlings/assets/lexemes/colors.json +1333 -0
  11. glitchlings/assets/lexemes/corporate.json +716 -0
  12. glitchlings/assets/lexemes/cyberpunk.json +22 -0
  13. glitchlings/assets/lexemes/lovecraftian.json +23 -0
  14. glitchlings/assets/lexemes/synonyms.json +3354 -0
  15. glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
  16. glitchlings/assets/ocr_confusions.tsv +30 -0
  17. glitchlings/assets/pipeline_assets.json +29 -0
  18. glitchlings/attack/__init__.py +147 -0
  19. glitchlings/attack/analysis.py +1321 -0
  20. glitchlings/attack/core.py +493 -0
  21. glitchlings/attack/core_execution.py +367 -0
  22. glitchlings/attack/core_planning.py +612 -0
  23. glitchlings/attack/encode.py +114 -0
  24. glitchlings/attack/metrics.py +218 -0
  25. glitchlings/attack/metrics_dispatch.py +70 -0
  26. glitchlings/attack/tokenization.py +227 -0
  27. glitchlings/auggie.py +284 -0
  28. glitchlings/compat/__init__.py +9 -0
  29. glitchlings/compat/loaders.py +355 -0
  30. glitchlings/compat/types.py +41 -0
  31. glitchlings/conf/__init__.py +41 -0
  32. glitchlings/conf/loaders.py +331 -0
  33. glitchlings/conf/schema.py +156 -0
  34. glitchlings/conf/types.py +72 -0
  35. glitchlings/config.toml +2 -0
  36. glitchlings/constants.py +59 -0
  37. glitchlings/dev/__init__.py +3 -0
  38. glitchlings/dev/docs.py +45 -0
  39. glitchlings/dlc/__init__.py +19 -0
  40. glitchlings/dlc/_shared.py +296 -0
  41. glitchlings/dlc/gutenberg.py +400 -0
  42. glitchlings/dlc/huggingface.py +68 -0
  43. glitchlings/dlc/prime.py +215 -0
  44. glitchlings/dlc/pytorch.py +98 -0
  45. glitchlings/dlc/pytorch_lightning.py +173 -0
  46. glitchlings/internal/__init__.py +16 -0
  47. glitchlings/internal/rust.py +159 -0
  48. glitchlings/internal/rust_ffi.py +490 -0
  49. glitchlings/main.py +426 -0
  50. glitchlings/protocols.py +91 -0
  51. glitchlings/runtime_config.py +24 -0
  52. glitchlings/util/__init__.py +27 -0
  53. glitchlings/util/adapters.py +65 -0
  54. glitchlings/util/keyboards.py +356 -0
  55. glitchlings/util/transcripts.py +108 -0
  56. glitchlings/zoo/__init__.py +161 -0
  57. glitchlings/zoo/assets/__init__.py +29 -0
  58. glitchlings/zoo/core.py +678 -0
  59. glitchlings/zoo/core_execution.py +154 -0
  60. glitchlings/zoo/core_planning.py +451 -0
  61. glitchlings/zoo/corrupt_dispatch.py +295 -0
  62. glitchlings/zoo/hokey.py +139 -0
  63. glitchlings/zoo/jargoyle.py +243 -0
  64. glitchlings/zoo/mim1c.py +148 -0
  65. glitchlings/zoo/pedant/__init__.py +109 -0
  66. glitchlings/zoo/pedant/core.py +105 -0
  67. glitchlings/zoo/pedant/forms.py +74 -0
  68. glitchlings/zoo/pedant/stones.py +74 -0
  69. glitchlings/zoo/redactyl.py +97 -0
  70. glitchlings/zoo/rng.py +259 -0
  71. glitchlings/zoo/rushmore.py +416 -0
  72. glitchlings/zoo/scannequin.py +66 -0
  73. glitchlings/zoo/transforms.py +346 -0
  74. glitchlings/zoo/typogre.py +128 -0
  75. glitchlings/zoo/validation.py +477 -0
  76. glitchlings/zoo/wherewolf.py +120 -0
  77. glitchlings/zoo/zeedub.py +93 -0
  78. glitchlings-0.10.2.dist-info/METADATA +337 -0
  79. glitchlings-0.10.2.dist-info/RECORD +83 -0
  80. glitchlings-0.10.2.dist-info/WHEEL +5 -0
  81. glitchlings-0.10.2.dist-info/entry_points.txt +3 -0
  82. glitchlings-0.10.2.dist-info/licenses/LICENSE +201 -0
  83. glitchlings-0.10.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,678 @@
1
+ """Core data structures used to model glitchlings and their interactions."""
2
+
3
+ import inspect
4
+ import random
5
+ from collections.abc import Mapping, Sequence
6
+ from enum import IntEnum, auto
7
+ from hashlib import blake2s
8
+ from typing import TYPE_CHECKING, Any, Callable, Protocol, cast
9
+
10
+ from glitchlings.internal.rust_ffi import build_pipeline_rust, plan_glitchlings_rust
11
+
12
+ from ..compat.loaders import get_datasets_dataset, require_datasets
13
+ from ..compat.types import Dataset as DatasetProtocol
14
+ from ..util.transcripts import (
15
+ Transcript,
16
+ TranscriptTarget,
17
+ is_transcript,
18
+ )
19
+ from .core_execution import execute_plan
20
+ from .core_planning import (
21
+ PipelineDescriptor,
22
+ PipelineOperationPayload,
23
+ build_execution_plan,
24
+ build_pipeline_descriptor,
25
+ normalize_plan_entries,
26
+ )
27
+ from .core_planning import (
28
+ PlanEntry as _PlanEntry,
29
+ )
30
+ from .corrupt_dispatch import (
31
+ StringCorruptionTarget,
32
+ assemble_corruption_result,
33
+ resolve_corruption_target,
34
+ )
35
+
36
# Resolved once at import time. The value may be ``None`` (the ``Dataset``
# binding further down checks for that) -- presumably when the optional
# ``datasets`` package is not installed; confirm in ``compat.loaders``.
_DatasetsDataset = get_datasets_dataset()

# Module-private alias for the transcript predicate used by
# ``Glitchling.corrupt_dataset``.
_is_transcript = is_transcript
39
+
40
+
41
def plan_glitchlings(
    entries: Sequence[_PlanEntry],
    master_seed: int | None,
) -> list[tuple[int, int]]:
    """Compute an orchestration plan for glitchling instances or specs.

    Every entry is normalised with ``normalize_plan_entries``, converted to a
    mapping, and handed to the Rust planner together with the master seed.
    ``Gaggle.sort_glitchlings`` consumes the result as
    ``(entry_index, derived_seed)`` pairs.

    Notes
    -----
    The Rust extension is required for orchestration.

    Raises
    ------
    ValueError
        If ``master_seed`` is ``None``.
    """
    if master_seed is None:
        raise ValueError("Gaggle orchestration requires a master seed")

    spec_mappings = []
    for spec in normalize_plan_entries(entries):
        spec_mappings.append(spec.as_mapping())

    return plan_glitchlings_rust(spec_mappings, int(master_seed))
58
+
59
+
60
# Static type checkers see the real ``datasets.Dataset``; at runtime the name
# is bound to the resolved class when one is available and to the structural
# ``DatasetProtocol`` stand-in otherwise.
if TYPE_CHECKING:  # pragma: no cover - typing only
    from datasets import Dataset
else:
    Dataset = DatasetProtocol if _DatasetsDataset is None else _DatasetsDataset
66
+
67
+
68
class CorruptionCallable(Protocol):
    """Structural type for callables that turn input text into corrupted text.

    The text is the first positional argument; implementations may accept any
    further positional or keyword arguments and must return a string.
    """

    def __call__(self, text: str, *args: Any, **kwargs: Any) -> str:
        ...
72
+
73
+
74
class AttackWave(IntEnum):
    """Granularity of text that a glitchling corrupts.

    Members are declared from the coarsest level (document) down to the
    finest (character) so that their integer values enforce a sort order.
    The order matters: duplicating a word and then adding a typo is
    potentially different from adding a typo and then duplicating the word.
    """

    DOCUMENT = auto()
    PARAGRAPH = auto()
    SENTENCE = auto()
    WORD = auto()
    CHARACTER = auto()
86
+
87
+
88
class AttackOrder(IntEnum):
    """Relative execution order for glitchlings within the same wave.

    This is a modifier that only distinguishes glitchlings sharing an
    ``AttackWave``; members are declared from earliest to latest.
    """

    FIRST = auto()
    EARLY = auto()
    NORMAL = auto()
    LATE = auto()
    LAST = auto()
97
+
98
+
99
class Glitchling:
    """A single text corruption agent with deterministic behaviour.

    A glitchling bundles a corruption callable with its scheduling metadata
    (``level`` and ``order``), a private ``random.Random`` instance, and the
    ``kwargs`` that are forwarded to the callable on every invocation.
    """

    def __init__(
        self,
        name: str,
        corruption_function: CorruptionCallable,
        scope: AttackWave,
        order: AttackOrder = AttackOrder.NORMAL,
        seed: int | None = None,
        pipeline_operation: Callable[["Glitchling"], Mapping[str, Any] | None] | None = None,
        transcript_target: TranscriptTarget = "last",
        exclude_patterns: list[str] | None = None,
        include_only_patterns: list[str] | None = None,
        **kwargs: Any,
    ) -> None:
        """Initialize a glitchling.

        Args:
            name: Human readable glitchling name.
            corruption_function: Callable used to transform text.
            scope: Text granularity on which the glitchling operates
                (stored as ``self.level``).
            order: Relative ordering within the same scope.
            seed: Optional seed for deterministic random behaviour.
            pipeline_operation: Optional factory for Rust pipeline descriptors.
            transcript_target: Which transcript turns to corrupt. Accepts:
                - ``"last"`` (default): corrupt only the last turn
                - ``"all"``: corrupt all turns
                - ``"assistant"``: corrupt only assistant turns
                - ``"user"``: corrupt only user turns
                - ``int``: corrupt a specific index (negative indexing supported)
                - ``Sequence[int]``: corrupt specific indices
            exclude_patterns: Regex patterns marking text that must not be
                modified by pipeline-backed glitchlings.
            include_only_patterns: Regex patterns restricting corruption to the
                matched regions; text outside these matches is treated as immutable.
            **kwargs: Additional parameters forwarded to the corruption callable.

        """
        # Each Glitchling maintains its own RNG for deterministic yet isolated behavior.
        # If no seed is supplied, we fall back to Python's default entropy.
        self.seed = seed
        self.rng: random.Random = random.Random(seed)
        self.name: str = name
        self.corruption_function: CorruptionCallable = corruption_function
        self.level: AttackWave = scope
        self.order: AttackOrder = order
        self._pipeline_descriptor_factory = pipeline_operation
        self.transcript_target: TranscriptTarget = transcript_target
        self.kwargs: dict[str, Any] = {}
        # Memo for ``_corruption_expects_rng``: the callable that was last
        # inspected and the answer obtained for it.
        self._cached_rng_callable: CorruptionCallable | None = None
        self._cached_rng_expectation: bool | None = None
        self._pipeline: object | None = None

        # Both mask parameters are always recorded (as ``None`` when unset) so
        # they are present in ``self.kwargs``; list values are copied so the
        # caller's lists are never shared.
        mask_kwargs = dict(kwargs)
        if "exclude_patterns" not in mask_kwargs:
            mask_kwargs["exclude_patterns"] = (
                list(exclude_patterns) if exclude_patterns is not None else None
            )
        if "include_only_patterns" not in mask_kwargs:
            mask_kwargs["include_only_patterns"] = (
                list(include_only_patterns) if include_only_patterns is not None else None
            )
        for kw, val in mask_kwargs.items():
            self.set_param(kw, val)

    def set_param(self, key: str, value: Any) -> None:
        """Persist a parameter for use by the corruption callable.

        The value is stored in ``self.kwargs`` under its canonical name and
        mirrored as an instance attribute under the canonical name and every
        alias that maps to it. Aliases come from an optional
        ``_param_aliases`` mapping (alias -> canonical name) on the instance
        or class. Setting ``"seed"`` also reseeds the RNG.

        Args:
            key: Parameter name or one of its aliases.
            value: Value to store.
        """
        aliases = getattr(self, "_param_aliases", {})
        canonical = aliases.get(key, key)

        # Drop stale alias keys so we only forward canonical kwargs.
        self.kwargs.pop(key, None)
        for alias, target in aliases.items():
            if target == canonical:
                self.kwargs.pop(alias, None)

        self.kwargs[canonical] = value
        setattr(self, canonical, value)

        if canonical == "seed":
            self.reset_rng(value)

        for alias, target in aliases.items():
            if target == canonical:
                setattr(self, alias, value)

    def pipeline_operation(self) -> PipelineOperationPayload | None:
        """Return the Rust pipeline descriptor or ``None`` when unavailable.

        Glitchlings that cannot provide a compiled pipeline (for example the
        lightweight helpers used in tests) should override this hook or supply
        a ``pipeline_operation`` factory that returns ``None`` to indicate that
        Python orchestration must be used instead. When a descriptor mapping is
        returned it is validated and forwarded to the Rust pipeline.

        Raises:
            TypeError: If the factory returns something that is neither a
                mapping nor ``None``.
            RuntimeError: If the descriptor has no string ``"type"`` entry.
        """
        factory = self._pipeline_descriptor_factory
        if factory is None:
            return None

        descriptor = factory(self)
        if descriptor is None:
            return None

        if not isinstance(descriptor, Mapping):  # pragma: no cover - defensive
            raise TypeError("Pipeline descriptor factories must return a mapping or None")

        # Copy so the factory's mapping is never handed out directly.
        payload = dict(descriptor)
        payload_type = payload.get("type")
        if not isinstance(payload_type, str):
            message = f"Pipeline descriptor for {self.name} is missing a string 'type' field"
            raise RuntimeError(message)

        return cast(PipelineOperationPayload, payload)

    def _corruption_expects_rng(self) -> bool:
        """Return `True` when the corruption function accepts an rng keyword.

        The answer is memoised per callable object, so the signature is only
        re-inspected after ``self.corruption_function`` has been replaced.
        Callables whose signature cannot be introspected are treated as not
        accepting ``rng``.
        """
        cached_callable = self._cached_rng_callable
        cached_expectation = self._cached_rng_expectation
        corruption_function = self.corruption_function

        if cached_callable is corruption_function and cached_expectation is not None:
            return cached_expectation

        expects_rng = False
        try:
            signature = inspect.signature(corruption_function)
        except (TypeError, ValueError):
            signature = None

        if signature is not None:
            expects_rng = "rng" in signature.parameters

        self._cached_rng_callable = corruption_function
        self._cached_rng_expectation = expects_rng
        return expects_rng

    def __corrupt(self, text: str, *args: Any, **kwargs: Any) -> str:
        """Execute the corruption callable, injecting the RNG when required."""
        # Pass rng to underlying corruption function if it expects it.
        expects_rng = self._corruption_expects_rng()

        if expects_rng:
            corrupted = self.corruption_function(text, *args, rng=self.rng, **kwargs)
        else:
            corrupted = self.corruption_function(text, *args, **kwargs)
        return corrupted

    def _execute_corruption(self, text: str) -> str:
        """Execute the actual corruption on a single text string.

        This is the impure execution point that invokes the corruption callable.
        All corruption for this glitchling flows through this single method.
        The masking parameters (``exclude_patterns`` and
        ``include_only_patterns``) are not forwarded to the callable.

        Args:
            text: The text to corrupt.

        Returns:
            The corrupted text.
        """
        call_kwargs = {
            key: value
            for key, value in self.kwargs.items()
            if key not in {"exclude_patterns", "include_only_patterns"}
        }
        return self.__corrupt(text, **call_kwargs)

    def corrupt(self, text: str | Transcript) -> str | Transcript:
        """Apply the corruption function to text or conversational transcripts.

        This method uses a pure dispatch pattern:
        1. Resolve the corruption target (pure - what to corrupt)
        2. Execute corruption (impure - single isolated point)
        3. Assemble the result (pure - combine results)

        When the input is a transcript, the ``transcript_target`` setting
        controls which turns are corrupted:

        - ``"last"``: corrupt only the last turn (default)
        - ``"all"``: corrupt all turns
        - ``"assistant"``: corrupt only turns with ``role="assistant"``
        - ``"user"``: corrupt only turns with ``role="user"``
        - ``int``: corrupt a specific turn by index
        - ``Sequence[int]``: corrupt specific turns by index
        """
        # Step 1: Pure dispatch - determine what to corrupt
        target = resolve_corruption_target(text, self.transcript_target)

        # Step 2: Impure execution - apply corruption via isolated method
        if isinstance(target, StringCorruptionTarget):
            corrupted: str | dict[int, str] = self._execute_corruption(target.text)
        else:
            # TranscriptCorruptionTarget: map each selected turn index to its
            # corrupted content.
            corrupted = {
                turn.index: self._execute_corruption(turn.content) for turn in target.turns
            }

        # Step 3: Pure assembly - combine results
        return assemble_corruption_result(target, corrupted)

    def corrupt_dataset(self, dataset: Dataset, columns: list[str]) -> Dataset:
        """Apply corruption lazily across dataset columns.

        The corruption is registered through ``dataset.with_transform`` rather
        than applied eagerly here.

        Args:
            dataset: Dataset whose rows should be corrupted.
            columns: Names of the columns to corrupt; each must be present in
                the rows handed to the transform.

        Returns:
            The object returned by ``dataset.with_transform``.
        """
        require_datasets("datasets is not installed")

        def __corrupt_row(row: dict[str, Any]) -> dict[str, Any]:
            # Work on a shallow copy so the incoming mapping is not mutated.
            row = dict(row)
            for column in columns:
                value = row[column]
                if _is_transcript(
                    value,
                    allow_empty=False,
                    require_all_content=True,
                ):
                    # A single transcript: corrupt it as one unit.
                    row[column] = self.corrupt(value)
                elif isinstance(value, list):
                    # Any other list: corrupt each element independently.
                    row[column] = [self.corrupt(item) for item in value]
                else:
                    row[column] = self.corrupt(value)
            return row

        return dataset.with_transform(__corrupt_row)

    def __call__(self, text: str | Transcript, *args: Any, **kwds: Any) -> str | Transcript:
        """Allow a glitchling to be invoked directly like a callable.

        Extra arguments are forwarded unchanged to ``corrupt``.
        """
        return self.corrupt(text, *args, **kwds)

    def reset_rng(self, seed: int | None = None) -> None:
        """Re-create the RNG from the stored seed, optionally replacing it first.

        Args:
            seed: When given, becomes the new ``self.seed`` before the RNG is
                rebuilt. When omitted, the current ``self.seed`` is reused.

        If no seed is available at all (both ``seed`` and ``self.seed`` are
        ``None``) the existing RNG is left untouched.
        """
        if seed is not None:
            self.seed = seed
        if self.seed is not None:
            self.rng = random.Random(self.seed)

    def clone(self, seed: int | None = None) -> "Glitchling":
        """Create a copy of this glitchling, optionally with a new seed.

        Plain ``Glitchling`` instances are rebuilt from their stored
        attributes. Subclasses are rebuilt by calling their constructor with
        the stored ``kwargs``; ``seed``, ``transcript_target`` and the masking
        patterns are only passed when the subclass constructor accepts them.

        Args:
            seed: Seed for the clone; defaults to this glitchling's seed.

        Returns:
            A new instance of the same class.
        """
        cls = self.__class__
        filtered_kwargs = {k: v for k, v in self.kwargs.items() if k != "seed"}
        clone_seed = seed if seed is not None else self.seed

        if cls is Glitchling:
            if clone_seed is not None:
                filtered_kwargs["seed"] = clone_seed
            # ``set_param("transcript_target", ...)`` stores the value in
            # ``self.kwargs`` as well as on the attribute. Passing it both as
            # an explicit keyword and through ``**filtered_kwargs`` would raise
            # ``TypeError`` (duplicate keyword argument), so it is routed
            # through the kwargs dict exactly once, mirroring the subclass
            # branch below.
            filtered_kwargs.setdefault("transcript_target", self.transcript_target)
            return Glitchling(
                self.name,
                self.corruption_function,
                self.level,
                self.order,
                pipeline_operation=self._pipeline_descriptor_factory,
                **filtered_kwargs,
            )

        # Check which kwargs subclass accepts via **kwargs or explicit params
        try:
            signature = inspect.signature(cls.__init__)
            params = signature.parameters
            has_var_keyword = any(p.kind == inspect.Parameter.VAR_KEYWORD for p in params.values())
        except (TypeError, ValueError):
            # If we can't introspect, play it safe and pass nothing extra
            return cls(**filtered_kwargs)

        # Masking patterns are always present in ``self.kwargs``; drop them
        # when the subclass constructor has no way to receive them.
        for key in ("exclude_patterns", "include_only_patterns"):
            if key in filtered_kwargs and not (has_var_keyword or key in params):
                filtered_kwargs.pop(key)

        # Only include seed if subclass accepts it
        if clone_seed is not None:
            if has_var_keyword or "seed" in params:
                filtered_kwargs["seed"] = clone_seed

        # Only include transcript_target if subclass accepts it
        if "transcript_target" not in filtered_kwargs:
            if has_var_keyword or "transcript_target" in params:
                filtered_kwargs["transcript_target"] = self.transcript_target

        return cls(**filtered_kwargs)
375
+
376
+
377
class Gaggle(Glitchling):
    """A collection of glitchlings executed in a deterministic order.

    The gaggle clones every member it is given, asks the planner for an
    application order plus one derived seed per member, and runs the members
    through a compiled Rust pipeline when every member can describe itself
    to it, or through the Python execution plan otherwise.
    """

    def __init__(
        self,
        glitchlings: list[Glitchling],
        seed: int = 151,
        transcript_target: TranscriptTarget = "last",
        exclude_patterns: list[str] | None = None,
        include_only_patterns: list[str] | None = None,
    ):
        """Initialize the gaggle and derive per-glitchling RNG seeds.

        Args:
            glitchlings: Glitchlings to orchestrate. Each one is cloned; the
                originals are not modified.
            seed: Master seed used to derive per-glitchling seeds.
            transcript_target: Which transcript turns to corrupt. Accepts:
                - ``"last"`` (default): corrupt only the last turn
                - ``"all"``: corrupt all turns
                - ``"assistant"``: corrupt only assistant turns
                - ``"user"``: corrupt only user turns
                - ``int``: corrupt a specific index (negative indexing supported)
                - ``Sequence[int]``: corrupt specific indices
            exclude_patterns: Regex patterns that should be treated as immutable for all members.
            include_only_patterns: Regex patterns restricting corruption to the matched regions.

        """
        super().__init__(
            "Gaggle",
            self._corrupt_text,
            AttackWave.DOCUMENT,
            seed=seed,
            transcript_target=transcript_target,
            exclude_patterns=exclude_patterns,
            include_only_patterns=include_only_patterns,
        )

        # Members are stored in the order they were supplied; the position is
        # also recorded on each clone as ``_gaggle_index``.
        self._clones_by_index: list[Glitchling] = []
        for position, original in enumerate(glitchlings):
            member = original.clone()
            member_kwargs = member.kwargs
            # Gaggle-level patterns come first, then the member's own.
            combined_exclude = self._merge_pattern_lists(
                exclude_patterns, member_kwargs.get("exclude_patterns")
            )
            combined_include = self._merge_pattern_lists(
                include_only_patterns, member_kwargs.get("include_only_patterns")
            )
            if combined_exclude is not None:
                member.set_param("exclude_patterns", combined_exclude)
            if combined_include is not None:
                member.set_param("include_only_patterns", combined_include)
            setattr(member, "_gaggle_index", position)
            self._clones_by_index.append(member)

        self.glitchlings: dict[AttackWave, list[Glitchling]] = {wave: [] for wave in AttackWave}
        self.apply_order: list[Glitchling] = []
        self._plan: list[tuple[int, int]] = []
        self._pipeline_descriptors_cache: list[PipelineDescriptor] | None = None
        self._missing_pipeline_glitchlings: list[Glitchling] = []
        self._cached_include_patterns: list[str] = []
        self._cached_exclude_patterns: list[str] = []

        self.sort_glitchlings()
        self._initialize_pipeline_cache()

    def clone(self, seed: int | None = None) -> "Gaggle":
        """Create a copy of this gaggle, cloning member glitchlings.

        Args:
            seed: Master seed for the copy. Falls back to this gaggle's seed,
                and to 151 when that is unset as well.
        """
        if seed is not None:
            master_seed = seed
        elif self.seed is not None:
            master_seed = self.seed
        else:
            master_seed = 151  # Default seed for Gaggle

        member_copies = [member.clone() for member in self._clones_by_index]
        own_kwargs = self.kwargs
        return Gaggle(
            member_copies,
            seed=master_seed,
            transcript_target=self.transcript_target,
            exclude_patterns=own_kwargs.get("exclude_patterns"),
            include_only_patterns=own_kwargs.get("include_only_patterns"),
        )

    @staticmethod
    def derive_seed(master_seed: int, glitchling_name: str, index: int) -> int:
        """Derive a deterministic seed for a glitchling based on the master seed.

        The master seed, the UTF-8 encoded name and the index are joined with
        NUL separators and hashed with an 8-byte BLAKE2s digest, which is read
        back as a big-endian unsigned integer.
        """

        def _int_to_bytes(value: int) -> bytes:
            # Minimal big-endian encoding; zero is a single NUL byte.
            if value == 0:
                return b"\x00"

            magnitude = abs(value)
            width = max(1, (magnitude.bit_length() + 7) // 8)

            if value > 0:
                return magnitude.to_bytes(width, "big", signed=False)

            # Negative values use two's complement; widen until the value fits.
            while True:
                try:
                    return value.to_bytes(width, "big", signed=True)
                except OverflowError:
                    width += 1

        payload = b"\x00".join(
            (
                _int_to_bytes(master_seed),
                glitchling_name.encode("utf-8"),
                _int_to_bytes(index),
            )
        )
        digest = blake2s(payload, digest_size=8).digest()
        return int.from_bytes(digest, "big")

    def sort_glitchlings(self) -> None:
        """Sort glitchlings by wave then order to produce application order.

        The plan is obtained from ``plan_glitchlings``. Every planned member
        has its RNG reset to the derived seed that accompanies it.

        Raises:
            RuntimeError: If the plan does not mention every member.
        """
        self._plan = plan_glitchlings(self._clones_by_index, self.seed)

        # Rebuild the per-wave grouping from scratch.
        self.glitchlings = {wave: [] for wave in AttackWave}
        by_wave = self.glitchlings
        for member in self._clones_by_index:
            by_wave[member.level].append(member)

        unplanned = set(range(len(self._clones_by_index)))
        ordered: list[Glitchling] = []
        for member_index, derived_seed in self._plan:
            member = self._clones_by_index[member_index]
            member.reset_rng(int(derived_seed))
            ordered.append(member)
            unplanned.discard(member_index)

        if unplanned:
            missing_indices = ", ".join(str(idx) for idx in sorted(unplanned))
            raise RuntimeError(
                f"Orchestration plan missing glitchlings at indices: {missing_indices}"
            )

        self.apply_order = ordered

    def _initialize_pipeline_cache(self) -> None:
        """Refresh the cached masking patterns, descriptors and Rust pipeline.

        The pipeline is only built when every member in ``apply_order``
        yields a descriptor; otherwise ``self._pipeline`` is set to ``None``.
        """
        include_patterns, exclude_patterns = self._collect_masking_patterns()
        self._cached_include_patterns = include_patterns
        self._cached_exclude_patterns = exclude_patterns

        descriptors, without_descriptor = self._pipeline_descriptors()
        self._pipeline_descriptors_cache = descriptors
        self._missing_pipeline_glitchlings = without_descriptor
        if without_descriptor:
            self._pipeline = None
            return

        master_seed = self.seed
        if master_seed is None:  # pragma: no cover - defensive, should be set by __init__
            raise RuntimeError("Gaggle orchestration requires a master seed")

        # Empty pattern lists are passed on as ``None``.
        self._pipeline = build_pipeline_rust(
            descriptors,
            int(master_seed),
            include_only_patterns=include_patterns or None,
            exclude_patterns=exclude_patterns or None,
        )

    def _invalidate_pipeline_cache(self) -> None:
        """Clear cached pipeline state so it will be rebuilt on next use."""
        self._pipeline_descriptors_cache = None
        self._missing_pipeline_glitchlings = []
        self._pipeline = None

    def _pipeline_descriptors(self) -> tuple[list[PipelineDescriptor], list[Glitchling]]:
        """Collect pipeline descriptors and track glitchlings missing them.

        Returns:
            A pair ``(descriptors, missing)``: the descriptor mappings for the
            members of ``apply_order`` that have one, and the members for
            which ``build_pipeline_descriptor`` returned ``None``.
        """
        found: list[PipelineDescriptor] = []
        without_descriptor: list[Glitchling] = []
        for member in self.apply_order:
            descriptor = build_pipeline_descriptor(
                member,
                master_seed=self.seed,
                derive_seed_fn=Gaggle.derive_seed,
            )
            if descriptor is not None:
                found.append(descriptor.as_mapping())
            else:
                without_descriptor.append(member)

        return found, without_descriptor

    def _corrupt_text(self, text: str) -> str:
        """Apply each glitchling to string input sequentially.

        When every member supports the Rust pipeline, the whole gaggle runs
        through the compiled pipeline in one call. Otherwise an execution plan
        is built with ``build_execution_plan`` and run by ``execute_plan``.
        The cached pipeline is discarded and rebuilt whenever the collected
        masking patterns differ from the cached ones.

        Raises:
            RuntimeError: If the gaggle has no master seed.
        """
        master_seed = self.seed
        if master_seed is None:
            raise RuntimeError("Gaggle orchestration requires a master seed")

        current_include, current_exclude = self._collect_masking_patterns()
        patterns_unchanged = (
            current_include == self._cached_include_patterns
            and current_exclude == self._cached_exclude_patterns
        )
        if not patterns_unchanged:
            self._cached_include_patterns = current_include
            self._cached_exclude_patterns = current_exclude
            self._invalidate_pipeline_cache()

        if self._pipeline is None and not self._missing_pipeline_glitchlings:
            self._initialize_pipeline_cache()

        # Fast path: every member is covered by the compiled pipeline.
        if self._pipeline is not None and not self._missing_pipeline_glitchlings:
            compiled = cast(Any, self._pipeline)
            return cast(str, compiled.run(text))

        # Fallback: build the pure execution plan and run it through the
        # impure dispatch layer.
        execution_plan = build_execution_plan(
            self.apply_order,
            master_seed=master_seed,
            derive_seed_fn=Gaggle.derive_seed,
        )
        return execute_plan(
            text,
            execution_plan,
            master_seed,
            include_only_patterns=self._cached_include_patterns,
            exclude_patterns=self._cached_exclude_patterns,
        )

    def corrupt(self, text: str | Transcript) -> str | Transcript:
        """Apply each glitchling to the provided text sequentially.

        The work is split into three steps: resolve what to corrupt, run the
        gaggle on each selected string, and assemble the result.

        When the input is a transcript, the ``transcript_target`` setting
        controls which turns are corrupted:

        - ``"last"``: corrupt only the last turn (default)
        - ``"all"``: corrupt all turns
        - ``"assistant"``: corrupt only turns with ``role="assistant"``
        - ``"user"``: corrupt only turns with ``role="user"``
        - ``int``: corrupt a specific turn by index
        - ``Sequence[int]``: corrupt specific turns by index
        """
        target = resolve_corruption_target(text, self.transcript_target)

        corrupted: str | dict[int, str]
        if not isinstance(target, StringCorruptionTarget):
            # Transcript input: corrupt each selected turn, keyed by its index.
            corrupted = {}
            for turn in target.turns:
                corrupted[turn.index] = self._corrupt_text(turn.content)
        else:
            corrupted = self._corrupt_text(target.text)

        return assemble_corruption_result(target, corrupted)

    @staticmethod
    def _merge_pattern_lists(base: list[str] | None, extra: list[str] | None) -> list[str] | None:
        """Concatenate two optional pattern lists, dropping duplicates.

        First-seen order is preserved with ``base`` taking precedence.
        Returns ``None`` only when both inputs are ``None``.
        """
        if base is None and extra is None:
            return None

        combined: list[str] = []
        for patterns in (base, extra):
            if patterns is not None:
                for pattern in patterns:
                    if pattern not in combined:
                        combined.append(pattern)
        return combined

    def _collect_masking_patterns(self) -> tuple[list[str], list[str]]:
        """Gather the de-duplicated masking patterns of the gaggle and its members.

        Returns:
            ``(include_only_patterns, exclude_patterns)``, each listing the
            gaggle's own patterns first and then the members' patterns in
            member order.
        """
        include_patterns: list[str] = []
        exclude_patterns: list[str] = []
        buckets = (
            (include_patterns, "include_only_patterns"),
            (exclude_patterns, "exclude_patterns"),
        )

        for owner in (self, *self._clones_by_index):
            for bucket, kwarg_name in buckets:
                # Missing, ``None`` and empty values contribute nothing.
                for pattern in owner.kwargs.get(kwarg_name) or ():
                    if pattern not in bucket:
                        bucket.append(pattern)

        return include_patterns, exclude_patterns
665
+
666
+
667
# Public API of this module.
__all__ = [
    # Enums
    "AttackWave",
    "AttackOrder",
    # Core classes
    "Glitchling",
    "Gaggle",
    # Planning function and the pipeline types re-exported from core_planning
    "plan_glitchlings",
    "PipelineOperationPayload",
    "PipelineDescriptor",
]