glitchlings-1.0.0-cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. glitchlings/__init__.py +101 -0
  2. glitchlings/__main__.py +8 -0
  3. glitchlings/_corruption_engine/__init__.py +12 -0
  4. glitchlings/_corruption_engine.cp313-win_amd64.pyd +0 -0
  5. glitchlings/assets/__init__.py +180 -0
  6. glitchlings/assets/apostrofae_pairs.json +32 -0
  7. glitchlings/assets/ekkokin_homophones.json +2014 -0
  8. glitchlings/assets/hokey_assets.json +193 -0
  9. glitchlings/assets/lexemes/academic.json +1049 -0
  10. glitchlings/assets/lexemes/colors.json +1333 -0
  11. glitchlings/assets/lexemes/corporate.json +716 -0
  12. glitchlings/assets/lexemes/cyberpunk.json +22 -0
  13. glitchlings/assets/lexemes/lovecraftian.json +23 -0
  14. glitchlings/assets/lexemes/synonyms.json +3354 -0
  15. glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
  16. glitchlings/assets/ocr_confusions.tsv +30 -0
  17. glitchlings/assets/pipeline_assets.json +29 -0
  18. glitchlings/attack/__init__.py +184 -0
  19. glitchlings/attack/analysis.py +1321 -0
  20. glitchlings/attack/core.py +819 -0
  21. glitchlings/attack/core_execution.py +378 -0
  22. glitchlings/attack/core_planning.py +612 -0
  23. glitchlings/attack/encode.py +114 -0
  24. glitchlings/attack/metrics.py +211 -0
  25. glitchlings/attack/metrics_dispatch.py +70 -0
  26. glitchlings/attack/tokenization.py +338 -0
  27. glitchlings/attack/tokenizer_metrics.py +373 -0
  28. glitchlings/auggie.py +285 -0
  29. glitchlings/compat/__init__.py +9 -0
  30. glitchlings/compat/loaders.py +355 -0
  31. glitchlings/compat/types.py +41 -0
  32. glitchlings/conf/__init__.py +39 -0
  33. glitchlings/conf/loaders.py +331 -0
  34. glitchlings/conf/schema.py +156 -0
  35. glitchlings/conf/types.py +72 -0
  36. glitchlings/config.toml +2 -0
  37. glitchlings/constants.py +139 -0
  38. glitchlings/dev/__init__.py +3 -0
  39. glitchlings/dev/docs.py +45 -0
  40. glitchlings/dlc/__init__.py +21 -0
  41. glitchlings/dlc/_shared.py +300 -0
  42. glitchlings/dlc/gutenberg.py +400 -0
  43. glitchlings/dlc/huggingface.py +68 -0
  44. glitchlings/dlc/langchain.py +147 -0
  45. glitchlings/dlc/nemo.py +283 -0
  46. glitchlings/dlc/prime.py +215 -0
  47. glitchlings/dlc/pytorch.py +98 -0
  48. glitchlings/dlc/pytorch_lightning.py +173 -0
  49. glitchlings/internal/__init__.py +16 -0
  50. glitchlings/internal/rust.py +159 -0
  51. glitchlings/internal/rust_ffi.py +599 -0
  52. glitchlings/main.py +426 -0
  53. glitchlings/protocols.py +91 -0
  54. glitchlings/runtime_config.py +24 -0
  55. glitchlings/util/__init__.py +41 -0
  56. glitchlings/util/adapters.py +65 -0
  57. glitchlings/util/keyboards.py +508 -0
  58. glitchlings/util/transcripts.py +108 -0
  59. glitchlings/zoo/__init__.py +161 -0
  60. glitchlings/zoo/assets/__init__.py +29 -0
  61. glitchlings/zoo/core.py +852 -0
  62. glitchlings/zoo/core_execution.py +154 -0
  63. glitchlings/zoo/core_planning.py +451 -0
  64. glitchlings/zoo/corrupt_dispatch.py +291 -0
  65. glitchlings/zoo/hokey.py +139 -0
  66. glitchlings/zoo/jargoyle.py +301 -0
  67. glitchlings/zoo/mim1c.py +269 -0
  68. glitchlings/zoo/pedant/__init__.py +109 -0
  69. glitchlings/zoo/pedant/core.py +99 -0
  70. glitchlings/zoo/pedant/forms.py +50 -0
  71. glitchlings/zoo/pedant/stones.py +83 -0
  72. glitchlings/zoo/redactyl.py +94 -0
  73. glitchlings/zoo/rng.py +280 -0
  74. glitchlings/zoo/rushmore.py +416 -0
  75. glitchlings/zoo/scannequin.py +370 -0
  76. glitchlings/zoo/transforms.py +331 -0
  77. glitchlings/zoo/typogre.py +194 -0
  78. glitchlings/zoo/validation.py +643 -0
  79. glitchlings/zoo/wherewolf.py +120 -0
  80. glitchlings/zoo/zeedub.py +165 -0
  81. glitchlings-1.0.0.dist-info/METADATA +404 -0
  82. glitchlings-1.0.0.dist-info/RECORD +86 -0
  83. glitchlings-1.0.0.dist-info/WHEEL +5 -0
  84. glitchlings-1.0.0.dist-info/entry_points.txt +3 -0
  85. glitchlings-1.0.0.dist-info/licenses/LICENSE +201 -0
  86. glitchlings-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,819 @@
+"""Attack orchestrator for measuring corruption impact.
+
+This module provides the Attack class, a boundary layer that coordinates
+glitchling corruption and metric computation. It follows the functional
+purity architecture:
+
+- **Pure planning**: Input analysis and result planning (core_planning.py)
+- **Impure execution**: Corruption, tokenization, metrics (core_execution.py)
+- **Boundary layer**: This module - validates inputs and delegates
+
+See AGENTS.md "Functional Purity Architecture" for full details.
+"""
+
+from __future__ import annotations
+
+import inspect
+from collections.abc import Callable, Generator, Iterator, Mapping, Sequence
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, cast
+
+if TYPE_CHECKING:
+    pass  # For forward references in type hints
+
+from ..conf import DEFAULT_ATTACK_SEED
+from ..protocols import Corruptor
+from ..util.transcripts import Transcript, TranscriptTarget
+from .core_execution import (
+    execute_attack,
+    get_default_metrics,
+    resolve_glitchlings,
+)
+from .core_planning import (
+    plan_attack,
+    plan_result,
+)
+from .encode import describe_tokenizer
+from .metrics import Metric
+from .tokenization import Tokenizer, resolve_tokenizer
+
+# ---------------------------------------------------------------------------
+# Streaming Token Iterator
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class TokenWindow:
+    """A window of tokens for streaming processing.
+
+    Represents a chunk of tokens that can be processed without loading
+    the entire token sequence into memory.
+
+    Attributes:
+        tokens: Token strings in this window.
+        token_ids: Token IDs in this window.
+        start_index: Starting index of this window in the full sequence.
+        is_last: Whether this is the final window.
+    """
+
+    tokens: list[str]
+    token_ids: list[int]
+    start_index: int
+    is_last: bool
+
+    def __len__(self) -> int:
+        return len(self.tokens)
+
+
+class StreamingTokens:
+    """Iterator for windowed access to token sequences.
+
+    Provides fixed-size window iteration over token sequences, useful for
+    processing large results in chunks without copying the entire sequence.
+
+    Note: This class provides windowed *access* to an existing token list,
+    not lazy loading. The full token list must already be in memory. For
+    true memory savings during tokenization, process texts in smaller batches.
+
+    Attributes:
+        window_size: Number of tokens per window.
+        total_tokens: Total number of tokens.
+    """
+
+    def __init__(
+        self,
+        tokens: list[str],
+        token_ids: list[int],
+        *,
+        window_size: int = 10000,
+    ):
+        """Initialize windowed token access.
+
+        Args:
+            tokens: Full token list to provide windowed access to.
+            token_ids: Full token ID list (must match tokens length).
+            window_size: Number of tokens per window. Defaults to 10000.
+        """
+        self._tokens = tokens
+        self._token_ids = token_ids
+        self.window_size = window_size
+        self.total_tokens = len(tokens)
+
+    def __iter__(self) -> Iterator[TokenWindow]:
+        """Iterate over token windows."""
+        for start in range(0, self.total_tokens, self.window_size):
+            end = min(start + self.window_size, self.total_tokens)
+            yield TokenWindow(
+                tokens=self._tokens[start:end],
+                token_ids=self._token_ids[start:end],
+                start_index=start,
+                is_last=(end >= self.total_tokens),
+            )
+
+    def __len__(self) -> int:
+        """Return total number of tokens."""
+        return self.total_tokens
+
+    @property
+    def all_tokens(self) -> list[str]:
+        """Get all tokens (materializes full list)."""
+        return self._tokens
+
+    @property
+    def all_token_ids(self) -> list[int]:
+        """Get all token IDs (materializes full list)."""
+        return self._token_ids
+
+
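To illustrate the windowed iteration defined above, here is a small hedged sketch (not part of the diffed file) that walks a pre-tokenized sequence in fixed-size windows. The token strings and IDs are made-up placeholders, and the import assumes the wheel is installed; `glitchlings.attack.core` is the module shown in this diff.

```python
# Hypothetical usage sketch for StreamingTokens / TokenWindow.
from glitchlings.attack.core import StreamingTokens

tokens = ["Hello", "world", "this", "is", "a", "test"]  # placeholder tokens
token_ids = [1, 2, 3, 4, 5, 6]                          # placeholder IDs

stream = StreamingTokens(tokens, token_ids, window_size=4)
for window in stream:
    # Each TokenWindow carries its slice plus position metadata.
    print(window.start_index, window.tokens, window.is_last, len(window))
# Expected: a window of 4 tokens, then a final window of 2 with is_last=True.
```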
+# ---------------------------------------------------------------------------
+# Result Data Classes
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class AttackResult:
+    """Result of an attack operation containing tokens and metrics.
+
+    Attributes:
+        original: Original input (string, transcript, or batch).
+        corrupted: Corrupted output (same type as original).
+        input_tokens: Tokenized original content.
+        output_tokens: Tokenized corrupted content.
+        input_token_ids: Token IDs for original.
+        output_token_ids: Token IDs for corrupted.
+        tokenizer_info: Description of the tokenizer used.
+        metrics: Computed metric values.
+    """
+
+    original: str | Transcript | Sequence[str]
+    corrupted: str | Transcript | Sequence[str]
+    input_tokens: list[str] | list[list[str]]
+    output_tokens: list[str] | list[list[str]]
+    input_token_ids: list[int] | list[list[int]]
+    output_token_ids: list[int] | list[list[int]]
+    tokenizer_info: str
+    metrics: dict[str, float | list[float]]
+
+    def _tokens_are_batched(self) -> bool:
+        """Check if tokens represent a batch."""
+        tokens = self.input_tokens
+        if tokens and isinstance(tokens[0], list):
+            return True
+        return isinstance(self.original, list) or isinstance(self.corrupted, list)
+
+    def _token_batches(self) -> tuple[list[list[str]], list[list[str]]]:
+        """Get tokens as batches (wrapping single sequences if needed)."""
+        if self._tokens_are_batched():
+            return (
+                cast(list[list[str]], self.input_tokens),
+                cast(list[list[str]], self.output_tokens),
+            )
+        return (
+            [cast(list[str], self.input_tokens)],
+            [cast(list[str], self.output_tokens)],
+        )
+
+    def _token_counts(self) -> tuple[list[int], list[int]]:
+        """Compute token counts per batch item."""
+        inputs, outputs = self._token_batches()
+        return [len(tokens) for tokens in inputs], [len(tokens) for tokens in outputs]
+
+    @staticmethod
+    def _format_metric_value(value: float | list[float]) -> str:
+        """Format a metric value for display."""
+        if isinstance(value, list):
+            if not value:
+                return "[]"
+            if len(value) <= 4:
+                rendered = ", ".join(f"{entry:.3f}" for entry in value)
+                return f"[{rendered}]"
+            total = sum(value)
+            minimum = min(value)
+            maximum = max(value)
+            mean = total / len(value)
+            return f"avg={mean:.3f} min={minimum:.3f} max={maximum:.3f}"
+        return f"{value:.3f}"
+
+    @staticmethod
+    def _format_token(token: str, *, max_length: int) -> str:
+        """Format a token for display, truncating if needed."""
+        clean = token.replace("\n", "\\n")
+        if len(clean) > max_length:
+            return clean[: max_length - 3] + "..."
+        return clean
+
+    def to_report(self) -> dict[str, object]:
+        """Convert to a JSON-serializable dictionary."""
+        input_counts, output_counts = self._token_counts()
+        return {
+            "tokenizer": self.tokenizer_info,
+            "original": self.original,
+            "corrupted": self.corrupted,
+            "input_tokens": self.input_tokens,
+            "output_tokens": self.output_tokens,
+            "input_token_ids": self.input_token_ids,
+            "output_token_ids": self.output_token_ids,
+            "token_counts": {
+                "input": {"per_sample": input_counts, "total": sum(input_counts)},
+                "output": {"per_sample": output_counts, "total": sum(output_counts)},
+            },
+            "metrics": self.metrics,
+        }
+
+    def summary(self, *, max_rows: int = 8, max_token_length: int = 24) -> str:
+        """Generate a human-readable summary.
+
+        Args:
+            max_rows: Maximum rows to display in token drift.
+            max_token_length: Maximum characters per token.
+
+        Returns:
+            Formatted multi-line summary string.
+        """
+        input_batches, output_batches = self._token_batches()
+        input_counts, output_counts = self._token_counts()
+        is_batch = self._tokens_are_batched()
+
+        lines: list[str] = [f"Tokenizer: {self.tokenizer_info}"]
+        if is_batch:
+            lines.append(f"Samples: {len(input_batches)}")
+
+        lines.append("Token counts:")
+        for index, (input_count, output_count) in enumerate(
+            zip(input_counts, output_counts), start=1
+        ):
+            prefix = f"#{index} " if is_batch else ""
+            delta = output_count - input_count
+            lines.append(f" {prefix}{input_count} -> {output_count} ({delta:+d})")
+            if index >= max_rows and len(input_batches) > max_rows:
+                remaining = len(input_batches) - max_rows
+                lines.append(f" ... {remaining} more samples")
+                break
+
+        lines.append("Metrics:")
+        for name, value in self.metrics.items():
+            lines.append(f" {name}: {self._format_metric_value(value)}")
+
+        if input_batches:
+            focus_index = 0
+            if is_batch and len(input_batches) > 1:
+                lines.append("Token drift (first sample):")
+            else:
+                lines.append("Token drift:")
+            input_tokens = input_batches[focus_index]
+            output_tokens = output_batches[focus_index]
+            rows = max(len(input_tokens), len(output_tokens))
+            display_rows = min(rows, max_rows)
+            for idx in range(display_rows):
+                left = (
+                    self._format_token(input_tokens[idx], max_length=max_token_length)
+                    if idx < len(input_tokens)
+                    else ""
+                )
+                right = (
+                    self._format_token(output_tokens[idx], max_length=max_token_length)
+                    if idx < len(output_tokens)
+                    else ""
+                )
+                if idx >= len(input_tokens):
+                    marker = "+"
+                elif idx >= len(output_tokens):
+                    marker = "-"
+                elif input_tokens[idx] == output_tokens[idx]:
+                    marker = "="
+                else:
+                    marker = "!"
+                lines.append(f" {idx + 1:>3}{marker} {left} -> {right}")
+            if rows > display_rows:
+                lines.append(f" ... {rows - display_rows} more tokens")
+        else:
+            lines.append("Token drift: (empty input)")
+
+        return "\n".join(lines)
+
+    # -------------------------------------------------------------------------
+    # Token-Level Analysis
+    # -------------------------------------------------------------------------
+
+    def get_metric(self, name: str) -> float | list[float] | None:
+        """Get a specific metric value by name.
+
+        Args:
+            name: Metric name (e.g., 'normalized_edit_distance').
+
+        Returns:
+            Metric value, or None if not found.
+        """
+        return self.metrics.get(name)
+
+    def get_changed_tokens(self, batch_index: int = 0) -> list[tuple[str, str]]:
+        """Get tokens that changed between original and corrupted.
+
+        Args:
+            batch_index: Which batch item to analyze (0 for single strings).
+
+        Returns:
+            List of (original_token, corrupted_token) pairs where they differ.
+            Only includes positions where both tokens exist and are different.
+        """
+        input_batches, output_batches = self._token_batches()
+        if batch_index >= len(input_batches):
+            return []
+
+        input_tokens = input_batches[batch_index]
+        output_tokens = output_batches[batch_index]
+
+        changes: list[tuple[str, str]] = []
+        for i in range(min(len(input_tokens), len(output_tokens))):
+            if input_tokens[i] != output_tokens[i]:
+                changes.append((input_tokens[i], output_tokens[i]))
+        return changes
+
+    def get_mutation_positions(self, batch_index: int = 0) -> list[int]:
+        """Get indices of tokens that were mutated.
+
+        Args:
+            batch_index: Which batch item to analyze (0 for single strings).
+
+        Returns:
+            List of token positions where original != corrupted.
+            Only includes positions where both tokens exist.
+        """
+        input_batches, output_batches = self._token_batches()
+        if batch_index >= len(input_batches):
+            return []
+
+        input_tokens = input_batches[batch_index]
+        output_tokens = output_batches[batch_index]
+
+        positions: list[int] = []
+        for i in range(min(len(input_tokens), len(output_tokens))):
+            if input_tokens[i] != output_tokens[i]:
+                positions.append(i)
+        return positions
+
+    def get_token_alignment(self, batch_index: int = 0) -> list[dict[str, object]]:
+        """Get detailed token-by-token comparison with alignment info.
+
+        Args:
+            batch_index: Which batch item to analyze (0 for single strings).
+
+        Returns:
+            List of alignment entries, each containing:
+            - index: Token position
+            - original: Original token (empty string if added)
+            - corrupted: Corrupted token (empty string if removed)
+            - changed: Whether the token changed
+            - op: Operation type ('=' unchanged, '!' modified, '+' added, '-' removed)
+        """
+        input_batches, output_batches = self._token_batches()
+        if batch_index >= len(input_batches):
+            return []
+
+        input_tokens = input_batches[batch_index]
+        output_tokens = output_batches[batch_index]
+
+        alignment: list[dict[str, object]] = []
+        max_len = max(len(input_tokens), len(output_tokens))
+
+        for i in range(max_len):
+            orig = input_tokens[i] if i < len(input_tokens) else ""
+            corr = output_tokens[i] if i < len(output_tokens) else ""
+
+            if i >= len(input_tokens):
+                op = "+"
+                changed = True
+            elif i >= len(output_tokens):
+                op = "-"
+                changed = True
+            elif orig == corr:
+                op = "="
+                changed = False
+            else:
+                op = "!"
+                changed = True
+
+            alignment.append(
+                {
+                    "index": i,
+                    "original": orig,
+                    "corrupted": corr,
+                    "changed": changed,
+                    "op": op,
+                }
+            )
+
+        return alignment
+
+
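To make the result API concrete, the following hedged sketch (not part of the diffed file) builds an AttackResult by hand from placeholder tokens and exercises the token-level analysis helpers; in normal use these objects come from Attack.run() rather than manual construction, and the "whitespace" tokenizer label here is just an illustrative string.

```python
# Hypothetical sketch: inspecting an AttackResult built from placeholder data.
from glitchlings.attack.core import AttackResult

result = AttackResult(
    original="the cat sat",
    corrupted="the cat szt",
    input_tokens=["the", "cat", "sat"],
    output_tokens=["the", "cat", "szt"],
    input_token_ids=[10, 11, 12],
    output_token_ids=[10, 11, 99],
    tokenizer_info="whitespace",
    metrics={"normalized_edit_distance": 0.083},
)

print(result.get_changed_tokens())      # [('sat', 'szt')]
print(result.get_mutation_positions())  # [2]
print(result.get_metric("normalized_edit_distance"))
print(result.summary())                 # tokenizer, counts, metrics, token drift
report = result.to_report()             # JSON-serializable dict
```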
+# ---------------------------------------------------------------------------
+# Attack Orchestrator
+# ---------------------------------------------------------------------------
+
+
+class Attack:
+    """Orchestrator for applying glitchling corruptions and measuring impact.
+
+    Attack is a thin boundary layer that:
+    1. Validates inputs at construction time
+    2. Delegates planning to pure functions (core_planning.py)
+    3. Delegates execution to impure functions (core_execution.py)
+
+    Example:
+        >>> attack = Attack(Typogre(rate=0.05), tokenizer='cl100k_base')
+        >>> result = attack.run("Hello world")
+        >>> print(result.summary())
+    """
+
+    def __init__(
+        self,
+        glitchlings: Corruptor | str | Sequence[str | Corruptor],
+        tokenizer: str | Tokenizer | None = None,
+        metrics: Mapping[str, Metric] | None = None,
+        *,
+        seed: int | None = None,
+        transcript_target: TranscriptTarget | None = None,
+    ) -> None:
+        """Initialize an Attack.
+
+        Args:
+            glitchlings: Glitchling specification - a single Glitchling,
+                string spec (e.g. 'Typogre(rate=0.05)'), or iterable of these.
+            tokenizer: Tokenizer name (e.g. 'cl100k_base'), Tokenizer instance,
+                or None (defaults to whitespace tokenizer).
+            metrics: Dictionary of metric functions. If None, uses defaults
+                (jensen_shannon_divergence, normalized_edit_distance,
+                subsequence_retention).
+            seed: Master seed for the Gaggle. If None, uses DEFAULT_ATTACK_SEED.
+            transcript_target: Which transcript turns to corrupt. Accepts:
+                - "last": corrupt only the last turn (default)
+                - "all": corrupt all turns
+                - "assistant"/"user": corrupt only those roles
+                - int: corrupt a specific index
+                - Sequence[int]: corrupt specific indices
+        """
+        # Boundary: resolve seed
+        gaggle_seed = seed if seed is not None else DEFAULT_ATTACK_SEED
+
+        # Impure: resolve glitchlings (clones to avoid mutation)
+        self.glitchlings = resolve_glitchlings(
+            glitchlings,
+            seed=gaggle_seed,
+            transcript_target=transcript_target,
+        )
+
+        # Impure: resolve tokenizer
+        self.tokenizer = resolve_tokenizer(tokenizer)
+        self.tokenizer_info = describe_tokenizer(self.tokenizer, tokenizer)
+
+        # Setup metrics
+        if metrics is None:
+            self.metrics: dict[str, Metric] = get_default_metrics()
+        else:
+            self.metrics = dict(metrics)
+
+        # Validate custom metrics have correct signature
+        self._validate_metrics()
+
+    def _validate_metrics(self) -> None:
+        """Validate that metric functions have correct signatures.
+
+        Uses signature inspection to avoid executing metrics (which may have
+        side effects).
+
+        Raises:
+            ValueError: If a metric function has an invalid signature.
+        """
+        for name, func in self.metrics.items():
+            if not callable(func):
+                raise ValueError(f"Metric '{name}' is not callable")
+
+            try:
+                sig = inspect.signature(func)
+                params = list(sig.parameters.values())
+
+                # Count required positional parameters (no default, not *args/**kwargs)
+                positional_params = [
+                    p
+                    for p in params
+                    if p.kind
+                    in (
+                        inspect.Parameter.POSITIONAL_ONLY,
+                        inspect.Parameter.POSITIONAL_OR_KEYWORD,
+                    )
+                    and p.default is inspect.Parameter.empty
+                ]
+
+                if len(positional_params) < 2:
+                    raise ValueError(
+                        f"Metric '{name}' must accept at least 2 positional arguments "
+                        f"(original_tokens, corrupted_tokens), found {len(positional_params)}"
+                    )
+            except (ValueError, TypeError) as e:
+                if "Metric" in str(e):
+                    raise
+                raise ValueError(f"Metric '{name}' has invalid signature: {e}") from e
+
+    def run(
+        self,
+        text: str | Transcript | Sequence[str],
+        *,
+        include_tokens: bool = True,
+    ) -> AttackResult:
+        """Apply corruptions and calculate metrics.
+
+        Supports single strings, batches of strings, and chat transcripts.
+        For batched inputs, metrics are computed per entry and returned
+        as lists.
+
+        Args:
+            text: Input text, transcript, or batch of strings to corrupt.
+            include_tokens: Whether to include tokens in the result. Set to
+                False for a lightweight result with only metrics. Tokens are
+                still computed internally for metrics but not stored in the
+                result. Defaults to True.
+
+        Returns:
+            AttackResult containing original, corrupted, tokens, and metrics.
+
+        Raises:
+            TypeError: If input type is not recognized.
+        """
+        # Pure: plan the attack
+        attack_plan = plan_attack(text)
+        result_plan = plan_result(
+            attack_plan,
+            list(self.metrics.keys()),
+            self.tokenizer_info,
+        )
+
+        # Impure: execute the attack
+        fields = execute_attack(
+            self.glitchlings,
+            self.tokenizer,
+            self.metrics,
+            attack_plan,
+            result_plan,
+            text,
+            include_tokens=include_tokens,
+        )
+
+        return AttackResult(**fields)  # type: ignore[arg-type]
+
+    def run_batch(
+        self,
+        texts: Sequence[str | Transcript],
+        *,
+        include_tokens: bool = True,
+        progress_callback: Callable[[list[AttackResult]], None] | None = None,
+    ) -> list[AttackResult]:
+        """Run attack on multiple texts, returning results in order.
+
+        Args:
+            texts: List of inputs to process.
+            include_tokens: Whether to include tokens in results. Set to
+                False for lightweight results with only metrics. Defaults to True.
+            progress_callback: Optional callback called after each result,
+                receiving the list of results so far.
+
+        Returns:
+            List of AttackResult objects in input order.
+        """
+        results: list[AttackResult] = []
+        for text in texts:
+            result = self.run(text, include_tokens=include_tokens)
+            results.append(result)
+            if progress_callback is not None:
+                progress_callback(results)
+        return results
+
+    def run_stream(
+        self,
+        texts: Iterator[str | Transcript] | Sequence[str | Transcript],
+        *,
+        include_tokens: bool = True,
+    ) -> Generator[AttackResult, None, None]:
+        """Stream attack results as they are computed.
+
+        Unlike run_batch(), this method yields results immediately as each
+        text is processed, allowing for memory-efficient processing of large
+        datasets without holding all results in memory.
+
+        Args:
+            texts: Iterator or sequence of inputs to process.
+            include_tokens: Whether to include tokens in results. Set to
+                False for lightweight results with only metrics. Defaults to True.
+
+        Yields:
+            AttackResult objects as they are computed.
+
+        Example:
+            >>> attack = Attack(Typogre(rate=0.05))
+            >>> for result in attack.run_stream(large_text_iterator):
+            ...     process_result(result)  # Process each result immediately
+        """
+        for text in texts:
+            yield self.run(text, include_tokens=include_tokens)
+
+    def run_streaming_result(
+        self,
+        text: str | Transcript | Sequence[str],
+        *,
+        window_size: int = 10000,
+    ) -> "StreamingAttackResult":
+        """Run attack and return result with windowed token access.
+
+        Returns a StreamingAttackResult that provides windowed iteration
+        over tokens, useful for chunk-based processing of results.
+
+        Note: This does not reduce memory usage during tokenization. For
+        memory-efficient processing of many texts, use run_stream() instead.
+
+        Args:
+            text: Input text, transcript, or batch of strings to corrupt.
+            window_size: Number of tokens per window during iteration.
+                Defaults to 10000.
+
+        Returns:
+            StreamingAttackResult with windowed token iteration.
+        """
+        result = self.run(text)
+        return StreamingAttackResult.from_attack_result(result, window_size=window_size)
+
+
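Pulling the pieces above together, here is a hedged end-to-end sketch (not part of the diffed file). It assumes the wheel is installed, that the `Typogre(rate=0.05)` string spec and the `cl100k_base` tokenizer name from the docstrings resolve in your environment, and that the sample texts are placeholders.

```python
# Hypothetical usage sketch for the Attack orchestrator.
from glitchlings.attack.core import Attack

# Glitchlings may be passed as a string spec, per the __init__ docstring.
attack = Attack("Typogre(rate=0.05)", tokenizer="cl100k_base", seed=42)

# Single string: metrics are scalars.
result = attack.run("The quick brown fox jumps over the lazy dog.")
print(result.summary())
print(result.get_metric("normalized_edit_distance"))

# Batch of strings: metrics come back as per-sample lists.
batch = attack.run(["first sample text", "second sample text"])
print(batch.get_metric("normalized_edit_distance"))

# Stream results one at a time for large datasets.
for res in attack.run_stream(iter(["one sample", "another sample"])):
    print(res.get_metric("subsequence_retention"))
```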
+# ---------------------------------------------------------------------------
+# Streaming Attack Result
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class StreamingAttackResult:
+    """Attack result with windowed token access for chunk-based processing.
+
+    Wraps an AttackResult and provides windowed iteration over tokens,
+    useful for processing results in fixed-size chunks (e.g., for batched
+    metric computation or memory-bounded downstream processing).
+
+    Note: Tokens are still stored in memory. This class provides windowed
+    *access*, not lazy loading. For true memory savings with very large
+    texts, process inputs in smaller batches using Attack.run_stream().
+
+    Attributes:
+        original: Original input text/transcript/batch.
+        corrupted: Corrupted output.
+        tokenizer_info: Description of the tokenizer used.
+        metrics: Computed metric values.
+        window_size: Number of tokens per window iteration.
+    """
+
+    original: str | Transcript | Sequence[str]
+    corrupted: str | Transcript | Sequence[str]
+    tokenizer_info: str
+    metrics: dict[str, float | list[float]]
+    window_size: int = field(default=10000)
+    _input_tokens: list[str] | list[list[str]] = field(default_factory=list, repr=False)
+    _output_tokens: list[str] | list[list[str]] = field(default_factory=list, repr=False)
+    _input_token_ids: list[int] | list[list[int]] = field(default_factory=list, repr=False)
+    _output_token_ids: list[int] | list[list[int]] = field(default_factory=list, repr=False)
+
+    @classmethod
+    def from_attack_result(
+        cls,
+        result: AttackResult,
+        *,
+        window_size: int = 10000,
+    ) -> "StreamingAttackResult":
+        """Create a StreamingAttackResult from an AttackResult.
+
+        Args:
+            result: The AttackResult to wrap.
+            window_size: Number of tokens per window.
+
+        Returns:
+            StreamingAttackResult with windowed token access.
+        """
+        return cls(
+            original=result.original,
+            corrupted=result.corrupted,
+            tokenizer_info=result.tokenizer_info,
+            metrics=result.metrics,
+            window_size=window_size,
+            _input_tokens=result.input_tokens,
+            _output_tokens=result.output_tokens,
+            _input_token_ids=result.input_token_ids,
+            _output_token_ids=result.output_token_ids,
+        )
+
+    def _is_batched(self) -> bool:
+        """Check if tokens represent a batch."""
+        tokens = self._input_tokens
+        if tokens and isinstance(tokens[0], list):
+            return True
+        return isinstance(self.original, list) or isinstance(self.corrupted, list)
+
+    def stream_input_tokens(self, batch_index: int = 0) -> StreamingTokens:
+        """Get streaming access to input tokens.
+
+        Args:
+            batch_index: Which batch item to stream (0 for single strings).
+
+        Returns:
+            StreamingTokens iterator for windowed access.
+        """
+        if self._is_batched():
+            tokens = cast(list[list[str]], self._input_tokens)[batch_index]
+            token_ids = cast(list[list[int]], self._input_token_ids)[batch_index]
+        else:
+            tokens = cast(list[str], self._input_tokens)
+            token_ids = cast(list[int], self._input_token_ids)
+
+        return StreamingTokens(tokens, token_ids, window_size=self.window_size)
+
+    def stream_output_tokens(self, batch_index: int = 0) -> StreamingTokens:
+        """Get streaming access to output tokens.
+
+        Args:
+            batch_index: Which batch item to stream (0 for single strings).
+
+        Returns:
+            StreamingTokens iterator for windowed access.
+        """
+        if self._is_batched():
+            tokens = cast(list[list[str]], self._output_tokens)[batch_index]
+            token_ids = cast(list[list[int]], self._output_token_ids)[batch_index]
+        else:
+            tokens = cast(list[str], self._output_tokens)
+            token_ids = cast(list[int], self._output_token_ids)
+
+        return StreamingTokens(tokens, token_ids, window_size=self.window_size)
+
+    def stream_token_pairs(
+        self,
+        batch_index: int = 0,
+    ) -> Generator[tuple[TokenWindow, TokenWindow], None, None]:
+        """Stream paired windows of input and output tokens.
+
+        Yields aligned (input_window, output_window) pairs for comparison.
+        Windows are paired by index, so the first input window pairs with
+        the first output window, etc.
+
+        Note: If input and output have different token counts, iteration
+        stops at the shorter sequence (like zip). Use stream_input_tokens()
+        and stream_output_tokens() separately if you need all windows.
+
+        Args:
+            batch_index: Which batch item to stream (0 for single strings).
+
+        Yields:
+            Tuples of (input_window, output_window) aligned by window index.
+        """
+        input_stream = self.stream_input_tokens(batch_index)
+        output_stream = self.stream_output_tokens(batch_index)
+
+        for input_window, output_window in zip(input_stream, output_stream):
+            yield input_window, output_window
+
+    def get_token_count(self, batch_index: int = 0) -> tuple[int, int]:
+        """Get token counts without materializing full lists.
+
+        Args:
+            batch_index: Which batch item to count (0 for single strings).
+
+        Returns:
+            Tuple of (input_token_count, output_token_count).
+        """
+        input_stream = self.stream_input_tokens(batch_index)
+        output_stream = self.stream_output_tokens(batch_index)
+        return len(input_stream), len(output_stream)
+
+    def to_attack_result(self) -> AttackResult:
+        """Convert back to a standard AttackResult.
+
+        Warning: This materializes all tokens in memory.
+
+        Returns:
+            AttackResult with all tokens loaded.
+        """
+        return AttackResult(
+            original=self.original,
+            corrupted=self.corrupted,
+            input_tokens=self._input_tokens,
+            output_tokens=self._output_tokens,
+            input_token_ids=self._input_token_ids,
+            output_token_ids=self._output_token_ids,
+            tokenizer_info=self.tokenizer_info,
+            metrics=self.metrics,
+        )
+
+    def get_metric(self, name: str) -> float | list[float] | None:
+        """Get a specific metric value by name."""
+        return self.metrics.get(name)
+
+
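Finally, a hedged sketch (not part of the diffed file) of chunked post-processing with the class above; the input sentence and window_size=4 are arbitrary placeholders, the default whitespace tokenizer is assumed, and the wheel is assumed to be installed.

```python
# Hypothetical sketch: chunked post-processing with StreamingAttackResult.
from glitchlings.attack.core import Attack

attack = Attack("Typogre(rate=0.05)")  # whitespace tokenizer by default
streaming = attack.run_streaming_result(
    "a reasonably long input sentence to window over", window_size=4
)

n_in, n_out = streaming.get_token_count()
print(f"tokens: {n_in} -> {n_out}")

# Walk aligned input/output windows; iteration stops at the shorter stream.
for in_win, out_win in streaming.stream_token_pairs():
    diffs = sum(a != b for a, b in zip(in_win.tokens, out_win.tokens))
    print(in_win.start_index, diffs, in_win.is_last)

full = streaming.to_attack_result()  # materializes a standard AttackResult
```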
+__all__ = [
+    "Attack",
+    "AttackResult",
+    "StreamingAttackResult",
+    "StreamingTokens",
+    "TokenWindow",
+]