glitchlings 1.0.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings/__init__.py +101 -0
- glitchlings/__main__.py +8 -0
- glitchlings/_corruption_engine/__init__.py +12 -0
- glitchlings/_corruption_engine.cp313-win_amd64.pyd +0 -0
- glitchlings/assets/__init__.py +180 -0
- glitchlings/assets/apostrofae_pairs.json +32 -0
- glitchlings/assets/ekkokin_homophones.json +2014 -0
- glitchlings/assets/hokey_assets.json +193 -0
- glitchlings/assets/lexemes/academic.json +1049 -0
- glitchlings/assets/lexemes/colors.json +1333 -0
- glitchlings/assets/lexemes/corporate.json +716 -0
- glitchlings/assets/lexemes/cyberpunk.json +22 -0
- glitchlings/assets/lexemes/lovecraftian.json +23 -0
- glitchlings/assets/lexemes/synonyms.json +3354 -0
- glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
- glitchlings/assets/ocr_confusions.tsv +30 -0
- glitchlings/assets/pipeline_assets.json +29 -0
- glitchlings/attack/__init__.py +184 -0
- glitchlings/attack/analysis.py +1321 -0
- glitchlings/attack/core.py +819 -0
- glitchlings/attack/core_execution.py +378 -0
- glitchlings/attack/core_planning.py +612 -0
- glitchlings/attack/encode.py +114 -0
- glitchlings/attack/metrics.py +211 -0
- glitchlings/attack/metrics_dispatch.py +70 -0
- glitchlings/attack/tokenization.py +338 -0
- glitchlings/attack/tokenizer_metrics.py +373 -0
- glitchlings/auggie.py +285 -0
- glitchlings/compat/__init__.py +9 -0
- glitchlings/compat/loaders.py +355 -0
- glitchlings/compat/types.py +41 -0
- glitchlings/conf/__init__.py +39 -0
- glitchlings/conf/loaders.py +331 -0
- glitchlings/conf/schema.py +156 -0
- glitchlings/conf/types.py +72 -0
- glitchlings/config.toml +2 -0
- glitchlings/constants.py +139 -0
- glitchlings/dev/__init__.py +3 -0
- glitchlings/dev/docs.py +45 -0
- glitchlings/dlc/__init__.py +21 -0
- glitchlings/dlc/_shared.py +300 -0
- glitchlings/dlc/gutenberg.py +400 -0
- glitchlings/dlc/huggingface.py +68 -0
- glitchlings/dlc/langchain.py +147 -0
- glitchlings/dlc/nemo.py +283 -0
- glitchlings/dlc/prime.py +215 -0
- glitchlings/dlc/pytorch.py +98 -0
- glitchlings/dlc/pytorch_lightning.py +173 -0
- glitchlings/internal/__init__.py +16 -0
- glitchlings/internal/rust.py +159 -0
- glitchlings/internal/rust_ffi.py +599 -0
- glitchlings/main.py +426 -0
- glitchlings/protocols.py +91 -0
- glitchlings/runtime_config.py +24 -0
- glitchlings/util/__init__.py +41 -0
- glitchlings/util/adapters.py +65 -0
- glitchlings/util/keyboards.py +508 -0
- glitchlings/util/transcripts.py +108 -0
- glitchlings/zoo/__init__.py +161 -0
- glitchlings/zoo/assets/__init__.py +29 -0
- glitchlings/zoo/core.py +852 -0
- glitchlings/zoo/core_execution.py +154 -0
- glitchlings/zoo/core_planning.py +451 -0
- glitchlings/zoo/corrupt_dispatch.py +291 -0
- glitchlings/zoo/hokey.py +139 -0
- glitchlings/zoo/jargoyle.py +301 -0
- glitchlings/zoo/mim1c.py +269 -0
- glitchlings/zoo/pedant/__init__.py +109 -0
- glitchlings/zoo/pedant/core.py +99 -0
- glitchlings/zoo/pedant/forms.py +50 -0
- glitchlings/zoo/pedant/stones.py +83 -0
- glitchlings/zoo/redactyl.py +94 -0
- glitchlings/zoo/rng.py +280 -0
- glitchlings/zoo/rushmore.py +416 -0
- glitchlings/zoo/scannequin.py +370 -0
- glitchlings/zoo/transforms.py +331 -0
- glitchlings/zoo/typogre.py +194 -0
- glitchlings/zoo/validation.py +643 -0
- glitchlings/zoo/wherewolf.py +120 -0
- glitchlings/zoo/zeedub.py +165 -0
- glitchlings-1.0.0.dist-info/METADATA +404 -0
- glitchlings-1.0.0.dist-info/RECORD +86 -0
- glitchlings-1.0.0.dist-info/WHEEL +5 -0
- glitchlings-1.0.0.dist-info/entry_points.txt +3 -0
- glitchlings-1.0.0.dist-info/licenses/LICENSE +201 -0
- glitchlings-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,378 @@
|
|
|
1
|
+
"""Impure execution dispatch for Attack orchestration.
|
|
2
|
+
|
|
3
|
+
This module handles the actual execution of attack plans, including
|
|
4
|
+
tokenizer resolution, glitchling invocation, and metric computation.
|
|
5
|
+
It is the impure counterpart to core_planning.py.
|
|
6
|
+
|
|
7
|
+
**Design Philosophy:**
|
|
8
|
+
|
|
9
|
+
This module is explicitly *impure* - it resolves tokenizers, invokes
|
|
10
|
+
glitchling corruption functions, and calls Rust metrics. All impure
|
|
11
|
+
operations for Attack execution flow through this module.
|
|
12
|
+
|
|
13
|
+
The separation allows:
|
|
14
|
+
- Pure planning logic to be tested without dependencies
|
|
15
|
+
- Clear boundaries between plan construction and execution
|
|
16
|
+
- Mocking execution for integration tests
|
|
17
|
+
|
|
18
|
+
See AGENTS.md "Functional Purity Architecture" for full details.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
from collections.abc import Mapping, Sequence
|
|
24
|
+
from typing import TYPE_CHECKING, Any, cast
|
|
25
|
+
|
|
26
|
+
from ..util.adapters import coerce_gaggle
|
|
27
|
+
from ..util.transcripts import Transcript, is_transcript
|
|
28
|
+
from .core_planning import (
|
|
29
|
+
AttackPlan,
|
|
30
|
+
BatchAdapter,
|
|
31
|
+
EncodedData,
|
|
32
|
+
ResultPlan,
|
|
33
|
+
assemble_empty_result_fields,
|
|
34
|
+
assemble_result_fields,
|
|
35
|
+
extract_transcript_contents,
|
|
36
|
+
)
|
|
37
|
+
from .encode import encode_batch
|
|
38
|
+
from .metrics import (
|
|
39
|
+
Metric,
|
|
40
|
+
entropy_delta,
|
|
41
|
+
jensen_shannon_divergence,
|
|
42
|
+
merge_split_index,
|
|
43
|
+
normalized_edit_distance,
|
|
44
|
+
subsequence_retention,
|
|
45
|
+
)
|
|
46
|
+
from .tokenization import Tokenizer
|
|
47
|
+
|
|
48
|
+
if TYPE_CHECKING:
|
|
49
|
+
from ..protocols import Corruptor
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# ---------------------------------------------------------------------------
|
|
53
|
+
# Default Metrics
|
|
54
|
+
# ---------------------------------------------------------------------------
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def get_default_metrics() -> dict[str, Metric]:
    """Build the metric registry that Attack uses when none is supplied.

    A fresh dictionary is created on every call, so callers may mutate the
    result without affecting later calls.

    Returns:
        Mapping of metric name to the metric callable imported from
        ``.metrics``, in a fixed insertion order.
    """
    defaults: dict[str, Metric] = {}
    defaults["jensen_shannon_divergence"] = jensen_shannon_divergence
    defaults["normalized_edit_distance"] = normalized_edit_distance
    defaults["subsequence_retention"] = subsequence_retention
    defaults["entropy_delta"] = entropy_delta
    defaults["merge_split_index"] = merge_split_index
    return defaults
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# ---------------------------------------------------------------------------
|
|
73
|
+
# Glitchling Resolution
|
|
74
|
+
# ---------------------------------------------------------------------------
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def resolve_glitchlings(
    glitchlings: "Corruptor | str | Sequence[str | Corruptor]",
    *,
    seed: int | None,
    transcript_target: Any = None,
) -> "Corruptor":
    """Turn a glitchling specification into a seeded corruptor.

    Corruptor instances (top-level or inside a sequence) are cloned first so
    that seeding never touches objects owned by the caller. Strings and any
    other values are handed to ``coerce_gaggle`` untouched.

    Args:
        glitchlings: A corruptor, a string, or a sequence mixing strings and
            corruptors.
        seed: Master seed. ``None`` selects ``DEFAULT_ATTACK_SEED``.
        transcript_target: Forwarded unchanged to ``coerce_gaggle``.

    Returns:
        Whatever ``coerce_gaggle`` builds from the cloned specification.
    """
    # Function-scope imports, as in the original
    # (NOTE(review): presumably to avoid an import cycle - confirm).
    from ..conf import DEFAULT_ATTACK_SEED
    from ..protocols import Corruptor as CorruptorProtocol

    spec: Any
    if isinstance(glitchlings, CorruptorProtocol):
        # Checked first so a corruptor that is also a Sequence is cloned whole.
        spec = glitchlings.clone()
    elif isinstance(glitchlings, Sequence) and not isinstance(glitchlings, str):
        # str is itself a Sequence; it must not be exploded into characters.
        spec = [
            member.clone() if isinstance(member, CorruptorProtocol) else member
            for member in glitchlings
        ]
    else:
        spec = glitchlings

    return coerce_gaggle(
        spec,
        seed=DEFAULT_ATTACK_SEED if seed is None else seed,
        apply_seed_to_existing=True,
        transcript_target=transcript_target,
    )
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# ---------------------------------------------------------------------------
|
|
127
|
+
# Corruption Execution
|
|
128
|
+
# ---------------------------------------------------------------------------
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def execute_corruption(
    gaggle: "Corruptor",
    plan: AttackPlan,
    original_container: str | Transcript | Sequence[str],
) -> tuple[str | Transcript | Sequence[str], list[str]]:
    """Run ``gaggle.corrupt`` over the input in the shape the plan dictates.

    ``plan.input_type`` selects the path: ``"batch"`` corrupts each item in
    turn, ``"transcript"`` corrupts the whole transcript in one call, and any
    other value treats the container as a single string.

    Args:
        gaggle: Object exposing ``corrupt``.
        plan: Attack plan; only ``input_type`` is read here.
        original_container: The input to corrupt.

    Returns:
        ``(corrupted_container, corrupted_contents)``. For batches both
        elements are the same list object; for a single string the contents
        are a one-element list.

    Raises:
        TypeError: If a batch item or a single string does not come back
            from ``corrupt`` as ``str``.
        ValueError: If a transcript input does not come back as a transcript.
    """
    kind = plan.input_type

    if kind == "batch":
        # Snapshot the items before corrupting any of them.
        pending = list(cast(Sequence[str], original_container))
        results: list[str] = []
        for item in pending:
            mangled = gaggle.corrupt(item)
            if isinstance(mangled, str):
                results.append(mangled)
                continue
            raise TypeError(
                f"Attack expected str output for batch items, got {type(mangled).__name__}"
            )
        return results, results

    if kind == "transcript":
        mangled_transcript = gaggle.corrupt(cast(Transcript, original_container))
        if is_transcript(mangled_transcript):
            turns = cast(Sequence[Mapping[str, Any]], mangled_transcript)
            return mangled_transcript, extract_transcript_contents(turns)
        raise ValueError(
            f"Attack expected transcript output for transcript input, "
            f"got {type(mangled_transcript).__name__}"
        )

    # Fallback: a single string
    mangled_text = gaggle.corrupt(cast(str, original_container))
    if isinstance(mangled_text, str):
        return mangled_text, [mangled_text]
    raise TypeError(
        f"Attack expected str output for string input, got {type(mangled_text).__name__}"
    )
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
# ---------------------------------------------------------------------------
|
|
183
|
+
# Tokenization Execution
|
|
184
|
+
# ---------------------------------------------------------------------------
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def execute_tokenization(
    tokenizer: Tokenizer,
    contents: list[str],
) -> EncodedData:
    """Encode a list of strings into batched tokens and token IDs.

    Args:
        tokenizer: Resolved tokenizer instance passed to ``encode_batch``.
        contents: Strings to tokenize. An empty list skips the tokenizer
            entirely.

    Returns:
        EncodedData holding the batched tokens and token IDs; both are empty
        lists when ``contents`` is empty.
    """
    if contents:
        tokens, ids = encode_batch(tokenizer, contents)
    else:
        # Nothing to encode: do not call the tokenizer at all.
        tokens, ids = [], []
    return EncodedData(tokens=tokens, token_ids=ids)
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
# ---------------------------------------------------------------------------
|
|
208
|
+
# Metric Execution
|
|
209
|
+
# ---------------------------------------------------------------------------
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def execute_metrics(
    metrics: dict[str, Metric],
    input_tokens: list[list[str]],
    output_tokens: list[list[str]],
) -> dict[str, list[float]]:
    """Evaluate every metric on the batched token lists.

    Each metric is called once as ``metric(input_tokens, output_tokens)``,
    in the dictionary's iteration order.

    Args:
        metrics: Mapping of metric name to metric callable.
        input_tokens: Tokens of the original content, one list per item.
        output_tokens: Tokens of the corrupted content, one list per item.

    Returns:
        Mapping of metric name to a list of values. A metric that returns a
        list is stored as-is; any other return value is wrapped in a
        one-element list.
    """

    def _as_list(value: Any) -> list[float]:
        # Only genuine lists pass through; scalars (and tuples) get wrapped.
        return value if isinstance(value, list) else [value]

    return {
        label: _as_list(compute(input_tokens, output_tokens))
        for label, compute in metrics.items()
    }
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
# ---------------------------------------------------------------------------
|
|
243
|
+
# Full Attack Execution
|
|
244
|
+
# ---------------------------------------------------------------------------
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def execute_attack(
    gaggle: "Corruptor",
    tokenizer: Tokenizer,
    metrics: dict[str, Metric],
    plan: AttackPlan,
    result_plan: ResultPlan,
    original_container: str | Transcript | Sequence[str],
    *,
    include_tokens: bool = True,
) -> dict[str, object]:
    """Run one attack end to end and return the fields for its result.

    Steps, in order:
        1. Short-circuit when the plan is marked empty.
        2. Build a ``BatchAdapter`` from the plan.
        3. Corrupt the input.
        4. Tokenize the original and the corrupted contents.
        5. Compute the metrics on the two token batches.
        6. Assemble the result fields.

    Args:
        gaggle: Glitchling(s) used for corruption.
        tokenizer: Resolved tokenizer.
        metrics: Mapping of metric name to metric callable.
        plan: Attack execution plan.
        result_plan: Supplies ``tokenizer_info`` and ``metric_names``.
        original_container: Original input container.
        include_tokens: When False, tokens are still computed for the
            metrics, but empty ``EncodedData`` is passed to result assembly.

    Returns:
        Dictionary of fields produced by ``assemble_result_fields`` (or
        ``assemble_empty_result_fields`` for an empty plan).
    """
    if plan.is_empty:
        # Empty input: the original is reported as its own "corrupted" form.
        return assemble_empty_result_fields(
            original=original_container,
            corrupted=original_container,
            tokenizer_info=result_plan.tokenizer_info,
            metric_names=result_plan.metric_names,
        )

    batch_adapter = BatchAdapter.from_plan(plan)

    corrupted_container, corrupted_contents = execute_corruption(
        gaggle, plan, original_container
    )

    # Both sides are tokenized as batches, whatever the input shape.
    encoded_before = execute_tokenization(tokenizer, plan.original_contents)
    encoded_after = execute_tokenization(tokenizer, corrupted_contents)

    # Metrics always see the real tokens, even when they are not reported.
    computed_metrics = execute_metrics(
        metrics, encoded_before.tokens, encoded_after.tokens
    )

    if include_tokens:
        reported_before, reported_after = encoded_before, encoded_after
    else:
        # One shared empty placeholder stands in for both sides.
        reported_before = reported_after = EncodedData(tokens=[], token_ids=[])

    return assemble_result_fields(
        adapter=batch_adapter,
        original=original_container,
        corrupted=corrupted_container,
        input_encoded=reported_before,
        output_encoded=reported_after,
        tokenizer_info=result_plan.tokenizer_info,
        metrics=computed_metrics,
    )
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
# ---------------------------------------------------------------------------
|
|
325
|
+
# Comparison Execution
|
|
326
|
+
# ---------------------------------------------------------------------------
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def execute_comparison_entry(
    gaggle: "Corruptor",
    tokenizer: Tokenizer,
    tokenizer_info: str,
    metrics: dict[str, Metric],
    text: str | Transcript | Sequence[str],
) -> tuple[str, dict[str, object]]:
    """Plan and execute one attack for a single comparison entry.

    Args:
        gaggle: Glitchling(s) used for corruption.
        tokenizer: Resolved tokenizer.
        tokenizer_info: Tokenizer description; returned unchanged alongside
            the result fields.
        metrics: Mapping of metric name to metric callable.
        text: Input to attack.

    Returns:
        ``(tokenizer_info, result_fields)`` where ``result_fields`` comes
        from ``execute_attack`` called with its default ``include_tokens``.
    """
    from .core_planning import plan_attack, plan_result

    # Planning: the metric names are taken from the registry keys, in order.
    entry_plan = plan_attack(text)
    entry_result_plan = plan_result(entry_plan, list(metrics), tokenizer_info)

    # Execution
    result_fields = execute_attack(
        gaggle,
        tokenizer,
        metrics,
        plan=entry_plan,
        result_plan=entry_result_plan,
        original_container=text,
    )
    return tokenizer_info, result_fields
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
# Public API of this module, grouped by role.
__all__ = [
    # Defaults: the built-in metric registry
    "get_default_metrics",
    # Resolution: glitchling specification -> seeded corruptor
    "resolve_glitchlings",
    # Execution: individual stages and the full pipeline
    "execute_corruption",
    "execute_tokenization",
    "execute_metrics",
    "execute_attack",
    "execute_comparison_entry",
]
|