glitchlings 0.2.5__cp312-cp312-win_amd64.whl → 0.9.3__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings/__init__.py +36 -17
- glitchlings/__main__.py +0 -1
- glitchlings/_zoo_rust/__init__.py +12 -0
- glitchlings/_zoo_rust.cp312-win_amd64.pyd +0 -0
- glitchlings/assets/__init__.py +180 -0
- glitchlings/assets/apostrofae_pairs.json +32 -0
- glitchlings/assets/ekkokin_homophones.json +2014 -0
- glitchlings/assets/hokey_assets.json +193 -0
- glitchlings/assets/lexemes/academic.json +1049 -0
- glitchlings/assets/lexemes/colors.json +1333 -0
- glitchlings/assets/lexemes/corporate.json +716 -0
- glitchlings/assets/lexemes/cyberpunk.json +22 -0
- glitchlings/assets/lexemes/lovecraftian.json +23 -0
- glitchlings/assets/lexemes/synonyms.json +3354 -0
- glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
- glitchlings/assets/pipeline_assets.json +29 -0
- glitchlings/attack/__init__.py +53 -0
- glitchlings/attack/compose.py +299 -0
- glitchlings/attack/core.py +465 -0
- glitchlings/attack/encode.py +114 -0
- glitchlings/attack/metrics.py +104 -0
- glitchlings/attack/metrics_dispatch.py +70 -0
- glitchlings/attack/tokenization.py +157 -0
- glitchlings/auggie.py +283 -0
- glitchlings/compat/__init__.py +9 -0
- glitchlings/compat/loaders.py +355 -0
- glitchlings/compat/types.py +41 -0
- glitchlings/conf/__init__.py +41 -0
- glitchlings/conf/loaders.py +331 -0
- glitchlings/conf/schema.py +156 -0
- glitchlings/conf/types.py +72 -0
- glitchlings/config.toml +2 -0
- glitchlings/constants.py +59 -0
- glitchlings/dev/__init__.py +3 -0
- glitchlings/dev/docs.py +45 -0
- glitchlings/dlc/__init__.py +17 -3
- glitchlings/dlc/_shared.py +296 -0
- glitchlings/dlc/gutenberg.py +400 -0
- glitchlings/dlc/huggingface.py +37 -65
- glitchlings/dlc/prime.py +55 -114
- glitchlings/dlc/pytorch.py +98 -0
- glitchlings/dlc/pytorch_lightning.py +173 -0
- glitchlings/internal/__init__.py +16 -0
- glitchlings/internal/rust.py +159 -0
- glitchlings/internal/rust_ffi.py +432 -0
- glitchlings/main.py +123 -32
- glitchlings/runtime_config.py +24 -0
- glitchlings/util/__init__.py +29 -176
- glitchlings/util/adapters.py +65 -0
- glitchlings/util/keyboards.py +311 -0
- glitchlings/util/transcripts.py +108 -0
- glitchlings/zoo/__init__.py +47 -24
- glitchlings/zoo/assets/__init__.py +29 -0
- glitchlings/zoo/core.py +301 -167
- glitchlings/zoo/core_execution.py +98 -0
- glitchlings/zoo/core_planning.py +451 -0
- glitchlings/zoo/corrupt_dispatch.py +295 -0
- glitchlings/zoo/ekkokin.py +118 -0
- glitchlings/zoo/hokey.py +137 -0
- glitchlings/zoo/jargoyle.py +179 -274
- glitchlings/zoo/mim1c.py +106 -68
- glitchlings/zoo/pedant/__init__.py +107 -0
- glitchlings/zoo/pedant/core.py +105 -0
- glitchlings/zoo/pedant/forms.py +74 -0
- glitchlings/zoo/pedant/stones.py +74 -0
- glitchlings/zoo/redactyl.py +44 -175
- glitchlings/zoo/rng.py +259 -0
- glitchlings/zoo/rushmore.py +359 -116
- glitchlings/zoo/scannequin.py +18 -125
- glitchlings/zoo/transforms.py +386 -0
- glitchlings/zoo/typogre.py +76 -162
- glitchlings/zoo/validation.py +477 -0
- glitchlings/zoo/zeedub.py +33 -86
- glitchlings-0.9.3.dist-info/METADATA +334 -0
- glitchlings-0.9.3.dist-info/RECORD +80 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/entry_points.txt +1 -0
- glitchlings/zoo/_ocr_confusions.py +0 -34
- glitchlings/zoo/_rate.py +0 -21
- glitchlings/zoo/reduple.py +0 -169
- glitchlings-0.2.5.dist-info/METADATA +0 -490
- glitchlings-0.2.5.dist-info/RECORD +0 -27
- /glitchlings/{zoo → assets}/ocr_confusions.tsv +0 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/WHEEL +0 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
{
|
|
2
|
+
"pipeline_assets": [
|
|
3
|
+
{
|
|
4
|
+
"name": "apostrofae_pairs.json",
|
|
5
|
+
"kind": "copy"
|
|
6
|
+
},
|
|
7
|
+
{
|
|
8
|
+
"name": "ekkokin_homophones.json",
|
|
9
|
+
"kind": "copy"
|
|
10
|
+
},
|
|
11
|
+
{
|
|
12
|
+
"name": "hokey_assets.json",
|
|
13
|
+
"kind": "copy"
|
|
14
|
+
},
|
|
15
|
+
{
|
|
16
|
+
"name": "lexemes",
|
|
17
|
+
"kind": "copy"
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"name": "ocr_confusions.tsv",
|
|
21
|
+
"kind": "copy"
|
|
22
|
+
},
|
|
23
|
+
{
|
|
24
|
+
"name": "mim1c_homoglyphs.json.gz.b64",
|
|
25
|
+
"kind": "compressed",
|
|
26
|
+
"output": "mim1c_homoglyphs.json"
|
|
27
|
+
}
|
|
28
|
+
]
|
|
29
|
+
}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Attack submodule for comparing text before and after corruption."""
|
|
2
|
+
|
|
3
|
+
from .compose import (
|
|
4
|
+
AttackResultComponents,
|
|
5
|
+
EncodedPayload,
|
|
6
|
+
build_batch_result,
|
|
7
|
+
build_empty_metrics,
|
|
8
|
+
build_empty_result,
|
|
9
|
+
build_single_result,
|
|
10
|
+
extract_transcript_contents,
|
|
11
|
+
format_metrics_for_batch,
|
|
12
|
+
format_metrics_for_single,
|
|
13
|
+
)
|
|
14
|
+
from .core import Attack, AttackResult, MultiAttackResult
|
|
15
|
+
from .encode import describe_tokenizer, encode_batch, encode_single
|
|
16
|
+
from .metrics import (
|
|
17
|
+
jensen_shannon_divergence,
|
|
18
|
+
normalized_edit_distance,
|
|
19
|
+
subsequence_retention,
|
|
20
|
+
)
|
|
21
|
+
from .metrics_dispatch import TokenBatch, TokenSequence, is_batch, validate_batch_consistency
|
|
22
|
+
from .tokenization import Tokenizer
|
|
23
|
+
|
|
24
|
+
# Public API of the attack package, grouped by the submodule each name
# is imported from above.
__all__ = [
    # core / tokenization
    "Attack",
    "AttackResult",
    "MultiAttackResult",
    "Tokenizer",
    # metrics
    "jensen_shannon_divergence",
    "normalized_edit_distance",
    "subsequence_retention",
    # compose (pure result assembly)
    "AttackResultComponents",
    "EncodedPayload",
    "build_batch_result",
    "build_empty_metrics",
    "build_empty_result",
    "build_single_result",
    "extract_transcript_contents",
    "format_metrics_for_batch",
    "format_metrics_for_single",
    # encode (pure)
    "describe_tokenizer",
    "encode_batch",
    "encode_single",
    # metrics_dispatch (pure)
    "TokenBatch",
    "TokenSequence",
    "is_batch",
    "validate_batch_consistency",
]
|
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
"""Pure result assembly functions for Attack.
|
|
2
|
+
|
|
3
|
+
This module contains pure functions for composing AttackResult instances
|
|
4
|
+
from pre-computed components. Functions here are:
|
|
5
|
+
|
|
6
|
+
- **Pure**: Output depends only on inputs, no side effects
|
|
7
|
+
- **Deterministic**: Same inputs always produce same outputs
|
|
8
|
+
- **Self-contained**: No IO, no Rust FFI, no config loading
|
|
9
|
+
|
|
10
|
+
Design Philosophy
|
|
11
|
+
-----------------
|
|
12
|
+
This module implements the innermost layer of Attack composition:
|
|
13
|
+
|
|
14
|
+
Attack.run() → tokenize → corrupt → compose.py → AttackResult
|
|
15
|
+
(orchestrator) (impure) (impure) (pure) (value)
|
|
16
|
+
|
|
17
|
+
Functions receive already-computed tokens, IDs, and metrics. They trust
|
|
18
|
+
that inputs are valid and do not re-validate. Boundary validation happens
|
|
19
|
+
in Attack.__init__ and before calling these functions.
|
|
20
|
+
|
|
21
|
+
See AGENTS.md "Functional Purity Architecture" for full details.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
from collections.abc import Mapping, Sequence
|
|
27
|
+
from dataclasses import dataclass
|
|
28
|
+
from typing import TYPE_CHECKING
|
|
29
|
+
|
|
30
|
+
if TYPE_CHECKING: # pragma: no cover - typing only
|
|
31
|
+
from ..util.transcripts import Transcript
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
# Data Types
|
|
36
|
+
# ---------------------------------------------------------------------------
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass(frozen=True, slots=True)
|
|
40
|
+
class EncodedPayload:
|
|
41
|
+
"""Encoded representation of text or transcript for metric computation.
|
|
42
|
+
|
|
43
|
+
Attributes:
|
|
44
|
+
tokens: Token strings from tokenizer (flat or batched).
|
|
45
|
+
token_ids: Token IDs from tokenizer (flat or batched).
|
|
46
|
+
is_batched: True if this represents a transcript (batch of texts).
|
|
47
|
+
"""
|
|
48
|
+
|
|
49
|
+
tokens: list[str] | list[list[str]]
|
|
50
|
+
token_ids: list[int] | list[list[int]]
|
|
51
|
+
is_batched: bool
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass(frozen=True, slots=True)
|
|
55
|
+
class AttackResultComponents:
|
|
56
|
+
"""Intermediate structure holding all components needed for AttackResult.
|
|
57
|
+
|
|
58
|
+
This is a pure value type that aggregates pre-computed components
|
|
59
|
+
before final assembly into AttackResult.
|
|
60
|
+
"""
|
|
61
|
+
|
|
62
|
+
original: "str | Transcript"
|
|
63
|
+
corrupted: "str | Transcript"
|
|
64
|
+
input_encoded: EncodedPayload
|
|
65
|
+
output_encoded: EncodedPayload
|
|
66
|
+
tokenizer_info: str
|
|
67
|
+
metrics: dict[str, float | list[float]]
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# ---------------------------------------------------------------------------
|
|
71
|
+
# Transcript Content Extraction
|
|
72
|
+
# ---------------------------------------------------------------------------
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def extract_transcript_contents(transcript: "Transcript") -> list[str]:
    """Collect the ``content`` string of every turn in a chat transcript.

    Pure helper: the transcript is only read, never modified. Turns are
    visited in order and the first malformed turn aborts the extraction.

    Args:
        transcript: Iterable of turn mappings, each carrying a string
            under the ``'content'`` key.

    Returns:
        The content strings, in turn order.

    Raises:
        TypeError: If a turn is not a mapping, or if its ``'content'``
            value is absent or not a string. The message reports the
            1-based position of the offending turn.
    """
    texts: list[str] = []
    for index, entry in enumerate(transcript):
        if not isinstance(entry, Mapping):
            raise TypeError(f"Transcript turn #{index + 1} must be a mapping.")
        text = entry.get("content")
        if isinstance(text, str):
            texts.append(text)
            continue
        # A missing key yields None from .get(), which lands here too.
        raise TypeError(f"Transcript turn #{index + 1} is missing string content.")
    return texts
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
# ---------------------------------------------------------------------------
|
|
103
|
+
# Metric Formatting
|
|
104
|
+
# ---------------------------------------------------------------------------
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def format_metrics_for_single(
    metrics: dict[str, float | list[float]],
) -> dict[str, float]:
    """Reduce every metric to one scalar for single-string results.

    List-valued metrics contribute their first element; an empty list
    becomes ``0.0``. Non-list values are copied through unchanged. The
    input mapping is not modified.

    Args:
        metrics: Metric name mapped to a float or a list of floats.

    Returns:
        New dictionary with the same keys, in the same order, and one
        value per metric.
    """
    scalars: dict[str, float] = {}
    for key, entry in metrics.items():
        if not isinstance(entry, list):
            scalars[key] = entry
        elif entry:
            scalars[key] = entry[0]
        else:
            # Empty list: no value available, fall back to zero.
            scalars[key] = 0.0
    return scalars
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def format_metrics_for_batch(
    metrics: dict[str, float | list[float]],
) -> dict[str, list[float]]:
    """Coerce every metric to a list for transcript (batch) results.

    Values that are already lists are shallow-copied, so the returned
    lists are independent of the caller's; any other value is wrapped
    in a one-element list.

    Args:
        metrics: Metric name mapped to a float or a list of floats.

    Returns:
        New dictionary with the same keys, in the same order, and a list
        for every metric.
    """
    return {
        key: list(entry) if isinstance(entry, list) else [entry]
        for key, entry in metrics.items()
    }
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
# ---------------------------------------------------------------------------
|
|
155
|
+
# Empty Result Construction
|
|
156
|
+
# ---------------------------------------------------------------------------
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def build_empty_metrics(metric_names: list[str]) -> dict[str, list[float]]:
    """Build the metric mapping used when the transcript input is empty.

    Args:
        metric_names: Names of the metrics to include.

    Returns:
        Dictionary mapping each name to its own new empty list.
    """
    placeholders: dict[str, list[float]] = {}
    for metric in metric_names:
        # A fresh list per key, so mutating one entry never affects another.
        placeholders[metric] = []
    return placeholders
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
# ---------------------------------------------------------------------------
|
|
172
|
+
# Result Assembly
|
|
173
|
+
# ---------------------------------------------------------------------------
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def build_single_result(
    original: str,
    corrupted: str,
    input_tokens: list[str],
    input_token_ids: list[int],
    output_tokens: list[str],
    output_token_ids: list[int],
    tokenizer_info: str,
    metrics: dict[str, float | list[float]],
) -> dict[str, object]:
    """Build the AttackResult field dictionary for a single-string input.

    Pure assembly step: every argument is stored as given, except
    ``metrics``, which is collapsed to scalars through
    ``format_metrics_for_single``.

    Args:
        original: Original input string.
        corrupted: Corrupted output string.
        input_tokens: Tokenized input.
        input_token_ids: Token IDs for the input.
        output_tokens: Tokenized output.
        output_token_ids: Token IDs for the output.
        tokenizer_info: Description of the tokenizer used.
        metrics: Computed metrics (scalars or lists).

    Returns:
        Dictionary with all AttackResult field values.
    """
    scalar_metrics = format_metrics_for_single(metrics)
    return dict(
        original=original,
        corrupted=corrupted,
        input_tokens=input_tokens,
        output_tokens=output_tokens,
        input_token_ids=input_token_ids,
        output_token_ids=output_token_ids,
        tokenizer_info=tokenizer_info,
        metrics=scalar_metrics,
    )
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def build_batch_result(
    original: "Transcript | Sequence[str]",
    corrupted: "Transcript | Sequence[str]",
    input_tokens: list[list[str]],
    input_token_ids: list[list[int]],
    output_tokens: list[list[str]],
    output_token_ids: list[list[int]],
    tokenizer_info: str,
    metrics: dict[str, float | list[float]],
) -> dict[str, object]:
    """Build the AttackResult field dictionary for a batched input.

    Pure assembly step: every argument, including ``metrics``, is stored
    exactly as given. No reformatting or copying is done here, so the
    caller is expected to pass metrics that are already in batch format.

    Args:
        original: Original transcript or list of strings.
        corrupted: Corrupted transcript or list of strings.
        input_tokens: Batched tokenized inputs.
        input_token_ids: Batched token IDs for the inputs.
        output_tokens: Batched tokenized outputs.
        output_token_ids: Batched token IDs for the outputs.
        tokenizer_info: Description of the tokenizer used.
        metrics: Computed metrics, already in batch format.

    Returns:
        Dictionary with all AttackResult field values.
    """
    return dict(
        original=original,
        corrupted=corrupted,
        input_tokens=input_tokens,
        output_tokens=output_tokens,
        input_token_ids=input_token_ids,
        output_token_ids=output_token_ids,
        tokenizer_info=tokenizer_info,
        metrics=metrics,
    )
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def build_empty_result(
    original: "Transcript | Sequence[str]",
    corrupted: "Transcript | Sequence[str]",
    tokenizer_info: str,
    metric_names: list[str],
) -> dict[str, object]:
    """Build the AttackResult field dictionary for an empty batch input.

    All four token fields are new, independent empty lists, and every
    requested metric maps to an empty list via ``build_empty_metrics``.

    Args:
        original: Original empty transcript or list.
        corrupted: Corrupted empty transcript or list.
        tokenizer_info: Description of the tokenizer used.
        metric_names: Names of metrics to include as empty lists.

    Returns:
        Dictionary with all AttackResult field values for empty input.
    """
    empty_metrics = build_empty_metrics(metric_names)
    return dict(
        original=original,
        corrupted=corrupted,
        input_tokens=[],
        output_tokens=[],
        input_token_ids=[],
        output_token_ids=[],
        tokenizer_info=tokenizer_info,
        metrics=empty_metrics,
    )
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
# Public names exported by this module, grouped by purpose.
__all__ = [
    # Value types
    "AttackResultComponents",
    "EncodedPayload",
    # Transcript content extraction
    "extract_transcript_contents",
    # Metric construction and formatting
    "build_empty_metrics",
    "format_metrics_for_batch",
    "format_metrics_for_single",
    # AttackResult field assembly
    "build_batch_result",
    "build_empty_result",
    "build_single_result",
]
|