glitchlings 1.0.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings/__init__.py +101 -0
- glitchlings/__main__.py +8 -0
- glitchlings/_corruption_engine/__init__.py +12 -0
- glitchlings/_corruption_engine.cp313-win_amd64.pyd +0 -0
- glitchlings/assets/__init__.py +180 -0
- glitchlings/assets/apostrofae_pairs.json +32 -0
- glitchlings/assets/ekkokin_homophones.json +2014 -0
- glitchlings/assets/hokey_assets.json +193 -0
- glitchlings/assets/lexemes/academic.json +1049 -0
- glitchlings/assets/lexemes/colors.json +1333 -0
- glitchlings/assets/lexemes/corporate.json +716 -0
- glitchlings/assets/lexemes/cyberpunk.json +22 -0
- glitchlings/assets/lexemes/lovecraftian.json +23 -0
- glitchlings/assets/lexemes/synonyms.json +3354 -0
- glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
- glitchlings/assets/ocr_confusions.tsv +30 -0
- glitchlings/assets/pipeline_assets.json +29 -0
- glitchlings/attack/__init__.py +184 -0
- glitchlings/attack/analysis.py +1321 -0
- glitchlings/attack/core.py +819 -0
- glitchlings/attack/core_execution.py +378 -0
- glitchlings/attack/core_planning.py +612 -0
- glitchlings/attack/encode.py +114 -0
- glitchlings/attack/metrics.py +211 -0
- glitchlings/attack/metrics_dispatch.py +70 -0
- glitchlings/attack/tokenization.py +338 -0
- glitchlings/attack/tokenizer_metrics.py +373 -0
- glitchlings/auggie.py +285 -0
- glitchlings/compat/__init__.py +9 -0
- glitchlings/compat/loaders.py +355 -0
- glitchlings/compat/types.py +41 -0
- glitchlings/conf/__init__.py +39 -0
- glitchlings/conf/loaders.py +331 -0
- glitchlings/conf/schema.py +156 -0
- glitchlings/conf/types.py +72 -0
- glitchlings/config.toml +2 -0
- glitchlings/constants.py +139 -0
- glitchlings/dev/__init__.py +3 -0
- glitchlings/dev/docs.py +45 -0
- glitchlings/dlc/__init__.py +21 -0
- glitchlings/dlc/_shared.py +300 -0
- glitchlings/dlc/gutenberg.py +400 -0
- glitchlings/dlc/huggingface.py +68 -0
- glitchlings/dlc/langchain.py +147 -0
- glitchlings/dlc/nemo.py +283 -0
- glitchlings/dlc/prime.py +215 -0
- glitchlings/dlc/pytorch.py +98 -0
- glitchlings/dlc/pytorch_lightning.py +173 -0
- glitchlings/internal/__init__.py +16 -0
- glitchlings/internal/rust.py +159 -0
- glitchlings/internal/rust_ffi.py +599 -0
- glitchlings/main.py +426 -0
- glitchlings/protocols.py +91 -0
- glitchlings/runtime_config.py +24 -0
- glitchlings/util/__init__.py +41 -0
- glitchlings/util/adapters.py +65 -0
- glitchlings/util/keyboards.py +508 -0
- glitchlings/util/transcripts.py +108 -0
- glitchlings/zoo/__init__.py +161 -0
- glitchlings/zoo/assets/__init__.py +29 -0
- glitchlings/zoo/core.py +852 -0
- glitchlings/zoo/core_execution.py +154 -0
- glitchlings/zoo/core_planning.py +451 -0
- glitchlings/zoo/corrupt_dispatch.py +291 -0
- glitchlings/zoo/hokey.py +139 -0
- glitchlings/zoo/jargoyle.py +301 -0
- glitchlings/zoo/mim1c.py +269 -0
- glitchlings/zoo/pedant/__init__.py +109 -0
- glitchlings/zoo/pedant/core.py +99 -0
- glitchlings/zoo/pedant/forms.py +50 -0
- glitchlings/zoo/pedant/stones.py +83 -0
- glitchlings/zoo/redactyl.py +94 -0
- glitchlings/zoo/rng.py +280 -0
- glitchlings/zoo/rushmore.py +416 -0
- glitchlings/zoo/scannequin.py +370 -0
- glitchlings/zoo/transforms.py +331 -0
- glitchlings/zoo/typogre.py +194 -0
- glitchlings/zoo/validation.py +643 -0
- glitchlings/zoo/wherewolf.py +120 -0
- glitchlings/zoo/zeedub.py +165 -0
- glitchlings-1.0.0.dist-info/METADATA +404 -0
- glitchlings-1.0.0.dist-info/RECORD +86 -0
- glitchlings-1.0.0.dist-info/WHEEL +5 -0
- glitchlings-1.0.0.dist-info/entry_points.txt +3 -0
- glitchlings-1.0.0.dist-info/licenses/LICENSE +201 -0
- glitchlings-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# Source Replacements (space-separated)
|
|
2
|
+
li h
|
|
3
|
+
h li
|
|
4
|
+
rn m
|
|
5
|
+
m rn
|
|
6
|
+
cl d
|
|
7
|
+
d cl
|
|
8
|
+
I l
|
|
9
|
+
l I 1
|
|
10
|
+
1 l I
|
|
11
|
+
0 O
|
|
12
|
+
O 0
|
|
13
|
+
B 8
|
|
14
|
+
8 B
|
|
15
|
+
S 5
|
|
16
|
+
5 S
|
|
17
|
+
Z 2
|
|
18
|
+
2 Z
|
|
19
|
+
G 6
|
|
20
|
+
6 G
|
|
21
|
+
“ "
|
|
22
|
+
” "
|
|
23
|
+
‘ '
|
|
24
|
+
’ '
|
|
25
|
+
— -
|
|
26
|
+
– -
|
|
27
|
+
vv w
|
|
28
|
+
w vv
|
|
29
|
+
ri n
|
|
30
|
+
n ri
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
{
|
|
2
|
+
"pipeline_assets": [
|
|
3
|
+
{
|
|
4
|
+
"name": "apostrofae_pairs.json",
|
|
5
|
+
"kind": "copy"
|
|
6
|
+
},
|
|
7
|
+
{
|
|
8
|
+
"name": "ekkokin_homophones.json",
|
|
9
|
+
"kind": "copy"
|
|
10
|
+
},
|
|
11
|
+
{
|
|
12
|
+
"name": "hokey_assets.json",
|
|
13
|
+
"kind": "copy"
|
|
14
|
+
},
|
|
15
|
+
{
|
|
16
|
+
"name": "lexemes",
|
|
17
|
+
"kind": "copy"
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"name": "ocr_confusions.tsv",
|
|
21
|
+
"kind": "copy"
|
|
22
|
+
},
|
|
23
|
+
{
|
|
24
|
+
"name": "mim1c_homoglyphs.json.gz.b64",
|
|
25
|
+
"kind": "compressed",
|
|
26
|
+
"output": "mim1c_homoglyphs.json"
|
|
27
|
+
}
|
|
28
|
+
]
|
|
29
|
+
}
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
"""Attack submodule for comparing text before and after corruption.
|
|
2
|
+
|
|
3
|
+
This module follows the functional purity architecture:
|
|
4
|
+
|
|
5
|
+
**Pure Planning** (core_planning.py):
|
|
6
|
+
- Input analysis and type guards
|
|
7
|
+
- Attack plan construction
|
|
8
|
+
- Result assembly helpers
|
|
9
|
+
|
|
10
|
+
**Impure Execution** (core_execution.py):
|
|
11
|
+
- Glitchling resolution
|
|
12
|
+
- Tokenization execution
|
|
13
|
+
- Metric computation
|
|
14
|
+
|
|
15
|
+
**Boundary Layer** (core.py):
|
|
16
|
+
- Input validation
|
|
17
|
+
- Orchestration via Attack class
|
|
18
|
+
|
|
19
|
+
**Analysis Tools** (analysis.py):
|
|
20
|
+
- SeedSweep, GridSearch, TokenizerComparison
|
|
21
|
+
|
|
22
|
+
See AGENTS.md "Functional Purity Architecture" for full details.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from .analysis import (
|
|
26
|
+
GlitchlingComparisonEntry,
|
|
27
|
+
GlitchlingComparisonResult,
|
|
28
|
+
GridSearch,
|
|
29
|
+
GridSearchPoint,
|
|
30
|
+
GridSearchResult,
|
|
31
|
+
SeedSweep,
|
|
32
|
+
SeedSweepResult,
|
|
33
|
+
TokenizerComparison,
|
|
34
|
+
TokenizerComparisonEntry,
|
|
35
|
+
TokenizerComparisonResult,
|
|
36
|
+
compare_glitchlings,
|
|
37
|
+
compare_tokenizers,
|
|
38
|
+
compute_aggregate_stats,
|
|
39
|
+
extract_scalar_metrics,
|
|
40
|
+
format_stats_summary,
|
|
41
|
+
generate_param_combinations,
|
|
42
|
+
rank_grid_points,
|
|
43
|
+
)
|
|
44
|
+
from .core import Attack, AttackResult, StreamingAttackResult, StreamingTokens, TokenWindow
|
|
45
|
+
from .core_execution import (
|
|
46
|
+
execute_attack,
|
|
47
|
+
execute_corruption,
|
|
48
|
+
execute_metrics,
|
|
49
|
+
execute_tokenization,
|
|
50
|
+
get_default_metrics,
|
|
51
|
+
resolve_glitchlings,
|
|
52
|
+
)
|
|
53
|
+
from .core_planning import (
|
|
54
|
+
AttackPlan,
|
|
55
|
+
BatchAdapter,
|
|
56
|
+
EncodedData,
|
|
57
|
+
ResultPlan,
|
|
58
|
+
assemble_batch_result_fields,
|
|
59
|
+
assemble_empty_result_fields,
|
|
60
|
+
assemble_result_fields,
|
|
61
|
+
assemble_single_result_fields,
|
|
62
|
+
compute_token_counts,
|
|
63
|
+
extract_transcript_contents,
|
|
64
|
+
format_token_count_delta,
|
|
65
|
+
is_string_batch,
|
|
66
|
+
is_transcript_like,
|
|
67
|
+
plan_attack,
|
|
68
|
+
plan_result,
|
|
69
|
+
)
|
|
70
|
+
from .encode import describe_tokenizer, encode_batch, encode_single
|
|
71
|
+
from .metrics import (
|
|
72
|
+
MetricName,
|
|
73
|
+
entropy_delta,
|
|
74
|
+
jensen_shannon_divergence,
|
|
75
|
+
merge_split_index,
|
|
76
|
+
normalized_edit_distance,
|
|
77
|
+
subsequence_retention,
|
|
78
|
+
)
|
|
79
|
+
from .metrics_dispatch import TokenBatch, TokenSequence, is_batch, validate_batch_consistency
|
|
80
|
+
from .tokenization import (
|
|
81
|
+
Tokenizer,
|
|
82
|
+
clear_tokenizer_cache,
|
|
83
|
+
get_tokenizer_cache_info,
|
|
84
|
+
list_available_tokenizers,
|
|
85
|
+
)
|
|
86
|
+
from .tokenizer_metrics import (
|
|
87
|
+
DEFAULT_UNKNOWN_MARKERS,
|
|
88
|
+
analyze_tokenizer,
|
|
89
|
+
batch_characters_per_token,
|
|
90
|
+
batch_compression_ratio,
|
|
91
|
+
batch_token_entropy,
|
|
92
|
+
batch_unknown_token_rate,
|
|
93
|
+
batch_vocabulary_utilization,
|
|
94
|
+
characters_per_token,
|
|
95
|
+
compression_ratio,
|
|
96
|
+
token_entropy,
|
|
97
|
+
unknown_token_rate,
|
|
98
|
+
vocabulary_utilization,
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
__all__ = [
|
|
102
|
+
# Core orchestration and tokenizer utilities
|
|
103
|
+
"Attack",
|
|
104
|
+
"AttackResult",
|
|
105
|
+
"StreamingAttackResult",
|
|
106
|
+
"StreamingTokens",
|
|
107
|
+
"TokenWindow",
|
|
108
|
+
"Tokenizer",
|
|
109
|
+
"clear_tokenizer_cache",
|
|
110
|
+
"get_tokenizer_cache_info",
|
|
111
|
+
"list_available_tokenizers",
|
|
112
|
+
# Metrics
|
|
113
|
+
"MetricName",
|
|
114
|
+
"jensen_shannon_divergence",
|
|
115
|
+
"normalized_edit_distance",
|
|
116
|
+
"subsequence_retention",
|
|
117
|
+
"entropy_delta",
|
|
118
|
+
"merge_split_index",
|
|
119
|
+
# Analysis tools (impure orchestrators)
|
|
120
|
+
"SeedSweep",
|
|
121
|
+
"SeedSweepResult",
|
|
122
|
+
"GridSearch",
|
|
123
|
+
"GridSearchResult",
|
|
124
|
+
"GridSearchPoint",
|
|
125
|
+
"TokenizerComparison",
|
|
126
|
+
"TokenizerComparisonResult",
|
|
127
|
+
"TokenizerComparisonEntry",
|
|
128
|
+
# Comparison functions and related types
|
|
129
|
+
"compare_glitchlings",
|
|
130
|
+
"compare_tokenizers",
|
|
131
|
+
"GlitchlingComparisonEntry",
|
|
132
|
+
"GlitchlingComparisonResult",
|
|
133
|
+
# Analysis pure helpers
|
|
134
|
+
"compute_aggregate_stats",
|
|
135
|
+
"format_stats_summary",
|
|
136
|
+
"extract_scalar_metrics",
|
|
137
|
+
"generate_param_combinations",
|
|
138
|
+
"rank_grid_points",
|
|
139
|
+
# Core planning (pure)
|
|
140
|
+
"AttackPlan",
|
|
141
|
+
"BatchAdapter",
|
|
142
|
+
"ResultPlan",
|
|
143
|
+
"EncodedData",
|
|
144
|
+
"plan_attack",
|
|
145
|
+
"plan_result",
|
|
146
|
+
"is_string_batch",
|
|
147
|
+
"is_transcript_like",
|
|
148
|
+
"assemble_result_fields",
|
|
149
|
+
"assemble_single_result_fields",
|
|
150
|
+
"assemble_batch_result_fields",
|
|
151
|
+
"assemble_empty_result_fields",
|
|
152
|
+
"compute_token_counts",
|
|
153
|
+
"extract_transcript_contents",
|
|
154
|
+
"format_token_count_delta",
|
|
155
|
+
# Core execution (impure)
|
|
156
|
+
"get_default_metrics",
|
|
157
|
+
"resolve_glitchlings",
|
|
158
|
+
"execute_corruption",
|
|
159
|
+
"execute_tokenization",
|
|
160
|
+
"execute_metrics",
|
|
161
|
+
"execute_attack",
|
|
162
|
+
# Encode (pure)
|
|
163
|
+
"describe_tokenizer",
|
|
164
|
+
"encode_batch",
|
|
165
|
+
"encode_single",
|
|
166
|
+
# Metrics dispatch (pure)
|
|
167
|
+
"TokenBatch",
|
|
168
|
+
"TokenSequence",
|
|
169
|
+
"is_batch",
|
|
170
|
+
"validate_batch_consistency",
|
|
171
|
+
# Tokenizer metrics (pure)
|
|
172
|
+
"compression_ratio",
|
|
173
|
+
"batch_compression_ratio",
|
|
174
|
+
"characters_per_token",
|
|
175
|
+
"batch_characters_per_token",
|
|
176
|
+
"token_entropy",
|
|
177
|
+
"batch_token_entropy",
|
|
178
|
+
"vocabulary_utilization",
|
|
179
|
+
"batch_vocabulary_utilization",
|
|
180
|
+
"unknown_token_rate",
|
|
181
|
+
"batch_unknown_token_rate",
|
|
182
|
+
"analyze_tokenizer",
|
|
183
|
+
"DEFAULT_UNKNOWN_MARKERS",
|
|
184
|
+
]
|