glitchlings-1.0.0-cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. glitchlings/__init__.py +101 -0
  2. glitchlings/__main__.py +8 -0
  3. glitchlings/_corruption_engine/__init__.py +12 -0
  4. glitchlings/_corruption_engine.cp313-win_amd64.pyd +0 -0
  5. glitchlings/assets/__init__.py +180 -0
  6. glitchlings/assets/apostrofae_pairs.json +32 -0
  7. glitchlings/assets/ekkokin_homophones.json +2014 -0
  8. glitchlings/assets/hokey_assets.json +193 -0
  9. glitchlings/assets/lexemes/academic.json +1049 -0
  10. glitchlings/assets/lexemes/colors.json +1333 -0
  11. glitchlings/assets/lexemes/corporate.json +716 -0
  12. glitchlings/assets/lexemes/cyberpunk.json +22 -0
  13. glitchlings/assets/lexemes/lovecraftian.json +23 -0
  14. glitchlings/assets/lexemes/synonyms.json +3354 -0
  15. glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
  16. glitchlings/assets/ocr_confusions.tsv +30 -0
  17. glitchlings/assets/pipeline_assets.json +29 -0
  18. glitchlings/attack/__init__.py +184 -0
  19. glitchlings/attack/analysis.py +1321 -0
  20. glitchlings/attack/core.py +819 -0
  21. glitchlings/attack/core_execution.py +378 -0
  22. glitchlings/attack/core_planning.py +612 -0
  23. glitchlings/attack/encode.py +114 -0
  24. glitchlings/attack/metrics.py +211 -0
  25. glitchlings/attack/metrics_dispatch.py +70 -0
  26. glitchlings/attack/tokenization.py +338 -0
  27. glitchlings/attack/tokenizer_metrics.py +373 -0
  28. glitchlings/auggie.py +285 -0
  29. glitchlings/compat/__init__.py +9 -0
  30. glitchlings/compat/loaders.py +355 -0
  31. glitchlings/compat/types.py +41 -0
  32. glitchlings/conf/__init__.py +39 -0
  33. glitchlings/conf/loaders.py +331 -0
  34. glitchlings/conf/schema.py +156 -0
  35. glitchlings/conf/types.py +72 -0
  36. glitchlings/config.toml +2 -0
  37. glitchlings/constants.py +139 -0
  38. glitchlings/dev/__init__.py +3 -0
  39. glitchlings/dev/docs.py +45 -0
  40. glitchlings/dlc/__init__.py +21 -0
  41. glitchlings/dlc/_shared.py +300 -0
  42. glitchlings/dlc/gutenberg.py +400 -0
  43. glitchlings/dlc/huggingface.py +68 -0
  44. glitchlings/dlc/langchain.py +147 -0
  45. glitchlings/dlc/nemo.py +283 -0
  46. glitchlings/dlc/prime.py +215 -0
  47. glitchlings/dlc/pytorch.py +98 -0
  48. glitchlings/dlc/pytorch_lightning.py +173 -0
  49. glitchlings/internal/__init__.py +16 -0
  50. glitchlings/internal/rust.py +159 -0
  51. glitchlings/internal/rust_ffi.py +599 -0
  52. glitchlings/main.py +426 -0
  53. glitchlings/protocols.py +91 -0
  54. glitchlings/runtime_config.py +24 -0
  55. glitchlings/util/__init__.py +41 -0
  56. glitchlings/util/adapters.py +65 -0
  57. glitchlings/util/keyboards.py +508 -0
  58. glitchlings/util/transcripts.py +108 -0
  59. glitchlings/zoo/__init__.py +161 -0
  60. glitchlings/zoo/assets/__init__.py +29 -0
  61. glitchlings/zoo/core.py +852 -0
  62. glitchlings/zoo/core_execution.py +154 -0
  63. glitchlings/zoo/core_planning.py +451 -0
  64. glitchlings/zoo/corrupt_dispatch.py +291 -0
  65. glitchlings/zoo/hokey.py +139 -0
  66. glitchlings/zoo/jargoyle.py +301 -0
  67. glitchlings/zoo/mim1c.py +269 -0
  68. glitchlings/zoo/pedant/__init__.py +109 -0
  69. glitchlings/zoo/pedant/core.py +99 -0
  70. glitchlings/zoo/pedant/forms.py +50 -0
  71. glitchlings/zoo/pedant/stones.py +83 -0
  72. glitchlings/zoo/redactyl.py +94 -0
  73. glitchlings/zoo/rng.py +280 -0
  74. glitchlings/zoo/rushmore.py +416 -0
  75. glitchlings/zoo/scannequin.py +370 -0
  76. glitchlings/zoo/transforms.py +331 -0
  77. glitchlings/zoo/typogre.py +194 -0
  78. glitchlings/zoo/validation.py +643 -0
  79. glitchlings/zoo/wherewolf.py +120 -0
  80. glitchlings/zoo/zeedub.py +165 -0
  81. glitchlings-1.0.0.dist-info/METADATA +404 -0
  82. glitchlings-1.0.0.dist-info/RECORD +86 -0
  83. glitchlings-1.0.0.dist-info/WHEEL +5 -0
  84. glitchlings-1.0.0.dist-info/entry_points.txt +3 -0
  85. glitchlings-1.0.0.dist-info/licenses/LICENSE +201 -0
  86. glitchlings-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,30 @@
1
+ # Source Replacements (space-separated)
2
+ li h
3
+ h li
4
+ rn m
5
+ m rn
6
+ cl d
7
+ d cl
8
+ I l
9
+ l I 1
10
+ 1 l I
11
+ 0 O
12
+ O 0
13
+ B 8
14
+ 8 B
15
+ S 5
16
+ 5 S
17
+ Z 2
18
+ 2 Z
19
+ G 6
20
+ 6 G
21
+ “ "
22
+ ” "
23
+ ‘ '
24
+ ’ '
25
+ — -
26
+ – -
27
+ vv w
28
+ w vv
29
+ ri n
30
+ n ri
@@ -0,0 +1,29 @@
1
+ {
2
+ "pipeline_assets": [
3
+ {
4
+ "name": "apostrofae_pairs.json",
5
+ "kind": "copy"
6
+ },
7
+ {
8
+ "name": "ekkokin_homophones.json",
9
+ "kind": "copy"
10
+ },
11
+ {
12
+ "name": "hokey_assets.json",
13
+ "kind": "copy"
14
+ },
15
+ {
16
+ "name": "lexemes",
17
+ "kind": "copy"
18
+ },
19
+ {
20
+ "name": "ocr_confusions.tsv",
21
+ "kind": "copy"
22
+ },
23
+ {
24
+ "name": "mim1c_homoglyphs.json.gz.b64",
25
+ "kind": "compressed",
26
+ "output": "mim1c_homoglyphs.json"
27
+ }
28
+ ]
29
+ }
@@ -0,0 +1,184 @@
1
+ """Attack submodule for comparing text before and after corruption.
2
+
3
+ This module follows the functional purity architecture:
4
+
5
+ **Pure Planning** (core_planning.py):
6
+ - Input analysis and type guards
7
+ - Attack plan construction
8
+ - Result assembly helpers
9
+
10
+ **Impure Execution** (core_execution.py):
11
+ - Glitchling resolution
12
+ - Tokenization execution
13
+ - Metric computation
14
+
15
+ **Boundary Layer** (core.py):
16
+ - Input validation
17
+ - Orchestration via Attack class
18
+
19
+ **Analysis Tools** (analysis.py):
20
+ - SeedSweep, GridSearch, TokenizerComparison
21
+
22
+ See AGENTS.md "Functional Purity Architecture" for full details.
23
+ """
24
+
25
+ from .analysis import (
26
+ GlitchlingComparisonEntry,
27
+ GlitchlingComparisonResult,
28
+ GridSearch,
29
+ GridSearchPoint,
30
+ GridSearchResult,
31
+ SeedSweep,
32
+ SeedSweepResult,
33
+ TokenizerComparison,
34
+ TokenizerComparisonEntry,
35
+ TokenizerComparisonResult,
36
+ compare_glitchlings,
37
+ compare_tokenizers,
38
+ compute_aggregate_stats,
39
+ extract_scalar_metrics,
40
+ format_stats_summary,
41
+ generate_param_combinations,
42
+ rank_grid_points,
43
+ )
44
+ from .core import Attack, AttackResult, StreamingAttackResult, StreamingTokens, TokenWindow
45
+ from .core_execution import (
46
+ execute_attack,
47
+ execute_corruption,
48
+ execute_metrics,
49
+ execute_tokenization,
50
+ get_default_metrics,
51
+ resolve_glitchlings,
52
+ )
53
+ from .core_planning import (
54
+ AttackPlan,
55
+ BatchAdapter,
56
+ EncodedData,
57
+ ResultPlan,
58
+ assemble_batch_result_fields,
59
+ assemble_empty_result_fields,
60
+ assemble_result_fields,
61
+ assemble_single_result_fields,
62
+ compute_token_counts,
63
+ extract_transcript_contents,
64
+ format_token_count_delta,
65
+ is_string_batch,
66
+ is_transcript_like,
67
+ plan_attack,
68
+ plan_result,
69
+ )
70
+ from .encode import describe_tokenizer, encode_batch, encode_single
71
+ from .metrics import (
72
+ MetricName,
73
+ entropy_delta,
74
+ jensen_shannon_divergence,
75
+ merge_split_index,
76
+ normalized_edit_distance,
77
+ subsequence_retention,
78
+ )
79
+ from .metrics_dispatch import TokenBatch, TokenSequence, is_batch, validate_batch_consistency
80
+ from .tokenization import (
81
+ Tokenizer,
82
+ clear_tokenizer_cache,
83
+ get_tokenizer_cache_info,
84
+ list_available_tokenizers,
85
+ )
86
+ from .tokenizer_metrics import (
87
+ DEFAULT_UNKNOWN_MARKERS,
88
+ analyze_tokenizer,
89
+ batch_characters_per_token,
90
+ batch_compression_ratio,
91
+ batch_token_entropy,
92
+ batch_unknown_token_rate,
93
+ batch_vocabulary_utilization,
94
+ characters_per_token,
95
+ compression_ratio,
96
+ token_entropy,
97
+ unknown_token_rate,
98
+ vocabulary_utilization,
99
+ )
100
+
101
+ __all__ = [
102
+ # Core orchestration
103
+ "Attack",
104
+ "AttackResult",
105
+ "StreamingAttackResult",
106
+ "StreamingTokens",
107
+ "TokenWindow",
108
+ "Tokenizer",
109
+ "clear_tokenizer_cache",
110
+ "get_tokenizer_cache_info",
111
+ "list_available_tokenizers",
112
+ # Metrics
113
+ "MetricName",
114
+ "jensen_shannon_divergence",
115
+ "normalized_edit_distance",
116
+ "subsequence_retention",
117
+ "entropy_delta",
118
+ "merge_split_index",
119
+ # Analysis tools (impure orchestrators)
120
+ "SeedSweep",
121
+ "SeedSweepResult",
122
+ "GridSearch",
123
+ "GridSearchResult",
124
+ "GridSearchPoint",
125
+ "TokenizerComparison",
126
+ "TokenizerComparisonResult",
127
+ "TokenizerComparisonEntry",
128
+ # Comparison functions
129
+ "compare_glitchlings",
130
+ "compare_tokenizers",
131
+ "GlitchlingComparisonEntry",
132
+ "GlitchlingComparisonResult",
133
+ # Analysis pure helpers
134
+ "compute_aggregate_stats",
135
+ "format_stats_summary",
136
+ "extract_scalar_metrics",
137
+ "generate_param_combinations",
138
+ "rank_grid_points",
139
+ # Core planning (pure)
140
+ "AttackPlan",
141
+ "BatchAdapter",
142
+ "ResultPlan",
143
+ "EncodedData",
144
+ "plan_attack",
145
+ "plan_result",
146
+ "is_string_batch",
147
+ "is_transcript_like",
148
+ "assemble_result_fields",
149
+ "assemble_single_result_fields",
150
+ "assemble_batch_result_fields",
151
+ "assemble_empty_result_fields",
152
+ "compute_token_counts",
153
+ "extract_transcript_contents",
154
+ "format_token_count_delta",
155
+ # Core execution (impure)
156
+ "get_default_metrics",
157
+ "resolve_glitchlings",
158
+ "execute_corruption",
159
+ "execute_tokenization",
160
+ "execute_metrics",
161
+ "execute_attack",
162
+ # Encode (pure)
163
+ "describe_tokenizer",
164
+ "encode_batch",
165
+ "encode_single",
166
+ # Metrics dispatch (pure)
167
+ "TokenBatch",
168
+ "TokenSequence",
169
+ "is_batch",
170
+ "validate_batch_consistency",
171
+ # Tokenizer metrics (pure)
172
+ "compression_ratio",
173
+ "batch_compression_ratio",
174
+ "characters_per_token",
175
+ "batch_characters_per_token",
176
+ "token_entropy",
177
+ "batch_token_entropy",
178
+ "vocabulary_utilization",
179
+ "batch_vocabulary_utilization",
180
+ "unknown_token_rate",
181
+ "batch_unknown_token_rate",
182
+ "analyze_tokenizer",
183
+ "DEFAULT_UNKNOWN_MARKERS",
184
+ ]