glitchlings 0.2.5__cp312-cp312-win_amd64.whl → 0.9.3__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. glitchlings/__init__.py +36 -17
  2. glitchlings/__main__.py +0 -1
  3. glitchlings/_zoo_rust/__init__.py +12 -0
  4. glitchlings/_zoo_rust.cp312-win_amd64.pyd +0 -0
  5. glitchlings/assets/__init__.py +180 -0
  6. glitchlings/assets/apostrofae_pairs.json +32 -0
  7. glitchlings/assets/ekkokin_homophones.json +2014 -0
  8. glitchlings/assets/hokey_assets.json +193 -0
  9. glitchlings/assets/lexemes/academic.json +1049 -0
  10. glitchlings/assets/lexemes/colors.json +1333 -0
  11. glitchlings/assets/lexemes/corporate.json +716 -0
  12. glitchlings/assets/lexemes/cyberpunk.json +22 -0
  13. glitchlings/assets/lexemes/lovecraftian.json +23 -0
  14. glitchlings/assets/lexemes/synonyms.json +3354 -0
  15. glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
  16. glitchlings/assets/pipeline_assets.json +29 -0
  17. glitchlings/attack/__init__.py +53 -0
  18. glitchlings/attack/compose.py +299 -0
  19. glitchlings/attack/core.py +465 -0
  20. glitchlings/attack/encode.py +114 -0
  21. glitchlings/attack/metrics.py +104 -0
  22. glitchlings/attack/metrics_dispatch.py +70 -0
  23. glitchlings/attack/tokenization.py +157 -0
  24. glitchlings/auggie.py +283 -0
  25. glitchlings/compat/__init__.py +9 -0
  26. glitchlings/compat/loaders.py +355 -0
  27. glitchlings/compat/types.py +41 -0
  28. glitchlings/conf/__init__.py +41 -0
  29. glitchlings/conf/loaders.py +331 -0
  30. glitchlings/conf/schema.py +156 -0
  31. glitchlings/conf/types.py +72 -0
  32. glitchlings/config.toml +2 -0
  33. glitchlings/constants.py +59 -0
  34. glitchlings/dev/__init__.py +3 -0
  35. glitchlings/dev/docs.py +45 -0
  36. glitchlings/dlc/__init__.py +17 -3
  37. glitchlings/dlc/_shared.py +296 -0
  38. glitchlings/dlc/gutenberg.py +400 -0
  39. glitchlings/dlc/huggingface.py +37 -65
  40. glitchlings/dlc/prime.py +55 -114
  41. glitchlings/dlc/pytorch.py +98 -0
  42. glitchlings/dlc/pytorch_lightning.py +173 -0
  43. glitchlings/internal/__init__.py +16 -0
  44. glitchlings/internal/rust.py +159 -0
  45. glitchlings/internal/rust_ffi.py +432 -0
  46. glitchlings/main.py +123 -32
  47. glitchlings/runtime_config.py +24 -0
  48. glitchlings/util/__init__.py +29 -176
  49. glitchlings/util/adapters.py +65 -0
  50. glitchlings/util/keyboards.py +311 -0
  51. glitchlings/util/transcripts.py +108 -0
  52. glitchlings/zoo/__init__.py +47 -24
  53. glitchlings/zoo/assets/__init__.py +29 -0
  54. glitchlings/zoo/core.py +301 -167
  55. glitchlings/zoo/core_execution.py +98 -0
  56. glitchlings/zoo/core_planning.py +451 -0
  57. glitchlings/zoo/corrupt_dispatch.py +295 -0
  58. glitchlings/zoo/ekkokin.py +118 -0
  59. glitchlings/zoo/hokey.py +137 -0
  60. glitchlings/zoo/jargoyle.py +179 -274
  61. glitchlings/zoo/mim1c.py +106 -68
  62. glitchlings/zoo/pedant/__init__.py +107 -0
  63. glitchlings/zoo/pedant/core.py +105 -0
  64. glitchlings/zoo/pedant/forms.py +74 -0
  65. glitchlings/zoo/pedant/stones.py +74 -0
  66. glitchlings/zoo/redactyl.py +44 -175
  67. glitchlings/zoo/rng.py +259 -0
  68. glitchlings/zoo/rushmore.py +359 -116
  69. glitchlings/zoo/scannequin.py +18 -125
  70. glitchlings/zoo/transforms.py +386 -0
  71. glitchlings/zoo/typogre.py +76 -162
  72. glitchlings/zoo/validation.py +477 -0
  73. glitchlings/zoo/zeedub.py +33 -86
  74. glitchlings-0.9.3.dist-info/METADATA +334 -0
  75. glitchlings-0.9.3.dist-info/RECORD +80 -0
  76. {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/entry_points.txt +1 -0
  77. glitchlings/zoo/_ocr_confusions.py +0 -34
  78. glitchlings/zoo/_rate.py +0 -21
  79. glitchlings/zoo/reduple.py +0 -169
  80. glitchlings-0.2.5.dist-info/METADATA +0 -490
  81. glitchlings-0.2.5.dist-info/RECORD +0 -27
  82. /glitchlings/{zoo → assets}/ocr_confusions.tsv +0 -0
  83. {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/WHEEL +0 -0
  84. {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/licenses/LICENSE +0 -0
  85. {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,29 @@
1
+ {
2
+ "pipeline_assets": [
3
+ {
4
+ "name": "apostrofae_pairs.json",
5
+ "kind": "copy"
6
+ },
7
+ {
8
+ "name": "ekkokin_homophones.json",
9
+ "kind": "copy"
10
+ },
11
+ {
12
+ "name": "hokey_assets.json",
13
+ "kind": "copy"
14
+ },
15
+ {
16
+ "name": "lexemes",
17
+ "kind": "copy"
18
+ },
19
+ {
20
+ "name": "ocr_confusions.tsv",
21
+ "kind": "copy"
22
+ },
23
+ {
24
+ "name": "mim1c_homoglyphs.json.gz.b64",
25
+ "kind": "compressed",
26
+ "output": "mim1c_homoglyphs.json"
27
+ }
28
+ ]
29
+ }
@@ -0,0 +1,53 @@
1
+ """Attack submodule for comparing text before and after corruption."""
2
+
3
+ from .compose import (
4
+ AttackResultComponents,
5
+ EncodedPayload,
6
+ build_batch_result,
7
+ build_empty_metrics,
8
+ build_empty_result,
9
+ build_single_result,
10
+ extract_transcript_contents,
11
+ format_metrics_for_batch,
12
+ format_metrics_for_single,
13
+ )
14
+ from .core import Attack, AttackResult, MultiAttackResult
15
+ from .encode import describe_tokenizer, encode_batch, encode_single
16
+ from .metrics import (
17
+ jensen_shannon_divergence,
18
+ normalized_edit_distance,
19
+ subsequence_retention,
20
+ )
21
+ from .metrics_dispatch import TokenBatch, TokenSequence, is_batch, validate_batch_consistency
22
+ from .tokenization import Tokenizer
23
+
24
# Public surface of the attack package. Names are grouped by the submodule
# they are imported from above.
__all__ = [
    # .core and .tokenization: orchestrator, result types, tokenizer
    "Attack",
    "AttackResult",
    "MultiAttackResult",
    "Tokenizer",
    # .metrics: comparison metrics
    "jensen_shannon_divergence",
    "normalized_edit_distance",
    "subsequence_retention",
    # .compose: result-assembly helpers
    "AttackResultComponents",
    "EncodedPayload",
    "build_batch_result",
    "build_empty_metrics",
    "build_empty_result",
    "build_single_result",
    "extract_transcript_contents",
    "format_metrics_for_batch",
    "format_metrics_for_single",
    # .encode: tokenizer encoding helpers
    "describe_tokenizer",
    "encode_batch",
    "encode_single",
    # .metrics_dispatch: batch/sequence type helpers
    "TokenBatch",
    "TokenSequence",
    "is_batch",
    "validate_batch_consistency",
]
@@ -0,0 +1,299 @@
1
+ """Pure result assembly functions for Attack.
2
+
3
+ This module contains pure functions for composing AttackResult instances
4
+ from pre-computed components. Functions here are:
5
+
6
+ - **Pure**: Output depends only on inputs, no side effects
7
+ - **Deterministic**: Same inputs always produce same outputs
8
+ - **Self-contained**: No IO, no Rust FFI, no config loading
9
+
10
+ Design Philosophy
11
+ -----------------
12
+ This module implements the innermost layer of Attack composition:
13
+
14
+ Attack.run() → tokenize → corrupt → compose.py → AttackResult
15
+ (orchestrator) (impure) (impure) (pure) (value)
16
+
17
+ Functions receive already-computed tokens, IDs, and metrics. Apart from
18
+ extract_transcript_contents, which type-checks each turn, they trust their inputs.
19
+ Boundary validation happens in Attack.__init__ and before calling these functions.
20
+
21
+ See AGENTS.md "Functional Purity Architecture" for full details.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ from collections.abc import Mapping, Sequence
27
+ from dataclasses import dataclass
28
+ from typing import TYPE_CHECKING
29
+
30
+ if TYPE_CHECKING: # pragma: no cover - typing only
31
+ from ..util.transcripts import Transcript
32
+
33
+
34
+ # ---------------------------------------------------------------------------
35
+ # Data Types
36
+ # ---------------------------------------------------------------------------
37
+
38
+
39
+ @dataclass(frozen=True, slots=True)
40
+ class EncodedPayload:
41
+ """Encoded representation of text or transcript for metric computation.
42
+
43
+ Attributes:
44
+ tokens: Token strings from tokenizer (flat or batched).
45
+ token_ids: Token IDs from tokenizer (flat or batched).
46
+ is_batched: True if this represents a transcript (batch of texts).
47
+ """
48
+
49
+ tokens: list[str] | list[list[str]]
50
+ token_ids: list[int] | list[list[int]]
51
+ is_batched: bool
52
+
53
+
54
+ @dataclass(frozen=True, slots=True)
55
+ class AttackResultComponents:
56
+ """Intermediate structure holding all components needed for AttackResult.
57
+
58
+ This is a pure value type that aggregates pre-computed components
59
+ before final assembly into AttackResult.
60
+ """
61
+
62
+ original: "str | Transcript"
63
+ corrupted: "str | Transcript"
64
+ input_encoded: EncodedPayload
65
+ output_encoded: EncodedPayload
66
+ tokenizer_info: str
67
+ metrics: dict[str, float | list[float]]
68
+
69
+
70
+ # ---------------------------------------------------------------------------
71
+ # Transcript Content Extraction
72
+ # ---------------------------------------------------------------------------
73
+
74
+
75
def extract_transcript_contents(transcript: "Transcript") -> list[str]:
    """Collect the ``content`` string of every turn, preserving turn order.

    Each turn is type-checked as it is visited; the first offending turn
    aborts the extraction with a ``TypeError``.

    Args:
        transcript: Iterable of turn mappings, each expected to hold a
            string under the ``"content"`` key.

    Returns:
        The content strings, one per turn, in the order the turns appear.

    Raises:
        TypeError: If a turn is not a mapping, or if its ``"content"`` value
            is missing or is not a string. The message reports the 1-based
            position of the offending turn.
    """

    def _content_of(index: int, turn: object) -> str:
        # ``index`` is zero-based; messages add one so turns count from 1.
        if not isinstance(turn, Mapping):
            raise TypeError(f"Transcript turn #{index + 1} must be a mapping.")
        content = turn.get("content")
        if isinstance(content, str):
            return content
        raise TypeError(f"Transcript turn #{index + 1} is missing string content.")

    return [
        _content_of(index, turn)
        for index, turn in enumerate(transcript)
    ]
100
+
101
+
102
+ # ---------------------------------------------------------------------------
103
+ # Metric Formatting
104
+ # ---------------------------------------------------------------------------
105
+
106
+
107
+ def format_metrics_for_single(
108
+ metrics: dict[str, float | list[float]],
109
+ ) -> dict[str, float]:
110
+ """Collapse batch metrics to single values for non-transcript results.
111
+
112
+ When Attack processes a single string (not a transcript), metrics should
113
+ be scalar floats. This function extracts the first element from any
114
+ list-valued metrics.
115
+
116
+ Args:
117
+ metrics: Dictionary of metric names to values (float or list[float]).
118
+
119
+ Returns:
120
+ Dictionary with all values as floats.
121
+ """
122
+ result: dict[str, float] = {}
123
+ for name, value in metrics.items():
124
+ if isinstance(value, list):
125
+ result[name] = value[0] if value else 0.0
126
+ else:
127
+ result[name] = value
128
+ return result
129
+
130
+
131
+ def format_metrics_for_batch(
132
+ metrics: dict[str, float | list[float]],
133
+ ) -> dict[str, list[float]]:
134
+ """Normalize metrics to list format for transcript results.
135
+
136
+ When Attack processes a transcript (batch), metrics should be lists.
137
+ This function wraps any scalar floats in single-element lists.
138
+
139
+ Args:
140
+ metrics: Dictionary of metric names to values (float or list[float]).
141
+
142
+ Returns:
143
+ Dictionary with all values as lists of floats.
144
+ """
145
+ result: dict[str, list[float]] = {}
146
+ for name, value in metrics.items():
147
+ if isinstance(value, list):
148
+ result[name] = list(value)
149
+ else:
150
+ result[name] = [value]
151
+ return result
152
+
153
+
154
+ # ---------------------------------------------------------------------------
155
+ # Empty Result Construction
156
+ # ---------------------------------------------------------------------------
157
+
158
+
159
def build_empty_metrics(metric_names: list[str]) -> dict[str, list[float]]:
    """Build the metrics mapping used when the transcript input is empty.

    Args:
        metric_names: Metric names to use as keys.

    Returns:
        Dictionary in which every given name maps to its own empty list.
    """
    empty: dict[str, list[float]] = {}
    for metric_name in metric_names:
        # A fresh list per key, so mutating one entry never affects another.
        empty[metric_name] = []
    return empty
169
+
170
+
171
+ # ---------------------------------------------------------------------------
172
+ # Result Assembly
173
+ # ---------------------------------------------------------------------------
174
+
175
+
176
def build_single_result(
    original: str,
    corrupted: str,
    input_tokens: list[str],
    input_token_ids: list[int],
    output_tokens: list[str],
    output_token_ids: list[int],
    tokenizer_info: str,
    metrics: dict[str, float | list[float]],
) -> dict[str, object]:
    """Build the AttackResult field mapping for a single-string input.

    All arguments are stored unchanged except ``metrics``, which is first
    passed through ``format_metrics_for_single``.

    Args:
        original: Input string before corruption.
        corrupted: Output string after corruption.
        input_tokens: Tokens of the original string.
        input_token_ids: Token IDs of the original string.
        output_tokens: Tokens of the corrupted string.
        output_token_ids: Token IDs of the corrupted string.
        tokenizer_info: Description of the tokenizer used.
        metrics: Computed metrics; list values are collapsed to scalars.

    Returns:
        Dictionary keyed by AttackResult field name.
    """
    scalar_metrics = format_metrics_for_single(metrics)
    fields: dict[str, object] = {
        "original": original,
        "corrupted": corrupted,
        "input_tokens": input_tokens,
        "output_tokens": output_tokens,
        "input_token_ids": input_token_ids,
        "output_token_ids": output_token_ids,
        "tokenizer_info": tokenizer_info,
        "metrics": scalar_metrics,
    }
    return fields
214
+
215
+
216
+ def build_batch_result(
217
+ original: "Transcript | Sequence[str]",
218
+ corrupted: "Transcript | Sequence[str]",
219
+ input_tokens: list[list[str]],
220
+ input_token_ids: list[list[int]],
221
+ output_tokens: list[list[str]],
222
+ output_token_ids: list[list[int]],
223
+ tokenizer_info: str,
224
+ metrics: dict[str, float | list[float]],
225
+ ) -> dict[str, object]:
226
+ """Assemble AttackResult field dictionary for batched input.
227
+
228
+ This is a pure function that takes all pre-computed components and
229
+ returns a dictionary suitable for constructing an AttackResult.
230
+
231
+ Args:
232
+ original: Original transcript or list of strings.
233
+ corrupted: Corrupted transcript or list of strings.
234
+ input_tokens: Batched tokenized inputs.
235
+ input_token_ids: Batched token IDs for inputs.
236
+ output_tokens: Batched tokenized outputs.
237
+ output_token_ids: Batched token IDs for outputs.
238
+ tokenizer_info: Description of the tokenizer used.
239
+ metrics: Computed metrics (already in batch format).
240
+
241
+ Returns:
242
+ Dictionary with all AttackResult field values.
243
+ """
244
+ return {
245
+ "original": original,
246
+ "corrupted": corrupted,
247
+ "input_tokens": input_tokens,
248
+ "output_tokens": output_tokens,
249
+ "input_token_ids": input_token_ids,
250
+ "output_token_ids": output_token_ids,
251
+ "tokenizer_info": tokenizer_info,
252
+ "metrics": metrics,
253
+ }
254
+
255
+
256
def build_empty_result(
    original: "Transcript | Sequence[str]",
    corrupted: "Transcript | Sequence[str]",
    tokenizer_info: str,
    metric_names: list[str],
) -> dict[str, object]:
    """Build the AttackResult field mapping for an empty batch input.

    Token and token-ID fields are new empty lists, and the metrics come
    from ``build_empty_metrics``.

    Args:
        original: The empty transcript or list that was supplied.
        corrupted: The empty transcript or list produced for it.
        tokenizer_info: Description of the tokenizer used.
        metric_names: Names of the metrics to report as empty lists.

    Returns:
        Dictionary keyed by AttackResult field name.
    """
    fields: dict[str, object] = {
        "original": original,
        "corrupted": corrupted,
        "input_tokens": [],
        "output_tokens": [],
        "input_token_ids": [],
        "output_token_ids": [],
        "tokenizer_info": tokenizer_info,
        "metrics": build_empty_metrics(metric_names),
    }
    return fields
283
+
284
+
285
# Names exported by this module, grouped by role.
__all__ = [
    # Value types
    "AttackResultComponents",
    "EncodedPayload",
    # Transcript content extraction
    "extract_transcript_contents",
    # Metric construction and formatting
    "build_empty_metrics",
    "format_metrics_for_batch",
    "format_metrics_for_single",
    # AttackResult field-dictionary builders
    "build_batch_result",
    "build_empty_result",
    "build_single_result",
]