glitchlings 0.10.2__cp312-cp312-macosx_11_0_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of glitchlings might be problematic. Click here for more details.
- glitchlings/__init__.py +99 -0
- glitchlings/__main__.py +8 -0
- glitchlings/_zoo_rust/__init__.py +12 -0
- glitchlings/_zoo_rust.cpython-312-darwin.so +0 -0
- glitchlings/assets/__init__.py +180 -0
- glitchlings/assets/apostrofae_pairs.json +32 -0
- glitchlings/assets/ekkokin_homophones.json +2014 -0
- glitchlings/assets/hokey_assets.json +193 -0
- glitchlings/assets/lexemes/academic.json +1049 -0
- glitchlings/assets/lexemes/colors.json +1333 -0
- glitchlings/assets/lexemes/corporate.json +716 -0
- glitchlings/assets/lexemes/cyberpunk.json +22 -0
- glitchlings/assets/lexemes/lovecraftian.json +23 -0
- glitchlings/assets/lexemes/synonyms.json +3354 -0
- glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
- glitchlings/assets/ocr_confusions.tsv +30 -0
- glitchlings/assets/pipeline_assets.json +29 -0
- glitchlings/attack/__init__.py +147 -0
- glitchlings/attack/analysis.py +1321 -0
- glitchlings/attack/core.py +493 -0
- glitchlings/attack/core_execution.py +367 -0
- glitchlings/attack/core_planning.py +612 -0
- glitchlings/attack/encode.py +114 -0
- glitchlings/attack/metrics.py +218 -0
- glitchlings/attack/metrics_dispatch.py +70 -0
- glitchlings/attack/tokenization.py +227 -0
- glitchlings/auggie.py +284 -0
- glitchlings/compat/__init__.py +9 -0
- glitchlings/compat/loaders.py +355 -0
- glitchlings/compat/types.py +41 -0
- glitchlings/conf/__init__.py +41 -0
- glitchlings/conf/loaders.py +331 -0
- glitchlings/conf/schema.py +156 -0
- glitchlings/conf/types.py +72 -0
- glitchlings/config.toml +2 -0
- glitchlings/constants.py +59 -0
- glitchlings/dev/__init__.py +3 -0
- glitchlings/dev/docs.py +45 -0
- glitchlings/dlc/__init__.py +19 -0
- glitchlings/dlc/_shared.py +296 -0
- glitchlings/dlc/gutenberg.py +400 -0
- glitchlings/dlc/huggingface.py +68 -0
- glitchlings/dlc/prime.py +215 -0
- glitchlings/dlc/pytorch.py +98 -0
- glitchlings/dlc/pytorch_lightning.py +173 -0
- glitchlings/internal/__init__.py +16 -0
- glitchlings/internal/rust.py +159 -0
- glitchlings/internal/rust_ffi.py +490 -0
- glitchlings/main.py +426 -0
- glitchlings/protocols.py +91 -0
- glitchlings/runtime_config.py +24 -0
- glitchlings/util/__init__.py +27 -0
- glitchlings/util/adapters.py +65 -0
- glitchlings/util/keyboards.py +356 -0
- glitchlings/util/transcripts.py +108 -0
- glitchlings/zoo/__init__.py +161 -0
- glitchlings/zoo/assets/__init__.py +29 -0
- glitchlings/zoo/core.py +678 -0
- glitchlings/zoo/core_execution.py +154 -0
- glitchlings/zoo/core_planning.py +451 -0
- glitchlings/zoo/corrupt_dispatch.py +295 -0
- glitchlings/zoo/hokey.py +139 -0
- glitchlings/zoo/jargoyle.py +243 -0
- glitchlings/zoo/mim1c.py +148 -0
- glitchlings/zoo/pedant/__init__.py +109 -0
- glitchlings/zoo/pedant/core.py +105 -0
- glitchlings/zoo/pedant/forms.py +74 -0
- glitchlings/zoo/pedant/stones.py +74 -0
- glitchlings/zoo/redactyl.py +97 -0
- glitchlings/zoo/rng.py +259 -0
- glitchlings/zoo/rushmore.py +416 -0
- glitchlings/zoo/scannequin.py +66 -0
- glitchlings/zoo/transforms.py +346 -0
- glitchlings/zoo/typogre.py +128 -0
- glitchlings/zoo/validation.py +477 -0
- glitchlings/zoo/wherewolf.py +120 -0
- glitchlings/zoo/zeedub.py +93 -0
- glitchlings-0.10.2.dist-info/METADATA +337 -0
- glitchlings-0.10.2.dist-info/RECORD +83 -0
- glitchlings-0.10.2.dist-info/WHEEL +5 -0
- glitchlings-0.10.2.dist-info/entry_points.txt +3 -0
- glitchlings-0.10.2.dist-info/licenses/LICENSE +201 -0
- glitchlings-0.10.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
"""Pure dispatch logic for Glitchling corruption operations.
|
|
2
|
+
|
|
3
|
+
This module contains the deterministic, side-effect-free logic for building
|
|
4
|
+
corruption plans. It separates the "what to corrupt" decision from the
|
|
5
|
+
"how to corrupt" execution.
|
|
6
|
+
|
|
7
|
+
**Design Philosophy:**
|
|
8
|
+
|
|
9
|
+
All functions in this module are *pure* - they perform dispatch analysis
|
|
10
|
+
based solely on their inputs, without side effects. They do not:
|
|
11
|
+
- Invoke corruption functions
|
|
12
|
+
- Modify state
|
|
13
|
+
- Perform I/O
|
|
14
|
+
|
|
15
|
+
The separation allows:
|
|
16
|
+
- Corruption dispatch to be tested without actual corruption
|
|
17
|
+
- Clear boundaries between planning and execution
|
|
18
|
+
- Reasoning about what will be corrupted before execution
|
|
19
|
+
|
|
20
|
+
See AGENTS.md "Functional Purity Architecture" for full details.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
from dataclasses import dataclass
|
|
26
|
+
from typing import Any, Literal
|
|
27
|
+
|
|
28
|
+
from ..util.transcripts import (
|
|
29
|
+
Transcript,
|
|
30
|
+
TranscriptTarget,
|
|
31
|
+
TranscriptTurn,
|
|
32
|
+
is_transcript,
|
|
33
|
+
resolve_transcript_indices,
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
# ---------------------------------------------------------------------------
|
|
37
|
+
# Type Definitions
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass(slots=True, frozen=True)
|
|
42
|
+
class StringCorruptionTarget:
|
|
43
|
+
"""Target specification for corrupting a plain string.
|
|
44
|
+
|
|
45
|
+
Attributes:
|
|
46
|
+
text: The string to corrupt.
|
|
47
|
+
"""
|
|
48
|
+
|
|
49
|
+
text: str
|
|
50
|
+
kind: Literal["string"] = "string"
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass(slots=True, frozen=True)
|
|
54
|
+
class TranscriptTurnTarget:
|
|
55
|
+
"""Target specification for a single turn within a transcript.
|
|
56
|
+
|
|
57
|
+
Attributes:
|
|
58
|
+
index: Position of the turn in the transcript.
|
|
59
|
+
content: The text content to corrupt.
|
|
60
|
+
"""
|
|
61
|
+
|
|
62
|
+
index: int
|
|
63
|
+
content: str
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@dataclass(frozen=True, slots=True)
class TranscriptCorruptionTarget:
    """Immutable description of a transcript corruption job.

    Attributes:
        turns: The selected turns, each carrying its index and content.
        original_transcript: The untouched input transcript, kept so the
            final result can be assembled from it.
        kind: Discriminator tag, always ``"transcript"``.
    """

    turns: tuple[TranscriptTurnTarget, ...]
    original_transcript: Transcript
    kind: Literal["transcript"] = "transcript"
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# Union type for corruption targets
|
|
81
|
+
CorruptionTarget = StringCorruptionTarget | TranscriptCorruptionTarget
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
# ---------------------------------------------------------------------------
|
|
85
|
+
# Dispatch Functions
|
|
86
|
+
# ---------------------------------------------------------------------------
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def resolve_corruption_target(
    text: str | Transcript,
    transcript_target: TranscriptTarget,
) -> CorruptionTarget:
    """Classify the input and describe what should be corrupted.

    No corruption is performed here; the function only inspects its
    arguments and builds a target specification.

    Args:
        text: Input text or transcript to analyze.
        transcript_target: Specification for which transcript turns to target.

    Returns:
        A ``StringCorruptionTarget`` for plain strings (and for any
        non-transcript value, after casting it with ``str``), or a
        ``TranscriptCorruptionTarget`` listing every selected turn whose
        ``"content"`` value is a string.

    Note:
        Values that are neither strings nor transcripts (for example lists
        of strings) are stringified rather than rejected. This keeps the
        backwards-compatible behaviour relied on by dataset column
        transformations.
    """
    if isinstance(text, str):
        return StringCorruptionTarget(text=text)

    if not is_transcript(text):
        # Backwards compatibility: anything else is cast to str and treated
        # as a single string target.
        return StringCorruptionTarget(text=str(text))

    # Only turns whose "content" is a string become targets; other selected
    # turns are skipped.
    selected = tuple(
        TranscriptTurnTarget(index=position, content=body)
        for position in resolve_transcript_indices(text, transcript_target)
        if isinstance(body := text[position].get("content"), str)
    )
    return TranscriptCorruptionTarget(turns=selected, original_transcript=text)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def count_corruption_targets(target: CorruptionTarget) -> int:
    """Return the number of text segments the target will have processed.

    Args:
        target: The corruption target specification.

    Returns:
        ``1`` for a string target, otherwise the number of turn targets.
    """
    return 1 if isinstance(target, StringCorruptionTarget) else len(target.turns)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def extract_texts_to_corrupt(target: CorruptionTarget) -> list[str]:
    """Collect every string the target asks to have corrupted.

    Handy for batch processing or for inspecting a plan before running it.

    Args:
        target: The corruption target specification.

    Returns:
        A one-element list holding the text of a string target, or the
        content of each turn target in order.
    """
    if not isinstance(target, StringCorruptionTarget):
        return [turn.content for turn in target.turns]
    return [target.text]
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
# ---------------------------------------------------------------------------
|
|
171
|
+
# Result Assembly Functions
|
|
172
|
+
# ---------------------------------------------------------------------------
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def assemble_string_result(
    _target: StringCorruptionTarget,
    corrupted: str,
) -> str:
    """Return the final result for a string corruption.

    Args:
        _target: The original target. It is not read; the parameter exists
            so the signature mirrors the transcript assembler.
        corrupted: The corrupted text.

    Returns:
        ``corrupted`` unchanged.
    """
    return corrupted
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def assemble_transcript_result(
    target: TranscriptCorruptionTarget,
    corrupted_contents: dict[int, str],
) -> Transcript:
    """Build a new transcript with the corrupted turn contents applied.

    Args:
        target: The original target specification.
        corrupted_contents: Mapping of turn indices to corrupted content.

    Returns:
        A new list of turn dicts. Each turn is a fresh ``dict`` copy of the
        corresponding original turn, with ``"content"`` replaced for every
        index present in ``corrupted_contents``. Indices outside
        ``0 <= idx < len(transcript)`` are ignored.
    """
    # Copy each turn into a new dict so the original turn dicts are not
    # mutated (values inside a turn are shared, not copied).
    rebuilt: list[TranscriptTurn] = [dict(turn) for turn in target.original_transcript]
    turn_count = len(rebuilt)

    for position, new_content in corrupted_contents.items():
        if position < 0 or position >= turn_count:
            continue
        rebuilt[position]["content"] = new_content

    return rebuilt
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def assemble_corruption_result(
    target: CorruptionTarget,
    corrupted: str | dict[int, str],
) -> str | Transcript:
    """Dispatch to the assembler that matches the target type.

    Args:
        target: The original corruption target.
        corrupted: A single corrupted string for a ``StringCorruptionTarget``,
            or a mapping of turn indices to corrupted content for a
            ``TranscriptCorruptionTarget``.

    Returns:
        The assembled result: a string for string targets, a transcript for
        transcript targets.

    Raises:
        TypeError: If ``corrupted`` does not match the target type, or if
            ``target`` is neither of the two known target classes.
    """
    if isinstance(target, StringCorruptionTarget):
        if isinstance(corrupted, str):
            return assemble_string_result(target, corrupted)
        raise TypeError("String target requires corrupted string result")

    if isinstance(target, TranscriptCorruptionTarget):
        if isinstance(corrupted, dict):
            return assemble_transcript_result(target, corrupted)
        raise TypeError("Transcript target requires corrupted content mapping")

    # Not expected for well-typed callers; fail loudly rather than return None.
    raise TypeError(f"Unknown target type: {type(target).__name__}")
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
# ---------------------------------------------------------------------------
|
|
255
|
+
# Validation Helpers
|
|
256
|
+
# ---------------------------------------------------------------------------
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def validate_text_input(text: Any) -> str | Transcript:
    """Check that ``text`` is a string or a transcript and hand it back.

    Args:
        text: Input to validate.

    Returns:
        The same object that was passed in.

    Raises:
        TypeError: If ``text`` is neither a ``str`` nor accepted by
            ``is_transcript``.
    """
    if isinstance(text, str) or is_transcript(text):
        return text
    raise TypeError(f"Expected string or transcript, got {type(text).__name__}")
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
__all__ = [
|
|
280
|
+
# Target types
|
|
281
|
+
"StringCorruptionTarget",
|
|
282
|
+
"TranscriptTurnTarget",
|
|
283
|
+
"TranscriptCorruptionTarget",
|
|
284
|
+
"CorruptionTarget",
|
|
285
|
+
# Dispatch functions
|
|
286
|
+
"resolve_corruption_target",
|
|
287
|
+
"count_corruption_targets",
|
|
288
|
+
"extract_texts_to_corrupt",
|
|
289
|
+
# Result assembly
|
|
290
|
+
"assemble_string_result",
|
|
291
|
+
"assemble_transcript_result",
|
|
292
|
+
"assemble_corruption_result",
|
|
293
|
+
# Validation
|
|
294
|
+
"validate_text_input",
|
|
295
|
+
]
|
glitchlings/zoo/hokey.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""Hokey glitchling that performs expressive lengthening."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import random
|
|
6
|
+
from typing import Any, cast
|
|
7
|
+
|
|
8
|
+
from glitchlings.internal.rust_ffi import hokey_rust, resolve_seed
|
|
9
|
+
|
|
10
|
+
from .core import AttackOrder, AttackWave, Gaggle, PipelineOperationPayload
|
|
11
|
+
from .core import Glitchling as GlitchlingBase
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def extend_vowels(
|
|
15
|
+
text: str,
|
|
16
|
+
rate: float = 0.3,
|
|
17
|
+
extension_min: int = 2,
|
|
18
|
+
extension_max: int = 5,
|
|
19
|
+
word_length_threshold: int = 6,
|
|
20
|
+
seed: int | None = None,
|
|
21
|
+
rng: random.Random | None = None,
|
|
22
|
+
base_p: float | None = None,
|
|
23
|
+
) -> str:
|
|
24
|
+
"""Extend expressive segments of words for emphasis.
|
|
25
|
+
|
|
26
|
+
Parameters
|
|
27
|
+
----------
|
|
28
|
+
text : str
|
|
29
|
+
Input text to transform.
|
|
30
|
+
rate : float, optional
|
|
31
|
+
Global selection rate for candidate words.
|
|
32
|
+
extension_min : int, optional
|
|
33
|
+
Minimum number of extra repetitions for the stretch unit.
|
|
34
|
+
extension_max : int, optional
|
|
35
|
+
Maximum number of extra repetitions for the stretch unit.
|
|
36
|
+
word_length_threshold : int, optional
|
|
37
|
+
Preferred maximum alphabetic length; longer words are de-emphasised but not
|
|
38
|
+
excluded.
|
|
39
|
+
seed : int, optional
|
|
40
|
+
Deterministic seed when ``rng`` is not supplied.
|
|
41
|
+
rng : random.Random, optional
|
|
42
|
+
Random number generator to drive sampling.
|
|
43
|
+
base_p : float, optional
|
|
44
|
+
Base probability for the negative-binomial sampler (heavier tails for smaller
|
|
45
|
+
values). Defaults to ``0.45``.
|
|
46
|
+
"""
|
|
47
|
+
if not text:
|
|
48
|
+
return text
|
|
49
|
+
|
|
50
|
+
base_probability = base_p if base_p is not None else 0.45
|
|
51
|
+
|
|
52
|
+
seed_value = resolve_seed(seed, rng)
|
|
53
|
+
return hokey_rust(
|
|
54
|
+
text,
|
|
55
|
+
rate,
|
|
56
|
+
extension_min,
|
|
57
|
+
extension_max,
|
|
58
|
+
word_length_threshold,
|
|
59
|
+
base_probability,
|
|
60
|
+
seed_value,
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class Hokey(GlitchlingBase):
    """Glitchling that stretches words using linguistic heuristics.

    The corruption itself is performed by ``extend_vowels``; this class wires
    that function and its parameters into the ``Glitchling`` base class.
    """

    flavor = "Sooooo excited to meet you! We reeeeeally missed you last week."

    seed: int | None

    def __init__(
        self,
        *,
        rate: float = 0.3,
        extension_min: int = 2,
        extension_max: int = 5,
        word_length_threshold: int = 6,
        base_p: float = 0.45,
        seed: int | None = None,
        **kwargs: Any,
    ) -> None:
        """Create a Hokey glitchling.

        Args:
            rate: Forwarded to the base constructor as ``rate``.
            extension_min: Forwarded as ``extension_min``.
            extension_max: Forwarded as ``extension_max``.
            word_length_threshold: Forwarded as ``word_length_threshold``.
            base_p: Forwarded as ``base_p``.
            seed: Remembered as the master seed and forwarded as ``seed``.
            **kwargs: Any extra keyword arguments, forwarded unchanged.
        """
        # Recorded before the base constructor runs.
        self._master_seed: int | None = seed

        # Thin closure around extend_vowels that forwards all keyword arguments.
        def _corruption_wrapper(text: str, **kwargs: Any) -> str:
            return extend_vowels(text, **kwargs)

        super().__init__(
            name="Hokey",
            corruption_function=_corruption_wrapper,
            scope=AttackWave.CHARACTER,
            order=AttackOrder.FIRST,
            seed=seed,
            rate=rate,
            extension_min=extension_min,
            extension_max=extension_max,
            word_length_threshold=word_length_threshold,
            base_p=base_p,
            **kwargs,
        )

    def pipeline_operation(self) -> PipelineOperationPayload:
        """Return the pipeline descriptor dict for this glitchling.

        Values are read from ``self.kwargs``; any entry that is ``None`` or
        missing falls back to the same default used by ``__init__``, and the
        rest are coerced to ``float``/``int``.
        """
        kwargs = self.kwargs
        rate = kwargs.get("rate")
        extension_min = kwargs.get("extension_min")
        extension_max = kwargs.get("extension_max")
        word_length_threshold = kwargs.get("word_length_threshold")
        base_p = kwargs.get("base_p")
        return cast(
            PipelineOperationPayload,
            {
                "type": "hokey",
                "rate": 0.3 if rate is None else float(rate),
                "extension_min": 2 if extension_min is None else int(extension_min),
                "extension_max": 5 if extension_max is None else int(extension_max),
                "word_length_threshold": 6
                if word_length_threshold is None
                else int(word_length_threshold),
                "base_p": 0.45 if base_p is None else float(base_p),
            },
        )

    def reset_rng(self, seed: int | None = None) -> None:
        """Reset the random state, optionally with a new master seed.

        Args:
            seed: New master seed. When ``None`` the call is simply delegated
                to the base class with ``None``.
        """
        if seed is not None:
            self._master_seed = seed
            super().reset_rng(seed)
            # If the base class left no seed in place, there is nothing to derive.
            if self.seed is None:
                return
            # Replace the seed with one derived from the master seed and this
            # glitchling's name, then rebuild the RNG and mirror the seed into
            # kwargs.
            derived = Gaggle.derive_seed(int(seed), self.name, 0)
            self.seed = int(derived)
            self.rng = random.Random(self.seed)
            self.kwargs["seed"] = self.seed
        else:
            super().reset_rng(None)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
hokey = Hokey()
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
__all__ = ["Hokey", "hokey", "extend_vowels"]
|
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
"""Jargoyle glitchling: Dictionary-based word drift.
|
|
2
|
+
|
|
3
|
+
Jargoyle swaps words with alternatives from bundled lexeme dictionaries.
|
|
4
|
+
Multiple dictionaries are supported:
|
|
5
|
+
- "colors": Color term swapping
|
|
6
|
+
- "synonyms": General synonym substitution
|
|
7
|
+
- "corporate": Business jargon alternatives
|
|
8
|
+
- "academic": Scholarly word substitutions
|
|
9
|
+
- "cyberpunk": Neon cyberpunk slang and gadgetry
|
|
10
|
+
- "lovecraftian": Cosmic horror terminology
|
|
11
|
+
You can also drop additional dictionaries into ``assets/lexemes`` to make
|
|
12
|
+
them available without modifying the code. The backend discovers any
|
|
13
|
+
``*.json`` file in that directory at runtime.
|
|
14
|
+
|
|
15
|
+
Two modes are available:
|
|
16
|
+
- "literal": First entry in each word's alternatives (deterministic mapping)
|
|
17
|
+
- "drift": Random selection from alternatives (probabilistic)
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import os
|
|
23
|
+
from importlib import resources
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
from typing import Any, Literal, cast
|
|
26
|
+
|
|
27
|
+
from glitchlings.constants import DEFAULT_JARGOYLE_RATE
|
|
28
|
+
from glitchlings.internal.rust_ffi import (
|
|
29
|
+
jargoyle_drift_rust,
|
|
30
|
+
list_lexeme_dictionaries_rust,
|
|
31
|
+
resolve_seed,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
from .core import AttackOrder, AttackWave, Glitchling, PipelineOperationPayload
|
|
35
|
+
|
|
36
|
+
_LEXEME_ENV_VAR = "GLITCHLINGS_LEXEME_DIR"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _configure_lexeme_directory() -> Path | None:
|
|
40
|
+
"""Expose the bundled lexeme directory to the Rust backend via an env var."""
|
|
41
|
+
|
|
42
|
+
try:
|
|
43
|
+
lexeme_root = resources.files("glitchlings.assets.lexemes")
|
|
44
|
+
except (ModuleNotFoundError, AttributeError):
|
|
45
|
+
return None
|
|
46
|
+
|
|
47
|
+
try:
|
|
48
|
+
with resources.as_file(lexeme_root) as resolved:
|
|
49
|
+
path = Path(resolved)
|
|
50
|
+
except FileNotFoundError:
|
|
51
|
+
return None
|
|
52
|
+
|
|
53
|
+
if not path.is_dir():
|
|
54
|
+
return None
|
|
55
|
+
|
|
56
|
+
os.environ.setdefault(_LEXEME_ENV_VAR, str(path))
|
|
57
|
+
return path
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
_configure_lexeme_directory()
|
|
61
|
+
|
|
62
|
+
DEFAULT_LEXEMES = "synonyms"
|
|
63
|
+
|
|
64
|
+
# Valid modes
|
|
65
|
+
JargoyleMode = Literal["literal", "drift"]
|
|
66
|
+
VALID_MODES = ("literal", "drift")
|
|
67
|
+
DEFAULT_MODE: JargoyleMode = "drift"
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _available_lexemes() -> list[str]:
    """Return the backend's dictionary names, lower-cased, de-duplicated and sorted."""
    unique_names = {name.lower() for name in list_lexeme_dictionaries_rust()}
    return sorted(unique_names)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _validate_lexemes(name: str) -> str:
    """Return the lower-cased dictionary name, or raise if it is unknown.

    Raises:
        ValueError: If the lower-cased name is not among the available
            dictionaries.
    """
    candidate = name.lower()
    known = _available_lexemes()
    if candidate in known:
        return candidate
    raise ValueError(f"Invalid lexemes '{name}'. Must be one of: {', '.join(known)}")
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _validate_mode(mode: JargoyleMode | str) -> JargoyleMode:
    """Return the lower-cased mode, or raise if it is not in ``VALID_MODES``.

    Raises:
        ValueError: If the lower-cased mode is not a valid mode.
    """
    candidate = mode.lower()
    if candidate in VALID_MODES:
        return cast(JargoyleMode, candidate)
    raise ValueError(f"Invalid mode '{mode}'. Must be one of: {', '.join(VALID_MODES)}")
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
VALID_LEXEMES = tuple(_available_lexemes())
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def list_lexeme_dictionaries() -> list[str]:
    """List the lexeme dictionaries currently available to Jargoyle.

    Returns:
        Sorted, lower-cased dictionary names as reported by the backend at
        call time.
    """
    names = _available_lexemes()
    return names
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def jargoyle_drift(
    text: str,
    *,
    lexemes: str = DEFAULT_LEXEMES,
    mode: JargoyleMode = DEFAULT_MODE,
    rate: float | None = None,
    seed: int | None = None,
) -> str:
    """Swap words in ``text`` using a lexeme dictionary.

    Args:
        text: Input text to transform.
        lexemes: Name of the dictionary to use (case-insensitive).
        mode: ``"literal"`` for deterministic first-entry swaps,
            ``"drift"`` for random selection from alternatives.
        rate: Probability of transforming each matching word (0.0 to 1.0).
            ``None`` selects ``DEFAULT_JARGOYLE_RATE``.
        seed: Seed for deterministic randomness. It is only resolved in
            ``"drift"`` mode; in ``"literal"`` mode the backend receives
            ``None``.

    Returns:
        Text with word substitutions applied.

    Raises:
        ValueError: If ``lexemes`` or ``mode`` is invalid.
    """
    lexeme_name = _validate_lexemes(lexemes)
    drift_mode = _validate_mode(mode)

    if rate is None:
        probability = DEFAULT_JARGOYLE_RATE
    else:
        probability = float(rate)

    if drift_mode == "drift":
        backend_seed = resolve_seed(seed, None)
    else:
        backend_seed = None

    return jargoyle_drift_rust(text, lexeme_name, drift_mode, probability, backend_seed)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class Jargoyle(Glitchling):
    """Glitchling that swaps words using bundled lexeme dictionaries.

    Jargoyle replaces words with alternatives from one of several dictionaries:

    - **colors**: Swap color terms (e.g., "red" -> "blue").
    - **synonyms**: General synonym substitution (e.g., "fast" -> "rapid").
    - **corporate**: Business jargon alternatives.
    - **academic**: Scholarly word substitutions.
    - **cyberpunk**: Neon cyberpunk slang and gadgetry.
    - **lovecraftian**: Cosmic horror terminology.
    - **custom**: Any ``*.json`` dictionary placed in ``assets/lexemes``.

    Two modes are supported:

    - **literal**: Use the first (canonical) entry for each word.
    - **drift**: Randomly select from available alternatives.

    Example (outputs are illustrative; actual substitutions depend on the
    dictionary contents, ``rate`` and ``seed``):
        >>> from glitchlings import Jargoyle
        >>> jargoyle = Jargoyle(lexemes="colors", mode="literal")
        >>> jargoyle("The red balloon floated away.")
        'The blue balloon floated away.'

        >>> jargoyle = Jargoyle(lexemes="synonyms", mode="drift", rate=0.5, seed=42)
        >>> jargoyle("The quick fox jumps fast.")
        'The swift fox jumps rapid.'
    """

    flavor = "Oh no... The worst person you know just bought a thesaurus..."

    def __init__(
        self,
        *,
        lexemes: str = DEFAULT_LEXEMES,
        mode: JargoyleMode = DEFAULT_MODE,
        rate: float | None = None,
        seed: int | None = None,
        **kwargs: Any,
    ) -> None:
        """Initialize Jargoyle with the specified dictionary and mode.

        Args:
            lexemes: Name of the dictionary to use. See ``list_lexeme_dictionaries()``
                for the full, dynamic list (including any custom ``*.json`` files).
            mode: Transformation mode. "literal" for deterministic swaps,
                "drift" for random selection.
            rate: Probability of transforming each matching word (0.0 to 1.0).
                ``None`` selects ``DEFAULT_JARGOYLE_RATE``.
            seed: Seed for deterministic randomness.
            **kwargs: Extra keyword arguments forwarded to the base constructor.

        Raises:
            ValueError: If ``lexemes`` or ``mode`` fails validation.
        """
        # Validate (and lower-case) inputs before touching the base class.
        normalized_lexemes = _validate_lexemes(lexemes)
        normalized_mode = _validate_mode(mode)

        effective_rate = DEFAULT_JARGOYLE_RATE if rate is None else rate

        super().__init__(
            name="Jargoyle",
            corruption_function=jargoyle_drift,
            scope=AttackWave.WORD,
            order=AttackOrder.NORMAL,
            seed=seed,
            lexemes=normalized_lexemes,
            mode=normalized_mode,
            rate=effective_rate,
            **kwargs,
            # NOTE: the seed is also written into self.kwargs right after this
            # call, because jargoyle_drift takes ``seed`` as a keyword argument.
        )
        # Ensure seed is in kwargs for the corruption function
        self.kwargs["seed"] = seed

    def pipeline_operation(self) -> PipelineOperationPayload:
        """Return the pipeline descriptor for the Rust backend.

        ``lexemes``, ``mode`` and ``rate`` are read from ``self.kwargs`` (with
        module defaults when a key is absent) and coerced to ``str``/``float``.
        The seed is not part of the descriptor.
        """
        lexemes = self.kwargs.get("lexemes", DEFAULT_LEXEMES)
        mode = self.kwargs.get("mode", DEFAULT_MODE)
        rate = self.kwargs.get("rate", DEFAULT_JARGOYLE_RATE)
        return cast(
            PipelineOperationPayload,
            {
                "type": "jargoyle",
                "lexemes": str(lexemes),
                "mode": str(mode),
                "rate": float(rate),
            },
        )
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
# Module-level singleton for convenience
|
|
230
|
+
jargoyle = Jargoyle()
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
__all__ = [
|
|
234
|
+
"DEFAULT_LEXEMES",
|
|
235
|
+
"DEFAULT_MODE",
|
|
236
|
+
"Jargoyle",
|
|
237
|
+
"JargoyleMode",
|
|
238
|
+
"VALID_LEXEMES",
|
|
239
|
+
"VALID_MODES",
|
|
240
|
+
"jargoyle",
|
|
241
|
+
"jargoyle_drift",
|
|
242
|
+
"list_lexeme_dictionaries",
|
|
243
|
+
]
|