glitchlings 0.2.5__cp312-cp312-win_amd64.whl → 0.9.3__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings/__init__.py +36 -17
- glitchlings/__main__.py +0 -1
- glitchlings/_zoo_rust/__init__.py +12 -0
- glitchlings/_zoo_rust.cp312-win_amd64.pyd +0 -0
- glitchlings/assets/__init__.py +180 -0
- glitchlings/assets/apostrofae_pairs.json +32 -0
- glitchlings/assets/ekkokin_homophones.json +2014 -0
- glitchlings/assets/hokey_assets.json +193 -0
- glitchlings/assets/lexemes/academic.json +1049 -0
- glitchlings/assets/lexemes/colors.json +1333 -0
- glitchlings/assets/lexemes/corporate.json +716 -0
- glitchlings/assets/lexemes/cyberpunk.json +22 -0
- glitchlings/assets/lexemes/lovecraftian.json +23 -0
- glitchlings/assets/lexemes/synonyms.json +3354 -0
- glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
- glitchlings/assets/pipeline_assets.json +29 -0
- glitchlings/attack/__init__.py +53 -0
- glitchlings/attack/compose.py +299 -0
- glitchlings/attack/core.py +465 -0
- glitchlings/attack/encode.py +114 -0
- glitchlings/attack/metrics.py +104 -0
- glitchlings/attack/metrics_dispatch.py +70 -0
- glitchlings/attack/tokenization.py +157 -0
- glitchlings/auggie.py +283 -0
- glitchlings/compat/__init__.py +9 -0
- glitchlings/compat/loaders.py +355 -0
- glitchlings/compat/types.py +41 -0
- glitchlings/conf/__init__.py +41 -0
- glitchlings/conf/loaders.py +331 -0
- glitchlings/conf/schema.py +156 -0
- glitchlings/conf/types.py +72 -0
- glitchlings/config.toml +2 -0
- glitchlings/constants.py +59 -0
- glitchlings/dev/__init__.py +3 -0
- glitchlings/dev/docs.py +45 -0
- glitchlings/dlc/__init__.py +17 -3
- glitchlings/dlc/_shared.py +296 -0
- glitchlings/dlc/gutenberg.py +400 -0
- glitchlings/dlc/huggingface.py +37 -65
- glitchlings/dlc/prime.py +55 -114
- glitchlings/dlc/pytorch.py +98 -0
- glitchlings/dlc/pytorch_lightning.py +173 -0
- glitchlings/internal/__init__.py +16 -0
- glitchlings/internal/rust.py +159 -0
- glitchlings/internal/rust_ffi.py +432 -0
- glitchlings/main.py +123 -32
- glitchlings/runtime_config.py +24 -0
- glitchlings/util/__init__.py +29 -176
- glitchlings/util/adapters.py +65 -0
- glitchlings/util/keyboards.py +311 -0
- glitchlings/util/transcripts.py +108 -0
- glitchlings/zoo/__init__.py +47 -24
- glitchlings/zoo/assets/__init__.py +29 -0
- glitchlings/zoo/core.py +301 -167
- glitchlings/zoo/core_execution.py +98 -0
- glitchlings/zoo/core_planning.py +451 -0
- glitchlings/zoo/corrupt_dispatch.py +295 -0
- glitchlings/zoo/ekkokin.py +118 -0
- glitchlings/zoo/hokey.py +137 -0
- glitchlings/zoo/jargoyle.py +179 -274
- glitchlings/zoo/mim1c.py +106 -68
- glitchlings/zoo/pedant/__init__.py +107 -0
- glitchlings/zoo/pedant/core.py +105 -0
- glitchlings/zoo/pedant/forms.py +74 -0
- glitchlings/zoo/pedant/stones.py +74 -0
- glitchlings/zoo/redactyl.py +44 -175
- glitchlings/zoo/rng.py +259 -0
- glitchlings/zoo/rushmore.py +359 -116
- glitchlings/zoo/scannequin.py +18 -125
- glitchlings/zoo/transforms.py +386 -0
- glitchlings/zoo/typogre.py +76 -162
- glitchlings/zoo/validation.py +477 -0
- glitchlings/zoo/zeedub.py +33 -86
- glitchlings-0.9.3.dist-info/METADATA +334 -0
- glitchlings-0.9.3.dist-info/RECORD +80 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/entry_points.txt +1 -0
- glitchlings/zoo/_ocr_confusions.py +0 -34
- glitchlings/zoo/_rate.py +0 -21
- glitchlings/zoo/reduple.py +0 -169
- glitchlings-0.2.5.dist-info/METADATA +0 -490
- glitchlings-0.2.5.dist-info/RECORD +0 -27
- /glitchlings/{zoo → assets}/ocr_confusions.tsv +0 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/WHEEL +0 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
"""Pure dispatch logic for Glitchling corruption operations.
|
|
2
|
+
|
|
3
|
+
This module contains the deterministic, side-effect-free logic for building
|
|
4
|
+
corruption plans. It separates the "what to corrupt" decision from the
|
|
5
|
+
"how to corrupt" execution.
|
|
6
|
+
|
|
7
|
+
**Design Philosophy:**
|
|
8
|
+
|
|
9
|
+
All functions in this module are *pure* - they perform dispatch analysis
|
|
10
|
+
based solely on their inputs, without side effects. They do not:
|
|
11
|
+
- Invoke corruption functions
|
|
12
|
+
- Modify state
|
|
13
|
+
- Perform I/O
|
|
14
|
+
|
|
15
|
+
The separation allows:
|
|
16
|
+
- Corruption dispatch to be tested without actual corruption
|
|
17
|
+
- Clear boundaries between planning and execution
|
|
18
|
+
- Reasoning about what will be corrupted before execution
|
|
19
|
+
|
|
20
|
+
See AGENTS.md "Functional Purity Architecture" for full details.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
from dataclasses import dataclass
|
|
26
|
+
from typing import Any, Literal
|
|
27
|
+
|
|
28
|
+
from ..util.transcripts import (
|
|
29
|
+
Transcript,
|
|
30
|
+
TranscriptTarget,
|
|
31
|
+
TranscriptTurn,
|
|
32
|
+
is_transcript,
|
|
33
|
+
resolve_transcript_indices,
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
# ---------------------------------------------------------------------------
|
|
37
|
+
# Type Definitions
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass(slots=True, frozen=True)
|
|
42
|
+
class StringCorruptionTarget:
|
|
43
|
+
"""Target specification for corrupting a plain string.
|
|
44
|
+
|
|
45
|
+
Attributes:
|
|
46
|
+
text: The string to corrupt.
|
|
47
|
+
"""
|
|
48
|
+
|
|
49
|
+
text: str
|
|
50
|
+
kind: Literal["string"] = "string"
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass(slots=True, frozen=True)
|
|
54
|
+
class TranscriptTurnTarget:
|
|
55
|
+
"""Target specification for a single turn within a transcript.
|
|
56
|
+
|
|
57
|
+
Attributes:
|
|
58
|
+
index: Position of the turn in the transcript.
|
|
59
|
+
content: The text content to corrupt.
|
|
60
|
+
"""
|
|
61
|
+
|
|
62
|
+
index: int
|
|
63
|
+
content: str
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@dataclass(slots=True, frozen=True)
|
|
67
|
+
class TranscriptCorruptionTarget:
|
|
68
|
+
"""Target specification for corrupting transcript turns.
|
|
69
|
+
|
|
70
|
+
Attributes:
|
|
71
|
+
turns: List of turn targets with their indices and content.
|
|
72
|
+
original_transcript: The original transcript for result assembly.
|
|
73
|
+
"""
|
|
74
|
+
|
|
75
|
+
turns: tuple[TranscriptTurnTarget, ...]
|
|
76
|
+
original_transcript: Transcript
|
|
77
|
+
kind: Literal["transcript"] = "transcript"
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# Union type for corruption targets
|
|
81
|
+
CorruptionTarget = StringCorruptionTarget | TranscriptCorruptionTarget
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
# ---------------------------------------------------------------------------
|
|
85
|
+
# Dispatch Functions
|
|
86
|
+
# ---------------------------------------------------------------------------
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def resolve_corruption_target(
    text: str | Transcript,
    transcript_target: TranscriptTarget,
) -> CorruptionTarget:
    """Classify ``text`` and describe which pieces of it should be corrupted.

    Nothing is corrupted here; the function only builds a target
    specification from its arguments.

    Args:
        text: A plain string, a transcript, or any other value.
        transcript_target: Selector handed to ``resolve_transcript_indices``
            to pick the transcript turns; ignored for non-transcript input.

    Returns:
        A ``StringCorruptionTarget`` for strings and for anything that is not
        a transcript (such values are converted with ``str()``), or a
        ``TranscriptCorruptionTarget`` listing the selected turns.

    Note:
        Non-transcript, non-string inputs (for example lists of strings) are
        stringified rather than rejected. This keeps the historical behaviour
        relied on by dataset column transformations, where values may arrive
        batched as lists.
    """
    # Plain strings pass straight through.
    if isinstance(text, str):
        return StringCorruptionTarget(text=text)

    # Backwards compatibility: anything that is not a transcript is cast to
    # a string instead of raising.
    if not is_transcript(text):
        return StringCorruptionTarget(text=str(text))

    selected = resolve_transcript_indices(text, transcript_target)
    # Selected turns whose "content" is missing or not a string are skipped.
    chosen_turns = tuple(
        TranscriptTurnTarget(index=position, content=body)
        for position in selected
        if isinstance(body := text[position].get("content"), str)
    )
    return TranscriptCorruptionTarget(turns=chosen_turns, original_transcript=text)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def count_corruption_targets(target: CorruptionTarget) -> int:
    """Return the number of text segments described by ``target``.

    Args:
        target: The corruption target specification.

    Returns:
        ``1`` for a string target, otherwise the number of turns listed in
        the transcript target.
    """
    return 1 if isinstance(target, StringCorruptionTarget) else len(target.turns)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def extract_texts_to_corrupt(target: CorruptionTarget) -> list[str]:
    """Collect every text string described by ``target`` into a list.

    Handy for batch processing or for inspecting a plan before running it.

    Args:
        target: The corruption target specification.

    Returns:
        A one-element list holding the text of a string target, or the
        content of each listed turn (in order) for a transcript target.
    """
    if not isinstance(target, StringCorruptionTarget):
        return [entry.content for entry in target.turns]
    return [target.text]
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
# ---------------------------------------------------------------------------
|
|
171
|
+
# Result Assembly Functions
|
|
172
|
+
# ---------------------------------------------------------------------------
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def assemble_string_result(
    _target: StringCorruptionTarget,
    corrupted: str,
) -> str:
    """Return the final value for a string corruption.

    Args:
        _target: The original target. It is not read; the parameter exists so
            the signature mirrors ``assemble_transcript_result``.
        corrupted: The corrupted text.

    Returns:
        ``corrupted`` unchanged.
    """
    return corrupted
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def assemble_transcript_result(
    target: TranscriptCorruptionTarget,
    corrupted_contents: dict[int, str],
) -> Transcript:
    """Build a new transcript with the corrupted turn contents applied.

    Args:
        target: The original target specification; only its
            ``original_transcript`` is read.
        corrupted_contents: Mapping of turn index to corrupted content.
            Indices outside ``0 <= index < len(transcript)`` are ignored.

    Returns:
        A new list of new turn dicts. The original list and its turn dicts
        are not modified.
    """
    # Each turn dict is copied one level deep (a shallow per-turn copy):
    # replacing "content" below never touches the original, but any nested
    # mutable values remain shared with it.
    rebuilt: list[TranscriptTurn] = [dict(turn) for turn in target.original_transcript]
    turn_count = len(rebuilt)

    for position, replacement in corrupted_contents.items():
        if not 0 <= position < turn_count:
            # Out-of-range (including negative) indices are skipped silently.
            continue
        rebuilt[position]["content"] = replacement

    return rebuilt
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def assemble_corruption_result(
|
|
218
|
+
target: CorruptionTarget,
|
|
219
|
+
corrupted: str | dict[int, str],
|
|
220
|
+
) -> str | Transcript:
|
|
221
|
+
"""Assemble the final result based on target type.
|
|
222
|
+
|
|
223
|
+
This is a pure function that combines the original target structure
|
|
224
|
+
with the corrupted content.
|
|
225
|
+
|
|
226
|
+
Args:
|
|
227
|
+
target: The original corruption target.
|
|
228
|
+
corrupted: Either a single corrupted string (for StringCorruptionTarget)
|
|
229
|
+
or a mapping of indices to corrupted content (for TranscriptCorruptionTarget).
|
|
230
|
+
|
|
231
|
+
Returns:
|
|
232
|
+
The assembled result matching the input type.
|
|
233
|
+
|
|
234
|
+
Raises:
|
|
235
|
+
TypeError: If corrupted value type doesn't match target type.
|
|
236
|
+
"""
|
|
237
|
+
if isinstance(target, StringCorruptionTarget):
|
|
238
|
+
if not isinstance(corrupted, str):
|
|
239
|
+
message = "String target requires corrupted string result"
|
|
240
|
+
raise TypeError(message)
|
|
241
|
+
return assemble_string_result(target, corrupted)
|
|
242
|
+
|
|
243
|
+
if isinstance(target, TranscriptCorruptionTarget):
|
|
244
|
+
if not isinstance(corrupted, dict):
|
|
245
|
+
message = "Transcript target requires corrupted content mapping"
|
|
246
|
+
raise TypeError(message)
|
|
247
|
+
return assemble_transcript_result(target, corrupted)
|
|
248
|
+
|
|
249
|
+
# Should be unreachable due to typing, but be explicit
|
|
250
|
+
message = f"Unknown target type: {type(target).__name__}"
|
|
251
|
+
raise TypeError(message)
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
# ---------------------------------------------------------------------------
|
|
255
|
+
# Validation Helpers
|
|
256
|
+
# ---------------------------------------------------------------------------
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def validate_text_input(text: Any) -> str | Transcript:
    """Check that ``text`` is a string or a transcript and hand it back.

    Args:
        text: Input to validate.

    Returns:
        ``text`` itself, unchanged, when it is a ``str`` or satisfies
        ``is_transcript``.

    Raises:
        TypeError: If ``text`` is neither a string nor a transcript.
    """
    if isinstance(text, str) or is_transcript(text):
        return text
    raise TypeError(f"Expected string or transcript, got {type(text).__name__}")
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
__all__ = [
|
|
280
|
+
# Target types
|
|
281
|
+
"StringCorruptionTarget",
|
|
282
|
+
"TranscriptTurnTarget",
|
|
283
|
+
"TranscriptCorruptionTarget",
|
|
284
|
+
"CorruptionTarget",
|
|
285
|
+
# Dispatch functions
|
|
286
|
+
"resolve_corruption_target",
|
|
287
|
+
"count_corruption_targets",
|
|
288
|
+
"extract_texts_to_corrupt",
|
|
289
|
+
# Result assembly
|
|
290
|
+
"assemble_string_result",
|
|
291
|
+
"assemble_transcript_result",
|
|
292
|
+
"assemble_corruption_result",
|
|
293
|
+
# Validation
|
|
294
|
+
"validate_text_input",
|
|
295
|
+
]
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""Homophone substitution glitchling implementation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import math
|
|
6
|
+
import random
|
|
7
|
+
from typing import TYPE_CHECKING, Any, Iterable, Mapping, Sequence
|
|
8
|
+
|
|
9
|
+
from glitchlings.assets import load_homophone_groups
|
|
10
|
+
from glitchlings.constants import DEFAULT_EKKOKIN_RATE, DEFAULT_EKKOKIN_WEIGHTING
|
|
11
|
+
from glitchlings.internal.rust_ffi import ekkokin_homophones_rust, resolve_seed
|
|
12
|
+
|
|
13
|
+
from .core import AttackOrder, AttackWave
|
|
14
|
+
from .core import Glitchling as _GlitchlingRuntime
|
|
15
|
+
|
|
16
|
+
_homophone_groups: tuple[tuple[str, ...], ...] = load_homophone_groups()
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _normalise_group(group: Sequence[str]) -> tuple[str, ...]:
|
|
20
|
+
"""Return a tuple of lowercase homophones preserving original order."""
|
|
21
|
+
|
|
22
|
+
# Use dict.fromkeys to preserve the original ordering while de-duplicating.
|
|
23
|
+
return tuple(dict.fromkeys(word.lower() for word in group if word))
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _build_lookup(groups: Iterable[Sequence[str]]) -> Mapping[str, tuple[str, ...]]:
    """Index homophone groups by each of their member words.

    Every group is normalised with ``_normalise_group`` first. Groups left
    with fewer than two distinct words are skipped. If a word appears in
    several groups, the last group processed wins.
    """
    table: dict[str, tuple[str, ...]] = {}
    for members in groups:
        cleaned = _normalise_group(members)
        if len(cleaned) >= 2:
            # Point every word of the group at the same shared tuple.
            table.update(dict.fromkeys(cleaned, cleaned))
    return table
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
_homophone_lookup = _build_lookup(_homophone_groups)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class _GlitchlingProtocol:
|
|
43
|
+
kwargs: dict[str, Any]
|
|
44
|
+
|
|
45
|
+
def __init__(self, *args: Any, **kwargs: Any) -> None: ...
|
|
46
|
+
|
|
47
|
+
def reset_rng(self, seed: int | None = None) -> None: ...
|
|
48
|
+
|
|
49
|
+
def pipeline_operation(self) -> dict[str, object] | None: ...
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
if TYPE_CHECKING:
|
|
53
|
+
from .core import Glitchling as _GlitchlingBase
|
|
54
|
+
else:
|
|
55
|
+
_GlitchlingBase = _GlitchlingRuntime
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def substitute_homophones(
    text: str,
    rate: float | None = None,
    seed: int | None = None,
    rng: random.Random | None = None,
) -> str:
    """Replace words in ``text`` with curated homophones.

    Args:
        text: Input text to transform.
        rate: Substitution rate. ``None`` selects ``DEFAULT_EKKOKIN_RATE``;
            NaN is treated as ``0.0``; any other value is clamped to the
            range ``[0.0, 1.0]``.
        seed: Seed forwarded to ``resolve_seed`` together with ``rng``.
        rng: Random generator forwarded to ``resolve_seed`` together with
            ``seed``.

    Returns:
        The string produced by ``ekkokin_homophones_rust``.
    """
    requested = rate if rate is not None else DEFAULT_EKKOKIN_RATE

    # NaN compares false against everything, so it is handled before clamping.
    if math.isnan(requested):
        bounded = 0.0
    else:
        bounded = max(0.0, min(1.0, requested))

    return ekkokin_homophones_rust(
        text,
        bounded,
        DEFAULT_EKKOKIN_WEIGHTING,
        resolve_seed(seed, rng),
    )
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class Ekkokin(_GlitchlingBase):
    """Glitchling that swaps words for curated homophones."""

    flavor = "Homophonic idiolectician. There leased favourite flavour? Orange."

    def __init__(
        self,
        *,
        rate: float | None = None,
        seed: int | None = None,
    ) -> None:
        """Configure the glitchling.

        Args:
            rate: Substitution rate; ``None`` selects ``DEFAULT_EKKOKIN_RATE``.
            seed: Seed forwarded to the base class.
        """
        if rate is None:
            rate = DEFAULT_EKKOKIN_RATE

        # Keyword order is kept as-is in case the base class relies on it.
        super().__init__(
            name="Ekkokin",
            corruption_function=substitute_homophones,
            scope=AttackWave.WORD,
            order=AttackOrder.EARLY,
            seed=seed,
            pipeline_operation=_build_pipeline_descriptor,
            rate=rate,
        )
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _build_pipeline_descriptor(glitch: _GlitchlingBase) -> dict[str, object]:
    """Describe an Ekkokin glitchling as a pipeline operation dict.

    Reads ``"rate"`` from ``glitch.kwargs``. A missing or ``None`` rate falls
    back to ``DEFAULT_EKKOKIN_RATE``; any other value is converted with
    ``float()``. The weighting is always ``DEFAULT_EKKOKIN_WEIGHTING``.
    """
    configured = glitch.kwargs.get("rate")
    return {
        "type": "ekkokin",
        "rate": float(configured) if configured is not None else DEFAULT_EKKOKIN_RATE,
        "weighting": DEFAULT_EKKOKIN_WEIGHTING,
    }
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
ekkokin = Ekkokin()
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
__all__ = [
|
|
115
|
+
"Ekkokin",
|
|
116
|
+
"ekkokin",
|
|
117
|
+
"substitute_homophones",
|
|
118
|
+
]
|
glitchlings/zoo/hokey.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
"""Hokey glitchling that performs expressive lengthening."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import random
|
|
6
|
+
from typing import Any, cast
|
|
7
|
+
|
|
8
|
+
from glitchlings.internal.rust_ffi import hokey_rust, resolve_seed
|
|
9
|
+
|
|
10
|
+
from .core import AttackOrder, AttackWave, Gaggle, PipelineOperationPayload
|
|
11
|
+
from .core import Glitchling as GlitchlingBase
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def extend_vowels(
    text: str,
    rate: float = 0.3,
    extension_min: int = 2,
    extension_max: int = 5,
    word_length_threshold: int = 6,
    seed: int | None = None,
    rng: random.Random | None = None,
    base_p: float | None = None,
) -> str:
    """Stretch expressive segments of words for emphasis.

    Empty input is returned unchanged without calling the Rust backend.
    Otherwise every setting is forwarded to ``hokey_rust``.

    Parameters
    ----------
    text : str
        Input text to transform.
    rate : float, optional
        Global selection rate for candidate words.
    extension_min : int, optional
        Minimum number of extra repetitions for the stretch unit.
    extension_max : int, optional
        Maximum number of extra repetitions for the stretch unit.
    word_length_threshold : int, optional
        Preferred maximum alphabetic length; longer words are de-emphasised
        but not excluded.
    seed : int, optional
        Deterministic seed when ``rng`` is not supplied.
    rng : random.Random, optional
        Random number generator to drive sampling.
    base_p : float, optional
        Base probability for the negative-binomial sampler (heavier tails
        for smaller values). ``None`` selects ``0.45``.

    Returns
    -------
    str
        The transformed text, or ``text`` itself when it is empty.
    """
    if not text:
        return text

    return hokey_rust(
        text,
        rate,
        extension_min,
        extension_max,
        word_length_threshold,
        0.45 if base_p is None else base_p,
        resolve_seed(seed, rng),
    )
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class Hokey(GlitchlingBase):
    """Glitchling that stretches words using linguistic heuristics."""

    flavor = "Sooooo excited to meet you! We reeeeeally missed you last week."

    seed: int | None

    def __init__(
        self,
        *,
        rate: float = 0.3,
        extension_min: int = 2,
        extension_max: int = 5,
        word_length_threshold: int = 6,
        base_p: float = 0.45,
        seed: int | None = None,
    ) -> None:
        """Configure the glitchling.

        All tuning parameters are forwarded to the base class as keyword
        arguments; see ``extend_vowels`` for their meaning.
        """
        # Remember the seed the caller asked for; reset_rng() updates it.
        self._master_seed: int | None = seed

        # NOTE(review): thin pass-through to extend_vowels; presumably kept so
        # the base class sees a generic (text, **kwargs) callable - confirm.
        def _corruption_wrapper(text: str, **kwargs: Any) -> str:
            return extend_vowels(text, **kwargs)

        super().__init__(
            name="Hokey",
            corruption_function=_corruption_wrapper,
            scope=AttackWave.CHARACTER,
            order=AttackOrder.FIRST,
            seed=seed,
            rate=rate,
            extension_min=extension_min,
            extension_max=extension_max,
            word_length_threshold=word_length_threshold,
            base_p=base_p,
        )

    def pipeline_operation(self) -> PipelineOperationPayload:
        """Describe this glitchling as a pipeline operation payload.

        Each setting is read from ``self.kwargs``. A missing or ``None``
        value falls back to the constructor default; anything else is
        coerced to ``float`` or ``int`` as appropriate.
        """
        settings = self.kwargs
        # (kwarg name, fallback when missing/None, coercion applied otherwise)
        schema = (
            ("rate", 0.3, float),
            ("extension_min", 2, int),
            ("extension_max", 5, int),
            ("word_length_threshold", 6, int),
            ("base_p", 0.45, float),
        )
        payload: dict[str, object] = {"type": "hokey"}
        for key, fallback, coerce in schema:
            value = settings.get(key)
            payload[key] = fallback if value is None else coerce(value)
        return cast(PipelineOperationPayload, payload)

    def reset_rng(self, seed: int | None = None) -> None:
        """Reset the random state, optionally switching to a new seed.

        With ``seed=None`` the call is delegated to the base class unchanged.
        With a seed, it is recorded as the master seed and passed to the base
        reset. Then, unless the base class left ``self.seed`` as ``None``,
        ``self.seed`` is replaced by ``Gaggle.derive_seed(seed, name, 0)``,
        ``self.rng`` is rebuilt from it and ``kwargs["seed"]`` is updated.
        """
        if seed is None:
            super().reset_rng(None)
            return

        self._master_seed = seed
        super().reset_rng(seed)
        if self.seed is None:
            return

        self.seed = int(Gaggle.derive_seed(int(seed), self.name, 0))
        self.rng = random.Random(self.seed)
        self.kwargs["seed"] = self.seed
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
hokey = Hokey()
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
__all__ = ["Hokey", "hokey", "extend_vowels"]
|