glitchlings 0.2.4__cp312-cp312-win_amd64.whl → 0.2.5__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings/__init__.py +4 -0
- glitchlings/_zoo_rust.cp312-win_amd64.pyd +0 -0
- glitchlings/zoo/__init__.py +5 -1
- glitchlings/zoo/redactyl.py +70 -9
- glitchlings/zoo/reduple.py +57 -16
- glitchlings/zoo/rushmore.py +52 -17
- glitchlings/zoo/zeedub.py +144 -0
- {glitchlings-0.2.4.dist-info → glitchlings-0.2.5.dist-info}/METADATA +13 -1
- {glitchlings-0.2.4.dist-info → glitchlings-0.2.5.dist-info}/RECORD +13 -12
- {glitchlings-0.2.4.dist-info → glitchlings-0.2.5.dist-info}/WHEEL +0 -0
- {glitchlings-0.2.4.dist-info → glitchlings-0.2.5.dist-info}/entry_points.txt +0 -0
- {glitchlings-0.2.4.dist-info → glitchlings-0.2.5.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.2.4.dist-info → glitchlings-0.2.5.dist-info}/top_level.txt +0 -0
glitchlings/__init__.py
CHANGED
@@ -13,6 +13,8 @@ from .zoo import (
|
|
13
13
|
rushmore,
|
14
14
|
Scannequin,
|
15
15
|
scannequin,
|
16
|
+
Zeedub,
|
17
|
+
zeedub,
|
16
18
|
Glitchling,
|
17
19
|
Gaggle,
|
18
20
|
summon,
|
@@ -35,6 +37,8 @@ __all__ = [
|
|
35
37
|
"rushmore",
|
36
38
|
"Scannequin",
|
37
39
|
"scannequin",
|
40
|
+
"Zeedub",
|
41
|
+
"zeedub",
|
38
42
|
"summon",
|
39
43
|
"Glitchling",
|
40
44
|
"Gaggle",
|
Binary file
|
glitchlings/zoo/__init__.py
CHANGED
@@ -10,6 +10,7 @@ from .reduple import Reduple, reduple
|
|
10
10
|
from .rushmore import Rushmore, rushmore
|
11
11
|
from .redactyl import Redactyl, redactyl
|
12
12
|
from .scannequin import Scannequin, scannequin
|
13
|
+
from .zeedub import Zeedub, zeedub
|
13
14
|
from .core import Glitchling, Gaggle
|
14
15
|
|
15
16
|
__all__ = [
|
@@ -27,6 +28,8 @@ __all__ = [
|
|
27
28
|
"redactyl",
|
28
29
|
"Scannequin",
|
29
30
|
"scannequin",
|
31
|
+
"Zeedub",
|
32
|
+
"zeedub",
|
30
33
|
"Glitchling",
|
31
34
|
"Gaggle",
|
32
35
|
"summon",
|
@@ -40,7 +43,7 @@ _HAS_JARGOYLE = _jargoyle_available()
|
|
40
43
|
_BUILTIN_GLITCHLING_LIST: list[Glitchling] = [typogre, mim1c]
|
41
44
|
if _HAS_JARGOYLE:
|
42
45
|
_BUILTIN_GLITCHLING_LIST.append(jargoyle)
|
43
|
-
_BUILTIN_GLITCHLING_LIST.extend([reduple, rushmore, redactyl, scannequin])
|
46
|
+
_BUILTIN_GLITCHLING_LIST.extend([reduple, rushmore, redactyl, scannequin, zeedub])
|
44
47
|
|
45
48
|
BUILTIN_GLITCHLINGS: dict[str, Glitchling] = {
|
46
49
|
glitchling.name.lower(): glitchling for glitchling in _BUILTIN_GLITCHLING_LIST
|
@@ -53,6 +56,7 @@ _BUILTIN_GLITCHLING_TYPES: dict[str, type[Glitchling]] = {
|
|
53
56
|
rushmore.name.lower(): Rushmore,
|
54
57
|
redactyl.name.lower(): Redactyl,
|
55
58
|
scannequin.name.lower(): Scannequin,
|
59
|
+
zeedub.name.lower(): Zeedub,
|
56
60
|
}
|
57
61
|
if _HAS_JARGOYLE:
|
58
62
|
_BUILTIN_GLITCHLING_TYPES[jargoyle.name.lower()] = Jargoyle
|
glitchlings/zoo/redactyl.py
CHANGED
@@ -14,6 +14,41 @@ except ImportError: # pragma: no cover - compiled extension not present
|
|
14
14
|
_redact_words_rust = None
|
15
15
|
|
16
16
|
|
17
|
+
def _weighted_sample_without_replacement(
    population: list[int],
    weights: list[float],
    *,
    k: int,
    rng: random.Random,
) -> list[int]:
    """Select `k` unique values from `population` according to `weights`.

    Values are drawn one at a time; after each draw the chosen entry is
    removed, so no value is returned twice and the remaining weights are
    re-normalised implicitly on the next draw.

    Parameters
    - population: Candidate values (one per weight).
    - weights: Relative selection weight for each candidate.
    - k: Number of values to draw.
    - rng: RNG used for every sampling decision.

    Returns
    - The selected values in draw order. Empty when `k <= 0` or when there
      are no candidates.

    Raises
    - ValueError: If `population` and `weights` differ in length, or if `k`
      exceeds the number of candidates.
    """

    # zip() would silently drop the unmatched tail; surface the caller bug.
    if len(population) != len(weights):
        raise ValueError("population and weights must have the same length")

    selections: list[int] = []
    items = list(zip(population, weights))
    if k <= 0 or not items:
        return selections
    if k > len(items):
        raise ValueError("Sample larger than population or is negative")

    for _ in range(k):
        total_weight = sum(weight for _, weight in items)
        if total_weight <= 0:
            # No usable weight left: fall back to a uniform pick.
            chosen_index = rng.randrange(len(items))
        else:
            threshold = rng.random() * total_weight
            cumulative = 0.0
            # Default to the last entry in case float rounding keeps the
            # running sum just below the threshold.
            chosen_index = len(items) - 1
            for idx, (_, weight) in enumerate(items):
                cumulative += weight
                if cumulative >= threshold:
                    chosen_index = idx
                    break
        value, _ = items.pop(chosen_index)
        selections.append(value)

    return selections
|
50
|
+
|
51
|
+
|
17
52
|
def _python_redact_words(
|
18
53
|
text: str,
|
19
54
|
*,
|
@@ -21,6 +56,7 @@ def _python_redact_words(
|
|
21
56
|
rate: float,
|
22
57
|
merge_adjacent: bool,
|
23
58
|
rng: random.Random,
|
59
|
+
unweighted: bool = False,
|
24
60
|
) -> str:
|
25
61
|
"""Redact random words by replacing their characters.
|
26
62
|
|
@@ -29,18 +65,36 @@ def _python_redact_words(
|
|
29
65
|
- replacement_char: The character to use for redaction (default FULL_BLOCK).
|
30
66
|
- rate: Max proportion of words to redact (default 0.05).
|
31
67
|
- merge_adjacent: If True, merges adjacent redactions across intervening non-word chars.
|
32
|
-
-
|
33
|
-
-
|
68
|
+
- rng: RNG used for sampling decisions.
|
69
|
+
- unweighted: When True, sample words uniformly instead of by length.
|
34
70
|
"""
|
35
71
|
# Preserve exact spacing and punctuation by using regex
|
36
72
|
tokens = re.split(r"(\s+)", text)
|
37
73
|
word_indices = [i for i, token in enumerate(tokens) if i % 2 == 0 and token.strip()]
|
38
74
|
if not word_indices:
|
39
|
-
raise ValueError(
|
75
|
+
raise ValueError(
|
76
|
+
"Cannot redact words because the input text contains no redactable words."
|
77
|
+
)
|
78
|
+
weights: list[float] = []
|
79
|
+
for index in word_indices:
|
80
|
+
word = tokens[index]
|
81
|
+
match = re.match(r"^(\W*)(.*?)(\W*)$", word)
|
82
|
+
core = match.group(2) if match else word
|
83
|
+
core_length = len(core) if core else len(word)
|
84
|
+
if core_length <= 0:
|
85
|
+
core_length = len(word.strip()) or len(word)
|
86
|
+
if core_length <= 0:
|
87
|
+
core_length = 1
|
88
|
+
weights.append(1.0 if unweighted else float(core_length))
|
40
89
|
num_to_redact = max(1, int(len(word_indices) * rate))
|
41
|
-
|
42
|
-
|
43
|
-
indices_to_redact =
|
90
|
+
if num_to_redact > len(word_indices):
|
91
|
+
raise ValueError("Sample larger than population or is negative")
|
92
|
+
indices_to_redact = _weighted_sample_without_replacement(
|
93
|
+
word_indices,
|
94
|
+
weights,
|
95
|
+
k=num_to_redact,
|
96
|
+
rng=rng,
|
97
|
+
)
|
44
98
|
indices_to_redact.sort()
|
45
99
|
|
46
100
|
for i in indices_to_redact:
|
@@ -80,13 +134,14 @@ def redact_words(
|
|
80
134
|
rng: random.Random | None = None,
|
81
135
|
*,
|
82
136
|
redaction_rate: float | None = None,
|
137
|
+
unweighted: bool = False,
|
83
138
|
) -> str:
|
84
139
|
"""Redact random words by replacing their characters."""
|
85
140
|
|
86
141
|
effective_rate = resolve_rate(
|
87
142
|
rate=rate,
|
88
143
|
legacy_value=redaction_rate,
|
89
|
-
default=0.
|
144
|
+
default=0.025,
|
90
145
|
legacy_name="redaction_rate",
|
91
146
|
)
|
92
147
|
|
@@ -94,6 +149,7 @@ def redact_words(
|
|
94
149
|
rng = random.Random(seed)
|
95
150
|
|
96
151
|
clamped_rate = max(0.0, effective_rate)
|
152
|
+
unweighted_flag = bool(unweighted)
|
97
153
|
|
98
154
|
use_rust = _redact_words_rust is not None and isinstance(merge_adjacent, bool)
|
99
155
|
|
@@ -103,6 +159,7 @@ def redact_words(
|
|
103
159
|
replacement_char,
|
104
160
|
clamped_rate,
|
105
161
|
merge_adjacent,
|
162
|
+
unweighted_flag,
|
106
163
|
rng,
|
107
164
|
)
|
108
165
|
|
@@ -112,6 +169,7 @@ def redact_words(
|
|
112
169
|
rate=clamped_rate,
|
113
170
|
merge_adjacent=merge_adjacent,
|
114
171
|
rng=rng,
|
172
|
+
unweighted=unweighted_flag,
|
115
173
|
)
|
116
174
|
|
117
175
|
|
@@ -126,12 +184,13 @@ class Redactyl(Glitchling):
|
|
126
184
|
redaction_rate: float | None = None,
|
127
185
|
merge_adjacent: bool = False,
|
128
186
|
seed: int = 151,
|
187
|
+
unweighted: bool = False,
|
129
188
|
) -> None:
|
130
189
|
self._param_aliases = {"redaction_rate": "rate"}
|
131
190
|
effective_rate = resolve_rate(
|
132
191
|
rate=rate,
|
133
192
|
legacy_value=redaction_rate,
|
134
|
-
default=0.
|
193
|
+
default=0.025,
|
135
194
|
legacy_name="redaction_rate",
|
136
195
|
)
|
137
196
|
super().__init__(
|
@@ -142,6 +201,7 @@ class Redactyl(Glitchling):
|
|
142
201
|
replacement_char=replacement_char,
|
143
202
|
rate=effective_rate,
|
144
203
|
merge_adjacent=merge_adjacent,
|
204
|
+
unweighted=unweighted,
|
145
205
|
)
|
146
206
|
|
147
207
|
def pipeline_operation(self) -> dict[str, Any] | None:
|
@@ -150,15 +210,16 @@ class Redactyl(Glitchling):
|
|
150
210
|
merge_adjacent = self.kwargs.get("merge_adjacent")
|
151
211
|
if replacement_char is None or rate is None or merge_adjacent is None:
|
152
212
|
return None
|
213
|
+
unweighted = bool(self.kwargs.get("unweighted", False))
|
153
214
|
return {
|
154
215
|
"type": "redact",
|
155
216
|
"replacement_char": str(replacement_char),
|
156
217
|
"redaction_rate": float(rate),
|
157
218
|
"merge_adjacent": bool(merge_adjacent),
|
219
|
+
"unweighted": unweighted,
|
158
220
|
}
|
159
221
|
|
160
222
|
|
161
|
-
|
162
223
|
redactyl = Redactyl()
|
163
224
|
|
164
225
|
|
glitchlings/zoo/reduple.py
CHANGED
@@ -16,14 +16,15 @@ def _python_reduplicate_words(
|
|
16
16
|
*,
|
17
17
|
rate: float,
|
18
18
|
rng: random.Random,
|
19
|
+
unweighted: bool = False,
|
19
20
|
) -> str:
|
20
21
|
"""Randomly reduplicate words in the text.
|
21
22
|
|
22
23
|
Parameters
|
23
24
|
- text: Input text.
|
24
25
|
- rate: Max proportion of words to reduplicate (default 0.05).
|
25
|
-
-
|
26
|
-
-
|
26
|
+
- rng: RNG used for sampling decisions.
|
27
|
+
- unweighted: When True, sample words uniformly instead of length-weighted.
|
27
28
|
|
28
29
|
Notes
|
29
30
|
- Preserves spacing and punctuation by tokenizing with separators.
|
@@ -32,6 +33,7 @@ def _python_reduplicate_words(
|
|
32
33
|
# Preserve exact spacing and punctuation by using regex
|
33
34
|
tokens = re.split(r"(\s+)", text) # Split but keep separators
|
34
35
|
|
36
|
+
candidate_weights: list[tuple[int, float]] = []
|
35
37
|
for i in range(0, len(tokens), 2): # Every other token is a word
|
36
38
|
if i >= len(tokens):
|
37
39
|
break
|
@@ -40,16 +42,46 @@ def _python_reduplicate_words(
|
|
40
42
|
if not word or word.isspace(): # Skip empty or whitespace
|
41
43
|
continue
|
42
44
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
45
|
+
match = re.match(r"^(\W*)(.*?)(\W*)$", word)
|
46
|
+
core = match.group(2) if match else word
|
47
|
+
core_length = len(core) if core else len(word)
|
48
|
+
if core_length <= 0:
|
49
|
+
core_length = len(word.strip()) or len(word)
|
50
|
+
if core_length <= 0:
|
51
|
+
core_length = 1
|
52
|
+
weight = 1.0 if unweighted else 1.0 / core_length
|
53
|
+
candidate_weights.append((i, weight))
|
54
|
+
|
55
|
+
if not candidate_weights:
|
56
|
+
return "".join(tokens)
|
57
|
+
|
58
|
+
effective_rate = max(rate, 0.0)
|
59
|
+
if effective_rate <= 0.0:
|
60
|
+
return "".join(tokens)
|
61
|
+
|
62
|
+
mean_weight = sum(weight for _, weight in candidate_weights) / len(
|
63
|
+
candidate_weights
|
64
|
+
)
|
65
|
+
|
66
|
+
for index, weight in candidate_weights:
|
67
|
+
if effective_rate >= 1.0:
|
68
|
+
probability = 1.0
|
69
|
+
else:
|
70
|
+
if mean_weight <= 0.0:
|
71
|
+
probability = effective_rate
|
51
72
|
else:
|
52
|
-
|
73
|
+
probability = min(1.0, effective_rate * (weight / mean_weight))
|
74
|
+
if rng.random() >= probability:
|
75
|
+
continue
|
76
|
+
|
77
|
+
word = tokens[index]
|
78
|
+
match = re.match(r"^(\W*)(.*?)(\W*)$", word)
|
79
|
+
if match:
|
80
|
+
prefix, core, suffix = match.groups()
|
81
|
+
# Reduplicate with a space: "word" -> "word word"
|
82
|
+
tokens[index] = f"{prefix}{core} {core}{suffix}"
|
83
|
+
else:
|
84
|
+
tokens[index] = f"{word} {word}"
|
53
85
|
return "".join(tokens)
|
54
86
|
|
55
87
|
|
@@ -60,6 +92,7 @@ def reduplicate_words(
|
|
60
92
|
rng: random.Random | None = None,
|
61
93
|
*,
|
62
94
|
reduplication_rate: float | None = None,
|
95
|
+
unweighted: bool = False,
|
63
96
|
) -> str:
|
64
97
|
"""Randomly reduplicate words in the text.
|
65
98
|
|
@@ -70,7 +103,7 @@ def reduplicate_words(
|
|
70
103
|
effective_rate = resolve_rate(
|
71
104
|
rate=rate,
|
72
105
|
legacy_value=reduplication_rate,
|
73
|
-
default=0.
|
106
|
+
default=0.01,
|
74
107
|
legacy_name="reduplication_rate",
|
75
108
|
)
|
76
109
|
|
@@ -78,14 +111,16 @@ def reduplicate_words(
|
|
78
111
|
rng = random.Random(seed)
|
79
112
|
|
80
113
|
clamped_rate = max(0.0, effective_rate)
|
114
|
+
unweighted_flag = bool(unweighted)
|
81
115
|
|
82
116
|
if _reduplicate_words_rust is not None:
|
83
|
-
return _reduplicate_words_rust(text, clamped_rate, rng)
|
117
|
+
return _reduplicate_words_rust(text, clamped_rate, unweighted_flag, rng)
|
84
118
|
|
85
119
|
return _python_reduplicate_words(
|
86
120
|
text,
|
87
121
|
rate=clamped_rate,
|
88
122
|
rng=rng,
|
123
|
+
unweighted=unweighted_flag,
|
89
124
|
)
|
90
125
|
|
91
126
|
|
@@ -98,12 +133,13 @@ class Reduple(Glitchling):
|
|
98
133
|
rate: float | None = None,
|
99
134
|
reduplication_rate: float | None = None,
|
100
135
|
seed: int | None = None,
|
136
|
+
unweighted: bool = False,
|
101
137
|
) -> None:
|
102
138
|
self._param_aliases = {"reduplication_rate": "rate"}
|
103
139
|
effective_rate = resolve_rate(
|
104
140
|
rate=rate,
|
105
141
|
legacy_value=reduplication_rate,
|
106
|
-
default=0.
|
142
|
+
default=0.01,
|
107
143
|
legacy_name="reduplication_rate",
|
108
144
|
)
|
109
145
|
super().__init__(
|
@@ -112,14 +148,19 @@ class Reduple(Glitchling):
|
|
112
148
|
scope=AttackWave.WORD,
|
113
149
|
seed=seed,
|
114
150
|
rate=effective_rate,
|
151
|
+
unweighted=unweighted,
|
115
152
|
)
|
116
153
|
|
117
154
|
def pipeline_operation(self) -> dict[str, Any] | None:
|
118
155
|
rate = self.kwargs.get("rate")
|
119
156
|
if rate is None:
|
120
157
|
return None
|
121
|
-
|
122
|
-
|
158
|
+
unweighted = bool(self.kwargs.get("unweighted", False))
|
159
|
+
return {
|
160
|
+
"type": "reduplicate",
|
161
|
+
"reduplication_rate": float(rate),
|
162
|
+
"unweighted": unweighted,
|
163
|
+
}
|
123
164
|
|
124
165
|
|
125
166
|
reduple = Reduple()
|
glitchlings/zoo/rushmore.py
CHANGED
@@ -17,42 +17,67 @@ def _python_delete_random_words(
|
|
17
17
|
*,
|
18
18
|
rate: float,
|
19
19
|
rng: random.Random,
|
20
|
+
unweighted: bool = False,
|
20
21
|
) -> str:
|
21
22
|
"""Delete random words from the input text while preserving whitespace."""
|
22
23
|
|
23
|
-
|
24
|
+
effective_rate = max(rate, 0.0)
|
25
|
+
if effective_rate <= 0.0:
|
24
26
|
return text
|
25
27
|
|
26
28
|
tokens = re.split(r"(\s+)", text) # Split but keep separators for later rejoin
|
27
29
|
|
28
|
-
|
30
|
+
candidate_data: list[tuple[int, float]] = []
|
29
31
|
for i in range(2, len(tokens), 2): # Every other token is a word, skip the first word
|
30
32
|
word = tokens[i]
|
31
33
|
if not word or word.isspace():
|
32
34
|
continue
|
33
35
|
|
34
|
-
|
36
|
+
match = re.match(r"^(\W*)(.*?)(\W*)$", word)
|
37
|
+
core = match.group(2) if match else word
|
38
|
+
core_length = len(core) if core else len(word)
|
39
|
+
if core_length <= 0:
|
40
|
+
core_length = len(word.strip()) or len(word)
|
41
|
+
if core_length <= 0:
|
42
|
+
core_length = 1
|
43
|
+
weight = 1.0 if unweighted else 1.0 / core_length
|
44
|
+
candidate_data.append((i, weight))
|
45
|
+
|
46
|
+
if not candidate_data:
|
47
|
+
return text
|
35
48
|
|
36
49
|
allowed_deletions = min(
|
37
|
-
len(
|
50
|
+
len(candidate_data), math.floor(len(candidate_data) * effective_rate)
|
38
51
|
)
|
39
52
|
if allowed_deletions <= 0:
|
40
53
|
return text
|
41
54
|
|
55
|
+
mean_weight = sum(weight for _, weight in candidate_data) / len(candidate_data)
|
56
|
+
|
42
57
|
deletions = 0
|
43
|
-
for
|
44
|
-
if
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
58
|
+
for index, weight in candidate_data:
|
59
|
+
if deletions >= allowed_deletions:
|
60
|
+
break
|
61
|
+
|
62
|
+
if effective_rate >= 1.0:
|
63
|
+
probability = 1.0
|
64
|
+
else:
|
65
|
+
if mean_weight <= 0.0:
|
66
|
+
probability = effective_rate
|
50
67
|
else:
|
51
|
-
|
68
|
+
probability = min(1.0, effective_rate * (weight / mean_weight))
|
69
|
+
if rng.random() >= probability:
|
70
|
+
continue
|
71
|
+
|
72
|
+
word = tokens[index]
|
73
|
+
match = re.match(r"^(\W*)(.*?)(\W*)$", word)
|
74
|
+
if match:
|
75
|
+
prefix, _, suffix = match.groups()
|
76
|
+
tokens[index] = f"{prefix.strip()}{suffix.strip()}"
|
77
|
+
else:
|
78
|
+
tokens[index] = ""
|
52
79
|
|
53
|
-
|
54
|
-
if deletions >= allowed_deletions:
|
55
|
-
break
|
80
|
+
deletions += 1
|
56
81
|
|
57
82
|
text = "".join(tokens)
|
58
83
|
text = re.sub(r"\s+([.,;:])", r"\1", text)
|
@@ -68,6 +93,7 @@ def delete_random_words(
|
|
68
93
|
rng: random.Random | None = None,
|
69
94
|
*,
|
70
95
|
max_deletion_rate: float | None = None,
|
96
|
+
unweighted: bool = False,
|
71
97
|
) -> str:
|
72
98
|
"""Delete random words from the input text.
|
73
99
|
|
@@ -85,14 +111,16 @@ def delete_random_words(
|
|
85
111
|
rng = random.Random(seed)
|
86
112
|
|
87
113
|
clamped_rate = max(0.0, effective_rate)
|
114
|
+
unweighted_flag = bool(unweighted)
|
88
115
|
|
89
116
|
if _delete_random_words_rust is not None:
|
90
|
-
return _delete_random_words_rust(text, clamped_rate, rng)
|
117
|
+
return _delete_random_words_rust(text, clamped_rate, unweighted_flag, rng)
|
91
118
|
|
92
119
|
return _python_delete_random_words(
|
93
120
|
text,
|
94
121
|
rate=clamped_rate,
|
95
122
|
rng=rng,
|
123
|
+
unweighted=unweighted_flag,
|
96
124
|
)
|
97
125
|
|
98
126
|
|
@@ -105,6 +133,7 @@ class Rushmore(Glitchling):
|
|
105
133
|
rate: float | None = None,
|
106
134
|
max_deletion_rate: float | None = None,
|
107
135
|
seed: int | None = None,
|
136
|
+
unweighted: bool = False,
|
108
137
|
) -> None:
|
109
138
|
self._param_aliases = {"max_deletion_rate": "rate"}
|
110
139
|
effective_rate = resolve_rate(
|
@@ -119,6 +148,7 @@ class Rushmore(Glitchling):
|
|
119
148
|
scope=AttackWave.WORD,
|
120
149
|
seed=seed,
|
121
150
|
rate=effective_rate,
|
151
|
+
unweighted=unweighted,
|
122
152
|
)
|
123
153
|
|
124
154
|
def pipeline_operation(self) -> dict[str, Any] | None:
|
@@ -127,7 +157,12 @@ class Rushmore(Glitchling):
|
|
127
157
|
rate = self.kwargs.get("max_deletion_rate")
|
128
158
|
if rate is None:
|
129
159
|
return None
|
130
|
-
|
160
|
+
unweighted = bool(self.kwargs.get("unweighted", False))
|
161
|
+
return {
|
162
|
+
"type": "delete",
|
163
|
+
"max_deletion_rate": float(rate),
|
164
|
+
"unweighted": unweighted,
|
165
|
+
}
|
131
166
|
|
132
167
|
|
133
168
|
rushmore = Rushmore()
|
@@ -0,0 +1,144 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import math
|
4
|
+
import random
|
5
|
+
from collections.abc import Sequence
|
6
|
+
|
7
|
+
from .core import Glitchling, AttackWave, AttackOrder
|
8
|
+
from ._rate import resolve_rate
|
9
|
+
|
10
|
+
try:
|
11
|
+
from glitchlings._zoo_rust import inject_zero_widths as _inject_zero_widths_rust
|
12
|
+
except ImportError: # pragma: no cover - compiled extension not present
|
13
|
+
_inject_zero_widths_rust = None
|
14
|
+
|
15
|
+
# Glyph pool used when the caller does not supply its own `characters`.
_DEFAULT_ZERO_WIDTH_CHARACTERS: tuple[str, ...] = (
    "\u200b",  # ZERO WIDTH SPACE
    "\u200c",  # ZERO WIDTH NON-JOINER
    "\u200d",  # ZERO WIDTH JOINER
    "\ufeff",  # ZERO WIDTH NO-BREAK SPACE
    "\u2060",  # WORD JOINER
)
|
22
|
+
|
23
|
+
|
24
|
+
def _python_insert_zero_widths(
    text: str,
    *,
    rate: float,
    rng: random.Random,
    characters: Sequence[str],
) -> str:
    """Insert glyphs from `characters` between adjacent non-space characters.

    Parameters
    - text: Input text.
    - rate: Proportion of eligible character pairs that receive an insertion.
      Negative values are treated as 0. The fractional part of
      `rate * eligible_pairs` is resolved with one RNG draw.
    - rng: RNG used for the rounding draw, the position sample and the glyph
      choices.
    - characters: Candidate strings to inject; empty strings are ignored.

    Returns
    - The text with the chosen glyphs inserted, or the original text when
      there is nothing to do (empty text, empty palette, non-positive rate,
      or no pair of adjacent non-space characters).
    """
    if not text:
        return text

    palette = [char for char in characters if char]
    if not palette:
        return text

    # Checked before scanning the text; no RNG is consumed on this path.
    clamped_rate = max(0.0, rate)
    if clamped_rate <= 0.0:
        return text

    # An insertion point is the index *after* the first character of a pair
    # in which neither character is whitespace.
    positions = [
        index + 1
        for index in range(len(text) - 1)
        if not text[index].isspace() and not text[index + 1].isspace()
    ]
    if not positions:
        return text

    total = len(positions)
    target = clamped_rate * total
    count = math.floor(target)
    remainder = target - count
    # Probabilistic rounding keeps the expected insertion count at `target`.
    if remainder > 0.0 and rng.random() < remainder:
        count += 1
    count = min(total, count)

    if count <= 0:
        return text

    chosen = rng.sample(positions, count)
    chosen.sort()

    # Glyphs are drawn for the right-most position first so the RNG stream
    # matches a back-to-front in-place insertion; reversing afterwards lines
    # them up with the ascending positions.
    glyphs = [rng.choice(palette) for _ in chosen]
    glyphs.reverse()

    # Assemble in one pass instead of shifting a list once per insertion.
    pieces: list[str] = []
    previous = 0
    for position, glyph in zip(chosen, glyphs):
        pieces.append(text[previous:position])
        pieces.append(glyph)
        previous = position
    pieces.append(text[previous:])

    return "".join(pieces)
|
69
|
+
|
70
|
+
|
71
|
+
def insert_zero_widths(
    text: str,
    rate: float | None = None,
    seed: int | None = None,
    rng: random.Random | None = None,
    *,
    characters: Sequence[str] | None = None,
) -> str:
    """Inject zero-width characters between non-space character pairs.

    Parameters
    - text: Input text.
    - rate: Insertion rate; resolved through `resolve_rate` with a fallback
      of 0.02 when not supplied. Negative values are clamped to 0.
    - seed: Seed for a fresh `random.Random` when `rng` is not given.
    - rng: RNG to use; takes precedence over `seed`.
    - characters: Optional pool of strings to inject. Empty strings are
      dropped. Defaults to `_DEFAULT_ZERO_WIDTH_CHARACTERS`.

    Returns
    - The corrupted text, or `text` unchanged when the text or the cleaned
      palette is empty or the clamped rate is 0.
    """

    effective_rate = resolve_rate(
        rate=rate,
        legacy_value=None,
        default=0.02,
        legacy_name="rate",
    )

    if rng is None:
        rng = random.Random(seed)

    palette: Sequence[str] = (
        tuple(characters) if characters is not None else _DEFAULT_ZERO_WIDTH_CHARACTERS
    )

    cleaned_palette = tuple(char for char in palette if char)
    if not cleaned_palette or not text:
        return text

    clamped_rate = max(0.0, effective_rate)
    if clamped_rate == 0.0:
        return text

    # Use the compiled implementation when the extension module imported.
    if _inject_zero_widths_rust is not None:
        return _inject_zero_widths_rust(text, clamped_rate, list(cleaned_palette), rng)

    return _python_insert_zero_widths(
        text,
        rate=clamped_rate,
        rng=rng,
        characters=cleaned_palette,
    )
|
112
|
+
|
113
|
+
|
114
|
+
class Zeedub(Glitchling):
    """Glitchling that plants zero-width glyphs inside words."""

    def __init__(
        self,
        *,
        rate: float | None = None,
        seed: int | None = None,
        characters: Sequence[str] | None = None,
    ) -> None:
        """Configure the glitchling.

        Parameters
        - rate: Insertion rate; resolved through `resolve_rate` with a
          fallback of 0.02 when not supplied.
        - seed: Seed forwarded to the base `Glitchling`.
        - characters: Optional pool of strings to inject; stored as a tuple
          when provided, otherwise passed on as None.
        """
        effective_rate = resolve_rate(
            rate=rate,
            legacy_value=None,
            default=0.02,
            legacy_name="rate",
        )
        super().__init__(
            name="Zeedub",
            corruption_function=insert_zero_widths,
            scope=AttackWave.CHARACTER,
            order=AttackOrder.LAST,
            seed=seed,
            rate=effective_rate,
            # Snapshot the caller's sequence so later mutation of it cannot
            # change the stored configuration.
            characters=tuple(characters) if characters is not None else None,
        )
|
139
|
+
|
140
|
+
|
141
|
+
zeedub = Zeedub()
|
142
|
+
|
143
|
+
|
144
|
+
__all__ = ["Zeedub", "zeedub", "insert_zero_widths"]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: glitchlings
|
3
|
-
Version: 0.2.
|
3
|
+
Version: 0.2.5
|
4
4
|
Summary: Monsters for your language games.
|
5
5
|
Author: osoleve
|
6
6
|
License: Apache License
|
@@ -396,6 +396,18 @@ _How can a computer need reading glasses?_
|
|
396
396
|
> - `rate (float)`: The maximum proportion of eligible confusion spans to replace (default: 0.02, 2%).
|
397
397
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
398
398
|
|
399
|
+
### Zeedub
|
400
|
+
|
401
|
+
_A whispering glyph parasite that lives in the interstices of codepoints, marking territory with invisible traces._
|
402
|
+
|
403
|
+
> _**Invisible Ink.**_ Zeedub slips zero-width codepoints between non-space character pairs, forcing models to reason about text whose visible form masks hidden glyphs.
|
404
|
+
>
|
405
|
+
> Args
|
406
|
+
>
|
407
|
+
> - `rate (float)`: Expected number of zero-width insertions as a proportion of eligible bigrams (default: 0.02, 2%).
|
408
|
+
> - `characters (Sequence[str])`: Optional override for the pool of zero-width strings to inject (default: curated invisibles such as U+200B, U+200C, U+200D, U+FEFF, U+2060).
|
409
|
+
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
410
|
+
|
399
411
|
### Jargoyle
|
400
412
|
|
401
413
|
_Uh oh. The worst person you know just bought a thesaurus._
|
@@ -1,26 +1,27 @@
|
|
1
|
-
glitchlings/__init__.py,sha256=
|
1
|
+
glitchlings/__init__.py,sha256=fjerquRITZQY_rY5mhTVVQyeGAz1qTpgicvDhbpqgi8,678
|
2
2
|
glitchlings/__main__.py,sha256=pqNe1C9hMf8pap4oh6x6yo2h4Nsa2RFSaMWHfGtNXj0,130
|
3
|
-
glitchlings/_zoo_rust.cp312-win_amd64.pyd,sha256=
|
3
|
+
glitchlings/_zoo_rust.cp312-win_amd64.pyd,sha256=JO0QDAqXc14YPXxmghAKNuXZ3uHFEOyBa-SkU4EO_fI,2019328
|
4
4
|
glitchlings/main.py,sha256=QrSSLWcKh1_NDfJDGh-3UVKdI7AkzfMy6Jz1ouxIgnE,6149
|
5
5
|
glitchlings/dlc/__init__.py,sha256=IHD-GGhVFb7SVzErvf2YCJkOR4wGo0nFHXkn_daMvS8,146
|
6
6
|
glitchlings/dlc/huggingface.py,sha256=PIesnDIEvyJxj1IuLw2P9nVPTr4Nv81XM7w2axfyhkA,3029
|
7
7
|
glitchlings/dlc/prime.py,sha256=hySyYBncUM-49j6JtrHYO6c3HpbG2vTt2EYZnOJ85C0,8972
|
8
8
|
glitchlings/util/__init__.py,sha256=GoyQuHTfGRkHzuZwJji6QWSiGd_LHa9QiyjjEpBFW7E,4679
|
9
|
-
glitchlings/zoo/__init__.py,sha256=
|
9
|
+
glitchlings/zoo/__init__.py,sha256=mAhsnR3ZK9BocxT3J4WF6JcYQMYI9e_EYZ-GMxHv0P4,4420
|
10
10
|
glitchlings/zoo/_ocr_confusions.py,sha256=W59Aa5MBDwRF65f8GV-6XwGAmlR5Uk7pa5qvHvhIYdY,1252
|
11
11
|
glitchlings/zoo/_rate.py,sha256=EYUWXYyR2IK0zYBWyBOlnUjDxU32JE9mZTZeodVx5CA,548
|
12
12
|
glitchlings/zoo/core.py,sha256=QKHmzmONNkiA3RdfgLdNx-FPFwoH4Bm-Tkc3vSCHNpc,14412
|
13
13
|
glitchlings/zoo/jargoyle.py,sha256=1fnL_8bv1Y-T2h1C6NRzIylYyOuAUI-BiMReFewqh00,11002
|
14
14
|
glitchlings/zoo/mim1c.py,sha256=3ddNOzWgLABuEOh5T98Xk439ejx-YHGI7ErXET03Crc,3537
|
15
15
|
glitchlings/zoo/ocr_confusions.tsv,sha256=S-IJEYCIXYKT1Uu7Id8Lnvg5pw528yNigTtWUdnMv9k,213
|
16
|
-
glitchlings/zoo/redactyl.py,sha256=
|
17
|
-
glitchlings/zoo/reduple.py,sha256=
|
18
|
-
glitchlings/zoo/rushmore.py,sha256=
|
16
|
+
glitchlings/zoo/redactyl.py,sha256=jjLad9ugG8516CNhuUfv16OOs9HwqTiUzVqY0CLskhY,6928
|
17
|
+
glitchlings/zoo/reduple.py,sha256=oGkOkH9bJiG-ogsi5ewglq6FUmzvRM6UC4N61LyNdvk,5026
|
18
|
+
glitchlings/zoo/rushmore.py,sha256=RE-br8OAIBRil3Mz381OcdMtb1fuNCZ7LzAjt44hFkM,4900
|
19
19
|
glitchlings/zoo/scannequin.py,sha256=TJyNYTTIB7rxZH3XKIETy0YVf4EjsMgGWYmYaxH9jxU,5030
|
20
20
|
glitchlings/zoo/typogre.py,sha256=olTTXDmFkVQ3r-T1vxm2mLomRvIDXHrNHfgin316wzE,6221
|
21
|
-
glitchlings
|
22
|
-
glitchlings-0.2.
|
23
|
-
glitchlings-0.2.
|
24
|
-
glitchlings-0.2.
|
25
|
-
glitchlings-0.2.
|
26
|
-
glitchlings-0.2.
|
21
|
+
glitchlings/zoo/zeedub.py,sha256=n1qTKE_Dl0m8SEKhaP91oHAyJ484NxaGLPu_ZLr0Ldo,3696
|
22
|
+
glitchlings-0.2.5.dist-info/licenses/LICENSE,sha256=EFEP1evBfHaxsMTBjxm0sZVRp2wct8QLvHE1saII5FI,11538
|
23
|
+
glitchlings-0.2.5.dist-info/METADATA,sha256=UG-L-7qePJBz0sKAWGX2yKNmup3D_gaid0DDlPsuf3Y,27198
|
24
|
+
glitchlings-0.2.5.dist-info/WHEEL,sha256=8UP9x9puWI0P1V_d7K2oMTBqfeLNm21CTzZ_Ptr0NXU,101
|
25
|
+
glitchlings-0.2.5.dist-info/entry_points.txt,sha256=kGOwuAsjFDLtztLisaXtOouq9wFVMOJg5FzaAkg-Hto,54
|
26
|
+
glitchlings-0.2.5.dist-info/top_level.txt,sha256=VHFNBrLjtDwPCYXbGKi6o17Eueedi81eNbR3hBOoST0,12
|
27
|
+
glitchlings-0.2.5.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|