glitchlings 0.2.4__cp312-cp312-win_amd64.whl → 0.2.6__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings/__init__.py +4 -0
- glitchlings/_zoo_rust.cp312-win_amd64.pyd +0 -0
- glitchlings/dlc/prime.py +18 -1
- glitchlings/zoo/__init__.py +5 -1
- glitchlings/zoo/core.py +12 -4
- glitchlings/zoo/redactyl.py +74 -10
- glitchlings/zoo/reduple.py +57 -16
- glitchlings/zoo/rushmore.py +52 -17
- glitchlings/zoo/zeedub.py +144 -0
- {glitchlings-0.2.4.dist-info → glitchlings-0.2.6.dist-info}/METADATA +14 -2
- glitchlings-0.2.6.dist-info/RECORD +27 -0
- glitchlings-0.2.4.dist-info/RECORD +0 -26
- {glitchlings-0.2.4.dist-info → glitchlings-0.2.6.dist-info}/WHEEL +0 -0
- {glitchlings-0.2.4.dist-info → glitchlings-0.2.6.dist-info}/entry_points.txt +0 -0
- {glitchlings-0.2.4.dist-info → glitchlings-0.2.6.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.2.4.dist-info → glitchlings-0.2.6.dist-info}/top_level.txt +0 -0
glitchlings/__init__.py
CHANGED
@@ -13,6 +13,8 @@ from .zoo import (
|
|
13
13
|
rushmore,
|
14
14
|
Scannequin,
|
15
15
|
scannequin,
|
16
|
+
Zeedub,
|
17
|
+
zeedub,
|
16
18
|
Glitchling,
|
17
19
|
Gaggle,
|
18
20
|
summon,
|
@@ -35,6 +37,8 @@ __all__ = [
|
|
35
37
|
"rushmore",
|
36
38
|
"Scannequin",
|
37
39
|
"scannequin",
|
40
|
+
"Zeedub",
|
41
|
+
"zeedub",
|
38
42
|
"summon",
|
39
43
|
"Glitchling",
|
40
44
|
"Gaggle",
|
Binary file
|
glitchlings/dlc/prime.py
CHANGED
@@ -49,7 +49,24 @@ def _resolve_columns(dataset: Dataset, columns: Sequence[str] | None) -> list[st
|
|
49
49
|
if candidate in available:
|
50
50
|
return [candidate]
|
51
51
|
|
52
|
-
|
52
|
+
try:
|
53
|
+
dataset_length = len(dataset) # type: ignore[arg-type]
|
54
|
+
except TypeError:
|
55
|
+
preview_rows: list[dict[str, Any]]
|
56
|
+
take_fn = getattr(dataset, "take", None)
|
57
|
+
if callable(take_fn):
|
58
|
+
preview_rows = list(take_fn(1))
|
59
|
+
else:
|
60
|
+
iterator = iter(dataset)
|
61
|
+
try:
|
62
|
+
first_row = next(iterator)
|
63
|
+
except StopIteration:
|
64
|
+
preview_rows = []
|
65
|
+
else:
|
66
|
+
preview_rows = [first_row]
|
67
|
+
sample = dict(preview_rows[0]) if preview_rows else {}
|
68
|
+
else:
|
69
|
+
sample = dataset[0] if dataset_length else {}
|
53
70
|
inferred = [
|
54
71
|
name
|
55
72
|
for name in dataset.column_names
|
glitchlings/zoo/__init__.py
CHANGED
@@ -10,6 +10,7 @@ from .reduple import Reduple, reduple
|
|
10
10
|
from .rushmore import Rushmore, rushmore
|
11
11
|
from .redactyl import Redactyl, redactyl
|
12
12
|
from .scannequin import Scannequin, scannequin
|
13
|
+
from .zeedub import Zeedub, zeedub
|
13
14
|
from .core import Glitchling, Gaggle
|
14
15
|
|
15
16
|
__all__ = [
|
@@ -27,6 +28,8 @@ __all__ = [
|
|
27
28
|
"redactyl",
|
28
29
|
"Scannequin",
|
29
30
|
"scannequin",
|
31
|
+
"Zeedub",
|
32
|
+
"zeedub",
|
30
33
|
"Glitchling",
|
31
34
|
"Gaggle",
|
32
35
|
"summon",
|
@@ -40,7 +43,7 @@ _HAS_JARGOYLE = _jargoyle_available()
|
|
40
43
|
_BUILTIN_GLITCHLING_LIST: list[Glitchling] = [typogre, mim1c]
|
41
44
|
if _HAS_JARGOYLE:
|
42
45
|
_BUILTIN_GLITCHLING_LIST.append(jargoyle)
|
43
|
-
_BUILTIN_GLITCHLING_LIST.extend([reduple, rushmore, redactyl, scannequin])
|
46
|
+
_BUILTIN_GLITCHLING_LIST.extend([reduple, rushmore, redactyl, scannequin, zeedub])
|
44
47
|
|
45
48
|
BUILTIN_GLITCHLINGS: dict[str, Glitchling] = {
|
46
49
|
glitchling.name.lower(): glitchling for glitchling in _BUILTIN_GLITCHLING_LIST
|
@@ -53,6 +56,7 @@ _BUILTIN_GLITCHLING_TYPES: dict[str, type[Glitchling]] = {
|
|
53
56
|
rushmore.name.lower(): Rushmore,
|
54
57
|
redactyl.name.lower(): Redactyl,
|
55
58
|
scannequin.name.lower(): Scannequin,
|
59
|
+
zeedub.name.lower(): Zeedub,
|
56
60
|
}
|
57
61
|
if _HAS_JARGOYLE:
|
58
62
|
_BUILTIN_GLITCHLING_TYPES[jargoyle.name.lower()] = Jargoyle
|
glitchlings/zoo/core.py
CHANGED
@@ -27,17 +27,25 @@ log = logging.getLogger(__name__)
|
|
27
27
|
|
28
28
|
|
29
29
|
_PIPELINE_FEATURE_FLAG_ENV = "GLITCHLINGS_RUST_PIPELINE"
|
30
|
+
_PIPELINE_ENABLE_VALUES = {"1", "true", "yes", "on"}
|
31
|
+
_PIPELINE_DISABLE_VALUES = {"0", "false", "no", "off"}
|
30
32
|
|
31
33
|
|
32
34
|
def _pipeline_feature_flag_enabled() -> bool:
    """Report whether the environment leaves the Rust pipeline switched on.

    The flag is read from the environment variable named by
    ``_PIPELINE_FEATURE_FLAG_ENV``. Only a value found in
    ``_PIPELINE_DISABLE_VALUES`` (compared after stripping whitespace and
    lower-casing) turns the pipeline off. An unset variable, any of the
    ``_PIPELINE_ENABLE_VALUES`` spellings, and any unrecognised value all
    count as enabled.
    """
    raw_flag = os.environ.get(_PIPELINE_FEATURE_FLAG_ENV)
    if raw_flag is None:
        return True
    # Opt-out semantics: everything except an explicit "off" spelling is on.
    return raw_flag.strip().lower() not in _PIPELINE_DISABLE_VALUES
|
41
49
|
|
42
50
|
if TYPE_CHECKING: # pragma: no cover - typing only
|
43
51
|
from datasets import Dataset # type: ignore
|
@@ -356,7 +364,7 @@ class Gaggle(Glitchling):
|
|
356
364
|
|
357
365
|
@staticmethod
def rust_pipeline_enabled() -> bool:
    """Tell whether the Rust pipeline is both supported and not opted out of.

    Returns the result of ``Gaggle.rust_pipeline_supported()`` when that is
    falsy; otherwise returns ``_pipeline_feature_flag_enabled()``.
    """
    supported = Gaggle.rust_pipeline_supported()
    # The environment flag is only consulted once support is confirmed.
    return _pipeline_feature_flag_enabled() if supported else supported
|
362
370
|
|
glitchlings/zoo/redactyl.py
CHANGED
@@ -14,6 +14,41 @@ except ImportError: # pragma: no cover - compiled extension not present
|
|
14
14
|
_redact_words_rust = None
|
15
15
|
|
16
16
|
|
17
|
+
def _weighted_sample_without_replacement(
    population: list[int],
    weights: list[float],
    *,
    k: int,
    rng: random.Random,
) -> list[int]:
    """Select ``k`` distinct entries of ``population`` according to ``weights``.

    Entries are drawn one at a time. Each draw picks an entry with probability
    proportional to its weight among the entries still available, then removes
    it so it cannot be chosen again. When the remaining weights sum to zero or
    less, the draw falls back to a uniform choice.

    Parameters
    - population: Values to sample from.
    - weights: One weight per entry of ``population``, in the same order.
    - k: Number of entries to draw; ``k <= 0`` yields an empty list.
    - rng: RNG used for every draw.

    Returns the chosen values in the order they were drawn.

    Raises ``ValueError`` when ``population`` and ``weights`` differ in length
    or when ``k`` exceeds the number of available entries.
    """
    # zip() would silently drop the unmatched tail; fail loudly instead.
    if len(population) != len(weights):
        raise ValueError("population and weights must have the same length")

    selections: list[int] = []
    items = list(zip(population, weights))
    if k <= 0 or not items:
        return selections
    if k > len(items):
        raise ValueError("Sample larger than population or is negative")

    for _ in range(k):
        total_weight = sum(weight for _, weight in items)
        if total_weight <= 0:
            # No usable weights remain: pick uniformly among what is left.
            chosen_index = rng.randrange(len(items))
        else:
            threshold = rng.random() * total_weight
            cumulative = 0.0
            # Default to the last entry in case float rounding keeps the
            # running sum just below the threshold.
            chosen_index = len(items) - 1
            for idx, (_, weight) in enumerate(items):
                cumulative += weight
                if cumulative >= threshold:
                    chosen_index = idx
                    break
        value, _ = items.pop(chosen_index)
        selections.append(value)

    return selections
|
50
|
+
|
51
|
+
|
17
52
|
def _python_redact_words(
|
18
53
|
text: str,
|
19
54
|
*,
|
@@ -21,6 +56,7 @@ def _python_redact_words(
|
|
21
56
|
rate: float,
|
22
57
|
merge_adjacent: bool,
|
23
58
|
rng: random.Random,
|
59
|
+
unweighted: bool = False,
|
24
60
|
) -> str:
|
25
61
|
"""Redact random words by replacing their characters.
|
26
62
|
|
@@ -29,18 +65,39 @@ def _python_redact_words(
|
|
29
65
|
- replacement_char: The character to use for redaction (default FULL_BLOCK).
|
30
66
|
- rate: Max proportion of words to redact (required here; the public wrapper defaults to 0.025).
|
31
67
|
- merge_adjacent: If True, merges adjacent redactions across intervening non-word chars.
|
32
|
-
-
|
33
|
-
-
|
68
|
+
- rng: RNG used for sampling decisions.
|
69
|
+
- unweighted: When True, sample words uniformly instead of by length.
|
34
70
|
"""
|
35
71
|
# Preserve exact spacing and punctuation by using regex
|
36
72
|
tokens = re.split(r"(\s+)", text)
|
37
73
|
word_indices = [i for i, token in enumerate(tokens) if i % 2 == 0 and token.strip()]
|
38
74
|
if not word_indices:
|
39
|
-
raise ValueError(
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
75
|
+
raise ValueError(
|
76
|
+
"Cannot redact words because the input text contains no redactable words."
|
77
|
+
)
|
78
|
+
weights: list[float] = []
|
79
|
+
for index in word_indices:
|
80
|
+
word = tokens[index]
|
81
|
+
match = re.match(r"^(\W*)(.*?)(\W*)$", word)
|
82
|
+
core = match.group(2) if match else word
|
83
|
+
core_length = len(core) if core else len(word)
|
84
|
+
if core_length <= 0:
|
85
|
+
core_length = len(word.strip()) or len(word)
|
86
|
+
if core_length <= 0:
|
87
|
+
core_length = 1
|
88
|
+
weights.append(1.0 if unweighted else float(core_length))
|
89
|
+
raw_quota = len(word_indices) * rate
|
90
|
+
num_to_redact = int(raw_quota)
|
91
|
+
if rate > 0:
|
92
|
+
num_to_redact = max(1, num_to_redact)
|
93
|
+
if num_to_redact > len(word_indices):
|
94
|
+
raise ValueError("Sample larger than population or is negative")
|
95
|
+
indices_to_redact = _weighted_sample_without_replacement(
|
96
|
+
word_indices,
|
97
|
+
weights,
|
98
|
+
k=num_to_redact,
|
99
|
+
rng=rng,
|
100
|
+
)
|
44
101
|
indices_to_redact.sort()
|
45
102
|
|
46
103
|
for i in indices_to_redact:
|
@@ -80,13 +137,14 @@ def redact_words(
|
|
80
137
|
rng: random.Random | None = None,
|
81
138
|
*,
|
82
139
|
redaction_rate: float | None = None,
|
140
|
+
unweighted: bool = False,
|
83
141
|
) -> str:
|
84
142
|
"""Redact random words by replacing their characters."""
|
85
143
|
|
86
144
|
effective_rate = resolve_rate(
|
87
145
|
rate=rate,
|
88
146
|
legacy_value=redaction_rate,
|
89
|
-
default=0.
|
147
|
+
default=0.025,
|
90
148
|
legacy_name="redaction_rate",
|
91
149
|
)
|
92
150
|
|
@@ -94,6 +152,7 @@ def redact_words(
|
|
94
152
|
rng = random.Random(seed)
|
95
153
|
|
96
154
|
clamped_rate = max(0.0, effective_rate)
|
155
|
+
unweighted_flag = bool(unweighted)
|
97
156
|
|
98
157
|
use_rust = _redact_words_rust is not None and isinstance(merge_adjacent, bool)
|
99
158
|
|
@@ -103,6 +162,7 @@ def redact_words(
|
|
103
162
|
replacement_char,
|
104
163
|
clamped_rate,
|
105
164
|
merge_adjacent,
|
165
|
+
unweighted_flag,
|
106
166
|
rng,
|
107
167
|
)
|
108
168
|
|
@@ -112,6 +172,7 @@ def redact_words(
|
|
112
172
|
rate=clamped_rate,
|
113
173
|
merge_adjacent=merge_adjacent,
|
114
174
|
rng=rng,
|
175
|
+
unweighted=unweighted_flag,
|
115
176
|
)
|
116
177
|
|
117
178
|
|
@@ -126,12 +187,13 @@ class Redactyl(Glitchling):
|
|
126
187
|
redaction_rate: float | None = None,
|
127
188
|
merge_adjacent: bool = False,
|
128
189
|
seed: int = 151,
|
190
|
+
unweighted: bool = False,
|
129
191
|
) -> None:
|
130
192
|
self._param_aliases = {"redaction_rate": "rate"}
|
131
193
|
effective_rate = resolve_rate(
|
132
194
|
rate=rate,
|
133
195
|
legacy_value=redaction_rate,
|
134
|
-
default=0.
|
196
|
+
default=0.025,
|
135
197
|
legacy_name="redaction_rate",
|
136
198
|
)
|
137
199
|
super().__init__(
|
@@ -142,6 +204,7 @@ class Redactyl(Glitchling):
|
|
142
204
|
replacement_char=replacement_char,
|
143
205
|
rate=effective_rate,
|
144
206
|
merge_adjacent=merge_adjacent,
|
207
|
+
unweighted=unweighted,
|
145
208
|
)
|
146
209
|
|
147
210
|
def pipeline_operation(self) -> dict[str, Any] | None:
|
@@ -150,15 +213,16 @@ class Redactyl(Glitchling):
|
|
150
213
|
merge_adjacent = self.kwargs.get("merge_adjacent")
|
151
214
|
if replacement_char is None or rate is None or merge_adjacent is None:
|
152
215
|
return None
|
216
|
+
unweighted = bool(self.kwargs.get("unweighted", False))
|
153
217
|
return {
|
154
218
|
"type": "redact",
|
155
219
|
"replacement_char": str(replacement_char),
|
156
220
|
"redaction_rate": float(rate),
|
157
221
|
"merge_adjacent": bool(merge_adjacent),
|
222
|
+
"unweighted": unweighted,
|
158
223
|
}
|
159
224
|
|
160
225
|
|
161
|
-
|
162
226
|
redactyl = Redactyl()
|
163
227
|
|
164
228
|
|
glitchlings/zoo/reduple.py
CHANGED
@@ -16,14 +16,15 @@ def _python_reduplicate_words(
|
|
16
16
|
*,
|
17
17
|
rate: float,
|
18
18
|
rng: random.Random,
|
19
|
+
unweighted: bool = False,
|
19
20
|
) -> str:
|
20
21
|
"""Randomly reduplicate words in the text.
|
21
22
|
|
22
23
|
Parameters
|
23
24
|
- text: Input text.
|
24
25
|
- rate: Max proportion of words to reduplicate (required here; the public wrapper defaults to 0.01).
|
25
|
-
-
|
26
|
-
-
|
26
|
+
- rng: RNG used for sampling decisions.
|
27
|
+
- unweighted: When True, sample words uniformly instead of length-weighted.
|
27
28
|
|
28
29
|
Notes
|
29
30
|
- Preserves spacing and punctuation by tokenizing with separators.
|
@@ -32,6 +33,7 @@ def _python_reduplicate_words(
|
|
32
33
|
# Preserve exact spacing and punctuation by using regex
|
33
34
|
tokens = re.split(r"(\s+)", text) # Split but keep separators
|
34
35
|
|
36
|
+
candidate_weights: list[tuple[int, float]] = []
|
35
37
|
for i in range(0, len(tokens), 2): # Every other token is a word
|
36
38
|
if i >= len(tokens):
|
37
39
|
break
|
@@ -40,16 +42,46 @@ def _python_reduplicate_words(
|
|
40
42
|
if not word or word.isspace(): # Skip empty or whitespace
|
41
43
|
continue
|
42
44
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
45
|
+
match = re.match(r"^(\W*)(.*?)(\W*)$", word)
|
46
|
+
core = match.group(2) if match else word
|
47
|
+
core_length = len(core) if core else len(word)
|
48
|
+
if core_length <= 0:
|
49
|
+
core_length = len(word.strip()) or len(word)
|
50
|
+
if core_length <= 0:
|
51
|
+
core_length = 1
|
52
|
+
weight = 1.0 if unweighted else 1.0 / core_length
|
53
|
+
candidate_weights.append((i, weight))
|
54
|
+
|
55
|
+
if not candidate_weights:
|
56
|
+
return "".join(tokens)
|
57
|
+
|
58
|
+
effective_rate = max(rate, 0.0)
|
59
|
+
if effective_rate <= 0.0:
|
60
|
+
return "".join(tokens)
|
61
|
+
|
62
|
+
mean_weight = sum(weight for _, weight in candidate_weights) / len(
|
63
|
+
candidate_weights
|
64
|
+
)
|
65
|
+
|
66
|
+
for index, weight in candidate_weights:
|
67
|
+
if effective_rate >= 1.0:
|
68
|
+
probability = 1.0
|
69
|
+
else:
|
70
|
+
if mean_weight <= 0.0:
|
71
|
+
probability = effective_rate
|
51
72
|
else:
|
52
|
-
|
73
|
+
probability = min(1.0, effective_rate * (weight / mean_weight))
|
74
|
+
if rng.random() >= probability:
|
75
|
+
continue
|
76
|
+
|
77
|
+
word = tokens[index]
|
78
|
+
match = re.match(r"^(\W*)(.*?)(\W*)$", word)
|
79
|
+
if match:
|
80
|
+
prefix, core, suffix = match.groups()
|
81
|
+
# Reduplicate with a space: "word" -> "word word"
|
82
|
+
tokens[index] = f"{prefix}{core} {core}{suffix}"
|
83
|
+
else:
|
84
|
+
tokens[index] = f"{word} {word}"
|
53
85
|
return "".join(tokens)
|
54
86
|
|
55
87
|
|
@@ -60,6 +92,7 @@ def reduplicate_words(
|
|
60
92
|
rng: random.Random | None = None,
|
61
93
|
*,
|
62
94
|
reduplication_rate: float | None = None,
|
95
|
+
unweighted: bool = False,
|
63
96
|
) -> str:
|
64
97
|
"""Randomly reduplicate words in the text.
|
65
98
|
|
@@ -70,7 +103,7 @@ def reduplicate_words(
|
|
70
103
|
effective_rate = resolve_rate(
|
71
104
|
rate=rate,
|
72
105
|
legacy_value=reduplication_rate,
|
73
|
-
default=0.
|
106
|
+
default=0.01,
|
74
107
|
legacy_name="reduplication_rate",
|
75
108
|
)
|
76
109
|
|
@@ -78,14 +111,16 @@ def reduplicate_words(
|
|
78
111
|
rng = random.Random(seed)
|
79
112
|
|
80
113
|
clamped_rate = max(0.0, effective_rate)
|
114
|
+
unweighted_flag = bool(unweighted)
|
81
115
|
|
82
116
|
if _reduplicate_words_rust is not None:
|
83
|
-
return _reduplicate_words_rust(text, clamped_rate, rng)
|
117
|
+
return _reduplicate_words_rust(text, clamped_rate, unweighted_flag, rng)
|
84
118
|
|
85
119
|
return _python_reduplicate_words(
|
86
120
|
text,
|
87
121
|
rate=clamped_rate,
|
88
122
|
rng=rng,
|
123
|
+
unweighted=unweighted_flag,
|
89
124
|
)
|
90
125
|
|
91
126
|
|
@@ -98,12 +133,13 @@ class Reduple(Glitchling):
|
|
98
133
|
rate: float | None = None,
|
99
134
|
reduplication_rate: float | None = None,
|
100
135
|
seed: int | None = None,
|
136
|
+
unweighted: bool = False,
|
101
137
|
) -> None:
|
102
138
|
self._param_aliases = {"reduplication_rate": "rate"}
|
103
139
|
effective_rate = resolve_rate(
|
104
140
|
rate=rate,
|
105
141
|
legacy_value=reduplication_rate,
|
106
|
-
default=0.
|
142
|
+
default=0.01,
|
107
143
|
legacy_name="reduplication_rate",
|
108
144
|
)
|
109
145
|
super().__init__(
|
@@ -112,14 +148,19 @@ class Reduple(Glitchling):
|
|
112
148
|
scope=AttackWave.WORD,
|
113
149
|
seed=seed,
|
114
150
|
rate=effective_rate,
|
151
|
+
unweighted=unweighted,
|
115
152
|
)
|
116
153
|
|
117
154
|
def pipeline_operation(self) -> dict[str, Any] | None:
|
118
155
|
rate = self.kwargs.get("rate")
|
119
156
|
if rate is None:
|
120
157
|
return None
|
121
|
-
|
122
|
-
|
158
|
+
unweighted = bool(self.kwargs.get("unweighted", False))
|
159
|
+
return {
|
160
|
+
"type": "reduplicate",
|
161
|
+
"reduplication_rate": float(rate),
|
162
|
+
"unweighted": unweighted,
|
163
|
+
}
|
123
164
|
|
124
165
|
|
125
166
|
reduple = Reduple()
|
glitchlings/zoo/rushmore.py
CHANGED
@@ -17,42 +17,67 @@ def _python_delete_random_words(
|
|
17
17
|
*,
|
18
18
|
rate: float,
|
19
19
|
rng: random.Random,
|
20
|
+
unweighted: bool = False,
|
20
21
|
) -> str:
|
21
22
|
"""Delete random words from the input text while preserving whitespace."""
|
22
23
|
|
23
|
-
|
24
|
+
effective_rate = max(rate, 0.0)
|
25
|
+
if effective_rate <= 0.0:
|
24
26
|
return text
|
25
27
|
|
26
28
|
tokens = re.split(r"(\s+)", text) # Split but keep separators for later rejoin
|
27
29
|
|
28
|
-
|
30
|
+
candidate_data: list[tuple[int, float]] = []
|
29
31
|
for i in range(2, len(tokens), 2): # Every other token is a word, skip the first word
|
30
32
|
word = tokens[i]
|
31
33
|
if not word or word.isspace():
|
32
34
|
continue
|
33
35
|
|
34
|
-
|
36
|
+
match = re.match(r"^(\W*)(.*?)(\W*)$", word)
|
37
|
+
core = match.group(2) if match else word
|
38
|
+
core_length = len(core) if core else len(word)
|
39
|
+
if core_length <= 0:
|
40
|
+
core_length = len(word.strip()) or len(word)
|
41
|
+
if core_length <= 0:
|
42
|
+
core_length = 1
|
43
|
+
weight = 1.0 if unweighted else 1.0 / core_length
|
44
|
+
candidate_data.append((i, weight))
|
45
|
+
|
46
|
+
if not candidate_data:
|
47
|
+
return text
|
35
48
|
|
36
49
|
allowed_deletions = min(
|
37
|
-
len(
|
50
|
+
len(candidate_data), math.floor(len(candidate_data) * effective_rate)
|
38
51
|
)
|
39
52
|
if allowed_deletions <= 0:
|
40
53
|
return text
|
41
54
|
|
55
|
+
mean_weight = sum(weight for _, weight in candidate_data) / len(candidate_data)
|
56
|
+
|
42
57
|
deletions = 0
|
43
|
-
for
|
44
|
-
if
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
58
|
+
for index, weight in candidate_data:
|
59
|
+
if deletions >= allowed_deletions:
|
60
|
+
break
|
61
|
+
|
62
|
+
if effective_rate >= 1.0:
|
63
|
+
probability = 1.0
|
64
|
+
else:
|
65
|
+
if mean_weight <= 0.0:
|
66
|
+
probability = effective_rate
|
50
67
|
else:
|
51
|
-
|
68
|
+
probability = min(1.0, effective_rate * (weight / mean_weight))
|
69
|
+
if rng.random() >= probability:
|
70
|
+
continue
|
71
|
+
|
72
|
+
word = tokens[index]
|
73
|
+
match = re.match(r"^(\W*)(.*?)(\W*)$", word)
|
74
|
+
if match:
|
75
|
+
prefix, _, suffix = match.groups()
|
76
|
+
tokens[index] = f"{prefix.strip()}{suffix.strip()}"
|
77
|
+
else:
|
78
|
+
tokens[index] = ""
|
52
79
|
|
53
|
-
|
54
|
-
if deletions >= allowed_deletions:
|
55
|
-
break
|
80
|
+
deletions += 1
|
56
81
|
|
57
82
|
text = "".join(tokens)
|
58
83
|
text = re.sub(r"\s+([.,;:])", r"\1", text)
|
@@ -68,6 +93,7 @@ def delete_random_words(
|
|
68
93
|
rng: random.Random | None = None,
|
69
94
|
*,
|
70
95
|
max_deletion_rate: float | None = None,
|
96
|
+
unweighted: bool = False,
|
71
97
|
) -> str:
|
72
98
|
"""Delete random words from the input text.
|
73
99
|
|
@@ -85,14 +111,16 @@ def delete_random_words(
|
|
85
111
|
rng = random.Random(seed)
|
86
112
|
|
87
113
|
clamped_rate = max(0.0, effective_rate)
|
114
|
+
unweighted_flag = bool(unweighted)
|
88
115
|
|
89
116
|
if _delete_random_words_rust is not None:
|
90
|
-
return _delete_random_words_rust(text, clamped_rate, rng)
|
117
|
+
return _delete_random_words_rust(text, clamped_rate, unweighted_flag, rng)
|
91
118
|
|
92
119
|
return _python_delete_random_words(
|
93
120
|
text,
|
94
121
|
rate=clamped_rate,
|
95
122
|
rng=rng,
|
123
|
+
unweighted=unweighted_flag,
|
96
124
|
)
|
97
125
|
|
98
126
|
|
@@ -105,6 +133,7 @@ class Rushmore(Glitchling):
|
|
105
133
|
rate: float | None = None,
|
106
134
|
max_deletion_rate: float | None = None,
|
107
135
|
seed: int | None = None,
|
136
|
+
unweighted: bool = False,
|
108
137
|
) -> None:
|
109
138
|
self._param_aliases = {"max_deletion_rate": "rate"}
|
110
139
|
effective_rate = resolve_rate(
|
@@ -119,6 +148,7 @@ class Rushmore(Glitchling):
|
|
119
148
|
scope=AttackWave.WORD,
|
120
149
|
seed=seed,
|
121
150
|
rate=effective_rate,
|
151
|
+
unweighted=unweighted,
|
122
152
|
)
|
123
153
|
|
124
154
|
def pipeline_operation(self) -> dict[str, Any] | None:
|
@@ -127,7 +157,12 @@ class Rushmore(Glitchling):
|
|
127
157
|
rate = self.kwargs.get("max_deletion_rate")
|
128
158
|
if rate is None:
|
129
159
|
return None
|
130
|
-
|
160
|
+
unweighted = bool(self.kwargs.get("unweighted", False))
|
161
|
+
return {
|
162
|
+
"type": "delete",
|
163
|
+
"max_deletion_rate": float(rate),
|
164
|
+
"unweighted": unweighted,
|
165
|
+
}
|
131
166
|
|
132
167
|
|
133
168
|
rushmore = Rushmore()
|
@@ -0,0 +1,144 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import math
|
4
|
+
import random
|
5
|
+
from collections.abc import Sequence
|
6
|
+
|
7
|
+
from .core import Glitchling, AttackWave, AttackOrder
|
8
|
+
from ._rate import resolve_rate
|
9
|
+
|
10
|
+
try:
|
11
|
+
from glitchlings._zoo_rust import inject_zero_widths as _inject_zero_widths_rust
|
12
|
+
except ImportError: # pragma: no cover - compiled extension not present
|
13
|
+
_inject_zero_widths_rust = None
|
14
|
+
|
15
|
+
_DEFAULT_ZERO_WIDTH_CHARACTERS: tuple[str, ...] = (
|
16
|
+
"\u200b", # ZERO WIDTH SPACE
|
17
|
+
"\u200c", # ZERO WIDTH NON-JOINER
|
18
|
+
"\u200d", # ZERO WIDTH JOINER
|
19
|
+
"\ufeff", # ZERO WIDTH NO-BREAK SPACE
|
20
|
+
"\u2060", # WORD JOINER
|
21
|
+
)
|
22
|
+
|
23
|
+
|
24
|
+
def _python_insert_zero_widths(
    text: str,
    *,
    rate: float,
    rng: random.Random,
    characters: Sequence[str],
) -> str:
    """Insert strings from ``characters`` between adjacent non-space characters.

    An eligible position is the gap between two neighbouring characters of
    ``text`` that are both non-whitespace. The number of insertions is
    ``rate * eligible_positions``; a fractional part is resolved by a single
    random draw, and the result is capped at the number of eligible positions.

    Parameters
    - text: Input text; returned unchanged when empty.
    - rate: Proportion of eligible positions to fill. Values at or below zero
      leave the text unchanged; an infinite rate fills every position.
    - rng: RNG used for the rounding draw, position sampling and glyph choice.
    - characters: Candidate strings to insert; empty strings are ignored.

    Returns the text with the chosen strings spliced in.
    """
    if not text:
        return text

    palette = [char for char in characters if char]
    if not palette:
        return text

    positions = [
        index + 1
        for index in range(len(text) - 1)
        if not text[index].isspace() and not text[index + 1].isspace()
    ]
    if not positions:
        return text

    total = len(positions)
    clamped_rate = max(0.0, rate)
    if clamped_rate <= 0.0:
        return text

    target = clamped_rate * total
    if math.isinf(target):
        # math.floor(inf) raises OverflowError; an unbounded rate simply
        # means "fill every eligible position".
        count = total
    else:
        count = math.floor(target)
        remainder = target - count
        if remainder > 0.0 and rng.random() < remainder:
            count += 1
        count = min(total, count)

    if count <= 0:
        return text

    chosen = rng.sample(positions, count)
    chosen.sort()

    # Draw glyphs from the highest position downwards so the RNG is consumed
    # in the same order as inserting back-to-front would consume it.
    glyph_at = {position: rng.choice(palette) for position in reversed(chosen)}

    # Assemble the result in one pass instead of shifting a list per insert.
    pieces: list[str] = []
    cursor = 0
    for position in chosen:
        pieces.append(text[cursor:position])
        pieces.append(glyph_at[position])
        cursor = position
    pieces.append(text[cursor:])

    return "".join(pieces)
|
69
|
+
|
70
|
+
|
71
|
+
def insert_zero_widths(
    text: str,
    rate: float | None = None,
    seed: int | None = None,
    rng: random.Random | None = None,
    *,
    characters: Sequence[str] | None = None,
) -> str:
    """Scatter zero-width strings between neighbouring non-space characters.

    Parameters
    - text: Input text; returned unchanged when empty.
    - rate: Insertion rate, resolved through ``resolve_rate`` with a default
      of 0.02. Negative values are clamped to zero, which leaves the text
      unchanged.
    - seed: Seed for a fresh ``random.Random`` when ``rng`` is not given.
    - rng: RNG to use; takes precedence over ``seed``.
    - characters: Strings to inject; falls back to
      ``_DEFAULT_ZERO_WIDTH_CHARACTERS`` when ``None``. Empty strings are
      dropped, and the text is returned unchanged if nothing remains.

    Dispatches to ``_inject_zero_widths_rust`` when it is not ``None`` and to
    ``_python_insert_zero_widths`` otherwise.
    """
    resolved = resolve_rate(
        rate=rate,
        legacy_value=None,
        default=0.02,
        legacy_name="rate",
    )

    generator = rng if rng is not None else random.Random(seed)

    source: Sequence[str] = (
        _DEFAULT_ZERO_WIDTH_CHARACTERS if characters is None else tuple(characters)
    )
    glyphs = tuple(filter(None, source))  # discard empty strings
    if not (glyphs and text):
        return text

    bounded = max(0.0, resolved)
    if bounded == 0.0:
        return text

    if _inject_zero_widths_rust is None:
        return _python_insert_zero_widths(
            text,
            rate=bounded,
            rng=generator,
            characters=glyphs,
        )

    return _inject_zero_widths_rust(text, bounded, list(glyphs), generator)
|
112
|
+
|
113
|
+
|
114
|
+
class Zeedub(Glitchling):
    """Glitchling that hides zero-width glyphs inside words."""

    def __init__(
        self,
        *,
        rate: float | None = None,
        seed: int | None = None,
        characters: Sequence[str] | None = None,
    ) -> None:
        """Configure the glitchling.

        Parameters
        - rate: Insertion rate; resolved through ``resolve_rate`` with a
          default of 0.02.
        - seed: Seed forwarded to the base class.
        - characters: Optional pool of strings to inject; stored as a tuple,
          or ``None`` when not supplied.
        """
        effective_rate = resolve_rate(
            rate=rate,
            legacy_value=None,
            default=0.02,
            legacy_name="rate",
        )
        # Freeze the pool so the stored kwarg cannot be mutated by the caller.
        frozen_pool = None if characters is None else tuple(characters)
        super().__init__(
            name="Zeedub",
            corruption_function=insert_zero_widths,
            scope=AttackWave.CHARACTER,
            order=AttackOrder.LAST,
            seed=seed,
            rate=effective_rate,
            characters=frozen_pool,
        )
|
139
|
+
|
140
|
+
|
141
|
+
zeedub = Zeedub()
|
142
|
+
|
143
|
+
|
144
|
+
__all__ = ["Zeedub", "zeedub", "insert_zero_widths"]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: glitchlings
|
3
|
-
Version: 0.2.
|
3
|
+
Version: 0.2.6
|
4
4
|
Summary: Monsters for your language games.
|
5
5
|
Author: osoleve
|
6
6
|
License: Apache License
|
@@ -296,7 +296,7 @@ print(gaggle(SAMPLE_TEXT))
|
|
296
296
|
|
297
297
|
Consult the [Glitchlings Usage Guide](docs/index.md)
|
298
298
|
for end-to-end instructions spanning the Python API, CLI, HuggingFace and Prime Intellect
|
299
|
-
integrations, and the
|
299
|
+
integrations, and the autodetected Rust pipeline (enabled whenever the extension is present).
|
300
300
|
|
301
301
|
## Motivation
|
302
302
|
|
@@ -396,6 +396,18 @@ _How can a computer need reading glasses?_
|
|
396
396
|
> - `rate (float)`: The maximum proportion of eligible confusion spans to replace (default: 0.02, 2%).
|
397
397
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
398
398
|
|
399
|
+
### Zeedub
|
400
|
+
|
401
|
+
_A whispering glyph parasite that lives in the interstices of codepoints, marking territory with invisible traces._
|
402
|
+
|
403
|
+
> _**Invisible Ink.**_ Zeedub slips zero-width codepoints between non-space character pairs, forcing models to reason about text whose visible form masks hidden glyphs.
|
404
|
+
>
|
405
|
+
> Args
|
406
|
+
>
|
407
|
+
> - `rate (float)`: Expected number of zero-width insertions as a proportion of eligible bigrams (default: 0.02, 2%).
|
408
|
+
> - `characters (Sequence[str])`: Optional override for the pool of zero-width strings to inject (default: curated invisibles such as U+200B, U+200C, U+200D, U+FEFF, U+2060).
|
409
|
+
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
410
|
+
|
399
411
|
### Jargoyle
|
400
412
|
|
401
413
|
_Uh oh. The worst person you know just bought a thesaurus._
|
@@ -0,0 +1,27 @@
|
|
1
|
+
glitchlings/__init__.py,sha256=fjerquRITZQY_rY5mhTVVQyeGAz1qTpgicvDhbpqgi8,678
|
2
|
+
glitchlings/__main__.py,sha256=pqNe1C9hMf8pap4oh6x6yo2h4Nsa2RFSaMWHfGtNXj0,130
|
3
|
+
glitchlings/_zoo_rust.cp312-win_amd64.pyd,sha256=ix_yBFGWDMdQqMgg0-RlHdnayxiTeJd_N1SQN5XfDhc,2019328
|
4
|
+
glitchlings/main.py,sha256=QrSSLWcKh1_NDfJDGh-3UVKdI7AkzfMy6Jz1ouxIgnE,6149
|
5
|
+
glitchlings/dlc/__init__.py,sha256=IHD-GGhVFb7SVzErvf2YCJkOR4wGo0nFHXkn_daMvS8,146
|
6
|
+
glitchlings/dlc/huggingface.py,sha256=PIesnDIEvyJxj1IuLw2P9nVPTr4Nv81XM7w2axfyhkA,3029
|
7
|
+
glitchlings/dlc/prime.py,sha256=b5CE1qDl5MxZjTudlKrqMsmSGxXNKZ16krqPyrr2nK8,9569
|
8
|
+
glitchlings/util/__init__.py,sha256=GoyQuHTfGRkHzuZwJji6QWSiGd_LHa9QiyjjEpBFW7E,4679
|
9
|
+
glitchlings/zoo/__init__.py,sha256=mAhsnR3ZK9BocxT3J4WF6JcYQMYI9e_EYZ-GMxHv0P4,4420
|
10
|
+
glitchlings/zoo/_ocr_confusions.py,sha256=W59Aa5MBDwRF65f8GV-6XwGAmlR5Uk7pa5qvHvhIYdY,1252
|
11
|
+
glitchlings/zoo/_rate.py,sha256=EYUWXYyR2IK0zYBWyBOlnUjDxU32JE9mZTZeodVx5CA,548
|
12
|
+
glitchlings/zoo/core.py,sha256=B0E9ycaG9oAOSf2n2gJ4iFbWSjhRUDOdlLIJYeWqICs,14649
|
13
|
+
glitchlings/zoo/jargoyle.py,sha256=1fnL_8bv1Y-T2h1C6NRzIylYyOuAUI-BiMReFewqh00,11002
|
14
|
+
glitchlings/zoo/mim1c.py,sha256=3ddNOzWgLABuEOh5T98Xk439ejx-YHGI7ErXET03Crc,3537
|
15
|
+
glitchlings/zoo/ocr_confusions.tsv,sha256=S-IJEYCIXYKT1Uu7Id8Lnvg5pw528yNigTtWUdnMv9k,213
|
16
|
+
glitchlings/zoo/redactyl.py,sha256=jPEzP9b6_AizY1xOP4LgjJwWdpZv3Qr7JQPWK96609Y,7012
|
17
|
+
glitchlings/zoo/reduple.py,sha256=oGkOkH9bJiG-ogsi5ewglq6FUmzvRM6UC4N61LyNdvk,5026
|
18
|
+
glitchlings/zoo/rushmore.py,sha256=RE-br8OAIBRil3Mz381OcdMtb1fuNCZ7LzAjt44hFkM,4900
|
19
|
+
glitchlings/zoo/scannequin.py,sha256=TJyNYTTIB7rxZH3XKIETy0YVf4EjsMgGWYmYaxH9jxU,5030
|
20
|
+
glitchlings/zoo/typogre.py,sha256=olTTXDmFkVQ3r-T1vxm2mLomRvIDXHrNHfgin316wzE,6221
|
21
|
+
glitchlings/zoo/zeedub.py,sha256=n1qTKE_Dl0m8SEKhaP91oHAyJ484NxaGLPu_ZLr0Ldo,3696
|
22
|
+
glitchlings-0.2.6.dist-info/licenses/LICENSE,sha256=EFEP1evBfHaxsMTBjxm0sZVRp2wct8QLvHE1saII5FI,11538
|
23
|
+
glitchlings-0.2.6.dist-info/METADATA,sha256=76zHWRnP4ZRH1iVdyu0jujx1zYLVenFYtrbC6i_NTsg,27239
|
24
|
+
glitchlings-0.2.6.dist-info/WHEEL,sha256=8UP9x9puWI0P1V_d7K2oMTBqfeLNm21CTzZ_Ptr0NXU,101
|
25
|
+
glitchlings-0.2.6.dist-info/entry_points.txt,sha256=kGOwuAsjFDLtztLisaXtOouq9wFVMOJg5FzaAkg-Hto,54
|
26
|
+
glitchlings-0.2.6.dist-info/top_level.txt,sha256=VHFNBrLjtDwPCYXbGKi6o17Eueedi81eNbR3hBOoST0,12
|
27
|
+
glitchlings-0.2.6.dist-info/RECORD,,
|
@@ -1,26 +0,0 @@
|
|
1
|
-
glitchlings/__init__.py,sha256=w8heFqUejrXM_9NNlM9CQnIGkmGUyBV29acg3WsocXA,622
|
2
|
-
glitchlings/__main__.py,sha256=pqNe1C9hMf8pap4oh6x6yo2h4Nsa2RFSaMWHfGtNXj0,130
|
3
|
-
glitchlings/_zoo_rust.cp312-win_amd64.pyd,sha256=qHk8hPmRrzJTwOyhcBNr-2qhXBaEBUy__7_SMFhzWSc,1989632
|
4
|
-
glitchlings/main.py,sha256=QrSSLWcKh1_NDfJDGh-3UVKdI7AkzfMy6Jz1ouxIgnE,6149
|
5
|
-
glitchlings/dlc/__init__.py,sha256=IHD-GGhVFb7SVzErvf2YCJkOR4wGo0nFHXkn_daMvS8,146
|
6
|
-
glitchlings/dlc/huggingface.py,sha256=PIesnDIEvyJxj1IuLw2P9nVPTr4Nv81XM7w2axfyhkA,3029
|
7
|
-
glitchlings/dlc/prime.py,sha256=hySyYBncUM-49j6JtrHYO6c3HpbG2vTt2EYZnOJ85C0,8972
|
8
|
-
glitchlings/util/__init__.py,sha256=GoyQuHTfGRkHzuZwJji6QWSiGd_LHa9QiyjjEpBFW7E,4679
|
9
|
-
glitchlings/zoo/__init__.py,sha256=kYKKlNvEwKtrD26E1hfde33rkN83CMf_h5AQFGjQyBQ,4312
|
10
|
-
glitchlings/zoo/_ocr_confusions.py,sha256=W59Aa5MBDwRF65f8GV-6XwGAmlR5Uk7pa5qvHvhIYdY,1252
|
11
|
-
glitchlings/zoo/_rate.py,sha256=EYUWXYyR2IK0zYBWyBOlnUjDxU32JE9mZTZeodVx5CA,548
|
12
|
-
glitchlings/zoo/core.py,sha256=QKHmzmONNkiA3RdfgLdNx-FPFwoH4Bm-Tkc3vSCHNpc,14412
|
13
|
-
glitchlings/zoo/jargoyle.py,sha256=1fnL_8bv1Y-T2h1C6NRzIylYyOuAUI-BiMReFewqh00,11002
|
14
|
-
glitchlings/zoo/mim1c.py,sha256=3ddNOzWgLABuEOh5T98Xk439ejx-YHGI7ErXET03Crc,3537
|
15
|
-
glitchlings/zoo/ocr_confusions.tsv,sha256=S-IJEYCIXYKT1Uu7Id8Lnvg5pw528yNigTtWUdnMv9k,213
|
16
|
-
glitchlings/zoo/redactyl.py,sha256=dM3W59xLhuiS8t5jXETc_L8EEhRN1CpLazBnVPiSknk,4834
|
17
|
-
glitchlings/zoo/reduple.py,sha256=9jid6tCvCaiSxWSPMNuHWZitd7et60RRFYeek3S0ElU,3641
|
18
|
-
glitchlings/zoo/rushmore.py,sha256=pJy3g_H1z8PNoHitvD3-HsytAuE0U6FOdsdaKZy6OqY,3680
|
19
|
-
glitchlings/zoo/scannequin.py,sha256=TJyNYTTIB7rxZH3XKIETy0YVf4EjsMgGWYmYaxH9jxU,5030
|
20
|
-
glitchlings/zoo/typogre.py,sha256=olTTXDmFkVQ3r-T1vxm2mLomRvIDXHrNHfgin316wzE,6221
|
21
|
-
glitchlings-0.2.4.dist-info/licenses/LICENSE,sha256=EFEP1evBfHaxsMTBjxm0sZVRp2wct8QLvHE1saII5FI,11538
|
22
|
-
glitchlings-0.2.4.dist-info/METADATA,sha256=mGKlfmodtLjWsfrz6O0cLk4DDPFeUO5vt6LKgw-uu-M,26513
|
23
|
-
glitchlings-0.2.4.dist-info/WHEEL,sha256=8UP9x9puWI0P1V_d7K2oMTBqfeLNm21CTzZ_Ptr0NXU,101
|
24
|
-
glitchlings-0.2.4.dist-info/entry_points.txt,sha256=kGOwuAsjFDLtztLisaXtOouq9wFVMOJg5FzaAkg-Hto,54
|
25
|
-
glitchlings-0.2.4.dist-info/top_level.txt,sha256=VHFNBrLjtDwPCYXbGKi6o17Eueedi81eNbR3hBOoST0,12
|
26
|
-
glitchlings-0.2.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|