glitchlings 0.2.5__cp310-cp310-manylinux_2_28_x86_64.whl → 0.3.0__cp310-cp310-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings/__init__.py +4 -0
- glitchlings/_zoo_rust.cpython-310-x86_64-linux-gnu.so +0 -0
- glitchlings/dlc/prime.py +18 -1
- glitchlings/zoo/__init__.py +5 -1
- glitchlings/zoo/_text_utils.py +42 -0
- glitchlings/zoo/adjax.py +131 -0
- glitchlings/zoo/core.py +28 -18
- glitchlings/zoo/redactyl.py +16 -20
- glitchlings/zoo/reduple.py +13 -24
- glitchlings/zoo/rushmore.py +12 -17
- {glitchlings-0.2.5.dist-info → glitchlings-0.3.0.dist-info}/METADATA +21 -6
- glitchlings-0.3.0.dist-info/RECORD +29 -0
- glitchlings-0.2.5.dist-info/RECORD +0 -27
- {glitchlings-0.2.5.dist-info → glitchlings-0.3.0.dist-info}/WHEEL +0 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.3.0.dist-info}/entry_points.txt +0 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.3.0.dist-info}/top_level.txt +0 -0
glitchlings/__init__.py
CHANGED
@@ -5,6 +5,8 @@ from .zoo import (
|
|
5
5
|
mim1c,
|
6
6
|
Jargoyle,
|
7
7
|
jargoyle,
|
8
|
+
Adjax,
|
9
|
+
adjax,
|
8
10
|
Redactyl,
|
9
11
|
redactyl,
|
10
12
|
Reduple,
|
@@ -29,6 +31,8 @@ __all__ = [
|
|
29
31
|
"mim1c",
|
30
32
|
"Jargoyle",
|
31
33
|
"jargoyle",
|
34
|
+
"Adjax",
|
35
|
+
"adjax",
|
32
36
|
"Redactyl",
|
33
37
|
"redactyl",
|
34
38
|
"Reduple",
|
Binary file
|
glitchlings/dlc/prime.py
CHANGED
@@ -49,7 +49,24 @@ def _resolve_columns(dataset: Dataset, columns: Sequence[str] | None) -> list[st
|
|
49
49
|
if candidate in available:
|
50
50
|
return [candidate]
|
51
51
|
|
52
|
-
|
52
|
+
try:
|
53
|
+
dataset_length = len(dataset) # type: ignore[arg-type]
|
54
|
+
except TypeError:
|
55
|
+
preview_rows: list[dict[str, Any]]
|
56
|
+
take_fn = getattr(dataset, "take", None)
|
57
|
+
if callable(take_fn):
|
58
|
+
preview_rows = list(take_fn(1))
|
59
|
+
else:
|
60
|
+
iterator = iter(dataset)
|
61
|
+
try:
|
62
|
+
first_row = next(iterator)
|
63
|
+
except StopIteration:
|
64
|
+
preview_rows = []
|
65
|
+
else:
|
66
|
+
preview_rows = [first_row]
|
67
|
+
sample = dict(preview_rows[0]) if preview_rows else {}
|
68
|
+
else:
|
69
|
+
sample = dataset[0] if dataset_length else {}
|
53
70
|
inferred = [
|
54
71
|
name
|
55
72
|
for name in dataset.column_names
|
glitchlings/zoo/__init__.py
CHANGED
@@ -6,6 +6,7 @@ from typing import Any
|
|
6
6
|
from .typogre import Typogre, typogre
|
7
7
|
from .mim1c import Mim1c, mim1c
|
8
8
|
from .jargoyle import Jargoyle, jargoyle, dependencies_available as _jargoyle_available
|
9
|
+
from .adjax import Adjax, adjax
|
9
10
|
from .reduple import Reduple, reduple
|
10
11
|
from .rushmore import Rushmore, rushmore
|
11
12
|
from .redactyl import Redactyl, redactyl
|
@@ -20,6 +21,8 @@ __all__ = [
|
|
20
21
|
"mim1c",
|
21
22
|
"Jargoyle",
|
22
23
|
"jargoyle",
|
24
|
+
"Adjax",
|
25
|
+
"adjax",
|
23
26
|
"Reduple",
|
24
27
|
"reduple",
|
25
28
|
"Rushmore",
|
@@ -43,7 +46,7 @@ _HAS_JARGOYLE = _jargoyle_available()
|
|
43
46
|
_BUILTIN_GLITCHLING_LIST: list[Glitchling] = [typogre, mim1c]
|
44
47
|
if _HAS_JARGOYLE:
|
45
48
|
_BUILTIN_GLITCHLING_LIST.append(jargoyle)
|
46
|
-
_BUILTIN_GLITCHLING_LIST.extend([reduple, rushmore, redactyl, scannequin, zeedub])
|
49
|
+
_BUILTIN_GLITCHLING_LIST.extend([adjax, reduple, rushmore, redactyl, scannequin, zeedub])
|
47
50
|
|
48
51
|
BUILTIN_GLITCHLINGS: dict[str, Glitchling] = {
|
49
52
|
glitchling.name.lower(): glitchling for glitchling in _BUILTIN_GLITCHLING_LIST
|
@@ -52,6 +55,7 @@ BUILTIN_GLITCHLINGS: dict[str, Glitchling] = {
|
|
52
55
|
_BUILTIN_GLITCHLING_TYPES: dict[str, type[Glitchling]] = {
|
53
56
|
typogre.name.lower(): Typogre,
|
54
57
|
mim1c.name.lower(): Mim1c,
|
58
|
+
adjax.name.lower(): Adjax,
|
55
59
|
reduple.name.lower(): Reduple,
|
56
60
|
rushmore.name.lower(): Rushmore,
|
57
61
|
redactyl.name.lower(): Redactyl,
|
@@ -0,0 +1,42 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import re
|
4
|
+
|
5
|
+
_WORD_SPLIT_PATTERN = re.compile(r"(\s+)")
|
6
|
+
_TOKEN_EDGES_PATTERN = re.compile(r"^(\W*)(.*?)(\W*)$")
|
7
|
+
|
8
|
+
|
9
|
+
def split_preserving_whitespace(text: str) -> list[str]:
|
10
|
+
"""Split text while keeping whitespace tokens for stable reconstruction."""
|
11
|
+
|
12
|
+
return _WORD_SPLIT_PATTERN.split(text)
|
13
|
+
|
14
|
+
|
15
|
+
def split_token_edges(token: str) -> tuple[str, str, str]:
|
16
|
+
"""Return leading, core, and trailing segments for a token."""
|
17
|
+
|
18
|
+
match = _TOKEN_EDGES_PATTERN.match(token)
|
19
|
+
if match is None:
|
20
|
+
return "", token, ""
|
21
|
+
return match.group(1), match.group(2), match.group(3)
|
22
|
+
|
23
|
+
|
24
|
+
def token_core_length(token: str) -> int:
|
25
|
+
"""Return the length of the main word characters for weighting heuristics."""
|
26
|
+
|
27
|
+
_, core, _ = split_token_edges(token)
|
28
|
+
candidate = core if core else token
|
29
|
+
length = len(candidate)
|
30
|
+
if length <= 0:
|
31
|
+
stripped = token.strip()
|
32
|
+
length = len(stripped) if stripped else len(token)
|
33
|
+
if length <= 0:
|
34
|
+
length = 1
|
35
|
+
return length
|
36
|
+
|
37
|
+
|
38
|
+
__all__ = [
|
39
|
+
"split_preserving_whitespace",
|
40
|
+
"split_token_edges",
|
41
|
+
"token_core_length",
|
42
|
+
]
|
glitchlings/zoo/adjax.py
ADDED
@@ -0,0 +1,131 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import random
|
4
|
+
from typing import Any
|
5
|
+
|
6
|
+
from ._rate import resolve_rate
|
7
|
+
from ._text_utils import split_preserving_whitespace, split_token_edges
|
8
|
+
from .core import AttackWave, Glitchling
|
9
|
+
|
10
|
+
try:
|
11
|
+
from glitchlings._zoo_rust import swap_adjacent_words as _swap_adjacent_words_rust
|
12
|
+
except ImportError: # pragma: no cover - optional acceleration
|
13
|
+
_swap_adjacent_words_rust = None
|
14
|
+
|
15
|
+
|
16
|
+
def _python_swap_adjacent_words(
|
17
|
+
text: str,
|
18
|
+
*,
|
19
|
+
rate: float,
|
20
|
+
rng: random.Random,
|
21
|
+
) -> str:
|
22
|
+
"""Swap the cores of adjacent words while keeping affixes and spacing intact."""
|
23
|
+
|
24
|
+
tokens = split_preserving_whitespace(text)
|
25
|
+
if len(tokens) < 2:
|
26
|
+
return text
|
27
|
+
|
28
|
+
word_indices: list[int] = []
|
29
|
+
for index in range(len(tokens)):
|
30
|
+
token = tokens[index]
|
31
|
+
if not token or token.isspace():
|
32
|
+
continue
|
33
|
+
if index % 2 == 0:
|
34
|
+
word_indices.append(index)
|
35
|
+
|
36
|
+
if len(word_indices) < 2:
|
37
|
+
return text
|
38
|
+
|
39
|
+
clamped = max(0.0, min(rate, 1.0))
|
40
|
+
if clamped <= 0.0:
|
41
|
+
return text
|
42
|
+
|
43
|
+
for cursor in range(0, len(word_indices) - 1, 2):
|
44
|
+
left_index = word_indices[cursor]
|
45
|
+
right_index = word_indices[cursor + 1]
|
46
|
+
|
47
|
+
left_token = tokens[left_index]
|
48
|
+
right_token = tokens[right_index]
|
49
|
+
|
50
|
+
left_prefix, left_core, left_suffix = split_token_edges(left_token)
|
51
|
+
right_prefix, right_core, right_suffix = split_token_edges(right_token)
|
52
|
+
|
53
|
+
if not left_core or not right_core:
|
54
|
+
continue
|
55
|
+
|
56
|
+
should_swap = clamped >= 1.0 or rng.random() < clamped
|
57
|
+
if not should_swap:
|
58
|
+
continue
|
59
|
+
|
60
|
+
tokens[left_index] = f"{left_prefix}{right_core}{left_suffix}"
|
61
|
+
tokens[right_index] = f"{right_prefix}{left_core}{right_suffix}"
|
62
|
+
|
63
|
+
return "".join(tokens)
|
64
|
+
|
65
|
+
|
66
|
+
def swap_adjacent_words(
|
67
|
+
text: str,
|
68
|
+
rate: float | None = None,
|
69
|
+
seed: int | None = None,
|
70
|
+
rng: random.Random | None = None,
|
71
|
+
*,
|
72
|
+
swap_rate: float | None = None,
|
73
|
+
) -> str:
|
74
|
+
"""Swap adjacent word cores while preserving spacing and punctuation."""
|
75
|
+
|
76
|
+
effective_rate = resolve_rate(
|
77
|
+
rate=rate,
|
78
|
+
legacy_value=swap_rate,
|
79
|
+
default=0.5,
|
80
|
+
legacy_name="swap_rate",
|
81
|
+
)
|
82
|
+
clamped_rate = max(0.0, min(effective_rate, 1.0))
|
83
|
+
|
84
|
+
if rng is None:
|
85
|
+
rng = random.Random(seed)
|
86
|
+
|
87
|
+
if _swap_adjacent_words_rust is not None:
|
88
|
+
return _swap_adjacent_words_rust(text, clamped_rate, rng)
|
89
|
+
|
90
|
+
return _python_swap_adjacent_words(text, rate=clamped_rate, rng=rng)
|
91
|
+
|
92
|
+
|
93
|
+
class Adjax(Glitchling):
|
94
|
+
"""Glitchling that swaps adjacent words to scramble local semantics."""
|
95
|
+
|
96
|
+
def __init__(
|
97
|
+
self,
|
98
|
+
*,
|
99
|
+
rate: float | None = None,
|
100
|
+
swap_rate: float | None = None,
|
101
|
+
seed: int | None = None,
|
102
|
+
) -> None:
|
103
|
+
self._param_aliases = {"swap_rate": "rate"}
|
104
|
+
effective_rate = resolve_rate(
|
105
|
+
rate=rate,
|
106
|
+
legacy_value=swap_rate,
|
107
|
+
default=0.5,
|
108
|
+
legacy_name="swap_rate",
|
109
|
+
)
|
110
|
+
super().__init__(
|
111
|
+
name="Adjax",
|
112
|
+
corruption_function=swap_adjacent_words,
|
113
|
+
scope=AttackWave.WORD,
|
114
|
+
seed=seed,
|
115
|
+
rate=effective_rate,
|
116
|
+
)
|
117
|
+
|
118
|
+
def pipeline_operation(self) -> dict[str, Any] | None:
|
119
|
+
rate = self.kwargs.get("rate")
|
120
|
+
if rate is None:
|
121
|
+
return None
|
122
|
+
return {
|
123
|
+
"type": "swap_adjacent",
|
124
|
+
"swap_rate": float(rate),
|
125
|
+
}
|
126
|
+
|
127
|
+
|
128
|
+
adjax = Adjax()
|
129
|
+
|
130
|
+
|
131
|
+
__all__ = ["Adjax", "adjax", "swap_adjacent_words"]
|
glitchlings/zoo/core.py
CHANGED
@@ -27,17 +27,25 @@ log = logging.getLogger(__name__)
|
|
27
27
|
|
28
28
|
|
29
29
|
_PIPELINE_FEATURE_FLAG_ENV = "GLITCHLINGS_RUST_PIPELINE"
|
30
|
+
_PIPELINE_ENABLE_VALUES = {"1", "true", "yes", "on"}
|
31
|
+
_PIPELINE_DISABLE_VALUES = {"0", "false", "no", "off"}
|
30
32
|
|
31
33
|
|
32
34
|
def _pipeline_feature_flag_enabled() -> bool:
|
33
|
-
"""Return ``True`` when the environment explicitly
|
35
|
+
"""Return ``True`` when the environment does not explicitly disable the Rust pipeline."""
|
34
36
|
|
35
37
|
value = os.environ.get(_PIPELINE_FEATURE_FLAG_ENV)
|
36
38
|
if value is None:
|
37
|
-
return
|
39
|
+
return True
|
38
40
|
|
39
41
|
normalized = value.strip().lower()
|
40
|
-
|
42
|
+
if normalized in _PIPELINE_DISABLE_VALUES:
|
43
|
+
return False
|
44
|
+
|
45
|
+
if normalized in _PIPELINE_ENABLE_VALUES:
|
46
|
+
return True
|
47
|
+
|
48
|
+
return True
|
41
49
|
|
42
50
|
if TYPE_CHECKING: # pragma: no cover - typing only
|
43
51
|
from datasets import Dataset # type: ignore
|
@@ -51,18 +59,26 @@ else:
|
|
51
59
|
def with_transform(self, function: Any) -> "Dataset": ...
|
52
60
|
|
53
61
|
|
54
|
-
def _is_transcript(
|
55
|
-
|
62
|
+
def _is_transcript(
|
63
|
+
value: Any,
|
64
|
+
*,
|
65
|
+
allow_empty: bool = True,
|
66
|
+
require_all_content: bool = False,
|
67
|
+
) -> bool:
|
68
|
+
"""Return `True` when `value` appears to be a chat transcript."""
|
56
69
|
|
57
70
|
if not isinstance(value, list):
|
58
71
|
return False
|
59
72
|
|
60
73
|
if not value:
|
61
|
-
return
|
74
|
+
return allow_empty
|
62
75
|
|
63
76
|
if not all(isinstance(turn, dict) for turn in value):
|
64
77
|
return False
|
65
78
|
|
79
|
+
if require_all_content:
|
80
|
+
return all("content" in turn for turn in value)
|
81
|
+
|
66
82
|
return "content" in value[-1]
|
67
83
|
|
68
84
|
|
@@ -225,21 +241,15 @@ class Glitchling:
|
|
225
241
|
message = "datasets is not installed"
|
226
242
|
raise ModuleNotFoundError(message) from _datasets_error
|
227
243
|
|
228
|
-
def _is_transcript(value: Any) -> bool:
|
229
|
-
"""Return ``True`` when the value resembles a chat transcript."""
|
230
|
-
|
231
|
-
if not isinstance(value, list) or not value:
|
232
|
-
return False
|
233
|
-
|
234
|
-
return all(
|
235
|
-
isinstance(turn, dict) and "content" in turn for turn in value
|
236
|
-
)
|
237
|
-
|
238
244
|
def __corrupt_row(row: dict[str, Any]) -> dict[str, Any]:
|
239
245
|
row = dict(row)
|
240
246
|
for column in columns:
|
241
247
|
value = row[column]
|
242
|
-
if _is_transcript(
|
248
|
+
if _is_transcript(
|
249
|
+
value,
|
250
|
+
allow_empty=False,
|
251
|
+
require_all_content=True,
|
252
|
+
):
|
243
253
|
row[column] = self.corrupt(value)
|
244
254
|
elif isinstance(value, list):
|
245
255
|
row[column] = [self.corrupt(item) for item in value]
|
@@ -356,7 +366,7 @@ class Gaggle(Glitchling):
|
|
356
366
|
|
357
367
|
@staticmethod
|
358
368
|
def rust_pipeline_enabled() -> bool:
|
359
|
-
"""Return ``True`` when the Rust pipeline is available and
|
369
|
+
"""Return ``True`` when the Rust pipeline is available and not explicitly disabled."""
|
360
370
|
|
361
371
|
return Gaggle.rust_pipeline_supported() and _pipeline_feature_flag_enabled()
|
362
372
|
|
glitchlings/zoo/redactyl.py
CHANGED
@@ -2,8 +2,13 @@ import re
|
|
2
2
|
import random
|
3
3
|
from typing import Any
|
4
4
|
|
5
|
-
from .core import Glitchling, AttackWave
|
6
5
|
from ._rate import resolve_rate
|
6
|
+
from ._text_utils import (
|
7
|
+
split_preserving_whitespace,
|
8
|
+
split_token_edges,
|
9
|
+
token_core_length,
|
10
|
+
)
|
11
|
+
from .core import AttackWave, Glitchling
|
7
12
|
|
8
13
|
FULL_BLOCK = "█"
|
9
14
|
|
@@ -68,8 +73,7 @@ def _python_redact_words(
|
|
68
73
|
- rng: RNG used for sampling decisions.
|
69
74
|
- unweighted: When True, sample words uniformly instead of by length.
|
70
75
|
"""
|
71
|
-
|
72
|
-
tokens = re.split(r"(\s+)", text)
|
76
|
+
tokens = split_preserving_whitespace(text)
|
73
77
|
word_indices = [i for i, token in enumerate(tokens) if i % 2 == 0 and token.strip()]
|
74
78
|
if not word_indices:
|
75
79
|
raise ValueError(
|
@@ -78,15 +82,12 @@ def _python_redact_words(
|
|
78
82
|
weights: list[float] = []
|
79
83
|
for index in word_indices:
|
80
84
|
word = tokens[index]
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
core_length = 1
|
88
|
-
weights.append(1.0 if unweighted else float(core_length))
|
89
|
-
num_to_redact = max(1, int(len(word_indices) * rate))
|
85
|
+
length = token_core_length(word)
|
86
|
+
weights.append(1.0 if unweighted else float(length))
|
87
|
+
raw_quota = len(word_indices) * rate
|
88
|
+
num_to_redact = int(raw_quota)
|
89
|
+
if rate > 0:
|
90
|
+
num_to_redact = max(1, num_to_redact)
|
90
91
|
if num_to_redact > len(word_indices):
|
91
92
|
raise ValueError("Sample larger than population or is negative")
|
92
93
|
indices_to_redact = _weighted_sample_without_replacement(
|
@@ -102,16 +103,11 @@ def _python_redact_words(
|
|
102
103
|
break
|
103
104
|
|
104
105
|
word = tokens[i]
|
105
|
-
if not word or word.isspace():
|
106
|
+
if not word or word.isspace():
|
106
107
|
continue
|
107
108
|
|
108
|
-
|
109
|
-
|
110
|
-
if match:
|
111
|
-
prefix, core, suffix = match.groups()
|
112
|
-
tokens[i] = f"{prefix}{replacement_char * len(core)}{suffix}"
|
113
|
-
else:
|
114
|
-
tokens[i] = f"{replacement_char * len(word)}"
|
109
|
+
prefix, core, suffix = split_token_edges(word)
|
110
|
+
tokens[i] = f"{prefix}{replacement_char * len(core)}{suffix}"
|
115
111
|
|
116
112
|
text = "".join(tokens)
|
117
113
|
|
glitchlings/zoo/reduple.py
CHANGED
@@ -1,9 +1,13 @@
|
|
1
|
-
import re
|
2
1
|
import random
|
3
2
|
from typing import Any
|
4
3
|
|
5
|
-
from .core import Glitchling, AttackWave
|
6
4
|
from ._rate import resolve_rate
|
5
|
+
from ._text_utils import (
|
6
|
+
split_preserving_whitespace,
|
7
|
+
split_token_edges,
|
8
|
+
token_core_length,
|
9
|
+
)
|
10
|
+
from .core import AttackWave, Glitchling
|
7
11
|
|
8
12
|
try:
|
9
13
|
from glitchlings._zoo_rust import reduplicate_words as _reduplicate_words_rust
|
@@ -30,26 +34,16 @@ def _python_reduplicate_words(
|
|
30
34
|
- Preserves spacing and punctuation by tokenizing with separators.
|
31
35
|
- Deterministic when run with a fixed seed or via Gaggle.
|
32
36
|
"""
|
33
|
-
|
34
|
-
tokens = re.split(r"(\s+)", text) # Split but keep separators
|
37
|
+
tokens = split_preserving_whitespace(text)
|
35
38
|
|
36
39
|
candidate_weights: list[tuple[int, float]] = []
|
37
|
-
for i in range(0, len(tokens), 2):
|
38
|
-
if i >= len(tokens):
|
39
|
-
break
|
40
|
-
|
40
|
+
for i in range(0, len(tokens), 2):
|
41
41
|
word = tokens[i]
|
42
|
-
if not word or word.isspace():
|
42
|
+
if not word or word.isspace():
|
43
43
|
continue
|
44
44
|
|
45
|
-
|
46
|
-
|
47
|
-
core_length = len(core) if core else len(word)
|
48
|
-
if core_length <= 0:
|
49
|
-
core_length = len(word.strip()) or len(word)
|
50
|
-
if core_length <= 0:
|
51
|
-
core_length = 1
|
52
|
-
weight = 1.0 if unweighted else 1.0 / core_length
|
45
|
+
length = token_core_length(word)
|
46
|
+
weight = 1.0 if unweighted else 1.0 / length
|
53
47
|
candidate_weights.append((i, weight))
|
54
48
|
|
55
49
|
if not candidate_weights:
|
@@ -75,13 +69,8 @@ def _python_reduplicate_words(
|
|
75
69
|
continue
|
76
70
|
|
77
71
|
word = tokens[index]
|
78
|
-
|
79
|
-
|
80
|
-
prefix, core, suffix = match.groups()
|
81
|
-
# Reduplicate with a space: "word" -> "word word"
|
82
|
-
tokens[index] = f"{prefix}{core} {core}{suffix}"
|
83
|
-
else:
|
84
|
-
tokens[index] = f"{word} {word}"
|
72
|
+
prefix, core, suffix = split_token_edges(word)
|
73
|
+
tokens[index] = f"{prefix}{core} {core}{suffix}"
|
85
74
|
return "".join(tokens)
|
86
75
|
|
87
76
|
|
glitchlings/zoo/rushmore.py
CHANGED
@@ -3,8 +3,13 @@ import random
|
|
3
3
|
import re
|
4
4
|
from typing import Any
|
5
5
|
|
6
|
-
from .core import Glitchling, AttackWave
|
7
6
|
from ._rate import resolve_rate
|
7
|
+
from ._text_utils import (
|
8
|
+
split_preserving_whitespace,
|
9
|
+
split_token_edges,
|
10
|
+
token_core_length,
|
11
|
+
)
|
12
|
+
from .core import AttackWave, Glitchling
|
8
13
|
|
9
14
|
try:
|
10
15
|
from glitchlings._zoo_rust import delete_random_words as _delete_random_words_rust
|
@@ -25,22 +30,16 @@ def _python_delete_random_words(
|
|
25
30
|
if effective_rate <= 0.0:
|
26
31
|
return text
|
27
32
|
|
28
|
-
tokens =
|
33
|
+
tokens = split_preserving_whitespace(text)
|
29
34
|
|
30
35
|
candidate_data: list[tuple[int, float]] = []
|
31
|
-
for i in range(2, len(tokens), 2):
|
36
|
+
for i in range(2, len(tokens), 2):
|
32
37
|
word = tokens[i]
|
33
38
|
if not word or word.isspace():
|
34
39
|
continue
|
35
40
|
|
36
|
-
|
37
|
-
|
38
|
-
core_length = len(core) if core else len(word)
|
39
|
-
if core_length <= 0:
|
40
|
-
core_length = len(word.strip()) or len(word)
|
41
|
-
if core_length <= 0:
|
42
|
-
core_length = 1
|
43
|
-
weight = 1.0 if unweighted else 1.0 / core_length
|
41
|
+
length = token_core_length(word)
|
42
|
+
weight = 1.0 if unweighted else 1.0 / length
|
44
43
|
candidate_data.append((i, weight))
|
45
44
|
|
46
45
|
if not candidate_data:
|
@@ -70,12 +69,8 @@ def _python_delete_random_words(
|
|
70
69
|
continue
|
71
70
|
|
72
71
|
word = tokens[index]
|
73
|
-
|
74
|
-
|
75
|
-
prefix, _, suffix = match.groups()
|
76
|
-
tokens[index] = f"{prefix.strip()}{suffix.strip()}"
|
77
|
-
else:
|
78
|
-
tokens[index] = ""
|
72
|
+
prefix, _, suffix = split_token_edges(word)
|
73
|
+
tokens[index] = f"{prefix.strip()}{suffix.strip()}"
|
79
74
|
|
80
75
|
deletions += 1
|
81
76
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: glitchlings
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.3.0
|
4
4
|
Summary: Monsters for your language games.
|
5
5
|
Author: osoleve
|
6
6
|
License: Apache License
|
@@ -209,7 +209,7 @@ Project-URL: Homepage, https://github.com/osoleve/glitchlings
|
|
209
209
|
Project-URL: Repository, https://github.com/osoleve/glitchlings.git
|
210
210
|
Project-URL: Issues, https://github.com/osoleve/glitchlings/issues
|
211
211
|
Project-URL: Changelog, https://github.com/osoleve/glitchlings/releases
|
212
|
-
Keywords: nlp,text,adversarial augmentation,text augmentation,large language models,llms,data augmentation,
|
212
|
+
Keywords: nlp,text,adversarial augmentation,text augmentation,large language models,llms,data augmentation,rlvr
|
213
213
|
Classifier: Development Status :: 3 - Alpha
|
214
214
|
Classifier: Intended Audience :: Developers
|
215
215
|
Classifier: Programming Language :: Python
|
@@ -296,7 +296,7 @@ print(gaggle(SAMPLE_TEXT))
|
|
296
296
|
|
297
297
|
Consult the [Glitchlings Usage Guide](docs/index.md)
|
298
298
|
for end-to-end instructions spanning the Python API, CLI, HuggingFace and Prime Intellect
|
299
|
-
integrations, and the
|
299
|
+
integrations, and the autodetected Rust pipeline (enabled whenever the extension is present).
|
300
300
|
|
301
301
|
## Motivation
|
302
302
|
|
@@ -428,7 +428,8 @@ _Did you say that or did I?_
|
|
428
428
|
>
|
429
429
|
> Args
|
430
430
|
>
|
431
|
-
> - `rate (float)`: The maximum proportion of words to reduplicate (default: 0.
|
431
|
+
> - `rate (float)`: The maximum proportion of words to reduplicate (default: 0.01, 1%).
|
432
|
+
> - `unweighted (bool)`: Sample words uniformly instead of favouring shorter tokens (default: False).
|
432
433
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
433
434
|
|
434
435
|
### Rushmore
|
@@ -440,6 +441,19 @@ _I accidentally an entire word._
|
|
440
441
|
> Args
|
441
442
|
>
|
442
443
|
> - `rate (float)`: The maximum proportion of words to delete (default: 0.01, 1%).
|
444
|
+
> - `unweighted (bool)`: Sample words uniformly instead of favouring shorter tokens (default: False).
|
445
|
+
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
446
|
+
|
447
|
+
### Adjax
|
448
|
+
|
449
|
+
_Keep your hands and punctuation where I can see them._
|
450
|
+
|
451
|
+
> _**Perfect Shuffle.**_ Adjax trades the cores of neighbouring words while leaving punctuation, casing, and surrounding whitespace untouched, turning fluent prose into locally scrambled tongue-twisters.
|
452
|
+
>
|
453
|
+
> Args
|
454
|
+
>
|
455
|
+
> - `rate (float)`: Probability that each adjacent pair swaps cores (default: 0.5, 50%).
|
456
|
+
> - `swap_rate (float)`: Alias for `rate`, retained for backward compatibility.
|
443
457
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
444
458
|
|
445
459
|
### Redactyl
|
@@ -450,9 +464,10 @@ _Oops, that was my black highlighter._
|
|
450
464
|
>
|
451
465
|
> ### Args
|
452
466
|
>
|
453
|
-
> - `replacement_char (str)`: The character to use for redaction (default:
|
454
|
-
> - `rate (float)`: The maximum proportion of words to redact (default: 0.
|
467
|
+
> - `replacement_char (str)`: The character to use for redaction (default: FULL_BLOCK).
|
468
|
+
> - `rate (float)`: The maximum proportion of words to redact (default: 0.025, 2.5%).
|
455
469
|
> - `merge_adjacent (bool)`: Whether to redact the space between adjacent redacted words (default: False).
|
470
|
+
> - `unweighted (bool)`: Sample words uniformly instead of biasing toward longer tokens (default: False).
|
456
471
|
> - `seed (int)`: The random seed for reproducibility (default: 151).
|
457
472
|
|
458
473
|
## Field Report: Uncontained Specimens
|
@@ -0,0 +1,29 @@
|
|
1
|
+
glitchlings/__init__.py,sha256=BLwp5ncEEVTurUDEo6DZcYjYz7r12LzblLfOcVc4MEU,680
|
2
|
+
glitchlings/__main__.py,sha256=EOiBgay0x6B9VlSDzSQvMuoq6bHJdSvFSgcAVGGKkd4,121
|
3
|
+
glitchlings/_zoo_rust.cpython-310-x86_64-linux-gnu.so,sha256=ES5PTbdW3auzJV6ADNITU_-M4KodYFTJvtjOh8GvviQ,3164328
|
4
|
+
glitchlings/main.py,sha256=u6969Vl0n47e3S-ZlYZBj3HWVsjs-hvW6RpF9RYuXnc,5931
|
5
|
+
glitchlings/dlc/__init__.py,sha256=eTLEEWrVWPqniXHqee4W23H1rjElI1PQ_jcqWFe9D3g,141
|
6
|
+
glitchlings/dlc/huggingface.py,sha256=I1QWanWVxO02awgSpHDtgQEVF-9AQRLtsta2RCitWhE,2933
|
7
|
+
glitchlings/dlc/prime.py,sha256=wpRMNtgka1vNlEzifeCjGMp1q_-QclZn3NxXczGnNpM,9278
|
8
|
+
glitchlings/util/__init__.py,sha256=7KiZ0gKMjocfd34cajneZhTqYb7Hkwi_PpjltPqvkNI,4498
|
9
|
+
glitchlings/zoo/__init__.py,sha256=sTmh-1u02kgjYlpRPz9lF9c1aXHamcShRXUOGK87J5Q,4378
|
10
|
+
glitchlings/zoo/_ocr_confusions.py,sha256=MkCbwk9T24SO2pD3JNPajYCfpMMlm2vQ5_sJty5GoXE,1218
|
11
|
+
glitchlings/zoo/_rate.py,sha256=TMyfVFV7pLxSGVswPlOAtBvk25Bjtx5xXTtpb_utgik,527
|
12
|
+
glitchlings/zoo/_text_utils.py,sha256=ZXy5khgoMTZp7NHdekkkj4vQjeMWGK2bzXPwIECBIfo,1120
|
13
|
+
glitchlings/zoo/adjax.py,sha256=N3CzfM7m7mAYgFcQYLQkqK2VYLw_vFvEMBM2aNU--ZA,3530
|
14
|
+
glitchlings/zoo/core.py,sha256=fhceCZKa9W1vVlhpR2zVKBXnzgJICB2-nmDywiqx4js,14207
|
15
|
+
glitchlings/zoo/jargoyle.py,sha256=T6vPWBxceIPE6gOQ7BaihaqALOJwzXuhfiZzvKa4S50,10666
|
16
|
+
glitchlings/zoo/mim1c.py,sha256=yAt1ngR3j2KXLbzc8LhrQlIWRO_KT5dFK1EE8QivMAQ,3429
|
17
|
+
glitchlings/zoo/ocr_confusions.tsv,sha256=KhtR7vJDTITpfTSGa-I7RHr6CK7LkGi2KjdhEWipI6o,183
|
18
|
+
glitchlings/zoo/redactyl.py,sha256=8xsamnVt1RFy7ztvfgfJDwCadQIlN-9fDz-TLfBQ89k,6357
|
19
|
+
glitchlings/zoo/reduple.py,sha256=IQM0WYinWJWjMIaBSuPPcpOXOynly9Tp2UtJEZxibGk,4313
|
20
|
+
glitchlings/zoo/rushmore.py,sha256=Cw6qpk3jp8DjtxmFALd5zTIOnS6C0tIkoPFA7F-xlVk,4369
|
21
|
+
glitchlings/zoo/scannequin.py,sha256=Ps8nxysKjkJV408zaL1kjVjy4jliATDBpYcNHLWbNFg,4859
|
22
|
+
glitchlings/zoo/typogre.py,sha256=xD02ldcMIA07XsdSts2bUniOc-k_DqTf0PBMaXGjLZE,6009
|
23
|
+
glitchlings/zoo/zeedub.py,sha256=D6rGk3O02OQ9jEIO9o0Ag-maVzNPN5O6qO3klG6Y62c,3552
|
24
|
+
glitchlings-0.3.0.dist-info/METADATA,sha256=iflxCI-vHtZP-omUePVGqx0QHLoFiJoHzcM7aSNvboQ,27579
|
25
|
+
glitchlings-0.3.0.dist-info/WHEEL,sha256=yzF9ixp0XVYLhnovZSdud9vspTPdVe52BzwI7Tv3jTM,113
|
26
|
+
glitchlings-0.3.0.dist-info/entry_points.txt,sha256=kGOwuAsjFDLtztLisaXtOouq9wFVMOJg5FzaAkg-Hto,54
|
27
|
+
glitchlings-0.3.0.dist-info/top_level.txt,sha256=VHFNBrLjtDwPCYXbGKi6o17Eueedi81eNbR3hBOoST0,12
|
28
|
+
glitchlings-0.3.0.dist-info/RECORD,,
|
29
|
+
glitchlings-0.3.0.dist-info/licenses/LICENSE,sha256=YCvGip-LoaRyu6h0nPo71q6eHEkzUpsE11psDJOIRkw,11337
|
@@ -1,27 +0,0 @@
|
|
1
|
-
glitchlings/__init__.py,sha256=ui8kzf7mK5YAlFY1Og5UX5Rp14v4wC2ZqHihAJBBj6s,632
|
2
|
-
glitchlings/__main__.py,sha256=EOiBgay0x6B9VlSDzSQvMuoq6bHJdSvFSgcAVGGKkd4,121
|
3
|
-
glitchlings/_zoo_rust.cpython-310-x86_64-linux-gnu.so,sha256=hNoaFhJbKFM9bOvj-JSSUb_worfI2-YsTVD6PLsHtN4,3158312
|
4
|
-
glitchlings/main.py,sha256=u6969Vl0n47e3S-ZlYZBj3HWVsjs-hvW6RpF9RYuXnc,5931
|
5
|
-
glitchlings/dlc/__init__.py,sha256=eTLEEWrVWPqniXHqee4W23H1rjElI1PQ_jcqWFe9D3g,141
|
6
|
-
glitchlings/dlc/huggingface.py,sha256=I1QWanWVxO02awgSpHDtgQEVF-9AQRLtsta2RCitWhE,2933
|
7
|
-
glitchlings/dlc/prime.py,sha256=v6wzkVxIsjTOAumn9cPfsmjuGf3RitCfUtk9eZzthyg,8698
|
8
|
-
glitchlings/util/__init__.py,sha256=7KiZ0gKMjocfd34cajneZhTqYb7Hkwi_PpjltPqvkNI,4498
|
9
|
-
glitchlings/zoo/__init__.py,sha256=pdQSiQjMCqnhrM3qSRvu98FJd-EyXLNNwvthnYSXpmM,4282
|
10
|
-
glitchlings/zoo/_ocr_confusions.py,sha256=MkCbwk9T24SO2pD3JNPajYCfpMMlm2vQ5_sJty5GoXE,1218
|
11
|
-
glitchlings/zoo/_rate.py,sha256=TMyfVFV7pLxSGVswPlOAtBvk25Bjtx5xXTtpb_utgik,527
|
12
|
-
glitchlings/zoo/core.py,sha256=Fdxx4uoRH1WOL5rH_FeTUuQSwmnagP8mGXALq6IrtGY,14007
|
13
|
-
glitchlings/zoo/jargoyle.py,sha256=T6vPWBxceIPE6gOQ7BaihaqALOJwzXuhfiZzvKa4S50,10666
|
14
|
-
glitchlings/zoo/mim1c.py,sha256=yAt1ngR3j2KXLbzc8LhrQlIWRO_KT5dFK1EE8QivMAQ,3429
|
15
|
-
glitchlings/zoo/ocr_confusions.tsv,sha256=KhtR7vJDTITpfTSGa-I7RHr6CK7LkGi2KjdhEWipI6o,183
|
16
|
-
glitchlings/zoo/redactyl.py,sha256=TjGLBaergqec1RB880WGb28cnwPy6Uj7u9f1Oe1gBeA,6702
|
17
|
-
glitchlings/zoo/reduple.py,sha256=YNhTBH25XsXLeQD8xxXPE_JJMiCtmEpUFGGn36rd2tY,4857
|
18
|
-
glitchlings/zoo/rushmore.py,sha256=oG8MmMbrpmHH4rOp-NXkQznVlBCtSnrOttAZMdVlMkc,4729
|
19
|
-
glitchlings/zoo/scannequin.py,sha256=Ps8nxysKjkJV408zaL1kjVjy4jliATDBpYcNHLWbNFg,4859
|
20
|
-
glitchlings/zoo/typogre.py,sha256=xD02ldcMIA07XsdSts2bUniOc-k_DqTf0PBMaXGjLZE,6009
|
21
|
-
glitchlings/zoo/zeedub.py,sha256=D6rGk3O02OQ9jEIO9o0Ag-maVzNPN5O6qO3klG6Y62c,3552
|
22
|
-
glitchlings-0.2.5.dist-info/METADATA,sha256=8yjFyYvjeUhsxF3rKR8gC7Pv4jKWAht-IXbwcKdur88,26708
|
23
|
-
glitchlings-0.2.5.dist-info/WHEEL,sha256=yzF9ixp0XVYLhnovZSdud9vspTPdVe52BzwI7Tv3jTM,113
|
24
|
-
glitchlings-0.2.5.dist-info/entry_points.txt,sha256=kGOwuAsjFDLtztLisaXtOouq9wFVMOJg5FzaAkg-Hto,54
|
25
|
-
glitchlings-0.2.5.dist-info/top_level.txt,sha256=VHFNBrLjtDwPCYXbGKi6o17Eueedi81eNbR3hBOoST0,12
|
26
|
-
glitchlings-0.2.5.dist-info/RECORD,,
|
27
|
-
glitchlings-0.2.5.dist-info/licenses/LICENSE,sha256=YCvGip-LoaRyu6h0nPo71q6eHEkzUpsE11psDJOIRkw,11337
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|