glitchlings 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- .github/workflows/publish.yml +42 -0
- .github/workflows/testpypi.yml +38 -0
- .gitignore +12 -0
- LICENSE +21 -0
- MONSTER_MANUAL.md +272 -0
- PKG-INFO +244 -0
- README.md +192 -0
- RELEASE.md +47 -0
- __init__.py +73 -0
- dlc/__init__.py +0 -0
- dlc/prime.py +50 -0
- glitchlings-0.1.0.dist-info/METADATA +244 -0
- glitchlings-0.1.0.dist-info/RECORD +28 -0
- glitchlings-0.1.0.dist-info/WHEEL +4 -0
- glitchlings-0.1.0.dist-info/entry_points.txt +2 -0
- glitchlings-0.1.0.dist-info/licenses/LICENSE +21 -0
- main.py +6 -0
- pyproject.toml +74 -0
- util/__init__.py +73 -0
- zoo/__init__.py +50 -0
- zoo/core.py +136 -0
- zoo/jargoyle.py +89 -0
- zoo/mim1c.py +62 -0
- zoo/redactyl.py +73 -0
- zoo/reduple.py +54 -0
- zoo/rushmore.py +53 -0
- zoo/scannequin.py +124 -0
- zoo/typogre.py +224 -0
main.py
ADDED
pyproject.toml
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
[project]
|
2
|
+
name = "glitchlings"
|
3
|
+
version = "0.1.0"
|
4
|
+
description = "Monsters for your language games."
|
5
|
+
readme = "README.md"
|
6
|
+
requires-python = ">=3.12"
|
7
|
+
dependencies = [
|
8
|
+
"confusable-homoglyphs>=3.3.1",
|
9
|
+
"datasets>=4.0.0",
|
10
|
+
"jellyfish>=1.2.0",
|
11
|
+
"nltk>=3.9.1",
|
12
|
+
]
|
13
|
+
|
14
|
+
authors = [
|
15
|
+
{ name = "osoleve" }
|
16
|
+
]
|
17
|
+
keywords = ["nlp", "text", "adversarial", "data", "evaluation", "glitch"]
|
18
|
+
classifiers = [
|
19
|
+
"Development Status :: 3 - Alpha",
|
20
|
+
"Intended Audience :: Developers",
|
21
|
+
"License :: OSI Approved :: MIT License",
|
22
|
+
"Programming Language :: Python",
|
23
|
+
"Programming Language :: Python :: 3",
|
24
|
+
"Programming Language :: Python :: 3.12",
|
25
|
+
"Operating System :: OS Independent",
|
26
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
27
|
+
"Topic :: Software Development :: Testing",
|
28
|
+
]
|
29
|
+
|
30
|
+
[project.license]
|
31
|
+
file = "LICENSE"
|
32
|
+
|
33
|
+
[project.urls]
|
34
|
+
Homepage = "https://github.com/osoleve/glitchlings"
|
35
|
+
Repository = "https://github.com/osoleve/glitchlings.git"
|
36
|
+
Issues = "https://github.com/osoleve/glitchlings/issues"
|
37
|
+
Changelog = "https://github.com/osoleve/glitchlings/releases"
|
38
|
+
|
39
|
+
[project.scripts]
|
40
|
+
glitchlings = "glitchlings.main:main"
|
41
|
+
|
42
|
+
[project.optional-dependencies]
|
43
|
+
prime = [
|
44
|
+
"verifiers>=0.1.3.post0",
|
45
|
+
]
|
46
|
+
dev = [
|
47
|
+
"pytest>=8.0.0",
|
48
|
+
]
|
49
|
+
|
50
|
+
[build-system]
|
51
|
+
requires = ["hatchling>=1.18"]
|
52
|
+
build-backend = "hatchling.build"
|
53
|
+
|
54
|
+
[tool.hatch.build]
|
55
|
+
exclude = [
|
56
|
+
"**/__pycache__/**",
|
57
|
+
".git/**",
|
58
|
+
".venv/**",
|
59
|
+
]
|
60
|
+
|
61
|
+
[tool.hatch.build.targets.sdist]
|
62
|
+
exclude = [
|
63
|
+
"**/__pycache__/**",
|
64
|
+
".git/**",
|
65
|
+
".venv/**",
|
66
|
+
]
|
67
|
+
|
68
|
+
[tool.hatch.build.targets.wheel]
|
69
|
+
packages = ["."]
|
70
|
+
exclude = [
|
71
|
+
"tests/**",
|
72
|
+
"tests",
|
73
|
+
"**/__pycache__/**",
|
74
|
+
]
|
util/__init__.py
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
import difflib
|
2
|
+
|
3
|
+
# Sample English passage (the opening lines of Kafka's "The Metamorphosis").
# Where it is consumed is not visible from this module.
SAMPLE_TEXT = "One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin. He lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightly domed and divided by arches into stiff sections. The bedding was hardly able to cover it and seemed ready to slide off any moment. His many legs, pitifully thin compared with the size of the rest of him, waved about helplessly as he looked."
|
4
|
+
|
5
|
+
|
6
|
+
def string_diffs(a: str, b: str):
    """
    Diff two strings and collect the runs of changes between matching regions.

    Opcodes come from ``difflib.SequenceMatcher``. Consecutive opcodes whose
    tag is not 'equal' are gathered into one group; an 'equal' opcode closes
    the group in progress and is itself discarded.

    Returns a list of groups, each group being a list of
    (tag, a_text, b_text) tuples.
    """
    matcher = difflib.SequenceMatcher(None, a, b)
    groups = []
    current = []

    for tag, a_start, a_end, b_start, b_end in matcher.get_opcodes():
        if tag != "equal":
            # still inside a run of edits: remember the affected slices
            current.append((tag, a[a_start:a_end], b[b_start:b_end]))
        elif current:
            # a matching region ends the run collected so far
            groups.append(current)
            current = []

    # a run that reaches the end of the strings has no closing 'equal'
    if current:
        groups.append(current)

    return groups
|
33
|
+
|
34
|
+
|
35
|
+
# Per-layout lookup table: lowercase letter -> list of single-character
# neighbours. Several entries contain a space character (presumably the space
# bar).
# NOTE(review): the "q" entry also lists a space, which does not look adjacent
# on a standard QWERTY board -- confirm whether that is intentional.
_KEYNEIGHBORS = {
    "CURATOR_QWERTY": {
        letter: list(adjacent)
        for letter, adjacent in (
            ("a", "qwsz"),
            ("b", "vghn "),
            ("c", "xdfv "),
            ("d", "serfcx"),
            ("e", "wsdrf34"),
            ("f", "drtgvc"),
            ("g", "ftyhbv"),
            ("h", "gyujnb"),
            ("i", "ujko89"),
            ("j", "huikmn"),
            ("k", "jilom,"),
            ("l", "kop;.,"),
            ("m", "njk, "),
            ("n", "bhjm "),
            ("o", "iklp90"),
            ("p", "o0-[;l"),
            ("q", "was 12"),
            ("r", "edft45"),
            ("s", "awedxz"),
            ("t", "r56ygf"),
            ("u", "y78ijh"),
            ("v", "cfgb "),
            ("w", "q23esa"),
            ("x", "zsdc "),
            ("y", "t67uhg"),
            ("z", "asx"),
        )
    }
}


class KeyNeighbors:
    """Namespace object exposing each layout in ``_KEYNEIGHBORS`` as an attribute."""

    def __init__(self):
        # One attribute per layout name, bound to that layout's mapping.
        self.__dict__.update(_KEYNEIGHBORS)


# Shared module-level instance, e.g. ``KEYNEIGHBORS.CURATOR_QWERTY["a"]``.
KEYNEIGHBORS = KeyNeighbors()
|
zoo/__init__.py
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
from .typogre import typogre
|
2
|
+
from .mim1c import mim1c
|
3
|
+
from .jargoyle import jargoyle
|
4
|
+
from .reduple import reduple
|
5
|
+
from .rushmore import rushmore
|
6
|
+
from .redactyl import redactyl
|
7
|
+
from .scannequin import scannequin
|
8
|
+
from .core import Glitchling, Gaggle
|
9
|
+
|
10
|
+
__all__ = [
|
11
|
+
"typogre",
|
12
|
+
"mim1c",
|
13
|
+
"jargoyle",
|
14
|
+
"reduple",
|
15
|
+
"rushmore",
|
16
|
+
"redactyl",
|
17
|
+
"scannequin",
|
18
|
+
"Glitchling",
|
19
|
+
"Gaggle",
|
20
|
+
"summon",
|
21
|
+
]
|
22
|
+
|
23
|
+
|
24
|
+
def summon(glitchlings: list[str | Glitchling], seed: int = 151) -> Gaggle:
    """Build a Gaggle from glitchling names and/or ready-made instances.

    Strings are matched case-insensitively against the built-in glitchlings
    and resolve to the stock instance; ``Glitchling`` objects are used as
    given. Raises ``ValueError`` for a name that matches no built-in.
    """
    roster = (typogre, mim1c, jargoyle, reduple, rushmore, redactyl, scannequin)
    by_name = {member.name.lower(): member for member in roster}

    party = []
    for request in glitchlings:
        if not isinstance(request, Glitchling):
            found = by_name.get(request.lower())
            if not found:
                raise ValueError(f"Glitchling '{request}' not found.")
            request = found
        party.append(request)

    return Gaggle(party, seed=seed)
|
zoo/core.py
ADDED
@@ -0,0 +1,136 @@
|
|
1
|
+
from enum import IntEnum, auto
|
2
|
+
from datasets import Dataset
|
3
|
+
import random
|
4
|
+
from typing import Any, Callable
|
5
|
+
|
6
|
+
import functools as ft
|
7
|
+
|
8
|
+
|
9
|
+
# Text levels for glitchlings, to enforce a sort order
|
10
|
+
# Work from highest level down, because e.g.
|
11
|
+
# duplicating a word then adding a typo is potentially different than
|
12
|
+
# adding a typo then duplicating a word
|
13
|
+
class AttackWave(IntEnum):
|
14
|
+
DOCUMENT = auto()
|
15
|
+
PARAGRAPH = auto()
|
16
|
+
SENTENCE = auto()
|
17
|
+
WORD = auto()
|
18
|
+
CHARACTER = auto()
|
19
|
+
|
20
|
+
|
21
|
+
# Modifier for within the same attack wave
|
22
|
+
class AttackOrder(IntEnum):
|
23
|
+
FIRST = auto()
|
24
|
+
EARLY = auto()
|
25
|
+
NORMAL = auto()
|
26
|
+
LATE = auto()
|
27
|
+
LAST = auto()
|
28
|
+
|
29
|
+
|
30
|
+
class Glitchling:
|
31
|
+
def __init__(
|
32
|
+
self,
|
33
|
+
name: str,
|
34
|
+
corruption_function: Callable,
|
35
|
+
scope: AttackWave,
|
36
|
+
order: AttackOrder = AttackOrder.NORMAL,
|
37
|
+
seed: int | None = None,
|
38
|
+
**kwargs,
|
39
|
+
):
|
40
|
+
# Each Glitchling maintains its own RNG for deterministic yet isolated behavior.
|
41
|
+
# If no seed is supplied, we fall back to Python's default entropy.
|
42
|
+
self.seed = seed
|
43
|
+
self.rng: random.Random = random.Random(seed)
|
44
|
+
self.name: str = name
|
45
|
+
self.corruption_function: Callable[..., str] = corruption_function
|
46
|
+
self.level: AttackWave = scope
|
47
|
+
self.order: AttackOrder = order
|
48
|
+
self.kwargs: dict[str, Any] = {}
|
49
|
+
for kw, val in kwargs.items():
|
50
|
+
self.set_param(kw, val)
|
51
|
+
|
52
|
+
def set_param(self, key: str, value: Any):
|
53
|
+
setattr(self, key, value)
|
54
|
+
self.kwargs[key] = value
|
55
|
+
|
56
|
+
def __corrupt(self, text, *args, **kwargs):
|
57
|
+
# Pass rng to underlying corruption function if it expects it.
|
58
|
+
if "rng" in self.corruption_function.__code__.co_varnames:
|
59
|
+
corrupted = self.corruption_function(text, *args, rng=self.rng, **kwargs)
|
60
|
+
else:
|
61
|
+
corrupted = self.corruption_function(text, *args, **kwargs)
|
62
|
+
return corrupted
|
63
|
+
|
64
|
+
def corrupt(self, text: str | list[dict]) -> str | list[dict]:
|
65
|
+
if isinstance(text, list):
|
66
|
+
text[-1]["content"] = self.__corrupt(text[-1]["content"], **self.kwargs)
|
67
|
+
else:
|
68
|
+
text = self.__corrupt(text, **self.kwargs)
|
69
|
+
|
70
|
+
return text
|
71
|
+
|
72
|
+
def corrupt_dataset(self, dataset: Dataset, columns: list[str]) -> Dataset:
|
73
|
+
def __corrupt_row(row):
|
74
|
+
for column in columns:
|
75
|
+
row[column] = self.corrupt(row[column])
|
76
|
+
return row
|
77
|
+
|
78
|
+
dataset = dataset.map(__corrupt_row)
|
79
|
+
|
80
|
+
return dataset
|
81
|
+
|
82
|
+
def __call__(self, text: str, *args, **kwds) -> str | list[dict]:
|
83
|
+
return self.corrupt(text, *args, **kwds)
|
84
|
+
|
85
|
+
def reset_rng(self, seed=None):
|
86
|
+
"""Reset this glitchling's RNG to its initial seed (if one was provided)."""
|
87
|
+
if seed is not None:
|
88
|
+
self.seed = seed
|
89
|
+
if self.seed is not None:
|
90
|
+
self.rng = random.Random(self.seed)
|
91
|
+
|
92
|
+
def clone(self, seed=None) -> "Glitchling":
|
93
|
+
"""Create a copy of this glitchling, optionally with a new seed."""
|
94
|
+
new_glitchling = Glitchling(
|
95
|
+
self.name,
|
96
|
+
self.corruption_function,
|
97
|
+
self.level,
|
98
|
+
self.order,
|
99
|
+
seed=seed if seed is not None else self.seed,
|
100
|
+
**self.kwargs,
|
101
|
+
)
|
102
|
+
return new_glitchling
|
103
|
+
|
104
|
+
|
105
|
+
class Gaggle(Glitchling):
|
106
|
+
def __init__(self, glitchlings: list[Glitchling], seed: int = 151):
|
107
|
+
super().__init__("Gaggle", self.corrupt, AttackWave.DOCUMENT, seed=seed)
|
108
|
+
self.glitchlings: dict[AttackWave, list[Glitchling]] = {
|
109
|
+
level: [] for level in AttackWave
|
110
|
+
}
|
111
|
+
self.apply_order: list[Glitchling] = []
|
112
|
+
# Derive deterministic per-glitchling seeds from master seed if provided
|
113
|
+
for idx, g in enumerate(glitchlings):
|
114
|
+
_g = g.clone()
|
115
|
+
derived_seed = Gaggle.derive_seed(seed, _g.name, idx)
|
116
|
+
_g.reset_rng(derived_seed)
|
117
|
+
self.glitchlings[g.level].append(_g)
|
118
|
+
self.sort_glitchlings()
|
119
|
+
|
120
|
+
@staticmethod
|
121
|
+
def derive_seed(master_seed: int, glitchling_name: str, index: int) -> int:
|
122
|
+
"""Derive a deterministic seed for a glitchling based on the master seed."""
|
123
|
+
return hash((master_seed, glitchling_name, index)) & 0xFFFFFFFF
|
124
|
+
|
125
|
+
def sort_glitchlings(self):
|
126
|
+
self.apply_order = [
|
127
|
+
g
|
128
|
+
for _, glitchlings in sorted(self.glitchlings.items())
|
129
|
+
for g in sorted(glitchlings, key=lambda x: (x.order, x.name))
|
130
|
+
]
|
131
|
+
|
132
|
+
def corrupt(self, text: str) -> str:
|
133
|
+
corrupted = text
|
134
|
+
for glitchling in self.apply_order:
|
135
|
+
corrupted = glitchling(corrupted)
|
136
|
+
return corrupted
|
zoo/jargoyle.py
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
import random
|
2
|
+
from typing import Literal, Any, cast
|
3
|
+
import nltk
|
4
|
+
import re
|
5
|
+
from nltk.corpus import wordnet as wn
|
6
|
+
from .core import Glitchling, AttackWave
|
7
|
+
|
8
|
+
# Runs at import time, before `wn` is used below: requests the "wordnet"
# resource via nltk's downloader with quiet=True.
nltk.download("wordnet", quiet=True)
|
9
|
+
|
10
|
+
|
11
|
+
def substitute_random_synonyms(
    text: str,
    replacement_rate: float = 0.1,
    part_of_speech: Literal["n", "v", "a", "r"] = wn.NOUN,
    seed: int | None = None,
    rng: random.Random | None = None,
) -> str:
    """Swap a share of the words in ``text`` for WordNet synonyms.

    Parameters
    - text: Input text.
    - replacement_rate: Max proportion of candidate words to replace (default 0.1).
    - part_of_speech: WordNet POS to target (wn.NOUN by default).
    - seed: Seed for a fresh RNG when ``rng`` is not given.
    - rng: RNG to draw from; takes precedence over ``seed``.

    Notes
    - A candidate is a whitespace-delimited token for which WordNet returns at
      least one synset of the requested POS.
    - Positions are drawn with ``rng.sample`` and then handled left to right.
    - Only the first synset is consulted; its lemma names are de-duplicated
      and sorted before ``rng.choice`` so the pick is reproducible.
    """
    if rng is None:
        rng = random.Random(seed) if seed is not None else random.Random()

    # The capturing group keeps the whitespace runs, so words sit at even indices.
    pieces = re.split(r"(\s+)", text)

    eligible: list[int] = [
        position
        for position, piece in enumerate(pieces)
        if position % 2 == 0
        and piece
        and not piece.isspace()
        and wn.synsets(piece, pos=part_of_speech)
    ]
    if not eligible:
        return text

    quota = int(len(eligible) * replacement_rate)
    if quota <= 0:
        return text

    # Visit the sampled positions in ascending order.
    for position in sorted(rng.sample(eligible, k=quota)):
        original_word = pieces[position]
        senses = wn.synsets(original_word, pos=part_of_speech)
        if not senses:
            continue

        primary: Any = senses[0]
        lemma_names = [lemma.name() for lemma in cast(Any, primary).lemmas()]
        if not lemma_names:
            continue

        # Drop the word itself, turn underscores into spaces, dedupe, sort.
        alternatives = sorted(
            {
                name.replace("_", " ")
                for name in lemma_names
                if name.lower() != original_word.lower()
            }
        )
        if not alternatives:
            continue

        pieces[position] = rng.choice(alternatives)

    return "".join(pieces)
|
83
|
+
|
84
|
+
|
85
|
+
# Stock synonym-swapping glitchling; works at word level with the function's defaults.
jargoyle = Glitchling("Jargoyle", substitute_random_synonyms, AttackWave.WORD)
|
zoo/mim1c.py
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
from typing import Literal
|
2
|
+
from .core import Glitchling, AttackWave, AttackOrder
|
3
|
+
import random
|
4
|
+
from confusable_homoglyphs import confusables
|
5
|
+
|
6
|
+
|
7
|
+
def swap_homoglyphs(
    text: str,
    replacement_rate: float = 0.02,
    classes: list[str] | Literal["all"] | None = None,
    seed: int | None = None,
    rng: random.Random | None = None,
) -> str:
    """Replace characters with visually confusable homoglyphs.

    Parameters
    - text: Input text.
    - replacement_rate: Max proportion of eligible characters to replace (default 0.02).
    - classes: Restrict replacements to these Unicode script classes
      (default ["LATIN","GREEK","CYRILLIC"]). Use "all" to allow any.
    - seed: Optional seed if `rng` not provided.
    - rng: Optional RNG; overrides seed.

    Notes
    - Only alphanumeric characters present in confusables.confusables_data are
      eligible, and only single-codepoint alternatives are used.
    - Eligible POSITIONS are shuffled with the RNG and the character at each
      chosen position is replaced in place. The previous
      ``text.replace(char, ..., 1)`` always hit the first occurrence of the
      character, which biased edits toward the start of the text and could
      re-replace an already substituted homoglyph.
    """
    if rng is None:
        rng = random.Random(seed)

    if classes is None:
        classes = ["LATIN", "GREEK", "CYRILLIC"]

    chars = list(text)
    candidate_positions = [
        index
        for index, char in enumerate(chars)
        if char.isalnum() and char in confusables.confusables_data
    ]
    num_replacements = int(len(candidate_positions) * replacement_rate)

    rng.shuffle(candidate_positions)
    done = 0
    for index in candidate_positions:
        if done >= num_replacements:
            break
        options = [
            o["c"] for o in confusables.confusables_data[chars[index]] if len(o["c"]) == 1
        ]
        if classes != "all":
            options = [opt for opt in options if confusables.alias(opt) in classes]
        if not options:
            # No usable look-alike for this character; it does not count
            # toward the replacement quota.
            continue
        chars[index] = rng.choice(options)
        done += 1

    return "".join(chars)
|
53
|
+
|
54
|
+
|
55
|
+
# Stock homoglyph glitchling: character level, ordered last within that level.
mim1c = Glitchling(
    "Mim1c",
    swap_homoglyphs,
    AttackWave.CHARACTER,
    AttackOrder.LAST,
    replacement_rate=0.02,
    classes=["LATIN", "GREEK", "CYRILLIC"],
)
|
zoo/redactyl.py
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
import re
|
2
|
+
import random
|
3
|
+
from .core import Glitchling, AttackWave
|
4
|
+
|
5
|
+
FULL_BLOCK = "█"
|
6
|
+
|
7
|
+
|
8
|
+
def redact_words(
|
9
|
+
text: str,
|
10
|
+
replacement_char: str = FULL_BLOCK,
|
11
|
+
redaction_rate: float = 0.05,
|
12
|
+
merge_adjacent: bool = False,
|
13
|
+
seed: int = 151,
|
14
|
+
rng: random.Random | None = None,
|
15
|
+
) -> str:
|
16
|
+
"""Redact random words by replacing their characters.
|
17
|
+
|
18
|
+
Parameters
|
19
|
+
- text: Input text.
|
20
|
+
- replacement_char: The character to use for redaction (default FULL_BLOCK).
|
21
|
+
- redaction_rate: Max proportion of words to redact (default 0.05).
|
22
|
+
- merge_adjacent: If True, merges adjacent redactions across intervening non-word chars.
|
23
|
+
- seed: Seed used if `rng` not provided (default 151).
|
24
|
+
- rng: Optional RNG; overrides seed.
|
25
|
+
"""
|
26
|
+
if rng is None:
|
27
|
+
rng = random.Random(seed)
|
28
|
+
|
29
|
+
# Preserve exact spacing and punctuation by using regex
|
30
|
+
tokens = re.split(r"(\s+)", text)
|
31
|
+
word_indices = [i for i, token in enumerate(tokens) if i % 2 == 0 and token.strip()]
|
32
|
+
num_to_redact = max(1, int(len(word_indices) * redaction_rate))
|
33
|
+
|
34
|
+
# Sample from the indices of actual words
|
35
|
+
indices_to_redact = rng.sample(word_indices, k=num_to_redact)
|
36
|
+
indices_to_redact.sort()
|
37
|
+
|
38
|
+
for i in indices_to_redact:
|
39
|
+
if i >= len(tokens):
|
40
|
+
break
|
41
|
+
|
42
|
+
word = tokens[i]
|
43
|
+
if not word or word.isspace(): # Skip empty or whitespace
|
44
|
+
continue
|
45
|
+
|
46
|
+
# Check if word has trailing punctuation
|
47
|
+
match = re.match(r"^(\W*)(.*?)(\W*)$", word)
|
48
|
+
if match:
|
49
|
+
prefix, core, suffix = match.groups()
|
50
|
+
tokens[i] = f"{prefix}{replacement_char * len(core)}{suffix}"
|
51
|
+
else:
|
52
|
+
tokens[i] = f"{replacement_char * len(word)}"
|
53
|
+
|
54
|
+
text = "".join(tokens)
|
55
|
+
|
56
|
+
if merge_adjacent:
|
57
|
+
text = re.sub(
|
58
|
+
rf"{replacement_char}\W+{replacement_char}",
|
59
|
+
lambda m: replacement_char * (len(m.group(0)) - 1),
|
60
|
+
text,
|
61
|
+
)
|
62
|
+
|
63
|
+
return text
|
64
|
+
|
65
|
+
|
66
|
+
# Stock redaction glitchling: word level, fixed seed 151, full-block mask at a 5% rate.
redactyl = Glitchling(
    "Redactyl",
    redact_words,
    AttackWave.WORD,
    seed=151,
    replacement_char=FULL_BLOCK,
    redaction_rate=0.05,
)
|
zoo/reduple.py
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
import re
|
2
|
+
import random
|
3
|
+
from .core import Glitchling, AttackWave
|
4
|
+
|
5
|
+
|
6
|
+
def reduplicate_words(
|
7
|
+
text: str,
|
8
|
+
reduplication_rate: float = 0.05,
|
9
|
+
seed: int | None = None,
|
10
|
+
rng: random.Random | None = None,
|
11
|
+
) -> str:
|
12
|
+
"""Randomly reduplicate words in the text.
|
13
|
+
|
14
|
+
Parameters
|
15
|
+
- text: Input text.
|
16
|
+
- reduplication_rate: Max proportion of words to reduplicate (default 0.05).
|
17
|
+
- seed: Optional seed if `rng` not provided.
|
18
|
+
- rng: Optional RNG; overrides seed.
|
19
|
+
|
20
|
+
Notes
|
21
|
+
- Preserves spacing and punctuation by tokenizing with separators.
|
22
|
+
- Deterministic when run with a fixed seed or via Gaggle.
|
23
|
+
"""
|
24
|
+
if rng is None:
|
25
|
+
rng = random.Random(seed)
|
26
|
+
|
27
|
+
# Preserve exact spacing and punctuation by using regex
|
28
|
+
tokens = re.split(r"(\s+)", text) # Split but keep separators
|
29
|
+
|
30
|
+
for i in range(0, len(tokens), 2): # Every other token is a word
|
31
|
+
if i >= len(tokens):
|
32
|
+
break
|
33
|
+
|
34
|
+
word = tokens[i]
|
35
|
+
if not word or word.isspace(): # Skip empty or whitespace
|
36
|
+
continue
|
37
|
+
|
38
|
+
# Only consider actual words for reduplication
|
39
|
+
if rng.random() < reduplication_rate:
|
40
|
+
# Check if word has trailing punctuation
|
41
|
+
match = re.match(r"^(\W*)(.*?)(\W*)$", word)
|
42
|
+
if match:
|
43
|
+
prefix, core, suffix = match.groups()
|
44
|
+
# Reduplicate with a space: "word" -> "word word"
|
45
|
+
tokens[i] = f"{prefix}{core} {core}{suffix}"
|
46
|
+
else:
|
47
|
+
tokens[i] = f"{word} {word}"
|
48
|
+
|
49
|
+
return "".join(tokens)
|
50
|
+
|
51
|
+
|
52
|
+
# Stock word-reduplication glitchling; works at word level with the function's defaults.
reduple = Glitchling(
    name="Reduple",
    scope=AttackWave.WORD,
    corruption_function=reduplicate_words,
)
|
zoo/rushmore.py
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
import random
|
2
|
+
import re
|
3
|
+
from .core import Glitchling, AttackWave
|
4
|
+
|
5
|
+
|
6
|
+
def delete_random_words(
|
7
|
+
text: str,
|
8
|
+
max_deletion_rate: float = 0.01,
|
9
|
+
seed: int | None = None,
|
10
|
+
rng: random.Random | None = None,
|
11
|
+
) -> str:
|
12
|
+
"""Delete random words from the input text.
|
13
|
+
|
14
|
+
Parameters
|
15
|
+
- text: The input text.
|
16
|
+
- max_deletion_rate: The maximum proportion of words to delete (default 0.01).
|
17
|
+
- seed: Optional seed if `rng` not provided.
|
18
|
+
- rng: Optional RNG; overrides seed.
|
19
|
+
"""
|
20
|
+
if rng is None:
|
21
|
+
rng = random.Random(seed)
|
22
|
+
|
23
|
+
# Preserve exact spacing and punctuation by using regex
|
24
|
+
tokens = re.split(r"(\s+)", text) # Split but keep separators
|
25
|
+
|
26
|
+
for i in range(
|
27
|
+
2, len(tokens), 2
|
28
|
+
): # Every other token is a word, but skip the first word
|
29
|
+
if i >= len(tokens):
|
30
|
+
break
|
31
|
+
|
32
|
+
word = tokens[i]
|
33
|
+
if not word or word.isspace(): # Skip empty or whitespace
|
34
|
+
continue
|
35
|
+
|
36
|
+
# Only consider actual words for deletion
|
37
|
+
if rng.random() < max_deletion_rate:
|
38
|
+
# Check if word has trailing punctuation
|
39
|
+
match = re.match(r"^(\W*)(.*?)(\W*)$", word)
|
40
|
+
if match:
|
41
|
+
prefix, _, suffix = match.groups()
|
42
|
+
tokens[i] = f"{prefix.strip()}{suffix.strip()}"
|
43
|
+
else:
|
44
|
+
tokens[i] = ""
|
45
|
+
|
46
|
+
text = "".join(tokens)
|
47
|
+
text = re.sub(r"\s+([.,;:])", r"\1", text)
|
48
|
+
text = re.sub(r"\s{2,}", " ", text).strip()
|
49
|
+
|
50
|
+
return text
|
51
|
+
|
52
|
+
|
53
|
+
# Stock word-deletion glitchling; works at word level with the function's defaults.
# The display name is capitalised like the other stock glitchlings
# ("Jargoyle", "Mim1c", "Redactyl", "Reduple").
rushmore = Glitchling("Rushmore", delete_random_words, scope=AttackWave.WORD)
|