glitchlings 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings/__init__.py +42 -0
- glitchlings/__main__.py +9 -0
- {dlc → glitchlings/dlc}/prime.py +52 -50
- glitchlings/main.py +238 -0
- glitchlings/util/__init__.py +151 -0
- {zoo → glitchlings/zoo}/__init__.py +57 -50
- {zoo → glitchlings/zoo}/core.py +190 -136
- glitchlings/zoo/jargoyle.py +225 -0
- {zoo → glitchlings/zoo}/mim1c.py +79 -62
- {zoo → glitchlings/zoo}/redactyl.py +91 -73
- {zoo → glitchlings/zoo}/reduple.py +73 -54
- {zoo → glitchlings/zoo}/rushmore.py +74 -53
- {zoo → glitchlings/zoo}/scannequin.py +140 -124
- {zoo → glitchlings/zoo}/typogre.py +231 -224
- {glitchlings-0.1.1.dist-info → glitchlings-0.1.2.dist-info}/METADATA +49 -23
- glitchlings-0.1.2.dist-info/RECORD +20 -0
- {glitchlings-0.1.1.dist-info → glitchlings-0.1.2.dist-info}/licenses/LICENSE +201 -201
- .github/workflows/publish.yml +0 -42
- .gitignore +0 -14
- LICENSE +0 -201
- MONSTER_MANUAL.md +0 -272
- PKG-INFO +0 -429
- README.md +0 -196
- __init__.py +0 -73
- glitchlings-0.1.1.dist-info/RECORD +0 -26
- main.py +0 -6
- pyproject.toml +0 -79
- util/__init__.py +0 -73
- zoo/jargoyle.py +0 -89
- {dlc → glitchlings/dlc}/__init__.py +0 -0
- {glitchlings-0.1.1.dist-info → glitchlings-0.1.2.dist-info}/WHEEL +0 -0
- {glitchlings-0.1.1.dist-info → glitchlings-0.1.2.dist-info}/entry_points.txt +0 -0
{zoo → glitchlings/zoo}/mim1c.py
RENAMED
@@ -1,62 +1,79 @@
|
|
1
|
-
from typing import Literal
|
2
|
-
from .core import Glitchling, AttackWave, AttackOrder
|
3
|
-
import random
|
4
|
-
from confusable_homoglyphs import confusables
|
5
|
-
|
6
|
-
|
7
|
-
def swap_homoglyphs(
|
8
|
-
text: str,
|
9
|
-
replacement_rate: float = 0.02,
|
10
|
-
classes: list[str] | Literal["all"] | None = None,
|
11
|
-
seed: int | None = None,
|
12
|
-
rng: random.Random | None = None,
|
13
|
-
) -> str:
|
14
|
-
"""Replace characters with visually confusable homoglyphs.
|
15
|
-
|
16
|
-
Parameters
|
17
|
-
- text: Input text.
|
18
|
-
- replacement_rate: Max proportion of eligible characters to replace (default 0.02).
|
19
|
-
- classes: Restrict replacements to these Unicode script classes (default ["LATIN","GREEK","CYRILLIC"]). Use "all" to allow any.
|
20
|
-
- seed: Optional seed if `rng` not provided.
|
21
|
-
- rng: Optional RNG; overrides seed.
|
22
|
-
|
23
|
-
Notes
|
24
|
-
- Only replaces characters present in confusables.confusables_data with single-codepoint alternatives.
|
25
|
-
- Maintains determinism by shuffling candidates and sampling via the provided RNG.
|
26
|
-
"""
|
27
|
-
if rng is None:
|
28
|
-
rng = random.Random(seed)
|
29
|
-
|
30
|
-
if classes is None:
|
31
|
-
classes = ["LATIN", "GREEK", "CYRILLIC"]
|
32
|
-
|
33
|
-
target_chars = [char for char in text if char.isalnum()]
|
34
|
-
confusable_chars = [
|
35
|
-
char for char in target_chars if char in confusables.confusables_data
|
36
|
-
]
|
37
|
-
num_replacements = int(len(confusable_chars) * replacement_rate)
|
38
|
-
done = 0
|
39
|
-
rng.shuffle(confusable_chars)
|
40
|
-
for char in confusable_chars:
|
41
|
-
if done >= num_replacements:
|
42
|
-
break
|
43
|
-
options = [
|
44
|
-
o["c"] for o in confusables.confusables_data[char] if len(o["c"]) == 1
|
45
|
-
]
|
46
|
-
if classes != "all":
|
47
|
-
options = [opt for opt in options if confusables.alias(opt) in classes]
|
48
|
-
if not options:
|
49
|
-
continue
|
50
|
-
text = text.replace(char, rng.choice(options), 1)
|
51
|
-
done += 1
|
52
|
-
return text
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
1
|
+
from typing import Literal
|
2
|
+
from .core import Glitchling, AttackWave, AttackOrder
|
3
|
+
import random
|
4
|
+
from confusable_homoglyphs import confusables
|
5
|
+
|
6
|
+
|
7
|
+
def swap_homoglyphs(
    text: str,
    replacement_rate: float = 0.02,
    classes: list[str] | Literal["all"] | None = None,
    seed: int | None = None,
    rng: random.Random | None = None,
) -> str:
    """Replace characters with visually confusable homoglyphs.

    Parameters
    - text: Input text.
    - replacement_rate: Max proportion of eligible characters to replace (default 0.02).
    - classes: Restrict replacements to these Unicode script classes (default ["LATIN","GREEK","CYRILLIC"]). Use "all" to allow any.
    - seed: Optional seed if `rng` not provided.
    - rng: Optional RNG; overrides seed.

    Returns
    - The text with at most int(eligible * replacement_rate) characters swapped.

    Notes
    - Only replaces alphanumeric characters present in confusables.confusables_data with single-codepoint alternatives.
    - Candidates are tracked by position, so the occurrence that gets swapped is the
      one that was drawn (not merely the first occurrence of that character), and a
      homoglyph inserted earlier in the run is never swapped a second time.
    - Maintains determinism by shuffling candidates and sampling via the provided RNG.
    """
    if rng is None:
        rng = random.Random(seed)

    if classes is None:
        classes = ["LATIN", "GREEK", "CYRILLIC"]

    chars = list(text)
    # Positions (not characters) of every eligible character in the input.
    candidate_positions = [
        pos
        for pos, char in enumerate(chars)
        if char.isalnum() and char in confusables.confusables_data
    ]
    num_replacements = int(len(candidate_positions) * replacement_rate)
    rng.shuffle(candidate_positions)

    done = 0
    for pos in candidate_positions:
        if done >= num_replacements:
            break
        options = [
            o["c"]
            for o in confusables.confusables_data[chars[pos]]
            if len(o["c"]) == 1
        ]
        if classes != "all":
            options = [opt for opt in options if confusables.alias(opt) in classes]
        if not options:
            # No usable look-alike for this character; try the next candidate.
            continue
        chars[pos] = rng.choice(options)
        done += 1
    return "".join(chars)
|
53
|
+
|
54
|
+
|
55
|
+
class Mim1c(Glitchling):
    """Glitchling whose corruption function is `swap_homoglyphs`."""

    def __init__(
        self,
        *,
        replacement_rate: float = 0.02,
        classes: list[str] | Literal["all"] | None = None,
        seed: int | None = None,
    ) -> None:
        """Configure the glitchling.

        Parameters
        - replacement_rate: Forwarded to the base initializer as a keyword argument (default 0.02).
        - classes: Forwarded to the base initializer as a keyword argument (default None).
        - seed: Forwarded to the base initializer as `seed`.
        """
        # Tuning knobs are passed through unchanged, after the fixed arguments.
        tuning = {"replacement_rate": replacement_rate, "classes": classes}
        super().__init__(
            name="Mim1c",
            corruption_function=swap_homoglyphs,
            scope=AttackWave.CHARACTER,
            order=AttackOrder.LAST,
            seed=seed,
            **tuning,
        )


# Module-level instance built with the default settings.
mim1c = Mim1c()


__all__ = ["Mim1c", "mim1c"]
|
@@ -1,73 +1,91 @@
|
|
1
|
-
import re
|
2
|
-
import random
|
3
|
-
from .core import Glitchling, AttackWave
|
4
|
-
|
5
|
-
FULL_BLOCK = "█"
|
6
|
-
|
7
|
-
|
8
|
-
def redact_words(
|
9
|
-
text: str,
|
10
|
-
replacement_char: str = FULL_BLOCK,
|
11
|
-
redaction_rate: float = 0.05,
|
12
|
-
merge_adjacent: bool = False,
|
13
|
-
seed: int = 151,
|
14
|
-
rng: random.Random | None = None,
|
15
|
-
) -> str:
|
16
|
-
"""Redact random words by replacing their characters.
|
17
|
-
|
18
|
-
Parameters
|
19
|
-
- text: Input text.
|
20
|
-
- replacement_char: The character to use for redaction (default FULL_BLOCK).
|
21
|
-
- redaction_rate: Max proportion of words to redact (default 0.05).
|
22
|
-
- merge_adjacent: If True, merges adjacent redactions across intervening non-word chars.
|
23
|
-
- seed: Seed used if `rng` not provided (default 151).
|
24
|
-
- rng: Optional RNG; overrides seed.
|
25
|
-
"""
|
26
|
-
if rng is None:
|
27
|
-
rng = random.Random(seed)
|
28
|
-
|
29
|
-
# Preserve exact spacing and punctuation by using regex
|
30
|
-
tokens = re.split(r"(\s+)", text)
|
31
|
-
word_indices = [i for i, token in enumerate(tokens) if i % 2 == 0 and token.strip()]
|
32
|
-
num_to_redact = max(1, int(len(word_indices) * redaction_rate))
|
33
|
-
|
34
|
-
# Sample from the indices of actual words
|
35
|
-
indices_to_redact = rng.sample(word_indices, k=num_to_redact)
|
36
|
-
indices_to_redact.sort()
|
37
|
-
|
38
|
-
for i in indices_to_redact:
|
39
|
-
if i >= len(tokens):
|
40
|
-
break
|
41
|
-
|
42
|
-
word = tokens[i]
|
43
|
-
if not word or word.isspace(): # Skip empty or whitespace
|
44
|
-
continue
|
45
|
-
|
46
|
-
# Check if word has trailing punctuation
|
47
|
-
match = re.match(r"^(\W*)(.*?)(\W*)$", word)
|
48
|
-
if match:
|
49
|
-
prefix, core, suffix = match.groups()
|
50
|
-
tokens[i] = f"{prefix}{replacement_char * len(core)}{suffix}"
|
51
|
-
else:
|
52
|
-
tokens[i] = f"{replacement_char * len(word)}"
|
53
|
-
|
54
|
-
text = "".join(tokens)
|
55
|
-
|
56
|
-
if merge_adjacent:
|
57
|
-
text = re.sub(
|
58
|
-
rf"{replacement_char}\W+{replacement_char}",
|
59
|
-
lambda m: replacement_char * (len(m.group(0)) - 1),
|
60
|
-
text,
|
61
|
-
)
|
62
|
-
|
63
|
-
return text
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
1
|
+
import re
|
2
|
+
import random
|
3
|
+
from .core import Glitchling, AttackWave
|
4
|
+
|
5
|
+
FULL_BLOCK = "█"
|
6
|
+
|
7
|
+
|
8
|
+
def redact_words(
|
9
|
+
text: str,
|
10
|
+
replacement_char: str = FULL_BLOCK,
|
11
|
+
redaction_rate: float = 0.05,
|
12
|
+
merge_adjacent: bool = False,
|
13
|
+
seed: int = 151,
|
14
|
+
rng: random.Random | None = None,
|
15
|
+
) -> str:
|
16
|
+
"""Redact random words by replacing their characters.
|
17
|
+
|
18
|
+
Parameters
|
19
|
+
- text: Input text.
|
20
|
+
- replacement_char: The character to use for redaction (default FULL_BLOCK).
|
21
|
+
- redaction_rate: Max proportion of words to redact (default 0.05).
|
22
|
+
- merge_adjacent: If True, merges adjacent redactions across intervening non-word chars.
|
23
|
+
- seed: Seed used if `rng` not provided (default 151).
|
24
|
+
- rng: Optional RNG; overrides seed.
|
25
|
+
"""
|
26
|
+
if rng is None:
|
27
|
+
rng = random.Random(seed)
|
28
|
+
|
29
|
+
# Preserve exact spacing and punctuation by using regex
|
30
|
+
tokens = re.split(r"(\s+)", text)
|
31
|
+
word_indices = [i for i, token in enumerate(tokens) if i % 2 == 0 and token.strip()]
|
32
|
+
num_to_redact = max(1, int(len(word_indices) * redaction_rate))
|
33
|
+
|
34
|
+
# Sample from the indices of actual words
|
35
|
+
indices_to_redact = rng.sample(word_indices, k=num_to_redact)
|
36
|
+
indices_to_redact.sort()
|
37
|
+
|
38
|
+
for i in indices_to_redact:
|
39
|
+
if i >= len(tokens):
|
40
|
+
break
|
41
|
+
|
42
|
+
word = tokens[i]
|
43
|
+
if not word or word.isspace(): # Skip empty or whitespace
|
44
|
+
continue
|
45
|
+
|
46
|
+
# Check if word has trailing punctuation
|
47
|
+
match = re.match(r"^(\W*)(.*?)(\W*)$", word)
|
48
|
+
if match:
|
49
|
+
prefix, core, suffix = match.groups()
|
50
|
+
tokens[i] = f"{prefix}{replacement_char * len(core)}{suffix}"
|
51
|
+
else:
|
52
|
+
tokens[i] = f"{replacement_char * len(word)}"
|
53
|
+
|
54
|
+
text = "".join(tokens)
|
55
|
+
|
56
|
+
if merge_adjacent:
|
57
|
+
text = re.sub(
|
58
|
+
rf"{replacement_char}\W+{replacement_char}",
|
59
|
+
lambda m: replacement_char * (len(m.group(0)) - 1),
|
60
|
+
text,
|
61
|
+
)
|
62
|
+
|
63
|
+
return text
|
64
|
+
|
65
|
+
|
66
|
+
class Redactyl(Glitchling):
    """Glitchling whose corruption function is `redact_words`."""

    def __init__(
        self,
        *,
        replacement_char: str = FULL_BLOCK,
        redaction_rate: float = 0.05,
        merge_adjacent: bool = False,
        seed: int = 151,
    ) -> None:
        """Configure the glitchling.

        Parameters
        - replacement_char: Forwarded to the base initializer as a keyword argument (default FULL_BLOCK).
        - redaction_rate: Forwarded to the base initializer as a keyword argument (default 0.05).
        - merge_adjacent: Forwarded to the base initializer as a keyword argument (default False).
        - seed: Forwarded to the base initializer as `seed` (default 151).
        """
        # Tuning knobs are passed through unchanged, after the fixed arguments.
        tuning = {
            "replacement_char": replacement_char,
            "redaction_rate": redaction_rate,
            "merge_adjacent": merge_adjacent,
        }
        super().__init__(
            name="Redactyl",
            corruption_function=redact_words,
            scope=AttackWave.WORD,
            seed=seed,
            **tuning,
        )


# Module-level instance built with the default settings.
redactyl = Redactyl()


__all__ = ["Redactyl", "redactyl"]
|
@@ -1,54 +1,73 @@
|
|
1
|
-
import re
|
2
|
-
import random
|
3
|
-
from .core import Glitchling, AttackWave
|
4
|
-
|
5
|
-
|
6
|
-
def reduplicate_words(
|
7
|
-
text: str,
|
8
|
-
reduplication_rate: float = 0.05,
|
9
|
-
seed: int | None = None,
|
10
|
-
rng: random.Random | None = None,
|
11
|
-
) -> str:
|
12
|
-
"""Randomly reduplicate words in the text.
|
13
|
-
|
14
|
-
Parameters
|
15
|
-
- text: Input text.
|
16
|
-
- reduplication_rate: Max proportion of words to reduplicate (default 0.05).
|
17
|
-
- seed: Optional seed if `rng` not provided.
|
18
|
-
- rng: Optional RNG; overrides seed.
|
19
|
-
|
20
|
-
Notes
|
21
|
-
- Preserves spacing and punctuation by tokenizing with separators.
|
22
|
-
- Deterministic when run with a fixed seed or via Gaggle.
|
23
|
-
"""
|
24
|
-
if rng is None:
|
25
|
-
rng = random.Random(seed)
|
26
|
-
|
27
|
-
# Preserve exact spacing and punctuation by using regex
|
28
|
-
tokens = re.split(r"(\s+)", text) # Split but keep separators
|
29
|
-
|
30
|
-
for i in range(0, len(tokens), 2): # Every other token is a word
|
31
|
-
if i >= len(tokens):
|
32
|
-
break
|
33
|
-
|
34
|
-
word = tokens[i]
|
35
|
-
if not word or word.isspace(): # Skip empty or whitespace
|
36
|
-
continue
|
37
|
-
|
38
|
-
# Only consider actual words for reduplication
|
39
|
-
if rng.random() < reduplication_rate:
|
40
|
-
# Check if word has trailing punctuation
|
41
|
-
match = re.match(r"^(\W*)(.*?)(\W*)$", word)
|
42
|
-
if match:
|
43
|
-
prefix, core, suffix = match.groups()
|
44
|
-
# Reduplicate with a space: "word" -> "word word"
|
45
|
-
tokens[i] = f"{prefix}{core} {core}{suffix}"
|
46
|
-
else:
|
47
|
-
tokens[i] = f"{word} {word}"
|
48
|
-
|
49
|
-
return "".join(tokens)
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
1
|
+
import re
|
2
|
+
import random
|
3
|
+
from .core import Glitchling, AttackWave
|
4
|
+
|
5
|
+
|
6
|
+
def reduplicate_words(
|
7
|
+
text: str,
|
8
|
+
reduplication_rate: float = 0.05,
|
9
|
+
seed: int | None = None,
|
10
|
+
rng: random.Random | None = None,
|
11
|
+
) -> str:
|
12
|
+
"""Randomly reduplicate words in the text.
|
13
|
+
|
14
|
+
Parameters
|
15
|
+
- text: Input text.
|
16
|
+
- reduplication_rate: Max proportion of words to reduplicate (default 0.05).
|
17
|
+
- seed: Optional seed if `rng` not provided.
|
18
|
+
- rng: Optional RNG; overrides seed.
|
19
|
+
|
20
|
+
Notes
|
21
|
+
- Preserves spacing and punctuation by tokenizing with separators.
|
22
|
+
- Deterministic when run with a fixed seed or via Gaggle.
|
23
|
+
"""
|
24
|
+
if rng is None:
|
25
|
+
rng = random.Random(seed)
|
26
|
+
|
27
|
+
# Preserve exact spacing and punctuation by using regex
|
28
|
+
tokens = re.split(r"(\s+)", text) # Split but keep separators
|
29
|
+
|
30
|
+
for i in range(0, len(tokens), 2): # Every other token is a word
|
31
|
+
if i >= len(tokens):
|
32
|
+
break
|
33
|
+
|
34
|
+
word = tokens[i]
|
35
|
+
if not word or word.isspace(): # Skip empty or whitespace
|
36
|
+
continue
|
37
|
+
|
38
|
+
# Only consider actual words for reduplication
|
39
|
+
if rng.random() < reduplication_rate:
|
40
|
+
# Check if word has trailing punctuation
|
41
|
+
match = re.match(r"^(\W*)(.*?)(\W*)$", word)
|
42
|
+
if match:
|
43
|
+
prefix, core, suffix = match.groups()
|
44
|
+
# Reduplicate with a space: "word" -> "word word"
|
45
|
+
tokens[i] = f"{prefix}{core} {core}{suffix}"
|
46
|
+
else:
|
47
|
+
tokens[i] = f"{word} {word}"
|
48
|
+
|
49
|
+
return "".join(tokens)
|
50
|
+
|
51
|
+
|
52
|
+
class Reduple(Glitchling):
    """Glitchling whose corruption function is `reduplicate_words`."""

    def __init__(
        self,
        *,
        reduplication_rate: float = 0.05,
        seed: int | None = None,
    ) -> None:
        """Configure the glitchling.

        Parameters
        - reduplication_rate: Forwarded to the base initializer as a keyword argument (default 0.05).
        - seed: Forwarded to the base initializer as `seed`.
        """
        # Tuning knobs are passed through unchanged, after the fixed arguments.
        tuning = {"reduplication_rate": reduplication_rate}
        super().__init__(
            name="Reduple",
            corruption_function=reduplicate_words,
            scope=AttackWave.WORD,
            seed=seed,
            **tuning,
        )


# Module-level instance built with the default settings.
reduple = Reduple()


__all__ = ["Reduple", "reduple"]
|
@@ -1,53 +1,74 @@
|
|
1
|
-
import random
|
2
|
-
import re
|
3
|
-
from .core import Glitchling, AttackWave
|
4
|
-
|
5
|
-
|
6
|
-
def delete_random_words(
|
7
|
-
text: str,
|
8
|
-
max_deletion_rate: float = 0.01,
|
9
|
-
seed: int | None = None,
|
10
|
-
rng: random.Random | None = None,
|
11
|
-
) -> str:
|
12
|
-
"""Delete random words from the input text.
|
13
|
-
|
14
|
-
Parameters
|
15
|
-
- text: The input text.
|
16
|
-
- max_deletion_rate: The maximum proportion of words to delete (default 0.01).
|
17
|
-
- seed: Optional seed if `rng` not provided.
|
18
|
-
- rng: Optional RNG; overrides seed.
|
19
|
-
"""
|
20
|
-
if rng is None:
|
21
|
-
rng = random.Random(seed)
|
22
|
-
|
23
|
-
# Preserve exact spacing and punctuation by using regex
|
24
|
-
tokens = re.split(r"(\s+)", text) # Split but keep separators
|
25
|
-
|
26
|
-
for i in range(
|
27
|
-
2, len(tokens), 2
|
28
|
-
): # Every other token is a word, but skip the first word
|
29
|
-
if i >= len(tokens):
|
30
|
-
break
|
31
|
-
|
32
|
-
word = tokens[i]
|
33
|
-
if not word or word.isspace(): # Skip empty or whitespace
|
34
|
-
continue
|
35
|
-
|
36
|
-
# Only consider actual words for deletion
|
37
|
-
if rng.random() < max_deletion_rate:
|
38
|
-
# Check if word has trailing punctuation
|
39
|
-
match = re.match(r"^(\W*)(.*?)(\W*)$", word)
|
40
|
-
if match:
|
41
|
-
prefix, _, suffix = match.groups()
|
42
|
-
tokens[i] = f"{prefix.strip()}{suffix.strip()}"
|
43
|
-
else:
|
44
|
-
tokens[i] = ""
|
45
|
-
|
46
|
-
text = "".join(tokens)
|
47
|
-
text = re.sub(r"\s+([.,;:])", r"\1", text)
|
48
|
-
text = re.sub(r"\s{2,}", " ", text).strip()
|
49
|
-
|
50
|
-
return text
|
51
|
-
|
52
|
-
|
53
|
-
|
1
|
+
import random
|
2
|
+
import re
|
3
|
+
from .core import Glitchling, AttackWave
|
4
|
+
|
5
|
+
|
6
|
+
def delete_random_words(
|
7
|
+
text: str,
|
8
|
+
max_deletion_rate: float = 0.01,
|
9
|
+
seed: int | None = None,
|
10
|
+
rng: random.Random | None = None,
|
11
|
+
) -> str:
|
12
|
+
"""Delete random words from the input text.
|
13
|
+
|
14
|
+
Parameters
|
15
|
+
- text: The input text.
|
16
|
+
- max_deletion_rate: The maximum proportion of words to delete (default 0.01).
|
17
|
+
- seed: Optional seed if `rng` not provided.
|
18
|
+
- rng: Optional RNG; overrides seed.
|
19
|
+
"""
|
20
|
+
if rng is None:
|
21
|
+
rng = random.Random(seed)
|
22
|
+
|
23
|
+
# Preserve exact spacing and punctuation by using regex
|
24
|
+
tokens = re.split(r"(\s+)", text) # Split but keep separators
|
25
|
+
|
26
|
+
for i in range(
|
27
|
+
2, len(tokens), 2
|
28
|
+
): # Every other token is a word, but skip the first word
|
29
|
+
if i >= len(tokens):
|
30
|
+
break
|
31
|
+
|
32
|
+
word = tokens[i]
|
33
|
+
if not word or word.isspace(): # Skip empty or whitespace
|
34
|
+
continue
|
35
|
+
|
36
|
+
# Only consider actual words for deletion
|
37
|
+
if rng.random() < max_deletion_rate:
|
38
|
+
# Check if word has trailing punctuation
|
39
|
+
match = re.match(r"^(\W*)(.*?)(\W*)$", word)
|
40
|
+
if match:
|
41
|
+
prefix, _, suffix = match.groups()
|
42
|
+
tokens[i] = f"{prefix.strip()}{suffix.strip()}"
|
43
|
+
else:
|
44
|
+
tokens[i] = ""
|
45
|
+
|
46
|
+
text = "".join(tokens)
|
47
|
+
text = re.sub(r"\s+([.,;:])", r"\1", text)
|
48
|
+
text = re.sub(r"\s{2,}", " ", text).strip()
|
49
|
+
|
50
|
+
return text
|
51
|
+
|
52
|
+
|
53
|
+
class Rushmore(Glitchling):
    """Glitchling whose corruption function is `delete_random_words`."""

    def __init__(
        self,
        *,
        max_deletion_rate: float = 0.01,
        seed: int | None = None,
    ) -> None:
        """Configure the glitchling.

        Parameters
        - max_deletion_rate: Forwarded to the base initializer as a keyword argument (default 0.01).
        - seed: Forwarded to the base initializer as `seed`.
        """
        # Tuning knobs are passed through unchanged, after the fixed arguments.
        tuning = {"max_deletion_rate": max_deletion_rate}
        super().__init__(
            name="Rushmore",
            corruption_function=delete_random_words,
            scope=AttackWave.WORD,
            seed=seed,
            **tuning,
        )


# Module-level instance built with the default settings.
rushmore = Rushmore()


__all__ = ["Rushmore", "rushmore"]
|