glitchlings 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,73 +1,128 @@
1
- import re
2
- import random
3
- from .core import Glitchling, AttackWave
4
-
5
- FULL_BLOCK = "█"
6
-
7
-
8
- def redact_words(
9
- text: str,
10
- replacement_char: str = FULL_BLOCK,
11
- redaction_rate: float = 0.05,
12
- merge_adjacent: bool = False,
13
- seed: int = 151,
14
- rng: random.Random | None = None,
15
- ) -> str:
16
- """Redact random words by replacing their characters.
17
-
18
- Parameters
19
- - text: Input text.
20
- - replacement_char: The character to use for redaction (default FULL_BLOCK).
21
- - redaction_rate: Max proportion of words to redact (default 0.05).
22
- - merge_adjacent: If True, merges adjacent redactions across intervening non-word chars.
23
- - seed: Seed used if `rng` not provided (default 151).
24
- - rng: Optional RNG; overrides seed.
25
- """
26
- if rng is None:
27
- rng = random.Random(seed)
28
-
29
- # Preserve exact spacing and punctuation by using regex
30
- tokens = re.split(r"(\s+)", text)
31
- word_indices = [i for i, token in enumerate(tokens) if i % 2 == 0 and token.strip()]
32
- num_to_redact = max(1, int(len(word_indices) * redaction_rate))
33
-
34
- # Sample from the indices of actual words
35
- indices_to_redact = rng.sample(word_indices, k=num_to_redact)
36
- indices_to_redact.sort()
37
-
38
- for i in indices_to_redact:
39
- if i >= len(tokens):
40
- break
41
-
42
- word = tokens[i]
43
- if not word or word.isspace(): # Skip empty or whitespace
44
- continue
45
-
46
- # Check if word has trailing punctuation
47
- match = re.match(r"^(\W*)(.*?)(\W*)$", word)
48
- if match:
49
- prefix, core, suffix = match.groups()
50
- tokens[i] = f"{prefix}{replacement_char * len(core)}{suffix}"
51
- else:
52
- tokens[i] = f"{replacement_char * len(word)}"
53
-
54
- text = "".join(tokens)
55
-
56
- if merge_adjacent:
57
- text = re.sub(
58
- rf"{replacement_char}\W+{replacement_char}",
59
- lambda m: replacement_char * (len(m.group(0)) - 1),
60
- text,
61
- )
62
-
63
- return text
64
-
65
-
66
# Module-level Redactyl instance, constructed directly from Glitchling.
redactyl = Glitchling(
    name="Redactyl",
    corruption_function=redact_words,
    scope=AttackWave.WORD,
    seed=151,
    # Extra keyword arguments, kept in their original relative order.
    replacement_char=FULL_BLOCK,
    redaction_rate=0.05,
)
1
+ import re
2
+ import random
3
+
4
+ from .core import Glitchling, AttackWave
5
+
6
+ FULL_BLOCK = "█"
7
+
8
+
9
+ try:
10
+ from glitchlings._zoo_rust import redact_words as _redact_words_rust
11
+ except ImportError: # pragma: no cover - compiled extension not present
12
+ _redact_words_rust = None
13
+
14
+
15
+ def _python_redact_words(
16
+ text: str,
17
+ *,
18
+ replacement_char: str,
19
+ redaction_rate: float,
20
+ merge_adjacent: bool,
21
+ rng: random.Random,
22
+ ) -> str:
23
+ """Redact random words by replacing their characters.
24
+
25
+ Parameters
26
+ - text: Input text.
27
+ - replacement_char: The character to use for redaction (default FULL_BLOCK).
28
+ - redaction_rate: Max proportion of words to redact (default 0.05).
29
+ - merge_adjacent: If True, merges adjacent redactions across intervening non-word chars.
30
+ - seed: Seed used if `rng` not provided (default 151).
31
+ - rng: Optional RNG; overrides seed.
32
+ """
33
+ # Preserve exact spacing and punctuation by using regex
34
+ tokens = re.split(r"(\s+)", text)
35
+ word_indices = [i for i, token in enumerate(tokens) if i % 2 == 0 and token.strip()]
36
+ if not word_indices:
37
+ raise ValueError("Cannot redact words because the input text contains no redactable words.")
38
+ num_to_redact = max(1, int(len(word_indices) * redaction_rate))
39
+
40
+ # Sample from the indices of actual words
41
+ indices_to_redact = rng.sample(word_indices, k=num_to_redact)
42
+ indices_to_redact.sort()
43
+
44
+ for i in indices_to_redact:
45
+ if i >= len(tokens):
46
+ break
47
+
48
+ word = tokens[i]
49
+ if not word or word.isspace(): # Skip empty or whitespace
50
+ continue
51
+
52
+ # Check if word has trailing punctuation
53
+ match = re.match(r"^(\W*)(.*?)(\W*)$", word)
54
+ if match:
55
+ prefix, core, suffix = match.groups()
56
+ tokens[i] = f"{prefix}{replacement_char * len(core)}{suffix}"
57
+ else:
58
+ tokens[i] = f"{replacement_char * len(word)}"
59
+
60
+ text = "".join(tokens)
61
+
62
+ if merge_adjacent:
63
+ text = re.sub(
64
+ rf"{replacement_char}\W+{replacement_char}",
65
+ lambda m: replacement_char * (len(m.group(0)) - 1),
66
+ text,
67
+ )
68
+
69
+ return text
70
+
71
+
72
+ def redact_words(
73
+ text: str,
74
+ replacement_char: str = FULL_BLOCK,
75
+ redaction_rate: float = 0.05,
76
+ merge_adjacent: bool = False,
77
+ seed: int = 151,
78
+ rng: random.Random | None = None,
79
+ ) -> str:
80
+ """Redact random words by replacing their characters."""
81
+
82
+ if rng is None:
83
+ rng = random.Random(seed)
84
+
85
+ if _redact_words_rust is not None:
86
+ return _redact_words_rust(
87
+ text,
88
+ replacement_char,
89
+ redaction_rate,
90
+ merge_adjacent,
91
+ rng,
92
+ )
93
+
94
+ return _python_redact_words(
95
+ text,
96
+ replacement_char=replacement_char,
97
+ redaction_rate=redaction_rate,
98
+ merge_adjacent=merge_adjacent,
99
+ rng=rng,
100
+ )
101
+
102
+
103
class Redactyl(Glitchling):
    """Word-scope glitchling whose corruption function is ``redact_words``."""

    def __init__(
        self,
        *,
        replacement_char: str = FULL_BLOCK,
        redaction_rate: float = 0.05,
        merge_adjacent: bool = False,
        seed: int = 151,
    ) -> None:
        # Redaction settings are handed to Glitchling.__init__ as keyword
        # arguments, in the same order as before.
        redaction_settings = {
            "replacement_char": replacement_char,
            "redaction_rate": redaction_rate,
            "merge_adjacent": merge_adjacent,
        }
        super().__init__(
            name="Redactyl",
            corruption_function=redact_words,
            scope=AttackWave.WORD,
            seed=seed,
            **redaction_settings,
        )


# Ready-made instance using every default.
redactyl = Redactyl()


__all__ = ["Redactyl", "redactyl"]
@@ -1,54 +1,100 @@
1
- import re
2
- import random
3
- from .core import Glitchling, AttackWave
4
-
5
-
6
- def reduplicate_words(
7
- text: str,
8
- reduplication_rate: float = 0.05,
9
- seed: int | None = None,
10
- rng: random.Random | None = None,
11
- ) -> str:
12
- """Randomly reduplicate words in the text.
13
-
14
- Parameters
15
- - text: Input text.
16
- - reduplication_rate: Max proportion of words to reduplicate (default 0.05).
17
- - seed: Optional seed if `rng` not provided.
18
- - rng: Optional RNG; overrides seed.
19
-
20
- Notes
21
- - Preserves spacing and punctuation by tokenizing with separators.
22
- - Deterministic when run with a fixed seed or via Gaggle.
23
- """
24
- if rng is None:
25
- rng = random.Random(seed)
26
-
27
- # Preserve exact spacing and punctuation by using regex
28
- tokens = re.split(r"(\s+)", text) # Split but keep separators
29
-
30
- for i in range(0, len(tokens), 2): # Every other token is a word
31
- if i >= len(tokens):
32
- break
33
-
34
- word = tokens[i]
35
- if not word or word.isspace(): # Skip empty or whitespace
36
- continue
37
-
38
- # Only consider actual words for reduplication
39
- if rng.random() < reduplication_rate:
40
- # Check if word has trailing punctuation
41
- match = re.match(r"^(\W*)(.*?)(\W*)$", word)
42
- if match:
43
- prefix, core, suffix = match.groups()
44
- # Reduplicate with a space: "word" -> "word word"
45
- tokens[i] = f"{prefix}{core} {core}{suffix}"
46
- else:
47
- tokens[i] = f"{word} {word}"
48
-
49
- return "".join(tokens)
50
-
51
-
52
# Module-level Reduple instance, constructed directly from Glitchling.
reduple = Glitchling(
    name="Reduple",
    corruption_function=reduplicate_words,
    scope=AttackWave.WORD,
)
1
+ import re
2
+ import random
3
+
4
+ from .core import Glitchling, AttackWave
5
+
6
+ try:
7
+ from glitchlings._zoo_rust import reduplicate_words as _reduplicate_words_rust
8
+ except ImportError: # pragma: no cover - compiled extension not present
9
+ _reduplicate_words_rust = None
10
+
11
+
12
+ def _python_reduplicate_words(
13
+ text: str,
14
+ *,
15
+ reduplication_rate: float,
16
+ rng: random.Random,
17
+ ) -> str:
18
+ """Randomly reduplicate words in the text.
19
+
20
+ Parameters
21
+ - text: Input text.
22
+ - reduplication_rate: Max proportion of words to reduplicate (default 0.05).
23
+ - seed: Optional seed if `rng` not provided.
24
+ - rng: Optional RNG; overrides seed.
25
+
26
+ Notes
27
+ - Preserves spacing and punctuation by tokenizing with separators.
28
+ - Deterministic when run with a fixed seed or via Gaggle.
29
+ """
30
+ # Preserve exact spacing and punctuation by using regex
31
+ tokens = re.split(r"(\s+)", text) # Split but keep separators
32
+
33
+ for i in range(0, len(tokens), 2): # Every other token is a word
34
+ if i >= len(tokens):
35
+ break
36
+
37
+ word = tokens[i]
38
+ if not word or word.isspace(): # Skip empty or whitespace
39
+ continue
40
+
41
+ # Only consider actual words for reduplication
42
+ if rng.random() < reduplication_rate:
43
+ # Check if word has trailing punctuation
44
+ match = re.match(r"^(\W*)(.*?)(\W*)$", word)
45
+ if match:
46
+ prefix, core, suffix = match.groups()
47
+ # Reduplicate with a space: "word" -> "word word"
48
+ tokens[i] = f"{prefix}{core} {core}{suffix}"
49
+ else:
50
+ tokens[i] = f"{word} {word}"
51
+ return "".join(tokens)
52
+
53
+
54
+ def reduplicate_words(
55
+ text: str,
56
+ reduplication_rate: float = 0.05,
57
+ seed: int | None = None,
58
+ rng: random.Random | None = None,
59
+ ) -> str:
60
+ """Randomly reduplicate words in the text.
61
+
62
+ Falls back to the Python implementation when the optional Rust
63
+ extension is unavailable.
64
+ """
65
+
66
+ if rng is None:
67
+ rng = random.Random(seed)
68
+
69
+ if _reduplicate_words_rust is not None:
70
+ return _reduplicate_words_rust(text, reduplication_rate, rng)
71
+
72
+ return _python_reduplicate_words(
73
+ text,
74
+ reduplication_rate=reduplication_rate,
75
+ rng=rng,
76
+ )
77
+
78
+
79
class Reduple(Glitchling):
    """Word-scope glitchling whose corruption function is ``reduplicate_words``."""

    def __init__(
        self,
        *,
        reduplication_rate: float = 0.05,
        seed: int | None = None,
    ) -> None:
        # The rate is handed to Glitchling.__init__ as a keyword argument.
        reduplication_settings = {"reduplication_rate": reduplication_rate}
        super().__init__(
            name="Reduple",
            corruption_function=reduplicate_words,
            scope=AttackWave.WORD,
            seed=seed,
            **reduplication_settings,
        )


# Ready-made instance using every default.
reduple = Reduple()


__all__ = ["Reduple", "reduple"]
@@ -0,0 +1,97 @@
1
+ import math
2
+ import random
3
+ import re
4
+
5
+ from .core import Glitchling, AttackWave
6
+
7
+ try:
8
+ from glitchlings._zoo_rust import delete_random_words as _delete_random_words_rust
9
+ except ImportError: # pragma: no cover - compiled extension not present
10
+ _delete_random_words_rust = None
11
+
12
+
13
+ def _python_delete_random_words(
14
+ text: str,
15
+ *,
16
+ max_deletion_rate: float,
17
+ rng: random.Random,
18
+ ) -> str:
19
+ """Delete random words from the input text while preserving whitespace."""
20
+
21
+ tokens = re.split(r"(\s+)", text) # Split but keep separators for later rejoin
22
+
23
+ candidate_indices: list[int] = []
24
+ for i in range(2, len(tokens), 2): # Every other token is a word, skip the first word
25
+ word = tokens[i]
26
+ if not word or word.isspace():
27
+ continue
28
+
29
+ candidate_indices.append(i)
30
+
31
+ allowed_deletions = math.floor(len(candidate_indices) * max_deletion_rate)
32
+ if allowed_deletions <= 0:
33
+ return text
34
+
35
+ for i in candidate_indices:
36
+ if rng.random() < max_deletion_rate:
37
+ word = tokens[i]
38
+ match = re.match(r"^(\W*)(.*?)(\W*)$", word)
39
+ if match:
40
+ prefix, _, suffix = match.groups()
41
+ tokens[i] = f"{prefix.strip()}{suffix.strip()}"
42
+ else:
43
+ tokens[i] = ""
44
+
45
+ text = "".join(tokens)
46
+ text = re.sub(r"\s+([.,;:])", r"\1", text)
47
+ text = re.sub(r"\s{2,}", " ", text).strip()
48
+
49
+ return text
50
+
51
+
52
+ def delete_random_words(
53
+ text: str,
54
+ max_deletion_rate: float = 0.01,
55
+ seed: int | None = None,
56
+ rng: random.Random | None = None,
57
+ ) -> str:
58
+ """Delete random words from the input text.
59
+
60
+ Uses the optional Rust implementation when available.
61
+ """
62
+
63
+ if rng is None:
64
+ rng = random.Random(seed)
65
+
66
+ if _delete_random_words_rust is not None:
67
+ return _delete_random_words_rust(text, max_deletion_rate, rng)
68
+
69
+ return _python_delete_random_words(
70
+ text,
71
+ max_deletion_rate=max_deletion_rate,
72
+ rng=rng,
73
+ )
74
+
75
+
76
class Rushmore(Glitchling):
    """Word-scope glitchling whose corruption function is ``delete_random_words``."""

    def __init__(
        self,
        *,
        max_deletion_rate: float = 0.01,
        seed: int | None = None,
    ) -> None:
        # The rate is handed to Glitchling.__init__ as a keyword argument.
        deletion_settings = {"max_deletion_rate": max_deletion_rate}
        super().__init__(
            name="Rushmore",
            corruption_function=delete_random_words,
            scope=AttackWave.WORD,
            seed=seed,
            **deletion_settings,
        )


# Ready-made instance using every default.
rushmore = Rushmore()


__all__ = ["Rushmore", "rushmore"]