glitchlings 0.4.5__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of glitchlings might be problematic. Click here for more details.
- glitchlings/__init__.py +71 -0
- glitchlings/__main__.py +8 -0
- glitchlings/_zoo_rust.cp311-win_amd64.pyd +0 -0
- glitchlings/compat.py +282 -0
- glitchlings/config.py +386 -0
- glitchlings/config.toml +3 -0
- glitchlings/data/__init__.py +1 -0
- glitchlings/data/hokey_assets.json +193 -0
- glitchlings/dlc/__init__.py +7 -0
- glitchlings/dlc/_shared.py +153 -0
- glitchlings/dlc/huggingface.py +81 -0
- glitchlings/dlc/prime.py +254 -0
- glitchlings/dlc/pytorch.py +166 -0
- glitchlings/dlc/pytorch_lightning.py +209 -0
- glitchlings/lexicon/__init__.py +192 -0
- glitchlings/lexicon/_cache.py +108 -0
- glitchlings/lexicon/data/default_vector_cache.json +82 -0
- glitchlings/lexicon/metrics.py +162 -0
- glitchlings/lexicon/vector.py +652 -0
- glitchlings/lexicon/wordnet.py +228 -0
- glitchlings/main.py +364 -0
- glitchlings/util/__init__.py +195 -0
- glitchlings/util/adapters.py +27 -0
- glitchlings/util/hokey_generator.py +144 -0
- glitchlings/util/stretch_locator.py +140 -0
- glitchlings/util/stretchability.py +375 -0
- glitchlings/zoo/__init__.py +172 -0
- glitchlings/zoo/_ocr_confusions.py +32 -0
- glitchlings/zoo/_rate.py +131 -0
- glitchlings/zoo/_rust_extensions.py +143 -0
- glitchlings/zoo/_sampling.py +54 -0
- glitchlings/zoo/_text_utils.py +100 -0
- glitchlings/zoo/adjax.py +128 -0
- glitchlings/zoo/apostrofae.py +127 -0
- glitchlings/zoo/assets/__init__.py +0 -0
- glitchlings/zoo/assets/apostrofae_pairs.json +32 -0
- glitchlings/zoo/core.py +582 -0
- glitchlings/zoo/hokey.py +173 -0
- glitchlings/zoo/jargoyle.py +335 -0
- glitchlings/zoo/mim1c.py +109 -0
- glitchlings/zoo/ocr_confusions.tsv +30 -0
- glitchlings/zoo/redactyl.py +193 -0
- glitchlings/zoo/reduple.py +148 -0
- glitchlings/zoo/rushmore.py +153 -0
- glitchlings/zoo/scannequin.py +171 -0
- glitchlings/zoo/typogre.py +231 -0
- glitchlings/zoo/zeedub.py +185 -0
- glitchlings-0.4.5.dist-info/METADATA +648 -0
- glitchlings-0.4.5.dist-info/RECORD +53 -0
- glitchlings-0.4.5.dist-info/WHEEL +5 -0
- glitchlings-0.4.5.dist-info/entry_points.txt +2 -0
- glitchlings-0.4.5.dist-info/licenses/LICENSE +201 -0
- glitchlings-0.4.5.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
import random
|
|
2
|
+
import re
|
|
3
|
+
from typing import Any, cast
|
|
4
|
+
|
|
5
|
+
from ._rate import resolve_rate
|
|
6
|
+
from ._rust_extensions import get_rust_operation
|
|
7
|
+
from ._sampling import weighted_sample_without_replacement
|
|
8
|
+
from ._text_utils import (
|
|
9
|
+
WordToken,
|
|
10
|
+
collect_word_tokens,
|
|
11
|
+
split_preserving_whitespace,
|
|
12
|
+
)
|
|
13
|
+
from .core import AttackWave, Glitchling
|
|
14
|
+
|
|
15
|
+
# Glyph used as the default `replacement_char` by redact_words() and Redactyl.
FULL_BLOCK = "█"
|
|
16
|
+
|
|
17
|
+
# Load Rust-accelerated operation if available
|
|
18
|
+
# Handle looked up by name from the Rust extension layer. redact_words() only
# dispatches to it when it is not None; otherwise it calls _python_redact_words().
_redact_words_rust = get_rust_operation("redact_words")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _python_redact_words(
    text: str,
    *,
    replacement_char: str,
    rate: float,
    merge_adjacent: bool,
    rng: random.Random,
    unweighted: bool = False,
) -> str:
    """Redact randomly chosen words by overwriting their core characters.

    Parameters
    ----------
    - text: Input text.
    - replacement_char: String repeated once per character of every redacted
      word core. It is treated literally, including regex metacharacters.
    - rate: Proportion of words to redact. Clamped to [0.0, 1.0]; any rate
      above zero redacts at least one word.
    - merge_adjacent: If True, each match of "replacement, one or more non-word
      characters, replacement" is overwritten with replacement characters.
    - rng: RNG used for sampling decisions.
    - unweighted: When True, sample words uniformly instead of by core length.

    Raises
    ------
    - ValueError: If the text contains no redactable words.

    """
    tokens = split_preserving_whitespace(text)
    word_tokens = collect_word_tokens(tokens)
    if not word_tokens:
        raise ValueError("Cannot redact words because the input text contains no redactable words.")

    population = [token.index for token in word_tokens]
    weights = [1.0 if unweighted else float(token.core_length) for token in word_tokens]

    clamped_rate = max(0.0, min(rate, 1.0))
    num_to_redact = int(len(population) * clamped_rate)
    if clamped_rate > 0.0:
        # A positive rate always redacts something, even in very short inputs.
        num_to_redact = max(1, num_to_redact)
    num_to_redact = min(num_to_redact, len(population))
    if num_to_redact <= 0:
        return "".join(tokens)

    indices_to_redact = weighted_sample_without_replacement(
        population,
        weights,
        k=num_to_redact,
        rng=rng,
    )
    indices_to_redact.sort()

    token_by_index: dict[int, WordToken] = {token.index: token for token in word_tokens}

    for i in indices_to_redact:
        if i >= len(tokens):
            break

        token = token_by_index.get(i)
        if token is None:
            continue

        # Only the core is masked; surrounding prefix/suffix text is kept.
        prefix, core, suffix = token.prefix, token.core, token.suffix
        tokens[i] = f"{prefix}{replacement_char * len(core)}{suffix}"

    text = "".join(tokens)

    if merge_adjacent:
        # Escape the replacement so characters such as "*", "." or "(" are
        # matched literally instead of being interpreted as regex syntax.
        literal = re.escape(replacement_char)
        text = re.sub(
            rf"{literal}\W+{literal}",
            lambda m: replacement_char * (len(m.group(0)) - 1),
            text,
        )

    return text
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def redact_words(
    text: str,
    replacement_char: str = FULL_BLOCK,
    rate: float | None = None,
    merge_adjacent: bool = False,
    seed: int = 151,
    rng: random.Random | None = None,
    *,
    redaction_rate: float | None = None,
    unweighted: bool = False,
) -> str:
    """Redact random words in ``text``, using the Rust operation when possible.

    ``rate`` and the legacy ``redaction_rate`` are combined by ``resolve_rate``
    (0.025 is passed as the default) and the result is clamped to [0.0, 1.0].
    When ``rng`` is omitted a ``random.Random(seed)`` is created. The Rust
    operation is used only if it was loaded and ``merge_adjacent`` is a real
    ``bool``; every other case goes through ``_python_redact_words``.
    """
    resolved = resolve_rate(
        rate=rate,
        legacy_value=redaction_rate,
        default=0.025,
        legacy_name="redaction_rate",
    )

    generator = random.Random(seed) if rng is None else rng
    bounded_rate = max(0.0, min(resolved, 1.0))
    uniform = bool(unweighted)

    # Guard clause: anything the Rust path cannot take falls back to Python.
    if _redact_words_rust is None or not isinstance(merge_adjacent, bool):
        return _python_redact_words(
            text,
            replacement_char=replacement_char,
            rate=bounded_rate,
            merge_adjacent=merge_adjacent,
            rng=generator,
            unweighted=uniform,
        )

    redacted = _redact_words_rust(
        text,
        replacement_char,
        bounded_rate,
        merge_adjacent,
        uniform,
        generator,
    )
    return cast(str, redacted)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
class Redactyl(Glitchling):
    """Glitchling whose corruption function is ``redact_words``."""

    def __init__(
        self,
        *,
        replacement_char: str = FULL_BLOCK,
        rate: float | None = None,
        redaction_rate: float | None = None,
        merge_adjacent: bool = False,
        seed: int = 151,
        unweighted: bool = False,
    ) -> None:
        """Resolve the redaction rate and register the word-level attack.

        ``redaction_rate`` is the legacy spelling of ``rate``; both are handed
        to ``resolve_rate`` with 0.025 as the default.
        """
        # Set before the base initialiser runs, exactly as the alias table is
        # expected to be in place first (how the base uses it is not shown here).
        self._param_aliases = {"redaction_rate": "rate"}
        resolved = resolve_rate(
            rate=rate,
            legacy_value=redaction_rate,
            default=0.025,
            legacy_name="redaction_rate",
        )
        super().__init__(
            name="Redactyl",
            corruption_function=redact_words,
            scope=AttackWave.WORD,
            seed=seed,
            replacement_char=replacement_char,
            rate=resolved,
            merge_adjacent=merge_adjacent,
            unweighted=unweighted,
        )

    def pipeline_operation(self) -> dict[str, Any] | None:
        """Describe this glitchling as a ``"redact"`` pipeline operation.

        Returns ``None`` when ``replacement_char``, ``rate`` or
        ``merge_adjacent`` is missing (or ``None``) in ``self.kwargs``. The
        rate is emitted under the key ``"redaction_rate"``.
        """
        glyph = self.kwargs.get("replacement_char")
        proportion = self.kwargs.get("rate")
        merge = self.kwargs.get("merge_adjacent")
        if any(setting is None for setting in (glyph, proportion, merge)):
            return None

        uniform = bool(self.kwargs.get("unweighted", False))
        descriptor: dict[str, Any] = {"type": "redact"}
        descriptor["replacement_char"] = str(glyph)
        descriptor["redaction_rate"] = float(proportion)
        descriptor["merge_adjacent"] = bool(merge)
        descriptor["unweighted"] = uniform
        return descriptor
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
# Module-level instance built entirely from the constructor defaults.
redactyl = Redactyl()
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
# Star-import surface: the class and its default instance. The redact_words()
# function is intentionally not listed here.
__all__ = ["Redactyl", "redactyl"]
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
import random
|
|
2
|
+
from typing import Any, cast
|
|
3
|
+
|
|
4
|
+
from ._rate import resolve_rate
|
|
5
|
+
from ._rust_extensions import get_rust_operation
|
|
6
|
+
from ._text_utils import WordToken, collect_word_tokens, split_preserving_whitespace
|
|
7
|
+
from .core import AttackWave, Glitchling
|
|
8
|
+
|
|
9
|
+
# Load Rust-accelerated operation if available
|
|
10
|
+
# Handle looked up by name from the Rust extension layer. reduplicate_words()
# uses it when it is not None and otherwise calls _python_reduplicate_words().
_reduplicate_words_rust = get_rust_operation("reduplicate_words")
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _python_reduplicate_words(
    text: str,
    *,
    rate: float,
    rng: random.Random,
    unweighted: bool = False,
) -> str:
    """Repeat randomly selected words, separated by a single space.

    Parameters
    ----------
    - text: Input text.
    - rate: Base per-word probability of reduplication. Values at or below
      zero leave the text unchanged; values of 1.0 or more repeat every word.
    - rng: RNG used for sampling decisions; one draw is made per word.
    - unweighted: When True every word uses the same weight; otherwise the
      weight is the reciprocal of the word's core length.

    Notes
    -----
    - Each word's probability is ``rate * weight / mean_weight``, capped at 1.0.
    - The text is rebuilt from the separator-preserving token list, so the
      original spacing and punctuation tokens are kept.
    - NOTE(review): assumes ``core_length`` is never 0 for collected words
      (it is used as a divisor) - confirm against ``collect_word_tokens``.

    """
    pieces = split_preserving_whitespace(text)
    words = collect_word_tokens(pieces)

    entries: list[tuple[int, float, WordToken]] = [
        (word.index, 1.0 if unweighted else 1.0 / float(word.core_length), word)
        for word in words
    ]
    if not entries:
        return "".join(pieces)

    floor_rate = max(rate, 0.0)
    if floor_rate <= 0.0:
        return "".join(pieces)

    average_weight = sum(entry_weight for _, entry_weight, _ in entries) / len(entries)

    for position, entry_weight, word in entries:
        if floor_rate >= 1.0:
            chance = 1.0
        elif average_weight <= 0.0:
            chance = floor_rate
        else:
            chance = min(1.0, floor_rate * (entry_weight / average_weight))

        # The draw happens for every word so the RNG stream stays aligned.
        if rng.random() >= chance:
            continue

        lead, body, tail = word.prefix, word.core, word.suffix
        pieces[position] = f"{lead}{body} {body}{tail}"

    return "".join(pieces)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def reduplicate_words(
    text: str,
    rate: float | None = None,
    seed: int | None = None,
    rng: random.Random | None = None,
    *,
    reduplication_rate: float | None = None,
    unweighted: bool = False,
) -> str:
    """Randomly repeat words in ``text``.

    ``rate`` and the legacy ``reduplication_rate`` are combined by
    ``resolve_rate`` (0.01 is passed as the default); negative results are
    raised to 0.0. A ``random.Random(seed)`` is created when ``rng`` is not
    supplied. The Rust operation is used if it was loaded, otherwise the
    Python implementation runs.
    """
    resolved = resolve_rate(
        rate=rate,
        legacy_value=reduplication_rate,
        default=0.01,
        legacy_name="reduplication_rate",
    )

    generator = random.Random(seed) if rng is None else rng
    floor_rate = max(0.0, resolved)
    uniform = bool(unweighted)

    if _reduplicate_words_rust is None:
        return _python_reduplicate_words(
            text,
            rate=floor_rate,
            rng=generator,
            unweighted=uniform,
        )

    return cast(str, _reduplicate_words_rust(text, floor_rate, uniform, generator))
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class Reduple(Glitchling):
    """Glitchling whose corruption function is ``reduplicate_words``."""

    def __init__(
        self,
        *,
        rate: float | None = None,
        reduplication_rate: float | None = None,
        seed: int | None = None,
        unweighted: bool = False,
    ) -> None:
        """Resolve the reduplication rate and register the word-level attack.

        ``reduplication_rate`` is the legacy spelling of ``rate``; both are
        handed to ``resolve_rate`` with 0.01 as the default.
        """
        self._param_aliases = {"reduplication_rate": "rate"}
        resolved = resolve_rate(
            rate=rate,
            legacy_value=reduplication_rate,
            default=0.01,
            legacy_name="reduplication_rate",
        )
        super().__init__(
            name="Reduple",
            corruption_function=reduplicate_words,
            scope=AttackWave.WORD,
            seed=seed,
            rate=resolved,
            unweighted=unweighted,
        )

    def pipeline_operation(self) -> dict[str, Any] | None:
        """Describe this glitchling as a ``"reduplicate"`` pipeline operation.

        Returns ``None`` when ``self.kwargs`` holds no ``rate``. The rate is
        emitted under the key ``"reduplication_rate"``.
        """
        proportion = self.kwargs.get("rate")
        if proportion is None:
            return None

        uniform = bool(self.kwargs.get("unweighted", False))
        descriptor: dict[str, Any] = {"type": "reduplicate"}
        descriptor["reduplication_rate"] = float(proportion)
        descriptor["unweighted"] = uniform
        return descriptor
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# Module-level instance built entirely from the constructor defaults.
reduple = Reduple()
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
# Star-import surface: the class and its default instance. The
# reduplicate_words() function is not listed here.
__all__ = ["Reduple", "reduple"]
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
import math
|
|
2
|
+
import random
|
|
3
|
+
import re
|
|
4
|
+
from typing import Any, cast
|
|
5
|
+
|
|
6
|
+
from ._rate import resolve_rate
|
|
7
|
+
from ._rust_extensions import get_rust_operation
|
|
8
|
+
from ._text_utils import WordToken, collect_word_tokens, split_preserving_whitespace
|
|
9
|
+
from .core import AttackWave, Glitchling
|
|
10
|
+
|
|
11
|
+
# Load Rust-accelerated operation if available
|
|
12
|
+
# Handle looked up by name from the Rust extension layer. delete_random_words()
# uses it when it is not None and otherwise calls _python_delete_random_words().
_delete_random_words_rust = get_rust_operation("delete_random_words")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _python_delete_random_words(
    text: str,
    *,
    rate: float,
    rng: random.Random,
    unweighted: bool = False,
) -> str:
    """Remove randomly selected words from ``text``, then tidy the spacing.

    Parameters
    ----------
    - text: Input text; returned untouched when nothing can be deleted.
    - rate: Base per-word deletion probability, also used to cap the number of
      deletions at ``floor(candidate_count * rate)``.
    - rng: RNG used for sampling decisions.
    - unweighted: When True every word uses the same weight; otherwise the
      weight is the reciprocal of the word's core length.

    Notes
    -----
    - Candidates are collected with ``skip_first_word=True``.
    - A deleted word keeps its stripped prefix and suffix text.
    - After deletion, whitespace before ``. , ; :`` is dropped, runs of two or
      more whitespace characters become one space, and the result is stripped.

    """
    floor_rate = max(rate, 0.0)
    if floor_rate <= 0.0:
        return text

    pieces = split_preserving_whitespace(text)
    candidates = collect_word_tokens(pieces, skip_first_word=True)

    entries: list[tuple[int, float, WordToken]] = [
        (word.index, 1.0 if unweighted else 1.0 / float(word.core_length), word)
        for word in candidates
    ]
    if not entries:
        return text

    budget = min(len(entries), math.floor(len(entries) * floor_rate))
    if budget <= 0:
        return text

    average_weight = sum(entry_weight for _, entry_weight, _ in entries) / len(entries)

    removed = 0
    for position, entry_weight, word in entries:
        # Stop scanning (and stop drawing from the RNG) once the cap is hit.
        if removed >= budget:
            break

        if floor_rate >= 1.0:
            chance = 1.0
        elif average_weight <= 0.0:
            chance = floor_rate
        else:
            chance = min(1.0, floor_rate * (entry_weight / average_weight))

        if rng.random() >= chance:
            continue

        lead = word.prefix.strip()
        tail = word.suffix.strip()
        pieces[position] = f"{lead}{tail}"
        removed += 1

    joined = "".join(pieces)
    joined = re.sub(r"\s+([.,;:])", r"\1", joined)
    return re.sub(r"\s{2,}", " ", joined).strip()
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def delete_random_words(
    text: str,
    rate: float | None = None,
    seed: int | None = None,
    rng: random.Random | None = None,
    *,
    max_deletion_rate: float | None = None,
    unweighted: bool = False,
) -> str:
    """Delete random words from ``text``.

    ``rate`` and the legacy ``max_deletion_rate`` are combined by
    ``resolve_rate`` (0.01 is passed as the default); negative results are
    raised to 0.0. A ``random.Random(seed)`` is created when ``rng`` is not
    supplied. The Rust operation is used if it was loaded, otherwise the
    Python implementation runs.
    """
    resolved = resolve_rate(
        rate=rate,
        legacy_value=max_deletion_rate,
        default=0.01,
        legacy_name="max_deletion_rate",
    )

    generator = random.Random(seed) if rng is None else rng
    floor_rate = max(0.0, resolved)
    uniform = bool(unweighted)

    if _delete_random_words_rust is None:
        return _python_delete_random_words(
            text,
            rate=floor_rate,
            rng=generator,
            unweighted=uniform,
        )

    return cast(str, _delete_random_words_rust(text, floor_rate, uniform, generator))
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
class Rushmore(Glitchling):
    """Glitchling whose corruption function is ``delete_random_words``."""

    def __init__(
        self,
        *,
        rate: float | None = None,
        max_deletion_rate: float | None = None,
        seed: int | None = None,
        unweighted: bool = False,
    ) -> None:
        """Resolve the deletion rate and register the word-level attack.

        ``max_deletion_rate`` is the legacy spelling of ``rate``; both are
        handed to ``resolve_rate`` with 0.01 as the default.
        """
        self._param_aliases = {"max_deletion_rate": "rate"}
        resolved = resolve_rate(
            rate=rate,
            legacy_value=max_deletion_rate,
            default=0.01,
            legacy_name="max_deletion_rate",
        )
        super().__init__(
            name="Rushmore",
            corruption_function=delete_random_words,
            scope=AttackWave.WORD,
            seed=seed,
            rate=resolved,
            unweighted=unweighted,
        )

    def pipeline_operation(self) -> dict[str, Any] | None:
        """Describe this glitchling as a ``"delete"`` pipeline operation.

        The rate is read from ``self.kwargs["rate"]``, falling back to
        ``self.kwargs["max_deletion_rate"]``; ``None`` is returned when
        neither is set. The rate is emitted under ``"max_deletion_rate"``.
        """
        proportion = self.kwargs.get("rate")
        if proportion is None:
            proportion = self.kwargs.get("max_deletion_rate")
        if proportion is None:
            return None

        uniform = bool(self.kwargs.get("unweighted", False))
        descriptor: dict[str, Any] = {"type": "delete"}
        descriptor["max_deletion_rate"] = float(proportion)
        descriptor["unweighted"] = uniform
        return descriptor
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
# Module-level instance built entirely from the constructor defaults.
rushmore = Rushmore()
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
# Star-import surface: the class and its default instance. The
# delete_random_words() function is not listed here.
__all__ = ["Rushmore", "rushmore"]
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
import random
|
|
2
|
+
import re
|
|
3
|
+
from typing import Any, cast
|
|
4
|
+
|
|
5
|
+
from ._ocr_confusions import load_confusion_table
|
|
6
|
+
from ._rate import resolve_rate
|
|
7
|
+
from ._rust_extensions import get_rust_operation
|
|
8
|
+
from .core import AttackOrder, AttackWave, Glitchling
|
|
9
|
+
|
|
10
|
+
# Load Rust-accelerated operation if available
|
|
11
|
+
# Handle looked up by name from the Rust extension layer. ocr_artifacts() uses
# it when it is not None and otherwise calls _python_ocr_artifacts().
_ocr_artifacts_rust = get_rust_operation("ocr_artifacts")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _python_ocr_artifacts(
    text: str,
    *,
    rate: float,
    rng: random.Random,
) -> str:
    """Introduce OCR-like artifacts into text.

    Parameters
    ----------
    - text: Input text to corrupt; empty text is returned unchanged.
    - rate: Proportion of candidate matches to replace. The quota is
      ``int(candidate_count * rate)``; a quota of zero leaves the text as is.
    - rng: RNG used to shuffle the candidates and to pick each replacement.

    Notes
    -----
    - Confusion pairs come from ``load_confusion_table()`` and every source
      string is matched literally.
    - Candidates are gathered pattern by pattern, longest source first, and
      may overlap one another at this stage.
    - After shuffling, candidates are accepted in order until the quota is
      met; a candidate overlapping an already accepted span is skipped.
    - Accepted edits are applied from left to right against the original
      offsets, so replacements that change length cause no index drift.

    """
    if not text:
        return text

    table = load_confusion_table()

    # Every literal occurrence of every source string, as (start, end, options).
    spans: list[tuple[int, int, list[str]]] = []
    for source, options in sorted(table, key=lambda pair: -len(pair[0])):
        literal = re.escape(source)
        spans.extend(
            (found.start(), found.end(), options) for found in re.finditer(literal, text)
        )

    if not spans:
        return text

    quota = int(len(spans) * rate)
    if quota <= 0:
        return text

    rng.shuffle(spans)
    picked: list[tuple[int, int, str]] = []
    taken: list[tuple[int, int]] = []

    for begin, finish, options in spans:
        if len(picked) >= quota:
            break
        # Two half-open spans collide when each starts before the other ends.
        if any(finish > low and high > begin for low, high in taken):
            continue
        picked.append((begin, finish, rng.choice(options)))
        taken.append((begin, finish))

    if not picked:
        return text

    picked.sort(key=lambda edit: edit[0])
    fragments = []
    position = 0
    for begin, finish, substitute in picked:
        if position < begin:
            fragments.append(text[position:begin])
        fragments.append(substitute)
        position = finish
    if position < len(text):
        fragments.append(text[position:])

    return "".join(fragments)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def ocr_artifacts(
    text: str,
    rate: float | None = None,
    seed: int | None = None,
    rng: random.Random | None = None,
    *,
    error_rate: float | None = None,
) -> str:
    """Introduce OCR-like artifacts into ``text``.

    Empty text is returned immediately, before any rate handling. Otherwise
    ``rate`` and the legacy ``error_rate`` are combined by ``resolve_rate``
    (0.02 is passed as the default) and negative results are raised to 0.0.
    A ``random.Random(seed)`` is created when ``rng`` is not supplied. The
    Rust operation is used if it was loaded, otherwise the Python
    implementation runs.
    """
    if not text:
        return text

    resolved = resolve_rate(
        rate=rate,
        legacy_value=error_rate,
        default=0.02,
        legacy_name="error_rate",
    )

    generator = random.Random(seed) if rng is None else rng
    floor_rate = max(0.0, resolved)

    if _ocr_artifacts_rust is None:
        return _python_ocr_artifacts(text, rate=floor_rate, rng=generator)

    return cast(str, _ocr_artifacts_rust(text, floor_rate, generator))
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
class Scannequin(Glitchling):
    """Glitchling whose corruption function is ``ocr_artifacts``."""

    def __init__(
        self,
        *,
        rate: float | None = None,
        error_rate: float | None = None,
        seed: int | None = None,
    ) -> None:
        """Resolve the error rate and register the character-level attack.

        ``error_rate`` is the legacy spelling of ``rate``; both are handed to
        ``resolve_rate`` with 0.02 as the default. The attack is registered
        with ``AttackWave.CHARACTER`` scope and ``AttackOrder.LATE`` order.
        """
        self._param_aliases = {"error_rate": "rate"}
        resolved = resolve_rate(
            rate=rate,
            legacy_value=error_rate,
            default=0.02,
            legacy_name="error_rate",
        )
        super().__init__(
            name="Scannequin",
            corruption_function=ocr_artifacts,
            scope=AttackWave.CHARACTER,
            order=AttackOrder.LATE,
            seed=seed,
            rate=resolved,
        )

    def pipeline_operation(self) -> dict[str, Any] | None:
        """Describe this glitchling as an ``"ocr"`` pipeline operation.

        The rate is read from ``self.kwargs["rate"]``, falling back to
        ``self.kwargs["error_rate"]``; ``None`` is returned when neither is
        set. The rate is emitted under the key ``"error_rate"``.
        """
        proportion = self.kwargs.get("rate")
        if proportion is None:
            proportion = self.kwargs.get("error_rate")
        if proportion is None:
            return None

        descriptor: dict[str, Any] = {"type": "ocr"}
        descriptor["error_rate"] = float(proportion)
        return descriptor
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
# Module-level instance built entirely from the constructor defaults.
scannequin = Scannequin()
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
# Star-import surface: the class and its default instance. The ocr_artifacts()
# function is not listed here.
__all__ = ["Scannequin", "scannequin"]
|