glitchlings 0.4.4__cp313-cp313-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of glitchlings might be problematic. Click here for more details.
- glitchlings/__init__.py +67 -0
- glitchlings/__main__.py +8 -0
- glitchlings/_zoo_rust.cpython-313-x86_64-linux-gnu.so +0 -0
- glitchlings/compat.py +284 -0
- glitchlings/config.py +388 -0
- glitchlings/config.toml +3 -0
- glitchlings/dlc/__init__.py +7 -0
- glitchlings/dlc/_shared.py +153 -0
- glitchlings/dlc/huggingface.py +81 -0
- glitchlings/dlc/prime.py +254 -0
- glitchlings/dlc/pytorch.py +166 -0
- glitchlings/dlc/pytorch_lightning.py +215 -0
- glitchlings/lexicon/__init__.py +192 -0
- glitchlings/lexicon/_cache.py +110 -0
- glitchlings/lexicon/data/default_vector_cache.json +82 -0
- glitchlings/lexicon/metrics.py +162 -0
- glitchlings/lexicon/vector.py +651 -0
- glitchlings/lexicon/wordnet.py +232 -0
- glitchlings/main.py +364 -0
- glitchlings/util/__init__.py +195 -0
- glitchlings/util/adapters.py +27 -0
- glitchlings/zoo/__init__.py +168 -0
- glitchlings/zoo/_ocr_confusions.py +32 -0
- glitchlings/zoo/_rate.py +131 -0
- glitchlings/zoo/_rust_extensions.py +143 -0
- glitchlings/zoo/_sampling.py +54 -0
- glitchlings/zoo/_text_utils.py +100 -0
- glitchlings/zoo/adjax.py +128 -0
- glitchlings/zoo/apostrofae.py +127 -0
- glitchlings/zoo/assets/__init__.py +0 -0
- glitchlings/zoo/assets/apostrofae_pairs.json +32 -0
- glitchlings/zoo/core.py +582 -0
- glitchlings/zoo/jargoyle.py +335 -0
- glitchlings/zoo/mim1c.py +109 -0
- glitchlings/zoo/ocr_confusions.tsv +30 -0
- glitchlings/zoo/redactyl.py +193 -0
- glitchlings/zoo/reduple.py +148 -0
- glitchlings/zoo/rushmore.py +153 -0
- glitchlings/zoo/scannequin.py +171 -0
- glitchlings/zoo/typogre.py +231 -0
- glitchlings/zoo/zeedub.py +185 -0
- glitchlings-0.4.4.dist-info/METADATA +627 -0
- glitchlings-0.4.4.dist-info/RECORD +47 -0
- glitchlings-0.4.4.dist-info/WHEEL +5 -0
- glitchlings-0.4.4.dist-info/entry_points.txt +2 -0
- glitchlings-0.4.4.dist-info/licenses/LICENSE +201 -0
- glitchlings-0.4.4.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import random
|
|
4
|
+
from typing import Sequence
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def weighted_sample_without_replacement(
    population: Sequence[int],
    weights: Sequence[float],
    *,
    k: int,
    rng: random.Random,
) -> list[int]:
    """Draw ``k`` distinct entries from ``population``, biased by ``weights``.

    Entries are drawn one at a time and removed from the pool after each
    draw.  A draw scales one ``rng.random()`` value by the remaining total
    weight and takes the first entry whose running weight sum reaches that
    threshold.  When the remaining total weight is not positive, the draw
    falls back to a uniform ``rng.randrange`` pick.

    Args:
        population: Values to sample from.
        weights: One weight per entry of ``population``.
        k: Number of entries to draw.
        rng: Random generator that supplies every random decision.

    Returns:
        The drawn entries in draw order; an empty list when ``k`` is zero or
        the population is empty.

    Raises:
        ValueError: If ``k`` is negative, the two sequences differ in length,
            or ``k`` exceeds the population size.
    """
    if k < 0:
        raise ValueError("Sample size cannot be negative")
    if len(population) != len(weights):
        raise ValueError("Population and weight sequences must be the same length")

    pool = list(zip(population, weights))
    if k == 0 or not pool:
        return []
    if k > len(pool):
        raise ValueError("Sample larger than population or is negative")

    picked: list[int] = []
    for _draw in range(k):
        mass = sum(entry[1] for entry in pool)
        if mass <= 0.0:
            # No usable weight left: every remaining entry is equally likely.
            position = rng.randrange(len(pool))
        else:
            cutoff = rng.random() * mass
            running = 0.0
            # Default to the last entry in case rounding keeps the running
            # sum below the cutoff.
            position = len(pool) - 1
            for candidate, entry in enumerate(pool):
                running += entry[1]
                if running >= cutoff:
                    position = candidate
                    break
        picked.append(pool.pop(position)[0])

    return picked
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
__all__ = ["weighted_sample_without_replacement"]
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from typing import Sequence
|
|
6
|
+
|
|
7
|
+
# Whitespace runs are captured in a group so that ``split`` keeps them in its
# output: non-whitespace pieces land at even indices, whitespace at odd ones.
_WORD_SPLIT_PATTERN = re.compile(r"(\s+)")
|
|
8
|
+
# Three groups: leading non-word characters, a lazily matched core, and
# trailing non-word characters.
_TOKEN_EDGES_PATTERN = re.compile(r"^(\W*)(.*?)(\W*)$")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def split_preserving_whitespace(text: str) -> list[str]:
    """Break ``text`` apart on whitespace runs, keeping those runs as tokens.

    Joining the returned pieces with ``"".join`` reproduces ``text``, because
    the split pattern captures the whitespace it splits on.
    """
    pieces = _WORD_SPLIT_PATTERN.split(text)
    return pieces
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def split_token_edges(token: str) -> tuple[str, str, str]:
    """Split ``token`` into ``(leading, core, trailing)`` parts.

    ``leading`` and ``trailing`` are the runs of non-word characters at the
    two ends; ``core`` is what lies between them.  If the edge pattern does
    not match, the whole token is returned as the core with empty edges.
    """
    found = _TOKEN_EDGES_PATTERN.match(token)
    if found is not None:
        return found.group(1), found.group(2), found.group(3)
    return "", token, ""
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def token_core_length(token: str) -> int:
    """Return the length of the token's core for weighting heuristics.

    The core is the middle segment reported by :func:`split_token_edges`.
    When the core is empty (for example a punctuation-only token), the length
    of the whole token is used instead.  The result is never below ``1``, so
    callers can use it as a weight or divisor.

    Args:
        token: The token to measure.

    Returns:
        ``len(core)`` if the core is non-empty, else ``len(token)``, with a
        floor of ``1``.
    """
    _, core, _ = split_token_edges(token)
    # The measured length can only be zero when both core and token are empty,
    # in which case a ``strip()`` fallback would also give zero; a floor of 1
    # therefore covers every case the earlier fallback chain handled.
    return max(len(core or token), 1)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass(frozen=True)
class WordToken:
    """Immutable record describing one non-whitespace token from a word splitter."""

    # Position of the token inside the token sequence it was collected from.
    index: int
    # Leading edge characters split off the token.
    prefix: str
    # Middle segment of the token; may be empty.
    core: str
    # Trailing edge characters split off the token.
    suffix: str
    # Length recorded for the token when it was collected.
    core_length: int

    @property
    def has_core(self) -> bool:
        """Report whether ``core`` holds at least one character."""
        return True if self.core else False
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def collect_word_tokens(
    tokens: Sequence[str],
    *,
    skip_first_word: bool = False,
) -> list[WordToken]:
    """Build :class:`WordToken` records for the word slots of ``tokens``.

    Only even positions are visited, which is where
    :func:`split_preserving_whitespace` places non-whitespace pieces.  Empty
    and whitespace-only entries at those positions are ignored.

    Args:
        tokens: Token sequence produced by :func:`split_preserving_whitespace`.
        skip_first_word: Start scanning at position 2 instead of 0, leaving
            out the first candidate slot.

    Returns:
        One record per retained token, in sequence order.  ``core_length`` is
        the core's length, falling back to the stripped token length, then the
        raw token length, then ``1``.

    NOTE(review): when ``tokens[0]`` is an empty string (text that begins
    with whitespace), starting at position 2 does not skip the first real
    word; confirm this matches the intended behaviour.
    """
    first_slot = 2 if skip_first_word else 0
    words: list[WordToken] = []

    for position in range(first_slot, len(tokens), 2):
        piece = tokens[position]
        if not piece or piece.isspace():
            continue

        leading, body, trailing = split_token_edges(piece)
        # Use the first non-zero length from the fallback chain.
        weight = len(body) or len(piece.strip()) or len(piece) or 1
        words.append(
            WordToken(
                index=position,
                prefix=leading,
                core=body,
                suffix=trailing,
                core_length=weight,
            )
        )

    return words
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
__all__ = [
|
|
95
|
+
"split_preserving_whitespace",
|
|
96
|
+
"split_token_edges",
|
|
97
|
+
"token_core_length",
|
|
98
|
+
"WordToken",
|
|
99
|
+
"collect_word_tokens",
|
|
100
|
+
]
|
glitchlings/zoo/adjax.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import random
|
|
4
|
+
from typing import Any, cast
|
|
5
|
+
|
|
6
|
+
from ._rate import resolve_rate
|
|
7
|
+
from ._rust_extensions import get_rust_operation
|
|
8
|
+
from ._text_utils import split_preserving_whitespace, split_token_edges
|
|
9
|
+
from .core import AttackWave, Glitchling
|
|
10
|
+
|
|
11
|
+
# Load Rust-accelerated operation if available
|
|
12
|
+
_swap_adjacent_words_rust = get_rust_operation("swap_adjacent_words")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _python_swap_adjacent_words(
    text: str,
    *,
    rate: float,
    rng: random.Random,
) -> str:
    """Exchange the cores of neighbouring word pairs, leaving edges and spacing alone.

    Words are taken in non-overlapping pairs (first with second, third with
    fourth, and so on).  A pair is swapped with probability ``rate`` after
    clamping to ``[0, 1]``.  Pairs in which either word has an empty core are
    skipped without drawing from ``rng``.

    Args:
        text: Input text.
        rate: Swap probability per pair; values outside ``[0, 1]`` are clamped.
        rng: Source of the per-pair random draw.

    Returns:
        The text with selected pairs swapped, or ``text`` unchanged when there
        are fewer than two words or the clamped rate is zero.
    """
    pieces = split_preserving_whitespace(text)
    if len(pieces) < 2:
        return text

    # Words sit at even positions of the split output.
    word_slots = [
        slot
        for slot in range(0, len(pieces), 2)
        if pieces[slot] and not pieces[slot].isspace()
    ]
    if len(word_slots) < 2:
        return text

    probability = max(0.0, min(rate, 1.0))
    if probability <= 0.0:
        return text

    for first_slot, second_slot in zip(word_slots[0::2], word_slots[1::2]):
        first_prefix, first_core, first_suffix = split_token_edges(pieces[first_slot])
        second_prefix, second_core, second_suffix = split_token_edges(pieces[second_slot])

        if not (first_core and second_core):
            continue

        # A rate of 1.0 always swaps and never consumes a random draw.
        if probability < 1.0 and not rng.random() < probability:
            continue

        pieces[first_slot] = f"{first_prefix}{second_core}{first_suffix}"
        pieces[second_slot] = f"{second_prefix}{first_core}{second_suffix}"

    return "".join(pieces)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def swap_adjacent_words(
    text: str,
    rate: float | None = None,
    seed: int | None = None,
    rng: random.Random | None = None,
    *,
    swap_rate: float | None = None,
) -> str:
    """Swap the cores of adjacent words, keeping spacing and punctuation in place.

    Args:
        text: Input text.
        rate: Swap probability; resolved together with ``swap_rate`` through
            ``resolve_rate`` with a default of ``0.5``.
        seed: Seed for a fresh ``random.Random`` when ``rng`` is not given.
        rng: Random generator to use; takes precedence over ``seed``.
        swap_rate: Legacy spelling of ``rate``.

    Returns:
        The transformed text, produced by the Rust operation when one was
        loaded and by the Python implementation otherwise.
    """
    resolved = resolve_rate(
        rate=rate,
        legacy_value=swap_rate,
        default=0.5,
        legacy_name="swap_rate",
    )
    bounded = max(0.0, min(resolved, 1.0))
    generator = random.Random(seed) if rng is None else rng

    if _swap_adjacent_words_rust is None:
        return _python_swap_adjacent_words(text, rate=bounded, rng=generator)

    return cast(str, _swap_adjacent_words_rust(text, bounded, generator))
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class Adjax(Glitchling):
    """Glitchling whose corruption function is :func:`swap_adjacent_words`."""

    def __init__(
        self,
        *,
        rate: float | None = None,
        swap_rate: float | None = None,
        seed: int | None = None,
    ) -> None:
        """Configure the glitchling.

        Args:
            rate: Swap probability; resolved with ``swap_rate`` through
                ``resolve_rate`` with a default of ``0.5``.
            swap_rate: Legacy spelling of ``rate``.
            seed: Seed forwarded to the base class.
        """
        # Set before the base initialiser runs, as in the original ordering.
        self._param_aliases = {"swap_rate": "rate"}
        chosen_rate = resolve_rate(
            rate=rate,
            legacy_value=swap_rate,
            default=0.5,
            legacy_name="swap_rate",
        )
        super().__init__(
            name="Adjax",
            corruption_function=swap_adjacent_words,
            scope=AttackWave.WORD,
            seed=seed,
            rate=chosen_rate,
        )

    def pipeline_operation(self) -> dict[str, Any] | None:
        """Return the pipeline descriptor, or ``None`` when no rate is stored."""
        configured = self.kwargs.get("rate")
        if configured is not None:
            return {
                "type": "swap_adjacent",
                "swap_rate": float(configured),
            }
        return None
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
# Ready-made module-level instance built with the constructor defaults.
adjax = Adjax()
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
__all__ = ["Adjax", "adjax", "swap_adjacent_words"]
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
"""Smart-quote glitchling that swaps straight quotes for fancy counterparts."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import random
|
|
7
|
+
from functools import cache
|
|
8
|
+
from importlib import resources
|
|
9
|
+
from typing import Any, Sequence, cast
|
|
10
|
+
|
|
11
|
+
from ._rust_extensions import get_rust_operation
|
|
12
|
+
from .core import AttackOrder, AttackWave, Gaggle, Glitchling
|
|
13
|
+
|
|
14
|
+
# Load Rust-accelerated operation if available
|
|
15
|
+
_apostrofae_rust = get_rust_operation("apostrofae")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@cache
def _load_replacement_pairs() -> dict[str, list[tuple[str, str]]]:
    """Read ``apostrofae_pairs.json`` and return its quote replacement table.

    The result maps each straight glyph to a list of ``(left, right)``
    replacement tuples.  Entries in the JSON that do not have exactly two
    elements are dropped.  The table is cached after the first call.
    """

    asset = resources.files(f"{__package__}.assets").joinpath("apostrofae_pairs.json")
    with asset.open("r", encoding="utf-8") as stream:
        raw: dict[str, list[Sequence[str]]] = json.load(stream)

    return {
        glyph: [(entry[0], entry[1]) for entry in entries if len(entry) == 2]
        for glyph, entries in raw.items()
    }
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _find_quote_pairs(text: str) -> list[tuple[int, int, str]]:
    """Locate paired straight quotes in ``text``.

    The string is scanned once.  For each of the glyphs ``"``, ``'`` and
    `````, consecutive occurrences are paired: the first opens, the second
    closes, the third opens again, and so on.  Each glyph is tracked
    independently of the others.  A final occurrence with no partner is left
    out of the result.

    Returns:
        ``(open_index, close_index, glyph)`` tuples, ordered by closing
        position.
    """

    tracked = ('"', "'", "`")
    pending: dict[str, int] = {}
    matched: list[tuple[int, int, str]] = []

    for position, glyph in enumerate(text):
        if glyph not in tracked:
            continue
        opener = pending.pop(glyph, None)
        if opener is None:
            # First of a pair: remember where it opened.
            pending[glyph] = position
        else:
            matched.append((opener, position, glyph))

    return matched
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _apostrofae_python(text: str, *, rng: random.Random) -> str:
    """Pure-Python path: replace each straight-quote pair with a random fancy pair.

    Args:
        text: Input text.
        rng: Generator used to choose a replacement for every pair.

    Returns:
        The text with paired quotes replaced, or ``text`` unchanged when no
        pairs are found.  Pairs whose glyph has no replacement options are
        left as they are.
    """

    table = _load_replacement_pairs()
    matches = _find_quote_pairs(text)
    if not matches:
        return text

    buffer = list(text)
    for opener, closer, glyph in matches:
        choices = table.get(glyph)
        if choices:
            buffer[opener], buffer[closer] = rng.choice(choices)
    return "".join(buffer)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def smart_quotes(
    text: str,
    seed: int | None = None,
    rng: random.Random | None = None,
) -> str:
    """Swap paired straight quotes, apostrophes, and backticks for fancy glyphs.

    Args:
        text: Input text; empty input is returned as-is.
        seed: Seed for a fresh ``random.Random`` when ``rng`` is not given.
        rng: Random generator to use; takes precedence over ``seed``.

    Returns:
        The transformed text, produced by the Rust operation when one was
        loaded and by the Python fallback otherwise.
    """

    if not text:
        return text

    generator = random.Random(seed) if rng is None else rng

    if _apostrofae_rust is None:
        return _apostrofae_python(text, rng=generator)

    return cast(str, _apostrofae_rust(text, generator))
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class Apostrofae(Glitchling):
    """Glitchling whose corruption function is :func:`smart_quotes`."""

    def __init__(self, *, seed: int | None = None) -> None:
        """Record the seed and register the glitchling with the base class."""
        self._master_seed: int | None = seed
        super().__init__(
            name="Apostrofae",
            corruption_function=smart_quotes,
            scope=AttackWave.CHARACTER,
            order=AttackOrder.NORMAL,
            seed=seed,
        )

    def pipeline_operation(self) -> dict[str, Any] | None:
        """Return the pipeline descriptor for this glitchling."""
        return {"type": "apostrofae"}

    def reset_rng(self, seed: int | None = None) -> None:  # pragma: no cover - exercised indirectly
        """Reset the generator, deriving a per-glitchling seed when one is given.

        With ``seed=None`` the call is delegated to the base class unchanged.
        Otherwise the seed is stored, the base reset runs, and, unless the
        base left ``self.seed`` as ``None``, the seed, generator and
        ``kwargs["seed"]`` are replaced with a value from
        ``Gaggle.derive_seed(seed, self.name, 0)``.
        """
        if seed is None:
            super().reset_rng(None)
            return

        self._master_seed = seed
        super().reset_rng(seed)
        if self.seed is None:
            return

        self.seed = int(Gaggle.derive_seed(int(seed), self.name, 0))
        self.rng = random.Random(self.seed)
        self.kwargs["seed"] = self.seed
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
# Ready-made module-level instance built with the constructor defaults.
apostrofae = Apostrofae()
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
__all__ = ["Apostrofae", "apostrofae", "smart_quotes"]
|
|
File without changes
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
{
|
|
2
|
+
"\"": [
|
|
3
|
+
["“", "”"],
|
|
4
|
+
["„", "“"],
|
|
5
|
+
["«", "»"],
|
|
6
|
+
["‹", "›"],
|
|
7
|
+
["『", "』"],
|
|
8
|
+
["「", "」"],
|
|
9
|
+
["﹁", "﹂"],
|
|
10
|
+
["﹃", "﹄"],
|
|
11
|
+
["〝", "〞"],
|
|
12
|
+
["❝", "❞"]
|
|
13
|
+
],
|
|
14
|
+
"'": [
|
|
15
|
+
["‘", "’"],
|
|
16
|
+
["‚", "‘"],
|
|
17
|
+
["‹", "›"],
|
|
18
|
+
["❮", "❯"],
|
|
19
|
+
["❛", "❜"],
|
|
20
|
+
["﹇", "﹈"]
|
|
21
|
+
],
|
|
22
|
+
"`": [
|
|
23
|
+
["‵", "′"],
|
|
24
|
+
["﹁", "﹂"],
|
|
25
|
+
["﹃", "﹄"],
|
|
26
|
+
["⌈", "⌉"],
|
|
27
|
+
["⌊", "⌋"],
|
|
28
|
+
["⎡", "⎤"],
|
|
29
|
+
["⎣", "⎦"],
|
|
30
|
+
["〝", "〞"]
|
|
31
|
+
]
|
|
32
|
+
}
|