glitchlings 0.3.0-cp311-cp311-macosx_11_0_universal2.whl → 0.4.0-cp311-cp311-macosx_11_0_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of glitchlings might be problematic.
- glitchlings/__init__.py +4 -0
- glitchlings/_zoo_rust.cpython-311-darwin.so +0 -0
- glitchlings/config.py +258 -0
- glitchlings/config.toml +3 -0
- glitchlings/lexicon/__init__.py +191 -0
- glitchlings/lexicon/data/default_vector_cache.json +16 -0
- glitchlings/lexicon/graph.py +303 -0
- glitchlings/lexicon/metrics.py +169 -0
- glitchlings/lexicon/vector.py +610 -0
- glitchlings/lexicon/wordnet.py +182 -0
- glitchlings/main.py +145 -5
- glitchlings/zoo/__init__.py +15 -0
- glitchlings/zoo/_sampling.py +55 -0
- glitchlings/zoo/_text_utils.py +62 -0
- glitchlings/zoo/jargoyle.py +190 -200
- glitchlings/zoo/redactyl.py +26 -54
- glitchlings/zoo/reduple.py +10 -21
- glitchlings/zoo/rushmore.py +15 -21
- glitchlings/zoo/typogre.py +22 -1
- glitchlings/zoo/zeedub.py +40 -1
- {glitchlings-0.3.0.dist-info → glitchlings-0.4.0.dist-info}/METADATA +30 -8
- glitchlings-0.4.0.dist-info/RECORD +38 -0
- glitchlings-0.3.0.dist-info/RECORD +0 -29
- {glitchlings-0.3.0.dist-info → glitchlings-0.4.0.dist-info}/WHEEL +0 -0
- {glitchlings-0.3.0.dist-info → glitchlings-0.4.0.dist-info}/entry_points.txt +0 -0
- {glitchlings-0.3.0.dist-info → glitchlings-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.3.0.dist-info → glitchlings-0.4.0.dist-info}/top_level.txt +0 -0
glitchlings/zoo/jargoyle.py
CHANGED
@@ -2,121 +2,47 @@ import random
 import re
 from collections.abc import Iterable
 from dataclasses import dataclass
-from typing import …
-
-try:  # pragma: no cover - exercised in environments with NLTK installed
-    import nltk  # type: ignore[import]
-except ModuleNotFoundError as exc:  # pragma: no cover - triggered when NLTK missing
-    nltk = None  # type: ignore[assignment]
-    find = None  # type: ignore[assignment]
-    _NLTK_IMPORT_ERROR = exc
-else:  # pragma: no cover - executed when NLTK is available
-    from nltk.corpus.reader import WordNetCorpusReader as _WordNetCorpusReader  # type: ignore[import]
-    from nltk.data import find as _nltk_find  # type: ignore[import]
-
-    find = _nltk_find
-    _NLTK_IMPORT_ERROR = None
-
-if TYPE_CHECKING:  # pragma: no cover - typing aid only
-    from nltk.corpus.reader import WordNetCorpusReader  # type: ignore[import]
-else:  # Use ``Any`` at runtime to avoid hard dependency when NLTK missing
-    WordNetCorpusReader = Any
-
-if nltk is not None:  # pragma: no cover - guarded by import success
-    try:
-        from nltk.corpus import wordnet as _WORDNET_MODULE  # type: ignore[import]
-    except ModuleNotFoundError:  # pragma: no cover - only hit on namespace packages
-        _WORDNET_MODULE = None
-    else:
-        WordNetCorpusReader = _WordNetCorpusReader  # type: ignore[assignment]
-else:
-    _WORDNET_MODULE = None
+from typing import Any, Literal, cast
 
-from .core import AttackWave, Glitchling
-from ._rate import resolve_rate
-
-_WORDNET_HANDLE: WordNetCorpusReader | Any | None = _WORDNET_MODULE
-
-_wordnet_ready = False
+from glitchlings.lexicon import Lexicon, get_default_lexicon
 
+try:  # pragma: no cover - optional WordNet dependency
+    from glitchlings.lexicon.wordnet import (
+        WordNetLexicon,
+        dependencies_available as _lexicon_dependencies_available,
+        ensure_wordnet as _lexicon_ensure_wordnet,
+    )
+except Exception:  # pragma: no cover - triggered when nltk unavailable
+    WordNetLexicon = None  # type: ignore[assignment]
 
-def _require_nltk() -> None:
-
+    def _lexicon_dependencies_available() -> bool:
+        return False
 
-    …
-        "The …
-        "…
+    def _lexicon_ensure_wordnet() -> None:
+        raise RuntimeError(
+            "The WordNet backend is no longer bundled by default. Install NLTK "
+            "and download its WordNet corpus manually if you need legacy synonyms."
         )
-    if '_NLTK_IMPORT_ERROR' in globals() and _NLTK_IMPORT_ERROR is not None:
-        raise RuntimeError(message) from _NLTK_IMPORT_ERROR
-    raise RuntimeError(message)
-
-
-def dependencies_available() -> bool:
-    """Return ``True`` when the runtime NLTK dependency is present."""
-
-    return nltk is not None and find is not None
-
-
-def _load_wordnet_reader() -> WordNetCorpusReader:
-    """Return a WordNet corpus reader from the downloaded corpus files."""
 
-    _require_nltk()
-
-    try:
-        root = find("corpora/wordnet")
-    except LookupError:
-        try:
-            zip_root = find("corpora/wordnet.zip")
-        except LookupError as exc:
-            raise RuntimeError(
-                "The NLTK WordNet corpus is not installed; run `nltk.download('wordnet')`."
-            ) from exc
-        root = zip_root.join("wordnet/")
-
-    return WordNetCorpusReader(root, None)
-
-
-def _wordnet(force_refresh: bool = False) -> WordNetCorpusReader | Any:
-    """Retrieve the active WordNet handle, rebuilding it on demand."""
-
-    global _WORDNET_HANDLE
-
-    if force_refresh:
-        _WORDNET_HANDLE = _WORDNET_MODULE
-
-    if _WORDNET_HANDLE is not None:
-        return _WORDNET_HANDLE
-
-    _WORDNET_HANDLE = _load_wordnet_reader()
-    return _WORDNET_HANDLE
 
+from ._rate import resolve_rate
+from .core import AttackWave, Glitchling
 
-def ensure_wordnet() -> None:
-    """Ensure the WordNet corpus is available before use."""
+ensure_wordnet = _lexicon_ensure_wordnet
 
-    global _wordnet_ready
-    if _wordnet_ready:
-        return
 
-    …
+def dependencies_available() -> bool:
+    """Return ``True`` when a synonym backend is accessible."""
 
-    …
+    if _lexicon_dependencies_available():
+        return True
 
     try:
-        …
-        resource.ensure_loaded()
-    except LookupError as exc:  # pragma: no cover - only triggered when download fails
-        raise RuntimeError(
-            "Unable to load NLTK WordNet corpus for the jargoyle glitchling."
-        ) from exc
-
-    _wordnet_ready = True
+        # Fall back to the configured default lexicon (typically the bundled vector cache).
+        get_default_lexicon(seed=None)
+    except Exception:
+        return False
+    return True
 
 
 # Backwards compatibility for callers relying on the previous private helper name.
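The hunk above swaps the hard NLTK import for optional-lexicon fallbacks, so `dependencies_available()` now answers "can any synonym backend be built?" rather than "is NLTK importable?", and `ensure_wordnet()` becomes an alias that raises unless the optional WordNet extra is set up. A minimal sketch of how a caller might probe this (module path taken from the file header above; the surrounding environment is assumed):

    from glitchlings.zoo import jargoyle

    if jargoyle.dependencies_available():
        # Either the optional WordNet backend or the default lexicon is usable.
        print("synonym backend available")
    else:
        # ensure_wordnet() now raises RuntimeError instead of downloading corpora.
        print("no backend; install NLTK and its WordNet corpus for legacy synonyms")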
@@ -140,7 +66,9 @@ def _split_token(token: str) -> tuple[str, str, str]:
     return prefix, core, suffix
 
 
-def _normalize_parts_of_speech(part_of_speech: PartOfSpeechInput) -> NormalizedPartsOfSpeech:
+def _normalize_parts_of_speech(
+    part_of_speech: PartOfSpeechInput,
+) -> NormalizedPartsOfSpeech:
     """Coerce user input into a tuple of valid WordNet POS tags."""
 
     if isinstance(part_of_speech, str):
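Behaviour is unchanged by the reflow: per its docstring and the parameter notes later in this file, `_normalize_parts_of_speech` coerces a single tag, an iterable of tags, or "any" into a tuple of WordNet POS tags. A sketch of the documented contract (the exact return values are an assumption, not shown in this diff):

    _normalize_parts_of_speech("n")         # -> ("n",)
    _normalize_parts_of_speech(["n", "v"])  # -> ("n", "v")
    _normalize_parts_of_speech("any")       # -> ("n", "v", "a", "r")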
@@ -173,41 +101,8 @@ class CandidateInfo:
     prefix: str
     core_word: str
     suffix: str
-    parts_of_speech: NormalizedPartsOfSpeech
-
-
-def _collect_synonyms(
-    word: str, parts_of_speech: NormalizedPartsOfSpeech
-) -> list[str]:
-    """Gather deterministic synonym candidates for the supplied word."""
-
-    normalized_word = word.lower()
-    wordnet = _wordnet()
-    synonyms: set[str] = set()
-    for pos_tag in parts_of_speech:
-        synsets = wordnet.synsets(word, pos=pos_tag)
-        if not synsets:
-            continue
-
-        for synset in synsets:
-            lemmas_list = [lemma.name() for lemma in cast(Any, synset).lemmas()]
-            if not lemmas_list:
-                continue
-
-            filtered = []
-            for lemma_str in lemmas_list:
-                cleaned = lemma_str.replace("_", " ")
-                if cleaned.lower() != normalized_word:
-                    filtered.append(cleaned)
-
-            if filtered:
-                synonyms.update(filtered)
-                break
-
-        if synonyms:
-            break
-
-    return sorted(synonyms)
+    part_of_speech: str | None
+    synonyms: list[str]
 
 
 def substitute_random_synonyms(
@@ -218,22 +113,27 @@ def substitute_random_synonyms(
     rng: random.Random | None = None,
     *,
     replacement_rate: float | None = None,
+    lexicon: Lexicon | None = None,
 ) -> str:
-    """Replace words with random …
+    """Replace words with random lexicon-driven synonyms.
 
     Parameters
     - text: Input text.
-    - rate: Max proportion of candidate words to replace (default 0.…
+    - rate: Max proportion of candidate words to replace (default 0.01).
     - part_of_speech: WordNet POS tag(s) to target. Accepts "n", "v", "a", "r",
-      any iterable of those tags, or "any" to include all four.
+      any iterable of those tags, or "any" to include all four. Backends that do
+      not differentiate parts of speech simply ignore the setting.
     - rng: Optional RNG instance used for deterministic sampling.
    - seed: Optional seed if `rng` not provided.
+    - lexicon: Optional :class:`~glitchlings.lexicon.Lexicon` implementation to
+      supply synonyms. Defaults to the configured lexicon priority, typically the
+      packaged vector cache.
 
     Determinism
     - Candidates collected in left-to-right order; no set() reordering.
     - Replacement positions chosen via rng.sample.
-    - Synonyms …
-    …
+    - Synonyms sourced through the lexicon; the default backend derives
+      deterministic subsets per word and part-of-speech using the active seed.
     """
     effective_rate = resolve_rate(
         rate=rate,
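The new `lexicon` parameter accepts any `glitchlings.lexicon.Lexicon`. Judging from the calls in the next hunk, an implementation needs at least a `seed` attribute plus `supports_pos`, `get_synonyms`, and `reseed`. Below is a toy in-memory backend written against that inferred interface (it assumes `Lexicon` can be subclassed with just these members, which this diff does not show):

    from glitchlings.lexicon import Lexicon

    class TinyLexicon(Lexicon):  # hypothetical toy backend, not the packaged one
        def __init__(self, table: dict[str, list[str]], seed: int | None = None) -> None:
            self.table = table
            self.seed = seed

        def supports_pos(self, pos: str | None) -> bool:
            return pos is None  # one undifferentiated bucket per word

        def get_synonyms(self, word: str, pos: str | None = None) -> list[str]:
            return list(self.table.get(word.lower(), []))

        def reseed(self, seed: int | None) -> None:
            self.seed = seed  # this toy keeps no internal RNG state

    corrupted = substitute_random_synonyms(
        "a big dog",
        rate=1.0,
        seed=7,
        lexicon=TinyLexicon({"big": ["large", "huge"]}),
    )

Because `supports_pos` only accepts `None`, the POS-specific lookups are skipped and the `pos=None` fallback in the next hunk supplies the synonyms.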
@@ -242,68 +142,106 @@ def substitute_random_synonyms(
         legacy_name="replacement_rate",
     )
 
-    ensure_wordnet()
-    wordnet = _wordnet()
-
     active_rng: random.Random
     if rng is not None:
         active_rng = rng
     else:
         active_rng = random.Random(seed)
 
-    …
+    active_lexicon: Lexicon
+    restore_lexicon_seed = False
+    original_lexicon_seed: int | None = None
 
-    …
+    if lexicon is None:
+        active_lexicon = get_default_lexicon(seed=seed)
+    else:
+        active_lexicon = lexicon
+        if seed is not None:
+            original_lexicon_seed = active_lexicon.seed
+            if original_lexicon_seed != seed:
+                active_lexicon.reseed(seed)
+                restore_lexicon_seed = True
 
-    …
+    try:
+        target_pos = _normalize_parts_of_speech(part_of_speech)
+
+        # Split but keep whitespace separators so we can rebuild easily
+        tokens = re.split(r"(\s+)", text)
+
+        # Collect indices of candidate tokens (even positions 0,2,.. are words given our split design)
+        candidate_indices: list[int] = []
+        candidate_metadata: dict[int, CandidateInfo] = {}
+        for idx, tok in enumerate(tokens):
+            if idx % 2 == 0 and tok and not tok.isspace():
+                prefix, core_word, suffix = _split_token(tok)
+                if not core_word:
+                    continue
+
+                chosen_pos: str | None = None
+                synonyms: list[str] = []
+
+                for pos in target_pos:
+                    if not active_lexicon.supports_pos(pos):
+                        continue
+                    synonyms = active_lexicon.get_synonyms(core_word, pos=pos)
+                    if synonyms:
+                        chosen_pos = pos
+                        break
+
+                if not synonyms and active_lexicon.supports_pos(None):
+                    synonyms = active_lexicon.get_synonyms(core_word, pos=None)
+
+                if synonyms:
+                    candidate_indices.append(idx)
+                    candidate_metadata[idx] = CandidateInfo(
+                        prefix=prefix,
+                        core_word=core_word,
+                        suffix=suffix,
+                        part_of_speech=chosen_pos,
+                        synonyms=synonyms,
+                    )
+
+        if not candidate_indices:
+            return text
+
+        clamped_rate = max(0.0, effective_rate)
+        if clamped_rate == 0.0:
+            return text
+
+        population = len(candidate_indices)
+        effective_fraction = min(clamped_rate, 1.0)
+        expected_replacements = population * effective_fraction
+        max_replacements = int(expected_replacements)
+        remainder = expected_replacements - max_replacements
+        if remainder > 0.0 and active_rng.random() < remainder:
+            max_replacements += 1
+        if clamped_rate >= 1.0:
+            max_replacements = population
+        max_replacements = min(population, max_replacements)
+        if max_replacements <= 0:
+            return text
+
+        # Choose which positions to replace deterministically via rng.sample
+        replace_positions = active_rng.sample(candidate_indices, k=max_replacements)
+        # Process in ascending order to avoid affecting later indices
+        replace_positions.sort()
+
+        for pos in replace_positions:
+            metadata = candidate_metadata[pos]
+            if not metadata.synonyms:
                 continue
 
-            …
-            )
-            if available_pos:
-                candidate_indices.append(idx)
-                candidate_metadata[idx] = CandidateInfo(
-                    prefix=prefix,
-                    core_word=core_word,
-                    suffix=suffix,
-                    parts_of_speech=available_pos,
-                )
-
-    if not candidate_indices:
-        return text
-
-    clamped_rate = max(0.0, effective_rate)
-    max_replacements = int(len(candidate_indices) * clamped_rate)
-    if max_replacements <= 0:
-        return text
-
-    # Choose which positions to replace deterministically via rng.sample
-    replace_positions = active_rng.sample(candidate_indices, k=max_replacements)
-    # Process in ascending order to avoid affecting later indices
-    replace_positions.sort()
+            replacement = active_rng.choice(metadata.synonyms)
+            tokens[pos] = f"{metadata.prefix}{replacement}{metadata.suffix}"
 
-    …
-        continue
-
-        replacement = active_rng.choice(synonyms)
-        tokens[pos] = f"{metadata.prefix}{replacement}{metadata.suffix}"
-
-    return "".join(tokens)
+        return "".join(tokens)
+    finally:
+        if restore_lexicon_seed:
+            active_lexicon.reseed(original_lexicon_seed)
 
 
 class Jargoyle(Glitchling):
-    """Glitchling that swaps words with …
+    """Glitchling that swaps words with lexicon-driven synonyms."""
 
     def __init__(
         self,
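One behavioural change buried in the hunk above: the replacement budget is no longer a hard floor. 0.3.0 used `int(len(candidates) * rate)`, so short inputs could never be modified at low rates; 0.4.0 keeps the fractional remainder and promotes it to one extra replacement with probability equal to that remainder, preserving the expected replacement count. Worked numbers (illustrative only):

    population = 7, rate = 0.3
    expected_replacements = 7 * 0.3   # 2.1
    max_replacements = int(2.1)       # 2
    remainder = 0.1                   # one extra replacement 10% of the time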
@@ -312,22 +250,74 @@ class Jargoyle(Glitchling):
         replacement_rate: float | None = None,
         part_of_speech: PartOfSpeechInput = "n",
         seed: int | None = None,
+        lexicon: Lexicon | None = None,
     ) -> None:
         self._param_aliases = {"replacement_rate": "rate"}
+        self._owns_lexicon = lexicon is None
+        self._external_lexicon_original_seed = (
+            lexicon.seed if isinstance(lexicon, Lexicon) else None
+        )
+        self._initializing = True
         effective_rate = resolve_rate(
             rate=rate,
             legacy_value=replacement_rate,
-            default=0.…
+            default=0.01,
             legacy_name="replacement_rate",
         )
-        super().__init__(
-            name="Jargoyle",
-            corruption_function=substitute_random_synonyms,
-            scope=AttackWave.WORD,
-            seed=seed,
-            rate=effective_rate,
-            part_of_speech=part_of_speech,
-        )
+        prepared_lexicon = lexicon or get_default_lexicon(seed=seed)
+        if lexicon is not None and seed is not None:
+            prepared_lexicon.reseed(seed)
+        try:
+            super().__init__(
+                name="Jargoyle",
+                corruption_function=substitute_random_synonyms,
+                scope=AttackWave.WORD,
+                seed=seed,
+                rate=effective_rate,
+                part_of_speech=part_of_speech,
+                lexicon=prepared_lexicon,
+            )
+        finally:
+            self._initializing = False
+
+    def set_param(self, key: str, value: Any) -> None:
+        super().set_param(key, value)
+
+        aliases = getattr(self, "_param_aliases", {})
+        canonical = aliases.get(key, key)
+
+        if canonical == "seed":
+            current_lexicon = getattr(self, "lexicon", None)
+            if isinstance(current_lexicon, Lexicon):
+                if getattr(self, "_owns_lexicon", False):
+                    current_lexicon.reseed(self.seed)
+                else:
+                    if self.seed is not None:
+                        current_lexicon.reseed(self.seed)
+                    else:
+                        if hasattr(self, "_external_lexicon_original_seed"):
+                            original_seed = getattr(
+                                self, "_external_lexicon_original_seed", None
+                            )
+                            current_lexicon.reseed(original_seed)
+        elif canonical == "lexicon" and isinstance(value, Lexicon):
+            if getattr(self, "_initializing", False):
+                if getattr(self, "_owns_lexicon", False):
+                    if self.seed is not None:
+                        value.reseed(self.seed)
+                else:
+                    if getattr(self, "_external_lexicon_original_seed", None) is None:
+                        self._external_lexicon_original_seed = value.seed
+                    if self.seed is not None:
+                        value.reseed(self.seed)
+                return
+
+            self._owns_lexicon = False
+            self._external_lexicon_original_seed = value.seed
+            if self.seed is not None:
+                value.reseed(self.seed)
+            elif value.seed != self._external_lexicon_original_seed:
+                value.reseed(self._external_lexicon_original_seed)
 
 
 jargoyle = Jargoyle()
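Taken together, the constructor and `set_param` changes encode an ownership rule: a lexicon that Jargoyle builds for itself always follows the glitchling's seed, while a caller-supplied lexicon is reseeded only while a seed is in force and is restored to its original seed when the seed is cleared. A usage sketch under that reading (`TinyLexicon` is the hypothetical backend from the earlier sketch):

    j = Jargoyle(seed=42)      # owns its lexicon; the lexicon is reseeded to 42
    j.set_param("seed", 7)     # owned lexicon follows along: reseed(7)

    external = TinyLexicon({"big": ["large"]}, seed=3)
    k = Jargoyle(seed=7, lexicon=external)  # external lexicon reseeded to 7
    k.set_param("seed", None)               # restored to its original seed, 3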
glitchlings/zoo/redactyl.py
CHANGED
@@ -3,10 +3,11 @@ import random
 from typing import Any
 
 from ._rate import resolve_rate
+from ._sampling import weighted_sample_without_replacement
 from ._text_utils import (
+    WordToken,
+    collect_word_tokens,
     split_preserving_whitespace,
-    split_token_edges,
-    token_core_length,
 )
 from .core import AttackWave, Glitchling
 
@@ -19,41 +20,6 @@ except ImportError:  # pragma: no cover - compiled extension not present
     _redact_words_rust = None
 
 
-def _weighted_sample_without_replacement(
-    population: list[int],
-    weights: list[float],
-    *,
-    k: int,
-    rng: random.Random,
-) -> list[int]:
-    """Select `k` unique indices according to the given weights."""
-
-    selections: list[int] = []
-    items = list(zip(population, weights))
-    if k <= 0 or not items:
-        return selections
-    if k > len(items):
-        raise ValueError("Sample larger than population or is negative")
-
-    for _ in range(k):
-        total_weight = sum(weight for _, weight in items)
-        if total_weight <= 0:
-            chosen_index = rng.randrange(len(items))
-        else:
-            threshold = rng.random() * total_weight
-            cumulative = 0.0
-            chosen_index = len(items) - 1
-            for idx, (_, weight) in enumerate(items):
-                cumulative += weight
-                if cumulative >= threshold:
-                    chosen_index = idx
-                    break
-        value, _ = items.pop(chosen_index)
-        selections.append(value)
-
-    return selections
-
-
 def _python_redact_words(
     text: str,
     *,
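The helper deleted here is not gone: per the import in the first hunk and the new glitchlings/zoo/_sampling.py (+55 lines) in the summary above, it moved to a shared module. The removed body documents the algorithm: repeat `k` times, draw a threshold uniformly in `[0, total_weight)`, walk the cumulative weights to choose an index, and pop the chosen item so it cannot be drawn again, falling back to a uniform `randrange` when the remaining weights sum to zero. A call sketch, assuming the shared function kept the old signature:

    import random

    from glitchlings.zoo._sampling import weighted_sample_without_replacement

    rng = random.Random(0)
    picked = weighted_sample_without_replacement(
        [10, 11, 12, 13],      # candidate token indices
        [1.0, 4.0, 2.0, 1.0],  # e.g. word lengths: longer words weigh more
        k=2,
        rng=rng,
    )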
@@ -74,39 +40,45 @@ def _python_redact_words(
     - unweighted: When True, sample words uniformly instead of by length.
     """
     tokens = split_preserving_whitespace(text)
-
-    if not …
+    word_tokens = collect_word_tokens(tokens)
+    if not word_tokens:
         raise ValueError(
             "Cannot redact words because the input text contains no redactable words."
         )
-    …
+
+    population = [token.index for token in word_tokens]
+    weights = [
+        1.0 if unweighted else float(token.core_length) for token in word_tokens
+    ]
+
+    clamped_rate = max(0.0, min(rate, 1.0))
+    raw_quota = len(population) * clamped_rate
     num_to_redact = int(raw_quota)
-    if …
+    if clamped_rate > 0.0:
         num_to_redact = max(1, num_to_redact)
-    …
+    num_to_redact = min(num_to_redact, len(population))
+    if num_to_redact <= 0:
+        return "".join(tokens)
+
+    indices_to_redact = weighted_sample_without_replacement(
+        population,
         weights,
         k=num_to_redact,
         rng=rng,
     )
     indices_to_redact.sort()
 
+    token_by_index: dict[int, WordToken] = {token.index: token for token in word_tokens}
+
     for i in indices_to_redact:
         if i >= len(tokens):
             break
 
-
-        if …
+        token = token_by_index.get(i)
+        if token is None:
             continue
 
-        prefix, core, suffix = …
+        prefix, core, suffix = token.prefix, token.core, token.suffix
         tokens[i] = f"{prefix}{replacement_char * len(core)}{suffix}"
 
     text = "".join(tokens)
@@ -144,7 +116,7 @@ def redact_words(
     if rng is None:
         rng = random.Random(seed)
 
-    clamped_rate = max(0.0, effective_rate)
+    clamped_rate = max(0.0, min(effective_rate, 1.0))
     unweighted_flag = bool(unweighted)
 
     use_rust = _redact_words_rust is not None and isinstance(merge_adjacent, bool)
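Both redactyl above and reduple below now delegate token bookkeeping to the new `_text_utils` helpers. From the attribute accesses in these diffs, a `WordToken` carries at least `index` (position in the whitespace-preserving token list), `prefix`, `core`, `suffix`, and `core_length`. A probing sketch built on that inference (exact field semantics are assumed):

    from glitchlings.zoo._text_utils import collect_word_tokens, split_preserving_whitespace

    tokens = split_preserving_whitespace("Hello, world!")
    for token in collect_word_tokens(tokens):
        # e.g. "world!" presumably splits into core "world" and suffix "!"
        print(token.index, token.prefix, token.core, token.suffix, token.core_length)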
glitchlings/zoo/reduple.py
CHANGED
@@ -2,11 +2,7 @@ import random
 from typing import Any
 
 from ._rate import resolve_rate
-from ._text_utils import (
-    split_preserving_whitespace,
-    split_token_edges,
-    token_core_length,
-)
+from ._text_utils import WordToken, collect_word_tokens, split_preserving_whitespace
 from .core import AttackWave, Glitchling
 
 try:
@@ -35,29 +31,23 @@ def _python_reduplicate_words(
     - Deterministic when run with a fixed seed or via Gaggle.
     """
     tokens = split_preserving_whitespace(text)
+    word_tokens = collect_word_tokens(tokens)
 
-    candidate_weights: list[tuple[int, float]] = []
-    for i, word in enumerate(tokens):
-        …
-            continue
-
-        length = token_core_length(word)
-        weight = 1.0 if unweighted else 1.0 / length
-        candidate_weights.append((i, weight))
+    weighted_tokens: list[tuple[int, float, WordToken]] = []
+    for token in word_tokens:
+        weight = 1.0 if unweighted else 1.0 / float(token.core_length)
+        weighted_tokens.append((token.index, weight, token))
 
-    if not candidate_weights:
+    if not weighted_tokens:
         return "".join(tokens)
 
     effective_rate = max(rate, 0.0)
     if effective_rate <= 0.0:
         return "".join(tokens)
 
-    mean_weight = sum(weight for _, weight in candidate_weights) / len(
-        candidate_weights
-    )
+    mean_weight = sum(weight for _, weight, _ in weighted_tokens) / len(weighted_tokens)
 
-    for index, weight in candidate_weights:
+    for index, weight, token in weighted_tokens:
         if effective_rate >= 1.0:
             probability = 1.0
         else:
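Note the weighting is the inverse of redactyl's: with `unweighted=False`, each word weighs `1.0 / core_length`, so shorter words are duplicated more often, and each weight is later scaled against `mean_weight` to get a per-word probability (the scaling itself sits outside this hunk). Illustrative numbers:

    # core lengths 2, 4, 8 -> weights 0.5, 0.25, 0.125; mean ~= 0.292
    # the 2-letter word is ~1.7x as likely as average to be duplicated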
@@ -68,8 +58,7 @@ def _python_reduplicate_words(
         if rng.random() >= probability:
             continue
 
-        …
-        prefix, core, suffix = split_token_edges(word)
+        prefix, core, suffix = token.prefix, token.core, token.suffix
         tokens[index] = f"{prefix}{core} {core}{suffix}"
     return "".join(tokens)
 