glitchlings 0.4.1__cp311-cp311-macosx_11_0_universal2.whl → 0.4.2__cp311-cp311-macosx_11_0_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of glitchlings might be problematic. Click here for more details.
- glitchlings/__init__.py +26 -17
- glitchlings/__main__.py +0 -1
- glitchlings/_zoo_rust.cpython-311-darwin.so +0 -0
- glitchlings/compat.py +215 -0
- glitchlings/config.py +136 -19
- glitchlings/dlc/_shared.py +68 -0
- glitchlings/dlc/huggingface.py +26 -41
- glitchlings/dlc/prime.py +64 -101
- glitchlings/lexicon/__init__.py +8 -19
- glitchlings/lexicon/_cache.py +0 -7
- glitchlings/lexicon/graph.py +4 -12
- glitchlings/lexicon/metrics.py +1 -8
- glitchlings/lexicon/vector.py +15 -34
- glitchlings/lexicon/wordnet.py +31 -32
- glitchlings/main.py +9 -13
- glitchlings/util/__init__.py +18 -4
- glitchlings/util/adapters.py +27 -0
- glitchlings/zoo/__init__.py +21 -14
- glitchlings/zoo/_ocr_confusions.py +1 -3
- glitchlings/zoo/_rate.py +1 -4
- glitchlings/zoo/_sampling.py +0 -1
- glitchlings/zoo/_text_utils.py +1 -5
- glitchlings/zoo/adjax.py +0 -2
- glitchlings/zoo/core.py +114 -75
- glitchlings/zoo/jargoyle.py +9 -14
- glitchlings/zoo/mim1c.py +11 -10
- glitchlings/zoo/redactyl.py +5 -8
- glitchlings/zoo/reduple.py +3 -1
- glitchlings/zoo/rushmore.py +2 -8
- glitchlings/zoo/scannequin.py +5 -4
- glitchlings/zoo/typogre.py +3 -7
- glitchlings/zoo/zeedub.py +2 -2
- {glitchlings-0.4.1.dist-info → glitchlings-0.4.2.dist-info}/METADATA +67 -3
- glitchlings-0.4.2.dist-info/RECORD +42 -0
- glitchlings-0.4.1.dist-info/RECORD +0 -39
- {glitchlings-0.4.1.dist-info → glitchlings-0.4.2.dist-info}/WHEEL +0 -0
- {glitchlings-0.4.1.dist-info → glitchlings-0.4.2.dist-info}/entry_points.txt +0 -0
- {glitchlings-0.4.1.dist-info → glitchlings-0.4.2.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.4.1.dist-info → glitchlings-0.4.2.dist-info}/top_level.txt +0 -0
glitchlings/zoo/core.py
CHANGED
|
@@ -4,24 +4,18 @@ import inspect
|
|
|
4
4
|
import logging
|
|
5
5
|
import os
|
|
6
6
|
import random
|
|
7
|
+
from collections.abc import Mapping, Sequence
|
|
7
8
|
from enum import IntEnum, auto
|
|
8
9
|
from hashlib import blake2s
|
|
9
|
-
from typing import TYPE_CHECKING, Any, Callable, Protocol
|
|
10
|
+
from typing import TYPE_CHECKING, Any, Callable, Protocol, TypedDict, Union
|
|
10
11
|
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
except ModuleNotFoundError as error: # pragma: no cover - optional dependency
|
|
15
|
-
_DatasetsDataset = None # type: ignore[assignment]
|
|
16
|
-
_datasets_error = error
|
|
17
|
-
else:
|
|
18
|
-
_datasets_error = None
|
|
12
|
+
from ..compat import get_datasets_dataset, require_datasets
|
|
13
|
+
|
|
14
|
+
_DatasetsDataset = get_datasets_dataset()
|
|
19
15
|
|
|
20
16
|
try: # pragma: no cover - optional dependency
|
|
21
|
-
from glitchlings._zoo_rust import
|
|
22
|
-
|
|
23
|
-
plan_glitchlings as _plan_glitchlings_rust,
|
|
24
|
-
)
|
|
17
|
+
from glitchlings._zoo_rust import compose_glitchlings as _compose_glitchlings_rust
|
|
18
|
+
from glitchlings._zoo_rust import plan_glitchlings as _plan_glitchlings_rust
|
|
25
19
|
except ImportError: # pragma: no cover - compiled extension not present
|
|
26
20
|
_compose_glitchlings_rust = None
|
|
27
21
|
_plan_glitchlings_rust = None
|
|
@@ -35,9 +29,17 @@ _PIPELINE_ENABLE_VALUES = {"1", "true", "yes", "on"}
|
|
|
35
29
|
_PIPELINE_DISABLE_VALUES = {"0", "false", "no", "off"}
|
|
36
30
|
|
|
37
31
|
|
|
38
|
-
|
|
39
|
-
|
|
32
|
+
class PlanSpecification(TypedDict):
|
|
33
|
+
name: str
|
|
34
|
+
scope: int
|
|
35
|
+
order: int
|
|
40
36
|
|
|
37
|
+
|
|
38
|
+
PlanEntry = Union["Glitchling", Mapping[str, Any]]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def pipeline_feature_flag_enabled() -> bool:
|
|
42
|
+
"""Return ``True`` when the environment does not explicitly disable the Rust pipeline."""
|
|
41
43
|
value = os.environ.get(_PIPELINE_FEATURE_FLAG_ENV)
|
|
42
44
|
if value is None:
|
|
43
45
|
return True
|
|
@@ -51,12 +53,62 @@ def _pipeline_feature_flag_enabled() -> bool:
|
|
|
51
53
|
|
|
52
54
|
return True
|
|
53
55
|
|
|
56
|
+
|
|
57
|
+
def _pipeline_feature_flag_enabled() -> bool:
|
|
58
|
+
"""Compatibility shim for legacy callers."""
|
|
59
|
+
return pipeline_feature_flag_enabled()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def is_rust_pipeline_supported() -> bool:
|
|
63
|
+
"""Return ``True`` when the optional Rust extension is importable."""
|
|
64
|
+
return _compose_glitchlings_rust is not None
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def is_rust_pipeline_enabled() -> bool:
|
|
68
|
+
"""Return ``True`` when the Rust pipeline is available and not explicitly disabled."""
|
|
69
|
+
return is_rust_pipeline_supported() and pipeline_feature_flag_enabled()
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _spec_from_glitchling(glitchling: "Glitchling") -> PlanSpecification:
|
|
73
|
+
"""Create a plan specification mapping from a glitchling instance."""
|
|
74
|
+
return {
|
|
75
|
+
"name": glitchling.name,
|
|
76
|
+
"scope": int(glitchling.level),
|
|
77
|
+
"order": int(glitchling.order),
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _normalize_plan_entry(entry: PlanEntry) -> PlanSpecification:
|
|
82
|
+
"""Convert a plan entry (glitchling or mapping) into a normalized specification."""
|
|
83
|
+
if isinstance(entry, Glitchling):
|
|
84
|
+
return _spec_from_glitchling(entry)
|
|
85
|
+
|
|
86
|
+
if not isinstance(entry, Mapping):
|
|
87
|
+
message = "plan_glitchlings expects Glitchling instances or mapping specifications"
|
|
88
|
+
raise TypeError(message)
|
|
89
|
+
|
|
90
|
+
try:
|
|
91
|
+
name = str(entry["name"])
|
|
92
|
+
scope_value = int(entry["scope"])
|
|
93
|
+
order_value = int(entry["order"])
|
|
94
|
+
except KeyError as exc: # pragma: no cover - defensive guard
|
|
95
|
+
raise ValueError(f"Plan specification missing required field: {exc.args[0]}") from exc
|
|
96
|
+
except (TypeError, ValueError) as exc:
|
|
97
|
+
raise ValueError("Plan specification fields must be coercible to integers") from exc
|
|
98
|
+
|
|
99
|
+
return {"name": name, "scope": scope_value, "order": order_value}
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _normalize_plan_entries(entries: Sequence[PlanEntry]) -> list[PlanSpecification]:
|
|
103
|
+
"""Normalize a collection of orchestration plan entries."""
|
|
104
|
+
return [_normalize_plan_entry(entry) for entry in entries]
|
|
105
|
+
|
|
106
|
+
|
|
54
107
|
def _plan_glitchlings_python(
|
|
55
|
-
specs:
|
|
108
|
+
specs: Sequence[Mapping[str, Any]],
|
|
56
109
|
master_seed: int,
|
|
57
110
|
) -> list[tuple[int, int]]:
|
|
58
111
|
"""Pure-Python fallback for orchestrating glitchlings in deterministic order."""
|
|
59
|
-
|
|
60
112
|
master_seed_int = int(master_seed)
|
|
61
113
|
planned: list[tuple[int, int, int, int, str]] = []
|
|
62
114
|
for index, spec in enumerate(specs):
|
|
@@ -71,11 +123,10 @@ def _plan_glitchlings_python(
|
|
|
71
123
|
|
|
72
124
|
|
|
73
125
|
def _plan_glitchlings_with_rust(
|
|
74
|
-
specs:
|
|
126
|
+
specs: Sequence[Mapping[str, Any]],
|
|
75
127
|
master_seed: int,
|
|
76
128
|
) -> list[tuple[int, int]] | None:
|
|
77
129
|
"""Attempt to obtain the orchestration plan from the compiled Rust module."""
|
|
78
|
-
|
|
79
130
|
if _plan_glitchlings_rust is None:
|
|
80
131
|
return None
|
|
81
132
|
|
|
@@ -88,38 +139,51 @@ def _plan_glitchlings_with_rust(
|
|
|
88
139
|
return [(int(index), int(seed)) for index, seed in plan]
|
|
89
140
|
|
|
90
141
|
|
|
91
|
-
def
|
|
92
|
-
specs:
|
|
142
|
+
def _resolve_orchestration_plan(
|
|
143
|
+
specs: Sequence[PlanSpecification],
|
|
144
|
+
master_seed: int,
|
|
145
|
+
prefer_rust: bool,
|
|
146
|
+
) -> list[tuple[int, int]]:
|
|
147
|
+
"""Dispatch to the Rust planner when available, otherwise fall back to Python."""
|
|
148
|
+
if prefer_rust:
|
|
149
|
+
plan = _plan_glitchlings_with_rust(list(specs), master_seed)
|
|
150
|
+
if plan is not None:
|
|
151
|
+
return plan
|
|
152
|
+
|
|
153
|
+
return _plan_glitchlings_python(list(specs), master_seed)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def plan_glitchling_specs(
|
|
157
|
+
specs: Sequence[Mapping[str, Any]],
|
|
93
158
|
master_seed: int | None,
|
|
159
|
+
*,
|
|
160
|
+
prefer_rust: bool = True,
|
|
94
161
|
) -> list[tuple[int, int]]:
|
|
95
162
|
"""Resolve orchestration order and seeds from glitchling specifications."""
|
|
96
|
-
|
|
97
163
|
if master_seed is None:
|
|
98
164
|
message = "Gaggle orchestration requires a master seed"
|
|
99
165
|
raise ValueError(message)
|
|
100
166
|
|
|
167
|
+
normalized_specs = [_normalize_plan_entry(spec) for spec in specs]
|
|
101
168
|
master_seed_int = int(master_seed)
|
|
102
|
-
|
|
103
|
-
if plan is not None:
|
|
104
|
-
return plan
|
|
105
|
-
|
|
106
|
-
return _plan_glitchlings_python(specs, master_seed_int)
|
|
169
|
+
return _resolve_orchestration_plan(normalized_specs, master_seed_int, prefer_rust)
|
|
107
170
|
|
|
108
171
|
|
|
109
|
-
def
|
|
110
|
-
|
|
172
|
+
def plan_glitchlings(
|
|
173
|
+
entries: Sequence[PlanEntry],
|
|
174
|
+
master_seed: int | None,
|
|
175
|
+
*,
|
|
176
|
+
prefer_rust: bool = True,
|
|
111
177
|
) -> list[tuple[int, int]]:
|
|
112
|
-
"""
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
]
|
|
122
|
-
return _plan_glitchling_specs(specs, master_seed)
|
|
178
|
+
"""Normalize glitchling instances or specs and compute an orchestration plan."""
|
|
179
|
+
if master_seed is None:
|
|
180
|
+
message = "Gaggle orchestration requires a master seed"
|
|
181
|
+
raise ValueError(message)
|
|
182
|
+
|
|
183
|
+
normalized_specs = _normalize_plan_entries(entries)
|
|
184
|
+
master_seed_int = int(master_seed)
|
|
185
|
+
return _resolve_orchestration_plan(normalized_specs, master_seed_int, prefer_rust)
|
|
186
|
+
|
|
123
187
|
|
|
124
188
|
if TYPE_CHECKING: # pragma: no cover - typing only
|
|
125
189
|
from datasets import Dataset # type: ignore
|
|
@@ -140,7 +204,6 @@ def _is_transcript(
|
|
|
140
204
|
require_all_content: bool = False,
|
|
141
205
|
) -> bool:
|
|
142
206
|
"""Return `True` when `value` appears to be a chat transcript."""
|
|
143
|
-
|
|
144
207
|
if not isinstance(value, list):
|
|
145
208
|
return False
|
|
146
209
|
|
|
@@ -209,8 +272,8 @@ class Glitchling:
|
|
|
209
272
|
order: Relative ordering within the same scope.
|
|
210
273
|
seed: Optional seed for deterministic random behaviour.
|
|
211
274
|
**kwargs: Additional parameters forwarded to the corruption callable.
|
|
212
|
-
"""
|
|
213
275
|
|
|
276
|
+
"""
|
|
214
277
|
# Each Glitchling maintains its own RNG for deterministic yet isolated behavior.
|
|
215
278
|
# If no seed is supplied, we fall back to Python's default entropy.
|
|
216
279
|
self.seed = seed
|
|
@@ -228,7 +291,6 @@ class Glitchling:
|
|
|
228
291
|
|
|
229
292
|
def set_param(self, key: str, value: Any) -> None:
|
|
230
293
|
"""Persist a parameter for use by the corruption callable."""
|
|
231
|
-
|
|
232
294
|
aliases = getattr(self, "_param_aliases", {})
|
|
233
295
|
canonical = aliases.get(key, key)
|
|
234
296
|
|
|
@@ -250,7 +312,6 @@ class Glitchling:
|
|
|
250
312
|
|
|
251
313
|
def pipeline_operation(self) -> dict[str, Any] | None:
|
|
252
314
|
"""Return the Rust pipeline operation descriptor for this glitchling."""
|
|
253
|
-
|
|
254
315
|
factory = self._pipeline_descriptor_factory
|
|
255
316
|
if factory is None:
|
|
256
317
|
return None
|
|
@@ -259,15 +320,11 @@ class Glitchling:
|
|
|
259
320
|
|
|
260
321
|
def _corruption_expects_rng(self) -> bool:
|
|
261
322
|
"""Return `True` when the corruption function accepts an rng keyword."""
|
|
262
|
-
|
|
263
323
|
cached_callable = self._cached_rng_callable
|
|
264
324
|
cached_expectation = self._cached_rng_expectation
|
|
265
325
|
corruption_function = self.corruption_function
|
|
266
326
|
|
|
267
|
-
if
|
|
268
|
-
cached_callable is corruption_function
|
|
269
|
-
and cached_expectation is not None
|
|
270
|
-
):
|
|
327
|
+
if cached_callable is corruption_function and cached_expectation is not None:
|
|
271
328
|
return cached_expectation
|
|
272
329
|
|
|
273
330
|
expects_rng = False
|
|
@@ -285,7 +342,6 @@ class Glitchling:
|
|
|
285
342
|
|
|
286
343
|
def __corrupt(self, text: str, *args: Any, **kwargs: Any) -> str:
|
|
287
344
|
"""Execute the corruption callable, injecting the RNG when required."""
|
|
288
|
-
|
|
289
345
|
# Pass rng to underlying corruption function if it expects it.
|
|
290
346
|
expects_rng = self._corruption_expects_rng()
|
|
291
347
|
|
|
@@ -297,23 +353,17 @@ class Glitchling:
|
|
|
297
353
|
|
|
298
354
|
def corrupt(self, text: str | list[dict[str, Any]]) -> str | list[dict[str, Any]]:
|
|
299
355
|
"""Apply the corruption function to text or conversational transcripts."""
|
|
300
|
-
|
|
301
356
|
if _is_transcript(text):
|
|
302
357
|
transcript = [dict(turn) for turn in text]
|
|
303
358
|
if transcript:
|
|
304
|
-
transcript[-1]["content"] = self.__corrupt(
|
|
305
|
-
transcript[-1]["content"], **self.kwargs
|
|
306
|
-
)
|
|
359
|
+
transcript[-1]["content"] = self.__corrupt(transcript[-1]["content"], **self.kwargs)
|
|
307
360
|
return transcript
|
|
308
361
|
|
|
309
362
|
return self.__corrupt(text, **self.kwargs)
|
|
310
363
|
|
|
311
364
|
def corrupt_dataset(self, dataset: Dataset, columns: list[str]) -> Dataset:
|
|
312
365
|
"""Apply corruption lazily across dataset columns."""
|
|
313
|
-
|
|
314
|
-
if _DatasetsDataset is None:
|
|
315
|
-
message = "datasets is not installed"
|
|
316
|
-
raise ModuleNotFoundError(message) from _datasets_error
|
|
366
|
+
require_datasets("datasets is not installed")
|
|
317
367
|
|
|
318
368
|
def __corrupt_row(row: dict[str, Any]) -> dict[str, Any]:
|
|
319
369
|
row = dict(row)
|
|
@@ -335,12 +385,10 @@ class Glitchling:
|
|
|
335
385
|
|
|
336
386
|
def __call__(self, text: str, *args: Any, **kwds: Any) -> str | list[dict[str, Any]]:
|
|
337
387
|
"""Allow a glitchling to be invoked directly like a callable."""
|
|
338
|
-
|
|
339
388
|
return self.corrupt(text, *args, **kwds)
|
|
340
389
|
|
|
341
390
|
def reset_rng(self, seed: int | None = None) -> None:
|
|
342
391
|
"""Reset the glitchling's RNG to its initial seed."""
|
|
343
|
-
|
|
344
392
|
if seed is not None:
|
|
345
393
|
self.seed = seed
|
|
346
394
|
if self.seed is not None:
|
|
@@ -348,7 +396,6 @@ class Glitchling:
|
|
|
348
396
|
|
|
349
397
|
def clone(self, seed: int | None = None) -> "Glitchling":
|
|
350
398
|
"""Create a copy of this glitchling, optionally with a new seed."""
|
|
351
|
-
|
|
352
399
|
cls = self.__class__
|
|
353
400
|
filtered_kwargs = {k: v for k, v in self.kwargs.items() if k != "seed"}
|
|
354
401
|
clone_seed = seed if seed is not None else self.seed
|
|
@@ -368,9 +415,6 @@ class Glitchling:
|
|
|
368
415
|
return cls(**filtered_kwargs)
|
|
369
416
|
|
|
370
417
|
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
418
|
class Gaggle(Glitchling):
|
|
375
419
|
"""A collection of glitchlings executed in a deterministic order."""
|
|
376
420
|
|
|
@@ -380,8 +424,8 @@ class Gaggle(Glitchling):
|
|
|
380
424
|
Args:
|
|
381
425
|
glitchlings: Glitchlings to orchestrate.
|
|
382
426
|
seed: Master seed used to derive per-glitchling seeds.
|
|
383
|
-
"""
|
|
384
427
|
|
|
428
|
+
"""
|
|
385
429
|
super().__init__("Gaggle", self.corrupt, AttackWave.DOCUMENT, seed=seed)
|
|
386
430
|
self._clones_by_index: list[Glitchling] = []
|
|
387
431
|
for idx, glitchling in enumerate(glitchlings):
|
|
@@ -389,9 +433,7 @@ class Gaggle(Glitchling):
|
|
|
389
433
|
setattr(clone, "_gaggle_index", idx)
|
|
390
434
|
self._clones_by_index.append(clone)
|
|
391
435
|
|
|
392
|
-
self.glitchlings: dict[AttackWave, list[Glitchling]] = {
|
|
393
|
-
level: [] for level in AttackWave
|
|
394
|
-
}
|
|
436
|
+
self.glitchlings: dict[AttackWave, list[Glitchling]] = {level: [] for level in AttackWave}
|
|
395
437
|
self.apply_order: list[Glitchling] = []
|
|
396
438
|
self._plan: list[tuple[int, int]] = []
|
|
397
439
|
self.sort_glitchlings()
|
|
@@ -399,6 +441,7 @@ class Gaggle(Glitchling):
|
|
|
399
441
|
@staticmethod
|
|
400
442
|
def derive_seed(master_seed: int, glitchling_name: str, index: int) -> int:
|
|
401
443
|
"""Derive a deterministic seed for a glitchling based on the master seed."""
|
|
444
|
+
|
|
402
445
|
def _int_to_bytes(value: int) -> bytes:
|
|
403
446
|
if value == 0:
|
|
404
447
|
return b"\x00"
|
|
@@ -425,8 +468,7 @@ class Gaggle(Glitchling):
|
|
|
425
468
|
|
|
426
469
|
def sort_glitchlings(self) -> None:
|
|
427
470
|
"""Sort glitchlings by wave then order to produce application order."""
|
|
428
|
-
|
|
429
|
-
plan = _plan_glitchling_sequence(self._clones_by_index, self.seed)
|
|
471
|
+
plan = plan_glitchlings(self._clones_by_index, self.seed)
|
|
430
472
|
self._plan = plan
|
|
431
473
|
|
|
432
474
|
self.glitchlings = {level: [] for level in AttackWave}
|
|
@@ -451,14 +493,12 @@ class Gaggle(Glitchling):
|
|
|
451
493
|
@staticmethod
|
|
452
494
|
def rust_pipeline_supported() -> bool:
|
|
453
495
|
"""Return ``True`` when the compiled Rust pipeline is importable."""
|
|
454
|
-
|
|
455
|
-
return _compose_glitchlings_rust is not None
|
|
496
|
+
return is_rust_pipeline_supported()
|
|
456
497
|
|
|
457
498
|
@staticmethod
|
|
458
499
|
def rust_pipeline_enabled() -> bool:
|
|
459
500
|
"""Return ``True`` when the Rust pipeline is available and not explicitly disabled."""
|
|
460
|
-
|
|
461
|
-
return Gaggle.rust_pipeline_supported() and _pipeline_feature_flag_enabled()
|
|
501
|
+
return is_rust_pipeline_enabled()
|
|
462
502
|
|
|
463
503
|
def _pipeline_descriptors(self) -> list[dict[str, Any]] | None:
|
|
464
504
|
if not self.rust_pipeline_enabled():
|
|
@@ -490,7 +530,6 @@ class Gaggle(Glitchling):
|
|
|
490
530
|
|
|
491
531
|
def corrupt(self, text: str) -> str:
|
|
492
532
|
"""Apply each glitchling to the provided text sequentially."""
|
|
493
|
-
|
|
494
533
|
master_seed = self.seed
|
|
495
534
|
descriptors = self._pipeline_descriptors()
|
|
496
535
|
if master_seed is not None and descriptors is not None:
|
glitchlings/zoo/jargoyle.py
CHANGED
|
@@ -9,9 +9,11 @@ from glitchlings.lexicon import Lexicon, get_default_lexicon
|
|
|
9
9
|
try: # pragma: no cover - optional WordNet dependency
|
|
10
10
|
from glitchlings.lexicon.wordnet import (
|
|
11
11
|
WordNetLexicon,
|
|
12
|
+
)
|
|
13
|
+
from glitchlings.lexicon.wordnet import (
|
|
12
14
|
dependencies_available as _lexicon_dependencies_available,
|
|
13
|
-
ensure_wordnet as _lexicon_ensure_wordnet,
|
|
14
15
|
)
|
|
16
|
+
from glitchlings.lexicon.wordnet import ensure_wordnet as _lexicon_ensure_wordnet
|
|
15
17
|
except Exception: # pragma: no cover - triggered when nltk unavailable
|
|
16
18
|
WordNetLexicon = None # type: ignore[assignment]
|
|
17
19
|
|
|
@@ -33,7 +35,6 @@ ensure_wordnet = _lexicon_ensure_wordnet
|
|
|
33
35
|
|
|
34
36
|
def dependencies_available() -> bool:
|
|
35
37
|
"""Return ``True`` when a synonym backend is accessible."""
|
|
36
|
-
|
|
37
38
|
if _lexicon_dependencies_available():
|
|
38
39
|
return True
|
|
39
40
|
|
|
@@ -58,7 +59,6 @@ _VALID_POS: tuple[PartOfSpeech, ...] = ("n", "v", "a", "r")
|
|
|
58
59
|
|
|
59
60
|
def _split_token(token: str) -> tuple[str, str, str]:
|
|
60
61
|
"""Split a token into leading punctuation, core word, and trailing punctuation."""
|
|
61
|
-
|
|
62
62
|
match = re.match(r"^(\W*)(.*?)(\W*)$", token)
|
|
63
63
|
if not match:
|
|
64
64
|
return "", token, ""
|
|
@@ -70,23 +70,18 @@ def _normalize_parts_of_speech(
|
|
|
70
70
|
part_of_speech: PartOfSpeechInput,
|
|
71
71
|
) -> NormalizedPartsOfSpeech:
|
|
72
72
|
"""Coerce user input into a tuple of valid WordNet POS tags."""
|
|
73
|
-
|
|
74
73
|
if isinstance(part_of_speech, str):
|
|
75
74
|
lowered = part_of_speech.lower()
|
|
76
75
|
if lowered == "any":
|
|
77
76
|
return _VALID_POS
|
|
78
77
|
if lowered not in _VALID_POS:
|
|
79
|
-
raise ValueError(
|
|
80
|
-
"part_of_speech must be one of 'n', 'v', 'a', 'r', or 'any'"
|
|
81
|
-
)
|
|
78
|
+
raise ValueError("part_of_speech must be one of 'n', 'v', 'a', 'r', or 'any'")
|
|
82
79
|
return (cast(PartOfSpeech, lowered),)
|
|
83
80
|
|
|
84
81
|
normalized: list[PartOfSpeech] = []
|
|
85
82
|
for pos in part_of_speech:
|
|
86
83
|
if pos not in _VALID_POS:
|
|
87
|
-
raise ValueError(
|
|
88
|
-
"part_of_speech entries must be one of 'n', 'v', 'a', or 'r'"
|
|
89
|
-
)
|
|
84
|
+
raise ValueError("part_of_speech entries must be one of 'n', 'v', 'a', or 'r'")
|
|
90
85
|
if pos not in normalized:
|
|
91
86
|
normalized.append(pos)
|
|
92
87
|
if not normalized:
|
|
@@ -118,6 +113,7 @@ def substitute_random_synonyms(
|
|
|
118
113
|
"""Replace words with random lexicon-driven synonyms.
|
|
119
114
|
|
|
120
115
|
Parameters
|
|
116
|
+
----------
|
|
121
117
|
- text: Input text.
|
|
122
118
|
- rate: Max proportion of candidate words to replace (default 0.01).
|
|
123
119
|
- part_of_speech: WordNet POS tag(s) to target. Accepts "n", "v", "a", "r",
|
|
@@ -134,6 +130,7 @@ def substitute_random_synonyms(
|
|
|
134
130
|
- Replacement positions chosen via rng.sample.
|
|
135
131
|
- Synonyms sourced through the lexicon; the default backend derives
|
|
136
132
|
deterministic subsets per word and part-of-speech using the active seed.
|
|
133
|
+
|
|
137
134
|
"""
|
|
138
135
|
effective_rate = resolve_rate(
|
|
139
136
|
rate=rate,
|
|
@@ -168,7 +165,7 @@ def substitute_random_synonyms(
|
|
|
168
165
|
# Split but keep whitespace separators so we can rebuild easily
|
|
169
166
|
tokens = re.split(r"(\s+)", text)
|
|
170
167
|
|
|
171
|
-
# Collect
|
|
168
|
+
# Collect candidate word indices (even positions are words because separators are kept)
|
|
172
169
|
candidate_indices: list[int] = []
|
|
173
170
|
candidate_metadata: dict[int, CandidateInfo] = {}
|
|
174
171
|
for idx, tok in enumerate(tokens):
|
|
@@ -296,9 +293,7 @@ class Jargoyle(Glitchling):
|
|
|
296
293
|
current_lexicon.reseed(self.seed)
|
|
297
294
|
else:
|
|
298
295
|
if hasattr(self, "_external_lexicon_original_seed"):
|
|
299
|
-
original_seed = getattr(
|
|
300
|
-
self, "_external_lexicon_original_seed", None
|
|
301
|
-
)
|
|
296
|
+
original_seed = getattr(self, "_external_lexicon_original_seed", None)
|
|
302
297
|
current_lexicon.reseed(original_seed)
|
|
303
298
|
elif canonical == "lexicon" and isinstance(value, Lexicon):
|
|
304
299
|
if getattr(self, "_initializing", False):
|
glitchlings/zoo/mim1c.py
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
from collections.abc import Collection
|
|
2
1
|
import random
|
|
2
|
+
from collections.abc import Collection
|
|
3
3
|
from typing import Literal
|
|
4
4
|
|
|
5
5
|
from confusable_homoglyphs import confusables
|
|
6
6
|
|
|
7
|
-
from .core import AttackOrder, AttackWave, Glitchling
|
|
8
7
|
from ._rate import resolve_rate
|
|
8
|
+
from .core import AttackOrder, AttackWave, Glitchling
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
def swap_homoglyphs(
|
|
@@ -21,16 +21,21 @@ def swap_homoglyphs(
|
|
|
21
21
|
"""Replace characters with visually confusable homoglyphs.
|
|
22
22
|
|
|
23
23
|
Parameters
|
|
24
|
+
----------
|
|
24
25
|
- text: Input text.
|
|
25
26
|
- rate: Max proportion of eligible characters to replace (default 0.02).
|
|
26
|
-
- classes: Restrict replacements to these Unicode script classes (default
|
|
27
|
+
- classes: Restrict replacements to these Unicode script classes (default
|
|
28
|
+
["LATIN", "GREEK", "CYRILLIC"]). Use "all" to allow any.
|
|
27
29
|
- banned_characters: Characters that must never appear as replacements.
|
|
28
30
|
- seed: Optional seed if `rng` not provided.
|
|
29
31
|
- rng: Optional RNG; overrides seed.
|
|
30
32
|
|
|
31
33
|
Notes
|
|
32
|
-
|
|
34
|
+
-----
|
|
35
|
+
- Only replaces characters present in ``confusables.confusables_data`` with
|
|
36
|
+
single-codepoint alternatives.
|
|
33
37
|
- Maintains determinism by shuffling candidates and sampling via the provided RNG.
|
|
38
|
+
|
|
34
39
|
"""
|
|
35
40
|
effective_rate = resolve_rate(
|
|
36
41
|
rate=rate,
|
|
@@ -46,9 +51,7 @@ def swap_homoglyphs(
|
|
|
46
51
|
classes = ["LATIN", "GREEK", "CYRILLIC"]
|
|
47
52
|
|
|
48
53
|
target_chars = [char for char in text if char.isalnum()]
|
|
49
|
-
confusable_chars = [
|
|
50
|
-
char for char in target_chars if char in confusables.confusables_data
|
|
51
|
-
]
|
|
54
|
+
confusable_chars = [char for char in target_chars if char in confusables.confusables_data]
|
|
52
55
|
clamped_rate = max(0.0, effective_rate)
|
|
53
56
|
num_replacements = int(len(confusable_chars) * clamped_rate)
|
|
54
57
|
done = 0
|
|
@@ -57,9 +60,7 @@ def swap_homoglyphs(
|
|
|
57
60
|
for char in confusable_chars:
|
|
58
61
|
if done >= num_replacements:
|
|
59
62
|
break
|
|
60
|
-
options = [
|
|
61
|
-
o["c"] for o in confusables.confusables_data[char] if len(o["c"]) == 1
|
|
62
|
-
]
|
|
63
|
+
options = [o["c"] for o in confusables.confusables_data[char] if len(o["c"]) == 1]
|
|
63
64
|
if classes != "all":
|
|
64
65
|
options = [opt for opt in options if confusables.alias(opt) in classes]
|
|
65
66
|
if banned_set:
|
glitchlings/zoo/redactyl.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import re
|
|
2
1
|
import random
|
|
2
|
+
import re
|
|
3
3
|
from typing import Any
|
|
4
4
|
|
|
5
5
|
from ._rate import resolve_rate
|
|
@@ -32,24 +32,22 @@ def _python_redact_words(
|
|
|
32
32
|
"""Redact random words by replacing their characters.
|
|
33
33
|
|
|
34
34
|
Parameters
|
|
35
|
+
----------
|
|
35
36
|
- text: Input text.
|
|
36
37
|
- replacement_char: The character to use for redaction (default FULL_BLOCK).
|
|
37
38
|
- rate: Max proportion of words to redact (default 0.05).
|
|
38
39
|
- merge_adjacent: If True, merges adjacent redactions across intervening non-word chars.
|
|
39
40
|
- rng: RNG used for sampling decisions.
|
|
40
41
|
- unweighted: When True, sample words uniformly instead of by length.
|
|
42
|
+
|
|
41
43
|
"""
|
|
42
44
|
tokens = split_preserving_whitespace(text)
|
|
43
45
|
word_tokens = collect_word_tokens(tokens)
|
|
44
46
|
if not word_tokens:
|
|
45
|
-
raise ValueError(
|
|
46
|
-
"Cannot redact words because the input text contains no redactable words."
|
|
47
|
-
)
|
|
47
|
+
raise ValueError("Cannot redact words because the input text contains no redactable words.")
|
|
48
48
|
|
|
49
49
|
population = [token.index for token in word_tokens]
|
|
50
|
-
weights = [
|
|
51
|
-
1.0 if unweighted else float(token.core_length) for token in word_tokens
|
|
52
|
-
]
|
|
50
|
+
weights = [1.0 if unweighted else float(token.core_length) for token in word_tokens]
|
|
53
51
|
|
|
54
52
|
clamped_rate = max(0.0, min(rate, 1.0))
|
|
55
53
|
raw_quota = len(population) * clamped_rate
|
|
@@ -105,7 +103,6 @@ def redact_words(
|
|
|
105
103
|
unweighted: bool = False,
|
|
106
104
|
) -> str:
|
|
107
105
|
"""Redact random words by replacing their characters."""
|
|
108
|
-
|
|
109
106
|
effective_rate = resolve_rate(
|
|
110
107
|
rate=rate,
|
|
111
108
|
legacy_value=redaction_rate,
|
glitchlings/zoo/reduple.py
CHANGED
|
@@ -21,14 +21,17 @@ def _python_reduplicate_words(
|
|
|
21
21
|
"""Randomly reduplicate words in the text.
|
|
22
22
|
|
|
23
23
|
Parameters
|
|
24
|
+
----------
|
|
24
25
|
- text: Input text.
|
|
25
26
|
- rate: Max proportion of words to reduplicate (default 0.05).
|
|
26
27
|
- rng: RNG used for sampling decisions.
|
|
27
28
|
- unweighted: When True, sample words uniformly instead of length-weighted.
|
|
28
29
|
|
|
29
30
|
Notes
|
|
31
|
+
-----
|
|
30
32
|
- Preserves spacing and punctuation by tokenizing with separators.
|
|
31
33
|
- Deterministic when run with a fixed seed or via Gaggle.
|
|
34
|
+
|
|
32
35
|
"""
|
|
33
36
|
tokens = split_preserving_whitespace(text)
|
|
34
37
|
word_tokens = collect_word_tokens(tokens)
|
|
@@ -77,7 +80,6 @@ def reduplicate_words(
|
|
|
77
80
|
Falls back to the Python implementation when the optional Rust
|
|
78
81
|
extension is unavailable.
|
|
79
82
|
"""
|
|
80
|
-
|
|
81
83
|
effective_rate = resolve_rate(
|
|
82
84
|
rate=rate,
|
|
83
85
|
legacy_value=reduplication_rate,
|
glitchlings/zoo/rushmore.py
CHANGED
|
@@ -21,7 +21,6 @@ def _python_delete_random_words(
|
|
|
21
21
|
unweighted: bool = False,
|
|
22
22
|
) -> str:
|
|
23
23
|
"""Delete random words from the input text while preserving whitespace."""
|
|
24
|
-
|
|
25
24
|
effective_rate = max(rate, 0.0)
|
|
26
25
|
if effective_rate <= 0.0:
|
|
27
26
|
return text
|
|
@@ -37,15 +36,11 @@ def _python_delete_random_words(
|
|
|
37
36
|
if not weighted_tokens:
|
|
38
37
|
return text
|
|
39
38
|
|
|
40
|
-
allowed_deletions = min(
|
|
41
|
-
len(weighted_tokens), math.floor(len(weighted_tokens) * effective_rate)
|
|
42
|
-
)
|
|
39
|
+
allowed_deletions = min(len(weighted_tokens), math.floor(len(weighted_tokens) * effective_rate))
|
|
43
40
|
if allowed_deletions <= 0:
|
|
44
41
|
return text
|
|
45
42
|
|
|
46
|
-
mean_weight = sum(weight for _, weight, _ in weighted_tokens) / len(
|
|
47
|
-
weighted_tokens
|
|
48
|
-
)
|
|
43
|
+
mean_weight = sum(weight for _, weight, _ in weighted_tokens) / len(weighted_tokens)
|
|
49
44
|
|
|
50
45
|
deletions = 0
|
|
51
46
|
for index, weight, token in weighted_tokens:
|
|
@@ -88,7 +83,6 @@ def delete_random_words(
|
|
|
88
83
|
|
|
89
84
|
Uses the optional Rust implementation when available.
|
|
90
85
|
"""
|
|
91
|
-
|
|
92
86
|
effective_rate = resolve_rate(
|
|
93
87
|
rate=rate,
|
|
94
88
|
legacy_value=max_deletion_rate,
|
glitchlings/zoo/scannequin.py
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
import re
|
|
2
1
|
import random
|
|
2
|
+
import re
|
|
3
3
|
from typing import Any
|
|
4
4
|
|
|
5
5
|
from ._ocr_confusions import load_confusion_table
|
|
6
|
-
from .core import Glitchling, AttackWave, AttackOrder
|
|
7
6
|
from ._rate import resolve_rate
|
|
7
|
+
from .core import AttackOrder, AttackWave, Glitchling
|
|
8
8
|
|
|
9
9
|
try:
|
|
10
10
|
from glitchlings._zoo_rust import ocr_artifacts as _ocr_artifacts_rust
|
|
@@ -21,17 +21,20 @@ def _python_ocr_artifacts(
|
|
|
21
21
|
"""Introduce OCR-like artifacts into text.
|
|
22
22
|
|
|
23
23
|
Parameters
|
|
24
|
+
----------
|
|
24
25
|
- text: Input text to corrupt.
|
|
25
26
|
- rate: Max proportion of eligible confusion matches to replace (default 0.02).
|
|
26
27
|
- seed: Optional seed if `rng` not provided.
|
|
27
28
|
- rng: Optional RNG; overrides seed.
|
|
28
29
|
|
|
29
30
|
Notes
|
|
31
|
+
-----
|
|
30
32
|
- Uses a curated set of common OCR confusions (rn↔m, cl↔d, O↔0, l/I/1, etc.).
|
|
31
33
|
- Collects all non-overlapping candidate spans in reading order, then samples
|
|
32
34
|
a subset deterministically with the provided RNG.
|
|
33
35
|
- Replacements can change length (e.g., m→rn), so edits are applied from left
|
|
34
36
|
to right using precomputed spans to avoid index drift.
|
|
37
|
+
|
|
35
38
|
"""
|
|
36
39
|
if not text:
|
|
37
40
|
return text
|
|
@@ -107,7 +110,6 @@ def ocr_artifacts(
|
|
|
107
110
|
|
|
108
111
|
Prefers the Rust implementation when available.
|
|
109
112
|
"""
|
|
110
|
-
|
|
111
113
|
if not text:
|
|
112
114
|
return text
|
|
113
115
|
|
|
@@ -164,7 +166,6 @@ class Scannequin(Glitchling):
|
|
|
164
166
|
return {"type": "ocr", "error_rate": float(rate)}
|
|
165
167
|
|
|
166
168
|
|
|
167
|
-
|
|
168
169
|
scannequin = Scannequin()
|
|
169
170
|
|
|
170
171
|
|