glitchlings 0.4.1__cp312-cp312-win_amd64.whl → 0.4.3__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of glitchlings might be problematic. Click here for more details.
- glitchlings/__init__.py +30 -17
- glitchlings/__main__.py +0 -1
- glitchlings/_zoo_rust.cp312-win_amd64.pyd +0 -0
- glitchlings/compat.py +284 -0
- glitchlings/config.py +164 -34
- glitchlings/config.toml +1 -1
- glitchlings/dlc/__init__.py +3 -1
- glitchlings/dlc/_shared.py +68 -0
- glitchlings/dlc/huggingface.py +26 -41
- glitchlings/dlc/prime.py +64 -101
- glitchlings/dlc/pytorch.py +216 -0
- glitchlings/dlc/pytorch_lightning.py +233 -0
- glitchlings/lexicon/__init__.py +12 -33
- glitchlings/lexicon/_cache.py +21 -22
- glitchlings/lexicon/data/default_vector_cache.json +80 -14
- glitchlings/lexicon/metrics.py +1 -8
- glitchlings/lexicon/vector.py +109 -49
- glitchlings/lexicon/wordnet.py +89 -49
- glitchlings/main.py +30 -24
- glitchlings/util/__init__.py +18 -4
- glitchlings/util/adapters.py +27 -0
- glitchlings/zoo/__init__.py +26 -15
- glitchlings/zoo/_ocr_confusions.py +1 -3
- glitchlings/zoo/_rate.py +1 -4
- glitchlings/zoo/_sampling.py +0 -1
- glitchlings/zoo/_text_utils.py +1 -5
- glitchlings/zoo/adjax.py +2 -4
- glitchlings/zoo/apostrofae.py +128 -0
- glitchlings/zoo/assets/__init__.py +0 -0
- glitchlings/zoo/assets/apostrofae_pairs.json +32 -0
- glitchlings/zoo/core.py +152 -87
- glitchlings/zoo/jargoyle.py +50 -45
- glitchlings/zoo/mim1c.py +11 -10
- glitchlings/zoo/redactyl.py +16 -16
- glitchlings/zoo/reduple.py +5 -3
- glitchlings/zoo/rushmore.py +4 -10
- glitchlings/zoo/scannequin.py +7 -6
- glitchlings/zoo/typogre.py +8 -9
- glitchlings/zoo/zeedub.py +6 -3
- {glitchlings-0.4.1.dist-info → glitchlings-0.4.3.dist-info}/METADATA +101 -4
- glitchlings-0.4.3.dist-info/RECORD +46 -0
- glitchlings/lexicon/graph.py +0 -290
- glitchlings-0.4.1.dist-info/RECORD +0 -39
- {glitchlings-0.4.1.dist-info → glitchlings-0.4.3.dist-info}/WHEEL +0 -0
- {glitchlings-0.4.1.dist-info → glitchlings-0.4.3.dist-info}/entry_points.txt +0 -0
- {glitchlings-0.4.1.dist-info → glitchlings-0.4.3.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.4.1.dist-info → glitchlings-0.4.3.dist-info}/top_level.txt +0 -0
glitchlings/zoo/core.py
CHANGED
|
@@ -4,24 +4,18 @@ import inspect
|
|
|
4
4
|
import logging
|
|
5
5
|
import os
|
|
6
6
|
import random
|
|
7
|
+
from collections.abc import Mapping, Sequence
|
|
7
8
|
from enum import IntEnum, auto
|
|
8
9
|
from hashlib import blake2s
|
|
9
|
-
from typing import TYPE_CHECKING, Any, Callable, Protocol
|
|
10
|
+
from typing import TYPE_CHECKING, Any, Callable, Protocol, TypedDict, TypeGuard, Union, cast
|
|
10
11
|
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
except ModuleNotFoundError as error: # pragma: no cover - optional dependency
|
|
15
|
-
_DatasetsDataset = None # type: ignore[assignment]
|
|
16
|
-
_datasets_error = error
|
|
17
|
-
else:
|
|
18
|
-
_datasets_error = None
|
|
12
|
+
from ..compat import get_datasets_dataset, require_datasets
|
|
13
|
+
|
|
14
|
+
_DatasetsDataset = get_datasets_dataset()
|
|
19
15
|
|
|
20
16
|
try: # pragma: no cover - optional dependency
|
|
21
|
-
from glitchlings._zoo_rust import
|
|
22
|
-
|
|
23
|
-
plan_glitchlings as _plan_glitchlings_rust,
|
|
24
|
-
)
|
|
17
|
+
from glitchlings._zoo_rust import compose_glitchlings as _compose_glitchlings_rust
|
|
18
|
+
from glitchlings._zoo_rust import plan_glitchlings as _plan_glitchlings_rust
|
|
25
19
|
except ImportError: # pragma: no cover - compiled extension not present
|
|
26
20
|
_compose_glitchlings_rust = None
|
|
27
21
|
_plan_glitchlings_rust = None
|
|
@@ -35,9 +29,20 @@ _PIPELINE_ENABLE_VALUES = {"1", "true", "yes", "on"}
|
|
|
35
29
|
_PIPELINE_DISABLE_VALUES = {"0", "false", "no", "off"}
|
|
36
30
|
|
|
37
31
|
|
|
38
|
-
|
|
39
|
-
|
|
32
|
+
class PlanSpecification(TypedDict):
|
|
33
|
+
name: str
|
|
34
|
+
scope: int
|
|
35
|
+
order: int
|
|
40
36
|
|
|
37
|
+
|
|
38
|
+
TranscriptTurn = dict[str, Any]
|
|
39
|
+
Transcript = list[TranscriptTurn]
|
|
40
|
+
|
|
41
|
+
PlanEntry = Union["Glitchling", Mapping[str, Any]]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def pipeline_feature_flag_enabled() -> bool:
|
|
45
|
+
"""Return ``True`` when the environment does not explicitly disable the Rust pipeline."""
|
|
41
46
|
value = os.environ.get(_PIPELINE_FEATURE_FLAG_ENV)
|
|
42
47
|
if value is None:
|
|
43
48
|
return True
|
|
@@ -51,12 +56,62 @@ def _pipeline_feature_flag_enabled() -> bool:
|
|
|
51
56
|
|
|
52
57
|
return True
|
|
53
58
|
|
|
59
|
+
|
|
60
|
+
def _pipeline_feature_flag_enabled() -> bool:
|
|
61
|
+
"""Compatibility shim for legacy callers."""
|
|
62
|
+
return pipeline_feature_flag_enabled()
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def is_rust_pipeline_supported() -> bool:
|
|
66
|
+
"""Return ``True`` when the optional Rust extension is importable."""
|
|
67
|
+
return _compose_glitchlings_rust is not None
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def is_rust_pipeline_enabled() -> bool:
|
|
71
|
+
"""Return ``True`` when the Rust pipeline is available and not explicitly disabled."""
|
|
72
|
+
return is_rust_pipeline_supported() and pipeline_feature_flag_enabled()
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _spec_from_glitchling(glitchling: "Glitchling") -> PlanSpecification:
|
|
76
|
+
"""Create a plan specification mapping from a glitchling instance."""
|
|
77
|
+
return {
|
|
78
|
+
"name": glitchling.name,
|
|
79
|
+
"scope": int(glitchling.level),
|
|
80
|
+
"order": int(glitchling.order),
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _normalize_plan_entry(entry: PlanEntry) -> PlanSpecification:
|
|
85
|
+
"""Convert a plan entry (glitchling or mapping) into a normalized specification."""
|
|
86
|
+
if isinstance(entry, Glitchling):
|
|
87
|
+
return _spec_from_glitchling(entry)
|
|
88
|
+
|
|
89
|
+
if not isinstance(entry, Mapping):
|
|
90
|
+
message = "plan_glitchlings expects Glitchling instances or mapping specifications"
|
|
91
|
+
raise TypeError(message)
|
|
92
|
+
|
|
93
|
+
try:
|
|
94
|
+
name = str(entry["name"])
|
|
95
|
+
scope_value = int(entry["scope"])
|
|
96
|
+
order_value = int(entry["order"])
|
|
97
|
+
except KeyError as exc: # pragma: no cover - defensive guard
|
|
98
|
+
raise ValueError(f"Plan specification missing required field: {exc.args[0]}") from exc
|
|
99
|
+
except (TypeError, ValueError) as exc:
|
|
100
|
+
raise ValueError("Plan specification fields must be coercible to integers") from exc
|
|
101
|
+
|
|
102
|
+
return {"name": name, "scope": scope_value, "order": order_value}
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _normalize_plan_entries(entries: Sequence[PlanEntry]) -> list[PlanSpecification]:
|
|
106
|
+
"""Normalize a collection of orchestration plan entries."""
|
|
107
|
+
return [_normalize_plan_entry(entry) for entry in entries]
|
|
108
|
+
|
|
109
|
+
|
|
54
110
|
def _plan_glitchlings_python(
|
|
55
|
-
specs:
|
|
111
|
+
specs: Sequence[Mapping[str, Any]],
|
|
56
112
|
master_seed: int,
|
|
57
113
|
) -> list[tuple[int, int]]:
|
|
58
114
|
"""Pure-Python fallback for orchestrating glitchlings in deterministic order."""
|
|
59
|
-
|
|
60
115
|
master_seed_int = int(master_seed)
|
|
61
116
|
planned: list[tuple[int, int, int, int, str]] = []
|
|
62
117
|
for index, spec in enumerate(specs):
|
|
@@ -71,11 +126,10 @@ def _plan_glitchlings_python(
|
|
|
71
126
|
|
|
72
127
|
|
|
73
128
|
def _plan_glitchlings_with_rust(
|
|
74
|
-
specs:
|
|
129
|
+
specs: Sequence[Mapping[str, Any]],
|
|
75
130
|
master_seed: int,
|
|
76
131
|
) -> list[tuple[int, int]] | None:
|
|
77
132
|
"""Attempt to obtain the orchestration plan from the compiled Rust module."""
|
|
78
|
-
|
|
79
133
|
if _plan_glitchlings_rust is None:
|
|
80
134
|
return None
|
|
81
135
|
|
|
@@ -88,41 +142,54 @@ def _plan_glitchlings_with_rust(
|
|
|
88
142
|
return [(int(index), int(seed)) for index, seed in plan]
|
|
89
143
|
|
|
90
144
|
|
|
91
|
-
def
|
|
92
|
-
specs:
|
|
145
|
+
def _resolve_orchestration_plan(
|
|
146
|
+
specs: Sequence[PlanSpecification],
|
|
147
|
+
master_seed: int,
|
|
148
|
+
prefer_rust: bool,
|
|
149
|
+
) -> list[tuple[int, int]]:
|
|
150
|
+
"""Dispatch to the Rust planner when available, otherwise fall back to Python."""
|
|
151
|
+
if prefer_rust:
|
|
152
|
+
plan = _plan_glitchlings_with_rust(list(specs), master_seed)
|
|
153
|
+
if plan is not None:
|
|
154
|
+
return plan
|
|
155
|
+
|
|
156
|
+
return _plan_glitchlings_python(list(specs), master_seed)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def plan_glitchling_specs(
|
|
160
|
+
specs: Sequence[Mapping[str, Any]],
|
|
93
161
|
master_seed: int | None,
|
|
162
|
+
*,
|
|
163
|
+
prefer_rust: bool = True,
|
|
94
164
|
) -> list[tuple[int, int]]:
|
|
95
165
|
"""Resolve orchestration order and seeds from glitchling specifications."""
|
|
96
|
-
|
|
97
166
|
if master_seed is None:
|
|
98
167
|
message = "Gaggle orchestration requires a master seed"
|
|
99
168
|
raise ValueError(message)
|
|
100
169
|
|
|
170
|
+
normalized_specs = [_normalize_plan_entry(spec) for spec in specs]
|
|
101
171
|
master_seed_int = int(master_seed)
|
|
102
|
-
|
|
103
|
-
if plan is not None:
|
|
104
|
-
return plan
|
|
105
|
-
|
|
106
|
-
return _plan_glitchlings_python(specs, master_seed_int)
|
|
172
|
+
return _resolve_orchestration_plan(normalized_specs, master_seed_int, prefer_rust)
|
|
107
173
|
|
|
108
174
|
|
|
109
|
-
def
|
|
110
|
-
|
|
175
|
+
def plan_glitchlings(
|
|
176
|
+
entries: Sequence[PlanEntry],
|
|
177
|
+
master_seed: int | None,
|
|
178
|
+
*,
|
|
179
|
+
prefer_rust: bool = True,
|
|
111
180
|
) -> list[tuple[int, int]]:
|
|
112
|
-
"""
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
]
|
|
122
|
-
return _plan_glitchling_specs(specs, master_seed)
|
|
181
|
+
"""Normalize glitchling instances or specs and compute an orchestration plan."""
|
|
182
|
+
if master_seed is None:
|
|
183
|
+
message = "Gaggle orchestration requires a master seed"
|
|
184
|
+
raise ValueError(message)
|
|
185
|
+
|
|
186
|
+
normalized_specs = _normalize_plan_entries(entries)
|
|
187
|
+
master_seed_int = int(master_seed)
|
|
188
|
+
return _resolve_orchestration_plan(normalized_specs, master_seed_int, prefer_rust)
|
|
189
|
+
|
|
123
190
|
|
|
124
191
|
if TYPE_CHECKING: # pragma: no cover - typing only
|
|
125
|
-
from datasets import Dataset
|
|
192
|
+
from datasets import Dataset
|
|
126
193
|
elif _DatasetsDataset is not None:
|
|
127
194
|
Dataset = _DatasetsDataset
|
|
128
195
|
else:
|
|
@@ -138,9 +205,8 @@ def _is_transcript(
|
|
|
138
205
|
*,
|
|
139
206
|
allow_empty: bool = True,
|
|
140
207
|
require_all_content: bool = False,
|
|
141
|
-
) ->
|
|
142
|
-
"""Return
|
|
143
|
-
|
|
208
|
+
) -> TypeGuard[Transcript]:
|
|
209
|
+
"""Return ``True`` when ``value`` appears to be a chat transcript."""
|
|
144
210
|
if not isinstance(value, list):
|
|
145
211
|
return False
|
|
146
212
|
|
|
@@ -209,8 +275,8 @@ class Glitchling:
|
|
|
209
275
|
order: Relative ordering within the same scope.
|
|
210
276
|
seed: Optional seed for deterministic random behaviour.
|
|
211
277
|
**kwargs: Additional parameters forwarded to the corruption callable.
|
|
212
|
-
"""
|
|
213
278
|
|
|
279
|
+
"""
|
|
214
280
|
# Each Glitchling maintains its own RNG for deterministic yet isolated behavior.
|
|
215
281
|
# If no seed is supplied, we fall back to Python's default entropy.
|
|
216
282
|
self.seed = seed
|
|
@@ -228,7 +294,6 @@ class Glitchling:
|
|
|
228
294
|
|
|
229
295
|
def set_param(self, key: str, value: Any) -> None:
|
|
230
296
|
"""Persist a parameter for use by the corruption callable."""
|
|
231
|
-
|
|
232
297
|
aliases = getattr(self, "_param_aliases", {})
|
|
233
298
|
canonical = aliases.get(key, key)
|
|
234
299
|
|
|
@@ -250,7 +315,6 @@ class Glitchling:
|
|
|
250
315
|
|
|
251
316
|
def pipeline_operation(self) -> dict[str, Any] | None:
|
|
252
317
|
"""Return the Rust pipeline operation descriptor for this glitchling."""
|
|
253
|
-
|
|
254
318
|
factory = self._pipeline_descriptor_factory
|
|
255
319
|
if factory is None:
|
|
256
320
|
return None
|
|
@@ -259,15 +323,11 @@ class Glitchling:
|
|
|
259
323
|
|
|
260
324
|
def _corruption_expects_rng(self) -> bool:
|
|
261
325
|
"""Return `True` when the corruption function accepts an rng keyword."""
|
|
262
|
-
|
|
263
326
|
cached_callable = self._cached_rng_callable
|
|
264
327
|
cached_expectation = self._cached_rng_expectation
|
|
265
328
|
corruption_function = self.corruption_function
|
|
266
329
|
|
|
267
|
-
if
|
|
268
|
-
cached_callable is corruption_function
|
|
269
|
-
and cached_expectation is not None
|
|
270
|
-
):
|
|
330
|
+
if cached_callable is corruption_function and cached_expectation is not None:
|
|
271
331
|
return cached_expectation
|
|
272
332
|
|
|
273
333
|
expects_rng = False
|
|
@@ -285,7 +345,6 @@ class Glitchling:
|
|
|
285
345
|
|
|
286
346
|
def __corrupt(self, text: str, *args: Any, **kwargs: Any) -> str:
|
|
287
347
|
"""Execute the corruption callable, injecting the RNG when required."""
|
|
288
|
-
|
|
289
348
|
# Pass rng to underlying corruption function if it expects it.
|
|
290
349
|
expects_rng = self._corruption_expects_rng()
|
|
291
350
|
|
|
@@ -295,25 +354,21 @@ class Glitchling:
|
|
|
295
354
|
corrupted = self.corruption_function(text, *args, **kwargs)
|
|
296
355
|
return corrupted
|
|
297
356
|
|
|
298
|
-
def corrupt(self, text: str |
|
|
357
|
+
def corrupt(self, text: str | Transcript) -> str | Transcript:
|
|
299
358
|
"""Apply the corruption function to text or conversational transcripts."""
|
|
300
|
-
|
|
301
359
|
if _is_transcript(text):
|
|
302
|
-
transcript = [dict(turn) for turn in text]
|
|
360
|
+
transcript: Transcript = [dict(turn) for turn in text]
|
|
303
361
|
if transcript:
|
|
304
|
-
transcript[-1]
|
|
305
|
-
|
|
306
|
-
|
|
362
|
+
content = transcript[-1].get("content")
|
|
363
|
+
if isinstance(content, str):
|
|
364
|
+
transcript[-1]["content"] = self.__corrupt(content, **self.kwargs)
|
|
307
365
|
return transcript
|
|
308
366
|
|
|
309
|
-
return self.__corrupt(text, **self.kwargs)
|
|
367
|
+
return self.__corrupt(cast(str, text), **self.kwargs)
|
|
310
368
|
|
|
311
369
|
def corrupt_dataset(self, dataset: Dataset, columns: list[str]) -> Dataset:
|
|
312
370
|
"""Apply corruption lazily across dataset columns."""
|
|
313
|
-
|
|
314
|
-
if _DatasetsDataset is None:
|
|
315
|
-
message = "datasets is not installed"
|
|
316
|
-
raise ModuleNotFoundError(message) from _datasets_error
|
|
371
|
+
require_datasets("datasets is not installed")
|
|
317
372
|
|
|
318
373
|
def __corrupt_row(row: dict[str, Any]) -> dict[str, Any]:
|
|
319
374
|
row = dict(row)
|
|
@@ -333,14 +388,12 @@ class Glitchling:
|
|
|
333
388
|
|
|
334
389
|
return dataset.with_transform(__corrupt_row)
|
|
335
390
|
|
|
336
|
-
def __call__(self, text: str, *args: Any, **kwds: Any) -> str |
|
|
391
|
+
def __call__(self, text: str, *args: Any, **kwds: Any) -> str | Transcript:
|
|
337
392
|
"""Allow a glitchling to be invoked directly like a callable."""
|
|
338
|
-
|
|
339
393
|
return self.corrupt(text, *args, **kwds)
|
|
340
394
|
|
|
341
395
|
def reset_rng(self, seed: int | None = None) -> None:
|
|
342
396
|
"""Reset the glitchling's RNG to its initial seed."""
|
|
343
|
-
|
|
344
397
|
if seed is not None:
|
|
345
398
|
self.seed = seed
|
|
346
399
|
if self.seed is not None:
|
|
@@ -348,7 +401,6 @@ class Glitchling:
|
|
|
348
401
|
|
|
349
402
|
def clone(self, seed: int | None = None) -> "Glitchling":
|
|
350
403
|
"""Create a copy of this glitchling, optionally with a new seed."""
|
|
351
|
-
|
|
352
404
|
cls = self.__class__
|
|
353
405
|
filtered_kwargs = {k: v for k, v in self.kwargs.items() if k != "seed"}
|
|
354
406
|
clone_seed = seed if seed is not None else self.seed
|
|
@@ -368,9 +420,6 @@ class Glitchling:
|
|
|
368
420
|
return cls(**filtered_kwargs)
|
|
369
421
|
|
|
370
422
|
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
423
|
class Gaggle(Glitchling):
|
|
375
424
|
"""A collection of glitchlings executed in a deterministic order."""
|
|
376
425
|
|
|
@@ -380,18 +429,16 @@ class Gaggle(Glitchling):
|
|
|
380
429
|
Args:
|
|
381
430
|
glitchlings: Glitchlings to orchestrate.
|
|
382
431
|
seed: Master seed used to derive per-glitchling seeds.
|
|
383
|
-
"""
|
|
384
432
|
|
|
385
|
-
|
|
433
|
+
"""
|
|
434
|
+
super().__init__("Gaggle", self._corrupt_text, AttackWave.DOCUMENT, seed=seed)
|
|
386
435
|
self._clones_by_index: list[Glitchling] = []
|
|
387
436
|
for idx, glitchling in enumerate(glitchlings):
|
|
388
437
|
clone = glitchling.clone()
|
|
389
438
|
setattr(clone, "_gaggle_index", idx)
|
|
390
439
|
self._clones_by_index.append(clone)
|
|
391
440
|
|
|
392
|
-
self.glitchlings: dict[AttackWave, list[Glitchling]] = {
|
|
393
|
-
level: [] for level in AttackWave
|
|
394
|
-
}
|
|
441
|
+
self.glitchlings: dict[AttackWave, list[Glitchling]] = {level: [] for level in AttackWave}
|
|
395
442
|
self.apply_order: list[Glitchling] = []
|
|
396
443
|
self._plan: list[tuple[int, int]] = []
|
|
397
444
|
self.sort_glitchlings()
|
|
@@ -399,6 +446,7 @@ class Gaggle(Glitchling):
|
|
|
399
446
|
@staticmethod
|
|
400
447
|
def derive_seed(master_seed: int, glitchling_name: str, index: int) -> int:
|
|
401
448
|
"""Derive a deterministic seed for a glitchling based on the master seed."""
|
|
449
|
+
|
|
402
450
|
def _int_to_bytes(value: int) -> bytes:
|
|
403
451
|
if value == 0:
|
|
404
452
|
return b"\x00"
|
|
@@ -425,8 +473,7 @@ class Gaggle(Glitchling):
|
|
|
425
473
|
|
|
426
474
|
def sort_glitchlings(self) -> None:
|
|
427
475
|
"""Sort glitchlings by wave then order to produce application order."""
|
|
428
|
-
|
|
429
|
-
plan = _plan_glitchling_sequence(self._clones_by_index, self.seed)
|
|
476
|
+
plan = plan_glitchlings(self._clones_by_index, self.seed)
|
|
430
477
|
self._plan = plan
|
|
431
478
|
|
|
432
479
|
self.glitchlings = {level: [] for level in AttackWave}
|
|
@@ -451,14 +498,12 @@ class Gaggle(Glitchling):
|
|
|
451
498
|
@staticmethod
|
|
452
499
|
def rust_pipeline_supported() -> bool:
|
|
453
500
|
"""Return ``True`` when the compiled Rust pipeline is importable."""
|
|
454
|
-
|
|
455
|
-
return _compose_glitchlings_rust is not None
|
|
501
|
+
return is_rust_pipeline_supported()
|
|
456
502
|
|
|
457
503
|
@staticmethod
|
|
458
504
|
def rust_pipeline_enabled() -> bool:
|
|
459
505
|
"""Return ``True`` when the Rust pipeline is available and not explicitly disabled."""
|
|
460
|
-
|
|
461
|
-
return Gaggle.rust_pipeline_supported() and _pipeline_feature_flag_enabled()
|
|
506
|
+
return is_rust_pipeline_enabled()
|
|
462
507
|
|
|
463
508
|
def _pipeline_descriptors(self) -> list[dict[str, Any]] | None:
|
|
464
509
|
if not self.rust_pipeline_enabled():
|
|
@@ -488,18 +533,38 @@ class Gaggle(Glitchling):
|
|
|
488
533
|
|
|
489
534
|
return descriptors
|
|
490
535
|
|
|
491
|
-
def
|
|
492
|
-
"""Apply each glitchling to
|
|
493
|
-
|
|
536
|
+
def _corrupt_text(self, text: str) -> str:
|
|
537
|
+
"""Apply each glitchling to string input sequentially."""
|
|
494
538
|
master_seed = self.seed
|
|
495
539
|
descriptors = self._pipeline_descriptors()
|
|
496
540
|
if master_seed is not None and descriptors is not None:
|
|
497
541
|
try:
|
|
498
|
-
return _compose_glitchlings_rust(text, descriptors, master_seed)
|
|
542
|
+
return cast(str, _compose_glitchlings_rust(text, descriptors, master_seed))
|
|
499
543
|
except Exception: # pragma: no cover - fall back to Python execution
|
|
500
544
|
log.debug("Rust pipeline failed; falling back", exc_info=True)
|
|
501
545
|
|
|
502
546
|
corrupted = text
|
|
503
547
|
for glitchling in self.apply_order:
|
|
504
|
-
|
|
548
|
+
next_value = glitchling.corrupt(corrupted)
|
|
549
|
+
if not isinstance(next_value, str):
|
|
550
|
+
message = "Glitchling pipeline produced non-string output for string input"
|
|
551
|
+
raise TypeError(message)
|
|
552
|
+
corrupted = next_value
|
|
553
|
+
|
|
505
554
|
return corrupted
|
|
555
|
+
|
|
556
|
+
def corrupt(self, text: str | Transcript) -> str | Transcript:
|
|
557
|
+
"""Apply each glitchling to the provided text sequentially."""
|
|
558
|
+
if isinstance(text, str):
|
|
559
|
+
return self._corrupt_text(text)
|
|
560
|
+
|
|
561
|
+
if _is_transcript(text):
|
|
562
|
+
transcript: Transcript = [dict(turn) for turn in text]
|
|
563
|
+
if transcript and "content" in transcript[-1]:
|
|
564
|
+
content = transcript[-1]["content"]
|
|
565
|
+
if isinstance(content, str):
|
|
566
|
+
transcript[-1]["content"] = self._corrupt_text(content)
|
|
567
|
+
return transcript
|
|
568
|
+
|
|
569
|
+
message = f"Unsupported text type for Gaggle corruption: {type(text)!r}"
|
|
570
|
+
raise TypeError(message)
|
glitchlings/zoo/jargoyle.py
CHANGED
|
@@ -2,18 +2,25 @@ import random
|
|
|
2
2
|
import re
|
|
3
3
|
from collections.abc import Iterable
|
|
4
4
|
from dataclasses import dataclass
|
|
5
|
+
from types import ModuleType
|
|
5
6
|
from typing import Any, Literal, cast
|
|
6
7
|
|
|
7
8
|
from glitchlings.lexicon import Lexicon, get_default_lexicon
|
|
8
9
|
|
|
10
|
+
from ._rate import resolve_rate
|
|
11
|
+
from .core import AttackWave, Glitchling
|
|
12
|
+
|
|
13
|
+
_wordnet_module: ModuleType | None
|
|
14
|
+
|
|
9
15
|
try: # pragma: no cover - optional WordNet dependency
|
|
10
|
-
|
|
11
|
-
WordNetLexicon,
|
|
12
|
-
dependencies_available as _lexicon_dependencies_available,
|
|
13
|
-
ensure_wordnet as _lexicon_ensure_wordnet,
|
|
14
|
-
)
|
|
16
|
+
import glitchlings.lexicon.wordnet as _wordnet_module
|
|
15
17
|
except Exception: # pragma: no cover - triggered when nltk unavailable
|
|
16
|
-
|
|
18
|
+
_wordnet_module = None
|
|
19
|
+
|
|
20
|
+
_wordnet_runtime: ModuleType | None = _wordnet_module
|
|
21
|
+
|
|
22
|
+
WordNetLexicon: type[Lexicon] | None
|
|
23
|
+
if _wordnet_runtime is None:
|
|
17
24
|
|
|
18
25
|
def _lexicon_dependencies_available() -> bool:
|
|
19
26
|
return False
|
|
@@ -24,16 +31,18 @@ except Exception: # pragma: no cover - triggered when nltk unavailable
|
|
|
24
31
|
"and download its WordNet corpus manually if you need legacy synonyms."
|
|
25
32
|
)
|
|
26
33
|
|
|
34
|
+
WordNetLexicon = None
|
|
35
|
+
else:
|
|
36
|
+
WordNetLexicon = cast(type[Lexicon], _wordnet_runtime.WordNetLexicon)
|
|
37
|
+
_lexicon_dependencies_available = _wordnet_runtime.dependencies_available
|
|
38
|
+
_lexicon_ensure_wordnet = _wordnet_runtime.ensure_wordnet
|
|
27
39
|
|
|
28
|
-
from ._rate import resolve_rate
|
|
29
|
-
from .core import AttackWave, Glitchling
|
|
30
40
|
|
|
31
41
|
ensure_wordnet = _lexicon_ensure_wordnet
|
|
32
42
|
|
|
33
43
|
|
|
34
44
|
def dependencies_available() -> bool:
|
|
35
45
|
"""Return ``True`` when a synonym backend is accessible."""
|
|
36
|
-
|
|
37
46
|
if _lexicon_dependencies_available():
|
|
38
47
|
return True
|
|
39
48
|
|
|
@@ -58,7 +67,6 @@ _VALID_POS: tuple[PartOfSpeech, ...] = ("n", "v", "a", "r")
|
|
|
58
67
|
|
|
59
68
|
def _split_token(token: str) -> tuple[str, str, str]:
|
|
60
69
|
"""Split a token into leading punctuation, core word, and trailing punctuation."""
|
|
61
|
-
|
|
62
70
|
match = re.match(r"^(\W*)(.*?)(\W*)$", token)
|
|
63
71
|
if not match:
|
|
64
72
|
return "", token, ""
|
|
@@ -70,23 +78,18 @@ def _normalize_parts_of_speech(
|
|
|
70
78
|
part_of_speech: PartOfSpeechInput,
|
|
71
79
|
) -> NormalizedPartsOfSpeech:
|
|
72
80
|
"""Coerce user input into a tuple of valid WordNet POS tags."""
|
|
73
|
-
|
|
74
81
|
if isinstance(part_of_speech, str):
|
|
75
82
|
lowered = part_of_speech.lower()
|
|
76
83
|
if lowered == "any":
|
|
77
84
|
return _VALID_POS
|
|
78
85
|
if lowered not in _VALID_POS:
|
|
79
|
-
raise ValueError(
|
|
80
|
-
"part_of_speech must be one of 'n', 'v', 'a', 'r', or 'any'"
|
|
81
|
-
)
|
|
86
|
+
raise ValueError("part_of_speech must be one of 'n', 'v', 'a', 'r', or 'any'")
|
|
82
87
|
return (cast(PartOfSpeech, lowered),)
|
|
83
88
|
|
|
84
89
|
normalized: list[PartOfSpeech] = []
|
|
85
90
|
for pos in part_of_speech:
|
|
86
91
|
if pos not in _VALID_POS:
|
|
87
|
-
raise ValueError(
|
|
88
|
-
"part_of_speech entries must be one of 'n', 'v', 'a', or 'r'"
|
|
89
|
-
)
|
|
92
|
+
raise ValueError("part_of_speech entries must be one of 'n', 'v', 'a', or 'r'")
|
|
90
93
|
if pos not in normalized:
|
|
91
94
|
normalized.append(pos)
|
|
92
95
|
if not normalized:
|
|
@@ -118,6 +121,7 @@ def substitute_random_synonyms(
|
|
|
118
121
|
"""Replace words with random lexicon-driven synonyms.
|
|
119
122
|
|
|
120
123
|
Parameters
|
|
124
|
+
----------
|
|
121
125
|
- text: Input text.
|
|
122
126
|
- rate: Max proportion of candidate words to replace (default 0.01).
|
|
123
127
|
- part_of_speech: WordNet POS tag(s) to target. Accepts "n", "v", "a", "r",
|
|
@@ -134,6 +138,7 @@ def substitute_random_synonyms(
|
|
|
134
138
|
- Replacement positions chosen via rng.sample.
|
|
135
139
|
- Synonyms sourced through the lexicon; the default backend derives
|
|
136
140
|
deterministic subsets per word and part-of-speech using the active seed.
|
|
141
|
+
|
|
137
142
|
"""
|
|
138
143
|
effective_rate = resolve_rate(
|
|
139
144
|
rate=rate,
|
|
@@ -168,38 +173,40 @@ def substitute_random_synonyms(
|
|
|
168
173
|
# Split but keep whitespace separators so we can rebuild easily
|
|
169
174
|
tokens = re.split(r"(\s+)", text)
|
|
170
175
|
|
|
171
|
-
# Collect
|
|
176
|
+
# Collect candidate word indices (even positions are words because separators are kept)
|
|
172
177
|
candidate_indices: list[int] = []
|
|
173
178
|
candidate_metadata: dict[int, CandidateInfo] = {}
|
|
174
179
|
for idx, tok in enumerate(tokens):
|
|
175
|
-
if idx % 2
|
|
176
|
-
|
|
177
|
-
if not core_word:
|
|
178
|
-
continue
|
|
179
|
-
|
|
180
|
-
chosen_pos: str | None = None
|
|
181
|
-
synonyms: list[str] = []
|
|
180
|
+
if idx % 2 != 0 or not tok or tok.isspace():
|
|
181
|
+
continue
|
|
182
182
|
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
synonyms = active_lexicon.get_synonyms(core_word, pos=pos)
|
|
187
|
-
if synonyms:
|
|
188
|
-
chosen_pos = pos
|
|
189
|
-
break
|
|
183
|
+
prefix, core_word, suffix = _split_token(tok)
|
|
184
|
+
if not core_word:
|
|
185
|
+
continue
|
|
190
186
|
|
|
191
|
-
|
|
192
|
-
|
|
187
|
+
chosen_pos: str | None = None
|
|
188
|
+
synonyms: list[str] = []
|
|
193
189
|
|
|
190
|
+
for tag in target_pos:
|
|
191
|
+
if not active_lexicon.supports_pos(tag):
|
|
192
|
+
continue
|
|
193
|
+
synonyms = active_lexicon.get_synonyms(core_word, pos=tag)
|
|
194
194
|
if synonyms:
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
195
|
+
chosen_pos = tag
|
|
196
|
+
break
|
|
197
|
+
|
|
198
|
+
if not synonyms and active_lexicon.supports_pos(None):
|
|
199
|
+
synonyms = active_lexicon.get_synonyms(core_word, pos=None)
|
|
200
|
+
|
|
201
|
+
if synonyms:
|
|
202
|
+
candidate_indices.append(idx)
|
|
203
|
+
candidate_metadata[idx] = CandidateInfo(
|
|
204
|
+
prefix=prefix,
|
|
205
|
+
core_word=core_word,
|
|
206
|
+
suffix=suffix,
|
|
207
|
+
part_of_speech=chosen_pos,
|
|
208
|
+
synonyms=synonyms,
|
|
209
|
+
)
|
|
203
210
|
|
|
204
211
|
if not candidate_indices:
|
|
205
212
|
return text
|
|
@@ -296,9 +303,7 @@ class Jargoyle(Glitchling):
|
|
|
296
303
|
current_lexicon.reseed(self.seed)
|
|
297
304
|
else:
|
|
298
305
|
if hasattr(self, "_external_lexicon_original_seed"):
|
|
299
|
-
original_seed = getattr(
|
|
300
|
-
self, "_external_lexicon_original_seed", None
|
|
301
|
-
)
|
|
306
|
+
original_seed = getattr(self, "_external_lexicon_original_seed", None)
|
|
302
307
|
current_lexicon.reseed(original_seed)
|
|
303
308
|
elif canonical == "lexicon" and isinstance(value, Lexicon):
|
|
304
309
|
if getattr(self, "_initializing", False):
|
glitchlings/zoo/mim1c.py
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
from collections.abc import Collection
|
|
2
1
|
import random
|
|
2
|
+
from collections.abc import Collection
|
|
3
3
|
from typing import Literal
|
|
4
4
|
|
|
5
5
|
from confusable_homoglyphs import confusables
|
|
6
6
|
|
|
7
|
-
from .core import AttackOrder, AttackWave, Glitchling
|
|
8
7
|
from ._rate import resolve_rate
|
|
8
|
+
from .core import AttackOrder, AttackWave, Glitchling
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
def swap_homoglyphs(
|
|
@@ -21,16 +21,21 @@ def swap_homoglyphs(
|
|
|
21
21
|
"""Replace characters with visually confusable homoglyphs.
|
|
22
22
|
|
|
23
23
|
Parameters
|
|
24
|
+
----------
|
|
24
25
|
- text: Input text.
|
|
25
26
|
- rate: Max proportion of eligible characters to replace (default 0.02).
|
|
26
|
-
- classes: Restrict replacements to these Unicode script classes (default
|
|
27
|
+
- classes: Restrict replacements to these Unicode script classes (default
|
|
28
|
+
["LATIN", "GREEK", "CYRILLIC"]). Use "all" to allow any.
|
|
27
29
|
- banned_characters: Characters that must never appear as replacements.
|
|
28
30
|
- seed: Optional seed if `rng` not provided.
|
|
29
31
|
- rng: Optional RNG; overrides seed.
|
|
30
32
|
|
|
31
33
|
Notes
|
|
32
|
-
|
|
34
|
+
-----
|
|
35
|
+
- Only replaces characters present in ``confusables.confusables_data`` with
|
|
36
|
+
single-codepoint alternatives.
|
|
33
37
|
- Maintains determinism by shuffling candidates and sampling via the provided RNG.
|
|
38
|
+
|
|
34
39
|
"""
|
|
35
40
|
effective_rate = resolve_rate(
|
|
36
41
|
rate=rate,
|
|
@@ -46,9 +51,7 @@ def swap_homoglyphs(
|
|
|
46
51
|
classes = ["LATIN", "GREEK", "CYRILLIC"]
|
|
47
52
|
|
|
48
53
|
target_chars = [char for char in text if char.isalnum()]
|
|
49
|
-
confusable_chars = [
|
|
50
|
-
char for char in target_chars if char in confusables.confusables_data
|
|
51
|
-
]
|
|
54
|
+
confusable_chars = [char for char in target_chars if char in confusables.confusables_data]
|
|
52
55
|
clamped_rate = max(0.0, effective_rate)
|
|
53
56
|
num_replacements = int(len(confusable_chars) * clamped_rate)
|
|
54
57
|
done = 0
|
|
@@ -57,9 +60,7 @@ def swap_homoglyphs(
|
|
|
57
60
|
for char in confusable_chars:
|
|
58
61
|
if done >= num_replacements:
|
|
59
62
|
break
|
|
60
|
-
options = [
|
|
61
|
-
o["c"] for o in confusables.confusables_data[char] if len(o["c"]) == 1
|
|
62
|
-
]
|
|
63
|
+
options = [o["c"] for o in confusables.confusables_data[char] if len(o["c"]) == 1]
|
|
63
64
|
if classes != "all":
|
|
64
65
|
options = [opt for opt in options if confusables.alias(opt) in classes]
|
|
65
66
|
if banned_set:
|