glitchlings 0.4.1__cp311-cp311-macosx_11_0_universal2.whl → 0.4.2__cp311-cp311-macosx_11_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of glitchlings might be problematic. See the package's registry page for more details.

Files changed (39)
  1. glitchlings/__init__.py +26 -17
  2. glitchlings/__main__.py +0 -1
  3. glitchlings/_zoo_rust.cpython-311-darwin.so +0 -0
  4. glitchlings/compat.py +215 -0
  5. glitchlings/config.py +136 -19
  6. glitchlings/dlc/_shared.py +68 -0
  7. glitchlings/dlc/huggingface.py +26 -41
  8. glitchlings/dlc/prime.py +64 -101
  9. glitchlings/lexicon/__init__.py +8 -19
  10. glitchlings/lexicon/_cache.py +0 -7
  11. glitchlings/lexicon/graph.py +4 -12
  12. glitchlings/lexicon/metrics.py +1 -8
  13. glitchlings/lexicon/vector.py +15 -34
  14. glitchlings/lexicon/wordnet.py +31 -32
  15. glitchlings/main.py +9 -13
  16. glitchlings/util/__init__.py +18 -4
  17. glitchlings/util/adapters.py +27 -0
  18. glitchlings/zoo/__init__.py +21 -14
  19. glitchlings/zoo/_ocr_confusions.py +1 -3
  20. glitchlings/zoo/_rate.py +1 -4
  21. glitchlings/zoo/_sampling.py +0 -1
  22. glitchlings/zoo/_text_utils.py +1 -5
  23. glitchlings/zoo/adjax.py +0 -2
  24. glitchlings/zoo/core.py +114 -75
  25. glitchlings/zoo/jargoyle.py +9 -14
  26. glitchlings/zoo/mim1c.py +11 -10
  27. glitchlings/zoo/redactyl.py +5 -8
  28. glitchlings/zoo/reduple.py +3 -1
  29. glitchlings/zoo/rushmore.py +2 -8
  30. glitchlings/zoo/scannequin.py +5 -4
  31. glitchlings/zoo/typogre.py +3 -7
  32. glitchlings/zoo/zeedub.py +2 -2
  33. {glitchlings-0.4.1.dist-info → glitchlings-0.4.2.dist-info}/METADATA +67 -3
  34. glitchlings-0.4.2.dist-info/RECORD +42 -0
  35. glitchlings-0.4.1.dist-info/RECORD +0 -39
  36. {glitchlings-0.4.1.dist-info → glitchlings-0.4.2.dist-info}/WHEEL +0 -0
  37. {glitchlings-0.4.1.dist-info → glitchlings-0.4.2.dist-info}/entry_points.txt +0 -0
  38. {glitchlings-0.4.1.dist-info → glitchlings-0.4.2.dist-info}/licenses/LICENSE +0 -0
  39. {glitchlings-0.4.1.dist-info → glitchlings-0.4.2.dist-info}/top_level.txt +0 -0
glitchlings/zoo/core.py CHANGED
@@ -4,24 +4,18 @@ import inspect
4
4
  import logging
5
5
  import os
6
6
  import random
7
+ from collections.abc import Mapping, Sequence
7
8
  from enum import IntEnum, auto
8
9
  from hashlib import blake2s
9
- from typing import TYPE_CHECKING, Any, Callable, Protocol
10
+ from typing import TYPE_CHECKING, Any, Callable, Protocol, TypedDict, Union
10
11
 
11
- _datasets_error: ModuleNotFoundError | None = None
12
- try: # pragma: no cover - optional dependency
13
- from datasets import Dataset as _DatasetsDataset
14
- except ModuleNotFoundError as error: # pragma: no cover - optional dependency
15
- _DatasetsDataset = None # type: ignore[assignment]
16
- _datasets_error = error
17
- else:
18
- _datasets_error = None
12
+ from ..compat import get_datasets_dataset, require_datasets
13
+
14
+ _DatasetsDataset = get_datasets_dataset()
19
15
 
20
16
  try: # pragma: no cover - optional dependency
21
- from glitchlings._zoo_rust import (
22
- compose_glitchlings as _compose_glitchlings_rust,
23
- plan_glitchlings as _plan_glitchlings_rust,
24
- )
17
+ from glitchlings._zoo_rust import compose_glitchlings as _compose_glitchlings_rust
18
+ from glitchlings._zoo_rust import plan_glitchlings as _plan_glitchlings_rust
25
19
  except ImportError: # pragma: no cover - compiled extension not present
26
20
  _compose_glitchlings_rust = None
27
21
  _plan_glitchlings_rust = None
@@ -35,9 +29,17 @@ _PIPELINE_ENABLE_VALUES = {"1", "true", "yes", "on"}
35
29
  _PIPELINE_DISABLE_VALUES = {"0", "false", "no", "off"}
36
30
 
37
31
 
38
- def _pipeline_feature_flag_enabled() -> bool:
39
- """Return ``True`` when the environment does not explicitly disable the Rust pipeline."""
32
+ class PlanSpecification(TypedDict):
33
+ name: str
34
+ scope: int
35
+ order: int
40
36
 
37
+
38
+ PlanEntry = Union["Glitchling", Mapping[str, Any]]
39
+
40
+
41
+ def pipeline_feature_flag_enabled() -> bool:
42
+ """Return ``True`` when the environment does not explicitly disable the Rust pipeline."""
41
43
  value = os.environ.get(_PIPELINE_FEATURE_FLAG_ENV)
42
44
  if value is None:
43
45
  return True
@@ -51,12 +53,62 @@ def _pipeline_feature_flag_enabled() -> bool:
51
53
 
52
54
  return True
53
55
 
56
+
57
+ def _pipeline_feature_flag_enabled() -> bool:
58
+ """Compatibility shim for legacy callers."""
59
+ return pipeline_feature_flag_enabled()
60
+
61
+
62
+ def is_rust_pipeline_supported() -> bool:
63
+ """Return ``True`` when the optional Rust extension is importable."""
64
+ return _compose_glitchlings_rust is not None
65
+
66
+
67
+ def is_rust_pipeline_enabled() -> bool:
68
+ """Return ``True`` when the Rust pipeline is available and not explicitly disabled."""
69
+ return is_rust_pipeline_supported() and pipeline_feature_flag_enabled()
70
+
71
+
72
+ def _spec_from_glitchling(glitchling: "Glitchling") -> PlanSpecification:
73
+ """Create a plan specification mapping from a glitchling instance."""
74
+ return {
75
+ "name": glitchling.name,
76
+ "scope": int(glitchling.level),
77
+ "order": int(glitchling.order),
78
+ }
79
+
80
+
81
+ def _normalize_plan_entry(entry: PlanEntry) -> PlanSpecification:
82
+ """Convert a plan entry (glitchling or mapping) into a normalized specification."""
83
+ if isinstance(entry, Glitchling):
84
+ return _spec_from_glitchling(entry)
85
+
86
+ if not isinstance(entry, Mapping):
87
+ message = "plan_glitchlings expects Glitchling instances or mapping specifications"
88
+ raise TypeError(message)
89
+
90
+ try:
91
+ name = str(entry["name"])
92
+ scope_value = int(entry["scope"])
93
+ order_value = int(entry["order"])
94
+ except KeyError as exc: # pragma: no cover - defensive guard
95
+ raise ValueError(f"Plan specification missing required field: {exc.args[0]}") from exc
96
+ except (TypeError, ValueError) as exc:
97
+ raise ValueError("Plan specification fields must be coercible to integers") from exc
98
+
99
+ return {"name": name, "scope": scope_value, "order": order_value}
100
+
101
+
102
+ def _normalize_plan_entries(entries: Sequence[PlanEntry]) -> list[PlanSpecification]:
103
+ """Normalize a collection of orchestration plan entries."""
104
+ return [_normalize_plan_entry(entry) for entry in entries]
105
+
106
+
54
107
  def _plan_glitchlings_python(
55
- specs: list[dict[str, Any]],
108
+ specs: Sequence[Mapping[str, Any]],
56
109
  master_seed: int,
57
110
  ) -> list[tuple[int, int]]:
58
111
  """Pure-Python fallback for orchestrating glitchlings in deterministic order."""
59
-
60
112
  master_seed_int = int(master_seed)
61
113
  planned: list[tuple[int, int, int, int, str]] = []
62
114
  for index, spec in enumerate(specs):
@@ -71,11 +123,10 @@ def _plan_glitchlings_python(
71
123
 
72
124
 
73
125
  def _plan_glitchlings_with_rust(
74
- specs: list[dict[str, Any]],
126
+ specs: Sequence[Mapping[str, Any]],
75
127
  master_seed: int,
76
128
  ) -> list[tuple[int, int]] | None:
77
129
  """Attempt to obtain the orchestration plan from the compiled Rust module."""
78
-
79
130
  if _plan_glitchlings_rust is None:
80
131
  return None
81
132
 
@@ -88,38 +139,51 @@ def _plan_glitchlings_with_rust(
88
139
  return [(int(index), int(seed)) for index, seed in plan]
89
140
 
90
141
 
91
- def _plan_glitchling_specs(
92
- specs: list[dict[str, Any]],
142
+ def _resolve_orchestration_plan(
143
+ specs: Sequence[PlanSpecification],
144
+ master_seed: int,
145
+ prefer_rust: bool,
146
+ ) -> list[tuple[int, int]]:
147
+ """Dispatch to the Rust planner when available, otherwise fall back to Python."""
148
+ if prefer_rust:
149
+ plan = _plan_glitchlings_with_rust(list(specs), master_seed)
150
+ if plan is not None:
151
+ return plan
152
+
153
+ return _plan_glitchlings_python(list(specs), master_seed)
154
+
155
+
156
+ def plan_glitchling_specs(
157
+ specs: Sequence[Mapping[str, Any]],
93
158
  master_seed: int | None,
159
+ *,
160
+ prefer_rust: bool = True,
94
161
  ) -> list[tuple[int, int]]:
95
162
  """Resolve orchestration order and seeds from glitchling specifications."""
96
-
97
163
  if master_seed is None:
98
164
  message = "Gaggle orchestration requires a master seed"
99
165
  raise ValueError(message)
100
166
 
167
+ normalized_specs = [_normalize_plan_entry(spec) for spec in specs]
101
168
  master_seed_int = int(master_seed)
102
- plan = _plan_glitchlings_with_rust(specs, master_seed_int)
103
- if plan is not None:
104
- return plan
105
-
106
- return _plan_glitchlings_python(specs, master_seed_int)
169
+ return _resolve_orchestration_plan(normalized_specs, master_seed_int, prefer_rust)
107
170
 
108
171
 
109
- def _plan_glitchling_sequence(
110
- glitchlings: list["Glitchling"], master_seed: int | None
172
+ def plan_glitchlings(
173
+ entries: Sequence[PlanEntry],
174
+ master_seed: int | None,
175
+ *,
176
+ prefer_rust: bool = True,
111
177
  ) -> list[tuple[int, int]]:
112
- """Derive orchestration plan for concrete glitchling instances."""
113
-
114
- specs = [
115
- {
116
- "name": glitchling.name,
117
- "scope": int(glitchling.level),
118
- "order": int(glitchling.order),
119
- }
120
- for glitchling in glitchlings
121
- ]
122
- return _plan_glitchling_specs(specs, master_seed)
178
+ """Normalize glitchling instances or specs and compute an orchestration plan."""
179
+ if master_seed is None:
180
+ message = "Gaggle orchestration requires a master seed"
181
+ raise ValueError(message)
182
+
183
+ normalized_specs = _normalize_plan_entries(entries)
184
+ master_seed_int = int(master_seed)
185
+ return _resolve_orchestration_plan(normalized_specs, master_seed_int, prefer_rust)
186
+
123
187
 
124
188
  if TYPE_CHECKING: # pragma: no cover - typing only
125
189
  from datasets import Dataset # type: ignore
@@ -140,7 +204,6 @@ def _is_transcript(
140
204
  require_all_content: bool = False,
141
205
  ) -> bool:
142
206
  """Return `True` when `value` appears to be a chat transcript."""
143
-
144
207
  if not isinstance(value, list):
145
208
  return False
146
209
 
@@ -209,8 +272,8 @@ class Glitchling:
209
272
  order: Relative ordering within the same scope.
210
273
  seed: Optional seed for deterministic random behaviour.
211
274
  **kwargs: Additional parameters forwarded to the corruption callable.
212
- """
213
275
 
276
+ """
214
277
  # Each Glitchling maintains its own RNG for deterministic yet isolated behavior.
215
278
  # If no seed is supplied, we fall back to Python's default entropy.
216
279
  self.seed = seed
@@ -228,7 +291,6 @@ class Glitchling:
228
291
 
229
292
  def set_param(self, key: str, value: Any) -> None:
230
293
  """Persist a parameter for use by the corruption callable."""
231
-
232
294
  aliases = getattr(self, "_param_aliases", {})
233
295
  canonical = aliases.get(key, key)
234
296
 
@@ -250,7 +312,6 @@ class Glitchling:
250
312
 
251
313
  def pipeline_operation(self) -> dict[str, Any] | None:
252
314
  """Return the Rust pipeline operation descriptor for this glitchling."""
253
-
254
315
  factory = self._pipeline_descriptor_factory
255
316
  if factory is None:
256
317
  return None
@@ -259,15 +320,11 @@ class Glitchling:
259
320
 
260
321
  def _corruption_expects_rng(self) -> bool:
261
322
  """Return `True` when the corruption function accepts an rng keyword."""
262
-
263
323
  cached_callable = self._cached_rng_callable
264
324
  cached_expectation = self._cached_rng_expectation
265
325
  corruption_function = self.corruption_function
266
326
 
267
- if (
268
- cached_callable is corruption_function
269
- and cached_expectation is not None
270
- ):
327
+ if cached_callable is corruption_function and cached_expectation is not None:
271
328
  return cached_expectation
272
329
 
273
330
  expects_rng = False
@@ -285,7 +342,6 @@ class Glitchling:
285
342
 
286
343
  def __corrupt(self, text: str, *args: Any, **kwargs: Any) -> str:
287
344
  """Execute the corruption callable, injecting the RNG when required."""
288
-
289
345
  # Pass rng to underlying corruption function if it expects it.
290
346
  expects_rng = self._corruption_expects_rng()
291
347
 
@@ -297,23 +353,17 @@ class Glitchling:
297
353
 
298
354
  def corrupt(self, text: str | list[dict[str, Any]]) -> str | list[dict[str, Any]]:
299
355
  """Apply the corruption function to text or conversational transcripts."""
300
-
301
356
  if _is_transcript(text):
302
357
  transcript = [dict(turn) for turn in text]
303
358
  if transcript:
304
- transcript[-1]["content"] = self.__corrupt(
305
- transcript[-1]["content"], **self.kwargs
306
- )
359
+ transcript[-1]["content"] = self.__corrupt(transcript[-1]["content"], **self.kwargs)
307
360
  return transcript
308
361
 
309
362
  return self.__corrupt(text, **self.kwargs)
310
363
 
311
364
  def corrupt_dataset(self, dataset: Dataset, columns: list[str]) -> Dataset:
312
365
  """Apply corruption lazily across dataset columns."""
313
-
314
- if _DatasetsDataset is None:
315
- message = "datasets is not installed"
316
- raise ModuleNotFoundError(message) from _datasets_error
366
+ require_datasets("datasets is not installed")
317
367
 
318
368
  def __corrupt_row(row: dict[str, Any]) -> dict[str, Any]:
319
369
  row = dict(row)
@@ -335,12 +385,10 @@ class Glitchling:
335
385
 
336
386
  def __call__(self, text: str, *args: Any, **kwds: Any) -> str | list[dict[str, Any]]:
337
387
  """Allow a glitchling to be invoked directly like a callable."""
338
-
339
388
  return self.corrupt(text, *args, **kwds)
340
389
 
341
390
  def reset_rng(self, seed: int | None = None) -> None:
342
391
  """Reset the glitchling's RNG to its initial seed."""
343
-
344
392
  if seed is not None:
345
393
  self.seed = seed
346
394
  if self.seed is not None:
@@ -348,7 +396,6 @@ class Glitchling:
348
396
 
349
397
  def clone(self, seed: int | None = None) -> "Glitchling":
350
398
  """Create a copy of this glitchling, optionally with a new seed."""
351
-
352
399
  cls = self.__class__
353
400
  filtered_kwargs = {k: v for k, v in self.kwargs.items() if k != "seed"}
354
401
  clone_seed = seed if seed is not None else self.seed
@@ -368,9 +415,6 @@ class Glitchling:
368
415
  return cls(**filtered_kwargs)
369
416
 
370
417
 
371
-
372
-
373
-
374
418
  class Gaggle(Glitchling):
375
419
  """A collection of glitchlings executed in a deterministic order."""
376
420
 
@@ -380,8 +424,8 @@ class Gaggle(Glitchling):
380
424
  Args:
381
425
  glitchlings: Glitchlings to orchestrate.
382
426
  seed: Master seed used to derive per-glitchling seeds.
383
- """
384
427
 
428
+ """
385
429
  super().__init__("Gaggle", self.corrupt, AttackWave.DOCUMENT, seed=seed)
386
430
  self._clones_by_index: list[Glitchling] = []
387
431
  for idx, glitchling in enumerate(glitchlings):
@@ -389,9 +433,7 @@ class Gaggle(Glitchling):
389
433
  setattr(clone, "_gaggle_index", idx)
390
434
  self._clones_by_index.append(clone)
391
435
 
392
- self.glitchlings: dict[AttackWave, list[Glitchling]] = {
393
- level: [] for level in AttackWave
394
- }
436
+ self.glitchlings: dict[AttackWave, list[Glitchling]] = {level: [] for level in AttackWave}
395
437
  self.apply_order: list[Glitchling] = []
396
438
  self._plan: list[tuple[int, int]] = []
397
439
  self.sort_glitchlings()
@@ -399,6 +441,7 @@ class Gaggle(Glitchling):
399
441
  @staticmethod
400
442
  def derive_seed(master_seed: int, glitchling_name: str, index: int) -> int:
401
443
  """Derive a deterministic seed for a glitchling based on the master seed."""
444
+
402
445
  def _int_to_bytes(value: int) -> bytes:
403
446
  if value == 0:
404
447
  return b"\x00"
@@ -425,8 +468,7 @@ class Gaggle(Glitchling):
425
468
 
426
469
  def sort_glitchlings(self) -> None:
427
470
  """Sort glitchlings by wave then order to produce application order."""
428
-
429
- plan = _plan_glitchling_sequence(self._clones_by_index, self.seed)
471
+ plan = plan_glitchlings(self._clones_by_index, self.seed)
430
472
  self._plan = plan
431
473
 
432
474
  self.glitchlings = {level: [] for level in AttackWave}
@@ -451,14 +493,12 @@ class Gaggle(Glitchling):
451
493
  @staticmethod
452
494
  def rust_pipeline_supported() -> bool:
453
495
  """Return ``True`` when the compiled Rust pipeline is importable."""
454
-
455
- return _compose_glitchlings_rust is not None
496
+ return is_rust_pipeline_supported()
456
497
 
457
498
  @staticmethod
458
499
  def rust_pipeline_enabled() -> bool:
459
500
  """Return ``True`` when the Rust pipeline is available and not explicitly disabled."""
460
-
461
- return Gaggle.rust_pipeline_supported() and _pipeline_feature_flag_enabled()
501
+ return is_rust_pipeline_enabled()
462
502
 
463
503
  def _pipeline_descriptors(self) -> list[dict[str, Any]] | None:
464
504
  if not self.rust_pipeline_enabled():
@@ -490,7 +530,6 @@ class Gaggle(Glitchling):
490
530
 
491
531
  def corrupt(self, text: str) -> str:
492
532
  """Apply each glitchling to the provided text sequentially."""
493
-
494
533
  master_seed = self.seed
495
534
  descriptors = self._pipeline_descriptors()
496
535
  if master_seed is not None and descriptors is not None:
glitchlings/zoo/jargoyle.py CHANGED
@@ -9,9 +9,11 @@ from glitchlings.lexicon import Lexicon, get_default_lexicon
9
9
  try: # pragma: no cover - optional WordNet dependency
10
10
  from glitchlings.lexicon.wordnet import (
11
11
  WordNetLexicon,
12
+ )
13
+ from glitchlings.lexicon.wordnet import (
12
14
  dependencies_available as _lexicon_dependencies_available,
13
- ensure_wordnet as _lexicon_ensure_wordnet,
14
15
  )
16
+ from glitchlings.lexicon.wordnet import ensure_wordnet as _lexicon_ensure_wordnet
15
17
  except Exception: # pragma: no cover - triggered when nltk unavailable
16
18
  WordNetLexicon = None # type: ignore[assignment]
17
19
 
@@ -33,7 +35,6 @@ ensure_wordnet = _lexicon_ensure_wordnet
33
35
 
34
36
  def dependencies_available() -> bool:
35
37
  """Return ``True`` when a synonym backend is accessible."""
36
-
37
38
  if _lexicon_dependencies_available():
38
39
  return True
39
40
 
@@ -58,7 +59,6 @@ _VALID_POS: tuple[PartOfSpeech, ...] = ("n", "v", "a", "r")
58
59
 
59
60
  def _split_token(token: str) -> tuple[str, str, str]:
60
61
  """Split a token into leading punctuation, core word, and trailing punctuation."""
61
-
62
62
  match = re.match(r"^(\W*)(.*?)(\W*)$", token)
63
63
  if not match:
64
64
  return "", token, ""
@@ -70,23 +70,18 @@ def _normalize_parts_of_speech(
70
70
  part_of_speech: PartOfSpeechInput,
71
71
  ) -> NormalizedPartsOfSpeech:
72
72
  """Coerce user input into a tuple of valid WordNet POS tags."""
73
-
74
73
  if isinstance(part_of_speech, str):
75
74
  lowered = part_of_speech.lower()
76
75
  if lowered == "any":
77
76
  return _VALID_POS
78
77
  if lowered not in _VALID_POS:
79
- raise ValueError(
80
- "part_of_speech must be one of 'n', 'v', 'a', 'r', or 'any'"
81
- )
78
+ raise ValueError("part_of_speech must be one of 'n', 'v', 'a', 'r', or 'any'")
82
79
  return (cast(PartOfSpeech, lowered),)
83
80
 
84
81
  normalized: list[PartOfSpeech] = []
85
82
  for pos in part_of_speech:
86
83
  if pos not in _VALID_POS:
87
- raise ValueError(
88
- "part_of_speech entries must be one of 'n', 'v', 'a', or 'r'"
89
- )
84
+ raise ValueError("part_of_speech entries must be one of 'n', 'v', 'a', or 'r'")
90
85
  if pos not in normalized:
91
86
  normalized.append(pos)
92
87
  if not normalized:
@@ -118,6 +113,7 @@ def substitute_random_synonyms(
118
113
  """Replace words with random lexicon-driven synonyms.
119
114
 
120
115
  Parameters
116
+ ----------
121
117
  - text: Input text.
122
118
  - rate: Max proportion of candidate words to replace (default 0.01).
123
119
  - part_of_speech: WordNet POS tag(s) to target. Accepts "n", "v", "a", "r",
@@ -134,6 +130,7 @@ def substitute_random_synonyms(
134
130
  - Replacement positions chosen via rng.sample.
135
131
  - Synonyms sourced through the lexicon; the default backend derives
136
132
  deterministic subsets per word and part-of-speech using the active seed.
133
+
137
134
  """
138
135
  effective_rate = resolve_rate(
139
136
  rate=rate,
@@ -168,7 +165,7 @@ def substitute_random_synonyms(
168
165
  # Split but keep whitespace separators so we can rebuild easily
169
166
  tokens = re.split(r"(\s+)", text)
170
167
 
171
- # Collect indices of candidate tokens (even positions 0,2,.. are words given our split design)
168
+ # Collect candidate word indices (even positions are words because separators are kept)
172
169
  candidate_indices: list[int] = []
173
170
  candidate_metadata: dict[int, CandidateInfo] = {}
174
171
  for idx, tok in enumerate(tokens):
@@ -296,9 +293,7 @@ class Jargoyle(Glitchling):
296
293
  current_lexicon.reseed(self.seed)
297
294
  else:
298
295
  if hasattr(self, "_external_lexicon_original_seed"):
299
- original_seed = getattr(
300
- self, "_external_lexicon_original_seed", None
301
- )
296
+ original_seed = getattr(self, "_external_lexicon_original_seed", None)
302
297
  current_lexicon.reseed(original_seed)
303
298
  elif canonical == "lexicon" and isinstance(value, Lexicon):
304
299
  if getattr(self, "_initializing", False):
glitchlings/zoo/mim1c.py CHANGED
@@ -1,11 +1,11 @@
1
- from collections.abc import Collection
2
1
  import random
2
+ from collections.abc import Collection
3
3
  from typing import Literal
4
4
 
5
5
  from confusable_homoglyphs import confusables
6
6
 
7
- from .core import AttackOrder, AttackWave, Glitchling
8
7
  from ._rate import resolve_rate
8
+ from .core import AttackOrder, AttackWave, Glitchling
9
9
 
10
10
 
11
11
  def swap_homoglyphs(
@@ -21,16 +21,21 @@ def swap_homoglyphs(
21
21
  """Replace characters with visually confusable homoglyphs.
22
22
 
23
23
  Parameters
24
+ ----------
24
25
  - text: Input text.
25
26
  - rate: Max proportion of eligible characters to replace (default 0.02).
26
- - classes: Restrict replacements to these Unicode script classes (default ["LATIN","GREEK","CYRILLIC"]). Use "all" to allow any.
27
+ - classes: Restrict replacements to these Unicode script classes (default
28
+ ["LATIN", "GREEK", "CYRILLIC"]). Use "all" to allow any.
27
29
  - banned_characters: Characters that must never appear as replacements.
28
30
  - seed: Optional seed if `rng` not provided.
29
31
  - rng: Optional RNG; overrides seed.
30
32
 
31
33
  Notes
32
- - Only replaces characters present in confusables.confusables_data with single-codepoint alternatives.
34
+ -----
35
+ - Only replaces characters present in ``confusables.confusables_data`` with
36
+ single-codepoint alternatives.
33
37
  - Maintains determinism by shuffling candidates and sampling via the provided RNG.
38
+
34
39
  """
35
40
  effective_rate = resolve_rate(
36
41
  rate=rate,
@@ -46,9 +51,7 @@ def swap_homoglyphs(
46
51
  classes = ["LATIN", "GREEK", "CYRILLIC"]
47
52
 
48
53
  target_chars = [char for char in text if char.isalnum()]
49
- confusable_chars = [
50
- char for char in target_chars if char in confusables.confusables_data
51
- ]
54
+ confusable_chars = [char for char in target_chars if char in confusables.confusables_data]
52
55
  clamped_rate = max(0.0, effective_rate)
53
56
  num_replacements = int(len(confusable_chars) * clamped_rate)
54
57
  done = 0
@@ -57,9 +60,7 @@ def swap_homoglyphs(
57
60
  for char in confusable_chars:
58
61
  if done >= num_replacements:
59
62
  break
60
- options = [
61
- o["c"] for o in confusables.confusables_data[char] if len(o["c"]) == 1
62
- ]
63
+ options = [o["c"] for o in confusables.confusables_data[char] if len(o["c"]) == 1]
63
64
  if classes != "all":
64
65
  options = [opt for opt in options if confusables.alias(opt) in classes]
65
66
  if banned_set:
glitchlings/zoo/redactyl.py CHANGED
@@ -1,5 +1,5 @@
1
- import re
2
1
  import random
2
+ import re
3
3
  from typing import Any
4
4
 
5
5
  from ._rate import resolve_rate
@@ -32,24 +32,22 @@ def _python_redact_words(
32
32
  """Redact random words by replacing their characters.
33
33
 
34
34
  Parameters
35
+ ----------
35
36
  - text: Input text.
36
37
  - replacement_char: The character to use for redaction (default FULL_BLOCK).
37
38
  - rate: Max proportion of words to redact (default 0.05).
38
39
  - merge_adjacent: If True, merges adjacent redactions across intervening non-word chars.
39
40
  - rng: RNG used for sampling decisions.
40
41
  - unweighted: When True, sample words uniformly instead of by length.
42
+
41
43
  """
42
44
  tokens = split_preserving_whitespace(text)
43
45
  word_tokens = collect_word_tokens(tokens)
44
46
  if not word_tokens:
45
- raise ValueError(
46
- "Cannot redact words because the input text contains no redactable words."
47
- )
47
+ raise ValueError("Cannot redact words because the input text contains no redactable words.")
48
48
 
49
49
  population = [token.index for token in word_tokens]
50
- weights = [
51
- 1.0 if unweighted else float(token.core_length) for token in word_tokens
52
- ]
50
+ weights = [1.0 if unweighted else float(token.core_length) for token in word_tokens]
53
51
 
54
52
  clamped_rate = max(0.0, min(rate, 1.0))
55
53
  raw_quota = len(population) * clamped_rate
@@ -105,7 +103,6 @@ def redact_words(
105
103
  unweighted: bool = False,
106
104
  ) -> str:
107
105
  """Redact random words by replacing their characters."""
108
-
109
106
  effective_rate = resolve_rate(
110
107
  rate=rate,
111
108
  legacy_value=redaction_rate,
glitchlings/zoo/reduple.py CHANGED
@@ -21,14 +21,17 @@ def _python_reduplicate_words(
21
21
  """Randomly reduplicate words in the text.
22
22
 
23
23
  Parameters
24
+ ----------
24
25
  - text: Input text.
25
26
  - rate: Max proportion of words to reduplicate (default 0.05).
26
27
  - rng: RNG used for sampling decisions.
27
28
  - unweighted: When True, sample words uniformly instead of length-weighted.
28
29
 
29
30
  Notes
31
+ -----
30
32
  - Preserves spacing and punctuation by tokenizing with separators.
31
33
  - Deterministic when run with a fixed seed or via Gaggle.
34
+
32
35
  """
33
36
  tokens = split_preserving_whitespace(text)
34
37
  word_tokens = collect_word_tokens(tokens)
@@ -77,7 +80,6 @@ def reduplicate_words(
77
80
  Falls back to the Python implementation when the optional Rust
78
81
  extension is unavailable.
79
82
  """
80
-
81
83
  effective_rate = resolve_rate(
82
84
  rate=rate,
83
85
  legacy_value=reduplication_rate,
glitchlings/zoo/rushmore.py CHANGED
@@ -21,7 +21,6 @@ def _python_delete_random_words(
21
21
  unweighted: bool = False,
22
22
  ) -> str:
23
23
  """Delete random words from the input text while preserving whitespace."""
24
-
25
24
  effective_rate = max(rate, 0.0)
26
25
  if effective_rate <= 0.0:
27
26
  return text
@@ -37,15 +36,11 @@ def _python_delete_random_words(
37
36
  if not weighted_tokens:
38
37
  return text
39
38
 
40
- allowed_deletions = min(
41
- len(weighted_tokens), math.floor(len(weighted_tokens) * effective_rate)
42
- )
39
+ allowed_deletions = min(len(weighted_tokens), math.floor(len(weighted_tokens) * effective_rate))
43
40
  if allowed_deletions <= 0:
44
41
  return text
45
42
 
46
- mean_weight = sum(weight for _, weight, _ in weighted_tokens) / len(
47
- weighted_tokens
48
- )
43
+ mean_weight = sum(weight for _, weight, _ in weighted_tokens) / len(weighted_tokens)
49
44
 
50
45
  deletions = 0
51
46
  for index, weight, token in weighted_tokens:
@@ -88,7 +83,6 @@ def delete_random_words(
88
83
 
89
84
  Uses the optional Rust implementation when available.
90
85
  """
91
-
92
86
  effective_rate = resolve_rate(
93
87
  rate=rate,
94
88
  legacy_value=max_deletion_rate,
glitchlings/zoo/scannequin.py CHANGED
@@ -1,10 +1,10 @@
1
- import re
2
1
  import random
2
+ import re
3
3
  from typing import Any
4
4
 
5
5
  from ._ocr_confusions import load_confusion_table
6
- from .core import Glitchling, AttackWave, AttackOrder
7
6
  from ._rate import resolve_rate
7
+ from .core import AttackOrder, AttackWave, Glitchling
8
8
 
9
9
  try:
10
10
  from glitchlings._zoo_rust import ocr_artifacts as _ocr_artifacts_rust
@@ -21,17 +21,20 @@ def _python_ocr_artifacts(
21
21
  """Introduce OCR-like artifacts into text.
22
22
 
23
23
  Parameters
24
+ ----------
24
25
  - text: Input text to corrupt.
25
26
  - rate: Max proportion of eligible confusion matches to replace (default 0.02).
26
27
  - seed: Optional seed if `rng` not provided.
27
28
  - rng: Optional RNG; overrides seed.
28
29
 
29
30
  Notes
31
+ -----
30
32
  - Uses a curated set of common OCR confusions (rn↔m, cl↔d, O↔0, l/I/1, etc.).
31
33
  - Collects all non-overlapping candidate spans in reading order, then samples
32
34
  a subset deterministically with the provided RNG.
33
35
  - Replacements can change length (e.g., m→rn), so edits are applied from left
34
36
  to right using precomputed spans to avoid index drift.
37
+
35
38
  """
36
39
  if not text:
37
40
  return text
@@ -107,7 +110,6 @@ def ocr_artifacts(
107
110
 
108
111
  Prefers the Rust implementation when available.
109
112
  """
110
-
111
113
  if not text:
112
114
  return text
113
115
 
@@ -164,7 +166,6 @@ class Scannequin(Glitchling):
164
166
  return {"type": "ocr", "error_rate": float(rate)}
165
167
 
166
168
 
167
-
168
169
  scannequin = Scannequin()
169
170
 
170
171