glitchlings 0.2.4__cp310-cp310-macosx_11_0_universal2.whl → 0.2.6__cp310-cp310-macosx_11_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
glitchlings/__init__.py CHANGED
@@ -13,6 +13,8 @@ from .zoo import (
13
13
  rushmore,
14
14
  Scannequin,
15
15
  scannequin,
16
+ Zeedub,
17
+ zeedub,
16
18
  Glitchling,
17
19
  Gaggle,
18
20
  summon,
@@ -35,6 +37,8 @@ __all__ = [
35
37
  "rushmore",
36
38
  "Scannequin",
37
39
  "scannequin",
40
+ "Zeedub",
41
+ "zeedub",
38
42
  "summon",
39
43
  "Glitchling",
40
44
  "Gaggle",
Binary file
glitchlings/dlc/prime.py CHANGED
@@ -49,7 +49,24 @@ def _resolve_columns(dataset: Dataset, columns: Sequence[str] | None) -> list[st
49
49
  if candidate in available:
50
50
  return [candidate]
51
51
 
52
- sample = dataset[0] if len(dataset) else {}
52
+ try:
53
+ dataset_length = len(dataset) # type: ignore[arg-type]
54
+ except TypeError:
55
+ preview_rows: list[dict[str, Any]]
56
+ take_fn = getattr(dataset, "take", None)
57
+ if callable(take_fn):
58
+ preview_rows = list(take_fn(1))
59
+ else:
60
+ iterator = iter(dataset)
61
+ try:
62
+ first_row = next(iterator)
63
+ except StopIteration:
64
+ preview_rows = []
65
+ else:
66
+ preview_rows = [first_row]
67
+ sample = dict(preview_rows[0]) if preview_rows else {}
68
+ else:
69
+ sample = dataset[0] if dataset_length else {}
53
70
  inferred = [
54
71
  name
55
72
  for name in dataset.column_names
@@ -10,6 +10,7 @@ from .reduple import Reduple, reduple
10
10
  from .rushmore import Rushmore, rushmore
11
11
  from .redactyl import Redactyl, redactyl
12
12
  from .scannequin import Scannequin, scannequin
13
+ from .zeedub import Zeedub, zeedub
13
14
  from .core import Glitchling, Gaggle
14
15
 
15
16
  __all__ = [
@@ -27,6 +28,8 @@ __all__ = [
27
28
  "redactyl",
28
29
  "Scannequin",
29
30
  "scannequin",
31
+ "Zeedub",
32
+ "zeedub",
30
33
  "Glitchling",
31
34
  "Gaggle",
32
35
  "summon",
@@ -40,7 +43,7 @@ _HAS_JARGOYLE = _jargoyle_available()
40
43
  _BUILTIN_GLITCHLING_LIST: list[Glitchling] = [typogre, mim1c]
41
44
  if _HAS_JARGOYLE:
42
45
  _BUILTIN_GLITCHLING_LIST.append(jargoyle)
43
- _BUILTIN_GLITCHLING_LIST.extend([reduple, rushmore, redactyl, scannequin])
46
+ _BUILTIN_GLITCHLING_LIST.extend([reduple, rushmore, redactyl, scannequin, zeedub])
44
47
 
45
48
  BUILTIN_GLITCHLINGS: dict[str, Glitchling] = {
46
49
  glitchling.name.lower(): glitchling for glitchling in _BUILTIN_GLITCHLING_LIST
@@ -53,6 +56,7 @@ _BUILTIN_GLITCHLING_TYPES: dict[str, type[Glitchling]] = {
53
56
  rushmore.name.lower(): Rushmore,
54
57
  redactyl.name.lower(): Redactyl,
55
58
  scannequin.name.lower(): Scannequin,
59
+ zeedub.name.lower(): Zeedub,
56
60
  }
57
61
  if _HAS_JARGOYLE:
58
62
  _BUILTIN_GLITCHLING_TYPES[jargoyle.name.lower()] = Jargoyle
glitchlings/zoo/core.py CHANGED
@@ -27,17 +27,25 @@ log = logging.getLogger(__name__)
27
27
 
28
28
 
29
29
  _PIPELINE_FEATURE_FLAG_ENV = "GLITCHLINGS_RUST_PIPELINE"
30
+ _PIPELINE_ENABLE_VALUES = {"1", "true", "yes", "on"}
31
+ _PIPELINE_DISABLE_VALUES = {"0", "false", "no", "off"}
30
32
 
31
33
 
32
34
  def _pipeline_feature_flag_enabled() -> bool:
33
- """Return ``True`` when the environment explicitly opts into the Rust pipeline."""
35
+ """Return ``True`` when the environment does not explicitly disable the Rust pipeline."""
34
36
 
35
37
  value = os.environ.get(_PIPELINE_FEATURE_FLAG_ENV)
36
38
  if value is None:
37
- return False
39
+ return True
38
40
 
39
41
  normalized = value.strip().lower()
40
- return normalized in {"1", "true", "yes", "on"}
42
+ if normalized in _PIPELINE_DISABLE_VALUES:
43
+ return False
44
+
45
+ if normalized in _PIPELINE_ENABLE_VALUES:
46
+ return True
47
+
48
+ return True
41
49
 
42
50
  if TYPE_CHECKING: # pragma: no cover - typing only
43
51
  from datasets import Dataset # type: ignore
@@ -356,7 +364,7 @@ class Gaggle(Glitchling):
356
364
 
357
365
  @staticmethod
358
366
  def rust_pipeline_enabled() -> bool:
359
- """Return ``True`` when the Rust pipeline is available and opted in."""
367
+ """Return ``True`` when the Rust pipeline is available and not explicitly disabled."""
360
368
 
361
369
  return Gaggle.rust_pipeline_supported() and _pipeline_feature_flag_enabled()
362
370
 
@@ -14,6 +14,41 @@ except ImportError: # pragma: no cover - compiled extension not present
14
14
  _redact_words_rust = None
15
15
 
16
16
 
17
+ def _weighted_sample_without_replacement(
18
+ population: list[int],
19
+ weights: list[float],
20
+ *,
21
+ k: int,
22
+ rng: random.Random,
23
+ ) -> list[int]:
24
+ """Select `k` unique indices according to the given weights."""
25
+
26
+ selections: list[int] = []
27
+ items = list(zip(population, weights))
28
+ if k <= 0 or not items:
29
+ return selections
30
+ if k > len(items):
31
+ raise ValueError("Sample larger than population or is negative")
32
+
33
+ for _ in range(k):
34
+ total_weight = sum(weight for _, weight in items)
35
+ if total_weight <= 0:
36
+ chosen_index = rng.randrange(len(items))
37
+ else:
38
+ threshold = rng.random() * total_weight
39
+ cumulative = 0.0
40
+ chosen_index = len(items) - 1
41
+ for idx, (_, weight) in enumerate(items):
42
+ cumulative += weight
43
+ if cumulative >= threshold:
44
+ chosen_index = idx
45
+ break
46
+ value, _ = items.pop(chosen_index)
47
+ selections.append(value)
48
+
49
+ return selections
50
+
51
+
17
52
  def _python_redact_words(
18
53
  text: str,
19
54
  *,
@@ -21,6 +56,7 @@ def _python_redact_words(
21
56
  rate: float,
22
57
  merge_adjacent: bool,
23
58
  rng: random.Random,
59
+ unweighted: bool = False,
24
60
  ) -> str:
25
61
  """Redact random words by replacing their characters.
26
62
 
@@ -29,18 +65,39 @@ def _python_redact_words(
29
65
  - replacement_char: The character to use for redaction (default FULL_BLOCK).
30
66
  - rate: Max proportion of words to redact (default 0.05).
31
67
  - merge_adjacent: If True, merges adjacent redactions across intervening non-word chars.
32
- - seed: Seed used if `rng` not provided (default 151).
33
- - rng: Optional RNG; overrides seed.
68
+ - rng: RNG used for sampling decisions.
69
+ - unweighted: When True, sample words uniformly instead of by length.
34
70
  """
35
71
  # Preserve exact spacing and punctuation by using regex
36
72
  tokens = re.split(r"(\s+)", text)
37
73
  word_indices = [i for i, token in enumerate(tokens) if i % 2 == 0 and token.strip()]
38
74
  if not word_indices:
39
- raise ValueError("Cannot redact words because the input text contains no redactable words.")
40
- num_to_redact = max(1, int(len(word_indices) * rate))
41
-
42
- # Sample from the indices of actual words
43
- indices_to_redact = rng.sample(word_indices, k=num_to_redact)
75
+ raise ValueError(
76
+ "Cannot redact words because the input text contains no redactable words."
77
+ )
78
+ weights: list[float] = []
79
+ for index in word_indices:
80
+ word = tokens[index]
81
+ match = re.match(r"^(\W*)(.*?)(\W*)$", word)
82
+ core = match.group(2) if match else word
83
+ core_length = len(core) if core else len(word)
84
+ if core_length <= 0:
85
+ core_length = len(word.strip()) or len(word)
86
+ if core_length <= 0:
87
+ core_length = 1
88
+ weights.append(1.0 if unweighted else float(core_length))
89
+ raw_quota = len(word_indices) * rate
90
+ num_to_redact = int(raw_quota)
91
+ if rate > 0:
92
+ num_to_redact = max(1, num_to_redact)
93
+ if num_to_redact > len(word_indices):
94
+ raise ValueError("Sample larger than population or is negative")
95
+ indices_to_redact = _weighted_sample_without_replacement(
96
+ word_indices,
97
+ weights,
98
+ k=num_to_redact,
99
+ rng=rng,
100
+ )
44
101
  indices_to_redact.sort()
45
102
 
46
103
  for i in indices_to_redact:
@@ -80,13 +137,14 @@ def redact_words(
80
137
  rng: random.Random | None = None,
81
138
  *,
82
139
  redaction_rate: float | None = None,
140
+ unweighted: bool = False,
83
141
  ) -> str:
84
142
  """Redact random words by replacing their characters."""
85
143
 
86
144
  effective_rate = resolve_rate(
87
145
  rate=rate,
88
146
  legacy_value=redaction_rate,
89
- default=0.05,
147
+ default=0.025,
90
148
  legacy_name="redaction_rate",
91
149
  )
92
150
 
@@ -94,6 +152,7 @@ def redact_words(
94
152
  rng = random.Random(seed)
95
153
 
96
154
  clamped_rate = max(0.0, effective_rate)
155
+ unweighted_flag = bool(unweighted)
97
156
 
98
157
  use_rust = _redact_words_rust is not None and isinstance(merge_adjacent, bool)
99
158
 
@@ -103,6 +162,7 @@ def redact_words(
103
162
  replacement_char,
104
163
  clamped_rate,
105
164
  merge_adjacent,
165
+ unweighted_flag,
106
166
  rng,
107
167
  )
108
168
 
@@ -112,6 +172,7 @@ def redact_words(
112
172
  rate=clamped_rate,
113
173
  merge_adjacent=merge_adjacent,
114
174
  rng=rng,
175
+ unweighted=unweighted_flag,
115
176
  )
116
177
 
117
178
 
@@ -126,12 +187,13 @@ class Redactyl(Glitchling):
126
187
  redaction_rate: float | None = None,
127
188
  merge_adjacent: bool = False,
128
189
  seed: int = 151,
190
+ unweighted: bool = False,
129
191
  ) -> None:
130
192
  self._param_aliases = {"redaction_rate": "rate"}
131
193
  effective_rate = resolve_rate(
132
194
  rate=rate,
133
195
  legacy_value=redaction_rate,
134
- default=0.05,
196
+ default=0.025,
135
197
  legacy_name="redaction_rate",
136
198
  )
137
199
  super().__init__(
@@ -142,6 +204,7 @@ class Redactyl(Glitchling):
142
204
  replacement_char=replacement_char,
143
205
  rate=effective_rate,
144
206
  merge_adjacent=merge_adjacent,
207
+ unweighted=unweighted,
145
208
  )
146
209
 
147
210
  def pipeline_operation(self) -> dict[str, Any] | None:
@@ -150,15 +213,16 @@ class Redactyl(Glitchling):
150
213
  merge_adjacent = self.kwargs.get("merge_adjacent")
151
214
  if replacement_char is None or rate is None or merge_adjacent is None:
152
215
  return None
216
+ unweighted = bool(self.kwargs.get("unweighted", False))
153
217
  return {
154
218
  "type": "redact",
155
219
  "replacement_char": str(replacement_char),
156
220
  "redaction_rate": float(rate),
157
221
  "merge_adjacent": bool(merge_adjacent),
222
+ "unweighted": unweighted,
158
223
  }
159
224
 
160
225
 
161
-
162
226
  redactyl = Redactyl()
163
227
 
164
228
 
@@ -16,14 +16,15 @@ def _python_reduplicate_words(
16
16
  *,
17
17
  rate: float,
18
18
  rng: random.Random,
19
+ unweighted: bool = False,
19
20
  ) -> str:
20
21
  """Randomly reduplicate words in the text.
21
22
 
22
23
  Parameters
23
24
  - text: Input text.
24
25
  - rate: Max proportion of words to reduplicate (default 0.05).
25
- - seed: Optional seed if `rng` not provided.
26
- - rng: Optional RNG; overrides seed.
26
+ - rng: RNG used for sampling decisions.
27
+ - unweighted: When True, sample words uniformly instead of length-weighted.
27
28
 
28
29
  Notes
29
30
  - Preserves spacing and punctuation by tokenizing with separators.
@@ -32,6 +33,7 @@ def _python_reduplicate_words(
32
33
  # Preserve exact spacing and punctuation by using regex
33
34
  tokens = re.split(r"(\s+)", text) # Split but keep separators
34
35
 
36
+ candidate_weights: list[tuple[int, float]] = []
35
37
  for i in range(0, len(tokens), 2): # Every other token is a word
36
38
  if i >= len(tokens):
37
39
  break
@@ -40,16 +42,46 @@ def _python_reduplicate_words(
40
42
  if not word or word.isspace(): # Skip empty or whitespace
41
43
  continue
42
44
 
43
- # Only consider actual words for reduplication
44
- if rng.random() < rate:
45
- # Check if word has trailing punctuation
46
- match = re.match(r"^(\W*)(.*?)(\W*)$", word)
47
- if match:
48
- prefix, core, suffix = match.groups()
49
- # Reduplicate with a space: "word" -> "word word"
50
- tokens[i] = f"{prefix}{core} {core}{suffix}"
45
+ match = re.match(r"^(\W*)(.*?)(\W*)$", word)
46
+ core = match.group(2) if match else word
47
+ core_length = len(core) if core else len(word)
48
+ if core_length <= 0:
49
+ core_length = len(word.strip()) or len(word)
50
+ if core_length <= 0:
51
+ core_length = 1
52
+ weight = 1.0 if unweighted else 1.0 / core_length
53
+ candidate_weights.append((i, weight))
54
+
55
+ if not candidate_weights:
56
+ return "".join(tokens)
57
+
58
+ effective_rate = max(rate, 0.0)
59
+ if effective_rate <= 0.0:
60
+ return "".join(tokens)
61
+
62
+ mean_weight = sum(weight for _, weight in candidate_weights) / len(
63
+ candidate_weights
64
+ )
65
+
66
+ for index, weight in candidate_weights:
67
+ if effective_rate >= 1.0:
68
+ probability = 1.0
69
+ else:
70
+ if mean_weight <= 0.0:
71
+ probability = effective_rate
51
72
  else:
52
- tokens[i] = f"{word} {word}"
73
+ probability = min(1.0, effective_rate * (weight / mean_weight))
74
+ if rng.random() >= probability:
75
+ continue
76
+
77
+ word = tokens[index]
78
+ match = re.match(r"^(\W*)(.*?)(\W*)$", word)
79
+ if match:
80
+ prefix, core, suffix = match.groups()
81
+ # Reduplicate with a space: "word" -> "word word"
82
+ tokens[index] = f"{prefix}{core} {core}{suffix}"
83
+ else:
84
+ tokens[index] = f"{word} {word}"
53
85
  return "".join(tokens)
54
86
 
55
87
 
@@ -60,6 +92,7 @@ def reduplicate_words(
60
92
  rng: random.Random | None = None,
61
93
  *,
62
94
  reduplication_rate: float | None = None,
95
+ unweighted: bool = False,
63
96
  ) -> str:
64
97
  """Randomly reduplicate words in the text.
65
98
 
@@ -70,7 +103,7 @@ def reduplicate_words(
70
103
  effective_rate = resolve_rate(
71
104
  rate=rate,
72
105
  legacy_value=reduplication_rate,
73
- default=0.05,
106
+ default=0.01,
74
107
  legacy_name="reduplication_rate",
75
108
  )
76
109
 
@@ -78,14 +111,16 @@ def reduplicate_words(
78
111
  rng = random.Random(seed)
79
112
 
80
113
  clamped_rate = max(0.0, effective_rate)
114
+ unweighted_flag = bool(unweighted)
81
115
 
82
116
  if _reduplicate_words_rust is not None:
83
- return _reduplicate_words_rust(text, clamped_rate, rng)
117
+ return _reduplicate_words_rust(text, clamped_rate, unweighted_flag, rng)
84
118
 
85
119
  return _python_reduplicate_words(
86
120
  text,
87
121
  rate=clamped_rate,
88
122
  rng=rng,
123
+ unweighted=unweighted_flag,
89
124
  )
90
125
 
91
126
 
@@ -98,12 +133,13 @@ class Reduple(Glitchling):
98
133
  rate: float | None = None,
99
134
  reduplication_rate: float | None = None,
100
135
  seed: int | None = None,
136
+ unweighted: bool = False,
101
137
  ) -> None:
102
138
  self._param_aliases = {"reduplication_rate": "rate"}
103
139
  effective_rate = resolve_rate(
104
140
  rate=rate,
105
141
  legacy_value=reduplication_rate,
106
- default=0.05,
142
+ default=0.01,
107
143
  legacy_name="reduplication_rate",
108
144
  )
109
145
  super().__init__(
@@ -112,14 +148,19 @@ class Reduple(Glitchling):
112
148
  scope=AttackWave.WORD,
113
149
  seed=seed,
114
150
  rate=effective_rate,
151
+ unweighted=unweighted,
115
152
  )
116
153
 
117
154
  def pipeline_operation(self) -> dict[str, Any] | None:
118
155
  rate = self.kwargs.get("rate")
119
156
  if rate is None:
120
157
  return None
121
- return {"type": "reduplicate", "reduplication_rate": float(rate)}
122
-
158
+ unweighted = bool(self.kwargs.get("unweighted", False))
159
+ return {
160
+ "type": "reduplicate",
161
+ "reduplication_rate": float(rate),
162
+ "unweighted": unweighted,
163
+ }
123
164
 
124
165
 
125
166
  reduple = Reduple()
@@ -17,42 +17,67 @@ def _python_delete_random_words(
17
17
  *,
18
18
  rate: float,
19
19
  rng: random.Random,
20
+ unweighted: bool = False,
20
21
  ) -> str:
21
22
  """Delete random words from the input text while preserving whitespace."""
22
23
 
23
- if rate <= 0.0:
24
+ effective_rate = max(rate, 0.0)
25
+ if effective_rate <= 0.0:
24
26
  return text
25
27
 
26
28
  tokens = re.split(r"(\s+)", text) # Split but keep separators for later rejoin
27
29
 
28
- candidate_indices: list[int] = []
30
+ candidate_data: list[tuple[int, float]] = []
29
31
  for i in range(2, len(tokens), 2): # Every other token is a word, skip the first word
30
32
  word = tokens[i]
31
33
  if not word or word.isspace():
32
34
  continue
33
35
 
34
- candidate_indices.append(i)
36
+ match = re.match(r"^(\W*)(.*?)(\W*)$", word)
37
+ core = match.group(2) if match else word
38
+ core_length = len(core) if core else len(word)
39
+ if core_length <= 0:
40
+ core_length = len(word.strip()) or len(word)
41
+ if core_length <= 0:
42
+ core_length = 1
43
+ weight = 1.0 if unweighted else 1.0 / core_length
44
+ candidate_data.append((i, weight))
45
+
46
+ if not candidate_data:
47
+ return text
35
48
 
36
49
  allowed_deletions = min(
37
- len(candidate_indices), math.floor(len(candidate_indices) * rate)
50
+ len(candidate_data), math.floor(len(candidate_data) * effective_rate)
38
51
  )
39
52
  if allowed_deletions <= 0:
40
53
  return text
41
54
 
55
+ mean_weight = sum(weight for _, weight in candidate_data) / len(candidate_data)
56
+
42
57
  deletions = 0
43
- for i in candidate_indices:
44
- if rng.random() < rate:
45
- word = tokens[i]
46
- match = re.match(r"^(\W*)(.*?)(\W*)$", word)
47
- if match:
48
- prefix, _, suffix = match.groups()
49
- tokens[i] = f"{prefix.strip()}{suffix.strip()}"
58
+ for index, weight in candidate_data:
59
+ if deletions >= allowed_deletions:
60
+ break
61
+
62
+ if effective_rate >= 1.0:
63
+ probability = 1.0
64
+ else:
65
+ if mean_weight <= 0.0:
66
+ probability = effective_rate
50
67
  else:
51
- tokens[i] = ""
68
+ probability = min(1.0, effective_rate * (weight / mean_weight))
69
+ if rng.random() >= probability:
70
+ continue
71
+
72
+ word = tokens[index]
73
+ match = re.match(r"^(\W*)(.*?)(\W*)$", word)
74
+ if match:
75
+ prefix, _, suffix = match.groups()
76
+ tokens[index] = f"{prefix.strip()}{suffix.strip()}"
77
+ else:
78
+ tokens[index] = ""
52
79
 
53
- deletions += 1
54
- if deletions >= allowed_deletions:
55
- break
80
+ deletions += 1
56
81
 
57
82
  text = "".join(tokens)
58
83
  text = re.sub(r"\s+([.,;:])", r"\1", text)
@@ -68,6 +93,7 @@ def delete_random_words(
68
93
  rng: random.Random | None = None,
69
94
  *,
70
95
  max_deletion_rate: float | None = None,
96
+ unweighted: bool = False,
71
97
  ) -> str:
72
98
  """Delete random words from the input text.
73
99
 
@@ -85,14 +111,16 @@ def delete_random_words(
85
111
  rng = random.Random(seed)
86
112
 
87
113
  clamped_rate = max(0.0, effective_rate)
114
+ unweighted_flag = bool(unweighted)
88
115
 
89
116
  if _delete_random_words_rust is not None:
90
- return _delete_random_words_rust(text, clamped_rate, rng)
117
+ return _delete_random_words_rust(text, clamped_rate, unweighted_flag, rng)
91
118
 
92
119
  return _python_delete_random_words(
93
120
  text,
94
121
  rate=clamped_rate,
95
122
  rng=rng,
123
+ unweighted=unweighted_flag,
96
124
  )
97
125
 
98
126
 
@@ -105,6 +133,7 @@ class Rushmore(Glitchling):
105
133
  rate: float | None = None,
106
134
  max_deletion_rate: float | None = None,
107
135
  seed: int | None = None,
136
+ unweighted: bool = False,
108
137
  ) -> None:
109
138
  self._param_aliases = {"max_deletion_rate": "rate"}
110
139
  effective_rate = resolve_rate(
@@ -119,6 +148,7 @@ class Rushmore(Glitchling):
119
148
  scope=AttackWave.WORD,
120
149
  seed=seed,
121
150
  rate=effective_rate,
151
+ unweighted=unweighted,
122
152
  )
123
153
 
124
154
  def pipeline_operation(self) -> dict[str, Any] | None:
@@ -127,7 +157,12 @@ class Rushmore(Glitchling):
127
157
  rate = self.kwargs.get("max_deletion_rate")
128
158
  if rate is None:
129
159
  return None
130
- return {"type": "delete", "max_deletion_rate": float(rate)}
160
+ unweighted = bool(self.kwargs.get("unweighted", False))
161
+ return {
162
+ "type": "delete",
163
+ "max_deletion_rate": float(rate),
164
+ "unweighted": unweighted,
165
+ }
131
166
 
132
167
 
133
168
  rushmore = Rushmore()
@@ -0,0 +1,144 @@
1
+ from __future__ import annotations
2
+
3
+ import math
4
+ import random
5
+ from collections.abc import Sequence
6
+
7
+ from .core import Glitchling, AttackWave, AttackOrder
8
+ from ._rate import resolve_rate
9
+
10
+ try:
11
+ from glitchlings._zoo_rust import inject_zero_widths as _inject_zero_widths_rust
12
+ except ImportError: # pragma: no cover - compiled extension not present
13
+ _inject_zero_widths_rust = None
14
+
15
+ _DEFAULT_ZERO_WIDTH_CHARACTERS: tuple[str, ...] = (
16
+ "\u200b", # ZERO WIDTH SPACE
17
+ "\u200c", # ZERO WIDTH NON-JOINER
18
+ "\u200d", # ZERO WIDTH JOINER
19
+ "\ufeff", # ZERO WIDTH NO-BREAK SPACE
20
+ "\u2060", # WORD JOINER
21
+ )
22
+
23
+
24
+ def _python_insert_zero_widths(
25
+ text: str,
26
+ *,
27
+ rate: float,
28
+ rng: random.Random,
29
+ characters: Sequence[str],
30
+ ) -> str:
31
+ if not text:
32
+ return text
33
+
34
+ palette = [char for char in characters if char]
35
+ if not palette:
36
+ return text
37
+
38
+ positions = [
39
+ index + 1
40
+ for index in range(len(text) - 1)
41
+ if not text[index].isspace() and not text[index + 1].isspace()
42
+ ]
43
+ if not positions:
44
+ return text
45
+
46
+ total = len(positions)
47
+ clamped_rate = max(0.0, rate)
48
+ if clamped_rate <= 0.0:
49
+ return text
50
+
51
+ target = clamped_rate * total
52
+ count = math.floor(target)
53
+ remainder = target - count
54
+ if remainder > 0.0 and rng.random() < remainder:
55
+ count += 1
56
+ count = min(total, count)
57
+
58
+ if count <= 0:
59
+ return text
60
+
61
+ chosen = rng.sample(positions, count)
62
+ chosen.sort()
63
+
64
+ chars = list(text)
65
+ for position in reversed(chosen):
66
+ chars.insert(position, rng.choice(palette))
67
+
68
+ return "".join(chars)
69
+
70
+
71
+ def insert_zero_widths(
72
+ text: str,
73
+ rate: float | None = None,
74
+ seed: int | None = None,
75
+ rng: random.Random | None = None,
76
+ *,
77
+ characters: Sequence[str] | None = None,
78
+ ) -> str:
79
+ """Inject zero-width characters between non-space character pairs."""
80
+
81
+ effective_rate = resolve_rate(
82
+ rate=rate,
83
+ legacy_value=None,
84
+ default=0.02,
85
+ legacy_name="rate",
86
+ )
87
+
88
+ if rng is None:
89
+ rng = random.Random(seed)
90
+
91
+ palette: Sequence[str] = (
92
+ tuple(characters) if characters is not None else _DEFAULT_ZERO_WIDTH_CHARACTERS
93
+ )
94
+
95
+ cleaned_palette = tuple(char for char in palette if char)
96
+ if not cleaned_palette or not text:
97
+ return text
98
+
99
+ clamped_rate = max(0.0, effective_rate)
100
+ if clamped_rate == 0.0:
101
+ return text
102
+
103
+ if _inject_zero_widths_rust is not None:
104
+ return _inject_zero_widths_rust(text, clamped_rate, list(cleaned_palette), rng)
105
+
106
+ return _python_insert_zero_widths(
107
+ text,
108
+ rate=clamped_rate,
109
+ rng=rng,
110
+ characters=cleaned_palette,
111
+ )
112
+
113
+
114
+ class Zeedub(Glitchling):
115
+ """Glitchling that plants zero-width glyphs inside words."""
116
+
117
+ def __init__(
118
+ self,
119
+ *,
120
+ rate: float | None = None,
121
+ seed: int | None = None,
122
+ characters: Sequence[str] | None = None,
123
+ ) -> None:
124
+ effective_rate = resolve_rate(
125
+ rate=rate,
126
+ legacy_value=None,
127
+ default=0.02,
128
+ legacy_name="rate",
129
+ )
130
+ super().__init__(
131
+ name="Zeedub",
132
+ corruption_function=insert_zero_widths,
133
+ scope=AttackWave.CHARACTER,
134
+ order=AttackOrder.LAST,
135
+ seed=seed,
136
+ rate=effective_rate,
137
+ characters=tuple(characters) if characters is not None else None,
138
+ )
139
+
140
+
141
+ zeedub = Zeedub()
142
+
143
+
144
+ __all__ = ["Zeedub", "zeedub", "insert_zero_widths"]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: glitchlings
3
- Version: 0.2.4
3
+ Version: 0.2.6
4
4
  Summary: Monsters for your language games.
5
5
  Author: osoleve
6
6
  License: Apache License
@@ -296,7 +296,7 @@ print(gaggle(SAMPLE_TEXT))
296
296
 
297
297
  Consult the [Glitchlings Usage Guide](docs/index.md)
298
298
  for end-to-end instructions spanning the Python API, CLI, HuggingFace and Prime Intellect
299
- integrations, and the feature-flagged Rust pipeline.
299
+ integrations, and the autodetected Rust pipeline (enabled whenever the extension is present).
300
300
 
301
301
  ## Motivation
302
302
 
@@ -396,6 +396,18 @@ _How can a computer need reading glasses?_
396
396
  > - `rate (float)`: The maximum proportion of eligible confusion spans to replace (default: 0.02, 2%).
397
397
  > - `seed (int)`: The random seed for reproducibility (default: 151).
398
398
 
399
+ ### Zeedub
400
+
401
+ _A whispering glyph parasite that lives in the interstices of codepoints, marking territory with invisible traces._
402
+
403
+ > _**Invisible Ink.**_ Zeedub slips zero-width codepoints between non-space character pairs, forcing models to reason about text whose visible form masks hidden glyphs.
404
+ >
405
+ > Args
406
+ >
407
+ > - `rate (float)`: Expected number of zero-width insertions as a proportion of eligible bigrams (default: 0.02, 2%).
408
+ > - `characters (Sequence[str])`: Optional override for the pool of zero-width strings to inject (default: curated invisibles such as U+200B, U+200C, U+200D, U+FEFF, U+2060).
409
+ > - `seed (int)`: The random seed for reproducibility (default: 151).
410
+
399
411
  ### Jargoyle
400
412
 
401
413
  _Uh oh. The worst person you know just bought a thesaurus._
@@ -0,0 +1,27 @@
1
+ glitchlings/__init__.py,sha256=ui8kzf7mK5YAlFY1Og5UX5Rp14v4wC2ZqHihAJBBj6s,632
2
+ glitchlings/__main__.py,sha256=EOiBgay0x6B9VlSDzSQvMuoq6bHJdSvFSgcAVGGKkd4,121
3
+ glitchlings/_zoo_rust.cpython-310-darwin.so,sha256=4cUtfVEjY-3czJzu-DajFXyqfmgP_vjYZT4P1-Ip9WE,2389632
4
+ glitchlings/main.py,sha256=u6969Vl0n47e3S-ZlYZBj3HWVsjs-hvW6RpF9RYuXnc,5931
5
+ glitchlings/dlc/__init__.py,sha256=eTLEEWrVWPqniXHqee4W23H1rjElI1PQ_jcqWFe9D3g,141
6
+ glitchlings/dlc/huggingface.py,sha256=I1QWanWVxO02awgSpHDtgQEVF-9AQRLtsta2RCitWhE,2933
7
+ glitchlings/dlc/prime.py,sha256=wpRMNtgka1vNlEzifeCjGMp1q_-QclZn3NxXczGnNpM,9278
8
+ glitchlings/util/__init__.py,sha256=7KiZ0gKMjocfd34cajneZhTqYb7Hkwi_PpjltPqvkNI,4498
9
+ glitchlings/zoo/__init__.py,sha256=pdQSiQjMCqnhrM3qSRvu98FJd-EyXLNNwvthnYSXpmM,4282
10
+ glitchlings/zoo/_ocr_confusions.py,sha256=MkCbwk9T24SO2pD3JNPajYCfpMMlm2vQ5_sJty5GoXE,1218
11
+ glitchlings/zoo/_rate.py,sha256=TMyfVFV7pLxSGVswPlOAtBvk25Bjtx5xXTtpb_utgik,527
12
+ glitchlings/zoo/core.py,sha256=xLF9Op07KtMH0ql1-O7KyZ6lLESsdeNkvxdyiSOzhAc,14236
13
+ glitchlings/zoo/jargoyle.py,sha256=T6vPWBxceIPE6gOQ7BaihaqALOJwzXuhfiZzvKa4S50,10666
14
+ glitchlings/zoo/mim1c.py,sha256=yAt1ngR3j2KXLbzc8LhrQlIWRO_KT5dFK1EE8QivMAQ,3429
15
+ glitchlings/zoo/ocr_confusions.tsv,sha256=KhtR7vJDTITpfTSGa-I7RHr6CK7LkGi2KjdhEWipI6o,183
16
+ glitchlings/zoo/redactyl.py,sha256=wn7hxbtA0xMRuIXa6NNeeNOi0h0S8vh2bAa3x5Ec_Y0,6783
17
+ glitchlings/zoo/reduple.py,sha256=YNhTBH25XsXLeQD8xxXPE_JJMiCtmEpUFGGn36rd2tY,4857
18
+ glitchlings/zoo/rushmore.py,sha256=oG8MmMbrpmHH4rOp-NXkQznVlBCtSnrOttAZMdVlMkc,4729
19
+ glitchlings/zoo/scannequin.py,sha256=Ps8nxysKjkJV408zaL1kjVjy4jliATDBpYcNHLWbNFg,4859
20
+ glitchlings/zoo/typogre.py,sha256=xD02ldcMIA07XsdSts2bUniOc-k_DqTf0PBMaXGjLZE,6009
21
+ glitchlings/zoo/zeedub.py,sha256=D6rGk3O02OQ9jEIO9o0Ag-maVzNPN5O6qO3klG6Y62c,3552
22
+ glitchlings-0.2.6.dist-info/licenses/LICENSE,sha256=YCvGip-LoaRyu6h0nPo71q6eHEkzUpsE11psDJOIRkw,11337
23
+ glitchlings-0.2.6.dist-info/METADATA,sha256=5Xg6w5_-87bIRXY51i-nd7EmJMVPLtcBH_V3tj74CWI,26749
24
+ glitchlings-0.2.6.dist-info/WHEEL,sha256=G4cu_uTI97hAXSudQC0D9fpgNQkuavCNljtwFXiUqZM,114
25
+ glitchlings-0.2.6.dist-info/entry_points.txt,sha256=kGOwuAsjFDLtztLisaXtOouq9wFVMOJg5FzaAkg-Hto,54
26
+ glitchlings-0.2.6.dist-info/top_level.txt,sha256=VHFNBrLjtDwPCYXbGKi6o17Eueedi81eNbR3hBOoST0,12
27
+ glitchlings-0.2.6.dist-info/RECORD,,
@@ -1,26 +0,0 @@
1
- glitchlings/__init__.py,sha256=yD0BaldUpcc_QlHVca1z1iwpOp8ne1H9YVQHc85d1So,580
2
- glitchlings/__main__.py,sha256=EOiBgay0x6B9VlSDzSQvMuoq6bHJdSvFSgcAVGGKkd4,121
3
- glitchlings/_zoo_rust.cpython-310-darwin.so,sha256=6QV_035NzQnKIw_YGWOHqmQi_F19Nhur82A8kYyQ_gY,2369568
4
- glitchlings/main.py,sha256=u6969Vl0n47e3S-ZlYZBj3HWVsjs-hvW6RpF9RYuXnc,5931
5
- glitchlings/dlc/__init__.py,sha256=eTLEEWrVWPqniXHqee4W23H1rjElI1PQ_jcqWFe9D3g,141
6
- glitchlings/dlc/huggingface.py,sha256=I1QWanWVxO02awgSpHDtgQEVF-9AQRLtsta2RCitWhE,2933
7
- glitchlings/dlc/prime.py,sha256=v6wzkVxIsjTOAumn9cPfsmjuGf3RitCfUtk9eZzthyg,8698
8
- glitchlings/util/__init__.py,sha256=7KiZ0gKMjocfd34cajneZhTqYb7Hkwi_PpjltPqvkNI,4498
9
- glitchlings/zoo/__init__.py,sha256=bpQyCs-gEyv8RyQmBqssw-ozYLKz5yBIGODhiTGv-1U,4178
10
- glitchlings/zoo/_ocr_confusions.py,sha256=MkCbwk9T24SO2pD3JNPajYCfpMMlm2vQ5_sJty5GoXE,1218
11
- glitchlings/zoo/_rate.py,sha256=TMyfVFV7pLxSGVswPlOAtBvk25Bjtx5xXTtpb_utgik,527
12
- glitchlings/zoo/core.py,sha256=Fdxx4uoRH1WOL5rH_FeTUuQSwmnagP8mGXALq6IrtGY,14007
13
- glitchlings/zoo/jargoyle.py,sha256=T6vPWBxceIPE6gOQ7BaihaqALOJwzXuhfiZzvKa4S50,10666
14
- glitchlings/zoo/mim1c.py,sha256=yAt1ngR3j2KXLbzc8LhrQlIWRO_KT5dFK1EE8QivMAQ,3429
15
- glitchlings/zoo/ocr_confusions.tsv,sha256=KhtR7vJDTITpfTSGa-I7RHr6CK7LkGi2KjdhEWipI6o,183
16
- glitchlings/zoo/redactyl.py,sha256=IvyT9d-KPRTJoblSRTSagdFDhN8Y_ITBw9aSlFfE-Yo,4669
17
- glitchlings/zoo/reduple.py,sha256=5mNqdArs4raSEVH9tMLfhMl1s_uBDGxJ8h2DxM82vYw,3513
18
- glitchlings/zoo/rushmore.py,sha256=ooFmTKfq32NMjyehs5-luBPD0g9sFVZ5GTLk5dpGOp4,3544
19
- glitchlings/zoo/scannequin.py,sha256=Ps8nxysKjkJV408zaL1kjVjy4jliATDBpYcNHLWbNFg,4859
20
- glitchlings/zoo/typogre.py,sha256=xD02ldcMIA07XsdSts2bUniOc-k_DqTf0PBMaXGjLZE,6009
21
- glitchlings-0.2.4.dist-info/licenses/LICENSE,sha256=YCvGip-LoaRyu6h0nPo71q6eHEkzUpsE11psDJOIRkw,11337
22
- glitchlings-0.2.4.dist-info/METADATA,sha256=t99kSFIP7dv9B-x7U9tok4-ZkOmspRJp3J8MijNsU3M,26035
23
- glitchlings-0.2.4.dist-info/WHEEL,sha256=G4cu_uTI97hAXSudQC0D9fpgNQkuavCNljtwFXiUqZM,114
24
- glitchlings-0.2.4.dist-info/entry_points.txt,sha256=kGOwuAsjFDLtztLisaXtOouq9wFVMOJg5FzaAkg-Hto,54
25
- glitchlings-0.2.4.dist-info/top_level.txt,sha256=VHFNBrLjtDwPCYXbGKi6o17Eueedi81eNbR3hBOoST0,12
26
- glitchlings-0.2.4.dist-info/RECORD,,