glitchlings 0.2.5__cp312-cp312-win_amd64.whl → 0.3.0__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
glitchlings/__init__.py CHANGED
@@ -5,6 +5,8 @@ from .zoo import (
5
5
  mim1c,
6
6
  Jargoyle,
7
7
  jargoyle,
8
+ Adjax,
9
+ adjax,
8
10
  Redactyl,
9
11
  redactyl,
10
12
  Reduple,
@@ -29,6 +31,8 @@ __all__ = [
29
31
  "mim1c",
30
32
  "Jargoyle",
31
33
  "jargoyle",
34
+ "Adjax",
35
+ "adjax",
32
36
  "Redactyl",
33
37
  "redactyl",
34
38
  "Reduple",
Binary file
glitchlings/dlc/prime.py CHANGED
@@ -49,7 +49,24 @@ def _resolve_columns(dataset: Dataset, columns: Sequence[str] | None) -> list[st
49
49
  if candidate in available:
50
50
  return [candidate]
51
51
 
52
- sample = dataset[0] if len(dataset) else {}
52
+ try:
53
+ dataset_length = len(dataset) # type: ignore[arg-type]
54
+ except TypeError:
55
+ preview_rows: list[dict[str, Any]]
56
+ take_fn = getattr(dataset, "take", None)
57
+ if callable(take_fn):
58
+ preview_rows = list(take_fn(1))
59
+ else:
60
+ iterator = iter(dataset)
61
+ try:
62
+ first_row = next(iterator)
63
+ except StopIteration:
64
+ preview_rows = []
65
+ else:
66
+ preview_rows = [first_row]
67
+ sample = dict(preview_rows[0]) if preview_rows else {}
68
+ else:
69
+ sample = dataset[0] if dataset_length else {}
53
70
  inferred = [
54
71
  name
55
72
  for name in dataset.column_names
@@ -6,6 +6,7 @@ from typing import Any
6
6
  from .typogre import Typogre, typogre
7
7
  from .mim1c import Mim1c, mim1c
8
8
  from .jargoyle import Jargoyle, jargoyle, dependencies_available as _jargoyle_available
9
+ from .adjax import Adjax, adjax
9
10
  from .reduple import Reduple, reduple
10
11
  from .rushmore import Rushmore, rushmore
11
12
  from .redactyl import Redactyl, redactyl
@@ -20,6 +21,8 @@ __all__ = [
20
21
  "mim1c",
21
22
  "Jargoyle",
22
23
  "jargoyle",
24
+ "Adjax",
25
+ "adjax",
23
26
  "Reduple",
24
27
  "reduple",
25
28
  "Rushmore",
@@ -43,7 +46,7 @@ _HAS_JARGOYLE = _jargoyle_available()
43
46
  _BUILTIN_GLITCHLING_LIST: list[Glitchling] = [typogre, mim1c]
44
47
  if _HAS_JARGOYLE:
45
48
  _BUILTIN_GLITCHLING_LIST.append(jargoyle)
46
- _BUILTIN_GLITCHLING_LIST.extend([reduple, rushmore, redactyl, scannequin, zeedub])
49
+ _BUILTIN_GLITCHLING_LIST.extend([adjax, reduple, rushmore, redactyl, scannequin, zeedub])
47
50
 
48
51
  BUILTIN_GLITCHLINGS: dict[str, Glitchling] = {
49
52
  glitchling.name.lower(): glitchling for glitchling in _BUILTIN_GLITCHLING_LIST
@@ -52,6 +55,7 @@ BUILTIN_GLITCHLINGS: dict[str, Glitchling] = {
52
55
  _BUILTIN_GLITCHLING_TYPES: dict[str, type[Glitchling]] = {
53
56
  typogre.name.lower(): Typogre,
54
57
  mim1c.name.lower(): Mim1c,
58
+ adjax.name.lower(): Adjax,
55
59
  reduple.name.lower(): Reduple,
56
60
  rushmore.name.lower(): Rushmore,
57
61
  redactyl.name.lower(): Redactyl,
@@ -0,0 +1,42 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
# Splits on runs of whitespace; the capturing group keeps each run as a token.
_WORD_SPLIT_PATTERN = re.compile(r"(\s+)")
# Groups: leading non-word chars, lazily matched core, trailing non-word chars.
_TOKEN_EDGES_PATTERN = re.compile(r"^(\W*)(.*?)(\W*)$")


def split_preserving_whitespace(text: str) -> list[str]:
    """Split text while keeping whitespace tokens for stable reconstruction.

    The separator pattern is a capturing group, so the result alternates
    between non-whitespace segments (even indices, possibly empty) and the
    whitespace runs that separated them (odd indices). Joining the result
    with ``""`` reproduces ``text``.
    """

    return _WORD_SPLIT_PATTERN.split(text)


def split_token_edges(token: str) -> tuple[str, str, str]:
    """Return leading, core, and trailing segments for a token.

    The leading and trailing segments are the runs of non-word characters at
    either end of ``token``; the core is whatever lies between them. When the
    pattern does not match (for example a token with an embedded newline),
    the whole token is returned as the core with empty edges.
    """

    match = _TOKEN_EDGES_PATTERN.match(token)
    if match is None:
        return "", token, ""
    return match.group(1), match.group(2), match.group(3)


def token_core_length(token: str) -> int:
    """Return the length of the main word characters for weighting heuristics.

    Falls back to the full token length when the core is empty (a token made
    only of non-word characters) and to ``1`` for an empty token, so the
    result is always a positive integer that is safe to divide by.
    """

    _, core, _ = split_token_edges(token)
    return len(core) or len(token) or 1
36
+
37
+
38
+ __all__ = [
39
+ "split_preserving_whitespace",
40
+ "split_token_edges",
41
+ "token_core_length",
42
+ ]
@@ -0,0 +1,131 @@
1
+ from __future__ import annotations
2
+
3
+ import random
4
+ from typing import Any
5
+
6
+ from ._rate import resolve_rate
7
+ from ._text_utils import split_preserving_whitespace, split_token_edges
8
+ from .core import AttackWave, Glitchling
9
+
10
+ try:
11
+ from glitchlings._zoo_rust import swap_adjacent_words as _swap_adjacent_words_rust
12
+ except ImportError: # pragma: no cover - optional acceleration
13
+ _swap_adjacent_words_rust = None
14
+
15
+
16
def _python_swap_adjacent_words(
    text: str,
    *,
    rate: float,
    rng: random.Random,
) -> str:
    """Swap the cores of adjacent words while keeping affixes and spacing intact.

    Words are paired left to right without overlap (first with second, third
    with fourth, ...); a trailing unpaired word is never touched. Only the
    core of each word moves: the non-word characters at either edge of a
    token stay in their original position.

    Args:
        text: Input text.
        rate: Probability of swapping each pair; clamped to ``[0.0, 1.0]``.
        rng: Random source. It is consulted once per pair whose two cores are
            both non-empty, and only when the clamped rate is below ``1.0``.

    Returns:
        The text with the selected pairs swapped. The input is returned
        unchanged when the rate is zero or fewer than two words are present.
    """

    clamped = max(0.0, min(rate, 1.0))
    if clamped <= 0.0:
        # Nothing can be swapped, so skip tokenisation entirely.
        return text

    tokens = split_preserving_whitespace(text)
    # Non-whitespace segments sit at the even positions of the token list;
    # empty segments (e.g. from leading whitespace) are not words.
    word_indices = [
        index
        for index in range(0, len(tokens), 2)
        if tokens[index] and not tokens[index].isspace()
    ]
    if len(word_indices) < 2:
        return text

    for left_index, right_index in zip(word_indices[0::2], word_indices[1::2]):
        left_prefix, left_core, left_suffix = split_token_edges(tokens[left_index])
        right_prefix, right_core, right_suffix = split_token_edges(tokens[right_index])

        if not left_core or not right_core:
            continue

        # Short-circuit keeps the RNG untouched when every pair must swap.
        if not (clamped >= 1.0 or rng.random() < clamped):
            continue

        tokens[left_index] = f"{left_prefix}{right_core}{left_suffix}"
        tokens[right_index] = f"{right_prefix}{left_core}{right_suffix}"

    return "".join(tokens)
64
+
65
+
66
def swap_adjacent_words(
    text: str,
    rate: float | None = None,
    seed: int | None = None,
    rng: random.Random | None = None,
    *,
    swap_rate: float | None = None,
) -> str:
    """Exchange the cores of neighbouring words, leaving spacing and punctuation in place.

    ``rate`` and its legacy alias ``swap_rate`` are reconciled through
    ``resolve_rate`` (default ``0.5``) and the result is clamped to
    ``[0.0, 1.0]``. When no ``rng`` is supplied, one is created from
    ``seed``. The Rust implementation is used when it was importable;
    otherwise the pure-Python implementation runs.
    """

    bounded_rate = max(
        0.0,
        min(
            resolve_rate(
                rate=rate,
                legacy_value=swap_rate,
                default=0.5,
                legacy_name="swap_rate",
            ),
            1.0,
        ),
    )
    generator = random.Random(seed) if rng is None else rng

    if _swap_adjacent_words_rust is None:
        return _python_swap_adjacent_words(text, rate=bounded_rate, rng=generator)
    return _swap_adjacent_words_rust(text, bounded_rate, generator)
91
+
92
+
93
class Adjax(Glitchling):
    """Glitchling whose corruption swaps neighbouring words, scrambling local meaning."""

    def __init__(
        self,
        *,
        rate: float | None = None,
        swap_rate: float | None = None,
        seed: int | None = None,
    ) -> None:
        """Configure the swap probability.

        ``swap_rate`` is a legacy alias for ``rate``; the two are reconciled
        by ``resolve_rate`` with a default of ``0.5``.
        """
        # Set before the base initializer runs.
        # NOTE(review): presumably the base class reads this during setup - confirm.
        self._param_aliases = {"swap_rate": "rate"}
        super().__init__(
            name="Adjax",
            corruption_function=swap_adjacent_words,
            scope=AttackWave.WORD,
            seed=seed,
            rate=resolve_rate(
                rate=rate,
                legacy_value=swap_rate,
                default=0.5,
                legacy_name="swap_rate",
            ),
        )

    def pipeline_operation(self) -> dict[str, Any] | None:
        """Describe this glitchling as a pipeline operation, or ``None`` when no rate is stored."""
        configured = self.kwargs.get("rate")
        if configured is not None:
            return {"type": "swap_adjacent", "swap_rate": float(configured)}
        return None
126
+
127
+
128
+ adjax = Adjax()
129
+
130
+
131
+ __all__ = ["Adjax", "adjax", "swap_adjacent_words"]
glitchlings/zoo/core.py CHANGED
@@ -27,17 +27,25 @@ log = logging.getLogger(__name__)
27
27
 
28
28
 
29
29
_PIPELINE_FEATURE_FLAG_ENV = "GLITCHLINGS_RUST_PIPELINE"
_PIPELINE_ENABLE_VALUES = {"1", "true", "yes", "on"}
_PIPELINE_DISABLE_VALUES = {"0", "false", "no", "off"}


def _pipeline_feature_flag_enabled() -> bool:
    """Return ``True`` when the environment does not explicitly disable the Rust pipeline.

    The flag is read from the ``GLITCHLINGS_RUST_PIPELINE`` environment
    variable. The value is stripped and lower-cased before comparison. Only a
    recognised "off" value (``0``, ``false``, ``no``, ``off``) disables the
    pipeline; an unset variable, an explicit "on" value, and any unrecognised
    value all leave it enabled.
    """

    value = os.environ.get(_PIPELINE_FEATURE_FLAG_ENV)
    if value is None:
        return True

    return value.strip().lower() not in _PIPELINE_DISABLE_VALUES
41
49
 
42
50
  if TYPE_CHECKING: # pragma: no cover - typing only
43
51
  from datasets import Dataset # type: ignore
@@ -51,18 +59,26 @@ else:
51
59
  def with_transform(self, function: Any) -> "Dataset": ...
52
60
 
53
61
 
54
def _is_transcript(
    value: Any,
    *,
    allow_empty: bool = True,
    require_all_content: bool = False,
) -> bool:
    """Return `True` when `value` appears to be a chat transcript.

    A transcript is a list whose items are all dicts. An empty list counts
    as a transcript only when `allow_empty` is true. By default only the
    final turn must carry a "content" key; with `require_all_content` every
    turn must.
    """

    if not isinstance(value, list):
        return False
    if not value:
        return allow_empty

    for turn in value:
        if not isinstance(turn, dict):
            return False

    # Either every turn or just the last one has to provide "content".
    turns_to_check = value if require_all_content else value[-1:]
    return all("content" in turn for turn in turns_to_check)
67
83
 
68
84
 
@@ -225,21 +241,15 @@ class Glitchling:
225
241
  message = "datasets is not installed"
226
242
  raise ModuleNotFoundError(message) from _datasets_error
227
243
 
228
- def _is_transcript(value: Any) -> bool:
229
- """Return ``True`` when the value resembles a chat transcript."""
230
-
231
- if not isinstance(value, list) or not value:
232
- return False
233
-
234
- return all(
235
- isinstance(turn, dict) and "content" in turn for turn in value
236
- )
237
-
238
244
  def __corrupt_row(row: dict[str, Any]) -> dict[str, Any]:
239
245
  row = dict(row)
240
246
  for column in columns:
241
247
  value = row[column]
242
- if _is_transcript(value):
248
+ if _is_transcript(
249
+ value,
250
+ allow_empty=False,
251
+ require_all_content=True,
252
+ ):
243
253
  row[column] = self.corrupt(value)
244
254
  elif isinstance(value, list):
245
255
  row[column] = [self.corrupt(item) for item in value]
@@ -356,7 +366,7 @@ class Gaggle(Glitchling):
356
366
 
357
367
  @staticmethod
358
368
  def rust_pipeline_enabled() -> bool:
359
- """Return ``True`` when the Rust pipeline is available and opted in."""
369
+ """Return ``True`` when the Rust pipeline is available and not explicitly disabled."""
360
370
 
361
371
  return Gaggle.rust_pipeline_supported() and _pipeline_feature_flag_enabled()
362
372
 
@@ -2,8 +2,13 @@ import re
2
2
  import random
3
3
  from typing import Any
4
4
 
5
- from .core import Glitchling, AttackWave
6
5
  from ._rate import resolve_rate
6
+ from ._text_utils import (
7
+ split_preserving_whitespace,
8
+ split_token_edges,
9
+ token_core_length,
10
+ )
11
+ from .core import AttackWave, Glitchling
7
12
 
8
13
  FULL_BLOCK = "█"
9
14
 
@@ -68,8 +73,7 @@ def _python_redact_words(
68
73
  - rng: RNG used for sampling decisions.
69
74
  - unweighted: When True, sample words uniformly instead of by length.
70
75
  """
71
- # Preserve exact spacing and punctuation by using regex
72
- tokens = re.split(r"(\s+)", text)
76
+ tokens = split_preserving_whitespace(text)
73
77
  word_indices = [i for i, token in enumerate(tokens) if i % 2 == 0 and token.strip()]
74
78
  if not word_indices:
75
79
  raise ValueError(
@@ -78,15 +82,12 @@ def _python_redact_words(
78
82
  weights: list[float] = []
79
83
  for index in word_indices:
80
84
  word = tokens[index]
81
- match = re.match(r"^(\W*)(.*?)(\W*)$", word)
82
- core = match.group(2) if match else word
83
- core_length = len(core) if core else len(word)
84
- if core_length <= 0:
85
- core_length = len(word.strip()) or len(word)
86
- if core_length <= 0:
87
- core_length = 1
88
- weights.append(1.0 if unweighted else float(core_length))
89
- num_to_redact = max(1, int(len(word_indices) * rate))
85
+ length = token_core_length(word)
86
+ weights.append(1.0 if unweighted else float(length))
87
+ raw_quota = len(word_indices) * rate
88
+ num_to_redact = int(raw_quota)
89
+ if rate > 0:
90
+ num_to_redact = max(1, num_to_redact)
90
91
  if num_to_redact > len(word_indices):
91
92
  raise ValueError("Sample larger than population or is negative")
92
93
  indices_to_redact = _weighted_sample_without_replacement(
@@ -102,16 +103,11 @@ def _python_redact_words(
102
103
  break
103
104
 
104
105
  word = tokens[i]
105
- if not word or word.isspace(): # Skip empty or whitespace
106
+ if not word or word.isspace():
106
107
  continue
107
108
 
108
- # Check if word has trailing punctuation
109
- match = re.match(r"^(\W*)(.*?)(\W*)$", word)
110
- if match:
111
- prefix, core, suffix = match.groups()
112
- tokens[i] = f"{prefix}{replacement_char * len(core)}{suffix}"
113
- else:
114
- tokens[i] = f"{replacement_char * len(word)}"
109
+ prefix, core, suffix = split_token_edges(word)
110
+ tokens[i] = f"{prefix}{replacement_char * len(core)}{suffix}"
115
111
 
116
112
  text = "".join(tokens)
117
113
 
@@ -1,9 +1,13 @@
1
- import re
2
1
  import random
3
2
  from typing import Any
4
3
 
5
- from .core import Glitchling, AttackWave
6
4
  from ._rate import resolve_rate
5
+ from ._text_utils import (
6
+ split_preserving_whitespace,
7
+ split_token_edges,
8
+ token_core_length,
9
+ )
10
+ from .core import AttackWave, Glitchling
7
11
 
8
12
  try:
9
13
  from glitchlings._zoo_rust import reduplicate_words as _reduplicate_words_rust
@@ -30,26 +34,16 @@ def _python_reduplicate_words(
30
34
  - Preserves spacing and punctuation by tokenizing with separators.
31
35
  - Deterministic when run with a fixed seed or via Gaggle.
32
36
  """
33
- # Preserve exact spacing and punctuation by using regex
34
- tokens = re.split(r"(\s+)", text) # Split but keep separators
37
+ tokens = split_preserving_whitespace(text)
35
38
 
36
39
  candidate_weights: list[tuple[int, float]] = []
37
- for i in range(0, len(tokens), 2): # Every other token is a word
38
- if i >= len(tokens):
39
- break
40
-
40
+ for i in range(0, len(tokens), 2):
41
41
  word = tokens[i]
42
- if not word or word.isspace(): # Skip empty or whitespace
42
+ if not word or word.isspace():
43
43
  continue
44
44
 
45
- match = re.match(r"^(\W*)(.*?)(\W*)$", word)
46
- core = match.group(2) if match else word
47
- core_length = len(core) if core else len(word)
48
- if core_length <= 0:
49
- core_length = len(word.strip()) or len(word)
50
- if core_length <= 0:
51
- core_length = 1
52
- weight = 1.0 if unweighted else 1.0 / core_length
45
+ length = token_core_length(word)
46
+ weight = 1.0 if unweighted else 1.0 / length
53
47
  candidate_weights.append((i, weight))
54
48
 
55
49
  if not candidate_weights:
@@ -75,13 +69,8 @@ def _python_reduplicate_words(
75
69
  continue
76
70
 
77
71
  word = tokens[index]
78
- match = re.match(r"^(\W*)(.*?)(\W*)$", word)
79
- if match:
80
- prefix, core, suffix = match.groups()
81
- # Reduplicate with a space: "word" -> "word word"
82
- tokens[index] = f"{prefix}{core} {core}{suffix}"
83
- else:
84
- tokens[index] = f"{word} {word}"
72
+ prefix, core, suffix = split_token_edges(word)
73
+ tokens[index] = f"{prefix}{core} {core}{suffix}"
85
74
  return "".join(tokens)
86
75
 
87
76
 
@@ -3,8 +3,13 @@ import random
3
3
  import re
4
4
  from typing import Any
5
5
 
6
- from .core import Glitchling, AttackWave
7
6
  from ._rate import resolve_rate
7
+ from ._text_utils import (
8
+ split_preserving_whitespace,
9
+ split_token_edges,
10
+ token_core_length,
11
+ )
12
+ from .core import AttackWave, Glitchling
8
13
 
9
14
  try:
10
15
  from glitchlings._zoo_rust import delete_random_words as _delete_random_words_rust
@@ -25,22 +30,16 @@ def _python_delete_random_words(
25
30
  if effective_rate <= 0.0:
26
31
  return text
27
32
 
28
- tokens = re.split(r"(\s+)", text) # Split but keep separators for later rejoin
33
+ tokens = split_preserving_whitespace(text)
29
34
 
30
35
  candidate_data: list[tuple[int, float]] = []
31
- for i in range(2, len(tokens), 2): # Every other token is a word, skip the first word
36
+ for i in range(2, len(tokens), 2):
32
37
  word = tokens[i]
33
38
  if not word or word.isspace():
34
39
  continue
35
40
 
36
- match = re.match(r"^(\W*)(.*?)(\W*)$", word)
37
- core = match.group(2) if match else word
38
- core_length = len(core) if core else len(word)
39
- if core_length <= 0:
40
- core_length = len(word.strip()) or len(word)
41
- if core_length <= 0:
42
- core_length = 1
43
- weight = 1.0 if unweighted else 1.0 / core_length
41
+ length = token_core_length(word)
42
+ weight = 1.0 if unweighted else 1.0 / length
44
43
  candidate_data.append((i, weight))
45
44
 
46
45
  if not candidate_data:
@@ -70,12 +69,8 @@ def _python_delete_random_words(
70
69
  continue
71
70
 
72
71
  word = tokens[index]
73
- match = re.match(r"^(\W*)(.*?)(\W*)$", word)
74
- if match:
75
- prefix, _, suffix = match.groups()
76
- tokens[index] = f"{prefix.strip()}{suffix.strip()}"
77
- else:
78
- tokens[index] = ""
72
+ prefix, _, suffix = split_token_edges(word)
73
+ tokens[index] = f"{prefix.strip()}{suffix.strip()}"
79
74
 
80
75
  deletions += 1
81
76
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: glitchlings
3
- Version: 0.2.5
3
+ Version: 0.3.0
4
4
  Summary: Monsters for your language games.
5
5
  Author: osoleve
6
6
  License: Apache License
@@ -209,7 +209,7 @@ Project-URL: Homepage, https://github.com/osoleve/glitchlings
209
209
  Project-URL: Repository, https://github.com/osoleve/glitchlings.git
210
210
  Project-URL: Issues, https://github.com/osoleve/glitchlings/issues
211
211
  Project-URL: Changelog, https://github.com/osoleve/glitchlings/releases
212
- Keywords: nlp,text,adversarial augmentation,text augmentation,large language models,llms,data augmentation,confusables,typo,
212
+ Keywords: nlp,text,adversarial augmentation,text augmentation,large language models,llms,data augmentation,rlvr
213
213
  Classifier: Development Status :: 3 - Alpha
214
214
  Classifier: Intended Audience :: Developers
215
215
  Classifier: Programming Language :: Python
@@ -296,7 +296,7 @@ print(gaggle(SAMPLE_TEXT))
296
296
 
297
297
  Consult the [Glitchlings Usage Guide](docs/index.md)
298
298
  for end-to-end instructions spanning the Python API, CLI, HuggingFace and Prime Intellect
299
- integrations, and the feature-flagged Rust pipeline.
299
+ integrations, and the autodetected Rust pipeline (enabled whenever the extension is present).
300
300
 
301
301
  ## Motivation
302
302
 
@@ -428,7 +428,8 @@ _Did you say that or did I?_
428
428
  >
429
429
  > Args
430
430
  >
431
- > - `rate (float)`: The maximum proportion of words to reduplicate (default: 0.05, 5%).
431
+ > - `rate (float)`: The maximum proportion of words to reduplicate (default: 0.01, 1%).
432
+ > - `unweighted (bool)`: Sample words uniformly instead of favouring shorter tokens (default: False).
432
433
  > - `seed (int)`: The random seed for reproducibility (default: 151).
433
434
 
434
435
  ### Rushmore
@@ -440,6 +441,19 @@ _I accidentally an entire word._
440
441
  > Args
441
442
  >
442
443
  > - `rate (float)`: The maximum proportion of words to delete (default: 0.01, 1%).
444
+ > - `unweighted (bool)`: Sample words uniformly instead of favouring shorter tokens (default: False).
445
+ > - `seed (int)`: The random seed for reproducibility (default: 151).
446
+
447
+ ### Adjax
448
+
449
+ _Keep your hands and punctuation where I can see them._
450
+
451
+ > _**Perfect Shuffle.**_ Adjax trades the cores of neighbouring words while leaving punctuation, casing, and surrounding whitespace untouched, turning fluent prose into locally scrambled tongue-twisters.
452
+ >
453
+ > Args
454
+ >
455
+ > - `rate (float)`: Probability that each adjacent pair swaps cores (default: 0.5, 50%).
456
+ > - `swap_rate (float)`: Alias for `rate`, retained for backward compatibility.
443
457
  > - `seed (int)`: The random seed for reproducibility (default: 151).
444
458
 
445
459
  ### Redactyl
@@ -450,9 +464,10 @@ _Oops, that was my black highlighter._
450
464
  >
451
465
  > ### Args
452
466
  >
453
- > - `replacement_char (str)`: The character to use for redaction (default: ).
454
- > - `rate (float)`: The maximum proportion of words to redact (default: 0.05, 5%).
467
+ > - `replacement_char (str)`: The character to use for redaction (default: FULL_BLOCK).
468
+ > - `rate (float)`: The maximum proportion of words to redact (default: 0.025, 2.5%).
455
469
  > - `merge_adjacent (bool)`: Whether to redact the space between adjacent redacted words (default: False).
470
+ > - `unweighted (bool)`: Sample words uniformly instead of biasing toward longer tokens (default: False).
456
471
  > - `seed (int)`: The random seed for reproducibility (default: 151).
457
472
 
458
473
  ## Field Report: Uncontained Specimens
@@ -0,0 +1,29 @@
1
+ glitchlings/__init__.py,sha256=lqzYzB1RdQnw-NpWXN2dtcEGDkQ-OkC2OTEL16HDMYc,730
2
+ glitchlings/__main__.py,sha256=pqNe1C9hMf8pap4oh6x6yo2h4Nsa2RFSaMWHfGtNXj0,130
3
+ glitchlings/_zoo_rust.cp312-win_amd64.pyd,sha256=AW-mnIw4O-B53whfrlhwNRB-OEAssSdgnSJEUfjxZvc,2024960
4
+ glitchlings/main.py,sha256=QrSSLWcKh1_NDfJDGh-3UVKdI7AkzfMy6Jz1ouxIgnE,6149
5
+ glitchlings/dlc/__init__.py,sha256=IHD-GGhVFb7SVzErvf2YCJkOR4wGo0nFHXkn_daMvS8,146
6
+ glitchlings/dlc/huggingface.py,sha256=PIesnDIEvyJxj1IuLw2P9nVPTr4Nv81XM7w2axfyhkA,3029
7
+ glitchlings/dlc/prime.py,sha256=b5CE1qDl5MxZjTudlKrqMsmSGxXNKZ16krqPyrr2nK8,9569
8
+ glitchlings/util/__init__.py,sha256=GoyQuHTfGRkHzuZwJji6QWSiGd_LHa9QiyjjEpBFW7E,4679
9
+ glitchlings/zoo/__init__.py,sha256=LryHn930FuEdKRyvtRu7breBvz9IYYTvJv7yGIxLd5Y,4520
10
+ glitchlings/zoo/_ocr_confusions.py,sha256=W59Aa5MBDwRF65f8GV-6XwGAmlR5Uk7pa5qvHvhIYdY,1252
11
+ glitchlings/zoo/_rate.py,sha256=EYUWXYyR2IK0zYBWyBOlnUjDxU32JE9mZTZeodVx5CA,548
12
+ glitchlings/zoo/_text_utils.py,sha256=pul6iGtVWir4mX-Mq5ni06JFOzf6x3J82iYSICXJCGE,1162
13
+ glitchlings/zoo/adjax.py,sha256=G2diAEsQ8T4mjFCcTeiGzLF0261n7LjLyW5HyVCy3R4,3661
14
+ glitchlings/zoo/core.py,sha256=sK3F1OVifbzQFsDrG-pQIImcGP7YfccwTfbqFTJi8Fc,14622
15
+ glitchlings/zoo/jargoyle.py,sha256=1fnL_8bv1Y-T2h1C6NRzIylYyOuAUI-BiMReFewqh00,11002
16
+ glitchlings/zoo/mim1c.py,sha256=3ddNOzWgLABuEOh5T98Xk439ejx-YHGI7ErXET03Crc,3537
17
+ glitchlings/zoo/ocr_confusions.tsv,sha256=S-IJEYCIXYKT1Uu7Id8Lnvg5pw528yNigTtWUdnMv9k,213
18
+ glitchlings/zoo/redactyl.py,sha256=poBzhXtApDa55G7iVCGEM4v1_YSYh3LfEAp2fkVFIJ4,6579
19
+ glitchlings/zoo/reduple.py,sha256=orgS3ajpuGTDN-QqGuYgfkEI7yVCgIXHtL_HHp8jGmE,4471
20
+ glitchlings/zoo/rushmore.py,sha256=rUluMdjvSxaVlUfK9_N0F108O5Exoa4klWLumrV2CgA,4535
21
+ glitchlings/zoo/scannequin.py,sha256=TJyNYTTIB7rxZH3XKIETy0YVf4EjsMgGWYmYaxH9jxU,5030
22
+ glitchlings/zoo/typogre.py,sha256=olTTXDmFkVQ3r-T1vxm2mLomRvIDXHrNHfgin316wzE,6221
23
+ glitchlings/zoo/zeedub.py,sha256=n1qTKE_Dl0m8SEKhaP91oHAyJ484NxaGLPu_ZLr0Ldo,3696
24
+ glitchlings-0.3.0.dist-info/licenses/LICENSE,sha256=EFEP1evBfHaxsMTBjxm0sZVRp2wct8QLvHE1saII5FI,11538
25
+ glitchlings-0.3.0.dist-info/METADATA,sha256=b9uWb19S04moT94a_onQBjurDfBHJulwIW4R2ep84mE,28084
26
+ glitchlings-0.3.0.dist-info/WHEEL,sha256=8UP9x9puWI0P1V_d7K2oMTBqfeLNm21CTzZ_Ptr0NXU,101
27
+ glitchlings-0.3.0.dist-info/entry_points.txt,sha256=kGOwuAsjFDLtztLisaXtOouq9wFVMOJg5FzaAkg-Hto,54
28
+ glitchlings-0.3.0.dist-info/top_level.txt,sha256=VHFNBrLjtDwPCYXbGKi6o17Eueedi81eNbR3hBOoST0,12
29
+ glitchlings-0.3.0.dist-info/RECORD,,
@@ -1,27 +0,0 @@
1
- glitchlings/__init__.py,sha256=fjerquRITZQY_rY5mhTVVQyeGAz1qTpgicvDhbpqgi8,678
2
- glitchlings/__main__.py,sha256=pqNe1C9hMf8pap4oh6x6yo2h4Nsa2RFSaMWHfGtNXj0,130
3
- glitchlings/_zoo_rust.cp312-win_amd64.pyd,sha256=JO0QDAqXc14YPXxmghAKNuXZ3uHFEOyBa-SkU4EO_fI,2019328
4
- glitchlings/main.py,sha256=QrSSLWcKh1_NDfJDGh-3UVKdI7AkzfMy6Jz1ouxIgnE,6149
5
- glitchlings/dlc/__init__.py,sha256=IHD-GGhVFb7SVzErvf2YCJkOR4wGo0nFHXkn_daMvS8,146
6
- glitchlings/dlc/huggingface.py,sha256=PIesnDIEvyJxj1IuLw2P9nVPTr4Nv81XM7w2axfyhkA,3029
7
- glitchlings/dlc/prime.py,sha256=hySyYBncUM-49j6JtrHYO6c3HpbG2vTt2EYZnOJ85C0,8972
8
- glitchlings/util/__init__.py,sha256=GoyQuHTfGRkHzuZwJji6QWSiGd_LHa9QiyjjEpBFW7E,4679
9
- glitchlings/zoo/__init__.py,sha256=mAhsnR3ZK9BocxT3J4WF6JcYQMYI9e_EYZ-GMxHv0P4,4420
10
- glitchlings/zoo/_ocr_confusions.py,sha256=W59Aa5MBDwRF65f8GV-6XwGAmlR5Uk7pa5qvHvhIYdY,1252
11
- glitchlings/zoo/_rate.py,sha256=EYUWXYyR2IK0zYBWyBOlnUjDxU32JE9mZTZeodVx5CA,548
12
- glitchlings/zoo/core.py,sha256=QKHmzmONNkiA3RdfgLdNx-FPFwoH4Bm-Tkc3vSCHNpc,14412
13
- glitchlings/zoo/jargoyle.py,sha256=1fnL_8bv1Y-T2h1C6NRzIylYyOuAUI-BiMReFewqh00,11002
14
- glitchlings/zoo/mim1c.py,sha256=3ddNOzWgLABuEOh5T98Xk439ejx-YHGI7ErXET03Crc,3537
15
- glitchlings/zoo/ocr_confusions.tsv,sha256=S-IJEYCIXYKT1Uu7Id8Lnvg5pw528yNigTtWUdnMv9k,213
16
- glitchlings/zoo/redactyl.py,sha256=jjLad9ugG8516CNhuUfv16OOs9HwqTiUzVqY0CLskhY,6928
17
- glitchlings/zoo/reduple.py,sha256=oGkOkH9bJiG-ogsi5ewglq6FUmzvRM6UC4N61LyNdvk,5026
18
- glitchlings/zoo/rushmore.py,sha256=RE-br8OAIBRil3Mz381OcdMtb1fuNCZ7LzAjt44hFkM,4900
19
- glitchlings/zoo/scannequin.py,sha256=TJyNYTTIB7rxZH3XKIETy0YVf4EjsMgGWYmYaxH9jxU,5030
20
- glitchlings/zoo/typogre.py,sha256=olTTXDmFkVQ3r-T1vxm2mLomRvIDXHrNHfgin316wzE,6221
21
- glitchlings/zoo/zeedub.py,sha256=n1qTKE_Dl0m8SEKhaP91oHAyJ484NxaGLPu_ZLr0Ldo,3696
22
- glitchlings-0.2.5.dist-info/licenses/LICENSE,sha256=EFEP1evBfHaxsMTBjxm0sZVRp2wct8QLvHE1saII5FI,11538
23
- glitchlings-0.2.5.dist-info/METADATA,sha256=UG-L-7qePJBz0sKAWGX2yKNmup3D_gaid0DDlPsuf3Y,27198
24
- glitchlings-0.2.5.dist-info/WHEEL,sha256=8UP9x9puWI0P1V_d7K2oMTBqfeLNm21CTzZ_Ptr0NXU,101
25
- glitchlings-0.2.5.dist-info/entry_points.txt,sha256=kGOwuAsjFDLtztLisaXtOouq9wFVMOJg5FzaAkg-Hto,54
26
- glitchlings-0.2.5.dist-info/top_level.txt,sha256=VHFNBrLjtDwPCYXbGKi6o17Eueedi81eNbR3hBOoST0,12
27
- glitchlings-0.2.5.dist-info/RECORD,,