glitchlings 0.2.1__cp312-cp312-win_amd64.whl → 0.2.3__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Binary file
glitchlings/dlc/prime.py CHANGED
@@ -79,8 +79,8 @@ def tutorial_level(
79
79
  ) -> vf.Environment:
80
80
  """Create a low-corruption environment using tuned defaults."""
81
81
 
82
- tuned_mim1c = Mim1c(replacement_rate=0.01 * difficulty.value)
83
- tuned_typogre = Typogre(max_change_rate=0.025 * difficulty.value)
82
+ tuned_mim1c = Mim1c(rate=0.01 * difficulty.value)
83
+ tuned_typogre = Typogre(rate=0.025 * difficulty.value)
84
84
 
85
85
  return load_environment(
86
86
  env,
@@ -220,32 +220,54 @@ def echo_chamber(
220
220
  "Specify which split to use when the dataset loads as a DatasetDict."
221
221
  )
222
222
 
223
- prompts: list[list[dict[str, str]]] = []
224
- answers: list[str] = []
223
+ filtered_dataset = hf_dataset.filter(
224
+ lambda row: row.get(column) is not None,
225
+ load_from_cache_file=False,
226
+ )
225
227
 
226
- for row in hf_dataset:
227
- value = row.get(column)
228
- if value is None:
229
- continue
228
+ source_column_names = list(filtered_dataset.column_names)
230
229
 
231
- text = str(value)
232
- prompts.append(
233
- [
234
- {"role": "system", "content": instructions},
235
- {"role": "user", "content": f"Corrupted text:\n{text}"},
236
- ]
237
- )
238
- answers.append(text)
230
+ def _build_prompt(row: dict[str, Any]) -> dict[str, Any]:
231
+ text = str(row[column])
232
+ prompt = [
233
+ {"role": "system", "content": instructions},
234
+ {"role": "user", "content": f"Corrupted text:\n{text}"},
235
+ ]
236
+ return {"prompt": prompt, "answer": text}
239
237
 
240
- if not prompts:
241
- raise ValueError(
242
- f"Column '{column}' did not yield any textual entries in dataset '{dataset_id}'."
243
- )
238
+ base_dataset = filtered_dataset.map(
239
+ _build_prompt,
240
+ remove_columns=source_column_names,
241
+ load_from_cache_file=False,
242
+ )
244
243
 
245
- dataset = HFDataset.from_dict({"prompt": prompts, "answer": answers})
244
+ try:
245
+ dataset_length = len(base_dataset) # type: ignore[arg-type]
246
+ except TypeError:
247
+ preview_rows: list[dict[str, Any]]
248
+ take_fn = getattr(base_dataset, "take", None)
249
+ if callable(take_fn):
250
+ preview_rows = list(take_fn(1))
251
+ else:
252
+ iterator = iter(base_dataset)
253
+ try:
254
+ first_row = next(iterator)
255
+ except StopIteration:
256
+ preview_rows = []
257
+ else:
258
+ preview_rows = [first_row]
259
+ if not preview_rows:
260
+ raise ValueError(
261
+ f"Column '{column}' did not yield any textual entries in dataset '{dataset_id}'."
262
+ )
263
+ else:
264
+ if dataset_length == 0:
265
+ raise ValueError(
266
+ f"Column '{column}' did not yield any textual entries in dataset '{dataset_id}'."
267
+ )
246
268
 
247
269
  gaggle = _as_gaggle(glitchlings, seed=seed)
248
- glitched_dataset = gaggle.corrupt_dataset(dataset, ["prompt"])
270
+ glitched_dataset = gaggle.corrupt_dataset(base_dataset, ["prompt"])
249
271
 
250
272
  rubric_func = reward_function or symmetric_damerau_levenshtein_similarity
251
273
  rubric = vf.Rubric(funcs=[rubric_func], weights=[1.0])
glitchlings/main.py CHANGED
@@ -11,31 +11,12 @@ from . import SAMPLE_TEXT
11
11
  from .zoo import (
12
12
  Glitchling,
13
13
  Gaggle,
14
- jargoyle,
15
- mim1c,
16
- typogre,
17
- reduple,
18
- rushmore,
19
- redactyl,
20
- scannequin,
14
+ BUILTIN_GLITCHLINGS,
15
+ DEFAULT_GLITCHLING_NAMES,
16
+ parse_glitchling_spec,
21
17
  summon,
22
18
  )
23
19
 
24
-
25
- BUILTIN_GLITCHLINGS: dict[str, Glitchling] = {
26
- g.name.lower(): g
27
- for g in [
28
- typogre,
29
- mim1c,
30
- jargoyle,
31
- reduple,
32
- rushmore,
33
- redactyl,
34
- scannequin,
35
- ]
36
- }
37
-
38
- DEFAULT_GLITCHLING_NAMES: list[str] = list(BUILTIN_GLITCHLINGS.keys())
39
20
  MAX_NAME_WIDTH = max(len(glitchling.name) for glitchling in BUILTIN_GLITCHLINGS.values())
40
21
 
41
22
 
@@ -62,8 +43,11 @@ def build_parser() -> argparse.ArgumentParser:
62
43
  "--glitchling",
63
44
  dest="glitchlings",
64
45
  action="append",
65
- metavar="NAME",
66
- help="Glitchling to apply (repeat for multiples). Defaults to all built-ins.",
46
+ metavar="SPEC",
47
+ help=(
48
+ "Glitchling to apply, optionally with parameters like "
49
+ "Typogre(rate=0.05). Repeat for multiples; defaults to all built-ins."
50
+ ),
67
51
  )
68
52
  parser.add_argument(
69
53
  "-s",
@@ -147,23 +131,16 @@ def read_text(args: argparse.Namespace, parser: argparse.ArgumentParser) -> str:
147
131
  def summon_glitchlings(
148
132
  names: list[str] | None, parser: argparse.ArgumentParser, seed: int
149
133
  ) -> Gaggle:
150
- """Instantiate the requested glitchlings and bundle them in a ``Gaggle``.
151
-
152
- Args:
153
- names: Optional list of glitchling names provided by the user.
154
- parser: The argument parser used for emitting user-facing errors.
155
- seed: Master seed controlling deterministic corruption order.
156
-
157
- Returns:
158
- Gaggle: A ready-to-use collection of glitchlings.
159
-
160
- Raises:
161
- SystemExit: Raised indirectly via ``parser.error`` when a provided glitchling
162
- name is invalid.
163
- """
134
+ """Instantiate the requested glitchlings and bundle them in a ``Gaggle``."""
164
135
 
165
136
  if names:
166
- normalized = [name.lower() for name in names]
137
+ normalized: list[str | Glitchling] = []
138
+ for specification in names:
139
+ try:
140
+ normalized.append(parse_glitchling_spec(specification))
141
+ except ValueError as exc:
142
+ parser.error(str(exc))
143
+ raise AssertionError("parser.error should exit")
167
144
  else:
168
145
  normalized = DEFAULT_GLITCHLING_NAMES
169
146
 
@@ -174,6 +151,7 @@ def summon_glitchlings(
174
151
  raise AssertionError("parser.error should exit")
175
152
 
176
153
 
154
+
177
155
  def show_diff(original: str, corrupted: str) -> None:
178
156
  """Display a unified diff between the original and corrupted text."""
179
157
 
@@ -141,6 +141,36 @@ _register_layout(
141
141
  ),
142
142
  )
143
143
 
144
+ _register_layout(
145
+ "QWERTZ",
146
+ (
147
+ "^1234567890ß´",
148
+ " qwertzuiopü+",
149
+ " asdfghjklöä#",
150
+ " yxcvbnm,.-",
151
+ ),
152
+ )
153
+
154
+ _register_layout(
155
+ "SPANISH_QWERTY",
156
+ (
157
+ "º1234567890'¡",
158
+ " qwertyuiop´+",
159
+ " asdfghjklñ´",
160
+ " <zxcvbnm,.-",
161
+ ),
162
+ )
163
+
164
+ _register_layout(
165
+ "SWEDISH_QWERTY",
166
+ (
167
+ "§1234567890+´",
168
+ " qwertyuiopå¨",
169
+ " asdfghjklöä'",
170
+ " <zxcvbnm,.-",
171
+ ),
172
+ )
173
+
144
174
 
145
175
  class KeyNeighbors:
146
176
  def __init__(self) -> None:
@@ -1,6 +1,11 @@
1
+ from __future__ import annotations
2
+
3
+ import ast
4
+ from typing import Any
5
+
1
6
  from .typogre import Typogre, typogre
2
7
  from .mim1c import Mim1c, mim1c
3
- from .jargoyle import Jargoyle, jargoyle
8
+ from .jargoyle import Jargoyle, jargoyle, dependencies_available as _jargoyle_available
4
9
  from .reduple import Reduple, reduple
5
10
  from .rushmore import Rushmore, rushmore
6
11
  from .redactyl import Redactyl, redactyl
@@ -25,33 +30,105 @@ __all__ = [
25
30
  "Glitchling",
26
31
  "Gaggle",
27
32
  "summon",
33
+ "BUILTIN_GLITCHLINGS",
34
+ "DEFAULT_GLITCHLING_NAMES",
35
+ "parse_glitchling_spec",
28
36
  ]
29
37
 
38
+ _HAS_JARGOYLE = _jargoyle_available()
39
+
40
+ _BUILTIN_GLITCHLING_LIST: list[Glitchling] = [typogre, mim1c]
41
+ if _HAS_JARGOYLE:
42
+ _BUILTIN_GLITCHLING_LIST.append(jargoyle)
43
+ _BUILTIN_GLITCHLING_LIST.extend([reduple, rushmore, redactyl, scannequin])
44
+
45
+ BUILTIN_GLITCHLINGS: dict[str, Glitchling] = {
46
+ glitchling.name.lower(): glitchling for glitchling in _BUILTIN_GLITCHLING_LIST
47
+ }
48
+
49
+ _BUILTIN_GLITCHLING_TYPES: dict[str, type[Glitchling]] = {
50
+ typogre.name.lower(): Typogre,
51
+ mim1c.name.lower(): Mim1c,
52
+ reduple.name.lower(): Reduple,
53
+ rushmore.name.lower(): Rushmore,
54
+ redactyl.name.lower(): Redactyl,
55
+ scannequin.name.lower(): Scannequin,
56
+ }
57
+ if _HAS_JARGOYLE:
58
+ _BUILTIN_GLITCHLING_TYPES[jargoyle.name.lower()] = Jargoyle
59
+
60
+ DEFAULT_GLITCHLING_NAMES: list[str] = list(BUILTIN_GLITCHLINGS.keys())
61
+
62
+
63
+ def parse_glitchling_spec(specification: str) -> Glitchling:
64
+ """Return a glitchling instance configured according to ``specification``."""
65
+
66
+ text = specification.strip()
67
+ if not text:
68
+ raise ValueError("Glitchling specification cannot be empty.")
69
+
70
+ if "(" not in text:
71
+ glitchling = BUILTIN_GLITCHLINGS.get(text.lower())
72
+ if glitchling is None:
73
+ raise ValueError(f"Glitchling '{text}' not found.")
74
+ return glitchling
75
+
76
+ if not text.endswith(")"):
77
+ raise ValueError(f"Invalid parameter syntax for glitchling '{text}'.")
78
+
79
+ name_part, arg_source = text[:-1].split("(", 1)
80
+ name = name_part.strip()
81
+ if not name:
82
+ raise ValueError(f"Invalid glitchling specification '{text}'.")
83
+
84
+ lower_name = name.lower()
85
+ glitchling_type = _BUILTIN_GLITCHLING_TYPES.get(lower_name)
86
+ if glitchling_type is None:
87
+ raise ValueError(f"Glitchling '{name}' not found.")
88
+
89
+ try:
90
+ call_expr = ast.parse(f"_({arg_source})", mode="eval").body
91
+ except SyntaxError as exc:
92
+ raise ValueError(
93
+ f"Invalid parameter syntax for glitchling '{name}': {exc.msg}"
94
+ ) from exc
95
+
96
+ if not isinstance(call_expr, ast.Call) or call_expr.args:
97
+ raise ValueError(
98
+ f"Glitchling '{name}' parameters must be provided as keyword arguments."
99
+ )
100
+
101
+ kwargs: dict[str, Any] = {}
102
+ for keyword in call_expr.keywords:
103
+ if keyword.arg is None:
104
+ raise ValueError(
105
+ f"Glitchling '{name}' does not support unpacking arbitrary keyword arguments."
106
+ )
107
+ try:
108
+ kwargs[keyword.arg] = ast.literal_eval(keyword.value)
109
+ except (ValueError, SyntaxError) as exc:
110
+ raise ValueError(
111
+ f"Failed to parse value for parameter '{keyword.arg}' on glitchling '{name}': {exc}"
112
+ ) from exc
113
+
114
+ try:
115
+ return glitchling_type(**kwargs)
116
+ except TypeError as exc:
117
+ raise ValueError(f"Failed to instantiate glitchling '{name}': {exc}") from exc
118
+
30
119
 
31
120
  def summon(glitchlings: list[str | Glitchling], seed: int = 151) -> Gaggle:
32
121
  """Summon glitchlings by name (using defaults) or instance (to change parameters)."""
33
- available = {
34
- g.name.lower(): g
35
- for g in [
36
- typogre,
37
- mim1c,
38
- jargoyle,
39
- reduple,
40
- rushmore,
41
- redactyl,
42
- scannequin,
43
- ]
44
- }
45
- summoned = []
122
+
123
+ summoned: list[Glitchling] = []
46
124
  for entry in glitchlings:
47
125
  if isinstance(entry, Glitchling):
48
126
  summoned.append(entry)
49
127
  continue
50
128
 
51
- g = available.get(entry.lower())
52
- if g:
53
- summoned.append(g)
54
- else:
55
- raise ValueError(f"Glitchling '{entry}' not found.")
129
+ try:
130
+ summoned.append(parse_glitchling_spec(entry))
131
+ except ValueError as exc:
132
+ raise ValueError(str(exc)) from exc
56
133
 
57
134
  return Gaggle(summoned, seed=seed)
@@ -0,0 +1,34 @@
1
+ from __future__ import annotations
2
+
3
+ from importlib import resources
4
+
5
+ _CONFUSION_TABLE: list[tuple[str, list[str]]] | None = None
6
+
7
+
8
+ def load_confusion_table() -> list[tuple[str, list[str]]]:
9
+ """Load the OCR confusion table shared by Python and Rust implementations."""
10
+ global _CONFUSION_TABLE
11
+ if _CONFUSION_TABLE is not None:
12
+ return _CONFUSION_TABLE
13
+
14
+ data = resources.files(__package__) / "ocr_confusions.tsv"
15
+ text = data.read_text(encoding="utf-8")
16
+ indexed_entries: list[tuple[int, tuple[str, list[str]]]] = []
17
+ for line_number, line in enumerate(text.splitlines()):
18
+ stripped = line.strip()
19
+ if not stripped or stripped.startswith("#"):
20
+ continue
21
+ parts = stripped.split()
22
+ if len(parts) < 2:
23
+ continue
24
+ source, *replacements = parts
25
+ indexed_entries.append((line_number, (source, replacements)))
26
+
27
+ # Sort longer patterns first to avoid overlapping matches, mirroring the
28
+ # behaviour of the Rust `confusion_table` helper.
29
+ indexed_entries.sort(
30
+ key=lambda item: (-len(item[1][0]), item[0])
31
+ )
32
+ entries = [entry for _, entry in indexed_entries]
33
+ _CONFUSION_TABLE = entries
34
+ return entries
@@ -0,0 +1,21 @@
1
+ from __future__ import annotations
2
+
3
+
4
+ def resolve_rate(
5
+ *,
6
+ rate: float | None,
7
+ legacy_value: float | None,
8
+ default: float,
9
+ legacy_name: str,
10
+ ) -> float:
11
+ """Return the effective rate while enforcing mutual exclusivity."""
12
+
13
+ if rate is not None and legacy_value is not None:
14
+ raise ValueError(
15
+ f"Specify either 'rate' or '{legacy_name}', not both."
16
+ )
17
+ if rate is not None:
18
+ return rate
19
+ if legacy_value is not None:
20
+ return legacy_value
21
+ return default
glitchlings/zoo/core.py CHANGED
@@ -107,6 +107,7 @@ class Glitchling:
107
107
  scope: AttackWave,
108
108
  order: AttackOrder = AttackOrder.NORMAL,
109
109
  seed: int | None = None,
110
+ pipeline_operation: Callable[["Glitchling"], dict[str, Any] | None] | None = None,
110
111
  **kwargs: Any,
111
112
  ) -> None:
112
113
  """Initialize a glitchling.
@@ -128,31 +129,76 @@ class Glitchling:
128
129
  self.corruption_function: CorruptionCallable = corruption_function
129
130
  self.level: AttackWave = scope
130
131
  self.order: AttackOrder = order
132
+ self._pipeline_descriptor_factory = pipeline_operation
131
133
  self.kwargs: dict[str, Any] = {}
134
+ self._cached_rng_callable: CorruptionCallable | None = None
135
+ self._cached_rng_expectation: bool | None = None
132
136
  for kw, val in kwargs.items():
133
137
  self.set_param(kw, val)
134
138
 
135
139
  def set_param(self, key: str, value: Any) -> None:
136
140
  """Persist a parameter for use by the corruption callable."""
137
141
 
138
- setattr(self, key, value)
139
- self.kwargs[key] = value
140
- if key == "seed":
142
+ aliases = getattr(self, "_param_aliases", {})
143
+ canonical = aliases.get(key, key)
144
+
145
+ # Drop stale alias keys so we only forward canonical kwargs.
146
+ self.kwargs.pop(key, None)
147
+ for alias, target in aliases.items():
148
+ if target == canonical:
149
+ self.kwargs.pop(alias, None)
150
+
151
+ self.kwargs[canonical] = value
152
+ setattr(self, canonical, value)
153
+
154
+ if canonical == "seed":
141
155
  self.reset_rng(value)
142
156
 
143
- def __corrupt(self, text: str, *args: Any, **kwargs: Any) -> str:
144
- """Execute the corruption callable, injecting the RNG when required."""
157
+ for alias, target in aliases.items():
158
+ if target == canonical:
159
+ setattr(self, alias, value)
145
160
 
146
- # Pass rng to underlying corruption function if it expects it.
161
+ def pipeline_operation(self) -> dict[str, Any] | None:
162
+ """Return the Rust pipeline operation descriptor for this glitchling."""
163
+
164
+ factory = self._pipeline_descriptor_factory
165
+ if factory is None:
166
+ return None
167
+
168
+ return factory(self)
169
+
170
+ def _corruption_expects_rng(self) -> bool:
171
+ """Return `True` when the corruption function accepts an rng keyword."""
172
+
173
+ cached_callable = self._cached_rng_callable
174
+ cached_expectation = self._cached_rng_expectation
175
+ corruption_function = self.corruption_function
176
+
177
+ if (
178
+ cached_callable is corruption_function
179
+ and cached_expectation is not None
180
+ ):
181
+ return cached_expectation
182
+
183
+ expects_rng = False
147
184
  try:
148
- signature = inspect.signature(self.corruption_function)
185
+ signature = inspect.signature(corruption_function)
149
186
  except (TypeError, ValueError):
150
187
  signature = None
151
188
 
152
- expects_rng = False
153
189
  if signature is not None:
154
190
  expects_rng = "rng" in signature.parameters
155
191
 
192
+ self._cached_rng_callable = corruption_function
193
+ self._cached_rng_expectation = expects_rng
194
+ return expects_rng
195
+
196
+ def __corrupt(self, text: str, *args: Any, **kwargs: Any) -> str:
197
+ """Execute the corruption callable, injecting the RNG when required."""
198
+
199
+ # Pass rng to underlying corruption function if it expects it.
200
+ expects_rng = self._corruption_expects_rng()
201
+
156
202
  if expects_rng:
157
203
  corrupted = self.corruption_function(text, *args, rng=self.rng, **kwargs)
158
204
  else:
@@ -231,53 +277,14 @@ class Glitchling:
231
277
  self.corruption_function,
232
278
  self.level,
233
279
  self.order,
280
+ pipeline_operation=self._pipeline_descriptor_factory,
234
281
  **filtered_kwargs,
235
282
  )
236
283
 
237
284
  return cls(**filtered_kwargs)
238
285
 
239
286
 
240
- def _pipeline_operation_reduplicate(glitchling: "Glitchling") -> dict[str, Any] | None:
241
- rate = glitchling.kwargs.get("reduplication_rate")
242
- if rate is None:
243
- return None
244
- return {"type": "reduplicate", "reduplication_rate": float(rate)}
245
-
246
287
 
247
- def _pipeline_operation_delete(glitchling: "Glitchling") -> dict[str, Any] | None:
248
- rate = glitchling.kwargs.get("max_deletion_rate")
249
- if rate is None:
250
- return None
251
- return {"type": "delete", "max_deletion_rate": float(rate)}
252
-
253
-
254
- def _pipeline_operation_redact(glitchling: "Glitchling") -> dict[str, Any] | None:
255
- replacement_char = glitchling.kwargs.get("replacement_char")
256
- redaction_rate = glitchling.kwargs.get("redaction_rate")
257
- merge_adjacent = glitchling.kwargs.get("merge_adjacent")
258
- if replacement_char is None or redaction_rate is None or merge_adjacent is None:
259
- return None
260
- return {
261
- "type": "redact",
262
- "replacement_char": str(replacement_char),
263
- "redaction_rate": float(redaction_rate),
264
- "merge_adjacent": bool(merge_adjacent),
265
- }
266
-
267
-
268
- def _pipeline_operation_ocr(glitchling: "Glitchling") -> dict[str, Any] | None:
269
- error_rate = glitchling.kwargs.get("error_rate")
270
- if error_rate is None:
271
- return None
272
- return {"type": "ocr", "error_rate": float(error_rate)}
273
-
274
-
275
- _PIPELINE_OPERATION_BUILDERS: dict[str, Callable[["Glitchling"], dict[str, Any] | None]] = {
276
- "Reduple": _pipeline_operation_reduplicate,
277
- "Rushmore": _pipeline_operation_delete,
278
- "Redactyl": _pipeline_operation_redact,
279
- "Scannequin": _pipeline_operation_ocr,
280
- }
281
288
 
282
289
 
283
290
  class Gaggle(Glitchling):
@@ -359,10 +366,7 @@ class Gaggle(Glitchling):
359
366
 
360
367
  descriptors: list[dict[str, Any]] = []
361
368
  for glitchling in self.apply_order:
362
- builder = _PIPELINE_OPERATION_BUILDERS.get(glitchling.name)
363
- if builder is None:
364
- return None
365
- operation = builder(glitchling)
369
+ operation = glitchling.pipeline_operation()
366
370
  if operation is None:
367
371
  return None
368
372