glitchlings 0.2.5__cp312-cp312-win_amd64.whl → 0.9.3__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. glitchlings/__init__.py +36 -17
  2. glitchlings/__main__.py +0 -1
  3. glitchlings/_zoo_rust/__init__.py +12 -0
  4. glitchlings/_zoo_rust.cp312-win_amd64.pyd +0 -0
  5. glitchlings/assets/__init__.py +180 -0
  6. glitchlings/assets/apostrofae_pairs.json +32 -0
  7. glitchlings/assets/ekkokin_homophones.json +2014 -0
  8. glitchlings/assets/hokey_assets.json +193 -0
  9. glitchlings/assets/lexemes/academic.json +1049 -0
  10. glitchlings/assets/lexemes/colors.json +1333 -0
  11. glitchlings/assets/lexemes/corporate.json +716 -0
  12. glitchlings/assets/lexemes/cyberpunk.json +22 -0
  13. glitchlings/assets/lexemes/lovecraftian.json +23 -0
  14. glitchlings/assets/lexemes/synonyms.json +3354 -0
  15. glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
  16. glitchlings/assets/pipeline_assets.json +29 -0
  17. glitchlings/attack/__init__.py +53 -0
  18. glitchlings/attack/compose.py +299 -0
  19. glitchlings/attack/core.py +465 -0
  20. glitchlings/attack/encode.py +114 -0
  21. glitchlings/attack/metrics.py +104 -0
  22. glitchlings/attack/metrics_dispatch.py +70 -0
  23. glitchlings/attack/tokenization.py +157 -0
  24. glitchlings/auggie.py +283 -0
  25. glitchlings/compat/__init__.py +9 -0
  26. glitchlings/compat/loaders.py +355 -0
  27. glitchlings/compat/types.py +41 -0
  28. glitchlings/conf/__init__.py +41 -0
  29. glitchlings/conf/loaders.py +331 -0
  30. glitchlings/conf/schema.py +156 -0
  31. glitchlings/conf/types.py +72 -0
  32. glitchlings/config.toml +2 -0
  33. glitchlings/constants.py +59 -0
  34. glitchlings/dev/__init__.py +3 -0
  35. glitchlings/dev/docs.py +45 -0
  36. glitchlings/dlc/__init__.py +17 -3
  37. glitchlings/dlc/_shared.py +296 -0
  38. glitchlings/dlc/gutenberg.py +400 -0
  39. glitchlings/dlc/huggingface.py +37 -65
  40. glitchlings/dlc/prime.py +55 -114
  41. glitchlings/dlc/pytorch.py +98 -0
  42. glitchlings/dlc/pytorch_lightning.py +173 -0
  43. glitchlings/internal/__init__.py +16 -0
  44. glitchlings/internal/rust.py +159 -0
  45. glitchlings/internal/rust_ffi.py +432 -0
  46. glitchlings/main.py +123 -32
  47. glitchlings/runtime_config.py +24 -0
  48. glitchlings/util/__init__.py +29 -176
  49. glitchlings/util/adapters.py +65 -0
  50. glitchlings/util/keyboards.py +311 -0
  51. glitchlings/util/transcripts.py +108 -0
  52. glitchlings/zoo/__init__.py +47 -24
  53. glitchlings/zoo/assets/__init__.py +29 -0
  54. glitchlings/zoo/core.py +301 -167
  55. glitchlings/zoo/core_execution.py +98 -0
  56. glitchlings/zoo/core_planning.py +451 -0
  57. glitchlings/zoo/corrupt_dispatch.py +295 -0
  58. glitchlings/zoo/ekkokin.py +118 -0
  59. glitchlings/zoo/hokey.py +137 -0
  60. glitchlings/zoo/jargoyle.py +179 -274
  61. glitchlings/zoo/mim1c.py +106 -68
  62. glitchlings/zoo/pedant/__init__.py +107 -0
  63. glitchlings/zoo/pedant/core.py +105 -0
  64. glitchlings/zoo/pedant/forms.py +74 -0
  65. glitchlings/zoo/pedant/stones.py +74 -0
  66. glitchlings/zoo/redactyl.py +44 -175
  67. glitchlings/zoo/rng.py +259 -0
  68. glitchlings/zoo/rushmore.py +359 -116
  69. glitchlings/zoo/scannequin.py +18 -125
  70. glitchlings/zoo/transforms.py +386 -0
  71. glitchlings/zoo/typogre.py +76 -162
  72. glitchlings/zoo/validation.py +477 -0
  73. glitchlings/zoo/zeedub.py +33 -86
  74. glitchlings-0.9.3.dist-info/METADATA +334 -0
  75. glitchlings-0.9.3.dist-info/RECORD +80 -0
  76. {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/entry_points.txt +1 -0
  77. glitchlings/zoo/_ocr_confusions.py +0 -34
  78. glitchlings/zoo/_rate.py +0 -21
  79. glitchlings/zoo/reduple.py +0 -169
  80. glitchlings-0.2.5.dist-info/METADATA +0 -490
  81. glitchlings-0.2.5.dist-info/RECORD +0 -27
  82. /glitchlings/{zoo → assets}/ocr_confusions.tsv +0 -0
  83. {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/WHEEL +0 -0
  84. {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/licenses/LICENSE +0 -0
  85. {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/top_level.txt +0 -0
glitchlings/zoo/scannequin.py
@@ -1,98 +1,10 @@
-import re
 import random
-from typing import Any
+from typing import cast
 
-from ._ocr_confusions import load_confusion_table
-from .core import Glitchling, AttackWave, AttackOrder
-from ._rate import resolve_rate
+from glitchlings.constants import DEFAULT_SCANNEQUIN_RATE
+from glitchlings.internal.rust_ffi import ocr_artifacts_rust, resolve_seed
 
-try:
-    from glitchlings._zoo_rust import ocr_artifacts as _ocr_artifacts_rust
-except ImportError:  # pragma: no cover - compiled extension not present
-    _ocr_artifacts_rust = None
-
-
-def _python_ocr_artifacts(
-    text: str,
-    *,
-    rate: float,
-    rng: random.Random,
-) -> str:
-    """Introduce OCR-like artifacts into text.
-
-    Parameters
-    - text: Input text to corrupt.
-    - rate: Max proportion of eligible confusion matches to replace (default 0.02).
-    - seed: Optional seed if `rng` not provided.
-    - rng: Optional RNG; overrides seed.
-
-    Notes
-    - Uses a curated set of common OCR confusions (rn↔m, cl↔d, O↔0, l/I/1, etc.).
-    - Collects all non-overlapping candidate spans in reading order, then samples
-      a subset deterministically with the provided RNG.
-    - Replacements can change length (e.g., m→rn), so edits are applied from left
-      to right using precomputed spans to avoid index drift.
-    """
-    if not text:
-        return text
-
-    # Keep the confusion definitions in a shared data file so both the Python
-    # and Rust implementations stay in sync.
-    confusion_table = load_confusion_table()
-
-    # Build candidate matches as (start, end, choices)
-    candidates: list[tuple[int, int, list[str]]] = []
-
-    # To avoid double-counting overlapping patterns (like 'l' inside 'li'),
-    # we will scan longer patterns first by sorting by len(src) desc.
-    for src, choices in sorted(confusion_table, key=lambda p: -len(p[0])):
-        pattern = re.escape(src)
-        for m in re.finditer(pattern, text):
-            start, end = m.span()
-            candidates.append((start, end, choices))
-
-    if not candidates:
-        return text
-
-    # Decide how many to replace
-    k = int(len(candidates) * rate)
-    if k <= 0:
-        return text
-
-    # Shuffle deterministically and select non-overlapping k spans
-    rng.shuffle(candidates)
-    chosen: list[tuple[int, int, str]] = []
-    occupied: list[tuple[int, int]] = []
-
-    def overlaps(a: tuple[int, int], b: tuple[int, int]) -> bool:
-        return not (a[1] <= b[0] or b[1] <= a[0])
-
-    for start, end, choices in candidates:
-        if len(chosen) >= k:
-            break
-        span = (start, end)
-        if any(overlaps(span, occ) for occ in occupied):
-            continue
-        replacement = rng.choice(choices)
-        chosen.append((start, end, replacement))
-        occupied.append(span)
-
-    if not chosen:
-        return text
-
-    # Apply edits from left to right
-    chosen.sort(key=lambda t: t[0])
-    out_parts = []
-    cursor = 0
-    for start, end, rep in chosen:
-        if cursor < start:
-            out_parts.append(text[cursor:start])
-        out_parts.append(rep)
-        cursor = end
-    if cursor < len(text):
-        out_parts.append(text[cursor:])
-
-    return "".join(out_parts)
+from .core import AttackOrder, AttackWave, Glitchling, PipelineOperationPayload
 
 
 def ocr_artifacts(
@@ -100,52 +12,33 @@ def ocr_artifacts(
     rate: float | None = None,
     seed: int | None = None,
     rng: random.Random | None = None,
-    *,
-    error_rate: float | None = None,
 ) -> str:
     """Introduce OCR-like artifacts into text.
 
-    Prefers the Rust implementation when available.
+    Uses the Rust implementation for performance and determinism.
     """
-
     if not text:
         return text
 
-    effective_rate = resolve_rate(
-        rate=rate,
-        legacy_value=error_rate,
-        default=0.02,
-        legacy_name="error_rate",
-    )
-
-    if rng is None:
-        rng = random.Random(seed)
+    effective_rate = DEFAULT_SCANNEQUIN_RATE if rate is None else rate
 
     clamped_rate = max(0.0, effective_rate)
 
-    if _ocr_artifacts_rust is not None:
-        return _ocr_artifacts_rust(text, clamped_rate, rng)
-
-    return _python_ocr_artifacts(text, rate=clamped_rate, rng=rng)
+    return ocr_artifacts_rust(text, clamped_rate, resolve_seed(seed, rng))
 
 
 class Scannequin(Glitchling):
     """Glitchling that simulates OCR artifacts using common confusions."""
 
+    flavor = "Isn't it weird how the word 'bed' looks like a bed?"
+
     def __init__(
         self,
         *,
         rate: float | None = None,
-        error_rate: float | None = None,
         seed: int | None = None,
     ) -> None:
-        self._param_aliases = {"error_rate": "rate"}
-        effective_rate = resolve_rate(
-            rate=rate,
-            legacy_value=error_rate,
-            default=0.02,
-            legacy_name="error_rate",
-        )
+        effective_rate = DEFAULT_SCANNEQUIN_RATE if rate is None else rate
         super().__init__(
            name="Scannequin",
            corruption_function=ocr_artifacts,
@@ -155,17 +48,17 @@ class Scannequin(Glitchling):
            rate=effective_rate,
        )
 
-    def pipeline_operation(self) -> dict[str, Any] | None:
-        rate = self.kwargs.get("rate")
-        if rate is None:
-            rate = self.kwargs.get("error_rate")
-        if rate is None:
-            return None
-        return {"type": "ocr", "error_rate": float(rate)}
+    def pipeline_operation(self) -> PipelineOperationPayload:
+        rate_value = self.kwargs.get("rate", DEFAULT_SCANNEQUIN_RATE)
+        rate = DEFAULT_SCANNEQUIN_RATE if rate_value is None else float(rate_value)
 
+        return cast(
+            PipelineOperationPayload,
+            {"type": "ocr", "rate": rate},
+        )
 
 
 scannequin = Scannequin()
 
 
-__all__ = ["Scannequin", "scannequin"]
+__all__ = ["Scannequin", "scannequin", "ocr_artifacts"]
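
For orientation, a minimal usage sketch of the reworked Scannequin surface shown above. The sample text and rate are illustrative; the call pattern is inferred from the new signatures, and the 0.9.3 build requires the bundled Rust extension:

from glitchlings.zoo.scannequin import Scannequin, ocr_artifacts

# Functional form: rate falls back to DEFAULT_SCANNEQUIN_RATE when omitted,
# and the seed keeps the Rust-backed corruption deterministic.
noisy = ocr_artifacts("The model output looked clean.", rate=0.05, seed=42)

# Class form: 0.9.3 drops the legacy error_rate alias, so callers pass rate directly.
glitch = Scannequin(rate=0.05, seed=42)
payload = glitch.pipeline_operation()  # {"type": "ocr", "rate": 0.05}
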
glitchlings/zoo/transforms.py (new file)
@@ -0,0 +1,386 @@
+"""Pure text transformation functions.
+
+This module contains text manipulation functions that are:
+- **Pure**: Output depends only on inputs, no side effects
+- **Deterministic**: Same inputs always produce same outputs
+- **Self-contained**: No RNG, no Rust FFI, no config loading
+
+These functions receive pre-validated inputs from boundary layers
+(see validation.py) and trust that inputs are already checked.
+Core transformation code should NOT re-validate parameters.
+
+Design Philosophy
+-----------------
+This module implements the innermost layer of the purity architecture:
+
+    CLI/API → validation.py → transforms.py → Rust FFI
+    (boundary)   (boundary)     (pure core)    (impure)
+
+Functions here should:
+- Accept concrete types (not Optional unless semantically required)
+- Not log, print, or mutate external state
+- Not import impure modules (internal.rust, config loaders, etc.)
+- Document any preconditions callers must satisfy
+
+See AGENTS.md "Functional Purity Architecture" for full details.
+"""
+
+from __future__ import annotations
+
+import re
+from collections.abc import Iterable, Mapping, Sequence
+from dataclasses import dataclass
+from typing import TypeVar, cast
+
+# ---------------------------------------------------------------------------
+# Text Tokenization
+# ---------------------------------------------------------------------------
+
+_WORD_SPLIT_PATTERN = re.compile(r"(\s+)")
+_TOKEN_EDGES_PATTERN = re.compile(r"^(\W*)(.*?)(\W*)$", re.DOTALL)
+
+
+def split_preserving_whitespace(text: str) -> list[str]:
+    """Split text while keeping whitespace tokens for stable reconstruction.
+
+    Returns alternating [word, whitespace, word, whitespace, ...] tokens.
+    Joining the result reconstructs the original text exactly.
+
+    Args:
+        text: Input text to tokenize.
+
+    Returns:
+        List of tokens alternating between non-whitespace and whitespace.
+
+    Example:
+        >>> split_preserving_whitespace("hello world")
+        ['hello', ' ', 'world']
+    """
+    return _WORD_SPLIT_PATTERN.split(text)
+
+
+def split_token_edges(token: str) -> tuple[str, str, str]:
+    """Decompose a token into leading punctuation, core, and trailing punctuation.
+
+    Args:
+        token: A non-whitespace token.
+
+    Returns:
+        Tuple of (prefix, core, suffix) where:
+        - prefix: leading non-word characters
+        - core: central word characters
+        - suffix: trailing non-word characters
+
+    Example:
+        >>> split_token_edges('"Hello!"')
+        ('"', 'Hello', '!"')
+    """
+    match = cast(re.Match[str], _TOKEN_EDGES_PATTERN.match(token))
+    prefix, core, suffix = match.groups()
+    return prefix, core, suffix
+
+
+def compute_core_length(token: str) -> int:
+    """Compute the effective length of a token's core for weighting heuristics.
+
+    Used by weighted sampling algorithms to prioritize longer words.
+    Always returns at least 1 to avoid zero-weight issues.
+
+    Args:
+        token: A non-whitespace token.
+
+    Returns:
+        Positive integer representing the token's effective length.
+    """
+    _, core, _ = split_token_edges(token)
+    if core:
+        return len(core)
+    stripped = token.strip()
+    if stripped:
+        return len(stripped)
+    if token:
+        return len(token)
+    return 1
+
+
+@dataclass(frozen=True)
+class WordToken:
+    """Metadata describing a non-whitespace token from text tokenization.
+
+    Attributes:
+        index: Position in the parent token sequence.
+        prefix: Leading non-word characters (punctuation).
+        core: Central word characters.
+        suffix: Trailing non-word characters (punctuation).
+        core_length: Effective length for weighting (always >= 1).
+    """
+
+    index: int
+    prefix: str
+    core: str
+    suffix: str
+    core_length: int
+
+    @property
+    def has_core(self) -> bool:
+        """Return True when the token contains at least one core character."""
+        return bool(self.core)
+
+
+def collect_word_tokens(
+    tokens: Sequence[str],
+    *,
+    skip_first_word: bool = False,
+) -> list[WordToken]:
+    """Extract structured metadata for non-whitespace tokens.
+
+    Args:
+        tokens: Token sequence from split_preserving_whitespace.
+        skip_first_word: If True, exclude the first content token
+            (useful for preserving leading words in delete operations).
+
+    Returns:
+        List of WordToken instances for each non-whitespace token.
+    """
+    start = 2 if skip_first_word else 0
+    collected: list[WordToken] = []
+
+    for index in range(start, len(tokens), 2):
+        token = tokens[index]
+        if not token or token.isspace():
+            continue
+
+        prefix, core, suffix = split_token_edges(token)
+        core_length = compute_core_length(token)
+
+        collected.append(
+            WordToken(
+                index=index,
+                prefix=prefix,
+                core=core,
+                suffix=suffix,
+                core_length=core_length,
+            )
+        )
+
+    return collected
+
+
+def reassemble_tokens(tokens: Sequence[str]) -> str:
+    """Join tokens back into text, preserving original structure.
+
+    Args:
+        tokens: Token sequence (typically modified from split_preserving_whitespace).
+
+    Returns:
+        Reassembled text string.
+    """
+    return "".join(tokens)
+
+
+# ---------------------------------------------------------------------------
+# Keyboard Layout Processing
+# ---------------------------------------------------------------------------
+
+
+KeyNeighborMap = dict[str, list[str]]
+
+
+def build_keyboard_neighbor_map(rows: Iterable[str]) -> KeyNeighborMap:
+    """Derive 8-neighbour adjacency lists from keyboard layout rows.
+
+    Each row represents a keyboard row with characters positioned by index.
+    Spaces are treated as empty positions. Characters are normalized to lowercase.
+
+    Args:
+        rows: Iterable of strings representing keyboard rows, with
+            characters positioned to reflect their physical layout.
+
+    Returns:
+        Dictionary mapping each lowercase character to its adjacent characters.
+
+    Example:
+        >>> rows = ["qwerty", " asdfg"]  # 'a' offset by 1
+        >>> neighbors = build_keyboard_neighbor_map(rows)
+        >>> neighbors['s']  # adjacent to q, w, e, a, d on QWERTY
+        ['q', 'w', 'e', 'a', 'd']
+    """
+    grid: dict[tuple[int, int], str] = {}
+    for y, row in enumerate(rows):
+        for x, char in enumerate(row):
+            if char == " ":
+                continue
+            grid[(x, y)] = char.lower()
+
+    neighbors: KeyNeighborMap = {}
+    for (x, y), char in grid.items():
+        seen: list[str] = []
+        for dy in (-1, 0, 1):
+            for dx in (-1, 0, 1):
+                if dx == 0 and dy == 0:
+                    continue
+                candidate = grid.get((x + dx, y + dy))
+                if candidate is None:
+                    continue
+                seen.append(candidate)
+        # Preserve encounter order but drop duplicates for determinism
+        deduped = list(dict.fromkeys(seen))
+        neighbors[char] = deduped
+
+    return neighbors
+
+
+# ---------------------------------------------------------------------------
+# String Difference Computation
+# ---------------------------------------------------------------------------
+
+
+def compute_string_diffs(
+    original: str,
+    modified: str,
+) -> list[list[tuple[str, str, str]]]:
+    """Compare two strings and return grouped adjacent change operations.
+
+    Uses difflib's SequenceMatcher to identify changes between strings.
+    Consecutive changes are grouped together; equal regions are skipped.
+
+    Args:
+        original: The original string.
+        modified: The modified string.
+
+    Returns:
+        List of change groups. Each group is a list of (tag, old_text, new_text)
+        tuples where tag is 'replace', 'delete', or 'insert'.
+
+    Example:
+        >>> compute_string_diffs("hello world", "helo worlds")
+        [[('delete', 'l', '')], [('replace', '', 's')]]
+    """
+    import difflib
+
+    sm = difflib.SequenceMatcher(None, original, modified)
+    ops: list[list[tuple[str, str, str]]] = []
+    buffer: list[tuple[str, str, str]] = []
+
+    for tag, i1, i2, j1, j2 in sm.get_opcodes():
+        if tag == "equal":
+            if buffer:
+                ops.append(buffer)
+                buffer = []
+            continue
+        buffer.append((tag, original[i1:i2], modified[j1:j2]))
+
+    if buffer:
+        ops.append(buffer)
+
+    return ops
+
+
+# ---------------------------------------------------------------------------
+# Sequence Operations
+# ---------------------------------------------------------------------------
+
+T = TypeVar("T")
+
+
+def stable_deduplicate(items: Iterable[T]) -> list[T]:
+    """Remove duplicates while preserving original order.
+
+    Args:
+        items: Iterable of hashable items.
+
+    Returns:
+        List with duplicates removed, first occurrence preserved.
+
+    Example:
+        >>> stable_deduplicate([3, 1, 4, 1, 5, 9, 2, 6, 5])
+        [3, 1, 4, 5, 9, 2, 6]
+    """
+    seen: set[T] = set()
+    result: list[T] = []
+    for item in items:
+        if item not in seen:
+            seen.add(item)
+            result.append(item)
+    return result
+
+
+def interleave_lists(
+    primary: Sequence[T],
+    secondary: Sequence[T],
+    *,
+    secondary_first: bool = False,
+) -> list[T]:
+    """Interleave two sequences, padding shorter with empty slots.
+
+    Args:
+        primary: First sequence.
+        secondary: Second sequence.
+        secondary_first: If True, start with secondary element.
+
+    Returns:
+        Interleaved list [p0, s0, p1, s1, ...] or [s0, p0, s1, p1, ...].
+    """
+    result: list[T] = []
+    max_len = max(len(primary), len(secondary))
+
+    for i in range(max_len):
+        if secondary_first:
+            if i < len(secondary):
+                result.append(secondary[i])
+            if i < len(primary):
+                result.append(primary[i])
+        else:
+            if i < len(primary):
+                result.append(primary[i])
+            if i < len(secondary):
+                result.append(secondary[i])
+
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Mapping Helpers
+# ---------------------------------------------------------------------------
+
+
+def invert_mapping(
+    mapping: Mapping[str, Sequence[str]],
+) -> dict[str, str]:
+    """Invert a one-to-many mapping into a many-to-one lookup.
+
+    Given {key: [val1, val2]}, returns {val1: key, val2: key}.
+    Later keys overwrite earlier ones if values collide.
+
+    Args:
+        mapping: Dictionary mapping keys to sequences of values.
+
+    Returns:
+        Inverted dictionary mapping each value to its key.
+    """
+    inverted: dict[str, str] = {}
+    for key, values in mapping.items():
+        for value in values:
+            inverted[value] = key
+    return inverted
+
+
+__all__ = [
+    # Tokenization
+    "split_preserving_whitespace",
+    "split_token_edges",
+    "compute_core_length",
+    "WordToken",
+    "collect_word_tokens",
+    "reassemble_tokens",
+    # Keyboard
+    "KeyNeighborMap",
+    "build_keyboard_neighbor_map",
+    # Diffs
+    "compute_string_diffs",
+    # Sequences
+    "stable_deduplicate",
+    "interleave_lists",
+    # Mappings
+    "invert_mapping",
+]
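
To illustrate how these pure helpers compose, here is a small sketch that round-trips a sample sentence through the tokenization utilities and builds a keyboard adjacency map; it uses only the functions added in this file, and the sample strings and layout rows are illustrative only:

from glitchlings.zoo.transforms import (
    build_keyboard_neighbor_map,
    collect_word_tokens,
    reassemble_tokens,
    split_preserving_whitespace,
)

# Tokenize while keeping whitespace so the text can be rebuilt losslessly.
text = 'Well, "hello" there!'
tokens = split_preserving_whitespace(text)
words = collect_word_tokens(tokens)  # WordToken metadata for non-whitespace tokens
heaviest = max(words, key=lambda w: w.core_length)  # longest core wins weighted sampling
assert reassemble_tokens(tokens) == text

# Derive a QWERTY-style adjacency map for typo-style neighbour lookups.
neighbors = build_keyboard_neighbor_map(["qwertyuiop", " asdfghjkl", "  zxcvbnm"])
print(neighbors["g"])  # characters physically adjacent to 'g' in this layout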