glitchlings-0.2.5-cp312-cp312-win_amd64.whl → glitchlings-0.9.3-cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings/__init__.py +36 -17
- glitchlings/__main__.py +0 -1
- glitchlings/_zoo_rust/__init__.py +12 -0
- glitchlings/_zoo_rust.cp312-win_amd64.pyd +0 -0
- glitchlings/assets/__init__.py +180 -0
- glitchlings/assets/apostrofae_pairs.json +32 -0
- glitchlings/assets/ekkokin_homophones.json +2014 -0
- glitchlings/assets/hokey_assets.json +193 -0
- glitchlings/assets/lexemes/academic.json +1049 -0
- glitchlings/assets/lexemes/colors.json +1333 -0
- glitchlings/assets/lexemes/corporate.json +716 -0
- glitchlings/assets/lexemes/cyberpunk.json +22 -0
- glitchlings/assets/lexemes/lovecraftian.json +23 -0
- glitchlings/assets/lexemes/synonyms.json +3354 -0
- glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
- glitchlings/assets/pipeline_assets.json +29 -0
- glitchlings/attack/__init__.py +53 -0
- glitchlings/attack/compose.py +299 -0
- glitchlings/attack/core.py +465 -0
- glitchlings/attack/encode.py +114 -0
- glitchlings/attack/metrics.py +104 -0
- glitchlings/attack/metrics_dispatch.py +70 -0
- glitchlings/attack/tokenization.py +157 -0
- glitchlings/auggie.py +283 -0
- glitchlings/compat/__init__.py +9 -0
- glitchlings/compat/loaders.py +355 -0
- glitchlings/compat/types.py +41 -0
- glitchlings/conf/__init__.py +41 -0
- glitchlings/conf/loaders.py +331 -0
- glitchlings/conf/schema.py +156 -0
- glitchlings/conf/types.py +72 -0
- glitchlings/config.toml +2 -0
- glitchlings/constants.py +59 -0
- glitchlings/dev/__init__.py +3 -0
- glitchlings/dev/docs.py +45 -0
- glitchlings/dlc/__init__.py +17 -3
- glitchlings/dlc/_shared.py +296 -0
- glitchlings/dlc/gutenberg.py +400 -0
- glitchlings/dlc/huggingface.py +37 -65
- glitchlings/dlc/prime.py +55 -114
- glitchlings/dlc/pytorch.py +98 -0
- glitchlings/dlc/pytorch_lightning.py +173 -0
- glitchlings/internal/__init__.py +16 -0
- glitchlings/internal/rust.py +159 -0
- glitchlings/internal/rust_ffi.py +432 -0
- glitchlings/main.py +123 -32
- glitchlings/runtime_config.py +24 -0
- glitchlings/util/__init__.py +29 -176
- glitchlings/util/adapters.py +65 -0
- glitchlings/util/keyboards.py +311 -0
- glitchlings/util/transcripts.py +108 -0
- glitchlings/zoo/__init__.py +47 -24
- glitchlings/zoo/assets/__init__.py +29 -0
- glitchlings/zoo/core.py +301 -167
- glitchlings/zoo/core_execution.py +98 -0
- glitchlings/zoo/core_planning.py +451 -0
- glitchlings/zoo/corrupt_dispatch.py +295 -0
- glitchlings/zoo/ekkokin.py +118 -0
- glitchlings/zoo/hokey.py +137 -0
- glitchlings/zoo/jargoyle.py +179 -274
- glitchlings/zoo/mim1c.py +106 -68
- glitchlings/zoo/pedant/__init__.py +107 -0
- glitchlings/zoo/pedant/core.py +105 -0
- glitchlings/zoo/pedant/forms.py +74 -0
- glitchlings/zoo/pedant/stones.py +74 -0
- glitchlings/zoo/redactyl.py +44 -175
- glitchlings/zoo/rng.py +259 -0
- glitchlings/zoo/rushmore.py +359 -116
- glitchlings/zoo/scannequin.py +18 -125
- glitchlings/zoo/transforms.py +386 -0
- glitchlings/zoo/typogre.py +76 -162
- glitchlings/zoo/validation.py +477 -0
- glitchlings/zoo/zeedub.py +33 -86
- glitchlings-0.9.3.dist-info/METADATA +334 -0
- glitchlings-0.9.3.dist-info/RECORD +80 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/entry_points.txt +1 -0
- glitchlings/zoo/_ocr_confusions.py +0 -34
- glitchlings/zoo/_rate.py +0 -21
- glitchlings/zoo/reduple.py +0 -169
- glitchlings-0.2.5.dist-info/METADATA +0 -490
- glitchlings-0.2.5.dist-info/RECORD +0 -27
- glitchlings/{zoo → assets}/ocr_confusions.tsv +0 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/WHEEL +0 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/top_level.txt +0 -0
glitchlings/zoo/scannequin.py
CHANGED
@@ -1,98 +1,10 @@
-import re
 import random
-from typing import
+from typing import cast
 
-from .
-from .
-from ._rate import resolve_rate
+from glitchlings.constants import DEFAULT_SCANNEQUIN_RATE
+from glitchlings.internal.rust_ffi import ocr_artifacts_rust, resolve_seed
 
-try:
-    from glitchlings._zoo_rust import ocr_artifacts as _ocr_artifacts_rust
-except ImportError:  # pragma: no cover - compiled extension not present
-    _ocr_artifacts_rust = None
-
-
-def _python_ocr_artifacts(
-    text: str,
-    *,
-    rate: float,
-    rng: random.Random,
-) -> str:
-    """Introduce OCR-like artifacts into text.
-
-    Parameters
-    - text: Input text to corrupt.
-    - rate: Max proportion of eligible confusion matches to replace (default 0.02).
-    - seed: Optional seed if `rng` not provided.
-    - rng: Optional RNG; overrides seed.
-
-    Notes
-    - Uses a curated set of common OCR confusions (rn↔m, cl↔d, O↔0, l/I/1, etc.).
-    - Collects all non-overlapping candidate spans in reading order, then samples
-      a subset deterministically with the provided RNG.
-    - Replacements can change length (e.g., m→rn), so edits are applied from left
-      to right using precomputed spans to avoid index drift.
-    """
-    if not text:
-        return text
-
-    # Keep the confusion definitions in a shared data file so both the Python
-    # and Rust implementations stay in sync.
-    confusion_table = load_confusion_table()
-
-    # Build candidate matches as (start, end, choices)
-    candidates: list[tuple[int, int, list[str]]] = []
-
-    # To avoid double-counting overlapping patterns (like 'l' inside 'li'),
-    # we will scan longer patterns first by sorting by len(src) desc.
-    for src, choices in sorted(confusion_table, key=lambda p: -len(p[0])):
-        pattern = re.escape(src)
-        for m in re.finditer(pattern, text):
-            start, end = m.span()
-            candidates.append((start, end, choices))
-
-    if not candidates:
-        return text
-
-    # Decide how many to replace
-    k = int(len(candidates) * rate)
-    if k <= 0:
-        return text
-
-    # Shuffle deterministically and select non-overlapping k spans
-    rng.shuffle(candidates)
-    chosen: list[tuple[int, int, str]] = []
-    occupied: list[tuple[int, int]] = []
-
-    def overlaps(a: tuple[int, int], b: tuple[int, int]) -> bool:
-        return not (a[1] <= b[0] or b[1] <= a[0])
-
-    for start, end, choices in candidates:
-        if len(chosen) >= k:
-            break
-        span = (start, end)
-        if any(overlaps(span, occ) for occ in occupied):
-            continue
-        replacement = rng.choice(choices)
-        chosen.append((start, end, replacement))
-        occupied.append(span)
-
-    if not chosen:
-        return text
-
-    # Apply edits from left to right
-    chosen.sort(key=lambda t: t[0])
-    out_parts = []
-    cursor = 0
-    for start, end, rep in chosen:
-        if cursor < start:
-            out_parts.append(text[cursor:start])
-        out_parts.append(rep)
-        cursor = end
-    if cursor < len(text):
-        out_parts.append(text[cursor:])
-
-    return "".join(out_parts)
+from .core import AttackOrder, AttackWave, Glitchling, PipelineOperationPayload
 
 
 def ocr_artifacts(
@@ -100,52 +12,33 @@ def ocr_artifacts(
     rate: float | None = None,
     seed: int | None = None,
     rng: random.Random | None = None,
-    *,
-    error_rate: float | None = None,
 ) -> str:
     """Introduce OCR-like artifacts into text.
 
-
+    Uses the Rust implementation for performance and determinism.
     """
-
     if not text:
         return text
 
-    effective_rate = resolve_rate(
-        rate=rate,
-        legacy_value=error_rate,
-        default=0.02,
-        legacy_name="error_rate",
-    )
-
-    if rng is None:
-        rng = random.Random(seed)
+    effective_rate = DEFAULT_SCANNEQUIN_RATE if rate is None else rate
 
     clamped_rate = max(0.0, effective_rate)
 
-    if _ocr_artifacts_rust is not None:
-        return _ocr_artifacts_rust(text, clamped_rate, rng)
-
-    return _python_ocr_artifacts(text, rate=clamped_rate, rng=rng)
+    return ocr_artifacts_rust(text, clamped_rate, resolve_seed(seed, rng))
 
 
 class Scannequin(Glitchling):
     """Glitchling that simulates OCR artifacts using common confusions."""
 
+    flavor = "Isn't it weird how the word 'bed' looks like a bed?"
+
     def __init__(
         self,
         *,
         rate: float | None = None,
-        error_rate: float | None = None,
         seed: int | None = None,
     ) -> None:
-
-        effective_rate = resolve_rate(
-            rate=rate,
-            legacy_value=error_rate,
-            default=0.02,
-            legacy_name="error_rate",
-        )
+        effective_rate = DEFAULT_SCANNEQUIN_RATE if rate is None else rate
         super().__init__(
             name="Scannequin",
            corruption_function=ocr_artifacts,
@@ -155,17 +48,17 @@ class Scannequin(Glitchling):
             rate=effective_rate,
         )
 
-    def pipeline_operation(self) ->
-
-        if
-            rate = self.kwargs.get("error_rate")
-        if rate is None:
-            return None
-        return {"type": "ocr", "error_rate": float(rate)}
+    def pipeline_operation(self) -> PipelineOperationPayload:
+        rate_value = self.kwargs.get("rate", DEFAULT_SCANNEQUIN_RATE)
+        rate = DEFAULT_SCANNEQUIN_RATE if rate_value is None else float(rate_value)
 
+        return cast(
+            PipelineOperationPayload,
+            {"type": "ocr", "rate": rate},
+        )
 
 
 scannequin = Scannequin()
 
 
-__all__ = ["Scannequin", "scannequin"]
+__all__ = ["Scannequin", "scannequin", "ocr_artifacts"]
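Net effect of the Scannequin changes: the pure-Python fallback and the deprecated `error_rate` keyword are gone, and `ocr_artifacts` always routes through the Rust extension via `ocr_artifacts_rust`, with the seed resolved by `resolve_seed`. A minimal usage sketch of the new 0.9.3 surface, using only names visible in this diff (the corrupted output is illustrative, not a recorded result):

```python
from glitchlings.zoo.scannequin import Scannequin, ocr_artifacts

# Functional form: rate falls back to DEFAULT_SCANNEQUIN_RATE when None;
# a fixed seed makes the corruption reproducible.
noisy = ocr_artifacts("The model learned to read.", rate=0.05, seed=7)

# Class form: assuming Glitchling records its keyword arguments in
# self.kwargs (as pipeline_operation() above implies), the pipeline
# payload for this instance would be {"type": "ocr", "rate": 0.05}.
glitch = Scannequin(rate=0.05, seed=7)
op = glitch.pipeline_operation()
```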
glitchlings/zoo/transforms.py
ADDED
@@ -0,0 +1,386 @@
+"""Pure text transformation functions.
+
+This module contains text manipulation functions that are:
+- **Pure**: Output depends only on inputs, no side effects
+- **Deterministic**: Same inputs always produce same outputs
+- **Self-contained**: No RNG, no Rust FFI, no config loading
+
+These functions receive pre-validated inputs from boundary layers
+(see validation.py) and trust that inputs are already checked.
+Core transformation code should NOT re-validate parameters.
+
+Design Philosophy
+-----------------
+This module implements the innermost layer of the purity architecture:
+
+    CLI/API → validation.py → transforms.py → Rust FFI
+    (boundary)  (boundary)     (pure core)     (impure)
+
+Functions here should:
+- Accept concrete types (not Optional unless semantically required)
+- Not log, print, or mutate external state
+- Not import impure modules (internal.rust, config loaders, etc.)
+- Document any preconditions callers must satisfy
+
+See AGENTS.md "Functional Purity Architecture" for full details.
+"""
+
+from __future__ import annotations
+
+import re
+from collections.abc import Iterable, Mapping, Sequence
+from dataclasses import dataclass
+from typing import TypeVar, cast
+
+# ---------------------------------------------------------------------------
+# Text Tokenization
+# ---------------------------------------------------------------------------
+
+_WORD_SPLIT_PATTERN = re.compile(r"(\s+)")
+_TOKEN_EDGES_PATTERN = re.compile(r"^(\W*)(.*?)(\W*)$", re.DOTALL)
+
+
+def split_preserving_whitespace(text: str) -> list[str]:
+    """Split text while keeping whitespace tokens for stable reconstruction.
+
+    Returns alternating [word, whitespace, word, whitespace, ...] tokens.
+    Joining the result reconstructs the original text exactly.
+
+    Args:
+        text: Input text to tokenize.
+
+    Returns:
+        List of tokens alternating between non-whitespace and whitespace.
+
+    Example:
+        >>> split_preserving_whitespace("hello world")
+        ['hello', ' ', 'world']
+    """
+    return _WORD_SPLIT_PATTERN.split(text)
+
+
+def split_token_edges(token: str) -> tuple[str, str, str]:
+    """Decompose a token into leading punctuation, core, and trailing punctuation.
+
+    Args:
+        token: A non-whitespace token.
+
+    Returns:
+        Tuple of (prefix, core, suffix) where:
+        - prefix: leading non-word characters
+        - core: central word characters
+        - suffix: trailing non-word characters
+
+    Example:
+        >>> split_token_edges('"Hello!"')
+        ('"', 'Hello', '!"')
+    """
+    match = cast(re.Match[str], _TOKEN_EDGES_PATTERN.match(token))
+    prefix, core, suffix = match.groups()
+    return prefix, core, suffix
+
+
+def compute_core_length(token: str) -> int:
+    """Compute the effective length of a token's core for weighting heuristics.
+
+    Used by weighted sampling algorithms to prioritize longer words.
+    Always returns at least 1 to avoid zero-weight issues.
+
+    Args:
+        token: A non-whitespace token.
+
+    Returns:
+        Positive integer representing the token's effective length.
+    """
+    _, core, _ = split_token_edges(token)
+    if core:
+        return len(core)
+    stripped = token.strip()
+    if stripped:
+        return len(stripped)
+    if token:
+        return len(token)
+    return 1
+
+
+@dataclass(frozen=True)
+class WordToken:
+    """Metadata describing a non-whitespace token from text tokenization.
+
+    Attributes:
+        index: Position in the parent token sequence.
+        prefix: Leading non-word characters (punctuation).
+        core: Central word characters.
+        suffix: Trailing non-word characters (punctuation).
+        core_length: Effective length for weighting (always >= 1).
+    """
+
+    index: int
+    prefix: str
+    core: str
+    suffix: str
+    core_length: int
+
+    @property
+    def has_core(self) -> bool:
+        """Return True when the token contains at least one core character."""
+        return bool(self.core)
+
+
+def collect_word_tokens(
+    tokens: Sequence[str],
+    *,
+    skip_first_word: bool = False,
+) -> list[WordToken]:
+    """Extract structured metadata for non-whitespace tokens.
+
+    Args:
+        tokens: Token sequence from split_preserving_whitespace.
+        skip_first_word: If True, exclude the first content token
+            (useful for preserving leading words in delete operations).
+
+    Returns:
+        List of WordToken instances for each non-whitespace token.
+    """
+    start = 2 if skip_first_word else 0
+    collected: list[WordToken] = []
+
+    for index in range(start, len(tokens), 2):
+        token = tokens[index]
+        if not token or token.isspace():
+            continue
+
+        prefix, core, suffix = split_token_edges(token)
+        core_length = compute_core_length(token)
+
+        collected.append(
+            WordToken(
+                index=index,
+                prefix=prefix,
+                core=core,
+                suffix=suffix,
+                core_length=core_length,
+            )
+        )
+
+    return collected
+
+
+def reassemble_tokens(tokens: Sequence[str]) -> str:
+    """Join tokens back into text, preserving original structure.
+
+    Args:
+        tokens: Token sequence (typically modified from split_preserving_whitespace).
+
+    Returns:
+        Reassembled text string.
+    """
+    return "".join(tokens)
+
+
+# ---------------------------------------------------------------------------
+# Keyboard Layout Processing
+# ---------------------------------------------------------------------------
+
+
+KeyNeighborMap = dict[str, list[str]]
+
+
+def build_keyboard_neighbor_map(rows: Iterable[str]) -> KeyNeighborMap:
+    """Derive 8-neighbour adjacency lists from keyboard layout rows.
+
+    Each row represents a keyboard row with characters positioned by index.
+    Spaces are treated as empty positions. Characters are normalized to lowercase.
+
+    Args:
+        rows: Iterable of strings representing keyboard rows, with
+            characters positioned to reflect their physical layout.
+
+    Returns:
+        Dictionary mapping each lowercase character to its adjacent characters.
+
+    Example:
+        >>> rows = ["qwerty", " asdfg"]  # 'a' offset by 1
+        >>> neighbors = build_keyboard_neighbor_map(rows)
+        >>> neighbors['s']  # keys bordering 's' in this grid
+        ['w', 'e', 'r', 'a', 'd']
+    """
+    grid: dict[tuple[int, int], str] = {}
+    for y, row in enumerate(rows):
+        for x, char in enumerate(row):
+            if char == " ":
+                continue
+            grid[(x, y)] = char.lower()
+
+    neighbors: KeyNeighborMap = {}
+    for (x, y), char in grid.items():
+        seen: list[str] = []
+        for dy in (-1, 0, 1):
+            for dx in (-1, 0, 1):
+                if dx == 0 and dy == 0:
+                    continue
+                candidate = grid.get((x + dx, y + dy))
+                if candidate is None:
+                    continue
+                seen.append(candidate)
+        # Preserve encounter order but drop duplicates for determinism
+        deduped = list(dict.fromkeys(seen))
+        neighbors[char] = deduped
+
+    return neighbors
+
+
+# ---------------------------------------------------------------------------
+# String Difference Computation
+# ---------------------------------------------------------------------------
+
+
+def compute_string_diffs(
+    original: str,
+    modified: str,
+) -> list[list[tuple[str, str, str]]]:
+    """Compare two strings and return grouped adjacent change operations.
+
+    Uses difflib's SequenceMatcher to identify changes between strings.
+    Consecutive changes are grouped together; equal regions are skipped.
+
+    Args:
+        original: The original string.
+        modified: The modified string.
+
+    Returns:
+        List of change groups. Each group is a list of (tag, old_text, new_text)
+        tuples where tag is 'replace', 'delete', or 'insert'.
+
+    Example:
+        >>> compute_string_diffs("hello world", "helo worlds")
+        [[('delete', 'l', '')], [('insert', '', 's')]]
+    """
+    import difflib
+
+    sm = difflib.SequenceMatcher(None, original, modified)
+    ops: list[list[tuple[str, str, str]]] = []
+    buffer: list[tuple[str, str, str]] = []
+
+    for tag, i1, i2, j1, j2 in sm.get_opcodes():
+        if tag == "equal":
+            if buffer:
+                ops.append(buffer)
+                buffer = []
+            continue
+        buffer.append((tag, original[i1:i2], modified[j1:j2]))
+
+    if buffer:
+        ops.append(buffer)
+
+    return ops
+
+
+# ---------------------------------------------------------------------------
+# Sequence Operations
+# ---------------------------------------------------------------------------
+
+T = TypeVar("T")
+
+
+def stable_deduplicate(items: Iterable[T]) -> list[T]:
+    """Remove duplicates while preserving original order.
+
+    Args:
+        items: Iterable of hashable items.
+
+    Returns:
+        List with duplicates removed, first occurrence preserved.
+
+    Example:
+        >>> stable_deduplicate([3, 1, 4, 1, 5, 9, 2, 6, 5])
+        [3, 1, 4, 5, 9, 2, 6]
+    """
+    seen: set[T] = set()
+    result: list[T] = []
+    for item in items:
+        if item not in seen:
+            seen.add(item)
+            result.append(item)
+    return result
+
+
+def interleave_lists(
+    primary: Sequence[T],
+    secondary: Sequence[T],
+    *,
+    secondary_first: bool = False,
+) -> list[T]:
+    """Interleave two sequences, appending leftover items from the longer one.
+
+    Args:
+        primary: First sequence.
+        secondary: Second sequence.
+        secondary_first: If True, start with secondary element.
+
+    Returns:
+        Interleaved list [p0, s0, p1, s1, ...] or [s0, p0, s1, p1, ...].
+    """
+    result: list[T] = []
+    max_len = max(len(primary), len(secondary))
+
+    for i in range(max_len):
+        if secondary_first:
+            if i < len(secondary):
+                result.append(secondary[i])
+            if i < len(primary):
+                result.append(primary[i])
+        else:
+            if i < len(primary):
+                result.append(primary[i])
+            if i < len(secondary):
+                result.append(secondary[i])
+
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Mapping Helpers
+# ---------------------------------------------------------------------------
+
+
+def invert_mapping(
+    mapping: Mapping[str, Sequence[str]],
+) -> dict[str, str]:
+    """Invert a one-to-many mapping into a many-to-one lookup.
+
+    Given {key: [val1, val2]}, returns {val1: key, val2: key}.
+    Later keys overwrite earlier ones if values collide.
+
+    Args:
+        mapping: Dictionary mapping keys to sequences of values.
+
+    Returns:
+        Inverted dictionary mapping each value to its key.
+    """
+    inverted: dict[str, str] = {}
+    for key, values in mapping.items():
+        for value in values:
+            inverted[value] = key
+    return inverted
+
+
+__all__ = [
+    # Tokenization
+    "split_preserving_whitespace",
+    "split_token_edges",
+    "compute_core_length",
+    "WordToken",
+    "collect_word_tokens",
+    "reassemble_tokens",
+    # Keyboard
+    "KeyNeighborMap",
+    "build_keyboard_neighbor_map",
+    # Diffs
+    "compute_string_diffs",
+    # Sequences
+    "stable_deduplicate",
+    "interleave_lists",
+    # Mappings
+    "invert_mapping",
+]
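The transforms helpers are designed to compose: tokenize, edit the word tokens, and reassemble losslessly. A small round-trip sketch built only from the functions and regexes shown in this diff (the printed values follow from the docstrings and patterns above):

```python
from glitchlings.zoo.transforms import (
    build_keyboard_neighbor_map,
    reassemble_tokens,
    split_preserving_whitespace,
    split_token_edges,
)

text = 'Hello,  "world"!'
tokens = split_preserving_whitespace(text)   # ['Hello,', '  ', '"world"!']
assert reassemble_tokens(tokens) == text     # whitespace-preserving round trip

print(split_token_edges('"world"!'))         # ('"', 'world', '"!')

# Grid rows mimic physical key positions; 's' borders w/e/r above, a/d beside,
# and z/x below in this particular grid.
rows = ["qwertyuiop", " asdfghjkl", "  zxcvbnm"]
print(build_keyboard_neighbor_map(rows)["s"])  # ['w', 'e', 'r', 'a', 'd', 'z', 'x']
```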