glitchlings 1.0.0__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. glitchlings/__init__.py +101 -0
  2. glitchlings/__main__.py +8 -0
  3. glitchlings/_corruption_engine/__init__.py +12 -0
  4. glitchlings/_corruption_engine.cp313-win_amd64.pyd +0 -0
  5. glitchlings/assets/__init__.py +180 -0
  6. glitchlings/assets/apostrofae_pairs.json +32 -0
  7. glitchlings/assets/ekkokin_homophones.json +2014 -0
  8. glitchlings/assets/hokey_assets.json +193 -0
  9. glitchlings/assets/lexemes/academic.json +1049 -0
  10. glitchlings/assets/lexemes/colors.json +1333 -0
  11. glitchlings/assets/lexemes/corporate.json +716 -0
  12. glitchlings/assets/lexemes/cyberpunk.json +22 -0
  13. glitchlings/assets/lexemes/lovecraftian.json +23 -0
  14. glitchlings/assets/lexemes/synonyms.json +3354 -0
  15. glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
  16. glitchlings/assets/ocr_confusions.tsv +30 -0
  17. glitchlings/assets/pipeline_assets.json +29 -0
  18. glitchlings/attack/__init__.py +184 -0
  19. glitchlings/attack/analysis.py +1321 -0
  20. glitchlings/attack/core.py +819 -0
  21. glitchlings/attack/core_execution.py +378 -0
  22. glitchlings/attack/core_planning.py +612 -0
  23. glitchlings/attack/encode.py +114 -0
  24. glitchlings/attack/metrics.py +211 -0
  25. glitchlings/attack/metrics_dispatch.py +70 -0
  26. glitchlings/attack/tokenization.py +338 -0
  27. glitchlings/attack/tokenizer_metrics.py +373 -0
  28. glitchlings/auggie.py +285 -0
  29. glitchlings/compat/__init__.py +9 -0
  30. glitchlings/compat/loaders.py +355 -0
  31. glitchlings/compat/types.py +41 -0
  32. glitchlings/conf/__init__.py +39 -0
  33. glitchlings/conf/loaders.py +331 -0
  34. glitchlings/conf/schema.py +156 -0
  35. glitchlings/conf/types.py +72 -0
  36. glitchlings/config.toml +2 -0
  37. glitchlings/constants.py +139 -0
  38. glitchlings/dev/__init__.py +3 -0
  39. glitchlings/dev/docs.py +45 -0
  40. glitchlings/dlc/__init__.py +21 -0
  41. glitchlings/dlc/_shared.py +300 -0
  42. glitchlings/dlc/gutenberg.py +400 -0
  43. glitchlings/dlc/huggingface.py +68 -0
  44. glitchlings/dlc/langchain.py +147 -0
  45. glitchlings/dlc/nemo.py +283 -0
  46. glitchlings/dlc/prime.py +215 -0
  47. glitchlings/dlc/pytorch.py +98 -0
  48. glitchlings/dlc/pytorch_lightning.py +173 -0
  49. glitchlings/internal/__init__.py +16 -0
  50. glitchlings/internal/rust.py +159 -0
  51. glitchlings/internal/rust_ffi.py +599 -0
  52. glitchlings/main.py +426 -0
  53. glitchlings/protocols.py +91 -0
  54. glitchlings/runtime_config.py +24 -0
  55. glitchlings/util/__init__.py +41 -0
  56. glitchlings/util/adapters.py +65 -0
  57. glitchlings/util/keyboards.py +508 -0
  58. glitchlings/util/transcripts.py +108 -0
  59. glitchlings/zoo/__init__.py +161 -0
  60. glitchlings/zoo/assets/__init__.py +29 -0
  61. glitchlings/zoo/core.py +852 -0
  62. glitchlings/zoo/core_execution.py +154 -0
  63. glitchlings/zoo/core_planning.py +451 -0
  64. glitchlings/zoo/corrupt_dispatch.py +291 -0
  65. glitchlings/zoo/hokey.py +139 -0
  66. glitchlings/zoo/jargoyle.py +301 -0
  67. glitchlings/zoo/mim1c.py +269 -0
  68. glitchlings/zoo/pedant/__init__.py +109 -0
  69. glitchlings/zoo/pedant/core.py +99 -0
  70. glitchlings/zoo/pedant/forms.py +50 -0
  71. glitchlings/zoo/pedant/stones.py +83 -0
  72. glitchlings/zoo/redactyl.py +94 -0
  73. glitchlings/zoo/rng.py +280 -0
  74. glitchlings/zoo/rushmore.py +416 -0
  75. glitchlings/zoo/scannequin.py +370 -0
  76. glitchlings/zoo/transforms.py +331 -0
  77. glitchlings/zoo/typogre.py +194 -0
  78. glitchlings/zoo/validation.py +643 -0
  79. glitchlings/zoo/wherewolf.py +120 -0
  80. glitchlings/zoo/zeedub.py +165 -0
  81. glitchlings-1.0.0.dist-info/METADATA +404 -0
  82. glitchlings-1.0.0.dist-info/RECORD +86 -0
  83. glitchlings-1.0.0.dist-info/WHEEL +5 -0
  84. glitchlings-1.0.0.dist-info/entry_points.txt +3 -0
  85. glitchlings-1.0.0.dist-info/licenses/LICENSE +201 -0
  86. glitchlings-1.0.0.dist-info/top_level.txt +1 -0
glitchlings/zoo/scannequin.py
@@ -0,0 +1,370 @@
+ """Scannequin: Research-backed OCR simulation glitchling.
+
+ This module provides OCR-style text corruption based on empirical research
+ into document degradation and character recognition failures.
+
+ References
+ ----------
+ - Kolak & Resnik (2002) - Noisy-channel OCR error modeling
+ - Kanungo et al. (1994) - "Nonlinear Global and Local Document Degradation Models"
+   https://kanungo.com/pubs/ijist94-model.pdf
+ - Li, Lopresti, Nagy, Tompkins (1996) - "Validation of Image Defect Models for OCR"
+   https://sites.ecse.rpi.edu/~nagy/PDF_files/Li_Lopresti_Tompkins_PAMI96.pdf
+ - Rice et al. / UNLV-ISRI Annual Tests (1995) - Quality preset empirical basis
+ - Taghva et al. - "Context beats Confusion"
+   https://www.projectcomputing.com/resources/CorrectingNoisyOCR.pdf
+ - ICDAR Robust Reading Competitions
+   https://dl.acm.org/doi/abs/10.1007/s10032-004-0134-3
+ - Smith (2007) - Tesseract OCR architecture
+ """
+
+ import random
+ from typing import Any, Literal, cast
+
+ from glitchlings.constants import (
+     DEFAULT_SCANNEQUIN_BIAS_BETA,
+     DEFAULT_SCANNEQUIN_BIAS_K,
+     DEFAULT_SCANNEQUIN_BURST_ENTER,
+     DEFAULT_SCANNEQUIN_BURST_EXIT,
+     DEFAULT_SCANNEQUIN_BURST_MULTIPLIER,
+     DEFAULT_SCANNEQUIN_RATE,
+     DEFAULT_SCANNEQUIN_SPACE_DROP_RATE,
+     DEFAULT_SCANNEQUIN_SPACE_INSERT_RATE,
+     SCANNEQUIN_PRESETS,
+ )
+ from glitchlings.internal.rust_ffi import ocr_artifacts_rust, resolve_seed
+
+ from .core import AttackOrder, AttackWave, Glitchling, PipelineOperationPayload
+
+ # Type alias for preset names
+ PresetName = Literal["clean_300dpi", "newspaper", "fax", "photocopy_3rd_gen"]
+
+
+ def ocr_artifacts(
+     text: str,
+     rate: float | None = None,
+     seed: int | None = None,
+     rng: random.Random | None = None,
+     *,
+     burst_enter: float | None = None,
+     burst_exit: float | None = None,
+     burst_multiplier: float | None = None,
+     bias_k: int | None = None,
+     bias_beta: float | None = None,
+     space_drop_rate: float | None = None,
+     space_insert_rate: float | None = None,
+ ) -> str:
+     """Introduce OCR-like artifacts into text with research-backed enhancements.
+
+     This function simulates OCR errors using three research-backed features:
+
+     **Burst Model (Kanungo et al., 1994)**
+         Real document defects are spatially correlated: a coffee stain or fold
+         affects a region, not individual characters. Uses an HMM to create error
+         clusters simulating smudges, folds, or degraded scan regions.
+
+     **Document-Level Bias (UNLV-ISRI, 1995)**
+         Documents scanned under the same conditions exhibit consistent error
+         profiles. Randomly selects K confusion patterns and amplifies their
+         selection probability, creating "why does it always turn 'l' into '1'"
+         consistency.
+
+     **Whitespace Errors (Smith, 2007; ICDAR)**
+         Models the OCR segmentation failures that cause word merges and splits.
+         These happen before character recognition in the real pipeline.
+
+     Parameters
+     ----------
+     text : str
+         Input text to corrupt.
+     rate : float, optional
+         Base probability of applying a confusion to any given candidate.
+         Default is 0.02.
+     seed : int, optional
+         Deterministic seed for reproducibility.
+     rng : random.Random, optional
+         Source of randomness used to derive a seed when ``seed`` is None.
+     burst_enter : float, optional
+         Probability of transitioning from the clean to the harsh state (default 0.0).
+     burst_exit : float, optional
+         Probability of transitioning from the harsh to the clean state (default 0.3).
+     burst_multiplier : float, optional
+         Rate multiplier while in the harsh state (default 3.0).
+     bias_k : int, optional
+         Number of confusion patterns to amplify per document (default 0).
+     bias_beta : float, optional
+         Amplification factor for selected patterns (default 2.0).
+     space_drop_rate : float, optional
+         Probability of deleting a space, merging words (default 0.0).
+     space_insert_rate : float, optional
+         Probability of inserting a spurious space (default 0.0).
+
+     Returns
+     -------
+     str
+         Text with simulated OCR errors.
+
+     References
+     ----------
+     - Kanungo et al. (1994) - "Nonlinear Global and Local Document Degradation Models"
+     - Rice et al. / UNLV-ISRI Annual Tests (1995)
+     - Smith (2007) - Tesseract OCR architecture
+     - ICDAR Robust Reading Competitions
+     """
+     if not text:
+         return text
+
+     effective_rate = DEFAULT_SCANNEQUIN_RATE if rate is None else rate
+     clamped_rate = max(0.0, effective_rate)
+
+     return ocr_artifacts_rust(
+         text,
+         clamped_rate,
+         resolve_seed(seed, rng),
+         burst_enter=burst_enter,
+         burst_exit=burst_exit,
+         burst_multiplier=burst_multiplier,
+         bias_k=bias_k,
+         bias_beta=bias_beta,
+         space_drop_rate=space_drop_rate,
+         space_insert_rate=space_insert_rate,
+     )
+
+
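As a quick orientation for reviewers, a minimal call sketch of the function above; the parameter values are illustrative, and the exact corrupted output depends on the compiled `_corruption_engine` and its confusion tables, so none is shown:

    noisy = ocr_artifacts(
        "The quick brown fox jumps over the lazy dog.",
        rate=0.05,
        seed=42,
        burst_enter=0.1,       # enable clustered errors (Kanungo-style bursts)
        burst_multiplier=4.0,  # errors are 4x as likely inside a burst
        space_drop_rate=0.01,  # occasionally merge adjacent words
    )
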
+ class Scannequin(Glitchling):
+     """Glitchling that simulates OCR artifacts with research-backed enhancements.
+
+     Scannequin introduces OCR-inspired transcription mistakes to emulate noisy
+     document scans. It operates at **document level** to enable document-wide
+     consistency in error patterns.
+
+     Features
+     --------
+
+     **Burst Model (Kanungo et al., 1994)**
+         Uses an HMM with clean/harsh states to create spatially correlated error
+         clusters, simulating physical defects like smudges, folds, or scan artifacts.
+
+     **Document-Level Bias (UNLV-ISRI, 1995)**
+         Selects K confusion patterns at document start and amplifies their selection
+         probability, creating consistent error profiles across the document.
+
+     **Whitespace Errors (Smith, 2007; ICDAR)**
+         Models OCR segmentation failures: space drops (word merges) and spurious
+         space insertions (word splits).
+
+     **Quality Presets**
+         Based on UNLV-ISRI test regimes:
+         - ``"clean_300dpi"``: Minimal errors, good quality baseline
+         - ``"newspaper"``: Moderate errors with some burst
+         - ``"fax"``: High errors, strong burst, heavy l/1/I confusion
+         - ``"photocopy_3rd_gen"``: Very degraded, long burst runs
+
+     Parameters
+     ----------
+     rate : float, optional
+         Base probability of applying a confusion (default 0.02).
+     seed : int, optional
+         Deterministic seed.
+     preset : str, optional
+         Quality preset name. Supplies defaults that explicit parameters override.
+     burst_enter : float, optional
+         P(clean → harsh) state transition (default 0.0 = disabled).
+     burst_exit : float, optional
+         P(harsh → clean) state transition (default 0.3).
+     burst_multiplier : float, optional
+         Rate multiplier in harsh state (default 3.0).
+     bias_k : int, optional
+         Number of patterns to amplify per document (default 0 = disabled).
+     bias_beta : float, optional
+         Amplification factor for biased patterns (default 2.0).
+     space_drop_rate : float, optional
+         P(delete space, merge words) (default 0.0 = disabled).
+     space_insert_rate : float, optional
+         P(insert spurious space) (default 0.0 = disabled).
+     **kwargs
+         Additional parameters passed to the base Glitchling.
+
+     Examples
+     --------
+     Basic usage with default parameters:
+
+     >>> scan = Scannequin(rate=0.02, seed=42)
+     >>> scan("The cat sat on the mat")
+     'The cat sat on the rnat'
+
+     Using a quality preset:
+
+     >>> fax_scan = Scannequin(preset="fax", seed=42)
+     >>> fax_scan("Hello world, this is a test document.")
+     'He1lo vvorld, thls is a testdocument.'
+
+     Enabling burst mode for realistic degradation:
+
+     >>> degraded = Scannequin(rate=0.03, burst_enter=0.1, burst_exit=0.2, seed=42)
+     >>> degraded("Some regions will have clustered errors like smudges.")
+     'Sorne regions will have dustered errors Iike srnudges.'
+
+     References
+     ----------
+     - Kolak & Resnik (2002) - Noisy-channel OCR error modeling
+     - Kanungo et al. (1994) - "Nonlinear Global and Local Document Degradation Models"
+     - Li, Lopresti, Nagy, Tompkins (1996) - "Validation of Image Defect Models for OCR"
+     - Rice et al. / UNLV-ISRI Annual Tests (1995)
+     - Smith (2007) - Tesseract OCR architecture
+     """
+
+     flavor = "Isn't it weird how the word 'bed' looks like a bed?"
+
+     def __init__(
+         self,
+         *,
+         rate: float | None = None,
+         seed: int | None = None,
+         preset: PresetName | None = None,
+         burst_enter: float | None = None,
+         burst_exit: float | None = None,
+         burst_multiplier: float | None = None,
+         bias_k: int | None = None,
+         bias_beta: float | None = None,
+         space_drop_rate: float | None = None,
+         space_insert_rate: float | None = None,
+         **kwargs: Any,
+     ) -> None:
+         # If a preset is specified, load parameters from it
+         if preset is not None:
+             if preset not in SCANNEQUIN_PRESETS:
+                 valid_presets = ", ".join(sorted(SCANNEQUIN_PRESETS.keys()))
+                 msg = f"Unknown preset '{preset}'. Valid presets: {valid_presets}"
+                 raise ValueError(msg)
+
+             (
+                 preset_rate,
+                 preset_burst_enter,
+                 preset_burst_exit,
+                 preset_burst_multiplier,
+                 preset_bias_k,
+                 preset_bias_beta,
+                 preset_space_drop_rate,
+                 preset_space_insert_rate,
+             ) = SCANNEQUIN_PRESETS[preset]
+
+             # Preset values are used as defaults; explicit params override
+             if rate is None:
+                 rate = preset_rate
+             if burst_enter is None:
+                 burst_enter = preset_burst_enter
+             if burst_exit is None:
+                 burst_exit = preset_burst_exit
+             if burst_multiplier is None:
+                 burst_multiplier = preset_burst_multiplier
+             if bias_k is None:
+                 bias_k = preset_bias_k
+             if bias_beta is None:
+                 bias_beta = preset_bias_beta
+             if space_drop_rate is None:
+                 space_drop_rate = preset_space_drop_rate
+             if space_insert_rate is None:
+                 space_insert_rate = preset_space_insert_rate
+
+         # Apply defaults for any remaining None values
+         effective_rate = DEFAULT_SCANNEQUIN_RATE if rate is None else rate
+         effective_burst_enter = (
+             DEFAULT_SCANNEQUIN_BURST_ENTER if burst_enter is None else burst_enter
+         )
+         effective_burst_exit = DEFAULT_SCANNEQUIN_BURST_EXIT if burst_exit is None else burst_exit
+         effective_burst_multiplier = (
+             DEFAULT_SCANNEQUIN_BURST_MULTIPLIER if burst_multiplier is None else burst_multiplier
+         )
+         effective_bias_k = DEFAULT_SCANNEQUIN_BIAS_K if bias_k is None else bias_k
+         effective_bias_beta = DEFAULT_SCANNEQUIN_BIAS_BETA if bias_beta is None else bias_beta
+         effective_space_drop_rate = (
+             DEFAULT_SCANNEQUIN_SPACE_DROP_RATE if space_drop_rate is None else space_drop_rate
+         )
+         effective_space_insert_rate = (
+             DEFAULT_SCANNEQUIN_SPACE_INSERT_RATE if space_insert_rate is None else space_insert_rate
+         )
+
+         super().__init__(
+             name="Scannequin",
+             corruption_function=ocr_artifacts,
+             # Document scope (not character scope) enables document-wide consistency
+             scope=AttackWave.DOCUMENT,
+             order=AttackOrder.LATE,
+             seed=seed,
+             rate=effective_rate,
+             burst_enter=effective_burst_enter,
+             burst_exit=effective_burst_exit,
+             burst_multiplier=effective_burst_multiplier,
+             bias_k=effective_bias_k,
+             bias_beta=effective_bias_beta,
+             space_drop_rate=effective_space_drop_rate,
+             space_insert_rate=effective_space_insert_rate,
+             **kwargs,
+         )
+
+         # Store the preset name if one was used
+         self._preset = preset
+
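Worth spelling out the resolution order the constructor implements: explicit keyword arguments win, the preset fills anything left unset, and the DEFAULT_SCANNEQUIN_* constants cover the rest. A sketch with illustrative values:

    # "fax" supplies burst/bias/whitespace defaults; the explicit rate
    # overrides the preset's rate; anything still None falls back to the
    # DEFAULT_SCANNEQUIN_* constants.
    scan = Scannequin(preset="fax", rate=0.10, seed=7)
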
+     @classmethod
+     def from_preset(cls, preset: PresetName, *, seed: int | None = None) -> "Scannequin":
+         """Create a Scannequin instance from a named quality preset.
+
+         Parameters
+         ----------
+         preset : str
+             Quality preset name. One of:
+             - ``"clean_300dpi"``: Clean 300 DPI scan, minimal errors
+             - ``"newspaper"``: Newspaper-quality scan, moderate degradation
+             - ``"fax"``: Fax-quality, high error rate with l/1/I confusion
+             - ``"photocopy_3rd_gen"``: Third-generation photocopy, severe degradation
+         seed : int, optional
+             Deterministic seed for reproducibility.
+
+         Returns
+         -------
+         Scannequin
+             Configured Scannequin instance.
+
+         Examples
+         --------
+         >>> fax = Scannequin.from_preset("fax", seed=42)
+         >>> fax("The quick brown fox")
+         'Tbe quick brovvn fox'
+         """
+         return cls(preset=preset, seed=seed)
+
+     def pipeline_operation(self) -> PipelineOperationPayload:
+         """Return the Rust pipeline descriptor with all OCR parameters."""
+         rate_value = self.kwargs.get("rate", DEFAULT_SCANNEQUIN_RATE)
+         rate = DEFAULT_SCANNEQUIN_RATE if rate_value is None else float(rate_value)
+
+         return cast(
+             PipelineOperationPayload,
+             {
+                 "type": "ocr",
+                 "rate": rate,
+                 "burst_enter": float(
+                     self.kwargs.get("burst_enter", DEFAULT_SCANNEQUIN_BURST_ENTER)
+                 ),
+                 "burst_exit": float(self.kwargs.get("burst_exit", DEFAULT_SCANNEQUIN_BURST_EXIT)),
+                 "burst_multiplier": float(
+                     self.kwargs.get("burst_multiplier", DEFAULT_SCANNEQUIN_BURST_MULTIPLIER)
+                 ),
+                 "bias_k": int(self.kwargs.get("bias_k", DEFAULT_SCANNEQUIN_BIAS_K)),
+                 "bias_beta": float(self.kwargs.get("bias_beta", DEFAULT_SCANNEQUIN_BIAS_BETA)),
+                 "space_drop_rate": float(
+                     self.kwargs.get("space_drop_rate", DEFAULT_SCANNEQUIN_SPACE_DROP_RATE)
+                 ),
+                 "space_insert_rate": float(
+                     self.kwargs.get("space_insert_rate", DEFAULT_SCANNEQUIN_SPACE_INSERT_RATE)
+                 ),
+             },
+         )
+
+
+ # Default instance for convenience
+ scannequin = Scannequin()
+
+
+ __all__ = ["Scannequin", "scannequin", "ocr_artifacts", "SCANNEQUIN_PRESETS"]
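To close out this file, both public entry points side by side in a minimal sketch; the corrupted strings are engine-dependent, so only the payload field documented in the source is asserted:

    from glitchlings.zoo.scannequin import Scannequin, scannequin

    corrupted = scannequin("Plain text in, scanned-looking text out.")

    fax = Scannequin.from_preset("fax", seed=42)
    payload = fax.pipeline_operation()
    assert payload["type"] == "ocr"  # descriptor consumed by the Rust pipeline
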
glitchlings/zoo/transforms.py
@@ -0,0 +1,331 @@
+ """Pure text transformation functions.
+
+ This module contains text manipulation functions that are:
+ - **Pure**: Output depends only on inputs, no side effects
+ - **Deterministic**: Same inputs always produce the same outputs
+ - **Self-contained**: No RNG, no Rust FFI, no config loading
+
+ These functions receive pre-validated inputs from boundary layers
+ (see validation.py) and trust that inputs are already checked.
+ Core transformation code should NOT re-validate parameters.
+
+ Design Philosophy
+ -----------------
+ This module implements the innermost layer of the purity architecture:
+
+     CLI/API → validation.py → transforms.py → Rust FFI
+     (boundary)  (boundary)    (pure core)     (impure)
+
+ Functions here should:
+ - Accept concrete types (not Optional unless semantically required)
+ - Not log, print, or mutate external state
+ - Not import impure modules (internal.rust, config loaders, etc.)
+ - Document any preconditions callers must satisfy
+
+ See AGENTS.md "Functional Purity Architecture" for full details.
+ """
+
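A sketch of how that layering is meant to be consumed; `shout_words` is a hypothetical boundary-layer caller, not part of the package:

    # Hypothetical boundary function: validates once at the edge, then
    # hands pre-checked input to the pure core functions defined below.
    def shout_words(text: str, rate: float) -> str:
        if not 0.0 <= rate <= 1.0:  # boundary check lives here, not in the core
            raise ValueError("rate must be in [0, 1]")
        tokens = split_preserving_whitespace(text)
        if rate >= 0.5:
            tokens = [t if t.isspace() else t.upper() for t in tokens]
        return reassemble_tokens(tokens)
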
+ from __future__ import annotations
+
+ import re
+ from collections.abc import Iterable, Mapping, Sequence
+ from dataclasses import dataclass
+ from typing import TypeVar, cast
+
+ # ---------------------------------------------------------------------------
+ # Text Tokenization
+ # ---------------------------------------------------------------------------
+
+ _WORD_SPLIT_PATTERN = re.compile(r"(\s+)")
+ _TOKEN_EDGES_PATTERN = re.compile(r"^(\W*)(.*?)(\W*)$", re.DOTALL)
+
+
+ def split_preserving_whitespace(text: str) -> list[str]:
+     """Split text while keeping whitespace tokens for stable reconstruction.
+
+     Returns alternating [word, whitespace, word, whitespace, ...] tokens.
+     Joining the result reconstructs the original text exactly.
+
+     Args:
+         text: Input text to tokenize.
+
+     Returns:
+         List of tokens alternating between non-whitespace and whitespace.
+
+     Example:
+         >>> split_preserving_whitespace("hello world")
+         ['hello', ' ', 'world']
+     """
+     return _WORD_SPLIT_PATTERN.split(text)
+
+
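The lossless round trip is the load-bearing property of this splitter; a quick check, relying only on the behavior documented above:

    text = "  leading spaces,\tdouble  gaps and tabs"
    tokens = split_preserving_whitespace(text)
    assert "".join(tokens) == text  # captured whitespace makes the split lossless
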
+ def split_token_edges(token: str) -> tuple[str, str, str]:
+     """Decompose a token into leading punctuation, core, and trailing punctuation.
+
+     Args:
+         token: A non-whitespace token.
+
+     Returns:
+         Tuple of (prefix, core, suffix) where:
+         - prefix: leading non-word characters
+         - core: central word characters
+         - suffix: trailing non-word characters
+
+     Example:
+         >>> split_token_edges('"Hello!"')
+         ('"', 'Hello', '!"')
+     """
+     match = cast(re.Match[str], _TOKEN_EDGES_PATTERN.match(token))
+     prefix, core, suffix = match.groups()
+     return prefix, core, suffix
+
+
+ def compute_core_length(token: str) -> int:
+     """Compute the effective length of a token's core for weighting heuristics.
+
+     Used by weighted sampling algorithms to prioritize longer words.
+     Always returns at least 1 to avoid zero-weight issues.
+
+     Args:
+         token: A non-whitespace token.
+
+     Returns:
+         Positive integer representing the token's effective length.
+     """
+     _, core, _ = split_token_edges(token)
+     if core:
+         return len(core)
+     stripped = token.strip()
+     if stripped:
+         return len(stripped)
+     if token:
+         return len(token)
+     return 1
+
+
+ @dataclass(frozen=True)
+ class WordToken:
+     """Metadata describing a non-whitespace token from text tokenization.
+
+     Attributes:
+         index: Position in the parent token sequence.
+         prefix: Leading non-word characters (punctuation).
+         core: Central word characters.
+         suffix: Trailing non-word characters (punctuation).
+         core_length: Effective length for weighting (always >= 1).
+     """
+
+     index: int
+     prefix: str
+     core: str
+     suffix: str
+     core_length: int
+
+     @property
+     def has_core(self) -> bool:
+         """Return True when the token contains at least one core character."""
+         return bool(self.core)
+
+
+ def collect_word_tokens(
+     tokens: Sequence[str],
+     *,
+     skip_first_word: bool = False,
+ ) -> list[WordToken]:
+     """Extract structured metadata for non-whitespace tokens.
+
+     Args:
+         tokens: Token sequence from split_preserving_whitespace.
+         skip_first_word: If True, exclude the first content token
+             (useful for preserving leading words in delete operations).
+
+     Returns:
+         List of WordToken instances for each non-whitespace token.
+     """
+     start = 2 if skip_first_word else 0
+     collected: list[WordToken] = []
+
+     for index in range(start, len(tokens), 2):
+         token = tokens[index]
+         if not token or token.isspace():
+             continue
+
+         prefix, core, suffix = split_token_edges(token)
+         core_length = compute_core_length(token)
+
+         collected.append(
+             WordToken(
+                 index=index,
+                 prefix=prefix,
+                 core=core,
+                 suffix=suffix,
+                 core_length=core_length,
+             )
+         )
+
+     return collected
+
+
+ def reassemble_tokens(tokens: Sequence[str]) -> str:
+     """Join tokens back into text, preserving original structure.
+
+     Args:
+         tokens: Token sequence (typically modified from split_preserving_whitespace).
+
+     Returns:
+         Reassembled text string.
+     """
+     return "".join(tokens)
+
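These helpers are designed to compose: split, select word tokens, rewrite their cores, reassemble. A sketch of the intended loop, where the uppercase rewrite is illustrative rather than a real glitchling transform:

    tokens = split_preserving_whitespace("Stop, thief!")
    for word in collect_word_tokens(tokens):
        # Rebuild each token around its rewritten core, keeping punctuation.
        tokens[word.index] = word.prefix + word.core.upper() + word.suffix
    assert reassemble_tokens(tokens) == "STOP, THIEF!"
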
+
+ # ---------------------------------------------------------------------------
+ # String Difference Computation
+ # ---------------------------------------------------------------------------
+
+
+ def compute_string_diffs(
+     original: str,
+     modified: str,
+ ) -> list[list[tuple[str, str, str]]]:
+     """Compare two strings and return grouped adjacent change operations.
+
+     Uses difflib's SequenceMatcher to identify changes between strings.
+     Consecutive changes are grouped together; equal regions are skipped.
+
+     Args:
+         original: The original string.
+         modified: The modified string.
+
+     Returns:
+         List of change groups. Each group is a list of (tag, old_text, new_text)
+         tuples where tag is 'replace', 'delete', or 'insert'.
+
+     Example:
+         >>> compute_string_diffs("hello world", "helo worlds")
+         [[('delete', 'l', '')], [('insert', '', 's')]]
+     """
+     import difflib
+
+     sm = difflib.SequenceMatcher(None, original, modified)
+     ops: list[list[tuple[str, str, str]]] = []
+     buffer: list[tuple[str, str, str]] = []
+
+     for tag, i1, i2, j1, j2 in sm.get_opcodes():
+         if tag == "equal":
+             if buffer:
+                 ops.append(buffer)
+                 buffer = []
+             continue
+         buffer.append((tag, original[i1:i2], modified[j1:j2]))
+
+     if buffer:
+         ops.append(buffer)
+
+     return ops
+
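One more data point on the return shape, following directly from SequenceMatcher's documented opcode semantics: a lone substitution between two equal regions comes back as a single one-element 'replace' group.

    ops = compute_string_diffs("grey", "gray")
    assert ops == [[("replace", "e", "a")]]
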
+
+ # ---------------------------------------------------------------------------
+ # Sequence Operations
+ # ---------------------------------------------------------------------------
+
+ T = TypeVar("T")
+
+
+ def stable_deduplicate(items: Iterable[T]) -> list[T]:
+     """Remove duplicates while preserving original order.
+
+     Args:
+         items: Iterable of hashable items.
+
+     Returns:
+         List with duplicates removed, first occurrence preserved.
+
+     Example:
+         >>> stable_deduplicate([3, 1, 4, 1, 5, 9, 2, 6, 5])
+         [3, 1, 4, 5, 9, 2, 6]
+     """
+     seen: set[T] = set()
+     result: list[T] = []
+     for item in items:
+         if item not in seen:
+             seen.add(item)
+             result.append(item)
+     return result
+
+
+ def interleave_lists(
+     primary: Sequence[T],
+     secondary: Sequence[T],
+     *,
+     secondary_first: bool = False,
+ ) -> list[T]:
+     """Interleave two sequences, appending leftovers from the longer one.
+
+     Args:
+         primary: First sequence.
+         secondary: Second sequence.
+         secondary_first: If True, start with a secondary element.
+
+     Returns:
+         Interleaved list [p0, s0, p1, s1, ...] or [s0, p0, s1, p1, ...].
+     """
+     result: list[T] = []
+     max_len = max(len(primary), len(secondary))
+
+     for i in range(max_len):
+         if secondary_first:
+             if i < len(secondary):
+                 result.append(secondary[i])
+             if i < len(primary):
+                 result.append(primary[i])
+         else:
+             if i < len(primary):
+                 result.append(primary[i])
+             if i < len(secondary):
+                 result.append(secondary[i])
+
+     return result
+
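The docstring ships without a worked example, so here is one covering the unequal-length and secondary_first cases, traced against the loop logic above:

    assert interleave_lists(["a", "b", "c"], ["1"]) == ["a", "1", "b", "c"]
    assert interleave_lists(["a", "b"], ["1", "2"], secondary_first=True) == ["1", "a", "2", "b"]
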
+
+ # ---------------------------------------------------------------------------
+ # Mapping Helpers
+ # ---------------------------------------------------------------------------
+
+
+ def invert_mapping(
+     mapping: Mapping[str, Sequence[str]],
+ ) -> dict[str, str]:
+     """Invert a one-to-many mapping into a many-to-one lookup.
+
+     Given {key: [val1, val2]}, returns {val1: key, val2: key}.
+     If a value appears under multiple keys, the key seen last wins.
+
+     Args:
+         mapping: Dictionary mapping keys to sequences of values.
+
+     Returns:
+         Inverted dictionary mapping each value to its key.
+     """
+     inverted: dict[str, str] = {}
+     for key, values in mapping.items():
+         for value in values:
+             inverted[value] = key
+     return inverted
+
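And the collision rule in action; dict iteration order decides the winner:

    lookup = invert_mapping({"one": ["1", "uno"], "won": ["1"]})
    assert lookup == {"1": "won", "uno": "one"}  # "won" is iterated last, so it claims "1"
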
+
+ __all__ = [
+     # Tokenization
+     "split_preserving_whitespace",
+     "split_token_edges",
+     "compute_core_length",
+     "WordToken",
+     "collect_word_tokens",
+     "reassemble_tokens",
+     # Diffs
+     "compute_string_diffs",
+     # Sequences
+     "stable_deduplicate",
+     "interleave_lists",
+     # Mappings
+     "invert_mapping",
+ ]