glitchlings-1.0.0-cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings/__init__.py +101 -0
- glitchlings/__main__.py +8 -0
- glitchlings/_corruption_engine/__init__.py +12 -0
- glitchlings/_corruption_engine.cp313-win_amd64.pyd +0 -0
- glitchlings/assets/__init__.py +180 -0
- glitchlings/assets/apostrofae_pairs.json +32 -0
- glitchlings/assets/ekkokin_homophones.json +2014 -0
- glitchlings/assets/hokey_assets.json +193 -0
- glitchlings/assets/lexemes/academic.json +1049 -0
- glitchlings/assets/lexemes/colors.json +1333 -0
- glitchlings/assets/lexemes/corporate.json +716 -0
- glitchlings/assets/lexemes/cyberpunk.json +22 -0
- glitchlings/assets/lexemes/lovecraftian.json +23 -0
- glitchlings/assets/lexemes/synonyms.json +3354 -0
- glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
- glitchlings/assets/ocr_confusions.tsv +30 -0
- glitchlings/assets/pipeline_assets.json +29 -0
- glitchlings/attack/__init__.py +184 -0
- glitchlings/attack/analysis.py +1321 -0
- glitchlings/attack/core.py +819 -0
- glitchlings/attack/core_execution.py +378 -0
- glitchlings/attack/core_planning.py +612 -0
- glitchlings/attack/encode.py +114 -0
- glitchlings/attack/metrics.py +211 -0
- glitchlings/attack/metrics_dispatch.py +70 -0
- glitchlings/attack/tokenization.py +338 -0
- glitchlings/attack/tokenizer_metrics.py +373 -0
- glitchlings/auggie.py +285 -0
- glitchlings/compat/__init__.py +9 -0
- glitchlings/compat/loaders.py +355 -0
- glitchlings/compat/types.py +41 -0
- glitchlings/conf/__init__.py +39 -0
- glitchlings/conf/loaders.py +331 -0
- glitchlings/conf/schema.py +156 -0
- glitchlings/conf/types.py +72 -0
- glitchlings/config.toml +2 -0
- glitchlings/constants.py +139 -0
- glitchlings/dev/__init__.py +3 -0
- glitchlings/dev/docs.py +45 -0
- glitchlings/dlc/__init__.py +21 -0
- glitchlings/dlc/_shared.py +300 -0
- glitchlings/dlc/gutenberg.py +400 -0
- glitchlings/dlc/huggingface.py +68 -0
- glitchlings/dlc/langchain.py +147 -0
- glitchlings/dlc/nemo.py +283 -0
- glitchlings/dlc/prime.py +215 -0
- glitchlings/dlc/pytorch.py +98 -0
- glitchlings/dlc/pytorch_lightning.py +173 -0
- glitchlings/internal/__init__.py +16 -0
- glitchlings/internal/rust.py +159 -0
- glitchlings/internal/rust_ffi.py +599 -0
- glitchlings/main.py +426 -0
- glitchlings/protocols.py +91 -0
- glitchlings/runtime_config.py +24 -0
- glitchlings/util/__init__.py +41 -0
- glitchlings/util/adapters.py +65 -0
- glitchlings/util/keyboards.py +508 -0
- glitchlings/util/transcripts.py +108 -0
- glitchlings/zoo/__init__.py +161 -0
- glitchlings/zoo/assets/__init__.py +29 -0
- glitchlings/zoo/core.py +852 -0
- glitchlings/zoo/core_execution.py +154 -0
- glitchlings/zoo/core_planning.py +451 -0
- glitchlings/zoo/corrupt_dispatch.py +291 -0
- glitchlings/zoo/hokey.py +139 -0
- glitchlings/zoo/jargoyle.py +301 -0
- glitchlings/zoo/mim1c.py +269 -0
- glitchlings/zoo/pedant/__init__.py +109 -0
- glitchlings/zoo/pedant/core.py +99 -0
- glitchlings/zoo/pedant/forms.py +50 -0
- glitchlings/zoo/pedant/stones.py +83 -0
- glitchlings/zoo/redactyl.py +94 -0
- glitchlings/zoo/rng.py +280 -0
- glitchlings/zoo/rushmore.py +416 -0
- glitchlings/zoo/scannequin.py +370 -0
- glitchlings/zoo/transforms.py +331 -0
- glitchlings/zoo/typogre.py +194 -0
- glitchlings/zoo/validation.py +643 -0
- glitchlings/zoo/wherewolf.py +120 -0
- glitchlings/zoo/zeedub.py +165 -0
- glitchlings-1.0.0.dist-info/METADATA +404 -0
- glitchlings-1.0.0.dist-info/RECORD +86 -0
- glitchlings-1.0.0.dist-info/WHEEL +5 -0
- glitchlings-1.0.0.dist-info/entry_points.txt +3 -0
- glitchlings-1.0.0.dist-info/licenses/LICENSE +201 -0
- glitchlings-1.0.0.dist-info/top_level.txt +1 -0
glitchlings/zoo/scannequin.py

@@ -0,0 +1,370 @@
+"""Scannequin: Research-backed OCR simulation glitchling.
+
+This module provides OCR-style text corruption based on empirical research
+into document degradation and character recognition failures.
+
+References
+----------
+- Kolak & Resnik (2002) - Noisy-channel OCR error modeling
+- Kanungo et al. (1994) - "Nonlinear Global and Local Document Degradation Models"
+  https://kanungo.com/pubs/ijist94-model.pdf
+- Li, Lopresti, Nagy, Tompkins (1996) - "Validation of Image Defect Models for OCR"
+  https://sites.ecse.rpi.edu/~nagy/PDF_files/Li_Lopresti_Tompkins_PAMI96.pdf
+- Rice et al. / UNLV-ISRI Annual Tests (1995) - Quality preset empirical basis
+- Taghva et al. - "Context beats Confusion"
+  https://www.projectcomputing.com/resources/CorrectingNoisyOCR.pdf
+- ICDAR Robust Reading Competitions
+  https://dl.acm.org/doi/abs/10.1007/s10032-004-0134-3
+- Smith (2007) - Tesseract OCR architecture
+"""
+
+import random
+from typing import Any, Literal, cast
+
+from glitchlings.constants import (
+    DEFAULT_SCANNEQUIN_BIAS_BETA,
+    DEFAULT_SCANNEQUIN_BIAS_K,
+    DEFAULT_SCANNEQUIN_BURST_ENTER,
+    DEFAULT_SCANNEQUIN_BURST_EXIT,
+    DEFAULT_SCANNEQUIN_BURST_MULTIPLIER,
+    DEFAULT_SCANNEQUIN_RATE,
+    DEFAULT_SCANNEQUIN_SPACE_DROP_RATE,
+    DEFAULT_SCANNEQUIN_SPACE_INSERT_RATE,
+    SCANNEQUIN_PRESETS,
+)
+from glitchlings.internal.rust_ffi import ocr_artifacts_rust, resolve_seed
+
+from .core import AttackOrder, AttackWave, Glitchling, PipelineOperationPayload
+
+# Type alias for preset names
+PresetName = Literal["clean_300dpi", "newspaper", "fax", "photocopy_3rd_gen"]
+
+
+def ocr_artifacts(
+    text: str,
+    rate: float | None = None,
+    seed: int | None = None,
+    rng: random.Random | None = None,
+    *,
+    burst_enter: float | None = None,
+    burst_exit: float | None = None,
+    burst_multiplier: float | None = None,
+    bias_k: int | None = None,
+    bias_beta: float | None = None,
+    space_drop_rate: float | None = None,
+    space_insert_rate: float | None = None,
+) -> str:
+    """Introduce OCR-like artifacts into text with research-backed enhancements.
+
+    This function simulates OCR errors using three research-backed features:
+
+    **Burst Model (Kanungo et al., 1994)**
+    Real document defects are spatially correlated - a coffee stain or fold
+    affects a region, not individual characters. Uses an HMM to create error
+    clusters simulating smudges, folds, or degraded scan regions.
+
+    **Document-Level Bias (UNLV-ISRI, 1995)**
+    Documents scanned under the same conditions exhibit consistent error
+    profiles. Randomly selects K confusion patterns and amplifies their
+    selection probability, creating "why does it always turn 'l' into '1'"
+    consistency.
+
+    **Whitespace Errors (Smith, 2007; ICDAR)**
+    Models OCR segmentation failures that cause word merges/splits. These
+    happen before character recognition in the real pipeline.
+
+    Parameters
+    ----------
+    text : str
+        Input text to corrupt.
+    rate : float, optional
+        Base probability of applying a confusion to any given candidate.
+        Default is 0.02.
+    seed : int, optional
+        Deterministic seed for reproducibility.
+    rng : random.Random, optional
+        Optional RNG for seed generation.
+    burst_enter : float, optional
+        Probability of transitioning from clean to harsh state (default 0.0).
+    burst_exit : float, optional
+        Probability of transitioning from harsh to clean state (default 0.3).
+    burst_multiplier : float, optional
+        Rate multiplier when in harsh state (default 3.0).
+    bias_k : int, optional
+        Number of confusion patterns to amplify per document (default 0).
+    bias_beta : float, optional
+        Amplification factor for selected patterns (default 2.0).
+    space_drop_rate : float, optional
+        Probability of deleting a space, merging words (default 0.0).
+    space_insert_rate : float, optional
+        Probability of inserting a spurious space (default 0.0).
+
+    Returns
+    -------
+    str
+        Text with simulated OCR errors.
+
+    References
+    ----------
+    - Kanungo et al. (1994) - "Nonlinear Global and Local Document Degradation Models"
+    - Rice et al. / UNLV-ISRI Annual Tests (1995)
+    - Smith (2007) - Tesseract OCR architecture
+    - ICDAR Robust Reading Competitions
+    """
+    if not text:
+        return text
+
+    effective_rate = DEFAULT_SCANNEQUIN_RATE if rate is None else rate
+    clamped_rate = max(0.0, effective_rate)
+
+    return ocr_artifacts_rust(
+        text,
+        clamped_rate,
+        resolve_seed(seed, rng),
+        burst_enter=burst_enter,
+        burst_exit=burst_exit,
+        burst_multiplier=burst_multiplier,
+        bias_k=bias_k,
+        bias_beta=bias_beta,
+        space_drop_rate=space_drop_rate,
+        space_insert_rate=space_insert_rate,
+    )
+
+
+class Scannequin(Glitchling):
+    """Glitchling that simulates OCR artifacts with research-backed enhancements.
+
+    Scannequin introduces OCR-inspired transcription mistakes to emulate noisy
+    document scans. It now operates at **document level** to enable document-wide
+    consistency in error patterns.
+
+    Features
+    --------
+
+    **Burst Model (Kanungo et al., 1994)**
+    Uses an HMM with clean/harsh states to create spatially correlated error
+    clusters, simulating physical defects like smudges, folds, or scan artifacts.
+
+    **Document-Level Bias (UNLV-ISRI, 1995)**
+    Selects K confusion patterns at document start and amplifies their selection
+    probability, creating consistent error profiles across the document.
+
+    **Whitespace Errors (Smith, 2007; ICDAR)**
+    Models OCR segmentation failures: space drops (word merges) and spurious
+    space insertions (word splits).
+
+    **Quality Presets**
+    Based on UNLV-ISRI test regimes:
+    - ``"clean_300dpi"``: Minimal errors, good quality baseline
+    - ``"newspaper"``: Moderate errors with some burst
+    - ``"fax"``: High errors, strong burst, heavy l/1/I confusion
+    - ``"photocopy_3rd_gen"``: Very degraded, long burst runs
+
+    Parameters
+    ----------
+    rate : float, optional
+        Base probability of applying a confusion (default 0.02).
+    seed : int, optional
+        Deterministic seed.
+    preset : str, optional
+        Quality preset name. Overrides individual parameters when set.
+    burst_enter : float, optional
+        P(clean → harsh) state transition (default 0.0 = disabled).
+    burst_exit : float, optional
+        P(harsh → clean) state transition (default 0.3).
+    burst_multiplier : float, optional
+        Rate multiplier in harsh state (default 3.0).
+    bias_k : int, optional
+        Number of patterns to amplify per document (default 0 = disabled).
+    bias_beta : float, optional
+        Amplification factor for biased patterns (default 2.0).
+    space_drop_rate : float, optional
+        P(delete space, merge words) (default 0.0 = disabled).
+    space_insert_rate : float, optional
+        P(insert spurious space) (default 0.0 = disabled).
+    **kwargs
+        Additional parameters passed to base Glitchling.
+
+    Examples
+    --------
+    Basic usage with default parameters:
+
+    >>> scan = Scannequin(rate=0.02, seed=42)
+    >>> scan("The cat sat on the mat")
+    'The cat sat on the rnat'
+
+    Using a quality preset:
+
+    >>> fax_scan = Scannequin(preset="fax", seed=42)
+    >>> fax_scan("Hello world, this is a test document.")
+    'He1lo vvorld, thls is a testdocument.'
+
+    Enabling burst mode for realistic degradation:
+
+    >>> degraded = Scannequin(rate=0.03, burst_enter=0.1, burst_exit=0.2, seed=42)
+    >>> degraded("Some regions will have clustered errors like smudges.")
+    'Sorne regions will have dustered errors Iike srnudges.'
+
+    References
+    ----------
+    - Kolak & Resnik (2002) - Noisy-channel OCR error modeling
+    - Kanungo et al. (1994) - "Nonlinear Global and Local Document Degradation Models"
+    - Li, Lopresti, Nagy, Tompkins (1996) - "Validation of Image Defect Models for OCR"
+    - Rice et al. / UNLV-ISRI Annual Tests (1995)
+    - Smith (2007) - Tesseract OCR architecture
+    """
+
+    flavor = "Isn't it weird how the word 'bed' looks like a bed?"
+
+    def __init__(
+        self,
+        *,
+        rate: float | None = None,
+        seed: int | None = None,
+        preset: PresetName | None = None,
+        burst_enter: float | None = None,
+        burst_exit: float | None = None,
+        burst_multiplier: float | None = None,
+        bias_k: int | None = None,
+        bias_beta: float | None = None,
+        space_drop_rate: float | None = None,
+        space_insert_rate: float | None = None,
+        **kwargs: Any,
+    ) -> None:
+        # If preset is specified, load parameters from it
+        if preset is not None:
+            if preset not in SCANNEQUIN_PRESETS:
+                valid_presets = ", ".join(sorted(SCANNEQUIN_PRESETS.keys()))
+                msg = f"Unknown preset '{preset}'. Valid presets: {valid_presets}"
+                raise ValueError(msg)
+
+            (
+                preset_rate,
+                preset_burst_enter,
+                preset_burst_exit,
+                preset_burst_multiplier,
+                preset_bias_k,
+                preset_bias_beta,
+                preset_space_drop_rate,
+                preset_space_insert_rate,
+            ) = SCANNEQUIN_PRESETS[preset]
+
+            # Preset values are used as defaults, explicit params override
+            if rate is None:
+                rate = preset_rate
+            if burst_enter is None:
+                burst_enter = preset_burst_enter
+            if burst_exit is None:
+                burst_exit = preset_burst_exit
+            if burst_multiplier is None:
+                burst_multiplier = preset_burst_multiplier
+            if bias_k is None:
+                bias_k = preset_bias_k
+            if bias_beta is None:
+                bias_beta = preset_bias_beta
+            if space_drop_rate is None:
+                space_drop_rate = preset_space_drop_rate
+            if space_insert_rate is None:
+                space_insert_rate = preset_space_insert_rate
+
+        # Apply defaults for any remaining None values
+        effective_rate = DEFAULT_SCANNEQUIN_RATE if rate is None else rate
+        effective_burst_enter = (
+            DEFAULT_SCANNEQUIN_BURST_ENTER if burst_enter is None else burst_enter
+        )
+        effective_burst_exit = DEFAULT_SCANNEQUIN_BURST_EXIT if burst_exit is None else burst_exit
+        effective_burst_multiplier = (
+            DEFAULT_SCANNEQUIN_BURST_MULTIPLIER if burst_multiplier is None else burst_multiplier
+        )
+        effective_bias_k = DEFAULT_SCANNEQUIN_BIAS_K if bias_k is None else bias_k
+        effective_bias_beta = DEFAULT_SCANNEQUIN_BIAS_BETA if bias_beta is None else bias_beta
+        effective_space_drop_rate = (
+            DEFAULT_SCANNEQUIN_SPACE_DROP_RATE if space_drop_rate is None else space_drop_rate
+        )
+        effective_space_insert_rate = (
+            DEFAULT_SCANNEQUIN_SPACE_INSERT_RATE if space_insert_rate is None else space_insert_rate
+        )
+
+        super().__init__(
+            name="Scannequin",
+            corruption_function=ocr_artifacts,
+            # Changed from CHARACTER to DOCUMENT for document-wide consistency
+            scope=AttackWave.DOCUMENT,
+            order=AttackOrder.LATE,
+            seed=seed,
+            rate=effective_rate,
+            burst_enter=effective_burst_enter,
+            burst_exit=effective_burst_exit,
+            burst_multiplier=effective_burst_multiplier,
+            bias_k=effective_bias_k,
+            bias_beta=effective_bias_beta,
+            space_drop_rate=effective_space_drop_rate,
+            space_insert_rate=effective_space_insert_rate,
+            **kwargs,
+        )
+
+        # Store preset name if used
+        self._preset = preset
+
+    @classmethod
+    def from_preset(cls, preset: PresetName, *, seed: int | None = None) -> "Scannequin":
+        """Create a Scannequin instance from a named quality preset.
+
+        Parameters
+        ----------
+        preset : str
+            Quality preset name. One of:
+            - ``"clean_300dpi"``: Clean 300 DPI scan, minimal errors
+            - ``"newspaper"``: Newspaper-quality scan, moderate degradation
+            - ``"fax"``: Fax-quality, high error rate with l/1/I confusion
+            - ``"photocopy_3rd_gen"``: Third-generation photocopy, severe degradation
+        seed : int, optional
+            Deterministic seed for reproducibility.
+
+        Returns
+        -------
+        Scannequin
+            Configured Scannequin instance.
+
+        Examples
+        --------
+        >>> fax = Scannequin.from_preset("fax", seed=42)
+        >>> fax("The quick brown fox")
+        'Tbe quick brovvn fox'
+        """
+        return cls(preset=preset, seed=seed)
+
+    def pipeline_operation(self) -> PipelineOperationPayload:
+        """Return the Rust pipeline descriptor with all OCR parameters."""
+        rate_value = self.kwargs.get("rate", DEFAULT_SCANNEQUIN_RATE)
+        rate = DEFAULT_SCANNEQUIN_RATE if rate_value is None else float(rate_value)
+
+        return cast(
+            PipelineOperationPayload,
+            {
+                "type": "ocr",
+                "rate": rate,
+                "burst_enter": float(
+                    self.kwargs.get("burst_enter", DEFAULT_SCANNEQUIN_BURST_ENTER)
+                ),
+                "burst_exit": float(self.kwargs.get("burst_exit", DEFAULT_SCANNEQUIN_BURST_EXIT)),
+                "burst_multiplier": float(
+                    self.kwargs.get("burst_multiplier", DEFAULT_SCANNEQUIN_BURST_MULTIPLIER)
+                ),
+                "bias_k": int(self.kwargs.get("bias_k", DEFAULT_SCANNEQUIN_BIAS_K)),
+                "bias_beta": float(self.kwargs.get("bias_beta", DEFAULT_SCANNEQUIN_BIAS_BETA)),
+                "space_drop_rate": float(
+                    self.kwargs.get("space_drop_rate", DEFAULT_SCANNEQUIN_SPACE_DROP_RATE)
+                ),
+                "space_insert_rate": float(
+                    self.kwargs.get("space_insert_rate", DEFAULT_SCANNEQUIN_SPACE_INSERT_RATE)
+                ),
+            },
+        )
+
+
+# Default instance for convenience
+scannequin = Scannequin()
+
+
+__all__ = ["Scannequin", "scannequin", "ocr_artifacts", "SCANNEQUIN_PRESETS"]
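The docstrings above describe the burst model as a two-state hidden Markov model: the corruptor walks the text in a clean or harsh state, entering harsh with probability `burst_enter`, leaving with `burst_exit`, and multiplying the base error rate by `burst_multiplier` while harsh. The actual sampling lives in the Rust extension behind `ocr_artifacts_rust`; the sketch below (the `burst_rates` helper is hypothetical, written only to illustrate the model) shows the state walk in plain Python:

```python
import random


def burst_rates(
    n_chars: int,
    base_rate: float,
    burst_enter: float,
    burst_exit: float,
    burst_multiplier: float,
    rng: random.Random,
) -> list[float]:
    """Illustrative Kanungo-style burst HMM: per-character error rates.

    Two states: clean (base_rate) and harsh (base_rate * burst_multiplier).
    This is a sketch of the model described in the docstrings, not the
    package's Rust implementation.
    """
    rates: list[float] = []
    harsh = False
    for _ in range(n_chars):
        # State transition happens per character, so harsh runs form
        # contiguous clusters with expected length 1 / burst_exit.
        if harsh:
            if rng.random() < burst_exit:
                harsh = False
        elif rng.random() < burst_enter:
            harsh = True
        rates.append(base_rate * burst_multiplier if harsh else base_rate)
    return rates
```

With `burst_enter=0.0` (the default) the chain never leaves the clean state and errors stay independent; raising it produces the clustered "smudge" runs that the `fax` and `photocopy_3rd_gen` presets lean on.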
glitchlings/zoo/transforms.py

@@ -0,0 +1,331 @@
+"""Pure text transformation functions.
+
+This module contains text manipulation functions that are:
+- **Pure**: Output depends only on inputs, no side effects
+- **Deterministic**: Same inputs always produce same outputs
+- **Self-contained**: No RNG, no Rust FFI, no config loading
+
+These functions receive pre-validated inputs from boundary layers
+(see validation.py) and trust that inputs are already checked.
+Core transformation code should NOT re-validate parameters.
+
+Design Philosophy
+-----------------
+This module implements the innermost layer of the purity architecture:
+
+    CLI/API  →  validation.py  →  transforms.py  →  Rust FFI
+    (boundary)   (boundary)       (pure core)       (impure)
+
+Functions here should:
+- Accept concrete types (not Optional unless semantically required)
+- Not log, print, or mutate external state
+- Not import impure modules (internal.rust, config loaders, etc.)
+- Document any preconditions callers must satisfy
+
+See AGENTS.md "Functional Purity Architecture" for full details.
+"""
+
+from __future__ import annotations
+
+import re
+from collections.abc import Iterable, Mapping, Sequence
+from dataclasses import dataclass
+from typing import TypeVar, cast
+
+# ---------------------------------------------------------------------------
+# Text Tokenization
+# ---------------------------------------------------------------------------
+
+_WORD_SPLIT_PATTERN = re.compile(r"(\s+)")
+_TOKEN_EDGES_PATTERN = re.compile(r"^(\W*)(.*?)(\W*)$", re.DOTALL)
+
+
+def split_preserving_whitespace(text: str) -> list[str]:
+    """Split text while keeping whitespace tokens for stable reconstruction.
+
+    Returns alternating [word, whitespace, word, whitespace, ...] tokens.
+    Joining the result reconstructs the original text exactly.
+
+    Args:
+        text: Input text to tokenize.
+
+    Returns:
+        List of tokens alternating between non-whitespace and whitespace.
+
+    Example:
+        >>> split_preserving_whitespace("hello world")
+        ['hello', ' ', 'world']
+    """
+    return _WORD_SPLIT_PATTERN.split(text)
+
+
+def split_token_edges(token: str) -> tuple[str, str, str]:
+    """Decompose a token into leading punctuation, core, and trailing punctuation.
+
+    Args:
+        token: A non-whitespace token.
+
+    Returns:
+        Tuple of (prefix, core, suffix) where:
+        - prefix: leading non-word characters
+        - core: central word characters
+        - suffix: trailing non-word characters
+
+    Example:
+        >>> split_token_edges('"Hello!"')
+        ('"', 'Hello', '!"')
+    """
+    match = cast(re.Match[str], _TOKEN_EDGES_PATTERN.match(token))
+    prefix, core, suffix = match.groups()
+    return prefix, core, suffix
+
+
+def compute_core_length(token: str) -> int:
+    """Compute the effective length of a token's core for weighting heuristics.
+
+    Used by weighted sampling algorithms to prioritize longer words.
+    Always returns at least 1 to avoid zero-weight issues.
+
+    Args:
+        token: A non-whitespace token.
+
+    Returns:
+        Positive integer representing the token's effective length.
+    """
+    _, core, _ = split_token_edges(token)
+    if core:
+        return len(core)
+    stripped = token.strip()
+    if stripped:
+        return len(stripped)
+    if token:
+        return len(token)
+    return 1
+
+
+@dataclass(frozen=True)
+class WordToken:
+    """Metadata describing a non-whitespace token from text tokenization.
+
+    Attributes:
+        index: Position in the parent token sequence.
+        prefix: Leading non-word characters (punctuation).
+        core: Central word characters.
+        suffix: Trailing non-word characters (punctuation).
+        core_length: Effective length for weighting (always >= 1).
+    """
+
+    index: int
+    prefix: str
+    core: str
+    suffix: str
+    core_length: int
+
+    @property
+    def has_core(self) -> bool:
+        """Return True when the token contains at least one core character."""
+        return bool(self.core)
+
+
+def collect_word_tokens(
+    tokens: Sequence[str],
+    *,
+    skip_first_word: bool = False,
+) -> list[WordToken]:
+    """Extract structured metadata for non-whitespace tokens.
+
+    Args:
+        tokens: Token sequence from split_preserving_whitespace.
+        skip_first_word: If True, exclude the first content token
+            (useful for preserving leading words in delete operations).
+
+    Returns:
+        List of WordToken instances for each non-whitespace token.
+    """
+    start = 2 if skip_first_word else 0
+    collected: list[WordToken] = []
+
+    for index in range(start, len(tokens), 2):
+        token = tokens[index]
+        if not token or token.isspace():
+            continue
+
+        prefix, core, suffix = split_token_edges(token)
+        core_length = compute_core_length(token)
+
+        collected.append(
+            WordToken(
+                index=index,
+                prefix=prefix,
+                core=core,
+                suffix=suffix,
+                core_length=core_length,
+            )
+        )
+
+    return collected
+
+
+def reassemble_tokens(tokens: Sequence[str]) -> str:
+    """Join tokens back into text, preserving original structure.
+
+    Args:
+        tokens: Token sequence (typically modified from split_preserving_whitespace).
+
+    Returns:
+        Reassembled text string.
+    """
+    return "".join(tokens)
+
+
+# ---------------------------------------------------------------------------
+# String Difference Computation
+# ---------------------------------------------------------------------------
+
+
+def compute_string_diffs(
+    original: str,
+    modified: str,
+) -> list[list[tuple[str, str, str]]]:
+    """Compare two strings and return grouped adjacent change operations.
+
+    Uses difflib's SequenceMatcher to identify changes between strings.
+    Consecutive changes are grouped together; equal regions are skipped.
+
+    Args:
+        original: The original string.
+        modified: The modified string.
+
+    Returns:
+        List of change groups. Each group is a list of (tag, old_text, new_text)
+        tuples where tag is 'replace', 'delete', or 'insert'.
+
+    Example:
+        >>> compute_string_diffs("hello world", "helo worlds")
+        [[('delete', 'l', '')], [('replace', '', 's')]]
+    """
+    import difflib
+
+    sm = difflib.SequenceMatcher(None, original, modified)
+    ops: list[list[tuple[str, str, str]]] = []
+    buffer: list[tuple[str, str, str]] = []
+
+    for tag, i1, i2, j1, j2 in sm.get_opcodes():
+        if tag == "equal":
+            if buffer:
+                ops.append(buffer)
+                buffer = []
+            continue
+        buffer.append((tag, original[i1:i2], modified[j1:j2]))
+
+    if buffer:
+        ops.append(buffer)
+
+    return ops
+
+
+# ---------------------------------------------------------------------------
+# Sequence Operations
+# ---------------------------------------------------------------------------
+
+T = TypeVar("T")
+
+
+def stable_deduplicate(items: Iterable[T]) -> list[T]:
+    """Remove duplicates while preserving original order.
+
+    Args:
+        items: Iterable of hashable items.
+
+    Returns:
+        List with duplicates removed, first occurrence preserved.
+
+    Example:
+        >>> stable_deduplicate([3, 1, 4, 1, 5, 9, 2, 6, 5])
+        [3, 1, 4, 5, 9, 2, 6]
+    """
+    seen: set[T] = set()
+    result: list[T] = []
+    for item in items:
+        if item not in seen:
+            seen.add(item)
+            result.append(item)
+    return result
+
+
+def interleave_lists(
+    primary: Sequence[T],
+    secondary: Sequence[T],
+    *,
+    secondary_first: bool = False,
+) -> list[T]:
+    """Interleave two sequences, padding shorter with empty slots.
+
+    Args:
+        primary: First sequence.
+        secondary: Second sequence.
+        secondary_first: If True, start with secondary element.
+
+    Returns:
+        Interleaved list [p0, s0, p1, s1, ...] or [s0, p0, s1, p1, ...].
+    """
+    result: list[T] = []
+    max_len = max(len(primary), len(secondary))
+
+    for i in range(max_len):
+        if secondary_first:
+            if i < len(secondary):
+                result.append(secondary[i])
+            if i < len(primary):
+                result.append(primary[i])
+        else:
+            if i < len(primary):
+                result.append(primary[i])
+            if i < len(secondary):
+                result.append(secondary[i])
+
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Mapping Helpers
+# ---------------------------------------------------------------------------
+
+
+def invert_mapping(
+    mapping: Mapping[str, Sequence[str]],
+) -> dict[str, str]:
+    """Invert a one-to-many mapping into a many-to-one lookup.
+
+    Given {key: [val1, val2]}, returns {val1: key, val2: key}.
+    Later keys overwrite earlier ones if values collide.
+
+    Args:
+        mapping: Dictionary mapping keys to sequences of values.
+
+    Returns:
+        Inverted dictionary mapping each value to its key.
+    """
+    inverted: dict[str, str] = {}
+    for key, values in mapping.items():
+        for value in values:
+            inverted[value] = key
+    return inverted
+
+
+__all__ = [
+    # Tokenization
+    "split_preserving_whitespace",
+    "split_token_edges",
+    "compute_core_length",
+    "WordToken",
+    "collect_word_tokens",
+    "reassemble_tokens",
+    # Diffs
+    "compute_string_diffs",
+    # Sequences
+    "stable_deduplicate",
+    "interleave_lists",
+    # Mappings
+    "invert_mapping",
+]
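Taken together, the tokenization helpers form a lossless round-trip: `split_preserving_whitespace` keeps whitespace runs at the odd indices, `collect_word_tokens` records each word's punctuation edges, and `reassemble_tokens` joins everything back byte-for-byte. A minimal sketch of how a caller might mutate one word while preserving punctuation and spacing (the uppercase step is a placeholder corruption, not anything the package does):

```python
from glitchlings.zoo.transforms import (
    collect_word_tokens,
    reassemble_tokens,
    split_preserving_whitespace,
)

text = '"Hello, world!"'
tokens = split_preserving_whitespace(text)   # ['"Hello,', ' ', 'world!"']
words = collect_word_tokens(tokens)

# Rewrite only the core of the first word; its punctuation edges survive.
first = words[0]                             # prefix='"', core='Hello', suffix=','
tokens[first.index] = first.prefix + first.core.upper() + first.suffix

print(reassemble_tokens(tokens))             # '"HELLO, world!"'
```

Because only the selected token index is touched, every other token, including the whitespace runs, passes through unchanged, which is what lets corruption functions report precise diffs via `compute_string_diffs` afterwards.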