mrz-genius 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +270 -0
- package/package.json +47 -0
- package/src/detector/mrzDetector.js +214 -0
- package/src/index.d.ts +141 -0
- package/src/index.js +150 -0
- package/src/ocr/llmExtractor.js +146 -0
- package/src/ocr/mrzOCR.js +489 -0
- package/src/parser/checkDigit.js +84 -0
- package/src/parser/fieldPositions.js +122 -0
- package/src/parser/mrzParser.js +487 -0
- package/src/parser/ocrCorrector.js +172 -0
|
@@ -0,0 +1,489 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MRZ OCR Engine
|
|
3
|
+
* Uses Tesseract.js to perform OCR on MRZ regions
|
|
4
|
+
* Applies MRZ-specific post-processing for better accuracy
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
'use strict';
|
|
8
|
+
|
|
9
|
+
const Tesseract = require('tesseract.js');
|
|
10
|
+
const sharp = require('sharp');
|
|
11
|
+
const { detectMRZRegion, optimizeForOCR } = require('../detector/mrzDetector');
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* MRZ-specific character whitelist
|
|
15
|
+
*/
|
|
16
|
+
const MRZ_CHARSET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789<';
|
|
17
|
+
|
|
18
|
+
/**
|
|
19
|
+
* Characters commonly misread by OCR as '<' filler
|
|
20
|
+
* K, L, |, [, ], {, }, (, ), \, /, _, -, ., ,, :, ; and ! are often mistaken for '<'
|
|
21
|
+
*/
|
|
22
|
+
const FILLER_MISREADS = /[KL\|\[\]\{\}\(\)\\\/_\-\.,:;!]/g;
|
|
23
|
+
|
|
24
|
+
/**
 * Normalize one raw OCR line into the MRZ alphabet.
 *
 * The line is upper-cased and stripped of all whitespace, every character
 * outside `A-Z`, `0-9` and `<` is turned into the `<` filler, and the result
 * is passed through `fixFillerMisreads` so that K/L characters that look like
 * misread fillers are replaced as well.
 *
 * @param {string} line - Raw OCR line
 * @returns {string} Cleaned line
 */
function cleanMRZLine(line) {
  // Whitespace is dropped everywhere in the line, not only at its edges.
  const compact = line.toUpperCase().replace(/\s+/g, '').trim();

  // Anything that is not part of the MRZ alphabet becomes a filler.
  const mrzAlphabetOnly = compact.replace(/[^A-Z0-9<]/g, '<');

  // e.g. "472<<K<<<K<K<LLKKLKLE" -> "472<<<<<<<<<<<<<<<<<E"
  return fixFillerMisreads(mrzAlphabetOnly);
}
|
|
45
|
+
|
|
46
|
+
/**
 * Replace K/L characters that are most likely misread '<' fillers.
 *
 * A 'K' or 'L' is rewritten to '<' when either:
 *  - the character directly before or after it in the input is '<', 'K' or
 *    'L' (the line edges count as '<'); or
 *  - it sits past 60% of the line and fewer than three letters other than
 *    K/L remain from its position to the end of the line.
 *
 * Neighbours are always read from the input, never from the partially
 * rewritten output. All other characters are returned unchanged.
 *
 * @param {string} line
 * @returns {string}
 */
function fixFillerMisreads(line) {
  const chars = [...line];
  const total = chars.length;
  const isFillerLike = (ch) => ch === '<' || ch === 'K' || ch === 'L';

  // lettersFrom[i] = number of A-Z characters other than K/L in chars[i..].
  // Built once, back to front, instead of re-scanning the tail per K/L.
  const lettersFrom = new Array(total + 1).fill(0);
  for (let i = total - 1; i >= 0; i--) {
    const ch = chars[i];
    const isOtherLetter = /[A-Z]/.test(ch) && ch !== 'K' && ch !== 'L';
    lettersFrom[i] = lettersFrom[i + 1] + (isOtherLetter ? 1 : 0);
  }

  const tailStart = total * 0.6;

  return chars
    .map((ch, i) => {
      if (ch !== 'K' && ch !== 'L') {
        return ch;
      }

      const prev = i > 0 ? chars[i - 1] : '<';
      const next = i < total - 1 ? chars[i + 1] : '<';
      if (isFillerLike(prev) || isFillerLike(next)) {
        return '<';
      }

      // Fillers usually pad the right side of a line, so a K/L in a tail
      // with hardly any other letters is treated as a filler too.
      if (i > tailStart && lettersFrom[i] < 3) {
        return '<';
      }

      return ch;
    })
    .join('');
}
|
|
83
|
+
|
|
84
|
+
/**
 * Turn raw OCR text into a list of cleaned, length-normalized MRZ lines.
 *
 * Every input line is cleaned with `cleanMRZLine` and empty results are
 * dropped. If at least two cleaned lines fall inside one of the MRZ length
 * windows (28-32, 34-38, 42-46) only those are kept. When two or more lines
 * remain, each one is padded with '<' or truncated to the length reported by
 * `determineExpectedLineLength`.
 *
 * @param {string} text - Raw OCR text
 * @returns {string[]} Cleaned MRZ lines
 */
function postProcessOCR(text) {
  const cleaned = [];
  for (const raw of text.split('\n')) {
    const line = cleanMRZLine(raw);
    if (line.length > 0) {
      cleaned.push(line);
    }
  }

  // Windows around the standard lengths: TD1 = 30, TD2/MRVB = 36, TD3/MRVA = 44.
  const hasMrzLength = ({ length }) =>
    (length >= 28 && length <= 32) ||
    (length >= 34 && length <= 38) ||
    (length >= 42 && length <= 46);

  const plausible = cleaned.filter(hasMrzLength);
  const selected = plausible.length >= 2 ? plausible : cleaned;

  if (selected.length < 2) {
    return selected;
  }

  const target = determineExpectedLineLength(selected);
  if (!target) {
    return selected;
  }

  return selected.map((line) => {
    if (line.length < target) {
      return line + '<'.repeat(target - line.length);
    }
    if (line.length > target) {
      return line.substring(0, target);
    }
    return line;
  });
}
|
|
123
|
+
|
|
124
|
+
/**
 * Determine the standard MRZ line length that best matches the scanned lines.
 *
 * Each line votes for the standard length (30, 36 or 44) nearest to its own
 * length; the standard with the most votes wins. Ties, both in distance and
 * in vote count, resolve to the smaller standard length.
 *
 * @param {string[]} lines
 * @returns {number|null} 30, 36 or 44, or null when `lines` is empty
 */
function determineExpectedLineLength(lines) {
  const STANDARD_LENGTHS = [30, 36, 44];

  // Keyed by number so the winner never has to be parsed back from a string.
  const votes = new Map(STANDARD_LENGTHS.map((std) => [std, 0]));

  for (const { length } of lines) {
    let nearest = STANDARD_LENGTHS[0];
    for (const std of STANDARD_LENGTHS) {
      // Strict comparison keeps the smaller standard on equal distance.
      if (Math.abs(length - std) < Math.abs(length - nearest)) {
        nearest = std;
      }
    }
    votes.set(nearest, votes.get(nearest) + 1);
  }

  let best = null;
  let bestCount = 0;
  // The map iterates in ascending standard order, so ties keep the smaller one.
  for (const [std, count] of votes) {
    if (count > bestCount) {
      best = std;
      bestCount = count;
    }
  }

  return best;
}
|
|
161
|
+
|
|
162
|
+
/**
 * Score how much a cleaned line looks like an MRZ line.
 *
 * Points awarded:
 *  - +50 for a length of exactly 30, 36 or 44; +25 when within 2 of one of
 *    those; a line that is neither and is shorter than 20 scores -100 outright
 *  - +20 when it starts with P, I, A, C or V followed by '<' or a letter
 *  - +20 when it contains '<<'
 *  - +15 when it has both digits and letters, +10 for more than 3 digits
 *  - +10 when fillers make up more than 5% and less than 70% of the line,
 *    -20 when they make up more than 90%
 *  - +5 when it contains three consecutive letters
 *  - -30 when it contains an all-letter run longer than 8 characters and has
 *    fewer than 3 fillers (looks like ordinary text)
 *
 * @param {string} line - Cleaned line
 * @returns {number} Score (higher = more likely MRZ)
 */
function scoreMRZLine(line) {
  const STANDARD_LENGTHS = [30, 36, 44];
  const len = line.length;
  let score = 0;

  // Length matching
  if (STANDARD_LENGTHS.includes(len)) {
    score += 50;
  } else if (STANDARD_LENGTHS.some((std) => Math.abs(len - std) <= 2)) {
    score += 25;
  } else if (len < 20) {
    return -100;
  }

  // Document type indicator at the start, and the '<<' name separator
  if (/^[PIACV][<A-Z]/.test(line)) {
    score += 20;
  }
  if (line.includes('<<')) {
    score += 20;
  }

  // Tally the three character classes in a single pass.
  let digitCount = 0;
  let letterCount = 0;
  let fillerCount = 0;
  for (const ch of line) {
    if (ch >= '0' && ch <= '9') {
      digitCount += 1;
    } else if (ch >= 'A' && ch <= 'Z') {
      letterCount += 1;
    } else if (ch === '<') {
      fillerCount += 1;
    }
  }

  if (digitCount > 0 && letterCount > 0) {
    score += 15;
  }
  if (digitCount > 3) {
    score += 10;
  }

  // Some fillers are expected; a line of nearly nothing but fillers is noise.
  const fillerRatio = fillerCount / Math.max(len, 1);
  if (fillerRatio > 0.05 && fillerRatio < 0.7) {
    score += 10;
  }
  if (fillerRatio > 0.9) {
    score -= 20;
  }

  // Three letters in a row, e.g. a country code
  if (/[A-Z]{3}/.test(line)) {
    score += 5;
  }

  // Long all-letter runs with hardly any fillers read like regular text.
  const hasLongWord = line
    .replace(/</g, ' ')
    .trim()
    .split(/\s+/)
    .some((word) => word.length > 8 && /^[A-Z]+$/.test(word));
  if (hasLongWord && fillerCount < 3) {
    score -= 30;
  }

  return score;
}
|
|
206
|
+
|
|
207
|
+
/**
 * Extract MRZ lines from full-page OCR text using pattern scoring.
 *
 * Lines are cleaned with `cleanMRZLine`, lines of 15 characters or fewer are
 * dropped, and the rest are kept as candidates when `scoreMRZLine` gives them
 * at least 20 points. Candidates stay in document order.
 *
 * Format selection runs in two passes over TD1 (3 x 30), TD2/MRVB (2 x 36)
 * and TD3/MRVA (2 x 44), in that order:
 *  1. strict: candidate length within 2 of the format's line length, the same
 *     window `postProcessOCR` uses;
 *  2. loose: the wide legacy windows (within 15 / 18 / 22 respectively).
 * The strict pass exists because the loose TD1 window (15-45 characters)
 * also accepts 44-character passport lines; without it two TD3 lines plus any
 * third candidate were returned as three lines truncated to 30 characters.
 *
 * The last `lineCount` matching candidates are returned, padded with '<' or
 * truncated to the format's line length. If no format matches, the last
 * three (or two) candidates are returned as they are.
 *
 * @param {string} fullText - Full OCR text
 * @returns {string[]|null} MRZ lines, or null when fewer than two candidates exist
 */
function extractMRZFromFullText(fullText) {
  const MIN_SCORE = 20;
  const STRICT_TOLERANCE = 2;
  const FORMATS = [
    { lineLength: 30, lineCount: 3, looseTolerance: 15 }, // TD1
    { lineLength: 36, lineCount: 2, looseTolerance: 18 }, // TD2 / MRVB
    { lineLength: 44, lineCount: 2, looseTolerance: 22 }, // TD3 / MRVA
  ];

  // Filtering in place keeps document order; no sort-and-restore is needed.
  const candidates = fullText
    .split('\n')
    .map((raw) => cleanMRZLine(raw))
    .filter((line) => line.length > 15)
    .filter((line) => scoreMRZLine(line) >= MIN_SCORE);

  if (candidates.length < 2) {
    return null;
  }

  // Returns the fitted lines of the first format with enough matches, or null.
  const pickFormat = (toleranceOf) => {
    for (const format of FORMATS) {
      const { lineLength, lineCount } = format;
      const tolerance = toleranceOf(format);
      const matching = candidates.filter(
        (line) => Math.abs(line.length - lineLength) <= tolerance
      );
      if (matching.length >= lineCount) {
        return matching
          .slice(-lineCount)
          .map((line) => (line + '<'.repeat(lineLength)).substring(0, lineLength));
      }
    }
    return null;
  };

  const picked =
    pickFormat(() => STRICT_TOLERANCE) ||
    pickFormat((format) => format.looseTolerance);
  if (picked) {
    return picked;
  }

  // Fallback: no format window matched; hand back the trailing candidates.
  return candidates.slice(candidates.length >= 3 ? -3 : -2);
}
|
|
265
|
+
|
|
266
|
+
/**
 * Produce several pre-processed renditions of an image for OCR.
 *
 * The source width and height are read once via `sharp(...).metadata()`; a
 * failure there is not caught and rejects the returned promise. Each
 * rendition is then built independently and sequentially, and a rendition
 * that fails is skipped silently, so the result may hold fewer than five
 * entries (possibly none).
 *
 * Renditions, in order:
 *  - `threshold_140_3x`: grayscale, normalize, sharpen, threshold 140, 3x
 *  - `threshold_100_3x`: grayscale, normalize, threshold 100, 3x (lighter prints)
 *  - `threshold_170_3x`: grayscale, normalize, threshold 170, 3x (heavy prints)
 *  - `sharp_4x`: grayscale, normalize, sharpen, 4x, no threshold
 *  - `inverted_140_3x`: grayscale, negate, normalize, threshold 140, 3x
 *
 * @param {Buffer} imageBuffer - Source image
 * @returns {Promise<{buffer: Buffer, name: string}[]>}
 */
async function prepareImageVariants(imageBuffer) {
  const { width, height } = await sharp(imageBuffer).metadata();

  // Each recipe lists the steps applied before the final upscale.
  const recipes = [
    {
      name: 'threshold_140_3x',
      scale: 3,
      prepare: (img) => img.grayscale().normalize().sharpen({ sigma: 2 }).threshold(140),
    },
    {
      name: 'threshold_100_3x',
      scale: 3,
      prepare: (img) => img.grayscale().normalize().threshold(100),
    },
    {
      name: 'threshold_170_3x',
      scale: 3,
      prepare: (img) => img.grayscale().normalize().threshold(170),
    },
    {
      name: 'sharp_4x',
      scale: 4,
      prepare: (img) => img.grayscale().normalize().sharpen({ sigma: 3 }),
    },
    {
      name: 'inverted_140_3x',
      scale: 3,
      prepare: (img) => img.grayscale().negate().normalize().threshold(140),
    },
  ];

  const variants = [];
  for (const { name, scale, prepare } of recipes) {
    try {
      const buffer = await prepare(sharp(imageBuffer))
        .resize(width * scale, height * scale, { kernel: sharp.kernel.lanczos3 })
        .toBuffer();
      variants.push({ buffer, name });
    } catch (e) { /* skip */ }
  }

  return variants;
}
|
|
336
|
+
|
|
337
|
+
/**
 * Rate an OCR attempt so that competing attempts can be compared.
 *
 * Starts from the engine confidence and adds 30 for exactly three lines or
 * 20 for exactly two, 15 for every line whose length is 30, 36 or 44, and
 * half of each line's `scoreMRZLine` score.
 *
 * @param {string[]} lines - Post-processed MRZ lines
 * @param {number} confidence - Tesseract confidence
 * @returns {number}
 */
function scoreOCRResult(lines, confidence) {
  const STANDARD_LENGTHS = [30, 36, 44];
  let total = confidence;

  // Line-count bonus: three lines (TD1) or two lines (other formats).
  switch (lines.length) {
    case 3:
      total += 30;
      break;
    case 2:
      total += 20;
      break;
    default:
      break;
  }

  // Exact standard lengths first, MRZ-likeness second.
  for (const { length } of lines) {
    if (STANDARD_LENGTHS.includes(length)) {
      total += 15;
    }
  }
  for (const line of lines) {
    total += scoreMRZLine(line) * 0.5;
  }

  return total;
}
|
|
362
|
+
|
|
363
|
+
/**
 * Perform OCR on an image to extract MRZ text.
 *
 * A Tesseract worker is created for the call and always terminated before
 * the function settles. Unless `detectRegion` is false, the bottom 30%, 40%,
 * 25% and 50% of the image are tried in that order via `detectMRZRegion`;
 * every crop is expanded into the renditions from `prepareImageVariants` and
 * recognized. The attempt with the highest `scoreOCRResult` that yields at
 * least two lines is kept, and the crop loop stops early once the best score
 * exceeds 150. If no attempt produced two lines, the full image is processed
 * the same way, preferring `extractMRZFromFullText` and falling back to
 * `postProcessOCR`.
 *
 * Failures of a single crop or rendition are skipped. Failures creating or
 * configuring the worker, or preparing the full-image renditions, reject the
 * returned promise.
 *
 * @param {Buffer|string} input - Image buffer or file path
 * @param {Object} [options] - OCR options
 * @param {string} [options.lang='eng'] - Tesseract language
 * @param {boolean} [options.detectRegion=true] - Auto-detect MRZ region; when
 *   false the region crops are skipped and only the full image is recognized
 * @returns {Promise<Object>} `{ lines, rawText, confidence, method }`; `lines`
 *   is empty and `method` is 'none' when nothing usable was found
 */
async function performOCR(input, options = {}) {
  const {
    lang = 'eng',
    detectRegion = true,
  } = options;

  // Step 1: Create Tesseract worker
  const worker = await Tesseract.createWorker(lang, 1);

  let bestResult = { lines: [], rawText: '', confidence: 0, method: 'none' };
  let bestScore = -Infinity;

  // Keep an attempt only if it has at least two lines and beats the best so far.
  const consider = (lines, data, method) => {
    if (lines.length < 2) {
      return;
    }
    const score = scoreOCRResult(lines, data.confidence);
    if (score > bestScore) {
      bestScore = score;
      bestResult = {
        lines,
        rawText: data.text,
        confidence: data.confidence,
        method,
      };
    }
  };

  try {
    // Configured inside the try so a failure here still terminates the worker.
    await worker.setParameters({
      tessedit_char_whitelist: MRZ_CHARSET,
      tessedit_pageseg_mode: Tesseract.PSM.SINGLE_BLOCK,
      preserve_interword_spaces: '0',
    });

    // Step 2: Try MRZ region crops at different bottom percentages
    if (detectRegion) {
      const bottomPercents = [30, 40, 25, 50];

      for (const pct of bottomPercents) {
        try {
          const region = await detectMRZRegion(input, { bottomPercent: pct });
          if (!region) continue;

          const variants = await prepareImageVariants(region.imageBuffer);

          for (const variant of variants) {
            try {
              const { data } = await worker.recognize(variant.buffer);
              consider(
                postProcessOCR(data.text),
                data,
                `region_${pct}pct_${variant.name}`
              );
            } catch (e) { /* skip variant */ }
          }

          // Early exit if we have a great result
          if (bestScore > 150) break;
        } catch (e) { /* try next crop */ }
      }
    }

    // Step 3: If no good result from region crops, try full image
    if (bestResult.lines.length < 2) {
      const fullImage = typeof input === 'string'
        ? await sharp(input).toBuffer()
        : input;
      const variants = await prepareImageVariants(fullImage);

      for (const variant of variants) {
        try {
          const { data } = await worker.recognize(variant.buffer);

          let extracted = extractMRZFromFullText(data.text);
          if (!extracted || extracted.length < 2) {
            extracted = postProcessOCR(data.text);
          }

          consider(extracted, data, `full_image_${variant.name}`);
        } catch (e) { /* skip */ }
      }
    }

    // The former "last resort" re-extraction from bestResult.rawText was
    // removed: rawText is only ever set together with at least two lines, so
    // that branch could never run.
    return bestResult;
  } finally {
    await worker.terminate();
  }
}
|
|
467
|
+
|
|
468
|
+
/**
 * Quick check whether an image appears to contain an MRZ.
 *
 * Runs `performOCR` with region detection enabled and reports whether it
 * produced at least two lines. Any error raised along the way is swallowed
 * and reported as "no MRZ".
 *
 * @param {Buffer|string} input - Image buffer or file path
 * @returns {Promise<boolean>} True if MRZ is likely present
 */
async function hasMRZ(input) {
  let found = false;
  try {
    const { lines } = await performOCR(input, { detectRegion: true });
    found = lines.length >= 2;
  } catch {
    found = false;
  }
  return found;
}
|
|
481
|
+
|
|
482
|
+
module.exports = {
|
|
483
|
+
performOCR,
|
|
484
|
+
postProcessOCR,
|
|
485
|
+
extractMRZFromFullText,
|
|
486
|
+
cleanMRZLine,
|
|
487
|
+
hasMRZ,
|
|
488
|
+
MRZ_CHARSET,
|
|
489
|
+
};
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MRZ Check Digit Calculator & Validator
|
|
3
|
+
* Based on ICAO 9303 standard
|
|
4
|
+
*
|
|
5
|
+
* Weight pattern: 7, 3, 1 (repeating)
|
|
6
|
+
* Character values:
|
|
7
|
+
* 0-9 → 0-9
|
|
8
|
+
* A-Z → 10-35
|
|
9
|
+
* < → 0
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
'use strict';
|
|
13
|
+
|
|
14
|
+
/**
 * Get the numeric value of an MRZ character for check-digit computation.
 *
 * '<' is 0, digits are 0-9, and letters are 10-35. Lowercase letters are
 * tolerated and valued like their uppercase form. Only the first UTF-16 unit
 * of a longer string is inspected.
 *
 * @param {string} char - Single character
 * @returns {number|null} Numeric value, or null for an invalid character,
 *   an empty string, or a non-string argument
 */
function getCharValue(char) {
  // Non-strings (null, undefined, numbers) are invalid rather than a crash.
  if (typeof char !== 'string' || char.length === 0) return null;

  if (char === '<') return 0;

  const code = char.charCodeAt(0);

  // 0-9
  if (code >= 48 && code <= 57) return code - 48;

  // A-Z -> 10-35
  if (code >= 65 && code <= 90) return code - 55;

  // a-z -> 10-35 (lowercase tolerated)
  if (code >= 97 && code <= 122) return code - 87;

  return null;
}
|
|
35
|
+
|
|
36
|
+
/**
 * Calculate the check digit for an MRZ string.
 *
 * Each character's value (see `getCharValue`) is multiplied by the repeating
 * weights 7, 3, 1 and the products are summed; the check digit is that sum
 * modulo 10. An empty string yields 0.
 *
 * @param {string} value - MRZ string to calculate check digit for
 * @returns {number|null} Check digit (0-9) or null if any character is invalid
 */
function calculateCheckDigit(value) {
  const WEIGHTS = [7, 3, 1];
  let weightedSum = 0;
  let index = 0;

  while (index < value.length) {
    const charValue = getCharValue(value[index]);
    if (charValue === null) {
      // One invalid character invalidates the whole field.
      return null;
    }
    weightedSum += charValue * WEIGHTS[index % 3];
    index += 1;
  }

  return weightedSum % 10;
}
|
|
53
|
+
|
|
54
|
+
/**
 * Validate an MRZ field value against its check digit.
 *
 * The comparison is strict, so `checkDigit` must be a number. A value whose
 * check digit cannot be computed (it contains an invalid character) is never
 * valid, even when the expected check digit is itself null.
 *
 * @param {string} rawValue - MRZ string to validate
 * @param {number} checkDigit - Expected check digit
 * @returns {boolean} True if valid
 */
function isCheckDigitValid(rawValue, checkDigit) {
  const calculated = calculateCheckDigit(rawValue);

  // Without this guard, null === null would report an unreadable field as valid.
  if (calculated === null) {
    return false;
  }

  return calculated === checkDigit;
}
|
|
64
|
+
|
|
65
|
+
/**
 * Validate a composite check digit (used for final line validation).
 *
 * The fields are concatenated in the given order, each raw value followed by
 * its own check digit when that digit is neither null nor undefined, and the
 * resulting string is checked against `finalCheckDigit`.
 *
 * @param {Array<{rawValue: string, checkDigit: number|null}>} fields - Fields to composite
 * @param {number} finalCheckDigit - Final check digit to validate against
 * @returns {boolean} True if valid
 */
function isCompositeValid(fields, finalCheckDigit) {
  let composite = '';

  for (const field of fields) {
    const digit = field.checkDigit;
    const hasDigit = !(digit === null || digit === undefined);
    composite = composite + field.rawValue + (hasDigit ? String(digit) : '');
  }

  return isCheckDigitValid(composite, finalCheckDigit);
}
|
|
78
|
+
|
|
79
|
+
module.exports = {
|
|
80
|
+
getCharValue,
|
|
81
|
+
calculateCheckDigit,
|
|
82
|
+
isCheckDigitValid,
|
|
83
|
+
isCompositeValid
|
|
84
|
+
};
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MRZ Field Positions
|
|
3
|
+
* Defines the position of each field for each MRZ format (TD1, TD2, TD3, MRVA, MRVB)
|
|
4
|
+
* Based on ICAO 9303 standard
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
'use strict';
|
|
8
|
+
|
|
9
|
+
/**
|
|
10
|
+
* MRZ Format specifications
|
|
11
|
+
*/
|
|
12
|
+
const MRZ_FORMATS = {
|
|
13
|
+
TD1: { lineLength: 30, linesCount: 3 },
|
|
14
|
+
TD2: { lineLength: 36, linesCount: 2 },
|
|
15
|
+
TD3: { lineLength: 44, linesCount: 2 },
|
|
16
|
+
// MRVA and MRVB use the same line lengths as TD3 and TD2
|
|
17
|
+
MRVA: { lineLength: 44, linesCount: 2 },
|
|
18
|
+
MRVB: { lineLength: 36, linesCount: 2 },
|
|
19
|
+
};
|
|
20
|
+
|
|
21
|
+
/**
|
|
22
|
+
* Field position definition
|
|
23
|
+
* @typedef {Object} FieldPosition
|
|
24
|
+
* @property {number} line - Line index (0-based)
|
|
25
|
+
* @property {number} start - Start position (inclusive)
|
|
26
|
+
* @property {number} end - End position (exclusive)
|
|
27
|
+
* @property {boolean} hasCheckDigit - Whether the field is followed by a check digit
|
|
28
|
+
* @property {'digits'|'letters'|'mixed'|'sex'} contentType - Expected content type
|
|
29
|
+
*/
|
|
30
|
+
|
|
31
|
+
/**
 * Get the field positions for a given MRZ format.
 *
 * Every entry has the shape `{ line, start, end, hasCheckDigit, contentType }`
 * with a 0-based line index, an inclusive start and an exclusive end. When
 * `hasCheckDigit` is true the check digit sits at position `end`. A new
 * object is built on every call.
 *
 * TD2, TD3, MRVA and MRVB share one two-line layout and differ only in line
 * length, where the optional data ends, whether the optional data carries a
 * check digit, and whether a final check digit exists (the visa formats have
 * none). TD1 uses its own three-line layout.
 *
 * @param {string} format - MRZ format ('TD1', 'TD2', 'TD3', 'MRVA', 'MRVB')
 * @returns {Object} Field positions, or null for an unknown format
 */
function getFieldPositions(format) {
  const at = (line, start, end, contentType, hasCheckDigit = false) => ({
    line,
    start,
    end,
    hasCheckDigit,
    contentType,
  });

  // The first five characters of line 0 are laid out the same in every format.
  const header = () => ({
    documentType: at(0, 0, 1, 'letters'),
    documentSubtype: at(0, 1, 2, 'letters'),
    issuingCountry: at(0, 2, 5, 'letters'),
  });

  const twoLineLayout = (lineLength, optionalEnd, optionalHasCheckDigit, hasFinalCheckDigit) => {
    const layout = {
      ...header(),
      name: at(0, 5, lineLength, 'letters'),
      documentNumber: at(1, 0, 9, 'mixed', true),
      nationality: at(1, 10, 13, 'letters'),
      birthDate: at(1, 13, 19, 'digits', true),
      sex: at(1, 20, 21, 'sex'),
      expiryDate: at(1, 21, 27, 'digits', true),
      optionalData1: at(1, 28, optionalEnd, 'mixed', optionalHasCheckDigit),
    };
    if (hasFinalCheckDigit) {
      // The final check digit is always the last character of line 1.
      layout.finalCheckDigit = at(1, lineLength - 1, lineLength, 'digits');
    }
    return layout;
  };

  if (format === 'TD1') {
    return {
      ...header(),
      documentNumber: at(0, 5, 14, 'mixed', true),
      optionalData1: at(0, 15, 30, 'mixed'),
      birthDate: at(1, 0, 6, 'digits', true),
      sex: at(1, 7, 8, 'sex'),
      expiryDate: at(1, 8, 14, 'digits', true),
      nationality: at(1, 15, 18, 'letters'),
      optionalData2: at(1, 18, 29, 'mixed'),
      finalCheckDigit: at(1, 29, 30, 'digits'),
      name: at(2, 0, 30, 'letters'),
    };
  }
  if (format === 'TD2') {
    return twoLineLayout(36, 35, false, true);
  }
  if (format === 'TD3') {
    return twoLineLayout(44, 42, true, true);
  }
  if (format === 'MRVA') {
    return twoLineLayout(44, 44, false, false);
  }
  if (format === 'MRVB') {
    return twoLineLayout(36, 36, false, false);
  }

  return null;
}
|
|
118
|
+
|
|
119
|
+
module.exports = {
|
|
120
|
+
MRZ_FORMATS,
|
|
121
|
+
getFieldPositions
|
|
122
|
+
};
|