mrz-genius 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +270 -0
- package/package.json +47 -0
- package/src/detector/mrzDetector.js +214 -0
- package/src/index.d.ts +141 -0
- package/src/index.js +150 -0
- package/src/ocr/llmExtractor.js +146 -0
- package/src/ocr/mrzOCR.js +489 -0
- package/src/parser/checkDigit.js +84 -0
- package/src/parser/fieldPositions.js +122 -0
- package/src/parser/mrzParser.js +487 -0
- package/src/parser/ocrCorrector.js +172 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Mihradj KYC Team
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
# mrz-genius
|
|
2
|
+
|
|
3
|
+
[![npm version](https://img.shields.io/npm/v/mrz-genius.svg)](https://www.npmjs.com/package/mrz-genius)
|
|
4
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
5
|
+
[![Node.js >= 18](https://img.shields.io/badge/node-%3E%3D18.0.0-brightgreen.svg)](https://nodejs.org)
|
|
6
|
+
[](#)
|
|
7
|
+
|
|
8
|
+
Bibliothèque Node.js complète pour la **détection**, l'**OCR** et le **parsing** des zones MRZ (Machine Readable Zone) sur les documents d'identité.
|
|
9
|
+
|
|
10
|
+
### 📍 Workflow d'extraction
|
|
11
|
+
|
|
12
|
+
```mermaid
|
|
13
|
+
graph LR
|
|
14
|
+
A[Image Brute] --> B[Détection Zone MRZ]
|
|
15
|
+
B --> C[Optimisation Image]
|
|
16
|
+
C --> D[OCR Tesseract/LLM]
|
|
17
|
+
D --> E[Parsing & Correction]
|
|
18
|
+
E --> F[Données ID Validées]
|
|
19
|
+
style F fill:#10b981,stroke:#059669,color:#fff
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
Inspirée du module Swift [MRZParser](https://github.com/romanmazeev/MRZParser).
|
|
23
|
+
|
|
24
|
+
## 📋 Formats supportés
|
|
25
|
+
|
|
26
|
+
| Format | Type de document | Lignes | Caractères/ligne |
|
|
27
|
+
|--------|-----------------|--------|------------------|
|
|
28
|
+
| **TD1** | Cartes d'identité (CNI) | 3 | 30 |
|
|
29
|
+
| **TD2** | Documents de voyage (format moyen) | 2 | 36 |
|
|
30
|
+
| **TD3** | Passeports | 2 | 44 |
|
|
31
|
+
| **MRVA** | Visas type A | 2 | 44 |
|
|
32
|
+
| **MRVB** | Visas type B | 2 | 36 |
|
|
33
|
+
|
|
34
|
+
## 🚀 Installation
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
npm install mrz-genius
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Ou depuis les sources :
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
cd mrz-genius
|
|
44
|
+
npm install
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## 📖 Utilisation
|
|
48
|
+
|
|
49
|
+
### 1. Pipeline complet : Image → MRZ parsé
|
|
50
|
+
|
|
51
|
+
```javascript
|
|
52
|
+
const { processImage } = require('mrz-genius');
|
|
53
|
+
|
|
54
|
+
// Depuis un fichier image
|
|
55
|
+
const result = await processImage('./passport.jpg');
|
|
56
|
+
|
|
57
|
+
if (result.success) {
|
|
58
|
+
console.log('Nom:', result.parsed.surname);
|
|
59
|
+
console.log('Prénoms:', result.parsed.givenNames);
|
|
60
|
+
console.log('N° Document:', result.parsed.documentNumber);
|
|
61
|
+
console.log('Nationalité:', result.parsed.nationality);
|
|
62
|
+
console.log('Date naissance:', result.parsed.birthDateFormatted);
|
|
63
|
+
console.log('Date expiration:', result.parsed.expiryDateFormatted);
|
|
64
|
+
console.log('Sexe:', result.parsed.sex);
|
|
65
|
+
}
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### 2. Parser direct (texte MRZ)
|
|
69
|
+
|
|
70
|
+
```javascript
|
|
71
|
+
const { parseMRZ } = require('mrz-genius');
|
|
72
|
+
|
|
73
|
+
// TD3 - Passeport
|
|
74
|
+
const result = parseMRZ([
|
|
75
|
+
'P<UTOERIKSSON<<ANNA<MARIA<<<<<<<<<<<<<<<<<<<',
|
|
76
|
+
'L898902C36UTO7408122F1204159ZE184226B<<<<<10'
|
|
77
|
+
]);
|
|
78
|
+
|
|
79
|
+
console.log(result);
|
|
80
|
+
// {
|
|
81
|
+
// success: true,
|
|
82
|
+
// ocr: { confidence: 98, method: 'full_image_threshold_140_3x' },
|
|
83
|
+
// parsed: {
|
|
84
|
+
//     ...
|
|
85
|
+
// surname: 'TOURE',
|
|
86
|
+
// documentNumber: 'CI0086201',
|
|
87
|
+
// ...
|
|
88
|
+
// }
|
|
89
|
+
// }
```
|
|
90
|
+
|
|
91
|
+
### 2. Option avec LLM (ChatGPT, Claude, Gemini)
|
|
92
|
+
|
|
93
|
+
Pour remplacer l'OCR classique par les capacités de vision surpuissantes d'un modèle d'IA :
|
|
94
|
+
|
|
95
|
+
```javascript
|
|
96
|
+
const result = await processImage('./document.jpg', {
|
|
97
|
+
llm: {
|
|
98
|
+
provider: 'chatgpt', // ou 'anthropic', 'gemini', 'litellm'
|
|
99
|
+
apiKey: process.env.API_KEY,
|
|
100
|
+
model: 'gpt-4o' // (Optionnel) modèle spécifique
|
|
101
|
+
}
|
|
102
|
+
});
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
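### 3. Parser direct (texte MRZ)

```javascript
// Résultat de parseMRZ pour le passeport TD3 de l'exemple « Parser direct » ci-dessus :
// {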
|
|
106
|
+
// issuingCountry: 'UTO',
|
|
107
|
+
// surname: 'ERIKSSON',
|
|
108
|
+
// givenNames: 'ANNA MARIA',
|
|
109
|
+
// documentNumber: 'L898902C3',
|
|
110
|
+
// nationality: 'UTO',
|
|
111
|
+
// birthDate: Date,
|
|
112
|
+
// birthDateFormatted: '1974-08-12',
|
|
113
|
+
// sex: 'FEMALE',
|
|
114
|
+
// expiryDate: Date,
|
|
115
|
+
// expiryDateFormatted: '2012-04-15',
|
|
116
|
+
// optionalData1: 'ZE184226B',
|
|
117
|
+
// ...
|
|
118
|
+
// }
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### 3. Parsing avec texte multi-ligne
|
|
122
|
+
|
|
123
|
+
```javascript
|
|
124
|
+
const { parseMRZ } = require('mrz-genius');
|
|
125
|
+
|
|
126
|
+
// TD1 - Carte d'identité
|
|
127
|
+
const mrz = `I<UTOD231458907<<<<<<<<<<<<<<<
|
|
128
|
+
7408122F1204159UTO<<<<<<<<<<<6
|
|
129
|
+
ERIKSSON<<ANNA<MARIA<<<<<<<<<<`;
|
|
130
|
+
|
|
131
|
+
const result = parseMRZ(mrz);
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### 4. Détection de la zone MRZ sur image
|
|
135
|
+
|
|
136
|
+
```javascript
|
|
137
|
+
const { detectMRZRegion } = require('mrz-genius');
|
|
138
|
+
|
|
139
|
+
const region = await detectMRZRegion('./document.jpg');
|
|
140
|
+
console.log('Région MRZ:', region.x, region.y, region.width, region.height);
|
|
141
|
+
console.log('Confiance:', region.confidence);
|
|
142
|
+
// region.imageBuffer contient l'image croppée de la zone MRZ
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### 5. OCR seul
|
|
146
|
+
|
|
147
|
+
```javascript
|
|
148
|
+
const { performOCR } = require('mrz-genius');
|
|
149
|
+
|
|
150
|
+
const ocrResult = await performOCR('./passport.jpg');
|
|
151
|
+
console.log('Lignes MRZ:', ocrResult.lines);
|
|
152
|
+
console.log('Confiance OCR:', ocrResult.confidence);
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### 6. Correction OCR
|
|
156
|
+
|
|
157
|
+
```javascript
|
|
158
|
+
const { parseMRZ } = require('mrz-genius');
|
|
159
|
+
|
|
160
|
+
// Avec correction des erreurs OCR courantes (O→0, I→1, etc.)
|
|
161
|
+
const result = parseMRZ(mrzText, { ocrCorrection: true });
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
### 7. Vérification de présence MRZ
|
|
165
|
+
|
|
166
|
+
```javascript
|
|
167
|
+
const { hasMRZ } = require('mrz-genius');
|
|
168
|
+
|
|
169
|
+
const containsMRZ = await hasMRZ('./document.jpg');
|
|
170
|
+
console.log('Document contient un MRZ:', containsMRZ);
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
## 🔧 API Complète
|
|
174
|
+
|
|
175
|
+
### Haut niveau
|
|
176
|
+
|
|
177
|
+
| Fonction | Description |
|
|
178
|
+
|----------|-------------|
|
|
179
|
+
| `processImage(image, options?)` | Pipeline complet : détection + OCR + parsing |
|
|
180
|
+
| `parseMRZ(text, options?)` | Parse un texte MRZ directement |
|
|
181
|
+
|
|
182
|
+
### Détection
|
|
183
|
+
|
|
184
|
+
| Fonction | Description |
|
|
185
|
+
|----------|-------------|
|
|
186
|
+
| `detectMRZRegion(input, options?)` | Détecte la zone MRZ dans une image |
|
|
187
|
+
| `optimizeForOCR(buffer, options?)` | Optimise une image pour l'OCR |
|
|
188
|
+
| `preprocessForOCR(input, options?)` | Détection + optimisation combinées |
|
|
189
|
+
|
|
190
|
+
### OCR
|
|
191
|
+
|
|
192
|
+
| Fonction | Description |
|
|
193
|
+
|----------|-------------|
|
|
194
|
+
| `performOCR(input, options?)` | OCR Tesseract optimisé pour MRZ |
|
|
195
|
+
| `hasMRZ(input)` | Vérifie si une image contient un MRZ |
|
|
196
|
+
|
|
197
|
+
### Validation
|
|
198
|
+
|
|
199
|
+
| Fonction | Description |
|
|
200
|
+
|----------|-------------|
|
|
201
|
+
| `calculateCheckDigit(value)` | Calcule le chiffre de contrôle ICAO |
|
|
202
|
+
| `isCheckDigitValid(value, digit)` | Valide un chiffre de contrôle |
|
|
203
|
+
| `isCompositeValid(fields, digit)` | Valide le chiffre de contrôle composite |
|
|
204
|
+
|
|
205
|
+
### Correction OCR
|
|
206
|
+
|
|
207
|
+
| Fonction | Description |
|
|
208
|
+
|----------|-------------|
|
|
209
|
+
### Correction OCR (Heuristiques d'auto-réparation)
|
|
210
|
+
|
|
211
|
+
| Fonction | Description |
|
|
212
|
+
|----------|-------------|
|
|
213
|
+
| `correctOCR(str, contentType)` | Corrige les erreurs OCR courantes simples |
|
|
214
|
+
| `repairFieldWithCheckDigit(val, cd)`| Répare par brute-force les confusions en vérifiant le Check Digit |
|
|
215
|
+
| `repairIvorianDocumentNumber(doc)` | Corrige de force un numéro CNI pour correspondre aux normes Ivoiriennes |
|
|
216
|
+
|
|
217
|
+
## 📦 Structure du résultat
|
|
218
|
+
|
|
219
|
+
```typescript
|
|
220
|
+
interface MRZResult {
|
|
221
|
+
valid: boolean; // Tous les chiffres de contrôle sont valides
|
|
222
|
+
format: string; // TD1, TD2, TD3, MRVA, MRVB
|
|
223
|
+
documentType: string; // PASSPORT, VISA, ID_CARD
|
|
224
|
+
issuingCountry: string; // Code pays ISO 3166-1 (3 lettres)
|
|
225
|
+
surname: string; // Nom de famille
|
|
226
|
+
givenNames: string | null; // Prénoms
|
|
227
|
+
documentNumber: string; // Numéro du document
|
|
228
|
+
nationality: string; // Code nationalité
|
|
229
|
+
birthDate: Date; // Date de naissance
|
|
230
|
+
birthDateFormatted: string; // Format YYYY-MM-DD
|
|
231
|
+
sex: string; // MALE, FEMALE, NON_BINARY, UNSPECIFIED
|
|
232
|
+
expiryDate: Date | null; // Date d'expiration
|
|
233
|
+
expiryDateFormatted: string;
|
|
234
|
+
optionalData1: string | null;
|
|
235
|
+
optionalData2: string | null; // TD1 uniquement
|
|
236
|
+
mrzKey: string; // Clé BAC pour e-Passeports
|
|
237
|
+
rawMRZ: string; // MRZ brut
|
|
238
|
+
details: { // Détails de validation
|
|
239
|
+
fields: { ... },
|
|
240
|
+
finalCheckDigitValid: boolean
|
|
241
|
+
};
|
|
242
|
+
}
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
## 🧪 Tests
|
|
246
|
+
|
|
247
|
+
```bash
|
|
248
|
+
npm test
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
## 📐 Distribution des champs
|
|
252
|
+
|
|
253
|
+
```
|
|
254
|
+
TD1 (Carte d'identité) — 3 lignes × 30 caractères
|
|
255
|
+
┌─────────────────────────────────────┐
|
|
256
|
+
│ Ligne 1: Type|Pays|N°Document|Opt1 │
|
|
257
|
+
│ Ligne 2: DDN|S|Exp|Nat|Opt2|Check │
|
|
258
|
+
│ Ligne 3: NOM<<PRÉNOMS │
|
|
259
|
+
└─────────────────────────────────────┘
|
|
260
|
+
|
|
261
|
+
TD3 (Passeport) — 2 lignes × 44 caractères
|
|
262
|
+
┌────────────────────────────────────────────────┐
|
|
263
|
+
│ Ligne 1: Type|Pays|NOM<<PRÉNOMS │
|
|
264
|
+
│ Ligne 2: N°Doc|Nat|DDN|S|Exp|Opt1|Check │
|
|
265
|
+
└────────────────────────────────────────────────┘
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
## 📄 Licence
|
|
269
|
+
|
|
270
|
+
MIT
|
package/package.json
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "mrz-genius",
|
|
3
|
+
"version": "2.0.0",
|
|
4
|
+
"description": "Node.js library for MRZ detection, OCR, and parsing from identity documents (TD1, TD2, TD3, MRVA, MRVB)",
|
|
5
|
+
"main": "src/index.js",
|
|
6
|
+
"types": "src/index.d.ts",
|
|
7
|
+
"scripts": {
|
|
8
|
+
"test": "node test/test.js",
|
|
9
|
+
"demo": "node ../example/index.js"
|
|
10
|
+
},
|
|
11
|
+
"files": [
|
|
12
|
+
"src",
|
|
13
|
+
"README.md",
|
|
14
|
+
"LICENSE"
|
|
15
|
+
],
|
|
16
|
+
"repository": {
|
|
17
|
+
"type": "git",
|
|
18
|
+
"url": "https://github.com/toure5013/mrz-genius.git"
|
|
19
|
+
},
|
|
20
|
+
"bugs": {
|
|
21
|
+
"url": "https://github.com/toure5013/mrz-genius/issues"
|
|
22
|
+
},
|
|
23
|
+
"homepage": "https://github.com/toure5013/mrz-genius#readme",
|
|
24
|
+
"keywords": [
|
|
25
|
+
"mrz",
|
|
26
|
+
"passport",
|
|
27
|
+
"id-card",
|
|
28
|
+
"ocr",
|
|
29
|
+
"identity",
|
|
30
|
+
"travel-document",
|
|
31
|
+
"machine-readable-zone",
|
|
32
|
+
"kyc",
|
|
33
|
+
"td1",
|
|
34
|
+
"td2",
|
|
35
|
+
"td3",
|
|
36
|
+
"visa"
|
|
37
|
+
],
|
|
38
|
+
"author": "toure5013",
|
|
39
|
+
"license": "MIT",
|
|
40
|
+
"dependencies": {
|
|
41
|
+
"sharp": "^0.33.2",
|
|
42
|
+
"tesseract.js": "^5.1.1"
|
|
43
|
+
},
|
|
44
|
+
"engines": {
|
|
45
|
+
"node": ">=18.0.0"
|
|
46
|
+
}
|
|
47
|
+
}
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MRZ Detector
|
|
3
|
+
* Detects and extracts the MRZ zone from an identity document image
|
|
4
|
+
* Uses image processing techniques (sharp) to locate and crop the MRZ region
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
'use strict';
|
|
8
|
+
|
|
9
|
+
const sharp = require('sharp');
|
|
10
|
+
|
|
11
|
+
/**
 * Load an image and build an enhanced grayscale copy of it.
 *
 * The enhancement chain is grayscale -> normalize -> sharpen (sigma 1.5) ->
 * brightness x1.1. The metadata is read from the untouched input, not from
 * the enhanced copy.
 *
 * @param {Buffer|string} input - Image buffer or file path
 * @returns {Promise<{metadata: Object, buffer: Buffer}>} Metadata of the input
 *   image and the enhanced image buffer
 */
async function preprocessImage(input) {
  // Metadata comes from its own sharp instance and is awaited first.
  const metadata = await sharp(input).metadata();

  const buffer = await sharp(input)
    .grayscale()
    .normalize()
    .sharpen({ sigma: 1.5 })
    .modulate({ brightness: 1.1 })
    .toBuffer();

  return { metadata, buffer };
}
|
|
29
|
+
|
|
30
|
+
/**
 * Find the longest run of consecutive rows whose dark-pixel density is above
 * `threshold`. Runs of 3 rows or fewer are never selected.
 *
 * @param {number[]} rowDensities - Dark-pixel ratio of each row (0..1)
 * @param {number} threshold - Minimum ratio for a row to count as text
 * @returns {{start: number, length: number}} First row and row count of the
 *   best run; `start` is -1 when no run qualifies
 */
function findLongestDenseBand(rowDensities, threshold) {
  let best = { start: -1, length: 0 };
  let runStart = -1;
  let runLength = 0;

  // Close the current run, keeping it if it beats the best one so far.
  const closeRun = () => {
    if (runLength > best.length && runLength > 3) {
      best = { start: runStart, length: runLength };
    }
    runStart = -1;
    runLength = 0;
  };

  for (let y = 0; y < rowDensities.length; y++) {
    if (rowDensities[y] > threshold) {
      if (runStart === -1) runStart = y;
      runLength++;
    } else {
      closeRun();
    }
  }
  // A run may extend to the very last row and never hit the `else` branch.
  closeRun();

  return best;
}

/**
 * Detect the MRZ region in an image.
 *
 * Strategy: crop the bottom `bottomPercent` of the image, measure the ratio of
 * dark pixels on every row of that crop, and take the longest band of
 * consecutive dense rows as the MRZ text. If no band longer than 5 rows is
 * found, the whole bottom crop is returned with `confidence: 'low'`.
 *
 * The returned `x`, `y`, `width` and `height` describe the area contained in
 * `imageBuffer`, in pixels of the input image.
 *
 * NOTE(review): only the single longest band is kept. If the rows between two
 * MRZ text lines fall under the density threshold, the band may cover one
 * text line only — confirm against real documents.
 *
 * @param {Buffer|string} input - Image buffer or file path
 * @param {Object} [options] - Detection options
 * @param {number} [options.bottomPercent=45] - Bottom percentage of image to scan
 * @param {number} [options.padding=5] - Padding around detected MRZ in pixels
 * @returns {Promise<Object>} Detected region: `{x, y, width, height, imageBuffer, confidence}`
 * @throws {Error} "MRZ detection failed: ..." wrapping any underlying failure
 *   (the original error is available as `error.cause`)
 */
async function detectMRZRegion(input, options = {}) {
  const { bottomPercent = 45, padding = 5 } = options;

  try {
    // Only the dimensions are needed here, so read the metadata directly
    // instead of running the full preprocessImage() enhancement pipeline and
    // discarding its output buffer.
    const { width, height } = await sharp(input).metadata();

    if (!width || !height) {
      throw new Error('Unable to read image dimensions');
    }

    // MRZ is at the bottom of the document
    const mrzTopStart = Math.floor(height * (1 - bottomPercent / 100));
    const mrzHeight = height - mrzTopStart;

    // Bottom crop, padded and clamped to the image bounds
    const extractTop = Math.max(0, mrzTopStart - padding);
    const extractHeight = Math.min(mrzHeight + padding * 2, height - extractTop);

    const croppedBuffer = await sharp(input)
      .extract({ left: 0, top: extractTop, width, height: extractHeight })
      .grayscale()
      .normalize()
      .sharpen({ sigma: 2 })
      .toBuffer();

    // Decode the crop to raw pixels for the row scan
    const analyzed = sharp(croppedBuffer);
    const analyzedMeta = await analyzed.metadata();
    const rawBuffer = await analyzed.raw().toBuffer();

    const cropHeight = analyzedMeta.height;
    const cropWidth = analyzedMeta.width;
    const channels = analyzedMeta.channels || 1;
    const rowStride = cropWidth * channels;

    // Ratio of dark pixels (first channel below 120) on each row
    const rowDensities = [];
    for (let y = 0; y < cropHeight; y++) {
      const rowOffset = y * rowStride;
      let darkPixelCount = 0;
      for (let x = 0; x < cropWidth; x++) {
        if (rawBuffer[rowOffset + x * channels] < 120) darkPixelCount++;
      }
      rowDensities.push(darkPixelCount / cropWidth);
    }

    const band = findLongestDenseBand(rowDensities, 0.12);

    // Default: fall back to the whole bottom crop
    let regionTop = extractTop;
    let regionHeight = extractHeight;
    let mrzBuffer = croppedBuffer;
    let confidence = 'low';

    if (band.start !== -1 && band.length > 5) {
      // Band coordinates are relative to the crop; convert to input-image
      // coordinates and widen by 2x padding on each side.
      regionTop = Math.max(0, extractTop + band.start - padding * 2);
      regionHeight = Math.min(band.length + padding * 4, height - regionTop);

      mrzBuffer = await sharp(input)
        .extract({ left: 0, top: regionTop, width, height: regionHeight })
        .grayscale()
        .normalize()
        .sharpen({ sigma: 2 })
        .toBuffer();

      confidence = 'high';
    }

    return {
      x: 0,
      y: regionTop,
      width,
      height: regionHeight,
      imageBuffer: mrzBuffer,
      confidence,
    };
  } catch (error) {
    // Keep the original error (and its stack) reachable through `cause`.
    throw new Error(`MRZ detection failed: ${error.message}`, { cause: error });
  }
}
|
|
161
|
+
|
|
162
|
+
/**
 * Create an OCR-optimized image from the detected MRZ region.
 *
 * The image is upscaled (Lanczos3), converted to grayscale, normalized,
 * sharpened and finally binarized at `threshold`.
 *
 * @param {Buffer} mrzImageBuffer - MRZ region image buffer
 * @param {Object} [options] - Options
 * @param {number} [options.scaleFactor=3] - Upscale factor for better OCR
 * @param {number} [options.threshold=140] - Binarization threshold
 * @returns {Promise<Buffer>} OCR-optimized image buffer
 * @throws {RangeError} If `scaleFactor` is not a positive finite number
 * @throws {Error} If the image dimensions cannot be read
 */
async function optimizeForOCR(mrzImageBuffer, options = {}) {
  const { scaleFactor = 3, threshold = 140 } = options;

  const scale = Number(scaleFactor);
  if (!Number.isFinite(scale) || scale <= 0) {
    throw new RangeError(`scaleFactor must be a positive finite number, got ${scaleFactor}`);
  }

  const { width, height } = await sharp(mrzImageBuffer).metadata();
  if (!width || !height) {
    throw new Error('Unable to read image dimensions');
  }

  // sharp's resize() only accepts positive integers, so a fractional scale
  // factor (e.g. 1.5 on an odd width) must be rounded before use.
  const targetWidth = Math.max(1, Math.round(width * scale));
  const targetHeight = Math.max(1, Math.round(height * scale));

  return sharp(mrzImageBuffer)
    .resize(targetWidth, targetHeight, {
      kernel: sharp.kernel.lanczos3,
      fit: 'fill',
    })
    .grayscale()
    .normalize()
    .sharpen({ sigma: 2, m1: 2, m2: 3 })
    .threshold(threshold)
    .toBuffer();
}
|
|
188
|
+
|
|
189
|
+
/**
 * Run the whole preprocessing chain: locate the MRZ region, then turn the
 * region image into an OCR-ready buffer.
 *
 * The same `options` object is handed to both detectMRZRegion() and
 * optimizeForOCR().
 *
 * @param {Buffer|string} input - Image path or buffer
 * @param {Object} [options] - Options forwarded to both steps
 * @returns {Promise<{region: Object, ocrReadyBuffer: Buffer}>}
 * @throws {Error} If no region is produced by the detection step
 */
async function preprocessForOCR(input, options = {}) {
  const region = await detectMRZRegion(input, options);

  if (region) {
    const ocrReadyBuffer = await optimizeForOCR(region.imageBuffer, options);
    return { region, ocrReadyBuffer };
  }

  throw new Error('Could not detect MRZ region in image');
}
|
|
208
|
+
|
|
209
|
+
// Functions exposed by the detector module.
module.exports = { preprocessImage, detectMRZRegion, optimizeForOCR, preprocessForOCR };
|
package/src/index.d.ts
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Type definitions for mrz-genius
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
/** Validation detail reported for one MRZ field. */
export interface MRZFieldDetail {
  /** Field value as a string (presumably the characters as read from the MRZ — confirm against the parser). */
  raw: string;
  /** Check digit attached to the field, or `null` when there is none. */
  checkDigit: number | null;
  /** Validation outcome for this field. */
  valid: boolean;
}
|
|
10
|
+
|
|
11
|
+
/** Validation details attached to a parsed MRZ. */
export interface MRZDetails {
  /** Per-field validation details. */
  fields: {
    documentNumber: MRZFieldDetail;
    birthDate: MRZFieldDetail;
    expiryDate: MRZFieldDetail;
    optionalData1: MRZFieldDetail;
  };
  /** Validation outcome of the final check digit. */
  finalCheckDigitValid: boolean;
}
|
|
20
|
+
|
|
21
|
+
/** Result of parsing an MRZ. */
export interface MRZResult {
  /** `true` when all check digits are valid. */
  valid: boolean;
  /** MRZ format of the document. */
  format: 'TD1' | 'TD2' | 'TD3' | 'MRVA' | 'MRVB';
  /**
   * Document type. The trailing `| string` member widens this union to plain
   * `string` for the type checker; the literals are listed for documentation.
   */
  documentType: 'PASSPORT' | 'VISA' | 'ID_CARD' | string;
  /** Issuing country code (3 letters). */
  issuingCountry: string | null;
  /** Family name. */
  surname: string | null;
  /** Given names. */
  givenNames: string | null;
  /** Document number. */
  documentNumber: string | null;
  /** Nationality code. */
  nationality: string | null;
  /** Date of birth. */
  birthDate: Date | null;
  /** Date of birth in `YYYY-MM-DD` form. */
  birthDateFormatted: string | null;
  /** Sex of the holder. */
  sex: 'MALE' | 'FEMALE' | 'NON_BINARY' | 'UNSPECIFIED';
  /** Expiry date of the document. */
  expiryDate: Date | null;
  /** Formatted expiry date. */
  expiryDateFormatted: string | null;
  /** First optional data field. */
  optionalData1: string | null;
  /** Second optional data field (TD1 only). */
  optionalData2: string | null;
  /** BAC key for e-passports. */
  mrzKey: string;
  /** Raw MRZ text. */
  rawMRZ: string;
  /** Validation details. */
  details: MRZDetails;
}
|
|
41
|
+
|
|
42
|
+
/** Outcome of an OCR pass over an image. */
export interface OCRResult {
  /** MRZ lines that were read. */
  lines: string[];
  /** OCR text as a single string (presumably before MRZ line extraction — confirm). */
  rawText: string;
  /** OCR confidence value. */
  confidence: number;
  /** Label of the method that produced this result. */
  method: string;
}
|
|
48
|
+
|
|
49
|
+
/** Result of the full image pipeline (`processImage`). */
export interface ProcessImageResult {
  /** Whether the pipeline succeeded; check this before reading `parsed`. */
  success: boolean;
  /** OCR stage output. */
  ocr: OCRResult;
  /** Parsed MRZ, or `null` when none is available. */
  parsed: MRZResult | null;
}
|
|
54
|
+
|
|
55
|
+
/** Region returned by MRZ detection. */
export interface MRZRegion {
  /** Left edge, in pixels. */
  x: number;
  /** Top edge, in pixels. */
  y: number;
  /** Width, in pixels. */
  width: number;
  /** Height, in pixels. */
  height: number;
  /** Cropped image of the MRZ zone. */
  imageBuffer: Buffer;
  /** `'high'` when a text band was located, `'low'` when the whole bottom crop is used as a fallback. */
  confidence: 'high' | 'low';
}
|
|
63
|
+
|
|
64
|
+
/** Location and content kind of one field inside an MRZ. */
export interface FieldPosition {
  /** Line of the MRZ that holds the field (index base not visible here — confirm). */
  line: number;
  /** Start offset of the field within the line. */
  start: number;
  /** End offset of the field within the line (inclusive/exclusive not visible here — confirm). */
  end: number;
  /** Whether the field carries a check digit. */
  hasCheckDigit: boolean;
  /** Kind of characters expected in the field. */
  contentType: 'digits' | 'letters' | 'mixed' | 'sex';
}
|
|
71
|
+
|
|
72
|
+
/** Geometry of an MRZ format (for example TD3 is 2 lines of 44 characters). */
export interface MRZFormatSpec {
  /** Number of characters per line. */
  lineLength: number;
  /** Number of lines. */
  linesCount: number;
}
|
|
76
|
+
|
|
77
|
+
/** Settings for using an LLM vision model instead of classic OCR. */
export interface LLMConfig {
  /** LLM provider to call. */
  provider: 'chatgpt' | 'openai' | 'anthropic' | 'gemini' | 'litellm';
  /** API key for the provider. */
  apiKey: string;
  /** Optional specific model name (e.g. `'gpt-4o'`). */
  model?: string;
  /** Optional base URL (presumably overrides the provider endpoint — confirm against the extractor). */
  baseUrl?: string;
}
|
|
83
|
+
|
|
84
|
+
/** Options accepted by `processImage`. */
export interface ProcessImageOptions {
  /** Enable correction of common OCR confusions (O→0, I→1, ...). */
  ocrCorrection?: boolean;
  /** Toggle for the MRZ region detection step (default not visible here). */
  detectRegion?: boolean;
  /** OCR language setting (presumably a Tesseract language code — confirm). */
  lang?: string;
  /** When provided, an LLM is used in place of classic OCR. */
  llm?: LLMConfig;
}
|
|
90
|
+
|
|
91
|
+
/** Options accepted by the MRZ parser. */
export interface ParseOptions {
  /** Enable correction of common OCR confusions (O→0, I→1, ...). */
  ocrCorrection?: boolean;
}
|
|
94
|
+
|
|
95
|
+
/** Options accepted by MRZ region detection. */
export interface DetectOptions {
  /** Bottom percentage of the image to scan (the detector defaults to 45). */
  bottomPercent?: number;
  /** Padding around the detected MRZ, in pixels (the detector defaults to 5). */
  padding?: number;
}
|
|
99
|
+
|
|
100
|
+
/** Options accepted by `performOCR`. */
export interface OCROptions {
  /** OCR language setting (presumably a Tesseract language code — confirm). */
  lang?: string;
  /** Toggle for the MRZ region detection step (default not visible here). */
  detectRegion?: boolean;
  /** Path to tessdata (presumably the Tesseract trained-data directory — confirm). */
  tessdataPath?: string;
}
|
|
105
|
+
|
|
106
|
+
// ---------------------------------------------------------------------------
// High-level API
// ---------------------------------------------------------------------------

/**
 * Full pipeline: detection + OCR + parsing.
 *
 * @param image - Image buffer or file path
 * @param options - Pipeline options
 */
export function processImage(
  image: Buffer | string,
  options?: ProcessImageOptions,
): Promise<ProcessImageResult>;

/**
 * Parse MRZ text directly.
 *
 * @param mrzText - Either an array of MRZ lines or a single multi-line string
 * @param options - Parsing options
 * @returns The parsed MRZ, or `null`
 */
export function parseMRZ(mrzText: string | string[], options?: ParseOptions): MRZResult | null;
|
|
109
|
+
|
|
110
|
+
// Detection

/**
 * Detect the MRZ zone in an image.
 *
 * The implementation either resolves with a region or rejects; it does not
 * resolve with `null`.
 *
 * @param input - Image buffer or file path
 * @param options - Detection options
 */
export function detectMRZRegion(input: Buffer | string, options?: DetectOptions): Promise<MRZRegion>;

/**
 * Optimize an MRZ region image for OCR.
 *
 * @param mrzImageBuffer - MRZ region image buffer
 * @param options - `scaleFactor` is the upscale factor (default 3), `threshold`
 *   the binarization threshold (default 140)
 */
export function optimizeForOCR(
  mrzImageBuffer: Buffer,
  options?: { scaleFactor?: number; threshold?: number },
): Promise<Buffer>;

/**
 * Detection and OCR optimization combined.
 *
 * The options object is forwarded to both `detectMRZRegion` and
 * `optimizeForOCR`, so the optimization settings are accepted here as well.
 *
 * @param input - Image buffer or file path
 * @param options - Detection options plus optional `scaleFactor` / `threshold`
 */
export function preprocessForOCR(
  input: Buffer | string,
  options?: DetectOptions & { scaleFactor?: number; threshold?: number },
): Promise<{ region: MRZRegion; ocrReadyBuffer: Buffer }>;
|
|
114
|
+
|
|
115
|
+
// ---------------------------------------------------------------------------
// OCR
// ---------------------------------------------------------------------------

/**
 * Tesseract OCR tuned for MRZ.
 *
 * @param input - Image buffer or file path
 * @param options - OCR options
 */
export function performOCR(input: Buffer | string, options?: OCROptions): Promise<OCRResult>;

/**
 * Check whether an image contains an MRZ.
 *
 * @param input - Image buffer or file path
 */
export function hasMRZ(input: Buffer | string): Promise<boolean>;

/**
 * Post-process OCR text into an array of strings (presumably the cleaned MRZ
 * lines — confirm against the implementation).
 */
export function postProcessOCR(text: string): string[];

/**
 * Extract MRZ lines from a full OCR text.
 *
 * @returns An array of strings, or `null` (presumably when no MRZ is found — confirm)
 */
export function extractMRZFromFullText(fullText: string): string[] | null;
|
|
120
|
+
|
|
121
|
+
// ---------------------------------------------------------------------------
// Parser
// ---------------------------------------------------------------------------

/**
 * Parse MRZ text given as a single string or an array of lines.
 *
 * @returns The parsed MRZ, or `null`
 */
export function parse(input: string | string[], options?: ParseOptions): MRZResult | null;

/**
 * Determine the MRZ format of the given lines.
 *
 * @returns A format name, or `null` (presumably when the lines match no known format — confirm)
 */
export function detectFormat(lines: string[]): string | null;

/** Split a raw MRZ name field into surname and given names. */
export function parseName(rawName: string): { surname: string | null; givenNames: string | null };

/**
 * Parse a raw MRZ date field.
 *
 * @param raw - Raw date characters
 * @param type - Whether the field is a birth date or an expiry date
 * @returns The date, or `null` (presumably when the field cannot be parsed — confirm)
 */
export function parseDate(raw: string, type: 'birth' | 'expiry'): Date | null;

/** Format a date as a string (presumably `YYYY-MM-DD`, as in the `*Formatted` result fields — confirm). */
export function formatDate(date: Date): string;

/** Convert the raw MRZ sex character into a sex label. */
export function parseSex(rawSex: string): string;

/** Convert the raw MRZ document type code into a document type label. */
export function parseDocumentType(rawType: string): string;
|
|
129
|
+
|
|
130
|
+
// ---------------------------------------------------------------------------
// Validation
// ---------------------------------------------------------------------------

/**
 * Compute the ICAO check digit of a value.
 *
 * @returns The check digit, or `null` (presumably when the value holds an invalid character — confirm)
 */
export function calculateCheckDigit(value: string): number | null;

/** Validate a check digit against the raw value it protects. */
export function isCheckDigitValid(rawValue: string, checkDigit: number): boolean;

/**
 * Validate the composite check digit.
 *
 * @param fields - Fields taking part in the composite check, each with its own check digit (or `null`)
 * @param finalCheckDigit - Composite check digit to validate
 */
export function isCompositeValid(
  fields: Array<{ rawValue: string; checkDigit: number | null }>,
  finalCheckDigit: number,
): boolean;
|
|
134
|
+
|
|
135
|
+
// ---------------------------------------------------------------------------
// OCR Correction
// ---------------------------------------------------------------------------

/**
 * Fix common simple OCR errors in a string.
 *
 * @param str - Text to correct
 * @param contentType - Kind of characters the field is expected to hold
 */
export function correctOCR(
  str: string,
  contentType: 'digits' | 'letters' | 'sex' | 'mixed',
): string;

/**
 * Search for a combination of strings accepted by `isCorrectCombination`.
 *
 * NOTE(review): how candidate combinations are derived from `strings` is not
 * visible in this declaration file — confirm against the implementation.
 *
 * @returns The accepted combination, or `null`
 */
export function findMatchingStrings(
  strings: string[],
  isCorrectCombination: (combo: string[]) => boolean,
): string[] | null;
|
|
138
|
+
|
|
139
|
+
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------

/** Format specifications keyed by string (presumably the format name, e.g. `'TD3'` — confirm). */
export const MRZ_FORMATS: Record<string, MRZFormatSpec>;

/**
 * Look up the field positions of a format.
 *
 * @returns Field positions keyed by string, or `null` (presumably for an unknown format — confirm)
 */
export function getFieldPositions(format: string): Record<string, FieldPosition> | null;
|