@arkyc/ocr 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/drivers/ai.d.mts +120 -0
- package/dist/drivers/ai.d.mts.map +1 -0
- package/dist/drivers/ai.mjs +454 -0
- package/dist/drivers/ai.mjs.map +1 -0
- package/dist/drivers/external.d.mts +17 -0
- package/dist/drivers/external.d.mts.map +1 -0
- package/dist/drivers/external.mjs +34 -0
- package/dist/drivers/external.mjs.map +1 -0
- package/dist/drivers/mock.d.mts +16 -0
- package/dist/drivers/mock.d.mts.map +1 -0
- package/dist/drivers/mock.mjs +34 -0
- package/dist/drivers/mock.mjs.map +1 -0
- package/dist/drivers/preprocess.d.mts +51 -0
- package/dist/drivers/preprocess.d.mts.map +1 -0
- package/dist/drivers/preprocess.mjs +50 -0
- package/dist/drivers/preprocess.mjs.map +1 -0
- package/dist/drivers/tesseract.d.mts +75 -0
- package/dist/drivers/tesseract.d.mts.map +1 -0
- package/dist/drivers/tesseract.mjs +175 -0
- package/dist/drivers/tesseract.mjs.map +1 -0
- package/dist/index.d.mts +12 -0
- package/dist/index.mjs +10 -0
- package/dist/parsers/generic.d.mts +8 -0
- package/dist/parsers/generic.d.mts.map +1 -0
- package/dist/parsers/generic.mjs +84 -0
- package/dist/parsers/generic.mjs.map +1 -0
- package/dist/parsers/mrz.d.mts +8 -0
- package/dist/parsers/mrz.d.mts.map +1 -0
- package/dist/parsers/mrz.mjs +149 -0
- package/dist/parsers/mrz.mjs.map +1 -0
- package/dist/parsers/registry.d.mts +49 -0
- package/dist/parsers/registry.d.mts.map +1 -0
- package/dist/parsers/registry.mjs +100 -0
- package/dist/parsers/registry.mjs.map +1 -0
- package/dist/parsers/types.d.mts +43 -0
- package/dist/parsers/types.d.mts.map +1 -0
- package/dist/registry.d.mts +20 -0
- package/dist/registry.d.mts.map +1 -0
- package/dist/registry.mjs +36 -0
- package/dist/registry.mjs.map +1 -0
- package/dist/types.d.mts +48 -0
- package/dist/types.d.mts.map +1 -0
- package/package.json +32 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"mrz.d.mts","names":[],"sources":["../../src/parsers/mrz.ts"],"mappings":";;;;iBAkKgB,SAAA,IAAa,cAAc"}
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
//#region src/parsers/mrz.ts
|
|
2
|
+
/**
|
|
3
|
+
* Parser for the ICAO 9303 Machine-Readable Zone (MRZ) found on passports (TD3),
|
|
4
|
+
* ID/residence cards (TD1) and some visas (TD2). The MRZ is standardized across
|
|
5
|
+
* countries, which makes this a strong default parser. Check digits are verified
|
|
6
|
+
* to derive a confidence score.
|
|
7
|
+
*/
|
|
8
|
+
/** MRZ filler character: used to pad short lines and valued 0 in check-digit arithmetic. */
const FILLER = "<";
|
|
9
|
+
/**
 * Map one MRZ character to the number used in check-digit arithmetic.
 *
 * @param {string} ch - A single MRZ character.
 * @returns {number} 0 for the filler, 0-9 for digits, 10-35 for `A`-`Z`,
 *   and 0 for any other input.
 */
function charValue(ch) {
  const isDigit = ch >= "0" && ch <= "9";
  const isLetter = ch >= "A" && ch <= "Z";
  if (ch !== FILLER && (isDigit || isLetter)) {
    const code = ch.charCodeAt(0);
    // "0" is code 48; "A" is code 65 and must map to 10.
    return isDigit ? code - 48 : code - 55;
  }
  return 0;
}
|
|
16
|
+
/**
 * Compute the ICAO 9303 check digit of a field: every character value is
 * multiplied by the repeating weights 7, 3, 1 and the total is taken mod 10.
 *
 * @param {string} field - The characters covered by the check digit.
 * @returns {number} The check digit, 0-9.
 */
function checkDigit(field) {
  const WEIGHTS = [7, 3, 1];
  let total = 0;
  let position = 0;
  while (position < field.length) {
    total += charValue(field[position]) * WEIGHTS[position % WEIGHTS.length];
    position += 1;
  }
  return total % 10;
}
|
|
27
|
+
/**
 * Verify a field against the check-digit character that accompanies it.
 *
 * @param {string} field - The characters covered by the check digit.
 * @param {string} expected - The check-digit character read from the MRZ.
 * @returns {boolean} `true` only when `expected` is a single digit equal to
 *   the check digit computed over `field`.
 */
function checkOk(field, expected) {
  const isSingleDigit = /^[0-9]$/.test(expected);
  return isSingleDigit && Number(expected) === checkDigit(field);
}
|
|
32
|
+
/**
 * Convert an MRZ `YYMMDD` date to ISO `YYYY-MM-DD`.
 *
 * The MRZ carries only two year digits, so the century is inferred:
 * - `"dob"`: a birth date cannot lie in the future, so the 2000s are used
 *   unless that would place the date after `now`, in which case the 1900s
 *   are used.
 * - any other kind (expiry): years 00-69 map to 20xx and 70-99 to 19xx.
 *
 * @param {string} yymmdd - Six digits, `YYMMDD`.
 * @param {"dob" | "expiry"} kind - Which date is being converted.
 * @param {Date} [now] - Reference "today" for the date-of-birth century
 *   choice (compared as a UTC calendar date). Defaults to the current time;
 *   must be a valid `Date`.
 * @returns {string | undefined} The ISO date, or `undefined` when the input
 *   is not six digits or is not a real calendar date (e.g. Feb 31).
 */
function mrzDate(yymmdd, kind, now = new Date()) {
  if (!/^[0-9]{6}$/.test(yymmdd)) return undefined;
  const yy = Number(yymmdd.slice(0, 2));
  const mm = yymmdd.slice(2, 4);
  const dd = yymmdd.slice(4, 6);
  const month = Number(mm);
  const day = Number(dd);
  if (month < 1 || month > 12 || day < 1) return undefined;

  let year;
  if (kind === "dob") {
    // ISO dates with 4-digit years order correctly as plain strings.
    const today = now.toISOString().slice(0, 10);
    const inThisCentury = `${2000 + yy}-${mm}-${dd}`;
    year = inThisCentury > today ? 1900 + yy : 2000 + yy;
  } else {
    const pivot = 70;
    year = (yy < pivot ? 2000 : 1900) + yy;
  }

  // Day 0 of the following month is the last day of `month` (leap-year aware).
  const daysInMonth = new Date(Date.UTC(year, month, 0)).getUTCDate();
  if (day > daysInMonth) return undefined;

  return `${year}-${mm}-${dd}`;
}
|
|
42
|
+
/**
 * Break an MRZ name field (`SURNAME<<GIVEN<NAMES`) into its parts.
 *
 * Only the first two `<<`-separated segments are used; single fillers inside
 * a segment become spaces.
 *
 * @param {string} field - The raw name field, fillers included.
 * @returns {{ firstName: (string|undefined), lastName: (string|undefined), fullName: (string|undefined) }}
 *   `firstName` is the first given name, `lastName` the surname, `fullName`
 *   the given names followed by the surname. Empty parts are `undefined`.
 */
function parseNameField(field) {
  const segments = field.split("<<");
  const clean = (segment = "") => segment.replace(/</g, " ").trim();
  const surname = clean(segments[0]);
  const given = clean(segments[1]);

  const nameParts = [];
  if (given) nameParts.push(given);
  if (surname) nameParts.push(surname);

  return {
    firstName: given.split(/\s+/)[0] || undefined,
    lastName: surname || undefined,
    fullName: nameParts.join(" ") || undefined,
  };
}
|
|
55
|
+
/**
 * Turn raw OCR output into candidate MRZ lines.
 *
 * Each line is upper-cased and stripped of whitespace and of every character
 * outside the MRZ charset (`A-Z`, `0-9`, `<`). A line is kept only when it is
 * at least 28 characters long and contains a filler.
 *
 * @param {{ text: string, lines?: string[] }} input - OCR text; `lines` is
 *   used when present, otherwise `text` is split on line breaks.
 * @returns {string[]} The normalized candidate lines, in input order.
 */
function mrzLines(input) {
  const toMrzCharset = (line) => line.toUpperCase().replace(/\s+/g, "").replace(/[^A-Z0-9<]/g, "");
  const looksLikeMrz = (line) => line.length >= 28 && /^[A-Z0-9<]+$/.test(line) && line.includes(FILLER);
  const rawLines = input.lines ?? input.text.split(/\r?\n/);
  return rawLines.map(toMrzCharset).filter(looksLikeMrz);
}
|
|
59
|
+
/**
 * Package a check-digit-protected field: its value with fillers removed, plus
 * whether the check digit validated against the raw (unstripped) characters.
 *
 * @param {string} raw - The field exactly as sliced from the MRZ line.
 * @param {string} expected - The check-digit character for the field.
 * @returns {{ value: string, ok: boolean }}
 */
function field(raw, expected) {
  const value = raw.replace(/</g, "");
  const ok = checkOk(raw, expected);
  return { value, ok };
}
|
|
65
|
+
/**
 * Assemble the parse result shared by all MRZ formats.
 *
 * Confidence starts at 0.5 and grows linearly to 1.0 with the share of the
 * three check digits (document number, date of birth, expiry) that validated.
 *
 * @param {object} name - Name fields, spread into the result's `fields`.
 * @param {{ value: string, ok: boolean }} documentNumber - Checked document number.
 * @param {string} nationality - Raw nationality code; fillers are stripped.
 * @param {{ value: string, ok: boolean }} dob - Checked `YYMMDD` date of birth.
 * @param {{ value: string, ok: boolean }} expiry - Checked `YYMMDD` expiry date.
 * @returns {{ fields: object, confidence: number, raw: object }} Extracted
 *   fields, the confidence, and the per-field check outcomes under `raw.checks`.
 */
function build(name, documentNumber, nationality, dob, expiry) {
  const fields = { ...name };
  fields.documentNumber = documentNumber.value || undefined;
  fields.nationality = nationality.replace(/</g, "") || undefined;
  fields.dateOfBirth = mrzDate(dob.value, "dob");
  fields.expiryDate = mrzDate(expiry.value, "expiry");

  const outcomes = [documentNumber.ok, dob.ok, expiry.ok];
  const passed = outcomes.reduce((count, ok) => (ok ? count + 1 : count), 0);

  return {
    fields,
    confidence: 0.5 + 0.5 * (passed / outcomes.length),
    raw: {
      format: "mrz",
      checks: {
        documentNumber: documentNumber.ok,
        dob: dob.ok,
        expiry: expiry.ok,
      },
    },
  };
}
|
|
91
|
+
/**
 * Parse a TD3 (passport) MRZ: two 44-character lines.
 *
 * @param {string} l1 - First line; the name field starts at index 5.
 * @param {string} l2 - Second line: document number (0-8, check digit at 9),
 *   nationality (10-12), date of birth (13-18, check digit at 19) and expiry
 *   (21-26, check digit at 27).
 * @returns {object} The result assembled by `build`.
 */
function parseTd3(l1, l2) {
  const name = parseNameField(l1.slice(5));
  const documentNumber = field(l2.slice(0, 9), l2[9]);
  const nationality = l2.slice(10, 13);

  const birth = l2.slice(13, 19);
  const dob = { value: birth, ok: checkOk(birth, l2[19]) };

  const expires = l2.slice(21, 27);
  const expiry = { value: expires, ok: checkOk(expires, l2[27]) };

  return build(name, documentNumber, nationality, dob, expiry);
}
|
|
101
|
+
/**
 * Parse a TD2 MRZ: two 36-character lines. The fields read here sit at the
 * same offsets as in TD3.
 *
 * @param {string} l1 - First line; the name field starts at index 5.
 * @param {string} l2 - Second line: document number (0-8, check digit at 9),
 *   nationality (10-12), date of birth (13-18, check digit at 19) and expiry
 *   (21-26, check digit at 27).
 * @returns {object} The result assembled by `build`.
 */
function parseTd2(l1, l2) {
  // A dated field occupies [start, end) with its check digit right after it.
  const checkedDate = (start, end) => {
    const value = l2.slice(start, end);
    return { value, ok: checkOk(value, l2[end]) };
  };

  const name = parseNameField(l1.slice(5));
  const documentNumber = field(l2.slice(0, 9), l2[9]);
  return build(name, documentNumber, l2.slice(10, 13), checkedDate(13, 19), checkedDate(21, 27));
}
|
|
111
|
+
/**
 * Parse a TD1 (ID card) MRZ: three 30-character lines.
 *
 * @param {string} l1 - First line: document number at 5-13, check digit at 14.
 * @param {string} l2 - Second line: date of birth (0-5, check digit at 6),
 *   expiry (8-13, check digit at 14) and nationality (15-17).
 * @param {string} l3 - Third line: the name field.
 * @returns {object} The result assembled by `build`.
 */
function parseTd1(l1, l2, l3) {
  const documentNumber = field(l1.slice(5, 14), l1[14]);

  const birth = l2.slice(0, 6);
  const dob = { value: birth, ok: checkOk(birth, l2[6]) };

  const expires = l2.slice(8, 14);
  const expiry = { value: expires, ok: checkOk(expires, l2[14]) };

  return build(parseNameField(l3), documentNumber, l2.slice(15, 18), dob, expiry);
}
|
|
125
|
+
/**
 * Document parser for the machine-readable zone. It classifies the candidate
 * lines by length and hands the first lines of the matching format to that
 * format's parser.
 */
var MrzParser = class {
  name = "mrz";

  /**
   * @param {{ text: string, lines?: string[] }} input - OCR text to search.
   * @returns {object | null} The parsed result, or `null` when fewer than two
   *   candidate lines exist or no format has enough lines of its length.
   */
  parse(input) {
    const lines = mrzLines(input);
    if (lines.length < 2) return null;

    // Each format tolerates +/-2 characters around its nominal line length.
    const withLength = (min, max) => lines.filter(({ length }) => length >= min && length <= max);

    // TD1 is tried first: three lines of about 30 characters.
    const td1Lines = withLength(28, 32);
    if (td1Lines.length >= 3) {
      const [first, second, third] = td1Lines;
      return parseTd1(pad(first, 30), pad(second, 30), third);
    }

    // TD3: two lines of about 44 characters.
    const td3Lines = withLength(42, 46);
    if (td3Lines.length >= 2) {
      const [first, second] = td3Lines;
      return parseTd3(pad(first, 44), pad(second, 44));
    }

    // TD2: two lines of about 36 characters.
    const td2Lines = withLength(34, 38);
    if (td2Lines.length >= 2) {
      const [first, second] = td2Lines;
      return parseTd2(pad(first, 36), pad(second, 36));
    }

    return null;
  }
};
|
|
139
|
+
/**
 * Right-pad a line with fillers up to `len` characters. Lines that are
 * already long enough are returned unchanged, never truncated.
 *
 * @param {string} line - The line to pad.
 * @param {number} len - The target length.
 * @returns {string} The padded line.
 */
function pad(line, len) {
  const missing = len - line.length;
  if (missing <= 0) return line;
  return `${line}${FILLER.repeat(missing)}`;
}
|
|
142
|
+
/**
 * Factory for the MRZ parser, a country-agnostic default for machine-readable
 * documents.
 *
 * @returns {MrzParser} A new parser instance.
 */
function mrzParser() {
  const parser = new MrzParser();
  return parser;
}
|
|
146
|
+
//#endregion
|
|
147
|
+
export { mrzParser };
|
|
148
|
+
|
|
149
|
+
//# sourceMappingURL=mrz.mjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"mrz.mjs","names":[],"sources":["../../src/parsers/mrz.ts"],"sourcesContent":["import type { IsoDate, OcrFields } from '@arkyc/types'\nimport type { DocumentParser, ParseInput, ParseOutput } from './types'\n\n/**\n * Parser for the ICAO 9303 Machine-Readable Zone (MRZ) found on passports (TD3),\n * ID/residence cards (TD1) and some visas (TD2). The MRZ is standardized across\n * countries, which makes this a strong default parser. Check digits are verified\n * to derive a confidence score.\n */\n\nconst FILLER = '<'\n\n/** Numeric value of an MRZ character for check-digit computation. */\nfunction charValue(ch: string): number {\n if (ch === FILLER) return 0\n if (ch >= '0' && ch <= '9') return ch.charCodeAt(0) - 48\n if (ch >= 'A' && ch <= 'Z') return ch.charCodeAt(0) - 55 // A=10 … Z=35\n return 0\n}\n\n/** ICAO 9303 check digit over a field (weights 7,3,1 repeating, mod 10). */\nfunction checkDigit(field: string): number {\n const weights = [7, 3, 1]\n let sum = 0\n for (let i = 0; i < field.length; i += 1) sum += charValue(field[i]!) * weights[i % 3]!\n return sum % 10\n}\n\n/** Whether `field`'s trailing check digit matches (`expected` is the digit char). */\nfunction checkOk(field: string, expected: string): boolean {\n if (!/^[0-9]$/.test(expected)) return false\n return checkDigit(field) === Number(expected)\n}\n\n/** Convert an MRZ `YYMMDD` to ISO `YYYY-MM-DD`, or `undefined` if implausible. */\nfunction mrzDate(yymmdd: string, kind: 'dob' | 'expiry'): IsoDate | undefined {\n if (!/^[0-9]{6}$/.test(yymmdd)) return undefined\n const yy = Number(yymmdd.slice(0, 2))\n const mm = yymmdd.slice(2, 4)\n const dd = yymmdd.slice(4, 6)\n if (Number(mm) < 1 || Number(mm) > 12 || Number(dd) < 1 || Number(dd) > 31) return undefined\n // DOB is in the past; expiry is generally in the present/future.\n const pivot = 70\n const century = kind === 'dob' ? (yy > pivot ? 1900 : 2000) : yy < pivot ? 
2000 : 1900\n return `${century + yy}-${mm}-${dd}` as IsoDate\n}\n\n/** Split an MRZ name field (`SURNAME<<GIVEN<NAMES`) into structured names. */\nfunction parseNameField(field: string): Pick<OcrFields, 'firstName' | 'lastName' | 'fullName'> {\n const [surnameRaw = '', givenRaw = ''] = field.split('<<')\n const surname = surnameRaw.replace(/</g, ' ').trim()\n const given = givenRaw.replace(/</g, ' ').trim()\n const firstName = given.split(/\\s+/)[0] || undefined\n const fullName = [given, surname].filter(Boolean).join(' ') || undefined\n return { firstName, lastName: surname || undefined, fullName }\n}\n\n/** Normalize raw OCR text into candidate MRZ lines (uppercase, MRZ charset). */\nfunction mrzLines(input: ParseInput): string[] {\n const lines = input.lines ?? input.text.split(/\\r?\\n/)\n return lines\n .map((l) =>\n l\n .toUpperCase()\n .replace(/\\s+/g, '')\n .replace(/[^A-Z0-9<]/g, ''),\n )\n .filter((l) => l.length >= 28 && /^[A-Z0-9<]+$/.test(l) && l.includes(FILLER))\n}\n\n/** A field's value with whether its check digit validated. 
*/\ninterface Checked {\n value: string\n ok: boolean\n}\n\nfunction field(raw: string, expected: string): Checked {\n return { value: raw.replace(/</g, ''), ok: checkOk(raw, expected) }\n}\n\nfunction build(\n name: ReturnType<typeof parseNameField>,\n documentNumber: Checked,\n nationality: string,\n dob: { value: string; ok: boolean },\n expiry: { value: string; ok: boolean },\n): ParseOutput {\n const fields: OcrFields = {\n ...name,\n documentNumber: documentNumber.value || undefined,\n nationality: nationality.replace(/</g, '') || undefined,\n dateOfBirth: mrzDate(dob.value, 'dob'),\n expiryDate: mrzDate(expiry.value, 'expiry'),\n }\n const checks = [documentNumber.ok, dob.ok, expiry.ok]\n const passed = checks.filter(Boolean).length\n const confidence = 0.5 + 0.5 * (passed / checks.length)\n return {\n fields,\n confidence,\n raw: { format: 'mrz', checks: { documentNumber: documentNumber.ok, dob: dob.ok, expiry: expiry.ok } },\n }\n}\n\n/** TD3 (passport): two 44-char lines. */\nfunction parseTd3(l1: string, l2: string): ParseOutput {\n const name = parseNameField(l1.slice(5))\n const documentNumber = field(l2.slice(0, 9), l2[9]!)\n const nationality = l2.slice(10, 13)\n const dob = { value: l2.slice(13, 19), ok: checkOk(l2.slice(13, 19), l2[19]!) }\n const expiry = { value: l2.slice(21, 27), ok: checkOk(l2.slice(21, 27), l2[27]!) }\n return build(name, documentNumber, nationality, dob, expiry)\n}\n\n/** TD2: two 36-char lines. */\nfunction parseTd2(l1: string, l2: string): ParseOutput {\n const name = parseNameField(l1.slice(5))\n const documentNumber = field(l2.slice(0, 9), l2[9]!)\n const nationality = l2.slice(10, 13)\n const dob = { value: l2.slice(13, 19), ok: checkOk(l2.slice(13, 19), l2[19]!) }\n const expiry = { value: l2.slice(21, 27), ok: checkOk(l2.slice(21, 27), l2[27]!) }\n return build(name, documentNumber, nationality, dob, expiry)\n}\n\n/** TD1 (ID card): three 30-char lines. 
*/\nfunction parseTd1(l1: string, l2: string, l3: string): ParseOutput {\n const documentNumber = field(l1.slice(5, 14), l1[14]!)\n const dob = { value: l2.slice(0, 6), ok: checkOk(l2.slice(0, 6), l2[6]!) }\n const expiry = { value: l2.slice(8, 14), ok: checkOk(l2.slice(8, 14), l2[14]!) }\n const nationality = l2.slice(15, 18)\n const name = parseNameField(l3)\n return build(name, documentNumber, nationality, dob, expiry)\n}\n\nclass MrzParser implements DocumentParser {\n readonly name = 'mrz'\n\n parse(input: ParseInput): ParseOutput | null {\n const lines = mrzLines(input)\n if (lines.length < 2) return null\n\n // TD1: three ~30-char lines.\n const td1 = lines.filter((l) => l.length >= 28 && l.length <= 32)\n if (td1.length >= 3) return parseTd1(pad(td1[0]!, 30), pad(td1[1]!, 30), td1[2]!)\n\n // TD3: two ~44-char lines.\n const td3 = lines.filter((l) => l.length >= 42 && l.length <= 46)\n if (td3.length >= 2) return parseTd3(pad(td3[0]!, 44), pad(td3[1]!, 44))\n\n // TD2: two ~36-char lines.\n const td2 = lines.filter((l) => l.length >= 34 && l.length <= 38)\n if (td2.length >= 2) return parseTd2(pad(td2[0]!, 36), pad(td2[1]!, 36))\n\n return null\n }\n}\n\nfunction pad(line: string, len: number): string {\n return line.length >= len ? line : line + FILLER.repeat(len - line.length)\n}\n\n/** The MRZ parser — a country-agnostic default for machine-readable documents. 
*/\nexport function mrzParser(): DocumentParser {\n return new MrzParser()\n}\n"],"mappings":";;;;;;;AAUA,MAAM,SAAS;;AAGf,SAAS,UAAU,IAAoB;CACrC,IAAI,OAAO,QAAQ,OAAO;CAC1B,IAAI,MAAM,OAAO,MAAM,KAAK,OAAO,GAAG,WAAW,CAAC,IAAI;CACtD,IAAI,MAAM,OAAO,MAAM,KAAK,OAAO,GAAG,WAAW,CAAC,IAAI;CACtD,OAAO;AACT;;AAGA,SAAS,WAAW,OAAuB;CACzC,MAAM,UAAU;EAAC;EAAG;EAAG;CAAC;CACxB,IAAI,MAAM;CACV,KAAK,IAAI,IAAI,GAAG,IAAI,MAAM,QAAQ,KAAK,GAAG,OAAO,UAAU,MAAM,EAAG,IAAI,QAAQ,IAAI;CACpF,OAAO,MAAM;AACf;;AAGA,SAAS,QAAQ,OAAe,UAA2B;CACzD,IAAI,CAAC,UAAU,KAAK,QAAQ,GAAG,OAAO;CACtC,OAAO,WAAW,KAAK,MAAM,OAAO,QAAQ;AAC9C;;AAGA,SAAS,QAAQ,QAAgB,MAA6C;CAC5E,IAAI,CAAC,aAAa,KAAK,MAAM,GAAG,OAAO,KAAA;CACvC,MAAM,KAAK,OAAO,OAAO,MAAM,GAAG,CAAC,CAAC;CACpC,MAAM,KAAK,OAAO,MAAM,GAAG,CAAC;CAC5B,MAAM,KAAK,OAAO,MAAM,GAAG,CAAC;CAC5B,IAAI,OAAO,EAAE,IAAI,KAAK,OAAO,EAAE,IAAI,MAAM,OAAO,EAAE,IAAI,KAAK,OAAO,EAAE,IAAI,IAAI,OAAO,KAAA;CAEnF,MAAM,QAAQ;CAEd,OAAO,IADS,SAAS,QAAS,KAAK,QAAQ,OAAO,MAAQ,KAAK,QAAQ,MAAO,QAC9D,GAAG,GAAG,GAAG,GAAG;AAClC;;AAGA,SAAS,eAAe,OAAuE;CAC7F,MAAM,CAAC,aAAa,IAAI,WAAW,MAAM,MAAM,MAAM,IAAI;CACzD,MAAM,UAAU,WAAW,QAAQ,MAAM,GAAG,CAAC,CAAC,KAAK;CACnD,MAAM,QAAQ,SAAS,QAAQ,MAAM,GAAG,CAAC,CAAC,KAAK;CAC/C,MAAM,YAAY,MAAM,MAAM,KAAK,CAAC,CAAC,MAAM,KAAA;CAC3C,MAAM,WAAW,CAAC,OAAO,OAAO,CAAC,CAAC,OAAO,OAAO,CAAC,CAAC,KAAK,GAAG,KAAK,KAAA;CAC/D,OAAO;EAAE;EAAW,UAAU,WAAW,KAAA;EAAW;CAAS;AAC/D;;AAGA,SAAS,SAAS,OAA6B;CAE7C,QADc,MAAM,SAAS,MAAM,KAAK,MAAM,OAAO,EAAA,CAElD,KAAK,MACJ,EACG,YAAY,CAAC,CACb,QAAQ,QAAQ,EAAE,CAAC,CACnB,QAAQ,eAAe,EAAE,CAC9B,CAAC,CACA,QAAQ,MAAM,EAAE,UAAU,MAAM,eAAe,KAAK,CAAC,KAAK,EAAE,SAAS,MAAM,CAAC;AACjF;AAQA,SAAS,MAAM,KAAa,UAA2B;CACrD,OAAO;EAAE,OAAO,IAAI,QAAQ,MAAM,EAAE;EAAG,IAAI,QAAQ,KAAK,QAAQ;CAAE;AACpE;AAEA,SAAS,MACP,MACA,gBACA,aACA,KACA,QACa;CACb,MAAM,SAAoB;EACxB,GAAG;EACH,gBAAgB,eAAe,SAAS,KAAA;EACxC,aAAa,YAAY,QAAQ,MAAM,EAAE,KAAK,KAAA;EAC9C,aAAa,QAAQ,IAAI,OAAO,KAAK;EACrC,YAAY,QAAQ,OAAO,OAAO,QAAQ;CAC5C;CACA,MAAM,SAAS;EAAC,eAAe;EAAI,IAAI;EAAI,OAAO;CAAE;CAGpD,OAAO;EACL;EACA,YAHiB,KAAM,MADV,OAAO,OAAO,OAAO,CAAC,CAAC,SACG,OA
AO;EAI9C,KAAK;GAAE,QAAQ;GAAO,QAAQ;IAAE,gBAAgB,eAAe;IAAI,KAAK,IAAI;IAAI,QAAQ,OAAO;GAAG;EAAE;CACtG;AACF;;AAGA,SAAS,SAAS,IAAY,IAAyB;CAMrD,OAAO,MALM,eAAe,GAAG,MAAM,CAAC,CAKtB,GAJO,MAAM,GAAG,MAAM,GAAG,CAAC,GAAG,GAAG,EAIhB,GAHZ,GAAG,MAAM,IAAI,EAGY,GAAG;EAFlC,OAAO,GAAG,MAAM,IAAI,EAAE;EAAG,IAAI,QAAQ,GAAG,MAAM,IAAI,EAAE,GAAG,GAAG,GAAI;CAE1B,GAAG;EADpC,OAAO,GAAG,MAAM,IAAI,EAAE;EAAG,IAAI,QAAQ,GAAG,MAAM,IAAI,EAAE,GAAG,GAAG,GAAI;CACrB,CAAC;AAC7D;;AAGA,SAAS,SAAS,IAAY,IAAyB;CAMrD,OAAO,MALM,eAAe,GAAG,MAAM,CAAC,CAKtB,GAJO,MAAM,GAAG,MAAM,GAAG,CAAC,GAAG,GAAG,EAIhB,GAHZ,GAAG,MAAM,IAAI,EAGY,GAAG;EAFlC,OAAO,GAAG,MAAM,IAAI,EAAE;EAAG,IAAI,QAAQ,GAAG,MAAM,IAAI,EAAE,GAAG,GAAG,GAAI;CAE1B,GAAG;EADpC,OAAO,GAAG,MAAM,IAAI,EAAE;EAAG,IAAI,QAAQ,GAAG,MAAM,IAAI,EAAE,GAAG,GAAG,GAAI;CACrB,CAAC;AAC7D;;AAGA,SAAS,SAAS,IAAY,IAAY,IAAyB;CACjE,MAAM,iBAAiB,MAAM,GAAG,MAAM,GAAG,EAAE,GAAG,GAAG,GAAI;CACrD,MAAM,MAAM;EAAE,OAAO,GAAG,MAAM,GAAG,CAAC;EAAG,IAAI,QAAQ,GAAG,MAAM,GAAG,CAAC,GAAG,GAAG,EAAG;CAAE;CACzE,MAAM,SAAS;EAAE,OAAO,GAAG,MAAM,GAAG,EAAE;EAAG,IAAI,QAAQ,GAAG,MAAM,GAAG,EAAE,GAAG,GAAG,GAAI;CAAE;CAC/E,MAAM,cAAc,GAAG,MAAM,IAAI,EAAE;CAEnC,OAAO,MADM,eAAe,EACZ,GAAG,gBAAgB,aAAa,KAAK,MAAM;AAC7D;AAEA,IAAM,YAAN,MAA0C;CACxC,OAAgB;CAEhB,MAAM,OAAuC;EAC3C,MAAM,QAAQ,SAAS,KAAK;EAC5B,IAAI,MAAM,SAAS,GAAG,OAAO;EAG7B,MAAM,MAAM,MAAM,QAAQ,MAAM,EAAE,UAAU,MAAM,EAAE,UAAU,EAAE;EAChE,IAAI,IAAI,UAAU,GAAG,OAAO,SAAS,IAAI,IAAI,IAAK,EAAE,GAAG,IAAI,IAAI,IAAK,EAAE,GAAG,IAAI,EAAG;EAGhF,MAAM,MAAM,MAAM,QAAQ,MAAM,EAAE,UAAU,MAAM,EAAE,UAAU,EAAE;EAChE,IAAI,IAAI,UAAU,GAAG,OAAO,SAAS,IAAI,IAAI,IAAK,EAAE,GAAG,IAAI,IAAI,IAAK,EAAE,CAAC;EAGvE,MAAM,MAAM,MAAM,QAAQ,MAAM,EAAE,UAAU,MAAM,EAAE,UAAU,EAAE;EAChE,IAAI,IAAI,UAAU,GAAG,OAAO,SAAS,IAAI,IAAI,IAAK,EAAE,GAAG,IAAI,IAAI,IAAK,EAAE,CAAC;EAEvE,OAAO;CACT;AACF;AAEA,SAAS,IAAI,MAAc,KAAqB;CAC9C,OAAO,KAAK,UAAU,MAAM,OAAO,OAAO,OAAO,OAAO,MAAM,KAAK,MAAM;AAC3E;;AAGA,SAAgB,YAA4B;CAC1C,OAAO,IAAI,UAAU;AACvB"}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import { DocumentParser, ParseInput, ParseOutput } from "./types.mjs";
|
|
2
|
+
import { DocumentType } from "@arkyc/types";
|
|
3
|
+
|
|
4
|
+
//#region src/parsers/registry.d.ts
|
|
5
|
+
/** Identifies the pipeline stage whose parser produced a parse result. */
type ParseStage = "mrz" | "custom" | "generic";
|
|
7
|
+
/**
 * Holds the document parsers and resolves them in three stages per document:
 *
 * 1. **MRZ**: the machine-readable zone (reliable, check-digit-verified) when present.
 * 2. **Custom**: country/document-type-specific parsers added via {@link register},
 *    matched against the user's selection (most specific first) when the MRZ fails.
 * 3. **Generic**: a best-effort text scraper used as the last resort.
 *
 * A custom parser declares the countries and document types it applies to;
 * register one for each document layout the MRZ and generic stages can't read.
 */
declare class DocumentParserRegistry {
  private readonly primary;
  private readonly fallback;
  private readonly parsers;

  /**
   * @param primary  Parser tried first for every document (the MRZ parser).
   * @param fallback Parser tried last, when nothing else extracts anything (generic).
   */
  constructor(primary: DocumentParser, fallback: DocumentParser);

  /** Add a custom country/type-specific parser. Returns `this` so calls can be chained. */
  register(parser: DocumentParser): this;

  /** List the custom parsers that apply to a selection, most specific first. */
  candidates(country?: string | null, documentType?: DocumentType | null): DocumentParser[];

  /**
   * Extract fields from OCR text: the MRZ parser first, then the matching
   * custom parsers, then the generic fallback. A result is always returned;
   * when nothing could parse, it has empty fields and confidence 0.
   */
  parse(input: ParseInput): ParseOutput;
}
|
|
38
|
+
/**
 * Build a registry whose primary stage is the MRZ parser and whose fallback is
 * the generic text parser, unless `options` overrides either one. Register
 * country/document-type-specific custom parsers on the returned registry for
 * documents the MRZ and generic stages can't read.
 */
declare function createDocumentParserRegistry(options?: { primary?: DocumentParser; fallback?: DocumentParser }): DocumentParserRegistry;
|
|
47
|
+
//#endregion
|
|
48
|
+
export { DocumentParserRegistry, ParseStage, createDocumentParserRegistry };
|
|
49
|
+
//# sourceMappingURL=registry.d.mts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"registry.d.mts","names":[],"sources":["../../src/parsers/registry.ts"],"mappings":";;;;;KA0BY,UAAA;AAAZ;;;;AAAsB;AAatB;;;;;;AAbA,cAaa,sBAAA;EAAA,iBAQQ,OAAA;EAAA,iBACA,QAAA;EAAA,iBARF,OAAA;EA+BoB;;;;cAxBlB,OAAA,EAAS,cAAA,EACT,QAAA,EAAU,cAAA;;EAI7B,QAAA,CAAS,MAAA,EAAQ,cAAA;EALE;EAWnB,UAAA,CAAW,OAAA,kBAAyB,YAAA,GAAe,YAAA,UAAsB,cAAA;EAVtD;;;;;EAuBnB,KAAA,CAAM,KAAA,EAAO,UAAA,GAAa,WAAA;AAAA;;;;;;iBA8BZ,4BAAA,CACd,OAAA;EAAW,OAAA,GAAU,cAAA;EAAgB,QAAA,GAAW,cAAA;AAAA,IAC/C,sBAAA"}
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
import { mrzParser } from "./mrz.mjs";
|
|
2
|
+
import { genericTextParser } from "./generic.mjs";
|
|
3
|
+
//#region src/parsers/registry.ts
|
|
4
|
+
/**
 * Compare two country codes, ignoring surrounding whitespace and letter case.
 *
 * @param {string} a - First country code.
 * @param {string} b - Second country code.
 * @returns {boolean} `true` when both codes normalize to the same string.
 */
function eqCountry(a, b) {
  const normalize = (code) => code.trim().toUpperCase();
  return normalize(a) === normalize(b);
}
|
|
7
|
+
/**
 * Score how specifically a parser matches the user's selection.
 *
 * A parser with a non-empty `countries` list applies only when `country` is
 * given and matches one of them; likewise for `documentTypes` and
 * `documentType`. Country scoping is worth 2 points and type scoping 1, so a
 * country-scoped parser outranks one scoped only by document type.
 *
 * @param {{ countries?: readonly string[], documentTypes?: readonly string[] }} parser
 * @param {string | null} [country] - The selected country code.
 * @param {string | null} [documentType] - The selected document type.
 * @returns {number} -1 when the parser does not apply, otherwise 0-3.
 */
function matchScore(parser, country, documentType) {
  const { countries, documentTypes } = parser;
  let score = 0;

  if (countries?.length) {
    const matchesCountry = Boolean(country) && countries.some((code) => eqCountry(code, country));
    if (!matchesCountry) return -1;
    score += 2;
  }

  if (documentTypes?.length) {
    const matchesType = Boolean(documentType) && documentTypes.includes(documentType);
    if (!matchesType) return -1;
    score += 1;
  }

  return score;
}
|
|
20
|
+
/**
 * Holds the document parsers and resolves them in three stages per document:
 *
 * 1. **MRZ**: the machine-readable zone (reliable, check-digit-verified) when present.
 * 2. **Custom**: country/document-type-specific parsers added via {@link register},
 *    matched against the user's selection (most specific first) when the MRZ fails.
 * 3. **Generic**: a best-effort text scraper used as the last resort.
 *
 * A custom parser declares the countries and document types it applies to;
 * register one for each document layout the MRZ and generic stages can't read.
 */
var DocumentParserRegistry = class {
  primary;
  fallback;
  parsers = [];

  /**
   * @param primary  Parser tried first for every document (the MRZ parser).
   * @param fallback Parser tried last, when nothing else extracts anything (generic).
   */
  constructor(primary, fallback) {
    this.primary = primary;
    this.fallback = fallback;
  }

  /** Add a custom country/type-specific parser. Returns `this` so calls can be chained. */
  register(parser) {
    this.parsers.push(parser);
    return this;
  }

  /** List the custom parsers that apply to a selection, most specific first. */
  candidates(country, documentType) {
    const applicable = [];
    this.parsers.forEach((parser, index) => {
      const score = matchScore(parser, country, documentType);
      if (score >= 0) applicable.push({ parser, index, score });
    });
    // Highest score first; ties keep registration order.
    applicable.sort((left, right) => right.score - left.score || left.index - right.index);
    return applicable.map(({ parser }) => parser);
  }

  /**
   * Extract fields from OCR text: the primary (MRZ) parser first, then the
   * matching custom parsers, then the generic fallback. A result is always
   * returned; when nothing could parse, it has empty fields and confidence 0.
   */
  parse(input) {
    const mrzResult = this.primary.parse(input);
    if (mrzResult) return tag(mrzResult, this.primary.name, "mrz");

    // Custom parsers are only resolved once the primary stage has failed.
    const customParsers = this.candidates(input.country, input.documentType);
    for (const custom of customParsers) {
      const customResult = custom.parse(input);
      if (customResult) return tag(customResult, custom.name, "custom");
    }

    const genericResult = this.fallback.parse(input);
    if (genericResult) return tag(genericResult, this.fallback.name, "generic");

    return { fields: {}, confidence: 0, raw: { parser: "none" } };
  }
};
|
|
77
|
+
/**
 * Return a copy of a parse result whose `raw` records which parser and stage
 * produced it.
 *
 * An object `raw` is merged in after the annotation, so its own keys win on a
 * name clash; any other `raw` value is kept under `value`.
 *
 * @param {{ raw?: unknown }} out - The parser's result (not mutated).
 * @param {string} parser - Name of the parser that produced `out`.
 * @param {string} stage - Pipeline stage: "mrz", "custom" or "generic".
 * @returns {object} `out` with `raw` replaced by the annotated object.
 */
function tag(out, parser, stage) {
  const { raw } = out;
  let detail;
  if (raw !== null && typeof raw === "object") {
    detail = raw;
  } else {
    detail = { value: raw };
  }
  const annotated = { parser, stage, ...detail };
  return { ...out, raw: annotated };
}
|
|
89
|
+
/**
 * Build a registry whose primary stage is the MRZ parser and whose fallback is
 * the generic text parser, unless `options` overrides either one. Register
 * country/document-type-specific custom parsers on the returned registry for
 * documents the MRZ and generic stages can't read.
 *
 * @param {{ primary?: object, fallback?: object }} [options] - Optional overrides.
 * @returns {DocumentParserRegistry} The new registry.
 */
function createDocumentParserRegistry(options = {}) {
  const primary = options.primary ?? mrzParser();
  const fallback = options.fallback ?? genericTextParser();
  return new DocumentParserRegistry(primary, fallback);
}
|
|
97
|
+
//#endregion
|
|
98
|
+
export { DocumentParserRegistry, createDocumentParserRegistry };
|
|
99
|
+
|
|
100
|
+
//# sourceMappingURL=registry.mjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"registry.mjs","names":[],"sources":["../../src/parsers/registry.ts"],"sourcesContent":["import type { DocumentType } from '@arkyc/types'\nimport type { DocumentParser, ParseInput, ParseOutput } from './types'\nimport { mrzParser } from './mrz'\nimport { genericTextParser } from './generic'\n\nfunction eqCountry(a: string, b: string): boolean {\n return a.trim().toUpperCase() === b.trim().toUpperCase()\n}\n\n/**\n * Specificity score for a parser against the user's selection, or -1 if it does\n * not apply. A parser scoped to a country scores higher than one scoped only to a\n * document type, so the most specific applicable parser wins.\n */\nfunction matchScore(parser: DocumentParser, country?: string | null, documentType?: DocumentType | null): number {\n const scopedCountries = parser.countries?.length ? parser.countries : null\n const scopedTypes = parser.documentTypes?.length ? parser.documentTypes : null\n\n const countryOk = !scopedCountries || (!!country && scopedCountries.some((c) => eqCountry(c, country)))\n const typeOk = !scopedTypes || (!!documentType && scopedTypes.includes(documentType))\n if (!countryOk || !typeOk) return -1\n\n return (scopedCountries ? 2 : 0) + (scopedTypes ? 1 : 0)\n}\n\n/** Which stage of the pipeline produced a parse result. */\nexport type ParseStage = 'mrz' | 'custom' | 'generic'\n\n/**\n * A registry of document parsers, resolved in three stages for each document:\n *\n * 1. **MRZ** — the machine-readable zone (reliable, check-digit-verified) when present.\n * 2. **Custom** — country/document-type-specific parsers (registered via {@link register}),\n * matched against the user's selection (most specific first) when the MRZ fails.\n * 3. 
**Generic** — a best-effort text scraper, the last resort.\n *\n * Custom parsers declare the countries and document types they apply to; register\n * one per document layout the MRZ + generic stages can't read.\n */\nexport class DocumentParserRegistry {\n private readonly parsers: DocumentParser[] = []\n\n /**\n * @param primary Tried first for every document (the MRZ parser).\n * @param fallback Tried last when nothing else extracts anything (generic).\n */\n constructor(\n private readonly primary: DocumentParser,\n private readonly fallback: DocumentParser,\n ) {}\n\n /** Register a custom country/type-specific parser. Returns `this` for chaining. */\n register(parser: DocumentParser): this {\n this.parsers.push(parser)\n return this\n }\n\n /** The applicable custom parsers for a selection, most specific first. */\n candidates(country?: string | null, documentType?: DocumentType | null): DocumentParser[] {\n return this.parsers\n .map((parser, index) => ({ parser, index, score: matchScore(parser, country, documentType) }))\n .filter((entry) => entry.score >= 0)\n .sort((a, b) => b.score - a.score || a.index - b.index)\n .map((entry) => entry.parser)\n }\n\n /**\n * Parse fields from OCR text: MRZ first, then the matched custom parsers, then\n * the generic fallback. Always returns a result (empty fields, confidence 0,\n * when nothing could parse).\n */\n parse(input: ParseInput): ParseOutput {\n // 1. MRZ — the gold standard when a machine-readable zone is present.\n const fromMrz = this.primary.parse(input)\n if (fromMrz) return tag(fromMrz, this.primary.name, 'mrz')\n\n // 2. Custom country/type-specific parsers, most specific first.\n for (const parser of this.candidates(input.country, input.documentType)) {\n const out = parser.parse(input)\n if (out) return tag(out, parser.name, 'custom')\n }\n\n // 3. 
Generic best-effort extraction.\n const fromGeneric = this.fallback.parse(input)\n if (fromGeneric) return tag(fromGeneric, this.fallback.name, 'generic')\n\n return { fields: {}, confidence: 0, raw: { parser: 'none' } }\n }\n}\n\n/** Annotate a parse result with which parser + stage produced it. */\nfunction tag(out: ParseOutput, parser: string, stage: ParseStage): ParseOutput {\n const detail = out.raw && typeof out.raw === 'object' ? out.raw : { value: out.raw }\n return { ...out, raw: { parser, stage, ...detail } }\n}\n\n/**\n * Create a registry with the MRZ parser as the primary stage and the generic text\n * parser as the fallback. Register country/document-type-specific custom parsers\n * on top for documents the MRZ + generic stages can't read.\n */\nexport function createDocumentParserRegistry(\n options: { primary?: DocumentParser; fallback?: DocumentParser } = {},\n): DocumentParserRegistry {\n return new DocumentParserRegistry(options.primary ?? mrzParser(), options.fallback ?? 
genericTextParser())\n}\n"],"mappings":";;;AAKA,SAAS,UAAU,GAAW,GAAoB;CAChD,OAAO,EAAE,KAAK,CAAC,CAAC,YAAY,MAAM,EAAE,KAAK,CAAC,CAAC,YAAY;AACzD;;;;;;AAOA,SAAS,WAAW,QAAwB,SAAyB,cAA4C;CAC/G,MAAM,kBAAkB,OAAO,WAAW,SAAS,OAAO,YAAY;CACtE,MAAM,cAAc,OAAO,eAAe,SAAS,OAAO,gBAAgB;CAE1E,MAAM,YAAY,CAAC,mBAAoB,CAAC,CAAC,WAAW,gBAAgB,MAAM,MAAM,UAAU,GAAG,OAAO,CAAC;CACrG,MAAM,SAAS,CAAC,eAAgB,CAAC,CAAC,gBAAgB,YAAY,SAAS,YAAY;CACnF,IAAI,CAAC,aAAa,CAAC,QAAQ,OAAO;CAElC,QAAQ,kBAAkB,IAAI,MAAM,cAAc,IAAI;AACxD;;;;;;;;;;;;AAgBA,IAAa,yBAAb,MAAoC;CAQf;CACA;CARnB,UAA6C,CAAC;;;;;CAM9C,YACE,SACA,UACA;EAFiB,KAAA,UAAA;EACA,KAAA,WAAA;CAChB;;CAGH,SAAS,QAA8B;EACrC,KAAK,QAAQ,KAAK,MAAM;EACxB,OAAO;CACT;;CAGA,WAAW,SAAyB,cAAsD;EACxF,OAAO,KAAK,QACT,KAAK,QAAQ,WAAW;GAAE;GAAQ;GAAO,OAAO,WAAW,QAAQ,SAAS,YAAY;EAAE,EAAE,CAAC,CAC7F,QAAQ,UAAU,MAAM,SAAS,CAAC,CAAC,CACnC,MAAM,GAAG,MAAM,EAAE,QAAQ,EAAE,SAAS,EAAE,QAAQ,EAAE,KAAK,CAAC,CACtD,KAAK,UAAU,MAAM,MAAM;CAChC;;;;;;CAOA,MAAM,OAAgC;EAEpC,MAAM,UAAU,KAAK,QAAQ,MAAM,KAAK;EACxC,IAAI,SAAS,OAAO,IAAI,SAAS,KAAK,QAAQ,MAAM,KAAK;EAGzD,KAAK,MAAM,UAAU,KAAK,WAAW,MAAM,SAAS,MAAM,YAAY,GAAG;GACvE,MAAM,MAAM,OAAO,MAAM,KAAK;GAC9B,IAAI,KAAK,OAAO,IAAI,KAAK,OAAO,MAAM,QAAQ;EAChD;EAGA,MAAM,cAAc,KAAK,SAAS,MAAM,KAAK;EAC7C,IAAI,aAAa,OAAO,IAAI,aAAa,KAAK,SAAS,MAAM,SAAS;EAEtE,OAAO;GAAE,QAAQ,CAAC;GAAG,YAAY;GAAG,KAAK,EAAE,QAAQ,OAAO;EAAE;CAC9D;AACF;;AAGA,SAAS,IAAI,KAAkB,QAAgB,OAAgC;CAC7E,MAAM,SAAS,IAAI,OAAO,OAAO,IAAI,QAAQ,WAAW,IAAI,MAAM,EAAE,OAAO,IAAI,IAAI;CACnF,OAAO;EAAE,GAAG;EAAK,KAAK;GAAE;GAAQ;GAAO,GAAG;EAAO;CAAE;AACrD;;;;;;AAOA,SAAgB,6BACd,UAAmE,CAAC,GAC5C;CACxB,OAAO,IAAI,uBAAuB,QAAQ,WAAW,UAAU,GAAG,QAAQ,YAAY,kBAAkB,CAAC;AAC3G"}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import { DocumentType, OcrFields } from "@arkyc/types";
|
|
2
|
+
|
|
3
|
+
//#region src/parsers/types.d.ts
|
|
4
|
+
/** The OCR text + context handed to a document parser. */
|
|
5
|
+
interface ParseInput {
|
|
6
|
+
/** Full OCR text for the document (raw engine output). */
|
|
7
|
+
text: string;
|
|
8
|
+
/** Pre-split lines; derived from `text` when omitted. */
|
|
9
|
+
lines?: string[];
|
|
10
|
+
/** The document category the user selected; matched against each parser's `documentTypes`. */
|
|
11
|
+
documentType?: DocumentType | null;
|
|
12
|
+
/** The ISO country the user selected; matched against each parser's `countries`. */
|
|
13
|
+
country?: string | null;
|
|
14
|
+
}
|
|
15
|
+
/** A parser's structured result. */
|
|
16
|
+
interface ParseOutput {
|
|
17
|
+
fields: OcrFields; // Structured fields the parser extracted from the OCR text.
|
|
18
|
+
/** Confidence in this parse, 0–1. */
|
|
19
|
+
confidence: number;
|
|
20
|
+
/**
 * Anything worth retaining for audit (matched zone, raw lines, …). The parser
 * registry wraps this into an object that also carries `parser` and `stage` keys.
 */
|
|
21
|
+
raw?: unknown;
|
|
22
|
+
}
|
|
23
|
+
/**
|
|
24
|
+
* A document parser turns raw OCR text into structured {@link OcrFields}. A parser
|
|
25
|
+
* declares the countries and document types it applies to; the registry matches
|
|
26
|
+
* those against the user's selection to pick the most specific parser, and falls
|
|
27
|
+
* back to the default when none match (or none can parse the text).
|
|
28
|
+
*/
|
|
29
|
+
interface DocumentParser {
|
|
30
|
+
readonly name: string; // Parser name; the registry records it as `raw.parser` on results it tags.
|
|
31
|
+
/**
|
|
32
|
+
* ISO-3166 country codes (alpha-2 or alpha-3) this parser applies to. Omit (or
|
|
33
|
+
* leave empty) to apply to any country.
|
|
34
|
+
*/
|
|
35
|
+
readonly countries?: readonly string[];
|
|
36
|
+
/** Document types this parser applies to. Omit/empty to apply to any type. */
|
|
37
|
+
readonly documentTypes?: readonly DocumentType[];
|
|
38
|
+
/** Parse fields from the input, or return `null` if it can't handle it. */
|
|
39
|
+
parse(input: ParseInput): ParseOutput | null;
|
|
40
|
+
}
|
|
41
|
+
//#endregion
|
|
42
|
+
export { DocumentParser, ParseInput, ParseOutput };
|
|
43
|
+
//# sourceMappingURL=types.d.mts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.mts","names":[],"sources":["../../src/parsers/types.ts"],"mappings":";;;;UAGiB,UAAA;EAAA;EAEf,IAAA;;EAEA,KAAA;EAFA;EAIA,YAAA,GAAe,YAAY;EAA3B;EAEA,OAAA;AAAA;;UAIe,WAAA;EACf,MAAA,EAAQ,SAAS;EADS;EAG1B,UAAA;EAFiB;EAIjB,GAAA;AAAA;;;;AAAG;AASL;;UAAiB,cAAA;EAAA,SACN,IAAA;EASI;;;;EAAA,SAJJ,SAAA;EAAA;EAAA,SAEA,aAAA,YAAyB,YAAA;EAAA;EAElC,KAAA,CAAM,KAAA,EAAO,UAAA,GAAa,WAAA;AAAA"}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import { OcrConfig, OcrDriver } from "./types.mjs";
|
|
2
|
+
|
|
3
|
+
//#region src/registry.d.ts
|
|
4
|
+
/**
|
|
5
|
+
* Selects an OCR driver from config. Call sites depend only on the
|
|
6
|
+
* {@link OcrDriver} interface, so swapping `config.driver` changes behaviour
|
|
7
|
+
* with no other changes.
|
|
8
|
+
*/
|
|
9
|
+
declare class OcrDriverFactory {
|
|
10
|
+
/**
|
|
11
|
+
* Resolve the OCR driver named by `config`.
|
|
12
|
+
*
|
|
13
|
+
* @param config Configuration whose `driver` field names the driver to use.
|
|
14
|
+
* @returns The {@link OcrDriver} selected by `config.driver`.
|
|
15
|
+
*/
|
|
16
|
+
static create(config: OcrConfig): OcrDriver;
|
|
17
|
+
}
|
|
18
|
+
//#endregion
|
|
19
|
+
export { OcrDriverFactory };
|
|
20
|
+
//# sourceMappingURL=registry.d.mts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"registry.d.mts","names":[],"sources":["../src/registry.ts"],"mappings":";;;;;AAYA;;;cAAa,gBAAA;EAOJ;;;;;AAAoC;EAApC,OAAA,MAAA,CAAO,MAAA,EAAQ,SAAA,GAAY,SAAS;AAAA"}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import { AnthropicOcrDriver } from "./drivers/ai.mjs";
|
|
2
|
+
import { ExternalOcrDriver } from "./drivers/external.mjs";
|
|
3
|
+
import { MockOcrDriver } from "./drivers/mock.mjs";
|
|
4
|
+
import { TesseractOcrDriver } from "./drivers/tesseract.mjs";
|
|
5
|
+
//#region src/registry.ts
|
|
6
|
+
/**
|
|
7
|
+
* Selects an OCR driver from config. Call sites depend only on the
|
|
8
|
+
* {@link OcrDriver} interface, so swapping `config.driver` changes behaviour
|
|
9
|
+
* with no other changes.
|
|
10
|
+
*/
|
|
11
|
+
var OcrDriverFactory = class {
|
|
12
|
+
/**
|
|
13
|
+
* Resolve the OCR driver named by `config`.
|
|
14
|
+
*
|
|
15
|
+
* @param config Driver selection and options; `config.driver` picks the implementation.
|
|
16
|
+
* @returns A newly constructed driver for `config.driver`; throws an `Error` for an unknown name.
|
|
17
|
+
*/
|
|
18
|
+
static create(config) {
|
|
19
|
+
switch (config.driver) {
|
|
20
|
+
case "mock": return new MockOcrDriver(); // takes no options from config
|
|
21
|
+
case "tesseract": return new TesseractOcrDriver({ language: config.language }); // only `language` is forwarded
|
|
22
|
+
case "external": return new ExternalOcrDriver(config); // receives the whole config object
|
|
23
|
+
case "ai": return new AnthropicOcrDriver({
|
|
24
|
+
apiKey: config.apiKey,
|
|
25
|
+
model: config.model,
|
|
26
|
+
baseUrl: config.endpoint, // `endpoint` doubles as the API base override for this driver
|
|
27
|
+
maxEdge: config.maxEdge
|
|
28
|
+
});
|
|
29
|
+
default: throw new Error(`Unknown OCR driver: ${config.driver}`); // unrecognised driver name
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
};
|
|
33
|
+
//#endregion
|
|
34
|
+
export { OcrDriverFactory };
|
|
35
|
+
|
|
36
|
+
//# sourceMappingURL=registry.mjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"registry.mjs","names":[],"sources":["../src/registry.ts"],"sourcesContent":["import type { OcrConfig, OcrDriver } from './types'\n\nimport { AnthropicOcrDriver } from './drivers/ai'\nimport { ExternalOcrDriver } from './drivers/external'\nimport { MockOcrDriver } from './drivers/mock'\nimport { TesseractOcrDriver } from './drivers/tesseract'\n\n/**\n * Selects an OCR driver from config. Call sites depend only on the\n * {@link OcrDriver} interface, so swapping `config.driver` changes behaviour\n * with no other changes.\n */\nexport class OcrDriverFactory {\n /**\n * Resolve the OCR driver named by `config`.\n *\n * @param config\n * @returns\n */\n static create(config: OcrConfig): OcrDriver {\n switch (config.driver) {\n case 'mock':\n return new MockOcrDriver()\n case 'tesseract':\n return new TesseractOcrDriver({ language: config.language })\n case 'external':\n return new ExternalOcrDriver(config)\n case 'ai':\n return new AnthropicOcrDriver({\n apiKey: config.apiKey,\n model: config.model,\n baseUrl: config.endpoint,\n maxEdge: config.maxEdge,\n })\n default:\n throw new Error(`Unknown OCR driver: ${(config as OcrConfig).driver}`)\n }\n }\n}\n"],"mappings":";;;;;;;;;;AAYA,IAAa,mBAAb,MAA8B;;;;;;;CAO5B,OAAO,OAAO,QAA8B;EAC1C,QAAQ,OAAO,QAAf;GACE,KAAK,QACH,OAAO,IAAI,cAAc;GAC3B,KAAK,aACH,OAAO,IAAI,mBAAmB,EAAE,UAAU,OAAO,SAAS,CAAC;GAC7D,KAAK,YACH,OAAO,IAAI,kBAAkB,MAAM;GACrC,KAAK,MACH,OAAO,IAAI,mBAAmB;IAC5B,QAAQ,OAAO;IACf,OAAO,OAAO;IACd,SAAS,OAAO;IAChB,SAAS,OAAO;GAClB,CAAC;GACH,SACE,MAAM,IAAI,MAAM,uBAAwB,OAAqB,QAAQ;EACzE;CACF;AACF"}
|
package/dist/types.d.mts
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import { DocumentType, OcrResultData } from "@arkyc/types";
|
|
2
|
+
|
|
3
|
+
//#region src/types.d.ts
|
|
4
|
+
/** The bytes + context handed to an OCR driver for extraction. */
|
|
5
|
+
interface OcrRequest {
|
|
6
|
+
/** Raw document-front image bytes. */
|
|
7
|
+
image: Uint8Array;
|
|
8
|
+
/**
|
|
9
|
+
* Raw document-back image bytes, when captured. Some documents (TD1 ID cards,
|
|
10
|
+
* residence permits) print the MRZ on the back, so both sides are read.
|
|
11
|
+
*/
|
|
12
|
+
backImage?: Uint8Array;
|
|
13
|
+
/** Optional hint about the document category. */
|
|
14
|
+
documentType?: DocumentType | null;
|
|
15
|
+
/** Optional ISO country code hint. */
|
|
16
|
+
country?: string | null;
|
|
17
|
+
/**
|
|
18
|
+
* Optional deterministic signals (used by the `mock` driver and tests to
|
|
19
|
+
* steer the extracted confidence / expiry). Ignored by real drivers.
|
|
20
|
+
*/
|
|
21
|
+
hints?: {
|
|
22
|
+
confidence?: number; // Steers the extracted confidence. NOTE(review): presumably on a 0–1 scale; confirm.
|
|
23
|
+
expired?: boolean; // Steers the extracted expiry. NOTE(review): presumably `true` yields an expired document; confirm.
|
|
24
|
+
};
|
|
25
|
+
}
|
|
26
|
+
/** A pluggable OCR provider. */
|
|
27
|
+
interface OcrDriver {
|
|
28
|
+
readonly name: string; // Name of this driver. NOTE(review): presumably one of the OcrDriverName values; confirm.
|
|
29
|
+
extract(request: OcrRequest): Promise<OcrResultData>; // Asynchronously extracts OCR result data from the request.
|
|
30
|
+
}
|
|
31
|
+
/** Identifier for a registered OCR driver; the set of values accepted by `OcrConfig.driver`. */
|
|
32
|
+
type OcrDriverName = 'mock' | 'tesseract' | 'external' | 'ai';
|
|
33
|
+
/** Configuration selecting + parameterising the active OCR driver. */
|
|
34
|
+
interface OcrConfig {
|
|
35
|
+
driver: OcrDriverName; // Which driver `OcrDriverFactory.create` instantiates.
|
|
36
|
+
/** Base URL for the `external` driver, or API base override for `ai`. */
|
|
37
|
+
endpoint?: string;
|
|
38
|
+
apiKey?: string; // API key; `OcrDriverFactory.create` forwards it to the `ai` driver (the `external` driver is handed the whole config).
|
|
39
|
+
/** Tesseract language(s), e.g. `eng` (the `tesseract` driver). */
|
|
40
|
+
language?: string;
|
|
41
|
+
/** Vision model id (the `ai` driver). */
|
|
42
|
+
model?: string;
|
|
43
|
+
/** Longest image edge (px) uploaded by the `ai` driver. */
|
|
44
|
+
maxEdge?: number;
|
|
45
|
+
}
|
|
46
|
+
//#endregion
|
|
47
|
+
export { OcrConfig, OcrDriver, OcrDriverName, OcrRequest };
|
|
48
|
+
//# sourceMappingURL=types.d.mts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.mts","names":[],"sources":["../src/types.ts"],"mappings":";;;;UAGiB,UAAA;EAAA;EAEf,KAAA,EAAO,UAAA;;;;;EAKP,SAAA,GAAY,UAAA;EAEe;EAA3B,YAAA,GAAe,YAAA;EAPR;EASP,OAAA;EAJY;;;;EASZ,KAAA;IAAU,UAAA;IAAqB,OAAA;EAAA;AAAA;AAIjC;AAAA,UAAiB,SAAA;EAAA,SACN,IAAA;EACT,OAAA,CAAQ,OAAA,EAAS,UAAA,GAAa,OAAA,CAAQ,aAAA;AAAA;;KAI5B,aAAA;;UAGK,SAAA;EACf,MAAA,EAAQ,aAAa;EARJ;EAUjB,QAAA;EACA,MAAA;EAXsC;EAatC,QAAA;EAbmD;EAenD,KAAA;EAXuB;EAavB,OAAA;AAAA"}
|
package/package.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@arkyc/ocr",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Driver-based OCR extraction",
|
|
5
|
+
"license": "MIT",
|
|
6
|
+
"type": "module",
|
|
7
|
+
"main": "./dist/index.mjs",
|
|
8
|
+
"module": "./dist/index.mjs",
|
|
9
|
+
"types": "./dist/index.d.mts",
|
|
10
|
+
"exports": {
|
|
11
|
+
".": {
|
|
12
|
+
"types": "./dist/index.d.mts",
|
|
13
|
+
"import": "./dist/index.mjs"
|
|
14
|
+
}
|
|
15
|
+
},
|
|
16
|
+
"files": [
|
|
17
|
+
"dist"
|
|
18
|
+
],
|
|
19
|
+
"dependencies": {
|
|
20
|
+
"@arkyc/types": "^1.0.0"
|
|
21
|
+
},
|
|
22
|
+
"optionalDependencies": {
|
|
23
|
+
"sharp": "^0.33.5",
|
|
24
|
+
"tesseract.js": "^5.1.1"
|
|
25
|
+
},
|
|
26
|
+
"scripts": {
|
|
27
|
+
"typecheck": "tsc --noEmit",
|
|
28
|
+
"test": "vitest run",
|
|
29
|
+
"lint": "eslint src",
|
|
30
|
+
"clean": "rm -rf dist"
|
|
31
|
+
}
|
|
32
|
+
}
|