npm - @arkyc/ocr - Versions diffs - 1.0.0 - Mend

@arkyc/ocr 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43) hide show

package/dist/drivers/ai.d.mts +120 -0
package/dist/drivers/ai.d.mts.map +1 -0
package/dist/drivers/ai.mjs +454 -0
package/dist/drivers/ai.mjs.map +1 -0
package/dist/drivers/external.d.mts +17 -0
package/dist/drivers/external.d.mts.map +1 -0
package/dist/drivers/external.mjs +34 -0
package/dist/drivers/external.mjs.map +1 -0
package/dist/drivers/mock.d.mts +16 -0
package/dist/drivers/mock.d.mts.map +1 -0
package/dist/drivers/mock.mjs +34 -0
package/dist/drivers/mock.mjs.map +1 -0
package/dist/drivers/preprocess.d.mts +51 -0
package/dist/drivers/preprocess.d.mts.map +1 -0
package/dist/drivers/preprocess.mjs +50 -0
package/dist/drivers/preprocess.mjs.map +1 -0
package/dist/drivers/tesseract.d.mts +75 -0
package/dist/drivers/tesseract.d.mts.map +1 -0
package/dist/drivers/tesseract.mjs +175 -0
package/dist/drivers/tesseract.mjs.map +1 -0
package/dist/index.d.mts +12 -0
package/dist/index.mjs +10 -0
package/dist/parsers/generic.d.mts +8 -0
package/dist/parsers/generic.d.mts.map +1 -0
package/dist/parsers/generic.mjs +84 -0
package/dist/parsers/generic.mjs.map +1 -0
package/dist/parsers/mrz.d.mts +8 -0
package/dist/parsers/mrz.d.mts.map +1 -0
package/dist/parsers/mrz.mjs +149 -0
package/dist/parsers/mrz.mjs.map +1 -0
package/dist/parsers/registry.d.mts +49 -0
package/dist/parsers/registry.d.mts.map +1 -0
package/dist/parsers/registry.mjs +100 -0
package/dist/parsers/registry.mjs.map +1 -0
package/dist/parsers/types.d.mts +43 -0
package/dist/parsers/types.d.mts.map +1 -0
package/dist/registry.d.mts +20 -0
package/dist/registry.d.mts.map +1 -0
package/dist/registry.mjs +36 -0
package/dist/registry.mjs.map +1 -0
package/dist/types.d.mts +48 -0
package/dist/types.d.mts.map +1 -0
package/package.json +32 -0

package/dist/drivers/external.mjs ADDED Viewed

@@ -0,0 +1,34 @@
+//#region src/drivers/external.ts
+/**
+* Generic HTTP OCR driver: POSTs the base64 image to a configured endpoint and
+* expects an {@link OcrResultData}-shaped JSON response.
+*
+* The constructor throws when `config.endpoint` is falsy. `extract` throws an
+* Error carrying the HTTP status when `res.ok` is false; otherwise the parsed
+* JSON body is returned as-is, with no validation of its shape.
+*
+* NOTE(review): the fetch call passes no timeout or abort signal, so how long a
+* stalled endpoint can hold `extract` open is left to the runtime.
+*/
+var ExternalOcrDriver = class {
+	config; // only `endpoint` and `apiKey` are read from it in this class
+	name = "external";
+	constructor(config) {
+		this.config = config;
+		if (!config.endpoint) throw new Error("ExternalOcrDriver requires config.endpoint");
+	}
+	async extract(request) {
+		const res = await fetch(this.config.endpoint, {
+			method: "POST",
+			headers: {
+				"content-type": "application/json",
+				...this.config.apiKey ? { authorization: `Bearer ${this.config.apiKey}` } : {} // bearer header only when an apiKey is set
+			},
+			body: JSON.stringify({
+				image: Buffer.from(request.image).toString("base64"),
+				backImage: request.backImage?.length ? Buffer.from(request.backImage).toString("base64") : null, // missing or empty back side is sent as null
+				documentType: request.documentType ?? null,
+				country: request.country ?? null
+			})
+		});
+		if (!res.ok) throw new Error(`ExternalOcrDriver request failed with status ${res.status}`);
+		return await res.json(); // returned unvalidated; a non-JSON body makes this reject
+	}
+};
+//#endregion
+export { ExternalOcrDriver };
+//# sourceMappingURL=external.mjs.map

package/dist/drivers/external.mjs.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"external.mjs","names":[],"sources":["../../src/drivers/external.ts"],"sourcesContent":["import type { OcrResultData } from '@arkyc/types'\nimport type { OcrConfig, OcrDriver, OcrRequest } from '../types'\n\n/**\n * Generic HTTP OCR driver: POSTs the base64 image to a configured endpoint and\n * expects an {@link OcrResultData}-shaped JSON response.\n */\nexport class ExternalOcrDriver implements OcrDriver {\n readonly name = 'external'\n\n constructor(private readonly config: OcrConfig) {\n if (!config.endpoint) throw new Error('ExternalOcrDriver requires config.endpoint')\n }\n\n async extract(request: OcrRequest): Promise<OcrResultData> {\n const res = await fetch(this.config.endpoint as string, {\n method: 'POST',\n headers: {\n 'content-type': 'application/json',\n ...(this.config.apiKey ? { authorization: `Bearer ${this.config.apiKey}` } : {}),\n },\n body: JSON.stringify({\n image: Buffer.from(request.image).toString('base64'),\n backImage: request.backImage?.length ? Buffer.from(request.backImage).toString('base64') : null,\n documentType: request.documentType ?? null,\n country: request.country ?? null,\n }),\n })\n\n if (!res.ok) {\n throw new Error(`ExternalOcrDriver request failed with status ${res.status}`)\n }\n\n return (await res.json()) as OcrResultData\n }\n}\n"],"mappings":";;;;;AAOA,IAAa,oBAAb,MAAoD;CAGrB;CAF7B,OAAgB;CAEhB,YAAY,QAAoC;EAAnB,KAAA,SAAA;EAC3B,IAAI,CAAC,OAAO,UAAU,MAAM,IAAI,MAAM,4CAA4C;CACpF;CAEA,MAAM,QAAQ,SAA6C;EACzD,MAAM,MAAM,MAAM,MAAM,KAAK,OAAO,UAAoB;GACtD,QAAQ;GACR,SAAS;IACP,gBAAgB;IAChB,GAAI,KAAK,OAAO,SAAS,EAAE,eAAe,UAAU,KAAK,OAAO,SAAS,IAAI,CAAC;GAChF;GACA,MAAM,KAAK,UAAU;IACnB,OAAO,OAAO,KAAK,QAAQ,KAAK,CAAC,CAAC,SAAS,QAAQ;IACnD,WAAW,QAAQ,WAAW,SAAS,OAAO,KAAK,QAAQ,SAAS,CAAC,CAAC,SAAS,QAAQ,IAAI;IAC3F,cAAc,QAAQ,gBAAgB;IACtC,SAAS,QAAQ,WAAW;GAC9B,CAAC;EACH,CAAC;EAED,IAAI,CAAC,IAAI,IACP,MAAM,IAAI,MAAM,gDAAgD,IAAI,QAAQ;EAG9E,OAAQ,MAAM,IAAI,KAAK;CACzB;AACF"}

package/dist/drivers/mock.d.mts ADDED Viewed

@@ -0,0 +1,16 @@
+import { OcrDriver, OcrRequest } from "../types.mjs";
+import { OcrResultData } from "@arkyc/types";
+//#region src/drivers/mock.d.ts
+/**
+ * Deterministic OCR driver for development + tests. Returns fixed identity
+ * fields; `hints` steer the confidence and expiry so a caller can drive a
+ * session toward any decision.
+ *
+ * `hints.confidence` is clamped to [0, 1] and defaults to 0.92 when absent; a
+ * truthy `hints.expired` switches the returned expiry date from 2035-01-01 to
+ * 2000-01-01.
+ */
+declare class MockOcrDriver implements OcrDriver {
+  readonly name = "mock";
+  extract(request: OcrRequest): Promise<OcrResultData>;
+}
+//#endregion
+export { MockOcrDriver };
+//# sourceMappingURL=mock.d.mts.map

package/dist/drivers/mock.d.mts.map ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"version":3,"file":"mock.d.mts","names":[],"sources":["../../src/drivers/mock.ts"],"mappings":";;;;;;AAUA;;;cAAa,aAAA,YAAyB,SAAA;EAAA,SAC3B,IAAA;EAEH,OAAA,CAAQ,OAAA,EAAS,UAAA,GAAa,OAAA,CAAQ,aAAA;AAAA"}

package/dist/drivers/mock.mjs ADDED Viewed

@@ -0,0 +1,34 @@
+//#region src/drivers/mock.ts
+const clamp01 = (n) => Math.min(1, Math.max(0, n)); // clamp n into [0, 1]; a NaN input comes back as NaN
+/**
+* Deterministic OCR driver for development + tests. Returns fixed identity
+* fields; `hints` steer the confidence and expiry so a caller can drive a
+* session toward any decision.
+*
+* Inputs read from the request: `hints.confidence` (clamped to [0, 1], default
+* 0.92), `hints.expired` (truthy selects the 2000-01-01 expiry), `country`
+* (echoed as nationality, default "GB") and `documentType` (echoed in `raw`).
+* The image bytes are never inspected.
+*/
+var MockOcrDriver = class {
+	name = "mock";
+	async extract(request) {
+		const confidence = clamp01(request.hints?.confidence ?? .92);
+		return {
+			fields: {
+				firstName: "Ada",
+				lastName: "Lovelace",
+				fullName: "Ada Lovelace",
+				dateOfBirth: "1990-01-01",
+				documentNumber: "X1234567",
+				expiryDate: request.hints?.expired ? "2000-01-01" : "2035-01-01",
+				nationality: request.country ?? "GB"
+			},
+			confidence,
+			raw: {
+				provider: "mock",
+				confidence, // same clamped value as the top-level confidence
+				documentType: request.documentType ?? null
+			}
+		};
+	}
+};
+//#endregion
+export { MockOcrDriver };
+//# sourceMappingURL=mock.mjs.map

package/dist/drivers/mock.mjs.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"mock.mjs","names":[],"sources":["../../src/drivers/mock.ts"],"sourcesContent":["import type { OcrResultData } from '@arkyc/types'\nimport type { OcrDriver, OcrRequest } from '../types'\n\nconst clamp01 = (n: number): number => Math.min(1, Math.max(0, n))\n\n/**\n * Deterministic OCR driver for development + tests. Returns fixed identity\n * fields; `hints` steer the confidence and expiry so a caller can drive a\n * session toward any decision.\n */\nexport class MockOcrDriver implements OcrDriver {\n readonly name = 'mock'\n\n async extract(request: OcrRequest): Promise<OcrResultData> {\n const confidence = clamp01(request.hints?.confidence ?? 0.92)\n\n return {\n fields: {\n firstName: 'Ada',\n lastName: 'Lovelace',\n fullName: 'Ada Lovelace',\n dateOfBirth: '1990-01-01',\n documentNumber: 'X1234567',\n expiryDate: request.hints?.expired ? '2000-01-01' : '2035-01-01',\n nationality: request.country ?? 'GB',\n },\n confidence,\n raw: { provider: 'mock', confidence, documentType: request.documentType ?? null },\n }\n }\n}\n"],"mappings":";AAGA,MAAM,WAAW,MAAsB,KAAK,IAAI,GAAG,KAAK,IAAI,GAAG,CAAC,CAAC;;;;;;AAOjE,IAAa,gBAAb,MAAgD;CAC9C,OAAgB;CAEhB,MAAM,QAAQ,SAA6C;EACzD,MAAM,aAAa,QAAQ,QAAQ,OAAO,cAAc,GAAI;EAE5D,OAAO;GACL,QAAQ;IACN,WAAW;IACX,UAAU;IACV,UAAU;IACV,aAAa;IACb,gBAAgB;IAChB,YAAY,QAAQ,OAAO,UAAU,eAAe;IACpD,aAAa,QAAQ,WAAW;GAClC;GACA;GACA,KAAK;IAAE,UAAU;IAAQ;IAAY,cAAc,QAAQ,gBAAgB;GAAK;EAClF;CACF;AACF"}

package/dist/drivers/preprocess.d.mts ADDED Viewed

@@ -0,0 +1,51 @@
+//#region src/drivers/preprocess.d.ts
+/**
+ * Transforms raw image bytes to improve OCR legibility. Returns the (possibly
+ * unchanged) bytes — a preprocessor must never throw; on failure it returns the
+ * input so recognition still runs.
+ *
+ * The never-throw rule is a convention for implementers; the type itself only
+ * fixes the `Uint8Array` in / `Promise<Uint8Array>` out signature.
+ * {@link passthrough} is the identity implementation.
+ */
+type OcrPreprocessor = (image: Uint8Array) => Promise<Uint8Array>;
+/** Tunables for {@link buildSharpPreprocessor} and {@link defaultPreprocessor}. */
+interface PreprocessOptions {
+  /**
+   * Upscale images narrower than this (px) to give the engine more pixels per
+   * glyph — the single biggest win for the small OCR-B MRZ band. Default 1600.
+   * Images at or above this width are not resized.
+   */
+  minWidth?: number;
+}
+/**
+ * A no-op preprocessor: passes the original bytes through unchanged. It is also
+ * what {@link defaultPreprocessor} resolves to when `sharp` cannot be imported.
+ */
+declare const passthrough: OcrPreprocessor;
+/**
+ * Minimal structural type for the lazily-imported `sharp` module. Only the
+ * methods the preprocessing pipeline calls are declared, so `sharp`'s own types
+ * are not needed at compile time.
+ */
+interface SharpInstance {
+  metadata(): Promise<{
+    width?: number;
+    height?: number;
+  }>;
+  grayscale(): SharpInstance;
+  normalise(): SharpInstance;
+  median(size: number): SharpInstance;
+  sharpen(): SharpInstance;
+  resize(options: {
+    width: number;
+    withoutEnlargement: boolean;
+  }): SharpInstance;
+  png(): SharpInstance;
+  toBuffer(): Promise<Buffer>;
+}
+type SharpFactory = (input: Buffer) => SharpInstance; // shape of the module's default export as passed in by defaultPreprocessor
+/**
+ * Build a preprocessor over an injected `sharp` factory: grayscale → contrast
+ * normalise → light denoise → sharpen → upscale small images. This lifts faint
+ * document text and the OCR-B MRZ band without the aggressive binarisation that
+ * destroys detail under uneven lighting. Any failure falls back to the original.
+ *
+ * @param sharp Factory that wraps a `Buffer` in a sharp-like pipeline.
+ * @param options Tunables; `minWidth` defaults to 1600.
+ * @returns A preprocessor that yields PNG-encoded bytes on success, or the
+ * untouched input bytes if any step throws.
+ */
+declare function buildSharpPreprocessor(sharp: SharpFactory, options?: PreprocessOptions): OcrPreprocessor;
+/**
+ * The default preprocessor, backed by the optional `sharp` package and resolved
+ * once. If `sharp` isn't installed it degrades to {@link passthrough}, so OCR
+ * still runs on the raw bytes (just without the legibility boost).
+ *
+ * NOTE(review): the result is memoised at module level, so `options` are only
+ * honoured on the first call; later calls return the cached preprocessor even
+ * when they pass different options.
+ */
+declare function defaultPreprocessor(options?: PreprocessOptions): Promise<OcrPreprocessor>;
+//#endregion
+export { OcrPreprocessor, PreprocessOptions, buildSharpPreprocessor, defaultPreprocessor, passthrough };
+//# sourceMappingURL=preprocess.d.mts.map

package/dist/drivers/preprocess.d.mts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"preprocess.d.mts","names":[],"sources":["../../src/drivers/preprocess.ts"],"mappings":";;AAKA;;;;KAAY,eAAA,IAAmB,KAAA,EAAO,UAAA,KAAe,OAAA,CAAQ,UAAA;;UAG5C,iBAAA;EAH2C;;;;EAQ1D,QAAQ;AAAA;AAR6D;AAAA,cAY1D,WAAA,EAAa,eAAwC;;UAGxD,aAAA;EACR,QAAA,IAAY,OAAA;IAAU,KAAA;IAAgB,MAAA;EAAA;EACtC,SAAA,IAAa,aAAA;EACb,SAAA,IAAa,aAAA;EACb,MAAA,CAAO,IAAA,WAAe,aAAA;EACtB,OAAA,IAAW,aAAA;EACX,MAAA,CAAO,OAAA;IAAW,KAAA;IAAe,kBAAA;EAAA,IAAgC,aAAA;EACjE,GAAA,IAAO,aAAA;EACP,QAAA,IAAY,OAAA,CAAQ,MAAA;AAAA;AAAA,KAEjB,YAAA,IAAgB,KAAA,EAAO,MAAA,KAAW,aAAa;;;;;;;iBAQpC,sBAAA,CAAuB,KAAA,EAAO,YAAA,EAAc,OAAA,GAAS,iBAAA,GAAyB,eAAA;;;;;;iBA0BxE,mBAAA,CAAoB,OAAA,GAAS,iBAAA,GAAyB,OAAA,CAAQ,eAAA"}

package/dist/drivers/preprocess.mjs ADDED Viewed

@@ -0,0 +1,50 @@
+//#region src/drivers/preprocess.ts
+/** A no-op preprocessor: passes the original bytes through unchanged. */
+const passthrough = async (image) => image; // resolves to the same array instance, not a copy
+/**
+* Build a preprocessor over an injected `sharp` factory: grayscale → contrast
+* normalise → light denoise → sharpen → upscale small images. This lifts faint
+* document text and the OCR-B MRZ band without the aggressive binarisation that
+* destroys detail under uneven lighting. Any failure falls back to the original.
+*
+* `options.minWidth` (default 1600) is read once when the preprocessor is built.
+* The resize step is added only when the measured width is known and smaller
+* than `minWidth`. On success the bytes are always re-encoded as PNG.
+*
+* NOTE(review): confirm that `median(1)` actually denoises — a 1-pixel median
+* window may be a no-op in sharp.
+*/
+function buildSharpPreprocessor(sharp, options = {}) {
+	const minWidth = options.minWidth ?? 1600;
+	return async (image) => {
+		try {
+			const input = Buffer.from(image);
+			const { width } = await sharp(input).metadata(); // separate instance, used only to read the width
+			let pipe = sharp(input).grayscale().normalise().median(1).sharpen();
+			if (width && width < minWidth) pipe = pipe.resize({
+				width: minWidth,
+				withoutEnlargement: false
+			});
+			const out = await pipe.png().toBuffer();
+			return new Uint8Array(out);
+		} catch {
+			return image; // best effort: any failure hands back the untouched input
+		}
+	};
+}
+let cached; // module-level memo shared by every caller of defaultPreprocessor
+/**
+* The default preprocessor, backed by the optional `sharp` package and resolved
+* once. If `sharp` isn't installed it degrades to {@link passthrough}, so OCR
+* still runs on the raw bytes (just without the legibility boost).
+*
+* NOTE(review): because of the `cached` early return, `options` only take
+* effect on the first call; later calls with different options get the
+* preprocessor built from the first call's options. The passthrough fallback is
+* cached as well, so a failed import is not retried.
+*/
+async function defaultPreprocessor(options = {}) {
+	if (cached) return cached;
+	const moduleName = "sharp"; // held in a variable — presumably so bundlers do not resolve the optional dependency statically
+	try {
+		cached = buildSharpPreprocessor((await import(
+			/* @vite-ignore */
+			moduleName
+)).default, options);
+	} catch {
+		cached = passthrough;
+	}
+	return cached;
+}
+//#endregion
+export { buildSharpPreprocessor, defaultPreprocessor, passthrough };
+//# sourceMappingURL=preprocess.mjs.map

package/dist/drivers/preprocess.mjs.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"preprocess.mjs","names":[],"sources":["../../src/drivers/preprocess.ts"],"sourcesContent":["/**\n * Transforms raw image bytes to improve OCR legibility. Returns the (possibly\n * unchanged) bytes — a preprocessor must never throw; on failure it returns the\n * input so recognition still runs.\n */\nexport type OcrPreprocessor = (image: Uint8Array) => Promise<Uint8Array>\n\n/** Tunables for the default {@link sharpPreprocessor}. */\nexport interface PreprocessOptions {\n /**\n * Upscale images narrower than this (px) to give the engine more pixels per\n * glyph — the single biggest win for the small OCR-B MRZ band. Default 1600.\n */\n minWidth?: number\n}\n\n/** A no-op preprocessor: passes the original bytes through unchanged. */\nexport const passthrough: OcrPreprocessor = async (image) => image\n\n/** Minimal structural type for the lazily-imported `sharp` module. */\ninterface SharpInstance {\n metadata(): Promise<{ width?: number; height?: number }>\n grayscale(): SharpInstance\n normalise(): SharpInstance\n median(size: number): SharpInstance\n sharpen(): SharpInstance\n resize(options: { width: number; withoutEnlargement: boolean }): SharpInstance\n png(): SharpInstance\n toBuffer(): Promise<Buffer>\n}\ntype SharpFactory = (input: Buffer) => SharpInstance\n\n/**\n * Build a preprocessor over an injected `sharp` factory: grayscale → contrast\n * normalise → light denoise → sharpen → upscale small images. This lifts faint\n * document text and the OCR-B MRZ band without the aggressive binarisation that\n * destroys detail under uneven lighting. Any failure falls back to the original.\n */\nexport function buildSharpPreprocessor(sharp: SharpFactory, options: PreprocessOptions = {}): OcrPreprocessor {\n const minWidth = options.minWidth ?? 
1600\n return async (image) => {\n try {\n const input = Buffer.from(image)\n const base = sharp(input)\n const { width } = await base.metadata()\n let pipe = sharp(input).grayscale().normalise().median(1).sharpen()\n if (width && width < minWidth) {\n pipe = pipe.resize({ width: minWidth, withoutEnlargement: false })\n }\n const out = await pipe.png().toBuffer()\n return new Uint8Array(out)\n } catch {\n return image\n }\n }\n}\n\nlet cached: OcrPreprocessor | undefined\n\n/**\n * The default preprocessor, backed by the optional `sharp` package and resolved\n * once. If `sharp` isn't installed it degrades to {@link passthrough}, so OCR\n * still runs on the raw bytes (just without the legibility boost).\n */\nexport async function defaultPreprocessor(options: PreprocessOptions = {}): Promise<OcrPreprocessor> {\n if (cached) return cached\n const moduleName = 'sharp'\n try {\n const mod = (await import(/* @vite-ignore */ moduleName)) as unknown as { default: SharpFactory }\n cached = buildSharpPreprocessor(mod.default, options)\n } catch {\n cached = passthrough\n }\n return cached\n}\n"],"mappings":";;AAiBA,MAAa,cAA+B,OAAO,UAAU;;;;;;;AAqB7D,SAAgB,uBAAuB,OAAqB,UAA6B,CAAC,GAAoB;CAC5G,MAAM,WAAW,QAAQ,YAAY;CACrC,OAAO,OAAO,UAAU;EACtB,IAAI;GACF,MAAM,QAAQ,OAAO,KAAK,KAAK;GAE/B,MAAM,EAAE,UAAU,MADL,MAAM,KACQ,CAAC,CAAC,SAAS;GACtC,IAAI,OAAO,MAAM,KAAK,CAAC,CAAC,UAAU,CAAC,CAAC,UAAU,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ;GAClE,IAAI,SAAS,QAAQ,UACnB,OAAO,KAAK,OAAO;IAAE,OAAO;IAAU,oBAAoB;GAAM,CAAC;GAEnE,MAAM,MAAM,MAAM,KAAK,IAAI,CAAC,CAAC,SAAS;GACtC,OAAO,IAAI,WAAW,GAAG;EAC3B,QAAQ;GACN,OAAO;EACT;CACF;AACF;AAEA,IAAI;;;;;;AAOJ,eAAsB,oBAAoB,UAA6B,CAAC,GAA6B;CACnG,IAAI,QAAQ,OAAO;CACnB,MAAM,aAAa;CACnB,IAAI;EAEF,SAAS,wBAAuB,MADb;;GAA0B;GACT,SAAS,OAAO;CACtD,QAAQ;EACN,SAAS;CACX;CACA,OAAO;AACT"}

package/dist/drivers/tesseract.d.mts ADDED Viewed

@@ -0,0 +1,75 @@
+import { OcrDriver, OcrRequest } from "../types.mjs";
+import { DocumentParserRegistry } from "../parsers/registry.mjs";
+import { OcrPreprocessor } from "./preprocess.mjs";
+import { OcrResultData } from "@arkyc/types";
+//#region src/drivers/tesseract.d.ts
+/** Options for a single recognition pass. */
+interface RecognizeOptions {
+  /**
+   * Constrain recognition to the MRZ charset (`A`–`Z`, `0`–`9` and the `<`
+   * filler). This forces ambiguous OCR-B
+   * glyphs into the machine-readable zone's alphabet (e.g. `O`→`0`, `I`→`1`),
+   * dramatically improving the numeric MRZ lines (dates, check digits).
+   * The built-in recognizer also sets the page segmentation mode to `"6"` when
+   * this flag is on.
+   */
+  mrz?: boolean;
+}
+/**
+ * Reads text from an image; returns text + an engine confidence in [0, 100].
+ * The driver divides `confidence` by 100 and clamps it to [0, 1] before
+ * blending it with the parser's confidence.
+ */
+type TesseractRecognize = (image: Uint8Array, language: string, options?: RecognizeOptions) => Promise<{
+  text: string;
+  confidence: number;
+}>;
+interface TesseractOcrOptions {
+  /** Recognition language(s), default `eng`. */
+  language?: string;
+  /**
+   * Parser registry used to turn recognized text into fields. Defaults to the
+   * MRZ-backed registry; pass one with your country/type parsers registered.
+   */
+  registry?: DocumentParserRegistry;
+  /**
+   * Injectable recognizer (tests); defaults to a lazily-loaded `tesseract.js`.
+   * The default creates a new worker for every recognition call and terminates
+   * it afterwards.
+   */
+  recognize?: TesseractRecognize;
+  /**
+   * Injectable image preprocessor run on each side before recognition. Defaults
+   * to a lazily-loaded `sharp` pass (grayscale/normalise/upscale) that degrades
+   * to a no-op when `sharp` isn't installed. Pass `false` to disable it.
+   * Whichever preprocessor is chosen is resolved on first use and then reused.
+   */
+  preprocess?: OcrPreprocessor | false;
+}
+/**
+ * In-process OCR via Tesseract.js. Recognizes text from the document image, then
+ * runs it through the {@link DocumentParserRegistry} to extract structured fields.
+ * `tesseract.js` is imported lazily so it is only loaded when this driver runs.
+ */
+declare class TesseractOcrDriver implements OcrDriver {
+  readonly name = "tesseract";
+  private readonly language;
+  private readonly registry;
+  private readonly recognizeImpl?;
+  private readonly preprocessOption?;
+  private preprocessImpl?;
+  constructor(options?: TesseractOcrOptions);
+  extract(request: OcrRequest): Promise<OcrResultData>; // resolves with empty fields and confidence 0 when no side could be read
+  /** Parse every candidate text and return the best-ranked, highest-scoring result. */
+  private bestOf;
+  /** Parse one candidate text and attach its stage rank + blended confidence. */
+  private score;
+  /**
+   * Blend the parser's structural confidence with the engine's self-reported
+   * confidence — parser-dominant, because what the parser extracted matters more
+   * than how sure Tesseract felt about each glyph (it is pessimistic on the OCR-B
+   * MRZ font and busy document backgrounds). For the `mrz` stage the result is
+   * check-digit-verified ground truth, so the engine carries only 10% of the
+   * weight (90% parser). Other stages aren't self-verifying, so the engine's
+   * confidence carries more weight (60% parser / 40% engine).
+   *
+   * NOTE(review): both cases are weighted averages, so an engine confidence
+   * below the parser's still lowers an `mrz` score slightly; it is damped, not
+   * prevented.
+   */
+  private scoreConfidence;
+  /** Recognize text, or `null` if the engine can't read the image. */
+  private tryRecognize;
+  /** Run the configured preprocessor (resolved once), or pass bytes through. */
+  private preprocess;
+  /** Lazily load `tesseract.js` and adapt it to {@link TesseractRecognize}. */
+  private loadRecognizer;
+}
+//#endregion
+export { RecognizeOptions, TesseractOcrDriver, TesseractOcrOptions, TesseractRecognize };
+//# sourceMappingURL=tesseract.d.mts.map

package/dist/drivers/tesseract.d.mts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"tesseract.d.mts","names":[],"sources":["../../src/drivers/tesseract.ts"],"mappings":";;;;;;;UAeiB,gBAAA;EAAA;;;;AAMZ;EAAH,GAAG;AAAA;;KAIO,kBAAA,IACV,KAAA,EAAO,UAAA,EACP,QAAA,UACA,OAAA,GAAU,gBAAA,KACP,OAAA;EAAU,IAAA;EAAc,UAAA;AAAA;AAAA,UA0BZ,mBAAA;EA7BR;EA+BP,QAAA;EA9BA;;;;EAmCA,QAAA,GAAW,sBAAA;EAjCgB;EAmC3B,SAAA,GAAY,kBAAA;EAnCyB;AA0BvC;;;;EAeE,UAAA,GAAa,eAAA;AAAA;;;;;;cAQF,kBAAA,YAA8B,SAAA;EAAA,SAChC,IAAA;EAAA,iBACQ,QAAA;EAAA,iBACA,QAAA;EAAA,iBACA,aAAA;EAAA,iBACA,gBAAA;EAAA,QACT,cAAA;cAEI,OAAA,GAAS,mBAAA;EAOf,OAAA,CAAQ,OAAA,EAAS,UAAA,GAAa,OAAA,CAAQ,aAAA;EAPvB;EAAA,QA2Db,MAAA;EApDoC;EAAA,QAgEpC,KAAA;EA/EiC;;;;;;;;;EAAA,QA0GjC,eAAA;;UAOM,YAAA;EAzGF;EAAA,QAuHE,UAAA;EAhHS;EAAA,QAyHT,cAAA;AAAA"}

package/dist/drivers/tesseract.mjs ADDED Viewed

@@ -0,0 +1,175 @@
+import { createDocumentParserRegistry } from "../parsers/registry.mjs";
+import { defaultPreprocessor } from "./preprocess.mjs";
+//#region src/drivers/tesseract.ts
+const clamp01 = (n) => Math.min(1, Math.max(0, n)); // clamp n into [0, 1]; a NaN input comes back as NaN
+/**
+* MRZ (machine-readable zone) charset — uppercase letters, digits and the filler.
+* Passed as `tessedit_char_whitelist` on MRZ-constrained recognition passes.
+*/
+const MRZ_CHARSET = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789<";
+/**
+* Relative quality of each parse stage; higher wins when picking the best result.
+* Keys are matched against the `stage` the parser reports in its `raw` output; a
+* missing stage is looked up as "none" and an unlisted one ranks 0.
+*/
+const STAGE_RANK = {
+	mrz: 3,
+	custom: 2,
+	generic: 1,
+	none: 0
+};
+/**
+* In-process OCR via Tesseract.js. Recognizes text from the document image, then
+* runs it through the {@link DocumentParserRegistry} to extract structured fields.
+* `tesseract.js` is imported lazily so it is only loaded when this driver runs.
+*
+* `extract` works in up to two passes. Pass one recognizes each non-empty side
+* and parses every read (plus, with two reads, their concatenation). If the
+* best result ranks below the `mrz` stage, pass two re-recognizes each side
+* with the MRZ constraint and adopts that result only if it ranks higher, or
+* ties on rank with a higher score.
+*/
+var TesseractOcrDriver = class {
+	name = "tesseract";
+	language;
+	registry;
+	recognizeImpl; // caller-supplied recognizer, if any; otherwise tesseract.js is loaded on demand
+	preprocessOption; // as configured: a preprocessor, `false` to disable, or undefined for the default
+	preprocessImpl; // preprocessor actually in use, resolved on first call to preprocess()
+	constructor(options = {}) {
+		this.language = options.language ?? "eng";
+		this.registry = options.registry ?? createDocumentParserRegistry();
+		this.recognizeImpl = options.recognize;
+		this.preprocessOption = options.preprocess;
+	}
+	async extract(request) {
+		const sides = []; // non-empty images only, front first
+		if (request.image?.length) sides.push(request.image);
+		if (request.backImage?.length) sides.push(request.backImage);
+		const reads = [];
+		for (const image of sides) {
+			const read = await this.tryRecognize(image);
+			if (read) reads.push({
+				text: read.text,
+				engine: read.confidence
+			});
+		}
+		if (reads.length === 0) return {
+			fields: {},
+			confidence: 0,
+			raw: {
+				engine: "tesseract",
+				empty: true
+			}
+		};
+		const candidates = [...reads];
+		if (reads.length > 1) candidates.push({
+			text: reads.map((r) => r.text).join("\n"),
+			engine: avg(reads.map((r) => r.engine))
+		}); // extra candidate: both sides joined, with the mean engine confidence
+		let best = this.bestOf(candidates, request);
+		if (best.rank < STAGE_RANK.mrz) {
+			const mrzReads = [];
+			for (const image of sides) {
+				const read = await this.tryRecognize(image, { mrz: true });
+				if (read?.text.trim()) mrzReads.push({
+					text: read.text,
+					engine: read.confidence
+				}); // unlike the first pass, blank reads are dropped here
+			}
+			if (mrzReads.length) {
+				const mrzBest = this.bestOf(mrzReads, request);
+				if (mrzBest.rank > best.rank || mrzBest.rank === best.rank && mrzBest.score > best.score) best = mrzBest;
+			}
+		}
+		return {
+			fields: best.parsed.fields,
+			confidence: best.score,
+			raw: {
+				engine: "tesseract",
+				stage: best.stage,
+				text: best.text,
+				parser: best.parsed.raw
+			}
+		};
+	}
+	/**
+	* Parse every candidate text and return the best-ranked, highest-scoring result.
+	* Ties on both rank and score keep the earlier candidate. Returns `null` for an
+	* empty list; both call sites in `extract` pass at least one candidate.
+	*/
+	bestOf(candidates, request) {
+		let best = null;
+		for (const candidate of candidates) {
+			const scored = this.score(candidate, request);
+			if (!best || scored.rank > best.rank || scored.rank === best.rank && scored.score > best.score) best = scored;
+		}
+		return best;
+	}
+	/**
+	* Parse one candidate text and attach its stage rank + blended confidence.
+	* A parse that yields no fields scores 0 regardless of either confidence.
+	*/
+	score(candidate, request) {
+		const parsed = this.registry.parse({
+			text: candidate.text,
+			country: request.country,
+			documentType: request.documentType
+		});
+		const stage = parsed.raw?.stage;
+		const hasFields = Object.keys(parsed.fields).length > 0;
+		return {
+			parsed,
+			stage,
+			rank: STAGE_RANK[stage ?? "none"] ?? 0, // missing stage → "none"; unlisted stage → 0
+			score: hasFields ? this.scoreConfidence(parsed.confidence, candidate.engine / 100, stage) : 0, // engine confidence rescaled from [0, 100] to [0, 1]
+			text: candidate.text
+		};
+	}
+	/**
+	* Blend the parser's structural confidence with the engine's self-reported
+	* confidence — parser-dominant, because what the parser extracted matters more
+	* than how sure Tesseract felt about each glyph (it is pessimistic on the OCR-B
+	* MRZ font and busy document backgrounds). For the `mrz` stage the result is
+	* check-digit-verified ground truth, so the engine carries only 10% of the
+	* weight (90% parser). Other stages aren't self-verifying, so the engine's
+	* confidence carries more weight (60% parser / 40% engine).
+	*
+	* NOTE(review): both branches are weighted averages, so an engine confidence
+	* below the parser's still lowers an `mrz` score slightly (parser 1.0 with
+	* engine 0.3 gives 0.93); it is damped, not prevented.
+	*
+	* `engine` is clamped to [0, 1] first and the blended result is clamped again.
+	*/
+	scoreConfidence(parserConfidence, engine, stage) {
+		const e = clamp01(engine);
+		if (stage === "mrz") return clamp01(parserConfidence * .9 + e * .1);
+		return clamp01(parserConfidence * .6 + e * .4);
+	}
+	/**
+	* Recognize text, or `null` if the engine can't read the image.
+	*
+	* NOTE(review): the catch is unconditional, so preprocessing errors and the
+	* "requires the 'tesseract.js' package" error thrown by `loadRecognizer` are
+	* turned into `null` too. With the package missing and no injected
+	* recognizer, `extract` therefore reports an empty result and that install
+	* hint is never surfaced.
+	*/
+	async tryRecognize(image, options) {
+		try {
+			const prepared = await this.preprocess(image);
+			return await (this.recognizeImpl ?? await this.loadRecognizer())(prepared, this.language, options); // loadRecognizer runs on every call when no recognizer was injected
+		} catch {
+			return null;
+		}
+	}
+	/** Run the configured preprocessor (resolved once), or pass bytes through. */
+	async preprocess(image) {
+		if (this.preprocessOption === false) return image; // preprocessing explicitly disabled
+		if (!this.preprocessImpl) this.preprocessImpl = this.preprocessOption ?? await defaultPreprocessor();
+		return this.preprocessImpl(image);
+	}
+	/**
+	* Lazily load `tesseract.js` and adapt it to {@link TesseractRecognize}.
+	* Throws an Error with an install hint when the import fails. The returned
+	* recognizer creates a fresh worker per call and always terminates it.
+	*/
+	async loadRecognizer() {
+		const moduleName = "tesseract.js"; // held in a variable — presumably so bundlers do not resolve the optional dependency statically
+		let mod;
+		try {
+			mod = await import(
+				/* @vite-ignore */
+				moduleName
+);
+		} catch {
+			throw new Error("OCR driver 'tesseract' requires the 'tesseract.js' package. Install it with: pnpm add tesseract.js -F @arkyc/ocr");
+		}
+		return async (image, language, options) => {
+			const worker = await mod.createWorker(language);
+			try {
+				if (options?.mrz) await worker.setParameters({
+					tessedit_char_whitelist: MRZ_CHARSET,
+					tessedit_pageseg_mode: "6"
+				}); // "6" is presumably Tesseract's single-uniform-block mode — confirm against the Tesseract docs
+				const { data } = await worker.recognize(Buffer.from(image));
+				return {
+					text: data.text,
+					confidence: data.confidence
+				};
+			} finally {
+				await worker.terminate(); // released even when recognition throws
+			}
+		};
+	}
+};
+/**
+* Mean of a non-empty list of numbers. An empty list yields NaN (0 / 0); the
+* only caller in this file passes the engine confidences of two or more reads.
+*/
+function avg(values) {
+	return values.reduce((a, b) => a + b, 0) / values.length;
+}
+//#endregion
+export { TesseractOcrDriver };
+//# sourceMappingURL=tesseract.mjs.map

package/dist/drivers/tesseract.mjs.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"tesseract.mjs","names":[],"sources":["../../src/drivers/tesseract.ts"],"sourcesContent":["import type { OcrResultData } from '@arkyc/types'\nimport type { OcrDriver, OcrRequest } from '../types'\nimport { createDocumentParserRegistry, type DocumentParserRegistry } from '../parsers/registry'\nimport type { ParseOutput } from '../parsers/types'\nimport { defaultPreprocessor, type OcrPreprocessor } from './preprocess'\n\nconst clamp01 = (n: number): number => Math.min(1, Math.max(0, n))\n\n/** MRZ (machine-readable zone) charset — uppercase letters, digits and the filler. */\nconst MRZ_CHARSET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789<'\n\n/** Relative quality of each parse stage; higher wins when picking the best result. */\nconst STAGE_RANK: Record<string, number> = { mrz: 3, custom: 2, generic: 1, none: 0 }\n\n/** Options for a single recognition pass. */\nexport interface RecognizeOptions {\n /**\n * Constrain recognition to the {@link MRZ_CHARSET}. This forces ambiguous OCR-B\n * glyphs into the machine-readable zone's alphabet (e.g. `O`→`0`, `I`→`1`),\n * dramatically improving the numeric MRZ lines (dates, check digits).\n */\n mrz?: boolean\n}\n\n/** Reads text from an image; returns text + an engine confidence in [0, 100]. */\nexport type TesseractRecognize = (\n image: Uint8Array,\n language: string,\n options?: RecognizeOptions,\n) => Promise<{ text: string; confidence: number }>\n\n/** Minimal structural type for the lazily-imported `tesseract.js` module. */\ninterface TesseractModule {\n createWorker(language?: string): Promise<{\n setParameters(params: Record<string, string>): Promise<unknown>\n recognize(image: Buffer): Promise<{ data: { text: string; confidence: number } }>\n terminate(): Promise<void>\n }>\n}\n\n/** A candidate OCR text to parse, with the engine confidence that produced it. */\ninterface Candidate {\n text: string\n engine: number\n}\n\n/** A parse result with the metadata used to rank it against other candidates. 
*/\ninterface Scored {\n parsed: ParseOutput\n stage?: string\n rank: number\n score: number\n text: string\n}\n\nexport interface TesseractOcrOptions {\n /** Recognition language(s), default `eng`. */\n language?: string\n /**\n * Parser registry used to turn recognized text into fields. Defaults to the\n * MRZ-backed registry; pass one with your country/type parsers registered.\n */\n registry?: DocumentParserRegistry\n /** Injectable recognizer (tests); defaults to a lazily-loaded `tesseract.js`. */\n recognize?: TesseractRecognize\n /**\n * Injectable image preprocessor run on each side before recognition. Defaults\n * to a lazily-loaded `sharp` pass (grayscale/normalise/upscale) that degrades\n * to a no-op when `sharp` isn't installed. Pass `false` to disable it.\n */\n preprocess?: OcrPreprocessor | false\n}\n\n/**\n * In-process OCR via Tesseract.js. Recognizes text from the document image, then\n * runs it through the {@link DocumentParserRegistry} to extract structured fields.\n * `tesseract.js` is imported lazily so it is only loaded when this driver runs.\n */\nexport class TesseractOcrDriver implements OcrDriver {\n readonly name = 'tesseract'\n private readonly language: string\n private readonly registry: DocumentParserRegistry\n private readonly recognizeImpl?: TesseractRecognize\n private readonly preprocessOption?: OcrPreprocessor | false\n private preprocessImpl?: OcrPreprocessor\n\n constructor(options: TesseractOcrOptions = {}) {\n this.language = options.language ?? 'eng'\n this.registry = options.registry ?? 
createDocumentParserRegistry()\n this.recognizeImpl = options.recognize\n this.preprocessOption = options.preprocess\n }\n\n async extract(request: OcrRequest): Promise<OcrResultData> {\n // Read both sides — the MRZ may be on the front (passports) or the back\n // (TD1 ID cards, residence permits).\n const sides: Uint8Array[] = []\n if (request.image?.length) sides.push(request.image)\n if (request.backImage?.length) sides.push(request.backImage)\n\n const reads: Candidate[] = []\n for (const image of sides) {\n const read = await this.tryRecognize(image)\n if (read) reads.push({ text: read.text, engine: read.confidence })\n }\n\n if (reads.length === 0) {\n // Nothing readable (empty or unreadable images, or engine failure): return\n // empty so the decision engine routes on low confidence (manual review).\n return { fields: {}, confidence: 0, raw: { engine: 'tesseract', empty: true } }\n }\n\n // Parse each side ALONE, plus the combination. The MRZ lives on a single\n // side, and mixing both sides' text into one parse lets the other side's\n // stray long lines capture the MRZ's line slots — so a clean single side can\n // read where front+back together cannot. We keep the best-ranked result.\n const candidates: Candidate[] = [...reads]\n if (reads.length > 1) {\n candidates.push({ text: reads.map((r) => r.text).join('\\n'), engine: avg(reads.map((r) => r.engine)) })\n }\n let best = this.bestOf(candidates, request)\n\n // Legibility fallback: if nothing parsed as an MRZ, retry each side\n // constrained to the OCR-B charset (forces O→0 / I→1 on the numeric lines),\n // which often rescues an MRZ the unconstrained pass mangled.\n if (best.rank < STAGE_RANK.mrz!) 
{\n const mrzReads: Candidate[] = []\n for (const image of sides) {\n const read = await this.tryRecognize(image, { mrz: true })\n if (read?.text.trim()) mrzReads.push({ text: read.text, engine: read.confidence })\n }\n if (mrzReads.length) {\n const mrzBest = this.bestOf(mrzReads, request)\n if (mrzBest.rank > best.rank || (mrzBest.rank === best.rank && mrzBest.score > best.score)) best = mrzBest\n }\n }\n\n return {\n fields: best.parsed.fields,\n confidence: best.score,\n raw: { engine: 'tesseract', stage: best.stage, text: best.text, parser: best.parsed.raw },\n }\n }\n\n /** Parse every candidate text and return the best-ranked, highest-scoring result. */\n private bestOf(candidates: Candidate[], request: OcrRequest): Scored {\n let best: Scored | null = null\n for (const candidate of candidates) {\n const scored = this.score(candidate, request)\n if (!best || scored.rank > best.rank || (scored.rank === best.rank && scored.score > best.score)) {\n best = scored\n }\n }\n return best!\n }\n\n /** Parse one candidate text and attach its stage rank + blended confidence. */\n private score(candidate: Candidate, request: OcrRequest): Scored {\n const parsed = this.registry.parse({\n text: candidate.text,\n country: request.country,\n documentType: request.documentType,\n })\n const stage = (parsed.raw as { stage?: string } | undefined)?.stage\n const hasFields = Object.keys(parsed.fields).length > 0\n return {\n parsed,\n stage,\n rank: STAGE_RANK[stage ?? 'none'] ?? 0,\n // A result with no extracted fields carries no confidence, whatever the engine felt.\n score: hasFields ? 
this.scoreConfidence(parsed.confidence, candidate.engine / 100, stage) : 0,\n text: candidate.text,\n }\n }\n\n /**\n * Blend the parser's structural confidence with the engine's self-reported\n * confidence — parser-dominant, because what the parser extracted matters more\n * than how sure Tesseract felt about each glyph (it is pessimistic on the OCR-B\n * MRZ font and busy document backgrounds). For the `mrz` stage the result is\n * check-digit-verified ground truth, so the engine only lifts the score and can\n * never drag a verified read down. Other stages aren't self-verifying, so the\n * engine's confidence carries more weight.\n */\n private scoreConfidence(parserConfidence: number, engine: number, stage?: string): number {\n const e = clamp01(engine)\n if (stage === 'mrz') return clamp01(parserConfidence * 0.9 + e * 0.1)\n return clamp01(parserConfidence * 0.6 + e * 0.4)\n }\n\n /** Recognize text, or `null` if the engine can't read the image. */\n private async tryRecognize(\n image: Uint8Array,\n options?: RecognizeOptions,\n ): Promise<{ text: string; confidence: number } | null> {\n try {\n const prepared = await this.preprocess(image)\n const recognize = this.recognizeImpl ?? (await this.loadRecognizer())\n return await recognize(prepared, this.language, options)\n } catch {\n return null\n }\n }\n\n /** Run the configured preprocessor (resolved once), or pass bytes through. */\n private async preprocess(image: Uint8Array): Promise<Uint8Array> {\n if (this.preprocessOption === false) return image\n if (!this.preprocessImpl) {\n this.preprocessImpl = this.preprocessOption ?? (await defaultPreprocessor())\n }\n return this.preprocessImpl(image)\n }\n\n /** Lazily load `tesseract.js` and adapt it to {@link TesseractRecognize}. 
*/\n private async loadRecognizer(): Promise<TesseractRecognize> {\n const moduleName = 'tesseract.js'\n let mod: TesseractModule\n try {\n mod = (await import(/* @vite-ignore */ moduleName)) as unknown as TesseractModule\n } catch {\n throw new Error(\n \"OCR driver 'tesseract' requires the 'tesseract.js' package. Install it with: pnpm add tesseract.js -F @arkyc/ocr\",\n )\n }\n return async (image, language, options) => {\n const worker = await mod.createWorker(language)\n try {\n if (options?.mrz) {\n await worker.setParameters({\n tessedit_char_whitelist: MRZ_CHARSET,\n // Treat the input as a single uniform block (the MRZ band's fixed lines).\n tessedit_pageseg_mode: '6',\n })\n }\n const { data } = await worker.recognize(Buffer.from(image))\n return { text: data.text, confidence: data.confidence }\n } finally {\n await worker.terminate()\n }\n }\n }\n}\n\n/** Mean of a non-empty list of numbers. */\nfunction avg(values: number[]): number {\n return values.reduce((a, b) => a + b, 0) / 
values.length\n}\n"],"mappings":";;;AAMA,MAAM,WAAW,MAAsB,KAAK,IAAI,GAAG,KAAK,IAAI,GAAG,CAAC,CAAC;;AAGjE,MAAM,cAAc;;AAGpB,MAAM,aAAqC;CAAE,KAAK;CAAG,QAAQ;CAAG,SAAS;CAAG,MAAM;AAAE;;;;;;AAkEpF,IAAa,qBAAb,MAAqD;CACnD,OAAgB;CAChB;CACA;CACA;CACA;CACA;CAEA,YAAY,UAA+B,CAAC,GAAG;EAC7C,KAAK,WAAW,QAAQ,YAAY;EACpC,KAAK,WAAW,QAAQ,YAAY,6BAA6B;EACjE,KAAK,gBAAgB,QAAQ;EAC7B,KAAK,mBAAmB,QAAQ;CAClC;CAEA,MAAM,QAAQ,SAA6C;EAGzD,MAAM,QAAsB,CAAC;EAC7B,IAAI,QAAQ,OAAO,QAAQ,MAAM,KAAK,QAAQ,KAAK;EACnD,IAAI,QAAQ,WAAW,QAAQ,MAAM,KAAK,QAAQ,SAAS;EAE3D,MAAM,QAAqB,CAAC;EAC5B,KAAK,MAAM,SAAS,OAAO;GACzB,MAAM,OAAO,MAAM,KAAK,aAAa,KAAK;GAC1C,IAAI,MAAM,MAAM,KAAK;IAAE,MAAM,KAAK;IAAM,QAAQ,KAAK;GAAW,CAAC;EACnE;EAEA,IAAI,MAAM,WAAW,GAGnB,OAAO;GAAE,QAAQ,CAAC;GAAG,YAAY;GAAG,KAAK;IAAE,QAAQ;IAAa,OAAO;GAAK;EAAE;EAOhF,MAAM,aAA0B,CAAC,GAAG,KAAK;EACzC,IAAI,MAAM,SAAS,GACjB,WAAW,KAAK;GAAE,MAAM,MAAM,KAAK,MAAM,EAAE,IAAI,CAAC,CAAC,KAAK,IAAI;GAAG,QAAQ,IAAI,MAAM,KAAK,MAAM,EAAE,MAAM,CAAC;EAAE,CAAC;EAExG,IAAI,OAAO,KAAK,OAAO,YAAY,OAAO;EAK1C,IAAI,KAAK,OAAO,WAAW,KAAM;GAC/B,MAAM,WAAwB,CAAC;GAC/B,KAAK,MAAM,SAAS,OAAO;IACzB,MAAM,OAAO,MAAM,KAAK,aAAa,OAAO,EAAE,KAAK,KAAK,CAAC;IACzD,IAAI,MAAM,KAAK,KAAK,GAAG,SAAS,KAAK;KAAE,MAAM,KAAK;KAAM,QAAQ,KAAK;IAAW,CAAC;GACnF;GACA,IAAI,SAAS,QAAQ;IACnB,MAAM,UAAU,KAAK,OAAO,UAAU,OAAO;IAC7C,IAAI,QAAQ,OAAO,KAAK,QAAS,QAAQ,SAAS,KAAK,QAAQ,QAAQ,QAAQ,KAAK,OAAQ,OAAO;GACrG;EACF;EAEA,OAAO;GACL,QAAQ,KAAK,OAAO;GACpB,YAAY,KAAK;GACjB,KAAK;IAAE,QAAQ;IAAa,OAAO,KAAK;IAAO,MAAM,KAAK;IAAM,QAAQ,KAAK,OAAO;GAAI;EAC1F;CACF;;CAGA,OAAe,YAAyB,SAA6B;EACnE,IAAI,OAAsB;EAC1B,KAAK,MAAM,aAAa,YAAY;GAClC,MAAM,SAAS,KAAK,MAAM,WAAW,OAAO;GAC5C,IAAI,CAAC,QAAQ,OAAO,OAAO,KAAK,QAAS,OAAO,SAAS,KAAK,QAAQ,OAAO,QAAQ,KAAK,OACxF,OAAO;EAEX;EACA,OAAO;CACT;;CAGA,MAAc,WAAsB,SAA6B;EAC/D,MAAM,SAAS,KAAK,SAAS,MAAM;GACjC,MAAM,UAAU;GAChB,SAAS,QAAQ;GACjB,cAAc,QAAQ;EACxB,CAAC;EACD,MAAM,QAAS,OAAO,KAAwC;EAC9D,MAAM,YAAY,OAAO,KAAK,OAAO,MAAM,CAAC,CAAC,SAAS;EACtD,OAAO;GACL;GACA;GACA,MAAM,WAAW,SAAS,WAAW;GAErC,OAAO,YAAY,KAAK,gBAAgB,OAAO,YAAY,UAAU,SAAS,KAAK,KAAK,IAAI;GAC5F,MAAM,
UAAU;EAClB;CACF;;;;;;;;;;CAWA,gBAAwB,kBAA0B,QAAgB,OAAwB;EACxF,MAAM,IAAI,QAAQ,MAAM;EACxB,IAAI,UAAU,OAAO,OAAO,QAAQ,mBAAmB,KAAM,IAAI,EAAG;EACpE,OAAO,QAAQ,mBAAmB,KAAM,IAAI,EAAG;CACjD;;CAGA,MAAc,aACZ,OACA,SACsD;EACtD,IAAI;GACF,MAAM,WAAW,MAAM,KAAK,WAAW,KAAK;GAE5C,OAAO,OADW,KAAK,iBAAkB,MAAM,KAAK,eAAe,EAAA,CAC5C,UAAU,KAAK,UAAU,OAAO;EACzD,QAAQ;GACN,OAAO;EACT;CACF;;CAGA,MAAc,WAAW,OAAwC;EAC/D,IAAI,KAAK,qBAAqB,OAAO,OAAO;EAC5C,IAAI,CAAC,KAAK,gBACR,KAAK,iBAAiB,KAAK,oBAAqB,MAAM,oBAAoB;EAE5E,OAAO,KAAK,eAAe,KAAK;CAClC;;CAGA,MAAc,iBAA8C;EAC1D,MAAM,aAAa;EACnB,IAAI;EACJ,IAAI;GACF,MAAO,MAAM;;IAA0B;;EACzC,QAAQ;GACN,MAAM,IAAI,MACR,kHACF;EACF;EACA,OAAO,OAAO,OAAO,UAAU,YAAY;GACzC,MAAM,SAAS,MAAM,IAAI,aAAa,QAAQ;GAC9C,IAAI;IACF,IAAI,SAAS,KACX,MAAM,OAAO,cAAc;KACzB,yBAAyB;KAEzB,uBAAuB;IACzB,CAAC;IAEH,MAAM,EAAE,SAAS,MAAM,OAAO,UAAU,OAAO,KAAK,KAAK,CAAC;IAC1D,OAAO;KAAE,MAAM,KAAK;KAAM,YAAY,KAAK;IAAW;GACxD,UAAU;IACR,MAAM,OAAO,UAAU;GACzB;EACF;CACF;AACF;;AAGA,SAAS,IAAI,QAA0B;CACrC,OAAO,OAAO,QAAQ,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,OAAO;AACpD"}

package/dist/index.d.mts ADDED Viewed

@@ -0,0 +1,12 @@
+import { OcrConfig, OcrDriver, OcrDriverName, OcrRequest } from "./types.mjs";
+import { OcrDriverFactory } from "./registry.mjs";
+import { MockOcrDriver } from "./drivers/mock.mjs";
+import { ExternalOcrDriver } from "./drivers/external.mjs";
+import { DocumentParser, ParseInput, ParseOutput } from "./parsers/types.mjs";
+import { DocumentParserRegistry, ParseStage, createDocumentParserRegistry } from "./parsers/registry.mjs";
+import { OcrPreprocessor, PreprocessOptions, buildSharpPreprocessor, defaultPreprocessor, passthrough } from "./drivers/preprocess.mjs";
+import { RecognizeOptions, TesseractOcrDriver, TesseractOcrOptions, TesseractRecognize } from "./drivers/tesseract.mjs";
+import { AiExtraction, AiVisionExtract, AnthropicOcrDriver, AnthropicOcrOptions, DEFAULT_AI_MODEL, anthropicVision, applyAuthenticity, scoreConfidence } from "./drivers/ai.mjs";
+import { mrzParser } from "./parsers/mrz.mjs";
+import { genericTextParser } from "./parsers/generic.mjs";
+export { type AiExtraction, type AiVisionExtract, AnthropicOcrDriver, type AnthropicOcrOptions, DEFAULT_AI_MODEL, type DocumentParser, DocumentParserRegistry, ExternalOcrDriver, MockOcrDriver, OcrConfig, OcrDriver, OcrDriverFactory, OcrDriverName, type OcrPreprocessor, OcrRequest, type ParseInput, type ParseOutput, type ParseStage, type PreprocessOptions, type RecognizeOptions, TesseractOcrDriver, type TesseractOcrOptions, type TesseractRecognize, anthropicVision, applyAuthenticity, buildSharpPreprocessor, createDocumentParserRegistry, defaultPreprocessor, genericTextParser, mrzParser, passthrough, scoreConfidence };

package/dist/index.mjs ADDED Viewed

@@ -0,0 +1,10 @@
+import { AnthropicOcrDriver, DEFAULT_AI_MODEL, anthropicVision, applyAuthenticity, scoreConfidence } from "./drivers/ai.mjs";
+import { ExternalOcrDriver } from "./drivers/external.mjs";
+import { MockOcrDriver } from "./drivers/mock.mjs";
+import { mrzParser } from "./parsers/mrz.mjs";
+import { genericTextParser } from "./parsers/generic.mjs";
+import { DocumentParserRegistry, createDocumentParserRegistry } from "./parsers/registry.mjs";
+import { buildSharpPreprocessor, defaultPreprocessor, passthrough } from "./drivers/preprocess.mjs";
+import { TesseractOcrDriver } from "./drivers/tesseract.mjs";
+import { OcrDriverFactory } from "./registry.mjs";
+export { AnthropicOcrDriver, DEFAULT_AI_MODEL, DocumentParserRegistry, ExternalOcrDriver, MockOcrDriver, OcrDriverFactory, TesseractOcrDriver, anthropicVision, applyAuthenticity, buildSharpPreprocessor, createDocumentParserRegistry, defaultPreprocessor, genericTextParser, mrzParser, passthrough, scoreConfidence };

package/dist/parsers/generic.d.mts ADDED Viewed

@@ -0,0 +1,8 @@
+import { DocumentParser } from "./types.mjs";
+//#region src/parsers/generic.d.ts
+/**
+* The generic best-effort text parser.
+*
+* @returns A {@link DocumentParser}; the runtime implementation is exported
+* from the accompanying `generic.mjs`.
+*/
+declare function genericTextParser(): DocumentParser;
+//#endregion
+export { genericTextParser };
+//# sourceMappingURL=generic.d.mts.map

package/dist/parsers/generic.d.mts.map ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"version":3,"file":"generic.d.mts","names":[],"sources":["../../src/parsers/generic.ts"],"mappings":";;;;iBA0FgB,iBAAA,IAAqB,cAAc"}

package/dist/parsers/generic.mjs ADDED Viewed

@@ -0,0 +1,84 @@
+//#region src/parsers/generic.ts
+/**
+* A best-effort parser for documents without (or with an unreadable) MRZ. It
+* scrapes dates and a document-number-like token from the raw OCR text via
+* regex. Low confidence by design — it's the last resort when the MRZ parser
+* can't read a machine-readable zone (most ID/licence fronts).
+*/
+const MONTHS = {
+	jan: 1,
+	feb: 2,
+	mar: 3,
+	apr: 4,
+	may: 5,
+	jun: 6,
+	jul: 7,
+	aug: 8,
+	sep: 9,
+	oct: 10,
+	nov: 11,
+	dec: 12
+};
+const pad2 = (n) => n < 10 ? `0${n}` : `${n}`;
+function isoDate(y, m, d) {
+	if (m < 1 || m > 12 || d < 1 || d > 31 || y < 1900 || y > 2100) return null;
+	return `${y}-${pad2(m)}-${pad2(d)}`;
+}
+/** Extract plausible calendar dates from text, ascending and de-duplicated. */
+function extractDates(text) {
+	const found = /* @__PURE__ */ new Set();
+	for (const m of text.matchAll(/\b(\d{4})[-/.](\d{1,2})[-/.](\d{1,2})\b/g)) {
+		const iso = isoDate(Number(m[1]), Number(m[2]), Number(m[3]));
+		if (iso) found.add(iso);
+	}
+	for (const m of text.matchAll(/\b(\d{1,2})[-/.](\d{1,2})[-/.](\d{4})\b/g)) {
+		const a = Number(m[1]);
+		const b = Number(m[2]);
+		const y = Number(m[3]);
+		const iso = b > 12 ? isoDate(y, a, b) : isoDate(y, b, a);
+		if (iso) found.add(iso);
+	}
+	for (const m of text.matchAll(/\b(\d{1,2})[-\s]([A-Za-z]{3,})[-\s](\d{4})\b/g)) {
+		const month = MONTHS[m[2].slice(0, 3).toLowerCase()];
+		if (month) {
+			const iso = isoDate(Number(m[3]), month, Number(m[1]));
+			if (iso) found.add(iso);
+		}
+	}
+	return [...found].sort();
+}
+/** Pick a document-number-like token: 6–12 chars, has a digit, prefer alphanumeric. */
+function extractDocumentNumber(text) {
+	const tokens = (text.toUpperCase().match(/\b[A-Z0-9]{6,12}\b/g) ?? []).filter((t) => /[0-9]/.test(t) && !/^(19|20)\d{2}$/.test(t));
+	return tokens.find((t) => /[A-Z]/.test(t)) ?? tokens[0];
+}
+var GenericTextParser = class {
+	name = "generic";
+	parse(input) {
+		const text = input.lines ? input.lines.join("\n") : input.text;
+		const dates = extractDates(text);
+		const documentNumber = extractDocumentNumber(text);
+		if (dates.length === 0 && !documentNumber) return null;
+		const fields = {};
+		if (documentNumber) fields.documentNumber = documentNumber;
+		if (dates.length >= 1) fields.dateOfBirth = dates[0];
+		if (dates.length >= 2) fields.expiryDate = dates[dates.length - 1];
+		return {
+			fields,
+			confidence: .35,
+			raw: {
+				format: "generic",
+				dates,
+				documentNumber
+			}
+		};
+	}
+};
+/** The generic best-effort text parser. */
+function genericTextParser() {
+	return new GenericTextParser();
+}
+//#endregion
+export { genericTextParser };
+//# sourceMappingURL=generic.mjs.map

package/dist/parsers/generic.mjs.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"generic.mjs","names":[],"sources":["../../src/parsers/generic.ts"],"sourcesContent":["import type { IsoDate, OcrFields } from '@arkyc/types'\nimport type { DocumentParser, ParseInput, ParseOutput } from './types'\n\n/**\n * A best-effort parser for documents without (or with an unreadable) MRZ. It\n * scrapes dates and a document-number-like token from the raw OCR text via\n * regex. Low confidence by design — it's the last resort when the MRZ parser\n * can't read a machine-readable zone (most ID/licence fronts).\n */\n\nconst MONTHS: Record<string, number> = {\n jan: 1,\n feb: 2,\n mar: 3,\n apr: 4,\n may: 5,\n jun: 6,\n jul: 7,\n aug: 8,\n sep: 9,\n oct: 10,\n nov: 11,\n dec: 12,\n}\n\nconst pad2 = (n: number): string => (n < 10 ? `0${n}` : `${n}`)\n\nfunction isoDate(y: number, m: number, d: number): IsoDate | null {\n if (m < 1 || m > 12 || d < 1 || d > 31 || y < 1900 || y > 2100) return null\n return `${y}-${pad2(m)}-${pad2(d)}` as IsoDate\n}\n\n/** Extract plausible calendar dates from text, ascending and de-duplicated. */\nfunction extractDates(text: string): IsoDate[] {\n const found = new Set<IsoDate>()\n\n // ISO-ish: YYYY-MM-DD / YYYY.MM.DD / YYYY/MM/DD\n for (const m of text.matchAll(/\\b(\\d{4})[-/.](\\d{1,2})[-/.](\\d{1,2})\\b/g)) {\n const iso = isoDate(Number(m[1]), Number(m[2]), Number(m[3]))\n if (iso) found.add(iso)\n }\n // Day/Month first: DD-MM-YYYY / DD.MM.YYYY / DD/MM/YYYY (US MM/DD/YYYY when month>12).\n for (const m of text.matchAll(/\\b(\\d{1,2})[-/.](\\d{1,2})[-/.](\\d{4})\\b/g)) {\n const a = Number(m[1])\n const b = Number(m[2])\n const y = Number(m[3])\n const iso = b > 12 ? 
isoDate(y, a, b) : isoDate(y, b, a)\n if (iso) found.add(iso)\n }\n // DD MON YYYY: \"12 JAN 2020\", \"12-Jan-2020\"\n for (const m of text.matchAll(/\\b(\\d{1,2})[-\\s]([A-Za-z]{3,})[-\\s](\\d{4})\\b/g)) {\n const month = MONTHS[m[2]!.slice(0, 3).toLowerCase()]\n if (month) {\n const iso = isoDate(Number(m[3]), month, Number(m[1]))\n if (iso) found.add(iso)\n }\n }\n\n return [...found].sort()\n}\n\n/** Pick a document-number-like token: 6–12 chars, has a digit, prefer alphanumeric. */\nfunction extractDocumentNumber(text: string): string | undefined {\n const tokens = (text.toUpperCase().match(/\\b[A-Z0-9]{6,12}\\b/g) ?? []).filter(\n (t) => /[0-9]/.test(t) && !/^(19|20)\\d{2}$/.test(t),\n )\n return tokens.find((t) => /[A-Z]/.test(t)) ?? tokens[0]\n}\n\nclass GenericTextParser implements DocumentParser {\n readonly name = 'generic'\n\n parse(input: ParseInput): ParseOutput | null {\n const text = input.lines ? input.lines.join('\\n') : input.text\n const dates = extractDates(text)\n const documentNumber = extractDocumentNumber(text)\n\n if (dates.length === 0 && !documentNumber) return null\n\n const fields: OcrFields = {}\n if (documentNumber) fields.documentNumber = documentNumber\n // Earliest date is most likely the date of birth; latest the expiry.\n if (dates.length >= 1) fields.dateOfBirth = dates[0]\n if (dates.length >= 2) fields.expiryDate = dates[dates.length - 1]\n\n return { fields, confidence: 0.35, raw: { format: 'generic', dates, documentNumber } }\n }\n}\n\n/** The generic best-effort text parser. 
*/\nexport function genericTextParser(): DocumentParser {\n return new GenericTextParser()\n}\n"],"mappings":";;;;;;;AAUA,MAAM,SAAiC;CACrC,KAAK;CACL,KAAK;CACL,KAAK;CACL,KAAK;CACL,KAAK;CACL,KAAK;CACL,KAAK;CACL,KAAK;CACL,KAAK;CACL,KAAK;CACL,KAAK;CACL,KAAK;AACP;AAEA,MAAM,QAAQ,MAAuB,IAAI,KAAK,IAAI,MAAM,GAAG;AAE3D,SAAS,QAAQ,GAAW,GAAW,GAA2B;CAChE,IAAI,IAAI,KAAK,IAAI,MAAM,IAAI,KAAK,IAAI,MAAM,IAAI,QAAQ,IAAI,MAAM,OAAO;CACvE,OAAO,GAAG,EAAE,GAAG,KAAK,CAAC,EAAE,GAAG,KAAK,CAAC;AAClC;;AAGA,SAAS,aAAa,MAAyB;CAC7C,MAAM,wBAAQ,IAAI,IAAa;CAG/B,KAAK,MAAM,KAAK,KAAK,SAAS,0CAA0C,GAAG;EACzE,MAAM,MAAM,QAAQ,OAAO,EAAE,EAAE,GAAG,OAAO,EAAE,EAAE,GAAG,OAAO,EAAE,EAAE,CAAC;EAC5D,IAAI,KAAK,MAAM,IAAI,GAAG;CACxB;CAEA,KAAK,MAAM,KAAK,KAAK,SAAS,0CAA0C,GAAG;EACzE,MAAM,IAAI,OAAO,EAAE,EAAE;EACrB,MAAM,IAAI,OAAO,EAAE,EAAE;EACrB,MAAM,IAAI,OAAO,EAAE,EAAE;EACrB,MAAM,MAAM,IAAI,KAAK,QAAQ,GAAG,GAAG,CAAC,IAAI,QAAQ,GAAG,GAAG,CAAC;EACvD,IAAI,KAAK,MAAM,IAAI,GAAG;CACxB;CAEA,KAAK,MAAM,KAAK,KAAK,SAAS,+CAA+C,GAAG;EAC9E,MAAM,QAAQ,OAAO,EAAE,EAAE,CAAE,MAAM,GAAG,CAAC,CAAC,CAAC,YAAY;EACnD,IAAI,OAAO;GACT,MAAM,MAAM,QAAQ,OAAO,EAAE,EAAE,GAAG,OAAO,OAAO,EAAE,EAAE,CAAC;GACrD,IAAI,KAAK,MAAM,IAAI,GAAG;EACxB;CACF;CAEA,OAAO,CAAC,GAAG,KAAK,CAAC,CAAC,KAAK;AACzB;;AAGA,SAAS,sBAAsB,MAAkC;CAC/D,MAAM,UAAU,KAAK,YAAY,CAAC,CAAC,MAAM,qBAAqB,KAAK,CAAC,EAAA,CAAG,QACpE,MAAM,QAAQ,KAAK,CAAC,KAAK,CAAC,iBAAiB,KAAK,CAAC,CACpD;CACA,OAAO,OAAO,MAAM,MAAM,QAAQ,KAAK,CAAC,CAAC,KAAK,OAAO;AACvD;AAEA,IAAM,oBAAN,MAAkD;CAChD,OAAgB;CAEhB,MAAM,OAAuC;EAC3C,MAAM,OAAO,MAAM,QAAQ,MAAM,MAAM,KAAK,IAAI,IAAI,MAAM;EAC1D,MAAM,QAAQ,aAAa,IAAI;EAC/B,MAAM,iBAAiB,sBAAsB,IAAI;EAEjD,IAAI,MAAM,WAAW,KAAK,CAAC,gBAAgB,OAAO;EAElD,MAAM,SAAoB,CAAC;EAC3B,IAAI,gBAAgB,OAAO,iBAAiB;EAE5C,IAAI,MAAM,UAAU,GAAG,OAAO,cAAc,MAAM;EAClD,IAAI,MAAM,UAAU,GAAG,OAAO,aAAa,MAAM,MAAM,SAAS;EAEhE,OAAO;GAAE;GAAQ,YAAY;GAAM,KAAK;IAAE,QAAQ;IAAW;IAAO;GAAe;EAAE;CACvF;AACF;;AAGA,SAAgB,oBAAoC;CAClD,OAAO,IAAI,kBAAkB;AAC/B"}

package/dist/parsers/mrz.d.mts ADDED Viewed

@@ -0,0 +1,8 @@
+import { DocumentParser } from "./types.mjs";
+//#region src/parsers/mrz.d.ts
+/**
+* The MRZ parser — a country-agnostic default for machine-readable documents.
+*
+* @returns A {@link DocumentParser}; per the package file list the runtime
+* implementation ships in `dist/parsers/mrz.mjs`.
+*/
+declare function mrzParser(): DocumentParser;
+//#endregion
+export { mrzParser };
+//# sourceMappingURL=mrz.d.mts.map