@albex/ocr 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,146 @@
1
+ /**
2
+ * Wrapper around Tesseract.js.
3
+ *
4
+ * Responsibilities:
5
+ * 1. Lazy-load the Tesseract.js library on first use (dynamic import).
6
+ * 2. Maintain a per-language `Worker` instance, created on demand.
7
+ * 3. Auto-terminate idle workers after a configurable timeout to release
8
+ * the LSTM model from memory (each Tesseract worker holds 2-5 MB).
9
+ * 4. Expose a Promise-based `recognize(image, lang)` that returns text +
10
+ * confidence in a stable shape.
11
+ *
12
+ * Why one worker per language: Tesseract.js workers are tied to the language
13
+ * model they were initialised with. Switching languages on the same worker
14
+ * triggers a slow reload of the LSTM. Maintaining N workers — one per
15
+ * language ever used — keeps each recognise call fast at the cost of slightly
16
+ * more memory, which the idle eviction then claws back.
17
+ */
18
+ import { AlbexOcrInitError, AlbexOcrLanguageError, AlbexOcrRecognitionError, } from './errors.js';
19
+ /**
20
+ * Single global Tesseract module reference. The dynamic import is shared
21
+ * across every OCR call once it resolves.
22
+ */
23
+ let tesseractPromise = null;
24
/**
 * Resolve the shared `tesseract.js` module, importing it on first use.
 *
 * The pending/resolved promise is memoised in `tesseractPromise`, so
 * concurrent callers share a single dynamic import. If the import fails, or
 * the module does not export a `createWorker` function, the memo is cleared
 * (so a later call can retry) and the returned promise rejects with
 * `AlbexOcrInitError`.
 */
function loadTesseract() {
    if (!tesseractPromise) {
        // The bundler resolves this to the `tesseract.js` ESM entry.
        tesseractPromise = import('tesseract.js')
            .then((mod) => {
                if (typeof mod.createWorker !== 'function') {
                    throw new Error('tesseract.js: createWorker export missing');
                }
                return mod;
            })
            .catch((e) => {
                tesseractPromise = null; // allow retry
                throw new AlbexOcrInitError(`Failed to load tesseract.js: ${e.message}`);
            });
    }
    return tesseractPromise;
}
45
/**
 * Pool of Tesseract.js workers, one per language, created on demand.
 *
 * - `recognize(image, lang)` lazily creates (and then reuses) the worker for
 *   `lang`.
 * - When `idleTimeoutMs > 0`, a periodic sweep terminates workers that have
 *   no pending call and have not completed a recognition for at least
 *   `idleTimeoutMs`. A value `<= 0` disables the sweep.
 * - `dispose()` stops the sweep and terminates every registered worker.
 *
 * Concurrent first calls for the same language share one worker creation;
 * without that, each caller would create its own worker and all but the last
 * one registered would leak.
 *
 * NOTE: `dispose()` does not cancel a creation that is still in flight; such
 * a worker is registered once it finishes loading.
 */
export class OcrWorkerPool {
    /** Ready workers keyed by language code. */
    _workers = new Map();
    /** Worker creations still in flight, keyed by language code. */
    _creating = new Map();
    /** Handle of the periodic idle sweep; null when eviction is off or disposed. */
    _evictionTimer = null;
    _idleTimeoutMs;
    _langPath;
    /**
     * @param opts Optional settings: `idleTimeoutMs` (default 5 minutes;
     *   `<= 0` disables idle eviction) and `langPath` (forwarded to
     *   `createWorker` when set).
     */
    constructor(opts = {}) {
        this._idleTimeoutMs = opts.idleTimeoutMs ?? 5 * 60_000;
        this._langPath = opts.langPath;
        if (this._idleTimeoutMs > 0) {
            // Sweep at most once a minute; shorter timeouts sweep at their own period.
            this._evictionTimer = setInterval(() => { void this._sweepIdle(); }, Math.min(60_000, this._idleTimeoutMs));
            // Don't keep the event loop alive just because we have a timer.
            // (Browsers don't have unref; Node does.)
            const t = this._evictionTimer;
            t.unref?.();
        }
    }
    /**
     * Run OCR on a single image. Creates the language worker on first use and
     * caches it; later calls for the same language reuse it.
     *
     * @returns `{ text, confidence, timeMs }`, where `timeMs` covers only the
     *   worker's `recognize` call, not worker creation.
     * @throws AlbexOcrRecognitionError when the worker's `recognize` rejects.
     *   Failures while obtaining the worker (see `_getOrCreate`) propagate
     *   unchanged.
     */
    async recognize(image, lang) {
        const entry = await this._getOrCreate(lang);
        // `pending` shields the worker from the idle sweep while a call is running.
        entry.pending++;
        const t0 = performance.now();
        try {
            const { data } = await entry.worker.recognize(image);
            const elapsed = performance.now() - t0;
            entry.lastUsedAt = Date.now();
            return { text: data.text, confidence: data.confidence, timeMs: elapsed };
        }
        catch (e) {
            throw new AlbexOcrRecognitionError(`OCR failed for language=${lang}: ${e.message}`);
        }
        finally {
            entry.pending--;
        }
    }
    /**
     * Languages whose worker is currently registered in the pool.
     */
    loadedLanguages() {
        return [...this._workers.keys()];
    }
    /**
     * Stop the idle sweep and terminate every registered worker. Termination
     * failures are ignored (`Promise.allSettled`).
     */
    async dispose() {
        if (this._evictionTimer) {
            clearInterval(this._evictionTimer);
            this._evictionTimer = null;
        }
        const all = [...this._workers.values()];
        this._workers.clear();
        await Promise.allSettled(all.map(e => e.worker.terminate()));
    }
    /** TC39 explicit-resource-management alias. Fires `dispose()` without awaiting it. */
    [Symbol.dispose]() {
        void this.dispose();
    }
    // ── internals ─────────────────────────────────────────────────────────
    /**
     * Return the registered entry for `lang`, joining an in-flight creation
     * when there is one and starting a new creation otherwise.
     */
    async _getOrCreate(lang) {
        const ready = this._workers.get(lang);
        if (ready)
            return ready;
        let creation = this._creating.get(lang);
        if (!creation) {
            creation = this._spawn(lang);
            this._creating.set(lang, creation);
            // Drop the in-flight marker once the creation settles, so a failed
            // load can be retried by a later call.
            const forget = () => {
                if (this._creating.get(lang) === creation)
                    this._creating.delete(lang);
            };
            creation.then(forget, forget);
        }
        return creation;
    }
    /**
     * Create and register the worker for `lang`.
     *
     * @throws AlbexOcrLanguageError when `createWorker` rejects; a rejection
     *   from `loadTesseract()` propagates unchanged.
     */
    async _spawn(lang) {
        const tess = await loadTesseract();
        let worker;
        try {
            // Tesseract.js v5+ createWorker(lang, oem, opts); `oem: 1` selects
            // the LSTM-only engine.
            const opts = this._langPath ? { langPath: this._langPath } : undefined;
            worker = await tess.createWorker(lang, 1, opts);
        }
        catch (e) {
            throw new AlbexOcrLanguageError(lang, `Failed to load language model "${lang}": ${e.message}`);
        }
        const entry = {
            worker,
            lang,
            lastUsedAt: Date.now(),
            pending: 0,
        };
        this._workers.set(lang, entry);
        return entry;
    }
    /**
     * Terminate every worker that has no pending call and whose last completed
     * recognition (or creation) is at least `_idleTimeoutMs` old.
     */
    async _sweepIdle() {
        const now = Date.now();
        const victims = [];
        for (const [lang, entry] of this._workers) {
            if (entry.pending > 0)
                continue;
            if (now - entry.lastUsedAt < this._idleTimeoutMs)
                continue;
            // Unregister before terminating so no new call can pick up a dying worker.
            this._workers.delete(lang);
            victims.push(entry);
        }
        await Promise.allSettled(victims.map(v => v.worker.terminate()));
    }
}
146
+ //# sourceMappingURL=ocr-worker.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ocr-worker.js","sourceRoot":"","sources":["../src/ocr-worker.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;GAgBG;AAGH,OAAO,EACL,iBAAiB,EACjB,qBAAqB,EACrB,wBAAwB,GACzB,MAAM,aAAa,CAAC;AA8DrB;;;GAGG;AACH,IAAI,gBAAgB,GAAoC,IAAI,CAAC;AAE7D,SAAS,aAAa;IACpB,IAAI,gBAAgB;QAAE,OAAO,gBAAgB,CAAC;IAC9C,gBAAgB,GAAG,CAAC,KAAK,IAAI,EAAE;QAC7B,IAAI,CAAC;YACH,uEAAuE;YACvE,qEAAqE;YACrE,mDAAmD;YACnD,MAAM,GAAG,GAAG,CAAC,MAAM,MAAM,CAAC,cAAc,CAAC,CAA+B,CAAC;YACzE,IAAI,OAAO,GAAG,CAAC,YAAY,KAAK,UAAU,EAAE,CAAC;gBAC3C,MAAM,IAAI,KAAK,CAAC,2CAA2C,CAAC,CAAC;YAC/D,CAAC;YACD,OAAO,GAAG,CAAC;QACb,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,gBAAgB,GAAG,IAAI,CAAC,CAAC,cAAc;YACvC,MAAM,IAAI,iBAAiB,CAAC,gCAAiC,CAAW,CAAC,OAAO,EAAE,CAAC,CAAC;QACtF,CAAC;IACH,CAAC,CAAC,EAAE,CAAC;IACL,OAAO,gBAAgB,CAAC;AAC1B,CAAC;AAED,MAAM,OAAO,aAAa;IAChB,QAAQ,GAAG,IAAI,GAAG,EAAqB,CAAC;IACxC,cAAc,GAA0C,IAAI,CAAC;IACpD,cAAc,CAAS;IACvB,SAAS,CAAqB;IAE/C,YAAY,OAAyB,EAAE;QACrC,IAAI,CAAC,cAAc,GAAG,IAAI,CAAC,aAAa,IAAI,CAAC,GAAG,MAAM,CAAC;QACvD,IAAI,CAAC,SAAS,GAAQ,IAAI,CAAC,QAAQ,CAAC;QACpC,IAAI,IAAI,CAAC,cAAc,GAAG,CAAC,EAAE,CAAC;YAC5B,oEAAoE;YACpE,oCAAoC;YACpC,IAAI,CAAC,cAAc,GAAG,WAAW,CAC/B,GAAG,EAAE,GAAG,KAAK,IAAI,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC,EACjC,IAAI,CAAC,GAAG,CAAC,MAAM,EAAE,IAAI,CAAC,cAAc,CAAC,CACtC,CAAC;YACF,gEAAgE;YAChE,0CAA0C;YAC1C,MAAM,CAAC,GAAG,IAAI,CAAC,cAAmD,CAAC;YACnE,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC;QACd,CAAC;IACH,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,SAAS,CAAC,KAAgB,EAAE,IAAU;QAC1C,MAAM,KAAK,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC;QAC5C,KAAK,CAAC,OAAO,EAAE,CAAC;QAChB,MAAM,EAAE,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;QAC7B,IAAI,CAAC;YACH,MAAM,EAAE,IAAI,EAAE,GAAG,MAAM,KAAK,CAAC,MAAM,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;YACrD,MAAM,OAAO,GAAG,WAAW,CAAC,GAAG,EAAE,GAAG,EAAE,CAAC;YACvC,KAAK,CAAC,UAAU,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YAC9B,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,UAAU,EAAE,IAAI,CAAC,UAAU,EAAE,MAAM,EAAE,OAAO,EAAE,CAAC;QAC3E,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,MAAM,IAAI,wBAAwB,CAChC,2BAA2B,IAAI,KAAM,CAAW,CAAC,OAAO,EAAE,CAC
3D,CAAC;QACJ,CAAC;gBAAS,CAAC;YACT,KAAK,CAAC,OAAO,EAAE,CAAC;QAClB,CAAC;IACH,CAAC;IAED;;;OAGG;IACH,eAAe;QACb,OAAO,CAAC,GAAG,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC,CAAC;IACnC,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO;QACX,IAAI,IAAI,CAAC,cAAc,EAAE,CAAC;YACxB,aAAa,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;YACnC,IAAI,CAAC,cAAc,GAAG,IAAI,CAAC;QAC7B,CAAC;QACD,MAAM,GAAG,GAAG,CAAC,GAAG,IAAI,CAAC,QAAQ,CAAC,MAAM,EAAE,CAAC,CAAC;QACxC,IAAI,CAAC,QAAQ,CAAC,KAAK,EAAE,CAAC;QACtB,MAAM,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC;IAC/D,CAAC;IAED,wEAAwE;IACxE,CAAC,MAAM,CAAC,OAAO,CAAC;QACd,KAAK,IAAI,CAAC,OAAO,EAAE,CAAC;IACtB,CAAC;IAED,yEAAyE;IAEjE,KAAK,CAAC,YAAY,CAAC,IAAU;QACnC,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;QACzC,IAAI,QAAQ;YAAE,OAAO,QAAQ,CAAC;QAE9B,MAAM,IAAI,GAAG,MAAM,aAAa,EAAE,CAAC;QACnC,IAAI,MAAuB,CAAC;QAC5B,IAAI,CAAC;YACH,uEAAuE;YACvE,8DAA8D;YAC9D,MAAM,IAAI,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,QAAQ,EAAE,IAAI,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;YACvE,MAAM,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,IAAI,EAAE,CAAC,EAAE,IAAI,CAAC,CAAC;QAClD,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,MAAM,IAAI,qBAAqB,CAC7B,IAAI,EACJ,kCAAkC,IAAI,MAAO,CAAW,CAAC,OAAO,EAAE,CACnE,CAAC;QACJ,CAAC;QACD,MAAM,KAAK,GAAgB;YACzB,MAAM;YACN,IAAI;YACJ,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE;YACtB,OAAO,EAAE,CAAC;SACX,CAAC;QACF,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC;QAC/B,OAAO,KAAK,CAAC;IACf,CAAC;IAEO,KAAK,CAAC,UAAU;QACtB,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QACvB,MAAM,OAAO,GAAkB,EAAE,CAAC;QAClC,KAAK,MAAM,CAAC,IAAI,EAAE,KAAK,CAAC,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;YAC1C,IAAI,KAAK,CAAC,OAAO,GAAG,CAAC;gBAAE,SAAS;YAChC,IAAI,GAAG,GAAG,KAAK,CAAC,UAAU,GAAG,IAAI,CAAC,cAAc;gBAAE,SAAS;YAC3D,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;YAC3B,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACtB,CAAC;QACD,MAAM,OAAO,CAAC,UAAU,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC;IACnE,CAAC;CACF"}
@@ -0,0 +1,117 @@
1
+ /**
2
+ * Public surface of `@albex/ocr`.
3
+ *
4
+ * The orchestrator wires an `OcrWorkerPool` to an `AlbexEngine` instance.
5
+ * After `enableOcr(engine)`:
6
+ *
7
+ * - `engine.ocrImage(blob)` becomes available and returns recognised text.
8
+ * - The text can be indexed manually with `engine.indexText(name, text)`
9
+ * if such a primitive is exposed by the host (future addition).
10
+ *
11
+ * The returned `OcrHandle` lets the caller probe state, query loaded
12
+ * languages, force pre-load of specific languages, or shut down.
13
+ */
14
+ import type { ImageLike, RecognitionResult } from './ocr-worker.js';
15
+ import type { Lang } from './language-detector.js';
16
+ /**
17
+ * The subset of `AlbexEngine` we need. Kept minimal so this package's only
18
+ * peer dependency on `albex` is a type contract.
19
+ */
20
+ export interface OcrCapableEngine {
21
+ /** Slot where `enableOcr` installs the public `ocrImage` method. `enableOcr` throws if the slot is already set; the handle's `dispose()` removes it again. */
22
+ ocrImage?: (image: ImageLike, opts?: OcrRecognizeOptions) => Promise<RecognitionResult>;
23
+ /** Structural slot the engine reads to decide whether to invoke OCR on
24
+ * embedded images of PDFs that ALSO have vector text. Set by `enableOcr`
25
+ * when the `alwaysExtractEmbeddedImages` option is true and removed again by the handle's `dispose()`. */
26
+ ocrConfig?: {
27
+ alwaysExtractEmbeddedImages?: boolean;
28
+ };
29
+ }
30
+ export interface OcrOptions {
31
+ /**
32
+ * Languages that may be selected through `lang` / `hint` or preloaded; any other language is ignored. The list is used
33
+ * as a set, so order has no effect in the current implementation. Default: all 6 supported languages.
34
+ */
35
+ languages?: readonly Lang[];
36
+ /**
37
+ * Language used when a recognise call supplies no enabled `lang` or `hint`.
38
+ * It is not checked against `languages`. Default: `eng`.
39
+ */
40
+ defaultLanguage?: Lang;
41
+ /**
42
+ * Eagerly load these languages on `enableOcr` instead of waiting for
43
+ * the first call that needs them. Useful when you know the corpus
44
+ * language ahead of time and want a warm engine. Fire-and-forget: entries not in `languages` are skipped and load failures are swallowed.
45
+ */
46
+ preload?: readonly Lang[];
47
+ /**
48
+ * Milliseconds of inactivity after which idle Tesseract workers are
49
+ * terminated and their language models released. Default: 5 minutes. A value `<= 0` disables idle eviction.
50
+ */
51
+ idleTimeoutMs?: number;
52
+ /**
53
+ * Override the location from which `<lang>_fast.traineddata` is fetched.
54
+ * Defaults to the jsDelivr mirror Tesseract.js uses internally.
55
+ */
56
+ langPath?: string;
57
+ /**
58
+ * Hybrid PDF mode. When `true`, the engine OCRs embedded images of
59
+ * EVERY PDF, not just PDFs that have no extractable vector text.
60
+ * Useful for documents that mix native text with scanned annexes,
61
+ * stamps, signatures or diagrams with text labels.
62
+ *
63
+ * Cost: 1–3 s of OCR per qualifying image (only images larger than
64
+ * 200×200 are sent to Tesseract; logos, scanner-corner marks and
65
+ * signature glyphs are skipped server-side in Rust).
66
+ *
67
+ * Default: `false`. Keep it off unless your corpus is known to
68
+ * contain hybrid PDFs.
69
+ */
70
+ alwaysExtractEmbeddedImages?: boolean;
71
+ }
72
+ export interface OcrRecognizeOptions {
73
+ /**
74
+ * Force a specific language. Used only when it is one of the enabled
75
+ * `languages`; otherwise selection falls through to `hint` and then to
76
+ * `defaultLanguage`. No automatic from-text detection is performed yet.
77
+ */
78
+ lang?: Lang;
79
+ /**
80
+ * Language to use when `lang` is absent or not enabled. It must itself be
81
+ * one of the enabled `languages`; otherwise `defaultLanguage` applies.
82
+ */
83
+ hint?: Lang;
84
+ }
85
+ export interface OcrHandle {
86
+ /** Pre-load the language models for the listed languages. Languages that are not enabled are skipped and individual load failures are swallowed, so the promise resolves even if a model could not be loaded. */
87
+ preload(langs: readonly Lang[]): Promise<void>;
88
+ /** Currently loaded (in-memory) languages. */
89
+ loadedLanguages(): Lang[];
90
+ /** Tear down all workers and unhook from the engine (removes the `ocrImage` / `ocrConfig` slots that `enableOcr` installed). */
91
+ dispose(): Promise<void>;
92
+ }
93
+ /**
94
+ * Hook OCR into an Albex engine. Returns a handle that lets the caller
95
+ * preload languages, inspect state, and dispose cleanly. Throws `AlbexOcrError` (kind `ocr_already_enabled`) when `engine.ocrImage` is already set.
96
+ *
97
+ * Example:
98
+ *
99
+ * ```ts
100
+ * import { AlbexEngine } from 'albex';
101
+ * import { enableOcr } from '@albex/ocr';
102
+ *
103
+ * const engine = new AlbexEngine();
104
+ * await engine.init();
105
+ *
106
+ * const ocr = enableOcr(engine, { preload: ['eng', 'spa'] });
107
+ *
108
+ * const blob: Blob = await fetch('/scan.png').then(r => r.blob());
109
+ * const { text } = await engine.ocrImage(blob);
110
+ * ```
111
+ */
112
+ export declare function enableOcr<T extends OcrCapableEngine>(engine: T, opts?: OcrOptions): OcrHandle;
113
+ export { detectLanguage, detectLanguageOr, scoreLanguages, SUPPORTED_LANGS } from './language-detector.js';
114
+ export type { Lang } from './language-detector.js';
115
+ export type { RecognitionResult, ImageLike } from './ocr-worker.js';
116
+ export { AlbexOcrError, AlbexOcrInitError, AlbexOcrLanguageError, AlbexOcrRecognitionError, } from './errors.js';
117
+ //# sourceMappingURL=orchestrator.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"orchestrator.d.ts","sourceRoot":"","sources":["../src/orchestrator.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAGH,OAAO,KAAK,EAAE,SAAS,EAAE,iBAAiB,EAAE,MAAM,iBAAiB,CAAC;AAEpE,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,wBAAwB,CAAC;AAGnD;;;GAGG;AACH,MAAM,WAAW,gBAAgB;IAC/B,4EAA4E;IAC5E,QAAQ,CAAC,EAAE,CAAC,KAAK,EAAE,SAAS,EAAE,IAAI,CAAC,EAAE,mBAAmB,KAAK,OAAO,CAAC,iBAAiB,CAAC,CAAC;IACxF;;gEAE4D;IAC5D,SAAS,CAAC,EAAE;QAAE,2BAA2B,CAAC,EAAE,OAAO,CAAA;KAAE,CAAC;CACvD;AAED,MAAM,WAAW,UAAU;IACzB;;;OAGG;IACH,SAAS,CAAC,EAAE,SAAS,IAAI,EAAE,CAAC;IAC5B;;;OAGG;IACH,eAAe,CAAC,EAAE,IAAI,CAAC;IACvB;;;;OAIG;IACH,OAAO,CAAC,EAAE,SAAS,IAAI,EAAE,CAAC;IAC1B;;;OAGG;IACH,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB;;;OAGG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB;;;;;;;;;;;;OAYG;IACH,2BAA2B,CAAC,EAAE,OAAO,CAAC;CACvC;AAED,MAAM,WAAW,mBAAmB;IAClC;;;;OAIG;IACH,IAAI,CAAC,EAAE,IAAI,CAAC;IACZ;;;OAGG;IACH,IAAI,CAAC,EAAE,IAAI,CAAC;CACb;AAED,MAAM,WAAW,SAAS;IACxB,6DAA6D;IAC7D,OAAO,CAAC,KAAK,EAAE,SAAS,IAAI,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IAC/C,8CAA8C;IAC9C,eAAe,IAAI,IAAI,EAAE,CAAC;IAC1B,wDAAwD;IACxD,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;CAC1B;AAED;;;;;;;;;;;;;;;;;;GAkBG;AACH,wBAAgB,SAAS,CAAC,CAAC,SAAS,gBAAgB,EAClD,MAAM,EAAE,CAAC,EACT,IAAI,GAAE,UAAe,GACpB,SAAS,CAuDX;AA4CD,OAAO,EAAE,cAAc,EAAE,gBAAgB,EAAE,cAAc,EAAE,eAAe,EAAE,MAAM,wBAAwB,CAAC;AAC3G,YAAY,EAAE,IAAI,EAAE,MAAM,wBAAwB,CAAC;AACnD,YAAY,EAAE,iBAAiB,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAC;AACpE,OAAO,EACL,aAAa,EACb,iBAAiB,EACjB,qBAAqB,EACrB,wBAAwB,GACzB,MAAM,aAAa,CAAC"}
@@ -0,0 +1,120 @@
1
+ /**
2
+ * Public surface of `@albex/ocr`.
3
+ *
4
+ * The orchestrator wires an `OcrWorkerPool` to an `AlbexEngine` instance.
5
+ * After `enableOcr(engine)`:
6
+ *
7
+ * - `engine.ocrImage(blob)` becomes available and returns recognised text.
8
+ * - The text can be indexed manually with `engine.indexText(name, text)`
9
+ * if such a primitive is exposed by the host (future addition).
10
+ *
11
+ * The returned `OcrHandle` lets the caller probe state, query loaded
12
+ * languages, force pre-load of specific languages, or shut down.
13
+ */
14
+ import { OcrWorkerPool } from './ocr-worker.js';
15
+ import { SUPPORTED_LANGS } from './language-detector.js';
16
+ import { AlbexOcrError } from './errors.js';
17
/**
 * Hook OCR into an Albex engine. Returns a handle that lets the caller
 * preload languages, inspect state, and dispose cleanly.
 *
 * Effects on `engine`:
 * - installs `engine.ocrImage(image, recOpts)`, which picks a language via
 *   `pickLanguage` and delegates to the worker pool;
 * - sets `engine.ocrConfig = { alwaysExtractEmbeddedImages: true }` when
 *   that option is truthy.
 *
 * `handle.dispose()` removes the hooks before tearing the pool down, so no
 * new recognise call can reach a pool that is shutting down. It removes only
 * what this call installed, so disposing a stale handle (or disposing twice)
 * cannot unhook a newer `enableOcr` attachment or a host-owned `ocrConfig`.
 *
 * @param engine Engine to attach to; must not already have `ocrImage`.
 * @param opts See `OcrOptions`. `preload` is started fire-and-forget here.
 * @returns Handle exposing `preload`, `loadedLanguages` and `dispose`.
 * @throws AlbexOcrError (kind `ocr_already_enabled`) when `engine.ocrImage`
 *   is already set.
 *
 * Example:
 *
 * ```ts
 * import { AlbexEngine } from 'albex';
 * import { enableOcr } from '@albex/ocr';
 *
 * const engine = new AlbexEngine();
 * await engine.init();
 *
 * const ocr = enableOcr(engine, { preload: ['eng', 'spa'] });
 *
 * const blob: Blob = await fetch('/scan.png').then(r => r.blob());
 * const { text } = await engine.ocrImage(blob);
 * ```
 */
export function enableOcr(engine, opts = {}) {
    if (engine.ocrImage) {
        throw new AlbexOcrError('ocr_already_enabled', 'enableOcr called on an engine that already has OCR attached. Call dispose() on the previous handle first.');
    }
    const enabledLangs = new Set(opts.languages ?? SUPPORTED_LANGS);
    const defaultLang = opts.defaultLanguage ?? 'eng';
    const pool = new OcrWorkerPool({
        idleTimeoutMs: opts.idleTimeoutMs,
        langPath: opts.langPath,
    });
    // Warm the workers of the enabled languages in `langs` by recognising a
    // placeholder image. Best-effort by design: individual failures are
    // swallowed so one bad language does not reject the whole batch.
    const warmUp = (langs) => Promise.all(langs
        .filter(l => enabledLangs.has(l))
        .map(l => pool.recognize(EMPTY_PIXEL, l).catch(() => { })));
    // Pre-load requested languages without awaiting (fire-and-forget). The
    // user can also call `handle.preload(...)` to await explicitly.
    if (opts.preload && opts.preload.length > 0) {
        void warmUp(opts.preload);
    }
    // Attach the recognise method to the engine. The reference is kept so
    // dispose() can tell whether the slot still holds *this* attachment.
    const ocrImage = async (image, recOpts) => {
        const targetLang = pickLanguage(recOpts, enabledLangs, defaultLang);
        return pool.recognize(image, targetLang);
    };
    engine.ocrImage = ocrImage;
    // Hybrid-PDF flag. The engine reads this to decide whether to walk every
    // PDF's embedded images on top of the normal text extraction. Stored as
    // a separate slot so the structural contract with `albex` stays minimal.
    let installedConfig = null;
    if (opts.alwaysExtractEmbeddedImages) {
        installedConfig = { alwaysExtractEmbeddedImages: true };
        engine.ocrConfig = installedConfig;
    }
    return {
        async preload(langs) {
            await warmUp(langs);
        },
        loadedLanguages() {
            return pool.loadedLanguages();
        },
        async dispose() {
            // Unhook first, and only what this handle installed.
            if (engine.ocrImage === ocrImage) {
                delete engine.ocrImage;
            }
            if (installedConfig !== null && engine.ocrConfig === installedConfig) {
                delete engine.ocrConfig;
            }
            installedConfig = null;
            await pool.dispose();
        },
    };
}
80
/**
 * Choose the language for one recognise call.
 *
 * Candidates are tried in priority order and the first one that is both set
 * and enabled wins:
 *   1. `opts.lang`
 *   2. `opts.hint`
 * When neither qualifies, `fallback` is returned as-is (it is not checked
 * against `enabled`).
 *
 * (Automatic from-text detection can only happen AFTER a first recognise,
 * once there is output to inspect; a corrective second pass is a future
 * feature.)
 */
function pickLanguage(opts, enabled, fallback) {
    for (const candidate of [opts?.lang, opts?.hint]) {
        if (candidate && enabled.has(candidate)) {
            return candidate;
        }
    }
    return fallback;
}
97
/**
 * A 1×1 transparent RGBA PNG (67 bytes, exposed as an `ArrayBuffer`) used to
 * warm a language worker without making real inference effort.
 *
 * The IDAT chunk carries 10 bytes of zlib data, so its length field must be
 * 0x0A. A length of 0x0D makes the chunk overrun into its own CRC and the
 * IEND chunk, which yields a malformed file that image decoders reject.
 */
const EMPTY_PIXEL = (() => {
    const bytes = new Uint8Array([
        // PNG signature
        0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A,
        // IHDR: length 13, 1×1, 8-bit RGBA, followed by its CRC
        0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52,
        0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01,
        0x08, 0x06, 0x00, 0x00, 0x00,
        0x1F, 0x15, 0xC4, 0x89,
        // IDAT: length 10, zlib stream for one all-zero scanline, then CRC
        0x00, 0x00, 0x00, 0x0A, 0x49, 0x44, 0x41, 0x54,
        0x78, 0x9C, 0x63, 0x00, 0x01, 0x00, 0x00, 0x05, 0x00, 0x01,
        0x0D, 0x0A, 0x2D, 0xB4,
        // IEND: length 0, then CRC
        0x00, 0x00, 0x00, 0x00, 0x49, 0x45, 0x4E, 0x44,
        0xAE, 0x42, 0x60, 0x82,
    ]);
    return bytes.buffer;
})();
116
+ // Re-export the detector so consumers can use it directly when they have
117
+ // known-good source text to identify the document's language before OCR.
118
+ export { detectLanguage, detectLanguageOr, scoreLanguages, SUPPORTED_LANGS } from './language-detector.js';
119
+ export { AlbexOcrError, AlbexOcrInitError, AlbexOcrLanguageError, AlbexOcrRecognitionError, } from './errors.js';
120
+ //# sourceMappingURL=orchestrator.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"orchestrator.js","sourceRoot":"","sources":["../src/orchestrator.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAEH,OAAO,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAEhD,OAAO,EAAE,eAAe,EAAE,MAAM,wBAAwB,CAAC;AAEzD,OAAO,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAiF5C;;;;;;;;;;;;;;;;;;GAkBG;AACH,MAAM,UAAU,SAAS,CACvB,MAAS,EACT,OAAmB,EAAE;IAErB,IAAI,MAAM,CAAC,QAAQ,EAAE,CAAC;QACpB,MAAM,IAAI,aAAa,CACrB,qBAAqB,EACrB,2GAA2G,CAC5G,CAAC;IACJ,CAAC;IAED,MAAM,YAAY,GAAG,IAAI,GAAG,CAAO,IAAI,CAAC,SAAS,IAAI,eAAe,CAAC,CAAC;IACtE,MAAM,WAAW,GAAI,IAAI,CAAC,eAAe,IAAI,KAAK,CAAC;IACnD,MAAM,IAAI,GAAG,IAAI,aAAa,CAAC;QAC7B,aAAa,EAAE,IAAI,CAAC,aAAa;QACjC,QAAQ,EAAO,IAAI,CAAC,QAAQ;KAC7B,CAAC,CAAC;IAEH,uEAAuE;IACvE,gEAAgE;IAChE,IAAI,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC5C,KAAK,OAAO,CAAC,GAAG,CACd,IAAI,CAAC,OAAO;aACT,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;aAChC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,SAAS,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,EAAE,GAAoE,CAAC,CAAC,CAAC,CAC9H,CAAC;IACJ,CAAC;IAED,6CAA6C;IAC7C,MAAM,CAAC,QAAQ,GAAG,KAAK,EAAE,KAAK,EAAE,OAAO,EAAE,EAAE;QACzC,MAAM,UAAU,GAAG,YAAY,CAAC,OAAO,EAAE,YAAY,EAAE,WAAW,CAAC,CAAC;QACpE,OAAO,IAAI,CAAC,SAAS,CAAC,KAAK,EAAE,UAAU,CAAC,CAAC;IAC3C,CAAC,CAAC;IAEF,yEAAyE;IACzE,wEAAwE;IACxE,yEAAyE;IACzE,IAAI,IAAI,CAAC,2BAA2B,EAAE,CAAC;QACrC,MAAM,CAAC,SAAS,GAAG,EAAE,2BAA2B,EAAE,IAAI,EAAE,CAAC;IAC3D,CAAC;IAED,OAAO;QACL,KAAK,CAAC,OAAO,CAAC,KAAK;YACjB,MAAM,OAAO,CAAC,GAAG,CACf,KAAK;iBACF,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;iBAChC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,SAAS,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,EAAE,GAAiB,CAAC,CAAC,CAAC,CAC3E,CAAC;QACJ,CAAC;QACD,eAAe;YACb,OAAO,IAAI,CAAC,eAAe,EAAE,CAAC;QAChC,CAAC;QACD,KAAK,CAAC,OAAO;YACX,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC;YACrB,OAAO,MAAM,CAAC,QAAQ,CAAC;YACvB,OAAO,MAAM,CAAC,SAAS,CAAC;QAC1B,CAAC;KACF,CAAC;AACJ,CAAC;AAED;;;;;;;;;GASG;AACH,SAAS,YAAY,CACnB,IAAqC,EACrC,OAA0B,EAC1B,QAAc;IAEd,IAAI,IAAI,EAAE,IAAI,IAAI,OAAO,CAAC,GA
AG,CAAC,IAAI,CAAC,IAAI,CAAC;QAAI,OAAO,IAAI,CAAC,IAAI,CAAC;IAC7D,IAAI,IAAI,EAAE,IAAI,IAAI,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC;QAAI,OAAO,IAAI,CAAC,IAAI,CAAC;IAC7D,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;;;GAIG;AACH,MAAM,WAAW,GAAG,CAAC,GAAG,EAAE;IACxB,MAAM,KAAK,GAAG,IAAI,UAAU,CAAC;QAC3B,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI;QAC9C,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI;QAC9C,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI;QAC9C,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI;QAC9C,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI;QAC9C,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI;QAC9C,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI;QAC9C,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI;QAC9C,IAAI,EAAE,IAAI,EAAE,IAAI;KACjB,CAAC,CAAC;IACH,OAAO,KAAK,CAAC,MAAM,CAAC;AACtB,CAAC,CAAC,EAAE,CAAC;AAEL,yEAAyE;AACzE,yEAAyE;AACzE,OAAO,EAAE,cAAc,EAAE,gBAAgB,EAAE,cAAc,EAAE,eAAe,EAAE,MAAM,wBAAwB,CAAC;AAG3G,OAAO,EACL,aAAa,EACb,iBAAiB,EACjB,qBAAqB,EACrB,wBAAwB,GACzB,MAAM,aAAa,CAAC"}
package/package.json ADDED
@@ -0,0 +1,45 @@
1
+ {
2
+ "name": "@albex/ocr",
3
+ "version": "0.2.0",
4
+ "description": "OCR module for Albex — Tesseract.js fast, lazy by language, zero-impact on the base bundle.",
5
+ "type": "module",
6
+ "main": "./dist/index.js",
7
+ "module": "./dist/index.js",
8
+ "types": "./dist/index.d.ts",
9
+ "exports": {
10
+ ".": {
11
+ "import": "./dist/index.js",
12
+ "types": "./dist/index.d.ts"
13
+ }
14
+ },
15
+ "files": [
16
+ "dist/",
17
+ "src/",
18
+ "README.md"
19
+ ],
20
+ "scripts": {
21
+ "build": "tsc",
22
+ "typecheck": "tsc --noEmit",
23
+ "test": "vitest run",
24
+ "test:watch": "vitest"
25
+ },
26
+ "peerDependencies": {
27
+ "albex": "^0.3.0"
28
+ },
29
+ "dependencies": {
30
+ "tesseract.js": "^5.1.0"
31
+ },
32
+ "devDependencies": {
33
+ "typescript": "^5.4.0",
34
+ "vitest": "^2.0.0"
35
+ },
36
+ "keywords": [
37
+ "ocr",
38
+ "tesseract",
39
+ "albex",
40
+ "wasm",
41
+ "browser",
42
+ "search"
43
+ ],
44
+ "license": "MIT"
45
+ }
package/src/errors.ts ADDED
@@ -0,0 +1,37 @@
1
+ /**
2
+ * Errors thrown by `@albex/ocr`. Mirrors the AlbexError hierarchy of the
3
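+ * main package: subclass of `Error`, exposes a `kind` discriminator.
4
+ * NOTE(review): `structuredClone` does not copy custom own properties of Error objects, so `kind` (and the subclass identity) has to be serialised explicitly when an error crosses a worker boundary — confirm how the main package does this.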
5
+ */
6
+
7
/**
 * Base class of every error thrown by `@albex/ocr`.
 *
 * `kind` is a string discriminator callers can switch on without relying on
 * `instanceof`.
 */
export class AlbexOcrError extends Error {
  override name = 'AlbexOcrError';
  readonly kind: string;

  constructor(kind: string, message: string) {
    super(message);
    this.kind = kind;
  }
}
15
+
16
/** Error with kind `ocr_init`; the worker module throws it when `tesseract.js` cannot be loaded. */
export class AlbexOcrInitError extends AlbexOcrError {
  override name = 'AlbexOcrInitError';

  constructor(message: string) {
    super('ocr_init', message);
  }
}
22
+
23
/** Error with kind `ocr_language`; `lang` records the language passed to the constructor. */
export class AlbexOcrLanguageError extends AlbexOcrError {
  override name = 'AlbexOcrLanguageError';

  constructor(
    readonly lang: string,
    message: string,
  ) {
    super('ocr_language', message);
  }
}
31
+
32
/** Error with kind `ocr_recognition`; the worker pool throws it when a recognise call fails. */
export class AlbexOcrRecognitionError extends AlbexOcrError {
  override name = 'AlbexOcrRecognitionError';

  constructor(message: string) {
    super('ocr_recognition', message);
  }
}
package/src/index.ts ADDED
@@ -0,0 +1,48 @@
1
+ /**
2
+ * `@albex/ocr` — OCR module for Albex.
3
+ *
4
+ * Drop-in OCR for Albex powered by Tesseract.js. Lazy loaded at every level:
5
+ * the package itself is opt-in, the Tesseract.js library is dynamic-imported
6
+ * on first use, language models are downloaded on demand and cached forever
7
+ * in IndexedDB.
8
+ *
9
+ * Quick start:
10
+ *
11
+ * ```ts
12
+ * import { AlbexEngine } from 'albex';
13
+ * import { enableOcr } from '@albex/ocr';
14
+ *
15
+ * const engine = new AlbexEngine();
16
+ * await engine.init();
17
+ *
18
+ * const ocr = enableOcr(engine);
19
+ *
20
+ * const { text } = await engine.ocrImage(myImageBlob);
21
+ * console.log(text);
22
+ * ```
23
+ */
24
+
25
+ export {
26
+ enableOcr,
27
+ detectLanguage,
28
+ detectLanguageOr,
29
+ scoreLanguages,
30
+ SUPPORTED_LANGS,
31
+ AlbexOcrError,
32
+ AlbexOcrInitError,
33
+ AlbexOcrLanguageError,
34
+ AlbexOcrRecognitionError,
35
+ } from './orchestrator.js';
36
+
37
+ export type {
38
+ OcrHandle,
39
+ OcrOptions,
40
+ OcrRecognizeOptions,
41
+ OcrCapableEngine,
42
+ Lang,
43
+ RecognitionResult,
44
+ ImageLike,
45
+ } from './orchestrator.js';
46
+
47
+ export { OcrWorkerPool } from './ocr-worker.js';
48
+ export type { OcrWorkerOptions } from './ocr-worker.js';
@@ -0,0 +1,129 @@
1
+ /**
2
+ * Lightweight language detection for the 6 pre-supported OCR languages.
3
+ *
4
+ * The signal is a combination of:
5
+ * 1. Distinctive single characters (`ñ`, `ç`, `ß`, …).
6
+ * 2. The most common stop words of each language, scored by frequency.
7
+ *
8
+ * The detector is meant to be called on a SHORT sample of source text — a
9
+ * paragraph or two from a known-good area of the document, or a previous
10
+ * OCR pass of the first page. It is NOT a robust language ID model; the
11
+ * threshold of confidence below which we fall back to `eng` is intentional.
12
+ *
13
+ * Total weight in the bundle: ~4 KB minified once the rest of the package
14
+ * is tree-shaken alongside it.
15
+ */
16
+
17
+ export type Lang = 'eng' | 'spa' | 'fra' | 'deu' | 'ita' | 'por';
18
+
19
+ export const SUPPORTED_LANGS: readonly Lang[] = ['eng', 'spa', 'fra', 'deu', 'ita', 'por'];
20
+
21
+ interface LangProfile {
22
+ /** Single Unicode characters that strongly suggest this language. */
23
+ chars: string[];
24
+ /** Stop words; first ones are most common. */
25
+ stopWords: string[];
26
+ }
27
+
28
+ /**
29
+ * One profile per language. Stop words come from the OPUS-100 / Wikipedia
30
+ * frequency tables; the top 12 of each language cover ~25-30 % of any
31
+ * substantial text. Distinctive characters are conservative — listed only
32
+ * when they appear in that language and almost nowhere else among our six. NOTE(review): `ç`, `à`, `â`, `ê` and `ô` are also ordinary Portuguese letters (and `à` is common in Italian), so the `fra` list is less exclusive than stated — confirm the intended scoring.
33
+ */
34
+ const PROFILES: Record<Lang, LangProfile> = {
35
+ eng: {
36
+ chars: [], // English shares its alphabet with everyone else in our set.
37
+ stopWords: ['the', 'of', 'and', 'to', 'in', 'a', 'is', 'that', 'for', 'it', 'with', 'as'],
38
+ },
39
+ spa: {
40
+ chars: ['ñ', '¿', '¡'],
41
+ stopWords: ['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'se', 'del', 'las', 'por'],
42
+ },
43
+ fra: {
44
+ chars: ['ç', 'œ', 'à', 'â', 'ê', 'ô', 'û'],
45
+ stopWords: ['de', 'la', 'le', 'et', 'les', 'des', 'un', 'une', 'que', 'il', 'au', 'aux'],
46
+ },
47
+ deu: {
48
+ chars: ['ß', 'ü', 'ö', 'ä'],
49
+ stopWords: ['die', 'der', 'und', 'in', 'den', 'von', 'zu', 'das', 'mit', 'sich', 'des', 'auf'],
50
+ },
51
+ ita: {
52
+ chars: [], // No unique distinctive chars vs spa/por.
53
+ stopWords: ['di', 'la', 'il', 'e', 'che', 'un', 'del', 'le', 'per', 'una', 'gli', 'sono'],
54
+ },
55
+ por: {
56
+ chars: ['ã', 'õ'],
57
+ stopWords: ['de', 'a', 'o', 'que', 'e', 'do', 'da', 'em', 'um', 'para', 'os', 'com'],
58
+ },
59
+ };
60
+
61
+ const CHAR_WEIGHT = 8;
62
+ const STOP_WEIGHT = 4;
63
+
64
/**
 * Score how likely a sample of text is in each candidate language.
 *
 * For every language in `SUPPORTED_LANGS`:
 * - each distinctive character that occurs anywhere in the sample adds
 *   `CHAR_WEIGHT` once, however often it occurs;
 * - each stop word present in the sample adds up to `STOP_WEIGHT`, decaying
 *   linearly with the word's position in the profile list.
 *
 * Scores are raw sums, not normalised. Every score is 0 when nothing matches
 * (for example for an empty string). Use `detectLanguage` for the simple
 * "which one?" answer.
 *
 * The sample is lower-cased and normalised to NFC first, so text that
 * arrives with decomposed diacritics (`n` + U+0303 instead of `ñ`) still
 * matches the distinctive characters and is not split mid-word by the
 * tokeniser.
 *
 * @param text Sample to score.
 * @returns One score per supported language.
 */
export function scoreLanguages(text: string): Record<Lang, number> {
  const lower = text.toLowerCase().normalize('NFC');
  // Tokenise once: runs of Unicode letters, digits and apostrophes are
  // words; everything else is a separator.
  const wordSet = new Set(lower.match(/[\p{L}\p{N}']+/gu) ?? []);

  const scores: Record<Lang, number> = { eng: 0, spa: 0, fra: 0, deu: 0, ita: 0, por: 0 };

  for (const lang of SUPPORTED_LANGS) {
    const { chars, stopWords } = PROFILES[lang];

    // Distinctive char score.
    for (const c of chars) {
      if (lower.includes(c)) scores[lang] += CHAR_WEIGHT;
    }

    // Stop word score: first word in the list = highest weight.
    stopWords.forEach((word, i) => {
      if (wordSet.has(word)) {
        scores[lang] += STOP_WEIGHT * (1 - i / stopWords.length);
      }
    });
  }

  return scores;
}
99
+
100
/**
 * Pick the most likely language for `text`.
 *
 * Samples shorter than 20 characters are not scored at all. Otherwise the
 * language with the highest `scoreLanguages` score wins, with ties going to
 * the language listed first in `SUPPORTED_LANGS`. The winner is returned
 * only if its score reaches `minScore`.
 *
 * @param text Sample to classify.
 * @param minScore Threshold below which the detector abstains (default 6).
 * @returns The detected language, or `null` when the signal is too weak;
 *   callers should then fall back to a configured default (usually `eng`).
 */
export function detectLanguage(text: string, minScore = 6): Lang | null {
  if (text.length < 20) return null; // Nothing to learn from.

  const tally = scoreLanguages(text);
  const leader = SUPPORTED_LANGS.reduce<{ lang: Lang | null; score: number }>(
    (top, lang) => (tally[lang] > top.score ? { lang, score: tally[lang] } : top),
    { lang: null, score: -1 },
  );
  return leader.score >= minScore ? leader.lang : null;
}
122
+
123
/**
 * Like `detectLanguage`, but never abstains: when detection yields no
 * language, `fallback` is returned instead.
 *
 * @param text Sample to classify.
 * @param fallback Language returned when detection abstains (default `eng`).
 * @param minScore Threshold forwarded to `detectLanguage` (default 6).
 */
export function detectLanguageOr(text: string, fallback: Lang = 'eng', minScore = 6): Lang {
  const detected = detectLanguage(text, minScore);
  return detected == null ? fallback : detected;
}