@albex/ocr 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,223 @@
1
+ /**
2
+ * Wrapper around Tesseract.js.
3
+ *
4
+ * Responsibilities:
5
+ * 1. Lazy-load the Tesseract.js library on first use (dynamic import).
6
+ * 2. Maintain a per-language `Worker` instance, created on demand.
7
+ * 3. Auto-terminate idle workers after a configurable timeout to release
8
+ * the LSTM model from memory (each Tesseract worker holds 2-5 MB).
9
+ * 4. Expose a Promise-based `recognize(image, lang)` that returns text +
10
+ * confidence in a stable shape.
11
+ *
12
+ * Why one worker per language: Tesseract.js workers are tied to the language
13
+ * model they were initialised with. Switching languages on the same worker
14
+ * triggers a slow reload of the LSTM. Maintaining N workers — one per
15
+ * language ever used — keeps each recognise call fast at the cost of slightly
16
+ * more memory, which the idle eviction then claws back.
17
+ */
18
+
19
+ import type { Lang } from './language-detector.js';
20
+ import {
21
+ AlbexOcrInitError,
22
+ AlbexOcrLanguageError,
23
+ AlbexOcrRecognitionError,
24
+ } from './errors.js';
25
+
26
/**
 * The slice of the Tesseract.js `Worker` API this module relies on:
 * running one recognition and shutting the worker down.
 */
interface TesseractWorker {
  /** Stops the worker. The pool calls this on eviction and on dispose. */
  terminate(): Promise<void>;
  /** Runs OCR on `image`; only `data.text` and `data.confidence` are read. */
  recognize(image: ImageLike): Promise<{ data: { text: string; confidence: number } }>;
}
31
+
32
/**
 * Call signature of the `createWorker` factory as used here: a language
 * code, an optional engine mode (`oem`), and optional asset locations.
 */
type TesseractCreateWorker = (
  lang: string,
  oem?: number,
  opts?: { workerPath?: string; corePath?: string; langPath?: string },
) => Promise<TesseractWorker>;
35
+
36
/** Shape expected from the dynamically imported module; only `createWorker` is used. */
interface TesseractModule {
  createWorker: TesseractCreateWorker;
}
39
+
40
/**
 * Input accepted by `recognize`. Covers raw binary data (`Blob`,
 * `ArrayBuffer`, `Uint8Array`), a `string` (presumably a URL or data URI —
 * confirm against the Tesseract.js docs), and DOM / offscreen image surfaces
 * (`HTMLImageElement`, `HTMLCanvasElement`, `OffscreenCanvas`).
 *
 * The value is handed to the Tesseract.js worker unchanged; this module does
 * not inspect or convert it.
 */
export type ImageLike =
  | string
  | Blob
  | ArrayBuffer
  | Uint8Array
  | HTMLImageElement
  | HTMLCanvasElement
  | OffscreenCanvas;
53
+
54
/** Stable result shape returned by `OcrWorkerPool.recognize`. */
export interface RecognitionResult {
  /** Raw OCR output. May contain newlines for paragraphs / lines. */
  text: string;
  /** 0-100 confidence reported by Tesseract for the page. */
  confidence: number;
  /**
   * Wall-clock milliseconds spent inside the worker's `recognize` call,
   * measured with `performance.now()`. Worker start-up is not included.
   */
  timeMs: number;
}
62
+
63
/** Construction options for `OcrWorkerPool`. */
export interface OcrWorkerOptions {
  /**
   * Milliseconds of inactivity after which an idle Tesseract worker is
   * terminated and its language model released. Default: 5 minutes.
   *
   * Idle workers are found by a periodic sweep that runs every
   * `min(60 s, idleTimeoutMs)`, so eviction can lag the threshold by up to
   * one sweep interval. A value of 0 (or any non-positive value) disables
   * eviction: workers then live until `dispose()` is called.
   */
  idleTimeoutMs?: number;
  /**
   * Override the `tessdata_fast` mirror. When omitted (or empty) no
   * `langPath` is passed and Tesseract.js uses its own default — per the
   * original note, the official jsDelivr mirror.
   */
  langPath?: string;
}
76
+
77
/** Bookkeeping record the pool keeps for each live worker. */
interface WorkerEntry {
  /** The Tesseract worker itself. */
  worker: TesseractWorker;
  /** Language the worker was created for (also its key in the pool map). */
  lang: Lang;
  /** `Date.now()` at creation, refreshed by `recognize`; compared against the idle timeout. */
  lastUsedAt: number;
  /** Count of in-flight recognitions; the idle sweep skips entries where this is > 0. */
  pending: number;
}
85
+
86
/**
 * Single global Tesseract module reference. The dynamic import is shared
 * across every OCR call once it resolves.
 */
let tesseractPromise: Promise<TesseractModule> | null = null;

/**
 * Lazily imports `tesseract.js` and memoises the resulting promise so the
 * import runs at most once while it is pending or after it has succeeded.
 *
 * @returns The module object exposing `createWorker`.
 * @throws AlbexOcrInitError when the import fails or no `createWorker`
 *   function can be found. The memoised promise is cleared in that case so a
 *   later call can retry.
 */
function loadTesseract(): Promise<TesseractModule> {
  if (tesseractPromise) return tesseractPromise;
  tesseractPromise = (async () => {
    try {
      // The bundler resolves this to the `tesseract.js` entry. Tesseract
      // internally pulls its own WASM core and worker script lazily, so we
      // don't pay for them until createWorker is called.
      const imported = (await import('tesseract.js')) as unknown as
        Partial<TesseractModule> & { default?: Partial<TesseractModule> };
      // Some loaders expose a CommonJS package only through `default`;
      // accept either shape.
      const candidate =
        typeof imported.createWorker === 'function' ? imported : imported.default;
      if (!candidate || typeof candidate.createWorker !== 'function') {
        throw new Error('tesseract.js: createWorker export missing');
      }
      return candidate as TesseractModule;
    } catch (e) {
      tesseractPromise = null; // allow retry
      // A rejected import is not guaranteed to carry an Error instance.
      const reason = e instanceof Error ? e.message : String(e);
      throw new AlbexOcrInitError(`Failed to load tesseract.js: ${reason}`);
    }
  })();
  return tesseractPromise;
}
111
+
112
/**
 * Pool of Tesseract workers, one per language, created on demand and
 * terminated after a period of inactivity.
 *
 * Concurrency notes:
 * - Concurrent requests for a language that is still loading share a single
 *   worker creation, so only one worker per language is ever spawned.
 * - A worker whose creation completes after `dispose()` was called is
 *   terminated immediately instead of being cached.
 */
export class OcrWorkerPool {
  /** Ready workers, keyed by language. */
  private readonly _workers = new Map<Lang, WorkerEntry>();
  /** Worker creations currently in flight, keyed by language. */
  private readonly _creating = new Map<Lang, Promise<WorkerEntry>>();
  private _evictionTimer: ReturnType<typeof setInterval> | null = null;
  private readonly _idleTimeoutMs: number;
  private readonly _langPath: string | undefined;
  /** Bumped by `dispose()`; lets in-flight creations notice the teardown. */
  private _epoch = 0;

  /**
   * @param opts Idle-eviction timeout and optional language-data location.
   */
  constructor(opts: OcrWorkerOptions = {}) {
    this._idleTimeoutMs = opts.idleTimeoutMs ?? 5 * 60_000;
    this._langPath = opts.langPath;
    if (this._idleTimeoutMs > 0) {
      // Sweep every minute (or more often for short timeouts). We don't need
      // finer granularity for "did this worker idle past the threshold?".
      this._evictionTimer = setInterval(
        () => { void this._sweepIdle(); },
        Math.min(60_000, this._idleTimeoutMs),
      );
      // Don't keep the event loop alive just because we have a timer.
      // (Browsers don't have unref; Node does.)
      const t = this._evictionTimer as unknown as { unref?: () => void };
      t.unref?.();
    }
  }

  /**
   * Run OCR on a single image. Spawns the appropriate language worker on
   * first use and caches it; subsequent calls for the same language reuse it.
   *
   * @param image Input forwarded unchanged to the Tesseract worker.
   * @param lang Language whose worker should process the image.
   * @returns Recognised text, confidence, and time spent in the worker call.
   * @throws AlbexOcrInitError if `tesseract.js` cannot be loaded or the pool
   *   is disposed while the worker is starting.
   * @throws AlbexOcrLanguageError if the language worker cannot be created.
   * @throws AlbexOcrRecognitionError if the worker's `recognize` rejects.
   */
  async recognize(image: ImageLike, lang: Lang): Promise<RecognitionResult> {
    const entry = await this._getOrCreate(lang);
    entry.pending++;
    const startedAt = performance.now();
    try {
      const { data } = await entry.worker.recognize(image);
      return {
        text: data.text,
        confidence: data.confidence,
        timeMs: performance.now() - startedAt,
      };
    } catch (e) {
      throw new AlbexOcrRecognitionError(
        `OCR failed for language=${lang}: ${OcrWorkerPool._reason(e)}`,
      );
    } finally {
      entry.pending--;
      // Failed attempts count as activity too, so a worker that was just
      // exercised is not evicted by the very next sweep.
      entry.lastUsedAt = Date.now();
    }
  }

  /**
   * Names of languages currently loaded in memory. Useful for diagnostics
   * and for the demo's runtime panel. Languages still loading are excluded.
   */
  loadedLanguages(): Lang[] {
    return [...this._workers.keys()];
  }

  /**
   * Tear down all workers immediately. Called by orchestrator dispose.
   *
   * Stops the eviction timer, forgets every cached worker, and terminates
   * them. Termination failures are ignored. Worker creations still in flight
   * are not awaited; they terminate their own worker once they complete.
   */
  async dispose(): Promise<void> {
    if (this._evictionTimer) {
      clearInterval(this._evictionTimer);
      this._evictionTimer = null;
    }
    this._epoch++;
    this._creating.clear();
    const all = [...this._workers.values()];
    this._workers.clear();
    await Promise.allSettled(all.map(e => e.worker.terminate()));
  }

  /** TC39 explicit-resource-management alias. Fires `dispose()` async. */
  [Symbol.dispose](): void {
    void this.dispose();
  }

  // ── internals ─────────────────────────────────────────────────────────

  /** Human-readable message for any thrown value, not just `Error`s. */
  private static _reason(e: unknown): string {
    return e instanceof Error ? e.message : String(e);
  }

  /**
   * Returns the cached worker for `lang`, joins a creation already in
   * flight, or starts a new one. Sharing the in-flight promise is what
   * prevents two concurrent callers from each spawning a worker.
   */
  private _getOrCreate(lang: Lang): Promise<WorkerEntry> {
    const ready = this._workers.get(lang);
    if (ready) return Promise.resolve(ready);

    const inFlight = this._creating.get(lang);
    if (inFlight) return inFlight;

    const creation: Promise<WorkerEntry> = this._create(lang).finally(() => {
      // Only clear our own slot: dispose() may have emptied the map and a
      // newer creation for the same language may already occupy it.
      if (this._creating.get(lang) === creation) this._creating.delete(lang);
    });
    this._creating.set(lang, creation);
    return creation;
  }

  /** Creates a worker for `lang` and registers it in the pool. */
  private async _create(lang: Lang): Promise<WorkerEntry> {
    const epochAtStart = this._epoch;
    const tess = await loadTesseract();
    let worker: TesseractWorker;
    try {
      // Tesseract.js v5+ createWorker(lang, oem, opts). `oem: 1` selects the
      // LSTM-only engine mode.
      const opts = this._langPath ? { langPath: this._langPath } : undefined;
      worker = await tess.createWorker(lang, 1, opts);
    } catch (e) {
      throw new AlbexOcrLanguageError(
        lang,
        `Failed to load language model "${lang}": ${OcrWorkerPool._reason(e)}`,
      );
    }
    if (epochAtStart !== this._epoch) {
      // dispose() ran while the worker was starting. Caching it now would
      // leak it, so shut it down (best effort) and fail the request.
      await worker.terminate().catch(() => undefined);
      throw new AlbexOcrInitError(
        `OCR worker pool was disposed while loading language "${lang}"`,
      );
    }
    const entry: WorkerEntry = {
      worker,
      lang,
      lastUsedAt: Date.now(),
      pending: 0,
    };
    this._workers.set(lang, entry);
    return entry;
  }

  /**
   * Removes and terminates every worker that has no recognition in flight
   * and has been idle for at least the configured timeout.
   */
  private async _sweepIdle(): Promise<void> {
    const now = Date.now();
    const victims: WorkerEntry[] = [];
    for (const [lang, entry] of this._workers) {
      if (entry.pending > 0) continue;
      if (now - entry.lastUsedAt < this._idleTimeoutMs) continue;
      this._workers.delete(lang);
      victims.push(entry);
    }
    await Promise.allSettled(victims.map(v => v.worker.terminate()));
  }
}
@@ -0,0 +1,229 @@
1
+ /**
2
+ * Public surface of `@albex/ocr`.
3
+ *
4
+ * The orchestrator wires an `OcrWorkerPool` to an `AlbexEngine` instance.
5
+ * After `enableOcr(engine)`:
6
+ *
7
+ * - `engine.ocrImage(blob)` becomes available and returns recognised text.
8
+ * - The text can be indexed manually with `engine.indexText(name, text)`
9
+ * if such a primitive is exposed by the host (future addition).
10
+ *
11
+ * The returned `OcrHandle` lets the caller probe state, query loaded
12
+ * languages, force pre-load of specific languages, or shut down.
13
+ */
14
+
15
+ import { OcrWorkerPool } from './ocr-worker.js';
16
+ import type { ImageLike, RecognitionResult } from './ocr-worker.js';
17
+ import { SUPPORTED_LANGS } from './language-detector.js';
18
+ import type { Lang } from './language-detector.js';
19
+ import { AlbexOcrError } from './errors.js';
20
+
21
/**
 * Structural view of the host engine: just the two optional slots that
 * `enableOcr` writes. Keeping it this small means the package depends on
 * `albex` only through a type contract.
 */
export interface OcrCapableEngine {
  /**
   * Recognise method installed by `enableOcr`. Its presence is also how
   * `enableOcr` detects that OCR is already attached.
   */
  ocrImage?: (image: ImageLike, opts?: OcrRecognizeOptions) => Promise<RecognitionResult>;
  /**
   * Configuration slot written by `enableOcr` when the
   * `alwaysExtractEmbeddedImages` option is true. The engine is expected to
   * read it to decide whether to OCR embedded images of PDFs that also have
   * vector text.
   */
  ocrConfig?: { alwaysExtractEmbeddedImages?: boolean };
}
33
+
34
/** Options accepted by `enableOcr`. */
export interface OcrOptions {
  /**
   * Languages enabled for this engine. Per-call `lang` / `hint` values and
   * `preload` entries outside this set are ignored. Default: every entry of
   * `SUPPORTED_LANGS`.
   *
   * The original note says order matters only for tie-breaking during
   * auto-detection; no detection step is wired into this module yet.
   */
  languages?: readonly Lang[];
  /**
   * Language used when a call supplies no usable `lang` or `hint`.
   * Default: `eng`. It is used as given, without being checked against
   * `languages`.
   */
  defaultLanguage?: Lang;
  /**
   * Languages to start loading as soon as `enableOcr` runs, instead of on
   * the first call that needs them. Loading is fire-and-forget; use
   * `handle.preload(...)` to wait for it. Useful when the corpus language is
   * known ahead of time and a warm engine is wanted.
   */
  preload?: readonly Lang[];
  /**
   * Milliseconds of inactivity after which idle Tesseract workers are
   * terminated and their language models released. Default: 5 minutes.
   */
  idleTimeoutMs?: number;
  /**
   * Override the location from which `<lang>_fast.traineddata` is fetched.
   * Defaults to the jsDelivr mirror Tesseract.js uses internally.
   */
  langPath?: string;
  /**
   * Hybrid PDF mode. When `true`, `enableOcr` sets
   * `engine.ocrConfig.alwaysExtractEmbeddedImages`, asking the engine to OCR
   * embedded images of EVERY PDF, not just PDFs without extractable vector
   * text. Useful for documents that mix native text with scanned annexes,
   * stamps, signatures or diagrams with text labels.
   *
   * Cost, per the original note: 1–3 s of OCR per qualifying image. Only
   * images larger than 200×200 are sent to Tesseract; smaller ones (logos,
   * scanner-corner marks, signature glyphs) are filtered out on the engine
   * side in Rust. That filtering is not visible in this package.
   *
   * Default: `false`. Keep it off unless your corpus is known to contain
   * hybrid PDFs.
   */
  alwaysExtractEmbeddedImages?: boolean;
}
76
+
77
/** Per-call options for `engine.ocrImage`. */
export interface OcrRecognizeOptions {
  /**
   * Requested language. It is honoured only when it is one of the enabled
   * `languages`; otherwise the call falls through to `hint` and then to
   * `defaultLanguage`. No automatic language detection is performed.
   */
  lang?: Lang;
  /**
   * Softer suggestion, consulted when `lang` is absent or not enabled.
   * It is likewise ignored unless it is one of the enabled `languages`.
   */
  hint?: Lang;
}
90
+
91
/** Control surface returned by `enableOcr`. */
export interface OcrHandle {
  /**
   * Load the workers for the given languages and resolve once every attempt
   * has finished. Languages that are not enabled are skipped, and individual
   * warm-up failures do not reject the promise.
   */
  preload(langs: readonly Lang[]): Promise<void>;
  /** Languages whose workers are currently held in memory. */
  loadedLanguages(): Lang[];
  /** Terminate all workers and detach the OCR hooks from the engine. */
  dispose(): Promise<void>;
}
99
+
100
/**
 * Hook OCR into an Albex engine. Returns a handle that lets the caller
 * preload languages, inspect state, and dispose cleanly.
 *
 * Side effects on `engine`:
 * - `engine.ocrImage` is installed. It picks a language (explicit `lang`,
 *   then `hint`, then the default) and runs the recognition on the pool.
 * - `engine.ocrConfig` is set only when `alwaysExtractEmbeddedImages` is
 *   true. Any value it replaces is put back on dispose.
 *
 * `handle.dispose()` is idempotent and only removes what this call
 * installed, so disposing a stale handle cannot strip hooks that a later
 * `enableOcr` call attached. After dispose, `handle.preload()` is a no-op.
 *
 * Example:
 *
 * ```ts
 * import { AlbexEngine } from 'albex';
 * import { enableOcr } from '@albex/ocr';
 *
 * const engine = new AlbexEngine();
 * await engine.init();
 *
 * const ocr = enableOcr(engine, { preload: ['eng', 'spa'] });
 *
 * const blob: Blob = await fetch('/scan.png').then(r => r.blob());
 * const { text } = await engine.ocrImage(blob);
 * ```
 *
 * @param engine Host engine to attach to.
 * @param opts Languages, preload list, worker-pool tuning, hybrid-PDF flag.
 * @returns Handle for preloading, inspecting and tearing down OCR.
 * @throws AlbexOcrError (`ocr_already_enabled`) if `engine.ocrImage` is
 *   already set.
 */
export function enableOcr<T extends OcrCapableEngine>(
  engine: T,
  opts: OcrOptions = {},
): OcrHandle {
  if (engine.ocrImage) {
    throw new AlbexOcrError(
      'ocr_already_enabled',
      'enableOcr called on an engine that already has OCR attached. Call dispose() on the previous handle first.',
    );
  }

  const enabledLangs = new Set<Lang>(opts.languages ?? SUPPORTED_LANGS);
  const defaultLang = opts.defaultLanguage ?? 'eng';
  const pool = new OcrWorkerPool({
    idleTimeoutMs: opts.idleTimeoutMs,
    langPath: opts.langPath,
  });
  let disposed = false;

  // Warm the workers for `langs` by recognising a 1×1 image. Duplicates are
  // collapsed so a language is requested once per call. Failures are
  // deliberately ignored: warming is best-effort, and a real recognise call
  // will surface any persistent error.
  const warm = async (langs: readonly Lang[]): Promise<void> => {
    if (disposed) return; // never spawn workers in a torn-down pool
    const wanted = [...new Set(langs)].filter(l => enabledLangs.has(l));
    await Promise.all(
      wanted.map(l => pool.recognize(EMPTY_PIXEL, l).catch(() => { /* warming */ })),
    );
  };

  // Pre-load requested languages without awaiting (fire-and-forget). The
  // user can also call `handle.preload(...)` to await explicitly.
  if (opts.preload && opts.preload.length > 0) {
    void warm(opts.preload);
  }

  // Attach the recognise method to the engine. The reference is kept so
  // dispose() can tell whether the slot still holds OUR function.
  const ocrImage: NonNullable<OcrCapableEngine['ocrImage']> = async (image, recOpts) =>
    pool.recognize(image, pickLanguage(recOpts, enabledLangs, defaultLang));
  engine.ocrImage = ocrImage;

  // Hybrid-PDF flag. The engine reads this to decide whether to walk every
  // PDF's embedded images on top of the normal text extraction. Stored as
  // a separate slot so the structural contract with `albex` stays minimal.
  const previousConfig = engine.ocrConfig;
  const ownConfig = opts.alwaysExtractEmbeddedImages
    ? { alwaysExtractEmbeddedImages: true }
    : undefined;
  if (ownConfig) {
    engine.ocrConfig = ownConfig;
  }

  return {
    preload(langs) {
      return warm(langs);
    },
    loadedLanguages() {
      return pool.loadedLanguages();
    },
    async dispose() {
      if (disposed) return;
      disposed = true;
      // Detach before tearing the pool down so no new call can reach it.
      if (engine.ocrImage === ocrImage) {
        delete engine.ocrImage;
      }
      if (ownConfig && engine.ocrConfig === ownConfig) {
        if (previousConfig === undefined) {
          delete engine.ocrConfig;
        } else {
          engine.ocrConfig = previousConfig;
        }
      }
      await pool.dispose();
    },
  };
}
178
+
179
/**
 * Chooses the language for one recognise call. Candidates are tried in
 * order and the first one that is set AND enabled wins:
 * 1. `opts.lang`
 * 2. `opts.hint`
 *
 * If neither qualifies, `fallback` is returned as-is (it is not checked
 * against `enabled`).
 *
 * (Automatic from-text detection can only happen AFTER a first recognise,
 * since there is nothing to detect from before that. A second pass with the
 * corrected language is a future feature.)
 *
 * @param opts Per-call options; may be undefined.
 * @param enabled Languages enabled for this engine.
 * @param fallback Language to use when no candidate qualifies.
 */
function pickLanguage(
  opts: OcrRecognizeOptions | undefined,
  enabled: ReadonlySet<Lang>,
  fallback: Lang,
): Lang {
  for (const candidate of [opts?.lang, opts?.hint]) {
    if (candidate && enabled.has(candidate)) {
      return candidate;
    }
  }
  return fallback;
}
198
+
199
/**
 * A 1×1 transparent PNG (RGBA, 8-bit) used to warm a language worker
 * without making real inference effort. It is returned as an `ArrayBuffer`.
 *
 * Chunk layout: 8-byte signature, IHDR (13 data bytes), IDAT (10 data
 * bytes: a zlib stream that inflates to one filter byte plus four zero
 * bytes), IEND (0 data bytes). Each chunk's length field must equal the
 * number of data bytes it carries. A wrong length makes a decoder read past
 * the CRC and mis-parse the rest of the file.
 */
const EMPTY_PIXEL = (() => {
  const bytes = new Uint8Array([
    // PNG signature
    0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A,
    // IHDR: length 13, type, 1×1, bit depth 8, colour type 6 (RGBA), CRC
    0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52,
    0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01,
    0x08, 0x06, 0x00, 0x00, 0x00, 0x1F, 0x15, 0xC4,
    0x89,
    // IDAT: length 10, type, zlib data, CRC
    0x00, 0x00, 0x00, 0x0A, 0x49, 0x44, 0x41, 0x54,
    0x78, 0x9C, 0x63, 0x00, 0x01, 0x00, 0x00, 0x05,
    0x00, 0x01, 0x0D, 0x0A, 0x2D, 0xB4,
    // IEND: length 0, type, CRC
    0x00, 0x00, 0x00, 0x00, 0x49, 0x45, 0x4E, 0x44,
    0xAE, 0x42, 0x60, 0x82,
  ]);
  return bytes.buffer;
})();
218
+
219
+ // Re-export the detector so consumers can use it directly when they have
220
+ // known-good source text to identify the document's language before OCR.
221
+ export { detectLanguage, detectLanguageOr, scoreLanguages, SUPPORTED_LANGS } from './language-detector.js';
222
+ export type { Lang } from './language-detector.js';
223
+ export type { RecognitionResult, ImageLike } from './ocr-worker.js';
224
+ export {
225
+ AlbexOcrError,
226
+ AlbexOcrInitError,
227
+ AlbexOcrLanguageError,
228
+ AlbexOcrRecognitionError,
229
+ } from './errors.js';