npm - @albex/ocr - Versions diffs - 0.2.0 - Mend

@albex/ocr 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

package/README.md +180 -0
package/dist/errors.d.ts +20 -0
package/dist/errors.d.ts.map +1 -0
package/dist/errors.js +34 -0
package/dist/errors.js.map +1 -0
package/dist/index.d.ts +28 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +26 -0
package/dist/index.js.map +1 -0
package/dist/language-detector.d.ts +39 -0
package/dist/language-detector.d.ts.map +1 -0
package/dist/language-detector.js +111 -0
package/dist/language-detector.js.map +1 -0
package/dist/ocr-worker.d.ts +71 -0
package/dist/ocr-worker.d.ts.map +1 -0
package/dist/ocr-worker.js +146 -0
package/dist/ocr-worker.js.map +1 -0
package/dist/orchestrator.d.ts +117 -0
package/dist/orchestrator.d.ts.map +1 -0
package/dist/orchestrator.js +120 -0
package/dist/orchestrator.js.map +1 -0
package/package.json +45 -0
package/src/errors.ts +37 -0
package/src/index.ts +48 -0
package/src/language-detector.ts +129 -0
package/src/ocr-worker.ts +223 -0
package/src/orchestrator.ts +229 -0

package/src/ocr-worker.ts ADDED Viewed

@@ -0,0 +1,223 @@
+/**
+ * Wrapper around Tesseract.js.
+ *
+ * Responsibilities:
+ *   1. Lazy-load the Tesseract.js library on first use (dynamic import).
+ *   2. Maintain a per-language `Worker` instance, created on demand.
+ *   3. Auto-terminate idle workers after a configurable timeout to release
+ *      the LSTM model from memory (each Tesseract worker holds 2-5 MB).
+ *   4. Expose a Promise-based `recognize(image, lang)` that returns text +
+ *      confidence in a stable shape.
+ *
+ * Why one worker per language: Tesseract.js workers are tied to the language
+ * model they were initialised with. Switching languages on the same worker
+ * triggers a slow reload of the LSTM. Maintaining N workers — one per
+ * language ever used — keeps each recognise call fast at the cost of slightly
+ * more memory, which the idle eviction then claws back.
+ */
+import type { Lang } from './language-detector.js';
+import {
+  AlbexOcrInitError,
+  AlbexOcrLanguageError,
+  AlbexOcrRecognitionError,
+} from './errors.js';
+/**
+ * Minimal subset of the Tesseract.js Worker surface we use.
+ *
+ * Declared locally so this file is typed against only the two methods it
+ * calls; the dynamically imported module is cast to these shapes in
+ * `loadTesseract`.
+ *
+ *   - `recognize(image)` resolves to an object whose `data` carries the
+ *     recognised `text` and a `confidence` score.
+ *   - `terminate()` shuts the worker down; the pool calls it when evicting
+ *     an idle worker and when disposing.
+ */
+interface TesseractWorker {
+  recognize(image: ImageLike): Promise<{ data: { text: string; confidence: number } }>;
+  terminate(): Promise<void>;
+}
+/**
+ * Call signature of the `createWorker` factory as this file invokes it:
+ * a language code, an optional numeric OEM (engine mode) and optional
+ * path overrides, resolving to a worker.
+ */
+type TesseractCreateWorker = (
+  lang: string,
+  oem?: number,
+  opts?: { workerPath?: string; corePath?: string; langPath?: string },
+) => Promise<TesseractWorker>;
+/**
+ * The single export of the `tesseract.js` module this file relies on.
+ * `loadTesseract` checks at runtime that `createWorker` really is a function
+ * before handing the module out.
+ */
+interface TesseractModule {
+  createWorker: TesseractCreateWorker;
+}
+/**
+ * Anything this package forwards to Tesseract.js as an input image.
+ *
+ * The union covers binary data (`Blob`, `ArrayBuffer`, `Uint8Array`), a
+ * `string` (presumably a URL or data URI — confirm against the Tesseract.js
+ * documentation), and DOM sources (`HTMLImageElement`, `HTMLCanvasElement`,
+ * `OffscreenCanvas`). No discrimination happens in this file: the value is
+ * handed to the worker's `recognize` unchanged and Tesseract.js itself
+ * works out which kind it received.
+ */
+export type ImageLike =
+  | Blob
+  | ArrayBuffer
+  | Uint8Array
+  | string
+  | HTMLImageElement
+  | HTMLCanvasElement
+  | OffscreenCanvas;
+/** Outcome of one `OcrWorkerPool.recognize` call. */
+export interface RecognitionResult {
+  /** Raw OCR output. May contain newlines for paragraphs / lines. */
+  text: string;
+  /** 0-100 confidence reported by Tesseract for the page. */
+  confidence: number;
+  /**
+   * Wall-clock time spent on this recognition, in milliseconds, measured
+   * with `performance.now()` around the worker call only: obtaining (and,
+   * on first use, creating) the language worker is not included.
+   */
+  timeMs: number;
+}
+/** Construction options for `OcrWorkerPool`. Every field is optional. */
+export interface OcrWorkerOptions {
+  /**
+   * Milliseconds of inactivity after which an idle Tesseract worker is
+   * terminated and its language model released. Default: 5 minutes.
+   * Set 0 to disable eviction (worker stays until the pool is disposed or
+   * for the lifetime of the page). Any value that is not greater than zero
+   * disables eviction as well.
+   *
+   * Idle workers are found by a periodic sweep that runs every minute, or
+   * every `idleTimeoutMs` when that is shorter, so an eviction can lag the
+   * threshold by up to one sweep interval.
+   */
+  idleTimeoutMs?: number;
+  /**
+   * Override the `tessdata_fast` mirror. Defaults to the official Tesseract.js
+   * jsDelivr mirror, which is what `tesseract.js` ships with anyway.
+   * When omitted (or empty) no `langPath` is passed to `createWorker` at
+   * all, so whatever default Tesseract.js applies is used.
+   */
+  langPath?: string;
+}
+/** Book-keeping record the pool keeps for each live worker. */
+interface WorkerEntry {
+  /** The Tesseract.js worker itself. */
+  worker: TesseractWorker;
+  /** Language the worker was created for; also its key in the pool's map. */
+  lang: Lang;
+  /**
+   * `Date.now()` timestamp of the last activity recorded for this worker:
+   * set when the worker is created and refreshed by `recognize`. The idle
+   * sweep compares it against the idle timeout.
+   */
+  lastUsedAt: number;
+  /**
+   * Number of recognitions currently in flight on this worker. The idle
+   * sweep skips every entry where this is non-zero, so eviction never
+   * terminates a worker mid-recognition.
+   */
+  pending: number;
+}
+/**
+ * Single global Tesseract module reference. The dynamic import is shared
+ * across every OCR call once it resolves. Reset to `null` when a load
+ * attempt fails so that the next call can retry.
+ */
+let tesseractPromise: Promise<TesseractModule> | null = null;
+/**
+ * Lazily imports `tesseract.js` and memoises the result.
+ *
+ * Concurrent callers share the same in-flight promise. The module is
+ * accepted when it exposes `createWorker` either as a named export or, for
+ * CommonJS-interop loaders that only populate `default`, on the default
+ * export.
+ *
+ * @returns The object that carries `createWorker`.
+ * @throws AlbexOcrInitError when the import fails or no `createWorker`
+ *   function can be found. The cached promise is cleared first, so a later
+ *   call starts a fresh import.
+ */
+function loadTesseract(): Promise<TesseractModule> {
+  if (tesseractPromise) return tesseractPromise;
+  tesseractPromise = (async () => {
+    try {
+      // The bundler resolves this to the `tesseract.js` ESM entry. Tesseract
+      // internally pulls its own WASM core and worker script lazily, so we
+      // don't pay for them until createWorker is called.
+      const imported = (await import('tesseract.js')) as unknown as
+        Partial<TesseractModule> & { default?: Partial<TesseractModule> };
+      // Prefer the named export; fall back to the default export, which is
+      // where a CommonJS `module.exports` object ends up under ESM interop.
+      const mod = [imported, imported.default].find(
+        candidate => typeof candidate?.createWorker === 'function',
+      );
+      if (!mod) {
+        throw new Error('tesseract.js: createWorker export missing');
+      }
+      return mod as TesseractModule;
+    } catch (e) {
+      tesseractPromise = null; // allow retry
+      // Loaders may reject with a non-Error value; never print "undefined".
+      const reason = e instanceof Error ? e.message : String(e);
+      throw new AlbexOcrInitError(`Failed to load tesseract.js: ${reason}`);
+    }
+  })();
+  return tesseractPromise;
+}
+/**
+ * Pool of Tesseract.js workers, one per language, created on demand.
+ *
+ * - `recognize` obtains (or creates) the worker for the requested language
+ *   and runs one recognition on it.
+ * - When `idleTimeoutMs` is greater than zero a periodic sweep terminates
+ *   workers that have had no activity for that long and no recognition in
+ *   flight.
+ * - `dispose` terminates everything. The eviction timer is not restarted
+ *   afterwards, so a pool should not be reused once disposed.
+ */
+export class OcrWorkerPool {
+  /** Ready workers, keyed by the language they were created for. */
+  private _workers = new Map<Lang, WorkerEntry>();
+  /**
+   * Worker creations still in flight, keyed by language. Concurrent
+   * `recognize` calls for a language that is not loaded yet all await the
+   * same promise instead of each creating a worker (of which all but the
+   * last one stored would never be terminated).
+   */
+  private _creating = new Map<Lang, Promise<WorkerEntry>>();
+  /** Incremented by `dispose()` so an in-flight creation can tell it was orphaned. */
+  private _generation = 0;
+  private _evictionTimer: ReturnType<typeof setInterval> | null = null;
+  private readonly _idleTimeoutMs: number;
+  private readonly _langPath: string | undefined;
+  /**
+   * @param opts Idle-eviction timeout and optional language-data location.
+   */
+  constructor(opts: OcrWorkerOptions = {}) {
+    this._idleTimeoutMs = opts.idleTimeoutMs ?? 5 * 60_000;
+    this._langPath      = opts.langPath;
+    if (this._idleTimeoutMs > 0) {
+      // Sweep every minute. We don't need finer granularity for "did this
+      // worker idle past the threshold?".
+      this._evictionTimer = setInterval(
+        () => { void this._sweepIdle(); },
+        Math.min(60_000, this._idleTimeoutMs),
+      );
+      // Don't keep the event loop alive just because we have a timer.
+      // (Browsers don't have unref; Node does.)
+      const t = this._evictionTimer as unknown as { unref?: () => void };
+      t.unref?.();
+    }
+  }
+  /**
+   * Run OCR on a single image. Spawns the appropriate language worker on
+   * first use and caches it; subsequent calls for the same language reuse it.
+   *
+   * @param image Input forwarded unchanged to the worker.
+   * @param lang  Language whose worker should run the recognition.
+   * @returns Text, confidence and the time spent inside the worker call.
+   * @throws AlbexOcrInitError if `tesseract.js` cannot be loaded.
+   * @throws AlbexOcrLanguageError if the language worker cannot be created.
+   * @throws AlbexOcrRecognitionError if the recognition itself fails.
+   */
+  async recognize(image: ImageLike, lang: Lang): Promise<RecognitionResult> {
+    const entry = await this._getOrCreate(lang);
+    entry.pending++;
+    const t0 = performance.now();
+    try {
+      const { data } = await entry.worker.recognize(image);
+      const elapsed = performance.now() - t0;
+      return { text: data.text, confidence: data.confidence, timeMs: elapsed };
+    } catch (e) {
+      throw new AlbexOcrRecognitionError(
+        `OCR failed for language=${lang}: ${OcrWorkerPool._reason(e)}`,
+      );
+    } finally {
+      entry.pending--;
+      // Count failed attempts as activity too, so the idle clock always
+      // starts when the last recognition finished.
+      entry.lastUsedAt = Date.now();
+    }
+  }
+  /**
+   * Names of languages currently loaded in memory. Useful for diagnostics
+   * and for the demo's runtime panel. Languages whose worker is still being
+   * created are not included.
+   */
+  loadedLanguages(): Lang[] {
+    return [...this._workers.keys()];
+  }
+  /**
+   * Tear down all workers immediately. Called by orchestrator dispose.
+   * Workers are terminated even if a recognition is in flight on them.
+   * Never rejects: individual `terminate()` failures are ignored.
+   */
+  async dispose(): Promise<void> {
+    if (this._evictionTimer) {
+      clearInterval(this._evictionTimer);
+      this._evictionTimer = null;
+    }
+    // Orphan every creation that is still in flight; `_spawn` terminates
+    // the worker itself once it notices the generation changed.
+    this._generation++;
+    this._creating.clear();
+    const all = [...this._workers.values()];
+    this._workers.clear();
+    await Promise.allSettled(all.map(e => e.worker.terminate()));
+  }
+  /** TC39 explicit-resource-management alias. Fires `dispose()` async. */
+  [Symbol.dispose](): void {
+    void this.dispose();
+  }
+  // ── internals ─────────────────────────────────────────────────────────
+  /** Message of a thrown value, whether or not it is an `Error`. */
+  private static _reason(e: unknown): string {
+    return e instanceof Error ? e.message : String(e);
+  }
+  /**
+   * Returns the cached worker for `lang`, joins a creation that is already
+   * under way, or starts a new one.
+   */
+  private _getOrCreate(lang: Lang): Promise<WorkerEntry> {
+    const existing = this._workers.get(lang);
+    if (existing) return Promise.resolve(existing);
+    const inFlight = this._creating.get(lang);
+    if (inFlight) return inFlight;
+    const creation: Promise<WorkerEntry> = this._spawn(lang).finally(() => {
+      // Only clear our own slot; dispose() may have emptied the map and a
+      // newer creation may have taken the key since.
+      if (this._creating.get(lang) === creation) this._creating.delete(lang);
+    });
+    this._creating.set(lang, creation);
+    return creation;
+  }
+  /**
+   * Creates the worker for `lang` and registers it in the pool.
+   *
+   * @throws AlbexOcrLanguageError if `createWorker` fails, or if the pool
+   *   was disposed while the worker was loading (the fresh worker is
+   *   terminated rather than cached where nothing would ever evict it).
+   */
+  private async _spawn(lang: Lang): Promise<WorkerEntry> {
+    const generation = this._generation;
+    const tess = await loadTesseract();
+    let worker: TesseractWorker;
+    try {
+      // Tesseract.js v5+ createWorker(lang, oem, opts). `oem: 1` selects the
+      // LSTM-only engine mode.
+      const opts = this._langPath ? { langPath: this._langPath } : undefined;
+      worker = await tess.createWorker(lang, 1, opts);
+    } catch (e) {
+      throw new AlbexOcrLanguageError(
+        lang,
+        `Failed to load language model "${lang}": ${OcrWorkerPool._reason(e)}`,
+      );
+    }
+    if (generation !== this._generation) {
+      // Best-effort cleanup; a failing terminate must not mask the real error.
+      await worker.terminate().catch(() => undefined);
+      throw new AlbexOcrLanguageError(
+        lang,
+        `Failed to load language model "${lang}": pool was disposed while loading`,
+      );
+    }
+    const entry: WorkerEntry = {
+      worker,
+      lang,
+      lastUsedAt: Date.now(),
+      pending: 0,
+    };
+    this._workers.set(lang, entry);
+    return entry;
+  }
+  /**
+   * Terminates every worker that has no recognition in flight and whose
+   * last activity is at least `idleTimeoutMs` old. Victims are removed from
+   * the map before termination starts, so new calls create a fresh worker.
+   */
+  private async _sweepIdle(): Promise<void> {
+    const now = Date.now();
+    const victims: WorkerEntry[] = [];
+    for (const [lang, entry] of this._workers) {
+      if (entry.pending > 0) continue;
+      if (now - entry.lastUsedAt < this._idleTimeoutMs) continue;
+      this._workers.delete(lang);
+      victims.push(entry);
+    }
+    await Promise.allSettled(victims.map(v => v.worker.terminate()));
+  }
+}

package/src/orchestrator.ts ADDED Viewed

@@ -0,0 +1,229 @@
+/**
+ * Public surface of `@albex/ocr`.
+ *
+ * The orchestrator wires an `OcrWorkerPool` to an `AlbexEngine` instance.
+ * After `enableOcr(engine)`:
+ *
+ *   - `engine.ocrImage(image, opts?)` becomes available and resolves to a `RecognitionResult` (text, confidence, elapsed time).
+ *   - The text can be indexed manually with `engine.indexText(name, text)`
+ *     if such a primitive is exposed by the host (future addition).
+ *
+ * The returned `OcrHandle` lets the caller probe state, query loaded
+ * languages, force pre-load of specific languages, or shut down.
+ */
+import { OcrWorkerPool } from './ocr-worker.js';
+import type { ImageLike, RecognitionResult } from './ocr-worker.js';
+import { SUPPORTED_LANGS } from './language-detector.js';
+import type { Lang } from './language-detector.js';
+import { AlbexOcrError } from './errors.js';
+/**
+ * The subset of `AlbexEngine` we need. Kept minimal so this package's only
+ * peer dependency on `albex` is a type contract.
+ *
+ * Both members are optional slots that `enableOcr` writes onto the engine
+ * object, so an engine that does not declare them still satisfies the type.
+ */
+export interface OcrCapableEngine {
+  /**
+   * Storage slot where the orchestrator parks its public ocrImage method.
+   * `enableOcr` treats a non-empty slot as "OCR already attached" and
+   * refuses to attach a second time.
+   */
+  ocrImage?: (image: ImageLike, opts?: OcrRecognizeOptions) => Promise<RecognitionResult>;
+  /**
+   * Structural slot the engine reads to decide whether to invoke OCR on
+   * embedded images of PDFs that ALSO have vector text. Set by `enableOcr`
+   * when the `alwaysExtractEmbeddedImages` option is true.
+   */
+  ocrConfig?: { alwaysExtractEmbeddedImages?: boolean };
+}
+/** Options accepted by `enableOcr`. Every field is optional. */
+export interface OcrOptions {
+  /**
+   * Allow-list of languages this integration will use. A `lang`, `hint` or
+   * `preload` entry outside this list is ignored. Default: every entry of
+   * `SUPPORTED_LANGS`.
+   *
+   * NOTE(review): the orchestrator stores the list in a `Set` and never
+   * consults its order, and no automatic detection is wired in yet, so the
+   * order currently has no effect.
+   */
+  languages?: readonly Lang[];
+  /**
+   * Language used when a recognise call supplies neither an enabled `lang`
+   * nor an enabled `hint`. Default: `eng`. The value is not checked against
+   * `languages`.
+   */
+  defaultLanguage?: Lang;
+  /**
+   * Eagerly load these languages on `enableOcr` instead of waiting for
+   * the first call that needs them. Useful when you know the corpus
+   * language ahead of time and want a warm engine.
+   *
+   * Loading starts in the background and is not awaited by `enableOcr`;
+   * failures are swallowed. Use the handle's `preload()` to wait for it.
+   */
+  preload?: readonly Lang[];
+  /**
+   * Milliseconds of inactivity after which idle Tesseract workers are
+   * terminated and their language models released. Default: 5 minutes.
+   * Passed through unchanged to the worker pool.
+   */
+  idleTimeoutMs?: number;
+  /**
+   * Override the location from which `<lang>_fast.traineddata` is fetched.
+   * Defaults to the jsDelivr mirror Tesseract.js uses internally.
+   * Passed through unchanged to the worker pool.
+   */
+  langPath?: string;
+  /**
+   * Hybrid PDF mode. When `true`, the engine OCRs embedded images of
+   * EVERY PDF, not just PDFs that have no extractable vector text.
+   * Useful for documents that mix native text with scanned annexes,
+   * stamps, signatures or diagrams with text labels.
+   *
+   * Cost: 1–3 s of OCR per qualifying image (only images larger than
+   * 200×200 are sent to Tesseract; logos, scanner-corner marks and
+   * signature glyphs are skipped server-side in Rust).
+   *
+   * Default: `false`. Keep it off unless your corpus is known to
+   * contain hybrid PDFs.
+   *
+   * NOTE(review): this package only sets
+   * `engine.ocrConfig = { alwaysExtractEmbeddedImages: true }`; the size
+   * filter and timing above describe the host engine and cannot be
+   * verified from this file — confirm against `albex`.
+   */
+  alwaysExtractEmbeddedImages?: boolean;
+}
+/**
+ * Per-call options for `engine.ocrImage`. The language is resolved in the
+ * order `lang` → `hint` → `defaultLanguage`; no detection from OCR output
+ * is performed yet.
+ */
+export interface OcrRecognizeOptions {
+  /**
+   * Preferred language for this call. It is used only when it belongs to
+   * the enabled languages; otherwise it is skipped without an error and
+   * resolution continues with `hint`, then `defaultLanguage`.
+   */
+  lang?: Lang;
+  /**
+   * Second-choice language, consulted when `lang` is omitted or not
+   * enabled. Useful when you know the doc is e.g. Spanish but don't want
+   * to lock it. Like `lang`, it is ignored unless it is enabled.
+   */
+  hint?: Lang;
+}
+/** Control surface returned by `enableOcr`. */
+export interface OcrHandle {
+  /**
+   * Pre-load the language models for the listed languages. Languages that
+   * are not enabled are skipped. The promise resolves once every warm-up
+   * attempt has settled; individual failures are swallowed.
+   */
+  preload(langs: readonly Lang[]): Promise<void>;
+  /** Currently loaded (in-memory) languages. */
+  loadedLanguages(): Lang[];
+  /** Tear down all workers and unhook from the engine. */
+  dispose(): Promise<void>;
+}
+/**
+ * Hook OCR into an Albex engine. Returns a handle that lets the caller
+ * preload languages, inspect state, and dispose cleanly.
+ *
+ * Side effects on `engine`: `ocrImage` is installed, and `ocrConfig` is set
+ * when `opts.alwaysExtractEmbeddedImages` is truthy. The handle's
+ * `dispose()` first removes what this call installed and only then tears
+ * down the workers, so no new recognition can start against a pool that is
+ * shutting down. Calling `dispose()` more than once is a no-op.
+ *
+ * @param engine Engine object to attach to.
+ * @param opts   Enabled languages, default language, preload list, pool
+ *   tuning and the hybrid-PDF flag.
+ * @returns Handle for preloading, inspection and teardown.
+ * @throws AlbexOcrError (`ocr_already_enabled`) if `engine.ocrImage` is
+ *   already set.
+ *
+ * Example:
+ *
+ * ```ts
+ * import { AlbexEngine } from 'albex';
+ * import { enableOcr } from '@albex/ocr';
+ *
+ * const engine = new AlbexEngine();
+ * await engine.init();
+ *
+ * const ocr = enableOcr(engine, { preload: ['eng', 'spa'] });
+ *
+ * const blob: Blob = await fetch('/scan.png').then(r => r.blob());
+ * const { text } = await engine.ocrImage(blob);
+ * ```
+ */
+export function enableOcr<T extends OcrCapableEngine>(
+  engine: T,
+  opts: OcrOptions = {},
+): OcrHandle {
+  if (engine.ocrImage) {
+    throw new AlbexOcrError(
+      'ocr_already_enabled',
+      'enableOcr called on an engine that already has OCR attached. Call dispose() on the previous handle first.',
+    );
+  }
+  const enabledLangs = new Set<Lang>(opts.languages ?? SUPPORTED_LANGS);
+  const defaultLang  = opts.defaultLanguage ?? 'eng';
+  const pool = new OcrWorkerPool({
+    idleTimeoutMs: opts.idleTimeoutMs,
+    langPath:      opts.langPath,
+  });
+  // Warm the workers of the enabled languages among `langs`. Shared by the
+  // `preload` option and the handle's `preload()`. Never rejects.
+  const warm = async (langs: readonly Lang[]): Promise<void> => {
+    await Promise.all(
+      langs
+        .filter(l => enabledLangs.has(l))
+        .map(l => pool.recognize(EMPTY_PIXEL, l).catch(() => { /* model warmed; the recognise on a 1x1 image is fine to fail */ })),
+    );
+  };
+  // Pre-load requested languages without awaiting (fire-and-forget). The
+  // user can also call `handle.preload(...)` to await explicitly.
+  if (opts.preload && opts.preload.length > 0) {
+    void warm(opts.preload);
+  }
+  // Attach the recognise method to the engine.
+  engine.ocrImage = async (image, recOpts) => {
+    const targetLang = pickLanguage(recOpts, enabledLangs, defaultLang);
+    return pool.recognize(image, targetLang);
+  };
+  // Hybrid-PDF flag. The engine reads this to decide whether to walk every
+  // PDF's embedded images on top of the normal text extraction. Stored as
+  // a separate slot so the structural contract with `albex` stays minimal.
+  const configInstalled = Boolean(opts.alwaysExtractEmbeddedImages);
+  if (configInstalled) {
+    engine.ocrConfig = { alwaysExtractEmbeddedImages: true };
+  }
+  let disposed = false;
+  return {
+    preload(langs) {
+      return warm(langs);
+    },
+    loadedLanguages() {
+      return pool.loadedLanguages();
+    },
+    async dispose() {
+      // A stale handle must not strip hooks a later enableOcr installed.
+      if (disposed) return;
+      disposed = true;
+      // Unhook first: once the slot is empty nobody can start a recognition
+      // on the pool while its workers are being terminated.
+      delete engine.ocrImage;
+      // Only remove the config this call put there.
+      if (configInstalled) delete engine.ocrConfig;
+      await pool.dispose();
+    },
+  };
+}
+/**
+ * Resolves which language a recognise call should run with.
+ *
+ * Candidates are tried in order — `opts.lang`, then `opts.hint` — and the
+ * first one that is set and present in `enabled` wins. A candidate that is
+ * not enabled is skipped silently. When neither qualifies, `fallback` is
+ * returned as-is (it is not checked against `enabled`).
+ *
+ * (Automatic from-text detection happens AFTER the first recognise, not
+ * before — until we have output we have nothing to detect from. A second
+ * pass with the corrected language is a future feature.)
+ */
+function pickLanguage(
+  opts: OcrRecognizeOptions | undefined,
+  enabled: ReadonlySet<Lang>,
+  fallback: Lang,
+): Lang {
+  for (const candidate of [opts?.lang, opts?.hint]) {
+    if (candidate && enabled.has(candidate)) return candidate;
+  }
+  return fallback;
+}
+/**
+ * A 1×1 transparent PNG used to warm a language worker without making real
+ * inference effort. Tesseract.js still bootstraps the LSTM on first
+ * recognise, which is the slow part we want done in advance.
+ *
+ * Layout (67 bytes): 8-byte signature, IHDR (1×1, 8-bit RGBA), a 10-byte
+ * IDAT that inflates to one filter byte plus one all-zero RGBA pixel, and
+ * IEND. Each chunk is `length (4) | type (4) | data | CRC (4)`, so the
+ * declared IDAT length must equal the 10 data bytes that follow its type.
+ */
+const EMPTY_PIXEL = (() => {
+  const bytes = new Uint8Array([
+    0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A,
+    0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52,
+    0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01,
+    0x08, 0x06, 0x00, 0x00, 0x00, 0x1F, 0x15, 0xC4,
+    // IDAT length is 0x0A (10), not 0x0D: only ten data bytes precede the CRC.
+    0x89, 0x00, 0x00, 0x00, 0x0A, 0x49, 0x44, 0x41,
+    0x54, 0x78, 0x9C, 0x63, 0x00, 0x01, 0x00, 0x00,
+    0x05, 0x00, 0x01, 0x0D, 0x0A, 0x2D, 0xB4, 0x00,
+    0x00, 0x00, 0x00, 0x49, 0x45, 0x4E, 0x44, 0xAE,
+    0x42, 0x60, 0x82,
+  ]);
+  return bytes.buffer;
+})();
+// Re-export the detector so consumers can use it directly when they have
+// known-good source text to identify the document's language before OCR.
+export { detectLanguage, detectLanguageOr, scoreLanguages, SUPPORTED_LANGS } from './language-detector.js';
+export type { Lang } from './language-detector.js';
+export type { RecognitionResult, ImageLike } from './ocr-worker.js';
+export {
+  AlbexOcrError,
+  AlbexOcrInitError,
+  AlbexOcrLanguageError,
+  AlbexOcrRecognitionError,
+} from './errors.js';