@kreuzberg/wasm 4.0.0-rc.23 → 4.0.0-rc.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,92 +0,0 @@
1
- "use strict";
2
- var __defProp = Object.defineProperty; // short aliases for the Object reflection helpers used below
3
- var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
- var __getOwnPropNames = Object.getOwnPropertyNames;
5
- var __hasOwnProp = Object.prototype.hasOwnProperty;
6
- var __export = (target, all) => { // defines every key of `all` on `target` as an enumerable getter
7
- for (var name in all)
8
- __defProp(target, name, { get: all[name], enumerable: true }); // the value of all[name] is itself the getter function
9
- };
10
- var __copyProps = (to, from, except, desc) => { // mirrors the own properties of `from` onto `to` as getters; returns `to`
11
- if (from && typeof from === "object" || typeof from === "function") { // anything else is skipped and `to` is returned untouched
12
- for (let key of __getOwnPropNames(from))
13
- if (!__hasOwnProp.call(to, key) && key !== except) // never overwrite a key `to` already owns, and skip the `except` key
14
- __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable }); // getter reads from[key] on each access; enumerability follows the source descriptor
15
- }
16
- return to;
17
- };
18
- var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod); // fresh object flagged __esModule, populated from `mod`
19
-
20
- // typescript/ocr/registry.ts
21
- var registry_exports = {}; // export table for this module; filled with getters by __export below
22
- __export(registry_exports, {
23
- clearOcrBackends: () => clearOcrBackends, // arrow getters defer the lookup until the export is read
24
- getOcrBackend: () => getOcrBackend,
25
- listOcrBackends: () => listOcrBackends,
26
- registerOcrBackend: () => registerOcrBackend,
27
- unregisterOcrBackend: () => unregisterOcrBackend
28
- });
29
- module.exports = __toCommonJS(registry_exports); // publish the export table as the CommonJS module object
30
- var ocrBackendRegistry = /* @__PURE__ */ new Map(); // module-level registry: backend name (string) -> backend object
31
- function registerOcrBackend(backend) {
32
- if (!backend) {
33
- throw new Error("Backend cannot be null or undefined");
34
- }
35
- if (typeof backend.name !== "function") {
36
- throw new Error("Backend must implement name() method");
37
- }
38
- if (typeof backend.supportedLanguages !== "function") {
39
- throw new Error("Backend must implement supportedLanguages() method");
40
- }
41
- if (typeof backend.processImage !== "function") {
42
- throw new Error("Backend must implement processImage() method");
43
- }
44
- const backendName = backend.name();
45
- if (!backendName || typeof backendName !== "string") {
46
- throw new Error("Backend name must be a non-empty string");
47
- }
48
- if (ocrBackendRegistry.has(backendName)) {
49
- console.warn(`OCR backend "${backendName}" is already registered and will be replaced`);
50
- }
51
- ocrBackendRegistry.set(backendName, backend);
52
- }
53
- function getOcrBackend(name) {
54
- return ocrBackendRegistry.get(name);
55
- }
56
- function listOcrBackends() {
57
- return Array.from(ocrBackendRegistry.keys());
58
- }
59
- async function unregisterOcrBackend(name) {
60
- const backend = ocrBackendRegistry.get(name);
61
- if (!backend) {
62
- throw new Error(
63
- `OCR backend "${name}" is not registered. Available backends: ${Array.from(ocrBackendRegistry.keys()).join(", ")}`
64
- );
65
- }
66
- if (typeof backend.shutdown === "function") {
67
- try {
68
- await backend.shutdown();
69
- } catch (error) {
70
- console.warn(
71
- `Error shutting down OCR backend "${name}": ${error instanceof Error ? error.message : String(error)}`
72
- );
73
- }
74
- }
75
- ocrBackendRegistry.delete(name);
76
- }
77
- async function clearOcrBackends() {
78
- const backends = Array.from(ocrBackendRegistry.entries());
79
- for (const [name, backend] of backends) {
80
- if (typeof backend.shutdown === "function") {
81
- try {
82
- await backend.shutdown();
83
- } catch (error) {
84
- console.warn(
85
- `Error shutting down OCR backend "${name}": ${error instanceof Error ? error.message : String(error)}`
86
- );
87
- }
88
- }
89
- }
90
- ocrBackendRegistry.clear();
91
- }
92
- //# sourceMappingURL=registry.cjs.map
@@ -1 +0,0 @@
1
- {"version":3,"sources":["../../typescript/ocr/registry.ts"],"sourcesContent":["/**\n * OCR Backend Registry\n *\n * Provides a registry for OCR backends in the WASM environment.\n * This enables auto-registration and management of OCR backends.\n *\n * Note: The WASM package provides a lightweight registry in the browser.\n * For more advanced features like Rust integration, use @kreuzberg/node or @kreuzberg/deno.\n *\n * @example\n * ```typescript\n * import { TesseractWasmBackend } from '@kreuzberg/wasm/ocr/tesseract-wasm-backend';\n * import { enableOcr } from '@kreuzberg/wasm';\n *\n * // Simple auto-registration\n * await enableOcr();\n * ```\n */\n\nimport type { OcrBackendProtocol } from \"../types.js\";\n\n/** Global registry of OCR backends */\nconst ocrBackendRegistry = new Map<string, OcrBackendProtocol>();\n\n/**\n * Register an OCR backend\n *\n * Registers an OCR backend with the WASM extraction pipeline.\n * If a backend with the same name is already registered, it will be replaced.\n *\n * @param backend - OCR backend implementing OcrBackendProtocol\n * @throws {Error} If backend validation fails\n *\n * @example\n * ```typescript\n * import { TesseractWasmBackend } from '@kreuzberg/wasm/ocr/tesseract-wasm-backend';\n * import { registerOcrBackend } from '@kreuzberg/wasm/ocr/registry';\n *\n * const backend = new TesseractWasmBackend();\n * await backend.initialize();\n * registerOcrBackend(backend);\n * ```\n */\nexport function registerOcrBackend(backend: OcrBackendProtocol): void {\n\tif (!backend) {\n\t\tthrow new Error(\"Backend cannot be null or undefined\");\n\t}\n\n\tif (typeof backend.name !== \"function\") {\n\t\tthrow new Error(\"Backend must implement name() method\");\n\t}\n\n\tif (typeof backend.supportedLanguages !== \"function\") {\n\t\tthrow new Error(\"Backend must implement supportedLanguages() method\");\n\t}\n\n\tif (typeof backend.processImage !== \"function\") {\n\t\tthrow new Error(\"Backend must implement processImage() 
method\");\n\t}\n\n\tconst backendName = backend.name();\n\n\tif (!backendName || typeof backendName !== \"string\") {\n\t\tthrow new Error(\"Backend name must be a non-empty string\");\n\t}\n\n\tif (ocrBackendRegistry.has(backendName)) {\n\t\tconsole.warn(`OCR backend \"${backendName}\" is already registered and will be replaced`);\n\t}\n\n\tocrBackendRegistry.set(backendName, backend);\n}\n\n/**\n * Get a registered OCR backend by name\n *\n * @param name - Backend name\n * @returns The OCR backend or undefined if not found\n *\n * @example\n * ```typescript\n * import { getOcrBackend } from '@kreuzberg/wasm/ocr/registry';\n *\n * const backend = getOcrBackend('tesseract-wasm');\n * if (backend) {\n * console.log('Available languages:', backend.supportedLanguages());\n * }\n * ```\n */\nexport function getOcrBackend(name: string): OcrBackendProtocol | undefined {\n\treturn ocrBackendRegistry.get(name);\n}\n\n/**\n * List all registered OCR backends\n *\n * @returns Array of registered backend names\n *\n * @example\n * ```typescript\n * import { listOcrBackends } from '@kreuzberg/wasm/ocr/registry';\n *\n * const backends = listOcrBackends();\n * console.log('Available OCR backends:', backends);\n * ```\n */\nexport function listOcrBackends(): string[] {\n\treturn Array.from(ocrBackendRegistry.keys());\n}\n\n/**\n * Unregister an OCR backend\n *\n * @param name - Backend name to unregister\n * @throws {Error} If backend is not found\n *\n * @example\n * ```typescript\n * import { unregisterOcrBackend } from '@kreuzberg/wasm/ocr/registry';\n *\n * unregisterOcrBackend('tesseract-wasm');\n * ```\n */\nexport async function unregisterOcrBackend(name: string): Promise<void> {\n\tconst backend = ocrBackendRegistry.get(name);\n\n\tif (!backend) {\n\t\tthrow new Error(\n\t\t\t`OCR backend \"${name}\" is not registered. 
Available backends: ${Array.from(ocrBackendRegistry.keys()).join(\", \")}`,\n\t\t);\n\t}\n\n\tif (typeof backend.shutdown === \"function\") {\n\t\ttry {\n\t\t\tawait backend.shutdown();\n\t\t} catch (error) {\n\t\t\tconsole.warn(\n\t\t\t\t`Error shutting down OCR backend \"${name}\": ${error instanceof Error ? error.message : String(error)}`,\n\t\t\t);\n\t\t}\n\t}\n\n\tocrBackendRegistry.delete(name);\n}\n\n/**\n * Clear all registered OCR backends\n *\n * Unregisters all OCR backends and calls their shutdown methods.\n *\n * @example\n * ```typescript\n * import { clearOcrBackends } from '@kreuzberg/wasm/ocr/registry';\n *\n * // Clean up all backends when shutting down\n * await clearOcrBackends();\n * ```\n */\nexport async function clearOcrBackends(): Promise<void> {\n\tconst backends = Array.from(ocrBackendRegistry.entries());\n\n\tfor (const [name, backend] of backends) {\n\t\tif (typeof backend.shutdown === \"function\") {\n\t\t\ttry {\n\t\t\t\tawait backend.shutdown();\n\t\t\t} catch (error) {\n\t\t\t\tconsole.warn(\n\t\t\t\t\t`Error shutting down OCR backend \"${name}\": ${error instanceof Error ? 
error.message : String(error)}`,\n\t\t\t\t);\n\t\t\t}\n\t\t}\n\t}\n\n\tocrBackendRegistry.clear();\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAsBA,IAAM,qBAAqB,oBAAI,IAAgC;AAqBxD,SAAS,mBAAmB,SAAmC;AACrE,MAAI,CAAC,SAAS;AACb,UAAM,IAAI,MAAM,qCAAqC;AAAA,EACtD;AAEA,MAAI,OAAO,QAAQ,SAAS,YAAY;AACvC,UAAM,IAAI,MAAM,sCAAsC;AAAA,EACvD;AAEA,MAAI,OAAO,QAAQ,uBAAuB,YAAY;AACrD,UAAM,IAAI,MAAM,oDAAoD;AAAA,EACrE;AAEA,MAAI,OAAO,QAAQ,iBAAiB,YAAY;AAC/C,UAAM,IAAI,MAAM,8CAA8C;AAAA,EAC/D;AAEA,QAAM,cAAc,QAAQ,KAAK;AAEjC,MAAI,CAAC,eAAe,OAAO,gBAAgB,UAAU;AACpD,UAAM,IAAI,MAAM,yCAAyC;AAAA,EAC1D;AAEA,MAAI,mBAAmB,IAAI,WAAW,GAAG;AACxC,YAAQ,KAAK,gBAAgB,WAAW,8CAA8C;AAAA,EACvF;AAEA,qBAAmB,IAAI,aAAa,OAAO;AAC5C;AAkBO,SAAS,cAAc,MAA8C;AAC3E,SAAO,mBAAmB,IAAI,IAAI;AACnC;AAeO,SAAS,kBAA4B;AAC3C,SAAO,MAAM,KAAK,mBAAmB,KAAK,CAAC;AAC5C;AAeA,eAAsB,qBAAqB,MAA6B;AACvE,QAAM,UAAU,mBAAmB,IAAI,IAAI;AAE3C,MAAI,CAAC,SAAS;AACb,UAAM,IAAI;AAAA,MACT,gBAAgB,IAAI,4CAA4C,MAAM,KAAK,mBAAmB,KAAK,CAAC,EAAE,KAAK,IAAI,CAAC;AAAA,IACjH;AAAA,EACD;AAEA,MAAI,OAAO,QAAQ,aAAa,YAAY;AAC3C,QAAI;AACH,YAAM,QAAQ,SAAS;AAAA,IACxB,SAAS,OAAO;AACf,cAAQ;AAAA,QACP,oCAAoC,IAAI,MAAM,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK,CAAC;AAAA,MACrG;AAAA,IACD;AAAA,EACD;AAEA,qBAAmB,OAAO,IAAI;AAC/B;AAeA,eAAsB,mBAAkC;AACvD,QAAM,WAAW,MAAM,KAAK,mBAAmB,QAAQ,CAAC;AAExD,aAAW,CAAC,MAAM,OAAO,KAAK,UAAU;AACvC,QAAI,OAAO,QAAQ,aAAa,YAAY;AAC3C,UAAI;AACH,cAAM,QAAQ,SAAS;AAAA,MACxB,SAAS,OAAO;AACf,gBAAQ;AAAA,UACP,oCAAoC,IAAI,MAAM,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK,CAAC;AAAA,QACrG;AAAA,MACD;AAAA,IACD;AAAA,EACD;AAEA,qBAAmB,MAAM;AAC1B;","names":[]}
@@ -1,102 +0,0 @@
1
- import { O as OcrBackendProtocol } from '../types-wVLLDHkl.cjs';
2
-
3
- /**
4
- * OCR Backend Registry
5
- *
6
- * Provides a registry for OCR backends in the WASM environment.
7
- * This enables auto-registration and management of OCR backends.
8
- *
9
- * Note: The WASM package provides a lightweight registry in the browser.
10
- * For more advanced features like Rust integration, use @kreuzberg/node or @kreuzberg/deno.
11
- *
12
- * @example
13
- * ```typescript
14
- * import { TesseractWasmBackend } from '@kreuzberg/wasm/ocr/tesseract-wasm-backend';
15
- * import { enableOcr } from '@kreuzberg/wasm';
16
- *
17
- * // Simple auto-registration
18
- * await enableOcr();
19
- * ```
20
- */
21
-
22
- /**
23
- * Register an OCR backend
24
- *
25
- * Registers an OCR backend with the WASM extraction pipeline.
26
- * If a backend with the same name is already registered, it will be replaced.
27
- *
28
- * @param backend - OCR backend implementing OcrBackendProtocol
29
- * @throws {Error} If backend validation fails
30
- *
31
- * @example
32
- * ```typescript
33
- * import { TesseractWasmBackend } from '@kreuzberg/wasm/ocr/tesseract-wasm-backend';
34
- * import { registerOcrBackend } from '@kreuzberg/wasm/ocr/registry';
35
- *
36
- * const backend = new TesseractWasmBackend();
37
- * await backend.initialize();
38
- * registerOcrBackend(backend);
39
- * ```
40
- */
41
- declare function registerOcrBackend(backend: OcrBackendProtocol): void;
42
- /**
43
- * Get a registered OCR backend by name
44
- *
45
- * @param name - Backend name
46
- * @returns The OCR backend or undefined if not found
47
- *
48
- * @example
49
- * ```typescript
50
- * import { getOcrBackend } from '@kreuzberg/wasm/ocr/registry';
51
- *
52
- * const backend = getOcrBackend('tesseract-wasm');
53
- * if (backend) {
54
- * console.log('Available languages:', backend.supportedLanguages());
55
- * }
56
- * ```
57
- */
58
- declare function getOcrBackend(name: string): OcrBackendProtocol | undefined;
59
- /**
60
- * List all registered OCR backends
61
- *
62
- * @returns Array of registered backend names
63
- *
64
- * @example
65
- * ```typescript
66
- * import { listOcrBackends } from '@kreuzberg/wasm/ocr/registry';
67
- *
68
- * const backends = listOcrBackends();
69
- * console.log('Available OCR backends:', backends);
70
- * ```
71
- */
72
- declare function listOcrBackends(): string[];
73
- /**
74
- * Unregister an OCR backend
75
- *
76
- * @param name - Backend name to unregister
77
- * @throws {Error} If backend is not found
78
- *
79
- * @example
80
- * ```typescript
81
- * import { unregisterOcrBackend } from '@kreuzberg/wasm/ocr/registry';
82
- *
83
- * await unregisterOcrBackend('tesseract-wasm');
84
- * ```
85
- */
86
- declare function unregisterOcrBackend(name: string): Promise<void>;
87
- /**
88
- * Clear all registered OCR backends
89
- *
90
- * Unregisters all OCR backends and calls their shutdown methods.
91
- *
92
- * @example
93
- * ```typescript
94
- * import { clearOcrBackends } from '@kreuzberg/wasm/ocr/registry';
95
- *
96
- * // Clean up all backends when shutting down
97
- * await clearOcrBackends();
98
- * ```
99
- */
100
- declare function clearOcrBackends(): Promise<void>;
101
-
102
- export { clearOcrBackends, getOcrBackend, listOcrBackends, registerOcrBackend, unregisterOcrBackend };
@@ -1,410 +0,0 @@
1
- "use strict";
2
- var __create = Object.create; // short aliases for the Object reflection helpers used below
3
- var __defProp = Object.defineProperty;
4
- var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
- var __getOwnPropNames = Object.getOwnPropertyNames;
6
- var __getProtoOf = Object.getPrototypeOf;
7
- var __hasOwnProp = Object.prototype.hasOwnProperty;
8
- var __export = (target, all) => { // defines every key of `all` on `target` as an enumerable getter
9
- for (var name in all)
10
- __defProp(target, name, { get: all[name], enumerable: true }); // the value of all[name] is itself the getter function
11
- };
12
- var __copyProps = (to, from, except, desc) => { // mirrors the own properties of `from` onto `to` as getters; returns `to`
13
- if (from && typeof from === "object" || typeof from === "function") { // anything else is skipped and `to` is returned untouched
14
- for (let key of __getOwnPropNames(from))
15
- if (!__hasOwnProp.call(to, key) && key !== except) // never overwrite a key `to` already owns, and skip the `except` key
16
- __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable }); // getter reads from[key] on each access; enumerability follows the source descriptor
17
- }
18
- return to;
19
- };
20
- var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps( // wraps `mod` in a new object sharing its prototype; NOTE(review): no call site is visible in this file section
21
- // If the importer is in node compatibility mode or this is not an ESM
22
- // file that has been converted to a CommonJS file using a Babel-
23
- // compatible transform (i.e. "__esModule" has not been set), then set
24
- // "default" to the CommonJS "module.exports" for node compatibility.
25
- isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
26
- mod
27
- ));
28
- var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod); // fresh object flagged __esModule, populated from `mod`
29
-
30
- // typescript/ocr/tesseract-wasm-backend.ts
31
- var tesseract_wasm_backend_exports = {}; // export table for this module; filled with getters by __export below
32
- __export(tesseract_wasm_backend_exports, {
33
- TesseractWasmBackend: () => TesseractWasmBackend // arrow getter defers the lookup until the export is read
34
- });
35
- module.exports = __toCommonJS(tesseract_wasm_backend_exports); // publish the export table as the CommonJS module object
36
- var TesseractWasmBackend = class {
37
- /** Tesseract WASM client instance */
38
- client = null;
39
- /** Track which models are currently loaded to avoid redundant loads */
40
- loadedLanguages = /* @__PURE__ */ new Set();
41
- /** Cache for language availability validation */
42
- supportedLangsCache = null;
43
- /** Progress callback for UI updates */
44
- progressCallback = null;
45
- /** Base URL for training data CDN */
46
- CDN_BASE_URL = "https://cdn.jsdelivr.net/npm/tesseract-wasm@0.11.0/dist";
47
- /**
48
- * Return the unique name of this OCR backend
49
- *
50
- * @returns Backend identifier "tesseract-wasm"
51
- */
52
- name() {
53
- return "tesseract-wasm";
54
- }
55
- /**
56
- * Return list of supported language codes
57
- *
58
- * Returns a curated list of commonly available Tesseract language models.
59
- * Tesseract supports many more languages through custom models.
60
- *
61
- * @returns Array of ISO 639-1/2/3 language codes
62
- */
63
- supportedLanguages() {
64
- if (this.supportedLangsCache) {
65
- return this.supportedLangsCache;
66
- }
67
- this.supportedLangsCache = [
68
- "eng",
69
- "deu",
70
- "fra",
71
- "spa",
72
- "ita",
73
- "por",
74
- "nld",
75
- "rus",
76
- "jpn",
77
- "kor",
78
- "chi_sim",
79
- "chi_tra",
80
- "pol",
81
- "tur",
82
- "swe",
83
- "dan",
84
- "fin",
85
- "nor",
86
- "ces",
87
- "slk",
88
- "ron",
89
- "hun",
90
- "hrv",
91
- "srp",
92
- "bul",
93
- "ukr",
94
- "ell",
95
- "ara",
96
- "heb",
97
- "hin",
98
- "tha",
99
- "vie",
100
- "mkd",
101
- "ben",
102
- "tam",
103
- "tel",
104
- "kan",
105
- "mal",
106
- "mya",
107
- "khm",
108
- "lao",
109
- "sin"
110
- ];
111
- return this.supportedLangsCache;
112
- }
113
- /**
114
- * Initialize the OCR backend
115
- *
116
- * Creates the Tesseract WASM client instance. This is called once when
117
- * the backend is registered with the extraction pipeline.
118
- *
119
- * The actual model loading happens in processImage() on-demand to avoid
120
- * loading all models upfront.
121
- *
122
- * @throws {Error} If tesseract-wasm is not available or initialization fails
123
- *
124
- * @example
125
- * ```typescript
126
- * const backend = new TesseractWasmBackend();
127
- * try {
128
- * await backend.initialize();
129
- * } catch (error) {
130
- * console.error('Failed to initialize OCR:', error);
131
- * }
132
- * ```
133
- */
134
- async initialize() {
135
- if (this.client) {
136
- return;
137
- }
138
- try {
139
- const tesseractModule = await this.loadTesseractWasm();
140
- if (!tesseractModule || typeof tesseractModule.OCRClient !== "function") {
141
- throw new Error("tesseract-wasm OCRClient not found. Ensure tesseract-wasm is installed and available.");
142
- }
143
- this.client = new tesseractModule.OCRClient();
144
- this.loadedLanguages.clear();
145
- } catch (error) {
146
- const message = error instanceof Error ? error.message : String(error);
147
- throw new Error(`Failed to initialize TesseractWasmBackend: ${message}`);
148
- }
149
- }
150
- /**
151
- * Process image bytes and extract text via OCR
152
- *
153
- * Handles image loading, model loading, OCR processing, and result formatting.
154
- * Automatically loads the language model on first use and caches it for subsequent calls.
155
- *
156
- * @param imageBytes - Raw image data (Uint8Array) or Base64-encoded string
157
- * @param language - ISO 639-2/3 language code (e.g., "eng", "deu")
158
- * @returns Promise resolving to OCR result with content and metadata
159
- * @throws {Error} If image processing fails, model loading fails, or language is unsupported
160
- *
161
- * @example
162
- * ```typescript
163
- * const backend = new TesseractWasmBackend();
164
- * await backend.initialize();
165
- *
166
- * const imageBuffer = fs.readFileSync('scanned.png');
167
- * const result = await backend.processImage(
168
- * new Uint8Array(imageBuffer),
169
- * 'eng'
170
- * );
171
- *
172
- * console.log(result.content); // Extracted text
173
- * console.log(result.metadata.confidence); // OCR confidence score
174
- * ```
175
- */
176
- async processImage(imageBytes, language) {
177
- if (!this.client) {
178
- throw new Error("TesseractWasmBackend not initialized. Call initialize() first.");
179
- }
180
- const supported = this.supportedLanguages();
181
- const normalizedLang = language.toLowerCase();
182
- const isSupported = supported.some((lang) => lang.toLowerCase() === normalizedLang);
183
- if (!isSupported) {
184
- throw new Error(`Language "${language}" is not supported. Supported languages: ${supported.join(", ")}`);
185
- }
186
- try {
187
- if (!this.loadedLanguages.has(normalizedLang)) {
188
- this.reportProgress(10);
189
- await this.loadLanguageModel(normalizedLang);
190
- this.loadedLanguages.add(normalizedLang);
191
- this.reportProgress(30);
192
- }
193
- this.reportProgress(40);
194
- const imageBitmap = await this.convertToImageBitmap(imageBytes);
195
- this.reportProgress(50);
196
- await this.client.loadImage(imageBitmap);
197
- this.reportProgress(70);
198
- const text = await this.client.getText();
199
- const confidence = await this.getConfidenceScore();
200
- const pageMetadata = await this.getPageMetadata();
201
- this.reportProgress(90);
202
- return {
203
- content: text,
204
- mime_type: "text/plain",
205
- metadata: {
206
- language: normalizedLang,
207
- confidence,
208
- ...pageMetadata
209
- },
210
- tables: []
211
- };
212
- } catch (error) {
213
- const message = error instanceof Error ? error.message : String(error);
214
- throw new Error(`OCR processing failed for language "${language}": ${message}`);
215
- } finally {
216
- this.reportProgress(100);
217
- }
218
- }
219
- /**
220
- * Shutdown the OCR backend and release resources
221
- *
222
- * Properly cleans up the Tesseract WASM client, freeing memory and Web Workers.
223
- * Called when the backend is unregistered or the application shuts down.
224
- *
225
- * @throws {Error} If cleanup fails (errors are logged but not critical)
226
- *
227
- * @example
228
- * ```typescript
229
- * const backend = new TesseractWasmBackend();
230
- * await backend.initialize();
231
- * // ... use backend ...
232
- * await backend.shutdown(); // Clean up resources
233
- * ```
234
- */
235
- async shutdown() {
236
- try {
237
- if (this.client) {
238
- if (typeof this.client.destroy === "function") {
239
- this.client.destroy();
240
- }
241
- if (typeof this.client.terminate === "function") {
242
- this.client.terminate();
243
- }
244
- this.client = null;
245
- }
246
- this.loadedLanguages.clear();
247
- this.supportedLangsCache = null;
248
- this.progressCallback = null;
249
- } catch (error) {
250
- console.warn(
251
- `Warning during TesseractWasmBackend shutdown: ${error instanceof Error ? error.message : String(error)}`
252
- );
253
- }
254
- }
255
- /**
256
- * Set a progress callback for UI updates
257
- *
258
- * Allows the UI to display progress during OCR processing.
259
- * The callback will be called with values from 0 to 100.
260
- *
261
- * @param callback - Function to call with progress percentage
262
- *
263
- * @example
264
- * ```typescript
265
- * const backend = new TesseractWasmBackend();
266
- * backend.setProgressCallback((progress) => {
267
- * console.log(`OCR Progress: ${progress}%`);
268
- * document.getElementById('progress-bar').style.width = `${progress}%`;
269
- * });
270
- * ```
271
- */
272
- setProgressCallback(callback) {
273
- this.progressCallback = callback;
274
- }
275
- /**
276
- * Load language model from CDN
277
- *
278
- * Fetches the training data for a specific language from jsDelivr CDN.
279
- * This is an MVP approach - models are cached by the browser.
280
- *
281
- * @param language - ISO 639-2/3 language code
282
- * @throws {Error} If model download fails or language is not available
283
- *
284
- * @internal
285
- */
286
- async loadLanguageModel(language) {
287
- if (!this.client) {
288
- throw new Error("Client not initialized");
289
- }
290
- const modelFilename = `${language}.traineddata`;
291
- const modelUrl = `${this.CDN_BASE_URL}/${modelFilename}`;
292
- try {
293
- await this.client.loadModel(modelUrl);
294
- } catch (error) {
295
- const message = error instanceof Error ? error.message : String(error);
296
- throw new Error(`Failed to load model for language "${language}" from ${modelUrl}: ${message}`);
297
- }
298
- }
299
- /**
300
- * Convert image bytes or Base64 string to ImageBitmap
301
- *
302
- * Handles both Uint8Array and Base64-encoded image data, converting to
303
- * ImageBitmap format required by Tesseract WASM.
304
- *
305
- * @param imageBytes - Image data as Uint8Array or Base64 string
306
- * @returns Promise resolving to ImageBitmap
307
- * @throws {Error} If conversion fails (browser API not available or invalid image data)
308
- *
309
- * @internal
310
- */
311
- async convertToImageBitmap(imageBytes) {
312
- if (typeof createImageBitmap === "undefined") {
313
- throw new Error("createImageBitmap is not available. TesseractWasmBackend requires a browser environment.");
314
- }
315
- try {
316
- let bytes = imageBytes;
317
- if (typeof imageBytes === "string") {
318
- const binaryString = atob(imageBytes);
319
- bytes = new Uint8Array(binaryString.length);
320
- for (let i = 0; i < binaryString.length; i++) {
321
- bytes[i] = binaryString.charCodeAt(i);
322
- }
323
- }
324
- const blob = new Blob([bytes]);
325
- const imageBitmap = await createImageBitmap(blob);
326
- return imageBitmap;
327
- } catch (error) {
328
- const message = error instanceof Error ? error.message : String(error);
329
- throw new Error(`Failed to convert image bytes to ImageBitmap: ${message}`);
330
- }
331
- }
332
- /**
333
- * Get confidence score from OCR result
334
- *
335
- * Attempts to retrieve confidence score from Tesseract.
336
- * Returns a safe default if unavailable.
337
- *
338
- * @returns Confidence score between 0 and 1
339
- *
340
- * @internal
341
- */
342
- async getConfidenceScore() {
343
- try {
344
- if (this.client && typeof this.client.getConfidence === "function") {
345
- const confidence = await this.client.getConfidence();
346
- return confidence > 1 ? confidence / 100 : confidence;
347
- }
348
- } catch {
349
- }
350
- return 0.9;
351
- }
352
- /**
353
- * Get page metadata from OCR result
354
- *
355
- * Retrieves additional metadata like image dimensions and processing info.
356
- *
357
- * @returns Metadata object (may be empty if unavailable)
358
- *
359
- * @internal
360
- */
361
- async getPageMetadata() {
362
- try {
363
- if (this.client && typeof this.client.getPageMetadata === "function") {
364
- return await this.client.getPageMetadata();
365
- }
366
- } catch {
367
- }
368
- return {};
369
- }
370
- /**
371
- * Dynamically load tesseract-wasm module
372
- *
373
- * Uses dynamic import to load tesseract-wasm only when needed,
374
- * avoiding hard dependency in browser environments where it may not be bundled.
375
- *
376
- * @returns tesseract-wasm module object
377
- * @throws {Error} If module cannot be imported
378
- *
379
- * @internal
380
- */
381
- async loadTesseractWasm() {
382
- try {
383
- const module2 = await import("tesseract-wasm");
384
- return module2;
385
- } catch (error) {
386
- const message = error instanceof Error ? error.message : String(error);
387
- throw new Error(
388
- `Failed to import tesseract-wasm. Ensure it is installed via: npm install tesseract-wasm. Error: ${message}`
389
- );
390
- }
391
- }
392
- /**
393
- * Report progress to progress callback
394
- *
395
- * Internal helper for notifying progress updates during OCR processing.
396
- *
397
- * @param progress - Progress percentage (0-100)
398
- *
399
- * @internal
400
- */
401
- reportProgress(progress) {
402
- if (this.progressCallback) {
403
- try {
404
- this.progressCallback(Math.min(100, Math.max(0, progress)));
405
- } catch {
406
- }
407
- }
408
- }
409
- };
410
- //# sourceMappingURL=tesseract-wasm-backend.cjs.map
@@ -1 +0,0 @@
1
- {"version":3,"sources":["../../typescript/ocr/tesseract-wasm-backend.ts"],"sourcesContent":["/**\n * Tesseract WASM OCR Backend\n *\n * Provides OCR capabilities using tesseract-wasm library for browser environments.\n * Loads training data on-demand from jsDelivr CDN and implements the OcrBackendProtocol.\n *\n * ## Browser-Only Requirement\n *\n * This backend requires browser APIs like createImageBitmap and Web Workers.\n * It will NOT work in Node.js environments without additional canvas polyfills.\n *\n * ## Supported Languages\n *\n * Common ISO 639-1 and ISO 639-2 codes:\n * - English: \"eng\"\n * - German: \"deu\"\n * - French: \"fra\"\n * - Spanish: \"spa\"\n * - Italian: \"ita\"\n * - Portuguese: \"por\"\n * - Dutch: \"nld\"\n * - Russian: \"rus\"\n * - Chinese (Simplified): \"chi_sim\"\n * - Chinese (Traditional): \"chi_tra\"\n * - Japanese: \"jpn\"\n * - Korean: \"kor\"\n * - Arabic: \"ara\"\n * - Hindi: \"hin\"\n *\n * For complete language list, see: https://github.com/naptha/tesseract.js\n *\n * @example Basic Usage\n * ```typescript\n * import { TesseractWasmBackend } from '@kreuzberg/wasm/ocr/tesseract-wasm-backend';\n * import { registerOcrBackend, extractBytes, initWasm } from '@kreuzberg/wasm';\n *\n * // Initialize\n * await initWasm();\n * const backend = new TesseractWasmBackend();\n * await backend.initialize();\n * registerOcrBackend(backend);\n *\n * // Use in extraction\n * const imageBytes = new Uint8Array(buffer);\n * const result = await extractBytes(imageBytes, 'image/png', {\n * ocr: { backend: 'tesseract-wasm', language: 'eng' }\n * });\n * console.log(result.content); // Extracted text\n * ```\n *\n * @example With Language Auto-Detection\n * ```typescript\n * const backend = new TesseractWasmBackend();\n * await backend.initialize();\n * registerOcrBackend(backend);\n *\n * // Extract without specifying language - backend will auto-detect\n * const result = await extractBytes(imageBytes, 'image/png', {\n * ocr: { backend: 
'tesseract-wasm' } // language will auto-detect\n * });\n * ```\n */\n\nimport type { OcrBackendProtocol } from \"../types.js\";\n\n/**\n * Tesseract WASM Client interface\n * Type definition for tesseract-wasm's OCRClient class\n */\ninterface TesseractClient {\n\tloadModel(modelPath: string): Promise<void>;\n\tloadImage(image: ImageBitmap | Blob): Promise<void>;\n\tgetText(): Promise<string>;\n\tgetConfidence(): Promise<number>;\n\tgetPageMetadata(): Promise<Record<string, unknown>>;\n\tdestroy(): void;\n\tterminate(): void;\n}\n\n/**\n * TesseractWasmBackend - OCR backend using tesseract-wasm library\n *\n * Implements the OcrBackendProtocol for Kreuzberg document extraction pipeline.\n * Provides comprehensive OCR support with model caching, error handling, and progress reporting.\n */\nexport class TesseractWasmBackend implements OcrBackendProtocol {\n\t/** Tesseract WASM client instance */\n\tprivate client: TesseractClient | null = null;\n\n\t/** Track which models are currently loaded to avoid redundant loads */\n\tprivate loadedLanguages: Set<string> = new Set();\n\n\t/** Cache for language availability validation */\n\tprivate supportedLangsCache: string[] | null = null;\n\n\t/** Progress callback for UI updates */\n\tprivate progressCallback: ((progress: number) => void) | null = null;\n\n\t/** Base URL for training data CDN */\n\tprivate readonly CDN_BASE_URL = \"https://cdn.jsdelivr.net/npm/tesseract-wasm@0.11.0/dist\";\n\n\t/**\n\t * Return the unique name of this OCR backend\n\t *\n\t * @returns Backend identifier \"tesseract-wasm\"\n\t */\n\tname(): string {\n\t\treturn \"tesseract-wasm\";\n\t}\n\n\t/**\n\t * Return list of supported language codes\n\t *\n\t * Returns a curated list of commonly available Tesseract language models.\n\t * Tesseract supports many more languages through custom models.\n\t *\n\t * @returns Array of ISO 639-1/2/3 language codes\n\t */\n\tsupportedLanguages(): string[] {\n\t\tif (this.supportedLangsCache) {\n\t\t\treturn 
this.supportedLangsCache;\n\t\t}\n\n\t\tthis.supportedLangsCache = [\n\t\t\t\"eng\",\n\t\t\t\"deu\",\n\t\t\t\"fra\",\n\t\t\t\"spa\",\n\t\t\t\"ita\",\n\t\t\t\"por\",\n\t\t\t\"nld\",\n\t\t\t\"rus\",\n\t\t\t\"jpn\",\n\t\t\t\"kor\",\n\t\t\t\"chi_sim\",\n\t\t\t\"chi_tra\",\n\n\t\t\t\"pol\",\n\t\t\t\"tur\",\n\t\t\t\"swe\",\n\t\t\t\"dan\",\n\t\t\t\"fin\",\n\t\t\t\"nor\",\n\t\t\t\"ces\",\n\t\t\t\"slk\",\n\t\t\t\"ron\",\n\t\t\t\"hun\",\n\t\t\t\"hrv\",\n\t\t\t\"srp\",\n\t\t\t\"bul\",\n\t\t\t\"ukr\",\n\t\t\t\"ell\",\n\n\t\t\t\"ara\",\n\t\t\t\"heb\",\n\t\t\t\"hin\",\n\t\t\t\"tha\",\n\t\t\t\"vie\",\n\t\t\t\"mkd\",\n\t\t\t\"ben\",\n\t\t\t\"tam\",\n\t\t\t\"tel\",\n\t\t\t\"kan\",\n\t\t\t\"mal\",\n\t\t\t\"mya\",\n\t\t\t\"khm\",\n\t\t\t\"lao\",\n\t\t\t\"sin\",\n\t\t];\n\n\t\treturn this.supportedLangsCache;\n\t}\n\n\t/**\n\t * Initialize the OCR backend\n\t *\n\t * Creates the Tesseract WASM client instance. This is called once when\n\t * the backend is registered with the extraction pipeline.\n\t *\n\t * The actual model loading happens in processImage() on-demand to avoid\n\t * loading all models upfront.\n\t *\n\t * @throws {Error} If tesseract-wasm is not available or initialization fails\n\t *\n\t * @example\n\t * ```typescript\n\t * const backend = new TesseractWasmBackend();\n\t * try {\n\t * await backend.initialize();\n\t * } catch (error) {\n\t * console.error('Failed to initialize OCR:', error);\n\t * }\n\t * ```\n\t */\n\tasync initialize(): Promise<void> {\n\t\tif (this.client) {\n\t\t\treturn;\n\t\t}\n\n\t\ttry {\n\t\t\tconst tesseractModule = await this.loadTesseractWasm();\n\n\t\t\t// @ts-expect-error - tesseract-wasm types are not fully typed\n\t\t\tif (!tesseractModule || typeof tesseractModule.OCRClient !== \"function\") {\n\t\t\t\tthrow new Error(\"tesseract-wasm OCRClient not found. 
Ensure tesseract-wasm is installed and available.\");\n\t\t\t}\n\n\t\t\t// @ts-expect-error - tesseract-wasm types are not fully typed\n\t\t\tthis.client = new tesseractModule.OCRClient();\n\n\t\t\tthis.loadedLanguages.clear();\n\t\t} catch (error) {\n\t\t\tconst message = error instanceof Error ? error.message : String(error);\n\t\t\tthrow new Error(`Failed to initialize TesseractWasmBackend: ${message}`);\n\t\t}\n\t}\n\n\t/**\n\t * Process image bytes and extract text via OCR\n\t *\n\t * Handles image loading, model loading, OCR processing, and result formatting.\n\t * Automatically loads the language model on first use and caches it for subsequent calls.\n\t *\n\t * @param imageBytes - Raw image data (Uint8Array) or Base64-encoded string\n\t * @param language - ISO 639-2/3 language code (e.g., \"eng\", \"deu\")\n\t * @returns Promise resolving to OCR result with content and metadata\n\t * @throws {Error} If image processing fails, model loading fails, or language is unsupported\n\t *\n\t * @example\n\t * ```typescript\n\t * const backend = new TesseractWasmBackend();\n\t * await backend.initialize();\n\t *\n\t * const imageBuffer = fs.readFileSync('scanned.png');\n\t * const result = await backend.processImage(\n\t * new Uint8Array(imageBuffer),\n\t * 'eng'\n\t * );\n\t *\n\t * console.log(result.content); // Extracted text\n\t * console.log(result.metadata.confidence); // OCR confidence score\n\t * ```\n\t */\n\tasync processImage(\n\t\timageBytes: Uint8Array | string,\n\t\tlanguage: string,\n\t): Promise<{\n\t\tcontent: string;\n\t\tmime_type: string;\n\t\tmetadata: Record<string, unknown>;\n\t\ttables: unknown[];\n\t}> {\n\t\tif (!this.client) {\n\t\t\tthrow new Error(\"TesseractWasmBackend not initialized. 
Call initialize() first.\");\n\t\t}\n\n\t\tconst supported = this.supportedLanguages();\n\t\tconst normalizedLang = language.toLowerCase();\n\t\tconst isSupported = supported.some((lang) => lang.toLowerCase() === normalizedLang);\n\n\t\tif (!isSupported) {\n\t\t\tthrow new Error(`Language \"${language}\" is not supported. Supported languages: ${supported.join(\", \")}`);\n\t\t}\n\n\t\ttry {\n\t\t\tif (!this.loadedLanguages.has(normalizedLang)) {\n\t\t\t\tthis.reportProgress(10);\n\t\t\t\tawait this.loadLanguageModel(normalizedLang);\n\t\t\t\tthis.loadedLanguages.add(normalizedLang);\n\t\t\t\tthis.reportProgress(30);\n\t\t\t}\n\n\t\t\tthis.reportProgress(40);\n\t\t\tconst imageBitmap = await this.convertToImageBitmap(imageBytes);\n\n\t\t\tthis.reportProgress(50);\n\t\t\tawait this.client.loadImage(imageBitmap);\n\n\t\t\tthis.reportProgress(70);\n\t\t\tconst text = await this.client.getText();\n\n\t\t\tconst confidence = await this.getConfidenceScore();\n\t\t\tconst pageMetadata = await this.getPageMetadata();\n\n\t\t\tthis.reportProgress(90);\n\n\t\t\treturn {\n\t\t\t\tcontent: text,\n\t\t\t\tmime_type: \"text/plain\",\n\t\t\t\tmetadata: {\n\t\t\t\t\tlanguage: normalizedLang,\n\t\t\t\t\tconfidence,\n\t\t\t\t\t...pageMetadata,\n\t\t\t\t},\n\t\t\t\ttables: [],\n\t\t\t};\n\t\t} catch (error) {\n\t\t\tconst message = error instanceof Error ? error.message : String(error);\n\t\t\tthrow new Error(`OCR processing failed for language \"${language}\": ${message}`);\n\t\t} finally {\n\t\t\tthis.reportProgress(100);\n\t\t}\n\t}\n\n\t/**\n\t * Shutdown the OCR backend and release resources\n\t *\n\t * Properly cleans up the Tesseract WASM client, freeing memory and Web Workers.\n\t * Called when the backend is unregistered or the application shuts down.\n\t *\n\t * @throws {Error} If cleanup fails (errors are logged but not critical)\n\t *\n\t * @example\n\t * ```typescript\n\t * const backend = new TesseractWasmBackend();\n\t * await backend.initialize();\n\t * // ... 
use backend ...\n\t * await backend.shutdown(); // Clean up resources\n\t * ```\n\t */\n\tasync shutdown(): Promise<void> {\n\t\ttry {\n\t\t\tif (this.client) {\n\t\t\t\tif (typeof this.client.destroy === \"function\") {\n\t\t\t\t\tthis.client.destroy();\n\t\t\t\t}\n\t\t\t\tif (typeof this.client.terminate === \"function\") {\n\t\t\t\t\tthis.client.terminate();\n\t\t\t\t}\n\t\t\t\tthis.client = null;\n\t\t\t}\n\n\t\t\tthis.loadedLanguages.clear();\n\t\t\tthis.supportedLangsCache = null;\n\t\t\tthis.progressCallback = null;\n\t\t} catch (error) {\n\t\t\tconsole.warn(\n\t\t\t\t`Warning during TesseractWasmBackend shutdown: ${error instanceof Error ? error.message : String(error)}`,\n\t\t\t);\n\t\t}\n\t}\n\n\t/**\n\t * Set a progress callback for UI updates\n\t *\n\t * Allows the UI to display progress during OCR processing.\n\t * The callback will be called with values from 0 to 100.\n\t *\n\t * @param callback - Function to call with progress percentage\n\t *\n\t * @example\n\t * ```typescript\n\t * const backend = new TesseractWasmBackend();\n\t * backend.setProgressCallback((progress) => {\n\t * console.log(`OCR Progress: ${progress}%`);\n\t * document.getElementById('progress-bar').style.width = `${progress}%`;\n\t * });\n\t * ```\n\t */\n\tsetProgressCallback(callback: (progress: number) => void): void {\n\t\tthis.progressCallback = callback;\n\t}\n\n\t/**\n\t * Load language model from CDN\n\t *\n\t * Fetches the training data for a specific language from jsDelivr CDN.\n\t * This is an MVP approach - models are cached by the browser.\n\t *\n\t * @param language - ISO 639-2/3 language code\n\t * @throws {Error} If model download fails or language is not available\n\t *\n\t * @internal\n\t */\n\tprivate async loadLanguageModel(language: string): Promise<void> {\n\t\tif (!this.client) {\n\t\t\tthrow new Error(\"Client not initialized\");\n\t\t}\n\n\t\tconst modelFilename = `${language}.traineddata`;\n\t\tconst modelUrl = 
`${this.CDN_BASE_URL}/${modelFilename}`;\n\n\t\ttry {\n\t\t\tawait this.client.loadModel(modelUrl);\n\t\t} catch (error) {\n\t\t\tconst message = error instanceof Error ? error.message : String(error);\n\t\t\tthrow new Error(`Failed to load model for language \"${language}\" from ${modelUrl}: ${message}`);\n\t\t}\n\t}\n\n\t/**\n\t * Convert image bytes or Base64 string to ImageBitmap\n\t *\n\t * Handles both Uint8Array and Base64-encoded image data, converting to\n\t * ImageBitmap format required by Tesseract WASM.\n\t *\n\t * @param imageBytes - Image data as Uint8Array or Base64 string\n\t * @returns Promise resolving to ImageBitmap\n\t * @throws {Error} If conversion fails (browser API not available or invalid image data)\n\t *\n\t * @internal\n\t */\n\tprivate async convertToImageBitmap(imageBytes: Uint8Array | string): Promise<ImageBitmap> {\n\t\tif (typeof createImageBitmap === \"undefined\") {\n\t\t\tthrow new Error(\"createImageBitmap is not available. TesseractWasmBackend requires a browser environment.\");\n\t\t}\n\n\t\ttry {\n\t\t\tlet bytes = imageBytes;\n\t\t\tif (typeof imageBytes === \"string\") {\n\t\t\t\tconst binaryString = atob(imageBytes);\n\t\t\t\tbytes = new Uint8Array(binaryString.length);\n\t\t\t\tfor (let i = 0; i < binaryString.length; i++) {\n\t\t\t\t\t(bytes as Uint8Array)[i] = binaryString.charCodeAt(i);\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tconst blob = new Blob([bytes as Uint8Array] as BlobPart[]);\n\n\t\t\tconst imageBitmap = await createImageBitmap(blob);\n\t\t\treturn imageBitmap;\n\t\t} catch (error) {\n\t\t\tconst message = error instanceof Error ? 
error.message : String(error);\n\t\t\tthrow new Error(`Failed to convert image bytes to ImageBitmap: ${message}`);\n\t\t}\n\t}\n\n\t/**\n\t * Get confidence score from OCR result\n\t *\n\t * Attempts to retrieve confidence score from Tesseract.\n\t * Returns a safe default if unavailable.\n\t *\n\t * @returns Confidence score between 0 and 1\n\t *\n\t * @internal\n\t */\n\tprivate async getConfidenceScore(): Promise<number> {\n\t\ttry {\n\t\t\tif (this.client && typeof this.client.getConfidence === \"function\") {\n\t\t\t\tconst confidence = await this.client.getConfidence();\n\t\t\t\treturn confidence > 1 ? confidence / 100 : confidence;\n\t\t\t}\n\t\t} catch {}\n\t\treturn 0.9;\n\t}\n\n\t/**\n\t * Get page metadata from OCR result\n\t *\n\t * Retrieves additional metadata like image dimensions and processing info.\n\t *\n\t * @returns Metadata object (may be empty if unavailable)\n\t *\n\t * @internal\n\t */\n\tprivate async getPageMetadata(): Promise<Record<string, unknown>> {\n\t\ttry {\n\t\t\tif (this.client && typeof this.client.getPageMetadata === \"function\") {\n\t\t\t\treturn await this.client.getPageMetadata();\n\t\t\t}\n\t\t} catch {}\n\t\treturn {};\n\t}\n\n\t/**\n\t * Dynamically load tesseract-wasm module\n\t *\n\t * Uses dynamic import to load tesseract-wasm only when needed,\n\t * avoiding hard dependency in browser environments where it may not be bundled.\n\t *\n\t * @returns tesseract-wasm module object\n\t * @throws {Error} If module cannot be imported\n\t *\n\t * @internal\n\t */\n\tprivate async loadTesseractWasm(): Promise<unknown> {\n\t\ttry {\n\t\t\t// @ts-expect-error - tesseract-wasm has package.json exports issues with TypeScript\n\t\t\tconst module = await import(\"tesseract-wasm\");\n\t\t\treturn module;\n\t\t} catch (error) {\n\t\t\tconst message = error instanceof Error ? error.message : String(error);\n\t\t\tthrow new Error(\n\t\t\t\t`Failed to import tesseract-wasm. Ensure it is installed via: npm install tesseract-wasm. 
Error: ${message}`,\n\t\t\t);\n\t\t}\n\t}\n\n\t/**\n\t * Report progress to progress callback\n\t *\n\t * Internal helper for notifying progress updates during OCR processing.\n\t *\n\t * @param progress - Progress percentage (0-100)\n\t *\n\t * @internal\n\t */\n\tprivate reportProgress(progress: number): void {\n\t\tif (this.progressCallback) {\n\t\t\ttry {\n\t\t\t\tthis.progressCallback(Math.min(100, Math.max(0, progress)));\n\t\t\t} catch {}\n\t\t}\n\t}\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAqFO,IAAM,uBAAN,MAAyD;AAAA;AAAA,EAEvD,SAAiC;AAAA;AAAA,EAGjC,kBAA+B,oBAAI,IAAI;AAAA;AAAA,EAGvC,sBAAuC;AAAA;AAAA,EAGvC,mBAAwD;AAAA;AAAA,EAG/C,eAAe;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAOhC,OAAe;AACd,WAAO;AAAA,EACR;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAUA,qBAA+B;AAC9B,QAAI,KAAK,qBAAqB;AAC7B,aAAO,KAAK;AAAA,IACb;AAEA,SAAK,sBAAsB;AAAA,MAC1B;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MAEA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MAEA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,IACD;AAEA,WAAO,KAAK;AAAA,EACb;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAuBA,MAAM,aAA4B;AACjC,QAAI,KAAK,QAAQ;AAChB;AAAA,IACD;AAEA,QAAI;AACH,YAAM,kBAAkB,MAAM,KAAK,kBAAkB;AAGrD,UAAI,CAAC,mBAAmB,OAAO,gBAAgB,cAAc,YAAY;AACxE,cAAM,IAAI,MAAM,uFAAuF;AAAA,MACxG;AAGA,WAAK,SAAS,IAAI,gBAAgB,UAAU;AAE5C,WAAK,gBAAgB,MAAM;AAAA,IAC5B,SAAS,OAAO;AACf,YAAM,UAAU,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK;AACrE,YAAM,IAAI,MAAM,8CAA8C,OAAO,EAAE;AAAA,IACxE;AAAA,EACD;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EA4BA,MAAM,aACL,YACA,UAME;AACF,QAAI,CAAC,KAAK,QAAQ;AACjB,YAAM,IAAI,MA
AM,gEAAgE;AAAA,IACjF;AAEA,UAAM,YAAY,KAAK,mBAAmB;AAC1C,UAAM,iBAAiB,SAAS,YAAY;AAC5C,UAAM,cAAc,UAAU,KAAK,CAAC,SAAS,KAAK,YAAY,MAAM,cAAc;AAElF,QAAI,CAAC,aAAa;AACjB,YAAM,IAAI,MAAM,aAAa,QAAQ,4CAA4C,UAAU,KAAK,IAAI,CAAC,EAAE;AAAA,IACxG;AAEA,QAAI;AACH,UAAI,CAAC,KAAK,gBAAgB,IAAI,cAAc,GAAG;AAC9C,aAAK,eAAe,EAAE;AACtB,cAAM,KAAK,kBAAkB,cAAc;AAC3C,aAAK,gBAAgB,IAAI,cAAc;AACvC,aAAK,eAAe,EAAE;AAAA,MACvB;AAEA,WAAK,eAAe,EAAE;AACtB,YAAM,cAAc,MAAM,KAAK,qBAAqB,UAAU;AAE9D,WAAK,eAAe,EAAE;AACtB,YAAM,KAAK,OAAO,UAAU,WAAW;AAEvC,WAAK,eAAe,EAAE;AACtB,YAAM,OAAO,MAAM,KAAK,OAAO,QAAQ;AAEvC,YAAM,aAAa,MAAM,KAAK,mBAAmB;AACjD,YAAM,eAAe,MAAM,KAAK,gBAAgB;AAEhD,WAAK,eAAe,EAAE;AAEtB,aAAO;AAAA,QACN,SAAS;AAAA,QACT,WAAW;AAAA,QACX,UAAU;AAAA,UACT,UAAU;AAAA,UACV;AAAA,UACA,GAAG;AAAA,QACJ;AAAA,QACA,QAAQ,CAAC;AAAA,MACV;AAAA,IACD,SAAS,OAAO;AACf,YAAM,UAAU,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK;AACrE,YAAM,IAAI,MAAM,uCAAuC,QAAQ,MAAM,OAAO,EAAE;AAAA,IAC/E,UAAE;AACD,WAAK,eAAe,GAAG;AAAA,IACxB;AAAA,EACD;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAkBA,MAAM,WAA0B;AAC/B,QAAI;AACH,UAAI,KAAK,QAAQ;AAChB,YAAI,OAAO,KAAK,OAAO,YAAY,YAAY;AAC9C,eAAK,OAAO,QAAQ;AAAA,QACrB;AACA,YAAI,OAAO,KAAK,OAAO,cAAc,YAAY;AAChD,eAAK,OAAO,UAAU;AAAA,QACvB;AACA,aAAK,SAAS;AAAA,MACf;AAEA,WAAK,gBAAgB,MAAM;AAC3B,WAAK,sBAAsB;AAC3B,WAAK,mBAAmB;AAAA,IACzB,SAAS,OAAO;AACf,cAAQ;AAAA,QACP,iDAAiD,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK,CAAC;AAAA,MACxG;AAAA,IACD;AAAA,EACD;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAmBA,oBAAoB,UAA4C;AAC/D,SAAK,mBAAmB;AAAA,EACzB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAaA,MAAc,kBAAkB,UAAiC;AAChE,QAAI,CAAC,KAAK,QAAQ;AACjB,YAAM,IAAI,MAAM,wBAAwB;AAAA,IACzC;AAEA,UAAM,gBAAgB,GAAG,QAAQ;AACjC,UAAM,WAAW,GAAG,KAAK,YAAY,IAAI,aAAa;AAEtD,QAAI;AACH,YAAM,KAAK,OAAO,UAAU,QAAQ;AAAA,IACrC,SAAS,OAAO;AACf,YAAM,UAAU,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK;AACrE,YAAM,IAAI,MAAM,sCAAsC,QAAQ,UAAU,QAAQ,KAAK,OAAO,EAAE;AAAA,IAC/F;AAAA,EACD;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;A
AAA;AAAA;AAAA;AAAA;AAAA,EAcA,MAAc,qBAAqB,YAAuD;AACzF,QAAI,OAAO,sBAAsB,aAAa;AAC7C,YAAM,IAAI,MAAM,0FAA0F;AAAA,IAC3G;AAEA,QAAI;AACH,UAAI,QAAQ;AACZ,UAAI,OAAO,eAAe,UAAU;AACnC,cAAM,eAAe,KAAK,UAAU;AACpC,gBAAQ,IAAI,WAAW,aAAa,MAAM;AAC1C,iBAAS,IAAI,GAAG,IAAI,aAAa,QAAQ,KAAK;AAC7C,UAAC,MAAqB,CAAC,IAAI,aAAa,WAAW,CAAC;AAAA,QACrD;AAAA,MACD;AAEA,YAAM,OAAO,IAAI,KAAK,CAAC,KAAmB,CAAe;AAEzD,YAAM,cAAc,MAAM,kBAAkB,IAAI;AAChD,aAAO;AAAA,IACR,SAAS,OAAO;AACf,YAAM,UAAU,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK;AACrE,YAAM,IAAI,MAAM,iDAAiD,OAAO,EAAE;AAAA,IAC3E;AAAA,EACD;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAYA,MAAc,qBAAsC;AACnD,QAAI;AACH,UAAI,KAAK,UAAU,OAAO,KAAK,OAAO,kBAAkB,YAAY;AACnE,cAAM,aAAa,MAAM,KAAK,OAAO,cAAc;AACnD,eAAO,aAAa,IAAI,aAAa,MAAM;AAAA,MAC5C;AAAA,IACD,QAAQ;AAAA,IAAC;AACT,WAAO;AAAA,EACR;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAWA,MAAc,kBAAoD;AACjE,QAAI;AACH,UAAI,KAAK,UAAU,OAAO,KAAK,OAAO,oBAAoB,YAAY;AACrE,eAAO,MAAM,KAAK,OAAO,gBAAgB;AAAA,MAC1C;AAAA,IACD,QAAQ;AAAA,IAAC;AACT,WAAO,CAAC;AAAA,EACT;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAaA,MAAc,oBAAsC;AACnD,QAAI;AAEH,YAAMA,UAAS,MAAM,OAAO,gBAAgB;AAC5C,aAAOA;AAAA,IACR,SAAS,OAAO;AACf,YAAM,UAAU,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK;AACrE,YAAM,IAAI;AAAA,QACT,mGAAmG,OAAO;AAAA,MAC3G;AAAA,IACD;AAAA,EACD;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAWQ,eAAe,UAAwB;AAC9C,QAAI,KAAK,kBAAkB;AAC1B,UAAI;AACH,aAAK,iBAAiB,KAAK,IAAI,KAAK,KAAK,IAAI,GAAG,QAAQ,CAAC,CAAC;AAAA,MAC3D,QAAQ;AAAA,MAAC;AAAA,IACV;AAAA,EACD;AACD;","names":["module"]}