@kreuzberg/wasm 4.0.0-rc.6 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +7 -0
- package/README.md +317 -801
- package/dist/adapters/wasm-adapter.d.ts +7 -10
- package/dist/adapters/wasm-adapter.d.ts.map +1 -0
- package/dist/adapters/wasm-adapter.js +53 -54
- package/dist/adapters/wasm-adapter.js.map +1 -1
- package/dist/index.d.ts +23 -67
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +1102 -104
- package/dist/index.js.map +1 -1
- package/dist/ocr/registry.d.ts +7 -10
- package/dist/ocr/registry.d.ts.map +1 -0
- package/dist/ocr/registry.js +9 -28
- package/dist/ocr/registry.js.map +1 -1
- package/dist/ocr/tesseract-wasm-backend.d.ts +3 -6
- package/dist/ocr/tesseract-wasm-backend.d.ts.map +1 -0
- package/dist/ocr/tesseract-wasm-backend.js +8 -83
- package/dist/ocr/tesseract-wasm-backend.js.map +1 -1
- package/dist/pdfium.js +77 -0
- package/dist/pkg/LICENSE +7 -0
- package/dist/pkg/README.md +498 -0
- package/dist/{kreuzberg_wasm.d.ts → pkg/kreuzberg_wasm.d.ts} +24 -12
- package/dist/{kreuzberg_wasm.js → pkg/kreuzberg_wasm.js} +224 -233
- package/dist/pkg/kreuzberg_wasm_bg.js +1871 -0
- package/dist/{kreuzberg_wasm_bg.wasm → pkg/kreuzberg_wasm_bg.wasm} +0 -0
- package/dist/{kreuzberg_wasm_bg.wasm.d.ts → pkg/kreuzberg_wasm_bg.wasm.d.ts} +10 -13
- package/dist/pkg/package.json +27 -0
- package/dist/plugin-registry.d.ts +246 -0
- package/dist/plugin-registry.d.ts.map +1 -0
- package/dist/runtime.d.ts +21 -22
- package/dist/runtime.d.ts.map +1 -0
- package/dist/runtime.js +21 -41
- package/dist/runtime.js.map +1 -1
- package/dist/types.d.ts +363 -0
- package/dist/types.d.ts.map +1 -0
- package/package.json +34 -51
- package/dist/adapters/wasm-adapter.d.mts +0 -121
- package/dist/adapters/wasm-adapter.mjs +0 -221
- package/dist/adapters/wasm-adapter.mjs.map +0 -1
- package/dist/index.d.mts +0 -466
- package/dist/index.mjs +0 -384
- package/dist/index.mjs.map +0 -1
- package/dist/kreuzberg_wasm.d.mts +0 -758
- package/dist/kreuzberg_wasm.mjs +0 -48
- package/dist/ocr/registry.d.mts +0 -102
- package/dist/ocr/registry.mjs +0 -70
- package/dist/ocr/registry.mjs.map +0 -1
- package/dist/ocr/tesseract-wasm-backend.d.mts +0 -257
- package/dist/ocr/tesseract-wasm-backend.mjs +0 -424
- package/dist/ocr/tesseract-wasm-backend.mjs.map +0 -1
- package/dist/runtime.d.mts +0 -256
- package/dist/runtime.mjs +0 -152
- package/dist/runtime.mjs.map +0 -1
- package/dist/snippets/wasm-bindgen-rayon-38edf6e439f6d70d/src/workerHelpers.js +0 -107
- package/dist/types-GJVIvbPy.d.mts +0 -221
- package/dist/types-GJVIvbPy.d.ts +0 -221
package/dist/kreuzberg_wasm.mjs
DELETED
|
@@ -1,48 +0,0 @@
|
|
|
1
|
-
import { createRequire } from 'module';
|
|
2
|
-
import { fileURLToPath } from 'url';
|
|
3
|
-
import { dirname, join } from 'path';
|
|
4
|
-
|
|
5
|
-
const __filename = fileURLToPath(import.meta.url);
|
|
6
|
-
const __dirname = dirname(__filename);
|
|
7
|
-
const require = createRequire(import.meta.url);
|
|
8
|
-
|
|
9
|
-
// Import the CommonJS module
|
|
10
|
-
const wasmModule = require('./kreuzberg_wasm.js');
|
|
11
|
-
|
|
12
|
-
// Re-export everything from the CommonJS module as ESM
|
|
13
|
-
export const {
|
|
14
|
-
memory,
|
|
15
|
-
extractBytes,
|
|
16
|
-
extractBytesSync,
|
|
17
|
-
batchExtractBytes,
|
|
18
|
-
batchExtractBytesSync,
|
|
19
|
-
extractFile,
|
|
20
|
-
batchExtractFiles,
|
|
21
|
-
detectMimeFromBytes,
|
|
22
|
-
normalizeMimeType,
|
|
23
|
-
getMimeFromExtension,
|
|
24
|
-
getExtensionsForMime,
|
|
25
|
-
loadConfigFromString,
|
|
26
|
-
discoverConfig,
|
|
27
|
-
version,
|
|
28
|
-
get_module_info,
|
|
29
|
-
register_ocr_backend,
|
|
30
|
-
unregister_ocr_backend,
|
|
31
|
-
list_ocr_backends,
|
|
32
|
-
clear_ocr_backends,
|
|
33
|
-
register_post_processor,
|
|
34
|
-
unregister_post_processor,
|
|
35
|
-
list_post_processors,
|
|
36
|
-
clear_post_processors,
|
|
37
|
-
register_validator,
|
|
38
|
-
unregister_validator,
|
|
39
|
-
list_validators,
|
|
40
|
-
clear_validators,
|
|
41
|
-
initialize_pdfium_render,
|
|
42
|
-
read_block_from_callback_wasm,
|
|
43
|
-
write_block_from_callback_wasm,
|
|
44
|
-
default: wasmDefault
|
|
45
|
-
} = wasmModule;
|
|
46
|
-
|
|
47
|
-
// Support default export pattern
|
|
48
|
-
export default wasmModule;
|
package/dist/ocr/registry.d.mts
DELETED
|
@@ -1,102 +0,0 @@
|
|
|
1
|
-
import { f as OcrBackendProtocol } from '../types-GJVIvbPy.mjs';
|
|
2
|
-
|
|
3
|
-
/**
|
|
4
|
-
* OCR Backend Registry
|
|
5
|
-
*
|
|
6
|
-
* Provides a registry for OCR backends in the WASM environment.
|
|
7
|
-
* This enables auto-registration and management of OCR backends.
|
|
8
|
-
*
|
|
9
|
-
* Note: The WASM package provides a lightweight registry in the browser.
|
|
10
|
-
* For more advanced features like Rust integration, use @kreuzberg/node or @kreuzberg/deno.
|
|
11
|
-
*
|
|
12
|
-
* @example
|
|
13
|
-
* ```typescript
|
|
14
|
-
* import { TesseractWasmBackend } from '@kreuzberg/wasm/ocr/tesseract-wasm-backend';
|
|
15
|
-
* import { enableOcr } from '@kreuzberg/wasm';
|
|
16
|
-
*
|
|
17
|
-
* // Simple auto-registration
|
|
18
|
-
* await enableOcr();
|
|
19
|
-
* ```
|
|
20
|
-
*/
|
|
21
|
-
|
|
22
|
-
/**
|
|
23
|
-
* Register an OCR backend
|
|
24
|
-
*
|
|
25
|
-
* Registers an OCR backend with the WASM extraction pipeline.
|
|
26
|
-
* If a backend with the same name is already registered, it will be replaced.
|
|
27
|
-
*
|
|
28
|
-
* @param backend - OCR backend implementing OcrBackendProtocol
|
|
29
|
-
* @throws {Error} If backend validation fails
|
|
30
|
-
*
|
|
31
|
-
* @example
|
|
32
|
-
* ```typescript
|
|
33
|
-
* import { TesseractWasmBackend } from '@kreuzberg/wasm/ocr/tesseract-wasm-backend';
|
|
34
|
-
* import { registerOcrBackend } from '@kreuzberg/wasm/ocr/registry';
|
|
35
|
-
*
|
|
36
|
-
* const backend = new TesseractWasmBackend();
|
|
37
|
-
* await backend.initialize();
|
|
38
|
-
* registerOcrBackend(backend);
|
|
39
|
-
* ```
|
|
40
|
-
*/
|
|
41
|
-
declare function registerOcrBackend(backend: OcrBackendProtocol): void;
|
|
42
|
-
/**
|
|
43
|
-
* Get a registered OCR backend by name
|
|
44
|
-
*
|
|
45
|
-
* @param name - Backend name
|
|
46
|
-
* @returns The OCR backend or undefined if not found
|
|
47
|
-
*
|
|
48
|
-
* @example
|
|
49
|
-
* ```typescript
|
|
50
|
-
* import { getOcrBackend } from '@kreuzberg/wasm/ocr/registry';
|
|
51
|
-
*
|
|
52
|
-
* const backend = getOcrBackend('tesseract-wasm');
|
|
53
|
-
* if (backend) {
|
|
54
|
-
* console.log('Available languages:', backend.supportedLanguages());
|
|
55
|
-
* }
|
|
56
|
-
* ```
|
|
57
|
-
*/
|
|
58
|
-
declare function getOcrBackend(name: string): OcrBackendProtocol | undefined;
|
|
59
|
-
/**
|
|
60
|
-
* List all registered OCR backends
|
|
61
|
-
*
|
|
62
|
-
* @returns Array of registered backend names
|
|
63
|
-
*
|
|
64
|
-
* @example
|
|
65
|
-
* ```typescript
|
|
66
|
-
* import { listOcrBackends } from '@kreuzberg/wasm/ocr/registry';
|
|
67
|
-
*
|
|
68
|
-
* const backends = listOcrBackends();
|
|
69
|
-
* console.log('Available OCR backends:', backends);
|
|
70
|
-
* ```
|
|
71
|
-
*/
|
|
72
|
-
declare function listOcrBackends(): string[];
|
|
73
|
-
/**
|
|
74
|
-
* Unregister an OCR backend
|
|
75
|
-
*
|
|
76
|
-
* @param name - Backend name to unregister
|
|
77
|
-
* @throws {Error} If backend is not found
|
|
78
|
-
*
|
|
79
|
-
* @example
|
|
80
|
-
* ```typescript
|
|
81
|
-
* import { unregisterOcrBackend } from '@kreuzberg/wasm/ocr/registry';
|
|
82
|
-
*
|
|
83
|
-
* unregisterOcrBackend('tesseract-wasm');
|
|
84
|
-
* ```
|
|
85
|
-
*/
|
|
86
|
-
declare function unregisterOcrBackend(name: string): Promise<void>;
|
|
87
|
-
/**
|
|
88
|
-
* Clear all registered OCR backends
|
|
89
|
-
*
|
|
90
|
-
* Unregisters all OCR backends and calls their shutdown methods.
|
|
91
|
-
*
|
|
92
|
-
* @example
|
|
93
|
-
* ```typescript
|
|
94
|
-
* import { clearOcrBackends } from '@kreuzberg/wasm/ocr/registry';
|
|
95
|
-
*
|
|
96
|
-
* // Clean up all backends when shutting down
|
|
97
|
-
* await clearOcrBackends();
|
|
98
|
-
* ```
|
|
99
|
-
*/
|
|
100
|
-
declare function clearOcrBackends(): Promise<void>;
|
|
101
|
-
|
|
102
|
-
export { clearOcrBackends, getOcrBackend, listOcrBackends, registerOcrBackend, unregisterOcrBackend };
|
package/dist/ocr/registry.mjs
DELETED
|
@@ -1,70 +0,0 @@
|
|
|
1
|
-
const ocrBackendRegistry = /* @__PURE__ */ new Map();
|
|
2
|
-
function registerOcrBackend(backend) {
|
|
3
|
-
if (!backend) {
|
|
4
|
-
throw new Error("Backend cannot be null or undefined");
|
|
5
|
-
}
|
|
6
|
-
if (typeof backend.name !== "function") {
|
|
7
|
-
throw new Error("Backend must implement name() method");
|
|
8
|
-
}
|
|
9
|
-
if (typeof backend.supportedLanguages !== "function") {
|
|
10
|
-
throw new Error("Backend must implement supportedLanguages() method");
|
|
11
|
-
}
|
|
12
|
-
if (typeof backend.processImage !== "function") {
|
|
13
|
-
throw new Error("Backend must implement processImage() method");
|
|
14
|
-
}
|
|
15
|
-
const backendName = backend.name();
|
|
16
|
-
if (!backendName || typeof backendName !== "string") {
|
|
17
|
-
throw new Error("Backend name must be a non-empty string");
|
|
18
|
-
}
|
|
19
|
-
if (ocrBackendRegistry.has(backendName)) {
|
|
20
|
-
console.warn(`OCR backend "${backendName}" is already registered and will be replaced`);
|
|
21
|
-
}
|
|
22
|
-
ocrBackendRegistry.set(backendName, backend);
|
|
23
|
-
}
|
|
24
|
-
function getOcrBackend(name) {
|
|
25
|
-
return ocrBackendRegistry.get(name);
|
|
26
|
-
}
|
|
27
|
-
function listOcrBackends() {
|
|
28
|
-
return Array.from(ocrBackendRegistry.keys());
|
|
29
|
-
}
|
|
30
|
-
async function unregisterOcrBackend(name) {
|
|
31
|
-
const backend = ocrBackendRegistry.get(name);
|
|
32
|
-
if (!backend) {
|
|
33
|
-
throw new Error(
|
|
34
|
-
`OCR backend "${name}" is not registered. Available backends: ${Array.from(ocrBackendRegistry.keys()).join(", ")}`
|
|
35
|
-
);
|
|
36
|
-
}
|
|
37
|
-
if (typeof backend.shutdown === "function") {
|
|
38
|
-
try {
|
|
39
|
-
await backend.shutdown();
|
|
40
|
-
} catch (error) {
|
|
41
|
-
console.warn(
|
|
42
|
-
`Error shutting down OCR backend "${name}": ${error instanceof Error ? error.message : String(error)}`
|
|
43
|
-
);
|
|
44
|
-
}
|
|
45
|
-
}
|
|
46
|
-
ocrBackendRegistry.delete(name);
|
|
47
|
-
}
|
|
48
|
-
async function clearOcrBackends() {
|
|
49
|
-
const backends = Array.from(ocrBackendRegistry.entries());
|
|
50
|
-
for (const [name, backend] of backends) {
|
|
51
|
-
if (typeof backend.shutdown === "function") {
|
|
52
|
-
try {
|
|
53
|
-
await backend.shutdown();
|
|
54
|
-
} catch (error) {
|
|
55
|
-
console.warn(
|
|
56
|
-
`Error shutting down OCR backend "${name}": ${error instanceof Error ? error.message : String(error)}`
|
|
57
|
-
);
|
|
58
|
-
}
|
|
59
|
-
}
|
|
60
|
-
}
|
|
61
|
-
ocrBackendRegistry.clear();
|
|
62
|
-
}
|
|
63
|
-
export {
|
|
64
|
-
clearOcrBackends,
|
|
65
|
-
getOcrBackend,
|
|
66
|
-
listOcrBackends,
|
|
67
|
-
registerOcrBackend,
|
|
68
|
-
unregisterOcrBackend
|
|
69
|
-
};
|
|
70
|
-
//# sourceMappingURL=registry.mjs.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"sources":["../../typescript/ocr/registry.ts"],"sourcesContent":["/**\n * OCR Backend Registry\n *\n * Provides a registry for OCR backends in the WASM environment.\n * This enables auto-registration and management of OCR backends.\n *\n * Note: The WASM package provides a lightweight registry in the browser.\n * For more advanced features like Rust integration, use @kreuzberg/node or @kreuzberg/deno.\n *\n * @example\n * ```typescript\n * import { TesseractWasmBackend } from '@kreuzberg/wasm/ocr/tesseract-wasm-backend';\n * import { enableOcr } from '@kreuzberg/wasm';\n *\n * // Simple auto-registration\n * await enableOcr();\n * ```\n */\n\nimport type { OcrBackendProtocol } from \"../types.js\";\n\n/** Global registry of OCR backends */\nconst ocrBackendRegistry = new Map<string, OcrBackendProtocol>();\n\n/**\n * Register an OCR backend\n *\n * Registers an OCR backend with the WASM extraction pipeline.\n * If a backend with the same name is already registered, it will be replaced.\n *\n * @param backend - OCR backend implementing OcrBackendProtocol\n * @throws {Error} If backend validation fails\n *\n * @example\n * ```typescript\n * import { TesseractWasmBackend } from '@kreuzberg/wasm/ocr/tesseract-wasm-backend';\n * import { registerOcrBackend } from '@kreuzberg/wasm/ocr/registry';\n *\n * const backend = new TesseractWasmBackend();\n * await backend.initialize();\n * registerOcrBackend(backend);\n * ```\n */\nexport function registerOcrBackend(backend: OcrBackendProtocol): void {\n\t// Validate backend\n\tif (!backend) {\n\t\tthrow new Error(\"Backend cannot be null or undefined\");\n\t}\n\n\tif (typeof backend.name !== \"function\") {\n\t\tthrow new Error(\"Backend must implement name() method\");\n\t}\n\n\tif (typeof backend.supportedLanguages !== \"function\") {\n\t\tthrow new Error(\"Backend must implement supportedLanguages() method\");\n\t}\n\n\tif (typeof backend.processImage !== \"function\") {\n\t\tthrow new Error(\"Backend must implement processImage() method\");\n\t}\n\n\tconst backendName = backend.name();\n\n\tif (!backendName || typeof backendName !== \"string\") {\n\t\tthrow new Error(\"Backend name must be a non-empty string\");\n\t}\n\n\t// Check for duplicate registration (allow overwriting with warning)\n\tif (ocrBackendRegistry.has(backendName)) {\n\t\tconsole.warn(`OCR backend \"${backendName}\" is already registered and will be replaced`);\n\t}\n\n\t// Register the backend\n\tocrBackendRegistry.set(backendName, backend);\n}\n\n/**\n * Get a registered OCR backend by name\n *\n * @param name - Backend name\n * @returns The OCR backend or undefined if not found\n *\n * @example\n * ```typescript\n * import { getOcrBackend } from '@kreuzberg/wasm/ocr/registry';\n *\n * const backend = getOcrBackend('tesseract-wasm');\n * if (backend) {\n * console.log('Available languages:', backend.supportedLanguages());\n * }\n * ```\n */\nexport function getOcrBackend(name: string): OcrBackendProtocol | undefined {\n\treturn ocrBackendRegistry.get(name);\n}\n\n/**\n * List all registered OCR backends\n *\n * @returns Array of registered backend names\n *\n * @example\n * ```typescript\n * import { listOcrBackends } from '@kreuzberg/wasm/ocr/registry';\n *\n * const backends = listOcrBackends();\n * console.log('Available OCR backends:', backends);\n * ```\n */\nexport function listOcrBackends(): string[] {\n\treturn Array.from(ocrBackendRegistry.keys());\n}\n\n/**\n * Unregister an OCR backend\n *\n * @param name - Backend name to unregister\n * @throws {Error} If backend is not found\n *\n * @example\n * ```typescript\n * import { unregisterOcrBackend } from '@kreuzberg/wasm/ocr/registry';\n *\n * unregisterOcrBackend('tesseract-wasm');\n * ```\n */\nexport async function unregisterOcrBackend(name: string): Promise<void> {\n\tconst backend = ocrBackendRegistry.get(name);\n\n\tif (!backend) {\n\t\tthrow new Error(\n\t\t\t`OCR backend \"${name}\" is not registered. Available backends: ${Array.from(ocrBackendRegistry.keys()).join(\", \")}`,\n\t\t);\n\t}\n\n\t// Call shutdown if available\n\tif (typeof backend.shutdown === \"function\") {\n\t\ttry {\n\t\t\tawait backend.shutdown();\n\t\t} catch (error) {\n\t\t\tconsole.warn(\n\t\t\t\t`Error shutting down OCR backend \"${name}\": ${error instanceof Error ? error.message : String(error)}`,\n\t\t\t);\n\t\t}\n\t}\n\n\tocrBackendRegistry.delete(name);\n}\n\n/**\n * Clear all registered OCR backends\n *\n * Unregisters all OCR backends and calls their shutdown methods.\n *\n * @example\n * ```typescript\n * import { clearOcrBackends } from '@kreuzberg/wasm/ocr/registry';\n *\n * // Clean up all backends when shutting down\n * await clearOcrBackends();\n * ```\n */\nexport async function clearOcrBackends(): Promise<void> {\n\tconst backends = Array.from(ocrBackendRegistry.entries());\n\n\tfor (const [name, backend] of backends) {\n\t\tif (typeof backend.shutdown === \"function\") {\n\t\t\ttry {\n\t\t\t\tawait backend.shutdown();\n\t\t\t} catch (error) {\n\t\t\t\tconsole.warn(\n\t\t\t\t\t`Error shutting down OCR backend \"${name}\": ${error instanceof Error ? error.message : String(error)}`,\n\t\t\t\t);\n\t\t\t}\n\t\t}\n\t}\n\n\tocrBackendRegistry.clear();\n}\n"],"mappings":"AAsBA,MAAM,qBAAqB,oBAAI,IAAgC;AAqBxD,SAAS,mBAAmB,SAAmC;AAErE,MAAI,CAAC,SAAS;AACb,UAAM,IAAI,MAAM,qCAAqC;AAAA,EACtD;AAEA,MAAI,OAAO,QAAQ,SAAS,YAAY;AACvC,UAAM,IAAI,MAAM,sCAAsC;AAAA,EACvD;AAEA,MAAI,OAAO,QAAQ,uBAAuB,YAAY;AACrD,UAAM,IAAI,MAAM,oDAAoD;AAAA,EACrE;AAEA,MAAI,OAAO,QAAQ,iBAAiB,YAAY;AAC/C,UAAM,IAAI,MAAM,8CAA8C;AAAA,EAC/D;AAEA,QAAM,cAAc,QAAQ,KAAK;AAEjC,MAAI,CAAC,eAAe,OAAO,gBAAgB,UAAU;AACpD,UAAM,IAAI,MAAM,yCAAyC;AAAA,EAC1D;AAGA,MAAI,mBAAmB,IAAI,WAAW,GAAG;AACxC,YAAQ,KAAK,gBAAgB,WAAW,8CAA8C;AAAA,EACvF;AAGA,qBAAmB,IAAI,aAAa,OAAO;AAC5C;AAkBO,SAAS,cAAc,MAA8C;AAC3E,SAAO,mBAAmB,IAAI,IAAI;AACnC;AAeO,SAAS,kBAA4B;AAC3C,SAAO,MAAM,KAAK,mBAAmB,KAAK,CAAC;AAC5C;AAeA,eAAsB,qBAAqB,MAA6B;AACvE,QAAM,UAAU,mBAAmB,IAAI,IAAI;AAE3C,MAAI,CAAC,SAAS;AACb,UAAM,IAAI;AAAA,MACT,gBAAgB,IAAI,4CAA4C,MAAM,KAAK,mBAAmB,KAAK,CAAC,EAAE,KAAK,IAAI,CAAC;AAAA,IACjH;AAAA,EACD;AAGA,MAAI,OAAO,QAAQ,aAAa,YAAY;AAC3C,QAAI;AACH,YAAM,QAAQ,SAAS;AAAA,IACxB,SAAS,OAAO;AACf,cAAQ;AAAA,QACP,oCAAoC,IAAI,MAAM,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK,CAAC;AAAA,MACrG;AAAA,IACD;AAAA,EACD;AAEA,qBAAmB,OAAO,IAAI;AAC/B;AAeA,eAAsB,mBAAkC;AACvD,QAAM,WAAW,MAAM,KAAK,mBAAmB,QAAQ,CAAC;AAExD,aAAW,CAAC,MAAM,OAAO,KAAK,UAAU;AACvC,QAAI,OAAO,QAAQ,aAAa,YAAY;AAC3C,UAAI;AACH,cAAM,QAAQ,SAAS;AAAA,MACxB,SAAS,OAAO;AACf,gBAAQ;AAAA,UACP,oCAAoC,IAAI,MAAM,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK,CAAC;AAAA,QACrG;AAAA,MACD;AAAA,IACD;AAAA,EACD;AAEA,qBAAmB,MAAM;AAC1B;","names":[]}
|
|
@@ -1,257 +0,0 @@
|
|
|
1
|
-
import { f as OcrBackendProtocol } from '../types-GJVIvbPy.mjs';
|
|
2
|
-
|
|
3
|
-
/**
|
|
4
|
-
* Tesseract WASM OCR Backend
|
|
5
|
-
*
|
|
6
|
-
* Provides OCR capabilities using tesseract-wasm library for browser environments.
|
|
7
|
-
* Loads training data on-demand from jsDelivr CDN and implements the OcrBackendProtocol.
|
|
8
|
-
*
|
|
9
|
-
* ## Browser-Only Requirement
|
|
10
|
-
*
|
|
11
|
-
* This backend requires browser APIs like createImageBitmap and Web Workers.
|
|
12
|
-
* It will NOT work in Node.js environments without additional canvas polyfills.
|
|
13
|
-
*
|
|
14
|
-
* ## Supported Languages
|
|
15
|
-
*
|
|
16
|
-
* Common ISO 639-1 and ISO 639-2 codes:
|
|
17
|
-
* - English: "eng"
|
|
18
|
-
* - German: "deu"
|
|
19
|
-
* - French: "fra"
|
|
20
|
-
* - Spanish: "spa"
|
|
21
|
-
* - Italian: "ita"
|
|
22
|
-
* - Portuguese: "por"
|
|
23
|
-
* - Dutch: "nld"
|
|
24
|
-
* - Russian: "rus"
|
|
25
|
-
* - Chinese (Simplified): "chi_sim"
|
|
26
|
-
* - Chinese (Traditional): "chi_tra"
|
|
27
|
-
* - Japanese: "jpn"
|
|
28
|
-
* - Korean: "kor"
|
|
29
|
-
* - Arabic: "ara"
|
|
30
|
-
* - Hindi: "hin"
|
|
31
|
-
*
|
|
32
|
-
* For complete language list, see: https://github.com/naptha/tesseract.js
|
|
33
|
-
*
|
|
34
|
-
* @example Basic Usage
|
|
35
|
-
* ```typescript
|
|
36
|
-
* import { TesseractWasmBackend } from '@kreuzberg/wasm/ocr/tesseract-wasm-backend';
|
|
37
|
-
* import { registerOcrBackend, extractBytes, initWasm } from '@kreuzberg/wasm';
|
|
38
|
-
*
|
|
39
|
-
* // Initialize
|
|
40
|
-
* await initWasm();
|
|
41
|
-
* const backend = new TesseractWasmBackend();
|
|
42
|
-
* await backend.initialize();
|
|
43
|
-
* registerOcrBackend(backend);
|
|
44
|
-
*
|
|
45
|
-
* // Use in extraction
|
|
46
|
-
* const imageBytes = new Uint8Array(buffer);
|
|
47
|
-
* const result = await extractBytes(imageBytes, 'image/png', {
|
|
48
|
-
* ocr: { backend: 'tesseract-wasm', language: 'eng' }
|
|
49
|
-
* });
|
|
50
|
-
* console.log(result.content); // Extracted text
|
|
51
|
-
* ```
|
|
52
|
-
*
|
|
53
|
-
* @example With Language Auto-Detection
|
|
54
|
-
* ```typescript
|
|
55
|
-
* const backend = new TesseractWasmBackend();
|
|
56
|
-
* await backend.initialize();
|
|
57
|
-
* registerOcrBackend(backend);
|
|
58
|
-
*
|
|
59
|
-
* // Extract without specifying language - backend will auto-detect
|
|
60
|
-
* const result = await extractBytes(imageBytes, 'image/png', {
|
|
61
|
-
* ocr: { backend: 'tesseract-wasm' } // language will auto-detect
|
|
62
|
-
* });
|
|
63
|
-
* ```
|
|
64
|
-
*/
|
|
65
|
-
|
|
66
|
-
/**
|
|
67
|
-
* TesseractWasmBackend - OCR backend using tesseract-wasm library
|
|
68
|
-
*
|
|
69
|
-
* Implements the OcrBackendProtocol for Kreuzberg document extraction pipeline.
|
|
70
|
-
* Provides comprehensive OCR support with model caching, error handling, and progress reporting.
|
|
71
|
-
*/
|
|
72
|
-
declare class TesseractWasmBackend implements OcrBackendProtocol {
|
|
73
|
-
/** Tesseract WASM client instance */
|
|
74
|
-
private client;
|
|
75
|
-
/** Track which models are currently loaded to avoid redundant loads */
|
|
76
|
-
private loadedLanguages;
|
|
77
|
-
/** Cache for language availability validation */
|
|
78
|
-
private supportedLangsCache;
|
|
79
|
-
/** Progress callback for UI updates */
|
|
80
|
-
private progressCallback;
|
|
81
|
-
/** Base URL for training data CDN */
|
|
82
|
-
private readonly CDN_BASE_URL;
|
|
83
|
-
/**
|
|
84
|
-
* Return the unique name of this OCR backend
|
|
85
|
-
*
|
|
86
|
-
* @returns Backend identifier "tesseract-wasm"
|
|
87
|
-
*/
|
|
88
|
-
name(): string;
|
|
89
|
-
/**
|
|
90
|
-
* Return list of supported language codes
|
|
91
|
-
*
|
|
92
|
-
* Returns a curated list of commonly available Tesseract language models.
|
|
93
|
-
* Tesseract supports many more languages through custom models.
|
|
94
|
-
*
|
|
95
|
-
* @returns Array of ISO 639-1/2/3 language codes
|
|
96
|
-
*/
|
|
97
|
-
supportedLanguages(): string[];
|
|
98
|
-
/**
|
|
99
|
-
* Initialize the OCR backend
|
|
100
|
-
*
|
|
101
|
-
* Creates the Tesseract WASM client instance. This is called once when
|
|
102
|
-
* the backend is registered with the extraction pipeline.
|
|
103
|
-
*
|
|
104
|
-
* The actual model loading happens in processImage() on-demand to avoid
|
|
105
|
-
* loading all models upfront.
|
|
106
|
-
*
|
|
107
|
-
* @throws {Error} If tesseract-wasm is not available or initialization fails
|
|
108
|
-
*
|
|
109
|
-
* @example
|
|
110
|
-
* ```typescript
|
|
111
|
-
* const backend = new TesseractWasmBackend();
|
|
112
|
-
* try {
|
|
113
|
-
* await backend.initialize();
|
|
114
|
-
* } catch (error) {
|
|
115
|
-
* console.error('Failed to initialize OCR:', error);
|
|
116
|
-
* }
|
|
117
|
-
* ```
|
|
118
|
-
*/
|
|
119
|
-
initialize(): Promise<void>;
|
|
120
|
-
/**
|
|
121
|
-
* Process image bytes and extract text via OCR
|
|
122
|
-
*
|
|
123
|
-
* Handles image loading, model loading, OCR processing, and result formatting.
|
|
124
|
-
* Automatically loads the language model on first use and caches it for subsequent calls.
|
|
125
|
-
*
|
|
126
|
-
* @param imageBytes - Raw image data (Uint8Array) or Base64-encoded string
|
|
127
|
-
* @param language - ISO 639-2/3 language code (e.g., "eng", "deu")
|
|
128
|
-
* @returns Promise resolving to OCR result with content and metadata
|
|
129
|
-
* @throws {Error} If image processing fails, model loading fails, or language is unsupported
|
|
130
|
-
*
|
|
131
|
-
* @example
|
|
132
|
-
* ```typescript
|
|
133
|
-
* const backend = new TesseractWasmBackend();
|
|
134
|
-
* await backend.initialize();
|
|
135
|
-
*
|
|
136
|
-
* const imageBuffer = fs.readFileSync('scanned.png');
|
|
137
|
-
* const result = await backend.processImage(
|
|
138
|
-
* new Uint8Array(imageBuffer),
|
|
139
|
-
* 'eng'
|
|
140
|
-
* );
|
|
141
|
-
*
|
|
142
|
-
* console.log(result.content); // Extracted text
|
|
143
|
-
* console.log(result.metadata.confidence); // OCR confidence score
|
|
144
|
-
* ```
|
|
145
|
-
*/
|
|
146
|
-
processImage(imageBytes: Uint8Array | string, language: string): Promise<{
|
|
147
|
-
content: string;
|
|
148
|
-
mime_type: string;
|
|
149
|
-
metadata: Record<string, unknown>;
|
|
150
|
-
tables: unknown[];
|
|
151
|
-
}>;
|
|
152
|
-
/**
|
|
153
|
-
* Shutdown the OCR backend and release resources
|
|
154
|
-
*
|
|
155
|
-
* Properly cleans up the Tesseract WASM client, freeing memory and Web Workers.
|
|
156
|
-
* Called when the backend is unregistered or the application shuts down.
|
|
157
|
-
*
|
|
158
|
-
* @throws {Error} If cleanup fails (errors are logged but not critical)
|
|
159
|
-
*
|
|
160
|
-
* @example
|
|
161
|
-
* ```typescript
|
|
162
|
-
* const backend = new TesseractWasmBackend();
|
|
163
|
-
* await backend.initialize();
|
|
164
|
-
* // ... use backend ...
|
|
165
|
-
* await backend.shutdown(); // Clean up resources
|
|
166
|
-
* ```
|
|
167
|
-
*/
|
|
168
|
-
shutdown(): Promise<void>;
|
|
169
|
-
/**
|
|
170
|
-
* Set a progress callback for UI updates
|
|
171
|
-
*
|
|
172
|
-
* Allows the UI to display progress during OCR processing.
|
|
173
|
-
* The callback will be called with values from 0 to 100.
|
|
174
|
-
*
|
|
175
|
-
* @param callback - Function to call with progress percentage
|
|
176
|
-
*
|
|
177
|
-
* @example
|
|
178
|
-
* ```typescript
|
|
179
|
-
* const backend = new TesseractWasmBackend();
|
|
180
|
-
* backend.setProgressCallback((progress) => {
|
|
181
|
-
* console.log(`OCR Progress: ${progress}%`);
|
|
182
|
-
* document.getElementById('progress-bar').style.width = `${progress}%`;
|
|
183
|
-
* });
|
|
184
|
-
* ```
|
|
185
|
-
*/
|
|
186
|
-
setProgressCallback(callback: (progress: number) => void): void;
|
|
187
|
-
/**
|
|
188
|
-
* Load language model from CDN
|
|
189
|
-
*
|
|
190
|
-
* Fetches the training data for a specific language from jsDelivr CDN.
|
|
191
|
-
* This is an MVP approach - models are cached by the browser.
|
|
192
|
-
*
|
|
193
|
-
* @param language - ISO 639-2/3 language code
|
|
194
|
-
* @throws {Error} If model download fails or language is not available
|
|
195
|
-
*
|
|
196
|
-
* @internal
|
|
197
|
-
*/
|
|
198
|
-
private loadLanguageModel;
|
|
199
|
-
/**
|
|
200
|
-
* Convert image bytes or Base64 string to ImageBitmap
|
|
201
|
-
*
|
|
202
|
-
* Handles both Uint8Array and Base64-encoded image data, converting to
|
|
203
|
-
* ImageBitmap format required by Tesseract WASM.
|
|
204
|
-
*
|
|
205
|
-
* @param imageBytes - Image data as Uint8Array or Base64 string
|
|
206
|
-
* @returns Promise resolving to ImageBitmap
|
|
207
|
-
* @throws {Error} If conversion fails (browser API not available or invalid image data)
|
|
208
|
-
*
|
|
209
|
-
* @internal
|
|
210
|
-
*/
|
|
211
|
-
private convertToImageBitmap;
|
|
212
|
-
/**
|
|
213
|
-
* Get confidence score from OCR result
|
|
214
|
-
*
|
|
215
|
-
* Attempts to retrieve confidence score from Tesseract.
|
|
216
|
-
* Returns a safe default if unavailable.
|
|
217
|
-
*
|
|
218
|
-
* @returns Confidence score between 0 and 1
|
|
219
|
-
*
|
|
220
|
-
* @internal
|
|
221
|
-
*/
|
|
222
|
-
private getConfidenceScore;
|
|
223
|
-
/**
|
|
224
|
-
* Get page metadata from OCR result
|
|
225
|
-
*
|
|
226
|
-
* Retrieves additional metadata like image dimensions and processing info.
|
|
227
|
-
*
|
|
228
|
-
* @returns Metadata object (may be empty if unavailable)
|
|
229
|
-
*
|
|
230
|
-
* @internal
|
|
231
|
-
*/
|
|
232
|
-
private getPageMetadata;
|
|
233
|
-
/**
|
|
234
|
-
* Dynamically load tesseract-wasm module
|
|
235
|
-
*
|
|
236
|
-
* Uses dynamic import to load tesseract-wasm only when needed,
|
|
237
|
-
* avoiding hard dependency in browser environments where it may not be bundled.
|
|
238
|
-
*
|
|
239
|
-
* @returns tesseract-wasm module object
|
|
240
|
-
* @throws {Error} If module cannot be imported
|
|
241
|
-
*
|
|
242
|
-
* @internal
|
|
243
|
-
*/
|
|
244
|
-
private loadTesseractWasm;
|
|
245
|
-
/**
|
|
246
|
-
* Report progress to progress callback
|
|
247
|
-
*
|
|
248
|
-
* Internal helper for notifying progress updates during OCR processing.
|
|
249
|
-
*
|
|
250
|
-
* @param progress - Progress percentage (0-100)
|
|
251
|
-
*
|
|
252
|
-
* @internal
|
|
253
|
-
*/
|
|
254
|
-
private reportProgress;
|
|
255
|
-
}
|
|
256
|
-
|
|
257
|
-
export { TesseractWasmBackend };
|