@kreuzberg/wasm 4.0.0-rc.21 → 4.0.0-rc.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +520 -837
- package/dist/adapters/wasm-adapter.d.ts +7 -10
- package/dist/adapters/wasm-adapter.d.ts.map +1 -0
- package/dist/adapters/wasm-adapter.js +41 -19
- package/dist/adapters/wasm-adapter.js.map +1 -1
- package/dist/index.d.ts +23 -24
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +240 -67
- package/dist/index.js.map +1 -1
- package/dist/ocr/registry.d.ts +7 -10
- package/dist/ocr/registry.d.ts.map +1 -0
- package/dist/ocr/registry.js.map +1 -1
- package/dist/ocr/tesseract-wasm-backend.d.ts +3 -6
- package/dist/ocr/tesseract-wasm-backend.d.ts.map +1 -0
- package/dist/ocr/tesseract-wasm-backend.js +0 -46
- package/dist/ocr/tesseract-wasm-backend.js.map +1 -1
- package/dist/pdfium.js +0 -5
- package/dist/plugin-registry.d.ts +246 -0
- package/dist/plugin-registry.d.ts.map +1 -0
- package/dist/runtime.d.ts +21 -22
- package/dist/runtime.d.ts.map +1 -0
- package/dist/runtime.js +0 -1
- package/dist/runtime.js.map +1 -1
- package/dist/{types-CKjcIYcX.d.ts → types.d.ts} +91 -22
- package/dist/types.d.ts.map +1 -0
- package/package.json +119 -162
- package/dist/adapters/wasm-adapter.cjs +0 -245
- package/dist/adapters/wasm-adapter.cjs.map +0 -1
- package/dist/adapters/wasm-adapter.d.cts +0 -121
- package/dist/index.cjs +0 -1245
- package/dist/index.cjs.map +0 -1
- package/dist/index.d.cts +0 -423
- package/dist/ocr/registry.cjs +0 -92
- package/dist/ocr/registry.cjs.map +0 -1
- package/dist/ocr/registry.d.cts +0 -102
- package/dist/ocr/tesseract-wasm-backend.cjs +0 -456
- package/dist/ocr/tesseract-wasm-backend.cjs.map +0 -1
- package/dist/ocr/tesseract-wasm-backend.d.cts +0 -257
- package/dist/runtime.cjs +0 -174
- package/dist/runtime.cjs.map +0 -1
- package/dist/runtime.d.cts +0 -256
- package/dist/types-CKjcIYcX.d.cts +0 -294
package/dist/index.cjs
DELETED
|
@@ -1,1245 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
// Bundler-generated CommonJS/ESM interop helpers.
// Short aliases for the Object built-ins the helpers below rely on.
const __create = Object.create;
const __defProp = Object.defineProperty;
const __getOwnPropDesc = Object.getOwnPropertyDescriptor;
const __getOwnPropNames = Object.getOwnPropertyNames;
const __getProtoOf = Object.getPrototypeOf;
const __hasOwnProp = Object.prototype.hasOwnProperty;

/**
 * Define every entry of `all` on `target` as an enumerable getter property.
 *
 * @param {object} target - Object that receives the getters.
 * @param {Record<string, Function>} all - Map of property name to getter function.
 */
const __export = (target, all) => {
  for (const name in all) {
    __defProp(target, name, { get: all[name], enumerable: true });
  }
};

/**
 * Copy the own properties of `from` onto `to` as live getters.
 *
 * Properties that `to` already owns, and the property named `except`, are
 * skipped. Enumerability is taken from the source descriptor when one exists.
 *
 * @param {object} to - Destination object.
 * @param {*} from - Source object or function; other values are ignored.
 * @param {string} [except] - Property name that must not be copied.
 * @param {PropertyDescriptor} [desc] - Scratch slot for the source descriptor.
 * @returns {object} `to`.
 */
const __copyProps = (to, from, except, desc) => {
  if ((from && typeof from === "object") || typeof from === "function") {
    for (const key of __getOwnPropNames(from)) {
      if (!__hasOwnProp.call(to, key) && key !== except) {
        __defProp(to, key, {
          get: () => from[key],
          enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable,
        });
      }
    }
  }
  return to;
};

/**
 * Wrap a loaded module in an ESM-shaped namespace object.
 *
 * If the importer is in node compatibility mode or this is not an ESM file
 * that has been converted to a CommonJS file using a Babel-compatible
 * transform (i.e. "__esModule" has not been set), then "default" is set to
 * the CommonJS "module.exports" for node compatibility.
 *
 * @param {*} mod - The loaded module value.
 * @param {boolean} [isNodeMode] - Force the node-compatible `default` export.
 * @param {object} [target] - Scratch slot for the namespace being built.
 * @returns {object} Namespace object exposing the properties of `mod`.
 */
const __toESM = (mod, isNodeMode, target) => {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  const namespace =
    isNodeMode || !mod || !mod.__esModule
      ? __defProp(target, "default", { value: mod, enumerable: true })
      : target;
  return __copyProps(namespace, mod);
};

/**
 * Build a CommonJS exports object (flagged with `__esModule: true`) that
 * mirrors the properties of `mod` through getters.
 *
 * @param {object} mod - Export table to mirror.
 * @returns {object} Object suitable for assignment to `module.exports`.
 */
const __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
29
|
-
|
|
30
|
-
// typescript/index.ts
|
|
31
|
-
// Public export table of the package entry point. Each entry is a lazy getter,
// so the bindings it names may be defined later in the file.
const index_exports = {};
__export(index_exports, {
  TesseractWasmBackend: () => TesseractWasmBackend,
  batchExtractBytes: () => batchExtractBytes,
  batchExtractBytesSync: () => batchExtractBytesSync,
  batchExtractFiles: () => batchExtractFiles,
  clearOcrBackends: () => clearOcrBackends,
  configToJS: () => configToJS,
  detectRuntime: () => detectRuntime,
  enableOcr: () => enableOcr,
  extractBytes: () => extractBytes,
  extractBytesSync: () => extractBytesSync,
  extractFile: () => extractFile,
  extractFromFile: () => extractFromFile,
  fileToUint8Array: () => fileToUint8Array,
  getInitializationError: () => getInitializationError,
  getOcrBackend: () => getOcrBackend,
  getRuntimeInfo: () => getRuntimeInfo,
  getRuntimeVersion: () => getRuntimeVersion,
  getVersion: () => getVersion,
  getWasmCapabilities: () => getWasmCapabilities,
  hasBigInt: () => hasBigInt,
  hasBlob: () => hasBlob,
  hasFileApi: () => hasFileApi,
  hasModuleWorkers: () => hasModuleWorkers,
  hasSharedArrayBuffer: () => hasSharedArrayBuffer,
  hasWasm: () => hasWasm,
  hasWasmStreaming: () => hasWasmStreaming,
  hasWorkers: () => hasWorkers,
  initWasm: () => initWasm,
  isBrowser: () => isBrowser,
  isBun: () => isBun,
  isDeno: () => isDeno,
  isInitialized: () => isInitialized,
  isNode: () => isNode,
  isServerEnvironment: () => isServerEnvironment,
  isValidExtractionResult: () => isValidExtractionResult,
  isWebEnvironment: () => isWebEnvironment,
  jsToExtractionResult: () => jsToExtractionResult,
  listOcrBackends: () => listOcrBackends,
  registerOcrBackend: () => registerOcrBackend,
  unregisterOcrBackend: () => unregisterOcrBackend,
  wrapWasmError: () => wrapWasmError,
});
module.exports = __toCommonJS(index_exports);
|
|
76
|
-
|
|
77
|
-
// typescript/adapters/wasm-adapter.ts
|
|
78
|
-
/** Largest file size, in bytes, accepted by fileToUint8Array (512 MB). */
const MAX_FILE_SIZE = 512 * 1024 * 1024;
|
|
79
|
-
/**
 * Check whether a value is a number (including NaN) or exactly `null`.
 *
 * @param {*} value - Value to test.
 * @returns {boolean} True for numbers and `null`; false otherwise (including `undefined`).
 */
function isNumberOrNull(value) {
  return value === null || typeof value === "number";
}
|
|
82
|
-
/**
 * Check whether a value is a string or exactly `null`.
 *
 * @param {*} value - Value to test.
 * @returns {boolean} True for strings and `null`; false otherwise (including `undefined`).
 */
function isStringOrNull(value) {
  return value === null || typeof value === "string";
}
|
|
85
|
-
/**
 * Check whether a value is a primitive boolean.
 *
 * @param {*} value - Value to test.
 * @returns {boolean} True only for `true` and `false`.
 */
function isBoolean(value) {
  return typeof value === "boolean";
}
|
|
88
|
-
/**
 * Read a File/Blob-like object fully into memory as a Uint8Array.
 *
 * @param {{ size: number, arrayBuffer: () => Promise<ArrayBuffer> }} file - Object to read.
 * @returns {Promise<Uint8Array>} The file's bytes.
 * @throws {Error} "Failed to read file: ..." when the file is larger than
 *   MAX_FILE_SIZE or reading fails. The underlying error is attached as `cause`.
 */
async function fileToUint8Array(file) {
  try {
    if (file.size > MAX_FILE_SIZE) {
      throw new Error(
        `File size (${file.size} bytes) exceeds maximum (${MAX_FILE_SIZE} bytes). Maximum file size is 512 MB.`
      );
    }
    const buffer = await file.arrayBuffer();
    return new Uint8Array(buffer);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    // Keep the original error reachable so callers can inspect the root failure.
    throw new Error(`Failed to read file: ${reason}`, { cause: error });
  }
}
|
|
101
|
-
/**
 * Normalize a configuration object before it is passed on.
 *
 * Null and undefined properties are removed recursively, and nested plain
 * objects that end up empty are removed as well. Arrays keep their length:
 * absent elements become `null`. Binary values (typed arrays, DataViews and
 * ArrayBuffers) are passed through untouched rather than being walked as
 * key/value objects.
 *
 * @param {object|null|undefined} config - Configuration to normalize.
 * @returns {object} A new normalized object; `{}` when `config` is falsy.
 */
function configToJS(config) {
  if (!config) {
    return {};
  }

  const isBinary = (value) => ArrayBuffer.isView(value) || value instanceof ArrayBuffer;

  // Returns null for anything that should be dropped by the caller.
  const normalizeValue = (value) => {
    if (value === null || value === undefined) {
      return null;
    }
    if (typeof value !== "object" || isBinary(value)) {
      return value;
    }
    if (Array.isArray(value)) {
      return value.map((item) => normalizeValue(item));
    }
    const cleaned = {};
    for (const [key, nested] of Object.entries(value)) {
      const normalizedNested = normalizeValue(nested);
      if (normalizedNested !== null) {
        cleaned[key] = normalizedNested;
      }
    }
    return Object.keys(cleaned).length > 0 ? cleaned : null;
  };

  const normalized = {};
  for (const [key, value] of Object.entries(config)) {
    const normalizedValue = normalizeValue(value);
    if (normalizedValue !== null) {
      normalized[key] = normalizedValue;
    }
  }
  return normalized;
}
|
|
134
|
-
/**
 * Collect the well-formed tables from a raw `tables` value.
 *
 * Malformed entries are skipped rather than rejected. The page number is read
 * from `pageNumber`, falling back to snake_case `page_number`.
 */
function toExtractionTables(rawTables) {
  const tables = [];
  if (!Array.isArray(rawTables)) {
    return tables;
  }
  for (const table of rawTables) {
    if (!table || typeof table !== "object") {
      continue;
    }
    const pageNumber = typeof table.pageNumber === "number" ? table.pageNumber : table.page_number;
    const hasStringGrid =
      Array.isArray(table.cells) &&
      table.cells.every((row) => Array.isArray(row) && row.every((cell) => typeof cell === "string"));
    if (hasStringGrid && typeof table.markdown === "string" && typeof pageNumber === "number") {
      tables.push({ cells: table.cells, markdown: table.markdown, pageNumber });
    }
  }
  return tables;
}

/** Validate one raw chunk and return it in normalized form; throws on any invalid field. */
function toExtractionChunk(chunk) {
  if (!chunk || typeof chunk !== "object") {
    throw new Error("Invalid chunk structure");
  }
  if (typeof chunk.content !== "string") {
    throw new Error("Invalid chunk: missing content");
  }
  if (!chunk.metadata || typeof chunk.metadata !== "object") {
    throw new Error("Invalid chunk: missing metadata");
  }
  const { metadata } = chunk;

  let embedding = null;
  if (Array.isArray(chunk.embedding)) {
    if (!chunk.embedding.every((item) => typeof item === "number")) {
      throw new Error("Invalid chunk: embedding must contain only numbers");
    }
    embedding = chunk.embedding;
  }

  if (typeof metadata.charStart !== "number") {
    throw new Error("Invalid chunk metadata: charStart must be a number");
  }
  if (typeof metadata.charEnd !== "number") {
    throw new Error("Invalid chunk metadata: charEnd must be a number");
  }
  if (!isNumberOrNull(metadata.tokenCount)) {
    throw new Error("Invalid chunk metadata: tokenCount must be a number or null");
  }
  if (typeof metadata.chunkIndex !== "number") {
    throw new Error("Invalid chunk metadata: chunkIndex must be a number");
  }
  if (typeof metadata.totalChunks !== "number") {
    throw new Error("Invalid chunk metadata: totalChunks must be a number");
  }

  return {
    content: chunk.content,
    embedding,
    metadata: {
      charStart: metadata.charStart,
      charEnd: metadata.charEnd,
      tokenCount: metadata.tokenCount,
      chunkIndex: metadata.chunkIndex,
      totalChunks: metadata.totalChunks,
    },
  };
}

/** Validate one raw image and return it in normalized form; throws on any invalid field. */
function toExtractionImage(image) {
  if (!image || typeof image !== "object") {
    throw new Error("Invalid image structure");
  }
  if (!(image.data instanceof Uint8Array)) {
    throw new Error("Invalid image: data must be Uint8Array");
  }
  if (typeof image.format !== "string") {
    throw new Error("Invalid image: missing format");
  }
  if (typeof image.imageIndex !== "number") {
    throw new Error("Invalid image: imageIndex must be a number");
  }
  if (!isNumberOrNull(image.pageNumber)) {
    throw new Error("Invalid image: pageNumber must be a number or null");
  }
  if (!isNumberOrNull(image.width)) {
    throw new Error("Invalid image: width must be a number or null");
  }
  if (!isNumberOrNull(image.height)) {
    throw new Error("Invalid image: height must be a number or null");
  }
  if (!isNumberOrNull(image.bitsPerComponent)) {
    throw new Error("Invalid image: bitsPerComponent must be a number or null");
  }
  if (!isBoolean(image.isMask)) {
    throw new Error("Invalid image: isMask must be a boolean");
  }
  if (!isStringOrNull(image.colorspace)) {
    throw new Error("Invalid image: colorspace must be a string or null");
  }
  if (!isStringOrNull(image.description)) {
    throw new Error("Invalid image: description must be a string or null");
  }
  return {
    data: image.data,
    format: image.format,
    imageIndex: image.imageIndex,
    pageNumber: image.pageNumber,
    width: image.width,
    height: image.height,
    colorspace: image.colorspace,
    bitsPerComponent: image.bitsPerComponent,
    isMask: image.isMask,
    description: image.description,
    // A nested OCR result has the same shape as a top-level result.
    ocrResult: image.ocrResult ? jsToExtractionResult(image.ocrResult) : null,
  };
}

/**
 * Validate a raw extraction result and convert it to the normalized shape.
 *
 * `mimeType` / `mime_type`, `detectedLanguages` / `detected_languages` and a
 * table's `pageNumber` / `page_number` are accepted in either spelling.
 *
 * @param {*} jsValue - Raw result value.
 * @returns {{content: string, mimeType: string, metadata: object, tables: object[],
 *   detectedLanguages: string[]|null, chunks: object[]|null, images: object[]|null}}
 *   `chunks` and `images` are null when the input has no such array; malformed
 *   tables are skipped.
 * @throws {Error} When the value, its content, MIME type or metadata is
 *   missing or invalid, or when any chunk, image or detected language is invalid.
 */
function jsToExtractionResult(jsValue) {
  if (!jsValue || typeof jsValue !== "object") {
    throw new Error("Invalid extraction result: value is not an object");
  }
  const result = jsValue;

  let mimeType = null;
  if (typeof result.mimeType === "string") {
    mimeType = result.mimeType;
  } else if (typeof result.mime_type === "string") {
    mimeType = result.mime_type;
  }

  if (typeof result.content !== "string") {
    throw new Error("Invalid extraction result: missing or invalid content");
  }
  if (typeof mimeType !== "string") {
    throw new Error("Invalid extraction result: missing or invalid mimeType");
  }
  if (!result.metadata || typeof result.metadata !== "object") {
    throw new Error("Invalid extraction result: missing or invalid metadata");
  }

  const tables = toExtractionTables(result.tables);
  const chunks = Array.isArray(result.chunks) ? result.chunks.map((chunk) => toExtractionChunk(chunk)) : null;
  const images = Array.isArray(result.images) ? result.images.map((image) => toExtractionImage(image)) : null;

  let detectedLanguages = null;
  const detectedLanguagesRaw = Array.isArray(result.detectedLanguages)
    ? result.detectedLanguages
    : result.detected_languages;
  if (Array.isArray(detectedLanguagesRaw)) {
    if (!detectedLanguagesRaw.every((lang) => typeof lang === "string")) {
      throw new Error("Invalid result: detectedLanguages must contain only strings");
    }
    detectedLanguages = detectedLanguagesRaw;
  }

  return {
    content: result.content,
    mimeType,
    metadata: result.metadata,
    tables,
    detectedLanguages,
    chunks,
    images,
  };
}
|
|
277
|
-
/**
 * Wrap a caught value in an Error whose message carries call-site context.
 *
 * @param {*} error - The caught value; Error instances contribute their
 *   message, anything else is stringified.
 * @param {string} context - Description of what was being attempted.
 * @returns {Error} Error with message `Error <context>: <detail>` and the
 *   caught value attached as `cause`.
 */
function wrapWasmError(error, context) {
  const detail = error instanceof Error ? error.message : String(error);
  // Attach the cause for non-Error values too, so the thrown value is never lost.
  return new Error(`Error ${context}: ${detail}`, { cause: error });
}
|
|
286
|
-
/**
 * Structural check for an extraction result.
 *
 * @param {*} value - Value to test.
 * @returns {boolean} True when `value` is an object with string `content`, a
 *   string `mimeType` or `mime_type`, a non-null object `metadata` and an
 *   array `tables`.
 */
function isValidExtractionResult(value) {
  if (!value || typeof value !== "object") {
    return false;
  }
  const hasMimeType = typeof value.mimeType === "string" || typeof value.mime_type === "string";
  const hasMetadata = value.metadata !== null && typeof value.metadata === "object";
  return typeof value.content === "string" && hasMimeType && hasMetadata && Array.isArray(value.tables);
}
|
|
293
|
-
|
|
294
|
-
// typescript/ocr/registry.ts
|
|
295
|
-
/** Registered OCR backends, keyed by the string each backend returns from name(). */
const ocrBackendRegistry = /* @__PURE__ */ new Map();
|
|
296
|
-
/**
 * Register an OCR backend under the name it reports.
 *
 * A backend already registered under the same name is replaced, and a warning
 * is logged.
 *
 * @param {object} backend - Object implementing name(), supportedLanguages()
 *   and processImage().
 * @throws {Error} When the backend is missing, lacks a required method, or
 *   name() does not return a non-empty string.
 */
function registerOcrBackend(backend) {
  if (!backend) {
    throw new Error("Backend cannot be null or undefined");
  }
  // Checked in this order so the first missing method is the one reported.
  for (const method of ["name", "supportedLanguages", "processImage"]) {
    if (typeof backend[method] !== "function") {
      throw new Error(`Backend must implement ${method}() method`);
    }
  }
  const backendName = backend.name();
  if (!backendName || typeof backendName !== "string") {
    throw new Error("Backend name must be a non-empty string");
  }
  if (ocrBackendRegistry.has(backendName)) {
    console.warn(`OCR backend "${backendName}" is already registered and will be replaced`);
  }
  ocrBackendRegistry.set(backendName, backend);
}
|
|
318
|
-
/**
 * Look up a registered OCR backend.
 *
 * @param {string} name - Name the backend was registered under.
 * @returns {object|undefined} The backend, or undefined when none is registered under `name`.
 */
function getOcrBackend(name) {
  return ocrBackendRegistry.get(name);
}
|
|
321
|
-
/**
 * List the names of all registered OCR backends.
 *
 * @returns {string[]} A new array of names in registration order.
 */
function listOcrBackends() {
  return [...ocrBackendRegistry.keys()];
}
|
|
324
|
-
/**
 * Shut down and remove a registered OCR backend.
 *
 * A failure inside the backend's shutdown() is logged as a warning and does
 * not prevent the backend from being removed.
 *
 * @param {string} name - Name the backend was registered under.
 * @returns {Promise<void>}
 * @throws {Error} When no backend is registered under `name`.
 */
async function unregisterOcrBackend(name) {
  const backend = ocrBackendRegistry.get(name);
  if (!backend) {
    const available = Array.from(ocrBackendRegistry.keys()).join(", ");
    throw new Error(`OCR backend "${name}" is not registered. Available backends: ${available}`);
  }
  if (typeof backend.shutdown === "function") {
    try {
      await backend.shutdown();
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      console.warn(`Error shutting down OCR backend "${name}": ${reason}`);
    }
  }
  ocrBackendRegistry.delete(name);
}
|
|
342
|
-
/**
 * Shut down every registered OCR backend, one after another, then empty the
 * registry.
 *
 * A failure inside a backend's shutdown() is logged as a warning and does not
 * stop the remaining backends from being shut down.
 *
 * @returns {Promise<void>}
 */
async function clearOcrBackends() {
  // Iterate over a snapshot so the loop is unaffected by registry changes
  // made while a shutdown is awaited.
  const registered = [...ocrBackendRegistry.entries()];
  for (const [name, backend] of registered) {
    if (typeof backend.shutdown !== "function") {
      continue;
    }
    try {
      await backend.shutdown();
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      console.warn(`Error shutting down OCR backend "${name}": ${reason}`);
    }
  }
  ocrBackendRegistry.clear();
}
|
|
357
|
-
|
|
358
|
-
// typescript/ocr/tesseract-wasm-backend.ts
|
|
359
|
-
/**
 * OCR backend built on the `tesseract-wasm` package.
 *
 * The client is created by initialize(); language models are loaded on demand
 * by processImage().
 */
class TesseractWasmBackend {
  /** Tesseract WASM client instance (null until initialize() succeeds) */
  client = null;
  /** Languages whose model has been loaded at least once by this instance */
  loadedLanguages = /* @__PURE__ */ new Set();
  /**
   * Language of the model most recently loaded into the client, or null.
   * NOTE(review): assumes the client keeps a single active model, so switching
   * back to an earlier language requires a reload - confirm against tesseract-wasm.
   */
  activeLanguage = null;
  /** Cache for the supported-language list */
  supportedLangsCache = null;
  /** Progress callback for UI updates */
  progressCallback = null;
  /** Base URL for training data CDN */
  CDN_BASE_URL = "https://cdn.jsdelivr.net/npm/tesseract-wasm@0.11.0/dist";

  /**
   * Return the unique name of this OCR backend
   *
   * @returns Backend identifier "tesseract-wasm"
   */
  name() {
    return "tesseract-wasm";
  }

  /**
   * Return list of supported language codes
   *
   * Returns a curated list of language codes; the list is built once and the
   * same array instance is returned on later calls.
   *
   * @returns Array of language codes accepted by processImage()
   */
  supportedLanguages() {
    if (this.supportedLangsCache) {
      return this.supportedLangsCache;
    }
    this.supportedLangsCache = [
      // Major languages
      "eng", // English
      "deu", // German
      "fra", // French
      "spa", // Spanish
      "ita", // Italian
      "por", // Portuguese
      "nld", // Dutch
      "rus", // Russian
      "jpn", // Japanese
      "kor", // Korean
      "chi_sim", // Chinese (Simplified)
      "chi_tra", // Chinese (Traditional)
      // Additional European languages
      "pol", // Polish
      "tur", // Turkish
      "swe", // Swedish
      "dan", // Danish
      "fin", // Finnish
      "nor", // Norwegian
      "ces", // Czech
      "slk", // Slovak
      "ron", // Romanian
      "hun", // Hungarian
      "hrv", // Croatian
      "srp", // Serbian
      "bul", // Bulgarian
      "ukr", // Ukrainian
      "ell", // Greek
      // Further languages (order kept as-is: it is visible in error messages)
      "ara", // Arabic
      "heb", // Hebrew
      "hin", // Hindi
      "tha", // Thai
      "vie", // Vietnamese
      "mkd", // Macedonian
      "ben", // Bengali
      "tam", // Tamil
      "tel", // Telugu
      "kan", // Kannada
      "mal", // Malayalam
      "mya", // Burmese
      "khm", // Khmer
      "lao", // Lao
      "sin", // Sinhala
    ];
    return this.supportedLangsCache;
  }

  /**
   * Initialize the OCR backend
   *
   * Imports tesseract-wasm and creates its OCRClient. Calling this again while
   * a client exists is a no-op. Model loading happens later, in processImage().
   *
   * @throws {Error} If tesseract-wasm cannot be imported, does not expose
   *   OCRClient, or the client cannot be constructed (original error in `cause`)
   */
  async initialize() {
    if (this.client) {
      return;
    }
    try {
      const tesseractModule = await this.loadTesseractWasm();
      if (!tesseractModule || typeof tesseractModule.OCRClient !== "function") {
        throw new Error("tesseract-wasm OCRClient not found. Ensure tesseract-wasm is installed and available.");
      }
      this.client = new tesseractModule.OCRClient();
      // A fresh client has no model loaded.
      this.loadedLanguages.clear();
      this.activeLanguage = null;
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      throw new Error(`Failed to initialize TesseractWasmBackend: ${message}`, { cause: error });
    }
  }

  /**
   * Process image bytes and extract text via OCR
   *
   * Loads the model for `language` unless it is already the active one,
   * decodes the image, runs recognition and reports progress (0-100) to the
   * progress callback. The decoded ImageBitmap is released before returning.
   *
   * @param imageBytes - Raw image data (Uint8Array) or Base64-encoded string
   * @param language - Language code from supportedLanguages() (case-insensitive)
   * @returns Promise resolving to `{ content, mime_type, metadata, tables }`
   * @throws {Error} If the backend is not initialized, the language is not
   *   supported, or any processing step fails (original error in `cause`)
   */
  async processImage(imageBytes, language) {
    if (!this.client) {
      throw new Error("TesseractWasmBackend not initialized. Call initialize() first.");
    }
    const supported = this.supportedLanguages();
    const normalizedLang = language.toLowerCase();
    // Every supported code is lowercase, so a plain membership test suffices.
    if (!supported.includes(normalizedLang)) {
      throw new Error(`Language "${language}" is not supported. Supported languages: ${supported.join(", ")}`);
    }

    let imageBitmap = null;
    try {
      if (this.activeLanguage !== normalizedLang) {
        this.reportProgress(10);
        // Treat the model state as unknown while a load is in flight, so a
        // failed load forces a reload on the next call.
        this.activeLanguage = null;
        await this.loadLanguageModel(normalizedLang);
        this.loadedLanguages.add(normalizedLang);
        this.activeLanguage = normalizedLang;
        this.reportProgress(30);
      }
      this.reportProgress(40);
      imageBitmap = await this.convertToImageBitmap(imageBytes);
      this.reportProgress(50);
      await this.client.loadImage(imageBitmap);
      this.reportProgress(70);
      const text = await this.client.getText();
      const confidence = await this.getConfidenceScore();
      const pageMetadata = await this.getPageMetadata();
      this.reportProgress(90);
      return {
        content: text,
        mime_type: "text/plain",
        metadata: {
          language: normalizedLang,
          confidence,
          ...pageMetadata,
        },
        // Tesseract-wasm doesn't provide structured table detection
        tables: [],
      };
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      throw new Error(`OCR processing failed for language "${language}": ${message}`, { cause: error });
    } finally {
      // Release the decoded pixels; without this every call leaks a bitmap.
      if (imageBitmap && typeof imageBitmap.close === "function") {
        imageBitmap.close();
      }
      this.reportProgress(100);
    }
  }

  /**
   * Shutdown the OCR backend and release resources
   *
   * Resets all instance state first, then calls the client's destroy() and
   * terminate() when they exist. Cleanup failures are logged as warnings and
   * never thrown.
   */
  async shutdown() {
    const client = this.client;
    // Reset state up front so a failing destroy()/terminate() cannot leave a
    // half-torn-down client behind.
    this.client = null;
    this.loadedLanguages.clear();
    this.activeLanguage = null;
    this.supportedLangsCache = null;
    this.progressCallback = null;
    if (!client) {
      return;
    }
    try {
      // Awaiting is harmless for synchronous implementations and surfaces
      // rejections from asynchronous ones.
      if (typeof client.destroy === "function") {
        await client.destroy();
      }
      if (typeof client.terminate === "function") {
        await client.terminate();
      }
    } catch (error) {
      console.warn(
        `Warning during TesseractWasmBackend shutdown: ${error instanceof Error ? error.message : String(error)}`
      );
    }
  }

  /**
   * Set a progress callback for UI updates
   *
   * The callback is invoked during processImage() with values clamped to 0-100.
   *
   * @param callback - Function to call with progress percentage
   */
  setProgressCallback(callback) {
    this.progressCallback = callback;
  }

  /**
   * Load language model from CDN
   *
   * Asks the client to load `<CDN_BASE_URL>/<language>.traineddata`.
   *
   * @param language - Lowercase language code
   * @throws {Error} If the client is missing or the model cannot be loaded
   *
   * @internal
   */
  async loadLanguageModel(language) {
    if (!this.client) {
      throw new Error("Client not initialized");
    }
    const modelUrl = `${this.CDN_BASE_URL}/${language}.traineddata`;
    try {
      await this.client.loadModel(modelUrl);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      throw new Error(`Failed to load model for language "${language}" from ${modelUrl}: ${message}`, {
        cause: error,
      });
    }
  }

  /**
   * Convert image bytes or Base64 string to ImageBitmap
   *
   * @param imageBytes - Image data as Uint8Array or Base64 string
   * @returns Promise resolving to ImageBitmap
   * @throws {Error} If createImageBitmap is unavailable or decoding fails
   *
   * @internal
   */
  async convertToImageBitmap(imageBytes) {
    if (typeof createImageBitmap === "undefined") {
      throw new Error("createImageBitmap is not available. TesseractWasmBackend requires a browser environment.");
    }
    try {
      let bytes = imageBytes;
      if (typeof imageBytes === "string") {
        // atob yields one character per byte; map each char code to a byte.
        bytes = Uint8Array.from(atob(imageBytes), (char) => char.charCodeAt(0));
      }
      return await createImageBitmap(new Blob([bytes]));
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      throw new Error(`Failed to convert image bytes to ImageBitmap: ${message}`, { cause: error });
    }
  }

  /**
   * Get confidence score from OCR result
   *
   * Uses the client's getConfidence() when it exists; values above 1 are
   * treated as percentages and divided by 100. Falls back to 0.9 when the
   * method is missing, fails, or returns something that is not a finite number.
   *
   * @returns Confidence score
   *
   * @internal
   */
  async getConfidenceScore() {
    const fallbackConfidence = 0.9;
    try {
      if (this.client && typeof this.client.getConfidence === "function") {
        const confidence = await this.client.getConfidence();
        if (typeof confidence === "number" && Number.isFinite(confidence)) {
          return confidence > 1 ? confidence / 100 : confidence;
        }
      }
    } catch {
      // Best effort: confidence is optional metadata and must not fail OCR.
    }
    return fallbackConfidence;
  }

  /**
   * Get page metadata from OCR result
   *
   * Uses the client's getPageMetadata() when it exists.
   *
   * @returns Metadata object (empty if unavailable or retrieval fails)
   *
   * @internal
   */
  async getPageMetadata() {
    try {
      if (this.client && typeof this.client.getPageMetadata === "function") {
        return await this.client.getPageMetadata();
      }
    } catch {
      // Best effort: page metadata is optional and must not fail OCR.
    }
    return {};
  }

  /**
   * Dynamically load tesseract-wasm module
   *
   * Uses a dynamic import so the package is only loaded when the backend is
   * initialized.
   *
   * @returns tesseract-wasm module object
   * @throws {Error} If module cannot be imported
   *
   * @internal
   */
  async loadTesseractWasm() {
    try {
      return await import("tesseract-wasm");
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      throw new Error(
        `Failed to import tesseract-wasm. Ensure it is installed via: npm install tesseract-wasm. Error: ${message}`,
        { cause: error }
      );
    }
  }

  /**
   * Report progress to progress callback
   *
   * The value is clamped to 0-100. Exceptions thrown by the callback are
   * ignored so a faulty UI handler cannot break OCR.
   *
   * @param progress - Progress percentage (0-100)
   *
   * @internal
   */
  reportProgress(progress) {
    if (!this.progressCallback) {
      return;
    }
    try {
      this.progressCallback(Math.min(100, Math.max(0, progress)));
    } catch {
      // Deliberately ignored - see above.
    }
  }
}
|
|
779
|
-
|
|
780
|
-
// typescript/runtime.ts
|
|
781
|
-
/**
 * Identify the JavaScript runtime the code is executing in.
 *
 * Checks run in a fixed order: a `Deno` global, then a `Bun` global, then
 * `process.versions.node`, then the presence of both `window` and `document`.
 *
 * @returns "deno", "bun", "node", "browser", or "unknown" when nothing matches
 */
function detectRuntime() {
  const scope = globalThis;
  if (typeof scope.Deno !== "undefined") {
    return "deno";
  }
  if (typeof scope.Bun !== "undefined") {
    return "bun";
  }
  const looksLikeNode = typeof process !== "undefined" && Boolean(process.versions?.node);
  if (looksLikeNode) {
    return "node";
  }
  const hasDom = typeof window !== "undefined" && typeof document !== "undefined";
  return hasDom ? "browser" : "unknown";
}
|
|
796
|
-
/**
 * @returns true when detectRuntime() reports "browser"
 */
function isBrowser() {
  const runtime = detectRuntime();
  return runtime === "browser";
}
|
|
799
|
-
/**
 * @returns true when detectRuntime() reports "node"
 */
function isNode() {
  const runtime = detectRuntime();
  return runtime === "node";
}
|
|
802
|
-
/**
 * @returns true when detectRuntime() reports "deno"
 */
function isDeno() {
  const runtime = detectRuntime();
  return runtime === "deno";
}
|
|
805
|
-
/**
 * @returns true when detectRuntime() reports "bun"
 */
function isBun() {
  const runtime = detectRuntime();
  return runtime === "bun";
}
|
|
808
|
-
/**
 * @returns true when the detected runtime is "browser"
 */
function isWebEnvironment() {
  return detectRuntime() === "browser";
}
|
|
812
|
-
/**
 * @returns true when the detected runtime is "node", "deno" or "bun"
 */
function isServerEnvironment() {
  const serverRuntimes = ["node", "deno", "bun"];
  return serverRuntimes.includes(detectRuntime());
}
|
|
816
|
-
/**
 * @returns true only when `window`, `File` and `Blob` are all defined
 */
function hasFileApi() {
  if (typeof window === "undefined") {
    return false;
  }
  if (typeof File === "undefined") {
    return false;
  }
  return typeof Blob !== "undefined";
}
|
|
819
|
-
/**
 * @returns true when a `Blob` global is defined
 */
function hasBlob() {
  const blobKind = typeof Blob;
  return blobKind !== "undefined";
}
|
|
822
|
-
/**
 * @returns true when a `Worker` global is defined
 */
function hasWorkers() {
  const workerKind = typeof Worker;
  return workerKind !== "undefined";
}
|
|
825
|
-
/**
 * @returns true when a `SharedArrayBuffer` global is defined
 */
function hasSharedArrayBuffer() {
  const sabKind = typeof SharedArrayBuffer;
  return sabKind !== "undefined";
}
|
|
828
|
-
/**
 * Detect support for module workers (`new Worker(url, { type: "module" })`).
 *
 * Constructs a throwaway worker from an empty blob script and passes an
 * options object whose `type` property is a getter. A constructor that
 * reads `type` is taken as understanding the option; one that never reads
 * it is taken as not supporting module workers. The probe worker is
 * terminated immediately and the blob URL is always revoked.
 *
 * @returns true when `Worker` exists and its constructor read the `type`
 * option; false when workers are unavailable, the option was ignored, or
 * any step of the probe threw
 */
function hasModuleWorkers() {
  if (!hasWorkers()) {
    return false;
  }
  let typeOptionRead = false;
  const probeOptions = {
    get type() {
      typeOptionRead = true;
      return "module";
    }
  };
  let workerUrl;
  try {
    const blob = new Blob([""], {
      type: "application/javascript"
    });
    workerUrl = URL.createObjectURL(blob);
    const probe = new Worker(workerUrl, probeOptions);
    probe.terminate();
    return typeOptionRead;
  } catch {
    // Missing Blob/URL support or a constructor that throws: report no support.
    return false;
  } finally {
    if (workerUrl !== undefined) {
      URL.revokeObjectURL(workerUrl);
    }
  }
}
|
|
846
|
-
/**
 * @returns true when `WebAssembly` is defined and exposes `instantiate`
 */
function hasWasm() {
  if (typeof WebAssembly === "undefined") {
    return false;
  }
  return WebAssembly.instantiate !== undefined;
}
|
|
849
|
-
/**
 * @returns true when `WebAssembly` is defined and exposes `instantiateStreaming`
 */
function hasWasmStreaming() {
  if (typeof WebAssembly === "undefined") {
    return false;
  }
  return WebAssembly.instantiateStreaming !== undefined;
}
|
|
852
|
-
/**
 * @returns true when `BigInt("1")` yields a value of type "bigint";
 * false when that check fails or throws
 */
function hasBigInt() {
  try {
    return typeof BigInt("1") === "bigint";
  } catch {
    return false;
  }
}
|
|
860
|
-
/**
 * Look up the version string of the detected runtime.
 *
 * @returns `process.version` without its first character for "node",
 * `Deno.version.deno` for "deno", `Bun.version` for "bun", and undefined
 * for any other runtime
 */
function getRuntimeVersion() {
  const runtime = detectRuntime();
  if (runtime === "node") {
    // Remove 'v' prefix
    return process.version?.substring(1);
  }
  if (runtime === "deno") {
    return globalThis.Deno?.version?.deno;
  }
  if (runtime === "bun") {
    return globalThis.Bun?.version;
  }
  return undefined;
}
|
|
879
|
-
/**
 * Collect the results of all capability probes into one object.
 *
 * @returns An object with `runtime` and one boolean per probe;
 * `runtimeVersion` is present only when a version could be determined
 */
function getWasmCapabilities() {
  const runtime = detectRuntime();
  const runtimeVersion = getRuntimeVersion();
  const report = {
    runtime,
    hasWasm: hasWasm(),
    hasWasmStreaming: hasWasmStreaming(),
    hasFileApi: hasFileApi(),
    hasBlob: hasBlob(),
    hasWorkers: hasWorkers(),
    hasSharedArrayBuffer: hasSharedArrayBuffer(),
    hasModuleWorkers: hasModuleWorkers(),
    hasBigInt: hasBigInt()
  };
  // The key is omitted entirely (not set to undefined) when no version is known.
  if (runtimeVersion !== undefined) {
    report.runtimeVersion = runtimeVersion;
  }
  return report;
}
|
|
896
|
-
/**
 * Build a summary of the current runtime.
 *
 * @returns An object with the runtime name, one boolean per runtime
 * predicate, the runtime version, the user agent ("N/A" when `navigator`
 * is undefined) and the capability report from getWasmCapabilities()
 */
function getRuntimeInfo() {
  const runtime = detectRuntime();
  const capabilities = getWasmCapabilities();
  const info = {
    runtime,
    isBrowser: isBrowser(),
    isNode: isNode(),
    isDeno: isDeno(),
    isBun: isBun(),
    isWeb: isWebEnvironment(),
    isServer: isServerEnvironment(),
    runtimeVersion: getRuntimeVersion()
  };
  if (typeof navigator !== "undefined") {
    info.userAgent = navigator.userAgent;
  } else {
    info.userAgent = "N/A";
  }
  info.capabilities = capabilities;
  return info;
}
|
|
912
|
-
|
|
913
|
-
// typescript/index.ts
|
|
914
|
-
// Module-level initialization state shared by the functions below.
// Loaded WASM module object; null until initWasm() has imported it.
let wasm = null;
// Set to true once initWasm() has finished successfully.
let initialized = false;
// Error recorded by the most recent failed initWasm() attempt, if any.
let initializationError = null;
// Promise of the initWasm() attempt, reused by later callers.
let initializationPromise = null;
|
|
918
|
-
/**
 * Try to wire PDFium into the WASM module.
 *
 * Returns without doing anything when the module lacks
 * `initialize_pdfium_render` or when not running in a browser. Failures
 * while importing or initializing PDFium are logged with console.debug and
 * not rethrown.
 *
 * @param wasmModule - Loaded WASM module object
 */
async function initializePdfiumAsync(wasmModule) {
  const canInitialize = Boolean(wasmModule) && typeof wasmModule.initialize_pdfium_render === "function";
  if (!canInitialize) {
    return;
  }
  if (!isBrowser()) {
    console.debug("PDFium initialization skipped (non-browser environment)");
    return;
  }
  try {
    const loaded = await import("./pdfium.js");
    // A function default export is called as a factory; otherwise the module
    // namespace object itself is passed on.
    let pdfium = loaded;
    if (typeof loaded.default === "function") {
      pdfium = await loaded.default();
    }
    const ok = wasmModule.initialize_pdfium_render(pdfium, wasmModule, false);
    if (!ok) {
      console.warn("PDFium initialization returned false");
    }
  } catch (failure) {
    console.debug("PDFium initialization error:", failure);
  }
}
|
|
937
|
-
/**
 * Load and initialize the Kreuzberg WASM module.
 *
 * Resolves immediately when already initialized. While an attempt is in
 * flight, later callers receive that same attempt's promise. When an
 * attempt fails, the error is recorded in `initializationError` and the
 * cached promise is cleared so that a later call starts a fresh attempt
 * instead of replaying the old rejection.
 *
 * @returns Promise that resolves once the module is ready
 * @throws The value produced by wrapWasmError when WebAssembly is
 * unavailable or the module cannot be imported or initialized
 */
async function initWasm() {
  if (initialized) {
    return;
  }
  if (initializationPromise) {
    return initializationPromise;
  }
  const attempt = (async () => {
    try {
      if (!hasWasm()) {
        throw new Error("WebAssembly is not supported in this environment");
      }
      let wasmModule;
      try {
        wasmModule = await import(
          /* @vite-ignore */
          "../pkg/kreuzberg_wasm.js"
        );
      } catch {
        // Fall back to a copy of the module located next to this file.
        wasmModule = await import(
          /* @vite-ignore */
          "./kreuzberg_wasm.js"
        );
      }
      wasm = wasmModule;
      if (wasm && typeof wasm.default === "function") {
        await wasm.default();
      }
      if (isBrowser() && wasm && typeof wasm.initialize_pdfium_render === "function") {
        // Deliberately not awaited: PDFium setup runs in the background.
        initializePdfiumAsync(wasm).catch((error) => {
          console.warn("PDFium auto-initialization failed (PDF extraction disabled):", error);
        });
      }
      initialized = true;
      initializationError = null;
    } catch (error) {
      initializationError = error instanceof Error ? error : new Error(String(error));
      throw wrapWasmError(error, "initializing Kreuzberg WASM module");
    }
  })();
  initializationPromise = attempt;
  // Clear the cache after a failure so the next call can retry. The identity
  // check avoids clobbering a newer attempt started in the meantime.
  attempt.catch(() => {
    if (initializationPromise === attempt) {
      initializationPromise = null;
    }
  });
  return attempt;
}
|
|
979
|
-
/**
 * @returns The current value of the module-level `initialized` flag
 */
function isInitialized() {
  const ready = initialized;
  return ready;
}
|
|
982
|
-
/**
 * Read the version reported by the loaded WASM module.
 *
 * @returns The result of `wasm.version()`
 * @throws {Error} If the module is not initialized or not loaded
 */
function getVersion() {
  let problem = null;
  if (!initialized) {
    problem = "WASM module not initialized. Call initWasm() first.";
  } else if (!wasm) {
    problem = "WASM module not loaded. Call initWasm() first.";
  }
  if (problem !== null) {
    throw new Error(problem);
  }
  return wasm.version();
}
|
|
991
|
-
/**
 * @returns The current value of the module-level `initializationError`
 * (null when no error is recorded)
 */
function getInitializationError() {
  const lastError = initializationError;
  return lastError;
}
|
|
994
|
-
/**
 * Extract content from an in-memory document.
 *
 * @param data - Document bytes; must be non-empty
 * @param mimeType - MIME type of the document; required
 * @param config - Optional extraction configuration (null is used when omitted)
 * @returns The WASM result converted by jsToExtractionResult
 * @throws {Error} If the module is not initialized or loaded; validation
 * and extraction failures are rethrown through wrapWasmError
 */
async function extractBytes(data, mimeType, config) {
  if (!initialized) {
    throw new Error("WASM module not initialized. Call initWasm() first.");
  }
  if (!wasm) {
    throw new Error("WASM module not loaded. Call initWasm() first.");
  }
  try {
    const isEmpty = !data || data.length === 0;
    if (isEmpty) {
      throw new Error("Document data cannot be empty");
    }
    if (!mimeType) {
      throw new Error("MIME type is required");
    }
    const jsConfig = configToJS(config ?? null);
    const rawResult = await wasm.extractBytes(data, mimeType, jsConfig);
    if (rawResult) {
      return jsToExtractionResult(rawResult);
    }
    throw new Error("Invalid extraction result: no result from WASM module");
  } catch (failure) {
    throw wrapWasmError(failure, "extracting from bytes");
  }
}
|
|
1018
|
-
/**
 * Read a file from disk and extract its content.
 *
 * The file is read with `fs/promises` on node and bun and with
 * `Deno.readFile` on deno; browsers and unknown runtimes are rejected.
 * When no MIME type is passed it is detected from the file bytes, and the
 * final type is normalized before delegating to extractBytes.
 *
 * @param path - Path of the file to read; required
 * @param mimeType - Optional MIME type override
 * @param config - Optional extraction configuration
 * @returns The result of extractBytes for the file contents
 * @throws {Error} If the module is not initialized or loaded; all other
 * failures are rethrown through wrapWasmError
 */
async function extractFile(path, mimeType, config) {
  if (!initialized) {
    throw new Error("WASM module not initialized. Call initWasm() first.");
  }
  if (!wasm) {
    throw new Error("WASM module not loaded. Call initWasm() first.");
  }
  try {
    if (!path) {
      throw new Error("File path is required");
    }
    const runtime = detectRuntime();
    if (runtime === "browser") {
      throw new Error("Use extractBytes with fileToUint8Array for browser environments");
    }
    let fileData;
    switch (runtime) {
      case "node":
      case "bun": {
        const fsPromises = await import("fs/promises");
        fileData = new Uint8Array(await fsPromises.readFile(path));
        break;
      }
      case "deno":
        fileData = await globalThis.Deno.readFile(path);
        break;
      default:
        throw new Error(`Unsupported runtime for file extraction: ${runtime}`);
    }
    // The caller-supplied type wins; otherwise detect it from the bytes.
    const resolvedMime = mimeType || wasm.detectMimeFromBytes(fileData);
    if (!resolvedMime) {
      throw new Error("Could not detect MIME type for file. Please provide mimeType parameter.");
    }
    const normalizedMime = wasm.normalizeMimeType(resolvedMime);
    return await extractBytes(fileData, normalizedMime, config);
  } catch (failure) {
    throw wrapWasmError(failure, `extracting from file: ${path}`);
  }
}
|
|
1061
|
-
/**
 * Extract content from a File or Blob.
 *
 * The MIME type is taken from the `mimeType` argument when given. Otherwise
 * a File's own `type` is used, and "application/octet-stream" is the
 * fallback both for non-File inputs and for a File whose `type` is empty.
 * The chosen type is normalized before delegating to extractBytes.
 *
 * @param file - File or Blob to read
 * @param mimeType - Optional MIME type override
 * @param config - Optional extraction configuration
 * @returns The result of extractBytes for the file contents
 * @throws {Error} If the module is not initialized or loaded; all other
 * failures are rethrown through wrapWasmError
 */
async function extractFromFile(file, mimeType, config) {
  if (!initialized) {
    throw new Error("WASM module not initialized. Call initWasm() first.");
  }
  if (!wasm) {
    throw new Error("WASM module not loaded. Call initWasm() first.");
  }
  // Guard the instanceof so runtimes without a File global do not raise a
  // ReferenceError here or inside the catch handler below.
  const isFile = typeof File !== "undefined" && file instanceof File;
  try {
    const bytes = await fileToUint8Array(file);
    // `||` rather than `??`: an empty File.type must also use the fallback.
    const declaredType = (isFile ? file.type : "") || "application/octet-stream";
    const type = wasm.normalizeMimeType(mimeType ?? declaredType);
    return await extractBytes(bytes, type, config);
  } catch (error) {
    throw wrapWasmError(error, `extracting from ${isFile ? "file" : "blob"}`);
  }
}
|
|
1077
|
-
/**
 * Synchronous counterpart of extractBytes.
 *
 * @param data - Document bytes; must be non-empty
 * @param mimeType - MIME type of the document; required
 * @param config - Optional extraction configuration (null is used when omitted)
 * @returns The WASM result converted by jsToExtractionResult
 * @throws {Error} If the module is not initialized or loaded; validation
 * and extraction failures are rethrown through wrapWasmError
 */
function extractBytesSync(data, mimeType, config) {
  if (!initialized) {
    throw new Error("WASM module not initialized. Call initWasm() first.");
  }
  if (!wasm) {
    throw new Error("WASM module not loaded. Call initWasm() first.");
  }
  try {
    const isEmpty = !data || data.length === 0;
    if (isEmpty) {
      throw new Error("Document data cannot be empty");
    }
    if (!mimeType) {
      throw new Error("MIME type is required");
    }
    const jsConfig = configToJS(config ?? null);
    const rawResult = wasm.extractBytesSync(data, mimeType, jsConfig);
    if (rawResult) {
      return jsToExtractionResult(rawResult);
    }
    throw new Error("Invalid extraction result: no result from WASM module");
  } catch (failure) {
    throw wrapWasmError(failure, "extracting from bytes (sync)");
  }
}
|
|
1101
|
-
/**
 * Extract content from several in-memory documents in one WASM call.
 *
 * Every entry is validated before the batch is submitted: it must be an
 * object whose `data` is a non-empty Uint8Array and whose `mimeType` is a
 * string.
 *
 * @param files - Non-empty array of `{ data, mimeType }` entries
 * @param config - Optional extraction configuration (null is used when omitted)
 * @returns One converted extraction result per WASM result
 * @throws {Error} If the module is not initialized or loaded; validation
 * and extraction failures are rethrown through wrapWasmError
 */
async function batchExtractBytes(files, config) {
  if (!initialized) {
    throw new Error("WASM module not initialized. Call initWasm() first.");
  }
  if (!wasm) {
    throw new Error("WASM module not loaded. Call initWasm() first.");
  }
  try {
    if (!Array.isArray(files)) {
      throw new Error("Files parameter must be an array");
    }
    if (files.length === 0) {
      throw new Error("Files array cannot be empty");
    }
    const dataList = [];
    const mimeTypes = [];
    for (const [i, entry] of files.entries()) {
      if (!entry || typeof entry !== "object") {
        throw new Error(`Invalid file at index ${i}: must be an object with data and mimeType`);
      }
      if (!(entry.data instanceof Uint8Array)) {
        throw new Error(`Invalid file at index ${i}: data must be Uint8Array`);
      }
      if (typeof entry.mimeType !== "string") {
        throw new Error(`Invalid file at index ${i}: mimeType must be a string`);
      }
      if (entry.data.length === 0) {
        throw new Error(`Invalid file at index ${i}: data cannot be empty`);
      }
      dataList.push(entry.data);
      mimeTypes.push(entry.mimeType);
    }
    const jsConfig = configToJS(config ?? null);
    const rawResults = await wasm.batchExtractBytes(dataList, mimeTypes, jsConfig);
    if (!Array.isArray(rawResults)) {
      throw new Error("Invalid batch extraction result: expected array");
    }
    return rawResults.map((raw, index) => {
      if (raw) {
        return jsToExtractionResult(raw);
      }
      throw new Error(`Invalid extraction result at index ${index}: no result from WASM module`);
    });
  } catch (failure) {
    throw wrapWasmError(failure, "batch extracting from bytes");
  }
}
|
|
1150
|
-
/**
 * Synchronous counterpart of batchExtractBytes.
 *
 * Every entry is validated before the batch is submitted: it must be an
 * object whose `data` is a non-empty Uint8Array and whose `mimeType` is a
 * string.
 *
 * @param files - Non-empty array of `{ data, mimeType }` entries
 * @param config - Optional extraction configuration (null is used when omitted)
 * @returns One converted extraction result per WASM result
 * @throws {Error} If the module is not initialized or loaded; validation
 * and extraction failures are rethrown through wrapWasmError
 */
function batchExtractBytesSync(files, config) {
  if (!initialized) {
    throw new Error("WASM module not initialized. Call initWasm() first.");
  }
  if (!wasm) {
    throw new Error("WASM module not loaded. Call initWasm() first.");
  }
  try {
    if (!Array.isArray(files)) {
      throw new Error("Files parameter must be an array");
    }
    if (files.length === 0) {
      throw new Error("Files array cannot be empty");
    }
    const dataList = [];
    const mimeTypes = [];
    for (const [i, entry] of files.entries()) {
      if (!entry || typeof entry !== "object") {
        throw new Error(`Invalid file at index ${i}: must be an object with data and mimeType`);
      }
      if (!(entry.data instanceof Uint8Array)) {
        throw new Error(`Invalid file at index ${i}: data must be Uint8Array`);
      }
      if (typeof entry.mimeType !== "string") {
        throw new Error(`Invalid file at index ${i}: mimeType must be a string`);
      }
      if (entry.data.length === 0) {
        throw new Error(`Invalid file at index ${i}: data cannot be empty`);
      }
      dataList.push(entry.data);
      mimeTypes.push(entry.mimeType);
    }
    const jsConfig = configToJS(config ?? null);
    const rawResults = wasm.batchExtractBytesSync(dataList, mimeTypes, jsConfig);
    if (!Array.isArray(rawResults)) {
      throw new Error("Invalid batch extraction result: expected array");
    }
    return rawResults.map((raw, index) => {
      if (raw) {
        return jsToExtractionResult(raw);
      }
      throw new Error(`Invalid extraction result at index ${index}: no result from WASM module`);
    });
  } catch (failure) {
    throw wrapWasmError(failure, "batch extracting from bytes (sync)");
  }
}
|
|
1199
|
-
/**
 * Extract content from several File objects.
 *
 * Files are read one after another with fileToUint8Array, then handed to
 * batchExtractBytes. A File with an empty `type` is submitted as
 * "application/octet-stream".
 *
 * @param files - Non-empty array of File objects
 * @param config - Optional extraction configuration
 * @returns The result of batchExtractBytes for the file contents
 * @throws {Error} If the module is not initialized; validation, read and
 * extraction failures are rethrown through wrapWasmError
 */
async function batchExtractFiles(files, config) {
  if (!initialized) {
    throw new Error("WASM module not initialized. Call initWasm() first.");
  }
  try {
    if (!Array.isArray(files)) {
      throw new Error("Files parameter must be an array");
    }
    if (files.length === 0) {
      throw new Error("Files array cannot be empty");
    }
    const byteFiles = [];
    for (const [i, candidate] of files.entries()) {
      if (!(candidate instanceof File)) {
        throw new Error(`Invalid file at index ${i}: must be a File object`);
      }
      const data = await fileToUint8Array(candidate);
      const mimeType = candidate.type || "application/octet-stream";
      byteFiles.push({ data, mimeType });
    }
    return await batchExtractBytes(byteFiles, config);
  } catch (failure) {
    throw wrapWasmError(failure, "batch extracting from files");
  }
}
|
|
1227
|
-
/**
 * Create, initialize and register the Tesseract WASM OCR backend.
 *
 * @throws {Error} If the module is not initialized, if not running in a
 * browser, or if creating, initializing or registering the backend fails
 * (reported as "Failed to enable OCR: …")
 */
async function enableOcr() {
  if (!initialized) {
    throw new Error("WASM module not initialized. Call initWasm() first.");
  }
  const inBrowser = isBrowser();
  if (!inBrowser) {
    throw new Error(
      "OCR is only available in browser environments. TesseractWasmBackend requires Web Workers and createImageBitmap."
    );
  }
  try {
    const ocrBackend = new TesseractWasmBackend();
    await ocrBackend.initialize();
    registerOcrBackend(ocrBackend);
  } catch (failure) {
    const reason = failure instanceof Error ? failure.message : String(failure);
    throw new Error(`Failed to enable OCR: ${reason}`);
  }
}
|
|
1245
|
-
//# sourceMappingURL=index.cjs.map
|