@kreuzberg/wasm 4.0.0-rc.21 → 4.0.0-rc.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/README.md +520 -837
  2. package/dist/adapters/wasm-adapter.d.ts +7 -10
  3. package/dist/adapters/wasm-adapter.d.ts.map +1 -0
  4. package/dist/adapters/wasm-adapter.js +41 -19
  5. package/dist/adapters/wasm-adapter.js.map +1 -1
  6. package/dist/index.d.ts +23 -24
  7. package/dist/index.d.ts.map +1 -0
  8. package/dist/index.js +240 -67
  9. package/dist/index.js.map +1 -1
  10. package/dist/ocr/registry.d.ts +7 -10
  11. package/dist/ocr/registry.d.ts.map +1 -0
  12. package/dist/ocr/registry.js.map +1 -1
  13. package/dist/ocr/tesseract-wasm-backend.d.ts +3 -6
  14. package/dist/ocr/tesseract-wasm-backend.d.ts.map +1 -0
  15. package/dist/ocr/tesseract-wasm-backend.js +0 -46
  16. package/dist/ocr/tesseract-wasm-backend.js.map +1 -1
  17. package/dist/pdfium.js +0 -5
  18. package/dist/plugin-registry.d.ts +246 -0
  19. package/dist/plugin-registry.d.ts.map +1 -0
  20. package/dist/runtime.d.ts +21 -22
  21. package/dist/runtime.d.ts.map +1 -0
  22. package/dist/runtime.js +0 -1
  23. package/dist/runtime.js.map +1 -1
  24. package/dist/{types-CKjcIYcX.d.ts → types.d.ts} +91 -22
  25. package/dist/types.d.ts.map +1 -0
  26. package/package.json +119 -162
  27. package/dist/adapters/wasm-adapter.cjs +0 -245
  28. package/dist/adapters/wasm-adapter.cjs.map +0 -1
  29. package/dist/adapters/wasm-adapter.d.cts +0 -121
  30. package/dist/index.cjs +0 -1245
  31. package/dist/index.cjs.map +0 -1
  32. package/dist/index.d.cts +0 -423
  33. package/dist/ocr/registry.cjs +0 -92
  34. package/dist/ocr/registry.cjs.map +0 -1
  35. package/dist/ocr/registry.d.cts +0 -102
  36. package/dist/ocr/tesseract-wasm-backend.cjs +0 -456
  37. package/dist/ocr/tesseract-wasm-backend.cjs.map +0 -1
  38. package/dist/ocr/tesseract-wasm-backend.d.cts +0 -257
  39. package/dist/runtime.cjs +0 -174
  40. package/dist/runtime.cjs.map +0 -1
  41. package/dist/runtime.d.cts +0 -256
  42. package/dist/types-CKjcIYcX.d.cts +0 -294
package/dist/index.cjs DELETED
@@ -1,1245 +0,0 @@
1
- "use strict";
2
- var __create = Object.create;
3
- var __defProp = Object.defineProperty;
4
- var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
- var __getOwnPropNames = Object.getOwnPropertyNames;
6
- var __getProtoOf = Object.getPrototypeOf;
7
- var __hasOwnProp = Object.prototype.hasOwnProperty;
8
- var __export = (target, all) => {
9
- for (var name in all)
10
- __defProp(target, name, { get: all[name], enumerable: true });
11
- };
12
- var __copyProps = (to, from, except, desc) => {
13
- if (from && typeof from === "object" || typeof from === "function") {
14
- for (let key of __getOwnPropNames(from))
15
- if (!__hasOwnProp.call(to, key) && key !== except)
16
- __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
17
- }
18
- return to;
19
- };
20
- var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
21
- // If the importer is in node compatibility mode or this is not an ESM
22
- // file that has been converted to a CommonJS file using a Babel-
23
- // compatible transform (i.e. "__esModule" has not been set), then set
24
- // "default" to the CommonJS "module.exports" for node compatibility.
25
- isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
26
- mod
27
- ));
28
- var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
29
-
30
- // typescript/index.ts
31
- var index_exports = {};
32
- __export(index_exports, {
33
- TesseractWasmBackend: () => TesseractWasmBackend,
34
- batchExtractBytes: () => batchExtractBytes,
35
- batchExtractBytesSync: () => batchExtractBytesSync,
36
- batchExtractFiles: () => batchExtractFiles,
37
- clearOcrBackends: () => clearOcrBackends,
38
- configToJS: () => configToJS,
39
- detectRuntime: () => detectRuntime,
40
- enableOcr: () => enableOcr,
41
- extractBytes: () => extractBytes,
42
- extractBytesSync: () => extractBytesSync,
43
- extractFile: () => extractFile,
44
- extractFromFile: () => extractFromFile,
45
- fileToUint8Array: () => fileToUint8Array,
46
- getInitializationError: () => getInitializationError,
47
- getOcrBackend: () => getOcrBackend,
48
- getRuntimeInfo: () => getRuntimeInfo,
49
- getRuntimeVersion: () => getRuntimeVersion,
50
- getVersion: () => getVersion,
51
- getWasmCapabilities: () => getWasmCapabilities,
52
- hasBigInt: () => hasBigInt,
53
- hasBlob: () => hasBlob,
54
- hasFileApi: () => hasFileApi,
55
- hasModuleWorkers: () => hasModuleWorkers,
56
- hasSharedArrayBuffer: () => hasSharedArrayBuffer,
57
- hasWasm: () => hasWasm,
58
- hasWasmStreaming: () => hasWasmStreaming,
59
- hasWorkers: () => hasWorkers,
60
- initWasm: () => initWasm,
61
- isBrowser: () => isBrowser,
62
- isBun: () => isBun,
63
- isDeno: () => isDeno,
64
- isInitialized: () => isInitialized,
65
- isNode: () => isNode,
66
- isServerEnvironment: () => isServerEnvironment,
67
- isValidExtractionResult: () => isValidExtractionResult,
68
- isWebEnvironment: () => isWebEnvironment,
69
- jsToExtractionResult: () => jsToExtractionResult,
70
- listOcrBackends: () => listOcrBackends,
71
- registerOcrBackend: () => registerOcrBackend,
72
- unregisterOcrBackend: () => unregisterOcrBackend,
73
- wrapWasmError: () => wrapWasmError
74
- });
75
- module.exports = __toCommonJS(index_exports);
76
-
77
- // typescript/adapters/wasm-adapter.ts
78
- var MAX_FILE_SIZE = 512 * 1024 * 1024;
79
- function isNumberOrNull(value) {
80
- return typeof value === "number" || value === null;
81
- }
82
- function isStringOrNull(value) {
83
- return typeof value === "string" || value === null;
84
- }
85
- function isBoolean(value) {
86
- return typeof value === "boolean";
87
- }
88
- async function fileToUint8Array(file) {
89
- try {
90
- if (file.size > MAX_FILE_SIZE) {
91
- throw new Error(
92
- `File size (${file.size} bytes) exceeds maximum (${MAX_FILE_SIZE} bytes). Maximum file size is 512 MB.`
93
- );
94
- }
95
- const arrayBuffer = await file.arrayBuffer();
96
- return new Uint8Array(arrayBuffer);
97
- } catch (error) {
98
- throw new Error(`Failed to read file: ${error instanceof Error ? error.message : String(error)}`);
99
- }
100
- }
101
- function configToJS(config) {
102
- if (!config) {
103
- return {};
104
- }
105
- const normalized = {};
106
- const normalizeValue = (value) => {
107
- if (value === null || value === void 0) {
108
- return null;
109
- }
110
- if (typeof value === "object") {
111
- if (Array.isArray(value)) {
112
- return value.map(normalizeValue);
113
- }
114
- const obj = value;
115
- const normalized2 = {};
116
- for (const [key, val] of Object.entries(obj)) {
117
- const normalizedVal = normalizeValue(val);
118
- if (normalizedVal !== null && normalizedVal !== void 0) {
119
- normalized2[key] = normalizedVal;
120
- }
121
- }
122
- return Object.keys(normalized2).length > 0 ? normalized2 : null;
123
- }
124
- return value;
125
- };
126
- for (const [key, value] of Object.entries(config)) {
127
- const normalizedValue = normalizeValue(value);
128
- if (normalizedValue !== null && normalizedValue !== void 0) {
129
- normalized[key] = normalizedValue;
130
- }
131
- }
132
- return normalized;
133
- }
134
- function jsToExtractionResult(jsValue) {
135
- if (!jsValue || typeof jsValue !== "object") {
136
- throw new Error("Invalid extraction result: value is not an object");
137
- }
138
- const result = jsValue;
139
- const mimeType = typeof result.mimeType === "string" ? result.mimeType : typeof result.mime_type === "string" ? result.mime_type : null;
140
- if (typeof result.content !== "string") {
141
- throw new Error("Invalid extraction result: missing or invalid content");
142
- }
143
- if (typeof mimeType !== "string") {
144
- throw new Error("Invalid extraction result: missing or invalid mimeType");
145
- }
146
- if (!result.metadata || typeof result.metadata !== "object") {
147
- throw new Error("Invalid extraction result: missing or invalid metadata");
148
- }
149
- const tables = [];
150
- if (Array.isArray(result.tables)) {
151
- for (const table of result.tables) {
152
- if (table && typeof table === "object") {
153
- const t = table;
154
- if (Array.isArray(t.cells) && t.cells.every((row) => Array.isArray(row) && row.every((cell) => typeof cell === "string")) && typeof t.markdown === "string" && typeof t.pageNumber === "number") {
155
- tables.push({
156
- cells: t.cells,
157
- markdown: t.markdown,
158
- pageNumber: t.pageNumber
159
- });
160
- }
161
- }
162
- }
163
- }
164
- const chunks = Array.isArray(result.chunks) ? result.chunks.map((chunk) => {
165
- if (!chunk || typeof chunk !== "object") {
166
- throw new Error("Invalid chunk structure");
167
- }
168
- const c = chunk;
169
- if (typeof c.content !== "string") {
170
- throw new Error("Invalid chunk: missing content");
171
- }
172
- if (!c.metadata || typeof c.metadata !== "object") {
173
- throw new Error("Invalid chunk: missing metadata");
174
- }
175
- const metadata = c.metadata;
176
- let embedding = null;
177
- if (Array.isArray(c.embedding)) {
178
- if (!c.embedding.every((item) => typeof item === "number")) {
179
- throw new Error("Invalid chunk: embedding must contain only numbers");
180
- }
181
- embedding = c.embedding;
182
- }
183
- if (typeof metadata.charStart !== "number") {
184
- throw new Error("Invalid chunk metadata: charStart must be a number");
185
- }
186
- if (typeof metadata.charEnd !== "number") {
187
- throw new Error("Invalid chunk metadata: charEnd must be a number");
188
- }
189
- if (!isNumberOrNull(metadata.tokenCount)) {
190
- throw new Error("Invalid chunk metadata: tokenCount must be a number or null");
191
- }
192
- if (typeof metadata.chunkIndex !== "number") {
193
- throw new Error("Invalid chunk metadata: chunkIndex must be a number");
194
- }
195
- if (typeof metadata.totalChunks !== "number") {
196
- throw new Error("Invalid chunk metadata: totalChunks must be a number");
197
- }
198
- return {
199
- content: c.content,
200
- embedding,
201
- metadata: {
202
- charStart: metadata.charStart,
203
- charEnd: metadata.charEnd,
204
- tokenCount: metadata.tokenCount,
205
- chunkIndex: metadata.chunkIndex,
206
- totalChunks: metadata.totalChunks
207
- }
208
- };
209
- }) : null;
210
- const images = Array.isArray(result.images) ? result.images.map((image) => {
211
- if (!image || typeof image !== "object") {
212
- throw new Error("Invalid image structure");
213
- }
214
- const img = image;
215
- if (!(img.data instanceof Uint8Array)) {
216
- throw new Error("Invalid image: data must be Uint8Array");
217
- }
218
- if (typeof img.format !== "string") {
219
- throw new Error("Invalid image: missing format");
220
- }
221
- if (typeof img.imageIndex !== "number") {
222
- throw new Error("Invalid image: imageIndex must be a number");
223
- }
224
- if (!isNumberOrNull(img.pageNumber)) {
225
- throw new Error("Invalid image: pageNumber must be a number or null");
226
- }
227
- if (!isNumberOrNull(img.width)) {
228
- throw new Error("Invalid image: width must be a number or null");
229
- }
230
- if (!isNumberOrNull(img.height)) {
231
- throw new Error("Invalid image: height must be a number or null");
232
- }
233
- if (!isNumberOrNull(img.bitsPerComponent)) {
234
- throw new Error("Invalid image: bitsPerComponent must be a number or null");
235
- }
236
- if (!isBoolean(img.isMask)) {
237
- throw new Error("Invalid image: isMask must be a boolean");
238
- }
239
- if (!isStringOrNull(img.colorspace)) {
240
- throw new Error("Invalid image: colorspace must be a string or null");
241
- }
242
- if (!isStringOrNull(img.description)) {
243
- throw new Error("Invalid image: description must be a string or null");
244
- }
245
- return {
246
- data: img.data,
247
- format: img.format,
248
- imageIndex: img.imageIndex,
249
- pageNumber: img.pageNumber,
250
- width: img.width,
251
- height: img.height,
252
- colorspace: img.colorspace,
253
- bitsPerComponent: img.bitsPerComponent,
254
- isMask: img.isMask,
255
- description: img.description,
256
- ocrResult: img.ocrResult ? jsToExtractionResult(img.ocrResult) : null
257
- };
258
- }) : null;
259
- let detectedLanguages = null;
260
- const detectedLanguagesRaw = Array.isArray(result.detectedLanguages) ? result.detectedLanguages : result.detected_languages;
261
- if (Array.isArray(detectedLanguagesRaw)) {
262
- if (!detectedLanguagesRaw.every((lang) => typeof lang === "string")) {
263
- throw new Error("Invalid result: detectedLanguages must contain only strings");
264
- }
265
- detectedLanguages = detectedLanguagesRaw;
266
- }
267
- return {
268
- content: result.content,
269
- mimeType,
270
- metadata: result.metadata ?? {},
271
- tables,
272
- detectedLanguages,
273
- chunks,
274
- images
275
- };
276
- }
277
- function wrapWasmError(error, context) {
278
- if (error instanceof Error) {
279
- return new Error(`Error ${context}: ${error.message}`, {
280
- cause: error
281
- });
282
- }
283
- const message = String(error);
284
- return new Error(`Error ${context}: ${message}`);
285
- }
286
- function isValidExtractionResult(value) {
287
- if (!value || typeof value !== "object") {
288
- return false;
289
- }
290
- const obj = value;
291
- return typeof obj.content === "string" && (typeof obj.mimeType === "string" || typeof obj.mime_type === "string") && obj.metadata !== null && typeof obj.metadata === "object" && Array.isArray(obj.tables);
292
- }
293
-
294
- // typescript/ocr/registry.ts
295
- var ocrBackendRegistry = /* @__PURE__ */ new Map();
296
- function registerOcrBackend(backend) {
297
- if (!backend) {
298
- throw new Error("Backend cannot be null or undefined");
299
- }
300
- if (typeof backend.name !== "function") {
301
- throw new Error("Backend must implement name() method");
302
- }
303
- if (typeof backend.supportedLanguages !== "function") {
304
- throw new Error("Backend must implement supportedLanguages() method");
305
- }
306
- if (typeof backend.processImage !== "function") {
307
- throw new Error("Backend must implement processImage() method");
308
- }
309
- const backendName = backend.name();
310
- if (!backendName || typeof backendName !== "string") {
311
- throw new Error("Backend name must be a non-empty string");
312
- }
313
- if (ocrBackendRegistry.has(backendName)) {
314
- console.warn(`OCR backend "${backendName}" is already registered and will be replaced`);
315
- }
316
- ocrBackendRegistry.set(backendName, backend);
317
- }
318
- function getOcrBackend(name) {
319
- return ocrBackendRegistry.get(name);
320
- }
321
- function listOcrBackends() {
322
- return Array.from(ocrBackendRegistry.keys());
323
- }
324
- async function unregisterOcrBackend(name) {
325
- const backend = ocrBackendRegistry.get(name);
326
- if (!backend) {
327
- throw new Error(
328
- `OCR backend "${name}" is not registered. Available backends: ${Array.from(ocrBackendRegistry.keys()).join(", ")}`
329
- );
330
- }
331
- if (typeof backend.shutdown === "function") {
332
- try {
333
- await backend.shutdown();
334
- } catch (error) {
335
- console.warn(
336
- `Error shutting down OCR backend "${name}": ${error instanceof Error ? error.message : String(error)}`
337
- );
338
- }
339
- }
340
- ocrBackendRegistry.delete(name);
341
- }
342
- async function clearOcrBackends() {
343
- const backends = Array.from(ocrBackendRegistry.entries());
344
- for (const [name, backend] of backends) {
345
- if (typeof backend.shutdown === "function") {
346
- try {
347
- await backend.shutdown();
348
- } catch (error) {
349
- console.warn(
350
- `Error shutting down OCR backend "${name}": ${error instanceof Error ? error.message : String(error)}`
351
- );
352
- }
353
- }
354
- }
355
- ocrBackendRegistry.clear();
356
- }
357
-
358
- // typescript/ocr/tesseract-wasm-backend.ts
359
- var TesseractWasmBackend = class {
360
- /** Tesseract WASM client instance */
361
- client = null;
362
- /** Track which models are currently loaded to avoid redundant loads */
363
- loadedLanguages = /* @__PURE__ */ new Set();
364
- /** Cache for language availability validation */
365
- supportedLangsCache = null;
366
- /** Progress callback for UI updates */
367
- progressCallback = null;
368
- /** Base URL for training data CDN */
369
- CDN_BASE_URL = "https://cdn.jsdelivr.net/npm/tesseract-wasm@0.11.0/dist";
370
- /**
371
- * Return the unique name of this OCR backend
372
- *
373
- * @returns Backend identifier "tesseract-wasm"
374
- */
375
- name() {
376
- return "tesseract-wasm";
377
- }
378
- /**
379
- * Return list of supported language codes
380
- *
381
- * Returns a curated list of commonly available Tesseract language models.
382
- * Tesseract supports many more languages through custom models.
383
- *
384
- * @returns Array of ISO 639-1/2/3 language codes
385
- */
386
- supportedLanguages() {
387
- if (this.supportedLangsCache) {
388
- return this.supportedLangsCache;
389
- }
390
- this.supportedLangsCache = [
391
- // Major languages
392
- "eng",
393
- // English
394
- "deu",
395
- // German
396
- "fra",
397
- // French
398
- "spa",
399
- // Spanish
400
- "ita",
401
- // Italian
402
- "por",
403
- // Portuguese
404
- "nld",
405
- // Dutch
406
- "rus",
407
- // Russian
408
- "jpn",
409
- // Japanese
410
- "kor",
411
- // Korean
412
- "chi_sim",
413
- // Chinese (Simplified)
414
- "chi_tra",
415
- // Chinese (Traditional)
416
- // Additional European languages
417
- "pol",
418
- // Polish
419
- "tur",
420
- // Turkish
421
- "swe",
422
- // Swedish
423
- "dan",
424
- // Danish
425
- "fin",
426
- // Finnish
427
- "nor",
428
- // Norwegian
429
- "ces",
430
- // Czech
431
- "slk",
432
- // Slovak
433
- "ron",
434
- // Romanian
435
- "hun",
436
- // Hungarian
437
- "hrv",
438
- // Croatian
439
- "srp",
440
- // Serbian
441
- "bul",
442
- // Bulgarian
443
- "ukr",
444
- // Ukrainian
445
- "ell",
446
- // Greek
447
- // Asian languages
448
- "ara",
449
- // Arabic
450
- "heb",
451
- // Hebrew
452
- "hin",
453
- // Hindi
454
- "tha",
455
- // Thai
456
- "vie",
457
- // Vietnamese
458
- "mkd",
459
- // Macedonian
460
- "ben",
461
- // Bengali
462
- "tam",
463
- // Tamil
464
- "tel",
465
- // Telugu
466
- "kan",
467
- // Kannada
468
- "mal",
469
- // Malayalam
470
- "mya",
471
- // Burmese
472
- "khm",
473
- // Khmer
474
- "lao",
475
- // Lao
476
- "sin"
477
- // Sinhala
478
- ];
479
- return this.supportedLangsCache;
480
- }
481
- /**
482
- * Initialize the OCR backend
483
- *
484
- * Creates the Tesseract WASM client instance. This is called once when
485
- * the backend is registered with the extraction pipeline.
486
- *
487
- * The actual model loading happens in processImage() on-demand to avoid
488
- * loading all models upfront.
489
- *
490
- * @throws {Error} If tesseract-wasm is not available or initialization fails
491
- *
492
- * @example
493
- * ```typescript
494
- * const backend = new TesseractWasmBackend();
495
- * try {
496
- * await backend.initialize();
497
- * } catch (error) {
498
- * console.error('Failed to initialize OCR:', error);
499
- * }
500
- * ```
501
- */
502
- async initialize() {
503
- if (this.client) {
504
- return;
505
- }
506
- try {
507
- const tesseractModule = await this.loadTesseractWasm();
508
- if (!tesseractModule || typeof tesseractModule.OCRClient !== "function") {
509
- throw new Error("tesseract-wasm OCRClient not found. Ensure tesseract-wasm is installed and available.");
510
- }
511
- this.client = new tesseractModule.OCRClient();
512
- this.loadedLanguages.clear();
513
- } catch (error) {
514
- const message = error instanceof Error ? error.message : String(error);
515
- throw new Error(`Failed to initialize TesseractWasmBackend: ${message}`);
516
- }
517
- }
518
- /**
519
- * Process image bytes and extract text via OCR
520
- *
521
- * Handles image loading, model loading, OCR processing, and result formatting.
522
- * Automatically loads the language model on first use and caches it for subsequent calls.
523
- *
524
- * @param imageBytes - Raw image data (Uint8Array) or Base64-encoded string
525
- * @param language - ISO 639-2/3 language code (e.g., "eng", "deu")
526
- * @returns Promise resolving to OCR result with content and metadata
527
- * @throws {Error} If image processing fails, model loading fails, or language is unsupported
528
- *
529
- * @example
530
- * ```typescript
531
- * const backend = new TesseractWasmBackend();
532
- * await backend.initialize();
533
- *
534
- * const imageBuffer = fs.readFileSync('scanned.png');
535
- * const result = await backend.processImage(
536
- * new Uint8Array(imageBuffer),
537
- * 'eng'
538
- * );
539
- *
540
- * console.log(result.content); // Extracted text
541
- * console.log(result.metadata.confidence); // OCR confidence score
542
- * ```
543
- */
544
- async processImage(imageBytes, language) {
545
- if (!this.client) {
546
- throw new Error("TesseractWasmBackend not initialized. Call initialize() first.");
547
- }
548
- const supported = this.supportedLanguages();
549
- const normalizedLang = language.toLowerCase();
550
- const isSupported = supported.some((lang) => lang.toLowerCase() === normalizedLang);
551
- if (!isSupported) {
552
- throw new Error(`Language "${language}" is not supported. Supported languages: ${supported.join(", ")}`);
553
- }
554
- try {
555
- if (!this.loadedLanguages.has(normalizedLang)) {
556
- this.reportProgress(10);
557
- await this.loadLanguageModel(normalizedLang);
558
- this.loadedLanguages.add(normalizedLang);
559
- this.reportProgress(30);
560
- }
561
- this.reportProgress(40);
562
- const imageBitmap = await this.convertToImageBitmap(imageBytes);
563
- this.reportProgress(50);
564
- await this.client.loadImage(imageBitmap);
565
- this.reportProgress(70);
566
- const text = await this.client.getText();
567
- const confidence = await this.getConfidenceScore();
568
- const pageMetadata = await this.getPageMetadata();
569
- this.reportProgress(90);
570
- return {
571
- content: text,
572
- mime_type: "text/plain",
573
- metadata: {
574
- language: normalizedLang,
575
- confidence,
576
- ...pageMetadata
577
- },
578
- tables: []
579
- // Tesseract-wasm doesn't provide structured table detection
580
- };
581
- } catch (error) {
582
- const message = error instanceof Error ? error.message : String(error);
583
- throw new Error(`OCR processing failed for language "${language}": ${message}`);
584
- } finally {
585
- this.reportProgress(100);
586
- }
587
- }
588
- /**
589
- * Shutdown the OCR backend and release resources
590
- *
591
- * Properly cleans up the Tesseract WASM client, freeing memory and Web Workers.
592
- * Called when the backend is unregistered or the application shuts down.
593
- *
594
- * @throws {Error} If cleanup fails (errors are logged but not critical)
595
- *
596
- * @example
597
- * ```typescript
598
- * const backend = new TesseractWasmBackend();
599
- * await backend.initialize();
600
- * // ... use backend ...
601
- * await backend.shutdown(); // Clean up resources
602
- * ```
603
- */
604
- async shutdown() {
605
- try {
606
- if (this.client) {
607
- if (typeof this.client.destroy === "function") {
608
- this.client.destroy();
609
- }
610
- if (typeof this.client.terminate === "function") {
611
- this.client.terminate();
612
- }
613
- this.client = null;
614
- }
615
- this.loadedLanguages.clear();
616
- this.supportedLangsCache = null;
617
- this.progressCallback = null;
618
- } catch (error) {
619
- console.warn(
620
- `Warning during TesseractWasmBackend shutdown: ${error instanceof Error ? error.message : String(error)}`
621
- );
622
- }
623
- }
624
- /**
625
- * Set a progress callback for UI updates
626
- *
627
- * Allows the UI to display progress during OCR processing.
628
- * The callback will be called with values from 0 to 100.
629
- *
630
- * @param callback - Function to call with progress percentage
631
- *
632
- * @example
633
- * ```typescript
634
- * const backend = new TesseractWasmBackend();
635
- * backend.setProgressCallback((progress) => {
636
- * console.log(`OCR Progress: ${progress}%`);
637
- * document.getElementById('progress-bar').style.width = `${progress}%`;
638
- * });
639
- * ```
640
- */
641
- setProgressCallback(callback) {
642
- this.progressCallback = callback;
643
- }
644
- /**
645
- * Load language model from CDN
646
- *
647
- * Fetches the training data for a specific language from jsDelivr CDN.
648
- * This is an MVP approach - models are cached by the browser.
649
- *
650
- * @param language - ISO 639-2/3 language code
651
- * @throws {Error} If model download fails or language is not available
652
- *
653
- * @internal
654
- */
655
- async loadLanguageModel(language) {
656
- if (!this.client) {
657
- throw new Error("Client not initialized");
658
- }
659
- const modelFilename = `${language}.traineddata`;
660
- const modelUrl = `${this.CDN_BASE_URL}/${modelFilename}`;
661
- try {
662
- await this.client.loadModel(modelUrl);
663
- } catch (error) {
664
- const message = error instanceof Error ? error.message : String(error);
665
- throw new Error(`Failed to load model for language "${language}" from ${modelUrl}: ${message}`);
666
- }
667
- }
668
- /**
669
- * Convert image bytes or Base64 string to ImageBitmap
670
- *
671
- * Handles both Uint8Array and Base64-encoded image data, converting to
672
- * ImageBitmap format required by Tesseract WASM.
673
- *
674
- * @param imageBytes - Image data as Uint8Array or Base64 string
675
- * @returns Promise resolving to ImageBitmap
676
- * @throws {Error} If conversion fails (browser API not available or invalid image data)
677
- *
678
- * @internal
679
- */
680
- async convertToImageBitmap(imageBytes) {
681
- if (typeof createImageBitmap === "undefined") {
682
- throw new Error("createImageBitmap is not available. TesseractWasmBackend requires a browser environment.");
683
- }
684
- try {
685
- let bytes = imageBytes;
686
- if (typeof imageBytes === "string") {
687
- const binaryString = atob(imageBytes);
688
- bytes = new Uint8Array(binaryString.length);
689
- for (let i = 0; i < binaryString.length; i++) {
690
- bytes[i] = binaryString.charCodeAt(i);
691
- }
692
- }
693
- const blob = new Blob([bytes]);
694
- const imageBitmap = await createImageBitmap(blob);
695
- return imageBitmap;
696
- } catch (error) {
697
- const message = error instanceof Error ? error.message : String(error);
698
- throw new Error(`Failed to convert image bytes to ImageBitmap: ${message}`);
699
- }
700
- }
701
- /**
702
- * Get confidence score from OCR result
703
- *
704
- * Attempts to retrieve confidence score from Tesseract.
705
- * Returns a safe default if unavailable.
706
- *
707
- * @returns Confidence score between 0 and 1
708
- *
709
- * @internal
710
- */
711
- async getConfidenceScore() {
712
- try {
713
- if (this.client && typeof this.client.getConfidence === "function") {
714
- const confidence = await this.client.getConfidence();
715
- return confidence > 1 ? confidence / 100 : confidence;
716
- }
717
- } catch {
718
- }
719
- return 0.9;
720
- }
721
- /**
722
- * Get page metadata from OCR result
723
- *
724
- * Retrieves additional metadata like image dimensions and processing info.
725
- *
726
- * @returns Metadata object (may be empty if unavailable)
727
- *
728
- * @internal
729
- */
730
- async getPageMetadata() {
731
- try {
732
- if (this.client && typeof this.client.getPageMetadata === "function") {
733
- return await this.client.getPageMetadata();
734
- }
735
- } catch {
736
- }
737
- return {};
738
- }
739
- /**
740
- * Dynamically load tesseract-wasm module
741
- *
742
- * Uses dynamic import to load tesseract-wasm only when needed,
743
- * avoiding hard dependency in browser environments where it may not be bundled.
744
- *
745
- * @returns tesseract-wasm module object
746
- * @throws {Error} If module cannot be imported
747
- *
748
- * @internal
749
- */
750
- async loadTesseractWasm() {
751
- try {
752
- const module2 = await import("tesseract-wasm");
753
- return module2;
754
- } catch (error) {
755
- const message = error instanceof Error ? error.message : String(error);
756
- throw new Error(
757
- `Failed to import tesseract-wasm. Ensure it is installed via: npm install tesseract-wasm. Error: ${message}`
758
- );
759
- }
760
- }
761
- /**
762
- * Report progress to progress callback
763
- *
764
- * Internal helper for notifying progress updates during OCR processing.
765
- *
766
- * @param progress - Progress percentage (0-100)
767
- *
768
- * @internal
769
- */
770
- reportProgress(progress) {
771
- if (this.progressCallback) {
772
- try {
773
- this.progressCallback(Math.min(100, Math.max(0, progress)));
774
- } catch {
775
- }
776
- }
777
- }
778
- };
779
-
780
- // typescript/runtime.ts
781
- function detectRuntime() {
782
- if (typeof globalThis.Deno !== "undefined") {
783
- return "deno";
784
- }
785
- if (typeof globalThis.Bun !== "undefined") {
786
- return "bun";
787
- }
788
- if (typeof process !== "undefined" && process.versions && process.versions.node) {
789
- return "node";
790
- }
791
- if (typeof window !== "undefined" && typeof document !== "undefined") {
792
- return "browser";
793
- }
794
- return "unknown";
795
- }
796
- function isBrowser() {
797
- return detectRuntime() === "browser";
798
- }
799
- function isNode() {
800
- return detectRuntime() === "node";
801
- }
802
- function isDeno() {
803
- return detectRuntime() === "deno";
804
- }
805
- function isBun() {
806
- return detectRuntime() === "bun";
807
- }
808
- function isWebEnvironment() {
809
- const runtime = detectRuntime();
810
- return runtime === "browser";
811
- }
812
- function isServerEnvironment() {
813
- const runtime = detectRuntime();
814
- return runtime === "node" || runtime === "deno" || runtime === "bun";
815
- }
816
- function hasFileApi() {
817
- return typeof window !== "undefined" && typeof File !== "undefined" && typeof Blob !== "undefined";
818
- }
819
- function hasBlob() {
820
- return typeof Blob !== "undefined";
821
- }
822
- function hasWorkers() {
823
- return typeof Worker !== "undefined";
824
- }
825
- function hasSharedArrayBuffer() {
826
- return typeof SharedArrayBuffer !== "undefined";
827
- }
828
- function hasModuleWorkers() {
829
- if (!hasWorkers()) {
830
- return false;
831
- }
832
- try {
833
- const blob = new Blob(['console.log("test")'], {
834
- type: "application/javascript"
835
- });
836
- const workerUrl = URL.createObjectURL(blob);
837
- try {
838
- return true;
839
- } finally {
840
- URL.revokeObjectURL(workerUrl);
841
- }
842
- } catch {
843
- return false;
844
- }
845
- }
846
- function hasWasm() {
847
- return typeof WebAssembly !== "undefined" && WebAssembly.instantiate !== void 0;
848
- }
849
- function hasWasmStreaming() {
850
- return typeof WebAssembly !== "undefined" && WebAssembly.instantiateStreaming !== void 0;
851
- }
852
- function hasBigInt() {
853
- try {
854
- const test = BigInt("1");
855
- return typeof test === "bigint";
856
- } catch {
857
- return false;
858
- }
859
- }
860
- function getRuntimeVersion() {
861
- const runtime = detectRuntime();
862
- switch (runtime) {
863
- case "node":
864
- return process.version?.substring(1);
865
- // Remove 'v' prefix
866
- case "deno": {
867
- const deno = globalThis.Deno;
868
- const version = deno?.version;
869
- return version?.deno;
870
- }
871
- case "bun": {
872
- const bun = globalThis.Bun;
873
- return bun?.version;
874
- }
875
- default:
876
- return void 0;
877
- }
878
- }
879
- function getWasmCapabilities() {
880
- const runtime = detectRuntime();
881
- const version = getRuntimeVersion();
882
- const capabilities = {
883
- runtime,
884
- hasWasm: hasWasm(),
885
- hasWasmStreaming: hasWasmStreaming(),
886
- hasFileApi: hasFileApi(),
887
- hasBlob: hasBlob(),
888
- hasWorkers: hasWorkers(),
889
- hasSharedArrayBuffer: hasSharedArrayBuffer(),
890
- hasModuleWorkers: hasModuleWorkers(),
891
- hasBigInt: hasBigInt(),
892
- ...version !== void 0 ? { runtimeVersion: version } : {}
893
- };
894
- return capabilities;
895
- }
896
- function getRuntimeInfo() {
897
- const runtime = detectRuntime();
898
- const capabilities = getWasmCapabilities();
899
- return {
900
- runtime,
901
- isBrowser: isBrowser(),
902
- isNode: isNode(),
903
- isDeno: isDeno(),
904
- isBun: isBun(),
905
- isWeb: isWebEnvironment(),
906
- isServer: isServerEnvironment(),
907
- runtimeVersion: getRuntimeVersion(),
908
- userAgent: typeof navigator !== "undefined" ? navigator.userAgent : "N/A",
909
- capabilities
910
- };
911
- }
912
-
913
- // typescript/index.ts
914
- var wasm = null;
915
- var initialized = false;
916
- var initializationError = null;
917
- var initializationPromise = null;
918
- async function initializePdfiumAsync(wasmModule) {
919
- if (!wasmModule || typeof wasmModule.initialize_pdfium_render !== "function") {
920
- return;
921
- }
922
- if (!isBrowser()) {
923
- console.debug("PDFium initialization skipped (non-browser environment)");
924
- return;
925
- }
926
- try {
927
- const pdfiumModule = await import("./pdfium.js");
928
- const pdfium = typeof pdfiumModule.default === "function" ? await pdfiumModule.default() : pdfiumModule;
929
- const success = wasmModule.initialize_pdfium_render(pdfium, wasmModule, false);
930
- if (!success) {
931
- console.warn("PDFium initialization returned false");
932
- }
933
- } catch (error) {
934
- console.debug("PDFium initialization error:", error);
935
- }
936
- }
937
- async function initWasm() {
938
- if (initialized) {
939
- return;
940
- }
941
- if (initializationPromise) {
942
- return initializationPromise;
943
- }
944
- initializationPromise = (async () => {
945
- try {
946
- if (!hasWasm()) {
947
- throw new Error("WebAssembly is not supported in this environment");
948
- }
949
- let wasmModule;
950
- try {
951
- wasmModule = await import(
952
- /* @vite-ignore */
953
- "../pkg/kreuzberg_wasm.js"
954
- );
955
- } catch {
956
- wasmModule = await import(
957
- /* @vite-ignore */
958
- "./kreuzberg_wasm.js"
959
- );
960
- }
961
- wasm = wasmModule;
962
- if (wasm && typeof wasm.default === "function") {
963
- await wasm.default();
964
- }
965
- if (isBrowser() && wasm && typeof wasm.initialize_pdfium_render === "function") {
966
- initializePdfiumAsync(wasm).catch((error) => {
967
- console.warn("PDFium auto-initialization failed (PDF extraction disabled):", error);
968
- });
969
- }
970
- initialized = true;
971
- initializationError = null;
972
- } catch (error) {
973
- initializationError = error instanceof Error ? error : new Error(String(error));
974
- throw wrapWasmError(error, "initializing Kreuzberg WASM module");
975
- }
976
- })();
977
- return initializationPromise;
978
- }
979
- function isInitialized() {
980
- return initialized;
981
- }
982
- function getVersion() {
983
- if (!initialized) {
984
- throw new Error("WASM module not initialized. Call initWasm() first.");
985
- }
986
- if (!wasm) {
987
- throw new Error("WASM module not loaded. Call initWasm() first.");
988
- }
989
- return wasm.version();
990
- }
991
- function getInitializationError() {
992
- return initializationError;
993
- }
994
- async function extractBytes(data, mimeType, config) {
995
- if (!initialized) {
996
- throw new Error("WASM module not initialized. Call initWasm() first.");
997
- }
998
- if (!wasm) {
999
- throw new Error("WASM module not loaded. Call initWasm() first.");
1000
- }
1001
- try {
1002
- if (!data || data.length === 0) {
1003
- throw new Error("Document data cannot be empty");
1004
- }
1005
- if (!mimeType) {
1006
- throw new Error("MIME type is required");
1007
- }
1008
- const normalizedConfig = configToJS(config ?? null);
1009
- const result = await wasm.extractBytes(data, mimeType, normalizedConfig);
1010
- if (!result) {
1011
- throw new Error("Invalid extraction result: no result from WASM module");
1012
- }
1013
- return jsToExtractionResult(result);
1014
- } catch (error) {
1015
- throw wrapWasmError(error, "extracting from bytes");
1016
- }
1017
- }
1018
- async function extractFile(path, mimeType, config) {
1019
- if (!initialized) {
1020
- throw new Error("WASM module not initialized. Call initWasm() first.");
1021
- }
1022
- if (!wasm) {
1023
- throw new Error("WASM module not loaded. Call initWasm() first.");
1024
- }
1025
- try {
1026
- if (!path) {
1027
- throw new Error("File path is required");
1028
- }
1029
- const runtime = detectRuntime();
1030
- if (runtime === "browser") {
1031
- throw new Error("Use extractBytes with fileToUint8Array for browser environments");
1032
- }
1033
- let fileData;
1034
- if (runtime === "node") {
1035
- const { readFile } = await import("fs/promises");
1036
- const buffer = await readFile(path);
1037
- fileData = new Uint8Array(buffer);
1038
- } else if (runtime === "deno") {
1039
- const deno = globalThis.Deno;
1040
- fileData = await deno.readFile(path);
1041
- } else if (runtime === "bun") {
1042
- const { readFile } = await import("fs/promises");
1043
- const buffer = await readFile(path);
1044
- fileData = new Uint8Array(buffer);
1045
- } else {
1046
- throw new Error(`Unsupported runtime for file extraction: ${runtime}`);
1047
- }
1048
- let detectedMimeType = mimeType;
1049
- if (!detectedMimeType) {
1050
- detectedMimeType = wasm.detectMimeFromBytes(fileData);
1051
- }
1052
- if (!detectedMimeType) {
1053
- throw new Error("Could not detect MIME type for file. Please provide mimeType parameter.");
1054
- }
1055
- detectedMimeType = wasm.normalizeMimeType(detectedMimeType);
1056
- return await extractBytes(fileData, detectedMimeType, config);
1057
- } catch (error) {
1058
- throw wrapWasmError(error, `extracting from file: ${path}`);
1059
- }
1060
- }
1061
- async function extractFromFile(file, mimeType, config) {
1062
- if (!initialized) {
1063
- throw new Error("WASM module not initialized. Call initWasm() first.");
1064
- }
1065
- if (!wasm) {
1066
- throw new Error("WASM module not loaded. Call initWasm() first.");
1067
- }
1068
- try {
1069
- const bytes = await fileToUint8Array(file);
1070
- let type = mimeType ?? (file instanceof File ? file.type : "application/octet-stream");
1071
- type = wasm.normalizeMimeType(type);
1072
- return await extractBytes(bytes, type, config);
1073
- } catch (error) {
1074
- throw wrapWasmError(error, `extracting from ${file instanceof File ? "file" : "blob"}`);
1075
- }
1076
- }
1077
- function extractBytesSync(data, mimeType, config) {
1078
- if (!initialized) {
1079
- throw new Error("WASM module not initialized. Call initWasm() first.");
1080
- }
1081
- if (!wasm) {
1082
- throw new Error("WASM module not loaded. Call initWasm() first.");
1083
- }
1084
- try {
1085
- if (!data || data.length === 0) {
1086
- throw new Error("Document data cannot be empty");
1087
- }
1088
- if (!mimeType) {
1089
- throw new Error("MIME type is required");
1090
- }
1091
- const normalizedConfig = configToJS(config ?? null);
1092
- const result = wasm.extractBytesSync(data, mimeType, normalizedConfig);
1093
- if (!result) {
1094
- throw new Error("Invalid extraction result: no result from WASM module");
1095
- }
1096
- return jsToExtractionResult(result);
1097
- } catch (error) {
1098
- throw wrapWasmError(error, "extracting from bytes (sync)");
1099
- }
1100
- }
1101
- async function batchExtractBytes(files, config) {
1102
- if (!initialized) {
1103
- throw new Error("WASM module not initialized. Call initWasm() first.");
1104
- }
1105
- if (!wasm) {
1106
- throw new Error("WASM module not loaded. Call initWasm() first.");
1107
- }
1108
- try {
1109
- if (!Array.isArray(files)) {
1110
- throw new Error("Files parameter must be an array");
1111
- }
1112
- if (files.length === 0) {
1113
- throw new Error("Files array cannot be empty");
1114
- }
1115
- const dataList = [];
1116
- const mimeTypes = [];
1117
- for (let i = 0; i < files.length; i += 1) {
1118
- const file = files[i];
1119
- if (!file || typeof file !== "object") {
1120
- throw new Error(`Invalid file at index ${i}: must be an object with data and mimeType`);
1121
- }
1122
- const f = file;
1123
- if (!(f.data instanceof Uint8Array)) {
1124
- throw new Error(`Invalid file at index ${i}: data must be Uint8Array`);
1125
- }
1126
- if (typeof f.mimeType !== "string") {
1127
- throw new Error(`Invalid file at index ${i}: mimeType must be a string`);
1128
- }
1129
- if (f.data.length === 0) {
1130
- throw new Error(`Invalid file at index ${i}: data cannot be empty`);
1131
- }
1132
- dataList.push(f.data);
1133
- mimeTypes.push(f.mimeType);
1134
- }
1135
- const normalizedConfig = configToJS(config ?? null);
1136
- const results = await wasm.batchExtractBytes(dataList, mimeTypes, normalizedConfig);
1137
- if (!Array.isArray(results)) {
1138
- throw new Error("Invalid batch extraction result: expected array");
1139
- }
1140
- return results.map((result, index) => {
1141
- if (!result) {
1142
- throw new Error(`Invalid extraction result at index ${index}: no result from WASM module`);
1143
- }
1144
- return jsToExtractionResult(result);
1145
- });
1146
- } catch (error) {
1147
- throw wrapWasmError(error, "batch extracting from bytes");
1148
- }
1149
- }
1150
- function batchExtractBytesSync(files, config) {
1151
- if (!initialized) {
1152
- throw new Error("WASM module not initialized. Call initWasm() first.");
1153
- }
1154
- if (!wasm) {
1155
- throw new Error("WASM module not loaded. Call initWasm() first.");
1156
- }
1157
- try {
1158
- if (!Array.isArray(files)) {
1159
- throw new Error("Files parameter must be an array");
1160
- }
1161
- if (files.length === 0) {
1162
- throw new Error("Files array cannot be empty");
1163
- }
1164
- const dataList = [];
1165
- const mimeTypes = [];
1166
- for (let i = 0; i < files.length; i += 1) {
1167
- const file = files[i];
1168
- if (!file || typeof file !== "object") {
1169
- throw new Error(`Invalid file at index ${i}: must be an object with data and mimeType`);
1170
- }
1171
- const f = file;
1172
- if (!(f.data instanceof Uint8Array)) {
1173
- throw new Error(`Invalid file at index ${i}: data must be Uint8Array`);
1174
- }
1175
- if (typeof f.mimeType !== "string") {
1176
- throw new Error(`Invalid file at index ${i}: mimeType must be a string`);
1177
- }
1178
- if (f.data.length === 0) {
1179
- throw new Error(`Invalid file at index ${i}: data cannot be empty`);
1180
- }
1181
- dataList.push(f.data);
1182
- mimeTypes.push(f.mimeType);
1183
- }
1184
- const normalizedConfig = configToJS(config ?? null);
1185
- const results = wasm.batchExtractBytesSync(dataList, mimeTypes, normalizedConfig);
1186
- if (!Array.isArray(results)) {
1187
- throw new Error("Invalid batch extraction result: expected array");
1188
- }
1189
- return results.map((result, index) => {
1190
- if (!result) {
1191
- throw new Error(`Invalid extraction result at index ${index}: no result from WASM module`);
1192
- }
1193
- return jsToExtractionResult(result);
1194
- });
1195
- } catch (error) {
1196
- throw wrapWasmError(error, "batch extracting from bytes (sync)");
1197
- }
1198
- }
1199
- async function batchExtractFiles(files, config) {
1200
- if (!initialized) {
1201
- throw new Error("WASM module not initialized. Call initWasm() first.");
1202
- }
1203
- try {
1204
- if (!Array.isArray(files)) {
1205
- throw new Error("Files parameter must be an array");
1206
- }
1207
- if (files.length === 0) {
1208
- throw new Error("Files array cannot be empty");
1209
- }
1210
- const byteFiles = [];
1211
- for (let i = 0; i < files.length; i += 1) {
1212
- const file = files[i];
1213
- if (!(file instanceof File)) {
1214
- throw new Error(`Invalid file at index ${i}: must be a File object`);
1215
- }
1216
- const bytes = await fileToUint8Array(file);
1217
- byteFiles.push({
1218
- data: bytes,
1219
- mimeType: file.type || "application/octet-stream"
1220
- });
1221
- }
1222
- return await batchExtractBytes(byteFiles, config);
1223
- } catch (error) {
1224
- throw wrapWasmError(error, "batch extracting from files");
1225
- }
1226
- }
1227
- async function enableOcr() {
1228
- if (!initialized) {
1229
- throw new Error("WASM module not initialized. Call initWasm() first.");
1230
- }
1231
- if (!isBrowser()) {
1232
- throw new Error(
1233
- "OCR is only available in browser environments. TesseractWasmBackend requires Web Workers and createImageBitmap."
1234
- );
1235
- }
1236
- try {
1237
- const backend = new TesseractWasmBackend();
1238
- await backend.initialize();
1239
- registerOcrBackend(backend);
1240
- } catch (error) {
1241
- const message = error instanceof Error ? error.message : String(error);
1242
- throw new Error(`Failed to enable OCR: ${message}`);
1243
- }
1244
- }
1245
- //# sourceMappingURL=index.cjs.map