@kreuzberg/wasm 4.0.0-rc.6 → 4.0.1

This diff compares the contents of two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (56)
  1. package/LICENSE +7 -0
  2. package/README.md +317 -801
  3. package/dist/adapters/wasm-adapter.d.ts +7 -10
  4. package/dist/adapters/wasm-adapter.d.ts.map +1 -0
  5. package/dist/adapters/wasm-adapter.js +53 -54
  6. package/dist/adapters/wasm-adapter.js.map +1 -1
  7. package/dist/index.d.ts +23 -67
  8. package/dist/index.d.ts.map +1 -0
  9. package/dist/index.js +1102 -104
  10. package/dist/index.js.map +1 -1
  11. package/dist/ocr/registry.d.ts +7 -10
  12. package/dist/ocr/registry.d.ts.map +1 -0
  13. package/dist/ocr/registry.js +9 -28
  14. package/dist/ocr/registry.js.map +1 -1
  15. package/dist/ocr/tesseract-wasm-backend.d.ts +3 -6
  16. package/dist/ocr/tesseract-wasm-backend.d.ts.map +1 -0
  17. package/dist/ocr/tesseract-wasm-backend.js +8 -83
  18. package/dist/ocr/tesseract-wasm-backend.js.map +1 -1
  19. package/dist/pdfium.js +77 -0
  20. package/dist/pkg/LICENSE +7 -0
  21. package/dist/pkg/README.md +498 -0
  22. package/dist/{kreuzberg_wasm.d.ts → pkg/kreuzberg_wasm.d.ts} +24 -12
  23. package/dist/{kreuzberg_wasm.js → pkg/kreuzberg_wasm.js} +224 -233
  24. package/dist/pkg/kreuzberg_wasm_bg.js +1871 -0
  25. package/dist/{kreuzberg_wasm_bg.wasm → pkg/kreuzberg_wasm_bg.wasm} +0 -0
  26. package/dist/{kreuzberg_wasm_bg.wasm.d.ts → pkg/kreuzberg_wasm_bg.wasm.d.ts} +10 -13
  27. package/dist/pkg/package.json +27 -0
  28. package/dist/plugin-registry.d.ts +246 -0
  29. package/dist/plugin-registry.d.ts.map +1 -0
  30. package/dist/runtime.d.ts +21 -22
  31. package/dist/runtime.d.ts.map +1 -0
  32. package/dist/runtime.js +21 -41
  33. package/dist/runtime.js.map +1 -1
  34. package/dist/types.d.ts +363 -0
  35. package/dist/types.d.ts.map +1 -0
  36. package/package.json +34 -51
  37. package/dist/adapters/wasm-adapter.d.mts +0 -121
  38. package/dist/adapters/wasm-adapter.mjs +0 -221
  39. package/dist/adapters/wasm-adapter.mjs.map +0 -1
  40. package/dist/index.d.mts +0 -466
  41. package/dist/index.mjs +0 -384
  42. package/dist/index.mjs.map +0 -1
  43. package/dist/kreuzberg_wasm.d.mts +0 -758
  44. package/dist/kreuzberg_wasm.mjs +0 -48
  45. package/dist/ocr/registry.d.mts +0 -102
  46. package/dist/ocr/registry.mjs +0 -70
  47. package/dist/ocr/registry.mjs.map +0 -1
  48. package/dist/ocr/tesseract-wasm-backend.d.mts +0 -257
  49. package/dist/ocr/tesseract-wasm-backend.mjs +0 -424
  50. package/dist/ocr/tesseract-wasm-backend.mjs.map +0 -1
  51. package/dist/runtime.d.mts +0 -256
  52. package/dist/runtime.mjs +0 -152
  53. package/dist/runtime.mjs.map +0 -1
  54. package/dist/snippets/wasm-bindgen-rayon-38edf6e439f6d70d/src/workerHelpers.js +0 -107
  55. package/dist/types-GJVIvbPy.d.mts +0 -221
  56. package/dist/types-GJVIvbPy.d.ts +0 -221
package/dist/index.js CHANGED
@@ -1,88 +1,1020 @@
1
- "use strict";
2
- var __create = Object.create;
3
1
  var __defProp = Object.defineProperty;
4
- var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
2
  var __getOwnPropNames = Object.getOwnPropertyNames;
6
- var __getProtoOf = Object.getPrototypeOf;
7
- var __hasOwnProp = Object.prototype.hasOwnProperty;
3
+ var __esm = (fn, res) => function __init() {
4
+ return fn && (res = (0, fn[__getOwnPropNames(fn)[0]])(fn = 0)), res;
5
+ };
8
6
  var __export = (target, all) => {
9
7
  for (var name in all)
10
8
  __defProp(target, name, { get: all[name], enumerable: true });
11
9
  };
12
- var __copyProps = (to, from, except, desc) => {
13
- if (from && typeof from === "object" || typeof from === "function") {
14
- for (let key of __getOwnPropNames(from))
15
- if (!__hasOwnProp.call(to, key) && key !== except)
16
- __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
10
+
11
+ // typescript/pdfium.js
12
+ var pdfium_exports = {};
13
+ __export(pdfium_exports, {
14
+ default: () => initPdfium
15
+ });
16
+ async function initPdfium() {
17
+ return {
18
+ // Dummy implementation for testing
19
+ };
20
+ }
21
+ var init_pdfium = __esm({
22
+ "typescript/pdfium.js"() {
23
+ "use strict";
17
24
  }
18
- return to;
19
- };
20
- var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
21
- // If the importer is in node compatibility mode or this is not an ESM
22
- // file that has been converted to a CommonJS file using a Babel-
23
- // compatible transform (i.e. "__esModule" has not been set), then set
24
- // "default" to the CommonJS "module.exports" for node compatibility.
25
- isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
26
- mod
27
- ));
28
- var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
29
- var index_exports = {};
30
- __export(index_exports, {
31
- TesseractWasmBackend: () => import_tesseract_wasm_backend2.TesseractWasmBackend,
32
- batchExtractBytes: () => batchExtractBytes,
33
- batchExtractBytesSync: () => batchExtractBytesSync,
34
- batchExtractFiles: () => batchExtractFiles,
35
- clearOcrBackends: () => import_registry2.clearOcrBackends,
36
- configToJS: () => import_wasm_adapter2.configToJS,
37
- detectRuntime: () => import_runtime2.detectRuntime,
38
- enableOcr: () => enableOcr,
39
- extractBytes: () => extractBytes,
40
- extractBytesSync: () => extractBytesSync,
41
- extractFile: () => extractFile,
42
- extractFromFile: () => extractFromFile,
43
- fileToUint8Array: () => import_wasm_adapter2.fileToUint8Array,
44
- getInitializationError: () => getInitializationError,
45
- getOcrBackend: () => import_registry2.getOcrBackend,
46
- getRuntimeInfo: () => import_runtime2.getRuntimeInfo,
47
- getRuntimeVersion: () => import_runtime2.getRuntimeVersion,
48
- getVersion: () => getVersion,
49
- getWasmCapabilities: () => import_runtime2.getWasmCapabilities,
50
- hasBigInt: () => import_runtime2.hasBigInt,
51
- hasBlob: () => import_runtime2.hasBlob,
52
- hasFileApi: () => import_runtime2.hasFileApi,
53
- hasModuleWorkers: () => import_runtime2.hasModuleWorkers,
54
- hasSharedArrayBuffer: () => import_runtime2.hasSharedArrayBuffer,
55
- hasWasm: () => import_runtime2.hasWasm,
56
- hasWasmStreaming: () => import_runtime2.hasWasmStreaming,
57
- hasWorkers: () => import_runtime2.hasWorkers,
58
- initWasm: () => initWasm,
59
- isBrowser: () => import_runtime2.isBrowser,
60
- isBun: () => import_runtime2.isBun,
61
- isDeno: () => import_runtime2.isDeno,
62
- isInitialized: () => isInitialized,
63
- isNode: () => import_runtime2.isNode,
64
- isServerEnvironment: () => import_runtime2.isServerEnvironment,
65
- isValidExtractionResult: () => import_wasm_adapter2.isValidExtractionResult,
66
- isWebEnvironment: () => import_runtime2.isWebEnvironment,
67
- jsToExtractionResult: () => import_wasm_adapter2.jsToExtractionResult,
68
- listOcrBackends: () => import_registry2.listOcrBackends,
69
- registerOcrBackend: () => import_registry2.registerOcrBackend,
70
- unregisterOcrBackend: () => import_registry2.unregisterOcrBackend,
71
- wrapWasmError: () => import_wasm_adapter2.wrapWasmError
72
25
  });
73
- module.exports = __toCommonJS(index_exports);
74
- var import_wasm_adapter = require("./adapters/wasm-adapter.js");
75
- var import_registry = require("./ocr/registry.js");
76
- var import_tesseract_wasm_backend = require("./ocr/tesseract-wasm-backend.js");
77
- var import_runtime = require("./runtime.js");
78
- var import_wasm_adapter2 = require("./adapters/wasm-adapter.js");
79
- var import_registry2 = require("./ocr/registry.js");
80
- var import_tesseract_wasm_backend2 = require("./ocr/tesseract-wasm-backend.js");
81
- var import_runtime2 = require("./runtime.js");
82
- let wasm = null;
83
- let initialized = false;
84
- let initializationError = null;
85
- let initializationPromise = null;
26
+
27
+ // typescript/adapters/wasm-adapter.ts
28
+ var MAX_FILE_SIZE = 512 * 1024 * 1024;
29
+ function isNumberOrNull(value) {
30
+ return typeof value === "number" || value === null;
31
+ }
32
+ function isStringOrNull(value) {
33
+ return typeof value === "string" || value === null;
34
+ }
35
+ function isBoolean(value) {
36
+ return typeof value === "boolean";
37
+ }
38
+ async function fileToUint8Array(file) {
39
+ try {
40
+ if (file.size > MAX_FILE_SIZE) {
41
+ throw new Error(
42
+ `File size (${file.size} bytes) exceeds maximum (${MAX_FILE_SIZE} bytes). Maximum file size is 512 MB.`
43
+ );
44
+ }
45
+ const arrayBuffer = await file.arrayBuffer();
46
+ return new Uint8Array(arrayBuffer);
47
+ } catch (error) {
48
+ throw new Error(`Failed to read file: ${error instanceof Error ? error.message : String(error)}`);
49
+ }
50
+ }
51
+ function configToJS(config) {
52
+ if (!config) {
53
+ return {};
54
+ }
55
+ const normalized = {};
56
+ const normalizeValue = (value) => {
57
+ if (value === null || value === void 0) {
58
+ return null;
59
+ }
60
+ if (typeof value === "object") {
61
+ if (Array.isArray(value)) {
62
+ return value.map(normalizeValue);
63
+ }
64
+ const obj = value;
65
+ const normalized2 = {};
66
+ for (const [key, val] of Object.entries(obj)) {
67
+ const normalizedVal = normalizeValue(val);
68
+ if (normalizedVal !== null && normalizedVal !== void 0) {
69
+ normalized2[key] = normalizedVal;
70
+ }
71
+ }
72
+ return Object.keys(normalized2).length > 0 ? normalized2 : null;
73
+ }
74
+ return value;
75
+ };
76
+ for (const [key, value] of Object.entries(config)) {
77
+ const normalizedValue = normalizeValue(value);
78
+ if (normalizedValue !== null && normalizedValue !== void 0) {
79
+ normalized[key] = normalizedValue;
80
+ }
81
+ }
82
+ return normalized;
83
+ }
84
+ function jsToExtractionResult(jsValue) {
85
+ if (!jsValue || typeof jsValue !== "object") {
86
+ throw new Error("Invalid extraction result: value is not an object");
87
+ }
88
+ const result = jsValue;
89
+ const mimeType = typeof result.mimeType === "string" ? result.mimeType : typeof result.mime_type === "string" ? result.mime_type : null;
90
+ if (typeof result.content !== "string") {
91
+ throw new Error("Invalid extraction result: missing or invalid content");
92
+ }
93
+ if (typeof mimeType !== "string") {
94
+ throw new Error("Invalid extraction result: missing or invalid mimeType");
95
+ }
96
+ if (!result.metadata || typeof result.metadata !== "object") {
97
+ throw new Error("Invalid extraction result: missing or invalid metadata");
98
+ }
99
+ const tables = [];
100
+ if (Array.isArray(result.tables)) {
101
+ for (const table of result.tables) {
102
+ if (table && typeof table === "object") {
103
+ const t = table;
104
+ if (Array.isArray(t.cells) && t.cells.every((row) => Array.isArray(row) && row.every((cell) => typeof cell === "string")) && typeof t.markdown === "string" && typeof t.pageNumber === "number") {
105
+ tables.push({
106
+ cells: t.cells,
107
+ markdown: t.markdown,
108
+ pageNumber: t.pageNumber
109
+ });
110
+ }
111
+ }
112
+ }
113
+ }
114
+ const chunks = Array.isArray(result.chunks) ? result.chunks.map((chunk) => {
115
+ if (!chunk || typeof chunk !== "object") {
116
+ throw new Error("Invalid chunk structure");
117
+ }
118
+ const c = chunk;
119
+ if (typeof c.content !== "string") {
120
+ throw new Error("Invalid chunk: missing content");
121
+ }
122
+ if (!c.metadata || typeof c.metadata !== "object") {
123
+ throw new Error("Invalid chunk: missing metadata");
124
+ }
125
+ const metadata = c.metadata;
126
+ let embedding = null;
127
+ if (Array.isArray(c.embedding)) {
128
+ if (!c.embedding.every((item) => typeof item === "number")) {
129
+ throw new Error("Invalid chunk: embedding must contain only numbers");
130
+ }
131
+ embedding = c.embedding;
132
+ }
133
+ const coerceToNumber = (value, fieldName) => {
134
+ if (typeof value === "number") {
135
+ return value;
136
+ }
137
+ if (typeof value === "bigint") {
138
+ return Number(value);
139
+ }
140
+ if (typeof value === "string") {
141
+ const parsed = parseInt(value, 10);
142
+ if (Number.isNaN(parsed)) {
143
+ throw new Error(`Invalid chunk metadata: ${fieldName} must be a valid number, got "${value}"`);
144
+ }
145
+ return parsed;
146
+ }
147
+ throw new Error(`Invalid chunk metadata: ${fieldName} must be a number, got ${typeof value}`);
148
+ };
149
+ const charStart = coerceToNumber(
150
+ metadata.charStart ?? metadata.char_start ?? metadata.byteStart ?? metadata.byte_start,
151
+ "charStart"
152
+ );
153
+ const charEnd = coerceToNumber(
154
+ metadata.charEnd ?? metadata.char_end ?? metadata.byteEnd ?? metadata.byte_end,
155
+ "charEnd"
156
+ );
157
+ const chunkIndex = coerceToNumber(metadata.chunkIndex ?? metadata.chunk_index, "chunkIndex");
158
+ const totalChunks = coerceToNumber(metadata.totalChunks ?? metadata.total_chunks, "totalChunks");
159
+ let tokenCount = null;
160
+ const tokenCountValue = metadata.tokenCount ?? metadata.token_count;
161
+ if (tokenCountValue !== null && tokenCountValue !== void 0) {
162
+ tokenCount = coerceToNumber(tokenCountValue, "tokenCount");
163
+ }
164
+ return {
165
+ content: c.content,
166
+ embedding,
167
+ metadata: {
168
+ charStart,
169
+ charEnd,
170
+ tokenCount,
171
+ chunkIndex,
172
+ totalChunks
173
+ }
174
+ };
175
+ }) : null;
176
+ const images = Array.isArray(result.images) ? result.images.map((image) => {
177
+ if (!image || typeof image !== "object") {
178
+ throw new Error("Invalid image structure");
179
+ }
180
+ const img = image;
181
+ if (!(img.data instanceof Uint8Array)) {
182
+ throw new Error("Invalid image: data must be Uint8Array");
183
+ }
184
+ if (typeof img.format !== "string") {
185
+ throw new Error("Invalid image: missing format");
186
+ }
187
+ if (typeof img.imageIndex !== "number") {
188
+ throw new Error("Invalid image: imageIndex must be a number");
189
+ }
190
+ if (!isNumberOrNull(img.pageNumber)) {
191
+ throw new Error("Invalid image: pageNumber must be a number or null");
192
+ }
193
+ if (!isNumberOrNull(img.width)) {
194
+ throw new Error("Invalid image: width must be a number or null");
195
+ }
196
+ if (!isNumberOrNull(img.height)) {
197
+ throw new Error("Invalid image: height must be a number or null");
198
+ }
199
+ if (!isNumberOrNull(img.bitsPerComponent)) {
200
+ throw new Error("Invalid image: bitsPerComponent must be a number or null");
201
+ }
202
+ if (!isBoolean(img.isMask)) {
203
+ throw new Error("Invalid image: isMask must be a boolean");
204
+ }
205
+ if (!isStringOrNull(img.colorspace)) {
206
+ throw new Error("Invalid image: colorspace must be a string or null");
207
+ }
208
+ if (!isStringOrNull(img.description)) {
209
+ throw new Error("Invalid image: description must be a string or null");
210
+ }
211
+ return {
212
+ data: img.data,
213
+ format: img.format,
214
+ imageIndex: img.imageIndex,
215
+ pageNumber: img.pageNumber,
216
+ width: img.width,
217
+ height: img.height,
218
+ colorspace: img.colorspace,
219
+ bitsPerComponent: img.bitsPerComponent,
220
+ isMask: img.isMask,
221
+ description: img.description,
222
+ ocrResult: img.ocrResult ? jsToExtractionResult(img.ocrResult) : null
223
+ };
224
+ }) : null;
225
+ let detectedLanguages = null;
226
+ const detectedLanguagesRaw = Array.isArray(result.detectedLanguages) ? result.detectedLanguages : result.detected_languages;
227
+ if (Array.isArray(detectedLanguagesRaw)) {
228
+ if (!detectedLanguagesRaw.every((lang) => typeof lang === "string")) {
229
+ throw new Error("Invalid result: detectedLanguages must contain only strings");
230
+ }
231
+ detectedLanguages = detectedLanguagesRaw;
232
+ }
233
+ return {
234
+ content: result.content,
235
+ mimeType,
236
+ metadata: result.metadata ?? {},
237
+ tables,
238
+ detectedLanguages,
239
+ chunks,
240
+ images
241
+ };
242
+ }
243
+ function wrapWasmError(error, context) {
244
+ if (error instanceof Error) {
245
+ return new Error(`Error ${context}: ${error.message}`, {
246
+ cause: error
247
+ });
248
+ }
249
+ const message = String(error);
250
+ return new Error(`Error ${context}: ${message}`);
251
+ }
252
+ function isValidExtractionResult(value) {
253
+ if (!value || typeof value !== "object") {
254
+ return false;
255
+ }
256
+ const obj = value;
257
+ return typeof obj.content === "string" && (typeof obj.mimeType === "string" || typeof obj.mime_type === "string") && obj.metadata !== null && typeof obj.metadata === "object" && Array.isArray(obj.tables);
258
+ }
259
+
260
+ // typescript/ocr/registry.ts
261
+ var ocrBackendRegistry = /* @__PURE__ */ new Map();
262
+ function registerOcrBackend(backend) {
263
+ if (!backend) {
264
+ throw new Error("Backend cannot be null or undefined");
265
+ }
266
+ if (typeof backend.name !== "function") {
267
+ throw new Error("Backend must implement name() method");
268
+ }
269
+ if (typeof backend.supportedLanguages !== "function") {
270
+ throw new Error("Backend must implement supportedLanguages() method");
271
+ }
272
+ if (typeof backend.processImage !== "function") {
273
+ throw new Error("Backend must implement processImage() method");
274
+ }
275
+ const backendName = backend.name();
276
+ if (!backendName || typeof backendName !== "string") {
277
+ throw new Error("Backend name must be a non-empty string");
278
+ }
279
+ if (ocrBackendRegistry.has(backendName)) {
280
+ console.warn(`OCR backend "${backendName}" is already registered and will be replaced`);
281
+ }
282
+ ocrBackendRegistry.set(backendName, backend);
283
+ }
284
+ function getOcrBackend(name) {
285
+ return ocrBackendRegistry.get(name);
286
+ }
287
+ function listOcrBackends() {
288
+ return Array.from(ocrBackendRegistry.keys());
289
+ }
290
+ async function unregisterOcrBackend(name) {
291
+ const backend = ocrBackendRegistry.get(name);
292
+ if (!backend) {
293
+ throw new Error(
294
+ `OCR backend "${name}" is not registered. Available backends: ${Array.from(ocrBackendRegistry.keys()).join(", ")}`
295
+ );
296
+ }
297
+ if (typeof backend.shutdown === "function") {
298
+ try {
299
+ await backend.shutdown();
300
+ } catch (error) {
301
+ console.warn(
302
+ `Error shutting down OCR backend "${name}": ${error instanceof Error ? error.message : String(error)}`
303
+ );
304
+ }
305
+ }
306
+ ocrBackendRegistry.delete(name);
307
+ }
308
+ async function clearOcrBackends() {
309
+ const backends = Array.from(ocrBackendRegistry.entries());
310
+ for (const [name, backend] of backends) {
311
+ if (typeof backend.shutdown === "function") {
312
+ try {
313
+ await backend.shutdown();
314
+ } catch (error) {
315
+ console.warn(
316
+ `Error shutting down OCR backend "${name}": ${error instanceof Error ? error.message : String(error)}`
317
+ );
318
+ }
319
+ }
320
+ }
321
+ ocrBackendRegistry.clear();
322
+ }
323
+
324
+ // typescript/ocr/tesseract-wasm-backend.ts
325
+ var TesseractWasmBackend = class {
326
+ /** Tesseract WASM client instance */
327
+ client = null;
328
+ /** Track which models are currently loaded to avoid redundant loads */
329
+ loadedLanguages = /* @__PURE__ */ new Set();
330
+ /** Cache for language availability validation */
331
+ supportedLangsCache = null;
332
+ /** Progress callback for UI updates */
333
+ progressCallback = null;
334
+ /** Base URL for training data CDN */
335
+ CDN_BASE_URL = "https://cdn.jsdelivr.net/npm/tesseract-wasm@0.11.0/dist";
336
+ /**
337
+ * Return the unique name of this OCR backend
338
+ *
339
+ * @returns Backend identifier "tesseract-wasm"
340
+ */
341
+ name() {
342
+ return "tesseract-wasm";
343
+ }
344
+ /**
345
+ * Return list of supported language codes
346
+ *
347
+ * Returns a curated list of commonly available Tesseract language models.
348
+ * Tesseract supports many more languages through custom models.
349
+ *
350
+ * @returns Array of ISO 639-1/2/3 language codes
351
+ */
352
+ supportedLanguages() {
353
+ if (this.supportedLangsCache) {
354
+ return this.supportedLangsCache;
355
+ }
356
+ this.supportedLangsCache = [
357
+ "eng",
358
+ "deu",
359
+ "fra",
360
+ "spa",
361
+ "ita",
362
+ "por",
363
+ "nld",
364
+ "rus",
365
+ "jpn",
366
+ "kor",
367
+ "chi_sim",
368
+ "chi_tra",
369
+ "pol",
370
+ "tur",
371
+ "swe",
372
+ "dan",
373
+ "fin",
374
+ "nor",
375
+ "ces",
376
+ "slk",
377
+ "ron",
378
+ "hun",
379
+ "hrv",
380
+ "srp",
381
+ "bul",
382
+ "ukr",
383
+ "ell",
384
+ "ara",
385
+ "heb",
386
+ "hin",
387
+ "tha",
388
+ "vie",
389
+ "mkd",
390
+ "ben",
391
+ "tam",
392
+ "tel",
393
+ "kan",
394
+ "mal",
395
+ "mya",
396
+ "khm",
397
+ "lao",
398
+ "sin"
399
+ ];
400
+ return this.supportedLangsCache;
401
+ }
402
+ /**
403
+ * Initialize the OCR backend
404
+ *
405
+ * Creates the Tesseract WASM client instance. This is called once when
406
+ * the backend is registered with the extraction pipeline.
407
+ *
408
+ * The actual model loading happens in processImage() on-demand to avoid
409
+ * loading all models upfront.
410
+ *
411
+ * @throws {Error} If tesseract-wasm is not available or initialization fails
412
+ *
413
+ * @example
414
+ * ```typescript
415
+ * const backend = new TesseractWasmBackend();
416
+ * try {
417
+ * await backend.initialize();
418
+ * } catch (error) {
419
+ * console.error('Failed to initialize OCR:', error);
420
+ * }
421
+ * ```
422
+ */
423
+ async initialize() {
424
+ if (this.client) {
425
+ return;
426
+ }
427
+ try {
428
+ const tesseractModule = await this.loadTesseractWasm();
429
+ if (!tesseractModule || typeof tesseractModule.OCRClient !== "function") {
430
+ throw new Error("tesseract-wasm OCRClient not found. Ensure tesseract-wasm is installed and available.");
431
+ }
432
+ this.client = new tesseractModule.OCRClient();
433
+ this.loadedLanguages.clear();
434
+ } catch (error) {
435
+ const message = error instanceof Error ? error.message : String(error);
436
+ throw new Error(`Failed to initialize TesseractWasmBackend: ${message}`);
437
+ }
438
+ }
439
+ /**
440
+ * Process image bytes and extract text via OCR
441
+ *
442
+ * Handles image loading, model loading, OCR processing, and result formatting.
443
+ * Automatically loads the language model on first use and caches it for subsequent calls.
444
+ *
445
+ * @param imageBytes - Raw image data (Uint8Array) or Base64-encoded string
446
+ * @param language - ISO 639-2/3 language code (e.g., "eng", "deu")
447
+ * @returns Promise resolving to OCR result with content and metadata
448
+ * @throws {Error} If image processing fails, model loading fails, or language is unsupported
449
+ *
450
+ * @example
451
+ * ```typescript
452
+ * const backend = new TesseractWasmBackend();
453
+ * await backend.initialize();
454
+ *
455
+ * const imageBuffer = fs.readFileSync('scanned.png');
456
+ * const result = await backend.processImage(
457
+ * new Uint8Array(imageBuffer),
458
+ * 'eng'
459
+ * );
460
+ *
461
+ * console.log(result.content); // Extracted text
462
+ * console.log(result.metadata.confidence); // OCR confidence score
463
+ * ```
464
+ */
465
+ async processImage(imageBytes, language) {
466
+ if (!this.client) {
467
+ throw new Error("TesseractWasmBackend not initialized. Call initialize() first.");
468
+ }
469
+ const supported = this.supportedLanguages();
470
+ const normalizedLang = language.toLowerCase();
471
+ const isSupported = supported.some((lang) => lang.toLowerCase() === normalizedLang);
472
+ if (!isSupported) {
473
+ throw new Error(`Language "${language}" is not supported. Supported languages: ${supported.join(", ")}`);
474
+ }
475
+ try {
476
+ if (!this.loadedLanguages.has(normalizedLang)) {
477
+ this.reportProgress(10);
478
+ await this.loadLanguageModel(normalizedLang);
479
+ this.loadedLanguages.add(normalizedLang);
480
+ this.reportProgress(30);
481
+ }
482
+ this.reportProgress(40);
483
+ const imageBitmap = await this.convertToImageBitmap(imageBytes);
484
+ this.reportProgress(50);
485
+ await this.client.loadImage(imageBitmap);
486
+ this.reportProgress(70);
487
+ const text = await this.client.getText();
488
+ const confidence = await this.getConfidenceScore();
489
+ const pageMetadata = await this.getPageMetadata();
490
+ this.reportProgress(90);
491
+ return {
492
+ content: text,
493
+ mime_type: "text/plain",
494
+ metadata: {
495
+ language: normalizedLang,
496
+ confidence,
497
+ ...pageMetadata
498
+ },
499
+ tables: []
500
+ };
501
+ } catch (error) {
502
+ const message = error instanceof Error ? error.message : String(error);
503
+ throw new Error(`OCR processing failed for language "${language}": ${message}`);
504
+ } finally {
505
+ this.reportProgress(100);
506
+ }
507
+ }
508
+ /**
509
+ * Shutdown the OCR backend and release resources
510
+ *
511
+ * Properly cleans up the Tesseract WASM client, freeing memory and Web Workers.
512
+ * Called when the backend is unregistered or the application shuts down.
513
+ *
514
+ * @throws {Error} If cleanup fails (errors are logged but not critical)
515
+ *
516
+ * @example
517
+ * ```typescript
518
+ * const backend = new TesseractWasmBackend();
519
+ * await backend.initialize();
520
+ * // ... use backend ...
521
+ * await backend.shutdown(); // Clean up resources
522
+ * ```
523
+ */
524
+ async shutdown() {
525
+ try {
526
+ if (this.client) {
527
+ if (typeof this.client.destroy === "function") {
528
+ this.client.destroy();
529
+ }
530
+ if (typeof this.client.terminate === "function") {
531
+ this.client.terminate();
532
+ }
533
+ this.client = null;
534
+ }
535
+ this.loadedLanguages.clear();
536
+ this.supportedLangsCache = null;
537
+ this.progressCallback = null;
538
+ } catch (error) {
539
+ console.warn(
540
+ `Warning during TesseractWasmBackend shutdown: ${error instanceof Error ? error.message : String(error)}`
541
+ );
542
+ }
543
+ }
544
+ /**
545
+ * Set a progress callback for UI updates
546
+ *
547
+ * Allows the UI to display progress during OCR processing.
548
+ * The callback will be called with values from 0 to 100.
549
+ *
550
+ * @param callback - Function to call with progress percentage
551
+ *
552
+ * @example
553
+ * ```typescript
554
+ * const backend = new TesseractWasmBackend();
555
+ * backend.setProgressCallback((progress) => {
556
+ * console.log(`OCR Progress: ${progress}%`);
557
+ * document.getElementById('progress-bar').style.width = `${progress}%`;
558
+ * });
559
+ * ```
560
+ */
561
+ setProgressCallback(callback) {
562
+ this.progressCallback = callback;
563
+ }
564
+ /**
565
+ * Load language model from CDN
566
+ *
567
+ * Fetches the training data for a specific language from jsDelivr CDN.
568
+ * This is an MVP approach - models are cached by the browser.
569
+ *
570
+ * @param language - ISO 639-2/3 language code
571
+ * @throws {Error} If model download fails or language is not available
572
+ *
573
+ * @internal
574
+ */
575
+ async loadLanguageModel(language) {
576
+ if (!this.client) {
577
+ throw new Error("Client not initialized");
578
+ }
579
+ const modelFilename = `${language}.traineddata`;
580
+ const modelUrl = `${this.CDN_BASE_URL}/${modelFilename}`;
581
+ try {
582
+ await this.client.loadModel(modelUrl);
583
+ } catch (error) {
584
+ const message = error instanceof Error ? error.message : String(error);
585
+ throw new Error(`Failed to load model for language "${language}" from ${modelUrl}: ${message}`);
586
+ }
587
+ }
588
+ /**
589
+ * Convert image bytes or Base64 string to ImageBitmap
590
+ *
591
+ * Handles both Uint8Array and Base64-encoded image data, converting to
592
+ * ImageBitmap format required by Tesseract WASM.
593
+ *
594
+ * @param imageBytes - Image data as Uint8Array or Base64 string
595
+ * @returns Promise resolving to ImageBitmap
596
+ * @throws {Error} If conversion fails (browser API not available or invalid image data)
597
+ *
598
+ * @internal
599
+ */
600
+ async convertToImageBitmap(imageBytes) {
601
+ if (typeof createImageBitmap === "undefined") {
602
+ throw new Error("createImageBitmap is not available. TesseractWasmBackend requires a browser environment.");
603
+ }
604
+ try {
605
+ let bytes = imageBytes;
606
+ if (typeof imageBytes === "string") {
607
+ const binaryString = atob(imageBytes);
608
+ bytes = new Uint8Array(binaryString.length);
609
+ for (let i = 0; i < binaryString.length; i++) {
610
+ bytes[i] = binaryString.charCodeAt(i);
611
+ }
612
+ }
613
+ const blob = new Blob([bytes]);
614
+ const imageBitmap = await createImageBitmap(blob);
615
+ return imageBitmap;
616
+ } catch (error) {
617
+ const message = error instanceof Error ? error.message : String(error);
618
+ throw new Error(`Failed to convert image bytes to ImageBitmap: ${message}`);
619
+ }
620
+ }
621
+ /**
622
+ * Get confidence score from OCR result
623
+ *
624
+ * Attempts to retrieve confidence score from Tesseract.
625
+ * Returns a safe default if unavailable.
626
+ *
627
+ * @returns Confidence score between 0 and 1
628
+ *
629
+ * @internal
630
+ */
631
+ async getConfidenceScore() {
632
+ try {
633
+ if (this.client && typeof this.client.getConfidence === "function") {
634
+ const confidence = await this.client.getConfidence();
635
+ return confidence > 1 ? confidence / 100 : confidence;
636
+ }
637
+ } catch {
638
+ }
639
+ return 0.9;
640
+ }
641
+ /**
642
+ * Get page metadata from OCR result
643
+ *
644
+ * Retrieves additional metadata like image dimensions and processing info.
645
+ *
646
+ * @returns Metadata object (may be empty if unavailable)
647
+ *
648
+ * @internal
649
+ */
650
+ async getPageMetadata() {
651
+ try {
652
+ if (this.client && typeof this.client.getPageMetadata === "function") {
653
+ return await this.client.getPageMetadata();
654
+ }
655
+ } catch {
656
+ }
657
+ return {};
658
+ }
659
+ /**
660
+ * Dynamically load tesseract-wasm module
661
+ *
662
+ * Uses dynamic import to load tesseract-wasm only when needed,
663
+ * avoiding hard dependency in browser environments where it may not be bundled.
664
+ *
665
+ * @returns tesseract-wasm module object
666
+ * @throws {Error} If module cannot be imported
667
+ *
668
+ * @internal
669
+ */
670
+ async loadTesseractWasm() {
671
+ try {
672
+ const module = await import("tesseract-wasm");
673
+ return module;
674
+ } catch (error) {
675
+ const message = error instanceof Error ? error.message : String(error);
676
+ throw new Error(
677
+ `Failed to import tesseract-wasm. Ensure it is installed via: npm install tesseract-wasm. Error: ${message}`
678
+ );
679
+ }
680
+ }
681
+ /**
682
+ * Report progress to progress callback
683
+ *
684
+ * Internal helper for notifying progress updates during OCR processing.
685
+ *
686
+ * @param progress - Progress percentage (0-100)
687
+ *
688
+ * @internal
689
+ */
690
+ reportProgress(progress) {
691
+ if (this.progressCallback) {
692
+ try {
693
+ this.progressCallback(Math.min(100, Math.max(0, progress)));
694
+ } catch {
695
+ }
696
+ }
697
+ }
698
+ };
699
+
700
+ // typescript/runtime.ts
701
+ function detectRuntime() {
702
+ if (typeof globalThis.Deno !== "undefined") {
703
+ return "deno";
704
+ }
705
+ if (typeof globalThis.Bun !== "undefined") {
706
+ return "bun";
707
+ }
708
+ if (typeof process !== "undefined" && process.versions && process.versions.node) {
709
+ return "node";
710
+ }
711
+ if (typeof window !== "undefined" && typeof document !== "undefined") {
712
+ return "browser";
713
+ }
714
+ return "unknown";
715
+ }
716
+ function isBrowser() {
717
+ return detectRuntime() === "browser";
718
+ }
719
+ function isNode() {
720
+ return detectRuntime() === "node";
721
+ }
722
+ function isDeno() {
723
+ return detectRuntime() === "deno";
724
+ }
725
+ function isBun() {
726
+ return detectRuntime() === "bun";
727
+ }
728
+ function isWebEnvironment() {
729
+ const runtime = detectRuntime();
730
+ return runtime === "browser";
731
+ }
732
/**
 * @returns {boolean} True when the detected runtime is "node", "deno" or "bun".
 */
function isServerEnvironment() {
  const serverRuntimes = ["node", "deno", "bun"];
  return serverRuntimes.includes(detectRuntime());
}
736
/**
 * @returns {boolean} True only when the `window`, `File` and `Blob` globals are all defined.
 */
function hasFileApi() {
  if (typeof window === "undefined") {
    return false;
  }
  if (typeof File === "undefined") {
    return false;
  }
  return typeof Blob !== "undefined";
}
739
/**
 * @returns {boolean} True when a `Blob` global is defined.
 */
function hasBlob() {
  const blobKind = typeof Blob;
  return blobKind !== "undefined";
}
742
/**
 * @returns {boolean} True when a `Worker` global is defined.
 */
function hasWorkers() {
  const workerKind = typeof Worker;
  return workerKind !== "undefined";
}
745
/**
 * @returns {boolean} True when a `SharedArrayBuffer` global is defined.
 */
function hasSharedArrayBuffer() {
  const sabKind = typeof SharedArrayBuffer;
  return sabKind !== "undefined";
}
748
/**
 * Detect whether `new Worker(url, { type: "module" })` is understood here.
 *
 * The previous implementation only created and revoked a Blob URL and then
 * returned `true`, so it never looked at the Worker constructor at all. This
 * version constructs a throw-away worker with an options object whose `type`
 * property is a getter: a constructor that honours the `type` option has to
 * read it, one that ignores the option never touches it.
 *
 * @returns {boolean} True when a Worker could be constructed and the
 *   constructor read `options.type`; false when workers are unavailable, the
 *   option is ignored, or construction throws.
 */
function hasModuleWorkers() {
  if (!hasWorkers()) {
    return false;
  }
  let workerUrl;
  try {
    let typeOptionRead = false;
    const probeOptions = {
      get type() {
        typeOptionRead = true;
        return "module";
      }
    };
    // Empty script: the probe worker is terminated right away and must not log.
    const blob = new Blob([""], { type: "application/javascript" });
    workerUrl = URL.createObjectURL(blob);
    const probe = new Worker(workerUrl, probeOptions);
    probe.terminate();
    return typeOptionRead;
  } catch {
    // Blob, object URLs or worker construction unavailable/blocked here.
    return false;
  } finally {
    if (workerUrl !== void 0) {
      URL.revokeObjectURL(workerUrl);
    }
  }
}
766
/**
 * @returns {boolean} True when a `WebAssembly` global exists and exposes `instantiate`.
 */
function hasWasm() {
  if (typeof WebAssembly === "undefined") {
    return false;
  }
  return WebAssembly.instantiate !== undefined;
}
769
/**
 * @returns {boolean} True when a `WebAssembly` global exists and exposes `instantiateStreaming`.
 */
function hasWasmStreaming() {
  if (typeof WebAssembly === "undefined") {
    return false;
  }
  return WebAssembly.instantiateStreaming !== undefined;
}
772
/**
 * @returns {boolean} True when `BigInt("1")` succeeds and yields a value whose
 *   typeof is "bigint"; false when the call throws.
 */
function hasBigInt() {
  let sample;
  try {
    sample = BigInt("1");
  } catch {
    return false;
  }
  return typeof sample === "bigint";
}
780
/**
 * Look up the version string of the detected runtime.
 *
 * - node: `process.version` without its first character
 * - deno: `globalThis.Deno.version.deno`
 * - bun:  `globalThis.Bun.version`
 *
 * @returns {string | undefined} The version, or undefined for any other runtime
 *   or when the corresponding value is missing.
 */
function getRuntimeVersion() {
  const runtime = detectRuntime();
  if (runtime === "node") {
    return process.version?.substring(1);
  }
  if (runtime === "deno") {
    return globalThis.Deno?.version?.deno;
  }
  if (runtime === "bun") {
    return globalThis.Bun?.version;
  }
  return undefined;
}
798
/**
 * Collect the results of every capability probe into one object.
 *
 * @returns {object} `runtime` plus one boolean per `has*` probe; a
 *   `runtimeVersion` key is added last, and only when a version is available.
 */
function getWasmCapabilities() {
  const runtime = detectRuntime();
  const version = getRuntimeVersion();
  const report = {
    runtime,
    hasWasm: hasWasm(),
    hasWasmStreaming: hasWasmStreaming(),
    hasFileApi: hasFileApi(),
    hasBlob: hasBlob(),
    hasWorkers: hasWorkers(),
    hasSharedArrayBuffer: hasSharedArrayBuffer(),
    hasModuleWorkers: hasModuleWorkers(),
    hasBigInt: hasBigInt()
  };
  // Leave the key out entirely rather than storing an undefined version.
  if (version !== undefined) {
    report.runtimeVersion = version;
  }
  return report;
}
815
/**
 * Build a summary of the current runtime.
 *
 * @returns {object} The detected runtime name, one boolean per runtime
 *   predicate, the runtime version, `navigator.userAgent` (or "N/A" when no
 *   `navigator` global exists) and the capability report.
 */
function getRuntimeInfo() {
  const runtime = detectRuntime();
  const capabilities = getWasmCapabilities();
  const info = { runtime };
  info.isBrowser = isBrowser();
  info.isNode = isNode();
  info.isDeno = isDeno();
  info.isBun = isBun();
  info.isWeb = isWebEnvironment();
  info.isServer = isServerEnvironment();
  info.runtimeVersion = getRuntimeVersion();
  if (typeof navigator !== "undefined") {
    info.userAgent = navigator.userAgent;
  } else {
    info.userAgent = "N/A";
  }
  info.capabilities = capabilities;
  return info;
}
831
+
832
+ // typescript/plugin-registry.ts
833
// Registered post-processors, keyed by the string each one returns from name().
var postProcessors = /* @__PURE__ */ new Map();
// Registered validators, keyed by the string each one returns from name().
var validators = /* @__PURE__ */ new Map();
835
/**
 * Check that a value has the shape required of a post-processor.
 *
 * @param processor - Candidate post-processor.
 * @returns {true} Always true when no check fails.
 * @throws {Error} If the value is null/undefined, lacks a name() or process()
 *   method, or name() does not return a non-blank string.
 */
function validatePostProcessor(processor) {
  if (processor == null) {
    throw new Error("Post-processor cannot be null or undefined");
  }
  const hasNameMethod = typeof processor.name === "function";
  if (!hasNameMethod) {
    throw new Error("Post-processor must implement name() method");
  }
  const hasProcessMethod = typeof processor.process === "function";
  if (!hasProcessMethod) {
    throw new Error("Post-processor must implement process() method");
  }
  const reportedName = processor.name();
  const isUsableName = typeof reportedName === "string" && reportedName.trim() !== "";
  if (!isUsableName) {
    throw new Error("Post-processor name must be a non-empty string");
  }
  return true;
}
852
/**
 * Validate a post-processor and store it under the name it reports.
 *
 * An existing entry with the same name is replaced after a console warning.
 *
 * @param processor - Post-processor to register.
 * @throws {Error} Whatever validatePostProcessor throws for an invalid value.
 */
function registerPostProcessor(processor) {
  validatePostProcessor(processor);
  const key = processor.name();
  const isReplacement = postProcessors.has(key);
  if (isReplacement) {
    console.warn(`Post-processor "${key}" already registered, overwriting with new implementation`);
  }
  postProcessors.set(key, processor);
}
860
/**
 * @param name - Registered post-processor name.
 * @returns The post-processor stored under that name, or undefined if none.
 */
function getPostProcessor(name) {
  const found = postProcessors.get(name);
  return found;
}
863
/**
 * @returns {string[]} A new array of the registered post-processor names, in insertion order.
 */
function listPostProcessors() {
  return [...postProcessors.keys()];
}
866
/**
 * Remove a post-processor, calling its optional shutdown() first.
 *
 * A failing shutdown() is logged with console.warn and does not prevent removal.
 *
 * @param name - Name of the post-processor to remove.
 * @throws {Error} If nothing is registered under that name; the message lists
 *   the registered names when there are any.
 */
async function unregisterPostProcessor(name) {
  const target = postProcessors.get(name);
  if (!target) {
    const registered = [...postProcessors.keys()];
    const hint = registered.length === 0 ? "" : ` Available: ${registered.join(", ")}`;
    throw new Error(`Post-processor "${name}" is not registered.${hint}`);
  }
  try {
    if (target.shutdown) {
      await target.shutdown();
    }
  } catch (failure) {
    console.warn(`Error during shutdown of post-processor "${name}":`, failure);
  }
  postProcessors.delete(name);
}
882
/**
 * Shut down and remove every registered post-processor.
 *
 * shutdown() is awaited one processor at a time, in insertion order. A
 * failure is logged with console.warn and the loop continues; the registry is
 * emptied at the end regardless.
 */
async function clearPostProcessors() {
  // Snapshot first so the loop iterates a fixed list of entries.
  const snapshot = [...postProcessors];
  for (const [registeredName, instance] of snapshot) {
    try {
      if (instance.shutdown) {
        await instance.shutdown();
      }
    } catch (failure) {
      console.warn(`Error during shutdown of post-processor "${registeredName}":`, failure);
    }
  }
  postProcessors.clear();
}
895
/**
 * Check that a value has the shape required of a validator.
 *
 * @param validator - Candidate validator.
 * @returns {true} Always true when no check fails.
 * @throws {Error} If the value is null/undefined, lacks a name() or validate()
 *   method, or name() does not return a non-blank string.
 */
function validateValidator(validator) {
  if (validator == null) {
    throw new Error("Validator cannot be null or undefined");
  }
  const hasNameMethod = typeof validator.name === "function";
  if (!hasNameMethod) {
    throw new Error("Validator must implement name() method");
  }
  const hasValidateMethod = typeof validator.validate === "function";
  if (!hasValidateMethod) {
    throw new Error("Validator must implement validate() method");
  }
  const reportedName = validator.name();
  const isUsableName = typeof reportedName === "string" && reportedName.trim() !== "";
  if (!isUsableName) {
    throw new Error("Validator name must be a non-empty string");
  }
  return true;
}
912
/**
 * Validate a validator object and store it under the name it reports.
 *
 * An existing entry with the same name is replaced after a console warning.
 *
 * @param validator - Validator to register.
 * @throws {Error} Whatever validateValidator throws for an invalid value.
 */
function registerValidator(validator) {
  validateValidator(validator);
  const key = validator.name();
  const isReplacement = validators.has(key);
  if (isReplacement) {
    console.warn(`Validator "${key}" already registered, overwriting with new implementation`);
  }
  validators.set(key, validator);
}
920
/**
 * @param name - Registered validator name.
 * @returns The validator stored under that name, or undefined if none.
 */
function getValidator(name) {
  const found = validators.get(name);
  return found;
}
923
/**
 * @returns {string[]} A new array of the registered validator names, in insertion order.
 */
function listValidators() {
  return [...validators.keys()];
}
926
/**
 * Remove a validator, calling its optional shutdown() first.
 *
 * A failing shutdown() is logged with console.warn and does not prevent removal.
 *
 * @param name - Name of the validator to remove.
 * @throws {Error} If nothing is registered under that name; the message lists
 *   the registered names when there are any.
 */
async function unregisterValidator(name) {
  const target = validators.get(name);
  if (!target) {
    const registered = [...validators.keys()];
    const hint = registered.length === 0 ? "" : ` Available: ${registered.join(", ")}`;
    throw new Error(`Validator "${name}" is not registered.${hint}`);
  }
  try {
    if (target.shutdown) {
      await target.shutdown();
    }
  } catch (failure) {
    console.warn(`Error during shutdown of validator "${name}":`, failure);
  }
  validators.delete(name);
}
942
/**
 * Shut down and remove every registered validator.
 *
 * shutdown() is awaited one validator at a time, in insertion order. A
 * failure is logged with console.warn and the loop continues; the registry is
 * emptied at the end regardless.
 */
async function clearValidators() {
  // Snapshot first so the loop iterates a fixed list of entries.
  const snapshot = [...validators];
  for (const [registeredName, instance] of snapshot) {
    try {
      if (instance.shutdown) {
        await instance.shutdown();
      }
    } catch (failure) {
      console.warn(`Error during shutdown of validator "${registeredName}":`, failure);
    }
  }
  validators.clear();
}
955
/**
 * Run a registered post-processor and always hand back a promise.
 *
 * @param name - Name of the registered post-processor.
 * @param result - Value passed to the processor's process() method.
 * @returns {Promise} The promise returned by process(), or a promise resolved
 *   with its synchronous return value. Rejects when no processor is registered
 *   under `name`, or when process() throws synchronously; in the latter case
 *   the thrown value is kept on the wrapping error's `cause`.
 */
function executePostProcessor(name, result) {
  const processor = postProcessors.get(name);
  if (!processor) {
    return Promise.reject(new Error(`Post-processor "${name}" is not registered`));
  }
  try {
    const output = processor.process(result);
    return output instanceof Promise ? output : Promise.resolve(output);
  } catch (error) {
    // String(error) alone drops the stack; `cause` keeps the original value reachable.
    return Promise.reject(
      new Error(`Error executing post-processor "${name}": ${String(error)}`, { cause: error })
    );
  }
}
970
/**
 * Run a registered validator and always hand back a promise.
 *
 * @param name - Name of the registered validator.
 * @param result - Value passed to the validator's validate() method.
 * @returns {Promise} The promise returned by validate(), or a promise resolved
 *   with its synchronous return value. Rejects when no validator is registered
 *   under `name`, or when validate() throws synchronously; in the latter case
 *   the thrown value is kept on the wrapping error's `cause`.
 */
function executeValidator(name, result) {
  const validator = validators.get(name);
  if (!validator) {
    return Promise.reject(new Error(`Validator "${name}" is not registered`));
  }
  try {
    const output = validator.validate(result);
    return output instanceof Promise ? output : Promise.resolve(output);
  } catch (error) {
    // String(error) alone drops the stack; `cause` keeps the original value reachable.
    return Promise.reject(
      new Error(`Error executing validator "${name}": ${String(error)}`, { cause: error })
    );
  }
}
985
/**
 * Publish the plugin executors on the global object.
 *
 * Assigns executePostProcessor and executeValidator to
 * `globalThis.__kreuzberg_execute_post_processor` and
 * `globalThis.__kreuzberg_execute_validator`. Does nothing when `globalThis`
 * is not defined.
 */
function setupGlobalCallbacks() {
  if (typeof globalThis === "undefined") {
    return;
  }
  const target = globalThis;
  target.__kreuzberg_execute_post_processor = executePostProcessor;
  target.__kreuzberg_execute_validator = executeValidator;
}
992
+ setupGlobalCallbacks();
993
+
994
+ // typescript/index.ts
995
// Loaded WASM module namespace; stays null until initWasm() has imported it.
var wasm = null;
// Flipped to true by initWasm() once loading has finished without error.
var initialized = false;
// Error recorded by the most recent failed initWasm() attempt, if any.
var initializationError = null;
// Promise of the initWasm() run currently stored by the module, if any.
var initializationPromise = null;
999
/**
 * Best-effort PDFium setup for a loaded WASM module.
 *
 * Returns without doing anything when the module is missing or has no
 * initialize_pdfium_render function, and logs a debug message and returns when
 * isBrowser() is false. Otherwise it obtains the bundled pdfium module (calling
 * its default export when that is a function) and passes it to
 * `wasmModule.initialize_pdfium_render(pdfium, wasmModule, false)`.
 *
 * Failures inside that sequence are caught and logged with console.debug; a
 * falsy return from initialize_pdfium_render is logged with console.warn.
 *
 * @param wasmModule - The loaded WASM module namespace.
 * @returns {Promise<void>}
 */
async function initializePdfiumAsync(wasmModule) {
  const canInitialize = Boolean(wasmModule) && typeof wasmModule.initialize_pdfium_render === "function";
  if (!canInitialize) {
    return;
  }
  if (!isBrowser()) {
    console.debug("PDFium initialization skipped (non-browser environment)");
    return;
  }
  try {
    const pdfiumModule = await Promise.resolve().then(() => (init_pdfium(), pdfium_exports));
    let pdfium = pdfiumModule;
    if (typeof pdfiumModule.default === "function") {
      pdfium = await pdfiumModule.default();
    }
    const initializedOk = wasmModule.initialize_pdfium_render(pdfium, wasmModule, false);
    if (!initializedOk) {
      console.warn("PDFium initialization returned false");
    }
  } catch (failure) {
    console.debug("PDFium initialization error:", failure);
  }
}
86
1018
  async function initWasm() {
87
1019
  if (initialized) {
88
1020
  return;
@@ -92,24 +1024,37 @@ async function initWasm() {
92
1024
  }
93
1025
  initializationPromise = (async () => {
94
1026
  try {
95
- if (!(0, import_runtime.hasWasm)()) {
1027
+ if (!hasWasm()) {
96
1028
  throw new Error("WebAssembly is not supported in this environment");
97
1029
  }
98
1030
  let wasmModule;
1031
+ const pkgPath = "./pkg/kreuzberg_wasm.js";
1032
+ const fallbackPath = "./kreuzberg_wasm.js";
99
1033
  try {
100
- wasmModule = await import("../../pkg/kreuzberg_wasm");
1034
+ wasmModule = await import(
1035
+ /* @vite-ignore */
1036
+ pkgPath
1037
+ );
101
1038
  } catch {
102
- wasmModule = await import("./kreuzberg_wasm");
1039
+ wasmModule = await import(
1040
+ /* @vite-ignore */
1041
+ fallbackPath
1042
+ );
103
1043
  }
104
1044
  wasm = wasmModule;
105
1045
  if (wasm && typeof wasm.default === "function") {
106
1046
  await wasm.default();
107
1047
  }
1048
+ if (isBrowser() && wasm && typeof wasm.initialize_pdfium_render === "function") {
1049
+ initializePdfiumAsync(wasm).catch((error) => {
1050
+ console.warn("PDFium auto-initialization failed (PDF extraction disabled):", error);
1051
+ });
1052
+ }
108
1053
  initialized = true;
109
1054
  initializationError = null;
110
1055
  } catch (error) {
111
1056
  initializationError = error instanceof Error ? error : new Error(String(error));
112
- throw (0, import_wasm_adapter.wrapWasmError)(error, "initializing Kreuzberg WASM module");
1057
+ throw wrapWasmError(error, "initializing Kreuzberg WASM module");
113
1058
  }
114
1059
  })();
115
1060
  return initializationPromise;
@@ -143,14 +1088,14 @@ async function extractBytes(data, mimeType, config) {
143
1088
  if (!mimeType) {
144
1089
  throw new Error("MIME type is required");
145
1090
  }
146
- const normalizedConfig = (0, import_wasm_adapter.configToJS)(config ?? null);
1091
+ const normalizedConfig = configToJS(config ?? null);
147
1092
  const result = await wasm.extractBytes(data, mimeType, normalizedConfig);
148
1093
  if (!result) {
149
1094
  throw new Error("Invalid extraction result: no result from WASM module");
150
1095
  }
151
- return (0, import_wasm_adapter.jsToExtractionResult)(result);
1096
+ return jsToExtractionResult(result);
152
1097
  } catch (error) {
153
- throw (0, import_wasm_adapter.wrapWasmError)(error, "extracting from bytes");
1098
+ throw wrapWasmError(error, "extracting from bytes");
154
1099
  }
155
1100
  }
156
1101
  async function extractFile(path, mimeType, config) {
@@ -164,20 +1109,20 @@ async function extractFile(path, mimeType, config) {
164
1109
  if (!path) {
165
1110
  throw new Error("File path is required");
166
1111
  }
167
- const runtime = (0, import_runtime.detectRuntime)();
1112
+ const runtime = detectRuntime();
168
1113
  if (runtime === "browser") {
169
1114
  throw new Error("Use extractBytes with fileToUint8Array for browser environments");
170
1115
  }
171
1116
  let fileData;
172
1117
  if (runtime === "node") {
173
- const { readFile } = await import("node:fs/promises");
1118
+ const { readFile } = await import("fs/promises");
174
1119
  const buffer = await readFile(path);
175
1120
  fileData = new Uint8Array(buffer);
176
1121
  } else if (runtime === "deno") {
177
1122
  const deno = globalThis.Deno;
178
1123
  fileData = await deno.readFile(path);
179
1124
  } else if (runtime === "bun") {
180
- const { readFile } = await import("node:fs/promises");
1125
+ const { readFile } = await import("fs/promises");
181
1126
  const buffer = await readFile(path);
182
1127
  fileData = new Uint8Array(buffer);
183
1128
  } else {
@@ -193,7 +1138,7 @@ async function extractFile(path, mimeType, config) {
193
1138
  detectedMimeType = wasm.normalizeMimeType(detectedMimeType);
194
1139
  return await extractBytes(fileData, detectedMimeType, config);
195
1140
  } catch (error) {
196
- throw (0, import_wasm_adapter.wrapWasmError)(error, `extracting from file: ${path}`);
1141
+ throw wrapWasmError(error, `extracting from file: ${path}`);
197
1142
  }
198
1143
  }
199
1144
  async function extractFromFile(file, mimeType, config) {
@@ -204,12 +1149,12 @@ async function extractFromFile(file, mimeType, config) {
204
1149
  throw new Error("WASM module not loaded. Call initWasm() first.");
205
1150
  }
206
1151
  try {
207
- const bytes = await (0, import_wasm_adapter.fileToUint8Array)(file);
1152
+ const bytes = await fileToUint8Array(file);
208
1153
  let type = mimeType ?? (file instanceof File ? file.type : "application/octet-stream");
209
1154
  type = wasm.normalizeMimeType(type);
210
1155
  return await extractBytes(bytes, type, config);
211
1156
  } catch (error) {
212
- throw (0, import_wasm_adapter.wrapWasmError)(error, `extracting from ${file instanceof File ? "file" : "blob"}`);
1157
+ throw wrapWasmError(error, `extracting from ${file instanceof File ? "file" : "blob"}`);
213
1158
  }
214
1159
  }
215
1160
  function extractBytesSync(data, mimeType, config) {
@@ -226,14 +1171,14 @@ function extractBytesSync(data, mimeType, config) {
226
1171
  if (!mimeType) {
227
1172
  throw new Error("MIME type is required");
228
1173
  }
229
- const normalizedConfig = (0, import_wasm_adapter.configToJS)(config ?? null);
1174
+ const normalizedConfig = configToJS(config ?? null);
230
1175
  const result = wasm.extractBytesSync(data, mimeType, normalizedConfig);
231
1176
  if (!result) {
232
1177
  throw new Error("Invalid extraction result: no result from WASM module");
233
1178
  }
234
- return (0, import_wasm_adapter.jsToExtractionResult)(result);
1179
+ return jsToExtractionResult(result);
235
1180
  } catch (error) {
236
- throw (0, import_wasm_adapter.wrapWasmError)(error, "extracting from bytes (sync)");
1181
+ throw wrapWasmError(error, "extracting from bytes (sync)");
237
1182
  }
238
1183
  }
239
1184
  async function batchExtractBytes(files, config) {
@@ -270,7 +1215,7 @@ async function batchExtractBytes(files, config) {
270
1215
  dataList.push(f.data);
271
1216
  mimeTypes.push(f.mimeType);
272
1217
  }
273
- const normalizedConfig = (0, import_wasm_adapter.configToJS)(config ?? null);
1218
+ const normalizedConfig = configToJS(config ?? null);
274
1219
  const results = await wasm.batchExtractBytes(dataList, mimeTypes, normalizedConfig);
275
1220
  if (!Array.isArray(results)) {
276
1221
  throw new Error("Invalid batch extraction result: expected array");
@@ -279,10 +1224,10 @@ async function batchExtractBytes(files, config) {
279
1224
  if (!result) {
280
1225
  throw new Error(`Invalid extraction result at index ${index}: no result from WASM module`);
281
1226
  }
282
- return (0, import_wasm_adapter.jsToExtractionResult)(result);
1227
+ return jsToExtractionResult(result);
283
1228
  });
284
1229
  } catch (error) {
285
- throw (0, import_wasm_adapter.wrapWasmError)(error, "batch extracting from bytes");
1230
+ throw wrapWasmError(error, "batch extracting from bytes");
286
1231
  }
287
1232
  }
288
1233
  function batchExtractBytesSync(files, config) {
@@ -319,7 +1264,7 @@ function batchExtractBytesSync(files, config) {
319
1264
  dataList.push(f.data);
320
1265
  mimeTypes.push(f.mimeType);
321
1266
  }
322
- const normalizedConfig = (0, import_wasm_adapter.configToJS)(config ?? null);
1267
+ const normalizedConfig = configToJS(config ?? null);
323
1268
  const results = wasm.batchExtractBytesSync(dataList, mimeTypes, normalizedConfig);
324
1269
  if (!Array.isArray(results)) {
325
1270
  throw new Error("Invalid batch extraction result: expected array");
@@ -328,10 +1273,10 @@ function batchExtractBytesSync(files, config) {
328
1273
  if (!result) {
329
1274
  throw new Error(`Invalid extraction result at index ${index}: no result from WASM module`);
330
1275
  }
331
- return (0, import_wasm_adapter.jsToExtractionResult)(result);
1276
+ return jsToExtractionResult(result);
332
1277
  });
333
1278
  } catch (error) {
334
- throw (0, import_wasm_adapter.wrapWasmError)(error, "batch extracting from bytes (sync)");
1279
+ throw wrapWasmError(error, "batch extracting from bytes (sync)");
335
1280
  }
336
1281
  }
337
1282
  async function batchExtractFiles(files, config) {
@@ -351,7 +1296,7 @@ async function batchExtractFiles(files, config) {
351
1296
  if (!(file instanceof File)) {
352
1297
  throw new Error(`Invalid file at index ${i}: must be a File object`);
353
1298
  }
354
- const bytes = await (0, import_wasm_adapter.fileToUint8Array)(file);
1299
+ const bytes = await fileToUint8Array(file);
355
1300
  byteFiles.push({
356
1301
  data: bytes,
357
1302
  mimeType: file.type || "application/octet-stream"
@@ -359,25 +1304,78 @@ async function batchExtractFiles(files, config) {
359
1304
  }
360
1305
  return await batchExtractBytes(byteFiles, config);
361
1306
  } catch (error) {
362
- throw (0, import_wasm_adapter.wrapWasmError)(error, "batch extracting from files");
1307
+ throw wrapWasmError(error, "batch extracting from files");
363
1308
  }
364
1309
  }
365
1310
  async function enableOcr() {
366
1311
  if (!initialized) {
367
1312
  throw new Error("WASM module not initialized. Call initWasm() first.");
368
1313
  }
369
- if (!(0, import_runtime.isBrowser)()) {
1314
+ if (!isBrowser()) {
370
1315
  throw new Error(
371
1316
  "OCR is only available in browser environments. TesseractWasmBackend requires Web Workers and createImageBitmap."
372
1317
  );
373
1318
  }
374
1319
  try {
375
- const backend = new import_tesseract_wasm_backend.TesseractWasmBackend();
1320
+ const backend = new TesseractWasmBackend();
376
1321
  await backend.initialize();
377
- (0, import_registry.registerOcrBackend)(backend);
1322
+ registerOcrBackend(backend);
378
1323
  } catch (error) {
379
1324
  const message = error instanceof Error ? error.message : String(error);
380
1325
  throw new Error(`Failed to enable OCR: ${message}`);
381
1326
  }
382
1327
  }
1328
+ export {
1329
+ TesseractWasmBackend,
1330
+ batchExtractBytes,
1331
+ batchExtractBytesSync,
1332
+ batchExtractFiles,
1333
+ clearOcrBackends,
1334
+ clearPostProcessors,
1335
+ clearValidators,
1336
+ configToJS,
1337
+ detectRuntime,
1338
+ enableOcr,
1339
+ extractBytes,
1340
+ extractBytesSync,
1341
+ extractFile,
1342
+ extractFromFile,
1343
+ fileToUint8Array,
1344
+ getInitializationError,
1345
+ getOcrBackend,
1346
+ getPostProcessor,
1347
+ getRuntimeInfo,
1348
+ getRuntimeVersion,
1349
+ getValidator,
1350
+ getVersion,
1351
+ getWasmCapabilities,
1352
+ hasBigInt,
1353
+ hasBlob,
1354
+ hasFileApi,
1355
+ hasModuleWorkers,
1356
+ hasSharedArrayBuffer,
1357
+ hasWasm,
1358
+ hasWasmStreaming,
1359
+ hasWorkers,
1360
+ initWasm,
1361
+ isBrowser,
1362
+ isBun,
1363
+ isDeno,
1364
+ isInitialized,
1365
+ isNode,
1366
+ isServerEnvironment,
1367
+ isValidExtractionResult,
1368
+ isWebEnvironment,
1369
+ jsToExtractionResult,
1370
+ listOcrBackends,
1371
+ listPostProcessors,
1372
+ listValidators,
1373
+ registerOcrBackend,
1374
+ registerPostProcessor,
1375
+ registerValidator,
1376
+ unregisterOcrBackend,
1377
+ unregisterPostProcessor,
1378
+ unregisterValidator,
1379
+ wrapWasmError
1380
+ };
383
1381
  //# sourceMappingURL=index.js.map