@kreuzberg/wasm 4.0.7 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/README.md +6 -3
  2. package/dist/extraction/batch.d.ts +80 -0
  3. package/dist/extraction/batch.d.ts.map +1 -0
  4. package/dist/extraction/bytes.d.ts +69 -0
  5. package/dist/extraction/bytes.d.ts.map +1 -0
  6. package/dist/extraction/files.d.ts +77 -0
  7. package/dist/extraction/files.d.ts.map +1 -0
  8. package/dist/extraction/index.d.ts +11 -0
  9. package/dist/extraction/index.d.ts.map +1 -0
  10. package/dist/extraction/internal.d.ts +21 -0
  11. package/dist/extraction/internal.d.ts.map +1 -0
  12. package/dist/index.d.ts +9 -323
  13. package/dist/index.d.ts.map +1 -1
  14. package/dist/index.js +677 -591
  15. package/dist/index.js.map +1 -1
  16. package/dist/initialization/pdfium-loader.d.ts +30 -0
  17. package/dist/initialization/pdfium-loader.d.ts.map +1 -0
  18. package/dist/initialization/state.d.ts +100 -0
  19. package/dist/initialization/state.d.ts.map +1 -0
  20. package/dist/initialization/wasm-loader.d.ts +81 -0
  21. package/dist/initialization/wasm-loader.d.ts.map +1 -0
  22. package/dist/ocr/enabler.d.ts +86 -0
  23. package/dist/ocr/enabler.d.ts.map +1 -0
  24. package/dist/pkg/README.md +6 -3
  25. package/dist/pkg/kreuzberg_wasm.d.ts +76 -0
  26. package/dist/pkg/kreuzberg_wasm.js +142 -82
  27. package/dist/pkg/kreuzberg_wasm_bg.js +7 -7
  28. package/dist/pkg/kreuzberg_wasm_bg.wasm +0 -0
  29. package/dist/pkg/kreuzberg_wasm_bg.wasm.d.ts +3 -3
  30. package/dist/pkg/package.json +5 -1
  31. package/dist/runtime.d.ts +22 -2
  32. package/dist/runtime.d.ts.map +1 -1
  33. package/dist/runtime.js +21 -1
  34. package/dist/runtime.js.map +1 -1
  35. package/dist/types.d.ts +75 -0
  36. package/dist/types.d.ts.map +1 -1
  37. package/package.json +6 -6
package/dist/index.js CHANGED
@@ -24,6 +24,176 @@ var init_pdfium = __esm({
24
24
  }
25
25
  });
26
26
 
27
+ // typescript/runtime.ts
28
+ function detectRuntime() {
29
+ const globalCaches = globalThis.caches;
30
+ if (typeof caches !== "undefined" && globalCaches !== null && typeof globalCaches === "object" && "default" in globalCaches && typeof window === "undefined" && typeof document === "undefined") {
31
+ return "cloudflare-workers";
32
+ }
33
+ if (typeof globalThis.EdgeRuntime !== "undefined") {
34
+ return "edge-runtime";
35
+ }
36
+ if (typeof globalThis.Deno !== "undefined") {
37
+ return "deno";
38
+ }
39
+ if (typeof globalThis.Bun !== "undefined") {
40
+ return "bun";
41
+ }
42
+ if (typeof process !== "undefined" && process.versions && process.versions.node) {
43
+ return "node";
44
+ }
45
+ if (typeof window !== "undefined" && typeof document !== "undefined") {
46
+ return "browser";
47
+ }
48
+ return "unknown";
49
+ }
50
+ function isBrowser() {
51
+ return detectRuntime() === "browser";
52
+ }
53
+ function isNode() {
54
+ return detectRuntime() === "node";
55
+ }
56
+ function isDeno() {
57
+ return detectRuntime() === "deno";
58
+ }
59
+ function isBun() {
60
+ return detectRuntime() === "bun";
61
+ }
62
+ function isCloudflareWorkers() {
63
+ return detectRuntime() === "cloudflare-workers";
64
+ }
65
+ function isEdgeRuntime() {
66
+ return detectRuntime() === "edge-runtime";
67
+ }
68
+ function isEdgeEnvironment() {
69
+ const runtime = detectRuntime();
70
+ return runtime === "cloudflare-workers" || runtime === "edge-runtime";
71
+ }
72
+ function isWebEnvironment() {
73
+ const runtime = detectRuntime();
74
+ return runtime === "browser";
75
+ }
76
+ function isServerEnvironment() {
77
+ const runtime = detectRuntime();
78
+ return runtime === "node" || runtime === "deno" || runtime === "bun" || runtime === "cloudflare-workers" || runtime === "edge-runtime";
79
+ }
80
+ function hasFileApi() {
81
+ return typeof window !== "undefined" && typeof File !== "undefined" && typeof Blob !== "undefined";
82
+ }
83
+ function hasBlob() {
84
+ return typeof Blob !== "undefined";
85
+ }
86
+ function hasWorkers() {
87
+ return typeof Worker !== "undefined";
88
+ }
89
+ function hasSharedArrayBuffer() {
90
+ return typeof SharedArrayBuffer !== "undefined";
91
+ }
92
+ function hasModuleWorkers() {
93
+ if (!hasWorkers()) {
94
+ return false;
95
+ }
96
+ try {
97
+ const blob = new Blob(['console.log("test")'], {
98
+ type: "application/javascript"
99
+ });
100
+ const workerUrl = URL.createObjectURL(blob);
101
+ try {
102
+ return true;
103
+ } finally {
104
+ URL.revokeObjectURL(workerUrl);
105
+ }
106
+ } catch {
107
+ return false;
108
+ }
109
+ }
110
+ function hasWasm() {
111
+ return typeof WebAssembly !== "undefined" && WebAssembly.instantiate !== void 0;
112
+ }
113
+ function hasWasmStreaming() {
114
+ return typeof WebAssembly !== "undefined" && WebAssembly.instantiateStreaming !== void 0;
115
+ }
116
+ function hasBigInt() {
117
+ try {
118
+ const test = BigInt("1");
119
+ return typeof test === "bigint";
120
+ } catch {
121
+ return false;
122
+ }
123
+ }
124
+ function getRuntimeVersion() {
125
+ const runtime = detectRuntime();
126
+ switch (runtime) {
127
+ case "node":
128
+ return process.version?.substring(1);
129
+ case "deno": {
130
+ const deno = globalThis.Deno;
131
+ const version = deno?.version;
132
+ return version?.deno;
133
+ }
134
+ case "bun": {
135
+ const bun = globalThis.Bun;
136
+ return bun?.version;
137
+ }
138
+ default:
139
+ return void 0;
140
+ }
141
+ }
142
+ function getWasmCapabilities() {
143
+ const runtime = detectRuntime();
144
+ const version = getRuntimeVersion();
145
+ const capabilities = {
146
+ runtime,
147
+ hasWasm: hasWasm(),
148
+ hasWasmStreaming: hasWasmStreaming(),
149
+ hasFileApi: hasFileApi(),
150
+ hasBlob: hasBlob(),
151
+ hasWorkers: hasWorkers(),
152
+ hasSharedArrayBuffer: hasSharedArrayBuffer(),
153
+ hasModuleWorkers: hasModuleWorkers(),
154
+ hasBigInt: hasBigInt(),
155
+ ...version !== void 0 ? { runtimeVersion: version } : {}
156
+ };
157
+ return capabilities;
158
+ }
159
+ function getRuntimeInfo() {
160
+ const runtime = detectRuntime();
161
+ const capabilities = getWasmCapabilities();
162
+ return {
163
+ runtime,
164
+ isBrowser: isBrowser(),
165
+ isNode: isNode(),
166
+ isDeno: isDeno(),
167
+ isBun: isBun(),
168
+ isWeb: isWebEnvironment(),
169
+ isServer: isServerEnvironment(),
170
+ runtimeVersion: getRuntimeVersion(),
171
+ userAgent: typeof navigator !== "undefined" ? navigator.userAgent : "N/A",
172
+ capabilities
173
+ };
174
+ }
175
+
176
+ // typescript/initialization/pdfium-loader.ts
177
+ async function initializePdfiumAsync(wasmModule) {
178
+ if (!wasmModule || typeof wasmModule.initialize_pdfium_render !== "function") {
179
+ return;
180
+ }
181
+ if (!isBrowser()) {
182
+ console.debug("PDFium initialization skipped (non-browser environment)");
183
+ return;
184
+ }
185
+ try {
186
+ const pdfiumModule = await Promise.resolve().then(() => (init_pdfium(), pdfium_exports));
187
+ const pdfium = typeof pdfiumModule.default === "function" ? await pdfiumModule.default() : pdfiumModule;
188
+ const success = wasmModule.initialize_pdfium_render(pdfium, wasmModule, false);
189
+ if (!success) {
190
+ console.warn("PDFium initialization returned false");
191
+ }
192
+ } catch (error) {
193
+ console.debug("PDFium initialization error:", error);
194
+ }
195
+ }
196
+
27
197
  // typescript/adapters/wasm-adapter.ts
28
198
  var MAX_FILE_SIZE = 512 * 1024 * 1024;
29
199
  function isNumberOrNull(value) {
@@ -257,59 +427,417 @@ function isValidExtractionResult(value) {
257
427
  return typeof obj.content === "string" && (typeof obj.mimeType === "string" || typeof obj.mime_type === "string") && obj.metadata !== null && typeof obj.metadata === "object" && Array.isArray(obj.tables);
258
428
  }
259
429
 
260
- // typescript/ocr/registry.ts
261
- var ocrBackendRegistry = /* @__PURE__ */ new Map();
262
- function registerOcrBackend(backend) {
263
- if (!backend) {
264
- throw new Error("Backend cannot be null or undefined");
265
- }
266
- if (typeof backend.name !== "function") {
267
- throw new Error("Backend must implement name() method");
268
- }
269
- if (typeof backend.supportedLanguages !== "function") {
270
- throw new Error("Backend must implement supportedLanguages() method");
430
+ // typescript/initialization/state.ts
431
+ var wasm = null;
432
+ var initialized = false;
433
+ var initializationError = null;
434
+ var initializationPromise = null;
435
+ function getWasmModule() {
436
+ return wasm;
437
+ }
438
+ function setWasmModule(module) {
439
+ wasm = module;
440
+ }
441
+ function isInitialized() {
442
+ return initialized;
443
+ }
444
+ function setInitialized(value) {
445
+ initialized = value;
446
+ }
447
+ function getInitializationError() {
448
+ return initializationError;
449
+ }
450
+ function setInitializationError(error) {
451
+ initializationError = error;
452
+ }
453
+ function getInitializationPromise() {
454
+ return initializationPromise;
455
+ }
456
+ function setInitializationPromise(promise) {
457
+ initializationPromise = promise;
458
+ }
459
+
460
+ // typescript/initialization/wasm-loader.ts
461
+ async function loadWasmBinaryForNode() {
462
+ if (!isNode()) {
463
+ return void 0;
271
464
  }
272
- if (typeof backend.processImage !== "function") {
273
- throw new Error("Backend must implement processImage() method");
465
+ try {
466
+ const fs = await import(
467
+ /* @vite-ignore */
468
+ "fs/promises"
469
+ );
470
+ const path = await import(
471
+ /* @vite-ignore */
472
+ "path"
473
+ );
474
+ const url = await import(
475
+ /* @vite-ignore */
476
+ "url"
477
+ );
478
+ const __dirname = path.dirname(url.fileURLToPath(import.meta.url));
479
+ const wasmPath = path.join(__dirname, "..", "pkg", "kreuzberg_wasm_bg.wasm");
480
+ const wasmBuffer = await fs.readFile(wasmPath);
481
+ return new Uint8Array(wasmBuffer);
482
+ } catch {
483
+ return void 0;
274
484
  }
275
- const backendName = backend.name();
276
- if (!backendName || typeof backendName !== "string") {
277
- throw new Error("Backend name must be a non-empty string");
485
+ }
486
+ function getVersion() {
487
+ if (!isInitialized()) {
488
+ throw new Error("WASM module not initialized. Call initWasm() first.");
278
489
  }
279
- if (ocrBackendRegistry.has(backendName)) {
280
- console.warn(`OCR backend "${backendName}" is already registered and will be replaced`);
490
+ const wasmModule = getWasmModule();
491
+ if (!wasmModule) {
492
+ throw new Error("WASM module not loaded. Call initWasm() first.");
281
493
  }
282
- ocrBackendRegistry.set(backendName, backend);
283
- }
284
- function getOcrBackend(name) {
285
- return ocrBackendRegistry.get(name);
286
- }
287
- function listOcrBackends() {
288
- return Array.from(ocrBackendRegistry.keys());
494
+ return wasmModule.version();
289
495
  }
290
- async function unregisterOcrBackend(name) {
291
- const backend = ocrBackendRegistry.get(name);
292
- if (!backend) {
293
- throw new Error(
294
- `OCR backend "${name}" is not registered. Available backends: ${Array.from(ocrBackendRegistry.keys()).join(", ")}`
295
- );
496
+ async function initWasm() {
497
+ if (isInitialized()) {
498
+ return;
296
499
  }
297
- if (typeof backend.shutdown === "function") {
500
+ let currentPromise = getInitializationPromise();
501
+ if (currentPromise) {
502
+ return currentPromise;
503
+ }
504
+ currentPromise = (async () => {
298
505
  try {
299
- await backend.shutdown();
506
+ if (!hasWasm()) {
507
+ throw new Error("WebAssembly is not supported in this environment");
508
+ }
509
+ let wasmModule;
510
+ const pkgPath = "./pkg/kreuzberg_wasm.js";
511
+ const fallbackPath = "./kreuzberg_wasm.js";
512
+ try {
513
+ wasmModule = await import(
514
+ /* @vite-ignore */
515
+ pkgPath
516
+ );
517
+ } catch {
518
+ wasmModule = await import(
519
+ /* @vite-ignore */
520
+ fallbackPath
521
+ );
522
+ }
523
+ const loadedModule = wasmModule;
524
+ setWasmModule(loadedModule);
525
+ if (loadedModule && typeof loadedModule.default === "function") {
526
+ const wasmBinary = await loadWasmBinaryForNode();
527
+ if (wasmBinary) {
528
+ await loadedModule.default(wasmBinary);
529
+ } else {
530
+ await loadedModule.default();
531
+ }
532
+ }
533
+ if (isBrowser() && loadedModule && typeof loadedModule.initialize_pdfium_render === "function") {
534
+ initializePdfiumAsync(loadedModule).catch((error) => {
535
+ console.warn("PDFium auto-initialization failed (PDF extraction disabled):", error);
536
+ });
537
+ }
538
+ setInitialized(true);
539
+ setInitializationError(null);
300
540
  } catch (error) {
301
- console.warn(
302
- `Error shutting down OCR backend "${name}": ${error instanceof Error ? error.message : String(error)}`
303
- );
541
+ setInitializationError(error instanceof Error ? error : new Error(String(error)));
542
+ throw wrapWasmError(error, "initializing Kreuzberg WASM module");
304
543
  }
544
+ })();
545
+ setInitializationPromise(currentPromise);
546
+ return currentPromise;
547
+ }
548
+
549
+ // typescript/extraction/internal.ts
550
+ function getWasmModule2() {
551
+ const wasm2 = getWasmModule();
552
+ if (!wasm2) {
553
+ throw new Error("WASM module not loaded. Call initWasm() first.");
305
554
  }
306
- ocrBackendRegistry.delete(name);
555
+ return wasm2;
307
556
  }
308
- async function clearOcrBackends() {
309
- const backends = Array.from(ocrBackendRegistry.entries());
310
- for (const [name, backend] of backends) {
311
- if (typeof backend.shutdown === "function") {
312
- try {
557
+ function isInitialized2() {
558
+ return isInitialized();
559
+ }
560
+
561
+ // typescript/extraction/bytes.ts
562
+ async function extractBytes(data, mimeType, config) {
563
+ if (!isInitialized2()) {
564
+ throw new Error("WASM module not initialized. Call initWasm() first.");
565
+ }
566
+ const wasm2 = getWasmModule2();
567
+ try {
568
+ if (!data || data.length === 0) {
569
+ throw new Error("Document data cannot be empty");
570
+ }
571
+ if (!mimeType) {
572
+ throw new Error("MIME type is required");
573
+ }
574
+ const normalizedConfig = configToJS(config ?? null);
575
+ const result = await wasm2.extractBytes(data, mimeType, normalizedConfig);
576
+ if (!result) {
577
+ throw new Error("Invalid extraction result: no result from WASM module");
578
+ }
579
+ return jsToExtractionResult(result);
580
+ } catch (error) {
581
+ throw wrapWasmError(error, "extracting from bytes");
582
+ }
583
+ }
584
+ function extractBytesSync(data, mimeType, config) {
585
+ if (!isInitialized2()) {
586
+ throw new Error("WASM module not initialized. Call initWasm() first.");
587
+ }
588
+ const wasm2 = getWasmModule2();
589
+ try {
590
+ if (!data || data.length === 0) {
591
+ throw new Error("Document data cannot be empty");
592
+ }
593
+ if (!mimeType) {
594
+ throw new Error("MIME type is required");
595
+ }
596
+ const normalizedConfig = configToJS(config ?? null);
597
+ const result = wasm2.extractBytesSync(data, mimeType, normalizedConfig);
598
+ if (!result) {
599
+ throw new Error("Invalid extraction result: no result from WASM module");
600
+ }
601
+ return jsToExtractionResult(result);
602
+ } catch (error) {
603
+ throw wrapWasmError(error, "extracting from bytes (sync)");
604
+ }
605
+ }
606
+
607
+ // typescript/extraction/files.ts
608
+ async function extractFile(path, mimeType, config) {
609
+ if (!isInitialized2()) {
610
+ throw new Error("WASM module not initialized. Call initWasm() first.");
611
+ }
612
+ const wasm2 = getWasmModule2();
613
+ try {
614
+ if (!path) {
615
+ throw new Error("File path is required");
616
+ }
617
+ const runtime = detectRuntime();
618
+ if (runtime === "browser") {
619
+ throw new Error("Use extractBytes with fileToUint8Array for browser environments");
620
+ }
621
+ let fileData;
622
+ if (runtime === "node") {
623
+ const { readFile } = await import("fs/promises");
624
+ const buffer = await readFile(path);
625
+ fileData = new Uint8Array(buffer);
626
+ } else if (runtime === "deno") {
627
+ const deno = globalThis.Deno;
628
+ fileData = await deno.readFile(path);
629
+ } else if (runtime === "bun") {
630
+ const { readFile } = await import("fs/promises");
631
+ const buffer = await readFile(path);
632
+ fileData = new Uint8Array(buffer);
633
+ } else {
634
+ throw new Error(`Unsupported runtime for file extraction: ${runtime}`);
635
+ }
636
+ let detectedMimeType = mimeType;
637
+ if (!detectedMimeType) {
638
+ detectedMimeType = wasm2.detectMimeFromBytes(fileData);
639
+ }
640
+ if (!detectedMimeType) {
641
+ throw new Error("Could not detect MIME type for file. Please provide mimeType parameter.");
642
+ }
643
+ detectedMimeType = wasm2.normalizeMimeType(detectedMimeType);
644
+ return await extractBytes(fileData, detectedMimeType, config);
645
+ } catch (error) {
646
+ throw wrapWasmError(error, `extracting from file: ${path}`);
647
+ }
648
+ }
649
+ async function extractFromFile(file, mimeType, config) {
650
+ if (!isInitialized2()) {
651
+ throw new Error("WASM module not initialized. Call initWasm() first.");
652
+ }
653
+ const wasm2 = getWasmModule2();
654
+ try {
655
+ const bytes = await fileToUint8Array(file);
656
+ let type = mimeType ?? (file instanceof File ? file.type : "application/octet-stream");
657
+ type = wasm2.normalizeMimeType(type);
658
+ return await extractBytes(bytes, type, config);
659
+ } catch (error) {
660
+ throw wrapWasmError(error, `extracting from ${file instanceof File ? "file" : "blob"}`);
661
+ }
662
+ }
663
+
664
+ // typescript/extraction/batch.ts
665
+ async function batchExtractBytes(files, config) {
666
+ if (!isInitialized2()) {
667
+ throw new Error("WASM module not initialized. Call initWasm() first.");
668
+ }
669
+ const wasm2 = getWasmModule2();
670
+ try {
671
+ if (!Array.isArray(files)) {
672
+ throw new Error("Files parameter must be an array");
673
+ }
674
+ if (files.length === 0) {
675
+ throw new Error("Files array cannot be empty");
676
+ }
677
+ const dataList = [];
678
+ const mimeTypes = [];
679
+ for (let i = 0; i < files.length; i += 1) {
680
+ const file = files[i];
681
+ if (!file || typeof file !== "object") {
682
+ throw new Error(`Invalid file at index ${i}: must be an object with data and mimeType`);
683
+ }
684
+ const f = file;
685
+ if (!(f.data instanceof Uint8Array)) {
686
+ throw new Error(`Invalid file at index ${i}: data must be Uint8Array`);
687
+ }
688
+ if (typeof f.mimeType !== "string") {
689
+ throw new Error(`Invalid file at index ${i}: mimeType must be a string`);
690
+ }
691
+ if (f.data.length === 0) {
692
+ throw new Error(`Invalid file at index ${i}: data cannot be empty`);
693
+ }
694
+ dataList.push(f.data);
695
+ mimeTypes.push(f.mimeType);
696
+ }
697
+ const normalizedConfig = configToJS(config ?? null);
698
+ const results = await wasm2.batchExtractBytes(dataList, mimeTypes, normalizedConfig);
699
+ if (!Array.isArray(results)) {
700
+ throw new Error("Invalid batch extraction result: expected array");
701
+ }
702
+ return results.map((result, index) => {
703
+ if (!result) {
704
+ throw new Error(`Invalid extraction result at index ${index}: no result from WASM module`);
705
+ }
706
+ return jsToExtractionResult(result);
707
+ });
708
+ } catch (error) {
709
+ throw wrapWasmError(error, "batch extracting from bytes");
710
+ }
711
+ }
712
+ function batchExtractBytesSync(files, config) {
713
+ if (!isInitialized2()) {
714
+ throw new Error("WASM module not initialized. Call initWasm() first.");
715
+ }
716
+ const wasm2 = getWasmModule2();
717
+ try {
718
+ if (!Array.isArray(files)) {
719
+ throw new Error("Files parameter must be an array");
720
+ }
721
+ if (files.length === 0) {
722
+ throw new Error("Files array cannot be empty");
723
+ }
724
+ const dataList = [];
725
+ const mimeTypes = [];
726
+ for (let i = 0; i < files.length; i += 1) {
727
+ const file = files[i];
728
+ if (!file || typeof file !== "object") {
729
+ throw new Error(`Invalid file at index ${i}: must be an object with data and mimeType`);
730
+ }
731
+ const f = file;
732
+ if (!(f.data instanceof Uint8Array)) {
733
+ throw new Error(`Invalid file at index ${i}: data must be Uint8Array`);
734
+ }
735
+ if (typeof f.mimeType !== "string") {
736
+ throw new Error(`Invalid file at index ${i}: mimeType must be a string`);
737
+ }
738
+ if (f.data.length === 0) {
739
+ throw new Error(`Invalid file at index ${i}: data cannot be empty`);
740
+ }
741
+ dataList.push(f.data);
742
+ mimeTypes.push(f.mimeType);
743
+ }
744
+ const normalizedConfig = configToJS(config ?? null);
745
+ const results = wasm2.batchExtractBytesSync(dataList, mimeTypes, normalizedConfig);
746
+ if (!Array.isArray(results)) {
747
+ throw new Error("Invalid batch extraction result: expected array");
748
+ }
749
+ return results.map((result, index) => {
750
+ if (!result) {
751
+ throw new Error(`Invalid extraction result at index ${index}: no result from WASM module`);
752
+ }
753
+ return jsToExtractionResult(result);
754
+ });
755
+ } catch (error) {
756
+ throw wrapWasmError(error, "batch extracting from bytes (sync)");
757
+ }
758
+ }
759
+ async function batchExtractFiles(files, config) {
760
+ if (!isInitialized2()) {
761
+ throw new Error("WASM module not initialized. Call initWasm() first.");
762
+ }
763
+ try {
764
+ if (!Array.isArray(files)) {
765
+ throw new Error("Files parameter must be an array");
766
+ }
767
+ if (files.length === 0) {
768
+ throw new Error("Files array cannot be empty");
769
+ }
770
+ const byteFiles = [];
771
+ for (let i = 0; i < files.length; i += 1) {
772
+ const file = files[i];
773
+ if (!(file instanceof File)) {
774
+ throw new Error(`Invalid file at index ${i}: must be a File object`);
775
+ }
776
+ const bytes = await fileToUint8Array(file);
777
+ byteFiles.push({
778
+ data: bytes,
779
+ mimeType: file.type || "application/octet-stream"
780
+ });
781
+ }
782
+ return await batchExtractBytes(byteFiles, config);
783
+ } catch (error) {
784
+ throw wrapWasmError(error, "batch extracting from files");
785
+ }
786
+ }
787
+
788
+ // typescript/ocr/registry.ts
789
+ var ocrBackendRegistry = /* @__PURE__ */ new Map();
790
+ function registerOcrBackend(backend) {
791
+ if (!backend) {
792
+ throw new Error("Backend cannot be null or undefined");
793
+ }
794
+ if (typeof backend.name !== "function") {
795
+ throw new Error("Backend must implement name() method");
796
+ }
797
+ if (typeof backend.supportedLanguages !== "function") {
798
+ throw new Error("Backend must implement supportedLanguages() method");
799
+ }
800
+ if (typeof backend.processImage !== "function") {
801
+ throw new Error("Backend must implement processImage() method");
802
+ }
803
+ const backendName = backend.name();
804
+ if (!backendName || typeof backendName !== "string") {
805
+ throw new Error("Backend name must be a non-empty string");
806
+ }
807
+ if (ocrBackendRegistry.has(backendName)) {
808
+ console.warn(`OCR backend "${backendName}" is already registered and will be replaced`);
809
+ }
810
+ ocrBackendRegistry.set(backendName, backend);
811
+ }
812
+ function getOcrBackend(name) {
813
+ return ocrBackendRegistry.get(name);
814
+ }
815
+ function listOcrBackends() {
816
+ return Array.from(ocrBackendRegistry.keys());
817
+ }
818
+ async function unregisterOcrBackend(name) {
819
+ const backend = ocrBackendRegistry.get(name);
820
+ if (!backend) {
821
+ throw new Error(
822
+ `OCR backend "${name}" is not registered. Available backends: ${Array.from(ocrBackendRegistry.keys()).join(", ")}`
823
+ );
824
+ }
825
+ if (typeof backend.shutdown === "function") {
826
+ try {
827
+ await backend.shutdown();
828
+ } catch (error) {
829
+ console.warn(
830
+ `Error shutting down OCR backend "${name}": ${error instanceof Error ? error.message : String(error)}`
831
+ );
832
+ }
833
+ }
834
+ ocrBackendRegistry.delete(name);
835
+ }
836
+ async function clearOcrBackends() {
837
+ const backends = Array.from(ocrBackendRegistry.entries());
838
+ for (const [name, backend] of backends) {
839
+ if (typeof backend.shutdown === "function") {
840
+ try {
313
841
  await backend.shutdown();
314
842
  } catch (error) {
315
843
  console.warn(
@@ -697,136 +1225,24 @@ var TesseractWasmBackend = class {
697
1225
  }
698
1226
  };
699
1227
 
700
- // typescript/runtime.ts
701
- function detectRuntime() {
702
- if (typeof globalThis.Deno !== "undefined") {
703
- return "deno";
704
- }
705
- if (typeof globalThis.Bun !== "undefined") {
706
- return "bun";
707
- }
708
- if (typeof process !== "undefined" && process.versions && process.versions.node) {
709
- return "node";
710
- }
711
- if (typeof window !== "undefined" && typeof document !== "undefined") {
712
- return "browser";
713
- }
714
- return "unknown";
715
- }
716
- function isBrowser() {
717
- return detectRuntime() === "browser";
718
- }
719
- function isNode() {
720
- return detectRuntime() === "node";
721
- }
722
- function isDeno() {
723
- return detectRuntime() === "deno";
724
- }
725
- function isBun() {
726
- return detectRuntime() === "bun";
727
- }
728
- function isWebEnvironment() {
729
- const runtime = detectRuntime();
730
- return runtime === "browser";
731
- }
732
- function isServerEnvironment() {
733
- const runtime = detectRuntime();
734
- return runtime === "node" || runtime === "deno" || runtime === "bun";
735
- }
736
- function hasFileApi() {
737
- return typeof window !== "undefined" && typeof File !== "undefined" && typeof Blob !== "undefined";
738
- }
739
- function hasBlob() {
740
- return typeof Blob !== "undefined";
741
- }
742
- function hasWorkers() {
743
- return typeof Worker !== "undefined";
744
- }
745
- function hasSharedArrayBuffer() {
746
- return typeof SharedArrayBuffer !== "undefined";
747
- }
748
- function hasModuleWorkers() {
749
- if (!hasWorkers()) {
750
- return false;
751
- }
752
- try {
753
- const blob = new Blob(['console.log("test")'], {
754
- type: "application/javascript"
755
- });
756
- const workerUrl = URL.createObjectURL(blob);
757
- try {
758
- return true;
759
- } finally {
760
- URL.revokeObjectURL(workerUrl);
761
- }
762
- } catch {
763
- return false;
764
- }
765
- }
766
- function hasWasm() {
767
- return typeof WebAssembly !== "undefined" && WebAssembly.instantiate !== void 0;
768
- }
769
- function hasWasmStreaming() {
770
- return typeof WebAssembly !== "undefined" && WebAssembly.instantiateStreaming !== void 0;
771
- }
772
- function hasBigInt() {
773
- try {
774
- const test = BigInt("1");
775
- return typeof test === "bigint";
776
- } catch {
777
- return false;
1228
+ // typescript/ocr/enabler.ts
1229
+ async function enableOcr() {
1230
+ if (!isInitialized2()) {
1231
+ throw new Error("WASM module not initialized. Call initWasm() first.");
778
1232
  }
779
- }
780
- function getRuntimeVersion() {
781
- const runtime = detectRuntime();
782
- switch (runtime) {
783
- case "node":
784
- return process.version?.substring(1);
785
- case "deno": {
786
- const deno = globalThis.Deno;
787
- const version = deno?.version;
788
- return version?.deno;
789
- }
790
- case "bun": {
791
- const bun = globalThis.Bun;
792
- return bun?.version;
793
- }
794
- default:
795
- return void 0;
1233
+ if (!isBrowser()) {
1234
+ throw new Error(
1235
+ "OCR is only available in browser environments. TesseractWasmBackend requires Web Workers and createImageBitmap."
1236
+ );
1237
+ }
1238
+ try {
1239
+ const backend = new TesseractWasmBackend();
1240
+ await backend.initialize();
1241
+ registerOcrBackend(backend);
1242
+ } catch (error) {
1243
+ const message = error instanceof Error ? error.message : String(error);
1244
+ throw new Error(`Failed to enable OCR: ${message}`);
796
1245
  }
797
- }
798
- function getWasmCapabilities() {
799
- const runtime = detectRuntime();
800
- const version = getRuntimeVersion();
801
- const capabilities = {
802
- runtime,
803
- hasWasm: hasWasm(),
804
- hasWasmStreaming: hasWasmStreaming(),
805
- hasFileApi: hasFileApi(),
806
- hasBlob: hasBlob(),
807
- hasWorkers: hasWorkers(),
808
- hasSharedArrayBuffer: hasSharedArrayBuffer(),
809
- hasModuleWorkers: hasModuleWorkers(),
810
- hasBigInt: hasBigInt(),
811
- ...version !== void 0 ? { runtimeVersion: version } : {}
812
- };
813
- return capabilities;
814
- }
815
- function getRuntimeInfo() {
816
- const runtime = detectRuntime();
817
- const capabilities = getWasmCapabilities();
818
- return {
819
- runtime,
820
- isBrowser: isBrowser(),
821
- isNode: isNode(),
822
- isDeno: isDeno(),
823
- isBun: isBun(),
824
- isWeb: isWebEnvironment(),
825
- isServer: isServerEnvironment(),
826
- runtimeVersion: getRuntimeVersion(),
827
- userAgent: typeof navigator !== "undefined" ? navigator.userAgent : "N/A",
828
- capabilities
829
- };
830
1246
  }
831
1247
 
832
1248
  // typescript/plugin-registry.ts
@@ -871,460 +1287,125 @@ async function unregisterPostProcessor(name) {
871
1287
  throw new Error(`Post-processor "${name}" is not registered.${availableStr}`);
872
1288
  }
873
1289
  try {
874
- if (processor.shutdown) {
875
- await processor.shutdown();
876
- }
877
- } catch (error) {
878
- console.warn(`Error during shutdown of post-processor "${name}":`, error);
879
- }
880
- postProcessors.delete(name);
881
- }
882
- async function clearPostProcessors() {
883
- const entries = Array.from(postProcessors.entries());
884
- for (const [_name, processor] of entries) {
885
- try {
886
- if (processor.shutdown) {
887
- await processor.shutdown();
888
- }
889
- } catch (error) {
890
- console.warn(`Error during shutdown of post-processor "${_name}":`, error);
891
- }
892
- }
893
- postProcessors.clear();
894
- }
895
- function validateValidator(validator) {
896
- if (validator === null || validator === void 0) {
897
- throw new Error("Validator cannot be null or undefined");
898
- }
899
- const obj = validator;
900
- if (typeof obj.name !== "function") {
901
- throw new Error("Validator must implement name() method");
902
- }
903
- if (typeof obj.validate !== "function") {
904
- throw new Error("Validator must implement validate() method");
905
- }
906
- const name = obj.name();
907
- if (typeof name !== "string" || name.trim() === "") {
908
- throw new Error("Validator name must be a non-empty string");
909
- }
910
- return true;
911
- }
912
- function registerValidator(validator) {
913
- validateValidator(validator);
914
- const name = validator.name();
915
- if (validators.has(name)) {
916
- console.warn(`Validator "${name}" already registered, overwriting with new implementation`);
917
- }
918
- validators.set(name, validator);
919
- }
920
- function getValidator(name) {
921
- return validators.get(name);
922
- }
923
- function listValidators() {
924
- return Array.from(validators.keys());
925
- }
926
- async function unregisterValidator(name) {
927
- const validator = validators.get(name);
928
- if (!validator) {
929
- const available = Array.from(validators.keys());
930
- const availableStr = available.length > 0 ? ` Available: ${available.join(", ")}` : "";
931
- throw new Error(`Validator "${name}" is not registered.${availableStr}`);
932
- }
933
- try {
934
- if (validator.shutdown) {
935
- await validator.shutdown();
936
- }
937
- } catch (error) {
938
- console.warn(`Error during shutdown of validator "${name}":`, error);
939
- }
940
- validators.delete(name);
941
- }
942
- async function clearValidators() {
943
- const entries = Array.from(validators.entries());
944
- for (const [_name, validator] of entries) {
945
- try {
946
- if (validator.shutdown) {
947
- await validator.shutdown();
948
- }
949
- } catch (error) {
950
- console.warn(`Error during shutdown of validator "${_name}":`, error);
951
- }
952
- }
953
- validators.clear();
954
- }
955
- function executePostProcessor(name, result) {
956
- const processor = postProcessors.get(name);
957
- if (!processor) {
958
- return Promise.reject(new Error(`Post-processor "${name}" is not registered`));
959
- }
960
- try {
961
- const output = processor.process(result);
962
- if (output instanceof Promise) {
963
- return output;
964
- }
965
- return Promise.resolve(output);
966
- } catch (error) {
967
- return Promise.reject(new Error(`Error executing post-processor "${name}": ${String(error)}`));
968
- }
969
- }
970
- function executeValidator(name, result) {
971
- const validator = validators.get(name);
972
- if (!validator) {
973
- return Promise.reject(new Error(`Validator "${name}" is not registered`));
974
- }
975
- try {
976
- const output = validator.validate(result);
977
- if (output instanceof Promise) {
978
- return output;
979
- }
980
- return Promise.resolve(output);
981
- } catch (error) {
982
- return Promise.reject(new Error(`Error executing validator "${name}": ${String(error)}`));
983
- }
984
- }
985
- function setupGlobalCallbacks() {
986
- if (typeof globalThis !== "undefined") {
987
- const callbacksObj = globalThis;
988
- callbacksObj.__kreuzberg_execute_post_processor = executePostProcessor;
989
- callbacksObj.__kreuzberg_execute_validator = executeValidator;
990
- }
991
- }
992
- setupGlobalCallbacks();
993
-
994
- // typescript/index.ts
995
- var wasm = null;
996
- var initialized = false;
997
- var initializationError = null;
998
- var initializationPromise = null;
999
- async function initializePdfiumAsync(wasmModule) {
1000
- if (!wasmModule || typeof wasmModule.initialize_pdfium_render !== "function") {
1001
- return;
1002
- }
1003
- if (!isBrowser()) {
1004
- console.debug("PDFium initialization skipped (non-browser environment)");
1005
- return;
1006
- }
1007
- try {
1008
- const pdfiumModule = await Promise.resolve().then(() => (init_pdfium(), pdfium_exports));
1009
- const pdfium = typeof pdfiumModule.default === "function" ? await pdfiumModule.default() : pdfiumModule;
1010
- const success = wasmModule.initialize_pdfium_render(pdfium, wasmModule, false);
1011
- if (!success) {
1012
- console.warn("PDFium initialization returned false");
1013
- }
1014
- } catch (error) {
1015
- console.debug("PDFium initialization error:", error);
1016
- }
1017
- }
1018
- async function initWasm() {
1019
- if (initialized) {
1020
- return;
1021
- }
1022
- if (initializationPromise) {
1023
- return initializationPromise;
1024
- }
1025
- initializationPromise = (async () => {
1026
- try {
1027
- if (!hasWasm()) {
1028
- throw new Error("WebAssembly is not supported in this environment");
1029
- }
1030
- let wasmModule;
1031
- const pkgPath = "./pkg/kreuzberg_wasm.js";
1032
- const fallbackPath = "./kreuzberg_wasm.js";
1033
- try {
1034
- wasmModule = await import(
1035
- /* @vite-ignore */
1036
- pkgPath
1037
- );
1038
- } catch {
1039
- wasmModule = await import(
1040
- /* @vite-ignore */
1041
- fallbackPath
1042
- );
1043
- }
1044
- wasm = wasmModule;
1045
- if (wasm && typeof wasm.default === "function") {
1046
- await wasm.default();
1047
- }
1048
- if (isBrowser() && wasm && typeof wasm.initialize_pdfium_render === "function") {
1049
- initializePdfiumAsync(wasm).catch((error) => {
1050
- console.warn("PDFium auto-initialization failed (PDF extraction disabled):", error);
1051
- });
1052
- }
1053
- initialized = true;
1054
- initializationError = null;
1055
- } catch (error) {
1056
- initializationError = error instanceof Error ? error : new Error(String(error));
1057
- throw wrapWasmError(error, "initializing Kreuzberg WASM module");
1058
- }
1059
- })();
1060
- return initializationPromise;
1061
- }
1062
- function isInitialized() {
1063
- return initialized;
1064
- }
1065
- function getVersion() {
1066
- if (!initialized) {
1067
- throw new Error("WASM module not initialized. Call initWasm() first.");
1068
- }
1069
- if (!wasm) {
1070
- throw new Error("WASM module not loaded. Call initWasm() first.");
1071
- }
1072
- return wasm.version();
1073
- }
1074
- function getInitializationError() {
1075
- return initializationError;
1076
- }
1077
- async function extractBytes(data, mimeType, config) {
1078
- if (!initialized) {
1079
- throw new Error("WASM module not initialized. Call initWasm() first.");
1080
- }
1081
- if (!wasm) {
1082
- throw new Error("WASM module not loaded. Call initWasm() first.");
1083
- }
1084
- try {
1085
- if (!data || data.length === 0) {
1086
- throw new Error("Document data cannot be empty");
1087
- }
1088
- if (!mimeType) {
1089
- throw new Error("MIME type is required");
1090
- }
1091
- const normalizedConfig = configToJS(config ?? null);
1092
- const result = await wasm.extractBytes(data, mimeType, normalizedConfig);
1093
- if (!result) {
1094
- throw new Error("Invalid extraction result: no result from WASM module");
1095
- }
1096
- return jsToExtractionResult(result);
1097
- } catch (error) {
1098
- throw wrapWasmError(error, "extracting from bytes");
1099
- }
1100
- }
1101
- async function extractFile(path, mimeType, config) {
1102
- if (!initialized) {
1103
- throw new Error("WASM module not initialized. Call initWasm() first.");
1104
- }
1105
- if (!wasm) {
1106
- throw new Error("WASM module not loaded. Call initWasm() first.");
1107
- }
1108
- try {
1109
- if (!path) {
1110
- throw new Error("File path is required");
1111
- }
1112
- const runtime = detectRuntime();
1113
- if (runtime === "browser") {
1114
- throw new Error("Use extractBytes with fileToUint8Array for browser environments");
1115
- }
1116
- let fileData;
1117
- if (runtime === "node") {
1118
- const { readFile } = await import("fs/promises");
1119
- const buffer = await readFile(path);
1120
- fileData = new Uint8Array(buffer);
1121
- } else if (runtime === "deno") {
1122
- const deno = globalThis.Deno;
1123
- fileData = await deno.readFile(path);
1124
- } else if (runtime === "bun") {
1125
- const { readFile } = await import("fs/promises");
1126
- const buffer = await readFile(path);
1127
- fileData = new Uint8Array(buffer);
1128
- } else {
1129
- throw new Error(`Unsupported runtime for file extraction: ${runtime}`);
1130
- }
1131
- let detectedMimeType = mimeType;
1132
- if (!detectedMimeType) {
1133
- detectedMimeType = wasm.detectMimeFromBytes(fileData);
1134
- }
1135
- if (!detectedMimeType) {
1136
- throw new Error("Could not detect MIME type for file. Please provide mimeType parameter.");
1290
+ if (processor.shutdown) {
1291
+ await processor.shutdown();
1137
1292
  }
1138
- detectedMimeType = wasm.normalizeMimeType(detectedMimeType);
1139
- return await extractBytes(fileData, detectedMimeType, config);
1140
1293
  } catch (error) {
1141
- throw wrapWasmError(error, `extracting from file: ${path}`);
1294
+ console.warn(`Error during shutdown of post-processor "${name}":`, error);
1142
1295
  }
1296
+ postProcessors.delete(name);
1143
1297
  }
1144
- async function extractFromFile(file, mimeType, config) {
1145
- if (!initialized) {
1146
- throw new Error("WASM module not initialized. Call initWasm() first.");
1298
+ async function clearPostProcessors() {
1299
+ const entries = Array.from(postProcessors.entries());
1300
+ for (const [_name, processor] of entries) {
1301
+ try {
1302
+ if (processor.shutdown) {
1303
+ await processor.shutdown();
1304
+ }
1305
+ } catch (error) {
1306
+ console.warn(`Error during shutdown of post-processor "${_name}":`, error);
1307
+ }
1147
1308
  }
1148
- if (!wasm) {
1149
- throw new Error("WASM module not loaded. Call initWasm() first.");
1309
+ postProcessors.clear();
1310
+ }
1311
+ function validateValidator(validator) {
1312
+ if (validator === null || validator === void 0) {
1313
+ throw new Error("Validator cannot be null or undefined");
1150
1314
  }
1151
- try {
1152
- const bytes = await fileToUint8Array(file);
1153
- let type = mimeType ?? (file instanceof File ? file.type : "application/octet-stream");
1154
- type = wasm.normalizeMimeType(type);
1155
- return await extractBytes(bytes, type, config);
1156
- } catch (error) {
1157
- throw wrapWasmError(error, `extracting from ${file instanceof File ? "file" : "blob"}`);
1315
+ const obj = validator;
1316
+ if (typeof obj.name !== "function") {
1317
+ throw new Error("Validator must implement name() method");
1318
+ }
1319
+ if (typeof obj.validate !== "function") {
1320
+ throw new Error("Validator must implement validate() method");
1321
+ }
1322
+ const name = obj.name();
1323
+ if (typeof name !== "string" || name.trim() === "") {
1324
+ throw new Error("Validator name must be a non-empty string");
1158
1325
  }
1326
+ return true;
1159
1327
  }
1160
- function extractBytesSync(data, mimeType, config) {
1161
- if (!initialized) {
1162
- throw new Error("WASM module not initialized. Call initWasm() first.");
1328
+ function registerValidator(validator) {
1329
+ validateValidator(validator);
1330
+ const name = validator.name();
1331
+ if (validators.has(name)) {
1332
+ console.warn(`Validator "${name}" already registered, overwriting with new implementation`);
1163
1333
  }
1164
- if (!wasm) {
1165
- throw new Error("WASM module not loaded. Call initWasm() first.");
1334
+ validators.set(name, validator);
1335
+ }
1336
+ function getValidator(name) {
1337
+ return validators.get(name);
1338
+ }
1339
+ function listValidators() {
1340
+ return Array.from(validators.keys());
1341
+ }
1342
+ async function unregisterValidator(name) {
1343
+ const validator = validators.get(name);
1344
+ if (!validator) {
1345
+ const available = Array.from(validators.keys());
1346
+ const availableStr = available.length > 0 ? ` Available: ${available.join(", ")}` : "";
1347
+ throw new Error(`Validator "${name}" is not registered.${availableStr}`);
1166
1348
  }
1167
1349
  try {
1168
- if (!data || data.length === 0) {
1169
- throw new Error("Document data cannot be empty");
1170
- }
1171
- if (!mimeType) {
1172
- throw new Error("MIME type is required");
1173
- }
1174
- const normalizedConfig = configToJS(config ?? null);
1175
- const result = wasm.extractBytesSync(data, mimeType, normalizedConfig);
1176
- if (!result) {
1177
- throw new Error("Invalid extraction result: no result from WASM module");
1350
+ if (validator.shutdown) {
1351
+ await validator.shutdown();
1178
1352
  }
1179
- return jsToExtractionResult(result);
1180
1353
  } catch (error) {
1181
- throw wrapWasmError(error, "extracting from bytes (sync)");
1354
+ console.warn(`Error during shutdown of validator "${name}":`, error);
1182
1355
  }
1356
+ validators.delete(name);
1183
1357
  }
1184
- async function batchExtractBytes(files, config) {
1185
- if (!initialized) {
1186
- throw new Error("WASM module not initialized. Call initWasm() first.");
1187
- }
1188
- if (!wasm) {
1189
- throw new Error("WASM module not loaded. Call initWasm() first.");
1190
- }
1191
- try {
1192
- if (!Array.isArray(files)) {
1193
- throw new Error("Files parameter must be an array");
1194
- }
1195
- if (files.length === 0) {
1196
- throw new Error("Files array cannot be empty");
1197
- }
1198
- const dataList = [];
1199
- const mimeTypes = [];
1200
- for (let i = 0; i < files.length; i += 1) {
1201
- const file = files[i];
1202
- if (!file || typeof file !== "object") {
1203
- throw new Error(`Invalid file at index ${i}: must be an object with data and mimeType`);
1204
- }
1205
- const f = file;
1206
- if (!(f.data instanceof Uint8Array)) {
1207
- throw new Error(`Invalid file at index ${i}: data must be Uint8Array`);
1208
- }
1209
- if (typeof f.mimeType !== "string") {
1210
- throw new Error(`Invalid file at index ${i}: mimeType must be a string`);
1211
- }
1212
- if (f.data.length === 0) {
1213
- throw new Error(`Invalid file at index ${i}: data cannot be empty`);
1358
+ async function clearValidators() {
1359
+ const entries = Array.from(validators.entries());
1360
+ for (const [_name, validator] of entries) {
1361
+ try {
1362
+ if (validator.shutdown) {
1363
+ await validator.shutdown();
1214
1364
  }
1215
- dataList.push(f.data);
1216
- mimeTypes.push(f.mimeType);
1217
- }
1218
- const normalizedConfig = configToJS(config ?? null);
1219
- const results = await wasm.batchExtractBytes(dataList, mimeTypes, normalizedConfig);
1220
- if (!Array.isArray(results)) {
1221
- throw new Error("Invalid batch extraction result: expected array");
1365
+ } catch (error) {
1366
+ console.warn(`Error during shutdown of validator "${_name}":`, error);
1222
1367
  }
1223
- return results.map((result, index) => {
1224
- if (!result) {
1225
- throw new Error(`Invalid extraction result at index ${index}: no result from WASM module`);
1226
- }
1227
- return jsToExtractionResult(result);
1228
- });
1229
- } catch (error) {
1230
- throw wrapWasmError(error, "batch extracting from bytes");
1231
1368
  }
1369
+ validators.clear();
1232
1370
  }
1233
- function batchExtractBytesSync(files, config) {
1234
- if (!initialized) {
1235
- throw new Error("WASM module not initialized. Call initWasm() first.");
1236
- }
1237
- if (!wasm) {
1238
- throw new Error("WASM module not loaded. Call initWasm() first.");
1371
+ function executePostProcessor(name, result) {
1372
+ const processor = postProcessors.get(name);
1373
+ if (!processor) {
1374
+ return Promise.reject(new Error(`Post-processor "${name}" is not registered`));
1239
1375
  }
1240
1376
  try {
1241
- if (!Array.isArray(files)) {
1242
- throw new Error("Files parameter must be an array");
1243
- }
1244
- if (files.length === 0) {
1245
- throw new Error("Files array cannot be empty");
1246
- }
1247
- const dataList = [];
1248
- const mimeTypes = [];
1249
- for (let i = 0; i < files.length; i += 1) {
1250
- const file = files[i];
1251
- if (!file || typeof file !== "object") {
1252
- throw new Error(`Invalid file at index ${i}: must be an object with data and mimeType`);
1253
- }
1254
- const f = file;
1255
- if (!(f.data instanceof Uint8Array)) {
1256
- throw new Error(`Invalid file at index ${i}: data must be Uint8Array`);
1257
- }
1258
- if (typeof f.mimeType !== "string") {
1259
- throw new Error(`Invalid file at index ${i}: mimeType must be a string`);
1260
- }
1261
- if (f.data.length === 0) {
1262
- throw new Error(`Invalid file at index ${i}: data cannot be empty`);
1263
- }
1264
- dataList.push(f.data);
1265
- mimeTypes.push(f.mimeType);
1266
- }
1267
- const normalizedConfig = configToJS(config ?? null);
1268
- const results = wasm.batchExtractBytesSync(dataList, mimeTypes, normalizedConfig);
1269
- if (!Array.isArray(results)) {
1270
- throw new Error("Invalid batch extraction result: expected array");
1377
+ const output = processor.process(result);
1378
+ if (output instanceof Promise) {
1379
+ return output;
1271
1380
  }
1272
- return results.map((result, index) => {
1273
- if (!result) {
1274
- throw new Error(`Invalid extraction result at index ${index}: no result from WASM module`);
1275
- }
1276
- return jsToExtractionResult(result);
1277
- });
1381
+ return Promise.resolve(output);
1278
1382
  } catch (error) {
1279
- throw wrapWasmError(error, "batch extracting from bytes (sync)");
1383
+ return Promise.reject(new Error(`Error executing post-processor "${name}": ${String(error)}`));
1280
1384
  }
1281
1385
  }
1282
- async function batchExtractFiles(files, config) {
1283
- if (!initialized) {
1284
- throw new Error("WASM module not initialized. Call initWasm() first.");
1386
+ function executeValidator(name, result) {
1387
+ const validator = validators.get(name);
1388
+ if (!validator) {
1389
+ return Promise.reject(new Error(`Validator "${name}" is not registered`));
1285
1390
  }
1286
1391
  try {
1287
- if (!Array.isArray(files)) {
1288
- throw new Error("Files parameter must be an array");
1289
- }
1290
- if (files.length === 0) {
1291
- throw new Error("Files array cannot be empty");
1292
- }
1293
- const byteFiles = [];
1294
- for (let i = 0; i < files.length; i += 1) {
1295
- const file = files[i];
1296
- if (!(file instanceof File)) {
1297
- throw new Error(`Invalid file at index ${i}: must be a File object`);
1298
- }
1299
- const bytes = await fileToUint8Array(file);
1300
- byteFiles.push({
1301
- data: bytes,
1302
- mimeType: file.type || "application/octet-stream"
1303
- });
1392
+ const output = validator.validate(result);
1393
+ if (output instanceof Promise) {
1394
+ return output;
1304
1395
  }
1305
- return await batchExtractBytes(byteFiles, config);
1396
+ return Promise.resolve(output);
1306
1397
  } catch (error) {
1307
- throw wrapWasmError(error, "batch extracting from files");
1398
+ return Promise.reject(new Error(`Error executing validator "${name}": ${String(error)}`));
1308
1399
  }
1309
1400
  }
1310
- async function enableOcr() {
1311
- if (!initialized) {
1312
- throw new Error("WASM module not initialized. Call initWasm() first.");
1313
- }
1314
- if (!isBrowser()) {
1315
- throw new Error(
1316
- "OCR is only available in browser environments. TesseractWasmBackend requires Web Workers and createImageBitmap."
1317
- );
1318
- }
1319
- try {
1320
- const backend = new TesseractWasmBackend();
1321
- await backend.initialize();
1322
- registerOcrBackend(backend);
1323
- } catch (error) {
1324
- const message = error instanceof Error ? error.message : String(error);
1325
- throw new Error(`Failed to enable OCR: ${message}`);
1401
+ function setupGlobalCallbacks() {
1402
+ if (typeof globalThis !== "undefined") {
1403
+ const callbacksObj = globalThis;
1404
+ callbacksObj.__kreuzberg_execute_post_processor = executePostProcessor;
1405
+ callbacksObj.__kreuzberg_execute_validator = executeValidator;
1326
1406
  }
1327
1407
  }
1408
+ setupGlobalCallbacks();
1328
1409
  export {
1329
1410
  TesseractWasmBackend,
1330
1411
  batchExtractBytes,
@@ -1349,6 +1430,7 @@ export {
1349
1430
  getValidator,
1350
1431
  getVersion,
1351
1432
  getWasmCapabilities,
1433
+ getWasmModule,
1352
1434
  hasBigInt,
1353
1435
  hasBlob,
1354
1436
  hasFileApi,
@@ -1358,9 +1440,13 @@ export {
1358
1440
  hasWasmStreaming,
1359
1441
  hasWorkers,
1360
1442
  initWasm,
1443
+ initializePdfiumAsync,
1361
1444
  isBrowser,
1362
1445
  isBun,
1446
+ isCloudflareWorkers,
1363
1447
  isDeno,
1448
+ isEdgeEnvironment,
1449
+ isEdgeRuntime,
1364
1450
  isInitialized,
1365
1451
  isNode,
1366
1452
  isServerEnvironment,