@kreuzberg/wasm 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/extraction/batch.d.ts +80 -0
- package/dist/extraction/batch.d.ts.map +1 -0
- package/dist/extraction/bytes.d.ts +69 -0
- package/dist/extraction/bytes.d.ts.map +1 -0
- package/dist/extraction/files.d.ts +77 -0
- package/dist/extraction/files.d.ts.map +1 -0
- package/dist/extraction/index.d.ts +11 -0
- package/dist/extraction/index.d.ts.map +1 -0
- package/dist/extraction/internal.d.ts +21 -0
- package/dist/extraction/internal.d.ts.map +1 -0
- package/dist/index.d.ts +9 -323
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +677 -591
- package/dist/index.js.map +1 -1
- package/dist/initialization/pdfium-loader.d.ts +30 -0
- package/dist/initialization/pdfium-loader.d.ts.map +1 -0
- package/dist/initialization/state.d.ts +100 -0
- package/dist/initialization/state.d.ts.map +1 -0
- package/dist/initialization/wasm-loader.d.ts +81 -0
- package/dist/initialization/wasm-loader.d.ts.map +1 -0
- package/dist/ocr/enabler.d.ts +86 -0
- package/dist/ocr/enabler.d.ts.map +1 -0
- package/dist/pkg/README.md +1 -1
- package/dist/pkg/kreuzberg_wasm.d.ts +76 -0
- package/dist/pkg/kreuzberg_wasm.js +142 -82
- package/dist/pkg/kreuzberg_wasm_bg.js +7 -7
- package/dist/pkg/kreuzberg_wasm_bg.wasm +0 -0
- package/dist/pkg/kreuzberg_wasm_bg.wasm.d.ts +3 -3
- package/dist/pkg/package.json +5 -1
- package/dist/runtime.d.ts +22 -2
- package/dist/runtime.d.ts.map +1 -1
- package/dist/runtime.js +21 -1
- package/dist/runtime.js.map +1 -1
- package/dist/types.d.ts +75 -0
- package/dist/types.d.ts.map +1 -1
- package/package.json +6 -6
package/dist/index.js
CHANGED
|
@@ -24,6 +24,176 @@ var init_pdfium = __esm({
|
|
|
24
24
|
}
|
|
25
25
|
});
|
|
26
26
|
|
|
27
|
+
// typescript/runtime.ts
|
|
28
|
+
function detectRuntime() {
|
|
29
|
+
const globalCaches = globalThis.caches;
|
|
30
|
+
if (typeof caches !== "undefined" && globalCaches !== null && typeof globalCaches === "object" && "default" in globalCaches && typeof window === "undefined" && typeof document === "undefined") {
|
|
31
|
+
return "cloudflare-workers";
|
|
32
|
+
}
|
|
33
|
+
if (typeof globalThis.EdgeRuntime !== "undefined") {
|
|
34
|
+
return "edge-runtime";
|
|
35
|
+
}
|
|
36
|
+
if (typeof globalThis.Deno !== "undefined") {
|
|
37
|
+
return "deno";
|
|
38
|
+
}
|
|
39
|
+
if (typeof globalThis.Bun !== "undefined") {
|
|
40
|
+
return "bun";
|
|
41
|
+
}
|
|
42
|
+
if (typeof process !== "undefined" && process.versions && process.versions.node) {
|
|
43
|
+
return "node";
|
|
44
|
+
}
|
|
45
|
+
if (typeof window !== "undefined" && typeof document !== "undefined") {
|
|
46
|
+
return "browser";
|
|
47
|
+
}
|
|
48
|
+
return "unknown";
|
|
49
|
+
}
|
|
50
|
+
function isBrowser() {
|
|
51
|
+
return detectRuntime() === "browser";
|
|
52
|
+
}
|
|
53
|
+
function isNode() {
|
|
54
|
+
return detectRuntime() === "node";
|
|
55
|
+
}
|
|
56
|
+
function isDeno() {
|
|
57
|
+
return detectRuntime() === "deno";
|
|
58
|
+
}
|
|
59
|
+
function isBun() {
|
|
60
|
+
return detectRuntime() === "bun";
|
|
61
|
+
}
|
|
62
|
+
function isCloudflareWorkers() {
|
|
63
|
+
return detectRuntime() === "cloudflare-workers";
|
|
64
|
+
}
|
|
65
|
+
function isEdgeRuntime() {
|
|
66
|
+
return detectRuntime() === "edge-runtime";
|
|
67
|
+
}
|
|
68
|
+
function isEdgeEnvironment() {
|
|
69
|
+
const runtime = detectRuntime();
|
|
70
|
+
return runtime === "cloudflare-workers" || runtime === "edge-runtime";
|
|
71
|
+
}
|
|
72
|
+
function isWebEnvironment() {
|
|
73
|
+
const runtime = detectRuntime();
|
|
74
|
+
return runtime === "browser";
|
|
75
|
+
}
|
|
76
|
+
function isServerEnvironment() {
|
|
77
|
+
const runtime = detectRuntime();
|
|
78
|
+
return runtime === "node" || runtime === "deno" || runtime === "bun" || runtime === "cloudflare-workers" || runtime === "edge-runtime";
|
|
79
|
+
}
|
|
80
|
+
function hasFileApi() {
|
|
81
|
+
return typeof window !== "undefined" && typeof File !== "undefined" && typeof Blob !== "undefined";
|
|
82
|
+
}
|
|
83
|
+
function hasBlob() {
|
|
84
|
+
return typeof Blob !== "undefined";
|
|
85
|
+
}
|
|
86
|
+
function hasWorkers() {
|
|
87
|
+
return typeof Worker !== "undefined";
|
|
88
|
+
}
|
|
89
|
+
function hasSharedArrayBuffer() {
|
|
90
|
+
return typeof SharedArrayBuffer !== "undefined";
|
|
91
|
+
}
|
|
92
|
+
function hasModuleWorkers() {
|
|
93
|
+
if (!hasWorkers()) {
|
|
94
|
+
return false;
|
|
95
|
+
}
|
|
96
|
+
try {
|
|
97
|
+
const blob = new Blob(['console.log("test")'], {
|
|
98
|
+
type: "application/javascript"
|
|
99
|
+
});
|
|
100
|
+
const workerUrl = URL.createObjectURL(blob);
|
|
101
|
+
try {
|
|
102
|
+
return true;
|
|
103
|
+
} finally {
|
|
104
|
+
URL.revokeObjectURL(workerUrl);
|
|
105
|
+
}
|
|
106
|
+
} catch {
|
|
107
|
+
return false;
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
function hasWasm() {
|
|
111
|
+
return typeof WebAssembly !== "undefined" && WebAssembly.instantiate !== void 0;
|
|
112
|
+
}
|
|
113
|
+
function hasWasmStreaming() {
|
|
114
|
+
return typeof WebAssembly !== "undefined" && WebAssembly.instantiateStreaming !== void 0;
|
|
115
|
+
}
|
|
116
|
+
function hasBigInt() {
|
|
117
|
+
try {
|
|
118
|
+
const test = BigInt("1");
|
|
119
|
+
return typeof test === "bigint";
|
|
120
|
+
} catch {
|
|
121
|
+
return false;
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
function getRuntimeVersion() {
|
|
125
|
+
const runtime = detectRuntime();
|
|
126
|
+
switch (runtime) {
|
|
127
|
+
case "node":
|
|
128
|
+
return process.version?.substring(1);
|
|
129
|
+
case "deno": {
|
|
130
|
+
const deno = globalThis.Deno;
|
|
131
|
+
const version = deno?.version;
|
|
132
|
+
return version?.deno;
|
|
133
|
+
}
|
|
134
|
+
case "bun": {
|
|
135
|
+
const bun = globalThis.Bun;
|
|
136
|
+
return bun?.version;
|
|
137
|
+
}
|
|
138
|
+
default:
|
|
139
|
+
return void 0;
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
function getWasmCapabilities() {
|
|
143
|
+
const runtime = detectRuntime();
|
|
144
|
+
const version = getRuntimeVersion();
|
|
145
|
+
const capabilities = {
|
|
146
|
+
runtime,
|
|
147
|
+
hasWasm: hasWasm(),
|
|
148
|
+
hasWasmStreaming: hasWasmStreaming(),
|
|
149
|
+
hasFileApi: hasFileApi(),
|
|
150
|
+
hasBlob: hasBlob(),
|
|
151
|
+
hasWorkers: hasWorkers(),
|
|
152
|
+
hasSharedArrayBuffer: hasSharedArrayBuffer(),
|
|
153
|
+
hasModuleWorkers: hasModuleWorkers(),
|
|
154
|
+
hasBigInt: hasBigInt(),
|
|
155
|
+
...version !== void 0 ? { runtimeVersion: version } : {}
|
|
156
|
+
};
|
|
157
|
+
return capabilities;
|
|
158
|
+
}
|
|
159
|
+
function getRuntimeInfo() {
|
|
160
|
+
const runtime = detectRuntime();
|
|
161
|
+
const capabilities = getWasmCapabilities();
|
|
162
|
+
return {
|
|
163
|
+
runtime,
|
|
164
|
+
isBrowser: isBrowser(),
|
|
165
|
+
isNode: isNode(),
|
|
166
|
+
isDeno: isDeno(),
|
|
167
|
+
isBun: isBun(),
|
|
168
|
+
isWeb: isWebEnvironment(),
|
|
169
|
+
isServer: isServerEnvironment(),
|
|
170
|
+
runtimeVersion: getRuntimeVersion(),
|
|
171
|
+
userAgent: typeof navigator !== "undefined" ? navigator.userAgent : "N/A",
|
|
172
|
+
capabilities
|
|
173
|
+
};
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
// typescript/initialization/pdfium-loader.ts
|
|
177
|
+
async function initializePdfiumAsync(wasmModule) {
|
|
178
|
+
if (!wasmModule || typeof wasmModule.initialize_pdfium_render !== "function") {
|
|
179
|
+
return;
|
|
180
|
+
}
|
|
181
|
+
if (!isBrowser()) {
|
|
182
|
+
console.debug("PDFium initialization skipped (non-browser environment)");
|
|
183
|
+
return;
|
|
184
|
+
}
|
|
185
|
+
try {
|
|
186
|
+
const pdfiumModule = await Promise.resolve().then(() => (init_pdfium(), pdfium_exports));
|
|
187
|
+
const pdfium = typeof pdfiumModule.default === "function" ? await pdfiumModule.default() : pdfiumModule;
|
|
188
|
+
const success = wasmModule.initialize_pdfium_render(pdfium, wasmModule, false);
|
|
189
|
+
if (!success) {
|
|
190
|
+
console.warn("PDFium initialization returned false");
|
|
191
|
+
}
|
|
192
|
+
} catch (error) {
|
|
193
|
+
console.debug("PDFium initialization error:", error);
|
|
194
|
+
}
|
|
195
|
+
}
|
|
196
|
+
|
|
27
197
|
// typescript/adapters/wasm-adapter.ts
|
|
28
198
|
var MAX_FILE_SIZE = 512 * 1024 * 1024;
|
|
29
199
|
function isNumberOrNull(value) {
|
|
@@ -257,59 +427,417 @@ function isValidExtractionResult(value) {
|
|
|
257
427
|
return typeof obj.content === "string" && (typeof obj.mimeType === "string" || typeof obj.mime_type === "string") && obj.metadata !== null && typeof obj.metadata === "object" && Array.isArray(obj.tables);
|
|
258
428
|
}
|
|
259
429
|
|
|
260
|
-
// typescript/
|
|
261
|
-
var
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
430
|
+
// typescript/initialization/state.ts
|
|
431
|
+
var wasm = null;
|
|
432
|
+
var initialized = false;
|
|
433
|
+
var initializationError = null;
|
|
434
|
+
var initializationPromise = null;
|
|
435
|
+
function getWasmModule() {
|
|
436
|
+
return wasm;
|
|
437
|
+
}
|
|
438
|
+
function setWasmModule(module) {
|
|
439
|
+
wasm = module;
|
|
440
|
+
}
|
|
441
|
+
function isInitialized() {
|
|
442
|
+
return initialized;
|
|
443
|
+
}
|
|
444
|
+
function setInitialized(value) {
|
|
445
|
+
initialized = value;
|
|
446
|
+
}
|
|
447
|
+
function getInitializationError() {
|
|
448
|
+
return initializationError;
|
|
449
|
+
}
|
|
450
|
+
function setInitializationError(error) {
|
|
451
|
+
initializationError = error;
|
|
452
|
+
}
|
|
453
|
+
function getInitializationPromise() {
|
|
454
|
+
return initializationPromise;
|
|
455
|
+
}
|
|
456
|
+
function setInitializationPromise(promise) {
|
|
457
|
+
initializationPromise = promise;
|
|
458
|
+
}
|
|
459
|
+
|
|
460
|
+
// typescript/initialization/wasm-loader.ts
|
|
461
|
+
async function loadWasmBinaryForNode() {
|
|
462
|
+
if (!isNode()) {
|
|
463
|
+
return void 0;
|
|
271
464
|
}
|
|
272
|
-
|
|
273
|
-
|
|
465
|
+
try {
|
|
466
|
+
const fs = await import(
|
|
467
|
+
/* @vite-ignore */
|
|
468
|
+
"fs/promises"
|
|
469
|
+
);
|
|
470
|
+
const path = await import(
|
|
471
|
+
/* @vite-ignore */
|
|
472
|
+
"path"
|
|
473
|
+
);
|
|
474
|
+
const url = await import(
|
|
475
|
+
/* @vite-ignore */
|
|
476
|
+
"url"
|
|
477
|
+
);
|
|
478
|
+
const __dirname = path.dirname(url.fileURLToPath(import.meta.url));
|
|
479
|
+
const wasmPath = path.join(__dirname, "..", "pkg", "kreuzberg_wasm_bg.wasm");
|
|
480
|
+
const wasmBuffer = await fs.readFile(wasmPath);
|
|
481
|
+
return new Uint8Array(wasmBuffer);
|
|
482
|
+
} catch {
|
|
483
|
+
return void 0;
|
|
274
484
|
}
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
485
|
+
}
|
|
486
|
+
function getVersion() {
|
|
487
|
+
if (!isInitialized()) {
|
|
488
|
+
throw new Error("WASM module not initialized. Call initWasm() first.");
|
|
278
489
|
}
|
|
279
|
-
|
|
280
|
-
|
|
490
|
+
const wasmModule = getWasmModule();
|
|
491
|
+
if (!wasmModule) {
|
|
492
|
+
throw new Error("WASM module not loaded. Call initWasm() first.");
|
|
281
493
|
}
|
|
282
|
-
|
|
283
|
-
}
|
|
284
|
-
function getOcrBackend(name) {
|
|
285
|
-
return ocrBackendRegistry.get(name);
|
|
286
|
-
}
|
|
287
|
-
function listOcrBackends() {
|
|
288
|
-
return Array.from(ocrBackendRegistry.keys());
|
|
494
|
+
return wasmModule.version();
|
|
289
495
|
}
|
|
290
|
-
async function
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
throw new Error(
|
|
294
|
-
`OCR backend "${name}" is not registered. Available backends: ${Array.from(ocrBackendRegistry.keys()).join(", ")}`
|
|
295
|
-
);
|
|
496
|
+
async function initWasm() {
|
|
497
|
+
if (isInitialized()) {
|
|
498
|
+
return;
|
|
296
499
|
}
|
|
297
|
-
|
|
500
|
+
let currentPromise = getInitializationPromise();
|
|
501
|
+
if (currentPromise) {
|
|
502
|
+
return currentPromise;
|
|
503
|
+
}
|
|
504
|
+
currentPromise = (async () => {
|
|
298
505
|
try {
|
|
299
|
-
|
|
506
|
+
if (!hasWasm()) {
|
|
507
|
+
throw new Error("WebAssembly is not supported in this environment");
|
|
508
|
+
}
|
|
509
|
+
let wasmModule;
|
|
510
|
+
const pkgPath = "./pkg/kreuzberg_wasm.js";
|
|
511
|
+
const fallbackPath = "./kreuzberg_wasm.js";
|
|
512
|
+
try {
|
|
513
|
+
wasmModule = await import(
|
|
514
|
+
/* @vite-ignore */
|
|
515
|
+
pkgPath
|
|
516
|
+
);
|
|
517
|
+
} catch {
|
|
518
|
+
wasmModule = await import(
|
|
519
|
+
/* @vite-ignore */
|
|
520
|
+
fallbackPath
|
|
521
|
+
);
|
|
522
|
+
}
|
|
523
|
+
const loadedModule = wasmModule;
|
|
524
|
+
setWasmModule(loadedModule);
|
|
525
|
+
if (loadedModule && typeof loadedModule.default === "function") {
|
|
526
|
+
const wasmBinary = await loadWasmBinaryForNode();
|
|
527
|
+
if (wasmBinary) {
|
|
528
|
+
await loadedModule.default(wasmBinary);
|
|
529
|
+
} else {
|
|
530
|
+
await loadedModule.default();
|
|
531
|
+
}
|
|
532
|
+
}
|
|
533
|
+
if (isBrowser() && loadedModule && typeof loadedModule.initialize_pdfium_render === "function") {
|
|
534
|
+
initializePdfiumAsync(loadedModule).catch((error) => {
|
|
535
|
+
console.warn("PDFium auto-initialization failed (PDF extraction disabled):", error);
|
|
536
|
+
});
|
|
537
|
+
}
|
|
538
|
+
setInitialized(true);
|
|
539
|
+
setInitializationError(null);
|
|
300
540
|
} catch (error) {
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
);
|
|
541
|
+
setInitializationError(error instanceof Error ? error : new Error(String(error)));
|
|
542
|
+
throw wrapWasmError(error, "initializing Kreuzberg WASM module");
|
|
304
543
|
}
|
|
544
|
+
})();
|
|
545
|
+
setInitializationPromise(currentPromise);
|
|
546
|
+
return currentPromise;
|
|
547
|
+
}
|
|
548
|
+
|
|
549
|
+
// typescript/extraction/internal.ts
|
|
550
|
+
function getWasmModule2() {
|
|
551
|
+
const wasm2 = getWasmModule();
|
|
552
|
+
if (!wasm2) {
|
|
553
|
+
throw new Error("WASM module not loaded. Call initWasm() first.");
|
|
305
554
|
}
|
|
306
|
-
|
|
555
|
+
return wasm2;
|
|
307
556
|
}
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
557
|
+
function isInitialized2() {
|
|
558
|
+
return isInitialized();
|
|
559
|
+
}
|
|
560
|
+
|
|
561
|
+
// typescript/extraction/bytes.ts
|
|
562
|
+
async function extractBytes(data, mimeType, config) {
|
|
563
|
+
if (!isInitialized2()) {
|
|
564
|
+
throw new Error("WASM module not initialized. Call initWasm() first.");
|
|
565
|
+
}
|
|
566
|
+
const wasm2 = getWasmModule2();
|
|
567
|
+
try {
|
|
568
|
+
if (!data || data.length === 0) {
|
|
569
|
+
throw new Error("Document data cannot be empty");
|
|
570
|
+
}
|
|
571
|
+
if (!mimeType) {
|
|
572
|
+
throw new Error("MIME type is required");
|
|
573
|
+
}
|
|
574
|
+
const normalizedConfig = configToJS(config ?? null);
|
|
575
|
+
const result = await wasm2.extractBytes(data, mimeType, normalizedConfig);
|
|
576
|
+
if (!result) {
|
|
577
|
+
throw new Error("Invalid extraction result: no result from WASM module");
|
|
578
|
+
}
|
|
579
|
+
return jsToExtractionResult(result);
|
|
580
|
+
} catch (error) {
|
|
581
|
+
throw wrapWasmError(error, "extracting from bytes");
|
|
582
|
+
}
|
|
583
|
+
}
|
|
584
|
+
function extractBytesSync(data, mimeType, config) {
|
|
585
|
+
if (!isInitialized2()) {
|
|
586
|
+
throw new Error("WASM module not initialized. Call initWasm() first.");
|
|
587
|
+
}
|
|
588
|
+
const wasm2 = getWasmModule2();
|
|
589
|
+
try {
|
|
590
|
+
if (!data || data.length === 0) {
|
|
591
|
+
throw new Error("Document data cannot be empty");
|
|
592
|
+
}
|
|
593
|
+
if (!mimeType) {
|
|
594
|
+
throw new Error("MIME type is required");
|
|
595
|
+
}
|
|
596
|
+
const normalizedConfig = configToJS(config ?? null);
|
|
597
|
+
const result = wasm2.extractBytesSync(data, mimeType, normalizedConfig);
|
|
598
|
+
if (!result) {
|
|
599
|
+
throw new Error("Invalid extraction result: no result from WASM module");
|
|
600
|
+
}
|
|
601
|
+
return jsToExtractionResult(result);
|
|
602
|
+
} catch (error) {
|
|
603
|
+
throw wrapWasmError(error, "extracting from bytes (sync)");
|
|
604
|
+
}
|
|
605
|
+
}
|
|
606
|
+
|
|
607
|
+
// typescript/extraction/files.ts
|
|
608
|
+
async function extractFile(path, mimeType, config) {
|
|
609
|
+
if (!isInitialized2()) {
|
|
610
|
+
throw new Error("WASM module not initialized. Call initWasm() first.");
|
|
611
|
+
}
|
|
612
|
+
const wasm2 = getWasmModule2();
|
|
613
|
+
try {
|
|
614
|
+
if (!path) {
|
|
615
|
+
throw new Error("File path is required");
|
|
616
|
+
}
|
|
617
|
+
const runtime = detectRuntime();
|
|
618
|
+
if (runtime === "browser") {
|
|
619
|
+
throw new Error("Use extractBytes with fileToUint8Array for browser environments");
|
|
620
|
+
}
|
|
621
|
+
let fileData;
|
|
622
|
+
if (runtime === "node") {
|
|
623
|
+
const { readFile } = await import("fs/promises");
|
|
624
|
+
const buffer = await readFile(path);
|
|
625
|
+
fileData = new Uint8Array(buffer);
|
|
626
|
+
} else if (runtime === "deno") {
|
|
627
|
+
const deno = globalThis.Deno;
|
|
628
|
+
fileData = await deno.readFile(path);
|
|
629
|
+
} else if (runtime === "bun") {
|
|
630
|
+
const { readFile } = await import("fs/promises");
|
|
631
|
+
const buffer = await readFile(path);
|
|
632
|
+
fileData = new Uint8Array(buffer);
|
|
633
|
+
} else {
|
|
634
|
+
throw new Error(`Unsupported runtime for file extraction: ${runtime}`);
|
|
635
|
+
}
|
|
636
|
+
let detectedMimeType = mimeType;
|
|
637
|
+
if (!detectedMimeType) {
|
|
638
|
+
detectedMimeType = wasm2.detectMimeFromBytes(fileData);
|
|
639
|
+
}
|
|
640
|
+
if (!detectedMimeType) {
|
|
641
|
+
throw new Error("Could not detect MIME type for file. Please provide mimeType parameter.");
|
|
642
|
+
}
|
|
643
|
+
detectedMimeType = wasm2.normalizeMimeType(detectedMimeType);
|
|
644
|
+
return await extractBytes(fileData, detectedMimeType, config);
|
|
645
|
+
} catch (error) {
|
|
646
|
+
throw wrapWasmError(error, `extracting from file: ${path}`);
|
|
647
|
+
}
|
|
648
|
+
}
|
|
649
|
+
async function extractFromFile(file, mimeType, config) {
|
|
650
|
+
if (!isInitialized2()) {
|
|
651
|
+
throw new Error("WASM module not initialized. Call initWasm() first.");
|
|
652
|
+
}
|
|
653
|
+
const wasm2 = getWasmModule2();
|
|
654
|
+
try {
|
|
655
|
+
const bytes = await fileToUint8Array(file);
|
|
656
|
+
let type = mimeType ?? (file instanceof File ? file.type : "application/octet-stream");
|
|
657
|
+
type = wasm2.normalizeMimeType(type);
|
|
658
|
+
return await extractBytes(bytes, type, config);
|
|
659
|
+
} catch (error) {
|
|
660
|
+
throw wrapWasmError(error, `extracting from ${file instanceof File ? "file" : "blob"}`);
|
|
661
|
+
}
|
|
662
|
+
}
|
|
663
|
+
|
|
664
|
+
// typescript/extraction/batch.ts
|
|
665
|
+
async function batchExtractBytes(files, config) {
|
|
666
|
+
if (!isInitialized2()) {
|
|
667
|
+
throw new Error("WASM module not initialized. Call initWasm() first.");
|
|
668
|
+
}
|
|
669
|
+
const wasm2 = getWasmModule2();
|
|
670
|
+
try {
|
|
671
|
+
if (!Array.isArray(files)) {
|
|
672
|
+
throw new Error("Files parameter must be an array");
|
|
673
|
+
}
|
|
674
|
+
if (files.length === 0) {
|
|
675
|
+
throw new Error("Files array cannot be empty");
|
|
676
|
+
}
|
|
677
|
+
const dataList = [];
|
|
678
|
+
const mimeTypes = [];
|
|
679
|
+
for (let i = 0; i < files.length; i += 1) {
|
|
680
|
+
const file = files[i];
|
|
681
|
+
if (!file || typeof file !== "object") {
|
|
682
|
+
throw new Error(`Invalid file at index ${i}: must be an object with data and mimeType`);
|
|
683
|
+
}
|
|
684
|
+
const f = file;
|
|
685
|
+
if (!(f.data instanceof Uint8Array)) {
|
|
686
|
+
throw new Error(`Invalid file at index ${i}: data must be Uint8Array`);
|
|
687
|
+
}
|
|
688
|
+
if (typeof f.mimeType !== "string") {
|
|
689
|
+
throw new Error(`Invalid file at index ${i}: mimeType must be a string`);
|
|
690
|
+
}
|
|
691
|
+
if (f.data.length === 0) {
|
|
692
|
+
throw new Error(`Invalid file at index ${i}: data cannot be empty`);
|
|
693
|
+
}
|
|
694
|
+
dataList.push(f.data);
|
|
695
|
+
mimeTypes.push(f.mimeType);
|
|
696
|
+
}
|
|
697
|
+
const normalizedConfig = configToJS(config ?? null);
|
|
698
|
+
const results = await wasm2.batchExtractBytes(dataList, mimeTypes, normalizedConfig);
|
|
699
|
+
if (!Array.isArray(results)) {
|
|
700
|
+
throw new Error("Invalid batch extraction result: expected array");
|
|
701
|
+
}
|
|
702
|
+
return results.map((result, index) => {
|
|
703
|
+
if (!result) {
|
|
704
|
+
throw new Error(`Invalid extraction result at index ${index}: no result from WASM module`);
|
|
705
|
+
}
|
|
706
|
+
return jsToExtractionResult(result);
|
|
707
|
+
});
|
|
708
|
+
} catch (error) {
|
|
709
|
+
throw wrapWasmError(error, "batch extracting from bytes");
|
|
710
|
+
}
|
|
711
|
+
}
|
|
712
|
+
function batchExtractBytesSync(files, config) {
|
|
713
|
+
if (!isInitialized2()) {
|
|
714
|
+
throw new Error("WASM module not initialized. Call initWasm() first.");
|
|
715
|
+
}
|
|
716
|
+
const wasm2 = getWasmModule2();
|
|
717
|
+
try {
|
|
718
|
+
if (!Array.isArray(files)) {
|
|
719
|
+
throw new Error("Files parameter must be an array");
|
|
720
|
+
}
|
|
721
|
+
if (files.length === 0) {
|
|
722
|
+
throw new Error("Files array cannot be empty");
|
|
723
|
+
}
|
|
724
|
+
const dataList = [];
|
|
725
|
+
const mimeTypes = [];
|
|
726
|
+
for (let i = 0; i < files.length; i += 1) {
|
|
727
|
+
const file = files[i];
|
|
728
|
+
if (!file || typeof file !== "object") {
|
|
729
|
+
throw new Error(`Invalid file at index ${i}: must be an object with data and mimeType`);
|
|
730
|
+
}
|
|
731
|
+
const f = file;
|
|
732
|
+
if (!(f.data instanceof Uint8Array)) {
|
|
733
|
+
throw new Error(`Invalid file at index ${i}: data must be Uint8Array`);
|
|
734
|
+
}
|
|
735
|
+
if (typeof f.mimeType !== "string") {
|
|
736
|
+
throw new Error(`Invalid file at index ${i}: mimeType must be a string`);
|
|
737
|
+
}
|
|
738
|
+
if (f.data.length === 0) {
|
|
739
|
+
throw new Error(`Invalid file at index ${i}: data cannot be empty`);
|
|
740
|
+
}
|
|
741
|
+
dataList.push(f.data);
|
|
742
|
+
mimeTypes.push(f.mimeType);
|
|
743
|
+
}
|
|
744
|
+
const normalizedConfig = configToJS(config ?? null);
|
|
745
|
+
const results = wasm2.batchExtractBytesSync(dataList, mimeTypes, normalizedConfig);
|
|
746
|
+
if (!Array.isArray(results)) {
|
|
747
|
+
throw new Error("Invalid batch extraction result: expected array");
|
|
748
|
+
}
|
|
749
|
+
return results.map((result, index) => {
|
|
750
|
+
if (!result) {
|
|
751
|
+
throw new Error(`Invalid extraction result at index ${index}: no result from WASM module`);
|
|
752
|
+
}
|
|
753
|
+
return jsToExtractionResult(result);
|
|
754
|
+
});
|
|
755
|
+
} catch (error) {
|
|
756
|
+
throw wrapWasmError(error, "batch extracting from bytes (sync)");
|
|
757
|
+
}
|
|
758
|
+
}
|
|
759
|
+
async function batchExtractFiles(files, config) {
|
|
760
|
+
if (!isInitialized2()) {
|
|
761
|
+
throw new Error("WASM module not initialized. Call initWasm() first.");
|
|
762
|
+
}
|
|
763
|
+
try {
|
|
764
|
+
if (!Array.isArray(files)) {
|
|
765
|
+
throw new Error("Files parameter must be an array");
|
|
766
|
+
}
|
|
767
|
+
if (files.length === 0) {
|
|
768
|
+
throw new Error("Files array cannot be empty");
|
|
769
|
+
}
|
|
770
|
+
const byteFiles = [];
|
|
771
|
+
for (let i = 0; i < files.length; i += 1) {
|
|
772
|
+
const file = files[i];
|
|
773
|
+
if (!(file instanceof File)) {
|
|
774
|
+
throw new Error(`Invalid file at index ${i}: must be a File object`);
|
|
775
|
+
}
|
|
776
|
+
const bytes = await fileToUint8Array(file);
|
|
777
|
+
byteFiles.push({
|
|
778
|
+
data: bytes,
|
|
779
|
+
mimeType: file.type || "application/octet-stream"
|
|
780
|
+
});
|
|
781
|
+
}
|
|
782
|
+
return await batchExtractBytes(byteFiles, config);
|
|
783
|
+
} catch (error) {
|
|
784
|
+
throw wrapWasmError(error, "batch extracting from files");
|
|
785
|
+
}
|
|
786
|
+
}
|
|
787
|
+
|
|
788
|
+
// typescript/ocr/registry.ts
|
|
789
|
+
var ocrBackendRegistry = /* @__PURE__ */ new Map();
|
|
790
|
+
function registerOcrBackend(backend) {
|
|
791
|
+
if (!backend) {
|
|
792
|
+
throw new Error("Backend cannot be null or undefined");
|
|
793
|
+
}
|
|
794
|
+
if (typeof backend.name !== "function") {
|
|
795
|
+
throw new Error("Backend must implement name() method");
|
|
796
|
+
}
|
|
797
|
+
if (typeof backend.supportedLanguages !== "function") {
|
|
798
|
+
throw new Error("Backend must implement supportedLanguages() method");
|
|
799
|
+
}
|
|
800
|
+
if (typeof backend.processImage !== "function") {
|
|
801
|
+
throw new Error("Backend must implement processImage() method");
|
|
802
|
+
}
|
|
803
|
+
const backendName = backend.name();
|
|
804
|
+
if (!backendName || typeof backendName !== "string") {
|
|
805
|
+
throw new Error("Backend name must be a non-empty string");
|
|
806
|
+
}
|
|
807
|
+
if (ocrBackendRegistry.has(backendName)) {
|
|
808
|
+
console.warn(`OCR backend "${backendName}" is already registered and will be replaced`);
|
|
809
|
+
}
|
|
810
|
+
ocrBackendRegistry.set(backendName, backend);
|
|
811
|
+
}
|
|
812
|
+
function getOcrBackend(name) {
|
|
813
|
+
return ocrBackendRegistry.get(name);
|
|
814
|
+
}
|
|
815
|
+
function listOcrBackends() {
|
|
816
|
+
return Array.from(ocrBackendRegistry.keys());
|
|
817
|
+
}
|
|
818
|
+
async function unregisterOcrBackend(name) {
|
|
819
|
+
const backend = ocrBackendRegistry.get(name);
|
|
820
|
+
if (!backend) {
|
|
821
|
+
throw new Error(
|
|
822
|
+
`OCR backend "${name}" is not registered. Available backends: ${Array.from(ocrBackendRegistry.keys()).join(", ")}`
|
|
823
|
+
);
|
|
824
|
+
}
|
|
825
|
+
if (typeof backend.shutdown === "function") {
|
|
826
|
+
try {
|
|
827
|
+
await backend.shutdown();
|
|
828
|
+
} catch (error) {
|
|
829
|
+
console.warn(
|
|
830
|
+
`Error shutting down OCR backend "${name}": ${error instanceof Error ? error.message : String(error)}`
|
|
831
|
+
);
|
|
832
|
+
}
|
|
833
|
+
}
|
|
834
|
+
ocrBackendRegistry.delete(name);
|
|
835
|
+
}
|
|
836
|
+
async function clearOcrBackends() {
|
|
837
|
+
const backends = Array.from(ocrBackendRegistry.entries());
|
|
838
|
+
for (const [name, backend] of backends) {
|
|
839
|
+
if (typeof backend.shutdown === "function") {
|
|
840
|
+
try {
|
|
313
841
|
await backend.shutdown();
|
|
314
842
|
} catch (error) {
|
|
315
843
|
console.warn(
|
|
@@ -697,136 +1225,24 @@ var TesseractWasmBackend = class {
|
|
|
697
1225
|
}
|
|
698
1226
|
};
|
|
699
1227
|
|
|
700
|
-
// typescript/
|
|
701
|
-
function
|
|
702
|
-
if (
|
|
703
|
-
|
|
704
|
-
}
|
|
705
|
-
if (typeof globalThis.Bun !== "undefined") {
|
|
706
|
-
return "bun";
|
|
707
|
-
}
|
|
708
|
-
if (typeof process !== "undefined" && process.versions && process.versions.node) {
|
|
709
|
-
return "node";
|
|
710
|
-
}
|
|
711
|
-
if (typeof window !== "undefined" && typeof document !== "undefined") {
|
|
712
|
-
return "browser";
|
|
713
|
-
}
|
|
714
|
-
return "unknown";
|
|
715
|
-
}
|
|
716
|
-
function isBrowser() {
|
|
717
|
-
return detectRuntime() === "browser";
|
|
718
|
-
}
|
|
719
|
-
function isNode() {
|
|
720
|
-
return detectRuntime() === "node";
|
|
721
|
-
}
|
|
722
|
-
function isDeno() {
|
|
723
|
-
return detectRuntime() === "deno";
|
|
724
|
-
}
|
|
725
|
-
function isBun() {
|
|
726
|
-
return detectRuntime() === "bun";
|
|
727
|
-
}
|
|
728
|
-
function isWebEnvironment() {
|
|
729
|
-
const runtime = detectRuntime();
|
|
730
|
-
return runtime === "browser";
|
|
731
|
-
}
|
|
732
|
-
function isServerEnvironment() {
|
|
733
|
-
const runtime = detectRuntime();
|
|
734
|
-
return runtime === "node" || runtime === "deno" || runtime === "bun";
|
|
735
|
-
}
|
|
736
|
-
function hasFileApi() {
|
|
737
|
-
return typeof window !== "undefined" && typeof File !== "undefined" && typeof Blob !== "undefined";
|
|
738
|
-
}
|
|
739
|
-
function hasBlob() {
|
|
740
|
-
return typeof Blob !== "undefined";
|
|
741
|
-
}
|
|
742
|
-
function hasWorkers() {
|
|
743
|
-
return typeof Worker !== "undefined";
|
|
744
|
-
}
|
|
745
|
-
function hasSharedArrayBuffer() {
|
|
746
|
-
return typeof SharedArrayBuffer !== "undefined";
|
|
747
|
-
}
|
|
748
|
-
function hasModuleWorkers() {
|
|
749
|
-
if (!hasWorkers()) {
|
|
750
|
-
return false;
|
|
751
|
-
}
|
|
752
|
-
try {
|
|
753
|
-
const blob = new Blob(['console.log("test")'], {
|
|
754
|
-
type: "application/javascript"
|
|
755
|
-
});
|
|
756
|
-
const workerUrl = URL.createObjectURL(blob);
|
|
757
|
-
try {
|
|
758
|
-
return true;
|
|
759
|
-
} finally {
|
|
760
|
-
URL.revokeObjectURL(workerUrl);
|
|
761
|
-
}
|
|
762
|
-
} catch {
|
|
763
|
-
return false;
|
|
764
|
-
}
|
|
765
|
-
}
|
|
766
|
-
function hasWasm() {
|
|
767
|
-
return typeof WebAssembly !== "undefined" && WebAssembly.instantiate !== void 0;
|
|
768
|
-
}
|
|
769
|
-
function hasWasmStreaming() {
|
|
770
|
-
return typeof WebAssembly !== "undefined" && WebAssembly.instantiateStreaming !== void 0;
|
|
771
|
-
}
|
|
772
|
-
function hasBigInt() {
|
|
773
|
-
try {
|
|
774
|
-
const test = BigInt("1");
|
|
775
|
-
return typeof test === "bigint";
|
|
776
|
-
} catch {
|
|
777
|
-
return false;
|
|
1228
|
+
// typescript/ocr/enabler.ts
|
|
1229
|
+
async function enableOcr() {
|
|
1230
|
+
if (!isInitialized2()) {
|
|
1231
|
+
throw new Error("WASM module not initialized. Call initWasm() first.");
|
|
778
1232
|
}
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
const bun = globalThis.Bun;
|
|
792
|
-
return bun?.version;
|
|
793
|
-
}
|
|
794
|
-
default:
|
|
795
|
-
return void 0;
|
|
1233
|
+
if (!isBrowser()) {
|
|
1234
|
+
throw new Error(
|
|
1235
|
+
"OCR is only available in browser environments. TesseractWasmBackend requires Web Workers and createImageBitmap."
|
|
1236
|
+
);
|
|
1237
|
+
}
|
|
1238
|
+
try {
|
|
1239
|
+
const backend = new TesseractWasmBackend();
|
|
1240
|
+
await backend.initialize();
|
|
1241
|
+
registerOcrBackend(backend);
|
|
1242
|
+
} catch (error) {
|
|
1243
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
1244
|
+
throw new Error(`Failed to enable OCR: ${message}`);
|
|
796
1245
|
}
|
|
797
|
-
}
|
|
798
|
-
function getWasmCapabilities() {
|
|
799
|
-
const runtime = detectRuntime();
|
|
800
|
-
const version = getRuntimeVersion();
|
|
801
|
-
const capabilities = {
|
|
802
|
-
runtime,
|
|
803
|
-
hasWasm: hasWasm(),
|
|
804
|
-
hasWasmStreaming: hasWasmStreaming(),
|
|
805
|
-
hasFileApi: hasFileApi(),
|
|
806
|
-
hasBlob: hasBlob(),
|
|
807
|
-
hasWorkers: hasWorkers(),
|
|
808
|
-
hasSharedArrayBuffer: hasSharedArrayBuffer(),
|
|
809
|
-
hasModuleWorkers: hasModuleWorkers(),
|
|
810
|
-
hasBigInt: hasBigInt(),
|
|
811
|
-
...version !== void 0 ? { runtimeVersion: version } : {}
|
|
812
|
-
};
|
|
813
|
-
return capabilities;
|
|
814
|
-
}
|
|
815
|
-
function getRuntimeInfo() {
|
|
816
|
-
const runtime = detectRuntime();
|
|
817
|
-
const capabilities = getWasmCapabilities();
|
|
818
|
-
return {
|
|
819
|
-
runtime,
|
|
820
|
-
isBrowser: isBrowser(),
|
|
821
|
-
isNode: isNode(),
|
|
822
|
-
isDeno: isDeno(),
|
|
823
|
-
isBun: isBun(),
|
|
824
|
-
isWeb: isWebEnvironment(),
|
|
825
|
-
isServer: isServerEnvironment(),
|
|
826
|
-
runtimeVersion: getRuntimeVersion(),
|
|
827
|
-
userAgent: typeof navigator !== "undefined" ? navigator.userAgent : "N/A",
|
|
828
|
-
capabilities
|
|
829
|
-
};
|
|
830
1246
|
}
|
|
831
1247
|
|
|
832
1248
|
// typescript/plugin-registry.ts
|
|
@@ -871,460 +1287,125 @@ async function unregisterPostProcessor(name) {
|
|
|
871
1287
|
throw new Error(`Post-processor "${name}" is not registered.${availableStr}`);
|
|
872
1288
|
}
|
|
873
1289
|
try {
|
|
874
|
-
if (processor.shutdown) {
|
|
875
|
-
await processor.shutdown();
|
|
876
|
-
}
|
|
877
|
-
} catch (error) {
|
|
878
|
-
console.warn(`Error during shutdown of post-processor "${name}":`, error);
|
|
879
|
-
}
|
|
880
|
-
postProcessors.delete(name);
|
|
881
|
-
}
|
|
882
|
-
async function clearPostProcessors() {
|
|
883
|
-
const entries = Array.from(postProcessors.entries());
|
|
884
|
-
for (const [_name, processor] of entries) {
|
|
885
|
-
try {
|
|
886
|
-
if (processor.shutdown) {
|
|
887
|
-
await processor.shutdown();
|
|
888
|
-
}
|
|
889
|
-
} catch (error) {
|
|
890
|
-
console.warn(`Error during shutdown of post-processor "${_name}":`, error);
|
|
891
|
-
}
|
|
892
|
-
}
|
|
893
|
-
postProcessors.clear();
|
|
894
|
-
}
|
|
895
|
-
function validateValidator(validator) {
|
|
896
|
-
if (validator === null || validator === void 0) {
|
|
897
|
-
throw new Error("Validator cannot be null or undefined");
|
|
898
|
-
}
|
|
899
|
-
const obj = validator;
|
|
900
|
-
if (typeof obj.name !== "function") {
|
|
901
|
-
throw new Error("Validator must implement name() method");
|
|
902
|
-
}
|
|
903
|
-
if (typeof obj.validate !== "function") {
|
|
904
|
-
throw new Error("Validator must implement validate() method");
|
|
905
|
-
}
|
|
906
|
-
const name = obj.name();
|
|
907
|
-
if (typeof name !== "string" || name.trim() === "") {
|
|
908
|
-
throw new Error("Validator name must be a non-empty string");
|
|
909
|
-
}
|
|
910
|
-
return true;
|
|
911
|
-
}
|
|
912
|
-
function registerValidator(validator) {
|
|
913
|
-
validateValidator(validator);
|
|
914
|
-
const name = validator.name();
|
|
915
|
-
if (validators.has(name)) {
|
|
916
|
-
console.warn(`Validator "${name}" already registered, overwriting with new implementation`);
|
|
917
|
-
}
|
|
918
|
-
validators.set(name, validator);
|
|
919
|
-
}
|
|
920
|
-
function getValidator(name) {
|
|
921
|
-
return validators.get(name);
|
|
922
|
-
}
|
|
923
|
-
function listValidators() {
|
|
924
|
-
return Array.from(validators.keys());
|
|
925
|
-
}
|
|
926
|
-
async function unregisterValidator(name) {
|
|
927
|
-
const validator = validators.get(name);
|
|
928
|
-
if (!validator) {
|
|
929
|
-
const available = Array.from(validators.keys());
|
|
930
|
-
const availableStr = available.length > 0 ? ` Available: ${available.join(", ")}` : "";
|
|
931
|
-
throw new Error(`Validator "${name}" is not registered.${availableStr}`);
|
|
932
|
-
}
|
|
933
|
-
try {
|
|
934
|
-
if (validator.shutdown) {
|
|
935
|
-
await validator.shutdown();
|
|
936
|
-
}
|
|
937
|
-
} catch (error) {
|
|
938
|
-
console.warn(`Error during shutdown of validator "${name}":`, error);
|
|
939
|
-
}
|
|
940
|
-
validators.delete(name);
|
|
941
|
-
}
|
|
942
|
-
async function clearValidators() {
|
|
943
|
-
const entries = Array.from(validators.entries());
|
|
944
|
-
for (const [_name, validator] of entries) {
|
|
945
|
-
try {
|
|
946
|
-
if (validator.shutdown) {
|
|
947
|
-
await validator.shutdown();
|
|
948
|
-
}
|
|
949
|
-
} catch (error) {
|
|
950
|
-
console.warn(`Error during shutdown of validator "${_name}":`, error);
|
|
951
|
-
}
|
|
952
|
-
}
|
|
953
|
-
validators.clear();
|
|
954
|
-
}
|
|
955
|
-
function executePostProcessor(name, result) {
|
|
956
|
-
const processor = postProcessors.get(name);
|
|
957
|
-
if (!processor) {
|
|
958
|
-
return Promise.reject(new Error(`Post-processor "${name}" is not registered`));
|
|
959
|
-
}
|
|
960
|
-
try {
|
|
961
|
-
const output = processor.process(result);
|
|
962
|
-
if (output instanceof Promise) {
|
|
963
|
-
return output;
|
|
964
|
-
}
|
|
965
|
-
return Promise.resolve(output);
|
|
966
|
-
} catch (error) {
|
|
967
|
-
return Promise.reject(new Error(`Error executing post-processor "${name}": ${String(error)}`));
|
|
968
|
-
}
|
|
969
|
-
}
|
|
970
|
-
function executeValidator(name, result) {
|
|
971
|
-
const validator = validators.get(name);
|
|
972
|
-
if (!validator) {
|
|
973
|
-
return Promise.reject(new Error(`Validator "${name}" is not registered`));
|
|
974
|
-
}
|
|
975
|
-
try {
|
|
976
|
-
const output = validator.validate(result);
|
|
977
|
-
if (output instanceof Promise) {
|
|
978
|
-
return output;
|
|
979
|
-
}
|
|
980
|
-
return Promise.resolve(output);
|
|
981
|
-
} catch (error) {
|
|
982
|
-
return Promise.reject(new Error(`Error executing validator "${name}": ${String(error)}`));
|
|
983
|
-
}
|
|
984
|
-
}
|
|
985
|
-
function setupGlobalCallbacks() {
|
|
986
|
-
if (typeof globalThis !== "undefined") {
|
|
987
|
-
const callbacksObj = globalThis;
|
|
988
|
-
callbacksObj.__kreuzberg_execute_post_processor = executePostProcessor;
|
|
989
|
-
callbacksObj.__kreuzberg_execute_validator = executeValidator;
|
|
990
|
-
}
|
|
991
|
-
}
|
|
992
|
-
setupGlobalCallbacks();
|
|
993
|
-
|
|
994
|
-
// typescript/index.ts
|
|
995
|
-
var wasm = null;
|
|
996
|
-
var initialized = false;
|
|
997
|
-
var initializationError = null;
|
|
998
|
-
var initializationPromise = null;
|
|
999
|
-
async function initializePdfiumAsync(wasmModule) {
|
|
1000
|
-
if (!wasmModule || typeof wasmModule.initialize_pdfium_render !== "function") {
|
|
1001
|
-
return;
|
|
1002
|
-
}
|
|
1003
|
-
if (!isBrowser()) {
|
|
1004
|
-
console.debug("PDFium initialization skipped (non-browser environment)");
|
|
1005
|
-
return;
|
|
1006
|
-
}
|
|
1007
|
-
try {
|
|
1008
|
-
const pdfiumModule = await Promise.resolve().then(() => (init_pdfium(), pdfium_exports));
|
|
1009
|
-
const pdfium = typeof pdfiumModule.default === "function" ? await pdfiumModule.default() : pdfiumModule;
|
|
1010
|
-
const success = wasmModule.initialize_pdfium_render(pdfium, wasmModule, false);
|
|
1011
|
-
if (!success) {
|
|
1012
|
-
console.warn("PDFium initialization returned false");
|
|
1013
|
-
}
|
|
1014
|
-
} catch (error) {
|
|
1015
|
-
console.debug("PDFium initialization error:", error);
|
|
1016
|
-
}
|
|
1017
|
-
}
|
|
1018
|
-
async function initWasm() {
|
|
1019
|
-
if (initialized) {
|
|
1020
|
-
return;
|
|
1021
|
-
}
|
|
1022
|
-
if (initializationPromise) {
|
|
1023
|
-
return initializationPromise;
|
|
1024
|
-
}
|
|
1025
|
-
initializationPromise = (async () => {
|
|
1026
|
-
try {
|
|
1027
|
-
if (!hasWasm()) {
|
|
1028
|
-
throw new Error("WebAssembly is not supported in this environment");
|
|
1029
|
-
}
|
|
1030
|
-
let wasmModule;
|
|
1031
|
-
const pkgPath = "./pkg/kreuzberg_wasm.js";
|
|
1032
|
-
const fallbackPath = "./kreuzberg_wasm.js";
|
|
1033
|
-
try {
|
|
1034
|
-
wasmModule = await import(
|
|
1035
|
-
/* @vite-ignore */
|
|
1036
|
-
pkgPath
|
|
1037
|
-
);
|
|
1038
|
-
} catch {
|
|
1039
|
-
wasmModule = await import(
|
|
1040
|
-
/* @vite-ignore */
|
|
1041
|
-
fallbackPath
|
|
1042
|
-
);
|
|
1043
|
-
}
|
|
1044
|
-
wasm = wasmModule;
|
|
1045
|
-
if (wasm && typeof wasm.default === "function") {
|
|
1046
|
-
await wasm.default();
|
|
1047
|
-
}
|
|
1048
|
-
if (isBrowser() && wasm && typeof wasm.initialize_pdfium_render === "function") {
|
|
1049
|
-
initializePdfiumAsync(wasm).catch((error) => {
|
|
1050
|
-
console.warn("PDFium auto-initialization failed (PDF extraction disabled):", error);
|
|
1051
|
-
});
|
|
1052
|
-
}
|
|
1053
|
-
initialized = true;
|
|
1054
|
-
initializationError = null;
|
|
1055
|
-
} catch (error) {
|
|
1056
|
-
initializationError = error instanceof Error ? error : new Error(String(error));
|
|
1057
|
-
throw wrapWasmError(error, "initializing Kreuzberg WASM module");
|
|
1058
|
-
}
|
|
1059
|
-
})();
|
|
1060
|
-
return initializationPromise;
|
|
1061
|
-
}
|
|
1062
|
-
function isInitialized() {
|
|
1063
|
-
return initialized;
|
|
1064
|
-
}
|
|
1065
|
-
function getVersion() {
|
|
1066
|
-
if (!initialized) {
|
|
1067
|
-
throw new Error("WASM module not initialized. Call initWasm() first.");
|
|
1068
|
-
}
|
|
1069
|
-
if (!wasm) {
|
|
1070
|
-
throw new Error("WASM module not loaded. Call initWasm() first.");
|
|
1071
|
-
}
|
|
1072
|
-
return wasm.version();
|
|
1073
|
-
}
|
|
1074
|
-
function getInitializationError() {
|
|
1075
|
-
return initializationError;
|
|
1076
|
-
}
|
|
1077
|
-
async function extractBytes(data, mimeType, config) {
|
|
1078
|
-
if (!initialized) {
|
|
1079
|
-
throw new Error("WASM module not initialized. Call initWasm() first.");
|
|
1080
|
-
}
|
|
1081
|
-
if (!wasm) {
|
|
1082
|
-
throw new Error("WASM module not loaded. Call initWasm() first.");
|
|
1083
|
-
}
|
|
1084
|
-
try {
|
|
1085
|
-
if (!data || data.length === 0) {
|
|
1086
|
-
throw new Error("Document data cannot be empty");
|
|
1087
|
-
}
|
|
1088
|
-
if (!mimeType) {
|
|
1089
|
-
throw new Error("MIME type is required");
|
|
1090
|
-
}
|
|
1091
|
-
const normalizedConfig = configToJS(config ?? null);
|
|
1092
|
-
const result = await wasm.extractBytes(data, mimeType, normalizedConfig);
|
|
1093
|
-
if (!result) {
|
|
1094
|
-
throw new Error("Invalid extraction result: no result from WASM module");
|
|
1095
|
-
}
|
|
1096
|
-
return jsToExtractionResult(result);
|
|
1097
|
-
} catch (error) {
|
|
1098
|
-
throw wrapWasmError(error, "extracting from bytes");
|
|
1099
|
-
}
|
|
1100
|
-
}
|
|
1101
|
-
async function extractFile(path, mimeType, config) {
|
|
1102
|
-
if (!initialized) {
|
|
1103
|
-
throw new Error("WASM module not initialized. Call initWasm() first.");
|
|
1104
|
-
}
|
|
1105
|
-
if (!wasm) {
|
|
1106
|
-
throw new Error("WASM module not loaded. Call initWasm() first.");
|
|
1107
|
-
}
|
|
1108
|
-
try {
|
|
1109
|
-
if (!path) {
|
|
1110
|
-
throw new Error("File path is required");
|
|
1111
|
-
}
|
|
1112
|
-
const runtime = detectRuntime();
|
|
1113
|
-
if (runtime === "browser") {
|
|
1114
|
-
throw new Error("Use extractBytes with fileToUint8Array for browser environments");
|
|
1115
|
-
}
|
|
1116
|
-
let fileData;
|
|
1117
|
-
if (runtime === "node") {
|
|
1118
|
-
const { readFile } = await import("fs/promises");
|
|
1119
|
-
const buffer = await readFile(path);
|
|
1120
|
-
fileData = new Uint8Array(buffer);
|
|
1121
|
-
} else if (runtime === "deno") {
|
|
1122
|
-
const deno = globalThis.Deno;
|
|
1123
|
-
fileData = await deno.readFile(path);
|
|
1124
|
-
} else if (runtime === "bun") {
|
|
1125
|
-
const { readFile } = await import("fs/promises");
|
|
1126
|
-
const buffer = await readFile(path);
|
|
1127
|
-
fileData = new Uint8Array(buffer);
|
|
1128
|
-
} else {
|
|
1129
|
-
throw new Error(`Unsupported runtime for file extraction: ${runtime}`);
|
|
1130
|
-
}
|
|
1131
|
-
let detectedMimeType = mimeType;
|
|
1132
|
-
if (!detectedMimeType) {
|
|
1133
|
-
detectedMimeType = wasm.detectMimeFromBytes(fileData);
|
|
1134
|
-
}
|
|
1135
|
-
if (!detectedMimeType) {
|
|
1136
|
-
throw new Error("Could not detect MIME type for file. Please provide mimeType parameter.");
|
|
1290
|
+
if (processor.shutdown) {
|
|
1291
|
+
await processor.shutdown();
|
|
1137
1292
|
}
|
|
1138
|
-
detectedMimeType = wasm.normalizeMimeType(detectedMimeType);
|
|
1139
|
-
return await extractBytes(fileData, detectedMimeType, config);
|
|
1140
1293
|
} catch (error) {
|
|
1141
|
-
|
|
1294
|
+
console.warn(`Error during shutdown of post-processor "${name}":`, error);
|
|
1142
1295
|
}
|
|
1296
|
+
postProcessors.delete(name);
|
|
1143
1297
|
}
|
|
1144
|
-
async function
|
|
1145
|
-
|
|
1146
|
-
|
|
1298
|
+
async function clearPostProcessors() {
|
|
1299
|
+
const entries = Array.from(postProcessors.entries());
|
|
1300
|
+
for (const [_name, processor] of entries) {
|
|
1301
|
+
try {
|
|
1302
|
+
if (processor.shutdown) {
|
|
1303
|
+
await processor.shutdown();
|
|
1304
|
+
}
|
|
1305
|
+
} catch (error) {
|
|
1306
|
+
console.warn(`Error during shutdown of post-processor "${_name}":`, error);
|
|
1307
|
+
}
|
|
1147
1308
|
}
|
|
1148
|
-
|
|
1149
|
-
|
|
1309
|
+
postProcessors.clear();
|
|
1310
|
+
}
|
|
1311
|
+
function validateValidator(validator) {
|
|
1312
|
+
if (validator === null || validator === void 0) {
|
|
1313
|
+
throw new Error("Validator cannot be null or undefined");
|
|
1150
1314
|
}
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1315
|
+
const obj = validator;
|
|
1316
|
+
if (typeof obj.name !== "function") {
|
|
1317
|
+
throw new Error("Validator must implement name() method");
|
|
1318
|
+
}
|
|
1319
|
+
if (typeof obj.validate !== "function") {
|
|
1320
|
+
throw new Error("Validator must implement validate() method");
|
|
1321
|
+
}
|
|
1322
|
+
const name = obj.name();
|
|
1323
|
+
if (typeof name !== "string" || name.trim() === "") {
|
|
1324
|
+
throw new Error("Validator name must be a non-empty string");
|
|
1158
1325
|
}
|
|
1326
|
+
return true;
|
|
1159
1327
|
}
|
|
1160
|
-
function
|
|
1161
|
-
|
|
1162
|
-
|
|
1328
|
+
function registerValidator(validator) {
|
|
1329
|
+
validateValidator(validator);
|
|
1330
|
+
const name = validator.name();
|
|
1331
|
+
if (validators.has(name)) {
|
|
1332
|
+
console.warn(`Validator "${name}" already registered, overwriting with new implementation`);
|
|
1163
1333
|
}
|
|
1164
|
-
|
|
1165
|
-
|
|
1334
|
+
validators.set(name, validator);
|
|
1335
|
+
}
|
|
1336
|
+
function getValidator(name) {
|
|
1337
|
+
return validators.get(name);
|
|
1338
|
+
}
|
|
1339
|
+
function listValidators() {
|
|
1340
|
+
return Array.from(validators.keys());
|
|
1341
|
+
}
|
|
1342
|
+
async function unregisterValidator(name) {
|
|
1343
|
+
const validator = validators.get(name);
|
|
1344
|
+
if (!validator) {
|
|
1345
|
+
const available = Array.from(validators.keys());
|
|
1346
|
+
const availableStr = available.length > 0 ? ` Available: ${available.join(", ")}` : "";
|
|
1347
|
+
throw new Error(`Validator "${name}" is not registered.${availableStr}`);
|
|
1166
1348
|
}
|
|
1167
1349
|
try {
|
|
1168
|
-
if (
|
|
1169
|
-
|
|
1170
|
-
}
|
|
1171
|
-
if (!mimeType) {
|
|
1172
|
-
throw new Error("MIME type is required");
|
|
1173
|
-
}
|
|
1174
|
-
const normalizedConfig = configToJS(config ?? null);
|
|
1175
|
-
const result = wasm.extractBytesSync(data, mimeType, normalizedConfig);
|
|
1176
|
-
if (!result) {
|
|
1177
|
-
throw new Error("Invalid extraction result: no result from WASM module");
|
|
1350
|
+
if (validator.shutdown) {
|
|
1351
|
+
await validator.shutdown();
|
|
1178
1352
|
}
|
|
1179
|
-
return jsToExtractionResult(result);
|
|
1180
1353
|
} catch (error) {
|
|
1181
|
-
|
|
1354
|
+
console.warn(`Error during shutdown of validator "${name}":`, error);
|
|
1182
1355
|
}
|
|
1356
|
+
validators.delete(name);
|
|
1183
1357
|
}
|
|
1184
|
-
async function
|
|
1185
|
-
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
}
|
|
1191
|
-
try {
|
|
1192
|
-
if (!Array.isArray(files)) {
|
|
1193
|
-
throw new Error("Files parameter must be an array");
|
|
1194
|
-
}
|
|
1195
|
-
if (files.length === 0) {
|
|
1196
|
-
throw new Error("Files array cannot be empty");
|
|
1197
|
-
}
|
|
1198
|
-
const dataList = [];
|
|
1199
|
-
const mimeTypes = [];
|
|
1200
|
-
for (let i = 0; i < files.length; i += 1) {
|
|
1201
|
-
const file = files[i];
|
|
1202
|
-
if (!file || typeof file !== "object") {
|
|
1203
|
-
throw new Error(`Invalid file at index ${i}: must be an object with data and mimeType`);
|
|
1204
|
-
}
|
|
1205
|
-
const f = file;
|
|
1206
|
-
if (!(f.data instanceof Uint8Array)) {
|
|
1207
|
-
throw new Error(`Invalid file at index ${i}: data must be Uint8Array`);
|
|
1208
|
-
}
|
|
1209
|
-
if (typeof f.mimeType !== "string") {
|
|
1210
|
-
throw new Error(`Invalid file at index ${i}: mimeType must be a string`);
|
|
1211
|
-
}
|
|
1212
|
-
if (f.data.length === 0) {
|
|
1213
|
-
throw new Error(`Invalid file at index ${i}: data cannot be empty`);
|
|
1358
|
+
async function clearValidators() {
|
|
1359
|
+
const entries = Array.from(validators.entries());
|
|
1360
|
+
for (const [_name, validator] of entries) {
|
|
1361
|
+
try {
|
|
1362
|
+
if (validator.shutdown) {
|
|
1363
|
+
await validator.shutdown();
|
|
1214
1364
|
}
|
|
1215
|
-
|
|
1216
|
-
|
|
1217
|
-
}
|
|
1218
|
-
const normalizedConfig = configToJS(config ?? null);
|
|
1219
|
-
const results = await wasm.batchExtractBytes(dataList, mimeTypes, normalizedConfig);
|
|
1220
|
-
if (!Array.isArray(results)) {
|
|
1221
|
-
throw new Error("Invalid batch extraction result: expected array");
|
|
1365
|
+
} catch (error) {
|
|
1366
|
+
console.warn(`Error during shutdown of validator "${_name}":`, error);
|
|
1222
1367
|
}
|
|
1223
|
-
return results.map((result, index) => {
|
|
1224
|
-
if (!result) {
|
|
1225
|
-
throw new Error(`Invalid extraction result at index ${index}: no result from WASM module`);
|
|
1226
|
-
}
|
|
1227
|
-
return jsToExtractionResult(result);
|
|
1228
|
-
});
|
|
1229
|
-
} catch (error) {
|
|
1230
|
-
throw wrapWasmError(error, "batch extracting from bytes");
|
|
1231
1368
|
}
|
|
1369
|
+
validators.clear();
|
|
1232
1370
|
}
|
|
1233
|
-
function
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
if (!wasm) {
|
|
1238
|
-
throw new Error("WASM module not loaded. Call initWasm() first.");
|
|
1371
|
+
function executePostProcessor(name, result) {
|
|
1372
|
+
const processor = postProcessors.get(name);
|
|
1373
|
+
if (!processor) {
|
|
1374
|
+
return Promise.reject(new Error(`Post-processor "${name}" is not registered`));
|
|
1239
1375
|
}
|
|
1240
1376
|
try {
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
if (files.length === 0) {
|
|
1245
|
-
throw new Error("Files array cannot be empty");
|
|
1246
|
-
}
|
|
1247
|
-
const dataList = [];
|
|
1248
|
-
const mimeTypes = [];
|
|
1249
|
-
for (let i = 0; i < files.length; i += 1) {
|
|
1250
|
-
const file = files[i];
|
|
1251
|
-
if (!file || typeof file !== "object") {
|
|
1252
|
-
throw new Error(`Invalid file at index ${i}: must be an object with data and mimeType`);
|
|
1253
|
-
}
|
|
1254
|
-
const f = file;
|
|
1255
|
-
if (!(f.data instanceof Uint8Array)) {
|
|
1256
|
-
throw new Error(`Invalid file at index ${i}: data must be Uint8Array`);
|
|
1257
|
-
}
|
|
1258
|
-
if (typeof f.mimeType !== "string") {
|
|
1259
|
-
throw new Error(`Invalid file at index ${i}: mimeType must be a string`);
|
|
1260
|
-
}
|
|
1261
|
-
if (f.data.length === 0) {
|
|
1262
|
-
throw new Error(`Invalid file at index ${i}: data cannot be empty`);
|
|
1263
|
-
}
|
|
1264
|
-
dataList.push(f.data);
|
|
1265
|
-
mimeTypes.push(f.mimeType);
|
|
1266
|
-
}
|
|
1267
|
-
const normalizedConfig = configToJS(config ?? null);
|
|
1268
|
-
const results = wasm.batchExtractBytesSync(dataList, mimeTypes, normalizedConfig);
|
|
1269
|
-
if (!Array.isArray(results)) {
|
|
1270
|
-
throw new Error("Invalid batch extraction result: expected array");
|
|
1377
|
+
const output = processor.process(result);
|
|
1378
|
+
if (output instanceof Promise) {
|
|
1379
|
+
return output;
|
|
1271
1380
|
}
|
|
1272
|
-
return
|
|
1273
|
-
if (!result) {
|
|
1274
|
-
throw new Error(`Invalid extraction result at index ${index}: no result from WASM module`);
|
|
1275
|
-
}
|
|
1276
|
-
return jsToExtractionResult(result);
|
|
1277
|
-
});
|
|
1381
|
+
return Promise.resolve(output);
|
|
1278
1382
|
} catch (error) {
|
|
1279
|
-
|
|
1383
|
+
return Promise.reject(new Error(`Error executing post-processor "${name}": ${String(error)}`));
|
|
1280
1384
|
}
|
|
1281
1385
|
}
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1386
|
+
function executeValidator(name, result) {
|
|
1387
|
+
const validator = validators.get(name);
|
|
1388
|
+
if (!validator) {
|
|
1389
|
+
return Promise.reject(new Error(`Validator "${name}" is not registered`));
|
|
1285
1390
|
}
|
|
1286
1391
|
try {
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
|
|
1290
|
-
if (files.length === 0) {
|
|
1291
|
-
throw new Error("Files array cannot be empty");
|
|
1292
|
-
}
|
|
1293
|
-
const byteFiles = [];
|
|
1294
|
-
for (let i = 0; i < files.length; i += 1) {
|
|
1295
|
-
const file = files[i];
|
|
1296
|
-
if (!(file instanceof File)) {
|
|
1297
|
-
throw new Error(`Invalid file at index ${i}: must be a File object`);
|
|
1298
|
-
}
|
|
1299
|
-
const bytes = await fileToUint8Array(file);
|
|
1300
|
-
byteFiles.push({
|
|
1301
|
-
data: bytes,
|
|
1302
|
-
mimeType: file.type || "application/octet-stream"
|
|
1303
|
-
});
|
|
1392
|
+
const output = validator.validate(result);
|
|
1393
|
+
if (output instanceof Promise) {
|
|
1394
|
+
return output;
|
|
1304
1395
|
}
|
|
1305
|
-
return
|
|
1396
|
+
return Promise.resolve(output);
|
|
1306
1397
|
} catch (error) {
|
|
1307
|
-
|
|
1398
|
+
return Promise.reject(new Error(`Error executing validator "${name}": ${String(error)}`));
|
|
1308
1399
|
}
|
|
1309
1400
|
}
|
|
1310
|
-
|
|
1311
|
-
if (
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
|
|
1315
|
-
throw new Error(
|
|
1316
|
-
"OCR is only available in browser environments. TesseractWasmBackend requires Web Workers and createImageBitmap."
|
|
1317
|
-
);
|
|
1318
|
-
}
|
|
1319
|
-
try {
|
|
1320
|
-
const backend = new TesseractWasmBackend();
|
|
1321
|
-
await backend.initialize();
|
|
1322
|
-
registerOcrBackend(backend);
|
|
1323
|
-
} catch (error) {
|
|
1324
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
1325
|
-
throw new Error(`Failed to enable OCR: ${message}`);
|
|
1401
|
+
function setupGlobalCallbacks() {
|
|
1402
|
+
if (typeof globalThis !== "undefined") {
|
|
1403
|
+
const callbacksObj = globalThis;
|
|
1404
|
+
callbacksObj.__kreuzberg_execute_post_processor = executePostProcessor;
|
|
1405
|
+
callbacksObj.__kreuzberg_execute_validator = executeValidator;
|
|
1326
1406
|
}
|
|
1327
1407
|
}
|
|
1408
|
+
setupGlobalCallbacks();
|
|
1328
1409
|
export {
|
|
1329
1410
|
TesseractWasmBackend,
|
|
1330
1411
|
batchExtractBytes,
|
|
@@ -1349,6 +1430,7 @@ export {
|
|
|
1349
1430
|
getValidator,
|
|
1350
1431
|
getVersion,
|
|
1351
1432
|
getWasmCapabilities,
|
|
1433
|
+
getWasmModule,
|
|
1352
1434
|
hasBigInt,
|
|
1353
1435
|
hasBlob,
|
|
1354
1436
|
hasFileApi,
|
|
@@ -1358,9 +1440,13 @@ export {
|
|
|
1358
1440
|
hasWasmStreaming,
|
|
1359
1441
|
hasWorkers,
|
|
1360
1442
|
initWasm,
|
|
1443
|
+
initializePdfiumAsync,
|
|
1361
1444
|
isBrowser,
|
|
1362
1445
|
isBun,
|
|
1446
|
+
isCloudflareWorkers,
|
|
1363
1447
|
isDeno,
|
|
1448
|
+
isEdgeEnvironment,
|
|
1449
|
+
isEdgeRuntime,
|
|
1364
1450
|
isInitialized,
|
|
1365
1451
|
isNode,
|
|
1366
1452
|
isServerEnvironment,
|