@kreuzberg/wasm 4.0.0-rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. package/README.md +982 -0
  2. package/dist/adapters/wasm-adapter.d.mts +121 -0
  3. package/dist/adapters/wasm-adapter.d.ts +121 -0
  4. package/dist/adapters/wasm-adapter.js +241 -0
  5. package/dist/adapters/wasm-adapter.js.map +1 -0
  6. package/dist/adapters/wasm-adapter.mjs +221 -0
  7. package/dist/adapters/wasm-adapter.mjs.map +1 -0
  8. package/dist/index.d.mts +466 -0
  9. package/dist/index.d.ts +466 -0
  10. package/dist/index.js +383 -0
  11. package/dist/index.js.map +1 -0
  12. package/dist/index.mjs +384 -0
  13. package/dist/index.mjs.map +1 -0
  14. package/dist/kreuzberg_wasm.d.mts +758 -0
  15. package/dist/kreuzberg_wasm.d.ts +758 -0
  16. package/dist/kreuzberg_wasm.js +1913 -0
  17. package/dist/kreuzberg_wasm.mjs +48 -0
  18. package/dist/kreuzberg_wasm_bg.wasm +0 -0
  19. package/dist/kreuzberg_wasm_bg.wasm.d.ts +54 -0
  20. package/dist/ocr/registry.d.mts +102 -0
  21. package/dist/ocr/registry.d.ts +102 -0
  22. package/dist/ocr/registry.js +90 -0
  23. package/dist/ocr/registry.js.map +1 -0
  24. package/dist/ocr/registry.mjs +70 -0
  25. package/dist/ocr/registry.mjs.map +1 -0
  26. package/dist/ocr/tesseract-wasm-backend.d.mts +257 -0
  27. package/dist/ocr/tesseract-wasm-backend.d.ts +257 -0
  28. package/dist/ocr/tesseract-wasm-backend.js +454 -0
  29. package/dist/ocr/tesseract-wasm-backend.js.map +1 -0
  30. package/dist/ocr/tesseract-wasm-backend.mjs +424 -0
  31. package/dist/ocr/tesseract-wasm-backend.mjs.map +1 -0
  32. package/dist/runtime.d.mts +256 -0
  33. package/dist/runtime.d.ts +256 -0
  34. package/dist/runtime.js +172 -0
  35. package/dist/runtime.js.map +1 -0
  36. package/dist/runtime.mjs +152 -0
  37. package/dist/runtime.mjs.map +1 -0
  38. package/dist/snippets/wasm-bindgen-rayon-38edf6e439f6d70d/src/workerHelpers.js +107 -0
  39. package/dist/types-GJVIvbPy.d.mts +221 -0
  40. package/dist/types-GJVIvbPy.d.ts +221 -0
  41. package/package.json +138 -0
@@ -0,0 +1,466 @@
1
+ import { E as ExtractionConfig, a as ExtractionResult } from './types-GJVIvbPy.mjs';
2
+ export { C as Chunk, d as ChunkMetadata, b as ChunkingConfig, c as ExtractedImage, I as ImageExtractionConfig, L as LanguageDetectionConfig, M as Metadata, f as OcrBackendProtocol, O as OcrConfig, e as PageContent, P as PageExtractionConfig, T as Table } from './types-GJVIvbPy.mjs';
3
+ export { configToJS, fileToUint8Array, isValidExtractionResult, jsToExtractionResult, wrapWasmError } from './adapters/wasm-adapter.mjs';
4
+ export { clearOcrBackends, getOcrBackend, listOcrBackends, registerOcrBackend, unregisterOcrBackend } from './ocr/registry.mjs';
5
+ export { TesseractWasmBackend } from './ocr/tesseract-wasm-backend.mjs';
6
+ export { RuntimeType, WasmCapabilities, detectRuntime, getRuntimeInfo, getRuntimeVersion, getWasmCapabilities, hasBigInt, hasBlob, hasFileApi, hasModuleWorkers, hasSharedArrayBuffer, hasWasm, hasWasmStreaming, hasWorkers, isBrowser, isBun, isDeno, isNode, isServerEnvironment, isWebEnvironment } from './runtime.mjs';
7
+
8
+ /**
9
+ * Kreuzberg - WebAssembly Bindings for Browser and Runtime Environments
10
+ *
11
+ * This module provides WebAssembly bindings for Kreuzberg document intelligence,
12
+ * enabling high-performance document extraction in browser and JavaScript runtime environments.
13
+ *
14
+ * ## Features
15
+ *
16
+ * - Extract text, metadata, and tables from documents
17
+ * - Support for multiple document formats (PDF, Office, images, etc.)
18
+ * - Browser and runtime-compatible WASM bindings
19
+ * - Type-safe TypeScript interfaces
20
+ * - Runtime detection and feature capability checking
21
+ * - Automatic type conversion and error handling
22
+ *
23
+ * ## Installation
24
+ *
25
+ * ```bash
26
+ * npm install @kreuzberg/wasm
27
+ * ```
28
+ *
29
+ * ## Basic Usage
30
+ *
31
+ * ```typescript
32
+ * import { extractBytes, initWasm } from '@kreuzberg/wasm';
33
+ *
34
+ * // Initialize WASM module once at app startup
35
+ * await initWasm();
36
+ *
37
+ * // Extract from bytes
38
+ * const bytes = new Uint8Array(buffer);
39
+ * const result = await extractBytes(bytes, 'application/pdf');
40
+ * console.log(result.content);
41
+ * ```
42
+ *
43
+ * ## Browser Usage with File Input
44
+ *
45
+ * ```typescript
46
+ * import { extractBytes, initWasm } from '@kreuzberg/wasm';
47
+ * import { fileToUint8Array } from '@kreuzberg/wasm/adapters/wasm-adapter';
48
+ *
49
+ * // Initialize once at app startup
50
+ * await initWasm();
51
+ *
52
+ * // Handle file input
53
+ * const fileInput = document.getElementById('file');
54
+ * fileInput.addEventListener('change', async (e) => {
55
+ * const file = e.target.files?.[0];
56
+ * if (file) {
57
+ * const bytes = await fileToUint8Array(file);
58
+ * const result = await extractBytes(bytes, file.type);
59
+ * console.log(result.content);
60
+ * }
61
+ * });
62
+ * ```
63
+ *
64
+ * ## Runtime Detection
65
+ *
66
+ * ```typescript
67
+ * import { detectRuntime, getWasmCapabilities } from '@kreuzberg/wasm/runtime';
68
+ *
69
+ * const runtime = detectRuntime();
70
+ * const caps = getWasmCapabilities();
71
+ *
72
+ * if (caps.hasWorkers) {
73
+ * // Can use Web Workers for parallel processing
74
+ * }
75
+ * ```
76
+ *
77
+ * ## Configuration
78
+ *
79
+ * ```typescript
80
+ * import { extractBytes, initWasm } from '@kreuzberg/wasm';
81
+ * import type { ExtractionConfig } from '@kreuzberg/wasm';
82
+ *
83
+ * await initWasm();
84
+ *
85
+ * const config: ExtractionConfig = {
86
+ * ocr: {
87
+ * backend: 'tesseract',
88
+ * language: 'eng'
89
+ * },
90
+ * chunking: {
91
+ * maxChars: 1000,
92
+ * chunkOverlap: 100
93
+ * },
94
+ * images: {
95
+ * extractImages: true,
96
+ * targetDpi: 150
97
+ * }
98
+ * };
99
+ *
100
+ * const result = await extractBytes(bytes, 'application/pdf', config);
101
+ * ```
102
+ */
103
+
104
+ /**
105
+ * Initialize the WASM module
106
+ *
107
+ * This function must be called once before using any extraction functions.
108
+ * It loads and initializes the WASM module in the current runtime environment,
109
+ * automatically selecting the appropriate WASM variant for the detected runtime.
110
+ *
111
+ * Multiple calls to initWasm() are safe and will return immediately if already initialized.
112
+ *
113
+ * @throws {Error} If WASM module fails to load or is not supported in the current environment
114
+ *
115
+ * @example Basic Usage
116
+ * ```typescript
117
+ * import { initWasm } from '@kreuzberg/wasm';
118
+ *
119
+ * async function main() {
120
+ * await initWasm();
121
+ * // Now you can use extraction functions
122
+ * }
123
+ *
124
+ * main().catch(console.error);
125
+ * ```
126
+ *
127
+ * @example With Error Handling
128
+ * ```typescript
129
+ * import { initWasm, getWasmCapabilities } from '@kreuzberg/wasm';
130
+ *
131
+ * async function initializeKreuzberg() {
132
+ * const caps = getWasmCapabilities();
133
+ * if (!caps.hasWasm) {
134
+ * throw new Error('WebAssembly is not supported in this environment');
135
+ * }
136
+ *
137
+ * try {
138
+ * await initWasm();
139
+ * console.log('Kreuzberg initialized successfully');
140
+ * } catch (error) {
141
+ * console.error('Failed to initialize Kreuzberg:', error);
142
+ * throw error;
143
+ * }
144
+ * }
145
+ * ```
146
+ */
147
+ declare function initWasm(): Promise<void>;
148
+ /**
149
+ * Check if WASM module is initialized
150
+ *
151
+ * @returns True if WASM module is initialized, false otherwise
152
+ *
153
+ * @example
154
+ * ```typescript
155
+ * if (!isInitialized()) {
156
+ * await initWasm();
157
+ * }
158
+ * ```
159
+ */
160
+ declare function isInitialized(): boolean;
161
+ /**
162
+ * Get WASM module version
163
+ *
164
+ * @throws {Error} If WASM module is not initialized
165
+ * @returns The version string of the WASM module
166
+ *
167
+ * @example
168
+ * ```typescript
169
+ * const version = getVersion();
170
+ * console.log(`Using Kreuzberg ${version}`);
171
+ * ```
172
+ */
173
+ declare function getVersion(): string;
174
+ /**
175
+ * Get initialization error if module failed to load
176
+ *
177
+ * @returns The error that occurred during initialization, or null if no error
178
+ *
179
+ * @internal
180
+ */
181
+ declare function getInitializationError(): Error | null;
182
+ /**
183
+ * Extract content from bytes (document data)
184
+ *
185
+ * Extracts text, metadata, tables, images, and other content from document bytes.
186
+ * Uses the supplied MIME type to determine the document type and applies the appropriate extraction logic.
187
+ *
188
+ * @param data - The document bytes to extract from
189
+ * @param mimeType - MIME type of the document (e.g., 'application/pdf', 'image/jpeg')
190
+ * @param config - Optional extraction configuration
191
+ * @returns Promise resolving to the extraction result
192
+ * @throws {Error} If WASM module is not initialized or extraction fails
193
+ *
194
+ * @example Extract PDF
195
+ * ```typescript
196
+ * const bytes = new Uint8Array(buffer);
197
+ * const result = await extractBytes(bytes, 'application/pdf');
198
+ * console.log(result.content);
199
+ * console.log(result.tables);
200
+ * ```
201
+ *
202
+ * @example Extract with Configuration
203
+ * ```typescript
204
+ * const result = await extractBytes(bytes, 'application/pdf', {
205
+ * ocr: {
206
+ * backend: 'tesseract',
207
+ * language: 'deu' // German
208
+ * },
209
+ * images: {
210
+ * extractImages: true,
211
+ * targetDpi: 200
212
+ * }
213
+ * });
214
+ * ```
215
+ *
216
+ * @example Extract from File
217
+ * ```typescript
218
+ * const file = inputEvent.target.files[0];
219
+ * const bytes = await fileToUint8Array(file);
220
+ * const result = await extractBytes(bytes, file.type);
221
+ * ```
222
+ */
223
+ declare function extractBytes(data: Uint8Array, mimeType: string, config?: ExtractionConfig | null): Promise<ExtractionResult>;
224
+ /**
225
+ * Extract content from a file on the file system
226
+ *
227
+ * Node.js and Deno specific function that reads a file from the file system
228
+ * and extracts content from it. Automatically detects MIME type if not provided.
229
+ *
230
+ * @param path - Path to the file to extract from
231
+ * @param mimeType - Optional MIME type of the file. If omitted, the MIME type is detected automatically
232
+ * @param config - Optional extraction configuration
233
+ * @returns Promise resolving to the extraction result
234
+ * @throws {Error} If WASM module is not initialized, file doesn't exist, or extraction fails
235
+ *
236
+ * @example Extract with auto-detection
237
+ * ```typescript
238
+ * const result = await extractFile('./document.pdf');
239
+ * console.log(result.content);
240
+ * ```
241
+ *
242
+ * @example Extract with explicit MIME type
243
+ * ```typescript
244
+ * const result = await extractFile('./document.docx', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document');
245
+ * ```
246
+ *
247
+ * @example Extract from Node.js with config
248
+ * ```typescript
249
+ * import { extractFile } from '@kreuzberg/wasm';
250
+ * // No separate fs import is needed: extractFile reads the file from the file system itself.
251
+ *
252
+ * const result = await extractFile('./report.xlsx', null, {
253
+ * chunking: {
254
+ * maxChars: 1000
255
+ * }
256
+ * });
257
+ * ```
258
+ */
259
+ declare function extractFile(path: string, mimeType?: string | null, config?: ExtractionConfig | null): Promise<ExtractionResult>;
260
+ /**
261
+ * Extract content from a File or Blob (browser-friendly wrapper)
262
+ *
263
+ * Convenience function that wraps fileToUint8Array and extractBytes,
264
+ * providing a streamlined API for browser applications handling file inputs.
265
+ *
266
+ * @param file - The File or Blob to extract from
267
+ * @param mimeType - Optional MIME type. If not provided, uses file.type if available
268
+ * @param config - Optional extraction configuration
269
+ * @returns Promise resolving to the extraction result
270
+ * @throws {Error} If WASM module is not initialized or extraction fails
271
+ *
272
+ * @example Simple file extraction
273
+ * ```typescript
274
+ * const fileInput = document.getElementById('file');
275
+ * fileInput.addEventListener('change', async (e) => {
276
+ * const file = e.target.files?.[0];
277
+ * if (file) {
278
+ * const result = await extractFromFile(file);
279
+ * console.log(result.content);
280
+ * }
281
+ * });
282
+ * ```
283
+ *
284
+ * @example With configuration
285
+ * ```typescript
286
+ * const result = await extractFromFile(file, file.type, {
287
+ * chunking: { maxChars: 1000 },
288
+ * images: { extractImages: true }
289
+ * });
290
+ * ```
291
+ */
292
+ declare function extractFromFile(file: File | Blob, mimeType?: string | null, config?: ExtractionConfig | null): Promise<ExtractionResult>;
293
+ /**
294
+ * Extract content from bytes synchronously
295
+ *
296
+ * Synchronous version of extractBytes. Performs extraction without async operations.
297
+ * Note: Some extraction features may still be async internally, but the wrapper is synchronous.
298
+ *
299
+ * @param data - The document bytes to extract from
300
+ * @param mimeType - MIME type of the document
301
+ * @param config - Optional extraction configuration
302
+ * @returns The extraction result
303
+ * @throws {Error} If WASM module is not initialized or extraction fails
304
+ *
305
+ * @example
306
+ * ```typescript
307
+ * const bytes = new Uint8Array(buffer);
308
+ * const result = extractBytesSync(bytes, 'application/pdf');
309
+ * console.log(result.content);
310
+ * ```
311
+ */
312
+ declare function extractBytesSync(data: Uint8Array, mimeType: string, config?: ExtractionConfig | null): ExtractionResult;
313
+ /**
314
+ * Batch extract content from multiple byte arrays asynchronously
315
+ *
316
+ * Extracts content from multiple documents in a single batch operation,
317
+ * allowing for more efficient processing of multiple files.
318
+ *
319
+ * @param files - Array of objects containing data (Uint8Array) and mimeType (string)
320
+ * @param config - Optional extraction configuration applied to all files
321
+ * @returns Promise resolving to array of extraction results
322
+ * @throws {Error} If WASM module is not initialized or extraction fails
323
+ *
324
+ * @example
325
+ * ```typescript
326
+ * const files = [
327
+ * { data: pdfBytes, mimeType: 'application/pdf' },
328
+ * { data: docxBytes, mimeType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' }
329
+ * ];
330
+ * const results = await batchExtractBytes(files);
331
+ * results.forEach((result) => console.log(result.content));
332
+ * ```
333
+ */
334
+ declare function batchExtractBytes(files: Array<{
335
+ data: Uint8Array;
336
+ mimeType: string;
337
+ }>, config?: ExtractionConfig | null): Promise<ExtractionResult[]>;
338
+ /**
339
+ * Batch extract content from multiple byte arrays synchronously
340
+ *
341
+ * Synchronous version of batchExtractBytes. Extracts content from multiple documents
342
+ * in a single batch operation without async operations.
343
+ *
344
+ * @param files - Array of objects containing data (Uint8Array) and mimeType (string)
345
+ * @param config - Optional extraction configuration applied to all files
346
+ * @returns Array of extraction results
347
+ * @throws {Error} If WASM module is not initialized or extraction fails
348
+ *
349
+ * @example
350
+ * ```typescript
351
+ * const files = [
352
+ * { data: pdfBytes, mimeType: 'application/pdf' },
353
+ * { data: docxBytes, mimeType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' }
354
+ * ];
355
+ * const results = batchExtractBytesSync(files);
356
+ * results.forEach((result) => console.log(result.content));
357
+ * ```
358
+ */
359
+ declare function batchExtractBytesSync(files: Array<{
360
+ data: Uint8Array;
361
+ mimeType: string;
362
+ }>, config?: ExtractionConfig | null): ExtractionResult[];
363
+ /**
364
+ * Batch extract content from multiple File objects asynchronously
365
+ *
366
+ * Convenience function that converts File objects to Uint8Array and calls batchExtractBytes.
367
+ * Automatically uses the file.type as MIME type if available.
368
+ *
369
+ * @param files - Array of File objects to extract from
370
+ * @param config - Optional extraction configuration applied to all files
371
+ * @returns Promise resolving to array of extraction results
372
+ * @throws {Error} If WASM module is not initialized, files cannot be read, or extraction fails
373
+ *
374
+ * @example
375
+ * ```typescript
376
+ * const fileInput = document.getElementById('files');
377
+ * const files = Array.from(fileInput.files ?? []);
378
+ * const results = await batchExtractFiles(files);
379
+ * results.forEach((result, index) => {
380
+ * console.log(`File ${index}: ${result.content.substring(0, 50)}...`);
381
+ * });
382
+ * ```
383
+ */
384
+ declare function batchExtractFiles(files: File[], config?: ExtractionConfig | null): Promise<ExtractionResult[]>;
385
+ /**
386
+ * Enable OCR functionality with tesseract-wasm backend
387
+ *
388
+ * Convenience function that automatically initializes and registers the Tesseract WASM backend.
389
+ * This is the recommended approach for enabling OCR in WASM-based applications.
390
+ *
391
+ * ## Browser Requirement
392
+ *
393
+ * This function requires a browser environment with support for:
394
+ * - Web Workers (for Tesseract processing)
395
+ * - createImageBitmap (for image conversion)
396
+ * - Blob API
397
+ *
398
+ * ## Network Requirement
399
+ *
400
+ * Training data will be loaded from jsDelivr CDN on first use of each language.
401
+ * Ensure network access to cdn.jsdelivr.net is available.
402
+ *
403
+ * @throws {Error} If not in browser environment or tesseract-wasm is not available
404
+ *
405
+ * @example Basic Usage
406
+ * ```typescript
407
+ * import { enableOcr, extractBytes, initWasm } from '@kreuzberg/wasm';
408
+ *
409
+ * async function main() {
410
+ * // Initialize WASM module
411
+ * await initWasm();
412
+ *
413
+ * // Enable OCR with tesseract-wasm
414
+ * await enableOcr();
415
+ *
416
+ * // Now you can use OCR in extraction
417
+ * const imageBytes = new Uint8Array(buffer);
418
+ * const result = await extractBytes(imageBytes, 'image/png', {
419
+ * ocr: { backend: 'tesseract-wasm', language: 'eng' }
420
+ * });
421
+ *
422
+ * console.log(result.content); // Extracted text
423
+ * }
424
+ *
425
+ * main().catch(console.error);
426
+ * ```
427
+ *
428
+ * @example With Progress Tracking
429
+ * ```typescript
430
+ * import { registerOcrBackend, TesseractWasmBackend } from '@kreuzberg/wasm';
431
+ *
432
+ * async function setupOcrWithProgress() {
433
+ * const backend = new TesseractWasmBackend();
434
+ * backend.setProgressCallback((progress) => {
435
+ * console.log(`OCR Progress: ${progress}%`);
436
+ * updateProgressBar(progress);
437
+ * });
438
+ *
439
+ * await backend.initialize();
440
+ * registerOcrBackend(backend);
441
+ * }
442
+ *
443
+ * setupOcrWithProgress().catch(console.error);
444
+ * ```
445
+ *
446
+ * @example Multiple Languages
447
+ * ```typescript
448
+ * import { enableOcr, extractBytes, initWasm } from '@kreuzberg/wasm';
449
+ *
450
+ * await initWasm();
451
+ * await enableOcr();
452
+ *
453
+ * // Extract English text
454
+ * const englishResult = await extractBytes(engImageBytes, 'image/png', {
455
+ * ocr: { backend: 'tesseract-wasm', language: 'eng' }
456
+ * });
457
+ *
458
+ * // Extract German text - model is cached after first use
459
+ * const germanResult = await extractBytes(deImageBytes, 'image/png', {
460
+ * ocr: { backend: 'tesseract-wasm', language: 'deu' }
461
+ * });
462
+ * ```
463
+ */
464
+ declare function enableOcr(): Promise<void>;
465
+
466
+ export { ExtractionConfig, ExtractionResult, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, enableOcr, extractBytes, extractBytesSync, extractFile, extractFromFile, getInitializationError, getVersion, initWasm, isInitialized };