@kreuzberg/wasm 4.0.0-rc.23 → 4.0.0-rc.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts DELETED
@@ -1,639 +0,0 @@
1
- import { E as ExtractionConfig, a as ExtractionResult } from './types-wVLLDHkl.cjs.js';
2
- export { C as Chunk, b as ChunkingConfig, c as ChunkMetadata, d as ExtractedImage, I as ImageExtractionConfig, L as LanguageDetectionConfig, M as Metadata, O as OcrBackendProtocol, e as OcrConfig, P as PageContent, f as PageExtractionConfig, g as PdfConfig, h as PostProcessorConfig, T as Table, i as TesseractConfig, j as TokenReductionConfig, E as ExtractionConfig, a as ExtractionResult } from './types-wVLLDHkl.cjs.js';
3
- export { configToJS, fileToUint8Array, isValidExtractionResult, jsToExtractionResult, wrapWasmError } from './adapters/wasm-adapter.cjs';
4
- export { clearOcrBackends, getOcrBackend, listOcrBackends, registerOcrBackend, unregisterOcrBackend } from './ocr/registry.cjs';
5
- export { TesseractWasmBackend } from './ocr/tesseract-wasm-backend.cjs';
6
- export { type RuntimeType, type WasmCapabilities, detectRuntime, getRuntimeInfo, getRuntimeVersion, getWasmCapabilities, hasBigInt, hasBlob, hasFileApi, hasModuleWorkers, hasSharedArrayBuffer, hasWasm, hasWasmStreaming, hasWorkers, isBrowser, isBun, isDeno, isNode, isServerEnvironment, isWebEnvironment } from './runtime.d.cts';
7
-
8
- /**
9
- * Plugin Registry Module
10
- *
11
- * This module manages registrations and execution of post-processors and validators
12
- * for document extraction pipelines.
13
- *
14
- * # Thread Safety
15
- * All registrations are stored in Maps, which is safe in single-threaded WASM environments.
16
- *
17
- * # Global Callback Functions
18
- * The WASM module can invoke processing via global callback functions:
19
- * - `__kreuzberg_execute_post_processor`: Execute a registered post-processor
20
- * - `__kreuzberg_execute_validator`: Execute a registered validator
21
- */
22
-
23
- /**
24
- * Post-processor plugin interface
25
- *
26
- * A post-processor modifies extraction results after extraction completes.
27
- */
28
- interface PostProcessor {
29
- /**
30
- * Get the processor name (must be non-empty string)
31
- */
32
- name(): string;
33
- /**
34
- * Get the processing stage (optional, defaults to "middle")
35
- * - "early": Process early in the pipeline
36
- * - "middle": Process in the middle of the pipeline
37
- * - "late": Process late in the pipeline
38
- */
39
- stage?(): "early" | "middle" | "late";
40
- /**
41
- * Process an extraction result
42
- * Can be sync or async
43
- */
44
- process(result: ExtractionResult): ExtractionResult | Promise<ExtractionResult>;
45
- /**
46
- * Shutdown the processor (optional)
47
- */
48
- shutdown?(): void | Promise<void>;
49
- }
50
- /**
51
- * Validator plugin interface
52
- *
53
- * A validator checks extraction results for correctness.
54
- */
55
- interface Validator {
56
- /**
57
- * Get the validator name (must be non-empty string)
58
- */
59
- name(): string;
60
- /**
61
- * Get the validation priority (optional, defaults to 50)
62
- * Higher numbers = higher priority (execute first)
63
- */
64
- priority?(): number;
65
- /**
66
- * Validate an extraction result
67
- * Can be sync or async
68
- */
69
- validate(result: ExtractionResult): {
70
- valid: boolean;
71
- errors: string[];
72
- } | Promise<{
73
- valid: boolean;
74
- errors: string[];
75
- }>;
76
- /**
77
- * Shutdown the validator (optional)
78
- */
79
- shutdown?(): void | Promise<void>;
80
- }
81
- /**
82
- * Register a post-processor plugin
83
- *
84
- * @param processor - The post-processor to register
85
- * @throws {Error} If the processor is invalid or missing required methods
86
- *
87
- * @example
88
- * ```typescript
89
- * const processor = {
90
- * name: () => "my-processor",
91
- * stage: () => "middle",
92
- * process: async (result) => {
93
- * result.content = result.content.toUpperCase();
94
- * return result;
95
- * }
96
- * };
97
- * registerPostProcessor(processor);
98
- * ```
99
- */
100
- declare function registerPostProcessor(processor: PostProcessor): void;
101
- /**
102
- * Get a registered post-processor by name
103
- *
104
- * @param name - The processor name
105
- * @returns The processor, or undefined if not found
106
- *
107
- * @example
108
- * ```typescript
109
- * const processor = getPostProcessor("my-processor");
110
- * if (processor) {
111
- * console.log("Found processor:", processor.name());
112
- * }
113
- * ```
114
- */
115
- declare function getPostProcessor(name: string): PostProcessor | undefined;
116
- /**
117
- * List all registered post-processor names
118
- *
119
- * @returns Array of processor names
120
- *
121
- * @example
122
- * ```typescript
123
- * const names = listPostProcessors();
124
- * console.log("Registered processors:", names);
125
- * ```
126
- */
127
- declare function listPostProcessors(): string[];
128
- /**
129
- * Unregister a post-processor and call its shutdown method
130
- *
131
- * @param name - The processor name
132
- * @throws {Error} If the processor is not registered
133
- *
134
- * @example
135
- * ```typescript
136
- * await unregisterPostProcessor("my-processor");
137
- * ```
138
- */
139
- declare function unregisterPostProcessor(name: string): Promise<void>;
140
- /**
141
- * Clear all registered post-processors
142
- *
143
- * Calls shutdown on all processors before clearing.
144
- *
145
- * @example
146
- * ```typescript
147
- * await clearPostProcessors();
148
- * ```
149
- */
150
- declare function clearPostProcessors(): Promise<void>;
151
- /**
152
- * Register a validator plugin
153
- *
154
- * @param validator - The validator to register
155
- * @throws {Error} If the validator is invalid or missing required methods
156
- *
157
- * @example
158
- * ```typescript
159
- * const validator = {
160
- * name: () => "my-validator",
161
- * priority: () => 50,
162
- * validate: async (result) => {
163
- * if (!result.content) {
164
- * return { valid: false, errors: ["Content is empty"] };
165
- * }
166
- * return { valid: true, errors: [] };
167
- * }
168
- * };
169
- * registerValidator(validator);
170
- * ```
171
- */
172
- declare function registerValidator(validator: Validator): void;
173
- /**
174
- * Get a registered validator by name
175
- *
176
- * @param name - The validator name
177
- * @returns The validator, or undefined if not found
178
- *
179
- * @example
180
- * ```typescript
181
- * const validator = getValidator("my-validator");
182
- * if (validator) {
183
- * console.log("Found validator:", validator.name());
184
- * }
185
- * ```
186
- */
187
- declare function getValidator(name: string): Validator | undefined;
188
- /**
189
- * List all registered validator names
190
- *
191
- * @returns Array of validator names
192
- *
193
- * @example
194
- * ```typescript
195
- * const names = listValidators();
196
- * console.log("Registered validators:", names);
197
- * ```
198
- */
199
- declare function listValidators(): string[];
200
- /**
201
- * Unregister a validator and call its shutdown method
202
- *
203
- * @param name - The validator name
204
- * @throws {Error} If the validator is not registered
205
- *
206
- * @example
207
- * ```typescript
208
- * await unregisterValidator("my-validator");
209
- * ```
210
- */
211
- declare function unregisterValidator(name: string): Promise<void>;
212
- /**
213
- * Clear all registered validators
214
- *
215
- * Calls shutdown on all validators before clearing.
216
- *
217
- * @example
218
- * ```typescript
219
- * await clearValidators();
220
- * ```
221
- */
222
- declare function clearValidators(): Promise<void>;
223
-
224
- /**
225
- * Kreuzberg - WebAssembly Bindings for Browser and Runtime Environments
226
- *
227
- * This module provides WebAssembly bindings for Kreuzberg document intelligence,
228
- * enabling high-performance document extraction in browser and JavaScript runtime environments.
229
- *
230
- * ## Features
231
- *
232
- * - Extract text, metadata, and tables from documents
233
- * - Support for multiple document formats (PDF, Office, images, etc.)
234
- * - Browser and runtime-compatible WASM bindings
235
- * - Type-safe TypeScript interfaces
236
- * - Runtime detection and feature capability checking
237
- * - Automatic type conversion and error handling
238
- *
239
- * ## Installation
240
- *
241
- * ```bash
242
- * npm install @kreuzberg/wasm
243
- * ```
244
- *
245
- * ## Basic Usage
246
- *
247
- * ```typescript
248
- * import { extractBytes, initWasm } from '@kreuzberg/wasm';
249
- *
250
- * // Initialize WASM module once at app startup
251
- * await initWasm();
252
- *
253
- * // Extract from bytes
254
- * const bytes = new Uint8Array(buffer);
255
- * const result = await extractBytes(bytes, 'application/pdf');
256
- * console.log(result.content);
257
- * ```
258
- *
259
- * ## Browser Usage with File Input
260
- *
261
- * ```typescript
262
- * import { extractBytes, initWasm } from '@kreuzberg/wasm';
263
- * import { fileToUint8Array } from '@kreuzberg/wasm/adapters/wasm-adapter';
264
- *
265
- * // Initialize once at app startup
266
- * await initWasm();
267
- *
268
- * // Handle file input
269
- * const fileInput = document.getElementById('file');
270
- * fileInput.addEventListener('change', async (e) => {
271
- * const file = e.target.files?.[0];
272
- * if (file) {
273
- * const bytes = await fileToUint8Array(file);
274
- * const result = await extractBytes(bytes, file.type);
275
- * console.log(result.content);
276
- * }
277
- * });
278
- * ```
279
- *
280
- * ## Runtime Detection
281
- *
282
- * ```typescript
283
- * import { detectRuntime, getWasmCapabilities } from '@kreuzberg/wasm/runtime';
284
- *
285
- * const runtime = detectRuntime();
286
- * const caps = getWasmCapabilities();
287
- *
288
- * if (caps.hasWorkers) {
289
- * // Can use Web Workers for parallel processing
290
- * }
291
- * ```
292
- *
293
- * ## Configuration
294
- *
295
- * ```typescript
296
- * import { extractBytes, initWasm } from '@kreuzberg/wasm';
297
- * import type { ExtractionConfig } from '@kreuzberg/wasm';
298
- *
299
- * await initWasm();
300
- *
301
- * const config: ExtractionConfig = {
302
- * ocr: {
303
- * backend: 'tesseract',
304
- * language: 'eng'
305
- * },
306
- * chunking: {
307
- * maxChars: 1000,
308
- * chunkOverlap: 100
309
- * },
310
- * images: {
311
- * extractImages: true,
312
- * targetDpi: 150
313
- * }
314
- * };
315
- *
316
- * const result = await extractBytes(bytes, 'application/pdf', config);
317
- * ```
318
- */
319
-
320
- declare function initWasm(): Promise<void>;
321
- /**
322
- * Check if WASM module is initialized
323
- *
324
- * @returns True if WASM module is initialized, false otherwise
325
- *
326
- * @example
327
- * ```typescript
328
- * if (!isInitialized()) {
329
- * await initWasm();
330
- * }
331
- * ```
332
- */
333
- declare function isInitialized(): boolean;
334
- /**
335
- * Get WASM module version
336
- *
337
- * @throws {Error} If WASM module is not initialized
338
- * @returns The version string of the WASM module
339
- *
340
- * @example
341
- * ```typescript
342
- * const version = getVersion();
343
- * console.log(`Using Kreuzberg ${version}`);
344
- * ```
345
- */
346
- declare function getVersion(): string;
347
- /**
348
- * Get initialization error if module failed to load
349
- *
350
- * @returns The error that occurred during initialization, or null if no error
351
- *
352
- * @internal
353
- */
354
- declare function getInitializationError(): Error | null;
355
- /**
356
- * Extract content from bytes (document data)
357
- *
358
- * Extracts text, metadata, tables, images, and other content from document bytes.
359
- * Uses the supplied MIME type to determine the document type and apply the appropriate extraction logic.
360
- *
361
- * @param data - The document bytes to extract from
362
- * @param mimeType - MIME type of the document (e.g., 'application/pdf', 'image/jpeg')
363
- * @param config - Optional extraction configuration
364
- * @returns Promise resolving to the extraction result
365
- * @throws {Error} If WASM module is not initialized or extraction fails
366
- *
367
- * @example Extract PDF
368
- * ```typescript
369
- * const bytes = new Uint8Array(buffer);
370
- * const result = await extractBytes(bytes, 'application/pdf');
371
- * console.log(result.content);
372
- * console.log(result.tables);
373
- * ```
374
- *
375
- * @example Extract with Configuration
376
- * ```typescript
377
- * const result = await extractBytes(bytes, 'application/pdf', {
378
- * ocr: {
379
- * backend: 'tesseract',
380
- * language: 'deu' // German
381
- * },
382
- * images: {
383
- * extractImages: true,
384
- * targetDpi: 200
385
- * }
386
- * });
387
- * ```
388
- *
389
- * @example Extract from File
390
- * ```typescript
391
- * const file = inputEvent.target.files[0];
392
- * const bytes = await fileToUint8Array(file);
393
- * const result = await extractBytes(bytes, file.type);
394
- * ```
395
- */
396
- declare function extractBytes(data: Uint8Array, mimeType: string, config?: ExtractionConfig | null): Promise<ExtractionResult>;
397
- /**
398
- * Extract content from a file on the file system
399
- *
400
- * Node.js- and Deno-specific function that reads a file from the file system
401
- * and extracts content from it. Automatically detects MIME type if not provided.
402
- *
403
- * @param path - Path to the file to extract from
404
- * @param mimeType - Optional MIME type of the file. If not provided, detection is attempted automatically
405
- * @param config - Optional extraction configuration
406
- * @returns Promise resolving to the extraction result
407
- * @throws {Error} If WASM module is not initialized, file doesn't exist, or extraction fails
408
- *
409
- * @example Extract with auto-detection
410
- * ```typescript
411
- * const result = await extractFile('./document.pdf');
412
- * console.log(result.content);
413
- * ```
414
- *
415
- * @example Extract with explicit MIME type
416
- * ```typescript
417
- * const result = await extractFile('./document.docx', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document');
418
- * ```
419
- *
420
- * @example Extract from Node.js with config
421
- * ```typescript
422
- * import { extractFile } from '@kreuzberg/wasm';
423
- * import { readFile } from 'fs/promises';
424
- *
425
- * const result = await extractFile('./report.xlsx', null, {
426
- * chunking: {
427
- * maxChars: 1000
428
- * }
429
- * });
430
- * ```
431
- */
432
- declare function extractFile(path: string, mimeType?: string | null, config?: ExtractionConfig | null): Promise<ExtractionResult>;
433
- /**
434
- * Extract content from a File or Blob (browser-friendly wrapper)
435
- *
436
- * Convenience function that wraps fileToUint8Array and extractBytes,
437
- * providing a streamlined API for browser applications handling file inputs.
438
- *
439
- * @param file - The File or Blob to extract from
440
- * @param mimeType - Optional MIME type. If not provided, uses file.type if available
441
- * @param config - Optional extraction configuration
442
- * @returns Promise resolving to the extraction result
443
- * @throws {Error} If WASM module is not initialized or extraction fails
444
- *
445
- * @example Simple file extraction
446
- * ```typescript
447
- * const fileInput = document.getElementById('file');
448
- * fileInput.addEventListener('change', async (e) => {
449
- * const file = e.target.files?.[0];
450
- * if (file) {
451
- * const result = await extractFromFile(file);
452
- * console.log(result.content);
453
- * }
454
- * });
455
- * ```
456
- *
457
- * @example With configuration
458
- * ```typescript
459
- * const result = await extractFromFile(file, file.type, {
460
- * chunking: { maxChars: 1000 },
461
- * images: { extractImages: true }
462
- * });
463
- * ```
464
- */
465
- declare function extractFromFile(file: File | Blob, mimeType?: string | null, config?: ExtractionConfig | null): Promise<ExtractionResult>;
466
- /**
467
- * Extract content from bytes synchronously
468
- *
469
- * Synchronous version of extractBytes. Returns the result directly instead of a Promise.
470
- * Note: Some extraction features may still be async internally, but the wrapper is synchronous.
471
- *
472
- * @param data - The document bytes to extract from
473
- * @param mimeType - MIME type of the document
474
- * @param config - Optional extraction configuration
475
- * @returns The extraction result
476
- * @throws {Error} If WASM module is not initialized or extraction fails
477
- *
478
- * @example
479
- * ```typescript
480
- * const bytes = new Uint8Array(buffer);
481
- * const result = extractBytesSync(bytes, 'application/pdf');
482
- * console.log(result.content);
483
- * ```
484
- */
485
- declare function extractBytesSync(data: Uint8Array, mimeType: string, config?: ExtractionConfig | null): ExtractionResult;
486
- /**
487
- * Batch extract content from multiple byte arrays asynchronously
488
- *
489
- * Extracts content from multiple documents in a single batch operation,
490
- * allowing for more efficient processing of multiple files.
491
- *
492
- * @param files - Array of objects containing data (Uint8Array) and mimeType (string)
493
- * @param config - Optional extraction configuration applied to all files
494
- * @returns Promise resolving to array of extraction results
495
- * @throws {Error} If WASM module is not initialized or extraction fails
496
- *
497
- * @example
498
- * ```typescript
499
- * const files = [
500
- * { data: pdfBytes, mimeType: 'application/pdf' },
501
- * { data: docxBytes, mimeType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' }
502
- * ];
503
- * const results = await batchExtractBytes(files);
504
- * results.forEach((result) => console.log(result.content));
505
- * ```
506
- */
507
- declare function batchExtractBytes(files: Array<{
508
- data: Uint8Array;
509
- mimeType: string;
510
- }>, config?: ExtractionConfig | null): Promise<ExtractionResult[]>;
511
- /**
512
- * Batch extract content from multiple byte arrays synchronously
513
- *
514
- * Synchronous version of batchExtractBytes. Extracts content from multiple documents
515
- * in a single batch operation and returns the results directly instead of a Promise.
516
- *
517
- * @param files - Array of objects containing data (Uint8Array) and mimeType (string)
518
- * @param config - Optional extraction configuration applied to all files
519
- * @returns Array of extraction results
520
- * @throws {Error} If WASM module is not initialized or extraction fails
521
- *
522
- * @example
523
- * ```typescript
524
- * const files = [
525
- * { data: pdfBytes, mimeType: 'application/pdf' },
526
- * { data: docxBytes, mimeType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' }
527
- * ];
528
- * const results = batchExtractBytesSync(files);
529
- * results.forEach((result) => console.log(result.content));
530
- * ```
531
- */
532
- declare function batchExtractBytesSync(files: Array<{
533
- data: Uint8Array;
534
- mimeType: string;
535
- }>, config?: ExtractionConfig | null): ExtractionResult[];
536
- /**
537
- * Batch extract content from multiple File objects asynchronously
538
- *
539
- * Convenience function that converts File objects to Uint8Array and calls batchExtractBytes.
540
- * Automatically uses the file.type as MIME type if available.
541
- *
542
- * @param files - Array of File objects to extract from
543
- * @param config - Optional extraction configuration applied to all files
544
- * @returns Promise resolving to array of extraction results
545
- * @throws {Error} If WASM module is not initialized, files cannot be read, or extraction fails
546
- *
547
- * @example
548
- * ```typescript
549
- * const fileInput = document.getElementById('files');
550
- * const files = Array.from(fileInput.files ?? []);
551
- * const results = await batchExtractFiles(files);
552
- * results.forEach((result, index) => {
553
- * console.log(`File ${index}: ${result.content.substring(0, 50)}...`);
554
- * });
555
- * ```
556
- */
557
- declare function batchExtractFiles(files: File[], config?: ExtractionConfig | null): Promise<ExtractionResult[]>;
558
- /**
559
- * Enable OCR functionality with the tesseract-wasm backend
560
- *
561
- * Convenience function that automatically initializes and registers the Tesseract WASM backend.
562
- * This is the recommended approach for enabling OCR in WASM-based applications.
563
- *
564
- * ## Browser Requirement
565
- *
566
- * This function requires a browser environment with support for:
567
- * - Web Workers (for Tesseract processing)
568
- * - createImageBitmap (for image conversion)
569
- * - Blob API
570
- *
571
- * ## Network Requirement
572
- *
573
- * Training data will be loaded from jsDelivr CDN on first use of each language.
574
- * Ensure network access to cdn.jsdelivr.net is available.
575
- *
576
- * @throws {Error} If not in browser environment or tesseract-wasm is not available
577
- *
578
- * @example Basic Usage
579
- * ```typescript
580
- * import { enableOcr, extractBytes, initWasm } from '@kreuzberg/wasm';
581
- *
582
- * async function main() {
583
- * // Initialize WASM module
584
- * await initWasm();
585
- *
586
- * // Enable OCR with tesseract-wasm
587
- * await enableOcr();
588
- *
589
- * // Now you can use OCR in extraction
590
- * const imageBytes = new Uint8Array(buffer);
591
- * const result = await extractBytes(imageBytes, 'image/png', {
592
- * ocr: { backend: 'tesseract-wasm', language: 'eng' }
593
- * });
594
- *
595
- * console.log(result.content); // Extracted text
596
- * }
597
- *
598
- * main().catch(console.error);
599
- * ```
600
- *
601
- * @example With Progress Tracking
602
- * ```typescript
603
- * import { registerOcrBackend, TesseractWasmBackend } from '@kreuzberg/wasm';
604
- *
605
- * async function setupOcrWithProgress() {
606
- * const backend = new TesseractWasmBackend();
607
- * backend.setProgressCallback((progress) => {
608
- * console.log(`OCR Progress: ${progress}%`);
609
- * updateProgressBar(progress);
610
- * });
611
- *
612
- * await backend.initialize();
613
- * registerOcrBackend(backend);
614
- * }
615
- *
616
- * setupOcrWithProgress().catch(console.error);
617
- * ```
618
- *
619
- * @example Multiple Languages
620
- * ```typescript
621
- * import { enableOcr, extractBytes, initWasm } from '@kreuzberg/wasm';
622
- *
623
- * await initWasm();
624
- * await enableOcr();
625
- *
626
- * // Extract English text
627
- * const englishResult = await extractBytes(engImageBytes, 'image/png', {
628
- * ocr: { backend: 'tesseract-wasm', language: 'eng' }
629
- * });
630
- *
631
- * // Extract German text - model is cached after first use
632
- * const germanResult = await extractBytes(deImageBytes, 'image/png', {
633
- * ocr: { backend: 'tesseract-wasm', language: 'deu' }
634
- * });
635
- * ```
636
- */
637
- declare function enableOcr(): Promise<void>;
638
-
639
- export { type PostProcessor, type Validator, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, clearPostProcessors, clearValidators, enableOcr, extractBytes, extractBytesSync, extractFile, extractFromFile, getInitializationError, getPostProcessor, getValidator, getVersion, initWasm, isInitialized, listPostProcessors, listValidators, registerPostProcessor, registerValidator, unregisterPostProcessor, unregisterValidator };