npm - @kreuzberg/wasm - Versions diffs - 4.0.0-rc.10 - Mend

@kreuzberg/wasm 4.0.0-rc.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

package/README.md +982 -0
package/dist/adapters/wasm-adapter.cjs +245 -0
package/dist/adapters/wasm-adapter.cjs.map +1 -0
package/dist/adapters/wasm-adapter.d.cts +121 -0
package/dist/adapters/wasm-adapter.d.ts +121 -0
package/dist/adapters/wasm-adapter.js +224 -0
package/dist/adapters/wasm-adapter.js.map +1 -0
package/dist/index.cjs +4335 -0
package/dist/index.cjs.map +1 -0
package/dist/index.d.cts +466 -0
package/dist/index.d.ts +466 -0
package/dist/index.js +4308 -0
package/dist/index.js.map +1 -0
package/dist/ocr/registry.cjs +92 -0
package/dist/ocr/registry.cjs.map +1 -0
package/dist/ocr/registry.d.cts +102 -0
package/dist/ocr/registry.d.ts +102 -0
package/dist/ocr/registry.js +71 -0
package/dist/ocr/registry.js.map +1 -0
package/dist/ocr/tesseract-wasm-backend.cjs +3566 -0
package/dist/ocr/tesseract-wasm-backend.cjs.map +1 -0
package/dist/ocr/tesseract-wasm-backend.d.cts +257 -0
package/dist/ocr/tesseract-wasm-backend.d.ts +257 -0
package/dist/ocr/tesseract-wasm-backend.js +3551 -0
package/dist/ocr/tesseract-wasm-backend.js.map +1 -0
package/dist/runtime.cjs +174 -0
package/dist/runtime.cjs.map +1 -0
package/dist/runtime.d.cts +256 -0
package/dist/runtime.d.ts +256 -0
package/dist/runtime.js +153 -0
package/dist/runtime.js.map +1 -0
package/dist/types-CKjcIYcX.d.cts +294 -0
package/dist/types-CKjcIYcX.d.ts +294 -0
package/package.json +140 -0

package/dist/index.d.cts ADDED Viewed

@@ -0,0 +1,466 @@
+import { E as ExtractionConfig, a as ExtractionResult } from './types-CKjcIYcX.cjs';
+export { C as Chunk, c as ChunkMetadata, b as ChunkingConfig, d as ExtractedImage, I as ImageExtractionConfig, L as LanguageDetectionConfig, M as Metadata, O as OcrBackendProtocol, e as OcrConfig, P as PageContent, f as PageExtractionConfig, g as PdfConfig, h as PostProcessorConfig, T as Table, i as TesseractConfig, j as TokenReductionConfig } from './types-CKjcIYcX.cjs';
+export { configToJS, fileToUint8Array, isValidExtractionResult, jsToExtractionResult, wrapWasmError } from './adapters/wasm-adapter.cjs';
+export { clearOcrBackends, getOcrBackend, listOcrBackends, registerOcrBackend, unregisterOcrBackend } from './ocr/registry.cjs';
+export { TesseractWasmBackend } from './ocr/tesseract-wasm-backend.cjs';
+export { RuntimeType, WasmCapabilities, detectRuntime, getRuntimeInfo, getRuntimeVersion, getWasmCapabilities, hasBigInt, hasBlob, hasFileApi, hasModuleWorkers, hasSharedArrayBuffer, hasWasm, hasWasmStreaming, hasWorkers, isBrowser, isBun, isDeno, isNode, isServerEnvironment, isWebEnvironment } from './runtime.cjs';
+/**
+ * Kreuzberg - WebAssembly Bindings for Browser and Runtime Environments
+ *
+ * This module provides WebAssembly bindings for Kreuzberg document intelligence,
+ * enabling high-performance document extraction in browser and JavaScript runtime environments.
+ *
+ * ## Features
+ *
+ * - Extract text, metadata, and tables from documents
+ * - Support for multiple document formats (PDF, Office, images, etc.)
+ * - Browser and runtime-compatible WASM bindings
+ * - Type-safe TypeScript interfaces
+ * - Runtime detection and feature capability checking
+ * - Automatic type conversion and error handling
+ *
+ * ## Installation
+ *
+ * ```bash
+ * npm install @kreuzberg/wasm
+ * ```
+ *
+ * ## Basic Usage
+ *
+ * ```typescript
+ * import { extractBytes, initWasm } from '@kreuzberg/wasm';
+ *
+ * // Initialize WASM module once at app startup
+ * await initWasm();
+ *
+ * // Extract from bytes
+ * const bytes = new Uint8Array(buffer);
+ * const result = await extractBytes(bytes, 'application/pdf');
+ * console.log(result.content);
+ * ```
+ *
+ * ## Browser Usage with File Input
+ *
+ * ```typescript
+ * import { extractBytes, initWasm } from '@kreuzberg/wasm';
+ * import { fileToUint8Array } from '@kreuzberg/wasm/adapters/wasm-adapter';
+ *
+ * // Initialize once at app startup
+ * await initWasm();
+ *
+ * // Handle file input
+ * const fileInput = document.getElementById('file');
+ * fileInput.addEventListener('change', async (e) => {
+ *   const file = e.target.files?.[0];
+ *   if (file) {
+ *     const bytes = await fileToUint8Array(file);
+ *     const result = await extractBytes(bytes, file.type);
+ *     console.log(result.content);
+ *   }
+ * });
+ * ```
+ *
+ * ## Runtime Detection
+ *
+ * ```typescript
+ * import { detectRuntime, getWasmCapabilities } from '@kreuzberg/wasm/runtime';
+ *
+ * const runtime = detectRuntime();
+ * const caps = getWasmCapabilities();
+ *
+ * if (caps.hasWorkers) {
+ *   // Can use Web Workers for parallel processing
+ * }
+ * ```
+ *
+ * ## Configuration
+ *
+ * ```typescript
+ * import { extractBytes, initWasm } from '@kreuzberg/wasm';
+ * import type { ExtractionConfig } from '@kreuzberg/wasm';
+ *
+ * await initWasm();
+ *
+ * const config: ExtractionConfig = {
+ *   ocr: {
+ *     backend: 'tesseract',
+ *     language: 'eng'
+ *   },
+ *   chunking: {
+ *     maxChars: 1000,
+ *     chunkOverlap: 100
+ *   },
+ *   images: {
+ *     extractImages: true,
+ *     targetDpi: 150
+ *   }
+ * };
+ *
+ * const result = await extractBytes(bytes, 'application/pdf', config);
+ * ```
+ */
+/**
+ * Initialize the WASM module
+ *
+ * This function must be called once before using any extraction functions.
+ * It loads and initializes the WASM module in the current runtime environment,
+ * automatically selecting the appropriate WASM variant for the detected runtime.
+ *
+ * Multiple calls to initWasm() are safe and will return immediately if already initialized.
+ *
+ * @returns Promise that resolves once the WASM module has been loaded and initialized
+ * @throws {Error} If WASM module fails to load or is not supported in the current environment
+ *
+ * @example Basic Usage
+ * ```typescript
+ * import { initWasm } from '@kreuzberg/wasm';
+ *
+ * async function main() {
+ *   await initWasm();
+ *   // Now you can use extraction functions
+ * }
+ *
+ * main().catch(console.error);
+ * ```
+ *
+ * @example With Error Handling
+ * ```typescript
+ * import { initWasm, getWasmCapabilities } from '@kreuzberg/wasm';
+ *
+ * async function initializeKreuzberg() {
+ *   // Check for WebAssembly support before attempting to load the module
+ *   const caps = getWasmCapabilities();
+ *   if (!caps.hasWasm) {
+ *     throw new Error('WebAssembly is not supported in this environment');
+ *   }
+ *
+ *   try {
+ *     await initWasm();
+ *     console.log('Kreuzberg initialized successfully');
+ *   } catch (error) {
+ *     console.error('Failed to initialize Kreuzberg:', error);
+ *     throw error;
+ *   }
+ * }
+ * ```
+ */
+declare function initWasm(): Promise<void>;
+/**
+ * Check whether the WASM module has been initialized
+ *
+ * @returns True if WASM module is initialized, false otherwise
+ *
+ * @example
+ * ```typescript
+ * import { initWasm, isInitialized } from '@kreuzberg/wasm';
+ *
+ * if (!isInitialized()) {
+ *   await initWasm();
+ * }
+ * ```
+ */
+declare function isInitialized(): boolean;
+/**
+ * Get WASM module version
+ *
+ * Requires the WASM module to be initialized first (see initWasm()).
+ *
+ * @returns The version string of the WASM module
+ * @throws {Error} If WASM module is not initialized
+ *
+ * @example
+ * ```typescript
+ * const version = getVersion();
+ * console.log(`Using Kreuzberg ${version}`);
+ * ```
+ */
+declare function getVersion(): string;
+/**
+ * Get the initialization error if the WASM module failed to load
+ *
+ * @returns The error that occurred during initialization, or null if no error occurred
+ *
+ * @internal
+ */
+declare function getInitializationError(): Error | null;
+/**
+ * Extract content from bytes (document data)
+ *
+ * Extracts text, metadata, tables, images, and other content from document bytes.
+ * Automatically detects document type from MIME type and applies appropriate extraction logic.
+ *
+ * @param data - The document bytes to extract from
+ * @param mimeType - MIME type of the document (e.g., 'application/pdf', 'image/jpeg')
+ * @param config - Optional extraction configuration; may be omitted or passed as null
+ * @returns Promise resolving to the extraction result
+ * @throws {Error} If WASM module is not initialized or extraction fails
+ *
+ * @example Extract PDF
+ * ```typescript
+ * const bytes = new Uint8Array(buffer);
+ * const result = await extractBytes(bytes, 'application/pdf');
+ * console.log(result.content);
+ * console.log(result.tables);
+ * ```
+ *
+ * @example Extract with Configuration
+ * ```typescript
+ * // NOTE(review): this example uses backend 'tesseract', while the enableOcr()
+ * // examples use 'tesseract-wasm' — confirm which backend name applies here.
+ * const result = await extractBytes(bytes, 'application/pdf', {
+ *   ocr: {
+ *     backend: 'tesseract',
+ *     language: 'deu' // German
+ *   },
+ *   images: {
+ *     extractImages: true,
+ *     targetDpi: 200
+ *   }
+ * });
+ * ```
+ *
+ * @example Extract from File
+ * ```typescript
+ * const file = inputEvent.target.files[0];
+ * const bytes = await fileToUint8Array(file);
+ * const result = await extractBytes(bytes, file.type);
+ * ```
+ */
+declare function extractBytes(data: Uint8Array, mimeType: string, config?: ExtractionConfig | null): Promise<ExtractionResult>;
+/**
+ * Extract content from a file on the file system
+ *
+ * Node.js and Deno specific function that reads a file from the file system
+ * and extracts content from it. Automatically detects MIME type if not provided.
+ *
+ * @param path - Path to the file to extract from
+ * @param mimeType - Optional MIME type of the file. If omitted or null, will attempt to detect
+ * @param config - Optional extraction configuration; may be omitted or passed as null
+ * @returns Promise resolving to the extraction result
+ * @throws {Error} If WASM module is not initialized, file doesn't exist, or extraction fails
+ *
+ * @example Extract with auto-detection
+ * ```typescript
+ * const result = await extractFile('./document.pdf');
+ * console.log(result.content);
+ * ```
+ *
+ * @example Extract with explicit MIME type
+ * ```typescript
+ * const result = await extractFile('./document.docx', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document');
+ * ```
+ *
+ * @example Extract from Node.js with config
+ * ```typescript
+ * import { extractFile } from '@kreuzberg/wasm';
+ *
+ * // Pass null as the MIME type to keep auto-detection while still supplying a config
+ * const result = await extractFile('./report.xlsx', null, {
+ *   chunking: {
+ *     maxChars: 1000
+ *   }
+ * });
+ * ```
+ */
+declare function extractFile(path: string, mimeType?: string | null, config?: ExtractionConfig | null): Promise<ExtractionResult>;
+/**
+ * Extract content from a File or Blob (browser-friendly wrapper)
+ *
+ * Convenience function that wraps fileToUint8Array and extractBytes,
+ * providing a streamlined API for browser applications handling file inputs.
+ *
+ * @param file - The File or Blob to extract from
+ * @param mimeType - Optional MIME type. If omitted or null, uses file.type if available
+ * @param config - Optional extraction configuration; may be omitted or passed as null
+ * @returns Promise resolving to the extraction result
+ * @throws {Error} If WASM module is not initialized or extraction fails
+ *
+ * @example Simple file extraction
+ * ```typescript
+ * const fileInput = document.getElementById('file');
+ * fileInput.addEventListener('change', async (e) => {
+ *   const file = e.target.files?.[0];
+ *   if (file) {
+ *     const result = await extractFromFile(file);
+ *     console.log(result.content);
+ *   }
+ * });
+ * ```
+ *
+ * @example With configuration
+ * ```typescript
+ * const result = await extractFromFile(file, file.type, {
+ *   chunking: { maxChars: 1000 },
+ *   images: { extractImages: true }
+ * });
+ * ```
+ */
+declare function extractFromFile(file: File | Blob, mimeType?: string | null, config?: ExtractionConfig | null): Promise<ExtractionResult>;
+/**
+ * Extract content from bytes synchronously
+ *
+ * Synchronous version of extractBytes. Performs extraction without async operations.
+ * Note: Some extraction features may still be async internally, but the wrapper is synchronous.
+ *
+ * @param data - The document bytes to extract from
+ * @param mimeType - MIME type of the document
+ * @param config - Optional extraction configuration; may be omitted or passed as null
+ * @returns The extraction result (returned directly, not wrapped in a Promise)
+ * @throws {Error} If WASM module is not initialized or extraction fails
+ *
+ * @example
+ * ```typescript
+ * import { extractBytesSync } from '@kreuzberg/wasm';
+ *
+ * const bytes = new Uint8Array(buffer);
+ * const result = extractBytesSync(bytes, 'application/pdf');
+ * console.log(result.content);
+ * ```
+ */
+declare function extractBytesSync(data: Uint8Array, mimeType: string, config?: ExtractionConfig | null): ExtractionResult;
+/**
+ * Batch extract content from multiple byte arrays asynchronously
+ *
+ * Extracts content from multiple documents in a single batch operation,
+ * allowing for more efficient processing of multiple files.
+ *
+ * @param files - Array of `{ data, mimeType }` objects, where `data` is the document
+ *   bytes (Uint8Array) and `mimeType` is the document's MIME type (string)
+ * @param config - Optional extraction configuration applied to all files; may be omitted or null
+ * @returns Promise resolving to array of extraction results
+ * @throws {Error} If WASM module is not initialized or extraction fails
+ *
+ * @example
+ * ```typescript
+ * const files = [
+ *   { data: pdfBytes, mimeType: 'application/pdf' },
+ *   { data: docxBytes, mimeType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' }
+ * ];
+ * const results = await batchExtractBytes(files);
+ * results.forEach((result) => console.log(result.content));
+ * ```
+ */
+declare function batchExtractBytes(files: Array<{
+    data: Uint8Array;
+    mimeType: string;
+}>, config?: ExtractionConfig | null): Promise<ExtractionResult[]>;
+/**
+ * Batch extract content from multiple byte arrays synchronously
+ *
+ * Synchronous version of batchExtractBytes. Extracts content from multiple documents
+ * in a single batch operation without async operations.
+ *
+ * @param files - Array of `{ data, mimeType }` objects, where `data` is the document
+ *   bytes (Uint8Array) and `mimeType` is the document's MIME type (string)
+ * @param config - Optional extraction configuration applied to all files; may be omitted or null
+ * @returns Array of extraction results (returned directly, not wrapped in a Promise)
+ * @throws {Error} If WASM module is not initialized or extraction fails
+ *
+ * @example
+ * ```typescript
+ * const files = [
+ *   { data: pdfBytes, mimeType: 'application/pdf' },
+ *   { data: docxBytes, mimeType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' }
+ * ];
+ * const results = batchExtractBytesSync(files);
+ * results.forEach((result) => console.log(result.content));
+ * ```
+ */
+declare function batchExtractBytesSync(files: Array<{
+    data: Uint8Array;
+    mimeType: string;
+}>, config?: ExtractionConfig | null): ExtractionResult[];
+/**
+ * Batch extract content from multiple File objects asynchronously
+ *
+ * Convenience function that converts File objects to Uint8Array and calls batchExtractBytes.
+ * Automatically uses the file.type as MIME type if available.
+ *
+ * @param files - Array of File objects to extract from
+ * @param config - Optional extraction configuration applied to all files; may be omitted or null
+ * @returns Promise resolving to array of extraction results
+ * @throws {Error} If WASM module is not initialized, files cannot be read, or extraction fails
+ *
+ * @example
+ * ```typescript
+ * // Cast to HTMLInputElement so that `.files` is available to the type checker
+ * const fileInput = document.getElementById('files') as HTMLInputElement;
+ * const files = Array.from(fileInput.files ?? []);
+ * const results = await batchExtractFiles(files);
+ * results.forEach((result, index) => {
+ *   console.log(`File ${index}: ${result.content.substring(0, 50)}...`);
+ * });
+ * ```
+ */
+declare function batchExtractFiles(files: File[], config?: ExtractionConfig | null): Promise<ExtractionResult[]>;
+/**
+ * Enable OCR functionality with tesseract-wasm backend
+ *
+ * Convenience function that automatically initializes and registers the Tesseract WASM backend.
+ * This is the recommended approach for enabling OCR in WASM-based applications.
+ *
+ * ## Browser Requirement
+ *
+ * This function requires a browser environment with support for:
+ * - Web Workers (for Tesseract processing)
+ * - createImageBitmap (for image conversion)
+ * - Blob API
+ *
+ * ## Network Requirement
+ *
+ * Training data will be loaded from jsDelivr CDN on first use of each language.
+ * Ensure network access to cdn.jsdelivr.net is available.
+ *
+ * @returns Promise that resolves once the Tesseract WASM backend has been initialized and registered
+ * @throws {Error} If not in browser environment or tesseract-wasm is not available
+ *
+ * @example Basic Usage
+ * ```typescript
+ * import { enableOcr, extractBytes, initWasm } from '@kreuzberg/wasm';
+ *
+ * async function main() {
+ *   // Initialize WASM module
+ *   await initWasm();
+ *
+ *   // Enable OCR with tesseract-wasm
+ *   await enableOcr();
+ *
+ *   // Now you can use OCR in extraction
+ *   const imageBytes = new Uint8Array(buffer);
+ *   const result = await extractBytes(imageBytes, 'image/png', {
+ *     ocr: { backend: 'tesseract-wasm', language: 'eng' }
+ *   });
+ *
+ *   console.log(result.content); // Extracted text
+ * }
+ *
+ * main().catch(console.error);
+ * ```
+ *
+ * @example With Progress Tracking
+ * ```typescript
+ * import { registerOcrBackend, TesseractWasmBackend } from '@kreuzberg/wasm';
+ *
+ * // Manual setup: initializes and registers the backend directly instead of
+ * // calling enableOcr(), so a progress callback can be attached first.
+ * async function setupOcrWithProgress() {
+ *   const backend = new TesseractWasmBackend();
+ *   backend.setProgressCallback((progress) => {
+ *     console.log(`OCR Progress: ${progress}%`);
+ *     updateProgressBar(progress); // application-defined UI helper
+ *   });
+ *
+ *   await backend.initialize();
+ *   registerOcrBackend(backend);
+ * }
+ *
+ * setupOcrWithProgress().catch(console.error);
+ * ```
+ *
+ * @example Multiple Languages
+ * ```typescript
+ * import { enableOcr, extractBytes, initWasm } from '@kreuzberg/wasm';
+ *
+ * await initWasm();
+ * await enableOcr();
+ *
+ * // Extract English text
+ * const englishResult = await extractBytes(engImageBytes, 'image/png', {
+ *   ocr: { backend: 'tesseract-wasm', language: 'eng' }
+ * });
+ *
+ * // Extract German text - model is cached after first use
+ * const germanResult = await extractBytes(deImageBytes, 'image/png', {
+ *   ocr: { backend: 'tesseract-wasm', language: 'deu' }
+ * });
+ * ```
+ */
+declare function enableOcr(): Promise<void>;
+export { ExtractionConfig, ExtractionResult, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, enableOcr, extractBytes, extractBytesSync, extractFile, extractFromFile, getInitializationError, getVersion, initWasm, isInitialized };