npm - @kreuzberg/node - Versions diffs - 4.0.8 → 4.1.1 - Mend

@kreuzberg/node 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

package/README.md +1 -1
package/dist/cli.js +6 -4
package/dist/cli.js.map +1 -1
package/dist/cli.mjs +13 -5
package/dist/cli.mjs.map +1 -1
package/dist/errors.js +26 -24
package/dist/errors.js.map +1 -1
package/dist/errors.mjs +25 -24
package/dist/errors.mjs.map +1 -1
package/dist/index.d.mts +608 -535
package/dist/index.d.ts +608 -535
package/dist/index.js +682 -338
package/dist/index.js.map +1 -1
package/dist/index.mjs +662 -334
package/dist/index.mjs.map +1 -1
package/dist/ocr/guten-ocr.js +4 -2
package/dist/ocr/guten-ocr.js.map +1 -1
package/dist/ocr/guten-ocr.mjs +3 -2
package/dist/ocr/guten-ocr.mjs.map +1 -1
package/dist/types.js +2 -0
package/dist/types.js.map +1 -1
package/index.d.ts +77 -178
package/index.js +54 -52
package/package.json +7 -7

package/dist/index.d.ts CHANGED Viewed

@@ -1,195 +1,153 @@
+import { ErrorClassification, ExtractionConfig, ExtractionResult, WorkerPool, WorkerPoolStats, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol } from './types.js';
+export { Chunk, ChunkingConfig, ExtractedImage, HtmlConversionOptions, HtmlPreprocessingOptions, ImageExtractionConfig, KeywordConfig, LanguageDetectionConfig, OcrConfig, PageContent, PageExtractionConfig, PdfConfig, PostProcessorConfig, Table, TesseractConfig, TokenReductionConfig } from './types.js';
 import { PanicContext } from './errors.js';
 export { CacheError, ErrorCode, ImageProcessingError, KreuzbergError, MissingDependencyError, OcrError, ParsingError, PluginError, ValidationError } from './errors.js';
-import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol, ErrorClassification, WorkerPool, WorkerPoolStats } from './types.js';
-export { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, HeaderMetadata, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrConfig, OcrMetadata, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, XmlMetadata, YakeParams } from './types.js';
 export { GutenOcrBackend } from './ocr/guten-ocr.js';
 /**
- * Kreuzberg - Multi-language document intelligence framework.
- *
- * This is a TypeScript SDK around a high-performance Rust core.
- * All extraction logic, chunking, quality processing, and language detection
- * are implemented in Rust for maximum performance.
- *
- * ## API Usage Recommendations
- *
- * **For processing multiple documents**, prefer batch APIs:
- * - Use `batchExtractFiles()` / `batchExtractFilesSync()` for multiple files
- * - Use `batchExtractBytes()` / `batchExtractBytesSync()` for multiple byte arrays
- *
- * **Batch APIs provide**:
- * - Better performance (parallel processing in Rust)
- * - More reliable memory management
- * - Recommended for all multi-document workflows
+ * Get the error code for the last FFI error.
  *
- * **Single extraction APIs** (`extractFile`, `extractBytes`) are suitable for:
- * - One-off document processing
- * - Interactive applications processing documents on-demand
- * - Avoid calling these in tight loops - use batch APIs instead
+ * Returns the FFI error code as an integer. This is useful for programmatic error handling
+ * and distinguishing between different types of failures in native code.
  *
- * ## Supported Formats
+ * Error codes:
+ * - 0: Success (no error)
+ * - 1: GenericError
+ * - 2: Panic
+ * - 3: InvalidArgument
+ * - 4: IoError
+ * - 5: ParsingError
+ * - 6: OcrError
+ * - 7: MissingDependency
  *
- * - **Documents**: PDF, DOCX, PPTX, XLSX, DOC, PPT (with LibreOffice)
- * - **Text**: Markdown, Plain Text, XML
- * - **Web**: HTML (converted to Markdown)
- * - **Data**: JSON, YAML, TOML
- * - **Email**: EML, MSG
- * - **Images**: PNG, JPEG, TIFF (with OCR support)
+ * @returns The integer error code
  *
  * @example
  * ```typescript
- * import { extractFile, batchExtractFiles } from '@kreuzberg/node';
- *
- * // Single file extraction
- * const result = await extractFile('document.pdf');
- * console.log(result.content);
+ * import { extractFile, getLastErrorCode, ErrorCode } from '@kreuzberg/node';
  *
- * // Multiple files (recommended approach)
- * const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
- * const results = await batchExtractFiles(files);
- * results.forEach(r => console.log(r.content));
+ * try {
+ *   const result = await extractFile('document.pdf');
+ * } catch (error) {
+ *   const code = getLastErrorCode();
+ *   if (code === ErrorCode.Panic) {
+ *     console.error('Native code panic detected');
+ *   }
+ * }
  * ```
  */
-/**
- * @internal Allows tests to provide a mocked native binding.
- */
-declare function __setBindingForTests(mock: unknown): void;
-/**
- * @internal Resets the cached native binding for tests.
- */
-declare function __resetBindingForTests(): void;
+declare function getLastErrorCode(): number;
 /**
- * Extract content from a single file (synchronous).
+ * Get panic context information if the last error was a panic.
  *
- * **Usage Note**: For processing multiple files, prefer `batchExtractFilesSync()` which
- * provides better performance and memory management.
+ * Returns detailed information about a panic in native code, or null if the last error was not a panic.
+ * This provides debugging information when native code panics.
  *
- * @param filePath - Path to the file to extract (string). Can be absolute or relative.
- * @param mimeType - Optional MIME type hint for format detection. If null, MIME type is auto-detected from file extension or content.
- * @param config - Extraction configuration object. If null, uses default extraction settings.
- * @returns ExtractionResult containing extracted content, metadata, tables, and optional chunks/images
- * @throws {Error} If file doesn't exist, cannot be accessed, or cannot be read
- * @throws {ParsingError} When document format is invalid or corrupted
- * @throws {OcrError} When OCR processing fails (if OCR is enabled)
- * @throws {ValidationError} When extraction result fails validation (if validators registered)
- * @throws {KreuzbergError} For other extraction-related failures
+ * @returns A `PanicContext` object with file, line, function, message, and timestamp_secs, or null if no panic context is available
  *
  * @example
  * ```typescript
- * import { extractFileSync } from '@kreuzberg/node';
- *
- * // Basic usage
- * const result = extractFileSync('document.pdf');
- * console.log(result.content);
+ * import { extractFile, getLastPanicContext } from '@kreuzberg/node';
  *
- * // With OCR configuration
- * const config = {
- *   ocr: {
- *     backend: 'tesseract',
- *     language: 'eng',
- *     tesseractConfig: {
- *       psm: 6,
- *       enableTableDetection: true,
- *     },
- *   },
- * };
- * const result2 = extractFileSync('scanned.pdf', null, config);
+ * try {
+ *   const result = await extractFile('document.pdf');
+ * } catch (error) {
+ *   const context = getLastPanicContext();
+ *   if (context) {
+ *     console.error(`Panic at ${context.file}:${context.line}`);
+ *     console.error(`In function: ${context.function}`);
+ *     console.error(`Message: ${context.message}`);
+ *   }
+ * }
  * ```
  */
-declare function extractFileSync(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): ExtractionResult;
+declare function getLastPanicContext(): PanicContext | null;
 /**
- * Extract content from a single file (asynchronous).
+ * Returns the human-readable name for an error code.
  *
- * **Usage Note**: For processing multiple files, prefer `batchExtractFiles()` which
- * provides better performance and memory management.
+ * Maps numeric error codes to their string names, providing a consistent way
+ * to get error code names across all platforms.
  *
- * @param filePath - Path to the file to extract (string). Can be absolute or relative.
- * @param mimeType - Optional MIME type hint for format detection. If null, MIME type is auto-detected from file extension or content.
- * @param config - Extraction configuration object. If null, uses default extraction settings.
- * @returns Promise<ExtractionResult> containing extracted content, metadata, tables, and optional chunks/images
- * @throws {Error} If file doesn't exist, cannot be accessed, or cannot be read
- * @throws {ParsingError} When document format is invalid or corrupted
- * @throws {OcrError} When OCR processing fails (if OCR is enabled)
- * @throws {ValidationError} When extraction result fails validation (if validators registered)
- * @throws {KreuzbergError} For other extraction-related failures
+ * @param code - The numeric error code (0-7)
+ * @returns The error code name as a string (e.g., "validation", "ocr", "unknown")
  *
  * @example
  * ```typescript
- * import { extractFile } from '@kreuzberg/node';
- *
- * // Basic usage
- * const result = await extractFile('document.pdf');
- * console.log(result.content);
+ * import { getErrorCodeName } from '@kreuzberg/node';
  *
- * // With chunking enabled
- * const config = {
- *   chunking: {
- *     maxChars: 1000,
- *     maxOverlap: 200,
- *   },
- * };
- * const result2 = await extractFile('long_document.pdf', null, config);
- * console.log(result2.chunks); // Array of text chunks
+ * const validationName = getErrorCodeName(0);  // returns "validation"
+ * const ocrName = getErrorCodeName(2);         // returns "ocr"
+ * const unknownName = getErrorCodeName(99);    // returns "unknown"
  * ```
  */
-declare function extractFile(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
+declare function getErrorCodeName(code: number): string;
 /**
- * Extract content from raw bytes (synchronous).
+ * Returns the description for an error code.
  *
- * **Usage Note**: For processing multiple byte arrays, prefer `batchExtractBytesSync()`
- * which provides better performance and memory management.
+ * Retrieves user-friendly descriptions of error types from the FFI layer.
  *
- * @param data - File content as Uint8Array (Buffer will be converted)
- * @param mimeType - MIME type of the data (required for accurate format detection). Must be a valid MIME type string.
- * @param config - Extraction configuration object. If null, uses default extraction settings.
- * @returns ExtractionResult containing extracted content, metadata, tables, and optional chunks/images
- * @throws {TypeError} When data is not a valid Uint8Array
- * @throws {Error} When file cannot be read or parsed
- * @throws {ParsingError} When document format is invalid or corrupted
- * @throws {OcrError} When OCR processing fails (if OCR is enabled)
- * @throws {ValidationError} When extraction result fails validation (if validators registered)
- * @throws {KreuzbergError} For other extraction-related failures
+ * @param code - The numeric error code (0-7)
+ * @returns A brief description of the error type
  *
  * @example
  * ```typescript
- * import { extractBytesSync } from '@kreuzberg/node';
- * import { readFileSync } from 'fs';
+ * import { getErrorCodeDescription } from '@kreuzberg/node';
  *
- * const data = readFileSync('document.pdf');
- * const result = extractBytesSync(data, 'application/pdf');
- * console.log(result.content);
+ * const validationDesc = getErrorCodeDescription(0);  // returns "Input validation error"
+ * const ioDesc = getErrorCodeDescription(4);          // returns "File system I/O error"
+ * const unknownDesc = getErrorCodeDescription(99);    // returns "Unknown error code"
  * ```
  */
-declare function extractBytesSync(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig$1 | null): ExtractionResult;
+declare function getErrorCodeDescription(code: number): string;
 /**
- * Extract content from raw bytes (asynchronous).
+ * Classifies an error message string into an error code category.
  *
- * **Usage Note**: For processing multiple byte arrays, prefer `batchExtractBytes()`
- * which provides better performance and memory management.
+ * This function analyzes the error message content and returns the most likely
+ * error code (0-7) based on keyword patterns. Used to programmatically classify
+ * errors for handling purposes.
  *
- * @param data - File content as Uint8Array (Buffer will be converted)
- * @param mimeType - MIME type of the data (required for accurate format detection). Must be a valid MIME type string.
- * @param config - Extraction configuration object. If null, uses default extraction settings.
- * @returns Promise<ExtractionResult> containing extracted content, metadata, tables, and optional chunks/images
- * @throws {TypeError} When data is not a valid Uint8Array
- * @throws {Error} When file cannot be read or parsed
- * @throws {ParsingError} When document format is invalid or corrupted
- * @throws {OcrError} When OCR processing fails (if OCR is enabled)
- * @throws {ValidationError} When extraction result fails validation (if validators registered)
- * @throws {KreuzbergError} For other extraction-related failures
+ * The classification is based on keyword matching:
+ * - **Validation (0)**: Keywords like "invalid", "validation", "schema", "required"
+ * - **Parsing (1)**: Keywords like "parsing", "corrupted", "malformed"
+ * - **Ocr (2)**: Keywords like "ocr", "tesseract", "language", "model"
+ * - **MissingDependency (3)**: Keywords like "not found", "missing", "dependency"
+ * - **Io (4)**: Keywords like "file", "disk", "read", "write", "permission"
+ * - **Plugin (5)**: Keywords like "plugin", "register", "extension"
+ * - **UnsupportedFormat (6)**: Keywords like "unsupported", "format", "mime"
+ * - **Internal (7)**: Keywords like "internal", "bug", "panic"
+ *
+ * @param errorMessage - The error message string to classify
+ * @returns An object with the classification details
  *
  * @example
  * ```typescript
- * import { extractBytes } from '@kreuzberg/node';
- * import { readFile } from 'fs/promises';
+ * import { classifyError } from '@kreuzberg/node';
  *
- * const data = await readFile('document.pdf');
- * const result = await extractBytes(data, 'application/pdf');
- * console.log(result.content);
+ * const corrupted = classifyError("PDF file is corrupted");
+ * // Returns: { code: 1, name: "parsing", confidence: 0.95 }
+ *
+ * const missing = classifyError("Tesseract not found");
+ * // Returns: { code: 3, name: "missing_dependency", confidence: 0.9 }
  * ```
  */
-declare function extractBytes(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
+declare function classifyError(errorMessage: string): ErrorClassification;
+/**
+ * Batch extraction APIs for processing multiple documents.
+ *
+ * This module provides synchronous and asynchronous functions for extracting content
+ * from multiple files or byte arrays in parallel. Batch operations offer better
+ * performance and memory management compared to calling single extraction functions
+ * in a loop.
+ *
+ * **Benefits of Batch Processing**:
+ * - Parallel processing in Rust for maximum performance
+ * - Optimized memory usage across all extractions
+ * - More reliable for large-scale document processing
+ *
+ * @internal This module is part of Layer 2 (extraction APIs).
+ */
 /**
  * Extract content from multiple files in parallel (synchronous).
  *
@@ -222,7 +180,7 @@ declare function extractBytes(dataOrPath: Uint8Array | string, mimeType: string,
  * });
  * ```
  */
-declare function batchExtractFilesSync(paths: string[], config?: ExtractionConfig$1 | null): ExtractionResult[];
+declare function batchExtractFilesSync(paths: string[], config?: ExtractionConfig | null): ExtractionResult[];
 /**
  * Extract content from multiple files in parallel (asynchronous).
  *
@@ -258,7 +216,7 @@ declare function batchExtractFilesSync(paths: string[], config?: ExtractionConfi
  *   .reduce((a, b) => a + b, 0);
  * ```
  */
-declare function batchExtractFiles(paths: string[], config?: ExtractionConfig$1 | null): Promise<ExtractionResult[]>;
+declare function batchExtractFiles(paths: string[], config?: ExtractionConfig | null): Promise<ExtractionResult[]>;
 /**
  * Extract content from multiple byte arrays in parallel (synchronous).
  *
@@ -296,7 +254,7 @@ declare function batchExtractFiles(paths: string[], config?: ExtractionConfig$1
  * });
  * ```
  */
-declare function batchExtractBytesSync(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig$1 | null): ExtractionResult[];
+declare function batchExtractBytesSync(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig | null): ExtractionResult[];
 /**
  * Extract content from multiple byte arrays in parallel (asynchronous).
  *
@@ -338,55 +296,355 @@ declare function batchExtractBytesSync(dataList: Uint8Array[], mimeTypes: string
  *   .reduce((a, b) => a + b, 0);
  * ```
  */
-declare function batchExtractBytes(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig$1 | null): Promise<ExtractionResult[]>;
+declare function batchExtractBytes(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig | null): Promise<ExtractionResult[]>;
 /**
- * Register a custom postprocessor.
+ * Single-document extraction APIs.
  *
- * **IMPORTANT**: Custom processors only work with **async extraction functions**:
- * - ✅ `extractFile()`, `extractBytes()`, `batchExtractFiles()`, `batchExtractBytes()`
- * - ❌ `extractFileSync()`, `extractBytesSync()`, etc. (will skip custom processors)
+ * This module provides synchronous and asynchronous functions for extracting content
+ * from a single file or byte array. These are convenience wrappers around the native
+ * binding that handle config normalization and result conversion.
  *
- * This limitation exists because sync extraction blocks the Node.js event loop,
- * preventing JavaScript callbacks from executing. For v4.0, use async extraction
- * when you need custom processors.
+ * **Usage Note**: For processing multiple files, prefer batch extraction functions
+ * (`batchExtractFiles`, `batchExtractFilesSync`) which provide better performance
+ * and memory management.
  *
- * @param processor - PostProcessorProtocol implementation with name(), process(), and optional processingStage()
- * @throws {Error} If processor is missing required methods (name or process)
- * @throws {Error} If processor name is empty string
- * @throws {Error} If a processor with the same name is already registered
+ * @internal This module is part of Layer 2 (extraction APIs).
+ */
+/**
+ * Extract content from a single file (synchronous).
  *
- * @example
- * ```typescript
- * import { registerPostProcessor, extractFile, ExtractionResult } from '@kreuzberg/node';
+ * **Usage Note**: For processing multiple files, prefer `batchExtractFilesSync()` which
+ * provides better performance and memory management.
  *
- * class MyProcessor implements PostProcessorProtocol {
- *   name(): string {
- *     return 'my_processor';
- *   }
+ * @param filePath - Path to the file to extract (string). Can be absolute or relative.
+ * @param mimeTypeOrConfig - Optional MIME type hint or extraction configuration.
+ *   If a string, treated as MIME type. If an object, treated as ExtractionConfig.
+ *   If null, MIME type is auto-detected from file extension or content.
+ * @param maybeConfig - Extraction configuration object. If null, uses default extraction settings.
+ *   Only used if second parameter is a MIME type string.
+ * @returns ExtractionResult containing extracted content, metadata, tables, and optional chunks/images
+ * @throws {Error} If file doesn't exist, cannot be accessed, or cannot be read
+ * @throws {ParsingError} When document format is invalid or corrupted
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
+ * @throws {ValidationError} When extraction result fails validation (if validators registered)
+ * @throws {KreuzbergError} For other extraction-related failures
  *
- *   process(result: ExtractionResult): ExtractionResult {
- *     result.metadata.customField = 'custom_value';
- *     return result;
- *   }
+ * @example
+ * ```typescript
+ * import { extractFileSync } from '@kreuzberg/node';
  *
- *   processingStage(): 'early' | 'middle' | 'late' {
- *     return 'middle';
- *   }
- * }
+ * // Basic usage
+ * const result = extractFileSync('document.pdf');
+ * console.log(result.content);
  *
- * registerPostProcessor(new MyProcessor());
+ * // With explicit MIME type
+ * const result2 = extractFileSync('document.pdf', 'application/pdf');
  *
- * // Use async extraction (required for custom processors)
- * const result = await extractFile('document.pdf');
- * console.log(result.metadata.customField); // 'custom_value'
+ * // With configuration
+ * const result3 = extractFileSync('document.pdf', {
+ *   chunking: {
+ *     maxChars: 1000,
+ *     maxOverlap: 200,
+ *   },
+ * });
  * ```
  */
-declare function registerPostProcessor(processor: PostProcessorProtocol): void;
+declare function extractFileSync(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig, maybeConfig?: ExtractionConfig | null): ExtractionResult;
 /**
- * Unregister a postprocessor by name.
+ * Extract content from a single file (asynchronous).
  *
- * Removes a previously registered postprocessor from the registry.
- * If the processor doesn't exist, this is a no-op (does not throw).
+ * **Usage Note**: For processing multiple files, prefer `batchExtractFiles()` which
+ * provides better performance and memory management.
+ *
+ * @param filePath - Path to the file to extract (string). Can be absolute or relative.
+ * @param mimeTypeOrConfig - Optional MIME type hint or extraction configuration.
+ *   If a string, treated as MIME type. If an object, treated as ExtractionConfig.
+ *   If null, MIME type is auto-detected from file extension or content.
+ * @param maybeConfig - Extraction configuration object. If null, uses default extraction settings.
+ *   Only used if second parameter is a MIME type string.
+ * @returns Promise<ExtractionResult> containing extracted content, metadata, tables, and optional chunks/images
+ * @throws {Error} If file doesn't exist, cannot be accessed, or cannot be read
+ * @throws {ParsingError} When document format is invalid or corrupted
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
+ * @throws {ValidationError} When extraction result fails validation (if validators registered)
+ * @throws {KreuzbergError} For other extraction-related failures
+ *
+ * @example
+ * ```typescript
+ * import { extractFile } from '@kreuzberg/node';
+ *
+ * // Basic usage
+ * const result = await extractFile('document.pdf');
+ * console.log(result.content);
+ *
+ * // With chunking enabled
+ * const config = {
+ *   chunking: {
+ *     maxChars: 1000,
+ *     maxOverlap: 200,
+ *   },
+ * };
+ * const result2 = await extractFile('long_document.pdf', null, config);
+ * console.log(result2.chunks); // Array of text chunks
+ * ```
+ */
+declare function extractFile(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig, maybeConfig?: ExtractionConfig | null): Promise<ExtractionResult>;
+/**
+ * Extract content from raw bytes (synchronous).
+ *
+ * **Usage Note**: For processing multiple byte arrays, prefer `batchExtractBytesSync()`
+ * which provides better performance and memory management.
+ *
+ * @param dataOrPath - File content as Uint8Array (Buffer will be converted); the declared type also accepts a string
+ * @param mimeType - MIME type of the data (required for accurate format detection). Must be a valid MIME type string.
+ * @param config - Extraction configuration object. If null, uses default extraction settings.
+ * @returns ExtractionResult containing extracted content, metadata, tables, and optional chunks/images
+ * @throws {TypeError} When data is not a valid Uint8Array
+ * @throws {Error} When file cannot be read or parsed
+ * @throws {ParsingError} When document format is invalid or corrupted
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
+ * @throws {ValidationError} When extraction result fails validation (if validators registered)
+ * @throws {KreuzbergError} For other extraction-related failures
+ *
+ * @example
+ * ```typescript
+ * import { extractBytesSync } from '@kreuzberg/node';
+ * import { readFileSync } from 'fs';
+ *
+ * const data = readFileSync('document.pdf');
+ * const result = extractBytesSync(data, 'application/pdf');
+ * console.log(result.content);
+ * ```
+ */
+declare function extractBytesSync(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig | null): ExtractionResult;
+/**
+ * Extract content from raw bytes (asynchronous).
+ *
+ * **Usage Note**: For processing multiple byte arrays, prefer `batchExtractBytes()`
+ * which provides better performance and memory management.
+ *
+ * @param dataOrPath - File content as Uint8Array (Buffer will be converted); the declared type also accepts a string
+ * @param mimeType - MIME type of the data (required for accurate format detection). Must be a valid MIME type string.
+ * @param config - Extraction configuration object. If null, uses default extraction settings.
+ * @returns Promise<ExtractionResult> containing extracted content, metadata, tables, and optional chunks/images
+ * @throws {TypeError} When data is not a valid Uint8Array
+ * @throws {Error} When file cannot be read or parsed
+ * @throws {ParsingError} When document format is invalid or corrupted
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
+ * @throws {ValidationError} When extraction result fails validation (if validators registered)
+ * @throws {KreuzbergError} For other extraction-related failures
+ *
+ * @example
+ * ```typescript
+ * import { extractBytes } from '@kreuzberg/node';
+ * import { readFile } from 'fs/promises';
+ *
+ * const data = await readFile('document.pdf');
+ * const result = await extractBytes(data, 'application/pdf');
+ * console.log(result.content);
+ * ```
+ */
+declare function extractBytes(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig | null): Promise<ExtractionResult>;
+/**
+ * Worker pool management for concurrent document extraction.
+ *
+ * This module provides utilities for creating and managing worker pools that enable
+ * concurrent extraction of documents using Node.js worker threads. Worker pools allow
+ * multiple extraction operations to run in parallel with configurable pool sizes.
+ *
+ * **Usage Pattern**:
+ * 1. Create a pool with `createWorkerPool(size)`
+ * 2. Submit tasks with `extractFileInWorker()` or `batchExtractFilesInWorker()`
+ * 3. Close the pool with `closeWorkerPool()` when done
+ *
+ * @internal This module is part of Layer 2 (extraction APIs).
+ */
+/**
+ * Create a new worker pool for concurrent extraction operations.
+ *
+ * Creates a pool of worker threads that can process extraction tasks concurrently.
+ * The pool manages a queue of pending tasks and distributes them across available workers.
+ *
+ * @param size - Optional number of workers in the pool. If not specified, defaults to the number of CPU cores.
+ * @returns WorkerPool instance that can be used with extraction functions
+ *
+ * @example
+ * ```typescript
+ * import { createWorkerPool } from '@kreuzberg/node';
+ *
+ * // Create pool with default size (number of CPU cores)
+ * const pool = createWorkerPool();
+ *
+ * // Create pool with 4 workers
+ * const pool4 = createWorkerPool(4);
+ * ```
+ */
+declare function createWorkerPool(size?: number): WorkerPool;
+/**
+ * Get statistics about a worker pool.
+ *
+ * Returns information about the pool's current state, including the number of active workers,
+ * queued tasks, and total processed tasks.
+ *
+ * @param pool - The worker pool instance
+ * @returns WorkerPoolStats with pool information
+ *
+ * @example
+ * ```typescript
+ * import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node';
+ *
+ * const pool = createWorkerPool(4);
+ * const stats = getWorkerPoolStats(pool);
+ *
+ * console.log(`Pool size: ${stats.size}`);
+ * console.log(`Active workers: ${stats.activeWorkers}`);
+ * console.log(`Queued tasks: ${stats.queuedTasks}`);
+ * ```
+ */
+declare function getWorkerPoolStats(pool: WorkerPool): WorkerPoolStats;
+/**
+ * Extract content from a single file using a worker pool (asynchronous).
+ *
+ * Submits an extraction task to the worker pool. The task is executed by one of the
+ * available workers in the background, allowing other tasks to be processed concurrently.
+ *
+ * @param pool - The worker pool instance
+ * @param filePath - Path to the file to extract
+ * @param mimeTypeOrConfig - Optional MIME type or extraction configuration.
+ *   If a string, treated as MIME type. If an object, treated as ExtractionConfig.
+ *   If null, MIME type is auto-detected from file extension or content.
+ * @param maybeConfig - Extraction configuration object. If null, uses default extraction settings.
+ *   Only used if second parameter is a MIME type string.
+ * @returns Promise<ExtractionResult> containing extracted content and metadata
+ *
+ * @throws {Error} If the file cannot be read or extraction fails
+ *
+ * @example
+ * ```typescript
+ * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
+ *
+ * const pool = createWorkerPool(4);
+ *
+ * try {
+ *   const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
+ *   const results = await Promise.all(
+ *     files.map(f => extractFileInWorker(pool, f))
+ *   );
+ *
+ *   results.forEach((r, i) => {
+ *     console.log(`${files[i]}: ${r.content.substring(0, 100)}...`);
+ *   });
+ * } finally {
+ *   await closeWorkerPool(pool);
+ * }
+ * ```
+ */
+declare function extractFileInWorker(pool: WorkerPool, filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig, maybeConfig?: ExtractionConfig | null): Promise<ExtractionResult>;
+/**
+ * Extract content from multiple files in parallel using a worker pool (asynchronous).
+ *
+ * Submits multiple extraction tasks to the worker pool for concurrent processing.
+ * This is more efficient than using `extractFileInWorker` multiple times sequentially.
+ *
+ * @param pool - The worker pool instance
+ * @param paths - Array of file paths to extract
+ * @param config - Extraction configuration object (applies to all files). If null, uses default extraction settings.
+ * @returns Promise<ExtractionResult[]> array of results (one per file, in same order)
+ *
+ * @throws {Error} If any file cannot be read or extraction fails
+ *
+ * @example
+ * ```typescript
+ * import { createWorkerPool, batchExtractFilesInWorker, closeWorkerPool } from '@kreuzberg/node';
+ *
+ * const pool = createWorkerPool(4);
+ *
+ * try {
+ *   const files = ['invoice1.pdf', 'invoice2.pdf', 'invoice3.pdf'];
+ *   const results = await batchExtractFilesInWorker(pool, files, {
+ *     ocr: { backend: 'tesseract', language: 'eng' }
+ *   });
+ *
+ *   const total = results.reduce((sum, r) => sum + extractAmount(r.content), 0);
+ *   console.log(`Total: $${total}`);
+ * } finally {
+ *   await closeWorkerPool(pool);
+ * }
+ * ```
+ */
+declare function batchExtractFilesInWorker(pool: WorkerPool, paths: string[], config?: ExtractionConfig | null): Promise<ExtractionResult[]>;
+/**
+ * Close a worker pool and shut down all worker threads.
+ *
+ * Should be called when the pool is no longer needed to clean up resources
+ * and gracefully shut down worker threads. Any pending tasks will be cancelled.
+ *
+ * @param pool - The worker pool instance to close
+ * @returns Promise that resolves when the pool is fully closed
+ *
+ * @throws {Error} If pool shutdown fails
+ *
+ * @example
+ * ```typescript
+ * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
+ *
+ * const pool = createWorkerPool(4);
+ *
+ * try {
+ *   const result = await extractFileInWorker(pool, 'document.pdf');
+ *   console.log(result.content);
+ * } finally {
+ *   // Clean up the pool
+ *   await closeWorkerPool(pool);
+ * }
+ * ```
+ */
+declare function closeWorkerPool(pool: WorkerPool): Promise<void>;
+/**
+ * Register a custom post-processor.
+ *
+ * Post-processors allow you to hook into the extraction pipeline and transform
+ * the extraction results. They run after the core extraction is complete.
+ *
+ * Post-processors are async and can modify extraction results before they are
+ * returned to the caller.
+ *
+ * @param processor - Post-processor implementing PostProcessorProtocol
+ *
+ * @example
+ * ```typescript
+ * import { registerPostProcessor, extractFile } from '@kreuzberg/node';
+ *
+ * class CustomProcessor {
+ *   name() {
+ *     return 'custom_processor';
+ *   }
+ *   processingStage() {
+ *     return 'late';
+ *   }
+ *   async process(result) {
+ *     // Add custom metadata
+ *     result.metadata.customField = 'custom_value';
+ *     return result;
+ *   }
+ * }
+ *
+ * registerPostProcessor(new CustomProcessor());
+ *
+ * // Use async extraction (required for custom processors)
+ * const result = await extractFile('document.pdf');
+ * console.log(result.metadata.customField); // 'custom_value'
+ * ```
+ */
+declare function registerPostProcessor(processor: PostProcessorProtocol): void;
+/**
+ * Unregister a postprocessor by name.
+ *
+ * Removes a previously registered postprocessor from the registry.
+ * If the processor doesn't exist, this is a no-op (does not throw).
  *
  * @param name - Name of the processor to unregister (case-sensitive)
  *
@@ -428,6 +686,7 @@ declare function clearPostProcessors(): void;
  * ```
  */
 declare function listPostProcessors(): string[];
 /**
  * Register a custom validator.
  *
@@ -435,27 +694,26 @@ declare function listPostProcessors(): string[];
  * Unlike post-processors, validator errors **fail fast** - if a validator throws an error,
  * the extraction fails immediately.
  *
- * @param validator - ValidatorProtocol implementation with name(), validate(), and optional priority()/shouldValidate()
- * @throws {Error} If validator is missing required methods (name or validate)
- * @throws {Error} If validator name is empty string
- * @throws {Error} If a validator with the same name is already registered
+ * Validators are async and run after post-processors in the extraction pipeline.
+ *
+ * @param validator - Validator implementing ValidatorProtocol
  *
  * @example
  * ```typescript
- * import { registerValidator } from '@kreuzberg/node';
+ * import { registerValidator, extractFile } from '@kreuzberg/node';
  *
- * class MinLengthValidator implements ValidatorProtocol {
- *   name(): string {
+ * class MinLengthValidator {
+ *   name() {
  *     return 'min_length_validator';
  *   }
  *
- *   priority(): number {
- *     return 100; // Run early
+ *   priority() {
+ *     return 100;
  *   }
  *
- *   validate(result: ExtractionResult): void {
- *     if (result.content.length < 100) {
- *       throw new Error('Content too short: minimum 100 characters required');
+ *   async validate(result) {
+ *     if (result.content.length < 10) {
+ *       throw new Error('Content too short');
  *     }
  *   }
  * }
@@ -510,20 +768,93 @@ declare function clearValidators(): void;
  * ```
  */
 declare function listValidators(): string[];
-declare function registerOcrBackend(backend: OcrBackendProtocol): void;
 /**
- * List all registered OCR backends.
+ * Register a custom OCR backend.
  *
- * Returns an array of names of all currently registered OCR backends,
- * including built-in backends like "tesseract".
+ * This function registers a JavaScript OCR backend that will be used by Kreuzberg's
+ * extraction pipeline when OCR is enabled. The backend must implement the
+ * {@link OcrBackendProtocol} interface.
  *
- * @returns Array of OCR backend names (empty array if none registered)
+ * ## Usage
+ *
+ * 1. Create a class implementing {@link OcrBackendProtocol}
+ * 2. Call `initialize()` on your backend instance (if needed)
+ * 3. Register the backend with `registerOcrBackend()`
+ * 4. Use the backend name in extraction config
+ *
+ * ## Thread Safety
+ *
+ * The registered backend must be thread-safe as it may be called concurrently
+ * from multiple Rust async tasks. Ensure your implementation handles concurrent
+ * calls properly.
+ *
+ * @param backend - OcrBackendProtocol implementation with name(), supportedLanguages(), and processImage()
+ * @throws {Error} If backend is missing required methods (name, supportedLanguages, or processImage)
+ * @throws {Error} If backend name is empty string or contains invalid characters
+ * @throws {Error} If a backend with the same name is already registered
+ * @throws {Error} If registration fails due to FFI issues
  *
  * @example
  * ```typescript
- * import { listOcrBackends } from '@kreuzberg/node';
+ * import { GutenOcrBackend } from '@kreuzberg/node/ocr/guten-ocr';
+ * import { registerOcrBackend, extractFile } from '@kreuzberg/node';
  *
- * const backends = listOcrBackends();
+ * // Create and initialize backend
+ * const backend = new GutenOcrBackend();
+ * await backend.initialize();
+ *
+ * // Register with Kreuzberg
+ * registerOcrBackend(backend);
+ *
+ * // Use in extraction
+ * const result = await extractFile('scanned.pdf', null, {
+ *   ocr: { backend: 'guten-ocr', language: 'en' }
+ * });
+ * console.log(result.content);
+ * ```
+ *
+ * @example
+ * ```typescript
+ * import { registerOcrBackend } from '@kreuzberg/node';
+ *
+ * class MyOcrBackend {
+ *   name() {
+ *     return 'my-ocr';
+ *   }
+ *
+ *   supportedLanguages(): string[] {
+ *     return ['en', 'de', 'fr'];
+ *   }
+ *
+ *   async processImage(imageBytes: Uint8Array, language: string) {
+ *     const text = await myCustomOcrEngine(imageBytes, language);
+ *     return {
+ *       content: text,
+ *       mime_type: 'text/plain',
+ *       metadata: { confidence: 0.95, language },
+ *       tables: []
+ *     };
+ *   }
+ * }
+ *
+ * registerOcrBackend(new MyOcrBackend());
+ * ```
+ */
+declare function registerOcrBackend(backend: OcrBackendProtocol): void;
+/**
+ * List all registered OCR backends.
+ *
+ * Returns an array of names of all currently registered OCR backends,
+ * including built-in backends like "tesseract".
+ *
+ * @returns Array of OCR backend names (empty array if none registered)
+ *
+ * @example
+ * ```typescript
+ * import { listOcrBackends } from '@kreuzberg/node';
+ *
+ * const backends = listOcrBackends();
  * console.log(backends); // ['tesseract', 'my-custom-backend', ...]
  * ```
  */
@@ -560,6 +891,7 @@ declare function unregisterOcrBackend(name: string): void;
  * ```
  */
 declare function clearOcrBackends(): void;
 /**
  * List all registered document extractors.
  *
@@ -573,7 +905,7 @@ declare function clearOcrBackends(): void;
  * import { listDocumentExtractors } from '@kreuzberg/node';
  *
  * const extractors = listDocumentExtractors();
- * console.log(extractors); // ['PDFExtractor', 'ImageExtractor', ...]
+ * console.log(extractors); // ['pdf', 'docx', 'xlsx', 'custom-extractor', ...]
  * ```
  */
 declare function listDocumentExtractors(): string[];
@@ -609,87 +941,26 @@ declare function unregisterDocumentExtractor(name: string): void;
  * ```
  */
 declare function clearDocumentExtractors(): void;
 /**
- * ExtractionConfig namespace with static methods for loading configuration from files.
+ * Load extraction configuration from a file.
  *
- * Provides factory methods to load extraction configuration from TOML, YAML, or JSON files,
- * or to discover configuration files in the current directory tree.
+ * @param filePath - Path to the configuration file
+ * @returns ExtractionConfig object loaded from the file
  *
- * For creating configurations programmatically, use plain TypeScript objects instead:
- *
- * @example
- * ```typescript
- * import { ExtractionConfig, extractFile } from '@kreuzberg/node';
+ * @deprecated Use ExtractionConfig.fromFile() instead
+ */
+declare function loadConfigFile(filePath: string): ExtractionConfig;
+/**
+ * Load extraction configuration from a specified path.
  *
- * // Load configuration from file
- * const config1 = ExtractionConfig.fromFile('config.toml');
+ * @param path - Path to the configuration file or directory
+ * @returns ExtractionConfig object or null
  *
- * // Or create with plain object
- * const config2 = {
- *   chunking: { maxChars: 2048 },
- *   ocr: { backend: 'tesseract', language: 'eng' }
- * };
- *
- * // Use with extraction
- * const result = await extractFile('document.pdf', null, config2);
- * ```
+ * @deprecated Use ExtractionConfig.fromFile() or ExtractionConfig.discover() instead
  */
-declare const ExtractionConfig: {
-    /**
-     * Load extraction configuration from a file.
-     *
-     * Automatically detects the file format based on extension:
-     * - `.toml` - TOML format
-     * - `.yaml` - YAML format
-     * - `.json` - JSON format
-     *
-     * @param filePath - Path to the configuration file (absolute or relative)
-     * @returns ExtractionConfig object loaded from the file
-     *
-     * @throws {Error} If file does not exist or is not accessible
-     * @throws {Error} If file content is not valid TOML/YAML/JSON
-     * @throws {Error} If configuration structure is invalid
-     * @throws {Error} If file extension is not supported
-     *
-     * @example
-     * ```typescript
-     * import { ExtractionConfig } from '@kreuzberg/node';
-     *
-     * // Load from TOML file
-     * const config1 = ExtractionConfig.fromFile('kreuzberg.toml');
-     *
-     * // Load from YAML file
-     * const config2 = ExtractionConfig.fromFile('./config.yaml');
-     *
-     * // Load from JSON file
-     * const config3 = ExtractionConfig.fromFile('./config.json');
-     * ```
-     */
-    fromFile(filePath: string): ExtractionConfig$1;
-    /**
-     * Discover and load configuration from current or parent directories.
-     *
-     * Searches for a `kreuzberg.toml` file starting from the current working directory
-     * and traversing up the directory tree. Returns the first configuration file found.
-     *
-     * @returns ExtractionConfig object if found, or null if no configuration file exists
-     *
-     * @example
-     * ```typescript
-     * import { ExtractionConfig } from '@kreuzberg/node';
-     *
-     * // Try to find config in current or parent directories
-     * const config = ExtractionConfig.discover();
-     * if (config) {
-     *   console.log('Found configuration');
-     *   // Use config for extraction
-     * } else {
-     *   console.log('No configuration file found, using defaults');
-     * }
-     * ```
-     */
-    discover(): ExtractionConfig$1 | null;
-};
+declare function loadConfigFromPath(path: string): ExtractionConfig | null;
 /**
  * Detect MIME type from raw bytes.
  *
@@ -800,6 +1071,7 @@ declare function validateMimeType(mimeType: string): string;
  * ```
  */
 declare function getExtensionsForMime(mimeType: string): string[];
 /**
  * Embedding preset configuration.
  *
@@ -820,28 +1092,29 @@ interface EmbeddingPreset {
     description: string;
 }
 /**
- * List all available embedding preset names.
+ * List the names of all available embedding presets.
  *
- * Returns an array of preset names that can be used with `getEmbeddingPreset`.
+ * Returns an array of names of all available embedding model presets.
  *
- * @returns Array of 4 preset names: ["fast", "balanced", "quality", "multilingual"]
+ * @returns Array of preset names (e.g., ["fast", "balanced", "quality", "multilingual"])
  *
  * @example
  * ```typescript
  * import { listEmbeddingPresets } from '@kreuzberg/node';
  *
  * const presets = listEmbeddingPresets();
- * console.log(presets); // ['fast', 'balanced', 'quality', 'multilingual']
+ * console.log('Available presets:', presets);
  * ```
  */
 declare function listEmbeddingPresets(): string[];
 /**
- * Get a specific embedding preset by name.
+ * Get embedding preset configuration by name.
  *
- * Returns a preset configuration object, or null if the preset name is not found.
+ * Retrieves the configuration for a specific embedding model preset.
+ * Returns null if the preset doesn't exist.
  *
- * @param name - The preset name (case-sensitive)
- * @returns An `EmbeddingPreset` object or `null` if not found
+ * @param name - Name of the preset (e.g., "balanced", "fast", "quality")
+ * @returns EmbeddingPreset configuration if found, null otherwise
  *
  * @example
  * ```typescript
@@ -855,278 +1128,78 @@ declare function listEmbeddingPresets(): string[];
  * ```
  */
 declare function getEmbeddingPreset(name: string): EmbeddingPreset | null;
 /**
- * Get the error code for the last FFI error.
- *
- * Returns the FFI error code as an integer. This is useful for programmatic error handling
- * and distinguishing between different types of failures in native code.
- *
- * Error codes:
- * - 0: Success (no error)
- * - 1: GenericError
- * - 2: Panic
- * - 3: InvalidArgument
- * - 4: IoError
- * - 5: ParsingError
- * - 6: OcrError
- * - 7: MissingDependency
- *
- * @returns The integer error code
- *
- * @example
- * ```typescript
- * import { extractFile, getLastErrorCode, ErrorCode } from '@kreuzberg/node';
- *
- * try {
- *   const result = await extractFile('document.pdf');
- * } catch (error) {
- *   const code = getLastErrorCode();
- *   if (code === ErrorCode.Panic) {
- *     console.error('Native code panic detected');
- *   }
- * }
- * ```
- */
-declare function getLastErrorCode(): number;
-/**
- * Get panic context information if the last error was a panic.
- *
- * Returns detailed information about a panic in native code, or null if the last error was not a panic.
- * This provides debugging information when native code panics.
- *
- * @returns A `PanicContext` object with file, line, function, message, and timestamp_secs, or null if no panic context is available
- *
- * @example
- * ```typescript
- * import { extractFile, getLastPanicContext } from '@kreuzberg/node';
- *
- * try {
- *   const result = await extractFile('document.pdf');
- * } catch (error) {
- *   const context = getLastPanicContext();
- *   if (context) {
- *     console.error(`Panic at ${context.file}:${context.line}`);
- *     console.error(`In function: ${context.function}`);
- *     console.error(`Message: ${context.message}`);
- *   }
- * }
- * ```
- */
-declare function getLastPanicContext(): PanicContext | null;
-/**
- * Returns the human-readable name for an error code.
- *
- * Maps numeric error codes to their string names, providing a consistent way
- * to get error code names across all platforms.
- *
- * @param code - The numeric error code (0-7)
- * @returns The error code name as a string (e.g., "validation", "ocr", "unknown")
- *
- * @example
- * ```typescript
- * import { getErrorCodeName } from '@kreuzberg/node';
- *
- * const name = getErrorCodeName(0);  // returns "validation"
- * const name = getErrorCodeName(2);  // returns "ocr"
- * const name = getErrorCodeName(99); // returns "unknown"
- * ```
- */
-declare function getErrorCodeName(code: number): string;
-/**
- * Returns the description for an error code.
- *
- * Retrieves user-friendly descriptions of error types from the FFI layer.
- *
- * @param code - The numeric error code (0-7)
- * @returns A brief description of the error type
- *
- * @example
- * ```typescript
- * import { getErrorCodeDescription } from '@kreuzberg/node';
- *
- * const desc = getErrorCodeDescription(0);  // returns "Input validation error"
- * const desc = getErrorCodeDescription(4);  // returns "File system I/O error"
- * const desc = getErrorCodeDescription(99); // returns "Unknown error code"
- * ```
- */
-declare function getErrorCodeDescription(code: number): string;
-/**
- * Classifies an error message string into an error code category.
- *
- * This function analyzes the error message content and returns the most likely
- * error code (0-7) based on keyword patterns. Used to programmatically classify
- * errors for handling purposes.
- *
- * The classification is based on keyword matching:
- * - **Validation (0)**: Keywords like "invalid", "validation", "schema", "required"
- * - **Parsing (1)**: Keywords like "parsing", "corrupted", "malformed"
- * - **Ocr (2)**: Keywords like "ocr", "tesseract", "language", "model"
- * - **MissingDependency (3)**: Keywords like "not found", "missing", "dependency"
- * - **Io (4)**: Keywords like "file", "disk", "read", "write", "permission"
- * - **Plugin (5)**: Keywords like "plugin", "register", "extension"
- * - **UnsupportedFormat (6)**: Keywords like "unsupported", "format", "mime"
- * - **Internal (7)**: Keywords like "internal", "bug", "panic"
- *
- * @param errorMessage - The error message string to classify
- * @returns An object with the classification details
- *
- * @example
- * ```typescript
- * import { classifyError } from '@kreuzberg/node';
- *
- * const result = classifyError("PDF file is corrupted");
- * // Returns: { code: 1, name: "parsing", confidence: 0.95 }
- *
- * const result = classifyError("Tesseract not found");
- * // Returns: { code: 3, name: "missing_dependency", confidence: 0.9 }
- * ```
- */
-declare function classifyError(errorMessage: string): ErrorClassification;
-/**
- * Create a worker pool for concurrent file extraction.
- *
- * The worker pool manages a set of background worker threads that can process
- * extraction requests concurrently, improving throughput when handling multiple files.
- *
- * @param size - Optional number of worker threads (defaults to CPU count). Must be > 0
- * @returns A WorkerPool instance to use with extraction functions
- *
- * @throws {Error} If size is invalid or pool creation fails
- *
- * @example
- * ```typescript
- * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
- *
- * // Create pool with 4 workers
- * const pool = createWorkerPool(4);
- *
- * try {
- *   const result = await extractFileInWorker(pool, 'document.pdf');
- *   console.log(result.content);
- * } finally {
- *   // Always close the pool when done
- *   await closeWorkerPool(pool);
- * }
- * ```
+ * @internal Allows tests to provide a mocked native binding.
  */
-declare function createWorkerPool(size?: number): WorkerPool;
+declare function __setBindingForTests(mock: unknown): void;
 /**
- * Get statistics about a worker pool.
- *
- * Returns information about the pool's current state, including the number of active workers,
- * queued tasks, and total processed tasks.
- *
- * @param pool - The worker pool instance
- * @returns WorkerPoolStats with pool information
- *
- * @example
- * ```typescript
- * import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node';
- *
- * const pool = createWorkerPool(4);
- * const stats = getWorkerPoolStats(pool);
- *
- * console.log(`Pool size: ${stats.size}`);
- * console.log(`Active workers: ${stats.activeWorkers}`);
- * console.log(`Queued tasks: ${stats.queuedTasks}`);
- * ```
+ * @internal Resets the cached native binding for tests.
  */
-declare function getWorkerPoolStats(pool: WorkerPool): WorkerPoolStats;
+declare function __resetBindingForTests(): void;
 /**
- * Extract content from a single file using a worker pool (asynchronous).
- *
- * Submits an extraction task to the worker pool. The task is executed by one of the
- * available workers in the background, allowing other tasks to be processed concurrently.
+ * Kreuzberg - Multi-language document intelligence framework.
  *
- * @param pool - The worker pool instance
- * @param filePath - Path to the file to extract
- * @param mimeTypeOrConfig - Optional MIME type or extraction configuration
- * @param maybeConfig - Optional extraction configuration (if second param is MIME type)
- * @returns Promise<ExtractionResult> containing extracted content and metadata
+ * This is a TypeScript SDK around a high-performance Rust core.
+ * All extraction logic, chunking, quality processing, and language detection
+ * are implemented in Rust for maximum performance.
  *
- * @throws {Error} If the file cannot be read or extraction fails
+ * ## Module Organization
  *
- * @example
- * ```typescript
- * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
+ * The SDK is organized into logical domains:
+ * - **Extraction**: Single and batch document extraction with worker pool support
+ * - **Types**: Core type definitions and interfaces
+ * - **Errors**: Error classes and diagnostic utilities
+ * - **Plugins**: Custom post-processors, validators, and OCR backends
+ * - **Registry**: Plugin and document extractor management
+ * - **Config**: Configuration loading and management
+ * - **MIME**: MIME type detection and validation
+ * - **Embeddings**: Embedding model presets
  *
- * const pool = createWorkerPool(4);
+ * ## API Usage Recommendations
  *
- * try {
- *   const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
- *   const results = await Promise.all(
- *     files.map(f => extractFileInWorker(pool, f))
- *   );
+ * **For processing multiple documents**, prefer batch APIs:
+ * - Use `batchExtractFiles()` / `batchExtractFilesSync()` for multiple files
+ * - Use `batchExtractBytes()` / `batchExtractBytesSync()` for multiple byte arrays
+ * - Use worker pool APIs for high-concurrency scenarios
  *
- *   results.forEach((r, i) => {
- *     console.log(`${files[i]}: ${r.content.substring(0, 100)}...`);
- *   });
- * } finally {
- *   await closeWorkerPool(pool);
- * }
- * ```
- */
-declare function extractFileInWorker(pool: WorkerPool, filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
-/**
- * Extract content from multiple files in parallel using a worker pool (asynchronous).
+ * **Batch APIs** are recommended for all multi-document workflows.
+ * They provide:
+ * - Better performance (parallel processing in Rust)
+ * - More reliable memory management
  *
- * Submits multiple extraction tasks to the worker pool for concurrent processing.
- * This is more efficient than using `extractFileInWorker` multiple times sequentially.
+ * **Single extraction APIs** (`extractFile`, `extractBytes`) are suitable for:
+ * - One-off document processing
+ * - Interactive applications processing documents on-demand
+ *
+ * Avoid calling the single extraction APIs in tight loops; use the batch APIs instead.
  *
- * @param pool - The worker pool instance
- * @param paths - Array of file paths to extract
- * @param config - Extraction configuration object (applies to all files)
- * @returns Promise<ExtractionResult[]> array of results (one per file, in same order)
+ * ## Supported Formats
  *
- * @throws {Error} If any file cannot be read or extraction fails
+ * - **Documents**: PDF, DOCX, PPTX, XLSX, DOC, PPT (with LibreOffice)
+ * - **Text**: Markdown, Plain Text, XML
+ * - **Web**: HTML (converted to Markdown)
+ * - **Data**: JSON, YAML, TOML
+ * - **Email**: EML, MSG
+ * - **Images**: PNG, JPEG, TIFF (with OCR support)
  *
  * @example
  * ```typescript
- * import { createWorkerPool, batchExtractFilesInWorker, closeWorkerPool } from '@kreuzberg/node';
- *
- * const pool = createWorkerPool(4);
+ * import { extractFile, batchExtractFiles } from '@kreuzberg/node';
  *
- * try {
- *   const files = ['invoice1.pdf', 'invoice2.pdf', 'invoice3.pdf'];
- *   const results = await batchExtractFilesInWorker(pool, files, {
- *     ocr: { backend: 'tesseract', language: 'eng' }
- *   });
+ * // Single file extraction
+ * const result = await extractFile('document.pdf');
+ * console.log(result.content);
  *
- *   const total = results.reduce((sum, r) => sum + extractAmount(r.content), 0);
- *   console.log(`Total: $${total}`);
- * } finally {
- *   await closeWorkerPool(pool);
- * }
+ * // Multiple files (recommended approach)
+ * const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
+ * const results = await batchExtractFiles(files);
+ * results.forEach(r => console.log(r.content));
  * ```
- */
-declare function batchExtractFilesInWorker(pool: WorkerPool, paths: string[], config?: ExtractionConfig$1 | null): Promise<ExtractionResult[]>;
-/**
- * Close a worker pool and shut down all worker threads.
- *
- * Should be called when the pool is no longer needed to clean up resources
- * and gracefully shut down worker threads. Any pending tasks will be cancelled.
- *
- * @param pool - The worker pool instance to close
- * @returns Promise that resolves when the pool is fully closed
- *
- * @throws {Error} If pool shutdown fails
- *
- * @example
- * ```typescript
- * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
- *
- * const pool = createWorkerPool(4);
  *
- * try {
- *   const result = await extractFileInWorker(pool, 'document.pdf');
- *   console.log(result.content);
- * } finally {
- *   // Clean up the pool
- *   await closeWorkerPool(pool);
- * }
- * ```
+ * @module @kreuzberg/node
  */
-declare function closeWorkerPool(pool: WorkerPool): Promise<void>;
-declare const __version__ = "4.0.8";
-export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
+declare const __version__ = "4.1.1";
+export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };