npm - @kreuzberg/node - Versions diffs - 4.0.0-rc.8 → 4.0.0 - Mend

@kreuzberg/node 4.0.0-rc.8 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

package/dist/index.d.ts CHANGED Viewed

@@ -1,7 +1,7 @@
 import { PanicContext } from './errors.js';
 export { CacheError, ErrorCode, ImageProcessingError, KreuzbergError, MissingDependencyError, OcrError, ParsingError, PluginError, ValidationError } from './errors.js';
-import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol } from './types.js';
-export { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, HtmlConversionOptions, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, Metadata, OcrConfig, OcrMetadata, PageBoundary, PageConfig, PageContent, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PptxMetadata, ProcessingStage, RakeParams, Table, TesseractConfig, TextMetadata, TokenReductionConfig, XmlMetadata, YakeParams } from './types.js';
+import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol, ErrorClassification, WorkerPool, WorkerPoolStats } from './types.js';
+export { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, HeaderMetadata, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrConfig, OcrMetadata, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, XmlMetadata, YakeParams } from './types.js';
 export { GutenOcrBackend } from './ocr/guten-ocr.js';
 /**
@@ -65,10 +65,15 @@ declare function __resetBindingForTests(): void;
  * **Usage Note**: For processing multiple files, prefer `batchExtractFilesSync()` which
  * provides better performance and memory management.
  *
- * @param filePath - Path to the file (string)
- * @param mimeType - Optional MIME type hint (auto-detected if null)
- * @param config - Extraction configuration (uses defaults if null)
- * @returns ExtractionResult with content, metadata, and tables
+ * @param filePath - Path to the file to extract (string). Can be absolute or relative.
+ * @param mimeTypeOrConfig - Optional MIME type hint for format detection, or an extraction configuration object. If null or omitted, the MIME type is auto-detected from file extension or content.
+ * @param maybeConfig - Extraction configuration object (used when the second argument is a MIME type). If null, uses default extraction settings.
+ * @returns ExtractionResult containing extracted content, metadata, tables, and optional chunks/images
+ * @throws {Error} If file doesn't exist, cannot be accessed, or cannot be read
+ * @throws {ParsingError} When document format is invalid or corrupted
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
+ * @throws {ValidationError} When extraction result fails validation (if validators registered)
+ * @throws {KreuzbergError} For other extraction-related failures
  *
  * @example
  * ```typescript
@@ -92,17 +97,22 @@ declare function __resetBindingForTests(): void;
  * const result2 = extractFileSync('scanned.pdf', null, config);
  * ```
  */
-declare function extractFileSync(filePath: string, mimeType?: string | null, config?: ExtractionConfig$1 | null): ExtractionResult;
+declare function extractFileSync(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): ExtractionResult;
 /**
  * Extract content from a single file (asynchronous).
  *
  * **Usage Note**: For processing multiple files, prefer `batchExtractFiles()` which
  * provides better performance and memory management.
  *
- * @param filePath - Path to the file (string)
- * @param mimeType - Optional MIME type hint (auto-detected if null)
- * @param config - Extraction configuration (uses defaults if null)
- * @returns Promise<ExtractionResult> with content, metadata, and tables
+ * @param filePath - Path to the file to extract (string). Can be absolute or relative.
+ * @param mimeTypeOrConfig - Optional MIME type hint for format detection, or an extraction configuration object. If null or omitted, the MIME type is auto-detected from file extension or content.
+ * @param maybeConfig - Extraction configuration object (used when the second argument is a MIME type). If null, uses default extraction settings.
+ * @returns Promise<ExtractionResult> containing extracted content, metadata, tables, and optional chunks/images
+ * @throws {Error} If file doesn't exist, cannot be accessed, or cannot be read
+ * @throws {ParsingError} When document format is invalid or corrupted
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
+ * @throws {ValidationError} When extraction result fails validation (if validators registered)
+ * @throws {KreuzbergError} For other extraction-related failures
  *
  * @example
  * ```typescript
@@ -123,17 +133,23 @@ declare function extractFileSync(filePath: string, mimeType?: string | null, con
  * console.log(result2.chunks); // Array of text chunks
  * ```
  */
-declare function extractFile(filePath: string, mimeType?: string | null, config?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
+declare function extractFile(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
 /**
  * Extract content from raw bytes (synchronous).
  *
  * **Usage Note**: For processing multiple byte arrays, prefer `batchExtractBytesSync()`
  * which provides better performance and memory management.
  *
- * @param data - File content as Uint8Array
- * @param mimeType - MIME type of the data (required for format detection)
- * @param config - Extraction configuration (uses defaults if null)
- * @returns ExtractionResult with content, metadata, and tables
+ * @param dataOrPath - File content as Uint8Array (Buffer will be converted). The signature also accepts a string, presumably a file path.
+ * @param mimeType - MIME type of the data (required for accurate format detection). Must be a valid MIME type string.
+ * @param config - Extraction configuration object. If null, uses default extraction settings.
+ * @returns ExtractionResult containing extracted content, metadata, tables, and optional chunks/images
+ * @throws {TypeError} When data is not a valid Uint8Array
+ * @throws {Error} When file cannot be read or parsed
+ * @throws {ParsingError} When document format is invalid or corrupted
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
+ * @throws {ValidationError} When extraction result fails validation (if validators registered)
+ * @throws {KreuzbergError} For other extraction-related failures
  *
  * @example
  * ```typescript
@@ -145,17 +161,23 @@ declare function extractFile(filePath: string, mimeType?: string | null, config?
  * console.log(result.content);
  * ```
  */
-declare function extractBytesSync(data: Uint8Array, mimeType: string, config?: ExtractionConfig$1 | null): ExtractionResult;
+declare function extractBytesSync(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig$1 | null): ExtractionResult;
 /**
  * Extract content from raw bytes (asynchronous).
  *
  * **Usage Note**: For processing multiple byte arrays, prefer `batchExtractBytes()`
  * which provides better performance and memory management.
  *
- * @param data - File content as Uint8Array
- * @param mimeType - MIME type of the data (required for format detection)
- * @param config - Extraction configuration (uses defaults if null)
- * @returns Promise<ExtractionResult> with content, metadata, and tables
+ * @param dataOrPath - File content as Uint8Array (Buffer will be converted). The signature also accepts a string, presumably a file path.
+ * @param mimeType - MIME type of the data (required for accurate format detection). Must be a valid MIME type string.
+ * @param config - Extraction configuration object. If null, uses default extraction settings.
+ * @returns Promise<ExtractionResult> containing extracted content, metadata, tables, and optional chunks/images
+ * @throws {TypeError} When data is not a valid Uint8Array
+ * @throws {Error} When file cannot be read or parsed
+ * @throws {ParsingError} When document format is invalid or corrupted
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
+ * @throws {ValidationError} When extraction result fails validation (if validators registered)
+ * @throws {KreuzbergError} For other extraction-related failures
  *
  * @example
  * ```typescript
@@ -167,7 +189,7 @@ declare function extractBytesSync(data: Uint8Array, mimeType: string, config?: E
  * console.log(result.content);
  * ```
  */
-declare function extractBytes(data: Uint8Array, mimeType: string, config?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
+declare function extractBytes(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
 /**
  * Extract content from multiple files in parallel (synchronous).
  *
@@ -179,9 +201,14 @@ declare function extractBytes(data: Uint8Array, mimeType: string, config?: Extra
  * - Optimized memory usage across all extractions
  * - More reliable for batch document processing
  *
- * @param paths - List of file paths to extract
- * @param config - Extraction configuration (uses defaults if null)
+ * @param paths - List of file paths to extract (absolute or relative paths)
+ * @param config - Extraction configuration object. If null, uses default extraction settings.
  * @returns Array of ExtractionResults (one per file, in same order as input)
+ * @throws {Error} If any file cannot be read or parsed
+ * @throws {ParsingError} When any document format is invalid or corrupted
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
+ * @throws {ValidationError} When any extraction result fails validation (if validators registered)
+ * @throws {KreuzbergError} For other extraction-related failures
  *
  * @example
  * ```typescript
@@ -207,9 +234,14 @@ declare function batchExtractFilesSync(paths: string[], config?: ExtractionConfi
  * - Optimized memory usage across all extractions
  * - More reliable for batch document processing
  *
- * @param paths - List of file paths to extract
- * @param config - Extraction configuration (uses defaults if null)
+ * @param paths - List of file paths to extract (absolute or relative paths)
+ * @param config - Extraction configuration object. If null, uses default extraction settings.
  * @returns Promise resolving to array of ExtractionResults (one per file, in same order as input)
+ * @throws {Error} If any file cannot be read or parsed
+ * @throws {ParsingError} When any document format is invalid or corrupted
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
+ * @throws {ValidationError} When any extraction result fails validation (if validators registered)
+ * @throws {KreuzbergError} For other extraction-related failures
  *
  * @example
  * ```typescript
@@ -238,10 +270,16 @@ declare function batchExtractFiles(paths: string[], config?: ExtractionConfig$1
  * - Optimized memory usage across all extractions
  * - More reliable for batch document processing
  *
- * @param dataList - List of file contents as Uint8Arrays
- * @param mimeTypes - List of MIME types (one per data item, required for format detection)
- * @param config - Extraction configuration (uses defaults if null)
+ * @param dataList - List of file contents as Uint8Arrays (must be same length as mimeTypes)
+ * @param mimeTypes - List of MIME types (one per data item, required for accurate format detection)
+ * @param config - Extraction configuration object. If null, uses default extraction settings.
  * @returns Array of ExtractionResults (one per data item, in same order as input)
+ * @throws {TypeError} When dataList contains non-Uint8Array items or length mismatch with mimeTypes
+ * @throws {Error} If any data cannot be read or parsed
+ * @throws {ParsingError} When any document format is invalid or corrupted
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
+ * @throws {ValidationError} When any extraction result fails validation (if validators registered)
+ * @throws {KreuzbergError} For other extraction-related failures
  *
  * @example
  * ```typescript
@@ -270,10 +308,16 @@ declare function batchExtractBytesSync(dataList: Uint8Array[], mimeTypes: string
  * - Optimized memory usage across all extractions
  * - More reliable for batch document processing
  *
- * @param dataList - List of file contents as Uint8Arrays
- * @param mimeTypes - List of MIME types (one per data item, required for format detection)
- * @param config - Extraction configuration (uses defaults if null)
+ * @param dataList - List of file contents as Uint8Arrays (must be same length as mimeTypes)
+ * @param mimeTypes - List of MIME types (one per data item, required for accurate format detection)
+ * @param config - Extraction configuration object. If null, uses default extraction settings.
  * @returns Promise resolving to array of ExtractionResults (one per data item, in same order as input)
+ * @throws {TypeError} When dataList contains non-Uint8Array items or length mismatch with mimeTypes
+ * @throws {Error} If any data cannot be read or parsed
+ * @throws {ParsingError} When any document format is invalid or corrupted
+ * @throws {OcrError} When OCR processing fails (if OCR is enabled)
+ * @throws {ValidationError} When any extraction result fails validation (if validators registered)
+ * @throws {KreuzbergError} For other extraction-related failures
  *
  * @example
  * ```typescript
@@ -306,7 +350,10 @@ declare function batchExtractBytes(dataList: Uint8Array[], mimeTypes: string[],
  * preventing JavaScript callbacks from executing. For v4.0, use async extraction
  * when you need custom processors.
  *
- * @param processor - PostProcessorProtocol implementation
+ * @param processor - PostProcessorProtocol implementation with name(), process(), and optional processingStage()
+ * @throws {Error} If processor is missing required methods (name or process)
+ * @throws {Error} If processor name is empty string
+ * @throws {Error} If a processor with the same name is already registered
  *
  * @example
  * ```typescript
@@ -339,8 +386,9 @@ declare function registerPostProcessor(processor: PostProcessorProtocol): void;
  * Unregister a postprocessor by name.
  *
  * Removes a previously registered postprocessor from the registry.
+ * If the processor doesn't exist, this is a no-op (does not throw).
  *
- * @param name - Name of the processor to unregister
+ * @param name - Name of the processor to unregister (case-sensitive)
  *
  * @example
  * ```typescript
@@ -353,7 +401,8 @@ declare function unregisterPostProcessor(name: string): void;
 /**
  * Clear all registered postprocessors.
  *
- * Removes all postprocessors from the registry.
+ * Removes all postprocessors from the registry. Useful for test cleanup or resetting state.
+ * If no postprocessors are registered, this is a no-op.
  *
  * @example
  * ```typescript
@@ -366,9 +415,9 @@ declare function clearPostProcessors(): void;
 /**
  * List all registered post-processors.
  *
- * Returns the names of all currently registered post-processors.
+ * Returns the names of all currently registered post-processors (both built-in and custom).
  *
- * @returns Array of post-processor names
+ * @returns Array of post-processor names (empty array if none registered)
  *
  * @example
  * ```typescript
@@ -386,7 +435,10 @@ declare function listPostProcessors(): string[];
  * Unlike post-processors, validator errors **fail fast** - if a validator throws an error,
  * the extraction fails immediately.
  *
- * @param validator - ValidatorProtocol implementation
+ * @param validator - ValidatorProtocol implementation with name(), validate(), and optional priority()/shouldValidate()
+ * @throws {Error} If validator is missing required methods (name or validate)
+ * @throws {Error} If validator name is empty string
+ * @throws {Error} If a validator with the same name is already registered
  *
  * @example
  * ```typescript
@@ -416,8 +468,9 @@ declare function registerValidator(validator: ValidatorProtocol): void;
  * Unregister a validator by name.
  *
  * Removes a previously registered validator from the global registry.
+ * If the validator doesn't exist, this is a no-op (does not throw).
  *
- * @param name - Validator name to unregister
+ * @param name - Validator name to unregister (case-sensitive)
  *
  * @example
  * ```typescript
@@ -444,9 +497,9 @@ declare function clearValidators(): void;
 /**
  * List all registered validators.
  *
- * Returns the names of all currently registered validators.
+ * Returns the names of all currently registered validators (both built-in and custom).
  *
- * @returns Array of validator names
+ * @returns Array of validator names (empty array if none registered)
  *
  * @example
  * ```typescript
@@ -464,7 +517,7 @@ declare function registerOcrBackend(backend: OcrBackendProtocol): void;
  * Returns an array of names of all currently registered OCR backends,
  * including built-in backends like "tesseract".
  *
- * @returns Array of OCR backend names
+ * @returns Array of OCR backend names (empty array if none registered)
  *
  * @example
  * ```typescript
@@ -497,7 +550,7 @@ declare function unregisterOcrBackend(name: string): void;
  *
  * Removes all OCR backends from the registry, including built-in backends.
  * Use with caution as this will make OCR functionality unavailable until
- * backends are re-registered.
+ * backends are re-registered. If no backends are registered, this is a no-op.
  *
  * @example
  * ```typescript
@@ -513,7 +566,7 @@ declare function clearOcrBackends(): void;
  * Returns an array of names of all currently registered document extractors,
  * including built-in extractors for PDF, Office documents, images, etc.
  *
- * @returns Array of document extractor names
+ * @returns Array of document extractor names (empty array if none registered)
  *
  * @example
  * ```typescript
@@ -559,18 +612,26 @@ declare function clearDocumentExtractors(): void;
 /**
  * ExtractionConfig namespace with static methods for loading configuration from files.
  *
- * Provides a factory method to load extraction configuration from TOML, YAML, or JSON files.
- * The file format is automatically detected based on the file extension.
+ * Provides factory methods to load extraction configuration from TOML, YAML, or JSON files,
+ * or to discover configuration files in the current directory tree.
+ *
+ * For creating configurations programmatically, use plain TypeScript objects instead:
  *
  * @example
  * ```typescript
  * import { ExtractionConfig, extractFile } from '@kreuzberg/node';
  *
  * // Load configuration from file
- * const config = ExtractionConfig.fromFile('config.toml');
+ * const config1 = ExtractionConfig.fromFile('config.toml');
+ *
+ * // Or create with plain object
+ * const config2 = {
+ *   chunking: { maxChars: 2048 },
+ *   ocr: { backend: 'tesseract', language: 'eng' }
+ * };
  *
  * // Use with extraction
- * const result = await extractFile('document.pdf', null, config);
+ * const result = await extractFile('document.pdf', null, config2);
  * ```
  */
 declare const ExtractionConfig: {
@@ -658,30 +719,30 @@ declare function detectMimeType(bytes: Buffer): string;
 /**
  * Detect MIME type from a file path.
  *
- * Uses file extension to determine MIME type. Falls back to `mime_guess` crate
- * if extension-based detection fails.
+ * Determines the MIME type based on the file extension in the provided path.
+ * By default, it also verifies that the file exists; pass `false` for the `checkExists` parameter to skip this check.
  *
- * @param path - Path to the file (string)
- * @param checkExists - Whether to verify file existence (default: true)
- * @returns The detected MIME type string
+ * @param filePath - The file path to detect MIME type from (e.g., 'document.pdf')
+ * @param checkExists - Whether to verify the file exists (default: true)
+ * @returns The detected MIME type as a string (e.g., 'application/pdf')
  *
- * @throws {Error} If file doesn't exist (when checkExists is true)
- * @throws {Error} If MIME type cannot be determined from path/extension
- * @throws {Error} If extension is unknown
+ * @throws {Error} If MIME type cannot be determined from the file extension,
+ * or if checkExists is true and the file does not exist
  *
  * @example
  * ```typescript
  * import { detectMimeTypeFromPath } from '@kreuzberg/node';
  *
- * // Detect from existing file
- * const mimeType = detectMimeTypeFromPath('document.pdf');
+ * // Detect MIME type from existing file
+ * const mimeType = detectMimeTypeFromPath('/path/to/document.pdf');
  * console.log(mimeType); // 'application/pdf'
  *
- * const mimeType2 = detectMimeTypeFromPath('document.docx');
+ * // Detect without checking file existence
+ * const mimeType2 = detectMimeTypeFromPath('document.docx', false);
  * console.log(mimeType2); // 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
  * ```
  */
-declare function detectMimeTypeFromPath(path: string, checkExists?: boolean): string;
+declare function detectMimeTypeFromPath(filePath: string, checkExists?: boolean): string;
 /**
  * Validate that a MIME type is supported by Kreuzberg.
  *
@@ -852,6 +913,220 @@ declare function getLastErrorCode(): number;
  * ```
  */
 declare function getLastPanicContext(): PanicContext | null;
-declare const __version__ = "4.0.0-rc.8";
+/**
+ * Returns the human-readable name for an error code.
+ *
+ * Maps numeric error codes to their string names, providing a consistent way
+ * to get error code names across all platforms.
+ *
+ * @param code - The numeric error code (0-7)
+ * @returns The error code name as a string (e.g., "validation", "ocr", "unknown")
+ *
+ * @example
+ * ```typescript
+ * import { getErrorCodeName } from '@kreuzberg/node';
+ *
+ * const validationName = getErrorCodeName(0);  // returns "validation"
+ * const ocrName = getErrorCodeName(2);         // returns "ocr"
+ * const unknownName = getErrorCodeName(99);    // returns "unknown"
+ * ```
+ */
+declare function getErrorCodeName(code: number): string;
+/**
+ * Returns the description for an error code.
+ *
+ * Retrieves user-friendly descriptions of error types from the FFI layer.
+ *
+ * @param code - The numeric error code (0-7)
+ * @returns A brief description of the error type
+ *
+ * @example
+ * ```typescript
+ * import { getErrorCodeDescription } from '@kreuzberg/node';
+ *
+ * const validationDesc = getErrorCodeDescription(0);  // returns "Input validation error"
+ * const ioDesc = getErrorCodeDescription(4);          // returns "File system I/O error"
+ * const unknownDesc = getErrorCodeDescription(99);    // returns "Unknown error code"
+ * ```
+ */
+declare function getErrorCodeDescription(code: number): string;
+/**
+ * Classifies an error message string into an error code category.
+ *
+ * This function analyzes the error message content and returns the most likely
+ * error code (0-7) based on keyword patterns. Used to programmatically classify
+ * errors for handling purposes.
+ *
+ * The classification is based on keyword matching:
+ * - **Validation (0)**: Keywords like "invalid", "validation", "schema", "required"
+ * - **Parsing (1)**: Keywords like "parsing", "corrupted", "malformed"
+ * - **Ocr (2)**: Keywords like "ocr", "tesseract", "language", "model"
+ * - **MissingDependency (3)**: Keywords like "not found", "missing", "dependency"
+ * - **Io (4)**: Keywords like "file", "disk", "read", "write", "permission"
+ * - **Plugin (5)**: Keywords like "plugin", "register", "extension"
+ * - **UnsupportedFormat (6)**: Keywords like "unsupported", "format", "mime"
+ * - **Internal (7)**: Keywords like "internal", "bug", "panic"
+ *
+ * @param errorMessage - The error message string to classify
+ * @returns An object with the classification details
+ *
+ * @example
+ * ```typescript
+ * import { classifyError } from '@kreuzberg/node';
+ *
+ * const parsingResult = classifyError("PDF file is corrupted");
+ * // Returns: { code: 1, name: "parsing", confidence: 0.95 }
+ *
+ * const dependencyResult = classifyError("Tesseract not found");
+ * // Returns: { code: 3, name: "missing_dependency", confidence: 0.9 }
+ * ```
+ */
+declare function classifyError(errorMessage: string): ErrorClassification;
+/**
+ * Create a worker pool for concurrent file extraction.
+ *
+ * The worker pool manages a set of background worker threads that can process
+ * extraction requests concurrently, improving throughput when handling multiple files.
+ *
+ * @param size - Optional number of worker threads (defaults to CPU count). Must be > 0
+ * @returns A WorkerPool instance to use with extraction functions
+ *
+ * @throws {Error} If size is invalid or pool creation fails
+ *
+ * @example
+ * ```typescript
+ * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
+ *
+ * // Create pool with 4 workers
+ * const pool = createWorkerPool(4);
+ *
+ * try {
+ *   const result = await extractFileInWorker(pool, 'document.pdf');
+ *   console.log(result.content);
+ * } finally {
+ *   // Always close the pool when done
+ *   await closeWorkerPool(pool);
+ * }
+ * ```
+ */
+declare function createWorkerPool(size?: number): WorkerPool;
+/**
+ * Get statistics about a worker pool.
+ *
+ * Returns information about the pool's current state, including the number of active workers,
+ * queued tasks, and total processed tasks.
+ *
+ * @param pool - The worker pool instance
+ * @returns WorkerPoolStats with pool information
+ *
+ * @example
+ * ```typescript
+ * import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node';
+ *
+ * const pool = createWorkerPool(4);
+ * const stats = getWorkerPoolStats(pool);
+ *
+ * console.log(`Pool size: ${stats.size}`);
+ * console.log(`Active workers: ${stats.activeWorkers}`);
+ * console.log(`Queued tasks: ${stats.queuedTasks}`);
+ * ```
+ */
+declare function getWorkerPoolStats(pool: WorkerPool): WorkerPoolStats;
+/**
+ * Extract content from a single file using a worker pool (asynchronous).
+ *
+ * Submits an extraction task to the worker pool. The task is executed by one of the
+ * available workers in the background, allowing other tasks to be processed concurrently.
+ *
+ * @param pool - The worker pool instance
+ * @param filePath - Path to the file to extract
+ * @param mimeTypeOrConfig - Optional MIME type or extraction configuration
+ * @param maybeConfig - Optional extraction configuration (if second param is MIME type)
+ * @returns Promise<ExtractionResult> containing extracted content and metadata
+ *
+ * @throws {Error} If the file cannot be read or extraction fails
+ *
+ * @example
+ * ```typescript
+ * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
+ *
+ * const pool = createWorkerPool(4);
+ *
+ * try {
+ *   const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
+ *   const results = await Promise.all(
+ *     files.map(f => extractFileInWorker(pool, f))
+ *   );
+ *
+ *   results.forEach((r, i) => {
+ *     console.log(`${files[i]}: ${r.content.substring(0, 100)}...`);
+ *   });
+ * } finally {
+ *   await closeWorkerPool(pool);
+ * }
+ * ```
+ */
+declare function extractFileInWorker(pool: WorkerPool, filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
+/**
+ * Extract content from multiple files in parallel using a worker pool (asynchronous).
+ *
+ * Submits multiple extraction tasks to the worker pool for concurrent processing.
+ * This is more efficient than using `extractFileInWorker` multiple times sequentially.
+ *
+ * @param pool - The worker pool instance
+ * @param paths - Array of file paths to extract
+ * @param config - Extraction configuration object (applies to all files)
+ * @returns Promise<ExtractionResult[]> array of results (one per file, in same order)
+ *
+ * @throws {Error} If any file cannot be read or extraction fails
+ *
+ * @example
+ * ```typescript
+ * import { createWorkerPool, batchExtractFilesInWorker, closeWorkerPool } from '@kreuzberg/node';
+ *
+ * const pool = createWorkerPool(4);
+ *
+ * try {
+ *   const files = ['invoice1.pdf', 'invoice2.pdf', 'invoice3.pdf'];
+ *   const results = await batchExtractFilesInWorker(pool, files, {
+ *     ocr: { backend: 'tesseract', language: 'eng' }
+ *   });
+ *
+ *   const total = results.reduce((sum, r) => sum + extractAmount(r.content), 0);
+ *   console.log(`Total: $${total}`);
+ * } finally {
+ *   await closeWorkerPool(pool);
+ * }
+ * ```
+ */
+declare function batchExtractFilesInWorker(pool: WorkerPool, paths: string[], config?: ExtractionConfig$1 | null): Promise<ExtractionResult[]>;
+/**
+ * Close a worker pool and shut down all worker threads.
+ *
+ * Should be called when the pool is no longer needed to clean up resources
+ * and gracefully shut down worker threads. Any pending tasks will be cancelled.
+ *
+ * @param pool - The worker pool instance to close
+ * @returns Promise that resolves when the pool is fully closed
+ *
+ * @throws {Error} If pool shutdown fails
+ *
+ * @example
+ * ```typescript
+ * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
+ *
+ * const pool = createWorkerPool(4);
+ *
+ * try {
+ *   const result = await extractFileInWorker(pool, 'document.pdf');
+ *   console.log(result.content);
+ * } finally {
+ *   // Clean up the pool
+ *   await closeWorkerPool(pool);
+ * }
+ * ```
+ */
+declare function closeWorkerPool(pool: WorkerPool): Promise<void>;
+declare const __version__ = "4.0.0";
-export { type EmbeddingPreset, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesSync, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileSync, getEmbeddingPreset, getExtensionsForMime, getLastErrorCode, getLastPanicContext, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
+export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };