npm - @kreuzberg/node - Versions diffs - 4.0.0-rc.15 → 4.0.0-rc.17 - Mend

@kreuzberg/node 4.0.0-rc.15 → 4.0.0-rc.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

package/dist/index.d.mts CHANGED Viewed

@@ -1,7 +1,7 @@
 import { PanicContext } from './errors.mjs';
 export { CacheError, ErrorCode, ImageProcessingError, KreuzbergError, MissingDependencyError, OcrError, ParsingError, PluginError, ValidationError } from './errors.mjs';
-import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol } from './types.mjs';
-export { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, HtmlConversionOptions, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, Metadata, OcrConfig, OcrMetadata, PageBoundary, PageConfig, PageContent, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PptxMetadata, ProcessingStage, RakeParams, Table, TesseractConfig, TextMetadata, TokenReductionConfig, XmlMetadata, YakeParams } from './types.mjs';
+import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol, OcrConfig, ChunkingConfig, ImageExtractionConfig, PdfConfig, KeywordConfig, LanguageDetectionConfig, ErrorClassification } from './types.mjs';
+export { ArchiveMetadata, Chunk, ChunkMetadata, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, HtmlConversionOptions, HtmlMetadata, HtmlPreprocessingOptions, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, Metadata, OcrMetadata, PageBoundary, PageConfig, PageContent, PageInfo, PageStructure, PageUnitType, PdfMetadata, PostProcessorConfig, PptxMetadata, ProcessingStage, RakeParams, Table, TesseractConfig, TextMetadata, TokenReductionConfig, XmlMetadata, YakeParams } from './types.mjs';
 export { GutenOcrBackend } from './ocr/guten-ocr.mjs';
 /**
@@ -97,7 +97,7 @@ declare function __resetBindingForTests(): void;
  * const result2 = extractFileSync('scanned.pdf', null, config);
  * ```
  */
-declare function extractFileSync(filePath: string, mimeType?: string | null, config?: ExtractionConfig$1 | null): ExtractionResult;
+declare function extractFileSync(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): ExtractionResult;
 /**
  * Extract content from a single file (asynchronous).
  *
@@ -133,7 +133,7 @@ declare function extractFileSync(filePath: string, mimeType?: string | null, con
  * console.log(result2.chunks); // Array of text chunks
  * ```
  */
-declare function extractFile(filePath: string, mimeType?: string | null, config?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
+declare function extractFile(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
 /**
  * Extract content from raw bytes (synchronous).
  *
@@ -161,7 +161,7 @@ declare function extractFile(filePath: string, mimeType?: string | null, config?
  * console.log(result.content);
  * ```
  */
-declare function extractBytesSync(data: Uint8Array, mimeType: string, config?: ExtractionConfig$1 | null): ExtractionResult;
+declare function extractBytesSync(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig$1 | null): ExtractionResult;
 /**
  * Extract content from raw bytes (asynchronous).
  *
@@ -189,7 +189,7 @@ declare function extractBytesSync(data: Uint8Array, mimeType: string, config?: E
  * console.log(result.content);
  * ```
  */
-declare function extractBytes(data: Uint8Array, mimeType: string, config?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
+declare function extractBytes(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
 /**
  * Extract content from multiple files in parallel (synchronous).
  *
@@ -610,23 +610,109 @@ declare function unregisterDocumentExtractor(name: string): void;
  */
 declare function clearDocumentExtractors(): void;
 /**
- * ExtractionConfig namespace with static methods for loading configuration from files.
+ * Builder class for creating ExtractionConfig objects with a fluent API.
  *
- * Provides a factory method to load extraction configuration from TOML, YAML, or JSON files.
- * The file format is automatically detected based on the file extension.
+ * Provides a convenient way to build extraction configurations using method chaining.
+ *
+ * @example
+ * ```typescript
+ * import { ExtractionConfig, extractFile } from '@kreuzberg/node';
+ *
+ * // Create with builder pattern
+ * const config = ExtractionConfig.default()
+ *   .withChunking({ maxChars: 2048 })
+ *   .withOcr({ backend: 'tesseract', language: 'eng' })
+ *   .build();
+ *
+ * const result = await extractFile('document.pdf', null, config);
+ * ```
+ */
+declare class ExtractionConfigBuilder {
+    private config;
+    /**
+     * Create a new builder with default configuration (the start of a chain).
+     */
+    static default(): ExtractionConfigBuilder;
+    /**
+     * Set the OCR configuration; returns a builder so calls can be chained.
+     */
+    withOcr(ocr: OcrConfig): ExtractionConfigBuilder;
+    /**
+     * Set the chunking configuration; returns a builder so calls can be chained.
+     */
+    withChunking(chunking: ChunkingConfig): ExtractionConfigBuilder;
+    /**
+     * Set the image extraction configuration; returns a builder so calls can be chained.
+     */
+    withImageExtraction(images: ImageExtractionConfig): ExtractionConfigBuilder;
+    /**
+     * Set the PDF configuration; returns a builder so calls can be chained.
+     */
+    withPdf(pdf: PdfConfig): ExtractionConfigBuilder;
+    /**
+     * Set the keyword extraction configuration; returns a builder so calls can be chained.
+     */
+    withKeywords(keywords: KeywordConfig): ExtractionConfigBuilder;
+    /**
+     * Set the language detection configuration; returns a builder so calls can be chained.
+     */
+    withLanguageDetection(languageDetection: LanguageDetectionConfig): ExtractionConfigBuilder;
+    /**
+     * Enable or disable metadata extraction; returns a builder so calls can be chained.
+     */
+    withMetadataExtraction(enabled: boolean): ExtractionConfigBuilder;
+    /**
+     * Enable or disable quality mode; returns a builder so calls can be chained.
+     */
+    withQualityMode(enabled: boolean): ExtractionConfigBuilder;
+    /**
+     * End the chain: build and return the final ExtractionConfig object.
+     */
+    build(): ExtractionConfig$1;
+}
+/**
+ * ExtractionConfig namespace with static methods for loading configuration from files
+ * and creating new configurations with the builder pattern.
+ *
+ * Provides factory methods to load extraction configuration from TOML, YAML, or JSON files,
+ * or to create configurations using a fluent builder API.
  *
  * @example
  * ```typescript
  * import { ExtractionConfig, extractFile } from '@kreuzberg/node';
  *
  * // Load configuration from file
- * const config = ExtractionConfig.fromFile('config.toml');
+ * const config1 = ExtractionConfig.fromFile('config.toml');
+ *
+ * // Create with builder pattern
+ * const config2 = ExtractionConfig.default()
+ *   .withChunking({ maxChars: 2048 })
+ *   .build();
  *
  * // Use with extraction
- * const result = await extractFile('document.pdf', null, config);
+ * const result = await extractFile('document.pdf', null, config2);
  * ```
  */
 declare const ExtractionConfig: {
+    /**
+     * Create a default extraction configuration using the builder pattern.
+     *
+     * Returns a builder object that allows you to configure extraction settings
+     * using method chaining.
+     *
+     * @returns ExtractionConfigBuilder for chaining configuration calls
+     *
+     * @example
+     * ```typescript
+     * import { ExtractionConfig } from '@kreuzberg/node';
+     *
+     * const config = ExtractionConfig.default()
+     *   .withChunking({ maxChars: 2048 })
+     *   .withOcr({ backend: 'tesseract', language: 'eng' })
+     *   .build();
+     * ```
+     */
+    default(): ExtractionConfigBuilder;
     /**
      * Load extraction configuration from a file.
      *
@@ -711,28 +797,30 @@ declare function detectMimeType(bytes: Buffer): string;
 /**
  * Detect MIME type from a file path.
  *
- * Uses file extension to determine MIME type. Falls back to `mime_guess` crate
- * if extension-based detection fails.
+ * Determines the MIME type based on the file extension in the provided path.
+ * By default, checks if the file exists; can be disabled with checkExists parameter.
  *
- * @param path - Path to the file (string)
- * @returns The detected MIME type string
+ * @param filePath - The file path to detect MIME type from (e.g., 'document.pdf')
+ * @param checkExists - Whether to verify the file exists (default: true)
+ * @returns The detected MIME type as a string (e.g., 'application/pdf')
  *
- * @throws {Error} If MIME type cannot be determined from path/extension
- * @throws {Error} If extension is unknown
+ * @throws {Error} If MIME type cannot be determined from the file extension,
+ * or if checkExists is true and the file does not exist
  *
  * @example
  * ```typescript
  * import { detectMimeTypeFromPath } from '@kreuzberg/node';
  *
- * // Detect from existing file
- * const mimeType = detectMimeTypeFromPath('document.pdf');
+ * // Detect MIME type from existing file
+ * const mimeType = detectMimeTypeFromPath('/path/to/document.pdf');
  * console.log(mimeType); // 'application/pdf'
  *
- * const mimeType2 = detectMimeTypeFromPath('document.docx');
+ * // Detect without checking file existence
+ * const mimeType2 = detectMimeTypeFromPath('document.docx', false);
  * console.log(mimeType2); // 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
  * ```
  */
-declare function detectMimeTypeFromPath(path: string): string;
+declare function detectMimeTypeFromPath(filePath: string, checkExists?: boolean): string;
 /**
  * Validate that a MIME type is supported by Kreuzberg.
  *
@@ -903,6 +991,75 @@ declare function getLastErrorCode(): number;
  * ```
  */
 declare function getLastPanicContext(): PanicContext | null;
-declare const __version__ = "4.0.0-rc.15";
+/**
+ * Returns the human-readable name for an error code.
+ *
+ * Maps numeric error codes to their string names, providing a consistent way
+ * to get error code names across all platforms.
+ *
+ * @param code - The numeric error code (0-7)
+ * @returns The error code name as a string (e.g., "validation", "ocr", "unknown")
+ *
+ * @example
+ * ```typescript
+ * import { getErrorCodeName } from '@kreuzberg/node';
+ *
+ * const validationName = getErrorCodeName(0);  // returns "validation"
+ * const ocrName = getErrorCodeName(2);         // returns "ocr"
+ * const unknownName = getErrorCodeName(99);    // returns "unknown"
+ * ```
+ */
+declare function getErrorCodeName(code: number): string;
+/**
+ * Returns the description for an error code.
+ *
+ * Retrieves user-friendly descriptions of error types from the FFI layer.
+ *
+ * @param code - The numeric error code (0-7)
+ * @returns A brief description of the error type
+ *
+ * @example
+ * ```typescript
+ * import { getErrorCodeDescription } from '@kreuzberg/node';
+ *
+ * const validationDesc = getErrorCodeDescription(0);  // returns "Input validation error"
+ * const ioDesc = getErrorCodeDescription(4);          // returns "File system I/O error"
+ * const unknownDesc = getErrorCodeDescription(99);    // returns "Unknown error code"
+ * ```
+ */
+declare function getErrorCodeDescription(code: number): string;
+/**
+ * Classifies an error message string into an error code category.
+ *
+ * This function analyzes the error message content and returns the most likely
+ * error code (0-7) based on keyword patterns. Used to programmatically classify
+ * errors for handling purposes.
+ *
+ * The classification is based on keyword matching:
+ * - **Validation (0)**: Keywords like "invalid", "validation", "schema", "required"
+ * - **Parsing (1)**: Keywords like "parsing", "corrupted", "malformed"
+ * - **Ocr (2)**: Keywords like "ocr", "tesseract", "language", "model"
+ * - **MissingDependency (3)**: Keywords like "not found", "missing", "dependency"
+ * - **Io (4)**: Keywords like "file", "disk", "read", "write", "permission"
+ * - **Plugin (5)**: Keywords like "plugin", "register", "extension"
+ * - **UnsupportedFormat (6)**: Keywords like "unsupported", "format", "mime"
+ * - **Internal (7)**: Keywords like "internal", "bug", "panic"
+ *
+ * @param errorMessage - The error message string to classify
+ * @returns An object with the classification details
+ *
+ * @example
+ * ```typescript
+ * import { classifyError } from '@kreuzberg/node';
+ *
+ * const parsingResult = classifyError("PDF file is corrupted");
+ * // Returns: { code: 1, name: "parsing", confidence: 0.95 }
+ *
+ * const dependencyResult = classifyError("Tesseract not found");
+ * // Returns: { code: 3, name: "missing_dependency", confidence: 0.9 }
+ * ```
+ */
+declare function classifyError(errorMessage: string): ErrorClassification;
+declare const __version__ = "4.0.0-rc.17";
-export { type EmbeddingPreset, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesSync, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileSync, getEmbeddingPreset, getExtensionsForMime, getLastErrorCode, getLastPanicContext, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
+export { ChunkingConfig, type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, ImageExtractionConfig, KeywordConfig, LanguageDetectionConfig, OcrBackendProtocol, OcrConfig, PanicContext, PdfConfig, PostProcessorProtocol, ValidatorProtocol, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };

package/dist/index.d.ts CHANGED Viewed

@@ -1,7 +1,7 @@
 import { PanicContext } from './errors.js';
 export { CacheError, ErrorCode, ImageProcessingError, KreuzbergError, MissingDependencyError, OcrError, ParsingError, PluginError, ValidationError } from './errors.js';
-import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol } from './types.js';
-export { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, HtmlConversionOptions, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, Metadata, OcrConfig, OcrMetadata, PageBoundary, PageConfig, PageContent, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PptxMetadata, ProcessingStage, RakeParams, Table, TesseractConfig, TextMetadata, TokenReductionConfig, XmlMetadata, YakeParams } from './types.js';
+import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol, OcrConfig, ChunkingConfig, ImageExtractionConfig, PdfConfig, KeywordConfig, LanguageDetectionConfig, ErrorClassification } from './types.js';
+export { ArchiveMetadata, Chunk, ChunkMetadata, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, HtmlConversionOptions, HtmlMetadata, HtmlPreprocessingOptions, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, Metadata, OcrMetadata, PageBoundary, PageConfig, PageContent, PageInfo, PageStructure, PageUnitType, PdfMetadata, PostProcessorConfig, PptxMetadata, ProcessingStage, RakeParams, Table, TesseractConfig, TextMetadata, TokenReductionConfig, XmlMetadata, YakeParams } from './types.js';
 export { GutenOcrBackend } from './ocr/guten-ocr.js';
 /**
@@ -97,7 +97,7 @@ declare function __resetBindingForTests(): void;
  * const result2 = extractFileSync('scanned.pdf', null, config);
  * ```
  */
-declare function extractFileSync(filePath: string, mimeType?: string | null, config?: ExtractionConfig$1 | null): ExtractionResult;
+declare function extractFileSync(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): ExtractionResult;
 /**
  * Extract content from a single file (asynchronous).
  *
@@ -133,7 +133,7 @@ declare function extractFileSync(filePath: string, mimeType?: string | null, con
  * console.log(result2.chunks); // Array of text chunks
  * ```
  */
-declare function extractFile(filePath: string, mimeType?: string | null, config?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
+declare function extractFile(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
 /**
  * Extract content from raw bytes (synchronous).
  *
@@ -161,7 +161,7 @@ declare function extractFile(filePath: string, mimeType?: string | null, config?
  * console.log(result.content);
  * ```
  */
-declare function extractBytesSync(data: Uint8Array, mimeType: string, config?: ExtractionConfig$1 | null): ExtractionResult;
+declare function extractBytesSync(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig$1 | null): ExtractionResult;
 /**
  * Extract content from raw bytes (asynchronous).
  *
@@ -189,7 +189,7 @@ declare function extractBytesSync(data: Uint8Array, mimeType: string, config?: E
  * console.log(result.content);
  * ```
  */
-declare function extractBytes(data: Uint8Array, mimeType: string, config?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
+declare function extractBytes(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
 /**
  * Extract content from multiple files in parallel (synchronous).
  *
@@ -610,23 +610,109 @@ declare function unregisterDocumentExtractor(name: string): void;
  */
 declare function clearDocumentExtractors(): void;
 /**
- * ExtractionConfig namespace with static methods for loading configuration from files.
+ * Builder class for creating ExtractionConfig objects with a fluent API.
  *
- * Provides a factory method to load extraction configuration from TOML, YAML, or JSON files.
- * The file format is automatically detected based on the file extension.
+ * Provides a convenient way to build extraction configurations using method chaining.
+ *
+ * @example
+ * ```typescript
+ * import { ExtractionConfig, extractFile } from '@kreuzberg/node';
+ *
+ * // Create with builder pattern
+ * const config = ExtractionConfig.default()
+ *   .withChunking({ maxChars: 2048 })
+ *   .withOcr({ backend: 'tesseract', language: 'eng' })
+ *   .build();
+ *
+ * const result = await extractFile('document.pdf', null, config);
+ * ```
+ */
+declare class ExtractionConfigBuilder {
+    private config;
+    /**
+     * Create a new builder with default configuration (the start of a chain).
+     */
+    static default(): ExtractionConfigBuilder;
+    /**
+     * Set the OCR configuration; returns a builder so calls can be chained.
+     */
+    withOcr(ocr: OcrConfig): ExtractionConfigBuilder;
+    /**
+     * Set the chunking configuration; returns a builder so calls can be chained.
+     */
+    withChunking(chunking: ChunkingConfig): ExtractionConfigBuilder;
+    /**
+     * Set the image extraction configuration; returns a builder so calls can be chained.
+     */
+    withImageExtraction(images: ImageExtractionConfig): ExtractionConfigBuilder;
+    /**
+     * Set the PDF configuration; returns a builder so calls can be chained.
+     */
+    withPdf(pdf: PdfConfig): ExtractionConfigBuilder;
+    /**
+     * Set the keyword extraction configuration; returns a builder so calls can be chained.
+     */
+    withKeywords(keywords: KeywordConfig): ExtractionConfigBuilder;
+    /**
+     * Set the language detection configuration; returns a builder so calls can be chained.
+     */
+    withLanguageDetection(languageDetection: LanguageDetectionConfig): ExtractionConfigBuilder;
+    /**
+     * Enable or disable metadata extraction; returns a builder so calls can be chained.
+     */
+    withMetadataExtraction(enabled: boolean): ExtractionConfigBuilder;
+    /**
+     * Enable or disable quality mode; returns a builder so calls can be chained.
+     */
+    withQualityMode(enabled: boolean): ExtractionConfigBuilder;
+    /**
+     * End the chain: build and return the final ExtractionConfig object.
+     */
+    build(): ExtractionConfig$1;
+}
+/**
+ * ExtractionConfig namespace with static methods for loading configuration from files
+ * and creating new configurations with the builder pattern.
+ *
+ * Provides factory methods to load extraction configuration from TOML, YAML, or JSON files,
+ * or to create configurations using a fluent builder API.
  *
  * @example
  * ```typescript
  * import { ExtractionConfig, extractFile } from '@kreuzberg/node';
  *
  * // Load configuration from file
- * const config = ExtractionConfig.fromFile('config.toml');
+ * const config1 = ExtractionConfig.fromFile('config.toml');
+ *
+ * // Create with builder pattern
+ * const config2 = ExtractionConfig.default()
+ *   .withChunking({ maxChars: 2048 })
+ *   .build();
  *
  * // Use with extraction
- * const result = await extractFile('document.pdf', null, config);
+ * const result = await extractFile('document.pdf', null, config2);
  * ```
  */
 declare const ExtractionConfig: {
+    /**
+     * Create a default extraction configuration using the builder pattern.
+     *
+     * Returns a builder object that allows you to configure extraction settings
+     * using method chaining.
+     *
+     * @returns ExtractionConfigBuilder for chaining configuration calls
+     *
+     * @example
+     * ```typescript
+     * import { ExtractionConfig } from '@kreuzberg/node';
+     *
+     * const config = ExtractionConfig.default()
+     *   .withChunking({ maxChars: 2048 })
+     *   .withOcr({ backend: 'tesseract', language: 'eng' })
+     *   .build();
+     * ```
+     */
+    default(): ExtractionConfigBuilder;
     /**
      * Load extraction configuration from a file.
      *
@@ -711,28 +797,30 @@ declare function detectMimeType(bytes: Buffer): string;
 /**
  * Detect MIME type from a file path.
  *
- * Uses file extension to determine MIME type. Falls back to `mime_guess` crate
- * if extension-based detection fails.
+ * Determines the MIME type based on the file extension in the provided path.
+ * By default, checks if the file exists; can be disabled with checkExists parameter.
  *
- * @param path - Path to the file (string)
- * @returns The detected MIME type string
+ * @param filePath - The file path to detect MIME type from (e.g., 'document.pdf')
+ * @param checkExists - Whether to verify the file exists (default: true)
+ * @returns The detected MIME type as a string (e.g., 'application/pdf')
  *
- * @throws {Error} If MIME type cannot be determined from path/extension
- * @throws {Error} If extension is unknown
+ * @throws {Error} If MIME type cannot be determined from the file extension,
+ * or if checkExists is true and the file does not exist
  *
  * @example
  * ```typescript
  * import { detectMimeTypeFromPath } from '@kreuzberg/node';
  *
- * // Detect from existing file
- * const mimeType = detectMimeTypeFromPath('document.pdf');
+ * // Detect MIME type from existing file
+ * const mimeType = detectMimeTypeFromPath('/path/to/document.pdf');
  * console.log(mimeType); // 'application/pdf'
  *
- * const mimeType2 = detectMimeTypeFromPath('document.docx');
+ * // Detect without checking file existence
+ * const mimeType2 = detectMimeTypeFromPath('document.docx', false);
  * console.log(mimeType2); // 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
  * ```
  */
-declare function detectMimeTypeFromPath(path: string): string;
+declare function detectMimeTypeFromPath(filePath: string, checkExists?: boolean): string;
 /**
  * Validate that a MIME type is supported by Kreuzberg.
  *
@@ -903,6 +991,75 @@ declare function getLastErrorCode(): number;
  * ```
  */
 declare function getLastPanicContext(): PanicContext | null;
-declare const __version__ = "4.0.0-rc.15";
+/**
+ * Returns the human-readable name for an error code.
+ *
+ * Maps numeric error codes to their string names, providing a consistent way
+ * to get error code names across all platforms.
+ *
+ * @param code - The numeric error code (0-7)
+ * @returns The error code name as a string (e.g., "validation", "ocr", "unknown")
+ *
+ * @example
+ * ```typescript
+ * import { getErrorCodeName } from '@kreuzberg/node';
+ *
+ * const validationName = getErrorCodeName(0);  // returns "validation"
+ * const ocrName = getErrorCodeName(2);         // returns "ocr"
+ * const unknownName = getErrorCodeName(99);    // returns "unknown"
+ * ```
+ */
+declare function getErrorCodeName(code: number): string;
+/**
+ * Returns the description for an error code.
+ *
+ * Retrieves user-friendly descriptions of error types from the FFI layer.
+ *
+ * @param code - The numeric error code (0-7)
+ * @returns A brief description of the error type
+ *
+ * @example
+ * ```typescript
+ * import { getErrorCodeDescription } from '@kreuzberg/node';
+ *
+ * const validationDesc = getErrorCodeDescription(0);  // returns "Input validation error"
+ * const ioDesc = getErrorCodeDescription(4);          // returns "File system I/O error"
+ * const unknownDesc = getErrorCodeDescription(99);    // returns "Unknown error code"
+ * ```
+ */
+declare function getErrorCodeDescription(code: number): string;
+/**
+ * Classifies an error message string into an error code category.
+ *
+ * This function analyzes the error message content and returns the most likely
+ * error code (0-7) based on keyword patterns. Used to programmatically classify
+ * errors for handling purposes.
+ *
+ * The classification is based on keyword matching:
+ * - **Validation (0)**: Keywords like "invalid", "validation", "schema", "required"
+ * - **Parsing (1)**: Keywords like "parsing", "corrupted", "malformed"
+ * - **Ocr (2)**: Keywords like "ocr", "tesseract", "language", "model"
+ * - **MissingDependency (3)**: Keywords like "not found", "missing", "dependency"
+ * - **Io (4)**: Keywords like "file", "disk", "read", "write", "permission"
+ * - **Plugin (5)**: Keywords like "plugin", "register", "extension"
+ * - **UnsupportedFormat (6)**: Keywords like "unsupported", "format", "mime"
+ * - **Internal (7)**: Keywords like "internal", "bug", "panic"
+ *
+ * @param errorMessage - The error message string to classify
+ * @returns An object with the classification details
+ *
+ * @example
+ * ```typescript
+ * import { classifyError } from '@kreuzberg/node';
+ *
+ * const parsingResult = classifyError("PDF file is corrupted");
+ * // Returns: { code: 1, name: "parsing", confidence: 0.95 }
+ *
+ * const dependencyResult = classifyError("Tesseract not found");
+ * // Returns: { code: 3, name: "missing_dependency", confidence: 0.9 }
+ * ```
+ */
+declare function classifyError(errorMessage: string): ErrorClassification;
+declare const __version__ = "4.0.0-rc.17";
-export { type EmbeddingPreset, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesSync, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileSync, getEmbeddingPreset, getExtensionsForMime, getLastErrorCode, getLastPanicContext, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
+export { ChunkingConfig, type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, ImageExtractionConfig, KeywordConfig, LanguageDetectionConfig, OcrBackendProtocol, OcrConfig, PanicContext, PdfConfig, PostProcessorProtocol, ValidatorProtocol, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };