npm - @kreuzberg/node - Versions diffs - 4.0.8 → 4.1.0 - Mend

@kreuzberg/node 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

package/README.md +1 -1
package/dist/cli.js +6 -4
package/dist/cli.js.map +1 -1
package/dist/cli.mjs +13 -5
package/dist/cli.mjs.map +1 -1
package/dist/errors.js +26 -24
package/dist/errors.js.map +1 -1
package/dist/errors.mjs +25 -24
package/dist/errors.mjs.map +1 -1
package/dist/index.d.mts +608 -535
package/dist/index.d.ts +608 -535
package/dist/index.js +682 -338
package/dist/index.js.map +1 -1
package/dist/index.mjs +662 -334
package/dist/index.mjs.map +1 -1
package/dist/ocr/guten-ocr.js +4 -2
package/dist/ocr/guten-ocr.js.map +1 -1
package/dist/ocr/guten-ocr.mjs +3 -2
package/dist/ocr/guten-ocr.mjs.map +1 -1
package/dist/types.js +2 -0
package/dist/types.js.map +1 -1
package/index.d.ts +77 -178
package/index.js +54 -52
package/package.json +7 -7

package/index.d.ts CHANGED

@@ -135,33 +135,6 @@ export declare function batchExtractFiles(paths: Array<string>, config?: JsExtra
  */
 export declare function batchExtractFilesInWorker(pool: JsWorkerPool, filePaths: Array<string>, config?: JsExtractionConfig | undefined | null): Promise<Array<JsExtractionResult>>
-/**
- * Batch extract from multiple files (synchronous).
- *
- * Synchronously processes multiple files in parallel using Rayon. Significantly
- * faster than sequential processing for large batches.
- *
- * # Parameters
- *
- * * `paths` - Array of file paths to extract
- * * `config` - Optional extraction configuration (applied to all files)
- *
- * # Returns
- *
- * Array of `ExtractionResult` in the same order as input paths.
- *
- * # Example
- *
- * ```typescript
- * import { batchExtractFilesSync } from '@kreuzberg/node';
- *
- * const files = ['doc1.pdf', 'doc2.docx', 'doc3.txt'];
- * const results = batchExtractFilesSync(files, null);
- * results.forEach((result, i) => {
- *   console.log(`File ${files[i]}: ${result.content.substring(0, 100)}...`);
- * });
- * ```
- */
 export declare function batchExtractFilesSync(paths: Array<string>, config?: JsExtractionConfig | undefined | null): Array<JsExtractionResult>
 export declare function classifyError(errorMessage: string): ErrorClassification
@@ -183,21 +156,7 @@ export declare function classifyError(errorMessage: string): ErrorClassification
  */
 export declare function clearDocumentExtractors(): void
-/**
- * Clear all registered OCR backends.
- *
- * Removes all OCR backends from the registry, including built-in backends.
- * Use with caution as this will make OCR functionality unavailable until
- * backends are re-registered.
- *
- * # Example
- *
- * ```typescript
- * import { clearOcrBackends } from 'kreuzberg';
- *
- * clearOcrBackends();
- * ```
- */
+/** Clear all registered OCR backends */
 export declare function clearOcrBackends(): void
 /** Clear all registered postprocessors */
@@ -329,15 +288,14 @@ export declare function createWorkerPool(size?: number | undefined | null): JsWo
  * # Example
  *
  * ```typescript
- * import { detectMimeType } from 'kreuzberg';
+ * import { detectMimeTypeFromBytes } from 'kreuzberg';
  * import * as fs from 'fs';
  *
  * // Read file content
  * const content = fs.readFileSync('document.pdf');
  *
  * // Detect MIME type from bytes
- * const mimeType = detectMimeType(content);
- * console.log(mimeType); // 'application/pdf'
+ * const mimeType = detectMimeTypeFromBytes(content);
  * ```
  */
 export declare function detectMimeTypeFromBytes(bytes: Buffer): string
@@ -345,68 +303,83 @@ export declare function detectMimeTypeFromBytes(bytes: Buffer): string
 /**
  * Detect MIME type from a file path.
  *
- * Uses file extension to determine MIME type. Falls back to `mime_guess` crate
- * if extension-based detection fails.
+ * Determines the MIME type based on the file extension in the provided path.
+ * By default, checks if the file exists; can be disabled with check_exists parameter.
  *
  * # Parameters
  *
- * * `path` - Path to the file (string)
- * * `check_exists` - Whether to verify file existence (default: true)
+ * * `path` - The file path to detect MIME type from (e.g., 'document.pdf')
+ * * `check_exists` - Whether to verify the file exists (default: true)
  *
  * # Returns
  *
- * The detected MIME type string.
+ * The detected MIME type as a string (e.g., 'application/pdf').
  *
  * # Errors
  *
- * Throws an error if:
- * - File doesn't exist (when check_exists is true)
- * - MIME type cannot be determined from path/extension
- * - Extension is unknown
+ * Throws an error if MIME type cannot be determined from the file extension,
+ * or if check_exists is true and the file does not exist.
  *
  * # Example
  *
  * ```typescript
  * import { detectMimeTypeFromPath } from 'kreuzberg';
  *
- * // Detect from existing file
- * const mimeType = detectMimeTypeFromPath('document.pdf');
- * console.log(mimeType); // 'application/pdf'
+ * // Detect MIME type from existing file
+ * const mimeType = detectMimeTypeFromPath('/path/to/document.pdf');
  *
- * const mimeType2 = detectMimeTypeFromPath('document.docx');
- * console.log(mimeType2); // 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
+ * // Detect without checking file existence
+ * const mimeType2 = detectMimeTypeFromPath('document.docx', false);
  * ```
  */
 export declare function detectMimeTypeFromPath(path: string, checkExists?: boolean | undefined | null): string
 /**
- * Discover and load extraction configuration from current or parent directories.
+ * Discover extraction configuration file in current directory or parent directories.
  *
- * Searches for a `kreuzberg.toml` file starting from the current working directory
- * and traversing up the directory tree. Returns the first configuration file found.
+ * Searches for configuration files in the following order:
+ * 1. `kreuzberg.toml`
+ * 2. `kreuzberg.yaml` / `kreuzberg.yml`
+ * 3. `kreuzberg.json`
+ * 4. Searches parent directories up to the filesystem root
+ *
+ * Returns the first configuration file found or throws an error if none found.
  *
  * # Returns
  *
- * `JsExtractionConfig` object if a configuration file is found, or `null` if no
- * configuration file exists in the current or parent directories.
+ * `JsExtractionConfig` object with discovered configuration.
+ *
+ * # Errors
+ *
+ * Throws an error if no configuration file is found.
  *
  * # Example
  *
  * ```typescript
- * import { ExtractionConfig } from 'kreuzberg';
+ * import { discoverExtractionConfig } from 'kreuzberg';
  *
- * // Try to find config in current or parent directories
- * const config = ExtractionConfig.discover();
- * if (config) {
- *   console.log('Found configuration');
- *   // Use config for extraction
- * } else {
- *   console.log('No configuration file found, using defaults');
- * }
+ * // Automatically finds kreuzberg.toml or kreuzberg.yaml in current or parent directories
+ * const config = discoverExtractionConfig();
+ * const result = await extractFile('document.pdf', null, config);
  * ```
  */
 export declare function discoverExtractionConfig(): JsExtractionConfig | null
+export interface EmbeddingPreset {
+  /** Name of the preset (e.g., "fast", "balanced", "quality", "multilingual") */
+  name: string
+  /** Recommended chunk size in characters */
+  chunkSize: number
+  /** Recommended overlap in characters */
+  overlap: number
+  /** Model identifier (e.g., "AllMiniLML6V2Q", "BGEBaseENV15") */
+  modelName: string
+  /** Embedding vector dimensions */
+  dimensions: number
+  /** Human-readable description of the preset */
+  description: string
+}
 /**
  * Embedding preset configuration for TypeScript bindings.
  *
@@ -604,60 +577,42 @@ export declare function extractFile(filePath: string, mimeType?: string | undefi
  */
 export declare function extractFileInWorker(pool: JsWorkerPool, filePath: string, password?: string | undefined | null, config?: JsExtractionConfig | undefined | null): Promise<JsExtractionResult>
+export declare function extractFileSync(filePath: string, mimeType?: string | undefined | null, config?: JsExtractionConfig | undefined | null): JsExtractionResult
 /**
- * Extract content from a file (synchronous).
+ * Get a specific embedding preset by name.
  *
- * Synchronously extracts text, tables, images, and metadata from a document file.
- * Supports 118+ file formats including PDFs, Office documents, images, and more.
+ * Returns a preset configuration object, or null if the preset name is not found.
  *
- * # Parameters
+ * # Arguments
  *
- * * `file_path` - Path to the file to extract (absolute or relative)
- * * `mime_type` - Optional MIME type hint (auto-detected if omitted)
- * * `config` - Optional extraction configuration (OCR, chunking, etc.)
+ * * `name` - The preset name (case-sensitive)
  *
  * # Returns
  *
- * `ExtractionResult` containing:
- * - `content`: Extracted text content
- * - `mimeType`: Detected MIME type
- * - `metadata`: File metadata (author, title, etc.)
- * - `tables`: Extracted tables (if any)
- * - `images`: Extracted images (if configured)
- * - `chunks`: Text chunks (if chunking enabled)
- * - `detectedLanguages`: Detected languages (if enabled)
- *
- * # Errors
+ * An `EmbeddingPreset` object with the following properties:
+ * - `name`: string - Preset name
+ * - `chunkSize`: number - Recommended chunk size in characters
+ * - `overlap`: number - Recommended overlap in characters
+ * - `modelName`: string - Model identifier
+ * - `dimensions`: number - Embedding vector dimensions
+ * - `description`: string - Human-readable description
  *
- * Throws an error if:
- * - File does not exist or is not accessible
- * - File format is unsupported
- * - File is corrupted or malformed
- * - OCR processing fails (if enabled)
+ * Returns `null` if preset name is not found.
  *
  * # Example
  *
  * ```typescript
- * import { extractFileSync, ExtractionConfig } from '@kreuzberg/node';
- *
- * // Basic extraction
- * const result = extractFileSync('document.pdf', null, null);
- * console.log(result.content);
- *
- * // With MIME type hint
- * const result2 = extractFileSync('file.bin', 'application/pdf', null);
+ * import { getEmbeddingPreset } from 'kreuzberg';
  *
- * // With OCR enabled
- * const config: ExtractionConfig = {
- *   ocr: {
- *     backend: 'tesseract',
- *     language: 'eng',
- *   }
- * };
- * const result3 = extractFileSync('scanned.pdf', null, config);
+ * const preset = getEmbeddingPreset('balanced');
+ * if (preset) {
+ *   console.log(`Model: ${preset.modelName}, Dims: ${preset.dimensions}`);
+ *   // Model: BGEBaseENV15, Dims: 768
+ * }
  * ```
  */
-export declare function extractFileSync(filePath: string, mimeType?: string | undefined | null, config?: JsExtractionConfig | undefined | null): JsExtractionResult
+export declare function getEmbeddingPreset(name: string): EmbeddingPreset | null
 /**
  * Get a specific embedding preset by name.
@@ -1195,25 +1150,6 @@ export interface JsYakeParams {
   windowSize?: number
 }
-/**
- * List all registered document extractors.
- *
- * Returns an array of names of all currently registered document extractors,
- * including built-in extractors for PDF, Office documents, images, etc.
- *
- * # Returns
- *
- * Array of document extractor names.
- *
- * # Example
- *
- * ```typescript
- * import { listDocumentExtractors } from 'kreuzberg';
- *
- * const extractors = listDocumentExtractors();
- * console.log(extractors); // ['PDFExtractor', 'ImageExtractor', ...]
- * ```
- */
 export declare function listDocumentExtractors(): Array<string>
 /**
@@ -1237,24 +1173,26 @@ export declare function listDocumentExtractors(): Array<string>
 export declare function listEmbeddingPresets(): Array<string>
 /**
- * List all registered OCR backends.
+ * List all available embedding preset names.
  *
- * Returns an array of names of all currently registered OCR backends,
- * including built-in backends like "tesseract".
+ * Returns an array of preset names that can be used with `getEmbeddingPreset`.
  *
  * # Returns
  *
- * Array of OCR backend names.
+ * Array of 4 preset names: ["fast", "balanced", "quality", "multilingual"]
  *
  * # Example
  *
  * ```typescript
- * import { listOcrBackends } from 'kreuzberg';
+ * import { listEmbeddingPresets } from 'kreuzberg';
  *
- * const backends = listOcrBackends();
- * console.log(backends); // ['tesseract', 'my-custom-backend', ...]
+ * const presets = listEmbeddingPresets();
+ * console.log(presets); // ['fast', 'balanced', 'quality', 'multilingual']
  * ```
  */
+export declare function listEmbeddingPresets(): Array<string>
+/** List all registered OCR backends */
 export declare function listOcrBackends(): Array<string>
 /** List all registered post-processors */
@@ -1451,25 +1389,7 @@ export declare function registerValidator(validator: object): void
  */
 export declare function unregisterDocumentExtractor(name: string): void
-/**
- * Unregister an OCR backend by name.
- *
- * Removes the specified OCR backend from the registry. If the backend doesn't exist,
- * this operation is a no-op (does not throw an error).
- *
- * # Parameters
- *
- * * `name` - Name of the OCR backend to unregister
- *
- * # Example
- *
- * ```typescript
- * import { unregisterOcrBackend } from 'kreuzberg';
- *
- * // Unregister a custom backend
- * unregisterOcrBackend('my-custom-ocr');
- * ```
- */
+/** Unregister an OCR backend by name */
 export declare function unregisterOcrBackend(name: string): void
 /** Unregister a postprocessor by name */
@@ -1623,27 +1543,6 @@ export declare function validateLanguageCode(code: string): boolean
  * # Errors
  *
  * Throws an error if the MIME type is not supported.
- *
- * # Example
- *
- * ```typescript
- * import { validateMimeType } from 'kreuzberg';
- *
- * // Validate supported type
- * const validated = validateMimeType('application/pdf');
- * console.log(validated); // 'application/pdf'
- *
- * // Validate custom image type
- * const validated2 = validateMimeType('image/custom-format');
- * console.log(validated2); // 'image/custom-format' (any image/* is valid)
- *
- * // Validate unsupported type (throws error)
- * try {
- *   validateMimeType('video/mp4');
- * } catch (err) {
- *   console.error(err); // Error: Unsupported format: video/mp4
- * }
- * ```
  */
 export declare function validateMimeType(mimeType: string): string