npm - @kreuzberg/node - Versions diffs - 4.0.0-rc.22 → 4.0.0-rc.25 - Mend

@kreuzberg/node 4.0.0-rc.22 → 4.0.0-rc.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

package/dist/index.d.ts CHANGED

@@ -1,7 +1,7 @@
 import { PanicContext } from './errors.js';
 export { CacheError, ErrorCode, ImageProcessingError, KreuzbergError, MissingDependencyError, OcrError, ParsingError, PluginError, ValidationError } from './errors.js';
-import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol, OcrConfig, ChunkingConfig, ImageExtractionConfig, PdfConfig, KeywordConfig, LanguageDetectionConfig, ErrorClassification } from './types.js';
-export { ArchiveMetadata, Chunk, ChunkMetadata, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, HeaderMetadata, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, LinkMetadata, Metadata, OcrMetadata, PageBoundary, PageConfig, PageContent, PageInfo, PageStructure, PageUnitType, PdfMetadata, PostProcessorConfig, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, XmlMetadata, YakeParams } from './types.js';
+import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol, ErrorClassification, WorkerPool, WorkerPoolStats } from './types.js';
+export { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, HeaderMetadata, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrConfig, OcrMetadata, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, XmlMetadata, YakeParams } from './types.js';
 export { GutenOcrBackend } from './ocr/guten-ocr.js';
 /**
@@ -610,72 +610,12 @@ declare function unregisterDocumentExtractor(name: string): void;
  */
 declare function clearDocumentExtractors(): void;
 /**
- * Builder class for creating ExtractionConfig objects with a fluent API.
- *
- * Provides a convenient way to build extraction configurations using method chaining.
- *
- * @example
- * ```typescript
- * import { ExtractionConfig, extractFile } from '@kreuzberg/node';
- *
- * // Create with builder pattern
- * const config = ExtractionConfig.default()
- *   .withChunking({ maxChars: 2048 })
- *   .withOcr({ backend: 'tesseract', language: 'eng' })
- *   .build();
- *
- * const result = await extractFile('document.pdf', null, config);
- * ```
- */
-declare class ExtractionConfigBuilder {
-    private config;
-    /**
-     * Create a new builder with default configuration.
-     */
-    static default(): ExtractionConfigBuilder;
-    /**
-     * Set OCR configuration.
-     */
-    withOcr(ocr: OcrConfig): ExtractionConfigBuilder;
-    /**
-     * Set chunking configuration.
-     */
-    withChunking(chunking: ChunkingConfig): ExtractionConfigBuilder;
-    /**
-     * Set image extraction configuration.
-     */
-    withImageExtraction(images: ImageExtractionConfig): ExtractionConfigBuilder;
-    /**
-     * Set PDF configuration.
-     */
-    withPdf(pdf: PdfConfig): ExtractionConfigBuilder;
-    /**
-     * Set keyword extraction configuration.
-     */
-    withKeywords(keywords: KeywordConfig): ExtractionConfigBuilder;
-    /**
-     * Set language detection configuration.
-     */
-    withLanguageDetection(languageDetection: LanguageDetectionConfig): ExtractionConfigBuilder;
-    /**
-     * Set whether to enable metadata extraction.
-     */
-    withMetadataExtraction(enabled: boolean): ExtractionConfigBuilder;
-    /**
-     * Set whether to enable quality mode.
-     */
-    withQualityMode(enabled: boolean): ExtractionConfigBuilder;
-    /**
-     * Build and return the final ExtractionConfig object.
-     */
-    build(): ExtractionConfig$1;
-}
-/**
- * ExtractionConfig namespace with static methods for loading configuration from files
- * and creating new configurations with the builder pattern.
+ * ExtractionConfig namespace with static methods for loading configuration from files.
  *
  * Provides factory methods to load extraction configuration from TOML, YAML, or JSON files,
- * or to create configurations using a fluent builder API.
+ * or to discover configuration files in the current directory tree.
+ *
+ * For creating configurations programmatically, use plain TypeScript objects instead:
  *
  * @example
  * ```typescript
@@ -684,35 +624,17 @@ declare class ExtractionConfigBuilder {
  * // Load configuration from file
  * const config1 = ExtractionConfig.fromFile('config.toml');
  *
- * // Create with builder pattern
- * const config2 = ExtractionConfig.default()
- *   .withChunking({ maxChars: 2048 })
- *   .build();
+ * // Or create with plain object
+ * const config2 = {
+ *   chunking: { maxChars: 2048 },
+ *   ocr: { backend: 'tesseract', language: 'eng' }
+ * };
  *
  * // Use with extraction
  * const result = await extractFile('document.pdf', null, config2);
  * ```
  */
 declare const ExtractionConfig: {
-    /**
-     * Create a default extraction configuration using the builder pattern.
-     *
-     * Returns a builder object that allows you to configure extraction settings
-     * using method chaining.
-     *
-     * @returns ExtractionConfigBuilder for chaining configuration calls
-     *
-     * @example
-     * ```typescript
-     * import { ExtractionConfig } from '@kreuzberg/node';
-     *
-     * const config = ExtractionConfig.default()
-     *   .withChunking({ maxChars: 2048 })
-     *   .withOcr({ backend: 'tesseract', language: 'eng' })
-     *   .build();
-     * ```
-     */
-    default(): ExtractionConfigBuilder;
     /**
      * Load extraction configuration from a file.
      *
@@ -1060,6 +982,151 @@ declare function getErrorCodeDescription(code: number): string;
  * ```
  */
 declare function classifyError(errorMessage: string): ErrorClassification;
-declare const __version__ = "4.0.0-rc.22";
+/**
+ * Create a worker pool for concurrent file extraction.
+ *
+ * The worker pool manages a set of background worker threads that can process
+ * extraction requests concurrently, improving throughput when handling multiple files.
+ *
+ * @param size - Optional number of worker threads (defaults to CPU count). Must be > 0
+ * @returns A WorkerPool instance to use with extraction functions
+ *
+ * @throws {Error} If size is invalid or pool creation fails
+ *
+ * @example
+ * ```typescript
+ * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
+ *
+ * // Create pool with 4 workers
+ * const pool = createWorkerPool(4);
+ *
+ * try {
+ *   const result = await extractFileInWorker(pool, 'document.pdf');
+ *   console.log(result.content);
+ * } finally {
+ *   // Always close the pool when done
+ *   await closeWorkerPool(pool);
+ * }
+ * ```
+ */
+declare function createWorkerPool(size?: number): WorkerPool;
+/**
+ * Get statistics about a worker pool.
+ *
+ * Returns information about the pool's current state, including the number of active workers,
+ * queued tasks, and total processed tasks.
+ *
+ * @param pool - The worker pool instance
+ * @returns WorkerPoolStats with pool information
+ *
+ * @example
+ * ```typescript
+ * import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node';
+ *
+ * const pool = createWorkerPool(4);
+ * const stats = getWorkerPoolStats(pool);
+ *
+ * console.log(`Pool size: ${stats.size}`);
+ * console.log(`Active workers: ${stats.activeWorkers}`);
+ * console.log(`Queued tasks: ${stats.queuedTasks}`);
+ * ```
+ */
+declare function getWorkerPoolStats(pool: WorkerPool): WorkerPoolStats;
+/**
+ * Extract content from a single file using a worker pool (asynchronous).
+ *
+ * Submits an extraction task to the worker pool. The task is executed by one of the
+ * available workers in the background, allowing other tasks to be processed concurrently.
+ *
+ * @param pool - The worker pool instance
+ * @param filePath - Path to the file to extract
+ * @param mimeTypeOrConfig - Optional MIME type or extraction configuration
+ * @param maybeConfig - Optional extraction configuration (if second param is MIME type)
+ * @returns Promise<ExtractionResult> containing extracted content and metadata
+ *
+ * @throws {Error} If the file cannot be read or extraction fails
+ *
+ * @example
+ * ```typescript
+ * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
+ *
+ * const pool = createWorkerPool(4);
+ *
+ * try {
+ *   const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
+ *   const results = await Promise.all(
+ *     files.map(f => extractFileInWorker(pool, f))
+ *   );
+ *
+ *   results.forEach((r, i) => {
+ *     console.log(`${files[i]}: ${r.content.substring(0, 100)}...`);
+ *   });
+ * } finally {
+ *   await closeWorkerPool(pool);
+ * }
+ * ```
+ */
+declare function extractFileInWorker(pool: WorkerPool, filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
+/**
+ * Extract content from multiple files in parallel using a worker pool (asynchronous).
+ *
+ * Submits multiple extraction tasks to the worker pool for concurrent processing.
+ * This is more efficient than using `extractFileInWorker` multiple times sequentially.
+ *
+ * @param pool - The worker pool instance
+ * @param paths - Array of file paths to extract
+ * @param config - Extraction configuration object (applies to all files)
+ * @returns Promise<ExtractionResult[]> array of results (one per file, in same order)
+ *
+ * @throws {Error} If any file cannot be read or extraction fails
+ *
+ * @example
+ * ```typescript
+ * import { createWorkerPool, batchExtractFilesInWorker, closeWorkerPool } from '@kreuzberg/node';
+ *
+ * const pool = createWorkerPool(4);
+ *
+ * try {
+ *   const files = ['invoice1.pdf', 'invoice2.pdf', 'invoice3.pdf'];
+ *   const results = await batchExtractFilesInWorker(pool, files, {
+ *     ocr: { backend: 'tesseract', language: 'eng' }
+ *   });
+ *
+ *   const total = results.reduce((sum, r) => sum + extractAmount(r.content), 0);
+ *   console.log(`Total: $${total}`);
+ * } finally {
+ *   await closeWorkerPool(pool);
+ * }
+ * ```
+ */
+declare function batchExtractFilesInWorker(pool: WorkerPool, paths: string[], config?: ExtractionConfig$1 | null): Promise<ExtractionResult[]>;
+/**
+ * Close a worker pool and shut down all worker threads.
+ *
+ * Should be called when the pool is no longer needed to clean up resources
+ * and gracefully shut down worker threads. Any pending tasks will be cancelled.
+ *
+ * @param pool - The worker pool instance to close
+ * @returns Promise that resolves when the pool is fully closed
+ *
+ * @throws {Error} If pool shutdown fails
+ *
+ * @example
+ * ```typescript
+ * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
+ *
+ * const pool = createWorkerPool(4);
+ *
+ * try {
+ *   const result = await extractFileInWorker(pool, 'document.pdf');
+ *   console.log(result.content);
+ * } finally {
+ *   // Clean up the pool
+ *   await closeWorkerPool(pool);
+ * }
+ * ```
+ */
+declare function closeWorkerPool(pool: WorkerPool): Promise<void>;
+declare const __version__ = "4.0.0-rc.25";
-export { ChunkingConfig, type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, ImageExtractionConfig, KeywordConfig, LanguageDetectionConfig, OcrBackendProtocol, OcrConfig, PanicContext, PdfConfig, PostProcessorProtocol, ValidatorProtocol, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
+export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };

package/dist/index.js CHANGED

@@ -36,17 +36,21 @@ __export(index_exports, {
   batchExtractBytes: () => batchExtractBytes,
   batchExtractBytesSync: () => batchExtractBytesSync,
   batchExtractFiles: () => batchExtractFiles,
+  batchExtractFilesInWorker: () => batchExtractFilesInWorker,
   batchExtractFilesSync: () => batchExtractFilesSync,
   classifyError: () => classifyError,
   clearDocumentExtractors: () => clearDocumentExtractors,
   clearOcrBackends: () => clearOcrBackends,
   clearPostProcessors: () => clearPostProcessors,
   clearValidators: () => clearValidators,
+  closeWorkerPool: () => closeWorkerPool,
+  createWorkerPool: () => createWorkerPool,
   detectMimeType: () => detectMimeType,
   detectMimeTypeFromPath: () => detectMimeTypeFromPath,
   extractBytes: () => extractBytes,
   extractBytesSync: () => extractBytesSync,
   extractFile: () => extractFile,
+  extractFileInWorker: () => extractFileInWorker,
   extractFileSync: () => extractFileSync,
   getEmbeddingPreset: () => getEmbeddingPreset,
   getErrorCodeDescription: () => getErrorCodeDescription,
@@ -54,6 +58,7 @@ __export(index_exports, {
   getExtensionsForMime: () => getExtensionsForMime,
   getLastErrorCode: () => getLastErrorCode,
   getLastPanicContext: () => getLastPanicContext,
+  getWorkerPoolStats: () => getWorkerPoolStats,
   listDocumentExtractors: () => listDocumentExtractors,
   listEmbeddingPresets: () => listEmbeddingPresets,
   listOcrBackends: () => listOcrBackends,
@@ -133,7 +138,16 @@ function __resetBindingForTests() {
   bindingInitialized = false;
 }
 function loadNativeBinding() {
-  const localRequire = typeof require !== "undefined" ? require : (0, import_node_module.createRequire)(import_meta.url);
+  let localRequire;
+  if (typeof require !== "undefined") {
+    localRequire = require;
+  } else {
+    try {
+      localRequire = (0, import_node_module.createRequire)(import_meta.url);
+    } catch {
+      localRequire = void 0;
+    }
+  }
   if (!localRequire) {
     throw new Error("Unable to resolve native binding loader (require not available).");
   }
@@ -317,9 +331,9 @@ function convertResult(rawResult) {
       metadata: {},
       tables: [],
       detectedLanguages: null,
-      chunks: void 0,
-      images: void 0,
-      pages: void 0
+      chunks: null,
+      images: null,
+      pages: null
     };
   }
   const result = rawResult;
@@ -335,9 +349,9 @@ function convertResult(rawResult) {
     tables: Array.isArray(result["tables"]) ? result["tables"] : [],
     // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
     detectedLanguages: Array.isArray(result["detectedLanguages"]) ? result["detectedLanguages"] : null,
-    chunks: void 0,
-    images: void 0,
-    pages: void 0
+    chunks: null,
+    images: null,
+    pages: null
   };
   const chunksData = result["chunks"];
   if (Array.isArray(chunksData)) {
@@ -515,9 +529,9 @@ function normalizePageConfig(pages) {
     return void 0;
   }
   const normalized = {};
-  setIfDefined(normalized, "extract_pages", pages.extractPages);
-  setIfDefined(normalized, "insert_page_markers", pages.insertPageMarkers);
-  setIfDefined(normalized, "marker_format", pages.markerFormat);
+  setIfDefined(normalized, "extractPages", pages.extractPages);
+  setIfDefined(normalized, "insertPageMarkers", pages.insertPageMarkers);
+  setIfDefined(normalized, "markerFormat", pages.markerFormat);
   return normalized;
 }
 function normalizeExtractionConfig(config) {
@@ -813,99 +827,7 @@ function clearDocumentExtractors() {
   const binding2 = getBinding();
   binding2.clearDocumentExtractors();
 }
-class ExtractionConfigBuilder {
-  config = {};
-  /**
-   * Create a new builder with default configuration.
-   */
-  static default() {
-    return new ExtractionConfigBuilder();
-  }
-  /**
-   * Set OCR configuration.
-   */
-  withOcr(ocr) {
-    this.config["ocr"] = ocr;
-    return this;
-  }
-  /**
-   * Set chunking configuration.
-   */
-  withChunking(chunking) {
-    this.config["chunking"] = chunking;
-    return this;
-  }
-  /**
-   * Set image extraction configuration.
-   */
-  withImageExtraction(images) {
-    this.config["imageExtraction"] = images;
-    return this;
-  }
-  /**
-   * Set PDF configuration.
-   */
-  withPdf(pdf) {
-    this.config["pdf"] = pdf;
-    return this;
-  }
-  /**
-   * Set keyword extraction configuration.
-   */
-  withKeywords(keywords) {
-    this.config["keywords"] = keywords;
-    return this;
-  }
-  /**
-   * Set language detection configuration.
-   */
-  withLanguageDetection(languageDetection) {
-    this.config["languageDetection"] = languageDetection;
-    return this;
-  }
-  /**
-   * Set whether to enable metadata extraction.
-   */
-  withMetadataExtraction(enabled) {
-    this.config["metadataExtraction"] = enabled;
-    return this;
-  }
-  /**
-   * Set whether to enable quality mode.
-   */
-  withQualityMode(enabled) {
-    this.config["qualityMode"] = enabled;
-    return this;
-  }
-  /**
-   * Build and return the final ExtractionConfig object.
-   */
-  build() {
-    return this.config;
-  }
-}
 const ExtractionConfig = {
-  /**
-   * Create a default extraction configuration using the builder pattern.
-   *
-   * Returns a builder object that allows you to configure extraction settings
-   * using method chaining.
-   *
-   * @returns ExtractionConfigBuilder for chaining configuration calls
-   *
-   * @example
-   * ```typescript
-   * import { ExtractionConfig } from '@kreuzberg/node';
-   *
-   * const config = ExtractionConfig.default()
-   *   .withChunking({ maxChars: 2048 })
-   *   .withOcr({ backend: 'tesseract', language: 'eng' })
-   *   .build();
-   * ```
-   */
-  default() {
-    return ExtractionConfigBuilder.default();
-  },
   /**
    * Load extraction configuration from a file.
    *
@@ -1014,7 +936,54 @@ function classifyError(errorMessage) {
   const result = binding2.classifyError(errorMessage);
   return result;
 }
-const __version__ = "4.0.0-rc.22";
+function createWorkerPool(size) {
+  const binding2 = getBinding();
+  const rawPool = binding2.createWorkerPool(size);
+  return rawPool;
+}
+function getWorkerPoolStats(pool) {
+  const binding2 = getBinding();
+  const rawStats = binding2.getWorkerPoolStats(pool);
+  return rawStats;
+}
+async function extractFileInWorker(pool, filePath, mimeTypeOrConfig, maybeConfig) {
+  let mimeType = null;
+  let config = null;
+  if (typeof mimeTypeOrConfig === "string") {
+    mimeType = mimeTypeOrConfig;
+    config = maybeConfig ?? null;
+  } else if (mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object") {
+    config = mimeTypeOrConfig;
+    mimeType = null;
+  } else {
+    config = maybeConfig ?? null;
+    mimeType = null;
+  }
+  const normalizedConfig = normalizeExtractionConfig(config);
+  const binding2 = getBinding();
+  const rawResult = await binding2.extractFileInWorker(
+    pool,
+    filePath,
+    mimeType,
+    normalizedConfig
+  );
+  return convertResult(rawResult);
+}
+async function batchExtractFilesInWorker(pool, paths, config = null) {
+  const normalizedConfig = normalizeExtractionConfig(config);
+  const binding2 = getBinding();
+  const rawResults = await binding2.batchExtractFilesInWorker(
+    pool,
+    paths,
+    normalizedConfig
+  );
+  return rawResults.map(convertResult);
+}
+async function closeWorkerPool(pool) {
+  const binding2 = getBinding();
+  await binding2.closeWorkerPool(pool);
+}
+const __version__ = "4.0.0-rc.25";
 // Annotate the CommonJS export names for ESM import in node:
 0 && (module.exports = {
   CacheError,
@@ -1034,17 +1003,21 @@ const __version__ = "4.0.0-rc.22";
   batchExtractBytes,
   batchExtractBytesSync,
   batchExtractFiles,
+  batchExtractFilesInWorker,
   batchExtractFilesSync,
   classifyError,
   clearDocumentExtractors,
   clearOcrBackends,
   clearPostProcessors,
   clearValidators,
+  closeWorkerPool,
+  createWorkerPool,
   detectMimeType,
   detectMimeTypeFromPath,
   extractBytes,
   extractBytesSync,
   extractFile,
+  extractFileInWorker,
   extractFileSync,
   getEmbeddingPreset,
   getErrorCodeDescription,
@@ -1052,6 +1025,7 @@ const __version__ = "4.0.0-rc.22";
   getExtensionsForMime,
   getLastErrorCode,
   getLastPanicContext,
+  getWorkerPoolStats,
   listDocumentExtractors,
   listEmbeddingPresets,
   listOcrBackends,