npm - @kreuzberg/node - Versions diffs - 4.0.0-rc.8 → 4.0.1 - Mend

@kreuzberg/node 4.0.0-rc.8 → 4.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

package/dist/types.d.ts CHANGED Viewed

@@ -4,108 +4,316 @@
  * These types mirror the strongly-typed Rust metadata structures,
  * providing type safety for TypeScript users.
  */
+/**
+ * Tesseract OCR engine configuration options.
+ *
+ * @example
+ * ```typescript
+ * const config: TesseractConfig = {
+ *   psm: 6,
+ *   enableTableDetection: true,
+ *   tesseditCharWhitelist: '0123456789'
+ * };
+ * ```
+ */
 interface TesseractConfig {
+    /**
+     * Page Segmentation Mode (0-13). Controls how Tesseract segments and recognizes text.
+     * Common values: 3 (auto), 6 (single uniform block), 11 (sparse text).
+     * Default: 3 (auto layout analysis).
+     */
     psm?: number;
+    /**
+     * Enable table detection during OCR processing.
+     * When true, Tesseract attempts to preserve table structure in the output.
+     * Default: false.
+     */
     enableTableDetection?: boolean;
+    /**
+     * Whitelist of characters Tesseract should recognize.
+     * Only these characters will be returned by the OCR engine.
+     * Use empty string to allow all characters. Useful for constraining output to digits,
+     * specific alphabets, or other character sets.
+     * Default: null (recognize all).
+     */
     tesseditCharWhitelist?: string;
 }
+/**
+ * OCR (Optical Character Recognition) configuration.
+ *
+ * Controls which OCR engine to use and how it processes images.
+ */
 interface OcrConfig {
+    /** OCR backend name (e.g., 'tesseract', 'paddleocr', 'easyocr'). Required. */
     backend: string;
+    /** ISO 639-1 or ISO 639-3 language code(s) for OCR (e.g., 'eng', 'fra', 'deu'). Default: 'eng'. */
     language?: string;
+    /** Tesseract engine-specific configuration options. Only used when backend is 'tesseract'. */
     tesseractConfig?: TesseractConfig;
 }
+/**
+ * Document chunking configuration for splitting large documents.
+ *
+ * Breaks large documents into smaller, manageable chunks while preserving context.
+ * Useful for RAG (Retrieval Augmented Generation) and vector database indexing.
+ */
 interface ChunkingConfig {
+    /** Maximum characters per chunk. Default: 4096. */
     maxChars?: number;
+    /** Maximum overlapping characters between consecutive chunks for context preservation. Default: 512. */
     maxOverlap?: number;
+    /**
+     * Alternative to maxChars: chunk size expressed in a different unit.
+     * Mutually exclusive with maxChars.
+     */
     chunkSize?: number;
+    /**
+     * Alternative to maxOverlap: overlap amount expressed in a different unit.
+     * Mutually exclusive with maxOverlap.
+     */
     chunkOverlap?: number;
+    /**
+     * Named preset configuration (e.g., 'default', 'aggressive', 'minimal').
+     * Uses preset values if neither maxChars nor chunkSize is specified.
+     */
     preset?: string;
+    /** Embedding configuration for generating vector embeddings for each chunk. */
     embedding?: Record<string, unknown>;
+    /** Enable or disable chunking. Default: true when chunking config is provided. */
     enabled?: boolean;
 }
+/**
+ * Language detection configuration.
+ *
+ * Automatically detects the language(s) of extracted content.
+ */
 interface LanguageDetectionConfig {
+    /** Enable automatic language detection. Default: true. */
     enabled?: boolean;
+    /** Minimum confidence score (0.0-1.0) for language detection. Default: 0.5. */
     minConfidence?: number;
+    /** Detect multiple languages in the same document. Default: false. */
     detectMultiple?: boolean;
 }
+/**
+ * Token reduction configuration for optimizing token usage.
+ *
+ * Reduces the number of tokens in extracted content while preserving meaning.
+ * Useful for reducing costs in LLM pipelines.
+ */
 interface TokenReductionConfig {
+    /** Reduction mode: 'aggressive' or 'conservative'. Default: 'conservative'. */
     mode?: string;
+    /** Preserve tokens for semantically important words even in aggressive mode. Default: true. */
     preserveImportantWords?: boolean;
 }
+/**
+ * Hierarchy extraction configuration.
+ *
+ * Controls document hierarchy detection based on font size clustering.
+ */
+interface HierarchyConfig {
+    /** Enable hierarchy extraction. Default: true. */
+    enabled?: boolean;
+    /** Number of font size clusters (2-10). Default: 6. */
+    kClusters?: number;
+    /** Include bounding box information. Default: true. */
+    includeBbox?: boolean;
+    /** OCR coverage threshold (0.0-1.0). Default: null. */
+    ocrCoverageThreshold?: number | null;
+}
+/**
+ * PDF-specific extraction configuration.
+ *
+ * Controls how PDF documents are processed.
+ */
 interface PdfConfig {
+    /** Extract images from PDF pages. Default: true. */
     extractImages?: boolean;
+    /** List of passwords to try for password-protected PDFs. */
     passwords?: string[];
+    /** Extract document metadata (title, author, creation date, etc.). Default: true. */
     extractMetadata?: boolean;
+    /** Hierarchy extraction configuration. */
+    hierarchy?: HierarchyConfig;
 }
+/**
+ * Image extraction and processing configuration.
+ *
+ * Controls how images are extracted and optimized from documents.
+ */
 interface ImageExtractionConfig {
+    /** Enable image extraction from documents. Default: true. */
     extractImages?: boolean;
+    /** Target DPI (dots per inch) for extracted images. Higher DPI = better quality but larger files. Default: 150. */
     targetDpi?: number;
+    /** Maximum image dimension (width or height) in pixels. Images larger than this are downscaled. Default: 2000. */
     maxImageDimension?: number;
+    /** Automatically adjust DPI based on image content and quality. Default: true. */
     autoAdjustDpi?: boolean;
+    /** Minimum DPI to maintain for image quality. Default: 72. */
     minDpi?: number;
+    /** Maximum DPI to avoid excessive file sizes. Default: 300. */
     maxDpi?: number;
 }
+/**
+ * Post-processor configuration for modifying extracted content.
+ *
+ * Post-processors allow customization and cleanup of extraction results
+ * without failing the extraction if they encounter errors.
+ */
 interface PostProcessorConfig {
+    /** Enable or disable post-processing entirely. Default: true. */
     enabled?: boolean;
+    /** List of processor names to enable (allowlist). When set, only these are used. */
     enabledProcessors?: string[];
+    /** List of processor names to disable (denylist). These are skipped. */
     disabledProcessors?: string[];
 }
+/**
+ * HTML preprocessing options.
+ *
+ * Cleans HTML content before conversion to Markdown.
+ */
 interface HtmlPreprocessingOptions {
+    /** Enable HTML preprocessing. Default: true. */
     enabled?: boolean;
+    /** Preset cleanup level: 'minimal' (light), 'standard' (balanced), 'aggressive' (heavy). Default: 'standard'. */
     preset?: "minimal" | "standard" | "aggressive";
+    /** Remove navigation menus and headers. Default: true. */
     removeNavigation?: boolean;
+    /** Remove form elements. Default: true. */
     removeForms?: boolean;
 }
+/**
+ * HTML to Markdown conversion configuration options.
+ *
+ * Controls how HTML content is converted to Markdown format, including formatting,
+ * escaping, and special handling for various HTML elements.
+ */
 interface HtmlConversionOptions {
+    /** Heading style conversion: "atx" (# style), "underlined" (underline style), or "atx_closed" (# style with closing # characters). Default: "atx". */
     headingStyle?: "atx" | "underlined" | "atx_closed";
+    /** List indentation type: "spaces" or "tabs". Default: "spaces". */
     listIndentType?: "spaces" | "tabs";
+    /** Number of spaces/tabs per list indent level. Default: 4. */
     listIndentWidth?: number;
+    /** Bullet characters for unordered lists (e.g., '*', '-', '+'). Default: '*'. */
     bullets?: string;
+    /** Markdown symbol for strong/bold emphasis: '**' or '__'. Default: '**'. */
     strongEmSymbol?: string;
+    /** Escape asterisks (*) in text to prevent accidental formatting. Default: false. */
     escapeAsterisks?: boolean;
+    /** Escape underscores (_) in text to prevent accidental formatting. Default: false. */
     escapeUnderscores?: boolean;
+    /** Escape miscellaneous special characters. Default: false. */
     escapeMisc?: boolean;
+    /** Escape ASCII control characters. Default: false. */
     escapeAscii?: boolean;
+    /** Default code language for syntax highlighting in code blocks (e.g., 'javascript'). Default: null. */
     codeLanguage?: string;
+    /** Convert HTML links to Markdown autolinks format ([text](url)). Default: true. */
     autolinks?: boolean;
+    /** Use the HTML title element as default for links when no text is available. Default: false. */
     defaultTitle?: boolean;
+    /** Insert <br> tags in Markdown tables. Default: false. */
     brInTables?: boolean;
+    /** Use HOCR spatial table format for better table structure preservation. Default: false. */
     hocrSpatialTables?: boolean;
+    /** Highlight style for marked/highlighted text: "double_equal" (==text==), "html" (<mark>), "bold" (**text**), or "none". Default: "none". */
     highlightStyle?: "double_equal" | "html" | "bold" | "none";
+    /** Extract metadata from HTML (title, meta tags, etc.). Default: false. */
     extractMetadata?: boolean;
+    /** Whitespace handling: "normalized" (collapse whitespace) or "strict" (preserve all whitespace). Default: "normalized". */
     whitespaceMode?: "normalized" | "strict";
+    /** Remove newlines from output (convert to single line). Default: false. */
     stripNewlines?: boolean;
+    /** Enable line wrapping at specified width. Default: true. */
     wrap?: boolean;
+    /** Maximum line width when wrapping is enabled. Default: 80. */
     wrapWidth?: number;
+    /** Convert as inline Markdown instead of block elements. Default: false. */
     convertAsInline?: boolean;
+    /** Markdown symbol for subscript text (e.g., '~' for ~text~). Default: '~'. */
     subSymbol?: string;
+    /** Markdown symbol for superscript text (e.g., '^' for ^text^). Default: '^'. */
     supSymbol?: string;
+    /** Newline style in output: "spaces" (two spaces + newline) or "backslash" (backslash + newline). Default: "spaces". */
     newlineStyle?: "spaces" | "backslash";
+    /** Code block style: "indented" (4-space indent), "backticks" (```), or "tildes" (~~~). Default: "backticks". */
     codeBlockStyle?: "indented" | "backticks" | "tildes";
+    /** List of HTML tag names to keep as inline images (don't convert). Default: []. */
     keepInlineImagesIn?: string[];
+    /** Character encoding for output (e.g., 'utf-8', 'ascii'). Default: 'utf-8'. */
     encoding?: string;
+    /** Enable debug mode for detailed conversion logging. Default: false. */
     debug?: boolean;
+    /** List of HTML tag names to remove entirely from output. Default: []. */
     stripTags?: string[];
+    /** List of HTML tag names to preserve in output (don't convert to Markdown). Default: []. */
     preserveTags?: string[];
+    /** HTML preprocessing options for cleaning HTML before conversion. */
     preprocessing?: HtmlPreprocessingOptions;
 }
+/** Keyword extraction algorithm type. */
 type KeywordAlgorithm = "yake" | "rake";
+/**
+ * YAKE (Yet Another Keyword Extractor) algorithm configuration.
+ *
+ * YAKE is an unsupervised keyword extraction method that doesn't require training data.
+ */
 interface YakeParams {
+    /** Window size for co-occurrence analysis (number of words to consider). Default: 3. */
     windowSize?: number;
 }
+/**
+ * RAKE (Rapid Automatic Keyword Extraction) algorithm configuration.
+ *
+ * RAKE extracts keywords based on word co-occurrence and statistical measures.
+ */
 interface RakeParams {
+    /** Minimum word length to consider as keyword. Default: 3. */
     minWordLength?: number;
+    /** Maximum number of words per keyword phrase. Default: 3. */
     maxWordsPerPhrase?: number;
 }
+/**
+ * Keyword extraction configuration.
+ *
+ * Extracts important keywords/phrases from document content using YAKE or RAKE algorithms.
+ */
 interface KeywordConfig {
+    /** Extraction algorithm: "yake" or "rake". Default: "yake". */
     algorithm?: KeywordAlgorithm;
+    /** Maximum number of keywords to extract. Default: 10. */
     maxKeywords?: number;
+    /** Minimum relevance score (0.0-1.0) for keywords. Keywords below this are filtered out. Default: 0.1. */
     minScore?: number;
+    /** N-gram range: [min_length, max_length] for phrase keywords (e.g., [1, 3] for 1-3 word phrases). Default: [1, 3]. */
     ngramRange?: [number, number];
+    /** Language for keyword extraction (e.g., 'en', 'de', 'fr'). Default: 'en'. */
     language?: string;
+    /** YAKE algorithm-specific parameters. Only used when algorithm is "yake". */
     yakeParams?: YakeParams;
+    /** RAKE algorithm-specific parameters. Only used when algorithm is "rake". */
     rakeParams?: RakeParams;
 }
+/**
+ * Extracted keyword with relevance metadata.
+ *
+ * Represents a single keyword extracted from text along with its relevance score,
+ * the algorithm that extracted it, and optional position information.
+ */
+interface ExtractedKeyword {
+    /** The keyword text */
+    text: string;
+    /** Relevance score (higher is better, algorithm-specific range) */
+    score: number;
+    /** Algorithm that extracted this keyword */
+    algorithm: KeywordAlgorithm;
+    /** Optional positions where keyword appears in text (character offsets) */
+    positions?: number[];
+}
 /**
  * Page tracking and extraction configuration.
  *
@@ -113,7 +321,7 @@ interface KeywordConfig {
  * Page range information in chunk metadata (first_page/last_page) is automatically
  * enabled when page boundaries are available and chunking is configured.
  */
-interface PageConfig {
+interface PageExtractionConfig {
     /** Extract pages as separate array (ExtractionResult.pages) */
     extractPages?: boolean;
     /** Insert page markers in main content string */
@@ -121,25 +329,53 @@ interface PageConfig {
     /** Page marker format (use {page_num} placeholder) */
     markerFormat?: string;
 }
+/**
+ * Main extraction configuration interface.
+ *
+ * Combines all sub-configurations for document extraction, OCR, chunking, post-processing, etc.
+ * All fields are optional and use sensible defaults.
+ */
 interface ExtractionConfig {
+    /** Enable caching of extraction results for identical inputs. Default: true. */
     useCache?: boolean;
+    /** Enable quality processing filters to improve extraction reliability. Default: false. */
     enableQualityProcessing?: boolean;
+    /** OCR configuration for text extraction from images. Only used when document contains images or forceOcr is true. */
     ocr?: OcrConfig;
+    /** Force OCR processing even for documents with selectable text. Useful for scanned documents. Default: false. */
     forceOcr?: boolean;
+    /** Chunking configuration for splitting documents into smaller pieces for RAG or vector DB. */
     chunking?: ChunkingConfig;
+    /** Image extraction and optimization configuration. */
     images?: ImageExtractionConfig;
+    /** PDF-specific extraction options (passwords, metadata, etc.). */
     pdfOptions?: PdfConfig;
+    /** Token reduction configuration for optimizing token usage in LLM pipelines. */
     tokenReduction?: TokenReductionConfig;
+    /** Language detection configuration for automatic language identification. */
     languageDetection?: LanguageDetectionConfig;
+    /** Post-processor configuration for customizing extraction results. */
     postprocessor?: PostProcessorConfig;
+    /** HTML to Markdown conversion options for HTML content. */
     htmlOptions?: HtmlConversionOptions;
+    /** Keyword extraction configuration for extracting important phrases. */
     keywords?: KeywordConfig;
-    pages?: PageConfig;
+    /** Page tracking and extraction configuration for multi-page documents. */
+    pages?: PageExtractionConfig;
+    /** Maximum number of concurrent extractions in batch operations. Default: 4. */
     maxConcurrentExtractions?: number;
 }
+/**
+ * Extracted table data from document.
+ *
+ * Contains both cell data and Markdown representation for easy display and processing.
+ */
 interface Table {
+    /** 2D array of cell contents (rows × columns) */
     cells: string[][];
+    /** Markdown representation of the table for display or parsing */
     markdown: string;
+    /** Page number where this table was found (1-indexed) */
     pageNumber: number;
 }
 interface ExcelMetadata {
@@ -180,28 +416,50 @@ interface TextMetadata {
     links?: [string, string][] | null;
     codeBlocks?: [string, string][] | null;
 }
+interface HeaderMetadata {
+    level: number;
+    text: string;
+    id?: string | null;
+    depth: number;
+    htmlOffset: number;
+}
+interface LinkMetadata {
+    href: string;
+    text: string;
+    title?: string | null;
+    linkType: "anchor" | "internal" | "external" | "email" | "phone" | "other";
+    rel: string[];
+    attributes: Record<string, string>;
+}
+interface HtmlImageMetadata {
+    src: string;
+    alt?: string | null;
+    title?: string | null;
+    dimensions?: [number, number] | null;
+    imageType: "data_uri" | "inline_svg" | "external" | "relative";
+    attributes: Record<string, string>;
+}
+interface StructuredData {
+    dataType: "json_ld" | "microdata" | "rdfa";
+    rawJson: string;
+    schemaType?: string | null;
+}
 interface HtmlMetadata {
     title?: string | null;
     description?: string | null;
-    keywords?: string | null;
+    keywords: string[];
     author?: string | null;
-    canonical?: string | null;
+    canonicalUrl?: string | null;
     baseHref?: string | null;
-    ogTitle?: string | null;
-    ogDescription?: string | null;
-    ogImage?: string | null;
-    ogUrl?: string | null;
-    ogType?: string | null;
-    ogSiteName?: string | null;
-    twitterCard?: string | null;
-    twitterTitle?: string | null;
-    twitterDescription?: string | null;
-    twitterImage?: string | null;
-    twitterSite?: string | null;
-    twitterCreator?: string | null;
-    linkAuthor?: string | null;
-    linkLicense?: string | null;
-    linkAlternate?: string | null;
+    language?: string | null;
+    textDirection?: "ltr" | "rtl" | "auto" | null;
+    openGraph: Record<string, string>;
+    twitterCard: Record<string, string>;
+    metaTags: Record<string, string>;
+    htmlHeaders: HeaderMetadata[];
+    htmlLinks: LinkMetadata[];
+    htmlImages: HtmlImageMetadata[];
+    structuredData: StructuredData[];
 }
 interface PdfMetadata {
     title?: string | null;
@@ -329,38 +587,62 @@ interface ChunkMetadata {
     /** Last page number this chunk spans (1-indexed, only when page tracking enabled) */
     lastPage?: number | null;
 }
+/**
+ * Text chunk with optional embedding.
+ *
+ * Represents a segment of a document created by the chunking algorithm, useful for RAG and vector databases.
+ */
 interface Chunk {
+    /** Text content of this chunk */
     content: string;
+    /** Vector embedding for this chunk (if embedding model was used) */
     embedding?: number[] | null;
+    /** Metadata about chunk position and properties in the document */
     metadata: ChunkMetadata;
 }
+/**
+ * Extracted image from document with optional OCR result.
+ *
+ * Contains image data and metadata about position, dimensions, and properties.
+ */
 interface ExtractedImage {
+    /** Raw image bytes as Uint8Array */
     data: Uint8Array;
+    /** Image format (e.g., 'png', 'jpeg', 'tiff') */
     format: string;
+    /** Sequential index of this image in the document (0-indexed) */
     imageIndex: number;
+    /** Page number where this image was found (1-indexed), null if unknown */
     pageNumber?: number | null;
+    /** Image width in pixels, null if unknown */
     width?: number | null;
+    /** Image height in pixels, null if unknown */
     height?: number | null;
+    /** Color space (e.g., 'RGB', 'CMYK', 'Grayscale'), null if unknown */
     colorspace?: string | null;
+    /** Bits per color component (e.g., 8 for 8-bit), null if unknown */
     bitsPerComponent?: number | null;
+    /** Whether this is a mask image (used internally by PDF) */
     isMask: boolean;
+    /** Image description or caption if available */
     description?: string | null;
+    /** OCR extraction result if OCR was run on this image, null otherwise */
     ocrResult?: ExtractionResult | null;
 }
 /**
  * Content for a single page/slide/sheet.
  *
  * When page extraction is enabled, documents are split into per-page content
- * with associated tables and images mapped to each page.
+ * with associated tables and images mapped to each page. This allows for page-specific processing.
  */
 interface PageContent {
-    /** Page number (1-indexed) */
+    /** Number of this page/slide/sheet within the document (1-indexed) */
     pageNumber: number;
-    /** Text content for this page */
+    /** Text content extracted from this page */
     content: string;
-    /** Tables found on this page */
+    /** Tables found and extracted from this page */
     tables: Table[];
-    /** Images found on this page */
+    /** Images found and extracted from this page */
     images: ExtractedImage[];
 }
 /**
@@ -413,23 +695,17 @@ interface Metadata {
     headers?: string[] | null;
     links?: [string, string][] | null;
     code_blocks?: [string, string][] | null;
-    canonical?: string | null;
+    canonical_url?: string | null;
     base_href?: string | null;
-    og_title?: string | null;
-    og_description?: string | null;
-    og_image?: string | null;
-    og_url?: string | null;
-    og_type?: string | null;
-    og_site_name?: string | null;
-    twitter_card?: string | null;
-    twitter_title?: string | null;
-    twitter_description?: string | null;
-    twitter_image?: string | null;
-    twitter_site?: string | null;
-    twitter_creator?: string | null;
-    link_author?: string | null;
-    link_license?: string | null;
-    link_alternate?: string | null;
+    open_graph?: Record<string, string>;
+    twitter_card?: Record<string, string>;
+    meta_tags?: Record<string, string>;
+    html_language?: string | null;
+    text_direction?: "ltr" | "rtl" | "auto" | null;
+    html_headers?: HeaderMetadata[];
+    html_links?: LinkMetadata[];
+    html_images?: HtmlImageMetadata[];
+    structured_data?: StructuredData[];
     psm?: number;
     output_format?: string;
     table_count?: number;
@@ -439,92 +715,141 @@ interface Metadata {
     json_schema?: Record<string, unknown> | null;
     page_structure?: PageStructure | null;
     error?: ErrorMetadata | null;
-    [key: string]: any;
+    /**
+     * Additional fields may be added at runtime by postprocessors.
+     * Use bracket notation to safely access unexpected properties.
+     */
+    [key: string]: unknown;
 }
+/**
+ * Complete extraction result from document processing.
+ *
+ * Contains all extracted content, metadata, and optional processed data like chunks and images.
+ * This is the primary return value from extraction functions.
+ */
 interface ExtractionResult {
+    /** Extracted text content from the document (main content) */
     content: string;
+    /** MIME type of the input document (e.g., 'application/pdf', 'text/html') */
     mimeType: string;
+    /** Document metadata including title, author, creation date, language, and format-specific fields */
     metadata: Metadata;
+    /** Tables extracted from the document (2D cell arrays with Markdown representation) */
     tables: Table[];
+    /** Detected languages in the document (ISO 639-1 codes, e.g., ['en', 'de']), null if detection disabled */
     detectedLanguages: string[] | null;
+    /** Document chunks for RAG/vector databases (if chunking was enabled), null otherwise */
     chunks: Chunk[] | null;
+    /** Images extracted from document with metadata (if image extraction was enabled), null otherwise */
     images: ExtractedImage[] | null;
+    /** Per-page content when page extraction is enabled, null otherwise. Each item contains page number, content, tables, and images. */
     pages?: PageContent[] | null;
+    /** Extracted keywords when keyword extraction is enabled, null otherwise */
+    keywords?: ExtractedKeyword[] | null;
 }
+/** Post-processor execution stage in the extraction pipeline. */
 type ProcessingStage = "early" | "middle" | "late";
+/**
+ * Protocol for custom post-processors that modify extraction results.
+ *
+ * Post-processors enrich or transform extraction results without failing the extraction.
+ * If a post-processor throws an error, it's logged but extraction continues.
+ * Only works with async extraction functions (`extractFile`, `extractBytes`, etc.).
+ */
 interface PostProcessorProtocol {
     /**
      * Return the unique name of this postprocessor.
+     *
+     * @returns Unique processor name (case-sensitive, alphanumeric + underscores recommended)
      */
     name(): string;
     /**
      * Process and enrich an extraction result.
      *
+     * Modify the result to add new metadata, transform content, or perform other enrichment.
+     * If this throws an error, it's logged but extraction continues.
+     *
      * @param result - ExtractionResult with extracted content, metadata, and tables
-     * @returns Modified result with enriched metadata
+     * @returns Modified result with enriched data. Can be async or sync.
      */
     process(result: ExtractionResult): ExtractionResult | Promise<ExtractionResult>;
     /**
      * Return the processing stage for this processor.
      *
+     * Determines when this processor runs relative to others:
+     * - "early": Runs first, before other processors (good for cleanup/normalization)
+     * - "middle": Runs with other middle-stage processors (default)
+     * - "late": Runs last, after others (good for final enrichment)
+     *
      * @returns One of "early", "middle", or "late" (default: "middle")
      */
     processingStage?(): ProcessingStage;
     /**
-     * Initialize the processor (e.g., load ML models).
+     * Initialize the processor (e.g., load ML models, setup resources).
      *
-     * Called once when the processor is registered.
+     * Called once when the processor is first registered. Use for expensive operations.
      */
     initialize?(): void | Promise<void>;
     /**
      * Shutdown the processor and release resources.
      *
-     * Called when the processor is unregistered.
+     * Called when the processor is unregistered. Use for cleanup (closing connections, freeing memory).
      */
     shutdown?(): void | Promise<void>;
 }
+/**
+ * Protocol for custom validators that check extraction results.
+ *
+ * Validators perform quality checks and fail the extraction if validation fails.
+ * Unlike post-processors, validator errors cause the entire extraction to fail.
+ * Useful for enforcing quality standards on extracted content.
+ */
 interface ValidatorProtocol {
     /**
      * Return the unique name of this validator.
+     *
+     * @returns Unique validator name (case-sensitive, alphanumeric + underscores recommended)
      */
     name(): string;
     /**
      * Validate an extraction result.
      *
-     * Throw an error if validation fails. The error message should explain why validation failed.
-     * If validation passes, return without throwing.
+     * Throw an error if validation fails. The error message will be used as the extraction error.
+     * If validation passes, return without throwing (return value is ignored).
      *
      * @param result - ExtractionResult to validate
-     * @throws Error if validation fails (extraction will fail)
+     * @throws {Error} If validation fails (extraction will fail with this error)
      */
     validate(result: ExtractionResult): void | Promise<void>;
     /**
      * Return the validation priority.
      *
-     * Higher priority validators run first. Useful for running cheap validations before expensive ones.
+     * Higher priority validators run first. Useful for running cheap validations (e.g., length checks)
+     * before expensive ones (e.g., AI-based quality checks) to fail fast.
      *
-     * @returns Priority value (higher = runs earlier, default: 50)
+     * @returns Priority value (higher = runs earlier, default: 50). Range: 0-1000.
      */
     priority?(): number;
     /**
      * Check if this validator should run for a given result.
      *
      * Allows conditional validation based on MIME type, metadata, or content.
+     * This is evaluated before validation, so expensive checks can be skipped for irrelevant documents.
      *
      * @param result - ExtractionResult to check
      * @returns true if validator should run, false to skip (default: true)
      */
     shouldValidate?(result: ExtractionResult): boolean;
     /**
-     * Initialize the validator.
+     * Initialize the validator (e.g., load ML models, setup resources).
      *
-     * Called once when the validator is registered.
+     * Called once when the validator is first registered. Use for expensive operations.
      */
     initialize?(): void | Promise<void>;
     /**
      * Shutdown the validator and release resources.
      *
-     * Called when the validator is unregistered.
+     * Called when the validator is unregistered. Use for cleanup (closing connections, freeing memory).
      */
     shutdown?(): void | Promise<void>;
 }
@@ -662,5 +987,95 @@ interface OcrBackendProtocol {
      */
     shutdown?(): void | Promise<void>;
 }
+/**
+ * Result of error message classification into error codes.
+ *
+ * Provides classification details including the error code, name,
+ * description, and confidence score for the classification.
+ *
+ * @example
+ * ```typescript
+ * import { classifyError, ErrorCode } from '@kreuzberg/node';
+ *
+ * const result = classifyError("File not found in read operation");
+ * if (result.code === ErrorCode.IoError) {
+ *   console.error(`I/O Error: ${result.description}`);
+ *   console.log(`Confidence: ${result.confidence}`);
+ * }
+ * ```
+ */
+interface ErrorClassification {
+    /**
+     * The numeric error code (0-7) representing the error type.
+     */
+    code: number;
+    /**
+     * The human-readable name of the error code (e.g., "validation", "ocr").
+     */
+    name: string;
+    /**
+     * A brief description of the error type.
+     */
+    description: string;
+    /**
+     * Confidence score (0.0-1.0) indicating how certain the classification is.
+     * Higher values indicate higher confidence in the classification.
+     */
+    confidence: number;
+}
+/**
+ * Opaque handle to a worker pool for concurrent extraction operations.
+ *
+ * Worker pools enable parallel processing of CPU-bound document extraction
+ * tasks by distributing work across multiple threads. This is especially
+ * useful for batch processing large numbers of documents.
+ *
+ * @example
+ * ```typescript
+ * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
+ *
+ * const pool = createWorkerPool(4); // 4 concurrent workers
+ * try {
+ *   const result = await extractFileInWorker(pool, 'document.pdf');
+ *   console.log(result.content);
+ * } finally {
+ *   await closeWorkerPool(pool);
+ * }
+ * ```
+ */
+interface WorkerPool {
+    /** Internal pool identifier (opaque) */
+    readonly poolId: number;
+}
+/**
+ * Worker pool statistics.
+ *
+ * Provides information about the current state of a worker pool including
+ * pool size, number of active workers, and queued tasks.
+ *
+ * @example
+ * ```typescript
+ * import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node';
+ *
+ * const pool = createWorkerPool(4);
+ * const stats = getWorkerPoolStats(pool);
+ * console.log(`Active: ${stats.activeWorkers}/${stats.size}`);
+ * console.log(`Queued: ${stats.queuedTasks}`);
+ * ```
+ */
+interface WorkerPoolStats {
+    /**
+     * Maximum number of concurrent workers in the pool.
+     */
+    size: number;
+    /**
+     * Number of currently active (executing) workers.
+     */
+    activeWorkers: number;
+    /**
+     * Number of tasks waiting in the queue.
+     */
+    queuedTasks: number;
+}
-export type { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractionConfig, ExtractionResult, HtmlConversionOptions, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, Metadata, OcrBackendProtocol, OcrConfig, OcrMetadata, PageBoundary, PageConfig, PageContent, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, RakeParams, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, XmlMetadata, YakeParams };
+export type { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorClassification, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, ExtractionConfig, ExtractionResult, HeaderMetadata, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrBackendProtocol, OcrConfig, OcrMetadata, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, WorkerPool, WorkerPoolStats, XmlMetadata, YakeParams };