@kreuzberg/node 4.4.6 → 4.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/types.d.mts CHANGED
@@ -472,6 +472,21 @@ interface PageExtractionConfig {
472
472
  /** Page marker format (use {page_num} placeholder) */
473
473
  markerFormat?: string;
474
474
  }
475
+ /**
476
+ * Layout detection configuration for PDF extraction.
477
+ *
478
+ * Controls layout detection using ONNX-based document layout models (YOLO or RT-DETR)
479
+ * to detect document structure elements like tables, figures, headers, and code blocks.
480
+ * Requires the `layout-detection` feature to be compiled.
481
+ */
482
+ interface LayoutDetectionConfig {
483
+ /** Model preset: "fast" (YOLO, 11 classes) or "accurate" (RT-DETR, 17 classes). Default: "fast". */
484
+ preset?: string;
485
+ /** Override the model's default confidence threshold for detections. Default: null (use model default). */
486
+ confidenceThreshold?: number;
487
+ /** Apply postprocessing heuristics to improve detection quality. Default: true. */
488
+ applyHeuristics?: boolean;
489
+ }
475
490
  /**
476
491
  * Main extraction configuration interface.
477
492
  *
@@ -525,6 +540,8 @@ interface ExtractionConfig {
525
540
  * - "element_based": Semantic element extraction (Unstructured-compatible)
526
541
  */
527
542
  resultFormat?: "unified" | "element_based";
543
+ /** Layout detection configuration for detecting document structure in PDFs. */
544
+ layout?: LayoutDetectionConfig;
528
545
  }
529
546
  /**
530
547
  * Extracted table data from document.
@@ -1336,4 +1353,4 @@ interface WorkerPoolStats {
1336
1353
  queuedTasks: number;
1337
1354
  }
1338
1355
 
1339
- export type { ArchiveMetadata, BoundingBox, Chunk, ChunkMetadata, ChunkingConfig, Element, ElementMetadata, ElementType, EmailMetadata, ErrorClassification, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, ExtractionConfig, ExtractionResult, HeaderMetadata, HeadingContext, HeadingLevel, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrBackendProtocol, OcrBoundingGeometry, OcrBoundingGeometryQuadrilateral, OcrBoundingGeometryRectangle, OcrConfidence, OcrConfig, OcrElement, OcrElementConfig, OcrElementLevel, OcrMetadata, OcrRotation, PaddleOcrConfig, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfAnnotation, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, ProcessingWarning, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, WorkerPool, WorkerPoolStats, XmlMetadata, YakeParams };
1356
+ export type { ArchiveMetadata, BoundingBox, Chunk, ChunkMetadata, ChunkingConfig, Element, ElementMetadata, ElementType, EmailMetadata, ErrorClassification, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, ExtractionConfig, ExtractionResult, HeaderMetadata, HeadingContext, HeadingLevel, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LayoutDetectionConfig, LinkMetadata, Metadata, OcrBackendProtocol, OcrBoundingGeometry, OcrBoundingGeometryQuadrilateral, OcrBoundingGeometryRectangle, OcrConfidence, OcrConfig, OcrElement, OcrElementConfig, OcrElementLevel, OcrMetadata, OcrRotation, PaddleOcrConfig, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfAnnotation, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, ProcessingWarning, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, WorkerPool, WorkerPoolStats, XmlMetadata, YakeParams };
package/dist/types.d.ts CHANGED
@@ -472,6 +472,21 @@ interface PageExtractionConfig {
472
472
  /** Page marker format (use {page_num} placeholder) */
473
473
  markerFormat?: string;
474
474
  }
475
+ /**
476
+ * Layout detection configuration for PDF extraction.
477
+ *
478
+ * Controls layout detection using ONNX-based document layout models (YOLO or RT-DETR)
479
+ * to detect document structure elements like tables, figures, headers, and code blocks.
480
+ * Requires the `layout-detection` feature to be compiled.
481
+ */
482
+ interface LayoutDetectionConfig {
483
+ /** Model preset: "fast" (YOLO, 11 classes) or "accurate" (RT-DETR, 17 classes). Default: "fast". */
484
+ preset?: string;
485
+ /** Override the model's default confidence threshold for detections. Default: null (use model default). */
486
+ confidenceThreshold?: number;
487
+ /** Apply postprocessing heuristics to improve detection quality. Default: true. */
488
+ applyHeuristics?: boolean;
489
+ }
475
490
  /**
476
491
  * Main extraction configuration interface.
477
492
  *
@@ -525,6 +540,8 @@ interface ExtractionConfig {
525
540
  * - "element_based": Semantic element extraction (Unstructured-compatible)
526
541
  */
527
542
  resultFormat?: "unified" | "element_based";
543
+ /** Layout detection configuration for detecting document structure in PDFs. */
544
+ layout?: LayoutDetectionConfig;
528
545
  }
529
546
  /**
530
547
  * Extracted table data from document.
@@ -1336,4 +1353,4 @@ interface WorkerPoolStats {
1336
1353
  queuedTasks: number;
1337
1354
  }
1338
1355
 
1339
- export type { ArchiveMetadata, BoundingBox, Chunk, ChunkMetadata, ChunkingConfig, Element, ElementMetadata, ElementType, EmailMetadata, ErrorClassification, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, ExtractionConfig, ExtractionResult, HeaderMetadata, HeadingContext, HeadingLevel, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrBackendProtocol, OcrBoundingGeometry, OcrBoundingGeometryQuadrilateral, OcrBoundingGeometryRectangle, OcrConfidence, OcrConfig, OcrElement, OcrElementConfig, OcrElementLevel, OcrMetadata, OcrRotation, PaddleOcrConfig, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfAnnotation, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, ProcessingWarning, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, WorkerPool, WorkerPoolStats, XmlMetadata, YakeParams };
1356
+ export type { ArchiveMetadata, BoundingBox, Chunk, ChunkMetadata, ChunkingConfig, Element, ElementMetadata, ElementType, EmailMetadata, ErrorClassification, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, ExtractionConfig, ExtractionResult, HeaderMetadata, HeadingContext, HeadingLevel, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LayoutDetectionConfig, LinkMetadata, Metadata, OcrBackendProtocol, OcrBoundingGeometry, OcrBoundingGeometryQuadrilateral, OcrBoundingGeometryRectangle, OcrConfidence, OcrConfig, OcrElement, OcrElementConfig, OcrElementLevel, OcrMetadata, OcrRotation, PaddleOcrConfig, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfAnnotation, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, ProcessingWarning, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, WorkerPool, WorkerPoolStats, XmlMetadata, YakeParams };
package/dist/types.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"sources":["../typescript/types.ts"],"sourcesContent":["/**\n * Type definitions for Kreuzberg extraction results.\n *\n * These types mirror the strongly-typed Rust metadata structures,\n * providing type safety for TypeScript users.\n */\n\n// ============================================================================\n// ============================================================================\n\n/**\n * Tesseract OCR engine configuration options.\n *\n * @example\n * ```typescript\n * const config: TesseractConfig = {\n * psm: 6,\n * enableTableDetection: true,\n * tesseditCharWhitelist: '0123456789'\n * };\n * ```\n */\nexport interface TesseractConfig {\n\t/**\n\t * Page Segmentation Mode (0-13). Controls how Tesseract segments and recognizes text.\n\t * Common values: 3 (auto), 6 (single uniform block), 11 (sparse text).\n\t * Default: 3 (auto layout analysis).\n\t */\n\tpsm?: number;\n\n\t/**\n\t * Enable table detection during OCR processing.\n\t * When true, Tesseract attempts to preserve table structure in the output.\n\t * Default: false.\n\t */\n\tenableTableDetection?: boolean;\n\n\t/**\n\t * Whitelist of characters Tesseract should recognize.\n\t * Only these characters will be returned by the OCR engine.\n\t * Use empty string to allow all characters. 
Useful for constraining output to digits,\n\t * specific alphabets, or other character sets.\n\t * Default: null (recognize all).\n\t */\n\ttesseditCharWhitelist?: string;\n}\n\n/**\n * OCR element hierarchy level.\n *\n * Defines the granularity of OCR element extraction.\n */\nexport type OcrElementLevel = \"word\" | \"line\" | \"block\" | \"page\";\n\n/**\n * Bounding geometry for OCR elements using rectangle coordinates.\n *\n * Represents rectangular coordinates with position and dimensions.\n */\nexport interface OcrBoundingGeometryRectangle {\n\ttype: \"rectangle\";\n\tleft: number;\n\ttop: number;\n\twidth: number;\n\theight: number;\n}\n\n/**\n * Bounding geometry for OCR elements using quadrilateral points.\n *\n * Represents irregular quadrilateral shapes with four corner points.\n */\nexport interface OcrBoundingGeometryQuadrilateral {\n\ttype: \"quadrilateral\";\n\tpoints: number[][];\n}\n\n/**\n * Bounding geometry for OCR elements.\n *\n * Can be either rectangular or quadrilateral based on the OCR engine's detection capability.\n */\nexport type OcrBoundingGeometry = OcrBoundingGeometryRectangle | OcrBoundingGeometryQuadrilateral;\n\n/**\n * Confidence scores for OCR operations.\n *\n * Tracks confidence levels for different aspects of OCR processing.\n */\nexport interface OcrConfidence {\n\t/** Confidence score (0.0-1.0) for text detection. */\n\tdetection?: number;\n\n\t/** Confidence score (0.0-1.0) for text recognition. */\n\trecognition?: number;\n}\n\n/**\n * Rotation information for OCR elements.\n *\n * Tracks detected text rotation and associated confidence.\n */\nexport interface OcrRotation {\n\t/** Angle of rotation in degrees. */\n\tangleDegrees?: number;\n\n\t/** Confidence score (0.0-1.0) for rotation detection. 
*/\n\tconfidence?: number;\n}\n\n/**\n * Individual OCR element (word, line, block, or page).\n *\n * Represents a granular unit of text extracted by OCR with geometric and confidence information.\n */\nexport interface OcrElement {\n\t/** Extracted text content */\n\ttext: string;\n\n\t/** Bounding geometry of the element in the image */\n\tgeometry?: OcrBoundingGeometry;\n\n\t/** Confidence scores for detection and recognition */\n\tconfidence?: OcrConfidence;\n\n\t/** Hierarchy level of this element */\n\tlevel?: OcrElementLevel;\n\n\t/** Rotation information if text is rotated */\n\trotation?: OcrRotation;\n\n\t/** Page number where this element was found (1-indexed) */\n\tpageNumber?: number;\n\n\t/** Parent element ID for hierarchical relationships */\n\tparentId?: string;\n\n\t/** Backend-specific metadata that doesn't fit standard fields */\n\tbackendMetadata?: Record<string, unknown>;\n}\n\n/**\n * Configuration for OCR element extraction.\n *\n * Controls how granular OCR elements are extracted and organized.\n */\nexport interface OcrElementConfig {\n\t/** Enable extraction of granular OCR elements. Default: false. */\n\tincludeElements?: boolean;\n\n\t/** Minimum hierarchy level to extract. Default: 'word'. */\n\tminLevel?: OcrElementLevel;\n\n\t/** Minimum confidence threshold (0.0-1.0) for including elements. Default: 0.0. */\n\tminConfidence?: number;\n\n\t/** Build hierarchical relationships between elements. Default: false. */\n\tbuildHierarchy?: boolean;\n}\n\n/**\n * PaddleOCR engine configuration options.\n *\n * Specific configuration for the PaddleOCR backend.\n */\nexport interface PaddleOcrConfig {\n\t/** Language code(s) for OCR (e.g., 'en', 'zh', 'multi'). */\n\tlanguage?: string;\n\n\t/** Directory to cache downloaded OCR models. */\n\tcacheDir?: string;\n\n\t/** Enable angle classification for rotated text detection. Default: false. */\n\tuseAngleCls?: boolean;\n\n\t/** Enable table structure detection. Default: false. 
*/\n\tenableTableDetection?: boolean;\n\n\t/** Database threshold for text detection (0.0-1.0). Default: 0.3. */\n\tdetDbThresh?: number;\n\n\t/** Box threshold for text detection (0.0-1.0). Default: 0.5. */\n\tdetDbBoxThresh?: number;\n\n\t/** Unclip ratio for expanding detected text regions. Default: 1.5. */\n\tdetDbUnclipRatio?: number;\n\n\t/** Maximum side length for detection preprocessing. Default: 960. */\n\tdetLimitSideLen?: number;\n\n\t/** Batch size for text recognition. Default: 30. */\n\trecBatchNum?: number;\n}\n\n/**\n * OCR (Optical Character Recognition) configuration.\n *\n * Controls which OCR engine to use and how it processes images.\n */\nexport interface OcrConfig {\n\t/** OCR backend name (e.g., 'tesseract', 'paddleocr', 'easyocr'). Required. */\n\tbackend: string;\n\n\t/** ISO 639-1/3 language code(s) for OCR (e.g., 'eng', 'fra', 'deu'). Default: 'eng'. */\n\tlanguage?: string;\n\n\t/** Tesseract engine-specific configuration options. Only used when backend is 'tesseract'. */\n\ttesseractConfig?: TesseractConfig;\n\n\t/** PaddleOCR engine-specific configuration options. Only used when backend is 'paddleocr'. */\n\tpaddleOcrConfig?: PaddleOcrConfig;\n\n\t/** OCR element extraction configuration. */\n\telementConfig?: OcrElementConfig;\n}\n\n/**\n * Document chunking configuration for splitting large documents.\n *\n * Breaks large documents into smaller, manageable chunks while preserving context.\n * Useful for RAG (Retrieval Augmented Generation) and vector database indexing.\n */\nexport interface ChunkingConfig {\n\t/** Maximum characters per chunk. Default: 4096. */\n\tmaxChars?: number;\n\n\t/** Maximum overlapping characters between consecutive chunks for context preservation. Default: 512. 
*/\n\tmaxOverlap?: number;\n\n\t/**\n\t * Alternative to maxChars: chunk size using different unit.\n\t * Mutually exclusive with maxChars.\n\t */\n\tchunkSize?: number;\n\n\t/**\n\t * Alternative to maxOverlap: overlap amount using different unit.\n\t * Mutually exclusive with maxOverlap.\n\t */\n\tchunkOverlap?: number;\n\n\t/**\n\t * Named preset configuration (e.g., 'default', 'aggressive', 'minimal').\n\t * Uses preset values if neither maxChars nor chunkSize is specified.\n\t */\n\tpreset?: string;\n\n\t/** Embedding configuration for generating vector embeddings for each chunk. */\n\tembedding?: Record<string, unknown>;\n\n\t/** Enable or disable chunking. Default: true when chunking config is provided. */\n\tenabled?: boolean;\n\n\t/** Sizing type: \"characters\" (default) or \"tokenizer\". */\n\tsizingType?: \"characters\" | \"tokenizer\";\n\n\t/** HuggingFace model ID for tokenizer sizing (e.g., \"Xenova/gpt-4o\"). */\n\tsizingModel?: string;\n\n\t/** Optional cache directory for tokenizer files. */\n\tsizingCacheDir?: string;\n}\n\n/**\n * Language detection configuration.\n *\n * Automatically detects the language(s) of extracted content.\n */\nexport interface LanguageDetectionConfig {\n\t/** Enable automatic language detection. Default: true. */\n\tenabled?: boolean;\n\n\t/** Minimum confidence score (0.0-1.0) for language detection. Default: 0.5. */\n\tminConfidence?: number;\n\n\t/** Detect multiple languages in the same document. Default: false. */\n\tdetectMultiple?: boolean;\n}\n\n/**\n * Token reduction configuration for optimizing token usage.\n *\n * Reduces the number of tokens in extracted content while preserving meaning.\n * Useful for reducing costs in LLM pipelines.\n */\nexport interface TokenReductionConfig {\n\t/** Reduction mode: 'aggressive' or 'conservative'. Default: 'conservative'. */\n\tmode?: string;\n\n\t/** Preserve tokens for semantically important words even in aggressive mode. Default: true. 
*/\n\tpreserveImportantWords?: boolean;\n}\n\n/**\n * Hierarchy extraction configuration.\n *\n * Controls document hierarchy detection based on font size clustering.\n */\nexport interface HierarchyConfig {\n\t/** Enable hierarchy extraction. Default: true. */\n\tenabled?: boolean;\n\n\t/** Number of font size clusters (2-10). Default: 6. */\n\tkClusters?: number;\n\n\t/** Include bounding box information. Default: true. */\n\tincludeBbox?: boolean;\n\n\t/** OCR coverage threshold (0.0-1.0). Default: null. */\n\tocrCoverageThreshold?: number | null;\n}\n\n/**\n * PDF-specific extraction configuration.\n *\n * Controls how PDF documents are processed.\n */\nexport interface PdfConfig {\n\t/** Extract images from PDF pages. Default: true. */\n\textractImages?: boolean;\n\n\t/** List of passwords to try for password-protected PDFs. */\n\tpasswords?: string[];\n\n\t/** Extract document metadata (title, author, creation date, etc.). Default: true. */\n\textractMetadata?: boolean;\n\n\t/** Hierarchy extraction configuration. */\n\thierarchy?: HierarchyConfig;\n\n\t/** Extract annotations from PDF pages. Default: false. */\n\textractAnnotations?: boolean;\n\n\t/** Top margin fraction (0.0-0.5) for filtering header content. */\n\ttopMarginFraction?: number;\n\n\t/** Bottom margin fraction (0.0-0.5) for filtering footer content. */\n\tbottomMarginFraction?: number;\n}\n\n/**\n * Image extraction and processing configuration.\n *\n * Controls how images are extracted and optimized from documents.\n */\nexport interface ImageExtractionConfig {\n\t/** Enable image extraction from documents. Default: true. */\n\textractImages?: boolean;\n\n\t/** Target DPI (dots per inch) for extracted images. Higher DPI = better quality but larger files. Default: 150. */\n\ttargetDpi?: number;\n\n\t/** Maximum image dimension (width or height) in pixels. Images larger than this are downscaled. Default: 2000. 
*/\n\tmaxImageDimension?: number;\n\n\t/** Automatically adjust DPI based on image content and quality. Default: true. */\n\tautoAdjustDpi?: boolean;\n\n\t/** Minimum DPI to maintain for image quality. Default: 72. */\n\tminDpi?: number;\n\n\t/** Maximum DPI to avoid excessive file sizes. Default: 300. */\n\tmaxDpi?: number;\n}\n\n/**\n * Post-processor configuration for modifying extracted content.\n *\n * Post-processors allow customization and cleanup of extraction results\n * without failing the extraction if they encounter errors.\n */\nexport interface PostProcessorConfig {\n\t/** Enable or disable post-processing entirely. Default: true. */\n\tenabled?: boolean;\n\n\t/** List of processor names to enable (allowlist). When set, only these are used. */\n\tenabledProcessors?: string[];\n\n\t/** List of processor names to disable (denylist). These are skipped. */\n\tdisabledProcessors?: string[];\n}\n\n/**\n * HTML preprocessing options.\n *\n * Cleans HTML content before conversion to Markdown.\n */\nexport interface HtmlPreprocessingOptions {\n\t/** Enable HTML preprocessing. Default: true. */\n\tenabled?: boolean;\n\n\t/** Preset cleanup level: 'minimal' (light), 'standard' (balanced), 'aggressive' (heavy). Default: 'standard'. */\n\tpreset?: \"minimal\" | \"standard\" | \"aggressive\";\n\n\t/** Remove navigation menus and headers. Default: true. */\n\tremoveNavigation?: boolean;\n\n\t/** Remove form elements. Default: true. */\n\tremoveForms?: boolean;\n}\n\n/**\n * HTML to Markdown conversion configuration options.\n *\n * Controls how HTML content is converted to Markdown format, including formatting,\n * escaping, and special handling for various HTML elements.\n */\nexport interface HtmlConversionOptions {\n\t/** Heading style conversion: \"atx\" (# style), \"underlined\" (underline style), or \"atx_closed\" (# style closed). Default: \"atx\". 
*/\n\theadingStyle?: \"atx\" | \"underlined\" | \"atx_closed\";\n\n\t/** List indentation type: \"spaces\" or \"tabs\". Default: \"spaces\". */\n\tlistIndentType?: \"spaces\" | \"tabs\";\n\n\t/** Number of spaces/tabs per list indent level. Default: 4. */\n\tlistIndentWidth?: number;\n\n\t/** Bullet characters for unordered lists (e.g., '*', '-', '+'). Default: '*'. */\n\tbullets?: string;\n\n\t/** Markdown symbol for strong/bold emphasis: '**' or '__'. Default: '**'. */\n\tstrongEmSymbol?: string;\n\n\t/** Escape asterisks (*) in text to prevent accidental formatting. Default: false. */\n\tescapeAsterisks?: boolean;\n\n\t/** Escape underscores (_) in text to prevent accidental formatting. Default: false. */\n\tescapeUnderscores?: boolean;\n\n\t/** Escape miscellaneous special characters. Default: false. */\n\tescapeMisc?: boolean;\n\n\t/** Escape ASCII control characters. Default: false. */\n\tescapeAscii?: boolean;\n\n\t/** Default code language for syntax highlighting in code blocks (e.g., 'javascript'). Default: null. */\n\tcodeLanguage?: string;\n\n\t/** Convert HTML links to Markdown autolinks format ([text](url)). Default: true. */\n\tautolinks?: boolean;\n\n\t/** Use the HTML title element as default for links when no text is available. Default: false. */\n\tdefaultTitle?: boolean;\n\n\t/** Insert <br> tags in Markdown tables. Default: false. */\n\tbrInTables?: boolean;\n\n\t/** Use HOCR spatial table format for better table structure preservation. Default: false. */\n\thocrSpatialTables?: boolean;\n\n\t/** Highlight style for marked/highlighted text: \"double_equal\" (==text==), \"html\" (<mark>), \"bold\" (**text**), or \"none\". Default: \"none\". */\n\thighlightStyle?: \"double_equal\" | \"html\" | \"bold\" | \"none\";\n\n\t/** Extract metadata from HTML (title, meta tags, etc.). Default: false. */\n\textractMetadata?: boolean;\n\n\t/** Whitespace handling: \"normalized\" (collapse whitespace) or \"strict\" (preserve all whitespace). 
Default: \"normalized\". */\n\twhitespaceMode?: \"normalized\" | \"strict\";\n\n\t/** Remove newlines from output (convert to single line). Default: false. */\n\tstripNewlines?: boolean;\n\n\t/** Enable line wrapping at specified width. Default: true. */\n\twrap?: boolean;\n\n\t/** Maximum line width when wrapping is enabled. Default: 80. */\n\twrapWidth?: number;\n\n\t/** Convert as inline Markdown instead of block elements. Default: false. */\n\tconvertAsInline?: boolean;\n\n\t/** Markdown symbol for subscript text (e.g., '~' for ~text~). Default: '~'. */\n\tsubSymbol?: string;\n\n\t/** Markdown symbol for superscript text (e.g., '^' for ^text^). Default: '^'. */\n\tsupSymbol?: string;\n\n\t/** Newline style in output: \"spaces\" (two spaces + newline) or \"backslash\" (backslash + newline). Default: \"spaces\". */\n\tnewlineStyle?: \"spaces\" | \"backslash\";\n\n\t/** Code block style: \"indented\" (4-space indent), \"backticks\" (```), or \"tildes\" (~~~). Default: \"backticks\". */\n\tcodeBlockStyle?: \"indented\" | \"backticks\" | \"tildes\";\n\n\t/** List of HTML tag names to keep as inline images (don't convert). Default: []. */\n\tkeepInlineImagesIn?: string[];\n\n\t/** Character encoding for output (e.g., 'utf-8', 'ascii'). Default: 'utf-8'. */\n\tencoding?: string;\n\n\t/** Enable debug mode for detailed conversion logging. Default: false. */\n\tdebug?: boolean;\n\n\t/** List of HTML tag names to remove entirely from output. Default: []. */\n\tstripTags?: string[];\n\n\t/** List of HTML tag names to preserve in output (don't convert to Markdown). Default: []. */\n\tpreserveTags?: string[];\n\n\t/** HTML preprocessing options for cleaning HTML before conversion. */\n\tpreprocessing?: HtmlPreprocessingOptions;\n}\n\n/** Keyword extraction algorithm type. 
*/\nexport type KeywordAlgorithm = \"yake\" | \"rake\";\n\n/**\n * YAKE (Yet Another Keyword Extractor) algorithm configuration.\n *\n * YAKE is an unsupervised keyword extraction method that doesn't require training data.\n */\nexport interface YakeParams {\n\t/** Window size for co-occurrence analysis (number of words to consider). Default: 3. */\n\twindowSize?: number;\n}\n\n/**\n * RAKE (Rapid Automatic Keyword Extraction) algorithm configuration.\n *\n * RAKE extracts keywords based on word co-occurrence and statistical measures.\n */\nexport interface RakeParams {\n\t/** Minimum word length to consider as keyword. Default: 3. */\n\tminWordLength?: number;\n\n\t/** Maximum number of words per keyword phrase. Default: 3. */\n\tmaxWordsPerPhrase?: number;\n}\n\n/**\n * Keyword extraction configuration.\n *\n * Extracts important keywords/phrases from document content using YAKE or RAKE algorithms.\n */\nexport interface KeywordConfig {\n\t/** Extraction algorithm: \"yake\" or \"rake\". Default: \"yake\". */\n\talgorithm?: KeywordAlgorithm;\n\n\t/** Maximum number of keywords to extract. Default: 10. */\n\tmaxKeywords?: number;\n\n\t/** Minimum relevance score (0.0-1.0) for keywords. Keywords below this are filtered out. Default: 0.1. */\n\tminScore?: number;\n\n\t/** N-gram range: [min_length, max_length] for phrase keywords (e.g., [1, 3] for 1-3 word phrases). Default: [1, 3]. */\n\tngramRange?: [number, number];\n\n\t/** Language for keyword extraction (e.g., 'en', 'de', 'fr'). Default: 'en'. */\n\tlanguage?: string;\n\n\t/** YAKE algorithm-specific parameters. Only used when algorithm is \"yake\". */\n\tyakeParams?: YakeParams;\n\n\t/** RAKE algorithm-specific parameters. Only used when algorithm is \"rake\". 
*/\n\trakeParams?: RakeParams;\n}\n\n/**\n * Extracted keyword with relevance metadata.\n *\n * Represents a single keyword extracted from text along with its relevance score,\n * the algorithm that extracted it, and optional position information.\n */\nexport interface ExtractedKeyword {\n\t/** The keyword text */\n\ttext: string;\n\n\t/** Relevance score (higher is better, algorithm-specific range) */\n\tscore: number;\n\n\t/** Algorithm that extracted this keyword */\n\talgorithm: KeywordAlgorithm;\n\n\t/** Optional positions where keyword appears in text (character offsets) */\n\tpositions?: number[];\n}\n\n/**\n * Warning from a post-processor during extraction.\n */\nexport interface ProcessingWarning {\n\t/** Name of the post-processor that produced the warning */\n\tsource: string;\n\n\t/** Warning message */\n\tmessage: string;\n}\n\n/**\n * Page tracking and extraction configuration.\n *\n * Controls how pages/slides/sheets are extracted and tracked in the document.\n * Page range information in chunk metadata (first_page/last_page) is automatically\n * enabled when page boundaries are available and chunking is configured.\n */\nexport interface PageExtractionConfig {\n\t/** Extract pages as separate array (ExtractionResult.pages) */\n\textractPages?: boolean;\n\t/** Insert page markers in main content string */\n\tinsertPageMarkers?: boolean;\n\t/** Page marker format (use {page_num} placeholder) */\n\tmarkerFormat?: string;\n}\n\n/**\n * Main extraction configuration interface.\n *\n * Combines all sub-configurations for document extraction, OCR, chunking, post-processing, etc.\n * All fields are optional and use sensible defaults.\n */\nexport interface ExtractionConfig {\n\t/** Enable caching of extraction results for identical inputs. Default: true. */\n\tuseCache?: boolean;\n\n\t/** Enable quality processing filters to improve extraction reliability. Default: false. 
*/\n\tenableQualityProcessing?: boolean;\n\n\t/** OCR configuration for text extraction from images. Only used when document contains images or forceOcr is true. */\n\tocr?: OcrConfig;\n\n\t/** Force OCR processing even for documents with selectable text. Useful for scanned documents. Default: false. */\n\tforceOcr?: boolean;\n\n\t/** Include structured document tree in the extraction result. Default: false. */\n\tincludeDocumentStructure?: boolean;\n\n\t/** Chunking configuration for splitting documents into smaller pieces for RAG or vector DB. */\n\tchunking?: ChunkingConfig;\n\n\t/** Image extraction and optimization configuration. */\n\timages?: ImageExtractionConfig;\n\n\t/** PDF-specific extraction options (passwords, metadata, etc.). */\n\tpdfOptions?: PdfConfig;\n\n\t/** Token reduction configuration for optimizing token usage in LLM pipelines. */\n\ttokenReduction?: TokenReductionConfig;\n\n\t/** Language detection configuration for automatic language identification. */\n\tlanguageDetection?: LanguageDetectionConfig;\n\n\t/** Post-processor configuration for customizing extraction results. */\n\tpostprocessor?: PostProcessorConfig;\n\n\t/** HTML to Markdown conversion options for HTML content. */\n\thtmlOptions?: HtmlConversionOptions;\n\n\t/** Keyword extraction configuration for extracting important phrases. */\n\tkeywords?: KeywordConfig;\n\n\t/** Page tracking and extraction configuration for multi-page documents. */\n\tpages?: PageExtractionConfig;\n\n\t/** Maximum number of concurrent extractions in batch operations. Default: 4. */\n\tmaxConcurrentExtractions?: number;\n\n\t/**\n\t * Output text format for extracted content. Default: \"plain\".\n\t *\n\t * - \"plain\": Raw extracted text\n\t * - \"markdown\": Markdown formatted output\n\t * - \"djot\": Djot markup format\n\t * - \"html\": HTML formatted output\n\t */\n\toutputFormat?: \"plain\" | \"markdown\" | \"djot\" | \"html\";\n\n\t/**\n\t * Result structure format. 
Default: \"unified\".\n\t *\n\t * - \"unified\": All content in the `content` field (default)\n\t * - \"element_based\": Semantic element extraction (Unstructured-compatible)\n\t */\n\tresultFormat?: \"unified\" | \"element_based\";\n}\n\n/**\n * Extracted table data from document.\n *\n * Contains both cell data and Markdown representation for easy display and processing.\n */\nexport interface Table {\n\t/** 2D array of cell contents (rows × columns) */\n\tcells: string[][];\n\n\t/** Markdown representation of the table for display or parsing */\n\tmarkdown: string;\n\n\t/** Page number where this table was found (1-indexed) */\n\tpageNumber: number;\n\n\t/** Bounding box of the table on the page (PDF coordinates). */\n\tboundingBox?: BoundingBox | null;\n}\n\nexport interface ExcelMetadata {\n\tsheetCount?: number;\n\tsheetNames?: string[];\n}\n\nexport interface EmailMetadata {\n\tfromEmail?: string | null;\n\tfromName?: string | null;\n\ttoEmails?: string[];\n\tccEmails?: string[];\n\tbccEmails?: string[];\n\tmessageId?: string | null;\n\tattachments?: string[];\n}\n\nexport interface ArchiveMetadata {\n\tformat?: string;\n\tfileCount?: number;\n\tfileList?: string[];\n\ttotalSize?: number;\n\tcompressedSize?: number | null;\n}\n\nexport interface ImageMetadata {\n\twidth?: number;\n\theight?: number;\n\tformat?: string;\n\texif?: Record<string, string>;\n}\n\nexport interface XmlMetadata {\n\telementCount?: number;\n\tuniqueElements?: string[];\n}\n\nexport interface TextMetadata {\n\tlineCount?: number;\n\twordCount?: number;\n\tcharacterCount?: number;\n\theaders?: string[] | null;\n\tlinks?: [string, string][] | null;\n\tcodeBlocks?: [string, string][] | null;\n}\n\nexport interface HeaderMetadata {\n\tlevel: number;\n\ttext: string;\n\tid?: string | null;\n\tdepth: number;\n\thtmlOffset: number;\n}\n\nexport interface LinkMetadata {\n\thref: string;\n\ttext: string;\n\ttitle?: string | null;\n\tlinkType: \"anchor\" | \"internal\" | \"external\" | 
\"email\" | \"phone\" | \"other\";\n\trel: string[];\n\tattributes: Record<string, string>;\n}\n\nexport interface HtmlImageMetadata {\n\tsrc: string;\n\talt?: string | null;\n\ttitle?: string | null;\n\tdimensions?: [number, number] | null;\n\timageType: \"data_uri\" | \"inline_svg\" | \"external\" | \"relative\";\n\tattributes: Record<string, string>;\n}\n\nexport interface StructuredData {\n\tdataType: \"json_ld\" | \"microdata\" | \"rdfa\";\n\trawJson: string;\n\tschemaType?: string | null;\n}\n\nexport interface HtmlMetadata {\n\ttitle?: string | null;\n\tdescription?: string | null;\n\tkeywords: string[];\n\tauthor?: string | null;\n\tcanonicalUrl?: string | null;\n\tbaseHref?: string | null;\n\tlanguage?: string | null;\n\ttextDirection?: \"ltr\" | \"rtl\" | \"auto\" | null;\n\topenGraph: Record<string, string>;\n\ttwitterCard: Record<string, string>;\n\tmetaTags: Record<string, string>;\n\thtmlHeaders: HeaderMetadata[];\n\thtmlLinks: LinkMetadata[];\n\thtmlImages: HtmlImageMetadata[];\n\tstructuredData: StructuredData[];\n}\n\nexport interface PdfMetadata {\n\ttitle?: string | null;\n\tauthor?: string | null;\n\tsubject?: string | null;\n\tkeywords?: string | null;\n\tcreator?: string | null;\n\tproducer?: string | null;\n\tcreationDate?: string | null;\n\tmodificationDate?: string | null;\n\tpageCount?: number;\n}\n\nexport interface PptxMetadata {\n\ttitle?: string | null;\n\tauthor?: string | null;\n\tdescription?: string | null;\n\tsummary?: string | null;\n\tfonts?: string[];\n}\n\nexport interface OcrMetadata {\n\tlanguage?: string;\n\tpsm?: number;\n\toutputFormat?: string;\n\ttableCount?: number;\n\ttableRows?: number | null;\n\ttableCols?: number | null;\n}\n\nexport interface ImagePreprocessingMetadata {\n\toriginalDimensions?: [number, number];\n\toriginalDpi?: [number, number];\n\ttargetDpi?: number;\n\tscaleFactor?: number;\n\tautoAdjusted?: boolean;\n\tfinalDpi?: number;\n\tnewDimensions?: [number, number] | null;\n\tresampleMethod?: 
string;\n\tdimensionClamped?: boolean;\n\tcalculatedDpi?: number | null;\n\tskippedResize?: boolean;\n\tresizeError?: string | null;\n}\n\nexport interface ErrorMetadata {\n\terrorType?: string;\n\tmessage?: string;\n}\n\n/**\n * Page boundary information for chunk metadata.\n *\n * Tracks where a specific page's content starts and ends in the main content string,\n * enabling mapping from byte positions to page numbers. All offsets are guaranteed to be\n * at valid UTF-8 character boundaries.\n */\nexport interface PageBoundary {\n\t/** Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive) */\n\tbyteStart: number;\n\t/** Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive) */\n\tbyteEnd: number;\n\t/** Page number (1-indexed) */\n\tpageNumber: number;\n}\n\n/**\n * Type of paginated unit in a document.\n *\n * Distinguishes between different types of \"pages\":\n * - \"page\": Standard document pages (PDF, DOCX, images)\n * - \"slide\": Presentation slides (PPTX, ODP)\n * - \"sheet\": Spreadsheet sheets (XLSX, ODS)\n */\nexport type PageUnitType = \"page\" | \"slide\" | \"sheet\";\n\n/**\n * Detailed per-page metadata.\n *\n * Captures information about a single page/slide/sheet including dimensions,\n * content counts, and visibility state.\n */\nexport interface PageInfo {\n\t/** Page number (1-indexed) */\n\tnumber: number;\n\t/** Page title (usually for presentations) */\n\ttitle?: string | null;\n\t/** Dimensions in points (PDF) or pixels (images): [width, height] */\n\tdimensions?: [number, number] | null;\n\t/** Number of images on this page */\n\timageCount?: number | null;\n\t/** Number of tables on this page */\n\ttableCount?: number | null;\n\t/** Whether this page is hidden (e.g., in presentations) */\n\thidden?: boolean | null;\n\t/** Whether this page is blank (contains no meaningful content) */\n\tisBlank?: boolean | null;\n}\n\n/**\n * Page structure metadata.\n *\n * Contains 
information about pages/slides/sheets in a document, including\n * boundaries for mapping chunks to pages and detailed per-page metadata.\n */\nexport interface PageStructure {\n\t/** Total number of pages/slides/sheets */\n\ttotalCount: number;\n\t/** Type of paginated unit (page, slide, or sheet) */\n\tunitType: PageUnitType;\n\t/** Byte offset boundaries for each page */\n\tboundaries?: PageBoundary[] | null;\n\t/** Detailed per-page metadata (optional, only when needed) */\n\tpages?: PageInfo[] | null;\n}\n\n/**\n * Metadata about a chunk's position and properties in the document.\n *\n * Tracks where a chunk appears in the original document, including byte offsets\n * and page ranges when page tracking is enabled.\n */\n/** Heading depth and text for markdown heading context. */\nexport interface HeadingLevel {\n\t/** Heading depth (1 = h1, 2 = h2, etc.) */\n\tlevel: number;\n\t/** Text content of the heading */\n\ttext: string;\n}\n\n/** Heading hierarchy context for a markdown chunk. 
*/\nexport interface HeadingContext {\n\t/** Heading hierarchy from document root to this chunk's section */\n\theadings: HeadingLevel[];\n}\n\nexport interface ChunkMetadata {\n\t/** Byte offset where this chunk starts in the original text (UTF-8 valid boundary) */\n\tbyteStart: number;\n\t/** Byte offset where this chunk ends in the original text (UTF-8 valid boundary) */\n\tbyteEnd: number;\n\t/** Number of tokens in this chunk (if available from embedding model) */\n\ttokenCount?: number | null;\n\t/** Zero-based index of this chunk in the document */\n\tchunkIndex: number;\n\t/** Total number of chunks in the document */\n\ttotalChunks: number;\n\t/** First page number this chunk spans (1-indexed, only when page tracking enabled) */\n\tfirstPage?: number | null;\n\t/** Last page number this chunk spans (1-indexed, only when page tracking enabled) */\n\tlastPage?: number | null;\n\t/** Heading context when using markdown chunker */\n\theadingContext?: HeadingContext | null;\n}\n\n/**\n * Text chunk with optional embedding.\n *\n * Represents a segment of a document created by the chunking algorithm, useful for RAG and vector databases.\n */\nexport interface Chunk {\n\t/** Text content of this chunk */\n\tcontent: string;\n\n\t/** Vector embedding for this chunk (if embedding model was used) */\n\tembedding?: number[] | null;\n\n\t/** Metadata about chunk position and properties in the document */\n\tmetadata: ChunkMetadata;\n}\n\n/**\n * Extracted image from document with optional OCR result.\n *\n * Contains image data and metadata about position, dimensions, and properties.\n */\nexport interface ExtractedImage {\n\t/** Raw image bytes as Uint8Array */\n\tdata: Uint8Array;\n\n\t/** Image format (e.g., 'png', 'jpeg', 'tiff') */\n\tformat: string;\n\n\t/** Sequential index of this image in the document (0-indexed) */\n\timageIndex: number;\n\n\t/** Page number where this image was found (1-indexed), null if unknown */\n\tpageNumber?: number | null;\n\n\t/** 
Image width in pixels, null if unknown */\n\twidth?: number | null;\n\n\t/** Image height in pixels, null if unknown */\n\theight?: number | null;\n\n\t/** Color space (e.g., 'RGB', 'CMYK', 'Grayscale'), null if unknown */\n\tcolorspace?: string | null;\n\n\t/** Bits per color component (e.g., 8 for 8-bit), null if unknown */\n\tbitsPerComponent?: number | null;\n\n\t/** Whether this is a mask image (used internally by PDF) */\n\tisMask: boolean;\n\n\t/** Image description or caption if available */\n\tdescription?: string | null;\n\n\t/** OCR extraction result if OCR was run on this image, null otherwise */\n\tocrResult?: ExtractionResult | null;\n\n\t/** Bounding box of the image on the page (PDF coordinates). */\n\tboundingBox?: BoundingBox | null;\n}\n\n/**\n * Content for a single page/slide/sheet.\n *\n * When page extraction is enabled, documents are split into per-page content\n * with associated tables and images mapped to each page. This allows for page-specific processing.\n */\nexport interface PageContent {\n\t/** Page number (1-indexed) starting from 1 */\n\tpageNumber: number;\n\n\t/** Text content extracted from this page */\n\tcontent: string;\n\n\t/** Tables found and extracted from this page */\n\ttables: Table[];\n\n\t/** Images found and extracted from this page */\n\timages: ExtractedImage[];\n\n\t/** Whether this page is blank (contains no meaningful content) */\n\tisBlank?: boolean | null;\n}\n\n/**\n * Extraction result metadata.\n *\n * Uses a flattened discriminated union approach with format_type as the discriminator.\n * When format_type is set (e.g., \"archive\"), the corresponding format-specific fields\n * are available at the root level of the metadata object.\n *\n * This structure matches the Rust serialization with serde's tagged enum flattening.\n */\nexport interface Metadata {\n\tlanguage?: string | null;\n\tcreatedAt?: string | null;\n\tmodifiedAt?: string | null;\n\tsubject?: string | null;\n\n\tformatType?: \"pdf\" | 
\"excel\" | \"email\" | \"pptx\" | \"archive\" | \"image\" | \"xml\" | \"text\" | \"html\" | \"ocr\";\n\n\ttitle?: string | null;\n\tauthors?: string[] | null;\n\tkeywords?: string[] | null;\n\tcreator?: string | null;\n\tproducer?: string | null;\n\tcreationDate?: string | null;\n\tmodificationDate?: string | null;\n\tpageCount?: number;\n\n\tsheetCount?: number;\n\tsheetNames?: string[];\n\n\tfromEmail?: string | null;\n\tfromName?: string | null;\n\ttoEmails?: string[];\n\tccEmails?: string[];\n\tbccEmails?: string[];\n\tmessageId?: string | null;\n\tattachments?: string[];\n\n\tdescription?: string | null;\n\tsummary?: string | null;\n\tfonts?: string[];\n\n\tformat?: string;\n\tfileCount?: number;\n\tfileList?: string[];\n\ttotalSize?: number;\n\tcompressedSize?: number | null;\n\n\twidth?: number;\n\theight?: number;\n\texif?: Record<string, string>;\n\n\telementCount?: number;\n\tuniqueElements?: string[];\n\n\tline_count?: number;\n\tword_count?: number;\n\tcharacter_count?: number;\n\theaders?: string[] | null;\n\tlinks?: [string, string][] | null;\n\tcode_blocks?: [string, string][] | null;\n\n\tcanonical_url?: string | null;\n\tbase_href?: string | null;\n\topen_graph?: Record<string, string>;\n\ttwitter_card?: Record<string, string>;\n\tmeta_tags?: Record<string, string>;\n\thtml_language?: string | null;\n\ttext_direction?: \"ltr\" | \"rtl\" | \"auto\" | null;\n\thtml_headers?: HeaderMetadata[];\n\thtml_links?: LinkMetadata[];\n\thtml_images?: HtmlImageMetadata[];\n\tstructured_data?: StructuredData[];\n\n\tpsm?: number;\n\toutput_format?: string;\n\ttable_count?: number;\n\ttable_rows?: number | null;\n\ttable_cols?: number | null;\n\n\timage_preprocessing?: ImagePreprocessingMetadata | null;\n\n\tjson_schema?: Record<string, unknown> | null;\n\n\tpage_structure?: PageStructure | null;\n\n\terror?: ErrorMetadata | null;\n\n\t/**\n\t * Additional fields may be added at runtime by postprocessors.\n\t * Use bracket notation to safely access unexpected 
properties.\n\t */\n\t[key: string]: unknown;\n}\n\n/**\n * Semantic element type classification.\n *\n * Categorizes text content into semantic units for downstream processing.\n * Supports the element types commonly found in structured documents.\n */\nexport type ElementType =\n\t| \"title\"\n\t| \"narrative_text\"\n\t| \"heading\"\n\t| \"list_item\"\n\t| \"table\"\n\t| \"image\"\n\t| \"page_break\"\n\t| \"code_block\"\n\t| \"block_quote\"\n\t| \"footer\"\n\t| \"header\";\n\n/**\n * Bounding box coordinates for element positioning.\n *\n * Represents rectangular coordinates in the document space.\n */\nexport interface BoundingBox {\n\t/** Left x-coordinate */\n\tx0: number;\n\t/** Bottom y-coordinate */\n\ty0: number;\n\t/** Right x-coordinate */\n\tx1: number;\n\t/** Top y-coordinate */\n\ty1: number;\n}\n\n/**\n * A PDF annotation extracted from a document page.\n */\nexport interface PdfAnnotation {\n\t/** Type of annotation (e.g., \"text\", \"highlight\", \"link\", \"underline\") */\n\tannotationType: string;\n\t/** Text content of the annotation, if available */\n\tcontent?: string | null;\n\t/** Page number (1-indexed) where the annotation appears */\n\tpageNumber: number;\n\t/** Bounding box of the annotation on the page */\n\tboundingBox?: BoundingBox | null;\n}\n\n/**\n * Metadata for a semantic element.\n *\n * Contains structural and positioning information about an extracted element.\n */\nexport interface ElementMetadata {\n\t/** Page number (1-indexed) */\n\tpageNumber?: number | null;\n\t/** Source filename or document name */\n\tfilename?: string | null;\n\t/** Bounding box coordinates if available */\n\tcoordinates?: BoundingBox | null;\n\t/** Position index in the element sequence */\n\telementIndex?: number | null;\n\t/** Additional custom metadata */\n\tadditional?: Record<string, string>;\n}\n\n/**\n * Semantic element extracted from document.\n *\n * Represents a logical unit of content with semantic classification,\n * unique identifier, 
and metadata for tracking origin and position.\n */\nexport interface Element {\n\t/** Unique element identifier */\n\telementId: string;\n\t/** Semantic type of this element */\n\telementType: ElementType;\n\t/** Text content of the element */\n\ttext: string;\n\t/** Metadata about the element */\n\tmetadata: ElementMetadata;\n}\n\n/**\n * Complete extraction result from document processing.\n *\n * Contains all extracted content, metadata, and optional processed data like chunks and images.\n * This is the primary return value from extraction functions.\n */\nexport interface ExtractionResult {\n\t/** Extracted text content from the document (main content) */\n\tcontent: string;\n\n\t/** MIME type of the input document (e.g., 'application/pdf', 'text/html') */\n\tmimeType: string;\n\n\t/** Document metadata including title, author, creation date, language, and format-specific fields */\n\tmetadata: Metadata;\n\n\t/** Tables extracted from the document (2D cell arrays with Markdown representation) */\n\ttables: Table[];\n\n\t/** Detected languages in the document (ISO 639-1 codes, e.g., ['en', 'de']), null if detection disabled */\n\tdetectedLanguages: string[] | null;\n\n\t/** Document chunks for RAG/vector databases (if chunking was enabled), null otherwise */\n\tchunks: Chunk[] | null;\n\n\t/** Images extracted from document with metadata (if image extraction was enabled), null otherwise */\n\timages: ExtractedImage[] | null;\n\n\t/** Semantic elements extracted from document with type classification and metadata (if element extraction was enabled), null otherwise */\n\telements?: Element[] | null;\n\n\t/** Per-page content when page extraction is enabled, null otherwise. Each item contains page number, content, tables, and images. 
*/\n\tpages?: PageContent[] | null;\n\n\t/** Extracted keywords when keyword extraction is enabled, undefined otherwise */\n\textractedKeywords?: ExtractedKeyword[];\n\n\t/** Quality score when quality processing is enabled, undefined otherwise */\n\tqualityScore?: number;\n\n\t/** Processing warnings from post-processors */\n\tprocessingWarnings?: ProcessingWarning[];\n\n\t/** Granular OCR elements (words, lines, blocks) when OCR element extraction is enabled, null otherwise */\n\tocrElements?: OcrElement[] | null;\n\n\t/** Structured document tree when include_document_structure is enabled, null otherwise */\n\tdocument?: Record<string, unknown> | null;\n\n\t/** PDF annotations when extract_annotations is enabled, null otherwise */\n\tannotations?: PdfAnnotation[] | null;\n}\n\n/** Post-processor execution stage in the extraction pipeline. */\nexport type ProcessingStage = \"early\" | \"middle\" | \"late\";\n\n/**\n * Protocol for custom post-processors that modify extraction results.\n *\n * Post-processors enrich or transform extraction results without failing the extraction.\n * If a post-processor throws an error, it's logged but extraction continues.\n * Only works with async extraction functions (`extractFile`, `extractBytes`, etc.).\n */\nexport interface PostProcessorProtocol {\n\t/**\n\t * Return the unique name of this postprocessor.\n\t *\n\t * @returns Unique processor name (case-sensitive, alphanumeric + underscores recommended)\n\t */\n\tname(): string;\n\n\t/**\n\t * Process and enrich an extraction result.\n\t *\n\t * Modify the result to add new metadata, transform content, or perform other enrichment.\n\t * If this throws an error, it's logged but extraction continues.\n\t *\n\t * @param result - ExtractionResult with extracted content, metadata, and tables\n\t * @returns Modified result with enriched data. 
Can be async or sync.\n\t */\n\tprocess(result: ExtractionResult): ExtractionResult | Promise<ExtractionResult>;\n\n\t/**\n\t * Return the processing stage for this processor.\n\t *\n\t * Determines when this processor runs relative to others:\n\t * - \"early\": Runs first, before other processors (good for cleanup/normalization)\n\t * - \"middle\": Runs with other middle-stage processors (default)\n\t * - \"late\": Runs last, after others (good for final enrichment)\n\t *\n\t * @returns One of \"early\", \"middle\", or \"late\" (default: \"middle\")\n\t */\n\tprocessingStage?(): ProcessingStage;\n\n\t/**\n\t * Initialize the processor (e.g., load ML models, setup resources).\n\t *\n\t * Called once when the processor is first registered. Use for expensive operations.\n\t */\n\tinitialize?(): void | Promise<void>;\n\n\t/**\n\t * Shutdown the processor and release resources.\n\t *\n\t * Called when the processor is unregistered. Use for cleanup (closing connections, freeing memory).\n\t */\n\tshutdown?(): void | Promise<void>;\n}\n\n/**\n * Protocol for custom validators that check extraction results.\n *\n * Validators perform quality checks and fail the extraction if validation fails.\n * Unlike post-processors, validator errors cause the entire extraction to fail.\n * Useful for enforcing quality standards on extracted content.\n */\nexport interface ValidatorProtocol {\n\t/**\n\t * Return the unique name of this validator.\n\t *\n\t * @returns Unique validator name (case-sensitive, alphanumeric + underscores recommended)\n\t */\n\tname(): string;\n\n\t/**\n\t * Validate an extraction result.\n\t *\n\t * Throw an error if validation fails. 
The error message will be used as the extraction error.\n\t * If validation passes, return without throwing (return value is ignored).\n\t *\n\t * @param result - ExtractionResult to validate\n\t * @throws {Error} If validation fails (extraction will fail with this error)\n\t */\n\tvalidate(result: ExtractionResult): void | Promise<void>;\n\n\t/**\n\t * Return the validation priority.\n\t *\n\t * Higher priority validators run first. Useful for running cheap validations (e.g., length checks)\n\t * before expensive ones (e.g., AI-based quality checks) to fail fast.\n\t *\n\t * @returns Priority value (higher = runs earlier, default: 50). Range: 0-1000.\n\t */\n\tpriority?(): number;\n\n\t/**\n\t * Check if this validator should run for a given result.\n\t *\n\t * Allows conditional validation based on MIME type, metadata, or content.\n\t * This is evaluated before validation, so expensive checks can be skipped for irrelevant documents.\n\t *\n\t * @param result - ExtractionResult to check\n\t * @returns true if validator should run, false to skip (default: true)\n\t */\n\tshouldValidate?(result: ExtractionResult): boolean;\n\n\t/**\n\t * Initialize the validator (e.g., load ML models, setup resources).\n\t *\n\t * Called once when the validator is first registered. Use for expensive operations.\n\t */\n\tinitialize?(): void | Promise<void>;\n\n\t/**\n\t * Shutdown the validator and release resources.\n\t *\n\t * Called when the validator is unregistered. 
Use for cleanup (closing connections, freeing memory).\n\t */\n\tshutdown?(): void | Promise<void>;\n}\n\n/**\n * OCR backend protocol for implementing custom OCR engines.\n *\n * This interface defines the contract for OCR backends that can be registered\n * with Kreuzberg's extraction pipeline.\n *\n * ## Implementation Requirements\n *\n * OCR backends must implement:\n * - `name()`: Return a unique backend identifier\n * - `supportedLanguages()`: Return list of supported ISO 639-1/2/3 language codes\n * - `processImage()`: Process image bytes and return extraction result\n *\n * ## Optional Methods\n *\n * - `initialize()`: Called when backend is registered (load models, etc.)\n * - `shutdown()`: Called when backend is unregistered (cleanup resources)\n *\n * @example\n * ```typescript\n * import { registerOcrBackend, extractFile } from '@kreuzberg/node';\n *\n * // PaddleOCR is built into the native Rust core - just use the backend name\n * const result = await extractFile('scanned.pdf', null, {\n * ocr: { backend: 'paddle-ocr', language: 'en' }\n * });\n * ```\n */\nexport interface OcrBackendProtocol {\n\t/**\n\t * Return the unique name of this OCR backend.\n\t *\n\t * This name is used in ExtractionConfig to select the backend:\n\t * ```typescript\n\t * { ocr: { backend: 'paddle-ocr', language: 'en' } }\n\t * ```\n\t *\n\t * @returns Unique backend identifier (e.g., \"paddle-ocr\", \"tesseract\")\n\t */\n\tname(): string;\n\n\t/**\n\t * Return list of supported language codes.\n\t *\n\t * Language codes should follow ISO 639-1 (2-letter) or ISO 639-2 (3-letter) standards.\n\t * Common codes: \"en\", \"eng\" (English), \"de\", \"deu\" (German), \"fr\", \"fra\" (French).\n\t *\n\t * @returns Array of supported language codes\n\t *\n\t * @example\n\t * ```typescript\n\t * supportedLanguages(): string[] {\n\t * return [\"en\", \"eng\", \"de\", \"deu\", \"fr\", \"fra\"];\n\t * }\n\t * ```\n\t */\n\tsupportedLanguages(): string[];\n\n\t/**\n\t * Process image 
bytes and extract text via OCR.\n\t *\n\t * This method receives raw image data and must return a result object with:\n\t * - `content`: Extracted text content\n\t * - `mime_type`: MIME type (usually \"text/plain\")\n\t * - `metadata`: Additional information (confidence, dimensions, etc.)\n\t * - `tables`: Optional array of detected tables\n\t *\n\t * @param imageBytes - Raw image data (Uint8Array) or Base64-encoded string (when called from Rust bindings)\n\t * @param language - Language code from supportedLanguages()\n\t * @returns Promise resolving to extraction result\n\t *\n\t * @example\n\t * ```typescript\n\t * async processImage(imageBytes: Uint8Array | string, language: string): Promise<{\n\t * content: string;\n\t * mime_type: string;\n\t * metadata: Record<string, unknown>;\n\t * tables: unknown[];\n\t * }> {\n\t * const buffer = typeof imageBytes === \"string\" ? Buffer.from(imageBytes, \"base64\") : Buffer.from(imageBytes);\n\t * const text = await myOcrEngine.recognize(buffer, language);\n\t * return {\n\t * content: text,\n\t * mime_type: \"text/plain\",\n\t * metadata: { confidence: 0.95, language },\n\t * tables: []\n\t * };\n\t * }\n\t * ```\n\t */\n\tprocessImage(\n\t\timageBytes: Uint8Array | string,\n\t\tlanguage: string,\n\t): Promise<{\n\t\tcontent: string;\n\t\tmime_type: string;\n\t\tmetadata: Record<string, unknown>;\n\t\ttables: unknown[];\n\t}>;\n\n\t/**\n\t * Initialize the OCR backend (optional).\n\t *\n\t * Called once when the backend is registered. Use this to:\n\t * - Load ML models\n\t * - Initialize libraries\n\t * - Validate dependencies\n\t *\n\t * @example\n\t * ```typescript\n\t * async initialize(): Promise<void> {\n\t * this.model = await loadModel('./path/to/model');\n\t * }\n\t * ```\n\t */\n\tinitialize?(): void | Promise<void>;\n\n\t/**\n\t * Shutdown the OCR backend and release resources (optional).\n\t *\n\t * Called when the backend is unregistered. 
Use this to:\n\t * - Free model memory\n\t * - Close file handles\n\t * - Cleanup temporary files\n\t *\n\t * @example\n\t * ```typescript\n\t * async shutdown(): Promise<void> {\n\t * await this.model.dispose();\n\t * this.model = null;\n\t * }\n\t * ```\n\t */\n\tshutdown?(): void | Promise<void>;\n}\n\n/**\n * Result of error message classification into error codes.\n *\n * Provides classification details including the error code, name,\n * description, and confidence score for the classification.\n *\n * @example\n * ```typescript\n * import { classifyError, ErrorCode } from '@kreuzberg/node';\n *\n * const result = classifyError(\"File not found in read operation\");\n * if (result.code === ErrorCode.IoError) {\n * console.error(`I/O Error: ${result.description}`);\n * console.log(`Confidence: ${result.confidence}`);\n * }\n * ```\n */\nexport interface ErrorClassification {\n\t/**\n\t * The numeric error code (0-7) representing the error type.\n\t */\n\tcode: number;\n\n\t/**\n\t * The human-readable name of the error code (e.g., \"validation\", \"ocr\").\n\t */\n\tname: string;\n\n\t/**\n\t * A brief description of the error type.\n\t */\n\tdescription: string;\n\n\t/**\n\t * Confidence score (0.0-1.0) indicating how certain the classification is.\n\t * Higher values indicate higher confidence in the classification.\n\t */\n\tconfidence: number;\n}\n\n// ============================================================================\n// Worker Pool APIs\n// ============================================================================\n\n/**\n * Opaque handle to a worker pool for concurrent extraction operations.\n *\n * Worker pools enable parallel processing of CPU-bound document extraction\n * tasks by distributing work across multiple threads. 
This is especially\n * useful for batch processing large numbers of documents.\n *\n * @example\n * ```typescript\n * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';\n *\n * const pool = createWorkerPool(4); // 4 concurrent workers\n * try {\n * const result = await extractFileInWorker(pool, 'document.pdf');\n * console.log(result.content);\n * } finally {\n * await closeWorkerPool(pool);\n * }\n * ```\n */\nexport interface WorkerPool {\n\t/** Internal pool identifier (opaque) */\n\treadonly poolId: number;\n}\n\n/**\n * Worker pool statistics.\n *\n * Provides information about the current state of a worker pool including\n * pool size, number of active workers, and queued tasks.\n *\n * @example\n * ```typescript\n * import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node';\n *\n * const pool = createWorkerPool(4);\n * const stats = getWorkerPoolStats(pool);\n * console.log(`Active: ${stats.activeWorkers}/${stats.size}`);\n * console.log(`Queued: ${stats.queuedTasks}`);\n * ```\n */\nexport interface WorkerPoolStats {\n\t/**\n\t * Maximum number of concurrent workers in the pool.\n\t */\n\tsize: number;\n\n\t/**\n\t * Number of currently active (executing) workers.\n\t */\n\tactiveWorkers: number;\n\n\t/**\n\t * Number of tasks waiting in the queue.\n\t */\n\tqueuedTasks: number;\n}\n"],"mappings":";;;;;;;;;;;;;;;;AAAA;AAAA;","names":[]}
1
+ {"version":3,"sources":["../typescript/types.ts"],"sourcesContent":["/**\n * Type definitions for Kreuzberg extraction results.\n *\n * These types mirror the strongly-typed Rust metadata structures,\n * providing type safety for TypeScript users.\n */\n\n// ============================================================================\n// ============================================================================\n\n/**\n * Tesseract OCR engine configuration options.\n *\n * @example\n * ```typescript\n * const config: TesseractConfig = {\n * psm: 6,\n * enableTableDetection: true,\n * tesseditCharWhitelist: '0123456789'\n * };\n * ```\n */\nexport interface TesseractConfig {\n\t/**\n\t * Page Segmentation Mode (0-13). Controls how Tesseract segments and recognizes text.\n\t * Common values: 3 (auto), 6 (single uniform block), 11 (sparse text).\n\t * Default: 3 (auto layout analysis).\n\t */\n\tpsm?: number;\n\n\t/**\n\t * Enable table detection during OCR processing.\n\t * When true, Tesseract attempts to preserve table structure in the output.\n\t * Default: false.\n\t */\n\tenableTableDetection?: boolean;\n\n\t/**\n\t * Whitelist of characters Tesseract should recognize.\n\t * Only these characters will be returned by the OCR engine.\n\t * Use empty string to allow all characters. 
Useful for constraining output to digits,\n\t * specific alphabets, or other character sets.\n\t * Default: null (recognize all).\n\t */\n\ttesseditCharWhitelist?: string;\n}\n\n/**\n * OCR element hierarchy level.\n *\n * Defines the granularity of OCR element extraction.\n */\nexport type OcrElementLevel = \"word\" | \"line\" | \"block\" | \"page\";\n\n/**\n * Bounding geometry for OCR elements using rectangle coordinates.\n *\n * Represents rectangular coordinates with position and dimensions.\n */\nexport interface OcrBoundingGeometryRectangle {\n\ttype: \"rectangle\";\n\tleft: number;\n\ttop: number;\n\twidth: number;\n\theight: number;\n}\n\n/**\n * Bounding geometry for OCR elements using quadrilateral points.\n *\n * Represents irregular quadrilateral shapes with four corner points.\n */\nexport interface OcrBoundingGeometryQuadrilateral {\n\ttype: \"quadrilateral\";\n\tpoints: number[][];\n}\n\n/**\n * Bounding geometry for OCR elements.\n *\n * Can be either rectangular or quadrilateral based on the OCR engine's detection capability.\n */\nexport type OcrBoundingGeometry = OcrBoundingGeometryRectangle | OcrBoundingGeometryQuadrilateral;\n\n/**\n * Confidence scores for OCR operations.\n *\n * Tracks confidence levels for different aspects of OCR processing.\n */\nexport interface OcrConfidence {\n\t/** Confidence score (0.0-1.0) for text detection. */\n\tdetection?: number;\n\n\t/** Confidence score (0.0-1.0) for text recognition. */\n\trecognition?: number;\n}\n\n/**\n * Rotation information for OCR elements.\n *\n * Tracks detected text rotation and associated confidence.\n */\nexport interface OcrRotation {\n\t/** Angle of rotation in degrees. */\n\tangleDegrees?: number;\n\n\t/** Confidence score (0.0-1.0) for rotation detection. 
*/\n\tconfidence?: number;\n}\n\n/**\n * Individual OCR element (word, line, block, or page).\n *\n * Represents a granular unit of text extracted by OCR with geometric and confidence information.\n */\nexport interface OcrElement {\n\t/** Extracted text content */\n\ttext: string;\n\n\t/** Bounding geometry of the element in the image */\n\tgeometry?: OcrBoundingGeometry;\n\n\t/** Confidence scores for detection and recognition */\n\tconfidence?: OcrConfidence;\n\n\t/** Hierarchy level of this element */\n\tlevel?: OcrElementLevel;\n\n\t/** Rotation information if text is rotated */\n\trotation?: OcrRotation;\n\n\t/** Page number where this element was found (1-indexed) */\n\tpageNumber?: number;\n\n\t/** Parent element ID for hierarchical relationships */\n\tparentId?: string;\n\n\t/** Backend-specific metadata that doesn't fit standard fields */\n\tbackendMetadata?: Record<string, unknown>;\n}\n\n/**\n * Configuration for OCR element extraction.\n *\n * Controls how granular OCR elements are extracted and organized.\n */\nexport interface OcrElementConfig {\n\t/** Enable extraction of granular OCR elements. Default: false. */\n\tincludeElements?: boolean;\n\n\t/** Minimum hierarchy level to extract. Default: 'word'. */\n\tminLevel?: OcrElementLevel;\n\n\t/** Minimum confidence threshold (0.0-1.0) for including elements. Default: 0.0. */\n\tminConfidence?: number;\n\n\t/** Build hierarchical relationships between elements. Default: false. */\n\tbuildHierarchy?: boolean;\n}\n\n/**\n * PaddleOCR engine configuration options.\n *\n * Specific configuration for the PaddleOCR backend.\n */\nexport interface PaddleOcrConfig {\n\t/** Language code(s) for OCR (e.g., 'en', 'zh', 'multi'). */\n\tlanguage?: string;\n\n\t/** Directory to cache downloaded OCR models. */\n\tcacheDir?: string;\n\n\t/** Enable angle classification for rotated text detection. Default: false. */\n\tuseAngleCls?: boolean;\n\n\t/** Enable table structure detection. Default: false. 
*/\n\tenableTableDetection?: boolean;\n\n\t/** Database threshold for text detection (0.0-1.0). Default: 0.3. */\n\tdetDbThresh?: number;\n\n\t/** Box threshold for text detection (0.0-1.0). Default: 0.5. */\n\tdetDbBoxThresh?: number;\n\n\t/** Unclip ratio for expanding detected text regions. Default: 1.5. */\n\tdetDbUnclipRatio?: number;\n\n\t/** Maximum side length for detection preprocessing. Default: 960. */\n\tdetLimitSideLen?: number;\n\n\t/** Batch size for text recognition. Default: 30. */\n\trecBatchNum?: number;\n}\n\n/**\n * OCR (Optical Character Recognition) configuration.\n *\n * Controls which OCR engine to use and how it processes images.\n */\nexport interface OcrConfig {\n\t/** OCR backend name (e.g., 'tesseract', 'paddleocr', 'easyocr'). Required. */\n\tbackend: string;\n\n\t/** ISO 639-1/3 language code(s) for OCR (e.g., 'eng', 'fra', 'deu'). Default: 'eng'. */\n\tlanguage?: string;\n\n\t/** Tesseract engine-specific configuration options. Only used when backend is 'tesseract'. */\n\ttesseractConfig?: TesseractConfig;\n\n\t/** PaddleOCR engine-specific configuration options. Only used when backend is 'paddleocr'. */\n\tpaddleOcrConfig?: PaddleOcrConfig;\n\n\t/** OCR element extraction configuration. */\n\telementConfig?: OcrElementConfig;\n}\n\n/**\n * Document chunking configuration for splitting large documents.\n *\n * Breaks large documents into smaller, manageable chunks while preserving context.\n * Useful for RAG (Retrieval Augmented Generation) and vector database indexing.\n */\nexport interface ChunkingConfig {\n\t/** Maximum characters per chunk. Default: 4096. */\n\tmaxChars?: number;\n\n\t/** Maximum overlapping characters between consecutive chunks for context preservation. Default: 512. 
*/\n\tmaxOverlap?: number;\n\n\t/**\n\t * Alternative to maxChars: chunk size using different unit.\n\t * Mutually exclusive with maxChars.\n\t */\n\tchunkSize?: number;\n\n\t/**\n\t * Alternative to maxOverlap: overlap amount using different unit.\n\t * Mutually exclusive with maxOverlap.\n\t */\n\tchunkOverlap?: number;\n\n\t/**\n\t * Named preset configuration (e.g., 'default', 'aggressive', 'minimal').\n\t * Uses preset values if neither maxChars nor chunkSize is specified.\n\t */\n\tpreset?: string;\n\n\t/** Embedding configuration for generating vector embeddings for each chunk. */\n\tembedding?: Record<string, unknown>;\n\n\t/** Enable or disable chunking. Default: true when chunking config is provided. */\n\tenabled?: boolean;\n\n\t/** Sizing type: \"characters\" (default) or \"tokenizer\". */\n\tsizingType?: \"characters\" | \"tokenizer\";\n\n\t/** HuggingFace model ID for tokenizer sizing (e.g., \"Xenova/gpt-4o\"). */\n\tsizingModel?: string;\n\n\t/** Optional cache directory for tokenizer files. */\n\tsizingCacheDir?: string;\n}\n\n/**\n * Language detection configuration.\n *\n * Automatically detects the language(s) of extracted content.\n */\nexport interface LanguageDetectionConfig {\n\t/** Enable automatic language detection. Default: true. */\n\tenabled?: boolean;\n\n\t/** Minimum confidence score (0.0-1.0) for language detection. Default: 0.5. */\n\tminConfidence?: number;\n\n\t/** Detect multiple languages in the same document. Default: false. */\n\tdetectMultiple?: boolean;\n}\n\n/**\n * Token reduction configuration for optimizing token usage.\n *\n * Reduces the number of tokens in extracted content while preserving meaning.\n * Useful for reducing costs in LLM pipelines.\n */\nexport interface TokenReductionConfig {\n\t/** Reduction mode: 'aggressive' or 'conservative'. Default: 'conservative'. */\n\tmode?: string;\n\n\t/** Preserve tokens for semantically important words even in aggressive mode. Default: true. 
*/\n\tpreserveImportantWords?: boolean;\n}\n\n/**\n * Hierarchy extraction configuration.\n *\n * Controls document hierarchy detection based on font size clustering.\n */\nexport interface HierarchyConfig {\n\t/** Enable hierarchy extraction. Default: true. */\n\tenabled?: boolean;\n\n\t/** Number of font size clusters (2-10). Default: 6. */\n\tkClusters?: number;\n\n\t/** Include bounding box information. Default: true. */\n\tincludeBbox?: boolean;\n\n\t/** OCR coverage threshold (0.0-1.0). Default: null. */\n\tocrCoverageThreshold?: number | null;\n}\n\n/**\n * PDF-specific extraction configuration.\n *\n * Controls how PDF documents are processed.\n */\nexport interface PdfConfig {\n\t/** Extract images from PDF pages. Default: true. */\n\textractImages?: boolean;\n\n\t/** List of passwords to try for password-protected PDFs. */\n\tpasswords?: string[];\n\n\t/** Extract document metadata (title, author, creation date, etc.). Default: true. */\n\textractMetadata?: boolean;\n\n\t/** Hierarchy extraction configuration. */\n\thierarchy?: HierarchyConfig;\n\n\t/** Extract annotations from PDF pages. Default: false. */\n\textractAnnotations?: boolean;\n\n\t/** Top margin fraction (0.0-0.5) for filtering header content. */\n\ttopMarginFraction?: number;\n\n\t/** Bottom margin fraction (0.0-0.5) for filtering footer content. */\n\tbottomMarginFraction?: number;\n}\n\n/**\n * Image extraction and processing configuration.\n *\n * Controls how images are extracted and optimized from documents.\n */\nexport interface ImageExtractionConfig {\n\t/** Enable image extraction from documents. Default: true. */\n\textractImages?: boolean;\n\n\t/** Target DPI (dots per inch) for extracted images. Higher DPI = better quality but larger files. Default: 150. */\n\ttargetDpi?: number;\n\n\t/** Maximum image dimension (width or height) in pixels. Images larger than this are downscaled. Default: 2000. 
*/\n\tmaxImageDimension?: number;\n\n\t/** Automatically adjust DPI based on image content and quality. Default: true. */\n\tautoAdjustDpi?: boolean;\n\n\t/** Minimum DPI to maintain for image quality. Default: 72. */\n\tminDpi?: number;\n\n\t/** Maximum DPI to avoid excessive file sizes. Default: 300. */\n\tmaxDpi?: number;\n}\n\n/**\n * Post-processor configuration for modifying extracted content.\n *\n * Post-processors allow customization and cleanup of extraction results\n * without failing the extraction if they encounter errors.\n */\nexport interface PostProcessorConfig {\n\t/** Enable or disable post-processing entirely. Default: true. */\n\tenabled?: boolean;\n\n\t/** List of processor names to enable (allowlist). When set, only these are used. */\n\tenabledProcessors?: string[];\n\n\t/** List of processor names to disable (denylist). These are skipped. */\n\tdisabledProcessors?: string[];\n}\n\n/**\n * HTML preprocessing options.\n *\n * Cleans HTML content before conversion to Markdown.\n */\nexport interface HtmlPreprocessingOptions {\n\t/** Enable HTML preprocessing. Default: true. */\n\tenabled?: boolean;\n\n\t/** Preset cleanup level: 'minimal' (light), 'standard' (balanced), 'aggressive' (heavy). Default: 'standard'. */\n\tpreset?: \"minimal\" | \"standard\" | \"aggressive\";\n\n\t/** Remove navigation menus and headers. Default: true. */\n\tremoveNavigation?: boolean;\n\n\t/** Remove form elements. Default: true. */\n\tremoveForms?: boolean;\n}\n\n/**\n * HTML to Markdown conversion configuration options.\n *\n * Controls how HTML content is converted to Markdown format, including formatting,\n * escaping, and special handling for various HTML elements.\n */\nexport interface HtmlConversionOptions {\n\t/** Heading style conversion: \"atx\" (# style), \"underlined\" (underline style), or \"atx_closed\" (# style closed). Default: \"atx\". 
*/\n\theadingStyle?: \"atx\" | \"underlined\" | \"atx_closed\";\n\n\t/** List indentation type: \"spaces\" or \"tabs\". Default: \"spaces\". */\n\tlistIndentType?: \"spaces\" | \"tabs\";\n\n\t/** Number of spaces/tabs per list indent level. Default: 4. */\n\tlistIndentWidth?: number;\n\n\t/** Bullet characters for unordered lists (e.g., '*', '-', '+'). Default: '*'. */\n\tbullets?: string;\n\n\t/** Markdown symbol for strong/bold emphasis: '**' or '__'. Default: '**'. */\n\tstrongEmSymbol?: string;\n\n\t/** Escape asterisks (*) in text to prevent accidental formatting. Default: false. */\n\tescapeAsterisks?: boolean;\n\n\t/** Escape underscores (_) in text to prevent accidental formatting. Default: false. */\n\tescapeUnderscores?: boolean;\n\n\t/** Escape miscellaneous special characters. Default: false. */\n\tescapeMisc?: boolean;\n\n\t/** Escape ASCII control characters. Default: false. */\n\tescapeAscii?: boolean;\n\n\t/** Default code language for syntax highlighting in code blocks (e.g., 'javascript'). Default: null. */\n\tcodeLanguage?: string;\n\n\t/** Convert HTML links to Markdown autolinks format ([text](url)). Default: true. */\n\tautolinks?: boolean;\n\n\t/** Use the HTML title element as default for links when no text is available. Default: false. */\n\tdefaultTitle?: boolean;\n\n\t/** Insert <br> tags in Markdown tables. Default: false. */\n\tbrInTables?: boolean;\n\n\t/** Use HOCR spatial table format for better table structure preservation. Default: false. */\n\thocrSpatialTables?: boolean;\n\n\t/** Highlight style for marked/highlighted text: \"double_equal\" (==text==), \"html\" (<mark>), \"bold\" (**text**), or \"none\". Default: \"none\". */\n\thighlightStyle?: \"double_equal\" | \"html\" | \"bold\" | \"none\";\n\n\t/** Extract metadata from HTML (title, meta tags, etc.). Default: false. */\n\textractMetadata?: boolean;\n\n\t/** Whitespace handling: \"normalized\" (collapse whitespace) or \"strict\" (preserve all whitespace). 
Default: \"normalized\". */\n\twhitespaceMode?: \"normalized\" | \"strict\";\n\n\t/** Remove newlines from output (convert to single line). Default: false. */\n\tstripNewlines?: boolean;\n\n\t/** Enable line wrapping at specified width. Default: true. */\n\twrap?: boolean;\n\n\t/** Maximum line width when wrapping is enabled. Default: 80. */\n\twrapWidth?: number;\n\n\t/** Convert as inline Markdown instead of block elements. Default: false. */\n\tconvertAsInline?: boolean;\n\n\t/** Markdown symbol for subscript text (e.g., '~' for ~text~). Default: '~'. */\n\tsubSymbol?: string;\n\n\t/** Markdown symbol for superscript text (e.g., '^' for ^text^). Default: '^'. */\n\tsupSymbol?: string;\n\n\t/** Newline style in output: \"spaces\" (two spaces + newline) or \"backslash\" (backslash + newline). Default: \"spaces\". */\n\tnewlineStyle?: \"spaces\" | \"backslash\";\n\n\t/** Code block style: \"indented\" (4-space indent), \"backticks\" (```), or \"tildes\" (~~~). Default: \"backticks\". */\n\tcodeBlockStyle?: \"indented\" | \"backticks\" | \"tildes\";\n\n\t/** List of HTML tag names to keep as inline images (don't convert). Default: []. */\n\tkeepInlineImagesIn?: string[];\n\n\t/** Character encoding for output (e.g., 'utf-8', 'ascii'). Default: 'utf-8'. */\n\tencoding?: string;\n\n\t/** Enable debug mode for detailed conversion logging. Default: false. */\n\tdebug?: boolean;\n\n\t/** List of HTML tag names to remove entirely from output. Default: []. */\n\tstripTags?: string[];\n\n\t/** List of HTML tag names to preserve in output (don't convert to Markdown). Default: []. */\n\tpreserveTags?: string[];\n\n\t/** HTML preprocessing options for cleaning HTML before conversion. */\n\tpreprocessing?: HtmlPreprocessingOptions;\n}\n\n/** Keyword extraction algorithm type. 
*/\nexport type KeywordAlgorithm = \"yake\" | \"rake\";\n\n/**\n * YAKE (Yet Another Keyword Extractor) algorithm configuration.\n *\n * YAKE is an unsupervised keyword extraction method that doesn't require training data.\n */\nexport interface YakeParams {\n\t/** Window size for co-occurrence analysis (number of words to consider). Default: 3. */\n\twindowSize?: number;\n}\n\n/**\n * RAKE (Rapid Automatic Keyword Extraction) algorithm configuration.\n *\n * RAKE extracts keywords based on word co-occurrence and statistical measures.\n */\nexport interface RakeParams {\n\t/** Minimum word length to consider as keyword. Default: 3. */\n\tminWordLength?: number;\n\n\t/** Maximum number of words per keyword phrase. Default: 3. */\n\tmaxWordsPerPhrase?: number;\n}\n\n/**\n * Keyword extraction configuration.\n *\n * Extracts important keywords/phrases from document content using YAKE or RAKE algorithms.\n */\nexport interface KeywordConfig {\n\t/** Extraction algorithm: \"yake\" or \"rake\". Default: \"yake\". */\n\talgorithm?: KeywordAlgorithm;\n\n\t/** Maximum number of keywords to extract. Default: 10. */\n\tmaxKeywords?: number;\n\n\t/** Minimum relevance score (0.0-1.0) for keywords. Keywords below this are filtered out. Default: 0.1. */\n\tminScore?: number;\n\n\t/** N-gram range: [min_length, max_length] for phrase keywords (e.g., [1, 3] for 1-3 word phrases). Default: [1, 3]. */\n\tngramRange?: [number, number];\n\n\t/** Language for keyword extraction (e.g., 'en', 'de', 'fr'). Default: 'en'. */\n\tlanguage?: string;\n\n\t/** YAKE algorithm-specific parameters. Only used when algorithm is \"yake\". */\n\tyakeParams?: YakeParams;\n\n\t/** RAKE algorithm-specific parameters. Only used when algorithm is \"rake\". 
*/\n\trakeParams?: RakeParams;\n}\n\n/**\n * Extracted keyword with relevance metadata.\n *\n * Represents a single keyword extracted from text along with its relevance score,\n * the algorithm that extracted it, and optional position information.\n */\nexport interface ExtractedKeyword {\n\t/** The keyword text */\n\ttext: string;\n\n\t/** Relevance score (higher is better, algorithm-specific range) */\n\tscore: number;\n\n\t/** Algorithm that extracted this keyword */\n\talgorithm: KeywordAlgorithm;\n\n\t/** Optional positions where keyword appears in text (character offsets) */\n\tpositions?: number[];\n}\n\n/**\n * Warning from a post-processor during extraction.\n */\nexport interface ProcessingWarning {\n\t/** Name of the post-processor that produced the warning */\n\tsource: string;\n\n\t/** Warning message */\n\tmessage: string;\n}\n\n/**\n * Page tracking and extraction configuration.\n *\n * Controls how pages/slides/sheets are extracted and tracked in the document.\n * Page range information in chunk metadata (first_page/last_page) is automatically\n * enabled when page boundaries are available and chunking is configured.\n */\nexport interface PageExtractionConfig {\n\t/** Extract pages as separate array (ExtractionResult.pages) */\n\textractPages?: boolean;\n\t/** Insert page markers in main content string */\n\tinsertPageMarkers?: boolean;\n\t/** Page marker format (use {page_num} placeholder) */\n\tmarkerFormat?: string;\n}\n\n/**\n * Layout detection configuration for PDF extraction.\n *\n * Controls layout detection using ONNX-based document layout models (YOLO or RT-DETR)\n * to detect document structure elements like tables, figures, headers, and code blocks.\n * Requires the `layout-detection` feature to be compiled.\n */\nexport interface LayoutDetectionConfig {\n\t/** Model preset: \"fast\" (YOLO, 11 classes) or \"accurate\" (RT-DETR, 17 classes). Default: \"fast\". 
*/\n\tpreset?: string;\n\n\t/** Override the model's default confidence threshold for detections. Default: null (use model default). */\n\tconfidenceThreshold?: number;\n\n\t/** Apply postprocessing heuristics to improve detection quality. Default: true. */\n\tapplyHeuristics?: boolean;\n}\n\n/**\n * Main extraction configuration interface.\n *\n * Combines all sub-configurations for document extraction, OCR, chunking, post-processing, etc.\n * All fields are optional and use sensible defaults.\n */\nexport interface ExtractionConfig {\n\t/** Enable caching of extraction results for identical inputs. Default: true. */\n\tuseCache?: boolean;\n\n\t/** Enable quality processing filters to improve extraction reliability. Default: false. */\n\tenableQualityProcessing?: boolean;\n\n\t/** OCR configuration for text extraction from images. Only used when document contains images or forceOcr is true. */\n\tocr?: OcrConfig;\n\n\t/** Force OCR processing even for documents with selectable text. Useful for scanned documents. Default: false. */\n\tforceOcr?: boolean;\n\n\t/** Include structured document tree in the extraction result. Default: false. */\n\tincludeDocumentStructure?: boolean;\n\n\t/** Chunking configuration for splitting documents into smaller pieces for RAG or vector DB. */\n\tchunking?: ChunkingConfig;\n\n\t/** Image extraction and optimization configuration. */\n\timages?: ImageExtractionConfig;\n\n\t/** PDF-specific extraction options (passwords, metadata, etc.). */\n\tpdfOptions?: PdfConfig;\n\n\t/** Token reduction configuration for optimizing token usage in LLM pipelines. */\n\ttokenReduction?: TokenReductionConfig;\n\n\t/** Language detection configuration for automatic language identification. */\n\tlanguageDetection?: LanguageDetectionConfig;\n\n\t/** Post-processor configuration for customizing extraction results. */\n\tpostprocessor?: PostProcessorConfig;\n\n\t/** HTML to Markdown conversion options for HTML content. 
*/\n\thtmlOptions?: HtmlConversionOptions;\n\n\t/** Keyword extraction configuration for extracting important phrases. */\n\tkeywords?: KeywordConfig;\n\n\t/** Page tracking and extraction configuration for multi-page documents. */\n\tpages?: PageExtractionConfig;\n\n\t/** Maximum number of concurrent extractions in batch operations. Default: 4. */\n\tmaxConcurrentExtractions?: number;\n\n\t/**\n\t * Output text format for extracted content. Default: \"plain\".\n\t *\n\t * - \"plain\": Raw extracted text\n\t * - \"markdown\": Markdown formatted output\n\t * - \"djot\": Djot markup format\n\t * - \"html\": HTML formatted output\n\t */\n\toutputFormat?: \"plain\" | \"markdown\" | \"djot\" | \"html\";\n\n\t/**\n\t * Result structure format. Default: \"unified\".\n\t *\n\t * - \"unified\": All content in the `content` field (default)\n\t * - \"element_based\": Semantic element extraction (Unstructured-compatible)\n\t */\n\tresultFormat?: \"unified\" | \"element_based\";\n\n\t/** Layout detection configuration for detecting document structure in PDFs. */\n\tlayout?: LayoutDetectionConfig;\n}\n\n/**\n * Extracted table data from document.\n *\n * Contains both cell data and Markdown representation for easy display and processing.\n */\nexport interface Table {\n\t/** 2D array of cell contents (rows × columns) */\n\tcells: string[][];\n\n\t/** Markdown representation of the table for display or parsing */\n\tmarkdown: string;\n\n\t/** Page number where this table was found (1-indexed) */\n\tpageNumber: number;\n\n\t/** Bounding box of the table on the page (PDF coordinates). 
 */
	boundingBox?: BoundingBox | null;
}

/** Format-specific metadata extracted from spreadsheet documents. */
export interface ExcelMetadata {
	/** Number of sheets in the workbook */
	sheetCount?: number;
	/** Names of the sheets */
	sheetNames?: string[];
}

/** Format-specific metadata extracted from email documents. */
export interface EmailMetadata {
	/** Sender email address */
	fromEmail?: string | null;
	/** Sender display name */
	fromName?: string | null;
	/** Recipient addresses (To) */
	toEmails?: string[];
	/** Carbon-copy addresses (Cc) */
	ccEmails?: string[];
	/** Blind-carbon-copy addresses (Bcc) */
	bccEmails?: string[];
	/** Message-ID header value */
	messageId?: string | null;
	/** Attachment names */
	attachments?: string[];
}

/** Format-specific metadata extracted from archive files. */
export interface ArchiveMetadata {
	/** Archive format identifier */
	format?: string;
	/** Number of files contained in the archive */
	fileCount?: number;
	/** Paths/names of the contained files */
	fileList?: string[];
	/** Total size of the archive contents (presumably bytes — confirm against extractor) */
	totalSize?: number;
	/** Compressed size, null when not available */
	compressedSize?: number | null;
}

/** Format-specific metadata extracted from image files. */
export interface ImageMetadata {
	/** Image width in pixels */
	width?: number;
	/** Image height in pixels */
	height?: number;
	/** Image format identifier */
	format?: string;
	/** EXIF tags as key/value strings */
	exif?: Record<string, string>;
}

/** Format-specific metadata extracted from XML documents. */
export interface XmlMetadata {
	/** Total number of elements in the document */
	elementCount?: number;
	/** Distinct element names encountered */
	uniqueElements?: string[];
}

/** Format-specific metadata extracted from plain-text documents. */
export interface TextMetadata {
	/** Number of lines */
	lineCount?: number;
	/** Number of words */
	wordCount?: number;
	/** Number of characters */
	characterCount?: number;
	/** Detected headers/headings, null when not applicable */
	headers?: string[] | null;
	/** Link pairs — NOTE(review): pair order (text vs. URL) is not evident from this declaration; confirm against the extractor */
	links?: [string, string][] | null;
	/** Code-block pairs — NOTE(review): pair contents (language vs. code) are not evident here; confirm against the extractor */
	codeBlocks?: [string, string][] | null;
}

/** A heading found in an HTML document. */
export interface HeaderMetadata {
	/** Heading level (1 = h1, 2 = h2, ...) */
	level: number;
	/** Text content of the heading */
	text: string;
	/** Value of the heading's id attribute, if any */
	id?: string | null;
	/** Nesting depth — NOTE(review): distinction from `level` is not evident here; confirm */
	depth: number;
	/** Offset of the heading in the HTML source (units not evident here — confirm) */
	htmlOffset: number;
}

/** A hyperlink found in an HTML document. */
export interface LinkMetadata {
	/** Link target (href attribute) */
	href: string;
	/** Link text content */
	text: string;
	/** Value of the title attribute, if any */
	title?: string | null;
	/** Classification of the link target */
	linkType: "anchor" | "internal" | "external" | "email" | "phone" | "other";
	/** Values of the rel attribute */
	rel: string[];
	/** HTML attributes of the anchor element as key/value strings */
	attributes: Record<string, string>;
}

/** An image reference found in an HTML document. */
export interface HtmlImageMetadata {
	/** Image source (src attribute) */
	src: string;
	/** Alternative text (alt attribute), if any */
	alt?: string | null;
	/** Value of the title attribute, if any */
	title?: string | null;
	/** [width, height] when known */
	dimensions?: [number, number] | null;
	/** How the image is referenced */
	imageType: "data_uri" | "inline_svg" | "external" | "relative";
	/** HTML attributes of the img element as key/value strings */
	attributes: Record<string, string>;
}

/** Structured data (JSON-LD, microdata, or RDFa) found in an HTML document. */
export interface StructuredData {
	/** Structured-data encoding */
	dataType: "json_ld" | "microdata" | "rdfa";
	/** Raw payload serialized as a JSON string */
	rawJson: string;
	/** Schema type, when identified */
	schemaType?: string | null;
}

/** Format-specific metadata extracted from HTML documents. */
export interface HtmlMetadata {
	/** Document title */
	title?: string | null;
	/** Meta description */
	description?: string | null;
	/** Meta keywords */
	keywords: string[];
	/** Document author */
	author?: string | null;
	/** Canonical URL */
	canonicalUrl?: string | null;
	/** Base href (base element), if any */
	baseHref?: string | null;
	/** Declared document language */
	language?: string | 
null;
	/** Text direction declared by the document */
	textDirection?: "ltr" | "rtl" | "auto" | null;
	/** Open Graph properties as key/value strings */
	openGraph: Record<string, string>;
	/** Twitter card properties as key/value strings */
	twitterCard: Record<string, string>;
	/** Meta tags as key/value strings */
	metaTags: Record<string, string>;
	/** Headings found in the document */
	htmlHeaders: HeaderMetadata[];
	/** Hyperlinks found in the document */
	htmlLinks: LinkMetadata[];
	/** Image references found in the document */
	htmlImages: HtmlImageMetadata[];
	/** Structured data blocks found in the document */
	structuredData: StructuredData[];
}

/** Format-specific metadata extracted from PDF documents (document information fields). */
export interface PdfMetadata {
	/** Document title */
	title?: string | null;
	/** Document author */
	author?: string | null;
	/** Document subject */
	subject?: string | null;
	/** Keywords as a single string, as stored in the PDF document information dictionary */
	keywords?: string | null;
	/** Creating application */
	creator?: string | null;
	/** Producing application */
	producer?: string | null;
	/** Creation date (string; exact format not evident here — confirm) */
	creationDate?: string | null;
	/** Last modification date (string; exact format not evident here — confirm) */
	modificationDate?: string | null;
	/** Number of pages */
	pageCount?: number;
}

/** Format-specific metadata extracted from PowerPoint (PPTX) presentations. */
export interface PptxMetadata {
	/** Presentation title */
	title?: string | null;
	/** Presentation author */
	author?: string | null;
	/** Presentation description */
	description?: string | null;
	/** Presentation summary */
	summary?: string | null;
	/** Fonts referenced by the presentation */
	fonts?: string[];
}

/** Metadata produced by OCR processing. */
export interface OcrMetadata {
	/** OCR language code used */
	language?: string;
	/** Page segmentation mode (presumably Tesseract-style PSM — confirm against backend) */
	psm?: number;
	/** OCR output format */
	outputFormat?: string;
	/** Number of tables detected */
	tableCount?: number;
	/** Table row count, null when not applicable */
	tableRows?: number | null;
	/** Table column count, null when not applicable */
	tableCols?: number | null;
}

/** Details about image preprocessing performed before OCR. */
export interface ImagePreprocessingMetadata {
	/** Original [width, height] of the image */
	originalDimensions?: [number, number];
	/** Original DPI as a pair (presumably [horizontal, vertical] — confirm) */
	originalDpi?: [number, number];
	/** DPI targeted by preprocessing */
	targetDpi?: number;
	/** Scale factor applied when resizing */
	scaleFactor?: number;
	/** Whether DPI was automatically adjusted */
	autoAdjusted?: boolean;
	/** DPI after preprocessing */
	finalDpi?: number;
	/** [width, height] after resizing, if any */
	newDimensions?: [number, number] | null;
	/** Resampling method used for resizing */
	resampleMethod?: string;
	/** Whether dimensions were clamped to a maximum */
	dimensionClamped?: boolean;
	/** Calculated DPI, null when not computed */
	calculatedDpi?: number | null;
	/** Whether resizing was skipped */
	skippedResize?: boolean;
	/** Error message if resizing failed, null otherwise */
	resizeError?: string | null;
}

/** Error details recorded in result metadata. */
export interface ErrorMetadata {
	/** Machine-readable error type name */
	errorType?: string;
	/** Human-readable error message */
	message?: string;
}

/**
 * Page boundary information for chunk metadata.
 *
 * Tracks where a specific page's content starts and ends in the main content string,
 * enabling mapping from byte positions to page numbers. 
All offsets are guaranteed to be\n * at valid UTF-8 character boundaries.\n */\nexport interface PageBoundary {\n\t/** Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive) */\n\tbyteStart: number;\n\t/** Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive) */\n\tbyteEnd: number;\n\t/** Page number (1-indexed) */\n\tpageNumber: number;\n}\n\n/**\n * Type of paginated unit in a document.\n *\n * Distinguishes between different types of \"pages\":\n * - \"page\": Standard document pages (PDF, DOCX, images)\n * - \"slide\": Presentation slides (PPTX, ODP)\n * - \"sheet\": Spreadsheet sheets (XLSX, ODS)\n */\nexport type PageUnitType = \"page\" | \"slide\" | \"sheet\";\n\n/**\n * Detailed per-page metadata.\n *\n * Captures information about a single page/slide/sheet including dimensions,\n * content counts, and visibility state.\n */\nexport interface PageInfo {\n\t/** Page number (1-indexed) */\n\tnumber: number;\n\t/** Page title (usually for presentations) */\n\ttitle?: string | null;\n\t/** Dimensions in points (PDF) or pixels (images): [width, height] */\n\tdimensions?: [number, number] | null;\n\t/** Number of images on this page */\n\timageCount?: number | null;\n\t/** Number of tables on this page */\n\ttableCount?: number | null;\n\t/** Whether this page is hidden (e.g., in presentations) */\n\thidden?: boolean | null;\n\t/** Whether this page is blank (contains no meaningful content) */\n\tisBlank?: boolean | null;\n}\n\n/**\n * Page structure metadata.\n *\n * Contains information about pages/slides/sheets in a document, including\n * boundaries for mapping chunks to pages and detailed per-page metadata.\n */\nexport interface PageStructure {\n\t/** Total number of pages/slides/sheets */\n\ttotalCount: number;\n\t/** Type of paginated unit (page, slide, or sheet) */\n\tunitType: PageUnitType;\n\t/** Byte offset boundaries for each page */\n\tboundaries?: PageBoundary[] | null;\n\t/** 
Detailed per-page metadata (optional, only when needed) */\n\tpages?: PageInfo[] | null;\n}\n\n/**\n * Metadata about a chunk's position and properties in the document.\n *\n * Tracks where a chunk appears in the original document, including byte offsets\n * and page ranges when page tracking is enabled.\n */\n/** Heading depth and text for markdown heading context. */\nexport interface HeadingLevel {\n\t/** Heading depth (1 = h1, 2 = h2, etc.) */\n\tlevel: number;\n\t/** Text content of the heading */\n\ttext: string;\n}\n\n/** Heading hierarchy context for a markdown chunk. */\nexport interface HeadingContext {\n\t/** Heading hierarchy from document root to this chunk's section */\n\theadings: HeadingLevel[];\n}\n\nexport interface ChunkMetadata {\n\t/** Byte offset where this chunk starts in the original text (UTF-8 valid boundary) */\n\tbyteStart: number;\n\t/** Byte offset where this chunk ends in the original text (UTF-8 valid boundary) */\n\tbyteEnd: number;\n\t/** Number of tokens in this chunk (if available from embedding model) */\n\ttokenCount?: number | null;\n\t/** Zero-based index of this chunk in the document */\n\tchunkIndex: number;\n\t/** Total number of chunks in the document */\n\ttotalChunks: number;\n\t/** First page number this chunk spans (1-indexed, only when page tracking enabled) */\n\tfirstPage?: number | null;\n\t/** Last page number this chunk spans (1-indexed, only when page tracking enabled) */\n\tlastPage?: number | null;\n\t/** Heading context when using markdown chunker */\n\theadingContext?: HeadingContext | null;\n}\n\n/**\n * Text chunk with optional embedding.\n *\n * Represents a segment of a document created by the chunking algorithm, useful for RAG and vector databases.\n */\nexport interface Chunk {\n\t/** Text content of this chunk */\n\tcontent: string;\n\n\t/** Vector embedding for this chunk (if embedding model was used) */\n\tembedding?: number[] | null;\n\n\t/** Metadata about chunk position and properties in the 
document */\n\tmetadata: ChunkMetadata;\n}\n\n/**\n * Extracted image from document with optional OCR result.\n *\n * Contains image data and metadata about position, dimensions, and properties.\n */\nexport interface ExtractedImage {\n\t/** Raw image bytes as Uint8Array */\n\tdata: Uint8Array;\n\n\t/** Image format (e.g., 'png', 'jpeg', 'tiff') */\n\tformat: string;\n\n\t/** Sequential index of this image in the document (0-indexed) */\n\timageIndex: number;\n\n\t/** Page number where this image was found (1-indexed), null if unknown */\n\tpageNumber?: number | null;\n\n\t/** Image width in pixels, null if unknown */\n\twidth?: number | null;\n\n\t/** Image height in pixels, null if unknown */\n\theight?: number | null;\n\n\t/** Color space (e.g., 'RGB', 'CMYK', 'Grayscale'), null if unknown */\n\tcolorspace?: string | null;\n\n\t/** Bits per color component (e.g., 8 for 8-bit), null if unknown */\n\tbitsPerComponent?: number | null;\n\n\t/** Whether this is a mask image (used internally by PDF) */\n\tisMask: boolean;\n\n\t/** Image description or caption if available */\n\tdescription?: string | null;\n\n\t/** OCR extraction result if OCR was run on this image, null otherwise */\n\tocrResult?: ExtractionResult | null;\n\n\t/** Bounding box of the image on the page (PDF coordinates). */\n\tboundingBox?: BoundingBox | null;\n}\n\n/**\n * Content for a single page/slide/sheet.\n *\n * When page extraction is enabled, documents are split into per-page content\n * with associated tables and images mapped to each page. 
This allows for page-specific processing.\n */\nexport interface PageContent {\n\t/** Page number (1-indexed) starting from 1 */\n\tpageNumber: number;\n\n\t/** Text content extracted from this page */\n\tcontent: string;\n\n\t/** Tables found and extracted from this page */\n\ttables: Table[];\n\n\t/** Images found and extracted from this page */\n\timages: ExtractedImage[];\n\n\t/** Whether this page is blank (contains no meaningful content) */\n\tisBlank?: boolean | null;\n}\n\n/**\n * Extraction result metadata.\n *\n * Uses a flattened discriminated union approach with format_type as the discriminator.\n * When format_type is set (e.g., \"archive\"), the corresponding format-specific fields\n * are available at the root level of the metadata object.\n *\n * This structure matches the Rust serialization with serde's tagged enum flattening.\n */\nexport interface Metadata {\n\tlanguage?: string | null;\n\tcreatedAt?: string | null;\n\tmodifiedAt?: string | null;\n\tsubject?: string | null;\n\n\tformatType?: \"pdf\" | \"excel\" | \"email\" | \"pptx\" | \"archive\" | \"image\" | \"xml\" | \"text\" | \"html\" | \"ocr\";\n\n\ttitle?: string | null;\n\tauthors?: string[] | null;\n\tkeywords?: string[] | null;\n\tcreator?: string | null;\n\tproducer?: string | null;\n\tcreationDate?: string | null;\n\tmodificationDate?: string | null;\n\tpageCount?: number;\n\n\tsheetCount?: number;\n\tsheetNames?: string[];\n\n\tfromEmail?: string | null;\n\tfromName?: string | null;\n\ttoEmails?: string[];\n\tccEmails?: string[];\n\tbccEmails?: string[];\n\tmessageId?: string | null;\n\tattachments?: string[];\n\n\tdescription?: string | null;\n\tsummary?: string | null;\n\tfonts?: string[];\n\n\tformat?: string;\n\tfileCount?: number;\n\tfileList?: string[];\n\ttotalSize?: number;\n\tcompressedSize?: number | null;\n\n\twidth?: number;\n\theight?: number;\n\texif?: Record<string, string>;\n\n\telementCount?: number;\n\tuniqueElements?: string[];\n\n\tline_count?: 
number;\n\tword_count?: number;\n\tcharacter_count?: number;\n\theaders?: string[] | null;\n\tlinks?: [string, string][] | null;\n\tcode_blocks?: [string, string][] | null;\n\n\tcanonical_url?: string | null;\n\tbase_href?: string | null;\n\topen_graph?: Record<string, string>;\n\ttwitter_card?: Record<string, string>;\n\tmeta_tags?: Record<string, string>;\n\thtml_language?: string | null;\n\ttext_direction?: \"ltr\" | \"rtl\" | \"auto\" | null;\n\thtml_headers?: HeaderMetadata[];\n\thtml_links?: LinkMetadata[];\n\thtml_images?: HtmlImageMetadata[];\n\tstructured_data?: StructuredData[];\n\n\tpsm?: number;\n\toutput_format?: string;\n\ttable_count?: number;\n\ttable_rows?: number | null;\n\ttable_cols?: number | null;\n\n\timage_preprocessing?: ImagePreprocessingMetadata | null;\n\n\tjson_schema?: Record<string, unknown> | null;\n\n\tpage_structure?: PageStructure | null;\n\n\terror?: ErrorMetadata | null;\n\n\t/**\n\t * Additional fields may be added at runtime by postprocessors.\n\t * Use bracket notation to safely access unexpected properties.\n\t */\n\t[key: string]: unknown;\n}\n\n/**\n * Semantic element type classification.\n *\n * Categorizes text content into semantic units for downstream processing.\n * Supports the element types commonly found in structured documents.\n */\nexport type ElementType =\n\t| \"title\"\n\t| \"narrative_text\"\n\t| \"heading\"\n\t| \"list_item\"\n\t| \"table\"\n\t| \"image\"\n\t| \"page_break\"\n\t| \"code_block\"\n\t| \"block_quote\"\n\t| \"footer\"\n\t| \"header\";\n\n/**\n * Bounding box coordinates for element positioning.\n *\n * Represents rectangular coordinates in the document space.\n */\nexport interface BoundingBox {\n\t/** Left x-coordinate */\n\tx0: number;\n\t/** Bottom y-coordinate */\n\ty0: number;\n\t/** Right x-coordinate */\n\tx1: number;\n\t/** Top y-coordinate */\n\ty1: number;\n}\n\n/**\n * A PDF annotation extracted from a document page.\n */\nexport interface PdfAnnotation {\n\t/** Type of annotation 
(e.g., \"text\", \"highlight\", \"link\", \"underline\") */\n\tannotationType: string;\n\t/** Text content of the annotation, if available */\n\tcontent?: string | null;\n\t/** Page number (1-indexed) where the annotation appears */\n\tpageNumber: number;\n\t/** Bounding box of the annotation on the page */\n\tboundingBox?: BoundingBox | null;\n}\n\n/**\n * Metadata for a semantic element.\n *\n * Contains structural and positioning information about an extracted element.\n */\nexport interface ElementMetadata {\n\t/** Page number (1-indexed) */\n\tpageNumber?: number | null;\n\t/** Source filename or document name */\n\tfilename?: string | null;\n\t/** Bounding box coordinates if available */\n\tcoordinates?: BoundingBox | null;\n\t/** Position index in the element sequence */\n\telementIndex?: number | null;\n\t/** Additional custom metadata */\n\tadditional?: Record<string, string>;\n}\n\n/**\n * Semantic element extracted from document.\n *\n * Represents a logical unit of content with semantic classification,\n * unique identifier, and metadata for tracking origin and position.\n */\nexport interface Element {\n\t/** Unique element identifier */\n\telementId: string;\n\t/** Semantic type of this element */\n\telementType: ElementType;\n\t/** Text content of the element */\n\ttext: string;\n\t/** Metadata about the element */\n\tmetadata: ElementMetadata;\n}\n\n/**\n * Complete extraction result from document processing.\n *\n * Contains all extracted content, metadata, and optional processed data like chunks and images.\n * This is the primary return value from extraction functions.\n */\nexport interface ExtractionResult {\n\t/** Extracted text content from the document (main content) */\n\tcontent: string;\n\n\t/** MIME type of the input document (e.g., 'application/pdf', 'text/html') */\n\tmimeType: string;\n\n\t/** Document metadata including title, author, creation date, language, and format-specific fields */\n\tmetadata: Metadata;\n\n\t/** Tables 
extracted from the document (2D cell arrays with Markdown representation) */\n\ttables: Table[];\n\n\t/** Detected languages in the document (ISO 639-1 codes, e.g., ['en', 'de']), null if detection disabled */\n\tdetectedLanguages: string[] | null;\n\n\t/** Document chunks for RAG/vector databases (if chunking was enabled), null otherwise */\n\tchunks: Chunk[] | null;\n\n\t/** Images extracted from document with metadata (if image extraction was enabled), null otherwise */\n\timages: ExtractedImage[] | null;\n\n\t/** Semantic elements extracted from document with type classification and metadata (if element extraction was enabled), null otherwise */\n\telements?: Element[] | null;\n\n\t/** Per-page content when page extraction is enabled, null otherwise. Each item contains page number, content, tables, and images. */\n\tpages?: PageContent[] | null;\n\n\t/** Extracted keywords when keyword extraction is enabled, undefined otherwise */\n\textractedKeywords?: ExtractedKeyword[];\n\n\t/** Quality score when quality processing is enabled, undefined otherwise */\n\tqualityScore?: number;\n\n\t/** Processing warnings from post-processors */\n\tprocessingWarnings?: ProcessingWarning[];\n\n\t/** Granular OCR elements (words, lines, blocks) when OCR element extraction is enabled, null otherwise */\n\tocrElements?: OcrElement[] | null;\n\n\t/** Structured document tree when include_document_structure is enabled, null otherwise */\n\tdocument?: Record<string, unknown> | null;\n\n\t/** PDF annotations when extract_annotations is enabled, null otherwise */\n\tannotations?: PdfAnnotation[] | null;\n}\n\n/** Post-processor execution stage in the extraction pipeline. 
*/\nexport type ProcessingStage = \"early\" | \"middle\" | \"late\";\n\n/**\n * Protocol for custom post-processors that modify extraction results.\n *\n * Post-processors enrich or transform extraction results without failing the extraction.\n * If a post-processor throws an error, it's logged but extraction continues.\n * Only works with async extraction functions (`extractFile`, `extractBytes`, etc.).\n */\nexport interface PostProcessorProtocol {\n\t/**\n\t * Return the unique name of this postprocessor.\n\t *\n\t * @returns Unique processor name (case-sensitive, alphanumeric + underscores recommended)\n\t */\n\tname(): string;\n\n\t/**\n\t * Process and enrich an extraction result.\n\t *\n\t * Modify the result to add new metadata, transform content, or perform other enrichment.\n\t * If this throws an error, it's logged but extraction continues.\n\t *\n\t * @param result - ExtractionResult with extracted content, metadata, and tables\n\t * @returns Modified result with enriched data. Can be async or sync.\n\t */\n\tprocess(result: ExtractionResult): ExtractionResult | Promise<ExtractionResult>;\n\n\t/**\n\t * Return the processing stage for this processor.\n\t *\n\t * Determines when this processor runs relative to others:\n\t * - \"early\": Runs first, before other processors (good for cleanup/normalization)\n\t * - \"middle\": Runs with other middle-stage processors (default)\n\t * - \"late\": Runs last, after others (good for final enrichment)\n\t *\n\t * @returns One of \"early\", \"middle\", or \"late\" (default: \"middle\")\n\t */\n\tprocessingStage?(): ProcessingStage;\n\n\t/**\n\t * Initialize the processor (e.g., load ML models, setup resources).\n\t *\n\t * Called once when the processor is first registered. Use for expensive operations.\n\t */\n\tinitialize?(): void | Promise<void>;\n\n\t/**\n\t * Shutdown the processor and release resources.\n\t *\n\t * Called when the processor is unregistered. 
Use for cleanup (closing connections, freeing memory).\n\t */\n\tshutdown?(): void | Promise<void>;\n}\n\n/**\n * Protocol for custom validators that check extraction results.\n *\n * Validators perform quality checks and fail the extraction if validation fails.\n * Unlike post-processors, validator errors cause the entire extraction to fail.\n * Useful for enforcing quality standards on extracted content.\n */\nexport interface ValidatorProtocol {\n\t/**\n\t * Return the unique name of this validator.\n\t *\n\t * @returns Unique validator name (case-sensitive, alphanumeric + underscores recommended)\n\t */\n\tname(): string;\n\n\t/**\n\t * Validate an extraction result.\n\t *\n\t * Throw an error if validation fails. The error message will be used as the extraction error.\n\t * If validation passes, return without throwing (return value is ignored).\n\t *\n\t * @param result - ExtractionResult to validate\n\t * @throws {Error} If validation fails (extraction will fail with this error)\n\t */\n\tvalidate(result: ExtractionResult): void | Promise<void>;\n\n\t/**\n\t * Return the validation priority.\n\t *\n\t * Higher priority validators run first. Useful for running cheap validations (e.g., length checks)\n\t * before expensive ones (e.g., AI-based quality checks) to fail fast.\n\t *\n\t * @returns Priority value (higher = runs earlier, default: 50). 
Range: 0-1000.\n\t */\n\tpriority?(): number;\n\n\t/**\n\t * Check if this validator should run for a given result.\n\t *\n\t * Allows conditional validation based on MIME type, metadata, or content.\n\t * This is evaluated before validation, so expensive checks can be skipped for irrelevant documents.\n\t *\n\t * @param result - ExtractionResult to check\n\t * @returns true if validator should run, false to skip (default: true)\n\t */\n\tshouldValidate?(result: ExtractionResult): boolean;\n\n\t/**\n\t * Initialize the validator (e.g., load ML models, setup resources).\n\t *\n\t * Called once when the validator is first registered. Use for expensive operations.\n\t */\n\tinitialize?(): void | Promise<void>;\n\n\t/**\n\t * Shutdown the validator and release resources.\n\t *\n\t * Called when the validator is unregistered. Use for cleanup (closing connections, freeing memory).\n\t */\n\tshutdown?(): void | Promise<void>;\n}\n\n/**\n * OCR backend protocol for implementing custom OCR engines.\n *\n * This interface defines the contract for OCR backends that can be registered\n * with Kreuzberg's extraction pipeline.\n *\n * ## Implementation Requirements\n *\n * OCR backends must implement:\n * - `name()`: Return a unique backend identifier\n * - `supportedLanguages()`: Return list of supported ISO 639-1/2/3 language codes\n * - `processImage()`: Process image bytes and return extraction result\n *\n * ## Optional Methods\n *\n * - `initialize()`: Called when backend is registered (load models, etc.)\n * - `shutdown()`: Called when backend is unregistered (cleanup resources)\n *\n * @example\n * ```typescript\n * import { registerOcrBackend, extractFile } from '@kreuzberg/node';\n *\n * // PaddleOCR is built into the native Rust core - just use the backend name\n * const result = await extractFile('scanned.pdf', null, {\n * ocr: { backend: 'paddle-ocr', language: 'en' }\n * });\n * ```\n */\nexport interface OcrBackendProtocol {\n\t/**\n\t * Return the unique name of 
this OCR backend.\n\t *\n\t * This name is used in ExtractionConfig to select the backend:\n\t * ```typescript\n\t * { ocr: { backend: 'paddle-ocr', language: 'en' } }\n\t * ```\n\t *\n\t * @returns Unique backend identifier (e.g., \"paddle-ocr\", \"tesseract\")\n\t */\n\tname(): string;\n\n\t/**\n\t * Return list of supported language codes.\n\t *\n\t * Language codes should follow ISO 639-1 (2-letter) or ISO 639-2 (3-letter) standards.\n\t * Common codes: \"en\", \"eng\" (English), \"de\", \"deu\" (German), \"fr\", \"fra\" (French).\n\t *\n\t * @returns Array of supported language codes\n\t *\n\t * @example\n\t * ```typescript\n\t * supportedLanguages(): string[] {\n\t * return [\"en\", \"eng\", \"de\", \"deu\", \"fr\", \"fra\"];\n\t * }\n\t * ```\n\t */\n\tsupportedLanguages(): string[];\n\n\t/**\n\t * Process image bytes and extract text via OCR.\n\t *\n\t * This method receives raw image data and must return a result object with:\n\t * - `content`: Extracted text content\n\t * - `mime_type`: MIME type (usually \"text/plain\")\n\t * - `metadata`: Additional information (confidence, dimensions, etc.)\n\t * - `tables`: Optional array of detected tables\n\t *\n\t * @param imageBytes - Raw image data (Uint8Array) or Base64-encoded string (when called from Rust bindings)\n\t * @param language - Language code from supportedLanguages()\n\t * @returns Promise resolving to extraction result\n\t *\n\t * @example\n\t * ```typescript\n\t * async processImage(imageBytes: Uint8Array | string, language: string): Promise<{\n\t * content: string;\n\t * mime_type: string;\n\t * metadata: Record<string, unknown>;\n\t * tables: unknown[];\n\t * }> {\n\t * const buffer = typeof imageBytes === \"string\" ? 
Buffer.from(imageBytes, \"base64\") : Buffer.from(imageBytes);\n\t * const text = await myOcrEngine.recognize(buffer, language);\n\t * return {\n\t * content: text,\n\t * mime_type: \"text/plain\",\n\t * metadata: { confidence: 0.95, language },\n\t * tables: []\n\t * };\n\t * }\n\t * ```\n\t */\n\tprocessImage(\n\t\timageBytes: Uint8Array | string,\n\t\tlanguage: string,\n\t): Promise<{\n\t\tcontent: string;\n\t\tmime_type: string;\n\t\tmetadata: Record<string, unknown>;\n\t\ttables: unknown[];\n\t}>;\n\n\t/**\n\t * Initialize the OCR backend (optional).\n\t *\n\t * Called once when the backend is registered. Use this to:\n\t * - Load ML models\n\t * - Initialize libraries\n\t * - Validate dependencies\n\t *\n\t * @example\n\t * ```typescript\n\t * async initialize(): Promise<void> {\n\t * this.model = await loadModel('./path/to/model');\n\t * }\n\t * ```\n\t */\n\tinitialize?(): void | Promise<void>;\n\n\t/**\n\t * Shutdown the OCR backend and release resources (optional).\n\t *\n\t * Called when the backend is unregistered. 
Use this to:\n\t * - Free model memory\n\t * - Close file handles\n\t * - Cleanup temporary files\n\t *\n\t * @example\n\t * ```typescript\n\t * async shutdown(): Promise<void> {\n\t * await this.model.dispose();\n\t * this.model = null;\n\t * }\n\t * ```\n\t */\n\tshutdown?(): void | Promise<void>;\n}\n\n/**\n * Result of error message classification into error codes.\n *\n * Provides classification details including the error code, name,\n * description, and confidence score for the classification.\n *\n * @example\n * ```typescript\n * import { classifyError, ErrorCode } from '@kreuzberg/node';\n *\n * const result = classifyError(\"File not found in read operation\");\n * if (result.code === ErrorCode.IoError) {\n * console.error(`I/O Error: ${result.description}`);\n * console.log(`Confidence: ${result.confidence}`);\n * }\n * ```\n */\nexport interface ErrorClassification {\n\t/**\n\t * The numeric error code (0-7) representing the error type.\n\t */\n\tcode: number;\n\n\t/**\n\t * The human-readable name of the error code (e.g., \"validation\", \"ocr\").\n\t */\n\tname: string;\n\n\t/**\n\t * A brief description of the error type.\n\t */\n\tdescription: string;\n\n\t/**\n\t * Confidence score (0.0-1.0) indicating how certain the classification is.\n\t * Higher values indicate higher confidence in the classification.\n\t */\n\tconfidence: number;\n}\n\n// ============================================================================\n// Worker Pool APIs\n// ============================================================================\n\n/**\n * Opaque handle to a worker pool for concurrent extraction operations.\n *\n * Worker pools enable parallel processing of CPU-bound document extraction\n * tasks by distributing work across multiple threads. 
This is especially\n * useful for batch processing large numbers of documents.\n *\n * @example\n * ```typescript\n * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';\n *\n * const pool = createWorkerPool(4); // 4 concurrent workers\n * try {\n * const result = await extractFileInWorker(pool, 'document.pdf');\n * console.log(result.content);\n * } finally {\n * await closeWorkerPool(pool);\n * }\n * ```\n */\nexport interface WorkerPool {\n\t/** Internal pool identifier (opaque) */\n\treadonly poolId: number;\n}\n\n/**\n * Worker pool statistics.\n *\n * Provides information about the current state of a worker pool including\n * pool size, number of active workers, and queued tasks.\n *\n * @example\n * ```typescript\n * import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node';\n *\n * const pool = createWorkerPool(4);\n * const stats = getWorkerPoolStats(pool);\n * console.log(`Active: ${stats.activeWorkers}/${stats.size}`);\n * console.log(`Queued: ${stats.queuedTasks}`);\n * ```\n */\nexport interface WorkerPoolStats {\n\t/**\n\t * Maximum number of concurrent workers in the pool.\n\t */\n\tsize: number;\n\n\t/**\n\t * Number of currently active (executing) workers.\n\t */\n\tactiveWorkers: number;\n\n\t/**\n\t * Number of tasks waiting in the queue.\n\t */\n\tqueuedTasks: number;\n}\n"],"mappings":";;;;;;;;;;;;;;;;AAAA;AAAA;","names":[]}
package/index.d.ts CHANGED
@@ -16,6 +16,7 @@ export declare class JsWorkerPool {
16
16
  * * `data_list` - Array of buffers to extract
17
17
  * * `mime_types` - Array of MIME types (must match data_list length)
18
18
  * * `config` - Optional extraction configuration
19
+ * * `file_configs` - Optional per-item extraction configs (must match data_list length if provided)
19
20
  *
20
21
  * # Returns
21
22
  *
@@ -40,7 +41,7 @@ export declare class JsWorkerPool {
40
41
  * );
41
42
  * ```
42
43
  */
43
- export declare function batchExtractBytes(dataList: Array<Buffer>, mimeTypes: Array<string>, config?: JsExtractionConfig | undefined | null): Promise<Array<JsExtractionResult>>
44
+ export declare function batchExtractBytes(dataList: Array<Buffer>, mimeTypes: Array<string>, config?: JsExtractionConfig | undefined | null, fileConfigs?: Array<JsFileExtractionConfig | undefined | null> | undefined | null): Promise<Array<JsExtractionResult>>
44
45
 
45
46
  /**
46
47
  * Batch extract from multiple byte arrays (synchronous).
@@ -53,6 +54,7 @@ export declare function batchExtractBytes(dataList: Array<Buffer>, mimeTypes: Ar
53
54
  * * `data_list` - Array of buffers to extract
54
55
  * * `mime_types` - Array of MIME types (must match data_list length)
55
56
  * * `config` - Optional extraction configuration
57
+ * * `file_configs` - Optional per-item extraction configs (must match data_list length if provided)
56
58
  *
57
59
  * # Returns
58
60
  *
@@ -72,7 +74,7 @@ export declare function batchExtractBytes(dataList: Array<Buffer>, mimeTypes: Ar
72
74
  * const results = batchExtractBytesSync(buffers, mimeTypes, null);
73
75
  * ```
74
76
  */
75
- export declare function batchExtractBytesSync(dataList: Array<Buffer>, mimeTypes: Array<string>, config?: JsExtractionConfig | undefined | null): Array<JsExtractionResult>
77
+ export declare function batchExtractBytesSync(dataList: Array<Buffer>, mimeTypes: Array<string>, config?: JsExtractionConfig | undefined | null, fileConfigs?: Array<JsFileExtractionConfig | undefined | null> | undefined | null): Array<JsExtractionResult>
76
78
 
77
79
  /**
78
80
  * Batch extract from multiple files (asynchronous).
@@ -84,6 +86,7 @@ export declare function batchExtractBytesSync(dataList: Array<Buffer>, mimeTypes
84
86
  *
85
87
  * * `paths` - Array of file paths to extract
86
88
  * * `config` - Optional extraction configuration (applied to all files)
89
+ * * `file_configs` - Optional per-file extraction configs (must match paths length if provided)
87
90
  *
88
91
  * # Returns
89
92
  *
@@ -99,7 +102,7 @@ export declare function batchExtractBytesSync(dataList: Array<Buffer>, mimeTypes
99
102
  * console.log(`Processed ${results.length} files`);
100
103
  * ```
101
104
  */
102
- export declare function batchExtractFiles(paths: Array<string>, config?: JsExtractionConfig | undefined | null): Promise<Array<JsExtractionResult>>
105
+ export declare function batchExtractFiles(paths: Array<string>, config?: JsExtractionConfig | undefined | null, fileConfigs?: Array<JsFileExtractionConfig | undefined | null> | undefined | null): Promise<Array<JsExtractionResult>>
103
106
 
104
107
  /**
105
108
  * Extract multiple files using worker threads from the pool.
@@ -135,7 +138,7 @@ export declare function batchExtractFiles(paths: Array<string>, config?: JsExtra
135
138
  */
136
139
  export declare function batchExtractFilesInWorker(pool: JsWorkerPool, filePaths: Array<string>, config?: JsExtractionConfig | undefined | null): Promise<Array<JsExtractionResult>>
137
140
 
138
- export declare function batchExtractFilesSync(paths: Array<string>, config?: JsExtractionConfig | undefined | null): Array<JsExtractionResult>
141
+ export declare function batchExtractFilesSync(paths: Array<string>, config?: JsExtractionConfig | undefined | null, fileConfigs?: Array<JsFileExtractionConfig | undefined | null> | undefined | null): Array<JsExtractionResult>
139
142
 
140
143
  export declare function classifyError(errorMessage: string): ErrorClassification
141
144
 
@@ -575,7 +578,7 @@ export declare function extractFile(filePath: string, mimeType?: string | undefi
575
578
  * console.log(result.content);
576
579
  * ```
577
580
  */
578
- export declare function extractFileInWorker(pool: JsWorkerPool, filePath: string, password?: string | undefined | null, config?: JsExtractionConfig | undefined | null): Promise<JsExtractionResult>
581
+ export declare function extractFileInWorker(pool: JsWorkerPool, filePath: string, mimeType?: string | undefined | null, config?: JsExtractionConfig | undefined | null): Promise<JsExtractionResult>
579
582
 
580
583
  export declare function extractFileSync(filePath: string, mimeType?: string | undefined | null, config?: JsExtractionConfig | undefined | null): JsExtractionResult
581
584
 
@@ -908,6 +911,19 @@ export declare function getValidTokenReductionLevels(): Array<string>
908
911
  */
909
912
  export declare function getWorkerPoolStats(pool: JsWorkerPool): WorkerPoolStats
910
913
 
914
+ /**
915
+ * Hardware acceleration configuration for ONNX Runtime inference.
916
+ *
917
+ * Controls which execution provider (CPU, CoreML, CUDA, TensorRT) is used
918
+ * for layout detection and embedding generation.
919
+ */
920
+ export interface JsAccelerationConfig {
921
+ /** Execution provider: "auto" (default), "cpu", "coreml", "cuda", "tensorrt". */
922
+ provider?: string
923
+ /** GPU device ID for CUDA/TensorRT. Ignored for CPU/CoreML/Auto. */
924
+ deviceId?: number
925
+ }
926
+
911
927
  export interface JsBoundingBox {
912
928
  x0: number
913
929
  y0: number
@@ -949,6 +965,12 @@ export interface JsChunkMetadata {
949
965
  headingContext?: JsHeadingContext
950
966
  }
951
967
 
968
+ /** Concurrency configuration for Node.js bindings. */
969
+ export interface JsConcurrencyConfig {
970
+ /** Maximum number of threads for all internal thread pools. */
971
+ maxThreads?: number
972
+ }
973
+
952
974
  export interface JsElement {
953
975
  elementId: string
954
976
  elementType: string
@@ -964,6 +986,16 @@ export interface JsElementMetadata {
964
986
  additional?: Record<string, string> | undefined
965
987
  }
966
988
 
989
+ /** Email extraction configuration for Node.js bindings. */
990
+ export interface JsEmailConfig {
991
+ /**
992
+ * Windows codepage number for MSG files with no codepage property.
993
+ * Common values: 1250 (Central European), 1251 (Cyrillic), 1252 (Western, default),
994
+ * 1253 (Greek), 1254 (Turkish), 932 (Japanese), 936 (Simplified Chinese).
995
+ */
996
+ msgFallbackCodepage?: number
997
+ }
998
+
967
999
  /** Embedding generation configuration for Node.js bindings. */
968
1000
  export interface JsEmbeddingConfig {
969
1001
  /** Embedding model configuration */
@@ -983,15 +1015,14 @@ export interface JsEmbeddingConfig {
983
1015
  *
984
1016
  * This struct represents different embedding model sources:
985
1017
  * - `preset`: Use a named preset (e.g., "balanced", "fast", "quality", "multilingual")
986
- * - `fastembed`: Use a FastEmbed model with custom dimensions
987
- * - `custom`: Use a custom ONNX model
1018
+ * - `custom`: Use a custom ONNX model from HuggingFace
988
1019
  */
989
1020
  export interface JsEmbeddingModelType {
990
- /** Type of model: "preset", "fastembed", or "custom" */
1021
+ /** Type of model: "preset" or "custom" */
991
1022
  modelType: string
992
- /** For preset: preset name; for fastembed/custom: model ID */
1023
+ /** For preset: preset name; for custom: HuggingFace model ID */
993
1024
  value: string
994
- /** Number of dimensions (only for fastembed/custom) */
1025
+ /** Number of dimensions (only for custom) */
995
1026
  dimensions?: number
996
1027
  }
997
1028
 
@@ -1038,6 +1069,16 @@ export interface JsExtractionConfig {
1038
1069
  resultFormat?: string
1039
1070
  /** Include document structure in extraction result */
1040
1071
  includeDocumentStructure?: boolean
1072
+ /** Layout detection configuration (None = layout detection disabled) */
1073
+ layout?: JsLayoutDetectionConfig
1074
+ /** Email extraction configuration */
1075
+ email?: JsEmailConfig
1076
+ /** Hardware acceleration configuration for ONNX Runtime inference */
1077
+ acceleration?: JsAccelerationConfig
1078
+ /** Security limits to guard against DoS attacks */
1079
+ securityLimits?: JsSecurityLimits
1080
+ /** Concurrency configuration for thread pool control */
1081
+ concurrency?: JsConcurrencyConfig
1041
1082
  }
1042
1083
 
1043
1084
  export interface JsExtractionResult {
@@ -1059,6 +1100,29 @@ export interface JsExtractionResult {
1059
1100
  annotations?: Array<JsPdfAnnotation>
1060
1101
  }
1061
1102
 
1103
+ export interface JsFileExtractionConfig {
1104
+ enableQualityProcessing?: boolean
1105
+ ocr?: JsOcrConfig
1106
+ forceOcr?: boolean
1107
+ chunking?: JsChunkingConfig
1108
+ images?: JsImageExtractionConfig
1109
+ pdfOptions?: JsPdfConfig
1110
+ tokenReduction?: JsTokenReductionConfig
1111
+ languageDetection?: JsLanguageDetectionConfig
1112
+ postprocessor?: JsPostProcessorConfig
1113
+ keywords?: JsKeywordConfig
1114
+ htmlOptions?: JsHtmlOptions
1115
+ pages?: JsPageConfig
1116
+ /** Output text format: "plain" | "markdown" | "djot" | "html" */
1117
+ outputFormat?: string
1118
+ /** Result structure format: "unified" | "element_based" */
1119
+ resultFormat?: string
1120
+ /** Include document structure in extraction result */
1121
+ includeDocumentStructure?: boolean
1122
+ /** Layout detection configuration (None = layout detection disabled) */
1123
+ layout?: JsLayoutDetectionConfig
1124
+ }
1125
+
1062
1126
  export interface JsHeadingContext {
1063
1127
  headings: Array<JsHeadingLevel>
1064
1128
  }
@@ -1149,6 +1213,12 @@ export interface JsLanguageDetectionConfig {
1149
1213
  detectMultiple?: boolean
1150
1214
  }
1151
1215
 
1216
+ export interface JsLayoutDetectionConfig {
1217
+ preset?: string
1218
+ confidenceThreshold?: number
1219
+ applyHeuristics?: boolean
1220
+ }
1221
+
1152
1222
  export interface JsOcrConfig {
1153
1223
  backend: string
1154
1224
  language?: string
@@ -1212,6 +1282,7 @@ export interface JsPdfConfig {
1212
1282
  extractAnnotations?: boolean
1213
1283
  topMarginFraction?: number
1214
1284
  bottomMarginFraction?: number
1285
+ allowSingleColumnTables?: boolean
1215
1286
  }
1216
1287
 
1217
1288
  export interface JsPostProcessorConfig {
@@ -1230,6 +1301,28 @@ export interface JsRakeParams {
1230
1301
  maxWordsPerPhrase?: number
1231
1302
  }
1232
1303
 
1304
+ /** Security limits to protect against DoS attacks (ZIP bombs, XML entity expansion, etc.). */
1305
+ export interface JsSecurityLimits {
1306
+ /** Maximum uncompressed size for archives in bytes. */
1307
+ maxArchiveSize?: number
1308
+ /** Maximum compression ratio before flagging as potential bomb. */
1309
+ maxCompressionRatio?: number
1310
+ /** Maximum number of files in an archive. */
1311
+ maxFilesInArchive?: number
1312
+ /** Maximum nesting depth for structures. */
1313
+ maxNestingDepth?: number
1314
+ /** Maximum entity/string length. */
1315
+ maxEntityLength?: number
1316
+ /** Maximum content size in bytes. */
1317
+ maxContentSize?: number
1318
+ /** Maximum iterations per operation. */
1319
+ maxIterations?: number
1320
+ /** Maximum XML depth in levels. */
1321
+ maxXmlDepth?: number
1322
+ /** Maximum cells per table. */
1323
+ maxTableCells?: number
1324
+ }
1325
+
1233
1326
  export interface JsTable {
1234
1327
  cells: Array<Array<string>>
1235
1328
  markdown: string