@kreuzberg/node 4.0.0-rc.8 → 4.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/types.d.ts CHANGED
@@ -4,108 +4,316 @@
4
4
  * These types mirror the strongly-typed Rust metadata structures,
5
5
  * providing type safety for TypeScript users.
6
6
  */
7
+ /**
8
+ * Tesseract OCR engine configuration options.
9
+ *
10
+ * @example
11
+ * ```typescript
12
+ * const config: TesseractConfig = {
13
+ * psm: 6,
14
+ * enableTableDetection: true,
15
+ * tesseditCharWhitelist: '0123456789'
16
+ * };
17
+ * ```
18
+ */
7
19
  interface TesseractConfig {
20
+ /**
21
+ * Page Segmentation Mode (0-13). Controls how Tesseract segments and recognizes text.
22
+ * Common values: 3 (auto), 6 (single uniform block), 11 (sparse text).
23
+ * Default: 3 (auto layout analysis).
24
+ */
8
25
  psm?: number;
26
+ /**
27
+ * Enable table detection during OCR processing.
28
+ * When true, Tesseract attempts to preserve table structure in the output.
29
+ * Default: false.
30
+ */
9
31
  enableTableDetection?: boolean;
32
+ /**
33
+ * Whitelist of characters Tesseract should recognize.
34
+ * Only these characters will be returned by the OCR engine.
35
+ * Use empty string to allow all characters. Useful for constraining output to digits,
36
+ * specific alphabets, or other character sets.
37
+ * Default: null (recognize all).
38
+ */
10
39
  tesseditCharWhitelist?: string;
11
40
  }
41
+ /**
42
+ * OCR (Optical Character Recognition) configuration.
43
+ *
44
+ * Controls which OCR engine to use and how it processes images.
45
+ */
12
46
  interface OcrConfig {
47
+ /** OCR backend name (e.g., 'tesseract', 'paddleocr', 'easyocr'). Required. */
13
48
  backend: string;
49
+ /** ISO 639-1/3 language code(s) for OCR (e.g., 'eng', 'fra', 'deu'). Default: 'eng'. */
14
50
  language?: string;
51
+ /** Tesseract engine-specific configuration options. Only used when backend is 'tesseract'. */
15
52
  tesseractConfig?: TesseractConfig;
16
53
  }
54
+ /**
55
+ * Document chunking configuration for splitting large documents.
56
+ *
57
+ * Breaks large documents into smaller, manageable chunks while preserving context.
58
+ * Useful for RAG (Retrieval Augmented Generation) and vector database indexing.
59
+ */
17
60
  interface ChunkingConfig {
61
+ /** Maximum characters per chunk. Default: 4096. */
18
62
  maxChars?: number;
63
+ /** Maximum overlapping characters between consecutive chunks for context preservation. Default: 512. */
19
64
  maxOverlap?: number;
65
+ /**
66
+ * Alternative to maxChars: chunk size using a different unit.
67
+ * Mutually exclusive with maxChars.
68
+ */
20
69
  chunkSize?: number;
70
+ /**
71
+ * Alternative to maxOverlap: overlap amount using a different unit.
72
+ * Mutually exclusive with maxOverlap.
73
+ */
21
74
  chunkOverlap?: number;
75
+ /**
76
+ * Named preset configuration (e.g., 'default', 'aggressive', 'minimal').
77
+ * Uses preset values if neither maxChars nor chunkSize is specified.
78
+ */
22
79
  preset?: string;
80
+ /** Embedding configuration for generating vector embeddings for each chunk. */
23
81
  embedding?: Record<string, unknown>;
82
+ /** Enable or disable chunking. Default: true when chunking config is provided. */
24
83
  enabled?: boolean;
25
84
  }
85
+ /**
86
+ * Language detection configuration.
87
+ *
88
+ * Automatically detects the language(s) of extracted content.
89
+ */
26
90
  interface LanguageDetectionConfig {
91
+ /** Enable automatic language detection. Default: true. */
27
92
  enabled?: boolean;
93
+ /** Minimum confidence score (0.0-1.0) for language detection. Default: 0.5. */
28
94
  minConfidence?: number;
95
+ /** Detect multiple languages in the same document. Default: false. */
29
96
  detectMultiple?: boolean;
30
97
  }
98
+ /**
99
+ * Token reduction configuration for optimizing token usage.
100
+ *
101
+ * Reduces the number of tokens in extracted content while preserving meaning.
102
+ * Useful for reducing costs in LLM pipelines.
103
+ */
31
104
  interface TokenReductionConfig {
105
+ /** Reduction mode: 'aggressive' or 'conservative'. Default: 'conservative'. */
32
106
  mode?: string;
107
+ /** Preserve tokens for semantically important words even in aggressive mode. Default: true. */
33
108
  preserveImportantWords?: boolean;
34
109
  }
110
+ /**
111
+ * Hierarchy extraction configuration.
112
+ *
113
+ * Controls document hierarchy detection based on font size clustering.
114
+ */
115
+ interface HierarchyConfig {
116
+ /** Enable hierarchy extraction. Default: true. */
117
+ enabled?: boolean;
118
+ /** Number of font size clusters (2-10). Default: 6. */
119
+ kClusters?: number;
120
+ /** Include bounding box information. Default: true. */
121
+ includeBbox?: boolean;
122
+ /** OCR coverage threshold (0.0-1.0). Default: null. */
123
+ ocrCoverageThreshold?: number | null;
124
+ }
125
+ /**
126
+ * PDF-specific extraction configuration.
127
+ *
128
+ * Controls how PDF documents are processed.
129
+ */
35
130
  interface PdfConfig {
131
+ /** Extract images from PDF pages. Default: true. */
36
132
  extractImages?: boolean;
133
+ /** List of passwords to try for password-protected PDFs. */
37
134
  passwords?: string[];
135
+ /** Extract document metadata (title, author, creation date, etc.). Default: true. */
38
136
  extractMetadata?: boolean;
137
+ /** Hierarchy extraction configuration. */
138
+ hierarchy?: HierarchyConfig;
39
139
  }
140
+ /**
141
+ * Image extraction and processing configuration.
142
+ *
143
+ * Controls how images are extracted and optimized from documents.
144
+ */
40
145
  interface ImageExtractionConfig {
146
+ /** Enable image extraction from documents. Default: true. */
41
147
  extractImages?: boolean;
148
+ /** Target DPI (dots per inch) for extracted images. Higher DPI = better quality but larger files. Default: 150. */
42
149
  targetDpi?: number;
150
+ /** Maximum image dimension (width or height) in pixels. Images larger than this are downscaled. Default: 2000. */
43
151
  maxImageDimension?: number;
152
+ /** Automatically adjust DPI based on image content and quality. Default: true. */
44
153
  autoAdjustDpi?: boolean;
154
+ /** Minimum DPI to maintain for image quality. Default: 72. */
45
155
  minDpi?: number;
156
+ /** Maximum DPI to avoid excessive file sizes. Default: 300. */
46
157
  maxDpi?: number;
47
158
  }
159
+ /**
160
+ * Post-processor configuration for modifying extracted content.
161
+ *
162
+ * Post-processors allow customization and cleanup of extraction results
163
+ * without failing the extraction if they encounter errors.
164
+ */
48
165
  interface PostProcessorConfig {
166
+ /** Enable or disable post-processing entirely. Default: true. */
49
167
  enabled?: boolean;
168
+ /** List of processor names to enable (allowlist). When set, only these are used. */
50
169
  enabledProcessors?: string[];
170
+ /** List of processor names to disable (denylist). These are skipped. */
51
171
  disabledProcessors?: string[];
52
172
  }
173
+ /**
174
+ * HTML preprocessing options.
175
+ *
176
+ * Cleans HTML content before conversion to Markdown.
177
+ */
53
178
  interface HtmlPreprocessingOptions {
179
+ /** Enable HTML preprocessing. Default: true. */
54
180
  enabled?: boolean;
181
+ /** Preset cleanup level: 'minimal' (light), 'standard' (balanced), 'aggressive' (heavy). Default: 'standard'. */
55
182
  preset?: "minimal" | "standard" | "aggressive";
183
+ /** Remove navigation menus and headers. Default: true. */
56
184
  removeNavigation?: boolean;
185
+ /** Remove form elements. Default: true. */
57
186
  removeForms?: boolean;
58
187
  }
188
+ /**
189
+ * HTML to Markdown conversion configuration options.
190
+ *
191
+ * Controls how HTML content is converted to Markdown format, including formatting,
192
+ * escaping, and special handling for various HTML elements.
193
+ */
59
194
  interface HtmlConversionOptions {
195
+ /** Heading style conversion: "atx" (# style), "underlined" (underline style), or "atx_closed" (# style closed). Default: "atx". */
60
196
  headingStyle?: "atx" | "underlined" | "atx_closed";
197
+ /** List indentation type: "spaces" or "tabs". Default: "spaces". */
61
198
  listIndentType?: "spaces" | "tabs";
199
+ /** Number of spaces/tabs per list indent level. Default: 4. */
62
200
  listIndentWidth?: number;
201
+ /** Bullet characters for unordered lists (e.g., '*', '-', '+'). Default: '*'. */
63
202
  bullets?: string;
203
+ /** Markdown symbol for strong/bold emphasis: '**' or '__'. Default: '**'. */
64
204
  strongEmSymbol?: string;
205
+ /** Escape asterisks (*) in text to prevent accidental formatting. Default: false. */
65
206
  escapeAsterisks?: boolean;
207
+ /** Escape underscores (_) in text to prevent accidental formatting. Default: false. */
66
208
  escapeUnderscores?: boolean;
209
+ /** Escape miscellaneous special characters. Default: false. */
67
210
  escapeMisc?: boolean;
211
+ /** Escape ASCII control characters. Default: false. */
68
212
  escapeAscii?: boolean;
213
+ /** Default code language for syntax highlighting in code blocks (e.g., 'javascript'). Default: null. */
69
214
  codeLanguage?: string;
215
+ /** Convert HTML links whose text matches their URL to Markdown autolink format (<url>). Default: true. */
70
216
  autolinks?: boolean;
217
+ /** Use the HTML title element as default for links when no text is available. Default: false. */
71
218
  defaultTitle?: boolean;
219
+ /** Insert <br> tags in Markdown tables. Default: false. */
72
220
  brInTables?: boolean;
221
+ /** Use HOCR spatial table format for better table structure preservation. Default: false. */
73
222
  hocrSpatialTables?: boolean;
223
+ /** Highlight style for marked/highlighted text: "double_equal" (==text==), "html" (<mark>), "bold" (**text**), or "none". Default: "none". */
74
224
  highlightStyle?: "double_equal" | "html" | "bold" | "none";
225
+ /** Extract metadata from HTML (title, meta tags, etc.). Default: false. */
75
226
  extractMetadata?: boolean;
227
+ /** Whitespace handling: "normalized" (collapse whitespace) or "strict" (preserve all whitespace). Default: "normalized". */
76
228
  whitespaceMode?: "normalized" | "strict";
229
+ /** Remove newlines from output (convert to single line). Default: false. */
77
230
  stripNewlines?: boolean;
231
+ /** Enable line wrapping at specified width. Default: true. */
78
232
  wrap?: boolean;
233
+ /** Maximum line width when wrapping is enabled. Default: 80. */
79
234
  wrapWidth?: number;
235
+ /** Convert as inline Markdown instead of block elements. Default: false. */
80
236
  convertAsInline?: boolean;
237
+ /** Markdown symbol for subscript text (e.g., '~' for ~text~). Default: '~'. */
81
238
  subSymbol?: string;
239
+ /** Markdown symbol for superscript text (e.g., '^' for ^text^). Default: '^'. */
82
240
  supSymbol?: string;
241
+ /** Newline style in output: "spaces" (two spaces + newline) or "backslash" (backslash + newline). Default: "spaces". */
83
242
  newlineStyle?: "spaces" | "backslash";
243
+ /** Code block style: "indented" (4-space indent), "backticks" (```), or "tildes" (~~~). Default: "backticks". */
84
244
  codeBlockStyle?: "indented" | "backticks" | "tildes";
245
+ /** List of HTML tag names to keep as inline images (don't convert). Default: []. */
85
246
  keepInlineImagesIn?: string[];
247
+ /** Character encoding for output (e.g., 'utf-8', 'ascii'). Default: 'utf-8'. */
86
248
  encoding?: string;
249
+ /** Enable debug mode for detailed conversion logging. Default: false. */
87
250
  debug?: boolean;
251
+ /** List of HTML tag names to remove entirely from output. Default: []. */
88
252
  stripTags?: string[];
253
+ /** List of HTML tag names to preserve in output (don't convert to Markdown). Default: []. */
89
254
  preserveTags?: string[];
255
+ /** HTML preprocessing options for cleaning HTML before conversion. */
90
256
  preprocessing?: HtmlPreprocessingOptions;
91
257
  }
258
+ /** Keyword extraction algorithm type. */
92
259
  type KeywordAlgorithm = "yake" | "rake";
260
+ /**
261
+ * YAKE (Yet Another Keyword Extractor) algorithm configuration.
262
+ *
263
+ * YAKE is an unsupervised keyword extraction method that doesn't require training data.
264
+ */
93
265
  interface YakeParams {
266
+ /** Window size for co-occurrence analysis (number of words to consider). Default: 3. */
94
267
  windowSize?: number;
95
268
  }
269
+ /**
270
+ * RAKE (Rapid Automatic Keyword Extraction) algorithm configuration.
271
+ *
272
+ * RAKE extracts keywords based on word co-occurrence and statistical measures.
273
+ */
96
274
  interface RakeParams {
275
+ /** Minimum word length to consider as keyword. Default: 3. */
97
276
  minWordLength?: number;
277
+ /** Maximum number of words per keyword phrase. Default: 3. */
98
278
  maxWordsPerPhrase?: number;
99
279
  }
280
+ /**
281
+ * Keyword extraction configuration.
282
+ *
283
+ * Extracts important keywords/phrases from document content using YAKE or RAKE algorithms.
284
+ */
100
285
  interface KeywordConfig {
286
+ /** Extraction algorithm: "yake" or "rake". Default: "yake". */
101
287
  algorithm?: KeywordAlgorithm;
288
+ /** Maximum number of keywords to extract. Default: 10. */
102
289
  maxKeywords?: number;
290
+ /** Minimum relevance score (0.0-1.0) for keywords. Keywords below this are filtered out. Default: 0.1. */
103
291
  minScore?: number;
292
+ /** N-gram range: [min_length, max_length] for phrase keywords (e.g., [1, 3] for 1-3 word phrases). Default: [1, 3]. */
104
293
  ngramRange?: [number, number];
294
+ /** Language for keyword extraction (e.g., 'en', 'de', 'fr'). Default: 'en'. */
105
295
  language?: string;
296
+ /** YAKE algorithm-specific parameters. Only used when algorithm is "yake". */
106
297
  yakeParams?: YakeParams;
298
+ /** RAKE algorithm-specific parameters. Only used when algorithm is "rake". */
107
299
  rakeParams?: RakeParams;
108
300
  }
301
+ /**
302
+ * Extracted keyword with relevance metadata.
303
+ *
304
+ * Represents a single keyword extracted from text along with its relevance score,
305
+ * the algorithm that extracted it, and optional position information.
306
+ */
307
+ interface ExtractedKeyword {
308
+ /** The keyword text */
309
+ text: string;
310
+ /** Relevance score (higher is better, algorithm-specific range) */
311
+ score: number;
312
+ /** Algorithm that extracted this keyword */
313
+ algorithm: KeywordAlgorithm;
314
+ /** Optional positions where keyword appears in text (character offsets) */
315
+ positions?: number[];
316
+ }
109
317
  /**
110
318
  * Page tracking and extraction configuration.
111
319
  *
@@ -113,7 +321,7 @@ interface KeywordConfig {
113
321
  * Page range information in chunk metadata (first_page/last_page) is automatically
114
322
  * enabled when page boundaries are available and chunking is configured.
115
323
  */
116
- interface PageConfig {
324
+ interface PageExtractionConfig {
117
325
  /** Extract pages as separate array (ExtractionResult.pages) */
118
326
  extractPages?: boolean;
119
327
  /** Insert page markers in main content string */
@@ -121,25 +329,53 @@ interface PageConfig {
121
329
  /** Page marker format (use {page_num} placeholder) */
122
330
  markerFormat?: string;
123
331
  }
332
+ /**
333
+ * Main extraction configuration interface.
334
+ *
335
+ * Combines all sub-configurations for document extraction, OCR, chunking, post-processing, etc.
336
+ * All fields are optional and use sensible defaults.
337
+ */
124
338
  interface ExtractionConfig {
339
+ /** Enable caching of extraction results for identical inputs. Default: true. */
125
340
  useCache?: boolean;
341
+ /** Enable quality processing filters to improve extraction reliability. Default: false. */
126
342
  enableQualityProcessing?: boolean;
343
+ /** OCR configuration for text extraction from images. Only used when document contains images or forceOcr is true. */
127
344
  ocr?: OcrConfig;
345
+ /** Force OCR processing even for documents with selectable text. Useful for scanned documents. Default: false. */
128
346
  forceOcr?: boolean;
347
+ /** Chunking configuration for splitting documents into smaller pieces for RAG or vector DB. */
129
348
  chunking?: ChunkingConfig;
349
+ /** Image extraction and optimization configuration. */
130
350
  images?: ImageExtractionConfig;
351
+ /** PDF-specific extraction options (passwords, metadata, etc.). */
131
352
  pdfOptions?: PdfConfig;
353
+ /** Token reduction configuration for optimizing token usage in LLM pipelines. */
132
354
  tokenReduction?: TokenReductionConfig;
355
+ /** Language detection configuration for automatic language identification. */
133
356
  languageDetection?: LanguageDetectionConfig;
357
+ /** Post-processor configuration for customizing extraction results. */
134
358
  postprocessor?: PostProcessorConfig;
359
+ /** HTML to Markdown conversion options for HTML content. */
135
360
  htmlOptions?: HtmlConversionOptions;
361
+ /** Keyword extraction configuration for extracting important phrases. */
136
362
  keywords?: KeywordConfig;
137
- pages?: PageConfig;
363
+ /** Page tracking and extraction configuration for multi-page documents. */
364
+ pages?: PageExtractionConfig;
365
+ /** Maximum number of concurrent extractions in batch operations. Default: 4. */
138
366
  maxConcurrentExtractions?: number;
139
367
  }
368
+ /**
369
+ * Extracted table data from document.
370
+ *
371
+ * Contains both cell data and Markdown representation for easy display and processing.
372
+ */
140
373
  interface Table {
374
+ /** 2D array of cell contents (rows × columns) */
141
375
  cells: string[][];
376
+ /** Markdown representation of the table for display or parsing */
142
377
  markdown: string;
378
+ /** Page number where this table was found (1-indexed) */
143
379
  pageNumber: number;
144
380
  }
145
381
  interface ExcelMetadata {
@@ -180,28 +416,50 @@ interface TextMetadata {
180
416
  links?: [string, string][] | null;
181
417
  codeBlocks?: [string, string][] | null;
182
418
  }
419
+ interface HeaderMetadata {
420
+ level: number;
421
+ text: string;
422
+ id?: string | null;
423
+ depth: number;
424
+ htmlOffset: number;
425
+ }
426
+ interface LinkMetadata {
427
+ href: string;
428
+ text: string;
429
+ title?: string | null;
430
+ linkType: "anchor" | "internal" | "external" | "email" | "phone" | "other";
431
+ rel: string[];
432
+ attributes: Record<string, string>;
433
+ }
434
+ interface HtmlImageMetadata {
435
+ src: string;
436
+ alt?: string | null;
437
+ title?: string | null;
438
+ dimensions?: [number, number] | null;
439
+ imageType: "data_uri" | "inline_svg" | "external" | "relative";
440
+ attributes: Record<string, string>;
441
+ }
442
+ interface StructuredData {
443
+ dataType: "json_ld" | "microdata" | "rdfa";
444
+ rawJson: string;
445
+ schemaType?: string | null;
446
+ }
183
447
  interface HtmlMetadata {
184
448
  title?: string | null;
185
449
  description?: string | null;
186
- keywords?: string | null;
450
+ keywords: string[];
187
451
  author?: string | null;
188
- canonical?: string | null;
452
+ canonicalUrl?: string | null;
189
453
  baseHref?: string | null;
190
- ogTitle?: string | null;
191
- ogDescription?: string | null;
192
- ogImage?: string | null;
193
- ogUrl?: string | null;
194
- ogType?: string | null;
195
- ogSiteName?: string | null;
196
- twitterCard?: string | null;
197
- twitterTitle?: string | null;
198
- twitterDescription?: string | null;
199
- twitterImage?: string | null;
200
- twitterSite?: string | null;
201
- twitterCreator?: string | null;
202
- linkAuthor?: string | null;
203
- linkLicense?: string | null;
204
- linkAlternate?: string | null;
454
+ language?: string | null;
455
+ textDirection?: "ltr" | "rtl" | "auto" | null;
456
+ openGraph: Record<string, string>;
457
+ twitterCard: Record<string, string>;
458
+ metaTags: Record<string, string>;
459
+ htmlHeaders: HeaderMetadata[];
460
+ htmlLinks: LinkMetadata[];
461
+ htmlImages: HtmlImageMetadata[];
462
+ structuredData: StructuredData[];
205
463
  }
206
464
  interface PdfMetadata {
207
465
  title?: string | null;
@@ -329,38 +587,62 @@ interface ChunkMetadata {
329
587
  /** Last page number this chunk spans (1-indexed, only when page tracking enabled) */
330
588
  lastPage?: number | null;
331
589
  }
590
+ /**
591
+ * Text chunk with optional embedding.
592
+ *
593
+ * Represents a segment of a document created by the chunking algorithm, useful for RAG and vector databases.
594
+ */
332
595
  interface Chunk {
596
+ /** Text content of this chunk */
333
597
  content: string;
598
+ /** Vector embedding for this chunk (if embedding model was used) */
334
599
  embedding?: number[] | null;
600
+ /** Metadata about chunk position and properties in the document */
335
601
  metadata: ChunkMetadata;
336
602
  }
603
+ /**
604
+ * Extracted image from document with optional OCR result.
605
+ *
606
+ * Contains image data and metadata about position, dimensions, and properties.
607
+ */
337
608
  interface ExtractedImage {
609
+ /** Raw image bytes as Uint8Array */
338
610
  data: Uint8Array;
611
+ /** Image format (e.g., 'png', 'jpeg', 'tiff') */
339
612
  format: string;
613
+ /** Sequential index of this image in the document (0-indexed) */
340
614
  imageIndex: number;
615
+ /** Page number where this image was found (1-indexed), null if unknown */
341
616
  pageNumber?: number | null;
617
+ /** Image width in pixels, null if unknown */
342
618
  width?: number | null;
619
+ /** Image height in pixels, null if unknown */
343
620
  height?: number | null;
621
+ /** Color space (e.g., 'RGB', 'CMYK', 'Grayscale'), null if unknown */
344
622
  colorspace?: string | null;
623
+ /** Bits per color component (e.g., 8 for 8-bit), null if unknown */
345
624
  bitsPerComponent?: number | null;
625
+ /** Whether this is a mask image (used internally by PDF) */
346
626
  isMask: boolean;
627
+ /** Image description or caption if available */
347
628
  description?: string | null;
629
+ /** OCR extraction result if OCR was run on this image, null otherwise */
348
630
  ocrResult?: ExtractionResult | null;
349
631
  }
350
632
  /**
351
633
  * Content for a single page/slide/sheet.
352
634
  *
353
635
  * When page extraction is enabled, documents are split into per-page content
354
- * with associated tables and images mapped to each page.
636
+ * with associated tables and images mapped to each page. This allows for page-specific processing.
355
637
  */
356
638
  interface PageContent {
357
- /** Page number (1-indexed) */
639
+ /** Page number of this page (1-indexed) */
358
640
  pageNumber: number;
359
- /** Text content for this page */
641
+ /** Text content extracted from this page */
360
642
  content: string;
361
- /** Tables found on this page */
643
+ /** Tables found and extracted from this page */
362
644
  tables: Table[];
363
- /** Images found on this page */
645
+ /** Images found and extracted from this page */
364
646
  images: ExtractedImage[];
365
647
  }
366
648
  /**
@@ -413,23 +695,17 @@ interface Metadata {
413
695
  headers?: string[] | null;
414
696
  links?: [string, string][] | null;
415
697
  code_blocks?: [string, string][] | null;
416
- canonical?: string | null;
698
+ canonical_url?: string | null;
417
699
  base_href?: string | null;
418
- og_title?: string | null;
419
- og_description?: string | null;
420
- og_image?: string | null;
421
- og_url?: string | null;
422
- og_type?: string | null;
423
- og_site_name?: string | null;
424
- twitter_card?: string | null;
425
- twitter_title?: string | null;
426
- twitter_description?: string | null;
427
- twitter_image?: string | null;
428
- twitter_site?: string | null;
429
- twitter_creator?: string | null;
430
- link_author?: string | null;
431
- link_license?: string | null;
432
- link_alternate?: string | null;
700
+ open_graph?: Record<string, string>;
701
+ twitter_card?: Record<string, string>;
702
+ meta_tags?: Record<string, string>;
703
+ html_language?: string | null;
704
+ text_direction?: "ltr" | "rtl" | "auto" | null;
705
+ html_headers?: HeaderMetadata[];
706
+ html_links?: LinkMetadata[];
707
+ html_images?: HtmlImageMetadata[];
708
+ structured_data?: StructuredData[];
433
709
  psm?: number;
434
710
  output_format?: string;
435
711
  table_count?: number;
@@ -439,92 +715,141 @@ interface Metadata {
439
715
  json_schema?: Record<string, unknown> | null;
440
716
  page_structure?: PageStructure | null;
441
717
  error?: ErrorMetadata | null;
442
- [key: string]: any;
718
+ /**
719
+ * Additional fields may be added at runtime by postprocessors.
720
+ * Use bracket notation to safely access unexpected properties.
721
+ */
722
+ [key: string]: unknown;
443
723
  }
724
+ /**
725
+ * Complete extraction result from document processing.
726
+ *
727
+ * Contains all extracted content, metadata, and optional processed data like chunks and images.
728
+ * This is the primary return value from extraction functions.
729
+ */
444
730
  interface ExtractionResult {
731
+ /** Extracted text content from the document (main content) */
445
732
  content: string;
733
+ /** MIME type of the input document (e.g., 'application/pdf', 'text/html') */
446
734
  mimeType: string;
735
+ /** Document metadata including title, author, creation date, language, and format-specific fields */
447
736
  metadata: Metadata;
737
+ /** Tables extracted from the document (2D cell arrays with Markdown representation) */
448
738
  tables: Table[];
739
+ /** Detected languages in the document (ISO 639-1 codes, e.g., ['en', 'de']), null if detection disabled */
449
740
  detectedLanguages: string[] | null;
741
+ /** Document chunks for RAG/vector databases (if chunking was enabled), null otherwise */
450
742
  chunks: Chunk[] | null;
743
+ /** Images extracted from document with metadata (if image extraction was enabled), null otherwise */
451
744
  images: ExtractedImage[] | null;
745
+ /** Per-page content when page extraction is enabled, null otherwise. Each item contains page number, content, tables, and images. */
452
746
  pages?: PageContent[] | null;
747
+ /** Extracted keywords when keyword extraction is enabled, null otherwise */
748
+ keywords?: ExtractedKeyword[] | null;
453
749
  }
750
+ /** Post-processor execution stage in the extraction pipeline. */
454
751
  type ProcessingStage = "early" | "middle" | "late";
752
+ /**
753
+ * Protocol for custom post-processors that modify extraction results.
754
+ *
755
+ * Post-processors enrich or transform extraction results without failing the extraction.
756
+ * If a post-processor throws an error, it's logged but extraction continues.
757
+ * Only works with async extraction functions (`extractFile`, `extractBytes`, etc.).
758
+ */
455
759
  interface PostProcessorProtocol {
456
760
  /**
457
761
  * Return the unique name of this postprocessor.
762
+ *
763
+ * @returns Unique processor name (case-sensitive, alphanumeric + underscores recommended)
458
764
  */
459
765
  name(): string;
460
766
  /**
461
767
  * Process and enrich an extraction result.
462
768
  *
769
+ * Modify the result to add new metadata, transform content, or perform other enrichment.
770
+ * If this throws an error, it's logged but extraction continues.
771
+ *
463
772
  * @param result - ExtractionResult with extracted content, metadata, and tables
464
- * @returns Modified result with enriched metadata
773
+ * @returns Modified result with enriched data. Can be async or sync.
465
774
  */
466
775
  process(result: ExtractionResult): ExtractionResult | Promise<ExtractionResult>;
467
776
  /**
468
777
  * Return the processing stage for this processor.
469
778
  *
779
+ * Determines when this processor runs relative to others:
780
+ * - "early": Runs first, before other processors (good for cleanup/normalization)
781
+ * - "middle": Runs with other middle-stage processors (default)
782
+ * - "late": Runs last, after others (good for final enrichment)
783
+ *
470
784
  * @returns One of "early", "middle", or "late" (default: "middle")
471
785
  */
472
786
  processingStage?(): ProcessingStage;
473
787
  /**
474
- * Initialize the processor (e.g., load ML models).
788
+ * Initialize the processor (e.g., load ML models, setup resources).
475
789
  *
476
- * Called once when the processor is registered.
790
+ * Called once when the processor is first registered. Use for expensive operations.
477
791
  */
478
792
  initialize?(): void | Promise<void>;
479
793
  /**
480
794
  * Shutdown the processor and release resources.
481
795
  *
482
- * Called when the processor is unregistered.
796
+ * Called when the processor is unregistered. Use for cleanup (closing connections, freeing memory).
483
797
  */
484
798
  shutdown?(): void | Promise<void>;
485
799
  }
800
+ /**
801
+ * Protocol for custom validators that check extraction results.
802
+ *
803
+ * Validators perform quality checks and fail the extraction if validation fails.
804
+ * Unlike post-processors, validator errors cause the entire extraction to fail.
805
+ * Useful for enforcing quality standards on extracted content.
806
+ */
486
807
  interface ValidatorProtocol {
487
808
  /**
488
809
  * Return the unique name of this validator.
810
+ *
811
+ * @returns Unique validator name (case-sensitive, alphanumeric + underscores recommended)
489
812
  */
490
813
  name(): string;
491
814
  /**
492
815
  * Validate an extraction result.
493
816
  *
494
- * Throw an error if validation fails. The error message should explain why validation failed.
495
- * If validation passes, return without throwing.
817
+ * Throw an error if validation fails. The error message will be used as the extraction error.
818
+ * If validation passes, return without throwing (return value is ignored).
496
819
  *
497
820
  * @param result - ExtractionResult to validate
498
- * @throws Error if validation fails (extraction will fail)
821
+ * @throws {Error} If validation fails (extraction will fail with this error)
499
822
  */
500
823
  validate(result: ExtractionResult): void | Promise<void>;
501
824
  /**
502
825
  * Return the validation priority.
503
826
  *
504
- * Higher priority validators run first. Useful for running cheap validations before expensive ones.
827
+ * Higher priority validators run first. Useful for running cheap validations (e.g., length checks)
828
+ * before expensive ones (e.g., AI-based quality checks) to fail fast.
505
829
  *
506
- * @returns Priority value (higher = runs earlier, default: 50)
830
+ * @returns Priority value (higher = runs earlier, default: 50). Range: 0-1000.
507
831
  */
508
832
  priority?(): number;
509
833
  /**
510
834
  * Check if this validator should run for a given result.
511
835
  *
512
836
  * Allows conditional validation based on MIME type, metadata, or content.
837
+ * This is evaluated before validation, so expensive checks can be skipped for irrelevant documents.
513
838
  *
514
839
  * @param result - ExtractionResult to check
515
840
  * @returns true if validator should run, false to skip (default: true)
516
841
  */
517
842
  shouldValidate?(result: ExtractionResult): boolean;
518
843
  /**
519
- * Initialize the validator.
844
+ * Initialize the validator (e.g., load ML models, setup resources).
520
845
  *
521
- * Called once when the validator is registered.
846
+ * Called once when the validator is first registered. Use for expensive operations.
522
847
  */
523
848
  initialize?(): void | Promise<void>;
524
849
  /**
525
850
  * Shutdown the validator and release resources.
526
851
  *
527
- * Called when the validator is unregistered.
852
+ * Called when the validator is unregistered. Use for cleanup (closing connections, freeing memory).
528
853
  */
529
854
  shutdown?(): void | Promise<void>;
530
855
  }
@@ -662,5 +987,95 @@ interface OcrBackendProtocol {
662
987
  */
663
988
  shutdown?(): void | Promise<void>;
664
989
  }
990
+ /**
991
+ * Result of classifying an error message into an error code.
992
+ *
993
+ * Provides classification details including the error code, name,
994
+ * description, and confidence score for the classification.
995
+ *
996
+ * @example
997
+ * ```typescript
998
+ * import { classifyError, ErrorCode } from '@kreuzberg/node';
999
+ *
1000
+ * const result = classifyError("File not found in read operation");
1001
+ * if (result.code === ErrorCode.IoError) {
1002
+ * console.error(`I/O Error: ${result.description}`);
1003
+ * console.log(`Confidence: ${result.confidence}`);
1004
+ * }
1005
+ * ```
1006
+ */
1007
+ interface ErrorClassification {
1008
+ /**
1009
+ * The numeric error code (0-7) representing the error type.
1010
+ */
1011
+ code: number;
1012
+ /**
1013
+ * The human-readable name of the error code (e.g., "validation", "ocr").
1014
+ */
1015
+ name: string;
1016
+ /**
1017
+ * A brief description of the error type.
1018
+ */
1019
+ description: string;
1020
+ /**
1021
+ * Confidence score (0.0-1.0) indicating how certain the classification is.
1022
+ * Higher values indicate higher confidence in the classification.
1023
+ */
1024
+ confidence: number;
1025
+ }
1026
+ /**
1027
+ * Opaque handle to a worker pool for concurrent extraction operations.
1028
+ *
1029
+ * Worker pools enable parallel processing of CPU-bound document extraction
1030
+ * tasks by distributing work across multiple threads. This is especially
1031
+ * useful for batch processing large numbers of documents.
1032
+ *
1033
+ * @example
1034
+ * ```typescript
1035
+ * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
1036
+ *
1037
+ * const pool = createWorkerPool(4); // 4 concurrent workers
1038
+ * try {
1039
+ * const result = await extractFileInWorker(pool, 'document.pdf');
1040
+ * console.log(result.content);
1041
+ * } finally {
1042
+ * await closeWorkerPool(pool);
1043
+ * }
1044
+ * ```
1045
+ */
1046
+ interface WorkerPool {
1047
+ /** Internal pool identifier (opaque) */
1048
+ readonly poolId: number;
1049
+ }
1050
+ /**
1051
+ * Worker pool statistics.
1052
+ *
1053
+ * Provides information about the current state of a worker pool including
1054
+ * pool size, number of active workers, and queued tasks.
1055
+ *
1056
+ * @example
1057
+ * ```typescript
1058
+ * import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node';
1059
+ *
1060
+ * const pool = createWorkerPool(4);
1061
+ * const stats = getWorkerPoolStats(pool);
1062
+ * console.log(`Active: ${stats.activeWorkers}/${stats.size}`);
1063
+ * console.log(`Queued: ${stats.queuedTasks}`);
1064
+ * ```
1065
+ */
1066
+ interface WorkerPoolStats {
1067
+ /**
1068
+ * Maximum number of concurrent workers in the pool.
1069
+ */
1070
+ size: number;
1071
+ /**
1072
+ * Number of currently active (executing) workers.
1073
+ */
1074
+ activeWorkers: number;
1075
+ /**
1076
+ * Number of tasks waiting in the queue.
1077
+ */
1078
+ queuedTasks: number;
1079
+ }
665
1080
 
666
- export type { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractionConfig, ExtractionResult, HtmlConversionOptions, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, Metadata, OcrBackendProtocol, OcrConfig, OcrMetadata, PageBoundary, PageConfig, PageContent, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, RakeParams, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, XmlMetadata, YakeParams };
1081
+ export type { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorClassification, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, ExtractionConfig, ExtractionResult, HeaderMetadata, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrBackendProtocol, OcrConfig, OcrMetadata, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, WorkerPool, WorkerPoolStats, XmlMetadata, YakeParams };