@kreuzberg/node 4.0.0-rc.21 → 4.0.0-rc.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/types.d.mts CHANGED
@@ -107,6 +107,21 @@ interface TokenReductionConfig {
107
107
  /** Preserve tokens for semantically important words even in aggressive mode. Default: true. */
108
108
  preserveImportantWords?: boolean;
109
109
  }
110
+ /**
111
+ * Hierarchy extraction configuration.
112
+ *
113
+ * Controls document hierarchy detection based on font size clustering.
114
+ */
115
+ interface HierarchyConfig {
116
+ /** Enable hierarchy extraction. Default: true. */
117
+ enabled?: boolean;
118
+ /** Number of font size clusters (2-10). Default: 6. */
119
+ kClusters?: number;
120
+ /** Include bounding box information. Default: true. */
121
+ includeBbox?: boolean;
122
+ /** OCR coverage threshold (0.0-1.0). Default: null. */
123
+ ocrCoverageThreshold?: number | null;
124
+ }
110
125
  /**
111
126
  * PDF-specific extraction configuration.
112
127
  *
@@ -119,6 +134,8 @@ interface PdfConfig {
119
134
  passwords?: string[];
120
135
  /** Extract document metadata (title, author, creation date, etc.). Default: true. */
121
136
  extractMetadata?: boolean;
137
+ /** Hierarchy extraction configuration. */
138
+ hierarchy?: HierarchyConfig;
122
139
  }
123
140
  /**
124
141
  * Image extraction and processing configuration.
@@ -281,6 +298,22 @@ interface KeywordConfig {
281
298
  /** RAKE algorithm-specific parameters. Only used when algorithm is "rake". */
282
299
  rakeParams?: RakeParams;
283
300
  }
301
+ /**
302
+ * Extracted keyword with relevance metadata.
303
+ *
304
+ * Represents a single keyword extracted from text along with its relevance score,
305
+ * the algorithm that extracted it, and optional position information.
306
+ */
307
+ interface ExtractedKeyword {
308
+ /** The keyword text */
309
+ text: string;
310
+ /** Relevance score (higher is better, algorithm-specific range) */
311
+ score: number;
312
+ /** Algorithm that extracted this keyword */
313
+ algorithm: KeywordAlgorithm;
314
+ /** Optional positions where keyword appears in text (character offsets) */
315
+ positions?: number[];
316
+ }
284
317
  /**
285
318
  * Page tracking and extraction configuration.
286
319
  *
@@ -288,7 +321,7 @@ interface KeywordConfig {
288
321
  * Page range information in chunk metadata (first_page/last_page) is automatically
289
322
  * enabled when page boundaries are available and chunking is configured.
290
323
  */
291
- interface PageConfig {
324
+ interface PageExtractionConfig {
292
325
  /** Extract pages as separate array (ExtractionResult.pages) */
293
326
  extractPages?: boolean;
294
327
  /** Insert page markers in main content string */
@@ -328,7 +361,7 @@ interface ExtractionConfig {
328
361
  /** Keyword extraction configuration for extracting important phrases. */
329
362
  keywords?: KeywordConfig;
330
363
  /** Page tracking and extraction configuration for multi-page documents. */
331
- pages?: PageConfig;
364
+ pages?: PageExtractionConfig;
332
365
  /** Maximum number of concurrent extractions in batch operations. Default: 4. */
333
366
  maxConcurrentExtractions?: number;
334
367
  }
@@ -383,28 +416,50 @@ interface TextMetadata {
383
416
  links?: [string, string][] | null;
384
417
  codeBlocks?: [string, string][] | null;
385
418
  }
419
+ interface HeaderMetadata {
420
+ level: number;
421
+ text: string;
422
+ id?: string | null;
423
+ depth: number;
424
+ htmlOffset: number;
425
+ }
426
+ interface LinkMetadata {
427
+ href: string;
428
+ text: string;
429
+ title?: string | null;
430
+ linkType: "anchor" | "internal" | "external" | "email" | "phone" | "other";
431
+ rel: string[];
432
+ attributes: Record<string, string>;
433
+ }
434
+ interface HtmlImageMetadata {
435
+ src: string;
436
+ alt?: string | null;
437
+ title?: string | null;
438
+ dimensions?: [number, number] | null;
439
+ imageType: "data_uri" | "inline_svg" | "external" | "relative";
440
+ attributes: Record<string, string>;
441
+ }
442
+ interface StructuredData {
443
+ dataType: "json_ld" | "microdata" | "rdfa";
444
+ rawJson: string;
445
+ schemaType?: string | null;
446
+ }
386
447
  interface HtmlMetadata {
387
448
  title?: string | null;
388
449
  description?: string | null;
389
- keywords?: string | null;
450
+ keywords: string[];
390
451
  author?: string | null;
391
- canonical?: string | null;
452
+ canonicalUrl?: string | null;
392
453
  baseHref?: string | null;
393
- ogTitle?: string | null;
394
- ogDescription?: string | null;
395
- ogImage?: string | null;
396
- ogUrl?: string | null;
397
- ogType?: string | null;
398
- ogSiteName?: string | null;
399
- twitterCard?: string | null;
400
- twitterTitle?: string | null;
401
- twitterDescription?: string | null;
402
- twitterImage?: string | null;
403
- twitterSite?: string | null;
404
- twitterCreator?: string | null;
405
- linkAuthor?: string | null;
406
- linkLicense?: string | null;
407
- linkAlternate?: string | null;
454
+ language?: string | null;
455
+ textDirection?: "ltr" | "rtl" | "auto" | null;
456
+ openGraph: Record<string, string>;
457
+ twitterCard: Record<string, string>;
458
+ metaTags: Record<string, string>;
459
+ htmlHeaders: HeaderMetadata[];
460
+ htmlLinks: LinkMetadata[];
461
+ htmlImages: HtmlImageMetadata[];
462
+ structuredData: StructuredData[];
408
463
  }
409
464
  interface PdfMetadata {
410
465
  title?: string | null;
@@ -640,23 +695,17 @@ interface Metadata {
640
695
  headers?: string[] | null;
641
696
  links?: [string, string][] | null;
642
697
  code_blocks?: [string, string][] | null;
643
- canonical?: string | null;
698
+ canonical_url?: string | null;
644
699
  base_href?: string | null;
645
- og_title?: string | null;
646
- og_description?: string | null;
647
- og_image?: string | null;
648
- og_url?: string | null;
649
- og_type?: string | null;
650
- og_site_name?: string | null;
651
- twitter_card?: string | null;
652
- twitter_title?: string | null;
653
- twitter_description?: string | null;
654
- twitter_image?: string | null;
655
- twitter_site?: string | null;
656
- twitter_creator?: string | null;
657
- link_author?: string | null;
658
- link_license?: string | null;
659
- link_alternate?: string | null;
700
+ open_graph?: Record<string, string>;
701
+ twitter_card?: Record<string, string>;
702
+ meta_tags?: Record<string, string>;
703
+ html_language?: string | null;
704
+ text_direction?: "ltr" | "rtl" | "auto" | null;
705
+ html_headers?: HeaderMetadata[];
706
+ html_links?: LinkMetadata[];
707
+ html_images?: HtmlImageMetadata[];
708
+ structured_data?: StructuredData[];
660
709
  psm?: number;
661
710
  output_format?: string;
662
711
  table_count?: number;
@@ -695,6 +744,8 @@ interface ExtractionResult {
695
744
  images: ExtractedImage[] | null;
696
745
  /** Per-page content when page extraction is enabled, null otherwise. Each item contains page number, content, tables, and images. */
697
746
  pages?: PageContent[] | null;
747
+ /** Extracted keywords when keyword extraction is enabled, null otherwise */
748
+ keywords?: ExtractedKeyword[] | null;
698
749
  }
699
750
  /** Post-processor execution stage in the extraction pipeline. */
700
751
  type ProcessingStage = "early" | "middle" | "late";
@@ -972,5 +1023,59 @@ interface ErrorClassification {
972
1023
  */
973
1024
  confidence: number;
974
1025
  }
1026
+ /**
1027
+ * Opaque handle to a worker pool for concurrent extraction operations.
1028
+ *
1029
+ * Worker pools enable parallel processing of CPU-bound document extraction
1030
+ * tasks by distributing work across multiple threads. This is especially
1031
+ * useful for batch processing large numbers of documents.
1032
+ *
1033
+ * @example
1034
+ * ```typescript
1035
+ * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
1036
+ *
1037
+ * const pool = createWorkerPool(4); // 4 concurrent workers
1038
+ * try {
1039
+ * const result = await extractFileInWorker(pool, 'document.pdf');
1040
+ * console.log(result.content);
1041
+ * } finally {
1042
+ * await closeWorkerPool(pool);
1043
+ * }
1044
+ * ```
1045
+ */
1046
+ interface WorkerPool {
1047
+ /** Internal pool identifier (opaque) */
1048
+ readonly poolId: number;
1049
+ }
1050
+ /**
1051
+ * Worker pool statistics.
1052
+ *
1053
+ * Provides information about the current state of a worker pool including
1054
+ * pool size, number of active workers, and queued tasks.
1055
+ *
1056
+ * @example
1057
+ * ```typescript
1058
+ * import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node';
1059
+ *
1060
+ * const pool = createWorkerPool(4);
1061
+ * const stats = getWorkerPoolStats(pool);
1062
+ * console.log(`Active: ${stats.activeWorkers}/${stats.size}`);
1063
+ * console.log(`Queued: ${stats.queuedTasks}`);
1064
+ * ```
1065
+ */
1066
+ interface WorkerPoolStats {
1067
+ /**
1068
+ * Maximum number of concurrent workers in the pool.
1069
+ */
1070
+ size: number;
1071
+ /**
1072
+ * Number of currently active (executing) workers.
1073
+ */
1074
+ activeWorkers: number;
1075
+ /**
1076
+ * Number of tasks waiting in the queue.
1077
+ */
1078
+ queuedTasks: number;
1079
+ }
975
1080
 
976
- export type { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorClassification, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractionConfig, ExtractionResult, HtmlConversionOptions, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, Metadata, OcrBackendProtocol, OcrConfig, OcrMetadata, PageBoundary, PageConfig, PageContent, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, RakeParams, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, XmlMetadata, YakeParams };
1081
+ export type { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorClassification, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, ExtractionConfig, ExtractionResult, HeaderMetadata, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrBackendProtocol, OcrConfig, OcrMetadata, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, WorkerPool, WorkerPoolStats, XmlMetadata, YakeParams };
package/dist/types.d.ts CHANGED
@@ -107,6 +107,21 @@ interface TokenReductionConfig {
107
107
  /** Preserve tokens for semantically important words even in aggressive mode. Default: true. */
108
108
  preserveImportantWords?: boolean;
109
109
  }
110
+ /**
111
+ * Hierarchy extraction configuration.
112
+ *
113
+ * Controls document hierarchy detection based on font size clustering.
114
+ */
115
+ interface HierarchyConfig {
116
+ /** Enable hierarchy extraction. Default: true. */
117
+ enabled?: boolean;
118
+ /** Number of font size clusters (2-10). Default: 6. */
119
+ kClusters?: number;
120
+ /** Include bounding box information. Default: true. */
121
+ includeBbox?: boolean;
122
+ /** OCR coverage threshold (0.0-1.0). Default: null. */
123
+ ocrCoverageThreshold?: number | null;
124
+ }
110
125
  /**
111
126
  * PDF-specific extraction configuration.
112
127
  *
@@ -119,6 +134,8 @@ interface PdfConfig {
119
134
  passwords?: string[];
120
135
  /** Extract document metadata (title, author, creation date, etc.). Default: true. */
121
136
  extractMetadata?: boolean;
137
+ /** Hierarchy extraction configuration. */
138
+ hierarchy?: HierarchyConfig;
122
139
  }
123
140
  /**
124
141
  * Image extraction and processing configuration.
@@ -281,6 +298,22 @@ interface KeywordConfig {
281
298
  /** RAKE algorithm-specific parameters. Only used when algorithm is "rake". */
282
299
  rakeParams?: RakeParams;
283
300
  }
301
+ /**
302
+ * Extracted keyword with relevance metadata.
303
+ *
304
+ * Represents a single keyword extracted from text along with its relevance score,
305
+ * the algorithm that extracted it, and optional position information.
306
+ */
307
+ interface ExtractedKeyword {
308
+ /** The keyword text */
309
+ text: string;
310
+ /** Relevance score (higher is better, algorithm-specific range) */
311
+ score: number;
312
+ /** Algorithm that extracted this keyword */
313
+ algorithm: KeywordAlgorithm;
314
+ /** Optional positions where keyword appears in text (character offsets) */
315
+ positions?: number[];
316
+ }
284
317
  /**
285
318
  * Page tracking and extraction configuration.
286
319
  *
@@ -288,7 +321,7 @@ interface KeywordConfig {
288
321
  * Page range information in chunk metadata (first_page/last_page) is automatically
289
322
  * enabled when page boundaries are available and chunking is configured.
290
323
  */
291
- interface PageConfig {
324
+ interface PageExtractionConfig {
292
325
  /** Extract pages as separate array (ExtractionResult.pages) */
293
326
  extractPages?: boolean;
294
327
  /** Insert page markers in main content string */
@@ -328,7 +361,7 @@ interface ExtractionConfig {
328
361
  /** Keyword extraction configuration for extracting important phrases. */
329
362
  keywords?: KeywordConfig;
330
363
  /** Page tracking and extraction configuration for multi-page documents. */
331
- pages?: PageConfig;
364
+ pages?: PageExtractionConfig;
332
365
  /** Maximum number of concurrent extractions in batch operations. Default: 4. */
333
366
  maxConcurrentExtractions?: number;
334
367
  }
@@ -383,28 +416,50 @@ interface TextMetadata {
383
416
  links?: [string, string][] | null;
384
417
  codeBlocks?: [string, string][] | null;
385
418
  }
419
+ interface HeaderMetadata {
420
+ level: number;
421
+ text: string;
422
+ id?: string | null;
423
+ depth: number;
424
+ htmlOffset: number;
425
+ }
426
+ interface LinkMetadata {
427
+ href: string;
428
+ text: string;
429
+ title?: string | null;
430
+ linkType: "anchor" | "internal" | "external" | "email" | "phone" | "other";
431
+ rel: string[];
432
+ attributes: Record<string, string>;
433
+ }
434
+ interface HtmlImageMetadata {
435
+ src: string;
436
+ alt?: string | null;
437
+ title?: string | null;
438
+ dimensions?: [number, number] | null;
439
+ imageType: "data_uri" | "inline_svg" | "external" | "relative";
440
+ attributes: Record<string, string>;
441
+ }
442
+ interface StructuredData {
443
+ dataType: "json_ld" | "microdata" | "rdfa";
444
+ rawJson: string;
445
+ schemaType?: string | null;
446
+ }
386
447
  interface HtmlMetadata {
387
448
  title?: string | null;
388
449
  description?: string | null;
389
- keywords?: string | null;
450
+ keywords: string[];
390
451
  author?: string | null;
391
- canonical?: string | null;
452
+ canonicalUrl?: string | null;
392
453
  baseHref?: string | null;
393
- ogTitle?: string | null;
394
- ogDescription?: string | null;
395
- ogImage?: string | null;
396
- ogUrl?: string | null;
397
- ogType?: string | null;
398
- ogSiteName?: string | null;
399
- twitterCard?: string | null;
400
- twitterTitle?: string | null;
401
- twitterDescription?: string | null;
402
- twitterImage?: string | null;
403
- twitterSite?: string | null;
404
- twitterCreator?: string | null;
405
- linkAuthor?: string | null;
406
- linkLicense?: string | null;
407
- linkAlternate?: string | null;
454
+ language?: string | null;
455
+ textDirection?: "ltr" | "rtl" | "auto" | null;
456
+ openGraph: Record<string, string>;
457
+ twitterCard: Record<string, string>;
458
+ metaTags: Record<string, string>;
459
+ htmlHeaders: HeaderMetadata[];
460
+ htmlLinks: LinkMetadata[];
461
+ htmlImages: HtmlImageMetadata[];
462
+ structuredData: StructuredData[];
408
463
  }
409
464
  interface PdfMetadata {
410
465
  title?: string | null;
@@ -640,23 +695,17 @@ interface Metadata {
640
695
  headers?: string[] | null;
641
696
  links?: [string, string][] | null;
642
697
  code_blocks?: [string, string][] | null;
643
- canonical?: string | null;
698
+ canonical_url?: string | null;
644
699
  base_href?: string | null;
645
- og_title?: string | null;
646
- og_description?: string | null;
647
- og_image?: string | null;
648
- og_url?: string | null;
649
- og_type?: string | null;
650
- og_site_name?: string | null;
651
- twitter_card?: string | null;
652
- twitter_title?: string | null;
653
- twitter_description?: string | null;
654
- twitter_image?: string | null;
655
- twitter_site?: string | null;
656
- twitter_creator?: string | null;
657
- link_author?: string | null;
658
- link_license?: string | null;
659
- link_alternate?: string | null;
700
+ open_graph?: Record<string, string>;
701
+ twitter_card?: Record<string, string>;
702
+ meta_tags?: Record<string, string>;
703
+ html_language?: string | null;
704
+ text_direction?: "ltr" | "rtl" | "auto" | null;
705
+ html_headers?: HeaderMetadata[];
706
+ html_links?: LinkMetadata[];
707
+ html_images?: HtmlImageMetadata[];
708
+ structured_data?: StructuredData[];
660
709
  psm?: number;
661
710
  output_format?: string;
662
711
  table_count?: number;
@@ -695,6 +744,8 @@ interface ExtractionResult {
695
744
  images: ExtractedImage[] | null;
696
745
  /** Per-page content when page extraction is enabled, null otherwise. Each item contains page number, content, tables, and images. */
697
746
  pages?: PageContent[] | null;
747
+ /** Extracted keywords when keyword extraction is enabled, null otherwise */
748
+ keywords?: ExtractedKeyword[] | null;
698
749
  }
699
750
  /** Post-processor execution stage in the extraction pipeline. */
700
751
  type ProcessingStage = "early" | "middle" | "late";
@@ -972,5 +1023,59 @@ interface ErrorClassification {
972
1023
  */
973
1024
  confidence: number;
974
1025
  }
1026
+ /**
1027
+ * Opaque handle to a worker pool for concurrent extraction operations.
1028
+ *
1029
+ * Worker pools enable parallel processing of CPU-bound document extraction
1030
+ * tasks by distributing work across multiple threads. This is especially
1031
+ * useful for batch processing large numbers of documents.
1032
+ *
1033
+ * @example
1034
+ * ```typescript
1035
+ * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
1036
+ *
1037
+ * const pool = createWorkerPool(4); // 4 concurrent workers
1038
+ * try {
1039
+ * const result = await extractFileInWorker(pool, 'document.pdf');
1040
+ * console.log(result.content);
1041
+ * } finally {
1042
+ * await closeWorkerPool(pool);
1043
+ * }
1044
+ * ```
1045
+ */
1046
+ interface WorkerPool {
1047
+ /** Internal pool identifier (opaque) */
1048
+ readonly poolId: number;
1049
+ }
1050
+ /**
1051
+ * Worker pool statistics.
1052
+ *
1053
+ * Provides information about the current state of a worker pool including
1054
+ * pool size, number of active workers, and queued tasks.
1055
+ *
1056
+ * @example
1057
+ * ```typescript
1058
+ * import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node';
1059
+ *
1060
+ * const pool = createWorkerPool(4);
1061
+ * const stats = getWorkerPoolStats(pool);
1062
+ * console.log(`Active: ${stats.activeWorkers}/${stats.size}`);
1063
+ * console.log(`Queued: ${stats.queuedTasks}`);
1064
+ * ```
1065
+ */
1066
+ interface WorkerPoolStats {
1067
+ /**
1068
+ * Maximum number of concurrent workers in the pool.
1069
+ */
1070
+ size: number;
1071
+ /**
1072
+ * Number of currently active (executing) workers.
1073
+ */
1074
+ activeWorkers: number;
1075
+ /**
1076
+ * Number of tasks waiting in the queue.
1077
+ */
1078
+ queuedTasks: number;
1079
+ }
975
1080
 
976
- export type { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorClassification, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractionConfig, ExtractionResult, HtmlConversionOptions, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, Metadata, OcrBackendProtocol, OcrConfig, OcrMetadata, PageBoundary, PageConfig, PageContent, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, RakeParams, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, XmlMetadata, YakeParams };
1081
+ export type { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorClassification, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, ExtractionConfig, ExtractionResult, HeaderMetadata, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrBackendProtocol, OcrConfig, OcrMetadata, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, WorkerPool, WorkerPoolStats, XmlMetadata, YakeParams };