@kreuzberg/node 4.0.0-rc.21 → 4.0.0-rc.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +345 -534
- package/dist/cli.d.mts +4 -0
- package/dist/cli.d.ts +4 -0
- package/dist/cli.js +12 -2
- package/dist/cli.js.map +1 -1
- package/dist/cli.mjs +12 -1
- package/dist/cli.mjs.map +1 -1
- package/dist/index.d.mts +158 -91
- package/dist/index.d.ts +158 -91
- package/dist/index.js +77 -103
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +72 -103
- package/dist/index.mjs.map +1 -1
- package/dist/types.d.mts +141 -36
- package/dist/types.d.ts +141 -36
- package/dist/types.js.map +1 -1
- package/index.d.ts +183 -0
- package/index.js +64 -54
- package/metadata.d.ts +53 -33
- package/package.json +5 -6
package/dist/types.d.mts
CHANGED
|
@@ -107,6 +107,21 @@ interface TokenReductionConfig {
|
|
|
107
107
|
/** Preserve tokens for semantically important words even in aggressive mode. Default: true. */
|
|
108
108
|
preserveImportantWords?: boolean;
|
|
109
109
|
}
|
|
110
|
+
/**
|
|
111
|
+
* Hierarchy extraction configuration.
|
|
112
|
+
*
|
|
113
|
+
* Controls document hierarchy detection based on font size clustering.
|
|
114
|
+
*/
|
|
115
|
+
interface HierarchyConfig {
|
|
116
|
+
/** Enable hierarchy extraction. Default: true. */
|
|
117
|
+
enabled?: boolean;
|
|
118
|
+
/** Number of font size clusters (2-10). Default: 6. */
|
|
119
|
+
kClusters?: number;
|
|
120
|
+
/** Include bounding box information. Default: true. */
|
|
121
|
+
includeBbox?: boolean;
|
|
122
|
+
/** OCR coverage threshold (0.0-1.0). Default: null. */
|
|
123
|
+
ocrCoverageThreshold?: number | null;
|
|
124
|
+
}
|
|
110
125
|
/**
|
|
111
126
|
* PDF-specific extraction configuration.
|
|
112
127
|
*
|
|
@@ -119,6 +134,8 @@ interface PdfConfig {
|
|
|
119
134
|
passwords?: string[];
|
|
120
135
|
/** Extract document metadata (title, author, creation date, etc.). Default: true. */
|
|
121
136
|
extractMetadata?: boolean;
|
|
137
|
+
/** Hierarchy extraction configuration. */
|
|
138
|
+
hierarchy?: HierarchyConfig;
|
|
122
139
|
}
|
|
123
140
|
/**
|
|
124
141
|
* Image extraction and processing configuration.
|
|
@@ -281,6 +298,22 @@ interface KeywordConfig {
|
|
|
281
298
|
/** RAKE algorithm-specific parameters. Only used when algorithm is "rake". */
|
|
282
299
|
rakeParams?: RakeParams;
|
|
283
300
|
}
|
|
301
|
+
/**
|
|
302
|
+
* Extracted keyword with relevance metadata.
|
|
303
|
+
*
|
|
304
|
+
* Represents a single keyword extracted from text along with its relevance score,
|
|
305
|
+
* the algorithm that extracted it, and optional position information.
|
|
306
|
+
*/
|
|
307
|
+
interface ExtractedKeyword {
|
|
308
|
+
/** The keyword text */
|
|
309
|
+
text: string;
|
|
310
|
+
/** Relevance score (higher is better, algorithm-specific range) */
|
|
311
|
+
score: number;
|
|
312
|
+
/** Algorithm that extracted this keyword */
|
|
313
|
+
algorithm: KeywordAlgorithm;
|
|
314
|
+
/** Optional positions where keyword appears in text (character offsets) */
|
|
315
|
+
positions?: number[];
|
|
316
|
+
}
|
|
284
317
|
/**
|
|
285
318
|
* Page tracking and extraction configuration.
|
|
286
319
|
*
|
|
@@ -288,7 +321,7 @@ interface KeywordConfig {
|
|
|
288
321
|
* Page range information in chunk metadata (first_page/last_page) is automatically
|
|
289
322
|
* enabled when page boundaries are available and chunking is configured.
|
|
290
323
|
*/
|
|
291
|
-
interface
|
|
324
|
+
interface PageExtractionConfig {
|
|
292
325
|
/** Extract pages as separate array (ExtractionResult.pages) */
|
|
293
326
|
extractPages?: boolean;
|
|
294
327
|
/** Insert page markers in main content string */
|
|
@@ -328,7 +361,7 @@ interface ExtractionConfig {
|
|
|
328
361
|
/** Keyword extraction configuration for extracting important phrases. */
|
|
329
362
|
keywords?: KeywordConfig;
|
|
330
363
|
/** Page tracking and extraction configuration for multi-page documents. */
|
|
331
|
-
pages?:
|
|
364
|
+
pages?: PageExtractionConfig;
|
|
332
365
|
/** Maximum number of concurrent extractions in batch operations. Default: 4. */
|
|
333
366
|
maxConcurrentExtractions?: number;
|
|
334
367
|
}
|
|
@@ -383,28 +416,50 @@ interface TextMetadata {
|
|
|
383
416
|
links?: [string, string][] | null;
|
|
384
417
|
codeBlocks?: [string, string][] | null;
|
|
385
418
|
}
|
|
419
|
+
interface HeaderMetadata {
|
|
420
|
+
level: number;
|
|
421
|
+
text: string;
|
|
422
|
+
id?: string | null;
|
|
423
|
+
depth: number;
|
|
424
|
+
htmlOffset: number;
|
|
425
|
+
}
|
|
426
|
+
interface LinkMetadata {
|
|
427
|
+
href: string;
|
|
428
|
+
text: string;
|
|
429
|
+
title?: string | null;
|
|
430
|
+
linkType: "anchor" | "internal" | "external" | "email" | "phone" | "other";
|
|
431
|
+
rel: string[];
|
|
432
|
+
attributes: Record<string, string>;
|
|
433
|
+
}
|
|
434
|
+
interface HtmlImageMetadata {
|
|
435
|
+
src: string;
|
|
436
|
+
alt?: string | null;
|
|
437
|
+
title?: string | null;
|
|
438
|
+
dimensions?: [number, number] | null;
|
|
439
|
+
imageType: "data_uri" | "inline_svg" | "external" | "relative";
|
|
440
|
+
attributes: Record<string, string>;
|
|
441
|
+
}
|
|
442
|
+
interface StructuredData {
|
|
443
|
+
dataType: "json_ld" | "microdata" | "rdfa";
|
|
444
|
+
rawJson: string;
|
|
445
|
+
schemaType?: string | null;
|
|
446
|
+
}
|
|
386
447
|
interface HtmlMetadata {
|
|
387
448
|
title?: string | null;
|
|
388
449
|
description?: string | null;
|
|
389
|
-
keywords
|
|
450
|
+
keywords: string[];
|
|
390
451
|
author?: string | null;
|
|
391
|
-
|
|
452
|
+
canonicalUrl?: string | null;
|
|
392
453
|
baseHref?: string | null;
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
twitterImage?: string | null;
|
|
403
|
-
twitterSite?: string | null;
|
|
404
|
-
twitterCreator?: string | null;
|
|
405
|
-
linkAuthor?: string | null;
|
|
406
|
-
linkLicense?: string | null;
|
|
407
|
-
linkAlternate?: string | null;
|
|
454
|
+
language?: string | null;
|
|
455
|
+
textDirection?: "ltr" | "rtl" | "auto" | null;
|
|
456
|
+
openGraph: Record<string, string>;
|
|
457
|
+
twitterCard: Record<string, string>;
|
|
458
|
+
metaTags: Record<string, string>;
|
|
459
|
+
htmlHeaders: HeaderMetadata[];
|
|
460
|
+
htmlLinks: LinkMetadata[];
|
|
461
|
+
htmlImages: HtmlImageMetadata[];
|
|
462
|
+
structuredData: StructuredData[];
|
|
408
463
|
}
|
|
409
464
|
interface PdfMetadata {
|
|
410
465
|
title?: string | null;
|
|
@@ -640,23 +695,17 @@ interface Metadata {
|
|
|
640
695
|
headers?: string[] | null;
|
|
641
696
|
links?: [string, string][] | null;
|
|
642
697
|
code_blocks?: [string, string][] | null;
|
|
643
|
-
|
|
698
|
+
canonical_url?: string | null;
|
|
644
699
|
base_href?: string | null;
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
twitter_image?: string | null;
|
|
655
|
-
twitter_site?: string | null;
|
|
656
|
-
twitter_creator?: string | null;
|
|
657
|
-
link_author?: string | null;
|
|
658
|
-
link_license?: string | null;
|
|
659
|
-
link_alternate?: string | null;
|
|
700
|
+
open_graph?: Record<string, string>;
|
|
701
|
+
twitter_card?: Record<string, string>;
|
|
702
|
+
meta_tags?: Record<string, string>;
|
|
703
|
+
html_language?: string | null;
|
|
704
|
+
text_direction?: "ltr" | "rtl" | "auto" | null;
|
|
705
|
+
html_headers?: HeaderMetadata[];
|
|
706
|
+
html_links?: LinkMetadata[];
|
|
707
|
+
html_images?: HtmlImageMetadata[];
|
|
708
|
+
structured_data?: StructuredData[];
|
|
660
709
|
psm?: number;
|
|
661
710
|
output_format?: string;
|
|
662
711
|
table_count?: number;
|
|
@@ -695,6 +744,8 @@ interface ExtractionResult {
|
|
|
695
744
|
images: ExtractedImage[] | null;
|
|
696
745
|
/** Per-page content when page extraction is enabled, null otherwise. Each item contains page number, content, tables, and images. */
|
|
697
746
|
pages?: PageContent[] | null;
|
|
747
|
+
/** Extracted keywords when keyword extraction is enabled, null otherwise */
|
|
748
|
+
keywords?: ExtractedKeyword[] | null;
|
|
698
749
|
}
|
|
699
750
|
/** Post-processor execution stage in the extraction pipeline. */
|
|
700
751
|
type ProcessingStage = "early" | "middle" | "late";
|
|
@@ -972,5 +1023,59 @@ interface ErrorClassification {
|
|
|
972
1023
|
*/
|
|
973
1024
|
confidence: number;
|
|
974
1025
|
}
|
|
1026
|
+
/**
|
|
1027
|
+
* Opaque handle to a worker pool for concurrent extraction operations.
|
|
1028
|
+
*
|
|
1029
|
+
* Worker pools enable parallel processing of CPU-bound document extraction
|
|
1030
|
+
* tasks by distributing work across multiple threads. This is especially
|
|
1031
|
+
* useful for batch processing large numbers of documents.
|
|
1032
|
+
*
|
|
1033
|
+
* @example
|
|
1034
|
+
* ```typescript
|
|
1035
|
+
* import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
|
|
1036
|
+
*
|
|
1037
|
+
* const pool = createWorkerPool(4); // 4 concurrent workers
|
|
1038
|
+
* try {
|
|
1039
|
+
* const result = await extractFileInWorker(pool, 'document.pdf');
|
|
1040
|
+
* console.log(result.content);
|
|
1041
|
+
* } finally {
|
|
1042
|
+
* await closeWorkerPool(pool);
|
|
1043
|
+
* }
|
|
1044
|
+
* ```
|
|
1045
|
+
*/
|
|
1046
|
+
interface WorkerPool {
|
|
1047
|
+
/** Internal pool identifier (opaque) */
|
|
1048
|
+
readonly poolId: number;
|
|
1049
|
+
}
|
|
1050
|
+
/**
|
|
1051
|
+
* Worker pool statistics.
|
|
1052
|
+
*
|
|
1053
|
+
* Provides information about the current state of a worker pool including
|
|
1054
|
+
* pool size, number of active workers, and queued tasks.
|
|
1055
|
+
*
|
|
1056
|
+
* @example
|
|
1057
|
+
* ```typescript
|
|
1058
|
+
* import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node';
|
|
1059
|
+
*
|
|
1060
|
+
* const pool = createWorkerPool(4);
|
|
1061
|
+
* const stats = getWorkerPoolStats(pool);
|
|
1062
|
+
* console.log(`Active: ${stats.activeWorkers}/${stats.size}`);
|
|
1063
|
+
* console.log(`Queued: ${stats.queuedTasks}`);
|
|
1064
|
+
* ```
|
|
1065
|
+
*/
|
|
1066
|
+
interface WorkerPoolStats {
|
|
1067
|
+
/**
|
|
1068
|
+
* Maximum number of concurrent workers in the pool.
|
|
1069
|
+
*/
|
|
1070
|
+
size: number;
|
|
1071
|
+
/**
|
|
1072
|
+
* Number of currently active (executing) workers.
|
|
1073
|
+
*/
|
|
1074
|
+
activeWorkers: number;
|
|
1075
|
+
/**
|
|
1076
|
+
* Number of tasks waiting in the queue.
|
|
1077
|
+
*/
|
|
1078
|
+
queuedTasks: number;
|
|
1079
|
+
}
|
|
975
1080
|
|
|
976
|
-
export type { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorClassification, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractionConfig, ExtractionResult, HtmlConversionOptions, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, Metadata, OcrBackendProtocol, OcrConfig, OcrMetadata, PageBoundary,
|
|
1081
|
+
export type { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorClassification, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, ExtractionConfig, ExtractionResult, HeaderMetadata, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrBackendProtocol, OcrConfig, OcrMetadata, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, WorkerPool, WorkerPoolStats, XmlMetadata, YakeParams };
|
package/dist/types.d.ts
CHANGED
|
@@ -107,6 +107,21 @@ interface TokenReductionConfig {
|
|
|
107
107
|
/** Preserve tokens for semantically important words even in aggressive mode. Default: true. */
|
|
108
108
|
preserveImportantWords?: boolean;
|
|
109
109
|
}
|
|
110
|
+
/**
|
|
111
|
+
* Hierarchy extraction configuration.
|
|
112
|
+
*
|
|
113
|
+
* Controls document hierarchy detection based on font size clustering.
|
|
114
|
+
*/
|
|
115
|
+
interface HierarchyConfig {
|
|
116
|
+
/** Enable hierarchy extraction. Default: true. */
|
|
117
|
+
enabled?: boolean;
|
|
118
|
+
/** Number of font size clusters (2-10). Default: 6. */
|
|
119
|
+
kClusters?: number;
|
|
120
|
+
/** Include bounding box information. Default: true. */
|
|
121
|
+
includeBbox?: boolean;
|
|
122
|
+
/** OCR coverage threshold (0.0-1.0). Default: null. */
|
|
123
|
+
ocrCoverageThreshold?: number | null;
|
|
124
|
+
}
|
|
110
125
|
/**
|
|
111
126
|
* PDF-specific extraction configuration.
|
|
112
127
|
*
|
|
@@ -119,6 +134,8 @@ interface PdfConfig {
|
|
|
119
134
|
passwords?: string[];
|
|
120
135
|
/** Extract document metadata (title, author, creation date, etc.). Default: true. */
|
|
121
136
|
extractMetadata?: boolean;
|
|
137
|
+
/** Hierarchy extraction configuration. */
|
|
138
|
+
hierarchy?: HierarchyConfig;
|
|
122
139
|
}
|
|
123
140
|
/**
|
|
124
141
|
* Image extraction and processing configuration.
|
|
@@ -281,6 +298,22 @@ interface KeywordConfig {
|
|
|
281
298
|
/** RAKE algorithm-specific parameters. Only used when algorithm is "rake". */
|
|
282
299
|
rakeParams?: RakeParams;
|
|
283
300
|
}
|
|
301
|
+
/**
|
|
302
|
+
* Extracted keyword with relevance metadata.
|
|
303
|
+
*
|
|
304
|
+
* Represents a single keyword extracted from text along with its relevance score,
|
|
305
|
+
* the algorithm that extracted it, and optional position information.
|
|
306
|
+
*/
|
|
307
|
+
interface ExtractedKeyword {
|
|
308
|
+
/** The keyword text */
|
|
309
|
+
text: string;
|
|
310
|
+
/** Relevance score (higher is better, algorithm-specific range) */
|
|
311
|
+
score: number;
|
|
312
|
+
/** Algorithm that extracted this keyword */
|
|
313
|
+
algorithm: KeywordAlgorithm;
|
|
314
|
+
/** Optional positions where keyword appears in text (character offsets) */
|
|
315
|
+
positions?: number[];
|
|
316
|
+
}
|
|
284
317
|
/**
|
|
285
318
|
* Page tracking and extraction configuration.
|
|
286
319
|
*
|
|
@@ -288,7 +321,7 @@ interface KeywordConfig {
|
|
|
288
321
|
* Page range information in chunk metadata (first_page/last_page) is automatically
|
|
289
322
|
* enabled when page boundaries are available and chunking is configured.
|
|
290
323
|
*/
|
|
291
|
-
interface
|
|
324
|
+
interface PageExtractionConfig {
|
|
292
325
|
/** Extract pages as separate array (ExtractionResult.pages) */
|
|
293
326
|
extractPages?: boolean;
|
|
294
327
|
/** Insert page markers in main content string */
|
|
@@ -328,7 +361,7 @@ interface ExtractionConfig {
|
|
|
328
361
|
/** Keyword extraction configuration for extracting important phrases. */
|
|
329
362
|
keywords?: KeywordConfig;
|
|
330
363
|
/** Page tracking and extraction configuration for multi-page documents. */
|
|
331
|
-
pages?:
|
|
364
|
+
pages?: PageExtractionConfig;
|
|
332
365
|
/** Maximum number of concurrent extractions in batch operations. Default: 4. */
|
|
333
366
|
maxConcurrentExtractions?: number;
|
|
334
367
|
}
|
|
@@ -383,28 +416,50 @@ interface TextMetadata {
|
|
|
383
416
|
links?: [string, string][] | null;
|
|
384
417
|
codeBlocks?: [string, string][] | null;
|
|
385
418
|
}
|
|
419
|
+
interface HeaderMetadata {
|
|
420
|
+
level: number;
|
|
421
|
+
text: string;
|
|
422
|
+
id?: string | null;
|
|
423
|
+
depth: number;
|
|
424
|
+
htmlOffset: number;
|
|
425
|
+
}
|
|
426
|
+
interface LinkMetadata {
|
|
427
|
+
href: string;
|
|
428
|
+
text: string;
|
|
429
|
+
title?: string | null;
|
|
430
|
+
linkType: "anchor" | "internal" | "external" | "email" | "phone" | "other";
|
|
431
|
+
rel: string[];
|
|
432
|
+
attributes: Record<string, string>;
|
|
433
|
+
}
|
|
434
|
+
interface HtmlImageMetadata {
|
|
435
|
+
src: string;
|
|
436
|
+
alt?: string | null;
|
|
437
|
+
title?: string | null;
|
|
438
|
+
dimensions?: [number, number] | null;
|
|
439
|
+
imageType: "data_uri" | "inline_svg" | "external" | "relative";
|
|
440
|
+
attributes: Record<string, string>;
|
|
441
|
+
}
|
|
442
|
+
interface StructuredData {
|
|
443
|
+
dataType: "json_ld" | "microdata" | "rdfa";
|
|
444
|
+
rawJson: string;
|
|
445
|
+
schemaType?: string | null;
|
|
446
|
+
}
|
|
386
447
|
interface HtmlMetadata {
|
|
387
448
|
title?: string | null;
|
|
388
449
|
description?: string | null;
|
|
389
|
-
keywords
|
|
450
|
+
keywords: string[];
|
|
390
451
|
author?: string | null;
|
|
391
|
-
|
|
452
|
+
canonicalUrl?: string | null;
|
|
392
453
|
baseHref?: string | null;
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
twitterImage?: string | null;
|
|
403
|
-
twitterSite?: string | null;
|
|
404
|
-
twitterCreator?: string | null;
|
|
405
|
-
linkAuthor?: string | null;
|
|
406
|
-
linkLicense?: string | null;
|
|
407
|
-
linkAlternate?: string | null;
|
|
454
|
+
language?: string | null;
|
|
455
|
+
textDirection?: "ltr" | "rtl" | "auto" | null;
|
|
456
|
+
openGraph: Record<string, string>;
|
|
457
|
+
twitterCard: Record<string, string>;
|
|
458
|
+
metaTags: Record<string, string>;
|
|
459
|
+
htmlHeaders: HeaderMetadata[];
|
|
460
|
+
htmlLinks: LinkMetadata[];
|
|
461
|
+
htmlImages: HtmlImageMetadata[];
|
|
462
|
+
structuredData: StructuredData[];
|
|
408
463
|
}
|
|
409
464
|
interface PdfMetadata {
|
|
410
465
|
title?: string | null;
|
|
@@ -640,23 +695,17 @@ interface Metadata {
|
|
|
640
695
|
headers?: string[] | null;
|
|
641
696
|
links?: [string, string][] | null;
|
|
642
697
|
code_blocks?: [string, string][] | null;
|
|
643
|
-
|
|
698
|
+
canonical_url?: string | null;
|
|
644
699
|
base_href?: string | null;
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
twitter_image?: string | null;
|
|
655
|
-
twitter_site?: string | null;
|
|
656
|
-
twitter_creator?: string | null;
|
|
657
|
-
link_author?: string | null;
|
|
658
|
-
link_license?: string | null;
|
|
659
|
-
link_alternate?: string | null;
|
|
700
|
+
open_graph?: Record<string, string>;
|
|
701
|
+
twitter_card?: Record<string, string>;
|
|
702
|
+
meta_tags?: Record<string, string>;
|
|
703
|
+
html_language?: string | null;
|
|
704
|
+
text_direction?: "ltr" | "rtl" | "auto" | null;
|
|
705
|
+
html_headers?: HeaderMetadata[];
|
|
706
|
+
html_links?: LinkMetadata[];
|
|
707
|
+
html_images?: HtmlImageMetadata[];
|
|
708
|
+
structured_data?: StructuredData[];
|
|
660
709
|
psm?: number;
|
|
661
710
|
output_format?: string;
|
|
662
711
|
table_count?: number;
|
|
@@ -695,6 +744,8 @@ interface ExtractionResult {
|
|
|
695
744
|
images: ExtractedImage[] | null;
|
|
696
745
|
/** Per-page content when page extraction is enabled, null otherwise. Each item contains page number, content, tables, and images. */
|
|
697
746
|
pages?: PageContent[] | null;
|
|
747
|
+
/** Extracted keywords when keyword extraction is enabled, null otherwise */
|
|
748
|
+
keywords?: ExtractedKeyword[] | null;
|
|
698
749
|
}
|
|
699
750
|
/** Post-processor execution stage in the extraction pipeline. */
|
|
700
751
|
type ProcessingStage = "early" | "middle" | "late";
|
|
@@ -972,5 +1023,59 @@ interface ErrorClassification {
|
|
|
972
1023
|
*/
|
|
973
1024
|
confidence: number;
|
|
974
1025
|
}
|
|
1026
|
+
/**
|
|
1027
|
+
* Opaque handle to a worker pool for concurrent extraction operations.
|
|
1028
|
+
*
|
|
1029
|
+
* Worker pools enable parallel processing of CPU-bound document extraction
|
|
1030
|
+
* tasks by distributing work across multiple threads. This is especially
|
|
1031
|
+
* useful for batch processing large numbers of documents.
|
|
1032
|
+
*
|
|
1033
|
+
* @example
|
|
1034
|
+
* ```typescript
|
|
1035
|
+
* import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
|
|
1036
|
+
*
|
|
1037
|
+
* const pool = createWorkerPool(4); // 4 concurrent workers
|
|
1038
|
+
* try {
|
|
1039
|
+
* const result = await extractFileInWorker(pool, 'document.pdf');
|
|
1040
|
+
* console.log(result.content);
|
|
1041
|
+
* } finally {
|
|
1042
|
+
* await closeWorkerPool(pool);
|
|
1043
|
+
* }
|
|
1044
|
+
* ```
|
|
1045
|
+
*/
|
|
1046
|
+
interface WorkerPool {
|
|
1047
|
+
/** Internal pool identifier (opaque) */
|
|
1048
|
+
readonly poolId: number;
|
|
1049
|
+
}
|
|
1050
|
+
/**
|
|
1051
|
+
* Worker pool statistics.
|
|
1052
|
+
*
|
|
1053
|
+
* Provides information about the current state of a worker pool including
|
|
1054
|
+
* pool size, number of active workers, and queued tasks.
|
|
1055
|
+
*
|
|
1056
|
+
* @example
|
|
1057
|
+
* ```typescript
|
|
1058
|
+
* import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node';
|
|
1059
|
+
*
|
|
1060
|
+
* const pool = createWorkerPool(4);
|
|
1061
|
+
* const stats = getWorkerPoolStats(pool);
|
|
1062
|
+
* console.log(`Active: ${stats.activeWorkers}/${stats.size}`);
|
|
1063
|
+
* console.log(`Queued: ${stats.queuedTasks}`);
|
|
1064
|
+
* ```
|
|
1065
|
+
*/
|
|
1066
|
+
interface WorkerPoolStats {
|
|
1067
|
+
/**
|
|
1068
|
+
* Maximum number of concurrent workers in the pool.
|
|
1069
|
+
*/
|
|
1070
|
+
size: number;
|
|
1071
|
+
/**
|
|
1072
|
+
* Number of currently active (executing) workers.
|
|
1073
|
+
*/
|
|
1074
|
+
activeWorkers: number;
|
|
1075
|
+
/**
|
|
1076
|
+
* Number of tasks waiting in the queue.
|
|
1077
|
+
*/
|
|
1078
|
+
queuedTasks: number;
|
|
1079
|
+
}
|
|
975
1080
|
|
|
976
|
-
export type { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorClassification, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractionConfig, ExtractionResult, HtmlConversionOptions, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, Metadata, OcrBackendProtocol, OcrConfig, OcrMetadata, PageBoundary,
|
|
1081
|
+
export type { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorClassification, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, ExtractionConfig, ExtractionResult, HeaderMetadata, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrBackendProtocol, OcrConfig, OcrMetadata, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PostProcessorProtocol, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, ValidatorProtocol, WorkerPool, WorkerPoolStats, XmlMetadata, YakeParams };
|