@kreuzberg/node 4.0.0-rc.21 → 4.0.0-rc.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +345 -534
- package/dist/cli.d.mts +4 -0
- package/dist/cli.d.ts +4 -0
- package/dist/cli.js +12 -2
- package/dist/cli.js.map +1 -1
- package/dist/cli.mjs +12 -1
- package/dist/cli.mjs.map +1 -1
- package/dist/index.d.mts +158 -91
- package/dist/index.d.ts +158 -91
- package/dist/index.js +77 -103
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +72 -103
- package/dist/index.mjs.map +1 -1
- package/dist/types.d.mts +141 -36
- package/dist/types.d.ts +141 -36
- package/dist/types.js.map +1 -1
- package/index.d.ts +183 -0
- package/index.js +64 -54
- package/metadata.d.ts +53 -33
- package/package.json +5 -6
package/dist/cli.d.mts
CHANGED
|
@@ -4,6 +4,10 @@
|
|
|
4
4
|
*
|
|
5
5
|
* This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.
|
|
6
6
|
*/
|
|
7
|
+
declare global {
|
|
8
|
+
var __filename: string | undefined;
|
|
9
|
+
var __dirname: string | undefined;
|
|
10
|
+
}
|
|
7
11
|
declare function main(argv: string[]): number;
|
|
8
12
|
|
|
9
13
|
export { main };
|
package/dist/cli.d.ts
CHANGED
|
@@ -4,6 +4,10 @@
|
|
|
4
4
|
*
|
|
5
5
|
* This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.
|
|
6
6
|
*/
|
|
7
|
+
declare global {
|
|
8
|
+
var __filename: string | undefined;
|
|
9
|
+
var __dirname: string | undefined;
|
|
10
|
+
}
|
|
7
11
|
declare function main(argv: string[]): number;
|
|
8
12
|
|
|
9
13
|
export { main };
|
package/dist/cli.js
CHANGED
|
@@ -37,7 +37,17 @@ var import_node_fs = require("node:fs");
|
|
|
37
37
|
var import_node_path = require("node:path");
|
|
38
38
|
var import_node_url = require("node:url");
|
|
39
39
|
var import_which = __toESM(require("which"));
|
|
40
|
-
|
|
40
|
+
function getDirectory() {
|
|
41
|
+
if (typeof __filename !== "undefined") {
|
|
42
|
+
return (0, import_node_path.dirname)(__filename);
|
|
43
|
+
}
|
|
44
|
+
try {
|
|
45
|
+
const url = eval("import.meta.url");
|
|
46
|
+
return (0, import_node_path.dirname)((0, import_node_url.fileURLToPath)(url));
|
|
47
|
+
} catch {
|
|
48
|
+
return process.cwd();
|
|
49
|
+
}
|
|
50
|
+
}
|
|
41
51
|
function main(argv) {
|
|
42
52
|
const args = argv.slice(2);
|
|
43
53
|
let cliPath;
|
|
@@ -46,7 +56,7 @@ function main(argv) {
|
|
|
46
56
|
} catch {
|
|
47
57
|
}
|
|
48
58
|
if (!cliPath) {
|
|
49
|
-
const __dirname =
|
|
59
|
+
const __dirname = getDirectory();
|
|
50
60
|
const devBinary = (0, import_node_path.join)(__dirname, "..", "..", "..", "target", "release", "kreuzberg");
|
|
51
61
|
if ((0, import_node_fs.existsSync)(devBinary)) {
|
|
52
62
|
cliPath = devBinary;
|
package/dist/cli.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../typescript/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/**\n * Proxy entry point that forwards to the Rust-based Kreuzberg CLI.\n *\n * This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.\n */\n\nimport { spawnSync } from \"node:child_process\";\nimport { existsSync } from \"node:fs\";\nimport { dirname, join } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport which from \"which\";\n\nfunction main(argv: string[]): number {\n\tconst args = argv.slice(2);\n\n\tlet cliPath: string | undefined;\n\ttry {\n\t\tcliPath = which.sync(\"kreuzberg-cli\");\n\t} catch {}\n\n\tif (!cliPath) {\n\t\tconst __dirname =
|
|
1
|
+
{"version":3,"sources":["../typescript/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/**\n * Proxy entry point that forwards to the Rust-based Kreuzberg CLI.\n *\n * This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.\n */\n\nimport { spawnSync } from \"node:child_process\";\nimport { existsSync } from \"node:fs\";\nimport { dirname, join } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport which from \"which\";\n\ndeclare global {\n\tvar __filename: string | undefined;\n\tvar __dirname: string | undefined;\n}\n\nfunction getDirectory(): string {\n\t// In CJS, __filename will be defined\n\tif (typeof __filename !== \"undefined\") {\n\t\treturn dirname(__filename);\n\t}\n\t// Fallback for ESM\n\ttry {\n\t\t// Use eval to avoid esbuild warnings about import.meta in CJS builds\n\t\t// @ts-ignore - import.meta is only available in ESM\n\t\tconst url = eval(\"import.meta.url\");\n\t\treturn dirname(fileURLToPath(url));\n\t} catch {\n\t\treturn process.cwd();\n\t}\n}\n\nfunction main(argv: string[]): number {\n\tconst args = argv.slice(2);\n\n\tlet cliPath: string | undefined;\n\ttry {\n\t\tcliPath = which.sync(\"kreuzberg-cli\");\n\t} catch {}\n\n\tif (!cliPath) {\n\t\tconst __dirname = getDirectory();\n\t\tconst devBinary = join(__dirname, \"..\", \"..\", \"..\", \"target\", \"release\", \"kreuzberg\");\n\t\tif (existsSync(devBinary)) {\n\t\t\tcliPath = devBinary;\n\t\t}\n\t}\n\n\tif (!cliPath) {\n\t\tconsole.error(\n\t\t\t\"The embedded Kreuzberg CLI binary could not be located. \" +\n\t\t\t\t\"This indicates a packaging issue; please open an issue at \" +\n\t\t\t\t\"https://github.com/kreuzberg-dev/kreuzberg/issues so we can investigate.\",\n\t\t);\n\t\treturn 1;\n\t}\n\n\tconst result = spawnSync(cliPath, args, {\n\t\tstdio: \"inherit\",\n\t\tshell: false,\n\t});\n\n\tif (result.error) {\n\t\tconsole.error(`Failed to execute kreuzberg-cli: ${result.error.message}`);\n\t\treturn 1;\n\t}\n\n\treturn result.status ?? 1;\n}\n\nif (require.main === module) {\n\tprocess.exit(main(process.argv));\n}\n\nexport { main };\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAQA,gCAA0B;AAC1B,qBAA2B;AAC3B,uBAA8B;AAC9B,sBAA8B;AAC9B,mBAAkB;AAOlB,SAAS,eAAuB;AAE/B,MAAI,OAAO,eAAe,aAAa;AACtC,eAAO,0BAAQ,UAAU;AAAA,EAC1B;AAEA,MAAI;AAGH,UAAM,MAAM,KAAK,iBAAiB;AAClC,eAAO,8BAAQ,+BAAc,GAAG,CAAC;AAAA,EAClC,QAAQ;AACP,WAAO,QAAQ,IAAI;AAAA,EACpB;AACD;AAEA,SAAS,KAAK,MAAwB;AACrC,QAAM,OAAO,KAAK,MAAM,CAAC;AAEzB,MAAI;AACJ,MAAI;AACH,cAAU,aAAAA,QAAM,KAAK,eAAe;AAAA,EACrC,QAAQ;AAAA,EAAC;AAET,MAAI,CAAC,SAAS;AACb,UAAM,YAAY,aAAa;AAC/B,UAAM,gBAAY,uBAAK,WAAW,MAAM,MAAM,MAAM,UAAU,WAAW,WAAW;AACpF,YAAI,2BAAW,SAAS,GAAG;AAC1B,gBAAU;AAAA,IACX;AAAA,EACD;AAEA,MAAI,CAAC,SAAS;AACb,YAAQ;AAAA,MACP;AAAA,IAGD;AACA,WAAO;AAAA,EACR;AAEA,QAAM,aAAS,qCAAU,SAAS,MAAM;AAAA,IACvC,OAAO;AAAA,IACP,OAAO;AAAA,EACR,CAAC;AAED,MAAI,OAAO,OAAO;AACjB,YAAQ,MAAM,oCAAoC,OAAO,MAAM,OAAO,EAAE;AACxE,WAAO;AAAA,EACR;AAEA,SAAO,OAAO,UAAU;AACzB;AAEA,IAAI,QAAQ,SAAS,QAAQ;AAC5B,UAAQ,KAAK,KAAK,QAAQ,IAAI,CAAC;AAChC;","names":["which"]}
|
package/dist/cli.mjs
CHANGED
|
@@ -4,6 +4,17 @@ import { existsSync } from "node:fs";
|
|
|
4
4
|
import { dirname, join } from "node:path";
|
|
5
5
|
import { fileURLToPath } from "node:url";
|
|
6
6
|
import which from "which";
|
|
7
|
+
function getDirectory() {
|
|
8
|
+
if (typeof __filename !== "undefined") {
|
|
9
|
+
return dirname(__filename);
|
|
10
|
+
}
|
|
11
|
+
try {
|
|
12
|
+
const url = eval("import.meta.url");
|
|
13
|
+
return dirname(fileURLToPath(url));
|
|
14
|
+
} catch {
|
|
15
|
+
return process.cwd();
|
|
16
|
+
}
|
|
17
|
+
}
|
|
7
18
|
function main(argv) {
|
|
8
19
|
const args = argv.slice(2);
|
|
9
20
|
let cliPath;
|
|
@@ -12,7 +23,7 @@ function main(argv) {
|
|
|
12
23
|
} catch {
|
|
13
24
|
}
|
|
14
25
|
if (!cliPath) {
|
|
15
|
-
const __dirname =
|
|
26
|
+
const __dirname = getDirectory();
|
|
16
27
|
const devBinary = join(__dirname, "..", "..", "..", "target", "release", "kreuzberg");
|
|
17
28
|
if (existsSync(devBinary)) {
|
|
18
29
|
cliPath = devBinary;
|
package/dist/cli.mjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../typescript/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/**\n * Proxy entry point that forwards to the Rust-based Kreuzberg CLI.\n *\n * This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.\n */\n\nimport { spawnSync } from \"node:child_process\";\nimport { existsSync } from \"node:fs\";\nimport { dirname, join } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport which from \"which\";\n\nfunction main(argv: string[]): number {\n\tconst args = argv.slice(2);\n\n\tlet cliPath: string | undefined;\n\ttry {\n\t\tcliPath = which.sync(\"kreuzberg-cli\");\n\t} catch {}\n\n\tif (!cliPath) {\n\t\tconst __dirname =
|
|
1
|
+
{"version":3,"sources":["../typescript/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/**\n * Proxy entry point that forwards to the Rust-based Kreuzberg CLI.\n *\n * This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.\n */\n\nimport { spawnSync } from \"node:child_process\";\nimport { existsSync } from \"node:fs\";\nimport { dirname, join } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport which from \"which\";\n\ndeclare global {\n\tvar __filename: string | undefined;\n\tvar __dirname: string | undefined;\n}\n\nfunction getDirectory(): string {\n\t// In CJS, __filename will be defined\n\tif (typeof __filename !== \"undefined\") {\n\t\treturn dirname(__filename);\n\t}\n\t// Fallback for ESM\n\ttry {\n\t\t// Use eval to avoid esbuild warnings about import.meta in CJS builds\n\t\t// @ts-ignore - import.meta is only available in ESM\n\t\tconst url = eval(\"import.meta.url\");\n\t\treturn dirname(fileURLToPath(url));\n\t} catch {\n\t\treturn process.cwd();\n\t}\n}\n\nfunction main(argv: string[]): number {\n\tconst args = argv.slice(2);\n\n\tlet cliPath: string | undefined;\n\ttry {\n\t\tcliPath = which.sync(\"kreuzberg-cli\");\n\t} catch {}\n\n\tif (!cliPath) {\n\t\tconst __dirname = getDirectory();\n\t\tconst devBinary = join(__dirname, \"..\", \"..\", \"..\", \"target\", \"release\", \"kreuzberg\");\n\t\tif (existsSync(devBinary)) {\n\t\t\tcliPath = devBinary;\n\t\t}\n\t}\n\n\tif (!cliPath) {\n\t\tconsole.error(\n\t\t\t\"The embedded Kreuzberg CLI binary could not be located. \" +\n\t\t\t\t\"This indicates a packaging issue; please open an issue at \" +\n\t\t\t\t\"https://github.com/kreuzberg-dev/kreuzberg/issues so we can investigate.\",\n\t\t);\n\t\treturn 1;\n\t}\n\n\tconst result = spawnSync(cliPath, args, {\n\t\tstdio: \"inherit\",\n\t\tshell: false,\n\t});\n\n\tif (result.error) {\n\t\tconsole.error(`Failed to execute kreuzberg-cli: ${result.error.message}`);\n\t\treturn 1;\n\t}\n\n\treturn result.status ?? 1;\n}\n\nif (require.main === module) {\n\tprocess.exit(main(process.argv));\n}\n\nexport { main };\n"],"mappings":";AAQA,SAAS,iBAAiB;AAC1B,SAAS,kBAAkB;AAC3B,SAAS,SAAS,YAAY;AAC9B,SAAS,qBAAqB;AAC9B,OAAO,WAAW;AAOlB,SAAS,eAAuB;AAE/B,MAAI,OAAO,eAAe,aAAa;AACtC,WAAO,QAAQ,UAAU;AAAA,EAC1B;AAEA,MAAI;AAGH,UAAM,MAAM,KAAK,iBAAiB;AAClC,WAAO,QAAQ,cAAc,GAAG,CAAC;AAAA,EAClC,QAAQ;AACP,WAAO,QAAQ,IAAI;AAAA,EACpB;AACD;AAEA,SAAS,KAAK,MAAwB;AACrC,QAAM,OAAO,KAAK,MAAM,CAAC;AAEzB,MAAI;AACJ,MAAI;AACH,cAAU,MAAM,KAAK,eAAe;AAAA,EACrC,QAAQ;AAAA,EAAC;AAET,MAAI,CAAC,SAAS;AACb,UAAM,YAAY,aAAa;AAC/B,UAAM,YAAY,KAAK,WAAW,MAAM,MAAM,MAAM,UAAU,WAAW,WAAW;AACpF,QAAI,WAAW,SAAS,GAAG;AAC1B,gBAAU;AAAA,IACX;AAAA,EACD;AAEA,MAAI,CAAC,SAAS;AACb,YAAQ;AAAA,MACP;AAAA,IAGD;AACA,WAAO;AAAA,EACR;AAEA,QAAM,SAAS,UAAU,SAAS,MAAM;AAAA,IACvC,OAAO;AAAA,IACP,OAAO;AAAA,EACR,CAAC;AAED,MAAI,OAAO,OAAO;AACjB,YAAQ,MAAM,oCAAoC,OAAO,MAAM,OAAO,EAAE;AACxE,WAAO;AAAA,EACR;AAEA,SAAO,OAAO,UAAU;AACzB;AAEA,IAAI,QAAQ,SAAS,QAAQ;AAC5B,UAAQ,KAAK,KAAK,QAAQ,IAAI,CAAC;AAChC;","names":[]}
|
package/dist/index.d.mts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { PanicContext } from './errors.mjs';
|
|
2
2
|
export { CacheError, ErrorCode, ImageProcessingError, KreuzbergError, MissingDependencyError, OcrError, ParsingError, PluginError, ValidationError } from './errors.mjs';
|
|
3
|
-
import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol,
|
|
4
|
-
export { ArchiveMetadata, Chunk, ChunkMetadata, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, HtmlConversionOptions, HtmlMetadata, HtmlPreprocessingOptions, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, Metadata, OcrMetadata, PageBoundary,
|
|
3
|
+
import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol, ErrorClassification, WorkerPool, WorkerPoolStats } from './types.mjs';
|
|
4
|
+
export { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, HeaderMetadata, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrConfig, OcrMetadata, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, XmlMetadata, YakeParams } from './types.mjs';
|
|
5
5
|
export { GutenOcrBackend } from './ocr/guten-ocr.mjs';
|
|
6
6
|
|
|
7
7
|
/**
|
|
@@ -610,72 +610,12 @@ declare function unregisterDocumentExtractor(name: string): void;
|
|
|
610
610
|
*/
|
|
611
611
|
declare function clearDocumentExtractors(): void;
|
|
612
612
|
/**
|
|
613
|
-
*
|
|
614
|
-
*
|
|
615
|
-
* Provides a convenient way to build extraction configurations using method chaining.
|
|
616
|
-
*
|
|
617
|
-
* @example
|
|
618
|
-
* ```typescript
|
|
619
|
-
* import { ExtractionConfig, extractFile } from '@kreuzberg/node';
|
|
620
|
-
*
|
|
621
|
-
* // Create with builder pattern
|
|
622
|
-
* const config = ExtractionConfig.default()
|
|
623
|
-
* .withChunking({ maxChars: 2048 })
|
|
624
|
-
* .withOcr({ backend: 'tesseract', language: 'eng' })
|
|
625
|
-
* .build();
|
|
626
|
-
*
|
|
627
|
-
* const result = await extractFile('document.pdf', null, config);
|
|
628
|
-
* ```
|
|
629
|
-
*/
|
|
630
|
-
declare class ExtractionConfigBuilder {
|
|
631
|
-
private config;
|
|
632
|
-
/**
|
|
633
|
-
* Create a new builder with default configuration.
|
|
634
|
-
*/
|
|
635
|
-
static default(): ExtractionConfigBuilder;
|
|
636
|
-
/**
|
|
637
|
-
* Set OCR configuration.
|
|
638
|
-
*/
|
|
639
|
-
withOcr(ocr: OcrConfig): ExtractionConfigBuilder;
|
|
640
|
-
/**
|
|
641
|
-
* Set chunking configuration.
|
|
642
|
-
*/
|
|
643
|
-
withChunking(chunking: ChunkingConfig): ExtractionConfigBuilder;
|
|
644
|
-
/**
|
|
645
|
-
* Set image extraction configuration.
|
|
646
|
-
*/
|
|
647
|
-
withImageExtraction(images: ImageExtractionConfig): ExtractionConfigBuilder;
|
|
648
|
-
/**
|
|
649
|
-
* Set PDF configuration.
|
|
650
|
-
*/
|
|
651
|
-
withPdf(pdf: PdfConfig): ExtractionConfigBuilder;
|
|
652
|
-
/**
|
|
653
|
-
* Set keyword extraction configuration.
|
|
654
|
-
*/
|
|
655
|
-
withKeywords(keywords: KeywordConfig): ExtractionConfigBuilder;
|
|
656
|
-
/**
|
|
657
|
-
* Set language detection configuration.
|
|
658
|
-
*/
|
|
659
|
-
withLanguageDetection(languageDetection: LanguageDetectionConfig): ExtractionConfigBuilder;
|
|
660
|
-
/**
|
|
661
|
-
* Set whether to enable metadata extraction.
|
|
662
|
-
*/
|
|
663
|
-
withMetadataExtraction(enabled: boolean): ExtractionConfigBuilder;
|
|
664
|
-
/**
|
|
665
|
-
* Set whether to enable quality mode.
|
|
666
|
-
*/
|
|
667
|
-
withQualityMode(enabled: boolean): ExtractionConfigBuilder;
|
|
668
|
-
/**
|
|
669
|
-
* Build and return the final ExtractionConfig object.
|
|
670
|
-
*/
|
|
671
|
-
build(): ExtractionConfig$1;
|
|
672
|
-
}
|
|
673
|
-
/**
|
|
674
|
-
* ExtractionConfig namespace with static methods for loading configuration from files
|
|
675
|
-
* and creating new configurations with the builder pattern.
|
|
613
|
+
* ExtractionConfig namespace with static methods for loading configuration from files.
|
|
676
614
|
*
|
|
677
615
|
* Provides factory methods to load extraction configuration from TOML, YAML, or JSON files,
|
|
678
|
-
* or to
|
|
616
|
+
* or to discover configuration files in the current directory tree.
|
|
617
|
+
*
|
|
618
|
+
* For creating configurations programmatically, use plain TypeScript objects instead:
|
|
679
619
|
*
|
|
680
620
|
* @example
|
|
681
621
|
* ```typescript
|
|
@@ -684,35 +624,17 @@ declare class ExtractionConfigBuilder {
|
|
|
684
624
|
* // Load configuration from file
|
|
685
625
|
* const config1 = ExtractionConfig.fromFile('config.toml');
|
|
686
626
|
*
|
|
687
|
-
* //
|
|
688
|
-
* const config2 =
|
|
689
|
-
*
|
|
690
|
-
*
|
|
627
|
+
* // Or create with plain object
|
|
628
|
+
* const config2 = {
|
|
629
|
+
* chunking: { maxChars: 2048 },
|
|
630
|
+
* ocr: { backend: 'tesseract', language: 'eng' }
|
|
631
|
+
* };
|
|
691
632
|
*
|
|
692
633
|
* // Use with extraction
|
|
693
634
|
* const result = await extractFile('document.pdf', null, config2);
|
|
694
635
|
* ```
|
|
695
636
|
*/
|
|
696
637
|
declare const ExtractionConfig: {
|
|
697
|
-
/**
|
|
698
|
-
* Create a default extraction configuration using the builder pattern.
|
|
699
|
-
*
|
|
700
|
-
* Returns a builder object that allows you to configure extraction settings
|
|
701
|
-
* using method chaining.
|
|
702
|
-
*
|
|
703
|
-
* @returns ExtractionConfigBuilder for chaining configuration calls
|
|
704
|
-
*
|
|
705
|
-
* @example
|
|
706
|
-
* ```typescript
|
|
707
|
-
* import { ExtractionConfig } from '@kreuzberg/node';
|
|
708
|
-
*
|
|
709
|
-
* const config = ExtractionConfig.default()
|
|
710
|
-
* .withChunking({ maxChars: 2048 })
|
|
711
|
-
* .withOcr({ backend: 'tesseract', language: 'eng' })
|
|
712
|
-
* .build();
|
|
713
|
-
* ```
|
|
714
|
-
*/
|
|
715
|
-
default(): ExtractionConfigBuilder;
|
|
716
638
|
/**
|
|
717
639
|
* Load extraction configuration from a file.
|
|
718
640
|
*
|
|
@@ -1060,6 +982,151 @@ declare function getErrorCodeDescription(code: number): string;
|
|
|
1060
982
|
* ```
|
|
1061
983
|
*/
|
|
1062
984
|
declare function classifyError(errorMessage: string): ErrorClassification;
|
|
1063
|
-
|
|
985
|
+
/**
|
|
986
|
+
* Create a worker pool for concurrent file extraction.
|
|
987
|
+
*
|
|
988
|
+
* The worker pool manages a set of background worker threads that can process
|
|
989
|
+
* extraction requests concurrently, improving throughput when handling multiple files.
|
|
990
|
+
*
|
|
991
|
+
* @param size - Optional number of worker threads (defaults to CPU count). Must be > 0
|
|
992
|
+
* @returns A WorkerPool instance to use with extraction functions
|
|
993
|
+
*
|
|
994
|
+
* @throws {Error} If size is invalid or pool creation fails
|
|
995
|
+
*
|
|
996
|
+
* @example
|
|
997
|
+
* ```typescript
|
|
998
|
+
* import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
|
|
999
|
+
*
|
|
1000
|
+
* // Create pool with 4 workers
|
|
1001
|
+
* const pool = createWorkerPool(4);
|
|
1002
|
+
*
|
|
1003
|
+
* try {
|
|
1004
|
+
* const result = await extractFileInWorker(pool, 'document.pdf');
|
|
1005
|
+
* console.log(result.content);
|
|
1006
|
+
* } finally {
|
|
1007
|
+
* // Always close the pool when done
|
|
1008
|
+
* await closeWorkerPool(pool);
|
|
1009
|
+
* }
|
|
1010
|
+
* ```
|
|
1011
|
+
*/
|
|
1012
|
+
declare function createWorkerPool(size?: number): WorkerPool;
|
|
1013
|
+
/**
|
|
1014
|
+
* Get statistics about a worker pool.
|
|
1015
|
+
*
|
|
1016
|
+
* Returns information about the pool's current state, including the number of active workers,
|
|
1017
|
+
* queued tasks, and total processed tasks.
|
|
1018
|
+
*
|
|
1019
|
+
* @param pool - The worker pool instance
|
|
1020
|
+
* @returns WorkerPoolStats with pool information
|
|
1021
|
+
*
|
|
1022
|
+
* @example
|
|
1023
|
+
* ```typescript
|
|
1024
|
+
* import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node';
|
|
1025
|
+
*
|
|
1026
|
+
* const pool = createWorkerPool(4);
|
|
1027
|
+
* const stats = getWorkerPoolStats(pool);
|
|
1028
|
+
*
|
|
1029
|
+
* console.log(`Pool size: ${stats.size}`);
|
|
1030
|
+
* console.log(`Active workers: ${stats.activeWorkers}`);
|
|
1031
|
+
* console.log(`Queued tasks: ${stats.queuedTasks}`);
|
|
1032
|
+
* ```
|
|
1033
|
+
*/
|
|
1034
|
+
declare function getWorkerPoolStats(pool: WorkerPool): WorkerPoolStats;
|
|
1035
|
+
/**
|
|
1036
|
+
* Extract content from a single file using a worker pool (asynchronous).
|
|
1037
|
+
*
|
|
1038
|
+
* Submits an extraction task to the worker pool. The task is executed by one of the
|
|
1039
|
+
* available workers in the background, allowing other tasks to be processed concurrently.
|
|
1040
|
+
*
|
|
1041
|
+
* @param pool - The worker pool instance
|
|
1042
|
+
* @param filePath - Path to the file to extract
|
|
1043
|
+
* @param mimeTypeOrConfig - Optional MIME type or extraction configuration
|
|
1044
|
+
* @param maybeConfig - Optional extraction configuration (if second param is MIME type)
|
|
1045
|
+
* @returns Promise<ExtractionResult> containing extracted content and metadata
|
|
1046
|
+
*
|
|
1047
|
+
* @throws {Error} If the file cannot be read or extraction fails
|
|
1048
|
+
*
|
|
1049
|
+
* @example
|
|
1050
|
+
* ```typescript
|
|
1051
|
+
* import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
|
|
1052
|
+
*
|
|
1053
|
+
* const pool = createWorkerPool(4);
|
|
1054
|
+
*
|
|
1055
|
+
* try {
|
|
1056
|
+
* const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
|
|
1057
|
+
* const results = await Promise.all(
|
|
1058
|
+
* files.map(f => extractFileInWorker(pool, f))
|
|
1059
|
+
* );
|
|
1060
|
+
*
|
|
1061
|
+
* results.forEach((r, i) => {
|
|
1062
|
+
* console.log(`${files[i]}: ${r.content.substring(0, 100)}...`);
|
|
1063
|
+
* });
|
|
1064
|
+
* } finally {
|
|
1065
|
+
* await closeWorkerPool(pool);
|
|
1066
|
+
* }
|
|
1067
|
+
* ```
|
|
1068
|
+
*/
|
|
1069
|
+
declare function extractFileInWorker(pool: WorkerPool, filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
|
|
1070
|
+
/**
|
|
1071
|
+
* Extract content from multiple files in parallel using a worker pool (asynchronous).
|
|
1072
|
+
*
|
|
1073
|
+
* Submits multiple extraction tasks to the worker pool for concurrent processing.
|
|
1074
|
+
* This is more efficient than using `extractFileInWorker` multiple times sequentially.
|
|
1075
|
+
*
|
|
1076
|
+
* @param pool - The worker pool instance
|
|
1077
|
+
* @param paths - Array of file paths to extract
|
|
1078
|
+
* @param config - Extraction configuration object (applies to all files)
|
|
1079
|
+
* @returns Promise<ExtractionResult[]> array of results (one per file, in same order)
|
|
1080
|
+
*
|
|
1081
|
+
* @throws {Error} If any file cannot be read or extraction fails
|
|
1082
|
+
*
|
|
1083
|
+
* @example
|
|
1084
|
+
* ```typescript
|
|
1085
|
+
* import { createWorkerPool, batchExtractFilesInWorker, closeWorkerPool } from '@kreuzberg/node';
|
|
1086
|
+
*
|
|
1087
|
+
* const pool = createWorkerPool(4);
|
|
1088
|
+
*
|
|
1089
|
+
* try {
|
|
1090
|
+
* const files = ['invoice1.pdf', 'invoice2.pdf', 'invoice3.pdf'];
|
|
1091
|
+
* const results = await batchExtractFilesInWorker(pool, files, {
|
|
1092
|
+
* ocr: { backend: 'tesseract', language: 'eng' }
|
|
1093
|
+
* });
|
|
1094
|
+
*
|
|
1095
|
+
* const total = results.reduce((sum, r) => sum + extractAmount(r.content), 0);
|
|
1096
|
+
* console.log(`Total: $${total}`);
|
|
1097
|
+
* } finally {
|
|
1098
|
+
* await closeWorkerPool(pool);
|
|
1099
|
+
* }
|
|
1100
|
+
* ```
|
|
1101
|
+
*/
|
|
1102
|
+
declare function batchExtractFilesInWorker(pool: WorkerPool, paths: string[], config?: ExtractionConfig$1 | null): Promise<ExtractionResult[]>;
|
|
1103
|
+
/**
|
|
1104
|
+
* Close a worker pool and shut down all worker threads.
|
|
1105
|
+
*
|
|
1106
|
+
* Should be called when the pool is no longer needed to clean up resources
|
|
1107
|
+
* and gracefully shut down worker threads. Any pending tasks will be cancelled.
|
|
1108
|
+
*
|
|
1109
|
+
* @param pool - The worker pool instance to close
|
|
1110
|
+
* @returns Promise that resolves when the pool is fully closed
|
|
1111
|
+
*
|
|
1112
|
+
* @throws {Error} If pool shutdown fails
|
|
1113
|
+
*
|
|
1114
|
+
* @example
|
|
1115
|
+
* ```typescript
|
|
1116
|
+
* import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
|
|
1117
|
+
*
|
|
1118
|
+
* const pool = createWorkerPool(4);
|
|
1119
|
+
*
|
|
1120
|
+
* try {
|
|
1121
|
+
* const result = await extractFileInWorker(pool, 'document.pdf');
|
|
1122
|
+
* console.log(result.content);
|
|
1123
|
+
* } finally {
|
|
1124
|
+
* // Clean up the pool
|
|
1125
|
+
* await closeWorkerPool(pool);
|
|
1126
|
+
* }
|
|
1127
|
+
* ```
|
|
1128
|
+
*/
|
|
1129
|
+
declare function closeWorkerPool(pool: WorkerPool): Promise<void>;
|
|
1130
|
+
declare const __version__ = "4.0.0-rc.24";
|
|
1064
1131
|
|
|
1065
|
-
export {
|
|
1132
|
+
export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
|