@kreuzberg/node 4.0.0-rc.21 → 4.0.0-rc.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.d.mts CHANGED
@@ -4,6 +4,10 @@
4
4
  *
5
5
  * This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.
6
6
  */
7
+ declare global {
8
+ var __filename: string | undefined;
9
+ var __dirname: string | undefined;
10
+ }
7
11
  declare function main(argv: string[]): number;
8
12
 
9
13
  export { main };
package/dist/cli.d.ts CHANGED
@@ -4,6 +4,10 @@
4
4
  *
5
5
  * This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.
6
6
  */
7
+ declare global {
8
+ var __filename: string | undefined;
9
+ var __dirname: string | undefined;
10
+ }
7
11
  declare function main(argv: string[]): number;
8
12
 
9
13
  export { main };
package/dist/cli.js CHANGED
@@ -37,7 +37,17 @@ var import_node_fs = require("node:fs");
37
37
  var import_node_path = require("node:path");
38
38
  var import_node_url = require("node:url");
39
39
  var import_which = __toESM(require("which"));
40
- const import_meta = {};
40
+ function getDirectory() {
41
+ if (typeof __filename !== "undefined") {
42
+ return (0, import_node_path.dirname)(__filename);
43
+ }
44
+ try {
45
+ const url = eval("import.meta.url");
46
+ return (0, import_node_path.dirname)((0, import_node_url.fileURLToPath)(url));
47
+ } catch {
48
+ return process.cwd();
49
+ }
50
+ }
41
51
  function main(argv) {
42
52
  const args = argv.slice(2);
43
53
  let cliPath;
@@ -46,7 +56,7 @@ function main(argv) {
46
56
  } catch {
47
57
  }
48
58
  if (!cliPath) {
49
- const __dirname = typeof __filename !== "undefined" ? (0, import_node_path.dirname)(__filename) : (0, import_node_path.dirname)((0, import_node_url.fileURLToPath)(import_meta.url));
59
+ const __dirname = getDirectory();
50
60
  const devBinary = (0, import_node_path.join)(__dirname, "..", "..", "..", "target", "release", "kreuzberg");
51
61
  if ((0, import_node_fs.existsSync)(devBinary)) {
52
62
  cliPath = devBinary;
package/dist/cli.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"sources":["../typescript/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/**\n * Proxy entry point that forwards to the Rust-based Kreuzberg CLI.\n *\n * This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.\n */\n\nimport { spawnSync } from \"node:child_process\";\nimport { existsSync } from \"node:fs\";\nimport { dirname, join } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport which from \"which\";\n\nfunction main(argv: string[]): number {\n\tconst args = argv.slice(2);\n\n\tlet cliPath: string | undefined;\n\ttry {\n\t\tcliPath = which.sync(\"kreuzberg-cli\");\n\t} catch {}\n\n\tif (!cliPath) {\n\t\tconst __dirname = typeof __filename !== \"undefined\" ? dirname(__filename) : dirname(fileURLToPath(import.meta.url));\n\t\tconst devBinary = join(__dirname, \"..\", \"..\", \"..\", \"target\", \"release\", \"kreuzberg\");\n\t\tif (existsSync(devBinary)) {\n\t\t\tcliPath = devBinary;\n\t\t}\n\t}\n\n\tif (!cliPath) {\n\t\tconsole.error(\n\t\t\t\"The embedded Kreuzberg CLI binary could not be located. \" +\n\t\t\t\t\"This indicates a packaging issue; please open an issue at \" +\n\t\t\t\t\"https://github.com/kreuzberg-dev/kreuzberg/issues so we can investigate.\",\n\t\t);\n\t\treturn 1;\n\t}\n\n\tconst result = spawnSync(cliPath, args, {\n\t\tstdio: \"inherit\",\n\t\tshell: false,\n\t});\n\n\tif (result.error) {\n\t\tconsole.error(`Failed to execute kreuzberg-cli: ${result.error.message}`);\n\t\treturn 1;\n\t}\n\n\treturn result.status ?? 1;\n}\n\nif (require.main === module) {\n\tprocess.exit(main(process.argv));\n}\n\nexport { main };\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAQA,gCAA0B;AAC1B,qBAA2B;AAC3B,uBAA8B;AAC9B,sBAA8B;AAC9B,mBAAkB;AAZlB;AAcA,SAAS,KAAK,MAAwB;AACrC,QAAM,OAAO,KAAK,MAAM,CAAC;AAEzB,MAAI;AACJ,MAAI;AACH,cAAU,aAAAA,QAAM,KAAK,eAAe;AAAA,EACrC,QAAQ;AAAA,EAAC;AAET,MAAI,CAAC,SAAS;AACb,UAAM,YAAY,OAAO,eAAe,kBAAc,0BAAQ,UAAU,QAAI,8BAAQ,+BAAc,YAAY,GAAG,CAAC;AAClH,UAAM,gBAAY,uBAAK,WAAW,MAAM,MAAM,MAAM,UAAU,WAAW,WAAW;AACpF,YAAI,2BAAW,SAAS,GAAG;AAC1B,gBAAU;AAAA,IACX;AAAA,EACD;AAEA,MAAI,CAAC,SAAS;AACb,YAAQ;AAAA,MACP;AAAA,IAGD;AACA,WAAO;AAAA,EACR;AAEA,QAAM,aAAS,qCAAU,SAAS,MAAM;AAAA,IACvC,OAAO;AAAA,IACP,OAAO;AAAA,EACR,CAAC;AAED,MAAI,OAAO,OAAO;AACjB,YAAQ,MAAM,oCAAoC,OAAO,MAAM,OAAO,EAAE;AACxE,WAAO;AAAA,EACR;AAEA,SAAO,OAAO,UAAU;AACzB;AAEA,IAAI,QAAQ,SAAS,QAAQ;AAC5B,UAAQ,KAAK,KAAK,QAAQ,IAAI,CAAC;AAChC;","names":["which"]}
1
+ {"version":3,"sources":["../typescript/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/**\n * Proxy entry point that forwards to the Rust-based Kreuzberg CLI.\n *\n * This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.\n */\n\nimport { spawnSync } from \"node:child_process\";\nimport { existsSync } from \"node:fs\";\nimport { dirname, join } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport which from \"which\";\n\ndeclare global {\n\tvar __filename: string | undefined;\n\tvar __dirname: string | undefined;\n}\n\nfunction getDirectory(): string {\n\t// In CJS, __filename will be defined\n\tif (typeof __filename !== \"undefined\") {\n\t\treturn dirname(__filename);\n\t}\n\t// Fallback for ESM\n\ttry {\n\t\t// Use eval to avoid esbuild warnings about import.meta in CJS builds\n\t\t// @ts-ignore - import.meta is only available in ESM\n\t\tconst url = eval(\"import.meta.url\");\n\t\treturn dirname(fileURLToPath(url));\n\t} catch {\n\t\treturn process.cwd();\n\t}\n}\n\nfunction main(argv: string[]): number {\n\tconst args = argv.slice(2);\n\n\tlet cliPath: string | undefined;\n\ttry {\n\t\tcliPath = which.sync(\"kreuzberg-cli\");\n\t} catch {}\n\n\tif (!cliPath) {\n\t\tconst __dirname = getDirectory();\n\t\tconst devBinary = join(__dirname, \"..\", \"..\", \"..\", \"target\", \"release\", \"kreuzberg\");\n\t\tif (existsSync(devBinary)) {\n\t\t\tcliPath = devBinary;\n\t\t}\n\t}\n\n\tif (!cliPath) {\n\t\tconsole.error(\n\t\t\t\"The embedded Kreuzberg CLI binary could not be located. \" +\n\t\t\t\t\"This indicates a packaging issue; please open an issue at \" +\n\t\t\t\t\"https://github.com/kreuzberg-dev/kreuzberg/issues so we can investigate.\",\n\t\t);\n\t\treturn 1;\n\t}\n\n\tconst result = spawnSync(cliPath, args, {\n\t\tstdio: \"inherit\",\n\t\tshell: false,\n\t});\n\n\tif (result.error) {\n\t\tconsole.error(`Failed to execute kreuzberg-cli: ${result.error.message}`);\n\t\treturn 1;\n\t}\n\n\treturn result.status ?? 1;\n}\n\nif (require.main === module) {\n\tprocess.exit(main(process.argv));\n}\n\nexport { main };\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAQA,gCAA0B;AAC1B,qBAA2B;AAC3B,uBAA8B;AAC9B,sBAA8B;AAC9B,mBAAkB;AAOlB,SAAS,eAAuB;AAE/B,MAAI,OAAO,eAAe,aAAa;AACtC,eAAO,0BAAQ,UAAU;AAAA,EAC1B;AAEA,MAAI;AAGH,UAAM,MAAM,KAAK,iBAAiB;AAClC,eAAO,8BAAQ,+BAAc,GAAG,CAAC;AAAA,EAClC,QAAQ;AACP,WAAO,QAAQ,IAAI;AAAA,EACpB;AACD;AAEA,SAAS,KAAK,MAAwB;AACrC,QAAM,OAAO,KAAK,MAAM,CAAC;AAEzB,MAAI;AACJ,MAAI;AACH,cAAU,aAAAA,QAAM,KAAK,eAAe;AAAA,EACrC,QAAQ;AAAA,EAAC;AAET,MAAI,CAAC,SAAS;AACb,UAAM,YAAY,aAAa;AAC/B,UAAM,gBAAY,uBAAK,WAAW,MAAM,MAAM,MAAM,UAAU,WAAW,WAAW;AACpF,YAAI,2BAAW,SAAS,GAAG;AAC1B,gBAAU;AAAA,IACX;AAAA,EACD;AAEA,MAAI,CAAC,SAAS;AACb,YAAQ;AAAA,MACP;AAAA,IAGD;AACA,WAAO;AAAA,EACR;AAEA,QAAM,aAAS,qCAAU,SAAS,MAAM;AAAA,IACvC,OAAO;AAAA,IACP,OAAO;AAAA,EACR,CAAC;AAED,MAAI,OAAO,OAAO;AACjB,YAAQ,MAAM,oCAAoC,OAAO,MAAM,OAAO,EAAE;AACxE,WAAO;AAAA,EACR;AAEA,SAAO,OAAO,UAAU;AACzB;AAEA,IAAI,QAAQ,SAAS,QAAQ;AAC5B,UAAQ,KAAK,KAAK,QAAQ,IAAI,CAAC;AAChC;","names":["which"]}
package/dist/cli.mjs CHANGED
@@ -4,6 +4,17 @@ import { existsSync } from "node:fs";
4
4
  import { dirname, join } from "node:path";
5
5
  import { fileURLToPath } from "node:url";
6
6
  import which from "which";
7
+ function getDirectory() {
8
+ if (typeof __filename !== "undefined") {
9
+ return dirname(__filename);
10
+ }
11
+ try {
12
+ const url = eval("import.meta.url");
13
+ return dirname(fileURLToPath(url));
14
+ } catch {
15
+ return process.cwd();
16
+ }
17
+ }
7
18
  function main(argv) {
8
19
  const args = argv.slice(2);
9
20
  let cliPath;
@@ -12,7 +23,7 @@ function main(argv) {
12
23
  } catch {
13
24
  }
14
25
  if (!cliPath) {
15
- const __dirname = typeof __filename !== "undefined" ? dirname(__filename) : dirname(fileURLToPath(import.meta.url));
26
+ const __dirname = getDirectory();
16
27
  const devBinary = join(__dirname, "..", "..", "..", "target", "release", "kreuzberg");
17
28
  if (existsSync(devBinary)) {
18
29
  cliPath = devBinary;
package/dist/cli.mjs.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"sources":["../typescript/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/**\n * Proxy entry point that forwards to the Rust-based Kreuzberg CLI.\n *\n * This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.\n */\n\nimport { spawnSync } from \"node:child_process\";\nimport { existsSync } from \"node:fs\";\nimport { dirname, join } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport which from \"which\";\n\nfunction main(argv: string[]): number {\n\tconst args = argv.slice(2);\n\n\tlet cliPath: string | undefined;\n\ttry {\n\t\tcliPath = which.sync(\"kreuzberg-cli\");\n\t} catch {}\n\n\tif (!cliPath) {\n\t\tconst __dirname = typeof __filename !== \"undefined\" ? dirname(__filename) : dirname(fileURLToPath(import.meta.url));\n\t\tconst devBinary = join(__dirname, \"..\", \"..\", \"..\", \"target\", \"release\", \"kreuzberg\");\n\t\tif (existsSync(devBinary)) {\n\t\t\tcliPath = devBinary;\n\t\t}\n\t}\n\n\tif (!cliPath) {\n\t\tconsole.error(\n\t\t\t\"The embedded Kreuzberg CLI binary could not be located. \" +\n\t\t\t\t\"This indicates a packaging issue; please open an issue at \" +\n\t\t\t\t\"https://github.com/kreuzberg-dev/kreuzberg/issues so we can investigate.\",\n\t\t);\n\t\treturn 1;\n\t}\n\n\tconst result = spawnSync(cliPath, args, {\n\t\tstdio: \"inherit\",\n\t\tshell: false,\n\t});\n\n\tif (result.error) {\n\t\tconsole.error(`Failed to execute kreuzberg-cli: ${result.error.message}`);\n\t\treturn 1;\n\t}\n\n\treturn result.status ?? 1;\n}\n\nif (require.main === module) {\n\tprocess.exit(main(process.argv));\n}\n\nexport { main };\n"],"mappings":";AAQA,SAAS,iBAAiB;AAC1B,SAAS,kBAAkB;AAC3B,SAAS,SAAS,YAAY;AAC9B,SAAS,qBAAqB;AAC9B,OAAO,WAAW;AAElB,SAAS,KAAK,MAAwB;AACrC,QAAM,OAAO,KAAK,MAAM,CAAC;AAEzB,MAAI;AACJ,MAAI;AACH,cAAU,MAAM,KAAK,eAAe;AAAA,EACrC,QAAQ;AAAA,EAAC;AAET,MAAI,CAAC,SAAS;AACb,UAAM,YAAY,OAAO,eAAe,cAAc,QAAQ,UAAU,IAAI,QAAQ,cAAc,YAAY,GAAG,CAAC;AAClH,UAAM,YAAY,KAAK,WAAW,MAAM,MAAM,MAAM,UAAU,WAAW,WAAW;AACpF,QAAI,WAAW,SAAS,GAAG;AAC1B,gBAAU;AAAA,IACX;AAAA,EACD;AAEA,MAAI,CAAC,SAAS;AACb,YAAQ;AAAA,MACP;AAAA,IAGD;AACA,WAAO;AAAA,EACR;AAEA,QAAM,SAAS,UAAU,SAAS,MAAM;AAAA,IACvC,OAAO;AAAA,IACP,OAAO;AAAA,EACR,CAAC;AAED,MAAI,OAAO,OAAO;AACjB,YAAQ,MAAM,oCAAoC,OAAO,MAAM,OAAO,EAAE;AACxE,WAAO;AAAA,EACR;AAEA,SAAO,OAAO,UAAU;AACzB;AAEA,IAAI,QAAQ,SAAS,QAAQ;AAC5B,UAAQ,KAAK,KAAK,QAAQ,IAAI,CAAC;AAChC;","names":[]}
1
+ {"version":3,"sources":["../typescript/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/**\n * Proxy entry point that forwards to the Rust-based Kreuzberg CLI.\n *\n * This keeps `npx kreuzberg` working without shipping an additional TypeScript CLI implementation.\n */\n\nimport { spawnSync } from \"node:child_process\";\nimport { existsSync } from \"node:fs\";\nimport { dirname, join } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport which from \"which\";\n\ndeclare global {\n\tvar __filename: string | undefined;\n\tvar __dirname: string | undefined;\n}\n\nfunction getDirectory(): string {\n\t// In CJS, __filename will be defined\n\tif (typeof __filename !== \"undefined\") {\n\t\treturn dirname(__filename);\n\t}\n\t// Fallback for ESM\n\ttry {\n\t\t// Use eval to avoid esbuild warnings about import.meta in CJS builds\n\t\t// @ts-ignore - import.meta is only available in ESM\n\t\tconst url = eval(\"import.meta.url\");\n\t\treturn dirname(fileURLToPath(url));\n\t} catch {\n\t\treturn process.cwd();\n\t}\n}\n\nfunction main(argv: string[]): number {\n\tconst args = argv.slice(2);\n\n\tlet cliPath: string | undefined;\n\ttry {\n\t\tcliPath = which.sync(\"kreuzberg-cli\");\n\t} catch {}\n\n\tif (!cliPath) {\n\t\tconst __dirname = getDirectory();\n\t\tconst devBinary = join(__dirname, \"..\", \"..\", \"..\", \"target\", \"release\", \"kreuzberg\");\n\t\tif (existsSync(devBinary)) {\n\t\t\tcliPath = devBinary;\n\t\t}\n\t}\n\n\tif (!cliPath) {\n\t\tconsole.error(\n\t\t\t\"The embedded Kreuzberg CLI binary could not be located. \" +\n\t\t\t\t\"This indicates a packaging issue; please open an issue at \" +\n\t\t\t\t\"https://github.com/kreuzberg-dev/kreuzberg/issues so we can investigate.\",\n\t\t);\n\t\treturn 1;\n\t}\n\n\tconst result = spawnSync(cliPath, args, {\n\t\tstdio: \"inherit\",\n\t\tshell: false,\n\t});\n\n\tif (result.error) {\n\t\tconsole.error(`Failed to execute kreuzberg-cli: ${result.error.message}`);\n\t\treturn 1;\n\t}\n\n\treturn result.status ?? 1;\n}\n\nif (require.main === module) {\n\tprocess.exit(main(process.argv));\n}\n\nexport { main };\n"],"mappings":";AAQA,SAAS,iBAAiB;AAC1B,SAAS,kBAAkB;AAC3B,SAAS,SAAS,YAAY;AAC9B,SAAS,qBAAqB;AAC9B,OAAO,WAAW;AAOlB,SAAS,eAAuB;AAE/B,MAAI,OAAO,eAAe,aAAa;AACtC,WAAO,QAAQ,UAAU;AAAA,EAC1B;AAEA,MAAI;AAGH,UAAM,MAAM,KAAK,iBAAiB;AAClC,WAAO,QAAQ,cAAc,GAAG,CAAC;AAAA,EAClC,QAAQ;AACP,WAAO,QAAQ,IAAI;AAAA,EACpB;AACD;AAEA,SAAS,KAAK,MAAwB;AACrC,QAAM,OAAO,KAAK,MAAM,CAAC;AAEzB,MAAI;AACJ,MAAI;AACH,cAAU,MAAM,KAAK,eAAe;AAAA,EACrC,QAAQ;AAAA,EAAC;AAET,MAAI,CAAC,SAAS;AACb,UAAM,YAAY,aAAa;AAC/B,UAAM,YAAY,KAAK,WAAW,MAAM,MAAM,MAAM,UAAU,WAAW,WAAW;AACpF,QAAI,WAAW,SAAS,GAAG;AAC1B,gBAAU;AAAA,IACX;AAAA,EACD;AAEA,MAAI,CAAC,SAAS;AACb,YAAQ;AAAA,MACP;AAAA,IAGD;AACA,WAAO;AAAA,EACR;AAEA,QAAM,SAAS,UAAU,SAAS,MAAM;AAAA,IACvC,OAAO;AAAA,IACP,OAAO;AAAA,EACR,CAAC;AAED,MAAI,OAAO,OAAO;AACjB,YAAQ,MAAM,oCAAoC,OAAO,MAAM,OAAO,EAAE;AACxE,WAAO;AAAA,EACR;AAEA,SAAO,OAAO,UAAU;AACzB;AAEA,IAAI,QAAQ,SAAS,QAAQ;AAC5B,UAAQ,KAAK,KAAK,QAAQ,IAAI,CAAC;AAChC;","names":[]}
package/dist/index.d.mts CHANGED
@@ -1,7 +1,7 @@
1
1
  import { PanicContext } from './errors.mjs';
2
2
  export { CacheError, ErrorCode, ImageProcessingError, KreuzbergError, MissingDependencyError, OcrError, ParsingError, PluginError, ValidationError } from './errors.mjs';
3
- import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol, OcrConfig, ChunkingConfig, ImageExtractionConfig, PdfConfig, KeywordConfig, LanguageDetectionConfig, ErrorClassification } from './types.mjs';
4
- export { ArchiveMetadata, Chunk, ChunkMetadata, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, HtmlConversionOptions, HtmlMetadata, HtmlPreprocessingOptions, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, Metadata, OcrMetadata, PageBoundary, PageConfig, PageContent, PageInfo, PageStructure, PageUnitType, PdfMetadata, PostProcessorConfig, PptxMetadata, ProcessingStage, RakeParams, Table, TesseractConfig, TextMetadata, TokenReductionConfig, XmlMetadata, YakeParams } from './types.mjs';
3
+ import { ExtractionConfig as ExtractionConfig$1, ExtractionResult, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol, ErrorClassification, WorkerPool, WorkerPoolStats } from './types.mjs';
4
+ export { ArchiveMetadata, Chunk, ChunkMetadata, ChunkingConfig, EmailMetadata, ErrorMetadata, ExcelMetadata, ExtractedImage, ExtractedKeyword, HeaderMetadata, HierarchyConfig, HtmlConversionOptions, HtmlImageMetadata, HtmlMetadata, HtmlPreprocessingOptions, ImageExtractionConfig, ImageMetadata, ImagePreprocessingMetadata, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, LinkMetadata, Metadata, OcrConfig, OcrMetadata, PageBoundary, PageContent, PageExtractionConfig, PageInfo, PageStructure, PageUnitType, PdfConfig, PdfMetadata, PostProcessorConfig, PptxMetadata, ProcessingStage, RakeParams, StructuredData, Table, TesseractConfig, TextMetadata, TokenReductionConfig, XmlMetadata, YakeParams } from './types.mjs';
5
5
  export { GutenOcrBackend } from './ocr/guten-ocr.mjs';
6
6
 
7
7
  /**
@@ -610,72 +610,12 @@ declare function unregisterDocumentExtractor(name: string): void;
610
610
  */
611
611
  declare function clearDocumentExtractors(): void;
612
612
  /**
613
- * Builder class for creating ExtractionConfig objects with a fluent API.
614
- *
615
- * Provides a convenient way to build extraction configurations using method chaining.
616
- *
617
- * @example
618
- * ```typescript
619
- * import { ExtractionConfig, extractFile } from '@kreuzberg/node';
620
- *
621
- * // Create with builder pattern
622
- * const config = ExtractionConfig.default()
623
- * .withChunking({ maxChars: 2048 })
624
- * .withOcr({ backend: 'tesseract', language: 'eng' })
625
- * .build();
626
- *
627
- * const result = await extractFile('document.pdf', null, config);
628
- * ```
629
- */
630
- declare class ExtractionConfigBuilder {
631
- private config;
632
- /**
633
- * Create a new builder with default configuration.
634
- */
635
- static default(): ExtractionConfigBuilder;
636
- /**
637
- * Set OCR configuration.
638
- */
639
- withOcr(ocr: OcrConfig): ExtractionConfigBuilder;
640
- /**
641
- * Set chunking configuration.
642
- */
643
- withChunking(chunking: ChunkingConfig): ExtractionConfigBuilder;
644
- /**
645
- * Set image extraction configuration.
646
- */
647
- withImageExtraction(images: ImageExtractionConfig): ExtractionConfigBuilder;
648
- /**
649
- * Set PDF configuration.
650
- */
651
- withPdf(pdf: PdfConfig): ExtractionConfigBuilder;
652
- /**
653
- * Set keyword extraction configuration.
654
- */
655
- withKeywords(keywords: KeywordConfig): ExtractionConfigBuilder;
656
- /**
657
- * Set language detection configuration.
658
- */
659
- withLanguageDetection(languageDetection: LanguageDetectionConfig): ExtractionConfigBuilder;
660
- /**
661
- * Set whether to enable metadata extraction.
662
- */
663
- withMetadataExtraction(enabled: boolean): ExtractionConfigBuilder;
664
- /**
665
- * Set whether to enable quality mode.
666
- */
667
- withQualityMode(enabled: boolean): ExtractionConfigBuilder;
668
- /**
669
- * Build and return the final ExtractionConfig object.
670
- */
671
- build(): ExtractionConfig$1;
672
- }
673
- /**
674
- * ExtractionConfig namespace with static methods for loading configuration from files
675
- * and creating new configurations with the builder pattern.
613
+ * ExtractionConfig namespace with static methods for loading configuration from files.
676
614
  *
677
615
  * Provides factory methods to load extraction configuration from TOML, YAML, or JSON files,
678
- * or to create configurations using a fluent builder API.
616
+ * or to discover configuration files in the current directory tree.
617
+ *
618
+ * For creating configurations programmatically, use plain TypeScript objects instead:
679
619
  *
680
620
  * @example
681
621
  * ```typescript
@@ -684,35 +624,17 @@ declare class ExtractionConfigBuilder {
684
624
  * // Load configuration from file
685
625
  * const config1 = ExtractionConfig.fromFile('config.toml');
686
626
  *
687
- * // Create with builder pattern
688
- * const config2 = ExtractionConfig.default()
689
- * .withChunking({ maxChars: 2048 })
690
- * .build();
627
+ * // Or create with plain object
628
+ * const config2 = {
629
+ * chunking: { maxChars: 2048 },
630
+ * ocr: { backend: 'tesseract', language: 'eng' }
631
+ * };
691
632
  *
692
633
  * // Use with extraction
693
634
  * const result = await extractFile('document.pdf', null, config2);
694
635
  * ```
695
636
  */
696
637
  declare const ExtractionConfig: {
697
- /**
698
- * Create a default extraction configuration using the builder pattern.
699
- *
700
- * Returns a builder object that allows you to configure extraction settings
701
- * using method chaining.
702
- *
703
- * @returns ExtractionConfigBuilder for chaining configuration calls
704
- *
705
- * @example
706
- * ```typescript
707
- * import { ExtractionConfig } from '@kreuzberg/node';
708
- *
709
- * const config = ExtractionConfig.default()
710
- * .withChunking({ maxChars: 2048 })
711
- * .withOcr({ backend: 'tesseract', language: 'eng' })
712
- * .build();
713
- * ```
714
- */
715
- default(): ExtractionConfigBuilder;
716
638
  /**
717
639
  * Load extraction configuration from a file.
718
640
  *
@@ -1060,6 +982,151 @@ declare function getErrorCodeDescription(code: number): string;
1060
982
  * ```
1061
983
  */
1062
984
  declare function classifyError(errorMessage: string): ErrorClassification;
1063
- declare const __version__ = "4.0.0-rc.21";
985
+ /**
986
+ * Create a worker pool for concurrent file extraction.
987
+ *
988
+ * The worker pool manages a set of background worker threads that can process
989
+ * extraction requests concurrently, improving throughput when handling multiple files.
990
+ *
991
+ * @param size - Optional number of worker threads (defaults to CPU count). Must be > 0
992
+ * @returns A WorkerPool instance to use with extraction functions
993
+ *
994
+ * @throws {Error} If size is invalid or pool creation fails
995
+ *
996
+ * @example
997
+ * ```typescript
998
+ * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
999
+ *
1000
+ * // Create pool with 4 workers
1001
+ * const pool = createWorkerPool(4);
1002
+ *
1003
+ * try {
1004
+ * const result = await extractFileInWorker(pool, 'document.pdf');
1005
+ * console.log(result.content);
1006
+ * } finally {
1007
+ * // Always close the pool when done
1008
+ * await closeWorkerPool(pool);
1009
+ * }
1010
+ * ```
1011
+ */
1012
+ declare function createWorkerPool(size?: number): WorkerPool;
1013
+ /**
1014
+ * Get statistics about a worker pool.
1015
+ *
1016
+ * Returns information about the pool's current state, including the number of active workers,
1017
+ * queued tasks, and total processed tasks.
1018
+ *
1019
+ * @param pool - The worker pool instance
1020
+ * @returns WorkerPoolStats with pool information
1021
+ *
1022
+ * @example
1023
+ * ```typescript
1024
+ * import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node';
1025
+ *
1026
+ * const pool = createWorkerPool(4);
1027
+ * const stats = getWorkerPoolStats(pool);
1028
+ *
1029
+ * console.log(`Pool size: ${stats.size}`);
1030
+ * console.log(`Active workers: ${stats.activeWorkers}`);
1031
+ * console.log(`Queued tasks: ${stats.queuedTasks}`);
1032
+ * ```
1033
+ */
1034
+ declare function getWorkerPoolStats(pool: WorkerPool): WorkerPoolStats;
1035
+ /**
1036
+ * Extract content from a single file using a worker pool (asynchronous).
1037
+ *
1038
+ * Submits an extraction task to the worker pool. The task is executed by one of the
1039
+ * available workers in the background, allowing other tasks to be processed concurrently.
1040
+ *
1041
+ * @param pool - The worker pool instance
1042
+ * @param filePath - Path to the file to extract
1043
+ * @param mimeTypeOrConfig - Optional MIME type or extraction configuration
1044
+ * @param maybeConfig - Optional extraction configuration (if second param is MIME type)
1045
+ * @returns Promise<ExtractionResult> containing extracted content and metadata
1046
+ *
1047
+ * @throws {Error} If the file cannot be read or extraction fails
1048
+ *
1049
+ * @example
1050
+ * ```typescript
1051
+ * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
1052
+ *
1053
+ * const pool = createWorkerPool(4);
1054
+ *
1055
+ * try {
1056
+ * const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
1057
+ * const results = await Promise.all(
1058
+ * files.map(f => extractFileInWorker(pool, f))
1059
+ * );
1060
+ *
1061
+ * results.forEach((r, i) => {
1062
+ * console.log(`${files[i]}: ${r.content.substring(0, 100)}...`);
1063
+ * });
1064
+ * } finally {
1065
+ * await closeWorkerPool(pool);
1066
+ * }
1067
+ * ```
1068
+ */
1069
+ declare function extractFileInWorker(pool: WorkerPool, filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig$1, maybeConfig?: ExtractionConfig$1 | null): Promise<ExtractionResult>;
1070
+ /**
1071
+ * Extract content from multiple files in parallel using a worker pool (asynchronous).
1072
+ *
1073
+ * Submits multiple extraction tasks to the worker pool for concurrent processing.
1074
+ * This is more efficient than using `extractFileInWorker` multiple times sequentially.
1075
+ *
1076
+ * @param pool - The worker pool instance
1077
+ * @param paths - Array of file paths to extract
1078
+ * @param config - Extraction configuration object (applies to all files)
1079
+ * @returns Promise<ExtractionResult[]> array of results (one per file, in same order)
1080
+ *
1081
+ * @throws {Error} If any file cannot be read or extraction fails
1082
+ *
1083
+ * @example
1084
+ * ```typescript
1085
+ * import { createWorkerPool, batchExtractFilesInWorker, closeWorkerPool } from '@kreuzberg/node';
1086
+ *
1087
+ * const pool = createWorkerPool(4);
1088
+ *
1089
+ * try {
1090
+ * const files = ['invoice1.pdf', 'invoice2.pdf', 'invoice3.pdf'];
1091
+ * const results = await batchExtractFilesInWorker(pool, files, {
1092
+ * ocr: { backend: 'tesseract', language: 'eng' }
1093
+ * });
1094
+ *
1095
+ * const total = results.reduce((sum, r) => sum + extractAmount(r.content), 0);
1096
+ * console.log(`Total: $${total}`);
1097
+ * } finally {
1098
+ * await closeWorkerPool(pool);
1099
+ * }
1100
+ * ```
1101
+ */
1102
+ declare function batchExtractFilesInWorker(pool: WorkerPool, paths: string[], config?: ExtractionConfig$1 | null): Promise<ExtractionResult[]>;
1103
+ /**
1104
+ * Close a worker pool and shut down all worker threads.
1105
+ *
1106
+ * Should be called when the pool is no longer needed to clean up resources
1107
+ * and gracefully shut down worker threads. Any pending tasks will be cancelled.
1108
+ *
1109
+ * @param pool - The worker pool instance to close
1110
+ * @returns Promise that resolves when the pool is fully closed
1111
+ *
1112
+ * @throws {Error} If pool shutdown fails
1113
+ *
1114
+ * @example
1115
+ * ```typescript
1116
+ * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
1117
+ *
1118
+ * const pool = createWorkerPool(4);
1119
+ *
1120
+ * try {
1121
+ * const result = await extractFileInWorker(pool, 'document.pdf');
1122
+ * console.log(result.content);
1123
+ * } finally {
1124
+ * // Clean up the pool
1125
+ * await closeWorkerPool(pool);
1126
+ * }
1127
+ * ```
1128
+ */
1129
+ declare function closeWorkerPool(pool: WorkerPool): Promise<void>;
1130
+ declare const __version__ = "4.0.0-rc.24";
1064
1131
 
1065
- export { ChunkingConfig, type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, ImageExtractionConfig, KeywordConfig, LanguageDetectionConfig, OcrBackendProtocol, OcrConfig, PanicContext, PdfConfig, PostProcessorProtocol, ValidatorProtocol, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
1132
+ export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };