@kreuzberg/node 4.0.0-rc.16 → 4.0.0-rc.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +111 -23
- package/dist/index.d.ts +111 -23
- package/dist/index.js +162 -29
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +162 -29
- package/dist/index.mjs.map +1 -1
- package/index.js +52 -52
- package/package.json +5 -5
package/dist/index.js
CHANGED
|
@@ -69,6 +69,7 @@ __export(index_exports, {
|
|
|
69
69
|
validateMimeType: () => validateMimeType
|
|
70
70
|
});
|
|
71
71
|
module.exports = __toCommonJS(index_exports);
|
|
72
|
+
var import_node_fs = require("node:fs");
|
|
72
73
|
var import_node_module = require("node:module");
|
|
73
74
|
var import_errors = require("./errors.js");
|
|
74
75
|
var import_guten_ocr = require("./ocr/guten-ocr.js");
|
|
@@ -316,15 +317,15 @@ function convertResult(rawResult) {
|
|
|
316
317
|
metadata: {},
|
|
317
318
|
tables: [],
|
|
318
319
|
detectedLanguages: null,
|
|
319
|
-
chunks:
|
|
320
|
-
images:
|
|
321
|
-
pages:
|
|
320
|
+
chunks: void 0,
|
|
321
|
+
images: void 0,
|
|
322
|
+
pages: void 0
|
|
322
323
|
};
|
|
323
324
|
}
|
|
324
325
|
const result = rawResult;
|
|
325
326
|
const metadata = result["metadata"];
|
|
326
327
|
const metadataValue = typeof metadata === "string" ? parseMetadata(metadata) : metadata ?? {};
|
|
327
|
-
|
|
328
|
+
const returnObj = {
|
|
328
329
|
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
329
330
|
content: result["content"] ?? "",
|
|
330
331
|
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
@@ -334,19 +335,23 @@ function convertResult(rawResult) {
|
|
|
334
335
|
tables: Array.isArray(result["tables"]) ? result["tables"] : [],
|
|
335
336
|
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
336
337
|
detectedLanguages: Array.isArray(result["detectedLanguages"]) ? result["detectedLanguages"] : null,
|
|
337
|
-
chunks:
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
})(),
|
|
341
|
-
images: (() => {
|
|
342
|
-
const imagesData = result["images"];
|
|
343
|
-
return Array.isArray(imagesData) ? imagesData.map((image) => convertImage(image)) : null;
|
|
344
|
-
})(),
|
|
345
|
-
pages: (() => {
|
|
346
|
-
const pagesData = result["pages"];
|
|
347
|
-
return Array.isArray(pagesData) ? pagesData.map((page) => convertPageContent(page)) : null;
|
|
348
|
-
})()
|
|
338
|
+
chunks: void 0,
|
|
339
|
+
images: void 0,
|
|
340
|
+
pages: void 0
|
|
349
341
|
};
|
|
342
|
+
const chunksData = result["chunks"];
|
|
343
|
+
if (Array.isArray(chunksData)) {
|
|
344
|
+
returnObj.chunks = chunksData.map((chunk) => convertChunk(chunk));
|
|
345
|
+
}
|
|
346
|
+
const imagesData = result["images"];
|
|
347
|
+
if (Array.isArray(imagesData)) {
|
|
348
|
+
returnObj.images = imagesData.map((image) => convertImage(image));
|
|
349
|
+
}
|
|
350
|
+
const pagesData = result["pages"];
|
|
351
|
+
if (Array.isArray(pagesData)) {
|
|
352
|
+
returnObj.pages = pagesData.map((page) => convertPageContent(page));
|
|
353
|
+
}
|
|
354
|
+
return returnObj;
|
|
350
355
|
}
|
|
351
356
|
function setIfDefined(target, key, value) {
|
|
352
357
|
if (value !== void 0) {
|
|
@@ -546,23 +551,59 @@ function normalizeExtractionConfig(config) {
|
|
|
546
551
|
setIfDefined(normalized, "htmlOptions", htmlOptions);
|
|
547
552
|
return normalized;
|
|
548
553
|
}
|
|
549
|
-
function extractFileSync(filePath,
|
|
554
|
+
function extractFileSync(filePath, mimeTypeOrConfig, maybeConfig) {
|
|
555
|
+
let mimeType = null;
|
|
556
|
+
let config = null;
|
|
557
|
+
if (typeof mimeTypeOrConfig === "string") {
|
|
558
|
+
mimeType = mimeTypeOrConfig;
|
|
559
|
+
config = maybeConfig ?? null;
|
|
560
|
+
} else if (mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object") {
|
|
561
|
+
config = mimeTypeOrConfig;
|
|
562
|
+
mimeType = null;
|
|
563
|
+
} else {
|
|
564
|
+
config = maybeConfig ?? null;
|
|
565
|
+
mimeType = null;
|
|
566
|
+
}
|
|
550
567
|
const normalizedConfig = normalizeExtractionConfig(config);
|
|
551
568
|
const rawResult = getBinding().extractFileSync(filePath, mimeType, normalizedConfig);
|
|
552
569
|
return convertResult(rawResult);
|
|
553
570
|
}
|
|
554
|
-
async function extractFile(filePath,
|
|
571
|
+
async function extractFile(filePath, mimeTypeOrConfig, maybeConfig) {
|
|
572
|
+
let mimeType = null;
|
|
573
|
+
let config = null;
|
|
574
|
+
if (typeof mimeTypeOrConfig === "string") {
|
|
575
|
+
mimeType = mimeTypeOrConfig;
|
|
576
|
+
config = maybeConfig ?? null;
|
|
577
|
+
} else if (mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object") {
|
|
578
|
+
config = mimeTypeOrConfig;
|
|
579
|
+
mimeType = null;
|
|
580
|
+
} else {
|
|
581
|
+
config = maybeConfig ?? null;
|
|
582
|
+
mimeType = null;
|
|
583
|
+
}
|
|
555
584
|
const normalizedConfig = normalizeExtractionConfig(config);
|
|
556
585
|
const rawResult = await getBinding().extractFile(filePath, mimeType, normalizedConfig);
|
|
557
586
|
return convertResult(rawResult);
|
|
558
587
|
}
|
|
559
|
-
function extractBytesSync(
|
|
588
|
+
function extractBytesSync(dataOrPath, mimeType, config = null) {
|
|
589
|
+
let data;
|
|
590
|
+
if (typeof dataOrPath === "string") {
|
|
591
|
+
data = (0, import_node_fs.readFileSync)(dataOrPath);
|
|
592
|
+
} else {
|
|
593
|
+
data = dataOrPath;
|
|
594
|
+
}
|
|
560
595
|
const validated = assertUint8Array(data, "data");
|
|
561
596
|
const normalizedConfig = normalizeExtractionConfig(config);
|
|
562
597
|
const rawResult = getBinding().extractBytesSync(Buffer.from(validated), mimeType, normalizedConfig);
|
|
563
598
|
return convertResult(rawResult);
|
|
564
599
|
}
|
|
565
|
-
async function extractBytes(
|
|
600
|
+
async function extractBytes(dataOrPath, mimeType, config = null) {
|
|
601
|
+
let data;
|
|
602
|
+
if (typeof dataOrPath === "string") {
|
|
603
|
+
data = (0, import_node_fs.readFileSync)(dataOrPath);
|
|
604
|
+
} else {
|
|
605
|
+
data = dataOrPath;
|
|
606
|
+
}
|
|
566
607
|
const validated = assertUint8Array(data, "data");
|
|
567
608
|
if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
|
|
568
609
|
console.log("[TypeScript] Debug input header:", Array.from(validated.slice(0, 8)));
|
|
@@ -602,8 +643,8 @@ async function batchExtractBytes(dataList, mimeTypes, config = null) {
|
|
|
602
643
|
function registerPostProcessor(processor) {
|
|
603
644
|
const binding2 = getBinding();
|
|
604
645
|
const wrappedProcessor = {
|
|
605
|
-
name: processor.name.
|
|
606
|
-
processingStage: processor.processingStage
|
|
646
|
+
name: typeof processor.name === "function" ? processor.name() : processor.name,
|
|
647
|
+
processingStage: typeof processor.processingStage === "function" ? processor.processingStage() : processor.processingStage,
|
|
607
648
|
async process(...args) {
|
|
608
649
|
const wrappedValue = args[0];
|
|
609
650
|
const jsonString = wrappedValue[0];
|
|
@@ -656,8 +697,8 @@ function listPostProcessors() {
|
|
|
656
697
|
function registerValidator(validator) {
|
|
657
698
|
const binding2 = getBinding();
|
|
658
699
|
const wrappedValidator = {
|
|
659
|
-
name: validator.name.
|
|
660
|
-
priority: validator.priority
|
|
700
|
+
name: typeof validator.name === "function" ? validator.name() : validator.name,
|
|
701
|
+
priority: typeof validator.priority === "function" ? validator.priority() : validator.priority,
|
|
661
702
|
async validate(...args) {
|
|
662
703
|
const jsonString = args[0];
|
|
663
704
|
if (!jsonString || jsonString === "undefined") {
|
|
@@ -706,8 +747,8 @@ function describePayload(value) {
|
|
|
706
747
|
function registerOcrBackend(backend) {
|
|
707
748
|
const binding2 = getBinding();
|
|
708
749
|
const wrappedBackend = {
|
|
709
|
-
name: backend.name.
|
|
710
|
-
supportedLanguages: backend.supportedLanguages.
|
|
750
|
+
name: typeof backend.name === "function" ? backend.name() : backend.name,
|
|
751
|
+
supportedLanguages: typeof backend.supportedLanguages === "function" ? backend.supportedLanguages() : backend.supportedLanguages ?? ["en"],
|
|
711
752
|
async processImage(...processArgs) {
|
|
712
753
|
const [imagePayload, maybeLanguage] = processArgs;
|
|
713
754
|
if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
|
|
@@ -772,7 +813,99 @@ function clearDocumentExtractors() {
|
|
|
772
813
|
const binding2 = getBinding();
|
|
773
814
|
binding2.clearDocumentExtractors();
|
|
774
815
|
}
|
|
816
|
+
class ExtractionConfigBuilder {
|
|
817
|
+
config = {};
|
|
818
|
+
/**
|
|
819
|
+
* Create a new builder with default configuration.
|
|
820
|
+
*/
|
|
821
|
+
static default() {
|
|
822
|
+
return new ExtractionConfigBuilder();
|
|
823
|
+
}
|
|
824
|
+
/**
|
|
825
|
+
* Set OCR configuration.
|
|
826
|
+
*/
|
|
827
|
+
withOcr(ocr) {
|
|
828
|
+
this.config["ocr"] = ocr;
|
|
829
|
+
return this;
|
|
830
|
+
}
|
|
831
|
+
/**
|
|
832
|
+
* Set chunking configuration.
|
|
833
|
+
*/
|
|
834
|
+
withChunking(chunking) {
|
|
835
|
+
this.config["chunking"] = chunking;
|
|
836
|
+
return this;
|
|
837
|
+
}
|
|
838
|
+
/**
|
|
839
|
+
* Set image extraction configuration.
|
|
840
|
+
*/
|
|
841
|
+
withImageExtraction(images) {
|
|
842
|
+
this.config["imageExtraction"] = images;
|
|
843
|
+
return this;
|
|
844
|
+
}
|
|
845
|
+
/**
|
|
846
|
+
* Set PDF configuration.
|
|
847
|
+
*/
|
|
848
|
+
withPdf(pdf) {
|
|
849
|
+
this.config["pdf"] = pdf;
|
|
850
|
+
return this;
|
|
851
|
+
}
|
|
852
|
+
/**
|
|
853
|
+
* Set keyword extraction configuration.
|
|
854
|
+
*/
|
|
855
|
+
withKeywords(keywords) {
|
|
856
|
+
this.config["keywords"] = keywords;
|
|
857
|
+
return this;
|
|
858
|
+
}
|
|
859
|
+
/**
|
|
860
|
+
* Set language detection configuration.
|
|
861
|
+
*/
|
|
862
|
+
withLanguageDetection(languageDetection) {
|
|
863
|
+
this.config["languageDetection"] = languageDetection;
|
|
864
|
+
return this;
|
|
865
|
+
}
|
|
866
|
+
/**
|
|
867
|
+
* Set whether to enable metadata extraction.
|
|
868
|
+
*/
|
|
869
|
+
withMetadataExtraction(enabled) {
|
|
870
|
+
this.config["metadataExtraction"] = enabled;
|
|
871
|
+
return this;
|
|
872
|
+
}
|
|
873
|
+
/**
|
|
874
|
+
* Set whether to enable quality mode.
|
|
875
|
+
*/
|
|
876
|
+
withQualityMode(enabled) {
|
|
877
|
+
this.config["qualityMode"] = enabled;
|
|
878
|
+
return this;
|
|
879
|
+
}
|
|
880
|
+
/**
|
|
881
|
+
* Build and return the final ExtractionConfig object.
|
|
882
|
+
*/
|
|
883
|
+
build() {
|
|
884
|
+
return this.config;
|
|
885
|
+
}
|
|
886
|
+
}
|
|
775
887
|
const ExtractionConfig = {
|
|
888
|
+
/**
|
|
889
|
+
* Create a default extraction configuration using the builder pattern.
|
|
890
|
+
*
|
|
891
|
+
* Returns a builder object that allows you to configure extraction settings
|
|
892
|
+
* using method chaining.
|
|
893
|
+
*
|
|
894
|
+
* @returns ExtractionConfigBuilder for chaining configuration calls
|
|
895
|
+
*
|
|
896
|
+
* @example
|
|
897
|
+
* ```typescript
|
|
898
|
+
* import { ExtractionConfig } from '@kreuzberg/node';
|
|
899
|
+
*
|
|
900
|
+
* const config = ExtractionConfig.default()
|
|
901
|
+
* .withChunking({ maxChars: 2048 })
|
|
902
|
+
* .withOcr({ backend: 'tesseract', language: 'eng' })
|
|
903
|
+
* .build();
|
|
904
|
+
* ```
|
|
905
|
+
*/
|
|
906
|
+
default() {
|
|
907
|
+
return ExtractionConfigBuilder.default();
|
|
908
|
+
},
|
|
776
909
|
/**
|
|
777
910
|
* Load extraction configuration from a file.
|
|
778
911
|
*
|
|
@@ -838,9 +971,9 @@ function detectMimeType(bytes) {
|
|
|
838
971
|
const binding2 = getBinding();
|
|
839
972
|
return binding2.detectMimeTypeFromBytes(bytes);
|
|
840
973
|
}
|
|
841
|
-
function detectMimeTypeFromPath(
|
|
974
|
+
function detectMimeTypeFromPath(filePath, checkExists) {
|
|
842
975
|
const binding2 = getBinding();
|
|
843
|
-
return binding2.detectMimeTypeFromPath(
|
|
976
|
+
return binding2.detectMimeTypeFromPath(filePath, checkExists);
|
|
844
977
|
}
|
|
845
978
|
function validateMimeType(mimeType) {
|
|
846
979
|
const binding2 = getBinding();
|
|
@@ -881,7 +1014,7 @@ function classifyError(errorMessage) {
|
|
|
881
1014
|
const result = binding2.classifyError(errorMessage);
|
|
882
1015
|
return result;
|
|
883
1016
|
}
|
|
884
|
-
const __version__ = "4.0.0-rc.16";
|
|
1017
|
+
const __version__ = "4.0.0-rc.18";
|
|
885
1018
|
// Annotate the CommonJS export names for ESM import in node:
|
|
886
1019
|
0 && (module.exports = {
|
|
887
1020
|
CacheError,
|