@kreuzberg/node 4.0.0-rc.15 → 4.0.0-rc.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +180 -23
- package/dist/index.d.ts +180 -23
- package/dist/index.js +181 -29
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +178 -29
- package/dist/index.mjs.map +1 -1
- package/dist/types.d.mts +37 -1
- package/dist/types.d.ts +37 -1
- package/dist/types.js.map +1 -1
- package/index.d.ts +478 -0
- package/index.js +72 -52
- package/package.json +5 -5
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { createRequire } from "node:module";
|
|
2
|
+
import { readFileSync } from "node:fs";
|
|
2
3
|
import {
|
|
3
4
|
CacheError,
|
|
4
5
|
ErrorCode,
|
|
@@ -254,15 +255,15 @@ function convertResult(rawResult) {
|
|
|
254
255
|
metadata: {},
|
|
255
256
|
tables: [],
|
|
256
257
|
detectedLanguages: null,
|
|
257
|
-
chunks:
|
|
258
|
-
images:
|
|
259
|
-
pages:
|
|
258
|
+
chunks: void 0,
|
|
259
|
+
images: void 0,
|
|
260
|
+
pages: void 0
|
|
260
261
|
};
|
|
261
262
|
}
|
|
262
263
|
const result = rawResult;
|
|
263
264
|
const metadata = result["metadata"];
|
|
264
265
|
const metadataValue = typeof metadata === "string" ? parseMetadata(metadata) : metadata ?? {};
|
|
265
|
-
|
|
266
|
+
const returnObj = {
|
|
266
267
|
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
267
268
|
content: result["content"] ?? "",
|
|
268
269
|
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
@@ -272,19 +273,23 @@ function convertResult(rawResult) {
|
|
|
272
273
|
tables: Array.isArray(result["tables"]) ? result["tables"] : [],
|
|
273
274
|
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
274
275
|
detectedLanguages: Array.isArray(result["detectedLanguages"]) ? result["detectedLanguages"] : null,
|
|
275
|
-
chunks:
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
})(),
|
|
279
|
-
images: (() => {
|
|
280
|
-
const imagesData = result["images"];
|
|
281
|
-
return Array.isArray(imagesData) ? imagesData.map((image) => convertImage(image)) : null;
|
|
282
|
-
})(),
|
|
283
|
-
pages: (() => {
|
|
284
|
-
const pagesData = result["pages"];
|
|
285
|
-
return Array.isArray(pagesData) ? pagesData.map((page) => convertPageContent(page)) : null;
|
|
286
|
-
})()
|
|
276
|
+
chunks: void 0,
|
|
277
|
+
images: void 0,
|
|
278
|
+
pages: void 0
|
|
287
279
|
};
|
|
280
|
+
const chunksData = result["chunks"];
|
|
281
|
+
if (Array.isArray(chunksData)) {
|
|
282
|
+
returnObj.chunks = chunksData.map((chunk) => convertChunk(chunk));
|
|
283
|
+
}
|
|
284
|
+
const imagesData = result["images"];
|
|
285
|
+
if (Array.isArray(imagesData)) {
|
|
286
|
+
returnObj.images = imagesData.map((image) => convertImage(image));
|
|
287
|
+
}
|
|
288
|
+
const pagesData = result["pages"];
|
|
289
|
+
if (Array.isArray(pagesData)) {
|
|
290
|
+
returnObj.pages = pagesData.map((page) => convertPageContent(page));
|
|
291
|
+
}
|
|
292
|
+
return returnObj;
|
|
288
293
|
}
|
|
289
294
|
function setIfDefined(target, key, value) {
|
|
290
295
|
if (value !== void 0) {
|
|
@@ -484,23 +489,59 @@ function normalizeExtractionConfig(config) {
|
|
|
484
489
|
setIfDefined(normalized, "htmlOptions", htmlOptions);
|
|
485
490
|
return normalized;
|
|
486
491
|
}
|
|
487
|
-
function extractFileSync(filePath,
|
|
492
|
+
function extractFileSync(filePath, mimeTypeOrConfig, maybeConfig) {
|
|
493
|
+
let mimeType = null;
|
|
494
|
+
let config = null;
|
|
495
|
+
if (typeof mimeTypeOrConfig === "string") {
|
|
496
|
+
mimeType = mimeTypeOrConfig;
|
|
497
|
+
config = maybeConfig ?? null;
|
|
498
|
+
} else if (mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object") {
|
|
499
|
+
config = mimeTypeOrConfig;
|
|
500
|
+
mimeType = null;
|
|
501
|
+
} else {
|
|
502
|
+
config = maybeConfig ?? null;
|
|
503
|
+
mimeType = null;
|
|
504
|
+
}
|
|
488
505
|
const normalizedConfig = normalizeExtractionConfig(config);
|
|
489
506
|
const rawResult = getBinding().extractFileSync(filePath, mimeType, normalizedConfig);
|
|
490
507
|
return convertResult(rawResult);
|
|
491
508
|
}
|
|
492
|
-
async function extractFile(filePath,
|
|
509
|
+
async function extractFile(filePath, mimeTypeOrConfig, maybeConfig) {
|
|
510
|
+
let mimeType = null;
|
|
511
|
+
let config = null;
|
|
512
|
+
if (typeof mimeTypeOrConfig === "string") {
|
|
513
|
+
mimeType = mimeTypeOrConfig;
|
|
514
|
+
config = maybeConfig ?? null;
|
|
515
|
+
} else if (mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object") {
|
|
516
|
+
config = mimeTypeOrConfig;
|
|
517
|
+
mimeType = null;
|
|
518
|
+
} else {
|
|
519
|
+
config = maybeConfig ?? null;
|
|
520
|
+
mimeType = null;
|
|
521
|
+
}
|
|
493
522
|
const normalizedConfig = normalizeExtractionConfig(config);
|
|
494
523
|
const rawResult = await getBinding().extractFile(filePath, mimeType, normalizedConfig);
|
|
495
524
|
return convertResult(rawResult);
|
|
496
525
|
}
|
|
497
|
-
function extractBytesSync(
|
|
526
|
+
function extractBytesSync(dataOrPath, mimeType, config = null) {
|
|
527
|
+
let data;
|
|
528
|
+
if (typeof dataOrPath === "string") {
|
|
529
|
+
data = readFileSync(dataOrPath);
|
|
530
|
+
} else {
|
|
531
|
+
data = dataOrPath;
|
|
532
|
+
}
|
|
498
533
|
const validated = assertUint8Array(data, "data");
|
|
499
534
|
const normalizedConfig = normalizeExtractionConfig(config);
|
|
500
535
|
const rawResult = getBinding().extractBytesSync(Buffer.from(validated), mimeType, normalizedConfig);
|
|
501
536
|
return convertResult(rawResult);
|
|
502
537
|
}
|
|
503
|
-
async function extractBytes(
|
|
538
|
+
async function extractBytes(dataOrPath, mimeType, config = null) {
|
|
539
|
+
let data;
|
|
540
|
+
if (typeof dataOrPath === "string") {
|
|
541
|
+
data = readFileSync(dataOrPath);
|
|
542
|
+
} else {
|
|
543
|
+
data = dataOrPath;
|
|
544
|
+
}
|
|
504
545
|
const validated = assertUint8Array(data, "data");
|
|
505
546
|
if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
|
|
506
547
|
console.log("[TypeScript] Debug input header:", Array.from(validated.slice(0, 8)));
|
|
@@ -540,8 +581,8 @@ async function batchExtractBytes(dataList, mimeTypes, config = null) {
|
|
|
540
581
|
function registerPostProcessor(processor) {
|
|
541
582
|
const binding2 = getBinding();
|
|
542
583
|
const wrappedProcessor = {
|
|
543
|
-
name: processor.name.
|
|
544
|
-
processingStage: processor.processingStage
|
|
584
|
+
name: typeof processor.name === "function" ? processor.name() : processor.name,
|
|
585
|
+
processingStage: typeof processor.processingStage === "function" ? processor.processingStage() : processor.processingStage,
|
|
545
586
|
async process(...args) {
|
|
546
587
|
const wrappedValue = args[0];
|
|
547
588
|
const jsonString = wrappedValue[0];
|
|
@@ -594,8 +635,8 @@ function listPostProcessors() {
|
|
|
594
635
|
function registerValidator(validator) {
|
|
595
636
|
const binding2 = getBinding();
|
|
596
637
|
const wrappedValidator = {
|
|
597
|
-
name: validator.name.
|
|
598
|
-
priority: validator.priority
|
|
638
|
+
name: typeof validator.name === "function" ? validator.name() : validator.name,
|
|
639
|
+
priority: typeof validator.priority === "function" ? validator.priority() : validator.priority,
|
|
599
640
|
async validate(...args) {
|
|
600
641
|
const jsonString = args[0];
|
|
601
642
|
if (!jsonString || jsonString === "undefined") {
|
|
@@ -644,8 +685,8 @@ function describePayload(value) {
|
|
|
644
685
|
function registerOcrBackend(backend) {
|
|
645
686
|
const binding2 = getBinding();
|
|
646
687
|
const wrappedBackend = {
|
|
647
|
-
name: backend.name.
|
|
648
|
-
supportedLanguages: backend.supportedLanguages.
|
|
688
|
+
name: typeof backend.name === "function" ? backend.name() : backend.name,
|
|
689
|
+
supportedLanguages: typeof backend.supportedLanguages === "function" ? backend.supportedLanguages() : backend.supportedLanguages ?? ["en"],
|
|
649
690
|
async processImage(...processArgs) {
|
|
650
691
|
const [imagePayload, maybeLanguage] = processArgs;
|
|
651
692
|
if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
|
|
@@ -710,7 +751,99 @@ function clearDocumentExtractors() {
|
|
|
710
751
|
const binding2 = getBinding();
|
|
711
752
|
binding2.clearDocumentExtractors();
|
|
712
753
|
}
|
|
754
|
+
class ExtractionConfigBuilder {
|
|
755
|
+
config = {};
|
|
756
|
+
/**
|
|
757
|
+
* Create a new builder with default configuration.
|
|
758
|
+
*/
|
|
759
|
+
static default() {
|
|
760
|
+
return new ExtractionConfigBuilder();
|
|
761
|
+
}
|
|
762
|
+
/**
|
|
763
|
+
* Set OCR configuration.
|
|
764
|
+
*/
|
|
765
|
+
withOcr(ocr) {
|
|
766
|
+
this.config["ocr"] = ocr;
|
|
767
|
+
return this;
|
|
768
|
+
}
|
|
769
|
+
/**
|
|
770
|
+
* Set chunking configuration.
|
|
771
|
+
*/
|
|
772
|
+
withChunking(chunking) {
|
|
773
|
+
this.config["chunking"] = chunking;
|
|
774
|
+
return this;
|
|
775
|
+
}
|
|
776
|
+
/**
|
|
777
|
+
* Set image extraction configuration.
|
|
778
|
+
*/
|
|
779
|
+
withImageExtraction(images) {
|
|
780
|
+
this.config["imageExtraction"] = images;
|
|
781
|
+
return this;
|
|
782
|
+
}
|
|
783
|
+
/**
|
|
784
|
+
* Set PDF configuration.
|
|
785
|
+
*/
|
|
786
|
+
withPdf(pdf) {
|
|
787
|
+
this.config["pdf"] = pdf;
|
|
788
|
+
return this;
|
|
789
|
+
}
|
|
790
|
+
/**
|
|
791
|
+
* Set keyword extraction configuration.
|
|
792
|
+
*/
|
|
793
|
+
withKeywords(keywords) {
|
|
794
|
+
this.config["keywords"] = keywords;
|
|
795
|
+
return this;
|
|
796
|
+
}
|
|
797
|
+
/**
|
|
798
|
+
* Set language detection configuration.
|
|
799
|
+
*/
|
|
800
|
+
withLanguageDetection(languageDetection) {
|
|
801
|
+
this.config["languageDetection"] = languageDetection;
|
|
802
|
+
return this;
|
|
803
|
+
}
|
|
804
|
+
/**
|
|
805
|
+
* Set whether to enable metadata extraction.
|
|
806
|
+
*/
|
|
807
|
+
withMetadataExtraction(enabled) {
|
|
808
|
+
this.config["metadataExtraction"] = enabled;
|
|
809
|
+
return this;
|
|
810
|
+
}
|
|
811
|
+
/**
|
|
812
|
+
* Set whether to enable quality mode.
|
|
813
|
+
*/
|
|
814
|
+
withQualityMode(enabled) {
|
|
815
|
+
this.config["qualityMode"] = enabled;
|
|
816
|
+
return this;
|
|
817
|
+
}
|
|
818
|
+
/**
|
|
819
|
+
* Build and return the final ExtractionConfig object.
|
|
820
|
+
*/
|
|
821
|
+
build() {
|
|
822
|
+
return this.config;
|
|
823
|
+
}
|
|
824
|
+
}
|
|
713
825
|
const ExtractionConfig = {
|
|
826
|
+
/**
|
|
827
|
+
* Create a default extraction configuration using the builder pattern.
|
|
828
|
+
*
|
|
829
|
+
* Returns a builder object that allows you to configure extraction settings
|
|
830
|
+
* using method chaining.
|
|
831
|
+
*
|
|
832
|
+
* @returns ExtractionConfigBuilder for chaining configuration calls
|
|
833
|
+
*
|
|
834
|
+
* @example
|
|
835
|
+
* ```typescript
|
|
836
|
+
* import { ExtractionConfig } from '@kreuzberg/node';
|
|
837
|
+
*
|
|
838
|
+
* const config = ExtractionConfig.default()
|
|
839
|
+
* .withChunking({ maxChars: 2048 })
|
|
840
|
+
* .withOcr({ backend: 'tesseract', language: 'eng' })
|
|
841
|
+
* .build();
|
|
842
|
+
* ```
|
|
843
|
+
*/
|
|
844
|
+
default() {
|
|
845
|
+
return ExtractionConfigBuilder.default();
|
|
846
|
+
},
|
|
714
847
|
/**
|
|
715
848
|
* Load extraction configuration from a file.
|
|
716
849
|
*
|
|
@@ -776,9 +909,9 @@ function detectMimeType(bytes) {
|
|
|
776
909
|
const binding2 = getBinding();
|
|
777
910
|
return binding2.detectMimeTypeFromBytes(bytes);
|
|
778
911
|
}
|
|
779
|
-
function detectMimeTypeFromPath(
|
|
912
|
+
function detectMimeTypeFromPath(filePath, checkExists) {
|
|
780
913
|
const binding2 = getBinding();
|
|
781
|
-
return binding2.detectMimeTypeFromPath(
|
|
914
|
+
return binding2.detectMimeTypeFromPath(filePath, checkExists);
|
|
782
915
|
}
|
|
783
916
|
function validateMimeType(mimeType) {
|
|
784
917
|
const binding2 = getBinding();
|
|
@@ -806,7 +939,20 @@ function getLastPanicContext() {
|
|
|
806
939
|
const result = binding2.getLastPanicContext();
|
|
807
940
|
return result;
|
|
808
941
|
}
|
|
809
|
-
|
|
942
|
+
function getErrorCodeName(code) {
|
|
943
|
+
const binding2 = getBinding();
|
|
944
|
+
return binding2.getErrorCodeName(code);
|
|
945
|
+
}
|
|
946
|
+
function getErrorCodeDescription(code) {
|
|
947
|
+
const binding2 = getBinding();
|
|
948
|
+
return binding2.getErrorCodeDescription(code);
|
|
949
|
+
}
|
|
950
|
+
function classifyError(errorMessage) {
|
|
951
|
+
const binding2 = getBinding();
|
|
952
|
+
const result = binding2.classifyError(errorMessage);
|
|
953
|
+
return result;
|
|
954
|
+
}
|
|
955
|
+
const __version__ = "4.0.0-rc.17";
|
|
810
956
|
export {
|
|
811
957
|
CacheError,
|
|
812
958
|
ErrorCode,
|
|
@@ -826,6 +972,7 @@ export {
|
|
|
826
972
|
batchExtractBytesSync,
|
|
827
973
|
batchExtractFiles,
|
|
828
974
|
batchExtractFilesSync,
|
|
975
|
+
classifyError,
|
|
829
976
|
clearDocumentExtractors,
|
|
830
977
|
clearOcrBackends,
|
|
831
978
|
clearPostProcessors,
|
|
@@ -837,6 +984,8 @@ export {
|
|
|
837
984
|
extractFile,
|
|
838
985
|
extractFileSync,
|
|
839
986
|
getEmbeddingPreset,
|
|
987
|
+
getErrorCodeDescription,
|
|
988
|
+
getErrorCodeName,
|
|
840
989
|
getExtensionsForMime,
|
|
841
990
|
getLastErrorCode,
|
|
842
991
|
getLastPanicContext,
|