@kreuzberg/node 4.0.0-rc.15 → 4.0.0-rc.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -37,6 +37,7 @@ __export(index_exports, {
37
37
  batchExtractBytesSync: () => batchExtractBytesSync,
38
38
  batchExtractFiles: () => batchExtractFiles,
39
39
  batchExtractFilesSync: () => batchExtractFilesSync,
40
+ classifyError: () => classifyError,
40
41
  clearDocumentExtractors: () => clearDocumentExtractors,
41
42
  clearOcrBackends: () => clearOcrBackends,
42
43
  clearPostProcessors: () => clearPostProcessors,
@@ -48,6 +49,8 @@ __export(index_exports, {
48
49
  extractFile: () => extractFile,
49
50
  extractFileSync: () => extractFileSync,
50
51
  getEmbeddingPreset: () => getEmbeddingPreset,
52
+ getErrorCodeDescription: () => getErrorCodeDescription,
53
+ getErrorCodeName: () => getErrorCodeName,
51
54
  getExtensionsForMime: () => getExtensionsForMime,
52
55
  getLastErrorCode: () => getLastErrorCode,
53
56
  getLastPanicContext: () => getLastPanicContext,
@@ -67,6 +70,7 @@ __export(index_exports, {
67
70
  });
68
71
  module.exports = __toCommonJS(index_exports);
69
72
  var import_node_module = require("node:module");
73
+ var import_node_fs = require("node:fs");
70
74
  var import_errors = require("./errors.js");
71
75
  var import_guten_ocr = require("./ocr/guten-ocr.js");
72
76
  __reExport(index_exports, require("./types.js"), module.exports);
@@ -313,15 +317,15 @@ function convertResult(rawResult) {
313
317
  metadata: {},
314
318
  tables: [],
315
319
  detectedLanguages: null,
316
- chunks: null,
317
- images: null,
318
- pages: null
320
+ chunks: void 0,
321
+ images: void 0,
322
+ pages: void 0
319
323
  };
320
324
  }
321
325
  const result = rawResult;
322
326
  const metadata = result["metadata"];
323
327
  const metadataValue = typeof metadata === "string" ? parseMetadata(metadata) : metadata ?? {};
324
- return {
328
+ const returnObj = {
325
329
  // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
326
330
  content: result["content"] ?? "",
327
331
  // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
@@ -331,19 +335,23 @@ function convertResult(rawResult) {
331
335
  tables: Array.isArray(result["tables"]) ? result["tables"] : [],
332
336
  // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
333
337
  detectedLanguages: Array.isArray(result["detectedLanguages"]) ? result["detectedLanguages"] : null,
334
- chunks: (() => {
335
- const chunksData = result["chunks"];
336
- return Array.isArray(chunksData) ? chunksData.map((chunk) => convertChunk(chunk)) : null;
337
- })(),
338
- images: (() => {
339
- const imagesData = result["images"];
340
- return Array.isArray(imagesData) ? imagesData.map((image) => convertImage(image)) : null;
341
- })(),
342
- pages: (() => {
343
- const pagesData = result["pages"];
344
- return Array.isArray(pagesData) ? pagesData.map((page) => convertPageContent(page)) : null;
345
- })()
338
+ chunks: void 0,
339
+ images: void 0,
340
+ pages: void 0
346
341
  };
342
+ const chunksData = result["chunks"];
343
+ if (Array.isArray(chunksData)) {
344
+ returnObj.chunks = chunksData.map((chunk) => convertChunk(chunk));
345
+ }
346
+ const imagesData = result["images"];
347
+ if (Array.isArray(imagesData)) {
348
+ returnObj.images = imagesData.map((image) => convertImage(image));
349
+ }
350
+ const pagesData = result["pages"];
351
+ if (Array.isArray(pagesData)) {
352
+ returnObj.pages = pagesData.map((page) => convertPageContent(page));
353
+ }
354
+ return returnObj;
347
355
  }
348
356
  function setIfDefined(target, key, value) {
349
357
  if (value !== void 0) {
@@ -543,23 +551,59 @@ function normalizeExtractionConfig(config) {
543
551
  setIfDefined(normalized, "htmlOptions", htmlOptions);
544
552
  return normalized;
545
553
  }
546
- function extractFileSync(filePath, mimeType = null, config = null) {
554
+ function extractFileSync(filePath, mimeTypeOrConfig, maybeConfig) {
555
+ let mimeType = null;
556
+ let config = null;
557
+ if (typeof mimeTypeOrConfig === "string") {
558
+ mimeType = mimeTypeOrConfig;
559
+ config = maybeConfig ?? null;
560
+ } else if (mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object") {
561
+ config = mimeTypeOrConfig;
562
+ mimeType = null;
563
+ } else {
564
+ config = maybeConfig ?? null;
565
+ mimeType = null;
566
+ }
547
567
  const normalizedConfig = normalizeExtractionConfig(config);
548
568
  const rawResult = getBinding().extractFileSync(filePath, mimeType, normalizedConfig);
549
569
  return convertResult(rawResult);
550
570
  }
551
- async function extractFile(filePath, mimeType = null, config = null) {
571
+ async function extractFile(filePath, mimeTypeOrConfig, maybeConfig) {
572
+ let mimeType = null;
573
+ let config = null;
574
+ if (typeof mimeTypeOrConfig === "string") {
575
+ mimeType = mimeTypeOrConfig;
576
+ config = maybeConfig ?? null;
577
+ } else if (mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object") {
578
+ config = mimeTypeOrConfig;
579
+ mimeType = null;
580
+ } else {
581
+ config = maybeConfig ?? null;
582
+ mimeType = null;
583
+ }
552
584
  const normalizedConfig = normalizeExtractionConfig(config);
553
585
  const rawResult = await getBinding().extractFile(filePath, mimeType, normalizedConfig);
554
586
  return convertResult(rawResult);
555
587
  }
556
- function extractBytesSync(data, mimeType, config = null) {
588
+ function extractBytesSync(dataOrPath, mimeType, config = null) {
589
+ let data;
590
+ if (typeof dataOrPath === "string") {
591
+ data = (0, import_node_fs.readFileSync)(dataOrPath);
592
+ } else {
593
+ data = dataOrPath;
594
+ }
557
595
  const validated = assertUint8Array(data, "data");
558
596
  const normalizedConfig = normalizeExtractionConfig(config);
559
597
  const rawResult = getBinding().extractBytesSync(Buffer.from(validated), mimeType, normalizedConfig);
560
598
  return convertResult(rawResult);
561
599
  }
562
- async function extractBytes(data, mimeType, config = null) {
600
+ async function extractBytes(dataOrPath, mimeType, config = null) {
601
+ let data;
602
+ if (typeof dataOrPath === "string") {
603
+ data = (0, import_node_fs.readFileSync)(dataOrPath);
604
+ } else {
605
+ data = dataOrPath;
606
+ }
563
607
  const validated = assertUint8Array(data, "data");
564
608
  if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
565
609
  console.log("[TypeScript] Debug input header:", Array.from(validated.slice(0, 8)));
@@ -599,8 +643,8 @@ async function batchExtractBytes(dataList, mimeTypes, config = null) {
599
643
  function registerPostProcessor(processor) {
600
644
  const binding2 = getBinding();
601
645
  const wrappedProcessor = {
602
- name: processor.name.bind(processor),
603
- processingStage: processor.processingStage?.bind(processor),
646
+ name: typeof processor.name === "function" ? processor.name() : processor.name,
647
+ processingStage: typeof processor.processingStage === "function" ? processor.processingStage() : processor.processingStage,
604
648
  async process(...args) {
605
649
  const wrappedValue = args[0];
606
650
  const jsonString = wrappedValue[0];
@@ -653,8 +697,8 @@ function listPostProcessors() {
653
697
  function registerValidator(validator) {
654
698
  const binding2 = getBinding();
655
699
  const wrappedValidator = {
656
- name: validator.name.bind(validator),
657
- priority: validator.priority?.bind(validator),
700
+ name: typeof validator.name === "function" ? validator.name() : validator.name,
701
+ priority: typeof validator.priority === "function" ? validator.priority() : validator.priority,
658
702
  async validate(...args) {
659
703
  const jsonString = args[0];
660
704
  if (!jsonString || jsonString === "undefined") {
@@ -703,8 +747,8 @@ function describePayload(value) {
703
747
  function registerOcrBackend(backend) {
704
748
  const binding2 = getBinding();
705
749
  const wrappedBackend = {
706
- name: backend.name.bind(backend),
707
- supportedLanguages: backend.supportedLanguages.bind(backend),
750
+ name: typeof backend.name === "function" ? backend.name() : backend.name,
751
+ supportedLanguages: typeof backend.supportedLanguages === "function" ? backend.supportedLanguages() : backend.supportedLanguages ?? ["en"],
708
752
  async processImage(...processArgs) {
709
753
  const [imagePayload, maybeLanguage] = processArgs;
710
754
  if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
@@ -769,7 +813,99 @@ function clearDocumentExtractors() {
769
813
  const binding2 = getBinding();
770
814
  binding2.clearDocumentExtractors();
771
815
  }
816
+ class ExtractionConfigBuilder {
817
+ config = {};
818
+ /**
819
+ * Create a new builder with default configuration.
820
+ */
821
+ static default() {
822
+ return new ExtractionConfigBuilder();
823
+ }
824
+ /**
825
+ * Set OCR configuration.
826
+ */
827
+ withOcr(ocr) {
828
+ this.config["ocr"] = ocr;
829
+ return this;
830
+ }
831
+ /**
832
+ * Set chunking configuration.
833
+ */
834
+ withChunking(chunking) {
835
+ this.config["chunking"] = chunking;
836
+ return this;
837
+ }
838
+ /**
839
+ * Set image extraction configuration.
840
+ */
841
+ withImageExtraction(images) {
842
+ this.config["imageExtraction"] = images;
843
+ return this;
844
+ }
845
+ /**
846
+ * Set PDF configuration.
847
+ */
848
+ withPdf(pdf) {
849
+ this.config["pdf"] = pdf;
850
+ return this;
851
+ }
852
+ /**
853
+ * Set keyword extraction configuration.
854
+ */
855
+ withKeywords(keywords) {
856
+ this.config["keywords"] = keywords;
857
+ return this;
858
+ }
859
+ /**
860
+ * Set language detection configuration.
861
+ */
862
+ withLanguageDetection(languageDetection) {
863
+ this.config["languageDetection"] = languageDetection;
864
+ return this;
865
+ }
866
+ /**
867
+ * Set whether to enable metadata extraction.
868
+ */
869
+ withMetadataExtraction(enabled) {
870
+ this.config["metadataExtraction"] = enabled;
871
+ return this;
872
+ }
873
+ /**
874
+ * Set whether to enable quality mode.
875
+ */
876
+ withQualityMode(enabled) {
877
+ this.config["qualityMode"] = enabled;
878
+ return this;
879
+ }
880
+ /**
881
+ * Build and return the final ExtractionConfig object.
882
+ */
883
+ build() {
884
+ return this.config;
885
+ }
886
+ }
772
887
  const ExtractionConfig = {
888
+ /**
889
+ * Create a default extraction configuration using the builder pattern.
890
+ *
891
+ * Returns a builder object that allows you to configure extraction settings
892
+ * using method chaining.
893
+ *
894
+ * @returns ExtractionConfigBuilder for chaining configuration calls
895
+ *
896
+ * @example
897
+ * ```typescript
898
+ * import { ExtractionConfig } from '@kreuzberg/node';
899
+ *
900
+ * const config = ExtractionConfig.default()
901
+ * .withChunking({ maxChars: 2048 })
902
+ * .withOcr({ backend: 'tesseract', language: 'eng' })
903
+ * .build();
904
+ * ```
905
+ */
906
+ default() {
907
+ return ExtractionConfigBuilder.default();
908
+ },
773
909
  /**
774
910
  * Load extraction configuration from a file.
775
911
  *
@@ -835,9 +971,9 @@ function detectMimeType(bytes) {
835
971
  const binding2 = getBinding();
836
972
  return binding2.detectMimeTypeFromBytes(bytes);
837
973
  }
838
- function detectMimeTypeFromPath(path) {
974
+ function detectMimeTypeFromPath(filePath, checkExists) {
839
975
  const binding2 = getBinding();
840
- return binding2.detectMimeTypeFromPath(path);
976
+ return binding2.detectMimeTypeFromPath(filePath, checkExists);
841
977
  }
842
978
  function validateMimeType(mimeType) {
843
979
  const binding2 = getBinding();
@@ -865,7 +1001,20 @@ function getLastPanicContext() {
865
1001
  const result = binding2.getLastPanicContext();
866
1002
  return result;
867
1003
  }
868
- const __version__ = "4.0.0-rc.15";
1004
+ function getErrorCodeName(code) {
1005
+ const binding2 = getBinding();
1006
+ return binding2.getErrorCodeName(code);
1007
+ }
1008
+ function getErrorCodeDescription(code) {
1009
+ const binding2 = getBinding();
1010
+ return binding2.getErrorCodeDescription(code);
1011
+ }
1012
+ function classifyError(errorMessage) {
1013
+ const binding2 = getBinding();
1014
+ const result = binding2.classifyError(errorMessage);
1015
+ return result;
1016
+ }
1017
+ const __version__ = "4.0.0-rc.17";
869
1018
  // Annotate the CommonJS export names for ESM import in node:
870
1019
  0 && (module.exports = {
871
1020
  CacheError,
@@ -886,6 +1035,7 @@ const __version__ = "4.0.0-rc.15";
886
1035
  batchExtractBytesSync,
887
1036
  batchExtractFiles,
888
1037
  batchExtractFilesSync,
1038
+ classifyError,
889
1039
  clearDocumentExtractors,
890
1040
  clearOcrBackends,
891
1041
  clearPostProcessors,
@@ -897,6 +1047,8 @@ const __version__ = "4.0.0-rc.15";
897
1047
  extractFile,
898
1048
  extractFileSync,
899
1049
  getEmbeddingPreset,
1050
+ getErrorCodeDescription,
1051
+ getErrorCodeName,
900
1052
  getExtensionsForMime,
901
1053
  getLastErrorCode,
902
1054
  getLastPanicContext,