@kreuzberg/node 4.0.0-rc.16 → 4.0.0-rc.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -70,6 +70,7 @@ __export(index_exports, {
70
70
  });
71
71
  module.exports = __toCommonJS(index_exports);
72
72
  var import_node_module = require("node:module");
73
+ var import_node_fs = require("node:fs");
73
74
  var import_errors = require("./errors.js");
74
75
  var import_guten_ocr = require("./ocr/guten-ocr.js");
75
76
  __reExport(index_exports, require("./types.js"), module.exports);
@@ -316,15 +317,15 @@ function convertResult(rawResult) {
316
317
  metadata: {},
317
318
  tables: [],
318
319
  detectedLanguages: null,
319
- chunks: null,
320
- images: null,
321
- pages: null
320
+ chunks: void 0,
321
+ images: void 0,
322
+ pages: void 0
322
323
  };
323
324
  }
324
325
  const result = rawResult;
325
326
  const metadata = result["metadata"];
326
327
  const metadataValue = typeof metadata === "string" ? parseMetadata(metadata) : metadata ?? {};
327
- return {
328
+ const returnObj = {
328
329
  // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
329
330
  content: result["content"] ?? "",
330
331
  // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
@@ -334,19 +335,23 @@ function convertResult(rawResult) {
334
335
  tables: Array.isArray(result["tables"]) ? result["tables"] : [],
335
336
  // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
336
337
  detectedLanguages: Array.isArray(result["detectedLanguages"]) ? result["detectedLanguages"] : null,
337
- chunks: (() => {
338
- const chunksData = result["chunks"];
339
- return Array.isArray(chunksData) ? chunksData.map((chunk) => convertChunk(chunk)) : null;
340
- })(),
341
- images: (() => {
342
- const imagesData = result["images"];
343
- return Array.isArray(imagesData) ? imagesData.map((image) => convertImage(image)) : null;
344
- })(),
345
- pages: (() => {
346
- const pagesData = result["pages"];
347
- return Array.isArray(pagesData) ? pagesData.map((page) => convertPageContent(page)) : null;
348
- })()
338
+ chunks: void 0,
339
+ images: void 0,
340
+ pages: void 0
349
341
  };
342
+ const chunksData = result["chunks"];
343
+ if (Array.isArray(chunksData)) {
344
+ returnObj.chunks = chunksData.map((chunk) => convertChunk(chunk));
345
+ }
346
+ const imagesData = result["images"];
347
+ if (Array.isArray(imagesData)) {
348
+ returnObj.images = imagesData.map((image) => convertImage(image));
349
+ }
350
+ const pagesData = result["pages"];
351
+ if (Array.isArray(pagesData)) {
352
+ returnObj.pages = pagesData.map((page) => convertPageContent(page));
353
+ }
354
+ return returnObj;
350
355
  }
351
356
  function setIfDefined(target, key, value) {
352
357
  if (value !== void 0) {
@@ -546,23 +551,59 @@ function normalizeExtractionConfig(config) {
546
551
  setIfDefined(normalized, "htmlOptions", htmlOptions);
547
552
  return normalized;
548
553
  }
549
- function extractFileSync(filePath, mimeType = null, config = null) {
554
+ function extractFileSync(filePath, mimeTypeOrConfig, maybeConfig) {
555
+ let mimeType = null;
556
+ let config = null;
557
+ if (typeof mimeTypeOrConfig === "string") {
558
+ mimeType = mimeTypeOrConfig;
559
+ config = maybeConfig ?? null;
560
+ } else if (mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object") {
561
+ config = mimeTypeOrConfig;
562
+ mimeType = null;
563
+ } else {
564
+ config = maybeConfig ?? null;
565
+ mimeType = null;
566
+ }
550
567
  const normalizedConfig = normalizeExtractionConfig(config);
551
568
  const rawResult = getBinding().extractFileSync(filePath, mimeType, normalizedConfig);
552
569
  return convertResult(rawResult);
553
570
  }
554
- async function extractFile(filePath, mimeType = null, config = null) {
571
+ async function extractFile(filePath, mimeTypeOrConfig, maybeConfig) {
572
+ let mimeType = null;
573
+ let config = null;
574
+ if (typeof mimeTypeOrConfig === "string") {
575
+ mimeType = mimeTypeOrConfig;
576
+ config = maybeConfig ?? null;
577
+ } else if (mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object") {
578
+ config = mimeTypeOrConfig;
579
+ mimeType = null;
580
+ } else {
581
+ config = maybeConfig ?? null;
582
+ mimeType = null;
583
+ }
555
584
  const normalizedConfig = normalizeExtractionConfig(config);
556
585
  const rawResult = await getBinding().extractFile(filePath, mimeType, normalizedConfig);
557
586
  return convertResult(rawResult);
558
587
  }
559
- function extractBytesSync(data, mimeType, config = null) {
588
+ function extractBytesSync(dataOrPath, mimeType, config = null) {
589
+ let data;
590
+ if (typeof dataOrPath === "string") {
591
+ data = (0, import_node_fs.readFileSync)(dataOrPath);
592
+ } else {
593
+ data = dataOrPath;
594
+ }
560
595
  const validated = assertUint8Array(data, "data");
561
596
  const normalizedConfig = normalizeExtractionConfig(config);
562
597
  const rawResult = getBinding().extractBytesSync(Buffer.from(validated), mimeType, normalizedConfig);
563
598
  return convertResult(rawResult);
564
599
  }
565
- async function extractBytes(data, mimeType, config = null) {
600
+ async function extractBytes(dataOrPath, mimeType, config = null) {
601
+ let data;
602
+ if (typeof dataOrPath === "string") {
603
+ data = (0, import_node_fs.readFileSync)(dataOrPath);
604
+ } else {
605
+ data = dataOrPath;
606
+ }
566
607
  const validated = assertUint8Array(data, "data");
567
608
  if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
568
609
  console.log("[TypeScript] Debug input header:", Array.from(validated.slice(0, 8)));
@@ -602,8 +643,8 @@ async function batchExtractBytes(dataList, mimeTypes, config = null) {
602
643
  function registerPostProcessor(processor) {
603
644
  const binding2 = getBinding();
604
645
  const wrappedProcessor = {
605
- name: processor.name.bind(processor),
606
- processingStage: processor.processingStage?.bind(processor),
646
+ name: typeof processor.name === "function" ? processor.name() : processor.name,
647
+ processingStage: typeof processor.processingStage === "function" ? processor.processingStage() : processor.processingStage,
607
648
  async process(...args) {
608
649
  const wrappedValue = args[0];
609
650
  const jsonString = wrappedValue[0];
@@ -656,8 +697,8 @@ function listPostProcessors() {
656
697
  function registerValidator(validator) {
657
698
  const binding2 = getBinding();
658
699
  const wrappedValidator = {
659
- name: validator.name.bind(validator),
660
- priority: validator.priority?.bind(validator),
700
+ name: typeof validator.name === "function" ? validator.name() : validator.name,
701
+ priority: typeof validator.priority === "function" ? validator.priority() : validator.priority,
661
702
  async validate(...args) {
662
703
  const jsonString = args[0];
663
704
  if (!jsonString || jsonString === "undefined") {
@@ -706,8 +747,8 @@ function describePayload(value) {
706
747
  function registerOcrBackend(backend) {
707
748
  const binding2 = getBinding();
708
749
  const wrappedBackend = {
709
- name: backend.name.bind(backend),
710
- supportedLanguages: backend.supportedLanguages.bind(backend),
750
+ name: typeof backend.name === "function" ? backend.name() : backend.name,
751
+ supportedLanguages: typeof backend.supportedLanguages === "function" ? backend.supportedLanguages() : backend.supportedLanguages ?? ["en"],
711
752
  async processImage(...processArgs) {
712
753
  const [imagePayload, maybeLanguage] = processArgs;
713
754
  if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
@@ -772,7 +813,99 @@ function clearDocumentExtractors() {
772
813
  const binding2 = getBinding();
773
814
  binding2.clearDocumentExtractors();
774
815
  }
816
+ class ExtractionConfigBuilder {
817
+ config = {};
818
+ /**
819
+ * Create a new builder with default configuration.
820
+ */
821
+ static default() {
822
+ return new ExtractionConfigBuilder();
823
+ }
824
+ /**
825
+ * Set OCR configuration.
826
+ */
827
+ withOcr(ocr) {
828
+ this.config["ocr"] = ocr;
829
+ return this;
830
+ }
831
+ /**
832
+ * Set chunking configuration.
833
+ */
834
+ withChunking(chunking) {
835
+ this.config["chunking"] = chunking;
836
+ return this;
837
+ }
838
+ /**
839
+ * Set image extraction configuration.
840
+ */
841
+ withImageExtraction(images) {
842
+ this.config["imageExtraction"] = images;
843
+ return this;
844
+ }
845
+ /**
846
+ * Set PDF configuration.
847
+ */
848
+ withPdf(pdf) {
849
+ this.config["pdf"] = pdf;
850
+ return this;
851
+ }
852
+ /**
853
+ * Set keyword extraction configuration.
854
+ */
855
+ withKeywords(keywords) {
856
+ this.config["keywords"] = keywords;
857
+ return this;
858
+ }
859
+ /**
860
+ * Set language detection configuration.
861
+ */
862
+ withLanguageDetection(languageDetection) {
863
+ this.config["languageDetection"] = languageDetection;
864
+ return this;
865
+ }
866
+ /**
867
+ * Set whether to enable metadata extraction.
868
+ */
869
+ withMetadataExtraction(enabled) {
870
+ this.config["metadataExtraction"] = enabled;
871
+ return this;
872
+ }
873
+ /**
874
+ * Set whether to enable quality mode.
875
+ */
876
+ withQualityMode(enabled) {
877
+ this.config["qualityMode"] = enabled;
878
+ return this;
879
+ }
880
+ /**
881
+ * Build and return the final ExtractionConfig object.
882
+ */
883
+ build() {
884
+ return this.config;
885
+ }
886
+ }
775
887
  const ExtractionConfig = {
888
+ /**
889
+ * Create a default extraction configuration using the builder pattern.
890
+ *
891
+ * Returns a builder object that allows you to configure extraction settings
892
+ * using method chaining.
893
+ *
894
+ * @returns ExtractionConfigBuilder for chaining configuration calls
895
+ *
896
+ * @example
897
+ * ```typescript
898
+ * import { ExtractionConfig } from '@kreuzberg/node';
899
+ *
900
+ * const config = ExtractionConfig.default()
901
+ * .withChunking({ maxChars: 2048 })
902
+ * .withOcr({ backend: 'tesseract', language: 'eng' })
903
+ * .build();
904
+ * ```
905
+ */
906
+ default() {
907
+ return ExtractionConfigBuilder.default();
908
+ },
776
909
  /**
777
910
  * Load extraction configuration from a file.
778
911
  *
@@ -838,9 +971,9 @@ function detectMimeType(bytes) {
838
971
  const binding2 = getBinding();
839
972
  return binding2.detectMimeTypeFromBytes(bytes);
840
973
  }
841
- function detectMimeTypeFromPath(path) {
974
+ function detectMimeTypeFromPath(filePath, checkExists) {
842
975
  const binding2 = getBinding();
843
- return binding2.detectMimeTypeFromPath(path);
976
+ return binding2.detectMimeTypeFromPath(filePath, checkExists);
844
977
  }
845
978
  function validateMimeType(mimeType) {
846
979
  const binding2 = getBinding();
@@ -881,7 +1014,7 @@ function classifyError(errorMessage) {
881
1014
  const result = binding2.classifyError(errorMessage);
882
1015
  return result;
883
1016
  }
884
- const __version__ = "4.0.0-rc.16";
1017
+ const __version__ = "4.0.0-rc.17";
885
1018
  // Annotate the CommonJS export names for ESM import in node:
886
1019
  0 && (module.exports = {
887
1020
  CacheError,