@kreuzberg/node 4.0.0-rc.16 → 4.0.0-rc.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1,3 +1,4 @@
1
+ import { readFileSync } from "node:fs";
1
2
  import { createRequire } from "node:module";
2
3
  import {
3
4
  CacheError,
@@ -254,15 +255,15 @@ function convertResult(rawResult) {
254
255
  metadata: {},
255
256
  tables: [],
256
257
  detectedLanguages: null,
257
- chunks: null,
258
- images: null,
259
- pages: null
258
+ chunks: void 0,
259
+ images: void 0,
260
+ pages: void 0
260
261
  };
261
262
  }
262
263
  const result = rawResult;
263
264
  const metadata = result["metadata"];
264
265
  const metadataValue = typeof metadata === "string" ? parseMetadata(metadata) : metadata ?? {};
265
- return {
266
+ const returnObj = {
266
267
  // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
267
268
  content: result["content"] ?? "",
268
269
  // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
@@ -272,19 +273,23 @@ function convertResult(rawResult) {
272
273
  tables: Array.isArray(result["tables"]) ? result["tables"] : [],
273
274
  // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
274
275
  detectedLanguages: Array.isArray(result["detectedLanguages"]) ? result["detectedLanguages"] : null,
275
- chunks: (() => {
276
- const chunksData = result["chunks"];
277
- return Array.isArray(chunksData) ? chunksData.map((chunk) => convertChunk(chunk)) : null;
278
- })(),
279
- images: (() => {
280
- const imagesData = result["images"];
281
- return Array.isArray(imagesData) ? imagesData.map((image) => convertImage(image)) : null;
282
- })(),
283
- pages: (() => {
284
- const pagesData = result["pages"];
285
- return Array.isArray(pagesData) ? pagesData.map((page) => convertPageContent(page)) : null;
286
- })()
276
+ chunks: void 0,
277
+ images: void 0,
278
+ pages: void 0
287
279
  };
280
+ const chunksData = result["chunks"];
281
+ if (Array.isArray(chunksData)) {
282
+ returnObj.chunks = chunksData.map((chunk) => convertChunk(chunk));
283
+ }
284
+ const imagesData = result["images"];
285
+ if (Array.isArray(imagesData)) {
286
+ returnObj.images = imagesData.map((image) => convertImage(image));
287
+ }
288
+ const pagesData = result["pages"];
289
+ if (Array.isArray(pagesData)) {
290
+ returnObj.pages = pagesData.map((page) => convertPageContent(page));
291
+ }
292
+ return returnObj;
288
293
  }
289
294
  function setIfDefined(target, key, value) {
290
295
  if (value !== void 0) {
@@ -484,23 +489,59 @@ function normalizeExtractionConfig(config) {
484
489
  setIfDefined(normalized, "htmlOptions", htmlOptions);
485
490
  return normalized;
486
491
  }
487
- function extractFileSync(filePath, mimeType = null, config = null) {
492
+ function extractFileSync(filePath, mimeTypeOrConfig, maybeConfig) {
493
+ let mimeType = null;
494
+ let config = null;
495
+ if (typeof mimeTypeOrConfig === "string") {
496
+ mimeType = mimeTypeOrConfig;
497
+ config = maybeConfig ?? null;
498
+ } else if (mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object") {
499
+ config = mimeTypeOrConfig;
500
+ mimeType = null;
501
+ } else {
502
+ config = maybeConfig ?? null;
503
+ mimeType = null;
504
+ }
488
505
  const normalizedConfig = normalizeExtractionConfig(config);
489
506
  const rawResult = getBinding().extractFileSync(filePath, mimeType, normalizedConfig);
490
507
  return convertResult(rawResult);
491
508
  }
492
- async function extractFile(filePath, mimeType = null, config = null) {
509
+ async function extractFile(filePath, mimeTypeOrConfig, maybeConfig) {
510
+ let mimeType = null;
511
+ let config = null;
512
+ if (typeof mimeTypeOrConfig === "string") {
513
+ mimeType = mimeTypeOrConfig;
514
+ config = maybeConfig ?? null;
515
+ } else if (mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object") {
516
+ config = mimeTypeOrConfig;
517
+ mimeType = null;
518
+ } else {
519
+ config = maybeConfig ?? null;
520
+ mimeType = null;
521
+ }
493
522
  const normalizedConfig = normalizeExtractionConfig(config);
494
523
  const rawResult = await getBinding().extractFile(filePath, mimeType, normalizedConfig);
495
524
  return convertResult(rawResult);
496
525
  }
497
- function extractBytesSync(data, mimeType, config = null) {
526
+ function extractBytesSync(dataOrPath, mimeType, config = null) {
527
+ let data;
528
+ if (typeof dataOrPath === "string") {
529
+ data = readFileSync(dataOrPath);
530
+ } else {
531
+ data = dataOrPath;
532
+ }
498
533
  const validated = assertUint8Array(data, "data");
499
534
  const normalizedConfig = normalizeExtractionConfig(config);
500
535
  const rawResult = getBinding().extractBytesSync(Buffer.from(validated), mimeType, normalizedConfig);
501
536
  return convertResult(rawResult);
502
537
  }
503
- async function extractBytes(data, mimeType, config = null) {
538
+ async function extractBytes(dataOrPath, mimeType, config = null) {
539
+ let data;
540
+ if (typeof dataOrPath === "string") {
541
+ data = readFileSync(dataOrPath);
542
+ } else {
543
+ data = dataOrPath;
544
+ }
504
545
  const validated = assertUint8Array(data, "data");
505
546
  if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
506
547
  console.log("[TypeScript] Debug input header:", Array.from(validated.slice(0, 8)));
@@ -540,8 +581,8 @@ async function batchExtractBytes(dataList, mimeTypes, config = null) {
540
581
  function registerPostProcessor(processor) {
541
582
  const binding2 = getBinding();
542
583
  const wrappedProcessor = {
543
- name: processor.name.bind(processor),
544
- processingStage: processor.processingStage?.bind(processor),
584
+ name: typeof processor.name === "function" ? processor.name() : processor.name,
585
+ processingStage: typeof processor.processingStage === "function" ? processor.processingStage() : processor.processingStage,
545
586
  async process(...args) {
546
587
  const wrappedValue = args[0];
547
588
  const jsonString = wrappedValue[0];
@@ -594,8 +635,8 @@ function listPostProcessors() {
594
635
  function registerValidator(validator) {
595
636
  const binding2 = getBinding();
596
637
  const wrappedValidator = {
597
- name: validator.name.bind(validator),
598
- priority: validator.priority?.bind(validator),
638
+ name: typeof validator.name === "function" ? validator.name() : validator.name,
639
+ priority: typeof validator.priority === "function" ? validator.priority() : validator.priority,
599
640
  async validate(...args) {
600
641
  const jsonString = args[0];
601
642
  if (!jsonString || jsonString === "undefined") {
@@ -644,8 +685,8 @@ function describePayload(value) {
644
685
  function registerOcrBackend(backend) {
645
686
  const binding2 = getBinding();
646
687
  const wrappedBackend = {
647
- name: backend.name.bind(backend),
648
- supportedLanguages: backend.supportedLanguages.bind(backend),
688
+ name: typeof backend.name === "function" ? backend.name() : backend.name,
689
+ supportedLanguages: typeof backend.supportedLanguages === "function" ? backend.supportedLanguages() : backend.supportedLanguages ?? ["en"],
649
690
  async processImage(...processArgs) {
650
691
  const [imagePayload, maybeLanguage] = processArgs;
651
692
  if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
@@ -710,7 +751,99 @@ function clearDocumentExtractors() {
710
751
  const binding2 = getBinding();
711
752
  binding2.clearDocumentExtractors();
712
753
  }
754
+ class ExtractionConfigBuilder {
755
+ config = {};
756
+ /**
757
+ * Create a new builder with default configuration.
758
+ */
759
+ static default() {
760
+ return new ExtractionConfigBuilder();
761
+ }
762
+ /**
763
+ * Set OCR configuration.
764
+ */
765
+ withOcr(ocr) {
766
+ this.config["ocr"] = ocr;
767
+ return this;
768
+ }
769
+ /**
770
+ * Set chunking configuration.
771
+ */
772
+ withChunking(chunking) {
773
+ this.config["chunking"] = chunking;
774
+ return this;
775
+ }
776
+ /**
777
+ * Set image extraction configuration.
778
+ */
779
+ withImageExtraction(images) {
780
+ this.config["imageExtraction"] = images;
781
+ return this;
782
+ }
783
+ /**
784
+ * Set PDF configuration.
785
+ */
786
+ withPdf(pdf) {
787
+ this.config["pdf"] = pdf;
788
+ return this;
789
+ }
790
+ /**
791
+ * Set keyword extraction configuration.
792
+ */
793
+ withKeywords(keywords) {
794
+ this.config["keywords"] = keywords;
795
+ return this;
796
+ }
797
+ /**
798
+ * Set language detection configuration.
799
+ */
800
+ withLanguageDetection(languageDetection) {
801
+ this.config["languageDetection"] = languageDetection;
802
+ return this;
803
+ }
804
+ /**
805
+ * Set whether to enable metadata extraction.
806
+ */
807
+ withMetadataExtraction(enabled) {
808
+ this.config["metadataExtraction"] = enabled;
809
+ return this;
810
+ }
811
+ /**
812
+ * Set whether to enable quality mode.
813
+ */
814
+ withQualityMode(enabled) {
815
+ this.config["qualityMode"] = enabled;
816
+ return this;
817
+ }
818
+ /**
819
+ * Build and return the final ExtractionConfig object.
820
+ */
821
+ build() {
822
+ return this.config;
823
+ }
824
+ }
713
825
  const ExtractionConfig = {
826
+ /**
827
+ * Create a default extraction configuration using the builder pattern.
828
+ *
829
+ * Returns a builder object that allows you to configure extraction settings
830
+ * using method chaining.
831
+ *
832
+ * @returns ExtractionConfigBuilder for chaining configuration calls
833
+ *
834
+ * @example
835
+ * ```typescript
836
+ * import { ExtractionConfig } from '@kreuzberg/node';
837
+ *
838
+ * const config = ExtractionConfig.default()
839
+ * .withChunking({ maxChars: 2048 })
840
+ * .withOcr({ backend: 'tesseract', language: 'eng' })
841
+ * .build();
842
+ * ```
843
+ */
844
+ default() {
845
+ return ExtractionConfigBuilder.default();
846
+ },
714
847
  /**
715
848
  * Load extraction configuration from a file.
716
849
  *
@@ -776,9 +909,9 @@ function detectMimeType(bytes) {
776
909
  const binding2 = getBinding();
777
910
  return binding2.detectMimeTypeFromBytes(bytes);
778
911
  }
779
- function detectMimeTypeFromPath(path) {
912
+ function detectMimeTypeFromPath(filePath, checkExists) {
780
913
  const binding2 = getBinding();
781
- return binding2.detectMimeTypeFromPath(path);
914
+ return binding2.detectMimeTypeFromPath(filePath, checkExists);
782
915
  }
783
916
  function validateMimeType(mimeType) {
784
917
  const binding2 = getBinding();
@@ -819,7 +952,7 @@ function classifyError(errorMessage) {
819
952
  const result = binding2.classifyError(errorMessage);
820
953
  return result;
821
954
  }
822
- const __version__ = "4.0.0-rc.16";
955
+ const __version__ = "4.0.0-rc.18";
823
956
  export {
824
957
  CacheError,
825
958
  ErrorCode,