@kreuzberg/node 4.0.0-rc.8 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1,3 +1,4 @@
1
+ import { readFileSync } from "node:fs";
1
2
  import { createRequire } from "node:module";
2
3
  import {
3
4
  CacheError,
@@ -70,17 +71,50 @@ function __resetBindingForTests() {
70
71
  bindingInitialized = false;
71
72
  }
72
73
  function loadNativeBinding() {
73
- const localRequire = typeof require !== "undefined" ? (
74
- // biome-ignore lint/suspicious/noExplicitAny: Node typings are available at runtime
75
- require
76
- ) : createRequire(import.meta.url);
74
+ let localRequire;
75
+ if (typeof require !== "undefined") {
76
+ localRequire = require;
77
+ } else {
78
+ try {
79
+ localRequire = createRequire(import.meta.url);
80
+ } catch {
81
+ localRequire = void 0;
82
+ }
83
+ }
77
84
  if (!localRequire) {
78
85
  throw new Error("Unable to resolve native binding loader (require not available).");
79
86
  }
80
- return localRequire("../index.js");
87
+ const loadedModule = localRequire("../index.js");
88
+ if (typeof loadedModule !== "object" || loadedModule === null) {
89
+ throw new Error(
90
+ "Native binding is not a valid object. Ensure the native module is properly built and compatible."
91
+ );
92
+ }
93
+ const module = loadedModule;
94
+ const requiredMethods = [
95
+ "extractFileSync",
96
+ "extractFile",
97
+ "extractBytesSync",
98
+ "extractBytes",
99
+ "batchExtractFilesSync",
100
+ "batchExtractFiles",
101
+ "batchExtractBytesSync",
102
+ "batchExtractBytes"
103
+ ];
104
+ for (const method of requiredMethods) {
105
+ if (typeof module[method] !== "function") {
106
+ throw new Error(
107
+ `Native binding is missing required method: ${method}. Ensure the native module is properly built and compatible.`
108
+ );
109
+ }
110
+ }
111
+ return module;
81
112
  }
82
113
  function getBinding() {
83
114
  if (bindingInitialized) {
115
+ if (binding === null) {
116
+ throw new Error("Native binding was previously failed to load.");
117
+ }
84
118
  return binding;
85
119
  }
86
120
  try {
@@ -90,6 +124,7 @@ function getBinding() {
90
124
  return binding;
91
125
  }
92
126
  } catch (error) {
127
+ bindingInitialized = true;
93
128
  throw createNativeBindingError(error);
94
129
  }
95
130
  throw new Error(
@@ -98,7 +133,11 @@ function getBinding() {
98
133
  }
99
134
  function parseMetadata(metadataStr) {
100
135
  try {
101
- return JSON.parse(metadataStr);
136
+ const parsed = JSON.parse(metadataStr);
137
+ if (typeof parsed === "object" && parsed !== null) {
138
+ return parsed;
139
+ }
140
+ return {};
102
141
  } catch {
103
142
  return {};
104
143
  }
@@ -116,7 +155,7 @@ function ensureUint8Array(value) {
116
155
  return new Uint8Array();
117
156
  }
118
157
  function convertChunk(rawChunk) {
119
- if (!rawChunk) {
158
+ if (!rawChunk || typeof rawChunk !== "object") {
120
159
  return {
121
160
  content: "",
122
161
  metadata: {
@@ -129,23 +168,33 @@ function convertChunk(rawChunk) {
129
168
  embedding: null
130
169
  };
131
170
  }
132
- const metadata = rawChunk.metadata ?? {};
171
+ const chunk = rawChunk;
172
+ const metadata = chunk["metadata"] ?? {};
133
173
  return {
134
- content: rawChunk.content ?? "",
135
- embedding: rawChunk.embedding ?? null,
174
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
175
+ content: chunk["content"] ?? "",
176
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
177
+ embedding: chunk["embedding"] ?? null,
136
178
  metadata: {
137
- byteStart: metadata.byte_start ?? metadata.charStart ?? 0,
138
- byteEnd: metadata.byte_end ?? metadata.charEnd ?? 0,
139
- tokenCount: metadata.token_count ?? metadata.tokenCount ?? null,
140
- chunkIndex: metadata.chunk_index ?? metadata.chunkIndex ?? 0,
141
- totalChunks: metadata.total_chunks ?? metadata.totalChunks ?? 0,
142
- firstPage: metadata.first_page ?? metadata.firstPage ?? null,
143
- lastPage: metadata.last_page ?? metadata.lastPage ?? null
179
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
180
+ byteStart: metadata["byte_start"] ?? metadata["charStart"] ?? 0,
181
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
182
+ byteEnd: metadata["byte_end"] ?? metadata["charEnd"] ?? 0,
183
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
184
+ tokenCount: metadata["token_count"] ?? metadata["tokenCount"] ?? null,
185
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
186
+ chunkIndex: metadata["chunk_index"] ?? metadata["chunkIndex"] ?? 0,
187
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
188
+ totalChunks: metadata["total_chunks"] ?? metadata["totalChunks"] ?? 0,
189
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
190
+ firstPage: metadata["first_page"] ?? metadata["firstPage"] ?? null,
191
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
192
+ lastPage: metadata["last_page"] ?? metadata["lastPage"] ?? null
144
193
  }
145
194
  };
146
195
  }
147
196
  function convertImage(rawImage) {
148
- if (!rawImage) {
197
+ if (!rawImage || typeof rawImage !== "object") {
149
198
  return {
150
199
  data: new Uint8Array(),
151
200
  format: "unknown",
@@ -160,31 +209,97 @@ function convertImage(rawImage) {
160
209
  ocrResult: null
161
210
  };
162
211
  }
212
+ const image = rawImage;
163
213
  return {
164
- data: ensureUint8Array(rawImage.data),
165
- format: rawImage.format ?? "unknown",
166
- imageIndex: rawImage.imageIndex ?? 0,
167
- pageNumber: rawImage.pageNumber ?? null,
168
- width: rawImage.width ?? null,
169
- height: rawImage.height ?? null,
170
- colorspace: rawImage.colorspace ?? null,
171
- bitsPerComponent: rawImage.bitsPerComponent ?? null,
172
- isMask: rawImage.isMask ?? false,
173
- description: rawImage.description ?? null,
174
- ocrResult: rawImage.ocrResult ? convertResult(rawImage.ocrResult) : null
214
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
215
+ data: ensureUint8Array(image["data"]),
216
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
217
+ format: image["format"] ?? "unknown",
218
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
219
+ imageIndex: image["imageIndex"] ?? 0,
220
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
221
+ pageNumber: image["pageNumber"] ?? null,
222
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
223
+ width: image["width"] ?? null,
224
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
225
+ height: image["height"] ?? null,
226
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
227
+ colorspace: image["colorspace"] ?? null,
228
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
229
+ bitsPerComponent: image["bitsPerComponent"] ?? null,
230
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
231
+ isMask: image["isMask"] ?? false,
232
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
233
+ description: image["description"] ?? null,
234
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
235
+ ocrResult: image["ocrResult"] ? convertResult(image["ocrResult"]) : null
175
236
  };
176
237
  }
177
- function convertResult(rawResult) {
238
+ function convertPageContent(rawPage) {
239
+ if (!rawPage || typeof rawPage !== "object") {
240
+ return {
241
+ pageNumber: 0,
242
+ content: "",
243
+ tables: [],
244
+ images: []
245
+ };
246
+ }
247
+ const page = rawPage;
178
248
  return {
179
- content: rawResult.content,
180
- mimeType: rawResult.mimeType,
181
- metadata: typeof rawResult.metadata === "string" ? parseMetadata(rawResult.metadata) : rawResult.metadata,
182
- tables: rawResult.tables || [],
183
- detectedLanguages: rawResult.detectedLanguages || null,
184
- chunks: Array.isArray(rawResult.chunks) ? rawResult.chunks.map((chunk) => convertChunk(chunk)) : null,
185
- images: Array.isArray(rawResult.images) ? rawResult.images.map((image) => convertImage(image)) : null
249
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
250
+ pageNumber: page["pageNumber"] ?? 0,
251
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
252
+ content: page["content"] ?? "",
253
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
254
+ tables: Array.isArray(page["tables"]) ? page["tables"] : [],
255
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
256
+ images: Array.isArray(page["images"]) ? page["images"].map((image) => convertImage(image)) : []
186
257
  };
187
258
  }
259
+ function convertResult(rawResult) {
260
+ if (!rawResult || typeof rawResult !== "object") {
261
+ return {
262
+ content: "",
263
+ mimeType: "application/octet-stream",
264
+ metadata: {},
265
+ tables: [],
266
+ detectedLanguages: null,
267
+ chunks: null,
268
+ images: null,
269
+ pages: null
270
+ };
271
+ }
272
+ const result = rawResult;
273
+ const metadata = result["metadata"];
274
+ const metadataValue = typeof metadata === "string" ? parseMetadata(metadata) : metadata ?? {};
275
+ const returnObj = {
276
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
277
+ content: result["content"] ?? "",
278
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
279
+ mimeType: result["mimeType"] ?? "application/octet-stream",
280
+ metadata: metadataValue,
281
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
282
+ tables: Array.isArray(result["tables"]) ? result["tables"] : [],
283
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
284
+ detectedLanguages: Array.isArray(result["detectedLanguages"]) ? result["detectedLanguages"] : null,
285
+ chunks: null,
286
+ images: null,
287
+ pages: null
288
+ };
289
+ const chunksData = result["chunks"];
290
+ if (Array.isArray(chunksData)) {
291
+ returnObj.chunks = chunksData.map((chunk) => convertChunk(chunk));
292
+ }
293
+ const imagesData = result["images"];
294
+ if (Array.isArray(imagesData)) {
295
+ returnObj.images = imagesData.map((image) => convertImage(image));
296
+ }
297
+ const pagesData = result["pages"];
298
+ if (Array.isArray(pagesData)) {
299
+ returnObj.pages = pagesData.map((page) => convertPageContent(page));
300
+ }
301
+ return returnObj;
302
+ }
188
303
  function setIfDefined(target, key, value) {
189
304
  if (value !== void 0) {
190
305
  target[key] = value;
@@ -347,9 +462,9 @@ function normalizePageConfig(pages) {
347
462
  return void 0;
348
463
  }
349
464
  const normalized = {};
350
- setIfDefined(normalized, "extract_pages", pages.extractPages);
351
- setIfDefined(normalized, "insert_page_markers", pages.insertPageMarkers);
352
- setIfDefined(normalized, "marker_format", pages.markerFormat);
465
+ setIfDefined(normalized, "extractPages", pages.extractPages);
466
+ setIfDefined(normalized, "insertPageMarkers", pages.insertPageMarkers);
467
+ setIfDefined(normalized, "markerFormat", pages.markerFormat);
353
468
  return normalized;
354
469
  }
355
470
  function normalizeExtractionConfig(config) {
@@ -383,23 +498,59 @@ function normalizeExtractionConfig(config) {
383
498
  setIfDefined(normalized, "htmlOptions", htmlOptions);
384
499
  return normalized;
385
500
  }
386
- function extractFileSync(filePath, mimeType = null, config = null) {
501
+ function extractFileSync(filePath, mimeTypeOrConfig, maybeConfig) {
502
+ let mimeType = null;
503
+ let config = null;
504
+ if (typeof mimeTypeOrConfig === "string") {
505
+ mimeType = mimeTypeOrConfig;
506
+ config = maybeConfig ?? null;
507
+ } else if (mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object") {
508
+ config = mimeTypeOrConfig;
509
+ mimeType = null;
510
+ } else {
511
+ config = maybeConfig ?? null;
512
+ mimeType = null;
513
+ }
387
514
  const normalizedConfig = normalizeExtractionConfig(config);
388
515
  const rawResult = getBinding().extractFileSync(filePath, mimeType, normalizedConfig);
389
516
  return convertResult(rawResult);
390
517
  }
391
- async function extractFile(filePath, mimeType = null, config = null) {
518
+ async function extractFile(filePath, mimeTypeOrConfig, maybeConfig) {
519
+ let mimeType = null;
520
+ let config = null;
521
+ if (typeof mimeTypeOrConfig === "string") {
522
+ mimeType = mimeTypeOrConfig;
523
+ config = maybeConfig ?? null;
524
+ } else if (mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object") {
525
+ config = mimeTypeOrConfig;
526
+ mimeType = null;
527
+ } else {
528
+ config = maybeConfig ?? null;
529
+ mimeType = null;
530
+ }
392
531
  const normalizedConfig = normalizeExtractionConfig(config);
393
532
  const rawResult = await getBinding().extractFile(filePath, mimeType, normalizedConfig);
394
533
  return convertResult(rawResult);
395
534
  }
396
- function extractBytesSync(data, mimeType, config = null) {
535
+ function extractBytesSync(dataOrPath, mimeType, config = null) {
536
+ let data;
537
+ if (typeof dataOrPath === "string") {
538
+ data = readFileSync(dataOrPath);
539
+ } else {
540
+ data = dataOrPath;
541
+ }
397
542
  const validated = assertUint8Array(data, "data");
398
543
  const normalizedConfig = normalizeExtractionConfig(config);
399
544
  const rawResult = getBinding().extractBytesSync(Buffer.from(validated), mimeType, normalizedConfig);
400
545
  return convertResult(rawResult);
401
546
  }
402
- async function extractBytes(data, mimeType, config = null) {
547
+ async function extractBytes(dataOrPath, mimeType, config = null) {
548
+ let data;
549
+ if (typeof dataOrPath === "string") {
550
+ data = readFileSync(dataOrPath);
551
+ } else {
552
+ data = dataOrPath;
553
+ }
403
554
  const validated = assertUint8Array(data, "data");
404
555
  if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
405
556
  console.log("[TypeScript] Debug input header:", Array.from(validated.slice(0, 8)));
@@ -439,8 +590,8 @@ async function batchExtractBytes(dataList, mimeTypes, config = null) {
439
590
  function registerPostProcessor(processor) {
440
591
  const binding2 = getBinding();
441
592
  const wrappedProcessor = {
442
- name: processor.name.bind(processor),
443
- processingStage: processor.processingStage?.bind(processor),
593
+ name: typeof processor.name === "function" ? processor.name() : processor.name,
594
+ processingStage: typeof processor.processingStage === "function" ? processor.processingStage() : processor.processingStage,
444
595
  async process(...args) {
445
596
  const wrappedValue = args[0];
446
597
  const jsonString = wrappedValue[0];
@@ -493,8 +644,8 @@ function listPostProcessors() {
493
644
  function registerValidator(validator) {
494
645
  const binding2 = getBinding();
495
646
  const wrappedValidator = {
496
- name: validator.name.bind(validator),
497
- priority: validator.priority?.bind(validator),
647
+ name: typeof validator.name === "function" ? validator.name() : validator.name,
648
+ priority: typeof validator.priority === "function" ? validator.priority() : validator.priority,
498
649
  async validate(...args) {
499
650
  const jsonString = args[0];
500
651
  if (!jsonString || jsonString === "undefined") {
@@ -543,8 +694,8 @@ function describePayload(value) {
543
694
  function registerOcrBackend(backend) {
544
695
  const binding2 = getBinding();
545
696
  const wrappedBackend = {
546
- name: backend.name.bind(backend),
547
- supportedLanguages: backend.supportedLanguages.bind(backend),
697
+ name: typeof backend.name === "function" ? backend.name() : backend.name,
698
+ supportedLanguages: typeof backend.supportedLanguages === "function" ? backend.supportedLanguages() : backend.supportedLanguages ?? ["en"],
548
699
  async processImage(...processArgs) {
549
700
  const [imagePayload, maybeLanguage] = processArgs;
550
701
  if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
@@ -673,11 +824,11 @@ const ExtractionConfig = {
673
824
  };
674
825
  function detectMimeType(bytes) {
675
826
  const binding2 = getBinding();
676
- return binding2.detectMimeType(bytes);
827
+ return binding2.detectMimeTypeFromBytes(bytes);
677
828
  }
678
- function detectMimeTypeFromPath(path, checkExists) {
829
+ function detectMimeTypeFromPath(filePath, checkExists) {
679
830
  const binding2 = getBinding();
680
- return binding2.detectMimeTypeFromPath(path, checkExists);
831
+ return binding2.detectMimeTypeFromPath(filePath, checkExists);
681
832
  }
682
833
  function validateMimeType(mimeType) {
683
834
  const binding2 = getBinding();
@@ -693,7 +844,8 @@ function listEmbeddingPresets() {
693
844
  }
694
845
  function getEmbeddingPreset(name) {
695
846
  const binding2 = getBinding();
696
- return binding2.getEmbeddingPreset(name);
847
+ const result = binding2.getEmbeddingPreset(name);
848
+ return result;
697
849
  }
698
850
  function getLastErrorCode() {
699
851
  const binding2 = getBinding();
@@ -701,9 +853,70 @@ function getLastErrorCode() {
701
853
  }
702
854
  function getLastPanicContext() {
703
855
  const binding2 = getBinding();
704
- return binding2.getLastPanicContext();
856
+ const result = binding2.getLastPanicContext();
857
+ return result;
858
+ }
859
+ function getErrorCodeName(code) {
860
+ const binding2 = getBinding();
861
+ return binding2.getErrorCodeName(code);
862
+ }
863
+ function getErrorCodeDescription(code) {
864
+ const binding2 = getBinding();
865
+ return binding2.getErrorCodeDescription(code);
866
+ }
867
+ function classifyError(errorMessage) {
868
+ const binding2 = getBinding();
869
+ const result = binding2.classifyError(errorMessage);
870
+ return result;
871
+ }
872
+ function createWorkerPool(size) {
873
+ const binding2 = getBinding();
874
+ const rawPool = binding2.createWorkerPool(size);
875
+ return rawPool;
876
+ }
877
+ function getWorkerPoolStats(pool) {
878
+ const binding2 = getBinding();
879
+ const rawStats = binding2.getWorkerPoolStats(pool);
880
+ return rawStats;
881
+ }
882
+ async function extractFileInWorker(pool, filePath, mimeTypeOrConfig, maybeConfig) {
883
+ let mimeType = null;
884
+ let config = null;
885
+ if (typeof mimeTypeOrConfig === "string") {
886
+ mimeType = mimeTypeOrConfig;
887
+ config = maybeConfig ?? null;
888
+ } else if (mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object") {
889
+ config = mimeTypeOrConfig;
890
+ mimeType = null;
891
+ } else {
892
+ config = maybeConfig ?? null;
893
+ mimeType = null;
894
+ }
895
+ const normalizedConfig = normalizeExtractionConfig(config);
896
+ const binding2 = getBinding();
897
+ const rawResult = await binding2.extractFileInWorker(
898
+ pool,
899
+ filePath,
900
+ mimeType,
901
+ normalizedConfig
902
+ );
903
+ return convertResult(rawResult);
904
+ }
905
+ async function batchExtractFilesInWorker(pool, paths, config = null) {
906
+ const normalizedConfig = normalizeExtractionConfig(config);
907
+ const binding2 = getBinding();
908
+ const rawResults = await binding2.batchExtractFilesInWorker(
909
+ pool,
910
+ paths,
911
+ normalizedConfig
912
+ );
913
+ return rawResults.map(convertResult);
914
+ }
915
+ async function closeWorkerPool(pool) {
916
+ const binding2 = getBinding();
917
+ await binding2.closeWorkerPool(pool);
705
918
  }
706
- const __version__ = "4.0.0-rc.8";
919
+ const __version__ = "4.0.0";
707
920
  export {
708
921
  CacheError,
709
922
  ErrorCode,
@@ -722,21 +935,29 @@ export {
722
935
  batchExtractBytes,
723
936
  batchExtractBytesSync,
724
937
  batchExtractFiles,
938
+ batchExtractFilesInWorker,
725
939
  batchExtractFilesSync,
940
+ classifyError,
726
941
  clearDocumentExtractors,
727
942
  clearOcrBackends,
728
943
  clearPostProcessors,
729
944
  clearValidators,
945
+ closeWorkerPool,
946
+ createWorkerPool,
730
947
  detectMimeType,
731
948
  detectMimeTypeFromPath,
732
949
  extractBytes,
733
950
  extractBytesSync,
734
951
  extractFile,
952
+ extractFileInWorker,
735
953
  extractFileSync,
736
954
  getEmbeddingPreset,
955
+ getErrorCodeDescription,
956
+ getErrorCodeName,
737
957
  getExtensionsForMime,
738
958
  getLastErrorCode,
739
959
  getLastPanicContext,
960
+ getWorkerPoolStats,
740
961
  listDocumentExtractors,
741
962
  listEmbeddingPresets,
742
963
  listOcrBackends,