@kreuzberg/node 4.0.0-rc.6 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs ADDED
@@ -0,0 +1,975 @@
1
+ import { readFileSync } from "node:fs";
2
+ import { createRequire } from "node:module";
3
+ import {
4
+ CacheError,
5
+ ErrorCode,
6
+ ImageProcessingError,
7
+ KreuzbergError,
8
+ MissingDependencyError,
9
+ OcrError,
10
+ ParsingError,
11
+ PluginError,
12
+ ValidationError
13
+ } from "./errors.js";
14
+ import { GutenOcrBackend } from "./ocr/guten-ocr.js";
15
+ export * from "./types.js";
16
// Module-level cache for the native binding object; stays null until a load succeeds
// or a test injects a mock.
+ let binding = null;
17
// True once a load has been attempted (successfully or not) or a mock was injected,
// so the loader is not re-run on every call.
+ let bindingInitialized = false;
18
/**
 * Wrap a failure to load the native module in a descriptive Error.
 *
 * The thrown value is always preserved as `cause`, and the pdfium hint is
 * added whenever the failure text mentions pdfium, regardless of whether the
 * thrown value is an Error instance or some other value (e.g. a string).
 *
 * @param {unknown} error - Value thrown while requiring the native module.
 * @returns {Error} Error carrying a user-facing message and the original cause.
 */
function createNativeBindingError(error) {
  const detail = error instanceof Error ? error.message || error.toString() : String(error);
  const parts = ["Failed to load Kreuzberg native bindings."];
  if (/pdfium/i.test(detail)) {
    parts.push(
      "Pdfium runtime library was not found. Ensure the bundled libpdfium (dll/dylib/so) is present next to the native module."
    );
  }
  parts.push(
    "Report this error and attach the logs/stack trace for investigation.",
    `Underlying error: ${detail}`
  );
  return new Error(parts.join(" "), { cause: error });
}
46
/**
 * Return `value` unchanged when it is a Uint8Array (Buffer included).
 *
 * @param {unknown} value - Candidate value.
 * @param {string} name - Label used in the error message.
 * @returns {Uint8Array} The same `value` reference.
 * @throws {TypeError} When `value` is not a Uint8Array.
 */
function assertUint8Array(value, name) {
  if (value instanceof Uint8Array) {
    return value;
  }
  throw new TypeError(`${name} must be a Uint8Array`);
}
52
/**
 * Validate that `values` is an array whose every element is a Uint8Array.
 *
 * @param {unknown} values - Candidate list.
 * @param {string} name - Label used in error messages; elements are reported as `name[index]`.
 * @returns {Uint8Array[]} A new array holding the validated elements.
 * @throws {TypeError} When `values` is not an array or any element is not a Uint8Array.
 */
function assertUint8ArrayList(values, name) {
  if (!Array.isArray(values)) {
    throw new TypeError(`${name} must be an array of Uint8Array`);
  }
  // The element check already raises a TypeError worded `${name}[${index}] must be a Uint8Array`.
  return values.map((entry, position) => assertUint8Array(entry, `${name}[${position}]`));
}
65
/**
 * Test hook: install `mock` as the binding and mark loading as done so
 * getBinding() will not attempt to load the native module.
 *
 * @param {object} mock - Replacement binding object.
 */
function __setBindingForTests(mock) {
  bindingInitialized = true;
  binding = mock;
}
69
/**
 * Test hook: forget any cached binding so the next getBinding() call
 * attempts a fresh load.
 */
function __resetBindingForTests() {
  bindingInitialized = false;
  binding = null;
}
73
/**
 * Require the native module (`../index.js`) and verify it exposes the
 * extraction entry points this wrapper depends on.
 *
 * @returns {object} The loaded native module.
 * @throws {Error} When no `require` can be obtained, the module is not an
 *   object, or one of the required methods is missing.
 */
function loadNativeBinding() {
  // Prefer an ambient `require`; otherwise build one relative to this module.
  const resolveRequire = () => {
    if (typeof require !== "undefined") {
      return require;
    }
    try {
      return createRequire(import.meta.url);
    } catch {
      return void 0;
    }
  };

  const localRequire = resolveRequire();
  if (!localRequire) {
    throw new Error("Unable to resolve native binding loader (require not available).");
  }

  const nativeModule = localRequire("../index.js");
  if (nativeModule === null || typeof nativeModule !== "object") {
    throw new Error(
      "Native binding is not a valid object. Ensure the native module is properly built and compatible."
    );
  }

  const requiredMethods = [
    "extractFileSync",
    "extractFile",
    "extractBytesSync",
    "extractBytes",
    "batchExtractFilesSync",
    "batchExtractFiles",
    "batchExtractBytesSync",
    "batchExtractBytes"
  ];
  const missing = requiredMethods.find((method) => typeof nativeModule[method] !== "function");
  if (missing !== void 0) {
    throw new Error(
      `Native binding is missing required method: ${missing}. Ensure the native module is properly built and compatible.`
    );
  }
  return nativeModule;
}
113
// Wrapped error from the most recent failed load attempt, kept so later calls
// can expose the real reason instead of a bare "previously failed" message.
let bindingLoadError = null;

/**
 * Return the native binding, loading it on first use.
 *
 * A load is attempted at most once: after a failure, subsequent calls throw
 * immediately, with the original load error attached as `cause` when known.
 *
 * @returns {object} The native (or test-injected) binding.
 * @throws {Error} When the binding failed to load, or when not running under
 *   a Node.js-compatible runtime.
 */
function getBinding() {
  if (bindingInitialized) {
    if (binding === null) {
      throw bindingLoadError
        ? new Error("Native binding previously failed to load.", { cause: bindingLoadError })
        : new Error("Native binding previously failed to load.");
    }
    return binding;
  }

  const isNodeLike = typeof process !== "undefined" && process.versions && process.versions.node;
  if (!isNodeLike) {
    throw new Error(
      "Failed to load Kreuzberg bindings. Neither NAPI (Node.js) nor WASM (browsers/Deno) bindings are available. Make sure you have installed the @kreuzberg/node package for Node.js/Bun."
    );
  }

  try {
    binding = loadNativeBinding();
    bindingLoadError = null;
    bindingInitialized = true;
    return binding;
  } catch (error) {
    // Mark as initialized so the (failing) load is not retried on every call.
    bindingInitialized = true;
    bindingLoadError = createNativeBindingError(error);
    throw bindingLoadError;
  }
}
134
/**
 * Parse a JSON metadata string, falling back to an empty object.
 *
 * @param {string} metadataStr - JSON text.
 * @returns {object} The parsed value when it is a non-null object (arrays
 *   included); otherwise `{}` — also when the text is not valid JSON.
 */
function parseMetadata(metadataStr) {
  let decoded;
  try {
    decoded = JSON.parse(metadataStr);
  } catch {
    // Malformed metadata is deliberately treated as "no metadata".
    return {};
  }
  return decoded !== null && typeof decoded === "object" ? decoded : {};
}
145
/**
 * Coerce a binary-like value to a Uint8Array.
 *
 * - Uint8Array (which includes Buffer) is returned as-is.
 * - Other ArrayBuffer views and raw ArrayBuffers are wrapped without copying,
 *   instead of being silently replaced by empty data.
 * - Plain arrays of numbers are copied into a new Uint8Array.
 * - Anything else yields an empty Uint8Array.
 *
 * @param {unknown} value - Candidate binary payload.
 * @returns {Uint8Array} Byte view of `value`, or an empty array when unsupported.
 */
function ensureUint8Array(value) {
  // Buffer extends Uint8Array, so this single check covers both.
  if (value instanceof Uint8Array) {
    return value;
  }
  if (ArrayBuffer.isView(value)) {
    return new Uint8Array(value.buffer, value.byteOffset, value.byteLength);
  }
  if (value instanceof ArrayBuffer) {
    return new Uint8Array(value);
  }
  if (Array.isArray(value)) {
    return new Uint8Array(value);
  }
  return new Uint8Array();
}
157
/**
 * Normalise a raw chunk object into the public chunk shape.
 *
 * Metadata fields are looked up under their snake_case name first, then their
 * camelCase name. Byte offsets additionally fall back to the legacy
 * `charStart` / `charEnd` keys. (Previously `byteStart` / `byteEnd` were never
 * consulted, so camelCase offsets were reported as 0.)
 *
 * @param {unknown} rawChunk - Chunk as received from the binding.
 * @returns {{content: string, embedding: unknown, metadata: object}} Normalised chunk;
 *   a zeroed placeholder when `rawChunk` is not an object.
 */
function convertChunk(rawChunk) {
  if (!rawChunk || typeof rawChunk !== "object") {
    return {
      content: "",
      metadata: {
        byteStart: 0,
        byteEnd: 0,
        tokenCount: null,
        chunkIndex: 0,
        totalChunks: 0
      },
      embedding: null
    };
  }
  const chunk = rawChunk;
  const metadata = chunk["metadata"] ?? {};
  // First non-nullish value among `keys`, else `fallback`.
  const pick = (fallback, ...keys) => {
    for (const key of keys) {
      const candidate = metadata[key];
      if (candidate !== undefined && candidate !== null) {
        return candidate;
      }
    }
    return fallback;
  };
  return {
    content: chunk["content"] ?? "",
    embedding: chunk["embedding"] ?? null,
    metadata: {
      byteStart: pick(0, "byte_start", "byteStart", "charStart"),
      byteEnd: pick(0, "byte_end", "byteEnd", "charEnd"),
      tokenCount: pick(null, "token_count", "tokenCount"),
      chunkIndex: pick(0, "chunk_index", "chunkIndex"),
      totalChunks: pick(0, "total_chunks", "totalChunks"),
      firstPage: pick(null, "first_page", "firstPage"),
      lastPage: pick(null, "last_page", "lastPage")
    }
  };
}
196
/**
 * Normalise a raw image object into the public image shape.
 *
 * @param {unknown} rawImage - Image as received from the binding.
 * @returns {object} Image with every field present; missing fields take their
 *   defaults, and a non-object input yields an all-defaults placeholder.
 */
function convertImage(rawImage) {
  if (rawImage === null || typeof rawImage !== "object") {
    return {
      data: new Uint8Array(),
      format: "unknown",
      imageIndex: 0,
      pageNumber: null,
      width: null,
      height: null,
      colorspace: null,
      bitsPerComponent: null,
      isMask: false,
      description: null,
      ocrResult: null
    };
  }
  // Read a field, substituting `fallback` only for null/undefined.
  const field = (key, fallback) => rawImage[key] ?? fallback;
  const nestedOcr = rawImage["ocrResult"];
  return {
    data: ensureUint8Array(rawImage["data"]),
    format: field("format", "unknown"),
    imageIndex: field("imageIndex", 0),
    pageNumber: field("pageNumber", null),
    width: field("width", null),
    height: field("height", null),
    colorspace: field("colorspace", null),
    bitsPerComponent: field("bitsPerComponent", null),
    isMask: field("isMask", false),
    description: field("description", null),
    // A nested OCR result is itself a full extraction result.
    ocrResult: nestedOcr ? convertResult(nestedOcr) : null
  };
}
238
/**
 * Normalise a raw page object into the public page shape.
 *
 * @param {unknown} rawPage - Page as received from the binding.
 * @returns {{pageNumber: number, content: string, tables: Array, images: Array}}
 *   Page with defaults applied; images are converted via convertImage.
 */
function convertPageContent(rawPage) {
  if (rawPage === null || typeof rawPage !== "object") {
    return { pageNumber: 0, content: "", tables: [], images: [] };
  }
  const rawTables = rawPage["tables"];
  const rawImages = rawPage["images"];
  return {
    pageNumber: rawPage["pageNumber"] ?? 0,
    content: rawPage["content"] ?? "",
    tables: Array.isArray(rawTables) ? rawTables : [],
    images: Array.isArray(rawImages) ? rawImages.map((entry) => convertImage(entry)) : []
  };
}
259
/**
 * Normalise a raw extraction result into the public result shape.
 *
 * String metadata is JSON-parsed; `chunks`, `images` and `pages` are converted
 * element-wise when they are arrays and are `null` otherwise.
 *
 * @param {unknown} rawResult - Result as received from the binding.
 * @returns {object} Result with every field present; an empty placeholder
 *   result when `rawResult` is not an object.
 */
function convertResult(rawResult) {
  if (rawResult === null || typeof rawResult !== "object") {
    return {
      content: "",
      mimeType: "application/octet-stream",
      metadata: {},
      tables: [],
      detectedLanguages: null,
      chunks: null,
      images: null,
      pages: null
    };
  }
  const rawMetadata = rawResult["metadata"];
  let metadata;
  if (typeof rawMetadata === "string") {
    metadata = parseMetadata(rawMetadata);
  } else {
    metadata = rawMetadata ?? {};
  }
  // Convert each element when `list` is an array; otherwise report null.
  const mapOrNull = (list, convert) => (Array.isArray(list) ? list.map((item) => convert(item)) : null);
  const rawTables = rawResult["tables"];
  const rawLanguages = rawResult["detectedLanguages"];
  return {
    content: rawResult["content"] ?? "",
    mimeType: rawResult["mimeType"] ?? "application/octet-stream",
    metadata,
    tables: Array.isArray(rawTables) ? rawTables : [],
    detectedLanguages: Array.isArray(rawLanguages) ? rawLanguages : null,
    chunks: mapOrNull(rawResult["chunks"], convertChunk),
    images: mapOrNull(rawResult["images"], convertImage),
    pages: mapOrNull(rawResult["pages"], convertPageContent)
  };
}
303
/**
 * Assign `target[key] = value` unless `value` is `undefined`.
 * `null` and other falsy values are still assigned.
 *
 * @param {object} target - Object to mutate.
 * @param {string} key - Property name.
 * @param {unknown} value - Value to assign when defined.
 */
function setIfDefined(target, key, value) {
  if (value === void 0) {
    return;
  }
  target[key] = value;
}
308
/**
 * Copy the recognised Tesseract options that are defined on `config`.
 *
 * @param {object|null|undefined} config - Tesseract options.
 * @returns {object|undefined} New object with the defined options, or
 *   `undefined` when `config` is falsy.
 */
function normalizeTesseractConfig(config) {
  if (!config) {
    return void 0;
  }
  const normalized = {};
  for (const key of ["psm", "enableTableDetection", "tesseditCharWhitelist"]) {
    setIfDefined(normalized, key, config[key]);
  }
  return normalized;
}
318
/**
 * Build the OCR configuration passed to the binding.
 *
 * `backend` is always copied (even when undefined); `language` and the
 * normalised `tesseractConfig` are copied only when defined.
 *
 * @param {object|null|undefined} ocr - OCR options.
 * @returns {object|undefined} Normalised options, or `undefined` when `ocr` is falsy.
 */
function normalizeOcrConfig(ocr) {
  if (!ocr) {
    return void 0;
  }
  const normalized = { backend: ocr.backend };
  setIfDefined(normalized, "language", ocr.language);
  // The tesseract normaliser returns either an object or undefined, so no extra guard is needed.
  setIfDefined(normalized, "tesseractConfig", normalizeTesseractConfig(ocr.tesseractConfig));
  return normalized;
}
332
/**
 * Copy the recognised chunking options that are defined on `chunking`.
 *
 * @param {object|null|undefined} chunking - Chunking options.
 * @returns {object|undefined} New object with the defined options, or
 *   `undefined` when `chunking` is falsy.
 */
function normalizeChunkingConfig(chunking) {
  if (!chunking) {
    return void 0;
  }
  const normalized = {};
  for (const key of ["maxChars", "maxOverlap", "preset", "embedding", "enabled"]) {
    setIfDefined(normalized, key, chunking[key]);
  }
  return normalized;
}
344
/**
 * Copy the recognised image-extraction options that are defined on `images`.
 *
 * @param {object|null|undefined} images - Image extraction options.
 * @returns {object|undefined} New object with the defined options, or
 *   `undefined` when `images` is falsy.
 */
function normalizeImageExtractionConfig(images) {
  if (!images) {
    return void 0;
  }
  const normalized = {};
  const keys = ["extractImages", "targetDpi", "maxImageDimension", "autoAdjustDpi", "minDpi", "maxDpi"];
  for (const key of keys) {
    setIfDefined(normalized, key, images[key]);
  }
  return normalized;
}
357
/**
 * Copy the recognised PDF options that are defined on `pdf`.
 *
 * @param {object|null|undefined} pdf - PDF options.
 * @returns {object|undefined} New object with the defined options, or
 *   `undefined` when `pdf` is falsy.
 */
function normalizePdfConfig(pdf) {
  if (!pdf) {
    return void 0;
  }
  const normalized = {};
  for (const key of ["extractImages", "passwords", "extractMetadata"]) {
    setIfDefined(normalized, key, pdf[key]);
  }
  return normalized;
}
367
/**
 * Copy the recognised token-reduction options that are defined on `tokenReduction`.
 *
 * @param {object|null|undefined} tokenReduction - Token reduction options.
 * @returns {object|undefined} New object with the defined options, or
 *   `undefined` when `tokenReduction` is falsy.
 */
function normalizeTokenReductionConfig(tokenReduction) {
  if (!tokenReduction) {
    return void 0;
  }
  const normalized = {};
  for (const key of ["mode", "preserveImportantWords"]) {
    setIfDefined(normalized, key, tokenReduction[key]);
  }
  return normalized;
}
376
/**
 * Copy the recognised language-detection options that are defined on `languageDetection`.
 *
 * @param {object|null|undefined} languageDetection - Language detection options.
 * @returns {object|undefined} New object with the defined options, or
 *   `undefined` when `languageDetection` is falsy.
 */
function normalizeLanguageDetectionConfig(languageDetection) {
  if (!languageDetection) {
    return void 0;
  }
  const normalized = {};
  for (const key of ["enabled", "minConfidence", "detectMultiple"]) {
    setIfDefined(normalized, key, languageDetection[key]);
  }
  return normalized;
}
386
/**
 * Copy the recognised post-processor options that are defined on `postprocessor`.
 *
 * @param {object|null|undefined} postprocessor - Post-processor options.
 * @returns {object|undefined} New object with the defined options, or
 *   `undefined` when `postprocessor` is falsy.
 */
function normalizePostProcessorConfig(postprocessor) {
  if (!postprocessor) {
    return void 0;
  }
  const normalized = {};
  for (const key of ["enabled", "enabledProcessors", "disabledProcessors"]) {
    setIfDefined(normalized, key, postprocessor[key]);
  }
  return normalized;
}
396
/**
 * Copy the recognised HTML preprocessing options that are defined on `options`.
 *
 * @param {object|null|undefined} options - HTML preprocessing options.
 * @returns {object|undefined} New object with the defined options, or
 *   `undefined` when `options` is falsy.
 */
function normalizeHtmlPreprocessing(options) {
  if (!options) {
    return void 0;
  }
  const normalized = {};
  for (const key of ["enabled", "preset", "removeNavigation", "removeForms"]) {
    setIfDefined(normalized, key, options[key]);
  }
  return normalized;
}
407
/**
 * Copy the recognised HTML conversion options that are defined on `options`,
 * normalising the nested `preprocessing` section as well.
 *
 * @param {object|null|undefined} options - HTML conversion options.
 * @returns {object|undefined} New object with the defined options, or
 *   `undefined` when `options` is falsy.
 */
function normalizeHtmlOptions(options) {
  if (!options) {
    return void 0;
  }
  // Flat options copied verbatim, in the order they are emitted.
  const passthroughKeys = [
    "headingStyle",
    "listIndentType",
    "listIndentWidth",
    "bullets",
    "strongEmSymbol",
    "escapeAsterisks",
    "escapeUnderscores",
    "escapeMisc",
    "escapeAscii",
    "codeLanguage",
    "autolinks",
    "defaultTitle",
    "brInTables",
    "hocrSpatialTables",
    "highlightStyle",
    "extractMetadata",
    "whitespaceMode",
    "stripNewlines",
    "wrap",
    "wrapWidth",
    "convertAsInline",
    "subSymbol",
    "supSymbol",
    "newlineStyle",
    "codeBlockStyle",
    "keepInlineImagesIn",
    "encoding",
    "debug",
    "stripTags",
    "preserveTags"
  ];
  const normalized = {};
  for (const key of passthroughKeys) {
    setIfDefined(normalized, key, options[key]);
  }
  setIfDefined(normalized, "preprocessing", normalizeHtmlPreprocessing(options.preprocessing));
  return normalized;
}
446
/**
 * Copy the recognised keyword-extraction options that are defined on `config`.
 *
 * @param {object|null|undefined} config - Keyword extraction options.
 * @returns {object|undefined} New object with the defined options, or
 *   `undefined` when `config` is falsy.
 */
function normalizeKeywordConfig(config) {
  if (!config) {
    return void 0;
  }
  const normalized = {};
  const keys = ["algorithm", "maxKeywords", "minScore", "ngramRange", "language", "yakeParams", "rakeParams"];
  for (const key of keys) {
    setIfDefined(normalized, key, config[key]);
  }
  return normalized;
}
460
/**
 * Copy the recognised page options that are defined on `pages`.
 *
 * @param {object|null|undefined} pages - Page extraction options.
 * @returns {object|undefined} New object with the defined options, or
 *   `undefined` when `pages` is falsy.
 */
function normalizePageConfig(pages) {
  if (!pages) {
    return void 0;
  }
  const normalized = {};
  for (const key of ["extractPages", "insertPageMarkers", "markerFormat"]) {
    setIfDefined(normalized, key, pages[key]);
  }
  return normalized;
}
470
/**
 * Build the extraction configuration object handed to the binding.
 *
 * Top-level flags are copied when defined; each nested section is run through
 * its dedicated normaliser and included only when that normaliser returns a value.
 *
 * @param {object|null|undefined} config - User-supplied extraction config.
 * @returns {object|null} Normalised config, or `null` when `config` is falsy.
 */
function normalizeExtractionConfig(config) {
  if (!config) {
    return null;
  }
  const normalized = {};
  for (const key of ["useCache", "enableQualityProcessing", "forceOcr", "maxConcurrentExtractions"]) {
    setIfDefined(normalized, key, config[key]);
  }
  // [section key, normaliser] pairs, in emission order.
  const sections = [
    ["ocr", normalizeOcrConfig],
    ["chunking", normalizeChunkingConfig],
    ["images", normalizeImageExtractionConfig],
    ["pdfOptions", normalizePdfConfig],
    ["tokenReduction", normalizeTokenReductionConfig],
    ["languageDetection", normalizeLanguageDetectionConfig],
    ["postprocessor", normalizePostProcessorConfig],
    ["keywords", normalizeKeywordConfig],
    ["pages", normalizePageConfig],
    ["htmlOptions", normalizeHtmlOptions]
  ];
  for (const [key, normalizeSection] of sections) {
    setIfDefined(normalized, key, normalizeSection(config[key]));
  }
  return normalized;
}
501
/**
 * Synchronously extract content from a file.
 *
 * The second argument may be a MIME type string or, for convenience, the
 * config object itself; any other value means "no MIME type" and the config
 * is taken from the third argument.
 *
 * @param {string} filePath - Path of the file to extract.
 * @param {string|object|null} [mimeTypeOrConfig] - MIME type, or the config object.
 * @param {object|null} [maybeConfig] - Config when the second argument is not a config object.
 * @returns {object} Normalised extraction result.
 */
function extractFileSync(filePath, mimeTypeOrConfig, maybeConfig) {
  const secondArgIsMime = typeof mimeTypeOrConfig === "string";
  const secondArgIsConfig =
    !secondArgIsMime && mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object";
  const mimeType = secondArgIsMime ? mimeTypeOrConfig : null;
  const config = secondArgIsConfig ? mimeTypeOrConfig : (maybeConfig ?? null);

  const normalizedConfig = normalizeExtractionConfig(config);
  const rawResult = getBinding().extractFileSync(filePath, mimeType, normalizedConfig);
  return convertResult(rawResult);
}
518
/**
 * Asynchronously extract content from a file.
 *
 * The second argument may be a MIME type string or, for convenience, the
 * config object itself; any other value means "no MIME type" and the config
 * is taken from the third argument.
 *
 * @param {string} filePath - Path of the file to extract.
 * @param {string|object|null} [mimeTypeOrConfig] - MIME type, or the config object.
 * @param {object|null} [maybeConfig] - Config when the second argument is not a config object.
 * @returns {Promise<object>} Normalised extraction result.
 */
async function extractFile(filePath, mimeTypeOrConfig, maybeConfig) {
  const secondArgIsMime = typeof mimeTypeOrConfig === "string";
  const secondArgIsConfig =
    !secondArgIsMime && mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object";
  const mimeType = secondArgIsMime ? mimeTypeOrConfig : null;
  const config = secondArgIsConfig ? mimeTypeOrConfig : (maybeConfig ?? null);

  const normalizedConfig = normalizeExtractionConfig(config);
  const rawResult = await getBinding().extractFile(filePath, mimeType, normalizedConfig);
  return convertResult(rawResult);
}
535
/**
 * Synchronously extract content from in-memory bytes.
 *
 * A string first argument is treated as a file path and read from disk.
 *
 * @param {Uint8Array|string} dataOrPath - Raw bytes, or a path to read them from.
 * @param {string} mimeType - MIME type of the data.
 * @param {object|null} [config=null] - Extraction config.
 * @returns {object} Normalised extraction result.
 * @throws {TypeError} When the data is not a Uint8Array.
 */
function extractBytesSync(dataOrPath, mimeType, config = null) {
  const data = typeof dataOrPath === "string" ? readFileSync(dataOrPath) : dataOrPath;
  const bytes = assertUint8Array(data, "data");
  const normalizedConfig = normalizeExtractionConfig(config);
  // The binding is handed a Buffer copy of the validated bytes.
  const payload = Buffer.from(bytes);
  return convertResult(getBinding().extractBytesSync(payload, mimeType, normalizedConfig));
}
547
/**
 * Asynchronously extract content from in-memory bytes.
 *
 * A string first argument is treated as a file path. The file is read with
 * the promise-based fs API so the event loop is not blocked while reading
 * (the previous implementation used `readFileSync` inside this async function).
 *
 * @param {Uint8Array|string} dataOrPath - Raw bytes, or a path to read them from.
 * @param {string} mimeType - MIME type of the data.
 * @param {object|null} [config=null] - Extraction config.
 * @returns {Promise<object>} Normalised extraction result.
 * @throws {TypeError} (as a rejection) When the data is not a Uint8Array.
 */
async function extractBytes(dataOrPath, mimeType, config = null) {
  let data;
  if (typeof dataOrPath === "string") {
    // Local dynamic import keeps this block self-contained; Node caches the module.
    const { readFile } = await import("node:fs/promises");
    data = await readFile(dataOrPath);
  } else {
    data = dataOrPath;
  }
  const validated = assertUint8Array(data, "data");
  if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
    console.log("[TypeScript] Debug input header:", Array.from(validated.slice(0, 8)));
  }
  const normalizedConfig = normalizeExtractionConfig(config);
  const rawResult = await getBinding().extractBytes(Buffer.from(validated), mimeType, normalizedConfig);
  return convertResult(rawResult);
}
562
/**
 * Synchronously extract content from several files in one binding call.
 *
 * @param {string[]} paths - File paths.
 * @param {object|null} [config=null] - Extraction config applied to the batch.
 * @returns {object[]} Normalised results as returned by the binding, converted element-wise.
 */
function batchExtractFilesSync(paths, config = null) {
  const normalizedConfig = normalizeExtractionConfig(config);
  return getBinding().batchExtractFilesSync(paths, normalizedConfig).map(convertResult);
}
567
/**
 * Asynchronously extract content from several files in one binding call.
 *
 * @param {string[]} paths - File paths.
 * @param {object|null} [config=null] - Extraction config applied to the batch.
 * @returns {Promise<object[]>} Normalised results as returned by the binding, converted element-wise.
 */
async function batchExtractFiles(paths, config = null) {
  const normalizedConfig = normalizeExtractionConfig(config);
  const batch = await getBinding().batchExtractFiles(paths, normalizedConfig);
  return batch.map(convertResult);
}
572
/**
 * Synchronously extract content from several in-memory documents.
 *
 * Both arguments are validated before any buffer is copied, and a missing or
 * non-array `mimeTypes` now produces a descriptive TypeError instead of a
 * property-access failure.
 *
 * @param {Uint8Array[]} dataList - Documents to extract.
 * @param {string[]} mimeTypes - MIME type for each entry of `dataList`.
 * @param {object|null} [config=null] - Extraction config applied to the batch.
 * @returns {object[]} Normalised results, converted element-wise.
 * @throws {TypeError} When `dataList` is not an array of Uint8Array, when
 *   `mimeTypes` is not an array, or when their lengths differ.
 */
function batchExtractBytesSync(dataList, mimeTypes, config = null) {
  const validated = assertUint8ArrayList(dataList, "dataList");
  if (!Array.isArray(mimeTypes)) {
    throw new TypeError("mimeTypes must be an array of strings");
  }
  if (validated.length !== mimeTypes.length) {
    throw new TypeError("dataList and mimeTypes must have the same length");
  }
  const buffers = validated.map((data) => Buffer.from(data));
  const normalizedConfig = normalizeExtractionConfig(config);
  const rawResults = getBinding().batchExtractBytesSync(buffers, mimeTypes, normalizedConfig);
  return rawResults.map(convertResult);
}
581
/**
 * Asynchronously extract content from several in-memory documents.
 *
 * Both arguments are validated before any buffer is copied, and a missing or
 * non-array `mimeTypes` now produces a descriptive TypeError instead of a
 * property-access failure.
 *
 * @param {Uint8Array[]} dataList - Documents to extract.
 * @param {string[]} mimeTypes - MIME type for each entry of `dataList`.
 * @param {object|null} [config=null] - Extraction config applied to the batch.
 * @returns {Promise<object[]>} Normalised results, converted element-wise.
 * @throws {TypeError} (as a rejection) When `dataList` is not an array of
 *   Uint8Array, when `mimeTypes` is not an array, or when their lengths differ.
 */
async function batchExtractBytes(dataList, mimeTypes, config = null) {
  const validated = assertUint8ArrayList(dataList, "dataList");
  if (!Array.isArray(mimeTypes)) {
    throw new TypeError("mimeTypes must be an array of strings");
  }
  if (validated.length !== mimeTypes.length) {
    throw new TypeError("dataList and mimeTypes must have the same length");
  }
  const buffers = validated.map((data) => Buffer.from(data));
  const normalizedConfig = normalizeExtractionConfig(config);
  const rawResults = await getBinding().batchExtractBytes(buffers, mimeTypes, normalizedConfig);
  return rawResults.map(convertResult);
}
590
/**
 * Register a JavaScript post-processor with the native binding.
 *
 * The processor is wrapped so that the JSON wire payload (snake_case keys) is
 * translated to the camelCase result shape before calling
 * `processor.process`, and translated back afterwards.
 *
 * Fixes relative to the previous version:
 * - `processingStage` given as a plain string no longer throws when the
 *   `__stage` marker is computed (it was unconditionally invoked as a function).
 * - `processingStage()` is evaluated once and reused.
 * - A payload delivered as a bare string (not wrapped in an array) is accepted.
 *
 * @param {object} processor - Object exposing `name`, optional
 *   `processingStage` (each either a value or a zero-argument method) and `process(result)`.
 */
function registerPostProcessor(processor) {
  const binding2 = getBinding();
  const name = typeof processor.name === "function" ? processor.name() : processor.name;
  const declaredStage =
    typeof processor.processingStage === "function" ? processor.processingStage() : processor.processingStage;
  const wrappedProcessor = {
    name,
    processingStage: declaredStage,
    async process(...args) {
      const wrappedValue = args[0];
      // The payload is read from index 0 of the first argument; tolerate a bare string too.
      const jsonString = typeof wrappedValue === "string" ? wrappedValue : wrappedValue[0];
      const wireResult = JSON.parse(jsonString);
      const result = {
        content: wireResult.content,
        mimeType: wireResult.mime_type,
        metadata: typeof wireResult.metadata === "string" ? JSON.parse(wireResult.metadata) : wireResult.metadata,
        tables: wireResult.tables || [],
        detectedLanguages: wireResult.detected_languages ?? null,
        chunks: wireResult.chunks ?? null,
        images: wireResult.images ?? null
      };
      const updated = await processor.process(result);
      const wireUpdated = {
        content: updated.content,
        mime_type: updated.mimeType,
        metadata: updated.metadata,
        tables: updated.tables,
        detected_languages: updated.detectedLanguages,
        chunks: updated.chunks,
        images: updated.images
      };
      return JSON.stringify(wireUpdated);
    }
  };
  // Non-enumerable bookkeeping: the original processor and its effective stage.
  Object.defineProperty(wrappedProcessor, "__original", {
    value: processor,
    enumerable: false
  });
  Object.defineProperty(wrappedProcessor, "__stage", {
    value: declaredStage ?? "middle",
    enumerable: false
  });
  binding2.registerPostProcessor(wrappedProcessor);
}
632
/**
 * Remove a previously registered post-processor.
 *
 * @param {string} name - Name the processor was registered under.
 */
function unregisterPostProcessor(name) {
  getBinding().unregisterPostProcessor(name);
}
636
/** Ask the binding to clear its registered post-processors. */
function clearPostProcessors() {
  getBinding().clearPostProcessors();
}
640
/**
 * List registered post-processors.
 *
 * @returns {*} Whatever the binding's `listPostProcessors()` returns.
 */
function listPostProcessors() {
  return getBinding().listPostProcessors();
}
644
/**
 * Register a JavaScript validator with the native binding.
 *
 * The validator is wrapped so that the JSON wire payload (snake_case keys) is
 * translated to the camelCase result shape before `validator.validate` runs.
 * Absent `detected_languages` / `chunks` are normalised to `null`, matching
 * how the post-processor wrapper and convertResult represent missing values.
 *
 * @param {object} validator - Object exposing `name`, `priority` (each either a
 *   value or a zero-argument method) and `validate(result)`, which signals
 *   failure by throwing or rejecting.
 */
function registerValidator(validator) {
  const binding2 = getBinding();
  const wrappedValidator = {
    name: typeof validator.name === "function" ? validator.name() : validator.name,
    priority: typeof validator.priority === "function" ? validator.priority() : validator.priority,
    async validate(...args) {
      const jsonString = args[0];
      // Guard against a missing payload, including the literal text "undefined".
      if (!jsonString || jsonString === "undefined") {
        throw new Error("Validator received invalid JSON string");
      }
      const wireResult = JSON.parse(jsonString);
      const result = {
        content: wireResult.content,
        mimeType: wireResult.mime_type,
        metadata: typeof wireResult.metadata === "string" ? JSON.parse(wireResult.metadata) : wireResult.metadata,
        tables: wireResult.tables || [],
        detectedLanguages: wireResult.detected_languages ?? null,
        chunks: wireResult.chunks ?? null,
        images: wireResult.images ?? null
      };
      // Works for both synchronous and promise-returning validators.
      await Promise.resolve(validator.validate(result));
      return "";
    }
  };
  binding2.registerValidator(wrappedValidator);
}
670
/**
 * Remove a previously registered validator.
 *
 * @param {string} name - Name the validator was registered under.
 */
function unregisterValidator(name) {
  getBinding().unregisterValidator(name);
}
674
/** Ask the binding to clear its registered validators. */
function clearValidators() {
  getBinding().clearValidators();
}
678
/**
 * List registered validators.
 *
 * @returns {*} Whatever the binding's `listValidators()` returns.
 */
function listValidators() {
  return getBinding().listValidators();
}
682
/**
 * Check whether `value` is a two-element `[payload, language]` array where
 * the payload is a string, Buffer or Uint8Array and the language is a string.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} True when `value` has that shape.
 */
function isOcrProcessTuple(value) {
  if (!Array.isArray(value) || value.length !== 2) {
    return false;
  }
  const [payload, language] = value;
  if (typeof language !== "string") {
    return false;
  }
  return typeof payload === "string" || Buffer.isBuffer(payload) || payload instanceof Uint8Array;
}
685
/**
 * Check whether `value` is a one-element array wrapping a
 * `[payload, language]` tuple.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} True when `value` has that shape.
 */
function isNestedOcrProcessTuple(value) {
  if (!Array.isArray(value) || value.length !== 1) {
    return false;
  }
  return isOcrProcessTuple(value[0]);
}
688
/**
 * Summarise a payload for debug logging.
 *
 * `null` and `undefined` are reported with a zero length instead of throwing,
 * so debug logging cannot itself crash on a missing payload.
 *
 * @param {unknown} value - Payload to describe.
 * @returns {{ctor: string, length: number}} Constructor name (or "String",
 *   "null", "undefined") and the payload's `length`.
 */
function describePayload(value) {
  if (typeof value === "string") {
    return { ctor: "String", length: value.length };
  }
  if (value === null || value === undefined) {
    return { ctor: String(value), length: 0 };
  }
  return { ctor: value.constructor?.name ?? "Buffer", length: value.length };
}
694
/**
 * Register a JavaScript OCR backend with the native binding.
 *
 * The backend is wrapped so that the image payload — delivered either as
 * separate `(payload, language)` arguments, as a `[payload, language]` tuple,
 * or as a one-element array wrapping such a tuple — is unpacked, converted to
 * a Uint8Array (strings are decoded as base64) and passed to
 * `backend.processImage`. The backend's result is returned JSON-encoded.
 *
 * Set `KREUZBERG_DEBUG_GUTEN=1` to log diagnostic details about each call.
 *
 * @param {object} backend - Object exposing `name`, optional
 *   `supportedLanguages` (each either a value or a zero-argument method;
 *   languages default to `["en"]`) and `processImage(bytes, language)`.
 */
function registerOcrBackend(backend) {
  const binding2 = getBinding();
  const backendName = typeof backend.name === "function" ? backend.name() : backend.name;
  const languages =
    typeof backend.supportedLanguages === "function"
      ? backend.supportedLanguages()
      : backend.supportedLanguages ?? ["en"];

  const wrappedBackend = {
    name: backendName,
    supportedLanguages: languages,
    async processImage(...processArgs) {
      const imagePayload = processArgs[0];
      const maybeLanguage = processArgs[1];
      const debug = process.env["KREUZBERG_DEBUG_GUTEN"] === "1";
      const payloadIsArray = Array.isArray(imagePayload);

      if (debug) {
        console.log("[registerOcrBackend] JS arguments", { length: processArgs.length });
        console.log("[registerOcrBackend] Raw args", {
          imagePayloadType: payloadIsArray ? "tuple" : typeof imagePayload,
          maybeLanguageType: typeof maybeLanguage,
          metadata: payloadIsArray ? { tupleLength: imagePayload.length } : describePayload(imagePayload)
        });
      }

      // Unpack tuple-style payloads; otherwise use the arguments as given.
      let rawBytes = imagePayload;
      let language = maybeLanguage;
      let tuple = null;
      if (isNestedOcrProcessTuple(imagePayload)) {
        tuple = imagePayload[0];
      } else if (isOcrProcessTuple(imagePayload)) {
        tuple = imagePayload;
      }
      if (tuple) {
        [rawBytes, language] = tuple;
      }

      if (typeof language !== "string") {
        throw new Error("OCR backend did not receive a language parameter");
      }

      if (debug) {
        console.log(
          "[registerOcrBackend] Received payload",
          payloadIsArray ? "tuple" : typeof rawBytes,
          "ctor",
          describePayload(rawBytes).ctor,
          "length",
          rawBytes.length
        );
      }

      // String payloads are base64 text; everything else is treated as raw bytes.
      let buffer;
      if (typeof rawBytes === "string") {
        buffer = Buffer.from(rawBytes, "base64");
      } else {
        buffer = Buffer.from(rawBytes);
      }
      const result = await backend.processImage(new Uint8Array(buffer), language);
      return JSON.stringify(result);
    }
  };
  binding2.registerOcrBackend(wrappedBackend);
}
739
/**
 * List registered OCR backends.
 *
 * @returns {*} Whatever the binding's `listOcrBackends()` returns.
 */
function listOcrBackends() {
  return getBinding().listOcrBackends();
}
743
/**
 * Remove a previously registered OCR backend.
 *
 * @param {string} name - Name the backend was registered under.
 */
function unregisterOcrBackend(name) {
  getBinding().unregisterOcrBackend(name);
}
747
/** Ask the binding to clear its registered OCR backends. */
function clearOcrBackends() {
  getBinding().clearOcrBackends();
}
751
/**
 * List the document extractors known to the native binding.
 *
 * @returns Whatever the native `listDocumentExtractors()` call returns
 */
function listDocumentExtractors() {
  return getBinding().listDocumentExtractors();
}
755
/**
 * Ask the native binding to unregister a document extractor.
 *
 * @param name - Value forwarded unchanged to the native `unregisterDocumentExtractor()` call
 * @returns Nothing; the native call's result is discarded
 */
function unregisterDocumentExtractor(name) {
  getBinding().unregisterDocumentExtractor(name);
}
759
/**
 * Invoke the native binding's `clearDocumentExtractors()`.
 *
 * @returns Nothing; the native call's result is discarded
 */
function clearDocumentExtractors() {
  getBinding().clearDocumentExtractors();
}
763
/**
 * Helpers for obtaining an extraction configuration through the native binding.
 */
const ExtractionConfig = {
  /**
   * Read an extraction configuration from a file.
   *
   * The format is picked from the file extension:
   * - `.toml` - TOML format
   * - `.yaml` - YAML format
   * - `.json` - JSON format
   *
   * @param filePath - Absolute or relative path of the configuration file
   * @returns ExtractionConfig object produced from the file's contents
   *
   * @throws {Error} If the file is missing or cannot be accessed
   * @throws {Error} If the content is not valid TOML/YAML/JSON
   * @throws {Error} If the configuration structure is invalid
   * @throws {Error} If the file extension is not supported
   *
   * @example
   * ```typescript
   * import { ExtractionConfig } from '@kreuzberg/node';
   *
   * const fromToml = ExtractionConfig.fromFile('kreuzberg.toml');
   * const fromYaml = ExtractionConfig.fromFile('./config.yaml');
   * const fromJson = ExtractionConfig.fromFile('./config.json');
   * ```
   */
  fromFile(filePath) {
    return getBinding().loadExtractionConfigFromFile(filePath);
  },
  /**
   * Look for a configuration file in the current directory or its ancestors.
   *
   * The search targets a `kreuzberg.toml` file, begins at the current working
   * directory and walks up the directory tree; the first file found wins.
   *
   * @returns ExtractionConfig object when a file is found, otherwise null
   *
   * @example
   * ```typescript
   * import { ExtractionConfig } from '@kreuzberg/node';
   *
   * const config = ExtractionConfig.discover();
   * if (config) {
   *   console.log('Found configuration');
   * } else {
   *   console.log('No configuration file found, using defaults');
   * }
   * ```
   */
  discover() {
    return getBinding().discoverExtractionConfig();
  }
};
825
/**
 * Detect a MIME type from in-memory content.
 *
 * @param bytes - Content forwarded unchanged to the native `detectMimeTypeFromBytes()` call
 * @returns Whatever the native call returns (presumably a MIME type string; confirm)
 */
function detectMimeType(bytes) {
  return getBinding().detectMimeTypeFromBytes(bytes);
}
829
/**
 * Detect a MIME type for a file path.
 *
 * @param filePath - Path forwarded unchanged to the native `detectMimeTypeFromPath()` call
 * @param checkExists - Second argument forwarded unchanged (presumably toggles an
 *   on-disk existence check; confirm against the binding)
 * @returns Whatever the native call returns
 */
function detectMimeTypeFromPath(filePath, checkExists) {
  return getBinding().detectMimeTypeFromPath(filePath, checkExists);
}
833
/**
 * Validate a MIME type via the native binding.
 *
 * @param mimeType - Value forwarded unchanged to the native `validateMimeType()` call
 * @returns Whatever the native call returns
 */
function validateMimeType(mimeType) {
  return getBinding().validateMimeType(mimeType);
}
837
/**
 * Look up the file extensions associated with a MIME type.
 *
 * @param mimeType - Value forwarded unchanged to the native `getExtensionsForMime()` call
 * @returns Whatever the native call returns
 */
function getExtensionsForMime(mimeType) {
  return getBinding().getExtensionsForMime(mimeType);
}
841
/**
 * List the embedding presets exposed by the native binding.
 *
 * @returns Whatever the native `listEmbeddingPresets()` call returns
 */
function listEmbeddingPresets() {
  return getBinding().listEmbeddingPresets();
}
845
/**
 * Fetch a single embedding preset from the native binding.
 *
 * @param name - Value forwarded unchanged to the native `getEmbeddingPreset()` call
 * @returns The native call's result, passed through untouched
 */
function getEmbeddingPreset(name) {
  return getBinding().getEmbeddingPreset(name);
}
850
/**
 * Read the last error code recorded by the native binding.
 *
 * @returns Whatever the native `getLastErrorCode()` call returns
 */
function getLastErrorCode() {
  return getBinding().getLastErrorCode();
}
854
/**
 * Read the last panic context recorded by the native binding.
 *
 * @returns The native `getLastPanicContext()` result, passed through untouched
 */
function getLastPanicContext() {
  return getBinding().getLastPanicContext();
}
859
/**
 * Resolve an error code to its name via the native binding.
 *
 * @param code - Value forwarded unchanged to the native `getErrorCodeName()` call
 * @returns Whatever the native call returns
 */
function getErrorCodeName(code) {
  return getBinding().getErrorCodeName(code);
}
863
/**
 * Resolve an error code to its description via the native binding.
 *
 * @param code - Value forwarded unchanged to the native `getErrorCodeDescription()` call
 * @returns Whatever the native call returns
 */
function getErrorCodeDescription(code) {
  return getBinding().getErrorCodeDescription(code);
}
867
/**
 * Classify an error message via the native binding.
 *
 * @param errorMessage - Value forwarded unchanged to the native `classifyError()` call
 * @returns The native call's result, passed through untouched
 */
function classifyError(errorMessage) {
  return getBinding().classifyError(errorMessage);
}
872
/**
 * Create a worker pool through the native binding.
 *
 * @param size - Value forwarded unchanged to the native `createWorkerPool()` call
 *   (presumably the number of workers; confirm against the binding)
 * @returns The pool object returned by the native call, passed through untouched
 */
function createWorkerPool(size) {
  return getBinding().createWorkerPool(size);
}
877
/**
 * Query statistics for a worker pool.
 *
 * @param pool - Pool value forwarded unchanged to the native `getWorkerPoolStats()` call
 * @returns The native call's result, passed through untouched
 */
function getWorkerPoolStats(pool) {
  return getBinding().getWorkerPoolStats(pool);
}
882
/**
 * Extract a single file using a worker pool.
 *
 * The third argument is overloaded:
 * - a string is treated as the MIME type, and `maybeConfig` supplies the config;
 * - a non-null object is treated as the config, and `maybeConfig` is ignored;
 * - anything else means "no MIME type", and `maybeConfig` supplies the config.
 *
 * @param pool - Pool value forwarded unchanged to the native binding
 * @param filePath - Path forwarded unchanged to the native binding
 * @param mimeTypeOrConfig - MIME type string, config object, or null/undefined
 * @param maybeConfig - Config used when the third argument is not an object
 * @returns The native result after being passed through `convertResult`
 */
async function extractFileInWorker(pool, filePath, mimeTypeOrConfig, maybeConfig) {
  const thirdArgIsMimeType = typeof mimeTypeOrConfig === "string";
  const thirdArgIsConfig = !thirdArgIsMimeType && mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object";
  const mimeType = thirdArgIsMimeType ? mimeTypeOrConfig : null;
  const config = thirdArgIsConfig ? mimeTypeOrConfig : (maybeConfig ?? null);
  // Normalize before touching the binding, matching the original evaluation order.
  const nativeConfig = normalizeExtractionConfig(config);
  const native = getBinding();
  const extracted = await native.extractFileInWorker(pool, filePath, mimeType, nativeConfig);
  return convertResult(extracted);
}
905
/**
 * Extract several files using a worker pool.
 *
 * @param pool - Pool value forwarded unchanged to the native binding
 * @param paths - Collection of paths forwarded unchanged to the native binding
 * @param config - Optional extraction config (defaults to null); it is run
 *   through `normalizeExtractionConfig` before being handed to the binding
 * @returns The native results, each mapped through `convertResult`
 */
async function batchExtractFilesInWorker(pool, paths, config = null) {
  // Normalize before touching the binding, matching the original evaluation order.
  const nativeConfig = normalizeExtractionConfig(config);
  const native = getBinding();
  const extracted = await native.batchExtractFilesInWorker(pool, paths, nativeConfig);
  return extracted.map(convertResult);
}
915
/**
 * Close a worker pool and wait for the native call to settle.
 *
 * @param pool - Pool value forwarded unchanged to the native `closeWorkerPool()` call
 * @returns A promise that resolves with no value once the native call completes
 */
async function closeWorkerPool(pool) {
  await getBinding().closeWorkerPool(pool);
}
919
// Version string for this build of the package; it appears in the module's export list as `__version__`.
+ const __version__ = "4.0.0";
920
+ export {
921
+ CacheError,
922
+ ErrorCode,
923
+ ExtractionConfig,
924
+ GutenOcrBackend,
925
+ ImageProcessingError,
926
+ KreuzbergError,
927
+ MissingDependencyError,
928
+ OcrError,
929
+ ParsingError,
930
+ PluginError,
931
+ ValidationError,
932
+ __resetBindingForTests,
933
+ __setBindingForTests,
934
+ __version__,
935
+ batchExtractBytes,
936
+ batchExtractBytesSync,
937
+ batchExtractFiles,
938
+ batchExtractFilesInWorker,
939
+ batchExtractFilesSync,
940
+ classifyError,
941
+ clearDocumentExtractors,
942
+ clearOcrBackends,
943
+ clearPostProcessors,
944
+ clearValidators,
945
+ closeWorkerPool,
946
+ createWorkerPool,
947
+ detectMimeType,
948
+ detectMimeTypeFromPath,
949
+ extractBytes,
950
+ extractBytesSync,
951
+ extractFile,
952
+ extractFileInWorker,
953
+ extractFileSync,
954
+ getEmbeddingPreset,
955
+ getErrorCodeDescription,
956
+ getErrorCodeName,
957
+ getExtensionsForMime,
958
+ getLastErrorCode,
959
+ getLastPanicContext,
960
+ getWorkerPoolStats,
961
+ listDocumentExtractors,
962
+ listEmbeddingPresets,
963
+ listOcrBackends,
964
+ listPostProcessors,
965
+ listValidators,
966
+ registerOcrBackend,
967
+ registerPostProcessor,
968
+ registerValidator,
969
+ unregisterDocumentExtractor,
970
+ unregisterOcrBackend,
971
+ unregisterPostProcessor,
972
+ unregisterValidator,
973
+ validateMimeType
974
+ };
975
+ //# sourceMappingURL=index.mjs.map