@kreuzberg/node 4.0.0-rc.6 → 4.0.0-rc.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs ADDED
@@ -0,0 +1,754 @@
1
+ import { createRequire } from "node:module";
2
+ import {
3
+ CacheError,
4
+ ErrorCode,
5
+ ImageProcessingError,
6
+ KreuzbergError,
7
+ MissingDependencyError,
8
+ OcrError,
9
+ ParsingError,
10
+ PluginError,
11
+ ValidationError
12
+ } from "./errors.js";
13
+ import { GutenOcrBackend } from "./ocr/guten-ocr.js";
14
+ export * from "./types.js";
15
+ let binding = null;
16
+ let bindingInitialized = false;
17
/**
 * Build a descriptive Error for a failure to load the native module.
 *
 * The message is a single space-joined sentence list: a fixed headline, an
 * optional Pdfium hint (only when `error` is an Error whose text mentions
 * "pdfium"), a request to report the problem, and the underlying error text.
 * The thrown value is always attached as `cause`, including non-Error values
 * such as strings, so the original failure is never lost.
 *
 * @param error - Whatever was thrown while requiring the native module.
 * @returns A new Error wrapping `error`.
 */
function createNativeBindingError(error) {
  const isError = error instanceof Error;
  const detail = isError ? error.message || error.toString() : String(error);
  const sentences = ["Failed to load Kreuzberg native bindings."];
  if (isError && /pdfium/i.test(detail)) {
    sentences.push(
      "Pdfium runtime library was not found. Ensure the bundled libpdfium (dll/dylib/so) is present next to the native module."
    );
  }
  sentences.push(
    "Report this error and attach the logs/stack trace for investigation.",
    `Underlying error: ${detail}`
  );
  return new Error(sentences.join(" "), { cause: error });
}
45
/**
 * Return `value` unchanged when it is a Uint8Array; otherwise throw.
 *
 * @param value - Candidate value to check.
 * @param name - Label used in the error message.
 * @returns The same `value` reference.
 * @throws {TypeError} If `value` is not an instance of Uint8Array.
 */
function assertUint8Array(value, name) {
  if (value instanceof Uint8Array) {
    return value;
  }
  throw new TypeError(`${name} must be a Uint8Array`);
}
51
/**
 * Check that `values` is an array whose every element is a Uint8Array.
 *
 * @param values - Candidate list to check.
 * @param name - Label used in error messages; offending elements are
 *   reported as `name[index]`.
 * @returns A new array holding the same element references.
 * @throws {TypeError} If `values` is not an array or an element is not a
 *   Uint8Array.
 */
function assertUint8ArrayList(values, name) {
  if (!Array.isArray(values)) {
    throw new TypeError(`${name} must be an array of Uint8Array`);
  }
  return values.map((entry, position) => {
    if (entry instanceof Uint8Array) {
      return entry;
    }
    throw new TypeError(`${name}[${position}] must be a Uint8Array`);
  });
}
64
/**
 * Test hook: install `mock` as the module's binding and mark the binding as
 * initialized so `getBinding()` returns it without loading anything.
 *
 * @param mock - Object to use in place of the native binding.
 */
function __setBindingForTests(mock) {
  bindingInitialized = true;
  binding = mock;
}
68
/**
 * Test hook: clear the cached binding and the initialized flag so the next
 * `getBinding()` call attempts a fresh load.
 */
function __resetBindingForTests() {
  bindingInitialized = false;
  binding = null;
}
72
/**
 * Require the native module at `../index.js`.
 *
 * Uses the ambient `require` when one is defined; otherwise builds one with
 * `createRequire(import.meta.url)`.
 *
 * @returns Whatever `../index.js` exports.
 * @throws {Error} If no loader function could be obtained.
 */
function loadNativeBinding() {
  let localRequire;
  if (typeof require === "undefined") {
    localRequire = createRequire(import.meta.url);
  } else {
    localRequire = require;
  }
  if (!localRequire) {
    throw new Error("Unable to resolve native binding loader (require not available).");
  }
  return localRequire("../index.js");
}
82
/**
 * Return the cached binding, loading the native module on first use.
 *
 * Loading is attempted only when `process.versions.node` is present. A
 * failure during that attempt is rethrown wrapped by
 * `createNativeBindingError`; when the attempt is not made at all, a generic
 * "bindings not available" error is thrown instead.
 *
 * @returns The binding object (or whatever a test hook installed).
 * @throws {Error} If the binding cannot be loaded.
 */
function getBinding() {
  if (!bindingInitialized) {
    let loaded = false;
    try {
      const onNode = typeof process !== "undefined" && process.versions && process.versions.node;
      if (onNode) {
        binding = loadNativeBinding();
        bindingInitialized = true;
        loaded = true;
      }
    } catch (error) {
      throw createNativeBindingError(error);
    }
    if (!loaded) {
      throw new Error(
        "Failed to load Kreuzberg bindings. Neither NAPI (Node.js) nor WASM (browsers/Deno) bindings are available. Make sure you have installed the @kreuzberg/node package for Node.js/Bun."
      );
    }
  }
  return binding;
}
99
/**
 * Best-effort parse of a JSON metadata string.
 *
 * Invalid JSON yields an empty object rather than an exception. Valid JSON
 * that is not an object (for example `null`, a number or a string) also
 * yields an empty object, so callers always receive something they can read
 * properties from.
 *
 * @param metadataStr - JSON text describing the metadata.
 * @returns The parsed object, or `{}` when parsing fails or the parsed value
 *   is not an object.
 */
function parseMetadata(metadataStr) {
  let parsed;
  try {
    parsed = JSON.parse(metadataStr);
  } catch {
    // Deliberate best effort: unreadable metadata is treated as absent.
    return {};
  }
  return parsed !== null && typeof parsed === "object" ? parsed : {};
}
106
/**
 * Coerce a binary-like value to a Uint8Array.
 *
 * - Uint8Array instances (Node `Buffer` included, since it subclasses
 *   Uint8Array) are returned as-is.
 * - Other ArrayBuffer views (DataView, other typed arrays) are re-viewed as
 *   bytes over the same memory region, without copying.
 * - An ArrayBuffer is wrapped in a byte view.
 * - A plain array is copied into a new Uint8Array.
 * - Anything else yields an empty Uint8Array.
 *
 * @param value - Value to coerce.
 * @returns A Uint8Array for `value`, possibly empty.
 */
function ensureUint8Array(value) {
  if (value instanceof Uint8Array) {
    return value;
  }
  if (ArrayBuffer.isView(value)) {
    return new Uint8Array(value.buffer, value.byteOffset, value.byteLength);
  }
  if (value instanceof ArrayBuffer) {
    return new Uint8Array(value);
  }
  if (Array.isArray(value)) {
    return new Uint8Array(value);
  }
  return new Uint8Array();
}
118
/**
 * Convert a raw chunk into the public chunk shape.
 *
 * Every metadata field is read from its snake_case spelling first and then
 * from its camelCase spelling. `byteStart`/`byteEnd` additionally fall back
 * to `charStart`/`charEnd`. A missing chunk produces an empty chunk with the
 * same set of keys as a converted one.
 *
 * @param rawChunk - Raw chunk object; may be null or undefined.
 * @returns An object with `content`, `embedding` and normalized `metadata`.
 */
function convertChunk(rawChunk) {
  if (!rawChunk) {
    return {
      content: "",
      metadata: {
        byteStart: 0,
        byteEnd: 0,
        tokenCount: null,
        chunkIndex: 0,
        totalChunks: 0,
        // Kept in the fallback so both code paths return the same keys.
        firstPage: null,
        lastPage: null
      },
      embedding: null
    };
  }
  const metadata = rawChunk.metadata ?? {};
  return {
    content: rawChunk.content ?? "",
    embedding: rawChunk.embedding ?? null,
    metadata: {
      byteStart: metadata.byte_start ?? metadata.byteStart ?? metadata.charStart ?? 0,
      byteEnd: metadata.byte_end ?? metadata.byteEnd ?? metadata.charEnd ?? 0,
      tokenCount: metadata.token_count ?? metadata.tokenCount ?? null,
      chunkIndex: metadata.chunk_index ?? metadata.chunkIndex ?? 0,
      totalChunks: metadata.total_chunks ?? metadata.totalChunks ?? 0,
      firstPage: metadata.first_page ?? metadata.firstPage ?? null,
      lastPage: metadata.last_page ?? metadata.lastPage ?? null
    }
  };
}
147
/**
 * Convert a raw extracted image into the public image shape.
 *
 * Absent fields receive defaults ("unknown" format, index 0, `false` mask
 * flag, `null` for everything else). A nested `ocrResult`, when truthy, is
 * passed through `convertResult`. A missing image yields an all-defaults
 * record with empty data.
 *
 * @param rawImage - Raw image object; may be null or undefined.
 * @returns The normalized image record.
 */
function convertImage(rawImage) {
  if (rawImage) {
    const nestedOcr = rawImage.ocrResult;
    return {
      data: ensureUint8Array(rawImage.data),
      format: rawImage.format ?? "unknown",
      imageIndex: rawImage.imageIndex ?? 0,
      pageNumber: rawImage.pageNumber ?? null,
      width: rawImage.width ?? null,
      height: rawImage.height ?? null,
      colorspace: rawImage.colorspace ?? null,
      bitsPerComponent: rawImage.bitsPerComponent ?? null,
      isMask: rawImage.isMask ?? false,
      description: rawImage.description ?? null,
      ocrResult: nestedOcr ? convertResult(nestedOcr) : null
    };
  }
  return {
    data: new Uint8Array(),
    format: "unknown",
    imageIndex: 0,
    pageNumber: null,
    width: null,
    height: null,
    colorspace: null,
    bitsPerComponent: null,
    isMask: false,
    description: null,
    ocrResult: null
  };
}
177
/**
 * Convert a raw extraction result into the public result shape.
 *
 * String metadata is parsed with `parseMetadata`; any other metadata value
 * is passed through. `tables` defaults to an empty array and
 * `detectedLanguages` to `null` when falsy. `chunks` and `images` are
 * converted element-wise when they are arrays and become `null` otherwise.
 *
 * @param rawResult - Raw result object.
 * @returns The normalized result.
 */
function convertResult(rawResult) {
  const rawMetadata = rawResult.metadata;
  const metadata = typeof rawMetadata === "string" ? parseMetadata(rawMetadata) : rawMetadata;

  let chunks = null;
  if (Array.isArray(rawResult.chunks)) {
    chunks = rawResult.chunks.map((chunk) => convertChunk(chunk));
  }

  let images = null;
  if (Array.isArray(rawResult.images)) {
    images = rawResult.images.map((image) => convertImage(image));
  }

  return {
    content: rawResult.content,
    mimeType: rawResult.mimeType,
    metadata,
    tables: rawResult.tables || [],
    detectedLanguages: rawResult.detectedLanguages || null,
    chunks,
    images
  };
}
188
/**
 * Assign `value` to `target[key]` unless `value` is `undefined`.
 *
 * `null` and other falsy values are still assigned.
 *
 * @param target - Object to mutate.
 * @param key - Property name to set.
 * @param value - Value to assign when it is not `undefined`.
 */
function setIfDefined(target, key, value) {
  if (value === undefined) {
    return;
  }
  target[key] = value;
}
193
/**
 * Copy the recognized Tesseract options that are not `undefined` into a
 * fresh object.
 *
 * @param config - Tesseract options, or a falsy value.
 * @returns A new object with the defined options, or `undefined` when
 *   `config` is falsy.
 */
function normalizeTesseractConfig(config) {
  if (!config) {
    return undefined;
  }
  const normalized = {};
  for (const key of ["psm", "enableTableDetection", "tesseditCharWhitelist"]) {
    const value = config[key];
    if (value !== undefined) {
      normalized[key] = value;
    }
  }
  return normalized;
}
203
/**
 * Build the OCR section of the native configuration.
 *
 * `backend` is always copied (even when undefined); `language` only when
 * defined; `tesseractConfig` only when `normalizeTesseractConfig` returns a
 * truthy value.
 *
 * @param ocr - OCR options, or a falsy value.
 * @returns A new object, or `undefined` when `ocr` is falsy.
 */
function normalizeOcrConfig(ocr) {
  if (!ocr) {
    return undefined;
  }
  const normalized = { backend: ocr.backend };
  const language = ocr.language;
  if (language !== undefined) {
    normalized.language = language;
  }
  const tesseract = normalizeTesseractConfig(ocr.tesseractConfig);
  if (tesseract) {
    normalized.tesseractConfig = tesseract;
  }
  return normalized;
}
217
/**
 * Copy the recognized chunking options that are not `undefined` into a fresh
 * object.
 *
 * @param chunking - Chunking options, or a falsy value.
 * @returns A new object with the defined options, or `undefined` when
 *   `chunking` is falsy.
 */
function normalizeChunkingConfig(chunking) {
  if (!chunking) {
    return undefined;
  }
  const fields = ["maxChars", "maxOverlap", "preset", "embedding", "enabled"];
  return fields.reduce((normalized, field) => {
    const value = chunking[field];
    if (value !== undefined) {
      normalized[field] = value;
    }
    return normalized;
  }, {});
}
229
/**
 * Copy the recognized image-extraction options that are not `undefined` into
 * a fresh object.
 *
 * @param images - Image extraction options, or a falsy value.
 * @returns A new object with the defined options, or `undefined` when
 *   `images` is falsy.
 */
function normalizeImageExtractionConfig(images) {
  if (!images) {
    return undefined;
  }
  const normalized = {};
  const fields = ["extractImages", "targetDpi", "maxImageDimension", "autoAdjustDpi", "minDpi", "maxDpi"];
  for (const field of fields) {
    const value = images[field];
    if (value !== undefined) {
      normalized[field] = value;
    }
  }
  return normalized;
}
242
/**
 * Copy the recognized PDF options that are not `undefined` into a fresh
 * object.
 *
 * @param pdf - PDF options, or a falsy value.
 * @returns A new object with the defined options, or `undefined` when `pdf`
 *   is falsy.
 */
function normalizePdfConfig(pdf) {
  if (!pdf) {
    return undefined;
  }
  const candidates = ["extractImages", "passwords", "extractMetadata"].map((field) => [field, pdf[field]]);
  return Object.fromEntries(candidates.filter(([, value]) => value !== undefined));
}
252
/**
 * Copy the recognized token-reduction options that are not `undefined` into
 * a fresh object.
 *
 * @param tokenReduction - Token reduction options, or a falsy value.
 * @returns A new object with the defined options, or `undefined` when
 *   `tokenReduction` is falsy.
 */
function normalizeTokenReductionConfig(tokenReduction) {
  if (!tokenReduction) {
    return undefined;
  }
  const normalized = {};
  const mode = tokenReduction.mode;
  if (mode !== undefined) {
    normalized.mode = mode;
  }
  const preserveImportantWords = tokenReduction.preserveImportantWords;
  if (preserveImportantWords !== undefined) {
    normalized.preserveImportantWords = preserveImportantWords;
  }
  return normalized;
}
261
/**
 * Copy the recognized language-detection options that are not `undefined`
 * into a fresh object.
 *
 * @param languageDetection - Language detection options, or a falsy value.
 * @returns A new object with the defined options, or `undefined` when
 *   `languageDetection` is falsy.
 */
function normalizeLanguageDetectionConfig(languageDetection) {
  if (!languageDetection) {
    return undefined;
  }
  const normalized = {};
  for (const field of ["enabled", "minConfidence", "detectMultiple"]) {
    const value = languageDetection[field];
    if (value !== undefined) {
      normalized[field] = value;
    }
  }
  return normalized;
}
271
/**
 * Copy the recognized post-processor options that are not `undefined` into a
 * fresh object.
 *
 * @param postprocessor - Post-processor options, or a falsy value.
 * @returns A new object with the defined options, or `undefined` when
 *   `postprocessor` is falsy.
 */
function normalizePostProcessorConfig(postprocessor) {
  if (!postprocessor) {
    return undefined;
  }
  const fields = ["enabled", "enabledProcessors", "disabledProcessors"];
  return fields.reduce((normalized, field) => {
    const value = postprocessor[field];
    if (value !== undefined) {
      normalized[field] = value;
    }
    return normalized;
  }, {});
}
281
/**
 * Copy the recognized HTML preprocessing options that are not `undefined`
 * into a fresh object.
 *
 * @param options - Preprocessing options, or a falsy value.
 * @returns A new object with the defined options, or `undefined` when
 *   `options` is falsy.
 */
function normalizeHtmlPreprocessing(options) {
  if (!options) {
    return undefined;
  }
  const normalized = {};
  for (const field of ["enabled", "preset", "removeNavigation", "removeForms"]) {
    const value = options[field];
    if (value !== undefined) {
      normalized[field] = value;
    }
  }
  return normalized;
}
292
/**
 * Copy the recognized HTML conversion options that are not `undefined` into
 * a fresh object, then attach the normalized `preprocessing` sub-object when
 * `normalizeHtmlPreprocessing` returns one.
 *
 * @param options - HTML conversion options, or a falsy value.
 * @returns A new object with the defined options, or `undefined` when
 *   `options` is falsy.
 */
function normalizeHtmlOptions(options) {
  if (!options) {
    return undefined;
  }
  // Order matters only for the key order of the resulting object.
  const scalarFields = [
    "headingStyle",
    "listIndentType",
    "listIndentWidth",
    "bullets",
    "strongEmSymbol",
    "escapeAsterisks",
    "escapeUnderscores",
    "escapeMisc",
    "escapeAscii",
    "codeLanguage",
    "autolinks",
    "defaultTitle",
    "brInTables",
    "hocrSpatialTables",
    "highlightStyle",
    "extractMetadata",
    "whitespaceMode",
    "stripNewlines",
    "wrap",
    "wrapWidth",
    "convertAsInline",
    "subSymbol",
    "supSymbol",
    "newlineStyle",
    "codeBlockStyle",
    "keepInlineImagesIn",
    "encoding",
    "debug",
    "stripTags",
    "preserveTags"
  ];
  const normalized = {};
  for (const field of scalarFields) {
    const value = options[field];
    if (value !== undefined) {
      normalized[field] = value;
    }
  }
  const preprocessing = normalizeHtmlPreprocessing(options.preprocessing);
  if (preprocessing !== undefined) {
    normalized.preprocessing = preprocessing;
  }
  return normalized;
}
331
/**
 * Copy the recognized keyword-extraction options that are not `undefined`
 * into a fresh object.
 *
 * @param config - Keyword options, or a falsy value.
 * @returns A new object with the defined options, or `undefined` when
 *   `config` is falsy.
 */
function normalizeKeywordConfig(config) {
  if (!config) {
    return undefined;
  }
  const fields = ["algorithm", "maxKeywords", "minScore", "ngramRange", "language", "yakeParams", "rakeParams"];
  const normalized = {};
  fields.forEach((field) => {
    const value = config[field];
    if (value !== undefined) {
      normalized[field] = value;
    }
  });
  return normalized;
}
345
/**
 * Translate the camelCase page options into the snake_case keys this
 * function emits, skipping options that are `undefined`.
 *
 * NOTE(review): every other normalizer in this module emits camelCase keys;
 * confirm that the native side really expects snake_case here.
 *
 * @param pages - Page options, or a falsy value.
 * @returns A new object with `extract_pages`, `insert_page_markers` and
 *   `marker_format` for each defined option, or `undefined` when `pages` is
 *   falsy.
 */
function normalizePageConfig(pages) {
  if (!pages) {
    return undefined;
  }
  const keyMap = [
    ["extract_pages", "extractPages"],
    ["insert_page_markers", "insertPageMarkers"],
    ["marker_format", "markerFormat"]
  ];
  const normalized = {};
  for (const [wireKey, sourceKey] of keyMap) {
    const value = pages[sourceKey];
    if (value !== undefined) {
      normalized[wireKey] = value;
    }
  }
  return normalized;
}
355
/**
 * Build the configuration object handed to the native binding.
 *
 * Top-level flags are copied when defined. Each nested section is passed
 * through its dedicated normalizer and attached only when that normalizer
 * returns something other than `undefined`.
 *
 * @param config - User-supplied extraction configuration, or a falsy value.
 * @returns A new normalized object, or `null` when `config` is falsy.
 */
function normalizeExtractionConfig(config) {
  if (!config) {
    return null;
  }
  const normalized = {};
  for (const flag of ["useCache", "enableQualityProcessing", "forceOcr", "maxConcurrentExtractions"]) {
    const value = config[flag];
    if (value !== undefined) {
      normalized[flag] = value;
    }
  }
  // Section name -> normalizer, applied in this order.
  const sections = [
    ["ocr", normalizeOcrConfig],
    ["chunking", normalizeChunkingConfig],
    ["images", normalizeImageExtractionConfig],
    ["pdfOptions", normalizePdfConfig],
    ["tokenReduction", normalizeTokenReductionConfig],
    ["languageDetection", normalizeLanguageDetectionConfig],
    ["postprocessor", normalizePostProcessorConfig],
    ["keywords", normalizeKeywordConfig],
    ["pages", normalizePageConfig],
    ["htmlOptions", normalizeHtmlOptions]
  ];
  for (const [section, normalize] of sections) {
    const value = normalize(config[section]);
    if (value !== undefined) {
      normalized[section] = value;
    }
  }
  return normalized;
}
386
/**
 * Synchronously extract a file through the native binding.
 *
 * @param filePath - Path forwarded to the binding.
 * @param mimeType - MIME type forwarded to the binding; defaults to `null`.
 * @param config - Extraction configuration; normalized before the call.
 * @returns The binding's result after `convertResult`.
 */
function extractFileSync(filePath, mimeType = null, config = null) {
  const options = normalizeExtractionConfig(config);
  return convertResult(getBinding().extractFileSync(filePath, mimeType, options));
}
391
/**
 * Asynchronously extract a file through the native binding.
 *
 * @param filePath - Path forwarded to the binding.
 * @param mimeType - MIME type forwarded to the binding; defaults to `null`.
 * @param config - Extraction configuration; normalized before the call.
 * @returns A promise for the binding's result after `convertResult`.
 */
async function extractFile(filePath, mimeType = null, config = null) {
  const options = normalizeExtractionConfig(config);
  const raw = await getBinding().extractFile(filePath, mimeType, options);
  return convertResult(raw);
}
396
/**
 * Synchronously extract in-memory bytes through the native binding.
 *
 * @param data - Input bytes; must be a Uint8Array.
 * @param mimeType - MIME type forwarded to the binding.
 * @param config - Extraction configuration; normalized before the call.
 * @returns The binding's result after `convertResult`.
 * @throws {TypeError} If `data` is not a Uint8Array.
 */
function extractBytesSync(data, mimeType, config = null) {
  const bytes = assertUint8Array(data, "data");
  const options = normalizeExtractionConfig(config);
  // The binding is handed a Buffer built from the validated bytes.
  return convertResult(getBinding().extractBytesSync(Buffer.from(bytes), mimeType, options));
}
402
/**
 * Asynchronously extract in-memory bytes through the native binding.
 *
 * When the environment variable `KREUZBERG_DEBUG_GUTEN` is "1", the first
 * eight input bytes are logged before the call.
 *
 * @param data - Input bytes; must be a Uint8Array.
 * @param mimeType - MIME type forwarded to the binding.
 * @param config - Extraction configuration; normalized before the call.
 * @returns A promise for the binding's result after `convertResult`.
 * @throws {TypeError} If `data` is not a Uint8Array.
 */
async function extractBytes(data, mimeType, config = null) {
  const bytes = assertUint8Array(data, "data");
  const debugEnabled = process.env.KREUZBERG_DEBUG_GUTEN === "1";
  if (debugEnabled) {
    console.log("[TypeScript] Debug input header:", Array.from(bytes.slice(0, 8)));
  }
  const options = normalizeExtractionConfig(config);
  const raw = await getBinding().extractBytes(Buffer.from(bytes), mimeType, options);
  return convertResult(raw);
}
411
/**
 * Synchronously extract several files through the native binding.
 *
 * @param paths - Paths forwarded to the binding.
 * @param config - Extraction configuration; normalized before the call.
 * @returns The binding's results, each passed through `convertResult`.
 */
function batchExtractFilesSync(paths, config = null) {
  const options = normalizeExtractionConfig(config);
  const rawList = getBinding().batchExtractFilesSync(paths, options);
  return rawList.map((raw) => convertResult(raw));
}
416
/**
 * Asynchronously extract several files through the native binding.
 *
 * @param paths - Paths forwarded to the binding.
 * @param config - Extraction configuration; normalized before the call.
 * @returns A promise for the binding's results, each passed through
 *   `convertResult`.
 */
async function batchExtractFiles(paths, config = null) {
  const options = normalizeExtractionConfig(config);
  const rawList = await getBinding().batchExtractFiles(paths, options);
  return rawList.map((raw) => convertResult(raw));
}
421
/**
 * Synchronously extract several in-memory documents through the native
 * binding.
 *
 * @param dataList - Array of Uint8Array inputs.
 * @param mimeTypes - MIME types, one per entry of `dataList`.
 * @param config - Extraction configuration; normalized before the call.
 * @returns The binding's results, each passed through `convertResult`.
 * @throws {TypeError} If `dataList` is not an array of Uint8Array or the two
 *   lists differ in length.
 */
function batchExtractBytesSync(dataList, mimeTypes, config = null) {
  const validated = assertUint8ArrayList(dataList, "dataList");
  const buffers = validated.map((bytes) => Buffer.from(bytes));
  if (buffers.length !== mimeTypes.length) {
    throw new TypeError("dataList and mimeTypes must have the same length");
  }
  const options = normalizeExtractionConfig(config);
  const rawList = getBinding().batchExtractBytesSync(buffers, mimeTypes, options);
  return rawList.map((raw) => convertResult(raw));
}
430
/**
 * Asynchronously extract several in-memory documents through the native
 * binding.
 *
 * @param dataList - Array of Uint8Array inputs.
 * @param mimeTypes - MIME types, one per entry of `dataList`.
 * @param config - Extraction configuration; normalized before the call.
 * @returns A promise for the binding's results, each passed through
 *   `convertResult`.
 * @throws {TypeError} If `dataList` is not an array of Uint8Array or the two
 *   lists differ in length.
 */
async function batchExtractBytes(dataList, mimeTypes, config = null) {
  const validated = assertUint8ArrayList(dataList, "dataList");
  const buffers = validated.map((bytes) => Buffer.from(bytes));
  if (buffers.length !== mimeTypes.length) {
    throw new TypeError("dataList and mimeTypes must have the same length");
  }
  const options = normalizeExtractionConfig(config);
  const rawList = await getBinding().batchExtractBytes(buffers, mimeTypes, options);
  return rawList.map((raw) => convertResult(raw));
}
439
/**
 * Register a JavaScript post-processor with the native binding.
 *
 * The processor is wrapped so that its `process` callback:
 *   1. accepts the JSON payload either as a bare string or wrapped in a
 *      one-element array (other callbacks in this module receive both
 *      shapes),
 *   2. decodes the snake_case wire result into the camelCase result shape,
 *   3. awaits `processor.process(result)`, and
 *   4. returns the updated result re-encoded as snake_case JSON.
 *
 * The original processor and its stage (default "middle") are attached to
 * the wrapper as non-enumerable `__original` and `__stage` properties.
 *
 * @param processor - Object exposing `name()`, `process(result)` and
 *   optionally `processingStage()`.
 * @throws {Error} From the wrapped `process` when the payload is not a
 *   usable JSON string.
 */
function registerPostProcessor(processor) {
  const binding2 = getBinding();
  const wrappedProcessor = {
    name: processor.name.bind(processor),
    processingStage: processor.processingStage?.bind(processor),
    async process(...args) {
      const wrappedValue = args[0];
      // Tolerate both `[json]` and `json`; indexing a bare string would
      // silently pick its first character.
      const jsonString = Array.isArray(wrappedValue) ? wrappedValue[0] : wrappedValue;
      if (typeof jsonString !== "string" || jsonString === "" || jsonString === "undefined") {
        throw new Error("Post-processor received invalid JSON string");
      }
      const wireResult = JSON.parse(jsonString);
      const result = {
        content: wireResult.content,
        mimeType: wireResult.mime_type,
        metadata: typeof wireResult.metadata === "string" ? JSON.parse(wireResult.metadata) : wireResult.metadata,
        tables: wireResult.tables || [],
        detectedLanguages: wireResult.detected_languages ?? null,
        chunks: wireResult.chunks ?? null,
        images: wireResult.images ?? null
      };
      const updated = await processor.process(result);
      const wireUpdated = {
        content: updated.content,
        mime_type: updated.mimeType,
        metadata: updated.metadata,
        tables: updated.tables,
        detected_languages: updated.detectedLanguages,
        chunks: updated.chunks,
        images: updated.images
      };
      return JSON.stringify(wireUpdated);
    }
  };
  Object.defineProperty(wrappedProcessor, "__original", {
    value: processor,
    enumerable: false
  });
  const stage = processor.processingStage?.() ?? "middle";
  Object.defineProperty(wrappedProcessor, "__stage", {
    value: stage,
    enumerable: false
  });
  binding2.registerPostProcessor(wrappedProcessor);
}
481
/**
 * Forward to the native binding's `unregisterPostProcessor`.
 *
 * @param name - Name passed through to the binding.
 */
function unregisterPostProcessor(name) {
  getBinding().unregisterPostProcessor(name);
}
485
/**
 * Forward to the native binding's `clearPostProcessors`.
 */
function clearPostProcessors() {
  getBinding().clearPostProcessors();
}
489
/**
 * Forward to the native binding's `listPostProcessors`.
 *
 * @returns Whatever the binding returns.
 */
function listPostProcessors() {
  return getBinding().listPostProcessors();
}
493
/**
 * Register a JavaScript validator with the native binding.
 *
 * The wrapper's `validate` callback takes the JSON payload as its first
 * argument, decodes the snake_case wire result into the camelCase result
 * shape, awaits `validator.validate(result)` and resolves to an empty
 * string.
 *
 * @param validator - Object exposing `name()`, `validate(result)` and
 *   optionally `priority()`.
 */
function registerValidator(validator) {
  const native = getBinding();
  const decode = (wire) => ({
    content: wire.content,
    mimeType: wire.mime_type,
    metadata: typeof wire.metadata === "string" ? JSON.parse(wire.metadata) : wire.metadata,
    tables: wire.tables || [],
    detectedLanguages: wire.detected_languages,
    chunks: wire.chunks,
    images: wire.images ?? null
  });
  native.registerValidator({
    name: validator.name.bind(validator),
    priority: validator.priority?.bind(validator),
    async validate(...args) {
      const [jsonString] = args;
      if (!jsonString || jsonString === "undefined") {
        throw new Error("Validator received invalid JSON string");
      }
      const result = decode(JSON.parse(jsonString));
      await Promise.resolve(validator.validate(result));
      return "";
    }
  });
}
519
/**
 * Forward to the native binding's `unregisterValidator`.
 *
 * @param name - Name passed through to the binding.
 */
function unregisterValidator(name) {
  getBinding().unregisterValidator(name);
}
523
/**
 * Forward to the native binding's `clearValidators`.
 */
function clearValidators() {
  getBinding().clearValidators();
}
527
/**
 * Forward to the native binding's `listValidators`.
 *
 * @returns Whatever the binding returns.
 */
function listValidators() {
  return getBinding().listValidators();
}
531
/**
 * Tell whether `value` is a two-element array whose second element is a
 * string and whose first element is a string, a Buffer or a Uint8Array.
 *
 * @param value - Candidate value.
 * @returns `true` when `value` has that shape, otherwise `false`.
 */
function isOcrProcessTuple(value) {
  if (!Array.isArray(value) || value.length !== 2) {
    return false;
  }
  const [payload, language] = value;
  if (typeof language !== "string") {
    return false;
  }
  return typeof payload === "string" || Buffer.isBuffer(payload) || payload instanceof Uint8Array;
}
534
/**
 * Tell whether `value` is a one-element array whose only element satisfies
 * `isOcrProcessTuple`.
 *
 * @param value - Candidate value.
 * @returns `true` when `value` has that shape, otherwise `false`.
 */
function isNestedOcrProcessTuple(value) {
  if (!Array.isArray(value)) {
    return false;
  }
  if (value.length !== 1) {
    return false;
  }
  return isOcrProcessTuple(value[0]);
}
537
/**
 * Summarize a payload for debug logging.
 *
 * @param value - A string, or an object with a `length` property.
 * @returns `{ ctor, length }`, where `ctor` is "String" for strings and
 *   otherwise the constructor's name, falling back to "Buffer" when no name
 *   is available.
 */
function describePayload(value) {
  const ctor = typeof value === "string" ? "String" : value.constructor?.name ?? "Buffer";
  return { ctor, length: value.length };
}
543
/**
 * Register a JavaScript OCR backend with the native binding.
 *
 * The wrapper's `processImage` callback accepts its input in three shapes:
 * `(bytes, language)` as two arguments, a `[bytes, language]` tuple as the
 * first argument, or that tuple nested in a one-element array. String
 * payloads are decoded as base64. The bytes are handed to
 * `backend.processImage` as a Uint8Array, and its awaited result is returned
 * as JSON text.
 *
 * When the environment variable `KREUZBERG_DEBUG_GUTEN` is "1", the argument
 * shapes are logged.
 *
 * @param backend - Object exposing `name()`, `supportedLanguages()` and
 *   `processImage(bytes, language)`.
 */
function registerOcrBackend(backend) {
  const native = getBinding();
  const debugEnabled = () => process.env.KREUZBERG_DEBUG_GUTEN === "1";
  const wrappedBackend = {
    name: backend.name.bind(backend),
    supportedLanguages: backend.supportedLanguages.bind(backend),
    async processImage(...processArgs) {
      const imagePayload = processArgs[0];
      const maybeLanguage = processArgs[1];
      const payloadIsArray = Array.isArray(imagePayload);
      if (debugEnabled()) {
        console.log("[registerOcrBackend] JS arguments", { length: processArgs.length });
        console.log("[registerOcrBackend] Raw args", {
          imagePayloadType: payloadIsArray ? "tuple" : typeof imagePayload,
          maybeLanguageType: typeof maybeLanguage,
          metadata: payloadIsArray ? { tupleLength: imagePayload.length } : describePayload(imagePayload)
        });
      }
      // Default to the two-argument shape, then unwrap tuple shapes.
      let rawBytes = imagePayload;
      let language = maybeLanguage;
      if (isNestedOcrProcessTuple(imagePayload)) {
        [rawBytes, language] = imagePayload[0];
      } else if (isOcrProcessTuple(imagePayload)) {
        [rawBytes, language] = imagePayload;
      }
      if (typeof language !== "string") {
        throw new Error("OCR backend did not receive a language parameter");
      }
      if (debugEnabled()) {
        const length = rawBytes.length;
        console.log(
          "[registerOcrBackend] Received payload",
          payloadIsArray ? "tuple" : typeof rawBytes,
          "ctor",
          describePayload(rawBytes).ctor,
          "length",
          length
        );
      }
      const buffer = typeof rawBytes === "string" ? Buffer.from(rawBytes, "base64") : Buffer.from(rawBytes);
      const result = await backend.processImage(new Uint8Array(buffer), language);
      return JSON.stringify(result);
    }
  };
  native.registerOcrBackend(wrappedBackend);
}
588
/**
 * Forward to the native binding's `listOcrBackends`.
 *
 * @returns Whatever the binding returns.
 */
function listOcrBackends() {
  return getBinding().listOcrBackends();
}
592
/**
 * Forward to the native binding's `unregisterOcrBackend`.
 *
 * @param name - Name passed through to the binding.
 */
function unregisterOcrBackend(name) {
  getBinding().unregisterOcrBackend(name);
}
596
/**
 * Forward to the native binding's `clearOcrBackends`.
 */
function clearOcrBackends() {
  getBinding().clearOcrBackends();
}
600
/**
 * Forward to the native binding's `listDocumentExtractors`.
 *
 * @returns Whatever the binding returns.
 */
function listDocumentExtractors() {
  return getBinding().listDocumentExtractors();
}
604
/**
 * Forward to the native binding's `unregisterDocumentExtractor`.
 *
 * @param name - Name passed through to the binding.
 */
function unregisterDocumentExtractor(name) {
  getBinding().unregisterDocumentExtractor(name);
}
608
/**
 * Forward to the native binding's `clearDocumentExtractors`.
 */
function clearDocumentExtractors() {
  getBinding().clearDocumentExtractors();
}
612
/**
 * Helpers for obtaining an extraction configuration from disk. Both methods
 * delegate entirely to the native binding.
 */
const ExtractionConfig = {
  /**
   * Load an extraction configuration from `filePath` via the binding's
   * `loadExtractionConfigFromFile`.
   *
   * The format is expected to be chosen from the file extension (`.toml`,
   * `.yaml` or `.json`); that selection, and any error for a missing file,
   * invalid content or unsupported extension, comes from the native side.
   *
   * @param filePath - Path to the configuration file (absolute or relative).
   * @returns The configuration object produced by the binding.
   *
   * @example
   * ```typescript
   * import { ExtractionConfig } from '@kreuzberg/node';
   *
   * const fromToml = ExtractionConfig.fromFile('kreuzberg.toml');
   * const fromYaml = ExtractionConfig.fromFile('./config.yaml');
   * const fromJson = ExtractionConfig.fromFile('./config.json');
   * ```
   */
  fromFile(filePath) {
    return getBinding().loadExtractionConfigFromFile(filePath);
  },
  /**
   * Look for a configuration file via the binding's
   * `discoverExtractionConfig`.
   *
   * The search is expected to start in the current working directory and
   * walk up through parent directories looking for `kreuzberg.toml`; the
   * search itself happens on the native side.
   *
   * @returns The discovered configuration, or null when none is found.
   *
   * @example
   * ```typescript
   * import { ExtractionConfig } from '@kreuzberg/node';
   *
   * const config = ExtractionConfig.discover();
   * if (config) {
   *   console.log('Found configuration');
   * } else {
   *   console.log('No configuration file found, using defaults');
   * }
   * ```
   */
  discover() {
    return getBinding().discoverExtractionConfig();
  }
};
674
/**
 * Forward to the native binding's `detectMimeType`.
 *
 * @param bytes - Value passed through to the binding.
 * @returns Whatever the binding returns.
 */
function detectMimeType(bytes) {
  return getBinding().detectMimeType(bytes);
}
678
/**
 * Forward to the native binding's `detectMimeTypeFromPath`.
 *
 * @param path - Path passed through to the binding.
 * @param checkExists - Flag passed through to the binding.
 * @returns Whatever the binding returns.
 */
function detectMimeTypeFromPath(path, checkExists) {
  return getBinding().detectMimeTypeFromPath(path, checkExists);
}
682
/**
 * Forward to the native binding's `validateMimeType`.
 *
 * @param mimeType - Value passed through to the binding.
 * @returns Whatever the binding returns.
 */
function validateMimeType(mimeType) {
  return getBinding().validateMimeType(mimeType);
}
686
/**
 * Forward to the native binding's `getExtensionsForMime`.
 *
 * @param mimeType - Value passed through to the binding.
 * @returns Whatever the binding returns.
 */
function getExtensionsForMime(mimeType) {
  return getBinding().getExtensionsForMime(mimeType);
}
690
/**
 * Forward to the native binding's `listEmbeddingPresets`.
 *
 * @returns Whatever the binding returns.
 */
function listEmbeddingPresets() {
  return getBinding().listEmbeddingPresets();
}
694
/**
 * Forward to the native binding's `getEmbeddingPreset`.
 *
 * @param name - Name passed through to the binding.
 * @returns Whatever the binding returns.
 */
function getEmbeddingPreset(name) {
  return getBinding().getEmbeddingPreset(name);
}
698
/**
 * Forward to the native binding's `getLastErrorCode`.
 *
 * @returns Whatever the binding returns.
 */
function getLastErrorCode() {
  return getBinding().getLastErrorCode();
}
702
/**
 * Forward to the native binding's `getLastPanicContext`.
 *
 * @returns Whatever the binding returns.
 */
function getLastPanicContext() {
  return getBinding().getLastPanicContext();
}
706
+ const __version__ = "4.0.0-rc.8";
707
+ export {
708
+ CacheError,
709
+ ErrorCode,
710
+ ExtractionConfig,
711
+ GutenOcrBackend,
712
+ ImageProcessingError,
713
+ KreuzbergError,
714
+ MissingDependencyError,
715
+ OcrError,
716
+ ParsingError,
717
+ PluginError,
718
+ ValidationError,
719
+ __resetBindingForTests,
720
+ __setBindingForTests,
721
+ __version__,
722
+ batchExtractBytes,
723
+ batchExtractBytesSync,
724
+ batchExtractFiles,
725
+ batchExtractFilesSync,
726
+ clearDocumentExtractors,
727
+ clearOcrBackends,
728
+ clearPostProcessors,
729
+ clearValidators,
730
+ detectMimeType,
731
+ detectMimeTypeFromPath,
732
+ extractBytes,
733
+ extractBytesSync,
734
+ extractFile,
735
+ extractFileSync,
736
+ getEmbeddingPreset,
737
+ getExtensionsForMime,
738
+ getLastErrorCode,
739
+ getLastPanicContext,
740
+ listDocumentExtractors,
741
+ listEmbeddingPresets,
742
+ listOcrBackends,
743
+ listPostProcessors,
744
+ listValidators,
745
+ registerOcrBackend,
746
+ registerPostProcessor,
747
+ registerValidator,
748
+ unregisterDocumentExtractor,
749
+ unregisterOcrBackend,
750
+ unregisterPostProcessor,
751
+ unregisterValidator,
752
+ validateMimeType
753
+ };
754
+ //# sourceMappingURL=index.mjs.map