@kreuzberg/node 4.0.0-rc.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,918 @@
1
+ "use strict";
2
// Module interop helpers: aliases for the Object reflection primitives, plus
// small utilities that build the CommonJS export object out of live getters.
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;

/** Defines every entry of `all` on `target` as an enumerable getter. */
var __export = (target, all) => {
  for (var name in all) {
    __defProp(target, name, { get: all[name], enumerable: true });
  }
};

/**
 * Mirrors the own properties of `from` onto `to` as getters, skipping keys
 * that `to` already owns and the single key named by `except`.
 */
var __copyProps = (to, from, except, desc) => {
  const copyable = (from && typeof from === "object") || typeof from === "function";
  if (!copyable) {
    return to;
  }
  for (let key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, { get: () => from[key], enumerable: !desc || desc.enumerable });
  }
  return to;
};

/** Re-exports everything except `default` from `mod` onto one or two targets. */
var __reExport = (target, mod, secondTarget) => {
  __copyProps(target, mod, "default");
  return secondTarget && __copyProps(secondTarget, mod, "default");
};

/** Builds a fresh object flagged `__esModule` that mirrors `mod`. */
var __toCommonJS = (mod) => {
  const shell = __defProp({}, "__esModule", { value: true });
  return __copyProps(shell, mod);
};
20
+ var index_exports = {};
21
+ __export(index_exports, {
22
+ CacheError: () => import_errors.CacheError,
23
+ ErrorCode: () => import_errors.ErrorCode,
24
+ ExtractionConfig: () => ExtractionConfig,
25
+ GutenOcrBackend: () => import_guten_ocr.GutenOcrBackend,
26
+ ImageProcessingError: () => import_errors.ImageProcessingError,
27
+ KreuzbergError: () => import_errors.KreuzbergError,
28
+ MissingDependencyError: () => import_errors.MissingDependencyError,
29
+ OcrError: () => import_errors.OcrError,
30
+ ParsingError: () => import_errors.ParsingError,
31
+ PluginError: () => import_errors.PluginError,
32
+ ValidationError: () => import_errors.ValidationError,
33
+ __resetBindingForTests: () => __resetBindingForTests,
34
+ __setBindingForTests: () => __setBindingForTests,
35
+ __version__: () => __version__,
36
+ batchExtractBytes: () => batchExtractBytes,
37
+ batchExtractBytesSync: () => batchExtractBytesSync,
38
+ batchExtractFiles: () => batchExtractFiles,
39
+ batchExtractFilesSync: () => batchExtractFilesSync,
40
+ clearDocumentExtractors: () => clearDocumentExtractors,
41
+ clearOcrBackends: () => clearOcrBackends,
42
+ clearPostProcessors: () => clearPostProcessors,
43
+ clearValidators: () => clearValidators,
44
+ detectMimeType: () => detectMimeType,
45
+ detectMimeTypeFromPath: () => detectMimeTypeFromPath,
46
+ extractBytes: () => extractBytes,
47
+ extractBytesSync: () => extractBytesSync,
48
+ extractFile: () => extractFile,
49
+ extractFileSync: () => extractFileSync,
50
+ getEmbeddingPreset: () => getEmbeddingPreset,
51
+ getExtensionsForMime: () => getExtensionsForMime,
52
+ getLastErrorCode: () => getLastErrorCode,
53
+ getLastPanicContext: () => getLastPanicContext,
54
+ listDocumentExtractors: () => listDocumentExtractors,
55
+ listEmbeddingPresets: () => listEmbeddingPresets,
56
+ listOcrBackends: () => listOcrBackends,
57
+ listPostProcessors: () => listPostProcessors,
58
+ listValidators: () => listValidators,
59
+ registerOcrBackend: () => registerOcrBackend,
60
+ registerPostProcessor: () => registerPostProcessor,
61
+ registerValidator: () => registerValidator,
62
+ unregisterDocumentExtractor: () => unregisterDocumentExtractor,
63
+ unregisterOcrBackend: () => unregisterOcrBackend,
64
+ unregisterPostProcessor: () => unregisterPostProcessor,
65
+ unregisterValidator: () => unregisterValidator,
66
+ validateMimeType: () => validateMimeType
67
+ });
68
+ module.exports = __toCommonJS(index_exports);
69
+ var import_node_module = require("node:module");
70
+ var import_errors = require("./errors.js");
71
+ var import_guten_ocr = require("./ocr/guten-ocr.js");
72
+ __reExport(index_exports, require("./types.js"), module.exports);
73
+ const import_meta = {};
74
+ let binding = null;
75
+ let bindingInitialized = false;
76
/**
 * Wraps a failure raised while loading the native module in an Error that
 * carries troubleshooting guidance.
 *
 * The original value is always attached as `cause`, and the pdfium hint is
 * evaluated for every kind of thrown value (not only Error instances).
 *
 * @param {unknown} error - Whatever was thrown while loading the binding.
 * @returns {Error} Error whose message embeds the underlying detail.
 */
function createNativeBindingError(error) {
  const detail = error instanceof Error ? error.message || error.toString() : String(error);
  const parts = ["Failed to load Kreuzberg native bindings."];
  if (/pdfium/i.test(detail)) {
    parts.push(
      "Pdfium runtime library was not found. Ensure the bundled libpdfium (dll/dylib/so) is present next to the native module."
    );
  }
  parts.push(
    "Report this error and attach the logs/stack trace for investigation.",
    `Underlying error: ${detail}`
  );
  return new Error(parts.join(" "), { cause: error });
}
104
/**
 * Guard that passes `value` through when it is a Uint8Array.
 *
 * @param {unknown} value - Candidate value.
 * @param {string} name - Label used in the error message.
 * @returns {Uint8Array} The same `value`.
 * @throws {TypeError} When `value` is not a Uint8Array instance.
 */
function assertUint8Array(value, name) {
  if (value instanceof Uint8Array) {
    return value;
  }
  throw new TypeError(`${name} must be a Uint8Array`);
}
110
/**
 * Validates that `values` is an array whose entries are all Uint8Array.
 *
 * @param {unknown} values - Candidate list.
 * @param {string} name - Label used in error messages.
 * @returns {Uint8Array[]} A new array holding the same entries.
 * @throws {TypeError} When `values` is not an array, or an entry is not a Uint8Array.
 */
function assertUint8ArrayList(values, name) {
  if (!Array.isArray(values)) {
    throw new TypeError(`${name} must be an array of Uint8Array`);
  }
  return values.map((entry, position) => {
    if (entry instanceof Uint8Array) {
      return entry;
    }
    throw new TypeError(`${name}[${position}] must be a Uint8Array`);
  });
}
123
/**
 * Test hook: installs `mock` as the active binding and marks the binding as
 * initialized so the real loader is bypassed.
 *
 * @param {object} mock - Replacement binding.
 */
function __setBindingForTests(mock) {
  bindingInitialized = true;
  binding = mock;
}
127
/**
 * Test hook: clears the cached binding and the initialized flag so the next
 * `getBinding()` call attempts a fresh load.
 */
function __resetBindingForTests() {
  bindingInitialized = false;
  binding = null;
}
131
/**
 * Requires `../index.js` and verifies that it exposes every extraction entry
 * point this wrapper relies on.
 *
 * @returns {object} The loaded native module.
 * @throws {Error} When no loader is available, the module is not an object,
 *   or a required method is missing.
 */
function loadNativeBinding() {
  let localRequire;
  if (typeof require !== "undefined") {
    localRequire = require;
  } else {
    const { createRequire } = import_node_module;
    localRequire = createRequire(import_meta.url);
  }
  if (!localRequire) {
    throw new Error("Unable to resolve native binding loader (require not available).");
  }
  const loadedModule = localRequire("../index.js");
  if (loadedModule === null || typeof loadedModule !== "object") {
    throw new Error(
      "Native binding is not a valid object. Ensure the native module is properly built and compatible."
    );
  }
  const requiredMethods = [
    "extractFileSync",
    "extractFile",
    "extractBytesSync",
    "extractBytes",
    "batchExtractFilesSync",
    "batchExtractFiles",
    "batchExtractBytesSync",
    "batchExtractBytes"
  ];
  // Report the first missing entry point, in declaration order.
  const missing = requiredMethods.find((method) => typeof loadedModule[method] !== "function");
  if (missing !== undefined) {
    throw new Error(
      `Native binding is missing required method: ${missing}. Ensure the native module is properly built and compatible.`
    );
  }
  return loadedModule;
}
162
/**
 * Returns the native binding, loading it on first use.
 *
 * A failed load is remembered: the initialized flag is set while `binding`
 * stays null, so later calls throw immediately instead of retrying.
 *
 * @returns {object} The active binding.
 * @throws {Error} When loading fails, failed previously, or the runtime does
 *   not report a Node.js version.
 */
function getBinding() {
  if (bindingInitialized) {
    if (binding !== null) {
      return binding;
    }
    throw new Error("Native binding was previously failed to load.");
  }
  try {
    const onNode = typeof process !== "undefined" && process.versions && process.versions.node;
    if (onNode) {
      binding = loadNativeBinding();
      bindingInitialized = true;
      return binding;
    }
  } catch (error) {
    bindingInitialized = true;
    throw createNativeBindingError(error);
  }
  throw new Error(
    "Failed to load Kreuzberg bindings. Neither NAPI (Node.js) nor WASM (browsers/Deno) bindings are available. Make sure you have installed the @kreuzberg/node package for Node.js/Bun."
  );
}
183
/**
 * Best-effort JSON decode of a metadata string.
 *
 * @param {string} metadataStr - JSON text.
 * @returns {object} The decoded value when it is a non-null object (arrays
 *   included); otherwise an empty object, also on parse failure.
 */
function parseMetadata(metadataStr) {
  let decoded;
  try {
    decoded = JSON.parse(metadataStr);
  } catch {
    // Malformed metadata is deliberately tolerated.
    return {};
  }
  return decoded !== null && typeof decoded === "object" ? decoded : {};
}
194
/**
 * Coerces binary-like input to a Uint8Array.
 *
 * - Uint8Array instances (Buffer included, since Buffer extends Uint8Array)
 *   are returned as-is.
 * - Other ArrayBuffer views are re-viewed over the same bytes without copying.
 * - ArrayBuffer instances are wrapped.
 * - Plain arrays of numbers are copied into a new Uint8Array.
 * - Anything else yields an empty Uint8Array.
 *
 * @param {unknown} value - Candidate binary payload.
 * @returns {Uint8Array}
 */
function ensureUint8Array(value) {
  if (value instanceof Uint8Array) {
    return value;
  }
  if (ArrayBuffer.isView(value)) {
    return new Uint8Array(value.buffer, value.byteOffset, value.byteLength);
  }
  if (value instanceof ArrayBuffer) {
    return new Uint8Array(value);
  }
  if (Array.isArray(value)) {
    return new Uint8Array(value);
  }
  return new Uint8Array();
}
206
/**
 * Converts a raw chunk record into the public chunk shape.
 *
 * Each metadata field is looked up under its snake_case key first, then its
 * camelCase key. `byteStart`/`byteEnd` additionally fall back to the
 * `charStart`/`charEnd` keys that the previous lookup used. A non-object
 * input produces the same shape as an empty chunk, `firstPage` and
 * `lastPage` included.
 *
 * @param {unknown} rawChunk - Chunk as received from the binding.
 * @returns {{content: string, embedding: unknown, metadata: object}}
 */
function convertChunk(rawChunk) {
  const chunk = rawChunk && typeof rawChunk === "object" ? rawChunk : {};
  const metadata = chunk["metadata"] ?? {};
  // Returns the first non-nullish value found under `keys`, else `fallback`.
  const pick = (fallback, ...keys) => {
    for (const key of keys) {
      const candidate = metadata[key];
      if (candidate !== undefined && candidate !== null) {
        return candidate;
      }
    }
    return fallback;
  };
  return {
    content: chunk["content"] ?? "",
    embedding: chunk["embedding"] ?? null,
    metadata: {
      byteStart: pick(0, "byte_start", "byteStart", "charStart"),
      byteEnd: pick(0, "byte_end", "byteEnd", "charEnd"),
      tokenCount: pick(null, "token_count", "tokenCount"),
      chunkIndex: pick(0, "chunk_index", "chunkIndex"),
      totalChunks: pick(0, "total_chunks", "totalChunks"),
      firstPage: pick(null, "first_page", "firstPage"),
      lastPage: pick(null, "last_page", "lastPage")
    }
  };
}
245
/**
 * Converts a raw image record into the public image shape, filling defaults
 * for absent fields and recursively converting a nested OCR result.
 *
 * @param {unknown} rawImage - Image as received from the binding.
 * @returns {object} Normalized image; an all-default image for non-object input.
 */
function convertImage(rawImage) {
  const usable = Boolean(rawImage) && typeof rawImage === "object";
  if (!usable) {
    return {
      data: new Uint8Array(),
      format: "unknown",
      imageIndex: 0,
      pageNumber: null,
      width: null,
      height: null,
      colorspace: null,
      bitsPerComponent: null,
      isMask: false,
      description: null,
      ocrResult: null
    };
  }
  const nestedOcr = rawImage["ocrResult"];
  return {
    data: ensureUint8Array(rawImage["data"]),
    format: rawImage["format"] ?? "unknown",
    imageIndex: rawImage["imageIndex"] ?? 0,
    pageNumber: rawImage["pageNumber"] ?? null,
    width: rawImage["width"] ?? null,
    height: rawImage["height"] ?? null,
    colorspace: rawImage["colorspace"] ?? null,
    bitsPerComponent: rawImage["bitsPerComponent"] ?? null,
    isMask: rawImage["isMask"] ?? false,
    description: rawImage["description"] ?? null,
    ocrResult: nestedOcr ? convertResult(nestedOcr) : null
  };
}
287
/**
 * Converts a raw page record into the public page shape.
 *
 * @param {unknown} rawPage - Page as received from the binding.
 * @returns {{pageNumber: number, content: string, tables: Array, images: Array}}
 *   `tables` is passed through untouched; each image goes through `convertImage`.
 */
function convertPageContent(rawPage) {
  if (!rawPage || typeof rawPage !== "object") {
    return { pageNumber: 0, content: "", tables: [], images: [] };
  }
  const rawTables = rawPage["tables"];
  const rawImages = rawPage["images"];
  let images = [];
  if (Array.isArray(rawImages)) {
    images = rawImages.map((entry) => convertImage(entry));
  }
  return {
    pageNumber: rawPage["pageNumber"] ?? 0,
    content: rawPage["content"] ?? "",
    tables: Array.isArray(rawTables) ? rawTables : [],
    images
  };
}
308
+ function convertResult(rawResult) {
309
+ if (!rawResult || typeof rawResult !== "object") {
310
+ return {
311
+ content: "",
312
+ mimeType: "application/octet-stream",
313
+ metadata: {},
314
+ tables: [],
315
+ detectedLanguages: null,
316
+ chunks: null,
317
+ images: null,
318
+ pages: null
319
+ };
320
+ }
321
+ const result = rawResult;
322
+ const metadata = result["metadata"];
323
+ const metadataValue = typeof metadata === "string" ? parseMetadata(metadata) : metadata ?? {};
324
+ return {
325
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
326
+ content: result["content"] ?? "",
327
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
328
+ mimeType: result["mimeType"] ?? "application/octet-stream",
329
+ metadata: metadataValue,
330
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
331
+ tables: Array.isArray(result["tables"]) ? result["tables"] : [],
332
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
333
+ detectedLanguages: Array.isArray(result["detectedLanguages"]) ? result["detectedLanguages"] : null,
334
+ chunks: (() => {
335
+ const chunksData = result["chunks"];
336
+ return Array.isArray(chunksData) ? chunksData.map((chunk) => convertChunk(chunk)) : null;
337
+ })(),
338
+ images: (() => {
339
+ const imagesData = result["images"];
340
+ return Array.isArray(imagesData) ? imagesData.map((image) => convertImage(image)) : null;
341
+ })(),
342
+ pages: (() => {
343
+ const pagesData = result["pages"];
344
+ return Array.isArray(pagesData) ? pagesData.map((page) => convertPageContent(page)) : null;
345
+ })()
346
+ };
347
+ }
348
/**
 * Assigns `value` to `target[key]` unless `value` is `undefined`.
 * `null` and other falsy values are still assigned.
 *
 * @param {object} target - Object to mutate.
 * @param {string} key - Property name.
 * @param {unknown} value - Value to assign.
 */
function setIfDefined(target, key, value) {
  if (value === undefined) {
    return;
  }
  target[key] = value;
}
353
/**
 * Copies the defined Tesseract options of `config` into a fresh object.
 *
 * @param {object|null|undefined} config - Tesseract options.
 * @returns {object|undefined} `undefined` when `config` is falsy.
 */
function normalizeTesseractConfig(config) {
  if (!config) {
    return void 0;
  }
  const out = {};
  for (const field of ["psm", "enableTableDetection", "tesseditCharWhitelist"]) {
    setIfDefined(out, field, config[field]);
  }
  return out;
}
363
/**
 * Builds the OCR options object handed to the binding.
 *
 * `backend` is always copied (even when undefined); `language` and the
 * normalized `tesseractConfig` are only set when present.
 *
 * @param {object|null|undefined} ocr - OCR options.
 * @returns {object|undefined} `undefined` when `ocr` is falsy.
 */
function normalizeOcrConfig(ocr) {
  if (!ocr) {
    return void 0;
  }
  const out = { backend: ocr.backend };
  setIfDefined(out, "language", ocr.language);
  const tesseractOptions = normalizeTesseractConfig(ocr.tesseractConfig);
  if (tesseractOptions) {
    setIfDefined(out, "tesseractConfig", tesseractOptions);
  }
  return out;
}
377
/**
 * Copies the defined chunking options of `chunking` into a fresh object.
 *
 * @param {object|null|undefined} chunking - Chunking options.
 * @returns {object|undefined} `undefined` when `chunking` is falsy.
 */
function normalizeChunkingConfig(chunking) {
  if (!chunking) {
    return void 0;
  }
  const out = {};
  for (const field of ["maxChars", "maxOverlap", "preset", "embedding", "enabled"]) {
    setIfDefined(out, field, chunking[field]);
  }
  return out;
}
389
/**
 * Copies the defined image-extraction options of `images` into a fresh object.
 *
 * @param {object|null|undefined} images - Image extraction options.
 * @returns {object|undefined} `undefined` when `images` is falsy.
 */
function normalizeImageExtractionConfig(images) {
  if (!images) {
    return void 0;
  }
  const out = {};
  const fields = ["extractImages", "targetDpi", "maxImageDimension", "autoAdjustDpi", "minDpi", "maxDpi"];
  for (const field of fields) {
    setIfDefined(out, field, images[field]);
  }
  return out;
}
402
/**
 * Copies the defined PDF options of `pdf` into a fresh object.
 *
 * @param {object|null|undefined} pdf - PDF options.
 * @returns {object|undefined} `undefined` when `pdf` is falsy.
 */
function normalizePdfConfig(pdf) {
  if (!pdf) {
    return void 0;
  }
  const out = {};
  for (const field of ["extractImages", "passwords", "extractMetadata"]) {
    setIfDefined(out, field, pdf[field]);
  }
  return out;
}
412
/**
 * Copies the defined token-reduction options into a fresh object.
 *
 * @param {object|null|undefined} tokenReduction - Token reduction options.
 * @returns {object|undefined} `undefined` when `tokenReduction` is falsy.
 */
function normalizeTokenReductionConfig(tokenReduction) {
  if (!tokenReduction) {
    return void 0;
  }
  const out = {};
  for (const field of ["mode", "preserveImportantWords"]) {
    setIfDefined(out, field, tokenReduction[field]);
  }
  return out;
}
421
/**
 * Copies the defined language-detection options into a fresh object.
 *
 * @param {object|null|undefined} languageDetection - Language detection options.
 * @returns {object|undefined} `undefined` when `languageDetection` is falsy.
 */
function normalizeLanguageDetectionConfig(languageDetection) {
  if (!languageDetection) {
    return void 0;
  }
  const out = {};
  for (const field of ["enabled", "minConfidence", "detectMultiple"]) {
    setIfDefined(out, field, languageDetection[field]);
  }
  return out;
}
431
/**
 * Copies the defined post-processor options into a fresh object.
 *
 * @param {object|null|undefined} postprocessor - Post-processor options.
 * @returns {object|undefined} `undefined` when `postprocessor` is falsy.
 */
function normalizePostProcessorConfig(postprocessor) {
  if (!postprocessor) {
    return void 0;
  }
  const out = {};
  for (const field of ["enabled", "enabledProcessors", "disabledProcessors"]) {
    setIfDefined(out, field, postprocessor[field]);
  }
  return out;
}
441
/**
 * Copies the defined HTML preprocessing options into a fresh object.
 *
 * @param {object|null|undefined} options - HTML preprocessing options.
 * @returns {object|undefined} `undefined` when `options` is falsy.
 */
function normalizeHtmlPreprocessing(options) {
  if (!options) {
    return void 0;
  }
  const out = {};
  for (const field of ["enabled", "preset", "removeNavigation", "removeForms"]) {
    setIfDefined(out, field, options[field]);
  }
  return out;
}
452
/**
 * Copies the defined HTML conversion options into a fresh object and attaches
 * the normalized `preprocessing` sub-config when one is supplied.
 *
 * @param {object|null|undefined} options - HTML conversion options.
 * @returns {object|undefined} `undefined` when `options` is falsy.
 */
function normalizeHtmlOptions(options) {
  if (!options) {
    return void 0;
  }
  // Order matches the sequence in which the options are read and written.
  const fields = [
    "headingStyle",
    "listIndentType",
    "listIndentWidth",
    "bullets",
    "strongEmSymbol",
    "escapeAsterisks",
    "escapeUnderscores",
    "escapeMisc",
    "escapeAscii",
    "codeLanguage",
    "autolinks",
    "defaultTitle",
    "brInTables",
    "hocrSpatialTables",
    "highlightStyle",
    "extractMetadata",
    "whitespaceMode",
    "stripNewlines",
    "wrap",
    "wrapWidth",
    "convertAsInline",
    "subSymbol",
    "supSymbol",
    "newlineStyle",
    "codeBlockStyle",
    "keepInlineImagesIn",
    "encoding",
    "debug",
    "stripTags",
    "preserveTags"
  ];
  const out = {};
  for (const field of fields) {
    setIfDefined(out, field, options[field]);
  }
  setIfDefined(out, "preprocessing", normalizeHtmlPreprocessing(options.preprocessing));
  return out;
}
491
/**
 * Copies the defined keyword-extraction options into a fresh object.
 *
 * @param {object|null|undefined} config - Keyword extraction options.
 * @returns {object|undefined} `undefined` when `config` is falsy.
 */
function normalizeKeywordConfig(config) {
  if (!config) {
    return void 0;
  }
  const out = {};
  const fields = ["algorithm", "maxKeywords", "minScore", "ngramRange", "language", "yakeParams", "rakeParams"];
  for (const field of fields) {
    setIfDefined(out, field, config[field]);
  }
  return out;
}
505
/**
 * Maps the camelCase page options onto the snake_case keys written to the
 * outgoing object.
 *
 * NOTE(review): every other normalizer in this file emits camelCase keys;
 * confirm the native binding really expects snake_case here.
 *
 * @param {object|null|undefined} pages - Page options.
 * @returns {object|undefined} `undefined` when `pages` is falsy.
 */
function normalizePageConfig(pages) {
  if (!pages) {
    return void 0;
  }
  const keyMap = [
    ["extract_pages", "extractPages"],
    ["insert_page_markers", "insertPageMarkers"],
    ["marker_format", "markerFormat"]
  ];
  const out = {};
  for (const [wireKey, sourceKey] of keyMap) {
    setIfDefined(out, wireKey, pages[sourceKey]);
  }
  return out;
}
515
/**
 * Builds the extraction config object handed to the binding: scalar flags
 * are copied when defined, and each nested section goes through its own
 * normalizer and is attached only when that normalizer returns a value.
 *
 * @param {object|null|undefined} config - User-facing extraction config.
 * @returns {object|null} `null` when `config` is falsy.
 */
function normalizeExtractionConfig(config) {
  if (!config) {
    return null;
  }
  const out = {};
  for (const flag of ["useCache", "enableQualityProcessing", "forceOcr", "maxConcurrentExtractions"]) {
    setIfDefined(out, flag, config[flag]);
  }
  // Section name paired with its normalizer, processed in this order.
  const sections = [
    ["ocr", normalizeOcrConfig],
    ["chunking", normalizeChunkingConfig],
    ["images", normalizeImageExtractionConfig],
    ["pdfOptions", normalizePdfConfig],
    ["tokenReduction", normalizeTokenReductionConfig],
    ["languageDetection", normalizeLanguageDetectionConfig],
    ["postprocessor", normalizePostProcessorConfig],
    ["keywords", normalizeKeywordConfig],
    ["pages", normalizePageConfig],
    ["htmlOptions", normalizeHtmlOptions]
  ];
  for (const [section, normalize] of sections) {
    setIfDefined(out, section, normalize(config[section]));
  }
  return out;
}
546
/**
 * Synchronously extracts a file through the native binding.
 *
 * @param {string} filePath - Path passed to the binding.
 * @param {string|null} [mimeType=null] - MIME type passed to the binding.
 * @param {object|null} [config=null] - Extraction config, normalized before the call.
 * @returns {object} Result converted by `convertResult`.
 */
function extractFileSync(filePath, mimeType = null, config = null) {
  const nativeConfig = normalizeExtractionConfig(config);
  return convertResult(getBinding().extractFileSync(filePath, mimeType, nativeConfig));
}
551
/**
 * Asynchronously extracts a file through the native binding.
 *
 * @param {string} filePath - Path passed to the binding.
 * @param {string|null} [mimeType=null] - MIME type passed to the binding.
 * @param {object|null} [config=null] - Extraction config, normalized before the call.
 * @returns {Promise<object>} Result converted by `convertResult`.
 */
async function extractFile(filePath, mimeType = null, config = null) {
  const nativeConfig = normalizeExtractionConfig(config);
  const native = getBinding();
  const raw = await native.extractFile(filePath, mimeType, nativeConfig);
  return convertResult(raw);
}
556
/**
 * Synchronously extracts in-memory bytes through the native binding.
 *
 * @param {Uint8Array} data - Document bytes; copied into a Buffer for the call.
 * @param {string} mimeType - MIME type passed to the binding.
 * @param {object|null} [config=null] - Extraction config, normalized before the call.
 * @returns {object} Result converted by `convertResult`.
 * @throws {TypeError} When `data` is not a Uint8Array.
 */
function extractBytesSync(data, mimeType, config = null) {
  const bytes = assertUint8Array(data, "data");
  const nativeConfig = normalizeExtractionConfig(config);
  const native = getBinding();
  return convertResult(native.extractBytesSync(Buffer.from(bytes), mimeType, nativeConfig));
}
562
/**
 * Asynchronously extracts in-memory bytes through the native binding.
 *
 * When the `KREUZBERG_DEBUG_GUTEN` environment variable equals "1", the
 * first eight input bytes are logged before the call.
 *
 * @param {Uint8Array} data - Document bytes; copied into a Buffer for the call.
 * @param {string} mimeType - MIME type passed to the binding.
 * @param {object|null} [config=null] - Extraction config, normalized before the call.
 * @returns {Promise<object>} Result converted by `convertResult`.
 * @throws {TypeError} When `data` is not a Uint8Array.
 */
async function extractBytes(data, mimeType, config = null) {
  const bytes = assertUint8Array(data, "data");
  const debugEnabled = process.env["KREUZBERG_DEBUG_GUTEN"] === "1";
  if (debugEnabled) {
    console.log("[TypeScript] Debug input header:", Array.from(bytes.slice(0, 8)));
  }
  const nativeConfig = normalizeExtractionConfig(config);
  const native = getBinding();
  const raw = await native.extractBytes(Buffer.from(bytes), mimeType, nativeConfig);
  return convertResult(raw);
}
571
/**
 * Synchronously extracts several files through the native binding.
 *
 * @param {string[]} paths - Paths passed to the binding.
 * @param {object|null} [config=null] - Extraction config, normalized before the call.
 * @returns {object[]} One converted result per raw result returned.
 */
function batchExtractFilesSync(paths, config = null) {
  const nativeConfig = normalizeExtractionConfig(config);
  const rawList = getBinding().batchExtractFilesSync(paths, nativeConfig);
  return rawList.map((raw) => convertResult(raw));
}
576
/**
 * Asynchronously extracts several files through the native binding.
 *
 * @param {string[]} paths - Paths passed to the binding.
 * @param {object|null} [config=null] - Extraction config, normalized before the call.
 * @returns {Promise<object[]>} One converted result per raw result returned.
 */
async function batchExtractFiles(paths, config = null) {
  const nativeConfig = normalizeExtractionConfig(config);
  const native = getBinding();
  const rawList = await native.batchExtractFiles(paths, nativeConfig);
  return rawList.map((raw) => convertResult(raw));
}
581
/**
 * Synchronously extracts several in-memory documents through the native binding.
 *
 * @param {Uint8Array[]} dataList - Document payloads; each is copied into a Buffer.
 * @param {string[]} mimeTypes - MIME types; must have the same length as `dataList`.
 * @param {object|null} [config=null] - Extraction config, normalized before the call.
 * @returns {object[]} One converted result per raw result returned.
 * @throws {TypeError} When `dataList` is invalid or the lengths differ.
 */
function batchExtractBytesSync(dataList, mimeTypes, config = null) {
  const payloads = assertUint8ArrayList(dataList, "dataList");
  const buffers = payloads.map((data) => Buffer.from(data));
  if (buffers.length !== mimeTypes.length) {
    throw new TypeError("dataList and mimeTypes must have the same length");
  }
  const nativeConfig = normalizeExtractionConfig(config);
  const rawList = getBinding().batchExtractBytesSync(buffers, mimeTypes, nativeConfig);
  return rawList.map((raw) => convertResult(raw));
}
590
/**
 * Asynchronously extracts several in-memory documents through the native binding.
 *
 * @param {Uint8Array[]} dataList - Document payloads; each is copied into a Buffer.
 * @param {string[]} mimeTypes - MIME types; must have the same length as `dataList`.
 * @param {object|null} [config=null] - Extraction config, normalized before the call.
 * @returns {Promise<object[]>} One converted result per raw result returned.
 * @throws {TypeError} When `dataList` is invalid or the lengths differ.
 */
async function batchExtractBytes(dataList, mimeTypes, config = null) {
  const payloads = assertUint8ArrayList(dataList, "dataList");
  const buffers = payloads.map((data) => Buffer.from(data));
  if (buffers.length !== mimeTypes.length) {
    throw new TypeError("dataList and mimeTypes must have the same length");
  }
  const nativeConfig = normalizeExtractionConfig(config);
  const native = getBinding();
  const rawList = await native.batchExtractBytes(buffers, mimeTypes, nativeConfig);
  return rawList.map((raw) => convertResult(raw));
}
599
/**
 * Registers a post-processor with the native binding.
 *
 * The processor is wrapped so that the JSON payload delivered by the binding
 * (read from index 0 of the first argument) is decoded into a camelCase
 * result, passed to `processor.process`, and the returned result is encoded
 * back into snake_case JSON. The original processor and its stage (default
 * "middle") are attached as non-enumerable `__original` / `__stage`.
 *
 * @param {object} processor - Object with `name()`, `process(result)` and an
 *   optional `processingStage()`.
 */
function registerPostProcessor(processor) {
  const native = getBinding();

  // Decodes the snake_case wire payload into the result passed to the processor.
  const decode = (wire) => {
    const rawMetadata = wire.metadata;
    return {
      content: wire.content,
      mimeType: wire.mime_type,
      metadata: typeof rawMetadata === "string" ? JSON.parse(rawMetadata) : rawMetadata,
      tables: wire.tables || [],
      detectedLanguages: wire.detected_languages ?? null,
      chunks: wire.chunks ?? null,
      images: wire.images ?? null
    };
  };

  // Encodes the processor's output back into the snake_case wire shape.
  const encode = (updated) => ({
    content: updated.content,
    mime_type: updated.mimeType,
    metadata: updated.metadata,
    tables: updated.tables,
    detected_languages: updated.detectedLanguages,
    chunks: updated.chunks,
    images: updated.images
  });

  const wrappedProcessor = {
    name: processor.name.bind(processor),
    processingStage: processor.processingStage?.bind(processor),
    async process(...args) {
      const envelope = args[0];
      const wire = JSON.parse(envelope[0]);
      const updated = await processor.process(decode(wire));
      return JSON.stringify(encode(updated));
    }
  };
  Object.defineProperty(wrappedProcessor, "__original", {
    value: processor,
    enumerable: false
  });
  Object.defineProperty(wrappedProcessor, "__stage", {
    value: processor.processingStage?.() ?? "middle",
    enumerable: false
  });
  native.registerPostProcessor(wrappedProcessor);
}
641
/**
 * Asks the native binding to unregister the post-processor called `name`.
 *
 * @param {string} name - Post-processor name.
 */
function unregisterPostProcessor(name) {
  getBinding().unregisterPostProcessor(name);
}
645
/** Asks the native binding to clear its post-processors. */
function clearPostProcessors() {
  getBinding().clearPostProcessors();
}
649
/**
 * Returns whatever the native binding reports from `listPostProcessors()`.
 *
 * @returns {*} The binding's return value, passed through unchanged.
 */
function listPostProcessors() {
  return getBinding().listPostProcessors();
}
653
/**
 * Registers a validator with the native binding.
 *
 * The validator is wrapped so that the JSON payload delivered by the binding
 * is decoded into a camelCase result before `validator.validate` runs.
 * Absent `detectedLanguages`, `chunks` and `images` are normalized to `null`,
 * matching the shape produced by `registerPostProcessor` and `convertResult`.
 *
 * @param {object} validator - Object with `name()`, `validate(result)` and an
 *   optional `priority()`.
 */
function registerValidator(validator) {
  const native = getBinding();
  const wrappedValidator = {
    name: validator.name.bind(validator),
    priority: validator.priority?.bind(validator),
    /**
     * @param {...*} args - First argument is the JSON-encoded result.
     * @returns {Promise<string>} Always the empty string on success.
     * @throws {Error} When no usable JSON string was delivered, or when the
     *   wrapped validator rejects the result.
     */
    async validate(...args) {
      const jsonString = args[0];
      // The literal text "undefined" is rejected along with empty input.
      if (!jsonString || jsonString === "undefined") {
        throw new Error("Validator received invalid JSON string");
      }
      const wire = JSON.parse(jsonString);
      const rawMetadata = wire.metadata;
      const result = {
        content: wire.content,
        mimeType: wire.mime_type,
        metadata: typeof rawMetadata === "string" ? JSON.parse(rawMetadata) : rawMetadata,
        tables: wire.tables || [],
        detectedLanguages: wire.detected_languages ?? null,
        chunks: wire.chunks ?? null,
        images: wire.images ?? null
      };
      await Promise.resolve(validator.validate(result));
      return "";
    }
  };
  native.registerValidator(wrappedValidator);
}
679
/**
 * Asks the native binding to unregister the validator called `name`.
 *
 * @param {string} name - Validator name.
 */
function unregisterValidator(name) {
  getBinding().unregisterValidator(name);
}
683
/** Asks the native binding to clear its validators. */
function clearValidators() {
  getBinding().clearValidators();
}
687
/**
 * Returns whatever the native binding reports from `listValidators()`.
 *
 * @returns {*} The binding's return value, passed through unchanged.
 */
function listValidators() {
  return getBinding().listValidators();
}
691
/**
 * Tells whether `value` is a two-element array whose first element is a
 * string, Buffer or Uint8Array and whose second element is a string.
 *
 * @param {unknown} value - Candidate payload.
 * @returns {boolean}
 */
function isOcrProcessTuple(value) {
  if (!Array.isArray(value) || value.length !== 2) {
    return false;
  }
  if (typeof value[1] !== "string") {
    return false;
  }
  const payload = value[0];
  return typeof payload === "string" || Buffer.isBuffer(payload) || payload instanceof Uint8Array;
}
694
/**
 * Tells whether `value` is a one-element array wrapping an OCR process tuple.
 *
 * @param {unknown} value - Candidate payload.
 * @returns {boolean}
 */
function isNestedOcrProcessTuple(value) {
  if (!Array.isArray(value) || value.length !== 1) {
    return false;
  }
  return isOcrProcessTuple(value[0]);
}
697
/**
 * Summarizes a payload for debug logging.
 *
 * @param {string|{length: number}} value - String or binary-like payload.
 * @returns {{ctor: string, length: number}} "String" for strings; otherwise
 *   the constructor name, falling back to "Buffer" when none is available.
 */
function describePayload(value) {
  const { length } = value;
  const ctor = typeof value === "string" ? "String" : value.constructor?.name ?? "Buffer";
  return { ctor, length };
}
703
/**
 * Registers an OCR backend with the native binding.
 *
 * The wrapped `processImage` accepts the image either as separate
 * `(payload, language)` arguments, as a `[payload, language]` tuple, or as a
 * tuple nested in a one-element array. String payloads are decoded as base64.
 * The backend receives a Uint8Array plus the language, and its result is
 * returned JSON-encoded. Diagnostic logging is enabled when the
 * `KREUZBERG_DEBUG_GUTEN` environment variable equals "1".
 *
 * @param {object} backend - Object with `name()`, `supportedLanguages()` and
 *   `processImage(bytes, language)`.
 */
function registerOcrBackend(backend) {
  const native = getBinding();
  const debugEnabled = () => process.env["KREUZBERG_DEBUG_GUTEN"] === "1";
  const wrappedBackend = {
    name: backend.name.bind(backend),
    supportedLanguages: backend.supportedLanguages.bind(backend),
    async processImage(...processArgs) {
      const imagePayload = processArgs[0];
      const maybeLanguage = processArgs[1];
      const payloadIsArray = Array.isArray(imagePayload);
      if (debugEnabled()) {
        console.log("[registerOcrBackend] JS arguments", { length: processArgs.length });
        console.log("[registerOcrBackend] Raw args", {
          imagePayloadType: payloadIsArray ? "tuple" : typeof imagePayload,
          maybeLanguageType: typeof maybeLanguage,
          metadata: payloadIsArray ? { tupleLength: imagePayload.length } : describePayload(imagePayload)
        });
      }

      // Unwrap whichever calling convention was used.
      let rawBytes = imagePayload;
      let language = maybeLanguage;
      let tuple = null;
      if (isNestedOcrProcessTuple(imagePayload)) {
        tuple = imagePayload[0];
      } else if (isOcrProcessTuple(imagePayload)) {
        tuple = imagePayload;
      }
      if (tuple !== null) {
        rawBytes = tuple[0];
        language = tuple[1];
      }
      if (typeof language !== "string") {
        throw new Error("OCR backend did not receive a language parameter");
      }

      if (debugEnabled()) {
        const length = rawBytes.length;
        console.log(
          "[registerOcrBackend] Received payload",
          payloadIsArray ? "tuple" : typeof rawBytes,
          "ctor",
          describePayload(rawBytes).ctor,
          "length",
          length
        );
      }

      let buffer;
      if (typeof rawBytes === "string") {
        buffer = Buffer.from(rawBytes, "base64");
      } else {
        buffer = Buffer.from(rawBytes);
      }
      const result = await backend.processImage(new Uint8Array(buffer), language);
      return JSON.stringify(result);
    }
  };
  native.registerOcrBackend(wrappedBackend);
}
748
/**
 * Return the OCR backend listing reported by the native binding.
 *
 * @returns Whatever `binding.listOcrBackends()` returns, unmodified
 */
function listOcrBackends() {
  return getBinding().listOcrBackends();
}
752
/**
 * Ask the native binding to drop the OCR backend registered under `name`.
 *
 * @param name - Name of the OCR backend to unregister
 */
function unregisterOcrBackend(name) {
  getBinding().unregisterOcrBackend(name);
}
756
/**
 * Forward a "clear all OCR backends" request to the native binding.
 */
function clearOcrBackends() {
  getBinding().clearOcrBackends();
}
760
/**
 * Return the document extractor listing reported by the native binding.
 *
 * @returns Whatever `binding.listDocumentExtractors()` returns, unmodified
 */
function listDocumentExtractors() {
  return getBinding().listDocumentExtractors();
}
764
/**
 * Ask the native binding to drop the document extractor registered under `name`.
 *
 * @param name - Name of the document extractor to unregister
 */
function unregisterDocumentExtractor(name) {
  getBinding().unregisterDocumentExtractor(name);
}
768
/**
 * Forward a "clear all document extractors" request to the native binding.
 */
function clearDocumentExtractors() {
  getBinding().clearDocumentExtractors();
}
772
const ExtractionConfig = {
  /**
   * Read an extraction configuration from disk.
   *
   * The format is chosen from the file extension:
   * - `.toml` - TOML format
   * - `.yaml` - YAML format
   * - `.json` - JSON format
   *
   * The work is delegated to the native binding's
   * `loadExtractionConfigFromFile`.
   *
   * @param filePath - Path to the configuration file (absolute or relative)
   * @returns ExtractionConfig object loaded from the file
   *
   * @throws {Error} If file does not exist or is not accessible
   * @throws {Error} If file content is not valid TOML/YAML/JSON
   * @throws {Error} If configuration structure is invalid
   * @throws {Error} If file extension is not supported
   *
   * @example
   * ```typescript
   * import { ExtractionConfig } from '@kreuzberg/node';
   *
   * const fromToml = ExtractionConfig.fromFile('kreuzberg.toml');
   * const fromYaml = ExtractionConfig.fromFile('./config.yaml');
   * const fromJson = ExtractionConfig.fromFile('./config.json');
   * ```
   */
  fromFile(filePath) {
    return getBinding().loadExtractionConfigFromFile(filePath);
  },
  /**
   * Locate and load a configuration file without being given a path.
   *
   * Looks for a `kreuzberg.toml` file, starting in the current working
   * directory and walking up through parent directories; the first file
   * found wins. The work is delegated to the native binding's
   * `discoverExtractionConfig`.
   *
   * @returns ExtractionConfig object if found, or null if no configuration file exists
   *
   * @example
   * ```typescript
   * import { ExtractionConfig } from '@kreuzberg/node';
   *
   * const config = ExtractionConfig.discover();
   * if (config) {
   *   console.log('Found configuration');
   * } else {
   *   console.log('No configuration file found, using defaults');
   * }
   * ```
   */
  discover() {
    return getBinding().discoverExtractionConfig();
  }
};
834
/**
 * Detect a MIME type from raw content by delegating to the native binding's
 * `detectMimeTypeFromBytes`.
 *
 * @param bytes - Content to inspect, passed to the binding as-is
 * @returns The binding's result, unmodified
 */
function detectMimeType(bytes) {
  return getBinding().detectMimeTypeFromBytes(bytes);
}
838
/**
 * Detect a MIME type for a file path by delegating to the native binding's
 * `detectMimeTypeFromPath`.
 *
 * @param path - File path, passed to the binding as-is
 * @returns The binding's result, unmodified
 */
function detectMimeTypeFromPath(path) {
  return getBinding().detectMimeTypeFromPath(path);
}
842
/**
 * Validate a MIME type string by delegating to the native binding's
 * `validateMimeType`.
 *
 * @param mimeType - MIME type to validate, passed to the binding as-is
 * @returns The binding's result, unmodified
 */
function validateMimeType(mimeType) {
  return getBinding().validateMimeType(mimeType);
}
846
/**
 * Look up file extensions for a MIME type by delegating to the native
 * binding's `getExtensionsForMime`.
 *
 * @param mimeType - MIME type to look up, passed to the binding as-is
 * @returns The binding's result, unmodified
 */
function getExtensionsForMime(mimeType) {
  return getBinding().getExtensionsForMime(mimeType);
}
850
/**
 * Return the embedding preset listing reported by the native binding.
 *
 * @returns Whatever `binding.listEmbeddingPresets()` returns, unmodified
 */
function listEmbeddingPresets() {
  return getBinding().listEmbeddingPresets();
}
854
/**
 * Fetch a single embedding preset by delegating to the native binding's
 * `getEmbeddingPreset`.
 *
 * @param name - Preset name, passed to the binding as-is
 * @returns The binding's result, unmodified
 */
function getEmbeddingPreset(name) {
  return getBinding().getEmbeddingPreset(name);
}
859
/**
 * Return the last error code as reported by the native binding.
 *
 * @returns Whatever `binding.getLastErrorCode()` returns, unmodified
 */
function getLastErrorCode() {
  return getBinding().getLastErrorCode();
}
863
/**
 * Return the last panic context as reported by the native binding.
 *
 * @returns Whatever `binding.getLastPanicContext()` returns, unmodified
 */
function getLastPanicContext() {
  return getBinding().getLastPanicContext();
}
868
/** Version string of this package build; listed among the module's exports. */
const __version__ = "4.0.0-rc.10";
869
+ // Annotate the CommonJS export names for ESM import in node:
870
+ 0 && (module.exports = {
871
+ CacheError,
872
+ ErrorCode,
873
+ ExtractionConfig,
874
+ GutenOcrBackend,
875
+ ImageProcessingError,
876
+ KreuzbergError,
877
+ MissingDependencyError,
878
+ OcrError,
879
+ ParsingError,
880
+ PluginError,
881
+ ValidationError,
882
+ __resetBindingForTests,
883
+ __setBindingForTests,
884
+ __version__,
885
+ batchExtractBytes,
886
+ batchExtractBytesSync,
887
+ batchExtractFiles,
888
+ batchExtractFilesSync,
889
+ clearDocumentExtractors,
890
+ clearOcrBackends,
891
+ clearPostProcessors,
892
+ clearValidators,
893
+ detectMimeType,
894
+ detectMimeTypeFromPath,
895
+ extractBytes,
896
+ extractBytesSync,
897
+ extractFile,
898
+ extractFileSync,
899
+ getEmbeddingPreset,
900
+ getExtensionsForMime,
901
+ getLastErrorCode,
902
+ getLastPanicContext,
903
+ listDocumentExtractors,
904
+ listEmbeddingPresets,
905
+ listOcrBackends,
906
+ listPostProcessors,
907
+ listValidators,
908
+ registerOcrBackend,
909
+ registerPostProcessor,
910
+ registerValidator,
911
+ unregisterDocumentExtractor,
912
+ unregisterOcrBackend,
913
+ unregisterPostProcessor,
914
+ unregisterValidator,
915
+ validateMimeType,
916
+ ...require("./types.js")
917
+ });
918
+ //# sourceMappingURL=index.js.map