@kreuzberg/node 4.0.0-rc.6 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,1044 @@
1
+ "use strict";
2
// Interop helpers that build the CommonJS export object out of lazy getters.
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;

/** Defines every entry of `all` on `target` as an enumerable getter. */
var __export = (target, all) => {
  for (var name in all) {
    __defProp(target, name, { get: all[name], enumerable: true });
  }
};

/**
 * Mirrors the own properties of `from` onto `to` as getters, skipping the key
 * named by `except` and any key `to` already owns. Returns `to`.
 */
var __copyProps = (to, from, except, desc) => {
  const copyable = (from && typeof from === "object") || typeof from === "function";
  if (!copyable) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (key === except || __hasOwnProp.call(to, key)) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, { get: () => from[key], enumerable: !desc || desc.enumerable });
  }
  return to;
};

/** Copies everything except `default` from `mod` onto `target` and, when given, `secondTarget`. */
var __reExport = (target, mod, secondTarget) => {
  __copyProps(target, mod, "default");
  return secondTarget && __copyProps(secondTarget, mod, "default");
};

/** Produces a fresh object flagged `__esModule` that mirrors `mod`. */
var __toCommonJS = (mod) => {
  const flagged = __defProp({}, "__esModule", { value: true });
  return __copyProps(flagged, mod);
};
20
+ var index_exports = {};
21
+ __export(index_exports, {
22
+ CacheError: () => import_errors.CacheError,
23
+ ErrorCode: () => import_errors.ErrorCode,
24
+ ExtractionConfig: () => ExtractionConfig,
25
+ GutenOcrBackend: () => import_guten_ocr.GutenOcrBackend,
26
+ ImageProcessingError: () => import_errors.ImageProcessingError,
27
+ KreuzbergError: () => import_errors.KreuzbergError,
28
+ MissingDependencyError: () => import_errors.MissingDependencyError,
29
+ OcrError: () => import_errors.OcrError,
30
+ ParsingError: () => import_errors.ParsingError,
31
+ PluginError: () => import_errors.PluginError,
32
+ ValidationError: () => import_errors.ValidationError,
33
+ __resetBindingForTests: () => __resetBindingForTests,
34
+ __setBindingForTests: () => __setBindingForTests,
35
+ __version__: () => __version__,
36
+ batchExtractBytes: () => batchExtractBytes,
37
+ batchExtractBytesSync: () => batchExtractBytesSync,
38
+ batchExtractFiles: () => batchExtractFiles,
39
+ batchExtractFilesInWorker: () => batchExtractFilesInWorker,
40
+ batchExtractFilesSync: () => batchExtractFilesSync,
41
+ classifyError: () => classifyError,
42
+ clearDocumentExtractors: () => clearDocumentExtractors,
43
+ clearOcrBackends: () => clearOcrBackends,
44
+ clearPostProcessors: () => clearPostProcessors,
45
+ clearValidators: () => clearValidators,
46
+ closeWorkerPool: () => closeWorkerPool,
47
+ createWorkerPool: () => createWorkerPool,
48
+ detectMimeType: () => detectMimeType,
49
+ detectMimeTypeFromPath: () => detectMimeTypeFromPath,
50
+ extractBytes: () => extractBytes,
51
+ extractBytesSync: () => extractBytesSync,
52
+ extractFile: () => extractFile,
53
+ extractFileInWorker: () => extractFileInWorker,
54
+ extractFileSync: () => extractFileSync,
55
+ getEmbeddingPreset: () => getEmbeddingPreset,
56
+ getErrorCodeDescription: () => getErrorCodeDescription,
57
+ getErrorCodeName: () => getErrorCodeName,
58
+ getExtensionsForMime: () => getExtensionsForMime,
59
+ getLastErrorCode: () => getLastErrorCode,
60
+ getLastPanicContext: () => getLastPanicContext,
61
+ getWorkerPoolStats: () => getWorkerPoolStats,
62
+ listDocumentExtractors: () => listDocumentExtractors,
63
+ listEmbeddingPresets: () => listEmbeddingPresets,
64
+ listOcrBackends: () => listOcrBackends,
65
+ listPostProcessors: () => listPostProcessors,
66
+ listValidators: () => listValidators,
67
+ registerOcrBackend: () => registerOcrBackend,
68
+ registerPostProcessor: () => registerPostProcessor,
69
+ registerValidator: () => registerValidator,
70
+ unregisterDocumentExtractor: () => unregisterDocumentExtractor,
71
+ unregisterOcrBackend: () => unregisterOcrBackend,
72
+ unregisterPostProcessor: () => unregisterPostProcessor,
73
+ unregisterValidator: () => unregisterValidator,
74
+ validateMimeType: () => validateMimeType
75
+ });
76
+ module.exports = __toCommonJS(index_exports);
77
+ var import_node_fs = require("node:fs");
78
+ var import_node_module = require("node:module");
79
+ var import_errors = require("./errors.js");
80
+ var import_guten_ocr = require("./ocr/guten-ocr.js");
81
+ __reExport(index_exports, require("./types.js"), module.exports);
82
+ const import_meta = {};
83
+ let binding = null;
84
+ let bindingInitialized = false;
85
/**
 * Wraps whatever was thrown while loading the native module in a descriptive Error.
 *
 * @param {unknown} error - The value thrown by the loader.
 * @returns {Error} Error whose message embeds the underlying detail. When `error`
 *   is an Error instance it is attached as `cause`, and a Pdfium hint is added
 *   if its text mentions "pdfium".
 */
function createNativeBindingError(error) {
  const headline = "Failed to load Kreuzberg native bindings.";
  const reportNote = "Report this error and attach the logs/stack trace for investigation.";

  if (!(error instanceof Error)) {
    return new Error([headline, reportNote, `Underlying error: ${String(error)}`].join(" "));
  }

  const detail = error.message || error.toString();
  const segments = [headline];
  if (/pdfium/i.test(detail)) {
    segments.push(
      "Pdfium runtime library was not found. Ensure the bundled libpdfium (dll/dylib/so) is present next to the native module."
    );
  }
  segments.push(reportNote, `Underlying error: ${detail}`);
  return new Error(segments.join(" "), { cause: error });
}
113
/**
 * Returns `value` unchanged when it is a Uint8Array.
 *
 * @param {unknown} value - Candidate value.
 * @param {string} name - Label used in the error message.
 * @throws {TypeError} When `value` is not a Uint8Array.
 */
function assertUint8Array(value, name) {
  if (value instanceof Uint8Array) {
    return value;
  }
  throw new TypeError(`${name} must be a Uint8Array`);
}
119
/**
 * Validates that `values` is an array whose entries are all Uint8Array.
 *
 * @param {unknown} values - Candidate list.
 * @param {string} name - Label used in error messages; entries are reported as `name[index]`.
 * @returns {Uint8Array[]} A new array holding the same entries.
 * @throws {TypeError} When `values` is not an array or an entry is not a Uint8Array.
 */
function assertUint8ArrayList(values, name) {
  if (!Array.isArray(values)) {
    throw new TypeError(`${name} must be an array of Uint8Array`);
  }
  // The element check already raises the per-index TypeError this function reports.
  return values.map((entry, position) => assertUint8Array(entry, `${name}[${position}]`));
}
132
/**
 * Test hook: installs `mock` as the cached binding and marks loading as done,
 * so later lookups do not attempt to load the native module.
 *
 * @param {object} mock - Replacement binding object.
 */
function __setBindingForTests(mock) {
  [binding, bindingInitialized] = [mock, true];
}
136
/**
 * Test hook: clears the cached binding and the initialized flag so the next
 * lookup goes through the loader again.
 */
function __resetBindingForTests() {
  [binding, bindingInitialized] = [null, false];
}
140
/**
 * Requires the native module at "../index.js" and checks that it exposes every
 * extraction entry point this wrapper relies on.
 *
 * @returns {object} The loaded native module.
 * @throws {Error} When no `require` function can be obtained, the loaded value
 *   is not an object, or a required method is missing.
 */
function loadNativeBinding() {
  // Prefer the ambient `require`; otherwise try to build one with createRequire.
  const resolveRequire = () => {
    if (typeof require !== "undefined") {
      return require;
    }
    try {
      const { createRequire } = import_node_module;
      return createRequire(import_meta.url);
    } catch {
      return void 0;
    }
  };

  const localRequire = resolveRequire();
  if (!localRequire) {
    throw new Error("Unable to resolve native binding loader (require not available).");
  }

  const nativeModule = localRequire("../index.js");
  if (nativeModule === null || typeof nativeModule !== "object") {
    throw new Error(
      "Native binding is not a valid object. Ensure the native module is properly built and compatible."
    );
  }

  const missing = [
    "extractFileSync",
    "extractFile",
    "extractBytesSync",
    "extractBytes",
    "batchExtractFilesSync",
    "batchExtractFiles",
    "batchExtractBytesSync",
    "batchExtractBytes"
  ].find((method) => typeof nativeModule[method] !== "function");
  if (missing !== void 0) {
    throw new Error(
      `Native binding is missing required method: ${missing}. Ensure the native module is properly built and compatible.`
    );
  }
  return nativeModule;
}
180
/**
 * Returns the native binding, loading it on first use and caching the outcome.
 *
 * A failed load is remembered too: later calls throw without retrying.
 *
 * @returns {object} The cached or freshly loaded binding.
 * @throws {Error} When loading fails, failed earlier, or the runtime does not
 *   report a Node.js version.
 */
function getBinding() {
  if (bindingInitialized) {
    if (binding !== null) {
      return binding;
    }
    throw new Error("Native binding was previously failed to load.");
  }

  const runningOnNode = typeof process !== "undefined" && process.versions && process.versions.node;
  if (!runningOnNode) {
    throw new Error(
      "Failed to load Kreuzberg bindings. Neither NAPI (Node.js) nor WASM (browsers/Deno) bindings are available. Make sure you have installed the @kreuzberg/node package for Node.js/Bun."
    );
  }

  try {
    binding = loadNativeBinding();
    bindingInitialized = true;
    return binding;
  } catch (error) {
    // Mark as initialized so the failing load is not attempted again.
    bindingInitialized = true;
    throw createNativeBindingError(error);
  }
}
201
/**
 * Parses a JSON metadata string.
 *
 * @param {string} metadataStr - JSON text.
 * @returns {object} The parsed value when it is a non-null object, otherwise an
 *   empty object (including when the text is not valid JSON).
 */
function parseMetadata(metadataStr) {
  let decoded;
  try {
    decoded = JSON.parse(metadataStr);
  } catch {
    return {};
  }
  return decoded !== null && typeof decoded === "object" ? decoded : {};
}
212
/**
 * Coerces binary-like input to a Uint8Array.
 *
 * @param {unknown} value - Uint8Array (Buffer included), another ArrayBuffer
 *   view, an ArrayBuffer, or an array of byte values.
 * @returns {Uint8Array} The input itself when already a Uint8Array; a view over
 *   the same memory for other views and ArrayBuffers; a copy for plain arrays;
 *   an empty Uint8Array for anything else.
 */
function ensureUint8Array(value) {
  // Buffer extends Uint8Array, so this branch covers Buffers as well.
  if (value instanceof Uint8Array) {
    return value;
  }
  // Other typed arrays / DataView: expose exactly the bytes the view covers.
  if (ArrayBuffer.isView(value)) {
    return new Uint8Array(value.buffer, value.byteOffset, value.byteLength);
  }
  if (value instanceof ArrayBuffer) {
    return new Uint8Array(value);
  }
  if (Array.isArray(value)) {
    return new Uint8Array(value);
  }
  return new Uint8Array();
}
224
/**
 * Normalizes a raw chunk into `{ content, embedding, metadata }`.
 *
 * Metadata fields are read under their snake_case name first and then their
 * camelCase name. The byte offsets additionally fall back to the older
 * `charStart` / `charEnd` keys.
 *
 * @param {unknown} rawChunk - Chunk as received from the binding.
 * @returns {object} Normalized chunk; a zeroed placeholder when `rawChunk` is
 *   not an object.
 */
function convertChunk(rawChunk) {
  if (!rawChunk || typeof rawChunk !== "object") {
    return {
      content: "",
      metadata: {
        byteStart: 0,
        byteEnd: 0,
        tokenCount: null,
        chunkIndex: 0,
        totalChunks: 0
      },
      embedding: null
    };
  }
  const chunk = rawChunk;
  const metadata = chunk["metadata"] ?? {};
  return {
    content: chunk["content"] ?? "",
    embedding: chunk["embedding"] ?? null,
    metadata: {
      // camelCase byteStart/byteEnd are accepted like every other field here;
      // without them a camelCase payload would silently report offset 0.
      byteStart: metadata["byte_start"] ?? metadata["byteStart"] ?? metadata["charStart"] ?? 0,
      byteEnd: metadata["byte_end"] ?? metadata["byteEnd"] ?? metadata["charEnd"] ?? 0,
      tokenCount: metadata["token_count"] ?? metadata["tokenCount"] ?? null,
      chunkIndex: metadata["chunk_index"] ?? metadata["chunkIndex"] ?? 0,
      totalChunks: metadata["total_chunks"] ?? metadata["totalChunks"] ?? 0,
      firstPage: metadata["first_page"] ?? metadata["firstPage"] ?? null,
      lastPage: metadata["last_page"] ?? metadata["lastPage"] ?? null
    }
  };
}
263
/**
 * Normalizes a raw extracted image, filling absent fields with defaults.
 *
 * @param {unknown} rawImage - Image record as received from the binding.
 * @returns {object} Normalized image. `data` is coerced with `ensureUint8Array`
 *   and a truthy `ocrResult` is converted with `convertResult`. A placeholder
 *   with format "unknown" is returned when `rawImage` is not an object.
 */
function convertImage(rawImage) {
  if (!rawImage || typeof rawImage !== "object") {
    return {
      data: new Uint8Array(),
      format: "unknown",
      imageIndex: 0,
      pageNumber: null,
      width: null,
      height: null,
      colorspace: null,
      bitsPerComponent: null,
      isMask: false,
      description: null,
      ocrResult: null
    };
  }
  const {
    data,
    format,
    imageIndex,
    pageNumber,
    width,
    height,
    colorspace,
    bitsPerComponent,
    isMask,
    description,
    ocrResult
  } = rawImage;
  return {
    data: ensureUint8Array(data),
    format: format ?? "unknown",
    imageIndex: imageIndex ?? 0,
    pageNumber: pageNumber ?? null,
    width: width ?? null,
    height: height ?? null,
    colorspace: colorspace ?? null,
    bitsPerComponent: bitsPerComponent ?? null,
    isMask: isMask ?? false,
    description: description ?? null,
    ocrResult: ocrResult ? convertResult(ocrResult) : null
  };
}
305
/**
 * Normalizes a raw per-page record.
 *
 * @param {unknown} rawPage - Page record as received from the binding.
 * @returns {object} `{ pageNumber, content, tables, images }`. Non-array
 *   `tables`/`images` become empty arrays and each image goes through
 *   `convertImage`. An empty page is returned when `rawPage` is not an object.
 */
function convertPageContent(rawPage) {
  if (!rawPage || typeof rawPage !== "object") {
    return { pageNumber: 0, content: "", tables: [], images: [] };
  }
  const { pageNumber, content, tables, images } = rawPage;
  return {
    pageNumber: pageNumber ?? 0,
    content: content ?? "",
    tables: Array.isArray(tables) ? tables : [],
    images: Array.isArray(images) ? images.map((entry) => convertImage(entry)) : []
  };
}
326
/**
 * Normalizes a raw extraction result.
 *
 * @param {unknown} rawResult - Result as received from the binding.
 * @returns {object} Result with defaults applied. String metadata is parsed with
 *   `parseMetadata`; `chunks`, `images` and `pages` are converted element by
 *   element when they are arrays and are `null` otherwise. An empty
 *   "application/octet-stream" result is returned when `rawResult` is not an object.
 */
function convertResult(rawResult) {
  if (!rawResult || typeof rawResult !== "object") {
    return {
      content: "",
      mimeType: "application/octet-stream",
      metadata: {},
      tables: [],
      detectedLanguages: null,
      chunks: null,
      images: null,
      pages: null
    };
  }

  const convertList = (items, convertOne) => (Array.isArray(items) ? items.map((item) => convertOne(item)) : null);
  const rawMetadata = rawResult["metadata"];
  const rawTables = rawResult["tables"];
  const rawLanguages = rawResult["detectedLanguages"];

  return {
    content: rawResult["content"] ?? "",
    mimeType: rawResult["mimeType"] ?? "application/octet-stream",
    metadata: typeof rawMetadata === "string" ? parseMetadata(rawMetadata) : rawMetadata ?? {},
    tables: Array.isArray(rawTables) ? rawTables : [],
    detectedLanguages: Array.isArray(rawLanguages) ? rawLanguages : null,
    chunks: convertList(rawResult["chunks"], (chunk) => convertChunk(chunk)),
    images: convertList(rawResult["images"], (image) => convertImage(image)),
    pages: convertList(rawResult["pages"], (page) => convertPageContent(page))
  };
}
370
/**
 * Assigns `value` to `target[key]` unless the value is `undefined`.
 *
 * @param {object} target - Object to write to.
 * @param {string} key - Property name.
 * @param {unknown} value - Value to store; `null` and other falsy values are kept.
 */
function setIfDefined(target, key, value) {
  if (value === void 0) {
    return;
  }
  target[key] = value;
}
375
/**
 * Copies the Tesseract options this wrapper forwards (`psm`,
 * `enableTableDetection`, `tesseditCharWhitelist`) onto a fresh object,
 * leaving out any that are undefined.
 *
 * @param {object} [config] - Tesseract options.
 * @returns {object|undefined} The copy, or `undefined` when `config` is falsy.
 */
function normalizeTesseractConfig(config) {
  if (!config) {
    return void 0;
  }
  const normalized = {};
  for (const key of ["psm", "enableTableDetection", "tesseditCharWhitelist"]) {
    setIfDefined(normalized, key, config[key]);
  }
  return normalized;
}
385
/**
 * Builds the OCR section passed to the binding.
 *
 * `backend` is always present on the result (even when undefined); `language`
 * and the normalized `tesseractConfig` are added only when available.
 *
 * @param {object} [ocr] - OCR options.
 * @returns {object|undefined} The normalized section, or `undefined` when `ocr` is falsy.
 */
function normalizeOcrConfig(ocr) {
  if (!ocr) {
    return void 0;
  }
  const normalized = { backend: ocr.backend };
  setIfDefined(normalized, "language", ocr.language);
  const tesseractSection = normalizeTesseractConfig(ocr.tesseractConfig);
  if (!tesseractSection) {
    return normalized;
  }
  setIfDefined(normalized, "tesseractConfig", tesseractSection);
  return normalized;
}
399
/**
 * Copies the chunking options (`maxChars`, `maxOverlap`, `preset`,
 * `embedding`, `enabled`) onto a fresh object, leaving out undefined ones.
 *
 * @param {object} [chunking] - Chunking options.
 * @returns {object|undefined} The copy, or `undefined` when `chunking` is falsy.
 */
function normalizeChunkingConfig(chunking) {
  if (!chunking) {
    return void 0;
  }
  const normalized = {};
  for (const key of ["maxChars", "maxOverlap", "preset", "embedding", "enabled"]) {
    setIfDefined(normalized, key, chunking[key]);
  }
  return normalized;
}
411
/**
 * Copies the image-extraction options (`extractImages`, `targetDpi`,
 * `maxImageDimension`, `autoAdjustDpi`, `minDpi`, `maxDpi`) onto a fresh
 * object, leaving out undefined ones.
 *
 * @param {object} [images] - Image extraction options.
 * @returns {object|undefined} The copy, or `undefined` when `images` is falsy.
 */
function normalizeImageExtractionConfig(images) {
  if (!images) {
    return void 0;
  }
  const forwarded = ["extractImages", "targetDpi", "maxImageDimension", "autoAdjustDpi", "minDpi", "maxDpi"];
  const normalized = {};
  for (const key of forwarded) {
    setIfDefined(normalized, key, images[key]);
  }
  return normalized;
}
424
/**
 * Copies the PDF options (`extractImages`, `passwords`, `extractMetadata`)
 * onto a fresh object, leaving out undefined ones.
 *
 * @param {object} [pdf] - PDF options.
 * @returns {object|undefined} The copy, or `undefined` when `pdf` is falsy.
 */
function normalizePdfConfig(pdf) {
  if (!pdf) {
    return void 0;
  }
  const normalized = {};
  for (const key of ["extractImages", "passwords", "extractMetadata"]) {
    setIfDefined(normalized, key, pdf[key]);
  }
  return normalized;
}
434
/**
 * Copies the token-reduction options (`mode`, `preserveImportantWords`) onto
 * a fresh object, leaving out undefined ones.
 *
 * @param {object} [tokenReduction] - Token reduction options.
 * @returns {object|undefined} The copy, or `undefined` when the input is falsy.
 */
function normalizeTokenReductionConfig(tokenReduction) {
  if (!tokenReduction) {
    return void 0;
  }
  const normalized = {};
  for (const key of ["mode", "preserveImportantWords"]) {
    setIfDefined(normalized, key, tokenReduction[key]);
  }
  return normalized;
}
443
/**
 * Copies the language-detection options (`enabled`, `minConfidence`,
 * `detectMultiple`) onto a fresh object, leaving out undefined ones.
 *
 * @param {object} [languageDetection] - Language detection options.
 * @returns {object|undefined} The copy, or `undefined` when the input is falsy.
 */
function normalizeLanguageDetectionConfig(languageDetection) {
  if (!languageDetection) {
    return void 0;
  }
  const normalized = {};
  for (const key of ["enabled", "minConfidence", "detectMultiple"]) {
    setIfDefined(normalized, key, languageDetection[key]);
  }
  return normalized;
}
453
/**
 * Copies the post-processor options (`enabled`, `enabledProcessors`,
 * `disabledProcessors`) onto a fresh object, leaving out undefined ones.
 *
 * @param {object} [postprocessor] - Post-processor options.
 * @returns {object|undefined} The copy, or `undefined` when the input is falsy.
 */
function normalizePostProcessorConfig(postprocessor) {
  if (!postprocessor) {
    return void 0;
  }
  const normalized = {};
  for (const key of ["enabled", "enabledProcessors", "disabledProcessors"]) {
    setIfDefined(normalized, key, postprocessor[key]);
  }
  return normalized;
}
463
/**
 * Copies the HTML preprocessing options (`enabled`, `preset`,
 * `removeNavigation`, `removeForms`) onto a fresh object, leaving out
 * undefined ones.
 *
 * @param {object} [options] - Preprocessing options.
 * @returns {object|undefined} The copy, or `undefined` when `options` is falsy.
 */
function normalizeHtmlPreprocessing(options) {
  if (!options) {
    return void 0;
  }
  const normalized = {};
  for (const key of ["enabled", "preset", "removeNavigation", "removeForms"]) {
    setIfDefined(normalized, key, options[key]);
  }
  return normalized;
}
474
/**
 * Copies the HTML conversion options this wrapper forwards onto a fresh
 * object, leaving out undefined ones, then attaches the normalized
 * `preprocessing` section when one is provided.
 *
 * @param {object} [options] - HTML conversion options.
 * @returns {object|undefined} The copy, or `undefined` when `options` is falsy.
 */
function normalizeHtmlOptions(options) {
  if (!options) {
    return void 0;
  }
  const forwarded = [
    "headingStyle",
    "listIndentType",
    "listIndentWidth",
    "bullets",
    "strongEmSymbol",
    "escapeAsterisks",
    "escapeUnderscores",
    "escapeMisc",
    "escapeAscii",
    "codeLanguage",
    "autolinks",
    "defaultTitle",
    "brInTables",
    "hocrSpatialTables",
    "highlightStyle",
    "extractMetadata",
    "whitespaceMode",
    "stripNewlines",
    "wrap",
    "wrapWidth",
    "convertAsInline",
    "subSymbol",
    "supSymbol",
    "newlineStyle",
    "codeBlockStyle",
    "keepInlineImagesIn",
    "encoding",
    "debug",
    "stripTags",
    "preserveTags"
  ];
  const normalized = {};
  for (const key of forwarded) {
    setIfDefined(normalized, key, options[key]);
  }
  setIfDefined(normalized, "preprocessing", normalizeHtmlPreprocessing(options.preprocessing));
  return normalized;
}
513
/**
 * Copies the keyword-extraction options (`algorithm`, `maxKeywords`,
 * `minScore`, `ngramRange`, `language`, `yakeParams`, `rakeParams`) onto a
 * fresh object, leaving out undefined ones.
 *
 * @param {object} [config] - Keyword extraction options.
 * @returns {object|undefined} The copy, or `undefined` when `config` is falsy.
 */
function normalizeKeywordConfig(config) {
  if (!config) {
    return void 0;
  }
  const forwarded = ["algorithm", "maxKeywords", "minScore", "ngramRange", "language", "yakeParams", "rakeParams"];
  const normalized = {};
  for (const key of forwarded) {
    setIfDefined(normalized, key, config[key]);
  }
  return normalized;
}
527
/**
 * Copies the page options (`extractPages`, `insertPageMarkers`,
 * `markerFormat`) onto a fresh object, leaving out undefined ones.
 *
 * @param {object} [pages] - Page options.
 * @returns {object|undefined} The copy, or `undefined` when `pages` is falsy.
 */
function normalizePageConfig(pages) {
  if (!pages) {
    return void 0;
  }
  const normalized = {};
  for (const key of ["extractPages", "insertPageMarkers", "markerFormat"]) {
    setIfDefined(normalized, key, pages[key]);
  }
  return normalized;
}
537
/**
 * Builds the configuration object handed to the binding.
 *
 * Top-level flags are copied when defined. Each nested section is passed
 * through its own normalizer and attached only when that normalizer returns a
 * value.
 *
 * @param {object|null} [config] - User-supplied extraction configuration.
 * @returns {object|null} The normalized configuration, or `null` when `config` is falsy.
 */
function normalizeExtractionConfig(config) {
  if (!config) {
    return null;
  }
  const normalized = {};
  for (const flag of ["useCache", "enableQualityProcessing", "forceOcr", "maxConcurrentExtractions"]) {
    setIfDefined(normalized, flag, config[flag]);
  }
  // Section key -> normalizer, in the order the sections are attached.
  const sections = [
    ["ocr", (section) => normalizeOcrConfig(section)],
    ["chunking", (section) => normalizeChunkingConfig(section)],
    ["images", (section) => normalizeImageExtractionConfig(section)],
    ["pdfOptions", (section) => normalizePdfConfig(section)],
    ["tokenReduction", (section) => normalizeTokenReductionConfig(section)],
    ["languageDetection", (section) => normalizeLanguageDetectionConfig(section)],
    ["postprocessor", (section) => normalizePostProcessorConfig(section)],
    ["keywords", (section) => normalizeKeywordConfig(section)],
    ["pages", (section) => normalizePageConfig(section)],
    ["htmlOptions", (section) => normalizeHtmlOptions(section)]
  ];
  for (const [key, normalizeSection] of sections) {
    setIfDefined(normalized, key, normalizeSection(config[key]));
  }
  return normalized;
}
568
/**
 * Synchronously extracts content from a file through the native binding.
 *
 * The second argument may be a MIME type string or, alternatively, the
 * configuration object. When it is an object, `maybeConfig` is ignored.
 *
 * @param {string} filePath - Path handed to the binding.
 * @param {string|object|null} [mimeTypeOrConfig] - MIME type or configuration.
 * @param {object|null} [maybeConfig] - Configuration when the second argument is not an object.
 * @returns {object} The converted extraction result.
 */
function extractFileSync(filePath, mimeTypeOrConfig, maybeConfig) {
  const mimeType = typeof mimeTypeOrConfig === "string" ? mimeTypeOrConfig : null;
  const secondArgIsConfig = mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object";
  const config = secondArgIsConfig ? mimeTypeOrConfig : maybeConfig ?? null;

  const nativeConfig = normalizeExtractionConfig(config);
  const raw = getBinding().extractFileSync(filePath, mimeType, nativeConfig);
  return convertResult(raw);
}
585
/**
 * Asynchronously extracts content from a file through the native binding.
 *
 * The second argument may be a MIME type string or, alternatively, the
 * configuration object. When it is an object, `maybeConfig` is ignored.
 *
 * @param {string} filePath - Path handed to the binding.
 * @param {string|object|null} [mimeTypeOrConfig] - MIME type or configuration.
 * @param {object|null} [maybeConfig] - Configuration when the second argument is not an object.
 * @returns {Promise<object>} The converted extraction result.
 */
async function extractFile(filePath, mimeTypeOrConfig, maybeConfig) {
  const mimeType = typeof mimeTypeOrConfig === "string" ? mimeTypeOrConfig : null;
  const secondArgIsConfig = mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object";
  const config = secondArgIsConfig ? mimeTypeOrConfig : maybeConfig ?? null;

  const nativeConfig = normalizeExtractionConfig(config);
  const raw = await getBinding().extractFile(filePath, mimeType, nativeConfig);
  return convertResult(raw);
}
602
/**
 * Synchronously extracts content from in-memory bytes.
 *
 * A string first argument is treated as a file path and read from disk.
 *
 * @param {Uint8Array|string} dataOrPath - Bytes, or a path to read them from.
 * @param {string} mimeType - MIME type handed to the binding.
 * @param {object|null} [config=null] - Extraction configuration.
 * @returns {object} The converted extraction result.
 * @throws {TypeError} When the data is not a Uint8Array.
 */
function extractBytesSync(dataOrPath, mimeType, config = null) {
  const source = typeof dataOrPath === "string" ? (0, import_node_fs.readFileSync)(dataOrPath) : dataOrPath;
  const bytes = assertUint8Array(source, "data");
  const nativeConfig = normalizeExtractionConfig(config);
  const raw = getBinding().extractBytesSync(Buffer.from(bytes), mimeType, nativeConfig);
  return convertResult(raw);
}
614
/**
 * Asynchronously extracts content from in-memory bytes.
 *
 * A string first argument is treated as a file path. It is read with the
 * promise-based file API so the event loop is not blocked while the file is
 * loaded.
 *
 * @param {Uint8Array|string} dataOrPath - Bytes, or a path to read them from.
 * @param {string} mimeType - MIME type handed to the binding.
 * @param {object|null} [config=null] - Extraction configuration.
 * @returns {Promise<object>} The converted extraction result. Rejects with a
 *   TypeError when the data is not a Uint8Array, or with the read error when
 *   the path cannot be read.
 */
async function extractBytes(dataOrPath, mimeType, config = null) {
  const data = typeof dataOrPath === "string"
    ? await import_node_fs.promises.readFile(dataOrPath)
    : dataOrPath;
  const validated = assertUint8Array(data, "data");
  // Opt-in diagnostic: print the first bytes of the input.
  if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
    console.log("[TypeScript] Debug input header:", Array.from(validated.slice(0, 8)));
  }
  const normalizedConfig = normalizeExtractionConfig(config);
  const rawResult = await getBinding().extractBytes(Buffer.from(validated), mimeType, normalizedConfig);
  return convertResult(rawResult);
}
629
/**
 * Synchronously extracts several files in one call to the binding.
 *
 * @param {string[]} paths - Paths handed to the binding.
 * @param {object|null} [config=null] - Extraction configuration.
 * @returns {object[]} One converted result per entry returned by the binding.
 */
function batchExtractFilesSync(paths, config = null) {
  const nativeConfig = normalizeExtractionConfig(config);
  const rawBatch = getBinding().batchExtractFilesSync(paths, nativeConfig);
  return rawBatch.map((raw) => convertResult(raw));
}
634
/**
 * Asynchronously extracts several files in one call to the binding.
 *
 * @param {string[]} paths - Paths handed to the binding.
 * @param {object|null} [config=null] - Extraction configuration.
 * @returns {Promise<object[]>} One converted result per entry returned by the binding.
 */
async function batchExtractFiles(paths, config = null) {
  const nativeConfig = normalizeExtractionConfig(config);
  const rawBatch = await getBinding().batchExtractFiles(paths, nativeConfig);
  return rawBatch.map((raw) => convertResult(raw));
}
639
/**
 * Synchronously extracts several in-memory documents in one call to the binding.
 *
 * @param {Uint8Array[]} dataList - Document bytes; each entry is copied into a Buffer.
 * @param {string[]} mimeTypes - MIME type for each entry of `dataList`.
 * @param {object|null} [config=null] - Extraction configuration.
 * @returns {object[]} One converted result per entry returned by the binding.
 * @throws {TypeError} When `dataList` is not a list of Uint8Array or the two
 *   lists differ in length.
 */
function batchExtractBytesSync(dataList, mimeTypes, config = null) {
  const validated = assertUint8ArrayList(dataList, "dataList");
  const buffers = validated.map((bytes) => Buffer.from(bytes));
  if (mimeTypes.length !== buffers.length) {
    throw new TypeError("dataList and mimeTypes must have the same length");
  }
  const nativeConfig = normalizeExtractionConfig(config);
  const rawBatch = getBinding().batchExtractBytesSync(buffers, mimeTypes, nativeConfig);
  return rawBatch.map((raw) => convertResult(raw));
}
648
/**
 * Asynchronously extracts several in-memory documents in one call to the binding.
 *
 * @param {Uint8Array[]} dataList - Document bytes; each entry is copied into a Buffer.
 * @param {string[]} mimeTypes - MIME type for each entry of `dataList`.
 * @param {object|null} [config=null] - Extraction configuration.
 * @returns {Promise<object[]>} One converted result per entry returned by the
 *   binding. Rejects with a TypeError when `dataList` is not a list of
 *   Uint8Array or the two lists differ in length.
 */
async function batchExtractBytes(dataList, mimeTypes, config = null) {
  const validated = assertUint8ArrayList(dataList, "dataList");
  const buffers = validated.map((bytes) => Buffer.from(bytes));
  if (mimeTypes.length !== buffers.length) {
    throw new TypeError("dataList and mimeTypes must have the same length");
  }
  const nativeConfig = normalizeExtractionConfig(config);
  const rawBatch = await getBinding().batchExtractBytes(buffers, mimeTypes, nativeConfig);
  return rawBatch.map((raw) => convertResult(raw));
}
657
/**
 * Registers a post-processor with the native binding.
 *
 * The processor is wrapped so that the binding exchanges JSON strings with
 * snake_case keys while the user's `process` works with camelCase result
 * objects. `name` and `processingStage` may each be a plain value or a
 * zero-argument method.
 *
 * @param {object} processor - Object exposing `name`, `process(result)` and,
 *   optionally, `processingStage`.
 */
function registerPostProcessor(processor) {
  const binding2 = getBinding();
  // Resolve the stage once. It may be a method or a plain value; calling a
  // plain string as a function would throw.
  const declaredStage = typeof processor.processingStage === "function" ? processor.processingStage() : processor.processingStage;
  const wrappedProcessor = {
    name: typeof processor.name === "function" ? processor.name() : processor.name,
    processingStage: declaredStage,
    async process(...args) {
      const wrappedValue = args[0];
      // The payload is expected as a one-element array holding the JSON text;
      // a bare JSON string is accepted as well.
      const jsonString = typeof wrappedValue === "string" ? wrappedValue : wrappedValue[0];
      const wireResult = JSON.parse(jsonString);
      const result = {
        content: wireResult.content,
        mimeType: wireResult.mime_type,
        metadata: typeof wireResult.metadata === "string" ? JSON.parse(wireResult.metadata) : wireResult.metadata,
        tables: wireResult.tables || [],
        detectedLanguages: wireResult.detected_languages ?? null,
        chunks: wireResult.chunks ?? null,
        images: wireResult.images ?? null
      };
      const updated = await processor.process(result);
      const wireUpdated = {
        content: updated.content,
        mime_type: updated.mimeType,
        metadata: updated.metadata,
        tables: updated.tables,
        detected_languages: updated.detectedLanguages,
        chunks: updated.chunks,
        images: updated.images
      };
      return JSON.stringify(wireUpdated);
    }
  };
  // Non-enumerable bookkeeping: the original processor and its effective stage.
  Object.defineProperty(wrappedProcessor, "__original", {
    value: processor,
    enumerable: false
  });
  Object.defineProperty(wrappedProcessor, "__stage", {
    value: declaredStage ?? "middle",
    enumerable: false
  });
  binding2.registerPostProcessor(wrappedProcessor);
}
699
/**
 * Removes a registered post-processor by name via the native binding.
 *
 * @param {string} name - Name of the post-processor to remove.
 */
function unregisterPostProcessor(name) {
  getBinding().unregisterPostProcessor(name);
}
703
/** Removes every registered post-processor via the native binding. */
function clearPostProcessors() {
  getBinding().clearPostProcessors();
}
707
/**
 * Lists the registered post-processors.
 *
 * @returns {*} Whatever the binding's `listPostProcessors` returns.
 */
function listPostProcessors() {
  return getBinding().listPostProcessors();
}
711
/**
 * Registers a validator with the native binding.
 *
 * The validator is wrapped so that the binding passes a JSON string with
 * snake_case keys while the user's `validate` receives a camelCase result
 * object. `name` and `priority` may each be a plain value or a zero-argument
 * method.
 *
 * @param {object} validator - Object exposing `name`, `validate(result)` and,
 *   optionally, `priority`.
 */
function registerValidator(validator) {
  const native = getBinding();
  const { name, priority } = validator;
  const wrappedValidator = {
    name: typeof name === "function" ? validator.name() : name,
    priority: typeof priority === "function" ? validator.priority() : priority,
    async validate(...args) {
      const [payload] = args;
      if (!payload || payload === "undefined") {
        throw new Error("Validator received invalid JSON string");
      }
      const wire = JSON.parse(payload);
      const rawMetadata = wire.metadata;
      const result = {
        content: wire.content,
        mimeType: wire.mime_type,
        metadata: typeof rawMetadata === "string" ? JSON.parse(rawMetadata) : rawMetadata,
        tables: wire.tables || [],
        detectedLanguages: wire.detected_languages,
        chunks: wire.chunks,
        images: wire.images ?? null
      };
      // The validator signals failure by throwing/rejecting; success is reported as "".
      await Promise.resolve(validator.validate(result));
      return "";
    }
  };
  native.registerValidator(wrappedValidator);
}
737
/**
 * Forwards `name` to `unregisterValidator` on the object returned by
 * `getBinding()`.
 *
 * @param name - Identifier handed to the binding unchanged.
 */
function unregisterValidator(name) {
  getBinding().unregisterValidator(name);
}
741
/**
 * Invokes `clearValidators` on the object returned by `getBinding()`.
 * Nothing is returned to the caller.
 */
function clearValidators() {
  getBinding().clearValidators();
}
745
/**
 * Returns whatever `listValidators` on the object returned by `getBinding()`
 * yields, without modification.
 */
function listValidators() {
  return getBinding().listValidators();
}
749
/**
 * Tells whether `value` is a two-element array whose second entry is a string
 * and whose first entry is a string, a `Buffer`, or a `Uint8Array`.
 *
 * @param value - Anything.
 * @returns `true` when the shape matches, otherwise `false`.
 */
function isOcrProcessTuple(value) {
  if (!Array.isArray(value) || value.length !== 2) {
    return false;
  }
  if (typeof value[1] !== "string") {
    return false;
  }
  const payload = value[0];
  return typeof payload === "string" || Buffer.isBuffer(payload) || payload instanceof Uint8Array;
}
752
/**
 * Tells whether `value` is a single-element array whose only entry satisfies
 * `isOcrProcessTuple`.
 *
 * @param value - Anything.
 * @returns `false` for non-arrays and arrays of any other length; otherwise
 *   the result of `isOcrProcessTuple(value[0])`.
 */
function isNestedOcrProcessTuple(value) {
  if (!Array.isArray(value)) {
    return false;
  }
  if (value.length !== 1) {
    return false;
  }
  return isOcrProcessTuple(value[0]);
}
755
/**
 * Produces a small `{ ctor, length }` summary of a payload for diagnostics.
 *
 * - `null` / `undefined` yield `{ ctor: "null" | "undefined", length: 0 }`
 *   instead of throwing, so a diagnostic helper never masks the real problem
 *   with a `TypeError` of its own.
 * - Strings yield `{ ctor: "String", length }`.
 * - Anything else yields its constructor name (falling back to "Buffer" when
 *   no constructor name is available) and its `length` property.
 *
 * @param value - Payload to describe.
 * @returns Object with `ctor` (string) and `length`.
 */
function describePayload(value) {
  if (value === null || value === undefined) {
    return { ctor: String(value), length: 0 };
  }
  if (typeof value === "string") {
    return { ctor: "String", length: value.length };
  }
  return { ctor: value.constructor?.name ?? "Buffer", length: value.length };
}
761
/**
 * Wraps a user-supplied OCR backend and registers the wrapper through
 * `registerOcrBackend` on the object returned by `getBinding()`.
 *
 * `name` and `supportedLanguages` may each be a plain value or a zero-argument
 * method on `backend`; a missing `supportedLanguages` property defaults to
 * `["en"]`.
 *
 * The wrapper's `processImage` accepts its arguments in three shapes:
 *   - `(payload, language)`
 *   - `([payload, language])`
 *   - `([[payload, language]])`
 * where `payload` is a base64 string or binary data. The payload is converted
 * to a `Uint8Array`, handed to `backend.processImage(bytes, language)`, and the
 * awaited result is returned as a JSON string.
 *
 * Setting the environment variable `KREUZBERG_DEBUG_GUTEN=1` enables console
 * diagnostics about the received arguments.
 *
 * @param backend - Object exposing `name`, optional `supportedLanguages`, and
 *   `processImage(bytes, language)`.
 * @throws {Error} (from the wrapper, as a rejected promise) when no string
 *   language or no image payload was received.
 */
function registerOcrBackend(backend) {
  const binding2 = getBinding();
  const wrappedBackend = {
    name: typeof backend.name === "function" ? backend.name() : backend.name,
    supportedLanguages: typeof backend.supportedLanguages === "function" ? backend.supportedLanguages() : (backend.supportedLanguages ?? ["en"]),
    async processImage(...processArgs) {
      const [imagePayload, maybeLanguage] = processArgs;
      const debugEnabled = process.env["KREUZBERG_DEBUG_GUTEN"] === "1";
      if (debugEnabled) {
        // Describe the payload without dereferencing a missing value, so the
        // diagnostics never throw before the real validation below runs.
        let metadata;
        if (Array.isArray(imagePayload)) {
          metadata = { tupleLength: imagePayload.length };
        } else if (imagePayload === null || imagePayload === undefined) {
          metadata = { ctor: String(imagePayload), length: 0 };
        } else {
          metadata = describePayload(imagePayload);
        }
        console.log("[registerOcrBackend] JS arguments", { length: processArgs.length });
        console.log("[registerOcrBackend] Raw args", {
          imagePayloadType: Array.isArray(imagePayload) ? "tuple" : typeof imagePayload,
          maybeLanguageType: typeof maybeLanguage,
          metadata
        });
      }
      let rawBytes;
      let language = maybeLanguage;
      if (isNestedOcrProcessTuple(imagePayload)) {
        [rawBytes, language] = imagePayload[0];
      } else if (isOcrProcessTuple(imagePayload)) {
        [rawBytes, language] = imagePayload;
      } else {
        rawBytes = imagePayload;
      }
      if (typeof language !== "string") {
        throw new Error("OCR backend did not receive a language parameter");
      }
      if (rawBytes === null || rawBytes === undefined) {
        // Fail with a descriptive message rather than an opaque TypeError
        // from Buffer.from().
        throw new Error("OCR backend did not receive image data");
      }
      if (debugEnabled) {
        console.log(
          "[registerOcrBackend] Received payload",
          Array.isArray(imagePayload) ? "tuple" : typeof rawBytes,
          "ctor",
          describePayload(rawBytes).ctor,
          "length",
          rawBytes.length
        );
      }
      // String payloads are treated as base64; everything else as raw bytes.
      const buffer = typeof rawBytes === "string" ? Buffer.from(rawBytes, "base64") : Buffer.from(rawBytes);
      const result = await backend.processImage(new Uint8Array(buffer), language);
      return JSON.stringify(result);
    }
  };
  binding2.registerOcrBackend(wrappedBackend);
}
806
/**
 * Returns whatever `listOcrBackends` on the object returned by `getBinding()`
 * yields, without modification.
 */
function listOcrBackends() {
  return getBinding().listOcrBackends();
}
810
/**
 * Forwards `name` to `unregisterOcrBackend` on the object returned by
 * `getBinding()`.
 *
 * @param name - Identifier handed to the binding unchanged.
 */
function unregisterOcrBackend(name) {
  getBinding().unregisterOcrBackend(name);
}
814
/**
 * Invokes `clearOcrBackends` on the object returned by `getBinding()`.
 * Nothing is returned to the caller.
 */
function clearOcrBackends() {
  getBinding().clearOcrBackends();
}
818
/**
 * Returns whatever `listDocumentExtractors` on the object returned by
 * `getBinding()` yields, without modification.
 */
function listDocumentExtractors() {
  return getBinding().listDocumentExtractors();
}
822
/**
 * Forwards `name` to `unregisterDocumentExtractor` on the object returned by
 * `getBinding()`.
 *
 * @param name - Identifier handed to the binding unchanged.
 */
function unregisterDocumentExtractor(name) {
  getBinding().unregisterDocumentExtractor(name);
}
826
/**
 * Invokes `clearDocumentExtractors` on the object returned by `getBinding()`.
 * Nothing is returned to the caller.
 */
function clearDocumentExtractors() {
  getBinding().clearDocumentExtractors();
}
830
const ExtractionConfig = {
  /**
   * Read an extraction configuration from a file on disk.
   *
   * The format is chosen from the file extension:
   * - `.toml` - TOML
   * - `.yaml` - YAML
   * - `.json` - JSON
   *
   * The work is delegated to `loadExtractionConfigFromFile` on the object
   * returned by `getBinding()`; its return value is passed back unchanged.
   *
   * @param filePath - Absolute or relative path to the configuration file
   * @returns The ExtractionConfig object loaded from the file
   *
   * @throws {Error} If the file does not exist or is not accessible
   * @throws {Error} If the file content is not valid TOML/YAML/JSON
   * @throws {Error} If the configuration structure is invalid
   * @throws {Error} If the file extension is not supported
   *
   * @example
   * ```typescript
   * import { ExtractionConfig } from '@kreuzberg/node';
   *
   * const fromToml = ExtractionConfig.fromFile('kreuzberg.toml');
   * const fromYaml = ExtractionConfig.fromFile('./config.yaml');
   * const fromJson = ExtractionConfig.fromFile('./config.json');
   * ```
   */
  fromFile(filePath) {
    return getBinding().loadExtractionConfigFromFile(filePath);
  },
  /**
   * Locate and load a configuration from the current or a parent directory.
   *
   * Looks for a `kreuzberg.toml` file, starting at the current working
   * directory and walking up the directory tree; the first file found wins.
   * The work is delegated to `discoverExtractionConfig` on the object
   * returned by `getBinding()`; its return value is passed back unchanged.
   *
   * @returns The ExtractionConfig object if one is found, or null otherwise
   *
   * @example
   * ```typescript
   * import { ExtractionConfig } from '@kreuzberg/node';
   *
   * const config = ExtractionConfig.discover();
   * if (config) {
   *   console.log('Found configuration');
   * } else {
   *   console.log('No configuration file found, using defaults');
   * }
   * ```
   */
  discover() {
    return getBinding().discoverExtractionConfig();
  }
};
892
/**
 * Passes `bytes` to `detectMimeTypeFromBytes` on the object returned by
 * `getBinding()` and returns its result unchanged.
 *
 * @param bytes - Content handed to the binding as-is.
 */
function detectMimeType(bytes) {
  return getBinding().detectMimeTypeFromBytes(bytes);
}
896
/**
 * Passes `filePath` and `checkExists` to `detectMimeTypeFromPath` on the
 * object returned by `getBinding()` and returns its result unchanged.
 *
 * @param filePath - Path handed to the binding as-is.
 * @param checkExists - Flag handed to the binding as-is (may be omitted).
 */
function detectMimeTypeFromPath(filePath, checkExists) {
  return getBinding().detectMimeTypeFromPath(filePath, checkExists);
}
900
/**
 * Passes `mimeType` to `validateMimeType` on the object returned by
 * `getBinding()` and returns its result unchanged.
 *
 * @param mimeType - Value handed to the binding as-is.
 */
function validateMimeType(mimeType) {
  return getBinding().validateMimeType(mimeType);
}
904
/**
 * Passes `mimeType` to `getExtensionsForMime` on the object returned by
 * `getBinding()` and returns its result unchanged.
 *
 * @param mimeType - Value handed to the binding as-is.
 */
function getExtensionsForMime(mimeType) {
  return getBinding().getExtensionsForMime(mimeType);
}
908
/**
 * Returns whatever `listEmbeddingPresets` on the object returned by
 * `getBinding()` yields, without modification.
 */
function listEmbeddingPresets() {
  return getBinding().listEmbeddingPresets();
}
912
/**
 * Passes `name` to `getEmbeddingPreset` on the object returned by
 * `getBinding()` and returns its result unchanged.
 *
 * @param name - Value handed to the binding as-is.
 */
function getEmbeddingPreset(name) {
  return getBinding().getEmbeddingPreset(name);
}
917
/**
 * Returns whatever `getLastErrorCode` on the object returned by
 * `getBinding()` yields, without modification.
 */
function getLastErrorCode() {
  return getBinding().getLastErrorCode();
}
921
/**
 * Returns whatever `getLastPanicContext` on the object returned by
 * `getBinding()` yields, without modification.
 */
function getLastPanicContext() {
  return getBinding().getLastPanicContext();
}
926
/**
 * Passes `code` to `getErrorCodeName` on the object returned by
 * `getBinding()` and returns its result unchanged.
 *
 * @param code - Value handed to the binding as-is.
 */
function getErrorCodeName(code) {
  return getBinding().getErrorCodeName(code);
}
930
/**
 * Passes `code` to `getErrorCodeDescription` on the object returned by
 * `getBinding()` and returns its result unchanged.
 *
 * @param code - Value handed to the binding as-is.
 */
function getErrorCodeDescription(code) {
  return getBinding().getErrorCodeDescription(code);
}
934
/**
 * Passes `errorMessage` to `classifyError` on the object returned by
 * `getBinding()` and returns its result unchanged.
 *
 * @param errorMessage - Value handed to the binding as-is.
 */
function classifyError(errorMessage) {
  return getBinding().classifyError(errorMessage);
}
939
/**
 * Passes `size` to `createWorkerPool` on the object returned by
 * `getBinding()` and returns the resulting pool object unchanged.
 *
 * @param size - Value handed to the binding as-is (may be omitted).
 */
function createWorkerPool(size) {
  return getBinding().createWorkerPool(size);
}
944
/**
 * Passes `pool` to `getWorkerPoolStats` on the object returned by
 * `getBinding()` and returns the resulting stats object unchanged.
 *
 * @param pool - Pool handle handed to the binding as-is.
 */
function getWorkerPoolStats(pool) {
  return getBinding().getWorkerPoolStats(pool);
}
949
/**
 * Runs a file extraction through a worker pool.
 *
 * The third argument is overloaded:
 *   - a string is used as the MIME type, and `maybeConfig` as the config;
 *   - a non-null object is used as the config (`maybeConfig` is ignored);
 *   - anything else means "no MIME type", and `maybeConfig` is the config.
 * A missing config becomes `null`. The config is passed through
 * `normalizeExtractionConfig`, the call is made via `extractFileInWorker` on
 * the object returned by `getBinding()`, and the awaited raw result is passed
 * through `convertResult`.
 *
 * @param pool - Pool handle handed to the binding as-is.
 * @param filePath - Path handed to the binding as-is.
 * @param mimeTypeOrConfig - MIME type string, config object, or null/undefined.
 * @param maybeConfig - Config used when the third argument is not an object.
 * @returns Promise resolving to `convertResult(rawResult)`.
 */
async function extractFileInWorker(pool, filePath, mimeTypeOrConfig, maybeConfig) {
  const gotMimeString = typeof mimeTypeOrConfig === "string";
  const gotConfigObject = !gotMimeString && mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object";
  const mimeType = gotMimeString ? mimeTypeOrConfig : null;
  const config = gotConfigObject ? mimeTypeOrConfig : maybeConfig ?? null;
  const normalizedConfig = normalizeExtractionConfig(config);
  const rawResult = await getBinding().extractFileInWorker(pool, filePath, mimeType, normalizedConfig);
  return convertResult(rawResult);
}
972
/**
 * Runs a batch of file extractions through a worker pool.
 *
 * The config (default `null`) is passed through `normalizeExtractionConfig`,
 * the call is made via `batchExtractFilesInWorker` on the object returned by
 * `getBinding()`, and the awaited raw results are mapped with `convertResult`.
 *
 * @param pool - Pool handle handed to the binding as-is.
 * @param paths - Paths handed to the binding as-is.
 * @param config - Extraction config, or null.
 * @returns Promise resolving to the array of converted results.
 */
async function batchExtractFilesInWorker(pool, paths, config = null) {
  const normalizedConfig = normalizeExtractionConfig(config);
  const nativeResults = await getBinding().batchExtractFilesInWorker(pool, paths, normalizedConfig);
  return nativeResults.map(convertResult);
}
982
/**
 * Awaits `closeWorkerPool(pool)` on the object returned by `getBinding()`.
 *
 * @param pool - Pool handle handed to the binding as-is.
 * @returns Promise that resolves to `undefined` once the binding call settles.
 */
async function closeWorkerPool(pool) {
  await getBinding().closeWorkerPool(pool);
}
986
+ const __version__ = "4.0.0";
987
+ // Annotate the CommonJS export names for ESM import in node:
988
+ 0 && (module.exports = {
989
+ CacheError,
990
+ ErrorCode,
991
+ ExtractionConfig,
992
+ GutenOcrBackend,
993
+ ImageProcessingError,
994
+ KreuzbergError,
995
+ MissingDependencyError,
996
+ OcrError,
997
+ ParsingError,
998
+ PluginError,
999
+ ValidationError,
1000
+ __resetBindingForTests,
1001
+ __setBindingForTests,
1002
+ __version__,
1003
+ batchExtractBytes,
1004
+ batchExtractBytesSync,
1005
+ batchExtractFiles,
1006
+ batchExtractFilesInWorker,
1007
+ batchExtractFilesSync,
1008
+ classifyError,
1009
+ clearDocumentExtractors,
1010
+ clearOcrBackends,
1011
+ clearPostProcessors,
1012
+ clearValidators,
1013
+ closeWorkerPool,
1014
+ createWorkerPool,
1015
+ detectMimeType,
1016
+ detectMimeTypeFromPath,
1017
+ extractBytes,
1018
+ extractBytesSync,
1019
+ extractFile,
1020
+ extractFileInWorker,
1021
+ extractFileSync,
1022
+ getEmbeddingPreset,
1023
+ getErrorCodeDescription,
1024
+ getErrorCodeName,
1025
+ getExtensionsForMime,
1026
+ getLastErrorCode,
1027
+ getLastPanicContext,
1028
+ getWorkerPoolStats,
1029
+ listDocumentExtractors,
1030
+ listEmbeddingPresets,
1031
+ listOcrBackends,
1032
+ listPostProcessors,
1033
+ listValidators,
1034
+ registerOcrBackend,
1035
+ registerPostProcessor,
1036
+ registerValidator,
1037
+ unregisterDocumentExtractor,
1038
+ unregisterOcrBackend,
1039
+ unregisterPostProcessor,
1040
+ unregisterValidator,
1041
+ validateMimeType,
1042
+ ...require("./types.js")
1043
+ });
1044
+ //# sourceMappingURL=index.js.map