@kreuzberg/node 4.0.0-rc.5 → 4.0.0-rc.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +45 -14
- package/dist/cli.d.mts +9 -0
- package/dist/cli.d.ts +9 -0
- package/dist/cli.js +78 -0
- package/dist/cli.js.map +1 -0
- package/dist/cli.mjs +43 -0
- package/dist/cli.mjs.map +1 -0
- package/dist/errors.d.mts +358 -0
- package/dist/errors.d.ts +358 -0
- package/dist/errors.js +139 -0
- package/dist/errors.js.map +1 -0
- package/dist/errors.mjs +107 -0
- package/dist/errors.mjs.map +1 -0
- package/dist/index.d.mts +857 -0
- package/dist/index.d.ts +857 -0
- package/dist/index.js +815 -0
- package/dist/index.js.map +1 -0
- package/dist/index.mjs +754 -0
- package/dist/index.mjs.map +1 -0
- package/dist/ocr/guten-ocr.d.mts +193 -0
- package/dist/ocr/guten-ocr.d.ts +193 -0
- package/dist/ocr/guten-ocr.js +232 -0
- package/dist/ocr/guten-ocr.js.map +1 -0
- package/dist/ocr/guten-ocr.mjs +198 -0
- package/dist/ocr/guten-ocr.mjs.map +1 -0
- package/dist/types.d.mts +666 -0
- package/dist/types.d.ts +666 -0
- package/dist/types.js +17 -0
- package/dist/types.js.map +1 -0
- package/dist/types.mjs +1 -0
- package/dist/types.mjs.map +1 -0
- package/index.d.ts +11 -2
- package/index.js +52 -52
- package/package.json +30 -29
- package/LICENSE +0 -7
package/dist/index.mjs
ADDED
|
@@ -0,0 +1,754 @@
|
|
|
1
|
+
import { createRequire } from "node:module";
|
|
2
|
+
import {
|
|
3
|
+
CacheError,
|
|
4
|
+
ErrorCode,
|
|
5
|
+
ImageProcessingError,
|
|
6
|
+
KreuzbergError,
|
|
7
|
+
MissingDependencyError,
|
|
8
|
+
OcrError,
|
|
9
|
+
ParsingError,
|
|
10
|
+
PluginError,
|
|
11
|
+
ValidationError
|
|
12
|
+
} from "./errors.js";
|
|
13
|
+
import { GutenOcrBackend } from "./ocr/guten-ocr.js";
|
|
14
|
+
export * from "./types.js";
|
|
15
|
+
let binding = null;
|
|
16
|
+
let bindingInitialized = false;
|
|
17
|
+
/**
 * Build a descriptive Error for a failure to load the native module.
 *
 * The message always contains a generic failure line, a request to report
 * the problem, and the underlying error text. When the thrown value is an
 * Error whose message mentions "pdfium", a hint about the missing Pdfium
 * runtime library is inserted as well.
 *
 * @param {unknown} error - The value thrown while requiring the native module.
 * @returns {Error} A new Error whose `cause` is the original thrown value.
 */
function createNativeBindingError(error) {
  const isError = error instanceof Error;
  const detail = isError ? error.message || error.toString() : String(error);
  const parts = ["Failed to load Kreuzberg native bindings."];
  if (isError && /pdfium/i.test(detail)) {
    parts.push(
      "Pdfium runtime library was not found. Ensure the bundled libpdfium (dll/dylib/so) is present next to the native module."
    );
  }
  parts.push(
    "Report this error and attach the logs/stack trace for investigation.",
    `Underlying error: ${detail}`
  );
  // Always keep the original thrown value reachable, even when it is not an Error.
  return new Error(parts.join(" "), { cause: error });
}
|
|
45
|
+
/**
 * Ensure `value` is a Uint8Array and hand it back unchanged.
 *
 * @param {unknown} value - Candidate value.
 * @param {string} name - Label used in the error message.
 * @returns {Uint8Array} The same `value`.
 * @throws {TypeError} If `value` is not a Uint8Array instance.
 */
function assertUint8Array(value, name) {
  if (value instanceof Uint8Array) {
    return value;
  }
  throw new TypeError(`${name} must be a Uint8Array`);
}
|
|
51
|
+
/**
 * Ensure `values` is an array whose elements are all Uint8Array instances.
 *
 * @param {unknown} values - Candidate list.
 * @param {string} name - Label used in error messages.
 * @returns {Uint8Array[]} A new array holding the same elements.
 * @throws {TypeError} If `values` is not an array, or an element is not a Uint8Array.
 */
function assertUint8ArrayList(values, name) {
  if (!Array.isArray(values)) {
    throw new TypeError(`${name} must be an array of Uint8Array`);
  }
  return values.map((item, position) => {
    if (!(item instanceof Uint8Array)) {
      throw new TypeError(`${name}[${position}] must be a Uint8Array`);
    }
    return item;
  });
}
|
|
64
|
+
/**
 * Test hook: install `mock` as the cached binding and mark the cache as
 * initialized, so getBinding() returns it without loading anything.
 *
 * @param {unknown} mock - Object to use in place of the native binding.
 */
function __setBindingForTests(mock) {
  bindingInitialized = true;
  binding = mock;
}
|
|
68
|
+
/**
 * Test hook: clear the cached binding so the next getBinding() call loads
 * the binding again.
 */
function __resetBindingForTests() {
  bindingInitialized = false;
  binding = null;
}
|
|
72
|
+
/**
 * Require the native binding entry point ("../index.js").
 *
 * Uses the ambient `require` when one exists; otherwise builds one with
 * `createRequire(import.meta.url)`.
 *
 * @returns {unknown} Whatever "../index.js" exports.
 * @throws {Error} If no require function could be obtained.
 */
function loadNativeBinding() {
  let localRequire;
  if (typeof require === "undefined") {
    localRequire = createRequire(import.meta.url);
  } else {
    localRequire = require;
  }
  if (!localRequire) {
    throw new Error("Unable to resolve native binding loader (require not available).");
  }
  return localRequire("../index.js");
}
|
|
82
|
+
/**
 * Return the binding, loading and caching it on first use.
 *
 * Loading is only attempted when `process.versions.node` is present. Any
 * error raised while loading is wrapped by createNativeBindingError.
 *
 * @returns {unknown} The cached binding (or a mock installed by the test hooks).
 * @throws {Error} If loading fails, or if the runtime does not look like Node.js.
 */
function getBinding() {
  if (bindingInitialized) {
    return binding;
  }
  let loaded = false;
  try {
    const runningOnNode = typeof process !== "undefined" && process.versions && process.versions.node;
    if (runningOnNode) {
      binding = loadNativeBinding();
      bindingInitialized = true;
      loaded = true;
    }
  } catch (error) {
    throw createNativeBindingError(error);
  }
  if (loaded) {
    return binding;
  }
  throw new Error(
    "Failed to load Kreuzberg bindings. Neither NAPI (Node.js) nor WASM (browsers/Deno) bindings are available. Make sure you have installed the @kreuzberg/node package for Node.js/Bun."
  );
}
|
|
99
|
+
/**
 * Parse a JSON metadata string, falling back to an empty object.
 *
 * @param {string} metadataStr - JSON text.
 * @returns {unknown} The parsed value, or `{}` when parsing throws.
 */
function parseMetadata(metadataStr) {
  let parsed = {};
  try {
    parsed = JSON.parse(metadataStr);
  } catch {
    // Best effort: unparseable metadata yields the empty-object default.
  }
  return parsed;
}
|
|
106
|
+
/**
 * Coerce a binary-like value into a Uint8Array.
 *
 * - Uint8Array instances (including Node.js Buffers, which subclass
 *   Uint8Array) are returned as-is.
 * - Other ArrayBuffer views (DataView, other typed arrays) are exposed as a
 *   Uint8Array over the same bytes.
 * - ArrayBuffers are wrapped in a Uint8Array view.
 * - Plain arrays are copied into a new Uint8Array.
 * - Anything else yields an empty Uint8Array.
 *
 * @param {unknown} value - Value to coerce.
 * @returns {Uint8Array} The coerced bytes.
 */
function ensureUint8Array(value) {
  if (value instanceof Uint8Array) {
    return value;
  }
  if (ArrayBuffer.isView(value)) {
    // Respect the view's window instead of exposing the whole backing buffer.
    return new Uint8Array(value.buffer, value.byteOffset, value.byteLength);
  }
  if (value instanceof ArrayBuffer) {
    return new Uint8Array(value);
  }
  if (Array.isArray(value)) {
    return new Uint8Array(value);
  }
  return new Uint8Array();
}
|
|
118
|
+
/**
 * Normalize a raw chunk into the public chunk shape.
 *
 * Metadata fields are read from either snake_case or camelCase keys.
 * `byteStart`/`byteEnd` additionally fall back to `charStart`/`charEnd`.
 * A missing chunk produces the same shape filled with defaults, including
 * `firstPage`/`lastPage`, so callers always see a uniform object.
 *
 * @param {object|null|undefined} rawChunk - Chunk as received.
 * @returns {{content: string, embedding: unknown, metadata: object}} Normalized chunk.
 */
function convertChunk(rawChunk) {
  const chunk = rawChunk || {};
  const metadata = chunk.metadata ?? {};
  return {
    content: chunk.content ?? "",
    embedding: chunk.embedding ?? null,
    metadata: {
      byteStart: metadata.byte_start ?? metadata.byteStart ?? metadata.charStart ?? 0,
      byteEnd: metadata.byte_end ?? metadata.byteEnd ?? metadata.charEnd ?? 0,
      tokenCount: metadata.token_count ?? metadata.tokenCount ?? null,
      chunkIndex: metadata.chunk_index ?? metadata.chunkIndex ?? 0,
      totalChunks: metadata.total_chunks ?? metadata.totalChunks ?? 0,
      firstPage: metadata.first_page ?? metadata.firstPage ?? null,
      lastPage: metadata.last_page ?? metadata.lastPage ?? null
    }
  };
}
|
|
147
|
+
/**
 * Normalize a raw extracted image into the public image shape.
 *
 * Missing fields receive defaults; `data` is passed through
 * ensureUint8Array and a nested `ocrResult` through convertResult.
 *
 * @param {object|null|undefined} rawImage - Image as received.
 * @returns {object} Normalized image record.
 */
function convertImage(rawImage) {
  const converted = {
    data: new Uint8Array(),
    format: "unknown",
    imageIndex: 0,
    pageNumber: null,
    width: null,
    height: null,
    colorspace: null,
    bitsPerComponent: null,
    isMask: false,
    description: null,
    ocrResult: null
  };
  if (!rawImage) {
    return converted;
  }
  converted.data = ensureUint8Array(rawImage.data);
  converted.format = rawImage.format ?? "unknown";
  converted.imageIndex = rawImage.imageIndex ?? 0;
  converted.pageNumber = rawImage.pageNumber ?? null;
  converted.width = rawImage.width ?? null;
  converted.height = rawImage.height ?? null;
  converted.colorspace = rawImage.colorspace ?? null;
  converted.bitsPerComponent = rawImage.bitsPerComponent ?? null;
  converted.isMask = rawImage.isMask ?? false;
  converted.description = rawImage.description ?? null;
  if (rawImage.ocrResult) {
    converted.ocrResult = convertResult(rawImage.ocrResult);
  }
  return converted;
}
|
|
177
|
+
/**
 * Convert a raw extraction result into the public result shape.
 *
 * String metadata is parsed with parseMetadata; chunk and image arrays are
 * normalized element by element, and become `null` when not arrays.
 *
 * @param {object} rawResult - Result as received.
 * @returns {object} Normalized extraction result.
 */
function convertResult(rawResult) {
  const { metadata: rawMetadata, chunks: rawChunks, images: rawImages } = rawResult;
  const metadata = typeof rawMetadata === "string" ? parseMetadata(rawMetadata) : rawMetadata;
  const chunks = Array.isArray(rawChunks) ? rawChunks.map((entry) => convertChunk(entry)) : null;
  const images = Array.isArray(rawImages) ? rawImages.map((entry) => convertImage(entry)) : null;
  return {
    content: rawResult.content,
    mimeType: rawResult.mimeType,
    metadata,
    tables: rawResult.tables || [],
    detectedLanguages: rawResult.detectedLanguages || null,
    chunks,
    images
  };
}
|
|
188
|
+
/**
 * Assign `target[key] = value` unless `value` is `undefined`.
 *
 * @param {object} target - Object to mutate.
 * @param {string} key - Property name.
 * @param {unknown} value - Value to assign; `undefined` is skipped.
 */
function setIfDefined(target, key, value) {
  if (value === undefined) {
    return;
  }
  target[key] = value;
}
|
|
193
|
+
/**
 * Copy the defined Tesseract options (`psm`, `enableTableDetection`,
 * `tesseditCharWhitelist`) into a fresh object.
 *
 * @param {object|null|undefined} config - Tesseract options.
 * @returns {object|undefined} The copied options, or `undefined` when `config` is falsy.
 */
function normalizeTesseractConfig(config) {
  if (!config) {
    return undefined;
  }
  const normalized = {};
  for (const key of ["psm", "enableTableDetection", "tesseditCharWhitelist"]) {
    const value = config[key];
    if (value !== undefined) {
      normalized[key] = value;
    }
  }
  return normalized;
}
|
|
203
|
+
/**
 * Build the OCR options object: `backend` is always copied; `language` and
 * the normalized `tesseractConfig` are included only when defined.
 *
 * @param {object|null|undefined} ocr - OCR options.
 * @returns {object|undefined} The normalized options, or `undefined` when `ocr` is falsy.
 */
function normalizeOcrConfig(ocr) {
  if (!ocr) {
    return undefined;
  }
  const normalized = { backend: ocr.backend };
  if (ocr.language !== undefined) {
    normalized.language = ocr.language;
  }
  const tesseractOptions = normalizeTesseractConfig(ocr.tesseractConfig);
  if (tesseractOptions) {
    normalized.tesseractConfig = tesseractOptions;
  }
  return normalized;
}
|
|
217
|
+
/**
 * Copy the defined chunking options into a fresh object.
 *
 * @param {object|null|undefined} chunking - Chunking options.
 * @returns {object|undefined} The copied options, or `undefined` when `chunking` is falsy.
 */
function normalizeChunkingConfig(chunking) {
  if (!chunking) {
    return undefined;
  }
  const normalized = {};
  for (const key of ["maxChars", "maxOverlap", "preset", "embedding", "enabled"]) {
    const value = chunking[key];
    if (value !== undefined) {
      normalized[key] = value;
    }
  }
  return normalized;
}
|
|
229
|
+
/**
 * Copy the defined image-extraction options into a fresh object.
 *
 * @param {object|null|undefined} images - Image extraction options.
 * @returns {object|undefined} The copied options, or `undefined` when `images` is falsy.
 */
function normalizeImageExtractionConfig(images) {
  if (!images) {
    return undefined;
  }
  const keys = ["extractImages", "targetDpi", "maxImageDimension", "autoAdjustDpi", "minDpi", "maxDpi"];
  const normalized = {};
  for (const key of keys) {
    const value = images[key];
    if (value !== undefined) {
      normalized[key] = value;
    }
  }
  return normalized;
}
|
|
242
|
+
/**
 * Copy the defined PDF options (`extractImages`, `passwords`,
 * `extractMetadata`) into a fresh object.
 *
 * @param {object|null|undefined} pdf - PDF options.
 * @returns {object|undefined} The copied options, or `undefined` when `pdf` is falsy.
 */
function normalizePdfConfig(pdf) {
  if (!pdf) {
    return undefined;
  }
  const normalized = {};
  for (const key of ["extractImages", "passwords", "extractMetadata"]) {
    const value = pdf[key];
    if (value !== undefined) {
      normalized[key] = value;
    }
  }
  return normalized;
}
|
|
252
|
+
/**
 * Copy the defined token-reduction options (`mode`,
 * `preserveImportantWords`) into a fresh object.
 *
 * @param {object|null|undefined} tokenReduction - Token reduction options.
 * @returns {object|undefined} The copied options, or `undefined` when the input is falsy.
 */
function normalizeTokenReductionConfig(tokenReduction) {
  if (!tokenReduction) {
    return undefined;
  }
  const normalized = {};
  for (const key of ["mode", "preserveImportantWords"]) {
    const value = tokenReduction[key];
    if (value !== undefined) {
      normalized[key] = value;
    }
  }
  return normalized;
}
|
|
261
|
+
/**
 * Copy the defined language-detection options (`enabled`, `minConfidence`,
 * `detectMultiple`) into a fresh object.
 *
 * @param {object|null|undefined} languageDetection - Language detection options.
 * @returns {object|undefined} The copied options, or `undefined` when the input is falsy.
 */
function normalizeLanguageDetectionConfig(languageDetection) {
  if (!languageDetection) {
    return undefined;
  }
  const normalized = {};
  for (const key of ["enabled", "minConfidence", "detectMultiple"]) {
    const value = languageDetection[key];
    if (value !== undefined) {
      normalized[key] = value;
    }
  }
  return normalized;
}
|
|
271
|
+
/**
 * Copy the defined post-processor options (`enabled`, `enabledProcessors`,
 * `disabledProcessors`) into a fresh object.
 *
 * @param {object|null|undefined} postprocessor - Post-processor options.
 * @returns {object|undefined} The copied options, or `undefined` when the input is falsy.
 */
function normalizePostProcessorConfig(postprocessor) {
  if (!postprocessor) {
    return undefined;
  }
  const normalized = {};
  for (const key of ["enabled", "enabledProcessors", "disabledProcessors"]) {
    const value = postprocessor[key];
    if (value !== undefined) {
      normalized[key] = value;
    }
  }
  return normalized;
}
|
|
281
|
+
/**
 * Copy the defined HTML preprocessing options (`enabled`, `preset`,
 * `removeNavigation`, `removeForms`) into a fresh object.
 *
 * @param {object|null|undefined} options - Preprocessing options.
 * @returns {object|undefined} The copied options, or `undefined` when `options` is falsy.
 */
function normalizeHtmlPreprocessing(options) {
  if (!options) {
    return undefined;
  }
  const normalized = {};
  for (const key of ["enabled", "preset", "removeNavigation", "removeForms"]) {
    const value = options[key];
    if (value !== undefined) {
      normalized[key] = value;
    }
  }
  return normalized;
}
|
|
292
|
+
/**
 * Copy the defined HTML conversion options into a fresh object and attach
 * the normalized `preprocessing` sub-options when present.
 *
 * @param {object|null|undefined} options - HTML conversion options.
 * @returns {object|undefined} The copied options, or `undefined` when `options` is falsy.
 */
function normalizeHtmlOptions(options) {
  if (!options) {
    return undefined;
  }
  const passthroughKeys = [
    "headingStyle",
    "listIndentType",
    "listIndentWidth",
    "bullets",
    "strongEmSymbol",
    "escapeAsterisks",
    "escapeUnderscores",
    "escapeMisc",
    "escapeAscii",
    "codeLanguage",
    "autolinks",
    "defaultTitle",
    "brInTables",
    "hocrSpatialTables",
    "highlightStyle",
    "extractMetadata",
    "whitespaceMode",
    "stripNewlines",
    "wrap",
    "wrapWidth",
    "convertAsInline",
    "subSymbol",
    "supSymbol",
    "newlineStyle",
    "codeBlockStyle",
    "keepInlineImagesIn",
    "encoding",
    "debug",
    "stripTags",
    "preserveTags"
  ];
  const normalized = {};
  for (const key of passthroughKeys) {
    const value = options[key];
    if (value !== undefined) {
      normalized[key] = value;
    }
  }
  const preprocessing = normalizeHtmlPreprocessing(options.preprocessing);
  if (preprocessing !== undefined) {
    normalized.preprocessing = preprocessing;
  }
  return normalized;
}
|
|
331
|
+
/**
 * Copy the defined keyword-extraction options into a fresh object.
 *
 * @param {object|null|undefined} config - Keyword extraction options.
 * @returns {object|undefined} The copied options, or `undefined` when `config` is falsy.
 */
function normalizeKeywordConfig(config) {
  if (!config) {
    return undefined;
  }
  const keys = ["algorithm", "maxKeywords", "minScore", "ngramRange", "language", "yakeParams", "rakeParams"];
  const normalized = {};
  for (const key of keys) {
    const value = config[key];
    if (value !== undefined) {
      normalized[key] = value;
    }
  }
  return normalized;
}
|
|
345
|
+
/**
 * Translate the camelCase page options into the snake_case keys
 * `extract_pages`, `insert_page_markers` and `marker_format`, skipping
 * undefined values.
 *
 * NOTE(review): every other normalizer in this file emits camelCase keys;
 * confirm the binding really expects snake_case here.
 *
 * @param {object|null|undefined} pages - Page options.
 * @returns {object|undefined} The translated options, or `undefined` when `pages` is falsy.
 */
function normalizePageConfig(pages) {
  if (!pages) {
    return undefined;
  }
  const keyMap = [
    ["extract_pages", "extractPages"],
    ["insert_page_markers", "insertPageMarkers"],
    ["marker_format", "markerFormat"]
  ];
  const normalized = {};
  for (const [wireKey, sourceKey] of keyMap) {
    const value = pages[sourceKey];
    if (value !== undefined) {
      normalized[wireKey] = value;
    }
  }
  return normalized;
}
|
|
355
|
+
/**
 * Build the configuration object handed to the binding.
 *
 * Scalar options are copied when defined; each nested section is run
 * through its dedicated normalizer and attached when that normalizer
 * returns something other than `undefined`.
 *
 * @param {object|null|undefined} config - User-supplied extraction options.
 * @returns {object|null} The normalized configuration, or `null` when `config` is falsy.
 */
function normalizeExtractionConfig(config) {
  if (!config) {
    return null;
  }
  const normalized = {};
  const assign = (key, value) => {
    if (value !== undefined) {
      normalized[key] = value;
    }
  };
  for (const key of ["useCache", "enableQualityProcessing", "forceOcr", "maxConcurrentExtractions"]) {
    assign(key, config[key]);
  }
  // Section name -> normalizer, in the order the keys are written.
  const sections = [
    ["ocr", normalizeOcrConfig],
    ["chunking", normalizeChunkingConfig],
    ["images", normalizeImageExtractionConfig],
    ["pdfOptions", normalizePdfConfig],
    ["tokenReduction", normalizeTokenReductionConfig],
    ["languageDetection", normalizeLanguageDetectionConfig],
    ["postprocessor", normalizePostProcessorConfig],
    ["keywords", normalizeKeywordConfig],
    ["pages", normalizePageConfig],
    ["htmlOptions", normalizeHtmlOptions]
  ];
  for (const [key, normalize] of sections) {
    assign(key, normalize(config[key]));
  }
  return normalized;
}
|
|
386
|
+
/**
 * Synchronously extract content from a file via the binding.
 *
 * @param {string} filePath - Path of the file to extract.
 * @param {string|null} [mimeType=null] - MIME type forwarded to the binding.
 * @param {object|null} [config=null] - Extraction options.
 * @returns {object} The converted extraction result.
 */
function extractFileSync(filePath, mimeType = null, config = null) {
  const nativeConfig = normalizeExtractionConfig(config);
  return convertResult(getBinding().extractFileSync(filePath, mimeType, nativeConfig));
}
|
|
391
|
+
/**
 * Asynchronously extract content from a file via the binding.
 *
 * @param {string} filePath - Path of the file to extract.
 * @param {string|null} [mimeType=null] - MIME type forwarded to the binding.
 * @param {object|null} [config=null] - Extraction options.
 * @returns {Promise<object>} The converted extraction result.
 */
async function extractFile(filePath, mimeType = null, config = null) {
  const nativeConfig = normalizeExtractionConfig(config);
  return convertResult(await getBinding().extractFile(filePath, mimeType, nativeConfig));
}
|
|
396
|
+
/**
 * Synchronously extract content from in-memory bytes via the binding.
 *
 * @param {Uint8Array} data - Document bytes; copied into a Buffer before the call.
 * @param {string} mimeType - MIME type forwarded to the binding.
 * @param {object|null} [config=null] - Extraction options.
 * @returns {object} The converted extraction result.
 * @throws {TypeError} If `data` is not a Uint8Array.
 */
function extractBytesSync(data, mimeType, config = null) {
  const bytes = assertUint8Array(data, "data");
  const nativeConfig = normalizeExtractionConfig(config);
  return convertResult(getBinding().extractBytesSync(Buffer.from(bytes), mimeType, nativeConfig));
}
|
|
402
|
+
/**
 * Asynchronously extract content from in-memory bytes via the binding.
 *
 * When the environment variable KREUZBERG_DEBUG_GUTEN is "1", the first
 * eight input bytes are logged to the console.
 *
 * @param {Uint8Array} data - Document bytes; copied into a Buffer before the call.
 * @param {string} mimeType - MIME type forwarded to the binding.
 * @param {object|null} [config=null] - Extraction options.
 * @returns {Promise<object>} The converted extraction result.
 * @throws {TypeError} If `data` is not a Uint8Array.
 */
async function extractBytes(data, mimeType, config = null) {
  const bytes = assertUint8Array(data, "data");
  const debugEnabled = process.env["KREUZBERG_DEBUG_GUTEN"] === "1";
  if (debugEnabled) {
    console.log("[TypeScript] Debug input header:", Array.from(bytes.slice(0, 8)));
  }
  const nativeConfig = normalizeExtractionConfig(config);
  return convertResult(await getBinding().extractBytes(Buffer.from(bytes), mimeType, nativeConfig));
}
|
|
411
|
+
/**
 * Synchronously extract several files in one binding call.
 *
 * @param {string[]} paths - File paths forwarded to the binding.
 * @param {object|null} [config=null] - Extraction options shared by all files.
 * @returns {object[]} One converted result per raw result returned by the binding.
 */
function batchExtractFilesSync(paths, config = null) {
  const nativeConfig = normalizeExtractionConfig(config);
  return getBinding()
    .batchExtractFilesSync(paths, nativeConfig)
    .map((raw) => convertResult(raw));
}
|
|
416
|
+
/**
 * Asynchronously extract several files in one binding call.
 *
 * @param {string[]} paths - File paths forwarded to the binding.
 * @param {object|null} [config=null] - Extraction options shared by all files.
 * @returns {Promise<object[]>} One converted result per raw result returned by the binding.
 */
async function batchExtractFiles(paths, config = null) {
  const nativeConfig = normalizeExtractionConfig(config);
  const rawList = await getBinding().batchExtractFiles(paths, nativeConfig);
  return rawList.map((raw) => convertResult(raw));
}
|
|
421
|
+
/**
 * Synchronously extract several in-memory documents in one binding call.
 *
 * @param {Uint8Array[]} dataList - Document bytes; each entry is copied into a Buffer.
 * @param {string[]} mimeTypes - MIME types, one per entry of `dataList`.
 * @param {object|null} [config=null] - Extraction options shared by all documents.
 * @returns {object[]} One converted result per raw result returned by the binding.
 * @throws {TypeError} If `dataList` is invalid or the two lists differ in length.
 */
function batchExtractBytesSync(dataList, mimeTypes, config = null) {
  const checked = assertUint8ArrayList(dataList, "dataList");
  const buffers = checked.map((bytes) => Buffer.from(bytes));
  if (buffers.length !== mimeTypes.length) {
    throw new TypeError("dataList and mimeTypes must have the same length");
  }
  const nativeConfig = normalizeExtractionConfig(config);
  return getBinding()
    .batchExtractBytesSync(buffers, mimeTypes, nativeConfig)
    .map((raw) => convertResult(raw));
}
|
|
430
|
+
/**
 * Asynchronously extract several in-memory documents in one binding call.
 *
 * @param {Uint8Array[]} dataList - Document bytes; each entry is copied into a Buffer.
 * @param {string[]} mimeTypes - MIME types, one per entry of `dataList`.
 * @param {object|null} [config=null] - Extraction options shared by all documents.
 * @returns {Promise<object[]>} One converted result per raw result returned by the binding.
 * @throws {TypeError} If `dataList` is invalid or the two lists differ in length.
 */
async function batchExtractBytes(dataList, mimeTypes, config = null) {
  const checked = assertUint8ArrayList(dataList, "dataList");
  const buffers = checked.map((bytes) => Buffer.from(bytes));
  if (buffers.length !== mimeTypes.length) {
    throw new TypeError("dataList and mimeTypes must have the same length");
  }
  const nativeConfig = normalizeExtractionConfig(config);
  const rawList = await getBinding().batchExtractBytes(buffers, mimeTypes, nativeConfig);
  return rawList.map((raw) => convertResult(raw));
}
|
|
439
|
+
/**
 * Register a post-processor with the binding.
 *
 * The processor is wrapped so that its `process` callback reads a JSON
 * string from `args[0][0]`, maps the snake_case wire fields to the
 * camelCase result shape, calls `processor.process`, and returns the
 * updated result serialized back to snake_case JSON. The wrapper also
 * carries non-enumerable `__original` (the processor) and `__stage`
 * (`processor.processingStage()` or "middle") properties.
 *
 * @param {object} processor - Object exposing `name()`, `process(result)` and optionally `processingStage()`.
 */
function registerPostProcessor(processor) {
  const nativeBinding = getBinding();

  // Wire (snake_case) -> public (camelCase) result.
  const fromWire = (wire) => ({
    content: wire.content,
    mimeType: wire.mime_type,
    metadata: typeof wire.metadata === "string" ? JSON.parse(wire.metadata) : wire.metadata,
    tables: wire.tables || [],
    detectedLanguages: wire.detected_languages ?? null,
    chunks: wire.chunks ?? null,
    images: wire.images ?? null
  });

  // Public (camelCase) -> wire (snake_case) result.
  const toWire = (result) => ({
    content: result.content,
    mime_type: result.mimeType,
    metadata: result.metadata,
    tables: result.tables,
    detected_languages: result.detectedLanguages,
    chunks: result.chunks,
    images: result.images
  });

  const wrappedProcessor = {
    name: processor.name.bind(processor),
    processingStage: processor.processingStage?.bind(processor),
    async process(...args) {
      const jsonString = args[0][0];
      const updated = await processor.process(fromWire(JSON.parse(jsonString)));
      return JSON.stringify(toWire(updated));
    }
  };

  const attachHidden = (key, value) =>
    Object.defineProperty(wrappedProcessor, key, { value, enumerable: false });
  attachHidden("__original", processor);
  attachHidden("__stage", processor.processingStage?.() ?? "middle");

  nativeBinding.registerPostProcessor(wrappedProcessor);
}
|
|
481
|
+
/**
 * Ask the binding to unregister the post-processor called `name`.
 *
 * @param {string} name - Post-processor name.
 */
function unregisterPostProcessor(name) {
  getBinding().unregisterPostProcessor(name);
}
|
|
485
|
+
/** Ask the binding to clear its post-processors. */
function clearPostProcessors() {
  getBinding().clearPostProcessors();
}
|
|
489
|
+
/**
 * Return whatever the binding's `listPostProcessors()` reports.
 *
 * @returns {unknown} The binding's return value, unmodified.
 */
function listPostProcessors() {
  return getBinding().listPostProcessors();
}
|
|
493
|
+
/**
 * Register a validator with the binding.
 *
 * The validator is wrapped so that its `validate` callback receives the
 * JSON string passed as the first argument, maps the snake_case wire
 * fields to the camelCase result shape, and awaits `validator.validate`.
 * Absent `detected_languages`, `chunks` and `images` are normalized to
 * `null`, matching what registerPostProcessor hands to post-processors.
 *
 * @param {object} validator - Object exposing `name()`, `validate(result)` and optionally `priority()`.
 */
function registerValidator(validator) {
  const binding2 = getBinding();
  const wrappedValidator = {
    name: validator.name.bind(validator),
    priority: validator.priority?.bind(validator),
    async validate(...args) {
      const jsonString = args[0];
      if (!jsonString || jsonString === "undefined") {
        throw new Error("Validator received invalid JSON string");
      }
      const wireResult = JSON.parse(jsonString);
      const result = {
        content: wireResult.content,
        mimeType: wireResult.mime_type,
        metadata: typeof wireResult.metadata === "string" ? JSON.parse(wireResult.metadata) : wireResult.metadata,
        tables: wireResult.tables || [],
        detectedLanguages: wireResult.detected_languages ?? null,
        chunks: wireResult.chunks ?? null,
        images: wireResult.images ?? null
      };
      // Promise.resolve lets validators be either sync or async.
      await Promise.resolve(validator.validate(result));
      // The wrapper always resolves to an empty string on success.
      return "";
    }
  };
  binding2.registerValidator(wrappedValidator);
}
|
|
519
|
+
/**
 * Ask the binding to unregister the validator called `name`.
 *
 * @param {string} name - Validator name.
 */
function unregisterValidator(name) {
  getBinding().unregisterValidator(name);
}
|
|
523
|
+
/** Ask the binding to clear its validators. */
function clearValidators() {
  getBinding().clearValidators();
}
|
|
527
|
+
/**
 * Return whatever the binding's `listValidators()` reports.
 *
 * @returns {unknown} The binding's return value, unmodified.
 */
function listValidators() {
  return getBinding().listValidators();
}
|
|
531
|
+
/**
 * Check whether `value` is a two-element array `[payload, language]` where
 * `language` is a string and `payload` is a string, Buffer or Uint8Array.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} `true` when `value` has that shape.
 */
function isOcrProcessTuple(value) {
  if (!Array.isArray(value) || value.length !== 2) {
    return false;
  }
  const [payload, language] = value;
  if (typeof language !== "string") {
    return false;
  }
  return typeof payload === "string" || Buffer.isBuffer(payload) || payload instanceof Uint8Array;
}
|
|
534
|
+
/**
 * Check whether `value` is a one-element array wrapping an OCR process tuple.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} `true` when `value` is `[[payload, language]]`.
 */
function isNestedOcrProcessTuple(value) {
  if (!Array.isArray(value) || value.length !== 1) {
    return false;
  }
  return isOcrProcessTuple(value[0]);
}
|
|
537
|
+
/**
 * Summarize a payload for debug logging.
 *
 * @param {unknown} value - Payload to describe.
 * @returns {{ctor: string, length: number|undefined}} Constructor name and
 *   `length` of the payload. Strings report "String"; null and undefined
 *   report "null"/"undefined" with length 0 instead of throwing, so that
 *   debug logging never masks the real error.
 */
function describePayload(value) {
  if (typeof value === "string") {
    return { ctor: "String", length: value.length };
  }
  if (value == null) {
    return { ctor: String(value), length: 0 };
  }
  return { ctor: value.constructor?.name ?? "Buffer", length: value.length };
}
|
|
543
|
+
/**
 * Register an OCR backend with the binding.
 *
 * The backend is wrapped so that `processImage` accepts the image either as
 * a separate `(payload, language)` pair, as a `[payload, language]` tuple,
 * or as that tuple nested in a one-element array. String payloads are
 * decoded as base64. The backend receives a Uint8Array plus the language,
 * and its result is returned as a JSON string. Extra diagnostics are
 * logged when KREUZBERG_DEBUG_GUTEN is "1".
 *
 * @param {object} backend - Object exposing `name()`, `supportedLanguages()` and `processImage(bytes, language)`.
 */
function registerOcrBackend(backend) {
  const nativeBinding = getBinding();
  const wrappedBackend = {
    name: backend.name.bind(backend),
    supportedLanguages: backend.supportedLanguages.bind(backend),
    async processImage(...processArgs) {
      const imagePayload = processArgs[0];
      const maybeLanguage = processArgs[1];
      const payloadIsArray = Array.isArray(imagePayload);
      if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
        console.log("[registerOcrBackend] JS arguments", { length: processArgs.length });
        console.log("[registerOcrBackend] Raw args", {
          imagePayloadType: payloadIsArray ? "tuple" : typeof imagePayload,
          maybeLanguageType: typeof maybeLanguage,
          metadata: payloadIsArray ? { tupleLength: imagePayload.length } : describePayload(imagePayload)
        });
      }

      // Unwrap whichever calling convention was used.
      let rawBytes = imagePayload;
      let language = maybeLanguage;
      if (isNestedOcrProcessTuple(imagePayload)) {
        rawBytes = imagePayload[0][0];
        language = imagePayload[0][1];
      } else if (isOcrProcessTuple(imagePayload)) {
        rawBytes = imagePayload[0];
        language = imagePayload[1];
      }
      if (typeof language !== "string") {
        throw new Error("OCR backend did not receive a language parameter");
      }

      if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
        const length = rawBytes.length;
        console.log(
          "[registerOcrBackend] Received payload",
          Array.isArray(imagePayload) ? "tuple" : typeof rawBytes,
          "ctor",
          describePayload(rawBytes).ctor,
          "length",
          length
        );
      }

      let buffer;
      if (typeof rawBytes === "string") {
        buffer = Buffer.from(rawBytes, "base64");
      } else {
        buffer = Buffer.from(rawBytes);
      }
      const result = await backend.processImage(new Uint8Array(buffer), language);
      return JSON.stringify(result);
    }
  };
  nativeBinding.registerOcrBackend(wrappedBackend);
}
|
|
588
|
+
/**
 * Return whatever the binding's `listOcrBackends()` reports.
 *
 * @returns {unknown} The binding's return value, unmodified.
 */
function listOcrBackends() {
  return getBinding().listOcrBackends();
}
|
|
592
|
+
/**
 * Ask the binding to unregister the OCR backend called `name`.
 *
 * @param {string} name - OCR backend name.
 */
function unregisterOcrBackend(name) {
  getBinding().unregisterOcrBackend(name);
}
|
|
596
|
+
/** Ask the binding to clear its OCR backends. */
function clearOcrBackends() {
  getBinding().clearOcrBackends();
}
|
|
600
|
+
/**
 * Return whatever the binding's `listDocumentExtractors()` reports.
 *
 * @returns {unknown} The binding's return value, unmodified.
 */
function listDocumentExtractors() {
  return getBinding().listDocumentExtractors();
}
|
|
604
|
+
/**
 * Ask the binding to unregister the document extractor called `name`.
 *
 * @param {string} name - Document extractor name.
 */
function unregisterDocumentExtractor(name) {
  getBinding().unregisterDocumentExtractor(name);
}
|
|
608
|
+
/** Ask the binding to clear its document extractors. */
function clearDocumentExtractors() {
  getBinding().clearDocumentExtractors();
}
|
|
612
|
+
const ExtractionConfig = {
  /**
   * Read an extraction configuration from a file on disk.
   *
   * The format is chosen from the file extension:
   * - `.toml` - TOML format
   * - `.yaml` - YAML format
   * - `.json` - JSON format
   *
   * @param filePath - Absolute or relative path to the configuration file
   * @returns The ExtractionConfig object loaded from that file
   *
   * @throws {Error} If the file does not exist or is not accessible
   * @throws {Error} If the content is not valid TOML/YAML/JSON
   * @throws {Error} If the configuration structure is invalid
   * @throws {Error} If the file extension is not supported
   *
   * @example
   * ```typescript
   * import { ExtractionConfig } from '@kreuzberg/node';
   *
   * const fromToml = ExtractionConfig.fromFile('kreuzberg.toml');
   * const fromYaml = ExtractionConfig.fromFile('./config.yaml');
   * const fromJson = ExtractionConfig.fromFile('./config.json');
   * ```
   */
  fromFile(filePath) {
    return getBinding().loadExtractionConfigFromFile(filePath);
  },
  /**
   * Locate and load a configuration file from the current or a parent directory.
   *
   * Looks for `kreuzberg.toml` starting at the current working directory and
   * walking up the directory tree; the first file found is used.
   *
   * @returns The ExtractionConfig object if one was found, otherwise null
   *
   * @example
   * ```typescript
   * import { ExtractionConfig } from '@kreuzberg/node';
   *
   * const config = ExtractionConfig.discover();
   * if (config) {
   *   console.log('Found configuration');
   * } else {
   *   console.log('No configuration file found, using defaults');
   * }
   * ```
   */
  discover() {
    return getBinding().discoverExtractionConfig();
  }
};
|
|
674
|
+
/**
 * Forward `bytes` to the binding's `detectMimeType`.
 *
 * @param {unknown} bytes - Content to inspect, passed through unchanged.
 * @returns {unknown} The binding's return value, unmodified.
 */
function detectMimeType(bytes) {
  return getBinding().detectMimeType(bytes);
}
|
|
678
|
+
/**
 * Forward `path` and `checkExists` to the binding's `detectMimeTypeFromPath`.
 *
 * @param {string} path - File path, passed through unchanged.
 * @param {boolean} [checkExists] - Flag passed through unchanged.
 * @returns {unknown} The binding's return value, unmodified.
 */
function detectMimeTypeFromPath(path, checkExists) {
  return getBinding().detectMimeTypeFromPath(path, checkExists);
}
|
|
682
|
+
/**
 * Forward `mimeType` to the binding's `validateMimeType`.
 *
 * @param {string} mimeType - MIME type, passed through unchanged.
 * @returns {unknown} The binding's return value, unmodified.
 */
function validateMimeType(mimeType) {
  return getBinding().validateMimeType(mimeType);
}
|
|
686
|
+
/**
 * Forward `mimeType` to the binding's `getExtensionsForMime`.
 *
 * @param {string} mimeType - MIME type, passed through unchanged.
 * @returns {unknown} The binding's return value, unmodified.
 */
function getExtensionsForMime(mimeType) {
  return getBinding().getExtensionsForMime(mimeType);
}
|
|
690
|
+
/**
 * Return whatever the binding's `listEmbeddingPresets()` reports.
 *
 * @returns {unknown} The binding's return value, unmodified.
 */
function listEmbeddingPresets() {
  return getBinding().listEmbeddingPresets();
}
|
|
694
|
+
/**
 * Forward `name` to the binding's `getEmbeddingPreset`.
 *
 * @param {string} name - Preset name, passed through unchanged.
 * @returns {unknown} The binding's return value, unmodified.
 */
function getEmbeddingPreset(name) {
  return getBinding().getEmbeddingPreset(name);
}
|
|
698
|
+
/**
 * Return whatever the binding's `getLastErrorCode()` reports.
 *
 * @returns {unknown} The binding's return value, unmodified.
 */
function getLastErrorCode() {
  return getBinding().getLastErrorCode();
}
|
|
702
|
+
/**
 * Return whatever the binding's `getLastPanicContext()` reports.
 *
 * @returns {unknown} The binding's return value, unmodified.
 */
function getLastPanicContext() {
  return getBinding().getLastPanicContext();
}
|
|
706
|
+
const __version__ = "4.0.0-rc.8";
|
|
707
|
+
export {
|
|
708
|
+
CacheError,
|
|
709
|
+
ErrorCode,
|
|
710
|
+
ExtractionConfig,
|
|
711
|
+
GutenOcrBackend,
|
|
712
|
+
ImageProcessingError,
|
|
713
|
+
KreuzbergError,
|
|
714
|
+
MissingDependencyError,
|
|
715
|
+
OcrError,
|
|
716
|
+
ParsingError,
|
|
717
|
+
PluginError,
|
|
718
|
+
ValidationError,
|
|
719
|
+
__resetBindingForTests,
|
|
720
|
+
__setBindingForTests,
|
|
721
|
+
__version__,
|
|
722
|
+
batchExtractBytes,
|
|
723
|
+
batchExtractBytesSync,
|
|
724
|
+
batchExtractFiles,
|
|
725
|
+
batchExtractFilesSync,
|
|
726
|
+
clearDocumentExtractors,
|
|
727
|
+
clearOcrBackends,
|
|
728
|
+
clearPostProcessors,
|
|
729
|
+
clearValidators,
|
|
730
|
+
detectMimeType,
|
|
731
|
+
detectMimeTypeFromPath,
|
|
732
|
+
extractBytes,
|
|
733
|
+
extractBytesSync,
|
|
734
|
+
extractFile,
|
|
735
|
+
extractFileSync,
|
|
736
|
+
getEmbeddingPreset,
|
|
737
|
+
getExtensionsForMime,
|
|
738
|
+
getLastErrorCode,
|
|
739
|
+
getLastPanicContext,
|
|
740
|
+
listDocumentExtractors,
|
|
741
|
+
listEmbeddingPresets,
|
|
742
|
+
listOcrBackends,
|
|
743
|
+
listPostProcessors,
|
|
744
|
+
listValidators,
|
|
745
|
+
registerOcrBackend,
|
|
746
|
+
registerPostProcessor,
|
|
747
|
+
registerValidator,
|
|
748
|
+
unregisterDocumentExtractor,
|
|
749
|
+
unregisterOcrBackend,
|
|
750
|
+
unregisterPostProcessor,
|
|
751
|
+
unregisterValidator,
|
|
752
|
+
validateMimeType
|
|
753
|
+
};
|
|
754
|
+
//# sourceMappingURL=index.mjs.map
|