@kreuzberg/node 4.0.0-rc.8 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +342 -530
- package/dist/cli.d.mts +4 -0
- package/dist/cli.d.ts +4 -0
- package/dist/cli.js +12 -2
- package/dist/cli.js.map +1 -1
- package/dist/cli.mjs +12 -1
- package/dist/cli.mjs.map +1 -1
- package/dist/index.d.mts +337 -62
- package/dist/index.d.ts +337 -62
- package/dist/index.js +285 -56
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +277 -56
- package/dist/index.mjs.map +1 -1
- package/dist/types.d.mts +469 -54
- package/dist/types.d.ts +469 -54
- package/dist/types.js.map +1 -1
- package/index.d.ts +662 -1
- package/index.js +85 -55
- package/metadata.d.ts +53 -33
- package/package.json +17 -19
package/dist/index.js
CHANGED
|
@@ -36,21 +36,29 @@ __export(index_exports, {
|
|
|
36
36
|
batchExtractBytes: () => batchExtractBytes,
|
|
37
37
|
batchExtractBytesSync: () => batchExtractBytesSync,
|
|
38
38
|
batchExtractFiles: () => batchExtractFiles,
|
|
39
|
+
batchExtractFilesInWorker: () => batchExtractFilesInWorker,
|
|
39
40
|
batchExtractFilesSync: () => batchExtractFilesSync,
|
|
41
|
+
classifyError: () => classifyError,
|
|
40
42
|
clearDocumentExtractors: () => clearDocumentExtractors,
|
|
41
43
|
clearOcrBackends: () => clearOcrBackends,
|
|
42
44
|
clearPostProcessors: () => clearPostProcessors,
|
|
43
45
|
clearValidators: () => clearValidators,
|
|
46
|
+
closeWorkerPool: () => closeWorkerPool,
|
|
47
|
+
createWorkerPool: () => createWorkerPool,
|
|
44
48
|
detectMimeType: () => detectMimeType,
|
|
45
49
|
detectMimeTypeFromPath: () => detectMimeTypeFromPath,
|
|
46
50
|
extractBytes: () => extractBytes,
|
|
47
51
|
extractBytesSync: () => extractBytesSync,
|
|
48
52
|
extractFile: () => extractFile,
|
|
53
|
+
extractFileInWorker: () => extractFileInWorker,
|
|
49
54
|
extractFileSync: () => extractFileSync,
|
|
50
55
|
getEmbeddingPreset: () => getEmbeddingPreset,
|
|
56
|
+
getErrorCodeDescription: () => getErrorCodeDescription,
|
|
57
|
+
getErrorCodeName: () => getErrorCodeName,
|
|
51
58
|
getExtensionsForMime: () => getExtensionsForMime,
|
|
52
59
|
getLastErrorCode: () => getLastErrorCode,
|
|
53
60
|
getLastPanicContext: () => getLastPanicContext,
|
|
61
|
+
getWorkerPoolStats: () => getWorkerPoolStats,
|
|
54
62
|
listDocumentExtractors: () => listDocumentExtractors,
|
|
55
63
|
listEmbeddingPresets: () => listEmbeddingPresets,
|
|
56
64
|
listOcrBackends: () => listOcrBackends,
|
|
@@ -66,6 +74,7 @@ __export(index_exports, {
|
|
|
66
74
|
validateMimeType: () => validateMimeType
|
|
67
75
|
});
|
|
68
76
|
module.exports = __toCommonJS(index_exports);
|
|
77
|
+
var import_node_fs = require("node:fs");
|
|
69
78
|
var import_node_module = require("node:module");
|
|
70
79
|
var import_errors = require("./errors.js");
|
|
71
80
|
var import_guten_ocr = require("./ocr/guten-ocr.js");
|
|
@@ -129,17 +138,50 @@ function __resetBindingForTests() {
|
|
|
129
138
|
bindingInitialized = false;
|
|
130
139
|
}
|
|
131
140
|
function loadNativeBinding() {
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
require
|
|
135
|
-
|
|
141
|
+
let localRequire;
|
|
142
|
+
if (typeof require !== "undefined") {
|
|
143
|
+
localRequire = require;
|
|
144
|
+
} else {
|
|
145
|
+
try {
|
|
146
|
+
localRequire = (0, import_node_module.createRequire)(import_meta.url);
|
|
147
|
+
} catch {
|
|
148
|
+
localRequire = void 0;
|
|
149
|
+
}
|
|
150
|
+
}
|
|
136
151
|
if (!localRequire) {
|
|
137
152
|
throw new Error("Unable to resolve native binding loader (require not available).");
|
|
138
153
|
}
|
|
139
|
-
|
|
154
|
+
const loadedModule = localRequire("../index.js");
|
|
155
|
+
if (typeof loadedModule !== "object" || loadedModule === null) {
|
|
156
|
+
throw new Error(
|
|
157
|
+
"Native binding is not a valid object. Ensure the native module is properly built and compatible."
|
|
158
|
+
);
|
|
159
|
+
}
|
|
160
|
+
const module2 = loadedModule;
|
|
161
|
+
const requiredMethods = [
|
|
162
|
+
"extractFileSync",
|
|
163
|
+
"extractFile",
|
|
164
|
+
"extractBytesSync",
|
|
165
|
+
"extractBytes",
|
|
166
|
+
"batchExtractFilesSync",
|
|
167
|
+
"batchExtractFiles",
|
|
168
|
+
"batchExtractBytesSync",
|
|
169
|
+
"batchExtractBytes"
|
|
170
|
+
];
|
|
171
|
+
for (const method of requiredMethods) {
|
|
172
|
+
if (typeof module2[method] !== "function") {
|
|
173
|
+
throw new Error(
|
|
174
|
+
`Native binding is missing required method: ${method}. Ensure the native module is properly built and compatible.`
|
|
175
|
+
);
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
return module2;
|
|
140
179
|
}
|
|
141
180
|
function getBinding() {
|
|
142
181
|
if (bindingInitialized) {
|
|
182
|
+
if (binding === null) {
|
|
183
|
+
throw new Error("Native binding was previously failed to load.");
|
|
184
|
+
}
|
|
143
185
|
return binding;
|
|
144
186
|
}
|
|
145
187
|
try {
|
|
@@ -149,6 +191,7 @@ function getBinding() {
|
|
|
149
191
|
return binding;
|
|
150
192
|
}
|
|
151
193
|
} catch (error) {
|
|
194
|
+
bindingInitialized = true;
|
|
152
195
|
throw createNativeBindingError(error);
|
|
153
196
|
}
|
|
154
197
|
throw new Error(
|
|
@@ -157,7 +200,11 @@ function getBinding() {
|
|
|
157
200
|
}
|
|
158
201
|
function parseMetadata(metadataStr) {
|
|
159
202
|
try {
|
|
160
|
-
|
|
203
|
+
const parsed = JSON.parse(metadataStr);
|
|
204
|
+
if (typeof parsed === "object" && parsed !== null) {
|
|
205
|
+
return parsed;
|
|
206
|
+
}
|
|
207
|
+
return {};
|
|
161
208
|
} catch {
|
|
162
209
|
return {};
|
|
163
210
|
}
|
|
@@ -175,7 +222,7 @@ function ensureUint8Array(value) {
|
|
|
175
222
|
return new Uint8Array();
|
|
176
223
|
}
|
|
177
224
|
function convertChunk(rawChunk) {
|
|
178
|
-
if (!rawChunk) {
|
|
225
|
+
if (!rawChunk || typeof rawChunk !== "object") {
|
|
179
226
|
return {
|
|
180
227
|
content: "",
|
|
181
228
|
metadata: {
|
|
@@ -188,23 +235,33 @@ function convertChunk(rawChunk) {
|
|
|
188
235
|
embedding: null
|
|
189
236
|
};
|
|
190
237
|
}
|
|
191
|
-
const
|
|
238
|
+
const chunk = rawChunk;
|
|
239
|
+
const metadata = chunk["metadata"] ?? {};
|
|
192
240
|
return {
|
|
193
|
-
|
|
194
|
-
|
|
241
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
242
|
+
content: chunk["content"] ?? "",
|
|
243
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
244
|
+
embedding: chunk["embedding"] ?? null,
|
|
195
245
|
metadata: {
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
246
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
247
|
+
byteStart: metadata["byte_start"] ?? metadata["charStart"] ?? 0,
|
|
248
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
249
|
+
byteEnd: metadata["byte_end"] ?? metadata["charEnd"] ?? 0,
|
|
250
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
251
|
+
tokenCount: metadata["token_count"] ?? metadata["tokenCount"] ?? null,
|
|
252
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
253
|
+
chunkIndex: metadata["chunk_index"] ?? metadata["chunkIndex"] ?? 0,
|
|
254
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
255
|
+
totalChunks: metadata["total_chunks"] ?? metadata["totalChunks"] ?? 0,
|
|
256
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
257
|
+
firstPage: metadata["first_page"] ?? metadata["firstPage"] ?? null,
|
|
258
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
259
|
+
lastPage: metadata["last_page"] ?? metadata["lastPage"] ?? null
|
|
203
260
|
}
|
|
204
261
|
};
|
|
205
262
|
}
|
|
206
263
|
function convertImage(rawImage) {
|
|
207
|
-
if (!rawImage) {
|
|
264
|
+
if (!rawImage || typeof rawImage !== "object") {
|
|
208
265
|
return {
|
|
209
266
|
data: new Uint8Array(),
|
|
210
267
|
format: "unknown",
|
|
@@ -219,31 +276,97 @@ function convertImage(rawImage) {
|
|
|
219
276
|
ocrResult: null
|
|
220
277
|
};
|
|
221
278
|
}
|
|
279
|
+
const image = rawImage;
|
|
222
280
|
return {
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
281
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
282
|
+
data: ensureUint8Array(image["data"]),
|
|
283
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
284
|
+
format: image["format"] ?? "unknown",
|
|
285
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
286
|
+
imageIndex: image["imageIndex"] ?? 0,
|
|
287
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
288
|
+
pageNumber: image["pageNumber"] ?? null,
|
|
289
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
290
|
+
width: image["width"] ?? null,
|
|
291
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
292
|
+
height: image["height"] ?? null,
|
|
293
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
294
|
+
colorspace: image["colorspace"] ?? null,
|
|
295
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
296
|
+
bitsPerComponent: image["bitsPerComponent"] ?? null,
|
|
297
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
298
|
+
isMask: image["isMask"] ?? false,
|
|
299
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
300
|
+
description: image["description"] ?? null,
|
|
301
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
302
|
+
ocrResult: image["ocrResult"] ? convertResult(image["ocrResult"]) : null
|
|
234
303
|
};
|
|
235
304
|
}
|
|
236
|
-
function
|
|
305
|
+
function convertPageContent(rawPage) {
|
|
306
|
+
if (!rawPage || typeof rawPage !== "object") {
|
|
307
|
+
return {
|
|
308
|
+
pageNumber: 0,
|
|
309
|
+
content: "",
|
|
310
|
+
tables: [],
|
|
311
|
+
images: []
|
|
312
|
+
};
|
|
313
|
+
}
|
|
314
|
+
const page = rawPage;
|
|
237
315
|
return {
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
316
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
317
|
+
pageNumber: page["pageNumber"] ?? 0,
|
|
318
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
319
|
+
content: page["content"] ?? "",
|
|
320
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
321
|
+
tables: Array.isArray(page["tables"]) ? page["tables"] : [],
|
|
322
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
323
|
+
images: Array.isArray(page["images"]) ? page["images"].map((image) => convertImage(image)) : []
|
|
245
324
|
};
|
|
246
325
|
}
|
|
326
|
+
function convertResult(rawResult) {
|
|
327
|
+
if (!rawResult || typeof rawResult !== "object") {
|
|
328
|
+
return {
|
|
329
|
+
content: "",
|
|
330
|
+
mimeType: "application/octet-stream",
|
|
331
|
+
metadata: {},
|
|
332
|
+
tables: [],
|
|
333
|
+
detectedLanguages: null,
|
|
334
|
+
chunks: null,
|
|
335
|
+
images: null,
|
|
336
|
+
pages: null
|
|
337
|
+
};
|
|
338
|
+
}
|
|
339
|
+
const result = rawResult;
|
|
340
|
+
const metadata = result["metadata"];
|
|
341
|
+
const metadataValue = typeof metadata === "string" ? parseMetadata(metadata) : metadata ?? {};
|
|
342
|
+
const returnObj = {
|
|
343
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
344
|
+
content: result["content"] ?? "",
|
|
345
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
346
|
+
mimeType: result["mimeType"] ?? "application/octet-stream",
|
|
347
|
+
metadata: metadataValue,
|
|
348
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
349
|
+
tables: Array.isArray(result["tables"]) ? result["tables"] : [],
|
|
350
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
351
|
+
detectedLanguages: Array.isArray(result["detectedLanguages"]) ? result["detectedLanguages"] : null,
|
|
352
|
+
chunks: null,
|
|
353
|
+
images: null,
|
|
354
|
+
pages: null
|
|
355
|
+
};
|
|
356
|
+
const chunksData = result["chunks"];
|
|
357
|
+
if (Array.isArray(chunksData)) {
|
|
358
|
+
returnObj.chunks = chunksData.map((chunk) => convertChunk(chunk));
|
|
359
|
+
}
|
|
360
|
+
const imagesData = result["images"];
|
|
361
|
+
if (Array.isArray(imagesData)) {
|
|
362
|
+
returnObj.images = imagesData.map((image) => convertImage(image));
|
|
363
|
+
}
|
|
364
|
+
const pagesData = result["pages"];
|
|
365
|
+
if (Array.isArray(pagesData)) {
|
|
366
|
+
returnObj.pages = pagesData.map((page) => convertPageContent(page));
|
|
367
|
+
}
|
|
368
|
+
return returnObj;
|
|
369
|
+
}
|
|
247
370
|
function setIfDefined(target, key, value) {
|
|
248
371
|
if (value !== void 0) {
|
|
249
372
|
target[key] = value;
|
|
@@ -406,9 +529,9 @@ function normalizePageConfig(pages) {
|
|
|
406
529
|
return void 0;
|
|
407
530
|
}
|
|
408
531
|
const normalized = {};
|
|
409
|
-
setIfDefined(normalized, "
|
|
410
|
-
setIfDefined(normalized, "
|
|
411
|
-
setIfDefined(normalized, "
|
|
532
|
+
setIfDefined(normalized, "extractPages", pages.extractPages);
|
|
533
|
+
setIfDefined(normalized, "insertPageMarkers", pages.insertPageMarkers);
|
|
534
|
+
setIfDefined(normalized, "markerFormat", pages.markerFormat);
|
|
412
535
|
return normalized;
|
|
413
536
|
}
|
|
414
537
|
function normalizeExtractionConfig(config) {
|
|
@@ -442,23 +565,59 @@ function normalizeExtractionConfig(config) {
|
|
|
442
565
|
setIfDefined(normalized, "htmlOptions", htmlOptions);
|
|
443
566
|
return normalized;
|
|
444
567
|
}
|
|
445
|
-
function extractFileSync(filePath,
|
|
568
|
+
function extractFileSync(filePath, mimeTypeOrConfig, maybeConfig) {
|
|
569
|
+
let mimeType = null;
|
|
570
|
+
let config = null;
|
|
571
|
+
if (typeof mimeTypeOrConfig === "string") {
|
|
572
|
+
mimeType = mimeTypeOrConfig;
|
|
573
|
+
config = maybeConfig ?? null;
|
|
574
|
+
} else if (mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object") {
|
|
575
|
+
config = mimeTypeOrConfig;
|
|
576
|
+
mimeType = null;
|
|
577
|
+
} else {
|
|
578
|
+
config = maybeConfig ?? null;
|
|
579
|
+
mimeType = null;
|
|
580
|
+
}
|
|
446
581
|
const normalizedConfig = normalizeExtractionConfig(config);
|
|
447
582
|
const rawResult = getBinding().extractFileSync(filePath, mimeType, normalizedConfig);
|
|
448
583
|
return convertResult(rawResult);
|
|
449
584
|
}
|
|
450
|
-
async function extractFile(filePath,
|
|
585
|
+
async function extractFile(filePath, mimeTypeOrConfig, maybeConfig) {
|
|
586
|
+
let mimeType = null;
|
|
587
|
+
let config = null;
|
|
588
|
+
if (typeof mimeTypeOrConfig === "string") {
|
|
589
|
+
mimeType = mimeTypeOrConfig;
|
|
590
|
+
config = maybeConfig ?? null;
|
|
591
|
+
} else if (mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object") {
|
|
592
|
+
config = mimeTypeOrConfig;
|
|
593
|
+
mimeType = null;
|
|
594
|
+
} else {
|
|
595
|
+
config = maybeConfig ?? null;
|
|
596
|
+
mimeType = null;
|
|
597
|
+
}
|
|
451
598
|
const normalizedConfig = normalizeExtractionConfig(config);
|
|
452
599
|
const rawResult = await getBinding().extractFile(filePath, mimeType, normalizedConfig);
|
|
453
600
|
return convertResult(rawResult);
|
|
454
601
|
}
|
|
455
|
-
function extractBytesSync(
|
|
602
|
+
function extractBytesSync(dataOrPath, mimeType, config = null) {
|
|
603
|
+
let data;
|
|
604
|
+
if (typeof dataOrPath === "string") {
|
|
605
|
+
data = (0, import_node_fs.readFileSync)(dataOrPath);
|
|
606
|
+
} else {
|
|
607
|
+
data = dataOrPath;
|
|
608
|
+
}
|
|
456
609
|
const validated = assertUint8Array(data, "data");
|
|
457
610
|
const normalizedConfig = normalizeExtractionConfig(config);
|
|
458
611
|
const rawResult = getBinding().extractBytesSync(Buffer.from(validated), mimeType, normalizedConfig);
|
|
459
612
|
return convertResult(rawResult);
|
|
460
613
|
}
|
|
461
|
-
async function extractBytes(
|
|
614
|
+
async function extractBytes(dataOrPath, mimeType, config = null) {
|
|
615
|
+
let data;
|
|
616
|
+
if (typeof dataOrPath === "string") {
|
|
617
|
+
data = (0, import_node_fs.readFileSync)(dataOrPath);
|
|
618
|
+
} else {
|
|
619
|
+
data = dataOrPath;
|
|
620
|
+
}
|
|
462
621
|
const validated = assertUint8Array(data, "data");
|
|
463
622
|
if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
|
|
464
623
|
console.log("[TypeScript] Debug input header:", Array.from(validated.slice(0, 8)));
|
|
@@ -498,8 +657,8 @@ async function batchExtractBytes(dataList, mimeTypes, config = null) {
|
|
|
498
657
|
function registerPostProcessor(processor) {
|
|
499
658
|
const binding2 = getBinding();
|
|
500
659
|
const wrappedProcessor = {
|
|
501
|
-
name: processor.name.
|
|
502
|
-
processingStage: processor.processingStage
|
|
660
|
+
name: typeof processor.name === "function" ? processor.name() : processor.name,
|
|
661
|
+
processingStage: typeof processor.processingStage === "function" ? processor.processingStage() : processor.processingStage,
|
|
503
662
|
async process(...args) {
|
|
504
663
|
const wrappedValue = args[0];
|
|
505
664
|
const jsonString = wrappedValue[0];
|
|
@@ -552,8 +711,8 @@ function listPostProcessors() {
|
|
|
552
711
|
function registerValidator(validator) {
|
|
553
712
|
const binding2 = getBinding();
|
|
554
713
|
const wrappedValidator = {
|
|
555
|
-
name: validator.name.
|
|
556
|
-
priority: validator.priority
|
|
714
|
+
name: typeof validator.name === "function" ? validator.name() : validator.name,
|
|
715
|
+
priority: typeof validator.priority === "function" ? validator.priority() : validator.priority,
|
|
557
716
|
async validate(...args) {
|
|
558
717
|
const jsonString = args[0];
|
|
559
718
|
if (!jsonString || jsonString === "undefined") {
|
|
@@ -602,8 +761,8 @@ function describePayload(value) {
|
|
|
602
761
|
function registerOcrBackend(backend) {
|
|
603
762
|
const binding2 = getBinding();
|
|
604
763
|
const wrappedBackend = {
|
|
605
|
-
name: backend.name.
|
|
606
|
-
supportedLanguages: backend.supportedLanguages.
|
|
764
|
+
name: typeof backend.name === "function" ? backend.name() : backend.name,
|
|
765
|
+
supportedLanguages: typeof backend.supportedLanguages === "function" ? backend.supportedLanguages() : backend.supportedLanguages ?? ["en"],
|
|
607
766
|
async processImage(...processArgs) {
|
|
608
767
|
const [imagePayload, maybeLanguage] = processArgs;
|
|
609
768
|
if (process.env["KREUZBERG_DEBUG_GUTEN"] === "1") {
|
|
@@ -732,11 +891,11 @@ const ExtractionConfig = {
|
|
|
732
891
|
};
|
|
733
892
|
function detectMimeType(bytes) {
|
|
734
893
|
const binding2 = getBinding();
|
|
735
|
-
return binding2.
|
|
894
|
+
return binding2.detectMimeTypeFromBytes(bytes);
|
|
736
895
|
}
|
|
737
|
-
function detectMimeTypeFromPath(
|
|
896
|
+
function detectMimeTypeFromPath(filePath, checkExists) {
|
|
738
897
|
const binding2 = getBinding();
|
|
739
|
-
return binding2.detectMimeTypeFromPath(
|
|
898
|
+
return binding2.detectMimeTypeFromPath(filePath, checkExists);
|
|
740
899
|
}
|
|
741
900
|
function validateMimeType(mimeType) {
|
|
742
901
|
const binding2 = getBinding();
|
|
@@ -752,7 +911,8 @@ function listEmbeddingPresets() {
|
|
|
752
911
|
}
|
|
753
912
|
function getEmbeddingPreset(name) {
|
|
754
913
|
const binding2 = getBinding();
|
|
755
|
-
|
|
914
|
+
const result = binding2.getEmbeddingPreset(name);
|
|
915
|
+
return result;
|
|
756
916
|
}
|
|
757
917
|
function getLastErrorCode() {
|
|
758
918
|
const binding2 = getBinding();
|
|
@@ -760,9 +920,70 @@ function getLastErrorCode() {
|
|
|
760
920
|
}
|
|
761
921
|
function getLastPanicContext() {
|
|
762
922
|
const binding2 = getBinding();
|
|
763
|
-
|
|
923
|
+
const result = binding2.getLastPanicContext();
|
|
924
|
+
return result;
|
|
925
|
+
}
|
|
926
|
+
function getErrorCodeName(code) {
|
|
927
|
+
const binding2 = getBinding();
|
|
928
|
+
return binding2.getErrorCodeName(code);
|
|
929
|
+
}
|
|
930
|
+
function getErrorCodeDescription(code) {
|
|
931
|
+
const binding2 = getBinding();
|
|
932
|
+
return binding2.getErrorCodeDescription(code);
|
|
933
|
+
}
|
|
934
|
+
function classifyError(errorMessage) {
|
|
935
|
+
const binding2 = getBinding();
|
|
936
|
+
const result = binding2.classifyError(errorMessage);
|
|
937
|
+
return result;
|
|
938
|
+
}
|
|
939
|
+
function createWorkerPool(size) {
|
|
940
|
+
const binding2 = getBinding();
|
|
941
|
+
const rawPool = binding2.createWorkerPool(size);
|
|
942
|
+
return rawPool;
|
|
943
|
+
}
|
|
944
|
+
function getWorkerPoolStats(pool) {
|
|
945
|
+
const binding2 = getBinding();
|
|
946
|
+
const rawStats = binding2.getWorkerPoolStats(pool);
|
|
947
|
+
return rawStats;
|
|
948
|
+
}
|
|
949
|
+
async function extractFileInWorker(pool, filePath, mimeTypeOrConfig, maybeConfig) {
|
|
950
|
+
let mimeType = null;
|
|
951
|
+
let config = null;
|
|
952
|
+
if (typeof mimeTypeOrConfig === "string") {
|
|
953
|
+
mimeType = mimeTypeOrConfig;
|
|
954
|
+
config = maybeConfig ?? null;
|
|
955
|
+
} else if (mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object") {
|
|
956
|
+
config = mimeTypeOrConfig;
|
|
957
|
+
mimeType = null;
|
|
958
|
+
} else {
|
|
959
|
+
config = maybeConfig ?? null;
|
|
960
|
+
mimeType = null;
|
|
961
|
+
}
|
|
962
|
+
const normalizedConfig = normalizeExtractionConfig(config);
|
|
963
|
+
const binding2 = getBinding();
|
|
964
|
+
const rawResult = await binding2.extractFileInWorker(
|
|
965
|
+
pool,
|
|
966
|
+
filePath,
|
|
967
|
+
mimeType,
|
|
968
|
+
normalizedConfig
|
|
969
|
+
);
|
|
970
|
+
return convertResult(rawResult);
|
|
971
|
+
}
|
|
972
|
+
async function batchExtractFilesInWorker(pool, paths, config = null) {
|
|
973
|
+
const normalizedConfig = normalizeExtractionConfig(config);
|
|
974
|
+
const binding2 = getBinding();
|
|
975
|
+
const rawResults = await binding2.batchExtractFilesInWorker(
|
|
976
|
+
pool,
|
|
977
|
+
paths,
|
|
978
|
+
normalizedConfig
|
|
979
|
+
);
|
|
980
|
+
return rawResults.map(convertResult);
|
|
981
|
+
}
|
|
982
|
+
async function closeWorkerPool(pool) {
|
|
983
|
+
const binding2 = getBinding();
|
|
984
|
+
await binding2.closeWorkerPool(pool);
|
|
764
985
|
}
|
|
765
|
-
const __version__ = "4.0.0
|
|
986
|
+
const __version__ = "4.0.0";
|
|
766
987
|
// Annotate the CommonJS export names for ESM import in node:
|
|
767
988
|
0 && (module.exports = {
|
|
768
989
|
CacheError,
|
|
@@ -782,21 +1003,29 @@ const __version__ = "4.0.0-rc.8";
|
|
|
782
1003
|
batchExtractBytes,
|
|
783
1004
|
batchExtractBytesSync,
|
|
784
1005
|
batchExtractFiles,
|
|
1006
|
+
batchExtractFilesInWorker,
|
|
785
1007
|
batchExtractFilesSync,
|
|
1008
|
+
classifyError,
|
|
786
1009
|
clearDocumentExtractors,
|
|
787
1010
|
clearOcrBackends,
|
|
788
1011
|
clearPostProcessors,
|
|
789
1012
|
clearValidators,
|
|
1013
|
+
closeWorkerPool,
|
|
1014
|
+
createWorkerPool,
|
|
790
1015
|
detectMimeType,
|
|
791
1016
|
detectMimeTypeFromPath,
|
|
792
1017
|
extractBytes,
|
|
793
1018
|
extractBytesSync,
|
|
794
1019
|
extractFile,
|
|
1020
|
+
extractFileInWorker,
|
|
795
1021
|
extractFileSync,
|
|
796
1022
|
getEmbeddingPreset,
|
|
1023
|
+
getErrorCodeDescription,
|
|
1024
|
+
getErrorCodeName,
|
|
797
1025
|
getExtensionsForMime,
|
|
798
1026
|
getLastErrorCode,
|
|
799
1027
|
getLastPanicContext,
|
|
1028
|
+
getWorkerPoolStats,
|
|
800
1029
|
listDocumentExtractors,
|
|
801
1030
|
listEmbeddingPresets,
|
|
802
1031
|
listOcrBackends,
|