@kreuzberg/node 4.0.8 → 4.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/cli.js +6 -4
- package/dist/cli.js.map +1 -1
- package/dist/cli.mjs +13 -5
- package/dist/cli.mjs.map +1 -1
- package/dist/errors.js +26 -24
- package/dist/errors.js.map +1 -1
- package/dist/errors.mjs +25 -24
- package/dist/errors.mjs.map +1 -1
- package/dist/index.d.mts +608 -535
- package/dist/index.d.ts +608 -535
- package/dist/index.js +682 -338
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +662 -334
- package/dist/index.mjs.map +1 -1
- package/dist/ocr/guten-ocr.js +4 -2
- package/dist/ocr/guten-ocr.js.map +1 -1
- package/dist/ocr/guten-ocr.mjs +3 -2
- package/dist/ocr/guten-ocr.mjs.map +1 -1
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -1
- package/index.d.ts +77 -178
- package/index.js +54 -52
- package/package.json +7 -7
package/dist/index.js
CHANGED
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
+
var __create = Object.create;
|
|
2
3
|
var __defProp = Object.defineProperty;
|
|
3
4
|
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
5
|
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
6
|
+
var __getProtoOf = Object.getPrototypeOf;
|
|
5
7
|
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
6
8
|
var __export = (target, all) => {
|
|
7
9
|
for (var name in all)
|
|
@@ -15,21 +17,30 @@ var __copyProps = (to, from, except, desc) => {
|
|
|
15
17
|
}
|
|
16
18
|
return to;
|
|
17
19
|
};
|
|
18
|
-
var
|
|
20
|
+
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
|
|
21
|
+
// If the importer is in node compatibility mode or this is not an ESM
|
|
22
|
+
// file that has been converted to a CommonJS file using a Babel-
|
|
23
|
+
// compatible transform (i.e. "__esModule" has not been set), then set
|
|
24
|
+
// "default" to the CommonJS "module.exports" for node compatibility.
|
|
25
|
+
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
|
|
26
|
+
mod
|
|
27
|
+
));
|
|
19
28
|
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
29
|
+
|
|
30
|
+
// typescript/index.ts
|
|
20
31
|
var index_exports = {};
|
|
21
32
|
__export(index_exports, {
|
|
22
|
-
CacheError: () =>
|
|
23
|
-
ErrorCode: () =>
|
|
33
|
+
CacheError: () => CacheError,
|
|
34
|
+
ErrorCode: () => ErrorCode,
|
|
24
35
|
ExtractionConfig: () => ExtractionConfig,
|
|
25
|
-
GutenOcrBackend: () =>
|
|
26
|
-
ImageProcessingError: () =>
|
|
27
|
-
KreuzbergError: () =>
|
|
28
|
-
MissingDependencyError: () =>
|
|
29
|
-
OcrError: () =>
|
|
30
|
-
ParsingError: () =>
|
|
31
|
-
PluginError: () =>
|
|
32
|
-
ValidationError: () =>
|
|
36
|
+
GutenOcrBackend: () => GutenOcrBackend,
|
|
37
|
+
ImageProcessingError: () => ImageProcessingError,
|
|
38
|
+
KreuzbergError: () => KreuzbergError,
|
|
39
|
+
MissingDependencyError: () => MissingDependencyError,
|
|
40
|
+
OcrError: () => OcrError,
|
|
41
|
+
ParsingError: () => ParsingError,
|
|
42
|
+
PluginError: () => PluginError,
|
|
43
|
+
ValidationError: () => ValidationError,
|
|
33
44
|
__resetBindingForTests: () => __resetBindingForTests,
|
|
34
45
|
__setBindingForTests: () => __setBindingForTests,
|
|
35
46
|
__version__: () => __version__,
|
|
@@ -64,6 +75,8 @@ __export(index_exports, {
|
|
|
64
75
|
listOcrBackends: () => listOcrBackends,
|
|
65
76
|
listPostProcessors: () => listPostProcessors,
|
|
66
77
|
listValidators: () => listValidators,
|
|
78
|
+
loadConfigFile: () => loadConfigFile,
|
|
79
|
+
loadConfigFromPath: () => loadConfigFromPath,
|
|
67
80
|
registerOcrBackend: () => registerOcrBackend,
|
|
68
81
|
registerPostProcessor: () => registerPostProcessor,
|
|
69
82
|
registerValidator: () => registerValidator,
|
|
@@ -74,14 +87,12 @@ __export(index_exports, {
|
|
|
74
87
|
validateMimeType: () => validateMimeType
|
|
75
88
|
});
|
|
76
89
|
module.exports = __toCommonJS(index_exports);
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
var
|
|
80
|
-
var
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
let binding = null;
|
|
84
|
-
let bindingInitialized = false;
|
|
90
|
+
|
|
91
|
+
// typescript/core/binding.ts
|
|
92
|
+
var import_node_module = require("module");
|
|
93
|
+
var import_meta = {};
|
|
94
|
+
var binding = null;
|
|
95
|
+
var bindingInitialized = false;
|
|
85
96
|
function createNativeBindingError(error) {
|
|
86
97
|
const hintParts = [];
|
|
87
98
|
let detail = "Unknown error while requiring native module.";
|
|
@@ -110,42 +121,13 @@ function createNativeBindingError(error) {
|
|
|
110
121
|
].join(" ")
|
|
111
122
|
);
|
|
112
123
|
}
|
|
113
|
-
function assertUint8Array(value, name) {
|
|
114
|
-
if (!(value instanceof Uint8Array)) {
|
|
115
|
-
throw new TypeError(`${name} must be a Uint8Array`);
|
|
116
|
-
}
|
|
117
|
-
return value;
|
|
118
|
-
}
|
|
119
|
-
function assertUint8ArrayList(values, name) {
|
|
120
|
-
if (!Array.isArray(values)) {
|
|
121
|
-
throw new TypeError(`${name} must be an array of Uint8Array`);
|
|
122
|
-
}
|
|
123
|
-
const array = values;
|
|
124
|
-
return array.map((value, index) => {
|
|
125
|
-
try {
|
|
126
|
-
return assertUint8Array(value, `${name}[${index}]`);
|
|
127
|
-
} catch {
|
|
128
|
-
throw new TypeError(`${name}[${index}] must be a Uint8Array`);
|
|
129
|
-
}
|
|
130
|
-
});
|
|
131
|
-
}
|
|
132
|
-
function __setBindingForTests(mock) {
|
|
133
|
-
binding = mock;
|
|
134
|
-
bindingInitialized = true;
|
|
135
|
-
}
|
|
136
|
-
function __resetBindingForTests() {
|
|
137
|
-
binding = null;
|
|
138
|
-
bindingInitialized = false;
|
|
139
|
-
}
|
|
140
124
|
function loadNativeBinding() {
|
|
141
125
|
let localRequire;
|
|
142
|
-
|
|
143
|
-
localRequire =
|
|
144
|
-
}
|
|
145
|
-
|
|
146
|
-
localRequire =
|
|
147
|
-
} catch {
|
|
148
|
-
localRequire = void 0;
|
|
126
|
+
try {
|
|
127
|
+
localRequire = (0, import_node_module.createRequire)(import_meta.url);
|
|
128
|
+
} catch {
|
|
129
|
+
if (typeof require !== "undefined") {
|
|
130
|
+
localRequire = require;
|
|
149
131
|
}
|
|
150
132
|
}
|
|
151
133
|
if (!localRequire) {
|
|
@@ -198,175 +180,158 @@ function getBinding() {
|
|
|
198
180
|
"Failed to load Kreuzberg bindings. Neither NAPI (Node.js) nor WASM (browsers/Deno) bindings are available. Make sure you have installed the @kreuzberg/node package for Node.js/Bun."
|
|
199
181
|
);
|
|
200
182
|
}
|
|
201
|
-
function
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
if (typeof parsed === "object" && parsed !== null) {
|
|
205
|
-
return parsed;
|
|
206
|
-
}
|
|
207
|
-
return {};
|
|
208
|
-
} catch {
|
|
209
|
-
return {};
|
|
210
|
-
}
|
|
183
|
+
function __setBindingForTests(mock) {
|
|
184
|
+
binding = mock;
|
|
185
|
+
bindingInitialized = true;
|
|
211
186
|
}
|
|
212
|
-
function
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
}
|
|
216
|
-
if (typeof Buffer !== "undefined" && value instanceof Buffer) {
|
|
217
|
-
return new Uint8Array(value);
|
|
218
|
-
}
|
|
219
|
-
if (Array.isArray(value)) {
|
|
220
|
-
return new Uint8Array(value);
|
|
221
|
-
}
|
|
222
|
-
return new Uint8Array();
|
|
187
|
+
function __resetBindingForTests() {
|
|
188
|
+
binding = null;
|
|
189
|
+
bindingInitialized = false;
|
|
223
190
|
}
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
byteStart: 0,
|
|
230
|
-
byteEnd: 0,
|
|
231
|
-
tokenCount: null,
|
|
232
|
-
chunkIndex: 0,
|
|
233
|
-
totalChunks: 0
|
|
234
|
-
},
|
|
235
|
-
embedding: null
|
|
236
|
-
};
|
|
237
|
-
}
|
|
238
|
-
const chunk = rawChunk;
|
|
239
|
-
const metadata = chunk["metadata"] ?? {};
|
|
240
|
-
return {
|
|
241
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
242
|
-
content: chunk["content"] ?? "",
|
|
243
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
244
|
-
embedding: chunk["embedding"] ?? null,
|
|
245
|
-
metadata: {
|
|
246
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
247
|
-
byteStart: metadata["byte_start"] ?? metadata["charStart"] ?? 0,
|
|
248
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
249
|
-
byteEnd: metadata["byte_end"] ?? metadata["charEnd"] ?? 0,
|
|
250
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
251
|
-
tokenCount: metadata["token_count"] ?? metadata["tokenCount"] ?? null,
|
|
252
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
253
|
-
chunkIndex: metadata["chunk_index"] ?? metadata["chunkIndex"] ?? 0,
|
|
254
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
255
|
-
totalChunks: metadata["total_chunks"] ?? metadata["totalChunks"] ?? 0,
|
|
256
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
257
|
-
firstPage: metadata["first_page"] ?? metadata["firstPage"] ?? null,
|
|
258
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
259
|
-
lastPage: metadata["last_page"] ?? metadata["lastPage"] ?? null
|
|
260
|
-
}
|
|
261
|
-
};
|
|
191
|
+
|
|
192
|
+
// typescript/errors/diagnostics.ts
|
|
193
|
+
function getLastErrorCode() {
|
|
194
|
+
const binding2 = getBinding();
|
|
195
|
+
return binding2.getLastErrorCode();
|
|
262
196
|
}
|
|
263
|
-
function
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
format: "unknown",
|
|
268
|
-
imageIndex: 0,
|
|
269
|
-
pageNumber: null,
|
|
270
|
-
width: null,
|
|
271
|
-
height: null,
|
|
272
|
-
colorspace: null,
|
|
273
|
-
bitsPerComponent: null,
|
|
274
|
-
isMask: false,
|
|
275
|
-
description: null,
|
|
276
|
-
ocrResult: null
|
|
277
|
-
};
|
|
278
|
-
}
|
|
279
|
-
const image = rawImage;
|
|
280
|
-
return {
|
|
281
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
282
|
-
data: ensureUint8Array(image["data"]),
|
|
283
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
284
|
-
format: image["format"] ?? "unknown",
|
|
285
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
286
|
-
imageIndex: image["imageIndex"] ?? 0,
|
|
287
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
288
|
-
pageNumber: image["pageNumber"] ?? null,
|
|
289
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
290
|
-
width: image["width"] ?? null,
|
|
291
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
292
|
-
height: image["height"] ?? null,
|
|
293
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
294
|
-
colorspace: image["colorspace"] ?? null,
|
|
295
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
296
|
-
bitsPerComponent: image["bitsPerComponent"] ?? null,
|
|
297
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
298
|
-
isMask: image["isMask"] ?? false,
|
|
299
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
300
|
-
description: image["description"] ?? null,
|
|
301
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
302
|
-
ocrResult: image["ocrResult"] ? convertResult(image["ocrResult"]) : null
|
|
303
|
-
};
|
|
197
|
+
function getLastPanicContext() {
|
|
198
|
+
const binding2 = getBinding();
|
|
199
|
+
const result = binding2.getLastPanicContext();
|
|
200
|
+
return result;
|
|
304
201
|
}
|
|
305
|
-
function
|
|
306
|
-
|
|
202
|
+
function getErrorCodeName(code) {
|
|
203
|
+
const binding2 = getBinding();
|
|
204
|
+
return binding2.getErrorCodeName(code);
|
|
205
|
+
}
|
|
206
|
+
function getErrorCodeDescription(code) {
|
|
207
|
+
const binding2 = getBinding();
|
|
208
|
+
return binding2.getErrorCodeDescription(code);
|
|
209
|
+
}
|
|
210
|
+
function classifyError(errorMessage) {
|
|
211
|
+
const binding2 = getBinding();
|
|
212
|
+
const result = binding2.classifyError(errorMessage);
|
|
213
|
+
return result;
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
// typescript/errors.ts
|
|
217
|
+
var ErrorCode = /* @__PURE__ */ ((ErrorCode2) => {
|
|
218
|
+
ErrorCode2[ErrorCode2["Success"] = 0] = "Success";
|
|
219
|
+
ErrorCode2[ErrorCode2["GenericError"] = 1] = "GenericError";
|
|
220
|
+
ErrorCode2[ErrorCode2["Panic"] = 2] = "Panic";
|
|
221
|
+
ErrorCode2[ErrorCode2["InvalidArgument"] = 3] = "InvalidArgument";
|
|
222
|
+
ErrorCode2[ErrorCode2["IoError"] = 4] = "IoError";
|
|
223
|
+
ErrorCode2[ErrorCode2["ParsingError"] = 5] = "ParsingError";
|
|
224
|
+
ErrorCode2[ErrorCode2["OcrError"] = 6] = "OcrError";
|
|
225
|
+
ErrorCode2[ErrorCode2["MissingDependency"] = 7] = "MissingDependency";
|
|
226
|
+
return ErrorCode2;
|
|
227
|
+
})(ErrorCode || {});
|
|
228
|
+
var KreuzbergError = class _KreuzbergError extends Error {
|
|
229
|
+
/**
|
|
230
|
+
* Panic context if error was caused by a panic in native code.
|
|
231
|
+
* Will be null for non-panic errors.
|
|
232
|
+
*/
|
|
233
|
+
panicContext;
|
|
234
|
+
constructor(message, panicContext) {
|
|
235
|
+
super(message);
|
|
236
|
+
this.name = "KreuzbergError";
|
|
237
|
+
this.panicContext = panicContext ?? null;
|
|
238
|
+
Object.setPrototypeOf(this, _KreuzbergError.prototype);
|
|
239
|
+
}
|
|
240
|
+
toJSON() {
|
|
307
241
|
return {
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
242
|
+
name: this.name,
|
|
243
|
+
message: this.message,
|
|
244
|
+
panicContext: this.panicContext,
|
|
245
|
+
stack: this.stack
|
|
312
246
|
};
|
|
313
247
|
}
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
248
|
+
};
|
|
249
|
+
var ValidationError = class _ValidationError extends KreuzbergError {
|
|
250
|
+
constructor(message, panicContext) {
|
|
251
|
+
super(message, panicContext);
|
|
252
|
+
this.name = "ValidationError";
|
|
253
|
+
Object.setPrototypeOf(this, _ValidationError.prototype);
|
|
254
|
+
}
|
|
255
|
+
};
|
|
256
|
+
var ParsingError = class _ParsingError extends KreuzbergError {
|
|
257
|
+
constructor(message, panicContext) {
|
|
258
|
+
super(message, panicContext);
|
|
259
|
+
this.name = "ParsingError";
|
|
260
|
+
Object.setPrototypeOf(this, _ParsingError.prototype);
|
|
261
|
+
}
|
|
262
|
+
};
|
|
263
|
+
var OcrError = class _OcrError extends KreuzbergError {
|
|
264
|
+
constructor(message, panicContext) {
|
|
265
|
+
super(message, panicContext);
|
|
266
|
+
this.name = "OcrError";
|
|
267
|
+
Object.setPrototypeOf(this, _OcrError.prototype);
|
|
268
|
+
}
|
|
269
|
+
};
|
|
270
|
+
var CacheError = class _CacheError extends KreuzbergError {
|
|
271
|
+
constructor(message, panicContext) {
|
|
272
|
+
super(message, panicContext);
|
|
273
|
+
this.name = "CacheError";
|
|
274
|
+
Object.setPrototypeOf(this, _CacheError.prototype);
|
|
275
|
+
}
|
|
276
|
+
};
|
|
277
|
+
var ImageProcessingError = class _ImageProcessingError extends KreuzbergError {
|
|
278
|
+
constructor(message, panicContext) {
|
|
279
|
+
super(message, panicContext);
|
|
280
|
+
this.name = "ImageProcessingError";
|
|
281
|
+
Object.setPrototypeOf(this, _ImageProcessingError.prototype);
|
|
282
|
+
}
|
|
283
|
+
};
|
|
284
|
+
var PluginError = class _PluginError extends KreuzbergError {
|
|
285
|
+
/**
|
|
286
|
+
* Name of the plugin that threw the error.
|
|
287
|
+
*/
|
|
288
|
+
pluginName;
|
|
289
|
+
constructor(message, pluginName, panicContext) {
|
|
290
|
+
super(`Plugin error in '${pluginName}': ${message}`, panicContext);
|
|
291
|
+
this.name = "PluginError";
|
|
292
|
+
this.pluginName = pluginName;
|
|
293
|
+
Object.setPrototypeOf(this, _PluginError.prototype);
|
|
294
|
+
}
|
|
295
|
+
toJSON() {
|
|
328
296
|
return {
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
chunks: null,
|
|
335
|
-
images: null,
|
|
336
|
-
pages: null
|
|
297
|
+
name: this.name,
|
|
298
|
+
message: this.message,
|
|
299
|
+
pluginName: this.pluginName,
|
|
300
|
+
panicContext: this.panicContext,
|
|
301
|
+
stack: this.stack
|
|
337
302
|
};
|
|
338
303
|
}
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
346
|
-
mimeType: result["mimeType"] ?? "application/octet-stream",
|
|
347
|
-
metadata: metadataValue,
|
|
348
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
349
|
-
tables: Array.isArray(result["tables"]) ? result["tables"] : [],
|
|
350
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
351
|
-
detectedLanguages: Array.isArray(result["detectedLanguages"]) ? result["detectedLanguages"] : null,
|
|
352
|
-
chunks: null,
|
|
353
|
-
images: null,
|
|
354
|
-
pages: null
|
|
355
|
-
};
|
|
356
|
-
const chunksData = result["chunks"];
|
|
357
|
-
if (Array.isArray(chunksData)) {
|
|
358
|
-
returnObj.chunks = chunksData.map((chunk) => convertChunk(chunk));
|
|
304
|
+
};
|
|
305
|
+
var MissingDependencyError = class _MissingDependencyError extends KreuzbergError {
|
|
306
|
+
constructor(message, panicContext) {
|
|
307
|
+
super(message, panicContext);
|
|
308
|
+
this.name = "MissingDependencyError";
|
|
309
|
+
Object.setPrototypeOf(this, _MissingDependencyError.prototype);
|
|
359
310
|
}
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
311
|
+
};
|
|
312
|
+
|
|
313
|
+
// typescript/core/assertions.ts
|
|
314
|
+
function assertUint8Array(value, name) {
|
|
315
|
+
if (!(value instanceof Uint8Array)) {
|
|
316
|
+
throw new TypeError(`${name} must be a Uint8Array`);
|
|
363
317
|
}
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
318
|
+
return value;
|
|
319
|
+
}
|
|
320
|
+
function assertUint8ArrayList(values, name) {
|
|
321
|
+
if (!Array.isArray(values)) {
|
|
322
|
+
throw new TypeError(`${name} must be an array of Uint8Array`);
|
|
367
323
|
}
|
|
368
|
-
|
|
324
|
+
const array = values;
|
|
325
|
+
return array.map((value, index) => {
|
|
326
|
+
try {
|
|
327
|
+
return assertUint8Array(value, `${name}[${index}]`);
|
|
328
|
+
} catch {
|
|
329
|
+
throw new TypeError(`${name}[${index}] must be a Uint8Array`);
|
|
330
|
+
}
|
|
331
|
+
});
|
|
369
332
|
}
|
|
333
|
+
|
|
334
|
+
// typescript/core/config-normalizer.ts
|
|
370
335
|
function setIfDefined(target, key, value) {
|
|
371
336
|
if (value !== void 0) {
|
|
372
337
|
target[key] = value;
|
|
@@ -524,47 +489,251 @@ function normalizeKeywordConfig(config) {
|
|
|
524
489
|
setIfDefined(normalized, "rakeParams", config.rakeParams);
|
|
525
490
|
return normalized;
|
|
526
491
|
}
|
|
527
|
-
function normalizePageConfig(pages) {
|
|
528
|
-
if (!pages) {
|
|
529
|
-
return void 0;
|
|
492
|
+
function normalizePageConfig(pages) {
|
|
493
|
+
if (!pages) {
|
|
494
|
+
return void 0;
|
|
495
|
+
}
|
|
496
|
+
const normalized = {};
|
|
497
|
+
setIfDefined(normalized, "extractPages", pages.extractPages);
|
|
498
|
+
setIfDefined(normalized, "insertPageMarkers", pages.insertPageMarkers);
|
|
499
|
+
setIfDefined(normalized, "markerFormat", pages.markerFormat);
|
|
500
|
+
return normalized;
|
|
501
|
+
}
|
|
502
|
+
function normalizeExtractionConfig(config) {
|
|
503
|
+
if (!config) {
|
|
504
|
+
return null;
|
|
505
|
+
}
|
|
506
|
+
const normalized = {};
|
|
507
|
+
setIfDefined(normalized, "useCache", config.useCache);
|
|
508
|
+
setIfDefined(normalized, "enableQualityProcessing", config.enableQualityProcessing);
|
|
509
|
+
setIfDefined(normalized, "forceOcr", config.forceOcr);
|
|
510
|
+
setIfDefined(normalized, "maxConcurrentExtractions", config.maxConcurrentExtractions);
|
|
511
|
+
const ocr = normalizeOcrConfig(config.ocr);
|
|
512
|
+
setIfDefined(normalized, "ocr", ocr);
|
|
513
|
+
const chunking = normalizeChunkingConfig(config.chunking);
|
|
514
|
+
setIfDefined(normalized, "chunking", chunking);
|
|
515
|
+
const images = normalizeImageExtractionConfig(config.images);
|
|
516
|
+
setIfDefined(normalized, "images", images);
|
|
517
|
+
const pdf = normalizePdfConfig(config.pdfOptions);
|
|
518
|
+
setIfDefined(normalized, "pdfOptions", pdf);
|
|
519
|
+
const tokenReduction = normalizeTokenReductionConfig(config.tokenReduction);
|
|
520
|
+
setIfDefined(normalized, "tokenReduction", tokenReduction);
|
|
521
|
+
const languageDetection = normalizeLanguageDetectionConfig(config.languageDetection);
|
|
522
|
+
setIfDefined(normalized, "languageDetection", languageDetection);
|
|
523
|
+
const postprocessor = normalizePostProcessorConfig(config.postprocessor);
|
|
524
|
+
setIfDefined(normalized, "postprocessor", postprocessor);
|
|
525
|
+
const keywords = normalizeKeywordConfig(config.keywords);
|
|
526
|
+
setIfDefined(normalized, "keywords", keywords);
|
|
527
|
+
const pages = normalizePageConfig(config.pages);
|
|
528
|
+
setIfDefined(normalized, "pages", pages);
|
|
529
|
+
const htmlOptions = normalizeHtmlOptions(config.htmlOptions);
|
|
530
|
+
setIfDefined(normalized, "htmlOptions", htmlOptions);
|
|
531
|
+
return normalized;
|
|
532
|
+
}
|
|
533
|
+
|
|
534
|
+
// typescript/core/type-converters.ts
|
|
535
|
+
function parseMetadata(metadataStr) {
|
|
536
|
+
try {
|
|
537
|
+
const parsed = JSON.parse(metadataStr);
|
|
538
|
+
if (typeof parsed === "object" && parsed !== null) {
|
|
539
|
+
return parsed;
|
|
540
|
+
}
|
|
541
|
+
return {};
|
|
542
|
+
} catch {
|
|
543
|
+
return {};
|
|
544
|
+
}
|
|
545
|
+
}
|
|
546
|
+
function ensureUint8Array(value) {
|
|
547
|
+
if (value instanceof Uint8Array) {
|
|
548
|
+
return value;
|
|
549
|
+
}
|
|
550
|
+
if (typeof Buffer !== "undefined" && value instanceof Buffer) {
|
|
551
|
+
return new Uint8Array(value);
|
|
552
|
+
}
|
|
553
|
+
if (Array.isArray(value)) {
|
|
554
|
+
return new Uint8Array(value);
|
|
555
|
+
}
|
|
556
|
+
return new Uint8Array();
|
|
557
|
+
}
|
|
558
|
+
function convertChunk(rawChunk) {
|
|
559
|
+
if (!rawChunk || typeof rawChunk !== "object") {
|
|
560
|
+
return {
|
|
561
|
+
content: "",
|
|
562
|
+
metadata: {
|
|
563
|
+
byteStart: 0,
|
|
564
|
+
byteEnd: 0,
|
|
565
|
+
tokenCount: null,
|
|
566
|
+
chunkIndex: 0,
|
|
567
|
+
totalChunks: 0
|
|
568
|
+
},
|
|
569
|
+
embedding: null
|
|
570
|
+
};
|
|
571
|
+
}
|
|
572
|
+
const chunk = rawChunk;
|
|
573
|
+
const metadata = chunk["metadata"] ?? {};
|
|
574
|
+
return {
|
|
575
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
576
|
+
content: chunk["content"] ?? "",
|
|
577
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
578
|
+
embedding: chunk["embedding"] ?? null,
|
|
579
|
+
metadata: {
|
|
580
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
581
|
+
byteStart: metadata["byte_start"] ?? metadata["charStart"] ?? 0,
|
|
582
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
583
|
+
byteEnd: metadata["byte_end"] ?? metadata["charEnd"] ?? 0,
|
|
584
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
585
|
+
tokenCount: metadata["token_count"] ?? metadata["tokenCount"] ?? null,
|
|
586
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
587
|
+
chunkIndex: metadata["chunk_index"] ?? metadata["chunkIndex"] ?? 0,
|
|
588
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
589
|
+
totalChunks: metadata["total_chunks"] ?? metadata["totalChunks"] ?? 0,
|
|
590
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
591
|
+
firstPage: metadata["first_page"] ?? metadata["firstPage"] ?? null,
|
|
592
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
593
|
+
lastPage: metadata["last_page"] ?? metadata["lastPage"] ?? null
|
|
594
|
+
}
|
|
595
|
+
};
|
|
596
|
+
}
|
|
597
|
+
function convertImage(rawImage) {
|
|
598
|
+
if (!rawImage || typeof rawImage !== "object") {
|
|
599
|
+
return {
|
|
600
|
+
data: new Uint8Array(),
|
|
601
|
+
format: "unknown",
|
|
602
|
+
imageIndex: 0,
|
|
603
|
+
pageNumber: null,
|
|
604
|
+
width: null,
|
|
605
|
+
height: null,
|
|
606
|
+
colorspace: null,
|
|
607
|
+
bitsPerComponent: null,
|
|
608
|
+
isMask: false,
|
|
609
|
+
description: null,
|
|
610
|
+
ocrResult: null
|
|
611
|
+
};
|
|
612
|
+
}
|
|
613
|
+
const image = rawImage;
|
|
614
|
+
return {
|
|
615
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
616
|
+
data: ensureUint8Array(image["data"]),
|
|
617
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
618
|
+
format: image["format"] ?? "unknown",
|
|
619
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
620
|
+
imageIndex: image["imageIndex"] ?? 0,
|
|
621
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
622
|
+
pageNumber: image["pageNumber"] ?? null,
|
|
623
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
624
|
+
width: image["width"] ?? null,
|
|
625
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
626
|
+
height: image["height"] ?? null,
|
|
627
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
628
|
+
colorspace: image["colorspace"] ?? null,
|
|
629
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
630
|
+
bitsPerComponent: image["bitsPerComponent"] ?? null,
|
|
631
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
632
|
+
isMask: image["isMask"] ?? false,
|
|
633
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
634
|
+
description: image["description"] ?? null,
|
|
635
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
636
|
+
ocrResult: image["ocrResult"] ? convertResult(image["ocrResult"]) : null
|
|
637
|
+
};
|
|
638
|
+
}
|
|
639
|
+
function convertPageContent(rawPage) {
|
|
640
|
+
if (!rawPage || typeof rawPage !== "object") {
|
|
641
|
+
return {
|
|
642
|
+
pageNumber: 0,
|
|
643
|
+
content: "",
|
|
644
|
+
tables: [],
|
|
645
|
+
images: []
|
|
646
|
+
};
|
|
647
|
+
}
|
|
648
|
+
const page = rawPage;
|
|
649
|
+
return {
|
|
650
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
651
|
+
pageNumber: page["pageNumber"] ?? 0,
|
|
652
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
653
|
+
content: page["content"] ?? "",
|
|
654
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
655
|
+
tables: Array.isArray(page["tables"]) ? page["tables"] : [],
|
|
656
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
657
|
+
images: Array.isArray(page["images"]) ? page["images"].map((image) => convertImage(image)) : []
|
|
658
|
+
};
|
|
659
|
+
}
|
|
660
|
+
function convertResult(rawResult) {
|
|
661
|
+
if (!rawResult || typeof rawResult !== "object") {
|
|
662
|
+
return {
|
|
663
|
+
content: "",
|
|
664
|
+
mimeType: "application/octet-stream",
|
|
665
|
+
metadata: {},
|
|
666
|
+
tables: [],
|
|
667
|
+
detectedLanguages: null,
|
|
668
|
+
chunks: null,
|
|
669
|
+
images: null,
|
|
670
|
+
pages: null
|
|
671
|
+
};
|
|
672
|
+
}
|
|
673
|
+
const result = rawResult;
|
|
674
|
+
const metadata = result["metadata"];
|
|
675
|
+
const metadataValue = typeof metadata === "string" ? parseMetadata(metadata) : metadata ?? {};
|
|
676
|
+
const returnObj = {
|
|
677
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
678
|
+
content: result["content"] ?? "",
|
|
679
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
680
|
+
mimeType: result["mimeType"] ?? "application/octet-stream",
|
|
681
|
+
metadata: metadataValue,
|
|
682
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
683
|
+
tables: Array.isArray(result["tables"]) ? result["tables"] : [],
|
|
684
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
685
|
+
detectedLanguages: Array.isArray(result["detectedLanguages"]) ? result["detectedLanguages"] : null,
|
|
686
|
+
chunks: null,
|
|
687
|
+
images: null,
|
|
688
|
+
pages: null
|
|
689
|
+
};
|
|
690
|
+
const chunksData = result["chunks"];
|
|
691
|
+
if (Array.isArray(chunksData)) {
|
|
692
|
+
returnObj.chunks = chunksData.map((chunk) => convertChunk(chunk));
|
|
693
|
+
}
|
|
694
|
+
const imagesData = result["images"];
|
|
695
|
+
if (Array.isArray(imagesData)) {
|
|
696
|
+
returnObj.images = imagesData.map((image) => convertImage(image));
|
|
697
|
+
}
|
|
698
|
+
const pagesData = result["pages"];
|
|
699
|
+
if (Array.isArray(pagesData)) {
|
|
700
|
+
returnObj.pages = pagesData.map((page) => convertPageContent(page));
|
|
701
|
+
}
|
|
702
|
+
return returnObj;
|
|
703
|
+
}
|
|
704
|
+
|
|
705
|
+
// typescript/extraction/batch.ts
|
|
706
|
+
function batchExtractFilesSync(paths, config = null) {
|
|
707
|
+
const normalizedConfig = normalizeExtractionConfig(config);
|
|
708
|
+
const rawResults = getBinding().batchExtractFilesSync(paths, normalizedConfig);
|
|
709
|
+
return rawResults.map(convertResult);
|
|
710
|
+
}
|
|
711
|
+
async function batchExtractFiles(paths, config = null) {
|
|
712
|
+
const normalizedConfig = normalizeExtractionConfig(config);
|
|
713
|
+
const rawResults = await getBinding().batchExtractFiles(paths, normalizedConfig);
|
|
714
|
+
return rawResults.map(convertResult);
|
|
715
|
+
}
|
|
716
|
+
/**
 * Synchronously extract several in-memory documents in one native call.
 *
 * @param dataList - Byte arrays, validated by assertUint8ArrayList and copied into Buffers.
 * @param mimeTypes - MIME types, one per entry of dataList (same order).
 * @param config - Optional extraction config; normalized before the call (default: null).
 * @returns One converted result per raw result returned by the binding.
 * @throws {TypeError} If dataList and mimeTypes differ in length.
 */
function batchExtractBytesSync(dataList, mimeTypes, config = null) {
  const validated = assertUint8ArrayList(dataList, "dataList");
  const buffers = validated.map((data) => Buffer.from(data));
  // Every payload needs exactly one MIME type.
  if (mimeTypes.length !== buffers.length) {
    throw new TypeError("dataList and mimeTypes must have the same length");
  }
  const nativeConfig = normalizeExtractionConfig(config);
  return getBinding().batchExtractBytesSync(buffers, mimeTypes, nativeConfig).map(convertResult);
}
|
|
537
|
-
function
|
|
538
|
-
|
|
539
|
-
|
|
725
|
+
/**
 * Asynchronously extract several in-memory documents in one native call.
 *
 * @param dataList - Byte arrays, validated by assertUint8ArrayList and copied into Buffers.
 * @param mimeTypes - MIME types, one per entry of dataList (same order).
 * @param config - Optional extraction config; normalized before the call (default: null).
 * @returns Promise of one converted result per raw result returned by the binding.
 * @throws {TypeError} If dataList and mimeTypes differ in length.
 */
async function batchExtractBytes(dataList, mimeTypes, config = null) {
  const validated = assertUint8ArrayList(dataList, "dataList");
  const buffers = validated.map((data) => Buffer.from(data));
  // Every payload needs exactly one MIME type.
  if (mimeTypes.length !== buffers.length) {
    throw new TypeError("dataList and mimeTypes must have the same length");
  }
  const nativeConfig = normalizeExtractionConfig(config);
  const nativeResults = await getBinding().batchExtractBytes(buffers, mimeTypes, nativeConfig);
  return nativeResults.map(convertResult);
}
|
|
734
|
+
|
|
735
|
+
// typescript/extraction/single.ts
|
|
736
|
+
var import_node_fs = require("fs");
|
|
568
737
|
function extractFileSync(filePath, mimeTypeOrConfig, maybeConfig) {
|
|
569
738
|
let mimeType = null;
|
|
570
739
|
let config = null;
|
|
@@ -626,34 +795,57 @@ async function extractBytes(dataOrPath, mimeType, config = null) {
|
|
|
626
795
|
const rawResult = await getBinding().extractBytes(Buffer.from(validated), mimeType, normalizedConfig);
|
|
627
796
|
return convertResult(rawResult);
|
|
628
797
|
}
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
798
|
+
|
|
799
|
+
// typescript/extraction/worker-pool.ts
|
|
800
|
+
/**
 * Create a worker pool through the native binding.
 *
 * @param size - Forwarded unchanged to the binding's createWorkerPool.
 * @returns The pool handle exactly as returned by the binding.
 */
function createWorkerPool(size) {
  return getBinding().createWorkerPool(size);
}
|
|
634
|
-
|
|
635
|
-
const
|
|
636
|
-
const
|
|
637
|
-
return
|
|
805
|
+
/**
 * Read statistics for a worker pool through the native binding.
 *
 * @param pool - Pool handle, forwarded unchanged to the binding.
 * @returns The stats object exactly as returned by the binding.
 */
function getWorkerPoolStats(pool) {
  return getBinding().getWorkerPoolStats(pool);
}
|
|
639
|
-
function
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
810
|
+
/**
 * Extract a single file on a worker pool.
 *
 * The third argument is overloaded: a string is treated as the MIME type
 * (config then comes from `maybeConfig`); a non-null object is treated as the
 * config itself; anything else falls back to `maybeConfig`.
 *
 * @param pool - Pool handle, forwarded unchanged to the binding.
 * @param filePath - Path of the file to extract.
 * @param mimeTypeOrConfig - MIME type string, config object, or null/undefined.
 * @param maybeConfig - Config used when the third argument is not a config object.
 * @returns Promise of the converted extraction result.
 */
async function extractFileInWorker(pool, filePath, mimeTypeOrConfig, maybeConfig) {
  const hasMimeType = typeof mimeTypeOrConfig === "string";
  const hasInlineConfig = !hasMimeType && mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object";
  const mimeType = hasMimeType ? mimeTypeOrConfig : null;
  const config = hasInlineConfig ? mimeTypeOrConfig : (maybeConfig ?? null);
  // Normalize first so a config error surfaces before the binding is resolved.
  const nativeConfig = normalizeExtractionConfig(config);
  const rawResult = await getBinding().extractFileInWorker(pool, filePath, mimeType, nativeConfig);
  return convertResult(rawResult);
}
|
|
648
|
-
async function
|
|
649
|
-
const buffers = assertUint8ArrayList(dataList, "dataList").map((data) => Buffer.from(data));
|
|
650
|
-
if (buffers.length !== mimeTypes.length) {
|
|
651
|
-
throw new TypeError("dataList and mimeTypes must have the same length");
|
|
652
|
-
}
|
|
833
|
+
/**
 * Extract several files on a worker pool in one native call.
 *
 * @param pool - Pool handle, forwarded unchanged to the binding.
 * @param paths - File paths handed to the binding unchanged.
 * @param config - Optional extraction config; normalized before the call (default: null).
 * @returns Promise of one converted result per raw result returned by the binding.
 */
async function batchExtractFilesInWorker(pool, paths, config = null) {
  // Normalize first so a config error surfaces before the binding is resolved.
  const nativeConfig = normalizeExtractionConfig(config);
  const nativeResults = await getBinding().batchExtractFilesInWorker(pool, paths, nativeConfig);
  return nativeResults.map(convertResult);
}
|
|
843
|
+
/**
 * Close a worker pool through the native binding.
 *
 * @param pool - Pool handle, forwarded unchanged to the binding.
 * @returns Promise that resolves (with no value) once the binding call settles.
 */
async function closeWorkerPool(pool) {
  await getBinding().closeWorkerPool(pool);
}
|
|
847
|
+
|
|
848
|
+
// typescript/plugins/post-processors.ts
|
|
657
849
|
function registerPostProcessor(processor) {
|
|
658
850
|
const binding2 = getBinding();
|
|
659
851
|
const wrappedProcessor = {
|
|
@@ -708,6 +900,8 @@ function listPostProcessors() {
|
|
|
708
900
|
const binding2 = getBinding();
|
|
709
901
|
return binding2.listPostProcessors();
|
|
710
902
|
}
|
|
903
|
+
|
|
904
|
+
// typescript/plugins/validators.ts
|
|
711
905
|
function registerValidator(validator) {
|
|
712
906
|
const binding2 = getBinding();
|
|
713
907
|
const wrappedValidator = {
|
|
@@ -746,6 +940,204 @@ function listValidators() {
|
|
|
746
940
|
const binding2 = getBinding();
|
|
747
941
|
return binding2.listValidators();
|
|
748
942
|
}
|
|
943
|
+
|
|
944
|
+
// typescript/ocr/guten-ocr.ts
|
|
945
|
+
var GutenOcrBackend = class {
  /** Active OCR engine instance; `null` until initialize() succeeds and again after shutdown(). */
  ocr = null;
  /** Lazily imported `@gutenye/ocr-node` module; `null` until initialize() has imported it. */
  ocrModule = null;
  /** Options forwarded verbatim to the module's `create()`; left undefined when none were given. */
  options;
  /**
   * Create a new Guten OCR backend.
   *
   * @param options - Optional configuration for Guten OCR
   * @param options.models - Custom model paths (default: uses bundled models)
   * @param options.isDebug - Enable debug mode (default: false)
   * @param options.debugOutputDir - Directory for debug output (default: undefined)
   * @param options.onnxOptions - Custom ONNX Runtime options (default: undefined)
   *
   * @example
   * ```typescript
   * // Default configuration
   * const backend = new GutenOcrBackend();
   *
   * // With debug enabled
   * const debugBackend = new GutenOcrBackend({
   *   isDebug: true,
   *   debugOutputDir: './ocr_debug'
   * });
   * ```
   */
  constructor(options) {
    if (options !== void 0) {
      this.options = options;
    }
  }
  /**
   * Get the backend name.
   *
   * @returns Backend name ("guten-ocr")
   */
  name() {
    return "guten-ocr";
  }
  /**
   * Get list of supported language codes.
   *
   * Guten OCR supports multiple languages depending on the model configuration.
   * The default models support English and Chinese.
   *
   * @returns Array of language codes accepted by this backend
   */
  supportedLanguages() {
    return ["en", "eng", "ch_sim", "ch_tra", "chinese"];
  }
  /**
   * Initialize the OCR backend.
   *
   * Dynamically imports `@gutenye/ocr-node` and creates an OCR instance with
   * the options given to the constructor. Calling it again while an instance
   * exists is a no-op. processImage() calls this lazily, so an explicit call
   * is only needed to surface setup errors early.
   *
   * @throws {Error} If @gutenye/ocr-node cannot be imported (original error in `cause`)
   * @throws {Error} If creating the OCR instance fails (original error in `cause`)
   *
   * @example
   * ```typescript
   * const backend = new GutenOcrBackend();
   * await backend.initialize();
   * ```
   */
  async initialize() {
    if (this.ocr !== null) {
      return;
    }
    try {
      // Accept both ESM default exports and CommonJS-style namespace objects.
      this.ocrModule = await import("@gutenye/ocr-node").then((m) => m.default || m);
    } catch (e) {
      // Thrown values are not guaranteed to be Error instances.
      const reason = e instanceof Error ? e.message : String(e);
      throw new Error(
        `Guten OCR support requires the '@gutenye/ocr-node' package. Install with: npm install @gutenye/ocr-node. Error: ${reason}`,
        { cause: e }
      );
    }
    try {
      this.ocr = await this.ocrModule?.create(this.options) ?? null;
    } catch (e) {
      const reason = e instanceof Error ? e.message : String(e);
      throw new Error(`Failed to initialize Guten OCR: ${reason}`, { cause: e });
    }
  }
  /**
   * Shutdown the backend and release resources.
   *
   * Drops the references to the OCR instance and the imported module so a
   * later initialize() starts from scratch.
   *
   * @example
   * ```typescript
   * const backend = new GutenOcrBackend();
   * await backend.initialize();
   * // ... use backend ...
   * await backend.shutdown();
   * ```
   */
  async shutdown() {
    this.ocr = null;
    this.ocrModule = null;
  }
  /**
   * Process image bytes and extract text using Guten OCR.
   *
   * This method:
   * 1. Initializes the backend on first use if initialize() was not called
   * 2. Converts the input to a Buffer (strings are decoded as base64)
   * 3. Tries to read width/height via `sharp`; on failure it warns and reports 0 for both
   * 4. Runs `detect` on the buffer and joins the detected lines with newlines
   * 5. Averages each line's `mean` value as the confidence (0 when nothing was detected)
   *
   * Setting the environment variable KREUZBERG_DEBUG_GUTEN=1 logs details
   * about the input buffer.
   *
   * @param imageBytes - Image data as bytes, or as a base64-encoded string
   * @param language - Language code; echoed into the result metadata (not validated here)
   * @returns Promise resolving to OCR result with content and metadata
   *
   * @throws {Error} If lazy initialization fails or yields no OCR instance
   * @throws {Error} If OCR processing fails (original error in `cause`)
   *
   * @example
   * ```typescript
   * import { readFile } from 'fs/promises';
   *
   * const backend = new GutenOcrBackend();
   * await backend.initialize();
   *
   * const imageBytes = await readFile('scanned.png');
   * const result = await backend.processImage(imageBytes, 'en');
   * console.log(result.content);
   * console.log(result.metadata.confidence);
   * ```
   */
  async processImage(imageBytes, language) {
    if (this.ocr === null) {
      await this.initialize();
    }
    // initialize() can finish without an instance if create() resolved to null/undefined.
    if (this.ocr === null) {
      throw new Error("Guten OCR backend failed to initialize");
    }
    try {
      const buffer = typeof imageBytes === "string" ? Buffer.from(imageBytes, "base64") : Buffer.from(imageBytes);
      const debugEnv = process.env["KREUZBERG_DEBUG_GUTEN"];
      if (debugEnv === "1") {
        const header = Array.from(buffer.subarray(0, 8));
        console.log("[Guten OCR] Debug input header:", header);
        console.log(
          "[Guten OCR] Buffer?",
          Buffer.isBuffer(buffer),
          "constructor",
          imageBytes?.constructor?.name,
          "length",
          buffer.length,
          "type",
          typeof imageBytes
        );
      }
      let width = 0;
      let height = 0;
      try {
        // Dimensions are best-effort metadata only; OCR proceeds without them.
        const sharpModule = await import("sharp");
        const sharp = sharpModule.default || sharpModule;
        const metadata = await sharp(buffer).metadata();
        width = metadata["width"] ?? 0;
        height = metadata["height"] ?? 0;
      } catch (metadataError) {
        const reason = metadataError instanceof Error ? metadataError.message : String(metadataError);
        console.warn(`[Guten OCR] Unable to read image metadata via sharp: ${reason}`);
      }
      const result = await this.ocr.detect(buffer);
      const content = result.map((line) => line.text).join("\n");
      const avgConfidence = result.length > 0 ? result.reduce((sum, line) => sum + line.mean, 0) / result.length : 0;
      return {
        content,
        mime_type: "text/plain",
        metadata: {
          width,
          height,
          confidence: avgConfidence,
          text_regions: result.length,
          language
        },
        tables: []
      };
    } catch (e) {
      // Keep the original error reachable via `cause` so its stack is not lost.
      const reason = e instanceof Error ? e.message : String(e);
      throw new Error(`Guten OCR processing failed: ${reason}`, { cause: e });
    }
  }
};
|
|
1139
|
+
|
|
1140
|
+
// typescript/plugins/ocr-backends.ts
|
|
749
1141
|
/**
 * Check whether a value is a two-element `[image, language]` array.
 *
 * @param value - Any value.
 * @returns true only for an array of length 2 whose second entry is a string
 *   and whose first entry is a string, a Buffer, or a Uint8Array.
 */
function isOcrProcessTuple(value) {
  if (!Array.isArray(value) || value.length !== 2) {
    return false;
  }
  const language = value[1];
  if (typeof language !== "string") {
    return false;
  }
  const payload = value[0];
  return typeof payload === "string" || Buffer.isBuffer(payload) || payload instanceof Uint8Array;
}
|
|
@@ -815,6 +1207,8 @@ function clearOcrBackends() {
|
|
|
815
1207
|
const binding2 = getBinding();
|
|
816
1208
|
binding2.clearOcrBackends();
|
|
817
1209
|
}
|
|
1210
|
+
|
|
1211
|
+
// typescript/registry/document-extractors.ts
|
|
818
1212
|
function listDocumentExtractors() {
|
|
819
1213
|
const binding2 = getBinding();
|
|
820
1214
|
return binding2.listDocumentExtractors();
|
|
@@ -827,7 +1221,9 @@ function clearDocumentExtractors() {
|
|
|
827
1221
|
const binding2 = getBinding();
|
|
828
1222
|
binding2.clearDocumentExtractors();
|
|
829
1223
|
}
|
|
830
|
-
|
|
1224
|
+
|
|
1225
|
+
// typescript/config/loader.ts
|
|
1226
|
+
var ExtractionConfig = {
|
|
831
1227
|
/**
|
|
832
1228
|
* Load extraction configuration from a file.
|
|
833
1229
|
*
|
|
@@ -889,6 +1285,18 @@ const ExtractionConfig = {
|
|
|
889
1285
|
return binding2.discoverExtractionConfig();
|
|
890
1286
|
}
|
|
891
1287
|
};
|
|
1288
|
+
/**
 * Load an extraction configuration from a file.
 *
 * Thin alias for `ExtractionConfig.fromFile`; any error it throws propagates.
 *
 * @param filePath - Path of the configuration file.
 * @returns Whatever `ExtractionConfig.fromFile` returns for that path.
 */
function loadConfigFile(filePath) {
  const loaded = ExtractionConfig.fromFile(filePath);
  return loaded;
}
|
|
1291
|
+
/**
 * Load an extraction configuration from a path, falling back to discovery.
 *
 * Tries `ExtractionConfig.fromFile(path)` first. If that throws for any
 * reason, the error is discarded and `ExtractionConfig.discover()` is used
 * instead; errors from `discover()` itself propagate.
 *
 * @param path - Path of the configuration file to try first.
 * @returns The loaded configuration, or the result of `ExtractionConfig.discover()`.
 */
function loadConfigFromPath(path) {
  let loaded;
  try {
    loaded = ExtractionConfig.fromFile(path);
  } catch {
    // Best-effort: an unreadable explicit path falls back to discovery.
    loaded = ExtractionConfig.discover();
  }
  return loaded;
}
|
|
1298
|
+
|
|
1299
|
+
// typescript/mime/utilities.ts
|
|
892
1300
|
function detectMimeType(bytes) {
|
|
893
1301
|
const binding2 = getBinding();
|
|
894
1302
|
return binding2.detectMimeTypeFromBytes(bytes);
|
|
@@ -905,6 +1313,8 @@ function getExtensionsForMime(mimeType) {
|
|
|
905
1313
|
const binding2 = getBinding();
|
|
906
1314
|
return binding2.getExtensionsForMime(mimeType);
|
|
907
1315
|
}
|
|
1316
|
+
|
|
1317
|
+
// typescript/embeddings/presets.ts
|
|
908
1318
|
function listEmbeddingPresets() {
|
|
909
1319
|
const binding2 = getBinding();
|
|
910
1320
|
return binding2.listEmbeddingPresets();
|
|
@@ -914,76 +1324,9 @@ function getEmbeddingPreset(name) {
|
|
|
914
1324
|
const result = binding2.getEmbeddingPreset(name);
|
|
915
1325
|
return result;
|
|
916
1326
|
}
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
}
|
|
921
|
-
function getLastPanicContext() {
|
|
922
|
-
const binding2 = getBinding();
|
|
923
|
-
const result = binding2.getLastPanicContext();
|
|
924
|
-
return result;
|
|
925
|
-
}
|
|
926
|
-
function getErrorCodeName(code) {
|
|
927
|
-
const binding2 = getBinding();
|
|
928
|
-
return binding2.getErrorCodeName(code);
|
|
929
|
-
}
|
|
930
|
-
function getErrorCodeDescription(code) {
|
|
931
|
-
const binding2 = getBinding();
|
|
932
|
-
return binding2.getErrorCodeDescription(code);
|
|
933
|
-
}
|
|
934
|
-
function classifyError(errorMessage) {
|
|
935
|
-
const binding2 = getBinding();
|
|
936
|
-
const result = binding2.classifyError(errorMessage);
|
|
937
|
-
return result;
|
|
938
|
-
}
|
|
939
|
-
function createWorkerPool(size) {
|
|
940
|
-
const binding2 = getBinding();
|
|
941
|
-
const rawPool = binding2.createWorkerPool(size);
|
|
942
|
-
return rawPool;
|
|
943
|
-
}
|
|
944
|
-
function getWorkerPoolStats(pool) {
|
|
945
|
-
const binding2 = getBinding();
|
|
946
|
-
const rawStats = binding2.getWorkerPoolStats(pool);
|
|
947
|
-
return rawStats;
|
|
948
|
-
}
|
|
949
|
-
async function extractFileInWorker(pool, filePath, mimeTypeOrConfig, maybeConfig) {
|
|
950
|
-
let mimeType = null;
|
|
951
|
-
let config = null;
|
|
952
|
-
if (typeof mimeTypeOrConfig === "string") {
|
|
953
|
-
mimeType = mimeTypeOrConfig;
|
|
954
|
-
config = maybeConfig ?? null;
|
|
955
|
-
} else if (mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object") {
|
|
956
|
-
config = mimeTypeOrConfig;
|
|
957
|
-
mimeType = null;
|
|
958
|
-
} else {
|
|
959
|
-
config = maybeConfig ?? null;
|
|
960
|
-
mimeType = null;
|
|
961
|
-
}
|
|
962
|
-
const normalizedConfig = normalizeExtractionConfig(config);
|
|
963
|
-
const binding2 = getBinding();
|
|
964
|
-
const rawResult = await binding2.extractFileInWorker(
|
|
965
|
-
pool,
|
|
966
|
-
filePath,
|
|
967
|
-
mimeType,
|
|
968
|
-
normalizedConfig
|
|
969
|
-
);
|
|
970
|
-
return convertResult(rawResult);
|
|
971
|
-
}
|
|
972
|
-
async function batchExtractFilesInWorker(pool, paths, config = null) {
|
|
973
|
-
const normalizedConfig = normalizeExtractionConfig(config);
|
|
974
|
-
const binding2 = getBinding();
|
|
975
|
-
const rawResults = await binding2.batchExtractFilesInWorker(
|
|
976
|
-
pool,
|
|
977
|
-
paths,
|
|
978
|
-
normalizedConfig
|
|
979
|
-
);
|
|
980
|
-
return rawResults.map(convertResult);
|
|
981
|
-
}
|
|
982
|
-
async function closeWorkerPool(pool) {
|
|
983
|
-
const binding2 = getBinding();
|
|
984
|
-
await binding2.closeWorkerPool(pool);
|
|
985
|
-
}
|
|
986
|
-
const __version__ = "4.0.8";
|
|
1327
|
+
|
|
1328
|
+
// typescript/index.ts
|
|
1329
|
+
var __version__ = "4.1.1";
|
|
987
1330
|
// Annotate the CommonJS export names for ESM import in node:
|
|
988
1331
|
0 && (module.exports = {
|
|
989
1332
|
CacheError,
|
|
@@ -1031,6 +1374,8 @@ const __version__ = "4.0.8";
|
|
|
1031
1374
|
listOcrBackends,
|
|
1032
1375
|
listPostProcessors,
|
|
1033
1376
|
listValidators,
|
|
1377
|
+
loadConfigFile,
|
|
1378
|
+
loadConfigFromPath,
|
|
1034
1379
|
registerOcrBackend,
|
|
1035
1380
|
registerPostProcessor,
|
|
1036
1381
|
registerValidator,
|
|
@@ -1038,7 +1383,6 @@ const __version__ = "4.0.8";
|
|
|
1038
1383
|
unregisterOcrBackend,
|
|
1039
1384
|
unregisterPostProcessor,
|
|
1040
1385
|
unregisterValidator,
|
|
1041
|
-
validateMimeType
|
|
1042
|
-
...require("./types.js")
|
|
1386
|
+
validateMimeType
|
|
1043
1387
|
});
|
|
1044
1388
|
//# sourceMappingURL=index.js.map
|