@kreuzberg/node 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/cli.js +6 -4
- package/dist/cli.js.map +1 -1
- package/dist/cli.mjs +13 -5
- package/dist/cli.mjs.map +1 -1
- package/dist/errors.js +26 -24
- package/dist/errors.js.map +1 -1
- package/dist/errors.mjs +25 -24
- package/dist/errors.mjs.map +1 -1
- package/dist/index.d.mts +608 -535
- package/dist/index.d.ts +608 -535
- package/dist/index.js +682 -338
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +662 -334
- package/dist/index.mjs.map +1 -1
- package/dist/ocr/guten-ocr.js +4 -2
- package/dist/ocr/guten-ocr.js.map +1 -1
- package/dist/ocr/guten-ocr.mjs +3 -2
- package/dist/ocr/guten-ocr.mjs.map +1 -1
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -1
- package/index.d.ts +77 -178
- package/index.js +54 -52
- package/package.json +7 -7
package/dist/index.mjs
CHANGED
|
@@ -1,20 +1,14 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
ValidationError
|
|
13
|
-
} from "./errors.js";
|
|
14
|
-
import { GutenOcrBackend } from "./ocr/guten-ocr.js";
|
|
15
|
-
export * from "./types.js";
|
|
16
|
-
let binding = null;
|
|
17
|
-
let bindingInitialized = false;
|
|
1
|
+
var __require = /* @__PURE__ */ ((x) => typeof require !== "undefined" ? require : typeof Proxy !== "undefined" ? new Proxy(x, {
|
|
2
|
+
get: (a, b) => (typeof require !== "undefined" ? require : a)[b]
|
|
3
|
+
}) : x)(function(x) {
|
|
4
|
+
if (typeof require !== "undefined") return require.apply(this, arguments);
|
|
5
|
+
throw Error('Dynamic require of "' + x + '" is not supported');
|
|
6
|
+
});
|
|
7
|
+
|
|
8
|
+
// typescript/core/binding.ts
|
|
9
|
+
import { createRequire } from "module";
|
|
10
|
+
var binding = null;
|
|
11
|
+
var bindingInitialized = false;
|
|
18
12
|
function createNativeBindingError(error) {
|
|
19
13
|
const hintParts = [];
|
|
20
14
|
let detail = "Unknown error while requiring native module.";
|
|
@@ -43,42 +37,13 @@ function createNativeBindingError(error) {
|
|
|
43
37
|
].join(" ")
|
|
44
38
|
);
|
|
45
39
|
}
|
|
46
|
-
function assertUint8Array(value, name) {
|
|
47
|
-
if (!(value instanceof Uint8Array)) {
|
|
48
|
-
throw new TypeError(`${name} must be a Uint8Array`);
|
|
49
|
-
}
|
|
50
|
-
return value;
|
|
51
|
-
}
|
|
52
|
-
function assertUint8ArrayList(values, name) {
|
|
53
|
-
if (!Array.isArray(values)) {
|
|
54
|
-
throw new TypeError(`${name} must be an array of Uint8Array`);
|
|
55
|
-
}
|
|
56
|
-
const array = values;
|
|
57
|
-
return array.map((value, index) => {
|
|
58
|
-
try {
|
|
59
|
-
return assertUint8Array(value, `${name}[${index}]`);
|
|
60
|
-
} catch {
|
|
61
|
-
throw new TypeError(`${name}[${index}] must be a Uint8Array`);
|
|
62
|
-
}
|
|
63
|
-
});
|
|
64
|
-
}
|
|
65
|
-
function __setBindingForTests(mock) {
|
|
66
|
-
binding = mock;
|
|
67
|
-
bindingInitialized = true;
|
|
68
|
-
}
|
|
69
|
-
function __resetBindingForTests() {
|
|
70
|
-
binding = null;
|
|
71
|
-
bindingInitialized = false;
|
|
72
|
-
}
|
|
73
40
|
function loadNativeBinding() {
|
|
74
41
|
let localRequire;
|
|
75
|
-
|
|
76
|
-
localRequire =
|
|
77
|
-
}
|
|
78
|
-
|
|
79
|
-
localRequire =
|
|
80
|
-
} catch {
|
|
81
|
-
localRequire = void 0;
|
|
42
|
+
try {
|
|
43
|
+
localRequire = createRequire(import.meta.url);
|
|
44
|
+
} catch {
|
|
45
|
+
if (typeof __require !== "undefined") {
|
|
46
|
+
localRequire = __require;
|
|
82
47
|
}
|
|
83
48
|
}
|
|
84
49
|
if (!localRequire) {
|
|
@@ -131,175 +96,158 @@ function getBinding() {
|
|
|
131
96
|
"Failed to load Kreuzberg bindings. Neither NAPI (Node.js) nor WASM (browsers/Deno) bindings are available. Make sure you have installed the @kreuzberg/node package for Node.js/Bun."
|
|
132
97
|
);
|
|
133
98
|
}
|
|
134
|
-
function
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
if (typeof parsed === "object" && parsed !== null) {
|
|
138
|
-
return parsed;
|
|
139
|
-
}
|
|
140
|
-
return {};
|
|
141
|
-
} catch {
|
|
142
|
-
return {};
|
|
143
|
-
}
|
|
99
|
+
function __setBindingForTests(mock) {
|
|
100
|
+
binding = mock;
|
|
101
|
+
bindingInitialized = true;
|
|
144
102
|
}
|
|
145
|
-
function
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
}
|
|
149
|
-
if (typeof Buffer !== "undefined" && value instanceof Buffer) {
|
|
150
|
-
return new Uint8Array(value);
|
|
151
|
-
}
|
|
152
|
-
if (Array.isArray(value)) {
|
|
153
|
-
return new Uint8Array(value);
|
|
154
|
-
}
|
|
155
|
-
return new Uint8Array();
|
|
103
|
+
function __resetBindingForTests() {
|
|
104
|
+
binding = null;
|
|
105
|
+
bindingInitialized = false;
|
|
156
106
|
}
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
byteStart: 0,
|
|
163
|
-
byteEnd: 0,
|
|
164
|
-
tokenCount: null,
|
|
165
|
-
chunkIndex: 0,
|
|
166
|
-
totalChunks: 0
|
|
167
|
-
},
|
|
168
|
-
embedding: null
|
|
169
|
-
};
|
|
170
|
-
}
|
|
171
|
-
const chunk = rawChunk;
|
|
172
|
-
const metadata = chunk["metadata"] ?? {};
|
|
173
|
-
return {
|
|
174
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
175
|
-
content: chunk["content"] ?? "",
|
|
176
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
177
|
-
embedding: chunk["embedding"] ?? null,
|
|
178
|
-
metadata: {
|
|
179
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
180
|
-
byteStart: metadata["byte_start"] ?? metadata["charStart"] ?? 0,
|
|
181
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
182
|
-
byteEnd: metadata["byte_end"] ?? metadata["charEnd"] ?? 0,
|
|
183
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
184
|
-
tokenCount: metadata["token_count"] ?? metadata["tokenCount"] ?? null,
|
|
185
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
186
|
-
chunkIndex: metadata["chunk_index"] ?? metadata["chunkIndex"] ?? 0,
|
|
187
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
188
|
-
totalChunks: metadata["total_chunks"] ?? metadata["totalChunks"] ?? 0,
|
|
189
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
190
|
-
firstPage: metadata["first_page"] ?? metadata["firstPage"] ?? null,
|
|
191
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
192
|
-
lastPage: metadata["last_page"] ?? metadata["lastPage"] ?? null
|
|
193
|
-
}
|
|
194
|
-
};
|
|
107
|
+
|
|
108
|
+
// typescript/errors/diagnostics.ts
|
|
109
|
+
function getLastErrorCode() {
|
|
110
|
+
const binding2 = getBinding();
|
|
111
|
+
return binding2.getLastErrorCode();
|
|
195
112
|
}
|
|
196
|
-
function
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
format: "unknown",
|
|
201
|
-
imageIndex: 0,
|
|
202
|
-
pageNumber: null,
|
|
203
|
-
width: null,
|
|
204
|
-
height: null,
|
|
205
|
-
colorspace: null,
|
|
206
|
-
bitsPerComponent: null,
|
|
207
|
-
isMask: false,
|
|
208
|
-
description: null,
|
|
209
|
-
ocrResult: null
|
|
210
|
-
};
|
|
211
|
-
}
|
|
212
|
-
const image = rawImage;
|
|
213
|
-
return {
|
|
214
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
215
|
-
data: ensureUint8Array(image["data"]),
|
|
216
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
217
|
-
format: image["format"] ?? "unknown",
|
|
218
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
219
|
-
imageIndex: image["imageIndex"] ?? 0,
|
|
220
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
221
|
-
pageNumber: image["pageNumber"] ?? null,
|
|
222
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
223
|
-
width: image["width"] ?? null,
|
|
224
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
225
|
-
height: image["height"] ?? null,
|
|
226
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
227
|
-
colorspace: image["colorspace"] ?? null,
|
|
228
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
229
|
-
bitsPerComponent: image["bitsPerComponent"] ?? null,
|
|
230
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
231
|
-
isMask: image["isMask"] ?? false,
|
|
232
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
233
|
-
description: image["description"] ?? null,
|
|
234
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
235
|
-
ocrResult: image["ocrResult"] ? convertResult(image["ocrResult"]) : null
|
|
236
|
-
};
|
|
113
|
+
function getLastPanicContext() {
|
|
114
|
+
const binding2 = getBinding();
|
|
115
|
+
const result = binding2.getLastPanicContext();
|
|
116
|
+
return result;
|
|
237
117
|
}
|
|
238
|
-
function
|
|
239
|
-
|
|
118
|
+
function getErrorCodeName(code) {
|
|
119
|
+
const binding2 = getBinding();
|
|
120
|
+
return binding2.getErrorCodeName(code);
|
|
121
|
+
}
|
|
122
|
+
function getErrorCodeDescription(code) {
|
|
123
|
+
const binding2 = getBinding();
|
|
124
|
+
return binding2.getErrorCodeDescription(code);
|
|
125
|
+
}
|
|
126
|
+
function classifyError(errorMessage) {
|
|
127
|
+
const binding2 = getBinding();
|
|
128
|
+
const result = binding2.classifyError(errorMessage);
|
|
129
|
+
return result;
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
// typescript/errors.ts
|
|
133
|
+
var ErrorCode = /* @__PURE__ */ ((ErrorCode2) => {
|
|
134
|
+
ErrorCode2[ErrorCode2["Success"] = 0] = "Success";
|
|
135
|
+
ErrorCode2[ErrorCode2["GenericError"] = 1] = "GenericError";
|
|
136
|
+
ErrorCode2[ErrorCode2["Panic"] = 2] = "Panic";
|
|
137
|
+
ErrorCode2[ErrorCode2["InvalidArgument"] = 3] = "InvalidArgument";
|
|
138
|
+
ErrorCode2[ErrorCode2["IoError"] = 4] = "IoError";
|
|
139
|
+
ErrorCode2[ErrorCode2["ParsingError"] = 5] = "ParsingError";
|
|
140
|
+
ErrorCode2[ErrorCode2["OcrError"] = 6] = "OcrError";
|
|
141
|
+
ErrorCode2[ErrorCode2["MissingDependency"] = 7] = "MissingDependency";
|
|
142
|
+
return ErrorCode2;
|
|
143
|
+
})(ErrorCode || {});
|
|
144
|
+
var KreuzbergError = class _KreuzbergError extends Error {
|
|
145
|
+
/**
|
|
146
|
+
* Panic context if error was caused by a panic in native code.
|
|
147
|
+
* Will be null for non-panic errors.
|
|
148
|
+
*/
|
|
149
|
+
panicContext;
|
|
150
|
+
constructor(message, panicContext) {
|
|
151
|
+
super(message);
|
|
152
|
+
this.name = "KreuzbergError";
|
|
153
|
+
this.panicContext = panicContext ?? null;
|
|
154
|
+
Object.setPrototypeOf(this, _KreuzbergError.prototype);
|
|
155
|
+
}
|
|
156
|
+
toJSON() {
|
|
240
157
|
return {
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
158
|
+
name: this.name,
|
|
159
|
+
message: this.message,
|
|
160
|
+
panicContext: this.panicContext,
|
|
161
|
+
stack: this.stack
|
|
245
162
|
};
|
|
246
163
|
}
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
164
|
+
};
|
|
165
|
+
var ValidationError = class _ValidationError extends KreuzbergError {
|
|
166
|
+
constructor(message, panicContext) {
|
|
167
|
+
super(message, panicContext);
|
|
168
|
+
this.name = "ValidationError";
|
|
169
|
+
Object.setPrototypeOf(this, _ValidationError.prototype);
|
|
170
|
+
}
|
|
171
|
+
};
|
|
172
|
+
var ParsingError = class _ParsingError extends KreuzbergError {
|
|
173
|
+
constructor(message, panicContext) {
|
|
174
|
+
super(message, panicContext);
|
|
175
|
+
this.name = "ParsingError";
|
|
176
|
+
Object.setPrototypeOf(this, _ParsingError.prototype);
|
|
177
|
+
}
|
|
178
|
+
};
|
|
179
|
+
var OcrError = class _OcrError extends KreuzbergError {
|
|
180
|
+
constructor(message, panicContext) {
|
|
181
|
+
super(message, panicContext);
|
|
182
|
+
this.name = "OcrError";
|
|
183
|
+
Object.setPrototypeOf(this, _OcrError.prototype);
|
|
184
|
+
}
|
|
185
|
+
};
|
|
186
|
+
var CacheError = class _CacheError extends KreuzbergError {
|
|
187
|
+
constructor(message, panicContext) {
|
|
188
|
+
super(message, panicContext);
|
|
189
|
+
this.name = "CacheError";
|
|
190
|
+
Object.setPrototypeOf(this, _CacheError.prototype);
|
|
191
|
+
}
|
|
192
|
+
};
|
|
193
|
+
var ImageProcessingError = class _ImageProcessingError extends KreuzbergError {
|
|
194
|
+
constructor(message, panicContext) {
|
|
195
|
+
super(message, panicContext);
|
|
196
|
+
this.name = "ImageProcessingError";
|
|
197
|
+
Object.setPrototypeOf(this, _ImageProcessingError.prototype);
|
|
198
|
+
}
|
|
199
|
+
};
|
|
200
|
+
var PluginError = class _PluginError extends KreuzbergError {
|
|
201
|
+
/**
|
|
202
|
+
* Name of the plugin that threw the error.
|
|
203
|
+
*/
|
|
204
|
+
pluginName;
|
|
205
|
+
constructor(message, pluginName, panicContext) {
|
|
206
|
+
super(`Plugin error in '${pluginName}': ${message}`, panicContext);
|
|
207
|
+
this.name = "PluginError";
|
|
208
|
+
this.pluginName = pluginName;
|
|
209
|
+
Object.setPrototypeOf(this, _PluginError.prototype);
|
|
210
|
+
}
|
|
211
|
+
toJSON() {
|
|
261
212
|
return {
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
chunks: null,
|
|
268
|
-
images: null,
|
|
269
|
-
pages: null
|
|
213
|
+
name: this.name,
|
|
214
|
+
message: this.message,
|
|
215
|
+
pluginName: this.pluginName,
|
|
216
|
+
panicContext: this.panicContext,
|
|
217
|
+
stack: this.stack
|
|
270
218
|
};
|
|
271
219
|
}
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
279
|
-
mimeType: result["mimeType"] ?? "application/octet-stream",
|
|
280
|
-
metadata: metadataValue,
|
|
281
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
282
|
-
tables: Array.isArray(result["tables"]) ? result["tables"] : [],
|
|
283
|
-
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
284
|
-
detectedLanguages: Array.isArray(result["detectedLanguages"]) ? result["detectedLanguages"] : null,
|
|
285
|
-
chunks: null,
|
|
286
|
-
images: null,
|
|
287
|
-
pages: null
|
|
288
|
-
};
|
|
289
|
-
const chunksData = result["chunks"];
|
|
290
|
-
if (Array.isArray(chunksData)) {
|
|
291
|
-
returnObj.chunks = chunksData.map((chunk) => convertChunk(chunk));
|
|
220
|
+
};
|
|
221
|
+
var MissingDependencyError = class _MissingDependencyError extends KreuzbergError {
|
|
222
|
+
constructor(message, panicContext) {
|
|
223
|
+
super(message, panicContext);
|
|
224
|
+
this.name = "MissingDependencyError";
|
|
225
|
+
Object.setPrototypeOf(this, _MissingDependencyError.prototype);
|
|
292
226
|
}
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
227
|
+
};
|
|
228
|
+
|
|
229
|
+
// typescript/core/assertions.ts
|
|
230
|
+
function assertUint8Array(value, name) {
|
|
231
|
+
if (!(value instanceof Uint8Array)) {
|
|
232
|
+
throw new TypeError(`${name} must be a Uint8Array`);
|
|
296
233
|
}
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
234
|
+
return value;
|
|
235
|
+
}
|
|
236
|
+
function assertUint8ArrayList(values, name) {
|
|
237
|
+
if (!Array.isArray(values)) {
|
|
238
|
+
throw new TypeError(`${name} must be an array of Uint8Array`);
|
|
300
239
|
}
|
|
301
|
-
|
|
240
|
+
const array = values;
|
|
241
|
+
return array.map((value, index) => {
|
|
242
|
+
try {
|
|
243
|
+
return assertUint8Array(value, `${name}[${index}]`);
|
|
244
|
+
} catch {
|
|
245
|
+
throw new TypeError(`${name}[${index}] must be a Uint8Array`);
|
|
246
|
+
}
|
|
247
|
+
});
|
|
302
248
|
}
|
|
249
|
+
|
|
250
|
+
// typescript/core/config-normalizer.ts
|
|
303
251
|
function setIfDefined(target, key, value) {
|
|
304
252
|
if (value !== void 0) {
|
|
305
253
|
target[key] = value;
|
|
@@ -457,47 +405,251 @@ function normalizeKeywordConfig(config) {
|
|
|
457
405
|
setIfDefined(normalized, "rakeParams", config.rakeParams);
|
|
458
406
|
return normalized;
|
|
459
407
|
}
|
|
460
|
-
function normalizePageConfig(pages) {
|
|
461
|
-
if (!pages) {
|
|
462
|
-
return void 0;
|
|
408
|
+
function normalizePageConfig(pages) {
|
|
409
|
+
if (!pages) {
|
|
410
|
+
return void 0;
|
|
411
|
+
}
|
|
412
|
+
const normalized = {};
|
|
413
|
+
setIfDefined(normalized, "extractPages", pages.extractPages);
|
|
414
|
+
setIfDefined(normalized, "insertPageMarkers", pages.insertPageMarkers);
|
|
415
|
+
setIfDefined(normalized, "markerFormat", pages.markerFormat);
|
|
416
|
+
return normalized;
|
|
417
|
+
}
|
|
418
|
+
function normalizeExtractionConfig(config) {
|
|
419
|
+
if (!config) {
|
|
420
|
+
return null;
|
|
421
|
+
}
|
|
422
|
+
const normalized = {};
|
|
423
|
+
setIfDefined(normalized, "useCache", config.useCache);
|
|
424
|
+
setIfDefined(normalized, "enableQualityProcessing", config.enableQualityProcessing);
|
|
425
|
+
setIfDefined(normalized, "forceOcr", config.forceOcr);
|
|
426
|
+
setIfDefined(normalized, "maxConcurrentExtractions", config.maxConcurrentExtractions);
|
|
427
|
+
const ocr = normalizeOcrConfig(config.ocr);
|
|
428
|
+
setIfDefined(normalized, "ocr", ocr);
|
|
429
|
+
const chunking = normalizeChunkingConfig(config.chunking);
|
|
430
|
+
setIfDefined(normalized, "chunking", chunking);
|
|
431
|
+
const images = normalizeImageExtractionConfig(config.images);
|
|
432
|
+
setIfDefined(normalized, "images", images);
|
|
433
|
+
const pdf = normalizePdfConfig(config.pdfOptions);
|
|
434
|
+
setIfDefined(normalized, "pdfOptions", pdf);
|
|
435
|
+
const tokenReduction = normalizeTokenReductionConfig(config.tokenReduction);
|
|
436
|
+
setIfDefined(normalized, "tokenReduction", tokenReduction);
|
|
437
|
+
const languageDetection = normalizeLanguageDetectionConfig(config.languageDetection);
|
|
438
|
+
setIfDefined(normalized, "languageDetection", languageDetection);
|
|
439
|
+
const postprocessor = normalizePostProcessorConfig(config.postprocessor);
|
|
440
|
+
setIfDefined(normalized, "postprocessor", postprocessor);
|
|
441
|
+
const keywords = normalizeKeywordConfig(config.keywords);
|
|
442
|
+
setIfDefined(normalized, "keywords", keywords);
|
|
443
|
+
const pages = normalizePageConfig(config.pages);
|
|
444
|
+
setIfDefined(normalized, "pages", pages);
|
|
445
|
+
const htmlOptions = normalizeHtmlOptions(config.htmlOptions);
|
|
446
|
+
setIfDefined(normalized, "htmlOptions", htmlOptions);
|
|
447
|
+
return normalized;
|
|
448
|
+
}
|
|
449
|
+
|
|
450
|
+
// typescript/core/type-converters.ts
|
|
451
|
+
function parseMetadata(metadataStr) {
|
|
452
|
+
try {
|
|
453
|
+
const parsed = JSON.parse(metadataStr);
|
|
454
|
+
if (typeof parsed === "object" && parsed !== null) {
|
|
455
|
+
return parsed;
|
|
456
|
+
}
|
|
457
|
+
return {};
|
|
458
|
+
} catch {
|
|
459
|
+
return {};
|
|
460
|
+
}
|
|
461
|
+
}
|
|
462
|
+
function ensureUint8Array(value) {
|
|
463
|
+
if (value instanceof Uint8Array) {
|
|
464
|
+
return value;
|
|
465
|
+
}
|
|
466
|
+
if (typeof Buffer !== "undefined" && value instanceof Buffer) {
|
|
467
|
+
return new Uint8Array(value);
|
|
468
|
+
}
|
|
469
|
+
if (Array.isArray(value)) {
|
|
470
|
+
return new Uint8Array(value);
|
|
471
|
+
}
|
|
472
|
+
return new Uint8Array();
|
|
473
|
+
}
|
|
474
|
+
function convertChunk(rawChunk) {
|
|
475
|
+
if (!rawChunk || typeof rawChunk !== "object") {
|
|
476
|
+
return {
|
|
477
|
+
content: "",
|
|
478
|
+
metadata: {
|
|
479
|
+
byteStart: 0,
|
|
480
|
+
byteEnd: 0,
|
|
481
|
+
tokenCount: null,
|
|
482
|
+
chunkIndex: 0,
|
|
483
|
+
totalChunks: 0
|
|
484
|
+
},
|
|
485
|
+
embedding: null
|
|
486
|
+
};
|
|
487
|
+
}
|
|
488
|
+
const chunk = rawChunk;
|
|
489
|
+
const metadata = chunk["metadata"] ?? {};
|
|
490
|
+
return {
|
|
491
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
492
|
+
content: chunk["content"] ?? "",
|
|
493
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
494
|
+
embedding: chunk["embedding"] ?? null,
|
|
495
|
+
metadata: {
|
|
496
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
497
|
+
byteStart: metadata["byte_start"] ?? metadata["charStart"] ?? 0,
|
|
498
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
499
|
+
byteEnd: metadata["byte_end"] ?? metadata["charEnd"] ?? 0,
|
|
500
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
501
|
+
tokenCount: metadata["token_count"] ?? metadata["tokenCount"] ?? null,
|
|
502
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
503
|
+
chunkIndex: metadata["chunk_index"] ?? metadata["chunkIndex"] ?? 0,
|
|
504
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
505
|
+
totalChunks: metadata["total_chunks"] ?? metadata["totalChunks"] ?? 0,
|
|
506
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
507
|
+
firstPage: metadata["first_page"] ?? metadata["firstPage"] ?? null,
|
|
508
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
509
|
+
lastPage: metadata["last_page"] ?? metadata["lastPage"] ?? null
|
|
510
|
+
}
|
|
511
|
+
};
|
|
512
|
+
}
|
|
513
|
+
function convertImage(rawImage) {
|
|
514
|
+
if (!rawImage || typeof rawImage !== "object") {
|
|
515
|
+
return {
|
|
516
|
+
data: new Uint8Array(),
|
|
517
|
+
format: "unknown",
|
|
518
|
+
imageIndex: 0,
|
|
519
|
+
pageNumber: null,
|
|
520
|
+
width: null,
|
|
521
|
+
height: null,
|
|
522
|
+
colorspace: null,
|
|
523
|
+
bitsPerComponent: null,
|
|
524
|
+
isMask: false,
|
|
525
|
+
description: null,
|
|
526
|
+
ocrResult: null
|
|
527
|
+
};
|
|
528
|
+
}
|
|
529
|
+
const image = rawImage;
|
|
530
|
+
return {
|
|
531
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
532
|
+
data: ensureUint8Array(image["data"]),
|
|
533
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
534
|
+
format: image["format"] ?? "unknown",
|
|
535
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
536
|
+
imageIndex: image["imageIndex"] ?? 0,
|
|
537
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
538
|
+
pageNumber: image["pageNumber"] ?? null,
|
|
539
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
540
|
+
width: image["width"] ?? null,
|
|
541
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
542
|
+
height: image["height"] ?? null,
|
|
543
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
544
|
+
colorspace: image["colorspace"] ?? null,
|
|
545
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
546
|
+
bitsPerComponent: image["bitsPerComponent"] ?? null,
|
|
547
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
548
|
+
isMask: image["isMask"] ?? false,
|
|
549
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
550
|
+
description: image["description"] ?? null,
|
|
551
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
552
|
+
ocrResult: image["ocrResult"] ? convertResult(image["ocrResult"]) : null
|
|
553
|
+
};
|
|
554
|
+
}
|
|
555
|
+
function convertPageContent(rawPage) {
|
|
556
|
+
if (!rawPage || typeof rawPage !== "object") {
|
|
557
|
+
return {
|
|
558
|
+
pageNumber: 0,
|
|
559
|
+
content: "",
|
|
560
|
+
tables: [],
|
|
561
|
+
images: []
|
|
562
|
+
};
|
|
563
|
+
}
|
|
564
|
+
const page = rawPage;
|
|
565
|
+
return {
|
|
566
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
567
|
+
pageNumber: page["pageNumber"] ?? 0,
|
|
568
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
569
|
+
content: page["content"] ?? "",
|
|
570
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
571
|
+
tables: Array.isArray(page["tables"]) ? page["tables"] : [],
|
|
572
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
573
|
+
images: Array.isArray(page["images"]) ? page["images"].map((image) => convertImage(image)) : []
|
|
574
|
+
};
|
|
575
|
+
}
|
|
576
|
+
function convertResult(rawResult) {
|
|
577
|
+
if (!rawResult || typeof rawResult !== "object") {
|
|
578
|
+
return {
|
|
579
|
+
content: "",
|
|
580
|
+
mimeType: "application/octet-stream",
|
|
581
|
+
metadata: {},
|
|
582
|
+
tables: [],
|
|
583
|
+
detectedLanguages: null,
|
|
584
|
+
chunks: null,
|
|
585
|
+
images: null,
|
|
586
|
+
pages: null
|
|
587
|
+
};
|
|
588
|
+
}
|
|
589
|
+
const result = rawResult;
|
|
590
|
+
const metadata = result["metadata"];
|
|
591
|
+
const metadataValue = typeof metadata === "string" ? parseMetadata(metadata) : metadata ?? {};
|
|
592
|
+
const returnObj = {
|
|
593
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
594
|
+
content: result["content"] ?? "",
|
|
595
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
596
|
+
mimeType: result["mimeType"] ?? "application/octet-stream",
|
|
597
|
+
metadata: metadataValue,
|
|
598
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
599
|
+
tables: Array.isArray(result["tables"]) ? result["tables"] : [],
|
|
600
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
601
|
+
detectedLanguages: Array.isArray(result["detectedLanguages"]) ? result["detectedLanguages"] : null,
|
|
602
|
+
chunks: null,
|
|
603
|
+
images: null,
|
|
604
|
+
pages: null
|
|
605
|
+
};
|
|
606
|
+
const chunksData = result["chunks"];
|
|
607
|
+
if (Array.isArray(chunksData)) {
|
|
608
|
+
returnObj.chunks = chunksData.map((chunk) => convertChunk(chunk));
|
|
609
|
+
}
|
|
610
|
+
const imagesData = result["images"];
|
|
611
|
+
if (Array.isArray(imagesData)) {
|
|
612
|
+
returnObj.images = imagesData.map((image) => convertImage(image));
|
|
613
|
+
}
|
|
614
|
+
const pagesData = result["pages"];
|
|
615
|
+
if (Array.isArray(pagesData)) {
|
|
616
|
+
returnObj.pages = pagesData.map((page) => convertPageContent(page));
|
|
617
|
+
}
|
|
618
|
+
return returnObj;
|
|
619
|
+
}
|
|
620
|
+
|
|
621
|
+
// typescript/extraction/batch.ts
|
|
622
|
+
function batchExtractFilesSync(paths, config = null) {
|
|
623
|
+
const normalizedConfig = normalizeExtractionConfig(config);
|
|
624
|
+
const rawResults = getBinding().batchExtractFilesSync(paths, normalizedConfig);
|
|
625
|
+
return rawResults.map(convertResult);
|
|
626
|
+
}
|
|
627
|
+
async function batchExtractFiles(paths, config = null) {
|
|
628
|
+
const normalizedConfig = normalizeExtractionConfig(config);
|
|
629
|
+
const rawResults = await getBinding().batchExtractFiles(paths, normalizedConfig);
|
|
630
|
+
return rawResults.map(convertResult);
|
|
631
|
+
}
|
|
632
|
+
function batchExtractBytesSync(dataList, mimeTypes, config = null) {
|
|
633
|
+
const buffers = assertUint8ArrayList(dataList, "dataList").map((data) => Buffer.from(data));
|
|
634
|
+
if (buffers.length !== mimeTypes.length) {
|
|
635
|
+
throw new TypeError("dataList and mimeTypes must have the same length");
|
|
463
636
|
}
|
|
464
|
-
const
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
setIfDefined(normalized, "markerFormat", pages.markerFormat);
|
|
468
|
-
return normalized;
|
|
637
|
+
const normalizedConfig = normalizeExtractionConfig(config);
|
|
638
|
+
const rawResults = getBinding().batchExtractBytesSync(buffers, mimeTypes, normalizedConfig);
|
|
639
|
+
return rawResults.map(convertResult);
|
|
469
640
|
}
|
|
470
|
-
function
|
|
471
|
-
|
|
472
|
-
|
|
641
|
+
async function batchExtractBytes(dataList, mimeTypes, config = null) {
|
|
642
|
+
const buffers = assertUint8ArrayList(dataList, "dataList").map((data) => Buffer.from(data));
|
|
643
|
+
if (buffers.length !== mimeTypes.length) {
|
|
644
|
+
throw new TypeError("dataList and mimeTypes must have the same length");
|
|
473
645
|
}
|
|
474
|
-
const
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
setIfDefined(normalized, "forceOcr", config.forceOcr);
|
|
478
|
-
setIfDefined(normalized, "maxConcurrentExtractions", config.maxConcurrentExtractions);
|
|
479
|
-
const ocr = normalizeOcrConfig(config.ocr);
|
|
480
|
-
setIfDefined(normalized, "ocr", ocr);
|
|
481
|
-
const chunking = normalizeChunkingConfig(config.chunking);
|
|
482
|
-
setIfDefined(normalized, "chunking", chunking);
|
|
483
|
-
const images = normalizeImageExtractionConfig(config.images);
|
|
484
|
-
setIfDefined(normalized, "images", images);
|
|
485
|
-
const pdf = normalizePdfConfig(config.pdfOptions);
|
|
486
|
-
setIfDefined(normalized, "pdfOptions", pdf);
|
|
487
|
-
const tokenReduction = normalizeTokenReductionConfig(config.tokenReduction);
|
|
488
|
-
setIfDefined(normalized, "tokenReduction", tokenReduction);
|
|
489
|
-
const languageDetection = normalizeLanguageDetectionConfig(config.languageDetection);
|
|
490
|
-
setIfDefined(normalized, "languageDetection", languageDetection);
|
|
491
|
-
const postprocessor = normalizePostProcessorConfig(config.postprocessor);
|
|
492
|
-
setIfDefined(normalized, "postprocessor", postprocessor);
|
|
493
|
-
const keywords = normalizeKeywordConfig(config.keywords);
|
|
494
|
-
setIfDefined(normalized, "keywords", keywords);
|
|
495
|
-
const pages = normalizePageConfig(config.pages);
|
|
496
|
-
setIfDefined(normalized, "pages", pages);
|
|
497
|
-
const htmlOptions = normalizeHtmlOptions(config.htmlOptions);
|
|
498
|
-
setIfDefined(normalized, "htmlOptions", htmlOptions);
|
|
499
|
-
return normalized;
|
|
646
|
+
const normalizedConfig = normalizeExtractionConfig(config);
|
|
647
|
+
const rawResults = await getBinding().batchExtractBytes(buffers, mimeTypes, normalizedConfig);
|
|
648
|
+
return rawResults.map(convertResult);
|
|
500
649
|
}
|
|
650
|
+
|
|
651
|
+
// typescript/extraction/single.ts
|
|
652
|
+
import { readFileSync } from "fs";
|
|
501
653
|
function extractFileSync(filePath, mimeTypeOrConfig, maybeConfig) {
|
|
502
654
|
let mimeType = null;
|
|
503
655
|
let config = null;
|
|
@@ -559,34 +711,57 @@ async function extractBytes(dataOrPath, mimeType, config = null) {
|
|
|
559
711
|
const rawResult = await getBinding().extractBytes(Buffer.from(validated), mimeType, normalizedConfig);
|
|
560
712
|
return convertResult(rawResult);
|
|
561
713
|
}
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
714
|
+
|
|
715
|
+
// typescript/extraction/worker-pool.ts
|
|
716
|
+
function createWorkerPool(size) {
|
|
717
|
+
const binding2 = getBinding();
|
|
718
|
+
const rawPool = binding2.createWorkerPool(size);
|
|
719
|
+
return rawPool;
|
|
566
720
|
}
|
|
567
|
-
|
|
568
|
-
const
|
|
569
|
-
const
|
|
570
|
-
return
|
|
721
|
+
function getWorkerPoolStats(pool) {
|
|
722
|
+
const binding2 = getBinding();
|
|
723
|
+
const rawStats = binding2.getWorkerPoolStats(pool);
|
|
724
|
+
return rawStats;
|
|
571
725
|
}
|
|
572
|
-
function
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
726
|
+
async function extractFileInWorker(pool, filePath, mimeTypeOrConfig, maybeConfig) {
|
|
727
|
+
let mimeType = null;
|
|
728
|
+
let config = null;
|
|
729
|
+
if (typeof mimeTypeOrConfig === "string") {
|
|
730
|
+
mimeType = mimeTypeOrConfig;
|
|
731
|
+
config = maybeConfig ?? null;
|
|
732
|
+
} else if (mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object") {
|
|
733
|
+
config = mimeTypeOrConfig;
|
|
734
|
+
mimeType = null;
|
|
735
|
+
} else {
|
|
736
|
+
config = maybeConfig ?? null;
|
|
737
|
+
mimeType = null;
|
|
576
738
|
}
|
|
577
739
|
const normalizedConfig = normalizeExtractionConfig(config);
|
|
578
|
-
const
|
|
579
|
-
|
|
740
|
+
const binding2 = getBinding();
|
|
741
|
+
const rawResult = await binding2.extractFileInWorker(
|
|
742
|
+
pool,
|
|
743
|
+
filePath,
|
|
744
|
+
mimeType,
|
|
745
|
+
normalizedConfig
|
|
746
|
+
);
|
|
747
|
+
return convertResult(rawResult);
|
|
580
748
|
}
|
|
581
|
-
async function
|
|
582
|
-
const buffers = assertUint8ArrayList(dataList, "dataList").map((data) => Buffer.from(data));
|
|
583
|
-
if (buffers.length !== mimeTypes.length) {
|
|
584
|
-
throw new TypeError("dataList and mimeTypes must have the same length");
|
|
585
|
-
}
|
|
749
|
+
async function batchExtractFilesInWorker(pool, paths, config = null) {
|
|
586
750
|
const normalizedConfig = normalizeExtractionConfig(config);
|
|
587
|
-
const
|
|
751
|
+
const binding2 = getBinding();
|
|
752
|
+
const rawResults = await binding2.batchExtractFilesInWorker(
|
|
753
|
+
pool,
|
|
754
|
+
paths,
|
|
755
|
+
normalizedConfig
|
|
756
|
+
);
|
|
588
757
|
return rawResults.map(convertResult);
|
|
589
758
|
}
|
|
759
|
+
async function closeWorkerPool(pool) {
|
|
760
|
+
const binding2 = getBinding();
|
|
761
|
+
await binding2.closeWorkerPool(pool);
|
|
762
|
+
}
|
|
763
|
+
|
|
764
|
+
// typescript/plugins/post-processors.ts
|
|
590
765
|
function registerPostProcessor(processor) {
|
|
591
766
|
const binding2 = getBinding();
|
|
592
767
|
const wrappedProcessor = {
|
|
@@ -641,6 +816,8 @@ function listPostProcessors() {
|
|
|
641
816
|
const binding2 = getBinding();
|
|
642
817
|
return binding2.listPostProcessors();
|
|
643
818
|
}
|
|
819
|
+
|
|
820
|
+
// typescript/plugins/validators.ts
|
|
644
821
|
function registerValidator(validator) {
|
|
645
822
|
const binding2 = getBinding();
|
|
646
823
|
const wrappedValidator = {
|
|
@@ -679,6 +856,204 @@ function listValidators() {
|
|
|
679
856
|
const binding2 = getBinding();
|
|
680
857
|
return binding2.listValidators();
|
|
681
858
|
}
|
|
859
|
+
|
|
860
|
+
// typescript/ocr/guten-ocr.ts
|
|
861
|
+
var GutenOcrBackend = class {
|
|
862
|
+
ocr = null;
|
|
863
|
+
ocrModule = null;
|
|
864
|
+
options;
|
|
865
|
+
/**
|
|
866
|
+
* Create a new Guten OCR backend.
|
|
867
|
+
*
|
|
868
|
+
* @param options - Optional configuration for Guten OCR
|
|
869
|
+
* @param options.models - Custom model paths (default: uses bundled models)
|
|
870
|
+
* @param options.isDebug - Enable debug mode (default: false)
|
|
871
|
+
* @param options.debugOutputDir - Directory for debug output (default: undefined)
|
|
872
|
+
* @param options.onnxOptions - Custom ONNX Runtime options (default: undefined)
|
|
873
|
+
*
|
|
874
|
+
* @example
|
|
875
|
+
* ```typescript
|
|
876
|
+
* // Default configuration
|
|
877
|
+
* const backend = new GutenOcrBackend();
|
|
878
|
+
*
|
|
879
|
+
* // With debug enabled
|
|
880
|
+
* const debugBackend = new GutenOcrBackend({
|
|
881
|
+
* isDebug: true,
|
|
882
|
+
* debugOutputDir: './ocr_debug'
|
|
883
|
+
* });
|
|
884
|
+
* ```
|
|
885
|
+
*/
|
|
886
|
+
constructor(options) {
|
|
887
|
+
if (options !== void 0) {
|
|
888
|
+
this.options = options;
|
|
889
|
+
}
|
|
890
|
+
}
|
|
891
|
+
/**
|
|
892
|
+
* Get the backend name.
|
|
893
|
+
*
|
|
894
|
+
* @returns Backend name ("guten-ocr")
|
|
895
|
+
*/
|
|
896
|
+
name() {
|
|
897
|
+
return "guten-ocr";
|
|
898
|
+
}
|
|
899
|
+
/**
|
|
900
|
+
* Get list of supported language codes.
|
|
901
|
+
*
|
|
902
|
+
* Guten OCR supports multiple languages depending on the model configuration.
|
|
903
|
+
* The default models support English and Chinese.
|
|
904
|
+
*
|
|
905
|
+
* @returns Array of ISO 639-1/2 language codes
|
|
906
|
+
*/
|
|
907
|
+
supportedLanguages() {
|
|
908
|
+
return ["en", "eng", "ch_sim", "ch_tra", "chinese"];
|
|
909
|
+
}
|
|
910
|
+
/**
|
|
911
|
+
* Initialize the OCR backend.
|
|
912
|
+
*
|
|
913
|
+
* This method loads the Guten OCR module and creates an OCR instance.
|
|
914
|
+
* Call this before using processImage().
|
|
915
|
+
*
|
|
916
|
+
* @throws {Error} If @gutenye/ocr-node is not installed
|
|
917
|
+
* @throws {Error} If OCR initialization fails
|
|
918
|
+
*
|
|
919
|
+
* @example
|
|
920
|
+
* ```typescript
|
|
921
|
+
* const backend = new GutenOcrBackend();
|
|
922
|
+
* await backend.initialize();
|
|
923
|
+
* ```
|
|
924
|
+
*/
|
|
925
|
+
async initialize() {
|
|
926
|
+
if (this.ocr !== null) {
|
|
927
|
+
return;
|
|
928
|
+
}
|
|
929
|
+
try {
|
|
930
|
+
this.ocrModule = await import("@gutenye/ocr-node").then((m) => m.default || m);
|
|
931
|
+
} catch (e) {
|
|
932
|
+
const error = e;
|
|
933
|
+
throw new Error(
|
|
934
|
+
`Guten OCR support requires the '@gutenye/ocr-node' package. Install with: npm install @gutenye/ocr-node. Error: ${error.message}`
|
|
935
|
+
);
|
|
936
|
+
}
|
|
937
|
+
try {
|
|
938
|
+
this.ocr = await this.ocrModule?.create(this.options) ?? null;
|
|
939
|
+
} catch (e) {
|
|
940
|
+
const error = e;
|
|
941
|
+
throw new Error(`Failed to initialize Guten OCR: ${error.message}`);
|
|
942
|
+
}
|
|
943
|
+
}
|
|
944
|
+
/**
|
|
945
|
+
* Shutdown the backend and release resources.
|
|
946
|
+
*
|
|
947
|
+
* This method cleans up all resources associated with the backend,
|
|
948
|
+
* including the GutenOCR instance and module references.
|
|
949
|
+
*
|
|
950
|
+
* @example
|
|
951
|
+
* ```typescript
|
|
952
|
+
* const backend = new GutenOcrBackend();
|
|
953
|
+
* await backend.initialize();
|
|
954
|
+
* // ... use backend ...
|
|
955
|
+
* await backend.shutdown();
|
|
956
|
+
* ```
|
|
957
|
+
*/
|
|
958
|
+
async shutdown() {
|
|
959
|
+
if (this.ocr !== null) {
|
|
960
|
+
this.ocr = null;
|
|
961
|
+
}
|
|
962
|
+
if (this.ocrModule !== null) {
|
|
963
|
+
this.ocrModule = null;
|
|
964
|
+
}
|
|
965
|
+
}
|
|
966
|
+
/**
|
|
967
|
+
* Process image bytes and extract text using Guten OCR.
|
|
968
|
+
*
|
|
969
|
+
* This method:
|
|
970
|
+
* 1. Decodes the image using sharp (if pixel data is needed) or passes bytes directly
|
|
971
|
+
* 2. Runs OCR detection to find text regions
|
|
972
|
+
* 3. Runs OCR recognition on each text region
|
|
973
|
+
* 4. Returns extracted text with metadata
|
|
974
|
+
*
|
|
975
|
+
* @param imageBytes - Raw image data (PNG, JPEG, TIFF, etc.)
|
|
976
|
+
* @param language - Language code (must be in supportedLanguages())
|
|
977
|
+
* @returns Promise resolving to OCR result with content and metadata
|
|
978
|
+
*
|
|
979
|
+
* @throws {Error} If backend is not initialized
|
|
980
|
+
* @throws {Error} If OCR processing fails
|
|
981
|
+
*
|
|
982
|
+
* @example
|
|
983
|
+
* ```typescript
|
|
984
|
+
* import { readFile } from 'fs/promises';
|
|
985
|
+
*
|
|
986
|
+
* const backend = new GutenOcrBackend();
|
|
987
|
+
* await backend.initialize();
|
|
988
|
+
*
|
|
989
|
+
* const imageBytes = await readFile('scanned.png');
|
|
990
|
+
* const result = await backend.processImage(imageBytes, 'en');
|
|
991
|
+
* console.log(result.content);
|
|
992
|
+
* console.log(result.metadata.confidence);
|
|
993
|
+
* ```
|
|
994
|
+
*/
|
|
995
|
+
async processImage(imageBytes, language) {
|
|
996
|
+
if (this.ocr === null) {
|
|
997
|
+
await this.initialize();
|
|
998
|
+
}
|
|
999
|
+
if (this.ocr === null) {
|
|
1000
|
+
throw new Error("Guten OCR backend failed to initialize");
|
|
1001
|
+
}
|
|
1002
|
+
try {
|
|
1003
|
+
const buffer = typeof imageBytes === "string" ? Buffer.from(imageBytes, "base64") : Buffer.from(imageBytes);
|
|
1004
|
+
const debugEnv = process.env["KREUZBERG_DEBUG_GUTEN"];
|
|
1005
|
+
if (debugEnv === "1") {
|
|
1006
|
+
const header = Array.from(buffer.subarray(0, 8));
|
|
1007
|
+
console.log("[Guten OCR] Debug input header:", header);
|
|
1008
|
+
console.log(
|
|
1009
|
+
"[Guten OCR] Buffer?",
|
|
1010
|
+
Buffer.isBuffer(buffer),
|
|
1011
|
+
"constructor",
|
|
1012
|
+
imageBytes?.constructor?.name,
|
|
1013
|
+
"length",
|
|
1014
|
+
buffer.length,
|
|
1015
|
+
"type",
|
|
1016
|
+
typeof imageBytes
|
|
1017
|
+
);
|
|
1018
|
+
}
|
|
1019
|
+
let width = 0;
|
|
1020
|
+
let height = 0;
|
|
1021
|
+
try {
|
|
1022
|
+
const sharpModule = await import("sharp");
|
|
1023
|
+
const sharp = sharpModule.default || sharpModule;
|
|
1024
|
+
const image = sharp(buffer);
|
|
1025
|
+
const metadata = await image.metadata();
|
|
1026
|
+
const metadataRecord = metadata;
|
|
1027
|
+
width = metadataRecord["width"] ?? 0;
|
|
1028
|
+
height = metadataRecord["height"] ?? 0;
|
|
1029
|
+
} catch (metadataError) {
|
|
1030
|
+
const error = metadataError;
|
|
1031
|
+
console.warn(`[Guten OCR] Unable to read image metadata via sharp: ${error.message}`);
|
|
1032
|
+
}
|
|
1033
|
+
const result = await this.ocr.detect(buffer);
|
|
1034
|
+
const textLines = result.map((line) => line.text);
|
|
1035
|
+
const content = textLines.join("\n");
|
|
1036
|
+
const avgConfidence = result.length > 0 ? result.reduce((sum, line) => sum + line.mean, 0) / result.length : 0;
|
|
1037
|
+
return {
|
|
1038
|
+
content,
|
|
1039
|
+
mime_type: "text/plain",
|
|
1040
|
+
metadata: {
|
|
1041
|
+
width,
|
|
1042
|
+
height,
|
|
1043
|
+
confidence: avgConfidence,
|
|
1044
|
+
text_regions: result.length,
|
|
1045
|
+
language
|
|
1046
|
+
},
|
|
1047
|
+
tables: []
|
|
1048
|
+
};
|
|
1049
|
+
} catch (e) {
|
|
1050
|
+
const error = e;
|
|
1051
|
+
throw new Error(`Guten OCR processing failed: ${error.message}`);
|
|
1052
|
+
}
|
|
1053
|
+
}
|
|
1054
|
+
};
|
|
1055
|
+
|
|
1056
|
+
// typescript/plugins/ocr-backends.ts
|
|
682
1057
|
function isOcrProcessTuple(value) {
|
|
683
1058
|
return Array.isArray(value) && value.length === 2 && typeof value[1] === "string" && (typeof value[0] === "string" || Buffer.isBuffer(value[0]) || value[0] instanceof Uint8Array);
|
|
684
1059
|
}
|
|
@@ -748,6 +1123,8 @@ function clearOcrBackends() {
|
|
|
748
1123
|
const binding2 = getBinding();
|
|
749
1124
|
binding2.clearOcrBackends();
|
|
750
1125
|
}
|
|
1126
|
+
|
|
1127
|
+
// typescript/registry/document-extractors.ts
|
|
751
1128
|
function listDocumentExtractors() {
|
|
752
1129
|
const binding2 = getBinding();
|
|
753
1130
|
return binding2.listDocumentExtractors();
|
|
@@ -760,7 +1137,9 @@ function clearDocumentExtractors() {
|
|
|
760
1137
|
const binding2 = getBinding();
|
|
761
1138
|
binding2.clearDocumentExtractors();
|
|
762
1139
|
}
|
|
763
|
-
|
|
1140
|
+
|
|
1141
|
+
// typescript/config/loader.ts
|
|
1142
|
+
var ExtractionConfig = {
|
|
764
1143
|
/**
|
|
765
1144
|
* Load extraction configuration from a file.
|
|
766
1145
|
*
|
|
@@ -822,6 +1201,18 @@ const ExtractionConfig = {
|
|
|
822
1201
|
return binding2.discoverExtractionConfig();
|
|
823
1202
|
}
|
|
824
1203
|
};
|
|
1204
|
+
function loadConfigFile(filePath) {
|
|
1205
|
+
return ExtractionConfig.fromFile(filePath);
|
|
1206
|
+
}
|
|
1207
|
+
function loadConfigFromPath(path) {
|
|
1208
|
+
try {
|
|
1209
|
+
return ExtractionConfig.fromFile(path);
|
|
1210
|
+
} catch {
|
|
1211
|
+
return ExtractionConfig.discover();
|
|
1212
|
+
}
|
|
1213
|
+
}
|
|
1214
|
+
|
|
1215
|
+
// typescript/mime/utilities.ts
|
|
825
1216
|
function detectMimeType(bytes) {
|
|
826
1217
|
const binding2 = getBinding();
|
|
827
1218
|
return binding2.detectMimeTypeFromBytes(bytes);
|
|
@@ -838,6 +1229,8 @@ function getExtensionsForMime(mimeType) {
|
|
|
838
1229
|
const binding2 = getBinding();
|
|
839
1230
|
return binding2.getExtensionsForMime(mimeType);
|
|
840
1231
|
}
|
|
1232
|
+
|
|
1233
|
+
// typescript/embeddings/presets.ts
|
|
841
1234
|
function listEmbeddingPresets() {
|
|
842
1235
|
const binding2 = getBinding();
|
|
843
1236
|
return binding2.listEmbeddingPresets();
|
|
@@ -847,76 +1240,9 @@ function getEmbeddingPreset(name) {
|
|
|
847
1240
|
const result = binding2.getEmbeddingPreset(name);
|
|
848
1241
|
return result;
|
|
849
1242
|
}
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
}
|
|
854
|
-
function getLastPanicContext() {
|
|
855
|
-
const binding2 = getBinding();
|
|
856
|
-
const result = binding2.getLastPanicContext();
|
|
857
|
-
return result;
|
|
858
|
-
}
|
|
859
|
-
function getErrorCodeName(code) {
|
|
860
|
-
const binding2 = getBinding();
|
|
861
|
-
return binding2.getErrorCodeName(code);
|
|
862
|
-
}
|
|
863
|
-
function getErrorCodeDescription(code) {
|
|
864
|
-
const binding2 = getBinding();
|
|
865
|
-
return binding2.getErrorCodeDescription(code);
|
|
866
|
-
}
|
|
867
|
-
function classifyError(errorMessage) {
|
|
868
|
-
const binding2 = getBinding();
|
|
869
|
-
const result = binding2.classifyError(errorMessage);
|
|
870
|
-
return result;
|
|
871
|
-
}
|
|
872
|
-
function createWorkerPool(size) {
|
|
873
|
-
const binding2 = getBinding();
|
|
874
|
-
const rawPool = binding2.createWorkerPool(size);
|
|
875
|
-
return rawPool;
|
|
876
|
-
}
|
|
877
|
-
function getWorkerPoolStats(pool) {
|
|
878
|
-
const binding2 = getBinding();
|
|
879
|
-
const rawStats = binding2.getWorkerPoolStats(pool);
|
|
880
|
-
return rawStats;
|
|
881
|
-
}
|
|
882
|
-
async function extractFileInWorker(pool, filePath, mimeTypeOrConfig, maybeConfig) {
|
|
883
|
-
let mimeType = null;
|
|
884
|
-
let config = null;
|
|
885
|
-
if (typeof mimeTypeOrConfig === "string") {
|
|
886
|
-
mimeType = mimeTypeOrConfig;
|
|
887
|
-
config = maybeConfig ?? null;
|
|
888
|
-
} else if (mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object") {
|
|
889
|
-
config = mimeTypeOrConfig;
|
|
890
|
-
mimeType = null;
|
|
891
|
-
} else {
|
|
892
|
-
config = maybeConfig ?? null;
|
|
893
|
-
mimeType = null;
|
|
894
|
-
}
|
|
895
|
-
const normalizedConfig = normalizeExtractionConfig(config);
|
|
896
|
-
const binding2 = getBinding();
|
|
897
|
-
const rawResult = await binding2.extractFileInWorker(
|
|
898
|
-
pool,
|
|
899
|
-
filePath,
|
|
900
|
-
mimeType,
|
|
901
|
-
normalizedConfig
|
|
902
|
-
);
|
|
903
|
-
return convertResult(rawResult);
|
|
904
|
-
}
|
|
905
|
-
async function batchExtractFilesInWorker(pool, paths, config = null) {
|
|
906
|
-
const normalizedConfig = normalizeExtractionConfig(config);
|
|
907
|
-
const binding2 = getBinding();
|
|
908
|
-
const rawResults = await binding2.batchExtractFilesInWorker(
|
|
909
|
-
pool,
|
|
910
|
-
paths,
|
|
911
|
-
normalizedConfig
|
|
912
|
-
);
|
|
913
|
-
return rawResults.map(convertResult);
|
|
914
|
-
}
|
|
915
|
-
async function closeWorkerPool(pool) {
|
|
916
|
-
const binding2 = getBinding();
|
|
917
|
-
await binding2.closeWorkerPool(pool);
|
|
918
|
-
}
|
|
919
|
-
const __version__ = "4.0.8";
|
|
1243
|
+
|
|
1244
|
+
// typescript/index.ts
|
|
1245
|
+
var __version__ = "4.1.0";
|
|
920
1246
|
export {
|
|
921
1247
|
CacheError,
|
|
922
1248
|
ErrorCode,
|
|
@@ -963,6 +1289,8 @@ export {
|
|
|
963
1289
|
listOcrBackends,
|
|
964
1290
|
listPostProcessors,
|
|
965
1291
|
listValidators,
|
|
1292
|
+
loadConfigFile,
|
|
1293
|
+
loadConfigFromPath,
|
|
966
1294
|
registerOcrBackend,
|
|
967
1295
|
registerPostProcessor,
|
|
968
1296
|
registerValidator,
|