@kreuzberg/node 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,7 +1,9 @@
1
1
  "use strict";
2
+ var __create = Object.create;
2
3
  var __defProp = Object.defineProperty;
3
4
  var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
5
  var __getOwnPropNames = Object.getOwnPropertyNames;
6
+ var __getProtoOf = Object.getPrototypeOf;
5
7
  var __hasOwnProp = Object.prototype.hasOwnProperty;
6
8
  var __export = (target, all) => {
7
9
  for (var name in all)
@@ -15,21 +17,30 @@ var __copyProps = (to, from, except, desc) => {
15
17
  }
16
18
  return to;
17
19
  };
18
- var __reExport = (target, mod, secondTarget) => (__copyProps(target, mod, "default"), secondTarget && __copyProps(secondTarget, mod, "default"));
20
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
21
+ // If the importer is in node compatibility mode or this is not an ESM
22
+ // file that has been converted to a CommonJS file using a Babel-
23
+ // compatible transform (i.e. "__esModule" has not been set), then set
24
+ // "default" to the CommonJS "module.exports" for node compatibility.
25
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
26
+ mod
27
+ ));
19
28
  var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
29
+
30
+ // typescript/index.ts
20
31
  var index_exports = {};
21
32
  __export(index_exports, {
22
- CacheError: () => import_errors.CacheError,
23
- ErrorCode: () => import_errors.ErrorCode,
33
+ CacheError: () => CacheError,
34
+ ErrorCode: () => ErrorCode,
24
35
  ExtractionConfig: () => ExtractionConfig,
25
- GutenOcrBackend: () => import_guten_ocr.GutenOcrBackend,
26
- ImageProcessingError: () => import_errors.ImageProcessingError,
27
- KreuzbergError: () => import_errors.KreuzbergError,
28
- MissingDependencyError: () => import_errors.MissingDependencyError,
29
- OcrError: () => import_errors.OcrError,
30
- ParsingError: () => import_errors.ParsingError,
31
- PluginError: () => import_errors.PluginError,
32
- ValidationError: () => import_errors.ValidationError,
36
+ GutenOcrBackend: () => GutenOcrBackend,
37
+ ImageProcessingError: () => ImageProcessingError,
38
+ KreuzbergError: () => KreuzbergError,
39
+ MissingDependencyError: () => MissingDependencyError,
40
+ OcrError: () => OcrError,
41
+ ParsingError: () => ParsingError,
42
+ PluginError: () => PluginError,
43
+ ValidationError: () => ValidationError,
33
44
  __resetBindingForTests: () => __resetBindingForTests,
34
45
  __setBindingForTests: () => __setBindingForTests,
35
46
  __version__: () => __version__,
@@ -64,6 +75,8 @@ __export(index_exports, {
64
75
  listOcrBackends: () => listOcrBackends,
65
76
  listPostProcessors: () => listPostProcessors,
66
77
  listValidators: () => listValidators,
78
+ loadConfigFile: () => loadConfigFile,
79
+ loadConfigFromPath: () => loadConfigFromPath,
67
80
  registerOcrBackend: () => registerOcrBackend,
68
81
  registerPostProcessor: () => registerPostProcessor,
69
82
  registerValidator: () => registerValidator,
@@ -74,14 +87,12 @@ __export(index_exports, {
74
87
  validateMimeType: () => validateMimeType
75
88
  });
76
89
  module.exports = __toCommonJS(index_exports);
77
- var import_node_fs = require("node:fs");
78
- var import_node_module = require("node:module");
79
- var import_errors = require("./errors.js");
80
- var import_guten_ocr = require("./ocr/guten-ocr.js");
81
- __reExport(index_exports, require("./types.js"), module.exports);
82
- const import_meta = {};
83
- let binding = null;
84
- let bindingInitialized = false;
90
+
91
+ // typescript/core/binding.ts
92
+ var import_node_module = require("module");
93
+ var import_meta = {};
94
+ var binding = null;
95
+ var bindingInitialized = false;
85
96
  function createNativeBindingError(error) {
86
97
  const hintParts = [];
87
98
  let detail = "Unknown error while requiring native module.";
@@ -110,42 +121,13 @@ function createNativeBindingError(error) {
110
121
  ].join(" ")
111
122
  );
112
123
  }
113
- function assertUint8Array(value, name) {
114
- if (!(value instanceof Uint8Array)) {
115
- throw new TypeError(`${name} must be a Uint8Array`);
116
- }
117
- return value;
118
- }
119
- function assertUint8ArrayList(values, name) {
120
- if (!Array.isArray(values)) {
121
- throw new TypeError(`${name} must be an array of Uint8Array`);
122
- }
123
- const array = values;
124
- return array.map((value, index) => {
125
- try {
126
- return assertUint8Array(value, `${name}[${index}]`);
127
- } catch {
128
- throw new TypeError(`${name}[${index}] must be a Uint8Array`);
129
- }
130
- });
131
- }
132
- function __setBindingForTests(mock) {
133
- binding = mock;
134
- bindingInitialized = true;
135
- }
136
- function __resetBindingForTests() {
137
- binding = null;
138
- bindingInitialized = false;
139
- }
140
124
  function loadNativeBinding() {
141
125
  let localRequire;
142
- if (typeof require !== "undefined") {
143
- localRequire = require;
144
- } else {
145
- try {
146
- localRequire = (0, import_node_module.createRequire)(import_meta.url);
147
- } catch {
148
- localRequire = void 0;
126
+ try {
127
+ localRequire = (0, import_node_module.createRequire)(import_meta.url);
128
+ } catch {
129
+ if (typeof require !== "undefined") {
130
+ localRequire = require;
149
131
  }
150
132
  }
151
133
  if (!localRequire) {
@@ -198,175 +180,158 @@ function getBinding() {
198
180
  "Failed to load Kreuzberg bindings. Neither NAPI (Node.js) nor WASM (browsers/Deno) bindings are available. Make sure you have installed the @kreuzberg/node package for Node.js/Bun."
199
181
  );
200
182
  }
201
- function parseMetadata(metadataStr) {
202
- try {
203
- const parsed = JSON.parse(metadataStr);
204
- if (typeof parsed === "object" && parsed !== null) {
205
- return parsed;
206
- }
207
- return {};
208
- } catch {
209
- return {};
210
- }
183
+ function __setBindingForTests(mock) {
184
+ binding = mock;
185
+ bindingInitialized = true;
211
186
  }
212
- function ensureUint8Array(value) {
213
- if (value instanceof Uint8Array) {
214
- return value;
215
- }
216
- if (typeof Buffer !== "undefined" && value instanceof Buffer) {
217
- return new Uint8Array(value);
218
- }
219
- if (Array.isArray(value)) {
220
- return new Uint8Array(value);
221
- }
222
- return new Uint8Array();
187
+ function __resetBindingForTests() {
188
+ binding = null;
189
+ bindingInitialized = false;
223
190
  }
224
- function convertChunk(rawChunk) {
225
- if (!rawChunk || typeof rawChunk !== "object") {
226
- return {
227
- content: "",
228
- metadata: {
229
- byteStart: 0,
230
- byteEnd: 0,
231
- tokenCount: null,
232
- chunkIndex: 0,
233
- totalChunks: 0
234
- },
235
- embedding: null
236
- };
237
- }
238
- const chunk = rawChunk;
239
- const metadata = chunk["metadata"] ?? {};
240
- return {
241
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
242
- content: chunk["content"] ?? "",
243
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
244
- embedding: chunk["embedding"] ?? null,
245
- metadata: {
246
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
247
- byteStart: metadata["byte_start"] ?? metadata["charStart"] ?? 0,
248
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
249
- byteEnd: metadata["byte_end"] ?? metadata["charEnd"] ?? 0,
250
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
251
- tokenCount: metadata["token_count"] ?? metadata["tokenCount"] ?? null,
252
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
253
- chunkIndex: metadata["chunk_index"] ?? metadata["chunkIndex"] ?? 0,
254
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
255
- totalChunks: metadata["total_chunks"] ?? metadata["totalChunks"] ?? 0,
256
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
257
- firstPage: metadata["first_page"] ?? metadata["firstPage"] ?? null,
258
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
259
- lastPage: metadata["last_page"] ?? metadata["lastPage"] ?? null
260
- }
261
- };
191
+
192
+ // typescript/errors/diagnostics.ts
193
+ function getLastErrorCode() {
194
+ const binding2 = getBinding();
195
+ return binding2.getLastErrorCode();
262
196
  }
263
- function convertImage(rawImage) {
264
- if (!rawImage || typeof rawImage !== "object") {
265
- return {
266
- data: new Uint8Array(),
267
- format: "unknown",
268
- imageIndex: 0,
269
- pageNumber: null,
270
- width: null,
271
- height: null,
272
- colorspace: null,
273
- bitsPerComponent: null,
274
- isMask: false,
275
- description: null,
276
- ocrResult: null
277
- };
278
- }
279
- const image = rawImage;
280
- return {
281
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
282
- data: ensureUint8Array(image["data"]),
283
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
284
- format: image["format"] ?? "unknown",
285
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
286
- imageIndex: image["imageIndex"] ?? 0,
287
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
288
- pageNumber: image["pageNumber"] ?? null,
289
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
290
- width: image["width"] ?? null,
291
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
292
- height: image["height"] ?? null,
293
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
294
- colorspace: image["colorspace"] ?? null,
295
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
296
- bitsPerComponent: image["bitsPerComponent"] ?? null,
297
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
298
- isMask: image["isMask"] ?? false,
299
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
300
- description: image["description"] ?? null,
301
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
302
- ocrResult: image["ocrResult"] ? convertResult(image["ocrResult"]) : null
303
- };
197
+ function getLastPanicContext() {
198
+ const binding2 = getBinding();
199
+ const result = binding2.getLastPanicContext();
200
+ return result;
304
201
  }
305
- function convertPageContent(rawPage) {
306
- if (!rawPage || typeof rawPage !== "object") {
202
+ function getErrorCodeName(code) {
203
+ const binding2 = getBinding();
204
+ return binding2.getErrorCodeName(code);
205
+ }
206
+ function getErrorCodeDescription(code) {
207
+ const binding2 = getBinding();
208
+ return binding2.getErrorCodeDescription(code);
209
+ }
210
+ function classifyError(errorMessage) {
211
+ const binding2 = getBinding();
212
+ const result = binding2.classifyError(errorMessage);
213
+ return result;
214
+ }
215
+
216
+ // typescript/errors.ts
217
+ var ErrorCode = /* @__PURE__ */ ((ErrorCode2) => {
218
+ ErrorCode2[ErrorCode2["Success"] = 0] = "Success";
219
+ ErrorCode2[ErrorCode2["GenericError"] = 1] = "GenericError";
220
+ ErrorCode2[ErrorCode2["Panic"] = 2] = "Panic";
221
+ ErrorCode2[ErrorCode2["InvalidArgument"] = 3] = "InvalidArgument";
222
+ ErrorCode2[ErrorCode2["IoError"] = 4] = "IoError";
223
+ ErrorCode2[ErrorCode2["ParsingError"] = 5] = "ParsingError";
224
+ ErrorCode2[ErrorCode2["OcrError"] = 6] = "OcrError";
225
+ ErrorCode2[ErrorCode2["MissingDependency"] = 7] = "MissingDependency";
226
+ return ErrorCode2;
227
+ })(ErrorCode || {});
228
+ var KreuzbergError = class _KreuzbergError extends Error {
229
+ /**
230
+ * Panic context if error was caused by a panic in native code.
231
+ * Will be null for non-panic errors.
232
+ */
233
+ panicContext;
234
+ constructor(message, panicContext) {
235
+ super(message);
236
+ this.name = "KreuzbergError";
237
+ this.panicContext = panicContext ?? null;
238
+ Object.setPrototypeOf(this, _KreuzbergError.prototype);
239
+ }
240
+ toJSON() {
307
241
  return {
308
- pageNumber: 0,
309
- content: "",
310
- tables: [],
311
- images: []
242
+ name: this.name,
243
+ message: this.message,
244
+ panicContext: this.panicContext,
245
+ stack: this.stack
312
246
  };
313
247
  }
314
- const page = rawPage;
315
- return {
316
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
317
- pageNumber: page["pageNumber"] ?? 0,
318
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
319
- content: page["content"] ?? "",
320
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
321
- tables: Array.isArray(page["tables"]) ? page["tables"] : [],
322
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
323
- images: Array.isArray(page["images"]) ? page["images"].map((image) => convertImage(image)) : []
324
- };
325
- }
326
- function convertResult(rawResult) {
327
- if (!rawResult || typeof rawResult !== "object") {
248
+ };
249
+ var ValidationError = class _ValidationError extends KreuzbergError {
250
+ constructor(message, panicContext) {
251
+ super(message, panicContext);
252
+ this.name = "ValidationError";
253
+ Object.setPrototypeOf(this, _ValidationError.prototype);
254
+ }
255
+ };
256
+ var ParsingError = class _ParsingError extends KreuzbergError {
257
+ constructor(message, panicContext) {
258
+ super(message, panicContext);
259
+ this.name = "ParsingError";
260
+ Object.setPrototypeOf(this, _ParsingError.prototype);
261
+ }
262
+ };
263
+ var OcrError = class _OcrError extends KreuzbergError {
264
+ constructor(message, panicContext) {
265
+ super(message, panicContext);
266
+ this.name = "OcrError";
267
+ Object.setPrototypeOf(this, _OcrError.prototype);
268
+ }
269
+ };
270
+ var CacheError = class _CacheError extends KreuzbergError {
271
+ constructor(message, panicContext) {
272
+ super(message, panicContext);
273
+ this.name = "CacheError";
274
+ Object.setPrototypeOf(this, _CacheError.prototype);
275
+ }
276
+ };
277
+ var ImageProcessingError = class _ImageProcessingError extends KreuzbergError {
278
+ constructor(message, panicContext) {
279
+ super(message, panicContext);
280
+ this.name = "ImageProcessingError";
281
+ Object.setPrototypeOf(this, _ImageProcessingError.prototype);
282
+ }
283
+ };
284
+ var PluginError = class _PluginError extends KreuzbergError {
285
+ /**
286
+ * Name of the plugin that threw the error.
287
+ */
288
+ pluginName;
289
+ constructor(message, pluginName, panicContext) {
290
+ super(`Plugin error in '${pluginName}': ${message}`, panicContext);
291
+ this.name = "PluginError";
292
+ this.pluginName = pluginName;
293
+ Object.setPrototypeOf(this, _PluginError.prototype);
294
+ }
295
+ toJSON() {
328
296
  return {
329
- content: "",
330
- mimeType: "application/octet-stream",
331
- metadata: {},
332
- tables: [],
333
- detectedLanguages: null,
334
- chunks: null,
335
- images: null,
336
- pages: null
297
+ name: this.name,
298
+ message: this.message,
299
+ pluginName: this.pluginName,
300
+ panicContext: this.panicContext,
301
+ stack: this.stack
337
302
  };
338
303
  }
339
- const result = rawResult;
340
- const metadata = result["metadata"];
341
- const metadataValue = typeof metadata === "string" ? parseMetadata(metadata) : metadata ?? {};
342
- const returnObj = {
343
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
344
- content: result["content"] ?? "",
345
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
346
- mimeType: result["mimeType"] ?? "application/octet-stream",
347
- metadata: metadataValue,
348
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
349
- tables: Array.isArray(result["tables"]) ? result["tables"] : [],
350
- // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
351
- detectedLanguages: Array.isArray(result["detectedLanguages"]) ? result["detectedLanguages"] : null,
352
- chunks: null,
353
- images: null,
354
- pages: null
355
- };
356
- const chunksData = result["chunks"];
357
- if (Array.isArray(chunksData)) {
358
- returnObj.chunks = chunksData.map((chunk) => convertChunk(chunk));
304
+ };
305
+ var MissingDependencyError = class _MissingDependencyError extends KreuzbergError {
306
+ constructor(message, panicContext) {
307
+ super(message, panicContext);
308
+ this.name = "MissingDependencyError";
309
+ Object.setPrototypeOf(this, _MissingDependencyError.prototype);
359
310
  }
360
- const imagesData = result["images"];
361
- if (Array.isArray(imagesData)) {
362
- returnObj.images = imagesData.map((image) => convertImage(image));
311
+ };
312
+
313
+ // typescript/core/assertions.ts
314
+ function assertUint8Array(value, name) {
315
+ if (!(value instanceof Uint8Array)) {
316
+ throw new TypeError(`${name} must be a Uint8Array`);
363
317
  }
364
- const pagesData = result["pages"];
365
- if (Array.isArray(pagesData)) {
366
- returnObj.pages = pagesData.map((page) => convertPageContent(page));
318
+ return value;
319
+ }
320
+ function assertUint8ArrayList(values, name) {
321
+ if (!Array.isArray(values)) {
322
+ throw new TypeError(`${name} must be an array of Uint8Array`);
367
323
  }
368
- return returnObj;
324
+ const array = values;
325
+ return array.map((value, index) => {
326
+ try {
327
+ return assertUint8Array(value, `${name}[${index}]`);
328
+ } catch {
329
+ throw new TypeError(`${name}[${index}] must be a Uint8Array`);
330
+ }
331
+ });
369
332
  }
333
+
334
+ // typescript/core/config-normalizer.ts
370
335
  function setIfDefined(target, key, value) {
371
336
  if (value !== void 0) {
372
337
  target[key] = value;
@@ -524,47 +489,251 @@ function normalizeKeywordConfig(config) {
524
489
  setIfDefined(normalized, "rakeParams", config.rakeParams);
525
490
  return normalized;
526
491
  }
527
- function normalizePageConfig(pages) {
528
- if (!pages) {
529
- return void 0;
492
+ function normalizePageConfig(pages) {
493
+ if (!pages) {
494
+ return void 0;
495
+ }
496
+ const normalized = {};
497
+ setIfDefined(normalized, "extractPages", pages.extractPages);
498
+ setIfDefined(normalized, "insertPageMarkers", pages.insertPageMarkers);
499
+ setIfDefined(normalized, "markerFormat", pages.markerFormat);
500
+ return normalized;
501
+ }
502
+ function normalizeExtractionConfig(config) {
503
+ if (!config) {
504
+ return null;
505
+ }
506
+ const normalized = {};
507
+ setIfDefined(normalized, "useCache", config.useCache);
508
+ setIfDefined(normalized, "enableQualityProcessing", config.enableQualityProcessing);
509
+ setIfDefined(normalized, "forceOcr", config.forceOcr);
510
+ setIfDefined(normalized, "maxConcurrentExtractions", config.maxConcurrentExtractions);
511
+ const ocr = normalizeOcrConfig(config.ocr);
512
+ setIfDefined(normalized, "ocr", ocr);
513
+ const chunking = normalizeChunkingConfig(config.chunking);
514
+ setIfDefined(normalized, "chunking", chunking);
515
+ const images = normalizeImageExtractionConfig(config.images);
516
+ setIfDefined(normalized, "images", images);
517
+ const pdf = normalizePdfConfig(config.pdfOptions);
518
+ setIfDefined(normalized, "pdfOptions", pdf);
519
+ const tokenReduction = normalizeTokenReductionConfig(config.tokenReduction);
520
+ setIfDefined(normalized, "tokenReduction", tokenReduction);
521
+ const languageDetection = normalizeLanguageDetectionConfig(config.languageDetection);
522
+ setIfDefined(normalized, "languageDetection", languageDetection);
523
+ const postprocessor = normalizePostProcessorConfig(config.postprocessor);
524
+ setIfDefined(normalized, "postprocessor", postprocessor);
525
+ const keywords = normalizeKeywordConfig(config.keywords);
526
+ setIfDefined(normalized, "keywords", keywords);
527
+ const pages = normalizePageConfig(config.pages);
528
+ setIfDefined(normalized, "pages", pages);
529
+ const htmlOptions = normalizeHtmlOptions(config.htmlOptions);
530
+ setIfDefined(normalized, "htmlOptions", htmlOptions);
531
+ return normalized;
532
+ }
533
+
534
+ // typescript/core/type-converters.ts
535
+ function parseMetadata(metadataStr) {
536
+ try {
537
+ const parsed = JSON.parse(metadataStr);
538
+ if (typeof parsed === "object" && parsed !== null) {
539
+ return parsed;
540
+ }
541
+ return {};
542
+ } catch {
543
+ return {};
544
+ }
545
+ }
546
+ function ensureUint8Array(value) {
547
+ if (value instanceof Uint8Array) {
548
+ return value;
549
+ }
550
+ if (typeof Buffer !== "undefined" && value instanceof Buffer) {
551
+ return new Uint8Array(value);
552
+ }
553
+ if (Array.isArray(value)) {
554
+ return new Uint8Array(value);
555
+ }
556
+ return new Uint8Array();
557
+ }
558
+ function convertChunk(rawChunk) {
559
+ if (!rawChunk || typeof rawChunk !== "object") {
560
+ return {
561
+ content: "",
562
+ metadata: {
563
+ byteStart: 0,
564
+ byteEnd: 0,
565
+ tokenCount: null,
566
+ chunkIndex: 0,
567
+ totalChunks: 0
568
+ },
569
+ embedding: null
570
+ };
571
+ }
572
+ const chunk = rawChunk;
573
+ const metadata = chunk["metadata"] ?? {};
574
+ return {
575
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
576
+ content: chunk["content"] ?? "",
577
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
578
+ embedding: chunk["embedding"] ?? null,
579
+ metadata: {
580
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
581
+ byteStart: metadata["byte_start"] ?? metadata["charStart"] ?? 0,
582
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
583
+ byteEnd: metadata["byte_end"] ?? metadata["charEnd"] ?? 0,
584
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
585
+ tokenCount: metadata["token_count"] ?? metadata["tokenCount"] ?? null,
586
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
587
+ chunkIndex: metadata["chunk_index"] ?? metadata["chunkIndex"] ?? 0,
588
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
589
+ totalChunks: metadata["total_chunks"] ?? metadata["totalChunks"] ?? 0,
590
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
591
+ firstPage: metadata["first_page"] ?? metadata["firstPage"] ?? null,
592
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
593
+ lastPage: metadata["last_page"] ?? metadata["lastPage"] ?? null
594
+ }
595
+ };
596
+ }
597
+ function convertImage(rawImage) {
598
+ if (!rawImage || typeof rawImage !== "object") {
599
+ return {
600
+ data: new Uint8Array(),
601
+ format: "unknown",
602
+ imageIndex: 0,
603
+ pageNumber: null,
604
+ width: null,
605
+ height: null,
606
+ colorspace: null,
607
+ bitsPerComponent: null,
608
+ isMask: false,
609
+ description: null,
610
+ ocrResult: null
611
+ };
612
+ }
613
+ const image = rawImage;
614
+ return {
615
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
616
+ data: ensureUint8Array(image["data"]),
617
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
618
+ format: image["format"] ?? "unknown",
619
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
620
+ imageIndex: image["imageIndex"] ?? 0,
621
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
622
+ pageNumber: image["pageNumber"] ?? null,
623
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
624
+ width: image["width"] ?? null,
625
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
626
+ height: image["height"] ?? null,
627
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
628
+ colorspace: image["colorspace"] ?? null,
629
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
630
+ bitsPerComponent: image["bitsPerComponent"] ?? null,
631
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
632
+ isMask: image["isMask"] ?? false,
633
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
634
+ description: image["description"] ?? null,
635
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
636
+ ocrResult: image["ocrResult"] ? convertResult(image["ocrResult"]) : null
637
+ };
638
+ }
639
+ function convertPageContent(rawPage) {
640
+ if (!rawPage || typeof rawPage !== "object") {
641
+ return {
642
+ pageNumber: 0,
643
+ content: "",
644
+ tables: [],
645
+ images: []
646
+ };
647
+ }
648
+ const page = rawPage;
649
+ return {
650
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
651
+ pageNumber: page["pageNumber"] ?? 0,
652
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
653
+ content: page["content"] ?? "",
654
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
655
+ tables: Array.isArray(page["tables"]) ? page["tables"] : [],
656
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
657
+ images: Array.isArray(page["images"]) ? page["images"].map((image) => convertImage(image)) : []
658
+ };
659
+ }
660
+ function convertResult(rawResult) {
661
+ if (!rawResult || typeof rawResult !== "object") {
662
+ return {
663
+ content: "",
664
+ mimeType: "application/octet-stream",
665
+ metadata: {},
666
+ tables: [],
667
+ detectedLanguages: null,
668
+ chunks: null,
669
+ images: null,
670
+ pages: null
671
+ };
672
+ }
673
+ const result = rawResult;
674
+ const metadata = result["metadata"];
675
+ const metadataValue = typeof metadata === "string" ? parseMetadata(metadata) : metadata ?? {};
676
+ const returnObj = {
677
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
678
+ content: result["content"] ?? "",
679
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
680
+ mimeType: result["mimeType"] ?? "application/octet-stream",
681
+ metadata: metadataValue,
682
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
683
+ tables: Array.isArray(result["tables"]) ? result["tables"] : [],
684
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
685
+ detectedLanguages: Array.isArray(result["detectedLanguages"]) ? result["detectedLanguages"] : null,
686
+ chunks: null,
687
+ images: null,
688
+ pages: null
689
+ };
690
+ const chunksData = result["chunks"];
691
+ if (Array.isArray(chunksData)) {
692
+ returnObj.chunks = chunksData.map((chunk) => convertChunk(chunk));
693
+ }
694
+ const imagesData = result["images"];
695
+ if (Array.isArray(imagesData)) {
696
+ returnObj.images = imagesData.map((image) => convertImage(image));
697
+ }
698
+ const pagesData = result["pages"];
699
+ if (Array.isArray(pagesData)) {
700
+ returnObj.pages = pagesData.map((page) => convertPageContent(page));
701
+ }
702
+ return returnObj;
703
+ }
704
+
705
+ // typescript/extraction/batch.ts
706
+ function batchExtractFilesSync(paths, config = null) {
707
+ const normalizedConfig = normalizeExtractionConfig(config);
708
+ const rawResults = getBinding().batchExtractFilesSync(paths, normalizedConfig);
709
+ return rawResults.map(convertResult);
710
+ }
711
+ async function batchExtractFiles(paths, config = null) {
712
+ const normalizedConfig = normalizeExtractionConfig(config);
713
+ const rawResults = await getBinding().batchExtractFiles(paths, normalizedConfig);
714
+ return rawResults.map(convertResult);
715
+ }
716
+ function batchExtractBytesSync(dataList, mimeTypes, config = null) {
717
+ const buffers = assertUint8ArrayList(dataList, "dataList").map((data) => Buffer.from(data));
718
+ if (buffers.length !== mimeTypes.length) {
719
+ throw new TypeError("dataList and mimeTypes must have the same length");
530
720
  }
531
- const normalized = {};
532
- setIfDefined(normalized, "extractPages", pages.extractPages);
533
- setIfDefined(normalized, "insertPageMarkers", pages.insertPageMarkers);
534
- setIfDefined(normalized, "markerFormat", pages.markerFormat);
535
- return normalized;
721
+ const normalizedConfig = normalizeExtractionConfig(config);
722
+ const rawResults = getBinding().batchExtractBytesSync(buffers, mimeTypes, normalizedConfig);
723
+ return rawResults.map(convertResult);
536
724
  }
537
- function normalizeExtractionConfig(config) {
538
- if (!config) {
539
- return null;
725
+ async function batchExtractBytes(dataList, mimeTypes, config = null) {
726
+ const buffers = assertUint8ArrayList(dataList, "dataList").map((data) => Buffer.from(data));
727
+ if (buffers.length !== mimeTypes.length) {
728
+ throw new TypeError("dataList and mimeTypes must have the same length");
540
729
  }
541
- const normalized = {};
542
- setIfDefined(normalized, "useCache", config.useCache);
543
- setIfDefined(normalized, "enableQualityProcessing", config.enableQualityProcessing);
544
- setIfDefined(normalized, "forceOcr", config.forceOcr);
545
- setIfDefined(normalized, "maxConcurrentExtractions", config.maxConcurrentExtractions);
546
- const ocr = normalizeOcrConfig(config.ocr);
547
- setIfDefined(normalized, "ocr", ocr);
548
- const chunking = normalizeChunkingConfig(config.chunking);
549
- setIfDefined(normalized, "chunking", chunking);
550
- const images = normalizeImageExtractionConfig(config.images);
551
- setIfDefined(normalized, "images", images);
552
- const pdf = normalizePdfConfig(config.pdfOptions);
553
- setIfDefined(normalized, "pdfOptions", pdf);
554
- const tokenReduction = normalizeTokenReductionConfig(config.tokenReduction);
555
- setIfDefined(normalized, "tokenReduction", tokenReduction);
556
- const languageDetection = normalizeLanguageDetectionConfig(config.languageDetection);
557
- setIfDefined(normalized, "languageDetection", languageDetection);
558
- const postprocessor = normalizePostProcessorConfig(config.postprocessor);
559
- setIfDefined(normalized, "postprocessor", postprocessor);
560
- const keywords = normalizeKeywordConfig(config.keywords);
561
- setIfDefined(normalized, "keywords", keywords);
562
- const pages = normalizePageConfig(config.pages);
563
- setIfDefined(normalized, "pages", pages);
564
- const htmlOptions = normalizeHtmlOptions(config.htmlOptions);
565
- setIfDefined(normalized, "htmlOptions", htmlOptions);
566
- return normalized;
730
+ const normalizedConfig = normalizeExtractionConfig(config);
731
+ const rawResults = await getBinding().batchExtractBytes(buffers, mimeTypes, normalizedConfig);
732
+ return rawResults.map(convertResult);
567
733
  }
734
+
735
+ // typescript/extraction/single.ts
736
+ var import_node_fs = require("fs");
568
737
  function extractFileSync(filePath, mimeTypeOrConfig, maybeConfig) {
569
738
  let mimeType = null;
570
739
  let config = null;
@@ -626,34 +795,57 @@ async function extractBytes(dataOrPath, mimeType, config = null) {
626
795
  const rawResult = await getBinding().extractBytes(Buffer.from(validated), mimeType, normalizedConfig);
627
796
  return convertResult(rawResult);
628
797
  }
629
- function batchExtractFilesSync(paths, config = null) {
630
- const normalizedConfig = normalizeExtractionConfig(config);
631
- const rawResults = getBinding().batchExtractFilesSync(paths, normalizedConfig);
632
- return rawResults.map(convertResult);
798
+
799
+ // typescript/extraction/worker-pool.ts
800
+ function createWorkerPool(size) {
801
+ const binding2 = getBinding();
802
+ const rawPool = binding2.createWorkerPool(size);
803
+ return rawPool;
633
804
  }
634
- async function batchExtractFiles(paths, config = null) {
635
- const normalizedConfig = normalizeExtractionConfig(config);
636
- const rawResults = await getBinding().batchExtractFiles(paths, normalizedConfig);
637
- return rawResults.map(convertResult);
805
+ function getWorkerPoolStats(pool) {
806
+ const binding2 = getBinding();
807
+ const rawStats = binding2.getWorkerPoolStats(pool);
808
+ return rawStats;
638
809
  }
639
- function batchExtractBytesSync(dataList, mimeTypes, config = null) {
640
- const buffers = assertUint8ArrayList(dataList, "dataList").map((data) => Buffer.from(data));
641
- if (buffers.length !== mimeTypes.length) {
642
- throw new TypeError("dataList and mimeTypes must have the same length");
810
+ async function extractFileInWorker(pool, filePath, mimeTypeOrConfig, maybeConfig) {
811
+ let mimeType = null;
812
+ let config = null;
813
+ if (typeof mimeTypeOrConfig === "string") {
814
+ mimeType = mimeTypeOrConfig;
815
+ config = maybeConfig ?? null;
816
+ } else if (mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object") {
817
+ config = mimeTypeOrConfig;
818
+ mimeType = null;
819
+ } else {
820
+ config = maybeConfig ?? null;
821
+ mimeType = null;
643
822
  }
644
823
  const normalizedConfig = normalizeExtractionConfig(config);
645
- const rawResults = getBinding().batchExtractBytesSync(buffers, mimeTypes, normalizedConfig);
646
- return rawResults.map(convertResult);
824
+ const binding2 = getBinding();
825
+ const rawResult = await binding2.extractFileInWorker(
826
+ pool,
827
+ filePath,
828
+ mimeType,
829
+ normalizedConfig
830
+ );
831
+ return convertResult(rawResult);
647
832
  }
648
- async function batchExtractBytes(dataList, mimeTypes, config = null) {
649
- const buffers = assertUint8ArrayList(dataList, "dataList").map((data) => Buffer.from(data));
650
- if (buffers.length !== mimeTypes.length) {
651
- throw new TypeError("dataList and mimeTypes must have the same length");
652
- }
833
+ async function batchExtractFilesInWorker(pool, paths, config = null) {
653
834
  const normalizedConfig = normalizeExtractionConfig(config);
654
- const rawResults = await getBinding().batchExtractBytes(buffers, mimeTypes, normalizedConfig);
835
+ const binding2 = getBinding();
836
+ const rawResults = await binding2.batchExtractFilesInWorker(
837
+ pool,
838
+ paths,
839
+ normalizedConfig
840
+ );
655
841
  return rawResults.map(convertResult);
656
842
  }
843
/**
 * Close a worker pool through the native binding.
 *
 * Resolves once the binding's `closeWorkerPool(pool)` call has settled; any
 * rejection from the binding propagates to the caller unchanged.
 *
 * @param pool - Pool handle previously obtained from the binding.
 * @returns Promise that resolves with no value.
 */
async function closeWorkerPool(pool) {
  await getBinding().closeWorkerPool(pool);
}
847
+
848
+ // typescript/plugins/post-processors.ts
657
849
  function registerPostProcessor(processor) {
658
850
  const binding2 = getBinding();
659
851
  const wrappedProcessor = {
@@ -708,6 +900,8 @@ function listPostProcessors() {
708
900
  const binding2 = getBinding();
709
901
  return binding2.listPostProcessors();
710
902
  }
903
+
904
+ // typescript/plugins/validators.ts
711
905
  function registerValidator(validator) {
712
906
  const binding2 = getBinding();
713
907
  const wrappedValidator = {
@@ -746,6 +940,204 @@ function listValidators() {
746
940
  const binding2 = getBinding();
747
941
  return binding2.listValidators();
748
942
  }
943
+
944
+ // typescript/ocr/guten-ocr.ts
945
var GutenOcrBackend = class {
  /** OCR instance returned by the module's `create()`; `null` until initialized. */
  ocr = null;
  /** Lazily imported `@gutenye/ocr-node` module (its default export when present). */
  ocrModule = null;
  /** Options forwarded verbatim to `create()`; stays `undefined` when none were given. */
  options;
  /**
   * Create a new Guten OCR backend.
   *
   * The options object is stored as-is and handed to the OCR module's
   * `create()` during {@link initialize}; it is not validated here.
   *
   * @param options - Optional configuration for Guten OCR
   * @param options.models - Custom model paths
   * @param options.isDebug - Enable debug mode
   * @param options.debugOutputDir - Directory for debug output
   * @param options.onnxOptions - Custom ONNX Runtime options
   *
   * @example
   * ```typescript
   * // Default configuration
   * const backend = new GutenOcrBackend();
   *
   * // With debug enabled
   * const debugBackend = new GutenOcrBackend({
   *   isDebug: true,
   *   debugOutputDir: './ocr_debug'
   * });
   * ```
   */
  constructor(options) {
    if (options !== void 0) {
      this.options = options;
    }
  }
  /**
   * Get the backend name.
   *
   * @returns Backend name ("guten-ocr")
   */
  name() {
    return "guten-ocr";
  }
  /**
   * Get the list of language identifiers this backend advertises.
   *
   * The list is a fixed set of codes and aliases for English and Chinese
   * (not strictly ISO 639 codes, e.g. "ch_sim" and "chinese").
   *
   * @returns A new array of language identifiers
   */
  supportedLanguages() {
    return ["en", "eng", "ch_sim", "ch_tra", "chinese"];
  }
  /**
   * Initialize the OCR backend.
   *
   * Dynamically imports `@gutenye/ocr-node` and calls its `create()` with the
   * stored options. Does nothing when an OCR instance already exists.
   * `processImage()` calls this on demand, so calling it up front is optional.
   *
   * @throws {Error} If `@gutenye/ocr-node` cannot be imported (original error in `cause`)
   * @throws {Error} If creating the OCR instance fails (original error in `cause`)
   *
   * @example
   * ```typescript
   * const backend = new GutenOcrBackend();
   * await backend.initialize();
   * ```
   */
  async initialize() {
    if (this.ocr !== null) {
      return;
    }
    try {
      this.ocrModule = await import("@gutenye/ocr-node").then((m) => m.default || m);
    } catch (e) {
      // Thrown values are not guaranteed to be Error instances; never assume `.message`.
      const reason = e instanceof Error ? e.message : String(e);
      throw new Error(
        `Guten OCR support requires the '@gutenye/ocr-node' package. Install with: npm install @gutenye/ocr-node. Error: ${reason}`,
        { cause: e }
      );
    }
    try {
      // A module without a usable result leaves `ocr` as null; processImage()
      // reports that case as an initialization failure.
      this.ocr = (await this.ocrModule?.create(this.options)) ?? null;
    } catch (e) {
      const reason = e instanceof Error ? e.message : String(e);
      throw new Error(`Failed to initialize Guten OCR: ${reason}`, { cause: e });
    }
  }
  /**
   * Shutdown the backend and release resources.
   *
   * Drops the references to the OCR instance and the imported module so a
   * later `initialize()` starts from scratch. No method is invoked on the
   * OCR instance itself.
   *
   * @example
   * ```typescript
   * const backend = new GutenOcrBackend();
   * await backend.initialize();
   * // ... use backend ...
   * await backend.shutdown();
   * ```
   */
  async shutdown() {
    this.ocr = null;
    this.ocrModule = null;
  }
  /**
   * Process image bytes and extract text using Guten OCR.
   *
   * Steps performed:
   * 1. Initializes the backend on demand.
   * 2. Converts the input to a Buffer (strings are decoded as base64).
   * 3. Best-effort reads width/height via `sharp`; failure only logs a warning
   *    and leaves both at 0.
   * 4. Calls `detect()` on the OCR instance and joins each line's `text` with
   *    newlines; confidence is the arithmetic mean of each line's `mean`.
   *
   * @param imageBytes - Image data as a Buffer/Uint8Array, or a base64 string
   * @param language - Language code; echoed into the result metadata
   *   (NOTE(review): not validated against supportedLanguages() here)
   * @returns Promise resolving to `{ content, mime_type, metadata, tables }`
   *
   * @throws {Error} If the backend cannot be initialized
   * @throws {Error} If OCR processing fails (original error in `cause`)
   *
   * @example
   * ```typescript
   * import { readFile } from 'fs/promises';
   *
   * const backend = new GutenOcrBackend();
   * await backend.initialize();
   *
   * const imageBytes = await readFile('scanned.png');
   * const result = await backend.processImage(imageBytes, 'en');
   * console.log(result.content);
   * console.log(result.metadata.confidence);
   * ```
   */
  async processImage(imageBytes, language) {
    if (this.ocr === null) {
      await this.initialize();
    }
    if (this.ocr === null) {
      throw new Error("Guten OCR backend failed to initialize");
    }
    try {
      const buffer = typeof imageBytes === "string" ? Buffer.from(imageBytes, "base64") : Buffer.from(imageBytes);
      const debugEnv = process.env["KREUZBERG_DEBUG_GUTEN"];
      if (debugEnv === "1") {
        const header = Array.from(buffer.subarray(0, 8));
        console.log("[Guten OCR] Debug input header:", header);
        console.log(
          "[Guten OCR] Buffer?",
          Buffer.isBuffer(buffer),
          "constructor",
          imageBytes?.constructor?.name,
          "length",
          buffer.length,
          "type",
          typeof imageBytes
        );
      }
      let width = 0;
      let height = 0;
      try {
        // Dimensions are informational only, so a missing/failing sharp must
        // not abort the OCR run.
        const sharpModule = await import("sharp");
        const sharp = sharpModule.default || sharpModule;
        const metadata = await sharp(buffer).metadata();
        width = metadata["width"] ?? 0;
        height = metadata["height"] ?? 0;
      } catch (metadataError) {
        const reason = metadataError instanceof Error ? metadataError.message : String(metadataError);
        console.warn(`[Guten OCR] Unable to read image metadata via sharp: ${reason}`);
      }
      const result = await this.ocr.detect(buffer);
      const content = result.map((line) => line.text).join("\n");
      // Guard the division so an image with no detected lines reports 0.
      const avgConfidence = result.length > 0 ? result.reduce((sum, line) => sum + line.mean, 0) / result.length : 0;
      return {
        content,
        mime_type: "text/plain",
        metadata: {
          width,
          height,
          confidence: avgConfidence,
          text_regions: result.length,
          language
        },
        tables: []
      };
    } catch (e) {
      const reason = e instanceof Error ? e.message : String(e);
      throw new Error(`Guten OCR processing failed: ${reason}`, { cause: e });
    }
  }
};
1139
+
1140
+ // typescript/plugins/ocr-backends.ts
749
1141
  function isOcrProcessTuple(value) {
750
1142
  return Array.isArray(value) && value.length === 2 && typeof value[1] === "string" && (typeof value[0] === "string" || Buffer.isBuffer(value[0]) || value[0] instanceof Uint8Array);
751
1143
  }
@@ -815,6 +1207,8 @@ function clearOcrBackends() {
815
1207
  const binding2 = getBinding();
816
1208
  binding2.clearOcrBackends();
817
1209
  }
1210
+
1211
+ // typescript/registry/document-extractors.ts
818
1212
  function listDocumentExtractors() {
819
1213
  const binding2 = getBinding();
820
1214
  return binding2.listDocumentExtractors();
@@ -827,7 +1221,9 @@ function clearDocumentExtractors() {
827
1221
  const binding2 = getBinding();
828
1222
  binding2.clearDocumentExtractors();
829
1223
  }
830
- const ExtractionConfig = {
1224
+
1225
+ // typescript/config/loader.ts
1226
+ var ExtractionConfig = {
831
1227
  /**
832
1228
  * Load extraction configuration from a file.
833
1229
  *
@@ -889,6 +1285,18 @@ const ExtractionConfig = {
889
1285
  return binding2.discoverExtractionConfig();
890
1286
  }
891
1287
  };
1288
/**
 * Load an extraction configuration from a file.
 *
 * Thin alias for `ExtractionConfig.fromFile`; errors raised by it propagate
 * to the caller unchanged.
 *
 * @param filePath - Path of the configuration file to load
 * @returns Whatever `ExtractionConfig.fromFile(filePath)` returns
 */
function loadConfigFile(filePath) {
  const loaded = ExtractionConfig.fromFile(filePath);
  return loaded;
}
1291
/**
 * Load an extraction configuration from `path`, falling back to discovery.
 *
 * Tries `ExtractionConfig.fromFile(path)` first. If that throws for any
 * reason, the error is discarded and `ExtractionConfig.discover()` is used
 * instead; an error thrown by `discover()` itself is not caught.
 *
 * @param path - Path of the configuration file to try first
 * @returns The result of `fromFile(path)`, or of `discover()` on failure
 */
function loadConfigFromPath(path) {
  let resolved;
  try {
    resolved = ExtractionConfig.fromFile(path);
  } catch {
    // Best-effort: any failure to load the explicit file triggers discovery.
    resolved = ExtractionConfig.discover();
  }
  return resolved;
}
1298
+
1299
+ // typescript/mime/utilities.ts
892
1300
  function detectMimeType(bytes) {
893
1301
  const binding2 = getBinding();
894
1302
  return binding2.detectMimeTypeFromBytes(bytes);
@@ -905,6 +1313,8 @@ function getExtensionsForMime(mimeType) {
905
1313
  const binding2 = getBinding();
906
1314
  return binding2.getExtensionsForMime(mimeType);
907
1315
  }
1316
+
1317
+ // typescript/embeddings/presets.ts
908
1318
  function listEmbeddingPresets() {
909
1319
  const binding2 = getBinding();
910
1320
  return binding2.listEmbeddingPresets();
@@ -914,76 +1324,9 @@ function getEmbeddingPreset(name) {
914
1324
  const result = binding2.getEmbeddingPreset(name);
915
1325
  return result;
916
1326
  }
917
- function getLastErrorCode() {
918
- const binding2 = getBinding();
919
- return binding2.getLastErrorCode();
920
- }
921
- function getLastPanicContext() {
922
- const binding2 = getBinding();
923
- const result = binding2.getLastPanicContext();
924
- return result;
925
- }
926
- function getErrorCodeName(code) {
927
- const binding2 = getBinding();
928
- return binding2.getErrorCodeName(code);
929
- }
930
- function getErrorCodeDescription(code) {
931
- const binding2 = getBinding();
932
- return binding2.getErrorCodeDescription(code);
933
- }
934
- function classifyError(errorMessage) {
935
- const binding2 = getBinding();
936
- const result = binding2.classifyError(errorMessage);
937
- return result;
938
- }
939
- function createWorkerPool(size) {
940
- const binding2 = getBinding();
941
- const rawPool = binding2.createWorkerPool(size);
942
- return rawPool;
943
- }
944
- function getWorkerPoolStats(pool) {
945
- const binding2 = getBinding();
946
- const rawStats = binding2.getWorkerPoolStats(pool);
947
- return rawStats;
948
- }
949
- async function extractFileInWorker(pool, filePath, mimeTypeOrConfig, maybeConfig) {
950
- let mimeType = null;
951
- let config = null;
952
- if (typeof mimeTypeOrConfig === "string") {
953
- mimeType = mimeTypeOrConfig;
954
- config = maybeConfig ?? null;
955
- } else if (mimeTypeOrConfig !== null && typeof mimeTypeOrConfig === "object") {
956
- config = mimeTypeOrConfig;
957
- mimeType = null;
958
- } else {
959
- config = maybeConfig ?? null;
960
- mimeType = null;
961
- }
962
- const normalizedConfig = normalizeExtractionConfig(config);
963
- const binding2 = getBinding();
964
- const rawResult = await binding2.extractFileInWorker(
965
- pool,
966
- filePath,
967
- mimeType,
968
- normalizedConfig
969
- );
970
- return convertResult(rawResult);
971
- }
972
- async function batchExtractFilesInWorker(pool, paths, config = null) {
973
- const normalizedConfig = normalizeExtractionConfig(config);
974
- const binding2 = getBinding();
975
- const rawResults = await binding2.batchExtractFilesInWorker(
976
- pool,
977
- paths,
978
- normalizedConfig
979
- );
980
- return rawResults.map(convertResult);
981
- }
982
- async function closeWorkerPool(pool) {
983
- const binding2 = getBinding();
984
- await binding2.closeWorkerPool(pool);
985
- }
986
- const __version__ = "4.0.8";
1327
+
1328
+ // typescript/index.ts
1329
+ var __version__ = "4.1.0";
987
1330
  // Annotate the CommonJS export names for ESM import in node:
988
1331
  0 && (module.exports = {
989
1332
  CacheError,
@@ -1031,6 +1374,8 @@ const __version__ = "4.0.8";
1031
1374
  listOcrBackends,
1032
1375
  listPostProcessors,
1033
1376
  listValidators,
1377
+ loadConfigFile,
1378
+ loadConfigFromPath,
1034
1379
  registerOcrBackend,
1035
1380
  registerPostProcessor,
1036
1381
  registerValidator,
@@ -1038,7 +1383,6 @@ const __version__ = "4.0.8";
1038
1383
  unregisterOcrBackend,
1039
1384
  unregisterPostProcessor,
1040
1385
  unregisterValidator,
1041
- validateMimeType,
1042
- ...require("./types.js")
1386
+ validateMimeType
1043
1387
  });
1044
1388
  //# sourceMappingURL=index.js.map