@kreuzberg/wasm 4.0.0-rc.21 → 4.0.0-rc.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +520 -837
- package/dist/adapters/wasm-adapter.cjs.map +1 -1
- package/dist/adapters/wasm-adapter.d.cts +1 -1
- package/dist/adapters/wasm-adapter.d.ts +1 -1
- package/dist/adapters/wasm-adapter.js.map +1 -1
- package/dist/index.cjs +192 -48
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +219 -3
- package/dist/index.d.ts +219 -3
- package/dist/index.js +199 -48
- package/dist/index.js.map +1 -1
- package/dist/ocr/registry.cjs.map +1 -1
- package/dist/ocr/registry.d.cts +1 -1
- package/dist/ocr/registry.d.ts +1 -1
- package/dist/ocr/registry.js.map +1 -1
- package/dist/ocr/tesseract-wasm-backend.cjs +0 -46
- package/dist/ocr/tesseract-wasm-backend.cjs.map +1 -1
- package/dist/ocr/tesseract-wasm-backend.d.cts +1 -1
- package/dist/ocr/tesseract-wasm-backend.d.ts +1 -1
- package/dist/ocr/tesseract-wasm-backend.js +0 -46
- package/dist/ocr/tesseract-wasm-backend.js.map +1 -1
- package/dist/pdfium.js +0 -5
- package/dist/runtime.cjs +0 -1
- package/dist/runtime.cjs.map +1 -1
- package/dist/runtime.js +0 -1
- package/dist/runtime.js.map +1 -1
- package/dist/{types-CKjcIYcX.d.cts → types-wVLLDHkl.d.cts} +73 -3
- package/dist/{types-CKjcIYcX.d.ts → types-wVLLDHkl.d.ts} +73 -3
- package/package.json +162 -162
package/dist/index.js
CHANGED
|
@@ -1,3 +1,29 @@
|
|
|
1
|
+
// Bundler runtime helpers: short aliases for the Object built-ins used below.
var __defProp = Object.defineProperty;
var __getOwnPropNames = Object.getOwnPropertyNames;

/**
 * Builds a run-once initializer around a module table.
 *
 * The first call looks up the first own property of `fn`, invokes it with `0`,
 * and stores the return value in `res`. `fn` is set to `0` before the call, so
 * every later call skips the lookup and simply returns the stored `res`.
 *
 * @param {object} fn - Table whose first own property is the initializer function.
 * @param {*} [res] - Initial value returned when `fn` is falsy.
 * @returns {Function} The `__init` function described above.
 */
var __esm = (fn, res) => function __init() {
  if (fn) {
    const [entryKey] = __getOwnPropNames(fn);
    const entry = fn[entryKey];
    // Drop the table before running the entry so a failing or re-entrant
    // initializer is never invoked a second time.
    fn = 0;
    res = entry(0);
  }
  return res;
};

/**
 * Defines one enumerable getter on `target` for every enumerable key of `all`.
 *
 * @param {object} target - Object that receives the getters.
 * @param {object} all - Map of export name to zero-argument getter function.
 */
var __export = (target, all) => {
  for (const exportName in all) {
    const getter = all[exportName];
    __defProp(target, exportName, { get: getter, enumerable: true });
  }
};
|
|
10
|
+
|
|
11
|
+
// typescript/pdfium.js
|
|
12
|
+
// Export table for the bundled pdfium module; `default` is exposed through a
// getter that resolves to `initPdfium`.
var pdfium_exports = {};
__export(pdfium_exports, {
  default: () => initPdfium
});

/**
 * Placeholder pdfium initializer.
 *
 * @returns {Promise<object>} Resolves to a new empty object on every call.
 */
async function initPdfium() {
  // Dummy implementation for testing
  const placeholder = {};
  return placeholder;
}

// Lazy initializer for the module above; its body contains only the
// "use strict" directive, so running it has no observable effect.
var init_pdfium = __esm({
  "typescript/pdfium.js"() {
    "use strict";
  }
});
|
|
26
|
+
|
|
1
27
|
// typescript/adapters/wasm-adapter.ts
|
|
2
28
|
var MAX_FILE_SIZE = 512 * 1024 * 1024;
|
|
3
29
|
function isNumberOrNull(value) {
|
|
@@ -312,93 +338,48 @@ var TesseractWasmBackend = class {
|
|
|
312
338
|
return this.supportedLangsCache;
|
|
313
339
|
}
|
|
314
340
|
this.supportedLangsCache = [
|
|
315
|
-
// Major languages
|
|
316
341
|
"eng",
|
|
317
|
-
// English
|
|
318
342
|
"deu",
|
|
319
|
-
// German
|
|
320
343
|
"fra",
|
|
321
|
-
// French
|
|
322
344
|
"spa",
|
|
323
|
-
// Spanish
|
|
324
345
|
"ita",
|
|
325
|
-
// Italian
|
|
326
346
|
"por",
|
|
327
|
-
// Portuguese
|
|
328
347
|
"nld",
|
|
329
|
-
// Dutch
|
|
330
348
|
"rus",
|
|
331
|
-
// Russian
|
|
332
349
|
"jpn",
|
|
333
|
-
// Japanese
|
|
334
350
|
"kor",
|
|
335
|
-
// Korean
|
|
336
351
|
"chi_sim",
|
|
337
|
-
// Chinese (Simplified)
|
|
338
352
|
"chi_tra",
|
|
339
|
-
// Chinese (Traditional)
|
|
340
|
-
// Additional European languages
|
|
341
353
|
"pol",
|
|
342
|
-
// Polish
|
|
343
354
|
"tur",
|
|
344
|
-
// Turkish
|
|
345
355
|
"swe",
|
|
346
|
-
// Swedish
|
|
347
356
|
"dan",
|
|
348
|
-
// Danish
|
|
349
357
|
"fin",
|
|
350
|
-
// Finnish
|
|
351
358
|
"nor",
|
|
352
|
-
// Norwegian
|
|
353
359
|
"ces",
|
|
354
|
-
// Czech
|
|
355
360
|
"slk",
|
|
356
|
-
// Slovak
|
|
357
361
|
"ron",
|
|
358
|
-
// Romanian
|
|
359
362
|
"hun",
|
|
360
|
-
// Hungarian
|
|
361
363
|
"hrv",
|
|
362
|
-
// Croatian
|
|
363
364
|
"srp",
|
|
364
|
-
// Serbian
|
|
365
365
|
"bul",
|
|
366
|
-
// Bulgarian
|
|
367
366
|
"ukr",
|
|
368
|
-
// Ukrainian
|
|
369
367
|
"ell",
|
|
370
|
-
// Greek
|
|
371
|
-
// Asian languages
|
|
372
368
|
"ara",
|
|
373
|
-
// Arabic
|
|
374
369
|
"heb",
|
|
375
|
-
// Hebrew
|
|
376
370
|
"hin",
|
|
377
|
-
// Hindi
|
|
378
371
|
"tha",
|
|
379
|
-
// Thai
|
|
380
372
|
"vie",
|
|
381
|
-
// Vietnamese
|
|
382
373
|
"mkd",
|
|
383
|
-
// Macedonian
|
|
384
374
|
"ben",
|
|
385
|
-
// Bengali
|
|
386
375
|
"tam",
|
|
387
|
-
// Tamil
|
|
388
376
|
"tel",
|
|
389
|
-
// Telugu
|
|
390
377
|
"kan",
|
|
391
|
-
// Kannada
|
|
392
378
|
"mal",
|
|
393
|
-
// Malayalam
|
|
394
379
|
"mya",
|
|
395
|
-
// Burmese
|
|
396
380
|
"khm",
|
|
397
|
-
// Khmer
|
|
398
381
|
"lao",
|
|
399
|
-
// Lao
|
|
400
382
|
"sin"
|
|
401
|
-
// Sinhala
|
|
402
383
|
];
|
|
403
384
|
return this.supportedLangsCache;
|
|
404
385
|
}
|
|
@@ -500,7 +481,6 @@ var TesseractWasmBackend = class {
|
|
|
500
481
|
...pageMetadata
|
|
501
482
|
},
|
|
502
483
|
tables: []
|
|
503
|
-
// Tesseract-wasm doesn't provide structured table detection
|
|
504
484
|
};
|
|
505
485
|
} catch (error) {
|
|
506
486
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -786,7 +766,6 @@ function getRuntimeVersion() {
|
|
|
786
766
|
switch (runtime) {
|
|
787
767
|
case "node":
|
|
788
768
|
return process.version?.substring(1);
|
|
789
|
-
// Remove 'v' prefix
|
|
790
769
|
case "deno": {
|
|
791
770
|
const deno = globalThis.Deno;
|
|
792
771
|
const version = deno?.version;
|
|
@@ -834,6 +813,168 @@ function getRuntimeInfo() {
|
|
|
834
813
|
};
|
|
835
814
|
}
|
|
836
815
|
|
|
816
|
+
// typescript/plugin-registry.ts
|
|
817
|
+
// Module-level registry of post-processor plugins, keyed by the string returned from each plugin's name().
var postProcessors = /* @__PURE__ */ new Map();
|
|
818
|
+
// Module-level registry of validator plugins, keyed by the string returned from each plugin's name().
var validators = /* @__PURE__ */ new Map();
|
|
819
|
+
/**
 * Checks that a candidate satisfies the post-processor plugin contract.
 *
 * @param {*} processor - Candidate plugin object.
 * @returns {true} `true` when every check passes.
 * @throws {Error} When the candidate is null/undefined, lacks a `name()` or
 *   `process()` method, or `name()` returns anything but a non-blank string.
 */
function validatePostProcessor(processor) {
  if (processor == null) {
    throw new Error("Post-processor cannot be null or undefined");
  }

  // Checked in this order so the first missing method decides the message.
  const requiredMethods = [
    ["name", "Post-processor must implement name() method"],
    ["process", "Post-processor must implement process() method"]
  ];
  for (const [method, complaint] of requiredMethods) {
    if (typeof processor[method] !== "function") {
      throw new Error(complaint);
    }
  }

  const label = processor.name();
  const hasUsableName = typeof label === "string" && label.trim() !== "";
  if (!hasUsableName) {
    throw new Error("Post-processor name must be a non-empty string");
  }
  return true;
}
|
|
836
|
+
/**
 * Validates a post-processor and stores it in the registry under its name.
 *
 * Registering a second plugin under an existing name replaces the earlier one
 * and logs a warning.
 *
 * @param {object} processor - Plugin exposing `name()` and `process()`.
 * @throws {Error} Whatever `validatePostProcessor` throws for an invalid plugin.
 */
function registerPostProcessor(processor) {
  validatePostProcessor(processor);

  const key = processor.name();
  const isReplacement = postProcessors.has(key);
  if (isReplacement) {
    console.warn(`Post-processor "${key}" already registered, overwriting with new implementation`);
  }
  postProcessors.set(key, processor);
}
|
|
844
|
+
/**
 * Looks up a registered post-processor.
 *
 * @param {string} name - Registry key.
 * @returns {object|undefined} The stored plugin, or `undefined` when the key is absent.
 */
function getPostProcessor(name) {
  const registered = postProcessors.get(name);
  return registered;
}
|
|
847
|
+
/**
 * Lists the names of all registered post-processors.
 *
 * @returns {string[]} A new array of registry keys in insertion order.
 */
function listPostProcessors() {
  return [...postProcessors.keys()];
}
|
|
850
|
+
/**
 * Removes a post-processor from the registry, calling its `shutdown()` first
 * when it has one.
 *
 * A failing `shutdown()` is logged as a warning and does not prevent removal.
 *
 * @param {string} name - Registry key of the plugin to remove.
 * @returns {Promise<void>}
 * @throws {Error} When no plugin is registered under `name`; the message lists
 *   the currently registered names, if any.
 */
async function unregisterPostProcessor(name) {
  const target = postProcessors.get(name);

  if (target) {
    try {
      if (target.shutdown) {
        await target.shutdown();
      }
    } catch (failure) {
      console.warn(`Error during shutdown of post-processor "${name}":`, failure);
    }
    postProcessors.delete(name);
    return;
  }

  const registered = [...postProcessors.keys()];
  const hint = registered.length > 0 ? ` Available: ${registered.join(", ")}` : "";
  throw new Error(`Post-processor "${name}" is not registered.${hint}`);
}
|
|
866
|
+
/**
 * Shuts down every registered post-processor, one after another, then empties
 * the registry.
 *
 * A failing `shutdown()` is logged as a warning and the loop continues.
 *
 * @returns {Promise<void>}
 */
async function clearPostProcessors() {
  // Iterate over a snapshot so the loop is unaffected by registry changes
  // made while a shutdown is being awaited.
  const snapshot = [...postProcessors];
  for (const [registeredName, plugin] of snapshot) {
    try {
      if (plugin.shutdown) {
        await plugin.shutdown();
      }
    } catch (failure) {
      console.warn(`Error during shutdown of post-processor "${registeredName}":`, failure);
    }
  }
  postProcessors.clear();
}
|
|
879
|
+
/**
 * Checks that a candidate satisfies the validator plugin contract.
 *
 * @param {*} validator - Candidate plugin object.
 * @returns {true} `true` when every check passes.
 * @throws {Error} When the candidate is null/undefined, lacks a `name()` or
 *   `validate()` method, or `name()` returns anything but a non-blank string.
 */
function validateValidator(validator) {
  if (validator == null) {
    throw new Error("Validator cannot be null or undefined");
  }

  // Checked in this order so the first missing method decides the message.
  const requiredMethods = [
    ["name", "Validator must implement name() method"],
    ["validate", "Validator must implement validate() method"]
  ];
  for (const [method, complaint] of requiredMethods) {
    if (typeof validator[method] !== "function") {
      throw new Error(complaint);
    }
  }

  const label = validator.name();
  const hasUsableName = typeof label === "string" && label.trim() !== "";
  if (!hasUsableName) {
    throw new Error("Validator name must be a non-empty string");
  }
  return true;
}
|
|
896
|
+
/**
 * Validates a validator plugin and stores it in the registry under its name.
 *
 * Registering a second plugin under an existing name replaces the earlier one
 * and logs a warning.
 *
 * @param {object} validator - Plugin exposing `name()` and `validate()`.
 * @throws {Error} Whatever `validateValidator` throws for an invalid plugin.
 */
function registerValidator(validator) {
  validateValidator(validator);

  const key = validator.name();
  const isReplacement = validators.has(key);
  if (isReplacement) {
    console.warn(`Validator "${key}" already registered, overwriting with new implementation`);
  }
  validators.set(key, validator);
}
|
|
904
|
+
/**
 * Looks up a registered validator.
 *
 * @param {string} name - Registry key.
 * @returns {object|undefined} The stored plugin, or `undefined` when the key is absent.
 */
function getValidator(name) {
  const registered = validators.get(name);
  return registered;
}
|
|
907
|
+
/**
 * Lists the names of all registered validators.
 *
 * @returns {string[]} A new array of registry keys in insertion order.
 */
function listValidators() {
  return [...validators.keys()];
}
|
|
910
|
+
/**
 * Removes a validator from the registry, calling its `shutdown()` first when
 * it has one.
 *
 * A failing `shutdown()` is logged as a warning and does not prevent removal.
 *
 * @param {string} name - Registry key of the plugin to remove.
 * @returns {Promise<void>}
 * @throws {Error} When no plugin is registered under `name`; the message lists
 *   the currently registered names, if any.
 */
async function unregisterValidator(name) {
  const target = validators.get(name);

  if (target) {
    try {
      if (target.shutdown) {
        await target.shutdown();
      }
    } catch (failure) {
      console.warn(`Error during shutdown of validator "${name}":`, failure);
    }
    validators.delete(name);
    return;
  }

  const registered = [...validators.keys()];
  const hint = registered.length > 0 ? ` Available: ${registered.join(", ")}` : "";
  throw new Error(`Validator "${name}" is not registered.${hint}`);
}
|
|
926
|
+
/**
 * Shuts down every registered validator, one after another, then empties the
 * registry.
 *
 * A failing `shutdown()` is logged as a warning and the loop continues.
 *
 * @returns {Promise<void>}
 */
async function clearValidators() {
  // Iterate over a snapshot so the loop is unaffected by registry changes
  // made while a shutdown is being awaited.
  const snapshot = [...validators];
  for (const [registeredName, plugin] of snapshot) {
    try {
      if (plugin.shutdown) {
        await plugin.shutdown();
      }
    } catch (failure) {
      console.warn(`Error during shutdown of validator "${registeredName}":`, failure);
    }
  }
  validators.clear();
}
|
|
939
|
+
/**
 * Runs the named post-processor against an extraction result.
 *
 * Always returns a promise, whether `process()` is synchronous or asynchronous.
 *
 * @param {string} name - Registry key of the post-processor to run.
 * @param {*} result - Value handed to the plugin's `process()` method.
 * @returns {Promise<*>} Settles with whatever `process()` returns or resolves to.
 *   Rejects with an `Error` when the name is not registered. When `process()`
 *   throws synchronously, rejects with a wrapping `Error` whose `cause` is the
 *   thrown value. Rejections of a promise returned by `process()` pass through
 *   unchanged.
 */
function executePostProcessor(name, result) {
  const processor = postProcessors.get(name);
  if (!processor) {
    return Promise.reject(new Error(`Post-processor "${name}" is not registered`));
  }
  try {
    // Promise.resolve returns a native promise as-is and wraps plain values,
    // so no separate `instanceof Promise` branch is needed.
    return Promise.resolve(processor.process(result));
  } catch (error) {
    // Keep the original error (and its stack) reachable through `cause`
    // instead of reducing it to a string only.
    return Promise.reject(
      new Error(`Error executing post-processor "${name}": ${String(error)}`, { cause: error })
    );
  }
}
|
|
954
|
+
/**
 * Runs the named validator against an extraction result.
 *
 * Always returns a promise, whether `validate()` is synchronous or asynchronous.
 *
 * @param {string} name - Registry key of the validator to run.
 * @param {*} result - Value handed to the plugin's `validate()` method.
 * @returns {Promise<*>} Settles with whatever `validate()` returns or resolves to.
 *   Rejects with an `Error` when the name is not registered. When `validate()`
 *   throws synchronously, rejects with a wrapping `Error` whose `cause` is the
 *   thrown value. Rejections of a promise returned by `validate()` pass through
 *   unchanged.
 */
function executeValidator(name, result) {
  const validator = validators.get(name);
  if (!validator) {
    return Promise.reject(new Error(`Validator "${name}" is not registered`));
  }
  try {
    // Promise.resolve returns a native promise as-is and wraps plain values,
    // so no separate `instanceof Promise` branch is needed.
    return Promise.resolve(validator.validate(result));
  } catch (error) {
    // Keep the original error (and its stack) reachable through `cause`
    // instead of reducing it to a string only.
    return Promise.reject(
      new Error(`Error executing validator "${name}": ${String(error)}`, { cause: error })
    );
  }
}
|
|
969
|
+
/**
 * Publishes the two plugin executors on `globalThis` under the
 * `__kreuzberg_execute_post_processor` and `__kreuzberg_execute_validator`
 * names. Does nothing when `globalThis` is unavailable.
 *
 * NOTE(review): presumably these globals are looked up by the WASM side;
 * the consumer is not visible here, so confirm before renaming them.
 */
function setupGlobalCallbacks() {
  if (typeof globalThis === "undefined") {
    return;
  }
  globalThis.__kreuzberg_execute_post_processor = executePostProcessor;
  globalThis.__kreuzberg_execute_validator = executeValidator;
}

// Install the callbacks as soon as this module is evaluated.
setupGlobalCallbacks();
|
|
977
|
+
|
|
837
978
|
// typescript/index.ts
|
|
838
979
|
var wasm = null;
|
|
839
980
|
var initialized = false;
|
|
@@ -848,7 +989,7 @@ async function initializePdfiumAsync(wasmModule) {
|
|
|
848
989
|
return;
|
|
849
990
|
}
|
|
850
991
|
try {
|
|
851
|
-
const pdfiumModule = await
|
|
992
|
+
const pdfiumModule = await Promise.resolve().then(() => (init_pdfium(), pdfium_exports));
|
|
852
993
|
const pdfium = typeof pdfiumModule.default === "function" ? await pdfiumModule.default() : pdfiumModule;
|
|
853
994
|
const success = wasmModule.initialize_pdfium_render(pdfium, wasmModule, false);
|
|
854
995
|
if (!success) {
|
|
@@ -1172,6 +1313,8 @@ export {
|
|
|
1172
1313
|
batchExtractBytesSync,
|
|
1173
1314
|
batchExtractFiles,
|
|
1174
1315
|
clearOcrBackends,
|
|
1316
|
+
clearPostProcessors,
|
|
1317
|
+
clearValidators,
|
|
1175
1318
|
configToJS,
|
|
1176
1319
|
detectRuntime,
|
|
1177
1320
|
enableOcr,
|
|
@@ -1182,8 +1325,10 @@ export {
|
|
|
1182
1325
|
fileToUint8Array,
|
|
1183
1326
|
getInitializationError,
|
|
1184
1327
|
getOcrBackend,
|
|
1328
|
+
getPostProcessor,
|
|
1185
1329
|
getRuntimeInfo,
|
|
1186
1330
|
getRuntimeVersion,
|
|
1331
|
+
getValidator,
|
|
1187
1332
|
getVersion,
|
|
1188
1333
|
getWasmCapabilities,
|
|
1189
1334
|
hasBigInt,
|
|
@@ -1205,8 +1350,14 @@ export {
|
|
|
1205
1350
|
isWebEnvironment,
|
|
1206
1351
|
jsToExtractionResult,
|
|
1207
1352
|
listOcrBackends,
|
|
1353
|
+
listPostProcessors,
|
|
1354
|
+
listValidators,
|
|
1208
1355
|
registerOcrBackend,
|
|
1356
|
+
registerPostProcessor,
|
|
1357
|
+
registerValidator,
|
|
1209
1358
|
unregisterOcrBackend,
|
|
1359
|
+
unregisterPostProcessor,
|
|
1360
|
+
unregisterValidator,
|
|
1210
1361
|
wrapWasmError
|
|
1211
1362
|
};
|
|
1212
1363
|
//# sourceMappingURL=index.js.map
|