@kreuzberg/wasm 4.0.0-rc.21 → 4.0.0-rc.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +520 -837
- package/dist/adapters/wasm-adapter.d.ts +7 -10
- package/dist/adapters/wasm-adapter.d.ts.map +1 -0
- package/dist/adapters/wasm-adapter.js +41 -19
- package/dist/adapters/wasm-adapter.js.map +1 -1
- package/dist/index.d.ts +23 -24
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +240 -67
- package/dist/index.js.map +1 -1
- package/dist/ocr/registry.d.ts +7 -10
- package/dist/ocr/registry.d.ts.map +1 -0
- package/dist/ocr/registry.js.map +1 -1
- package/dist/ocr/tesseract-wasm-backend.d.ts +3 -6
- package/dist/ocr/tesseract-wasm-backend.d.ts.map +1 -0
- package/dist/ocr/tesseract-wasm-backend.js +0 -46
- package/dist/ocr/tesseract-wasm-backend.js.map +1 -1
- package/dist/pdfium.js +0 -5
- package/dist/plugin-registry.d.ts +246 -0
- package/dist/plugin-registry.d.ts.map +1 -0
- package/dist/runtime.d.ts +21 -22
- package/dist/runtime.d.ts.map +1 -0
- package/dist/runtime.js +0 -1
- package/dist/runtime.js.map +1 -1
- package/dist/{types-CKjcIYcX.d.ts → types.d.ts} +91 -22
- package/dist/types.d.ts.map +1 -0
- package/package.json +119 -162
- package/dist/adapters/wasm-adapter.cjs +0 -245
- package/dist/adapters/wasm-adapter.cjs.map +0 -1
- package/dist/adapters/wasm-adapter.d.cts +0 -121
- package/dist/index.cjs +0 -1245
- package/dist/index.cjs.map +0 -1
- package/dist/index.d.cts +0 -423
- package/dist/ocr/registry.cjs +0 -92
- package/dist/ocr/registry.cjs.map +0 -1
- package/dist/ocr/registry.d.cts +0 -102
- package/dist/ocr/tesseract-wasm-backend.cjs +0 -456
- package/dist/ocr/tesseract-wasm-backend.cjs.map +0 -1
- package/dist/ocr/tesseract-wasm-backend.d.cts +0 -257
- package/dist/runtime.cjs +0 -174
- package/dist/runtime.cjs.map +0 -1
- package/dist/runtime.d.cts +0 -256
- package/dist/types-CKjcIYcX.d.cts +0 -294
package/dist/index.js
CHANGED
|
@@ -1,3 +1,29 @@
|
|
|
1
|
+
var __defProp = Object.defineProperty;
|
|
2
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
3
|
+
var __esm = (fn, res) => function __init() {
|
|
4
|
+
return fn && (res = (0, fn[__getOwnPropNames(fn)[0]])(fn = 0)), res;
|
|
5
|
+
};
|
|
6
|
+
var __export = (target, all) => {
|
|
7
|
+
for (var name in all)
|
|
8
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
9
|
+
};
|
|
10
|
+
|
|
11
|
+
// typescript/pdfium.js
|
|
12
|
+
var pdfium_exports = {};
|
|
13
|
+
__export(pdfium_exports, {
|
|
14
|
+
default: () => initPdfium
|
|
15
|
+
});
|
|
16
|
+
async function initPdfium() {
|
|
17
|
+
return {
|
|
18
|
+
// Dummy implementation for testing
|
|
19
|
+
};
|
|
20
|
+
}
|
|
21
|
+
var init_pdfium = __esm({
|
|
22
|
+
"typescript/pdfium.js"() {
|
|
23
|
+
"use strict";
|
|
24
|
+
}
|
|
25
|
+
});
|
|
26
|
+
|
|
1
27
|
// typescript/adapters/wasm-adapter.ts
|
|
2
28
|
var MAX_FILE_SIZE = 512 * 1024 * 1024;
|
|
3
29
|
function isNumberOrNull(value) {
|
|
@@ -104,30 +130,52 @@ function jsToExtractionResult(jsValue) {
|
|
|
104
130
|
}
|
|
105
131
|
embedding = c.embedding;
|
|
106
132
|
}
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
133
|
+
const coerceToNumber = (value, fieldName) => {
|
|
134
|
+
if (typeof value === "number") {
|
|
135
|
+
return value;
|
|
136
|
+
}
|
|
137
|
+
if (typeof value === "bigint") {
|
|
138
|
+
return Number(value);
|
|
139
|
+
}
|
|
140
|
+
if (typeof value === "string") {
|
|
141
|
+
const parsed = parseInt(value, 10);
|
|
142
|
+
if (isNaN(parsed)) {
|
|
143
|
+
throw new Error(`Invalid chunk metadata: ${fieldName} must be a valid number, got "${value}"`);
|
|
144
|
+
}
|
|
145
|
+
return parsed;
|
|
146
|
+
}
|
|
147
|
+
throw new Error(`Invalid chunk metadata: ${fieldName} must be a number, got ${typeof value}`);
|
|
148
|
+
};
|
|
149
|
+
const charStart = coerceToNumber(
|
|
150
|
+
metadata.charStart ?? metadata.char_start ?? metadata.byteStart ?? metadata.byte_start,
|
|
151
|
+
"charStart"
|
|
152
|
+
);
|
|
153
|
+
const charEnd = coerceToNumber(
|
|
154
|
+
metadata.charEnd ?? metadata.char_end ?? metadata.byteEnd ?? metadata.byte_end,
|
|
155
|
+
"charEnd"
|
|
156
|
+
);
|
|
157
|
+
const chunkIndex = coerceToNumber(
|
|
158
|
+
metadata.chunkIndex ?? metadata.chunk_index,
|
|
159
|
+
"chunkIndex"
|
|
160
|
+
);
|
|
161
|
+
const totalChunks = coerceToNumber(
|
|
162
|
+
metadata.totalChunks ?? metadata.total_chunks,
|
|
163
|
+
"totalChunks"
|
|
164
|
+
);
|
|
165
|
+
let tokenCount = null;
|
|
166
|
+
const tokenCountValue = metadata.tokenCount ?? metadata.token_count;
|
|
167
|
+
if (tokenCountValue !== null && tokenCountValue !== void 0) {
|
|
168
|
+
tokenCount = coerceToNumber(tokenCountValue, "tokenCount");
|
|
121
169
|
}
|
|
122
170
|
return {
|
|
123
171
|
content: c.content,
|
|
124
172
|
embedding,
|
|
125
173
|
metadata: {
|
|
126
|
-
charStart
|
|
127
|
-
charEnd
|
|
128
|
-
tokenCount
|
|
129
|
-
chunkIndex
|
|
130
|
-
totalChunks
|
|
174
|
+
charStart,
|
|
175
|
+
charEnd,
|
|
176
|
+
tokenCount,
|
|
177
|
+
chunkIndex,
|
|
178
|
+
totalChunks
|
|
131
179
|
}
|
|
132
180
|
};
|
|
133
181
|
}) : null;
|
|
@@ -312,93 +360,48 @@ var TesseractWasmBackend = class {
|
|
|
312
360
|
return this.supportedLangsCache;
|
|
313
361
|
}
|
|
314
362
|
this.supportedLangsCache = [
|
|
315
|
-
// Major languages
|
|
316
363
|
"eng",
|
|
317
|
-
// English
|
|
318
364
|
"deu",
|
|
319
|
-
// German
|
|
320
365
|
"fra",
|
|
321
|
-
// French
|
|
322
366
|
"spa",
|
|
323
|
-
// Spanish
|
|
324
367
|
"ita",
|
|
325
|
-
// Italian
|
|
326
368
|
"por",
|
|
327
|
-
// Portuguese
|
|
328
369
|
"nld",
|
|
329
|
-
// Dutch
|
|
330
370
|
"rus",
|
|
331
|
-
// Russian
|
|
332
371
|
"jpn",
|
|
333
|
-
// Japanese
|
|
334
372
|
"kor",
|
|
335
|
-
// Korean
|
|
336
373
|
"chi_sim",
|
|
337
|
-
// Chinese (Simplified)
|
|
338
374
|
"chi_tra",
|
|
339
|
-
// Chinese (Traditional)
|
|
340
|
-
// Additional European languages
|
|
341
375
|
"pol",
|
|
342
|
-
// Polish
|
|
343
376
|
"tur",
|
|
344
|
-
// Turkish
|
|
345
377
|
"swe",
|
|
346
|
-
// Swedish
|
|
347
378
|
"dan",
|
|
348
|
-
// Danish
|
|
349
379
|
"fin",
|
|
350
|
-
// Finnish
|
|
351
380
|
"nor",
|
|
352
|
-
// Norwegian
|
|
353
381
|
"ces",
|
|
354
|
-
// Czech
|
|
355
382
|
"slk",
|
|
356
|
-
// Slovak
|
|
357
383
|
"ron",
|
|
358
|
-
// Romanian
|
|
359
384
|
"hun",
|
|
360
|
-
// Hungarian
|
|
361
385
|
"hrv",
|
|
362
|
-
// Croatian
|
|
363
386
|
"srp",
|
|
364
|
-
// Serbian
|
|
365
387
|
"bul",
|
|
366
|
-
// Bulgarian
|
|
367
388
|
"ukr",
|
|
368
|
-
// Ukrainian
|
|
369
389
|
"ell",
|
|
370
|
-
// Greek
|
|
371
|
-
// Asian languages
|
|
372
390
|
"ara",
|
|
373
|
-
// Arabic
|
|
374
391
|
"heb",
|
|
375
|
-
// Hebrew
|
|
376
392
|
"hin",
|
|
377
|
-
// Hindi
|
|
378
393
|
"tha",
|
|
379
|
-
// Thai
|
|
380
394
|
"vie",
|
|
381
|
-
// Vietnamese
|
|
382
395
|
"mkd",
|
|
383
|
-
// Macedonian
|
|
384
396
|
"ben",
|
|
385
|
-
// Bengali
|
|
386
397
|
"tam",
|
|
387
|
-
// Tamil
|
|
388
398
|
"tel",
|
|
389
|
-
// Telugu
|
|
390
399
|
"kan",
|
|
391
|
-
// Kannada
|
|
392
400
|
"mal",
|
|
393
|
-
// Malayalam
|
|
394
401
|
"mya",
|
|
395
|
-
// Burmese
|
|
396
402
|
"khm",
|
|
397
|
-
// Khmer
|
|
398
403
|
"lao",
|
|
399
|
-
// Lao
|
|
400
404
|
"sin"
|
|
401
|
-
// Sinhala
|
|
402
405
|
];
|
|
403
406
|
return this.supportedLangsCache;
|
|
404
407
|
}
|
|
@@ -500,7 +503,6 @@ var TesseractWasmBackend = class {
|
|
|
500
503
|
...pageMetadata
|
|
501
504
|
},
|
|
502
505
|
tables: []
|
|
503
|
-
// Tesseract-wasm doesn't provide structured table detection
|
|
504
506
|
};
|
|
505
507
|
} catch (error) {
|
|
506
508
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -786,7 +788,6 @@ function getRuntimeVersion() {
|
|
|
786
788
|
switch (runtime) {
|
|
787
789
|
case "node":
|
|
788
790
|
return process.version?.substring(1);
|
|
789
|
-
// Remove 'v' prefix
|
|
790
791
|
case "deno": {
|
|
791
792
|
const deno = globalThis.Deno;
|
|
792
793
|
const version = deno?.version;
|
|
@@ -834,6 +835,168 @@ function getRuntimeInfo() {
|
|
|
834
835
|
};
|
|
835
836
|
}
|
|
836
837
|
|
|
838
|
+
// typescript/plugin-registry.ts
|
|
839
|
+
var postProcessors = /* @__PURE__ */ new Map();
|
|
840
|
+
var validators = /* @__PURE__ */ new Map();
|
|
841
|
+
function validatePostProcessor(processor) {
|
|
842
|
+
if (processor === null || processor === void 0) {
|
|
843
|
+
throw new Error("Post-processor cannot be null or undefined");
|
|
844
|
+
}
|
|
845
|
+
const obj = processor;
|
|
846
|
+
if (typeof obj.name !== "function") {
|
|
847
|
+
throw new Error("Post-processor must implement name() method");
|
|
848
|
+
}
|
|
849
|
+
if (typeof obj.process !== "function") {
|
|
850
|
+
throw new Error("Post-processor must implement process() method");
|
|
851
|
+
}
|
|
852
|
+
const name = obj.name();
|
|
853
|
+
if (typeof name !== "string" || name.trim() === "") {
|
|
854
|
+
throw new Error("Post-processor name must be a non-empty string");
|
|
855
|
+
}
|
|
856
|
+
return true;
|
|
857
|
+
}
|
|
858
|
+
function registerPostProcessor(processor) {
|
|
859
|
+
validatePostProcessor(processor);
|
|
860
|
+
const name = processor.name();
|
|
861
|
+
if (postProcessors.has(name)) {
|
|
862
|
+
console.warn(`Post-processor "${name}" already registered, overwriting with new implementation`);
|
|
863
|
+
}
|
|
864
|
+
postProcessors.set(name, processor);
|
|
865
|
+
}
|
|
866
|
+
function getPostProcessor(name) {
|
|
867
|
+
return postProcessors.get(name);
|
|
868
|
+
}
|
|
869
|
+
function listPostProcessors() {
|
|
870
|
+
return Array.from(postProcessors.keys());
|
|
871
|
+
}
|
|
872
|
+
async function unregisterPostProcessor(name) {
|
|
873
|
+
const processor = postProcessors.get(name);
|
|
874
|
+
if (!processor) {
|
|
875
|
+
const available = Array.from(postProcessors.keys());
|
|
876
|
+
const availableStr = available.length > 0 ? ` Available: ${available.join(", ")}` : "";
|
|
877
|
+
throw new Error(`Post-processor "${name}" is not registered.${availableStr}`);
|
|
878
|
+
}
|
|
879
|
+
try {
|
|
880
|
+
if (processor.shutdown) {
|
|
881
|
+
await processor.shutdown();
|
|
882
|
+
}
|
|
883
|
+
} catch (error) {
|
|
884
|
+
console.warn(`Error during shutdown of post-processor "${name}":`, error);
|
|
885
|
+
}
|
|
886
|
+
postProcessors.delete(name);
|
|
887
|
+
}
|
|
888
|
+
async function clearPostProcessors() {
|
|
889
|
+
const entries = Array.from(postProcessors.entries());
|
|
890
|
+
for (const [_name, processor] of entries) {
|
|
891
|
+
try {
|
|
892
|
+
if (processor.shutdown) {
|
|
893
|
+
await processor.shutdown();
|
|
894
|
+
}
|
|
895
|
+
} catch (error) {
|
|
896
|
+
console.warn(`Error during shutdown of post-processor "${_name}":`, error);
|
|
897
|
+
}
|
|
898
|
+
}
|
|
899
|
+
postProcessors.clear();
|
|
900
|
+
}
|
|
901
|
+
function validateValidator(validator) {
|
|
902
|
+
if (validator === null || validator === void 0) {
|
|
903
|
+
throw new Error("Validator cannot be null or undefined");
|
|
904
|
+
}
|
|
905
|
+
const obj = validator;
|
|
906
|
+
if (typeof obj.name !== "function") {
|
|
907
|
+
throw new Error("Validator must implement name() method");
|
|
908
|
+
}
|
|
909
|
+
if (typeof obj.validate !== "function") {
|
|
910
|
+
throw new Error("Validator must implement validate() method");
|
|
911
|
+
}
|
|
912
|
+
const name = obj.name();
|
|
913
|
+
if (typeof name !== "string" || name.trim() === "") {
|
|
914
|
+
throw new Error("Validator name must be a non-empty string");
|
|
915
|
+
}
|
|
916
|
+
return true;
|
|
917
|
+
}
|
|
918
|
+
function registerValidator(validator) {
|
|
919
|
+
validateValidator(validator);
|
|
920
|
+
const name = validator.name();
|
|
921
|
+
if (validators.has(name)) {
|
|
922
|
+
console.warn(`Validator "${name}" already registered, overwriting with new implementation`);
|
|
923
|
+
}
|
|
924
|
+
validators.set(name, validator);
|
|
925
|
+
}
|
|
926
|
+
function getValidator(name) {
|
|
927
|
+
return validators.get(name);
|
|
928
|
+
}
|
|
929
|
+
function listValidators() {
|
|
930
|
+
return Array.from(validators.keys());
|
|
931
|
+
}
|
|
932
|
+
async function unregisterValidator(name) {
|
|
933
|
+
const validator = validators.get(name);
|
|
934
|
+
if (!validator) {
|
|
935
|
+
const available = Array.from(validators.keys());
|
|
936
|
+
const availableStr = available.length > 0 ? ` Available: ${available.join(", ")}` : "";
|
|
937
|
+
throw new Error(`Validator "${name}" is not registered.${availableStr}`);
|
|
938
|
+
}
|
|
939
|
+
try {
|
|
940
|
+
if (validator.shutdown) {
|
|
941
|
+
await validator.shutdown();
|
|
942
|
+
}
|
|
943
|
+
} catch (error) {
|
|
944
|
+
console.warn(`Error during shutdown of validator "${name}":`, error);
|
|
945
|
+
}
|
|
946
|
+
validators.delete(name);
|
|
947
|
+
}
|
|
948
|
+
async function clearValidators() {
|
|
949
|
+
const entries = Array.from(validators.entries());
|
|
950
|
+
for (const [_name, validator] of entries) {
|
|
951
|
+
try {
|
|
952
|
+
if (validator.shutdown) {
|
|
953
|
+
await validator.shutdown();
|
|
954
|
+
}
|
|
955
|
+
} catch (error) {
|
|
956
|
+
console.warn(`Error during shutdown of validator "${_name}":`, error);
|
|
957
|
+
}
|
|
958
|
+
}
|
|
959
|
+
validators.clear();
|
|
960
|
+
}
|
|
961
|
+
function executePostProcessor(name, result) {
|
|
962
|
+
const processor = postProcessors.get(name);
|
|
963
|
+
if (!processor) {
|
|
964
|
+
return Promise.reject(new Error(`Post-processor "${name}" is not registered`));
|
|
965
|
+
}
|
|
966
|
+
try {
|
|
967
|
+
const output = processor.process(result);
|
|
968
|
+
if (output instanceof Promise) {
|
|
969
|
+
return output;
|
|
970
|
+
}
|
|
971
|
+
return Promise.resolve(output);
|
|
972
|
+
} catch (error) {
|
|
973
|
+
return Promise.reject(new Error(`Error executing post-processor "${name}": ${String(error)}`));
|
|
974
|
+
}
|
|
975
|
+
}
|
|
976
|
+
function executeValidator(name, result) {
|
|
977
|
+
const validator = validators.get(name);
|
|
978
|
+
if (!validator) {
|
|
979
|
+
return Promise.reject(new Error(`Validator "${name}" is not registered`));
|
|
980
|
+
}
|
|
981
|
+
try {
|
|
982
|
+
const output = validator.validate(result);
|
|
983
|
+
if (output instanceof Promise) {
|
|
984
|
+
return output;
|
|
985
|
+
}
|
|
986
|
+
return Promise.resolve(output);
|
|
987
|
+
} catch (error) {
|
|
988
|
+
return Promise.reject(new Error(`Error executing validator "${name}": ${String(error)}`));
|
|
989
|
+
}
|
|
990
|
+
}
|
|
991
|
+
function setupGlobalCallbacks() {
|
|
992
|
+
if (typeof globalThis !== "undefined") {
|
|
993
|
+
const callbacksObj = globalThis;
|
|
994
|
+
callbacksObj.__kreuzberg_execute_post_processor = executePostProcessor;
|
|
995
|
+
callbacksObj.__kreuzberg_execute_validator = executeValidator;
|
|
996
|
+
}
|
|
997
|
+
}
|
|
998
|
+
setupGlobalCallbacks();
|
|
999
|
+
|
|
837
1000
|
// typescript/index.ts
|
|
838
1001
|
var wasm = null;
|
|
839
1002
|
var initialized = false;
|
|
@@ -848,7 +1011,7 @@ async function initializePdfiumAsync(wasmModule) {
|
|
|
848
1011
|
return;
|
|
849
1012
|
}
|
|
850
1013
|
try {
|
|
851
|
-
const pdfiumModule = await
|
|
1014
|
+
const pdfiumModule = await Promise.resolve().then(() => (init_pdfium(), pdfium_exports));
|
|
852
1015
|
const pdfium = typeof pdfiumModule.default === "function" ? await pdfiumModule.default() : pdfiumModule;
|
|
853
1016
|
const success = wasmModule.initialize_pdfium_render(pdfium, wasmModule, false);
|
|
854
1017
|
if (!success) {
|
|
@@ -1172,6 +1335,8 @@ export {
|
|
|
1172
1335
|
batchExtractBytesSync,
|
|
1173
1336
|
batchExtractFiles,
|
|
1174
1337
|
clearOcrBackends,
|
|
1338
|
+
clearPostProcessors,
|
|
1339
|
+
clearValidators,
|
|
1175
1340
|
configToJS,
|
|
1176
1341
|
detectRuntime,
|
|
1177
1342
|
enableOcr,
|
|
@@ -1182,8 +1347,10 @@ export {
|
|
|
1182
1347
|
fileToUint8Array,
|
|
1183
1348
|
getInitializationError,
|
|
1184
1349
|
getOcrBackend,
|
|
1350
|
+
getPostProcessor,
|
|
1185
1351
|
getRuntimeInfo,
|
|
1186
1352
|
getRuntimeVersion,
|
|
1353
|
+
getValidator,
|
|
1187
1354
|
getVersion,
|
|
1188
1355
|
getWasmCapabilities,
|
|
1189
1356
|
hasBigInt,
|
|
@@ -1205,8 +1372,14 @@ export {
|
|
|
1205
1372
|
isWebEnvironment,
|
|
1206
1373
|
jsToExtractionResult,
|
|
1207
1374
|
listOcrBackends,
|
|
1375
|
+
listPostProcessors,
|
|
1376
|
+
listValidators,
|
|
1208
1377
|
registerOcrBackend,
|
|
1378
|
+
registerPostProcessor,
|
|
1379
|
+
registerValidator,
|
|
1209
1380
|
unregisterOcrBackend,
|
|
1381
|
+
unregisterPostProcessor,
|
|
1382
|
+
unregisterValidator,
|
|
1210
1383
|
wrapWasmError
|
|
1211
1384
|
};
|
|
1212
1385
|
//# sourceMappingURL=index.js.map
|