@kreuzberg/wasm 4.0.0-rc.21 → 4.0.0-rc.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. package/README.md +520 -837
  2. package/dist/adapters/wasm-adapter.d.ts +7 -10
  3. package/dist/adapters/wasm-adapter.d.ts.map +1 -0
  4. package/dist/adapters/wasm-adapter.js +41 -19
  5. package/dist/adapters/wasm-adapter.js.map +1 -1
  6. package/dist/index.d.ts +23 -24
  7. package/dist/index.d.ts.map +1 -0
  8. package/dist/index.js +240 -67
  9. package/dist/index.js.map +1 -1
  10. package/dist/ocr/registry.d.ts +7 -10
  11. package/dist/ocr/registry.d.ts.map +1 -0
  12. package/dist/ocr/registry.js.map +1 -1
  13. package/dist/ocr/tesseract-wasm-backend.d.ts +3 -6
  14. package/dist/ocr/tesseract-wasm-backend.d.ts.map +1 -0
  15. package/dist/ocr/tesseract-wasm-backend.js +0 -46
  16. package/dist/ocr/tesseract-wasm-backend.js.map +1 -1
  17. package/dist/pdfium.js +0 -5
  18. package/dist/plugin-registry.d.ts +246 -0
  19. package/dist/plugin-registry.d.ts.map +1 -0
  20. package/dist/runtime.d.ts +21 -22
  21. package/dist/runtime.d.ts.map +1 -0
  22. package/dist/runtime.js +0 -1
  23. package/dist/runtime.js.map +1 -1
  24. package/dist/{types-CKjcIYcX.d.ts → types.d.ts} +91 -22
  25. package/dist/types.d.ts.map +1 -0
  26. package/package.json +119 -162
  27. package/dist/adapters/wasm-adapter.cjs +0 -245
  28. package/dist/adapters/wasm-adapter.cjs.map +0 -1
  29. package/dist/adapters/wasm-adapter.d.cts +0 -121
  30. package/dist/index.cjs +0 -1245
  31. package/dist/index.cjs.map +0 -1
  32. package/dist/index.d.cts +0 -423
  33. package/dist/ocr/registry.cjs +0 -92
  34. package/dist/ocr/registry.cjs.map +0 -1
  35. package/dist/ocr/registry.d.cts +0 -102
  36. package/dist/ocr/tesseract-wasm-backend.cjs +0 -456
  37. package/dist/ocr/tesseract-wasm-backend.cjs.map +0 -1
  38. package/dist/ocr/tesseract-wasm-backend.d.cts +0 -257
  39. package/dist/runtime.cjs +0 -174
  40. package/dist/runtime.cjs.map +0 -1
  41. package/dist/runtime.d.cts +0 -256
  42. package/dist/types-CKjcIYcX.d.cts +0 -294
package/dist/index.js CHANGED
@@ -1,3 +1,29 @@
1
+ var __defProp = Object.defineProperty;
2
+ var __getOwnPropNames = Object.getOwnPropertyNames;
3
+ var __esm = (fn, res) => function __init() {
4
+ return fn && (res = (0, fn[__getOwnPropNames(fn)[0]])(fn = 0)), res;
5
+ };
6
+ var __export = (target, all) => {
7
+ for (var name in all)
8
+ __defProp(target, name, { get: all[name], enumerable: true });
9
+ };
10
+
11
+ // typescript/pdfium.js
12
+ var pdfium_exports = {};
13
+ __export(pdfium_exports, {
14
+ default: () => initPdfium
15
+ });
16
+ async function initPdfium() {
17
+ return {
18
+ // Dummy implementation for testing
19
+ };
20
+ }
21
+ var init_pdfium = __esm({
22
+ "typescript/pdfium.js"() {
23
+ "use strict";
24
+ }
25
+ });
26
+
1
27
  // typescript/adapters/wasm-adapter.ts
2
28
  var MAX_FILE_SIZE = 512 * 1024 * 1024;
3
29
  function isNumberOrNull(value) {
@@ -104,30 +130,52 @@ function jsToExtractionResult(jsValue) {
104
130
  }
105
131
  embedding = c.embedding;
106
132
  }
107
- if (typeof metadata.charStart !== "number") {
108
- throw new Error("Invalid chunk metadata: charStart must be a number");
109
- }
110
- if (typeof metadata.charEnd !== "number") {
111
- throw new Error("Invalid chunk metadata: charEnd must be a number");
112
- }
113
- if (!isNumberOrNull(metadata.tokenCount)) {
114
- throw new Error("Invalid chunk metadata: tokenCount must be a number or null");
115
- }
116
- if (typeof metadata.chunkIndex !== "number") {
117
- throw new Error("Invalid chunk metadata: chunkIndex must be a number");
118
- }
119
- if (typeof metadata.totalChunks !== "number") {
120
- throw new Error("Invalid chunk metadata: totalChunks must be a number");
133
+ const coerceToNumber = (value, fieldName) => {
134
+ if (typeof value === "number") {
135
+ return value;
136
+ }
137
+ if (typeof value === "bigint") {
138
+ return Number(value);
139
+ }
140
+ if (typeof value === "string") {
141
+ const parsed = parseInt(value, 10);
142
+ if (isNaN(parsed)) {
143
+ throw new Error(`Invalid chunk metadata: ${fieldName} must be a valid number, got "${value}"`);
144
+ }
145
+ return parsed;
146
+ }
147
+ throw new Error(`Invalid chunk metadata: ${fieldName} must be a number, got ${typeof value}`);
148
+ };
149
+ const charStart = coerceToNumber(
150
+ metadata.charStart ?? metadata.char_start ?? metadata.byteStart ?? metadata.byte_start,
151
+ "charStart"
152
+ );
153
+ const charEnd = coerceToNumber(
154
+ metadata.charEnd ?? metadata.char_end ?? metadata.byteEnd ?? metadata.byte_end,
155
+ "charEnd"
156
+ );
157
+ const chunkIndex = coerceToNumber(
158
+ metadata.chunkIndex ?? metadata.chunk_index,
159
+ "chunkIndex"
160
+ );
161
+ const totalChunks = coerceToNumber(
162
+ metadata.totalChunks ?? metadata.total_chunks,
163
+ "totalChunks"
164
+ );
165
+ let tokenCount = null;
166
+ const tokenCountValue = metadata.tokenCount ?? metadata.token_count;
167
+ if (tokenCountValue !== null && tokenCountValue !== void 0) {
168
+ tokenCount = coerceToNumber(tokenCountValue, "tokenCount");
121
169
  }
122
170
  return {
123
171
  content: c.content,
124
172
  embedding,
125
173
  metadata: {
126
- charStart: metadata.charStart,
127
- charEnd: metadata.charEnd,
128
- tokenCount: metadata.tokenCount,
129
- chunkIndex: metadata.chunkIndex,
130
- totalChunks: metadata.totalChunks
174
+ charStart,
175
+ charEnd,
176
+ tokenCount,
177
+ chunkIndex,
178
+ totalChunks
131
179
  }
132
180
  };
133
181
  }) : null;
@@ -312,93 +360,48 @@ var TesseractWasmBackend = class {
312
360
  return this.supportedLangsCache;
313
361
  }
314
362
  this.supportedLangsCache = [
315
- // Major languages
316
363
  "eng",
317
- // English
318
364
  "deu",
319
- // German
320
365
  "fra",
321
- // French
322
366
  "spa",
323
- // Spanish
324
367
  "ita",
325
- // Italian
326
368
  "por",
327
- // Portuguese
328
369
  "nld",
329
- // Dutch
330
370
  "rus",
331
- // Russian
332
371
  "jpn",
333
- // Japanese
334
372
  "kor",
335
- // Korean
336
373
  "chi_sim",
337
- // Chinese (Simplified)
338
374
  "chi_tra",
339
- // Chinese (Traditional)
340
- // Additional European languages
341
375
  "pol",
342
- // Polish
343
376
  "tur",
344
- // Turkish
345
377
  "swe",
346
- // Swedish
347
378
  "dan",
348
- // Danish
349
379
  "fin",
350
- // Finnish
351
380
  "nor",
352
- // Norwegian
353
381
  "ces",
354
- // Czech
355
382
  "slk",
356
- // Slovak
357
383
  "ron",
358
- // Romanian
359
384
  "hun",
360
- // Hungarian
361
385
  "hrv",
362
- // Croatian
363
386
  "srp",
364
- // Serbian
365
387
  "bul",
366
- // Bulgarian
367
388
  "ukr",
368
- // Ukrainian
369
389
  "ell",
370
- // Greek
371
- // Asian languages
372
390
  "ara",
373
- // Arabic
374
391
  "heb",
375
- // Hebrew
376
392
  "hin",
377
- // Hindi
378
393
  "tha",
379
- // Thai
380
394
  "vie",
381
- // Vietnamese
382
395
  "mkd",
383
- // Macedonian
384
396
  "ben",
385
- // Bengali
386
397
  "tam",
387
- // Tamil
388
398
  "tel",
389
- // Telugu
390
399
  "kan",
391
- // Kannada
392
400
  "mal",
393
- // Malayalam
394
401
  "mya",
395
- // Burmese
396
402
  "khm",
397
- // Khmer
398
403
  "lao",
399
- // Lao
400
404
  "sin"
401
- // Sinhala
402
405
  ];
403
406
  return this.supportedLangsCache;
404
407
  }
@@ -500,7 +503,6 @@ var TesseractWasmBackend = class {
500
503
  ...pageMetadata
501
504
  },
502
505
  tables: []
503
- // Tesseract-wasm doesn't provide structured table detection
504
506
  };
505
507
  } catch (error) {
506
508
  const message = error instanceof Error ? error.message : String(error);
@@ -786,7 +788,6 @@ function getRuntimeVersion() {
786
788
  switch (runtime) {
787
789
  case "node":
788
790
  return process.version?.substring(1);
789
- // Remove 'v' prefix
790
791
  case "deno": {
791
792
  const deno = globalThis.Deno;
792
793
  const version = deno?.version;
@@ -834,6 +835,168 @@ function getRuntimeInfo() {
834
835
  };
835
836
  }
836
837
 
838
+ // typescript/plugin-registry.ts
839
+ var postProcessors = /* @__PURE__ */ new Map();
840
+ var validators = /* @__PURE__ */ new Map();
841
+ function validatePostProcessor(processor) {
842
+ if (processor === null || processor === void 0) {
843
+ throw new Error("Post-processor cannot be null or undefined");
844
+ }
845
+ const obj = processor;
846
+ if (typeof obj.name !== "function") {
847
+ throw new Error("Post-processor must implement name() method");
848
+ }
849
+ if (typeof obj.process !== "function") {
850
+ throw new Error("Post-processor must implement process() method");
851
+ }
852
+ const name = obj.name();
853
+ if (typeof name !== "string" || name.trim() === "") {
854
+ throw new Error("Post-processor name must be a non-empty string");
855
+ }
856
+ return true;
857
+ }
858
+ function registerPostProcessor(processor) {
859
+ validatePostProcessor(processor);
860
+ const name = processor.name();
861
+ if (postProcessors.has(name)) {
862
+ console.warn(`Post-processor "${name}" already registered, overwriting with new implementation`);
863
+ }
864
+ postProcessors.set(name, processor);
865
+ }
866
+ function getPostProcessor(name) {
867
+ return postProcessors.get(name);
868
+ }
869
+ function listPostProcessors() {
870
+ return Array.from(postProcessors.keys());
871
+ }
872
+ async function unregisterPostProcessor(name) {
873
+ const processor = postProcessors.get(name);
874
+ if (!processor) {
875
+ const available = Array.from(postProcessors.keys());
876
+ const availableStr = available.length > 0 ? ` Available: ${available.join(", ")}` : "";
877
+ throw new Error(`Post-processor "${name}" is not registered.${availableStr}`);
878
+ }
879
+ try {
880
+ if (processor.shutdown) {
881
+ await processor.shutdown();
882
+ }
883
+ } catch (error) {
884
+ console.warn(`Error during shutdown of post-processor "${name}":`, error);
885
+ }
886
+ postProcessors.delete(name);
887
+ }
888
+ async function clearPostProcessors() {
889
+ const entries = Array.from(postProcessors.entries());
890
+ for (const [_name, processor] of entries) {
891
+ try {
892
+ if (processor.shutdown) {
893
+ await processor.shutdown();
894
+ }
895
+ } catch (error) {
896
+ console.warn(`Error during shutdown of post-processor "${_name}":`, error);
897
+ }
898
+ }
899
+ postProcessors.clear();
900
+ }
901
+ function validateValidator(validator) {
902
+ if (validator === null || validator === void 0) {
903
+ throw new Error("Validator cannot be null or undefined");
904
+ }
905
+ const obj = validator;
906
+ if (typeof obj.name !== "function") {
907
+ throw new Error("Validator must implement name() method");
908
+ }
909
+ if (typeof obj.validate !== "function") {
910
+ throw new Error("Validator must implement validate() method");
911
+ }
912
+ const name = obj.name();
913
+ if (typeof name !== "string" || name.trim() === "") {
914
+ throw new Error("Validator name must be a non-empty string");
915
+ }
916
+ return true;
917
+ }
918
+ function registerValidator(validator) {
919
+ validateValidator(validator);
920
+ const name = validator.name();
921
+ if (validators.has(name)) {
922
+ console.warn(`Validator "${name}" already registered, overwriting with new implementation`);
923
+ }
924
+ validators.set(name, validator);
925
+ }
926
+ function getValidator(name) {
927
+ return validators.get(name);
928
+ }
929
+ function listValidators() {
930
+ return Array.from(validators.keys());
931
+ }
932
+ async function unregisterValidator(name) {
933
+ const validator = validators.get(name);
934
+ if (!validator) {
935
+ const available = Array.from(validators.keys());
936
+ const availableStr = available.length > 0 ? ` Available: ${available.join(", ")}` : "";
937
+ throw new Error(`Validator "${name}" is not registered.${availableStr}`);
938
+ }
939
+ try {
940
+ if (validator.shutdown) {
941
+ await validator.shutdown();
942
+ }
943
+ } catch (error) {
944
+ console.warn(`Error during shutdown of validator "${name}":`, error);
945
+ }
946
+ validators.delete(name);
947
+ }
948
+ async function clearValidators() {
949
+ const entries = Array.from(validators.entries());
950
+ for (const [_name, validator] of entries) {
951
+ try {
952
+ if (validator.shutdown) {
953
+ await validator.shutdown();
954
+ }
955
+ } catch (error) {
956
+ console.warn(`Error during shutdown of validator "${_name}":`, error);
957
+ }
958
+ }
959
+ validators.clear();
960
+ }
961
+ function executePostProcessor(name, result) {
962
+ const processor = postProcessors.get(name);
963
+ if (!processor) {
964
+ return Promise.reject(new Error(`Post-processor "${name}" is not registered`));
965
+ }
966
+ try {
967
+ const output = processor.process(result);
968
+ if (output instanceof Promise) {
969
+ return output;
970
+ }
971
+ return Promise.resolve(output);
972
+ } catch (error) {
973
+ return Promise.reject(new Error(`Error executing post-processor "${name}": ${String(error)}`));
974
+ }
975
+ }
976
+ function executeValidator(name, result) {
977
+ const validator = validators.get(name);
978
+ if (!validator) {
979
+ return Promise.reject(new Error(`Validator "${name}" is not registered`));
980
+ }
981
+ try {
982
+ const output = validator.validate(result);
983
+ if (output instanceof Promise) {
984
+ return output;
985
+ }
986
+ return Promise.resolve(output);
987
+ } catch (error) {
988
+ return Promise.reject(new Error(`Error executing validator "${name}": ${String(error)}`));
989
+ }
990
+ }
991
+ function setupGlobalCallbacks() {
992
+ if (typeof globalThis !== "undefined") {
993
+ const callbacksObj = globalThis;
994
+ callbacksObj.__kreuzberg_execute_post_processor = executePostProcessor;
995
+ callbacksObj.__kreuzberg_execute_validator = executeValidator;
996
+ }
997
+ }
998
+ setupGlobalCallbacks();
999
+
837
1000
  // typescript/index.ts
838
1001
  var wasm = null;
839
1002
  var initialized = false;
@@ -848,7 +1011,7 @@ async function initializePdfiumAsync(wasmModule) {
848
1011
  return;
849
1012
  }
850
1013
  try {
851
- const pdfiumModule = await import("./pdfium.js");
1014
+ const pdfiumModule = await Promise.resolve().then(() => (init_pdfium(), pdfium_exports));
852
1015
  const pdfium = typeof pdfiumModule.default === "function" ? await pdfiumModule.default() : pdfiumModule;
853
1016
  const success = wasmModule.initialize_pdfium_render(pdfium, wasmModule, false);
854
1017
  if (!success) {
@@ -1172,6 +1335,8 @@ export {
1172
1335
  batchExtractBytesSync,
1173
1336
  batchExtractFiles,
1174
1337
  clearOcrBackends,
1338
+ clearPostProcessors,
1339
+ clearValidators,
1175
1340
  configToJS,
1176
1341
  detectRuntime,
1177
1342
  enableOcr,
@@ -1182,8 +1347,10 @@ export {
1182
1347
  fileToUint8Array,
1183
1348
  getInitializationError,
1184
1349
  getOcrBackend,
1350
+ getPostProcessor,
1185
1351
  getRuntimeInfo,
1186
1352
  getRuntimeVersion,
1353
+ getValidator,
1187
1354
  getVersion,
1188
1355
  getWasmCapabilities,
1189
1356
  hasBigInt,
@@ -1205,8 +1372,14 @@ export {
1205
1372
  isWebEnvironment,
1206
1373
  jsToExtractionResult,
1207
1374
  listOcrBackends,
1375
+ listPostProcessors,
1376
+ listValidators,
1208
1377
  registerOcrBackend,
1378
+ registerPostProcessor,
1379
+ registerValidator,
1209
1380
  unregisterOcrBackend,
1381
+ unregisterPostProcessor,
1382
+ unregisterValidator,
1210
1383
  wrapWasmError
1211
1384
  };
1212
1385
  //# sourceMappingURL=index.js.map