@kreuzberg/wasm 4.3.8 → 4.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.3.8" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.0" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -57,6 +57,8 @@
57
57
 
58
58
  Extract text, tables, images, and metadata from 75+ file formats including PDF, Office documents, and images. WebAssembly bindings for browsers, Deno, and Cloudflare Workers with portable deployment and multi-threading support.
59
59
 
60
+ > **Full Feature Parity** — The WASM package supports all extraction capabilities at full parity with native bindings: PDF (via PDFium), Excel/spreadsheets (via Calamine), archives (ZIP, TAR, 7z, GZIP), and OCR (via built-in Tesseract-WASM). No external dependencies required.
61
+
60
62
 
61
63
  ## Installation
62
64
 
@@ -95,7 +97,7 @@ yarn add @kreuzberg/wasm
95
97
  ### System Requirements
96
98
 
97
99
  - Modern browser with WebAssembly support, or Deno 1.0+, or Cloudflare Workers
98
- - Optional: [Tesseract WASM](https://github.com/naptha/tesseract.js) for OCR functionality
100
+ - OCR is built in via Tesseract-WASM (enable at runtime with `enableOcr()`)
99
101
 
100
102
 
101
103
 
@@ -174,6 +176,40 @@ extractWithOcr().catch(console.error);
174
176
  See [Table Extraction Guide](https://kreuzberg.dev/features/table-extraction/) for detailed examples.
175
177
 
176
178
 
179
+ #### Excel/Spreadsheet Extraction
180
+
181
+ Extract structured data from Excel files directly in the browser or server-side runtimes:
182
+
183
+ ```ts
184
+ import { extractBytes, initWasm } from "@kreuzberg/wasm";
185
+
186
+ async function extractSpreadsheet() {
187
+ await initWasm();
188
+
189
+ const bytes = new Uint8Array(
190
+ await fetch("report.xlsx").then((r) => r.arrayBuffer()),
191
+ );
192
+
193
+ const result = await extractBytes(
194
+ bytes,
195
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
196
+ );
197
+
198
+ console.log("Spreadsheet content:");
199
+ console.log(result.content);
200
+
201
+ if (result.tables && result.tables.length > 0) {
202
+ result.tables.forEach((table, index) => {
203
+ console.log(`\nSheet ${index + 1}:`);
204
+ console.log(table.markdown);
205
+ });
206
+ }
207
+ }
208
+
209
+ extractSpreadsheet().catch(console.error);
210
+ ```
211
+
212
+
177
213
 
178
214
  #### Processing Multiple Files
179
215
 
@@ -318,14 +354,10 @@ extractDocuments(fileBytes, mimes)
318
354
  - **Metadata Extraction** - Retrieve document properties, creation date, author, etc.
319
355
  - **Table Extraction** - Parse tables with structure and cell content preservation
320
356
  - **Image Extraction** - Extract embedded images and render page previews
321
- - **OCR Support** - Integrate multiple OCR backends for scanned documents
322
-
357
+ - **OCR Support** - Built-in Tesseract-WASM for scanned documents and images
358
+ - **Full Feature Parity** - All extraction capabilities at parity with native bindings: PDF, Excel, archives, OCR, and 75+ formats
323
359
  - **Async/Await** - Non-blocking document processing with concurrent operations
324
-
325
-
326
360
  - **Plugin System** - Extensible post-processing for custom text transformation
327
-
328
-
329
361
  - **Batch Processing** - Efficiently process multiple documents in parallel
330
362
  - **Memory Efficient** - Stream large files without loading them entirely into memory
331
363
  - **Language Detection** - Detect and support multiple languages in documents
@@ -185,6 +185,31 @@ export function clear_post_processors(): void;
185
185
  */
186
186
  export function clear_validators(): void;
187
187
 
188
+ /**
189
+ * Compresses multiple entries into a 7z archive in WebAssembly environment.
190
+ *
191
+ * This function creates a compressed archive from multiple file entries,
192
+ * designed specifically for WASM targets.
193
+ *
194
+ * # Arguments
195
+ * * `entries` - Vector of JavaScript strings representing file names/paths
196
+ * * `datas` - Vector of Uint8Arrays containing the file data corresponding to entries
197
+ */
198
+ export function compress(entries: string[], datas: Uint8Array[]): Uint8Array;
199
+
200
+ /**
201
+ * Decompresses a 7z archive in WebAssembly environment.
202
+ *
203
+ * This function is specifically designed for WASM targets and uses JavaScript interop
204
+ * to handle the decompression process with a callback function.
205
+ *
206
+ * # Arguments
207
+ * * `src` - Uint8Array containing the compressed archive data
208
+ * * `pwd` - Password string for encrypted archives (use empty string for unencrypted)
209
+ * * `f` - JavaScript callback function to handle extracted entries
210
+ */
211
+ export function decompress(src: Uint8Array, pwd: string, f: Function): void;
212
+
188
213
  /**
189
214
  * Detect MIME type from raw file bytes.
190
215
  *
@@ -600,6 +625,61 @@ export function loadConfigFromString(content: string, format: string): any;
600
625
  */
601
626
  export function normalizeMimeType(mime_type: string): string;
602
627
 
628
+ /**
629
+ * Check if OCR support is available in this WASM build.
630
+ *
631
+ * Returns `true` if the `ocr-wasm` feature was enabled at build time.
632
+ */
633
+ export function ocrIsAvailable(): boolean;
634
+
635
+ /**
636
+ * Perform OCR on encoded image bytes (PNG, JPEG, BMP, GIF, TIFF).
637
+ *
638
+ * Automatically decodes the image to RGB pixels before running Tesseract.
639
+ * This is the primary function for OCR in WASM - it handles image decoding
640
+ * internally so the caller doesn't need browser APIs like `createImageBitmap`.
641
+ *
642
+ * # Arguments
643
+ *
644
+ * * `image_bytes` - Encoded image data (PNG, JPEG, BMP, GIF, TIFF)
645
+ * * `tessdata` - Raw `.traineddata` file content loaded into memory
646
+ * * `language` - Tesseract language code (e.g., "eng")
647
+ *
648
+ * # Returns
649
+ *
650
+ * The recognized text as a string.
651
+ */
652
+ export function ocrRecognize(image_bytes: Uint8Array, tessdata: Uint8Array, language: string): string;
653
+
654
+ /**
655
+ * Perform OCR on raw image pixel data using Tesseract.
656
+ *
657
+ * This function accepts pre-decoded image pixels (RGB format) along with
658
+ * tessdata loaded into memory. No filesystem access is needed.
659
+ *
660
+ * # Arguments
661
+ *
662
+ * * `image_data` - Raw pixel data, typically in RGB format (3 bytes per pixel); see `bytes_per_pixel`
663
+ * * `width` - Image width in pixels
664
+ * * `height` - Image height in pixels
665
+ * * `bytes_per_pixel` - Bytes per pixel (typically 3 for RGB, 1 for grayscale)
666
+ * * `bytes_per_line` - Bytes per scan line (typically width * bytes_per_pixel)
667
+ * * `tessdata` - Raw `.traineddata` file content loaded into memory
668
+ * * `language` - Tesseract language code (e.g., "eng")
669
+ *
670
+ * # Returns
671
+ *
672
+ * The recognized text as a string.
673
+ */
674
+ export function ocrRecognizeRaw(image_data: Uint8Array, width: number, height: number, bytes_per_pixel: number, bytes_per_line: number, tessdata: Uint8Array, language: string): string;
675
+
676
+ /**
677
+ * Get the Tesseract version string compiled into this WASM binary.
678
+ *
679
+ * Returns the version of the statically linked Tesseract library.
680
+ */
681
+ export function ocrTesseractVersion(): string;
682
+
603
683
  /**
604
684
  * A callback function that can be invoked by Pdfium's `FPDF_LoadCustomDocument()` function,
605
685
  * wrapping around `crate::utils::files::read_block_from_callback()` to shuffle data buffers
@@ -773,81 +853,3 @@ export function version(): string;
773
853
  * from Pdfium's WASM memory heap to our WASM memory heap as they are written.
774
854
  */
775
855
  export function write_block_from_callback_wasm(param: number, buf: number, size: number): number;
776
-
777
- export type InitInput = RequestInfo | URL | Response | BufferSource | WebAssembly.Module;
778
-
779
- export interface InitOutput {
780
- readonly memory: WebAssembly.Memory;
781
- readonly __wbg_moduleinfo_free: (a: number, b: number) => void;
782
- readonly batchExtractBytes: (a: number, b: number, c: number, d: number, e: number) => any;
783
- readonly batchExtractBytesSync: (a: number, b: number, c: number, d: number, e: number) => [number, number, number];
784
- readonly batchExtractFiles: (a: number, b: number, c: number) => any;
785
- readonly batchExtractFilesSync: () => [number, number, number];
786
- readonly clear_ocr_backends: () => [number, number];
787
- readonly clear_post_processors: () => [number, number];
788
- readonly clear_validators: () => [number, number];
789
- readonly detectMimeFromBytes: (a: any) => [number, number, number, number];
790
- readonly discoverConfig: () => [number, number, number];
791
- readonly extractBytes: (a: any, b: number, c: number, d: number) => any;
792
- readonly extractBytesSync: (a: any, b: number, c: number, d: number) => [number, number, number];
793
- readonly extractFile: (a: any, b: number, c: number, d: number) => any;
794
- readonly extractFileSync: () => [number, number, number];
795
- readonly getExtensionsForMime: (a: number, b: number) => [number, number, number];
796
- readonly getMimeFromExtension: (a: number, b: number) => [number, number];
797
- readonly get_module_info: () => number;
798
- readonly initThreadPool: (a: number) => any;
799
- readonly init_thread_pool_safe: (a: number) => number;
800
- readonly list_ocr_backends: () => [number, number, number];
801
- readonly list_post_processors: () => [number, number, number];
802
- readonly list_validators: () => [number, number, number];
803
- readonly loadConfigFromString: (a: number, b: number, c: number, d: number) => [number, number, number];
804
- readonly moduleinfo_name: (a: number) => [number, number];
805
- readonly moduleinfo_version: (a: number) => [number, number];
806
- readonly normalizeMimeType: (a: number, b: number) => [number, number];
807
- readonly register_ocr_backend: (a: any) => [number, number];
808
- readonly register_post_processor: (a: any) => [number, number];
809
- readonly register_validator: (a: any) => [number, number];
810
- readonly unregister_ocr_backend: (a: number, b: number) => [number, number];
811
- readonly unregister_post_processor: (a: number, b: number) => [number, number];
812
- readonly unregister_validator: (a: number, b: number) => [number, number];
813
- readonly version: () => [number, number];
814
- readonly init: () => void;
815
- readonly initialize_pdfium_render: (a: any, b: any, c: number) => number;
816
- readonly read_block_from_callback_wasm: (a: number, b: number, c: number, d: number) => number;
817
- readonly write_block_from_callback_wasm: (a: number, b: number, c: number) => number;
818
- readonly wasm_bindgen_62eaabd966a5dd3b___closure__destroy___dyn_core_dde6c4b55a98adc4___ops__function__FnMut__wasm_bindgen_62eaabd966a5dd3b___JsValue____Output_______: (a: number, b: number) => void;
819
- readonly wasm_bindgen_62eaabd966a5dd3b___closure__destroy___dyn_core_dde6c4b55a98adc4___ops__function__FnMut__wasm_bindgen_62eaabd966a5dd3b___JsValue____Output___core_dde6c4b55a98adc4___result__Result_____wasm_bindgen_62eaabd966a5dd3b___JsError___: (a: number, b: number) => void;
820
- readonly wasm_bindgen_62eaabd966a5dd3b___convert__closures_____invoke___wasm_bindgen_62eaabd966a5dd3b___JsValue__core_dde6c4b55a98adc4___result__Result_____wasm_bindgen_62eaabd966a5dd3b___JsError__: (a: number, b: number, c: any) => [number, number];
821
- readonly wasm_bindgen_62eaabd966a5dd3b___convert__closures_____invoke___js_sys_3747b0537fa588d4___Function_fn_wasm_bindgen_62eaabd966a5dd3b___JsValue_____wasm_bindgen_62eaabd966a5dd3b___sys__Undefined___js_sys_3747b0537fa588d4___Function_fn_wasm_bindgen_62eaabd966a5dd3b___JsValue_____wasm_bindgen_62eaabd966a5dd3b___sys__Undefined______: (a: number, b: number, c: any, d: any) => void;
822
- readonly wasm_bindgen_62eaabd966a5dd3b___convert__closures_____invoke___wasm_bindgen_62eaabd966a5dd3b___JsValue_____: (a: number, b: number, c: any) => void;
823
- readonly __wbindgen_externrefs: WebAssembly.Table;
824
- readonly __wbindgen_malloc: (a: number, b: number) => number;
825
- readonly __wbindgen_realloc: (a: number, b: number, c: number, d: number) => number;
826
- readonly __wbindgen_exn_store: (a: number) => void;
827
- readonly __externref_table_alloc: () => number;
828
- readonly __wbindgen_free: (a: number, b: number, c: number) => void;
829
- readonly __externref_table_dealloc: (a: number) => void;
830
- readonly __wbindgen_start: () => void;
831
- }
832
-
833
- export type SyncInitInput = BufferSource | WebAssembly.Module;
834
-
835
- /**
836
- * Instantiates the given `module`, which can either be bytes or
837
- * a precompiled `WebAssembly.Module`.
838
- *
839
- * @param {{ module: SyncInitInput }} module - Passing `SyncInitInput` directly is deprecated.
840
- *
841
- * @returns {InitOutput}
842
- */
843
- export function initSync(module: { module: SyncInitInput } | SyncInitInput): InitOutput;
844
-
845
- /**
846
- * If `module_or_path` is {RequestInfo} or {URL}, makes a request and
847
- * for everything else, calls `WebAssembly.instantiate` directly.
848
- *
849
- * @param {{ module_or_path: InitInput | Promise<InitInput> }} module_or_path - Passing `InitInput` directly is deprecated.
850
- *
851
- * @returns {Promise<InitOutput>}
852
- */
853
- export default function __wbg_init (module_or_path?: { module_or_path: InitInput | Promise<InitInput> } | InitInput | Promise<InitInput>): Promise<InitOutput>;