@kreuzberg/wasm 4.3.8 → 4.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +40 -8
- package/dist/adapters/wasm-adapter.d.ts.map +1 -1
- package/dist/adapters/wasm-adapter.js +15 -7
- package/dist/adapters/wasm-adapter.js.map +1 -1
- package/dist/index.js +232 -45
- package/dist/index.js.map +1 -1
- package/dist/initialization/pdfium-loader.d.ts +22 -3
- package/dist/initialization/pdfium-loader.d.ts.map +1 -1
- package/dist/initialization/state.d.ts +4 -0
- package/dist/initialization/state.d.ts.map +1 -1
- package/dist/ocr/enabler.d.ts +15 -60
- package/dist/ocr/enabler.d.ts.map +1 -1
- package/dist/pdfium.esm.wasm +0 -0
- package/dist/pdfium.js +10 -73
- package/dist/pkg/README.md +40 -8
- package/dist/pkg/kreuzberg_wasm.d.ts +80 -78
- package/dist/pkg/kreuzberg_wasm.js +424 -281
- package/dist/pkg/kreuzberg_wasm_bg.js +337 -170
- package/dist/pkg/kreuzberg_wasm_bg.wasm +0 -0
- package/dist/pkg/kreuzberg_wasm_bg.wasm.d.ts +18 -11
- package/dist/types.d.ts +15 -0
- package/dist/types.d.ts.map +1 -1
- package/package.json +11 -11
- package/dist/pkg/package.json +0 -31
package/dist/pkg/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.3.8" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.0" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -57,6 +57,8 @@
|
|
|
57
57
|
|
|
58
58
|
Extract text, tables, images, and metadata from 75+ file formats including PDF, Office documents, and images. WebAssembly bindings for browsers, Deno, and Cloudflare Workers with portable deployment and multi-threading support.
|
|
59
59
|
|
|
60
|
+
> **Full Feature Parity** — The WASM package supports all extraction capabilities at full parity with native bindings: PDF (via PDFium), Excel/spreadsheets (via Calamine), archives (ZIP, TAR, 7z, GZIP), and OCR (via built-in Tesseract-WASM). No external dependencies required.
|
|
61
|
+
|
|
60
62
|
|
|
61
63
|
## Installation
|
|
62
64
|
|
|
@@ -95,7 +97,7 @@ yarn add @kreuzberg/wasm
|
|
|
95
97
|
### System Requirements
|
|
96
98
|
|
|
97
99
|
- Modern browser with WebAssembly support, or Deno 1.0+, or Cloudflare Workers
|
|
98
|
-
-
|
|
100
|
+
- OCR is built-in via Tesseract-WASM (enable at runtime with `enableOcr()`)
|
|
99
101
|
|
|
100
102
|
|
|
101
103
|
|
|
@@ -174,6 +176,40 @@ extractWithOcr().catch(console.error);
|
|
|
174
176
|
See [Table Extraction Guide](https://kreuzberg.dev/features/table-extraction/) for detailed examples.
|
|
175
177
|
|
|
176
178
|
|
|
179
|
+
#### Excel/Spreadsheet Extraction
|
|
180
|
+
|
|
181
|
+
Extract structured data from Excel files directly in the browser or server-side runtimes:
|
|
182
|
+
|
|
183
|
+
```ts
|
|
184
|
+
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
|
185
|
+
|
|
186
|
+
async function extractSpreadsheet() {
|
|
187
|
+
await initWasm();
|
|
188
|
+
|
|
189
|
+
const bytes = new Uint8Array(
|
|
190
|
+
await fetch("report.xlsx").then((r) => r.arrayBuffer()),
|
|
191
|
+
);
|
|
192
|
+
|
|
193
|
+
const result = await extractBytes(
|
|
194
|
+
bytes,
|
|
195
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
196
|
+
);
|
|
197
|
+
|
|
198
|
+
console.log("Spreadsheet content:");
|
|
199
|
+
console.log(result.content);
|
|
200
|
+
|
|
201
|
+
if (result.tables && result.tables.length > 0) {
|
|
202
|
+
result.tables.forEach((table, index) => {
|
|
203
|
+
console.log(`\nSheet ${index + 1}:`);
|
|
204
|
+
console.log(table.markdown);
|
|
205
|
+
});
|
|
206
|
+
}
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
extractSpreadsheet().catch(console.error);
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
|
|
177
213
|
|
|
178
214
|
#### Processing Multiple Files
|
|
179
215
|
|
|
@@ -318,14 +354,10 @@ extractDocuments(fileBytes, mimes)
|
|
|
318
354
|
- **Metadata Extraction** - Retrieve document properties, creation date, author, etc.
|
|
319
355
|
- **Table Extraction** - Parse tables with structure and cell content preservation
|
|
320
356
|
- **Image Extraction** - Extract embedded images and render page previews
|
|
321
|
-
- **OCR Support** -
|
|
322
|
-
|
|
357
|
+
- **OCR Support** - Built-in Tesseract-WASM for scanned documents and images
|
|
358
|
+
- **Full Feature Parity** - All extraction capabilities at parity with native bindings: PDF, Excel, archives, OCR, and 75+ formats
|
|
323
359
|
- **Async/Await** - Non-blocking document processing with concurrent operations
|
|
324
|
-
|
|
325
|
-
|
|
326
360
|
- **Plugin System** - Extensible post-processing for custom text transformation
|
|
327
|
-
|
|
328
|
-
|
|
329
361
|
- **Batch Processing** - Efficiently process multiple documents in parallel
|
|
330
362
|
- **Memory Efficient** - Stream large files without loading entirely into memory
|
|
331
363
|
- **Language Detection** - Detect and support multiple languages in documents
|
|
@@ -185,6 +185,31 @@ export function clear_post_processors(): void;
|
|
|
185
185
|
*/
|
|
186
186
|
export function clear_validators(): void;
|
|
187
187
|
|
|
188
|
+
/**
|
|
189
|
+
* Compresses multiple entries into a 7z archive in WebAssembly environment.
|
|
190
|
+
*
|
|
191
|
+
* This function creates a compressed archive from multiple file entries,
|
|
192
|
+
* designed specifically for WASM targets.
|
|
193
|
+
*
|
|
194
|
+
* # Arguments
|
|
195
|
+
* * `entries` - Vector of JavaScript strings representing file names/paths
|
|
196
|
+
* * `datas` - Vector of Uint8Arrays containing the file data corresponding to entries
|
|
197
|
+
*/
|
|
198
|
+
export function compress(entries: string[], datas: Uint8Array[]): Uint8Array;
|
|
199
|
+
|
|
200
|
+
/**
|
|
201
|
+
* Decompresses a 7z archive in WebAssembly environment.
|
|
202
|
+
*
|
|
203
|
+
* This function is specifically designed for WASM targets and uses JavaScript interop
|
|
204
|
+
* to handle the decompression process with a callback function.
|
|
205
|
+
*
|
|
206
|
+
* # Arguments
|
|
207
|
+
* * `src` - Uint8Array containing the compressed archive data
|
|
208
|
+
* * `pwd` - Password string for encrypted archives (use empty string for unencrypted)
|
|
209
|
+
* * `f` - JavaScript callback function to handle extracted entries
|
|
210
|
+
*/
|
|
211
|
+
export function decompress(src: Uint8Array, pwd: string, f: Function): void;
|
|
212
|
+
|
|
188
213
|
/**
|
|
189
214
|
* Detect MIME type from raw file bytes.
|
|
190
215
|
*
|
|
@@ -600,6 +625,61 @@ export function loadConfigFromString(content: string, format: string): any;
|
|
|
600
625
|
*/
|
|
601
626
|
export function normalizeMimeType(mime_type: string): string;
|
|
602
627
|
|
|
628
|
+
/**
|
|
629
|
+
* Check if OCR support is available in this WASM build.
|
|
630
|
+
*
|
|
631
|
+
* Returns `true` if the `ocr-wasm` feature was enabled at build time.
|
|
632
|
+
*/
|
|
633
|
+
export function ocrIsAvailable(): boolean;
|
|
634
|
+
|
|
635
|
+
/**
|
|
636
|
+
* Perform OCR on encoded image bytes (PNG, JPEG, BMP, GIF, TIFF).
|
|
637
|
+
*
|
|
638
|
+
* Automatically decodes the image to RGB pixels before running Tesseract.
|
|
639
|
+
* This is the primary function for OCR in WASM - it handles image decoding
|
|
640
|
+
* internally so the caller doesn't need browser APIs like `createImageBitmap`.
|
|
641
|
+
*
|
|
642
|
+
* # Arguments
|
|
643
|
+
*
|
|
644
|
+
* * `image_bytes` - Encoded image data (PNG, JPEG, BMP, GIF, TIFF)
|
|
645
|
+
* * `tessdata` - Raw `.traineddata` file content loaded into memory
|
|
646
|
+
* * `language` - Tesseract language code (e.g., "eng")
|
|
647
|
+
*
|
|
648
|
+
* # Returns
|
|
649
|
+
*
|
|
650
|
+
* The recognized text as a string.
|
|
651
|
+
*/
|
|
652
|
+
export function ocrRecognize(image_bytes: Uint8Array, tessdata: Uint8Array, language: string): string;
|
|
653
|
+
|
|
654
|
+
/**
|
|
655
|
+
* Perform OCR on raw image pixel data using Tesseract.
|
|
656
|
+
*
|
|
657
|
+
* This function accepts pre-decoded image pixels (RGB format) along with
|
|
658
|
+
* tessdata loaded into memory. No filesystem access is needed.
|
|
659
|
+
*
|
|
660
|
+
* # Arguments
|
|
661
|
+
*
|
|
662
|
+
* * `image_data` - Raw pixel data in RGB format (3 bytes per pixel)
|
|
663
|
+
* * `width` - Image width in pixels
|
|
664
|
+
* * `height` - Image height in pixels
|
|
665
|
+
* * `bytes_per_pixel` - Bytes per pixel (typically 3 for RGB, 1 for grayscale)
|
|
666
|
+
* * `bytes_per_line` - Bytes per scan line (typically width * bytes_per_pixel)
|
|
667
|
+
* * `tessdata` - Raw `.traineddata` file content loaded into memory
|
|
668
|
+
* * `language` - Tesseract language code (e.g., "eng")
|
|
669
|
+
*
|
|
670
|
+
* # Returns
|
|
671
|
+
*
|
|
672
|
+
* The recognized text as a string.
|
|
673
|
+
*/
|
|
674
|
+
export function ocrRecognizeRaw(image_data: Uint8Array, width: number, height: number, bytes_per_pixel: number, bytes_per_line: number, tessdata: Uint8Array, language: string): string;
|
|
675
|
+
|
|
676
|
+
/**
|
|
677
|
+
* Get the Tesseract version string compiled into this WASM binary.
|
|
678
|
+
*
|
|
679
|
+
* Returns the version of the statically linked Tesseract library.
|
|
680
|
+
*/
|
|
681
|
+
export function ocrTesseractVersion(): string;
|
|
682
|
+
|
|
603
683
|
/**
|
|
604
684
|
* A callback function that can be invoked by Pdfium's `FPDF_LoadCustomDocument()` function,
|
|
605
685
|
* wrapping around `crate::utils::files::read_block_from_callback()` to shuffle data buffers
|
|
@@ -773,81 +853,3 @@ export function version(): string;
|
|
|
773
853
|
* from Pdfium's WASM memory heap to our WASM memory heap as they are written.
|
|
774
854
|
*/
|
|
775
855
|
export function write_block_from_callback_wasm(param: number, buf: number, size: number): number;
|
|
776
|
-
|
|
777
|
-
export type InitInput = RequestInfo | URL | Response | BufferSource | WebAssembly.Module;
|
|
778
|
-
|
|
779
|
-
export interface InitOutput {
|
|
780
|
-
readonly memory: WebAssembly.Memory;
|
|
781
|
-
readonly __wbg_moduleinfo_free: (a: number, b: number) => void;
|
|
782
|
-
readonly batchExtractBytes: (a: number, b: number, c: number, d: number, e: number) => any;
|
|
783
|
-
readonly batchExtractBytesSync: (a: number, b: number, c: number, d: number, e: number) => [number, number, number];
|
|
784
|
-
readonly batchExtractFiles: (a: number, b: number, c: number) => any;
|
|
785
|
-
readonly batchExtractFilesSync: () => [number, number, number];
|
|
786
|
-
readonly clear_ocr_backends: () => [number, number];
|
|
787
|
-
readonly clear_post_processors: () => [number, number];
|
|
788
|
-
readonly clear_validators: () => [number, number];
|
|
789
|
-
readonly detectMimeFromBytes: (a: any) => [number, number, number, number];
|
|
790
|
-
readonly discoverConfig: () => [number, number, number];
|
|
791
|
-
readonly extractBytes: (a: any, b: number, c: number, d: number) => any;
|
|
792
|
-
readonly extractBytesSync: (a: any, b: number, c: number, d: number) => [number, number, number];
|
|
793
|
-
readonly extractFile: (a: any, b: number, c: number, d: number) => any;
|
|
794
|
-
readonly extractFileSync: () => [number, number, number];
|
|
795
|
-
readonly getExtensionsForMime: (a: number, b: number) => [number, number, number];
|
|
796
|
-
readonly getMimeFromExtension: (a: number, b: number) => [number, number];
|
|
797
|
-
readonly get_module_info: () => number;
|
|
798
|
-
readonly initThreadPool: (a: number) => any;
|
|
799
|
-
readonly init_thread_pool_safe: (a: number) => number;
|
|
800
|
-
readonly list_ocr_backends: () => [number, number, number];
|
|
801
|
-
readonly list_post_processors: () => [number, number, number];
|
|
802
|
-
readonly list_validators: () => [number, number, number];
|
|
803
|
-
readonly loadConfigFromString: (a: number, b: number, c: number, d: number) => [number, number, number];
|
|
804
|
-
readonly moduleinfo_name: (a: number) => [number, number];
|
|
805
|
-
readonly moduleinfo_version: (a: number) => [number, number];
|
|
806
|
-
readonly normalizeMimeType: (a: number, b: number) => [number, number];
|
|
807
|
-
readonly register_ocr_backend: (a: any) => [number, number];
|
|
808
|
-
readonly register_post_processor: (a: any) => [number, number];
|
|
809
|
-
readonly register_validator: (a: any) => [number, number];
|
|
810
|
-
readonly unregister_ocr_backend: (a: number, b: number) => [number, number];
|
|
811
|
-
readonly unregister_post_processor: (a: number, b: number) => [number, number];
|
|
812
|
-
readonly unregister_validator: (a: number, b: number) => [number, number];
|
|
813
|
-
readonly version: () => [number, number];
|
|
814
|
-
readonly init: () => void;
|
|
815
|
-
readonly initialize_pdfium_render: (a: any, b: any, c: number) => number;
|
|
816
|
-
readonly read_block_from_callback_wasm: (a: number, b: number, c: number, d: number) => number;
|
|
817
|
-
readonly write_block_from_callback_wasm: (a: number, b: number, c: number) => number;
|
|
818
|
-
readonly wasm_bindgen_62eaabd966a5dd3b___closure__destroy___dyn_core_dde6c4b55a98adc4___ops__function__FnMut__wasm_bindgen_62eaabd966a5dd3b___JsValue____Output_______: (a: number, b: number) => void;
|
|
819
|
-
readonly wasm_bindgen_62eaabd966a5dd3b___closure__destroy___dyn_core_dde6c4b55a98adc4___ops__function__FnMut__wasm_bindgen_62eaabd966a5dd3b___JsValue____Output___core_dde6c4b55a98adc4___result__Result_____wasm_bindgen_62eaabd966a5dd3b___JsError___: (a: number, b: number) => void;
|
|
820
|
-
readonly wasm_bindgen_62eaabd966a5dd3b___convert__closures_____invoke___wasm_bindgen_62eaabd966a5dd3b___JsValue__core_dde6c4b55a98adc4___result__Result_____wasm_bindgen_62eaabd966a5dd3b___JsError__: (a: number, b: number, c: any) => [number, number];
|
|
821
|
-
readonly wasm_bindgen_62eaabd966a5dd3b___convert__closures_____invoke___js_sys_3747b0537fa588d4___Function_fn_wasm_bindgen_62eaabd966a5dd3b___JsValue_____wasm_bindgen_62eaabd966a5dd3b___sys__Undefined___js_sys_3747b0537fa588d4___Function_fn_wasm_bindgen_62eaabd966a5dd3b___JsValue_____wasm_bindgen_62eaabd966a5dd3b___sys__Undefined______: (a: number, b: number, c: any, d: any) => void;
|
|
822
|
-
readonly wasm_bindgen_62eaabd966a5dd3b___convert__closures_____invoke___wasm_bindgen_62eaabd966a5dd3b___JsValue_____: (a: number, b: number, c: any) => void;
|
|
823
|
-
readonly __wbindgen_externrefs: WebAssembly.Table;
|
|
824
|
-
readonly __wbindgen_malloc: (a: number, b: number) => number;
|
|
825
|
-
readonly __wbindgen_realloc: (a: number, b: number, c: number, d: number) => number;
|
|
826
|
-
readonly __wbindgen_exn_store: (a: number) => void;
|
|
827
|
-
readonly __externref_table_alloc: () => number;
|
|
828
|
-
readonly __wbindgen_free: (a: number, b: number, c: number) => void;
|
|
829
|
-
readonly __externref_table_dealloc: (a: number) => void;
|
|
830
|
-
readonly __wbindgen_start: () => void;
|
|
831
|
-
}
|
|
832
|
-
|
|
833
|
-
export type SyncInitInput = BufferSource | WebAssembly.Module;
|
|
834
|
-
|
|
835
|
-
/**
|
|
836
|
-
* Instantiates the given `module`, which can either be bytes or
|
|
837
|
-
* a precompiled `WebAssembly.Module`.
|
|
838
|
-
*
|
|
839
|
-
* @param {{ module: SyncInitInput }} module - Passing `SyncInitInput` directly is deprecated.
|
|
840
|
-
*
|
|
841
|
-
* @returns {InitOutput}
|
|
842
|
-
*/
|
|
843
|
-
export function initSync(module: { module: SyncInitInput } | SyncInitInput): InitOutput;
|
|
844
|
-
|
|
845
|
-
/**
|
|
846
|
-
* If `module_or_path` is {RequestInfo} or {URL}, makes a request and
|
|
847
|
-
* for everything else, calls `WebAssembly.instantiate` directly.
|
|
848
|
-
*
|
|
849
|
-
* @param {{ module_or_path: InitInput | Promise<InitInput> }} module_or_path - Passing `InitInput` directly is deprecated.
|
|
850
|
-
*
|
|
851
|
-
* @returns {Promise<InitOutput>}
|
|
852
|
-
*/
|
|
853
|
-
export default function __wbg_init (module_or_path?: { module_or_path: InitInput | Promise<InitInput> } | InitInput | Promise<InitInput>): Promise<InitOutput>;
|