@kreuzberg/wasm 4.0.0-rc.25 → 4.0.0-rc.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/pkg/LICENSE +7 -0
- package/dist/pkg/README.md +742 -0
- package/dist/pkg/kreuzberg_wasm.d.ts +770 -0
- package/dist/pkg/kreuzberg_wasm.js +1904 -0
- package/dist/pkg/kreuzberg_wasm_bg.js +1871 -0
- package/dist/pkg/kreuzberg_wasm_bg.wasm +0 -0
- package/dist/pkg/kreuzberg_wasm_bg.wasm.d.ts +51 -0
- package/dist/pkg/package.json +27 -0
- package/package.json +1 -1
|
@@ -0,0 +1,742 @@
|
|
|
1
|
+
# WebAssembly Bindings
|
|
2
|
+
|
|
3
|
+
<div align="center" style="display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin: 20px 0;">
|
|
4
|
+
<!-- Language Bindings -->
|
|
5
|
+
<a href="https://crates.io/crates/kreuzberg">
|
|
6
|
+
<img src="https://img.shields.io/crates/v/kreuzberg?label=Rust&color=007ec6" alt="Rust">
|
|
7
|
+
</a>
|
|
8
|
+
<a href="https://hex.pm/packages/kreuzberg">
|
|
9
|
+
<img src="https://img.shields.io/hexpm/v/kreuzberg?label=Elixir&color=007ec6" alt="Elixir">
|
|
10
|
+
</a>
|
|
11
|
+
<a href="https://pypi.org/project/kreuzberg/">
|
|
12
|
+
<img src="https://img.shields.io/pypi/v/kreuzberg?label=Python&color=007ec6" alt="Python">
|
|
13
|
+
</a>
|
|
14
|
+
<a href="https://www.npmjs.com/package/@kreuzberg/node">
|
|
15
|
+
<img src="https://img.shields.io/npm/v/@kreuzberg/node?label=Node.js&color=007ec6" alt="Node.js">
|
|
16
|
+
</a>
|
|
17
|
+
<a href="https://www.npmjs.com/package/@kreuzberg/wasm">
|
|
18
|
+
<img src="https://img.shields.io/npm/v/@kreuzberg/wasm?label=WASM&color=007ec6" alt="WASM">
|
|
19
|
+
</a>
|
|
20
|
+
|
|
21
|
+
<a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg">
|
|
22
|
+
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
|
+
</a>
|
|
24
|
+
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.0.0-*" alt="Go">
|
|
26
|
+
</a>
|
|
27
|
+
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
|
+
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
29
|
+
</a>
|
|
30
|
+
<a href="https://packagist.org/packages/kreuzberg/kreuzberg">
|
|
31
|
+
<img src="https://img.shields.io/packagist/v/kreuzberg/kreuzberg?label=PHP&color=007ec6" alt="PHP">
|
|
32
|
+
</a>
|
|
33
|
+
<a href="https://rubygems.org/gems/kreuzberg">
|
|
34
|
+
<img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
|
|
35
|
+
</a>
|
|
36
|
+
|
|
37
|
+
<!-- Project Info -->
|
|
38
|
+
|
|
39
|
+
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
|
|
40
|
+
<img src="https://img.shields.io/badge/License-MIT-blue.svg" alt="License">
|
|
41
|
+
</a>
|
|
42
|
+
<a href="https://docs.kreuzberg.dev">
|
|
43
|
+
<img src="https://img.shields.io/badge/docs-kreuzberg.dev-blue" alt="Documentation">
|
|
44
|
+
</a>
|
|
45
|
+
</div>
|
|
46
|
+
|
|
47
|
+
<img width="1128" height="191" alt="Banner2" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
|
|
48
|
+
|
|
49
|
+
<div align="center" style="margin-top: 20px;">
|
|
50
|
+
<a href="https://discord.gg/pXxagNK2zN">
|
|
51
|
+
<img height="22" src="https://img.shields.io/badge/Discord-Join%20our%20community-7289da?logo=discord&logoColor=white" alt="Discord">
|
|
52
|
+
</a>
|
|
53
|
+
</div>
|
|
54
|
+
|
|
55
|
+
Extract text, tables, images, and metadata from 56 file formats including PDF, Office documents, and images. WebAssembly bindings for browsers, Node.js, Deno, and Cloudflare Workers with portable deployment and optional multi-threading support.
|
|
56
|
+
|
|
57
|
+
> **Version 4.0.0 Release Candidate**
|
|
58
|
+
> Kreuzberg v4.0.0 is in **Release Candidate** stage. Bugs and breaking changes are expected.
|
|
59
|
+
> This is a pre-release version. Please test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
60
|
+
|
|
61
|
+
## Installation
|
|
62
|
+
|
|
63
|
+
### Package Installation
|
|
64
|
+
|
|
65
|
+
Install via one of the supported package managers:
|
|
66
|
+
|
|
67
|
+
**npm:**
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
npm install @kreuzberg/wasm
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
**pnpm:**
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
pnpm add @kreuzberg/wasm
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
**yarn:**
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
yarn add @kreuzberg/wasm
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Platform Support
|
|
86
|
+
|
|
87
|
+
Runs on:
|
|
88
|
+
- Modern browsers (Chrome, Firefox, Safari, Edge with WebAssembly support)
|
|
89
|
+
- Node.js 16+ (with WASM runtime)
|
|
90
|
+
- Deno 1.0+
|
|
91
|
+
- Cloudflare Workers
|
|
92
|
+
- Any JavaScript environment with WebAssembly support
|
|
93
|
+
|
|
94
|
+
### System Requirements
|
|
95
|
+
|
|
96
|
+
- WebAssembly support in runtime environment
|
|
97
|
+
- 50 MB minimum free memory for extraction
|
|
98
|
+
- Optional: [Tesseract WASM](https://github.com/naptha/tesseract.js) for OCR functionality
|
|
99
|
+
|
|
100
|
+
### Runtime Detection
|
|
101
|
+
|
|
102
|
+
Check platform capabilities before extraction:
|
|
103
|
+
|
|
104
|
+
```typescript
|
|
105
|
+
import { getWasmCapabilities } from '@kreuzberg/wasm';
|
|
106
|
+
|
|
107
|
+
const caps = getWasmCapabilities();
|
|
108
|
+
console.log('WASM available:', caps.hasWasm);
|
|
109
|
+
console.log('Web Workers available:', caps.hasWorkers);
|
|
110
|
+
console.log('Module Workers available:', caps.hasModuleWorkers);
|
|
111
|
+
console.log('File API available:', caps.hasFileApi);
|
|
112
|
+
console.log('SharedArrayBuffer available:', caps.hasSharedArrayBuffer);
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
## Quick Start
|
|
116
|
+
|
|
117
|
+
### Basic Extraction
|
|
118
|
+
|
|
119
|
+
Extract text, metadata, and structure from any supported document format:
|
|
120
|
+
|
|
121
|
+
```ts
|
|
122
|
+
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
|
123
|
+
|
|
124
|
+
async function main() {
|
|
125
|
+
await initWasm();
|
|
126
|
+
|
|
127
|
+
const buffer = await fetch("document.pdf").then((r) => r.arrayBuffer());
|
|
128
|
+
const bytes = new Uint8Array(buffer);
|
|
129
|
+
|
|
130
|
+
const result = await extractBytes(bytes, "application/pdf");
|
|
131
|
+
|
|
132
|
+
console.log("Extracted content:");
|
|
133
|
+
console.log(result.content);
|
|
134
|
+
console.log("MIME type:", result.mimeType);
|
|
135
|
+
console.log("Metadata:", result.metadata);
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
main().catch(console.error);
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
### Common Use Cases
|
|
142
|
+
|
|
143
|
+
#### Extract with Custom Configuration
|
|
144
|
+
|
|
145
|
+
Most use cases benefit from configuration to control extraction behavior:
|
|
146
|
+
|
|
147
|
+
**With OCR (for scanned documents):**
|
|
148
|
+
|
|
149
|
+
```ts
|
|
150
|
+
import { enableOcr, extractBytes, initWasm } from "@kreuzberg/wasm";
|
|
151
|
+
|
|
152
|
+
async function extractWithOcr() {
|
|
153
|
+
await initWasm();
|
|
154
|
+
|
|
155
|
+
try {
|
|
156
|
+
await enableOcr();
|
|
157
|
+
console.log("OCR enabled successfully");
|
|
158
|
+
} catch (error) {
|
|
159
|
+
console.error("Failed to enable OCR:", error);
|
|
160
|
+
return;
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
const bytes = new Uint8Array(await fetch("scanned-page.png").then((r) => r.arrayBuffer()));
|
|
164
|
+
|
|
165
|
+
const result = await extractBytes(bytes, "image/png", {
|
|
166
|
+
ocr: {
|
|
167
|
+
backend: "tesseract-wasm",
|
|
168
|
+
language: "eng",
|
|
169
|
+
},
|
|
170
|
+
});
|
|
171
|
+
|
|
172
|
+
console.log("Extracted text:");
|
|
173
|
+
console.log(result.content);
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
extractWithOcr().catch(console.error);
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
#### Table Extraction
|
|
180
|
+
|
|
181
|
+
See the [Table Extraction Guide](https://kreuzberg.dev/features/table-extraction/) for detailed examples.
|
|
182
|
+
|
|
183
|
+
#### Processing Multiple Files
|
|
184
|
+
|
|
185
|
+
```ts
|
|
186
|
+
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
|
187
|
+
|
|
188
|
+
interface DocumentJob {
|
|
189
|
+
name: string;
|
|
190
|
+
bytes: Uint8Array;
|
|
191
|
+
mimeType: string;
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
async function processBatch(documents: DocumentJob[], concurrency: number = 3) {
|
|
195
|
+
await initWasm();
|
|
196
|
+
|
|
197
|
+
const results: Record<string, string> = {};
|
|
198
|
+
const queue = [...documents];
|
|
199
|
+
|
|
200
|
+
const workers = Array(concurrency)
|
|
201
|
+
.fill(null)
|
|
202
|
+
.map(async () => {
|
|
203
|
+
while (queue.length > 0) {
|
|
204
|
+
const doc = queue.shift();
|
|
205
|
+
if (!doc) break;
|
|
206
|
+
|
|
207
|
+
try {
|
|
208
|
+
const result = await extractBytes(doc.bytes, doc.mimeType);
|
|
209
|
+
results[doc.name] = result.content;
|
|
210
|
+
} catch (error) {
|
|
211
|
+
console.error(`Failed to process ${doc.name}:`, error);
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
});
|
|
215
|
+
|
|
216
|
+
await Promise.all(workers);
|
|
217
|
+
return results;
|
|
218
|
+
}
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
#### Async Processing
|
|
222
|
+
|
|
223
|
+
For non-blocking document processing:
|
|
224
|
+
|
|
225
|
+
```ts
|
|
226
|
+
import { extractBytes, initWasm, getWasmCapabilities } from "@kreuzberg/wasm";
|
|
227
|
+
|
|
228
|
+
async function extractDocuments(files: Uint8Array[], mimeTypes: string[]) {
|
|
229
|
+
const caps = getWasmCapabilities();
|
|
230
|
+
if (!caps.hasWasm) {
|
|
231
|
+
throw new Error("WebAssembly not supported");
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
await initWasm();
|
|
235
|
+
|
|
236
|
+
const results = await Promise.all(
|
|
237
|
+
files.map((bytes, index) => extractBytes(bytes, mimeTypes[index]))
|
|
238
|
+
);
|
|
239
|
+
|
|
240
|
+
return results.map((r) => ({
|
|
241
|
+
content: r.content,
|
|
242
|
+
pageCount: r.metadata?.pageCount,
|
|
243
|
+
}));
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
const fileBytes = [new Uint8Array([1, 2, 3])];
|
|
247
|
+
const mimes = ["application/pdf"];
|
|
248
|
+
|
|
249
|
+
extractDocuments(fileBytes, mimes)
|
|
250
|
+
.then((results) => console.log(results))
|
|
251
|
+
.catch(console.error);
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
#### Worker Pool Usage
|
|
255
|
+
|
|
256
|
+
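When Web Workers are available, use worker threads for parallel document processing without blocking the main thread. The pool below is a simplified sketch: as written, `processQueue` removes each task from `taskQueue` before dispatching it, so `handleWorkerMessage` cannot find the task to resolve its promise, and the `error` field posted by the worker is not handled. In real code, keep pending tasks in a `Map` keyed by task id and reject the promise when the worker reports an error: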
|
|
257
|
+
|
|
258
|
+
```typescript
|
|
259
|
+
import { extractBytes, initWasm, hasWorkers, hasModuleWorkers } from '@kreuzberg/wasm';
|
|
260
|
+
|
|
261
|
+
class DocumentWorkerPool {
|
|
262
|
+
private workers: Worker[] = [];
|
|
263
|
+
private taskQueue: Array<{ id: number; data: Uint8Array; mimeType: string; resolve: Function; reject: Function }> = [];
|
|
264
|
+
private currentTaskId = 0;
|
|
265
|
+
|
|
266
|
+
constructor(workerCount: number = navigator.hardwareConcurrency || 4) {
|
|
267
|
+
// Module workers allow importing ES modules, standard workers are more compatible
|
|
268
|
+
const useModuleWorkers = hasModuleWorkers();
|
|
269
|
+
|
|
270
|
+
for (let i = 0; i < workerCount; i++) {
|
|
271
|
+
const worker = useModuleWorkers
|
|
272
|
+
? new Worker(new URL('./extraction-worker.ts', import.meta.url), { type: 'module' })
|
|
273
|
+
: new Worker(new URL('./extraction-worker.js', import.meta.url));
|
|
274
|
+
|
|
275
|
+
worker.onmessage = (event) => this.handleWorkerMessage(event.data);
|
|
276
|
+
worker.onerror = (error) => this.handleWorkerError(error);
|
|
277
|
+
this.workers.push(worker);
|
|
278
|
+
}
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
async extract(data: Uint8Array, mimeType: string): Promise<string> {
|
|
282
|
+
return new Promise((resolve, reject) => {
|
|
283
|
+
this.taskQueue.push({
|
|
284
|
+
id: this.currentTaskId++,
|
|
285
|
+
data,
|
|
286
|
+
mimeType,
|
|
287
|
+
resolve,
|
|
288
|
+
reject
|
|
289
|
+
});
|
|
290
|
+
this.processQueue();
|
|
291
|
+
});
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
private processQueue(): void {
|
|
295
|
+
while (this.taskQueue.length > 0) {
|
|
296
|
+
const task = this.taskQueue.shift();
|
|
297
|
+
if (task) {
|
|
298
|
+
const worker = this.workers[task.id % this.workers.length];
|
|
299
|
+
worker.postMessage({ id: task.id, data: task.data, mimeType: task.mimeType });
|
|
300
|
+
}
|
|
301
|
+
}
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
private handleWorkerMessage(data: { id: number; result: string }): void {
|
|
305
|
+
const task = this.taskQueue.find(t => t.id === data.id);
|
|
306
|
+
if (task) {
|
|
307
|
+
task.resolve(data.result);
|
|
308
|
+
this.processQueue();
|
|
309
|
+
}
|
|
310
|
+
}
|
|
311
|
+
|
|
312
|
+
private handleWorkerError(error: ErrorEvent): void {
|
|
313
|
+
console.error('Worker error:', error.message);
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
terminate(): void {
|
|
317
|
+
this.workers.forEach(w => w.terminate());
|
|
318
|
+
}
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
// Usage
|
|
322
|
+
async function processDocumentsInParallel() {
|
|
323
|
+
if (!hasWorkers()) {
|
|
324
|
+
console.log('Web Workers not available, falling back to main thread');
|
|
325
|
+
return;
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
await initWasm();
|
|
329
|
+
const pool = new DocumentWorkerPool(4);
|
|
330
|
+
|
|
331
|
+
const documents = [
|
|
332
|
+
{ data: new Uint8Array([...]), mimeType: 'application/pdf' },
|
|
333
|
+
{ data: new Uint8Array([...]), mimeType: 'application/pdf' },
|
|
334
|
+
];
|
|
335
|
+
|
|
336
|
+
const results = await Promise.all(
|
|
337
|
+
documents.map(doc => pool.extract(doc.data, doc.mimeType))
|
|
338
|
+
);
|
|
339
|
+
|
|
340
|
+
pool.terminate();
|
|
341
|
+
return results;
|
|
342
|
+
}
|
|
343
|
+
```
|
|
344
|
+
|
|
345
|
+
Worker code (`extraction-worker.ts`):
|
|
346
|
+
|
|
347
|
+
```typescript
|
|
348
|
+
import { extractBytes, initWasm } from '@kreuzberg/wasm';
|
|
349
|
+
|
|
350
|
+
let wasmInitialized = false;
|
|
351
|
+
|
|
352
|
+
self.onmessage = async (event) => {
|
|
353
|
+
if (!wasmInitialized) {
|
|
354
|
+
await initWasm();
|
|
355
|
+
wasmInitialized = true;
|
|
356
|
+
}
|
|
357
|
+
|
|
358
|
+
const { id, data, mimeType } = event.data;
|
|
359
|
+
try {
|
|
360
|
+
const result = await extractBytes(new Uint8Array(data), mimeType);
|
|
361
|
+
self.postMessage({ id, result: result.content });
|
|
362
|
+
} catch (error) {
|
|
363
|
+
self.postMessage({ id, error: (error as Error).message });
|
|
364
|
+
}
|
|
365
|
+
};
|
|
366
|
+
```
|
|
367
|
+
|
|
368
|
+
### Memory Management
|
|
369
|
+
|
|
370
|
+
WASM memory is managed by the JavaScript garbage collector:
|
|
371
|
+
|
|
372
|
+
```typescript
|
|
373
|
+
import { initWasm, extractBytes } from '@kreuzberg/wasm';
|
|
374
|
+
|
|
375
|
+
async function extractWithMemoryAwareness() {
|
|
376
|
+
await initWasm();
|
|
377
|
+
|
|
378
|
+
// Process documents one at a time to control memory usage
|
|
379
|
+
const documents = [/* ... */];
|
|
380
|
+
|
|
381
|
+
for (const doc of documents) {
|
|
382
|
+
const result = await extractBytes(doc, 'application/pdf');
|
|
383
|
+
|
|
384
|
+
// Process result immediately
|
|
385
|
+
console.log(result.content);
|
|
386
|
+
|
|
387
|
+
// Result will be garbage collected when no longer referenced
|
|
388
|
+
// Explicitly clear large objects if needed
|
|
389
|
+
// gc(); // Requires --expose-gc flag
|
|
390
|
+
}
|
|
391
|
+
}
|
|
392
|
+
|
|
393
|
+
// Check available memory (browser only)
|
|
394
|
+
if (performance.memory) {
|
|
395
|
+
console.log('Memory usage:', {
|
|
396
|
+
usedJSHeapSize: performance.memory.usedJSHeapSize,
|
|
397
|
+
totalJSHeapSize: performance.memory.totalJSHeapSize,
|
|
398
|
+
jsHeapSizeLimit: performance.memory.jsHeapSizeLimit
|
|
399
|
+
});
|
|
400
|
+
}
|
|
401
|
+
```
|
|
402
|
+
|
|
403
|
+
### Next Steps
|
|
404
|
+
|
|
405
|
+
- **[Installation Guide](https://kreuzberg.dev/getting-started/installation/)** - Platform-specific setup
|
|
406
|
+
- **[API Documentation](https://kreuzberg.dev/api/)** - Complete API reference
|
|
407
|
+
- **[Examples & Guides](https://kreuzberg.dev/guides/)** - Full code examples and usage guides
|
|
408
|
+
- **[Configuration Guide](https://kreuzberg.dev/configuration/)** - Advanced configuration options
|
|
409
|
+
- **[Troubleshooting](https://kreuzberg.dev/troubleshooting/)** - Common issues and solutions
|
|
410
|
+
|
|
411
|
+
## WASM-Specific Implementation Details
|
|
412
|
+
|
|
413
|
+
### Initialization
|
|
414
|
+
|
|
415
|
+
WASM binaries must be loaded before extraction:
|
|
416
|
+
|
|
417
|
+
```typescript
|
|
418
|
+
import { initWasm } from '@kreuzberg/wasm';
|
|
419
|
+
|
|
420
|
+
// Initialize once at application startup
|
|
421
|
+
await initWasm();
|
|
422
|
+
|
|
423
|
+
// Now extraction functions can be used
|
|
424
|
+
```
|
|
425
|
+
|
|
426
|
+
The init function:
|
|
427
|
+
- Downloads and instantiates the WASM binary
|
|
428
|
+
- Initializes the memory space (linear memory module)
|
|
429
|
+
- Prepares thread pools if available
|
|
430
|
+
- Throws if WASM is not supported in the environment
|
|
431
|
+
|
|
432
|
+
### Threading Model
|
|
433
|
+
|
|
434
|
+
- Single-threaded by default (main thread execution)
|
|
435
|
+
- Web Workers optional for background processing
|
|
436
|
+
- Shared memory (SharedArrayBuffer) not required
|
|
437
|
+
- Message passing used for worker communication
|
|
438
|
+
- No blocking operations on main thread with worker pool
|
|
439
|
+
|
|
440
|
+
### Memory Considerations
|
|
441
|
+
|
|
442
|
+
- Each WASM instance has its own linear memory, with an address space of at most 4 GB
|
|
443
|
+
- Large documents (> 100 MB) may not fit in WASM memory
|
|
444
|
+
- Binary data is copied between JavaScript and WASM boundaries
|
|
445
|
+
- Garbage collection is handled by JavaScript runtime
|
|
446
|
+
- No manual memory management required
|
|
447
|
+
|
|
448
|
+
### Supported Extraction Targets
|
|
449
|
+
|
|
450
|
+
Different file formats have varying support in WASM:
|
|
451
|
+
|
|
452
|
+
| Format | Support | Notes |
|
|
453
|
+
|--------|---------|-------|
|
|
454
|
+
| PDF | Full | Text, images, metadata extraction |
|
|
455
|
+
| Office (DOCX, XLSX, PPTX) | Full | All features supported |
|
|
456
|
+
| Images (PNG, JPG, etc) | Full | EXIF metadata extraction |
|
|
457
|
+
| Archives (ZIP, TAR) | Full | Listing and extraction |
|
|
458
|
+
| OCR | Limited | Tesseract WASM only, main thread only |
|
|
459
|
+
| Embeddings | Not Available | WASM has no ML model support |
|
|
460
|
+
|
|
461
|
+
### Platform Limitations
|
|
462
|
+
|
|
463
|
+
**LibreOffice-Dependent Formats Not Available**
|
|
464
|
+
|
|
465
|
+
WASM cannot load native LibreOffice binaries, so older Office formats are **not supported**:
|
|
466
|
+
|
|
467
|
+
- ❌ **DOC** (Microsoft Word 97-2003) - Use DOCX instead
|
|
468
|
+
- ❌ **XLS** (Microsoft Excel 97-2003) - Use XLSX instead
|
|
469
|
+
- ❌ **PPT** (Microsoft PowerPoint 97-2003) - Use PPTX instead
|
|
470
|
+
- ❌ **RTF** (Rich Text Format with complex features)
|
|
471
|
+
- ❌ **ODT/ODS/ODP** (LibreOffice/OpenOffice formats)
|
|
472
|
+
|
|
473
|
+
Modern Office formats (DOCX, XLSX, PPTX) are fully supported and don't require LibreOffice.
|
|
474
|
+
|
|
475
|
+
**Polars Integration Not Available**
|
|
476
|
+
|
|
477
|
+
- ❌ Polars DataFrame extraction/conversion not available in WASM
|
|
478
|
+
- ❌ Structured data operations limited compared to Node.js binding
|
|
479
|
+
|
|
480
|
+
**Alternative: Use Node.js Binding**
|
|
481
|
+
|
|
482
|
+
If you need support for older Office formats or Polars integration, use the `@kreuzberg/node` package instead:
|
|
483
|
+
|
|
484
|
+
```bash
|
|
485
|
+
npm install @kreuzberg/node
|
|
486
|
+
```
|
|
487
|
+
|
|
488
|
+
The Node.js binding provides:
|
|
489
|
+
- ✅ Full LibreOffice format support (DOC, XLS, PPT, RTF, ODT)
|
|
490
|
+
- ✅ Polars DataFrame integration
|
|
491
|
+
- ✅ All OCR backends (Tesseract, EasyOCR, PaddleOCR)
|
|
492
|
+
- ✅ Full embedding model support
|
|
493
|
+
|
|
494
|
+
**Format Comparison Table**
|
|
495
|
+
|
|
496
|
+
| Format Type | WASM Support | Node.js Support |
|
|
497
|
+
|-------------|--------------|-----------------|
|
|
498
|
+
| Modern Office (DOCX/XLSX/PPTX) | ✅ Full | ✅ Full |
|
|
499
|
+
| Legacy Office (DOC/XLS/PPT) | ❌ Not Available | ✅ Requires LibreOffice |
|
|
500
|
+
| OpenOffice (ODT/ODS/ODP) | ❌ Not Available | ✅ Requires LibreOffice |
|
|
501
|
+
| PDF | ✅ Full | ✅ Full |
|
|
502
|
+
| Images | ✅ Full | ✅ Full |
|
|
503
|
+
| Embeddings | ❌ Not Available | ✅ With ONNX Runtime |
|
|
504
|
+
| Polars | ❌ Not Available | ✅ Available |
|
|
505
|
+
|
|
506
|
+
### Sandbox Security
|
|
507
|
+
|
|
508
|
+
- WASM code runs in a sandbox with restricted capabilities
|
|
509
|
+
- File system access requires user interaction (File API)
|
|
510
|
+
- Network access follows CORS restrictions
|
|
511
|
+
- No access to Node.js native modules
|
|
512
|
+
- Content Security Policy (CSP) may restrict WASM loading
|
|
513
|
+
|
|
514
|
+
## Features
|
|
515
|
+
|
|
516
|
+
### Supported File Formats (56+)
|
|
517
|
+
|
|
518
|
+
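56 file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction. The tables below list the formats supported by Kreuzberg as a whole; formats that depend on LibreOffice (see "Platform Limitations" above) are not available in the WASM build.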
|
|
519
|
+
|
|
520
|
+
#### Office Documents
|
|
521
|
+
|
|
522
|
+
| Category | Formats | Capabilities |
|
|
523
|
+
|----------|---------|--------------|
|
|
524
|
+
| **Word Processing** | `.docx`, `.odt` | Full text, tables, images, metadata, styles |
|
|
525
|
+
| **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.ods` | Sheet data, formulas, cell metadata, charts |
|
|
526
|
+
| **Presentations** | `.pptx`, `.ppt`, `.ppsx` | Slides, speaker notes, images, metadata |
|
|
527
|
+
| **PDF** | `.pdf` | Text, tables, images, metadata, OCR support |
|
|
528
|
+
| **eBooks** | `.epub`, `.fb2` | Chapters, metadata, embedded resources |
|
|
529
|
+
|
|
530
|
+
#### Images (OCR-Enabled)
|
|
531
|
+
|
|
532
|
+
| Category | Formats | Features |
|
|
533
|
+
|----------|---------|----------|
|
|
534
|
+
| **Raster** | `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.bmp`, `.tiff`, `.tif` | OCR, table detection, EXIF metadata, dimensions, color space |
|
|
535
|
+
| **Advanced** | `.jp2`, `.jpx`, `.jpm`, `.mj2`, `.pnm`, `.pbm`, `.pgm`, `.ppm` | OCR, table detection, format-specific metadata |
|
|
536
|
+
| **Vector** | `.svg` | DOM parsing, embedded text, graphics metadata |
|
|
537
|
+
|
|
538
|
+
#### Web & Data
|
|
539
|
+
|
|
540
|
+
| Category | Formats | Features |
|
|
541
|
+
|----------|---------|----------|
|
|
542
|
+
| **Markup** | `.html`, `.htm`, `.xhtml`, `.xml`, `.svg` | DOM parsing, metadata (Open Graph, Twitter Card), link extraction |
|
|
543
|
+
| **Structured Data** | `.json`, `.yaml`, `.yml`, `.toml`, `.csv`, `.tsv` | Schema detection, nested structures, validation |
|
|
544
|
+
| **Text & Markdown** | `.txt`, `.md`, `.markdown`, `.rst`, `.org`, `.rtf` | CommonMark, GFM, reStructuredText, Org Mode |
|
|
545
|
+
|
|
546
|
+
#### Email & Archives
|
|
547
|
+
|
|
548
|
+
| Category | Formats | Features |
|
|
549
|
+
|----------|---------|----------|
|
|
550
|
+
| **Email** | `.eml`, `.msg` | Headers, body (HTML/plain), attachments, threading |
|
|
551
|
+
| **Archives** | `.zip`, `.tar`, `.tgz`, `.gz`, `.7z` | File listing, nested archives, metadata |
|
|
552
|
+
|
|
553
|
+
#### Academic & Scientific
|
|
554
|
+
|
|
555
|
+
| Category | Formats | Features |
|
|
556
|
+
|----------|---------|----------|
|
|
557
|
+
| **Citations** | `.bib`, `.biblatex`, `.ris`, `.enw`, `.csl` | Bibliography parsing, citation extraction |
|
|
558
|
+
| **Scientific** | `.tex`, `.latex`, `.typst`, `.jats`, `.ipynb`, `.docbook` | LaTeX, Jupyter notebooks, PubMed JATS |
|
|
559
|
+
| **Documentation** | `.opml`, `.pod`, `.mdoc`, `.troff` | Technical documentation formats |
|
|
560
|
+
|
|
561
|
+
**[Complete Format Reference](https://kreuzberg.dev/reference/formats/)**
|
|
562
|
+
|
|
563
|
+
### Key Capabilities
|
|
564
|
+
|
|
565
|
+
- **Text Extraction** - Extract all text content with position and formatting information
|
|
566
|
+
|
|
567
|
+
- **Metadata Extraction** - Retrieve document properties, creation date, author, etc.
|
|
568
|
+
|
|
569
|
+
- **Table Extraction** - Parse tables with structure and cell content preservation
|
|
570
|
+
|
|
571
|
+
- **Image Extraction** - Extract embedded images and render page previews
|
|
572
|
+
|
|
573
|
+
- **OCR Support** - Integrate multiple OCR backends for scanned documents
|
|
574
|
+
|
|
575
|
+
- **Async/Await** - Non-blocking document processing with concurrent operations
|
|
576
|
+
|
|
577
|
+
- **Plugin System** - Extensible post-processing for custom text transformation
|
|
578
|
+
|
|
579
|
+
- **Batch Processing** - Efficiently process multiple documents in parallel
|
|
580
|
+
|
|
581
|
+
- **Memory Efficient** - Stream large files without loading entirely into memory
|
|
582
|
+
|
|
583
|
+
- **Language Detection** - Detect and support multiple languages in documents
|
|
584
|
+
|
|
585
|
+
- **Configuration** - Fine-grained control over extraction behavior
|
|
586
|
+
|
|
587
|
+
### Performance Characteristics
|
|
588
|
+
|
|
589
|
+
| Format | Speed | Memory | Notes |
|
|
590
|
+
|--------|-------|--------|-------|
|
|
591
|
+
| **PDF (text)** | 10-100 MB/s | ~50MB per doc | Fastest extraction |
|
|
592
|
+
| **Office docs** | 20-200 MB/s | ~100MB per doc | DOCX, XLSX, PPTX |
|
|
593
|
+
| **Images (OCR)** | 1-5 MB/s | Variable | Depends on OCR backend |
|
|
594
|
+
| **Archives** | 5-50 MB/s | ~200MB per doc | ZIP, TAR, etc. |
|
|
595
|
+
| **Web formats** | 50-200 MB/s | Streaming | HTML, XML, JSON |
|
|
596
|
+
|
|
597
|
+
## OCR Support
|
|
598
|
+
|
|
599
|
+
Kreuzberg supports multiple OCR backends for extracting text from scanned documents and images. The WASM binding provides the following backend:
|
|
600
|
+
|
|
601
|
+
- **Tesseract-Wasm**
|
|
602
|
+
|
|
603
|
+
### OCR Configuration Example
|
|
604
|
+
|
|
605
|
+
```ts
|
|
606
|
+
import { enableOcr, extractBytes, initWasm } from "@kreuzberg/wasm";
|
|
607
|
+
|
|
608
|
+
async function extractWithOcr() {
|
|
609
|
+
await initWasm();
|
|
610
|
+
|
|
611
|
+
try {
|
|
612
|
+
await enableOcr();
|
|
613
|
+
console.log("OCR enabled successfully");
|
|
614
|
+
} catch (error) {
|
|
615
|
+
console.error("Failed to enable OCR:", error);
|
|
616
|
+
return;
|
|
617
|
+
}
|
|
618
|
+
|
|
619
|
+
const bytes = new Uint8Array(await fetch("scanned-page.png").then((r) => r.arrayBuffer()));
|
|
620
|
+
|
|
621
|
+
const result = await extractBytes(bytes, "image/png", {
|
|
622
|
+
ocr: {
|
|
623
|
+
backend: "tesseract-wasm",
|
|
624
|
+
language: "eng",
|
|
625
|
+
},
|
|
626
|
+
});
|
|
627
|
+
|
|
628
|
+
console.log("Extracted text:");
|
|
629
|
+
console.log(result.content);
|
|
630
|
+
}
|
|
631
|
+
|
|
632
|
+
extractWithOcr().catch(console.error);
|
|
633
|
+
```
|
|
634
|
+
|
|
635
|
+
## Async Support
|
|
636
|
+
|
|
637
|
+
This binding provides full async/await support for non-blocking document processing:
|
|
638
|
+
|
|
639
|
+
```ts
|
|
640
|
+
import { extractBytes, initWasm, getWasmCapabilities } from "@kreuzberg/wasm";
|
|
641
|
+
|
|
642
|
+
async function extractDocuments(files: Uint8Array[], mimeTypes: string[]) {
|
|
643
|
+
const caps = getWasmCapabilities();
|
|
644
|
+
if (!caps.hasWasm) {
|
|
645
|
+
throw new Error("WebAssembly not supported");
|
|
646
|
+
}
|
|
647
|
+
|
|
648
|
+
await initWasm();
|
|
649
|
+
|
|
650
|
+
const results = await Promise.all(
|
|
651
|
+
files.map((bytes, index) => extractBytes(bytes, mimeTypes[index]))
|
|
652
|
+
);
|
|
653
|
+
|
|
654
|
+
return results.map((r) => ({
|
|
655
|
+
content: r.content,
|
|
656
|
+
pageCount: r.metadata?.pageCount,
|
|
657
|
+
}));
|
|
658
|
+
}
|
|
659
|
+
|
|
660
|
+
const fileBytes = [new Uint8Array([1, 2, 3])];
|
|
661
|
+
const mimes = ["application/pdf"];
|
|
662
|
+
|
|
663
|
+
extractDocuments(fileBytes, mimes)
|
|
664
|
+
.then((results) => console.log(results))
|
|
665
|
+
.catch(console.error);
|
|
666
|
+
```
|
|
667
|
+
|
|
668
|
+
## Plugin System
|
|
669
|
+
|
|
670
|
+
Kreuzberg supports extensible post-processing plugins for custom text transformation and filtering.
|
|
671
|
+
|
|
672
|
+
For detailed plugin documentation, visit the [Plugin System Guide](https://kreuzberg.dev/plugins/).
|
|
673
|
+
|
|
674
|
+
## Batch Processing
|
|
675
|
+
|
|
676
|
+
Process multiple documents efficiently:
|
|
677
|
+
|
|
678
|
+
```ts
|
|
679
|
+
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
|
680
|
+
|
|
681
|
+
interface DocumentJob {
|
|
682
|
+
name: string;
|
|
683
|
+
bytes: Uint8Array;
|
|
684
|
+
mimeType: string;
|
|
685
|
+
}
|
|
686
|
+
|
|
687
|
+
async function processBatch(documents: DocumentJob[], concurrency: number = 3) {
|
|
688
|
+
await initWasm();
|
|
689
|
+
|
|
690
|
+
const results: Record<string, string> = {};
|
|
691
|
+
const queue = [...documents];
|
|
692
|
+
|
|
693
|
+
const workers = Array(concurrency)
|
|
694
|
+
.fill(null)
|
|
695
|
+
.map(async () => {
|
|
696
|
+
while (queue.length > 0) {
|
|
697
|
+
const doc = queue.shift();
|
|
698
|
+
if (!doc) break;
|
|
699
|
+
|
|
700
|
+
try {
|
|
701
|
+
const result = await extractBytes(doc.bytes, doc.mimeType);
|
|
702
|
+
results[doc.name] = result.content;
|
|
703
|
+
} catch (error) {
|
|
704
|
+
console.error(`Failed to process ${doc.name}:`, error);
|
|
705
|
+
}
|
|
706
|
+
}
|
|
707
|
+
});
|
|
708
|
+
|
|
709
|
+
await Promise.all(workers);
|
|
710
|
+
return results;
|
|
711
|
+
}
|
|
712
|
+
```
|
|
713
|
+
|
|
714
|
+
## Configuration
|
|
715
|
+
|
|
716
|
+
For advanced configuration options including language detection, table extraction, OCR settings, and more:
|
|
717
|
+
|
|
718
|
+
**[Configuration Guide](https://kreuzberg.dev/configuration/)**
|
|
719
|
+
|
|
720
|
+
## Documentation
|
|
721
|
+
|
|
722
|
+
- **[Official Documentation](https://kreuzberg.dev/)**
|
|
723
|
+
- **[API Reference](https://kreuzberg.dev/reference/api-wasm/)**
|
|
724
|
+
- **[Examples & Guides](https://kreuzberg.dev/guides/)**
|
|
725
|
+
|
|
726
|
+
## Troubleshooting
|
|
727
|
+
|
|
728
|
+
For common issues and solutions, visit the [Troubleshooting Guide](https://kreuzberg.dev/troubleshooting/).
|
|
729
|
+
|
|
730
|
+
## Contributing
|
|
731
|
+
|
|
732
|
+
Contributions are welcome! See [Contributing Guide](https://github.com/kreuzberg-dev/kreuzberg/blob/main/CONTRIBUTING.md).
|
|
733
|
+
|
|
734
|
+
## License
|
|
735
|
+
|
|
736
|
+
MIT License - see LICENSE file for details.
|
|
737
|
+
|
|
738
|
+
## Support
|
|
739
|
+
|
|
740
|
+
- **Discord Community**: [Join our Discord](https://discord.gg/pXxagNK2zN)
|
|
741
|
+
- **GitHub Issues**: [Report bugs](https://github.com/kreuzberg-dev/kreuzberg/issues)
|
|
742
|
+
- **Discussions**: [Ask questions](https://github.com/kreuzberg-dev/kreuzberg/discussions)
|