@kreuzberg/wasm 4.0.0-rc.29 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +182 -421
- package/dist/adapters/wasm-adapter.d.ts.map +1 -1
- package/dist/adapters/wasm-adapter.js +3 -9
- package/dist/adapters/wasm-adapter.js.map +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +7 -11
- package/dist/index.js.map +1 -1
- package/dist/pkg/README.md +182 -421
- package/dist/pkg/kreuzberg_wasm_bg.wasm +0 -0
- package/dist/pkg/package.json +1 -1
- package/package.json +119 -119
package/README.md
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# WebAssembly
|
|
1
|
+
# WebAssembly
|
|
2
2
|
|
|
3
3
|
<div align="center" style="display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin: 20px 0;">
|
|
4
4
|
<!-- Language Bindings -->
|
|
@@ -18,11 +18,11 @@
|
|
|
18
18
|
<img src="https://img.shields.io/npm/v/@kreuzberg/wasm?label=WASM&color=007ec6" alt="WASM">
|
|
19
19
|
</a>
|
|
20
20
|
|
|
21
|
-
<a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg">
|
|
21
|
+
<a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg">
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.0.0
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.0.0" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -34,9 +34,8 @@
|
|
|
34
34
|
<img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
|
|
35
35
|
</a>
|
|
36
36
|
|
|
37
|
-
<!-- Project Info -->
|
|
38
|
-
|
|
39
|
-
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
|
|
37
|
+
<!-- Project Info -->
|
|
38
|
+
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
|
|
40
39
|
<img src="https://img.shields.io/badge/License-MIT-blue.svg" alt="License">
|
|
41
40
|
</a>
|
|
42
41
|
<a href="https://docs.kreuzberg.dev">
|
|
@@ -52,65 +51,50 @@
|
|
|
52
51
|
</a>
|
|
53
52
|
</div>
|
|
54
53
|
|
|
55
|
-
Extract text, tables, images, and metadata from 56 file formats including PDF, Office documents, and images. WebAssembly bindings for browsers, Node.js, Deno, and Cloudflare Workers with portable deployment and optional multi-threading support.
|
|
56
54
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
> This is a pre-release version. Please test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
55
|
+
Extract text, tables, images, and metadata from 56 file formats including PDF, Office documents, and images. WebAssembly bindings for browsers, Deno, and Cloudflare Workers with portable deployment and multi-threading support.
|
|
56
|
+
|
|
60
57
|
|
|
61
58
|
## Installation
|
|
62
59
|
|
|
63
60
|
### Package Installation
|
|
64
61
|
|
|
62
|
+
|
|
65
63
|
Install via one of the supported package managers:
|
|
66
64
|
|
|
67
|
-
**npm:**
|
|
68
65
|
|
|
66
|
+
|
|
67
|
+
**npm:**
|
|
69
68
|
```bash
|
|
70
69
|
npm install @kreuzberg/wasm
|
|
71
70
|
```
|
|
72
71
|
|
|
73
|
-
**pnpm:**
|
|
74
72
|
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
**pnpm:**
|
|
75
76
|
```bash
|
|
76
77
|
pnpm add @kreuzberg/wasm
|
|
77
78
|
```
|
|
78
79
|
|
|
79
|
-
**yarn:**
|
|
80
80
|
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
**yarn:**
|
|
81
84
|
```bash
|
|
82
85
|
yarn add @kreuzberg/wasm
|
|
83
86
|
```
|
|
84
87
|
|
|
85
|
-
### Platform Support
|
|
86
88
|
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
- Node.js 16+ (with WASM runtime)
|
|
90
|
-
- Deno 1.0+
|
|
91
|
-
- Cloudflare Workers
|
|
92
|
-
- Any JavaScript environment with WebAssembly support
|
|
89
|
+
|
|
90
|
+
|
|
93
91
|
|
|
94
92
|
### System Requirements
|
|
95
93
|
|
|
96
|
-
- WebAssembly support
|
|
97
|
-
- 50 MB minimum free memory for extraction
|
|
94
|
+
- Modern browser with WebAssembly support, or Deno 1.0+, or Cloudflare Workers
|
|
98
95
|
- Optional: [Tesseract WASM](https://github.com/naptha/tesseract.js) for OCR functionality
|
|
99
96
|
|
|
100
|
-
### Runtime Detection
|
|
101
97
|
|
|
102
|
-
Check platform capabilities before extraction:
|
|
103
|
-
|
|
104
|
-
```typescript
|
|
105
|
-
import { getWasmCapabilities } from '@kreuzberg/wasm';
|
|
106
|
-
|
|
107
|
-
const caps = getWasmCapabilities();
|
|
108
|
-
console.log('WASM available:', caps.hasWasm);
|
|
109
|
-
console.log('Web Workers available:', caps.hasWorkers);
|
|
110
|
-
console.log('Module Workers available:', caps.hasModuleWorkers);
|
|
111
|
-
console.log('File API available:', caps.hasFileApi);
|
|
112
|
-
console.log('SharedArrayBuffer available:', caps.hasSharedArrayBuffer);
|
|
113
|
-
```
|
|
114
98
|
|
|
115
99
|
## Quick Start
|
|
116
100
|
|
|
@@ -122,283 +106,150 @@ Extract text, metadata, and structure from any supported document format:
|
|
|
122
106
|
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
|
123
107
|
|
|
124
108
|
async function main() {
|
|
125
|
-
|
|
109
|
+
await initWasm();
|
|
126
110
|
|
|
127
|
-
|
|
128
|
-
|
|
111
|
+
const buffer = await fetch("document.pdf").then((r) => r.arrayBuffer());
|
|
112
|
+
const bytes = new Uint8Array(buffer);
|
|
129
113
|
|
|
130
|
-
|
|
114
|
+
const result = await extractBytes(bytes, "application/pdf");
|
|
131
115
|
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
116
|
+
console.log("Extracted content:");
|
|
117
|
+
console.log(result.content);
|
|
118
|
+
console.log("MIME type:", result.mimeType);
|
|
119
|
+
console.log("Metadata:", result.metadata);
|
|
136
120
|
}
|
|
137
121
|
|
|
138
122
|
main().catch(console.error);
|
|
139
123
|
```
|
|
140
124
|
|
|
125
|
+
|
|
141
126
|
### Common Use Cases
|
|
142
127
|
|
|
143
128
|
#### Extract with Custom Configuration
|
|
144
129
|
|
|
145
130
|
Most use cases benefit from configuration to control extraction behavior:
|
|
146
131
|
|
|
132
|
+
|
|
147
133
|
**With OCR (for scanned documents):**
|
|
148
134
|
|
|
149
135
|
```ts
|
|
150
136
|
import { enableOcr, extractBytes, initWasm } from "@kreuzberg/wasm";
|
|
151
137
|
|
|
152
138
|
async function extractWithOcr() {
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
139
|
+
await initWasm();
|
|
140
|
+
|
|
141
|
+
try {
|
|
142
|
+
await enableOcr();
|
|
143
|
+
console.log("OCR enabled successfully");
|
|
144
|
+
} catch (error) {
|
|
145
|
+
console.error("Failed to enable OCR:", error);
|
|
146
|
+
return;
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
const bytes = new Uint8Array(await fetch("scanned-page.png").then((r) => r.arrayBuffer()));
|
|
150
|
+
|
|
151
|
+
const result = await extractBytes(bytes, "image/png", {
|
|
152
|
+
ocr: {
|
|
153
|
+
backend: "tesseract-wasm",
|
|
154
|
+
language: "eng",
|
|
155
|
+
},
|
|
156
|
+
});
|
|
157
|
+
|
|
158
|
+
console.log("Extracted text:");
|
|
159
|
+
console.log(result.content);
|
|
174
160
|
}
|
|
175
161
|
|
|
176
162
|
extractWithOcr().catch(console.error);
|
|
177
163
|
```
|
|
178
164
|
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
|
|
179
168
|
#### Table Extraction
|
|
180
169
|
|
|
170
|
+
|
|
181
171
|
See [Table Extraction Guide](https://kreuzberg.dev/features/table-extraction/) for detailed examples.
|
|
182
172
|
|
|
173
|
+
|
|
174
|
+
|
|
183
175
|
#### Processing Multiple Files
|
|
184
176
|
|
|
177
|
+
|
|
185
178
|
```ts
|
|
186
179
|
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
|
187
180
|
|
|
188
181
|
interface DocumentJob {
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
182
|
+
name: string;
|
|
183
|
+
bytes: Uint8Array;
|
|
184
|
+
mimeType: string;
|
|
192
185
|
}
|
|
193
186
|
|
|
194
|
-
async function
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
187
|
+
async function _processBatch(documents: DocumentJob[], concurrency: number = 3) {
|
|
188
|
+
await initWasm();
|
|
189
|
+
|
|
190
|
+
const results: Record<string, string> = {};
|
|
191
|
+
const queue = [...documents];
|
|
192
|
+
|
|
193
|
+
const workers = Array(concurrency)
|
|
194
|
+
.fill(null)
|
|
195
|
+
.map(async () => {
|
|
196
|
+
while (queue.length > 0) {
|
|
197
|
+
const doc = queue.shift();
|
|
198
|
+
if (!doc) break;
|
|
199
|
+
|
|
200
|
+
try {
|
|
201
|
+
const result = await extractBytes(doc.bytes, doc.mimeType);
|
|
202
|
+
results[doc.name] = result.content;
|
|
203
|
+
} catch (error) {
|
|
204
|
+
console.error(`Failed to process ${doc.name}:`, error);
|
|
205
|
+
}
|
|
206
|
+
}
|
|
207
|
+
});
|
|
208
|
+
|
|
209
|
+
await Promise.all(workers);
|
|
210
|
+
return results;
|
|
218
211
|
}
|
|
219
212
|
```
|
|
220
213
|
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
|
|
221
218
|
#### Async Processing
|
|
222
219
|
|
|
223
220
|
For non-blocking document processing:
|
|
224
221
|
|
|
225
222
|
```ts
|
|
226
|
-
import { extractBytes,
|
|
223
|
+
import { extractBytes, getWasmCapabilities, initWasm } from "@kreuzberg/wasm";
|
|
227
224
|
|
|
228
225
|
async function extractDocuments(files: Uint8Array[], mimeTypes: string[]) {
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
226
|
+
const caps = getWasmCapabilities();
|
|
227
|
+
if (!caps.hasWasm) {
|
|
228
|
+
throw new Error("WebAssembly not supported");
|
|
229
|
+
}
|
|
233
230
|
|
|
234
|
-
|
|
231
|
+
await initWasm();
|
|
235
232
|
|
|
236
|
-
|
|
237
|
-
files.map((bytes, index) => extractBytes(bytes, mimeTypes[index]))
|
|
238
|
-
);
|
|
233
|
+
const results = await Promise.all(files.map((bytes, index) => extractBytes(bytes, mimeTypes[index])));
|
|
239
234
|
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
235
|
+
return results.map((r) => ({
|
|
236
|
+
content: r.content,
|
|
237
|
+
pageCount: r.metadata?.pageCount,
|
|
238
|
+
}));
|
|
244
239
|
}
|
|
245
240
|
|
|
246
241
|
const fileBytes = [new Uint8Array([1, 2, 3])];
|
|
247
242
|
const mimes = ["application/pdf"];
|
|
248
243
|
|
|
249
244
|
extractDocuments(fileBytes, mimes)
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
```
|
|
253
|
-
|
|
254
|
-
#### Worker Pool Usage
|
|
255
|
-
|
|
256
|
-
When Web Workers are available, use worker threads for parallel document processing without blocking the main thread:
|
|
257
|
-
|
|
258
|
-
```typescript
|
|
259
|
-
import { extractBytes, initWasm, hasWorkers, hasModuleWorkers } from '@kreuzberg/wasm';
|
|
260
|
-
|
|
261
|
-
class DocumentWorkerPool {
|
|
262
|
-
private workers: Worker[] = [];
|
|
263
|
-
private taskQueue: Array<{ id: number; data: Uint8Array; mimeType: string; resolve: Function; reject: Function }> = [];
|
|
264
|
-
private currentTaskId = 0;
|
|
265
|
-
|
|
266
|
-
constructor(workerCount: number = navigator.hardwareConcurrency || 4) {
|
|
267
|
-
// Module workers allow importing ES modules, standard workers are more compatible
|
|
268
|
-
const useModuleWorkers = hasModuleWorkers();
|
|
269
|
-
|
|
270
|
-
for (let i = 0; i < workerCount; i++) {
|
|
271
|
-
const worker = useModuleWorkers
|
|
272
|
-
? new Worker(new URL('./extraction-worker.ts', import.meta.url), { type: 'module' })
|
|
273
|
-
: new Worker(new URL('./extraction-worker.js', import.meta.url));
|
|
274
|
-
|
|
275
|
-
worker.onmessage = (event) => this.handleWorkerMessage(event.data);
|
|
276
|
-
worker.onerror = (error) => this.handleWorkerError(error);
|
|
277
|
-
this.workers.push(worker);
|
|
278
|
-
}
|
|
279
|
-
}
|
|
280
|
-
|
|
281
|
-
async extract(data: Uint8Array, mimeType: string): Promise<string> {
|
|
282
|
-
return new Promise((resolve, reject) => {
|
|
283
|
-
this.taskQueue.push({
|
|
284
|
-
id: this.currentTaskId++,
|
|
285
|
-
data,
|
|
286
|
-
mimeType,
|
|
287
|
-
resolve,
|
|
288
|
-
reject
|
|
289
|
-
});
|
|
290
|
-
this.processQueue();
|
|
291
|
-
});
|
|
292
|
-
}
|
|
293
|
-
|
|
294
|
-
private processQueue(): void {
|
|
295
|
-
while (this.taskQueue.length > 0) {
|
|
296
|
-
const task = this.taskQueue.shift();
|
|
297
|
-
if (task) {
|
|
298
|
-
const worker = this.workers[task.id % this.workers.length];
|
|
299
|
-
worker.postMessage({ id: task.id, data: task.data, mimeType: task.mimeType });
|
|
300
|
-
}
|
|
301
|
-
}
|
|
302
|
-
}
|
|
303
|
-
|
|
304
|
-
private handleWorkerMessage(data: { id: number; result: string }): void {
|
|
305
|
-
const task = this.taskQueue.find(t => t.id === data.id);
|
|
306
|
-
if (task) {
|
|
307
|
-
task.resolve(data.result);
|
|
308
|
-
this.processQueue();
|
|
309
|
-
}
|
|
310
|
-
}
|
|
311
|
-
|
|
312
|
-
private handleWorkerError(error: ErrorEvent): void {
|
|
313
|
-
console.error('Worker error:', error.message);
|
|
314
|
-
}
|
|
315
|
-
|
|
316
|
-
terminate(): void {
|
|
317
|
-
this.workers.forEach(w => w.terminate());
|
|
318
|
-
}
|
|
319
|
-
}
|
|
320
|
-
|
|
321
|
-
// Usage
|
|
322
|
-
async function processDocumentsInParallel() {
|
|
323
|
-
if (!hasWorkers()) {
|
|
324
|
-
console.log('Web Workers not available, falling back to main thread');
|
|
325
|
-
return;
|
|
326
|
-
}
|
|
327
|
-
|
|
328
|
-
await initWasm();
|
|
329
|
-
const pool = new DocumentWorkerPool(4);
|
|
330
|
-
|
|
331
|
-
const documents = [
|
|
332
|
-
{ data: new Uint8Array([...]), mimeType: 'application/pdf' },
|
|
333
|
-
{ data: new Uint8Array([...]), mimeType: 'application/pdf' },
|
|
334
|
-
];
|
|
335
|
-
|
|
336
|
-
const results = await Promise.all(
|
|
337
|
-
documents.map(doc => pool.extract(doc.data, doc.mimeType))
|
|
338
|
-
);
|
|
339
|
-
|
|
340
|
-
pool.terminate();
|
|
341
|
-
return results;
|
|
342
|
-
}
|
|
343
|
-
```
|
|
344
|
-
|
|
345
|
-
Worker code (`extraction-worker.ts`):
|
|
346
|
-
|
|
347
|
-
```typescript
|
|
348
|
-
import { extractBytes, initWasm } from '@kreuzberg/wasm';
|
|
349
|
-
|
|
350
|
-
let wasmInitialized = false;
|
|
351
|
-
|
|
352
|
-
self.onmessage = async (event) => {
|
|
353
|
-
if (!wasmInitialized) {
|
|
354
|
-
await initWasm();
|
|
355
|
-
wasmInitialized = true;
|
|
356
|
-
}
|
|
357
|
-
|
|
358
|
-
const { id, data, mimeType } = event.data;
|
|
359
|
-
try {
|
|
360
|
-
const result = await extractBytes(new Uint8Array(data), mimeType);
|
|
361
|
-
self.postMessage({ id, result: result.content });
|
|
362
|
-
} catch (error) {
|
|
363
|
-
self.postMessage({ id, error: (error as Error).message });
|
|
364
|
-
}
|
|
365
|
-
};
|
|
245
|
+
.then((results) => console.log(results))
|
|
246
|
+
.catch(console.error);
|
|
366
247
|
```
|
|
367
248
|
|
|
368
|
-
### Memory Management
|
|
369
|
-
|
|
370
|
-
WASM memory is managed by the JavaScript garbage collector:
|
|
371
|
-
|
|
372
|
-
```typescript
|
|
373
|
-
import { initWasm, extractBytes } from '@kreuzberg/wasm';
|
|
374
249
|
|
|
375
|
-
async function extractWithMemoryAwareness() {
|
|
376
|
-
await initWasm();
|
|
377
250
|
|
|
378
|
-
// Process documents one at a time to control memory usage
|
|
379
|
-
const documents = [/* ... */];
|
|
380
251
|
|
|
381
|
-
for (const doc of documents) {
|
|
382
|
-
const result = await extractBytes(doc, 'application/pdf');
|
|
383
252
|
|
|
384
|
-
// Process result immediately
|
|
385
|
-
console.log(result.content);
|
|
386
|
-
|
|
387
|
-
// Result will be garbage collected when no longer referenced
|
|
388
|
-
// Explicitly clear large objects if needed
|
|
389
|
-
// gc(); // Requires --expose-gc flag
|
|
390
|
-
}
|
|
391
|
-
}
|
|
392
|
-
|
|
393
|
-
// Check available memory (browser only)
|
|
394
|
-
if (performance.memory) {
|
|
395
|
-
console.log('Memory usage:', {
|
|
396
|
-
usedJSHeapSize: performance.memory.usedJSHeapSize,
|
|
397
|
-
totalJSHeapSize: performance.memory.totalJSHeapSize,
|
|
398
|
-
jsHeapSizeLimit: performance.memory.jsHeapSizeLimit
|
|
399
|
-
});
|
|
400
|
-
}
|
|
401
|
-
```
|
|
402
253
|
|
|
403
254
|
### Next Steps
|
|
404
255
|
|
|
@@ -408,108 +259,7 @@ if (performance.memory) {
|
|
|
408
259
|
- **[Configuration Guide](https://kreuzberg.dev/configuration/)** - Advanced configuration options
|
|
409
260
|
- **[Troubleshooting](https://kreuzberg.dev/troubleshooting/)** - Common issues and solutions
|
|
410
261
|
|
|
411
|
-
## WASM-Specific Implementation Details
|
|
412
|
-
|
|
413
|
-
### Initialization
|
|
414
|
-
|
|
415
|
-
WASM binaries must be loaded before extraction:
|
|
416
262
|
|
|
417
|
-
```typescript
|
|
418
|
-
import { initWasm } from '@kreuzberg/wasm';
|
|
419
|
-
|
|
420
|
-
// Initialize once at application startup
|
|
421
|
-
await initWasm();
|
|
422
|
-
|
|
423
|
-
// Now extraction functions can be used
|
|
424
|
-
```
|
|
425
|
-
|
|
426
|
-
The init function:
|
|
427
|
-
- Downloads and instantiates the WASM binary
|
|
428
|
-
- Initializes the memory space (linear memory module)
|
|
429
|
-
- Prepares thread pools if available
|
|
430
|
-
- Throws if WASM is not supported in the environment
|
|
431
|
-
|
|
432
|
-
### Threading Model
|
|
433
|
-
|
|
434
|
-
- Single-threaded by default (main thread execution)
|
|
435
|
-
- Web Workers optional for background processing
|
|
436
|
-
- Shared memory (SharedArrayBuffer) not required
|
|
437
|
-
- Message passing used for worker communication
|
|
438
|
-
- No blocking operations on main thread with worker pool
|
|
439
|
-
|
|
440
|
-
### Memory Considerations
|
|
441
|
-
|
|
442
|
-
- Each WASM instance has its own 4GB linear memory address space
|
|
443
|
-
- Large documents (> 100 MB) may not fit in WASM memory
|
|
444
|
-
- Binary data is copied between JavaScript and WASM boundaries
|
|
445
|
-
- Garbage collection is handled by JavaScript runtime
|
|
446
|
-
- No manual memory management required
|
|
447
|
-
|
|
448
|
-
### Supported Extraction Targets
|
|
449
|
-
|
|
450
|
-
Different file formats have varying support in WASM:
|
|
451
|
-
|
|
452
|
-
| Format | Support | Notes |
|
|
453
|
-
|--------|---------|-------|
|
|
454
|
-
| PDF | Full | Text, images, metadata extraction |
|
|
455
|
-
| Office (DOCX, XLSX, PPTX) | Full | All features supported |
|
|
456
|
-
| Images (PNG, JPG, etc) | Full | EXIF metadata extraction |
|
|
457
|
-
| Archives (ZIP, TAR) | Full | Listing and extraction |
|
|
458
|
-
| OCR | Limited | Tesseract WASM only, main thread only |
|
|
459
|
-
| Embeddings | Not Available | WASM has no ML model support |
|
|
460
|
-
|
|
461
|
-
### Platform Limitations
|
|
462
|
-
|
|
463
|
-
**LibreOffice-Dependent Formats Not Available**
|
|
464
|
-
|
|
465
|
-
WASM cannot load native LibreOffice binaries, so older Office formats are **not supported**:
|
|
466
|
-
|
|
467
|
-
- ❌ **DOC** (Microsoft Word 97-2003) - Use DOCX instead
|
|
468
|
-
- ❌ **XLS** (Microsoft Excel 97-2003) - Use XLSX instead
|
|
469
|
-
- ❌ **PPT** (Microsoft PowerPoint 97-2003) - Use PPTX instead
|
|
470
|
-
- ❌ **RTF** (Rich Text Format with complex features)
|
|
471
|
-
- ❌ **ODT/ODS/ODP** (LibreOffice/OpenOffice formats)
|
|
472
|
-
|
|
473
|
-
Modern Office formats (DOCX, XLSX, PPTX) are fully supported and don't require LibreOffice.
|
|
474
|
-
|
|
475
|
-
**Polars Integration Not Available**
|
|
476
|
-
|
|
477
|
-
- ❌ Polars DataFrame extraction/conversion not available in WASM
|
|
478
|
-
- ❌ Structured data operations limited compared to Node.js binding
|
|
479
|
-
|
|
480
|
-
**Alternative: Use Node.js Binding**
|
|
481
|
-
|
|
482
|
-
If you need support for older Office formats or Polars integration, use the `@kreuzberg/node` package instead:
|
|
483
|
-
|
|
484
|
-
```bash
|
|
485
|
-
npm install @kreuzberg/node
|
|
486
|
-
```
|
|
487
|
-
|
|
488
|
-
The Node.js binding provides:
|
|
489
|
-
- ✅ Full LibreOffice format support (DOC, XLS, PPT, RTF, ODT)
|
|
490
|
-
- ✅ Polars DataFrame integration
|
|
491
|
-
- ✅ All OCR backends (Tesseract, EasyOCR, PaddleOCR)
|
|
492
|
-
- ✅ Full embedding model support
|
|
493
|
-
|
|
494
|
-
**Format Comparison Table**
|
|
495
|
-
|
|
496
|
-
| Format Type | WASM Support | Node.js Support |
|
|
497
|
-
|-------------|--------------|-----------------|
|
|
498
|
-
| Modern Office (DOCX/XLSX/PPTX) | ✅ Full | ✅ Full |
|
|
499
|
-
| Legacy Office (DOC/XLS/PPT) | ❌ Not Available | ✅ Requires LibreOffice |
|
|
500
|
-
| OpenOffice (ODT/ODS/ODP) | ❌ Not Available | ✅ Requires LibreOffice |
|
|
501
|
-
| PDF | ✅ Full | ✅ Full |
|
|
502
|
-
| Images | ✅ Full | ✅ Full |
|
|
503
|
-
| Embeddings | ❌ Not Available | ✅ With ONNX Runtime |
|
|
504
|
-
| Polars | ❌ Not Available | ✅ Available |
|
|
505
|
-
|
|
506
|
-
### Sandbox Security
|
|
507
|
-
|
|
508
|
-
- WASM code runs in a sandbox with restricted capabilities
|
|
509
|
-
- File system access requires user interaction (File API)
|
|
510
|
-
- Network access follows CORS restrictions
|
|
511
|
-
- No access to Node.js native modules
|
|
512
|
-
- Content Security Policy (CSP) may restrict WASM loading
|
|
513
263
|
|
|
514
264
|
## Features
|
|
515
265
|
|
|
@@ -563,25 +313,20 @@ The Node.js binding provides:
|
|
|
563
313
|
### Key Capabilities
|
|
564
314
|
|
|
565
315
|
- **Text Extraction** - Extract all text content with position and formatting information
|
|
566
|
-
|
|
567
316
|
- **Metadata Extraction** - Retrieve document properties, creation date, author, etc.
|
|
568
|
-
|
|
569
317
|
- **Table Extraction** - Parse tables with structure and cell content preservation
|
|
570
|
-
|
|
571
318
|
- **Image Extraction** - Extract embedded images and render page previews
|
|
572
|
-
|
|
573
319
|
- **OCR Support** - Integrate multiple OCR backends for scanned documents
|
|
574
320
|
|
|
575
321
|
- **Async/Await** - Non-blocking document processing with concurrent operations
|
|
576
322
|
|
|
323
|
+
|
|
577
324
|
- **Plugin System** - Extensible post-processing for custom text transformation
|
|
578
325
|
|
|
579
|
-
- **Batch Processing** - Efficiently process multiple documents in parallel
|
|
580
326
|
|
|
327
|
+
- **Batch Processing** - Efficiently process multiple documents in parallel
|
|
581
328
|
- **Memory Efficient** - Stream large files without loading entirely into memory
|
|
582
|
-
|
|
583
329
|
- **Language Detection** - Detect and support multiple languages in documents
|
|
584
|
-
|
|
585
330
|
- **Configuration** - Fine-grained control over extraction behavior
|
|
586
331
|
|
|
587
332
|
### Performance Characteristics
|
|
@@ -594,83 +339,96 @@ The Node.js binding provides:
|
|
|
594
339
|
| **Archives** | 5-50 MB/s | ~200MB per doc | ZIP, TAR, etc. |
|
|
595
340
|
| **Web formats** | 50-200 MB/s | Streaming | HTML, XML, JSON |
|
|
596
341
|
|
|
342
|
+
|
|
343
|
+
|
|
597
344
|
## OCR Support
|
|
598
345
|
|
|
599
346
|
Kreuzberg supports multiple OCR backends for extracting text from scanned documents and images:
|
|
600
347
|
|
|
348
|
+
|
|
601
349
|
- **Tesseract-Wasm**
|
|
602
350
|
|
|
351
|
+
|
|
603
352
|
### OCR Configuration Example
|
|
604
353
|
|
|
605
354
|
```ts
|
|
606
355
|
import { enableOcr, extractBytes, initWasm } from "@kreuzberg/wasm";
|
|
607
356
|
|
|
608
357
|
async function extractWithOcr() {
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
358
|
+
await initWasm();
|
|
359
|
+
|
|
360
|
+
try {
|
|
361
|
+
await enableOcr();
|
|
362
|
+
console.log("OCR enabled successfully");
|
|
363
|
+
} catch (error) {
|
|
364
|
+
console.error("Failed to enable OCR:", error);
|
|
365
|
+
return;
|
|
366
|
+
}
|
|
367
|
+
|
|
368
|
+
const bytes = new Uint8Array(await fetch("scanned-page.png").then((r) => r.arrayBuffer()));
|
|
369
|
+
|
|
370
|
+
const result = await extractBytes(bytes, "image/png", {
|
|
371
|
+
ocr: {
|
|
372
|
+
backend: "tesseract-wasm",
|
|
373
|
+
language: "eng",
|
|
374
|
+
},
|
|
375
|
+
});
|
|
376
|
+
|
|
377
|
+
console.log("Extracted text:");
|
|
378
|
+
console.log(result.content);
|
|
630
379
|
}
|
|
631
380
|
|
|
632
381
|
extractWithOcr().catch(console.error);
|
|
633
382
|
```
|
|
634
383
|
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
|
|
635
387
|
## Async Support
|
|
636
388
|
|
|
637
389
|
This binding provides full async/await support for non-blocking document processing:
|
|
638
390
|
|
|
639
391
|
```ts
|
|
640
|
-
import { extractBytes,
|
|
392
|
+
import { extractBytes, getWasmCapabilities, initWasm } from "@kreuzberg/wasm";
|
|
641
393
|
|
|
642
394
|
async function extractDocuments(files: Uint8Array[], mimeTypes: string[]) {
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
395
|
+
const caps = getWasmCapabilities();
|
|
396
|
+
if (!caps.hasWasm) {
|
|
397
|
+
throw new Error("WebAssembly not supported");
|
|
398
|
+
}
|
|
647
399
|
|
|
648
|
-
|
|
400
|
+
await initWasm();
|
|
649
401
|
|
|
650
|
-
|
|
651
|
-
files.map((bytes, index) => extractBytes(bytes, mimeTypes[index]))
|
|
652
|
-
);
|
|
402
|
+
const results = await Promise.all(files.map((bytes, index) => extractBytes(bytes, mimeTypes[index])));
|
|
653
403
|
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
404
|
+
return results.map((r) => ({
|
|
405
|
+
content: r.content,
|
|
406
|
+
pageCount: r.metadata?.pageCount,
|
|
407
|
+
}));
|
|
658
408
|
}
|
|
659
409
|
|
|
660
410
|
const fileBytes = [new Uint8Array([1, 2, 3])];
|
|
661
411
|
const mimes = ["application/pdf"];
|
|
662
412
|
|
|
663
413
|
extractDocuments(fileBytes, mimes)
|
|
664
|
-
|
|
665
|
-
|
|
414
|
+
.then((results) => console.log(results))
|
|
415
|
+
.catch(console.error);
|
|
666
416
|
```
|
|
667
417
|
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
|
|
668
421
|
## Plugin System
|
|
669
422
|
|
|
670
423
|
Kreuzberg supports extensible post-processing plugins for custom text transformation and filtering.
|
|
671
424
|
|
|
672
425
|
For detailed plugin documentation, visit [Plugin System Guide](https://kreuzberg.dev/plugins/).
|
|
673
426
|
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
|
|
674
432
|
## Batch Processing
|
|
675
433
|
|
|
676
434
|
Process multiple documents efficiently:
|
|
@@ -679,38 +437,41 @@ Process multiple documents efficiently:
|
|
|
679
437
|
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
|
680
438
|
|
|
681
439
|
interface DocumentJob {
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
440
|
+
name: string;
|
|
441
|
+
bytes: Uint8Array;
|
|
442
|
+
mimeType: string;
|
|
685
443
|
}
|
|
686
444
|
|
|
687
|
-
async function
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
445
|
+
async function _processBatch(documents: DocumentJob[], concurrency: number = 3) {
|
|
446
|
+
await initWasm();
|
|
447
|
+
|
|
448
|
+
const results: Record<string, string> = {};
|
|
449
|
+
const queue = [...documents];
|
|
450
|
+
|
|
451
|
+
const workers = Array(concurrency)
|
|
452
|
+
.fill(null)
|
|
453
|
+
.map(async () => {
|
|
454
|
+
while (queue.length > 0) {
|
|
455
|
+
const doc = queue.shift();
|
|
456
|
+
if (!doc) break;
|
|
457
|
+
|
|
458
|
+
try {
|
|
459
|
+
const result = await extractBytes(doc.bytes, doc.mimeType);
|
|
460
|
+
results[doc.name] = result.content;
|
|
461
|
+
} catch (error) {
|
|
462
|
+
console.error(`Failed to process ${doc.name}:`, error);
|
|
463
|
+
}
|
|
464
|
+
}
|
|
465
|
+
});
|
|
466
|
+
|
|
467
|
+
await Promise.all(workers);
|
|
468
|
+
return results;
|
|
711
469
|
}
|
|
712
470
|
```
|
|
713
471
|
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
|
|
714
475
|
## Configuration
|
|
715
476
|
|
|
716
477
|
For advanced configuration options including language detection, table extraction, OCR settings, and more:
|