@kreuzberg/wasm 4.0.0-rc.25 → 4.0.0-rc.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,742 @@
1
+ # WebAssembly Bindings
2
+
3
+ <div align="center" style="display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin: 20px 0;">
4
+ <!-- Language Bindings -->
5
+ <a href="https://crates.io/crates/kreuzberg">
6
+ <img src="https://img.shields.io/crates/v/kreuzberg?label=Rust&color=007ec6" alt="Rust">
7
+ </a>
8
+ <a href="https://hex.pm/packages/kreuzberg">
9
+ <img src="https://img.shields.io/hexpm/v/kreuzberg?label=Elixir&color=007ec6" alt="Elixir">
10
+ </a>
11
+ <a href="https://pypi.org/project/kreuzberg/">
12
+ <img src="https://img.shields.io/pypi/v/kreuzberg?label=Python&color=007ec6" alt="Python">
13
+ </a>
14
+ <a href="https://www.npmjs.com/package/@kreuzberg/node">
15
+ <img src="https://img.shields.io/npm/v/@kreuzberg/node?label=Node.js&color=007ec6" alt="Node.js">
16
+ </a>
17
+ <a href="https://www.npmjs.com/package/@kreuzberg/wasm">
18
+ <img src="https://img.shields.io/npm/v/@kreuzberg/wasm?label=WASM&color=007ec6" alt="WASM">
19
+ </a>
20
+
21
+ <a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg">
22
+ <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
+ </a>
24
+ <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.0.0-*" alt="Go">
26
+ </a>
27
+ <a href="https://www.nuget.org/packages/Kreuzberg/">
28
+ <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
29
+ </a>
30
+ <a href="https://packagist.org/packages/kreuzberg/kreuzberg">
31
+ <img src="https://img.shields.io/packagist/v/kreuzberg/kreuzberg?label=PHP&color=007ec6" alt="PHP">
32
+ </a>
33
+ <a href="https://rubygems.org/gems/kreuzberg">
34
+ <img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
35
+ </a>
36
+
37
+ <!-- Project Info -->
38
+
39
+ <a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
40
+ <img src="https://img.shields.io/badge/License-MIT-blue.svg" alt="License">
41
+ </a>
42
+ <a href="https://docs.kreuzberg.dev">
43
+ <img src="https://img.shields.io/badge/docs-kreuzberg.dev-blue" alt="Documentation">
44
+ </a>
45
+ </div>
46
+
47
+ <img width="1128" height="191" alt="Banner2" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
48
+
49
+ <div align="center" style="margin-top: 20px;">
50
+ <a href="https://discord.gg/pXxagNK2zN">
51
+ <img height="22" src="https://img.shields.io/badge/Discord-Join%20our%20community-7289da?logo=discord&logoColor=white" alt="Discord">
52
+ </a>
53
+ </div>
54
+
55
+ Extract text, tables, images, and metadata from 56 file formats including PDF, Office documents, and images. WebAssembly bindings for browsers, Node.js, Deno, and Cloudflare Workers with portable deployment and optional multi-threading support.
56
+
57
+ > **Version 4.0.0 Release Candidate**
58
+ > Kreuzberg v4.0.0 is in **Release Candidate** stage. Bugs and breaking changes are expected.
59
+ > This is a pre-release version. Please test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
60
+
61
+ ## Installation
62
+
63
+ ### Package Installation
64
+
65
+ Install via one of the supported package managers:
66
+
67
+ **npm:**
68
+
69
+ ```bash
70
+ npm install @kreuzberg/wasm
71
+ ```
72
+
73
+ **pnpm:**
74
+
75
+ ```bash
76
+ pnpm add @kreuzberg/wasm
77
+ ```
78
+
79
+ **yarn:**
80
+
81
+ ```bash
82
+ yarn add @kreuzberg/wasm
83
+ ```
84
+
85
+ ### Platform Support
86
+
87
+ Runs on:
88
+ - Modern browsers (Chrome, Firefox, Safari, Edge with WebAssembly support)
89
+ - Node.js 16+ (with WASM runtime)
90
+ - Deno 1.0+
91
+ - Cloudflare Workers
92
+ - Any JavaScript environment with WebAssembly support
93
+
94
+ ### System Requirements
95
+
96
+ - WebAssembly support in runtime environment
97
+ - 50 MB minimum free memory for extraction
98
+ - Optional: [Tesseract WASM](https://github.com/naptha/tesseract.js) for OCR functionality
99
+
100
+ ### Runtime Detection
101
+
102
+ Check platform capabilities before extraction:
103
+
104
+ ```typescript
105
+ import { getWasmCapabilities } from '@kreuzberg/wasm';
106
+
107
+ const caps = getWasmCapabilities();
108
+ console.log('WASM available:', caps.hasWasm);
109
+ console.log('Web Workers available:', caps.hasWorkers);
110
+ console.log('Module Workers available:', caps.hasModuleWorkers);
111
+ console.log('File API available:', caps.hasFileApi);
112
+ console.log('SharedArrayBuffer available:', caps.hasSharedArrayBuffer);
113
+ ```
114
+
115
+ ## Quick Start
116
+
117
+ ### Basic Extraction
118
+
119
+ Extract text, metadata, and structure from any supported document format:
120
+
121
+ ```ts
122
+ import { extractBytes, initWasm } from "@kreuzberg/wasm";
123
+
124
+ async function main() {
125
+ await initWasm();
126
+
127
+ const buffer = await fetch("document.pdf").then((r) => r.arrayBuffer());
128
+ const bytes = new Uint8Array(buffer);
129
+
130
+ const result = await extractBytes(bytes, "application/pdf");
131
+
132
+ console.log("Extracted content:");
133
+ console.log(result.content);
134
+ console.log("MIME type:", result.mimeType);
135
+ console.log("Metadata:", result.metadata);
136
+ }
137
+
138
+ main().catch(console.error);
139
+ ```
140
+
141
+ ### Common Use Cases
142
+
143
+ #### Extract with Custom Configuration
144
+
145
+ Most use cases benefit from configuration to control extraction behavior:
146
+
147
+ **With OCR (for scanned documents):**
148
+
149
+ ```ts
150
+ import { enableOcr, extractBytes, initWasm } from "@kreuzberg/wasm";
151
+
152
+ async function extractWithOcr() {
153
+ await initWasm();
154
+
155
+ try {
156
+ await enableOcr();
157
+ console.log("OCR enabled successfully");
158
+ } catch (error) {
159
+ console.error("Failed to enable OCR:", error);
160
+ return;
161
+ }
162
+
163
+ const bytes = new Uint8Array(await fetch("scanned-page.png").then((r) => r.arrayBuffer()));
164
+
165
+ const result = await extractBytes(bytes, "image/png", {
166
+ ocr: {
167
+ backend: "tesseract-wasm",
168
+ language: "eng",
169
+ },
170
+ });
171
+
172
+ console.log("Extracted text:");
173
+ console.log(result.content);
174
+ }
175
+
176
+ extractWithOcr().catch(console.error);
177
+ ```
178
+
179
+ #### Table Extraction
180
+
181
+ See [Table Extraction Guide](https://kreuzberg.dev/features/table-extraction/) for detailed examples.
182
+
183
+ #### Processing Multiple Files
184
+
185
+ ```ts
186
+ import { extractBytes, initWasm } from "@kreuzberg/wasm";
187
+
188
+ interface DocumentJob {
189
+ name: string;
190
+ bytes: Uint8Array;
191
+ mimeType: string;
192
+ }
193
+
194
+ async function processBatch(documents: DocumentJob[], concurrency: number = 3) {
195
+ await initWasm();
196
+
197
+ const results: Record<string, string> = {};
198
+ const queue = [...documents];
199
+
200
+ const workers = Array(concurrency)
201
+ .fill(null)
202
+ .map(async () => {
203
+ while (queue.length > 0) {
204
+ const doc = queue.shift();
205
+ if (!doc) break;
206
+
207
+ try {
208
+ const result = await extractBytes(doc.bytes, doc.mimeType);
209
+ results[doc.name] = result.content;
210
+ } catch (error) {
211
+ console.error(`Failed to process ${doc.name}:`, error);
212
+ }
213
+ }
214
+ });
215
+
216
+ await Promise.all(workers);
217
+ return results;
218
+ }
219
+ ```
220
+
221
+ #### Async Processing
222
+
223
+ For non-blocking document processing:
224
+
225
+ ```ts
226
+ import { extractBytes, initWasm, getWasmCapabilities } from "@kreuzberg/wasm";
227
+
228
+ async function extractDocuments(files: Uint8Array[], mimeTypes: string[]) {
229
+ const caps = getWasmCapabilities();
230
+ if (!caps.hasWasm) {
231
+ throw new Error("WebAssembly not supported");
232
+ }
233
+
234
+ await initWasm();
235
+
236
+ const results = await Promise.all(
237
+ files.map((bytes, index) => extractBytes(bytes, mimeTypes[index]))
238
+ );
239
+
240
+ return results.map((r) => ({
241
+ content: r.content,
242
+ pageCount: r.metadata?.pageCount,
243
+ }));
244
+ }
245
+
246
+ const fileBytes = [new Uint8Array([1, 2, 3])];
247
+ const mimes = ["application/pdf"];
248
+
249
+ extractDocuments(fileBytes, mimes)
250
+ .then((results) => console.log(results))
251
+ .catch(console.error);
252
+ ```
253
+
254
+ #### Worker Pool Usage
255
+
256
+ When Web Workers are available, use worker threads for parallel document processing without blocking the main thread:
257
+
258
+ ```typescript
259
+ import { extractBytes, initWasm, hasWorkers, hasModuleWorkers } from '@kreuzberg/wasm';
260
+
261
+ class DocumentWorkerPool {
262
+ private workers: Worker[] = [];
263
+ private taskQueue: Array<{ id: number; data: Uint8Array; mimeType: string; resolve: Function; reject: Function }> = [];
264
+ private currentTaskId = 0;
265
+
266
+ constructor(workerCount: number = navigator.hardwareConcurrency || 4) {
267
+ // Module workers allow importing ES modules, standard workers are more compatible
268
+ const useModuleWorkers = hasModuleWorkers();
269
+
270
+ for (let i = 0; i < workerCount; i++) {
271
+ const worker = useModuleWorkers
272
+ ? new Worker(new URL('./extraction-worker.ts', import.meta.url), { type: 'module' })
273
+ : new Worker(new URL('./extraction-worker.js', import.meta.url));
274
+
275
+ worker.onmessage = (event) => this.handleWorkerMessage(event.data);
276
+ worker.onerror = (error) => this.handleWorkerError(error);
277
+ this.workers.push(worker);
278
+ }
279
+ }
280
+
281
+ async extract(data: Uint8Array, mimeType: string): Promise<string> {
282
+ return new Promise((resolve, reject) => {
283
+ this.taskQueue.push({
284
+ id: this.currentTaskId++,
285
+ data,
286
+ mimeType,
287
+ resolve,
288
+ reject
289
+ });
290
+ this.processQueue();
291
+ });
292
+ }
293
+
294
+ private processQueue(): void {
295
+ while (this.taskQueue.length > 0) {
296
+ const task = this.taskQueue.shift();
297
+ if (task) {
298
+ const worker = this.workers[task.id % this.workers.length];
299
+ worker.postMessage({ id: task.id, data: task.data, mimeType: task.mimeType });
300
+ }
301
+ }
302
+ }
303
+
304
+ private handleWorkerMessage(data: { id: number; result: string }): void {
305
+ const task = this.taskQueue.find(t => t.id === data.id);
306
+ if (task) {
307
+ task.resolve(data.result);
308
+ this.processQueue();
309
+ }
310
+ }
311
+
312
+ private handleWorkerError(error: ErrorEvent): void {
313
+ console.error('Worker error:', error.message);
314
+ }
315
+
316
+ terminate(): void {
317
+ this.workers.forEach(w => w.terminate());
318
+ }
319
+ }
320
+
321
+ // Usage
322
+ async function processDocumentsInParallel() {
323
+ if (!hasWorkers()) {
324
+ console.log('Web Workers not available, falling back to main thread');
325
+ return;
326
+ }
327
+
328
+ await initWasm();
329
+ const pool = new DocumentWorkerPool(4);
330
+
331
+ const documents = [
332
+ { data: new Uint8Array([...]), mimeType: 'application/pdf' },
333
+ { data: new Uint8Array([...]), mimeType: 'application/pdf' },
334
+ ];
335
+
336
+ const results = await Promise.all(
337
+ documents.map(doc => pool.extract(doc.data, doc.mimeType))
338
+ );
339
+
340
+ pool.terminate();
341
+ return results;
342
+ }
343
+ ```
344
+
345
+ Worker code (`extraction-worker.ts`):
346
+
347
+ ```typescript
348
+ import { extractBytes, initWasm } from '@kreuzberg/wasm';
349
+
350
+ let wasmInitialized = false;
351
+
352
+ self.onmessage = async (event) => {
353
+ if (!wasmInitialized) {
354
+ await initWasm();
355
+ wasmInitialized = true;
356
+ }
357
+
358
+ const { id, data, mimeType } = event.data;
359
+ try {
360
+ const result = await extractBytes(new Uint8Array(data), mimeType);
361
+ self.postMessage({ id, result: result.content });
362
+ } catch (error) {
363
+ self.postMessage({ id, error: (error as Error).message });
364
+ }
365
+ };
366
+ ```
367
+
368
+ ### Memory Management
369
+
370
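+ Extraction results are ordinary JavaScript objects and are reclaimed by the garbage collector once they are no longer referenced. Processing documents one at a time keeps peak memory usage low: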
371
+
372
+ ```typescript
373
+ import { initWasm, extractBytes } from '@kreuzberg/wasm';
374
+
375
+ async function extractWithMemoryAwareness() {
376
+ await initWasm();
377
+
378
+ // Process documents one at a time to control memory usage
379
+ const documents = [/* ... */];
380
+
381
+ for (const doc of documents) {
382
+ const result = await extractBytes(doc, 'application/pdf');
383
+
384
+ // Process result immediately
385
+ console.log(result.content);
386
+
387
+ // Result will be garbage collected when no longer referenced
388
+ // Explicitly clear large objects if needed
389
+ // gc(); // Requires --expose-gc flag
390
+ }
391
+ }
392
+
393
+ // Check available memory (browser only)
394
+ if (performance.memory) {
395
+ console.log('Memory usage:', {
396
+ usedJSHeapSize: performance.memory.usedJSHeapSize,
397
+ totalJSHeapSize: performance.memory.totalJSHeapSize,
398
+ jsHeapSizeLimit: performance.memory.jsHeapSizeLimit
399
+ });
400
+ }
401
+ ```
402
+
403
+ ### Next Steps
404
+
405
+ - **[Installation Guide](https://kreuzberg.dev/getting-started/installation/)** - Platform-specific setup
406
+ - **[API Documentation](https://kreuzberg.dev/api/)** - Complete API reference
407
+ - **[Examples & Guides](https://kreuzberg.dev/guides/)** - Full code examples and usage guides
408
+ - **[Configuration Guide](https://kreuzberg.dev/configuration/)** - Advanced configuration options
409
+ - **[Troubleshooting](https://kreuzberg.dev/troubleshooting/)** - Common issues and solutions
410
+
411
+ ## WASM-Specific Implementation Details
412
+
413
+ ### Initialization
414
+
415
+ WASM binaries must be loaded before extraction:
416
+
417
+ ```typescript
418
+ import { initWasm } from '@kreuzberg/wasm';
419
+
420
+ // Initialize once at application startup
421
+ await initWasm();
422
+
423
+ // Now extraction functions can be used
424
+ ```
425
+
426
+ The init function:
427
+ - Downloads and instantiates the WASM binary
428
+ - Initializes the memory space (linear memory module)
429
+ - Prepares thread pools if available
430
+ - Throws if WASM is not supported in the environment
431
+
432
+ ### Threading Model
433
+
434
+ - Single-threaded by default (main thread execution)
435
+ - Web Workers optional for background processing
436
+ - Shared memory (SharedArrayBuffer) not required
437
+ - Message passing used for worker communication
438
+ - When a worker pool is used, extraction runs in workers and does not block the main thread
439
+
440
+ ### Memory Considerations
441
+
442
+ - Each WASM instance has its own linear memory, with an address space of at most 4 GB
443
+ - Large documents (> 100 MB) may not fit in WASM memory
444
+ - Binary data is copied between JavaScript and WASM boundaries
445
+ - Garbage collection is handled by the JavaScript runtime
446
+ - No manual memory management required
447
+
448
+ ### Supported Extraction Targets
449
+
450
+ Different file formats have varying support in WASM:
451
+
452
+ | Format | Support | Notes |
453
+ |--------|---------|-------|
454
+ | PDF | Full | Text, images, metadata extraction |
455
+ | Office (DOCX, XLSX, PPTX) | Full | All features supported |
456
+ | Images (PNG, JPG, etc) | Full | EXIF metadata extraction |
457
+ | Archives (ZIP, TAR) | Full | Listing and extraction |
458
+ | OCR | Limited | Tesseract WASM only, main thread only |
459
+ | Embeddings | Not Available | WASM has no ML model support |
460
+
461
+ ### Platform Limitations
462
+
463
+ **LibreOffice-Dependent Formats Not Available**
464
+
465
+ WASM cannot load native LibreOffice binaries, so formats that depend on LibreOffice for conversion are **not supported**:
466
+
467
+ - ❌ **DOC** (Microsoft Word 97-2003) - Use DOCX instead
468
+ - ❌ **XLS** (Microsoft Excel 97-2003) - Use XLSX instead
469
+ - ❌ **PPT** (Microsoft PowerPoint 97-2003) - Use PPTX instead
470
+ - ❌ **RTF** (Rich Text Format with complex features)
471
+ - ❌ **ODT/ODS/ODP** (LibreOffice/OpenOffice formats)
472
+
473
+ Modern Office formats (DOCX, XLSX, PPTX) are fully supported and don't require LibreOffice.
474
+
475
+ **Polars Integration Not Available**
476
+
477
+ - ❌ Polars DataFrame extraction/conversion not available in WASM
478
+ - ❌ Structured data operations limited compared to Node.js binding
479
+
480
+ **Alternative: Use Node.js Binding**
481
+
482
+ If you need support for older Office formats or Polars integration, use the `@kreuzberg/node` package instead:
483
+
484
+ ```bash
485
+ npm install @kreuzberg/node
486
+ ```
487
+
488
+ The Node.js binding provides:
489
+ - ✅ Full LibreOffice format support (DOC, XLS, PPT, RTF, ODT)
490
+ - ✅ Polars DataFrame integration
491
+ - ✅ All OCR backends (Tesseract, EasyOCR, PaddleOCR)
492
+ - ✅ Full embedding model support
493
+
494
+ **Format Comparison Table**
495
+
496
+ | Format Type | WASM Support | Node.js Support |
497
+ |-------------|--------------|-----------------|
498
+ | Modern Office (DOCX/XLSX/PPTX) | ✅ Full | ✅ Full |
499
+ | Legacy Office (DOC/XLS/PPT) | ❌ Not Available | ✅ Requires LibreOffice |
500
+ | OpenOffice (ODT/ODS/ODP) | ❌ Not Available | ✅ Requires LibreOffice |
501
+ | PDF | ✅ Full | ✅ Full |
502
+ | Images | ✅ Full | ✅ Full |
503
+ | Embeddings | ❌ Not Available | ✅ With ONNX Runtime |
504
+ | Polars | ❌ Not Available | ✅ Available |
505
+
506
+ ### Sandbox Security
507
+
508
+ - WASM code runs in a sandbox with restricted capabilities
509
+ - File system access requires user interaction (File API)
510
+ - Network access follows CORS restrictions
511
+ - No access to Node.js native modules
512
+ - Content Security Policy (CSP) may restrict WASM loading
513
+
514
+ ## Features
515
+
516
+ ### Supported File Formats (56+)
517
+
518
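+ Kreuzberg supports 56 file formats across 8 major categories, with intelligent format detection and comprehensive metadata extraction. Note that some formats listed in the tables below (`.xls`, `.ppt`, `.odt`, `.ods`, `.rtf`) are described under [Platform Limitations](#platform-limitations) as not available in the WASM build; check that section before relying on them.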
519
+
520
+ #### Office Documents
521
+
522
+ | Category | Formats | Capabilities |
523
+ |----------|---------|--------------|
524
+ | **Word Processing** | `.docx`, `.odt` | Full text, tables, images, metadata, styles |
525
+ | **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.ods` | Sheet data, formulas, cell metadata, charts |
526
+ | **Presentations** | `.pptx`, `.ppt`, `.ppsx` | Slides, speaker notes, images, metadata |
527
+ | **PDF** | `.pdf` | Text, tables, images, metadata, OCR support |
528
+ | **eBooks** | `.epub`, `.fb2` | Chapters, metadata, embedded resources |
529
+
530
+ #### Images (OCR-Enabled)
531
+
532
+ | Category | Formats | Features |
533
+ |----------|---------|----------|
534
+ | **Raster** | `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.bmp`, `.tiff`, `.tif` | OCR, table detection, EXIF metadata, dimensions, color space |
535
+ | **Advanced** | `.jp2`, `.jpx`, `.jpm`, `.mj2`, `.pnm`, `.pbm`, `.pgm`, `.ppm` | OCR, table detection, format-specific metadata |
536
+ | **Vector** | `.svg` | DOM parsing, embedded text, graphics metadata |
537
+
538
+ #### Web & Data
539
+
540
+ | Category | Formats | Features |
541
+ |----------|---------|----------|
542
+ | **Markup** | `.html`, `.htm`, `.xhtml`, `.xml`, `.svg` | DOM parsing, metadata (Open Graph, Twitter Card), link extraction |
543
+ | **Structured Data** | `.json`, `.yaml`, `.yml`, `.toml`, `.csv`, `.tsv` | Schema detection, nested structures, validation |
544
+ | **Text & Markdown** | `.txt`, `.md`, `.markdown`, `.rst`, `.org`, `.rtf` | CommonMark, GFM, reStructuredText, Org Mode |
545
+
546
+ #### Email & Archives
547
+
548
+ | Category | Formats | Features |
549
+ |----------|---------|----------|
550
+ | **Email** | `.eml`, `.msg` | Headers, body (HTML/plain), attachments, threading |
551
+ | **Archives** | `.zip`, `.tar`, `.tgz`, `.gz`, `.7z` | File listing, nested archives, metadata |
552
+
553
+ #### Academic & Scientific
554
+
555
+ | Category | Formats | Features |
556
+ |----------|---------|----------|
557
+ | **Citations** | `.bib`, `.biblatex`, `.ris`, `.enw`, `.csl` | Bibliography parsing, citation extraction |
558
+ | **Scientific** | `.tex`, `.latex`, `.typst`, `.jats`, `.ipynb`, `.docbook` | LaTeX, Jupyter notebooks, PubMed JATS |
559
+ | **Documentation** | `.opml`, `.pod`, `.mdoc`, `.troff` | Technical documentation formats |
560
+
561
+ **[Complete Format Reference](https://kreuzberg.dev/reference/formats/)**
562
+
563
+ ### Key Capabilities
564
+
565
+ - **Text Extraction** - Extract all text content with position and formatting information
566
+
567
+ - **Metadata Extraction** - Retrieve document properties, creation date, author, etc.
568
+
569
+ - **Table Extraction** - Parse tables with structure and cell content preservation
570
+
571
+ - **Image Extraction** - Extract embedded images and render page previews
572
+
573
+ - **OCR Support** - Integrate multiple OCR backends for scanned documents
574
+
575
+ - **Async/Await** - Non-blocking document processing with concurrent operations
576
+
577
+ - **Plugin System** - Extensible post-processing for custom text transformation
578
+
579
+ - **Batch Processing** - Efficiently process multiple documents in parallel
580
+
581
+ - **Memory Efficient** - Stream large files without loading entirely into memory
582
+
583
+ - **Language Detection** - Detect and support multiple languages in documents
584
+
585
+ - **Configuration** - Fine-grained control over extraction behavior
586
+
587
+ ### Performance Characteristics
588
+
589
+ | Format | Speed | Memory | Notes |
590
+ |--------|-------|--------|-------|
591
+ | **PDF (text)** | 10-100 MB/s | ~50MB per doc | Fastest extraction |
592
+ | **Office docs** | 20-200 MB/s | ~100MB per doc | DOCX, XLSX, PPTX |
593
+ | **Images (OCR)** | 1-5 MB/s | Variable | Depends on OCR backend |
594
+ | **Archives** | 5-50 MB/s | ~200MB per doc | ZIP, TAR, etc. |
595
+ | **Web formats** | 50-200 MB/s | Streaming | HTML, XML, JSON |
596
+
597
+ ## OCR Support
598
+
599
+ Kreuzberg can extract text from scanned documents and images using OCR. In the WASM binding, the following OCR backend is available:
600
+
601
+ - **Tesseract-Wasm**
602
+
603
+ ### OCR Configuration Example
604
+
605
+ ```ts
606
+ import { enableOcr, extractBytes, initWasm } from "@kreuzberg/wasm";
607
+
608
+ async function extractWithOcr() {
609
+ await initWasm();
610
+
611
+ try {
612
+ await enableOcr();
613
+ console.log("OCR enabled successfully");
614
+ } catch (error) {
615
+ console.error("Failed to enable OCR:", error);
616
+ return;
617
+ }
618
+
619
+ const bytes = new Uint8Array(await fetch("scanned-page.png").then((r) => r.arrayBuffer()));
620
+
621
+ const result = await extractBytes(bytes, "image/png", {
622
+ ocr: {
623
+ backend: "tesseract-wasm",
624
+ language: "eng",
625
+ },
626
+ });
627
+
628
+ console.log("Extracted text:");
629
+ console.log(result.content);
630
+ }
631
+
632
+ extractWithOcr().catch(console.error);
633
+ ```
634
+
635
+ ## Async Support
636
+
637
+ This binding provides full async/await support for non-blocking document processing:
638
+
639
+ ```ts
640
+ import { extractBytes, initWasm, getWasmCapabilities } from "@kreuzberg/wasm";
641
+
642
+ async function extractDocuments(files: Uint8Array[], mimeTypes: string[]) {
643
+ const caps = getWasmCapabilities();
644
+ if (!caps.hasWasm) {
645
+ throw new Error("WebAssembly not supported");
646
+ }
647
+
648
+ await initWasm();
649
+
650
+ const results = await Promise.all(
651
+ files.map((bytes, index) => extractBytes(bytes, mimeTypes[index]))
652
+ );
653
+
654
+ return results.map((r) => ({
655
+ content: r.content,
656
+ pageCount: r.metadata?.pageCount,
657
+ }));
658
+ }
659
+
660
+ const fileBytes = [new Uint8Array([1, 2, 3])];
661
+ const mimes = ["application/pdf"];
662
+
663
+ extractDocuments(fileBytes, mimes)
664
+ .then((results) => console.log(results))
665
+ .catch(console.error);
666
+ ```
667
+
668
+ ## Plugin System
669
+
670
+ Kreuzberg supports extensible post-processing plugins for custom text transformation and filtering.
671
+
672
+ For detailed plugin documentation, visit [Plugin System Guide](https://kreuzberg.dev/plugins/).
673
+
674
+ ## Batch Processing
675
+
676
+ Process multiple documents efficiently:
677
+
678
+ ```ts
679
+ import { extractBytes, initWasm } from "@kreuzberg/wasm";
680
+
681
+ interface DocumentJob {
682
+ name: string;
683
+ bytes: Uint8Array;
684
+ mimeType: string;
685
+ }
686
+
687
+ async function processBatch(documents: DocumentJob[], concurrency: number = 3) {
688
+ await initWasm();
689
+
690
+ const results: Record<string, string> = {};
691
+ const queue = [...documents];
692
+
693
+ const workers = Array(concurrency)
694
+ .fill(null)
695
+ .map(async () => {
696
+ while (queue.length > 0) {
697
+ const doc = queue.shift();
698
+ if (!doc) break;
699
+
700
+ try {
701
+ const result = await extractBytes(doc.bytes, doc.mimeType);
702
+ results[doc.name] = result.content;
703
+ } catch (error) {
704
+ console.error(`Failed to process ${doc.name}:`, error);
705
+ }
706
+ }
707
+ });
708
+
709
+ await Promise.all(workers);
710
+ return results;
711
+ }
712
+ ```
713
+
714
+ ## Configuration
715
+
716
+ For advanced configuration options including language detection, table extraction, OCR settings, and more:
717
+
718
+ **[Configuration Guide](https://kreuzberg.dev/configuration/)**
719
+
720
+ ## Documentation
721
+
722
+ - **[Official Documentation](https://kreuzberg.dev/)**
723
+ - **[API Reference](https://kreuzberg.dev/reference/api-wasm/)**
724
+ - **[Examples & Guides](https://kreuzberg.dev/guides/)**
725
+
726
+ ## Troubleshooting
727
+
728
+ For common issues and solutions, visit [Troubleshooting Guide](https://kreuzberg.dev/troubleshooting/).
729
+
730
+ ## Contributing
731
+
732
+ Contributions are welcome! See [Contributing Guide](https://github.com/kreuzberg-dev/kreuzberg/blob/main/CONTRIBUTING.md).
733
+
734
+ ## License
735
+
736
+ MIT License - see LICENSE file for details.
737
+
738
+ ## Support
739
+
740
+ - **Discord Community**: [Join our Discord](https://discord.gg/pXxagNK2zN)
741
+ - **GitHub Issues**: [Report bugs](https://github.com/kreuzberg-dev/kreuzberg/issues)
742
+ - **Discussions**: [Ask questions](https://github.com/kreuzberg-dev/kreuzberg/discussions)