@kreuzberg/wasm 4.0.0-rc.29 → 4.0.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # WebAssembly Bindings
1
+ # WebAssembly
2
2
 
3
3
  <div align="center" style="display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin: 20px 0;">
4
4
  <!-- Language Bindings -->
@@ -18,11 +18,11 @@
18
18
  <img src="https://img.shields.io/npm/v/@kreuzberg/wasm?label=WASM&color=007ec6" alt="WASM">
19
19
  </a>
20
20
 
21
- <a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg">
21
+ <a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg">
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.0.0-*" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.0.0" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -34,9 +34,8 @@
34
34
  <img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
35
35
  </a>
36
36
 
37
- <!-- Project Info -->
38
-
39
- <a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
37
+ <!-- Project Info -->
38
+ <a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
40
39
  <img src="https://img.shields.io/badge/License-MIT-blue.svg" alt="License">
41
40
  </a>
42
41
  <a href="https://docs.kreuzberg.dev">
@@ -52,65 +51,50 @@
52
51
  </a>
53
52
  </div>
54
53
 
55
- Extract text, tables, images, and metadata from 56 file formats including PDF, Office documents, and images. WebAssembly bindings for browsers, Node.js, Deno, and Cloudflare Workers with portable deployment and optional multi-threading support.
56
54
 
57
- > **Version 4.0.0 Release Candidate**
58
- > Kreuzberg v4.0.0 is in **Release Candidate** stage. Bugs and breaking changes are expected.
59
- > This is a pre-release version. Please test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
55
+ Extract text, tables, images, and metadata from 56 file formats including PDF, Office documents, and images. WebAssembly bindings for browsers, Deno, and Cloudflare Workers with portable deployment and multi-threading support.
56
+
60
57
 
61
58
  ## Installation
62
59
 
63
60
  ### Package Installation
64
61
 
62
+
65
63
  Install via one of the supported package managers:
66
64
 
67
- **npm:**
68
65
 
66
+
67
+ **npm:**
69
68
  ```bash
70
69
  npm install @kreuzberg/wasm
71
70
  ```
72
71
 
73
- **pnpm:**
74
72
 
73
+
74
+
75
+ **pnpm:**
75
76
  ```bash
76
77
  pnpm add @kreuzberg/wasm
77
78
  ```
78
79
 
79
- **yarn:**
80
80
 
81
+
82
+
83
+ **yarn:**
81
84
  ```bash
82
85
  yarn add @kreuzberg/wasm
83
86
  ```
84
87
 
85
- ### Platform Support
86
88
 
87
- Runs on:
88
- - Modern browsers (Chrome, Firefox, Safari, Edge with WebAssembly support)
89
- - Node.js 16+ (with WASM runtime)
90
- - Deno 1.0+
91
- - Cloudflare Workers
92
- - Any JavaScript environment with WebAssembly support
89
+
90
+
93
91
 
94
92
  ### System Requirements
95
93
 
96
- - WebAssembly support in runtime environment
97
- - 50 MB minimum free memory for extraction
94
+ - Modern browser with WebAssembly support, or Deno 1.0+, or Cloudflare Workers
98
95
  - Optional: [Tesseract WASM](https://github.com/naptha/tesseract.js) for OCR functionality
99
96
 
100
- ### Runtime Detection
101
97
 
102
- Check platform capabilities before extraction:
103
-
104
- ```typescript
105
- import { getWasmCapabilities } from '@kreuzberg/wasm';
106
-
107
- const caps = getWasmCapabilities();
108
- console.log('WASM available:', caps.hasWasm);
109
- console.log('Web Workers available:', caps.hasWorkers);
110
- console.log('Module Workers available:', caps.hasModuleWorkers);
111
- console.log('File API available:', caps.hasFileApi);
112
- console.log('SharedArrayBuffer available:', caps.hasSharedArrayBuffer);
113
- ```
114
98
 
115
99
  ## Quick Start
116
100
 
@@ -122,283 +106,150 @@ Extract text, metadata, and structure from any supported document format:
122
106
  import { extractBytes, initWasm } from "@kreuzberg/wasm";
123
107
 
124
108
  async function main() {
125
- await initWasm();
109
+ await initWasm();
126
110
 
127
- const buffer = await fetch("document.pdf").then((r) => r.arrayBuffer());
128
- const bytes = new Uint8Array(buffer);
111
+ const buffer = await fetch("document.pdf").then((r) => r.arrayBuffer());
112
+ const bytes = new Uint8Array(buffer);
129
113
 
130
- const result = await extractBytes(bytes, "application/pdf");
114
+ const result = await extractBytes(bytes, "application/pdf");
131
115
 
132
- console.log("Extracted content:");
133
- console.log(result.content);
134
- console.log("MIME type:", result.mimeType);
135
- console.log("Metadata:", result.metadata);
116
+ console.log("Extracted content:");
117
+ console.log(result.content);
118
+ console.log("MIME type:", result.mimeType);
119
+ console.log("Metadata:", result.metadata);
136
120
  }
137
121
 
138
122
  main().catch(console.error);
139
123
  ```
140
124
 
125
+
141
126
  ### Common Use Cases
142
127
 
143
128
  #### Extract with Custom Configuration
144
129
 
145
130
  Most use cases benefit from configuration to control extraction behavior:
146
131
 
132
+
147
133
  **With OCR (for scanned documents):**
148
134
 
149
135
  ```ts
150
136
  import { enableOcr, extractBytes, initWasm } from "@kreuzberg/wasm";
151
137
 
152
138
  async function extractWithOcr() {
153
- await initWasm();
154
-
155
- try {
156
- await enableOcr();
157
- console.log("OCR enabled successfully");
158
- } catch (error) {
159
- console.error("Failed to enable OCR:", error);
160
- return;
161
- }
162
-
163
- const bytes = new Uint8Array(await fetch("scanned-page.png").then((r) => r.arrayBuffer()));
164
-
165
- const result = await extractBytes(bytes, "image/png", {
166
- ocr: {
167
- backend: "tesseract-wasm",
168
- language: "eng",
169
- },
170
- });
171
-
172
- console.log("Extracted text:");
173
- console.log(result.content);
139
+ await initWasm();
140
+
141
+ try {
142
+ await enableOcr();
143
+ console.log("OCR enabled successfully");
144
+ } catch (error) {
145
+ console.error("Failed to enable OCR:", error);
146
+ return;
147
+ }
148
+
149
+ const bytes = new Uint8Array(await fetch("scanned-page.png").then((r) => r.arrayBuffer()));
150
+
151
+ const result = await extractBytes(bytes, "image/png", {
152
+ ocr: {
153
+ backend: "tesseract-wasm",
154
+ language: "eng",
155
+ },
156
+ });
157
+
158
+ console.log("Extracted text:");
159
+ console.log(result.content);
174
160
  }
175
161
 
176
162
  extractWithOcr().catch(console.error);
177
163
  ```
178
164
 
165
+
166
+
167
+
179
168
  #### Table Extraction
180
169
 
170
+
181
171
  See [Table Extraction Guide](https://kreuzberg.dev/features/table-extraction/) for detailed examples.
182
172
 
173
+
174
+
183
175
  #### Processing Multiple Files
184
176
 
177
+
185
178
  ```ts
186
179
  import { extractBytes, initWasm } from "@kreuzberg/wasm";
187
180
 
188
181
  interface DocumentJob {
189
- name: string;
190
- bytes: Uint8Array;
191
- mimeType: string;
182
+ name: string;
183
+ bytes: Uint8Array;
184
+ mimeType: string;
192
185
  }
193
186
 
194
- async function processBatch(documents: DocumentJob[], concurrency: number = 3) {
195
- await initWasm();
196
-
197
- const results: Record<string, string> = {};
198
- const queue = [...documents];
199
-
200
- const workers = Array(concurrency)
201
- .fill(null)
202
- .map(async () => {
203
- while (queue.length > 0) {
204
- const doc = queue.shift();
205
- if (!doc) break;
206
-
207
- try {
208
- const result = await extractBytes(doc.bytes, doc.mimeType);
209
- results[doc.name] = result.content;
210
- } catch (error) {
211
- console.error(`Failed to process ${doc.name}:`, error);
212
- }
213
- }
214
- });
215
-
216
- await Promise.all(workers);
217
- return results;
187
+ async function _processBatch(documents: DocumentJob[], concurrency: number = 3) {
188
+ await initWasm();
189
+
190
+ const results: Record<string, string> = {};
191
+ const queue = [...documents];
192
+
193
+ const workers = Array(concurrency)
194
+ .fill(null)
195
+ .map(async () => {
196
+ while (queue.length > 0) {
197
+ const doc = queue.shift();
198
+ if (!doc) break;
199
+
200
+ try {
201
+ const result = await extractBytes(doc.bytes, doc.mimeType);
202
+ results[doc.name] = result.content;
203
+ } catch (error) {
204
+ console.error(`Failed to process ${doc.name}:`, error);
205
+ }
206
+ }
207
+ });
208
+
209
+ await Promise.all(workers);
210
+ return results;
218
211
  }
219
212
  ```
220
213
 
214
+
215
+
216
+
217
+
221
218
  #### Async Processing
222
219
 
223
220
  For non-blocking document processing:
224
221
 
225
222
  ```ts
226
- import { extractBytes, initWasm, getWasmCapabilities } from "@kreuzberg/wasm";
223
+ import { extractBytes, getWasmCapabilities, initWasm } from "@kreuzberg/wasm";
227
224
 
228
225
  async function extractDocuments(files: Uint8Array[], mimeTypes: string[]) {
229
- const caps = getWasmCapabilities();
230
- if (!caps.hasWasm) {
231
- throw new Error("WebAssembly not supported");
232
- }
226
+ const caps = getWasmCapabilities();
227
+ if (!caps.hasWasm) {
228
+ throw new Error("WebAssembly not supported");
229
+ }
233
230
 
234
- await initWasm();
231
+ await initWasm();
235
232
 
236
- const results = await Promise.all(
237
- files.map((bytes, index) => extractBytes(bytes, mimeTypes[index]))
238
- );
233
+ const results = await Promise.all(files.map((bytes, index) => extractBytes(bytes, mimeTypes[index])));
239
234
 
240
- return results.map((r) => ({
241
- content: r.content,
242
- pageCount: r.metadata?.pageCount,
243
- }));
235
+ return results.map((r) => ({
236
+ content: r.content,
237
+ pageCount: r.metadata?.pageCount,
238
+ }));
244
239
  }
245
240
 
246
241
  const fileBytes = [new Uint8Array([1, 2, 3])];
247
242
  const mimes = ["application/pdf"];
248
243
 
249
244
  extractDocuments(fileBytes, mimes)
250
- .then((results) => console.log(results))
251
- .catch(console.error);
252
- ```
253
-
254
- #### Worker Pool Usage
255
-
256
- When Web Workers are available, use worker threads for parallel document processing without blocking the main thread:
257
-
258
- ```typescript
259
- import { extractBytes, initWasm, hasWorkers, hasModuleWorkers } from '@kreuzberg/wasm';
260
-
261
- class DocumentWorkerPool {
262
- private workers: Worker[] = [];
263
- private taskQueue: Array<{ id: number; data: Uint8Array; mimeType: string; resolve: Function; reject: Function }> = [];
264
- private currentTaskId = 0;
265
-
266
- constructor(workerCount: number = navigator.hardwareConcurrency || 4) {
267
- // Module workers allow importing ES modules, standard workers are more compatible
268
- const useModuleWorkers = hasModuleWorkers();
269
-
270
- for (let i = 0; i < workerCount; i++) {
271
- const worker = useModuleWorkers
272
- ? new Worker(new URL('./extraction-worker.ts', import.meta.url), { type: 'module' })
273
- : new Worker(new URL('./extraction-worker.js', import.meta.url));
274
-
275
- worker.onmessage = (event) => this.handleWorkerMessage(event.data);
276
- worker.onerror = (error) => this.handleWorkerError(error);
277
- this.workers.push(worker);
278
- }
279
- }
280
-
281
- async extract(data: Uint8Array, mimeType: string): Promise<string> {
282
- return new Promise((resolve, reject) => {
283
- this.taskQueue.push({
284
- id: this.currentTaskId++,
285
- data,
286
- mimeType,
287
- resolve,
288
- reject
289
- });
290
- this.processQueue();
291
- });
292
- }
293
-
294
- private processQueue(): void {
295
- while (this.taskQueue.length > 0) {
296
- const task = this.taskQueue.shift();
297
- if (task) {
298
- const worker = this.workers[task.id % this.workers.length];
299
- worker.postMessage({ id: task.id, data: task.data, mimeType: task.mimeType });
300
- }
301
- }
302
- }
303
-
304
- private handleWorkerMessage(data: { id: number; result: string }): void {
305
- const task = this.taskQueue.find(t => t.id === data.id);
306
- if (task) {
307
- task.resolve(data.result);
308
- this.processQueue();
309
- }
310
- }
311
-
312
- private handleWorkerError(error: ErrorEvent): void {
313
- console.error('Worker error:', error.message);
314
- }
315
-
316
- terminate(): void {
317
- this.workers.forEach(w => w.terminate());
318
- }
319
- }
320
-
321
- // Usage
322
- async function processDocumentsInParallel() {
323
- if (!hasWorkers()) {
324
- console.log('Web Workers not available, falling back to main thread');
325
- return;
326
- }
327
-
328
- await initWasm();
329
- const pool = new DocumentWorkerPool(4);
330
-
331
- const documents = [
332
- { data: new Uint8Array([...]), mimeType: 'application/pdf' },
333
- { data: new Uint8Array([...]), mimeType: 'application/pdf' },
334
- ];
335
-
336
- const results = await Promise.all(
337
- documents.map(doc => pool.extract(doc.data, doc.mimeType))
338
- );
339
-
340
- pool.terminate();
341
- return results;
342
- }
343
- ```
344
-
345
- Worker code (`extraction-worker.ts`):
346
-
347
- ```typescript
348
- import { extractBytes, initWasm } from '@kreuzberg/wasm';
349
-
350
- let wasmInitialized = false;
351
-
352
- self.onmessage = async (event) => {
353
- if (!wasmInitialized) {
354
- await initWasm();
355
- wasmInitialized = true;
356
- }
357
-
358
- const { id, data, mimeType } = event.data;
359
- try {
360
- const result = await extractBytes(new Uint8Array(data), mimeType);
361
- self.postMessage({ id, result: result.content });
362
- } catch (error) {
363
- self.postMessage({ id, error: (error as Error).message });
364
- }
365
- };
245
+ .then((results) => console.log(results))
246
+ .catch(console.error);
366
247
  ```
367
248
 
368
- ### Memory Management
369
-
370
- WASM memory is managed by the JavaScript garbage collector:
371
-
372
- ```typescript
373
- import { initWasm, extractBytes } from '@kreuzberg/wasm';
374
249
 
375
- async function extractWithMemoryAwareness() {
376
- await initWasm();
377
250
 
378
- // Process documents one at a time to control memory usage
379
- const documents = [/* ... */];
380
251
 
381
- for (const doc of documents) {
382
- const result = await extractBytes(doc, 'application/pdf');
383
252
 
384
- // Process result immediately
385
- console.log(result.content);
386
-
387
- // Result will be garbage collected when no longer referenced
388
- // Explicitly clear large objects if needed
389
- // gc(); // Requires --expose-gc flag
390
- }
391
- }
392
-
393
- // Check available memory (browser only)
394
- if (performance.memory) {
395
- console.log('Memory usage:', {
396
- usedJSHeapSize: performance.memory.usedJSHeapSize,
397
- totalJSHeapSize: performance.memory.totalJSHeapSize,
398
- jsHeapSizeLimit: performance.memory.jsHeapSizeLimit
399
- });
400
- }
401
- ```
402
253
 
403
254
  ### Next Steps
404
255
 
@@ -408,108 +259,7 @@ if (performance.memory) {
408
259
  - **[Configuration Guide](https://kreuzberg.dev/configuration/)** - Advanced configuration options
409
260
  - **[Troubleshooting](https://kreuzberg.dev/troubleshooting/)** - Common issues and solutions
410
261
 
411
- ## WASM-Specific Implementation Details
412
-
413
- ### Initialization
414
-
415
- WASM binaries must be loaded before extraction:
416
262
 
417
- ```typescript
418
- import { initWasm } from '@kreuzberg/wasm';
419
-
420
- // Initialize once at application startup
421
- await initWasm();
422
-
423
- // Now extraction functions can be used
424
- ```
425
-
426
- The init function:
427
- - Downloads and instantiates the WASM binary
428
- - Initializes the memory space (linear memory module)
429
- - Prepares thread pools if available
430
- - Throws if WASM is not supported in the environment
431
-
432
- ### Threading Model
433
-
434
- - Single-threaded by default (main thread execution)
435
- - Web Workers optional for background processing
436
- - Shared memory (SharedArrayBuffer) not required
437
- - Message passing used for worker communication
438
- - No blocking operations on main thread with worker pool
439
-
440
- ### Memory Considerations
441
-
442
- - Each WASM instance has its own 4GB linear memory address space
443
- - Large documents (> 100 MB) may not fit in WASM memory
444
- - Binary data is copied between JavaScript and WASM boundaries
445
- - Garbage collection is handled by JavaScript runtime
446
- - No manual memory management required
447
-
448
- ### Supported Extraction Targets
449
-
450
- Different file formats have varying support in WASM:
451
-
452
- | Format | Support | Notes |
453
- |--------|---------|-------|
454
- | PDF | Full | Text, images, metadata extraction |
455
- | Office (DOCX, XLSX, PPTX) | Full | All features supported |
456
- | Images (PNG, JPG, etc) | Full | EXIF metadata extraction |
457
- | Archives (ZIP, TAR) | Full | Listing and extraction |
458
- | OCR | Limited | Tesseract WASM only, main thread only |
459
- | Embeddings | Not Available | WASM has no ML model support |
460
-
461
- ### Platform Limitations
462
-
463
- **LibreOffice-Dependent Formats Not Available**
464
-
465
- WASM cannot load native LibreOffice binaries, so older Office formats are **not supported**:
466
-
467
- - ❌ **DOC** (Microsoft Word 97-2003) - Use DOCX instead
468
- - ❌ **XLS** (Microsoft Excel 97-2003) - Use XLSX instead
469
- - ❌ **PPT** (Microsoft PowerPoint 97-2003) - Use PPTX instead
470
- - ❌ **RTF** (Rich Text Format with complex features)
471
- - ❌ **ODT/ODS/ODP** (LibreOffice/OpenOffice formats)
472
-
473
- Modern Office formats (DOCX, XLSX, PPTX) are fully supported and don't require LibreOffice.
474
-
475
- **Polars Integration Not Available**
476
-
477
- - ❌ Polars DataFrame extraction/conversion not available in WASM
478
- - ❌ Structured data operations limited compared to Node.js binding
479
-
480
- **Alternative: Use Node.js Binding**
481
-
482
- If you need support for older Office formats or Polars integration, use the `@kreuzberg/node` package instead:
483
-
484
- ```bash
485
- npm install @kreuzberg/node
486
- ```
487
-
488
- The Node.js binding provides:
489
- - ✅ Full LibreOffice format support (DOC, XLS, PPT, RTF, ODT)
490
- - ✅ Polars DataFrame integration
491
- - ✅ All OCR backends (Tesseract, EasyOCR, PaddleOCR)
492
- - ✅ Full embedding model support
493
-
494
- **Format Comparison Table**
495
-
496
- | Format Type | WASM Support | Node.js Support |
497
- |-------------|--------------|-----------------|
498
- | Modern Office (DOCX/XLSX/PPTX) | ✅ Full | ✅ Full |
499
- | Legacy Office (DOC/XLS/PPT) | ❌ Not Available | ✅ Requires LibreOffice |
500
- | OpenOffice (ODT/ODS/ODP) | ❌ Not Available | ✅ Requires LibreOffice |
501
- | PDF | ✅ Full | ✅ Full |
502
- | Images | ✅ Full | ✅ Full |
503
- | Embeddings | ❌ Not Available | ✅ With ONNX Runtime |
504
- | Polars | ❌ Not Available | ✅ Available |
505
-
506
- ### Sandbox Security
507
-
508
- - WASM code runs in a sandbox with restricted capabilities
509
- - File system access requires user interaction (File API)
510
- - Network access follows CORS restrictions
511
- - No access to Node.js native modules
512
- - Content Security Policy (CSP) may restrict WASM loading
513
263
 
514
264
  ## Features
515
265
 
@@ -563,25 +313,20 @@ The Node.js binding provides:
563
313
  ### Key Capabilities
564
314
 
565
315
  - **Text Extraction** - Extract all text content with position and formatting information
566
-
567
316
  - **Metadata Extraction** - Retrieve document properties, creation date, author, etc.
568
-
569
317
  - **Table Extraction** - Parse tables with structure and cell content preservation
570
-
571
318
  - **Image Extraction** - Extract embedded images and render page previews
572
-
573
319
  - **OCR Support** - Integrate multiple OCR backends for scanned documents
574
320
 
575
321
  - **Async/Await** - Non-blocking document processing with concurrent operations
576
322
 
323
+
577
324
  - **Plugin System** - Extensible post-processing for custom text transformation
578
325
 
579
- - **Batch Processing** - Efficiently process multiple documents in parallel
580
326
 
327
+ - **Batch Processing** - Efficiently process multiple documents in parallel
581
328
  - **Memory Efficient** - Stream large files without loading entirely into memory
582
-
583
329
  - **Language Detection** - Detect and support multiple languages in documents
584
-
585
330
  - **Configuration** - Fine-grained control over extraction behavior
586
331
 
587
332
  ### Performance Characteristics
@@ -594,83 +339,96 @@ The Node.js binding provides:
594
339
  | **Archives** | 5-50 MB/s | ~200MB per doc | ZIP, TAR, etc. |
595
340
  | **Web formats** | 50-200 MB/s | Streaming | HTML, XML, JSON |
596
341
 
342
+
343
+
597
344
  ## OCR Support
598
345
 
599
346
  Kreuzberg supports multiple OCR backends for extracting text from scanned documents and images:
600
347
 
348
+
601
349
  - **Tesseract-Wasm**
602
350
 
351
+
603
352
  ### OCR Configuration Example
604
353
 
605
354
  ```ts
606
355
  import { enableOcr, extractBytes, initWasm } from "@kreuzberg/wasm";
607
356
 
608
357
  async function extractWithOcr() {
609
- await initWasm();
610
-
611
- try {
612
- await enableOcr();
613
- console.log("OCR enabled successfully");
614
- } catch (error) {
615
- console.error("Failed to enable OCR:", error);
616
- return;
617
- }
618
-
619
- const bytes = new Uint8Array(await fetch("scanned-page.png").then((r) => r.arrayBuffer()));
620
-
621
- const result = await extractBytes(bytes, "image/png", {
622
- ocr: {
623
- backend: "tesseract-wasm",
624
- language: "eng",
625
- },
626
- });
627
-
628
- console.log("Extracted text:");
629
- console.log(result.content);
358
+ await initWasm();
359
+
360
+ try {
361
+ await enableOcr();
362
+ console.log("OCR enabled successfully");
363
+ } catch (error) {
364
+ console.error("Failed to enable OCR:", error);
365
+ return;
366
+ }
367
+
368
+ const bytes = new Uint8Array(await fetch("scanned-page.png").then((r) => r.arrayBuffer()));
369
+
370
+ const result = await extractBytes(bytes, "image/png", {
371
+ ocr: {
372
+ backend: "tesseract-wasm",
373
+ language: "eng",
374
+ },
375
+ });
376
+
377
+ console.log("Extracted text:");
378
+ console.log(result.content);
630
379
  }
631
380
 
632
381
  extractWithOcr().catch(console.error);
633
382
  ```
634
383
 
384
+
385
+
386
+
635
387
  ## Async Support
636
388
 
637
389
  This binding provides full async/await support for non-blocking document processing:
638
390
 
639
391
  ```ts
640
- import { extractBytes, initWasm, getWasmCapabilities } from "@kreuzberg/wasm";
392
+ import { extractBytes, getWasmCapabilities, initWasm } from "@kreuzberg/wasm";
641
393
 
642
394
  async function extractDocuments(files: Uint8Array[], mimeTypes: string[]) {
643
- const caps = getWasmCapabilities();
644
- if (!caps.hasWasm) {
645
- throw new Error("WebAssembly not supported");
646
- }
395
+ const caps = getWasmCapabilities();
396
+ if (!caps.hasWasm) {
397
+ throw new Error("WebAssembly not supported");
398
+ }
647
399
 
648
- await initWasm();
400
+ await initWasm();
649
401
 
650
- const results = await Promise.all(
651
- files.map((bytes, index) => extractBytes(bytes, mimeTypes[index]))
652
- );
402
+ const results = await Promise.all(files.map((bytes, index) => extractBytes(bytes, mimeTypes[index])));
653
403
 
654
- return results.map((r) => ({
655
- content: r.content,
656
- pageCount: r.metadata?.pageCount,
657
- }));
404
+ return results.map((r) => ({
405
+ content: r.content,
406
+ pageCount: r.metadata?.pageCount,
407
+ }));
658
408
  }
659
409
 
660
410
  const fileBytes = [new Uint8Array([1, 2, 3])];
661
411
  const mimes = ["application/pdf"];
662
412
 
663
413
  extractDocuments(fileBytes, mimes)
664
- .then((results) => console.log(results))
665
- .catch(console.error);
414
+ .then((results) => console.log(results))
415
+ .catch(console.error);
666
416
  ```
667
417
 
418
+
419
+
420
+
668
421
  ## Plugin System
669
422
 
670
423
  Kreuzberg supports extensible post-processing plugins for custom text transformation and filtering.
671
424
 
672
425
  For detailed plugin documentation, visit [Plugin System Guide](https://kreuzberg.dev/plugins/).
673
426
 
427
+
428
+
429
+
430
+
431
+
674
432
  ## Batch Processing
675
433
 
676
434
  Process multiple documents efficiently:
@@ -679,38 +437,41 @@ Process multiple documents efficiently:
679
437
  import { extractBytes, initWasm } from "@kreuzberg/wasm";
680
438
 
681
439
  interface DocumentJob {
682
- name: string;
683
- bytes: Uint8Array;
684
- mimeType: string;
440
+ name: string;
441
+ bytes: Uint8Array;
442
+ mimeType: string;
685
443
  }
686
444
 
687
- async function processBatch(documents: DocumentJob[], concurrency: number = 3) {
688
- await initWasm();
689
-
690
- const results: Record<string, string> = {};
691
- const queue = [...documents];
692
-
693
- const workers = Array(concurrency)
694
- .fill(null)
695
- .map(async () => {
696
- while (queue.length > 0) {
697
- const doc = queue.shift();
698
- if (!doc) break;
699
-
700
- try {
701
- const result = await extractBytes(doc.bytes, doc.mimeType);
702
- results[doc.name] = result.content;
703
- } catch (error) {
704
- console.error(`Failed to process ${doc.name}:`, error);
705
- }
706
- }
707
- });
708
-
709
- await Promise.all(workers);
710
- return results;
445
+ async function _processBatch(documents: DocumentJob[], concurrency: number = 3) {
446
+ await initWasm();
447
+
448
+ const results: Record<string, string> = {};
449
+ const queue = [...documents];
450
+
451
+ const workers = Array(concurrency)
452
+ .fill(null)
453
+ .map(async () => {
454
+ while (queue.length > 0) {
455
+ const doc = queue.shift();
456
+ if (!doc) break;
457
+
458
+ try {
459
+ const result = await extractBytes(doc.bytes, doc.mimeType);
460
+ results[doc.name] = result.content;
461
+ } catch (error) {
462
+ console.error(`Failed to process ${doc.name}:`, error);
463
+ }
464
+ }
465
+ });
466
+
467
+ await Promise.all(workers);
468
+ return results;
711
469
  }
712
470
  ```
713
471
 
472
+
473
+
474
+
714
475
  ## Configuration
715
476
 
716
477
  For advanced configuration options including language detection, table extraction, OCR settings, and more: