@kreuzberg/wasm 4.0.0-rc.6 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +7 -0
- package/README.md +321 -800
- package/dist/adapters/wasm-adapter.d.ts +7 -10
- package/dist/adapters/wasm-adapter.d.ts.map +1 -0
- package/dist/adapters/wasm-adapter.js +53 -54
- package/dist/adapters/wasm-adapter.js.map +1 -1
- package/dist/index.d.ts +23 -67
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +1102 -104
- package/dist/index.js.map +1 -1
- package/dist/ocr/registry.d.ts +7 -10
- package/dist/ocr/registry.d.ts.map +1 -0
- package/dist/ocr/registry.js +9 -28
- package/dist/ocr/registry.js.map +1 -1
- package/dist/ocr/tesseract-wasm-backend.d.ts +3 -6
- package/dist/ocr/tesseract-wasm-backend.d.ts.map +1 -0
- package/dist/ocr/tesseract-wasm-backend.js +8 -83
- package/dist/ocr/tesseract-wasm-backend.js.map +1 -1
- package/dist/pdfium.js +77 -0
- package/dist/pkg/LICENSE +7 -0
- package/dist/pkg/README.md +503 -0
- package/dist/{kreuzberg_wasm.d.ts → pkg/kreuzberg_wasm.d.ts} +24 -12
- package/dist/{kreuzberg_wasm.js → pkg/kreuzberg_wasm.js} +224 -233
- package/dist/pkg/kreuzberg_wasm_bg.js +1871 -0
- package/dist/{kreuzberg_wasm_bg.wasm → pkg/kreuzberg_wasm_bg.wasm} +0 -0
- package/dist/{kreuzberg_wasm_bg.wasm.d.ts → pkg/kreuzberg_wasm_bg.wasm.d.ts} +10 -13
- package/dist/pkg/package.json +27 -0
- package/dist/plugin-registry.d.ts +246 -0
- package/dist/plugin-registry.d.ts.map +1 -0
- package/dist/runtime.d.ts +21 -22
- package/dist/runtime.d.ts.map +1 -0
- package/dist/runtime.js +21 -41
- package/dist/runtime.js.map +1 -1
- package/dist/types.d.ts +363 -0
- package/dist/types.d.ts.map +1 -0
- package/package.json +34 -51
- package/dist/adapters/wasm-adapter.d.mts +0 -121
- package/dist/adapters/wasm-adapter.mjs +0 -221
- package/dist/adapters/wasm-adapter.mjs.map +0 -1
- package/dist/index.d.mts +0 -466
- package/dist/index.mjs +0 -384
- package/dist/index.mjs.map +0 -1
- package/dist/kreuzberg_wasm.d.mts +0 -758
- package/dist/kreuzberg_wasm.mjs +0 -48
- package/dist/ocr/registry.d.mts +0 -102
- package/dist/ocr/registry.mjs +0 -70
- package/dist/ocr/registry.mjs.map +0 -1
- package/dist/ocr/tesseract-wasm-backend.d.mts +0 -257
- package/dist/ocr/tesseract-wasm-backend.mjs +0 -424
- package/dist/ocr/tesseract-wasm-backend.mjs.map +0 -1
- package/dist/runtime.d.mts +0 -256
- package/dist/runtime.mjs +0 -152
- package/dist/runtime.mjs.map +0 -1
- package/dist/snippets/wasm-bindgen-rayon-38edf6e439f6d70d/src/workerHelpers.js +0 -107
- package/dist/types-GJVIvbPy.d.mts +0 -221
- package/dist/types-GJVIvbPy.d.ts +0 -221
|
@@ -0,0 +1,503 @@
|
|
|
1
|
+
# WebAssembly
|
|
2
|
+
|
|
3
|
+
<div align="center" style="display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin: 20px 0;">
|
|
4
|
+
<!-- Language Bindings -->
|
|
5
|
+
<a href="https://crates.io/crates/kreuzberg">
|
|
6
|
+
<img src="https://img.shields.io/crates/v/kreuzberg?label=Rust&color=007ec6" alt="Rust">
|
|
7
|
+
</a>
|
|
8
|
+
<a href="https://hex.pm/packages/kreuzberg">
|
|
9
|
+
<img src="https://img.shields.io/hexpm/v/kreuzberg?label=Elixir&color=007ec6" alt="Elixir">
|
|
10
|
+
</a>
|
|
11
|
+
<a href="https://pypi.org/project/kreuzberg/">
|
|
12
|
+
<img src="https://img.shields.io/pypi/v/kreuzberg?label=Python&color=007ec6" alt="Python">
|
|
13
|
+
</a>
|
|
14
|
+
<a href="https://www.npmjs.com/package/@kreuzberg/node">
|
|
15
|
+
<img src="https://img.shields.io/npm/v/@kreuzberg/node?label=Node.js&color=007ec6" alt="Node.js">
|
|
16
|
+
</a>
|
|
17
|
+
<a href="https://www.npmjs.com/package/@kreuzberg/wasm">
|
|
18
|
+
<img src="https://img.shields.io/npm/v/@kreuzberg/wasm?label=WASM&color=007ec6" alt="WASM">
|
|
19
|
+
</a>
|
|
20
|
+
|
|
21
|
+
<a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg">
|
|
22
|
+
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
|
+
</a>
|
|
24
|
+
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.0.0" alt="Go">
|
|
26
|
+
</a>
|
|
27
|
+
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
|
+
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
29
|
+
</a>
|
|
30
|
+
<a href="https://packagist.org/packages/kreuzberg/kreuzberg">
|
|
31
|
+
<img src="https://img.shields.io/packagist/v/kreuzberg/kreuzberg?label=PHP&color=007ec6" alt="PHP">
|
|
32
|
+
</a>
|
|
33
|
+
<a href="https://rubygems.org/gems/kreuzberg">
|
|
34
|
+
<img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
|
|
35
|
+
</a>
|
|
36
|
+
|
|
37
|
+
<!-- Project Info -->
|
|
38
|
+
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
|
|
39
|
+
<img src="https://img.shields.io/badge/License-MIT-blue.svg" alt="License">
|
|
40
|
+
</a>
|
|
41
|
+
<a href="https://docs.kreuzberg.dev">
|
|
42
|
+
<img src="https://img.shields.io/badge/docs-kreuzberg.dev-blue" alt="Documentation">
|
|
43
|
+
</a>
|
|
44
|
+
</div>
|
|
45
|
+
|
|
46
|
+
<img width="1128" height="191" alt="Banner2" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
|
|
47
|
+
|
|
48
|
+
<div align="center" style="margin-top: 20px;">
|
|
49
|
+
<a href="https://discord.gg/pXxagNK2zN">
|
|
50
|
+
<img height="22" src="https://img.shields.io/badge/Discord-Join%20our%20community-7289da?logo=discord&logoColor=white" alt="Discord">
|
|
51
|
+
</a>
|
|
52
|
+
</div>
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
Extract text, tables, images, and metadata from 56 file formats including PDF, Office documents, and images. This package provides WebAssembly bindings for browsers, Deno, and Cloudflare Workers, with portable deployment and multi-threading support.
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
## Installation
|
|
59
|
+
|
|
60
|
+
### Package Installation
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
Install via one of the supported package managers:
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
**npm:**
|
|
68
|
+
```bash
|
|
69
|
+
npm install @kreuzberg/wasm
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
**pnpm:**
|
|
76
|
+
```bash
|
|
77
|
+
pnpm add @kreuzberg/wasm
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
**yarn:**
|
|
84
|
+
```bash
|
|
85
|
+
yarn add @kreuzberg/wasm
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
### System Requirements
|
|
93
|
+
|
|
94
|
+
- Modern browser with WebAssembly support, or Deno 1.0+, or Cloudflare Workers
|
|
95
|
+
- Optional: [Tesseract WASM](https://github.com/naptha/tesseract.js) for OCR functionality
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
## Quick Start
|
|
100
|
+
|
|
101
|
+
### Basic Extraction
|
|
102
|
+
|
|
103
|
+
Extract text, metadata, and structure from any supported document format:
|
|
104
|
+
|
|
105
|
+
```ts
|
|
106
|
+
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
|
107
|
+
|
|
108
|
+
async function main() {
|
|
109
|
+
await initWasm();
|
|
110
|
+
|
|
111
|
+
const buffer = await fetch("document.pdf").then((r) => r.arrayBuffer());
|
|
112
|
+
const bytes = new Uint8Array(buffer);
|
|
113
|
+
|
|
114
|
+
const result = await extractBytes(bytes, "application/pdf");
|
|
115
|
+
|
|
116
|
+
console.log("Extracted content:");
|
|
117
|
+
console.log(result.content);
|
|
118
|
+
console.log("MIME type:", result.mimeType);
|
|
119
|
+
console.log("Metadata:", result.metadata);
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
main().catch(console.error);
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
### Common Use Cases
|
|
127
|
+
|
|
128
|
+
#### Extract with Custom Configuration
|
|
129
|
+
|
|
130
|
+
Most use cases benefit from configuration to control extraction behavior:
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
**With OCR (for scanned documents):**
|
|
134
|
+
|
|
135
|
+
```ts
|
|
136
|
+
import { enableOcr, extractBytes, initWasm } from "@kreuzberg/wasm";
|
|
137
|
+
|
|
138
|
+
async function extractWithOcr() {
|
|
139
|
+
await initWasm();
|
|
140
|
+
|
|
141
|
+
try {
|
|
142
|
+
await enableOcr();
|
|
143
|
+
console.log("OCR enabled successfully");
|
|
144
|
+
} catch (error) {
|
|
145
|
+
console.error("Failed to enable OCR:", error);
|
|
146
|
+
return;
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
const bytes = new Uint8Array(await fetch("scanned-page.png").then((r) => r.arrayBuffer()));
|
|
150
|
+
|
|
151
|
+
const result = await extractBytes(bytes, "image/png", {
|
|
152
|
+
ocr: {
|
|
153
|
+
backend: "tesseract-wasm",
|
|
154
|
+
language: "eng",
|
|
155
|
+
},
|
|
156
|
+
});
|
|
157
|
+
|
|
158
|
+
console.log("Extracted text:");
|
|
159
|
+
console.log(result.content);
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
extractWithOcr().catch(console.error);
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
#### Table Extraction
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
See the [Table Extraction Guide](https://kreuzberg.dev/features/table-extraction/) for detailed examples.
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
#### Processing Multiple Files
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
```ts
|
|
179
|
+
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
|
180
|
+
|
|
181
|
+
interface DocumentJob {
|
|
182
|
+
name: string;
|
|
183
|
+
bytes: Uint8Array;
|
|
184
|
+
mimeType: string;
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
async function _processBatch(documents: DocumentJob[], concurrency: number = 3) {
|
|
188
|
+
await initWasm();
|
|
189
|
+
|
|
190
|
+
const results: Record<string, string> = {};
|
|
191
|
+
const queue = [...documents];
|
|
192
|
+
|
|
193
|
+
const workers = Array(concurrency)
|
|
194
|
+
.fill(null)
|
|
195
|
+
.map(async () => {
|
|
196
|
+
while (queue.length > 0) {
|
|
197
|
+
const doc = queue.shift();
|
|
198
|
+
if (!doc) break;
|
|
199
|
+
|
|
200
|
+
try {
|
|
201
|
+
const result = await extractBytes(doc.bytes, doc.mimeType);
|
|
202
|
+
results[doc.name] = result.content;
|
|
203
|
+
} catch (error) {
|
|
204
|
+
console.error(`Failed to process ${doc.name}:`, error);
|
|
205
|
+
}
|
|
206
|
+
}
|
|
207
|
+
});
|
|
208
|
+
|
|
209
|
+
await Promise.all(workers);
|
|
210
|
+
return results;
|
|
211
|
+
}
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
#### Async Processing
|
|
219
|
+
|
|
220
|
+
For non-blocking document processing:
|
|
221
|
+
|
|
222
|
+
```ts
|
|
223
|
+
import { extractBytes, getWasmCapabilities, initWasm } from "@kreuzberg/wasm";
|
|
224
|
+
|
|
225
|
+
async function extractDocuments(files: Uint8Array[], mimeTypes: string[]) {
|
|
226
|
+
const caps = getWasmCapabilities();
|
|
227
|
+
if (!caps.hasWasm) {
|
|
228
|
+
throw new Error("WebAssembly not supported");
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
await initWasm();
|
|
232
|
+
|
|
233
|
+
const results = await Promise.all(files.map((bytes, index) => extractBytes(bytes, mimeTypes[index])));
|
|
234
|
+
|
|
235
|
+
return results.map((r) => ({
|
|
236
|
+
content: r.content,
|
|
237
|
+
pageCount: r.metadata?.pageCount,
|
|
238
|
+
}));
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
const fileBytes = [new Uint8Array([1, 2, 3])];
|
|
242
|
+
const mimes = ["application/pdf"];
|
|
243
|
+
|
|
244
|
+
extractDocuments(fileBytes, mimes)
|
|
245
|
+
.then((results) => console.log(results))
|
|
246
|
+
.catch(console.error);
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
### Next Steps
|
|
255
|
+
|
|
256
|
+
- **[Installation Guide](https://kreuzberg.dev/getting-started/installation/)** - Platform-specific setup
|
|
257
|
+
- **[API Documentation](https://kreuzberg.dev/api/)** - Complete API reference
|
|
258
|
+
- **[Examples & Guides](https://kreuzberg.dev/guides/)** - Full code examples and usage guides
|
|
259
|
+
- **[Configuration Guide](https://kreuzberg.dev/configuration/)** - Advanced configuration options
|
|
260
|
+
- **[Troubleshooting](https://kreuzberg.dev/troubleshooting/)** - Common issues and solutions
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
## Features
|
|
265
|
+
|
|
266
|
+
### Supported File Formats (56+)
|
|
267
|
+
|
|
268
|
+
56 file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
|
|
269
|
+
|
|
270
|
+
#### Office Documents
|
|
271
|
+
|
|
272
|
+
| Category | Formats | Capabilities |
|
|
273
|
+
|----------|---------|--------------|
|
|
274
|
+
| **Word Processing** | `.docx`, `.odt` | Full text, tables, images, metadata, styles |
|
|
275
|
+
| **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.ods` | Sheet data, formulas, cell metadata, charts |
|
|
276
|
+
| **Presentations** | `.pptx`, `.ppt`, `.ppsx` | Slides, speaker notes, images, metadata |
|
|
277
|
+
| **PDF** | `.pdf` | Text, tables, images, metadata, OCR support |
|
|
278
|
+
| **eBooks** | `.epub`, `.fb2` | Chapters, metadata, embedded resources |
|
|
279
|
+
|
|
280
|
+
#### Images (OCR-Enabled)
|
|
281
|
+
|
|
282
|
+
| Category | Formats | Features |
|
|
283
|
+
|----------|---------|----------|
|
|
284
|
+
| **Raster** | `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.bmp`, `.tiff`, `.tif` | OCR, table detection, EXIF metadata, dimensions, color space |
|
|
285
|
+
| **Advanced** | `.jp2`, `.jpx`, `.jpm`, `.mj2`, `.pnm`, `.pbm`, `.pgm`, `.ppm` | OCR, table detection, format-specific metadata |
|
|
286
|
+
| **Vector** | `.svg` | DOM parsing, embedded text, graphics metadata |
|
|
287
|
+
|
|
288
|
+
#### Web & Data
|
|
289
|
+
|
|
290
|
+
| Category | Formats | Features |
|
|
291
|
+
|----------|---------|----------|
|
|
292
|
+
| **Markup** | `.html`, `.htm`, `.xhtml`, `.xml`, `.svg` | DOM parsing, metadata (Open Graph, Twitter Card), link extraction |
|
|
293
|
+
| **Structured Data** | `.json`, `.yaml`, `.yml`, `.toml`, `.csv`, `.tsv` | Schema detection, nested structures, validation |
|
|
294
|
+
| **Text & Markdown** | `.txt`, `.md`, `.markdown`, `.rst`, `.org`, `.rtf` | CommonMark, GFM, reStructuredText, Org Mode |
|
|
295
|
+
|
|
296
|
+
#### Email & Archives
|
|
297
|
+
|
|
298
|
+
| Category | Formats | Features |
|
|
299
|
+
|----------|---------|----------|
|
|
300
|
+
| **Email** | `.eml`, `.msg` | Headers, body (HTML/plain), attachments, threading |
|
|
301
|
+
| **Archives** | `.zip`, `.tar`, `.tgz`, `.gz`, `.7z` | File listing, nested archives, metadata |
|
|
302
|
+
|
|
303
|
+
#### Academic & Scientific
|
|
304
|
+
|
|
305
|
+
| Category | Formats | Features |
|
|
306
|
+
|----------|---------|----------|
|
|
307
|
+
| **Citations** | `.bib`, `.biblatex`, `.ris`, `.enw`, `.csl` | Bibliography parsing, citation extraction |
|
|
308
|
+
| **Scientific** | `.tex`, `.latex`, `.typst`, `.jats`, `.ipynb`, `.docbook` | LaTeX, Jupyter notebooks, PubMed JATS |
|
|
309
|
+
| **Documentation** | `.opml`, `.pod`, `.mdoc`, `.troff` | Technical documentation formats |
|
|
310
|
+
|
|
311
|
+
**[Complete Format Reference](https://kreuzberg.dev/reference/formats/)**
|
|
312
|
+
|
|
313
|
+
### Key Capabilities
|
|
314
|
+
|
|
315
|
+
- **Text Extraction** - Extract all text content with position and formatting information
|
|
316
|
+
- **Metadata Extraction** - Retrieve document properties, creation date, author, etc.
|
|
317
|
+
- **Table Extraction** - Parse tables with structure and cell content preservation
|
|
318
|
+
- **Image Extraction** - Extract embedded images and render page previews
|
|
319
|
+
- **OCR Support** - Integrate multiple OCR backends for scanned documents
|
|
320
|
+
|
|
321
|
+
- **Async/Await** - Non-blocking document processing with concurrent operations
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
- **Plugin System** - Extensible post-processing for custom text transformation
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
- **Batch Processing** - Efficiently process multiple documents in parallel
|
|
328
|
+
- **Memory Efficient** - Stream large files without loading entirely into memory
|
|
329
|
+
- **Language Detection** - Detect and support multiple languages in documents
|
|
330
|
+
- **Configuration** - Fine-grained control over extraction behavior
|
|
331
|
+
|
|
332
|
+
### Performance Characteristics
|
|
333
|
+
|
|
334
|
+
| Format | Speed | Memory | Notes |
|
|
335
|
+
|--------|-------|--------|-------|
|
|
336
|
+
| **PDF (text)** | 10-100 MB/s | ~50MB per doc | Fastest extraction |
|
|
337
|
+
| **Office docs** | 20-200 MB/s | ~100MB per doc | DOCX, XLSX, PPTX |
|
|
338
|
+
| **Images (OCR)** | 1-5 MB/s | Variable | Depends on OCR backend |
|
|
339
|
+
| **Archives** | 5-50 MB/s | ~200MB per doc | ZIP, TAR, etc. |
|
|
340
|
+
| **Web formats** | 50-200 MB/s | Streaming | HTML, XML, JSON |
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
## OCR Support
|
|
345
|
+
|
|
346
|
+
Kreuzberg supports multiple OCR backends for extracting text from scanned documents and images:
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
- **Tesseract-Wasm**
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
### OCR Configuration Example
|
|
353
|
+
|
|
354
|
+
```ts
|
|
355
|
+
import { enableOcr, extractBytes, initWasm } from "@kreuzberg/wasm";
|
|
356
|
+
|
|
357
|
+
async function extractWithOcr() {
|
|
358
|
+
await initWasm();
|
|
359
|
+
|
|
360
|
+
try {
|
|
361
|
+
await enableOcr();
|
|
362
|
+
console.log("OCR enabled successfully");
|
|
363
|
+
} catch (error) {
|
|
364
|
+
console.error("Failed to enable OCR:", error);
|
|
365
|
+
return;
|
|
366
|
+
}
|
|
367
|
+
|
|
368
|
+
const bytes = new Uint8Array(await fetch("scanned-page.png").then((r) => r.arrayBuffer()));
|
|
369
|
+
|
|
370
|
+
const result = await extractBytes(bytes, "image/png", {
|
|
371
|
+
ocr: {
|
|
372
|
+
backend: "tesseract-wasm",
|
|
373
|
+
language: "eng",
|
|
374
|
+
},
|
|
375
|
+
});
|
|
376
|
+
|
|
377
|
+
console.log("Extracted text:");
|
|
378
|
+
console.log(result.content);
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
extractWithOcr().catch(console.error);
|
|
382
|
+
```
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
## Async Support
|
|
388
|
+
|
|
389
|
+
This binding provides full async/await support for non-blocking document processing:
|
|
390
|
+
|
|
391
|
+
```ts
|
|
392
|
+
import { extractBytes, getWasmCapabilities, initWasm } from "@kreuzberg/wasm";
|
|
393
|
+
|
|
394
|
+
async function extractDocuments(files: Uint8Array[], mimeTypes: string[]) {
|
|
395
|
+
const caps = getWasmCapabilities();
|
|
396
|
+
if (!caps.hasWasm) {
|
|
397
|
+
throw new Error("WebAssembly not supported");
|
|
398
|
+
}
|
|
399
|
+
|
|
400
|
+
await initWasm();
|
|
401
|
+
|
|
402
|
+
const results = await Promise.all(files.map((bytes, index) => extractBytes(bytes, mimeTypes[index])));
|
|
403
|
+
|
|
404
|
+
return results.map((r) => ({
|
|
405
|
+
content: r.content,
|
|
406
|
+
pageCount: r.metadata?.pageCount,
|
|
407
|
+
}));
|
|
408
|
+
}
|
|
409
|
+
|
|
410
|
+
const fileBytes = [new Uint8Array([1, 2, 3])];
|
|
411
|
+
const mimes = ["application/pdf"];
|
|
412
|
+
|
|
413
|
+
extractDocuments(fileBytes, mimes)
|
|
414
|
+
.then((results) => console.log(results))
|
|
415
|
+
.catch(console.error);
|
|
416
|
+
```
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
## Plugin System
|
|
422
|
+
|
|
423
|
+
Kreuzberg supports extensible post-processing plugins for custom text transformation and filtering.
|
|
424
|
+
|
|
425
|
+
For detailed plugin documentation, visit the [Plugin System Guide](https://kreuzberg.dev/plugins/).
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
## Batch Processing
|
|
433
|
+
|
|
434
|
+
Process multiple documents efficiently:
|
|
435
|
+
|
|
436
|
+
```ts
|
|
437
|
+
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
|
438
|
+
|
|
439
|
+
interface DocumentJob {
|
|
440
|
+
name: string;
|
|
441
|
+
bytes: Uint8Array;
|
|
442
|
+
mimeType: string;
|
|
443
|
+
}
|
|
444
|
+
|
|
445
|
+
async function _processBatch(documents: DocumentJob[], concurrency: number = 3) {
|
|
446
|
+
await initWasm();
|
|
447
|
+
|
|
448
|
+
const results: Record<string, string> = {};
|
|
449
|
+
const queue = [...documents];
|
|
450
|
+
|
|
451
|
+
const workers = Array(concurrency)
|
|
452
|
+
.fill(null)
|
|
453
|
+
.map(async () => {
|
|
454
|
+
while (queue.length > 0) {
|
|
455
|
+
const doc = queue.shift();
|
|
456
|
+
if (!doc) break;
|
|
457
|
+
|
|
458
|
+
try {
|
|
459
|
+
const result = await extractBytes(doc.bytes, doc.mimeType);
|
|
460
|
+
results[doc.name] = result.content;
|
|
461
|
+
} catch (error) {
|
|
462
|
+
console.error(`Failed to process ${doc.name}:`, error);
|
|
463
|
+
}
|
|
464
|
+
}
|
|
465
|
+
});
|
|
466
|
+
|
|
467
|
+
await Promise.all(workers);
|
|
468
|
+
return results;
|
|
469
|
+
}
|
|
470
|
+
```
|
|
471
|
+
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
|
|
475
|
+
## Configuration
|
|
476
|
+
|
|
477
|
+
For advanced configuration options including language detection, table extraction, OCR settings, and more:
|
|
478
|
+
|
|
479
|
+
**[Configuration Guide](https://kreuzberg.dev/configuration/)**
|
|
480
|
+
|
|
481
|
+
## Documentation
|
|
482
|
+
|
|
483
|
+
- **[Official Documentation](https://kreuzberg.dev/)**
|
|
484
|
+
- **[API Reference](https://kreuzberg.dev/reference/api-wasm/)**
|
|
485
|
+
- **[Examples & Guides](https://kreuzberg.dev/guides/)**
|
|
486
|
+
|
|
487
|
+
## Troubleshooting
|
|
488
|
+
|
|
489
|
+
For common issues and solutions, visit the [Troubleshooting Guide](https://kreuzberg.dev/troubleshooting/).
|
|
490
|
+
|
|
491
|
+
## Contributing
|
|
492
|
+
|
|
493
|
+
Contributions are welcome! See the [Contributing Guide](https://github.com/kreuzberg-dev/kreuzberg/blob/main/CONTRIBUTING.md).
|
|
494
|
+
|
|
495
|
+
## License
|
|
496
|
+
|
|
497
|
+
MIT License - see LICENSE file for details.
|
|
498
|
+
|
|
499
|
+
## Support
|
|
500
|
+
|
|
501
|
+
- **Discord Community**: [Join our Discord](https://discord.gg/pXxagNK2zN)
|
|
502
|
+
- **GitHub Issues**: [Report bugs](https://github.com/kreuzberg-dev/kreuzberg/issues)
|
|
503
|
+
- **Discussions**: [Ask questions](https://github.com/kreuzberg-dev/kreuzberg/discussions)
|
|
@@ -429,7 +429,7 @@ export function getExtensionsForMime(mime_type: string): Array<any>;
|
|
|
429
429
|
* console.log(unknownMime); // null
|
|
430
430
|
* ```
|
|
431
431
|
*/
|
|
432
|
-
export function getMimeFromExtension(extension: string): string
|
|
432
|
+
export function getMimeFromExtension(extension: string): string;
|
|
433
433
|
|
|
434
434
|
/**
|
|
435
435
|
* Get module information
|
|
@@ -442,7 +442,7 @@ export function get_module_info(): ModuleInfo;
|
|
|
442
442
|
*/
|
|
443
443
|
export function init(): void;
|
|
444
444
|
|
|
445
|
-
export function initThreadPool(
|
|
445
|
+
export function initThreadPool(_num_threads: number): Promise<any>;
|
|
446
446
|
|
|
447
447
|
/**
|
|
448
448
|
* Helper function to initialize the thread pool with error handling
|
|
@@ -455,6 +455,15 @@ export function initThreadPool(num_threads: number): Promise<any>;
|
|
|
455
455
|
*/
|
|
456
456
|
export function init_thread_pool_safe(num_threads: number): boolean;
|
|
457
457
|
|
|
458
|
+
/**
|
|
459
|
+
* Establishes a binding between an external Pdfium WASM module and `pdfium-render`'s WASM module.
|
|
460
|
+
* This function should be called from JavaScript once the external Pdfium WASM module has been loaded
|
|
461
|
+
* into the browser. It is essential that this function is called _before_ initializing
|
|
462
|
+
* `pdfium-render` from within Rust code. For an example, see:
|
|
463
|
+
* <https://github.com/ajrcarey/pdfium-render/blob/master/examples/index.html>
|
|
464
|
+
*/
|
|
465
|
+
export function initialize_pdfium_render(pdfium_wasm_module: any, local_wasm_module: any, debug: boolean): boolean;
|
|
466
|
+
|
|
458
467
|
/**
|
|
459
468
|
* List all registered OCR backend names.
|
|
460
469
|
*
|
|
@@ -586,6 +595,13 @@ export function loadConfigFromString(content: string, format: string): any;
|
|
|
586
595
|
*/
|
|
587
596
|
export function normalizeMimeType(mime_type: string): string;
|
|
588
597
|
|
|
598
|
+
/**
|
|
599
|
+
* A callback function that can be invoked by Pdfium's `FPDF_LoadCustomDocument()` function,
|
|
600
|
+
* wrapping around `crate::utils::files::read_block_from_callback()` to shuffle data buffers
|
|
601
|
+
* from our WASM memory heap to Pdfium's WASM memory heap as they are loaded.
|
|
602
|
+
*/
|
|
603
|
+
export function read_block_from_callback_wasm(param: number, position: number, pBuf: number, size: number): number;
|
|
604
|
+
|
|
589
605
|
/**
|
|
590
606
|
* Register a custom OCR backend.
|
|
591
607
|
*
|
|
@@ -746,13 +762,9 @@ export function unregister_validator(name: string): void;
|
|
|
746
762
|
*/
|
|
747
763
|
export function version(): string;
|
|
748
764
|
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
receiver(): number;
|
|
756
|
-
}
|
|
757
|
-
|
|
758
|
-
export function wbg_rayon_start_worker(receiver: number): void;
|
|
765
|
+
/**
|
|
766
|
+
* A callback function that can be invoked by Pdfium's `FPDF_SaveAsCopy()` and `FPDF_SaveWithVersion()`
|
|
767
|
+
* functions, wrapping around `crate::utils::files::write_block_from_callback()` to shuffle data buffers
|
|
768
|
+
* from Pdfium's WASM memory heap to our WASM memory heap as they are written.
|
|
769
|
+
*/
|
|
770
|
+
export function write_block_from_callback_wasm(param: number, buf: number, size: number): number;
|