@kreuzberg/node 4.0.0-rc.21 → 4.0.0-rc.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +345 -534
- package/dist/cli.d.mts +4 -0
- package/dist/cli.d.ts +4 -0
- package/dist/cli.js +12 -2
- package/dist/cli.js.map +1 -1
- package/dist/cli.mjs +12 -1
- package/dist/cli.mjs.map +1 -1
- package/dist/index.d.mts +158 -91
- package/dist/index.d.ts +158 -91
- package/dist/index.js +77 -103
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +72 -103
- package/dist/index.mjs.map +1 -1
- package/dist/types.d.mts +141 -36
- package/dist/types.d.ts +141 -36
- package/dist/types.js.map +1 -1
- package/index.d.ts +183 -0
- package/index.js +64 -54
- package/metadata.d.ts +53 -33
- package/package.json +5 -6
package/README.md
CHANGED
|
@@ -1,705 +1,516 @@
|
|
|
1
|
-
#
|
|
1
|
+
# TypeScript (Node.js)
|
|
2
|
+
|
|
3
|
+
<div align="center" style="display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin: 20px 0;">
|
|
4
|
+
<!-- Language Bindings -->
|
|
5
|
+
<a href="https://crates.io/crates/kreuzberg">
|
|
6
|
+
<img src="https://img.shields.io/crates/v/kreuzberg?label=Rust&color=007ec6" alt="Rust">
|
|
7
|
+
</a>
|
|
8
|
+
<a href="https://hex.pm/packages/kreuzberg">
|
|
9
|
+
<img src="https://img.shields.io/hexpm/v/kreuzberg?label=Elixir&color=007ec6" alt="Elixir">
|
|
10
|
+
</a>
|
|
11
|
+
<a href="https://pypi.org/project/kreuzberg/">
|
|
12
|
+
<img src="https://img.shields.io/pypi/v/kreuzberg?label=Python&color=007ec6" alt="Python">
|
|
13
|
+
</a>
|
|
14
|
+
<a href="https://www.npmjs.com/package/@kreuzberg/node">
|
|
15
|
+
<img src="https://img.shields.io/npm/v/@kreuzberg/node?label=Node.js&color=007ec6" alt="Node.js">
|
|
16
|
+
</a>
|
|
17
|
+
<a href="https://www.npmjs.com/package/@kreuzberg/wasm">
|
|
18
|
+
<img src="https://img.shields.io/npm/v/@kreuzberg/wasm?label=WASM&color=007ec6" alt="WASM">
|
|
19
|
+
</a>
|
|
20
|
+
|
|
21
|
+
<a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg">
|
|
22
|
+
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
|
+
</a>
|
|
24
|
+
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.0.0-*" alt="Go">
|
|
26
|
+
</a>
|
|
27
|
+
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
|
+
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
29
|
+
</a>
|
|
30
|
+
<a href="https://packagist.org/packages/kreuzberg/kreuzberg">
|
|
31
|
+
<img src="https://img.shields.io/packagist/v/kreuzberg/kreuzberg?label=PHP&color=007ec6" alt="PHP">
|
|
32
|
+
</a>
|
|
33
|
+
<a href="https://rubygems.org/gems/kreuzberg">
|
|
34
|
+
<img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
|
|
35
|
+
</a>
|
|
36
|
+
|
|
37
|
+
<!-- Project Info -->
|
|
38
|
+
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
|
|
39
|
+
<img src="https://img.shields.io/badge/License-MIT-blue.svg" alt="License">
|
|
40
|
+
</a>
|
|
41
|
+
<a href="https://docs.kreuzberg.dev">
|
|
42
|
+
<img src="https://img.shields.io/badge/docs-kreuzberg.dev-blue" alt="Documentation">
|
|
43
|
+
</a>
|
|
44
|
+
</div>
|
|
45
|
+
|
|
46
|
+
<img width="1128" height="191" alt="Banner2" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
|
|
47
|
+
|
|
48
|
+
<div align="center" style="margin-top: 20px;">
|
|
49
|
+
<a href="https://discord.gg/pXxagNK2zN">
|
|
50
|
+
<img height="22" src="https://img.shields.io/badge/Discord-Join%20our%20community-7289da?logo=discord&logoColor=white" alt="Discord">
|
|
51
|
+
</a>
|
|
52
|
+
</div>
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
Extract text, tables, images, and metadata from 56 file formats including PDF, Office documents, and images. Native NAPI-RS bindings for Node.js with superior performance, async/await support, and TypeScript type definitions.
|
|
2
56
|
|
|
3
|
-
[](https://crates.io/crates/kreuzberg)
|
|
4
|
-
[](https://pypi.org/project/kreuzberg/)
|
|
5
|
-
[](https://www.npmjs.com/package/@kreuzberg/node)
|
|
6
|
-
[](https://www.npmjs.com/package/@kreuzberg/wasm)
|
|
7
|
-
[](https://rubygems.org/gems/kreuzberg)
|
|
8
|
-
[](https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg)
|
|
9
|
-
[](https://pkg.go.dev/github.com/kreuzberg-dev/kreuzberg)
|
|
10
|
-
[](https://www.nuget.org/packages/Goldziher.Kreuzberg/)
|
|
11
57
|
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
[
|
|
58
|
+
> **Version 4.0.0 Release Candidate**
|
|
59
|
+
> Kreuzberg v4.0.0 is in **Release Candidate** stage. Bugs and breaking changes are expected.
|
|
60
|
+
> This is a pre-release version. Please test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
15
61
|
|
|
16
|
-
|
|
62
|
+
## Installation
|
|
17
63
|
|
|
18
|
-
|
|
64
|
+
### Package Installation
|
|
19
65
|
|
|
20
|
-
> **Recommended for Node.js and Bun** - Native NAPI-RS bindings provide the best performance (2-3x faster than WASM).
|
|
21
|
-
>
|
|
22
|
-
> For browser, Deno, or Cloudflare Workers, use [@kreuzberg/wasm](../kreuzberg-wasm/) instead.
|
|
23
66
|
|
|
24
|
-
|
|
25
|
-
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
67
|
+
Install via one of the supported package managers:
|
|
26
68
|
|
|
27
|
-
## Features
|
|
28
69
|
|
|
29
|
-
- **56 File Formats**: PDF, DOCX, PPTX, XLSX, images, HTML, Markdown, XML, JSON, and more
|
|
30
|
-
- **OCR Support**: Built-in Tesseract, EasyOCR, and PaddleOCR backends for scanned documents
|
|
31
|
-
- **Table Extraction**: Advanced table detection and structured data extraction
|
|
32
|
-
- **Native Performance**: 2-3x faster than WASM; 10-50x faster than pure JavaScript
|
|
33
|
-
- **Zero-Copy Operations**: Direct system calls and minimal data copying
|
|
34
|
-
- **Type-Safe**: Full TypeScript definitions for all methods, configurations, and return types
|
|
35
|
-
- **Async/Sync APIs**: Both asynchronous and synchronous extraction methods
|
|
36
|
-
- **Batch Processing**: Process multiple documents in parallel with optimized concurrency
|
|
37
|
-
- **Language Detection**: Automatic language detection for extracted text
|
|
38
|
-
- **Text Chunking**: Split long documents into manageable chunks for LLM processing
|
|
39
|
-
- **Caching**: Built-in result caching for faster repeated extractions
|
|
40
|
-
- **Zero Configuration**: Works out of the box with sensible defaults
|
|
41
70
|
|
|
42
|
-
|
|
71
|
+
**npm:**
|
|
72
|
+
```bash
|
|
73
|
+
npm install @kreuzberg/node
|
|
74
|
+
```
|
|
43
75
|
|
|
44
|
-
Choose `@kreuzberg/node` if you're building with:
|
|
45
76
|
|
|
46
|
-
- **Node.js 18+** - Native bindings provide direct access to system resources
|
|
47
|
-
- **Bun** - Full compatibility with Bun's Node.js API
|
|
48
|
-
- **Performance-critical applications** - Processing large document batches or real-time extraction
|
|
49
|
-
- **Server-side extraction** - APIs, microservices, document processing pipelines
|
|
50
77
|
|
|
51
|
-
### Comparison with @kreuzberg/wasm
|
|
52
78
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
| **Bundle Size** | 10-15 MB (prebuilt binary) | 2-4 MB (WASM module) |
|
|
58
|
-
| **System Access** | Direct system calls | Sandboxed via WASM |
|
|
59
|
-
| **Best For** | Server-side, batch processing | Client-side, edge computing |
|
|
79
|
+
**pnpm:**
|
|
80
|
+
```bash
|
|
81
|
+
pnpm add @kreuzberg/node
|
|
82
|
+
```
|
|
60
83
|
|
|
61
|
-
Use `@kreuzberg/wasm` for browser applications, Cloudflare Workers, Deno, or when you need a smaller bundle size.
|
|
62
84
|
|
|
63
|
-
## Requirements
|
|
64
85
|
|
|
65
|
-
- Node.js 18 or higher
|
|
66
|
-
- Native bindings are prebuilt for:
|
|
67
|
-
- macOS (x64, arm64)
|
|
68
|
-
- Linux (x64, arm64, armv7)
|
|
69
|
-
- Windows (x64, arm64)
|
|
70
86
|
|
|
71
|
-
|
|
87
|
+
**yarn:**
|
|
88
|
+
```bash
|
|
89
|
+
yarn add @kreuzberg/node
|
|
90
|
+
```
|
|
72
91
|
|
|
73
|
-
- **ONNX Runtime**: For embeddings functionality
|
|
74
|
-
- macOS: `brew install onnxruntime`
|
|
75
|
-
- Ubuntu: `sudo apt-get install libonnxruntime libonnxruntime-dev`
|
|
76
|
-
- Windows: `scoop install onnxruntime` or download from [GitHub](https://github.com/microsoft/onnxruntime/releases)
|
|
77
92
|
|
|
78
|
-
- **Tesseract**: For OCR functionality
|
|
79
|
-
- macOS: `brew install tesseract`
|
|
80
|
-
- Ubuntu: `sudo apt-get install tesseract-ocr`
|
|
81
|
-
- Windows: Download from [GitHub](https://github.com/tesseract-ocr/tesseract)
|
|
82
93
|
|
|
83
|
-
- **LibreOffice**: For legacy MS Office formats (.doc, .ppt)
|
|
84
|
-
- macOS: `brew install libreoffice`
|
|
85
|
-
- Ubuntu: `sudo apt-get install libreoffice`
|
|
86
94
|
|
|
87
|
-
- **Pandoc**: For advanced document conversion
|
|
88
|
-
- macOS: `brew install pandoc`
|
|
89
|
-
- Ubuntu: `sudo apt-get install pandoc`
|
|
90
95
|
|
|
91
|
-
|
|
96
|
+
### System Requirements
|
|
92
97
|
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
98
|
+
- **Node.js 22+** required (NAPI-RS native bindings)
|
|
99
|
+
- Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.22.x for embeddings support
|
|
100
|
+
- Optional: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for OCR functionality
|
|
96
101
|
|
|
97
|
-
|
|
102
|
+
- Optional: [LibreOffice](https://www.libreoffice.org/download/download/) for legacy Office formats (DOC, XLS, PPT, RTF, ODT, ODS, ODP)
|
|
98
103
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
104
|
+
**Format Support Notes:**
|
|
105
|
+
- Modern Office formats (DOCX, XLSX, PPTX) work without LibreOffice
|
|
106
|
+
- Legacy formats (DOC, XLS, PPT) require LibreOffice installation
|
|
107
|
+
- The WASM binding does NOT support the legacy formats that require LibreOffice (use the Node.js binding for full format support)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
### Platform Support
|
|
112
|
+
|
|
113
|
+
Pre-built binaries are available for:
|
|
114
|
+
- macOS (arm64, x64)
|
|
115
|
+
- Linux (x64)
|
|
116
|
+
- Windows (x64)
|
|
102
117
|
|
|
103
|
-
Or with yarn:
|
|
104
118
|
|
|
105
|
-
```bash
|
|
106
|
-
yarn add @kreuzberg/node
|
|
107
|
-
```
|
|
108
119
|
|
|
109
|
-
The package includes prebuilt native binaries for major platforms. No additional build steps required.
|
|
110
120
|
|
|
111
121
|
## Quick Start
|
|
112
122
|
|
|
113
123
|
### Basic Extraction
|
|
114
124
|
|
|
125
|
+
Extract text, metadata, and structure from any supported document format:
|
|
126
|
+
|
|
115
127
|
```typescript
|
|
116
128
|
import { extractFileSync } from '@kreuzberg/node';
|
|
117
129
|
|
|
118
|
-
|
|
119
|
-
|
|
130
|
+
const config = {
|
|
131
|
+
useCache: true,
|
|
132
|
+
enableQualityProcessing: true,
|
|
133
|
+
};
|
|
134
|
+
|
|
135
|
+
const result = extractFileSync('document.pdf', null, config);
|
|
136
|
+
|
|
120
137
|
console.log(result.content);
|
|
121
|
-
console.log(result.
|
|
138
|
+
console.log(`MIME Type: ${result.mimeType}`);
|
|
122
139
|
```
|
|
123
140
|
|
|
124
|
-
|
|
141
|
+
|
|
142
|
+
### Common Use Cases
|
|
143
|
+
|
|
144
|
+
#### Extract with Custom Configuration
|
|
145
|
+
|
|
146
|
+
Most use cases benefit from configuration to control extraction behavior:
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
**With OCR (for scanned documents):**
|
|
125
150
|
|
|
126
151
|
```typescript
|
|
127
152
|
import { extractFile } from '@kreuzberg/node';
|
|
128
153
|
|
|
129
|
-
|
|
130
|
-
|
|
154
|
+
const config = {
|
|
155
|
+
ocr: {
|
|
156
|
+
backend: 'tesseract',
|
|
157
|
+
language: 'eng+fra',
|
|
158
|
+
tesseractConfig: {
|
|
159
|
+
psm: 3,
|
|
160
|
+
},
|
|
161
|
+
},
|
|
162
|
+
};
|
|
163
|
+
|
|
164
|
+
const result = await extractFile('document.pdf', null, config);
|
|
131
165
|
console.log(result.content);
|
|
132
|
-
console.log(result.tables);
|
|
133
166
|
```
|
|
134
167
|
|
|
135
|
-
### With Full Type Safety
|
|
136
168
|
|
|
137
|
-
```typescript
|
|
138
|
-
import {
|
|
139
|
-
extractFile,
|
|
140
|
-
type ExtractionConfig,
|
|
141
|
-
type ExtractionResult
|
|
142
|
-
} from '@kreuzberg/node';
|
|
143
|
-
|
|
144
|
-
const config: ExtractionConfig = {
|
|
145
|
-
useCache: true,
|
|
146
|
-
enableQualityProcessing: true
|
|
147
|
-
};
|
|
148
169
|
|
|
149
|
-
const result: ExtractionResult = await extractFile('invoice.pdf', config);
|
|
150
170
|
|
|
151
|
-
|
|
152
|
-
console.log(result.content);
|
|
153
|
-
console.log(result.mimeType);
|
|
154
|
-
console.log(result.metadata);
|
|
171
|
+
#### Table Extraction
|
|
155
172
|
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
173
|
+
|
|
174
|
+
```typescript
|
|
175
|
+
import { extractFileSync } from '@kreuzberg/node';
|
|
176
|
+
|
|
177
|
+
const result = extractFileSync('document.pdf');
|
|
178
|
+
|
|
179
|
+
for (const table of result.tables) {
|
|
180
|
+
console.log(`Table with ${table.cells.length} rows`);
|
|
181
|
+
console.log(`Page: ${table.pageNumber}`);
|
|
182
|
+
console.log(table.markdown);
|
|
160
183
|
}
|
|
161
184
|
```
|
|
162
185
|
|
|
163
|
-
## Configuration
|
|
164
186
|
|
|
165
|
-
### OCR Configuration
|
|
166
187
|
|
|
167
|
-
```typescript
|
|
168
|
-
import { extractFile, type ExtractionConfig, type OcrConfig } from '@kreuzberg/node';
|
|
169
|
-
|
|
170
|
-
const config: ExtractionConfig = {
|
|
171
|
-
ocr: {
|
|
172
|
-
backend: 'tesseract',
|
|
173
|
-
language: 'eng',
|
|
174
|
-
tesseractConfig: {
|
|
175
|
-
enableTableDetection: true,
|
|
176
|
-
psm: 6,
|
|
177
|
-
minConfidence: 50.0
|
|
178
|
-
}
|
|
179
|
-
} as OcrConfig
|
|
180
|
-
};
|
|
181
188
|
|
|
182
|
-
|
|
183
|
-
console.log(result.content);
|
|
184
|
-
```
|
|
189
|
+
#### Processing Multiple Files
|
|
185
190
|
|
|
186
|
-
### PDF Password Protection
|
|
187
191
|
|
|
188
192
|
```typescript
|
|
189
|
-
import {
|
|
193
|
+
import { batchExtractFilesSync } from '@kreuzberg/node';
|
|
190
194
|
|
|
191
|
-
const
|
|
192
|
-
|
|
193
|
-
passwords: ['password1', 'password2'],
|
|
194
|
-
extractImages: true,
|
|
195
|
-
extractMetadata: true
|
|
196
|
-
} as PdfConfig
|
|
197
|
-
};
|
|
195
|
+
const files = ['doc1.pdf', 'doc2.docx', 'doc3.pptx'];
|
|
196
|
+
const results = batchExtractFilesSync(files);
|
|
198
197
|
|
|
199
|
-
|
|
198
|
+
results.forEach((result, i) => {
|
|
199
|
+
console.log(`File ${i + 1}: ${result.content.length} characters`);
|
|
200
|
+
});
|
|
200
201
|
```
|
|
201
202
|
|
|
202
|
-
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
#### Async Processing
|
|
208
|
+
|
|
209
|
+
For non-blocking document processing:
|
|
203
210
|
|
|
204
211
|
```typescript
|
|
205
212
|
import { extractFile } from '@kreuzberg/node';
|
|
206
213
|
|
|
207
|
-
const result = await extractFile('
|
|
214
|
+
const result = await extractFile('document.pdf');
|
|
215
|
+
console.log(result.content);
|
|
216
|
+
```
|
|
208
217
|
|
|
209
|
-
if (result.tables) {
|
|
210
|
-
for (const table of result.tables) {
|
|
211
|
-
console.log('Table as Markdown:');
|
|
212
|
-
console.log(table.markdown);
|
|
213
218
|
|
|
214
|
-
console.log('Table cells:');
|
|
215
|
-
console.log(JSON.stringify(table.cells, null, 2));
|
|
216
|
-
}
|
|
217
|
-
}
|
|
218
|
-
```
|
|
219
219
|
|
|
220
|
-
### Text Chunking
|
|
221
220
|
|
|
222
|
-
```typescript
|
|
223
|
-
import { extractFile, type ChunkingConfig } from '@kreuzberg/node';
|
|
224
221
|
|
|
225
|
-
|
|
226
|
-
chunking: {
|
|
227
|
-
maxChars: 1000,
|
|
228
|
-
maxOverlap: 200
|
|
229
|
-
} as ChunkingConfig
|
|
230
|
-
};
|
|
222
|
+
#### Configuration Discovery
|
|
231
223
|
|
|
232
|
-
|
|
224
|
+
```typescript
|
|
225
|
+
import { discoverExtractionConfig, extractFile } from '@kreuzberg/node';
|
|
233
226
|
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
227
|
+
const config = discoverExtractionConfig();
|
|
228
|
+
if (config) {
|
|
229
|
+
console.log('Found configuration file');
|
|
230
|
+
const result = await extractFile('document.pdf', null, config);
|
|
231
|
+
console.log(result.content);
|
|
232
|
+
} else {
|
|
233
|
+
console.log('No configuration file found, using defaults');
|
|
234
|
+
const result = await extractFile('document.pdf');
|
|
235
|
+
console.log(result.content);
|
|
238
236
|
}
|
|
239
237
|
```
|
|
240
238
|
|
|
241
|
-
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
#### Worker Thread Pool
|
|
242
244
|
|
|
243
245
|
```typescript
|
|
244
|
-
import {
|
|
246
|
+
import { createWorkerPool, extractFileInWorker, batchExtractFilesInWorker, closeWorkerPool } from '@kreuzberg/node';
|
|
245
247
|
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
enabled: true,
|
|
249
|
-
minConfidence: 0.8,
|
|
250
|
-
detectMultiple: false
|
|
251
|
-
} as LanguageDetectionConfig
|
|
252
|
-
};
|
|
248
|
+
// Create a pool with 4 worker threads
|
|
249
|
+
const pool = createWorkerPool(4);
|
|
253
250
|
|
|
254
|
-
|
|
251
|
+
try {
|
|
252
|
+
// Extract single file in worker
|
|
253
|
+
const result = await extractFileInWorker(pool, 'document.pdf', null, {
|
|
254
|
+
useCache: true
|
|
255
|
+
});
|
|
256
|
+
console.log(result.content);
|
|
255
257
|
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
258
|
+
// Extract multiple files concurrently
|
|
259
|
+
const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
|
|
260
|
+
const results = await batchExtractFilesInWorker(pool, files, {
|
|
261
|
+
useCache: true
|
|
262
|
+
});
|
|
263
|
+
|
|
264
|
+
results.forEach((result, i) => {
|
|
265
|
+
console.log(`File ${i + 1}: ${result.content.length} characters`);
|
|
266
|
+
});
|
|
267
|
+
} finally {
|
|
268
|
+
// Always close the pool when done
|
|
269
|
+
await closeWorkerPool(pool);
|
|
259
270
|
}
|
|
260
271
|
```
|
|
261
272
|
|
|
262
|
-
### Image Extraction
|
|
263
273
|
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
274
|
+
**Performance Benefits:**
|
|
275
|
+
- **Parallel Processing**: Multiple documents extracted simultaneously
|
|
276
|
+
- **CPU Utilization**: Maximizes multi-core CPU usage for large batches
|
|
277
|
+
- **Queue Management**: Automatically distributes work across available workers
|
|
278
|
+
- **Resource Control**: Prevents thread exhaustion with configurable pool size
|
|
267
279
|
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
autoAdjustDpi: true
|
|
274
|
-
} as ImageExtractionConfig
|
|
275
|
-
};
|
|
280
|
+
**Best Practices:**
|
|
281
|
+
- Use worker pools for batches of 10+ documents
|
|
282
|
+
- Set the pool size to the number of CPU cores (the default behavior)
|
|
283
|
+
- Always close pools with `closeWorkerPool()` to prevent resource leaks
|
|
284
|
+
- Reuse pools across multiple batch operations for efficiency
|
|
276
285
|
|
|
277
|
-
const result = await extractFile('document-with-images.pdf', config);
|
|
278
286
|
|
|
279
|
-
if (result.images) {
|
|
280
|
-
for (let i = 0; i < result.images.length; i++) {
|
|
281
|
-
const image = result.images[i];
|
|
282
|
-
await writeFile(`image-${i}.${image.format}`, Buffer.from(image.data));
|
|
283
|
-
}
|
|
284
|
-
}
|
|
285
|
-
```
|
|
286
287
|
|
|
287
|
-
###
|
|
288
|
+
### Next Steps
|
|
288
289
|
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
type ChunkingConfig,
|
|
295
|
-
type ImageExtractionConfig,
|
|
296
|
-
type PdfConfig,
|
|
297
|
-
type TokenReductionConfig,
|
|
298
|
-
type LanguageDetectionConfig
|
|
299
|
-
} from '@kreuzberg/node';
|
|
300
|
-
|
|
301
|
-
const config: ExtractionConfig = {
|
|
302
|
-
useCache: true,
|
|
303
|
-
enableQualityProcessing: true,
|
|
304
|
-
forceOcr: false,
|
|
305
|
-
maxConcurrentExtractions: 8,
|
|
306
|
-
|
|
307
|
-
ocr: {
|
|
308
|
-
backend: 'tesseract',
|
|
309
|
-
language: 'eng',
|
|
310
|
-
preprocessing: true,
|
|
311
|
-
tesseractConfig: {
|
|
312
|
-
enableTableDetection: true,
|
|
313
|
-
psm: 6,
|
|
314
|
-
oem: 3,
|
|
315
|
-
minConfidence: 50.0
|
|
316
|
-
}
|
|
317
|
-
} as OcrConfig,
|
|
318
|
-
|
|
319
|
-
chunking: {
|
|
320
|
-
maxChars: 1000,
|
|
321
|
-
maxOverlap: 200
|
|
322
|
-
} as ChunkingConfig,
|
|
323
|
-
|
|
324
|
-
images: {
|
|
325
|
-
extractImages: true,
|
|
326
|
-
targetDpi: 300,
|
|
327
|
-
maxImageDimension: 4096,
|
|
328
|
-
autoAdjustDpi: true
|
|
329
|
-
} as ImageExtractionConfig,
|
|
330
|
-
|
|
331
|
-
pdfOptions: {
|
|
332
|
-
extractImages: true,
|
|
333
|
-
passwords: [],
|
|
334
|
-
extractMetadata: true
|
|
335
|
-
} as PdfConfig,
|
|
336
|
-
|
|
337
|
-
tokenReduction: {
|
|
338
|
-
mode: 'moderate',
|
|
339
|
-
preserveImportantWords: true
|
|
340
|
-
} as TokenReductionConfig,
|
|
341
|
-
|
|
342
|
-
languageDetection: {
|
|
343
|
-
enabled: true,
|
|
344
|
-
minConfidence: 0.8,
|
|
345
|
-
detectMultiple: false
|
|
346
|
-
} as LanguageDetectionConfig
|
|
347
|
-
};
|
|
290
|
+
- **[Installation Guide](https://kreuzberg.dev/getting-started/installation/)** - Platform-specific setup
|
|
291
|
+
- **[API Documentation](https://kreuzberg.dev/api/)** - Complete API reference
|
|
292
|
+
- **[Examples & Guides](https://kreuzberg.dev/guides/)** - Full code examples and usage guides
|
|
293
|
+
- **[Configuration Guide](https://kreuzberg.dev/configuration/)** - Advanced configuration options
|
|
294
|
+
- **[Troubleshooting](https://kreuzberg.dev/troubleshooting/)** - Common issues and solutions
|
|
348
295
|
|
|
349
|
-
const result = await extractFile('document.pdf', config);
|
|
350
|
-
```
|
|
351
296
|
|
|
352
|
-
## Advanced Usage
|
|
353
297
|
|
|
354
|
-
|
|
298
|
+
## NAPI-RS Implementation Details
|
|
355
299
|
|
|
356
|
-
|
|
357
|
-
import { extractBytes } from '@kreuzberg/node';
|
|
358
|
-
import { readFile } from 'fs/promises';
|
|
300
|
+
### Native Performance
|
|
359
301
|
|
|
360
|
-
|
|
361
|
-
const result = await extractBytes(buffer, 'application/pdf');
|
|
362
|
-
console.log(result.content);
|
|
363
|
-
```
|
|
302
|
+
This binding uses NAPI-RS to provide native Node.js bindings with:
|
|
364
303
|
|
|
365
|
-
|
|
304
|
+
- **Zero-copy data transfer** between JavaScript and Rust layers
|
|
305
|
+
- **Native thread pool** for concurrent document processing
|
|
306
|
+
- **Direct memory management** for efficient large document handling
|
|
307
|
+
- **Binary-compatible** pre-built native modules across platforms
|
|
366
308
|
|
|
367
|
-
|
|
368
|
-
import { batchExtractFiles } from '@kreuzberg/node';
|
|
309
|
+
### Threading Model
|
|
369
310
|
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
];
|
|
311
|
+
- Single documents are processed synchronously or asynchronously in a dedicated thread
|
|
312
|
+
- Batch operations distribute work across available CPU cores
|
|
313
|
+
- Thread count is configurable but defaults to system CPU count
|
|
314
|
+
- Long-running extractions block the event loop when run through the synchronous APIs; use the async APIs to keep the event loop responsive
|
|
375
315
|
|
|
376
|
-
|
|
316
|
+
### Memory Management
|
|
377
317
|
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
318
|
+
- Large documents (> 100 MB) are streamed to avoid loading them entirely into memory
|
|
319
|
+
- Temporary files are created in system temp directory for extraction
|
|
320
|
+
- Memory is automatically released after extraction completion
|
|
321
|
+
- ONNX models are cached in memory for repeated embeddings operations
|
|
382
322
|
|
|
383
|
-
### Batch Processing with Custom Concurrency
|
|
384
323
|
|
|
385
|
-
```typescript
|
|
386
|
-
import { batchExtractFiles } from '@kreuzberg/node';
|
|
387
324
|
|
|
388
|
-
|
|
389
|
-
maxConcurrentExtractions: 4 // Process 4 files at a time
|
|
390
|
-
};
|
|
325
|
+
## Features
|
|
391
326
|
|
|
392
|
-
|
|
393
|
-
const results = await batchExtractFiles(files, config);
|
|
327
|
+
### Supported File Formats (56+)
|
|
394
328
|
|
|
395
|
-
|
|
396
|
-
```
|
|
329
|
+
56 file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
|
|
397
330
|
|
|
398
|
-
|
|
331
|
+
#### Office Documents
|
|
399
332
|
|
|
400
|
-
|
|
401
|
-
|
|
333
|
+
| Category | Formats | Capabilities |
|
|
334
|
+
|----------|---------|--------------|
|
|
335
|
+
| **Word Processing** | `.docx`, `.odt` | Full text, tables, images, metadata, styles |
|
|
336
|
+
| **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.ods` | Sheet data, formulas, cell metadata, charts |
|
|
337
|
+
| **Presentations** | `.pptx`, `.ppt`, `.ppsx` | Slides, speaker notes, images, metadata |
|
|
338
|
+
| **PDF** | `.pdf` | Text, tables, images, metadata, OCR support |
|
|
339
|
+
| **eBooks** | `.epub`, `.fb2` | Chapters, metadata, embedded resources |
|
|
402
340
|
|
|
403
|
-
|
|
341
|
+
#### Images (OCR-Enabled)
|
|
404
342
|
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
console.log('Word Count:', result.metadata.wordCount);
|
|
411
|
-
}
|
|
412
|
-
```
|
|
343
|
+
| Category | Formats | Features |
|
|
344
|
+
|----------|---------|----------|
|
|
345
|
+
| **Raster** | `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.bmp`, `.tiff`, `.tif` | OCR, table detection, EXIF metadata, dimensions, color space |
|
|
346
|
+
| **Advanced** | `.jp2`, `.jpx`, `.jpm`, `.mj2`, `.pnm`, `.pbm`, `.pgm`, `.ppm` | OCR, table detection, format-specific metadata |
|
|
347
|
+
| **Vector** | `.svg` | DOM parsing, embedded text, graphics metadata |
|
|
413
348
|
|
|
414
|
-
|
|
349
|
+
#### Web & Data
|
|
415
350
|
|
|
416
|
-
|
|
417
|
-
|
|
351
|
+
| Category | Formats | Features |
|
|
352
|
+
|----------|---------|----------|
|
|
353
|
+
| **Markup** | `.html`, `.htm`, `.xhtml`, `.xml`, `.svg` | DOM parsing, metadata (Open Graph, Twitter Card), link extraction |
|
|
354
|
+
| **Structured Data** | `.json`, `.yaml`, `.yml`, `.toml`, `.csv`, `.tsv` | Schema detection, nested structures, validation |
|
|
355
|
+
| **Text & Markdown** | `.txt`, `.md`, `.markdown`, `.rst`, `.org`, `.rtf` | CommonMark, GFM, reStructuredText, Org Mode |
|
|
418
356
|
|
|
419
|
-
|
|
420
|
-
tokenReduction: {
|
|
421
|
-
mode: 'aggressive', // Options: 'light', 'moderate', 'aggressive'
|
|
422
|
-
preserveImportantWords: true
|
|
423
|
-
} as TokenReductionConfig
|
|
424
|
-
};
|
|
357
|
+
#### Email & Archives
|
|
425
358
|
|
|
426
|
-
|
|
359
|
+
| Category | Formats | Features |
|
|
360
|
+
|----------|---------|----------|
|
|
361
|
+
| **Email** | `.eml`, `.msg` | Headers, body (HTML/plain), attachments, threading |
|
|
362
|
+
| **Archives** | `.zip`, `.tar`, `.tgz`, `.gz`, `.7z` | File listing, nested archives, metadata |
|
|
427
363
|
|
|
428
|
-
|
|
429
|
-
console.log(`Original length: ${result.content.length}`);
|
|
430
|
-
console.log(`Processed for LLM context window`);
|
|
431
|
-
```
|
|
364
|
+
#### Academic & Scientific
|
|
432
365
|
|
|
433
|
-
|
|
366
|
+
| Category | Formats | Features |
|
|
367
|
+
|----------|---------|----------|
|
|
368
|
+
| **Citations** | `.bib`, `.biblatex`, `.ris`, `.enw`, `.csl` | Bibliography parsing, citation extraction |
|
|
369
|
+
| **Scientific** | `.tex`, `.latex`, `.typst`, `.jats`, `.ipynb`, `.docbook` | LaTeX, Jupyter notebooks, PubMed JATS |
|
|
370
|
+
| **Documentation** | `.opml`, `.pod`, `.mdoc`, `.troff` | Technical documentation formats |
|
|
434
371
|
|
|
435
|
-
|
|
436
|
-
import {
|
|
437
|
-
extractFile,
|
|
438
|
-
KreuzbergError,
|
|
439
|
-
ValidationError,
|
|
440
|
-
ParsingError,
|
|
441
|
-
OCRError,
|
|
442
|
-
MissingDependencyError
|
|
443
|
-
} from '@kreuzberg/node';
|
|
372
|
+
**[Complete Format Reference](https://kreuzberg.dev/reference/formats/)**
|
|
444
373
|
|
|
445
|
-
|
|
446
|
-
const result = await extractFile('document.pdf');
|
|
447
|
-
console.log(result.content);
|
|
448
|
-
} catch (error) {
|
|
449
|
-
if (error instanceof ValidationError) {
|
|
450
|
-
console.error('Invalid configuration or input:', error.message);
|
|
451
|
-
} else if (error instanceof ParsingError) {
|
|
452
|
-
console.error('Failed to parse document:', error.message);
|
|
453
|
-
} else if (error instanceof OCRError) {
|
|
454
|
-
console.error('OCR processing failed:', error.message);
|
|
455
|
-
} else if (error instanceof MissingDependencyError) {
|
|
456
|
-
console.error(`Missing dependency: ${error.dependency}`);
|
|
457
|
-
console.error('Installation instructions:', error.message);
|
|
458
|
-
} else if (error instanceof KreuzbergError) {
|
|
459
|
-
console.error('Kreuzberg error:', error.message);
|
|
460
|
-
} else {
|
|
461
|
-
throw error;
|
|
462
|
-
}
|
|
463
|
-
}
|
|
464
|
-
```
|
|
374
|
+
### Key Capabilities
|
|
465
375
|
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
Asynchronously extract content from a file.
|
|
472
|
-
|
|
473
|
-
#### `extractFileSync(filePath: string, config?: ExtractionConfig): ExtractionResult`
|
|
474
|
-
Synchronously extract content from a file.
|
|
475
|
-
|
|
476
|
-
#### `extractBytes(data: Buffer, mimeType: string, config?: ExtractionConfig): Promise<ExtractionResult>`
|
|
477
|
-
Asynchronously extract content from a buffer.
|
|
478
|
-
|
|
479
|
-
#### `extractBytesSync(data: Buffer, mimeType: string, config?: ExtractionConfig): ExtractionResult`
|
|
480
|
-
Synchronously extract content from a buffer.
|
|
481
|
-
|
|
482
|
-
#### `batchExtractFiles(paths: string[], config?: ExtractionConfig): Promise<ExtractionResult[]>`
|
|
483
|
-
Asynchronously extract content from multiple files in parallel.
|
|
484
|
-
|
|
485
|
-
#### `batchExtractFilesSync(paths: string[], config?: ExtractionConfig): ExtractionResult[]`
|
|
486
|
-
Synchronously extract content from multiple files.
|
|
487
|
-
|
|
488
|
-
### Types
|
|
489
|
-
|
|
490
|
-
#### `ExtractionResult`
|
|
491
|
-
Main result object containing:
|
|
492
|
-
- `content: string` - Extracted text content
|
|
493
|
-
- `mimeType: string` - MIME type of the document
|
|
494
|
-
- `metadata?: Metadata` - Document metadata
|
|
495
|
-
- `tables?: Table[]` - Extracted tables
|
|
496
|
-
- `images?: ImageData[]` - Extracted images
|
|
497
|
-
- `chunks?: Chunk[]` - Text chunks (if chunking enabled)
|
|
498
|
-
- `language?: LanguageInfo` - Detected language (if enabled)
|
|
499
|
-
|
|
500
|
-
#### `ExtractionConfig`
|
|
501
|
-
Configuration object for extraction:
|
|
502
|
-
- `useCache?: boolean` - Enable result caching
|
|
503
|
-
- `enableQualityProcessing?: boolean` - Enable text quality improvements
|
|
504
|
-
- `forceOcr?: boolean` - Force OCR even for text-based PDFs
|
|
505
|
-
- `maxConcurrentExtractions?: number` - Max parallel extractions
|
|
506
|
-
- `ocr?: OcrConfig` - OCR settings
|
|
507
|
-
- `chunking?: ChunkingConfig` - Text chunking settings
|
|
508
|
-
- `images?: ImageExtractionConfig` - Image extraction settings
|
|
509
|
-
- `pdfOptions?: PdfConfig` - PDF-specific options
|
|
510
|
-
- `tokenReduction?: TokenReductionConfig` - Token reduction settings
|
|
511
|
-
- `languageDetection?: LanguageDetectionConfig` - Language detection settings
|
|
512
|
-
|
|
513
|
-
#### `OcrConfig`
|
|
514
|
-
OCR configuration:
|
|
515
|
-
- `backend: string` - OCR backend ('tesseract', 'easyocr', 'paddleocr')
|
|
516
|
-
- `language: string` - Language code (e.g., 'eng', 'fra', 'deu')
|
|
517
|
-
- `preprocessing?: boolean` - Enable image preprocessing
|
|
518
|
-
- `tesseractConfig?: TesseractConfig` - Tesseract-specific options
|
|
519
|
-
|
|
520
|
-
#### `Table`
|
|
521
|
-
Extracted table structure:
|
|
522
|
-
- `markdown: string` - Table in Markdown format
|
|
523
|
-
- `cells: TableCell[][]` - 2D array of table cells
|
|
524
|
-
- `rowCount: number` - Number of rows
|
|
525
|
-
- `columnCount: number` - Number of columns
|
|
526
|
-
|
|
527
|
-
### Exceptions
|
|
528
|
-
|
|
529
|
-
All Kreuzberg exceptions extend the base `KreuzbergError` class:
|
|
530
|
-
|
|
531
|
-
- `KreuzbergError` - Base error class for all Kreuzberg errors
|
|
532
|
-
- `ValidationError` - Invalid configuration, missing required fields, or invalid input
|
|
533
|
-
- `ParsingError` - Document parsing failure or corrupted file
|
|
534
|
-
- `OCRError` - OCR processing failure
|
|
535
|
-
- `MissingDependencyError` - Missing optional system dependency (includes installation instructions)
|
|
536
|
-
|
|
537
|
-
## Supported Formats
|
|
538
|
-
|
|
539
|
-
| Category | Formats |
|
|
540
|
-
|----------|---------|
|
|
541
|
-
| **Documents** | PDF, DOCX, DOC, PPTX, PPT, XLSX, XLS, ODT, ODP, ODS, RTF |
|
|
542
|
-
| **Images** | PNG, JPEG, JPG, WEBP, BMP, TIFF, GIF |
|
|
543
|
-
| **Web** | HTML, XHTML, XML |
|
|
544
|
-
| **Text** | TXT, MD, CSV, TSV, JSON, YAML, TOML |
|
|
545
|
-
| **Email** | EML, MSG |
|
|
546
|
-
| **Archives** | ZIP, TAR, 7Z |
|
|
547
|
-
| **Other** | And 30+ more formats |
|
|
548
|
-
|
|
549
|
-
## Performance
|
|
550
|
-
|
|
551
|
-
Kreuzberg is built with a native Rust core, providing significant performance improvements over pure JavaScript solutions:
|
|
552
|
-
|
|
553
|
-
- **10-50x faster** text extraction compared to pure Node.js libraries
|
|
554
|
-
- **Native multithreading** for batch processing
|
|
555
|
-
- **Optimized memory usage** with streaming for large files
|
|
556
|
-
- **Zero-copy operations** where possible
|
|
557
|
-
- **Efficient caching** to avoid redundant processing
|
|
558
|
-
|
|
559
|
-
### Benchmarks
|
|
560
|
-
|
|
561
|
-
Processing 100 mixed documents (PDF, DOCX, XLSX):
|
|
562
|
-
|
|
563
|
-
| Library | Time | Memory |
|
|
564
|
-
|---------|------|--------|
|
|
565
|
-
| Kreuzberg | 2.3s | 145 MB |
|
|
566
|
-
| pdf-parse + mammoth | 23.1s | 890 MB |
|
|
567
|
-
| textract | 45.2s | 1.2 GB |
|
|
376
|
+
- **Text Extraction** - Extract all text content with position and formatting information
|
|
377
|
+
- **Metadata Extraction** - Retrieve document properties, creation date, author, etc.
|
|
378
|
+
- **Table Extraction** - Parse tables with structure and cell content preservation
|
|
379
|
+
- **Image Extraction** - Extract embedded images and render page previews
|
|
380
|
+
- **OCR Support** - Integrate multiple OCR backends for scanned documents
|
|
568
381
|
|
|
569
|
-
|
|
382
|
+
- **Async/Await** - Non-blocking document processing with concurrent operations
|
|
570
383
|
|
|
571
|
-
### Native Module Not Found
|
|
572
384
|
|
|
573
|
-
|
|
385
|
+
- **Plugin System** - Extensible post-processing for custom text transformation
|
|
574
386
|
|
|
575
|
-
```bash
|
|
576
|
-
npm rebuild @kreuzberg/node
|
|
577
|
-
```
|
|
578
387
|
|
|
579
|
-
|
|
388
|
+
- **Embeddings** - Generate vector embeddings using ONNX Runtime models
|
|
580
389
|
|
|
581
|
-
|
|
390
|
+
- **Batch Processing** - Efficiently process multiple documents in parallel
|
|
391
|
+
- **Memory Efficient** - Stream large files without loading entirely into memory
|
|
392
|
+
- **Language Detection** - Detect and support multiple languages in documents
|
|
393
|
+
- **Configuration** - Fine-grained control over extraction behavior
|
|
582
394
|
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
395
|
+
### Performance Characteristics
|
|
396
|
+
|
|
397
|
+
| Format | Speed | Memory | Notes |
|
|
398
|
+
|--------|-------|--------|-------|
|
|
399
|
+
| **PDF (text)** | 10-100 MB/s | ~50 MB per doc | Fastest extraction |
|
|
400
|
+
| **Office docs** | 20-200 MB/s | ~100 MB per doc | DOCX, XLSX, PPTX |
|
|
401
|
+
| **Images (OCR)** | 1-5 MB/s | Variable | Depends on OCR backend |
|
|
402
|
+
| **Archives** | 5-50 MB/s | ~200 MB per doc | ZIP, TAR, etc. |
|
|
403
|
+
| **Web formats** | 50-200 MB/s | Streaming | HTML, XML, JSON |
|
|
586
404
|
|
|
587
|
-
If Tesseract is not found:
|
|
588
|
-
- macOS: `brew install tesseract`
|
|
589
|
-
- Ubuntu: `sudo apt-get install tesseract-ocr`
|
|
590
|
-
- Windows: Download from [tesseract-ocr/tesseract](https://github.com/tesseract-ocr/tesseract)
|
|
591
405
|
|
|
592
|
-
### Memory Issues with Large PDFs
|
|
593
406
|
|
|
594
|
-
|
|
407
|
+
## OCR Support
|
|
408
|
+
|
|
409
|
+
Kreuzberg supports multiple OCR backends for extracting text from scanned documents and images:
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
- **Tesseract**
|
|
413
|
+
|
|
414
|
+
- **Guten**
|
|
415
|
+
|
|
416
|
+
|
|
417
|
+
### OCR Configuration Example
|
|
595
418
|
|
|
596
419
|
```typescript
|
|
420
|
+
import { extractFile } from '@kreuzberg/node';
|
|
421
|
+
|
|
597
422
|
const config = {
|
|
598
|
-
|
|
423
|
+
ocr: {
|
|
424
|
+
backend: 'tesseract',
|
|
425
|
+
language: 'eng+fra',
|
|
426
|
+
tesseractConfig: {
|
|
427
|
+
psm: 3,
|
|
428
|
+
},
|
|
429
|
+
},
|
|
599
430
|
};
|
|
600
|
-
|
|
431
|
+
|
|
432
|
+
const result = await extractFile('document.pdf', null, config);
|
|
433
|
+
console.log(result.content);
|
|
601
434
|
```
|
|
602
435
|
|
|
603
|
-
### TypeScript Types Not Resolving
|
|
604
436
|
|
|
605
|
-
Make sure you're using:
|
|
606
|
-
- Node.js 18 or higher
|
|
607
|
-
- TypeScript 5.0 or higher
|
|
608
437
|
|
|
609
|
-
The package includes built-in type definitions.
|
|
610
438
|
|
|
611
|
-
|
|
439
|
+
## Async Support
|
|
612
440
|
|
|
613
|
-
|
|
441
|
+
This binding provides full async/await support for non-blocking document processing:
|
|
614
442
|
|
|
615
443
|
```typescript
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
444
|
+
import { extractFile } from '@kreuzberg/node';
|
|
445
|
+
|
|
446
|
+
const result = await extractFile('document.pdf');
|
|
447
|
+
console.log(result.content);
|
|
620
448
|
```
|
|
621
449
|
|
|
622
|
-
## Examples
|
|
623
450
|
|
|
624
|
-
### Extract Invoice Data
|
|
625
451
|
|
|
626
|
-
```typescript
|
|
627
|
-
import { extractFile } from '@kreuzberg/node';
|
|
628
452
|
|
|
629
|
-
|
|
453
|
+
## Plugin System
|
|
630
454
|
|
|
631
|
-
|
|
632
|
-
if (result.tables && result.tables.length > 0) {
|
|
633
|
-
const lineItems = result.tables[0];
|
|
634
|
-
console.log(lineItems.markdown);
|
|
635
|
-
}
|
|
455
|
+
Kreuzberg supports extensible post-processing plugins for custom text transformation and filtering.
|
|
636
456
|
|
|
637
|
-
|
|
638
|
-
if (result.metadata) {
|
|
639
|
-
console.log('Invoice Date:', result.metadata.creationDate);
|
|
640
|
-
}
|
|
641
|
-
```
|
|
457
|
+
For detailed plugin documentation, visit the [Plugin System Guide](https://kreuzberg.dev/plugins/).
|
|
642
458
|
|
|
643
|
-
### Process Scanned Documents
|
|
644
459
|
|
|
645
|
-
```typescript
|
|
646
|
-
import { extractFile } from '@kreuzberg/node';
|
|
647
460
|
|
|
648
|
-
const config = {
|
|
649
|
-
forceOcr: true,
|
|
650
|
-
ocr: {
|
|
651
|
-
backend: 'tesseract',
|
|
652
|
-
language: 'eng',
|
|
653
|
-
preprocessing: true
|
|
654
|
-
}
|
|
655
|
-
};
|
|
656
461
|
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
462
|
+
## Embeddings Support
|
|
463
|
+
|
|
464
|
+
Generate vector embeddings for extracted text using the built-in ONNX Runtime support. ONNX Runtime must be installed separately.
|
|
465
|
+
|
|
466
|
+
**[Embeddings Guide](https://kreuzberg.dev/features/#embeddings)**
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
## Batch Processing
|
|
660
471
|
|
|
661
|
-
|
|
472
|
+
Process multiple documents efficiently:
|
|
662
473
|
|
|
663
474
|
```typescript
|
|
664
|
-
import {
|
|
665
|
-
import { glob } from 'glob';
|
|
475
|
+
import { batchExtractFilesSync } from '@kreuzberg/node';
|
|
666
476
|
|
|
667
|
-
|
|
668
|
-
const
|
|
477
|
+
const files = ['doc1.pdf', 'doc2.docx', 'doc3.pptx'];
|
|
478
|
+
const results = batchExtractFilesSync(files);
|
|
669
479
|
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
maxConcurrentExtractions: 8,
|
|
673
|
-
enableQualityProcessing: true
|
|
480
|
+
results.forEach((result, i) => {
|
|
481
|
+
console.log(`File ${i + 1}: ${result.content.length} characters`);
|
|
674
482
|
});
|
|
483
|
+
```
|
|
675
484
|
|
|
676
|
-
// Build search index
|
|
677
|
-
const searchIndex = results.map((result, i) => ({
|
|
678
|
-
path: files[i],
|
|
679
|
-
content: result.content,
|
|
680
|
-
metadata: result.metadata
|
|
681
|
-
}));
|
|
682
485
|
|
|
683
|
-
|
|
684
|
-
|
|
486
|
+
|
|
487
|
+
|
|
488
|
+
## Configuration
|
|
489
|
+
|
|
490
|
+
For advanced configuration options including language detection, table extraction, OCR settings, and more:
|
|
491
|
+
|
|
492
|
+
**[Configuration Guide](https://kreuzberg.dev/configuration/)**
|
|
685
493
|
|
|
686
494
|
## Documentation
|
|
687
495
|
|
|
688
|
-
|
|
496
|
+
- **[Official Documentation](https://kreuzberg.dev/)**
|
|
497
|
+
- **[API Reference](https://kreuzberg.dev/reference/api-typescript/)**
|
|
498
|
+
- **[Examples & Guides](https://kreuzberg.dev/guides/)**
|
|
499
|
+
|
|
500
|
+
## Troubleshooting
|
|
501
|
+
|
|
502
|
+
For common issues and solutions, visit the [Troubleshooting Guide](https://kreuzberg.dev/troubleshooting/).
|
|
689
503
|
|
|
690
504
|
## Contributing
|
|
691
505
|
|
|
692
|
-
|
|
506
|
+
Contributions are welcome! See the [Contributing Guide](https://github.com/kreuzberg-dev/kreuzberg/blob/main/CONTRIBUTING.md).
|
|
693
507
|
|
|
694
508
|
## License
|
|
695
509
|
|
|
696
|
-
MIT
|
|
510
|
+
MIT License - see the LICENSE file for details.
|
|
697
511
|
|
|
698
|
-
##
|
|
512
|
+
## Support
|
|
699
513
|
|
|
700
|
-
- [
|
|
701
|
-
- [
|
|
702
|
-
- [
|
|
703
|
-
- [Issue Tracker](https://github.com/kreuzberg-dev/kreuzberg/issues)
|
|
704
|
-
- [Changelog](https://github.com/kreuzberg-dev/kreuzberg/blob/main/CHANGELOG.md)
|
|
705
|
-
- [npm Package](https://www.npmjs.com/package/@kreuzberg/node)
|
|
514
|
+
- **Discord Community**: [Join our Discord](https://discord.gg/pXxagNK2zN)
|
|
515
|
+
- **GitHub Issues**: [Report bugs](https://github.com/kreuzberg-dev/kreuzberg/issues)
|
|
516
|
+
- **Discussions**: [Ask questions](https://github.com/kreuzberg-dev/kreuzberg/discussions)
|