@kreuzberg/node 4.0.0-rc.6 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +347 -504
- package/dist/cli.d.mts +13 -0
- package/dist/cli.d.ts +13 -0
- package/dist/cli.js +88 -0
- package/dist/cli.js.map +1 -0
- package/dist/cli.mjs +54 -0
- package/dist/cli.mjs.map +1 -0
- package/dist/errors.d.mts +358 -0
- package/dist/errors.d.ts +358 -0
- package/dist/errors.js +139 -0
- package/dist/errors.js.map +1 -0
- package/dist/errors.mjs +107 -0
- package/dist/errors.mjs.map +1 -0
- package/dist/index.d.mts +1132 -0
- package/dist/index.d.ts +1132 -0
- package/dist/index.js +1044 -0
- package/dist/index.js.map +1 -0
- package/dist/index.mjs +975 -0
- package/dist/index.mjs.map +1 -0
- package/dist/ocr/guten-ocr.d.mts +193 -0
- package/dist/ocr/guten-ocr.d.ts +193 -0
- package/dist/ocr/guten-ocr.js +232 -0
- package/dist/ocr/guten-ocr.js.map +1 -0
- package/dist/ocr/guten-ocr.mjs +198 -0
- package/dist/ocr/guten-ocr.mjs.map +1 -0
- package/dist/types.d.mts +1081 -0
- package/dist/types.d.ts +1081 -0
- package/dist/types.js +17 -0
- package/dist/types.js.map +1 -0
- package/dist/types.mjs +1 -0
- package/dist/types.mjs.map +1 -0
- package/index.d.ts +673 -3
- package/index.js +85 -55
- package/metadata.d.ts +53 -33
- package/package.json +33 -34
- package/LICENSE +0 -7
package/README.md
CHANGED
|
@@ -1,669 +1,512 @@
|
|
|
1
|
-
#
|
|
1
|
+
# TypeScript (Node.js)
|
|
2
|
+
|
|
3
|
+
<div align="center" style="display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin: 20px 0;">
|
|
4
|
+
<!-- Language Bindings -->
|
|
5
|
+
<a href="https://crates.io/crates/kreuzberg">
|
|
6
|
+
<img src="https://img.shields.io/crates/v/kreuzberg?label=Rust&color=007ec6" alt="Rust">
|
|
7
|
+
</a>
|
|
8
|
+
<a href="https://hex.pm/packages/kreuzberg">
|
|
9
|
+
<img src="https://img.shields.io/hexpm/v/kreuzberg?label=Elixir&color=007ec6" alt="Elixir">
|
|
10
|
+
</a>
|
|
11
|
+
<a href="https://pypi.org/project/kreuzberg/">
|
|
12
|
+
<img src="https://img.shields.io/pypi/v/kreuzberg?label=Python&color=007ec6" alt="Python">
|
|
13
|
+
</a>
|
|
14
|
+
<a href="https://www.npmjs.com/package/@kreuzberg/node">
|
|
15
|
+
<img src="https://img.shields.io/npm/v/@kreuzberg/node?label=Node.js&color=007ec6" alt="Node.js">
|
|
16
|
+
</a>
|
|
17
|
+
<a href="https://www.npmjs.com/package/@kreuzberg/wasm">
|
|
18
|
+
<img src="https://img.shields.io/npm/v/@kreuzberg/wasm?label=WASM&color=007ec6" alt="WASM">
|
|
19
|
+
</a>
|
|
20
|
+
|
|
21
|
+
<a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg">
|
|
22
|
+
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
|
+
</a>
|
|
24
|
+
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.0.0" alt="Go">
|
|
26
|
+
</a>
|
|
27
|
+
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
|
+
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
29
|
+
</a>
|
|
30
|
+
<a href="https://packagist.org/packages/kreuzberg/kreuzberg">
|
|
31
|
+
<img src="https://img.shields.io/packagist/v/kreuzberg/kreuzberg?label=PHP&color=007ec6" alt="PHP">
|
|
32
|
+
</a>
|
|
33
|
+
<a href="https://rubygems.org/gems/kreuzberg">
|
|
34
|
+
<img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
|
|
35
|
+
</a>
|
|
36
|
+
|
|
37
|
+
<!-- Project Info -->
|
|
38
|
+
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
|
|
39
|
+
<img src="https://img.shields.io/badge/License-MIT-blue.svg" alt="License">
|
|
40
|
+
</a>
|
|
41
|
+
<a href="https://docs.kreuzberg.dev">
|
|
42
|
+
<img src="https://img.shields.io/badge/docs-kreuzberg.dev-blue" alt="Documentation">
|
|
43
|
+
</a>
|
|
44
|
+
</div>
|
|
45
|
+
|
|
46
|
+
<img width="1128" height="191" alt="Banner2" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
|
|
47
|
+
|
|
48
|
+
<div align="center" style="margin-top: 20px;">
|
|
49
|
+
<a href="https://discord.gg/pXxagNK2zN">
|
|
50
|
+
<img height="22" src="https://img.shields.io/badge/Discord-Join%20our%20community-7289da?logo=discord&logoColor=white" alt="Discord">
|
|
51
|
+
</a>
|
|
52
|
+
</div>
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
Extract text, tables, images, and metadata from 56 file formats including PDF, Office documents, and images. Native NAPI-RS bindings for Node.js with superior performance, async/await support, and TypeScript type definitions.
|
|
2
56
|
|
|
3
|
-
[](https://www.npmjs.com/package/kreuzberg)
|
|
4
|
-
[](https://crates.io/crates/kreuzberg)
|
|
5
|
-
[](https://pypi.org/project/kreuzberg/)
|
|
6
|
-
[](https://rubygems.org/gems/kreuzberg)
|
|
7
|
-
[](https://www.npmjs.com/package/kreuzberg)
|
|
8
|
-
[](https://opensource.org/licenses/MIT)
|
|
9
|
-
[](https://kreuzberg.dev)
|
|
10
57
|
|
|
11
|
-
|
|
58
|
+
## Installation
|
|
12
59
|
|
|
13
|
-
|
|
60
|
+
### Package Installation
|
|
14
61
|
|
|
15
|
-
> **🚀 Version 4.0.0 Release Candidate**
|
|
16
|
-
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
17
62
|
|
|
18
|
-
|
|
63
|
+
Install via one of the supported package managers:
|
|
19
64
|
|
|
20
|
-
- **50+ File Formats**: PDF, DOCX, PPTX, XLSX, images, HTML, Markdown, XML, JSON, and more
|
|
21
|
-
- **OCR Support**: Built-in Tesseract, EasyOCR, and PaddleOCR backends for scanned documents
|
|
22
|
-
- **Table Extraction**: Advanced table detection and structured data extraction
|
|
23
|
-
- **High Performance**: Native Rust core provides 10-50x performance improvements over pure JavaScript
|
|
24
|
-
- **Type-Safe**: Full TypeScript definitions for all methods, configurations, and return types
|
|
25
|
-
- **Async/Sync APIs**: Both asynchronous and synchronous extraction methods
|
|
26
|
-
- **Batch Processing**: Process multiple documents in parallel with optimized concurrency
|
|
27
|
-
- **Language Detection**: Automatic language detection for extracted text
|
|
28
|
-
- **Text Chunking**: Split long documents into manageable chunks for LLM processing
|
|
29
|
-
- **Caching**: Built-in result caching for faster repeated extractions
|
|
30
|
-
- **Zero Configuration**: Works out of the box with sensible defaults
|
|
31
|
-
|
|
32
|
-
## Requirements
|
|
33
|
-
|
|
34
|
-
- Node.js 18 or higher
|
|
35
|
-
- Native bindings are prebuilt for:
|
|
36
|
-
- macOS (x64, arm64)
|
|
37
|
-
- Linux (x64, arm64, armv7)
|
|
38
|
-
- Windows (x64, arm64)
|
|
39
|
-
|
|
40
|
-
### Optional System Dependencies
|
|
41
|
-
|
|
42
|
-
- **Tesseract**: For OCR functionality
|
|
43
|
-
- macOS: `brew install tesseract`
|
|
44
|
-
- Ubuntu: `sudo apt-get install tesseract-ocr`
|
|
45
|
-
- Windows: Download from [GitHub](https://github.com/tesseract-ocr/tesseract)
|
|
46
|
-
|
|
47
|
-
- **LibreOffice**: For legacy MS Office formats (.doc, .ppt)
|
|
48
|
-
- macOS: `brew install libreoffice`
|
|
49
|
-
- Ubuntu: `sudo apt-get install libreoffice`
|
|
50
|
-
|
|
51
|
-
- **Pandoc**: For advanced document conversion
|
|
52
|
-
- macOS: `brew install pandoc`
|
|
53
|
-
- Ubuntu: `sudo apt-get install pandoc`
|
|
54
65
|
|
|
55
|
-
## Installation
|
|
56
66
|
|
|
67
|
+
**npm:**
|
|
57
68
|
```bash
|
|
58
69
|
npm install @kreuzberg/node
|
|
59
70
|
```
|
|
60
71
|
|
|
61
|
-
Or with pnpm:
|
|
62
72
|
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
**pnpm:**
|
|
63
76
|
```bash
|
|
64
77
|
pnpm add @kreuzberg/node
|
|
65
78
|
```
|
|
66
79
|
|
|
67
|
-
Or with yarn:
|
|
68
80
|
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
**yarn:**
|
|
69
84
|
```bash
|
|
70
85
|
yarn add @kreuzberg/node
|
|
71
86
|
```
|
|
72
87
|
|
|
73
|
-
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
### System Requirements
|
|
93
|
+
|
|
94
|
+
- **Node.js 22+** required (NAPI-RS native bindings)
|
|
95
|
+
- Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.22.x for embeddings support
|
|
96
|
+
- Optional: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for OCR functionality
|
|
97
|
+
|
|
98
|
+
- Optional: [LibreOffice](https://www.libreoffice.org/download/download/) for legacy Office formats (DOC, XLS, PPT, RTF, ODT, ODS, ODP)
|
|
99
|
+
|
|
100
|
+
**Format Support Notes:**
|
|
101
|
+
- Modern Office formats (DOCX, XLSX, PPTX) work without LibreOffice
|
|
102
|
+
- Legacy formats (DOC, XLS, PPT) require LibreOffice installation
|
|
103
|
+
- WASM binding does NOT support LibreOffice formats (use Node.js for full format support)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
### Platform Support
|
|
108
|
+
|
|
109
|
+
Pre-built binaries available for:
|
|
110
|
+
- macOS (arm64, x64)
|
|
111
|
+
- Linux (x64)
|
|
112
|
+
- Windows (x64)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
|
|
74
116
|
|
|
75
117
|
## Quick Start
|
|
76
118
|
|
|
77
119
|
### Basic Extraction
|
|
78
120
|
|
|
121
|
+
Extract text, metadata, and structure from any supported document format:
|
|
122
|
+
|
|
79
123
|
```typescript
|
|
80
124
|
import { extractFileSync } from '@kreuzberg/node';
|
|
81
125
|
|
|
82
|
-
|
|
83
|
-
|
|
126
|
+
const config = {
|
|
127
|
+
useCache: true,
|
|
128
|
+
enableQualityProcessing: true,
|
|
129
|
+
};
|
|
130
|
+
|
|
131
|
+
const result = extractFileSync('document.pdf', null, config);
|
|
132
|
+
|
|
84
133
|
console.log(result.content);
|
|
85
|
-
console.log(result.
|
|
134
|
+
console.log(`MIME Type: ${result.mimeType}`);
|
|
86
135
|
```
|
|
87
136
|
|
|
88
|
-
|
|
137
|
+
|
|
138
|
+
### Common Use Cases
|
|
139
|
+
|
|
140
|
+
#### Extract with Custom Configuration
|
|
141
|
+
|
|
142
|
+
Most use cases benefit from configuration to control extraction behavior:
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
**With OCR (for scanned documents):**
|
|
89
146
|
|
|
90
147
|
```typescript
|
|
91
148
|
import { extractFile } from '@kreuzberg/node';
|
|
92
149
|
|
|
93
|
-
|
|
94
|
-
|
|
150
|
+
const config = {
|
|
151
|
+
ocr: {
|
|
152
|
+
backend: 'tesseract',
|
|
153
|
+
language: 'eng+fra',
|
|
154
|
+
tesseractConfig: {
|
|
155
|
+
psm: 3,
|
|
156
|
+
},
|
|
157
|
+
},
|
|
158
|
+
};
|
|
159
|
+
|
|
160
|
+
const result = await extractFile('document.pdf', null, config);
|
|
95
161
|
console.log(result.content);
|
|
96
|
-
console.log(result.tables);
|
|
97
162
|
```
|
|
98
163
|
|
|
99
|
-
### With Full Type Safety
|
|
100
164
|
|
|
101
|
-
```typescript
|
|
102
|
-
import {
|
|
103
|
-
extractFile,
|
|
104
|
-
type ExtractionConfig,
|
|
105
|
-
type ExtractionResult
|
|
106
|
-
} from '@kreuzberg/node';
|
|
107
|
-
|
|
108
|
-
const config: ExtractionConfig = {
|
|
109
|
-
useCache: true,
|
|
110
|
-
enableQualityProcessing: true
|
|
111
|
-
};
|
|
112
165
|
|
|
113
|
-
const result: ExtractionResult = await extractFile('invoice.pdf', config);
|
|
114
166
|
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
console.log(result.mimeType);
|
|
118
|
-
console.log(result.metadata);
|
|
167
|
+
#### Table Extraction
|
|
168
|
+
|
|
119
169
|
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
170
|
+
```typescript
|
|
171
|
+
import { extractFileSync } from '@kreuzberg/node';
|
|
172
|
+
|
|
173
|
+
const result = extractFileSync('document.pdf');
|
|
174
|
+
|
|
175
|
+
for (const table of result.tables) {
|
|
176
|
+
console.log(`Table with ${table.cells.length} rows`);
|
|
177
|
+
console.log(`Page: ${table.pageNumber}`);
|
|
178
|
+
console.log(table.markdown);
|
|
124
179
|
}
|
|
125
180
|
```
|
|
126
181
|
|
|
127
|
-
## Configuration
|
|
128
182
|
|
|
129
|
-
### OCR Configuration
|
|
130
183
|
|
|
131
|
-
```typescript
|
|
132
|
-
import { extractFile, type ExtractionConfig, type OcrConfig } from '@kreuzberg/node';
|
|
133
|
-
|
|
134
|
-
const config: ExtractionConfig = {
|
|
135
|
-
ocr: {
|
|
136
|
-
backend: 'tesseract',
|
|
137
|
-
language: 'eng',
|
|
138
|
-
tesseractConfig: {
|
|
139
|
-
enableTableDetection: true,
|
|
140
|
-
psm: 6,
|
|
141
|
-
minConfidence: 50.0
|
|
142
|
-
}
|
|
143
|
-
} as OcrConfig
|
|
144
|
-
};
|
|
145
184
|
|
|
146
|
-
|
|
147
|
-
console.log(result.content);
|
|
148
|
-
```
|
|
185
|
+
#### Processing Multiple Files
|
|
149
186
|
|
|
150
|
-
### PDF Password Protection
|
|
151
187
|
|
|
152
188
|
```typescript
|
|
153
|
-
import {
|
|
189
|
+
import { batchExtractFilesSync } from '@kreuzberg/node';
|
|
154
190
|
|
|
155
|
-
const
|
|
156
|
-
|
|
157
|
-
passwords: ['password1', 'password2'],
|
|
158
|
-
extractImages: true,
|
|
159
|
-
extractMetadata: true
|
|
160
|
-
} as PdfConfig
|
|
161
|
-
};
|
|
191
|
+
const files = ['doc1.pdf', 'doc2.docx', 'doc3.pptx'];
|
|
192
|
+
const results = batchExtractFilesSync(files);
|
|
162
193
|
|
|
163
|
-
|
|
194
|
+
results.forEach((result, i) => {
|
|
195
|
+
console.log(`File ${i + 1}: ${result.content.length} characters`);
|
|
196
|
+
});
|
|
164
197
|
```
|
|
165
198
|
|
|
166
|
-
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
#### Async Processing
|
|
204
|
+
|
|
205
|
+
For non-blocking document processing:
|
|
167
206
|
|
|
168
207
|
```typescript
|
|
169
208
|
import { extractFile } from '@kreuzberg/node';
|
|
170
209
|
|
|
171
|
-
const result = await extractFile('
|
|
210
|
+
const result = await extractFile('document.pdf');
|
|
211
|
+
console.log(result.content);
|
|
212
|
+
```
|
|
172
213
|
|
|
173
|
-
if (result.tables) {
|
|
174
|
-
for (const table of result.tables) {
|
|
175
|
-
console.log('Table as Markdown:');
|
|
176
|
-
console.log(table.markdown);
|
|
177
214
|
|
|
178
|
-
console.log('Table cells:');
|
|
179
|
-
console.log(JSON.stringify(table.cells, null, 2));
|
|
180
|
-
}
|
|
181
|
-
}
|
|
182
|
-
```
|
|
183
215
|
|
|
184
|
-
### Text Chunking
|
|
185
216
|
|
|
186
|
-
```typescript
|
|
187
|
-
import { extractFile, type ChunkingConfig } from '@kreuzberg/node';
|
|
188
217
|
|
|
189
|
-
|
|
190
|
-
chunking: {
|
|
191
|
-
maxChars: 1000,
|
|
192
|
-
maxOverlap: 200
|
|
193
|
-
} as ChunkingConfig
|
|
194
|
-
};
|
|
218
|
+
#### Configuration Discovery
|
|
195
219
|
|
|
196
|
-
|
|
220
|
+
```typescript
|
|
221
|
+
import { ExtractionConfig, extractFile } from '@kreuzberg/node';
|
|
197
222
|
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
223
|
+
const config = ExtractionConfig.discover();
|
|
224
|
+
if (config) {
|
|
225
|
+
console.log('Found configuration file');
|
|
226
|
+
const result = await extractFile('document.pdf', null, config);
|
|
227
|
+
console.log(result.content);
|
|
228
|
+
} else {
|
|
229
|
+
console.log('No configuration file found, using defaults');
|
|
230
|
+
const result = await extractFile('document.pdf');
|
|
231
|
+
console.log(result.content);
|
|
202
232
|
}
|
|
203
233
|
```
|
|
204
234
|
|
|
205
|
-
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
#### Worker Thread Pool
|
|
206
240
|
|
|
207
241
|
```typescript
|
|
208
|
-
import {
|
|
242
|
+
import { createWorkerPool, extractFileInWorker, batchExtractFilesInWorker, closeWorkerPool } from '@kreuzberg/node';
|
|
209
243
|
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
enabled: true,
|
|
213
|
-
minConfidence: 0.8,
|
|
214
|
-
detectMultiple: false
|
|
215
|
-
} as LanguageDetectionConfig
|
|
216
|
-
};
|
|
244
|
+
// Create a pool with 4 worker threads
|
|
245
|
+
const pool = createWorkerPool(4);
|
|
217
246
|
|
|
218
|
-
|
|
247
|
+
try {
|
|
248
|
+
// Extract single file in worker
|
|
249
|
+
const result = await extractFileInWorker(pool, 'document.pdf', null, {
|
|
250
|
+
useCache: true
|
|
251
|
+
});
|
|
252
|
+
console.log(result.content);
|
|
219
253
|
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
254
|
+
// Extract multiple files concurrently
|
|
255
|
+
const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
|
|
256
|
+
const results = await batchExtractFilesInWorker(pool, files, {
|
|
257
|
+
useCache: true
|
|
258
|
+
});
|
|
259
|
+
|
|
260
|
+
results.forEach((result, i) => {
|
|
261
|
+
console.log(`File ${i + 1}: ${result.content.length} characters`);
|
|
262
|
+
});
|
|
263
|
+
} finally {
|
|
264
|
+
// Always close the pool when done
|
|
265
|
+
await closeWorkerPool(pool);
|
|
223
266
|
}
|
|
224
267
|
```
|
|
225
268
|
|
|
226
|
-
### Image Extraction
|
|
227
269
|
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
270
|
+
**Performance Benefits:**
|
|
271
|
+
- **Parallel Processing**: Multiple documents extracted simultaneously
|
|
272
|
+
- **CPU Utilization**: Maximizes multi-core CPU usage for large batches
|
|
273
|
+
- **Queue Management**: Automatically distributes work across available workers
|
|
274
|
+
- **Resource Control**: Prevents thread exhaustion with configurable pool size
|
|
231
275
|
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
autoAdjustDpi: true
|
|
238
|
-
} as ImageExtractionConfig
|
|
239
|
-
};
|
|
276
|
+
**Best Practices:**
|
|
277
|
+
- Use worker pools for batches of 10+ documents
|
|
278
|
+
- Set pool size to number of CPU cores (default behavior)
|
|
279
|
+
- Always close pools with `closeWorkerPool()` to prevent resource leaks
|
|
280
|
+
- Reuse pools across multiple batch operations for efficiency
|
|
240
281
|
|
|
241
|
-
const result = await extractFile('document-with-images.pdf', config);
|
|
242
282
|
|
|
243
|
-
if (result.images) {
|
|
244
|
-
for (let i = 0; i < result.images.length; i++) {
|
|
245
|
-
const image = result.images[i];
|
|
246
|
-
await writeFile(`image-${i}.${image.format}`, Buffer.from(image.data));
|
|
247
|
-
}
|
|
248
|
-
}
|
|
249
|
-
```
|
|
250
283
|
|
|
251
|
-
###
|
|
284
|
+
### Next Steps
|
|
252
285
|
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
type ChunkingConfig,
|
|
259
|
-
type ImageExtractionConfig,
|
|
260
|
-
type PdfConfig,
|
|
261
|
-
type TokenReductionConfig,
|
|
262
|
-
type LanguageDetectionConfig
|
|
263
|
-
} from '@kreuzberg/node';
|
|
264
|
-
|
|
265
|
-
const config: ExtractionConfig = {
|
|
266
|
-
useCache: true,
|
|
267
|
-
enableQualityProcessing: true,
|
|
268
|
-
forceOcr: false,
|
|
269
|
-
maxConcurrentExtractions: 8,
|
|
270
|
-
|
|
271
|
-
ocr: {
|
|
272
|
-
backend: 'tesseract',
|
|
273
|
-
language: 'eng',
|
|
274
|
-
preprocessing: true,
|
|
275
|
-
tesseractConfig: {
|
|
276
|
-
enableTableDetection: true,
|
|
277
|
-
psm: 6,
|
|
278
|
-
oem: 3,
|
|
279
|
-
minConfidence: 50.0
|
|
280
|
-
}
|
|
281
|
-
} as OcrConfig,
|
|
282
|
-
|
|
283
|
-
chunking: {
|
|
284
|
-
maxChars: 1000,
|
|
285
|
-
maxOverlap: 200
|
|
286
|
-
} as ChunkingConfig,
|
|
287
|
-
|
|
288
|
-
images: {
|
|
289
|
-
extractImages: true,
|
|
290
|
-
targetDpi: 300,
|
|
291
|
-
maxImageDimension: 4096,
|
|
292
|
-
autoAdjustDpi: true
|
|
293
|
-
} as ImageExtractionConfig,
|
|
294
|
-
|
|
295
|
-
pdfOptions: {
|
|
296
|
-
extractImages: true,
|
|
297
|
-
passwords: [],
|
|
298
|
-
extractMetadata: true
|
|
299
|
-
} as PdfConfig,
|
|
300
|
-
|
|
301
|
-
tokenReduction: {
|
|
302
|
-
mode: 'moderate',
|
|
303
|
-
preserveImportantWords: true
|
|
304
|
-
} as TokenReductionConfig,
|
|
305
|
-
|
|
306
|
-
languageDetection: {
|
|
307
|
-
enabled: true,
|
|
308
|
-
minConfidence: 0.8,
|
|
309
|
-
detectMultiple: false
|
|
310
|
-
} as LanguageDetectionConfig
|
|
311
|
-
};
|
|
286
|
+
- **[Installation Guide](https://kreuzberg.dev/getting-started/installation/)** - Platform-specific setup
|
|
287
|
+
- **[API Documentation](https://kreuzberg.dev/api/)** - Complete API reference
|
|
288
|
+
- **[Examples & Guides](https://kreuzberg.dev/guides/)** - Full code examples and usage guides
|
|
289
|
+
- **[Configuration Guide](https://kreuzberg.dev/configuration/)** - Advanced configuration options
|
|
290
|
+
- **[Troubleshooting](https://kreuzberg.dev/troubleshooting/)** - Common issues and solutions
|
|
312
291
|
|
|
313
|
-
const result = await extractFile('document.pdf', config);
|
|
314
|
-
```
|
|
315
292
|
|
|
316
|
-
## Advanced Usage
|
|
317
293
|
|
|
318
|
-
|
|
294
|
+
## NAPI-RS Implementation Details
|
|
319
295
|
|
|
320
|
-
|
|
321
|
-
import { extractBytes } from '@kreuzberg/node';
|
|
322
|
-
import { readFile } from 'fs/promises';
|
|
296
|
+
### Native Performance
|
|
323
297
|
|
|
324
|
-
|
|
325
|
-
const result = await extractBytes(buffer, 'application/pdf');
|
|
326
|
-
console.log(result.content);
|
|
327
|
-
```
|
|
298
|
+
This binding uses NAPI-RS to provide native Node.js bindings with:
|
|
328
299
|
|
|
329
|
-
|
|
300
|
+
- **Zero-copy data transfer** between JavaScript and Rust layers
|
|
301
|
+
- **Native thread pool** for concurrent document processing
|
|
302
|
+
- **Direct memory management** for efficient large document handling
|
|
303
|
+
- **Binary-compatible** pre-built native modules across platforms
|
|
330
304
|
|
|
331
|
-
|
|
332
|
-
import { batchExtractFiles } from '@kreuzberg/node';
|
|
305
|
+
### Threading Model
|
|
333
306
|
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
];
|
|
307
|
+
- Single documents are processed synchronously or asynchronously in a dedicated thread
|
|
308
|
+
- Batch operations distribute work across available CPU cores
|
|
309
|
+
- Thread count is configurable but defaults to system CPU count
|
|
310
|
+
- Long-running extractions block the event loop when called through the synchronous APIs; use the async APIs to avoid this
|
|
339
311
|
|
|
340
|
-
|
|
312
|
+
### Memory Management
|
|
341
313
|
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
314
|
+
- Large documents (> 100 MB) are streamed to avoid loading them entirely into memory
|
|
315
|
+
- Temporary files are created in system temp directory for extraction
|
|
316
|
+
- Memory is automatically released after extraction completion
|
|
317
|
+
- ONNX models are cached in memory for repeated embeddings operations
|
|
346
318
|
|
|
347
|
-
### Batch Processing with Custom Concurrency
|
|
348
319
|
|
|
349
|
-
```typescript
|
|
350
|
-
import { batchExtractFiles } from '@kreuzberg/node';
|
|
351
320
|
|
|
352
|
-
|
|
353
|
-
maxConcurrentExtractions: 4 // Process 4 files at a time
|
|
354
|
-
};
|
|
321
|
+
## Features
|
|
355
322
|
|
|
356
|
-
|
|
357
|
-
const results = await batchExtractFiles(files, config);
|
|
323
|
+
### Supported File Formats (56+)
|
|
358
324
|
|
|
359
|
-
|
|
360
|
-
```
|
|
325
|
+
Kreuzberg supports 56 file formats across 8 major categories, with intelligent format detection and comprehensive metadata extraction.
|
|
361
326
|
|
|
362
|
-
|
|
327
|
+
#### Office Documents
|
|
363
328
|
|
|
364
|
-
|
|
365
|
-
|
|
329
|
+
| Category | Formats | Capabilities |
|
|
330
|
+
|----------|---------|--------------|
|
|
331
|
+
| **Word Processing** | `.docx`, `.odt` | Full text, tables, images, metadata, styles |
|
|
332
|
+
| **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.ods` | Sheet data, formulas, cell metadata, charts |
|
|
333
|
+
| **Presentations** | `.pptx`, `.ppt`, `.ppsx` | Slides, speaker notes, images, metadata |
|
|
334
|
+
| **PDF** | `.pdf` | Text, tables, images, metadata, OCR support |
|
|
335
|
+
| **eBooks** | `.epub`, `.fb2` | Chapters, metadata, embedded resources |
|
|
366
336
|
|
|
367
|
-
|
|
337
|
+
#### Images (OCR-Enabled)
|
|
368
338
|
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
console.log('Word Count:', result.metadata.wordCount);
|
|
375
|
-
}
|
|
376
|
-
```
|
|
339
|
+
| Category | Formats | Features |
|
|
340
|
+
|----------|---------|----------|
|
|
341
|
+
| **Raster** | `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.bmp`, `.tiff`, `.tif` | OCR, table detection, EXIF metadata, dimensions, color space |
|
|
342
|
+
| **Advanced** | `.jp2`, `.jpx`, `.jpm`, `.mj2`, `.pnm`, `.pbm`, `.pgm`, `.ppm` | OCR, table detection, format-specific metadata |
|
|
343
|
+
| **Vector** | `.svg` | DOM parsing, embedded text, graphics metadata |
|
|
377
344
|
|
|
378
|
-
|
|
345
|
+
#### Web & Data
|
|
379
346
|
|
|
380
|
-
|
|
381
|
-
|
|
347
|
+
| Category | Formats | Features |
|
|
348
|
+
|----------|---------|----------|
|
|
349
|
+
| **Markup** | `.html`, `.htm`, `.xhtml`, `.xml`, `.svg` | DOM parsing, metadata (Open Graph, Twitter Card), link extraction |
|
|
350
|
+
| **Structured Data** | `.json`, `.yaml`, `.yml`, `.toml`, `.csv`, `.tsv` | Schema detection, nested structures, validation |
|
|
351
|
+
| **Text & Markdown** | `.txt`, `.md`, `.markdown`, `.rst`, `.org`, `.rtf` | CommonMark, GFM, reStructuredText, Org Mode |
|
|
382
352
|
|
|
383
|
-
|
|
384
|
-
tokenReduction: {
|
|
385
|
-
mode: 'aggressive', // Options: 'light', 'moderate', 'aggressive'
|
|
386
|
-
preserveImportantWords: true
|
|
387
|
-
} as TokenReductionConfig
|
|
388
|
-
};
|
|
353
|
+
#### Email & Archives
|
|
389
354
|
|
|
390
|
-
|
|
355
|
+
| Category | Formats | Features |
|
|
356
|
+
|----------|---------|----------|
|
|
357
|
+
| **Email** | `.eml`, `.msg` | Headers, body (HTML/plain), attachments, threading |
|
|
358
|
+
| **Archives** | `.zip`, `.tar`, `.tgz`, `.gz`, `.7z` | File listing, nested archives, metadata |
|
|
391
359
|
|
|
392
|
-
|
|
393
|
-
console.log(`Original length: ${result.content.length}`);
|
|
394
|
-
console.log(`Processed for LLM context window`);
|
|
395
|
-
```
|
|
360
|
+
#### Academic & Scientific
|
|
396
361
|
|
|
397
|
-
|
|
362
|
+
| Category | Formats | Features |
|
|
363
|
+
|----------|---------|----------|
|
|
364
|
+
| **Citations** | `.bib`, `.biblatex`, `.ris`, `.enw`, `.csl` | Bibliography parsing, citation extraction |
|
|
365
|
+
| **Scientific** | `.tex`, `.latex`, `.typst`, `.jats`, `.ipynb`, `.docbook` | LaTeX, Jupyter notebooks, PubMed JATS |
|
|
366
|
+
| **Documentation** | `.opml`, `.pod`, `.mdoc`, `.troff` | Technical documentation formats |
|
|
398
367
|
|
|
399
|
-
|
|
400
|
-
import {
|
|
401
|
-
extractFile,
|
|
402
|
-
KreuzbergError,
|
|
403
|
-
ValidationError,
|
|
404
|
-
ParsingError,
|
|
405
|
-
OCRError,
|
|
406
|
-
MissingDependencyError
|
|
407
|
-
} from '@kreuzberg/node';
|
|
368
|
+
**[Complete Format Reference](https://kreuzberg.dev/reference/formats/)**
|
|
408
369
|
|
|
409
|
-
|
|
410
|
-
const result = await extractFile('document.pdf');
|
|
411
|
-
console.log(result.content);
|
|
412
|
-
} catch (error) {
|
|
413
|
-
if (error instanceof ValidationError) {
|
|
414
|
-
console.error('Invalid configuration or input:', error.message);
|
|
415
|
-
} else if (error instanceof ParsingError) {
|
|
416
|
-
console.error('Failed to parse document:', error.message);
|
|
417
|
-
} else if (error instanceof OCRError) {
|
|
418
|
-
console.error('OCR processing failed:', error.message);
|
|
419
|
-
} else if (error instanceof MissingDependencyError) {
|
|
420
|
-
console.error(`Missing dependency: ${error.dependency}`);
|
|
421
|
-
console.error('Installation instructions:', error.message);
|
|
422
|
-
} else if (error instanceof KreuzbergError) {
|
|
423
|
-
console.error('Kreuzberg error:', error.message);
|
|
424
|
-
} else {
|
|
425
|
-
throw error;
|
|
426
|
-
}
|
|
427
|
-
}
|
|
428
|
-
```
|
|
370
|
+
### Key Capabilities
|
|
429
371
|
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
Asynchronously extract content from a file.
|
|
436
|
-
|
|
437
|
-
#### `extractFileSync(filePath: string, config?: ExtractionConfig): ExtractionResult`
|
|
438
|
-
Synchronously extract content from a file.
|
|
439
|
-
|
|
440
|
-
#### `extractBytes(data: Buffer, mimeType: string, config?: ExtractionConfig): Promise<ExtractionResult>`
|
|
441
|
-
Asynchronously extract content from a buffer.
|
|
442
|
-
|
|
443
|
-
#### `extractBytesSync(data: Buffer, mimeType: string, config?: ExtractionConfig): ExtractionResult`
|
|
444
|
-
Synchronously extract content from a buffer.
|
|
445
|
-
|
|
446
|
-
#### `batchExtractFiles(paths: string[], config?: ExtractionConfig): Promise<ExtractionResult[]>`
|
|
447
|
-
Asynchronously extract content from multiple files in parallel.
|
|
448
|
-
|
|
449
|
-
#### `batchExtractFilesSync(paths: string[], config?: ExtractionConfig): ExtractionResult[]`
|
|
450
|
-
Synchronously extract content from multiple files.
|
|
451
|
-
|
|
452
|
-
### Types
|
|
453
|
-
|
|
454
|
-
#### `ExtractionResult`
|
|
455
|
-
Main result object containing:
|
|
456
|
-
- `content: string` - Extracted text content
|
|
457
|
-
- `mimeType: string` - MIME type of the document
|
|
458
|
-
- `metadata?: Metadata` - Document metadata
|
|
459
|
-
- `tables?: Table[]` - Extracted tables
|
|
460
|
-
- `images?: ImageData[]` - Extracted images
|
|
461
|
-
- `chunks?: Chunk[]` - Text chunks (if chunking enabled)
|
|
462
|
-
- `language?: LanguageInfo` - Detected language (if enabled)
|
|
463
|
-
|
|
464
|
-
#### `ExtractionConfig`
|
|
465
|
-
Configuration object for extraction:
|
|
466
|
-
- `useCache?: boolean` - Enable result caching
|
|
467
|
-
- `enableQualityProcessing?: boolean` - Enable text quality improvements
|
|
468
|
-
- `forceOcr?: boolean` - Force OCR even for text-based PDFs
|
|
469
|
-
- `maxConcurrentExtractions?: number` - Max parallel extractions
|
|
470
|
-
- `ocr?: OcrConfig` - OCR settings
|
|
471
|
-
- `chunking?: ChunkingConfig` - Text chunking settings
|
|
472
|
-
- `images?: ImageExtractionConfig` - Image extraction settings
|
|
473
|
-
- `pdfOptions?: PdfConfig` - PDF-specific options
|
|
474
|
-
- `tokenReduction?: TokenReductionConfig` - Token reduction settings
|
|
475
|
-
- `languageDetection?: LanguageDetectionConfig` - Language detection settings
|
|
476
|
-
|
|
477
|
-
#### `OcrConfig`
|
|
478
|
-
OCR configuration:
|
|
479
|
-
- `backend: string` - OCR backend ('tesseract', 'easyocr', 'paddleocr')
|
|
480
|
-
- `language: string` - Language code (e.g., 'eng', 'fra', 'deu')
|
|
481
|
-
- `preprocessing?: boolean` - Enable image preprocessing
|
|
482
|
-
- `tesseractConfig?: TesseractConfig` - Tesseract-specific options
|
|
483
|
-
|
|
484
|
-
#### `Table`
|
|
485
|
-
Extracted table structure:
|
|
486
|
-
- `markdown: string` - Table in Markdown format
|
|
487
|
-
- `cells: TableCell[][]` - 2D array of table cells
|
|
488
|
-
- `rowCount: number` - Number of rows
|
|
489
|
-
- `columnCount: number` - Number of columns
|
|
490
|
-
|
|
491
|
-
### Exceptions
|
|
492
|
-
|
|
493
|
-
All Kreuzberg exceptions extend the base `KreuzbergError` class:
|
|
494
|
-
|
|
495
|
-
- `KreuzbergError` - Base error class for all Kreuzberg errors
|
|
496
|
-
- `ValidationError` - Invalid configuration, missing required fields, or invalid input
|
|
497
|
-
- `ParsingError` - Document parsing failure or corrupted file
|
|
498
|
-
- `OCRError` - OCR processing failure
|
|
499
|
-
- `MissingDependencyError` - Missing optional system dependency (includes installation instructions)
|
|
500
|
-
|
|
501
|
-
## Supported Formats
|
|
502
|
-
|
|
503
|
-
| Category | Formats |
|
|
504
|
-
|----------|---------|
|
|
505
|
-
| **Documents** | PDF, DOCX, DOC, PPTX, PPT, XLSX, XLS, ODT, ODP, ODS, RTF |
|
|
506
|
-
| **Images** | PNG, JPEG, JPG, WEBP, BMP, TIFF, GIF |
|
|
507
|
-
| **Web** | HTML, XHTML, XML |
|
|
508
|
-
| **Text** | TXT, MD, CSV, TSV, JSON, YAML, TOML |
|
|
509
|
-
| **Email** | EML, MSG |
|
|
510
|
-
| **Archives** | ZIP, TAR, 7Z |
|
|
511
|
-
| **Other** | And 30+ more formats |
|
|
512
|
-
|
|
513
|
-
## Performance
|
|
514
|
-
|
|
515
|
-
Kreuzberg is built with a native Rust core, providing significant performance improvements over pure JavaScript solutions:
|
|
516
|
-
|
|
517
|
-
- **10-50x faster** text extraction compared to pure Node.js libraries
|
|
518
|
-
- **Native multithreading** for batch processing
|
|
519
|
-
- **Optimized memory usage** with streaming for large files
|
|
520
|
-
- **Zero-copy operations** where possible
|
|
521
|
-
- **Efficient caching** to avoid redundant processing
|
|
522
|
-
|
|
523
|
-
### Benchmarks
|
|
524
|
-
|
|
525
|
-
Processing 100 mixed documents (PDF, DOCX, XLSX):
|
|
526
|
-
|
|
527
|
-
| Library | Time | Memory |
|
|
528
|
-
|---------|------|--------|
|
|
529
|
-
| Kreuzberg | 2.3s | 145 MB |
|
|
530
|
-
| pdf-parse + mammoth | 23.1s | 890 MB |
|
|
531
|
-
| textract | 45.2s | 1.2 GB |
|
|
372
|
+
- **Text Extraction** - Extract all text content with position and formatting information
|
|
373
|
+
- **Metadata Extraction** - Retrieve document properties, creation date, author, etc.
|
|
374
|
+
- **Table Extraction** - Parse tables with structure and cell content preservation
|
|
375
|
+
- **Image Extraction** - Extract embedded images and render page previews
|
|
376
|
+
- **OCR Support** - Integrate multiple OCR backends for scanned documents
|
|
532
377
|
|
|
533
|
-
|
|
378
|
+
- **Async/Await** - Non-blocking document processing with concurrent operations
|
|
534
379
|
|
|
535
|
-
### Native Module Not Found
|
|
536
380
|
|
|
537
|
-
|
|
381
|
+
- **Plugin System** - Extensible post-processing for custom text transformation
|
|
538
382
|
|
|
539
|
-
```bash
|
|
540
|
-
npm rebuild kreuzberg
|
|
541
|
-
```
|
|
542
383
|
|
|
543
|
-
|
|
384
|
+
- **Embeddings** - Generate vector embeddings using ONNX Runtime models
|
|
385
|
+
|
|
386
|
+
- **Batch Processing** - Efficiently process multiple documents in parallel
|
|
387
|
+
- **Memory Efficient** - Stream large files without loading them entirely into memory
|
|
388
|
+
- **Language Detection** - Detect and support multiple languages in documents
|
|
389
|
+
- **Configuration** - Fine-grained control over extraction behavior
|
|
390
|
+
|
|
391
|
+
### Performance Characteristics
|
|
392
|
+
|
|
393
|
+
| Format | Speed | Memory | Notes |
|
|
394
|
+
|--------|-------|--------|-------|
|
|
395
|
+
| **PDF (text)** | 10-100 MB/s | ~50MB per doc | Fastest extraction |
|
|
396
|
+
| **Office docs** | 20-200 MB/s | ~100MB per doc | DOCX, XLSX, PPTX |
|
|
397
|
+
| **Images (OCR)** | 1-5 MB/s | Variable | Depends on OCR backend |
|
|
398
|
+
| **Archives** | 5-50 MB/s | ~200MB per doc | ZIP, TAR, etc. |
|
|
399
|
+
| **Web formats** | 50-200 MB/s | Streaming | HTML, XML, JSON |
|
|
544
400
|
|
|
545
|
-
Ensure Tesseract is installed and available in PATH:
|
|
546
401
|
|
|
547
|
-
```bash
|
|
548
|
-
tesseract --version
|
|
549
|
-
```
|
|
550
402
|
|
|
551
|
-
|
|
552
|
-
- macOS: `brew install tesseract`
|
|
553
|
-
- Ubuntu: `sudo apt-get install tesseract-ocr`
|
|
554
|
-
- Windows: Download from [tesseract-ocr/tesseract](https://github.com/tesseract-ocr/tesseract)
|
|
403
|
+
## OCR Support
|
|
555
404
|
|
|
556
|
-
|
|
405
|
+
Kreuzberg supports multiple OCR backends for extracting text from scanned documents and images:
|
|
557
406
|
|
|
558
|
-
|
|
407
|
+
|
|
408
|
+
- **Tesseract**
|
|
409
|
+
|
|
410
|
+
- **Guten**
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
### OCR Configuration Example
|
|
559
414
|
|
|
560
415
|
```typescript
|
|
416
|
+
import { extractFile } from '@kreuzberg/node';
|
|
417
|
+
|
|
561
418
|
const config = {
|
|
562
|
-
|
|
419
|
+
ocr: {
|
|
420
|
+
backend: 'tesseract',
|
|
421
|
+
language: 'eng+fra',
|
|
422
|
+
tesseractConfig: {
|
|
423
|
+
psm: 3,
|
|
424
|
+
},
|
|
425
|
+
},
|
|
563
426
|
};
|
|
564
|
-
|
|
427
|
+
|
|
428
|
+
const result = await extractFile('document.pdf', null, config);
|
|
429
|
+
console.log(result.content);
|
|
565
430
|
```
|
|
566
431
|
|
|
567
|
-
### TypeScript Types Not Resolving
|
|
568
432
|
|
|
569
|
-
Make sure you're using:
|
|
570
|
-
- Node.js 18 or higher
|
|
571
|
-
- TypeScript 5.0 or higher
|
|
572
433
|
|
|
573
|
-
The package includes built-in type definitions.
|
|
574
434
|
|
|
575
|
-
|
|
435
|
+
## Async Support
|
|
576
436
|
|
|
577
|
-
|
|
437
|
+
This binding provides full async/await support for non-blocking document processing:
|
|
578
438
|
|
|
579
439
|
```typescript
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
440
|
+
import { extractFile } from '@kreuzberg/node';
|
|
441
|
+
|
|
442
|
+
const result = await extractFile('document.pdf');
|
|
443
|
+
console.log(result.content);
|
|
584
444
|
```
|
|
585
445
|
|
|
586
|
-
## Examples
|
|
587
446
|
|
|
588
|
-
### Extract Invoice Data
|
|
589
447
|
|
|
590
|
-
```typescript
|
|
591
|
-
import { extractFile } from '@kreuzberg/node';
|
|
592
448
|
|
|
593
|
-
|
|
449
|
+
## Plugin System
|
|
594
450
|
|
|
595
|
-
|
|
596
|
-
if (result.tables && result.tables.length > 0) {
|
|
597
|
-
const lineItems = result.tables[0];
|
|
598
|
-
console.log(lineItems.markdown);
|
|
599
|
-
}
|
|
451
|
+
Kreuzberg supports extensible post-processing plugins for custom text transformation and filtering.
|
|
600
452
|
|
|
601
|
-
|
|
602
|
-
if (result.metadata) {
|
|
603
|
-
console.log('Invoice Date:', result.metadata.creationDate);
|
|
604
|
-
}
|
|
605
|
-
```
|
|
453
|
+
For detailed plugin documentation, visit the [Plugin System Guide](https://kreuzberg.dev/plugins/).
|
|
606
454
|
|
|
607
|
-
### Process Scanned Documents
|
|
608
455
|
|
|
609
|
-
```typescript
|
|
610
|
-
import { extractFile } from '@kreuzberg/node';
|
|
611
456
|
|
|
612
|
-
const config = {
|
|
613
|
-
forceOcr: true,
|
|
614
|
-
ocr: {
|
|
615
|
-
backend: 'tesseract',
|
|
616
|
-
language: 'eng',
|
|
617
|
-
preprocessing: true
|
|
618
|
-
}
|
|
619
|
-
};
|
|
620
457
|
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
458
|
+
## Embeddings Support
|
|
459
|
+
|
|
460
|
+
Generate vector embeddings for extracted text using the built-in ONNX Runtime support. This feature requires ONNX Runtime to be installed.
|
|
624
461
|
|
|
625
|
-
|
|
462
|
+
**[Embeddings Guide](https://kreuzberg.dev/features/#embeddings)**
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
|
|
466
|
+
## Batch Processing
|
|
467
|
+
|
|
468
|
+
Process multiple documents efficiently:
|
|
626
469
|
|
|
627
470
|
```typescript
|
|
628
|
-
import {
|
|
629
|
-
import { glob } from 'glob';
|
|
471
|
+
import { batchExtractFilesSync } from '@kreuzberg/node';
|
|
630
472
|
|
|
631
|
-
|
|
632
|
-
const
|
|
473
|
+
const files = ['doc1.pdf', 'doc2.docx', 'doc3.pptx'];
|
|
474
|
+
const results = batchExtractFilesSync(files);
|
|
633
475
|
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
maxConcurrentExtractions: 8,
|
|
637
|
-
enableQualityProcessing: true
|
|
476
|
+
results.forEach((result, i) => {
|
|
477
|
+
console.log(`File ${i + 1}: ${result.content.length} characters`);
|
|
638
478
|
});
|
|
479
|
+
```
|
|
639
480
|
|
|
640
|
-
// Build search index
|
|
641
|
-
const searchIndex = results.map((result, i) => ({
|
|
642
|
-
path: files[i],
|
|
643
|
-
content: result.content,
|
|
644
|
-
metadata: result.metadata
|
|
645
|
-
}));
|
|
646
481
|
|
|
647
|
-
|
|
648
|
-
|
|
482
|
+
|
|
483
|
+
|
|
484
|
+
## Configuration
|
|
485
|
+
|
|
486
|
+
For advanced configuration options including language detection, table extraction, OCR settings, and more:
|
|
487
|
+
|
|
488
|
+
**[Configuration Guide](https://kreuzberg.dev/configuration/)**
|
|
649
489
|
|
|
650
490
|
## Documentation
|
|
651
491
|
|
|
652
|
-
|
|
492
|
+
- **[Official Documentation](https://kreuzberg.dev/)**
|
|
493
|
+
- **[API Reference](https://kreuzberg.dev/reference/api-typescript/)**
|
|
494
|
+
- **[Examples & Guides](https://kreuzberg.dev/guides/)**
|
|
495
|
+
|
|
496
|
+
## Troubleshooting
|
|
497
|
+
|
|
498
|
+
For common issues and solutions, visit the [Troubleshooting Guide](https://kreuzberg.dev/troubleshooting/).
|
|
653
499
|
|
|
654
500
|
## Contributing
|
|
655
501
|
|
|
656
|
-
|
|
502
|
+
Contributions are welcome! See the [Contributing Guide](https://github.com/kreuzberg-dev/kreuzberg/blob/main/CONTRIBUTING.md).
|
|
657
503
|
|
|
658
504
|
## License
|
|
659
505
|
|
|
660
|
-
MIT
|
|
506
|
+
MIT License - see the LICENSE file in the [Kreuzberg repository](https://github.com/kreuzberg-dev/kreuzberg) for details.
|
|
661
507
|
|
|
662
|
-
##
|
|
508
|
+
## Support
|
|
663
509
|
|
|
664
|
-
- [
|
|
665
|
-
- [
|
|
666
|
-
- [
|
|
667
|
-
- [Issue Tracker](https://github.com/kreuzberg-dev/kreuzberg/issues)
|
|
668
|
-
- [Changelog](https://github.com/kreuzberg-dev/kreuzberg/blob/main/CHANGELOG.md)
|
|
669
|
-
- [npm Package](https://www.npmjs.com/package/kreuzberg)
|
|
510
|
+
- **Discord Community**: [Join our Discord](https://discord.gg/pXxagNK2zN)
|
|
511
|
+
- **GitHub Issues**: [Report bugs](https://github.com/kreuzberg-dev/kreuzberg/issues)
|
|
512
|
+
- **Discussions**: [Ask questions](https://github.com/kreuzberg-dev/kreuzberg/discussions)
|