@kreuzberg/node 4.0.0-rc.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +7 -0
- package/README.md +669 -0
- package/index.d.ts +1109 -0
- package/index.js +607 -0
- package/metadata.d.ts +502 -0
- package/package.json +128 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
Copyright 2025 Na'aman Hirschfeld
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
4
|
+
|
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
6
|
+
|
|
7
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,669 @@
|
|
|
1
|
+
# Kreuzberg for Node.js
|
|
2
|
+
|
|
3
|
+
[npm](https://www.npmjs.com/package/kreuzberg)
|
|
4
|
+
[crates.io](https://crates.io/crates/kreuzberg)
|
|
5
|
+
[PyPI](https://pypi.org/project/kreuzberg/)
|
|
6
|
+
[RubyGems](https://rubygems.org/gems/kreuzberg)
|
|
7
|
+
[npm](https://www.npmjs.com/package/kreuzberg)
|
|
8
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
9
|
+
[Documentation](https://kreuzberg.dev)
|
|
10
|
+
|
|
11
|
+
High-performance document intelligence for Node.js and TypeScript, powered by Rust.
|
|
12
|
+
|
|
13
|
+
Extract text, tables, images, and metadata from 50+ file formats including PDF, DOCX, PPTX, XLSX, images, and more.
|
|
14
|
+
|
|
15
|
+
> **🚀 Version 4.0.0 Release Candidate**
|
|
16
|
+
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
17
|
+
|
|
18
|
+
## Features
|
|
19
|
+
|
|
20
|
+
- **50+ File Formats**: PDF, DOCX, PPTX, XLSX, images, HTML, Markdown, XML, JSON, and more
|
|
21
|
+
- **OCR Support**: Built-in Tesseract, EasyOCR, and PaddleOCR backends for scanned documents
|
|
22
|
+
- **Table Extraction**: Advanced table detection and structured data extraction
|
|
23
|
+
- **High Performance**: Native Rust core provides 10-50x performance improvements over pure JavaScript
|
|
24
|
+
- **Type-Safe**: Full TypeScript definitions for all methods, configurations, and return types
|
|
25
|
+
- **Async/Sync APIs**: Both asynchronous and synchronous extraction methods
|
|
26
|
+
- **Batch Processing**: Process multiple documents in parallel with optimized concurrency
|
|
27
|
+
- **Language Detection**: Automatic language detection for extracted text
|
|
28
|
+
- **Text Chunking**: Split long documents into manageable chunks for LLM processing
|
|
29
|
+
- **Caching**: Built-in result caching for faster repeated extractions
|
|
30
|
+
- **Zero Configuration**: Works out of the box with sensible defaults
|
|
31
|
+
|
|
32
|
+
## Requirements
|
|
33
|
+
|
|
34
|
+
- Node.js 18 or higher
|
|
35
|
+
- Native bindings are prebuilt for:
|
|
36
|
+
- macOS (x64, arm64)
|
|
37
|
+
- Linux (x64, arm64, armv7)
|
|
38
|
+
- Windows (x64, arm64)
|
|
39
|
+
|
|
40
|
+
### Optional System Dependencies
|
|
41
|
+
|
|
42
|
+
- **Tesseract**: For OCR functionality
|
|
43
|
+
- macOS: `brew install tesseract`
|
|
44
|
+
- Ubuntu: `sudo apt-get install tesseract-ocr`
|
|
45
|
+
- Windows: Download from [GitHub](https://github.com/tesseract-ocr/tesseract)
|
|
46
|
+
|
|
47
|
+
- **LibreOffice**: For legacy MS Office formats (.doc, .ppt)
|
|
48
|
+
- macOS: `brew install libreoffice`
|
|
49
|
+
- Ubuntu: `sudo apt-get install libreoffice`
|
|
50
|
+
|
|
51
|
+
- **Pandoc**: For advanced document conversion
|
|
52
|
+
- macOS: `brew install pandoc`
|
|
53
|
+
- Ubuntu: `sudo apt-get install pandoc`
|
|
54
|
+
|
|
55
|
+
## Installation
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
npm install @kreuzberg/node
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
Or with pnpm:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
pnpm add @kreuzberg/node
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Or with yarn:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
yarn add @kreuzberg/node
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
The package includes prebuilt native binaries for major platforms. No additional build steps are required.
|
|
74
|
+
|
|
75
|
+
## Quick Start
|
|
76
|
+
|
|
77
|
+
### Basic Extraction
|
|
78
|
+
|
|
79
|
+
```typescript
|
|
80
|
+
import { extractFileSync } from '@kreuzberg/node';
|
|
81
|
+
|
|
82
|
+
// Synchronous extraction
|
|
83
|
+
const result = extractFileSync('document.pdf');
|
|
84
|
+
console.log(result.content);
|
|
85
|
+
console.log(result.metadata);
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### Async Extraction (Recommended)
|
|
89
|
+
|
|
90
|
+
```typescript
|
|
91
|
+
import { extractFile } from '@kreuzberg/node';
|
|
92
|
+
|
|
93
|
+
// Asynchronous extraction
|
|
94
|
+
const result = await extractFile('document.pdf');
|
|
95
|
+
console.log(result.content);
|
|
96
|
+
console.log(result.tables);
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### With Full Type Safety
|
|
100
|
+
|
|
101
|
+
```typescript
|
|
102
|
+
import {
|
|
103
|
+
extractFile,
|
|
104
|
+
type ExtractionConfig,
|
|
105
|
+
type ExtractionResult
|
|
106
|
+
} from '@kreuzberg/node';
|
|
107
|
+
|
|
108
|
+
const config: ExtractionConfig = {
|
|
109
|
+
useCache: true,
|
|
110
|
+
enableQualityProcessing: true
|
|
111
|
+
};
|
|
112
|
+
|
|
113
|
+
const result: ExtractionResult = await extractFile('invoice.pdf', config);
|
|
114
|
+
|
|
115
|
+
// Type-safe access to all properties
|
|
116
|
+
console.log(result.content);
|
|
117
|
+
console.log(result.mimeType);
|
|
118
|
+
console.log(result.metadata);
|
|
119
|
+
|
|
120
|
+
if (result.tables) {
|
|
121
|
+
for (const table of result.tables) {
|
|
122
|
+
console.log(table.markdown);
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
## Configuration
|
|
128
|
+
|
|
129
|
+
### OCR Configuration
|
|
130
|
+
|
|
131
|
+
```typescript
|
|
132
|
+
import { extractFile, type ExtractionConfig, type OcrConfig } from '@kreuzberg/node';
|
|
133
|
+
|
|
134
|
+
const config: ExtractionConfig = {
|
|
135
|
+
ocr: {
|
|
136
|
+
backend: 'tesseract',
|
|
137
|
+
language: 'eng',
|
|
138
|
+
tesseractConfig: {
|
|
139
|
+
enableTableDetection: true,
|
|
140
|
+
psm: 6,
|
|
141
|
+
minConfidence: 50.0
|
|
142
|
+
}
|
|
143
|
+
} as OcrConfig
|
|
144
|
+
};
|
|
145
|
+
|
|
146
|
+
const result = await extractFile('scanned.pdf', config);
|
|
147
|
+
console.log(result.content);
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### PDF Password Protection
|
|
151
|
+
|
|
152
|
+
```typescript
|
|
153
|
+
import { extractFile, type PdfConfig } from '@kreuzberg/node';
|
|
154
|
+
|
|
155
|
+
const config = {
|
|
156
|
+
pdfOptions: {
|
|
157
|
+
passwords: ['password1', 'password2'],
|
|
158
|
+
extractImages: true,
|
|
159
|
+
extractMetadata: true
|
|
160
|
+
} as PdfConfig
|
|
161
|
+
};
|
|
162
|
+
|
|
163
|
+
const result = await extractFile('protected.pdf', config);
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
### Extract Tables
|
|
167
|
+
|
|
168
|
+
```typescript
|
|
169
|
+
import { extractFile } from '@kreuzberg/node';
|
|
170
|
+
|
|
171
|
+
const result = await extractFile('financial-report.pdf');
|
|
172
|
+
|
|
173
|
+
if (result.tables) {
|
|
174
|
+
for (const table of result.tables) {
|
|
175
|
+
console.log('Table as Markdown:');
|
|
176
|
+
console.log(table.markdown);
|
|
177
|
+
|
|
178
|
+
console.log('Table cells:');
|
|
179
|
+
console.log(JSON.stringify(table.cells, null, 2));
|
|
180
|
+
}
|
|
181
|
+
}
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
### Text Chunking
|
|
185
|
+
|
|
186
|
+
```typescript
|
|
187
|
+
import { extractFile, type ChunkingConfig } from '@kreuzberg/node';
|
|
188
|
+
|
|
189
|
+
const config = {
|
|
190
|
+
chunking: {
|
|
191
|
+
maxChars: 1000,
|
|
192
|
+
maxOverlap: 200
|
|
193
|
+
} as ChunkingConfig
|
|
194
|
+
};
|
|
195
|
+
|
|
196
|
+
const result = await extractFile('long-document.pdf', config);
|
|
197
|
+
|
|
198
|
+
if (result.chunks) {
|
|
199
|
+
for (const chunk of result.chunks) {
|
|
200
|
+
console.log(`Chunk ${chunk.index}: ${chunk.text.substring(0, 100)}...`);
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
### Language Detection
|
|
206
|
+
|
|
207
|
+
```typescript
|
|
208
|
+
import { extractFile, type LanguageDetectionConfig } from '@kreuzberg/node';
|
|
209
|
+
|
|
210
|
+
const config = {
|
|
211
|
+
languageDetection: {
|
|
212
|
+
enabled: true,
|
|
213
|
+
minConfidence: 0.8,
|
|
214
|
+
detectMultiple: false
|
|
215
|
+
} as LanguageDetectionConfig
|
|
216
|
+
};
|
|
217
|
+
|
|
218
|
+
const result = await extractFile('multilingual.pdf', config);
|
|
219
|
+
|
|
220
|
+
if (result.language) {
|
|
221
|
+
console.log(`Detected language: ${result.language.code}`);
|
|
222
|
+
console.log(`Confidence: ${result.language.confidence}`);
|
|
223
|
+
}
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
### Image Extraction
|
|
227
|
+
|
|
228
|
+
```typescript
|
|
229
|
+
import { extractFile, type ImageExtractionConfig } from '@kreuzberg/node';
|
|
230
|
+
import { writeFile } from 'fs/promises';
|
|
231
|
+
|
|
232
|
+
const config = {
|
|
233
|
+
images: {
|
|
234
|
+
extractImages: true,
|
|
235
|
+
targetDpi: 300,
|
|
236
|
+
maxImageDimension: 4096,
|
|
237
|
+
autoAdjustDpi: true
|
|
238
|
+
} as ImageExtractionConfig
|
|
239
|
+
};
|
|
240
|
+
|
|
241
|
+
const result = await extractFile('document-with-images.pdf', config);
|
|
242
|
+
|
|
243
|
+
if (result.images) {
|
|
244
|
+
for (let i = 0; i < result.images.length; i++) {
|
|
245
|
+
const image = result.images[i];
|
|
246
|
+
await writeFile(`image-${i}.${image.format}`, Buffer.from(image.data));
|
|
247
|
+
}
|
|
248
|
+
}
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
### Complete Configuration Example
|
|
252
|
+
|
|
253
|
+
```typescript
|
|
254
|
+
import {
|
|
255
|
+
extractFile,
|
|
256
|
+
type ExtractionConfig,
|
|
257
|
+
type OcrConfig,
|
|
258
|
+
type ChunkingConfig,
|
|
259
|
+
type ImageExtractionConfig,
|
|
260
|
+
type PdfConfig,
|
|
261
|
+
type TokenReductionConfig,
|
|
262
|
+
type LanguageDetectionConfig
|
|
263
|
+
} from '@kreuzberg/node';
|
|
264
|
+
|
|
265
|
+
const config: ExtractionConfig = {
|
|
266
|
+
useCache: true,
|
|
267
|
+
enableQualityProcessing: true,
|
|
268
|
+
forceOcr: false,
|
|
269
|
+
maxConcurrentExtractions: 8,
|
|
270
|
+
|
|
271
|
+
ocr: {
|
|
272
|
+
backend: 'tesseract',
|
|
273
|
+
language: 'eng',
|
|
274
|
+
preprocessing: true,
|
|
275
|
+
tesseractConfig: {
|
|
276
|
+
enableTableDetection: true,
|
|
277
|
+
psm: 6,
|
|
278
|
+
oem: 3,
|
|
279
|
+
minConfidence: 50.0
|
|
280
|
+
}
|
|
281
|
+
} as OcrConfig,
|
|
282
|
+
|
|
283
|
+
chunking: {
|
|
284
|
+
maxChars: 1000,
|
|
285
|
+
maxOverlap: 200
|
|
286
|
+
} as ChunkingConfig,
|
|
287
|
+
|
|
288
|
+
images: {
|
|
289
|
+
extractImages: true,
|
|
290
|
+
targetDpi: 300,
|
|
291
|
+
maxImageDimension: 4096,
|
|
292
|
+
autoAdjustDpi: true
|
|
293
|
+
} as ImageExtractionConfig,
|
|
294
|
+
|
|
295
|
+
pdfOptions: {
|
|
296
|
+
extractImages: true,
|
|
297
|
+
passwords: [],
|
|
298
|
+
extractMetadata: true
|
|
299
|
+
} as PdfConfig,
|
|
300
|
+
|
|
301
|
+
tokenReduction: {
|
|
302
|
+
mode: 'moderate',
|
|
303
|
+
preserveImportantWords: true
|
|
304
|
+
} as TokenReductionConfig,
|
|
305
|
+
|
|
306
|
+
languageDetection: {
|
|
307
|
+
enabled: true,
|
|
308
|
+
minConfidence: 0.8,
|
|
309
|
+
detectMultiple: false
|
|
310
|
+
} as LanguageDetectionConfig
|
|
311
|
+
};
|
|
312
|
+
|
|
313
|
+
const result = await extractFile('document.pdf', config);
|
|
314
|
+
```
|
|
315
|
+
|
|
316
|
+
## Advanced Usage
|
|
317
|
+
|
|
318
|
+
### Extract from Buffer
|
|
319
|
+
|
|
320
|
+
```typescript
|
|
321
|
+
import { extractBytes } from '@kreuzberg/node';
|
|
322
|
+
import { readFile } from 'fs/promises';
|
|
323
|
+
|
|
324
|
+
const buffer = await readFile('document.pdf');
|
|
325
|
+
const result = await extractBytes(buffer, 'application/pdf');
|
|
326
|
+
console.log(result.content);
|
|
327
|
+
```
|
|
328
|
+
|
|
329
|
+
### Batch Processing
|
|
330
|
+
|
|
331
|
+
```typescript
|
|
332
|
+
import { batchExtractFiles } from '@kreuzberg/node';
|
|
333
|
+
|
|
334
|
+
const files = [
|
|
335
|
+
'document1.pdf',
|
|
336
|
+
'document2.docx',
|
|
337
|
+
'document3.xlsx'
|
|
338
|
+
];
|
|
339
|
+
|
|
340
|
+
const results = await batchExtractFiles(files);
|
|
341
|
+
|
|
342
|
+
for (const result of results) {
|
|
343
|
+
console.log(`${result.mimeType}: ${result.content.length} characters`);
|
|
344
|
+
}
|
|
345
|
+
```
|
|
346
|
+
|
|
347
|
+
### Batch Processing with Custom Concurrency
|
|
348
|
+
|
|
349
|
+
```typescript
|
|
350
|
+
import { batchExtractFiles } from '@kreuzberg/node';
|
|
351
|
+
|
|
352
|
+
const config = {
|
|
353
|
+
maxConcurrentExtractions: 4 // Process 4 files at a time
|
|
354
|
+
};
|
|
355
|
+
|
|
356
|
+
const files = Array.from({ length: 20 }, (_, i) => `file-${i}.pdf`);
|
|
357
|
+
const results = await batchExtractFiles(files, config);
|
|
358
|
+
|
|
359
|
+
console.log(`Processed ${results.length} files`);
|
|
360
|
+
```
|
|
361
|
+
|
|
362
|
+
### Extract with Metadata
|
|
363
|
+
|
|
364
|
+
```typescript
|
|
365
|
+
import { extractFile } from '@kreuzberg/node';
|
|
366
|
+
|
|
367
|
+
const result = await extractFile('document.pdf');
|
|
368
|
+
|
|
369
|
+
if (result.metadata) {
|
|
370
|
+
console.log('Title:', result.metadata.title);
|
|
371
|
+
console.log('Author:', result.metadata.author);
|
|
372
|
+
console.log('Creation Date:', result.metadata.creationDate);
|
|
373
|
+
console.log('Page Count:', result.metadata.pageCount);
|
|
374
|
+
console.log('Word Count:', result.metadata.wordCount);
|
|
375
|
+
}
|
|
376
|
+
```
|
|
377
|
+
|
|
378
|
+
### Token Reduction for LLM Processing
|
|
379
|
+
|
|
380
|
+
```typescript
|
|
381
|
+
import { extractFile, type TokenReductionConfig } from '@kreuzberg/node';
|
|
382
|
+
|
|
383
|
+
const config = {
|
|
384
|
+
tokenReduction: {
|
|
385
|
+
mode: 'aggressive', // Options: 'light', 'moderate', 'aggressive'
|
|
386
|
+
preserveImportantWords: true
|
|
387
|
+
} as TokenReductionConfig
|
|
388
|
+
};
|
|
389
|
+
|
|
390
|
+
const result = await extractFile('long-document.pdf', config);
|
|
391
|
+
|
|
392
|
+
// Reduced token count while preserving meaning
|
|
393
|
+
console.log(`Reduced length: ${result.content.length}`);
|
|
394
|
+
console.log(`Processed for LLM context window`);
|
|
395
|
+
```
|
|
396
|
+
|
|
397
|
+
## Error Handling
|
|
398
|
+
|
|
399
|
+
```typescript
|
|
400
|
+
import {
|
|
401
|
+
extractFile,
|
|
402
|
+
KreuzbergError,
|
|
403
|
+
ValidationError,
|
|
404
|
+
ParsingError,
|
|
405
|
+
OCRError,
|
|
406
|
+
MissingDependencyError
|
|
407
|
+
} from '@kreuzberg/node';
|
|
408
|
+
|
|
409
|
+
try {
|
|
410
|
+
const result = await extractFile('document.pdf');
|
|
411
|
+
console.log(result.content);
|
|
412
|
+
} catch (error) {
|
|
413
|
+
if (error instanceof ValidationError) {
|
|
414
|
+
console.error('Invalid configuration or input:', error.message);
|
|
415
|
+
} else if (error instanceof ParsingError) {
|
|
416
|
+
console.error('Failed to parse document:', error.message);
|
|
417
|
+
} else if (error instanceof OCRError) {
|
|
418
|
+
console.error('OCR processing failed:', error.message);
|
|
419
|
+
} else if (error instanceof MissingDependencyError) {
|
|
420
|
+
console.error(`Missing dependency: ${error.dependency}`);
|
|
421
|
+
console.error('Installation instructions:', error.message);
|
|
422
|
+
} else if (error instanceof KreuzbergError) {
|
|
423
|
+
console.error('Kreuzberg error:', error.message);
|
|
424
|
+
} else {
|
|
425
|
+
throw error;
|
|
426
|
+
}
|
|
427
|
+
}
|
|
428
|
+
```
|
|
429
|
+
|
|
430
|
+
## API Reference
|
|
431
|
+
|
|
432
|
+
### Extraction Functions
|
|
433
|
+
|
|
434
|
+
#### `extractFile(filePath: string, config?: ExtractionConfig): Promise<ExtractionResult>`
|
|
435
|
+
Asynchronously extract content from a file.
|
|
436
|
+
|
|
437
|
+
#### `extractFileSync(filePath: string, config?: ExtractionConfig): ExtractionResult`
|
|
438
|
+
Synchronously extract content from a file.
|
|
439
|
+
|
|
440
|
+
#### `extractBytes(data: Buffer, mimeType: string, config?: ExtractionConfig): Promise<ExtractionResult>`
|
|
441
|
+
Asynchronously extract content from a buffer.
|
|
442
|
+
|
|
443
|
+
#### `extractBytesSync(data: Buffer, mimeType: string, config?: ExtractionConfig): ExtractionResult`
|
|
444
|
+
Synchronously extract content from a buffer.
|
|
445
|
+
|
|
446
|
+
#### `batchExtractFiles(paths: string[], config?: ExtractionConfig): Promise<ExtractionResult[]>`
|
|
447
|
+
Asynchronously extract content from multiple files in parallel.
|
|
448
|
+
|
|
449
|
+
#### `batchExtractFilesSync(paths: string[], config?: ExtractionConfig): ExtractionResult[]`
|
|
450
|
+
Synchronously extract content from multiple files.
|
|
451
|
+
|
|
452
|
+
### Types
|
|
453
|
+
|
|
454
|
+
#### `ExtractionResult`
|
|
455
|
+
Main result object containing:
|
|
456
|
+
- `content: string` - Extracted text content
|
|
457
|
+
- `mimeType: string` - MIME type of the document
|
|
458
|
+
- `metadata?: Metadata` - Document metadata
|
|
459
|
+
- `tables?: Table[]` - Extracted tables
|
|
460
|
+
- `images?: ImageData[]` - Extracted images
|
|
461
|
+
- `chunks?: Chunk[]` - Text chunks (if chunking enabled)
|
|
462
|
+
- `language?: LanguageInfo` - Detected language (if enabled)
|
|
463
|
+
|
|
464
|
+
#### `ExtractionConfig`
|
|
465
|
+
Configuration object for extraction:
|
|
466
|
+
- `useCache?: boolean` - Enable result caching
|
|
467
|
+
- `enableQualityProcessing?: boolean` - Enable text quality improvements
|
|
468
|
+
- `forceOcr?: boolean` - Force OCR even for text-based PDFs
|
|
469
|
+
- `maxConcurrentExtractions?: number` - Max parallel extractions
|
|
470
|
+
- `ocr?: OcrConfig` - OCR settings
|
|
471
|
+
- `chunking?: ChunkingConfig` - Text chunking settings
|
|
472
|
+
- `images?: ImageExtractionConfig` - Image extraction settings
|
|
473
|
+
- `pdfOptions?: PdfConfig` - PDF-specific options
|
|
474
|
+
- `tokenReduction?: TokenReductionConfig` - Token reduction settings
|
|
475
|
+
- `languageDetection?: LanguageDetectionConfig` - Language detection settings
|
|
476
|
+
|
|
477
|
+
#### `OcrConfig`
|
|
478
|
+
OCR configuration:
|
|
479
|
+
- `backend: string` - OCR backend ('tesseract', 'easyocr', 'paddleocr')
|
|
480
|
+
- `language: string` - Language code (e.g., 'eng', 'fra', 'deu')
|
|
481
|
+
- `preprocessing?: boolean` - Enable image preprocessing
|
|
482
|
+
- `tesseractConfig?: TesseractConfig` - Tesseract-specific options
|
|
483
|
+
|
|
484
|
+
#### `Table`
|
|
485
|
+
Extracted table structure:
|
|
486
|
+
- `markdown: string` - Table in Markdown format
|
|
487
|
+
- `cells: TableCell[][]` - 2D array of table cells
|
|
488
|
+
- `rowCount: number` - Number of rows
|
|
489
|
+
- `columnCount: number` - Number of columns
|
|
490
|
+
|
|
491
|
+
### Exceptions
|
|
492
|
+
|
|
493
|
+
All Kreuzberg exceptions extend the base `KreuzbergError` class:
|
|
494
|
+
|
|
495
|
+
- `KreuzbergError` - Base error class for all Kreuzberg errors
|
|
496
|
+
- `ValidationError` - Invalid configuration, missing required fields, or invalid input
|
|
497
|
+
- `ParsingError` - Document parsing failure or corrupted file
|
|
498
|
+
- `OCRError` - OCR processing failure
|
|
499
|
+
- `MissingDependencyError` - Missing optional system dependency (includes installation instructions)
|
|
500
|
+
|
|
501
|
+
## Supported Formats
|
|
502
|
+
|
|
503
|
+
| Category | Formats |
|
|
504
|
+
|----------|---------|
|
|
505
|
+
| **Documents** | PDF, DOCX, DOC, PPTX, PPT, XLSX, XLS, ODT, ODP, ODS, RTF |
|
|
506
|
+
| **Images** | PNG, JPEG, JPG, WEBP, BMP, TIFF, GIF |
|
|
507
|
+
| **Web** | HTML, XHTML, XML |
|
|
508
|
+
| **Text** | TXT, MD, CSV, TSV, JSON, YAML, TOML |
|
|
509
|
+
| **Email** | EML, MSG |
|
|
510
|
+
| **Archives** | ZIP, TAR, 7Z |
|
|
511
|
+
| **Other** | And 30+ more formats |
|
|
512
|
+
|
|
513
|
+
## Performance
|
|
514
|
+
|
|
515
|
+
Kreuzberg is built with a native Rust core, providing significant performance improvements over pure JavaScript solutions:
|
|
516
|
+
|
|
517
|
+
- **10-50x faster** text extraction compared to pure Node.js libraries
|
|
518
|
+
- **Native multithreading** for batch processing
|
|
519
|
+
- **Optimized memory usage** with streaming for large files
|
|
520
|
+
- **Zero-copy operations** where possible
|
|
521
|
+
- **Efficient caching** to avoid redundant processing
|
|
522
|
+
|
|
523
|
+
### Benchmarks
|
|
524
|
+
|
|
525
|
+
Processing 100 mixed documents (PDF, DOCX, XLSX):
|
|
526
|
+
|
|
527
|
+
| Library | Time | Memory |
|
|
528
|
+
|---------|------|--------|
|
|
529
|
+
| Kreuzberg | 2.3s | 145 MB |
|
|
530
|
+
| pdf-parse + mammoth | 23.1s | 890 MB |
|
|
531
|
+
| textract | 45.2s | 1.2 GB |
|
|
532
|
+
|
|
533
|
+
## Troubleshooting
|
|
534
|
+
|
|
535
|
+
### Native Module Not Found
|
|
536
|
+
|
|
537
|
+
If you encounter errors about missing native modules:
|
|
538
|
+
|
|
539
|
+
```bash
|
|
540
|
+
npm rebuild @kreuzberg/node
|
|
541
|
+
```
|
|
542
|
+
|
|
543
|
+
### OCR Not Working
|
|
544
|
+
|
|
545
|
+
Ensure Tesseract is installed and available in PATH:
|
|
546
|
+
|
|
547
|
+
```bash
|
|
548
|
+
tesseract --version
|
|
549
|
+
```
|
|
550
|
+
|
|
551
|
+
If Tesseract is not found:
|
|
552
|
+
- macOS: `brew install tesseract`
|
|
553
|
+
- Ubuntu: `sudo apt-get install tesseract-ocr`
|
|
554
|
+
- Windows: Download from [tesseract-ocr/tesseract](https://github.com/tesseract-ocr/tesseract)
|
|
555
|
+
|
|
556
|
+
### Memory Issues with Large PDFs
|
|
557
|
+
|
|
558
|
+
For very large PDFs, use chunking to reduce memory usage:
|
|
559
|
+
|
|
560
|
+
```typescript
|
|
561
|
+
const config = {
|
|
562
|
+
chunking: { maxChars: 1000 }
|
|
563
|
+
};
|
|
564
|
+
const result = await extractFile('large.pdf', config);
|
|
565
|
+
```
|
|
566
|
+
|
|
567
|
+
### TypeScript Types Not Resolving
|
|
568
|
+
|
|
569
|
+
Make sure you're using:
|
|
570
|
+
- Node.js 18 or higher
|
|
571
|
+
- TypeScript 5.0 or higher
|
|
572
|
+
|
|
573
|
+
The package includes built-in type definitions.
|
|
574
|
+
|
|
575
|
+
### Performance Optimization
|
|
576
|
+
|
|
577
|
+
For maximum performance when processing many files:
|
|
578
|
+
|
|
579
|
+
```typescript
|
|
580
|
+
// Use batch processing instead of sequential
|
|
581
|
+
const results = await batchExtractFiles(files, {
|
|
582
|
+
maxConcurrentExtractions: 8 // Tune based on CPU cores
|
|
583
|
+
});
|
|
584
|
+
```
|
|
585
|
+
|
|
586
|
+
## Examples
|
|
587
|
+
|
|
588
|
+
### Extract Invoice Data
|
|
589
|
+
|
|
590
|
+
```typescript
|
|
591
|
+
import { extractFile } from '@kreuzberg/node';
|
|
592
|
+
|
|
593
|
+
const result = await extractFile('invoice.pdf');
|
|
594
|
+
|
|
595
|
+
// Access tables for line items
|
|
596
|
+
if (result.tables && result.tables.length > 0) {
|
|
597
|
+
const lineItems = result.tables[0];
|
|
598
|
+
console.log(lineItems.markdown);
|
|
599
|
+
}
|
|
600
|
+
|
|
601
|
+
// Access metadata for invoice details
|
|
602
|
+
if (result.metadata) {
|
|
603
|
+
console.log('Invoice Date:', result.metadata.creationDate);
|
|
604
|
+
}
|
|
605
|
+
```
|
|
606
|
+
|
|
607
|
+
### Process Scanned Documents
|
|
608
|
+
|
|
609
|
+
```typescript
|
|
610
|
+
import { extractFile } from '@kreuzberg/node';
|
|
611
|
+
|
|
612
|
+
const config = {
|
|
613
|
+
forceOcr: true,
|
|
614
|
+
ocr: {
|
|
615
|
+
backend: 'tesseract',
|
|
616
|
+
language: 'eng',
|
|
617
|
+
preprocessing: true
|
|
618
|
+
}
|
|
619
|
+
};
|
|
620
|
+
|
|
621
|
+
const result = await extractFile('scanned-contract.pdf', config);
|
|
622
|
+
console.log(result.content);
|
|
623
|
+
```
|
|
624
|
+
|
|
625
|
+
### Build a Document Search Index
|
|
626
|
+
|
|
627
|
+
```typescript
|
|
628
|
+
import { batchExtractFiles } from '@kreuzberg/node';
|
|
629
|
+
import { glob } from 'glob';
|
|
630
|
+
|
|
631
|
+
// Find all documents
|
|
632
|
+
const files = await glob('documents/**/*.{pdf,docx,xlsx}');
|
|
633
|
+
|
|
634
|
+
// Extract in batches
|
|
635
|
+
const results = await batchExtractFiles(files, {
|
|
636
|
+
maxConcurrentExtractions: 8,
|
|
637
|
+
enableQualityProcessing: true
|
|
638
|
+
});
|
|
639
|
+
|
|
640
|
+
// Build search index
|
|
641
|
+
const searchIndex = results.map((result, i) => ({
|
|
642
|
+
path: files[i],
|
|
643
|
+
content: result.content,
|
|
644
|
+
metadata: result.metadata
|
|
645
|
+
}));
|
|
646
|
+
|
|
647
|
+
console.log(`Indexed ${searchIndex.length} documents`);
|
|
648
|
+
```
|
|
649
|
+
|
|
650
|
+
## Documentation
|
|
651
|
+
|
|
652
|
+
For comprehensive documentation, visit [https://kreuzberg.dev](https://kreuzberg.dev).
|
|
653
|
+
|
|
654
|
+
## Contributing
|
|
655
|
+
|
|
656
|
+
We welcome contributions! Please see our [Contributing Guide](https://github.com/kreuzberg-dev/kreuzberg/blob/main/docs/contributing.md) for details.
|
|
657
|
+
|
|
658
|
+
## License
|
|
659
|
+
|
|
660
|
+
MIT
|
|
661
|
+
|
|
662
|
+
## Links
|
|
663
|
+
|
|
664
|
+
- [Website](https://kreuzberg.dev)
|
|
665
|
+
- [Documentation](https://kreuzberg.dev)
|
|
666
|
+
- [GitHub](https://github.com/kreuzberg-dev/kreuzberg)
|
|
667
|
+
- [Issue Tracker](https://github.com/kreuzberg-dev/kreuzberg/issues)
|
|
668
|
+
- [Changelog](https://github.com/kreuzberg-dev/kreuzberg/blob/main/CHANGELOG.md)
|
|
669
|
+
- [npm Package](https://www.npmjs.com/package/@kreuzberg/node)
|