@kreuzberg/node 4.0.0-rc.8 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +342 -530
- package/dist/cli.d.mts +4 -0
- package/dist/cli.d.ts +4 -0
- package/dist/cli.js +12 -2
- package/dist/cli.js.map +1 -1
- package/dist/cli.mjs +12 -1
- package/dist/cli.mjs.map +1 -1
- package/dist/index.d.mts +337 -62
- package/dist/index.d.ts +337 -62
- package/dist/index.js +285 -56
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +277 -56
- package/dist/index.mjs.map +1 -1
- package/dist/types.d.mts +469 -54
- package/dist/types.d.ts +469 -54
- package/dist/types.js.map +1 -1
- package/index.d.ts +662 -1
- package/index.js +85 -55
- package/metadata.d.ts +53 -33
- package/package.json +17 -19
package/README.md
CHANGED
|
@@ -1,700 +1,512 @@
|
|
|
1
|
-
#
|
|
1
|
+
# TypeScript (Node.js)
|
|
2
|
+
|
|
3
|
+
<div align="center" style="display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin: 20px 0;">
|
|
4
|
+
<!-- Language Bindings -->
|
|
5
|
+
<a href="https://crates.io/crates/kreuzberg">
|
|
6
|
+
<img src="https://img.shields.io/crates/v/kreuzberg?label=Rust&color=007ec6" alt="Rust">
|
|
7
|
+
</a>
|
|
8
|
+
<a href="https://hex.pm/packages/kreuzberg">
|
|
9
|
+
<img src="https://img.shields.io/hexpm/v/kreuzberg?label=Elixir&color=007ec6" alt="Elixir">
|
|
10
|
+
</a>
|
|
11
|
+
<a href="https://pypi.org/project/kreuzberg/">
|
|
12
|
+
<img src="https://img.shields.io/pypi/v/kreuzberg?label=Python&color=007ec6" alt="Python">
|
|
13
|
+
</a>
|
|
14
|
+
<a href="https://www.npmjs.com/package/@kreuzberg/node">
|
|
15
|
+
<img src="https://img.shields.io/npm/v/@kreuzberg/node?label=Node.js&color=007ec6" alt="Node.js">
|
|
16
|
+
</a>
|
|
17
|
+
<a href="https://www.npmjs.com/package/@kreuzberg/wasm">
|
|
18
|
+
<img src="https://img.shields.io/npm/v/@kreuzberg/wasm?label=WASM&color=007ec6" alt="WASM">
|
|
19
|
+
</a>
|
|
20
|
+
|
|
21
|
+
<a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg">
|
|
22
|
+
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
|
+
</a>
|
|
24
|
+
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.0.0" alt="Go">
|
|
26
|
+
</a>
|
|
27
|
+
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
|
+
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
29
|
+
</a>
|
|
30
|
+
<a href="https://packagist.org/packages/kreuzberg/kreuzberg">
|
|
31
|
+
<img src="https://img.shields.io/packagist/v/kreuzberg/kreuzberg?label=PHP&color=007ec6" alt="PHP">
|
|
32
|
+
</a>
|
|
33
|
+
<a href="https://rubygems.org/gems/kreuzberg">
|
|
34
|
+
<img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
|
|
35
|
+
</a>
|
|
36
|
+
|
|
37
|
+
<!-- Project Info -->
|
|
38
|
+
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
|
|
39
|
+
<img src="https://img.shields.io/badge/License-MIT-blue.svg" alt="License">
|
|
40
|
+
</a>
|
|
41
|
+
<a href="https://docs.kreuzberg.dev">
|
|
42
|
+
<img src="https://img.shields.io/badge/docs-kreuzberg.dev-blue" alt="Documentation">
|
|
43
|
+
</a>
|
|
44
|
+
</div>
|
|
45
|
+
|
|
46
|
+
<img width="1128" height="191" alt="Banner2" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
|
|
47
|
+
|
|
48
|
+
<div align="center" style="margin-top: 20px;">
|
|
49
|
+
<a href="https://discord.gg/pXxagNK2zN">
|
|
50
|
+
<img height="22" src="https://img.shields.io/badge/Discord-Join%20our%20community-7289da?logo=discord&logoColor=white" alt="Discord">
|
|
51
|
+
</a>
|
|
52
|
+
</div>
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
Extract text, tables, images, and metadata from 56 file formats including PDF, Office documents, and images. Native NAPI-RS bindings for Node.js with superior performance, async/await support, and TypeScript type definitions.
|
|
2
56
|
|
|
3
|
-
[](https://crates.io/crates/kreuzberg)
|
|
4
|
-
[](https://pypi.org/project/kreuzberg/)
|
|
5
|
-
[](https://www.npmjs.com/package/@kreuzberg/node)
|
|
6
|
-
[](https://www.npmjs.com/package/@kreuzberg/wasm)
|
|
7
|
-
[](https://rubygems.org/gems/kreuzberg)
|
|
8
|
-
[](https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg)
|
|
9
|
-
[](https://pkg.go.dev/github.com/kreuzberg-dev/kreuzberg)
|
|
10
|
-
[](https://www.nuget.org/packages/Goldziher.Kreuzberg/)
|
|
11
57
|
|
|
12
|
-
|
|
13
|
-
[](https://kreuzberg.dev/)
|
|
14
|
-
[](https://discord.gg/pXxagNK2zN)
|
|
58
|
+
## Installation
|
|
15
59
|
|
|
16
|
-
|
|
60
|
+
### Package Installation
|
|
17
61
|
|
|
18
|
-
Extract text, tables, images, and metadata from 56 file formats including PDF, DOCX, PPTX, XLSX, images, and more.
|
|
19
62
|
|
|
20
|
-
|
|
21
|
-
>
|
|
22
|
-
> For browser, Deno, or Cloudflare Workers, use [@kreuzberg/wasm](../kreuzberg-wasm/) instead.
|
|
63
|
+
Install via one of the supported package managers:
|
|
23
64
|
|
|
24
|
-
> **Version 4.0.0 Release Candidate**
|
|
25
|
-
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
26
65
|
|
|
27
|
-
## Features
|
|
28
66
|
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
- **Zero-Copy Operations**: Direct system calls and minimal data copying
|
|
34
|
-
- **Type-Safe**: Full TypeScript definitions for all methods, configurations, and return types
|
|
35
|
-
- **Async/Sync APIs**: Both asynchronous and synchronous extraction methods
|
|
36
|
-
- **Batch Processing**: Process multiple documents in parallel with optimized concurrency
|
|
37
|
-
- **Language Detection**: Automatic language detection for extracted text
|
|
38
|
-
- **Text Chunking**: Split long documents into manageable chunks for LLM processing
|
|
39
|
-
- **Caching**: Built-in result caching for faster repeated extractions
|
|
40
|
-
- **Zero Configuration**: Works out of the box with sensible defaults
|
|
67
|
+
**npm:**
|
|
68
|
+
```bash
|
|
69
|
+
npm install @kreuzberg/node
|
|
70
|
+
```
|
|
41
71
|
|
|
42
|
-
## Why Use This Package?
|
|
43
72
|
|
|
44
|
-
Choose `@kreuzberg/node` if you're building with:
|
|
45
73
|
|
|
46
|
-
- **Node.js 18+** - Native bindings provide direct access to system resources
|
|
47
|
-
- **Bun** - Full compatibility with Bun's Node.js API
|
|
48
|
-
- **Performance-critical applications** - Processing large document batches or real-time extraction
|
|
49
|
-
- **Server-side extraction** - APIs, microservices, document processing pipelines
|
|
50
74
|
|
|
51
|
-
|
|
75
|
+
**pnpm:**
|
|
76
|
+
```bash
|
|
77
|
+
pnpm add @kreuzberg/node
|
|
78
|
+
```
|
|
52
79
|
|
|
53
|
-
| Aspect | `@kreuzberg/node` | `@kreuzberg/wasm` |
|
|
54
|
-
|--------|------------------|-------------------|
|
|
55
|
-
| **Performance** | 2-3x faster (native) | Standard baseline |
|
|
56
|
-
| **Environment** | Node.js, Bun | Browser, Deno, Workers, Node.js |
|
|
57
|
-
| **Bundle Size** | 10-15 MB (prebuilt binary) | 2-4 MB (WASM module) |
|
|
58
|
-
| **System Access** | Direct system calls | Sandboxed via WASM |
|
|
59
|
-
| **Best For** | Server-side, batch processing | Client-side, edge computing |
|
|
60
80
|
|
|
61
|
-
Use `@kreuzberg/wasm` for browser applications, Cloudflare Workers, Deno, or when you need a smaller bundle size.
|
|
62
81
|
|
|
63
|
-
## Requirements
|
|
64
82
|
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
- Windows (x64, arm64)
|
|
83
|
+
**yarn:**
|
|
84
|
+
```bash
|
|
85
|
+
yarn add @kreuzberg/node
|
|
86
|
+
```
|
|
70
87
|
|
|
71
|
-
### Optional System Dependencies
|
|
72
88
|
|
|
73
|
-
- **Tesseract**: For OCR functionality
|
|
74
|
-
- macOS: `brew install tesseract`
|
|
75
|
-
- Ubuntu: `sudo apt-get install tesseract-ocr`
|
|
76
|
-
- Windows: Download from [GitHub](https://github.com/tesseract-ocr/tesseract)
|
|
77
89
|
|
|
78
|
-
- **LibreOffice**: For legacy MS Office formats (.doc, .ppt)
|
|
79
|
-
- macOS: `brew install libreoffice`
|
|
80
|
-
- Ubuntu: `sudo apt-get install libreoffice`
|
|
81
90
|
|
|
82
|
-
- **Pandoc**: For advanced document conversion
|
|
83
|
-
- macOS: `brew install pandoc`
|
|
84
|
-
- Ubuntu: `sudo apt-get install pandoc`
|
|
85
91
|
|
|
86
|
-
|
|
92
|
+
### System Requirements
|
|
87
93
|
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
94
|
+
- **Node.js 22+** required (NAPI-RS native bindings)
|
|
95
|
+
- Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.22.x for embeddings support
|
|
96
|
+
- Optional: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for OCR functionality
|
|
91
97
|
|
|
92
|
-
|
|
98
|
+
- Optional: [LibreOffice](https://www.libreoffice.org/download/download/) for legacy Office formats (DOC, XLS, PPT, RTF, ODT, ODS, ODP)
|
|
93
99
|
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
100
|
+
**Format Support Notes:**
|
|
101
|
+
- Modern Office formats (DOCX, XLSX, PPTX) work without LibreOffice
|
|
102
|
+
- Legacy formats (DOC, XLS, PPT) require LibreOffice installation
|
|
103
|
+
- WASM binding does NOT support LibreOffice formats (use Node.js for full format support)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
### Platform Support
|
|
108
|
+
|
|
109
|
+
Pre-built binaries available for:
|
|
110
|
+
- macOS (arm64, x64)
|
|
111
|
+
- Linux (x64)
|
|
112
|
+
- Windows (x64)
|
|
97
113
|
|
|
98
|
-
Or with yarn:
|
|
99
114
|
|
|
100
|
-
```bash
|
|
101
|
-
yarn add @kreuzberg/node
|
|
102
|
-
```
|
|
103
115
|
|
|
104
|
-
The package includes prebuilt native binaries for major platforms. No additional build steps required.
|
|
105
116
|
|
|
106
117
|
## Quick Start
|
|
107
118
|
|
|
108
119
|
### Basic Extraction
|
|
109
120
|
|
|
121
|
+
Extract text, metadata, and structure from any supported document format:
|
|
122
|
+
|
|
110
123
|
```typescript
|
|
111
124
|
import { extractFileSync } from '@kreuzberg/node';
|
|
112
125
|
|
|
113
|
-
|
|
114
|
-
|
|
126
|
+
const config = {
|
|
127
|
+
useCache: true,
|
|
128
|
+
enableQualityProcessing: true,
|
|
129
|
+
};
|
|
130
|
+
|
|
131
|
+
const result = extractFileSync('document.pdf', null, config);
|
|
132
|
+
|
|
115
133
|
console.log(result.content);
|
|
116
|
-
console.log(result.
|
|
134
|
+
console.log(`MIME Type: ${result.mimeType}`);
|
|
117
135
|
```
|
|
118
136
|
|
|
119
|
-
|
|
137
|
+
|
|
138
|
+
### Common Use Cases
|
|
139
|
+
|
|
140
|
+
#### Extract with Custom Configuration
|
|
141
|
+
|
|
142
|
+
Most use cases benefit from configuration to control extraction behavior:
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
**With OCR (for scanned documents):**
|
|
120
146
|
|
|
121
147
|
```typescript
|
|
122
148
|
import { extractFile } from '@kreuzberg/node';
|
|
123
149
|
|
|
124
|
-
|
|
125
|
-
|
|
150
|
+
const config = {
|
|
151
|
+
ocr: {
|
|
152
|
+
backend: 'tesseract',
|
|
153
|
+
language: 'eng+fra',
|
|
154
|
+
tesseractConfig: {
|
|
155
|
+
psm: 3,
|
|
156
|
+
},
|
|
157
|
+
},
|
|
158
|
+
};
|
|
159
|
+
|
|
160
|
+
const result = await extractFile('document.pdf', null, config);
|
|
126
161
|
console.log(result.content);
|
|
127
|
-
console.log(result.tables);
|
|
128
162
|
```
|
|
129
163
|
|
|
130
|
-
### With Full Type Safety
|
|
131
164
|
|
|
132
|
-
```typescript
|
|
133
|
-
import {
|
|
134
|
-
extractFile,
|
|
135
|
-
type ExtractionConfig,
|
|
136
|
-
type ExtractionResult
|
|
137
|
-
} from '@kreuzberg/node';
|
|
138
|
-
|
|
139
|
-
const config: ExtractionConfig = {
|
|
140
|
-
useCache: true,
|
|
141
|
-
enableQualityProcessing: true
|
|
142
|
-
};
|
|
143
165
|
|
|
144
|
-
const result: ExtractionResult = await extractFile('invoice.pdf', config);
|
|
145
166
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
167
|
+
#### Table Extraction
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
```typescript
|
|
171
|
+
import { extractFileSync } from '@kreuzberg/node';
|
|
150
172
|
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
173
|
+
const result = extractFileSync('document.pdf');
|
|
174
|
+
|
|
175
|
+
for (const table of result.tables) {
|
|
176
|
+
console.log(`Table with ${table.cells.length} rows`);
|
|
177
|
+
console.log(`Page: ${table.pageNumber}`);
|
|
178
|
+
console.log(table.markdown);
|
|
155
179
|
}
|
|
156
180
|
```
|
|
157
181
|
|
|
158
|
-
## Configuration
|
|
159
182
|
|
|
160
|
-
### OCR Configuration
|
|
161
183
|
|
|
162
|
-
```typescript
|
|
163
|
-
import { extractFile, type ExtractionConfig, type OcrConfig } from '@kreuzberg/node';
|
|
164
|
-
|
|
165
|
-
const config: ExtractionConfig = {
|
|
166
|
-
ocr: {
|
|
167
|
-
backend: 'tesseract',
|
|
168
|
-
language: 'eng',
|
|
169
|
-
tesseractConfig: {
|
|
170
|
-
enableTableDetection: true,
|
|
171
|
-
psm: 6,
|
|
172
|
-
minConfidence: 50.0
|
|
173
|
-
}
|
|
174
|
-
} as OcrConfig
|
|
175
|
-
};
|
|
176
184
|
|
|
177
|
-
|
|
178
|
-
console.log(result.content);
|
|
179
|
-
```
|
|
185
|
+
#### Processing Multiple Files
|
|
180
186
|
|
|
181
|
-
### PDF Password Protection
|
|
182
187
|
|
|
183
188
|
```typescript
|
|
184
|
-
import {
|
|
189
|
+
import { batchExtractFilesSync } from '@kreuzberg/node';
|
|
185
190
|
|
|
186
|
-
const
|
|
187
|
-
|
|
188
|
-
passwords: ['password1', 'password2'],
|
|
189
|
-
extractImages: true,
|
|
190
|
-
extractMetadata: true
|
|
191
|
-
} as PdfConfig
|
|
192
|
-
};
|
|
191
|
+
const files = ['doc1.pdf', 'doc2.docx', 'doc3.pptx'];
|
|
192
|
+
const results = batchExtractFilesSync(files);
|
|
193
193
|
|
|
194
|
-
|
|
194
|
+
results.forEach((result, i) => {
|
|
195
|
+
console.log(`File ${i + 1}: ${result.content.length} characters`);
|
|
196
|
+
});
|
|
195
197
|
```
|
|
196
198
|
|
|
197
|
-
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
#### Async Processing
|
|
204
|
+
|
|
205
|
+
For non-blocking document processing:
|
|
198
206
|
|
|
199
207
|
```typescript
|
|
200
208
|
import { extractFile } from '@kreuzberg/node';
|
|
201
209
|
|
|
202
|
-
const result = await extractFile('
|
|
210
|
+
const result = await extractFile('document.pdf');
|
|
211
|
+
console.log(result.content);
|
|
212
|
+
```
|
|
203
213
|
|
|
204
|
-
if (result.tables) {
|
|
205
|
-
for (const table of result.tables) {
|
|
206
|
-
console.log('Table as Markdown:');
|
|
207
|
-
console.log(table.markdown);
|
|
208
214
|
|
|
209
|
-
console.log('Table cells:');
|
|
210
|
-
console.log(JSON.stringify(table.cells, null, 2));
|
|
211
|
-
}
|
|
212
|
-
}
|
|
213
|
-
```
|
|
214
215
|
|
|
215
|
-
### Text Chunking
|
|
216
216
|
|
|
217
|
-
```typescript
|
|
218
|
-
import { extractFile, type ChunkingConfig } from '@kreuzberg/node';
|
|
219
217
|
|
|
220
|
-
|
|
221
|
-
chunking: {
|
|
222
|
-
maxChars: 1000,
|
|
223
|
-
maxOverlap: 200
|
|
224
|
-
} as ChunkingConfig
|
|
225
|
-
};
|
|
218
|
+
#### Configuration Discovery
|
|
226
219
|
|
|
227
|
-
|
|
220
|
+
```typescript
|
|
221
|
+
import { ExtractionConfig, extractFile } from '@kreuzberg/node';
|
|
228
222
|
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
223
|
+
const config = ExtractionConfig.discover();
|
|
224
|
+
if (config) {
|
|
225
|
+
console.log('Found configuration file');
|
|
226
|
+
const result = await extractFile('document.pdf', null, config);
|
|
227
|
+
console.log(result.content);
|
|
228
|
+
} else {
|
|
229
|
+
console.log('No configuration file found, using defaults');
|
|
230
|
+
const result = await extractFile('document.pdf');
|
|
231
|
+
console.log(result.content);
|
|
233
232
|
}
|
|
234
233
|
```
|
|
235
234
|
|
|
236
|
-
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
#### Worker Thread Pool
|
|
237
240
|
|
|
238
241
|
```typescript
|
|
239
|
-
import {
|
|
242
|
+
import { createWorkerPool, extractFileInWorker, batchExtractFilesInWorker, closeWorkerPool } from '@kreuzberg/node';
|
|
240
243
|
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
enabled: true,
|
|
244
|
-
minConfidence: 0.8,
|
|
245
|
-
detectMultiple: false
|
|
246
|
-
} as LanguageDetectionConfig
|
|
247
|
-
};
|
|
244
|
+
// Create a pool with 4 worker threads
|
|
245
|
+
const pool = createWorkerPool(4);
|
|
248
246
|
|
|
249
|
-
|
|
247
|
+
try {
|
|
248
|
+
// Extract single file in worker
|
|
249
|
+
const result = await extractFileInWorker(pool, 'document.pdf', null, {
|
|
250
|
+
useCache: true
|
|
251
|
+
});
|
|
252
|
+
console.log(result.content);
|
|
250
253
|
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
+
// Extract multiple files concurrently
|
|
255
|
+
const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
|
|
256
|
+
const results = await batchExtractFilesInWorker(pool, files, {
|
|
257
|
+
useCache: true
|
|
258
|
+
});
|
|
259
|
+
|
|
260
|
+
results.forEach((result, i) => {
|
|
261
|
+
console.log(`File ${i + 1}: ${result.content.length} characters`);
|
|
262
|
+
});
|
|
263
|
+
} finally {
|
|
264
|
+
// Always close the pool when done
|
|
265
|
+
await closeWorkerPool(pool);
|
|
254
266
|
}
|
|
255
267
|
```
|
|
256
268
|
|
|
257
|
-
### Image Extraction
|
|
258
269
|
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
270
|
+
**Performance Benefits:**
|
|
271
|
+
- **Parallel Processing**: Multiple documents extracted simultaneously
|
|
272
|
+
- **CPU Utilization**: Maximizes multi-core CPU usage for large batches
|
|
273
|
+
- **Queue Management**: Automatically distributes work across available workers
|
|
274
|
+
- **Resource Control**: Prevents thread exhaustion with configurable pool size
|
|
262
275
|
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
autoAdjustDpi: true
|
|
269
|
-
} as ImageExtractionConfig
|
|
270
|
-
};
|
|
276
|
+
**Best Practices:**
|
|
277
|
+
- Use worker pools for batches of 10+ documents
|
|
278
|
+
- Set the pool size to the number of CPU cores (this is the default behavior)
|
|
279
|
+
- Always close pools with `closeWorkerPool()` to prevent resource leaks
|
|
280
|
+
- Reuse pools across multiple batch operations for efficiency
|
|
271
281
|
|
|
272
|
-
const result = await extractFile('document-with-images.pdf', config);
|
|
273
282
|
|
|
274
|
-
if (result.images) {
|
|
275
|
-
for (let i = 0; i < result.images.length; i++) {
|
|
276
|
-
const image = result.images[i];
|
|
277
|
-
await writeFile(`image-${i}.${image.format}`, Buffer.from(image.data));
|
|
278
|
-
}
|
|
279
|
-
}
|
|
280
|
-
```
|
|
281
283
|
|
|
282
|
-
###
|
|
284
|
+
### Next Steps
|
|
283
285
|
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
type ChunkingConfig,
|
|
290
|
-
type ImageExtractionConfig,
|
|
291
|
-
type PdfConfig,
|
|
292
|
-
type TokenReductionConfig,
|
|
293
|
-
type LanguageDetectionConfig
|
|
294
|
-
} from '@kreuzberg/node';
|
|
295
|
-
|
|
296
|
-
const config: ExtractionConfig = {
|
|
297
|
-
useCache: true,
|
|
298
|
-
enableQualityProcessing: true,
|
|
299
|
-
forceOcr: false,
|
|
300
|
-
maxConcurrentExtractions: 8,
|
|
301
|
-
|
|
302
|
-
ocr: {
|
|
303
|
-
backend: 'tesseract',
|
|
304
|
-
language: 'eng',
|
|
305
|
-
preprocessing: true,
|
|
306
|
-
tesseractConfig: {
|
|
307
|
-
enableTableDetection: true,
|
|
308
|
-
psm: 6,
|
|
309
|
-
oem: 3,
|
|
310
|
-
minConfidence: 50.0
|
|
311
|
-
}
|
|
312
|
-
} as OcrConfig,
|
|
313
|
-
|
|
314
|
-
chunking: {
|
|
315
|
-
maxChars: 1000,
|
|
316
|
-
maxOverlap: 200
|
|
317
|
-
} as ChunkingConfig,
|
|
318
|
-
|
|
319
|
-
images: {
|
|
320
|
-
extractImages: true,
|
|
321
|
-
targetDpi: 300,
|
|
322
|
-
maxImageDimension: 4096,
|
|
323
|
-
autoAdjustDpi: true
|
|
324
|
-
} as ImageExtractionConfig,
|
|
325
|
-
|
|
326
|
-
pdfOptions: {
|
|
327
|
-
extractImages: true,
|
|
328
|
-
passwords: [],
|
|
329
|
-
extractMetadata: true
|
|
330
|
-
} as PdfConfig,
|
|
331
|
-
|
|
332
|
-
tokenReduction: {
|
|
333
|
-
mode: 'moderate',
|
|
334
|
-
preserveImportantWords: true
|
|
335
|
-
} as TokenReductionConfig,
|
|
336
|
-
|
|
337
|
-
languageDetection: {
|
|
338
|
-
enabled: true,
|
|
339
|
-
minConfidence: 0.8,
|
|
340
|
-
detectMultiple: false
|
|
341
|
-
} as LanguageDetectionConfig
|
|
342
|
-
};
|
|
286
|
+
- **[Installation Guide](https://kreuzberg.dev/getting-started/installation/)** - Platform-specific setup
|
|
287
|
+
- **[API Documentation](https://kreuzberg.dev/api/)** - Complete API reference
|
|
288
|
+
- **[Examples & Guides](https://kreuzberg.dev/guides/)** - Full code examples and usage guides
|
|
289
|
+
- **[Configuration Guide](https://kreuzberg.dev/configuration/)** - Advanced configuration options
|
|
290
|
+
- **[Troubleshooting](https://kreuzberg.dev/troubleshooting/)** - Common issues and solutions
|
|
343
291
|
|
|
344
|
-
const result = await extractFile('document.pdf', config);
|
|
345
|
-
```
|
|
346
292
|
|
|
347
|
-
## Advanced Usage
|
|
348
293
|
|
|
349
|
-
|
|
294
|
+
## NAPI-RS Implementation Details
|
|
350
295
|
|
|
351
|
-
|
|
352
|
-
import { extractBytes } from '@kreuzberg/node';
|
|
353
|
-
import { readFile } from 'fs/promises';
|
|
296
|
+
### Native Performance
|
|
354
297
|
|
|
355
|
-
|
|
356
|
-
const result = await extractBytes(buffer, 'application/pdf');
|
|
357
|
-
console.log(result.content);
|
|
358
|
-
```
|
|
298
|
+
This binding uses NAPI-RS to provide native Node.js bindings with:
|
|
359
299
|
|
|
360
|
-
|
|
300
|
+
- **Zero-copy data transfer** between JavaScript and Rust layers
|
|
301
|
+
- **Native thread pool** for concurrent document processing
|
|
302
|
+
- **Direct memory management** for efficient large document handling
|
|
303
|
+
- **Binary-compatible** pre-built native modules across platforms
|
|
361
304
|
|
|
362
|
-
|
|
363
|
-
import { batchExtractFiles } from '@kreuzberg/node';
|
|
305
|
+
### Threading Model
|
|
364
306
|
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
];
|
|
307
|
+
- Single documents are processed synchronously or asynchronously in a dedicated thread
|
|
308
|
+
- Batch operations distribute work across available CPU cores
|
|
309
|
+
- Thread count is configurable but defaults to system CPU count
|
|
310
|
+
- Long-running extractions block the event loop unless you use the async APIs
|
|
370
311
|
|
|
371
|
-
|
|
312
|
+
### Memory Management
|
|
372
313
|
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
314
|
+
- Large documents (> 100 MB) are streamed to avoid loading them entirely into memory
|
|
315
|
+
- Temporary files are created in the system temp directory for extraction
|
|
316
|
+
- Memory is automatically released after extraction completion
|
|
317
|
+
- ONNX models are cached in memory for repeated embedding operations
|
|
377
318
|
|
|
378
|
-
### Batch Processing with Custom Concurrency
|
|
379
319
|
|
|
380
|
-
```typescript
|
|
381
|
-
import { batchExtractFiles } from '@kreuzberg/node';
|
|
382
320
|
|
|
383
|
-
|
|
384
|
-
maxConcurrentExtractions: 4 // Process 4 files at a time
|
|
385
|
-
};
|
|
321
|
+
## Features
|
|
386
322
|
|
|
387
|
-
|
|
388
|
-
const results = await batchExtractFiles(files, config);
|
|
323
|
+
### Supported File Formats (56+)
|
|
389
324
|
|
|
390
|
-
|
|
391
|
-
```
|
|
325
|
+
56 file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
|
|
392
326
|
|
|
393
|
-
|
|
327
|
+
#### Office Documents
|
|
394
328
|
|
|
395
|
-
|
|
396
|
-
|
|
329
|
+
| Category | Formats | Capabilities |
|
|
330
|
+
|----------|---------|--------------|
|
|
331
|
+
| **Word Processing** | `.docx`, `.odt` | Full text, tables, images, metadata, styles |
|
|
332
|
+
| **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.ods` | Sheet data, formulas, cell metadata, charts |
|
|
333
|
+
| **Presentations** | `.pptx`, `.ppt`, `.ppsx` | Slides, speaker notes, images, metadata |
|
|
334
|
+
| **PDF** | `.pdf` | Text, tables, images, metadata, OCR support |
|
|
335
|
+
| **eBooks** | `.epub`, `.fb2` | Chapters, metadata, embedded resources |
|
|
397
336
|
|
|
398
|
-
|
|
337
|
+
#### Images (OCR-Enabled)
|
|
399
338
|
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
console.log('Word Count:', result.metadata.wordCount);
|
|
406
|
-
}
|
|
407
|
-
```
|
|
339
|
+
| Category | Formats | Features |
|
|
340
|
+
|----------|---------|----------|
|
|
341
|
+
| **Raster** | `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.bmp`, `.tiff`, `.tif` | OCR, table detection, EXIF metadata, dimensions, color space |
|
|
342
|
+
| **Advanced** | `.jp2`, `.jpx`, `.jpm`, `.mj2`, `.pnm`, `.pbm`, `.pgm`, `.ppm` | OCR, table detection, format-specific metadata |
|
|
343
|
+
| **Vector** | `.svg` | DOM parsing, embedded text, graphics metadata |
|
|
408
344
|
|
|
409
|
-
|
|
345
|
+
#### Web & Data
|
|
410
346
|
|
|
411
|
-
|
|
412
|
-
|
|
347
|
+
| Category | Formats | Features |
|
|
348
|
+
|----------|---------|----------|
|
|
349
|
+
| **Markup** | `.html`, `.htm`, `.xhtml`, `.xml`, `.svg` | DOM parsing, metadata (Open Graph, Twitter Card), link extraction |
|
|
350
|
+
| **Structured Data** | `.json`, `.yaml`, `.yml`, `.toml`, `.csv`, `.tsv` | Schema detection, nested structures, validation |
|
|
351
|
+
| **Text & Markdown** | `.txt`, `.md`, `.markdown`, `.rst`, `.org`, `.rtf` | CommonMark, GFM, reStructuredText, Org Mode |
|
|
413
352
|
|
|
414
|
-
|
|
415
|
-
tokenReduction: {
|
|
416
|
-
mode: 'aggressive', // Options: 'light', 'moderate', 'aggressive'
|
|
417
|
-
preserveImportantWords: true
|
|
418
|
-
} as TokenReductionConfig
|
|
419
|
-
};
|
|
353
|
+
#### Email & Archives
|
|
420
354
|
|
|
421
|
-
|
|
355
|
+
| Category | Formats | Features |
|
|
356
|
+
|----------|---------|----------|
|
|
357
|
+
| **Email** | `.eml`, `.msg` | Headers, body (HTML/plain), attachments, threading |
|
|
358
|
+
| **Archives** | `.zip`, `.tar`, `.tgz`, `.gz`, `.7z` | File listing, nested archives, metadata |
|
|
422
359
|
|
|
423
|
-
|
|
424
|
-
console.log(`Original length: ${result.content.length}`);
|
|
425
|
-
console.log(`Processed for LLM context window`);
|
|
426
|
-
```
|
|
360
|
+
#### Academic & Scientific
|
|
427
361
|
|
|
428
|
-
|
|
362
|
+
| Category | Formats | Features |
|
|
363
|
+
|----------|---------|----------|
|
|
364
|
+
| **Citations** | `.bib`, `.biblatex`, `.ris`, `.enw`, `.csl` | Bibliography parsing, citation extraction |
|
|
365
|
+
| **Scientific** | `.tex`, `.latex`, `.typst`, `.jats`, `.ipynb`, `.docbook` | LaTeX, Jupyter notebooks, PubMed JATS |
|
|
366
|
+
| **Documentation** | `.opml`, `.pod`, `.mdoc`, `.troff` | Technical documentation formats |
|
|
429
367
|
|
|
430
|
-
|
|
431
|
-
import {
|
|
432
|
-
extractFile,
|
|
433
|
-
KreuzbergError,
|
|
434
|
-
ValidationError,
|
|
435
|
-
ParsingError,
|
|
436
|
-
OCRError,
|
|
437
|
-
MissingDependencyError
|
|
438
|
-
} from '@kreuzberg/node';
|
|
368
|
+
**[Complete Format Reference](https://kreuzberg.dev/reference/formats/)**
|
|
439
369
|
|
|
440
|
-
|
|
441
|
-
const result = await extractFile('document.pdf');
|
|
442
|
-
console.log(result.content);
|
|
443
|
-
} catch (error) {
|
|
444
|
-
if (error instanceof ValidationError) {
|
|
445
|
-
console.error('Invalid configuration or input:', error.message);
|
|
446
|
-
} else if (error instanceof ParsingError) {
|
|
447
|
-
console.error('Failed to parse document:', error.message);
|
|
448
|
-
} else if (error instanceof OCRError) {
|
|
449
|
-
console.error('OCR processing failed:', error.message);
|
|
450
|
-
} else if (error instanceof MissingDependencyError) {
|
|
451
|
-
console.error(`Missing dependency: ${error.dependency}`);
|
|
452
|
-
console.error('Installation instructions:', error.message);
|
|
453
|
-
} else if (error instanceof KreuzbergError) {
|
|
454
|
-
console.error('Kreuzberg error:', error.message);
|
|
455
|
-
} else {
|
|
456
|
-
throw error;
|
|
457
|
-
}
|
|
458
|
-
}
|
|
459
|
-
```
|
|
370
|
+
### Key Capabilities
|
|
460
371
|
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
Asynchronously extract content from a file.
|
|
467
|
-
|
|
468
|
-
#### `extractFileSync(filePath: string, config?: ExtractionConfig): ExtractionResult`
|
|
469
|
-
Synchronously extract content from a file.
|
|
470
|
-
|
|
471
|
-
#### `extractBytes(data: Buffer, mimeType: string, config?: ExtractionConfig): Promise<ExtractionResult>`
|
|
472
|
-
Asynchronously extract content from a buffer.
|
|
473
|
-
|
|
474
|
-
#### `extractBytesSync(data: Buffer, mimeType: string, config?: ExtractionConfig): ExtractionResult`
|
|
475
|
-
Synchronously extract content from a buffer.
|
|
476
|
-
|
|
477
|
-
#### `batchExtractFiles(paths: string[], config?: ExtractionConfig): Promise<ExtractionResult[]>`
|
|
478
|
-
Asynchronously extract content from multiple files in parallel.
|
|
479
|
-
|
|
480
|
-
#### `batchExtractFilesSync(paths: string[], config?: ExtractionConfig): ExtractionResult[]`
|
|
481
|
-
Synchronously extract content from multiple files.
|
|
482
|
-
|
|
483
|
-
### Types
|
|
484
|
-
|
|
485
|
-
#### `ExtractionResult`
|
|
486
|
-
Main result object containing:
|
|
487
|
-
- `content: string` - Extracted text content
|
|
488
|
-
- `mimeType: string` - MIME type of the document
|
|
489
|
-
- `metadata?: Metadata` - Document metadata
|
|
490
|
-
- `tables?: Table[]` - Extracted tables
|
|
491
|
-
- `images?: ImageData[]` - Extracted images
|
|
492
|
-
- `chunks?: Chunk[]` - Text chunks (if chunking enabled)
|
|
493
|
-
- `language?: LanguageInfo` - Detected language (if enabled)
|
|
494
|
-
|
|
495
|
-
#### `ExtractionConfig`
|
|
496
|
-
Configuration object for extraction:
|
|
497
|
-
- `useCache?: boolean` - Enable result caching
|
|
498
|
-
- `enableQualityProcessing?: boolean` - Enable text quality improvements
|
|
499
|
-
- `forceOcr?: boolean` - Force OCR even for text-based PDFs
|
|
500
|
-
- `maxConcurrentExtractions?: number` - Max parallel extractions
|
|
501
|
-
- `ocr?: OcrConfig` - OCR settings
|
|
502
|
-
- `chunking?: ChunkingConfig` - Text chunking settings
|
|
503
|
-
- `images?: ImageExtractionConfig` - Image extraction settings
|
|
504
|
-
- `pdfOptions?: PdfConfig` - PDF-specific options
|
|
505
|
-
- `tokenReduction?: TokenReductionConfig` - Token reduction settings
|
|
506
|
-
- `languageDetection?: LanguageDetectionConfig` - Language detection settings
|
|
507
|
-
|
|
508
|
-
#### `OcrConfig`
|
|
509
|
-
OCR configuration:
|
|
510
|
-
- `backend: string` - OCR backend ('tesseract', 'easyocr', 'paddleocr')
|
|
511
|
-
- `language: string` - Language code (e.g., 'eng', 'fra', 'deu')
|
|
512
|
-
- `preprocessing?: boolean` - Enable image preprocessing
|
|
513
|
-
- `tesseractConfig?: TesseractConfig` - Tesseract-specific options
|
|
514
|
-
|
|
515
|
-
#### `Table`
|
|
516
|
-
Extracted table structure:
|
|
517
|
-
- `markdown: string` - Table in Markdown format
|
|
518
|
-
- `cells: TableCell[][]` - 2D array of table cells
|
|
519
|
-
- `rowCount: number` - Number of rows
|
|
520
|
-
- `columnCount: number` - Number of columns
|
|
521
|
-
|
|
522
|
-
### Exceptions
|
|
523
|
-
|
|
524
|
-
All Kreuzberg exceptions extend the base `KreuzbergError` class:
|
|
525
|
-
|
|
526
|
-
- `KreuzbergError` - Base error class for all Kreuzberg errors
|
|
527
|
-
- `ValidationError` - Invalid configuration, missing required fields, or invalid input
|
|
528
|
-
- `ParsingError` - Document parsing failure or corrupted file
|
|
529
|
-
- `OCRError` - OCR processing failure
|
|
530
|
-
- `MissingDependencyError` - Missing optional system dependency (includes installation instructions)
|
|
531
|
-
|
|
532
|
-
## Supported Formats
|
|
533
|
-
|
|
534
|
-
| Category | Formats |
|
|
535
|
-
|----------|---------|
|
|
536
|
-
| **Documents** | PDF, DOCX, DOC, PPTX, PPT, XLSX, XLS, ODT, ODP, ODS, RTF |
|
|
537
|
-
| **Images** | PNG, JPEG, JPG, WEBP, BMP, TIFF, GIF |
|
|
538
|
-
| **Web** | HTML, XHTML, XML |
|
|
539
|
-
| **Text** | TXT, MD, CSV, TSV, JSON, YAML, TOML |
|
|
540
|
-
| **Email** | EML, MSG |
|
|
541
|
-
| **Archives** | ZIP, TAR, 7Z |
|
|
542
|
-
| **Other** | And 30+ more formats |
|
|
543
|
-
|
|
544
|
-
## Performance
|
|
545
|
-
|
|
546
|
-
Kreuzberg is built with a native Rust core, providing significant performance improvements over pure JavaScript solutions:
|
|
547
|
-
|
|
548
|
-
- **10-50x faster** text extraction compared to pure Node.js libraries
|
|
549
|
-
- **Native multithreading** for batch processing
|
|
550
|
-
- **Optimized memory usage** with streaming for large files
|
|
551
|
-
- **Zero-copy operations** where possible
|
|
552
|
-
- **Efficient caching** to avoid redundant processing
|
|
553
|
-
|
|
554
|
-
### Benchmarks
|
|
555
|
-
|
|
556
|
-
Processing 100 mixed documents (PDF, DOCX, XLSX):
|
|
557
|
-
|
|
558
|
-
| Library | Time | Memory |
|
|
559
|
-
|---------|------|--------|
|
|
560
|
-
| Kreuzberg | 2.3s | 145 MB |
|
|
561
|
-
| pdf-parse + mammoth | 23.1s | 890 MB |
|
|
562
|
-
| textract | 45.2s | 1.2 GB |
|
|
372
|
+
- **Text Extraction** - Extract all text content with position and formatting information
|
|
373
|
+
- **Metadata Extraction** - Retrieve document properties, creation date, author, etc.
|
|
374
|
+
- **Table Extraction** - Parse tables with structure and cell content preservation
|
|
375
|
+
- **Image Extraction** - Extract embedded images and render page previews
|
|
376
|
+
- **OCR Support** - Integrate multiple OCR backends for scanned documents
|
|
563
377
|
|
|
564
|
-
|
|
378
|
+
- **Async/Await** - Non-blocking document processing with concurrent operations
|
|
565
379
|
|
|
566
|
-
### Native Module Not Found
|
|
567
380
|
|
|
568
|
-
|
|
381
|
+
- **Plugin System** - Extensible post-processing for custom text transformation
|
|
569
382
|
|
|
570
|
-
```bash
|
|
571
|
-
npm rebuild @kreuzberg/node
|
|
572
|
-
```
|
|
573
383
|
|
|
574
|
-
|
|
384
|
+
- **Embeddings** - Generate vector embeddings using ONNX Runtime models
|
|
575
385
|
|
|
576
|
-
|
|
386
|
+
- **Batch Processing** - Efficiently process multiple documents in parallel
|
|
387
|
+
- **Memory Efficient** - Stream large files without loading them entirely into memory
|
|
388
|
+
- **Language Detection** - Detect and support multiple languages in documents
|
|
389
|
+
- **Configuration** - Fine-grained control over extraction behavior
|
|
390
|
+
|
|
391
|
+
### Performance Characteristics
|
|
392
|
+
|
|
393
|
+
| Format | Speed | Memory | Notes |
|
|
394
|
+
|--------|-------|--------|-------|
|
|
395
|
+
| **PDF (text)** | 10-100 MB/s | ~50MB per doc | Fastest extraction |
|
|
396
|
+
| **Office docs** | 20-200 MB/s | ~100MB per doc | DOCX, XLSX, PPTX |
|
|
397
|
+
| **Images (OCR)** | 1-5 MB/s | Variable | Depends on OCR backend |
|
|
398
|
+
| **Archives** | 5-50 MB/s | ~200MB per doc | ZIP, TAR, etc. |
|
|
399
|
+
| **Web formats** | 50-200 MB/s | Streaming | HTML, XML, JSON |
|
|
577
400
|
|
|
578
|
-
```bash
|
|
579
|
-
tesseract --version
|
|
580
|
-
```
|
|
581
401
|
|
|
582
|
-
If Tesseract is not found:
|
|
583
|
-
- macOS: `brew install tesseract`
|
|
584
|
-
- Ubuntu: `sudo apt-get install tesseract-ocr`
|
|
585
|
-
- Windows: Download from [tesseract-ocr/tesseract](https://github.com/tesseract-ocr/tesseract)
|
|
586
402
|
|
|
587
|
-
|
|
403
|
+
## OCR Support
|
|
588
404
|
|
|
589
|
-
|
|
405
|
+
Kreuzberg supports multiple OCR backends for extracting text from scanned documents and images:
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
- **Tesseract**
|
|
409
|
+
|
|
410
|
+
- **Guten**
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
### OCR Configuration Example
|
|
590
414
|
|
|
591
415
|
```typescript
|
|
416
|
+
import { extractFile } from '@kreuzberg/node';
|
|
417
|
+
|
|
592
418
|
const config = {
|
|
593
|
-
|
|
419
|
+
ocr: {
|
|
420
|
+
backend: 'tesseract',
|
|
421
|
+
language: 'eng+fra',
|
|
422
|
+
tesseractConfig: {
|
|
423
|
+
psm: 3,
|
|
424
|
+
},
|
|
425
|
+
},
|
|
594
426
|
};
|
|
595
|
-
|
|
427
|
+
|
|
428
|
+
const result = await extractFile('document.pdf', null, config);
|
|
429
|
+
console.log(result.content);
|
|
596
430
|
```
|
|
597
431
|
|
|
598
|
-
### TypeScript Types Not Resolving
|
|
599
432
|
|
|
600
|
-
Make sure you're using:
|
|
601
|
-
- Node.js 18 or higher
|
|
602
|
-
- TypeScript 5.0 or higher
|
|
603
433
|
|
|
604
|
-
The package includes built-in type definitions.
|
|
605
434
|
|
|
606
|
-
|
|
435
|
+
## Async Support
|
|
607
436
|
|
|
608
|
-
|
|
437
|
+
This binding provides full async/await support for non-blocking document processing:
|
|
609
438
|
|
|
610
439
|
```typescript
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
440
|
+
import { extractFile } from '@kreuzberg/node';
|
|
441
|
+
|
|
442
|
+
const result = await extractFile('document.pdf');
|
|
443
|
+
console.log(result.content);
|
|
615
444
|
```
|
|
616
445
|
|
|
617
|
-
## Examples
|
|
618
446
|
|
|
619
|
-
### Extract Invoice Data
|
|
620
447
|
|
|
621
|
-
```typescript
|
|
622
|
-
import { extractFile } from '@kreuzberg/node';
|
|
623
448
|
|
|
624
|
-
|
|
449
|
+
## Plugin System
|
|
625
450
|
|
|
626
|
-
|
|
627
|
-
if (result.tables && result.tables.length > 0) {
|
|
628
|
-
const lineItems = result.tables[0];
|
|
629
|
-
console.log(lineItems.markdown);
|
|
630
|
-
}
|
|
451
|
+
Kreuzberg supports extensible post-processing plugins for custom text transformation and filtering.
|
|
631
452
|
|
|
632
|
-
|
|
633
|
-
if (result.metadata) {
|
|
634
|
-
console.log('Invoice Date:', result.metadata.creationDate);
|
|
635
|
-
}
|
|
636
|
-
```
|
|
453
|
+
For detailed plugin documentation, visit the [Plugin System Guide](https://kreuzberg.dev/plugins/).
|
|
637
454
|
|
|
638
|
-
### Process Scanned Documents
|
|
639
455
|
|
|
640
|
-
```typescript
|
|
641
|
-
import { extractFile } from '@kreuzberg/node';
|
|
642
456
|
|
|
643
|
-
const config = {
|
|
644
|
-
forceOcr: true,
|
|
645
|
-
ocr: {
|
|
646
|
-
backend: 'tesseract',
|
|
647
|
-
language: 'eng',
|
|
648
|
-
preprocessing: true
|
|
649
|
-
}
|
|
650
|
-
};
|
|
651
457
|
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
458
|
+
## Embeddings Support
|
|
459
|
+
|
|
460
|
+
Generate vector embeddings for extracted text using the built-in ONNX Runtime support. Requires ONNX Runtime to be installed.
|
|
461
|
+
|
|
462
|
+
**[Embeddings Guide](https://kreuzberg.dev/features/#embeddings)**
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
|
|
466
|
+
## Batch Processing
|
|
655
467
|
|
|
656
|
-
|
|
468
|
+
Process multiple documents efficiently:
|
|
657
469
|
|
|
658
470
|
```typescript
|
|
659
|
-
import {
|
|
660
|
-
import { glob } from 'glob';
|
|
471
|
+
import { batchExtractFilesSync } from '@kreuzberg/node';
|
|
661
472
|
|
|
662
|
-
|
|
663
|
-
const
|
|
473
|
+
const files = ['doc1.pdf', 'doc2.docx', 'doc3.pptx'];
|
|
474
|
+
const results = batchExtractFilesSync(files);
|
|
664
475
|
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
maxConcurrentExtractions: 8,
|
|
668
|
-
enableQualityProcessing: true
|
|
476
|
+
results.forEach((result, i) => {
|
|
477
|
+
console.log(`File ${i + 1}: ${result.content.length} characters`);
|
|
669
478
|
});
|
|
479
|
+
```
|
|
670
480
|
|
|
671
|
-
// Build search index
|
|
672
|
-
const searchIndex = results.map((result, i) => ({
|
|
673
|
-
path: files[i],
|
|
674
|
-
content: result.content,
|
|
675
|
-
metadata: result.metadata
|
|
676
|
-
}));
|
|
677
481
|
|
|
678
|
-
|
|
679
|
-
|
|
482
|
+
|
|
483
|
+
|
|
484
|
+
## Configuration
|
|
485
|
+
|
|
486
|
+
For advanced configuration options including language detection, table extraction, OCR settings, and more:
|
|
487
|
+
|
|
488
|
+
**[Configuration Guide](https://kreuzberg.dev/configuration/)**
|
|
680
489
|
|
|
681
490
|
## Documentation
|
|
682
491
|
|
|
683
|
-
|
|
492
|
+
- **[Official Documentation](https://kreuzberg.dev/)**
|
|
493
|
+
- **[API Reference](https://kreuzberg.dev/reference/api-typescript/)**
|
|
494
|
+
- **[Examples & Guides](https://kreuzberg.dev/guides/)**
|
|
495
|
+
|
|
496
|
+
## Troubleshooting
|
|
497
|
+
|
|
498
|
+
For common issues and solutions, visit the [Troubleshooting Guide](https://kreuzberg.dev/troubleshooting/).
|
|
684
499
|
|
|
685
500
|
## Contributing
|
|
686
501
|
|
|
687
|
-
|
|
502
|
+
Contributions are welcome! See the [Contributing Guide](https://github.com/kreuzberg-dev/kreuzberg/blob/main/CONTRIBUTING.md).
|
|
688
503
|
|
|
689
504
|
## License
|
|
690
505
|
|
|
691
|
-
MIT
|
|
506
|
+
MIT License - see the LICENSE file for details.
|
|
692
507
|
|
|
693
|
-
##
|
|
508
|
+
## Support
|
|
694
509
|
|
|
695
|
-
- [
|
|
696
|
-
- [
|
|
697
|
-
- [
|
|
698
|
-
- [Issue Tracker](https://github.com/kreuzberg-dev/kreuzberg/issues)
|
|
699
|
-
- [Changelog](https://github.com/kreuzberg-dev/kreuzberg/blob/main/CHANGELOG.md)
|
|
700
|
-
- [npm Package](https://www.npmjs.com/package/@kreuzberg/node)
|
|
510
|
+
- **Discord Community**: [Join our Discord](https://discord.gg/pXxagNK2zN)
|
|
511
|
+
- **GitHub Issues**: [Report bugs](https://github.com/kreuzberg-dev/kreuzberg/issues)
|
|
512
|
+
- **Discussions**: [Ask questions](https://github.com/kreuzberg-dev/kreuzberg/discussions)
|