@kreuzberg/node 4.0.0-rc.8 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +321 -514
- package/dist/cli.d.mts +4 -0
- package/dist/cli.d.ts +4 -0
- package/dist/cli.js +12 -2
- package/dist/cli.js.map +1 -1
- package/dist/cli.mjs +12 -1
- package/dist/cli.mjs.map +1 -1
- package/dist/index.d.mts +337 -62
- package/dist/index.d.ts +337 -62
- package/dist/index.js +285 -56
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +277 -56
- package/dist/index.mjs.map +1 -1
- package/dist/types.d.mts +469 -54
- package/dist/types.d.ts +469 -54
- package/dist/types.js.map +1 -1
- package/index.d.ts +662 -1
- package/index.js +85 -55
- package/metadata.d.ts +53 -33
- package/package.json +17 -19
package/README.md
CHANGED
|
@@ -1,700 +1,507 @@
|
|
|
1
|
-
#
|
|
1
|
+
# TypeScript (Node.js)
|
|
2
|
+
|
|
3
|
+
<div align="center" style="display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin: 20px 0;">
|
|
4
|
+
<!-- Language Bindings -->
|
|
5
|
+
<a href="https://crates.io/crates/kreuzberg">
|
|
6
|
+
<img src="https://img.shields.io/crates/v/kreuzberg?label=Rust&color=007ec6" alt="Rust">
|
|
7
|
+
</a>
|
|
8
|
+
<a href="https://hex.pm/packages/kreuzberg">
|
|
9
|
+
<img src="https://img.shields.io/hexpm/v/kreuzberg?label=Elixir&color=007ec6" alt="Elixir">
|
|
10
|
+
</a>
|
|
11
|
+
<a href="https://pypi.org/project/kreuzberg/">
|
|
12
|
+
<img src="https://img.shields.io/pypi/v/kreuzberg?label=Python&color=007ec6" alt="Python">
|
|
13
|
+
</a>
|
|
14
|
+
<a href="https://www.npmjs.com/package/@kreuzberg/node">
|
|
15
|
+
<img src="https://img.shields.io/npm/v/@kreuzberg/node?label=Node.js&color=007ec6" alt="Node.js">
|
|
16
|
+
</a>
|
|
17
|
+
<a href="https://www.npmjs.com/package/@kreuzberg/wasm">
|
|
18
|
+
<img src="https://img.shields.io/npm/v/@kreuzberg/wasm?label=WASM&color=007ec6" alt="WASM">
|
|
19
|
+
</a>
|
|
20
|
+
|
|
21
|
+
<a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg">
|
|
22
|
+
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
|
+
</a>
|
|
24
|
+
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.0.0" alt="Go">
|
|
26
|
+
</a>
|
|
27
|
+
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
|
+
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
29
|
+
</a>
|
|
30
|
+
<a href="https://packagist.org/packages/kreuzberg/kreuzberg">
|
|
31
|
+
<img src="https://img.shields.io/packagist/v/kreuzberg/kreuzberg?label=PHP&color=007ec6" alt="PHP">
|
|
32
|
+
</a>
|
|
33
|
+
<a href="https://rubygems.org/gems/kreuzberg">
|
|
34
|
+
<img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
|
|
35
|
+
</a>
|
|
36
|
+
|
|
37
|
+
<!-- Project Info -->
|
|
38
|
+
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
|
|
39
|
+
<img src="https://img.shields.io/badge/License-MIT-blue.svg" alt="License">
|
|
40
|
+
</a>
|
|
41
|
+
<a href="https://docs.kreuzberg.dev">
|
|
42
|
+
<img src="https://img.shields.io/badge/docs-kreuzberg.dev-blue" alt="Documentation">
|
|
43
|
+
</a>
|
|
44
|
+
</div>
|
|
45
|
+
|
|
46
|
+
<img width="1128" height="191" alt="Banner2" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
|
|
47
|
+
|
|
48
|
+
<div align="center" style="margin-top: 20px;">
|
|
49
|
+
<a href="https://discord.gg/pXxagNK2zN">
|
|
50
|
+
<img height="22" src="https://img.shields.io/badge/Discord-Join%20our%20community-7289da?logo=discord&logoColor=white" alt="Discord">
|
|
51
|
+
</a>
|
|
52
|
+
</div>
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
Extract text, tables, images, and metadata from 56 file formats including PDF, Office documents, and images. Native NAPI-RS bindings for Node.js with superior performance, async/await support, and TypeScript type definitions.
|
|
2
56
|
|
|
3
|
-
[](https://crates.io/crates/kreuzberg)
|
|
4
|
-
[](https://pypi.org/project/kreuzberg/)
|
|
5
|
-
[](https://www.npmjs.com/package/@kreuzberg/node)
|
|
6
|
-
[](https://www.npmjs.com/package/@kreuzberg/wasm)
|
|
7
|
-
[](https://rubygems.org/gems/kreuzberg)
|
|
8
|
-
[](https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg)
|
|
9
|
-
[](https://pkg.go.dev/github.com/kreuzberg-dev/kreuzberg)
|
|
10
|
-
[](https://www.nuget.org/packages/Goldziher.Kreuzberg/)
|
|
11
57
|
|
|
12
|
-
|
|
13
|
-
[](https://kreuzberg.dev/)
|
|
14
|
-
[](https://discord.gg/pXxagNK2zN)
|
|
58
|
+
## Installation
|
|
15
59
|
|
|
16
|
-
|
|
60
|
+
### Package Installation
|
|
17
61
|
|
|
18
|
-
Extract text, tables, images, and metadata from 56 file formats including PDF, DOCX, PPTX, XLSX, images, and more.
|
|
19
62
|
|
|
20
|
-
|
|
21
|
-
>
|
|
22
|
-
> For browser, Deno, or Cloudflare Workers, use [@kreuzberg/wasm](../kreuzberg-wasm/) instead.
|
|
63
|
+
Install via one of the supported package managers:
|
|
23
64
|
|
|
24
|
-
> **Version 4.0.0 Release Candidate**
|
|
25
|
-
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
26
65
|
|
|
27
|
-
## Features
|
|
28
66
|
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
- **Zero-Copy Operations**: Direct system calls and minimal data copying
|
|
34
|
-
- **Type-Safe**: Full TypeScript definitions for all methods, configurations, and return types
|
|
35
|
-
- **Async/Sync APIs**: Both asynchronous and synchronous extraction methods
|
|
36
|
-
- **Batch Processing**: Process multiple documents in parallel with optimized concurrency
|
|
37
|
-
- **Language Detection**: Automatic language detection for extracted text
|
|
38
|
-
- **Text Chunking**: Split long documents into manageable chunks for LLM processing
|
|
39
|
-
- **Caching**: Built-in result caching for faster repeated extractions
|
|
40
|
-
- **Zero Configuration**: Works out of the box with sensible defaults
|
|
67
|
+
**npm:**
|
|
68
|
+
```bash
|
|
69
|
+
npm install @kreuzberg/node
|
|
70
|
+
```
|
|
41
71
|
|
|
42
|
-
## Why Use This Package?
|
|
43
72
|
|
|
44
|
-
Choose `@kreuzberg/node` if you're building with:
|
|
45
73
|
|
|
46
|
-
- **Node.js 18+** - Native bindings provide direct access to system resources
|
|
47
|
-
- **Bun** - Full compatibility with Bun's Node.js API
|
|
48
|
-
- **Performance-critical applications** - Processing large document batches or real-time extraction
|
|
49
|
-
- **Server-side extraction** - APIs, microservices, document processing pipelines
|
|
50
74
|
|
|
51
|
-
|
|
75
|
+
**pnpm:**
|
|
76
|
+
```bash
|
|
77
|
+
pnpm add @kreuzberg/node
|
|
78
|
+
```
|
|
52
79
|
|
|
53
|
-
| Aspect | `@kreuzberg/node` | `@kreuzberg/wasm` |
|
|
54
|
-
|--------|------------------|-------------------|
|
|
55
|
-
| **Performance** | 2-3x faster (native) | Standard baseline |
|
|
56
|
-
| **Environment** | Node.js, Bun | Browser, Deno, Workers, Node.js |
|
|
57
|
-
| **Bundle Size** | 10-15 MB (prebuilt binary) | 2-4 MB (WASM module) |
|
|
58
|
-
| **System Access** | Direct system calls | Sandboxed via WASM |
|
|
59
|
-
| **Best For** | Server-side, batch processing | Client-side, edge computing |
|
|
60
80
|
|
|
61
|
-
Use `@kreuzberg/wasm` for browser applications, Cloudflare Workers, Deno, or when you need a smaller bundle size.
|
|
62
81
|
|
|
63
|
-
## Requirements
|
|
64
82
|
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
- Windows (x64, arm64)
|
|
83
|
+
**yarn:**
|
|
84
|
+
```bash
|
|
85
|
+
yarn add @kreuzberg/node
|
|
86
|
+
```
|
|
70
87
|
|
|
71
|
-
### Optional System Dependencies
|
|
72
88
|
|
|
73
|
-
- **Tesseract**: For OCR functionality
|
|
74
|
-
- macOS: `brew install tesseract`
|
|
75
|
-
- Ubuntu: `sudo apt-get install tesseract-ocr`
|
|
76
|
-
- Windows: Download from [GitHub](https://github.com/tesseract-ocr/tesseract)
|
|
77
89
|
|
|
78
|
-
- **LibreOffice**: For legacy MS Office formats (.doc, .ppt)
|
|
79
|
-
- macOS: `brew install libreoffice`
|
|
80
|
-
- Ubuntu: `sudo apt-get install libreoffice`
|
|
81
90
|
|
|
82
|
-
- **Pandoc**: For advanced document conversion
|
|
83
|
-
- macOS: `brew install pandoc`
|
|
84
|
-
- Ubuntu: `sudo apt-get install pandoc`
|
|
85
91
|
|
|
86
|
-
|
|
92
|
+
### System Requirements
|
|
87
93
|
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
94
|
+
- **Node.js 22+** required (NAPI-RS native bindings)
|
|
95
|
+
- Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.22.x for embeddings support
|
|
96
|
+
- Optional: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for OCR functionality
|
|
91
97
|
|
|
92
|
-
|
|
98
|
+
- Optional: [LibreOffice](https://www.libreoffice.org/download/download/) for legacy Office formats (DOC, XLS, PPT, RTF, ODT, ODS, ODP)
|
|
93
99
|
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
100
|
+
**Format Support Notes:**
|
|
101
|
+
- Modern Office formats (DOCX, XLSX, PPTX) work without LibreOffice
|
|
102
|
+
- Legacy formats (DOC, XLS, PPT) require a LibreOffice installation
|
|
103
|
+
- The WASM binding does NOT support LibreOffice formats (use the Node.js binding for full format support)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
### Platform Support
|
|
108
|
+
|
|
109
|
+
Pre-built binaries available for:
|
|
110
|
+
- macOS (arm64, x64)
|
|
111
|
+
- Linux (x64)
|
|
112
|
+
- Windows (x64)
|
|
97
113
|
|
|
98
|
-
Or with yarn:
|
|
99
114
|
|
|
100
|
-
```bash
|
|
101
|
-
yarn add @kreuzberg/node
|
|
102
|
-
```
|
|
103
115
|
|
|
104
|
-
The package includes prebuilt native binaries for major platforms. No additional build steps required.
|
|
105
116
|
|
|
106
117
|
## Quick Start
|
|
107
118
|
|
|
108
119
|
### Basic Extraction
|
|
109
120
|
|
|
121
|
+
Extract text, metadata, and structure from any supported document format:
|
|
122
|
+
|
|
110
123
|
```typescript
|
|
111
124
|
import { extractFileSync } from '@kreuzberg/node';
|
|
112
125
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
```
|
|
118
|
-
|
|
119
|
-
### Async Extraction (Recommended)
|
|
126
|
+
const config = {
|
|
127
|
+
useCache: true,
|
|
128
|
+
enableQualityProcessing: true,
|
|
129
|
+
};
|
|
120
130
|
|
|
121
|
-
|
|
122
|
-
import { extractFile } from '@kreuzberg/node';
|
|
131
|
+
const result = extractFileSync('document.pdf', null, config);
|
|
123
132
|
|
|
124
|
-
// Asynchronous extraction
|
|
125
|
-
const result = await extractFile('document.pdf');
|
|
126
133
|
console.log(result.content);
|
|
127
|
-
console.log(result.
|
|
134
|
+
console.log(`MIME Type: ${result.mimeType}`);
|
|
128
135
|
```
|
|
129
136
|
|
|
130
|
-
### With Full Type Safety
|
|
131
|
-
|
|
132
|
-
```typescript
|
|
133
|
-
import {
|
|
134
|
-
extractFile,
|
|
135
|
-
type ExtractionConfig,
|
|
136
|
-
type ExtractionResult
|
|
137
|
-
} from '@kreuzberg/node';
|
|
138
|
-
|
|
139
|
-
const config: ExtractionConfig = {
|
|
140
|
-
useCache: true,
|
|
141
|
-
enableQualityProcessing: true
|
|
142
|
-
};
|
|
143
137
|
|
|
144
|
-
|
|
138
|
+
### Common Use Cases
|
|
145
139
|
|
|
146
|
-
|
|
147
|
-
console.log(result.content);
|
|
148
|
-
console.log(result.mimeType);
|
|
149
|
-
console.log(result.metadata);
|
|
140
|
+
#### Extract with Custom Configuration
|
|
150
141
|
|
|
151
|
-
|
|
152
|
-
for (const table of result.tables) {
|
|
153
|
-
console.log(table.markdown);
|
|
154
|
-
}
|
|
155
|
-
}
|
|
156
|
-
```
|
|
142
|
+
Most use cases benefit from configuration to control extraction behavior:
|
|
157
143
|
|
|
158
|
-
## Configuration
|
|
159
144
|
|
|
160
|
-
|
|
145
|
+
**With OCR (for scanned documents):**
|
|
161
146
|
|
|
162
147
|
```typescript
|
|
163
|
-
import { extractFile
|
|
164
|
-
|
|
165
|
-
const config
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
}
|
|
174
|
-
} as OcrConfig
|
|
148
|
+
import { extractFile } from '@kreuzberg/node';
|
|
149
|
+
|
|
150
|
+
const config = {
|
|
151
|
+
ocr: {
|
|
152
|
+
backend: 'tesseract',
|
|
153
|
+
language: 'eng+fra',
|
|
154
|
+
tesseractConfig: {
|
|
155
|
+
psm: 3,
|
|
156
|
+
},
|
|
157
|
+
},
|
|
175
158
|
};
|
|
176
159
|
|
|
177
|
-
const result = await extractFile('
|
|
160
|
+
const result = await extractFile('document.pdf', null, config);
|
|
178
161
|
console.log(result.content);
|
|
179
162
|
```
|
|
180
163
|
|
|
181
|
-
### PDF Password Protection
|
|
182
164
|
|
|
183
|
-
```typescript
|
|
184
|
-
import { extractFile, type PdfConfig } from '@kreuzberg/node';
|
|
185
165
|
|
|
186
|
-
const config = {
|
|
187
|
-
pdfOptions: {
|
|
188
|
-
passwords: ['password1', 'password2'],
|
|
189
|
-
extractImages: true,
|
|
190
|
-
extractMetadata: true
|
|
191
|
-
} as PdfConfig
|
|
192
|
-
};
|
|
193
166
|
|
|
194
|
-
|
|
195
|
-
```
|
|
167
|
+
#### Table Extraction
|
|
196
168
|
|
|
197
|
-
### Extract Tables
|
|
198
169
|
|
|
199
170
|
```typescript
|
|
200
|
-
import {
|
|
201
|
-
|
|
202
|
-
const result = await extractFile('financial-report.pdf');
|
|
171
|
+
import { extractFileSync } from '@kreuzberg/node';
|
|
203
172
|
|
|
204
|
-
|
|
205
|
-
for (const table of result.tables) {
|
|
206
|
-
console.log('Table as Markdown:');
|
|
207
|
-
console.log(table.markdown);
|
|
173
|
+
const result = extractFileSync('document.pdf');
|
|
208
174
|
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
175
|
+
for (const table of result.tables) {
|
|
176
|
+
console.log(`Table with ${table.cells.length} rows`);
|
|
177
|
+
console.log(`Page: ${table.pageNumber}`);
|
|
178
|
+
console.log(table.markdown);
|
|
212
179
|
}
|
|
213
180
|
```
|
|
214
181
|
|
|
215
|
-
### Text Chunking
|
|
216
182
|
|
|
217
|
-
```typescript
|
|
218
|
-
import { extractFile, type ChunkingConfig } from '@kreuzberg/node';
|
|
219
183
|
|
|
220
|
-
const config = {
|
|
221
|
-
chunking: {
|
|
222
|
-
maxChars: 1000,
|
|
223
|
-
maxOverlap: 200
|
|
224
|
-
} as ChunkingConfig
|
|
225
|
-
};
|
|
226
184
|
|
|
227
|
-
|
|
185
|
+
#### Processing Multiple Files
|
|
228
186
|
|
|
229
|
-
if (result.chunks) {
|
|
230
|
-
for (const chunk of result.chunks) {
|
|
231
|
-
console.log(`Chunk ${chunk.index}: ${chunk.text.substring(0, 100)}...`);
|
|
232
|
-
}
|
|
233
|
-
}
|
|
234
|
-
```
|
|
235
|
-
|
|
236
|
-
### Language Detection
|
|
237
187
|
|
|
238
188
|
```typescript
|
|
239
|
-
import {
|
|
189
|
+
import { batchExtractFilesSync } from '@kreuzberg/node';
|
|
240
190
|
|
|
241
|
-
const
|
|
242
|
-
|
|
243
|
-
enabled: true,
|
|
244
|
-
minConfidence: 0.8,
|
|
245
|
-
detectMultiple: false
|
|
246
|
-
} as LanguageDetectionConfig
|
|
247
|
-
};
|
|
191
|
+
const files = ['doc1.pdf', 'doc2.docx', 'doc3.pptx'];
|
|
192
|
+
const results = batchExtractFilesSync(files);
|
|
248
193
|
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
console.log(`Detected language: ${result.language.code}`);
|
|
253
|
-
console.log(`Confidence: ${result.language.confidence}`);
|
|
254
|
-
}
|
|
194
|
+
results.forEach((result, i) => {
|
|
195
|
+
console.log(`File ${i + 1}: ${result.content.length} characters`);
|
|
196
|
+
});
|
|
255
197
|
```
|
|
256
198
|
|
|
257
|
-
### Image Extraction
|
|
258
199
|
|
|
259
|
-
```typescript
|
|
260
|
-
import { extractFile, type ImageExtractionConfig } from '@kreuzberg/node';
|
|
261
|
-
import { writeFile } from 'fs/promises';
|
|
262
200
|
|
|
263
|
-
const config = {
|
|
264
|
-
images: {
|
|
265
|
-
extractImages: true,
|
|
266
|
-
targetDpi: 300,
|
|
267
|
-
maxImageDimension: 4096,
|
|
268
|
-
autoAdjustDpi: true
|
|
269
|
-
} as ImageExtractionConfig
|
|
270
|
-
};
|
|
271
201
|
|
|
272
|
-
const result = await extractFile('document-with-images.pdf', config);
|
|
273
202
|
|
|
274
|
-
|
|
275
|
-
for (let i = 0; i < result.images.length; i++) {
|
|
276
|
-
const image = result.images[i];
|
|
277
|
-
await writeFile(`image-${i}.${image.format}`, Buffer.from(image.data));
|
|
278
|
-
}
|
|
279
|
-
}
|
|
280
|
-
```
|
|
203
|
+
#### Async Processing
|
|
281
204
|
|
|
282
|
-
|
|
205
|
+
For non-blocking document processing:
|
|
283
206
|
|
|
284
207
|
```typescript
|
|
285
|
-
import {
|
|
286
|
-
extractFile,
|
|
287
|
-
type ExtractionConfig,
|
|
288
|
-
type OcrConfig,
|
|
289
|
-
type ChunkingConfig,
|
|
290
|
-
type ImageExtractionConfig,
|
|
291
|
-
type PdfConfig,
|
|
292
|
-
type TokenReductionConfig,
|
|
293
|
-
type LanguageDetectionConfig
|
|
294
|
-
} from '@kreuzberg/node';
|
|
295
|
-
|
|
296
|
-
const config: ExtractionConfig = {
|
|
297
|
-
useCache: true,
|
|
298
|
-
enableQualityProcessing: true,
|
|
299
|
-
forceOcr: false,
|
|
300
|
-
maxConcurrentExtractions: 8,
|
|
301
|
-
|
|
302
|
-
ocr: {
|
|
303
|
-
backend: 'tesseract',
|
|
304
|
-
language: 'eng',
|
|
305
|
-
preprocessing: true,
|
|
306
|
-
tesseractConfig: {
|
|
307
|
-
enableTableDetection: true,
|
|
308
|
-
psm: 6,
|
|
309
|
-
oem: 3,
|
|
310
|
-
minConfidence: 50.0
|
|
311
|
-
}
|
|
312
|
-
} as OcrConfig,
|
|
313
|
-
|
|
314
|
-
chunking: {
|
|
315
|
-
maxChars: 1000,
|
|
316
|
-
maxOverlap: 200
|
|
317
|
-
} as ChunkingConfig,
|
|
318
|
-
|
|
319
|
-
images: {
|
|
320
|
-
extractImages: true,
|
|
321
|
-
targetDpi: 300,
|
|
322
|
-
maxImageDimension: 4096,
|
|
323
|
-
autoAdjustDpi: true
|
|
324
|
-
} as ImageExtractionConfig,
|
|
325
|
-
|
|
326
|
-
pdfOptions: {
|
|
327
|
-
extractImages: true,
|
|
328
|
-
passwords: [],
|
|
329
|
-
extractMetadata: true
|
|
330
|
-
} as PdfConfig,
|
|
331
|
-
|
|
332
|
-
tokenReduction: {
|
|
333
|
-
mode: 'moderate',
|
|
334
|
-
preserveImportantWords: true
|
|
335
|
-
} as TokenReductionConfig,
|
|
336
|
-
|
|
337
|
-
languageDetection: {
|
|
338
|
-
enabled: true,
|
|
339
|
-
minConfidence: 0.8,
|
|
340
|
-
detectMultiple: false
|
|
341
|
-
} as LanguageDetectionConfig
|
|
342
|
-
};
|
|
208
|
+
import { extractFile } from '@kreuzberg/node';
|
|
343
209
|
|
|
344
|
-
const result = await extractFile('document.pdf'
|
|
210
|
+
const result = await extractFile('document.pdf');
|
|
211
|
+
console.log(result.content);
|
|
345
212
|
```
|
|
346
213
|
|
|
347
|
-
## Advanced Usage
|
|
348
214
|
|
|
349
|
-
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
#### Configuration Discovery
|
|
350
219
|
|
|
351
220
|
```typescript
|
|
352
|
-
import {
|
|
353
|
-
import { readFile } from 'fs/promises';
|
|
221
|
+
import { ExtractionConfig, extractFile } from '@kreuzberg/node';
|
|
354
222
|
|
|
355
|
-
const
|
|
356
|
-
|
|
357
|
-
console.log(
|
|
223
|
+
const config = ExtractionConfig.discover();
|
|
224
|
+
if (config) {
|
|
225
|
+
console.log('Found configuration file');
|
|
226
|
+
const result = await extractFile('document.pdf', null, config);
|
|
227
|
+
console.log(result.content);
|
|
228
|
+
} else {
|
|
229
|
+
console.log('No configuration file found, using defaults');
|
|
230
|
+
const result = await extractFile('document.pdf');
|
|
231
|
+
console.log(result.content);
|
|
232
|
+
}
|
|
358
233
|
```
|
|
359
234
|
|
|
360
|
-
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
#### Worker Thread Pool
|
|
361
240
|
|
|
362
241
|
```typescript
|
|
363
|
-
import {
|
|
242
|
+
import { createWorkerPool, extractFileInWorker, batchExtractFilesInWorker, closeWorkerPool } from '@kreuzberg/node';
|
|
364
243
|
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
'document2.docx',
|
|
368
|
-
'document3.xlsx'
|
|
369
|
-
];
|
|
244
|
+
// Create a pool with 4 worker threads
|
|
245
|
+
const pool = createWorkerPool(4);
|
|
370
246
|
|
|
371
|
-
|
|
247
|
+
try {
|
|
248
|
+
// Extract single file in worker
|
|
249
|
+
const result = await extractFileInWorker(pool, 'document.pdf', null, {
|
|
250
|
+
useCache: true
|
|
251
|
+
});
|
|
252
|
+
console.log(result.content);
|
|
372
253
|
|
|
373
|
-
|
|
374
|
-
|
|
254
|
+
// Extract multiple files concurrently
|
|
255
|
+
const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
|
|
256
|
+
const results = await batchExtractFilesInWorker(pool, files, {
|
|
257
|
+
useCache: true
|
|
258
|
+
});
|
|
259
|
+
|
|
260
|
+
results.forEach((result, i) => {
|
|
261
|
+
console.log(`File ${i + 1}: ${result.content.length} characters`);
|
|
262
|
+
});
|
|
263
|
+
} finally {
|
|
264
|
+
// Always close the pool when done
|
|
265
|
+
await closeWorkerPool(pool);
|
|
375
266
|
}
|
|
376
267
|
```
|
|
377
268
|
|
|
378
|
-
### Batch Processing with Custom Concurrency
|
|
379
269
|
|
|
380
|
-
|
|
381
|
-
|
|
270
|
+
**Performance Benefits:**
|
|
271
|
+
- **Parallel Processing**: Multiple documents extracted simultaneously
|
|
272
|
+
- **CPU Utilization**: Maximizes multi-core CPU usage for large batches
|
|
273
|
+
- **Queue Management**: Automatically distributes work across available workers
|
|
274
|
+
- **Resource Control**: Prevents thread exhaustion with configurable pool size
|
|
382
275
|
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
276
|
+
**Best Practices:**
|
|
277
|
+
- Use worker pools for batches of 10+ documents
|
|
278
|
+
- Set the pool size to the number of CPU cores (the default behavior)
|
|
279
|
+
- Always close pools with `closeWorkerPool()` to prevent resource leaks
|
|
280
|
+
- Reuse pools across multiple batch operations for efficiency
|
|
386
281
|
|
|
387
|
-
const files = Array.from({ length: 20 }, (_, i) => `file-${i}.pdf`);
|
|
388
|
-
const results = await batchExtractFiles(files, config);
|
|
389
282
|
|
|
390
|
-
console.log(`Processed ${results.length} files`);
|
|
391
|
-
```
|
|
392
283
|
|
|
393
|
-
###
|
|
284
|
+
### Next Steps
|
|
394
285
|
|
|
395
|
-
|
|
396
|
-
|
|
286
|
+
- **[Installation Guide](https://kreuzberg.dev/getting-started/installation/)** - Platform-specific setup
|
|
287
|
+
- **[API Documentation](https://kreuzberg.dev/api/)** - Complete API reference
|
|
288
|
+
- **[Examples & Guides](https://kreuzberg.dev/guides/)** - Full code examples and usage guides
|
|
289
|
+
- **[Configuration Guide](https://kreuzberg.dev/guides/configuration/)** - Advanced configuration options
|
|
397
290
|
|
|
398
|
-
const result = await extractFile('document.pdf');
|
|
399
291
|
|
|
400
|
-
if (result.metadata) {
|
|
401
|
-
console.log('Title:', result.metadata.title);
|
|
402
|
-
console.log('Author:', result.metadata.author);
|
|
403
|
-
console.log('Creation Date:', result.metadata.creationDate);
|
|
404
|
-
console.log('Page Count:', result.metadata.pageCount);
|
|
405
|
-
console.log('Word Count:', result.metadata.wordCount);
|
|
406
|
-
}
|
|
407
|
-
```
|
|
408
292
|
|
|
409
|
-
|
|
293
|
+
## NAPI-RS Implementation Details
|
|
410
294
|
|
|
411
|
-
|
|
412
|
-
import { extractFile, type TokenReductionConfig } from '@kreuzberg/node';
|
|
295
|
+
### Native Performance
|
|
413
296
|
|
|
414
|
-
|
|
415
|
-
tokenReduction: {
|
|
416
|
-
mode: 'aggressive', // Options: 'light', 'moderate', 'aggressive'
|
|
417
|
-
preserveImportantWords: true
|
|
418
|
-
} as TokenReductionConfig
|
|
419
|
-
};
|
|
297
|
+
This binding uses NAPI-RS to provide native Node.js bindings with:
|
|
420
298
|
|
|
421
|
-
|
|
299
|
+
- **Zero-copy data transfer** between JavaScript and Rust layers
|
|
300
|
+
- **Native thread pool** for concurrent document processing
|
|
301
|
+
- **Direct memory management** for efficient large document handling
|
|
302
|
+
- **Binary-compatible** pre-built native modules across platforms
|
|
422
303
|
|
|
423
|
-
|
|
424
|
-
console.log(`Original length: ${result.content.length}`);
|
|
425
|
-
console.log(`Processed for LLM context window`);
|
|
426
|
-
```
|
|
304
|
+
### Threading Model
|
|
427
305
|
|
|
428
|
-
|
|
306
|
+
- Single documents are processed synchronously or asynchronously in a dedicated thread
|
|
307
|
+
- Batch operations distribute work across available CPU cores
|
|
308
|
+
- Thread count is configurable but defaults to system CPU count
|
|
309
|
+
- Long-running extractions block the event loop unless you use the async APIs
|
|
429
310
|
|
|
430
|
-
|
|
431
|
-
import {
|
|
432
|
-
extractFile,
|
|
433
|
-
KreuzbergError,
|
|
434
|
-
ValidationError,
|
|
435
|
-
ParsingError,
|
|
436
|
-
OCRError,
|
|
437
|
-
MissingDependencyError
|
|
438
|
-
} from '@kreuzberg/node';
|
|
311
|
+
### Memory Management
|
|
439
312
|
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
if (error instanceof ValidationError) {
|
|
445
|
-
console.error('Invalid configuration or input:', error.message);
|
|
446
|
-
} else if (error instanceof ParsingError) {
|
|
447
|
-
console.error('Failed to parse document:', error.message);
|
|
448
|
-
} else if (error instanceof OCRError) {
|
|
449
|
-
console.error('OCR processing failed:', error.message);
|
|
450
|
-
} else if (error instanceof MissingDependencyError) {
|
|
451
|
-
console.error(`Missing dependency: ${error.dependency}`);
|
|
452
|
-
console.error('Installation instructions:', error.message);
|
|
453
|
-
} else if (error instanceof KreuzbergError) {
|
|
454
|
-
console.error('Kreuzberg error:', error.message);
|
|
455
|
-
} else {
|
|
456
|
-
throw error;
|
|
457
|
-
}
|
|
458
|
-
}
|
|
459
|
-
```
|
|
313
|
+
- Large documents (> 100 MB) are streamed to avoid loading entirely into memory
|
|
314
|
+
- Temporary files are created in system temp directory for extraction
|
|
315
|
+
- Memory is automatically released after extraction completion
|
|
316
|
+
- ONNX models are cached in memory for repeated embeddings operations
|
|
460
317
|
|
|
461
|
-
## API Reference
|
|
462
|
-
|
|
463
|
-
### Extraction Functions
|
|
464
|
-
|
|
465
|
-
#### `extractFile(filePath: string, config?: ExtractionConfig): Promise<ExtractionResult>`
|
|
466
|
-
Asynchronously extract content from a file.
|
|
467
318
|
|
|
468
|
-
#### `extractFileSync(filePath: string, config?: ExtractionConfig): ExtractionResult`
|
|
469
|
-
Synchronously extract content from a file.
|
|
470
319
|
|
|
471
|
-
|
|
472
|
-
Asynchronously extract content from a buffer.
|
|
320
|
+
## Features
|
|
473
321
|
|
|
474
|
-
|
|
475
|
-
Synchronously extract content from a buffer.
|
|
322
|
+
### Supported File Formats (56+)
|
|
476
323
|
|
|
477
|
-
|
|
478
|
-
Asynchronously extract content from multiple files in parallel.
|
|
324
|
+
Kreuzberg supports 56 file formats across 8 major categories, with intelligent format detection and comprehensive metadata extraction.
|
|
479
325
|
|
|
480
|
-
####
|
|
481
|
-
Synchronously extract content from multiple files.
|
|
326
|
+
#### Office Documents
|
|
482
327
|
|
|
483
|
-
|
|
328
|
+
| Category | Formats | Capabilities |
|
|
329
|
+
|----------|---------|--------------|
|
|
330
|
+
| **Word Processing** | `.docx`, `.odt` | Full text, tables, images, metadata, styles |
|
|
331
|
+
| **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.ods` | Sheet data, formulas, cell metadata, charts |
|
|
332
|
+
| **Presentations** | `.pptx`, `.ppt`, `.ppsx` | Slides, speaker notes, images, metadata |
|
|
333
|
+
| **PDF** | `.pdf` | Text, tables, images, metadata, OCR support |
|
|
334
|
+
| **eBooks** | `.epub`, `.fb2` | Chapters, metadata, embedded resources |
|
|
484
335
|
|
|
485
|
-
####
|
|
486
|
-
Main result object containing:
|
|
487
|
-
- `content: string` - Extracted text content
|
|
488
|
-
- `mimeType: string` - MIME type of the document
|
|
489
|
-
- `metadata?: Metadata` - Document metadata
|
|
490
|
-
- `tables?: Table[]` - Extracted tables
|
|
491
|
-
- `images?: ImageData[]` - Extracted images
|
|
492
|
-
- `chunks?: Chunk[]` - Text chunks (if chunking enabled)
|
|
493
|
-
- `language?: LanguageInfo` - Detected language (if enabled)
|
|
336
|
+
#### Images (OCR-Enabled)
|
|
494
337
|
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
- `maxConcurrentExtractions?: number` - Max parallel extractions
|
|
501
|
-
- `ocr?: OcrConfig` - OCR settings
|
|
502
|
-
- `chunking?: ChunkingConfig` - Text chunking settings
|
|
503
|
-
- `images?: ImageExtractionConfig` - Image extraction settings
|
|
504
|
-
- `pdfOptions?: PdfConfig` - PDF-specific options
|
|
505
|
-
- `tokenReduction?: TokenReductionConfig` - Token reduction settings
|
|
506
|
-
- `languageDetection?: LanguageDetectionConfig` - Language detection settings
|
|
338
|
+
| Category | Formats | Features |
|
|
339
|
+
|----------|---------|----------|
|
|
340
|
+
| **Raster** | `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.bmp`, `.tiff`, `.tif` | OCR, table detection, EXIF metadata, dimensions, color space |
|
|
341
|
+
| **Advanced** | `.jp2`, `.jpx`, `.jpm`, `.mj2`, `.pnm`, `.pbm`, `.pgm`, `.ppm` | OCR, table detection, format-specific metadata |
|
|
342
|
+
| **Vector** | `.svg` | DOM parsing, embedded text, graphics metadata |
|
|
507
343
|
|
|
508
|
-
####
|
|
509
|
-
OCR configuration:
|
|
510
|
-
- `backend: string` - OCR backend ('tesseract', 'easyocr', 'paddleocr')
|
|
511
|
-
- `language: string` - Language code (e.g., 'eng', 'fra', 'deu')
|
|
512
|
-
- `preprocessing?: boolean` - Enable image preprocessing
|
|
513
|
-
- `tesseractConfig?: TesseractConfig` - Tesseract-specific options
|
|
344
|
+
#### Web & Data
|
|
514
345
|
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
- `columnCount: number` - Number of columns
|
|
346
|
+
| Category | Formats | Features |
|
|
347
|
+
|----------|---------|----------|
|
|
348
|
+
| **Markup** | `.html`, `.htm`, `.xhtml`, `.xml`, `.svg` | DOM parsing, metadata (Open Graph, Twitter Card), link extraction |
|
|
349
|
+
| **Structured Data** | `.json`, `.yaml`, `.yml`, `.toml`, `.csv`, `.tsv` | Schema detection, nested structures, validation |
|
|
350
|
+
| **Text & Markdown** | `.txt`, `.md`, `.markdown`, `.rst`, `.org`, `.rtf` | CommonMark, GFM, reStructuredText, Org Mode |
|
|
521
351
|
|
|
522
|
-
|
|
352
|
+
#### Email & Archives
|
|
523
353
|
|
|
524
|
-
|
|
354
|
+
| Category | Formats | Features |
|
|
355
|
+
|----------|---------|----------|
|
|
356
|
+
| **Email** | `.eml`, `.msg` | Headers, body (HTML/plain), attachments, threading |
|
|
357
|
+
| **Archives** | `.zip`, `.tar`, `.tgz`, `.gz`, `.7z` | File listing, nested archives, metadata |
|
|
525
358
|
|
|
526
|
-
|
|
527
|
-
- `ValidationError` - Invalid configuration, missing required fields, or invalid input
|
|
528
|
-
- `ParsingError` - Document parsing failure or corrupted file
|
|
529
|
-
- `OCRError` - OCR processing failure
|
|
530
|
-
- `MissingDependencyError` - Missing optional system dependency (includes installation instructions)
|
|
359
|
+
#### Academic & Scientific
|
|
531
360
|
|
|
532
|
-
|
|
361
|
+
| Category | Formats | Features |
|
|
362
|
+
|----------|---------|----------|
|
|
363
|
+
| **Citations** | `.bib`, `.biblatex`, `.ris`, `.enw`, `.csl` | Bibliography parsing, citation extraction |
|
|
364
|
+
| **Scientific** | `.tex`, `.latex`, `.typst`, `.jats`, `.ipynb`, `.docbook` | LaTeX, Jupyter notebooks, PubMed JATS |
|
|
365
|
+
| **Documentation** | `.opml`, `.pod`, `.mdoc`, `.troff` | Technical documentation formats |
|
|
533
366
|
|
|
534
|
-
|
|
535
|
-
|----------|---------|
|
|
536
|
-
| **Documents** | PDF, DOCX, DOC, PPTX, PPT, XLSX, XLS, ODT, ODP, ODS, RTF |
|
|
537
|
-
| **Images** | PNG, JPEG, JPG, WEBP, BMP, TIFF, GIF |
|
|
538
|
-
| **Web** | HTML, XHTML, XML |
|
|
539
|
-
| **Text** | TXT, MD, CSV, TSV, JSON, YAML, TOML |
|
|
540
|
-
| **Email** | EML, MSG |
|
|
541
|
-
| **Archives** | ZIP, TAR, 7Z |
|
|
542
|
-
| **Other** | And 30+ more formats |
|
|
367
|
+
**[Complete Format Reference](https://kreuzberg.dev/reference/formats/)**
|
|
543
368
|
|
|
544
|
-
|
|
369
|
+
### Key Capabilities
|
|
545
370
|
|
|
546
|
-
|
|
371
|
+
- **Text Extraction** - Extract all text content with position and formatting information
|
|
372
|
+
- **Metadata Extraction** - Retrieve document properties, creation date, author, etc.
|
|
373
|
+
- **Table Extraction** - Parse tables with structure and cell content preservation
|
|
374
|
+
- **Image Extraction** - Extract embedded images and render page previews
|
|
375
|
+
- **OCR Support** - Integrate multiple OCR backends for scanned documents
|
|
547
376
|
|
|
548
|
-
- **
|
|
549
|
-
- **Native multithreading** for batch processing
|
|
550
|
-
- **Optimized memory usage** with streaming for large files
|
|
551
|
-
- **Zero-copy operations** where possible
|
|
552
|
-
- **Efficient caching** to avoid redundant processing
|
|
377
|
+
- **Async/Await** - Non-blocking document processing with concurrent operations
|
|
553
378
|
|
|
554
|
-
### Benchmarks
|
|
555
379
|
|
|
556
|
-
|
|
380
|
+
- **Plugin System** - Extensible post-processing for custom text transformation
|
|
557
381
|
|
|
558
|
-
| Library | Time | Memory |
|
|
559
|
-
|---------|------|--------|
|
|
560
|
-
| Kreuzberg | 2.3s | 145 MB |
|
|
561
|
-
| pdf-parse + mammoth | 23.1s | 890 MB |
|
|
562
|
-
| textract | 45.2s | 1.2 GB |
|
|
563
382
|
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
### Native Module Not Found
|
|
383
|
+
- **Embeddings** - Generate vector embeddings using ONNX Runtime models
|
|
567
384
|
|
|
568
|
-
|
|
385
|
+
- **Batch Processing** - Efficiently process multiple documents in parallel
|
|
386
|
+
- **Memory Efficient** - Stream large files without loading entirely into memory
|
|
387
|
+
- **Language Detection** - Detect and support multiple languages in documents
|
|
388
|
+
- **Configuration** - Fine-grained control over extraction behavior
|
|
569
389
|
|
|
570
|
-
|
|
571
|
-
npm rebuild @kreuzberg/node
|
|
572
|
-
```
|
|
390
|
+
### Performance Characteristics
|
|
573
391
|
|
|
574
|
-
|
|
392
|
+
| Format | Speed | Memory | Notes |
|
|
393
|
+
|--------|-------|--------|-------|
|
|
394
|
+
| **PDF (text)** | 10-100 MB/s | ~50MB per doc | Fastest extraction |
|
|
395
|
+
| **Office docs** | 20-200 MB/s | ~100MB per doc | DOCX, XLSX, PPTX |
|
|
396
|
+
| **Images (OCR)** | 1-5 MB/s | Variable | Depends on OCR backend |
|
|
397
|
+
| **Archives** | 5-50 MB/s | ~200MB per doc | ZIP, TAR, etc. |
|
|
398
|
+
| **Web formats** | 50-200 MB/s | Streaming | HTML, XML, JSON |
|
|
575
399
|
|
|
576
|
-
Ensure Tesseract is installed and available in PATH:
|
|
577
400
|
|
|
578
|
-
```bash
|
|
579
|
-
tesseract --version
|
|
580
|
-
```
|
|
581
401
|
|
|
582
|
-
|
|
583
|
-
- macOS: `brew install tesseract`
|
|
584
|
-
- Ubuntu: `sudo apt-get install tesseract-ocr`
|
|
585
|
-
- Windows: Download from [tesseract-ocr/tesseract](https://github.com/tesseract-ocr/tesseract)
|
|
402
|
+
## OCR Support
|
|
586
403
|
|
|
587
|
-
|
|
404
|
+
Kreuzberg supports multiple OCR backends for extracting text from scanned documents and images:
|
|
588
405
|
|
|
589
|
-
|
|
406
|
+
|
|
407
|
+
- **Tesseract**
|
|
408
|
+
|
|
409
|
+
- **Guten**
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
### OCR Configuration Example
|
|
590
413
|
|
|
591
414
|
```typescript
|
|
415
|
+
import { extractFile } from '@kreuzberg/node';
|
|
416
|
+
|
|
592
417
|
const config = {
|
|
593
|
-
|
|
418
|
+
ocr: {
|
|
419
|
+
backend: 'tesseract',
|
|
420
|
+
language: 'eng+fra',
|
|
421
|
+
tesseractConfig: {
|
|
422
|
+
psm: 3,
|
|
423
|
+
},
|
|
424
|
+
},
|
|
594
425
|
};
|
|
595
|
-
|
|
426
|
+
|
|
427
|
+
const result = await extractFile('document.pdf', null, config);
|
|
428
|
+
console.log(result.content);
|
|
596
429
|
```
|
|
597
430
|
|
|
598
|
-
### TypeScript Types Not Resolving
|
|
599
431
|
|
|
600
|
-
Make sure you're using:
|
|
601
|
-
- Node.js 18 or higher
|
|
602
|
-
- TypeScript 5.0 or higher
|
|
603
432
|
|
|
604
|
-
The package includes built-in type definitions.
|
|
605
433
|
|
|
606
|
-
|
|
434
|
+
## Async Support
|
|
607
435
|
|
|
608
|
-
|
|
436
|
+
This binding provides full async/await support for non-blocking document processing:
|
|
609
437
|
|
|
610
438
|
```typescript
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
439
|
+
import { extractFile } from '@kreuzberg/node';
|
|
440
|
+
|
|
441
|
+
const result = await extractFile('document.pdf');
|
|
442
|
+
console.log(result.content);
|
|
615
443
|
```
|
|
616
444
|
|
|
617
|
-
## Examples
|
|
618
445
|
|
|
619
|
-
### Extract Invoice Data
|
|
620
446
|
|
|
621
|
-
```typescript
|
|
622
|
-
import { extractFile } from '@kreuzberg/node';
|
|
623
447
|
|
|
624
|
-
|
|
448
|
+
## Plugin System
|
|
625
449
|
|
|
626
|
-
|
|
627
|
-
if (result.tables && result.tables.length > 0) {
|
|
628
|
-
const lineItems = result.tables[0];
|
|
629
|
-
console.log(lineItems.markdown);
|
|
630
|
-
}
|
|
450
|
+
Kreuzberg supports extensible post-processing plugins for custom text transformation and filtering.
|
|
631
451
|
|
|
632
|
-
|
|
633
|
-
if (result.metadata) {
|
|
634
|
-
console.log('Invoice Date:', result.metadata.creationDate);
|
|
635
|
-
}
|
|
636
|
-
```
|
|
452
|
+
For detailed plugin documentation, visit [Plugin System Guide](https://kreuzberg.dev/guides/plugins/).
|
|
637
453
|
|
|
638
|
-
### Process Scanned Documents
|
|
639
454
|
|
|
640
|
-
```typescript
|
|
641
|
-
import { extractFile } from '@kreuzberg/node';
|
|
642
455
|
|
|
643
|
-
const config = {
|
|
644
|
-
forceOcr: true,
|
|
645
|
-
ocr: {
|
|
646
|
-
backend: 'tesseract',
|
|
647
|
-
language: 'eng',
|
|
648
|
-
preprocessing: true
|
|
649
|
-
}
|
|
650
|
-
};
|
|
651
456
|
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
457
|
+
## Embeddings Support
|
|
458
|
+
|
|
459
|
+
Generate vector embeddings for extracted text using the built-in ONNX Runtime integration. ONNX Runtime itself must be installed separately.
|
|
460
|
+
|
|
461
|
+
**[Embeddings Guide](https://kreuzberg.dev/features/#embeddings)**
|
|
462
|
+
|
|
655
463
|
|
|
656
|
-
|
|
464
|
+
|
|
465
|
+
## Batch Processing
|
|
466
|
+
|
|
467
|
+
Process multiple documents efficiently:
|
|
657
468
|
|
|
658
469
|
```typescript
|
|
659
|
-
import {
|
|
660
|
-
import { glob } from 'glob';
|
|
470
|
+
import { batchExtractFilesSync } from '@kreuzberg/node';
|
|
661
471
|
|
|
662
|
-
|
|
663
|
-
const
|
|
472
|
+
const files = ['doc1.pdf', 'doc2.docx', 'doc3.pptx'];
|
|
473
|
+
const results = batchExtractFilesSync(files);
|
|
664
474
|
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
maxConcurrentExtractions: 8,
|
|
668
|
-
enableQualityProcessing: true
|
|
475
|
+
results.forEach((result, i) => {
|
|
476
|
+
console.log(`File ${i + 1}: ${result.content.length} characters`);
|
|
669
477
|
});
|
|
478
|
+
```
|
|
670
479
|
|
|
671
|
-
// Build search index
|
|
672
|
-
const searchIndex = results.map((result, i) => ({
|
|
673
|
-
path: files[i],
|
|
674
|
-
content: result.content,
|
|
675
|
-
metadata: result.metadata
|
|
676
|
-
}));
|
|
677
480
|
|
|
678
|
-
|
|
679
|
-
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
## Configuration
|
|
484
|
+
|
|
485
|
+
For advanced configuration options including language detection, table extraction, OCR settings, and more:
|
|
486
|
+
|
|
487
|
+
**[Configuration Guide](https://kreuzberg.dev/guides/configuration/)**
|
|
680
488
|
|
|
681
489
|
## Documentation
|
|
682
490
|
|
|
683
|
-
|
|
491
|
+
- **[Official Documentation](https://kreuzberg.dev/)**
|
|
492
|
+
- **[API Reference](https://kreuzberg.dev/reference/api-typescript/)**
|
|
493
|
+
- **[Examples & Guides](https://kreuzberg.dev/guides/)**
|
|
684
494
|
|
|
685
495
|
## Contributing
|
|
686
496
|
|
|
687
|
-
|
|
497
|
+
Contributions are welcome! See [Contributing Guide](https://github.com/kreuzberg-dev/kreuzberg/blob/main/CONTRIBUTING.md).
|
|
688
498
|
|
|
689
499
|
## License
|
|
690
500
|
|
|
691
|
-
MIT
|
|
501
|
+
MIT License - see the LICENSE file for details.
|
|
692
502
|
|
|
693
|
-
##
|
|
503
|
+
## Support
|
|
694
504
|
|
|
695
|
-
- [
|
|
696
|
-
- [
|
|
697
|
-
- [
|
|
698
|
-
- [Issue Tracker](https://github.com/kreuzberg-dev/kreuzberg/issues)
|
|
699
|
-
- [Changelog](https://github.com/kreuzberg-dev/kreuzberg/blob/main/CHANGELOG.md)
|
|
700
|
-
- [npm Package](https://www.npmjs.com/package/@kreuzberg/node)
|
|
505
|
+
- **Discord Community**: [Join our Discord](https://discord.gg/pXxagNK2zN)
|
|
506
|
+
- **GitHub Issues**: [Report bugs](https://github.com/kreuzberg-dev/kreuzberg/issues)
|
|
507
|
+
- **Discussions**: [Ask questions](https://github.com/kreuzberg-dev/kreuzberg/discussions)
|