@kreuzberg/node 4.0.0-rc.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +7 -0
- package/README.md +669 -0
- package/index.d.ts +1109 -0
- package/index.js +607 -0
- package/metadata.d.ts +502 -0
- package/package.json +128 -0
package/index.d.ts
ADDED
|
@@ -0,0 +1,1109 @@
|
|
|
1
|
+
/* auto-generated by NAPI-RS */
|
|
2
|
+
/* eslint-disable */
|
|
3
|
+
/**
|
|
4
|
+
* Batch extract from multiple byte arrays (asynchronous).
|
|
5
|
+
*
|
|
6
|
+
* Asynchronously processes multiple in-memory buffers in parallel. Non-blocking
|
|
7
|
+
* alternative to `batchExtractBytesSync`.
|
|
8
|
+
*
|
|
9
|
+
* # Parameters
|
|
10
|
+
*
|
|
11
|
+
* * `dataList` - Array of buffers to extract
|
|
12
|
+
* * `mimeTypes` - Array of MIME types (must match `dataList` length)
|
|
13
|
+
* * `config` - Optional extraction configuration
|
|
14
|
+
*
|
|
15
|
+
* # Returns
|
|
16
|
+
*
|
|
17
|
+
* Promise resolving to array of `ExtractionResult`.
|
|
18
|
+
*
|
|
19
|
+
* # Example
|
|
20
|
+
*
|
|
21
|
+
* ```typescript
|
|
22
|
+
* import { batchExtractBytes } from '@kreuzberg/node';
|
|
23
|
+
*
|
|
24
|
+
* const responses = await Promise.all([
|
|
25
|
+
* fetch('https://example.com/doc1.pdf'),
|
|
26
|
+
* fetch('https://example.com/doc2.pdf')
|
|
27
|
+
* ]);
|
|
28
|
+
* const buffers = await Promise.all(
|
|
29
|
+
* responses.map(r => r.arrayBuffer().then(b => Buffer.from(b)))
|
|
30
|
+
* );
|
|
31
|
+
* const results = await batchExtractBytes(
|
|
32
|
+
* buffers,
|
|
33
|
+
* ['application/pdf', 'application/pdf'],
|
|
34
|
+
* null
|
|
35
|
+
* );
|
|
36
|
+
* ```
|
|
37
|
+
*/
|
|
38
|
+
export declare function batchExtractBytes(dataList: Array<Buffer>, mimeTypes: Array<string>, config?: JsExtractionConfig | undefined | null): Promise<Array<JsExtractionResult>>
|
|
39
|
+
|
|
40
|
+
/**
|
|
41
|
+
* Batch extract from multiple byte arrays (synchronous).
|
|
42
|
+
*
|
|
43
|
+
* Synchronously processes multiple in-memory buffers in parallel. Requires
|
|
44
|
+
* corresponding MIME types for each buffer.
|
|
45
|
+
*
|
|
46
|
+
* # Parameters
|
|
47
|
+
*
|
|
48
|
+
* * `dataList` - Array of buffers to extract
|
|
49
|
+
* * `mimeTypes` - Array of MIME types (must match `dataList` length)
|
|
50
|
+
* * `config` - Optional extraction configuration
|
|
51
|
+
*
|
|
52
|
+
* # Returns
|
|
53
|
+
*
|
|
54
|
+
* Array of `ExtractionResult` in the same order as inputs.
|
|
55
|
+
*
|
|
56
|
+
* # Errors
|
|
57
|
+
*
|
|
58
|
+
* Throws if `dataList` and `mimeTypes` lengths don't match.
|
|
59
|
+
*
|
|
60
|
+
* # Example
|
|
61
|
+
*
|
|
62
|
+
* ```typescript
|
|
63
|
+
* import { batchExtractBytesSync } from '@kreuzberg/node';
|
|
64
|
+
*
|
|
65
|
+
* const buffers = [buffer1, buffer2, buffer3];
|
|
66
|
+
* const mimeTypes = ['application/pdf', 'image/png', 'text/plain'];
|
|
67
|
+
* const results = batchExtractBytesSync(buffers, mimeTypes, null);
|
|
68
|
+
* ```
|
|
69
|
+
*/
|
|
70
|
+
export declare function batchExtractBytesSync(dataList: Array<Buffer>, mimeTypes: Array<string>, config?: JsExtractionConfig | undefined | null): Array<JsExtractionResult>
|
|
71
|
+
|
|
72
|
+
/**
|
|
73
|
+
* Batch extract from multiple files (asynchronous).
|
|
74
|
+
*
|
|
75
|
+
* Asynchronously processes multiple files in parallel. Non-blocking alternative
|
|
76
|
+
* to `batchExtractFilesSync` with same performance benefits.
|
|
77
|
+
*
|
|
78
|
+
* # Parameters
|
|
79
|
+
*
|
|
80
|
+
* * `paths` - Array of file paths to extract
|
|
81
|
+
* * `config` - Optional extraction configuration (applied to all files)
|
|
82
|
+
*
|
|
83
|
+
* # Returns
|
|
84
|
+
*
|
|
85
|
+
* Promise resolving to array of `ExtractionResult`.
|
|
86
|
+
*
|
|
87
|
+
* # Example
|
|
88
|
+
*
|
|
89
|
+
* ```typescript
|
|
90
|
+
* import { batchExtractFiles } from '@kreuzberg/node';
|
|
91
|
+
*
|
|
92
|
+
* const files = ['report1.pdf', 'report2.pdf', 'report3.pdf'];
|
|
93
|
+
* const results = await batchExtractFiles(files, null);
|
|
94
|
+
* console.log(`Processed ${results.length} files`);
|
|
95
|
+
* ```
|
|
96
|
+
*/
|
|
97
|
+
export declare function batchExtractFiles(paths: Array<string>, config?: JsExtractionConfig | undefined | null): Promise<Array<JsExtractionResult>>
|
|
98
|
+
|
|
99
|
+
/**
|
|
100
|
+
* Batch extract from multiple files (synchronous).
|
|
101
|
+
*
|
|
102
|
+
* Synchronously processes multiple files in parallel using Rayon. Significantly
|
|
103
|
+
* faster than sequential processing for large batches.
|
|
104
|
+
*
|
|
105
|
+
* # Parameters
|
|
106
|
+
*
|
|
107
|
+
* * `paths` - Array of file paths to extract
|
|
108
|
+
* * `config` - Optional extraction configuration (applied to all files)
|
|
109
|
+
*
|
|
110
|
+
* # Returns
|
|
111
|
+
*
|
|
112
|
+
* Array of `ExtractionResult` in the same order as input paths.
|
|
113
|
+
*
|
|
114
|
+
* # Example
|
|
115
|
+
*
|
|
116
|
+
* ```typescript
|
|
117
|
+
* import { batchExtractFilesSync } from '@kreuzberg/node';
|
|
118
|
+
*
|
|
119
|
+
* const files = ['doc1.pdf', 'doc2.docx', 'doc3.txt'];
|
|
120
|
+
* const results = batchExtractFilesSync(files, null);
|
|
121
|
+
* results.forEach((result, i) => {
|
|
122
|
+
* console.log(`File ${files[i]}: ${result.content.substring(0, 100)}...`);
|
|
123
|
+
* });
|
|
124
|
+
* ```
|
|
125
|
+
*/
|
|
126
|
+
export declare function batchExtractFilesSync(paths: Array<string>, config?: JsExtractionConfig | undefined | null): Array<JsExtractionResult>
|
|
127
|
+
|
|
128
|
+
/**
|
|
129
|
+
* Clear all registered document extractors.
|
|
130
|
+
*
|
|
131
|
+
* Removes all document extractors from the registry, including built-in extractors.
|
|
132
|
+
* Use with caution as this will make document extraction unavailable until
|
|
133
|
+
* extractors are re-registered.
|
|
134
|
+
*
|
|
135
|
+
* # Example
|
|
136
|
+
*
|
|
137
|
+
* ```typescript
|
|
138
|
+
* import { clearDocumentExtractors } from '@kreuzberg/node';
|
|
139
|
+
*
|
|
140
|
+
* clearDocumentExtractors();
|
|
141
|
+
* ```
|
|
142
|
+
*/
|
|
143
|
+
export declare function clearDocumentExtractors(): void
|
|
144
|
+
|
|
145
|
+
/**
|
|
146
|
+
* Clear all registered OCR backends.
|
|
147
|
+
*
|
|
148
|
+
* Removes all OCR backends from the registry, including built-in backends.
|
|
149
|
+
* Use with caution as this will make OCR functionality unavailable until
|
|
150
|
+
* backends are re-registered.
|
|
151
|
+
*
|
|
152
|
+
* # Example
|
|
153
|
+
*
|
|
154
|
+
* ```typescript
|
|
155
|
+
* import { clearOcrBackends } from '@kreuzberg/node';
|
|
156
|
+
*
|
|
157
|
+
* clearOcrBackends();
|
|
158
|
+
* ```
|
|
159
|
+
*/
|
|
160
|
+
export declare function clearOcrBackends(): void
|
|
161
|
+
|
|
162
|
+
/** Clear all registered postprocessors */
|
|
163
|
+
export declare function clearPostProcessors(): void
|
|
164
|
+
|
|
165
|
+
/** Clear all registered validators */
|
|
166
|
+
export declare function clearValidators(): void
|
|
167
|
+
|
|
168
|
+
/**
|
|
169
|
+
* Detect MIME type from raw bytes.
|
|
170
|
+
*
|
|
171
|
+
* Uses content inspection (magic bytes) to determine MIME type.
|
|
172
|
+
* This is more accurate than extension-based detection but requires
|
|
173
|
+
* reading the file content.
|
|
174
|
+
*
|
|
175
|
+
* # Parameters
|
|
176
|
+
*
|
|
177
|
+
* * `bytes` - Raw file content as Buffer
|
|
178
|
+
*
|
|
179
|
+
* # Returns
|
|
180
|
+
*
|
|
181
|
+
* The detected MIME type string.
|
|
182
|
+
*
|
|
183
|
+
* # Errors
|
|
184
|
+
*
|
|
185
|
+
* Throws an error if MIME type cannot be determined from content.
|
|
186
|
+
*
|
|
187
|
+
* # Example
|
|
188
|
+
*
|
|
189
|
+
* ```typescript
|
|
190
|
+
* import { detectMimeType } from '@kreuzberg/node';
|
|
191
|
+
* import * as fs from 'fs';
|
|
192
|
+
*
|
|
193
|
+
* // Read file content
|
|
194
|
+
* const content = fs.readFileSync('document.pdf');
|
|
195
|
+
*
|
|
196
|
+
* // Detect MIME type from bytes
|
|
197
|
+
* const mimeType = detectMimeType(content);
|
|
198
|
+
* console.log(mimeType); // 'application/pdf'
|
|
199
|
+
* ```
|
|
200
|
+
*/
|
|
201
|
+
export declare function detectMimeType(bytes: Buffer): string
|
|
202
|
+
|
|
203
|
+
/**
|
|
204
|
+
* Detect MIME type from a file path.
|
|
205
|
+
*
|
|
206
|
+
* Uses file extension to determine MIME type. Falls back to `mime_guess` crate
|
|
207
|
+
* if extension-based detection fails.
|
|
208
|
+
*
|
|
209
|
+
* # Parameters
|
|
210
|
+
*
|
|
211
|
+
* * `path` - Path to the file (string)
|
|
212
|
+
* * `checkExists` - Whether to verify file existence (default: true)
|
|
213
|
+
*
|
|
214
|
+
* # Returns
|
|
215
|
+
*
|
|
216
|
+
* The detected MIME type string.
|
|
217
|
+
*
|
|
218
|
+
* # Errors
|
|
219
|
+
*
|
|
220
|
+
* Throws an error if:
|
|
221
|
+
* - File doesn't exist (when `checkExists` is true)
|
|
222
|
+
* - MIME type cannot be determined from path/extension
|
|
223
|
+
* - Extension is unknown
|
|
224
|
+
*
|
|
225
|
+
* # Example
|
|
226
|
+
*
|
|
227
|
+
* ```typescript
|
|
228
|
+
* import { detectMimeTypeFromPath } from '@kreuzberg/node';
|
|
229
|
+
*
|
|
230
|
+
* // Detect from existing file
|
|
231
|
+
* const mimeType = detectMimeTypeFromPath('document.pdf');
|
|
232
|
+
* console.log(mimeType); // 'application/pdf'
|
|
233
|
+
*
|
|
234
|
+
* const mimeType2 = detectMimeTypeFromPath('document.docx');
|
|
235
|
+
* console.log(mimeType2); // 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
|
236
|
+
* ```
|
|
237
|
+
*/
|
|
238
|
+
export declare function detectMimeTypeFromPath(path: string, checkExists?: boolean | undefined | null): string
|
|
239
|
+
|
|
240
|
+
/**
|
|
241
|
+
* Discover and load extraction configuration from current or parent directories.
|
|
242
|
+
*
|
|
243
|
+
* Searches for a `kreuzberg.toml` file starting from the current working directory
|
|
244
|
+
* and traversing up the directory tree. Returns the first configuration file found.
|
|
245
|
+
*
|
|
246
|
+
* # Returns
|
|
247
|
+
*
|
|
248
|
+
* `JsExtractionConfig` object if a configuration file is found, or `null` if no
|
|
249
|
+
* configuration file exists in the current or parent directories.
|
|
250
|
+
*
|
|
251
|
+
* # Example
|
|
252
|
+
*
|
|
253
|
+
* ```typescript
|
|
254
|
+
* import { discoverExtractionConfig } from '@kreuzberg/node';
|
|
255
|
+
*
|
|
256
|
+
* // Try to find config in current or parent directories
|
|
257
|
+
* const config = discoverExtractionConfig();
|
|
258
|
+
* if (config) {
|
|
259
|
+
* console.log('Found configuration');
|
|
260
|
+
* // Use config for extraction
|
|
261
|
+
* } else {
|
|
262
|
+
* console.log('No configuration file found, using defaults');
|
|
263
|
+
* }
|
|
264
|
+
* ```
|
|
265
|
+
*/
|
|
266
|
+
export declare function discoverExtractionConfig(): JsExtractionConfig | null
|
|
267
|
+
|
|
268
|
+
/**
|
|
269
|
+
* Embedding preset configuration for TypeScript bindings.
|
|
270
|
+
*
|
|
271
|
+
* Contains all settings for a specific embedding model preset.
|
|
272
|
+
*/
|
|
273
|
+
export interface EmbeddingPreset {
|
|
274
|
+
/** Name of the preset (e.g., "fast", "balanced", "quality", "multilingual") */
|
|
275
|
+
name: string
|
|
276
|
+
/** Recommended chunk size in characters */
|
|
277
|
+
chunkSize: number
|
|
278
|
+
/** Recommended overlap in characters */
|
|
279
|
+
overlap: number
|
|
280
|
+
/** Model identifier (e.g., "AllMiniLML6V2Q", "BGEBaseENV15") */
|
|
281
|
+
modelName: string
|
|
282
|
+
/** Embedding vector dimensions */
|
|
283
|
+
dimensions: number
|
|
284
|
+
/** Human-readable description of the preset */
|
|
285
|
+
description: string
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
/**
|
|
289
|
+
* Extract content from bytes (asynchronous).
|
|
290
|
+
*
|
|
291
|
+
* Asynchronously extracts content from a byte buffer. Non-blocking alternative
|
|
292
|
+
* to `extractBytesSync` for processing in-memory data.
|
|
293
|
+
*
|
|
294
|
+
* # Parameters
|
|
295
|
+
*
|
|
296
|
+
* * `data` - Buffer containing the document bytes
|
|
297
|
+
* * `mimeType` - MIME type of the data
|
|
298
|
+
* * `config` - Optional extraction configuration
|
|
299
|
+
*
|
|
300
|
+
* # Returns
|
|
301
|
+
*
|
|
302
|
+
* Promise resolving to `ExtractionResult`.
|
|
303
|
+
*
|
|
304
|
+
* # Example
|
|
305
|
+
*
|
|
306
|
+
* ```typescript
|
|
307
|
+
* import { extractBytes } from '@kreuzberg/node';
|
|
308
|
+
*
|
|
309
|
+
* const response = await fetch('https://example.com/document.pdf');
|
|
310
|
+
* const buffer = Buffer.from(await response.arrayBuffer());
|
|
311
|
+
* const result = await extractBytes(buffer, 'application/pdf', null);
|
|
312
|
+
* ```
|
|
313
|
+
*/
|
|
314
|
+
export declare function extractBytes(data: Buffer, mimeType: string, config?: JsExtractionConfig | undefined | null): Promise<JsExtractionResult>
|
|
315
|
+
|
|
316
|
+
/**
|
|
317
|
+
* Extract content from bytes (synchronous).
|
|
318
|
+
*
|
|
319
|
+
* Synchronously extracts content from a byte buffer without requiring a file path.
|
|
320
|
+
* Useful for processing in-memory data, network streams, or database BLOBs.
|
|
321
|
+
*
|
|
322
|
+
* # Parameters
|
|
323
|
+
*
|
|
324
|
+
* * `data` - Buffer containing the document bytes
|
|
325
|
+
* * `mimeType` - MIME type of the data (e.g., "application/pdf", "image/png")
|
|
326
|
+
* * `config` - Optional extraction configuration
|
|
327
|
+
*
|
|
328
|
+
* # Returns
|
|
329
|
+
*
|
|
330
|
+
* `ExtractionResult` with extracted content and metadata.
|
|
331
|
+
*
|
|
332
|
+
* # Errors
|
|
333
|
+
*
|
|
334
|
+
* Throws an error if data is malformed or MIME type is unsupported.
|
|
335
|
+
*
|
|
336
|
+
* # Example
|
|
337
|
+
*
|
|
338
|
+
* ```typescript
|
|
339
|
+
* import { extractBytesSync } from '@kreuzberg/node';
|
|
340
|
+
* import fs from 'fs';
|
|
341
|
+
*
|
|
342
|
+
* const buffer = fs.readFileSync('document.pdf');
|
|
343
|
+
* const result = extractBytesSync(buffer, 'application/pdf', null);
|
|
344
|
+
* console.log(result.content);
|
|
345
|
+
* ```
|
|
346
|
+
*/
|
|
347
|
+
export declare function extractBytesSync(data: Buffer, mimeType: string, config?: JsExtractionConfig | undefined | null): JsExtractionResult
|
|
348
|
+
|
|
349
|
+
/**
|
|
350
|
+
* Extract content from a file (asynchronous).
|
|
351
|
+
*
|
|
352
|
+
* Asynchronously extracts text, tables, images, and metadata from a document file.
|
|
353
|
+
* Non-blocking alternative to `extractFileSync` for use in async/await contexts.
|
|
354
|
+
*
|
|
355
|
+
* # Parameters
|
|
356
|
+
*
|
|
357
|
+
* * `filePath` - Path to the file to extract (absolute or relative)
|
|
358
|
+
* * `mimeType` - Optional MIME type hint (auto-detected if omitted)
|
|
359
|
+
* * `config` - Optional extraction configuration (OCR, chunking, etc.)
|
|
360
|
+
*
|
|
361
|
+
* # Returns
|
|
362
|
+
*
|
|
363
|
+
* Promise resolving to `ExtractionResult` with extracted content and metadata.
|
|
364
|
+
*
|
|
365
|
+
* # Errors
|
|
366
|
+
*
|
|
367
|
+
* Rejects if file processing fails (see `extractFileSync` for error conditions).
|
|
368
|
+
*
|
|
369
|
+
* # Example
|
|
370
|
+
*
|
|
371
|
+
* ```typescript
|
|
372
|
+
* import { extractFile } from '@kreuzberg/node';
|
|
373
|
+
*
|
|
374
|
+
* // Async/await usage
|
|
375
|
+
* const result = await extractFile('document.pdf', null, null);
|
|
376
|
+
* console.log(result.content);
|
|
377
|
+
*
|
|
378
|
+
* // Promise usage
|
|
379
|
+
* extractFile('report.docx', null, null)
|
|
380
|
+
* .then(result => console.log(result.content))
|
|
381
|
+
* .catch(err => console.error(err));
|
|
382
|
+
* ```
|
|
383
|
+
*/
|
|
384
|
+
export declare function extractFile(filePath: string, mimeType?: string | undefined | null, config?: JsExtractionConfig | undefined | null): Promise<JsExtractionResult>
|
|
385
|
+
|
|
386
|
+
/**
|
|
387
|
+
* Extract content from a file (synchronous).
|
|
388
|
+
*
|
|
389
|
+
* Synchronously extracts text, tables, images, and metadata from a document file.
|
|
390
|
+
* Supports 118+ file formats including PDFs, Office documents, images, and more.
|
|
391
|
+
*
|
|
392
|
+
* # Parameters
|
|
393
|
+
*
|
|
394
|
+
* * `filePath` - Path to the file to extract (absolute or relative)
|
|
395
|
+
* * `mimeType` - Optional MIME type hint (auto-detected if omitted)
|
|
396
|
+
* * `config` - Optional extraction configuration (OCR, chunking, etc.)
|
|
397
|
+
*
|
|
398
|
+
* # Returns
|
|
399
|
+
*
|
|
400
|
+
* `ExtractionResult` containing:
|
|
401
|
+
* - `content`: Extracted text content
|
|
402
|
+
* - `mimeType`: Detected MIME type
|
|
403
|
+
* - `metadata`: File metadata (author, title, etc.)
|
|
404
|
+
* - `tables`: Extracted tables (if any)
|
|
405
|
+
* - `images`: Extracted images (if configured)
|
|
406
|
+
* - `chunks`: Text chunks (if chunking enabled)
|
|
407
|
+
* - `detectedLanguages`: Detected languages (if enabled)
|
|
408
|
+
*
|
|
409
|
+
* # Errors
|
|
410
|
+
*
|
|
411
|
+
* Throws an error if:
|
|
412
|
+
* - File does not exist or is not accessible
|
|
413
|
+
* - File format is unsupported
|
|
414
|
+
* - File is corrupted or malformed
|
|
415
|
+
* - OCR processing fails (if enabled)
|
|
416
|
+
*
|
|
417
|
+
* # Example
|
|
418
|
+
*
|
|
419
|
+
* ```typescript
|
|
420
|
+
* import { extractFileSync, JsExtractionConfig } from '@kreuzberg/node';
|
|
421
|
+
*
|
|
422
|
+
* // Basic extraction
|
|
423
|
+
* const result = extractFileSync('document.pdf', null, null);
|
|
424
|
+
* console.log(result.content);
|
|
425
|
+
*
|
|
426
|
+
* // With MIME type hint
|
|
427
|
+
* const result2 = extractFileSync('file.bin', 'application/pdf', null);
|
|
428
|
+
*
|
|
429
|
+
* // With OCR enabled
|
|
430
|
+
* const config: JsExtractionConfig = {
|
|
431
|
+
* ocr: {
|
|
432
|
+
* backend: 'tesseract',
|
|
433
|
+
* language: 'eng',
|
|
434
|
+
* }
|
|
435
|
+
* };
|
|
436
|
+
* const result3 = extractFileSync('scanned.pdf', null, config);
|
|
437
|
+
* ```
|
|
438
|
+
*/
|
|
439
|
+
export declare function extractFileSync(filePath: string, mimeType?: string | undefined | null, config?: JsExtractionConfig | undefined | null): JsExtractionResult
|
|
440
|
+
|
|
441
|
+
/**
|
|
442
|
+
* Get a specific embedding preset by name.
|
|
443
|
+
*
|
|
444
|
+
* Returns a preset configuration object, or null if the preset name is not found.
|
|
445
|
+
*
|
|
446
|
+
* # Parameters
|
|
447
|
+
*
|
|
448
|
+
* * `name` - The preset name (case-sensitive)
|
|
449
|
+
*
|
|
450
|
+
* # Returns
|
|
451
|
+
*
|
|
452
|
+
* An `EmbeddingPreset` object with the following properties:
|
|
453
|
+
* - `name`: string - Preset name
|
|
454
|
+
* - `chunkSize`: number - Recommended chunk size in characters
|
|
455
|
+
* - `overlap`: number - Recommended overlap in characters
|
|
456
|
+
* - `modelName`: string - Model identifier
|
|
457
|
+
* - `dimensions`: number - Embedding vector dimensions
|
|
458
|
+
* - `description`: string - Human-readable description
|
|
459
|
+
*
|
|
460
|
+
* Returns `null` if preset name is not found.
|
|
461
|
+
*
|
|
462
|
+
* # Example
|
|
463
|
+
*
|
|
464
|
+
* ```typescript
|
|
465
|
+
* import { getEmbeddingPreset } from '@kreuzberg/node';
|
|
466
|
+
*
|
|
467
|
+
* const preset = getEmbeddingPreset('balanced');
|
|
468
|
+
* if (preset) {
|
|
469
|
+
* console.log(`Model: ${preset.modelName}, Dims: ${preset.dimensions}`);
|
|
470
|
+
* // Model: BGEBaseENV15, Dims: 768
|
|
471
|
+
* }
|
|
472
|
+
* ```
|
|
473
|
+
*/
|
|
474
|
+
export declare function getEmbeddingPreset(name: string): EmbeddingPreset | null
|
|
475
|
+
|
|
476
|
+
/**
|
|
477
|
+
* Get file extensions for a given MIME type.
|
|
478
|
+
*
|
|
479
|
+
* Returns an array of file extensions commonly associated with the specified
|
|
480
|
+
* MIME type. For example, 'application/pdf' returns ['pdf'].
|
|
481
|
+
*
|
|
482
|
+
* # Parameters
|
|
483
|
+
*
|
|
484
|
+
* * `mimeType` - The MIME type to look up (e.g., 'application/pdf', 'image/jpeg')
|
|
485
|
+
*
|
|
486
|
+
* # Returns
|
|
487
|
+
*
|
|
488
|
+
* Array of file extensions (without leading dots).
|
|
489
|
+
*
|
|
490
|
+
* # Errors
|
|
491
|
+
*
|
|
492
|
+
* Throws an error if the MIME type is not recognized or supported.
|
|
493
|
+
*
|
|
494
|
+
* # Example
|
|
495
|
+
*
|
|
496
|
+
* ```typescript
|
|
497
|
+
* import { getExtensionsForMime } from '@kreuzberg/node';
|
|
498
|
+
*
|
|
499
|
+
* // Get extensions for PDF
|
|
500
|
+
* const pdfExts = getExtensionsForMime('application/pdf');
|
|
501
|
+
* console.log(pdfExts); // ['pdf']
|
|
502
|
+
*
|
|
503
|
+
* // Get extensions for JPEG
|
|
504
|
+
* const jpegExts = getExtensionsForMime('image/jpeg');
|
|
505
|
+
* console.log(jpegExts); // ['jpg', 'jpeg']
|
|
506
|
+
* ```
|
|
507
|
+
*/
|
|
508
|
+
export declare function getExtensionsForMime(mimeType: string): Array<string>
|
|
509
|
+
|
|
510
|
+
/**
|
|
511
|
+
* Get the error code for the last FFI error.
|
|
512
|
+
*
|
|
513
|
+
* Returns the FFI error code as an integer. Error codes are:
|
|
514
|
+
* - 0: Success (no error)
|
|
515
|
+
* - 1: GenericError
|
|
516
|
+
* - 2: Panic
|
|
517
|
+
* - 3: InvalidArgument
|
|
518
|
+
* - 4: IoError
|
|
519
|
+
* - 5: ParsingError
|
|
520
|
+
* - 6: OcrError
|
|
521
|
+
* - 7: MissingDependency
|
|
522
|
+
*
|
|
523
|
+
* This is useful for programmatic error handling and distinguishing
|
|
524
|
+
* between different types of failures in native code.
|
|
525
|
+
*
|
|
526
|
+
* # Returns
|
|
527
|
+
*
|
|
528
|
+
* The integer error code.
|
|
529
|
+
*
|
|
530
|
+
* # Example
|
|
531
|
+
*
|
|
532
|
+
* ```typescript
|
|
533
|
+
* import { extractFile, getLastErrorCode } from '@kreuzberg/node';
|
|
534
|
+
*
|
|
535
|
+
* try {
|
|
536
|
+
* const result = await extractFile('document.pdf');
|
|
537
|
+
* } catch (error) {
|
|
538
|
+
* const code = getLastErrorCode();
|
|
539
|
+
* if (code === 2) { // 2 = Panic (see the error code list above)
|
|
540
|
+
* console.error('Native code panic detected');
|
|
541
|
+
* }
|
|
542
|
+
* }
|
|
543
|
+
* ```
|
|
544
|
+
*/
|
|
545
|
+
export declare function getLastErrorCode(): number
|
|
546
|
+
|
|
547
|
+
/**
|
|
548
|
+
* Get panic context information if the last error was a panic.
|
|
549
|
+
*
|
|
550
|
+
* Returns detailed information about a panic in native code, or null
|
|
551
|
+
* if the last error was not a panic.
|
|
552
|
+
*
|
|
553
|
+
* # Returns
|
|
554
|
+
*
|
|
555
|
+
* A `PanicContext` object with:
|
|
556
|
+
* - `file`: string - Source file where panic occurred
|
|
557
|
+
* - `line`: number - Line number
|
|
558
|
+
* - `function`: string - Function name
|
|
559
|
+
* - `message`: string - Panic message
|
|
560
|
+
* - `timestamp_secs`: number - Unix timestamp (seconds since epoch)
|
|
561
|
+
*
|
|
562
|
+
* Returns `null` if no panic context is available.
|
|
563
|
+
*
|
|
564
|
+
* # Example
|
|
565
|
+
*
|
|
566
|
+
* ```typescript
|
|
567
|
+
* import { extractFile, getLastPanicContext } from '@kreuzberg/node';
|
|
568
|
+
*
|
|
569
|
+
* try {
|
|
570
|
+
* const result = await extractFile('document.pdf');
|
|
571
|
+
* } catch (error) {
|
|
572
|
+
* const context = getLastPanicContext();
|
|
573
|
+
* if (context) {
|
|
574
|
+
* console.error(`Panic at ${context.file}:${context.line}`);
|
|
575
|
+
* console.error(`In function: ${context.function}`);
|
|
576
|
+
* console.error(`Message: ${context.message}`);
|
|
577
|
+
* }
|
|
578
|
+
* }
|
|
579
|
+
* ```
|
|
580
|
+
*/
|
|
581
|
+
// NOTE(review): `any | null` is equivalent to plain `any`, so callers get no type checking here.
// The JSDoc for this function lists the fields file, line, function, message and timestamp_secs;
// a dedicated interface could replace `any` - confirm the exact field names against the native binding first.
export declare function getLastPanicContext(): any | null
|
|
582
|
+
|
|
583
|
+
export interface JsChunk {
|
|
584
|
+
content: string
|
|
585
|
+
embedding?: number[] | undefined
|
|
586
|
+
metadata: JsChunkMetadata
|
|
587
|
+
}
|
|
588
|
+
|
|
589
|
+
export interface JsChunkingConfig {
|
|
590
|
+
maxChars?: number
|
|
591
|
+
maxOverlap?: number
|
|
592
|
+
/** Optional embedding configuration for generating embeddings */
|
|
593
|
+
embedding?: JsEmbeddingConfig
|
|
594
|
+
/** Optional preset name for chunking parameters */
|
|
595
|
+
preset?: string
|
|
596
|
+
}
|
|
597
|
+
|
|
598
|
+
export interface JsChunkMetadata {
|
|
599
|
+
charStart: number
|
|
600
|
+
charEnd: number
|
|
601
|
+
tokenCount?: number
|
|
602
|
+
chunkIndex: number
|
|
603
|
+
totalChunks: number
|
|
604
|
+
}
|
|
605
|
+
|
|
606
|
+
/** Embedding generation configuration for Node.js bindings. */
|
|
607
|
+
export interface JsEmbeddingConfig {
|
|
608
|
+
/** Embedding model configuration */
|
|
609
|
+
model?: JsEmbeddingModelType
|
|
610
|
+
/** Whether to normalize embeddings (L2 normalization) */
|
|
611
|
+
normalize?: boolean
|
|
612
|
+
/** Batch size for embedding generation */
|
|
613
|
+
batchSize?: number
|
|
614
|
+
/** Whether to show download progress for models */
|
|
615
|
+
showDownloadProgress?: boolean
|
|
616
|
+
/** Custom cache directory for model storage */
|
|
617
|
+
cacheDir?: string
|
|
618
|
+
}
|
|
619
|
+
|
|
620
|
+
/**
|
|
621
|
+
* Embedding model type configuration for Node.js bindings.
|
|
622
|
+
*
|
|
623
|
+
* This struct represents different embedding model sources:
|
|
624
|
+
* - `preset`: Use a named preset (e.g., "balanced", "fast", "quality", "multilingual")
|
|
625
|
+
* - `fastembed`: Use a FastEmbed model with custom dimensions
|
|
626
|
+
* - `custom`: Use a custom ONNX model
|
|
627
|
+
*/
|
|
628
|
+
export interface JsEmbeddingModelType {
|
|
629
|
+
/** Type of model: "preset", "fastembed", or "custom" */
|
|
630
|
+
modelType: string
|
|
631
|
+
/** For preset: preset name; for fastembed/custom: model ID */
|
|
632
|
+
value: string
|
|
633
|
+
/** Number of dimensions (only for fastembed/custom) */
|
|
634
|
+
dimensions?: number
|
|
635
|
+
}
|
|
636
|
+
|
|
637
|
+
export interface JsExtractedImage {
|
|
638
|
+
data: Buffer
|
|
639
|
+
format: string
|
|
640
|
+
imageIndex: number
|
|
641
|
+
pageNumber?: number
|
|
642
|
+
width?: number
|
|
643
|
+
height?: number
|
|
644
|
+
colorspace?: string
|
|
645
|
+
bitsPerComponent?: number
|
|
646
|
+
isMask: boolean
|
|
647
|
+
description?: string
|
|
648
|
+
ocrResult?: JsExtractionResult | undefined
|
|
649
|
+
}
|
|
650
|
+
|
|
651
|
+
export interface JsExtractionConfig {
|
|
652
|
+
useCache?: boolean
|
|
653
|
+
enableQualityProcessing?: boolean
|
|
654
|
+
ocr?: JsOcrConfig
|
|
655
|
+
forceOcr?: boolean
|
|
656
|
+
chunking?: JsChunkingConfig
|
|
657
|
+
images?: JsImageExtractionConfig
|
|
658
|
+
pdfOptions?: JsPdfConfig
|
|
659
|
+
tokenReduction?: JsTokenReductionConfig
|
|
660
|
+
languageDetection?: JsLanguageDetectionConfig
|
|
661
|
+
postprocessor?: JsPostProcessorConfig
|
|
662
|
+
keywords?: JsKeywordConfig
|
|
663
|
+
htmlOptions?: JsHtmlOptions
|
|
664
|
+
maxConcurrentExtractions?: number
|
|
665
|
+
}
|
|
666
|
+
|
|
667
|
+
export interface JsExtractionResult {
|
|
668
|
+
content: string
|
|
669
|
+
mimeType: string
|
|
670
|
+
metadata: Metadata
|
|
671
|
+
tables: Array<JsTable>
|
|
672
|
+
detectedLanguages?: Array<string>
|
|
673
|
+
chunks?: Array<JsChunk>
|
|
674
|
+
images?: Array<JsExtractedImage>
|
|
675
|
+
}
|
|
676
|
+
|
|
677
|
+
export interface JsHtmlOptions {
|
|
678
|
+
headingStyle?: string
|
|
679
|
+
listIndentType?: string
|
|
680
|
+
listIndentWidth?: number
|
|
681
|
+
bullets?: string
|
|
682
|
+
strongEmSymbol?: string
|
|
683
|
+
escapeAsterisks?: boolean
|
|
684
|
+
escapeUnderscores?: boolean
|
|
685
|
+
escapeMisc?: boolean
|
|
686
|
+
escapeAscii?: boolean
|
|
687
|
+
codeLanguage?: string
|
|
688
|
+
autolinks?: boolean
|
|
689
|
+
defaultTitle?: boolean
|
|
690
|
+
brInTables?: boolean
|
|
691
|
+
hocrSpatialTables?: boolean
|
|
692
|
+
highlightStyle?: string
|
|
693
|
+
extractMetadata?: boolean
|
|
694
|
+
whitespaceMode?: string
|
|
695
|
+
stripNewlines?: boolean
|
|
696
|
+
wrap?: boolean
|
|
697
|
+
wrapWidth?: number
|
|
698
|
+
convertAsInline?: boolean
|
|
699
|
+
subSymbol?: string
|
|
700
|
+
supSymbol?: string
|
|
701
|
+
newlineStyle?: string
|
|
702
|
+
codeBlockStyle?: string
|
|
703
|
+
keepInlineImagesIn?: Array<string>
|
|
704
|
+
encoding?: string
|
|
705
|
+
debug?: boolean
|
|
706
|
+
stripTags?: Array<string>
|
|
707
|
+
preserveTags?: Array<string>
|
|
708
|
+
preprocessing?: JsHtmlPreprocessingOptions
|
|
709
|
+
}
|
|
710
|
+
|
|
711
|
+
export interface JsHtmlPreprocessingOptions {
|
|
712
|
+
enabled?: boolean
|
|
713
|
+
preset?: string
|
|
714
|
+
removeNavigation?: boolean
|
|
715
|
+
removeForms?: boolean
|
|
716
|
+
}
|
|
717
|
+
|
|
718
|
+
export interface JsImageExtractionConfig {
|
|
719
|
+
extractImages?: boolean
|
|
720
|
+
targetDpi?: number
|
|
721
|
+
maxImageDimension?: number
|
|
722
|
+
autoAdjustDpi?: boolean
|
|
723
|
+
minDpi?: number
|
|
724
|
+
maxDpi?: number
|
|
725
|
+
}
|
|
726
|
+
|
|
727
|
+
export interface JsKeywordConfig {
|
|
728
|
+
algorithm?: string
|
|
729
|
+
maxKeywords?: number
|
|
730
|
+
minScore?: number
|
|
731
|
+
ngramRange?: [number, number] | undefined
|
|
732
|
+
language?: string
|
|
733
|
+
yakeParams?: JsYakeParams
|
|
734
|
+
rakeParams?: JsRakeParams
|
|
735
|
+
}
|
|
736
|
+
|
|
737
|
+
export interface JsLanguageDetectionConfig {
|
|
738
|
+
enabled?: boolean
|
|
739
|
+
minConfidence?: number
|
|
740
|
+
detectMultiple?: boolean
|
|
741
|
+
}
|
|
742
|
+
|
|
743
|
+
export interface JsOcrConfig {
|
|
744
|
+
backend: string
|
|
745
|
+
language?: string
|
|
746
|
+
tesseractConfig?: JsTesseractConfig
|
|
747
|
+
}
|
|
748
|
+
|
|
749
|
+
export interface JsPdfConfig {
|
|
750
|
+
extractImages?: boolean
|
|
751
|
+
passwords?: Array<string>
|
|
752
|
+
extractMetadata?: boolean
|
|
753
|
+
}
|
|
754
|
+
|
|
755
|
+
export interface JsPostProcessorConfig {
|
|
756
|
+
enabled?: boolean
|
|
757
|
+
enabledProcessors?: Array<string>
|
|
758
|
+
disabledProcessors?: Array<string>
|
|
759
|
+
}
|
|
760
|
+
|
|
761
|
+
export interface JsRakeParams {
|
|
762
|
+
minWordLength?: number
|
|
763
|
+
maxWordsPerPhrase?: number
|
|
764
|
+
}
|
|
765
|
+
|
|
766
|
+
export interface JsTable {
|
|
767
|
+
cells: Array<Array<string>>
|
|
768
|
+
markdown: string
|
|
769
|
+
pageNumber: number
|
|
770
|
+
}
|
|
771
|
+
|
|
772
|
+
export interface JsTesseractConfig {
|
|
773
|
+
psm?: number
|
|
774
|
+
enableTableDetection?: boolean
|
|
775
|
+
tesseditCharWhitelist?: string
|
|
776
|
+
}
|
|
777
|
+
|
|
778
|
+
export interface JsTokenReductionConfig {
|
|
779
|
+
mode?: string
|
|
780
|
+
preserveImportantWords?: boolean
|
|
781
|
+
}
|
|
782
|
+
|
|
783
|
+
export interface JsYakeParams {
|
|
784
|
+
windowSize?: number
|
|
785
|
+
}
|
|
786
|
+
|
|
787
|
+
/**
|
|
788
|
+
* List all registered document extractors.
|
|
789
|
+
*
|
|
790
|
+
* Returns an array of names of all currently registered document extractors,
|
|
791
|
+
* including built-in extractors for PDF, Office documents, images, etc.
|
|
792
|
+
*
|
|
793
|
+
* # Returns
|
|
794
|
+
*
|
|
795
|
+
* Array of document extractor names.
|
|
796
|
+
*
|
|
797
|
+
* # Example
|
|
798
|
+
*
|
|
799
|
+
* ```typescript
|
|
800
|
+
* import { listDocumentExtractors } from '@kreuzberg/node';
|
|
801
|
+
*
|
|
802
|
+
* const extractors = listDocumentExtractors();
|
|
803
|
+
* console.log(extractors); // ['PDFExtractor', 'ImageExtractor', ...]
|
|
804
|
+
* ```
|
|
805
|
+
*/
|
|
806
|
+
export declare function listDocumentExtractors(): Array<string>
|
|
807
|
+
|
|
808
|
+
/**
|
|
809
|
+
* List all available embedding preset names.
|
|
810
|
+
*
|
|
811
|
+
* Returns an array of preset names that can be used with `getEmbeddingPreset`.
|
|
812
|
+
*
|
|
813
|
+
* # Returns
|
|
814
|
+
*
|
|
815
|
+
* Array of 4 preset names: ["fast", "balanced", "quality", "multilingual"]
|
|
816
|
+
*
|
|
817
|
+
* # Example
|
|
818
|
+
*
|
|
819
|
+
* ```typescript
|
|
820
|
+
* import { listEmbeddingPresets } from '@kreuzberg/node';
|
|
821
|
+
*
|
|
822
|
+
* const presets = listEmbeddingPresets();
|
|
823
|
+
* console.log(presets); // ['fast', 'balanced', 'quality', 'multilingual']
|
|
824
|
+
* ```
|
|
825
|
+
*/
|
|
826
|
+
export declare function listEmbeddingPresets(): Array<string>
|
|
827
|
+
|
|
828
|
+
/**
|
|
829
|
+
* List all registered OCR backends.
|
|
830
|
+
*
|
|
831
|
+
* Returns an array of names of all currently registered OCR backends,
|
|
832
|
+
* including built-in backends like "tesseract".
|
|
833
|
+
*
|
|
834
|
+
* # Returns
|
|
835
|
+
*
|
|
836
|
+
* Array of OCR backend names.
|
|
837
|
+
*
|
|
838
|
+
* # Example
|
|
839
|
+
*
|
|
840
|
+
* ```typescript
|
|
841
|
+
* import { listOcrBackends } from '@kreuzberg/node';
|
|
842
|
+
*
|
|
843
|
+
* const backends = listOcrBackends();
|
|
844
|
+
* console.log(backends); // ['tesseract', 'my-custom-backend', ...]
|
|
845
|
+
* ```
|
|
846
|
+
*/
|
|
847
|
+
export declare function listOcrBackends(): Array<string>
|
|
848
|
+
|
|
849
|
+
/** List all registered post-processors */
|
|
850
|
+
export declare function listPostProcessors(): Array<string>
|
|
851
|
+
|
|
852
|
+
/** List all registered validators */
|
|
853
|
+
export declare function listValidators(): Array<string>
|
|
854
|
+
|
|
855
|
+
/**
|
|
856
|
+
* Load extraction configuration from a file.
|
|
857
|
+
*
|
|
858
|
+
* Automatically detects the file format based on extension:
|
|
859
|
+
* - `.toml` - TOML format
|
|
860
|
+
* - `.yaml` - YAML format
|
|
861
|
+
* - `.json` - JSON format
|
|
862
|
+
*
|
|
863
|
+
* # Parameters
|
|
864
|
+
*
|
|
865
|
+
* * `file_path` - Path to the configuration file (absolute or relative)
|
|
866
|
+
*
|
|
867
|
+
* # Returns
|
|
868
|
+
*
|
|
869
|
+
* `JsExtractionConfig` object with loaded configuration.
|
|
870
|
+
*
|
|
871
|
+
* # Errors
|
|
872
|
+
*
|
|
873
|
+
* Throws an error if:
|
|
874
|
+
* - File does not exist or is not accessible
|
|
875
|
+
* - File content is not valid TOML/YAML/JSON
|
|
876
|
+
* - Configuration structure is invalid
|
|
877
|
+
*
|
|
878
|
+
* # Example
|
|
879
|
+
*
|
|
880
|
+
* ```typescript
|
|
881
|
+
* import { loadExtractionConfigFromFile } from '@kreuzberg/node';
|
|
882
|
+
*
|
|
883
|
+
* // Load from TOML file
|
|
884
|
+
* const config = loadExtractionConfigFromFile('kreuzberg.toml');
|
|
885
|
+
*
|
|
886
|
+
* // Load from YAML file
|
|
887
|
+
* const config2 = loadExtractionConfigFromFile('./config.yaml');
|
|
888
|
+
*
|
|
889
|
+
* // Use with extraction
|
|
890
|
+
* const result = await extractFile('document.pdf', null, config);
|
|
891
|
+
* ```
|
|
892
|
+
*/
|
|
893
|
+
export declare function loadExtractionConfigFromFile(filePath: string): JsExtractionConfig
|
|
894
|
+
|
|
895
|
+
/**
|
|
896
|
+
* Register a custom OCR backend
|
|
897
|
+
*
|
|
898
|
+
* Registers a JavaScript OCR backend that can process images and extract text.
|
|
899
|
+
*
|
|
900
|
+
* # Arguments
|
|
901
|
+
*
|
|
902
|
+
* * `backend` - JavaScript object with the following interface:
|
|
903
|
+
* - `name(): string` - Unique backend name
|
|
904
|
+
* - `supportedLanguages(): string[]` - Array of supported ISO 639-2/3 language codes
|
|
905
|
+
* - `processImage(imageBytes: string, language: string): Promise<result>` - Process image and return extraction result
|
|
906
|
+
*
|
|
907
|
+
* # Implementation Notes
|
|
908
|
+
*
|
|
909
|
+
* Due to NAPI ThreadsafeFunction limitations, the processImage function receives:
|
|
910
|
+
* - `imageBytes` as a Base64 string (first argument)
|
|
911
|
+
* - `language` as string (second argument)
|
|
912
|
+
*
|
|
913
|
+
* And must return a Promise resolving to a JSON-serializable object with:
|
|
914
|
+
* ```typescript
|
|
915
|
+
* {
|
|
916
|
+
* content: string,
|
|
917
|
+
* mime_type: string, // default: "text/plain"
|
|
918
|
+
* metadata: object, // default: {}
|
|
919
|
+
* tables: array // default: []
|
|
920
|
+
* }
|
|
921
|
+
* ```
|
|
922
|
+
*
|
|
923
|
+
* # Example
|
|
924
|
+
*
|
|
925
|
+
* ```typescript
|
|
926
|
+
* import { registerOcrBackend } from '@kreuzberg/node';
|
|
927
|
+
*
|
|
928
|
+
* registerOcrBackend({
|
|
929
|
+
* name: () => "my-ocr",
|
|
930
|
+
* supportedLanguages: () => ["eng", "deu", "fra"],
|
|
931
|
+
* processImage: async (imageBytes, language) => {
|
|
932
|
+
* const buffer = Buffer.from(imageBytes, "base64");
|
|
933
|
+
* const text = await myOcrLibrary.process(buffer, language);
|
|
934
|
+
* return {
|
|
935
|
+
* content: text,
|
|
936
|
+
* mime_type: "text/plain",
|
|
937
|
+
* metadata: { confidence: 0.95 },
|
|
938
|
+
* tables: []
|
|
939
|
+
* };
|
|
940
|
+
* }
|
|
941
|
+
* });
|
|
942
|
+
* ```
|
|
943
|
+
*/
|
|
944
|
+
export declare function registerOcrBackend(backend: object): void
|
|
945
|
+
|
|
946
|
+
/**
|
|
947
|
+
* Register a custom postprocessor
|
|
948
|
+
*
|
|
949
|
+
* Registers a JavaScript PostProcessor that will be called after extraction.
|
|
950
|
+
*
|
|
951
|
+
* # Arguments
|
|
952
|
+
*
|
|
953
|
+
* * `processor` - JavaScript object with the following interface:
|
|
954
|
+
* - `name(): string` - Unique processor name
|
|
955
|
+
* - `process(...args): string` - Process function that receives the extraction result as a JSON string in args\[0\] and returns the (possibly modified) result as a JSON string
|
|
956
|
+
* - `processingStage(): "early" | "middle" | "late"` - Optional processing stage
|
|
957
|
+
*
|
|
958
|
+
* # Implementation Notes
|
|
959
|
+
*
|
|
960
|
+
* Due to NAPI ThreadsafeFunction limitations, the process function receives the extraction
|
|
961
|
+
* result as a JSON string in args\[0\] and must return a JSON string. Use the TypeScript
|
|
962
|
+
* wrapper functions for a cleaner API.
|
|
963
|
+
*
|
|
964
|
+
* # Example
|
|
965
|
+
*
|
|
966
|
+
* ```typescript
|
|
967
|
+
* import { registerPostProcessor } from '@kreuzberg/node';
|
|
968
|
+
*
|
|
969
|
+
* registerPostProcessor({
|
|
970
|
+
* name: () => "word-counter",
|
|
971
|
+
* processingStage: () => "middle",
|
|
972
|
+
* process: (...args) => {
|
|
973
|
+
* const result = JSON.parse(args[0]);
|
|
974
|
+
* const wordCount = result.content.split(/\s+/).length;
|
|
975
|
+
* result.metadata.word_count = wordCount;
|
|
976
|
+
* return JSON.stringify(result);
|
|
977
|
+
* }
|
|
978
|
+
* });
|
|
979
|
+
* ```
|
|
980
|
+
*/
|
|
981
|
+
export declare function registerPostProcessor(processor: object): void
|
|
982
|
+
|
|
983
|
+
/**
|
|
984
|
+
* Register a custom validator
|
|
985
|
+
*
|
|
986
|
+
* Registers a JavaScript Validator that will be called after extraction.
|
|
987
|
+
*
|
|
988
|
+
* # Arguments
|
|
989
|
+
*
|
|
990
|
+
* * `validator` - JavaScript object with the following interface:
|
|
991
|
+
* - `name(): string` - Unique validator name
|
|
992
|
+
* - `validate(...args): Promise<string>` - Validate function that receives JSON string as args\[0\]
|
|
993
|
+
* - `priority(): number` - Optional priority (defaults to 50, higher runs first)
|
|
994
|
+
*
|
|
995
|
+
* # Implementation Notes
|
|
996
|
+
*
|
|
997
|
+
* Due to NAPI ThreadsafeFunction limitations, the validate function receives the extraction
|
|
998
|
+
* result as a JSON string in args\[0\]. On success, return an empty string. On validation
|
|
999
|
+
* failure, throw an error (the Promise should reject). Use the TypeScript wrapper functions
|
|
1000
|
+
* for a cleaner API.
|
|
1001
|
+
*
|
|
1002
|
+
* # Example
|
|
1003
|
+
*
|
|
1004
|
+
* ```typescript
|
|
1005
|
+
* import { registerValidator } from '@kreuzberg/node';
|
|
1006
|
+
*
|
|
1007
|
+
* registerValidator({
|
|
1008
|
+
* name: () => "min-length",
|
|
1009
|
+
* priority: () => 100,
|
|
1010
|
+
* validate: async (...args) => {
|
|
1011
|
+
* const result = JSON.parse(args[0]);
|
|
1012
|
+
* if (result.content.length < 100) {
|
|
1013
|
+
* throw new Error("ValidationError: Content too short");
|
|
1014
|
+
* }
|
|
1015
|
+
* return ""; // Success - return empty string
|
|
1016
|
+
* }
|
|
1017
|
+
* });
|
|
1018
|
+
* ```
|
|
1019
|
+
*/
|
|
1020
|
+
export declare function registerValidator(validator: object): void
|
|
1021
|
+
|
|
1022
|
+
/**
|
|
1023
|
+
* Unregister a document extractor by name.
|
|
1024
|
+
*
|
|
1025
|
+
* Removes the specified document extractor from the registry. If the extractor
|
|
1026
|
+
* doesn't exist, this operation is a no-op (does not throw an error).
|
|
1027
|
+
*
|
|
1028
|
+
* # Parameters
|
|
1029
|
+
*
|
|
1030
|
+
* * `name` - Name of the document extractor to unregister
|
|
1031
|
+
*
|
|
1032
|
+
* # Example
|
|
1033
|
+
*
|
|
1034
|
+
* ```typescript
|
|
1035
|
+
* import { unregisterDocumentExtractor } from '@kreuzberg/node';
|
|
1036
|
+
*
|
|
1037
|
+
* // Unregister a custom extractor
|
|
1038
|
+
* unregisterDocumentExtractor('MyCustomExtractor');
|
|
1039
|
+
* ```
|
|
1040
|
+
*/
|
|
1041
|
+
export declare function unregisterDocumentExtractor(name: string): void
|
|
1042
|
+
|
|
1043
|
+
/**
|
|
1044
|
+
* Unregister an OCR backend by name.
|
|
1045
|
+
*
|
|
1046
|
+
* Removes the specified OCR backend from the registry. If the backend doesn't exist,
|
|
1047
|
+
* this operation is a no-op (does not throw an error).
|
|
1048
|
+
*
|
|
1049
|
+
* # Parameters
|
|
1050
|
+
*
|
|
1051
|
+
* * `name` - Name of the OCR backend to unregister
|
|
1052
|
+
*
|
|
1053
|
+
* # Example
|
|
1054
|
+
*
|
|
1055
|
+
* ```typescript
|
|
1056
|
+
* import { unregisterOcrBackend } from '@kreuzberg/node';
|
|
1057
|
+
*
|
|
1058
|
+
* // Unregister a custom backend
|
|
1059
|
+
* unregisterOcrBackend('my-custom-ocr');
|
|
1060
|
+
* ```
|
|
1061
|
+
*/
|
|
1062
|
+
export declare function unregisterOcrBackend(name: string): void
|
|
1063
|
+
|
|
1064
|
+
/** Unregister a postprocessor by name */
|
|
1065
|
+
export declare function unregisterPostProcessor(name: string): void
|
|
1066
|
+
|
|
1067
|
+
/** Unregister a validator by name */
|
|
1068
|
+
export declare function unregisterValidator(name: string): void
|
|
1069
|
+
|
|
1070
|
+
/**
|
|
1071
|
+
* Validate that a MIME type is supported by Kreuzberg.
|
|
1072
|
+
*
|
|
1073
|
+
* Checks if a MIME type is in the list of supported formats. Note that any
|
|
1074
|
+
* `image/*` MIME type is automatically considered valid.
|
|
1075
|
+
*
|
|
1076
|
+
* # Parameters
|
|
1077
|
+
*
|
|
1078
|
+
* * `mime_type` - The MIME type to validate (string)
|
|
1079
|
+
*
|
|
1080
|
+
* # Returns
|
|
1081
|
+
*
|
|
1082
|
+
* The validated MIME type (may be normalized).
|
|
1083
|
+
*
|
|
1084
|
+
* # Errors
|
|
1085
|
+
*
|
|
1086
|
+
* Throws an error if the MIME type is not supported.
|
|
1087
|
+
*
|
|
1088
|
+
* # Example
|
|
1089
|
+
*
|
|
1090
|
+
* ```typescript
|
|
1091
|
+
* import { validateMimeType } from '@kreuzberg/node';
|
|
1092
|
+
*
|
|
1093
|
+
* // Validate supported type
|
|
1094
|
+
* const validated = validateMimeType('application/pdf');
|
|
1095
|
+
* console.log(validated); // 'application/pdf'
|
|
1096
|
+
*
|
|
1097
|
+
* // Validate custom image type
|
|
1098
|
+
* const validated2 = validateMimeType('image/custom-format');
|
|
1099
|
+
* console.log(validated2); // 'image/custom-format' (any image/* is valid)
|
|
1100
|
+
*
|
|
1101
|
+
* // Validate unsupported type (throws error)
|
|
1102
|
+
* try {
|
|
1103
|
+
* validateMimeType('video/mp4');
|
|
1104
|
+
* } catch (err) {
|
|
1105
|
+
* console.error(err); // Error: Unsupported format: video/mp4
|
|
1106
|
+
* }
|
|
1107
|
+
* ```
|
|
1108
|
+
*/
|
|
1109
|
+
export declare function validateMimeType(mimeType: string): string
|