@kreuzberg/node 4.4.5 → 4.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -15
- package/dist/index.d.mts +1 -1
- package/dist/index.d.ts +1 -1
- package/dist/index.js +35 -2
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +35 -2
- package/dist/index.mjs.map +1 -1
- package/dist/types.d.mts +38 -1
- package/dist/types.d.ts +38 -1
- package/dist/types.js.map +1 -1
- package/index.d.ts +121 -10
- package/index.js +52 -52
- package/package.json +11 -11
package/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.5" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.5.0" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -33,6 +33,9 @@
|
|
|
33
33
|
<a href="https://rubygems.org/gems/kreuzberg">
|
|
34
34
|
<img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
|
|
35
35
|
</a>
|
|
36
|
+
<a href="https://kreuzberg-dev.r-universe.dev/kreuzberg">
|
|
37
|
+
<img src="https://img.shields.io/badge/R-kreuzberg-007ec6" alt="R">
|
|
38
|
+
</a>
|
|
36
39
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
|
|
37
40
|
<img src="https://img.shields.io/badge/Docker-007ec6?logo=docker&logoColor=white" alt="Docker">
|
|
38
41
|
</a>
|
|
@@ -44,6 +47,9 @@
|
|
|
44
47
|
<a href="https://docs.kreuzberg.dev">
|
|
45
48
|
<img src="https://img.shields.io/badge/docs-kreuzberg.dev-blue" alt="Documentation">
|
|
46
49
|
</a>
|
|
50
|
+
<a href="https://huggingface.co/Kreuzberg">
|
|
51
|
+
<img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-yellow" alt="Hugging Face">
|
|
52
|
+
</a>
|
|
47
53
|
</div>
|
|
48
54
|
|
|
49
55
|
<img width="1128" height="191" alt="Banner2" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
|
|
@@ -55,7 +61,7 @@
|
|
|
55
61
|
</div>
|
|
56
62
|
|
|
57
63
|
|
|
58
|
-
Extract text, tables, images, and metadata from
|
|
64
|
+
Extract text, tables, images, and metadata from 88+ file formats including PDF, Office documents, and images. Native NAPI-RS bindings for Node.js with superior performance, async/await support, and TypeScript type definitions.
|
|
59
65
|
|
|
60
66
|
|
|
61
67
|
## Installation
|
|
@@ -95,16 +101,9 @@ yarn add @kreuzberg/node
|
|
|
95
101
|
### System Requirements
|
|
96
102
|
|
|
97
103
|
- **Node.js 22+** required (NAPI-RS native bindings)
|
|
98
|
-
- Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.
|
|
104
|
+
- Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.22.x for embeddings support
|
|
99
105
|
- Optional: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for OCR functionality
|
|
100
106
|
|
|
101
|
-
**Format Support Notes:**
|
|
102
|
-
- Legacy formats (DOC, XLS, PPT) are now extracted natively without external tools
|
|
103
|
-
- Modern Office formats (DOCX, XLSX, PPTX) are fully supported
|
|
104
|
-
- WASM binding supports all document formats via in-memory parsing
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
107
|
### Platform Support
|
|
109
108
|
|
|
110
109
|
Pre-built binaries available for:
|
|
@@ -320,19 +319,21 @@ This binding uses NAPI-RS to provide native Node.js bindings with:
|
|
|
320
319
|
|
|
321
320
|
## Features
|
|
322
321
|
|
|
323
|
-
### Supported File Formats (
|
|
322
|
+
### Supported File Formats (88+)
|
|
324
323
|
|
|
325
|
-
|
|
324
|
+
88+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
|
|
326
325
|
|
|
327
326
|
#### Office Documents
|
|
328
327
|
|
|
329
328
|
| Category | Formats | Capabilities |
|
|
330
329
|
|----------|---------|--------------|
|
|
331
|
-
| **Word Processing** | `.docx`, `.odt` | Full text, tables, images, metadata, styles |
|
|
332
|
-
| **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.ods` | Sheet data, formulas, cell metadata, charts |
|
|
333
|
-
| **Presentations** | `.pptx`, `.
|
|
330
|
+
| **Word Processing** | `.docx`, `.docm`, `.dotx`, `.dotm`, `.dot`, `.odt` | Full text, tables, images, metadata, styles |
|
|
331
|
+
| **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.xltx`, `.xlt`, `.ods` | Sheet data, formulas, cell metadata, charts |
|
|
332
|
+
| **Presentations** | `.pptx`, `.pptm`, `.ppsx`, `.potx`, `.potm`, `.pot`, `.ppt` | Slides, speaker notes, images, metadata |
|
|
334
333
|
| **PDF** | `.pdf` | Text, tables, images, metadata, OCR support |
|
|
335
334
|
| **eBooks** | `.epub`, `.fb2` | Chapters, metadata, embedded resources |
|
|
335
|
+
| **Database** | `.dbf` | Table data extraction, field type support |
|
|
336
|
+
| **Hangul** | `.hwp`, `.hwpx` | Korean document format, text extraction |
|
|
336
337
|
|
|
337
338
|
#### Images (OCR-Enabled)
|
|
338
339
|
|
package/dist/index.d.mts
CHANGED
|
@@ -1191,6 +1191,6 @@ declare function __resetBindingForTests(): void;
|
|
|
1191
1191
|
* @module @kreuzberg/node
|
|
1192
1192
|
*/
|
|
1193
1193
|
|
|
1194
|
-
declare const __version__ = "4.4.5";
|
|
1194
|
+
declare const __version__ = "4.5.0";
|
|
1195
1195
|
|
|
1196
1196
|
export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
|
package/dist/index.d.ts
CHANGED
|
@@ -1191,6 +1191,6 @@ declare function __resetBindingForTests(): void;
|
|
|
1191
1191
|
* @module @kreuzberg/node
|
|
1192
1192
|
*/
|
|
1193
1193
|
|
|
1194
|
-
declare const __version__ = "4.4.5";
|
|
1194
|
+
declare const __version__ = "4.5.0";
|
|
1195
1195
|
|
|
1196
1196
|
export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
|
package/dist/index.js
CHANGED
|
@@ -368,6 +368,9 @@ function normalizeChunkingConfig(chunking) {
|
|
|
368
368
|
setIfDefined(normalized, "preset", chunking.preset);
|
|
369
369
|
setIfDefined(normalized, "embedding", chunking.embedding);
|
|
370
370
|
setIfDefined(normalized, "enabled", chunking.enabled);
|
|
371
|
+
setIfDefined(normalized, "sizingType", chunking.sizingType);
|
|
372
|
+
setIfDefined(normalized, "sizingModel", chunking.sizingModel);
|
|
373
|
+
setIfDefined(normalized, "sizingCacheDir", chunking.sizingCacheDir);
|
|
371
374
|
return normalized;
|
|
372
375
|
}
|
|
373
376
|
function normalizeImageExtractionConfig(images) {
|
|
@@ -530,10 +533,22 @@ function normalizeExtractionConfig(config) {
|
|
|
530
533
|
setIfDefined(normalized, "pages", pages);
|
|
531
534
|
const htmlOptions = normalizeHtmlOptions(config.htmlOptions);
|
|
532
535
|
setIfDefined(normalized, "htmlOptions", htmlOptions);
|
|
536
|
+
const layout = normalizeLayoutDetectionConfig(config.layout);
|
|
537
|
+
setIfDefined(normalized, "layout", layout);
|
|
533
538
|
setIfDefined(normalized, "outputFormat", config.outputFormat);
|
|
534
539
|
setIfDefined(normalized, "resultFormat", config.resultFormat);
|
|
535
540
|
return normalized;
|
|
536
541
|
}
|
|
542
|
+
function normalizeLayoutDetectionConfig(config) {
|
|
543
|
+
if (!config) {
|
|
544
|
+
return void 0;
|
|
545
|
+
}
|
|
546
|
+
const normalized = {};
|
|
547
|
+
setIfDefined(normalized, "preset", config.preset);
|
|
548
|
+
setIfDefined(normalized, "confidenceThreshold", config.confidenceThreshold);
|
|
549
|
+
setIfDefined(normalized, "applyHeuristics", config.applyHeuristics);
|
|
550
|
+
return normalized;
|
|
551
|
+
}
|
|
537
552
|
|
|
538
553
|
// typescript/core/type-converters.ts
|
|
539
554
|
function parseMetadata(metadataStr) {
|
|
@@ -594,7 +609,25 @@ function convertChunk(rawChunk) {
|
|
|
594
609
|
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
595
610
|
firstPage: metadata["first_page"] ?? metadata["firstPage"] ?? null,
|
|
596
611
|
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
597
|
-
lastPage: metadata["last_page"] ?? metadata["lastPage"] ?? null
|
|
612
|
+
lastPage: metadata["last_page"] ?? metadata["lastPage"] ?? null,
|
|
613
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
614
|
+
headingContext: (() => {
|
|
615
|
+
const hc = metadata["heading_context"] ?? metadata["headingContext"];
|
|
616
|
+
if (!hc) return null;
|
|
617
|
+
const headings = hc["headings"];
|
|
618
|
+
if (!Array.isArray(headings)) return null;
|
|
619
|
+
return {
|
|
620
|
+
headings: headings.map((h) => {
|
|
621
|
+
const heading = h;
|
|
622
|
+
return {
|
|
623
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
624
|
+
level: heading["level"] ?? 0,
|
|
625
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
626
|
+
text: heading["text"] ?? ""
|
|
627
|
+
};
|
|
628
|
+
})
|
|
629
|
+
};
|
|
630
|
+
})()
|
|
598
631
|
}
|
|
599
632
|
};
|
|
600
633
|
}
|
|
@@ -1197,7 +1230,7 @@ function getEmbeddingPreset(name) {
|
|
|
1197
1230
|
}
|
|
1198
1231
|
|
|
1199
1232
|
// typescript/index.ts
|
|
1200
|
-
var __version__ = "4.4.5";
|
|
1233
|
+
var __version__ = "4.5.0";
|
|
1201
1234
|
// Annotate the CommonJS export names for ESM import in node:
|
|
1202
1235
|
0 && (module.exports = {
|
|
1203
1236
|
CacheError,
|