@kreuzberg/node 4.4.5 → 4.4.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -15
- package/dist/index.d.mts +1 -1
- package/dist/index.d.ts +1 -1
- package/dist/index.js +23 -2
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +23 -2
- package/dist/index.mjs.map +1 -1
- package/dist/types.d.mts +21 -1
- package/dist/types.d.ts +21 -1
- package/dist/types.js.map +1 -1
- package/index.d.ts +18 -0
- package/index.js +52 -52
- package/package.json +10 -10
package/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.5" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.6" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -33,6 +33,9 @@
|
|
|
33
33
|
<a href="https://rubygems.org/gems/kreuzberg">
|
|
34
34
|
<img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
|
|
35
35
|
</a>
|
|
36
|
+
<a href="https://kreuzberg-dev.r-universe.dev/kreuzberg">
|
|
37
|
+
<img src="https://img.shields.io/badge/R-kreuzberg-007ec6" alt="R">
|
|
38
|
+
</a>
|
|
36
39
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
|
|
37
40
|
<img src="https://img.shields.io/badge/Docker-007ec6?logo=docker&logoColor=white" alt="Docker">
|
|
38
41
|
</a>
|
|
@@ -55,7 +58,7 @@
|
|
|
55
58
|
</div>
|
|
56
59
|
|
|
57
60
|
|
|
58
|
-
Extract text, tables, images, and metadata from
|
|
61
|
+
Extract text, tables, images, and metadata from 88+ file formats including PDF, Office documents, and images. Native NAPI-RS bindings for Node.js with superior performance, async/await support, and TypeScript type definitions.
|
|
59
62
|
|
|
60
63
|
|
|
61
64
|
## Installation
|
|
@@ -95,16 +98,9 @@ yarn add @kreuzberg/node
|
|
|
95
98
|
### System Requirements
|
|
96
99
|
|
|
97
100
|
- **Node.js 22+** required (NAPI-RS native bindings)
|
|
98
|
-
- Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.
|
|
101
|
+
- Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.22.x for embeddings support
|
|
99
102
|
- Optional: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for OCR functionality
|
|
100
103
|
|
|
101
|
-
**Format Support Notes:**
|
|
102
|
-
- Legacy formats (DOC, XLS, PPT) are now extracted natively without external tools
|
|
103
|
-
- Modern Office formats (DOCX, XLSX, PPTX) are fully supported
|
|
104
|
-
- WASM binding supports all document formats via in-memory parsing
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
104
|
### Platform Support
|
|
109
105
|
|
|
110
106
|
Pre-built binaries available for:
|
|
@@ -320,19 +316,21 @@ This binding uses NAPI-RS to provide native Node.js bindings with:
|
|
|
320
316
|
|
|
321
317
|
## Features
|
|
322
318
|
|
|
323
|
-
### Supported File Formats (
|
|
319
|
+
### Supported File Formats (88+)
|
|
324
320
|
|
|
325
|
-
|
|
321
|
+
88+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
|
|
326
322
|
|
|
327
323
|
#### Office Documents
|
|
328
324
|
|
|
329
325
|
| Category | Formats | Capabilities |
|
|
330
326
|
|----------|---------|--------------|
|
|
331
|
-
| **Word Processing** | `.docx`, `.odt` | Full text, tables, images, metadata, styles |
|
|
332
|
-
| **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.ods` | Sheet data, formulas, cell metadata, charts |
|
|
333
|
-
| **Presentations** | `.pptx`, `.
|
|
327
|
+
| **Word Processing** | `.docx`, `.docm`, `.dotx`, `.dotm`, `.dot`, `.odt` | Full text, tables, images, metadata, styles |
|
|
328
|
+
| **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.xltx`, `.xlt`, `.ods` | Sheet data, formulas, cell metadata, charts |
|
|
329
|
+
| **Presentations** | `.pptx`, `.pptm`, `.ppsx`, `.potx`, `.potm`, `.pot`, `.ppt` | Slides, speaker notes, images, metadata |
|
|
334
330
|
| **PDF** | `.pdf` | Text, tables, images, metadata, OCR support |
|
|
335
331
|
| **eBooks** | `.epub`, `.fb2` | Chapters, metadata, embedded resources |
|
|
332
|
+
| **Database** | `.dbf` | Table data extraction, field type support |
|
|
333
|
+
| **Hangul** | `.hwp`, `.hwpx` | Korean document format, text extraction |
|
|
336
334
|
|
|
337
335
|
#### Images (OCR-Enabled)
|
|
338
336
|
|
package/dist/index.d.mts
CHANGED
|
@@ -1191,6 +1191,6 @@ declare function __resetBindingForTests(): void;
|
|
|
1191
1191
|
* @module @kreuzberg/node
|
|
1192
1192
|
*/
|
|
1193
1193
|
|
|
1194
|
-
declare const __version__ = "4.4.5";
|
|
1194
|
+
declare const __version__ = "4.4.6";
|
|
1195
1195
|
|
|
1196
1196
|
export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
|
package/dist/index.d.ts
CHANGED
|
@@ -1191,6 +1191,6 @@ declare function __resetBindingForTests(): void;
|
|
|
1191
1191
|
* @module @kreuzberg/node
|
|
1192
1192
|
*/
|
|
1193
1193
|
|
|
1194
|
-
declare const __version__ = "4.4.5";
|
|
1194
|
+
declare const __version__ = "4.4.6";
|
|
1195
1195
|
|
|
1196
1196
|
export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
|
package/dist/index.js
CHANGED
|
@@ -368,6 +368,9 @@ function normalizeChunkingConfig(chunking) {
|
|
|
368
368
|
setIfDefined(normalized, "preset", chunking.preset);
|
|
369
369
|
setIfDefined(normalized, "embedding", chunking.embedding);
|
|
370
370
|
setIfDefined(normalized, "enabled", chunking.enabled);
|
|
371
|
+
setIfDefined(normalized, "sizingType", chunking.sizingType);
|
|
372
|
+
setIfDefined(normalized, "sizingModel", chunking.sizingModel);
|
|
373
|
+
setIfDefined(normalized, "sizingCacheDir", chunking.sizingCacheDir);
|
|
371
374
|
return normalized;
|
|
372
375
|
}
|
|
373
376
|
function normalizeImageExtractionConfig(images) {
|
|
@@ -594,7 +597,25 @@ function convertChunk(rawChunk) {
|
|
|
594
597
|
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
595
598
|
firstPage: metadata["first_page"] ?? metadata["firstPage"] ?? null,
|
|
596
599
|
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
597
|
-
lastPage: metadata["last_page"] ?? metadata["lastPage"] ?? null
|
|
600
|
+
lastPage: metadata["last_page"] ?? metadata["lastPage"] ?? null,
|
|
601
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
602
|
+
headingContext: (() => {
|
|
603
|
+
const hc = metadata["heading_context"] ?? metadata["headingContext"];
|
|
604
|
+
if (!hc) return null;
|
|
605
|
+
const headings = hc["headings"];
|
|
606
|
+
if (!Array.isArray(headings)) return null;
|
|
607
|
+
return {
|
|
608
|
+
headings: headings.map((h) => {
|
|
609
|
+
const heading = h;
|
|
610
|
+
return {
|
|
611
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
612
|
+
level: heading["level"] ?? 0,
|
|
613
|
+
// biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
|
|
614
|
+
text: heading["text"] ?? ""
|
|
615
|
+
};
|
|
616
|
+
})
|
|
617
|
+
};
|
|
618
|
+
})()
|
|
598
619
|
}
|
|
599
620
|
};
|
|
600
621
|
}
|
|
@@ -1197,7 +1218,7 @@ function getEmbeddingPreset(name) {
|
|
|
1197
1218
|
}
|
|
1198
1219
|
|
|
1199
1220
|
// typescript/index.ts
|
|
1200
|
-
var __version__ = "4.4.5";
|
|
1221
|
+
var __version__ = "4.4.6";
|
|
1201
1222
|
// Annotate the CommonJS export names for ESM import in node:
|
|
1202
1223
|
0 && (module.exports = {
|
|
1203
1224
|
CacheError,
|