@kreuzberg/node 4.4.4 → 4.4.6

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.4" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.6" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -33,6 +33,9 @@
33
33
  <a href="https://rubygems.org/gems/kreuzberg">
34
34
  <img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
35
35
  </a>
36
+ <a href="https://kreuzberg-dev.r-universe.dev/kreuzberg">
37
+ <img src="https://img.shields.io/badge/R-kreuzberg-007ec6" alt="R">
38
+ </a>
36
39
  <a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
37
40
  <img src="https://img.shields.io/badge/Docker-007ec6?logo=docker&logoColor=white" alt="Docker">
38
41
  </a>
@@ -55,7 +58,7 @@
55
58
  </div>
56
59
 
57
60
 
58
- Extract text, tables, images, and metadata from 75+ file formats including PDF, Office documents, and images. Native NAPI-RS bindings for Node.js with superior performance, async/await support, and TypeScript type definitions.
61
+ Extract text, tables, images, and metadata from 88+ file formats including PDF, Office documents, and images. Native NAPI-RS bindings for Node.js with superior performance, async/await support, and TypeScript type definitions.
59
62
 
60
63
 
61
64
  ## Installation
@@ -95,16 +98,9 @@ yarn add @kreuzberg/node
95
98
  ### System Requirements
96
99
 
97
100
  - **Node.js 22+** required (NAPI-RS native bindings)
98
- - Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.24+ for embeddings support
101
+ - Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.22.x for embeddings support
99
102
  - Optional: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for OCR functionality
100
103
 
101
- **Format Support Notes:**
102
- - Legacy formats (DOC, XLS, PPT) are now extracted natively without external tools
103
- - Modern Office formats (DOCX, XLSX, PPTX) are fully supported
104
- - WASM binding supports all document formats via in-memory parsing
105
-
106
-
107
-
108
104
  ### Platform Support
109
105
 
110
106
  Pre-built binaries available for:
@@ -320,19 +316,21 @@ This binding uses NAPI-RS to provide native Node.js bindings with:
320
316
 
321
317
  ## Features
322
318
 
323
- ### Supported File Formats (75+)
319
+ ### Supported File Formats (88+)
324
320
 
325
- 75+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
321
+ 88+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
326
322
 
327
323
  #### Office Documents
328
324
 
329
325
  | Category | Formats | Capabilities |
330
326
  |----------|---------|--------------|
331
- | **Word Processing** | `.docx`, `.odt` | Full text, tables, images, metadata, styles |
332
- | **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.ods` | Sheet data, formulas, cell metadata, charts |
333
- | **Presentations** | `.pptx`, `.ppt`, `.ppsx` | Slides, speaker notes, images, metadata |
327
+ | **Word Processing** | `.docx`, `.docm`, `.dotx`, `.dotm`, `.dot`, `.odt` | Full text, tables, images, metadata, styles |
328
+ | **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.xltx`, `.xlt`, `.ods` | Sheet data, formulas, cell metadata, charts |
329
+ | **Presentations** | `.pptx`, `.pptm`, `.ppsx`, `.potx`, `.potm`, `.pot`, `.ppt` | Slides, speaker notes, images, metadata |
334
330
  | **PDF** | `.pdf` | Text, tables, images, metadata, OCR support |
335
331
  | **eBooks** | `.epub`, `.fb2` | Chapters, metadata, embedded resources |
332
+ | **Database** | `.dbf` | Table data extraction, field type support |
333
+ | **Hangul** | `.hwp`, `.hwpx` | Korean document format, text extraction |
336
334
 
337
335
  #### Images (OCR-Enabled)
338
336
 
package/dist/index.d.mts CHANGED
@@ -1191,6 +1191,6 @@ declare function __resetBindingForTests(): void;
1191
1191
  * @module @kreuzberg/node
1192
1192
  */
1193
1193
 
1194
- declare const __version__ = "4.4.4";
1194
+ declare const __version__ = "4.4.6";
1195
1195
 
1196
1196
  export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
package/dist/index.d.ts CHANGED
@@ -1191,6 +1191,6 @@ declare function __resetBindingForTests(): void;
1191
1191
  * @module @kreuzberg/node
1192
1192
  */
1193
1193
 
1194
- declare const __version__ = "4.4.4";
1194
+ declare const __version__ = "4.4.6";
1195
1195
 
1196
1196
  export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
package/dist/index.js CHANGED
@@ -368,6 +368,9 @@ function normalizeChunkingConfig(chunking) {
368
368
  setIfDefined(normalized, "preset", chunking.preset);
369
369
  setIfDefined(normalized, "embedding", chunking.embedding);
370
370
  setIfDefined(normalized, "enabled", chunking.enabled);
371
+ setIfDefined(normalized, "sizingType", chunking.sizingType);
372
+ setIfDefined(normalized, "sizingModel", chunking.sizingModel);
373
+ setIfDefined(normalized, "sizingCacheDir", chunking.sizingCacheDir);
371
374
  return normalized;
372
375
  }
373
376
  function normalizeImageExtractionConfig(images) {
@@ -594,7 +597,25 @@ function convertChunk(rawChunk) {
594
597
  // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
595
598
  firstPage: metadata["first_page"] ?? metadata["firstPage"] ?? null,
596
599
  // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
597
- lastPage: metadata["last_page"] ?? metadata["lastPage"] ?? null
600
+ lastPage: metadata["last_page"] ?? metadata["lastPage"] ?? null,
601
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
602
+ headingContext: (() => {
603
+ const hc = metadata["heading_context"] ?? metadata["headingContext"];
604
+ if (!hc) return null;
605
+ const headings = hc["headings"];
606
+ if (!Array.isArray(headings)) return null;
607
+ return {
608
+ headings: headings.map((h) => {
609
+ const heading = h;
610
+ return {
611
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
612
+ level: heading["level"] ?? 0,
613
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
614
+ text: heading["text"] ?? ""
615
+ };
616
+ })
617
+ };
618
+ })()
598
619
  }
599
620
  };
600
621
  }
@@ -1197,7 +1218,7 @@ function getEmbeddingPreset(name) {
1197
1218
  }
1198
1219
 
1199
1220
  // typescript/index.ts
1200
- var __version__ = "4.4.4";
1221
+ var __version__ = "4.4.6";
1201
1222
  // Annotate the CommonJS export names for ESM import in node:
1202
1223
  0 && (module.exports = {
1203
1224
  CacheError,