@kreuzberg/node 4.4.5 → 4.5.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.5" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.5.0" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -33,6 +33,9 @@
33
33
  <a href="https://rubygems.org/gems/kreuzberg">
34
34
  <img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
35
35
  </a>
36
+ <a href="https://kreuzberg-dev.r-universe.dev/kreuzberg">
37
+ <img src="https://img.shields.io/badge/R-kreuzberg-007ec6" alt="R">
38
+ </a>
36
39
  <a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
37
40
  <img src="https://img.shields.io/badge/Docker-007ec6?logo=docker&logoColor=white" alt="Docker">
38
41
  </a>
@@ -44,6 +47,9 @@
44
47
  <a href="https://docs.kreuzberg.dev">
45
48
  <img src="https://img.shields.io/badge/docs-kreuzberg.dev-blue" alt="Documentation">
46
49
  </a>
50
+ <a href="https://huggingface.co/Kreuzberg">
51
+ <img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-yellow" alt="Hugging Face">
52
+ </a>
47
53
  </div>
48
54
 
49
55
  <img width="1128" height="191" alt="Banner2" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
@@ -55,7 +61,7 @@
55
61
  </div>
56
62
 
57
63
 
58
- Extract text, tables, images, and metadata from 75+ file formats including PDF, Office documents, and images. Native NAPI-RS bindings for Node.js with superior performance, async/await support, and TypeScript type definitions.
64
+ Extract text, tables, images, and metadata from 88+ file formats including PDF, Office documents, and images. Native NAPI-RS bindings for Node.js with superior performance, async/await support, and TypeScript type definitions.
59
65
 
60
66
 
61
67
  ## Installation
@@ -95,16 +101,9 @@ yarn add @kreuzberg/node
95
101
  ### System Requirements
96
102
 
97
103
  - **Node.js 22+** required (NAPI-RS native bindings)
98
- - Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.24+ for embeddings support
104
+ - Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.22.x for embeddings support
99
105
  - Optional: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for OCR functionality
100
106
 
101
- **Format Support Notes:**
102
- - Legacy formats (DOC, XLS, PPT) are now extracted natively without external tools
103
- - Modern Office formats (DOCX, XLSX, PPTX) are fully supported
104
- - WASM binding supports all document formats via in-memory parsing
105
-
106
-
107
-
108
107
  ### Platform Support
109
108
 
110
109
  Pre-built binaries available for:
@@ -320,19 +319,21 @@ This binding uses NAPI-RS to provide native Node.js bindings with:
320
319
 
321
320
  ## Features
322
321
 
323
- ### Supported File Formats (75+)
322
+ ### Supported File Formats (88+)
324
323
 
325
- 75+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
324
+ 88+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
326
325
 
327
326
  #### Office Documents
328
327
 
329
328
  | Category | Formats | Capabilities |
330
329
  |----------|---------|--------------|
331
- | **Word Processing** | `.docx`, `.odt` | Full text, tables, images, metadata, styles |
332
- | **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.ods` | Sheet data, formulas, cell metadata, charts |
333
- | **Presentations** | `.pptx`, `.ppt`, `.ppsx` | Slides, speaker notes, images, metadata |
330
+ | **Word Processing** | `.docx`, `.docm`, `.dotx`, `.dotm`, `.dot`, `.odt` | Full text, tables, images, metadata, styles |
331
+ | **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.xltx`, `.xlt`, `.ods` | Sheet data, formulas, cell metadata, charts |
332
+ | **Presentations** | `.pptx`, `.pptm`, `.ppsx`, `.potx`, `.potm`, `.pot`, `.ppt` | Slides, speaker notes, images, metadata |
334
333
  | **PDF** | `.pdf` | Text, tables, images, metadata, OCR support |
335
334
  | **eBooks** | `.epub`, `.fb2` | Chapters, metadata, embedded resources |
335
+ | **Database** | `.dbf` | Table data extraction, field type support |
336
+ | **Hangul** | `.hwp`, `.hwpx` | Korean document format, text extraction |
336
337
 
337
338
  #### Images (OCR-Enabled)
338
339
 
package/dist/index.d.mts CHANGED
@@ -1191,6 +1191,6 @@ declare function __resetBindingForTests(): void;
1191
1191
  * @module @kreuzberg/node
1192
1192
  */
1193
1193
 
1194
- declare const __version__ = "4.4.5";
1194
+ declare const __version__ = "4.5.0";
1195
1195
 
1196
1196
  export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
package/dist/index.d.ts CHANGED
@@ -1191,6 +1191,6 @@ declare function __resetBindingForTests(): void;
1191
1191
  * @module @kreuzberg/node
1192
1192
  */
1193
1193
 
1194
- declare const __version__ = "4.4.5";
1194
+ declare const __version__ = "4.5.0";
1195
1195
 
1196
1196
  export { type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, registerOcrBackend, registerPostProcessor, registerValidator, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };
package/dist/index.js CHANGED
@@ -368,6 +368,9 @@ function normalizeChunkingConfig(chunking) {
368
368
  setIfDefined(normalized, "preset", chunking.preset);
369
369
  setIfDefined(normalized, "embedding", chunking.embedding);
370
370
  setIfDefined(normalized, "enabled", chunking.enabled);
371
+ setIfDefined(normalized, "sizingType", chunking.sizingType);
372
+ setIfDefined(normalized, "sizingModel", chunking.sizingModel);
373
+ setIfDefined(normalized, "sizingCacheDir", chunking.sizingCacheDir);
371
374
  return normalized;
372
375
  }
373
376
  function normalizeImageExtractionConfig(images) {
@@ -530,10 +533,22 @@ function normalizeExtractionConfig(config) {
530
533
  setIfDefined(normalized, "pages", pages);
531
534
  const htmlOptions = normalizeHtmlOptions(config.htmlOptions);
532
535
  setIfDefined(normalized, "htmlOptions", htmlOptions);
536
+ const layout = normalizeLayoutDetectionConfig(config.layout);
537
+ setIfDefined(normalized, "layout", layout);
533
538
  setIfDefined(normalized, "outputFormat", config.outputFormat);
534
539
  setIfDefined(normalized, "resultFormat", config.resultFormat);
535
540
  return normalized;
536
541
  }
542
+ function normalizeLayoutDetectionConfig(config) {
543
+ if (!config) {
544
+ return void 0;
545
+ }
546
+ const normalized = {};
547
+ setIfDefined(normalized, "preset", config.preset);
548
+ setIfDefined(normalized, "confidenceThreshold", config.confidenceThreshold);
549
+ setIfDefined(normalized, "applyHeuristics", config.applyHeuristics);
550
+ return normalized;
551
+ }
537
552
 
538
553
  // typescript/core/type-converters.ts
539
554
  function parseMetadata(metadataStr) {
@@ -594,7 +609,25 @@ function convertChunk(rawChunk) {
594
609
  // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
595
610
  firstPage: metadata["first_page"] ?? metadata["firstPage"] ?? null,
596
611
  // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
597
- lastPage: metadata["last_page"] ?? metadata["lastPage"] ?? null
612
+ lastPage: metadata["last_page"] ?? metadata["lastPage"] ?? null,
613
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
614
+ headingContext: (() => {
615
+ const hc = metadata["heading_context"] ?? metadata["headingContext"];
616
+ if (!hc) return null;
617
+ const headings = hc["headings"];
618
+ if (!Array.isArray(headings)) return null;
619
+ return {
620
+ headings: headings.map((h) => {
621
+ const heading = h;
622
+ return {
623
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
624
+ level: heading["level"] ?? 0,
625
+ // biome-ignore lint/complexity/useLiteralKeys: required for strict TypeScript noPropertyAccessFromIndexSignature
626
+ text: heading["text"] ?? ""
627
+ };
628
+ })
629
+ };
630
+ })()
598
631
  }
599
632
  };
600
633
  }
@@ -1197,7 +1230,7 @@ function getEmbeddingPreset(name) {
1197
1230
  }
1198
1231
 
1199
1232
  // typescript/index.ts
1200
- var __version__ = "4.4.5";
1233
+ var __version__ = "4.5.0";
1201
1234
  // Annotate the CommonJS export names for ESM import in node:
1202
1235
  0 && (module.exports = {
1203
1236
  CacheError,