kreuzberg 4.4.5 → 4.4.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -4
- data/README.md +15 -13
- data/ext/kreuzberg_rb/native/Cargo.lock +501 -87
- data/ext/kreuzberg_rb/native/Cargo.toml +2 -1
- data/ext/kreuzberg_rb/native/src/config/types.rs +61 -22
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +25 -2
- data/vendor/Cargo.toml +7 -5
- data/vendor/kreuzberg/Cargo.toml +16 -5
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/handlers.rs +2 -2
- data/vendor/kreuzberg/src/chunking/builder.rs +1 -0
- data/vendor/kreuzberg/src/chunking/config.rs +1 -1
- data/vendor/kreuzberg/src/chunking/core.rs +82 -93
- data/vendor/kreuzberg/src/chunking/headings.rs +174 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +5 -3
- data/vendor/kreuzberg/src/chunking/processor.rs +2 -4
- data/vendor/kreuzberg/src/chunking/tokenizer_cache.rs +81 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +22 -16
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +1 -1
- data/vendor/kreuzberg/src/core/config/mod.rs +3 -1
- data/vendor/kreuzberg/src/core/config/processing.rs +84 -14
- data/vendor/kreuzberg/src/core/mime.rs +50 -3
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +2 -4
- data/vendor/kreuzberg/src/extraction/doc/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/ppt/mod.rs +0 -2
- data/vendor/kreuzberg/src/extractors/dbf.rs +179 -0
- data/vendor/kreuzberg/src/extractors/doc.rs +0 -2
- data/vendor/kreuzberg/src/extractors/docx.rs +69 -5
- data/vendor/kreuzberg/src/extractors/excel.rs +2 -1
- data/vendor/kreuzberg/src/extractors/hwp.rs +124 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +21 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -2
- data/vendor/kreuzberg/src/extractors/ppt.rs +0 -2
- data/vendor/kreuzberg/src/extractors/pptx.rs +3 -3
- data/vendor/kreuzberg/src/lib.rs +3 -2
- data/vendor/kreuzberg/src/mcp/format.rs +1 -0
- data/vendor/kreuzberg/src/types/extraction.rs +29 -0
- data/vendor/kreuzberg/tests/config_features.rs +96 -15
- data/vendor/kreuzberg/tests/test_fastembed.rs +11 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +4 -3
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +1 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +2 -2
- data/vendor/kreuzberg-ffi/src/helpers.rs +1 -0
- data/vendor/kreuzberg-ffi/src/result.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result_view.rs +2 -0
- data/vendor/kreuzberg-ffi/src/types.rs +0 -2
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +2 -2
- data/vendor/kreuzberg-pdfium-render/src/bindgen/pdfium_7678.rs +0 -14
- data/vendor/kreuzberg-pdfium-render/src/bindings/dynamic_bindings.rs +0 -23
- data/vendor/kreuzberg-pdfium-render/src/bindings/static_bindings.rs +0 -17
- data/vendor/kreuzberg-pdfium-render/src/bindings/wasm_bindings.rs +0 -109
- data/vendor/kreuzberg-pdfium-render/src/bindings.rs +0 -31
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/struct_element.rs +0 -8
- data/vendor/kreuzberg-pdfium-render/src/pdf/document.rs +0 -29
- data/vendor/kreuzberg-pdfium-render/src/pdf/font.rs +2 -2
- data/vendor/kreuzberg-pdfium-render/src/pdf/points.rs +1 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +6 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e4402ca953afc0bf4a58309cfd1b8fa6caa7b1ce6d2148021a66efa11e9326b1
|
|
4
|
+
data.tar.gz: 24132a491de22a0525ae99306ec84d28e6ee4c1ba31b7d1516d8f055c387d552
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 0e2383af1191518bd57a2c1a3287d3bdebc41f1feb403da1830c35750142122b35c872daf07b218508397ab7daacf56ff108f7b1466394a1ba3e06e987e9782e
|
|
7
|
+
data.tar.gz: bbc4fb9ba9f9c3b97dacf796bf22bf4d7d7e87c16538ac662211271cd17e3b5b4abcb6b708d903fdc727be1fcc2ac69e34111db6b04833549d3ae99f05390e4a
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.4.5)
|
|
4
|
+
kreuzberg (4.4.6)
|
|
5
5
|
rb_sys (~> 0.9.119)
|
|
6
6
|
sorbet-runtime (~> 0.5)
|
|
7
7
|
|
|
@@ -134,7 +134,7 @@ GEM
|
|
|
134
134
|
rubocop (~> 1.81)
|
|
135
135
|
ruby-progressbar (1.13.0)
|
|
136
136
|
securerandom (0.4.1)
|
|
137
|
-
sorbet-runtime (0.6.
|
|
137
|
+
sorbet-runtime (0.6.13023)
|
|
138
138
|
steep (1.10.0)
|
|
139
139
|
activesupport (>= 5.1)
|
|
140
140
|
concurrent-ruby (>= 1.1.10)
|
|
@@ -222,7 +222,7 @@ CHECKSUMS
|
|
|
222
222
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
223
223
|
json (2.19.1) sha256=dd94fdc59e48bff85913829a32350b3148156bc4fd2a95a2568a78b11344082d
|
|
224
224
|
json-schema (6.2.0) sha256=e8bff46ed845a22c1ab2bd0d7eccf831c01fe23bb3920caa4c74db4306813666
|
|
225
|
-
kreuzberg (4.4.5)
|
|
225
|
+
kreuzberg (4.4.6)
|
|
226
226
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
227
227
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
228
228
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
@@ -259,7 +259,7 @@ CHECKSUMS
|
|
|
259
259
|
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
|
260
260
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
261
261
|
securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
|
|
262
|
-
sorbet-runtime (0.6.
|
|
262
|
+
sorbet-runtime (0.6.13023) sha256=c00d11cc54951efbc0aece994dd6b20b1d1cb2a2606100c24d4ae7f840383073
|
|
263
263
|
steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
|
|
264
264
|
strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
|
|
265
265
|
terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.5" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.6" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -33,12 +33,12 @@
|
|
|
33
33
|
<a href="https://rubygems.org/gems/kreuzberg">
|
|
34
34
|
<img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
|
|
35
35
|
</a>
|
|
36
|
+
<a href="https://kreuzberg-dev.r-universe.dev/kreuzberg">
|
|
37
|
+
<img src="https://img.shields.io/badge/R-kreuzberg-007ec6" alt="R">
|
|
38
|
+
</a>
|
|
36
39
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
|
|
37
40
|
<img src="https://img.shields.io/badge/Docker-007ec6?logo=docker&logoColor=white" alt="Docker">
|
|
38
41
|
</a>
|
|
39
|
-
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
40
|
-
<img src="https://img.shields.io/badge/C-FFI-007ec6" alt="C">
|
|
41
|
-
</a>
|
|
42
42
|
|
|
43
43
|
<!-- Project Info -->
|
|
44
44
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
|
|
@@ -58,7 +58,7 @@
|
|
|
58
58
|
</div>
|
|
59
59
|
|
|
60
60
|
|
|
61
|
-
Extract text, tables, images, and metadata from
|
|
61
|
+
Extract text, tables, images, and metadata from 88+ file formats including PDF, Office documents, and images. Ruby bindings with idiomatic Ruby API and native performance.
|
|
62
62
|
|
|
63
63
|
|
|
64
64
|
## Installation
|
|
@@ -91,7 +91,7 @@ gem 'kreuzberg'
|
|
|
91
91
|
|
|
92
92
|
- **Ruby 3.2.0 or higher** required (including Ruby 4.x)
|
|
93
93
|
- Ruby 4.0+ is fully supported with no code changes required
|
|
94
|
-
- Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.
|
|
94
|
+
- Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.22.x for embeddings support
|
|
95
95
|
- Optional: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for OCR functionality
|
|
96
96
|
|
|
97
97
|
**Ruby 4.0 Compatibility:** Kreuzberg is fully compatible with Ruby 4.0 (released December 25, 2025) and all Ruby 4.x versions. All tests pass with 100% compatibility. The gem compiles without any breaking changes. Key Ruby 4.0 features like Ruby Box, ZJIT compiler, and Ractor improvements work seamlessly with Kreuzberg.
|
|
@@ -188,7 +188,7 @@ config = Kreuzberg::Config::Extraction.new(
|
|
|
188
188
|
result = Kreuzberg.extract_file_sync('contract.pdf', config: config)
|
|
189
189
|
|
|
190
190
|
puts "Extracted #{result.content.length} characters"
|
|
191
|
-
puts "Quality score: #{result.
|
|
191
|
+
puts "Quality score: #{result.quality_score}"
|
|
192
192
|
puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
|
|
193
193
|
```
|
|
194
194
|
|
|
@@ -208,19 +208,21 @@ puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
|
|
|
208
208
|
|
|
209
209
|
## Features
|
|
210
210
|
|
|
211
|
-
### Supported File Formats (
|
|
211
|
+
### Supported File Formats (88+)
|
|
212
212
|
|
|
213
|
-
|
|
213
|
+
88+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
|
|
214
214
|
|
|
215
215
|
#### Office Documents
|
|
216
216
|
|
|
217
217
|
| Category | Formats | Capabilities |
|
|
218
218
|
|----------|---------|--------------|
|
|
219
|
-
| **Word Processing** | `.docx`, `.odt` | Full text, tables, images, metadata, styles |
|
|
220
|
-
| **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.ods` | Sheet data, formulas, cell metadata, charts |
|
|
221
|
-
| **Presentations** | `.pptx`, `.
|
|
219
|
+
| **Word Processing** | `.docx`, `.docm`, `.dotx`, `.dotm`, `.dot`, `.odt` | Full text, tables, images, metadata, styles |
|
|
220
|
+
| **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.xltx`, `.xlt`, `.ods` | Sheet data, formulas, cell metadata, charts |
|
|
221
|
+
| **Presentations** | `.pptx`, `.pptm`, `.ppsx`, `.potx`, `.potm`, `.pot`, `.ppt` | Slides, speaker notes, images, metadata |
|
|
222
222
|
| **PDF** | `.pdf` | Text, tables, images, metadata, OCR support |
|
|
223
223
|
| **eBooks** | `.epub`, `.fb2` | Chapters, metadata, embedded resources |
|
|
224
|
+
| **Database** | `.dbf` | Table data extraction, field type support |
|
|
225
|
+
| **Hangul** | `.hwp`, `.hwpx` | Korean document format, text extraction |
|
|
224
226
|
|
|
225
227
|
#### Images (OCR-Enabled)
|
|
226
228
|
|
|
@@ -334,7 +336,7 @@ config = Kreuzberg::Config::Extraction.new(
|
|
|
334
336
|
result = Kreuzberg.extract_file_sync('contract.pdf', config: config)
|
|
335
337
|
|
|
336
338
|
puts "Extracted #{result.content.length} characters"
|
|
337
|
-
puts "Quality score: #{result.
|
|
339
|
+
puts "Quality score: #{result.quality_score}"
|
|
338
340
|
puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
|
|
339
341
|
```
|
|
340
342
|
|