kreuzberg 4.4.5 → 4.4.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +4 -4
  3. data/README.md +15 -13
  4. data/ext/kreuzberg_rb/native/Cargo.lock +501 -87
  5. data/ext/kreuzberg_rb/native/Cargo.toml +2 -1
  6. data/ext/kreuzberg_rb/native/src/config/types.rs +61 -22
  7. data/lib/kreuzberg/version.rb +1 -1
  8. data/sig/kreuzberg.rbs +25 -2
  9. data/vendor/Cargo.toml +7 -5
  10. data/vendor/kreuzberg/Cargo.toml +16 -5
  11. data/vendor/kreuzberg/README.md +1 -1
  12. data/vendor/kreuzberg/src/api/handlers.rs +2 -2
  13. data/vendor/kreuzberg/src/chunking/builder.rs +1 -0
  14. data/vendor/kreuzberg/src/chunking/config.rs +1 -1
  15. data/vendor/kreuzberg/src/chunking/core.rs +82 -93
  16. data/vendor/kreuzberg/src/chunking/headings.rs +174 -0
  17. data/vendor/kreuzberg/src/chunking/mod.rs +5 -3
  18. data/vendor/kreuzberg/src/chunking/processor.rs +2 -4
  19. data/vendor/kreuzberg/src/chunking/tokenizer_cache.rs +81 -0
  20. data/vendor/kreuzberg/src/core/config/extraction/env.rs +22 -16
  21. data/vendor/kreuzberg/src/core/config/extraction/types.rs +1 -1
  22. data/vendor/kreuzberg/src/core/config/mod.rs +3 -1
  23. data/vendor/kreuzberg/src/core/config/processing.rs +84 -14
  24. data/vendor/kreuzberg/src/core/mime.rs +50 -3
  25. data/vendor/kreuzberg/src/core/pipeline/tests.rs +2 -4
  26. data/vendor/kreuzberg/src/extraction/doc/mod.rs +0 -2
  27. data/vendor/kreuzberg/src/extraction/ppt/mod.rs +0 -2
  28. data/vendor/kreuzberg/src/extractors/dbf.rs +179 -0
  29. data/vendor/kreuzberg/src/extractors/doc.rs +0 -2
  30. data/vendor/kreuzberg/src/extractors/docx.rs +69 -5
  31. data/vendor/kreuzberg/src/extractors/excel.rs +2 -1
  32. data/vendor/kreuzberg/src/extractors/hwp.rs +124 -0
  33. data/vendor/kreuzberg/src/extractors/mod.rs +21 -1
  34. data/vendor/kreuzberg/src/extractors/odt.rs +0 -2
  35. data/vendor/kreuzberg/src/extractors/ppt.rs +0 -2
  36. data/vendor/kreuzberg/src/extractors/pptx.rs +3 -3
  37. data/vendor/kreuzberg/src/lib.rs +3 -2
  38. data/vendor/kreuzberg/src/mcp/format.rs +1 -0
  39. data/vendor/kreuzberg/src/types/extraction.rs +29 -0
  40. data/vendor/kreuzberg/tests/config_features.rs +96 -15
  41. data/vendor/kreuzberg/tests/test_fastembed.rs +11 -0
  42. data/vendor/kreuzberg-ffi/Cargo.toml +4 -3
  43. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +1 -0
  44. data/vendor/kreuzberg-ffi/kreuzberg.h +2 -2
  45. data/vendor/kreuzberg-ffi/src/helpers.rs +1 -0
  46. data/vendor/kreuzberg-ffi/src/result.rs +2 -0
  47. data/vendor/kreuzberg-ffi/src/result_view.rs +2 -0
  48. data/vendor/kreuzberg-ffi/src/types.rs +0 -2
  49. data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
  50. data/vendor/kreuzberg-pdfium-render/Cargo.toml +2 -2
  51. data/vendor/kreuzberg-pdfium-render/src/bindgen/pdfium_7678.rs +0 -14
  52. data/vendor/kreuzberg-pdfium-render/src/bindings/dynamic_bindings.rs +0 -23
  53. data/vendor/kreuzberg-pdfium-render/src/bindings/static_bindings.rs +0 -17
  54. data/vendor/kreuzberg-pdfium-render/src/bindings/wasm_bindings.rs +0 -109
  55. data/vendor/kreuzberg-pdfium-render/src/bindings.rs +0 -31
  56. data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/struct_element.rs +0 -8
  57. data/vendor/kreuzberg-pdfium-render/src/pdf/document.rs +0 -29
  58. data/vendor/kreuzberg-pdfium-render/src/pdf/font.rs +2 -2
  59. data/vendor/kreuzberg-pdfium-render/src/pdf/points.rs +1 -0
  60. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  61. metadata +6 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c376485167aade739cda1b3ac0ba9f8f19bc6c69201f29ab6a13fdb1b9615c9e
4
- data.tar.gz: e89130964d89de12fb5dd2c179f695d239605eaf0dd39312afaeb8c96114d0e0
3
+ metadata.gz: e4402ca953afc0bf4a58309cfd1b8fa6caa7b1ce6d2148021a66efa11e9326b1
4
+ data.tar.gz: 24132a491de22a0525ae99306ec84d28e6ee4c1ba31b7d1516d8f055c387d552
5
5
  SHA512:
6
- metadata.gz: 740a010ae02293ec8228ed99f01c62f6d139953d77cd11de7913769f083b0e349986fd1c5b930f505176dcf39c1475d15077cf67bddec40a755e154f8974cd8d
7
- data.tar.gz: 8b04c3aa6a73b2284d81948ed0ab954b42f7605a3609e976f47914d5d274f70dc13b0432e7890bbc426b238627da3ed76a79f4c7c915e05859c3e4aef4e28600
6
+ metadata.gz: 0e2383af1191518bd57a2c1a3287d3bdebc41f1feb403da1830c35750142122b35c872daf07b218508397ab7daacf56ff108f7b1466394a1ba3e06e987e9782e
7
+ data.tar.gz: bbc4fb9ba9f9c3b97dacf796bf22bf4d7d7e87c16538ac662211271cd17e3b5b4abcb6b708d903fdc727be1fcc2ac69e34111db6b04833549d3ae99f05390e4a
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.4.5)
4
+ kreuzberg (4.4.6)
5
5
  rb_sys (~> 0.9.119)
6
6
  sorbet-runtime (~> 0.5)
7
7
 
@@ -134,7 +134,7 @@ GEM
134
134
  rubocop (~> 1.81)
135
135
  ruby-progressbar (1.13.0)
136
136
  securerandom (0.4.1)
137
- sorbet-runtime (0.6.13011)
137
+ sorbet-runtime (0.6.13023)
138
138
  steep (1.10.0)
139
139
  activesupport (>= 5.1)
140
140
  concurrent-ruby (>= 1.1.10)
@@ -222,7 +222,7 @@ CHECKSUMS
222
222
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
223
223
  json (2.19.1) sha256=dd94fdc59e48bff85913829a32350b3148156bc4fd2a95a2568a78b11344082d
224
224
  json-schema (6.2.0) sha256=e8bff46ed845a22c1ab2bd0d7eccf831c01fe23bb3920caa4c74db4306813666
225
- kreuzberg (4.4.5)
225
+ kreuzberg (4.4.6)
226
226
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
227
227
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
228
228
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
@@ -259,7 +259,7 @@ CHECKSUMS
259
259
  rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
260
260
  ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
261
261
  securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
262
- sorbet-runtime (0.6.13011) sha256=d451e380097747d64d39595fbbb6db2a198310f9eff0f810cd6e5696b402833f
262
+ sorbet-runtime (0.6.13023) sha256=c00d11cc54951efbc0aece994dd6b20b1d1cb2a2606100c24d4ae7f840383073
263
263
  steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
264
264
  strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
265
265
  terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.5" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.6" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -33,12 +33,12 @@
33
33
  <a href="https://rubygems.org/gems/kreuzberg">
34
34
  <img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
35
35
  </a>
36
+ <a href="https://kreuzberg-dev.r-universe.dev/kreuzberg">
37
+ <img src="https://img.shields.io/badge/R-kreuzberg-007ec6" alt="R">
38
+ </a>
36
39
  <a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
37
40
  <img src="https://img.shields.io/badge/Docker-007ec6?logo=docker&logoColor=white" alt="Docker">
38
41
  </a>
39
- <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
40
- <img src="https://img.shields.io/badge/C-FFI-007ec6" alt="C">
41
- </a>
42
42
 
43
43
  <!-- Project Info -->
44
44
  <a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
@@ -58,7 +58,7 @@
58
58
  </div>
59
59
 
60
60
 
61
- Extract text, tables, images, and metadata from 75+ file formats including PDF, Office documents, and images. Ruby bindings with idiomatic Ruby API and native performance.
61
+ Extract text, tables, images, and metadata from 88+ file formats including PDF, Office documents, and images. Ruby bindings with idiomatic Ruby API and native performance.
62
62
 
63
63
 
64
64
  ## Installation
@@ -91,7 +91,7 @@ gem 'kreuzberg'
91
91
 
92
92
  - **Ruby 3.2.0 or higher** required (including Ruby 4.x)
93
93
  - Ruby 4.0+ is fully supported with no code changes required
94
- - Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.24+ for embeddings support
94
+ - Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.22.x for embeddings support
95
95
  - Optional: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for OCR functionality
96
96
 
97
97
  **Ruby 4.0 Compatibility:** Kreuzberg is fully compatible with Ruby 4.0 (released December 25, 2025) and all Ruby 4.x versions. All tests pass with 100% compatibility. The gem compiles without any breaking changes. Key Ruby 4.0 features like Ruby Box, ZJIT compiler, and Ractor improvements work seamlessly with Kreuzberg.
@@ -188,7 +188,7 @@ config = Kreuzberg::Config::Extraction.new(
188
188
  result = Kreuzberg.extract_file_sync('contract.pdf', config: config)
189
189
 
190
190
  puts "Extracted #{result.content.length} characters"
191
- puts "Quality score: #{result.metadata&.dig('quality_score')}"
191
+ puts "Quality score: #{result.quality_score}"
192
192
  puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
193
193
  ```
194
194
 
@@ -208,19 +208,21 @@ puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
208
208
 
209
209
  ## Features
210
210
 
211
- ### Supported File Formats (75+)
211
+ ### Supported File Formats (88+)
212
212
 
213
- 75+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
213
+ 88+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
214
214
 
215
215
  #### Office Documents
216
216
 
217
217
  | Category | Formats | Capabilities |
218
218
  |----------|---------|--------------|
219
- | **Word Processing** | `.docx`, `.odt` | Full text, tables, images, metadata, styles |
220
- | **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.ods` | Sheet data, formulas, cell metadata, charts |
221
- | **Presentations** | `.pptx`, `.ppt`, `.ppsx` | Slides, speaker notes, images, metadata |
219
+ | **Word Processing** | `.docx`, `.docm`, `.dotx`, `.dotm`, `.dot`, `.odt` | Full text, tables, images, metadata, styles |
220
+ | **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.xltx`, `.xlt`, `.ods` | Sheet data, formulas, cell metadata, charts |
221
+ | **Presentations** | `.pptx`, `.pptm`, `.ppsx`, `.potx`, `.potm`, `.pot`, `.ppt` | Slides, speaker notes, images, metadata |
222
222
  | **PDF** | `.pdf` | Text, tables, images, metadata, OCR support |
223
223
  | **eBooks** | `.epub`, `.fb2` | Chapters, metadata, embedded resources |
224
+ | **Database** | `.dbf` | Table data extraction, field type support |
225
+ | **Hangul** | `.hwp`, `.hwpx` | Korean document format, text extraction |
224
226
 
225
227
  #### Images (OCR-Enabled)
226
228
 
@@ -334,7 +336,7 @@ config = Kreuzberg::Config::Extraction.new(
334
336
  result = Kreuzberg.extract_file_sync('contract.pdf', config: config)
335
337
 
336
338
  puts "Extracted #{result.content.length} characters"
337
- puts "Quality score: #{result.metadata&.dig('quality_score')}"
339
+ puts "Quality score: #{result.quality_score}"
338
340
  puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
339
341
  ```
340
342