kreuzberg 4.4.5-aarch64-linux → 4.4.6-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b0b2ee3a1f346bf1409b2f773a83bb25e3f18eb097988e2c03450f0e9e5aa0f6
4
- data.tar.gz: 906a83d73d97de0fbee6ac7bd278188513b1fee4f0a69adc523cb9ad8dc45ac9
3
+ metadata.gz: b52fd8ab029d63ee54f697c758806e2f32566d3b112cf48865f59f9ddb9b4d5f
4
+ data.tar.gz: 65c4f41bd2157f5d79e86fc9bda2b8389a991ffaed9568f602700e95adef55e9
5
5
  SHA512:
6
- metadata.gz: 49f8597b418c1f8b593a0f1870727e467fbcf0852720c8470bef92aa8a53f8644f32083be305ddb383fb63254e738129de197c7d91c8bb7db1cf9c96f35369ed
7
- data.tar.gz: a72287f790e915284d7dae4fd2b1e9e0122ed3ffebf290670066b6c3214a5462a481f814c0ade782e345f5c5eaeab868d7e5c81cd340d4023fd0185138301b89
6
+ metadata.gz: b2c65f9ea098867bda920c3732f6b81c4e52c35aa30d25f15f5755808c34b97a780d35e2f39fc05273658b629c3276577878b4cda1893aa901ccf801e7f39953
7
+ data.tar.gz: 5837f3f6e04712ad82d02bc121e3cb483d28a8b5b32a27f6b3af4f57e8ba875a339fc8e63d1fbc5a466e96d78968b63b5a303a9a58c812e4064e367094fbc511
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.4.5)
4
+ kreuzberg (4.4.6)
5
5
  rb_sys (~> 0.9.119)
6
6
  sorbet-runtime (~> 0.5)
7
7
 
@@ -134,7 +134,7 @@ GEM
134
134
  rubocop (~> 1.81)
135
135
  ruby-progressbar (1.13.0)
136
136
  securerandom (0.4.1)
137
- sorbet-runtime (0.6.13011)
137
+ sorbet-runtime (0.6.13023)
138
138
  steep (1.10.0)
139
139
  activesupport (>= 5.1)
140
140
  concurrent-ruby (>= 1.1.10)
@@ -222,7 +222,7 @@ CHECKSUMS
222
222
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
223
223
  json (2.19.1) sha256=dd94fdc59e48bff85913829a32350b3148156bc4fd2a95a2568a78b11344082d
224
224
  json-schema (6.2.0) sha256=e8bff46ed845a22c1ab2bd0d7eccf831c01fe23bb3920caa4c74db4306813666
225
- kreuzberg (4.4.5)
225
+ kreuzberg (4.4.6)
226
226
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
227
227
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
228
228
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
@@ -259,7 +259,7 @@ CHECKSUMS
259
259
  rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
260
260
  ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
261
261
  securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
262
- sorbet-runtime (0.6.13011) sha256=d451e380097747d64d39595fbbb6db2a198310f9eff0f810cd6e5696b402833f
262
+ sorbet-runtime (0.6.13023) sha256=c00d11cc54951efbc0aece994dd6b20b1d1cb2a2606100c24d4ae7f840383073
263
263
  steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
264
264
  strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
265
265
  terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.5" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.6" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -33,12 +33,12 @@
33
33
  <a href="https://rubygems.org/gems/kreuzberg">
34
34
  <img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
35
35
  </a>
36
+ <a href="https://kreuzberg-dev.r-universe.dev/kreuzberg">
37
+ <img src="https://img.shields.io/badge/R-kreuzberg-007ec6" alt="R">
38
+ </a>
36
39
  <a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
37
40
  <img src="https://img.shields.io/badge/Docker-007ec6?logo=docker&logoColor=white" alt="Docker">
38
41
  </a>
39
- <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
40
- <img src="https://img.shields.io/badge/C-FFI-007ec6" alt="C">
41
- </a>
42
42
 
43
43
  <!-- Project Info -->
44
44
  <a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
@@ -58,7 +58,7 @@
58
58
  </div>
59
59
 
60
60
 
61
- Extract text, tables, images, and metadata from 75+ file formats including PDF, Office documents, and images. Ruby bindings with idiomatic Ruby API and native performance.
61
+ Extract text, tables, images, and metadata from 88+ file formats including PDF, Office documents, and images. Ruby bindings with idiomatic Ruby API and native performance.
62
62
 
63
63
 
64
64
  ## Installation
@@ -91,7 +91,7 @@ gem 'kreuzberg'
91
91
 
92
92
  - **Ruby 3.2.0 or higher** required (including Ruby 4.x)
93
93
  - Ruby 4.0+ is fully supported with no code changes required
94
- - Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.24+ for embeddings support
94
+ - Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.22.x for embeddings support
95
95
  - Optional: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for OCR functionality
96
96
 
97
97
  **Ruby 4.0 Compatibility:** Kreuzberg is fully compatible with Ruby 4.0 (released December 25, 2025) and all Ruby 4.x versions. All tests pass with 100% compatibility. The gem compiles without any breaking changes. Key Ruby 4.0 features like Ruby Box, ZJIT compiler, and Ractor improvements work seamlessly with Kreuzberg.
@@ -188,7 +188,7 @@ config = Kreuzberg::Config::Extraction.new(
188
188
  result = Kreuzberg.extract_file_sync('contract.pdf', config: config)
189
189
 
190
190
  puts "Extracted #{result.content.length} characters"
191
- puts "Quality score: #{result.metadata&.dig('quality_score')}"
191
+ puts "Quality score: #{result.quality_score}"
192
192
  puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
193
193
  ```
194
194
 
@@ -208,19 +208,21 @@ puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
208
208
 
209
209
  ## Features
210
210
 
211
- ### Supported File Formats (75+)
211
+ ### Supported File Formats (88+)
212
212
 
213
- 75+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
213
+ 88+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
214
214
 
215
215
  #### Office Documents
216
216
 
217
217
  | Category | Formats | Capabilities |
218
218
  |----------|---------|--------------|
219
- | **Word Processing** | `.docx`, `.odt` | Full text, tables, images, metadata, styles |
220
- | **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.ods` | Sheet data, formulas, cell metadata, charts |
221
- | **Presentations** | `.pptx`, `.ppt`, `.ppsx` | Slides, speaker notes, images, metadata |
219
+ | **Word Processing** | `.docx`, `.docm`, `.dotx`, `.dotm`, `.dot`, `.odt` | Full text, tables, images, metadata, styles |
220
+ | **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.xltx`, `.xlt`, `.ods` | Sheet data, formulas, cell metadata, charts |
221
+ | **Presentations** | `.pptx`, `.pptm`, `.ppsx`, `.potx`, `.potm`, `.pot`, `.ppt` | Slides, speaker notes, images, metadata |
222
222
  | **PDF** | `.pdf` | Text, tables, images, metadata, OCR support |
223
223
  | **eBooks** | `.epub`, `.fb2` | Chapters, metadata, embedded resources |
224
+ | **Database** | `.dbf` | Table data extraction, field type support |
225
+ | **Hangul** | `.hwp`, `.hwpx` | Korean document format, text extraction |
224
226
 
225
227
  #### Images (OCR-Enabled)
226
228
 
@@ -334,7 +336,7 @@ config = Kreuzberg::Config::Extraction.new(
334
336
  result = Kreuzberg.extract_file_sync('contract.pdf', config: config)
335
337
 
336
338
  puts "Extracted #{result.content.length} characters"
337
- puts "Quality score: #{result.metadata&.dig('quality_score')}"
339
+ puts "Quality score: #{result.quality_score}"
338
340
  puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
339
341
  ```
340
342
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.4.5'
4
+ VERSION = '4.4.6'
5
5
  end
data/lib/kreuzberg_rb.so CHANGED
Binary file
data/sig/kreuzberg.rbs CHANGED
@@ -259,6 +259,9 @@ module Kreuzberg
259
259
  attr_reader preset: String?
260
260
  attr_reader embedding: Embedding?
261
261
  attr_reader enabled: bool?
262
+ attr_reader sizing_type: String?
263
+ attr_reader sizing_model: String?
264
+ attr_reader sizing_cache_dir: String?
262
265
 
263
266
  def initialize: (
264
267
  ?max_chars: Integer?,
@@ -267,7 +270,10 @@ module Kreuzberg
267
270
  ?embedding: (Embedding | Hash[Symbol, untyped])?,
268
271
  ?chunk_size: Integer?,
269
272
  ?chunk_overlap: Integer?,
270
- ?enabled: bool
273
+ ?enabled: bool,
274
+ ?sizing_type: String?,
275
+ ?sizing_model: String?,
276
+ ?sizing_cache_dir: String?
271
277
  ) -> void
272
278
  def to_h: () -> Hash[Symbol, untyped]
273
279
  end
@@ -767,6 +773,21 @@ module Kreuzberg
767
773
  def to_h: () -> table_hash
768
774
  end
769
775
 
776
+ # Heading level in the document hierarchy
777
+ class HeadingLevel
778
+ attr_reader level: Integer
779
+ attr_reader text: String
780
+
781
+ def initialize: (level: Integer, text: String) -> void
782
+ end
783
+
784
+ # Heading context for a chunk's section
785
+ class HeadingContext
786
+ attr_reader headings: Array[HeadingLevel]
787
+
788
+ def initialize: (headings: Array[HeadingLevel]) -> void
789
+ end
790
+
770
791
  # Text chunk (Struct from result.rb)
771
792
  class Chunk
772
793
  attr_reader content: String
@@ -778,6 +799,7 @@ module Kreuzberg
778
799
  attr_reader first_page: Integer?
779
800
  attr_reader last_page: Integer?
780
801
  attr_reader embedding: Array[Float]?
802
+ attr_reader heading_context: HeadingContext?
781
803
 
782
804
  def initialize: (
783
805
  content: String,
@@ -788,7 +810,8 @@ module Kreuzberg
788
810
  total_chunks: Integer,
789
811
  first_page: Integer?,
790
812
  last_page: Integer?,
791
- embedding: Array[Float]?
813
+ embedding: Array[Float]?,
814
+ heading_context: HeadingContext?
792
815
  ) -> void
793
816
  def to_h: () -> chunk_hash
794
817
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.4.5
4
+ version: 4.4.6
5
5
  platform: aarch64-linux
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-03-11 00:00:00.000000000 Z
11
+ date: 2026-03-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler