kreuzberg 4.4.5-aarch64-linux → 4.4.6-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -4
- data/README.md +15 -13
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg_rb.so +0 -0
- data/sig/kreuzberg.rbs +25 -2
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: b52fd8ab029d63ee54f697c758806e2f32566d3b112cf48865f59f9ddb9b4d5f
|
|
4
|
+
data.tar.gz: 65c4f41bd2157f5d79e86fc9bda2b8389a991ffaed9568f602700e95adef55e9
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: b2c65f9ea098867bda920c3732f6b81c4e52c35aa30d25f15f5755808c34b97a780d35e2f39fc05273658b629c3276577878b4cda1893aa901ccf801e7f39953
|
|
7
|
+
data.tar.gz: 5837f3f6e04712ad82d02bc121e3cb483d28a8b5b32a27f6b3af4f57e8ba875a339fc8e63d1fbc5a466e96d78968b63b5a303a9a58c812e4064e367094fbc511
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.4.5)
|
|
4
|
+
kreuzberg (4.4.6)
|
|
5
5
|
rb_sys (~> 0.9.119)
|
|
6
6
|
sorbet-runtime (~> 0.5)
|
|
7
7
|
|
|
@@ -134,7 +134,7 @@ GEM
|
|
|
134
134
|
rubocop (~> 1.81)
|
|
135
135
|
ruby-progressbar (1.13.0)
|
|
136
136
|
securerandom (0.4.1)
|
|
137
|
-
sorbet-runtime (0.6.
|
|
137
|
+
sorbet-runtime (0.6.13023)
|
|
138
138
|
steep (1.10.0)
|
|
139
139
|
activesupport (>= 5.1)
|
|
140
140
|
concurrent-ruby (>= 1.1.10)
|
|
@@ -222,7 +222,7 @@ CHECKSUMS
|
|
|
222
222
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
223
223
|
json (2.19.1) sha256=dd94fdc59e48bff85913829a32350b3148156bc4fd2a95a2568a78b11344082d
|
|
224
224
|
json-schema (6.2.0) sha256=e8bff46ed845a22c1ab2bd0d7eccf831c01fe23bb3920caa4c74db4306813666
|
|
225
|
-
kreuzberg (4.4.5)
|
|
225
|
+
kreuzberg (4.4.6)
|
|
226
226
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
227
227
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
228
228
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
@@ -259,7 +259,7 @@ CHECKSUMS
|
|
|
259
259
|
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
|
260
260
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
261
261
|
securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
|
|
262
|
-
sorbet-runtime (0.6.
|
|
262
|
+
sorbet-runtime (0.6.13023) sha256=c00d11cc54951efbc0aece994dd6b20b1d1cb2a2606100c24d4ae7f840383073
|
|
263
263
|
steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
|
|
264
264
|
strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
|
|
265
265
|
terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.5" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.6" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -33,12 +33,12 @@
|
|
|
33
33
|
<a href="https://rubygems.org/gems/kreuzberg">
|
|
34
34
|
<img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
|
|
35
35
|
</a>
|
|
36
|
+
<a href="https://kreuzberg-dev.r-universe.dev/kreuzberg">
|
|
37
|
+
<img src="https://img.shields.io/badge/R-kreuzberg-007ec6" alt="R">
|
|
38
|
+
</a>
|
|
36
39
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
|
|
37
40
|
<img src="https://img.shields.io/badge/Docker-007ec6?logo=docker&logoColor=white" alt="Docker">
|
|
38
41
|
</a>
|
|
39
|
-
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
40
|
-
<img src="https://img.shields.io/badge/C-FFI-007ec6" alt="C">
|
|
41
|
-
</a>
|
|
42
42
|
|
|
43
43
|
<!-- Project Info -->
|
|
44
44
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
|
|
@@ -58,7 +58,7 @@
|
|
|
58
58
|
</div>
|
|
59
59
|
|
|
60
60
|
|
|
61
|
-
Extract text, tables, images, and metadata from
|
|
61
|
+
Extract text, tables, images, and metadata from 88+ file formats including PDF, Office documents, and images. Ruby bindings with idiomatic Ruby API and native performance.
|
|
62
62
|
|
|
63
63
|
|
|
64
64
|
## Installation
|
|
@@ -91,7 +91,7 @@ gem 'kreuzberg'
|
|
|
91
91
|
|
|
92
92
|
- **Ruby 3.2.0 or higher** required (including Ruby 4.x)
|
|
93
93
|
- Ruby 4.0+ is fully supported with no code changes required
|
|
94
|
-
- Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.
|
|
94
|
+
- Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.22.x for embeddings support
|
|
95
95
|
- Optional: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for OCR functionality
|
|
96
96
|
|
|
97
97
|
**Ruby 4.0 Compatibility:** Kreuzberg is fully compatible with Ruby 4.0 (released December 25, 2025) and all Ruby 4.x versions. All tests pass with 100% compatibility. The gem compiles without any breaking changes. Key Ruby 4.0 features like Ruby Box, ZJIT compiler, and Ractor improvements work seamlessly with Kreuzberg.
|
|
@@ -188,7 +188,7 @@ config = Kreuzberg::Config::Extraction.new(
|
|
|
188
188
|
result = Kreuzberg.extract_file_sync('contract.pdf', config: config)
|
|
189
189
|
|
|
190
190
|
puts "Extracted #{result.content.length} characters"
|
|
191
|
-
puts "Quality score: #{result.
|
|
191
|
+
puts "Quality score: #{result.quality_score}"
|
|
192
192
|
puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
|
|
193
193
|
```
|
|
194
194
|
|
|
@@ -208,19 +208,21 @@ puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
|
|
|
208
208
|
|
|
209
209
|
## Features
|
|
210
210
|
|
|
211
|
-
### Supported File Formats (
|
|
211
|
+
### Supported File Formats (88+)
|
|
212
212
|
|
|
213
|
-
|
|
213
|
+
88+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
|
|
214
214
|
|
|
215
215
|
#### Office Documents
|
|
216
216
|
|
|
217
217
|
| Category | Formats | Capabilities |
|
|
218
218
|
|----------|---------|--------------|
|
|
219
|
-
| **Word Processing** | `.docx`, `.odt` | Full text, tables, images, metadata, styles |
|
|
220
|
-
| **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.ods` | Sheet data, formulas, cell metadata, charts |
|
|
221
|
-
| **Presentations** | `.pptx`, `.
|
|
219
|
+
| **Word Processing** | `.docx`, `.docm`, `.dotx`, `.dotm`, `.dot`, `.odt` | Full text, tables, images, metadata, styles |
|
|
220
|
+
| **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.xltx`, `.xlt`, `.ods` | Sheet data, formulas, cell metadata, charts |
|
|
221
|
+
| **Presentations** | `.pptx`, `.pptm`, `.ppsx`, `.potx`, `.potm`, `.pot`, `.ppt` | Slides, speaker notes, images, metadata |
|
|
222
222
|
| **PDF** | `.pdf` | Text, tables, images, metadata, OCR support |
|
|
223
223
|
| **eBooks** | `.epub`, `.fb2` | Chapters, metadata, embedded resources |
|
|
224
|
+
| **Database** | `.dbf` | Table data extraction, field type support |
|
|
225
|
+
| **Hangul** | `.hwp`, `.hwpx` | Korean document format, text extraction |
|
|
224
226
|
|
|
225
227
|
#### Images (OCR-Enabled)
|
|
226
228
|
|
|
@@ -334,7 +336,7 @@ config = Kreuzberg::Config::Extraction.new(
|
|
|
334
336
|
result = Kreuzberg.extract_file_sync('contract.pdf', config: config)
|
|
335
337
|
|
|
336
338
|
puts "Extracted #{result.content.length} characters"
|
|
337
|
-
puts "Quality score: #{result.
|
|
339
|
+
puts "Quality score: #{result.quality_score}"
|
|
338
340
|
puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
|
|
339
341
|
```
|
|
340
342
|
|
data/lib/kreuzberg/version.rb
CHANGED
data/lib/kreuzberg_rb.so
CHANGED
|
Binary file
|
data/sig/kreuzberg.rbs
CHANGED
|
@@ -259,6 +259,9 @@ module Kreuzberg
|
|
|
259
259
|
attr_reader preset: String?
|
|
260
260
|
attr_reader embedding: Embedding?
|
|
261
261
|
attr_reader enabled: bool?
|
|
262
|
+
attr_reader sizing_type: String?
|
|
263
|
+
attr_reader sizing_model: String?
|
|
264
|
+
attr_reader sizing_cache_dir: String?
|
|
262
265
|
|
|
263
266
|
def initialize: (
|
|
264
267
|
?max_chars: Integer?,
|
|
@@ -267,7 +270,10 @@ module Kreuzberg
|
|
|
267
270
|
?embedding: (Embedding | Hash[Symbol, untyped])?,
|
|
268
271
|
?chunk_size: Integer?,
|
|
269
272
|
?chunk_overlap: Integer?,
|
|
270
|
-
?enabled: bool
|
|
273
|
+
?enabled: bool,
|
|
274
|
+
?sizing_type: String?,
|
|
275
|
+
?sizing_model: String?,
|
|
276
|
+
?sizing_cache_dir: String?
|
|
271
277
|
) -> void
|
|
272
278
|
def to_h: () -> Hash[Symbol, untyped]
|
|
273
279
|
end
|
|
@@ -767,6 +773,21 @@ module Kreuzberg
|
|
|
767
773
|
def to_h: () -> table_hash
|
|
768
774
|
end
|
|
769
775
|
|
|
776
|
+
# Heading level in the document hierarchy
|
|
777
|
+
class HeadingLevel
|
|
778
|
+
attr_reader level: Integer
|
|
779
|
+
attr_reader text: String
|
|
780
|
+
|
|
781
|
+
def initialize: (level: Integer, text: String) -> void
|
|
782
|
+
end
|
|
783
|
+
|
|
784
|
+
# Heading context for a chunk's section
|
|
785
|
+
class HeadingContext
|
|
786
|
+
attr_reader headings: Array[HeadingLevel]
|
|
787
|
+
|
|
788
|
+
def initialize: (headings: Array[HeadingLevel]) -> void
|
|
789
|
+
end
|
|
790
|
+
|
|
770
791
|
# Text chunk (Struct from result.rb)
|
|
771
792
|
class Chunk
|
|
772
793
|
attr_reader content: String
|
|
@@ -778,6 +799,7 @@ module Kreuzberg
|
|
|
778
799
|
attr_reader first_page: Integer?
|
|
779
800
|
attr_reader last_page: Integer?
|
|
780
801
|
attr_reader embedding: Array[Float]?
|
|
802
|
+
attr_reader heading_context: HeadingContext?
|
|
781
803
|
|
|
782
804
|
def initialize: (
|
|
783
805
|
content: String,
|
|
@@ -788,7 +810,8 @@ module Kreuzberg
|
|
|
788
810
|
total_chunks: Integer,
|
|
789
811
|
first_page: Integer?,
|
|
790
812
|
last_page: Integer?,
|
|
791
|
-
embedding: Array[Float]
|
|
813
|
+
embedding: Array[Float]?,
|
|
814
|
+
heading_context: HeadingContext?
|
|
792
815
|
) -> void
|
|
793
816
|
def to_h: () -> chunk_hash
|
|
794
817
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kreuzberg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 4.4.5
|
|
4
|
+
version: 4.4.6
|
|
5
5
|
platform: aarch64-linux
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-03-
|
|
11
|
+
date: 2026-03-13 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|