kreuzberg 4.4.5-aarch64-linux → 4.5.1-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +8 -8
- data/README.md +18 -13
- data/lib/kreuzberg/config.rb +116 -9
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg_rb.so +0 -0
- data/sig/kreuzberg.rbs +59 -8
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: a4f7421dd696f83a87681b1250e5a2271dd4dbad4cf31999b272921c87a6cbc3
|
|
4
|
+
data.tar.gz: a9fb76d70dedec40e317f36c432a715a5076ac9b2cb66a5e1030900003dfcb73
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: efaec6320e12b8f8d56016d26067a8ba5de8a92e2aee51ffc530e189fc5df6c0fc14125fae03cd6bfcd6b5e8386066741b089170257d20e047a28592652a1019
|
|
7
|
+
data.tar.gz: 3fd93ff62a4d0e63b782e4da4f93288a2705a5d84907910fc475062861e7c32a4dbe9cd1d3d8aaa194d7f977891b653cfc19ee38a35ded03ed94aace011dc419
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.
|
|
4
|
+
kreuzberg (4.5.1)
|
|
5
5
|
rb_sys (~> 0.9.119)
|
|
6
6
|
sorbet-runtime (~> 0.5)
|
|
7
7
|
|
|
@@ -49,7 +49,7 @@ GEM
|
|
|
49
49
|
i18n (1.14.8)
|
|
50
50
|
concurrent-ruby (~> 1.0)
|
|
51
51
|
io-console (0.8.2)
|
|
52
|
-
json (2.19.
|
|
52
|
+
json (2.19.2)
|
|
53
53
|
json-schema (6.2.0)
|
|
54
54
|
addressable (~> 2.8)
|
|
55
55
|
bigdecimal (>= 3.1, < 5)
|
|
@@ -60,7 +60,7 @@ GEM
|
|
|
60
60
|
rb-fsevent (~> 0.10, >= 0.10.3)
|
|
61
61
|
rb-inotify (~> 0.9, >= 0.9.10)
|
|
62
62
|
logger (1.7.0)
|
|
63
|
-
mcp (0.
|
|
63
|
+
mcp (0.9.0)
|
|
64
64
|
json-schema (>= 4.1)
|
|
65
65
|
method_source (1.1.0)
|
|
66
66
|
minitest (6.0.2)
|
|
@@ -134,7 +134,7 @@ GEM
|
|
|
134
134
|
rubocop (~> 1.81)
|
|
135
135
|
ruby-progressbar (1.13.0)
|
|
136
136
|
securerandom (0.4.1)
|
|
137
|
-
sorbet-runtime (0.6.
|
|
137
|
+
sorbet-runtime (0.6.13051)
|
|
138
138
|
steep (1.10.0)
|
|
139
139
|
activesupport (>= 5.1)
|
|
140
140
|
concurrent-ruby (>= 1.1.10)
|
|
@@ -220,14 +220,14 @@ CHECKSUMS
|
|
|
220
220
|
fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
|
|
221
221
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
222
222
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
223
|
-
json (2.19.
|
|
223
|
+
json (2.19.2) sha256=e7e1bd318b2c37c4ceee2444841c86539bc462e81f40d134cf97826cb14e83cf
|
|
224
224
|
json-schema (6.2.0) sha256=e8bff46ed845a22c1ab2bd0d7eccf831c01fe23bb3920caa4c74db4306813666
|
|
225
|
-
kreuzberg (4.
|
|
225
|
+
kreuzberg (4.5.1)
|
|
226
226
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
227
227
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
228
228
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
229
229
|
logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
|
|
230
|
-
mcp (0.
|
|
230
|
+
mcp (0.9.0) sha256=a0a3737b0ac9df0772f4ef7e2b013c260ddbcf217a5d50a66bff0baeddf03e47
|
|
231
231
|
method_source (1.1.0) sha256=181301c9c45b731b4769bc81e8860e72f9161ad7d66dd99103c9ab84f560f5c5
|
|
232
232
|
minitest (6.0.2) sha256=db6e57956f6ecc6134683b4c87467d6dd792323c7f0eea7b93f66bd284adbc3d
|
|
233
233
|
mutex_m (0.3.0) sha256=cfcb04ac16b69c4813777022fdceda24e9f798e48092a2b817eb4c0a782b0751
|
|
@@ -259,7 +259,7 @@ CHECKSUMS
|
|
|
259
259
|
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
|
260
260
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
261
261
|
securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
|
|
262
|
-
sorbet-runtime (0.6.
|
|
262
|
+
sorbet-runtime (0.6.13051) sha256=ae5495bf229c5e3e5e3a2e17ac4853798d993437f5b5f12b763d462183852452
|
|
263
263
|
steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
|
|
264
264
|
strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
|
|
265
265
|
terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.5.1" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -33,12 +33,12 @@
|
|
|
33
33
|
<a href="https://rubygems.org/gems/kreuzberg">
|
|
34
34
|
<img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
|
|
35
35
|
</a>
|
|
36
|
+
<a href="https://kreuzberg-dev.r-universe.dev/kreuzberg">
|
|
37
|
+
<img src="https://img.shields.io/badge/R-kreuzberg-007ec6" alt="R">
|
|
38
|
+
</a>
|
|
36
39
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
|
|
37
40
|
<img src="https://img.shields.io/badge/Docker-007ec6?logo=docker&logoColor=white" alt="Docker">
|
|
38
41
|
</a>
|
|
39
|
-
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
40
|
-
<img src="https://img.shields.io/badge/C-FFI-007ec6" alt="C">
|
|
41
|
-
</a>
|
|
42
42
|
|
|
43
43
|
<!-- Project Info -->
|
|
44
44
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
|
|
@@ -47,6 +47,9 @@
|
|
|
47
47
|
<a href="https://docs.kreuzberg.dev">
|
|
48
48
|
<img src="https://img.shields.io/badge/docs-kreuzberg.dev-blue" alt="Documentation">
|
|
49
49
|
</a>
|
|
50
|
+
<a href="https://huggingface.co/Kreuzberg">
|
|
51
|
+
<img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-yellow" alt="Hugging Face">
|
|
52
|
+
</a>
|
|
50
53
|
</div>
|
|
51
54
|
|
|
52
55
|
<img width="1128" height="191" alt="Banner2" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
|
|
@@ -58,7 +61,7 @@
|
|
|
58
61
|
</div>
|
|
59
62
|
|
|
60
63
|
|
|
61
|
-
Extract text, tables, images, and metadata from
|
|
64
|
+
Extract text, tables, images, and metadata from 88+ file formats including PDF, Office documents, and images. Ruby bindings with idiomatic Ruby API and native performance.
|
|
62
65
|
|
|
63
66
|
|
|
64
67
|
## Installation
|
|
@@ -91,7 +94,7 @@ gem 'kreuzberg'
|
|
|
91
94
|
|
|
92
95
|
- **Ruby 3.2.0 or higher** required (including Ruby 4.x)
|
|
93
96
|
- Ruby 4.0+ is fully supported with no code changes required
|
|
94
|
-
- Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.
|
|
97
|
+
- Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.22.x for embeddings support
|
|
95
98
|
- Optional: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for OCR functionality
|
|
96
99
|
|
|
97
100
|
**Ruby 4.0 Compatibility:** Kreuzberg is fully compatible with Ruby 4.0 (released December 25, 2025) and all Ruby 4.x versions. All tests pass with 100% compatibility. The gem compiles without any breaking changes. Key Ruby 4.0 features like Ruby Box, ZJIT compiler, and Ractor improvements work seamlessly with Kreuzberg.
|
|
@@ -188,7 +191,7 @@ config = Kreuzberg::Config::Extraction.new(
|
|
|
188
191
|
result = Kreuzberg.extract_file_sync('contract.pdf', config: config)
|
|
189
192
|
|
|
190
193
|
puts "Extracted #{result.content.length} characters"
|
|
191
|
-
puts "Quality score: #{result.
|
|
194
|
+
puts "Quality score: #{result.quality_score}"
|
|
192
195
|
puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
|
|
193
196
|
```
|
|
194
197
|
|
|
@@ -208,19 +211,21 @@ puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
|
|
|
208
211
|
|
|
209
212
|
## Features
|
|
210
213
|
|
|
211
|
-
### Supported File Formats (
|
|
214
|
+
### Supported File Formats (88+)
|
|
212
215
|
|
|
213
|
-
|
|
216
|
+
88+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
|
|
214
217
|
|
|
215
218
|
#### Office Documents
|
|
216
219
|
|
|
217
220
|
| Category | Formats | Capabilities |
|
|
218
221
|
|----------|---------|--------------|
|
|
219
|
-
| **Word Processing** | `.docx`, `.odt` | Full text, tables, images, metadata, styles |
|
|
220
|
-
| **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.ods` | Sheet data, formulas, cell metadata, charts |
|
|
221
|
-
| **Presentations** | `.pptx`, `.
|
|
222
|
+
| **Word Processing** | `.docx`, `.docm`, `.dotx`, `.dotm`, `.dot`, `.odt` | Full text, tables, images, metadata, styles |
|
|
223
|
+
| **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.xltx`, `.xlt`, `.ods` | Sheet data, formulas, cell metadata, charts |
|
|
224
|
+
| **Presentations** | `.pptx`, `.pptm`, `.ppsx`, `.potx`, `.potm`, `.pot`, `.ppt` | Slides, speaker notes, images, metadata |
|
|
222
225
|
| **PDF** | `.pdf` | Text, tables, images, metadata, OCR support |
|
|
223
226
|
| **eBooks** | `.epub`, `.fb2` | Chapters, metadata, embedded resources |
|
|
227
|
+
| **Database** | `.dbf` | Table data extraction, field type support |
|
|
228
|
+
| **Hangul** | `.hwp`, `.hwpx` | Korean document format, text extraction |
|
|
224
229
|
|
|
225
230
|
#### Images (OCR-Enabled)
|
|
226
231
|
|
|
@@ -334,7 +339,7 @@ config = Kreuzberg::Config::Extraction.new(
|
|
|
334
339
|
result = Kreuzberg.extract_file_sync('contract.pdf', config: config)
|
|
335
340
|
|
|
336
341
|
puts "Extracted #{result.content.length} characters"
|
|
337
|
-
puts "Quality score: #{result.
|
|
342
|
+
puts "Quality score: #{result.quality_score}"
|
|
338
343
|
puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
|
|
339
344
|
```
|
|
340
345
|
|
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -392,7 +392,8 @@ module Kreuzberg
|
|
|
392
392
|
#
|
|
393
393
|
class PDF
|
|
394
394
|
attr_reader :extract_images, :passwords, :extract_metadata, :font_config, :hierarchy,
|
|
395
|
-
:extract_annotations, :top_margin_fraction, :bottom_margin_fraction
|
|
395
|
+
:extract_annotations, :top_margin_fraction, :bottom_margin_fraction,
|
|
396
|
+
:allow_single_column_tables
|
|
396
397
|
|
|
397
398
|
def initialize(
|
|
398
399
|
extract_images: false,
|
|
@@ -402,7 +403,8 @@ module Kreuzberg
|
|
|
402
403
|
hierarchy: nil,
|
|
403
404
|
extract_annotations: false,
|
|
404
405
|
top_margin_fraction: nil,
|
|
405
|
-
bottom_margin_fraction: nil
|
|
406
|
+
bottom_margin_fraction: nil,
|
|
407
|
+
allow_single_column_tables: false
|
|
406
408
|
)
|
|
407
409
|
@extract_images = extract_images ? true : false
|
|
408
410
|
@passwords = if passwords.is_a?(Array)
|
|
@@ -416,6 +418,7 @@ module Kreuzberg
|
|
|
416
418
|
@extract_annotations = extract_annotations ? true : false
|
|
417
419
|
@top_margin_fraction = top_margin_fraction&.to_f
|
|
418
420
|
@bottom_margin_fraction = bottom_margin_fraction&.to_f
|
|
421
|
+
@allow_single_column_tables = allow_single_column_tables ? true : false
|
|
419
422
|
end
|
|
420
423
|
|
|
421
424
|
def to_h
|
|
@@ -427,7 +430,8 @@ module Kreuzberg
|
|
|
427
430
|
hierarchy: @hierarchy&.to_h,
|
|
428
431
|
extract_annotations: @extract_annotations,
|
|
429
432
|
top_margin_fraction: @top_margin_fraction,
|
|
430
|
-
bottom_margin_fraction: @bottom_margin_fraction
|
|
433
|
+
bottom_margin_fraction: @bottom_margin_fraction,
|
|
434
|
+
allow_single_column_tables: @allow_single_column_tables
|
|
431
435
|
}.compact
|
|
432
436
|
end
|
|
433
437
|
|
|
@@ -803,6 +807,85 @@ module Kreuzberg
|
|
|
803
807
|
end
|
|
804
808
|
end
|
|
805
809
|
|
|
810
|
+
# Hardware acceleration configuration for ONNX Runtime
|
|
811
|
+
#
|
|
812
|
+
# Controls which execution provider (CPU, CoreML, CUDA, TensorRT) is used
|
|
813
|
+
# for inference in layout detection and embedding generation.
|
|
814
|
+
#
|
|
815
|
+
# @example Auto-select provider (CoreML on macOS, CUDA on Linux, CPU elsewhere)
|
|
816
|
+
# acceleration = Acceleration.new
|
|
817
|
+
#
|
|
818
|
+
# @example Force CPU only
|
|
819
|
+
# acceleration = Acceleration.new(provider: 'cpu')
|
|
820
|
+
#
|
|
821
|
+
# @example Use CUDA with specific device
|
|
822
|
+
# acceleration = Acceleration.new(provider: 'cuda', device_id: 0)
|
|
823
|
+
#
|
|
824
|
+
class Acceleration
|
|
825
|
+
attr_reader :provider, :device_id
|
|
826
|
+
|
|
827
|
+
def initialize(provider: 'auto', device_id: 0)
|
|
828
|
+
@provider = provider.to_s
|
|
829
|
+
@device_id = device_id.to_i
|
|
830
|
+
end
|
|
831
|
+
|
|
832
|
+
def to_h
|
|
833
|
+
{
|
|
834
|
+
provider: @provider,
|
|
835
|
+
device_id: @device_id
|
|
836
|
+
}
|
|
837
|
+
end
|
|
838
|
+
end
|
|
839
|
+
|
|
840
|
+
# Layout detection configuration
|
|
841
|
+
#
|
|
842
|
+
# @example Basic usage with fast preset
|
|
843
|
+
# layout = LayoutDetection.new(preset: "fast")
|
|
844
|
+
#
|
|
845
|
+
# @example Accurate preset with custom threshold
|
|
846
|
+
# layout = LayoutDetection.new(
|
|
847
|
+
# preset: "accurate",
|
|
848
|
+
# confidence_threshold: 0.5,
|
|
849
|
+
# apply_heuristics: true
|
|
850
|
+
# )
|
|
851
|
+
#
|
|
852
|
+
class LayoutDetection
|
|
853
|
+
attr_reader :preset, :confidence_threshold, :apply_heuristics
|
|
854
|
+
|
|
855
|
+
def initialize(preset: 'fast', confidence_threshold: nil, apply_heuristics: true)
|
|
856
|
+
@preset = preset.to_s
|
|
857
|
+
@confidence_threshold = confidence_threshold&.to_f
|
|
858
|
+
@apply_heuristics = apply_heuristics ? true : false
|
|
859
|
+
end
|
|
860
|
+
|
|
861
|
+
def to_h
|
|
862
|
+
{
|
|
863
|
+
preset: @preset,
|
|
864
|
+
confidence_threshold: @confidence_threshold,
|
|
865
|
+
apply_heuristics: @apply_heuristics
|
|
866
|
+
}.compact
|
|
867
|
+
end
|
|
868
|
+
end
|
|
869
|
+
|
|
870
|
+
# Concurrency configuration for thread pool management
|
|
871
|
+
#
|
|
872
|
+
# @example Limit max threads
|
|
873
|
+
# concurrency = Concurrency.new(max_threads: 4)
|
|
874
|
+
#
|
|
875
|
+
class Concurrency
|
|
876
|
+
attr_reader :max_threads
|
|
877
|
+
|
|
878
|
+
def initialize(max_threads: nil)
|
|
879
|
+
@max_threads = max_threads&.to_i
|
|
880
|
+
end
|
|
881
|
+
|
|
882
|
+
def to_h
|
|
883
|
+
h = {}
|
|
884
|
+
h[:max_threads] = @max_threads unless @max_threads.nil?
|
|
885
|
+
h
|
|
886
|
+
end
|
|
887
|
+
end
|
|
888
|
+
|
|
806
889
|
# Main extraction configuration
|
|
807
890
|
#
|
|
808
891
|
# @example Basic usage
|
|
@@ -847,7 +930,7 @@ module Kreuzberg
|
|
|
847
930
|
:images, :postprocessor,
|
|
848
931
|
:token_reduction, :keywords, :html_options, :pages,
|
|
849
932
|
:max_concurrent_extractions, :output_format, :result_format,
|
|
850
|
-
:security_limits
|
|
933
|
+
:security_limits, :layout, :concurrency
|
|
851
934
|
|
|
852
935
|
# Alias for backward compatibility - image_extraction is the canonical name
|
|
853
936
|
alias image_extraction images
|
|
@@ -872,7 +955,7 @@ module Kreuzberg
|
|
|
872
955
|
language_detection pdf_options image_extraction
|
|
873
956
|
postprocessor token_reduction keywords html_options pages
|
|
874
957
|
max_concurrent_extractions output_format result_format
|
|
875
|
-
security_limits
|
|
958
|
+
security_limits layout concurrency
|
|
876
959
|
].freeze
|
|
877
960
|
|
|
878
961
|
# Aliases for backward compatibility
|
|
@@ -947,7 +1030,9 @@ module Kreuzberg
|
|
|
947
1030
|
max_concurrent_extractions: nil,
|
|
948
1031
|
output_format: nil,
|
|
949
1032
|
result_format: nil,
|
|
950
|
-
security_limits: nil
|
|
1033
|
+
security_limits: nil,
|
|
1034
|
+
layout: nil,
|
|
1035
|
+
concurrency: nil)
|
|
951
1036
|
kwargs = {
|
|
952
1037
|
use_cache: use_cache, enable_quality_processing: enable_quality_processing,
|
|
953
1038
|
force_ocr: force_ocr, include_document_structure: include_document_structure,
|
|
@@ -957,7 +1042,8 @@ module Kreuzberg
|
|
|
957
1042
|
token_reduction: token_reduction, keywords: keywords, html_options: html_options,
|
|
958
1043
|
pages: pages, max_concurrent_extractions: max_concurrent_extractions,
|
|
959
1044
|
output_format: output_format, result_format: result_format,
|
|
960
|
-
security_limits: security_limits
|
|
1045
|
+
security_limits: security_limits, layout: layout,
|
|
1046
|
+
concurrency: concurrency
|
|
961
1047
|
}
|
|
962
1048
|
extracted = extract_from_hash(hash, kwargs)
|
|
963
1049
|
|
|
@@ -971,7 +1057,7 @@ module Kreuzberg
|
|
|
971
1057
|
defaults.merge(hash.slice(*defaults.keys))
|
|
972
1058
|
end
|
|
973
1059
|
|
|
974
|
-
def assign_attributes(params)
|
|
1060
|
+
def assign_attributes(params) # rubocop:disable Metrics/MethodLength
|
|
975
1061
|
@use_cache = params[:use_cache] ? true : false
|
|
976
1062
|
@enable_quality_processing = params[:enable_quality_processing] ? true : false
|
|
977
1063
|
@force_ocr = params[:force_ocr] ? true : false
|
|
@@ -986,6 +1072,8 @@ module Kreuzberg
|
|
|
986
1072
|
@keywords = normalize_config(params[:keywords], Keywords)
|
|
987
1073
|
@html_options = normalize_config(params[:html_options], HtmlOptions)
|
|
988
1074
|
@pages = normalize_config(params[:pages], PageConfig)
|
|
1075
|
+
@layout = normalize_config(params[:layout], LayoutDetection)
|
|
1076
|
+
@concurrency = normalize_config(params[:concurrency], Concurrency)
|
|
989
1077
|
@max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
|
|
990
1078
|
@output_format = validate_output_format(params[:output_format])
|
|
991
1079
|
@result_format = validate_result_format(params[:result_format])
|
|
@@ -1034,7 +1122,8 @@ module Kreuzberg
|
|
|
1034
1122
|
language_detection: @language_detection&.to_h, pdf_options: @pdf_options&.to_h,
|
|
1035
1123
|
image_extraction: @images&.to_h, postprocessor: @postprocessor&.to_h,
|
|
1036
1124
|
token_reduction: @token_reduction&.to_h, keywords: @keywords&.to_h,
|
|
1037
|
-
html_options: @html_options&.to_h, pages: @pages&.to_h
|
|
1125
|
+
html_options: @html_options&.to_h, pages: @pages&.to_h,
|
|
1126
|
+
layout: @layout&.to_h, concurrency: @concurrency&.to_h
|
|
1038
1127
|
}
|
|
1039
1128
|
end
|
|
1040
1129
|
|
|
@@ -1172,6 +1261,10 @@ module Kreuzberg
|
|
|
1172
1261
|
@html_options = normalize_config(value, HtmlOptions)
|
|
1173
1262
|
when :pages
|
|
1174
1263
|
@pages = normalize_config(value, PageConfig)
|
|
1264
|
+
when :layout
|
|
1265
|
+
@layout = normalize_config(value, LayoutDetection)
|
|
1266
|
+
when :concurrency
|
|
1267
|
+
@concurrency = normalize_config(value, Concurrency)
|
|
1175
1268
|
when :max_concurrent_extractions
|
|
1176
1269
|
@max_concurrent_extractions = value&.to_i
|
|
1177
1270
|
when :output_format
|
|
@@ -1228,6 +1321,12 @@ module Kreuzberg
|
|
|
1228
1321
|
end
|
|
1229
1322
|
|
|
1230
1323
|
def update_from_merged(merged)
|
|
1324
|
+
update_core_options(merged)
|
|
1325
|
+
update_processing_options(merged)
|
|
1326
|
+
update_output_options(merged)
|
|
1327
|
+
end
|
|
1328
|
+
|
|
1329
|
+
def update_core_options(merged)
|
|
1231
1330
|
@use_cache = merged.use_cache
|
|
1232
1331
|
@enable_quality_processing = merged.enable_quality_processing
|
|
1233
1332
|
@force_ocr = merged.force_ocr
|
|
@@ -1235,6 +1334,9 @@ module Kreuzberg
|
|
|
1235
1334
|
@ocr = merged.ocr
|
|
1236
1335
|
@chunking = merged.chunking
|
|
1237
1336
|
@language_detection = merged.language_detection
|
|
1337
|
+
end
|
|
1338
|
+
|
|
1339
|
+
def update_processing_options(merged)
|
|
1238
1340
|
@pdf_options = merged.pdf_options
|
|
1239
1341
|
@images = merged.image_extraction
|
|
1240
1342
|
@postprocessor = merged.postprocessor
|
|
@@ -1242,6 +1344,11 @@ module Kreuzberg
|
|
|
1242
1344
|
@keywords = merged.keywords
|
|
1243
1345
|
@html_options = merged.html_options
|
|
1244
1346
|
@pages = merged.pages
|
|
1347
|
+
@layout = merged.layout
|
|
1348
|
+
end
|
|
1349
|
+
|
|
1350
|
+
def update_output_options(merged)
|
|
1351
|
+
@concurrency = merged.concurrency
|
|
1245
1352
|
@max_concurrent_extractions = merged.max_concurrent_extractions
|
|
1246
1353
|
@output_format = merged.output_format
|
|
1247
1354
|
@result_format = merged.result_format
|
data/lib/kreuzberg/version.rb
CHANGED
data/lib/kreuzberg_rb.so
CHANGED
|
Binary file
|
data/sig/kreuzberg.rbs
CHANGED
|
@@ -240,7 +240,9 @@ module Kreuzberg
|
|
|
240
240
|
attr_reader det_db_unclip_ratio: Float?
|
|
241
241
|
attr_reader det_limit_side_len: Integer?
|
|
242
242
|
attr_reader rec_batch_num: Integer?
|
|
243
|
-
|
|
243
|
+
attr_reader padding: Integer?
|
|
244
|
+
attr_reader model_tier: String?
|
|
245
|
+
def initialize: (?language: String?, ?cache_dir: String?, ?use_angle_cls: bool?, ?enable_table_detection: bool?, ?det_db_thresh: Float?, ?det_db_box_thresh: Float?, ?det_db_unclip_ratio: Float?, ?det_limit_side_len: Integer?, ?rec_batch_num: Integer?, ?padding: Integer?, ?model_tier: String?) -> void
|
|
244
246
|
def to_h: () -> Hash[Symbol, untyped]
|
|
245
247
|
end
|
|
246
248
|
|
|
@@ -259,6 +261,9 @@ module Kreuzberg
|
|
|
259
261
|
attr_reader preset: String?
|
|
260
262
|
attr_reader embedding: Embedding?
|
|
261
263
|
attr_reader enabled: bool?
|
|
264
|
+
attr_reader sizing_type: String?
|
|
265
|
+
attr_reader sizing_model: String?
|
|
266
|
+
attr_reader sizing_cache_dir: String?
|
|
262
267
|
|
|
263
268
|
def initialize: (
|
|
264
269
|
?max_chars: Integer?,
|
|
@@ -267,7 +272,10 @@ module Kreuzberg
|
|
|
267
272
|
?embedding: (Embedding | Hash[Symbol, untyped])?,
|
|
268
273
|
?chunk_size: Integer?,
|
|
269
274
|
?chunk_overlap: Integer?,
|
|
270
|
-
?enabled: bool
|
|
275
|
+
?enabled: bool,
|
|
276
|
+
?sizing_type: String?,
|
|
277
|
+
?sizing_model: String?,
|
|
278
|
+
?sizing_cache_dir: String?
|
|
271
279
|
) -> void
|
|
272
280
|
def to_h: () -> Hash[Symbol, untyped]
|
|
273
281
|
end
|
|
@@ -326,8 +334,9 @@ module Kreuzberg
|
|
|
326
334
|
attr_reader extract_annotations: bool
|
|
327
335
|
attr_reader top_margin_fraction: Float?
|
|
328
336
|
attr_reader bottom_margin_fraction: Float?
|
|
337
|
+
attr_reader allow_single_column_tables: bool
|
|
329
338
|
|
|
330
|
-
def initialize: (?extract_images: bool, ?passwords: (Array[String] | String)?, ?extract_metadata: bool, ?font_config: (FontConfig | Hash[Symbol, untyped])?, ?hierarchy: (Hierarchy | Hash[Symbol, untyped])?, ?extract_annotations: bool, ?top_margin_fraction: Float?, ?bottom_margin_fraction: Float?) -> void
|
|
339
|
+
def initialize: (?extract_images: bool, ?passwords: (Array[String] | String)?, ?extract_metadata: bool, ?font_config: (FontConfig | Hash[Symbol, untyped])?, ?hierarchy: (Hierarchy | Hash[Symbol, untyped])?, ?extract_annotations: bool, ?top_margin_fraction: Float?, ?bottom_margin_fraction: Float?, ?allow_single_column_tables: bool) -> void
|
|
331
340
|
def to_h: () -> Hash[Symbol, untyped]
|
|
332
341
|
end
|
|
333
342
|
|
|
@@ -450,6 +459,22 @@ module Kreuzberg
|
|
|
450
459
|
def to_h: () -> Hash[Symbol, untyped]
|
|
451
460
|
end
|
|
452
461
|
|
|
462
|
+
class LayoutDetection
|
|
463
|
+
attr_reader preset: String
|
|
464
|
+
attr_reader confidence_threshold: Float?
|
|
465
|
+
attr_reader apply_heuristics: bool
|
|
466
|
+
|
|
467
|
+
def initialize: (?preset: String, ?confidence_threshold: Float?, ?apply_heuristics: bool) -> void
|
|
468
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
469
|
+
end
|
|
470
|
+
|
|
471
|
+
class Concurrency
|
|
472
|
+
attr_reader max_threads: Integer?
|
|
473
|
+
|
|
474
|
+
def initialize: (?max_threads: Integer?) -> void
|
|
475
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
476
|
+
end
|
|
477
|
+
|
|
453
478
|
class Extraction
|
|
454
479
|
attr_reader use_cache: bool
|
|
455
480
|
attr_reader enable_quality_processing: bool
|
|
@@ -465,6 +490,8 @@ module Kreuzberg
|
|
|
465
490
|
attr_reader keywords: Keywords?
|
|
466
491
|
attr_reader html_options: HtmlOptions?
|
|
467
492
|
attr_reader pages: PageConfig?
|
|
493
|
+
attr_reader layout: LayoutDetection?
|
|
494
|
+
attr_reader concurrency: Concurrency?
|
|
468
495
|
attr_reader max_concurrent_extractions: Integer?
|
|
469
496
|
attr_reader output_format: String?
|
|
470
497
|
attr_reader result_format: String?
|
|
@@ -489,6 +516,8 @@ module Kreuzberg
|
|
|
489
516
|
?keywords: (Keywords | Hash[Symbol, untyped])?,
|
|
490
517
|
?html_options: (HtmlOptions | Hash[Symbol, untyped])?,
|
|
491
518
|
?pages: (PageConfig | Hash[Symbol, untyped])?,
|
|
519
|
+
?layout: (LayoutDetection | Hash[Symbol, untyped])?,
|
|
520
|
+
?concurrency: (Concurrency | Hash[Symbol, untyped])?,
|
|
492
521
|
?max_concurrent_extractions: Integer?,
|
|
493
522
|
?output_format: String?,
|
|
494
523
|
?result_format: String?
|
|
@@ -749,6 +778,7 @@ module Kreuzberg
|
|
|
749
778
|
|
|
750
779
|
type config_hash = Hash[Symbol, untyped]
|
|
751
780
|
type config_input = config_hash | _ToH
|
|
781
|
+
type file_config_input = Hash[Symbol, untyped]?
|
|
752
782
|
|
|
753
783
|
interface _ToH
|
|
754
784
|
def to_h: () -> config_hash
|
|
@@ -767,6 +797,21 @@ module Kreuzberg
|
|
|
767
797
|
def to_h: () -> table_hash
|
|
768
798
|
end
|
|
769
799
|
|
|
800
|
+
# Heading level in the document hierarchy
|
|
801
|
+
class HeadingLevel
|
|
802
|
+
attr_reader level: Integer
|
|
803
|
+
attr_reader text: String
|
|
804
|
+
|
|
805
|
+
def initialize: (level: Integer, text: String) -> void
|
|
806
|
+
end
|
|
807
|
+
|
|
808
|
+
# Heading context for a chunk's section
|
|
809
|
+
class HeadingContext
|
|
810
|
+
attr_reader headings: Array[HeadingLevel]
|
|
811
|
+
|
|
812
|
+
def initialize: (headings: Array[HeadingLevel]) -> void
|
|
813
|
+
end
|
|
814
|
+
|
|
770
815
|
# Text chunk (Struct from result.rb)
|
|
771
816
|
class Chunk
|
|
772
817
|
attr_reader content: String
|
|
@@ -778,6 +823,7 @@ module Kreuzberg
|
|
|
778
823
|
attr_reader first_page: Integer?
|
|
779
824
|
attr_reader last_page: Integer?
|
|
780
825
|
attr_reader embedding: Array[Float]?
|
|
826
|
+
attr_reader heading_context: HeadingContext?
|
|
781
827
|
|
|
782
828
|
def initialize: (
|
|
783
829
|
content: String,
|
|
@@ -788,7 +834,8 @@ module Kreuzberg
|
|
|
788
834
|
total_chunks: Integer,
|
|
789
835
|
first_page: Integer?,
|
|
790
836
|
last_page: Integer?,
|
|
791
|
-
embedding: Array[Float]
|
|
837
|
+
embedding: Array[Float]?,
|
|
838
|
+
heading_context: HeadingContext?
|
|
792
839
|
) -> void
|
|
793
840
|
def to_h: () -> chunk_hash
|
|
794
841
|
end
|
|
@@ -1156,13 +1203,15 @@ module Kreuzberg
|
|
|
1156
1203
|
|
|
1157
1204
|
def self.batch_extract_files_sync: (
|
|
1158
1205
|
paths: Array[String | Pathname],
|
|
1159
|
-
?config: config_input
|
|
1206
|
+
?config: config_input?,
|
|
1207
|
+
?file_configs: Array[file_config_input]?
|
|
1160
1208
|
) -> Array[Result]
|
|
1161
1209
|
|
|
1162
1210
|
def self.batch_extract_bytes_sync: (
|
|
1163
1211
|
data_array: Array[String],
|
|
1164
1212
|
mime_types: Array[String],
|
|
1165
|
-
?config: config_input
|
|
1213
|
+
?config: config_input?,
|
|
1214
|
+
?file_configs: Array[file_config_input]?
|
|
1166
1215
|
) -> Array[Result]
|
|
1167
1216
|
|
|
1168
1217
|
def self.extract_file: (
|
|
@@ -1179,13 +1228,15 @@ module Kreuzberg
|
|
|
1179
1228
|
|
|
1180
1229
|
def self.batch_extract_files: (
|
|
1181
1230
|
paths: Array[String | Pathname],
|
|
1182
|
-
?config: config_input
|
|
1231
|
+
?config: config_input?,
|
|
1232
|
+
?file_configs: Array[file_config_input]?
|
|
1183
1233
|
) -> Array[Result]
|
|
1184
1234
|
|
|
1185
1235
|
def self.batch_extract_bytes: (
|
|
1186
1236
|
data_array: Array[String],
|
|
1187
1237
|
mime_types: Array[String],
|
|
1188
|
-
?config: config_input
|
|
1238
|
+
?config: config_input?,
|
|
1239
|
+
?file_configs: Array[file_config_input]?
|
|
1189
1240
|
) -> Array[Result]
|
|
1190
1241
|
|
|
1191
1242
|
# Cache API
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kreuzberg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 4.
|
|
4
|
+
version: 4.5.1
|
|
5
5
|
platform: aarch64-linux
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-03-
|
|
11
|
+
date: 2026-03-21 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|