kreuzberg 4.4.6-x86_64-linux → 4.5.1-x86_64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +8 -8
- data/README.md +4 -1
- data/lib/kreuzberg/config.rb +116 -9
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg_rb.so +0 -0
- data/sig/kreuzberg.rbs +34 -6
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: a41906b6bf000dbec2fa20fd9080fca7ec759d514664aa4f3703054935888756
|
|
4
|
+
data.tar.gz: 22fb7ae544516195203c19bc133f90ae5a80d67b60a6654a17b3d8f622e6a6aa
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 6380bb10531bcf895663281168a9c1dba1cd0c0185710de458e9c7b448063fcacae0fc613959f863f8eb43177965900a9d242bff6c745adcf6df6958393de769
|
|
7
|
+
data.tar.gz: b3d594c9009d729b57531b59726d6e7e3cdb064a63c9635d6642d96dcda8f842b4d970afb86931a8c096dcd9ecb1b4f89e8fdadefe292b2c6feb9deb72126f11
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.4.6)
|
|
4
|
+
kreuzberg (4.5.1)
|
|
5
5
|
rb_sys (~> 0.9.119)
|
|
6
6
|
sorbet-runtime (~> 0.5)
|
|
7
7
|
|
|
@@ -49,7 +49,7 @@ GEM
|
|
|
49
49
|
i18n (1.14.8)
|
|
50
50
|
concurrent-ruby (~> 1.0)
|
|
51
51
|
io-console (0.8.2)
|
|
52
|
-
json (2.19.
|
|
52
|
+
json (2.19.2)
|
|
53
53
|
json-schema (6.2.0)
|
|
54
54
|
addressable (~> 2.8)
|
|
55
55
|
bigdecimal (>= 3.1, < 5)
|
|
@@ -60,7 +60,7 @@ GEM
|
|
|
60
60
|
rb-fsevent (~> 0.10, >= 0.10.3)
|
|
61
61
|
rb-inotify (~> 0.9, >= 0.9.10)
|
|
62
62
|
logger (1.7.0)
|
|
63
|
-
mcp (0.
|
|
63
|
+
mcp (0.9.0)
|
|
64
64
|
json-schema (>= 4.1)
|
|
65
65
|
method_source (1.1.0)
|
|
66
66
|
minitest (6.0.2)
|
|
@@ -134,7 +134,7 @@ GEM
|
|
|
134
134
|
rubocop (~> 1.81)
|
|
135
135
|
ruby-progressbar (1.13.0)
|
|
136
136
|
securerandom (0.4.1)
|
|
137
|
-
sorbet-runtime (0.6.
|
|
137
|
+
sorbet-runtime (0.6.13051)
|
|
138
138
|
steep (1.10.0)
|
|
139
139
|
activesupport (>= 5.1)
|
|
140
140
|
concurrent-ruby (>= 1.1.10)
|
|
@@ -220,14 +220,14 @@ CHECKSUMS
|
|
|
220
220
|
fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
|
|
221
221
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
222
222
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
223
|
-
json (2.19.
|
|
223
|
+
json (2.19.2) sha256=e7e1bd318b2c37c4ceee2444841c86539bc462e81f40d134cf97826cb14e83cf
|
|
224
224
|
json-schema (6.2.0) sha256=e8bff46ed845a22c1ab2bd0d7eccf831c01fe23bb3920caa4c74db4306813666
|
|
225
|
-
kreuzberg (4.4.6)
|
|
225
|
+
kreuzberg (4.5.1)
|
|
226
226
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
227
227
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
228
228
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
229
229
|
logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
|
|
230
|
-
mcp (0.
|
|
230
|
+
mcp (0.9.0) sha256=a0a3737b0ac9df0772f4ef7e2b013c260ddbcf217a5d50a66bff0baeddf03e47
|
|
231
231
|
method_source (1.1.0) sha256=181301c9c45b731b4769bc81e8860e72f9161ad7d66dd99103c9ab84f560f5c5
|
|
232
232
|
minitest (6.0.2) sha256=db6e57956f6ecc6134683b4c87467d6dd792323c7f0eea7b93f66bd284adbc3d
|
|
233
233
|
mutex_m (0.3.0) sha256=cfcb04ac16b69c4813777022fdceda24e9f798e48092a2b817eb4c0a782b0751
|
|
@@ -259,7 +259,7 @@ CHECKSUMS
|
|
|
259
259
|
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
|
260
260
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
261
261
|
securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
|
|
262
|
-
sorbet-runtime (0.6.
|
|
262
|
+
sorbet-runtime (0.6.13051) sha256=ae5495bf229c5e3e5e3a2e17ac4853798d993437f5b5f12b763d462183852452
|
|
263
263
|
steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
|
|
264
264
|
strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
|
|
265
265
|
terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.6" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.5.1" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -47,6 +47,9 @@
|
|
|
47
47
|
<a href="https://docs.kreuzberg.dev">
|
|
48
48
|
<img src="https://img.shields.io/badge/docs-kreuzberg.dev-blue" alt="Documentation">
|
|
49
49
|
</a>
|
|
50
|
+
<a href="https://huggingface.co/Kreuzberg">
|
|
51
|
+
<img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-yellow" alt="Hugging Face">
|
|
52
|
+
</a>
|
|
50
53
|
</div>
|
|
51
54
|
|
|
52
55
|
<img width="1128" height="191" alt="Banner2" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
|
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -392,7 +392,8 @@ module Kreuzberg
|
|
|
392
392
|
#
|
|
393
393
|
class PDF
|
|
394
394
|
attr_reader :extract_images, :passwords, :extract_metadata, :font_config, :hierarchy,
|
|
395
|
-
:extract_annotations, :top_margin_fraction, :bottom_margin_fraction
|
|
395
|
+
:extract_annotations, :top_margin_fraction, :bottom_margin_fraction,
|
|
396
|
+
:allow_single_column_tables
|
|
396
397
|
|
|
397
398
|
def initialize(
|
|
398
399
|
extract_images: false,
|
|
@@ -402,7 +403,8 @@ module Kreuzberg
|
|
|
402
403
|
hierarchy: nil,
|
|
403
404
|
extract_annotations: false,
|
|
404
405
|
top_margin_fraction: nil,
|
|
405
|
-
bottom_margin_fraction: nil
|
|
406
|
+
bottom_margin_fraction: nil,
|
|
407
|
+
allow_single_column_tables: false
|
|
406
408
|
)
|
|
407
409
|
@extract_images = extract_images ? true : false
|
|
408
410
|
@passwords = if passwords.is_a?(Array)
|
|
@@ -416,6 +418,7 @@ module Kreuzberg
|
|
|
416
418
|
@extract_annotations = extract_annotations ? true : false
|
|
417
419
|
@top_margin_fraction = top_margin_fraction&.to_f
|
|
418
420
|
@bottom_margin_fraction = bottom_margin_fraction&.to_f
|
|
421
|
+
@allow_single_column_tables = allow_single_column_tables ? true : false
|
|
419
422
|
end
|
|
420
423
|
|
|
421
424
|
def to_h
|
|
@@ -427,7 +430,8 @@ module Kreuzberg
|
|
|
427
430
|
hierarchy: @hierarchy&.to_h,
|
|
428
431
|
extract_annotations: @extract_annotations,
|
|
429
432
|
top_margin_fraction: @top_margin_fraction,
|
|
430
|
-
bottom_margin_fraction: @bottom_margin_fraction
|
|
433
|
+
bottom_margin_fraction: @bottom_margin_fraction,
|
|
434
|
+
allow_single_column_tables: @allow_single_column_tables
|
|
431
435
|
}.compact
|
|
432
436
|
end
|
|
433
437
|
|
|
@@ -803,6 +807,85 @@ module Kreuzberg
|
|
|
803
807
|
end
|
|
804
808
|
end
|
|
805
809
|
|
|
810
|
+
# Hardware acceleration configuration for ONNX Runtime
|
|
811
|
+
#
|
|
812
|
+
# Controls which execution provider (CPU, CoreML, CUDA, TensorRT) is used
|
|
813
|
+
# for inference in layout detection and embedding generation.
|
|
814
|
+
#
|
|
815
|
+
# @example Auto-select provider (CoreML on macOS, CUDA on Linux, CPU elsewhere)
|
|
816
|
+
# acceleration = Acceleration.new
|
|
817
|
+
#
|
|
818
|
+
# @example Force CPU only
|
|
819
|
+
# acceleration = Acceleration.new(provider: 'cpu')
|
|
820
|
+
#
|
|
821
|
+
# @example Use CUDA with specific device
|
|
822
|
+
# acceleration = Acceleration.new(provider: 'cuda', device_id: 0)
|
|
823
|
+
#
|
|
824
|
+
class Acceleration
|
|
825
|
+
attr_reader :provider, :device_id
|
|
826
|
+
|
|
827
|
+
def initialize(provider: 'auto', device_id: 0)
|
|
828
|
+
@provider = provider.to_s
|
|
829
|
+
@device_id = device_id.to_i
|
|
830
|
+
end
|
|
831
|
+
|
|
832
|
+
def to_h
|
|
833
|
+
{
|
|
834
|
+
provider: @provider,
|
|
835
|
+
device_id: @device_id
|
|
836
|
+
}
|
|
837
|
+
end
|
|
838
|
+
end
|
|
839
|
+
|
|
840
|
+
# Layout detection configuration
|
|
841
|
+
#
|
|
842
|
+
# @example Basic usage with fast preset
|
|
843
|
+
# layout = LayoutDetection.new(preset: "fast")
|
|
844
|
+
#
|
|
845
|
+
# @example Accurate preset with custom threshold
|
|
846
|
+
# layout = LayoutDetection.new(
|
|
847
|
+
# preset: "accurate",
|
|
848
|
+
# confidence_threshold: 0.5,
|
|
849
|
+
# apply_heuristics: true
|
|
850
|
+
# )
|
|
851
|
+
#
|
|
852
|
+
class LayoutDetection
|
|
853
|
+
attr_reader :preset, :confidence_threshold, :apply_heuristics
|
|
854
|
+
|
|
855
|
+
def initialize(preset: 'fast', confidence_threshold: nil, apply_heuristics: true)
|
|
856
|
+
@preset = preset.to_s
|
|
857
|
+
@confidence_threshold = confidence_threshold&.to_f
|
|
858
|
+
@apply_heuristics = apply_heuristics ? true : false
|
|
859
|
+
end
|
|
860
|
+
|
|
861
|
+
def to_h
|
|
862
|
+
{
|
|
863
|
+
preset: @preset,
|
|
864
|
+
confidence_threshold: @confidence_threshold,
|
|
865
|
+
apply_heuristics: @apply_heuristics
|
|
866
|
+
}.compact
|
|
867
|
+
end
|
|
868
|
+
end
|
|
869
|
+
|
|
870
|
+
# Concurrency configuration for thread pool management
|
|
871
|
+
#
|
|
872
|
+
# @example Limit max threads
|
|
873
|
+
# concurrency = Concurrency.new(max_threads: 4)
|
|
874
|
+
#
|
|
875
|
+
class Concurrency
|
|
876
|
+
attr_reader :max_threads
|
|
877
|
+
|
|
878
|
+
def initialize(max_threads: nil)
|
|
879
|
+
@max_threads = max_threads&.to_i
|
|
880
|
+
end
|
|
881
|
+
|
|
882
|
+
def to_h
|
|
883
|
+
h = {}
|
|
884
|
+
h[:max_threads] = @max_threads unless @max_threads.nil?
|
|
885
|
+
h
|
|
886
|
+
end
|
|
887
|
+
end
|
|
888
|
+
|
|
806
889
|
# Main extraction configuration
|
|
807
890
|
#
|
|
808
891
|
# @example Basic usage
|
|
@@ -847,7 +930,7 @@ module Kreuzberg
|
|
|
847
930
|
:images, :postprocessor,
|
|
848
931
|
:token_reduction, :keywords, :html_options, :pages,
|
|
849
932
|
:max_concurrent_extractions, :output_format, :result_format,
|
|
850
|
-
:security_limits
|
|
933
|
+
:security_limits, :layout, :concurrency
|
|
851
934
|
|
|
852
935
|
# Alias for backward compatibility - image_extraction is the canonical name
|
|
853
936
|
alias image_extraction images
|
|
@@ -872,7 +955,7 @@ module Kreuzberg
|
|
|
872
955
|
language_detection pdf_options image_extraction
|
|
873
956
|
postprocessor token_reduction keywords html_options pages
|
|
874
957
|
max_concurrent_extractions output_format result_format
|
|
875
|
-
security_limits
|
|
958
|
+
security_limits layout concurrency
|
|
876
959
|
].freeze
|
|
877
960
|
|
|
878
961
|
# Aliases for backward compatibility
|
|
@@ -947,7 +1030,9 @@ module Kreuzberg
|
|
|
947
1030
|
max_concurrent_extractions: nil,
|
|
948
1031
|
output_format: nil,
|
|
949
1032
|
result_format: nil,
|
|
950
|
-
security_limits: nil
|
|
1033
|
+
security_limits: nil,
|
|
1034
|
+
layout: nil,
|
|
1035
|
+
concurrency: nil)
|
|
951
1036
|
kwargs = {
|
|
952
1037
|
use_cache: use_cache, enable_quality_processing: enable_quality_processing,
|
|
953
1038
|
force_ocr: force_ocr, include_document_structure: include_document_structure,
|
|
@@ -957,7 +1042,8 @@ module Kreuzberg
|
|
|
957
1042
|
token_reduction: token_reduction, keywords: keywords, html_options: html_options,
|
|
958
1043
|
pages: pages, max_concurrent_extractions: max_concurrent_extractions,
|
|
959
1044
|
output_format: output_format, result_format: result_format,
|
|
960
|
-
security_limits: security_limits
|
|
1045
|
+
security_limits: security_limits, layout: layout,
|
|
1046
|
+
concurrency: concurrency
|
|
961
1047
|
}
|
|
962
1048
|
extracted = extract_from_hash(hash, kwargs)
|
|
963
1049
|
|
|
@@ -971,7 +1057,7 @@ module Kreuzberg
|
|
|
971
1057
|
defaults.merge(hash.slice(*defaults.keys))
|
|
972
1058
|
end
|
|
973
1059
|
|
|
974
|
-
def assign_attributes(params)
|
|
1060
|
+
def assign_attributes(params) # rubocop:disable Metrics/MethodLength
|
|
975
1061
|
@use_cache = params[:use_cache] ? true : false
|
|
976
1062
|
@enable_quality_processing = params[:enable_quality_processing] ? true : false
|
|
977
1063
|
@force_ocr = params[:force_ocr] ? true : false
|
|
@@ -986,6 +1072,8 @@ module Kreuzberg
|
|
|
986
1072
|
@keywords = normalize_config(params[:keywords], Keywords)
|
|
987
1073
|
@html_options = normalize_config(params[:html_options], HtmlOptions)
|
|
988
1074
|
@pages = normalize_config(params[:pages], PageConfig)
|
|
1075
|
+
@layout = normalize_config(params[:layout], LayoutDetection)
|
|
1076
|
+
@concurrency = normalize_config(params[:concurrency], Concurrency)
|
|
989
1077
|
@max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
|
|
990
1078
|
@output_format = validate_output_format(params[:output_format])
|
|
991
1079
|
@result_format = validate_result_format(params[:result_format])
|
|
@@ -1034,7 +1122,8 @@ module Kreuzberg
|
|
|
1034
1122
|
language_detection: @language_detection&.to_h, pdf_options: @pdf_options&.to_h,
|
|
1035
1123
|
image_extraction: @images&.to_h, postprocessor: @postprocessor&.to_h,
|
|
1036
1124
|
token_reduction: @token_reduction&.to_h, keywords: @keywords&.to_h,
|
|
1037
|
-
html_options: @html_options&.to_h, pages: @pages&.to_h
|
|
1125
|
+
html_options: @html_options&.to_h, pages: @pages&.to_h,
|
|
1126
|
+
layout: @layout&.to_h, concurrency: @concurrency&.to_h
|
|
1038
1127
|
}
|
|
1039
1128
|
end
|
|
1040
1129
|
|
|
@@ -1172,6 +1261,10 @@ module Kreuzberg
|
|
|
1172
1261
|
@html_options = normalize_config(value, HtmlOptions)
|
|
1173
1262
|
when :pages
|
|
1174
1263
|
@pages = normalize_config(value, PageConfig)
|
|
1264
|
+
when :layout
|
|
1265
|
+
@layout = normalize_config(value, LayoutDetection)
|
|
1266
|
+
when :concurrency
|
|
1267
|
+
@concurrency = normalize_config(value, Concurrency)
|
|
1175
1268
|
when :max_concurrent_extractions
|
|
1176
1269
|
@max_concurrent_extractions = value&.to_i
|
|
1177
1270
|
when :output_format
|
|
@@ -1228,6 +1321,12 @@ module Kreuzberg
|
|
|
1228
1321
|
end
|
|
1229
1322
|
|
|
1230
1323
|
def update_from_merged(merged)
|
|
1324
|
+
update_core_options(merged)
|
|
1325
|
+
update_processing_options(merged)
|
|
1326
|
+
update_output_options(merged)
|
|
1327
|
+
end
|
|
1328
|
+
|
|
1329
|
+
def update_core_options(merged)
|
|
1231
1330
|
@use_cache = merged.use_cache
|
|
1232
1331
|
@enable_quality_processing = merged.enable_quality_processing
|
|
1233
1332
|
@force_ocr = merged.force_ocr
|
|
@@ -1235,6 +1334,9 @@ module Kreuzberg
|
|
|
1235
1334
|
@ocr = merged.ocr
|
|
1236
1335
|
@chunking = merged.chunking
|
|
1237
1336
|
@language_detection = merged.language_detection
|
|
1337
|
+
end
|
|
1338
|
+
|
|
1339
|
+
def update_processing_options(merged)
|
|
1238
1340
|
@pdf_options = merged.pdf_options
|
|
1239
1341
|
@images = merged.image_extraction
|
|
1240
1342
|
@postprocessor = merged.postprocessor
|
|
@@ -1242,6 +1344,11 @@ module Kreuzberg
|
|
|
1242
1344
|
@keywords = merged.keywords
|
|
1243
1345
|
@html_options = merged.html_options
|
|
1244
1346
|
@pages = merged.pages
|
|
1347
|
+
@layout = merged.layout
|
|
1348
|
+
end
|
|
1349
|
+
|
|
1350
|
+
def update_output_options(merged)
|
|
1351
|
+
@concurrency = merged.concurrency
|
|
1245
1352
|
@max_concurrent_extractions = merged.max_concurrent_extractions
|
|
1246
1353
|
@output_format = merged.output_format
|
|
1247
1354
|
@result_format = merged.result_format
|
data/lib/kreuzberg/version.rb
CHANGED
data/lib/kreuzberg_rb.so
CHANGED
|
Binary file
|
data/sig/kreuzberg.rbs
CHANGED
|
@@ -240,7 +240,9 @@ module Kreuzberg
|
|
|
240
240
|
attr_reader det_db_unclip_ratio: Float?
|
|
241
241
|
attr_reader det_limit_side_len: Integer?
|
|
242
242
|
attr_reader rec_batch_num: Integer?
|
|
243
|
-
|
|
243
|
+
attr_reader padding: Integer?
|
|
244
|
+
attr_reader model_tier: String?
|
|
245
|
+
def initialize: (?language: String?, ?cache_dir: String?, ?use_angle_cls: bool?, ?enable_table_detection: bool?, ?det_db_thresh: Float?, ?det_db_box_thresh: Float?, ?det_db_unclip_ratio: Float?, ?det_limit_side_len: Integer?, ?rec_batch_num: Integer?, ?padding: Integer?, ?model_tier: String?) -> void
|
|
244
246
|
def to_h: () -> Hash[Symbol, untyped]
|
|
245
247
|
end
|
|
246
248
|
|
|
@@ -332,8 +334,9 @@ module Kreuzberg
|
|
|
332
334
|
attr_reader extract_annotations: bool
|
|
333
335
|
attr_reader top_margin_fraction: Float?
|
|
334
336
|
attr_reader bottom_margin_fraction: Float?
|
|
337
|
+
attr_reader allow_single_column_tables: bool
|
|
335
338
|
|
|
336
|
-
def initialize: (?extract_images: bool, ?passwords: (Array[String] | String)?, ?extract_metadata: bool, ?font_config: (FontConfig | Hash[Symbol, untyped])?, ?hierarchy: (Hierarchy | Hash[Symbol, untyped])?, ?extract_annotations: bool, ?top_margin_fraction: Float?, ?bottom_margin_fraction: Float?) -> void
|
|
339
|
+
def initialize: (?extract_images: bool, ?passwords: (Array[String] | String)?, ?extract_metadata: bool, ?font_config: (FontConfig | Hash[Symbol, untyped])?, ?hierarchy: (Hierarchy | Hash[Symbol, untyped])?, ?extract_annotations: bool, ?top_margin_fraction: Float?, ?bottom_margin_fraction: Float?, ?allow_single_column_tables: bool) -> void
|
|
337
340
|
def to_h: () -> Hash[Symbol, untyped]
|
|
338
341
|
end
|
|
339
342
|
|
|
@@ -456,6 +459,22 @@ module Kreuzberg
|
|
|
456
459
|
def to_h: () -> Hash[Symbol, untyped]
|
|
457
460
|
end
|
|
458
461
|
|
|
462
|
+
class LayoutDetection
|
|
463
|
+
attr_reader preset: String
|
|
464
|
+
attr_reader confidence_threshold: Float?
|
|
465
|
+
attr_reader apply_heuristics: bool
|
|
466
|
+
|
|
467
|
+
def initialize: (?preset: String, ?confidence_threshold: Float?, ?apply_heuristics: bool) -> void
|
|
468
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
469
|
+
end
|
|
470
|
+
|
|
471
|
+
class Concurrency
|
|
472
|
+
attr_reader max_threads: Integer?
|
|
473
|
+
|
|
474
|
+
def initialize: (?max_threads: Integer?) -> void
|
|
475
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
476
|
+
end
|
|
477
|
+
|
|
459
478
|
class Extraction
|
|
460
479
|
attr_reader use_cache: bool
|
|
461
480
|
attr_reader enable_quality_processing: bool
|
|
@@ -471,6 +490,8 @@ module Kreuzberg
|
|
|
471
490
|
attr_reader keywords: Keywords?
|
|
472
491
|
attr_reader html_options: HtmlOptions?
|
|
473
492
|
attr_reader pages: PageConfig?
|
|
493
|
+
attr_reader layout: LayoutDetection?
|
|
494
|
+
attr_reader concurrency: Concurrency?
|
|
474
495
|
attr_reader max_concurrent_extractions: Integer?
|
|
475
496
|
attr_reader output_format: String?
|
|
476
497
|
attr_reader result_format: String?
|
|
@@ -495,6 +516,8 @@ module Kreuzberg
|
|
|
495
516
|
?keywords: (Keywords | Hash[Symbol, untyped])?,
|
|
496
517
|
?html_options: (HtmlOptions | Hash[Symbol, untyped])?,
|
|
497
518
|
?pages: (PageConfig | Hash[Symbol, untyped])?,
|
|
519
|
+
?layout: (LayoutDetection | Hash[Symbol, untyped])?,
|
|
520
|
+
?concurrency: (Concurrency | Hash[Symbol, untyped])?,
|
|
498
521
|
?max_concurrent_extractions: Integer?,
|
|
499
522
|
?output_format: String?,
|
|
500
523
|
?result_format: String?
|
|
@@ -755,6 +778,7 @@ module Kreuzberg
|
|
|
755
778
|
|
|
756
779
|
type config_hash = Hash[Symbol, untyped]
|
|
757
780
|
type config_input = config_hash | _ToH
|
|
781
|
+
type file_config_input = Hash[Symbol, untyped]?
|
|
758
782
|
|
|
759
783
|
interface _ToH
|
|
760
784
|
def to_h: () -> config_hash
|
|
@@ -1179,13 +1203,15 @@ module Kreuzberg
|
|
|
1179
1203
|
|
|
1180
1204
|
def self.batch_extract_files_sync: (
|
|
1181
1205
|
paths: Array[String | Pathname],
|
|
1182
|
-
?config: config_input
|
|
1206
|
+
?config: config_input?,
|
|
1207
|
+
?file_configs: Array[file_config_input]?
|
|
1183
1208
|
) -> Array[Result]
|
|
1184
1209
|
|
|
1185
1210
|
def self.batch_extract_bytes_sync: (
|
|
1186
1211
|
data_array: Array[String],
|
|
1187
1212
|
mime_types: Array[String],
|
|
1188
|
-
?config: config_input
|
|
1213
|
+
?config: config_input?,
|
|
1214
|
+
?file_configs: Array[file_config_input]?
|
|
1189
1215
|
) -> Array[Result]
|
|
1190
1216
|
|
|
1191
1217
|
def self.extract_file: (
|
|
@@ -1202,13 +1228,15 @@ module Kreuzberg
|
|
|
1202
1228
|
|
|
1203
1229
|
def self.batch_extract_files: (
|
|
1204
1230
|
paths: Array[String | Pathname],
|
|
1205
|
-
?config: config_input
|
|
1231
|
+
?config: config_input?,
|
|
1232
|
+
?file_configs: Array[file_config_input]?
|
|
1206
1233
|
) -> Array[Result]
|
|
1207
1234
|
|
|
1208
1235
|
def self.batch_extract_bytes: (
|
|
1209
1236
|
data_array: Array[String],
|
|
1210
1237
|
mime_types: Array[String],
|
|
1211
|
-
?config: config_input
|
|
1238
|
+
?config: config_input?,
|
|
1239
|
+
?file_configs: Array[file_config_input]?
|
|
1212
1240
|
) -> Array[Result]
|
|
1213
1241
|
|
|
1214
1242
|
# Cache API
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kreuzberg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 4.4.6
|
|
4
|
+
version: 4.5.1
|
|
5
5
|
platform: x86_64-linux
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-03-
|
|
11
|
+
date: 2026-03-21 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|