kreuzberg 4.4.6-aarch64-linux → 4.5.2-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +8 -8
- data/README.md +4 -1
- data/lib/kreuzberg/config.rb +132 -10
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg_rb.so +0 -0
- data/sig/kreuzberg.rbs +39 -7
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 5d37d9bc188b071a5a193db3a4cf610caaff6e8b9a84ef04bebfbf2abd5f7f16
|
|
4
|
+
data.tar.gz: 7d568f4f6b5e8e1be95ac814b55974d348a460c1ab4949aaafe3b0de9a3d9dbe
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 9d5a0a4d9e6917ef0ac38e319410752e0032de7742a167a2a34e5c48356f1af145d1a2f56f0d7d6d28fef439c4f82710655116f03f1077c851f521e5fc4ba09c
|
|
7
|
+
data.tar.gz: '01898e03048b285ce8347ec975041bea2e633fed829abeaf8d1842fcc5137d8dacecf54f971ddc5a9a61261dc45ac30de3c408f8bab696af4597a648e0683a31'
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.4.6)
|
|
4
|
+
kreuzberg (4.5.2)
|
|
5
5
|
rb_sys (~> 0.9.119)
|
|
6
6
|
sorbet-runtime (~> 0.5)
|
|
7
7
|
|
|
@@ -49,7 +49,7 @@ GEM
|
|
|
49
49
|
i18n (1.14.8)
|
|
50
50
|
concurrent-ruby (~> 1.0)
|
|
51
51
|
io-console (0.8.2)
|
|
52
|
-
json (2.19.
|
|
52
|
+
json (2.19.2)
|
|
53
53
|
json-schema (6.2.0)
|
|
54
54
|
addressable (~> 2.8)
|
|
55
55
|
bigdecimal (>= 3.1, < 5)
|
|
@@ -60,7 +60,7 @@ GEM
|
|
|
60
60
|
rb-fsevent (~> 0.10, >= 0.10.3)
|
|
61
61
|
rb-inotify (~> 0.9, >= 0.9.10)
|
|
62
62
|
logger (1.7.0)
|
|
63
|
-
mcp (0.
|
|
63
|
+
mcp (0.9.0)
|
|
64
64
|
json-schema (>= 4.1)
|
|
65
65
|
method_source (1.1.0)
|
|
66
66
|
minitest (6.0.2)
|
|
@@ -134,7 +134,7 @@ GEM
|
|
|
134
134
|
rubocop (~> 1.81)
|
|
135
135
|
ruby-progressbar (1.13.0)
|
|
136
136
|
securerandom (0.4.1)
|
|
137
|
-
sorbet-runtime (0.6.
|
|
137
|
+
sorbet-runtime (0.6.13055)
|
|
138
138
|
steep (1.10.0)
|
|
139
139
|
activesupport (>= 5.1)
|
|
140
140
|
concurrent-ruby (>= 1.1.10)
|
|
@@ -220,14 +220,14 @@ CHECKSUMS
|
|
|
220
220
|
fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
|
|
221
221
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
222
222
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
223
|
-
json (2.19.
|
|
223
|
+
json (2.19.2) sha256=e7e1bd318b2c37c4ceee2444841c86539bc462e81f40d134cf97826cb14e83cf
|
|
224
224
|
json-schema (6.2.0) sha256=e8bff46ed845a22c1ab2bd0d7eccf831c01fe23bb3920caa4c74db4306813666
|
|
225
|
-
kreuzberg (4.4.6)
|
|
225
|
+
kreuzberg (4.5.2)
|
|
226
226
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
227
227
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
228
228
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
229
229
|
logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
|
|
230
|
-
mcp (0.
|
|
230
|
+
mcp (0.9.0) sha256=a0a3737b0ac9df0772f4ef7e2b013c260ddbcf217a5d50a66bff0baeddf03e47
|
|
231
231
|
method_source (1.1.0) sha256=181301c9c45b731b4769bc81e8860e72f9161ad7d66dd99103c9ab84f560f5c5
|
|
232
232
|
minitest (6.0.2) sha256=db6e57956f6ecc6134683b4c87467d6dd792323c7f0eea7b93f66bd284adbc3d
|
|
233
233
|
mutex_m (0.3.0) sha256=cfcb04ac16b69c4813777022fdceda24e9f798e48092a2b817eb4c0a782b0751
|
|
@@ -259,7 +259,7 @@ CHECKSUMS
|
|
|
259
259
|
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
|
260
260
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
261
261
|
securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
|
|
262
|
-
sorbet-runtime (0.6.
|
|
262
|
+
sorbet-runtime (0.6.13055) sha256=c8ae8c81310e0a28d290b11f44ddca59659b7d7f13752c0ef5d16964bbb84d18
|
|
263
263
|
steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
|
|
264
264
|
strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
|
|
265
265
|
terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.6" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.5.2" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -47,6 +47,9 @@
|
|
|
47
47
|
<a href="https://docs.kreuzberg.dev">
|
|
48
48
|
<img src="https://img.shields.io/badge/docs-kreuzberg.dev-blue" alt="Documentation">
|
|
49
49
|
</a>
|
|
50
|
+
<a href="https://huggingface.co/Kreuzberg">
|
|
51
|
+
<img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-yellow" alt="Hugging Face">
|
|
52
|
+
</a>
|
|
50
53
|
</div>
|
|
51
54
|
|
|
52
55
|
<img width="1128" height="191" alt="Banner2" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
|
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -392,7 +392,8 @@ module Kreuzberg
|
|
|
392
392
|
#
|
|
393
393
|
class PDF
|
|
394
394
|
attr_reader :extract_images, :passwords, :extract_metadata, :font_config, :hierarchy,
|
|
395
|
-
:extract_annotations, :top_margin_fraction, :bottom_margin_fraction
|
|
395
|
+
:extract_annotations, :top_margin_fraction, :bottom_margin_fraction,
|
|
396
|
+
:allow_single_column_tables
|
|
396
397
|
|
|
397
398
|
def initialize(
|
|
398
399
|
extract_images: false,
|
|
@@ -402,7 +403,8 @@ module Kreuzberg
|
|
|
402
403
|
hierarchy: nil,
|
|
403
404
|
extract_annotations: false,
|
|
404
405
|
top_margin_fraction: nil,
|
|
405
|
-
bottom_margin_fraction: nil
|
|
406
|
+
bottom_margin_fraction: nil,
|
|
407
|
+
allow_single_column_tables: false
|
|
406
408
|
)
|
|
407
409
|
@extract_images = extract_images ? true : false
|
|
408
410
|
@passwords = if passwords.is_a?(Array)
|
|
@@ -416,6 +418,7 @@ module Kreuzberg
|
|
|
416
418
|
@extract_annotations = extract_annotations ? true : false
|
|
417
419
|
@top_margin_fraction = top_margin_fraction&.to_f
|
|
418
420
|
@bottom_margin_fraction = bottom_margin_fraction&.to_f
|
|
421
|
+
@allow_single_column_tables = allow_single_column_tables ? true : false
|
|
419
422
|
end
|
|
420
423
|
|
|
421
424
|
def to_h
|
|
@@ -427,7 +430,8 @@ module Kreuzberg
|
|
|
427
430
|
hierarchy: @hierarchy&.to_h,
|
|
428
431
|
extract_annotations: @extract_annotations,
|
|
429
432
|
top_margin_fraction: @top_margin_fraction,
|
|
430
|
-
bottom_margin_fraction: @bottom_margin_fraction
|
|
433
|
+
bottom_margin_fraction: @bottom_margin_fraction,
|
|
434
|
+
allow_single_column_tables: @allow_single_column_tables
|
|
431
435
|
}.compact
|
|
432
436
|
end
|
|
433
437
|
|
|
@@ -803,6 +807,85 @@ module Kreuzberg
|
|
|
803
807
|
end
|
|
804
808
|
end
|
|
805
809
|
|
|
810
|
+
# Hardware acceleration configuration for ONNX Runtime
|
|
811
|
+
#
|
|
812
|
+
# Controls which execution provider (CPU, CoreML, CUDA, TensorRT) is used
|
|
813
|
+
# for inference in layout detection and embedding generation.
|
|
814
|
+
#
|
|
815
|
+
# @example Auto-select provider (CoreML on macOS, CUDA on Linux, CPU elsewhere)
|
|
816
|
+
# acceleration = Acceleration.new
|
|
817
|
+
#
|
|
818
|
+
# @example Force CPU only
|
|
819
|
+
# acceleration = Acceleration.new(provider: 'cpu')
|
|
820
|
+
#
|
|
821
|
+
# @example Use CUDA with specific device
|
|
822
|
+
# acceleration = Acceleration.new(provider: 'cuda', device_id: 0)
|
|
823
|
+
#
|
|
824
|
+
class Acceleration
|
|
825
|
+
attr_reader :provider, :device_id
|
|
826
|
+
|
|
827
|
+
def initialize(provider: 'auto', device_id: 0)
|
|
828
|
+
@provider = provider.to_s
|
|
829
|
+
@device_id = device_id.to_i
|
|
830
|
+
end
|
|
831
|
+
|
|
832
|
+
def to_h
|
|
833
|
+
{
|
|
834
|
+
provider: @provider,
|
|
835
|
+
device_id: @device_id
|
|
836
|
+
}
|
|
837
|
+
end
|
|
838
|
+
end
|
|
839
|
+
|
|
840
|
+
# Layout detection configuration
|
|
841
|
+
#
|
|
842
|
+
# @example Basic usage with fast preset
|
|
843
|
+
# layout = LayoutDetection.new(preset: "fast")
|
|
844
|
+
#
|
|
845
|
+
# @example Accurate preset with custom threshold
|
|
846
|
+
# layout = LayoutDetection.new(
|
|
847
|
+
# preset: "accurate",
|
|
848
|
+
# confidence_threshold: 0.5,
|
|
849
|
+
# apply_heuristics: true
|
|
850
|
+
# )
|
|
851
|
+
#
|
|
852
|
+
class LayoutDetection
|
|
853
|
+
attr_reader :preset, :confidence_threshold, :apply_heuristics
|
|
854
|
+
|
|
855
|
+
def initialize(preset: 'fast', confidence_threshold: nil, apply_heuristics: true)
|
|
856
|
+
@preset = preset.to_s
|
|
857
|
+
@confidence_threshold = confidence_threshold&.to_f
|
|
858
|
+
@apply_heuristics = apply_heuristics ? true : false
|
|
859
|
+
end
|
|
860
|
+
|
|
861
|
+
def to_h
|
|
862
|
+
{
|
|
863
|
+
preset: @preset,
|
|
864
|
+
confidence_threshold: @confidence_threshold,
|
|
865
|
+
apply_heuristics: @apply_heuristics
|
|
866
|
+
}.compact
|
|
867
|
+
end
|
|
868
|
+
end
|
|
869
|
+
|
|
870
|
+
# Concurrency configuration for thread pool management
|
|
871
|
+
#
|
|
872
|
+
# @example Limit max threads
|
|
873
|
+
# concurrency = Concurrency.new(max_threads: 4)
|
|
874
|
+
#
|
|
875
|
+
class Concurrency
|
|
876
|
+
attr_reader :max_threads
|
|
877
|
+
|
|
878
|
+
def initialize(max_threads: nil)
|
|
879
|
+
@max_threads = max_threads&.to_i
|
|
880
|
+
end
|
|
881
|
+
|
|
882
|
+
def to_h
|
|
883
|
+
h = {}
|
|
884
|
+
h[:max_threads] = @max_threads unless @max_threads.nil?
|
|
885
|
+
h
|
|
886
|
+
end
|
|
887
|
+
end
|
|
888
|
+
|
|
806
889
|
# Main extraction configuration
|
|
807
890
|
#
|
|
808
891
|
# @example Basic usage
|
|
@@ -847,7 +930,8 @@ module Kreuzberg
|
|
|
847
930
|
:images, :postprocessor,
|
|
848
931
|
:token_reduction, :keywords, :html_options, :pages,
|
|
849
932
|
:max_concurrent_extractions, :output_format, :result_format,
|
|
850
|
-
:security_limits
|
|
933
|
+
:security_limits, :layout, :concurrency,
|
|
934
|
+
:cache_namespace, :cache_ttl_secs
|
|
851
935
|
|
|
852
936
|
# Alias for backward compatibility - image_extraction is the canonical name
|
|
853
937
|
alias image_extraction images
|
|
@@ -872,7 +956,7 @@ module Kreuzberg
|
|
|
872
956
|
language_detection pdf_options image_extraction
|
|
873
957
|
postprocessor token_reduction keywords html_options pages
|
|
874
958
|
max_concurrent_extractions output_format result_format
|
|
875
|
-
security_limits
|
|
959
|
+
security_limits layout concurrency cache_namespace cache_ttl_secs
|
|
876
960
|
].freeze
|
|
877
961
|
|
|
878
962
|
# Aliases for backward compatibility
|
|
@@ -947,7 +1031,11 @@ module Kreuzberg
|
|
|
947
1031
|
max_concurrent_extractions: nil,
|
|
948
1032
|
output_format: nil,
|
|
949
1033
|
result_format: nil,
|
|
950
|
-
security_limits: nil
|
|
1034
|
+
security_limits: nil,
|
|
1035
|
+
layout: nil,
|
|
1036
|
+
concurrency: nil,
|
|
1037
|
+
cache_namespace: nil,
|
|
1038
|
+
cache_ttl_secs: nil)
|
|
951
1039
|
kwargs = {
|
|
952
1040
|
use_cache: use_cache, enable_quality_processing: enable_quality_processing,
|
|
953
1041
|
force_ocr: force_ocr, include_document_structure: include_document_structure,
|
|
@@ -957,7 +1045,10 @@ module Kreuzberg
|
|
|
957
1045
|
token_reduction: token_reduction, keywords: keywords, html_options: html_options,
|
|
958
1046
|
pages: pages, max_concurrent_extractions: max_concurrent_extractions,
|
|
959
1047
|
output_format: output_format, result_format: result_format,
|
|
960
|
-
security_limits: security_limits
|
|
1048
|
+
security_limits: security_limits, layout: layout,
|
|
1049
|
+
concurrency: concurrency,
|
|
1050
|
+
cache_namespace: cache_namespace,
|
|
1051
|
+
cache_ttl_secs: cache_ttl_secs
|
|
961
1052
|
}
|
|
962
1053
|
extracted = extract_from_hash(hash, kwargs)
|
|
963
1054
|
|
|
@@ -971,7 +1062,7 @@ module Kreuzberg
|
|
|
971
1062
|
defaults.merge(hash.slice(*defaults.keys))
|
|
972
1063
|
end
|
|
973
1064
|
|
|
974
|
-
def assign_attributes(params)
|
|
1065
|
+
def assign_attributes(params) # rubocop:disable Metrics/MethodLength
|
|
975
1066
|
@use_cache = params[:use_cache] ? true : false
|
|
976
1067
|
@enable_quality_processing = params[:enable_quality_processing] ? true : false
|
|
977
1068
|
@force_ocr = params[:force_ocr] ? true : false
|
|
@@ -986,9 +1077,13 @@ module Kreuzberg
|
|
|
986
1077
|
@keywords = normalize_config(params[:keywords], Keywords)
|
|
987
1078
|
@html_options = normalize_config(params[:html_options], HtmlOptions)
|
|
988
1079
|
@pages = normalize_config(params[:pages], PageConfig)
|
|
1080
|
+
@layout = normalize_config(params[:layout], LayoutDetection)
|
|
1081
|
+
@concurrency = normalize_config(params[:concurrency], Concurrency)
|
|
989
1082
|
@max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
|
|
990
1083
|
@output_format = validate_output_format(params[:output_format])
|
|
991
1084
|
@result_format = validate_result_format(params[:result_format])
|
|
1085
|
+
@cache_namespace = params[:cache_namespace]
|
|
1086
|
+
@cache_ttl_secs = params[:cache_ttl_secs]&.to_i
|
|
992
1087
|
@security_limits = params[:security_limits]
|
|
993
1088
|
end
|
|
994
1089
|
|
|
@@ -1024,7 +1119,9 @@ module Kreuzberg
|
|
|
1024
1119
|
include_document_structure: @include_document_structure,
|
|
1025
1120
|
max_concurrent_extractions: @max_concurrent_extractions,
|
|
1026
1121
|
output_format: @output_format,
|
|
1027
|
-
result_format: @result_format
|
|
1122
|
+
result_format: @result_format,
|
|
1123
|
+
cache_namespace: @cache_namespace,
|
|
1124
|
+
cache_ttl_secs: @cache_ttl_secs
|
|
1028
1125
|
}
|
|
1029
1126
|
end
|
|
1030
1127
|
|
|
@@ -1034,7 +1131,8 @@ module Kreuzberg
|
|
|
1034
1131
|
language_detection: @language_detection&.to_h, pdf_options: @pdf_options&.to_h,
|
|
1035
1132
|
image_extraction: @images&.to_h, postprocessor: @postprocessor&.to_h,
|
|
1036
1133
|
token_reduction: @token_reduction&.to_h, keywords: @keywords&.to_h,
|
|
1037
|
-
html_options: @html_options&.to_h, pages: @pages&.to_h
|
|
1134
|
+
html_options: @html_options&.to_h, pages: @pages&.to_h,
|
|
1135
|
+
layout: @layout&.to_h, concurrency: @concurrency&.to_h
|
|
1038
1136
|
}
|
|
1039
1137
|
end
|
|
1040
1138
|
|
|
@@ -1172,12 +1270,20 @@ module Kreuzberg
|
|
|
1172
1270
|
@html_options = normalize_config(value, HtmlOptions)
|
|
1173
1271
|
when :pages
|
|
1174
1272
|
@pages = normalize_config(value, PageConfig)
|
|
1273
|
+
when :layout
|
|
1274
|
+
@layout = normalize_config(value, LayoutDetection)
|
|
1275
|
+
when :concurrency
|
|
1276
|
+
@concurrency = normalize_config(value, Concurrency)
|
|
1175
1277
|
when :max_concurrent_extractions
|
|
1176
1278
|
@max_concurrent_extractions = value&.to_i
|
|
1177
1279
|
when :output_format
|
|
1178
1280
|
@output_format = validate_output_format(value)
|
|
1179
1281
|
when :result_format
|
|
1180
1282
|
@result_format = validate_result_format(value)
|
|
1283
|
+
when :cache_namespace
|
|
1284
|
+
@cache_namespace = value
|
|
1285
|
+
when :cache_ttl_secs
|
|
1286
|
+
@cache_ttl_secs = value&.to_i
|
|
1181
1287
|
else
|
|
1182
1288
|
raise ArgumentError, "Unknown configuration key: #{key}"
|
|
1183
1289
|
end
|
|
@@ -1228,6 +1334,12 @@ module Kreuzberg
|
|
|
1228
1334
|
end
|
|
1229
1335
|
|
|
1230
1336
|
def update_from_merged(merged)
|
|
1337
|
+
update_core_options(merged)
|
|
1338
|
+
update_processing_options(merged)
|
|
1339
|
+
update_output_options(merged)
|
|
1340
|
+
end
|
|
1341
|
+
|
|
1342
|
+
def update_core_options(merged)
|
|
1231
1343
|
@use_cache = merged.use_cache
|
|
1232
1344
|
@enable_quality_processing = merged.enable_quality_processing
|
|
1233
1345
|
@force_ocr = merged.force_ocr
|
|
@@ -1235,6 +1347,9 @@ module Kreuzberg
|
|
|
1235
1347
|
@ocr = merged.ocr
|
|
1236
1348
|
@chunking = merged.chunking
|
|
1237
1349
|
@language_detection = merged.language_detection
|
|
1350
|
+
end
|
|
1351
|
+
|
|
1352
|
+
def update_processing_options(merged)
|
|
1238
1353
|
@pdf_options = merged.pdf_options
|
|
1239
1354
|
@images = merged.image_extraction
|
|
1240
1355
|
@postprocessor = merged.postprocessor
|
|
@@ -1242,9 +1357,16 @@ module Kreuzberg
|
|
|
1242
1357
|
@keywords = merged.keywords
|
|
1243
1358
|
@html_options = merged.html_options
|
|
1244
1359
|
@pages = merged.pages
|
|
1360
|
+
@layout = merged.layout
|
|
1361
|
+
end
|
|
1362
|
+
|
|
1363
|
+
def update_output_options(merged)
|
|
1364
|
+
@concurrency = merged.concurrency
|
|
1245
1365
|
@max_concurrent_extractions = merged.max_concurrent_extractions
|
|
1246
1366
|
@output_format = merged.output_format
|
|
1247
1367
|
@result_format = merged.result_format
|
|
1368
|
+
@cache_namespace = merged.cache_namespace
|
|
1369
|
+
@cache_ttl_secs = merged.cache_ttl_secs
|
|
1248
1370
|
end
|
|
1249
1371
|
end
|
|
1250
1372
|
end
|
data/lib/kreuzberg/version.rb
CHANGED
data/lib/kreuzberg_rb.so
CHANGED
|
Binary file
|
data/sig/kreuzberg.rbs
CHANGED
|
@@ -240,7 +240,9 @@ module Kreuzberg
|
|
|
240
240
|
attr_reader det_db_unclip_ratio: Float?
|
|
241
241
|
attr_reader det_limit_side_len: Integer?
|
|
242
242
|
attr_reader rec_batch_num: Integer?
|
|
243
|
-
|
|
243
|
+
attr_reader padding: Integer?
|
|
244
|
+
attr_reader model_tier: String?
|
|
245
|
+
def initialize: (?language: String?, ?cache_dir: String?, ?use_angle_cls: bool?, ?enable_table_detection: bool?, ?det_db_thresh: Float?, ?det_db_box_thresh: Float?, ?det_db_unclip_ratio: Float?, ?det_limit_side_len: Integer?, ?rec_batch_num: Integer?, ?padding: Integer?, ?model_tier: String?) -> void
|
|
244
246
|
def to_h: () -> Hash[Symbol, untyped]
|
|
245
247
|
end
|
|
246
248
|
|
|
@@ -332,8 +334,9 @@ module Kreuzberg
|
|
|
332
334
|
attr_reader extract_annotations: bool
|
|
333
335
|
attr_reader top_margin_fraction: Float?
|
|
334
336
|
attr_reader bottom_margin_fraction: Float?
|
|
337
|
+
attr_reader allow_single_column_tables: bool
|
|
335
338
|
|
|
336
|
-
def initialize: (?extract_images: bool, ?passwords: (Array[String] | String)?, ?extract_metadata: bool, ?font_config: (FontConfig | Hash[Symbol, untyped])?, ?hierarchy: (Hierarchy | Hash[Symbol, untyped])?, ?extract_annotations: bool, ?top_margin_fraction: Float?, ?bottom_margin_fraction: Float?) -> void
|
|
339
|
+
def initialize: (?extract_images: bool, ?passwords: (Array[String] | String)?, ?extract_metadata: bool, ?font_config: (FontConfig | Hash[Symbol, untyped])?, ?hierarchy: (Hierarchy | Hash[Symbol, untyped])?, ?extract_annotations: bool, ?top_margin_fraction: Float?, ?bottom_margin_fraction: Float?, ?allow_single_column_tables: bool) -> void
|
|
337
340
|
def to_h: () -> Hash[Symbol, untyped]
|
|
338
341
|
end
|
|
339
342
|
|
|
@@ -456,9 +459,27 @@ module Kreuzberg
|
|
|
456
459
|
def to_h: () -> Hash[Symbol, untyped]
|
|
457
460
|
end
|
|
458
461
|
|
|
462
|
+
class LayoutDetection
|
|
463
|
+
attr_reader preset: String
|
|
464
|
+
attr_reader confidence_threshold: Float?
|
|
465
|
+
attr_reader apply_heuristics: bool
|
|
466
|
+
|
|
467
|
+
def initialize: (?preset: String, ?confidence_threshold: Float?, ?apply_heuristics: bool) -> void
|
|
468
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
469
|
+
end
|
|
470
|
+
|
|
471
|
+
class Concurrency
|
|
472
|
+
attr_reader max_threads: Integer?
|
|
473
|
+
|
|
474
|
+
def initialize: (?max_threads: Integer?) -> void
|
|
475
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
476
|
+
end
|
|
477
|
+
|
|
459
478
|
class Extraction
|
|
460
479
|
attr_reader use_cache: bool
|
|
461
480
|
attr_reader enable_quality_processing: bool
|
|
481
|
+
attr_reader cache_namespace: String?
|
|
482
|
+
attr_reader cache_ttl_secs: Integer?
|
|
462
483
|
attr_reader force_ocr: bool
|
|
463
484
|
attr_reader include_document_structure: bool
|
|
464
485
|
attr_reader ocr: OCR?
|
|
@@ -471,6 +492,8 @@ module Kreuzberg
|
|
|
471
492
|
attr_reader keywords: Keywords?
|
|
472
493
|
attr_reader html_options: HtmlOptions?
|
|
473
494
|
attr_reader pages: PageConfig?
|
|
495
|
+
attr_reader layout: LayoutDetection?
|
|
496
|
+
attr_reader concurrency: Concurrency?
|
|
474
497
|
attr_reader max_concurrent_extractions: Integer?
|
|
475
498
|
attr_reader output_format: String?
|
|
476
499
|
attr_reader result_format: String?
|
|
@@ -495,9 +518,13 @@ module Kreuzberg
|
|
|
495
518
|
?keywords: (Keywords | Hash[Symbol, untyped])?,
|
|
496
519
|
?html_options: (HtmlOptions | Hash[Symbol, untyped])?,
|
|
497
520
|
?pages: (PageConfig | Hash[Symbol, untyped])?,
|
|
521
|
+
?layout: (LayoutDetection | Hash[Symbol, untyped])?,
|
|
522
|
+
?concurrency: (Concurrency | Hash[Symbol, untyped])?,
|
|
498
523
|
?max_concurrent_extractions: Integer?,
|
|
499
524
|
?output_format: String?,
|
|
500
|
-
?result_format: String
|
|
525
|
+
?result_format: String?,
|
|
526
|
+
?cache_namespace: String?,
|
|
527
|
+
?cache_ttl_secs: Integer?
|
|
501
528
|
) -> void
|
|
502
529
|
def to_h: () -> Hash[Symbol, untyped]
|
|
503
530
|
def to_json: (*untyped) -> String
|
|
@@ -755,6 +782,7 @@ module Kreuzberg
|
|
|
755
782
|
|
|
756
783
|
type config_hash = Hash[Symbol, untyped]
|
|
757
784
|
type config_input = config_hash | _ToH
|
|
785
|
+
type file_config_input = Hash[Symbol, untyped]?
|
|
758
786
|
|
|
759
787
|
interface _ToH
|
|
760
788
|
def to_h: () -> config_hash
|
|
@@ -1179,13 +1207,15 @@ module Kreuzberg
|
|
|
1179
1207
|
|
|
1180
1208
|
def self.batch_extract_files_sync: (
|
|
1181
1209
|
paths: Array[String | Pathname],
|
|
1182
|
-
?config: config_input
|
|
1210
|
+
?config: config_input?,
|
|
1211
|
+
?file_configs: Array[file_config_input]?
|
|
1183
1212
|
) -> Array[Result]
|
|
1184
1213
|
|
|
1185
1214
|
def self.batch_extract_bytes_sync: (
|
|
1186
1215
|
data_array: Array[String],
|
|
1187
1216
|
mime_types: Array[String],
|
|
1188
|
-
?config: config_input
|
|
1217
|
+
?config: config_input?,
|
|
1218
|
+
?file_configs: Array[file_config_input]?
|
|
1189
1219
|
) -> Array[Result]
|
|
1190
1220
|
|
|
1191
1221
|
def self.extract_file: (
|
|
@@ -1202,13 +1232,15 @@ module Kreuzberg
|
|
|
1202
1232
|
|
|
1203
1233
|
def self.batch_extract_files: (
|
|
1204
1234
|
paths: Array[String | Pathname],
|
|
1205
|
-
?config: config_input
|
|
1235
|
+
?config: config_input?,
|
|
1236
|
+
?file_configs: Array[file_config_input]?
|
|
1206
1237
|
) -> Array[Result]
|
|
1207
1238
|
|
|
1208
1239
|
def self.batch_extract_bytes: (
|
|
1209
1240
|
data_array: Array[String],
|
|
1210
1241
|
mime_types: Array[String],
|
|
1211
|
-
?config: config_input
|
|
1242
|
+
?config: config_input?,
|
|
1243
|
+
?file_configs: Array[file_config_input]?
|
|
1212
1244
|
) -> Array[Result]
|
|
1213
1245
|
|
|
1214
1246
|
# Cache API
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kreuzberg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 4.4.6
|
|
4
|
+
version: 4.5.2
|
|
5
5
|
platform: aarch64-linux
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-03-
|
|
11
|
+
date: 2026-03-21 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|