kreuzberg 4.2.1 → 4.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +9 -9
- data/README.md +1 -1
- data/lib/kreuzberg/api_proxy.rb +3 -3
- data/lib/kreuzberg/cli_proxy.rb +2 -2
- data/lib/kreuzberg/config.rb +4 -20
- data/lib/kreuzberg/mcp_proxy.rb +3 -3
- data/lib/kreuzberg/version.rb +1 -1
- data/spec/binding/config_spec.rb +1 -1
- data/spec/unit/config/extraction_config_spec.rb +2 -2
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +3 -2
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +89 -0
- data/vendor/kreuzberg/src/api/handlers.rs +153 -32
- data/vendor/kreuzberg/src/api/mod.rs +2 -0
- data/vendor/kreuzberg/src/api/openapi.rs +141 -0
- data/vendor/kreuzberg/src/api/router.rs +24 -2
- data/vendor/kreuzberg/src/api/startup.rs +11 -5
- data/vendor/kreuzberg/src/api/types.rs +50 -4
- data/vendor/kreuzberg/src/core/config/processing.rs +8 -1
- data/vendor/kreuzberg/src/extraction/excel.rs +246 -9
- data/vendor/kreuzberg/src/mcp/format.rs +46 -57
- data/vendor/kreuzberg/src/mcp/server.rs +2 -8
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +1 -7
- data/vendor/kreuzberg/tests/api_chunk.rs +25 -0
- data/vendor/kreuzberg/tests/api_embed.rs +60 -0
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +56 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +7 -2
- data/vendor/kreuzberg-ffi/src/helpers.rs +13 -1
- data/vendor/kreuzberg-ffi/src/lib.rs +8 -5
- data/vendor/kreuzberg-ffi/src/memory.rs +35 -1
- data/vendor/kreuzberg-ffi/src/types.rs +8 -5
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 2c6fc44b151014f7e56c82bd191f55244a4294a259b24b95fc494dba6f8eaba6
|
|
4
|
+
data.tar.gz: 6e40a732814ff3e2a164e718cdb1c7a6ae838b2b2210a66b232f8675c7f79a80
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: f9c3a45f31c3ad9e3857872d8705b397b40c4317844ef421f4da4c2918e57411f5a626df4f6706d7db4916f33b8644c736e7b41508b398fd0197f1a87170fa3c
|
|
7
|
+
data.tar.gz: 8b05a75be261dbe583c4873d9d21079efff97d6c9c0340bbd8a73a43c9d15955431f4de20cd8b4a8b7956872f52e4467c253f5da03177a1e7d3b6a10d202b59d
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.2.1)
|
|
4
|
+
kreuzberg (4.2.3)
|
|
5
5
|
|
|
6
6
|
GEM
|
|
7
7
|
remote: https://rubygems.org/
|
|
@@ -61,7 +61,7 @@ GEM
|
|
|
61
61
|
parser (3.3.10.1)
|
|
62
62
|
ast (~> 2.4.1)
|
|
63
63
|
racc
|
|
64
|
-
prism (1.
|
|
64
|
+
prism (1.9.0)
|
|
65
65
|
pry (0.16.0)
|
|
66
66
|
coderay (~> 1.1)
|
|
67
67
|
method_source (~> 1.0)
|
|
@@ -98,7 +98,7 @@ GEM
|
|
|
98
98
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
99
99
|
rspec-support (~> 3.13.0)
|
|
100
100
|
rspec-support (3.13.6)
|
|
101
|
-
rubocop (1.
|
|
101
|
+
rubocop (1.84.0)
|
|
102
102
|
json (~> 2.3)
|
|
103
103
|
language_server-protocol (~> 3.17.0.2)
|
|
104
104
|
lint_roller (~> 1.1.0)
|
|
@@ -106,7 +106,7 @@ GEM
|
|
|
106
106
|
parser (>= 3.3.0.2)
|
|
107
107
|
rainbow (>= 2.2.2, < 4.0)
|
|
108
108
|
regexp_parser (>= 2.9.3, < 3.0)
|
|
109
|
-
rubocop-ast (>= 1.
|
|
109
|
+
rubocop-ast (>= 1.49.0, < 2.0)
|
|
110
110
|
ruby-progressbar (~> 1.7)
|
|
111
111
|
unicode-display_width (>= 2.4.0, < 4.0)
|
|
112
112
|
rubocop-ast (1.49.0)
|
|
@@ -121,7 +121,7 @@ GEM
|
|
|
121
121
|
rubocop (~> 1.81)
|
|
122
122
|
ruby-progressbar (1.13.0)
|
|
123
123
|
securerandom (0.4.1)
|
|
124
|
-
sorbet-runtime (0.6.
|
|
124
|
+
sorbet-runtime (0.6.12903)
|
|
125
125
|
steep (1.10.0)
|
|
126
126
|
activesupport (>= 5.1)
|
|
127
127
|
concurrent-ruby (>= 1.1.10)
|
|
@@ -207,7 +207,7 @@ CHECKSUMS
|
|
|
207
207
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
208
208
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
209
209
|
json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
|
|
210
|
-
kreuzberg (4.2.1)
|
|
210
|
+
kreuzberg (4.2.3)
|
|
211
211
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
212
212
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
213
213
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
@@ -217,7 +217,7 @@ CHECKSUMS
|
|
|
217
217
|
mutex_m (0.3.0) sha256=cfcb04ac16b69c4813777022fdceda24e9f798e48092a2b817eb4c0a782b0751
|
|
218
218
|
parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
|
|
219
219
|
parser (3.3.10.1) sha256=06f6a725d2cd91e5e7f2b7c32ba143631e1f7c8ae2fb918fc4cebec187e6a688
|
|
220
|
-
prism (1.
|
|
220
|
+
prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
|
|
221
221
|
pry (0.16.0) sha256=d76c69065698ed1f85e717bd33d7942c38a50868f6b0673c636192b3d1b6054e
|
|
222
222
|
pry-byebug (3.12.0) sha256=594e094ae8a8390a7ad4c7b36ae36e13304ed02664c67417d108dc5f7213d1b7
|
|
223
223
|
racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
|
|
@@ -236,13 +236,13 @@ CHECKSUMS
|
|
|
236
236
|
rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
|
|
237
237
|
rspec-mocks (3.13.7) sha256=0979034e64b1d7a838aaaddf12bf065ea4dc40ef3d4c39f01f93ae2c66c62b1c
|
|
238
238
|
rspec-support (3.13.6) sha256=2e8de3702427eab064c9352fe74488cc12a1bfae887ad8b91cba480ec9f8afb2
|
|
239
|
-
rubocop (1.
|
|
239
|
+
rubocop (1.84.0) sha256=88dec310153bb685a879f5a7cdb601f6287b8f0ee675d9dc63a17c7204c4190a
|
|
240
240
|
rubocop-ast (1.49.0) sha256=49c3676d3123a0923d333e20c6c2dbaaae2d2287b475273fddee0c61da9f71fd
|
|
241
241
|
rubocop-performance (1.26.1) sha256=cd19b936ff196df85829d264b522fd4f98b6c89ad271fa52744a8c11b8f71834
|
|
242
242
|
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
|
243
243
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
244
244
|
securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
|
|
245
|
-
sorbet-runtime (0.6.
|
|
245
|
+
sorbet-runtime (0.6.12903) sha256=c23968c0dcf5a5db57f32c003fe3db7fb588c168cdd57d92ea4dceaba063118a
|
|
246
246
|
steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
|
|
247
247
|
strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
|
|
248
248
|
terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.1" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.3" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
data/lib/kreuzberg/api_proxy.rb
CHANGED
|
@@ -6,9 +6,9 @@ module Kreuzberg
|
|
|
6
6
|
# @example Start the server
|
|
7
7
|
# @example With block
|
|
8
8
|
module APIProxy
|
|
9
|
-
Error
|
|
10
|
-
MissingBinaryError
|
|
11
|
-
ServerError
|
|
9
|
+
class Error < Kreuzberg::Errors::Error; end
|
|
10
|
+
class MissingBinaryError < Error; end
|
|
11
|
+
class ServerError < Error; end
|
|
12
12
|
|
|
13
13
|
# API server instance
|
|
14
14
|
class Server
|
data/lib/kreuzberg/cli_proxy.rb
CHANGED
|
@@ -5,8 +5,8 @@ require 'open3'
|
|
|
5
5
|
module Kreuzberg
|
|
6
6
|
# @example
|
|
7
7
|
module CLIProxy
|
|
8
|
-
Error
|
|
9
|
-
MissingBinaryError
|
|
8
|
+
class Error < Kreuzberg::Errors::Error; end
|
|
9
|
+
class MissingBinaryError < Error; end
|
|
10
10
|
|
|
11
11
|
# CLI execution error with stderr and exit status
|
|
12
12
|
class CLIExecutionError < Error
|
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -684,13 +684,6 @@ module Kreuzberg
|
|
|
684
684
|
# image = Config::ImageExtraction.new(extract_images: true, target_dpi: 600)
|
|
685
685
|
# config = Extraction.new(image_extraction: image)
|
|
686
686
|
#
|
|
687
|
-
# @example With preprocessing
|
|
688
|
-
# preprocessing = Config::ImagePreprocessing.new(
|
|
689
|
-
# binarization_method: "sauvola",
|
|
690
|
-
# denoise: true
|
|
691
|
-
# )
|
|
692
|
-
# config = Extraction.new(image_preprocessing: preprocessing)
|
|
693
|
-
#
|
|
694
687
|
# @example With post-processing
|
|
695
688
|
# postprocessor = Config::PostProcessor.new(
|
|
696
689
|
# enabled: true,
|
|
@@ -708,14 +701,13 @@ module Kreuzberg
|
|
|
708
701
|
# language_detection: Config::LanguageDetection.new(enabled: true),
|
|
709
702
|
# pdf_options: Config::PDF.new(extract_images: true, passwords: ["secret"]),
|
|
710
703
|
# image_extraction: Config::ImageExtraction.new(target_dpi: 600),
|
|
711
|
-
# image_preprocessing: Config::ImagePreprocessing.new(denoise: true),
|
|
712
704
|
# postprocessor: Config::PostProcessor.new(enabled: true)
|
|
713
705
|
# )
|
|
714
706
|
#
|
|
715
707
|
class Extraction
|
|
716
708
|
attr_reader :use_cache, :enable_quality_processing, :force_ocr,
|
|
717
709
|
:ocr, :chunking, :language_detection, :pdf_options,
|
|
718
|
-
:images, :image_preprocessing, :postprocessor,
|
|
710
|
+
:images, :postprocessor,
|
|
719
711
|
:token_reduction, :keywords, :html_options, :pages,
|
|
720
712
|
:max_concurrent_extractions, :output_format, :result_format
|
|
721
713
|
|
|
@@ -739,7 +731,7 @@ module Kreuzberg
|
|
|
739
731
|
# Keys that are allowed in the Extraction config
|
|
740
732
|
ALLOWED_KEYS = %i[
|
|
741
733
|
use_cache enable_quality_processing force_ocr ocr chunking
|
|
742
|
-
language_detection pdf_options image_extraction image_preprocessing
|
|
734
|
+
language_detection pdf_options image_extraction
|
|
743
735
|
postprocessor token_reduction keywords html_options pages
|
|
744
736
|
max_concurrent_extractions output_format result_format
|
|
745
737
|
].freeze
|
|
@@ -800,14 +792,13 @@ module Kreuzberg
|
|
|
800
792
|
|
|
801
793
|
def initialize(hash = nil,
|
|
802
794
|
use_cache: true,
|
|
803
|
-
enable_quality_processing: false,
|
|
795
|
+
enable_quality_processing: true,
|
|
804
796
|
force_ocr: false,
|
|
805
797
|
ocr: nil,
|
|
806
798
|
chunking: nil,
|
|
807
799
|
language_detection: nil,
|
|
808
800
|
pdf_options: nil,
|
|
809
801
|
image_extraction: nil,
|
|
810
|
-
image_preprocessing: nil,
|
|
811
802
|
postprocessor: nil,
|
|
812
803
|
token_reduction: nil,
|
|
813
804
|
keywords: nil,
|
|
@@ -820,7 +811,7 @@ module Kreuzberg
|
|
|
820
811
|
use_cache: use_cache, enable_quality_processing: enable_quality_processing,
|
|
821
812
|
force_ocr: force_ocr, ocr: ocr, chunking: chunking, language_detection: language_detection,
|
|
822
813
|
pdf_options: pdf_options, image_extraction: image_extraction,
|
|
823
|
-
|
|
814
|
+
postprocessor: postprocessor,
|
|
824
815
|
token_reduction: token_reduction, keywords: keywords, html_options: html_options,
|
|
825
816
|
pages: pages, max_concurrent_extractions: max_concurrent_extractions,
|
|
826
817
|
output_format: output_format, result_format: result_format
|
|
@@ -846,7 +837,6 @@ module Kreuzberg
|
|
|
846
837
|
@language_detection = normalize_config(params[:language_detection], LanguageDetection)
|
|
847
838
|
@pdf_options = normalize_config(params[:pdf_options], PDF)
|
|
848
839
|
@images = normalize_config(params[:image_extraction], ImageExtraction)
|
|
849
|
-
@image_preprocessing = normalize_config(params[:image_preprocessing], ImagePreprocessing)
|
|
850
840
|
@postprocessor = normalize_config(params[:postprocessor], PostProcessor)
|
|
851
841
|
@token_reduction = normalize_config(params[:token_reduction], TokenReduction)
|
|
852
842
|
@keywords = normalize_config(params[:keywords], Keywords)
|
|
@@ -878,7 +868,6 @@ module Kreuzberg
|
|
|
878
868
|
end
|
|
879
869
|
|
|
880
870
|
# rubocop:disable Metrics/CyclomaticComplexity
|
|
881
|
-
# rubocop:disable Metrics/MethodLength
|
|
882
871
|
def to_h
|
|
883
872
|
{
|
|
884
873
|
use_cache: @use_cache,
|
|
@@ -889,7 +878,6 @@ module Kreuzberg
|
|
|
889
878
|
language_detection: @language_detection&.to_h,
|
|
890
879
|
pdf_options: @pdf_options&.to_h,
|
|
891
880
|
images: @images&.to_h,
|
|
892
|
-
image_preprocessing: @image_preprocessing&.to_h,
|
|
893
881
|
postprocessor: @postprocessor&.to_h,
|
|
894
882
|
token_reduction: @token_reduction&.to_h,
|
|
895
883
|
keywords: @keywords&.to_h,
|
|
@@ -900,7 +888,6 @@ module Kreuzberg
|
|
|
900
888
|
result_format: @result_format
|
|
901
889
|
}.compact
|
|
902
890
|
end
|
|
903
|
-
# rubocop:enable Metrics/MethodLength
|
|
904
891
|
# rubocop:enable Metrics/CyclomaticComplexity
|
|
905
892
|
|
|
906
893
|
# Serialize configuration to JSON string
|
|
@@ -1025,8 +1012,6 @@ module Kreuzberg
|
|
|
1025
1012
|
@pdf_options = normalize_config(value, PDF)
|
|
1026
1013
|
when :image_extraction
|
|
1027
1014
|
@images = normalize_config(value, ImageExtraction)
|
|
1028
|
-
when :image_preprocessing
|
|
1029
|
-
@image_preprocessing = normalize_config(value, ImagePreprocessing)
|
|
1030
1015
|
when :postprocessor
|
|
1031
1016
|
@postprocessor = normalize_config(value, PostProcessor)
|
|
1032
1017
|
when :token_reduction
|
|
@@ -1101,7 +1086,6 @@ module Kreuzberg
|
|
|
1101
1086
|
@language_detection = merged.language_detection
|
|
1102
1087
|
@pdf_options = merged.pdf_options
|
|
1103
1088
|
@images = merged.image_extraction
|
|
1104
|
-
@image_preprocessing = merged.image_preprocessing
|
|
1105
1089
|
@postprocessor = merged.postprocessor
|
|
1106
1090
|
@token_reduction = merged.token_reduction
|
|
1107
1091
|
@keywords = merged.keywords
|
data/lib/kreuzberg/mcp_proxy.rb
CHANGED
|
@@ -6,9 +6,9 @@ require 'json'
|
|
|
6
6
|
module Kreuzberg
|
|
7
7
|
# @example Start MCP server
|
|
8
8
|
module MCPProxy
|
|
9
|
-
Error
|
|
10
|
-
MissingBinaryError
|
|
11
|
-
ServerError
|
|
9
|
+
class Error < Kreuzberg::Errors::Error; end
|
|
10
|
+
class MissingBinaryError < Error; end
|
|
11
|
+
class ServerError < Error; end
|
|
12
12
|
|
|
13
13
|
# MCP server instance
|
|
14
14
|
class Server
|
data/lib/kreuzberg/version.rb
CHANGED
data/spec/binding/config_spec.rb
CHANGED
|
@@ -309,7 +309,7 @@ RSpec.describe Kreuzberg::Config do
|
|
|
309
309
|
config = described_class.new
|
|
310
310
|
|
|
311
311
|
expect(config.use_cache).to be true
|
|
312
|
-
expect(config.enable_quality_processing).to be false
|
|
312
|
+
expect(config.enable_quality_processing).to be true
|
|
313
313
|
expect(config.force_ocr).to be false
|
|
314
314
|
expect(config.ocr).to be_nil
|
|
315
315
|
expect(config.chunking).to be_nil
|
|
data/spec/unit/config/extraction_config_spec.rb
CHANGED
|
@@ -6,7 +6,7 @@ RSpec.describe Kreuzberg::Config::Extraction do
|
|
|
6
6
|
config = described_class.new
|
|
7
7
|
|
|
8
8
|
expect(config.use_cache).to be true
|
|
9
|
-
expect(config.enable_quality_processing).to be false
|
|
9
|
+
expect(config.enable_quality_processing).to be true
|
|
10
10
|
expect(config.force_ocr).to be false
|
|
11
11
|
expect(config.ocr).to be_nil
|
|
12
12
|
expect(config.chunking).to be_nil
|
|
@@ -103,7 +103,7 @@ RSpec.describe Kreuzberg::Config::Extraction do
|
|
|
103
103
|
hash = config.to_h
|
|
104
104
|
|
|
105
105
|
expect(hash[:use_cache]).to be true
|
|
106
|
-
expect(hash[:enable_quality_processing]).to be false
|
|
106
|
+
expect(hash[:enable_quality_processing]).to be true
|
|
107
107
|
expect(hash[:force_ocr]).to be false
|
|
108
108
|
end
|
|
109
109
|
end
|
data/vendor/Cargo.toml
CHANGED
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.2.1"
|
|
3
|
+
version = "4.2.3"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -71,7 +71,7 @@ keywords-yake = ["dep:yake-rust", "stopwords"]
|
|
|
71
71
|
keywords-rake = ["dep:rake", "stopwords"]
|
|
72
72
|
keywords = ["keywords-yake", "keywords-rake"]
|
|
73
73
|
|
|
74
|
-
api = ["dep:axum", "dep:tower", "dep:tower-http", "tokio-runtime"]
|
|
74
|
+
api = ["dep:axum", "dep:tower", "dep:tower-http", "dep:utoipa", "tokio-runtime"]
|
|
75
75
|
mcp = ["dep:rmcp", "tokio-runtime"]
|
|
76
76
|
mcp-http = ["mcp", "api"]
|
|
77
77
|
|
|
@@ -198,6 +198,7 @@ rake = { version = "0.3.6", optional = true }
|
|
|
198
198
|
axum = { version = "0.8", features = ["macros", "json", "multipart"], optional = true }
|
|
199
199
|
tower = { version = "0.5", optional = true }
|
|
200
200
|
tower-http = { version = "0.6", features = ["cors", "trace", "limit"], optional = true }
|
|
201
|
+
utoipa = { version = "5.4", features = ["axum_extras"], optional = true }
|
|
201
202
|
rmcp = { version = "0.14.0", features = [
|
|
202
203
|
"server",
|
|
203
204
|
"macros",
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
17
17
|
|
|
18
18
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
19
19
|
|
|
20
|
-
> **🚀 Version 4.2.1 Release**
|
|
20
|
+
> **🚀 Version 4.2.3 Release**
|
|
21
21
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
22
22
|
>
|
|
23
23
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
data/vendor/kreuzberg/src/api/error.rs
CHANGED
|
@@ -2,14 +2,67 @@
|
|
|
2
2
|
|
|
3
3
|
use axum::{
|
|
4
4
|
Json,
|
|
5
|
+
body::to_bytes,
|
|
6
|
+
extract::{FromRequest, Request, rejection::JsonRejection},
|
|
5
7
|
http::StatusCode,
|
|
6
8
|
response::{IntoResponse, Response},
|
|
7
9
|
};
|
|
10
|
+
use serde::de::DeserializeOwned;
|
|
8
11
|
|
|
9
12
|
use crate::error::KreuzbergError;
|
|
10
13
|
|
|
11
14
|
use super::types::ErrorResponse;
|
|
12
15
|
|
|
16
|
+
/// Custom JSON extractor that returns JSON error responses instead of plain text.
|
|
17
|
+
///
|
|
18
|
+
/// This wraps axum's `Json` extractor but uses `ApiError` as the rejection type,
|
|
19
|
+
/// ensuring that all JSON parsing errors are returned as JSON with proper content type.
|
|
20
|
+
///
|
|
21
|
+
/// Additionally, this extractor validates that the root JSON value is an object (not an array),
|
|
22
|
+
/// which prevents serde from incorrectly deserializing JSON arrays into struct fields.
|
|
23
|
+
#[derive(Debug, Clone, Copy, Default)]
|
|
24
|
+
pub struct JsonApi<T>(pub T);
|
|
25
|
+
|
|
26
|
+
impl<T, S> FromRequest<S> for JsonApi<T>
|
|
27
|
+
where
|
|
28
|
+
T: DeserializeOwned,
|
|
29
|
+
S: Send + Sync,
|
|
30
|
+
{
|
|
31
|
+
type Rejection = ApiError;
|
|
32
|
+
|
|
33
|
+
async fn from_request(req: Request, state: &S) -> Result<Self, Self::Rejection> {
|
|
34
|
+
// First, extract the body to check if it's a valid JSON object (not array)
|
|
35
|
+
let (parts, body) = req.into_parts();
|
|
36
|
+
let bytes = to_bytes(body, usize::MAX).await.map_err(|_| {
|
|
37
|
+
ApiError::new(
|
|
38
|
+
StatusCode::BAD_REQUEST,
|
|
39
|
+
KreuzbergError::Other("Failed to read request body".to_string()),
|
|
40
|
+
)
|
|
41
|
+
})?;
|
|
42
|
+
|
|
43
|
+
// Validate that the root JSON is an object, not an array
|
|
44
|
+
if !bytes.is_empty() {
|
|
45
|
+
let trimmed = std::str::from_utf8(&bytes).unwrap_or("").trim_start();
|
|
46
|
+
if trimmed.starts_with('[') {
|
|
47
|
+
return Err(ApiError::new(
|
|
48
|
+
StatusCode::BAD_REQUEST,
|
|
49
|
+
KreuzbergError::validation(
|
|
50
|
+
"Expected JSON object, but received JSON array. \
|
|
51
|
+
Please wrap your data in an object with appropriate fields.",
|
|
52
|
+
),
|
|
53
|
+
));
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
// Reconstruct the request and use the standard Json extractor
|
|
58
|
+
let req = Request::from_parts(parts, axum::body::Body::from(bytes));
|
|
59
|
+
match Json::<T>::from_request(req, state).await {
|
|
60
|
+
Ok(Json(value)) => Ok(JsonApi(value)),
|
|
61
|
+
Err(rejection) => Err(ApiError::from(rejection)),
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
|
|
13
66
|
/// API-specific error wrapper.
|
|
14
67
|
#[derive(Debug)]
|
|
15
68
|
pub struct ApiError {
|
|
@@ -79,3 +132,39 @@ impl From<KreuzbergError> for ApiError {
|
|
|
79
132
|
}
|
|
80
133
|
}
|
|
81
134
|
}
|
|
135
|
+
|
|
136
|
+
impl From<JsonRejection> for ApiError {
|
|
137
|
+
fn from(rejection: JsonRejection) -> Self {
|
|
138
|
+
let (status, message) = match rejection {
|
|
139
|
+
JsonRejection::JsonDataError(err) => (
|
|
140
|
+
StatusCode::UNPROCESSABLE_ENTITY,
|
|
141
|
+
format!(
|
|
142
|
+
"Failed to deserialize the JSON body into the target type: {}",
|
|
143
|
+
err.body_text()
|
|
144
|
+
),
|
|
145
|
+
),
|
|
146
|
+
JsonRejection::JsonSyntaxError(err) => (
|
|
147
|
+
StatusCode::BAD_REQUEST,
|
|
148
|
+
format!("Failed to parse the request body as JSON: {}", err.body_text()),
|
|
149
|
+
),
|
|
150
|
+
JsonRejection::MissingJsonContentType(_) => (
|
|
151
|
+
StatusCode::UNSUPPORTED_MEDIA_TYPE,
|
|
152
|
+
"Expected request with `Content-Type: application/json`".to_string(),
|
|
153
|
+
),
|
|
154
|
+
JsonRejection::BytesRejection(err) => {
|
|
155
|
+
(StatusCode::BAD_REQUEST, format!("Failed to read request body: {}", err))
|
|
156
|
+
}
|
|
157
|
+
_ => (StatusCode::BAD_REQUEST, "Unknown JSON parsing error".to_string()),
|
|
158
|
+
};
|
|
159
|
+
|
|
160
|
+
Self {
|
|
161
|
+
status,
|
|
162
|
+
body: ErrorResponse {
|
|
163
|
+
error_type: "JsonParsingError".to_string(),
|
|
164
|
+
message,
|
|
165
|
+
traceback: None,
|
|
166
|
+
status_code: status.as_u16(),
|
|
167
|
+
},
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
}
|