kreuzberg 4.2.0 → 4.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
- data/lib/kreuzberg/cli.rb +16 -6
- data/lib/kreuzberg/cli_proxy.rb +3 -1
- data/lib/kreuzberg/config.rb +59 -28
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/extraction_api.rb +20 -4
- data/lib/kreuzberg/result.rb +12 -2
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +1 -0
- data/sig/kreuzberg.rbs +23 -11
- data/spec/binding/batch_spec.rb +6 -5
- data/spec/binding/config_spec.rb +1 -1
- data/spec/binding/error_recovery_spec.rb +3 -3
- data/spec/binding/tables_spec.rb +11 -2
- data/spec/unit/config/extraction_config_spec.rb +2 -2
- data/spec/unit/config/output_format_spec.rb +18 -18
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +3 -2
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +60 -0
- data/vendor/kreuzberg/src/api/handlers.rs +153 -32
- data/vendor/kreuzberg/src/api/mod.rs +2 -0
- data/vendor/kreuzberg/src/api/openapi.rs +141 -0
- data/vendor/kreuzberg/src/api/router.rs +24 -2
- data/vendor/kreuzberg/src/api/startup.rs +21 -1
- data/vendor/kreuzberg/src/api/types.rs +50 -4
- data/vendor/kreuzberg/src/core/config/processing.rs +8 -1
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
- data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
- data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
- data/vendor/kreuzberg/src/core/io.rs +7 -7
- data/vendor/kreuzberg/src/core/mime.rs +4 -4
- data/vendor/kreuzberg/src/extraction/excel.rs +246 -9
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
- data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
- data/vendor/kreuzberg/tests/config_behavioral.rs +14 -12
- data/vendor/kreuzberg/tests/core_integration.rs +2 -4
- data/vendor/kreuzberg/tests/mime_detection.rs +3 -2
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +284 -1
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +56 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +7 -2
- data/vendor/kreuzberg-ffi/src/helpers.rs +13 -1
- data/vendor/kreuzberg-ffi/src/lib.rs +8 -5
- data/vendor/kreuzberg-ffi/src/memory.rs +35 -1
- data/vendor/kreuzberg-ffi/src/types.rs +8 -5
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +5 -2
|
@@ -15,11 +15,15 @@ module Kreuzberg
|
|
|
15
15
|
# @example Extract with explicit MIME type
|
|
16
16
|
# @example Extract with OCR enabled
|
|
17
17
|
def extract_file_sync(path:, mime_type: nil, config: nil)
|
|
18
|
+
# Validate that the file exists
|
|
19
|
+
path_str = path.to_s
|
|
20
|
+
raise Errors::IOError, "File not found: #{path_str}" unless File.exist?(path_str)
|
|
21
|
+
|
|
18
22
|
opts = normalize_config(config)
|
|
19
23
|
hash = if mime_type
|
|
20
|
-
native_extract_file_sync(
|
|
24
|
+
native_extract_file_sync(path_str, mime_type.to_s, **opts)
|
|
21
25
|
else
|
|
22
|
-
native_extract_file_sync(
|
|
26
|
+
native_extract_file_sync(path_str, **opts)
|
|
23
27
|
end
|
|
24
28
|
result = Result.new(hash)
|
|
25
29
|
record_cache_entry!(result, opts)
|
|
@@ -53,6 +57,8 @@ module Kreuzberg
|
|
|
53
57
|
# response = HTTParty.get("https://example.com/document.docx")
|
|
54
58
|
# result = Kreuzberg.extract_bytes_sync(response.body, "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
|
|
55
59
|
def extract_bytes_sync(data:, mime_type:, config: nil)
|
|
60
|
+
raise TypeError, "mime_type must be a String, got #{mime_type.inspect}" if mime_type.nil?
|
|
61
|
+
|
|
56
62
|
opts = normalize_config(config)
|
|
57
63
|
hash = native_extract_bytes_sync(data.to_s, mime_type.to_s, **opts)
|
|
58
64
|
result = Result.new(hash)
|
|
@@ -92,6 +98,12 @@ module Kreuzberg
|
|
|
92
98
|
# config = Kreuzberg::Config::Extraction.new(force_ocr: true)
|
|
93
99
|
# results = Kreuzberg.batch_extract_files_sync(paths, config: config)
|
|
94
100
|
def batch_extract_files_sync(paths:, config: nil)
|
|
101
|
+
# Validate that all files exist
|
|
102
|
+
paths.each do |path|
|
|
103
|
+
path_str = path.to_s
|
|
104
|
+
raise Errors::IOError, "File not found: #{path_str}" unless File.exist?(path_str)
|
|
105
|
+
end
|
|
106
|
+
|
|
95
107
|
opts = normalize_config(config)
|
|
96
108
|
hashes = native_batch_extract_files_sync(paths.map(&:to_s), **opts)
|
|
97
109
|
results = hashes.map { |hash| Result.new(hash) }
|
|
@@ -130,11 +142,15 @@ module Kreuzberg
|
|
|
130
142
|
# )
|
|
131
143
|
# result = Kreuzberg.extract_file("document.pdf", config: config)
|
|
132
144
|
def extract_file(path:, mime_type: nil, config: nil)
|
|
145
|
+
# Validate that the file exists
|
|
146
|
+
path_str = path.to_s
|
|
147
|
+
raise Errors::IOError, "File not found: #{path_str}" unless File.exist?(path_str)
|
|
148
|
+
|
|
133
149
|
opts = normalize_config(config)
|
|
134
150
|
hash = if mime_type
|
|
135
|
-
native_extract_file(
|
|
151
|
+
native_extract_file(path_str, mime_type.to_s, **opts)
|
|
136
152
|
else
|
|
137
|
-
native_extract_file(
|
|
153
|
+
native_extract_file(path_str, **opts)
|
|
138
154
|
end
|
|
139
155
|
result = Result.new(hash)
|
|
140
156
|
record_cache_entry!(result, opts)
|
data/lib/kreuzberg/result.rb
CHANGED
|
@@ -11,7 +11,7 @@ module Kreuzberg
|
|
|
11
11
|
# rubocop:disable Metrics/ClassLength
|
|
12
12
|
class Result
|
|
13
13
|
attr_reader :content, :mime_type, :metadata, :metadata_json, :tables,
|
|
14
|
-
:detected_languages, :chunks, :images, :pages, :elements
|
|
14
|
+
:detected_languages, :chunks, :images, :pages, :elements, :djot_content
|
|
15
15
|
|
|
16
16
|
# @!attribute [r] cells
|
|
17
17
|
# @return [Array<Array<String>>] Table cells (2D array)
|
|
@@ -180,6 +180,7 @@ module Kreuzberg
|
|
|
180
180
|
#
|
|
181
181
|
# @param hash [Hash] Hash returned from native extension
|
|
182
182
|
#
|
|
183
|
+
# rubocop:disable Metrics/AbcSize
|
|
183
184
|
def initialize(hash)
|
|
184
185
|
@content = get_value(hash, 'content', '')
|
|
185
186
|
@mime_type = get_value(hash, 'mime_type', '')
|
|
@@ -191,7 +192,9 @@ module Kreuzberg
|
|
|
191
192
|
@images = parse_images(get_value(hash, 'images'))
|
|
192
193
|
@pages = parse_pages(get_value(hash, 'pages'))
|
|
193
194
|
@elements = parse_elements(get_value(hash, 'elements'))
|
|
195
|
+
@djot_content = parse_djot_content(get_value(hash, 'djot_content'))
|
|
194
196
|
end
|
|
197
|
+
# rubocop:enable Metrics/AbcSize
|
|
195
198
|
|
|
196
199
|
# Convert to hash
|
|
197
200
|
#
|
|
@@ -207,7 +210,8 @@ module Kreuzberg
|
|
|
207
210
|
chunks: serialize_chunks,
|
|
208
211
|
images: serialize_images,
|
|
209
212
|
pages: serialize_pages,
|
|
210
|
-
elements: serialize_elements
|
|
213
|
+
elements: serialize_elements,
|
|
214
|
+
djot_content: @djot_content&.to_h
|
|
211
215
|
}
|
|
212
216
|
end
|
|
213
217
|
|
|
@@ -434,6 +438,12 @@ module Kreuzberg
|
|
|
434
438
|
y1: coordinates_data['y1'].to_f
|
|
435
439
|
)
|
|
436
440
|
end
|
|
441
|
+
|
|
442
|
+
def parse_djot_content(djot_data)
|
|
443
|
+
return nil if djot_data.nil?
|
|
444
|
+
|
|
445
|
+
DjotContent.new(djot_data)
|
|
446
|
+
end
|
|
437
447
|
end
|
|
438
448
|
# rubocop:enable Metrics/ClassLength
|
|
439
449
|
end
|
data/lib/kreuzberg/version.rb
CHANGED
data/lib/kreuzberg.rb
CHANGED
|
@@ -87,6 +87,7 @@ end
|
|
|
87
87
|
|
|
88
88
|
require_relative 'kreuzberg/cache_api'
|
|
89
89
|
require_relative 'kreuzberg/extraction_api'
|
|
90
|
+
require_relative 'kreuzberg/djot_content'
|
|
90
91
|
|
|
91
92
|
Kreuzberg.singleton_class.prepend(Kreuzberg::CacheAPI)
|
|
92
93
|
Kreuzberg.singleton_class.prepend(Kreuzberg::ExtractionAPI)
|
data/sig/kreuzberg.rbs
CHANGED
|
@@ -417,14 +417,23 @@ module Kreuzberg
|
|
|
417
417
|
attr_reader plain_text: String
|
|
418
418
|
attr_reader blocks: Array[DjotContent::FormattedBlock]
|
|
419
419
|
attr_reader metadata: Hash[untyped, untyped]
|
|
420
|
-
attr_reader
|
|
420
|
+
attr_reader metadata_json: String
|
|
421
|
+
attr_reader tables: Array[untyped]
|
|
421
422
|
attr_reader images: Array[DjotContent::DjotImage]
|
|
422
423
|
attr_reader links: Array[DjotContent::DjotLink]
|
|
423
424
|
attr_reader footnotes: Array[DjotContent::Footnote]
|
|
424
425
|
attr_reader attributes: Hash[String, untyped]?
|
|
425
426
|
|
|
426
|
-
def initialize: (
|
|
427
|
-
def to_h: () ->
|
|
427
|
+
def initialize: (untyped hash) -> void
|
|
428
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
429
|
+
|
|
430
|
+
private
|
|
431
|
+
|
|
432
|
+
def parse_metadata: (String metadata_json) -> Hash[untyped, untyped]
|
|
433
|
+
def parse_blocks: (Array[untyped] blocks_data) -> Array[FormattedBlock]
|
|
434
|
+
def parse_images: (Array[untyped] images_data) -> Array[DjotImage]
|
|
435
|
+
def parse_links: (Array[untyped] links_data) -> Array[DjotLink]
|
|
436
|
+
def parse_footnotes: (Array[untyped] footnotes_data) -> Array[Footnote]
|
|
428
437
|
|
|
429
438
|
class FormattedBlock
|
|
430
439
|
attr_reader block_type: String
|
|
@@ -433,28 +442,31 @@ module Kreuzberg
|
|
|
433
442
|
attr_reader children: Array[FormattedBlock]?
|
|
434
443
|
attr_reader attributes: Hash[String, untyped]?
|
|
435
444
|
|
|
436
|
-
def initialize: (
|
|
437
|
-
def to_h: () ->
|
|
445
|
+
def initialize: (?untyped hash_or_type, ?children: untyped, ?attributes: untyped, ?content: untyped, ?level: untyped, ?block_type: untyped) -> void
|
|
446
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
438
447
|
end
|
|
439
448
|
|
|
440
449
|
class DjotImage
|
|
441
450
|
attr_reader url: String
|
|
442
451
|
attr_reader alt: String?
|
|
443
452
|
attr_reader title: String?
|
|
444
|
-
attr_reader
|
|
453
|
+
attr_reader width: Integer?
|
|
454
|
+
attr_reader height: Integer?
|
|
445
455
|
|
|
446
|
-
def initialize: (
|
|
447
|
-
def
|
|
456
|
+
def initialize: (?untyped hash_or_url, ?alt: untyped, ?title: untyped, ?width: untyped, ?height: untyped, ?url: untyped, ?src: untyped) -> void
|
|
457
|
+
def src: () -> String
|
|
458
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
448
459
|
end
|
|
449
460
|
|
|
450
461
|
class DjotLink
|
|
451
462
|
attr_reader url: String
|
|
452
|
-
attr_reader text: String
|
|
463
|
+
attr_reader text: String?
|
|
453
464
|
attr_reader title: String?
|
|
454
465
|
attr_reader link_type: String?
|
|
455
466
|
|
|
456
|
-
def initialize: (
|
|
457
|
-
def
|
|
467
|
+
def initialize: (?untyped hash_or_url, ?text: untyped, ?title: untyped, ?url: untyped, ?href: untyped, ?link_type: untyped) -> void
|
|
468
|
+
def href: () -> String
|
|
469
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
458
470
|
end
|
|
459
471
|
|
|
460
472
|
class Footnote
|
data/spec/binding/batch_spec.rb
CHANGED
|
@@ -295,7 +295,7 @@ RSpec.describe Kreuzberg do
|
|
|
295
295
|
end
|
|
296
296
|
|
|
297
297
|
describe 'batch error handling' do
|
|
298
|
-
it '
|
|
298
|
+
it 'raises IOError for missing files in batch' do
|
|
299
299
|
paths = [
|
|
300
300
|
'/nonexistent/file1.txt',
|
|
301
301
|
'/nonexistent/file2.txt'
|
|
@@ -303,10 +303,10 @@ RSpec.describe Kreuzberg do
|
|
|
303
303
|
|
|
304
304
|
expect do
|
|
305
305
|
described_class.batch_extract_files_sync(paths: paths)
|
|
306
|
-
end.
|
|
306
|
+
end.to raise_error(Kreuzberg::Errors::IOError, /not found/)
|
|
307
307
|
end
|
|
308
308
|
|
|
309
|
-
it '
|
|
309
|
+
it 'raises IOError when batch contains invalid paths' do
|
|
310
310
|
paths = []
|
|
311
311
|
temp_dir = Dir.mktmpdir
|
|
312
312
|
|
|
@@ -316,8 +316,9 @@ RSpec.describe Kreuzberg do
|
|
|
316
316
|
|
|
317
317
|
paths << '/nonexistent/invalid.txt'
|
|
318
318
|
|
|
319
|
-
|
|
320
|
-
|
|
319
|
+
expect do
|
|
320
|
+
described_class.batch_extract_files_sync(paths: paths)
|
|
321
|
+
end.to raise_error(Kreuzberg::Errors::IOError, /not found/)
|
|
321
322
|
ensure
|
|
322
323
|
FileUtils.remove_entry(temp_dir)
|
|
323
324
|
end
|
data/spec/binding/config_spec.rb
CHANGED
|
@@ -309,7 +309,7 @@ RSpec.describe Kreuzberg::Config do
|
|
|
309
309
|
config = described_class.new
|
|
310
310
|
|
|
311
311
|
expect(config.use_cache).to be true
|
|
312
|
-
expect(config.enable_quality_processing).to be
|
|
312
|
+
expect(config.enable_quality_processing).to be true
|
|
313
313
|
expect(config.force_ocr).to be false
|
|
314
314
|
expect(config.ocr).to be_nil
|
|
315
315
|
expect(config.chunking).to be_nil
|
|
@@ -57,7 +57,7 @@ RSpec.describe 'Error Recovery' do
|
|
|
57
57
|
nonexistent_path = '/nonexistent/file/that/does/not/exist.pdf'
|
|
58
58
|
|
|
59
59
|
expect { Kreuzberg.extract_file_sync(path: nonexistent_path, config: config) }
|
|
60
|
-
.to raise_error(Kreuzberg::Errors::
|
|
60
|
+
.to raise_error(Kreuzberg::Errors::IOError, /not found|does not exist|no such file/)
|
|
61
61
|
end
|
|
62
62
|
|
|
63
63
|
it 'provides descriptive error messages for invalid MIME types' do
|
|
@@ -293,7 +293,7 @@ RSpec.describe 'Error Recovery' do
|
|
|
293
293
|
|
|
294
294
|
expect(validation_error).to be_a(ArgumentError)
|
|
295
295
|
|
|
296
|
-
# Runtime error (file not found)
|
|
296
|
+
# Runtime error (file not found) - IOError since the file doesn't exist
|
|
297
297
|
runtime_error = nil
|
|
298
298
|
begin
|
|
299
299
|
Kreuzberg.extract_file_sync(path: '/nonexistent/file.pdf')
|
|
@@ -301,7 +301,7 @@ RSpec.describe 'Error Recovery' do
|
|
|
301
301
|
runtime_error = e
|
|
302
302
|
end
|
|
303
303
|
|
|
304
|
-
expect(runtime_error).to be_a(Kreuzberg::Errors::
|
|
304
|
+
expect(runtime_error).to be_a(Kreuzberg::Errors::IOError)
|
|
305
305
|
end
|
|
306
306
|
|
|
307
307
|
it 'provides error recovery suggestions in messages' do
|
data/spec/binding/tables_spec.rb
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require 'spec_helper'
|
|
4
|
+
require 'tempfile'
|
|
5
|
+
require 'fileutils'
|
|
4
6
|
|
|
5
7
|
RSpec.describe 'Table Extraction Quality' do
|
|
6
8
|
describe 'table structure extraction' do
|
|
@@ -523,12 +525,19 @@ RSpec.describe 'Table Extraction Quality' do
|
|
|
523
525
|
it 'handles documents with no tables gracefully' do
|
|
524
526
|
config = Kreuzberg::Config::Extraction.new
|
|
525
527
|
|
|
528
|
+
# Create a temporary text file for this test
|
|
529
|
+
file = Tempfile.new(['no_tables_test', '.txt'])
|
|
530
|
+
file.write('This is a text document without any tables.')
|
|
531
|
+
file.close
|
|
532
|
+
|
|
526
533
|
begin
|
|
527
|
-
result = Kreuzberg.extract_file(path:
|
|
534
|
+
result = Kreuzberg.extract_file(path: file.path, config: config)
|
|
528
535
|
expect(result).not_to be_nil
|
|
529
536
|
expect(result.tables).to be_a(Array) if result.tables
|
|
530
|
-
rescue Kreuzberg::Errors::
|
|
537
|
+
rescue Kreuzberg::Errors::IOError
|
|
531
538
|
skip 'Text file not available for testing'
|
|
539
|
+
ensure
|
|
540
|
+
FileUtils.rm_f(file.path)
|
|
532
541
|
end
|
|
533
542
|
end
|
|
534
543
|
|
|
@@ -6,7 +6,7 @@ RSpec.describe Kreuzberg::Config::Extraction do
|
|
|
6
6
|
config = described_class.new
|
|
7
7
|
|
|
8
8
|
expect(config.use_cache).to be true
|
|
9
|
-
expect(config.enable_quality_processing).to be
|
|
9
|
+
expect(config.enable_quality_processing).to be true
|
|
10
10
|
expect(config.force_ocr).to be false
|
|
11
11
|
expect(config.ocr).to be_nil
|
|
12
12
|
expect(config.chunking).to be_nil
|
|
@@ -103,7 +103,7 @@ RSpec.describe Kreuzberg::Config::Extraction do
|
|
|
103
103
|
hash = config.to_h
|
|
104
104
|
|
|
105
105
|
expect(hash[:use_cache]).to be true
|
|
106
|
-
expect(hash[:enable_quality_processing]).to be
|
|
106
|
+
expect(hash[:enable_quality_processing]).to be true
|
|
107
107
|
expect(hash[:force_ocr]).to be false
|
|
108
108
|
end
|
|
109
109
|
end
|
|
@@ -282,34 +282,34 @@ RSpec.describe 'Output Format and Result Format Configuration' do
|
|
|
282
282
|
end
|
|
283
283
|
|
|
284
284
|
describe 'format validation and edge cases' do
|
|
285
|
-
it '
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
285
|
+
it 'raises error for empty string output_format' do
|
|
286
|
+
expect do
|
|
287
|
+
described_class.new(output_format: '')
|
|
288
|
+
end.to raise_error(ArgumentError, /Invalid output_format/)
|
|
289
289
|
end
|
|
290
290
|
|
|
291
|
-
it '
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
291
|
+
it 'raises error for empty string result_format' do
|
|
292
|
+
expect do
|
|
293
|
+
described_class.new(result_format: '')
|
|
294
|
+
end.to raise_error(ArgumentError, /Invalid result_format/)
|
|
295
295
|
end
|
|
296
296
|
|
|
297
|
-
it '
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
297
|
+
it 'raises error for whitespace in output_format' do
|
|
298
|
+
expect do
|
|
299
|
+
described_class.new(output_format: ' plain ')
|
|
300
|
+
end.to raise_error(ArgumentError, /Invalid output_format/)
|
|
301
301
|
end
|
|
302
302
|
|
|
303
|
-
it '
|
|
303
|
+
it 'normalizes case in output_format' do
|
|
304
304
|
config = described_class.new(output_format: 'MarkDown')
|
|
305
305
|
|
|
306
|
-
expect(config.output_format).to eq '
|
|
306
|
+
expect(config.output_format).to eq 'markdown'
|
|
307
307
|
end
|
|
308
308
|
|
|
309
|
-
it '
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
309
|
+
it 'raises error for custom string in result_format' do
|
|
310
|
+
expect do
|
|
311
|
+
described_class.new(result_format: 'custom_format')
|
|
312
|
+
end.to raise_error(ArgumentError, /Invalid result_format/)
|
|
313
313
|
end
|
|
314
314
|
end
|
|
315
315
|
|
data/vendor/Cargo.toml
CHANGED
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.2.
|
|
3
|
+
version = "4.2.2"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -71,7 +71,7 @@ keywords-yake = ["dep:yake-rust", "stopwords"]
|
|
|
71
71
|
keywords-rake = ["dep:rake", "stopwords"]
|
|
72
72
|
keywords = ["keywords-yake", "keywords-rake"]
|
|
73
73
|
|
|
74
|
-
api = ["dep:axum", "dep:tower", "dep:tower-http", "tokio-runtime"]
|
|
74
|
+
api = ["dep:axum", "dep:tower", "dep:tower-http", "dep:utoipa", "tokio-runtime"]
|
|
75
75
|
mcp = ["dep:rmcp", "tokio-runtime"]
|
|
76
76
|
mcp-http = ["mcp", "api"]
|
|
77
77
|
|
|
@@ -198,6 +198,7 @@ rake = { version = "0.3.6", optional = true }
|
|
|
198
198
|
axum = { version = "0.8", features = ["macros", "json", "multipart"], optional = true }
|
|
199
199
|
tower = { version = "0.5", optional = true }
|
|
200
200
|
tower-http = { version = "0.6", features = ["cors", "trace", "limit"], optional = true }
|
|
201
|
+
utoipa = { version = "5.3", features = ["axum_extras"], optional = true }
|
|
201
202
|
rmcp = { version = "0.14.0", features = [
|
|
202
203
|
"server",
|
|
203
204
|
"macros",
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
17
17
|
|
|
18
18
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
19
19
|
|
|
20
|
-
> **🚀 Version 4.2.
|
|
20
|
+
> **🚀 Version 4.2.2 Release**
|
|
21
21
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
22
22
|
>
|
|
23
23
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -2,14 +2,38 @@
|
|
|
2
2
|
|
|
3
3
|
use axum::{
|
|
4
4
|
Json,
|
|
5
|
+
extract::{FromRequest, Request, rejection::JsonRejection},
|
|
5
6
|
http::StatusCode,
|
|
6
7
|
response::{IntoResponse, Response},
|
|
7
8
|
};
|
|
9
|
+
use serde::de::DeserializeOwned;
|
|
8
10
|
|
|
9
11
|
use crate::error::KreuzbergError;
|
|
10
12
|
|
|
11
13
|
use super::types::ErrorResponse;
|
|
12
14
|
|
|
15
|
+
/// Custom JSON extractor that returns JSON error responses instead of plain text.
|
|
16
|
+
///
|
|
17
|
+
/// This wraps axum's `Json` extractor but uses `ApiError` as the rejection type,
|
|
18
|
+
/// ensuring that all JSON parsing errors are returned as JSON with proper content type.
|
|
19
|
+
#[derive(Debug, Clone, Copy, Default)]
|
|
20
|
+
pub struct JsonApi<T>(pub T);
|
|
21
|
+
|
|
22
|
+
impl<T, S> FromRequest<S> for JsonApi<T>
|
|
23
|
+
where
|
|
24
|
+
T: DeserializeOwned,
|
|
25
|
+
S: Send + Sync,
|
|
26
|
+
{
|
|
27
|
+
type Rejection = ApiError;
|
|
28
|
+
|
|
29
|
+
async fn from_request(req: Request, state: &S) -> Result<Self, Self::Rejection> {
|
|
30
|
+
match Json::<T>::from_request(req, state).await {
|
|
31
|
+
Ok(Json(value)) => Ok(JsonApi(value)),
|
|
32
|
+
Err(rejection) => Err(ApiError::from(rejection)),
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
|
|
13
37
|
/// API-specific error wrapper.
|
|
14
38
|
#[derive(Debug)]
|
|
15
39
|
pub struct ApiError {
|
|
@@ -79,3 +103,39 @@ impl From<KreuzbergError> for ApiError {
|
|
|
79
103
|
}
|
|
80
104
|
}
|
|
81
105
|
}
|
|
106
|
+
|
|
107
|
+
impl From<JsonRejection> for ApiError {
|
|
108
|
+
fn from(rejection: JsonRejection) -> Self {
|
|
109
|
+
let (status, message) = match rejection {
|
|
110
|
+
JsonRejection::JsonDataError(err) => (
|
|
111
|
+
StatusCode::UNPROCESSABLE_ENTITY,
|
|
112
|
+
format!(
|
|
113
|
+
"Failed to deserialize the JSON body into the target type: {}",
|
|
114
|
+
err.body_text()
|
|
115
|
+
),
|
|
116
|
+
),
|
|
117
|
+
JsonRejection::JsonSyntaxError(err) => (
|
|
118
|
+
StatusCode::BAD_REQUEST,
|
|
119
|
+
format!("Failed to parse the request body as JSON: {}", err.body_text()),
|
|
120
|
+
),
|
|
121
|
+
JsonRejection::MissingJsonContentType(_) => (
|
|
122
|
+
StatusCode::UNSUPPORTED_MEDIA_TYPE,
|
|
123
|
+
"Expected request with `Content-Type: application/json`".to_string(),
|
|
124
|
+
),
|
|
125
|
+
JsonRejection::BytesRejection(err) => {
|
|
126
|
+
(StatusCode::BAD_REQUEST, format!("Failed to read request body: {}", err))
|
|
127
|
+
}
|
|
128
|
+
_ => (StatusCode::BAD_REQUEST, "Unknown JSON parsing error".to_string()),
|
|
129
|
+
};
|
|
130
|
+
|
|
131
|
+
Self {
|
|
132
|
+
status,
|
|
133
|
+
body: ErrorResponse {
|
|
134
|
+
error_type: "JsonParsingError".to_string(),
|
|
135
|
+
message,
|
|
136
|
+
traceback: None,
|
|
137
|
+
status_code: status.as_u16(),
|
|
138
|
+
},
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
}
|