kreuzberg 4.1.1 → 4.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -4
- data/README.md +8 -5
- data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
- data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +23 -13
- data/kreuzberg.gemspec +14 -2
- data/lib/kreuzberg/api_proxy.rb +0 -1
- data/lib/kreuzberg/cli_proxy.rb +0 -1
- data/lib/kreuzberg/config.rb +70 -35
- data/lib/kreuzberg/mcp_proxy.rb +0 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +5 -1
- data/spec/binding/batch_operations_spec.rb +80 -0
- data/spec/binding/metadata_types_spec.rb +77 -57
- data/spec/serialization_spec.rb +134 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +3 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/embeddings.rs +4 -4
- data/vendor/kreuzberg/src/mcp/format.rs +237 -39
- data/vendor/kreuzberg/src/mcp/params.rs +26 -33
- data/vendor/kreuzberg/src/mcp/server.rs +6 -3
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
- data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
- data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
- data/vendor/kreuzberg/tests/api_embed.rs +84 -50
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
- data/vendor/kreuzberg/tests/api_tests.rs +298 -139
- data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
- data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
- data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
- data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
- data/vendor/kreuzberg/tests/config_behavioral.rs +414 -0
- data/vendor/kreuzberg/tests/config_features.rs +19 -15
- data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
- data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
- data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
- data/vendor/kreuzberg/tests/core_integration.rs +55 -53
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
- data/vendor/kreuzberg/tests/email_integration.rs +7 -7
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/error_handling.rs +13 -11
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
- data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +72 -41
- data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
- data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/page_markers.rs +1 -1
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
- data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +40 -30
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
- data/vendor/kreuzberg/tests/security_validation.rs +20 -19
- data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
- data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
- data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
- data/vendor/kreuzberg-tesseract/build.rs +4 -4
- data/vendor/kreuzberg-tesseract/src/lib.rs +6 -6
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +3 -3
- metadata +13 -2
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Cross-language serialization tests for Ruby bindings
|
|
4
|
+
#
|
|
5
|
+
# Validates that ExtractionConfig serializes consistently with other language bindings
|
|
6
|
+
|
|
7
|
+
require 'json'
|
|
8
|
+
require 'spec_helper'
|
|
9
|
+
|
|
10
|
+
RSpec.describe Kreuzberg::ExtractionConfig do
|
|
11
|
+
describe '#to_h' do
|
|
12
|
+
it 'serializes minimal config to hash' do
|
|
13
|
+
config = described_class.new
|
|
14
|
+
hash = config.to_h
|
|
15
|
+
|
|
16
|
+
expect(hash).to be_a(Hash)
|
|
17
|
+
expect(hash).to have_key(:use_cache)
|
|
18
|
+
expect(hash).to have_key(:enable_quality_processing)
|
|
19
|
+
expect(hash).to have_key(:force_ocr)
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
it 'serializes config with all fields' do
|
|
23
|
+
config = described_class.new(
|
|
24
|
+
use_cache: true,
|
|
25
|
+
enable_quality_processing: true,
|
|
26
|
+
force_ocr: false
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
hash = config.to_h
|
|
30
|
+
|
|
31
|
+
expect(hash[:use_cache]).to be(true)
|
|
32
|
+
expect(hash[:enable_quality_processing]).to be(true)
|
|
33
|
+
expect(hash[:force_ocr]).to be(false)
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
it 'preserves field values after serialization' do
|
|
37
|
+
original = described_class.new(
|
|
38
|
+
use_cache: false,
|
|
39
|
+
enable_quality_processing: true
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
hash = original.to_h
|
|
43
|
+
|
|
44
|
+
expect(hash[:use_cache]).to be(false)
|
|
45
|
+
expect(hash[:enable_quality_processing]).to be(true)
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
describe '#to_json' do
|
|
50
|
+
it 'serializes to JSON' do
|
|
51
|
+
config = described_class.new(use_cache: true)
|
|
52
|
+
json = config.to_json
|
|
53
|
+
|
|
54
|
+
expect(json).to be_a(String)
|
|
55
|
+
|
|
56
|
+
parsed = JSON.parse(json, symbolize_names: true)
|
|
57
|
+
expect(parsed).to have_key(:use_cache)
|
|
58
|
+
expect(parsed[:use_cache]).to be(true)
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
it 'produces valid JSON' do
|
|
62
|
+
config = described_class.new
|
|
63
|
+
json = config.to_json
|
|
64
|
+
|
|
65
|
+
expect { JSON.parse(json) }.not_to raise_error
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
it 'uses snake_case field names' do
|
|
69
|
+
config = described_class.new(use_cache: true)
|
|
70
|
+
json = config.to_json
|
|
71
|
+
|
|
72
|
+
expect(json).to include('use_cache')
|
|
73
|
+
expect(json).not_to include('useCache')
|
|
74
|
+
end
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
describe 'round-trip serialization' do
|
|
78
|
+
it 'survives serialization -> deserialization -> serialization' do
|
|
79
|
+
config1 = described_class.new(
|
|
80
|
+
use_cache: true,
|
|
81
|
+
enable_quality_processing: false
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
json1 = config1.to_json
|
|
85
|
+
hash1 = JSON.parse(json1, symbolize_names: true)
|
|
86
|
+
|
|
87
|
+
config2 = described_class.new(hash1)
|
|
88
|
+
json2 = config2.to_json
|
|
89
|
+
|
|
90
|
+
# JSON strings should be equivalent
|
|
91
|
+
expect(JSON.parse(json1)).to eq(JSON.parse(json2))
|
|
92
|
+
end
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
describe 'field consistency' do
|
|
96
|
+
it 'includes all mandatory fields' do
|
|
97
|
+
config = described_class.new
|
|
98
|
+
hash = config.to_h
|
|
99
|
+
|
|
100
|
+
mandatory_fields = %i[use_cache enable_quality_processing force_ocr]
|
|
101
|
+
mandatory_fields.each do |field|
|
|
102
|
+
expect(hash).to have_key(field)
|
|
103
|
+
end
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
it 'handles nested ocr config' do
|
|
107
|
+
config = described_class.new(
|
|
108
|
+
ocr: {
|
|
109
|
+
backend: 'tesseract',
|
|
110
|
+
language: 'eng'
|
|
111
|
+
}
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
hash = config.to_h
|
|
115
|
+
|
|
116
|
+
expect(hash).to have_key(:ocr)
|
|
117
|
+
expect(hash[:ocr][:backend]).to eq('tesseract')
|
|
118
|
+
expect(hash[:ocr][:language]).to eq('eng')
|
|
119
|
+
end
|
|
120
|
+
end
|
|
121
|
+
|
|
122
|
+
describe 'immutability' do
|
|
123
|
+
it 'does not modify original config during serialization' do
|
|
124
|
+
config = described_class.new(use_cache: true)
|
|
125
|
+
|
|
126
|
+
json1 = config.to_json
|
|
127
|
+
json2 = config.to_json
|
|
128
|
+
json3 = config.to_json
|
|
129
|
+
|
|
130
|
+
expect(json1).to eq(json2)
|
|
131
|
+
expect(json2).to eq(json3)
|
|
132
|
+
end
|
|
133
|
+
end
|
|
134
|
+
end
|
|
@@ -0,0 +1,380 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# rubocop:disable RSpec/RepeatedExample
|
|
4
|
+
RSpec.describe 'Output Format and Result Format Configuration' do
|
|
5
|
+
describe Kreuzberg::Config::Extraction do
|
|
6
|
+
describe 'output_format' do
|
|
7
|
+
it 'accepts output_format as initialization parameter' do
|
|
8
|
+
config = described_class.new(output_format: 'markdown')
|
|
9
|
+
|
|
10
|
+
expect(config.output_format).to eq 'markdown'
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
it 'defaults to nil when not specified' do
|
|
14
|
+
config = described_class.new
|
|
15
|
+
|
|
16
|
+
expect(config.output_format).to be_nil
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
it 'accepts plain format' do
|
|
20
|
+
config = described_class.new(output_format: 'plain')
|
|
21
|
+
|
|
22
|
+
expect(config.output_format).to eq 'plain'
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
it 'accepts markdown format' do
|
|
26
|
+
config = described_class.new(output_format: 'markdown')
|
|
27
|
+
|
|
28
|
+
expect(config.output_format).to eq 'markdown'
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
it 'accepts djot format' do
|
|
32
|
+
config = described_class.new(output_format: 'djot')
|
|
33
|
+
|
|
34
|
+
expect(config.output_format).to eq 'djot'
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
it 'accepts html format' do
|
|
38
|
+
config = described_class.new(output_format: 'html')
|
|
39
|
+
|
|
40
|
+
expect(config.output_format).to eq 'html'
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
it 'converts output_format to string' do
|
|
44
|
+
config = described_class.new(output_format: :markdown)
|
|
45
|
+
|
|
46
|
+
expect(config.output_format).to eq 'markdown'
|
|
47
|
+
expect(config.output_format).to be_a String
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
it 'includes output_format in to_h' do
|
|
51
|
+
config = described_class.new(output_format: 'markdown')
|
|
52
|
+
hash = config.to_h
|
|
53
|
+
|
|
54
|
+
expect(hash[:output_format]).to eq 'markdown'
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
it 'excludes nil output_format from to_h' do
|
|
58
|
+
config = described_class.new(output_format: nil)
|
|
59
|
+
hash = config.to_h
|
|
60
|
+
|
|
61
|
+
expect(hash.key?(:output_format)).to be false
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
it 'includes output_format in JSON' do
|
|
65
|
+
config = described_class.new(output_format: 'markdown')
|
|
66
|
+
json = config.to_json
|
|
67
|
+
parsed = JSON.parse(json)
|
|
68
|
+
|
|
69
|
+
expect(parsed['output_format']).to eq 'markdown'
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
it 'retrieves output_format with get_field' do
|
|
73
|
+
config = described_class.new(output_format: 'djot')
|
|
74
|
+
|
|
75
|
+
expect(config.get_field('output_format')).to eq 'djot'
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
it 'can be set with []=' do
|
|
79
|
+
config = described_class.new
|
|
80
|
+
config[:output_format] = 'html'
|
|
81
|
+
|
|
82
|
+
expect(config.output_format).to eq 'html'
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
it 'can be set with []= using symbol' do
|
|
86
|
+
config = described_class.new
|
|
87
|
+
config[:output_format] = :plain
|
|
88
|
+
|
|
89
|
+
expect(config.output_format).to eq 'plain'
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
it 'can be retrieved with []' do
|
|
93
|
+
config = described_class.new(output_format: 'markdown')
|
|
94
|
+
|
|
95
|
+
expect(config[:output_format]).to eq 'markdown'
|
|
96
|
+
end
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
describe 'result_format' do
|
|
100
|
+
it 'accepts result_format as initialization parameter' do
|
|
101
|
+
config = described_class.new(result_format: 'unified')
|
|
102
|
+
|
|
103
|
+
expect(config.result_format).to eq 'unified'
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
it 'defaults to nil when not specified' do
|
|
107
|
+
config = described_class.new
|
|
108
|
+
|
|
109
|
+
expect(config.result_format).to be_nil
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
it 'accepts unified format' do
|
|
113
|
+
config = described_class.new(result_format: 'unified')
|
|
114
|
+
|
|
115
|
+
expect(config.result_format).to eq 'unified'
|
|
116
|
+
end
|
|
117
|
+
|
|
118
|
+
it 'accepts element_based format' do
|
|
119
|
+
config = described_class.new(result_format: 'element_based')
|
|
120
|
+
|
|
121
|
+
expect(config.result_format).to eq 'element_based'
|
|
122
|
+
end
|
|
123
|
+
|
|
124
|
+
it 'converts result_format to string' do
|
|
125
|
+
config = described_class.new(result_format: :unified)
|
|
126
|
+
|
|
127
|
+
expect(config.result_format).to eq 'unified'
|
|
128
|
+
expect(config.result_format).to be_a String
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
it 'includes result_format in to_h' do
|
|
132
|
+
config = described_class.new(result_format: 'element_based')
|
|
133
|
+
hash = config.to_h
|
|
134
|
+
|
|
135
|
+
expect(hash[:result_format]).to eq 'element_based'
|
|
136
|
+
end
|
|
137
|
+
|
|
138
|
+
it 'excludes nil result_format from to_h' do
|
|
139
|
+
config = described_class.new(result_format: nil)
|
|
140
|
+
hash = config.to_h
|
|
141
|
+
|
|
142
|
+
expect(hash.key?(:result_format)).to be false
|
|
143
|
+
end
|
|
144
|
+
|
|
145
|
+
it 'includes result_format in JSON' do
|
|
146
|
+
config = described_class.new(result_format: 'element_based')
|
|
147
|
+
json = config.to_json
|
|
148
|
+
parsed = JSON.parse(json)
|
|
149
|
+
|
|
150
|
+
expect(parsed['result_format']).to eq 'element_based'
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
it 'retrieves result_format with get_field' do
|
|
154
|
+
config = described_class.new(result_format: 'unified')
|
|
155
|
+
|
|
156
|
+
expect(config.get_field('result_format')).to eq 'unified'
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
it 'can be set with []=' do
|
|
160
|
+
config = described_class.new
|
|
161
|
+
config[:result_format] = 'unified'
|
|
162
|
+
|
|
163
|
+
expect(config.result_format).to eq 'unified'
|
|
164
|
+
end
|
|
165
|
+
|
|
166
|
+
it 'can be set with []= using symbol' do
|
|
167
|
+
config = described_class.new
|
|
168
|
+
config[:result_format] = :element_based
|
|
169
|
+
|
|
170
|
+
expect(config.result_format).to eq 'element_based'
|
|
171
|
+
end
|
|
172
|
+
|
|
173
|
+
it 'can be retrieved with []' do
|
|
174
|
+
config = described_class.new(result_format: 'element_based')
|
|
175
|
+
|
|
176
|
+
expect(config[:result_format]).to eq 'element_based'
|
|
177
|
+
end
|
|
178
|
+
end
|
|
179
|
+
|
|
180
|
+
describe 'combined output and result formats' do
|
|
181
|
+
it 'accepts both output_format and result_format' do
|
|
182
|
+
config = described_class.new(
|
|
183
|
+
output_format: 'markdown',
|
|
184
|
+
result_format: 'unified'
|
|
185
|
+
)
|
|
186
|
+
|
|
187
|
+
expect(config.output_format).to eq 'markdown'
|
|
188
|
+
expect(config.result_format).to eq 'unified'
|
|
189
|
+
end
|
|
190
|
+
|
|
191
|
+
it 'serializes both formats in to_h' do
|
|
192
|
+
config = described_class.new(
|
|
193
|
+
output_format: 'djot',
|
|
194
|
+
result_format: 'element_based'
|
|
195
|
+
)
|
|
196
|
+
hash = config.to_h
|
|
197
|
+
|
|
198
|
+
expect(hash[:output_format]).to eq 'djot'
|
|
199
|
+
expect(hash[:result_format]).to eq 'element_based'
|
|
200
|
+
end
|
|
201
|
+
|
|
202
|
+
it 'serializes both formats in JSON' do
|
|
203
|
+
config = described_class.new(
|
|
204
|
+
output_format: 'html',
|
|
205
|
+
result_format: 'unified'
|
|
206
|
+
)
|
|
207
|
+
json = config.to_json
|
|
208
|
+
parsed = JSON.parse(json)
|
|
209
|
+
|
|
210
|
+
expect(parsed['output_format']).to eq 'html'
|
|
211
|
+
expect(parsed['result_format']).to eq 'unified'
|
|
212
|
+
end
|
|
213
|
+
|
|
214
|
+
it 'merges both formats correctly' do
|
|
215
|
+
base = described_class.new(
|
|
216
|
+
output_format: 'markdown',
|
|
217
|
+
result_format: 'unified'
|
|
218
|
+
)
|
|
219
|
+
override = described_class.new(output_format: 'html')
|
|
220
|
+
merged = base.merge(override)
|
|
221
|
+
|
|
222
|
+
expect(merged.output_format).to eq 'html'
|
|
223
|
+
expect(merged.result_format).to eq 'unified'
|
|
224
|
+
end
|
|
225
|
+
|
|
226
|
+
it 'merges both formats with merge!' do
|
|
227
|
+
config = described_class.new(
|
|
228
|
+
output_format: 'markdown',
|
|
229
|
+
result_format: 'unified'
|
|
230
|
+
)
|
|
231
|
+
override = described_class.new(
|
|
232
|
+
output_format: 'djot',
|
|
233
|
+
result_format: 'element_based'
|
|
234
|
+
)
|
|
235
|
+
config.merge!(override)
|
|
236
|
+
|
|
237
|
+
expect(config.output_format).to eq 'djot'
|
|
238
|
+
expect(config.result_format).to eq 'element_based'
|
|
239
|
+
end
|
|
240
|
+
|
|
241
|
+
it 'handles merge with hash containing both formats' do
|
|
242
|
+
config = described_class.new(
|
|
243
|
+
output_format: 'plain',
|
|
244
|
+
result_format: 'unified'
|
|
245
|
+
)
|
|
246
|
+
merged = config.merge({ output_format: 'markdown' })
|
|
247
|
+
|
|
248
|
+
expect(merged.output_format).to eq 'markdown'
|
|
249
|
+
expect(merged.result_format).to eq 'unified'
|
|
250
|
+
end
|
|
251
|
+
end
|
|
252
|
+
|
|
253
|
+
describe 'format persistence across operations' do
|
|
254
|
+
it 'persists output_format through multiple conversions' do
|
|
255
|
+
config = described_class.new(output_format: 'markdown')
|
|
256
|
+
hash = config.to_h
|
|
257
|
+
new_config = described_class.new(**hash)
|
|
258
|
+
|
|
259
|
+
expect(new_config.output_format).to eq 'markdown'
|
|
260
|
+
end
|
|
261
|
+
|
|
262
|
+
it 'persists result_format through multiple conversions' do
|
|
263
|
+
config = described_class.new(result_format: 'element_based')
|
|
264
|
+
hash = config.to_h
|
|
265
|
+
new_config = described_class.new(**hash)
|
|
266
|
+
|
|
267
|
+
expect(new_config.result_format).to eq 'element_based'
|
|
268
|
+
end
|
|
269
|
+
|
|
270
|
+
it 'round-trips through JSON' do
|
|
271
|
+
config = described_class.new(
|
|
272
|
+
output_format: 'djot',
|
|
273
|
+
result_format: 'unified'
|
|
274
|
+
)
|
|
275
|
+
json = config.to_json
|
|
276
|
+
parsed = JSON.parse(json)
|
|
277
|
+
new_config = described_class.new(**parsed.transform_keys(&:to_sym))
|
|
278
|
+
|
|
279
|
+
expect(new_config.output_format).to eq 'djot'
|
|
280
|
+
expect(new_config.result_format).to eq 'unified'
|
|
281
|
+
end
|
|
282
|
+
end
|
|
283
|
+
|
|
284
|
+
describe 'format validation and edge cases' do
|
|
285
|
+
it 'handles empty string output_format' do
|
|
286
|
+
config = described_class.new(output_format: '')
|
|
287
|
+
|
|
288
|
+
expect(config.output_format).to eq ''
|
|
289
|
+
end
|
|
290
|
+
|
|
291
|
+
it 'handles empty string result_format' do
|
|
292
|
+
config = described_class.new(result_format: '')
|
|
293
|
+
|
|
294
|
+
expect(config.result_format).to eq ''
|
|
295
|
+
end
|
|
296
|
+
|
|
297
|
+
it 'handles whitespace in output_format' do
|
|
298
|
+
config = described_class.new(output_format: ' plain ')
|
|
299
|
+
|
|
300
|
+
expect(config.output_format).to eq ' plain '
|
|
301
|
+
end
|
|
302
|
+
|
|
303
|
+
it 'handles case sensitivity in output_format' do
|
|
304
|
+
config = described_class.new(output_format: 'MarkDown')
|
|
305
|
+
|
|
306
|
+
expect(config.output_format).to eq 'MarkDown'
|
|
307
|
+
end
|
|
308
|
+
|
|
309
|
+
it 'handles custom string in result_format' do
|
|
310
|
+
config = described_class.new(result_format: 'custom_format')
|
|
311
|
+
|
|
312
|
+
expect(config.result_format).to eq 'custom_format'
|
|
313
|
+
end
|
|
314
|
+
end
|
|
315
|
+
|
|
316
|
+
describe 'integration with other config fields' do
|
|
317
|
+
it 'works with output_format and chunking together' do
|
|
318
|
+
config = described_class.new(
|
|
319
|
+
output_format: 'markdown',
|
|
320
|
+
chunking: { max_chars: 500 }
|
|
321
|
+
)
|
|
322
|
+
|
|
323
|
+
expect(config.output_format).to eq 'markdown'
|
|
324
|
+
expect(config.chunking.max_chars).to eq 500
|
|
325
|
+
end
|
|
326
|
+
|
|
327
|
+
it 'works with result_format and OCR together' do
|
|
328
|
+
config = described_class.new(
|
|
329
|
+
result_format: 'element_based',
|
|
330
|
+
ocr: { backend: 'tesseract' }
|
|
331
|
+
)
|
|
332
|
+
|
|
333
|
+
expect(config.result_format).to eq 'element_based'
|
|
334
|
+
expect(config.ocr.backend).to eq 'tesseract'
|
|
335
|
+
end
|
|
336
|
+
|
|
337
|
+
it 'works with both formats and language detection' do
|
|
338
|
+
config = described_class.new(
|
|
339
|
+
output_format: 'html',
|
|
340
|
+
result_format: 'unified',
|
|
341
|
+
language_detection: { enabled: true }
|
|
342
|
+
)
|
|
343
|
+
|
|
344
|
+
expect(config.output_format).to eq 'html'
|
|
345
|
+
expect(config.result_format).to eq 'unified'
|
|
346
|
+
expect(config.language_detection.enabled).to be true
|
|
347
|
+
end
|
|
348
|
+
|
|
349
|
+
it 'preserves formats in complex config merge' do
|
|
350
|
+
base = described_class.new(
|
|
351
|
+
output_format: 'markdown',
|
|
352
|
+
result_format: 'unified',
|
|
353
|
+
chunking: { max_chars: 500 },
|
|
354
|
+
ocr: { backend: 'tesseract' }
|
|
355
|
+
)
|
|
356
|
+
override = described_class.new(
|
|
357
|
+
output_format: 'djot',
|
|
358
|
+
chunking: { max_chars: 750 }
|
|
359
|
+
)
|
|
360
|
+
merged = base.merge(override)
|
|
361
|
+
|
|
362
|
+
expect(merged.output_format).to eq 'djot'
|
|
363
|
+
expect(merged.result_format).to eq 'unified'
|
|
364
|
+
expect(merged.chunking.max_chars).to eq 750
|
|
365
|
+
expect(merged.ocr.backend).to eq 'tesseract'
|
|
366
|
+
end
|
|
367
|
+
end
|
|
368
|
+
|
|
369
|
+
describe 'allowed keys integration' do
|
|
370
|
+
it 'includes output_format in ALLOWED_KEYS' do
|
|
371
|
+
expect(Kreuzberg::Config::Extraction::ALLOWED_KEYS).to include(:output_format)
|
|
372
|
+
end
|
|
373
|
+
|
|
374
|
+
it 'includes result_format in ALLOWED_KEYS' do
|
|
375
|
+
expect(Kreuzberg::Config::Extraction::ALLOWED_KEYS).to include(:result_format)
|
|
376
|
+
end
|
|
377
|
+
end
|
|
378
|
+
end
|
|
379
|
+
end
|
|
380
|
+
# rubocop:enable RSpec/RepeatedExample
|
data/vendor/Cargo.toml
CHANGED
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.1.1"
|
|
3
|
+
version = "4.2.0"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -142,7 +142,7 @@ mime_guess = "2.0"
|
|
|
142
142
|
rmp-serde = "1.3"
|
|
143
143
|
thiserror = { workspace = true }
|
|
144
144
|
tokio = { workspace = true, optional = true }
|
|
145
|
-
uuid = { version = "1.
|
|
145
|
+
uuid = { version = "1.20.0", features = ["v4", "js"] }
|
|
146
146
|
indexmap = "2.13.0"
|
|
147
147
|
tracing = { workspace = true }
|
|
148
148
|
pdfium-render = { package = "kreuzberg-pdfium-render", version = "0.9.0", features = [
|
|
@@ -198,7 +198,7 @@ rake = { version = "0.3.6", optional = true }
|
|
|
198
198
|
axum = { version = "0.8", features = ["macros", "json", "multipart"], optional = true }
|
|
199
199
|
tower = { version = "0.5", optional = true }
|
|
200
200
|
tower-http = { version = "0.6", features = ["cors", "trace", "limit"], optional = true }
|
|
201
|
-
rmcp = { version = "0.
|
|
201
|
+
rmcp = { version = "0.14.0", features = [
|
|
202
202
|
"server",
|
|
203
203
|
"macros",
|
|
204
204
|
"base64",
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
17
17
|
|
|
18
18
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
19
19
|
|
|
20
|
-
> **🚀 Version 4.1.1 Release**
|
|
20
|
+
> **🚀 Version 4.2.0 Release**
|
|
21
21
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
22
22
|
>
|
|
23
23
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
data/vendor/kreuzberg/src/embeddings.rs
CHANGED
|
@@ -219,10 +219,10 @@ pub fn get_or_init_model(
|
|
|
219
219
|
// This prevents panics that cannot unwind through FFI boundaries
|
|
220
220
|
fn ensure_onnx_available() -> Result<(), String> {
|
|
221
221
|
// Check if ORT_DYLIB_PATH is already set and valid
|
|
222
|
-
if let Ok(path) = std::env::var("ORT_DYLIB_PATH")
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
222
|
+
if let Ok(path) = std::env::var("ORT_DYLIB_PATH")
|
|
223
|
+
&& std::path::Path::new(&path).exists()
|
|
224
|
+
{
|
|
225
|
+
return Ok(());
|
|
226
226
|
}
|
|
227
227
|
|
|
228
228
|
// Check common installation paths and set ORT_DYLIB_PATH if found
|