kreuzberg 4.3.5-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +1 -0
- data/.rubocop.yml +543 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +260 -0
- data/README.md +399 -0
- data/Rakefile +34 -0
- data/Steepfile +51 -0
- data/examples/async_patterns.rb +283 -0
- data/extconf.rb +60 -0
- data/kreuzberg.gemspec +253 -0
- data/lib/kreuzberg/api_proxy.rb +125 -0
- data/lib/kreuzberg/cache_api.rb +67 -0
- data/lib/kreuzberg/cli.rb +57 -0
- data/lib/kreuzberg/cli_proxy.rb +118 -0
- data/lib/kreuzberg/config.rb +1241 -0
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/document_structure.rb +204 -0
- data/lib/kreuzberg/error_context.rb +136 -0
- data/lib/kreuzberg/errors.rb +116 -0
- data/lib/kreuzberg/extraction_api.rb +329 -0
- data/lib/kreuzberg/mcp_proxy.rb +176 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
- data/lib/kreuzberg/post_processor_protocol.rb +15 -0
- data/lib/kreuzberg/result.rb +712 -0
- data/lib/kreuzberg/setup_lib_path.rb +99 -0
- data/lib/kreuzberg/types.rb +414 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +102 -0
- data/lib/kreuzberg_rb.so +0 -0
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +1337 -0
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +677 -0
- data/spec/binding/batch_spec.rb +360 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +85 -0
- data/spec/binding/cli_spec.rb +55 -0
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -0
- data/spec/binding/config_validation_spec.rb +377 -0
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -0
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +732 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1253 -0
- data/spec/binding/pages_extraction_spec.rb +550 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +273 -0
- data/spec/binding/tables_spec.rb +650 -0
- data/spec/fixtures/config.toml +38 -0
- data/spec/fixtures/config.yaml +41 -0
- data/spec/fixtures/invalid_config.toml +3 -0
- data/spec/serialization_spec.rb +134 -0
- data/spec/smoke/package_spec.rb +177 -0
- data/spec/spec_helper.rb +40 -0
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +434 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- metadata +292 -0
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
RSpec.describe Kreuzberg::Config::OCR do
|
|
4
|
+
# Construction of the OCR config: default values, explicit string values,
# symbol inputs, and the two accepted forms of tesseract_config.
describe '#initialize' do
  it 'creates config with default values' do
    defaults = described_class.new

    expect(defaults).to have_attributes(backend: 'tesseract', language: 'eng')
    expect(defaults.tesseract_config).to be_nil
  end

  it 'creates config with custom string values' do
    custom = described_class.new(backend: 'easyocr', language: 'fra')

    expect(custom).to have_attributes(backend: 'easyocr', language: 'fra')
  end

  it 'converts symbol keys to strings' do
    from_symbols = described_class.new(backend: :tesseract, language: :deu)

    expect(from_symbols).to have_attributes(backend: 'tesseract', language: 'deu')
  end

  it 'accepts tesseract_config as instance' do
    nested = Kreuzberg::Config::Tesseract.new(options: 'value')
    wrapper = described_class.new(tesseract_config: nested)

    expect(wrapper.tesseract_config).to be_a(Kreuzberg::Config::Tesseract)
  end

  it 'converts tesseract_config hash to instance' do
    wrapper = described_class.new(tesseract_config: { option: 'value' })

    expect(wrapper.tesseract_config).to be_a(Kreuzberg::Config::Tesseract)
  end
end
|
|
43
|
+
|
|
44
|
+
# Hash serialization of the OCR config, including the nested tesseract
# config and the omission of unset keys.
describe '#to_h' do
  it 'serializes to hash with default values' do
    serialized = described_class.new.to_h

    expect(serialized).to be_a(Hash)
    expect(serialized).to include(backend: 'tesseract', language: 'eng')
    expect(serialized[:tesseract_config]).to be_nil
  end

  it 'includes tesseract_config in hash when present' do
    serialized = described_class.new(backend: 'tesseract', tesseract_config: { dpi: 300 }).to_h

    expect(serialized[:tesseract_config]).to be_a(Hash)
  end

  it 'compacts nil values from hash' do
    serialized = described_class.new(backend: 'tesseract').to_h

    expect(serialized.key?(:tesseract_config)).to be(false)
  end
end
|
|
72
|
+
|
|
73
|
+
# Argument validation performed while building an OCR config.
describe 'validation' do
  it 'accepts valid backends' do
    expect { described_class.new(backend: 'tesseract') }.not_to raise_error
  end

  it 'accepts symbol language' do
    expect { described_class.new(language: :fra) }.not_to raise_error
  end

  it 'raises error for invalid tesseract_config type' do
    # A plain string is neither a Tesseract instance, a Hash, nor nil.
    expect { described_class.new(tesseract_config: 'invalid') }
      .to raise_error(ArgumentError, /Expected.*Tesseract.*Hash.*nil/)
  end
end
|
|
92
|
+
|
|
93
|
+
# Keyword-argument construction of the OCR config.
describe 'keyword arguments' do
  it 'accepts keyword arguments only' do
    config = described_class.new(backend: 'tesseract', language: 'eng')

    expect(config.backend).to eq 'tesseract'
    expect(config.language).to eq 'eng'
  end

  it 'builds an instance from known keywords' do
    # Only a recognised keyword is passed here, so this example does not
    # exercise unknown-keyword handling.
    # TODO(review): confirm whether #initialize rejects or ignores unknown
    # keywords, then add a dedicated example pinning that behaviour.
    config = described_class.new(backend: 'tesseract')

    expect(config).to be_a described_class
  end
end
|
|
108
|
+
|
|
109
|
+
# Value comparison of OCR configs. Attribute readers are compared directly,
# and the serialized hashes are compared too so that every serialized field
# takes part in the check, not just the one attribute named in the example.
describe 'equality' do
  it 'compares configs by value' do
    config1 = described_class.new(backend: 'tesseract', language: 'eng')
    config2 = described_class.new(backend: 'tesseract', language: 'eng')

    expect(config1.backend).to eq config2.backend
    expect(config1.language).to eq config2.language
    # Identical inputs must serialize to identical hashes.
    expect(config1.to_h).to eq config2.to_h
  end

  it 'detects differences in backend' do
    config1 = described_class.new(backend: 'tesseract')
    config2 = described_class.new(backend: 'easyocr')

    expect(config1.backend).not_to eq config2.backend
    # The differing backend must also show up in the serialized form.
    expect(config1.to_h).not_to eq config2.to_h
  end

  it 'detects differences in language' do
    config1 = described_class.new(language: 'eng')
    config2 = described_class.new(language: 'fra')

    expect(config1.language).not_to eq config2.language
    # The differing language must also show up in the serialized form.
    expect(config1.to_h).not_to eq config2.to_h
  end
end
|
|
132
|
+
|
|
133
|
+
# Embedding an OCR config inside Kreuzberg::Config::Extraction, passed either
# as an instance or as a plain hash.
describe 'nested config integration' do
  it 'integrates with Extraction config' do
    ocr_config = described_class.new(backend: 'tesseract', language: 'deu')
    extraction = Kreuzberg::Config::Extraction.new(ocr: ocr_config)

    expect(extraction.ocr).to be_a described_class
    expect(extraction.ocr.backend).to eq 'tesseract'
    expect(extraction.ocr.language).to eq 'deu'
  end

  it 'accepts hash in Extraction config and converts to instance' do
    extraction = Kreuzberg::Config::Extraction.new(
      ocr: { backend: 'easyocr', language: 'fra' }
    )

    expect(extraction.ocr).to be_a described_class
    expect(extraction.ocr.backend).to eq 'easyocr'
    # Every key supplied in the hash should survive the conversion, not
    # only the backend.
    expect(extraction.ocr.language).to eq 'fra'
  end
end
|
|
152
|
+
|
|
153
|
+
# Symbol and string inputs must both end up as string attributes.
describe 'symbol vs string key handling' do
  it 'converts symbol keys to correct attributes' do
    from_symbols = described_class.new(backend: :tesseract, language: :fra)

    expect(from_symbols).to have_attributes(backend: 'tesseract', language: 'fra')
  end

  it 'handles mixed symbol and string values' do
    mixed = described_class.new(backend: 'tesseract', language: :eng)

    expect(mixed).to have_attributes(backend: 'tesseract', language: 'eng')
  end
end
|
|
171
|
+
end
|
|
@@ -0,0 +1,380 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# rubocop:disable RSpec/RepeatedExample
|
|
4
|
+
RSpec.describe 'Output Format and Result Format Configuration' do
|
|
5
|
+
describe Kreuzberg::Config::Extraction do
|
|
6
|
+
# The `output_format` option of the extraction config: construction, string
# coercion, serialization (hash and JSON), and field/bracket access.
describe 'output_format' do
  it 'accepts output_format as initialization parameter' do
    extraction = described_class.new(output_format: 'markdown')

    expect(extraction.output_format).to eq('markdown')
  end

  it 'defaults to nil when not specified' do
    expect(described_class.new.output_format).to be_nil
  end

  # One example per format name exercised by this spec.
  %w[plain markdown djot html].each do |format_name|
    it "accepts #{format_name} format" do
      extraction = described_class.new(output_format: format_name)

      expect(extraction.output_format).to eq(format_name)
    end
  end

  it 'converts output_format to string' do
    coerced = described_class.new(output_format: :markdown).output_format

    expect(coerced).to eq('markdown')
    expect(coerced).to be_a(String)
  end

  it 'includes output_format in to_h' do
    serialized = described_class.new(output_format: 'markdown').to_h

    expect(serialized[:output_format]).to eq('markdown')
  end

  it 'excludes nil output_format from to_h' do
    serialized = described_class.new(output_format: nil).to_h

    expect(serialized.key?(:output_format)).to be(false)
  end

  it 'includes output_format in JSON' do
    decoded = JSON.parse(described_class.new(output_format: 'markdown').to_json)

    expect(decoded['output_format']).to eq('markdown')
  end

  it 'retrieves output_format with get_field' do
    extraction = described_class.new(output_format: 'djot')

    expect(extraction.get_field('output_format')).to eq('djot')
  end

  it 'can be set with []=' do
    extraction = described_class.new
    extraction[:output_format] = 'html'

    expect(extraction.output_format).to eq('html')
  end

  it 'can be set with []= using symbol' do
    extraction = described_class.new
    extraction[:output_format] = :plain

    expect(extraction.output_format).to eq('plain')
  end

  it 'can be retrieved with []' do
    extraction = described_class.new(output_format: 'markdown')

    expect(extraction[:output_format]).to eq('markdown')
  end
end
|
|
98
|
+
|
|
99
|
+
# The `result_format` option of the extraction config: construction, string
# coercion, serialization (hash and JSON), and field/bracket access.
describe 'result_format' do
  it 'accepts result_format as initialization parameter' do
    extraction = described_class.new(result_format: 'unified')

    expect(extraction.result_format).to eq('unified')
  end

  it 'defaults to nil when not specified' do
    expect(described_class.new.result_format).to be_nil
  end

  # One example per format name exercised by this spec.
  %w[unified element_based].each do |format_name|
    it "accepts #{format_name} format" do
      extraction = described_class.new(result_format: format_name)

      expect(extraction.result_format).to eq(format_name)
    end
  end

  it 'converts result_format to string' do
    coerced = described_class.new(result_format: :unified).result_format

    expect(coerced).to eq('unified')
    expect(coerced).to be_a(String)
  end

  it 'includes result_format in to_h' do
    serialized = described_class.new(result_format: 'element_based').to_h

    expect(serialized[:result_format]).to eq('element_based')
  end

  it 'excludes nil result_format from to_h' do
    serialized = described_class.new(result_format: nil).to_h

    expect(serialized.key?(:result_format)).to be(false)
  end

  it 'includes result_format in JSON' do
    decoded = JSON.parse(described_class.new(result_format: 'element_based').to_json)

    expect(decoded['result_format']).to eq('element_based')
  end

  it 'retrieves result_format with get_field' do
    extraction = described_class.new(result_format: 'unified')

    expect(extraction.get_field('result_format')).to eq('unified')
  end

  it 'can be set with []=' do
    extraction = described_class.new
    extraction[:result_format] = 'unified'

    expect(extraction.result_format).to eq('unified')
  end

  it 'can be set with []= using symbol' do
    extraction = described_class.new
    extraction[:result_format] = :element_based

    expect(extraction.result_format).to eq('element_based')
  end

  it 'can be retrieved with []' do
    extraction = described_class.new(result_format: 'element_based')

    expect(extraction[:result_format]).to eq('element_based')
  end
end
|
|
179
|
+
|
|
180
|
+
# Interaction of `output_format` and `result_format` when both are set:
# construction, serialization, and merging (with configs and plain hashes).
describe 'combined output and result formats' do
  it 'accepts both output_format and result_format' do
    config = described_class.new(
      output_format: 'markdown',
      result_format: 'unified'
    )

    expect(config.output_format).to eq 'markdown'
    expect(config.result_format).to eq 'unified'
  end

  it 'serializes both formats in to_h' do
    config = described_class.new(
      output_format: 'djot',
      result_format: 'element_based'
    )
    hash = config.to_h

    expect(hash[:output_format]).to eq 'djot'
    expect(hash[:result_format]).to eq 'element_based'
  end

  it 'serializes both formats in JSON' do
    config = described_class.new(
      output_format: 'html',
      result_format: 'unified'
    )
    json = config.to_json
    parsed = JSON.parse(json)

    expect(parsed['output_format']).to eq 'html'
    expect(parsed['result_format']).to eq 'unified'
  end

  it 'merges both formats correctly' do
    base = described_class.new(
      output_format: 'markdown',
      result_format: 'unified'
    )
    override = described_class.new(output_format: 'html')
    merged = base.merge(override)

    # The override only sets output_format, so result_format comes from base.
    expect(merged.output_format).to eq 'html'
    expect(merged.result_format).to eq 'unified'
  end

  it 'merges both formats with merge!' do
    config = described_class.new(
      output_format: 'markdown',
      result_format: 'unified'
    )
    override = described_class.new(
      output_format: 'djot',
      result_format: 'element_based'
    )
    config.merge!(override)

    expect(config.output_format).to eq 'djot'
    expect(config.result_format).to eq 'element_based'
  end

  it 'preserves result_format when merging a hash with only output_format' do
    config = described_class.new(
      output_format: 'plain',
      result_format: 'unified'
    )
    # The hash carries output_format alone; result_format must be kept from
    # the receiver.
    merged = config.merge({ output_format: 'markdown' })

    expect(merged.output_format).to eq 'markdown'
    expect(merged.result_format).to eq 'unified'
  end
end
|
|
252
|
+
|
|
253
|
+
# Format options must survive being serialized and fed back into the
# constructor, both via to_h and via JSON.
describe 'format persistence across operations' do
  it 'persists output_format through multiple conversions' do
    original = described_class.new(output_format: 'markdown')
    rebuilt = described_class.new(**original.to_h)

    expect(rebuilt.output_format).to eq('markdown')
  end

  it 'persists result_format through multiple conversions' do
    original = described_class.new(result_format: 'element_based')
    rebuilt = described_class.new(**original.to_h)

    expect(rebuilt.result_format).to eq('element_based')
  end

  it 'round-trips through JSON' do
    original = described_class.new(output_format: 'djot', result_format: 'unified')
    decoded = JSON.parse(original.to_json)
    # JSON yields string keys; the constructor takes keyword (symbol) keys.
    keyword_args = decoded.transform_keys(&:to_sym)
    rebuilt = described_class.new(**keyword_args)

    expect(rebuilt.output_format).to eq('djot')
    expect(rebuilt.result_format).to eq('unified')
  end
end
|
|
283
|
+
|
|
284
|
+
# Rejection of malformed format names and case normalization.
describe 'format validation and edge cases' do
  it 'raises error for empty string output_format' do
    expect { described_class.new(output_format: '') }
      .to raise_error(ArgumentError, /Invalid output_format/)
  end

  it 'raises error for empty string result_format' do
    expect { described_class.new(result_format: '') }
      .to raise_error(ArgumentError, /Invalid result_format/)
  end

  it 'raises error for whitespace in output_format' do
    expect { described_class.new(output_format: ' plain ') }
      .to raise_error(ArgumentError, /Invalid output_format/)
  end

  it 'normalizes case in output_format' do
    normalized = described_class.new(output_format: 'MarkDown').output_format

    expect(normalized).to eq('markdown')
  end

  it 'raises error for custom string in result_format' do
    expect { described_class.new(result_format: 'custom_format') }
      .to raise_error(ArgumentError, /Invalid result_format/)
  end
end
|
|
315
|
+
|
|
316
|
+
# Format options combined with other nested extraction settings.
describe 'integration with other config fields' do
  it 'works with output_format and chunking together' do
    extraction = described_class.new(output_format: 'markdown', chunking: { max_chars: 500 })

    expect(extraction.output_format).to eq('markdown')
    expect(extraction.chunking.max_chars).to eq(500)
  end

  it 'works with result_format and OCR together' do
    extraction = described_class.new(result_format: 'element_based', ocr: { backend: 'tesseract' })

    expect(extraction.result_format).to eq('element_based')
    expect(extraction.ocr.backend).to eq('tesseract')
  end

  it 'works with both formats and language detection' do
    extraction = described_class.new(
      output_format: 'html',
      result_format: 'unified',
      language_detection: { enabled: true }
    )

    expect(extraction).to have_attributes(output_format: 'html', result_format: 'unified')
    expect(extraction.language_detection.enabled).to be(true)
  end

  it 'preserves formats in complex config merge' do
    defaults = described_class.new(
      output_format: 'markdown',
      result_format: 'unified',
      chunking: { max_chars: 500 },
      ocr: { backend: 'tesseract' }
    )
    overrides = described_class.new(output_format: 'djot', chunking: { max_chars: 750 })

    combined = defaults.merge(overrides)

    # Overridden fields take the new value; untouched fields keep the base one.
    expect(combined).to have_attributes(output_format: 'djot', result_format: 'unified')
    expect(combined.chunking.max_chars).to eq(750)
    expect(combined.ocr.backend).to eq('tesseract')
  end
end
|
|
368
|
+
|
|
369
|
+
# Both format options must be listed in the extraction config's ALLOWED_KEYS.
describe 'allowed keys integration' do
  %i[output_format result_format].each do |format_key|
    it "includes #{format_key} in ALLOWED_KEYS" do
      allowed = Kreuzberg::Config::Extraction::ALLOWED_KEYS

      expect(allowed).to include(format_key)
    end
  end
end
|
|
378
|
+
end
|
|
379
|
+
end
|
|
380
|
+
# rubocop:enable RSpec/RepeatedExample
|