kreuzberg 4.3.5-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +1 -0
- data/.rubocop.yml +543 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +260 -0
- data/README.md +399 -0
- data/Rakefile +34 -0
- data/Steepfile +51 -0
- data/examples/async_patterns.rb +283 -0
- data/extconf.rb +60 -0
- data/kreuzberg.gemspec +253 -0
- data/lib/kreuzberg/api_proxy.rb +125 -0
- data/lib/kreuzberg/cache_api.rb +67 -0
- data/lib/kreuzberg/cli.rb +57 -0
- data/lib/kreuzberg/cli_proxy.rb +118 -0
- data/lib/kreuzberg/config.rb +1241 -0
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/document_structure.rb +204 -0
- data/lib/kreuzberg/error_context.rb +136 -0
- data/lib/kreuzberg/errors.rb +116 -0
- data/lib/kreuzberg/extraction_api.rb +329 -0
- data/lib/kreuzberg/mcp_proxy.rb +176 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
- data/lib/kreuzberg/post_processor_protocol.rb +15 -0
- data/lib/kreuzberg/result.rb +712 -0
- data/lib/kreuzberg/setup_lib_path.rb +99 -0
- data/lib/kreuzberg/types.rb +414 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +102 -0
- data/lib/kreuzberg_rb.so +0 -0
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +1337 -0
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +677 -0
- data/spec/binding/batch_spec.rb +360 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +85 -0
- data/spec/binding/cli_spec.rb +55 -0
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -0
- data/spec/binding/config_validation_spec.rb +377 -0
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -0
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +732 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1253 -0
- data/spec/binding/pages_extraction_spec.rb +550 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +273 -0
- data/spec/binding/tables_spec.rb +650 -0
- data/spec/fixtures/config.toml +38 -0
- data/spec/fixtures/config.yaml +41 -0
- data/spec/fixtures/invalid_config.toml +3 -0
- data/spec/serialization_spec.rb +134 -0
- data/spec/smoke/package_spec.rb +177 -0
- data/spec/spec_helper.rb +40 -0
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +434 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- metadata +292 -0
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Specs for Kreuzberg::Config::ImagePreprocessing. They pin the default
# attribute values, the coercion expected of constructor arguments, the keys
# returned by #to_h, and the rejection of an unknown binarization method.
RSpec.describe Kreuzberg::Config::ImagePreprocessing do
  describe '#initialize' do
    it 'creates config with default values' do
      defaults = described_class.new

      expect(defaults.target_dpi).to eq(300)
      expect(defaults.auto_rotate).to be(true)
      expect(defaults.deskew).to be(true)
      expect(defaults.denoise).to be(false)
      expect(defaults.contrast_enhance).to be(true)
      expect(defaults.binarization_method).to eq('otsu')
      expect(defaults.invert_colors).to be(false)
    end

    it 'creates config with custom values' do
      overrides = {
        target_dpi: 600,
        auto_rotate: false,
        deskew: false,
        denoise: true,
        contrast_enhance: false,
        binarization_method: 'sauvola',
        invert_colors: true
      }
      customized = described_class.new(**overrides)

      expect(customized.target_dpi).to eq(600)
      expect(customized.auto_rotate).to be(false)
      expect(customized.deskew).to be(false)
      expect(customized.denoise).to be(true)
      expect(customized.contrast_enhance).to be(false)
      expect(customized.binarization_method).to eq('sauvola')
      expect(customized.invert_colors).to be(true)
    end

    it 'converts target_dpi to integer' do
      dpi = described_class.new(target_dpi: '300').target_dpi

      expect(dpi).to eq(300)
      expect(dpi).to be_an(Integer)
    end

    it 'converts binarization_method to string' do
      coerced = described_class.new(binarization_method: :niblack)

      expect(coerced.binarization_method).to eq('niblack')
      expect(coerced.binarization_method).to be_a(String)
    end
  end

  describe '#to_h' do
    it 'serializes to hash with all values' do
      serialized = described_class.new(target_dpi: 300, denoise: true).to_h

      expect(serialized).to be_a(Hash)
      expect(serialized[:target_dpi]).to eq(300)
      expect(serialized[:denoise]).to be(true)
      # Not passed to the constructor, so these two are the defaults.
      expect(serialized[:auto_rotate]).to be(true)
      expect(serialized[:binarization_method]).to eq('otsu')
    end

    it 'always includes all keys in hash' do
      serialized = described_class.new.to_h
      expected_keys = %i[
        target_dpi
        auto_rotate
        deskew
        denoise
        contrast_enhance
        binarization_method
        invert_colors
      ]

      expect(serialized.keys).to match_array(expected_keys)
    end
  end

  describe 'validation' do
    it 'rejects invalid binarization method' do
      expect { described_class.new(binarization_method: 'invalid_method') }
        .to raise_error(ArgumentError, /Invalid binarization_method/)
    end

    it 'accepts all valid binarization methods' do
      %w[otsu sauvola niblack wolf bradley adaptive].each do |name|
        expect { described_class.new(binarization_method: name) }.not_to raise_error
      end
    end

    it 'accepts binarization method as symbol' do
      expect { described_class.new(binarization_method: :sauvola) }.not_to raise_error
    end
  end

  describe 'keyword arguments' do
    it 'accepts all keyword arguments' do
      full_set = {
        target_dpi: 600,
        auto_rotate: true,
        deskew: false,
        denoise: true,
        contrast_enhance: false,
        binarization_method: 'bradley',
        invert_colors: true
      }
      customized = described_class.new(**full_set)

      expect(customized.target_dpi).to eq(600)
      expect(customized.auto_rotate).to be(true)
      expect(customized.deskew).to be(false)
      expect(customized.denoise).to be(true)
      expect(customized.contrast_enhance).to be(false)
      expect(customized.binarization_method).to eq('bradley')
      expect(customized.invert_colors).to be(true)
    end
  end

  # NOTE(review): these examples compare individual attribute readers; they do
  # not call == on the config objects themselves.
  describe 'equality' do
    it 'compares configs by value' do
      shared = { target_dpi: 300, binarization_method: 'otsu', denoise: true }
      left = described_class.new(**shared)
      right = described_class.new(**shared)

      expect(left.target_dpi).to eq(right.target_dpi)
      expect(left.binarization_method).to eq(right.binarization_method)
      expect(left.denoise).to eq(right.denoise)
    end

    it 'detects differences in target_dpi' do
      left = described_class.new(target_dpi: 300)
      right = described_class.new(target_dpi: 600)

      expect(left.target_dpi).not_to eq(right.target_dpi)
    end

    it 'detects differences in binarization_method' do
      left = described_class.new(binarization_method: 'otsu')
      right = described_class.new(binarization_method: 'sauvola')

      expect(left.binarization_method).not_to eq(right.binarization_method)
    end
  end

  describe 'nested config integration' do
    it 'can be nested in Tesseract config' do
      nested = described_class.new(denoise: true)
      tesseract = Kreuzberg::Config::Tesseract.new(preprocessing: nested)
      stored = tesseract.options[:preprocessing]

      expect(stored).to be_a(described_class)
      expect(stored.denoise).to be(true)
    end
  end

  describe 'symbol vs string key handling' do
    it 'converts symbol binarization method to string' do
      coerced = described_class.new(binarization_method: :bradley)

      expect(coerced.binarization_method).to eq('bradley')
      expect(coerced.binarization_method).to be_a(String)
    end

    it 'converts string target_dpi to integer' do
      dpi = described_class.new(target_dpi: '600').target_dpi

      expect(dpi).to eq(600)
      expect(dpi).to be_an(Integer)
    end
  end

  describe 'boolean conversion' do
    it 'converts truthy values to boolean' do
      # A non-boolean Integer and String are passed on purpose; the readers
      # are expected to return the literal `true`.
      coerced = described_class.new(auto_rotate: 1, deskew: 'yes', denoise: true)

      expect(coerced.auto_rotate).to be(true)
      expect(coerced.deskew).to be(true)
      expect(coerced.denoise).to be(true)
    end

    it 'converts false values to boolean' do
      coerced = described_class.new(auto_rotate: false, deskew: false, denoise: false)

      expect(coerced.auto_rotate).to be(false)
      expect(coerced.deskew).to be(false)
      expect(coerced.denoise).to be(false)
    end
  end

  describe 'DPI configuration' do
    it 'accepts realistic DPI values' do
      expect(described_class.new(target_dpi: 300).target_dpi).to eq(300)
    end

    it 'accepts high DPI values' do
      expect(described_class.new(target_dpi: 1200).target_dpi).to eq(1200)
    end

    it 'accepts low DPI values' do
      expect(described_class.new(target_dpi: 72).target_dpi).to eq(72)
    end
  end
end
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Specs for Kreuzberg::Config::Keywords. They pin the nil defaults, the
# wrapping of YAKE/RAKE parameter hashes into their config classes, the
# coercion expected of scalar arguments, and the nil-free output of #to_h.
RSpec.describe Kreuzberg::Config::Keywords do
  describe '#initialize' do
    it 'creates config with default values' do
      defaults = described_class.new

      expect(defaults.algorithm).to be_nil
      expect(defaults.max_keywords).to be_nil
      expect(defaults.min_score).to be_nil
      expect(defaults.ngram_range).to be_nil
      expect(defaults.language).to be_nil
      expect(defaults.yake_params).to be_nil
      expect(defaults.rake_params).to be_nil
    end

    it 'creates config with custom values' do
      overrides = {
        algorithm: 'yake',
        max_keywords: 10,
        min_score: 0.5,
        ngram_range: [1, 3],
        language: 'en'
      }
      customized = described_class.new(**overrides)

      expect(customized.algorithm).to eq('yake')
      expect(customized.max_keywords).to eq(10)
      expect(customized.min_score).to eq(0.5)
      expect(customized.ngram_range).to eq([1, 3])
      expect(customized.language).to eq('en')
    end

    it 'accepts yake_params as instance' do
      params = Kreuzberg::Config::KeywordYakeParams.new(window_size: 3)
      wrapped = described_class.new(yake_params: params)

      expect(wrapped.yake_params).to be_a(Kreuzberg::Config::KeywordYakeParams)
      expect(wrapped.yake_params.window_size).to eq(3)
    end

    it 'converts yake_params hash to instance' do
      wrapped = described_class.new(yake_params: { window_size: 2 })

      expect(wrapped.yake_params).to be_a(Kreuzberg::Config::KeywordYakeParams)
      expect(wrapped.yake_params.window_size).to eq(2)
    end

    it 'accepts rake_params as instance' do
      params = Kreuzberg::Config::KeywordRakeParams.new(min_word_length: 3)
      wrapped = described_class.new(rake_params: params)

      expect(wrapped.rake_params).to be_a(Kreuzberg::Config::KeywordRakeParams)
    end

    it 'converts rake_params hash to instance' do
      wrapped = described_class.new(rake_params: { min_word_length: 2 })

      expect(wrapped.rake_params).to be_a(Kreuzberg::Config::KeywordRakeParams)
      expect(wrapped.rake_params.min_word_length).to eq(2)
    end
  end

  describe '#to_h' do
    it 'serializes to hash' do
      serialized = described_class.new(algorithm: 'yake', max_keywords: 10).to_h

      expect(serialized).to be_a(Hash)
      expect(serialized[:algorithm]).to eq('yake')
      expect(serialized[:max_keywords]).to eq(10)
    end

    it 'includes nested params in hash' do
      serialized = described_class.new(algorithm: 'yake', yake_params: { window_size: 3 }).to_h

      # The nested params object is expected to be serialized as a plain Hash.
      expect(serialized[:yake_params]).to be_a(Hash)
      expect(serialized[:yake_params][:window_size]).to eq(3)
    end

    it 'compacts nil values from hash' do
      serialized = described_class.new(algorithm: 'rake').to_h

      expect(serialized.key?(:max_keywords)).to be(false)
      expect(serialized.key?(:yake_params)).to be(false)
    end
  end

  describe 'validation' do
    it 'accepts valid algorithm names' do
      expect { described_class.new(algorithm: 'yake') }.not_to raise_error
    end

    it 'accepts valid max_keywords' do
      expect { described_class.new(max_keywords: 20) }.not_to raise_error
    end

    it 'raises error for invalid yake_params type' do
      expect { described_class.new(yake_params: 'invalid') }
        .to raise_error(ArgumentError, /Expected.*KeywordYakeParams.*Hash.*nil/)
    end

    it 'raises error for invalid rake_params type' do
      expect { described_class.new(rake_params: 'invalid') }
        .to raise_error(ArgumentError, /Expected.*KeywordRakeParams.*Hash.*nil/)
    end
  end

  describe 'keyword arguments' do
    it 'accepts all keyword arguments' do
      full_set = {
        algorithm: 'yake',
        max_keywords: 15,
        min_score: 0.7,
        ngram_range: [1, 2],
        language: 'fr',
        yake_params: { window_size: 3 }
      }
      customized = described_class.new(**full_set)

      expect(customized.algorithm).to eq('yake')
      expect(customized.max_keywords).to eq(15)
      expect(customized.min_score).to eq(0.7)
      expect(customized.ngram_range).to eq([1, 2])
      expect(customized.language).to eq('fr')
      expect(customized.yake_params).to be_a(Kreuzberg::Config::KeywordYakeParams)
    end
  end

  # NOTE(review): these examples compare individual attribute readers; they do
  # not call == on the config objects themselves.
  describe 'equality' do
    it 'compares configs by value' do
      shared = { algorithm: 'yake', max_keywords: 10 }
      left = described_class.new(**shared)
      right = described_class.new(**shared)

      expect(left.algorithm).to eq(right.algorithm)
      expect(left.max_keywords).to eq(right.max_keywords)
    end

    it 'detects differences in algorithm' do
      left = described_class.new(algorithm: 'yake')
      right = described_class.new(algorithm: 'rake')

      expect(left.algorithm).not_to eq(right.algorithm)
    end

    it 'detects differences in max_keywords' do
      left = described_class.new(max_keywords: 10)
      right = described_class.new(max_keywords: 20)

      expect(left.max_keywords).not_to eq(right.max_keywords)
    end
  end

  describe 'nested config integration' do
    it 'can be nested in Extraction config' do
      nested = described_class.new(algorithm: 'yake', max_keywords: 15)
      stored = Kreuzberg::Config::Extraction.new(keywords: nested).keywords

      expect(stored).to be_a(described_class)
      expect(stored.algorithm).to eq('yake')
      expect(stored.max_keywords).to eq(15)
    end

    it 'accepts hash in Extraction config' do
      extraction = Kreuzberg::Config::Extraction.new(keywords: { algorithm: 'rake', max_keywords: 10 })
      stored = extraction.keywords

      expect(stored).to be_a(described_class)
      expect(stored.algorithm).to eq('rake')
      expect(stored.max_keywords).to eq(10)
    end
  end

  describe 'symbol vs string key handling' do
    it 'converts symbol algorithm to string' do
      coerced = described_class.new(algorithm: :yake)

      expect(coerced.algorithm).to eq('yake')
      expect(coerced.algorithm).to be_a(String)
    end

    it 'converts symbol language to string' do
      coerced = described_class.new(language: :eng)

      expect(coerced.language).to eq('eng')
      expect(coerced.language).to be_a(String)
    end

    it 'converts ngram_range values to integers' do
      range = described_class.new(ngram_range: %w[1 3]).ngram_range

      expect(range).to eq([1, 3])
      expect(range.all? { |bound| bound.is_a?(Integer) }).to be(true)
    end
  end

  describe 'parameter conversions' do
    it 'converts max_keywords to integer' do
      limit = described_class.new(max_keywords: '20').max_keywords

      expect(limit).to eq(20)
      expect(limit).to be_an(Integer)
    end

    it 'converts min_score to float' do
      score = described_class.new(min_score: '0.75').min_score

      expect(score).to eq(0.75)
      expect(score).to be_a(Float)
    end

    it 'converts ngram_range to array of integers' do
      range = described_class.new(ngram_range: [1, 2]).ngram_range

      expect(range).to eq([1, 2])
      expect(range.all? { |bound| bound.is_a?(Integer) }).to be(true)
    end
  end
end
|
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Specs for Kreuzberg::Config::LanguageDetection. They pin the default
# attribute values, the boolean/float coercion expected of constructor
# arguments, the keys returned by #to_h, and nesting inside
# Kreuzberg::Config::Extraction.
RSpec.describe Kreuzberg::Config::LanguageDetection do
  describe '#initialize' do
    it 'creates config with default values' do
      defaults = described_class.new

      expect(defaults.enabled).to be(false)
      expect(defaults.min_confidence).to eq(0.5)
      expect(defaults.detect_multiple).to be(false)
    end

    it 'creates config with custom values' do
      customized = described_class.new(enabled: true, min_confidence: 0.9, detect_multiple: true)

      expect(customized.enabled).to be(true)
      expect(customized.min_confidence).to eq(0.9)
      expect(customized.detect_multiple).to be(true)
    end

    it 'converts enabled to boolean' do
      coerced = described_class.new(enabled: 1)

      expect(coerced.enabled).to be(true)
      expect(coerced.enabled).to be_a(TrueClass)
    end

    it 'converts min_confidence to float' do
      confidence = described_class.new(min_confidence: '0.75').min_confidence

      expect(confidence).to eq(0.75)
      expect(confidence).to be_a(Float)
    end

    it 'converts detect_multiple to boolean' do
      coerced = described_class.new(detect_multiple: 'yes')

      expect(coerced.detect_multiple).to be(true)
    end
  end

  describe '#to_h' do
    it 'serializes to hash with all values' do
      serialized = described_class.new(enabled: true, min_confidence: 0.8, detect_multiple: true).to_h

      expect(serialized).to be_a(Hash)
      expect(serialized[:enabled]).to be(true)
      expect(serialized[:min_confidence]).to eq(0.8)
      expect(serialized[:detect_multiple]).to be(true)
    end

    it 'always includes all keys in hash' do
      serialized = described_class.new.to_h

      expect(serialized.keys).to match_array(%i[enabled min_confidence detect_multiple])
    end
  end

  describe 'validation' do
    it 'accepts confidence value of 0.5' do
      expect { described_class.new(min_confidence: 0.5) }.not_to raise_error
    end

    it 'accepts confidence value of 0.0' do
      expect { described_class.new(min_confidence: 0.0) }.not_to raise_error
    end

    it 'accepts confidence value of 1.0' do
      expect { described_class.new(min_confidence: 1.0) }.not_to raise_error
    end

    it 'accepts boolean enabled' do
      expect { described_class.new(enabled: true) }.not_to raise_error
    end
  end

  describe 'keyword arguments' do
    it 'accepts all keyword arguments' do
      customized = described_class.new(enabled: true, min_confidence: 0.85, detect_multiple: true)

      expect(customized.enabled).to be(true)
      expect(customized.min_confidence).to eq(0.85)
      expect(customized.detect_multiple).to be(true)
    end
  end

  # NOTE(review): these examples compare individual attribute readers; they do
  # not call == on the config objects themselves.
  describe 'equality' do
    it 'compares configs by value' do
      shared = { enabled: true, min_confidence: 0.8 }
      left = described_class.new(**shared)
      right = described_class.new(**shared)

      expect(left.enabled).to eq(right.enabled)
      expect(left.min_confidence).to eq(right.min_confidence)
    end

    it 'detects differences in enabled' do
      left = described_class.new(enabled: true)
      right = described_class.new(enabled: false)

      expect(left.enabled).not_to eq(right.enabled)
    end

    it 'detects differences in min_confidence' do
      left = described_class.new(min_confidence: 0.5)
      right = described_class.new(min_confidence: 0.9)

      expect(left.min_confidence).not_to eq(right.min_confidence)
    end

    it 'detects differences in detect_multiple' do
      left = described_class.new(detect_multiple: true)
      right = described_class.new(detect_multiple: false)

      expect(left.detect_multiple).not_to eq(right.detect_multiple)
    end
  end

  describe 'nested config integration' do
    it 'can be nested in Extraction config' do
      nested = described_class.new(enabled: true, min_confidence: 0.9)
      stored = Kreuzberg::Config::Extraction.new(language_detection: nested).language_detection

      expect(stored).to be_a(described_class)
      expect(stored.enabled).to be(true)
      expect(stored.min_confidence).to eq(0.9)
    end

    it 'accepts hash in Extraction config' do
      extraction = Kreuzberg::Config::Extraction.new(
        language_detection: { enabled: true, min_confidence: 0.75 }
      )
      stored = extraction.language_detection

      expect(stored).to be_a(described_class)
      expect(stored.enabled).to be(true)
      expect(stored.min_confidence).to eq(0.75)
    end
  end

  describe 'symbol vs string key handling' do
    # Previously described as 'accepts symbol and string enabled values', but
    # the example only ever passes the literal `true`, so the description now
    # states what is actually exercised.
    it 'accepts a boolean enabled value' do
      toggled = described_class.new(enabled: true)

      expect(toggled.enabled).to be(true)
    end

    it 'converts min_confidence string to float' do
      confidence = described_class.new(min_confidence: '0.95').min_confidence

      expect(confidence).to eq(0.95)
      expect(confidence).to be_a(Float)
    end
  end

  describe 'boolean conversion' do
    it 'converts truthy enabled to true' do
      expect(described_class.new(enabled: 1).enabled).to be(true)
    end

    it 'converts false enabled to false' do
      expect(described_class.new(enabled: false).enabled).to be(false)
    end

    it 'converts truthy detect_multiple to true' do
      expect(described_class.new(detect_multiple: 'yes').detect_multiple).to be(true)
    end

    it 'converts false detect_multiple to false' do
      expect(described_class.new(detect_multiple: false).detect_multiple).to be(false)
    end
  end

  describe 'confidence range' do
    it 'accepts minimum confidence value' do
      expect(described_class.new(min_confidence: 0.0).min_confidence).to eq(0.0)
    end

    it 'accepts maximum confidence value' do
      expect(described_class.new(min_confidence: 1.0).min_confidence).to eq(1.0)
    end

    it 'accepts mid-range confidence value' do
      expect(described_class.new(min_confidence: 0.6).min_confidence).to eq(0.6)
    end

    it 'preserves high precision confidence values' do
      precise = described_class.new(min_confidence: 0.123456)

      # Tolerance-based comparison rather than exact Float equality.
      expect(precise.min_confidence).to be_within(0.00001).of(0.123456)
    end
  end

  describe 'multiple language detection' do
    it 'allows enabling multiple language detection' do
      expect(described_class.new(detect_multiple: true).detect_multiple).to be(true)
    end

    it 'defaults to single language detection' do
      expect(described_class.new.detect_multiple).to be(false)
    end

    it 'can be disabled when enabled is true' do
      single_language = described_class.new(enabled: true, detect_multiple: false)

      expect(single_language.enabled).to be(true)
      expect(single_language.detect_multiple).to be(false)
    end
  end
end
|