kreuzberg 4.3.5-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +1 -0
- data/.rubocop.yml +543 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +260 -0
- data/README.md +399 -0
- data/Rakefile +34 -0
- data/Steepfile +51 -0
- data/examples/async_patterns.rb +283 -0
- data/extconf.rb +60 -0
- data/kreuzberg.gemspec +253 -0
- data/lib/kreuzberg/api_proxy.rb +125 -0
- data/lib/kreuzberg/cache_api.rb +67 -0
- data/lib/kreuzberg/cli.rb +57 -0
- data/lib/kreuzberg/cli_proxy.rb +118 -0
- data/lib/kreuzberg/config.rb +1241 -0
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/document_structure.rb +204 -0
- data/lib/kreuzberg/error_context.rb +136 -0
- data/lib/kreuzberg/errors.rb +116 -0
- data/lib/kreuzberg/extraction_api.rb +329 -0
- data/lib/kreuzberg/mcp_proxy.rb +176 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
- data/lib/kreuzberg/post_processor_protocol.rb +15 -0
- data/lib/kreuzberg/result.rb +712 -0
- data/lib/kreuzberg/setup_lib_path.rb +99 -0
- data/lib/kreuzberg/types.rb +414 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +102 -0
- data/lib/kreuzberg_rb.so +0 -0
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +1337 -0
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +677 -0
- data/spec/binding/batch_spec.rb +360 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +85 -0
- data/spec/binding/cli_spec.rb +55 -0
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -0
- data/spec/binding/config_validation_spec.rb +377 -0
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -0
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +732 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1253 -0
- data/spec/binding/pages_extraction_spec.rb +550 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +273 -0
- data/spec/binding/tables_spec.rb +650 -0
- data/spec/fixtures/config.toml +38 -0
- data/spec/fixtures/config.yaml +41 -0
- data/spec/fixtures/invalid_config.toml +3 -0
- data/spec/serialization_spec.rb +134 -0
- data/spec/smoke/package_spec.rb +177 -0
- data/spec/spec_helper.rb +40 -0
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +434 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- metadata +292 -0
|
@@ -0,0 +1,434 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
RSpec.describe Kreuzberg::Config::Extraction do
|
|
4
|
+
describe '#initialize' do
  # With no arguments the three boolean flags take their defaults and every
  # optional sub-config (plus max_concurrent_extractions) is left unset.
  it 'creates config with default values' do
    defaults = described_class.new

    expect(defaults.use_cache).to be(true)
    expect(defaults.enable_quality_processing).to be(true)
    expect(defaults.force_ocr).to be(false)

    unset_fields = %i[
      ocr chunking language_detection pdf_options image_extraction postprocessor
      token_reduction keywords html_options pages max_concurrent_extractions
    ]
    unset_fields.each do |field|
      expect(defaults.public_send(field)).to be_nil
    end
  end

  # Explicit boolean flags are reflected by the matching readers.
  it 'creates config with custom boolean values' do
    flags = { use_cache: false, enable_quality_processing: true, force_ocr: true }
    customised = described_class.new(**flags)

    expect(customised.use_cache).to be(false)
    expect(customised.enable_quality_processing).to be(true)
    expect(customised.force_ocr).to be(true)
  end

  # Pre-built sub-config objects are stored as the very same instances.
  it 'accepts all nested config instances' do
    ocr_config = Kreuzberg::Config::OCR.new(backend: 'tesseract')
    chunking_config = Kreuzberg::Config::Chunking.new(max_chars: 500)
    language_config = Kreuzberg::Config::LanguageDetection.new(enabled: true)

    extraction = described_class.new(
      ocr: ocr_config,
      chunking: chunking_config,
      language_detection: language_config
    )

    expect(extraction.ocr).to be(ocr_config)
    expect(extraction.chunking).to be(chunking_config)
    expect(extraction.language_detection).to be(language_config)
  end

  # Plain hashes given for sub-configs come back as typed config objects.
  it 'converts nested config hashes to instances' do
    ocr_settings = { backend: 'easyocr', language: 'fra' }
    chunking_settings = { max_chars: 750 }
    extraction = described_class.new(ocr: ocr_settings, chunking: chunking_settings)

    expect(extraction.ocr).to be_a(Kreuzberg::Config::OCR)
    expect(extraction.ocr.backend).to eq('easyocr')
    expect(extraction.chunking).to be_a(Kreuzberg::Config::Chunking)
    expect(extraction.chunking.max_chars).to eq(750)
  end

  # A numeric string for max_concurrent_extractions is exposed as an Integer.
  it 'converts max_concurrent_extractions to integer' do
    extraction = described_class.new(max_concurrent_extractions: '4')

    expect(extraction.max_concurrent_extractions).to eq(4)
    expect(extraction.max_concurrent_extractions).to be_an(Integer)
  end
end
|
|
71
|
+
|
|
72
|
+
describe '#to_h' do
  # The serialized form is a Hash keyed by symbols.
  it 'serializes to hash' do
    serialized = described_class.new(use_cache: true).to_h

    expect(serialized).to be_a(Hash)
    expect(serialized[:use_cache]).to be(true)
  end

  # Sub-configs that were provided are serialized as nested hashes.
  it 'includes all nested configs in hash' do
    extraction = described_class.new(
      ocr: { backend: 'tesseract' },
      chunking: { max_chars: 500 }
    )
    serialized = extraction.to_h

    %i[ocr chunking].each do |section|
      expect(serialized[section]).to be_a(Hash)
    end
  end

  # Sub-configs that were never set do not appear as keys at all.
  it 'compacts nil nested configs from hash' do
    serialized = described_class.new(use_cache: true).to_h

    %i[ocr chunking].each do |section|
      expect(serialized.key?(section)).to be(false)
    end
  end

  # The boolean flags are present even when left at their defaults.
  it 'always includes top-level boolean values' do
    serialized = described_class.new.to_h

    expect(serialized[:use_cache]).to be(true)
    expect(serialized[:enable_quality_processing]).to be(true)
    expect(serialized[:force_ocr]).to be(false)
  end
end
|
|
109
|
+
|
|
110
|
+
describe '#to_json' do
  # The output is a String that parses back to the configured flag values.
  it 'serializes to JSON string' do
    payload = described_class.new(use_cache: true, force_ocr: false).to_json

    expect(payload).to be_a(String)

    decoded = JSON.parse(payload)
    expect(decoded['use_cache']).to be(true)
    expect(decoded['force_ocr']).to be(false)
  end

  # Nested sub-configs appear as nested JSON objects.
  it 'handles nested configs in JSON' do
    payload = described_class.new(ocr: { backend: 'tesseract' }).to_json
    decoded = JSON.parse(payload)

    expect(decoded['ocr']['backend']).to eq('tesseract')
  end
end
|
|
129
|
+
|
|
130
|
+
describe '#get_field' do
  # A bare field name reads the top-level value.
  it 'retrieves top-level field' do
    extraction = described_class.new(use_cache: false)

    expect(extraction.get_field('use_cache')).to be(false)
  end

  # A dotted path reaches into a sub-config.
  it 'retrieves nested field with dot notation' do
    extraction = described_class.new(ocr: { backend: 'tesseract' })

    expect(extraction.get_field('ocr.backend')).to eq('tesseract')
  end

  # Unknown names yield nil.
  it 'returns nil for non-existent field' do
    extraction = described_class.new

    expect(extraction.get_field('nonexistent')).to be_nil
  end

  # The field name may be given as a Symbol.
  it 'accepts symbol field names' do
    extraction = described_class.new(use_cache: true)

    expect(extraction.get_field(:use_cache)).to be(true)
  end

  # A three-segment path resolves to the model description, returned as a Hash.
  it 'handles deeply nested fields' do
    model_spec = { type: :preset, name: 'fast' }
    extraction = described_class.new(chunking: { embedding: { model: model_spec } })

    expect(extraction.get_field('chunking.embedding.model')).to be_a(Hash)
  end
end
|
|
163
|
+
|
|
164
|
+
describe '#merge' do
  # Values from the argument win; values it does not change are kept.
  it 'merges two configs' do
    base = described_class.new(use_cache: true, force_ocr: false)
    override = described_class.new(force_ocr: true)
    merged = base.merge(override)

    expect(merged.use_cache).to be true
    expect(merged.force_ocr).to be true
  end

  # The receiver keeps its own value; only the returned config changes.
  it 'returns new config without modifying original' do
    base = described_class.new(use_cache: true)
    override = described_class.new(use_cache: false)
    merged = base.merge(override)

    expect(base.use_cache).to be true
    expect(merged.use_cache).to be false
  end

  it 'merges nested configs' do
    base = described_class.new(ocr: { backend: 'tesseract' })
    override = described_class.new(ocr: { language: 'fra' })
    merged = base.merge(override)

    expect(merged.ocr.backend).to eq 'tesseract'
    # The backend check alone would still pass if the override were dropped,
    # so also pin the value that only the override supplies.
    expect(merged.ocr.language).to eq 'fra'
  end

  # A plain Hash is accepted in place of a config object.
  it 'accepts hash as merge argument' do
    base = described_class.new(use_cache: true)
    merged = base.merge({ use_cache: false })

    expect(merged.use_cache).to be false
  end
end
|
|
198
|
+
|
|
199
|
+
describe '#merge!' do
  # The receiver itself is updated and is also the return value.
  it 'mutates config in-place' do
    config = described_class.new(use_cache: true, force_ocr: false)
    override = described_class.new(force_ocr: true)
    result = config.merge!(override)

    expect(config.force_ocr).to be true
    expect(result).to be config
  end

  it 'returns self' do
    config = described_class.new
    override = described_class.new

    expect(config.merge!(override)).to be config
  end

  it 'accepts hash argument' do
    config = described_class.new(use_cache: true)
    # Actually exercise merge! with a Hash; the previous body only used `[]=`
    # and never called the method under test. The hash goes through a local
    # rather than an inline literal because a literal argument is presumably
    # what RuboCop's Performance/RedundantMerge autocorrect rewrote into `[]=`
    # assignments in the first place (NOTE(review): confirm against history).
    overrides = { use_cache: false, force_ocr: true }
    config.merge!(overrides)

    expect(config.use_cache).to be false
    expect(config.force_ocr).to be true
  end
end
|
|
225
|
+
|
|
226
|
+
describe 'validation' do
  # A String is not an acceptable value for the ocr sub-config.
  it 'rejects invalid ocr type' do
    expect { described_class.new(ocr: 'invalid') }
      .to raise_error(ArgumentError, /Expected.*OCR/)
  end

  # An Integer is not an acceptable value for the chunking sub-config.
  it 'rejects invalid chunking type' do
    expect { described_class.new(chunking: 123) }
      .to raise_error(ArgumentError, /Expected.*Chunking/)
  end

  # Default-constructed sub-config objects are accepted without error.
  it 'accepts valid nested instances' do
    build = lambda do
      described_class.new(
        ocr: Kreuzberg::Config::OCR.new,
        chunking: Kreuzberg::Config::Chunking.new
      )
    end

    expect(&build).not_to raise_error
  end
end
|
|
248
|
+
|
|
249
|
+
describe 'keyword arguments' do
  # Every supported keyword can be passed in a single constructor call.
  it 'accepts all keyword arguments' do
    options = {
      use_cache: false,
      enable_quality_processing: true,
      force_ocr: true,
      ocr: { backend: 'tesseract' },
      chunking: { max_chars: 500 },
      language_detection: { enabled: true },
      pdf_options: { extract_images: true },
      image_extraction: { target_dpi: 600 },
      postprocessor: { enabled: true },
      token_reduction: { mode: 'light' },
      keywords: { algorithm: 'yake' },
      pages: { extract_pages: true },
      max_concurrent_extractions: 4
    }
    extraction = described_class.new(**options)

    expect(extraction.use_cache).to be(false)
    expect(extraction.enable_quality_processing).to be(true)
    expect(extraction.force_ocr).to be(true)
    expect(extraction.ocr).to be_a(Kreuzberg::Config::OCR)
    expect(extraction.max_concurrent_extractions).to eq(4)
  end
end
|
|
274
|
+
|
|
275
|
+
describe 'equality' do
  # Two configs built from the same arguments expose equal flag values.
  it 'compares configs with same values' do
    left = described_class.new(use_cache: true, force_ocr: false)
    right = described_class.new(use_cache: true, force_ocr: false)

    expect(left.use_cache).to eq(right.use_cache)
    expect(left.force_ocr).to eq(right.force_ocr)
  end

  # Configs built with opposite flags expose different flag values.
  it 'detects differences' do
    cached = described_class.new(use_cache: true)
    uncached = described_class.new(use_cache: false)

    expect(cached.use_cache).not_to eq(uncached.use_cache)
  end
end
|
|
291
|
+
|
|
292
|
+
describe '.from_file' do
  # Both fixture formats load into a config whose use_cache flag is false.
  { 'TOML' => 'config.toml', 'YAML' => 'config.yaml' }.each do |format, fixture_name|
    it "loads from #{format} file" do
      fixture_path = File.join(__dir__, '../../fixtures', fixture_name)
      loaded = described_class.from_file(fixture_path)

      expect(loaded).to be_a(described_class)
      expect(loaded.use_cache).to be(false)
    end
  end

  # A path that does not exist is reported as a validation error.
  it 'raises error for non-existent file' do
    expect { described_class.from_file('/nonexistent/path/config.toml') }
      .to raise_error(Kreuzberg::Errors::ValidationError)
  end
end
|
|
315
|
+
|
|
316
|
+
describe '.discover' do
  # The outcome may vary by environment, so this example only documents the
  # return contract: either nothing was found (nil) or a config is returned.
  # The title states that contract instead of claiming nil is always returned.
  it 'returns a config or nil depending on the environment' do
    config = described_class.discover

    # Compound matcher: on failure RSpec reports the unexpected object rather
    # than "expected true, got false".
    expect(config).to be_nil.or be_a(described_class)
  end
end
|
|
325
|
+
|
|
326
|
+
describe 'boolean conversion' do
  # For each flag: a non-boolean truthy input reads back as true, and a
  # literal false reads back as false.
  truthy_inputs = { use_cache: 1, enable_quality_processing: 'yes', force_ocr: [1] }

  truthy_inputs.each do |flag, truthy_input|
    it "converts truthy #{flag} to true" do
      extraction = described_class.new(flag => truthy_input)

      expect(extraction.public_send(flag)).to be(true)
    end

    it "converts false #{flag} to false" do
      extraction = described_class.new(flag => false)

      expect(extraction.public_send(flag)).to be(false)
    end
  end
end
|
|
363
|
+
|
|
364
|
+
describe 'complex nested configurations' do
  # A hash nested two levels deep is converted at each level.
  it 'handles deeply nested configs' do
    embedding_settings = {
      model: { type: :preset, name: 'balanced' },
      batch_size: 64
    }
    extraction = described_class.new(
      chunking: { max_chars: 750, embedding: embedding_settings }
    )

    embedding = extraction.chunking.embedding
    expect(embedding).to be_a(Kreuzberg::Config::Embedding)
    expect(extraction.chunking.embedding.batch_size).to eq(64)
  end

  # Font and hierarchy hashes inside pdf_options become typed configs.
  it 'handles PDF with font and hierarchy configs' do
    pdf_settings = {
      extract_images: true,
      font_config: { enabled: true, custom_font_dirs: ['/fonts'] },
      hierarchy: { k_clusters: 8 }
    }
    extraction = described_class.new(pdf_options: pdf_settings)

    expect(extraction.pdf_options.font_config).to be_a(Kreuzberg::Config::FontConfig)
    expect(extraction.pdf_options.hierarchy).to be_a(Kreuzberg::Config::Hierarchy)
  end

  # Every section set at once; one representative value per section is checked.
  it 'handles complete extraction config' do
    settings = {
      use_cache: false,
      force_ocr: true,
      ocr: { backend: 'tesseract', language: 'deu' },
      chunking: { max_chars: 500, preset: 'fast' },
      language_detection: { enabled: true, min_confidence: 0.9 },
      pdf_options: { extract_images: true, passwords: ['secret'] },
      image_extraction: { target_dpi: 600 },
      postprocessor: { enabled: true, enabled_processors: %w[quality] },
      token_reduction: { mode: 'light' },
      keywords: { algorithm: 'yake', max_keywords: 10 },
      pages: { extract_pages: true }
    }
    extraction = described_class.new(**settings)

    expect(extraction.use_cache).to be(false)
    expect(extraction.force_ocr).to be(true)
    expect(extraction.ocr.language).to eq('deu')
    expect(extraction.chunking.max_chars).to eq(500)
    expect(extraction.language_detection.enabled).to be(true)
    expect(extraction.pdf_options.extract_images).to be(true)
    expect(extraction.image_extraction.target_dpi).to eq(600)
    expect(extraction.postprocessor.enabled).to be(true)
    expect(extraction.token_reduction.mode).to eq('light')
    expect(extraction.keywords.max_keywords).to eq(10)
    expect(extraction.pages.extract_pages).to be(true)
  end
end
|
|
421
|
+
|
|
422
|
+
describe 'ExtractionConfig alias' do
  # The constant is defined directly on the Kreuzberg module.
  it 'exists as module constant' do
    alias_defined = Kreuzberg.const_defined?(:ExtractionConfig)

    expect(alias_defined).to be(true)
  end

  # Objects built through the alias are instances of the described class.
  it 'can be instantiated through alias' do
    via_alias = Kreuzberg::ExtractionConfig.new(use_cache: false)

    expect(via_alias).to be_a(described_class)
    expect(via_alias.use_cache).to be(false)
  end
end
|
|
434
|
+
end
|
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
RSpec.describe Kreuzberg::Config::FontConfig do
|
|
4
|
+
describe '#initialize' do
  # No arguments: enabled is true and no custom directories are set.
  it 'creates config with default values' do
    font_config = described_class.new

    expect(font_config.enabled).to be(true)
    expect(font_config.custom_font_dirs).to be_nil
  end

  it 'creates config with enabled false' do
    font_config = described_class.new(enabled: false)

    expect(font_config.enabled).to be(false)
  end

  # An array of directories is readable back unchanged.
  it 'creates config with custom_font_dirs' do
    font_dirs = ['/usr/share/fonts', '/home/user/.fonts']
    font_config = described_class.new(custom_font_dirs: font_dirs)

    expect(font_config.custom_font_dirs).to eq(font_dirs)
  end

  # A single String is readable back as that same String value.
  it 'accepts single font directory as string' do
    font_config = described_class.new(custom_font_dirs: '/usr/share/fonts')

    expect(font_config.custom_font_dirs).to eq('/usr/share/fonts')
  end

  it 'accepts multiple directories as array' do
    font_dirs = ['/fonts1', '/fonts2', '/fonts3']
    font_config = described_class.new(custom_font_dirs: font_dirs)

    expect(font_config.custom_font_dirs).to eq(font_dirs)
  end

  # The Integer 1 reads back as true.
  it 'converts enabled to boolean' do
    font_config = described_class.new(enabled: 1)

    expect(font_config.enabled).to be(true)
  end
end
|
|
44
|
+
|
|
45
|
+
describe '#to_h' do
  # The default config serializes to a Hash with enabled set to true.
  it 'serializes to hash with default values' do
    serialized = described_class.new.to_h

    expect(serialized).to be_a(Hash)
    expect(serialized[:enabled]).to be(true)
  end

  it 'includes custom_font_dirs when present' do
    font_dirs = ['/fonts']
    serialized = described_class.new(custom_font_dirs: font_dirs).to_h

    expect(serialized[:custom_font_dirs]).to eq(font_dirs)
  end

  # An unset custom_font_dirs is omitted rather than serialized as nil.
  it 'compacts nil values from hash' do
    serialized = described_class.new(enabled: true).to_h

    expect(serialized.key?(:custom_font_dirs)).to be(false)
  end

  # With both attributes set the hash has exactly those two keys.
  it 'includes both keys when both are present' do
    font_config = described_class.new(enabled: true, custom_font_dirs: ['/fonts'])
    serialized = font_config.to_h

    expect(serialized.keys).to contain_exactly(:enabled, :custom_font_dirs)
  end
end
|
|
79
|
+
|
|
80
|
+
describe 'validation' do
  # Each example only checks that construction does not raise.
  it 'accepts enabled true' do
    expect { described_class.new(enabled: true) }.not_to raise_error
  end

  it 'accepts enabled false' do
    expect { described_class.new(enabled: false) }.not_to raise_error
  end

  it 'accepts custom_font_dirs as string' do
    expect { described_class.new(custom_font_dirs: '/fonts') }.not_to raise_error
  end

  it 'accepts custom_font_dirs as array' do
    expect { described_class.new(custom_font_dirs: ['/fonts1', '/fonts2']) }.not_to raise_error
  end
end
|
|
105
|
+
|
|
106
|
+
describe 'keyword arguments' do
  # Both supported keywords may be given together.
  it 'accepts all keyword arguments' do
    font_dirs = ['/fonts']
    font_config = described_class.new(enabled: false, custom_font_dirs: font_dirs)

    expect(font_config.enabled).to be(false)
    expect(font_config.custom_font_dirs).to eq(font_dirs)
  end
end
|
|
118
|
+
|
|
119
|
+
describe 'equality' do
  # Two configs built from equal arguments expose equal attribute values.
  it 'compares configs by value' do
    # Build each config from fresh literals so the two do not share an array.
    build = -> { described_class.new(enabled: true, custom_font_dirs: ['/fonts']) }
    left = build.call
    right = build.call

    expect(left.enabled).to eq(right.enabled)
    expect(left.custom_font_dirs).to eq(right.custom_font_dirs)
  end

  it 'detects differences in enabled' do
    on = described_class.new(enabled: true)
    off = described_class.new(enabled: false)

    expect(on.enabled).not_to eq(off.enabled)
  end

  it 'detects differences in custom_font_dirs' do
    first = described_class.new(custom_font_dirs: ['/fonts1'])
    second = described_class.new(custom_font_dirs: ['/fonts2'])

    expect(first.custom_font_dirs).not_to eq(second.custom_font_dirs)
  end
end
|
|
148
|
+
|
|
149
|
+
describe 'nested config integration' do
  # A FontConfig object handed to the PDF config is readable through it.
  it 'can be nested in PDF config' do
    fonts = described_class.new(enabled: true, custom_font_dirs: ['/fonts'])
    pdf_config = Kreuzberg::Config::PDF.new(font_config: fonts)

    expect(pdf_config.font_config).to be_a(described_class)
    expect(pdf_config.font_config.enabled).to be(true)
    expect(pdf_config.font_config.custom_font_dirs).to eq(['/fonts'])
  end

  # A plain hash handed to the PDF config comes back as a FontConfig.
  it 'accepts hash in PDF config' do
    font_settings = { enabled: true, custom_font_dirs: ['/fonts'] }
    pdf_config = Kreuzberg::Config::PDF.new(font_config: font_settings)

    expect(pdf_config.font_config).to be_a(described_class)
    expect(pdf_config.font_config.enabled).to be(true)
    expect(pdf_config.font_config.custom_font_dirs).to eq(['/fonts'])
  end

  # The same conversion applies one level further up, via pdf_options.
  it 'can be nested in Extraction config via PDF' do
    pdf_settings = { font_config: { enabled: true } }
    extraction_config = Kreuzberg::Config::Extraction.new(pdf_options: pdf_settings)

    expect(extraction_config.pdf_options.font_config).to be_a(described_class)
    expect(extraction_config.pdf_options.font_config.enabled).to be(true)
  end
end
|
|
178
|
+
|
|
179
|
+
describe 'symbol vs string key handling' do
  # Constructed with the symbol keyword `enabled:`; the reader yields true.
  it 'converts symbol enabled to boolean' do
    font_config = described_class.new(enabled: true)

    expect(font_config.enabled).to be(true)
  end

  # The directory list reads back equal to what was passed in.
  it 'preserves custom_font_dirs as array' do
    font_dirs = ['/fonts1', '/fonts2']
    font_config = described_class.new(custom_font_dirs: font_dirs)

    expect(font_config.custom_font_dirs).to eq(font_dirs)
  end
end
|
|
193
|
+
|
|
194
|
+
describe 'boolean conversion' do
  # Example title => [value passed as `enabled`, value the reader must return].
  conversions = {
    'converts truthy enabled to true' => [1, true],
    'converts false enabled to false' => [false, false],
    'converts string yes to true' => ['yes', true],
    'converts nil to false' => [nil, false]
  }

  conversions.each do |title, (input, expected)|
    it title do
      font_config = described_class.new(enabled: input)

      expect(font_config.enabled).to be(expected)
    end
  end
end
|
|
219
|
+
|
|
220
|
+
describe 'font directory handling' do
  # A lone String path reads back equal to that String.
  it 'stores single directory path as string' do
    font_config = described_class.new(custom_font_dirs: '/usr/share/fonts')

    expect(font_config.custom_font_dirs).to eq('/usr/share/fonts')
  end

  # A list of paths reads back equal to the list and is an Array.
  it 'stores multiple directories as array' do
    font_dirs = ['/fonts1', '/fonts2', '/fonts3']
    font_config = described_class.new(custom_font_dirs: font_dirs)

    expect(font_config.custom_font_dirs).to eq(font_dirs)
    expect(font_config.custom_font_dirs).to be_an(Array)
  end

  it 'preserves exact directory paths' do
    font_dir = '/home/user/.local/share/fonts'
    font_config = described_class.new(custom_font_dirs: font_dir)

    expect(font_config.custom_font_dirs).to eq(font_dir)
  end

  # The list includes a `~`-prefixed entry, which must read back unchanged.
  it 'preserves array of directory paths' do
    font_dirs = ['/usr/share/fonts', '/home/user/.fonts', '~/.local/share/fonts']
    font_config = described_class.new(custom_font_dirs: font_dirs)

    expect(font_config.custom_font_dirs).to eq(font_dirs)
  end
end
|
|
249
|
+
|
|
250
|
+
describe 'mutability' do
  # Both attributes have writers; the new value is visible through the reader.
  it 'allows modification of enabled' do
    font_config = described_class.new(enabled: true)

    font_config.enabled = false

    expect(font_config.enabled).to be(false)
  end

  it 'allows modification of custom_font_dirs' do
    font_config = described_class.new(custom_font_dirs: ['/fonts1'])

    font_config.custom_font_dirs = ['/fonts2']

    expect(font_config.custom_font_dirs).to eq(['/fonts2'])
  end
end
|
|
265
|
+
|
|
266
|
+
describe 'default behavior' do
  # A config built without arguments is enabled.
  it 'defaults to enabled' do
    font_config = described_class.new

    expect(font_config.enabled).to be(true)
  end

  # A config built without arguments has no custom directories.
  it 'defaults to no custom font directories' do
    font_config = described_class.new

    expect(font_config.custom_font_dirs).to be_nil
  end

  # Passing enabled: false turns the flag off.
  it 'allows disabling font support' do
    font_config = described_class.new(enabled: false)

    expect(font_config.enabled).to be(false)
  end
end
|
|
285
|
+
end
|