kreuzberg 4.3.5-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +1 -0
- data/.rubocop.yml +543 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +260 -0
- data/README.md +399 -0
- data/Rakefile +34 -0
- data/Steepfile +51 -0
- data/examples/async_patterns.rb +283 -0
- data/extconf.rb +60 -0
- data/kreuzberg.gemspec +253 -0
- data/lib/kreuzberg/api_proxy.rb +125 -0
- data/lib/kreuzberg/cache_api.rb +67 -0
- data/lib/kreuzberg/cli.rb +57 -0
- data/lib/kreuzberg/cli_proxy.rb +118 -0
- data/lib/kreuzberg/config.rb +1241 -0
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/document_structure.rb +204 -0
- data/lib/kreuzberg/error_context.rb +136 -0
- data/lib/kreuzberg/errors.rb +116 -0
- data/lib/kreuzberg/extraction_api.rb +329 -0
- data/lib/kreuzberg/mcp_proxy.rb +176 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
- data/lib/kreuzberg/post_processor_protocol.rb +15 -0
- data/lib/kreuzberg/result.rb +712 -0
- data/lib/kreuzberg/setup_lib_path.rb +99 -0
- data/lib/kreuzberg/types.rb +414 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +102 -0
- data/lib/kreuzberg_rb.so +0 -0
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +1337 -0
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +677 -0
- data/spec/binding/batch_spec.rb +360 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +85 -0
- data/spec/binding/cli_spec.rb +55 -0
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -0
- data/spec/binding/config_validation_spec.rb +377 -0
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -0
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +732 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1253 -0
- data/spec/binding/pages_extraction_spec.rb +550 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +273 -0
- data/spec/binding/tables_spec.rb +650 -0
- data/spec/fixtures/config.toml +38 -0
- data/spec/fixtures/config.yaml +41 -0
- data/spec/fixtures/invalid_config.toml +3 -0
- data/spec/serialization_spec.rb +134 -0
- data/spec/smoke/package_spec.rb +177 -0
- data/spec/spec_helper.rb +40 -0
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +434 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- metadata +292 -0
|
@@ -0,0 +1,377 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
RSpec.describe 'Configuration Validation' do
|
|
4
|
+
# Specs for Kreuzberg::Config::Extraction: keyword construction, coercion of
# nested config hashes into config objects, type validation of nested configs,
# and the shape of #to_h.
describe Kreuzberg::Config::Extraction do
  it 'accepts all valid parameters' do
    config = described_class.new(
      use_cache: true,
      enable_quality_processing: false,
      force_ocr: false,
      ocr: Kreuzberg::Config::OCR.new,
      chunking: Kreuzberg::Config::Chunking.new,
      language_detection: Kreuzberg::Config::LanguageDetection.new,
      pdf_options: Kreuzberg::Config::PDF.new
    )

    expect(config.use_cache).to be true
    expect(config.enable_quality_processing).to be false
    expect(config.force_ocr).to be false
    expect(config.ocr).to be_a(Kreuzberg::Config::OCR)
    expect(config.chunking).to be_a(Kreuzberg::Config::Chunking)
    expect(config.language_detection).to be_a(Kreuzberg::Config::LanguageDetection)
    expect(config.pdf_options).to be_a(Kreuzberg::Config::PDF)
  end

  # Plain hashes given for nested settings must come back as config objects.
  it 'accepts hashes for nested configs' do
    config = described_class.new(
      ocr: { backend: 'tesseract', language: 'eng' },
      chunking: { max_chars: 500 }
    )

    expect(config.ocr).to be_a(Kreuzberg::Config::OCR)
    expect(config.ocr.backend).to eq('tesseract')
    expect(config.chunking).to be_a(Kreuzberg::Config::Chunking)
    expect(config.chunking.max_chars).to eq(500)
  end

  it 'validates ocr config type' do
    expect do
      described_class.new(ocr: 'invalid')
    end.to raise_error(ArgumentError, /Expected.*OCR/)
  end

  it 'validates chunking config type' do
    expect do
      described_class.new(chunking: 'invalid')
    end.to raise_error(ArgumentError, /Expected.*Chunking/)
  end

  it 'converts to hash correctly' do
    config = described_class.new(
      use_cache: false,
      force_ocr: true
    )
    hash = config.to_h

    expect(hash).to be_a(Hash)
    expect(hash[:use_cache]).to be false
    expect(hash[:force_ocr]).to be true
  end

  it 'omits nil values from hash' do
    hash = described_class.new.to_h

    # Check key absence rather than `hash[:key]` being nil: a lookup returns
    # nil both for a missing key and for a key stored with a nil value, so it
    # cannot prove that the entry was actually omitted.
    expect(hash).not_to have_key(:ocr)
    expect(hash).not_to have_key(:chunking)
  end

  it 'accepts html options hashes' do
    config = described_class.new(html_options: { heading_style: :atx, wrap: true })

    expect(config.html_options).to be_a(Kreuzberg::Config::HtmlOptions)
    expect(config.html_options.to_h[:heading_style]).to eq(:atx)
  end

  it 'accepts keyword configurations' do
    keywords = Kreuzberg::Config::Keywords.new(algorithm: :yake, max_keywords: 5)
    config = described_class.new(keywords: keywords, max_concurrent_extractions: 4)

    expect(config.keywords).to be_a(Kreuzberg::Config::Keywords)
    expect(config.max_concurrent_extractions).to eq(4)
  end
end
|
|
82
|
+
|
|
83
|
+
# Specs for Kreuzberg::Config::OCR: default values, explicit values,
# string coercion of backend/language, and nested Tesseract settings.
describe Kreuzberg::Config::OCR do
  it 'has sensible defaults' do
    ocr = described_class.new

    expect(ocr.backend).to eq('tesseract')
    expect(ocr.language).to eq('eng')
    expect(ocr.tesseract_config).to be_nil
  end

  it 'accepts custom values' do
    ocr = described_class.new(backend: 'easyocr', language: 'deu')

    expect(ocr.backend).to eq('easyocr')
    expect(ocr.language).to eq('deu')
  end

  # A Symbol backend and an Integer language are expected to be read back as Strings.
  it 'coerces types correctly' do
    ocr = described_class.new(backend: :tesseract, language: 123)

    expect(ocr.backend).to eq('tesseract')
    expect(ocr.language).to eq('123')
  end

  it 'accepts tesseract config hashes' do
    tesseract_settings = { psm: 6, enable_table_detection: true }
    ocr = described_class.new(tesseract_config: tesseract_settings)

    expect(ocr.tesseract_config).to be_a(Kreuzberg::Config::Tesseract)
    expect(ocr.tesseract_config.to_h[:psm]).to eq(6)
  end
end
|
|
124
|
+
|
|
125
|
+
# Specs for Kreuzberg::Config::Chunking: default sizes, custom sizes,
# the preset option, and coercion of an embedding hash.
describe Kreuzberg::Config::Chunking do
  it 'has sensible defaults' do
    chunking = described_class.new

    expect(chunking.max_chars).to eq(1000)
    expect(chunking.max_overlap).to eq(200)
    expect(chunking.preset).to be_nil
  end

  it 'accepts custom chunk sizes' do
    chunking = described_class.new(max_chars: 500, max_overlap: 100)

    expect(chunking.max_chars).to eq(500)
    expect(chunking.max_overlap).to eq(100)
  end

  it 'supports different strategies' do
    chunking = described_class.new(preset: 'fast')

    expect(chunking.preset).to eq('fast')
  end

  # An embedding given as a plain hash must be wrapped in a config object.
  it 'accepts embedding configs' do
    chunking = described_class.new(
      embedding: { model: { type: :preset, name: 'quality' }, normalize: false }
    )

    expect(chunking.embedding).to be_a(Kreuzberg::Config::Embedding)
    expect(chunking.embedding.to_h[:model]).to include(type: :preset, name: 'quality')
  end
end
|
|
156
|
+
|
|
157
|
+
# Specs for Kreuzberg::Config::LanguageDetection: defaults, custom
# thresholds, float coercion of min_confidence, and the detect_multiple flag.
describe Kreuzberg::Config::LanguageDetection do
  it 'has sensible defaults' do
    detection = described_class.new

    expect(detection.enabled).to be false
    expect(detection.min_confidence).to eq(0.5)
  end

  it 'accepts custom confidence thresholds' do
    detection = described_class.new(enabled: true, min_confidence: 0.9)

    expect(detection.enabled).to be true
    expect(detection.min_confidence).to eq(0.9)
  end

  # A numeric String threshold is expected to be read back as a Float.
  it 'coerces confidence to float' do
    detection = described_class.new(min_confidence: '0.75')

    expect(detection.min_confidence).to eq(0.75)
  end

  it 'supports detect_multiple flag' do
    detection = described_class.new(detect_multiple: true)

    expect(detection.detect_multiple).to be true
    expect(detection.to_h[:detect_multiple]).to be true
  end
end
|
|
186
|
+
|
|
187
|
+
# Specs for Kreuzberg::Config::PDF: defaults, custom values, and
# normalization of a scalar password into an array of strings.
describe Kreuzberg::Config::PDF do
  it 'has sensible defaults' do
    pdf = described_class.new

    expect(pdf.extract_images).to be false
    expect(pdf.passwords).to be_nil
    expect(pdf.extract_metadata).to be true
  end

  it 'accepts custom values' do
    pdf = described_class.new(extract_images: true, passwords: ['secret123'])

    expect(pdf.extract_images).to be true
    expect(pdf.passwords).to eq(['secret123'])
  end

  # A single Integer password is expected to come back as a one-element String array.
  it 'converts password to string' do
    pdf = described_class.new(passwords: 12_345)

    expect(pdf.passwords).to eq(['12345'])
  end
end
|
|
211
|
+
|
|
212
|
+
# Specs for Kreuzberg::Config::HtmlOptions: serialization of the heading
# style and the nested preprocessing settings.
describe Kreuzberg::Config::HtmlOptions do
  it 'normalizes preprocessing settings' do
    serialized = described_class.new(
      heading_style: :atx_closed,
      preprocessing: { enabled: true, preset: :standard }
    ).to_h

    expect(serialized[:heading_style]).to eq(:atx_closed)
    expect(serialized[:preprocessing]).to include(preset: :standard)
  end
end
|
|
223
|
+
|
|
224
|
+
# Specs for Kreuzberg::Config::Keywords: construction from keyword
# arguments and the resulting #to_h values.
describe Kreuzberg::Config::Keywords do
  it 'accepts hash arguments' do
    keywords = described_class.new(
      algorithm: :yake,
      max_keywords: 10,
      ngram_range: [1, 3],
      yake_params: { window_size: 4 }
    )

    # The Symbol algorithm is expected to be serialized as a String.
    expect(keywords.to_h[:algorithm]).to eq('yake')
    expect(keywords.to_h[:yake_params]).to eq(window_size: 4)
  end
end
|
|
236
|
+
|
|
237
|
+
# Specs for Kreuzberg::Config::ImagePreprocessing: defaults, custom values,
# acceptance/rejection of binarization methods, and #to_h output.
describe Kreuzberg::Config::ImagePreprocessing do
  it 'has sensible defaults' do
    preprocessing = described_class.new

    expect(preprocessing.target_dpi).to eq(300)
    expect(preprocessing.auto_rotate).to be true
    expect(preprocessing.deskew).to be true
    expect(preprocessing.denoise).to be false
    expect(preprocessing.contrast_enhance).to be true
    expect(preprocessing.binarization_method).to eq('otsu')
    expect(preprocessing.invert_colors).to be false
  end

  # Every option is set to the opposite of (or a different value from) its default.
  it 'accepts custom values' do
    preprocessing = described_class.new(
      target_dpi: 600,
      auto_rotate: false,
      deskew: false,
      denoise: true,
      contrast_enhance: false,
      binarization_method: 'sauvola',
      invert_colors: true
    )

    expect(preprocessing.target_dpi).to eq(600)
    expect(preprocessing.auto_rotate).to be false
    expect(preprocessing.deskew).to be false
    expect(preprocessing.denoise).to be true
    expect(preprocessing.contrast_enhance).to be false
    expect(preprocessing.binarization_method).to eq('sauvola')
    expect(preprocessing.invert_colors).to be true
  end

  it 'validates binarization method via FFI' do
    %w[otsu adaptive sauvola].each do |method_name|
      expect { described_class.new(binarization_method: method_name) }.not_to raise_error
    end
  end

  it 'rejects invalid binarization methods' do
    expect { described_class.new(binarization_method: 'invalid_method') }
      .to raise_error(ArgumentError, /Invalid binarization_method/)
  end

  it 'converts to hash correctly' do
    serialized = described_class.new(target_dpi: 500, binarization_method: 'adaptive').to_h

    expect(serialized[:target_dpi]).to eq(500)
    expect(serialized[:binarization_method]).to eq('adaptive')
    # auto_rotate was not passed, so this checks that its default appears in the hash.
    expect(serialized[:auto_rotate]).to be true
  end
end
|
|
291
|
+
|
|
292
|
+
# Specs for Kreuzberg::Config::TokenReduction: defaults, custom values,
# acceptance/rejection of reduction modes, and #to_h output.
describe Kreuzberg::Config::TokenReduction do
  it 'has sensible defaults' do
    reduction = described_class.new

    expect(reduction.mode).to eq('off')
    expect(reduction.preserve_important_words).to be true
  end

  it 'accepts custom values' do
    reduction = described_class.new(mode: 'aggressive', preserve_important_words: false)

    expect(reduction.mode).to eq('aggressive')
    expect(reduction.preserve_important_words).to be false
  end

  it 'validates token reduction levels via FFI' do
    %w[off light moderate aggressive maximum].each do |level|
      expect { described_class.new(mode: level) }.not_to raise_error
    end
  end

  it 'rejects invalid token reduction modes' do
    expect { described_class.new(mode: 'extreme') }
      .to raise_error(ArgumentError, /Invalid token reduction mode/)
  end

  it 'converts to hash correctly' do
    serialized = described_class.new(mode: 'light', preserve_important_words: true).to_h

    expect(serialized[:mode]).to eq('light')
    expect(serialized[:preserve_important_words]).to be true
  end
end
|
|
332
|
+
|
|
333
|
+
# Smoke specs that pass differently configured Extraction objects to
# Kreuzberg.extract_file_sync and check that a Kreuzberg::Result comes back.
describe 'config usage in extraction' do
  # Writes +text+ to a test file, builds an Extraction config from +options+,
  # and returns the result of a synchronous extraction of that file.
  # NOTE(review): create_test_file is defined outside this file (presumably
  # in spec_helper) — confirm where the file is written and cleaned up.
  def extract_with(text, **options)
    path = create_test_file(text)
    config = Kreuzberg::Config::Extraction.new(**options)
    Kreuzberg.extract_file_sync(path: path, config: config)
  end

  it 'works with OCR config' do
    ocr = Kreuzberg::Config::OCR.new(backend: 'tesseract', language: 'eng')

    expect(extract_with('OCR config test', ocr: ocr)).to be_a(Kreuzberg::Result)
  end

  it 'works with chunking config' do
    chunking = Kreuzberg::Config::Chunking.new(max_chars: 50)

    # The text is repeated so the input is much longer than max_chars.
    expect(extract_with('Chunking config test' * 50, chunking: chunking)).to be_a(Kreuzberg::Result)
  end

  it 'works with language detection config' do
    detection = Kreuzberg::Config::LanguageDetection.new(enabled: true)

    expect(extract_with('Language detection test', language_detection: detection))
      .to be_a(Kreuzberg::Result)
  end

  it 'works with combined configs' do
    result = extract_with(
      'Combined config test',
      use_cache: false,
      force_ocr: false,
      ocr: { backend: 'tesseract', language: 'eng' },
      language_detection: { enabled: false }
    )

    expect(result).to be_a(Kreuzberg::Result)
  end
end
|
|
377
|
+
end
|