kreuzberg 4.3.5-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +1 -0
- data/.rubocop.yml +543 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +260 -0
- data/README.md +399 -0
- data/Rakefile +34 -0
- data/Steepfile +51 -0
- data/examples/async_patterns.rb +283 -0
- data/extconf.rb +60 -0
- data/kreuzberg.gemspec +253 -0
- data/lib/kreuzberg/api_proxy.rb +125 -0
- data/lib/kreuzberg/cache_api.rb +67 -0
- data/lib/kreuzberg/cli.rb +57 -0
- data/lib/kreuzberg/cli_proxy.rb +118 -0
- data/lib/kreuzberg/config.rb +1241 -0
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/document_structure.rb +204 -0
- data/lib/kreuzberg/error_context.rb +136 -0
- data/lib/kreuzberg/errors.rb +116 -0
- data/lib/kreuzberg/extraction_api.rb +329 -0
- data/lib/kreuzberg/mcp_proxy.rb +176 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
- data/lib/kreuzberg/post_processor_protocol.rb +15 -0
- data/lib/kreuzberg/result.rb +712 -0
- data/lib/kreuzberg/setup_lib_path.rb +99 -0
- data/lib/kreuzberg/types.rb +414 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +102 -0
- data/lib/kreuzberg_rb.so +0 -0
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +1337 -0
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +677 -0
- data/spec/binding/batch_spec.rb +360 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +85 -0
- data/spec/binding/cli_spec.rb +55 -0
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -0
- data/spec/binding/config_validation_spec.rb +377 -0
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -0
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +732 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1253 -0
- data/spec/binding/pages_extraction_spec.rb +550 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +273 -0
- data/spec/binding/tables_spec.rb +650 -0
- data/spec/fixtures/config.toml +38 -0
- data/spec/fixtures/config.yaml +41 -0
- data/spec/fixtures/invalid_config.toml +3 -0
- data/spec/serialization_spec.rb +134 -0
- data/spec/smoke/package_spec.rb +177 -0
- data/spec/spec_helper.rb +40 -0
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +434 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- metadata +292 -0
|
@@ -0,0 +1,343 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
RSpec.describe Kreuzberg::Config::Embedding do
|
|
4
|
+
# Constructor behaviour: the defaults used when nothing is passed, explicit
# overrides, and the coercion applied to each keyword argument.
describe '#initialize' do
  it 'creates config with default values' do
    defaults = described_class.new

    # The default model is expected to be the "balanced" preset.
    expect(defaults.model).to be_a(Hash)
    expect(defaults.model[:type]).to eq(:preset)
    expect(defaults.model[:name]).to eq('balanced')
    expect(defaults.normalize).to be(true)
    expect(defaults.batch_size).to eq(32)
    expect(defaults.show_download_progress).to be(false)
    expect(defaults.cache_dir).to be_nil
  end

  it 'creates config with custom model hash' do
    embedding = described_class.new(model: { type: :preset, name: 'fast' })

    expect(embedding.model[:type]).to eq(:preset)
    expect(embedding.model[:name]).to eq('fast')
  end

  it 'creates config with custom values' do
    embedding = described_class.new(
      normalize: false,
      batch_size: 64,
      show_download_progress: true,
      cache_dir: '/cache'
    )

    expect(embedding.normalize).to be(false)
    expect(embedding.batch_size).to eq(64)
    expect(embedding.show_download_progress).to be(true)
    expect(embedding.cache_dir).to eq('/cache')
  end

  it 'converts model with to_h method' do
    # Any object answering #to_h is expected to be accepted in place of a Hash.
    convertible = double(to_h: { type: :custom, name: 'model' })
    embedding = described_class.new(model: convertible)

    expect(embedding.model).to be_a(Hash)
    expect(embedding.model[:type]).to eq(:custom)
  end

  it 'converts batch_size to integer' do
    embedding = described_class.new(batch_size: '128')

    expect(embedding.batch_size).to eq(128)
    expect(embedding.batch_size).to be_a(Integer)
  end

  it 'converts normalize to boolean' do
    # A truthy non-boolean (1) must come back as the literal `true`.
    embedding = described_class.new(normalize: 1)

    expect(embedding.normalize).to be(true)
  end

  it 'converts show_download_progress to boolean' do
    # A truthy non-boolean ('yes') must come back as the literal `true`.
    embedding = described_class.new(show_download_progress: 'yes')

    expect(embedding.show_download_progress).to be(true)
  end

  it 'converts cache_dir to string' do
    # Only the resulting type is checked here, not the resulting value.
    embedding = described_class.new(cache_dir: :default_cache)

    expect(embedding.cache_dir).to be_a(String)
  end
end
|
|
72
|
+
|
|
73
|
+
# Hash serialization: which keys are emitted, and that a nil cache_dir is
# left out of the result.
describe '#to_h' do
  it 'serializes to hash with all values' do
    serialized = described_class.new(model: { type: :preset, name: 'fast' }, batch_size: 64).to_h

    expect(serialized).to be_a(Hash)
    expect(serialized[:model]).to be_a(Hash)
    # normalize was not supplied, yet it is expected in the output as true.
    expect(serialized[:normalize]).to be(true)
    expect(serialized[:batch_size]).to eq(64)
  end

  it 'includes cache_dir when present' do
    serialized = described_class.new(cache_dir: '/cache').to_h

    expect(serialized[:cache_dir]).to eq('/cache')
  end

  it 'compacts nil values from hash' do
    serialized = described_class.new.to_h

    # With no cache_dir given, the key itself must be absent (not nil-valued).
    expect(serialized.key?(:cache_dir)).to be(false)
  end

  it 'always includes model in hash' do
    serialized = described_class.new.to_h

    expect(serialized.key?(:model)).to be(true)
    expect(serialized[:model]).to be_a(Hash)
  end
end
|
|
109
|
+
|
|
110
|
+
# Argument validation: which model inputs are rejected and which
# constructor inputs are accepted without raising.
describe 'validation' do
  it 'rejects invalid model type (not hash)' do
    # A plain String is expected to be refused as a model.
    expect { described_class.new(model: 'invalid_string') }
      .to raise_error(ArgumentError, /model must be a Hash/)
  end

  it 'accepts model as hash' do
    expect { described_class.new(model: { type: :preset, name: 'fast' }) }
      .not_to raise_error
  end

  it 'accepts model with to_h method' do
    # The double is built outside the expectation block so that only the
    # constructor call is checked for raising.
    convertible = double(to_h: { type: :preset, name: 'fast' })

    expect { described_class.new(model: convertible) }.not_to raise_error
  end

  it 'accepts valid batch_size' do
    expect { described_class.new(batch_size: 32) }.not_to raise_error
  end

  it 'accepts valid cache_dir' do
    expect { described_class.new(cache_dir: '/tmp/cache') }.not_to raise_error
  end
end
|
|
142
|
+
|
|
143
|
+
# Passes every supported keyword at once and reads each one back.
describe 'keyword arguments' do
  it 'accepts all keyword arguments' do
    embedding = described_class.new(
      model: { type: :preset, name: 'balanced' },
      normalize: true,
      batch_size: 48,
      show_download_progress: true,
      cache_dir: '/cache'
    )

    expect(embedding.model[:name]).to eq('balanced')
    expect(embedding.normalize).to be(true)
    expect(embedding.batch_size).to eq(48)
    expect(embedding.show_download_progress).to be(true)
    expect(embedding.cache_dir).to eq('/cache')
  end
end
|
|
160
|
+
|
|
161
|
+
# Attribute-level comparisons between two configs. These examples compare
# individual readers; they do not call == on the config objects themselves.
describe 'equality' do
  it 'compares configs by value' do
    # Build two separate instances from identical (but distinct) arguments.
    left, right = Array.new(2) do
      described_class.new(model: { type: :preset, name: 'fast' }, batch_size: 64)
    end

    expect(left.model).to eq(right.model)
    expect(left.batch_size).to eq(right.batch_size)
  end

  it 'detects differences in model' do
    fast = described_class.new(model: { type: :preset, name: 'fast' })
    balanced = described_class.new(model: { type: :preset, name: 'balanced' })

    expect(fast.model).not_to eq(balanced.model)
  end

  it 'detects differences in batch_size' do
    smaller, larger = [32, 64].map { |size| described_class.new(batch_size: size) }

    expect(smaller.batch_size).not_to eq(larger.batch_size)
  end

  it 'detects differences in normalize' do
    enabled, disabled = [true, false].map { |flag| described_class.new(normalize: flag) }

    expect(enabled.normalize).not_to eq(disabled.normalize)
  end
end
|
|
197
|
+
|
|
198
|
+
# Embedding config used as a child of the Chunking and Extraction configs,
# supplied either as an instance or as a plain Hash.
describe 'nested config integration' do
  it 'can be nested in Chunking config' do
    child = described_class.new(model: { type: :preset, name: 'fast' }, batch_size: 64)
    parent = Kreuzberg::Config::Chunking.new(embedding: child)

    expect(parent.embedding).to be_a(described_class)
    expect(parent.embedding.batch_size).to eq(64)
  end

  it 'accepts hash in Chunking config' do
    # A raw Hash is expected to be turned into an Embedding instance.
    parent = Kreuzberg::Config::Chunking.new(
      embedding: { model: { type: :preset, name: 'balanced' } }
    )

    expect(parent.embedding).to be_a(described_class)
    expect(parent.embedding.model[:name]).to eq('balanced')
  end

  it 'can be nested in Extraction config via Chunking' do
    # Two levels of nesting, both given as plain Hashes.
    root = Kreuzberg::Config::Extraction.new(chunking: { embedding: { batch_size: 48 } })
    nested = root.chunking.embedding

    expect(nested).to be_a(described_class)
    expect(nested.batch_size).to eq(48)
  end
end
|
|
228
|
+
|
|
229
|
+
# Key normalization of the model Hash and preservation of Symbol values.
describe 'symbol vs string key handling' do
  it 'normalizes model keys to symbols' do
    # String keys go in; the value must be readable through the Symbol key.
    embedding = described_class.new(model: { 'type' => :preset, 'name' => 'fast' })

    expect(embedding.model).to be_a(Hash)
    expect(embedding.model[:type]).to eq(:preset)
  end

  it 'preserves symbol values in model' do
    embedding = described_class.new(model: { type: :preset })

    expect(embedding.model[:type]).to eq(:preset)
  end
end
|
|
243
|
+
|
|
244
|
+
# Coercion of the two flag attributes to literal true / false.
describe 'boolean conversion' do
  it 'converts truthy normalize to true' do
    embedding = described_class.new(normalize: 1)

    expect(embedding.normalize).to be(true)
  end

  it 'converts false normalize to false' do
    embedding = described_class.new(normalize: false)

    expect(embedding.normalize).to be(false)
  end

  it 'converts truthy show_download_progress to true' do
    embedding = described_class.new(show_download_progress: 'yes')

    expect(embedding.show_download_progress).to be(true)
  end

  it 'converts false show_download_progress to false' do
    embedding = described_class.new(show_download_progress: false)

    expect(embedding.show_download_progress).to be(false)
  end
end
|
|
269
|
+
|
|
270
|
+
# Accepted model types and pass-through of extra keys in the model Hash.
describe 'model configuration' do
  it 'accepts preset model type' do
    embedding = described_class.new(model: { type: :preset, name: 'fast' })

    expect(embedding.model[:type]).to eq(:preset)
  end

  it 'accepts custom model type' do
    embedding = described_class.new(model: { type: :custom, path: '/model' })

    expect(embedding.model[:type]).to eq(:custom)
  end

  it 'preserves model configuration details' do
    # An extra key (:dimensions) must still be readable after construction.
    embedding = described_class.new(model: { type: :preset, name: 'balanced', dimensions: 384 })

    expect(embedding.model[:dimensions]).to eq(384)
  end
end
|
|
290
|
+
|
|
291
|
+
# batch_size: its default, a small and a large sample value, and coercion
# from String.
describe 'batch size handling' do
  it 'defaults to 32' do
    expect(described_class.new.batch_size).to eq(32)
  end

  it 'accepts small batch sizes' do
    embedding = described_class.new(batch_size: 1)

    expect(embedding.batch_size).to eq(1)
  end

  it 'accepts large batch sizes' do
    embedding = described_class.new(batch_size: 512)

    expect(embedding.batch_size).to eq(512)
  end

  it 'converts string batch_size to integer' do
    embedding = described_class.new(batch_size: '256')

    expect(embedding.batch_size).to eq(256)
    expect(embedding.batch_size).to be_a(Integer)
  end
end
|
|
317
|
+
|
|
318
|
+
# cache_dir: nil by default, absolute and relative paths kept as given,
# and non-String input turned into a String.
describe 'cache directory' do
  it 'defaults to nil' do
    expect(described_class.new.cache_dir).to be_nil
  end

  it 'accepts absolute paths' do
    embedding = described_class.new(cache_dir: '/var/cache/embeddings')

    expect(embedding.cache_dir).to eq('/var/cache/embeddings')
  end

  it 'accepts relative paths' do
    # The relative path is expected back unchanged.
    embedding = described_class.new(cache_dir: './cache')

    expect(embedding.cache_dir).to eq('./cache')
  end

  it 'converts path to string' do
    # Only the resulting type is checked here, not the resulting value.
    embedding = described_class.new(cache_dir: :default_cache)

    expect(embedding.cache_dir).to be_a(String)
  end
end
|
|
343
|
+
end
|