kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
@@ -0,0 +1,343 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe Kreuzberg::Config::Embedding do
4
+ describe '#initialize' do
5
+ it 'creates config with default values' do
6
+ config = described_class.new
7
+
8
+ expect(config.model).to be_a Hash
9
+ expect(config.model[:type]).to eq :preset
10
+ expect(config.model[:name]).to eq 'balanced'
11
+ expect(config.normalize).to be true
12
+ expect(config.batch_size).to eq 32
13
+ expect(config.show_download_progress).to be false
14
+ expect(config.cache_dir).to be_nil
15
+ end
16
+
17
+ it 'creates config with custom model hash' do
18
+ model = { type: :preset, name: 'fast' }
19
+ config = described_class.new(model: model)
20
+
21
+ expect(config.model[:type]).to eq :preset
22
+ expect(config.model[:name]).to eq 'fast'
23
+ end
24
+
25
+ it 'creates config with custom values' do
26
+ config = described_class.new(
27
+ normalize: false,
28
+ batch_size: 64,
29
+ show_download_progress: true,
30
+ cache_dir: '/cache'
31
+ )
32
+
33
+ expect(config.normalize).to be false
34
+ expect(config.batch_size).to eq 64
35
+ expect(config.show_download_progress).to be true
36
+ expect(config.cache_dir).to eq '/cache'
37
+ end
38
+
39
+ it 'converts model with to_h method' do
40
+ model_like = double(to_h: { type: :custom, name: 'model' })
41
+ config = described_class.new(model: model_like)
42
+
43
+ expect(config.model).to be_a Hash
44
+ expect(config.model[:type]).to eq :custom
45
+ end
46
+
47
+ it 'converts batch_size to integer' do
48
+ config = described_class.new(batch_size: '128')
49
+
50
+ expect(config.batch_size).to eq 128
51
+ expect(config.batch_size).to be_a Integer
52
+ end
53
+
54
+ it 'converts normalize to boolean' do
55
+ config = described_class.new(normalize: 1)
56
+
57
+ expect(config.normalize).to be true
58
+ end
59
+
60
+ it 'converts show_download_progress to boolean' do
61
+ config = described_class.new(show_download_progress: 'yes')
62
+
63
+ expect(config.show_download_progress).to be true
64
+ end
65
+
66
+ it 'converts cache_dir to string' do
67
+ config = described_class.new(cache_dir: :default_cache)
68
+
69
+ expect(config.cache_dir).to be_a String
70
+ end
71
+ end
72
+
73
+ describe '#to_h' do
74
+ it 'serializes to hash with all values' do
75
+ config = described_class.new(
76
+ model: { type: :preset, name: 'fast' },
77
+ batch_size: 64
78
+ )
79
+ hash = config.to_h
80
+
81
+ expect(hash).to be_a Hash
82
+ expect(hash[:model]).to be_a Hash
83
+ expect(hash[:normalize]).to be true
84
+ expect(hash[:batch_size]).to eq 64
85
+ end
86
+
87
+ it 'includes cache_dir when present' do
88
+ config = described_class.new(cache_dir: '/cache')
89
+ hash = config.to_h
90
+
91
+ expect(hash[:cache_dir]).to eq '/cache'
92
+ end
93
+
94
+ it 'compacts nil values from hash' do
95
+ config = described_class.new
96
+ hash = config.to_h
97
+
98
+ expect(hash.key?(:cache_dir)).to be false
99
+ end
100
+
101
+ it 'always includes model in hash' do
102
+ config = described_class.new
103
+ hash = config.to_h
104
+
105
+ expect(hash.key?(:model)).to be true
106
+ expect(hash[:model]).to be_a Hash
107
+ end
108
+ end
109
+
110
+ describe 'validation' do
111
+ it 'rejects invalid model type (not hash)' do
112
+ expect do
113
+ described_class.new(model: 'invalid_string')
114
+ end.to raise_error ArgumentError, /model must be a Hash/
115
+ end
116
+
117
+ it 'accepts model as hash' do
118
+ expect do
119
+ described_class.new(model: { type: :preset, name: 'fast' })
120
+ end.not_to raise_error
121
+ end
122
+
123
+ it 'accepts model with to_h method' do
124
+ model_like = double(to_h: { type: :preset, name: 'fast' })
125
+ expect do
126
+ described_class.new(model: model_like)
127
+ end.not_to raise_error
128
+ end
129
+
130
+ it 'accepts valid batch_size' do
131
+ expect do
132
+ described_class.new(batch_size: 32)
133
+ end.not_to raise_error
134
+ end
135
+
136
+ it 'accepts valid cache_dir' do
137
+ expect do
138
+ described_class.new(cache_dir: '/tmp/cache')
139
+ end.not_to raise_error
140
+ end
141
+ end
142
+
143
+ describe 'keyword arguments' do
144
+ it 'accepts all keyword arguments' do
145
+ config = described_class.new(
146
+ model: { type: :preset, name: 'balanced' },
147
+ normalize: true,
148
+ batch_size: 48,
149
+ show_download_progress: true,
150
+ cache_dir: '/cache'
151
+ )
152
+
153
+ expect(config.model[:name]).to eq 'balanced'
154
+ expect(config.normalize).to be true
155
+ expect(config.batch_size).to eq 48
156
+ expect(config.show_download_progress).to be true
157
+ expect(config.cache_dir).to eq '/cache'
158
+ end
159
+ end
160
+
161
+ describe 'equality' do
162
+ it 'compares configs by value' do
163
+ config1 = described_class.new(
164
+ model: { type: :preset, name: 'fast' },
165
+ batch_size: 64
166
+ )
167
+ config2 = described_class.new(
168
+ model: { type: :preset, name: 'fast' },
169
+ batch_size: 64
170
+ )
171
+
172
+ expect(config1.model).to eq config2.model
173
+ expect(config1.batch_size).to eq config2.batch_size
174
+ end
175
+
176
+ it 'detects differences in model' do
177
+ config1 = described_class.new(model: { type: :preset, name: 'fast' })
178
+ config2 = described_class.new(model: { type: :preset, name: 'balanced' })
179
+
180
+ expect(config1.model).not_to eq config2.model
181
+ end
182
+
183
+ it 'detects differences in batch_size' do
184
+ config1 = described_class.new(batch_size: 32)
185
+ config2 = described_class.new(batch_size: 64)
186
+
187
+ expect(config1.batch_size).not_to eq config2.batch_size
188
+ end
189
+
190
+ it 'detects differences in normalize' do
191
+ config1 = described_class.new(normalize: true)
192
+ config2 = described_class.new(normalize: false)
193
+
194
+ expect(config1.normalize).not_to eq config2.normalize
195
+ end
196
+ end
197
+
198
+ describe 'nested config integration' do
199
+ it 'can be nested in Chunking config' do
200
+ embedding = described_class.new(
201
+ model: { type: :preset, name: 'fast' },
202
+ batch_size: 64
203
+ )
204
+ chunking = Kreuzberg::Config::Chunking.new(embedding: embedding)
205
+
206
+ expect(chunking.embedding).to be_a described_class
207
+ expect(chunking.embedding.batch_size).to eq 64
208
+ end
209
+
210
+ it 'accepts hash in Chunking config' do
211
+ chunking = Kreuzberg::Config::Chunking.new(
212
+ embedding: { model: { type: :preset, name: 'balanced' } }
213
+ )
214
+
215
+ expect(chunking.embedding).to be_a described_class
216
+ expect(chunking.embedding.model[:name]).to eq 'balanced'
217
+ end
218
+
219
+ it 'can be nested in Extraction config via Chunking' do
220
+ extraction = Kreuzberg::Config::Extraction.new(
221
+ chunking: { embedding: { batch_size: 48 } }
222
+ )
223
+
224
+ expect(extraction.chunking.embedding).to be_a described_class
225
+ expect(extraction.chunking.embedding.batch_size).to eq 48
226
+ end
227
+ end
228
+
229
+ describe 'symbol vs string key handling' do
230
+ it 'normalizes model keys to symbols' do
231
+ config = described_class.new(model: { 'type' => :preset, 'name' => 'fast' })
232
+
233
+ expect(config.model).to be_a Hash
234
+ expect(config.model[:type]).to eq :preset
235
+ end
236
+
237
+ it 'preserves symbol values in model' do
238
+ config = described_class.new(model: { type: :preset })
239
+
240
+ expect(config.model[:type]).to eq :preset
241
+ end
242
+ end
243
+
244
+ describe 'boolean conversion' do
245
+ it 'converts truthy normalize to true' do
246
+ config = described_class.new(normalize: 1)
247
+
248
+ expect(config.normalize).to be true
249
+ end
250
+
251
+ it 'converts false normalize to false' do
252
+ config = described_class.new(normalize: false)
253
+
254
+ expect(config.normalize).to be false
255
+ end
256
+
257
+ it 'converts truthy show_download_progress to true' do
258
+ config = described_class.new(show_download_progress: 'yes')
259
+
260
+ expect(config.show_download_progress).to be true
261
+ end
262
+
263
+ it 'converts false show_download_progress to false' do
264
+ config = described_class.new(show_download_progress: false)
265
+
266
+ expect(config.show_download_progress).to be false
267
+ end
268
+ end
269
+
270
+ describe 'model configuration' do
271
+ it 'accepts preset model type' do
272
+ config = described_class.new(model: { type: :preset, name: 'fast' })
273
+
274
+ expect(config.model[:type]).to eq :preset
275
+ end
276
+
277
+ it 'accepts custom model type' do
278
+ config = described_class.new(model: { type: :custom, path: '/model' })
279
+
280
+ expect(config.model[:type]).to eq :custom
281
+ end
282
+
283
+ it 'preserves model configuration details' do
284
+ model = { type: :preset, name: 'balanced', dimensions: 384 }
285
+ config = described_class.new(model: model)
286
+
287
+ expect(config.model[:dimensions]).to eq 384
288
+ end
289
+ end
290
+
291
+ describe 'batch size handling' do
292
+ it 'defaults to 32' do
293
+ config = described_class.new
294
+
295
+ expect(config.batch_size).to eq 32
296
+ end
297
+
298
+ it 'accepts small batch sizes' do
299
+ config = described_class.new(batch_size: 1)
300
+
301
+ expect(config.batch_size).to eq 1
302
+ end
303
+
304
+ it 'accepts large batch sizes' do
305
+ config = described_class.new(batch_size: 512)
306
+
307
+ expect(config.batch_size).to eq 512
308
+ end
309
+
310
+ it 'converts string batch_size to integer' do
311
+ config = described_class.new(batch_size: '256')
312
+
313
+ expect(config.batch_size).to eq 256
314
+ expect(config.batch_size).to be_a Integer
315
+ end
316
+ end
317
+
318
+ describe 'cache directory' do
319
+ it 'defaults to nil' do
320
+ config = described_class.new
321
+
322
+ expect(config.cache_dir).to be_nil
323
+ end
324
+
325
+ it 'accepts absolute paths' do
326
+ config = described_class.new(cache_dir: '/var/cache/embeddings')
327
+
328
+ expect(config.cache_dir).to eq '/var/cache/embeddings'
329
+ end
330
+
331
+ it 'accepts relative paths' do
332
+ config = described_class.new(cache_dir: './cache')
333
+
334
+ expect(config.cache_dir).to eq './cache'
335
+ end
336
+
337
+ it 'converts path to string' do
338
+ config = described_class.new(cache_dir: :default_cache)
339
+
340
+ expect(config.cache_dir).to be_a String
341
+ end
342
+ end
343
+ end