kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
@@ -0,0 +1,377 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe 'Configuration Validation' do
4
+ describe Kreuzberg::Config::Extraction do # Specs for the extraction config: constructor inputs, nested-config type checks, and to_h output.
5
+ it 'accepts all valid parameters' do # Passes flags plus ready-made nested config objects and reads each back.
6
+ config = described_class.new(
7
+ use_cache: true,
8
+ enable_quality_processing: false,
9
+ force_ocr: false,
10
+ ocr: Kreuzberg::Config::OCR.new,
11
+ chunking: Kreuzberg::Config::Chunking.new,
12
+ language_detection: Kreuzberg::Config::LanguageDetection.new,
13
+ pdf_options: Kreuzberg::Config::PDF.new
14
+ )
15
+
16
+ expect(config.use_cache).to be true
17
+ expect(config.enable_quality_processing).to be false
18
+ expect(config.force_ocr).to be false
19
+ expect(config.ocr).to be_a(Kreuzberg::Config::OCR)
20
+ expect(config.chunking).to be_a(Kreuzberg::Config::Chunking)
21
+ expect(config.language_detection).to be_a(Kreuzberg::Config::LanguageDetection)
22
+ expect(config.pdf_options).to be_a(Kreuzberg::Config::PDF)
23
+ end
24
+
25
+ it 'accepts hashes for nested configs' do # Plain hashes are expected to come back as typed config objects.
26
+ config = described_class.new(
27
+ ocr: { backend: 'tesseract', language: 'eng' },
28
+ chunking: { max_chars: 500 }
29
+ )
30
+
31
+ expect(config.ocr).to be_a(Kreuzberg::Config::OCR)
32
+ expect(config.ocr.backend).to eq('tesseract')
33
+ expect(config.chunking).to be_a(Kreuzberg::Config::Chunking)
34
+ expect(config.chunking.max_chars).to eq(500)
35
+ end
36
+
37
+ it 'validates ocr config type' do # A String for ocr: must raise ArgumentError.
38
+ expect do
39
+ described_class.new(ocr: 'invalid')
40
+ end.to raise_error(ArgumentError, /Expected.*OCR/) # Message must contain "Expected" followed later by "OCR".
41
+ end
42
+
43
+ it 'validates chunking config type' do # A String for chunking: must raise ArgumentError.
44
+ expect do
45
+ described_class.new(chunking: 'invalid')
46
+ end.to raise_error(ArgumentError, /Expected.*Chunking/) # Message must contain "Expected" followed later by "Chunking".
47
+ end
48
+
49
+ it 'converts to hash correctly' do # to_h returns a Hash with symbol keys carrying the given flags.
50
+ config = described_class.new(
51
+ use_cache: false,
52
+ force_ocr: true
53
+ )
54
+ hash = config.to_h
55
+
56
+ expect(hash).to be_a(Hash)
57
+ expect(hash[:use_cache]).to be false
58
+ expect(hash[:force_ocr]).to be true
59
+ end
60
+
61
+ it 'omits nil values from hash' do # Built with no arguments, then inspected through to_h.
62
+ config = described_class.new
63
+ hash = config.to_h
64
+
65
+ expect(hash[:ocr]).to be_nil # NOTE(review): also passes when the key is present with a nil value; have_key would pin omission -- confirm intent.
66
+ expect(hash[:chunking]).to be_nil # NOTE(review): same caveat as the :ocr check on the previous assertion.
67
+ end
68
+
69
+ it 'accepts html options hashes' do # A hash for html_options: is expected to become an HtmlOptions object.
70
+ config = described_class.new(html_options: { heading_style: :atx, wrap: true })
71
+ expect(config.html_options).to be_a(Kreuzberg::Config::HtmlOptions)
72
+ expect(config.html_options.to_h[:heading_style]).to eq(:atx)
73
+ end
74
+
75
+ it 'accepts keyword configurations' do # Passes a pre-built Keywords object alongside a concurrency limit.
76
+ keywords = Kreuzberg::Config::Keywords.new(algorithm: :yake, max_keywords: 5)
77
+ config = described_class.new(keywords: keywords, max_concurrent_extractions: 4)
78
+ expect(config.keywords).to be_a(Kreuzberg::Config::Keywords)
79
+ expect(config.max_concurrent_extractions).to eq(4)
80
+ end
81
+ end
82
+
83
+ describe Kreuzberg::Config::OCR do # Specs for the OCR config: defaults, custom values, string coercion, nested Tesseract config.
84
+ it 'has sensible defaults' do # Asserted defaults: backend 'tesseract', language 'eng', no tesseract_config.
85
+ config = described_class.new
86
+
87
+ expect(config.backend).to eq('tesseract')
88
+ expect(config.language).to eq('eng')
89
+ expect(config.tesseract_config).to be_nil
90
+ end
91
+
92
+ it 'accepts custom values' do # String inputs are read back unchanged.
93
+ config = described_class.new(
94
+ backend: 'easyocr',
95
+ language: 'deu'
96
+ )
97
+
98
+ expect(config.backend).to eq('easyocr')
99
+ expect(config.language).to eq('deu')
100
+ end
101
+
102
+ it 'coerces types correctly' do # Non-String inputs (Symbol, Integer) are expected to be read back as Strings.
103
+ config = described_class.new(
104
+ backend: :tesseract,
105
+ language: 123
106
+ )
107
+
108
+ expect(config.backend).to eq('tesseract')
109
+ expect(config.language).to eq('123')
110
+ end
111
+
112
+ it 'accepts tesseract config hashes' do # A hash for tesseract_config: is expected to become a Tesseract object.
113
+ config = described_class.new(
114
+ tesseract_config: {
115
+ psm: 6,
116
+ enable_table_detection: true
117
+ }
118
+ )
119
+
120
+ expect(config.tesseract_config).to be_a(Kreuzberg::Config::Tesseract)
121
+ expect(config.tesseract_config.to_h[:psm]).to eq(6)
122
+ end
123
+ end
124
+
125
+ describe Kreuzberg::Config::Chunking do # Specs for the chunking config: defaults, sizes, preset, nested embedding config.
126
+ it 'has sensible defaults' do # Asserted defaults: max_chars 1000, max_overlap 200, no preset.
127
+ config = described_class.new
128
+
129
+ expect(config.max_chars).to eq(1000)
130
+ expect(config.max_overlap).to eq(200)
131
+ expect(config.preset).to be_nil
132
+ end
133
+
134
+ it 'accepts custom chunk sizes' do # Explicit sizes are read back unchanged.
135
+ config = described_class.new(
136
+ max_chars: 500,
137
+ max_overlap: 100
138
+ )
139
+
140
+ expect(config.max_chars).to eq(500)
141
+ expect(config.max_overlap).to eq(100)
142
+ end
143
+
144
+ it 'supports different strategies' do # NOTE(review): the description says "strategies" but the example only sets and reads preset.
145
+ config = described_class.new(preset: 'fast')
146
+ expect(config.preset).to eq('fast')
147
+ end
148
+
149
+ it 'accepts embedding configs' do # A hash for embedding: is expected to become an Embedding object.
150
+ embedding = { model: { type: :preset, name: 'quality' }, normalize: false }
151
+ config = described_class.new(embedding: embedding)
152
+ expect(config.embedding).to be_a(Kreuzberg::Config::Embedding)
153
+ expect(config.embedding.to_h[:model]).to include(type: :preset, name: 'quality') # include permits extra keys in the model hash.
154
+ end
155
+ end
156
+
157
+ describe Kreuzberg::Config::LanguageDetection do # Specs for the language-detection config: defaults, thresholds, coercion, detect_multiple.
158
+ it 'has sensible defaults' do # Asserted defaults: disabled, min_confidence 0.5.
159
+ config = described_class.new
160
+
161
+ expect(config.enabled).to be false
162
+ expect(config.min_confidence).to eq(0.5)
163
+ end
164
+
165
+ it 'accepts custom confidence thresholds' do # Explicit values are read back unchanged.
166
+ config = described_class.new(
167
+ enabled: true,
168
+ min_confidence: 0.9
169
+ )
170
+
171
+ expect(config.enabled).to be true
172
+ expect(config.min_confidence).to eq(0.9)
173
+ end
174
+
175
+ it 'coerces confidence to float' do # The String '0.75' is expected to be read back as the Float 0.75.
176
+ config = described_class.new(min_confidence: '0.75')
177
+ expect(config.min_confidence).to eq(0.75)
178
+ end
179
+
180
+ it 'supports detect_multiple flag' do # Checked through both the reader and the to_h output.
181
+ config = described_class.new(detect_multiple: true)
182
+ expect(config.detect_multiple).to be true
183
+ expect(config.to_h[:detect_multiple]).to be true
184
+ end
185
+ end
186
+
187
+ describe Kreuzberg::Config::PDF do # Specs for the PDF config: defaults, custom values, password normalization.
188
+ it 'has sensible defaults' do # Asserted defaults: no image extraction, no passwords, metadata extraction on.
189
+ config = described_class.new
190
+
191
+ expect(config.extract_images).to be false
192
+ expect(config.passwords).to be_nil
193
+ expect(config.extract_metadata).to be true
194
+ end
195
+
196
+ it 'accepts custom values' do # An array of password strings is read back unchanged.
197
+ config = described_class.new(
198
+ extract_images: true,
199
+ passwords: ['secret123']
200
+ )
201
+
202
+ expect(config.extract_images).to be true
203
+ expect(config.passwords).to eq(['secret123'])
204
+ end
205
+
206
+ it 'converts password to string' do # NOTE(review): the assertion expects a one-element array of strings, not a bare string.
207
+ config = described_class.new(passwords: 12_345) # A bare Integer is passed instead of an array.
208
+ expect(config.passwords).to eq(['12345'])
209
+ end
210
+ end
211
+
212
+ describe Kreuzberg::Config::HtmlOptions do # Specs for the HTML options config, checked through its to_h output.
213
+ it 'normalizes preprocessing settings' do # heading_style and the nested preprocessing hash must survive to_h.
214
+ options = described_class.new(
215
+ heading_style: :atx_closed,
216
+ preprocessing: { enabled: true, preset: :standard }
217
+ )
218
+ hash = options.to_h
219
+ expect(hash[:heading_style]).to eq(:atx_closed)
220
+ expect(hash[:preprocessing]).to include(preset: :standard) # Only preset is pinned; include permits other keys.
221
+ end
222
+ end
223
+
224
+ describe Kreuzberg::Config::Keywords do # Specs for the keyword-extraction config, checked through its to_h output.
225
+ it 'accepts hash arguments' do # Builds the config from keyword arguments, including a nested params hash.
226
+ config = described_class.new(
227
+ algorithm: :yake,
228
+ max_keywords: 10,
229
+ ngram_range: [1, 3],
230
+ yake_params: { window_size: 4 }
231
+ )
232
+ expect(config.to_h[:algorithm]).to eq('yake') # The Symbol :yake is expected to appear as the String 'yake'.
233
+ expect(config.to_h[:yake_params]).to eq(window_size: 4)
234
+ end
235
+ end
236
+
237
+ describe Kreuzberg::Config::ImagePreprocessing do # Specs for the image-preprocessing config: defaults, overrides, binarization validation, to_h.
238
+ it 'has sensible defaults' do # Pins the default of every reader exercised in this group.
239
+ config = described_class.new
240
+ expect(config.target_dpi).to eq(300)
241
+ expect(config.auto_rotate).to be true
242
+ expect(config.deskew).to be true
243
+ expect(config.denoise).to be false
244
+ expect(config.contrast_enhance).to be true
245
+ expect(config.binarization_method).to eq('otsu')
246
+ expect(config.invert_colors).to be false
247
+ end
248
+
249
+ it 'accepts custom values' do # Overrides every default asserted above and reads each one back.
250
+ config = described_class.new(
251
+ target_dpi: 600,
252
+ auto_rotate: false,
253
+ deskew: false,
254
+ denoise: true,
255
+ contrast_enhance: false,
256
+ binarization_method: 'sauvola',
257
+ invert_colors: true
258
+ )
259
+ expect(config.target_dpi).to eq(600)
260
+ expect(config.auto_rotate).to be false
261
+ expect(config.deskew).to be false
262
+ expect(config.denoise).to be true
263
+ expect(config.contrast_enhance).to be false
264
+ expect(config.binarization_method).to eq('sauvola')
265
+ expect(config.invert_colors).to be true
266
+ end
267
+
268
+ it 'validates binarization method via FFI' do # Only asserts these three names construct without error; the FFI path is not visible here.
269
+ expect { described_class.new(binarization_method: 'otsu') }.not_to raise_error
270
+ expect { described_class.new(binarization_method: 'adaptive') }.not_to raise_error
271
+ expect { described_class.new(binarization_method: 'sauvola') }.not_to raise_error
272
+ end
273
+
274
+ it 'rejects invalid binarization methods' do # An unknown method name must raise ArgumentError.
275
+ expect do
276
+ described_class.new(binarization_method: 'invalid_method')
277
+ end.to raise_error(ArgumentError, /Invalid binarization_method/)
278
+ end
279
+
280
+ it 'converts to hash correctly' do # to_h must carry explicit values and the untouched auto_rotate default.
281
+ config = described_class.new(
282
+ target_dpi: 500,
283
+ binarization_method: 'adaptive'
284
+ )
285
+ hash = config.to_h
286
+ expect(hash[:target_dpi]).to eq(500)
287
+ expect(hash[:binarization_method]).to eq('adaptive')
288
+ expect(hash[:auto_rotate]).to be true # Not passed to the constructor, so this checks the default shows up in to_h.
289
+ end
290
+ end
291
+
292
+ describe Kreuzberg::Config::TokenReduction do # Specs for the token-reduction config: defaults, overrides, mode validation, to_h.
293
+ it 'has sensible defaults' do # Asserted defaults: mode 'off', preserve_important_words true.
294
+ config = described_class.new
295
+ expect(config.mode).to eq('off')
296
+ expect(config.preserve_important_words).to be true
297
+ end
298
+
299
+ it 'accepts custom values' do # Explicit values are read back unchanged.
300
+ config = described_class.new(
301
+ mode: 'aggressive',
302
+ preserve_important_words: false
303
+ )
304
+ expect(config.mode).to eq('aggressive')
305
+ expect(config.preserve_important_words).to be false
306
+ end
307
+
308
+ it 'validates token reduction levels via FFI' do # Only asserts these five names construct without error; the FFI path is not visible here.
309
+ expect { described_class.new(mode: 'off') }.not_to raise_error
310
+ expect { described_class.new(mode: 'light') }.not_to raise_error
311
+ expect { described_class.new(mode: 'moderate') }.not_to raise_error
312
+ expect { described_class.new(mode: 'aggressive') }.not_to raise_error
313
+ expect { described_class.new(mode: 'maximum') }.not_to raise_error
314
+ end
315
+
316
+ it 'rejects invalid token reduction modes' do # An unknown mode name must raise ArgumentError.
317
+ expect do
318
+ described_class.new(mode: 'extreme')
319
+ end.to raise_error(ArgumentError, /Invalid token reduction mode/)
320
+ end
321
+
322
+ it 'converts to hash correctly' do # to_h must expose both fields under symbol keys.
323
+ config = described_class.new(
324
+ mode: 'light',
325
+ preserve_important_words: true
326
+ )
327
+ hash = config.to_h
328
+ expect(hash[:mode]).to eq('light')
329
+ expect(hash[:preserve_important_words]).to be true
330
+ end
331
+ end
332
+
333
+ describe 'config usage in extraction' do # Smoke specs: each example runs extract_file_sync with a config and checks only the result type.
334
+ it 'works with OCR config' do # Nested OCR config passed as an object.
335
+ path = create_test_file('OCR config test') # create_test_file is defined outside this file; presumably a spec helper returning a file path -- confirm.
336
+ config = Kreuzberg::Config::Extraction.new(
337
+ ocr: Kreuzberg::Config::OCR.new(backend: 'tesseract', language: 'eng')
338
+ )
339
+
340
+ result = Kreuzberg.extract_file_sync(path: path, config: config)
341
+ expect(result).to be_a(Kreuzberg::Result)
342
+ end
343
+
344
+ it 'works with chunking config' do # Chunk contents are not asserted, only the result type.
345
+ path = create_test_file('Chunking config test' * 50) # Repeats the 20-character string 50 times, so the input is longer than max_chars below.
346
+ config = Kreuzberg::Config::Extraction.new(
347
+ chunking: Kreuzberg::Config::Chunking.new(max_chars: 50)
348
+ )
349
+
350
+ result = Kreuzberg.extract_file_sync(path: path, config: config)
351
+ expect(result).to be_a(Kreuzberg::Result)
352
+ end
353
+
354
+ it 'works with language detection config' do # Detected languages are not asserted, only the result type.
355
+ path = create_test_file('Language detection test')
356
+ config = Kreuzberg::Config::Extraction.new(
357
+ language_detection: Kreuzberg::Config::LanguageDetection.new(enabled: true)
358
+ )
359
+
360
+ result = Kreuzberg.extract_file_sync(path: path, config: config)
361
+ expect(result).to be_a(Kreuzberg::Result)
362
+ end
363
+
364
+ it 'works with combined configs' do # Mixes top-level flags with nested configs given as plain hashes.
365
+ path = create_test_file('Combined config test')
366
+ config = Kreuzberg::Config::Extraction.new(
367
+ use_cache: false,
368
+ force_ocr: false,
369
+ ocr: { backend: 'tesseract', language: 'eng' },
370
+ language_detection: { enabled: false }
371
+ )
372
+
373
+ result = Kreuzberg.extract_file_sync(path: path, config: config)
374
+ expect(result).to be_a(Kreuzberg::Result)
375
+ end
376
+ end
377
+ end