kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
@@ -0,0 +1,419 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe Kreuzberg::Config do
4
+ describe Kreuzberg::Config::OCR do # OCR settings object: constructor defaults, keyword overrides, #to_h export.
5
+ it 'creates with default values' do
6
+ ocr = described_class.new
7
+
8
+ expect(ocr.backend).to eq('tesseract')
9
+ expect(ocr.language).to eq('eng')
10
+ expect(ocr.tesseract_config).to be_nil
11
+ end
12
+ # Keyword arguments replace the default backend and language.
13
+ it 'creates with custom values' do
14
+ ocr = described_class.new(
15
+ backend: 'easyocr',
16
+ language: 'deu'
17
+ )
18
+
19
+ expect(ocr.backend).to eq('easyocr')
20
+ expect(ocr.language).to eq('deu')
21
+ end
22
+ # #to_h is read back with symbol keys (:backend, :language).
23
+ it 'converts to hash' do
24
+ ocr = described_class.new(backend: 'tesseract', language: 'fra')
25
+ hash = ocr.to_h
26
+
27
+ expect(hash).to be_a(Hash)
28
+ expect(hash[:backend]).to eq('tesseract')
29
+ expect(hash[:language]).to eq('fra')
30
+ end
31
+ end
32
+
33
+ describe Kreuzberg::Config::Chunking do # Chunking settings object: size/overlap defaults, overrides, #to_h export.
34
+ it 'creates with default values' do
35
+ chunking = described_class.new
36
+
37
+ expect(chunking.max_chars).to eq(1000)
38
+ expect(chunking.max_overlap).to eq(200)
39
+ expect(chunking.preset).to be_nil
40
+ expect(chunking.embedding).to be_nil
41
+ end
42
+ # max_chars, max_overlap and preset are all settable through keyword arguments.
43
+ it 'creates with custom values' do
44
+ chunking = described_class.new(
45
+ max_chars: 500,
46
+ max_overlap: 100,
47
+ preset: 'fast'
48
+ )
49
+
50
+ expect(chunking.max_chars).to eq(500)
51
+ expect(chunking.max_overlap).to eq(100)
52
+ expect(chunking.preset).to eq('fast')
53
+ end
54
+ # Only the explicitly set max_chars is checked in the exported hash.
55
+ it 'converts to hash' do
56
+ chunking = described_class.new(max_chars: 750)
57
+ hash = chunking.to_h
58
+
59
+ expect(hash).to be_a(Hash)
60
+ expect(hash[:max_chars]).to eq(750)
61
+ end
62
+ end
63
+
64
+ describe Kreuzberg::Config::LanguageDetection do # Language detection settings: defaults, overrides, #to_h export.
65
+ it 'creates with default values' do
66
+ lang = described_class.new
67
+
68
+ expect(lang.enabled).to be false
69
+ expect(lang.min_confidence).to eq(0.5)
70
+ end
71
+ # Both fields can be overridden at construction time.
72
+ it 'creates with custom values' do
73
+ lang = described_class.new(enabled: true, min_confidence: 0.9)
74
+
75
+ expect(lang.enabled).to be true
76
+ expect(lang.min_confidence).to eq(0.9)
77
+ end
78
+ # #to_h mirrors the values passed to the constructor.
79
+ it 'converts to hash' do
80
+ lang = described_class.new(enabled: true, min_confidence: 0.75)
81
+ hash = lang.to_h
82
+
83
+ expect(hash).to be_a(Hash)
84
+ expect(hash[:enabled]).to be true
85
+ expect(hash[:min_confidence]).to eq(0.75)
86
+ end
87
+ end
88
+
89
+ describe Kreuzberg::Config::FontConfig do # Font settings object: defaults, overrides, and #to_h export.
90
+ it 'creates with default values' do
91
+ font_config = described_class.new
92
+
93
+ expect(font_config.enabled).to be true
94
+ expect(font_config.custom_font_dirs).to be_nil
95
+ end
96
+ # The directory list passed in is returned unchanged by the reader.
97
+ it 'creates with custom values' do
98
+ dirs = ['/usr/share/fonts', '/home/user/.fonts']
99
+ font_config = described_class.new(
100
+ enabled: false,
101
+ custom_font_dirs: dirs
102
+ )
103
+
104
+ expect(font_config.enabled).to be false
105
+ expect(font_config.custom_font_dirs).to eq(dirs)
106
+ end
107
+ # #to_h carries both :enabled and :custom_font_dirs when they are set.
108
+ it 'converts to hash' do
109
+ dirs = ['/usr/share/fonts']
110
+ font_config = described_class.new(
111
+ enabled: true,
112
+ custom_font_dirs: dirs
113
+ )
114
+ hash = font_config.to_h
115
+
116
+ expect(hash).to be_a(Hash)
117
+ expect(hash[:enabled]).to be true
118
+ expect(hash[:custom_font_dirs]).to eq(dirs)
119
+ end
120
+ # An unset custom_font_dirs must be absent from the hash, not present with a nil value.
121
+ it 'compacts nil values in hash' do
122
+ font_config = described_class.new(enabled: true)
123
+ hash = font_config.to_h
124
+
125
+ expect(hash).to be_a(Hash)
126
+ expect(hash.key?(:custom_font_dirs)).to be false
127
+ end
128
+ end
129
+
130
+ describe Kreuzberg::Config::PDF do # PDF settings object, including its nested font_config.
131
+ it 'creates with default values' do
132
+ pdf = described_class.new
133
+
134
+ expect(pdf.extract_images).to be false
135
+ expect(pdf.passwords).to be_nil
136
+ expect(pdf.extract_metadata).to be true
137
+ expect(pdf.font_config).to be_nil
138
+ end
139
+ # passwords takes an array of strings.
140
+ it 'creates with custom values' do
141
+ pdf = described_class.new(
142
+ extract_images: true,
143
+ passwords: %w[secret backup]
144
+ )
145
+
146
+ expect(pdf.extract_images).to be true
147
+ expect(pdf.passwords).to eq(%w[secret backup])
148
+ end
149
+ # font_config may be given as a ready-made FontConfig instance...
150
+ it 'creates with font_config as instance' do
151
+ font_config = Kreuzberg::Config::FontConfig.new(enabled: true)
152
+ pdf = described_class.new(font_config: font_config)
153
+
154
+ expect(pdf.font_config).to be_a(Kreuzberg::Config::FontConfig)
155
+ expect(pdf.font_config.enabled).to be true
156
+ end
157
+ # ...or as a plain Hash, which is expected to come back as a FontConfig.
158
+ it 'creates with font_config as hash' do
159
+ font_config_hash = { enabled: false, custom_font_dirs: ['/fonts'] }
160
+ pdf = described_class.new(font_config: font_config_hash)
161
+
162
+ expect(pdf.font_config).to be_a(Kreuzberg::Config::FontConfig)
163
+ expect(pdf.font_config.enabled).to be false
164
+ expect(pdf.font_config.custom_font_dirs).to eq(['/fonts'])
165
+ end
166
+ # #to_h is read back with symbol keys (:extract_images, :passwords).
167
+ it 'converts to hash' do
168
+ pdf = described_class.new(extract_images: true, passwords: ['test'])
169
+ hash = pdf.to_h
170
+
171
+ expect(hash).to be_a(Hash)
172
+ expect(hash[:extract_images]).to be true
173
+ expect(hash[:passwords]).to eq(['test'])
174
+ end
175
+ # The nested FontConfig is exported as a nested Hash, not as an object.
176
+ it 'includes font_config in hash when present' do
177
+ font_config = Kreuzberg::Config::FontConfig.new(enabled: true)
178
+ pdf = described_class.new(font_config: font_config)
179
+ hash = pdf.to_h
180
+
181
+ expect(hash[:font_config]).to be_a(Hash)
182
+ expect(hash[:font_config][:enabled]).to be true
183
+ end
184
+ # A String font_config is expected to be rejected with ArgumentError.
185
+ it 'raises error with invalid font_config type' do
186
+ expect do
187
+ described_class.new(font_config: 'invalid')
188
+ end.to raise_error(ArgumentError)
189
+ end
190
+ end
191
+
192
+ describe Kreuzberg::Config::Extraction do
193
+ describe '.from_file' do
194
+ it 'loads configuration from TOML file' do
195
+ config_path = File.join(__dir__, '..', 'fixtures', 'config.toml')
196
+ config = described_class.from_file(config_path)
197
+
198
+ expect(config.use_cache).to be false
199
+ expect(config.enable_quality_processing).to be true
200
+ expect(config.force_ocr).to be true
201
+ end
202
+
203
+ it 'loads OCR config from TOML file' do
204
+ config_path = File.join(__dir__, '..', 'fixtures', 'config.toml')
205
+ config = described_class.from_file(config_path)
206
+
207
+ expect(config.ocr).to be_a(Kreuzberg::Config::OCR)
208
+ expect(config.ocr.backend).to eq('tesseract')
209
+ expect(config.ocr.language).to eq('deu')
210
+ end
211
+
212
+ it 'loads chunking config from TOML file' do
213
+ config_path = File.join(__dir__, '..', 'fixtures', 'config.toml')
214
+ config = described_class.from_file(config_path)
215
+
216
+ expect(config.chunking).to be_a(Kreuzberg::Config::Chunking)
217
+ expect(config.chunking.max_chars).to eq(500)
218
+ expect(config.chunking.max_overlap).to eq(100)
219
+ expect(config.chunking.preset).to eq('fast')
220
+ end
221
+
222
+ it 'loads language detection config from TOML file' do
223
+ config_path = File.join(__dir__, '..', 'fixtures', 'config.toml')
224
+ config = described_class.from_file(config_path)
225
+
226
+ expect(config.language_detection).to be_a(Kreuzberg::Config::LanguageDetection)
227
+ expect(config.language_detection.enabled).to be true
228
+ expect(config.language_detection.min_confidence).to eq(0.9)
229
+ end
230
+
231
+ it 'loads PDF options from TOML file' do
232
+ config_path = File.join(__dir__, '..', 'fixtures', 'config.toml')
233
+ config = described_class.from_file(config_path)
234
+
235
+ expect(config.pdf_options).to be_a(Kreuzberg::Config::PDF)
236
+ expect(config.pdf_options.extract_images).to be true
237
+ expect(config.pdf_options.passwords).to eq(%w[secret backup])
238
+ expect(config.pdf_options.extract_metadata).to be true
239
+ end
240
+
241
+ it 'loads configuration from YAML file' do
242
+ config_path = File.join(__dir__, '..', 'fixtures', 'config.yaml')
243
+ config = described_class.from_file(config_path)
244
+
245
+ expect(config.use_cache).to be false
246
+ expect(config.enable_quality_processing).to be true
247
+ expect(config.force_ocr).to be true
248
+ end
249
+
250
+ it 'loads OCR config from YAML file' do
251
+ config_path = File.join(__dir__, '..', 'fixtures', 'config.yaml')
252
+ config = described_class.from_file(config_path)
253
+
254
+ expect(config.ocr).to be_a(Kreuzberg::Config::OCR)
255
+ expect(config.ocr.backend).to eq('tesseract')
256
+ expect(config.ocr.language).to eq('fra')
257
+ end
258
+
259
+ it 'loads chunking config from YAML file' do
260
+ config_path = File.join(__dir__, '..', 'fixtures', 'config.yaml')
261
+ config = described_class.from_file(config_path)
262
+
263
+ expect(config.chunking).to be_a(Kreuzberg::Config::Chunking)
264
+ expect(config.chunking.max_chars).to eq(750)
265
+ expect(config.chunking.max_overlap).to eq(150)
266
+ expect(config.chunking.preset).to eq('balanced')
267
+ end
268
+
269
+ it 'works with absolute paths' do
270
+ config_path = File.expand_path('../fixtures/config.toml', __dir__)
271
+ config = described_class.from_file(config_path)
272
+
273
+ expect(config.use_cache).to be false
274
+ end
275
+
276
+ it 'works with relative paths' do
277
+ config_path = File.join(__dir__, '..', 'fixtures', 'config.yaml')
278
+ config = described_class.from_file(config_path)
279
+
280
+ expect(config.use_cache).to be false
281
+ end
282
+
283
+ it 'raises error for non-existent file' do
284
+ expect do
285
+ described_class.from_file('/path/to/nonexistent/config.toml')
286
+ end.to raise_error(Kreuzberg::Errors::ValidationError, /Failed to read config file/)
287
+ end
288
+
289
+ it 'raises error for invalid TOML file' do
290
+ config_path = File.join(__dir__, '..', 'fixtures', 'invalid_config.toml')
291
+ expect do
292
+ described_class.from_file(config_path)
293
+ end.to raise_error(Kreuzberg::Errors::ValidationError, /Invalid TOML/)
294
+ end
295
+
296
+ it 'detects file format from extension' do
297
+ toml_path = File.join(__dir__, '..', 'fixtures', 'config.toml')
298
+ yaml_path = File.join(__dir__, '..', 'fixtures', 'config.yaml')
299
+
300
+ toml_config = described_class.from_file(toml_path)
301
+ yaml_config = described_class.from_file(yaml_path)
302
+
303
+ expect(toml_config.ocr.language).to eq('deu')
304
+ expect(yaml_config.ocr.language).to eq('fra')
305
+ end
306
+ end
307
+
308
+ it 'creates with default values' do
309
+ config = described_class.new
310
+
311
+ expect(config.use_cache).to be true
312
+ expect(config.enable_quality_processing).to be true
313
+ expect(config.force_ocr).to be false
314
+ expect(config.ocr).to be_nil
315
+ expect(config.chunking).to be_nil
316
+ expect(config.language_detection).to be_nil
317
+ expect(config.pdf_options).to be_nil
318
+ end
319
+
320
+ it 'creates with custom values' do
321
+ ocr = Kreuzberg::Config::OCR.new(backend: 'easyocr')
322
+ chunking = Kreuzberg::Config::Chunking.new(max_chars: 500)
323
+ lang = Kreuzberg::Config::LanguageDetection.new(enabled: true)
324
+ pdf = Kreuzberg::Config::PDF.new(extract_images: true)
325
+
326
+ config = described_class.new(
327
+ use_cache: false,
328
+ enable_quality_processing: true,
329
+ force_ocr: true,
330
+ ocr: ocr,
331
+ chunking: chunking,
332
+ language_detection: lang,
333
+ pdf_options: pdf
334
+ )
335
+
336
+ expect(config.use_cache).to be false
337
+ expect(config.enable_quality_processing).to be true
338
+ expect(config.force_ocr).to be true
339
+ expect(config.ocr).to eq(ocr)
340
+ expect(config.chunking).to eq(chunking)
341
+ expect(config.language_detection).to eq(lang)
342
+ expect(config.pdf_options).to eq(pdf)
343
+ end
344
+
345
+ it 'accepts hash for nested configs' do
346
+ config = described_class.new(
347
+ ocr: { backend: 'tesseract', language: 'eng' },
348
+ chunking: { max_chars: 500 }
349
+ )
350
+
351
+ expect(config.ocr).to be_a(Kreuzberg::Config::OCR)
352
+ expect(config.ocr.backend).to eq('tesseract')
353
+ expect(config.chunking).to be_a(Kreuzberg::Config::Chunking)
354
+ expect(config.chunking.max_chars).to eq(500)
355
+ end
356
+
357
+ it 'converts to hash' do
358
+ config = described_class.new(
359
+ use_cache: false,
360
+ ocr: { backend: 'tesseract' }
361
+ )
362
+ hash = config.to_h
363
+
364
+ expect(hash).to be_a(Hash)
365
+ expect(hash[:use_cache]).to be false
366
+ expect(hash[:ocr]).to be_a(Hash)
367
+ expect(hash[:ocr][:backend]).to eq('tesseract')
368
+ end
369
+
370
+ it 'raises error for invalid config type' do
371
+ expect do
372
+ described_class.new(ocr: 'invalid')
373
+ end.to raise_error(ArgumentError, /Expected.*OCR/)
374
+ end
375
+ end
376
+
377
+ describe 'ExtractionConfig alias' do # Kreuzberg::ExtractionConfig must be the same class as Config::Extraction.
378
+ it 'exists at module level' do
379
+ expect(Kreuzberg.const_defined?(:ExtractionConfig)).to be true
380
+ end
381
+ # Compares the two constants directly rather than comparing instances.
382
+ it 'is the same class as Config::Extraction' do
383
+ expect(Kreuzberg::ExtractionConfig).to eq(Kreuzberg::Config::Extraction)
384
+ end
385
+ # Instances built through the alias report Config::Extraction as their class.
386
+ it 'can be instantiated using the alias' do
387
+ config = Kreuzberg::ExtractionConfig.new(use_cache: false)
388
+
389
+ expect(config).to be_a(Kreuzberg::Config::Extraction)
390
+ expect(config.use_cache).to be false
391
+ end
392
+ # Readers, nested-hash handling for ocr, and #to_h are all exercised through the alias.
393
+ it 'supports all methods through the alias' do
394
+ config = Kreuzberg::ExtractionConfig.new(
395
+ use_cache: false,
396
+ force_ocr: true,
397
+ ocr: { backend: 'tesseract', language: 'eng' }
398
+ )
399
+
400
+ expect(config.use_cache).to be false
401
+ expect(config.force_ocr).to be true
402
+ expect(config.ocr).to be_a(Kreuzberg::Config::OCR)
403
+ expect(config.ocr.backend).to eq('tesseract')
404
+
405
+ hash = config.to_h
406
+ expect(hash[:use_cache]).to be false
407
+ expect(hash[:force_ocr]).to be true
408
+ end
409
+ # The class-level loader is reachable through the alias as well.
410
+ it 'supports from_file through the alias' do
411
+ config_path = File.join(__dir__, '..', 'fixtures', 'config.toml')
412
+ config = Kreuzberg::ExtractionConfig.from_file(config_path)
413
+
414
+ expect(config).to be_a(Kreuzberg::Config::Extraction)
415
+ expect(config.use_cache).to be false
416
+ expect(config.enable_quality_processing).to be true
417
+ end
418
+ end
419
+ end