kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
@@ -0,0 +1,399 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe 'Error Handling' do
4
# String-keyed hash standing in for an OCR result; image_result_payload embeds it
# under the image's 'ocr_result' key.
let(:nested_ocr_result) do
  { 'content' => 'ocr text', 'mime_type' => 'text/plain', 'metadata_json' => '{}', 'tables' => [] }
end
12
+
13
# Symbol-keyed payload for Kreuzberg::Result.new carrying a single string-keyed image entry.
# The image data contains a NUL byte and the entry nests nested_ocr_result as its OCR result.
let(:image_result_payload) do
  inline_image = {
    'data' => "binary\0data",
    'format' => 'png',
    'image_index' => 0,
    'page_number' => 1,
    'width' => 100,
    'height' => 200,
    'colorspace' => 'RGB',
    'bits_per_component' => 8,
    'is_mask' => false,
    'description' => 'inline image',
    'ocr_result' => nested_ocr_result
  }

  { content: 'Test', mime_type: 'text/plain', images: [inline_image] }
end
34
+
35
# Each example builds a config object with an out-of-range value or a wrongly typed
# sub-config and expects the constructor call to raise.
describe 'invalid configuration handling' do
  it 'raises error for negative max_chars in chunking' do
    build_config = lambda do
      Kreuzberg::Config::Extraction.new(
        chunking: Kreuzberg::Config::Chunking.new(max_chars: -100)
      )
    end

    expect { build_config.call }.to raise_error do |raised|
      expect(raised).to be_a(StandardError)
      expect(raised.message.downcase).to match(/negative|invalid|positive|max_chars/)
    end
  end

  it 'raises error for negative max_overlap in chunking' do
    expect { Kreuzberg::Config::Chunking.new(max_overlap: -50) }.to raise_error do |raised|
      expect(raised).to be_a(StandardError)
      expect(raised.message.downcase).to match(/negative|invalid|overlap/)
    end
  end

  it 'raises ArgumentError for invalid OCR config type' do
    expect { Kreuzberg::Config::Extraction.new(ocr: 'invalid_string') }.to raise_error(ArgumentError) do |raised|
      expect(raised.message).to include('Expected')
      expect(raised.message).to include('OCR')
    end
  end

  it 'raises ArgumentError for invalid chunking config type' do
    expect { Kreuzberg::Config::Extraction.new(chunking: 123) }.to raise_error(ArgumentError)
  end

  it 'raises ArgumentError for invalid language_detection config' do
    expect { Kreuzberg::Config::Extraction.new(language_detection: []) }.to raise_error(ArgumentError)
  end

  it 'raises ArgumentError for invalid pdf_options config' do
    expect { Kreuzberg::Config::Extraction.new(pdf_options: 'invalid_string') }.to raise_error(ArgumentError)
  end

  it 'provides descriptive error messages for config validation' do
    # Evaluates to the rescued ArgumentError, or nil when construction unexpectedly succeeds.
    captured =
      begin
        Kreuzberg::Config::Extraction.new(ocr: 12_345)
        nil
      rescue ArgumentError => e
        e
      end

    expect(captured).not_to be_nil
    expect(captured.message).to be_a(String)
    expect(captured.message).not_to be_empty
  end
end
102
+
103
# Extraction from paths that are missing, empty or nil must raise; extraction from a file of
# garbage bytes may either return a Result or raise.
describe 'file not found and corrupted files' do
  it 'raises error for non-existent file with meaningful message' do
    expect { Kreuzberg.extract_file_sync(path: '/nonexistent/path/file.txt') }.to raise_error do |raised|
      expect(raised).to be_a(StandardError)
      expect(raised.message).not_to be_empty
    end
  end

  it 'raises error for empty file path' do
    expect { Kreuzberg.extract_file_sync(path: '') }.to raise_error(StandardError)
  end

  it 'raises error for nil file path' do
    expect { Kreuzberg.extract_file_sync(path: nil) }.to raise_error(StandardError)
  end

  it 'handles corrupted file gracefully' do
    # Binary garbage that is not a valid document.
    garbage_path = create_test_file("\x00\x01\x02\xFF\xFE\xFD", filename: 'corrupted.bin')

    begin
      extracted = Kreuzberg.extract_file_sync(path: garbage_path, mime_type: 'application/octet-stream')
      # Succeeding (possibly with empty content) and raising are both accepted outcomes.
      expect(extracted).to be_a(Kreuzberg::Result)
    rescue Kreuzberg::Errors::ParsingError => e
      expect(e).to be_a(Kreuzberg::Errors::ParsingError)
      expect(e.message).not_to be_empty
    rescue StandardError => e
      expect(e).to be_a(StandardError)
    end
  end
end
143
+
144
# Extraction with an unknown, malformed or empty MIME type hint. Every example accepts either a
# returned Kreuzberg::Result or a raised StandardError, and asserts that one of the two occurred.
describe 'invalid MIME type handling' do
  it 'gracefully handles unknown MIME types' do
    path = create_test_file('Content with unknown type')

    # The path is passed by keyword, consistent with the other extract_file_sync calls in this
    # spec. A signature mismatch would raise ArgumentError, which the rescue below would accept
    # without the MIME type ever being examined.
    outcome =
      begin
        Kreuzberg.extract_file_sync(path: path, mime_type: 'application/x-custom-unknown-format')
      rescue StandardError => e
        e
      end

    expect(outcome).to be_a(Kreuzberg::Result).or be_a(StandardError)
    expect(outcome.message).not_to be_empty if outcome.is_a?(StandardError)
  end

  it 'handles malformed MIME type strings' do
    path = create_test_file('Test content')

    # Either succeeds or raises with a meaningful error - both acceptable.
    outcome =
      begin
        Kreuzberg.extract_file_sync(path: path, mime_type: '///invalid@@@')
      rescue StandardError => e
        e
      end

    expect(outcome).to be_a(Kreuzberg::Result).or be_a(StandardError)
  end

  it 'rejects empty MIME type with appropriate error' do
    path = create_test_file('Test')

    # An empty MIME type should either be rejected or handled gracefully. The return value is
    # kept so that the successful path is asserted on as well, not only the raising one.
    outcome =
      begin
        Kreuzberg.extract_file_sync(path: path, mime_type: '')
      rescue StandardError => e
        e
      end

    expect(outcome).to be_a(Kreuzberg::Result).or be_a(StandardError)
  end
end
191
+
192
# Extraction from a file whose permission bits have all been cleared.
describe 'permission and I/O errors' do
  it 'raises IOError or subclass for permission denied scenario' do
    test_file = create_test_file('test content')
    File.chmod(0o000, test_file)

    begin
      # Mode 000 is not enforced everywhere (for example when the suite runs as root). In that
      # case no error can be provoked, so the example is skipped instead of passing vacuously.
      skip 'file permissions are not enforced in this environment' if File.readable?(test_file)

      # Kreuzberg::Errors::IOError, Errno::EACCES and any other StandardError are all accepted,
      # but an error must actually be raised.
      expect { Kreuzberg.extract_file_sync(path: test_file) }.to raise_error(StandardError)
    ensure
      # Restore permissions so the file can be cleaned up afterwards.
      File.chmod(0o644, test_file)
    end
  end
end
215
+
216
# Kreuzberg::Result built directly from incomplete or invalid payloads: the examples pin the
# fallback values exposed by its readers.
describe 'malformed document handling' do
  it 'handles invalid JSON metadata gracefully' do
    built = Kreuzberg::Result.new(content: 'Test content', mime_type: 'text/plain',
                                  metadata_json: 'this is not valid json {')

    expect(built.content).to eq('Test content')
    expect(built.metadata).to eq({})
    expect(built.metadata).to be_a(Hash)
  end

  it 'handles empty metadata JSON' do
    built = Kreuzberg::Result.new(content: 'Test', mime_type: 'text/plain', metadata_json: '')

    expect(built.metadata).to eq({})
    expect(built.content).to eq('Test')
  end

  it 'handles nil metadata JSON' do
    built = Kreuzberg::Result.new(content: 'Test', mime_type: 'text/plain', metadata_json: nil)

    expect(built.metadata).to eq({})
  end

  it 'handles malformed result object gracefully' do
    # An empty hash: no field is supplied at all.
    built = Kreuzberg::Result.new({})

    expect(built.content).to eq('')
    expect(built.mime_type).to eq('')
    expect(built.metadata).to eq({})
    expect(built.tables).to eq([])
    expect(built.detected_languages).to be_nil
    expect(built.chunks).to eq([])
    expect(built.images).to be_nil
  end

  it 'handles partial result data without errors' do
    built = Kreuzberg::Result.new(content: 'Partial content', mime_type: 'text/plain')

    expect(built.content).to eq('Partial content')
    expect(built.mime_type).to eq('text/plain')
    expect(built.tables).to eq([])
    expect(built.metadata).to eq({})
  end
end
274
+
275
# Batch extraction over lists that contain missing paths. Each example accepts either an Array
# return value or a raised StandardError.
describe 'batch extraction error handling' do
  it 'handles mixed valid and invalid files in batch' do
    existing = create_test_file('Valid file content')
    batch = [existing, '/definitely/nonexistent/file.txt']

    begin
      extracted = Kreuzberg.batch_extract_files_sync(batch)
      expect(extracted).to be_an(Array)
    rescue StandardError => e
      expect(e).to be_a(StandardError)
      expect(e.message).not_to be_empty
    end
  end

  it 'handles all invalid files in batch without crashing' do
    batch = ['/nonexistent1.txt', '/nonexistent2.txt', '/nonexistent3.txt']

    begin
      extracted = Kreuzberg.batch_extract_files_sync(batch)
      expect(extracted).to be_an(Array)
    rescue StandardError => e
      expect(e).to be_a(StandardError)
    end
  end

  it 'provides error context in batch results' do
    existing = create_test_file('First file')
    batch = [existing, '/nonexistent/second.txt']

    begin
      extracted = Kreuzberg.batch_extract_files_sync(batch)
      expect(extracted).to be_an(Array)
    rescue StandardError => e
      expect(e).to be_a(StandardError)
    end
  end
end
320
+
321
# Three failing extractions issued back to back from a single loop; every one must raise.
describe 'concurrent error states' do
  it 'handles rapid successive error operations' do
    # One slot per attempt: the rescued error, or nil when the call did not raise.
    outcomes = Array.new(3) do |attempt|
      Kreuzberg.extract_file_sync(path: "/nonexistent#{attempt}.pdf")
      nil
    rescue StandardError => e
      e
    end
    raised = outcomes.compact

    expect(raised.length).to eq(3)
    expect(raised).to all(be_a(StandardError))
  end
end
335
+
336
# A failed extraction must not prevent later extractions from succeeding.
describe 'error recovery and graceful degradation' do
  it 'recovers gracefully after file not found error' do
    # First operation: extracting from a nonexistent file has to raise.
    expect { Kreuzberg.extract_file_sync(path: '/nonexistent/does_not_exist.txt') }.to raise_error(StandardError)

    # Second operation: a valid file should still extract fine.
    readable_file = create_test_file('Valid content after error')
    extracted = Kreuzberg.extract_file_sync(path: readable_file)

    expect(extracted).to be_a(Kreuzberg::Result)
  end

  it 'handles mixed error and success scenarios in sequence' do
    timeline = []

    # Runs an extraction and records `tag` only when it raises.
    record_failure = lambda do |missing_path, tag|
      Kreuzberg.extract_file_sync(path: missing_path)
    rescue StandardError
      timeline << tag
    end

    record_failure.call('/nonexistent/file1.txt', :error1)

    # Valid extraction; not rescued, so an error here fails the example.
    readable_file = create_test_file('Valid content')
    Kreuzberg.extract_file_sync(path: readable_file)
    timeline << :success1

    record_failure.call('/nonexistent/file2.txt', :error2)

    expect(timeline).to eq(%i[error1 success1 error2])
  end
end
380
+
381
# Type expectations on result fields: String content and MIME type from a real extraction, and
# binary-encoded image data plus a nested Result from a hand-built payload.
describe 'type conversion and coercion errors' do
  it 'handles non-string content in results gracefully' do
    source_path = create_test_file('Type coercion test')
    extracted = Kreuzberg.extract_file_sync(path: source_path)

    expect(extracted.content).to be_a(String)
    expect(extracted.mime_type).to be_a(String)
  end

  it 'extracts images with proper encoding handling' do
    images = Kreuzberg::Result.new(image_result_payload).images
    first_image = images&.first

    expect(first_image&.format).to eq('png')
    expect(first_image&.data&.encoding).to eq(Encoding::BINARY)
    expect(first_image&.ocr_result).to be_a(Kreuzberg::Result)
  end
end
399
+ end