kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82):
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
@@ -0,0 +1,171 @@
# frozen_string_literal: true

# Specs for Kreuzberg::Config::OCR. They pin the defaults asserted below
# ('tesseract' backend, 'eng' language, nil tesseract_config), the coercion of
# symbol values to strings, the normalization of tesseract_config, the shape
# of #to_h, and the use of an OCR config inside Kreuzberg::Config::Extraction.
RSpec.describe Kreuzberg::Config::OCR do
  describe '#initialize' do
    it 'creates config with default values' do
      config = described_class.new

      expect(config.backend).to eq 'tesseract'
      expect(config.language).to eq 'eng'
      expect(config.tesseract_config).to be_nil
    end

    it 'creates config with custom string values' do
      config = described_class.new(
        backend: 'easyocr',
        language: 'fra'
      )

      expect(config.backend).to eq 'easyocr'
      expect(config.language).to eq 'fra'
    end

    # The keyword names are symbols in every call in this file; what this
    # example varies is that the *values* are symbols, and the readers are
    # expected to hand them back as strings.
    it 'converts symbol values to strings' do
      config = described_class.new(backend: :tesseract, language: :deu)

      expect(config.backend).to eq 'tesseract'
      expect(config.language).to eq 'deu'
    end

    it 'accepts tesseract_config as instance' do
      tesseract = Kreuzberg::Config::Tesseract.new(options: 'value')
      config = described_class.new(tesseract_config: tesseract)

      expect(config.tesseract_config).to be_a Kreuzberg::Config::Tesseract
    end

    it 'converts tesseract_config hash to instance' do
      config = described_class.new(tesseract_config: { option: 'value' })

      expect(config.tesseract_config).to be_a Kreuzberg::Config::Tesseract
    end
  end

  describe '#to_h' do
    it 'serializes to hash with default values' do
      config = described_class.new
      hash = config.to_h

      expect(hash).to be_a Hash
      expect(hash[:backend]).to eq 'tesseract'
      expect(hash[:language]).to eq 'eng'
      # Hash#[] returns nil for a missing key, so this passes whether the key
      # is absent or present with a nil value; absence itself is pinned by
      # 'compacts nil values from hash' below.
      expect(hash[:tesseract_config]).to be_nil
    end

    it 'includes tesseract_config in hash when present' do
      config = described_class.new(
        backend: 'tesseract',
        tesseract_config: { dpi: 300 }
      )
      hash = config.to_h

      expect(hash[:tesseract_config]).to be_a Hash
    end

    it 'compacts nil values from hash' do
      config = described_class.new(backend: 'tesseract')
      hash = config.to_h

      expect(hash.key?(:tesseract_config)).to be false
    end
  end

  describe 'validation' do
    it 'accepts valid backends' do
      expect do
        described_class.new(backend: 'tesseract')
      end.not_to raise_error
    end

    it 'accepts symbol language' do
      expect do
        described_class.new(language: :fra)
      end.not_to raise_error
    end

    # Anything other than a Tesseract instance, a Hash, or nil must be
    # rejected with an ArgumentError whose message matches the pattern below.
    it 'raises error for invalid tesseract_config type' do
      expect do
        described_class.new(tesseract_config: 'invalid')
      end.to raise_error ArgumentError, /Expected.*Tesseract.*Hash.*nil/
    end
  end

  describe 'keyword arguments' do
    it 'accepts keyword arguments only' do
      config = described_class.new(backend: 'tesseract', language: 'eng')

      expect(config.backend).to eq 'tesseract'
      expect(config.language).to eq 'eng'
    end

    # NOTE(review): this example was previously described as
    # 'ignores unknown keywords gracefully', but it passes only the known
    # :backend keyword, so it never exercised how unknown keywords are handled.
    # The description now states what is actually verified. If unknown-keyword
    # behavior matters, add an example that passes one and asserts the
    # outcome -- confirm against the initializer's signature first.
    it 'builds an instance from known keywords' do
      config = described_class.new(backend: 'tesseract')
      expect(config).to be_a described_class
    end
  end

  # NOTE(review): the examples in this group compare individual attribute
  # readers. None of them calls == on the config objects themselves, so
  # object-level equality is not covered here.
  describe 'equality' do
    it 'compares configs by value' do
      config1 = described_class.new(backend: 'tesseract', language: 'eng')
      config2 = described_class.new(backend: 'tesseract', language: 'eng')

      expect(config1.backend).to eq config2.backend
      expect(config1.language).to eq config2.language
    end

    it 'detects differences in backend' do
      config1 = described_class.new(backend: 'tesseract')
      config2 = described_class.new(backend: 'easyocr')

      expect(config1.backend).not_to eq config2.backend
    end

    it 'detects differences in language' do
      config1 = described_class.new(language: 'eng')
      config2 = described_class.new(language: 'fra')

      expect(config1.language).not_to eq config2.language
    end
  end

  describe 'nested config integration' do
    it 'integrates with Extraction config' do
      ocr_config = described_class.new(backend: 'tesseract', language: 'deu')
      extraction = Kreuzberg::Config::Extraction.new(ocr: ocr_config)

      expect(extraction.ocr).to be_a described_class
      expect(extraction.ocr.backend).to eq 'tesseract'
      expect(extraction.ocr.language).to eq 'deu'
    end

    it 'accepts hash in Extraction config and converts to instance' do
      extraction = Kreuzberg::Config::Extraction.new(
        ocr: { backend: 'easyocr', language: 'fra' }
      )

      expect(extraction.ocr).to be_a described_class
      expect(extraction.ocr.backend).to eq 'easyocr'
    end
  end

  # As in '#initialize', the thing being varied in this group is the type of
  # the argument values (Symbol vs String), not the keyword names.
  describe 'symbol vs string value handling' do
    it 'converts symbol values to string attributes' do
      config = described_class.new(backend: :tesseract, language: :fra)

      expect(config.backend).to eq 'tesseract'
      expect(config.language).to eq 'fra'
    end

    it 'handles mixed symbol and string values' do
      config = described_class.new(
        backend: 'tesseract',
        language: :eng
      )

      expect(config.backend).to eq 'tesseract'
      expect(config.language).to eq 'eng'
    end
  end
end
@@ -0,0 +1,380 @@
# frozen_string_literal: true

# Specs for the output_format and result_format options of
# Kreuzberg::Config::Extraction: construction, string coercion, #to_h / JSON
# serialization, [] / []= / get_field access, merging, and rejection of
# invalid values.
#
# RSpec/RepeatedExample is disabled for the file because some examples below
# have identical bodies under different descriptions.
# rubocop:disable RSpec/RepeatedExample
RSpec.describe 'Output Format and Result Format Configuration' do
  describe Kreuzberg::Config::Extraction do
    describe 'output_format' do
      it 'accepts output_format as initialization parameter' do
        expect(described_class.new(output_format: 'markdown').output_format).to eq('markdown')
      end

      it 'defaults to nil when not specified' do
        expect(described_class.new.output_format).to be_nil
      end

      it 'accepts plain format' do
        expect(described_class.new(output_format: 'plain').output_format).to eq('plain')
      end

      it 'accepts markdown format' do
        expect(described_class.new(output_format: 'markdown').output_format).to eq('markdown')
      end

      it 'accepts djot format' do
        expect(described_class.new(output_format: 'djot').output_format).to eq('djot')
      end

      it 'accepts html format' do
        expect(described_class.new(output_format: 'html').output_format).to eq('html')
      end

      # A symbol value must be readable back as the equivalent String.
      it 'converts output_format to string' do
        extraction = described_class.new(output_format: :markdown)

        expect(extraction.output_format).to eq('markdown')
        expect(extraction.output_format).to be_a(String)
      end

      it 'includes output_format in to_h' do
        serialized = described_class.new(output_format: 'markdown').to_h

        expect(serialized[:output_format]).to eq('markdown')
      end

      it 'excludes nil output_format from to_h' do
        serialized = described_class.new(output_format: nil).to_h

        expect(serialized.key?(:output_format)).to be(false)
      end

      it 'includes output_format in JSON' do
        payload = described_class.new(output_format: 'markdown').to_json
        decoded = JSON.parse(payload)

        expect(decoded['output_format']).to eq('markdown')
      end

      it 'retrieves output_format with get_field' do
        extraction = described_class.new(output_format: 'djot')

        expect(extraction.get_field('output_format')).to eq('djot')
      end

      it 'can be set with []=' do
        extraction = described_class.new
        extraction[:output_format] = 'html'

        expect(extraction.output_format).to eq('html')
      end

      # Same setter as above, but the assigned value is a Symbol.
      it 'can be set with []= using symbol' do
        extraction = described_class.new
        extraction[:output_format] = :plain

        expect(extraction.output_format).to eq('plain')
      end

      it 'can be retrieved with []' do
        extraction = described_class.new(output_format: 'markdown')

        expect(extraction[:output_format]).to eq('markdown')
      end
    end

    describe 'result_format' do
      it 'accepts result_format as initialization parameter' do
        expect(described_class.new(result_format: 'unified').result_format).to eq('unified')
      end

      it 'defaults to nil when not specified' do
        expect(described_class.new.result_format).to be_nil
      end

      it 'accepts unified format' do
        expect(described_class.new(result_format: 'unified').result_format).to eq('unified')
      end

      it 'accepts element_based format' do
        expect(described_class.new(result_format: 'element_based').result_format).to eq('element_based')
      end

      # A symbol value must be readable back as the equivalent String.
      it 'converts result_format to string' do
        extraction = described_class.new(result_format: :unified)

        expect(extraction.result_format).to eq('unified')
        expect(extraction.result_format).to be_a(String)
      end

      it 'includes result_format in to_h' do
        serialized = described_class.new(result_format: 'element_based').to_h

        expect(serialized[:result_format]).to eq('element_based')
      end

      it 'excludes nil result_format from to_h' do
        serialized = described_class.new(result_format: nil).to_h

        expect(serialized.key?(:result_format)).to be(false)
      end

      it 'includes result_format in JSON' do
        payload = described_class.new(result_format: 'element_based').to_json
        decoded = JSON.parse(payload)

        expect(decoded['result_format']).to eq('element_based')
      end

      it 'retrieves result_format with get_field' do
        extraction = described_class.new(result_format: 'unified')

        expect(extraction.get_field('result_format')).to eq('unified')
      end

      it 'can be set with []=' do
        extraction = described_class.new
        extraction[:result_format] = 'unified'

        expect(extraction.result_format).to eq('unified')
      end

      # Same setter as above, but the assigned value is a Symbol.
      it 'can be set with []= using symbol' do
        extraction = described_class.new
        extraction[:result_format] = :element_based

        expect(extraction.result_format).to eq('element_based')
      end

      it 'can be retrieved with []' do
        extraction = described_class.new(result_format: 'element_based')

        expect(extraction[:result_format]).to eq('element_based')
      end
    end

    describe 'combined output and result formats' do
      it 'accepts both output_format and result_format' do
        extraction = described_class.new(output_format: 'markdown', result_format: 'unified')

        expect(extraction.output_format).to eq('markdown')
        expect(extraction.result_format).to eq('unified')
      end

      it 'serializes both formats in to_h' do
        serialized = described_class.new(output_format: 'djot', result_format: 'element_based').to_h

        expect(serialized[:output_format]).to eq('djot')
        expect(serialized[:result_format]).to eq('element_based')
      end

      it 'serializes both formats in JSON' do
        payload = described_class.new(output_format: 'html', result_format: 'unified').to_json
        decoded = JSON.parse(payload)

        expect(decoded['output_format']).to eq('html')
        expect(decoded['result_format']).to eq('unified')
      end

      # The override sets only output_format; result_format must survive
      # from the base config.
      it 'merges both formats correctly' do
        original = described_class.new(output_format: 'markdown', result_format: 'unified')
        overrides = described_class.new(output_format: 'html')
        combined = original.merge(overrides)

        expect(combined.output_format).to eq('html')
        expect(combined.result_format).to eq('unified')
      end

      # merge! is checked on the receiver itself, not on a returned copy.
      it 'merges both formats with merge!' do
        target = described_class.new(output_format: 'markdown', result_format: 'unified')
        overrides = described_class.new(output_format: 'djot', result_format: 'element_based')
        target.merge!(overrides)

        expect(target.output_format).to eq('djot')
        expect(target.result_format).to eq('element_based')
      end

      # The override is handed over as a single positional Hash argument.
      it 'handles merge with hash containing both formats' do
        original = described_class.new(output_format: 'plain', result_format: 'unified')
        overrides = { output_format: 'markdown' }
        combined = original.merge(overrides)

        expect(combined.output_format).to eq('markdown')
        expect(combined.result_format).to eq('unified')
      end
    end

    describe 'format persistence across operations' do
      it 'persists output_format through multiple conversions' do
        serialized = described_class.new(output_format: 'markdown').to_h
        rebuilt = described_class.new(**serialized)

        expect(rebuilt.output_format).to eq('markdown')
      end

      it 'persists result_format through multiple conversions' do
        serialized = described_class.new(result_format: 'element_based').to_h
        rebuilt = described_class.new(**serialized)

        expect(rebuilt.result_format).to eq('element_based')
      end

      # JSON.parse yields string keys, so the top-level keys are converted to
      # symbols before being splatted back in as keyword arguments.
      it 'round-trips through JSON' do
        payload = described_class.new(output_format: 'djot', result_format: 'unified').to_json
        decoded = JSON.parse(payload)
        rebuilt = described_class.new(**decoded.transform_keys(&:to_sym))

        expect(rebuilt.output_format).to eq('djot')
        expect(rebuilt.result_format).to eq('unified')
      end
    end

    describe 'format validation and edge cases' do
      it 'raises error for empty string output_format' do
        expect { described_class.new(output_format: '') }
          .to raise_error(ArgumentError, /Invalid output_format/)
      end

      it 'raises error for empty string result_format' do
        expect { described_class.new(result_format: '') }
          .to raise_error(ArgumentError, /Invalid result_format/)
      end

      it 'raises error for whitespace in output_format' do
        expect { described_class.new(output_format: ' plain ') }
          .to raise_error(ArgumentError, /Invalid output_format/)
      end

      it 'normalizes case in output_format' do
        expect(described_class.new(output_format: 'MarkDown').output_format).to eq('markdown')
      end

      it 'raises error for custom string in result_format' do
        expect { described_class.new(result_format: 'custom_format') }
          .to raise_error(ArgumentError, /Invalid result_format/)
      end
    end

    describe 'integration with other config fields' do
      it 'works with output_format and chunking together' do
        extraction = described_class.new(output_format: 'markdown', chunking: { max_chars: 500 })

        expect(extraction.output_format).to eq('markdown')
        expect(extraction.chunking.max_chars).to eq(500)
      end

      it 'works with result_format and OCR together' do
        extraction = described_class.new(result_format: 'element_based', ocr: { backend: 'tesseract' })

        expect(extraction.result_format).to eq('element_based')
        expect(extraction.ocr.backend).to eq('tesseract')
      end

      it 'works with both formats and language detection' do
        extraction = described_class.new(
          output_format: 'html', result_format: 'unified', language_detection: { enabled: true }
        )

        expect(extraction.output_format).to eq('html')
        expect(extraction.result_format).to eq('unified')
        expect(extraction.language_detection.enabled).to be(true)
      end

      # Overridden fields (output_format, chunking) take the override's
      # values; fields the override leaves unset (result_format, ocr) keep
      # the base's values.
      it 'preserves formats in complex config merge' do
        original = described_class.new(
          output_format: 'markdown', result_format: 'unified',
          chunking: { max_chars: 500 }, ocr: { backend: 'tesseract' }
        )
        overrides = described_class.new(output_format: 'djot', chunking: { max_chars: 750 })
        combined = original.merge(overrides)

        expect(combined.output_format).to eq('djot')
        expect(combined.result_format).to eq('unified')
        expect(combined.chunking.max_chars).to eq(750)
        expect(combined.ocr.backend).to eq('tesseract')
      end
    end

    describe 'allowed keys integration' do
      it 'includes output_format in ALLOWED_KEYS' do
        allowed_keys = Kreuzberg::Config::Extraction::ALLOWED_KEYS

        expect(allowed_keys).to include(:output_format)
      end

      it 'includes result_format in ALLOWED_KEYS' do
        allowed_keys = Kreuzberg::Config::Extraction::ALLOWED_KEYS

        expect(allowed_keys).to include(:result_format)
      end
    end
  end
end
# rubocop:enable RSpec/RepeatedExample