kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
@@ -0,0 +1,55 @@
# frozen_string_literal: true

# Smoke-level examples for Kreuzberg::CLI: every public entry point is called
# once and only the general shape of the returned value is asserted.
RSpec.describe Kreuzberg::CLI do
  # Fixture locations are resolved through +test_document_path+, a helper that
  # is not defined in this file (presumably provided by spec_helper -- confirm).
  let(:odt_document) { test_document_path('odt/simple.odt') }
  let(:pdf_document) do
    test_document_path('pdf/100_g_networking_technology_overview_slides_toronto_august_2016.pdf')
  end

  # Shared assertion: the value must be a String and must not be empty.
  def expect_non_empty_string(value)
    expect(value).to be_a(String)
    expect(value).not_to be_empty
  end

  describe '.extract' do
    it 'extracts content from a file' do
      expect_non_empty_string(described_class.extract(odt_document))
    end

    it 'accepts output format option' do
      expect_non_empty_string(described_class.extract(odt_document, output: 'json'))
    end

    it 'accepts OCR option' do
      expect_non_empty_string(described_class.extract(pdf_document, ocr: false))
    end
  end

  describe '.detect' do
    it 'detects MIME type' do
      expect_non_empty_string(described_class.detect(odt_document))
    end
  end

  describe '.version' do
    it 'returns version string' do
      reported = described_class.version

      expect(reported).to be_a(String)
      # Only a "<digits>.<digits>" fragment is required somewhere in the string.
      expect(reported).to match(/\d+\.\d+/)
    end
  end

  describe '.help' do
    it 'returns help text' do
      usage = described_class.help

      expect(usage).to be_a(String)
      expect(usage).to include('kreuzberg')
    end
  end
end
@@ -0,0 +1,377 @@
# frozen_string_literal: true

# JSON is used directly below (JSON.parse, Hash#to_json), so load it explicitly
# instead of relying on another file having required it first.
require 'json'
require 'spec_helper'

# Examples for the helper methods on Kreuzberg::Config::Extraction
# (#to_json, #get_field, #merge, #merge!) and the convenience readers on
# Kreuzberg::Result (#page_count, #chunk_count, #detected_language,
# #metadata_field).
RSpec.describe 'Phase 1 FFI Config and Result Methods' do
  describe 'Kreuzberg::Config::Extraction' do
    describe '#to_json' do
      it 'serializes a basic config to JSON' do
        config = Kreuzberg::Config::Extraction.new(use_cache: true)
        json = config.to_json
        expect(json).to be_a(String)
        parsed = JSON.parse(json)
        expect(parsed['use_cache']).to be true
      end

      it 'serializes complex nested config to JSON' do
        ocr = Kreuzberg::Config::OCR.new(backend: 'tesseract', language: 'deu')
        chunking = Kreuzberg::Config::Chunking.new(max_chars: 500, max_overlap: 50)
        config = Kreuzberg::Config::Extraction.new(
          use_cache: true,
          force_ocr: false,
          ocr: ocr,
          chunking: chunking
        )

        json = config.to_json
        parsed = JSON.parse(json)

        expect(parsed['use_cache']).to be true
        expect(parsed['force_ocr']).to be false
        expect(parsed['ocr']['backend']).to eq('tesseract')
        expect(parsed['ocr']['language']).to eq('deu')
        expect(parsed['chunking']['max_chars']).to eq(500)
        expect(parsed['chunking']['max_overlap']).to eq(50)
      end

      it 'handles minimal config' do
        config = Kreuzberg::Config::Extraction.new
        json = config.to_json
        expect(json).to be_a(String)
        parsed = JSON.parse(json)
        expect(parsed).to be_a(Hash)
      end
    end

    describe '#get_field' do
      let(:config) do
        ocr = Kreuzberg::Config::OCR.new(backend: 'tesseract', language: 'eng')
        Kreuzberg::Config::Extraction.new(
          use_cache: true,
          force_ocr: false,
          ocr: ocr
        )
      end

      it 'gets a top-level field' do
        value = config.get_field('use_cache')
        expect(value).to be true
      end

      it 'gets a nested field with dot notation' do
        value = config.get_field('ocr.backend')
        expect(value).to eq('tesseract')
      end

      it 'gets another nested field' do
        value = config.get_field('ocr.language')
        expect(value).to eq('eng')
      end

      it 'returns nil for non-existent field' do
        value = config.get_field('nonexistent')
        expect(value).to be_nil
      end

      it 'returns nil for non-existent nested field' do
        value = config.get_field('ocr.nonexistent')
        expect(value).to be_nil
      end

      it 'supports symbol field names' do
        value = config.get_field(:use_cache)
        expect(value).to be true
      end

      it 'gets boolean fields correctly' do
        # A stored +false+ must come back as false, not nil.
        value = config.get_field('force_ocr')
        expect(value).to be false
      end
    end

    describe '#merge' do
      let(:base_config) do
        Kreuzberg::Config::Extraction.new(
          use_cache: true,
          force_ocr: false,
          enable_quality_processing: false
        )
      end

      let(:override_config) do
        Kreuzberg::Config::Extraction.new(
          force_ocr: true,
          enable_quality_processing: true
        )
      end

      it 'merges two configs without modifying original' do
        merged = base_config.merge(override_config)

        # The receiver keeps its original values...
        expect(base_config.use_cache).to be true
        expect(base_config.force_ocr).to be false
        expect(base_config.enable_quality_processing).to be false

        # ...while the returned config carries the overrides.
        expect(merged.use_cache).to be true
        expect(merged.force_ocr).to be true
        expect(merged.enable_quality_processing).to be true
      end

      it 'returns a new Extraction instance' do
        merged = base_config.merge(override_config)
        expect(merged).to be_a(Kreuzberg::Config::Extraction)
        expect(merged).not_to be(base_config)
      end

      it 'merges with a Hash' do
        merged = base_config.merge(force_ocr: true)

        expect(merged.use_cache).to be true
        expect(merged.force_ocr).to be true
      end

      it 'handles nested config merging' do
        ocr1 = Kreuzberg::Config::OCR.new(backend: 'tesseract', language: 'eng')
        base = Kreuzberg::Config::Extraction.new(ocr: ocr1, use_cache: true)

        ocr2 = Kreuzberg::Config::OCR.new(backend: 'easyocr')
        override = Kreuzberg::Config::Extraction.new(ocr: ocr2)

        merged = base.merge(override)

        expect(merged.use_cache).to be true
        expect(merged.ocr.backend).to eq('easyocr')
      end
    end

    describe '#merge!' do
      let(:base_config) do
        Kreuzberg::Config::Extraction.new(
          use_cache: true,
          force_ocr: false
        )
      end

      let(:override_config) do
        Kreuzberg::Config::Extraction.new(
          force_ocr: true
        )
      end

      it 'modifies the original config in place' do
        original_object_id = base_config.object_id
        result = base_config.merge!(override_config)

        expect(result.object_id).to eq(original_object_id)
        expect(base_config.use_cache).to be true
        expect(base_config.force_ocr).to be true
      end

      it 'returns self' do
        result = base_config.merge!(override_config)
        expect(result).to be(base_config)
      end

      it 'works with Hash argument' do
        base_config.merge!(force_ocr: true, use_cache: false)

        expect(base_config.force_ocr).to be true
        expect(base_config.use_cache).to be false
      end

      it 'updates all fields correctly' do
        ocr = Kreuzberg::Config::OCR.new(backend: 'easyocr')
        base_config.merge!(ocr: ocr, enable_quality_processing: true)

        expect(base_config.ocr.backend).to eq('easyocr')
        expect(base_config.enable_quality_processing).to be true
      end
    end
  end

  describe 'Kreuzberg::Result' do
    # Raw hash handed to Kreuzberg::Result.new. Note that 'metadata_json' is a
    # JSON *string* (the nested Hash is serialized with #to_json), while the
    # remaining keys are plain Ruby values.
    let(:sample_result_hash) do
      {
        'content' => 'Sample document content',
        'mime_type' => 'application/pdf',
        'metadata_json' => {
          'title' => 'Test Document',
          'language' => 'en',
          'pages' => {
            'total_count' => 10,
            'unit_type' => 'Page'
          },
          'format' => {
            'name' => 'PDF',
            'pages' => 10
          }
        }.to_json,
        'tables' => [],
        'detected_languages' => %w[en de],
        'chunks' => [
          {
            'content' => 'Chunk 1',
            'byte_start' => 0,
            'byte_end' => 7,
            'token_count' => 2,
            'chunk_index' => 0,
            'total_chunks' => 2,
            'first_page' => 1,
            'last_page' => 1,
            'embedding' => nil
          },
          {
            'content' => 'Chunk 2',
            'byte_start' => 8,
            'byte_end' => 15,
            'token_count' => 2,
            'chunk_index' => 1,
            'total_chunks' => 2,
            'first_page' => 2,
            'last_page' => 2,
            'embedding' => nil
          }
        ]
      }
    end

    let(:result) { Kreuzberg::Result.new(sample_result_hash) }

    describe '#page_count' do
      it 'returns the total page count' do
        expect(result.page_count).to eq(10)
      end

      it 'returns 0 for result without page info' do
        minimal_result = Kreuzberg::Result.new(
          'content' => 'Test',
          'mime_type' => 'text/plain',
          'metadata_json' => '{}'
        )
        expect(minimal_result.page_count).to eq(0)
      end

      it 'returns 0 when metadata has no pages info' do
        result_no_pages = Kreuzberg::Result.new(
          'content' => 'Test',
          'mime_type' => 'text/plain',
          'metadata_json' => '{"title": "Test"}'
        )
        expect(result_no_pages.page_count).to eq(0)
      end
    end

    describe '#chunk_count' do
      it 'returns the total number of chunks' do
        expect(result.chunk_count).to eq(2)
      end

      it 'returns 0 for result without chunks' do
        no_chunks_result = Kreuzberg::Result.new(
          'content' => 'Test',
          'mime_type' => 'text/plain',
          'metadata_json' => '{}'
        )
        expect(no_chunks_result.chunk_count).to eq(0)
      end

      it 'returns 0 for empty chunks array' do
        empty_chunks_result = Kreuzberg::Result.new(
          'content' => 'Test',
          'mime_type' => 'text/plain',
          'metadata_json' => '{}',
          'chunks' => []
        )
        expect(empty_chunks_result.chunk_count).to eq(0)
      end
    end

    describe '#detected_language' do
      it 'returns the primary detected language from metadata' do
        expect(result.detected_language).to eq('en')
      end

      it 'returns the first detected language if metadata language is not set' do
        result_with_detected = Kreuzberg::Result.new(
          'content' => 'Test',
          'mime_type' => 'text/plain',
          'metadata_json' => '{}',
          'detected_languages' => %w[fr de]
        )
        expect(result_with_detected.detected_language).to eq('fr')
      end

      it 'returns nil when no language is detected' do
        no_lang_result = Kreuzberg::Result.new(
          'content' => 'Test',
          'mime_type' => 'text/plain',
          'metadata_json' => '{}'
        )
        expect(no_lang_result.detected_language).to be_nil
      end

      it 'returns nil for empty detected languages array' do
        empty_langs_result = Kreuzberg::Result.new(
          'content' => 'Test',
          'mime_type' => 'text/plain',
          'metadata_json' => '{}',
          'detected_languages' => []
        )
        expect(empty_langs_result.detected_language).to be_nil
      end
    end

    describe '#metadata_field' do
      it 'gets a top-level metadata field' do
        value = result.metadata_field('title')
        expect(value).to eq('Test Document')
      end

      it 'gets a nested metadata field with dot notation' do
        value = result.metadata_field('pages.total_count')
        expect(value).to eq(10)
      end

      it 'gets another nested field' do
        value = result.metadata_field('format.name')
        expect(value).to eq('PDF')
      end

      it 'returns nil for non-existent field' do
        value = result.metadata_field('nonexistent')
        expect(value).to be_nil
      end

      it 'returns nil for non-existent nested field' do
        value = result.metadata_field('format.nonexistent')
        expect(value).to be_nil
      end

      it 'supports symbol field names' do
        value = result.metadata_field(:title)
        expect(value).to eq('Test Document')
      end

      it 'returns nil when trying to access nested field on non-hash value' do
        # 'title' holds a String, so there is nothing to descend into.
        value = result.metadata_field('title.nested')
        expect(value).to be_nil
      end

      # The path has two segments, the same depth as the other nested examples;
      # what this adds is a numeric value under the 'format' key.
      it 'gets a nested numeric field' do
        value = result.metadata_field('format.pages')
        expect(value).to eq(10)
      end

      # The input here is an unparseable 'metadata_json' string, not a missing one.
      it 'returns nil when metadata_json is not valid JSON' do
        no_metadata = Kreuzberg::Result.new(
          'content' => 'Test',
          'mime_type' => 'text/plain',
          'metadata_json' => 'invalid json'
        )
        expect(no_metadata.metadata_field('title')).to be_nil
      end
    end
  end
end