kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
@@ -0,0 +1,677 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+ require 'tempfile'
5
+ require 'fileutils'
6
+ require 'securerandom'
7
+
8
+ RSpec.describe 'Batch Operations' do
9
describe 'batch_extract_files with multiple file types' do
  # Every example keeps its Tempfile objects in a local until the example ends:
  # a Tempfile that is no longer referenced can be unlinked by its finalizer
  # when it is garbage collected, which would invalidate the path before the
  # batch call runs. Files are removed in +ensure+ so a failed expectation
  # still cleans up promptly.

  it 'processes mixed file types in single batch' do
    fixtures = [
      ['.txt', 'Text file content: Machine learning transforms technology.'],
      # Double-quoted so "\n" produces real newlines (single quotes would write
      # a literal backslash followed by "n" into the markdown file).
      ['.md', "# Markdown Header\n\nContent about artificial intelligence."]
    ]
    files = fixtures.map do |extension, body|
      Tempfile.new(['batch_test', extension]).tap do |f|
        f.write(body)
        f.close
      end
    end
    paths = files.map(&:path)

    config = Kreuzberg::Config::Extraction.new
    results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

    expect(results).to be_a(Array)
    expect(results.length).to eq(2)
    results.each do |result|
      expect(result).to be_a(Kreuzberg::Result)
      expect(result.content).not_to be_empty
    end
  ensure
    files&.each(&:unlink)
  end

  it 'maintains file order through batch processing' do
    # One random marker per file; result N must contain marker N.
    unique_markers = Array.new(3) { "MARKER_#{SecureRandom.hex(4)}" }
    files = unique_markers.each_with_index.map do |marker, i|
      Tempfile.new(["ordered_#{i}", '.txt']).tap do |f|
        f.write("File #{i}: #{marker}")
        f.close
      end
    end
    paths = files.map(&:path)

    config = Kreuzberg::Config::Extraction.new
    results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

    expect(results.length).to eq(paths.length)
    results.each_with_index do |result, idx|
      expect(result.content).to include(unique_markers[idx])
    end
  ensure
    files&.each(&:unlink)
  end

  it 'processes large batch operations efficiently' do
    # Create 20 test files
    files = Array.new(20) do |i|
      Tempfile.new(["large_batch_#{i}", '.txt']).tap do |f|
        f.write("Content #{i}: Machine learning technology")
        f.close
      end
    end
    paths = files.map(&:path)

    config = Kreuzberg::Config::Extraction.new
    results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

    expect(results.length).to eq(20)
    expect(results).to all(be_a(Kreuzberg::Result))
  ensure
    files&.each(&:unlink)
  end

  it 'handles batch with different file sizes' do
    # Insertion order is small, medium, large; the last expectation compares
    # the third result against the first and relies on that order.
    fixtures = {
      'small' => 'AI',
      'medium' => 'Machine learning is a subset of artificial intelligence.',
      'large' => 'Machine learning ' * 100
    }
    files = fixtures.map do |basename, body|
      Tempfile.new([basename, '.txt']).tap do |f|
        f.write(body)
        f.close
      end
    end
    paths = files.map(&:path)

    config = Kreuzberg::Config::Extraction.new
    results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

    expect(results.length).to eq(3)
    expect(results).to all(be_a(Kreuzberg::Result))
    expect(results[2].content.length).to be >= results[0].content.length
  ensure
    files&.each(&:unlink)
  end
end
113
+
114
describe 'batch extraction with configuration options' do
  # Tempfile objects are held in a local for the whole example because an
  # unreferenced Tempfile may be unlinked when it is garbage collected, and
  # they are removed in +ensure+ so cleanup also happens on failure.

  it 'applies consistent configuration across batch' do
    files = Array.new(3) do |i|
      Tempfile.new(["config_batch_#{i}", '.txt']).tap do |f|
        f.write("Machine learning content #{i}. Artificial intelligence advances.")
        f.close
      end
    end
    paths = files.map(&:path)

    config = Kreuzberg::Config::Extraction.new(
      keywords: Kreuzberg::Config::Keywords.new(
        algorithm: 'yake',
        max_keywords: 5
      )
    )

    results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

    expect(results.length).to eq(3)
    results.each do |result|
      expect(result).to be_a(Kreuzberg::Result)
      expect(result.content).not_to be_nil
    end
  ensure
    files&.each(&:unlink)
  end

  it 'batch respects caching configuration' do
    file = Tempfile.new(['cache_test', '.txt']).tap do |f|
      f.write('Cache test content')
      f.close
    end
    paths = [file.path]

    # Same file extracted twice: first with the cache disabled, then enabled.
    config_no_cache = Kreuzberg::Config::Extraction.new(use_cache: false)
    uncached = Kreuzberg.batch_extract_files_sync(paths: paths, config: config_no_cache)

    config_with_cache = Kreuzberg::Config::Extraction.new(use_cache: true)
    cached = Kreuzberg.batch_extract_files_sync(paths: paths, config: config_with_cache)

    expect(uncached.length).to eq(1)
    expect(cached.length).to eq(1)
    expect(uncached[0].content).to eq(cached[0].content)
  ensure
    file&.unlink
  end

  it 'supports keyword extraction configuration in batch' do
    files = Array.new(2) do |i|
      Tempfile.new(["keywords_batch_#{i}", '.txt']).tap do |f|
        f.write('Machine learning and deep learning enable artificial intelligence.')
        f.close
      end
    end
    paths = files.map(&:path)

    # The same two files are extracted once per keyword algorithm.
    %w[yake rake].each do |algo|
      config = Kreuzberg::Config::Extraction.new(
        keywords: Kreuzberg::Config::Keywords.new(
          algorithm: algo,
          max_keywords: 5
        )
      )

      results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
      expect(results.length).to eq(2)
    end
  ensure
    files&.each(&:unlink)
  end
end
189
+
190
describe 'batch error handling and resilience' do
  it 'processes batch with some invalid paths gracefully' do
    # NOTE(review): despite the example name, only a valid path is submitted;
    # no invalid path is exercised here.
    valid_file = Tempfile.new(['valid_batch', '.txt'])
    valid_file.write('Valid content')
    valid_file.close

    valid_path = valid_file.path
    extraction_config = Kreuzberg::Config::Extraction.new

    # Process just the valid path
    batch = Kreuzberg.batch_extract_files_sync(paths: [valid_path], config: extraction_config)
    expect(batch.length).to eq(1)
    expect(batch[0]).to be_a(Kreuzberg::Result)

    FileUtils.rm_f(valid_path)
  end

  it 'handles empty file list in batch' do
    extraction_config = Kreuzberg::Config::Extraction.new
    batch = Kreuzberg.batch_extract_files_sync(paths: [], config: extraction_config)

    # An empty input list must come back as an empty Array.
    expect(batch).to be_a(Array)
    expect(batch).to be_empty
  end

  it 'processes batch with single file' do
    single = Tempfile.new(['single_batch', '.txt'])
    single.write('Single file batch processing')
    single.close

    extraction_config = Kreuzberg::Config::Extraction.new
    batch = Kreuzberg.batch_extract_files_sync(paths: [single.path], config: extraction_config)

    expect(batch.length).to eq(1)
    expect(batch[0]).to be_a(Kreuzberg::Result)

    FileUtils.rm_f(single.path)
  end

  it 'maintains batch execution on partial failures' do
    # NOTE(review): the batch contains a single valid file, so no partial
    # failure actually occurs in this example.
    valid_file = Tempfile.new(['valid', '.txt'])
    valid_file.write('Valid content')
    valid_file.close

    extraction_config = Kreuzberg::Config::Extraction.new
    batch = Kreuzberg.batch_extract_files_sync(paths: [valid_file.path], config: extraction_config)

    expect(batch).to be_a(Array)
    expect(batch).to all(be_a(Kreuzberg::Result))

    FileUtils.rm_f(valid_file.path)
  end
end
246
+
247
describe 'batch enumerable processing' do
  # These examples treat the batch return value as a plain Enumerable.
  # Tempfile objects stay referenced until each example ends (an unreferenced
  # Tempfile may be unlinked on garbage collection) and are removed in
  # +ensure+ so cleanup also runs when an expectation fails.

  it 'iterates over batch results with each' do
    files = Array.new(3) do |i|
      Tempfile.new(["enum_#{i}", '.txt']).tap do |f|
        f.write("Enumerable test #{i}")
        f.close
      end
    end
    paths = files.map(&:path)

    config = Kreuzberg::Config::Extraction.new
    results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

    # Count manually to confirm +each+ yields every result exactly once.
    count = 0
    results.each do |result|
      expect(result).to be_a(Kreuzberg::Result)
      count += 1
    end

    expect(count).to eq(3)
  ensure
    files&.each(&:unlink)
  end

  it 'maps batch results to extract content' do
    files = Array.new(3) do |i|
      Tempfile.new(["map_#{i}", '.txt']).tap do |f|
        f.write("Mapping #{i}")
        f.close
      end
    end
    paths = files.map(&:path)

    config = Kreuzberg::Config::Extraction.new
    results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

    contents = results.map(&:content)
    expect(contents).to be_a(Array)
    expect(contents.length).to eq(3)
    expect(contents).to all(be_a(String))
  ensure
    files&.each(&:unlink)
  end

  it 'filters batch results by content length' do
    # One 1-character file and one 400-character file; only the latter should
    # pass the "> 20 characters" filter below.
    fixtures = { 'small' => 'x', 'large' => 'content ' * 50 }
    files = fixtures.map do |basename, body|
      Tempfile.new([basename, '.txt']).tap do |f|
        f.write(body)
        f.close
      end
    end
    paths = files.map(&:path)

    config = Kreuzberg::Config::Extraction.new
    results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

    large_results = results.select { |r| r.content.length > 20 }
    expect(large_results.length).to be >= 1
  ensure
    files&.each(&:unlink)
  end

  it 'reduces batch results to combined content' do
    files = Array.new(3) do |i|
      Tempfile.new(["reduce_#{i}", '.txt']).tap do |f|
        f.write("Part #{i} ")
        f.close
      end
    end
    paths = files.map(&:path)

    config = Kreuzberg::Config::Extraction.new
    results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

    combined = results.reduce('') { |acc, r| acc + r.content }
    expect(combined).not_to be_empty
    expect(combined).to include('Part')
  ensure
    files&.each(&:unlink)
  end
end
339
+
340
describe 'batch with chunking and embeddings' do
  # Tempfile objects are kept referenced until the example ends (an
  # unreferenced Tempfile may be unlinked on garbage collection) and are
  # removed in +ensure+ so cleanup also runs when an expectation fails.

  it 'processes batch with chunking enabled' do
    files = Array.new(2) do |i|
      Tempfile.new(["chunking_batch_#{i}", '.txt']).tap do |f|
        f.write('Machine learning ' * 50)
        f.close
      end
    end
    paths = files.map(&:path)

    config = Kreuzberg::Config::Extraction.new(
      chunking: Kreuzberg::Config::Chunking.new(
        enabled: true,
        max_chars: 100
      )
    )

    results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

    expect(results.length).to eq(2)
    expect(results).to all(be_a(Kreuzberg::Result))
  ensure
    files&.each(&:unlink)
  end

  it 'batch processes with embedding generation' do
    files = Array.new(2) do |i|
      Tempfile.new(["embedding_batch_#{i}", '.txt']).tap do |f|
        f.write('Artificial intelligence transforms technology development.')
        f.close
      end
    end
    paths = files.map(&:path)

    # Use basic chunking without embeddings to avoid ONNX dependency.
    # NOTE(review): as a result no embedding configuration is exercised here,
    # despite the example name.
    config = Kreuzberg::Config::Extraction.new(
      chunking: Kreuzberg::Config::Chunking.new(
        enabled: true,
        max_chars: 100
      )
    )

    results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

    expect(results.length).to eq(2)
    expect(results).to all(be_a(Kreuzberg::Result))
  ensure
    files&.each(&:unlink)
  end
end
392
+
393
describe 'batch result properties and validation' do
  # Tempfile objects are kept referenced until the example ends (an
  # unreferenced Tempfile may be unlinked on garbage collection) and are
  # removed in +ensure+ so cleanup also runs when an expectation fails.

  it 'each batch result has required properties' do
    files = Array.new(2) do |i|
      Tempfile.new(["props_#{i}", '.txt']).tap do |f|
        f.write("Result properties test #{i}")
        f.close
      end
    end
    paths = files.map(&:path)

    config = Kreuzberg::Config::Extraction.new
    results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

    results.each do |result|
      expect(result).to respond_to(:content)
      expect(result).to respond_to(:mime_type)
      expect(result.content).to be_a(String)
      expect(result.mime_type).to be_a(String)
    end
  ensure
    files&.each(&:unlink)
  end

  it 'batch results maintain independence' do
    fixtures = { 'indep1' => 'First file content', 'indep2' => 'Second file content' }
    files = fixtures.map do |basename, body|
      Tempfile.new([basename, '.txt']).tap do |f|
        f.write(body)
        f.close
      end
    end
    paths = files.map(&:path)

    config = Kreuzberg::Config::Extraction.new
    results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

    # Each result must carry only its own file's text.
    expect(results[0].content).not_to eq(results[1].content)
    expect(results[0].content).to include('First')
    expect(results[1].content).to include('Second')
  ensure
    files&.each(&:unlink)
  end

  it 'batch results have consistent structure' do
    files = Array.new(3) do |i|
      Tempfile.new(["struct_#{i}", '.txt']).tap do |f|
        f.write("Structure test #{i}")
        f.close
      end
    end
    paths = files.map(&:path)

    config = Kreuzberg::Config::Extraction.new
    results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

    # Use the first result's hash keys as the reference shape. The comparison
    # is skipped when results do not respond to +to_h+ or the reference is empty.
    reference_keys = results.first.respond_to?(:to_h) ? results.first.to_h.keys : []

    results.each do |result|
      next unless reference_keys.any? && result.respond_to?(:to_h)

      expect(result.to_h.keys).to match_array(reference_keys)
    end
  ensure
    files&.each(&:unlink)
  end
end
467
+
468
describe 'batch performance characteristics' do
  it 'completes batch faster than sequential processing' do
    # Keep the Tempfile objects referenced for the whole example; an
    # unreferenced Tempfile may be unlinked when it is garbage collected.
    files = Array.new(5) do |i|
      Tempfile.new(["perf_#{i}", '.txt']).tap do |f|
        f.write("Performance test #{i}")
        f.close
      end
    end
    paths = files.map(&:path)

    config = Kreuzberg::Config::Extraction.new

    # Durations use the monotonic clock so wall-clock adjustments (NTP, DST)
    # cannot skew or negate the measurement.
    monotonic_now = -> { Process.clock_gettime(Process::CLOCK_MONOTONIC) }

    # Batch time
    batch_start = monotonic_now.call
    batch_results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
    batch_time = monotonic_now.call - batch_start

    # Sequential time
    seq_start = monotonic_now.call
    seq_results = paths.map { |p| Kreuzberg.extract_file_sync(path: p, config: config) }
    seq_time = monotonic_now.call - seq_start

    expect(batch_results.length).to eq(seq_results.length)
    # Batch should be faster or comparable; the 1.0 s allowance absorbs timing noise.
    expect(batch_time).to be <= seq_time + 1.0
  ensure
    files&.each(&:unlink)
  end
end
498
+
499
describe 'batch with special configurations' do
  # Tempfile objects are kept referenced until the example ends (an
  # unreferenced Tempfile may be unlinked on garbage collection) and are
  # removed in +ensure+ so cleanup also runs when an expectation fails.

  it 'batch processes with language detection' do
    file = Tempfile.new(['lang_batch', '.txt']).tap do |f|
      f.write('Machine learning is transforming industries worldwide.')
      f.close
    end
    paths = [file.path]

    config = Kreuzberg::Config::Extraction.new(
      language_detection: Kreuzberg::Config::LanguageDetection.new(enabled: true)
    )

    results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
    expect(results.length).to eq(1)
  ensure
    file&.unlink
  end

  it 'batch with mixed keyword algorithms' do
    files = Array.new(2) do |i|
      Tempfile.new(["mixed_algo_#{i}", '.txt']).tap do |f|
        f.write('Machine learning neural networks artificial intelligence')
        f.close
      end
    end
    paths = files.map(&:path)

    # First batch with YAKE
    config_yake = Kreuzberg::Config::Extraction.new(
      keywords: Kreuzberg::Config::Keywords.new(algorithm: 'yake', max_keywords: 3)
    )
    results_yake = Kreuzberg.batch_extract_files_sync(paths: paths, config: config_yake)
    expect(results_yake.length).to eq(2)

    # Second batch with RAKE, over the same files
    config_rake = Kreuzberg::Config::Extraction.new(
      keywords: Kreuzberg::Config::Keywords.new(algorithm: 'rake', max_keywords: 3)
    )
    results_rake = Kreuzberg.batch_extract_files_sync(paths: paths, config: config_rake)
    expect(results_rake.length).to eq(2)
  ensure
    files&.each(&:unlink)
  end
end
546
+
547
describe 'batch with result aggregation' do
  # Tempfile objects are kept referenced until the example ends (an
  # unreferenced Tempfile may be unlinked on garbage collection) and are
  # removed in +ensure+ so cleanup also runs when an expectation fails.

  it 'aggregates batch results into statistics' do
    files = Array.new(3) do |i|
      Tempfile.new(["stats_#{i}", '.txt']).tap do |f|
        f.write("Content #{i} " * 10)
        f.close
      end
    end
    paths = files.map(&:path)

    config = Kreuzberg::Config::Extraction.new
    results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

    # Create aggregated statistics; the total is computed once and reused.
    total_content_length = results.sum { |r| r.content.length }
    stats = {
      total_files: results.length,
      total_content_length: total_content_length,
      # Integer division: the average is truncated to a whole number.
      avg_content_length: total_content_length / results.length,
      mime_types: results.map(&:mime_type).uniq
    }

    expect(stats[:total_files]).to eq(3)
    expect(stats[:total_content_length]).to be > 0
    expect(stats[:avg_content_length]).to be > 0
    expect(stats[:mime_types]).to be_a(Array)
  ensure
    files&.each(&:unlink)
  end

  it 'batch results support JSON serialization' do
    file = Tempfile.new(['json_batch', '.txt']).tap do |f|
      f.write('JSON serialization test')
      f.close
    end
    paths = [file.path]

    config = Kreuzberg::Config::Extraction.new
    results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

    # Only checks that +to_json+ yields a non-empty String; the JSON is not parsed.
    expect(results.first).to respond_to(:to_json)
    json_str = results.first.to_json
    expect(json_str).to be_a(String)
    expect(json_str.length).to be > 0
  ensure
    file&.unlink
  end
end
597
+
598
describe 'batch with output and result formats' do
  # Each example extracts a single text file under a different format
  # configuration and checks that one Kreuzberg::Result comes back.

  it 'batch processes with output_format' do
    source = Tempfile.new(['format_test', '.txt'])
    source.write('Test content for output format')
    source.close
    paths = [source.path]

    config = Kreuzberg::Config::Extraction.new(output_format: 'markdown')
    results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

    expect(results).to be_an(Array)
    expect(results.length).to eq(1)
    expect(results[0]).to be_a(Kreuzberg::Result)

    paths.each { |tmp_path| FileUtils.rm_f(tmp_path) }
  end

  it 'batch processes with result_format' do
    source = Tempfile.new(['format_test', '.txt'])
    source.write('Test content for result format')
    source.close
    paths = [source.path]

    config = Kreuzberg::Config::Extraction.new(result_format: 'unified')
    results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

    expect(results).to be_an(Array)
    expect(results.length).to eq(1)
    expect(results[0]).to be_a(Kreuzberg::Result)

    paths.each { |tmp_path| FileUtils.rm_f(tmp_path) }
  end

  it 'batch processes with both output and result formats' do
    source = Tempfile.new(['format_test', '.txt'])
    source.write('Test content for both formats')
    source.close
    paths = [source.path]

    config = Kreuzberg::Config::Extraction.new(
      output_format: 'plain',
      result_format: 'element_based'
    )
    results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

    expect(results).to be_an(Array)
    expect(results.length).to eq(1)
    expect(results[0]).to be_a(Kreuzberg::Result)

    paths.each { |tmp_path| FileUtils.rm_f(tmp_path) }
  end

  it 'batch processes with chunking and output_format' do
    source = Tempfile.new(['format_test', '.txt'])
    source.write('Test content ' * 100)
    source.close
    paths = [source.path]

    # Chunking is passed as a plain Hash here rather than a Config::Chunking object.
    config = Kreuzberg::Config::Extraction.new(
      output_format: 'markdown',
      chunking: { max_chars: 1000 }
    )
    results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

    expect(results).to be_an(Array)
    expect(results.length).to eq(1)
    expect(results[0]).to be_a(Kreuzberg::Result)

    paths.each { |tmp_path| FileUtils.rm_f(tmp_path) }
  end
end
677
+ end