kreuzberg 4.3.5-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +1 -0
- data/.rubocop.yml +543 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +260 -0
- data/README.md +399 -0
- data/Rakefile +34 -0
- data/Steepfile +51 -0
- data/examples/async_patterns.rb +283 -0
- data/extconf.rb +60 -0
- data/kreuzberg.gemspec +253 -0
- data/lib/kreuzberg/api_proxy.rb +125 -0
- data/lib/kreuzberg/cache_api.rb +67 -0
- data/lib/kreuzberg/cli.rb +57 -0
- data/lib/kreuzberg/cli_proxy.rb +118 -0
- data/lib/kreuzberg/config.rb +1241 -0
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/document_structure.rb +204 -0
- data/lib/kreuzberg/error_context.rb +136 -0
- data/lib/kreuzberg/errors.rb +116 -0
- data/lib/kreuzberg/extraction_api.rb +329 -0
- data/lib/kreuzberg/mcp_proxy.rb +176 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
- data/lib/kreuzberg/post_processor_protocol.rb +15 -0
- data/lib/kreuzberg/result.rb +712 -0
- data/lib/kreuzberg/setup_lib_path.rb +99 -0
- data/lib/kreuzberg/types.rb +414 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +102 -0
- data/lib/kreuzberg_rb.so +0 -0
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +1337 -0
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +677 -0
- data/spec/binding/batch_spec.rb +360 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +85 -0
- data/spec/binding/cli_spec.rb +55 -0
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -0
- data/spec/binding/config_validation_spec.rb +377 -0
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -0
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +732 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1253 -0
- data/spec/binding/pages_extraction_spec.rb +550 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +273 -0
- data/spec/binding/tables_spec.rb +650 -0
- data/spec/fixtures/config.toml +38 -0
- data/spec/fixtures/config.yaml +41 -0
- data/spec/fixtures/invalid_config.toml +3 -0
- data/spec/serialization_spec.rb +134 -0
- data/spec/smoke/package_spec.rb +177 -0
- data/spec/spec_helper.rb +40 -0
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +434 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- metadata +292 -0
|
@@ -0,0 +1,677 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'spec_helper'
|
|
4
|
+
require 'tempfile'
|
|
5
|
+
require 'fileutils'
|
|
6
|
+
require 'securerandom'
|
|
7
|
+
|
|
8
|
+
RSpec.describe 'Batch Operations' do
|
|
9
|
+
describe 'batch_extract_files with multiple file types' do
|
|
10
|
+
it 'processes mixed file types in single batch' do
  # Extracts one plain-text and one Markdown file in a single batch call and
  # checks that every input yields a non-empty Kreuzberg::Result.
  tempfiles = []
  fixtures = {
    '.txt' => 'Text file content: Machine learning transforms technology.',
    # Double-quoted so "\n" becomes real newlines; a single-quoted literal
    # would write a backslash followed by "n" into the Markdown file.
    '.md' => "# Markdown Header\n\nContent about artificial intelligence."
  }

  fixtures.each do |extension, body|
    file = Tempfile.new(['batch_test', extension])
    tempfiles << file
    file.write(body)
    file.close
  end
  paths = tempfiles.map(&:path)

  config = Kreuzberg::Config::Extraction.new
  results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

  expect(results).to be_a(Array)
  expect(results.length).to eq(2)
  results.each do |result|
    expect(result).to be_a(Kreuzberg::Result)
    expect(result.content).not_to be_empty
  end
ensure
  # Runs even when an expectation above fails, so fixtures never linger.
  tempfiles.each(&:unlink)
end
|
|
37
|
+
|
|
38
|
+
it 'maintains file order through batch processing' do
  # Writes a random marker into each input and checks that results[i]
  # contains the marker that was written to paths[i].
  tempfiles = []
  unique_markers = []

  3.times do |i|
    marker = "MARKER_#{SecureRandom.hex(4)}"
    file = Tempfile.new(["ordered_#{i}", '.txt'])
    # Keep the Tempfile itself referenced: once it is garbage-collected its
    # finalizer deletes the file, which could happen before extraction runs.
    tempfiles << file
    file.write("File #{i}: #{marker}")
    file.close
    unique_markers << marker
  end
  paths = tempfiles.map(&:path)

  config = Kreuzberg::Config::Extraction.new
  results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

  expect(results.length).to eq(paths.length)
  results.each_with_index do |result, idx|
    expect(result.content).to include(unique_markers[idx])
  end
ensure
  # Clean up even when an expectation fails.
  tempfiles.each(&:unlink)
end
|
|
61
|
+
|
|
62
|
+
it 'processes large batch operations efficiently' do
  # Sends 20 small text files through one batch call and checks that one
  # Kreuzberg::Result comes back per input.
  tempfiles = []

  20.times do |i|
    file = Tempfile.new(["large_batch_#{i}", '.txt'])
    # Retain the Tempfile so it cannot be garbage-collected (and its file
    # deleted) while the remaining fixtures are still being created.
    tempfiles << file
    file.write("Content #{i}: Machine learning technology")
    file.close
  end
  paths = tempfiles.map(&:path)

  config = Kreuzberg::Config::Extraction.new
  results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

  expect(results.length).to eq(20)
  expect(results).to all(be_a(Kreuzberg::Result))
ensure
  # Clean up even when an expectation fails.
  tempfiles.each(&:unlink)
end
|
|
81
|
+
|
|
82
|
+
it 'handles batch with different file sizes' do
  # Batches three text files of increasing size and checks that the largest
  # input does not yield shorter content than the smallest one.
  samples = [
    ['small', 'AI'],
    ['medium', 'Machine learning is a subset of artificial intelligence.'],
    ['large', 'Machine learning ' * 100]
  ]

  files = samples.map do |prefix, text|
    handle = Tempfile.new([prefix, '.txt'])
    handle.write(text)
    handle.close
    handle
  end
  paths = files.map(&:path)

  results = Kreuzberg.batch_extract_files_sync(
    paths: paths,
    config: Kreuzberg::Config::Extraction.new
  )

  expect(results.length).to eq(3)
  expect(results).to all(be_a(Kreuzberg::Result))
  smallest_length = results[0].content.length
  expect(results[2].content.length).to be >= smallest_length

  FileUtils.rm_f(paths)
end
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
describe 'batch extraction with configuration options' do
|
|
115
|
+
it 'applies consistent configuration across batch' do
  # Runs a three-file batch with a YAKE keyword configuration and checks
  # that every input still produces a result with content.
  tempfiles = []

  3.times do |i|
    file = Tempfile.new(["config_batch_#{i}", '.txt'])
    # Retain the Tempfile: a garbage-collected Tempfile deletes its file.
    tempfiles << file
    file.write("Machine learning content #{i}. Artificial intelligence advances.")
    file.close
  end
  paths = tempfiles.map(&:path)

  config = Kreuzberg::Config::Extraction.new(
    keywords: Kreuzberg::Config::Keywords.new(
      algorithm: 'yake',
      max_keywords: 5
    )
  )

  results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

  expect(results.length).to eq(3)
  results.each do |result|
    expect(result).to be_a(Kreuzberg::Result)
    expect(result.content).not_to be_nil
  end
ensure
  # Clean up even when an expectation fails.
  tempfiles.each(&:unlink)
end
|
|
142
|
+
|
|
143
|
+
it 'batch respects caching configuration' do
  # Extracts the same file with use_cache off and then on; both runs must
  # return exactly one result, and the two contents must match.
  cache_fixture = Tempfile.new(['cache_test', '.txt'])
  cache_fixture.write('Cache test content')
  cache_fixture.close

  uncached = Kreuzberg.batch_extract_files_sync(
    paths: [cache_fixture.path],
    config: Kreuzberg::Config::Extraction.new(use_cache: false)
  )
  cached = Kreuzberg.batch_extract_files_sync(
    paths: [cache_fixture.path],
    config: Kreuzberg::Config::Extraction.new(use_cache: true)
  )

  expect(uncached.length).to eq(1)
  expect(cached.length).to eq(1)
  expect(uncached[0].content).to eq(cached[0].content)

  FileUtils.rm_f(cache_fixture.path)
end
|
|
161
|
+
|
|
162
|
+
it 'supports keyword extraction configuration in batch' do
  # Runs the same two-file batch once per keyword algorithm ('yake', 'rake')
  # and checks the result count each time.
  tempfiles = []

  2.times do |i|
    file = Tempfile.new(["keywords_batch_#{i}", '.txt'])
    # Retain the Tempfile: the files are read twice below, and a
    # garbage-collected Tempfile would delete its file in between.
    tempfiles << file
    file.write('Machine learning and deep learning enable artificial intelligence.')
    file.close
  end
  paths = tempfiles.map(&:path)

  %w[yake rake].each do |algo|
    config = Kreuzberg::Config::Extraction.new(
      keywords: Kreuzberg::Config::Keywords.new(
        algorithm: algo,
        max_keywords: 5
      )
    )

    results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
    expect(results.length).to eq(2)
  end
ensure
  # Clean up even when an expectation fails.
  tempfiles.each(&:unlink)
end
|
|
188
|
+
end
|
|
189
|
+
|
|
190
|
+
describe 'batch error handling and resilience' do
|
|
191
|
+
it 'processes batch with some invalid paths gracefully' do
  # Extracts a single valid text file through the batch API.
  # NOTE(review): despite the example name, no invalid path is passed here;
  # confirm whether a missing-file case was meant to be covered.
  valid_file = Tempfile.new(['valid_batch', '.txt'])
  valid_file.write('Valid content')
  valid_file.close
  valid_path = valid_file.path

  results = Kreuzberg.batch_extract_files_sync(
    paths: [valid_path],
    config: Kreuzberg::Config::Extraction.new
  )

  expect(results.length).to eq(1)
  expect(results.first).to be_a(Kreuzberg::Result)

  FileUtils.rm_f(valid_path)
end
|
|
207
|
+
|
|
208
|
+
it 'handles empty file list in batch' do
  # An empty list of paths is expected to produce an empty Array.
  results = Kreuzberg.batch_extract_files_sync(
    paths: [],
    config: Kreuzberg::Config::Extraction.new
  )

  expect(results).to be_an(Array)
  expect(results).to be_empty
end
|
|
215
|
+
|
|
216
|
+
it 'processes batch with single file' do
  # A one-element batch must return exactly one Kreuzberg::Result.
  lone_file = Tempfile.new(['single_batch', '.txt'])
  lone_file.write('Single file batch processing')
  lone_file.close

  results = Kreuzberg.batch_extract_files_sync(
    paths: [lone_file.path],
    config: Kreuzberg::Config::Extraction.new
  )

  expect(results.length).to eq(1)
  expect(results.first).to be_a(Kreuzberg::Result)

  FileUtils.rm_f(lone_file.path)
end
|
|
230
|
+
|
|
231
|
+
it 'maintains batch execution on partial failures' do
  # Extracts one valid file and checks that the batch returns an Array made
  # up only of Kreuzberg::Result objects.
  # NOTE(review): no failing input is supplied, so a partial failure is not
  # actually exercised here; confirm the intended scenario.
  good_file = Tempfile.new(['valid', '.txt'])
  good_file.write('Valid content')
  good_file.close

  results = Kreuzberg.batch_extract_files_sync(
    paths: [good_file.path],
    config: Kreuzberg::Config::Extraction.new
  )

  expect(results).to be_an(Array)
  expect(results).to all(be_a(Kreuzberg::Result))

  FileUtils.rm_f(good_file.path)
end
|
|
245
|
+
end
|
|
246
|
+
|
|
247
|
+
describe 'batch enumerable processing' do
|
|
248
|
+
it 'iterates over batch results with each' do
  # Walks the batch output with #each and checks that it yields one
  # Kreuzberg::Result per input file.
  tempfiles = []

  3.times do |i|
    file = Tempfile.new(["enum_#{i}", '.txt'])
    # Retain the Tempfile: a garbage-collected Tempfile deletes its file.
    tempfiles << file
    file.write("Enumerable test #{i}")
    file.close
  end
  paths = tempfiles.map(&:path)

  config = Kreuzberg::Config::Extraction.new
  results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

  # Counted by hand on purpose: the example is about #each itself.
  count = 0
  results.each do |result|
    expect(result).to be_a(Kreuzberg::Result)
    count += 1
  end

  expect(count).to eq(3)
ensure
  # Clean up even when an expectation fails.
  tempfiles.each(&:unlink)
end
|
|
271
|
+
|
|
272
|
+
it 'maps batch results to extract content' do
  # Maps the batch output to its content strings and checks the count and
  # type of the mapped values.
  tempfiles = []

  3.times do |i|
    file = Tempfile.new(["map_#{i}", '.txt'])
    # Retain the Tempfile: a garbage-collected Tempfile deletes its file.
    tempfiles << file
    file.write("Mapping #{i}")
    file.close
  end
  paths = tempfiles.map(&:path)

  config = Kreuzberg::Config::Extraction.new
  results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

  contents = results.map(&:content)
  expect(contents).to be_a(Array)
  expect(contents.length).to eq(3)
  expect(contents).to all(be_a(String))
ensure
  # Clean up even when an expectation fails.
  tempfiles.each(&:unlink)
end
|
|
292
|
+
|
|
293
|
+
it 'filters batch results by content length' do
  # Batches a one-character file and a 400-character file, then checks that
  # at least one result has content longer than 20 characters.
  tiny = Tempfile.new(['small', '.txt'])
  tiny.write('x')
  tiny.close

  big = Tempfile.new(['large', '.txt'])
  big.write('content ' * 50)
  big.close

  paths = [tiny.path, big.path]

  results = Kreuzberg.batch_extract_files_sync(
    paths: paths,
    config: Kreuzberg::Config::Extraction.new
  )

  long_enough = results.select { |entry| entry.content.length > 20 }
  expect(long_enough.length).to be >= 1

  FileUtils.rm_f(paths)
end
|
|
318
|
+
|
|
319
|
+
it 'reduces batch results to combined content' do
  # Folds the content of every result into one string and checks that the
  # text written to the fixtures shows up in it.
  tempfiles = []

  3.times do |i|
    file = Tempfile.new(["reduce_#{i}", '.txt'])
    # Retain the Tempfile: a garbage-collected Tempfile deletes its file.
    tempfiles << file
    file.write("Part #{i} ")
    file.close
  end
  paths = tempfiles.map(&:path)

  config = Kreuzberg::Config::Extraction.new
  results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

  # `+` builds a new string each step, so the frozen '' seed is never mutated.
  combined = results.reduce('') { |acc, r| acc + r.content }
  expect(combined).not_to be_empty
  expect(combined).to include('Part')
ensure
  # Clean up even when an expectation fails.
  tempfiles.each(&:unlink)
end
|
|
338
|
+
end
|
|
339
|
+
|
|
340
|
+
describe 'batch with chunking and embeddings' do
|
|
341
|
+
it 'processes batch with chunking enabled' do
  # Runs a two-file batch with chunking enabled (max_chars: 100) and checks
  # that one Kreuzberg::Result comes back per input.
  tempfiles = []

  2.times do |i|
    file = Tempfile.new(["chunking_batch_#{i}", '.txt'])
    # Retain the Tempfile: a garbage-collected Tempfile deletes its file.
    tempfiles << file
    file.write('Machine learning ' * 50)
    file.close
  end
  paths = tempfiles.map(&:path)

  config = Kreuzberg::Config::Extraction.new(
    chunking: Kreuzberg::Config::Chunking.new(
      enabled: true,
      max_chars: 100
    )
  )

  results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

  expect(results.length).to eq(2)
  expect(results).to all(be_a(Kreuzberg::Result))
ensure
  # Clean up even when an expectation fails.
  tempfiles.each(&:unlink)
end
|
|
365
|
+
|
|
366
|
+
it 'batch processes with embedding generation' do
  # Runs a two-file batch with chunking only; no embedding configuration is
  # passed, so embeddings themselves are not exercised by this example.
  tempfiles = []

  2.times do |i|
    file = Tempfile.new(["embedding_batch_#{i}", '.txt'])
    # Retain the Tempfile: a garbage-collected Tempfile deletes its file.
    tempfiles << file
    file.write('Artificial intelligence transforms technology development.')
    file.close
  end
  paths = tempfiles.map(&:path)

  # Use basic chunking without embeddings to avoid ONNX dependency
  config = Kreuzberg::Config::Extraction.new(
    chunking: Kreuzberg::Config::Chunking.new(
      enabled: true,
      max_chars: 100
    )
  )

  results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

  expect(results.length).to eq(2)
  expect(results).to all(be_a(Kreuzberg::Result))
ensure
  # Clean up even when an expectation fails.
  tempfiles.each(&:unlink)
end
|
|
391
|
+
end
|
|
392
|
+
|
|
393
|
+
describe 'batch result properties and validation' do
|
|
394
|
+
it 'each batch result has required properties' do
  # Checks that every batch result exposes #content and #mime_type and that
  # both return Strings.
  tempfiles = []

  2.times do |i|
    file = Tempfile.new(["props_#{i}", '.txt'])
    # Retain the Tempfile: a garbage-collected Tempfile deletes its file.
    tempfiles << file
    file.write("Result properties test #{i}")
    file.close
  end
  paths = tempfiles.map(&:path)

  config = Kreuzberg::Config::Extraction.new
  results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

  results.each do |result|
    expect(result).to respond_to(:content)
    expect(result).to respond_to(:mime_type)
    expect(result.content).to be_a(String)
    expect(result.mime_type).to be_a(String)
  end
ensure
  # Clean up even when an expectation fails.
  tempfiles.each(&:unlink)
end
|
|
416
|
+
|
|
417
|
+
it 'batch results maintain independence' do
  # Batches two files with different text and checks that each result
  # carries only the text of its own input.
  sources = [
    ['indep1', 'First file content'],
    ['indep2', 'Second file content']
  ].map do |prefix, text|
    handle = Tempfile.new([prefix, '.txt'])
    handle.write(text)
    handle.close
    handle
  end
  paths = sources.map(&:path)

  results = Kreuzberg.batch_extract_files_sync(
    paths: paths,
    config: Kreuzberg::Config::Extraction.new
  )

  first_content = results[0].content
  expect(first_content).not_to eq(results[1].content)
  expect(results[0].content).to include('First')
  expect(results[1].content).to include('Second')

  FileUtils.rm_f(paths)
end
|
|
439
|
+
|
|
440
|
+
it 'batch results have consistent structure' do
  # Compares the #to_h keys of every result with those of the first result.
  # Results that do not respond to #to_h are skipped, and nothing is
  # compared when the first result yields no keys.
  handles = (0...3).map do |i|
    handle = Tempfile.new(["struct_#{i}", '.txt'])
    handle.write("Structure test #{i}")
    handle.close
    handle
  end
  paths = handles.map(&:path)

  results = Kreuzberg.batch_extract_files_sync(
    paths: paths,
    config: Kreuzberg::Config::Extraction.new
  )

  reference = results.first
  expected_keys = []
  expected_keys = results.first.to_h.keys if reference.respond_to?(:to_h)

  results.each do |result|
    next unless result.respond_to?(:to_h)

    actual_keys = result.to_h.keys
    next unless expected_keys.any?

    expect(actual_keys).to match_array(expected_keys)
  end

  FileUtils.rm_f(paths)
end
|
|
466
|
+
end
|
|
467
|
+
|
|
468
|
+
describe 'batch performance characteristics' do
|
|
469
|
+
it 'completes batch faster than sequential processing' do
  # Times one batch call against per-file extract_file_sync calls over the
  # same five files; the batch may be at most one second slower.
  tempfiles = []

  5.times do |i|
    file = Tempfile.new(["perf_#{i}", '.txt'])
    # Retain the Tempfile: the files are read twice below, and a
    # garbage-collected Tempfile would delete its file in between.
    tempfiles << file
    file.write("Performance test #{i}")
    file.close
  end
  paths = tempfiles.map(&:path)

  config = Kreuzberg::Config::Extraction.new

  # Monotonic clock: unlike Time.now it is unaffected by wall-clock
  # adjustments, so the measured durations cannot go negative or jump.
  clock = -> { Process.clock_gettime(Process::CLOCK_MONOTONIC) }

  # Batch time
  batch_start = clock.call
  batch_results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
  batch_time = clock.call - batch_start

  # Sequential time
  seq_start = clock.call
  seq_results = paths.map { |p| Kreuzberg.extract_file_sync(path: p, config: config) }
  seq_time = clock.call - seq_start

  expect(batch_results.length).to eq(seq_results.length)
  # Batch should be faster or comparable
  expect(batch_time).to be <= seq_time + 1.0
ensure
  # Clean up even when an expectation fails.
  tempfiles.each(&:unlink)
end
|
|
497
|
+
end
|
|
498
|
+
|
|
499
|
+
describe 'batch with special configurations' do
|
|
500
|
+
it 'batch processes with language detection' do
  # Runs a one-file batch with language detection enabled and checks that a
  # single result is returned.
  sample = Tempfile.new(['lang_batch', '.txt'])
  sample.write('Machine learning is transforming industries worldwide.')
  sample.close
  paths = [sample.path]

  detection = Kreuzberg::Config::LanguageDetection.new(enabled: true)
  config = Kreuzberg::Config::Extraction.new(language_detection: detection)

  results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
  expect(results.length).to eq(1)

  FileUtils.rm_f(paths)
end
|
|
518
|
+
|
|
519
|
+
it 'batch with mixed keyword algorithms' do
  # Runs the same two files through a YAKE batch and then a RAKE batch,
  # checking the result count of each run.
  tempfiles = []

  2.times do |i|
    file = Tempfile.new(["mixed_algo_#{i}", '.txt'])
    # Retain the Tempfile: the files are read twice below, and a
    # garbage-collected Tempfile would delete its file in between.
    tempfiles << file
    file.write('Machine learning neural networks artificial intelligence')
    file.close
  end
  paths = tempfiles.map(&:path)

  # First batch with YAKE
  config_yake = Kreuzberg::Config::Extraction.new(
    keywords: Kreuzberg::Config::Keywords.new(algorithm: 'yake', max_keywords: 3)
  )
  results_yake = Kreuzberg.batch_extract_files_sync(paths: paths, config: config_yake)
  expect(results_yake.length).to eq(2)

  # Second batch with RAKE
  config_rake = Kreuzberg::Config::Extraction.new(
    keywords: Kreuzberg::Config::Keywords.new(algorithm: 'rake', max_keywords: 3)
  )
  results_rake = Kreuzberg.batch_extract_files_sync(paths: paths, config: config_rake)
  expect(results_rake.length).to eq(2)
ensure
  # Clean up even when an expectation fails.
  tempfiles.each(&:unlink)
end
|
|
545
|
+
end
|
|
546
|
+
|
|
547
|
+
describe 'batch with result aggregation' do
|
|
548
|
+
it 'aggregates batch results into statistics' do
  # Builds summary statistics (file count, total and average content
  # length, distinct MIME types) from a three-file batch.
  tempfiles = []

  3.times do |i|
    file = Tempfile.new(["stats_#{i}", '.txt'])
    # Retain the Tempfile: a garbage-collected Tempfile deletes its file.
    tempfiles << file
    file.write("Content #{i} " * 10)
    file.close
  end
  paths = tempfiles.map(&:path)

  config = Kreuzberg::Config::Extraction.new
  results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

  # Sum once and reuse it for both the total and the average.
  total_length = results.sum { |r| r.content.length }

  # Create aggregated statistics
  stats = {
    total_files: results.length,
    total_content_length: total_length,
    # Integer division: the average is truncated, as before.
    avg_content_length: total_length / results.length,
    mime_types: results.map(&:mime_type).uniq
  }

  expect(stats[:total_files]).to eq(3)
  expect(stats[:total_content_length]).to be > 0
  expect(stats[:avg_content_length]).to be > 0
  expect(stats[:mime_types]).to be_a(Array)
ensure
  # Clean up even when an expectation fails.
  tempfiles.each(&:unlink)
end
|
|
576
|
+
|
|
577
|
+
it 'batch results support JSON serialization' do
  # Checks that the first batch result responds to #to_json and that the
  # call returns a non-empty String.
  sample = Tempfile.new(['json_batch', '.txt'])
  sample.write('JSON serialization test')
  sample.close
  paths = [sample.path]

  results = Kreuzberg.batch_extract_files_sync(
    paths: paths,
    config: Kreuzberg::Config::Extraction.new
  )

  expect(results.first).to respond_to(:to_json)
  serialized = results.first.to_json
  expect(serialized).to be_a(String)
  expect(serialized.length).to be > 0

  FileUtils.rm_f(paths)
end
|
|
596
|
+
end
|
|
597
|
+
|
|
598
|
+
describe 'batch with output and result formats' do
|
|
599
|
+
it 'batch processes with output_format' do
  # Runs a one-file batch with output_format 'markdown' and checks that a
  # single Kreuzberg::Result is returned.
  source = Tempfile.new(['format_test', '.txt'])
  source.write('Test content for output format')
  source.close
  paths = [source.path]

  markdown_config = Kreuzberg::Config::Extraction.new(output_format: 'markdown')
  results = Kreuzberg.batch_extract_files_sync(paths: paths, config: markdown_config)

  expect(results).to be_an(Array)
  expect(results.length).to eq(1)
  expect(results.first).to be_a(Kreuzberg::Result)

  FileUtils.rm_f(paths)
end
|
|
616
|
+
|
|
617
|
+
it 'batch processes with result_format' do
  # Runs a one-file batch with result_format 'unified' and checks that a
  # single Kreuzberg::Result is returned.
  source = Tempfile.new(['format_test', '.txt'])
  source.write('Test content for result format')
  source.close
  paths = [source.path]

  unified_config = Kreuzberg::Config::Extraction.new(result_format: 'unified')
  results = Kreuzberg.batch_extract_files_sync(paths: paths, config: unified_config)

  expect(results).to be_an(Array)
  expect(results.length).to eq(1)
  expect(results.first).to be_a(Kreuzberg::Result)

  FileUtils.rm_f(paths)
end
|
|
634
|
+
|
|
635
|
+
it 'batch processes with both output and result formats' do
  # Combines output_format 'plain' with result_format 'element_based' in a
  # one-file batch and checks that a single Kreuzberg::Result is returned.
  source = Tempfile.new(['format_test', '.txt'])
  source.write('Test content for both formats')
  source.close
  paths = [source.path]

  format_options = { output_format: 'plain', result_format: 'element_based' }
  config = Kreuzberg::Config::Extraction.new(**format_options)
  results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

  expect(results).to be_an(Array)
  expect(results.length).to eq(1)
  expect(results.first).to be_a(Kreuzberg::Result)

  FileUtils.rm_f(paths)
end
|
|
655
|
+
|
|
656
|
+
it 'batch processes with chunking and output_format' do
  # Combines output_format 'markdown' with a chunking hash (max_chars: 1000)
  # in a one-file batch and checks that a single Kreuzberg::Result is returned.
  source = Tempfile.new(['format_test', '.txt'])
  source.write('Test content ' * 100)
  source.close
  paths = [source.path]

  chunking_options = { max_chars: 1000 }
  config = Kreuzberg::Config::Extraction.new(
    output_format: 'markdown',
    chunking: chunking_options
  )
  results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

  expect(results).to be_an(Array)
  expect(results.length).to eq(1)
  expect(results.first).to be_a(Kreuzberg::Result)

  FileUtils.rm_f(paths)
end
|
|
676
|
+
end
|
|
677
|
+
end
|