kreuzberg 4.3.5-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +1 -0
- data/.rubocop.yml +543 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +260 -0
- data/README.md +399 -0
- data/Rakefile +34 -0
- data/Steepfile +51 -0
- data/examples/async_patterns.rb +283 -0
- data/extconf.rb +60 -0
- data/kreuzberg.gemspec +253 -0
- data/lib/kreuzberg/api_proxy.rb +125 -0
- data/lib/kreuzberg/cache_api.rb +67 -0
- data/lib/kreuzberg/cli.rb +57 -0
- data/lib/kreuzberg/cli_proxy.rb +118 -0
- data/lib/kreuzberg/config.rb +1241 -0
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/document_structure.rb +204 -0
- data/lib/kreuzberg/error_context.rb +136 -0
- data/lib/kreuzberg/errors.rb +116 -0
- data/lib/kreuzberg/extraction_api.rb +329 -0
- data/lib/kreuzberg/mcp_proxy.rb +176 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
- data/lib/kreuzberg/post_processor_protocol.rb +15 -0
- data/lib/kreuzberg/result.rb +712 -0
- data/lib/kreuzberg/setup_lib_path.rb +99 -0
- data/lib/kreuzberg/types.rb +414 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +102 -0
- data/lib/kreuzberg_rb.so +0 -0
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +1337 -0
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +677 -0
- data/spec/binding/batch_spec.rb +360 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +85 -0
- data/spec/binding/cli_spec.rb +55 -0
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -0
- data/spec/binding/config_validation_spec.rb +377 -0
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -0
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +732 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1253 -0
- data/spec/binding/pages_extraction_spec.rb +550 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +273 -0
- data/spec/binding/tables_spec.rb +650 -0
- data/spec/fixtures/config.toml +38 -0
- data/spec/fixtures/config.yaml +41 -0
- data/spec/fixtures/invalid_config.toml +3 -0
- data/spec/serialization_spec.rb +134 -0
- data/spec/smoke/package_spec.rb +177 -0
- data/spec/spec_helper.rb +40 -0
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +434 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- metadata +292 -0
|
@@ -0,0 +1,473 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'spec_helper'
|
|
4
|
+
require 'tempfile'
|
|
5
|
+
require 'fileutils'
|
|
6
|
+
|
|
7
|
+
RSpec.describe 'Async Operations' do
|
|
8
|
+
describe 'Fiber-based async extraction patterns' do
|
|
9
|
+
it 'extracts content asynchronously using Fiber' do
  # Run one plain-text extraction inside a Fiber. The Fiber never yields, so
  # its block value is what the single resume returns.
  extraction = Fiber.new do
    settings = Kreuzberg::Config::Extraction.new
    outcome = Kreuzberg.extract_bytes_sync(
      data: 'Machine learning transforms technology globally.',
      mime_type: 'text/plain',
      config: settings
    )

    expect(outcome).not_to be_nil
    expect(outcome.content).not_to be_nil
    outcome
  end

  returned = extraction.resume
  expect(returned).to be_a(Kreuzberg::Result)
end
|
|
23
|
+
|
|
24
|
+
it 'handles multiple concurrent Fibers with different configs' do
  # Builds an extraction config whose keyword settings use the given
  # algorithm, capped at five keywords.
  config_for = lambda do |algorithm|
    Kreuzberg::Config::Extraction.new(
      keywords: Kreuzberg::Config::Keywords.new(algorithm: algorithm, max_keywords: 5)
    )
  end
  extraction_configs = [config_for.call('yake'), config_for.call('rake')]

  sample = 'Artificial intelligence and machine learning drive innovation.'

  # One single-shot Fiber per config; nothing runs until the Fibers are resumed.
  workers = extraction_configs.map do |extraction_config|
    Fiber.new do
      Kreuzberg.extract_bytes_sync(data: sample, mime_type: 'text/plain', config: extraction_config)
    end
  end

  outcomes = workers.map { |worker| worker.resume }
  expect(outcomes.length).to eq(2)
  expect(outcomes).to all(be_a(Kreuzberg::Result))
end
|
|
46
|
+
|
|
47
|
+
it 'maintains context across Fiber yielding' do
  collected = []

  samples = [
    'Machine learning enables predictions.',
    'Deep learning powers neural networks.',
    'Data science transforms insights.'
  ]

  # The Fiber pauses after every extraction. `collected` is defined outside
  # the Fiber, so it keeps growing across resumes and is the final value.
  worker = Fiber.new do
    shared_config = Kreuzberg::Config::Extraction.new
    samples.each do |sample|
      extracted = Kreuzberg.extract_bytes_sync(data: sample, mime_type: 'text/plain', config: shared_config)
      collected << extracted
      Fiber.yield extracted
    end
    collected
  end

  # Three inputs mean three intermediate yields before the Fiber finishes.
  3.times { expect(worker.resume).to be_a(Kreuzberg::Result) }

  remaining = worker.resume
  expect(remaining).to be_a(Array)
  expect(remaining.length).to eq(3)
end
|
|
73
|
+
|
|
74
|
+
it 'handles Fiber with configuration updates' do
  sequence = Fiber.new do
    sample = 'Artificial intelligence transforms technology.'

    # Extracts `sample` with keyword settings for the given algorithm
    # (at most five keywords).
    run_with = lambda do |algorithm|
      keyword_settings = Kreuzberg::Config::Keywords.new(algorithm: algorithm, max_keywords: 5)
      settings = Kreuzberg::Config::Extraction.new(keywords: keyword_settings)
      Kreuzberg.extract_bytes_sync(data: sample, mime_type: 'text/plain', config: settings)
    end

    # The YAKE pass is handed out mid-Fiber; the RAKE pass is the Fiber's final value.
    Fiber.yield run_with.call('yake')
    run_with.call('rake')
  end

  yake_outcome = sequence.resume
  expect(yake_outcome).to be_a(Kreuzberg::Result)

  rake_outcome = sequence.resume
  expect(rake_outcome).to be_a(Kreuzberg::Result)
end
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
describe 'concurrent extraction operations' do
|
|
102
|
+
it 'processes multiple extractions sequentially with Fiber control' do
  # Keep the Tempfile objects referenced for the whole example. Holding only
  # the paths lets the objects be garbage collected, and a collected Tempfile
  # deletes its file, which would make the extractions below fail at random.
  files = Array.new(3) do |i|
    Tempfile.new(["concurrent_#{i}", '.txt']).tap do |file|
      file.write("Content #{i}: Machine learning and artificial intelligence")
      file.close
    end
  end
  paths = files.map(&:path)

  results = []
  # The Fiber extracts one file per resume and returns the accumulated
  # results once every path has been processed.
  fiber = Fiber.new do
    config = Kreuzberg::Config::Extraction.new
    paths.each do |path|
      result = Kreuzberg.extract_file_sync(path: path, config: config)
      results << result
      Fiber.yield result
    end
    results
  end

  paths.each do
    expect(fiber.resume).to be_a(Kreuzberg::Result)
  end

  final = fiber.resume
  expect(final.length).to eq(3)
ensure
  # Runs even when an expectation fails, so no temp files are left behind.
  files&.each(&:unlink)
end
|
|
131
|
+
|
|
132
|
+
it 'extracts files with different configurations in Fiber' do
  file = Tempfile.new(['fiber_test', '.txt'])
  file.write('Machine learning and neural networks enable AI advancement.')
  file.close

  # Default config, then YAKE (5 keywords), then RAKE (10 keywords).
  configs = [
    Kreuzberg::Config::Extraction.new,
    Kreuzberg::Config::Extraction.new(
      keywords: Kreuzberg::Config::Keywords.new(algorithm: 'yake', max_keywords: 5)
    ),
    Kreuzberg::Config::Extraction.new(
      keywords: Kreuzberg::Config::Keywords.new(algorithm: 'rake', max_keywords: 10)
    )
  ]

  # The same file is extracted once per config; each result is yielded as it
  # is produced and the full list is the Fiber's final value.
  fiber = Fiber.new do
    results = []
    configs.each do |config|
      result = Kreuzberg.extract_file_sync(path: file.path, config: config)
      results << result
      Fiber.yield result
    end
    results
  end

  configs.each do
    expect(fiber.resume).to be_a(Kreuzberg::Result)
  end

  final = fiber.resume
  expect(final.length).to eq(3)
ensure
  # Remove the temp file even when an expectation above fails.
  file&.unlink
end
|
|
166
|
+
|
|
167
|
+
it 'handles Fiber enumeration over extraction results' do
  samples = [
    'AI transforms industries',
    'Machine learning enables insights',
    'Data science drives decisions'
  ]

  # One single-shot Fiber per input; each builds its own default config.
  workers = samples.map do |sample|
    Fiber.new do
      settings = Kreuzberg::Config::Extraction.new
      Kreuzberg.extract_bytes_sync(data: sample, mime_type: 'text/plain', config: settings)
    end
  end

  outcomes = workers.map { |worker| worker.resume }
  expect(outcomes.length).to eq(3)
  expect(outcomes).to all(be_a(Kreuzberg::Result))
end
|
|
185
|
+
end
|
|
186
|
+
|
|
187
|
+
describe 'async error handling' do
|
|
188
|
+
it 'catches errors within Fiber context' do
  # A String is passed where the `chunking` option goes. The example expects
  # construction to raise ArgumentError, which the Fiber turns into a Hash.
  guarded = Fiber.new do
    Kreuzberg::Config::Extraction.new(chunking: 'invalid')
  rescue ArgumentError => rejection
    { error: true, message: rejection.message }
  end

  outcome = guarded.resume
  expect(outcome).to be_a(Hash)
  expect(outcome[:error]).to be true
end
|
|
200
|
+
|
|
201
|
+
it 'maintains Fiber execution on recoverable errors' do
  resilient = Fiber.new do
    log = []

    # Phase one: an Integer `ocr` value is expected to raise ArgumentError;
    # record the failure and keep going.
    begin
      Kreuzberg::Config::Extraction.new(ocr: 12_345)
    rescue ArgumentError
      log << 'first_error'
    end
    Fiber.yield log

    # Phase two: a normal extraction, appended to the same log.
    settings = Kreuzberg::Config::Extraction.new
    log << Kreuzberg.extract_bytes_sync(data: 'Machine learning.', mime_type: 'text/plain', config: settings)
    log
  end

  after_failure = resilient.resume
  expect(after_failure).to include('first_error')

  after_recovery = resilient.resume
  expect(after_recovery).to include('first_error')
  expect(after_recovery).to include(a_kind_of(Kreuzberg::Result))
end
|
|
228
|
+
|
|
229
|
+
it 'handles Fiber exception propagation' do
  # An exception raised inside the Fiber body surfaces from #resume.
  failing = Fiber.new { raise StandardError, 'Test error in Fiber' }

  expect { failing.resume }.to raise_error(StandardError, /Test error in Fiber/)
end
|
|
236
|
+
|
|
237
|
+
it 'recovers from async operation failures' do
  file = Tempfile.new(['error_recovery', '.txt'])
  file.write('Test content for recovery.')
  file.close

  fiber = Fiber.new do
    config = Kreuzberg::Config::Extraction.new

    # First try: yield either the extraction result or, if anything raises,
    # a Hash carrying the error message.
    begin
      result = Kreuzberg.extract_file_sync(path: file.path, config: config)
      Fiber.yield result
    rescue StandardError => e
      error_hash = { error: e.message }
      Fiber.yield error_hash
    end

    # Retry with `use_cache: false`; this result is the Fiber's final value.
    retry_config = Kreuzberg::Config::Extraction.new(use_cache: false)
    Kreuzberg.extract_file_sync(path: file.path, config: retry_config)
  end

  first = fiber.resume
  # Either outcome of the first attempt is acceptable here.
  expect(first).to be_a(Kreuzberg::Result).or be_a(Hash)

  second = fiber.resume
  expect(second).to be_a(Kreuzberg::Result)
ensure
  # Remove the temp file even when an expectation above fails.
  file&.unlink
end
|
|
267
|
+
end
|
|
268
|
+
|
|
269
|
+
describe 'async batch processing' do
|
|
270
|
+
it 'processes batch with Fiber-based control' do
  # Keep the Tempfile objects referenced for the whole example. Holding only
  # the paths lets the objects be garbage collected, and a collected Tempfile
  # deletes its file before the batch extraction can read it.
  files = Array.new(3) do |i|
    Tempfile.new(["batch_fiber_#{i}", '.txt']).tap do |file|
      file.write("Batch content #{i}")
      file.close
    end
  end
  paths = files.map(&:path)

  # The Fiber first reports how many results the batch produced, then
  # returns the results themselves.
  fiber = Fiber.new do
    config = Kreuzberg::Config::Extraction.new
    results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
    Fiber.yield results.length
    results
  end

  count = fiber.resume
  expect(count).to eq(3)

  results = fiber.resume
  expect(results.length).to eq(3)
  expect(results).to all(be_a(Kreuzberg::Result))
ensure
  # Runs even when an expectation fails, so no temp files are left behind.
  files&.each(&:unlink)
end
|
|
295
|
+
|
|
296
|
+
it 'manages batch extraction with progress tracking via Fiber' do
  # Keep the Tempfile objects referenced for the whole example. Holding only
  # the paths lets the objects be garbage collected, and a collected Tempfile
  # deletes its file before it can be extracted.
  files = Array.new(5) do |i|
    Tempfile.new(["progress_#{i}", '.txt']).tap do |file|
      file.write("Progress tracking #{i}")
      file.close
    end
  end
  paths = files.map(&:path)

  progress = []

  # After each file the Fiber records and yields the 1-based count of files
  # processed so far; the collected results are its final value.
  fiber = Fiber.new do
    config = Kreuzberg::Config::Extraction.new
    results = []

    paths.each_with_index do |path, idx|
      result = Kreuzberg.extract_file_sync(path: path, config: config)
      results << result
      progress << (idx + 1)
      Fiber.yield(idx + 1)
    end

    results
  end

  paths.length.times do
    progress_value = fiber.resume
    expect(progress_value).to be > 0
    expect(progress_value).to be <= paths.length
  end

  final = fiber.resume
  expect(final.length).to eq(paths.length)
  # The tracker must have seen every step exactly once, in order.
  expect(progress).to eq((1..paths.length).to_a)
ensure
  # Runs even when an expectation fails, so no temp files are left behind.
  files&.each(&:unlink)
end
|
|
332
|
+
|
|
333
|
+
it 'handles Fiber-based enumerable processing of batch results' do
  # Keep the Tempfile objects referenced for the whole example. Holding only
  # the paths lets the objects be garbage collected, and a collected Tempfile
  # deletes its file before the batch extraction can read it.
  files = Array.new(4) do |i|
    Tempfile.new(["enumerable_#{i}", '.txt']).tap do |file|
      file.write("Enumerable #{i}")
      file.close
    end
  end
  paths = files.map(&:path)

  config = Kreuzberg::Config::Extraction.new
  results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)

  # The batch is extracted up front; the Fiber only walks the results,
  # yielding a running count after each one.
  fiber = Fiber.new do
    processed = []
    results.each do |result|
      processed << result
      Fiber.yield processed.length
    end
    processed
  end

  results.length.times do
    count = fiber.resume
    expect(count).to be > 0
  end

  final = fiber.resume
  expect(final.length).to eq(results.length)
ensure
  # Runs even when an expectation fails, so no temp files are left behind.
  files&.each(&:unlink)
end
|
|
364
|
+
end
|
|
365
|
+
|
|
366
|
+
describe 'async performance and resource management' do
|
|
367
|
+
it 'yields control in Fiber during long extraction' do
  fiber = Fiber.new do
    config = Kreuzberg::Config::Extraction.new
    text = 'Machine learning ' * 100

    # Measure with the monotonic clock: wall-clock time (Time.now) can jump
    # backwards or forwards when the system clock is adjusted, which would
    # make the elapsed time unreliable.
    started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    result = Kreuzberg.extract_bytes_sync(data: text, mime_type: 'text/plain', config: config)
    duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at

    # Report the elapsed seconds first; the result is the Fiber's final value.
    Fiber.yield duration
    result
  end

  duration = fiber.resume
  expect(duration).to be > 0

  result = fiber.resume
  expect(result).to be_a(Kreuzberg::Result)
end
|
|
386
|
+
|
|
387
|
+
it 'maintains multiple Fiber contexts independently' do
  # Builds a single-shot Fiber that extracts `payload` using keyword settings
  # for the given algorithm.
  spawn_worker = lambda do |algorithm, payload|
    Fiber.new do
      settings = Kreuzberg::Config::Extraction.new(
        keywords: Kreuzberg::Config::Keywords.new(algorithm: algorithm)
      )
      Kreuzberg.extract_bytes_sync(data: payload, mime_type: 'text/plain', config: settings)
    end
  end

  yake_worker = spawn_worker.call('yake', 'Fiber 1 content')
  rake_worker = spawn_worker.call('rake', 'Fiber 2 content')

  yake_outcome = yake_worker.resume
  rake_outcome = rake_worker.resume

  # Each Fiber must return the content it was given, not the other's.
  expect(yake_outcome).to be_a(Kreuzberg::Result)
  expect(rake_outcome).to be_a(Kreuzberg::Result)
  expect(yake_outcome.content).to include('Fiber 1')
  expect(rake_outcome.content).to include('Fiber 2')
end
|
|
410
|
+
|
|
411
|
+
it 'handles Fiber cleanup and resource management' do
  # Filled in by the Fiber so the example can verify the cleanup afterwards.
  created_paths = []

  fiber = Fiber.new do
    # Keep the Tempfile objects referenced until the ensure block runs.
    # Holding only the paths lets the objects be garbage collected, and a
    # collected Tempfile deletes its file before it can be extracted.
    files = Array.new(3) do |i|
      Tempfile.new(["cleanup_#{i}", '.txt']).tap do |file|
        file.write("Cleanup test #{i}")
        file.close
      end
    end
    paths = files.map(&:path)
    created_paths.concat(paths)

    begin
      config = Kreuzberg::Config::Extraction.new
      results = paths.map { |p| Kreuzberg.extract_file_sync(path: p, config: config) }
      Fiber.yield results.length
      results
    ensure
      # Runs when the Fiber is resumed past the yield and its body finishes.
      files.each(&:unlink)
    end
  end

  count = fiber.resume
  expect(count).to eq(3)

  results = fiber.resume
  expect(results.length).to eq(3)

  # The Fiber has finished, so its ensure block must have removed every file.
  expect(created_paths.length).to eq(3)
  expect(created_paths.none? { |path| File.exist?(path) }).to be true
end
|
|
437
|
+
end
|
|
438
|
+
|
|
439
|
+
describe 'async with configuration variations' do
|
|
440
|
+
it 'applies config changes in Fiber sequence' do
  staged = Fiber.new do
    sample = 'Machine learning transforms technology research and development.'

    # Extracts `sample` with the supplied extraction config.
    extract = lambda do |settings|
      Kreuzberg.extract_bytes_sync(data: sample, mime_type: 'text/plain', config: settings)
    end
    # Builds an extraction config with keyword settings for one algorithm.
    keyword_settings = lambda do |algorithm, limit|
      Kreuzberg::Config::Extraction.new(
        keywords: Kreuzberg::Config::Keywords.new(algorithm: algorithm, max_keywords: limit)
      )
    end

    # Stage 1: default config, no keyword settings.
    plain = extract.call(Kreuzberg::Config::Extraction.new)
    Fiber.yield plain.content.length

    # Stage 2: YAKE, up to 5 keywords.
    with_yake = extract.call(keyword_settings.call('yake', 5))
    Fiber.yield with_yake.content.length

    # Stage 3: RAKE, up to 10 keywords; this result is the Fiber's final value.
    extract.call(keyword_settings.call('rake', 10))
  end

  plain_length = staged.resume
  yake_length = staged.resume
  rake_outcome = staged.resume

  expect(plain_length).to be > 0
  expect(yake_length).to be > 0
  expect(rake_outcome).to be_a(Kreuzberg::Result)
end
|
|
472
|
+
end
|
|
473
|
+
end
|