kreuzberg 4.3.5-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +1 -0
- data/.rubocop.yml +543 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +260 -0
- data/README.md +399 -0
- data/Rakefile +34 -0
- data/Steepfile +51 -0
- data/examples/async_patterns.rb +283 -0
- data/extconf.rb +60 -0
- data/kreuzberg.gemspec +253 -0
- data/lib/kreuzberg/api_proxy.rb +125 -0
- data/lib/kreuzberg/cache_api.rb +67 -0
- data/lib/kreuzberg/cli.rb +57 -0
- data/lib/kreuzberg/cli_proxy.rb +118 -0
- data/lib/kreuzberg/config.rb +1241 -0
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/document_structure.rb +204 -0
- data/lib/kreuzberg/error_context.rb +136 -0
- data/lib/kreuzberg/errors.rb +116 -0
- data/lib/kreuzberg/extraction_api.rb +329 -0
- data/lib/kreuzberg/mcp_proxy.rb +176 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
- data/lib/kreuzberg/post_processor_protocol.rb +15 -0
- data/lib/kreuzberg/result.rb +712 -0
- data/lib/kreuzberg/setup_lib_path.rb +99 -0
- data/lib/kreuzberg/types.rb +414 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +102 -0
- data/lib/kreuzberg_rb.so +0 -0
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +1337 -0
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +677 -0
- data/spec/binding/batch_spec.rb +360 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +85 -0
- data/spec/binding/cli_spec.rb +55 -0
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -0
- data/spec/binding/config_validation_spec.rb +377 -0
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -0
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +732 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1253 -0
- data/spec/binding/pages_extraction_spec.rb +550 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +273 -0
- data/spec/binding/tables_spec.rb +650 -0
- data/spec/fixtures/config.toml +38 -0
- data/spec/fixtures/config.yaml +41 -0
- data/spec/fixtures/invalid_config.toml +3 -0
- data/spec/serialization_spec.rb +134 -0
- data/spec/smoke/package_spec.rb +177 -0
- data/spec/spec_helper.rb +40 -0
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +434 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- metadata +292 -0
|
@@ -0,0 +1,360 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'spec_helper'
|
|
4
|
+
require 'tempfile'
|
|
5
|
+
require 'fileutils'
|
|
6
|
+
require 'securerandom'
|
|
7
|
+
|
|
8
|
+
# Specs for the batch extraction entry points on the Kreuzberg module:
# batch_extract_files_sync, batch_extract_files, batch_extract_bytes_sync and
# batch_extract_bytes.
#
# File fixtures are written into a directory created by the block form of
# Dir.mktmpdir, which removes the directory when the block exits (also when an
# expectation fails). Earlier revisions created fixtures with Tempfile.new and
# kept only the path; a Tempfile unlinks its file once the object is garbage
# collected, so those fixtures could disappear before extraction ran.
RSpec.describe Kreuzberg do
  # Writes `count` files named "<prefix>_<index><extension>" into `dir`.
  #
  # @param dir [String] directory that receives the files
  # @param prefix [String] file name prefix
  # @param extension [String] file extension including the leading dot
  # @param count [Integer] number of files to create
  # @yieldparam index [Integer] zero-based index of the file being written
  # @yieldreturn [String] content for that file
  # @return [Array<String>] the created paths, in index order
  def write_numbered_files(dir, prefix, extension, count)
    Array.new(count) do |index|
      File.join(dir, "#{prefix}_#{index}#{extension}").tap do |path|
        File.write(path, yield(index))
      end
    end
  end

  describe '#batch_extract_files_sync' do
    it 'extracts multiple files in a single batch operation' do
      Dir.mktmpdir do |dir|
        paths = write_numbered_files(dir, 'batch_test', '.md', 3) do |i|
          "# Content of file #{i}\n\nSome markdown content"
        end

        results = described_class.batch_extract_files_sync(paths: paths)

        expect(results).to be_a(Array)
        expect(results.length).to eq(3)
        results.each do |result|
          expect(result).to be_a(Kreuzberg::Result)
          expect(result.content).not_to be_empty
        end
      end
    end

    it 'maintains correct order of results' do
      Dir.mktmpdir do |dir|
        # A random marker per file lets each result be matched to its input.
        unique_ids = Array.new(3) { SecureRandom.hex(8) }
        paths = write_numbered_files(dir, 'ordered', '.md', 3) do |i|
          "# File #{i}\n\nUnique marker: #{unique_ids[i]}\n\nSome content"
        end

        results = described_class.batch_extract_files_sync(paths: paths)

        expect(results.length).to eq(paths.length)
        results.each_with_index do |result, idx|
          expect(result.content).to include(unique_ids[idx])
        end
      end
    end

    it 'handles empty file list gracefully' do
      results = described_class.batch_extract_files_sync(paths: [])

      expect(results).to be_a(Array)
      expect(results).to be_empty
    end

    it 'handles batch operations with configuration' do
      Dir.mktmpdir do |dir|
        paths = write_numbered_files(dir, 'config_batch', '.txt', 2) do |i|
          "Config test content #{i}"
        end
        config = Kreuzberg::Config::Extraction.new(use_cache: false)

        results = described_class.batch_extract_files_sync(paths: paths, config: config)

        expect(results).to be_a(Array)
        expect(results.length).to eq(2)
        expect(results).to all(be_a(Kreuzberg::Result))
      end
    end

    it 'returns independent result objects' do
      Dir.mktmpdir do |dir|
        paths = write_numbered_files(dir, 'independent', '.txt', 2) do |i|
          "Independent content #{i}"
        end

        results = described_class.batch_extract_files_sync(paths: paths)

        # Same file type, different content: only the content may differ.
        expect(results[0].content).not_to eq(results[1].content)
        expect(results[0].mime_type).to eq(results[1].mime_type)
      end
    end

    it 'extracts different file types in batch' do
      Dir.mktmpdir do |dir|
        fixtures = {
          'test.txt' => 'Text content',
          'test.csv' => "Name,Value\nAlice,1\nBob,2",
          'test.json' => '{"key": "value"}'
        }
        paths = fixtures.map do |name, body|
          File.join(dir, name).tap { |path| File.write(path, body) }
        end

        results = described_class.batch_extract_files_sync(paths: paths)

        expect(results.length).to eq(3)
        results.each do |result|
          expect(result.mime_type).not_to be_nil
          expect(result.content).not_to be_empty
        end
      end
    end
  end

  describe '#batch_extract_files' do
    it 'extracts multiple files asynchronously' do
      Dir.mktmpdir do |dir|
        paths = write_numbered_files(dir, 'async_batch', '.txt', 3) do |i|
          "Async content #{i}"
        end

        results = described_class.batch_extract_files(paths: paths)

        expect(results).to be_a(Array)
        expect(results.length).to eq(3)
        expect(results).to all(be_a(Kreuzberg::Result))
      end
    end

    it 'handles async batch with configuration' do
      Dir.mktmpdir do |dir|
        paths = write_numbered_files(dir, 'async_config', '.txt', 2) do |i|
          "Async config #{i}"
        end
        config = Kreuzberg::Config::Extraction.new(use_cache: false)

        results = described_class.batch_extract_files(paths: paths, config: config)

        expect(results.length).to eq(2)
        results.each { |result| expect(result.content).not_to be_empty }
      end
    end
  end

  describe '#batch_extract_bytes_sync' do
    it 'extracts multiple byte sources in batch' do
      data = [
        'First content',
        'Second content',
        '{"json": true}'
      ]
      mime_types = [
        'text/plain',
        'text/plain',
        'application/json'
      ]

      results = described_class.batch_extract_bytes_sync(data_array: data, mime_types: mime_types)

      expect(results).to be_a(Array)
      expect(results.length).to eq(3)
      expect(results).to all(be_a(Kreuzberg::Result))
    end

    it 'maintains order for batch byte operations' do
      data = ['Content A', 'Content B', 'Content C']
      mime_types = ['text/plain'] * 3

      results = described_class.batch_extract_bytes_sync(data_array: data, mime_types: mime_types)

      expect(results.length).to eq(3)
      results.each_with_index do |result, idx|
        expect(result.content).to include(data[idx])
      end
    end

    it 'handles empty byte list' do
      results = described_class.batch_extract_bytes_sync(data_array: [], mime_types: [])

      expect(results).to be_a(Array)
      expect(results).to be_empty
    end

    it 'applies configuration to byte batch operations' do
      data = ['Batch bytes 1', 'Batch bytes 2']
      mime_types = ['text/plain'] * 2
      config = Kreuzberg::Config::Extraction.new(use_cache: false)

      results = described_class.batch_extract_bytes_sync(data_array: data, mime_types: mime_types, config: config)

      expect(results.length).to eq(2)
      results.each { |result| expect(result.mime_type).to eq('text/plain') }
    end
  end

  describe '#batch_extract_bytes' do
    it 'extracts multiple bytes asynchronously' do
      data = ['Async bytes 1', 'Async bytes 2']
      mime_types = ['text/plain'] * 2

      results = described_class.batch_extract_bytes(data_array: data, mime_types: mime_types)

      expect(results).to be_a(Array)
      expect(results.length).to eq(2)
      expect(results).to all(be_a(Kreuzberg::Result))
    end

    it 'handles async byte batch with configuration' do
      data = ['Config async 1', 'Config async 2']
      mime_types = ['text/plain'] * 2
      config = Kreuzberg::Config::Extraction.new(use_cache: false)

      results = described_class.batch_extract_bytes(data_array: data, mime_types: mime_types, config: config)

      expect(results.length).to eq(2)
      results.each { |result| expect(result.content).not_to be_empty }
    end
  end

  describe 'batch performance characteristics' do
    it 'processes batch operations efficiently' do
      file_count = 5

      Dir.mktmpdir do |dir|
        paths = write_numbered_files(dir, 'perf_test', '.txt', file_count) do |i|
          "Performance test content #{i}"
        end

        # The monotonic clock is unaffected by wall-clock adjustments, so the
        # measured duration cannot go negative or jump.
        started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
        results = described_class.batch_extract_files_sync(paths: paths)
        batch_duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at

        expect(results.length).to eq(file_count)
        expect(results).to all(be_a(Kreuzberg::Result))

        # Generous upper bound (seconds): guards against hangs, not a benchmark.
        expect(batch_duration).to be < 60
      end
    end

    it 'batch results match sequential results' do
      Dir.mktmpdir do |dir|
        paths = write_numbered_files(dir, 'compare', '.txt', 3) do |i|
          "Comparison content #{i}"
        end

        batch_results = described_class.batch_extract_files_sync(paths: paths)
        sequential_results = paths.map { |path| described_class.extract_file_sync(path: path) }

        expect(batch_results.length).to eq(sequential_results.length)
        batch_results.zip(sequential_results).each do |batch_result, seq_result|
          expect(batch_result.content).to eq(seq_result.content)
          expect(batch_result.mime_type).to eq(seq_result.mime_type)
        end
      end
    end
  end

  describe 'batch error handling' do
    it 'raises IOError for missing files in batch' do
      paths = [
        '/nonexistent/file1.txt',
        '/nonexistent/file2.txt'
      ]

      expect do
        described_class.batch_extract_files_sync(paths: paths)
      end.to raise_error(Kreuzberg::Errors::IOError, /not found/)
    end

    it 'raises IOError when batch contains invalid paths' do
      Dir.mktmpdir do |dir|
        valid_path = File.join(dir, 'valid.txt')
        File.write(valid_path, 'Valid content')
        paths = [valid_path, '/nonexistent/invalid.txt']

        expect do
          described_class.batch_extract_files_sync(paths: paths)
        end.to raise_error(Kreuzberg::Errors::IOError, /not found/)
      end
    end

    # The description used to read "raises error ..." while the expectation
    # asserted the opposite; the description now matches what is asserted.
    # NOTE(review): how the unsupported MIME type is reported on the returned
    # results is not checked here - confirm and assert it if it matters.
    it 'does not raise on invalid mime type in byte batch' do
      data = ['Content']
      mime_types = ['invalid/mime/type']

      expect do
        described_class.batch_extract_bytes_sync(data_array: data, mime_types: mime_types)
      end.not_to raise_error
    end
  end

  describe 'batch caching behavior' do
    it 'respects cache configuration in batch' do
      Dir.mktmpdir do |dir|
        paths = write_numbered_files(dir, 'cache_test', '.txt', 2) do |i|
          "Cache test #{i}"
        end
        config_no_cache = Kreuzberg::Config::Extraction.new(use_cache: false)

        # Two uncached runs over the same inputs must produce the same content.
        results1 = described_class.batch_extract_files_sync(paths: paths, config: config_no_cache)
        results2 = described_class.batch_extract_files_sync(paths: paths, config: config_no_cache)

        expect(results1.length).to eq(results2.length)
        results1.each_with_index do |result, idx|
          expect(result.content).to eq(results2[idx].content)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'spec_helper'
|
|
4
|
+
|
|
5
|
+
# Specs for Kreuzberg.clear_cache and Kreuzberg.cache_stats, and for how the
# extraction entry points interact with the cache.
#
# cache_stats is read as a Hash with the string keys 'total_entries' and
# 'total_size_bytes'.
RSpec.describe 'Cache Management' do
  let(:test_pdf) do
    test_document_path('pdf/5_level_paging_and_5_level_ept_intel_revision_1_1_may_2017.pdf')
  end
  let(:test_text) { test_document_path('text/contract_test.txt') }
  let(:test_docx) { test_document_path('docx/extraction_test.docx') }

  # Every example starts from, and leaves behind, an empty cache, so examples
  # do not need to clear it again as their first step.
  before do
    Kreuzberg.clear_cache
  end

  after do
    Kreuzberg.clear_cache
  end

  describe 'clear_cache' do
    it 'removes all cached results' do
      Kreuzberg.extract_file_sync(path: test_pdf)
      Kreuzberg.extract_file_sync(path: test_text)

      stats_before = Kreuzberg.cache_stats
      expect(stats_before['total_entries']).to be_positive

      Kreuzberg.clear_cache

      stats_after = Kreuzberg.cache_stats
      expect(stats_after['total_entries']).to eq(0)
      expect(stats_after['total_size_bytes']).to eq(0)
    end

    it 'returns nil (void return)' do
      result = Kreuzberg.clear_cache
      expect(result).to be_nil
    end

    it 'can be called multiple times safely' do
      3.times { Kreuzberg.clear_cache }

      stats = Kreuzberg.cache_stats
      expect(stats['total_entries']).to eq(0)
    end

    it 'does not affect future extractions' do
      Kreuzberg.extract_file_sync(path: test_pdf)
      Kreuzberg.clear_cache

      result = Kreuzberg.extract_file_sync(path: test_pdf)

      expect(result).to be_a(Kreuzberg::Result)
      expect(result.content).not_to be_empty
    end
  end

  describe 'cache_stats' do
    it 'returns hash with correct structure' do
      stats = Kreuzberg.cache_stats

      expect(stats).to be_a(Hash)
      expect(stats).to have_key('total_entries')
      expect(stats).to have_key('total_size_bytes')
    end

    it 'returns zero stats when cache is empty' do
      stats = Kreuzberg.cache_stats

      expect(stats['total_entries']).to eq(0)
      expect(stats['total_size_bytes']).to eq(0)
    end

    it 'shows entries after extractions' do
      Kreuzberg.extract_file_sync(path: test_pdf)
      stats = Kreuzberg.cache_stats

      expect(stats['total_entries']).to be_positive
    end

    it 'shows total size in bytes' do
      Kreuzberg.extract_file_sync(path: test_pdf)
      stats = Kreuzberg.cache_stats

      expect(stats['total_size_bytes']).to be_positive
    end

    it 'increases stats with multiple extractions' do
      Kreuzberg.extract_file_sync(path: test_pdf)
      stats_after_one = Kreuzberg.cache_stats

      Kreuzberg.extract_file_sync(path: test_text)
      stats_after_two = Kreuzberg.cache_stats

      expect(stats_after_two['total_entries']).to be >= stats_after_one['total_entries']
    end
  end

  describe 'cache behavior across extractions' do
    it 'caches extraction results' do
      stats_initial = Kreuzberg.cache_stats
      expect(stats_initial['total_entries']).to eq(0)

      result1 = Kreuzberg.extract_file_sync(path: test_pdf)
      stats_after_first = Kreuzberg.cache_stats
      expect(stats_after_first['total_entries']).to be_positive

      result2 = Kreuzberg.extract_file_sync(path: test_pdf)
      stats_after_second = Kreuzberg.cache_stats

      expect(result1.content).to eq(result2.content)
      # Re-extracting the same file is expected to reuse the existing entry
      # rather than add one; the previous "+ 1" expectation asserted the
      # opposite of caching.
      expect(stats_after_second['total_entries']).to eq(stats_after_first['total_entries'])
    end

    it 'tracks different files separately' do
      Kreuzberg.extract_file_sync(path: test_pdf)
      stats_after_pdf = Kreuzberg.cache_stats

      Kreuzberg.extract_file_sync(path: test_text)
      stats_after_text = Kreuzberg.cache_stats

      expect(stats_after_text['total_entries']).to be >= stats_after_pdf['total_entries']
    end

    it 'second extraction of same file may use cache' do
      # Only result equality is asserted; timing is deliberately not measured.
      result1 = Kreuzberg.extract_file_sync(path: test_pdf)
      result2 = Kreuzberg.extract_file_sync(path: test_pdf)

      expect(result1.content).to eq(result2.content)
      expect(result1.mime_type).to eq(result2.mime_type)
    end

    it 'clears cache between extractions when requested' do
      result1 = Kreuzberg.extract_file_sync(path: test_pdf)

      Kreuzberg.clear_cache

      result2 = Kreuzberg.extract_file_sync(path: test_pdf)

      expect(result1.content).to eq(result2.content)
    end
  end

  describe 'cache with different configurations' do
    it 'respects use_cache flag in configs' do
      caching_config = Kreuzberg::Config::Extraction.new(use_cache: true)
      non_caching_config = Kreuzberg::Config::Extraction.new(use_cache: false)

      Kreuzberg.extract_file_sync(path: test_pdf, config: caching_config)
      stats_after_first = Kreuzberg.cache_stats

      Kreuzberg.extract_file_sync(path: test_pdf, config: non_caching_config)
      stats_after_second = Kreuzberg.cache_stats

      # An extraction with use_cache: false must not add cache entries.
      expect(stats_after_second['total_entries']).to eq(stats_after_first['total_entries'])
    end
  end

  describe 'cache stats consistency' do
    it 'stats remain consistent after clear' do
      Kreuzberg.extract_file_sync(path: test_pdf)
      Kreuzberg.extract_file_sync(path: test_text)

      Kreuzberg.clear_cache
      stats = Kreuzberg.cache_stats

      expect(stats['total_entries']).to eq(0)
      expect(stats['total_size_bytes']).to eq(0)
    end

    it 'stats update correctly after new extractions' do
      Kreuzberg.extract_file_sync(path: test_pdf)
      # Previously these stats were fetched and discarded; assert on them so
      # the first half of the scenario is actually verified.
      expect(Kreuzberg.cache_stats['total_entries']).to be_positive

      Kreuzberg.clear_cache

      Kreuzberg.extract_file_sync(path: test_text)
      expect(Kreuzberg.cache_stats['total_entries']).to be_positive
    end
  end

  describe 'integration with batch operations' do
    it 'caches batch extraction results' do
      results = Kreuzberg.batch_extract_files_sync(paths: [test_pdf, test_text])
      stats = Kreuzberg.cache_stats

      expect(results.length).to eq(2)
      expect(stats['total_entries']).to be_positive
    end

    it 'clear_cache affects batch extractions' do
      Kreuzberg.batch_extract_files_sync(paths: [test_pdf, test_text])

      Kreuzberg.clear_cache

      stats = Kreuzberg.cache_stats
      expect(stats['total_entries']).to eq(0)
    end
  end
end
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Specs for Kreuzberg::CLIProxy: locating the CLI binary, running it, and the
# path helpers it exposes.
RSpec.describe Kreuzberg::CLIProxy do
  describe '.find_cli_binary' do
    context 'when binary exists' do
      it 'finds the binary in search paths' do
        located = described_class.find_cli_binary

        expect(located).to be_a(Pathname)
        expect(located).to be_file
      end
    end

    context 'when binary does not exist' do
      # With no candidate locations the lookup has nothing to find.
      before { allow(described_class).to receive(:search_paths).and_return([]) }

      it 'raises MissingBinaryError' do
        expect { described_class.find_cli_binary }
          .to raise_error(Kreuzberg::CLIProxy::MissingBinaryError, /not found/)
      end
    end
  end

  describe '.call' do
    context 'when binary is available' do
      it 'executes CLI command successfully' do
        stdout = described_class.call(['--version'])

        expect(stdout).to be_a(String)
        expect(stdout).not_to be_empty
      end

      it 'raises CLIExecutionError on failure' do
        expect { described_class.call(['invalid-command']) }
          .to raise_error(Kreuzberg::CLIProxy::CLIExecutionError)
      end
    end
  end

  describe '.search_paths' do
    it 'returns an array of Pathname objects' do
      candidates = described_class.search_paths('kreuzberg')

      expect(candidates).to be_an(Array)
      expect(candidates).to all(be_a(Pathname))
    end

    it 'includes expected search locations' do
      locations = described_class.search_paths('kreuzberg').map(&:to_s)

      # At least one candidate must live under each expected directory.
      expect(locations).to include(a_string_including('lib/bin'))
      expect(locations).to include(a_string_including('target/release'))
    end
  end

  describe '.root_path' do
    it 'returns a Pathname' do
      expect(described_class.root_path).to be_a(Pathname)
    end

    it 'points to an existing directory' do
      expect(described_class.root_path).to be_directory
    end
  end

  describe '.lib_path' do
    it 'returns a Pathname' do
      expect(described_class.lib_path).to be_a(Pathname)
    end

    it 'points to an existing directory' do
      expect(described_class.lib_path).to be_directory
    end
  end

  describe '.missing_binary_message' do
    it 'returns helpful error message' do
      hint = described_class.missing_binary_message

      expect(hint).to include('cargo build')
      expect(hint).to include('kreuzberg-cli')
    end
  end
end
|