documentrix 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGES.md +69 -0
- data/documentrix.gemspec +5 -5
- data/lib/documentrix/documents/cache/common.rb +63 -11
- data/lib/documentrix/documents/cache/records.rb +1 -1
- data/lib/documentrix/documents/cache/redis_cache.rb +3 -3
- data/lib/documentrix/documents/cache/sqlite_cache.rb +95 -27
- data/lib/documentrix/documents/splitters/character.rb +56 -4
- data/lib/documentrix/documents/splitters/common.rb +38 -0
- data/lib/documentrix/documents/splitters/semantic.rb +67 -8
- data/lib/documentrix/documents.rb +133 -29
- data/lib/documentrix/utils/colorize_texts.rb +25 -21
- data/lib/documentrix/utils/digests.rb +78 -0
- data/lib/documentrix/utils.rb +1 -0
- data/lib/documentrix/version.rb +1 -1
- data/spec/documentrix/documents/cache/interface_spec.rb +16 -3
- data/spec/documentrix/documents/cache/memory_cache_spec.rb +64 -2
- data/spec/documentrix/documents/cache/redis_cache_spec.rb +68 -19
- data/spec/documentrix/documents/cache/sqlite_cache_spec.rb +128 -2
- data/spec/documentrix/documents/splitters/character_spec.rb +20 -2
- data/spec/documentrix/documents/splitters/semantic_spec.rb +17 -5
- data/spec/documents_spec.rb +59 -3
- data/spec/utils/colorize_texts_spec.rb +0 -2
- data/spec/utils/digests_spec.rb +97 -0
- data/spec/utils/tags_spec.rb +0 -2
- metadata +7 -1
|
@@ -1,28 +1,20 @@
|
|
|
1
|
-
require 'spec_helper'
|
|
2
|
-
|
|
3
1
|
describe Documentrix::Documents::RedisCache do
|
|
2
|
+
let :object_class do
|
|
3
|
+
Documentrix::Documents::Cache::Records::Record
|
|
4
|
+
end
|
|
5
|
+
|
|
4
6
|
let :prefix do
|
|
5
7
|
'test-'
|
|
6
8
|
end
|
|
7
9
|
|
|
8
10
|
let :cache do
|
|
9
|
-
described_class.new prefix:, url: 'something'
|
|
11
|
+
described_class.new prefix:, url: 'something', object_class:
|
|
10
12
|
end
|
|
11
13
|
|
|
12
14
|
it 'can be instantiated' do
|
|
13
15
|
expect(cache).to be_a described_class
|
|
14
16
|
end
|
|
15
17
|
|
|
16
|
-
it 'defaults to nil object_class' do
|
|
17
|
-
expect(cache.object_class).to be_nil
|
|
18
|
-
end
|
|
19
|
-
|
|
20
|
-
it 'can be configured with object_class' do
|
|
21
|
-
object_class = Class.new(JSON::GenericObject)
|
|
22
|
-
cache = described_class.new(prefix:, url: 'something', object_class:)
|
|
23
|
-
expect(cache.object_class).to eq object_class
|
|
24
|
-
end
|
|
25
|
-
|
|
26
18
|
it 'raises ArgumentError if url is missing' do
|
|
27
19
|
expect {
|
|
28
20
|
described_class.new prefix:, url: nil
|
|
@@ -38,6 +30,10 @@ describe Documentrix::Documents::RedisCache do
|
|
|
38
30
|
allow_any_instance_of(described_class).to receive(:redis).and_return(redis)
|
|
39
31
|
end
|
|
40
32
|
|
|
33
|
+
it 'can be configured with object_class' do
|
|
34
|
+
expect(cache.object_class).to eq object_class
|
|
35
|
+
end
|
|
36
|
+
|
|
41
37
|
it 'has Redis client' do
|
|
42
38
|
expect(cache.redis).to eq redis
|
|
43
39
|
end
|
|
@@ -62,9 +58,9 @@ describe Documentrix::Documents::RedisCache do
|
|
|
62
58
|
end
|
|
63
59
|
|
|
64
60
|
it 'can move prefixes' do
|
|
65
|
-
expect(redis).to receive(:get).with(prefix + 'foo').and_return(
|
|
66
|
-
expect(redis).to receive(:get).with('test2-bar').and_return(
|
|
67
|
-
expect(redis).to receive(:set).with('test3-foo',
|
|
61
|
+
expect(redis).to receive(:get).with(prefix + 'foo').and_return(object_class[foo: true].to_json)
|
|
62
|
+
expect(redis).to receive(:get).with('test2-bar').and_return(object_class[foo: true].to_json)
|
|
63
|
+
expect(redis).to receive(:set).with('test3-foo', /"foo":true/)
|
|
68
64
|
expect(redis).to receive(:del).with('test-foo')
|
|
69
65
|
expect(redis).to receive(:scan_each).with(match: ?*).
|
|
70
66
|
and_yield("#{prefix}foo").
|
|
@@ -81,12 +77,12 @@ describe Documentrix::Documents::RedisCache do
|
|
|
81
77
|
end
|
|
82
78
|
|
|
83
79
|
it 'can iterate over keys, values' do
|
|
84
|
-
key, value = 'foo',
|
|
85
|
-
expect(redis).to receive(:set).with(prefix + key,
|
|
80
|
+
key, value = 'foo', object_class[test: true]
|
|
81
|
+
expect(redis).to receive(:set).with(prefix + key, object_class[value].to_json)
|
|
86
82
|
cache[key] = value
|
|
87
83
|
expect(redis).to receive(:scan_each).with(match: "#{prefix}*").
|
|
88
84
|
and_yield("#{prefix}foo")
|
|
89
|
-
expect(redis).to receive(:get).with(prefix + key).and_return(
|
|
85
|
+
expect(redis).to receive(:get).with(prefix + key).and_return(object_class[test: true].to_json)
|
|
90
86
|
cache.each do |k, v|
|
|
91
87
|
expect(k).to eq prefix + key
|
|
92
88
|
expect(v).to eq value
|
|
@@ -135,5 +131,58 @@ describe Documentrix::Documents::RedisCache do
|
|
|
135
131
|
it 'can remove prefix with unpre' do
|
|
136
132
|
expect(cache.unpre('test-foo')).to eq 'foo'
|
|
137
133
|
end
|
|
134
|
+
|
|
135
|
+
it 'can iterate over unique sources' do
|
|
136
|
+
expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
|
|
137
|
+
"#{prefix}foo"
|
|
138
|
+
).and_yield(
|
|
139
|
+
"#{prefix}bar"
|
|
140
|
+
)
|
|
141
|
+
expect(redis).to receive(:get).with("#{prefix}foo").and_return(JSON(source: 's1'))
|
|
142
|
+
expect(redis).to receive(:get).with("#{prefix}bar").and_return(JSON(source: 's2'))
|
|
143
|
+
|
|
144
|
+
expect(cache.each_source.to_a).to match_array(['s1', 's2'])
|
|
145
|
+
end
|
|
146
|
+
|
|
147
|
+
it 'can retrieve all unique tags' do
|
|
148
|
+
expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
|
|
149
|
+
"#{prefix}foo"
|
|
150
|
+
).and_yield(
|
|
151
|
+
"#{prefix}bar"
|
|
152
|
+
)
|
|
153
|
+
expect(redis).to receive(:get).with("#{prefix}foo").and_return(JSON(source: 's1', tags: ['a', 'b']))
|
|
154
|
+
expect(redis).to receive(:get).with("#{prefix}bar").and_return(JSON(source: 's2', tags: ['b', 'c']))
|
|
155
|
+
|
|
156
|
+
expect(cache.tags.to_a).to match_array(['a', 'b', 'c'])
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
it 'can clear records by tags' do
|
|
160
|
+
expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
|
|
161
|
+
"#{prefix}foo"
|
|
162
|
+
).and_yield(
|
|
163
|
+
"#{prefix}bar"
|
|
164
|
+
)
|
|
165
|
+
expect(redis).to receive(:get).with("#{prefix}foo").and_return(JSON(source: 's1', tags: ['trash']))
|
|
166
|
+
expect(redis).to receive(:get).with("#{prefix}bar").and_return(JSON(source: 's2', tags: ['keep']))
|
|
167
|
+
expect(redis).to receive(:del).with("#{prefix}foo")
|
|
168
|
+
|
|
169
|
+
expect(cache.clear_for_tags(['trash'])).to eq cache
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
it 'can check if a source exists with a specific digest' do
|
|
173
|
+
expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
|
|
174
|
+
"#{prefix}foo"
|
|
175
|
+
)
|
|
176
|
+
expect(redis).to receive(:get).with("#{prefix}foo").and_return(JSON(source: 's1', digest: 'd1'))
|
|
177
|
+
|
|
178
|
+
expect(cache.source_exist?('s1', digest: 'd1')).to be true
|
|
179
|
+
|
|
180
|
+
# Reset for the negative case
|
|
181
|
+
expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
|
|
182
|
+
"#{prefix}foo"
|
|
183
|
+
)
|
|
184
|
+
expect(redis).to receive(:get).with("#{prefix}foo").and_return(JSON(source: 's1', digest: 'd1'))
|
|
185
|
+
expect(cache.source_exist?('s1', digest: 'd2')).to be false
|
|
186
|
+
end
|
|
138
187
|
end
|
|
139
188
|
end
|
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
require 'spec_helper'
|
|
2
|
-
|
|
3
1
|
describe Documentrix::Documents::SQLiteCache do
|
|
4
2
|
let :prefix do
|
|
5
3
|
'test-'
|
|
@@ -145,6 +143,20 @@ describe Documentrix::Documents::SQLiteCache do
|
|
|
145
143
|
expect(cache).to be_key 'bar'
|
|
146
144
|
end
|
|
147
145
|
|
|
146
|
+
it 'can clear all without tags' do
|
|
147
|
+
key, value = 'foo', { tags: %w[ foo ], embedding: [ 0.5 ] * 1_024 }
|
|
148
|
+
cache[key] = value
|
|
149
|
+
key, value = 'bar', { embedding: [ 0.5 ] * 1_024 }
|
|
150
|
+
cache[key] = value
|
|
151
|
+
expect {
|
|
152
|
+
expect(cache.clear_for_tags).to eq cache
|
|
153
|
+
}.to change {
|
|
154
|
+
cache.size
|
|
155
|
+
}.from(2).to(0)
|
|
156
|
+
expect(cache).not_to be_key 'foo'
|
|
157
|
+
expect(cache).not_to be_key 'bar'
|
|
158
|
+
end
|
|
159
|
+
|
|
148
160
|
it 'can clear by source' do
|
|
149
161
|
val1 = test_value.merge(source: 's1')
|
|
150
162
|
val2 = test_value.merge(source: 's1')
|
|
@@ -159,6 +171,46 @@ describe Documentrix::Documents::SQLiteCache do
|
|
|
159
171
|
expect(cache.key?('foo')).to be false
|
|
160
172
|
end
|
|
161
173
|
|
|
174
|
+
it 'can clear by source and digest' do
|
|
175
|
+
allow(cache).to receive(:compute_file_digest).and_return('d1', 'd2', 'd3')
|
|
176
|
+
cache['foo'] = test_value.merge(source: 's1') # d1
|
|
177
|
+
cache['bar'] = test_value.merge(source: 's1') # d2
|
|
178
|
+
cache['baz'] = test_value.merge(source: 's1') # d3
|
|
179
|
+
|
|
180
|
+
# Clear those that match d1
|
|
181
|
+
expect {
|
|
182
|
+
cache.clear_by_source('s1', digest: 'd1')
|
|
183
|
+
}.to change { cache.size }.from(3).to(2)
|
|
184
|
+
expect(cache.key?('foo')).to be false
|
|
185
|
+
expect(cache.key?('bar')).to be true
|
|
186
|
+
|
|
187
|
+
# Clear those that do NOT match d2 (should clear baz)
|
|
188
|
+
expect {
|
|
189
|
+
cache.clear_by_source('s1', digest: 'd2', operator: '!=')
|
|
190
|
+
}.to change { cache.size }.from(2).to(1)
|
|
191
|
+
expect(cache.key?('baz')).to be false
|
|
192
|
+
expect(cache.key?('bar')).to be true
|
|
193
|
+
end
|
|
194
|
+
|
|
195
|
+
describe '#source_exist?' do
|
|
196
|
+
it 'returns true if source exists' do
|
|
197
|
+
cache['foo'] = test_value
|
|
198
|
+
expect(cache.source_exist?('for-test.txt')).to be true
|
|
199
|
+
expect(cache.source_exist?('non-existent')).to be false
|
|
200
|
+
end
|
|
201
|
+
|
|
202
|
+
it 'filters by digest' do
|
|
203
|
+
allow(cache).to receive(:compute_file_digest).and_return('d1', 'd2')
|
|
204
|
+
cache['foo'] = test_value.merge(source: 's1') # d1
|
|
205
|
+
cache['bar'] = test_value.merge(source: 's1') # d2
|
|
206
|
+
|
|
207
|
+
expect(cache.source_exist?('s1', digest: 'd1')).to be true
|
|
208
|
+
expect(cache.source_exist?('s1', digest: 'd3')).to be false
|
|
209
|
+
expect(cache.source_exist?('s1', digest: 'd1', operator: '!=')).to be true # bar exists
|
|
210
|
+
expect(cache.source_exist?('s1', digest: 'd2', operator: '!=')).to be true # foo exists
|
|
211
|
+
end
|
|
212
|
+
end
|
|
213
|
+
|
|
162
214
|
it 'can return tags' do
|
|
163
215
|
key, value = 'foo', { tags: %w[ foo ], embedding: [ 0.5 ] * 1_024 }
|
|
164
216
|
cache[key] = value
|
|
@@ -169,6 +221,17 @@ describe Documentrix::Documents::SQLiteCache do
|
|
|
169
221
|
expect(tags.to_a).to eq %w[ bar baz foo ]
|
|
170
222
|
end
|
|
171
223
|
|
|
224
|
+
it 'can iterate over unique sources' do
|
|
225
|
+
val1 = test_value.merge(source: 's1')
|
|
226
|
+
val2 = test_value.merge(source: 's1')
|
|
227
|
+
val3 = test_value.merge(source: 's2')
|
|
228
|
+
cache['foo'] = val1
|
|
229
|
+
cache['bar'] = val2
|
|
230
|
+
cache['baz'] = val3
|
|
231
|
+
|
|
232
|
+
expect(cache.each_source.to_a).to match_array(['s1', 's2'])
|
|
233
|
+
end
|
|
234
|
+
|
|
172
235
|
it 'can iterate over keys under a prefix' do
|
|
173
236
|
cache['foo'] = test_value
|
|
174
237
|
expect(cache.each.to_a).to eq [ [ 'test-foo', Documentrix::Documents::Record[test_value] ] ]
|
|
@@ -186,4 +249,67 @@ describe Documentrix::Documents::SQLiteCache do
|
|
|
186
249
|
["test2-bar", Documentrix::Documents::Record[test_value] ],
|
|
187
250
|
]
|
|
188
251
|
end
|
|
252
|
+
|
|
253
|
+
describe '#find_records' do
|
|
254
|
+
let(:needle) { [ 0.5 ] * 1_024 }
|
|
255
|
+
|
|
256
|
+
it 'raises ArgumentError if needle length is incorrect' do
|
|
257
|
+
expect {
|
|
258
|
+
cache.find_records([ 0.1 ])
|
|
259
|
+
}.to raise_error(ArgumentError, /needle embedding length/)
|
|
260
|
+
end
|
|
261
|
+
|
|
262
|
+
it 'returns the most similar record' do
|
|
263
|
+
# Record 1: Exact match
|
|
264
|
+
val1 = test_value.merge(text: 'match', embedding: needle)
|
|
265
|
+
# Record 2: Different vector
|
|
266
|
+
val2 = test_value.merge(text: 'diff', embedding: [ 0.1 ] * 1_024)
|
|
267
|
+
|
|
268
|
+
cache['r1'] = val1
|
|
269
|
+
cache['r2'] = val2
|
|
270
|
+
|
|
271
|
+
results = cache.find_records(needle)
|
|
272
|
+
|
|
273
|
+
expect(results.size).to eq 2
|
|
274
|
+
expect(results.first.text).to eq 'match'
|
|
275
|
+
expect(results.first.similarity).to be_within(0.001).of(1.0)
|
|
276
|
+
end
|
|
277
|
+
|
|
278
|
+
it 'filters results by tags' do
|
|
279
|
+
val1 = test_value.merge(text: 'tagged', tags: %w[ a ], embedding: needle)
|
|
280
|
+
val2 = test_value.merge(text: 'untagged', tags: %w[ b ], embedding: needle)
|
|
281
|
+
|
|
282
|
+
cache['r1'] = val1
|
|
283
|
+
cache['r2'] = val2
|
|
284
|
+
|
|
285
|
+
expect(cache.find_records(needle, tags: %w[ a ]).map(&:text)).to eq %w[ tagged ]
|
|
286
|
+
expect(cache.find_records(needle, tags: %w[ b ]).map(&:text)).to eq %w[ untagged ]
|
|
287
|
+
expect(cache.find_records(needle, tags: %w[ c ]).size).to eq 0
|
|
288
|
+
end
|
|
289
|
+
|
|
290
|
+
it 'filters results by min_similarity' do
|
|
291
|
+
# Exact match
|
|
292
|
+
cache['r1'] = test_value.merge(text: 'match', embedding: needle)
|
|
293
|
+
# Very different vector
|
|
294
|
+
cache['r2'] = test_value.merge(text: 'diff', embedding: [ -0.5 ] * 1_024)
|
|
295
|
+
|
|
296
|
+
# Low threshold: both should appear
|
|
297
|
+
expect(cache.find_records(needle, min_similarity: -1).size).to eq 2
|
|
298
|
+
|
|
299
|
+
# High threshold: only match should appear
|
|
300
|
+
expect(cache.find_records(needle, min_similarity: 0.9).map(&:text)).to eq %w[ match ]
|
|
301
|
+
end
|
|
302
|
+
|
|
303
|
+
it 'limits results via max_records' do
|
|
304
|
+
3.times do |i|
|
|
305
|
+
cache["r#{i}"] = test_value.merge(text: "t#{i}", embedding: needle)
|
|
306
|
+
end
|
|
307
|
+
|
|
308
|
+
expect(cache.find_records(needle, max_records: 2).size).to eq 2
|
|
309
|
+
end
|
|
310
|
+
|
|
311
|
+
it 'returns empty array when no records match' do
|
|
312
|
+
expect(cache.find_records(needle)).to eq []
|
|
313
|
+
end
|
|
314
|
+
end
|
|
189
315
|
end
|
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
require 'spec_helper'
|
|
2
|
-
|
|
3
1
|
describe Documentrix::Documents::Splitters::Character do
|
|
4
2
|
let :splitter do
|
|
5
3
|
described_class.new chunk_size: 23, combining_string: ''
|
|
@@ -50,6 +48,26 @@ describe Documentrix::Documents::Splitters::Character do
|
|
|
50
48
|
expect(result.to_a.join('')).to eq ?A * 25
|
|
51
49
|
end
|
|
52
50
|
|
|
51
|
+
context 'with force' do
|
|
52
|
+
let :splitter do
|
|
53
|
+
described_class.new chunk_size: 23, combining_string: '', force: true
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
it 'can split with force' do
|
|
57
|
+
text = [ ?A * 10 ] * 10 * "\n"
|
|
58
|
+
result = splitter.split(text)
|
|
59
|
+
expect(result.count).to eq 10
|
|
60
|
+
expect(result.to_a.join('').count(?A)).to eq text.count(?A)
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
it 'can with force split2' do
|
|
64
|
+
text = ?A * 25
|
|
65
|
+
result = splitter.split(text)
|
|
66
|
+
expect(result.count).to eq 2
|
|
67
|
+
expect(result.to_a.join('')).to eq ?A * 25
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
|
|
53
71
|
it 'can split sentences' do
|
|
54
72
|
text = "foo.foo. bar!bar! baz?baz? quux.\nquux."
|
|
55
73
|
splitter = described_class.new(separator: /[.!?]\s*(?:\b|\z)/, chunk_size: 2)
|
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
require 'spec_helper'
|
|
2
|
-
|
|
3
1
|
describe Documentrix::Documents::Splitters::Semantic do
|
|
4
2
|
let :ollama do
|
|
5
3
|
double('Ollama::Client')
|
|
@@ -29,11 +27,11 @@ describe Documentrix::Documents::Splitters::Semantic do
|
|
|
29
27
|
expect(result.to_a.join('').count(?B)).to eq text.count(?B)
|
|
30
28
|
end
|
|
31
29
|
|
|
32
|
-
it 'can split with breakpoint :percentile' do
|
|
33
|
-
described_class.new ollama:, model: 'mxbai-embed-large', chunk_size:
|
|
30
|
+
it 'can split with breakpoint :percentile, chunk_size 23' do
|
|
31
|
+
splitter = described_class.new ollama:, model: 'mxbai-embed-large', chunk_size: 23
|
|
34
32
|
text = ([ "A" * 10 ] * 6 + [ "B" * 10 ] * 3 + [ "A" * 10 ] * 3) * ". "
|
|
35
33
|
result = splitter.split(text, breakpoint: :percentile, percentile: 75)
|
|
36
|
-
expect(result.count).to eq
|
|
34
|
+
expect(result.count).to eq 11
|
|
37
35
|
expect(result.to_a.join('').count(?A)).to eq text.count(?A)
|
|
38
36
|
expect(result.to_a.join('').count(?B)).to eq text.count(?B)
|
|
39
37
|
end
|
|
@@ -53,4 +51,18 @@ describe Documentrix::Documents::Splitters::Semantic do
|
|
|
53
51
|
expect(result.to_a.join('').count(?A)).to eq text.count(?A)
|
|
54
52
|
expect(result.to_a.join('').count(?B)).to eq text.count(?B)
|
|
55
53
|
end
|
|
54
|
+
|
|
55
|
+
context 'with force' do
|
|
56
|
+
let :splitter do
|
|
57
|
+
described_class.new ollama:, model: 'mxbai-embed-large', chunk_size: 7, force: true
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
it 'can split with force' do
|
|
61
|
+
text = ([ "A" * 10 ] * 3 + [ "B" * 10 ] * 3 + [ "A" * 10 ] * 3) * ". "
|
|
62
|
+
result = splitter.split(text, breakpoint: :percentile, percentile: 75)
|
|
63
|
+
expect(result.count).to eq 18
|
|
64
|
+
expect(result.to_a.join('').count(?A)).to eq text.count(?A)
|
|
65
|
+
expect(result.to_a.join('').count(?B)).to eq text.count(?B)
|
|
66
|
+
end
|
|
67
|
+
end
|
|
56
68
|
end
|
data/spec/documents_spec.rb
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
require 'spec_helper'
|
|
2
|
-
|
|
3
1
|
describe Documentrix::Documents do
|
|
4
2
|
let :ollama do
|
|
5
3
|
double('Ollama::Client')
|
|
@@ -168,7 +166,7 @@ describe Documentrix::Documents do
|
|
|
168
166
|
|
|
169
167
|
expect(documents.size).to eq 3
|
|
170
168
|
|
|
171
|
-
documents.
|
|
169
|
+
documents.source_remove('source1')
|
|
172
170
|
|
|
173
171
|
expect(documents.size).to eq 1
|
|
174
172
|
expect(documents.exist?('baz')).to be true
|
|
@@ -211,4 +209,62 @@ describe Documentrix::Documents do
|
|
|
211
209
|
to(:new_collection)
|
|
212
210
|
end
|
|
213
211
|
end
|
|
212
|
+
|
|
213
|
+
context 'source management' do
|
|
214
|
+
before do
|
|
215
|
+
allow(documents).to receive(:compute_file_digest).and_return('d1')
|
|
216
|
+
allow(documents.cache).to receive(:compute_file_digest).and_return('d1')
|
|
217
|
+
|
|
218
|
+
allow(ollama).to receive(:embed).and_return(double(embeddings: [[0.1]]))
|
|
219
|
+
end
|
|
220
|
+
|
|
221
|
+
it 'can check if a source exists' do
|
|
222
|
+
documents.add('foo', source: 's1')
|
|
223
|
+
expect(documents.source_exist?('s1')).to be true
|
|
224
|
+
expect(documents.source_exist?('s2')).to be false
|
|
225
|
+
end
|
|
226
|
+
|
|
227
|
+
it 'can determine if a source is modified' do
|
|
228
|
+
documents.add('foo', source: 's1')
|
|
229
|
+
|
|
230
|
+
# Case 1: Source is up to date
|
|
231
|
+
expect(documents.source_modified?('s1')).to be false
|
|
232
|
+
|
|
233
|
+
# Case 2: Source is missing
|
|
234
|
+
expect(documents.source_modified?('s2')).to be true
|
|
235
|
+
|
|
236
|
+
# Case 3: Digest changed - mock both to return the new digest
|
|
237
|
+
allow(documents).to receive(:compute_file_digest).with('s1').and_return('d2')
|
|
238
|
+
allow(documents.cache).to receive(:compute_file_digest).with('s1').and_return('d2')
|
|
239
|
+
expect(documents.source_modified?('s1')).to be true
|
|
240
|
+
end
|
|
241
|
+
|
|
242
|
+
it 'does not update the source if the digest matches' do
|
|
243
|
+
documents.add('foo', source: 's1')
|
|
244
|
+
|
|
245
|
+
expect(ollama).not_to receive(:embed)
|
|
246
|
+
documents.source_update(['foo'], source: 's1')
|
|
247
|
+
expect(documents.exist?('foo')).to be true
|
|
248
|
+
end
|
|
249
|
+
|
|
250
|
+
it 'updates the source if the digest has changed' do
|
|
251
|
+
documents.add('foo', source: 's1')
|
|
252
|
+
|
|
253
|
+
# Simulate a file change by updating the digest
|
|
254
|
+
allow(documents).to receive(:compute_file_digest).with('s1').and_return('d2')
|
|
255
|
+
allow(documents.cache).to receive(:compute_file_digest).with('s1').and_return('d2')
|
|
256
|
+
|
|
257
|
+
expect(ollama).to receive(:embed).once
|
|
258
|
+
documents.source_update(['bar'], source: 's1')
|
|
259
|
+
|
|
260
|
+
expect(documents.exist?('bar')).to be true
|
|
261
|
+
expect(documents.exist?('foo')).to be false
|
|
262
|
+
end
|
|
263
|
+
|
|
264
|
+
it 'updates the source if it is an URL' do
|
|
265
|
+
expect(ollama).to receive(:embed).once
|
|
266
|
+
documents.source_update('foo', source: 'https://www.example.com/s1')
|
|
267
|
+
expect(documents.exist?('foo')).to be true
|
|
268
|
+
end
|
|
269
|
+
end
|
|
214
270
|
end
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
require 'tempfile'
|
|
2
|
+
|
|
3
|
+
describe Documentrix::Utils::Digests do
|
|
4
|
+
let(:test_class) do
|
|
5
|
+
Class.new do
|
|
6
|
+
include Documentrix::Utils::Digests
|
|
7
|
+
end
|
|
8
|
+
end
|
|
9
|
+
|
|
10
|
+
let(:subject) { test_class.new.expose }
|
|
11
|
+
|
|
12
|
+
describe '#compute_digest' do
|
|
13
|
+
it 'computes a valid SHA256 digest of a string' do
|
|
14
|
+
text = 'hello world'
|
|
15
|
+
expected = Digest::SHA256.hexdigest(text)
|
|
16
|
+
expect(subject.compute_digest(text)).to eq expected
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
describe '#compute_file_digest' do
|
|
21
|
+
it 'returns nil for an empty filename' do
|
|
22
|
+
expect(subject.compute_file_digest(nil)).to be_nil
|
|
23
|
+
expect(subject.compute_file_digest('')).to be_nil
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
it 'returns nil for an absolute URL' do
|
|
27
|
+
expect(subject.compute_file_digest('https://example.com/file.txt')).to be_nil
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
it 'returns nil for a non-existent file' do
|
|
31
|
+
expect(subject.compute_file_digest('/tmp/non_existent_file_12345')).to be_nil
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
it 'computes the digest of a local file' do
|
|
35
|
+
file = Tempfile.create('documentrix_test')
|
|
36
|
+
content = 'file content'
|
|
37
|
+
file.write(content)
|
|
38
|
+
file.close
|
|
39
|
+
|
|
40
|
+
expected = Digest::SHA256.hexdigest(content)
|
|
41
|
+
expect(subject.compute_file_digest(file.path)).to eq expected
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
context 'with caching' do
|
|
45
|
+
let(:file) { Tempfile.create('documentrix_cache_test') }
|
|
46
|
+
let(:content) { 'initial content' }
|
|
47
|
+
|
|
48
|
+
before do
|
|
49
|
+
file.write(content)
|
|
50
|
+
file.close
|
|
51
|
+
subject.file_digest_cache_clear
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
it 'returns the same digest on subsequent calls' do
|
|
55
|
+
digest1 = subject.compute_file_digest(file.path)
|
|
56
|
+
digest2 = subject.compute_file_digest(file.path)
|
|
57
|
+
expect(digest1).to eq digest2
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
it 'recomputes the digest when the file is modified' do
|
|
61
|
+
digest1 = subject.compute_file_digest(file.path)
|
|
62
|
+
|
|
63
|
+
# Update file content and force mtime change
|
|
64
|
+
File.write(file.path, 'updated content')
|
|
65
|
+
# Ensure mtime is actually different (some FS have low precision)
|
|
66
|
+
File.utime(Time.now + 1, Time.now + 1, file.path)
|
|
67
|
+
|
|
68
|
+
digest2 = subject.compute_file_digest(file.path)
|
|
69
|
+
expect(digest1).not_to eq digest2
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
it 'recomputes the digest after cache clear' do
|
|
73
|
+
digest1 = subject.compute_file_digest(file.path)
|
|
74
|
+
subject.file_digest_cache_clear
|
|
75
|
+
|
|
76
|
+
# Even though file hasn't changed, it should re-read and return same value
|
|
77
|
+
digest2 = subject.compute_file_digest(file.path)
|
|
78
|
+
expect(digest1).to eq digest2
|
|
79
|
+
end
|
|
80
|
+
end
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
describe '#file_digest_cache_clear' do
|
|
84
|
+
it 'clears the internal cache' do
|
|
85
|
+
file = Tempfile.create('documentrix_clear_test')
|
|
86
|
+
file.write('test')
|
|
87
|
+
file.close
|
|
88
|
+
|
|
89
|
+
subject.compute_file_digest(file.path)
|
|
90
|
+
subject.file_digest_cache_clear
|
|
91
|
+
|
|
92
|
+
# We can verify this indirectly by checking if the cache is empty
|
|
93
|
+
# or by the fact that it will re-compute in tests.
|
|
94
|
+
expect(subject).to respond_to(:file_digest_cache_clear)
|
|
95
|
+
end
|
|
96
|
+
end
|
|
97
|
+
end
|
data/spec/utils/tags_spec.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: documentrix
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.0
|
|
4
|
+
version: 0.3.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Florian Frank
|
|
@@ -249,9 +249,11 @@ extra_rdoc_files:
|
|
|
249
249
|
- lib/documentrix/documents/cache/redis_cache.rb
|
|
250
250
|
- lib/documentrix/documents/cache/sqlite_cache.rb
|
|
251
251
|
- lib/documentrix/documents/splitters/character.rb
|
|
252
|
+
- lib/documentrix/documents/splitters/common.rb
|
|
252
253
|
- lib/documentrix/documents/splitters/semantic.rb
|
|
253
254
|
- lib/documentrix/utils.rb
|
|
254
255
|
- lib/documentrix/utils/colorize_texts.rb
|
|
256
|
+
- lib/documentrix/utils/digests.rb
|
|
255
257
|
- lib/documentrix/utils/math.rb
|
|
256
258
|
- lib/documentrix/utils/tags.rb
|
|
257
259
|
- lib/documentrix/version.rb
|
|
@@ -274,9 +276,11 @@ files:
|
|
|
274
276
|
- lib/documentrix/documents/cache/redis_cache.rb
|
|
275
277
|
- lib/documentrix/documents/cache/sqlite_cache.rb
|
|
276
278
|
- lib/documentrix/documents/splitters/character.rb
|
|
279
|
+
- lib/documentrix/documents/splitters/common.rb
|
|
277
280
|
- lib/documentrix/documents/splitters/semantic.rb
|
|
278
281
|
- lib/documentrix/utils.rb
|
|
279
282
|
- lib/documentrix/utils/colorize_texts.rb
|
|
283
|
+
- lib/documentrix/utils/digests.rb
|
|
280
284
|
- lib/documentrix/utils/math.rb
|
|
281
285
|
- lib/documentrix/utils/tags.rb
|
|
282
286
|
- lib/documentrix/version.rb
|
|
@@ -291,6 +295,7 @@ files:
|
|
|
291
295
|
- spec/documents_spec.rb
|
|
292
296
|
- spec/spec_helper.rb
|
|
293
297
|
- spec/utils/colorize_texts_spec.rb
|
|
298
|
+
- spec/utils/digests_spec.rb
|
|
294
299
|
- spec/utils/tags_spec.rb
|
|
295
300
|
homepage: https://github.com/flori/documentrix
|
|
296
301
|
licenses:
|
|
@@ -327,4 +332,5 @@ test_files:
|
|
|
327
332
|
- spec/documents_spec.rb
|
|
328
333
|
- spec/spec_helper.rb
|
|
329
334
|
- spec/utils/colorize_texts_spec.rb
|
|
335
|
+
- spec/utils/digests_spec.rb
|
|
330
336
|
- spec/utils/tags_spec.rb
|