documentrix 0.1.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,3 @@
1
- require 'spec_helper'
2
-
3
1
  describe Documentrix::Documents::MemoryCache do
4
2
  let :prefix do
5
3
  'test-'
@@ -120,8 +118,83 @@ describe Documentrix::Documents::MemoryCache do
120
118
  }.from(1).to(0)
121
119
  end
122
120
 
121
+ it 'can clear by source' do
122
+ cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', embedding: [0.1]]
123
+ cache['bar'] = Documentrix::Documents::Record[text: 'bar', source: 's1', embedding: [0.1]]
124
+ cache['baz'] = Documentrix::Documents::Record[text: 'baz', source: 's2', embedding: [0.1]]
125
+ expect {
126
+ cache.clear_by_source('s1')
127
+ }.to change { cache.size }.from(3).to(1)
128
+ expect(cache.key?('baz')).to be true
129
+ expect(cache.key?('foo')).to be false
130
+ end
131
+
123
132
  it 'can iterate over keys under a prefix' do
124
133
  cache['foo'] = 'bar'
125
134
  expect(cache.to_a).to eq [ %W[ #{prefix}foo bar ] ]
126
135
  end
136
+
137
+ it 'can iterate over unique sources' do
138
+ cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', embedding: [0.1]]
139
+ cache['bar'] = Documentrix::Documents::Record[text: 'bar', source: 's1', embedding: [0.1]]
140
+ cache['baz'] = Documentrix::Documents::Record[text: 'baz', source: 's2', embedding: [0.1]]
141
+
142
+ expect(cache.each_source.to_a).to match_array(['s1', 's2'])
143
+ end
144
+
145
+ it 'can retrieve all unique tags' do
146
+ cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', tags: ['a', 'b'], embedding: [0.1]]
147
+ cache['bar'] = Documentrix::Documents::Record[text: 'bar', source: 's2', tags: ['b', 'c'], embedding: [0.1]]
148
+
149
+ expect(cache.tags.to_a).to match_array(['a', 'b', 'c'])
150
+ end
151
+
152
+ it 'can clear records by tags' do
153
+ cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', tags: ['keep'], embedding: [0.1]]
154
+ cache['bar'] = Documentrix::Documents::Record[text: 'bar', source: 's1', tags: ['trash'], embedding: [0.1]]
155
+
156
+ expect {
157
+ cache.clear_for_tags(['trash'])
158
+ }.to change { cache.size }.from(2).to(1)
159
+ expect(cache.key?('foo')).to be true
160
+ expect(cache.key?('bar')).to be false
161
+ end
162
+
163
+ it 'can check if a source exists' do
164
+ cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', embedding: [0.1]]
165
+
166
+ expect(cache.source_exist?('s1')).to be true
167
+ expect(cache.source_exist?('s2')).to be false
168
+ end
169
+
170
+ it 'can clear by source with a specific digest' do
171
+ cache['f1'] = Documentrix::Documents::Record[text: 'v1', source: 's1', digest: 'd1', embedding: [0.1]]
172
+ cache['f2'] = Documentrix::Documents::Record[text: 'v2', source: 's1', digest: 'd2', embedding: [0.1]]
173
+
174
+ expect {
175
+ cache.clear_by_source('s1', digest: 'd1')
176
+ }.to change { cache.size }.from(2).to(1)
177
+ expect(cache.key?('f2')).to be true
178
+ expect(cache.key?('f1')).to be false
179
+ end
180
+
181
+ it 'can clear outdated versions of a source' do
182
+ cache['f1'] = Documentrix::Documents::Record[text: 'v1', source: 's1', digest: 'd1', embedding: [0.1]]
183
+ cache['f2'] = Documentrix::Documents::Record[text: 'v2', source: 's1', digest: 'd2', embedding: [0.1]]
184
+
185
+ expect {
186
+ cache.clear_by_source('s1', digest: 'd2', operator: '!=')
187
+ }.to change { cache.size }.from(2).to(1)
188
+ expect(cache.key?('f2')).to be true
189
+ expect(cache.key?('f1')).to be false
190
+ end
191
+
192
+ it 'can check if a source exists with a specific digest' do
193
+ cache['f1'] = Documentrix::Documents::Record[text: 'v1', source: 's1', digest: 'd1', embedding: [0.1]]
194
+
195
+ expect(cache.source_exist?('s1', digest: 'd1')).to be true
196
+ expect(cache.source_exist?('s1', digest: 'd2')).to be false
197
+ expect(cache.source_exist?('s1', digest: 'd1', operator: '!=')).to be false
198
+ expect(cache.source_exist?('s1', digest: 'd2', operator: '!=')).to be true
199
+ end
127
200
  end
@@ -1,28 +1,20 @@
1
- require 'spec_helper'
2
-
3
1
  describe Documentrix::Documents::RedisCache do
2
+ let :object_class do
3
+ Documentrix::Documents::Cache::Records::Record
4
+ end
5
+
4
6
  let :prefix do
5
7
  'test-'
6
8
  end
7
9
 
8
10
  let :cache do
9
- described_class.new prefix:, url: 'something'
11
+ described_class.new prefix:, url: 'something', object_class:
10
12
  end
11
13
 
12
14
  it 'can be instantiated' do
13
15
  expect(cache).to be_a described_class
14
16
  end
15
17
 
16
- it 'defaults to nil object_class' do
17
- expect(cache.object_class).to be_nil
18
- end
19
-
20
- it 'can be configured with object_class' do
21
- object_class = Class.new(JSON::GenericObject)
22
- cache = described_class.new(prefix:, url: 'something', object_class:)
23
- expect(cache.object_class).to eq object_class
24
- end
25
-
26
18
  it 'raises ArgumentError if url is missing' do
27
19
  expect {
28
20
  described_class.new prefix:, url: nil
@@ -38,6 +30,10 @@ describe Documentrix::Documents::RedisCache do
38
30
  allow_any_instance_of(described_class).to receive(:redis).and_return(redis)
39
31
  end
40
32
 
33
+ it 'can be configured with object_class' do
34
+ expect(cache.object_class).to eq object_class
35
+ end
36
+
41
37
  it 'has Redis client' do
42
38
  expect(cache.redis).to eq redis
43
39
  end
@@ -62,9 +58,9 @@ describe Documentrix::Documents::RedisCache do
62
58
  end
63
59
 
64
60
  it 'can move prefixes' do
65
- expect(redis).to receive(:get).with(prefix + 'foo').and_return(JSON(foo: true))
66
- expect(redis).to receive(:get).with('test2-bar').and_return(JSON(foo: true))
67
- expect(redis).to receive(:set).with('test3-foo', '{"foo":true}')
61
+ expect(redis).to receive(:get).with(prefix + 'foo').and_return(object_class[foo: true].to_json)
62
+ expect(redis).to receive(:get).with('test2-bar').and_return(object_class[foo: true].to_json)
63
+ expect(redis).to receive(:set).with('test3-foo', /"foo":true/)
68
64
  expect(redis).to receive(:del).with('test-foo')
69
65
  expect(redis).to receive(:scan_each).with(match: ?*).
70
66
  and_yield("#{prefix}foo").
@@ -81,12 +77,12 @@ describe Documentrix::Documents::RedisCache do
81
77
  end
82
78
 
83
79
  it 'can iterate over keys, values' do
84
- key, value = 'foo', { 'test' => true }
85
- expect(redis).to receive(:set).with(prefix + key, JSON(value))
80
+ key, value = 'foo', object_class[test: true]
81
+ expect(redis).to receive(:set).with(prefix + key, object_class[value].to_json)
86
82
  cache[key] = value
87
83
  expect(redis).to receive(:scan_each).with(match: "#{prefix}*").
88
84
  and_yield("#{prefix}foo")
89
- expect(redis).to receive(:get).with(prefix + key).and_return(JSON(test: true))
85
+ expect(redis).to receive(:get).with(prefix + key).and_return(object_class[test: true].to_json)
90
86
  cache.each do |k, v|
91
87
  expect(k).to eq prefix + key
92
88
  expect(v).to eq value
@@ -109,6 +105,20 @@ describe Documentrix::Documents::RedisCache do
109
105
  expect(cache.clear).to eq cache
110
106
  end
111
107
 
108
+ it 'can clear by source' do
109
+ object_class = Class.new(JSON::GenericObject)
110
+ cache = described_class.new(prefix:, url: 'something', object_class:)
111
+ expect(redis).to receive(:scan_each).with(match: 'test-*').and_yield(
112
+ 'test-foo'
113
+ ).and_yield(
114
+ 'test-bar'
115
+ )
116
+ expect(redis).to receive(:get).with('test-foo').and_return(JSON(source: 's1'))
117
+ expect(redis).to receive(:get).with('test-bar').and_return(JSON(source: 's2'))
118
+ expect(redis).to receive(:del).with('test-foo')
119
+ expect(cache.clear_by_source('s1')).to eq cache
120
+ end
121
+
112
122
  it 'can iterate over keys under a prefix' do
113
123
  expect(redis).to receive(:scan_each).with(match: 'test-*')
114
124
  cache.to_a
@@ -121,5 +131,58 @@ describe Documentrix::Documents::RedisCache do
121
131
  it 'can remove prefix with unpre' do
122
132
  expect(cache.unpre('test-foo')).to eq 'foo'
123
133
  end
134
+
135
+ it 'can iterate over unique sources' do
136
+ expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
137
+ "#{prefix}foo"
138
+ ).and_yield(
139
+ "#{prefix}bar"
140
+ )
141
+ expect(redis).to receive(:get).with("#{prefix}foo").and_return(JSON(source: 's1'))
142
+ expect(redis).to receive(:get).with("#{prefix}bar").and_return(JSON(source: 's2'))
143
+
144
+ expect(cache.each_source.to_a).to match_array(['s1', 's2'])
145
+ end
146
+
147
+ it 'can retrieve all unique tags' do
148
+ expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
149
+ "#{prefix}foo"
150
+ ).and_yield(
151
+ "#{prefix}bar"
152
+ )
153
+ expect(redis).to receive(:get).with("#{prefix}foo").and_return(JSON(source: 's1', tags: ['a', 'b']))
154
+ expect(redis).to receive(:get).with("#{prefix}bar").and_return(JSON(source: 's2', tags: ['b', 'c']))
155
+
156
+ expect(cache.tags.to_a).to match_array(['a', 'b', 'c'])
157
+ end
158
+
159
+ it 'can clear records by tags' do
160
+ expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
161
+ "#{prefix}foo"
162
+ ).and_yield(
163
+ "#{prefix}bar"
164
+ )
165
+ expect(redis).to receive(:get).with("#{prefix}foo").and_return(JSON(source: 's1', tags: ['trash']))
166
+ expect(redis).to receive(:get).with("#{prefix}bar").and_return(JSON(source: 's2', tags: ['keep']))
167
+ expect(redis).to receive(:del).with("#{prefix}foo")
168
+
169
+ expect(cache.clear_for_tags(['trash'])).to eq cache
170
+ end
171
+
172
+ it 'can check if a source exists with a specific digest' do
173
+ expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
174
+ "#{prefix}foo"
175
+ )
176
+ expect(redis).to receive(:get).with("#{prefix}foo").and_return(JSON(source: 's1', digest: 'd1'))
177
+
178
+ expect(cache.source_exist?('s1', digest: 'd1')).to be true
179
+
180
+ # Reset for the negative case
181
+ expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
182
+ "#{prefix}foo"
183
+ )
184
+ expect(redis).to receive(:get).with("#{prefix}foo").and_return(JSON(source: 's1', digest: 'd1'))
185
+ expect(cache.source_exist?('s1', digest: 'd2')).to be false
186
+ end
124
187
  end
125
188
  end
@@ -1,5 +1,3 @@
1
- require 'spec_helper'
2
-
3
1
  describe Documentrix::Documents::SQLiteCache do
4
2
  let :prefix do
5
3
  'test-'
@@ -145,6 +143,74 @@ describe Documentrix::Documents::SQLiteCache do
145
143
  expect(cache).to be_key 'bar'
146
144
  end
147
145
 
146
+ it 'can clear all without tags' do
147
+ key, value = 'foo', { tags: %w[ foo ], embedding: [ 0.5 ] * 1_024 }
148
+ cache[key] = value
149
+ key, value = 'bar', { embedding: [ 0.5 ] * 1_024 }
150
+ cache[key] = value
151
+ expect {
152
+ expect(cache.clear_for_tags).to eq cache
153
+ }.to change {
154
+ cache.size
155
+ }.from(2).to(0)
156
+ expect(cache).not_to be_key 'foo'
157
+ expect(cache).not_to be_key 'bar'
158
+ end
159
+
160
+ it 'can clear by source' do
161
+ val1 = test_value.merge(source: 's1')
162
+ val2 = test_value.merge(source: 's1')
163
+ val3 = test_value.merge(source: 's2')
164
+ cache['foo'] = val1
165
+ cache['bar'] = val2
166
+ cache['baz'] = val3
167
+ expect {
168
+ cache.clear_by_source('s1')
169
+ }.to change { cache.size }.from(3).to(1)
170
+ expect(cache.key?('baz')).to be true
171
+ expect(cache.key?('foo')).to be false
172
+ end
173
+
174
+ it 'can clear by source and digest' do
175
+ allow(cache).to receive(:compute_file_digest).and_return('d1', 'd2', 'd3')
176
+ cache['foo'] = test_value.merge(source: 's1') # d1
177
+ cache['bar'] = test_value.merge(source: 's1') # d2
178
+ cache['baz'] = test_value.merge(source: 's1') # d3
179
+
180
+ # Clear those that match d1
181
+ expect {
182
+ cache.clear_by_source('s1', digest: 'd1')
183
+ }.to change { cache.size }.from(3).to(2)
184
+ expect(cache.key?('foo')).to be false
185
+ expect(cache.key?('bar')).to be true
186
+
187
+ # Clear those that do NOT match d2 (should clear baz)
188
+ expect {
189
+ cache.clear_by_source('s1', digest: 'd2', operator: '!=')
190
+ }.to change { cache.size }.from(2).to(1)
191
+ expect(cache.key?('baz')).to be false
192
+ expect(cache.key?('bar')).to be true
193
+ end
194
+
195
+ describe '#source_exist?' do
196
+ it 'returns true if source exists' do
197
+ cache['foo'] = test_value
198
+ expect(cache.source_exist?('for-test.txt')).to be true
199
+ expect(cache.source_exist?('non-existent')).to be false
200
+ end
201
+
202
+ it 'filters by digest' do
203
+ allow(cache).to receive(:compute_file_digest).and_return('d1', 'd2')
204
+ cache['foo'] = test_value.merge(source: 's1') # d1
205
+ cache['bar'] = test_value.merge(source: 's1') # d2
206
+
207
+ expect(cache.source_exist?('s1', digest: 'd1')).to be true
208
+ expect(cache.source_exist?('s1', digest: 'd3')).to be false
209
+ expect(cache.source_exist?('s1', digest: 'd1', operator: '!=')).to be true # bar exists
210
+ expect(cache.source_exist?('s1', digest: 'd2', operator: '!=')).to be true # foo exists
211
+ end
212
+ end
213
+
148
214
  it 'can return tags' do
149
215
  key, value = 'foo', { tags: %w[ foo ], embedding: [ 0.5 ] * 1_024 }
150
216
  cache[key] = value
@@ -155,6 +221,17 @@ describe Documentrix::Documents::SQLiteCache do
155
221
  expect(tags.to_a).to eq %w[ bar baz foo ]
156
222
  end
157
223
 
224
+ it 'can iterate over unique sources' do
225
+ val1 = test_value.merge(source: 's1')
226
+ val2 = test_value.merge(source: 's1')
227
+ val3 = test_value.merge(source: 's2')
228
+ cache['foo'] = val1
229
+ cache['bar'] = val2
230
+ cache['baz'] = val3
231
+
232
+ expect(cache.each_source.to_a).to match_array(['s1', 's2'])
233
+ end
234
+
158
235
  it 'can iterate over keys under a prefix' do
159
236
  cache['foo'] = test_value
160
237
  expect(cache.each.to_a).to eq [ [ 'test-foo', Documentrix::Documents::Record[test_value] ] ]
@@ -172,4 +249,67 @@ describe Documentrix::Documents::SQLiteCache do
172
249
  ["test2-bar", Documentrix::Documents::Record[test_value] ],
173
250
  ]
174
251
  end
252
+
253
+ describe '#find_records' do
254
+ let(:needle) { [ 0.5 ] * 1_024 }
255
+
256
+ it 'raises ArgumentError if needle length is incorrect' do
257
+ expect {
258
+ cache.find_records([ 0.1 ])
259
+ }.to raise_error(ArgumentError, /needle embedding length/)
260
+ end
261
+
262
+ it 'returns the most similar record' do
263
+ # Record 1: Exact match
264
+ val1 = test_value.merge(text: 'match', embedding: needle)
265
+ # Record 2: Different vector
266
+ val2 = test_value.merge(text: 'diff', embedding: [ 0.1 ] * 1_024)
267
+
268
+ cache['r1'] = val1
269
+ cache['r2'] = val2
270
+
271
+ results = cache.find_records(needle)
272
+
273
+ expect(results.size).to eq 2
274
+ expect(results.first.text).to eq 'match'
275
+ expect(results.first.similarity).to be_within(0.001).of(1.0)
276
+ end
277
+
278
+ it 'filters results by tags' do
279
+ val1 = test_value.merge(text: 'tagged', tags: %w[ a ], embedding: needle)
280
+ val2 = test_value.merge(text: 'untagged', tags: %w[ b ], embedding: needle)
281
+
282
+ cache['r1'] = val1
283
+ cache['r2'] = val2
284
+
285
+ expect(cache.find_records(needle, tags: %w[ a ]).map(&:text)).to eq %w[ tagged ]
286
+ expect(cache.find_records(needle, tags: %w[ b ]).map(&:text)).to eq %w[ untagged ]
287
+ expect(cache.find_records(needle, tags: %w[ c ]).size).to eq 0
288
+ end
289
+
290
+ it 'filters results by min_similarity' do
291
+ # Exact match
292
+ cache['r1'] = test_value.merge(text: 'match', embedding: needle)
293
+ # Very different vector
294
+ cache['r2'] = test_value.merge(text: 'diff', embedding: [ -0.5 ] * 1_024)
295
+
296
+ # Low threshold: both should appear
297
+ expect(cache.find_records(needle, min_similarity: -1).size).to eq 2
298
+
299
+ # High threshold: only match should appear
300
+ expect(cache.find_records(needle, min_similarity: 0.9).map(&:text)).to eq %w[ match ]
301
+ end
302
+
303
+ it 'limits results via max_records' do
304
+ 3.times do |i|
305
+ cache["r#{i}"] = test_value.merge(text: "t#{i}", embedding: needle)
306
+ end
307
+
308
+ expect(cache.find_records(needle, max_records: 2).size).to eq 2
309
+ end
310
+
311
+ it 'returns empty array when no records match' do
312
+ expect(cache.find_records(needle)).to eq []
313
+ end
314
+ end
175
315
  end
@@ -1,5 +1,3 @@
1
- require 'spec_helper'
2
-
3
1
  describe Documentrix::Documents::Splitters::Character do
4
2
  let :splitter do
5
3
  described_class.new chunk_size: 23, combining_string: ''
@@ -50,6 +48,26 @@ describe Documentrix::Documents::Splitters::Character do
50
48
  expect(result.to_a.join('')).to eq ?A * 25
51
49
  end
52
50
 
51
+ context 'with force' do
52
+ let :splitter do
53
+ described_class.new chunk_size: 23, combining_string: '', force: true
54
+ end
55
+
56
+ it 'can split with force' do
57
+ text = [ ?A * 10 ] * 10 * "\n"
58
+ result = splitter.split(text)
59
+ expect(result.count).to eq 10
60
+ expect(result.to_a.join('').count(?A)).to eq text.count(?A)
61
+ end
62
+
63
+ it 'can with force split2' do
64
+ text = ?A * 25
65
+ result = splitter.split(text)
66
+ expect(result.count).to eq 2
67
+ expect(result.to_a.join('')).to eq ?A * 25
68
+ end
69
+ end
70
+
53
71
  it 'can split sentences' do
54
72
  text = "foo.foo. bar!bar! baz?baz? quux.\nquux."
55
73
  splitter = described_class.new(separator: /[.!?]\s*(?:\b|\z)/, chunk_size: 2)
@@ -1,5 +1,3 @@
1
- require 'spec_helper'
2
-
3
1
  describe Documentrix::Documents::Splitters::Semantic do
4
2
  let :ollama do
5
3
  double('Ollama::Client')
@@ -29,11 +27,11 @@ describe Documentrix::Documents::Splitters::Semantic do
29
27
  expect(result.to_a.join('').count(?B)).to eq text.count(?B)
30
28
  end
31
29
 
32
- it 'can split with breakpoint :percentile' do
33
- described_class.new ollama:, model: 'mxbai-embed-large', chunk_size: 50
30
+ it 'can split with breakpoint :percentile, chunk_size 23' do
31
+ splitter = described_class.new ollama:, model: 'mxbai-embed-large', chunk_size: 23
34
32
  text = ([ "A" * 10 ] * 6 + [ "B" * 10 ] * 3 + [ "A" * 10 ] * 3) * ". "
35
33
  result = splitter.split(text, breakpoint: :percentile, percentile: 75)
36
- expect(result.count).to eq 4
34
+ expect(result.count).to eq 11
37
35
  expect(result.to_a.join('').count(?A)).to eq text.count(?A)
38
36
  expect(result.to_a.join('').count(?B)).to eq text.count(?B)
39
37
  end
@@ -53,4 +51,18 @@ describe Documentrix::Documents::Splitters::Semantic do
53
51
  expect(result.to_a.join('').count(?A)).to eq text.count(?A)
54
52
  expect(result.to_a.join('').count(?B)).to eq text.count(?B)
55
53
  end
54
+
55
+ context 'with force' do
56
+ let :splitter do
57
+ described_class.new ollama:, model: 'mxbai-embed-large', chunk_size: 7, force: true
58
+ end
59
+
60
+ it 'can split with force' do
61
+ text = ([ "A" * 10 ] * 3 + [ "B" * 10 ] * 3 + [ "A" * 10 ] * 3) * ". "
62
+ result = splitter.split(text, breakpoint: :percentile, percentile: 75)
63
+ expect(result.count).to eq 18
64
+ expect(result.to_a.join('').count(?A)).to eq text.count(?A)
65
+ expect(result.to_a.join('').count(?B)).to eq text.count(?B)
66
+ end
67
+ end
56
68
  end
@@ -1,5 +1,3 @@
1
- require 'spec_helper'
2
-
3
1
  describe Documentrix::Documents do
4
2
  let :ollama do
5
3
  double('Ollama::Client')
@@ -158,6 +156,24 @@ describe Documentrix::Documents do
158
156
  }.to change { documents.size }.from(1).to(0)
159
157
  end
160
158
 
159
+ it 'can remove sources' do
160
+ allow(ollama).to receive(:embed).at_least(:once).
161
+ and_return(double(embeddings: [ [ 0.1 ] ]))
162
+
163
+ documents.add('foo', source: 'source1')
164
+ documents.add('bar', source: 'source1')
165
+ documents.add('baz', source: 'source2')
166
+
167
+ expect(documents.size).to eq 3
168
+
169
+ documents.source_remove('source1')
170
+
171
+ expect(documents.size).to eq 1
172
+ expect(documents.exist?('baz')).to be true
173
+ expect(documents.exist?('foo')).to be false
174
+ expect(documents.exist?('bar')).to be false
175
+ end
176
+
161
177
  it 'returns collections' do
162
178
  expect(documents.collections).to eq [ :default ]
163
179
  end
@@ -193,4 +209,62 @@ describe Documentrix::Documents do
193
209
  to(:new_collection)
194
210
  end
195
211
  end
212
+
213
+ context 'source management' do
214
+ before do
215
+ allow(documents).to receive(:compute_file_digest).and_return('d1')
216
+ allow(documents.cache).to receive(:compute_file_digest).and_return('d1')
217
+
218
+ allow(ollama).to receive(:embed).and_return(double(embeddings: [[0.1]]))
219
+ end
220
+
221
+ it 'can check if a source exists' do
222
+ documents.add('foo', source: 's1')
223
+ expect(documents.source_exist?('s1')).to be true
224
+ expect(documents.source_exist?('s2')).to be false
225
+ end
226
+
227
+ it 'can determine if a source is modified' do
228
+ documents.add('foo', source: 's1')
229
+
230
+ # Case 1: Source is up to date
231
+ expect(documents.source_modified?('s1')).to be false
232
+
233
+ # Case 2: Source is missing
234
+ expect(documents.source_modified?('s2')).to be true
235
+
236
+ # Case 3: Digest changed - mock both to return the new digest
237
+ allow(documents).to receive(:compute_file_digest).with('s1').and_return('d2')
238
+ allow(documents.cache).to receive(:compute_file_digest).with('s1').and_return('d2')
239
+ expect(documents.source_modified?('s1')).to be true
240
+ end
241
+
242
+ it 'does not update the source if the digest matches' do
243
+ documents.add('foo', source: 's1')
244
+
245
+ expect(ollama).not_to receive(:embed)
246
+ documents.source_update(['foo'], source: 's1')
247
+ expect(documents.exist?('foo')).to be true
248
+ end
249
+
250
+ it 'updates the source if the digest has changed' do
251
+ documents.add('foo', source: 's1')
252
+
253
+ # Simulate a file change by updating the digest
254
+ allow(documents).to receive(:compute_file_digest).with('s1').and_return('d2')
255
+ allow(documents.cache).to receive(:compute_file_digest).with('s1').and_return('d2')
256
+
257
+ expect(ollama).to receive(:embed).once
258
+ documents.source_update(['bar'], source: 's1')
259
+
260
+ expect(documents.exist?('bar')).to be true
261
+ expect(documents.exist?('foo')).to be false
262
+ end
263
+
264
+ it 'updates the source if it is an URL' do
265
+ expect(ollama).to receive(:embed).once
266
+ documents.source_update('foo', source: 'https://www.example.com/s1')
267
+ expect(documents.exist?('foo')).to be true
268
+ end
269
+ end
196
270
  end
@@ -1,5 +1,3 @@
1
- require 'spec_helper'
2
-
3
1
  describe Documentrix::Utils::ColorizeTexts do
4
2
  it 'colorizes texts' do
5
3
  ct = described_class.new(%w[ foo bar ])
@@ -0,0 +1,97 @@
1
+ require 'tempfile'
2
+
3
+ describe Documentrix::Utils::Digests do
4
+ let(:test_class) do
5
+ Class.new do
6
+ include Documentrix::Utils::Digests
7
+ end
8
+ end
9
+
10
+ let(:subject) { test_class.new.expose }
11
+
12
+ describe '#compute_digest' do
13
+ it 'computes a valid SHA256 digest of a string' do
14
+ text = 'hello world'
15
+ expected = Digest::SHA256.hexdigest(text)
16
+ expect(subject.compute_digest(text)).to eq expected
17
+ end
18
+ end
19
+
20
+ describe '#compute_file_digest' do
21
+ it 'returns nil for an empty filename' do
22
+ expect(subject.compute_file_digest(nil)).to be_nil
23
+ expect(subject.compute_file_digest('')).to be_nil
24
+ end
25
+
26
+ it 'returns nil for an absolute URL' do
27
+ expect(subject.compute_file_digest('https://example.com/file.txt')).to be_nil
28
+ end
29
+
30
+ it 'returns nil for a non-existent file' do
31
+ expect(subject.compute_file_digest('/tmp/non_existent_file_12345')).to be_nil
32
+ end
33
+
34
+ it 'computes the digest of a local file' do
35
+ file = Tempfile.create('documentrix_test')
36
+ content = 'file content'
37
+ file.write(content)
38
+ file.close
39
+
40
+ expected = Digest::SHA256.hexdigest(content)
41
+ expect(subject.compute_file_digest(file.path)).to eq expected
42
+ end
43
+
44
+ context 'with caching' do
45
+ let(:file) { Tempfile.create('documentrix_cache_test') }
46
+ let(:content) { 'initial content' }
47
+
48
+ before do
49
+ file.write(content)
50
+ file.close
51
+ subject.file_digest_cache_clear
52
+ end
53
+
54
+ it 'returns the same digest on subsequent calls' do
55
+ digest1 = subject.compute_file_digest(file.path)
56
+ digest2 = subject.compute_file_digest(file.path)
57
+ expect(digest1).to eq digest2
58
+ end
59
+
60
+ it 'recomputes the digest when the file is modified' do
61
+ digest1 = subject.compute_file_digest(file.path)
62
+
63
+ # Update file content and force mtime change
64
+ File.write(file.path, 'updated content')
65
+ # Ensure mtime is actually different (some FS have low precision)
66
+ File.utime(Time.now + 1, Time.now + 1, file.path)
67
+
68
+ digest2 = subject.compute_file_digest(file.path)
69
+ expect(digest1).not_to eq digest2
70
+ end
71
+
72
+ it 'recomputes the digest after cache clear' do
73
+ digest1 = subject.compute_file_digest(file.path)
74
+ subject.file_digest_cache_clear
75
+
76
+ # Even though file hasn't changed, it should re-read and return same value
77
+ digest2 = subject.compute_file_digest(file.path)
78
+ expect(digest1).to eq digest2
79
+ end
80
+ end
81
+ end
82
+
83
+ describe '#file_digest_cache_clear' do
84
+ it 'clears the internal cache' do
85
+ file = Tempfile.create('documentrix_clear_test')
86
+ file.write('test')
87
+ file.close
88
+
89
+ subject.compute_file_digest(file.path)
90
+ subject.file_digest_cache_clear
91
+
92
+ # We can verify this indirectly by checking if the cache is empty
93
+ # or by the fact that it will re-compute in tests.
94
+ expect(subject).to respond_to(:file_digest_cache_clear)
95
+ end
96
+ end
97
+ end
@@ -1,5 +1,3 @@
1
- require 'spec_helper'
2
-
3
1
  describe Documentrix::Utils::Tags do
4
2
  it 'can be instantiated' do
5
3
  expect(described_class.new).to be_a described_class