documentrix 0.2.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  module Documentrix
2
2
  # Documentrix version
3
- VERSION = '0.2.0'
3
+ VERSION = '0.3.1'
4
4
  VERSION_ARRAY = VERSION.split('.').map(&:to_i) # :nodoc:
5
5
  VERSION_MAJOR = VERSION_ARRAY[0] # :nodoc:
6
6
  VERSION_MINOR = VERSION_ARRAY[1] # :nodoc:
@@ -1,5 +1,3 @@
1
- require 'spec_helper'
2
-
3
1
  describe 'Documentrix::Documents::Cache Interface' do
4
2
  describe 'MemoryCache Interface' do
5
3
  let(:cache) { Documentrix::Documents::MemoryCache.new(prefix: 'test-') }
@@ -55,13 +53,22 @@ describe 'Documentrix::Documents::Cache Interface' do
55
53
  expect(cache).to respond_to(:clear_by_source)
56
54
  expect(cache.method(:clear_by_source).owner).to eq Documentrix::Documents::Cache::Common
57
55
 
56
+ expect(cache).to respond_to(:source_exist?)
57
+ expect(cache.method(:source_exist?).owner).to eq Documentrix::Documents::Cache::Common
58
+
58
59
  expect(cache).to respond_to(:clear)
59
60
  expect(cache.method(:clear).owner).to eq Documentrix::Documents::Cache::Common
60
61
  end
61
62
  end
62
63
 
63
64
  describe 'RedisCache Interface' do
64
- let(:cache) { Documentrix::Documents::RedisCache.new(prefix: 'test-', url: 'redis://localhost:6379') }
65
+ let :object_class do
66
+ Documentrix::Documents::Cache::Records::Record
67
+ end
68
+
69
+ let(:cache) do
70
+ Documentrix::Documents::RedisCache.new(prefix: 'test-', url: 'redis://localhost:6379', object_class:)
71
+ end
65
72
 
66
73
  it 'has proper method resolution' do
67
74
  # Basic cache operations
@@ -114,6 +121,9 @@ describe 'Documentrix::Documents::Cache Interface' do
114
121
  expect(cache).to respond_to(:clear_by_source)
115
122
  expect(cache.method(:clear_by_source).owner).to eq Documentrix::Documents::Cache::Common
116
123
 
124
+ expect(cache).to respond_to(:source_exist?)
125
+ expect(cache.method(:source_exist?).owner).to eq Documentrix::Documents::Cache::Common
126
+
117
127
  expect(cache).to respond_to(:clear)
118
128
  expect(cache.method(:clear).owner).to eq Documentrix::Documents::Cache::Common
119
129
 
@@ -177,6 +187,9 @@ describe 'Documentrix::Documents::Cache Interface' do
177
187
  expect(cache).to respond_to(:clear_by_source)
178
188
  expect(cache.method(:clear_by_source).owner).to eq Documentrix::Documents::Cache::SQLiteCache
179
189
 
190
+ expect(cache).to respond_to(:source_exist?)
191
+ expect(cache.method(:source_exist?).owner).to eq Documentrix::Documents::Cache::SQLiteCache
192
+
180
193
  expect(cache).to respond_to(:clear)
181
194
  expect(cache.method(:clear).owner).to eq Documentrix::Documents::Cache::Common
182
195
 
@@ -1,5 +1,3 @@
1
- require 'spec_helper'
2
-
3
1
  describe Documentrix::Documents::MemoryCache do
4
2
  let :prefix do
5
3
  'test-'
@@ -135,4 +133,68 @@ describe Documentrix::Documents::MemoryCache do
135
133
  cache['foo'] = 'bar'
136
134
  expect(cache.to_a).to eq [ %W[ #{prefix}foo bar ] ]
137
135
  end
136
+
137
+ it 'can iterate over unique sources' do
138
+ cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', embedding: [0.1]]
139
+ cache['bar'] = Documentrix::Documents::Record[text: 'bar', source: 's1', embedding: [0.1]]
140
+ cache['baz'] = Documentrix::Documents::Record[text: 'baz', source: 's2', embedding: [0.1]]
141
+
142
+ expect(cache.each_source.to_a).to match_array(['s1', 's2'])
143
+ end
144
+
145
+ it 'can retrieve all unique tags' do
146
+ cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', tags: ['a', 'b'], embedding: [0.1]]
147
+ cache['bar'] = Documentrix::Documents::Record[text: 'bar', source: 's2', tags: ['b', 'c'], embedding: [0.1]]
148
+
149
+ expect(cache.tags.to_a).to match_array(['a', 'b', 'c'])
150
+ end
151
+
152
+ it 'can clear records by tags' do
153
+ cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', tags: ['keep'], embedding: [0.1]]
154
+ cache['bar'] = Documentrix::Documents::Record[text: 'bar', source: 's1', tags: ['trash'], embedding: [0.1]]
155
+
156
+ expect {
157
+ cache.clear_for_tags(['trash'])
158
+ }.to change { cache.size }.from(2).to(1)
159
+ expect(cache.key?('foo')).to be true
160
+ expect(cache.key?('bar')).to be false
161
+ end
162
+
163
+ it 'can check if a source exists' do
164
+ cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', embedding: [0.1]]
165
+
166
+ expect(cache.source_exist?('s1')).to be true
167
+ expect(cache.source_exist?('s2')).to be false
168
+ end
169
+
170
+ it 'can clear by source with a specific digest' do
171
+ cache['f1'] = Documentrix::Documents::Record[text: 'v1', source: 's1', digest: 'd1', embedding: [0.1]]
172
+ cache['f2'] = Documentrix::Documents::Record[text: 'v2', source: 's1', digest: 'd2', embedding: [0.1]]
173
+
174
+ expect {
175
+ cache.clear_by_source('s1', digest: 'd1')
176
+ }.to change { cache.size }.from(2).to(1)
177
+ expect(cache.key?('f2')).to be true
178
+ expect(cache.key?('f1')).to be false
179
+ end
180
+
181
+ it 'can clear outdated versions of a source' do
182
+ cache['f1'] = Documentrix::Documents::Record[text: 'v1', source: 's1', digest: 'd1', embedding: [0.1]]
183
+ cache['f2'] = Documentrix::Documents::Record[text: 'v2', source: 's1', digest: 'd2', embedding: [0.1]]
184
+
185
+ expect {
186
+ cache.clear_by_source('s1', digest: 'd2', operator: '!=')
187
+ }.to change { cache.size }.from(2).to(1)
188
+ expect(cache.key?('f2')).to be true
189
+ expect(cache.key?('f1')).to be false
190
+ end
191
+
192
+ it 'can check if a source exists with a specific digest' do
193
+ cache['f1'] = Documentrix::Documents::Record[text: 'v1', source: 's1', digest: 'd1', embedding: [0.1]]
194
+
195
+ expect(cache.source_exist?('s1', digest: 'd1')).to be true
196
+ expect(cache.source_exist?('s1', digest: 'd2')).to be false
197
+ expect(cache.source_exist?('s1', digest: 'd1', operator: '!=')).to be false
198
+ expect(cache.source_exist?('s1', digest: 'd2', operator: '!=')).to be true
199
+ end
138
200
  end
@@ -1,28 +1,20 @@
1
- require 'spec_helper'
2
-
3
1
  describe Documentrix::Documents::RedisCache do
2
+ let :object_class do
3
+ Documentrix::Documents::Cache::Records::Record
4
+ end
5
+
4
6
  let :prefix do
5
7
  'test-'
6
8
  end
7
9
 
8
10
  let :cache do
9
- described_class.new prefix:, url: 'something'
11
+ described_class.new prefix:, url: 'something', object_class:
10
12
  end
11
13
 
12
14
  it 'can be instantiated' do
13
15
  expect(cache).to be_a described_class
14
16
  end
15
17
 
16
- it 'defaults to nil object_class' do
17
- expect(cache.object_class).to be_nil
18
- end
19
-
20
- it 'can be configured with object_class' do
21
- object_class = Class.new(JSON::GenericObject)
22
- cache = described_class.new(prefix:, url: 'something', object_class:)
23
- expect(cache.object_class).to eq object_class
24
- end
25
-
26
18
  it 'raises ArgumentError if url is missing' do
27
19
  expect {
28
20
  described_class.new prefix:, url: nil
@@ -38,6 +30,10 @@ describe Documentrix::Documents::RedisCache do
38
30
  allow_any_instance_of(described_class).to receive(:redis).and_return(redis)
39
31
  end
40
32
 
33
+ it 'can be configured with object_class' do
34
+ expect(cache.object_class).to eq object_class
35
+ end
36
+
41
37
  it 'has Redis client' do
42
38
  expect(cache.redis).to eq redis
43
39
  end
@@ -62,9 +58,9 @@ describe Documentrix::Documents::RedisCache do
62
58
  end
63
59
 
64
60
  it 'can move prefixes' do
65
- expect(redis).to receive(:get).with(prefix + 'foo').and_return(JSON(foo: true))
66
- expect(redis).to receive(:get).with('test2-bar').and_return(JSON(foo: true))
67
- expect(redis).to receive(:set).with('test3-foo', '{"foo":true}')
61
+ expect(redis).to receive(:get).with(prefix + 'foo').and_return(object_class[foo: true].to_json)
62
+ expect(redis).to receive(:get).with('test2-bar').and_return(object_class[foo: true].to_json)
63
+ expect(redis).to receive(:set).with('test3-foo', /"foo":true/)
68
64
  expect(redis).to receive(:del).with('test-foo')
69
65
  expect(redis).to receive(:scan_each).with(match: ?*).
70
66
  and_yield("#{prefix}foo").
@@ -81,12 +77,12 @@ describe Documentrix::Documents::RedisCache do
81
77
  end
82
78
 
83
79
  it 'can iterate over keys, values' do
84
- key, value = 'foo', { 'test' => true }
85
- expect(redis).to receive(:set).with(prefix + key, JSON(value))
80
+ key, value = 'foo', object_class[test: true]
81
+ expect(redis).to receive(:set).with(prefix + key, object_class[value].to_json)
86
82
  cache[key] = value
87
83
  expect(redis).to receive(:scan_each).with(match: "#{prefix}*").
88
84
  and_yield("#{prefix}foo")
89
- expect(redis).to receive(:get).with(prefix + key).and_return(JSON(test: true))
85
+ expect(redis).to receive(:get).with(prefix + key).and_return(object_class[test: true].to_json)
90
86
  cache.each do |k, v|
91
87
  expect(k).to eq prefix + key
92
88
  expect(v).to eq value
@@ -135,5 +131,58 @@ describe Documentrix::Documents::RedisCache do
135
131
  it 'can remove prefix with unpre' do
136
132
  expect(cache.unpre('test-foo')).to eq 'foo'
137
133
  end
134
+
135
+ it 'can iterate over unique sources' do
136
+ expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
137
+ "#{prefix}foo"
138
+ ).and_yield(
139
+ "#{prefix}bar"
140
+ )
141
+ expect(redis).to receive(:get).with("#{prefix}foo").and_return(JSON(source: 's1'))
142
+ expect(redis).to receive(:get).with("#{prefix}bar").and_return(JSON(source: 's2'))
143
+
144
+ expect(cache.each_source.to_a).to match_array(['s1', 's2'])
145
+ end
146
+
147
+ it 'can retrieve all unique tags' do
148
+ expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
149
+ "#{prefix}foo"
150
+ ).and_yield(
151
+ "#{prefix}bar"
152
+ )
153
+ expect(redis).to receive(:get).with("#{prefix}foo").and_return(JSON(source: 's1', tags: ['a', 'b']))
154
+ expect(redis).to receive(:get).with("#{prefix}bar").and_return(JSON(source: 's2', tags: ['b', 'c']))
155
+
156
+ expect(cache.tags.to_a).to match_array(['a', 'b', 'c'])
157
+ end
158
+
159
+ it 'can clear records by tags' do
160
+ expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
161
+ "#{prefix}foo"
162
+ ).and_yield(
163
+ "#{prefix}bar"
164
+ )
165
+ expect(redis).to receive(:get).with("#{prefix}foo").and_return(JSON(source: 's1', tags: ['trash']))
166
+ expect(redis).to receive(:get).with("#{prefix}bar").and_return(JSON(source: 's2', tags: ['keep']))
167
+ expect(redis).to receive(:del).with("#{prefix}foo")
168
+
169
+ expect(cache.clear_for_tags(['trash'])).to eq cache
170
+ end
171
+
172
+ it 'can check if a source exists with a specific digest' do
173
+ expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
174
+ "#{prefix}foo"
175
+ )
176
+ expect(redis).to receive(:get).with("#{prefix}foo").and_return(JSON(source: 's1', digest: 'd1'))
177
+
178
+ expect(cache.source_exist?('s1', digest: 'd1')).to be true
179
+
180
+ # Reset for the negative case
181
+ expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
182
+ "#{prefix}foo"
183
+ )
184
+ expect(redis).to receive(:get).with("#{prefix}foo").and_return(JSON(source: 's1', digest: 'd1'))
185
+ expect(cache.source_exist?('s1', digest: 'd2')).to be false
186
+ end
138
187
  end
139
188
  end
@@ -1,5 +1,3 @@
1
- require 'spec_helper'
2
-
3
1
  describe Documentrix::Documents::SQLiteCache do
4
2
  let :prefix do
5
3
  'test-'
@@ -145,6 +143,20 @@ describe Documentrix::Documents::SQLiteCache do
145
143
  expect(cache).to be_key 'bar'
146
144
  end
147
145
 
146
+ it 'can clear all without tags' do
147
+ key, value = 'foo', { tags: %w[ foo ], embedding: [ 0.5 ] * 1_024 }
148
+ cache[key] = value
149
+ key, value = 'bar', { embedding: [ 0.5 ] * 1_024 }
150
+ cache[key] = value
151
+ expect {
152
+ expect(cache.clear_for_tags).to eq cache
153
+ }.to change {
154
+ cache.size
155
+ }.from(2).to(0)
156
+ expect(cache).not_to be_key 'foo'
157
+ expect(cache).not_to be_key 'bar'
158
+ end
159
+
148
160
  it 'can clear by source' do
149
161
  val1 = test_value.merge(source: 's1')
150
162
  val2 = test_value.merge(source: 's1')
@@ -159,6 +171,46 @@ describe Documentrix::Documents::SQLiteCache do
159
171
  expect(cache.key?('foo')).to be false
160
172
  end
161
173
 
174
+ it 'can clear by source and digest' do
175
+ allow(cache).to receive(:compute_file_digest).and_return('d1', 'd2', 'd3')
176
+ cache['foo'] = test_value.merge(source: 's1') # d1
177
+ cache['bar'] = test_value.merge(source: 's1') # d2
178
+ cache['baz'] = test_value.merge(source: 's1') # d3
179
+
180
+ # Clear those that match d1
181
+ expect {
182
+ cache.clear_by_source('s1', digest: 'd1')
183
+ }.to change { cache.size }.from(3).to(2)
184
+ expect(cache.key?('foo')).to be false
185
+ expect(cache.key?('bar')).to be true
186
+
187
+ # Clear those that do NOT match d2 (should clear baz)
188
+ expect {
189
+ cache.clear_by_source('s1', digest: 'd2', operator: '!=')
190
+ }.to change { cache.size }.from(2).to(1)
191
+ expect(cache.key?('baz')).to be false
192
+ expect(cache.key?('bar')).to be true
193
+ end
194
+
195
+ describe '#source_exist?' do
196
+ it 'returns true if source exists' do
197
+ cache['foo'] = test_value
198
+ expect(cache.source_exist?('for-test.txt')).to be true
199
+ expect(cache.source_exist?('non-existent')).to be false
200
+ end
201
+
202
+ it 'filters by digest' do
203
+ allow(cache).to receive(:compute_file_digest).and_return('d1', 'd2')
204
+ cache['foo'] = test_value.merge(source: 's1') # d1
205
+ cache['bar'] = test_value.merge(source: 's1') # d2
206
+
207
+ expect(cache.source_exist?('s1', digest: 'd1')).to be true
208
+ expect(cache.source_exist?('s1', digest: 'd3')).to be false
209
+ expect(cache.source_exist?('s1', digest: 'd1', operator: '!=')).to be true # bar exists
210
+ expect(cache.source_exist?('s1', digest: 'd2', operator: '!=')).to be true # foo exists
211
+ end
212
+ end
213
+
162
214
  it 'can return tags' do
163
215
  key, value = 'foo', { tags: %w[ foo ], embedding: [ 0.5 ] * 1_024 }
164
216
  cache[key] = value
@@ -169,6 +221,17 @@ describe Documentrix::Documents::SQLiteCache do
169
221
  expect(tags.to_a).to eq %w[ bar baz foo ]
170
222
  end
171
223
 
224
+ it 'can iterate over unique sources' do
225
+ val1 = test_value.merge(source: 's1')
226
+ val2 = test_value.merge(source: 's1')
227
+ val3 = test_value.merge(source: 's2')
228
+ cache['foo'] = val1
229
+ cache['bar'] = val2
230
+ cache['baz'] = val3
231
+
232
+ expect(cache.each_source.to_a).to match_array(['s1', 's2'])
233
+ end
234
+
172
235
  it 'can iterate over keys under a prefix' do
173
236
  cache['foo'] = test_value
174
237
  expect(cache.each.to_a).to eq [ [ 'test-foo', Documentrix::Documents::Record[test_value] ] ]
@@ -186,4 +249,108 @@ describe Documentrix::Documents::SQLiteCache do
186
249
  ["test2-bar", Documentrix::Documents::Record[test_value] ],
187
250
  ]
188
251
  end
252
+
253
+ describe 'Prefix Isolation' do
254
+ let(:cache2) { cache.dup }
255
+
256
+ before do
257
+ cache2.prefix = 'other-'
258
+
259
+ # Setup shared sources and tags across prefixes
260
+ cache['foo'] = test_value.merge(source: 'shared.txt', tags: %w[ a ])
261
+ cache2['bar'] = test_value.merge(source: 'shared.txt', tags: %w[ a ])
262
+ end
263
+
264
+ it 'does not leak clear_by_source' do
265
+ expect {
266
+ cache.clear_by_source('shared.txt')
267
+ }.to change { cache.size }.from(1).to(0)
268
+
269
+ expect(cache2.size).to eq 1
270
+ expect(cache2.key?('bar')).to be true
271
+ end
272
+
273
+ it 'does not leak source_exist?' do
274
+ # Ensure we are checking a source that ONLY exists in the other prefix
275
+ cache.clear_all_with_prefix
276
+ cache2['baz'] = test_value.merge(source: 'only-in-2.txt')
277
+
278
+ expect(cache.source_exist?('only-in-2.txt')).to be false
279
+ expect(cache2.source_exist?('only-in-2.txt')).to be true
280
+ end
281
+
282
+ it 'does not leak tags' do
283
+ cache.clear_all_with_prefix
284
+ cache2.clear_all_with_prefix
285
+
286
+ cache['foo'] = test_value.merge(tags: %w[ prefix1 ])
287
+ cache2['bar'] = test_value.merge(tags: %w[ prefix2 ])
288
+
289
+ expect(cache.tags.to_a).to match_array(['prefix1'])
290
+ expect(cache2.tags.to_a).to match_array(['prefix2'])
291
+ end
292
+ end
293
+
294
+ describe '#find_records' do
295
+ let(:needle) { [ 0.5 ] * 1_024 }
296
+
297
+ it 'raises ArgumentError if needle length is incorrect' do
298
+ expect {
299
+ cache.find_records([ 0.1 ])
300
+ }.to raise_error(ArgumentError, /needle embedding length/)
301
+ end
302
+
303
+ it 'returns the most similar record' do
304
+ # Record 1: Exact match
305
+ val1 = test_value.merge(text: 'match', embedding: needle)
306
+ # Record 2: Different vector
307
+ val2 = test_value.merge(text: 'diff', embedding: [ 0.1 ] * 1_024)
308
+
309
+ cache['r1'] = val1
310
+ cache['r2'] = val2
311
+
312
+ results = cache.find_records(needle)
313
+
314
+ expect(results.size).to eq 2
315
+ expect(results.first.text).to eq 'match'
316
+ expect(results.first.similarity).to be_within(0.001).of(1.0)
317
+ end
318
+
319
+ it 'filters results by tags' do
320
+ val1 = test_value.merge(text: 'tagged', tags: %w[ a ], embedding: needle)
321
+ val2 = test_value.merge(text: 'untagged', tags: %w[ b ], embedding: needle)
322
+
323
+ cache['r1'] = val1
324
+ cache['r2'] = val2
325
+
326
+ expect(cache.find_records(needle, tags: %w[ a ]).map(&:text)).to eq %w[ tagged ]
327
+ expect(cache.find_records(needle, tags: %w[ b ]).map(&:text)).to eq %w[ untagged ]
328
+ expect(cache.find_records(needle, tags: %w[ c ]).size).to eq 0
329
+ end
330
+
331
+ it 'filters results by min_similarity' do
332
+ # Exact match
333
+ cache['r1'] = test_value.merge(text: 'match', embedding: needle)
334
+ # Very different vector
335
+ cache['r2'] = test_value.merge(text: 'diff', embedding: [ -0.5 ] * 1_024)
336
+
337
+ # Low threshold: both should appear
338
+ expect(cache.find_records(needle, min_similarity: -1).size).to eq 2
339
+
340
+ # High threshold: only match should appear
341
+ expect(cache.find_records(needle, min_similarity: 0.9).map(&:text)).to eq %w[ match ]
342
+ end
343
+
344
+ it 'limits results via max_records' do
345
+ 3.times do |i|
346
+ cache["r#{i}"] = test_value.merge(text: "t#{i}", embedding: needle)
347
+ end
348
+
349
+ expect(cache.find_records(needle, max_records: 2).size).to eq 2
350
+ end
351
+
352
+ it 'returns empty array when no records match' do
353
+ expect(cache.find_records(needle)).to eq []
354
+ end
355
+ end
189
356
  end
@@ -1,5 +1,3 @@
1
- require 'spec_helper'
2
-
3
1
  describe Documentrix::Documents::Splitters::Character do
4
2
  let :splitter do
5
3
  described_class.new chunk_size: 23, combining_string: ''
@@ -50,6 +48,26 @@ describe Documentrix::Documents::Splitters::Character do
50
48
  expect(result.to_a.join('')).to eq ?A * 25
51
49
  end
52
50
 
51
+ context 'with force' do
52
+ let :splitter do
53
+ described_class.new chunk_size: 23, combining_string: '', force: true
54
+ end
55
+
56
+ it 'can split with force' do
57
+ text = [ ?A * 10 ] * 10 * "\n"
58
+ result = splitter.split(text)
59
+ expect(result.count).to eq 10
60
+ expect(result.to_a.join('').count(?A)).to eq text.count(?A)
61
+ end
62
+
63
+ it 'can with force split2' do
64
+ text = ?A * 25
65
+ result = splitter.split(text)
66
+ expect(result.count).to eq 2
67
+ expect(result.to_a.join('')).to eq ?A * 25
68
+ end
69
+ end
70
+
53
71
  it 'can split sentences' do
54
72
  text = "foo.foo. bar!bar! baz?baz? quux.\nquux."
55
73
  splitter = described_class.new(separator: /[.!?]\s*(?:\b|\z)/, chunk_size: 2)
@@ -1,5 +1,3 @@
1
- require 'spec_helper'
2
-
3
1
  describe Documentrix::Documents::Splitters::Semantic do
4
2
  let :ollama do
5
3
  double('Ollama::Client')
@@ -29,11 +27,11 @@ describe Documentrix::Documents::Splitters::Semantic do
29
27
  expect(result.to_a.join('').count(?B)).to eq text.count(?B)
30
28
  end
31
29
 
32
- it 'can split with breakpoint :percentile' do
33
- described_class.new ollama:, model: 'mxbai-embed-large', chunk_size: 50
30
+ it 'can split with breakpoint :percentile, chunk_size 23' do
31
+ splitter = described_class.new ollama:, model: 'mxbai-embed-large', chunk_size: 23
34
32
  text = ([ "A" * 10 ] * 6 + [ "B" * 10 ] * 3 + [ "A" * 10 ] * 3) * ". "
35
33
  result = splitter.split(text, breakpoint: :percentile, percentile: 75)
36
- expect(result.count).to eq 4
34
+ expect(result.count).to eq 11
37
35
  expect(result.to_a.join('').count(?A)).to eq text.count(?A)
38
36
  expect(result.to_a.join('').count(?B)).to eq text.count(?B)
39
37
  end
@@ -53,4 +51,18 @@ describe Documentrix::Documents::Splitters::Semantic do
53
51
  expect(result.to_a.join('').count(?A)).to eq text.count(?A)
54
52
  expect(result.to_a.join('').count(?B)).to eq text.count(?B)
55
53
  end
54
+
55
+ context 'with force' do
56
+ let :splitter do
57
+ described_class.new ollama:, model: 'mxbai-embed-large', chunk_size: 7, force: true
58
+ end
59
+
60
+ it 'can split with force' do
61
+ text = ([ "A" * 10 ] * 3 + [ "B" * 10 ] * 3 + [ "A" * 10 ] * 3) * ". "
62
+ result = splitter.split(text, breakpoint: :percentile, percentile: 75)
63
+ expect(result.count).to eq 18
64
+ expect(result.to_a.join('').count(?A)).to eq text.count(?A)
65
+ expect(result.to_a.join('').count(?B)).to eq text.count(?B)
66
+ end
67
+ end
56
68
  end
@@ -1,5 +1,3 @@
1
- require 'spec_helper'
2
-
3
1
  describe Documentrix::Documents do
4
2
  let :ollama do
5
3
  double('Ollama::Client')
@@ -168,7 +166,7 @@ describe Documentrix::Documents do
168
166
 
169
167
  expect(documents.size).to eq 3
170
168
 
171
- documents.remove('source1')
169
+ documents.source_remove('source1')
172
170
 
173
171
  expect(documents.size).to eq 1
174
172
  expect(documents.exist?('baz')).to be true
@@ -211,4 +209,62 @@ describe Documentrix::Documents do
211
209
  to(:new_collection)
212
210
  end
213
211
  end
212
+
213
+ context 'source management' do
214
+ before do
215
+ allow(documents).to receive(:compute_file_digest).and_return('d1')
216
+ allow(documents.cache).to receive(:compute_file_digest).and_return('d1')
217
+
218
+ allow(ollama).to receive(:embed).and_return(double(embeddings: [[0.1]]))
219
+ end
220
+
221
+ it 'can check if a source exists' do
222
+ documents.add('foo', source: 's1')
223
+ expect(documents.source_exist?('s1')).to be true
224
+ expect(documents.source_exist?('s2')).to be false
225
+ end
226
+
227
+ it 'can determine if a source is modified' do
228
+ documents.add('foo', source: 's1')
229
+
230
+ # Case 1: Source is up to date
231
+ expect(documents.source_modified?('s1')).to be false
232
+
233
+ # Case 2: Source is missing
234
+ expect(documents.source_modified?('s2')).to be true
235
+
236
+ # Case 3: Digest changed - mock both to return the new digest
237
+ allow(documents).to receive(:compute_file_digest).with('s1').and_return('d2')
238
+ allow(documents.cache).to receive(:compute_file_digest).with('s1').and_return('d2')
239
+ expect(documents.source_modified?('s1')).to be true
240
+ end
241
+
242
+ it 'does not update the source if the digest matches' do
243
+ documents.add('foo', source: 's1')
244
+
245
+ expect(ollama).not_to receive(:embed)
246
+ documents.source_update(['foo'], source: 's1')
247
+ expect(documents.exist?('foo')).to be true
248
+ end
249
+
250
+ it 'updates the source if the digest has changed' do
251
+ documents.add('foo', source: 's1')
252
+
253
+ # Simulate a file change by updating the digest
254
+ allow(documents).to receive(:compute_file_digest).with('s1').and_return('d2')
255
+ allow(documents.cache).to receive(:compute_file_digest).with('s1').and_return('d2')
256
+
257
+ expect(ollama).to receive(:embed).once
258
+ documents.source_update(['bar'], source: 's1')
259
+
260
+ expect(documents.exist?('bar')).to be true
261
+ expect(documents.exist?('foo')).to be false
262
+ end
263
+
264
+ it 'updates the source if it is an URL' do
265
+ expect(ollama).to receive(:embed).once
266
+ documents.source_update('foo', source: 'https://www.example.com/s1')
267
+ expect(documents.exist?('foo')).to be true
268
+ end
269
+ end
214
270
  end
@@ -1,5 +1,3 @@
1
- require 'spec_helper'
2
-
3
1
  describe Documentrix::Utils::ColorizeTexts do
4
2
  it 'colorizes texts' do
5
3
  ct = described_class.new(%w[ foo bar ])