documentrix 0.2.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGES.md +80 -0
- data/documentrix.gemspec +5 -5
- data/lib/documentrix/documents/cache/common.rb +63 -11
- data/lib/documentrix/documents/cache/records.rb +1 -1
- data/lib/documentrix/documents/cache/redis_cache.rb +3 -3
- data/lib/documentrix/documents/cache/sqlite_cache.rb +132 -33
- data/lib/documentrix/documents/splitters/character.rb +56 -4
- data/lib/documentrix/documents/splitters/common.rb +38 -0
- data/lib/documentrix/documents/splitters/semantic.rb +67 -8
- data/lib/documentrix/documents.rb +133 -29
- data/lib/documentrix/utils/colorize_texts.rb +25 -21
- data/lib/documentrix/utils/digests.rb +78 -0
- data/lib/documentrix/utils.rb +1 -0
- data/lib/documentrix/version.rb +1 -1
- data/spec/documentrix/documents/cache/interface_spec.rb +16 -3
- data/spec/documentrix/documents/cache/memory_cache_spec.rb +64 -2
- data/spec/documentrix/documents/cache/redis_cache_spec.rb +68 -19
- data/spec/documentrix/documents/cache/sqlite_cache_spec.rb +169 -2
- data/spec/documentrix/documents/splitters/character_spec.rb +20 -2
- data/spec/documentrix/documents/splitters/semantic_spec.rb +17 -5
- data/spec/documents_spec.rb +59 -3
- data/spec/utils/colorize_texts_spec.rb +0 -2
- data/spec/utils/digests_spec.rb +97 -0
- data/spec/utils/tags_spec.rb +0 -2
- metadata +7 -1
data/lib/documentrix/version.rb
CHANGED
|
data/spec/documentrix/documents/cache/interface_spec.rb
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
require 'spec_helper'
|
|
2
|
-
|
|
3
1
|
describe 'Documentrix::Documents::Cache Interface' do
|
|
4
2
|
describe 'MemoryCache Interface' do
|
|
5
3
|
let(:cache) { Documentrix::Documents::MemoryCache.new(prefix: 'test-') }
|
|
@@ -55,13 +53,22 @@ describe 'Documentrix::Documents::Cache Interface' do
|
|
|
55
53
|
expect(cache).to respond_to(:clear_by_source)
|
|
56
54
|
expect(cache.method(:clear_by_source).owner).to eq Documentrix::Documents::Cache::Common
|
|
57
55
|
|
|
56
|
+
expect(cache).to respond_to(:source_exist?)
|
|
57
|
+
expect(cache.method(:source_exist?).owner).to eq Documentrix::Documents::Cache::Common
|
|
58
|
+
|
|
58
59
|
expect(cache).to respond_to(:clear)
|
|
59
60
|
expect(cache.method(:clear).owner).to eq Documentrix::Documents::Cache::Common
|
|
60
61
|
end
|
|
61
62
|
end
|
|
62
63
|
|
|
63
64
|
describe 'RedisCache Interface' do
|
|
64
|
-
let
|
|
65
|
+
let :object_class do
|
|
66
|
+
Documentrix::Documents::Cache::Records::Record
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
let(:cache) do
|
|
70
|
+
Documentrix::Documents::RedisCache.new(prefix: 'test-', url: 'redis://localhost:6379', object_class:)
|
|
71
|
+
end
|
|
65
72
|
|
|
66
73
|
it 'has proper method resolution' do
|
|
67
74
|
# Basic cache operations
|
|
@@ -114,6 +121,9 @@ describe 'Documentrix::Documents::Cache Interface' do
|
|
|
114
121
|
expect(cache).to respond_to(:clear_by_source)
|
|
115
122
|
expect(cache.method(:clear_by_source).owner).to eq Documentrix::Documents::Cache::Common
|
|
116
123
|
|
|
124
|
+
expect(cache).to respond_to(:source_exist?)
|
|
125
|
+
expect(cache.method(:source_exist?).owner).to eq Documentrix::Documents::Cache::Common
|
|
126
|
+
|
|
117
127
|
expect(cache).to respond_to(:clear)
|
|
118
128
|
expect(cache.method(:clear).owner).to eq Documentrix::Documents::Cache::Common
|
|
119
129
|
|
|
@@ -177,6 +187,9 @@ describe 'Documentrix::Documents::Cache Interface' do
|
|
|
177
187
|
expect(cache).to respond_to(:clear_by_source)
|
|
178
188
|
expect(cache.method(:clear_by_source).owner).to eq Documentrix::Documents::Cache::SQLiteCache
|
|
179
189
|
|
|
190
|
+
expect(cache).to respond_to(:source_exist?)
|
|
191
|
+
expect(cache.method(:source_exist?).owner).to eq Documentrix::Documents::Cache::SQLiteCache
|
|
192
|
+
|
|
180
193
|
expect(cache).to respond_to(:clear)
|
|
181
194
|
expect(cache.method(:clear).owner).to eq Documentrix::Documents::Cache::Common
|
|
182
195
|
|
|
data/spec/documentrix/documents/cache/memory_cache_spec.rb
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
require 'spec_helper'
|
|
2
|
-
|
|
3
1
|
describe Documentrix::Documents::MemoryCache do
|
|
4
2
|
let :prefix do
|
|
5
3
|
'test-'
|
|
@@ -135,4 +133,68 @@ describe Documentrix::Documents::MemoryCache do
|
|
|
135
133
|
cache['foo'] = 'bar'
|
|
136
134
|
expect(cache.to_a).to eq [ %W[ #{prefix}foo bar ] ]
|
|
137
135
|
end
|
|
136
|
+
|
|
137
|
+
it 'can iterate over unique sources' do
|
|
138
|
+
cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', embedding: [0.1]]
|
|
139
|
+
cache['bar'] = Documentrix::Documents::Record[text: 'bar', source: 's1', embedding: [0.1]]
|
|
140
|
+
cache['baz'] = Documentrix::Documents::Record[text: 'baz', source: 's2', embedding: [0.1]]
|
|
141
|
+
|
|
142
|
+
expect(cache.each_source.to_a).to match_array(['s1', 's2'])
|
|
143
|
+
end
|
|
144
|
+
|
|
145
|
+
it 'can retrieve all unique tags' do
|
|
146
|
+
cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', tags: ['a', 'b'], embedding: [0.1]]
|
|
147
|
+
cache['bar'] = Documentrix::Documents::Record[text: 'bar', source: 's2', tags: ['b', 'c'], embedding: [0.1]]
|
|
148
|
+
|
|
149
|
+
expect(cache.tags.to_a).to match_array(['a', 'b', 'c'])
|
|
150
|
+
end
|
|
151
|
+
|
|
152
|
+
it 'can clear records by tags' do
|
|
153
|
+
cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', tags: ['keep'], embedding: [0.1]]
|
|
154
|
+
cache['bar'] = Documentrix::Documents::Record[text: 'bar', source: 's1', tags: ['trash'], embedding: [0.1]]
|
|
155
|
+
|
|
156
|
+
expect {
|
|
157
|
+
cache.clear_for_tags(['trash'])
|
|
158
|
+
}.to change { cache.size }.from(2).to(1)
|
|
159
|
+
expect(cache.key?('foo')).to be true
|
|
160
|
+
expect(cache.key?('bar')).to be false
|
|
161
|
+
end
|
|
162
|
+
|
|
163
|
+
it 'can check if a source exists' do
|
|
164
|
+
cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', embedding: [0.1]]
|
|
165
|
+
|
|
166
|
+
expect(cache.source_exist?('s1')).to be true
|
|
167
|
+
expect(cache.source_exist?('s2')).to be false
|
|
168
|
+
end
|
|
169
|
+
|
|
170
|
+
it 'can clear by source with a specific digest' do
|
|
171
|
+
cache['f1'] = Documentrix::Documents::Record[text: 'v1', source: 's1', digest: 'd1', embedding: [0.1]]
|
|
172
|
+
cache['f2'] = Documentrix::Documents::Record[text: 'v2', source: 's1', digest: 'd2', embedding: [0.1]]
|
|
173
|
+
|
|
174
|
+
expect {
|
|
175
|
+
cache.clear_by_source('s1', digest: 'd1')
|
|
176
|
+
}.to change { cache.size }.from(2).to(1)
|
|
177
|
+
expect(cache.key?('f2')).to be true
|
|
178
|
+
expect(cache.key?('f1')).to be false
|
|
179
|
+
end
|
|
180
|
+
|
|
181
|
+
it 'can clear outdated versions of a source' do
|
|
182
|
+
cache['f1'] = Documentrix::Documents::Record[text: 'v1', source: 's1', digest: 'd1', embedding: [0.1]]
|
|
183
|
+
cache['f2'] = Documentrix::Documents::Record[text: 'v2', source: 's1', digest: 'd2', embedding: [0.1]]
|
|
184
|
+
|
|
185
|
+
expect {
|
|
186
|
+
cache.clear_by_source('s1', digest: 'd2', operator: '!=')
|
|
187
|
+
}.to change { cache.size }.from(2).to(1)
|
|
188
|
+
expect(cache.key?('f2')).to be true
|
|
189
|
+
expect(cache.key?('f1')).to be false
|
|
190
|
+
end
|
|
191
|
+
|
|
192
|
+
it 'can check if a source exists with a specific digest' do
|
|
193
|
+
cache['f1'] = Documentrix::Documents::Record[text: 'v1', source: 's1', digest: 'd1', embedding: [0.1]]
|
|
194
|
+
|
|
195
|
+
expect(cache.source_exist?('s1', digest: 'd1')).to be true
|
|
196
|
+
expect(cache.source_exist?('s1', digest: 'd2')).to be false
|
|
197
|
+
expect(cache.source_exist?('s1', digest: 'd1', operator: '!=')).to be false
|
|
198
|
+
expect(cache.source_exist?('s1', digest: 'd2', operator: '!=')).to be true
|
|
199
|
+
end
|
|
138
200
|
end
|
|
data/spec/documentrix/documents/cache/redis_cache_spec.rb
CHANGED
|
@@ -1,28 +1,20 @@
|
|
|
1
|
-
require 'spec_helper'
|
|
2
|
-
|
|
3
1
|
describe Documentrix::Documents::RedisCache do
|
|
2
|
+
let :object_class do
|
|
3
|
+
Documentrix::Documents::Cache::Records::Record
|
|
4
|
+
end
|
|
5
|
+
|
|
4
6
|
let :prefix do
|
|
5
7
|
'test-'
|
|
6
8
|
end
|
|
7
9
|
|
|
8
10
|
let :cache do
|
|
9
|
-
described_class.new prefix:, url: 'something'
|
|
11
|
+
described_class.new prefix:, url: 'something', object_class:
|
|
10
12
|
end
|
|
11
13
|
|
|
12
14
|
it 'can be instantiated' do
|
|
13
15
|
expect(cache).to be_a described_class
|
|
14
16
|
end
|
|
15
17
|
|
|
16
|
-
it 'defaults to nil object_class' do
|
|
17
|
-
expect(cache.object_class).to be_nil
|
|
18
|
-
end
|
|
19
|
-
|
|
20
|
-
it 'can be configured with object_class' do
|
|
21
|
-
object_class = Class.new(JSON::GenericObject)
|
|
22
|
-
cache = described_class.new(prefix:, url: 'something', object_class:)
|
|
23
|
-
expect(cache.object_class).to eq object_class
|
|
24
|
-
end
|
|
25
|
-
|
|
26
18
|
it 'raises ArgumentError if url is missing' do
|
|
27
19
|
expect {
|
|
28
20
|
described_class.new prefix:, url: nil
|
|
@@ -38,6 +30,10 @@ describe Documentrix::Documents::RedisCache do
|
|
|
38
30
|
allow_any_instance_of(described_class).to receive(:redis).and_return(redis)
|
|
39
31
|
end
|
|
40
32
|
|
|
33
|
+
it 'can be configured with object_class' do
|
|
34
|
+
expect(cache.object_class).to eq object_class
|
|
35
|
+
end
|
|
36
|
+
|
|
41
37
|
it 'has Redis client' do
|
|
42
38
|
expect(cache.redis).to eq redis
|
|
43
39
|
end
|
|
@@ -62,9 +58,9 @@ describe Documentrix::Documents::RedisCache do
|
|
|
62
58
|
end
|
|
63
59
|
|
|
64
60
|
it 'can move prefixes' do
|
|
65
|
-
expect(redis).to receive(:get).with(prefix + 'foo').and_return(
|
|
66
|
-
expect(redis).to receive(:get).with('test2-bar').and_return(
|
|
67
|
-
expect(redis).to receive(:set).with('test3-foo',
|
|
61
|
+
expect(redis).to receive(:get).with(prefix + 'foo').and_return(object_class[foo: true].to_json)
|
|
62
|
+
expect(redis).to receive(:get).with('test2-bar').and_return(object_class[foo: true].to_json)
|
|
63
|
+
expect(redis).to receive(:set).with('test3-foo', /"foo":true/)
|
|
68
64
|
expect(redis).to receive(:del).with('test-foo')
|
|
69
65
|
expect(redis).to receive(:scan_each).with(match: ?*).
|
|
70
66
|
and_yield("#{prefix}foo").
|
|
@@ -81,12 +77,12 @@ describe Documentrix::Documents::RedisCache do
|
|
|
81
77
|
end
|
|
82
78
|
|
|
83
79
|
it 'can iterate over keys, values' do
|
|
84
|
-
key, value = 'foo',
|
|
85
|
-
expect(redis).to receive(:set).with(prefix + key,
|
|
80
|
+
key, value = 'foo', object_class[test: true]
|
|
81
|
+
expect(redis).to receive(:set).with(prefix + key, object_class[value].to_json)
|
|
86
82
|
cache[key] = value
|
|
87
83
|
expect(redis).to receive(:scan_each).with(match: "#{prefix}*").
|
|
88
84
|
and_yield("#{prefix}foo")
|
|
89
|
-
expect(redis).to receive(:get).with(prefix + key).and_return(
|
|
85
|
+
expect(redis).to receive(:get).with(prefix + key).and_return(object_class[test: true].to_json)
|
|
90
86
|
cache.each do |k, v|
|
|
91
87
|
expect(k).to eq prefix + key
|
|
92
88
|
expect(v).to eq value
|
|
@@ -135,5 +131,58 @@ describe Documentrix::Documents::RedisCache do
|
|
|
135
131
|
it 'can remove prefix with unpre' do
|
|
136
132
|
expect(cache.unpre('test-foo')).to eq 'foo'
|
|
137
133
|
end
|
|
134
|
+
|
|
135
|
+
it 'can iterate over unique sources' do
|
|
136
|
+
expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
|
|
137
|
+
"#{prefix}foo"
|
|
138
|
+
).and_yield(
|
|
139
|
+
"#{prefix}bar"
|
|
140
|
+
)
|
|
141
|
+
expect(redis).to receive(:get).with("#{prefix}foo").and_return(JSON(source: 's1'))
|
|
142
|
+
expect(redis).to receive(:get).with("#{prefix}bar").and_return(JSON(source: 's2'))
|
|
143
|
+
|
|
144
|
+
expect(cache.each_source.to_a).to match_array(['s1', 's2'])
|
|
145
|
+
end
|
|
146
|
+
|
|
147
|
+
it 'can retrieve all unique tags' do
|
|
148
|
+
expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
|
|
149
|
+
"#{prefix}foo"
|
|
150
|
+
).and_yield(
|
|
151
|
+
"#{prefix}bar"
|
|
152
|
+
)
|
|
153
|
+
expect(redis).to receive(:get).with("#{prefix}foo").and_return(JSON(source: 's1', tags: ['a', 'b']))
|
|
154
|
+
expect(redis).to receive(:get).with("#{prefix}bar").and_return(JSON(source: 's2', tags: ['b', 'c']))
|
|
155
|
+
|
|
156
|
+
expect(cache.tags.to_a).to match_array(['a', 'b', 'c'])
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
it 'can clear records by tags' do
|
|
160
|
+
expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
|
|
161
|
+
"#{prefix}foo"
|
|
162
|
+
).and_yield(
|
|
163
|
+
"#{prefix}bar"
|
|
164
|
+
)
|
|
165
|
+
expect(redis).to receive(:get).with("#{prefix}foo").and_return(JSON(source: 's1', tags: ['trash']))
|
|
166
|
+
expect(redis).to receive(:get).with("#{prefix}bar").and_return(JSON(source: 's2', tags: ['keep']))
|
|
167
|
+
expect(redis).to receive(:del).with("#{prefix}foo")
|
|
168
|
+
|
|
169
|
+
expect(cache.clear_for_tags(['trash'])).to eq cache
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
it 'can check if a source exists with a specific digest' do
|
|
173
|
+
expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
|
|
174
|
+
"#{prefix}foo"
|
|
175
|
+
)
|
|
176
|
+
expect(redis).to receive(:get).with("#{prefix}foo").and_return(JSON(source: 's1', digest: 'd1'))
|
|
177
|
+
|
|
178
|
+
expect(cache.source_exist?('s1', digest: 'd1')).to be true
|
|
179
|
+
|
|
180
|
+
# Reset for the negative case
|
|
181
|
+
expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
|
|
182
|
+
"#{prefix}foo"
|
|
183
|
+
)
|
|
184
|
+
expect(redis).to receive(:get).with("#{prefix}foo").and_return(JSON(source: 's1', digest: 'd1'))
|
|
185
|
+
expect(cache.source_exist?('s1', digest: 'd2')).to be false
|
|
186
|
+
end
|
|
138
187
|
end
|
|
139
188
|
end
|
|
data/spec/documentrix/documents/cache/sqlite_cache_spec.rb
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
require 'spec_helper'
|
|
2
|
-
|
|
3
1
|
describe Documentrix::Documents::SQLiteCache do
|
|
4
2
|
let :prefix do
|
|
5
3
|
'test-'
|
|
@@ -145,6 +143,20 @@ describe Documentrix::Documents::SQLiteCache do
|
|
|
145
143
|
expect(cache).to be_key 'bar'
|
|
146
144
|
end
|
|
147
145
|
|
|
146
|
+
it 'can clear all without tags' do
|
|
147
|
+
key, value = 'foo', { tags: %w[ foo ], embedding: [ 0.5 ] * 1_024 }
|
|
148
|
+
cache[key] = value
|
|
149
|
+
key, value = 'bar', { embedding: [ 0.5 ] * 1_024 }
|
|
150
|
+
cache[key] = value
|
|
151
|
+
expect {
|
|
152
|
+
expect(cache.clear_for_tags).to eq cache
|
|
153
|
+
}.to change {
|
|
154
|
+
cache.size
|
|
155
|
+
}.from(2).to(0)
|
|
156
|
+
expect(cache).not_to be_key 'foo'
|
|
157
|
+
expect(cache).not_to be_key 'bar'
|
|
158
|
+
end
|
|
159
|
+
|
|
148
160
|
it 'can clear by source' do
|
|
149
161
|
val1 = test_value.merge(source: 's1')
|
|
150
162
|
val2 = test_value.merge(source: 's1')
|
|
@@ -159,6 +171,46 @@ describe Documentrix::Documents::SQLiteCache do
|
|
|
159
171
|
expect(cache.key?('foo')).to be false
|
|
160
172
|
end
|
|
161
173
|
|
|
174
|
+
it 'can clear by source and digest' do
|
|
175
|
+
allow(cache).to receive(:compute_file_digest).and_return('d1', 'd2', 'd3')
|
|
176
|
+
cache['foo'] = test_value.merge(source: 's1') # d1
|
|
177
|
+
cache['bar'] = test_value.merge(source: 's1') # d2
|
|
178
|
+
cache['baz'] = test_value.merge(source: 's1') # d3
|
|
179
|
+
|
|
180
|
+
# Clear those that match d1
|
|
181
|
+
expect {
|
|
182
|
+
cache.clear_by_source('s1', digest: 'd1')
|
|
183
|
+
}.to change { cache.size }.from(3).to(2)
|
|
184
|
+
expect(cache.key?('foo')).to be false
|
|
185
|
+
expect(cache.key?('bar')).to be true
|
|
186
|
+
|
|
187
|
+
# Clear those that do NOT match d2 (should clear baz)
|
|
188
|
+
expect {
|
|
189
|
+
cache.clear_by_source('s1', digest: 'd2', operator: '!=')
|
|
190
|
+
}.to change { cache.size }.from(2).to(1)
|
|
191
|
+
expect(cache.key?('baz')).to be false
|
|
192
|
+
expect(cache.key?('bar')).to be true
|
|
193
|
+
end
|
|
194
|
+
|
|
195
|
+
describe '#source_exist?' do
|
|
196
|
+
it 'returns true if source exists' do
|
|
197
|
+
cache['foo'] = test_value
|
|
198
|
+
expect(cache.source_exist?('for-test.txt')).to be true
|
|
199
|
+
expect(cache.source_exist?('non-existent')).to be false
|
|
200
|
+
end
|
|
201
|
+
|
|
202
|
+
it 'filters by digest' do
|
|
203
|
+
allow(cache).to receive(:compute_file_digest).and_return('d1', 'd2')
|
|
204
|
+
cache['foo'] = test_value.merge(source: 's1') # d1
|
|
205
|
+
cache['bar'] = test_value.merge(source: 's1') # d2
|
|
206
|
+
|
|
207
|
+
expect(cache.source_exist?('s1', digest: 'd1')).to be true
|
|
208
|
+
expect(cache.source_exist?('s1', digest: 'd3')).to be false
|
|
209
|
+
expect(cache.source_exist?('s1', digest: 'd1', operator: '!=')).to be true # bar exists
|
|
210
|
+
expect(cache.source_exist?('s1', digest: 'd2', operator: '!=')).to be true # foo exists
|
|
211
|
+
end
|
|
212
|
+
end
|
|
213
|
+
|
|
162
214
|
it 'can return tags' do
|
|
163
215
|
key, value = 'foo', { tags: %w[ foo ], embedding: [ 0.5 ] * 1_024 }
|
|
164
216
|
cache[key] = value
|
|
@@ -169,6 +221,17 @@ describe Documentrix::Documents::SQLiteCache do
|
|
|
169
221
|
expect(tags.to_a).to eq %w[ bar baz foo ]
|
|
170
222
|
end
|
|
171
223
|
|
|
224
|
+
it 'can iterate over unique sources' do
|
|
225
|
+
val1 = test_value.merge(source: 's1')
|
|
226
|
+
val2 = test_value.merge(source: 's1')
|
|
227
|
+
val3 = test_value.merge(source: 's2')
|
|
228
|
+
cache['foo'] = val1
|
|
229
|
+
cache['bar'] = val2
|
|
230
|
+
cache['baz'] = val3
|
|
231
|
+
|
|
232
|
+
expect(cache.each_source.to_a).to match_array(['s1', 's2'])
|
|
233
|
+
end
|
|
234
|
+
|
|
172
235
|
it 'can iterate over keys under a prefix' do
|
|
173
236
|
cache['foo'] = test_value
|
|
174
237
|
expect(cache.each.to_a).to eq [ [ 'test-foo', Documentrix::Documents::Record[test_value] ] ]
|
|
@@ -186,4 +249,108 @@ describe Documentrix::Documents::SQLiteCache do
|
|
|
186
249
|
["test2-bar", Documentrix::Documents::Record[test_value] ],
|
|
187
250
|
]
|
|
188
251
|
end
|
|
252
|
+
|
|
253
|
+
describe 'Prefix Isolation' do
|
|
254
|
+
let(:cache2) { cache.dup }
|
|
255
|
+
|
|
256
|
+
before do
|
|
257
|
+
cache2.prefix = 'other-'
|
|
258
|
+
|
|
259
|
+
# Setup shared sources and tags across prefixes
|
|
260
|
+
cache['foo'] = test_value.merge(source: 'shared.txt', tags: %w[ a ])
|
|
261
|
+
cache2['bar'] = test_value.merge(source: 'shared.txt', tags: %w[ a ])
|
|
262
|
+
end
|
|
263
|
+
|
|
264
|
+
it 'does not leak clear_by_source' do
|
|
265
|
+
expect {
|
|
266
|
+
cache.clear_by_source('shared.txt')
|
|
267
|
+
}.to change { cache.size }.from(1).to(0)
|
|
268
|
+
|
|
269
|
+
expect(cache2.size).to eq 1
|
|
270
|
+
expect(cache2.key?('bar')).to be true
|
|
271
|
+
end
|
|
272
|
+
|
|
273
|
+
it 'does not leak source_exist?' do
|
|
274
|
+
# Ensure we are checking a source that ONLY exists in the other prefix
|
|
275
|
+
cache.clear_all_with_prefix
|
|
276
|
+
cache2['baz'] = test_value.merge(source: 'only-in-2.txt')
|
|
277
|
+
|
|
278
|
+
expect(cache.source_exist?('only-in-2.txt')).to be false
|
|
279
|
+
expect(cache2.source_exist?('only-in-2.txt')).to be true
|
|
280
|
+
end
|
|
281
|
+
|
|
282
|
+
it 'does not leak tags' do
|
|
283
|
+
cache.clear_all_with_prefix
|
|
284
|
+
cache2.clear_all_with_prefix
|
|
285
|
+
|
|
286
|
+
cache['foo'] = test_value.merge(tags: %w[ prefix1 ])
|
|
287
|
+
cache2['bar'] = test_value.merge(tags: %w[ prefix2 ])
|
|
288
|
+
|
|
289
|
+
expect(cache.tags.to_a).to match_array(['prefix1'])
|
|
290
|
+
expect(cache2.tags.to_a).to match_array(['prefix2'])
|
|
291
|
+
end
|
|
292
|
+
end
|
|
293
|
+
|
|
294
|
+
describe '#find_records' do
|
|
295
|
+
let(:needle) { [ 0.5 ] * 1_024 }
|
|
296
|
+
|
|
297
|
+
it 'raises ArgumentError if needle length is incorrect' do
|
|
298
|
+
expect {
|
|
299
|
+
cache.find_records([ 0.1 ])
|
|
300
|
+
}.to raise_error(ArgumentError, /needle embedding length/)
|
|
301
|
+
end
|
|
302
|
+
|
|
303
|
+
it 'returns the most similar record' do
|
|
304
|
+
# Record 1: Exact match
|
|
305
|
+
val1 = test_value.merge(text: 'match', embedding: needle)
|
|
306
|
+
# Record 2: Different vector
|
|
307
|
+
val2 = test_value.merge(text: 'diff', embedding: [ 0.1 ] * 1_024)
|
|
308
|
+
|
|
309
|
+
cache['r1'] = val1
|
|
310
|
+
cache['r2'] = val2
|
|
311
|
+
|
|
312
|
+
results = cache.find_records(needle)
|
|
313
|
+
|
|
314
|
+
expect(results.size).to eq 2
|
|
315
|
+
expect(results.first.text).to eq 'match'
|
|
316
|
+
expect(results.first.similarity).to be_within(0.001).of(1.0)
|
|
317
|
+
end
|
|
318
|
+
|
|
319
|
+
it 'filters results by tags' do
|
|
320
|
+
val1 = test_value.merge(text: 'tagged', tags: %w[ a ], embedding: needle)
|
|
321
|
+
val2 = test_value.merge(text: 'untagged', tags: %w[ b ], embedding: needle)
|
|
322
|
+
|
|
323
|
+
cache['r1'] = val1
|
|
324
|
+
cache['r2'] = val2
|
|
325
|
+
|
|
326
|
+
expect(cache.find_records(needle, tags: %w[ a ]).map(&:text)).to eq %w[ tagged ]
|
|
327
|
+
expect(cache.find_records(needle, tags: %w[ b ]).map(&:text)).to eq %w[ untagged ]
|
|
328
|
+
expect(cache.find_records(needle, tags: %w[ c ]).size).to eq 0
|
|
329
|
+
end
|
|
330
|
+
|
|
331
|
+
it 'filters results by min_similarity' do
|
|
332
|
+
# Exact match
|
|
333
|
+
cache['r1'] = test_value.merge(text: 'match', embedding: needle)
|
|
334
|
+
# Very different vector
|
|
335
|
+
cache['r2'] = test_value.merge(text: 'diff', embedding: [ -0.5 ] * 1_024)
|
|
336
|
+
|
|
337
|
+
# Low threshold: both should appear
|
|
338
|
+
expect(cache.find_records(needle, min_similarity: -1).size).to eq 2
|
|
339
|
+
|
|
340
|
+
# High threshold: only match should appear
|
|
341
|
+
expect(cache.find_records(needle, min_similarity: 0.9).map(&:text)).to eq %w[ match ]
|
|
342
|
+
end
|
|
343
|
+
|
|
344
|
+
it 'limits results via max_records' do
|
|
345
|
+
3.times do |i|
|
|
346
|
+
cache["r#{i}"] = test_value.merge(text: "t#{i}", embedding: needle)
|
|
347
|
+
end
|
|
348
|
+
|
|
349
|
+
expect(cache.find_records(needle, max_records: 2).size).to eq 2
|
|
350
|
+
end
|
|
351
|
+
|
|
352
|
+
it 'returns empty array when no records match' do
|
|
353
|
+
expect(cache.find_records(needle)).to eq []
|
|
354
|
+
end
|
|
355
|
+
end
|
|
189
356
|
end
|
|
data/spec/documentrix/documents/splitters/character_spec.rb
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
require 'spec_helper'
|
|
2
|
-
|
|
3
1
|
describe Documentrix::Documents::Splitters::Character do
|
|
4
2
|
let :splitter do
|
|
5
3
|
described_class.new chunk_size: 23, combining_string: ''
|
|
@@ -50,6 +48,26 @@ describe Documentrix::Documents::Splitters::Character do
|
|
|
50
48
|
expect(result.to_a.join('')).to eq ?A * 25
|
|
51
49
|
end
|
|
52
50
|
|
|
51
|
+
context 'with force' do
|
|
52
|
+
let :splitter do
|
|
53
|
+
described_class.new chunk_size: 23, combining_string: '', force: true
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
it 'can split with force' do
|
|
57
|
+
text = [ ?A * 10 ] * 10 * "\n"
|
|
58
|
+
result = splitter.split(text)
|
|
59
|
+
expect(result.count).to eq 10
|
|
60
|
+
expect(result.to_a.join('').count(?A)).to eq text.count(?A)
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
it 'can with force split2' do
|
|
64
|
+
text = ?A * 25
|
|
65
|
+
result = splitter.split(text)
|
|
66
|
+
expect(result.count).to eq 2
|
|
67
|
+
expect(result.to_a.join('')).to eq ?A * 25
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
|
|
53
71
|
it 'can split sentences' do
|
|
54
72
|
text = "foo.foo. bar!bar! baz?baz? quux.\nquux."
|
|
55
73
|
splitter = described_class.new(separator: /[.!?]\s*(?:\b|\z)/, chunk_size: 2)
|
|
data/spec/documentrix/documents/splitters/semantic_spec.rb
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
require 'spec_helper'
|
|
2
|
-
|
|
3
1
|
describe Documentrix::Documents::Splitters::Semantic do
|
|
4
2
|
let :ollama do
|
|
5
3
|
double('Ollama::Client')
|
|
@@ -29,11 +27,11 @@ describe Documentrix::Documents::Splitters::Semantic do
|
|
|
29
27
|
expect(result.to_a.join('').count(?B)).to eq text.count(?B)
|
|
30
28
|
end
|
|
31
29
|
|
|
32
|
-
it 'can split with breakpoint :percentile' do
|
|
33
|
-
described_class.new ollama:, model: 'mxbai-embed-large', chunk_size:
|
|
30
|
+
it 'can split with breakpoint :percentile, chunk_size 23' do
|
|
31
|
+
splitter = described_class.new ollama:, model: 'mxbai-embed-large', chunk_size: 23
|
|
34
32
|
text = ([ "A" * 10 ] * 6 + [ "B" * 10 ] * 3 + [ "A" * 10 ] * 3) * ". "
|
|
35
33
|
result = splitter.split(text, breakpoint: :percentile, percentile: 75)
|
|
36
|
-
expect(result.count).to eq
|
|
34
|
+
expect(result.count).to eq 11
|
|
37
35
|
expect(result.to_a.join('').count(?A)).to eq text.count(?A)
|
|
38
36
|
expect(result.to_a.join('').count(?B)).to eq text.count(?B)
|
|
39
37
|
end
|
|
@@ -53,4 +51,18 @@ describe Documentrix::Documents::Splitters::Semantic do
|
|
|
53
51
|
expect(result.to_a.join('').count(?A)).to eq text.count(?A)
|
|
54
52
|
expect(result.to_a.join('').count(?B)).to eq text.count(?B)
|
|
55
53
|
end
|
|
54
|
+
|
|
55
|
+
context 'with force' do
|
|
56
|
+
let :splitter do
|
|
57
|
+
described_class.new ollama:, model: 'mxbai-embed-large', chunk_size: 7, force: true
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
it 'can split with force' do
|
|
61
|
+
text = ([ "A" * 10 ] * 3 + [ "B" * 10 ] * 3 + [ "A" * 10 ] * 3) * ". "
|
|
62
|
+
result = splitter.split(text, breakpoint: :percentile, percentile: 75)
|
|
63
|
+
expect(result.count).to eq 18
|
|
64
|
+
expect(result.to_a.join('').count(?A)).to eq text.count(?A)
|
|
65
|
+
expect(result.to_a.join('').count(?B)).to eq text.count(?B)
|
|
66
|
+
end
|
|
67
|
+
end
|
|
56
68
|
end
|
data/spec/documents_spec.rb
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
require 'spec_helper'
|
|
2
|
-
|
|
3
1
|
describe Documentrix::Documents do
|
|
4
2
|
let :ollama do
|
|
5
3
|
double('Ollama::Client')
|
|
@@ -168,7 +166,7 @@ describe Documentrix::Documents do
|
|
|
168
166
|
|
|
169
167
|
expect(documents.size).to eq 3
|
|
170
168
|
|
|
171
|
-
documents.
|
|
169
|
+
documents.source_remove('source1')
|
|
172
170
|
|
|
173
171
|
expect(documents.size).to eq 1
|
|
174
172
|
expect(documents.exist?('baz')).to be true
|
|
@@ -211,4 +209,62 @@ describe Documentrix::Documents do
|
|
|
211
209
|
to(:new_collection)
|
|
212
210
|
end
|
|
213
211
|
end
|
|
212
|
+
|
|
213
|
+
context 'source management' do
|
|
214
|
+
before do
|
|
215
|
+
allow(documents).to receive(:compute_file_digest).and_return('d1')
|
|
216
|
+
allow(documents.cache).to receive(:compute_file_digest).and_return('d1')
|
|
217
|
+
|
|
218
|
+
allow(ollama).to receive(:embed).and_return(double(embeddings: [[0.1]]))
|
|
219
|
+
end
|
|
220
|
+
|
|
221
|
+
it 'can check if a source exists' do
|
|
222
|
+
documents.add('foo', source: 's1')
|
|
223
|
+
expect(documents.source_exist?('s1')).to be true
|
|
224
|
+
expect(documents.source_exist?('s2')).to be false
|
|
225
|
+
end
|
|
226
|
+
|
|
227
|
+
it 'can determine if a source is modified' do
|
|
228
|
+
documents.add('foo', source: 's1')
|
|
229
|
+
|
|
230
|
+
# Case 1: Source is up to date
|
|
231
|
+
expect(documents.source_modified?('s1')).to be false
|
|
232
|
+
|
|
233
|
+
# Case 2: Source is missing
|
|
234
|
+
expect(documents.source_modified?('s2')).to be true
|
|
235
|
+
|
|
236
|
+
# Case 3: Digest changed - mock both to return the new digest
|
|
237
|
+
allow(documents).to receive(:compute_file_digest).with('s1').and_return('d2')
|
|
238
|
+
allow(documents.cache).to receive(:compute_file_digest).with('s1').and_return('d2')
|
|
239
|
+
expect(documents.source_modified?('s1')).to be true
|
|
240
|
+
end
|
|
241
|
+
|
|
242
|
+
it 'does not update the source if the digest matches' do
|
|
243
|
+
documents.add('foo', source: 's1')
|
|
244
|
+
|
|
245
|
+
expect(ollama).not_to receive(:embed)
|
|
246
|
+
documents.source_update(['foo'], source: 's1')
|
|
247
|
+
expect(documents.exist?('foo')).to be true
|
|
248
|
+
end
|
|
249
|
+
|
|
250
|
+
it 'updates the source if the digest has changed' do
|
|
251
|
+
documents.add('foo', source: 's1')
|
|
252
|
+
|
|
253
|
+
# Simulate a file change by updating the digest
|
|
254
|
+
allow(documents).to receive(:compute_file_digest).with('s1').and_return('d2')
|
|
255
|
+
allow(documents.cache).to receive(:compute_file_digest).with('s1').and_return('d2')
|
|
256
|
+
|
|
257
|
+
expect(ollama).to receive(:embed).once
|
|
258
|
+
documents.source_update(['bar'], source: 's1')
|
|
259
|
+
|
|
260
|
+
expect(documents.exist?('bar')).to be true
|
|
261
|
+
expect(documents.exist?('foo')).to be false
|
|
262
|
+
end
|
|
263
|
+
|
|
264
|
+
it 'updates the source if it is an URL' do
|
|
265
|
+
expect(ollama).to receive(:embed).once
|
|
266
|
+
documents.source_update('foo', source: 'https://www.example.com/s1')
|
|
267
|
+
expect(documents.exist?('foo')).to be true
|
|
268
|
+
end
|
|
269
|
+
end
|
|
214
270
|
end
|