documentrix 0.1.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGES.md +92 -0
- data/Rakefile +1 -1
- data/documentrix.gemspec +8 -8
- data/lib/documentrix/documents/cache/common.rb +74 -9
- data/lib/documentrix/documents/cache/records.rb +1 -1
- data/lib/documentrix/documents/cache/redis_cache.rb +3 -3
- data/lib/documentrix/documents/cache/sqlite_cache.rb +100 -21
- data/lib/documentrix/documents/splitters/character.rb +56 -4
- data/lib/documentrix/documents/splitters/common.rb +38 -0
- data/lib/documentrix/documents/splitters/semantic.rb +67 -8
- data/lib/documentrix/documents.rb +139 -25
- data/lib/documentrix/utils/colorize_texts.rb +25 -21
- data/lib/documentrix/utils/digests.rb +78 -0
- data/lib/documentrix/utils.rb +1 -0
- data/lib/documentrix/version.rb +1 -1
- data/spec/documentrix/documents/cache/interface_spec.rb +25 -3
- data/spec/documentrix/documents/cache/memory_cache_spec.rb +75 -2
- data/spec/documentrix/documents/cache/redis_cache_spec.rb +82 -19
- data/spec/documentrix/documents/cache/sqlite_cache_spec.rb +142 -2
- data/spec/documentrix/documents/splitters/character_spec.rb +20 -2
- data/spec/documentrix/documents/splitters/semantic_spec.rb +17 -5
- data/spec/documents_spec.rb +76 -2
- data/spec/utils/colorize_texts_spec.rb +0 -2
- data/spec/utils/digests_spec.rb +97 -0
- data/spec/utils/tags_spec.rb +0 -2
- metadata +12 -6
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
require 'spec_helper'
|
|
2
|
-
|
|
3
1
|
describe Documentrix::Documents::MemoryCache do
|
|
4
2
|
let :prefix do
|
|
5
3
|
'test-'
|
|
@@ -120,8 +118,83 @@ describe Documentrix::Documents::MemoryCache do
|
|
|
120
118
|
}.from(1).to(0)
|
|
121
119
|
end
|
|
122
120
|
|
|
121
|
+
it 'can clear by source' do
|
|
122
|
+
cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', embedding: [0.1]]
|
|
123
|
+
cache['bar'] = Documentrix::Documents::Record[text: 'bar', source: 's1', embedding: [0.1]]
|
|
124
|
+
cache['baz'] = Documentrix::Documents::Record[text: 'baz', source: 's2', embedding: [0.1]]
|
|
125
|
+
expect {
|
|
126
|
+
cache.clear_by_source('s1')
|
|
127
|
+
}.to change { cache.size }.from(3).to(1)
|
|
128
|
+
expect(cache.key?('baz')).to be true
|
|
129
|
+
expect(cache.key?('foo')).to be false
|
|
130
|
+
end
|
|
131
|
+
|
|
123
132
|
it 'can iterate over keys under a prefix' do
|
|
124
133
|
cache['foo'] = 'bar'
|
|
125
134
|
expect(cache.to_a).to eq [ %W[ #{prefix}foo bar ] ]
|
|
126
135
|
end
|
|
136
|
+
|
|
137
|
+
it 'can iterate over unique sources' do
|
|
138
|
+
cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', embedding: [0.1]]
|
|
139
|
+
cache['bar'] = Documentrix::Documents::Record[text: 'bar', source: 's1', embedding: [0.1]]
|
|
140
|
+
cache['baz'] = Documentrix::Documents::Record[text: 'baz', source: 's2', embedding: [0.1]]
|
|
141
|
+
|
|
142
|
+
expect(cache.each_source.to_a).to match_array(['s1', 's2'])
|
|
143
|
+
end
|
|
144
|
+
|
|
145
|
+
it 'can retrieve all unique tags' do
|
|
146
|
+
cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', tags: ['a', 'b'], embedding: [0.1]]
|
|
147
|
+
cache['bar'] = Documentrix::Documents::Record[text: 'bar', source: 's2', tags: ['b', 'c'], embedding: [0.1]]
|
|
148
|
+
|
|
149
|
+
expect(cache.tags.to_a).to match_array(['a', 'b', 'c'])
|
|
150
|
+
end
|
|
151
|
+
|
|
152
|
+
it 'can clear records by tags' do
|
|
153
|
+
cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', tags: ['keep'], embedding: [0.1]]
|
|
154
|
+
cache['bar'] = Documentrix::Documents::Record[text: 'bar', source: 's1', tags: ['trash'], embedding: [0.1]]
|
|
155
|
+
|
|
156
|
+
expect {
|
|
157
|
+
cache.clear_for_tags(['trash'])
|
|
158
|
+
}.to change { cache.size }.from(2).to(1)
|
|
159
|
+
expect(cache.key?('foo')).to be true
|
|
160
|
+
expect(cache.key?('bar')).to be false
|
|
161
|
+
end
|
|
162
|
+
|
|
163
|
+
it 'can check if a source exists' do
|
|
164
|
+
cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', embedding: [0.1]]
|
|
165
|
+
|
|
166
|
+
expect(cache.source_exist?('s1')).to be true
|
|
167
|
+
expect(cache.source_exist?('s2')).to be false
|
|
168
|
+
end
|
|
169
|
+
|
|
170
|
+
it 'can clear by source with a specific digest' do
|
|
171
|
+
cache['f1'] = Documentrix::Documents::Record[text: 'v1', source: 's1', digest: 'd1', embedding: [0.1]]
|
|
172
|
+
cache['f2'] = Documentrix::Documents::Record[text: 'v2', source: 's1', digest: 'd2', embedding: [0.1]]
|
|
173
|
+
|
|
174
|
+
expect {
|
|
175
|
+
cache.clear_by_source('s1', digest: 'd1')
|
|
176
|
+
}.to change { cache.size }.from(2).to(1)
|
|
177
|
+
expect(cache.key?('f2')).to be true
|
|
178
|
+
expect(cache.key?('f1')).to be false
|
|
179
|
+
end
|
|
180
|
+
|
|
181
|
+
it 'can clear outdated versions of a source' do
|
|
182
|
+
cache['f1'] = Documentrix::Documents::Record[text: 'v1', source: 's1', digest: 'd1', embedding: [0.1]]
|
|
183
|
+
cache['f2'] = Documentrix::Documents::Record[text: 'v2', source: 's1', digest: 'd2', embedding: [0.1]]
|
|
184
|
+
|
|
185
|
+
expect {
|
|
186
|
+
cache.clear_by_source('s1', digest: 'd2', operator: '!=')
|
|
187
|
+
}.to change { cache.size }.from(2).to(1)
|
|
188
|
+
expect(cache.key?('f2')).to be true
|
|
189
|
+
expect(cache.key?('f1')).to be false
|
|
190
|
+
end
|
|
191
|
+
|
|
192
|
+
it 'can check if a source exists with a specific digest' do
|
|
193
|
+
cache['f1'] = Documentrix::Documents::Record[text: 'v1', source: 's1', digest: 'd1', embedding: [0.1]]
|
|
194
|
+
|
|
195
|
+
expect(cache.source_exist?('s1', digest: 'd1')).to be true
|
|
196
|
+
expect(cache.source_exist?('s1', digest: 'd2')).to be false
|
|
197
|
+
expect(cache.source_exist?('s1', digest: 'd1', operator: '!=')).to be false
|
|
198
|
+
expect(cache.source_exist?('s1', digest: 'd2', operator: '!=')).to be true
|
|
199
|
+
end
|
|
127
200
|
end
|
|
@@ -1,28 +1,20 @@
|
|
|
1
|
-
require 'spec_helper'
|
|
2
|
-
|
|
3
1
|
describe Documentrix::Documents::RedisCache do
|
|
2
|
+
let :object_class do
|
|
3
|
+
Documentrix::Documents::Cache::Records::Record
|
|
4
|
+
end
|
|
5
|
+
|
|
4
6
|
let :prefix do
|
|
5
7
|
'test-'
|
|
6
8
|
end
|
|
7
9
|
|
|
8
10
|
let :cache do
|
|
9
|
-
described_class.new prefix:, url: 'something'
|
|
11
|
+
described_class.new prefix:, url: 'something', object_class:
|
|
10
12
|
end
|
|
11
13
|
|
|
12
14
|
it 'can be instantiated' do
|
|
13
15
|
expect(cache).to be_a described_class
|
|
14
16
|
end
|
|
15
17
|
|
|
16
|
-
it 'defaults to nil object_class' do
|
|
17
|
-
expect(cache.object_class).to be_nil
|
|
18
|
-
end
|
|
19
|
-
|
|
20
|
-
it 'can be configured with object_class' do
|
|
21
|
-
object_class = Class.new(JSON::GenericObject)
|
|
22
|
-
cache = described_class.new(prefix:, url: 'something', object_class:)
|
|
23
|
-
expect(cache.object_class).to eq object_class
|
|
24
|
-
end
|
|
25
|
-
|
|
26
18
|
it 'raises ArgumentError if url is missing' do
|
|
27
19
|
expect {
|
|
28
20
|
described_class.new prefix:, url: nil
|
|
@@ -38,6 +30,10 @@ describe Documentrix::Documents::RedisCache do
|
|
|
38
30
|
allow_any_instance_of(described_class).to receive(:redis).and_return(redis)
|
|
39
31
|
end
|
|
40
32
|
|
|
33
|
+
it 'can be configured with object_class' do
|
|
34
|
+
expect(cache.object_class).to eq object_class
|
|
35
|
+
end
|
|
36
|
+
|
|
41
37
|
it 'has Redis client' do
|
|
42
38
|
expect(cache.redis).to eq redis
|
|
43
39
|
end
|
|
@@ -62,9 +58,9 @@ describe Documentrix::Documents::RedisCache do
|
|
|
62
58
|
end
|
|
63
59
|
|
|
64
60
|
it 'can move prefixes' do
|
|
65
|
-
expect(redis).to receive(:get).with(prefix + 'foo').and_return(
|
|
66
|
-
expect(redis).to receive(:get).with('test2-bar').and_return(
|
|
67
|
-
expect(redis).to receive(:set).with('test3-foo',
|
|
61
|
+
expect(redis).to receive(:get).with(prefix + 'foo').and_return(object_class[foo: true].to_json)
|
|
62
|
+
expect(redis).to receive(:get).with('test2-bar').and_return(object_class[foo: true].to_json)
|
|
63
|
+
expect(redis).to receive(:set).with('test3-foo', /"foo":true/)
|
|
68
64
|
expect(redis).to receive(:del).with('test-foo')
|
|
69
65
|
expect(redis).to receive(:scan_each).with(match: ?*).
|
|
70
66
|
and_yield("#{prefix}foo").
|
|
@@ -81,12 +77,12 @@ describe Documentrix::Documents::RedisCache do
|
|
|
81
77
|
end
|
|
82
78
|
|
|
83
79
|
it 'can iterate over keys, values' do
|
|
84
|
-
key, value = 'foo',
|
|
85
|
-
expect(redis).to receive(:set).with(prefix + key,
|
|
80
|
+
key, value = 'foo', object_class[test: true]
|
|
81
|
+
expect(redis).to receive(:set).with(prefix + key, object_class[value].to_json)
|
|
86
82
|
cache[key] = value
|
|
87
83
|
expect(redis).to receive(:scan_each).with(match: "#{prefix}*").
|
|
88
84
|
and_yield("#{prefix}foo")
|
|
89
|
-
expect(redis).to receive(:get).with(prefix + key).and_return(
|
|
85
|
+
expect(redis).to receive(:get).with(prefix + key).and_return(object_class[test: true].to_json)
|
|
90
86
|
cache.each do |k, v|
|
|
91
87
|
expect(k).to eq prefix + key
|
|
92
88
|
expect(v).to eq value
|
|
@@ -109,6 +105,20 @@ describe Documentrix::Documents::RedisCache do
|
|
|
109
105
|
expect(cache.clear).to eq cache
|
|
110
106
|
end
|
|
111
107
|
|
|
108
|
+
it 'can clear by source' do
|
|
109
|
+
object_class = Class.new(JSON::GenericObject)
|
|
110
|
+
cache = described_class.new(prefix:, url: 'something', object_class:)
|
|
111
|
+
expect(redis).to receive(:scan_each).with(match: 'test-*').and_yield(
|
|
112
|
+
'test-foo'
|
|
113
|
+
).and_yield(
|
|
114
|
+
'test-bar'
|
|
115
|
+
)
|
|
116
|
+
expect(redis).to receive(:get).with('test-foo').and_return(JSON(source: 's1'))
|
|
117
|
+
expect(redis).to receive(:get).with('test-bar').and_return(JSON(source: 's2'))
|
|
118
|
+
expect(redis).to receive(:del).with('test-foo')
|
|
119
|
+
expect(cache.clear_by_source('s1')).to eq cache
|
|
120
|
+
end
|
|
121
|
+
|
|
112
122
|
it 'can iterate over keys under a prefix' do
|
|
113
123
|
expect(redis).to receive(:scan_each).with(match: 'test-*')
|
|
114
124
|
cache.to_a
|
|
@@ -121,5 +131,58 @@ describe Documentrix::Documents::RedisCache do
|
|
|
121
131
|
it 'can remove prefix with unpre' do
|
|
122
132
|
expect(cache.unpre('test-foo')).to eq 'foo'
|
|
123
133
|
end
|
|
134
|
+
|
|
135
|
+
it 'can iterate over unique sources' do
|
|
136
|
+
expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
|
|
137
|
+
"#{prefix}foo"
|
|
138
|
+
).and_yield(
|
|
139
|
+
"#{prefix}bar"
|
|
140
|
+
)
|
|
141
|
+
expect(redis).to receive(:get).with("#{prefix}foo").and_return(JSON(source: 's1'))
|
|
142
|
+
expect(redis).to receive(:get).with("#{prefix}bar").and_return(JSON(source: 's2'))
|
|
143
|
+
|
|
144
|
+
expect(cache.each_source.to_a).to match_array(['s1', 's2'])
|
|
145
|
+
end
|
|
146
|
+
|
|
147
|
+
it 'can retrieve all unique tags' do
|
|
148
|
+
expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
|
|
149
|
+
"#{prefix}foo"
|
|
150
|
+
).and_yield(
|
|
151
|
+
"#{prefix}bar"
|
|
152
|
+
)
|
|
153
|
+
expect(redis).to receive(:get).with("#{prefix}foo").and_return(JSON(source: 's1', tags: ['a', 'b']))
|
|
154
|
+
expect(redis).to receive(:get).with("#{prefix}bar").and_return(JSON(source: 's2', tags: ['b', 'c']))
|
|
155
|
+
|
|
156
|
+
expect(cache.tags.to_a).to match_array(['a', 'b', 'c'])
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
it 'can clear records by tags' do
|
|
160
|
+
expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
|
|
161
|
+
"#{prefix}foo"
|
|
162
|
+
).and_yield(
|
|
163
|
+
"#{prefix}bar"
|
|
164
|
+
)
|
|
165
|
+
expect(redis).to receive(:get).with("#{prefix}foo").and_return(JSON(source: 's1', tags: ['trash']))
|
|
166
|
+
expect(redis).to receive(:get).with("#{prefix}bar").and_return(JSON(source: 's2', tags: ['keep']))
|
|
167
|
+
expect(redis).to receive(:del).with("#{prefix}foo")
|
|
168
|
+
|
|
169
|
+
expect(cache.clear_for_tags(['trash'])).to eq cache
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
it 'can check if a source exists with a specific digest' do
|
|
173
|
+
expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
|
|
174
|
+
"#{prefix}foo"
|
|
175
|
+
)
|
|
176
|
+
expect(redis).to receive(:get).with("#{prefix}foo").and_return(JSON(source: 's1', digest: 'd1'))
|
|
177
|
+
|
|
178
|
+
expect(cache.source_exist?('s1', digest: 'd1')).to be true
|
|
179
|
+
|
|
180
|
+
# Reset for the negative case
|
|
181
|
+
expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
|
|
182
|
+
"#{prefix}foo"
|
|
183
|
+
)
|
|
184
|
+
expect(redis).to receive(:get).with("#{prefix}foo").and_return(JSON(source: 's1', digest: 'd1'))
|
|
185
|
+
expect(cache.source_exist?('s1', digest: 'd2')).to be false
|
|
186
|
+
end
|
|
124
187
|
end
|
|
125
188
|
end
|
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
require 'spec_helper'
|
|
2
|
-
|
|
3
1
|
describe Documentrix::Documents::SQLiteCache do
|
|
4
2
|
let :prefix do
|
|
5
3
|
'test-'
|
|
@@ -145,6 +143,74 @@ describe Documentrix::Documents::SQLiteCache do
|
|
|
145
143
|
expect(cache).to be_key 'bar'
|
|
146
144
|
end
|
|
147
145
|
|
|
146
|
+
it 'can clear all without tags' do
|
|
147
|
+
key, value = 'foo', { tags: %w[ foo ], embedding: [ 0.5 ] * 1_024 }
|
|
148
|
+
cache[key] = value
|
|
149
|
+
key, value = 'bar', { embedding: [ 0.5 ] * 1_024 }
|
|
150
|
+
cache[key] = value
|
|
151
|
+
expect {
|
|
152
|
+
expect(cache.clear_for_tags).to eq cache
|
|
153
|
+
}.to change {
|
|
154
|
+
cache.size
|
|
155
|
+
}.from(2).to(0)
|
|
156
|
+
expect(cache).not_to be_key 'foo'
|
|
157
|
+
expect(cache).not_to be_key 'bar'
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
it 'can clear by source' do
|
|
161
|
+
val1 = test_value.merge(source: 's1')
|
|
162
|
+
val2 = test_value.merge(source: 's1')
|
|
163
|
+
val3 = test_value.merge(source: 's2')
|
|
164
|
+
cache['foo'] = val1
|
|
165
|
+
cache['bar'] = val2
|
|
166
|
+
cache['baz'] = val3
|
|
167
|
+
expect {
|
|
168
|
+
cache.clear_by_source('s1')
|
|
169
|
+
}.to change { cache.size }.from(3).to(1)
|
|
170
|
+
expect(cache.key?('baz')).to be true
|
|
171
|
+
expect(cache.key?('foo')).to be false
|
|
172
|
+
end
|
|
173
|
+
|
|
174
|
+
it 'can clear by source and digest' do
|
|
175
|
+
allow(cache).to receive(:compute_file_digest).and_return('d1', 'd2', 'd3')
|
|
176
|
+
cache['foo'] = test_value.merge(source: 's1') # d1
|
|
177
|
+
cache['bar'] = test_value.merge(source: 's1') # d2
|
|
178
|
+
cache['baz'] = test_value.merge(source: 's1') # d3
|
|
179
|
+
|
|
180
|
+
# Clear those that match d1
|
|
181
|
+
expect {
|
|
182
|
+
cache.clear_by_source('s1', digest: 'd1')
|
|
183
|
+
}.to change { cache.size }.from(3).to(2)
|
|
184
|
+
expect(cache.key?('foo')).to be false
|
|
185
|
+
expect(cache.key?('bar')).to be true
|
|
186
|
+
|
|
187
|
+
# Clear those that do NOT match d2 (should clear baz)
|
|
188
|
+
expect {
|
|
189
|
+
cache.clear_by_source('s1', digest: 'd2', operator: '!=')
|
|
190
|
+
}.to change { cache.size }.from(2).to(1)
|
|
191
|
+
expect(cache.key?('baz')).to be false
|
|
192
|
+
expect(cache.key?('bar')).to be true
|
|
193
|
+
end
|
|
194
|
+
|
|
195
|
+
describe '#source_exist?' do
|
|
196
|
+
it 'returns true if source exists' do
|
|
197
|
+
cache['foo'] = test_value
|
|
198
|
+
expect(cache.source_exist?('for-test.txt')).to be true
|
|
199
|
+
expect(cache.source_exist?('non-existent')).to be false
|
|
200
|
+
end
|
|
201
|
+
|
|
202
|
+
it 'filters by digest' do
|
|
203
|
+
allow(cache).to receive(:compute_file_digest).and_return('d1', 'd2')
|
|
204
|
+
cache['foo'] = test_value.merge(source: 's1') # d1
|
|
205
|
+
cache['bar'] = test_value.merge(source: 's1') # d2
|
|
206
|
+
|
|
207
|
+
expect(cache.source_exist?('s1', digest: 'd1')).to be true
|
|
208
|
+
expect(cache.source_exist?('s1', digest: 'd3')).to be false
|
|
209
|
+
expect(cache.source_exist?('s1', digest: 'd1', operator: '!=')).to be true # bar exists
|
|
210
|
+
expect(cache.source_exist?('s1', digest: 'd2', operator: '!=')).to be true # foo exists
|
|
211
|
+
end
|
|
212
|
+
end
|
|
213
|
+
|
|
148
214
|
it 'can return tags' do
|
|
149
215
|
key, value = 'foo', { tags: %w[ foo ], embedding: [ 0.5 ] * 1_024 }
|
|
150
216
|
cache[key] = value
|
|
@@ -155,6 +221,17 @@ describe Documentrix::Documents::SQLiteCache do
|
|
|
155
221
|
expect(tags.to_a).to eq %w[ bar baz foo ]
|
|
156
222
|
end
|
|
157
223
|
|
|
224
|
+
it 'can iterate over unique sources' do
|
|
225
|
+
val1 = test_value.merge(source: 's1')
|
|
226
|
+
val2 = test_value.merge(source: 's1')
|
|
227
|
+
val3 = test_value.merge(source: 's2')
|
|
228
|
+
cache['foo'] = val1
|
|
229
|
+
cache['bar'] = val2
|
|
230
|
+
cache['baz'] = val3
|
|
231
|
+
|
|
232
|
+
expect(cache.each_source.to_a).to match_array(['s1', 's2'])
|
|
233
|
+
end
|
|
234
|
+
|
|
158
235
|
it 'can iterate over keys under a prefix' do
|
|
159
236
|
cache['foo'] = test_value
|
|
160
237
|
expect(cache.each.to_a).to eq [ [ 'test-foo', Documentrix::Documents::Record[test_value] ] ]
|
|
@@ -172,4 +249,67 @@ describe Documentrix::Documents::SQLiteCache do
|
|
|
172
249
|
["test2-bar", Documentrix::Documents::Record[test_value] ],
|
|
173
250
|
]
|
|
174
251
|
end
|
|
252
|
+
|
|
253
|
+
describe '#find_records' do
|
|
254
|
+
let(:needle) { [ 0.5 ] * 1_024 }
|
|
255
|
+
|
|
256
|
+
it 'raises ArgumentError if needle length is incorrect' do
|
|
257
|
+
expect {
|
|
258
|
+
cache.find_records([ 0.1 ])
|
|
259
|
+
}.to raise_error(ArgumentError, /needle embedding length/)
|
|
260
|
+
end
|
|
261
|
+
|
|
262
|
+
it 'returns the most similar record' do
|
|
263
|
+
# Record 1: Exact match
|
|
264
|
+
val1 = test_value.merge(text: 'match', embedding: needle)
|
|
265
|
+
# Record 2: Different vector
|
|
266
|
+
val2 = test_value.merge(text: 'diff', embedding: [ 0.1 ] * 1_024)
|
|
267
|
+
|
|
268
|
+
cache['r1'] = val1
|
|
269
|
+
cache['r2'] = val2
|
|
270
|
+
|
|
271
|
+
results = cache.find_records(needle)
|
|
272
|
+
|
|
273
|
+
expect(results.size).to eq 2
|
|
274
|
+
expect(results.first.text).to eq 'match'
|
|
275
|
+
expect(results.first.similarity).to be_within(0.001).of(1.0)
|
|
276
|
+
end
|
|
277
|
+
|
|
278
|
+
it 'filters results by tags' do
|
|
279
|
+
val1 = test_value.merge(text: 'tagged', tags: %w[ a ], embedding: needle)
|
|
280
|
+
val2 = test_value.merge(text: 'untagged', tags: %w[ b ], embedding: needle)
|
|
281
|
+
|
|
282
|
+
cache['r1'] = val1
|
|
283
|
+
cache['r2'] = val2
|
|
284
|
+
|
|
285
|
+
expect(cache.find_records(needle, tags: %w[ a ]).map(&:text)).to eq %w[ tagged ]
|
|
286
|
+
expect(cache.find_records(needle, tags: %w[ b ]).map(&:text)).to eq %w[ untagged ]
|
|
287
|
+
expect(cache.find_records(needle, tags: %w[ c ]).size).to eq 0
|
|
288
|
+
end
|
|
289
|
+
|
|
290
|
+
it 'filters results by min_similarity' do
|
|
291
|
+
# Exact match
|
|
292
|
+
cache['r1'] = test_value.merge(text: 'match', embedding: needle)
|
|
293
|
+
# Very different vector
|
|
294
|
+
cache['r2'] = test_value.merge(text: 'diff', embedding: [ -0.5 ] * 1_024)
|
|
295
|
+
|
|
296
|
+
# Low threshold: both should appear
|
|
297
|
+
expect(cache.find_records(needle, min_similarity: -1).size).to eq 2
|
|
298
|
+
|
|
299
|
+
# High threshold: only match should appear
|
|
300
|
+
expect(cache.find_records(needle, min_similarity: 0.9).map(&:text)).to eq %w[ match ]
|
|
301
|
+
end
|
|
302
|
+
|
|
303
|
+
it 'limits results via max_records' do
|
|
304
|
+
3.times do |i|
|
|
305
|
+
cache["r#{i}"] = test_value.merge(text: "t#{i}", embedding: needle)
|
|
306
|
+
end
|
|
307
|
+
|
|
308
|
+
expect(cache.find_records(needle, max_records: 2).size).to eq 2
|
|
309
|
+
end
|
|
310
|
+
|
|
311
|
+
it 'returns empty array when no records match' do
|
|
312
|
+
expect(cache.find_records(needle)).to eq []
|
|
313
|
+
end
|
|
314
|
+
end
|
|
175
315
|
end
|
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
require 'spec_helper'
|
|
2
|
-
|
|
3
1
|
describe Documentrix::Documents::Splitters::Character do
|
|
4
2
|
let :splitter do
|
|
5
3
|
described_class.new chunk_size: 23, combining_string: ''
|
|
@@ -50,6 +48,26 @@ describe Documentrix::Documents::Splitters::Character do
|
|
|
50
48
|
expect(result.to_a.join('')).to eq ?A * 25
|
|
51
49
|
end
|
|
52
50
|
|
|
51
|
+
context 'with force' do
|
|
52
|
+
let :splitter do
|
|
53
|
+
described_class.new chunk_size: 23, combining_string: '', force: true
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
it 'can split with force' do
|
|
57
|
+
text = [ ?A * 10 ] * 10 * "\n"
|
|
58
|
+
result = splitter.split(text)
|
|
59
|
+
expect(result.count).to eq 10
|
|
60
|
+
expect(result.to_a.join('').count(?A)).to eq text.count(?A)
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
it 'can with force split2' do
|
|
64
|
+
text = ?A * 25
|
|
65
|
+
result = splitter.split(text)
|
|
66
|
+
expect(result.count).to eq 2
|
|
67
|
+
expect(result.to_a.join('')).to eq ?A * 25
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
|
|
53
71
|
it 'can split sentences' do
|
|
54
72
|
text = "foo.foo. bar!bar! baz?baz? quux.\nquux."
|
|
55
73
|
splitter = described_class.new(separator: /[.!?]\s*(?:\b|\z)/, chunk_size: 2)
|
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
require 'spec_helper'
|
|
2
|
-
|
|
3
1
|
describe Documentrix::Documents::Splitters::Semantic do
|
|
4
2
|
let :ollama do
|
|
5
3
|
double('Ollama::Client')
|
|
@@ -29,11 +27,11 @@ describe Documentrix::Documents::Splitters::Semantic do
|
|
|
29
27
|
expect(result.to_a.join('').count(?B)).to eq text.count(?B)
|
|
30
28
|
end
|
|
31
29
|
|
|
32
|
-
it 'can split with breakpoint :percentile' do
|
|
33
|
-
described_class.new ollama:, model: 'mxbai-embed-large', chunk_size:
|
|
30
|
+
it 'can split with breakpoint :percentile, chunk_size 23' do
|
|
31
|
+
splitter = described_class.new ollama:, model: 'mxbai-embed-large', chunk_size: 23
|
|
34
32
|
text = ([ "A" * 10 ] * 6 + [ "B" * 10 ] * 3 + [ "A" * 10 ] * 3) * ". "
|
|
35
33
|
result = splitter.split(text, breakpoint: :percentile, percentile: 75)
|
|
36
|
-
expect(result.count).to eq
|
|
34
|
+
expect(result.count).to eq 11
|
|
37
35
|
expect(result.to_a.join('').count(?A)).to eq text.count(?A)
|
|
38
36
|
expect(result.to_a.join('').count(?B)).to eq text.count(?B)
|
|
39
37
|
end
|
|
@@ -53,4 +51,18 @@ describe Documentrix::Documents::Splitters::Semantic do
|
|
|
53
51
|
expect(result.to_a.join('').count(?A)).to eq text.count(?A)
|
|
54
52
|
expect(result.to_a.join('').count(?B)).to eq text.count(?B)
|
|
55
53
|
end
|
|
54
|
+
|
|
55
|
+
context 'with force' do
|
|
56
|
+
let :splitter do
|
|
57
|
+
described_class.new ollama:, model: 'mxbai-embed-large', chunk_size: 7, force: true
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
it 'can split with force' do
|
|
61
|
+
text = ([ "A" * 10 ] * 3 + [ "B" * 10 ] * 3 + [ "A" * 10 ] * 3) * ". "
|
|
62
|
+
result = splitter.split(text, breakpoint: :percentile, percentile: 75)
|
|
63
|
+
expect(result.count).to eq 18
|
|
64
|
+
expect(result.to_a.join('').count(?A)).to eq text.count(?A)
|
|
65
|
+
expect(result.to_a.join('').count(?B)).to eq text.count(?B)
|
|
66
|
+
end
|
|
67
|
+
end
|
|
56
68
|
end
|
data/spec/documents_spec.rb
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
require 'spec_helper'
|
|
2
|
-
|
|
3
1
|
describe Documentrix::Documents do
|
|
4
2
|
let :ollama do
|
|
5
3
|
double('Ollama::Client')
|
|
@@ -158,6 +156,24 @@ describe Documentrix::Documents do
|
|
|
158
156
|
}.to change { documents.size }.from(1).to(0)
|
|
159
157
|
end
|
|
160
158
|
|
|
159
|
+
it 'can remove sources' do
|
|
160
|
+
allow(ollama).to receive(:embed).at_least(:once).
|
|
161
|
+
and_return(double(embeddings: [ [ 0.1 ] ]))
|
|
162
|
+
|
|
163
|
+
documents.add('foo', source: 'source1')
|
|
164
|
+
documents.add('bar', source: 'source1')
|
|
165
|
+
documents.add('baz', source: 'source2')
|
|
166
|
+
|
|
167
|
+
expect(documents.size).to eq 3
|
|
168
|
+
|
|
169
|
+
documents.source_remove('source1')
|
|
170
|
+
|
|
171
|
+
expect(documents.size).to eq 1
|
|
172
|
+
expect(documents.exist?('baz')).to be true
|
|
173
|
+
expect(documents.exist?('foo')).to be false
|
|
174
|
+
expect(documents.exist?('bar')).to be false
|
|
175
|
+
end
|
|
176
|
+
|
|
161
177
|
it 'returns collections' do
|
|
162
178
|
expect(documents.collections).to eq [ :default ]
|
|
163
179
|
end
|
|
@@ -193,4 +209,62 @@ describe Documentrix::Documents do
|
|
|
193
209
|
to(:new_collection)
|
|
194
210
|
end
|
|
195
211
|
end
|
|
212
|
+
|
|
213
|
+
context 'source management' do
|
|
214
|
+
before do
|
|
215
|
+
allow(documents).to receive(:compute_file_digest).and_return('d1')
|
|
216
|
+
allow(documents.cache).to receive(:compute_file_digest).and_return('d1')
|
|
217
|
+
|
|
218
|
+
allow(ollama).to receive(:embed).and_return(double(embeddings: [[0.1]]))
|
|
219
|
+
end
|
|
220
|
+
|
|
221
|
+
it 'can check if a source exists' do
|
|
222
|
+
documents.add('foo', source: 's1')
|
|
223
|
+
expect(documents.source_exist?('s1')).to be true
|
|
224
|
+
expect(documents.source_exist?('s2')).to be false
|
|
225
|
+
end
|
|
226
|
+
|
|
227
|
+
it 'can determine if a source is modified' do
|
|
228
|
+
documents.add('foo', source: 's1')
|
|
229
|
+
|
|
230
|
+
# Case 1: Source is up to date
|
|
231
|
+
expect(documents.source_modified?('s1')).to be false
|
|
232
|
+
|
|
233
|
+
# Case 2: Source is missing
|
|
234
|
+
expect(documents.source_modified?('s2')).to be true
|
|
235
|
+
|
|
236
|
+
# Case 3: Digest changed - mock both to return the new digest
|
|
237
|
+
allow(documents).to receive(:compute_file_digest).with('s1').and_return('d2')
|
|
238
|
+
allow(documents.cache).to receive(:compute_file_digest).with('s1').and_return('d2')
|
|
239
|
+
expect(documents.source_modified?('s1')).to be true
|
|
240
|
+
end
|
|
241
|
+
|
|
242
|
+
it 'does not update the source if the digest matches' do
|
|
243
|
+
documents.add('foo', source: 's1')
|
|
244
|
+
|
|
245
|
+
expect(ollama).not_to receive(:embed)
|
|
246
|
+
documents.source_update(['foo'], source: 's1')
|
|
247
|
+
expect(documents.exist?('foo')).to be true
|
|
248
|
+
end
|
|
249
|
+
|
|
250
|
+
it 'updates the source if the digest has changed' do
|
|
251
|
+
documents.add('foo', source: 's1')
|
|
252
|
+
|
|
253
|
+
# Simulate a file change by updating the digest
|
|
254
|
+
allow(documents).to receive(:compute_file_digest).with('s1').and_return('d2')
|
|
255
|
+
allow(documents.cache).to receive(:compute_file_digest).with('s1').and_return('d2')
|
|
256
|
+
|
|
257
|
+
expect(ollama).to receive(:embed).once
|
|
258
|
+
documents.source_update(['bar'], source: 's1')
|
|
259
|
+
|
|
260
|
+
expect(documents.exist?('bar')).to be true
|
|
261
|
+
expect(documents.exist?('foo')).to be false
|
|
262
|
+
end
|
|
263
|
+
|
|
264
|
+
it 'updates the source if it is an URL' do
|
|
265
|
+
expect(ollama).to receive(:embed).once
|
|
266
|
+
documents.source_update('foo', source: 'https://www.example.com/s1')
|
|
267
|
+
expect(documents.exist?('foo')).to be true
|
|
268
|
+
end
|
|
269
|
+
end
|
|
196
270
|
end
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
require 'tempfile'
|
|
2
|
+
|
|
3
|
+
describe Documentrix::Utils::Digests do
|
|
4
|
+
let(:test_class) do
|
|
5
|
+
Class.new do
|
|
6
|
+
include Documentrix::Utils::Digests
|
|
7
|
+
end
|
|
8
|
+
end
|
|
9
|
+
|
|
10
|
+
let(:subject) { test_class.new.expose }
|
|
11
|
+
|
|
12
|
+
describe '#compute_digest' do
|
|
13
|
+
it 'computes a valid SHA256 digest of a string' do
|
|
14
|
+
text = 'hello world'
|
|
15
|
+
expected = Digest::SHA256.hexdigest(text)
|
|
16
|
+
expect(subject.compute_digest(text)).to eq expected
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
describe '#compute_file_digest' do
|
|
21
|
+
it 'returns nil for an empty filename' do
|
|
22
|
+
expect(subject.compute_file_digest(nil)).to be_nil
|
|
23
|
+
expect(subject.compute_file_digest('')).to be_nil
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
it 'returns nil for an absolute URL' do
|
|
27
|
+
expect(subject.compute_file_digest('https://example.com/file.txt')).to be_nil
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
it 'returns nil for a non-existent file' do
|
|
31
|
+
expect(subject.compute_file_digest('/tmp/non_existent_file_12345')).to be_nil
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
it 'computes the digest of a local file' do
|
|
35
|
+
file = Tempfile.create('documentrix_test')
|
|
36
|
+
content = 'file content'
|
|
37
|
+
file.write(content)
|
|
38
|
+
file.close
|
|
39
|
+
|
|
40
|
+
expected = Digest::SHA256.hexdigest(content)
|
|
41
|
+
expect(subject.compute_file_digest(file.path)).to eq expected
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
context 'with caching' do
|
|
45
|
+
let(:file) { Tempfile.create('documentrix_cache_test') }
|
|
46
|
+
let(:content) { 'initial content' }
|
|
47
|
+
|
|
48
|
+
before do
|
|
49
|
+
file.write(content)
|
|
50
|
+
file.close
|
|
51
|
+
subject.file_digest_cache_clear
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
it 'returns the same digest on subsequent calls' do
|
|
55
|
+
digest1 = subject.compute_file_digest(file.path)
|
|
56
|
+
digest2 = subject.compute_file_digest(file.path)
|
|
57
|
+
expect(digest1).to eq digest2
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
it 'recomputes the digest when the file is modified' do
|
|
61
|
+
digest1 = subject.compute_file_digest(file.path)
|
|
62
|
+
|
|
63
|
+
# Update file content and force mtime change
|
|
64
|
+
File.write(file.path, 'updated content')
|
|
65
|
+
# Ensure mtime is actually different (some FS have low precision)
|
|
66
|
+
File.utime(Time.now + 1, Time.now + 1, file.path)
|
|
67
|
+
|
|
68
|
+
digest2 = subject.compute_file_digest(file.path)
|
|
69
|
+
expect(digest1).not_to eq digest2
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
it 'recomputes the digest after cache clear' do
|
|
73
|
+
digest1 = subject.compute_file_digest(file.path)
|
|
74
|
+
subject.file_digest_cache_clear
|
|
75
|
+
|
|
76
|
+
# Even though file hasn't changed, it should re-read and return same value
|
|
77
|
+
digest2 = subject.compute_file_digest(file.path)
|
|
78
|
+
expect(digest1).to eq digest2
|
|
79
|
+
end
|
|
80
|
+
end
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
describe '#file_digest_cache_clear' do
|
|
84
|
+
it 'clears the internal cache' do
|
|
85
|
+
file = Tempfile.create('documentrix_clear_test')
|
|
86
|
+
file.write('test')
|
|
87
|
+
file.close
|
|
88
|
+
|
|
89
|
+
subject.compute_file_digest(file.path)
|
|
90
|
+
subject.file_digest_cache_clear
|
|
91
|
+
|
|
92
|
+
# We can verify this indirectly by checking if the cache is empty
|
|
93
|
+
# or by the fact that it will re-compute in tests.
|
|
94
|
+
expect(subject).to respond_to(:file_digest_cache_clear)
|
|
95
|
+
end
|
|
96
|
+
end
|
|
97
|
+
end
|