documentrix 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +1 -0
- data/Gemfile +5 -0
- data/README.md +71 -0
- data/Rakefile +48 -0
- data/documentrix.gemspec +41 -0
- data/lib/documentrix/documents/cache/common.rb +43 -0
- data/lib/documentrix/documents/cache/memory_cache.rb +91 -0
- data/lib/documentrix/documents/cache/records.rb +145 -0
- data/lib/documentrix/documents/cache/redis_backed_memory_cache.rb +64 -0
- data/lib/documentrix/documents/cache/redis_cache.rb +128 -0
- data/lib/documentrix/documents/cache/sqlite_cache.rb +335 -0
- data/lib/documentrix/documents/splitters/character.rb +72 -0
- data/lib/documentrix/documents/splitters/semantic.rb +91 -0
- data/lib/documentrix/documents.rb +328 -0
- data/lib/documentrix/utils/colorize_texts.rb +65 -0
- data/lib/documentrix/utils/math.rb +48 -0
- data/lib/documentrix/utils/tags.rb +112 -0
- data/lib/documentrix/utils.rb +5 -0
- data/lib/documentrix/version.rb +8 -0
- data/lib/documentrix.rb +11 -0
- data/spec/assets/embeddings.json +1 -0
- data/spec/documentrix/documents/cache/memory_cache_spec.rb +98 -0
- data/spec/documentrix/documents/cache/redis_backed_memory_cache_spec.rb +121 -0
- data/spec/documentrix/documents/cache/redis_cache_spec.rb +123 -0
- data/spec/documentrix/documents/cache/sqlite_cache_spec.rb +141 -0
- data/spec/documentrix/documents/splitters/character_spec.rb +110 -0
- data/spec/documentrix/documents/splitters/semantic_spec.rb +56 -0
- data/spec/documents_spec.rb +174 -0
- data/spec/spec_helper.rb +23 -0
- data/spec/utils/colorize_texts_spec.rb +13 -0
- data/spec/utils/tags_spec.rb +53 -0
- metadata +329 -0
@@ -0,0 +1,98 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for Documentrix::Documents::MemoryCache: reading and writing keys,
# existence checks, deletion, iteration, size and clearing, all performed
# through a cache configured with a key prefix.
RSpec.describe Documentrix::Documents::MemoryCache do
  let :prefix do
    'test-'
  end

  let :cache do
    described_class.new(prefix:)
  end

  it 'can be instantiated' do
    expect(cache).to be_a described_class
  end

  it 'can get/set a key' do
    key, value = 'foo', { test: true }
    expect {
      cache[key] = value
    }.to change {
      cache[key]
    }.from(nil).to(value)
  end

  it 'can determine if key exists' do
    key, value = 'foo', { test: true }
    expect {
      cache[key] = value
    }.to change {
      cache.key?(key)
    }.from(false).to(true)
  end

  it 'can set key with different prefixes' do
    key, value = 'foo', { test: true }
    expect {
      cache[key] = value
    }.to change {
      cache.size
    }.from(0).to(1)
    # A dup with another prefix starts out with size 0 for that prefix.
    cache2 = cache.dup
    cache2.prefix = 'test2-'
    expect {
      cache2[key] = value
    }.to change {
      cache2.size
    }.from(0).to(1)
    expect(cache.size).to eq 1
    # full_each is expected to visit the entries of both prefixes.
    s = 0
    cache.full_each { s += 1 }
    expect(s).to eq 2
  end

  it 'can delete' do
    key, value = 'foo', { test: true }
    expect(cache.delete(key)).to be_falsy
    cache[key] = value
    expect {
      expect(cache.delete(key)).to eq true
    }.to change {
      cache.key?(key)
    }.from(true).to(false)
    expect(cache.delete(key)).to eq false
  end

  it 'can iterate over keys, values' do
    key, value = 'foo', { test: true }
    cache[key] = value
    # Collect what #each yields and assert on the collection, so the example
    # fails if nothing is yielded instead of passing vacuously.
    yielded = []
    cache.each { |k, v| yielded << [ k, v ] }
    expect(yielded).to eq [ [ prefix + key, value ] ]
  end

  it 'returns size' do
    key, value = 'foo', { test: true }
    expect {
      cache[key] = value
    }.to change {
      cache.size
    }.from(0).to(1)
  end

  it 'can clear' do
    key, value = 'foo', { test: true }
    cache[key] = value
    expect {
      # clear is expected to return the cache itself.
      expect(cache.clear).to eq cache
    }.to change {
      cache.size
    }.from(1).to(0)
  end

  it 'can iterate over keys under a prefix' do
    cache['foo'] = 'bar'
    expect(cache.to_a).to eq [ %W[ #{prefix}foo bar ] ]
  end
end
|
@@ -0,0 +1,121 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for Documentrix::Documents::RedisBackedMemoryCache. The Redis client
# is replaced by a test double, and the cache's @data instance variable is
# inspected directly to verify how calls are forwarded.
RSpec.describe Documentrix::Documents::RedisBackedMemoryCache do
  let :prefix do
    'test-'
  end

  # Built from the shared +prefix+ so that the expectations below, which also
  # use +prefix+, cannot drift from the cache configuration.
  let :cache do
    described_class.new(prefix:, url: 'something')
  end

  it 'raises ArgumentError if url is missing' do
    expect {
      described_class.new(prefix:, url: nil)
    }.to raise_error ArgumentError
  end

  context 'test redis interactions' do
    # The object held in the cache's @data instance variable.
    let :data do
      cache.instance_eval { @data }
    end

    # The object held in the cache's @redis_cache instance variable.
    let :redis_cache do
      cache.instance_eval { @redis_cache }
    end

    let :redis do
      double('Redis')
    end

    before do
      # Every RedisCache instance hands out the double instead of a client.
      allow_any_instance_of(Documentrix::Documents::RedisCache).to\
        receive(:redis).and_return(redis)
      # Stubbed without yielding anything, so no keys are reported by redis.
      allow(redis).to receive(:scan_each)
    end

    it 'can be instantiated and initialized' do
      expect(cache).to be_a described_class
    end

    it 'defaults to nil object_class' do
      expect(cache.object_class).to be_nil
    end

    it 'can be configured with object_class' do
      object_class = Class.new(JSON::GenericObject)
      cache = described_class.new(prefix:, url: 'something', object_class:)
      expect(cache.object_class).to eq object_class
    end

    it 'has Redis client' do
      expect(cache.redis).to eq redis
    end

    it 'can get a key' do
      key = 'foo'
      expect(data).to receive(:[]).with(prefix + key).and_return 666
      expect(cache[key]).to eq 666
    end

    it 'can set a value for a key' do
      key, value = 'foo', { test: true }
      expect(data).to receive(:[]=).with(prefix + key, value).and_call_original
      expect(redis).to receive(:set).with(prefix + key, JSON(value))
      cache[key] = value
    end

    it 'can determine if key exists' do
      key = 'foo'
      expect(data).to receive(:key?).with(prefix + key).and_return(false, true)
      expect(cache.key?('foo')).to eq false
      expect(cache.key?('foo')).to eq true
    end

    it 'can delete' do
      key = 'foo'
      # First deletion: the key is present in memory and in redis.
      expect(data).to receive(:delete).with(prefix + key).and_return 'bar'
      expect(redis).to receive(:del).with(prefix + key).and_return 1
      expect(cache.delete(key)).to eq true
      # Second deletion: the key is already gone from both.
      expect(data).to receive(:delete).with(prefix + key).and_return nil
      expect(redis).to receive(:del).with(prefix + key).and_return 0
      expect(cache.delete(key)).to eq false
    end

    it 'can iterate over keys, values' do
      key, value = 'foo', { 'test' => true }
      expect(redis).to receive(:set).with(prefix + key, JSON(value))
      cache[key] = value
      cache.each do |k, v|
        expect(k).to eq prefix + key
        expect(v).to eq value
      end
    end

    it 'returns size' do
      expect(cache).to receive(:count).and_return 3
      expect(cache.size).to eq 3
    end

    it 'can clear' do
      expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
        "#{prefix}foo"
      )
      expect(redis).to receive(:del).with("#{prefix}foo")
      expect(cache.clear).to eq cache
    end

    it 'can iterate over keys under a prefix' do
      data['test-foo'] = 'bar'
      expect(cache.to_a).to eq [ %w[ test-foo bar ] ]
    end

    it 'can compute prefix with pre' do
      expect(cache.pre('foo')).to eq 'test-foo'
    end

    it 'can remove prefix with unpre' do
      expect(cache.unpre('test-foo')).to eq 'foo'
    end
  end
end
|
@@ -0,0 +1,123 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Exercises Documentrix::Documents::RedisCache. Construction and
# configuration are checked directly; everything that talks to Redis runs
# against a test double returned from the stubbed #redis method.
RSpec.describe Documentrix::Documents::RedisCache do
  let(:prefix) { 'test-' }
  let(:cache)  { described_class.new(prefix:, url: 'something') }

  it 'can be instantiated' do
    expect(cache).to be_a(described_class)
  end

  it 'defaults to nil object_class' do
    expect(cache.object_class).to be_nil
  end

  it 'can be configured with object_class' do
    klass      = Class.new(JSON::GenericObject)
    configured = described_class.new(prefix:, url: 'something', object_class: klass)
    expect(configured.object_class).to eq(klass)
  end

  it 'raises ArgumentError if url is missing' do
    expect { described_class.new(prefix:, url: nil) }.to raise_error(ArgumentError)
  end

  context 'test redis interactions' do
    let(:redis) { double('Redis') }

    before do
      # Any instance of the described class gets the double as its client.
      allow_any_instance_of(described_class).to receive(:redis).and_return(redis)
    end

    it 'has Redis client' do
      expect(cache.redis).to eq(redis)
    end

    it 'can get a key' do
      name = 'foo'
      # The stored payload is a JSON document; the cache returns it decoded.
      expect(redis).to receive(:get).with(prefix + name).and_return('"some_json"')
      expect(cache[name]).to eq('some_json')
    end

    it 'can set a value for a key' do
      name    = 'foo'
      payload = { test: true }
      expect(redis).to receive(:set).with(prefix + name, JSON(payload), ex: nil)
      cache[name] = payload
    end

    it 'can set a value for a key with ttl' do
      ttl_cache = described_class.new(prefix:, url: 'something', ex: 3_600)
      name      = 'foo'
      payload   = { test: true }
      expect(redis).to receive(:set).with(prefix + name, JSON(payload), ex: 3_600)
      ttl_cache[name] = payload
      expect(redis).to receive(:ttl).with(prefix + name).and_return(3_600)
      expect(ttl_cache.ttl(name)).to eq(3_600)
    end

    it 'can determine if key exists' do
      name = 'foo'
      # Consecutive calls answer false, then true.
      expect(redis).to receive(:exists?).with(prefix + name).and_return(false, true)
      expect(cache.key?('foo')).to eq(false)
      expect(cache.key?('foo')).to eq(true)
    end

    it 'can delete' do
      name = 'foo'
      expect(redis).to receive(:del).with(prefix + name).and_return(1)
      expect(cache.delete(name)).to eq(true)
      expect(redis).to receive(:del).with(prefix + name).and_return(0)
      expect(cache.delete(name)).to eq(false)
    end

    it 'can iterate over keys, values' do
      name    = 'foo'
      payload = { 'test' => true }
      expect(redis).to receive(:set).with(prefix + name, JSON(payload), ex: nil)
      cache[name] = payload
      expect(redis).to receive(:scan_each).with(match: "#{prefix}*")
        .and_yield("#{prefix}foo")
      expect(redis).to receive(:get).with(prefix + name).and_return(JSON(test: true))
      cache.each do |full_key, stored|
        expect(full_key).to eq(prefix + name)
        expect(stored).to eq(payload)
      end
    end

    it 'returns size' do
      expect(redis).to receive(:scan_each).with(match: "#{prefix}*")
        .and_yield("#{prefix}foo")
        .and_yield("#{prefix}bar")
        .and_yield("#{prefix}baz")
      expect(cache.size).to eq(3)
    end

    it 'can clear' do
      expect(redis).to receive(:scan_each).with(match: 'test-*').and_yield('test-foo')
      expect(redis).to receive(:del).with('test-foo')
      expect(cache.clear).to eq(cache)
    end

    it 'can iterate over keys under a prefix' do
      expect(redis).to receive(:scan_each).with(match: 'test-*')
      cache.to_a
    end

    it 'can compute prefix with pre' do
      expect(cache.pre('foo')).to eq('test-foo')
    end

    it 'can remove prefix with unpre' do
      expect(cache.unpre('test-foo')).to eq('foo')
    end
  end
end
|
@@ -0,0 +1,141 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for Documentrix::Documents::SQLiteCache: construction (in-memory and
# file backed), key access, prefix handling, deletion, size, clearing (all or
# by tags), tag listing and iteration.
RSpec.describe Documentrix::Documents::SQLiteCache do
  let :prefix do
    'test-'
  end

  # A complete record-like hash used as the stored value in most examples.
  let :test_value do
    {
      key:       'test',
      text:      'test text',
      norm:      0.5,
      source:    'for-test.txt',
      tags:      %w[ test ],
      embedding: [ 0.5 ] * 1_024,
    }
  end

  let :cache do
    described_class.new(prefix:)
  end

  it 'can be instantiated' do
    expect(cache).to be_a described_class
  end

  it 'defaults to :memory: mode' do
    expect(cache.filename).to eq ':memory:'
  end

  it 'can be switched to file mode' do
    # The database constructor is stubbed so no file is opened by the example.
    expect(SQLite3::Database).to receive(:new).with('foo.sqlite').
      and_return(double.as_null_object)
    cache = described_class.new prefix:, filename: 'foo.sqlite'
    expect(cache.filename).to eq 'foo.sqlite'
  end

  it 'can get/set a key' do
    key, value = 'foo', test_value
    # Captured inside the change block so the embedding can be checked below.
    queried_value = nil
    expect {
      cache[key] = value
    }.to change {
      queried_value = cache[key]
    }.from(nil).to(Documentrix::Documents::Record[value])
    expect(queried_value.embedding).to eq [ 0.5 ] * 1_024
  end

  it 'can determine if key exists' do
    key, value = 'foo', test_value
    expect {
      cache[key] = value
    }.to change {
      cache.key?(key)
    }.from(false).to(true)
  end

  it 'can set key with different prefixes' do
    key, value = 'foo', test_value
    expect {
      cache[key] = value
    }.to change {
      cache.size
    }.from(0).to(1)
    # A dup with another prefix starts out with size 0 for that prefix.
    cache2 = cache.dup
    cache2.prefix = 'test2-'
    expect {
      cache2[key] = value
    }.to change {
      cache2.size
    }.from(0).to(1)
    expect(cache.size).to eq 1
    # full_each is expected to visit the entries of both prefixes.
    s = 0
    cache.full_each { s += 1 }
    expect(s).to eq 2
  end

  it 'can delete' do
    key, value = 'foo', test_value
    expect(cache.delete(key)).to be_falsy
    cache[key] = value
    expect {
      expect(cache.delete(key)).to eq true
    }.to change {
      cache.key?(key)
    }.from(true).to(false)
  end

  it 'returns size' do
    key, value = 'foo', test_value
    expect {
      cache[key] = value
    }.to change {
      cache.size
    }.from(0).to(1)
  end

  it 'can convert_to_vector' do
    vector = [ 23.0, 666.0 ]
    expect(cache.convert_to_vector(vector)).to eq vector
  end

  it 'can clear' do
    key, value = 'foo', { embedding: [ 0.5 ] * 1_024 }
    cache[key] = value
    expect {
      expect(cache.clear).to eq cache
    }.to change {
      cache.size
    }.from(1).to(0)
  end

  it 'can clear for tags' do
    key, value = 'foo', { tags: %w[ foo ], embedding: [ 0.5 ] * 1_024 }
    cache[key] = value
    key, value = 'bar', { embedding: [ 0.5 ] * 1_024 }
    cache[key] = value
    expect {
      # The tag is passed with a leading '#', while it was stored without one.
      expect(cache.clear_for_tags(%w[ #foo ])).to eq cache
    }.to change {
      cache.size
    }.from(2).to(1)
    expect(cache).not_to be_key 'foo'
    expect(cache).to be_key 'bar'
  end

  it 'can return tags' do
    key, value = 'foo', { tags: %w[ foo ], embedding: [ 0.5 ] * 1_024 }
    cache[key] = value
    key, value = 'bar', { tags: %w[ bar baz ], embedding: [ 0.5 ] * 1_024 }
    cache[key] = value
    tags = cache.tags
    expect(tags).to be_a Documentrix::Utils::Tags
    expect(tags.to_a).to eq %w[ bar baz foo ]
  end

  it 'can iterate over keys under a prefix' do
    cache['foo'] = test_value
    expect(cache.to_a).to eq [ [ 'test-foo', Documentrix::Documents::Record[test_value] ] ]
  end
end
|
@@ -0,0 +1,110 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Exercises Documentrix::Documents::Splitters::Character with its default
# separator as well as with custom separators, separator inclusion and a
# combining string.
RSpec.describe Documentrix::Documents::Splitters::Character do
  let(:splitter) { described_class.new(chunk_size: 23, combining_string: '') }

  it 'can be instantiated' do
    expect(splitter).to be_a(described_class)
  end

  it 'can split' do
    # Ten blocks of ten 'A's, separated by blank lines.
    paragraphs = ([ 'A' * 10 ] * 10).join("\n\n")
    chunks     = splitter.split(paragraphs)
    expect(chunks.count).to eq(5)
    expect(chunks.to_a.join('')).to eq('A' * 100)
  end

  it 'can split combining with separation' do
    combining_splitter = described_class.new(
      chunk_size:        25,
      include_separator: false,
      combining_string:  'X'
    )
    paragraphs = ([ 'A' * 10 ] * 10).join("\n\n")
    chunks     = combining_splitter.split(paragraphs)
    expect(chunks.count).to eq(5)
    # Chunks are joined with 'B' so the chunk boundaries stay visible.
    expected = "AAAAAAAAAAXAAAAAAAAAAXBAAAAAAAAAAAAAAAAAAAAXBAAAAAAAAAAAAAAAAAAAAXB"\
      "AAAAAAAAAAAAAAAAAAAAXBAAAAAAAAAAAAAAAAAAAAX"
    expect(chunks.to_a.join('B')).to eq(expected)
  end

  it 'can split including separator' do
    separator_splitter = described_class.new(
      chunk_size:        25,
      include_separator: true,
      combining_string:  ''
    )
    paragraphs = ([ 'A' * 10 ] * 10).join("\n\n")
    chunks     = separator_splitter.split(paragraphs)
    expect(chunks.count).to eq(5)
    expect(chunks.to_a.join('')).to eq(paragraphs)
  end

  it 'cannot split' do
    # Single newlines only: the whole text is expected to come back as one chunk.
    lines  = ([ 'A' * 10 ] * 10).join("\n")
    chunks = splitter.split(lines)
    expect(chunks.count).to eq(1)
    expect(chunks.to_a.join('').count('A')).to eq(lines.count('A'))
  end

  it 'cannot split2' do
    unbroken = 'A' * 25
    chunks   = splitter.split(unbroken)
    expect(chunks.count).to eq(1)
    expect(chunks.to_a.join('')).to eq('A' * 25)
  end

  it 'can split sentences' do
    sentence_splitter = described_class.new(separator: /[.!?]\s*(?:\b|\z)/, chunk_size: 2)
    sentences = "foo.foo. bar!bar! baz?baz? quux.\nquux."
    chunks    = sentence_splitter.split(sentences)
    expect(chunks.to_a).to eq(%w[ foo foo bar bar baz baz quux quux ])
  end
end
|
60
|
+
|
61
|
+
# Exercises Documentrix::Documents::Splitters::RecursiveCharacter on texts
# separated by blank lines, by single newlines, and on text without any
# separator.
RSpec.describe Documentrix::Documents::Splitters::RecursiveCharacter do
  let(:splitter) { described_class.new(chunk_size: 23, combining_string: '') }

  it 'can be instantiated' do
    expect(splitter).to be_a(described_class)
  end

  it 'can split' do
    paragraphs = ([ 'A' * 10 ] * 10).join("\n\n")
    chunks     = splitter.split(paragraphs)
    expect(chunks.count).to eq(5)
    expect(chunks.to_a.join('')).to eq('A' * 100)
  end

  it 'cannot split' do
    # Uses every default separator except the last one.
    limited_splitter = described_class.new(
      chunk_size:        23,
      include_separator: true,
      separators:        described_class::DEFAULT_SEPARATORS[0..-2]
    )
    unbroken = 'A' * 25
    chunks   = limited_splitter.split(unbroken)
    expect(chunks.count).to eq(1)
    expect(chunks.to_a.join('')).to eq('A' * 25)
  end

  it 'can split including separator' do
    separator_splitter = described_class.new(
      chunk_size:        25,
      include_separator: true,
      combining_string:  ''
    )
    paragraphs = ([ 'A' * 10 ] * 10).join("\n\n")
    chunks     = separator_splitter.split(paragraphs)
    expect(chunks.count).to eq(5)
    expect(chunks.to_a.join('')).to eq(paragraphs)
  end

  it 'can split single newline as well' do
    lines  = ([ 'A' * 10 ] * 10).join("\n")
    chunks = splitter.split(lines)
    expect(chunks.count).to eq(5)
    expect(chunks.to_a.join('')).to eq('A' * 100)
  end

  it 'can split single newline as well including separator' do
    separator_splitter = described_class.new(
      chunk_size:        25,
      include_separator: true,
      combining_string:  ''
    )
    lines  = ([ 'A' * 10 ] * 10).join("\n")
    chunks = separator_splitter.split(lines)
    expect(chunks.count).to eq(5)
    expect(chunks.to_a.join('')).to eq(lines)
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for Documentrix::Documents::Splitters::Semantic. The Ollama client is
# a test double whose #embed always answers with the embeddings loaded from
# the embeddings.json asset, so no model is contacted.
RSpec.describe Documentrix::Documents::Splitters::Semantic do
  let :ollama do
    double('Ollama::Client')
  end

  let :splitter do
    described_class.new ollama:, model: 'mxbai-embed-large'
  end

  let :embeddings do
    JSON(File.read(asset('embeddings.json')))
  end

  # Declared ahead of the examples for readability; the hook applies to every
  # example in this group regardless of where it is written.
  before do
    allow(ollama).to receive(:embed).and_return(double(embeddings:))
  end

  it 'can be instantiated' do
    expect(splitter).to be_a described_class
  end

  it 'can split with breakpoint :percentile' do
    text = ([ "A" * 10 ] * 3 + [ "B" * 10 ] * 3 + [ "A" * 10 ] * 3) * ". "
    result = splitter.split(text, breakpoint: :percentile, percentile: 75)
    expect(result.count).to eq 3
    expect(result.to_a.join('').count(?A)).to eq text.count(?A)
    expect(result.to_a.join('').count(?B)).to eq text.count(?B)
  end

  # Given its own description: it previously shared the exact description of
  # the example above, which made failures ambiguous in the report.
  it 'can split longer text with breakpoint :percentile' do
    # NOTE(review): this instance is built and then discarded; the split below
    # still goes through the default +splitter+. If chunk_size: 50 was meant
    # to be exercised, assign the instance and re-check the expected count.
    described_class.new ollama:, model: 'mxbai-embed-large', chunk_size: 50
    text = ([ "A" * 10 ] * 6 + [ "B" * 10 ] * 3 + [ "A" * 10 ] * 3) * ". "
    result = splitter.split(text, breakpoint: :percentile, percentile: 75)
    expect(result.count).to eq 4
    expect(result.to_a.join('').count(?A)).to eq text.count(?A)
    expect(result.to_a.join('').count(?B)).to eq text.count(?B)
  end

  it 'can split with breakpoint :standard_deviation' do
    text = ([ "A" * 10 ] * 3 + [ "B" * 10 ] * 3 + [ "A" * 10 ] * 3) * ". "
    result = splitter.split(text, breakpoint: :standard_deviation, percentage: 100)
    expect(result.count).to eq 3
    expect(result.to_a.join('').count(?A)).to eq text.count(?A)
    expect(result.to_a.join('').count(?B)).to eq text.count(?B)
  end

  it 'can split with breakpoint :interquartile' do
    text = ([ "A" * 10 ] * 3 + [ "B" * 10 ] * 3 + [ "A" * 10 ] * 3) * ". "
    result = splitter.split(text, breakpoint: :interquartile, percentage: 75)
    expect(result.count).to eq 3
    expect(result.to_a.join('').count(?A)).to eq text.count(?A)
    expect(result.to_a.join('').count(?B)).to eq text.count(?B)
  end
end
|