documentrix 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +1 -0
- data/Gemfile +5 -0
- data/README.md +71 -0
- data/Rakefile +48 -0
- data/documentrix.gemspec +41 -0
- data/lib/documentrix/documents/cache/common.rb +43 -0
- data/lib/documentrix/documents/cache/memory_cache.rb +91 -0
- data/lib/documentrix/documents/cache/records.rb +145 -0
- data/lib/documentrix/documents/cache/redis_backed_memory_cache.rb +64 -0
- data/lib/documentrix/documents/cache/redis_cache.rb +128 -0
- data/lib/documentrix/documents/cache/sqlite_cache.rb +335 -0
- data/lib/documentrix/documents/splitters/character.rb +72 -0
- data/lib/documentrix/documents/splitters/semantic.rb +91 -0
- data/lib/documentrix/documents.rb +328 -0
- data/lib/documentrix/utils/colorize_texts.rb +65 -0
- data/lib/documentrix/utils/math.rb +48 -0
- data/lib/documentrix/utils/tags.rb +112 -0
- data/lib/documentrix/utils.rb +5 -0
- data/lib/documentrix/version.rb +8 -0
- data/lib/documentrix.rb +11 -0
- data/spec/assets/embeddings.json +1 -0
- data/spec/documentrix/documents/cache/memory_cache_spec.rb +98 -0
- data/spec/documentrix/documents/cache/redis_backed_memory_cache_spec.rb +121 -0
- data/spec/documentrix/documents/cache/redis_cache_spec.rb +123 -0
- data/spec/documentrix/documents/cache/sqlite_cache_spec.rb +141 -0
- data/spec/documentrix/documents/splitters/character_spec.rb +110 -0
- data/spec/documentrix/documents/splitters/semantic_spec.rb +56 -0
- data/spec/documents_spec.rb +174 -0
- data/spec/spec_helper.rb +23 -0
- data/spec/utils/colorize_texts_spec.rb +13 -0
- data/spec/utils/tags_spec.rb +53 -0
- metadata +329 -0
@@ -0,0 +1,98 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
RSpec.describe Documentrix::Documents::MemoryCache do
|
4
|
+
let :prefix do
|
5
|
+
'test-'
|
6
|
+
end
|
7
|
+
|
8
|
+
let :cache do
|
9
|
+
described_class.new prefix:
|
10
|
+
end
|
11
|
+
|
12
|
+
it 'can be instantiated' do
|
13
|
+
expect(cache).to be_a described_class
|
14
|
+
end
|
15
|
+
|
16
|
+
it 'can get/set a key' do
|
17
|
+
key, value = 'foo', { test: true }
|
18
|
+
expect {
|
19
|
+
cache[key] = value
|
20
|
+
}.to change {
|
21
|
+
cache[key]
|
22
|
+
}.from(nil).to(value)
|
23
|
+
end
|
24
|
+
|
25
|
+
it 'can determine if key exists' do
|
26
|
+
key, value = 'foo', { test: true }
|
27
|
+
expect {
|
28
|
+
cache[key] = value
|
29
|
+
}.to change {
|
30
|
+
cache.key?(key)
|
31
|
+
}.from(false).to(true)
|
32
|
+
end
|
33
|
+
|
34
|
+
it 'can set key with different prefixes' do
|
35
|
+
key, value = 'foo', { test: true }
|
36
|
+
expect {
|
37
|
+
cache[key] = value
|
38
|
+
}.to change {
|
39
|
+
cache.size
|
40
|
+
}.from(0).to(1)
|
41
|
+
cache2 = cache.dup
|
42
|
+
cache2.prefix = 'test2-'
|
43
|
+
expect {
|
44
|
+
cache2[key] = value
|
45
|
+
}.to change {
|
46
|
+
cache2.size
|
47
|
+
}.from(0).to(1)
|
48
|
+
expect(cache.size).to eq 1
|
49
|
+
s = 0
|
50
|
+
cache.full_each { s += 1 }
|
51
|
+
expect(s).to eq 2
|
52
|
+
end
|
53
|
+
|
54
|
+
it 'can delete' do
|
55
|
+
key, value = 'foo', { test: true }
|
56
|
+
expect(cache.delete(key)).to be_falsy
|
57
|
+
cache[key] = value
|
58
|
+
expect {
|
59
|
+
expect(cache.delete(key)).to eq true
|
60
|
+
}.to change {
|
61
|
+
cache.key?(key)
|
62
|
+
}.from(true).to(false)
|
63
|
+
expect(cache.delete(key)).to eq false
|
64
|
+
end
|
65
|
+
|
66
|
+
it 'can iterate over keys, values' do
|
67
|
+
key, value = 'foo', { test: true }
|
68
|
+
cache[key] = value
|
69
|
+
cache.each do |k, v|
|
70
|
+
expect(k).to eq prefix + key
|
71
|
+
expect(v).to eq value
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
it 'returns size' do
|
76
|
+
key, value = 'foo', { test: true }
|
77
|
+
expect {
|
78
|
+
cache[key] = value
|
79
|
+
}.to change {
|
80
|
+
cache.size
|
81
|
+
}.from(0).to(1)
|
82
|
+
end
|
83
|
+
|
84
|
+
it 'can clear' do
|
85
|
+
key, value = 'foo', { test: true }
|
86
|
+
cache[key] = value
|
87
|
+
expect {
|
88
|
+
expect(cache.clear).to eq cache
|
89
|
+
}.to change {
|
90
|
+
cache.size
|
91
|
+
}.from(1).to(0)
|
92
|
+
end
|
93
|
+
|
94
|
+
it 'can iterate over keys under a prefix' do
|
95
|
+
cache['foo'] = 'bar'
|
96
|
+
expect(cache.to_a).to eq [ %W[ #{prefix}foo bar ] ]
|
97
|
+
end
|
98
|
+
end
|
@@ -0,0 +1,121 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
RSpec.describe Documentrix::Documents::RedisBackedMemoryCache do
|
4
|
+
let :prefix do
|
5
|
+
'test-'
|
6
|
+
end
|
7
|
+
|
8
|
+
let :cache do
|
9
|
+
described_class.new prefix: 'test-', url: 'something'
|
10
|
+
end
|
11
|
+
|
12
|
+
it 'raises ArgumentError if url is missing' do
|
13
|
+
expect {
|
14
|
+
described_class.new prefix:, url: nil
|
15
|
+
}.to raise_error ArgumentError
|
16
|
+
end
|
17
|
+
|
18
|
+
context 'test redis interactions' do
|
19
|
+
let :data do
|
20
|
+
cache.instance_eval { @data }
|
21
|
+
end
|
22
|
+
|
23
|
+
let :redis_cache do
|
24
|
+
cache.instance_eval { @redis_cache }
|
25
|
+
end
|
26
|
+
|
27
|
+
let :redis do
|
28
|
+
double('Redis')
|
29
|
+
end
|
30
|
+
|
31
|
+
before do
|
32
|
+
allow_any_instance_of(Documentrix::Documents::RedisCache).to\
|
33
|
+
receive(:redis).and_return(redis)
|
34
|
+
allow(redis).to receive(:scan_each)
|
35
|
+
end
|
36
|
+
|
37
|
+
it 'can be instantiated and initialized' do
|
38
|
+
expect(cache).to be_a described_class
|
39
|
+
end
|
40
|
+
|
41
|
+
it 'defaults to nil object_class' do
|
42
|
+
expect(cache.object_class).to be_nil
|
43
|
+
end
|
44
|
+
|
45
|
+
it 'can be configured with object_class' do
|
46
|
+
object_class = Class.new(JSON::GenericObject)
|
47
|
+
cache = described_class.new(prefix: 'test-', url: 'something', object_class:)
|
48
|
+
expect(cache.object_class).to eq object_class
|
49
|
+
end
|
50
|
+
|
51
|
+
it 'has Redis client' do
|
52
|
+
expect(cache.redis).to eq redis
|
53
|
+
end
|
54
|
+
|
55
|
+
it 'can get a key' do
|
56
|
+
key = 'foo'
|
57
|
+
expect(data).to receive(:[]).with('test-' + key).and_return 666
|
58
|
+
expect(cache[key]).to eq 666
|
59
|
+
end
|
60
|
+
|
61
|
+
it 'can set a value for a key' do
|
62
|
+
key, value = 'foo', { test: true }
|
63
|
+
expect(data).to receive(:[]=).with('test-' + key, { test: true }).and_call_original
|
64
|
+
expect(redis).to receive(:set).with('test-' + key, JSON(value))
|
65
|
+
cache[key] = value
|
66
|
+
end
|
67
|
+
|
68
|
+
it 'can determine if key exists' do
|
69
|
+
key = 'foo'
|
70
|
+
expect(data).to receive(:key?).with('test-' + key).and_return(false, true)
|
71
|
+
expect(cache.key?('foo')).to eq false
|
72
|
+
expect(cache.key?('foo')).to eq true
|
73
|
+
end
|
74
|
+
|
75
|
+
it 'can delete' do
|
76
|
+
key = 'foo'
|
77
|
+
expect(data).to receive(:delete).with('test-' + key).and_return 'bar'
|
78
|
+
expect(redis).to receive(:del).with('test-' + key).and_return 1
|
79
|
+
expect(cache.delete(key)).to eq true
|
80
|
+
expect(data).to receive(:delete).with('test-' + key).and_return nil
|
81
|
+
expect(redis).to receive(:del).with(prefix + key).and_return 0
|
82
|
+
expect(cache.delete(key)).to eq false
|
83
|
+
end
|
84
|
+
|
85
|
+
it 'can iterate over keys, values' do
|
86
|
+
key, value = 'foo', { 'test' => true }
|
87
|
+
expect(redis).to receive(:set).with('test-' + key, JSON(value))
|
88
|
+
cache[key] = value
|
89
|
+
cache.each do |k, v|
|
90
|
+
expect(k).to eq prefix + key
|
91
|
+
expect(v).to eq value
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
it 'returns size' do
|
96
|
+
expect(cache).to receive(:count).and_return 3
|
97
|
+
expect(cache.size).to eq 3
|
98
|
+
end
|
99
|
+
|
100
|
+
it 'can clear' do
|
101
|
+
expect(redis).to receive(:scan_each).with(match: 'test-*').and_yield(
|
102
|
+
'test-foo'
|
103
|
+
)
|
104
|
+
expect(redis).to receive(:del).with('test-foo')
|
105
|
+
expect(cache.clear).to eq cache
|
106
|
+
end
|
107
|
+
|
108
|
+
it 'can iterate over keys under a prefix' do
|
109
|
+
data['test-foo'] = 'bar'
|
110
|
+
expect(cache.to_a).to eq [ %w[ test-foo bar ] ]
|
111
|
+
end
|
112
|
+
|
113
|
+
it 'can compute prefix with pre' do
|
114
|
+
expect(cache.pre('foo')).to eq 'test-foo'
|
115
|
+
end
|
116
|
+
|
117
|
+
it 'can remove prefix with unpre' do
|
118
|
+
expect(cache.unpre('test-foo')).to eq 'foo'
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
@@ -0,0 +1,123 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
RSpec.describe Documentrix::Documents::RedisCache do
|
4
|
+
let :prefix do
|
5
|
+
'test-'
|
6
|
+
end
|
7
|
+
|
8
|
+
let :cache do
|
9
|
+
described_class.new prefix:, url: 'something'
|
10
|
+
end
|
11
|
+
|
12
|
+
it 'can be instantiated' do
|
13
|
+
expect(cache).to be_a described_class
|
14
|
+
end
|
15
|
+
|
16
|
+
it 'defaults to nil object_class' do
|
17
|
+
expect(cache.object_class).to be_nil
|
18
|
+
end
|
19
|
+
|
20
|
+
it 'can be configured with object_class' do
|
21
|
+
object_class = Class.new(JSON::GenericObject)
|
22
|
+
cache = described_class.new(prefix:, url: 'something', object_class:)
|
23
|
+
expect(cache.object_class).to eq object_class
|
24
|
+
end
|
25
|
+
|
26
|
+
it 'raises ArgumentError if url is missing' do
|
27
|
+
expect {
|
28
|
+
described_class.new prefix:, url: nil
|
29
|
+
}.to raise_error ArgumentError
|
30
|
+
end
|
31
|
+
|
32
|
+
context 'test redis interactions' do
|
33
|
+
let :redis do
|
34
|
+
double('Redis')
|
35
|
+
end
|
36
|
+
|
37
|
+
before do
|
38
|
+
allow_any_instance_of(described_class).to receive(:redis).and_return(redis)
|
39
|
+
end
|
40
|
+
|
41
|
+
it 'has Redis client' do
|
42
|
+
expect(cache.redis).to eq redis
|
43
|
+
end
|
44
|
+
|
45
|
+
it 'can get a key' do
|
46
|
+
key = 'foo'
|
47
|
+
expect(redis).to receive(:get).with(prefix + key).and_return '"some_json"'
|
48
|
+
expect(cache[key]).to eq 'some_json'
|
49
|
+
end
|
50
|
+
|
51
|
+
it 'can set a value for a key' do
|
52
|
+
key, value = 'foo', { test: true }
|
53
|
+
expect(redis).to receive(:set).with(prefix + key, JSON(value), ex: nil)
|
54
|
+
cache[key] = value
|
55
|
+
end
|
56
|
+
|
57
|
+
it 'can set a value for a key with ttl' do
|
58
|
+
cache = described_class.new prefix:, url: 'something', ex: 3_600
|
59
|
+
key, value = 'foo', { test: true }
|
60
|
+
expect(redis).to receive(:set).with(prefix + key, JSON(value), ex: 3_600)
|
61
|
+
cache[key] = value
|
62
|
+
expect(redis).to receive(:ttl).with(prefix + key).and_return 3_600
|
63
|
+
expect(cache.ttl(key)).to eq 3_600
|
64
|
+
end
|
65
|
+
|
66
|
+
it 'can determine if key exists' do
|
67
|
+
key = 'foo'
|
68
|
+
expect(redis).to receive(:exists?).with(prefix + key).and_return(false, true)
|
69
|
+
expect(cache.key?('foo')).to eq false
|
70
|
+
expect(cache.key?('foo')).to eq true
|
71
|
+
end
|
72
|
+
|
73
|
+
it 'can delete' do
|
74
|
+
key = 'foo'
|
75
|
+
expect(redis).to receive(:del).with(prefix + key).and_return 1
|
76
|
+
expect(cache.delete(key)).to eq true
|
77
|
+
expect(redis).to receive(:del).with(prefix + key).and_return 0
|
78
|
+
expect(cache.delete(key)).to eq false
|
79
|
+
end
|
80
|
+
|
81
|
+
it 'can iterate over keys, values' do
|
82
|
+
key, value = 'foo', { 'test' => true }
|
83
|
+
expect(redis).to receive(:set).with(prefix + key, JSON(value), ex: nil)
|
84
|
+
cache[key] = value
|
85
|
+
expect(redis).to receive(:scan_each).with(match: "#{prefix}*").
|
86
|
+
and_yield("#{prefix}foo")
|
87
|
+
expect(redis).to receive(:get).with(prefix + key).and_return(JSON(test: true))
|
88
|
+
cache.each do |k, v|
|
89
|
+
expect(k).to eq prefix + key
|
90
|
+
expect(v).to eq value
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
it 'returns size' do
|
95
|
+
expect(redis).to receive(:scan_each).with(match: "#{prefix}*").
|
96
|
+
and_yield("#{prefix}foo").
|
97
|
+
and_yield("#{prefix}bar").
|
98
|
+
and_yield("#{prefix}baz")
|
99
|
+
expect(cache.size).to eq 3
|
100
|
+
end
|
101
|
+
|
102
|
+
it 'can clear' do
|
103
|
+
expect(redis).to receive(:scan_each).with(match: 'test-*').and_yield(
|
104
|
+
'test-foo'
|
105
|
+
)
|
106
|
+
expect(redis).to receive(:del).with('test-foo')
|
107
|
+
expect(cache.clear).to eq cache
|
108
|
+
end
|
109
|
+
|
110
|
+
it 'can iterate over keys under a prefix' do
|
111
|
+
expect(redis).to receive(:scan_each).with(match: 'test-*')
|
112
|
+
cache.to_a
|
113
|
+
end
|
114
|
+
|
115
|
+
it 'can compute prefix with pre' do
|
116
|
+
expect(cache.pre('foo')).to eq 'test-foo'
|
117
|
+
end
|
118
|
+
|
119
|
+
it 'can remove prefix with unpre' do
|
120
|
+
expect(cache.unpre('test-foo')).to eq 'foo'
|
121
|
+
end
|
122
|
+
end
|
123
|
+
end
|
@@ -0,0 +1,141 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
RSpec.describe Documentrix::Documents::SQLiteCache do
|
4
|
+
let :prefix do
|
5
|
+
'test-'
|
6
|
+
end
|
7
|
+
|
8
|
+
let :test_value do
|
9
|
+
{
|
10
|
+
key: 'test',
|
11
|
+
text: 'test text',
|
12
|
+
norm: 0.5,
|
13
|
+
source: 'for-test.txt',
|
14
|
+
tags: %w[ test ],
|
15
|
+
embedding: [ 0.5 ] * 1_024,
|
16
|
+
}
|
17
|
+
end
|
18
|
+
|
19
|
+
let :cache do
|
20
|
+
described_class.new prefix:
|
21
|
+
end
|
22
|
+
|
23
|
+
it 'can be instantiated' do
|
24
|
+
expect(cache).to be_a described_class
|
25
|
+
end
|
26
|
+
|
27
|
+
it 'defaults to :memory: mode' do
|
28
|
+
expect(cache.filename).to eq ':memory:'
|
29
|
+
end
|
30
|
+
|
31
|
+
it 'can be switched to file mode' do
|
32
|
+
expect(SQLite3::Database).to receive(:new).with('foo.sqlite').
|
33
|
+
and_return(double.as_null_object)
|
34
|
+
cache = described_class.new prefix:, filename: 'foo.sqlite'
|
35
|
+
expect(cache.filename).to eq 'foo.sqlite'
|
36
|
+
end
|
37
|
+
|
38
|
+
it 'can get/set a key' do
|
39
|
+
key, value = 'foo', test_value
|
40
|
+
queried_value = nil
|
41
|
+
expect {
|
42
|
+
cache[key] = value
|
43
|
+
}.to change {
|
44
|
+
queried_value = cache[key]
|
45
|
+
}.from(nil).to(Documentrix::Documents::Record[value])
|
46
|
+
expect(queried_value.embedding).to eq [ 0.5 ] * 1_024
|
47
|
+
end
|
48
|
+
|
49
|
+
it 'can determine if key exists' do
|
50
|
+
key, value = 'foo', test_value
|
51
|
+
expect {
|
52
|
+
cache[key] = value
|
53
|
+
}.to change {
|
54
|
+
cache.key?(key)
|
55
|
+
}.from(false).to(true)
|
56
|
+
end
|
57
|
+
|
58
|
+
it 'can set key with different prefixes' do
|
59
|
+
key, value = 'foo', test_value
|
60
|
+
expect {
|
61
|
+
cache[key] = value
|
62
|
+
}.to change {
|
63
|
+
cache.size
|
64
|
+
}.from(0).to(1)
|
65
|
+
cache2 = cache.dup
|
66
|
+
cache2.prefix = 'test2-'
|
67
|
+
expect {
|
68
|
+
cache2[key] = value
|
69
|
+
}.to change {
|
70
|
+
cache2.size
|
71
|
+
}.from(0).to(1)
|
72
|
+
expect(cache.size).to eq 1
|
73
|
+
s = 0
|
74
|
+
cache.full_each { s += 1 }
|
75
|
+
expect(s).to eq 2
|
76
|
+
end
|
77
|
+
|
78
|
+
it 'can delete' do
|
79
|
+
key, value = 'foo', test_value
|
80
|
+
expect(cache.delete(key)).to be_falsy
|
81
|
+
cache[key] = value
|
82
|
+
expect {
|
83
|
+
expect(cache.delete(key)).to eq true
|
84
|
+
}.to change {
|
85
|
+
cache.key?(key)
|
86
|
+
}.from(true).to(false)
|
87
|
+
end
|
88
|
+
|
89
|
+
it 'returns size' do
|
90
|
+
key, value = 'foo', test_value
|
91
|
+
expect {
|
92
|
+
cache[key] = value
|
93
|
+
}.to change {
|
94
|
+
cache.size
|
95
|
+
}.from(0).to(1)
|
96
|
+
end
|
97
|
+
|
98
|
+
it 'can convert_to_vector' do
|
99
|
+
vector = [ 23.0, 666.0 ]
|
100
|
+
expect(cache.convert_to_vector(vector)).to eq vector
|
101
|
+
end
|
102
|
+
|
103
|
+
it 'can clear' do
|
104
|
+
key, value = 'foo', { embedding: [ 0.5 ] * 1_024 }
|
105
|
+
cache[key] = value
|
106
|
+
expect {
|
107
|
+
expect(cache.clear).to eq cache
|
108
|
+
}.to change {
|
109
|
+
cache.size
|
110
|
+
}.from(1).to(0)
|
111
|
+
end
|
112
|
+
|
113
|
+
it 'can clear for tags' do
|
114
|
+
key, value = 'foo', { tags: %w[ foo ], embedding: [ 0.5 ] * 1_024 }
|
115
|
+
cache[key] = value
|
116
|
+
key, value = 'bar', { embedding: [ 0.5 ] * 1_024 }
|
117
|
+
cache[key] = value
|
118
|
+
expect {
|
119
|
+
expect(cache.clear_for_tags(%w[ #foo ])).to eq cache
|
120
|
+
}.to change {
|
121
|
+
cache.size
|
122
|
+
}.from(2).to(1)
|
123
|
+
expect(cache).not_to be_key 'foo'
|
124
|
+
expect(cache).to be_key 'bar'
|
125
|
+
end
|
126
|
+
|
127
|
+
it 'can return tags' do
|
128
|
+
key, value = 'foo', { tags: %w[ foo ], embedding: [ 0.5 ] * 1_024 }
|
129
|
+
cache[key] = value
|
130
|
+
key, value = 'bar', { tags: %w[ bar baz ], embedding: [ 0.5 ] * 1_024 }
|
131
|
+
cache[key] = value
|
132
|
+
tags = cache.tags
|
133
|
+
expect(tags).to be_a Documentrix::Utils::Tags
|
134
|
+
expect(tags.to_a).to eq %w[ bar baz foo ]
|
135
|
+
end
|
136
|
+
|
137
|
+
it 'can iterate over keys under a prefix' do
|
138
|
+
cache['foo'] = test_value
|
139
|
+
expect(cache.to_a).to eq [ [ 'test-foo', Documentrix::Documents::Record[test_value] ] ]
|
140
|
+
end
|
141
|
+
end
|
@@ -0,0 +1,110 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
RSpec.describe Documentrix::Documents::Splitters::Character do
|
4
|
+
let :splitter do
|
5
|
+
described_class.new chunk_size: 23, combining_string: ''
|
6
|
+
end
|
7
|
+
|
8
|
+
it 'can be instantiated' do
|
9
|
+
expect(splitter).to be_a described_class
|
10
|
+
end
|
11
|
+
|
12
|
+
it 'can split' do
|
13
|
+
text = [ ?A * 10 ] * 10 * "\n\n"
|
14
|
+
result = splitter.split(text)
|
15
|
+
expect(result.count).to eq 5
|
16
|
+
expect(result.to_a.join('')).to eq ?A * 100
|
17
|
+
end
|
18
|
+
|
19
|
+
it 'can split combining with separation' do
|
20
|
+
splitter = described_class.new chunk_size: 25, include_separator: false,
|
21
|
+
combining_string: ?X
|
22
|
+
text = [ ?A * 10 ] * 10 * "\n\n"
|
23
|
+
result = splitter.split(text)
|
24
|
+
expect(result.count).to eq 5
|
25
|
+
expect(result.to_a.join(?B)).to eq\
|
26
|
+
"AAAAAAAAAAXAAAAAAAAAAXBAAAAAAAAAAAAAAAAAAAAXBAAAAAAAAAAAAAAAAAAAAXB"\
|
27
|
+
"AAAAAAAAAAAAAAAAAAAAXBAAAAAAAAAAAAAAAAAAAAX"
|
28
|
+
end
|
29
|
+
|
30
|
+
it 'can split including separator' do
|
31
|
+
splitter = described_class.new chunk_size: 25, include_separator: true,
|
32
|
+
combining_string: ''
|
33
|
+
text = [ ?A * 10 ] * 10 * "\n\n"
|
34
|
+
result = splitter.split(text)
|
35
|
+
expect(result.count).to eq 5
|
36
|
+
expect(result.to_a.join('')).to eq text
|
37
|
+
end
|
38
|
+
|
39
|
+
it 'cannot split' do
|
40
|
+
text = [ ?A * 10 ] * 10 * "\n"
|
41
|
+
result = splitter.split(text)
|
42
|
+
expect(result.count).to eq 1
|
43
|
+
expect(result.to_a.join('').count(?A)).to eq text.count(?A)
|
44
|
+
end
|
45
|
+
|
46
|
+
it 'cannot split2' do
|
47
|
+
text = ?A * 25
|
48
|
+
result = splitter.split(text)
|
49
|
+
expect(result.count).to eq 1
|
50
|
+
expect(result.to_a.join('')).to eq ?A * 25
|
51
|
+
end
|
52
|
+
|
53
|
+
it 'can split sentences' do
|
54
|
+
text = "foo.foo. bar!bar! baz?baz? quux.\nquux."
|
55
|
+
splitter = described_class.new(separator: /[.!?]\s*(?:\b|\z)/, chunk_size: 2)
|
56
|
+
result = splitter.split(text)
|
57
|
+
expect(result.to_a).to eq %w[ foo foo bar bar baz baz quux quux ]
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
RSpec.describe Documentrix::Documents::Splitters::RecursiveCharacter do
|
62
|
+
let :splitter do
|
63
|
+
described_class.new chunk_size: 23, combining_string: ''
|
64
|
+
end
|
65
|
+
|
66
|
+
it 'can be instantiated' do
|
67
|
+
expect(splitter).to be_a described_class
|
68
|
+
end
|
69
|
+
|
70
|
+
it 'can split' do
|
71
|
+
text = [ ?A * 10 ] * 10 * "\n\n"
|
72
|
+
result = splitter.split(text)
|
73
|
+
expect(result.count).to eq 5
|
74
|
+
expect(result.to_a.join('')).to eq ?A * 100
|
75
|
+
end
|
76
|
+
|
77
|
+
it 'cannot split' do
|
78
|
+
splitter = described_class.new chunk_size: 23, include_separator: true,
|
79
|
+
separators: described_class::DEFAULT_SEPARATORS[0..-2]
|
80
|
+
text = ?A * 25
|
81
|
+
result = splitter.split(text)
|
82
|
+
expect(result.count).to eq 1
|
83
|
+
expect(result.to_a.join('')).to eq ?A * 25
|
84
|
+
end
|
85
|
+
|
86
|
+
it 'can split including separator' do
|
87
|
+
splitter = described_class.new chunk_size: 25, include_separator: true,
|
88
|
+
combining_string: ''
|
89
|
+
text = [ ?A * 10 ] * 10 * "\n\n"
|
90
|
+
result = splitter.split(text)
|
91
|
+
expect(result.count).to eq 5
|
92
|
+
expect(result.to_a.join('')).to eq text
|
93
|
+
end
|
94
|
+
|
95
|
+
it 'can split single newline as well' do
|
96
|
+
text = [ ?A * 10 ] * 10 * "\n"
|
97
|
+
result = splitter.split(text)
|
98
|
+
expect(result.count).to eq 5
|
99
|
+
expect(result.to_a.join('')).to eq ?A * 100
|
100
|
+
end
|
101
|
+
|
102
|
+
it 'can split single newline as well including separator' do
|
103
|
+
splitter = described_class.new chunk_size: 25, include_separator: true,
|
104
|
+
combining_string: ''
|
105
|
+
text = [ ?A * 10 ] * 10 * "\n"
|
106
|
+
result = splitter.split(text)
|
107
|
+
expect(result.count).to eq 5
|
108
|
+
expect(result.to_a.join('')).to eq text
|
109
|
+
end
|
110
|
+
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
RSpec.describe Documentrix::Documents::Splitters::Semantic do
|
4
|
+
let :ollama do
|
5
|
+
double('Ollama::Client')
|
6
|
+
end
|
7
|
+
|
8
|
+
let :splitter do
|
9
|
+
described_class.new ollama:, model: 'mxbai-embed-large'
|
10
|
+
end
|
11
|
+
|
12
|
+
let :embeddings do
|
13
|
+
JSON(File.read(asset('embeddings.json')))
|
14
|
+
end
|
15
|
+
|
16
|
+
it 'can be instantiated' do
|
17
|
+
expect(splitter).to be_a described_class
|
18
|
+
end
|
19
|
+
|
20
|
+
before do
|
21
|
+
allow(ollama).to receive(:embed).and_return(double(embeddings:))
|
22
|
+
end
|
23
|
+
|
24
|
+
it 'can split with breakpoint :percentile' do
|
25
|
+
text = ([ "A" * 10 ] * 3 + [ "B" * 10 ] * 3 + [ "A" * 10 ] * 3) * ". "
|
26
|
+
result = splitter.split(text, breakpoint: :percentile, percentile: 75)
|
27
|
+
expect(result.count).to eq 3
|
28
|
+
expect(result.to_a.join('').count(?A)).to eq text.count(?A)
|
29
|
+
expect(result.to_a.join('').count(?B)).to eq text.count(?B)
|
30
|
+
end
|
31
|
+
|
32
|
+
it 'can split a longer text with breakpoint :percentile' do
|
33
|
+
described_class.new ollama:, model: 'mxbai-embed-large', chunk_size: 50
|
34
|
+
text = ([ "A" * 10 ] * 6 + [ "B" * 10 ] * 3 + [ "A" * 10 ] * 3) * ". "
|
35
|
+
result = splitter.split(text, breakpoint: :percentile, percentile: 75)
|
36
|
+
expect(result.count).to eq 4
|
37
|
+
expect(result.to_a.join('').count(?A)).to eq text.count(?A)
|
38
|
+
expect(result.to_a.join('').count(?B)).to eq text.count(?B)
|
39
|
+
end
|
40
|
+
|
41
|
+
it 'can split with breakpoint :standard_deviation' do
|
42
|
+
text = ([ "A" * 10 ] * 3 + [ "B" * 10 ] * 3 + [ "A" * 10 ] * 3) * ". "
|
43
|
+
result = splitter.split(text, breakpoint: :standard_deviation, percentage: 100)
|
44
|
+
expect(result.count).to eq 3
|
45
|
+
expect(result.to_a.join('').count(?A)).to eq text.count(?A)
|
46
|
+
expect(result.to_a.join('').count(?B)).to eq text.count(?B)
|
47
|
+
end
|
48
|
+
|
49
|
+
it 'can split with breakpoint :interquartile' do
|
50
|
+
text = ([ "A" * 10 ] * 3 + [ "B" * 10 ] * 3 + [ "A" * 10 ] * 3) * ". "
|
51
|
+
result = splitter.split(text, breakpoint: :interquartile, percentage: 75)
|
52
|
+
expect(result.count).to eq 3
|
53
|
+
expect(result.to_a.join('').count(?A)).to eq text.count(?A)
|
54
|
+
expect(result.to_a.join('').count(?B)).to eq text.count(?B)
|
55
|
+
end
|
56
|
+
end
|