documentrix 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. checksums.yaml +7 -0
  2. data/.yardopts +1 -0
  3. data/Gemfile +5 -0
  4. data/README.md +71 -0
  5. data/Rakefile +48 -0
  6. data/documentrix.gemspec +41 -0
  7. data/lib/documentrix/documents/cache/common.rb +43 -0
  8. data/lib/documentrix/documents/cache/memory_cache.rb +91 -0
  9. data/lib/documentrix/documents/cache/records.rb +145 -0
  10. data/lib/documentrix/documents/cache/redis_backed_memory_cache.rb +64 -0
  11. data/lib/documentrix/documents/cache/redis_cache.rb +128 -0
  12. data/lib/documentrix/documents/cache/sqlite_cache.rb +335 -0
  13. data/lib/documentrix/documents/splitters/character.rb +72 -0
  14. data/lib/documentrix/documents/splitters/semantic.rb +91 -0
  15. data/lib/documentrix/documents.rb +328 -0
  16. data/lib/documentrix/utils/colorize_texts.rb +65 -0
  17. data/lib/documentrix/utils/math.rb +48 -0
  18. data/lib/documentrix/utils/tags.rb +112 -0
  19. data/lib/documentrix/utils.rb +5 -0
  20. data/lib/documentrix/version.rb +8 -0
  21. data/lib/documentrix.rb +11 -0
  22. data/spec/assets/embeddings.json +1 -0
  23. data/spec/documentrix/documents/cache/memory_cache_spec.rb +98 -0
  24. data/spec/documentrix/documents/cache/redis_backed_memory_cache_spec.rb +121 -0
  25. data/spec/documentrix/documents/cache/redis_cache_spec.rb +123 -0
  26. data/spec/documentrix/documents/cache/sqlite_cache_spec.rb +141 -0
  27. data/spec/documentrix/documents/splitters/character_spec.rb +110 -0
  28. data/spec/documentrix/documents/splitters/semantic_spec.rb +56 -0
  29. data/spec/documents_spec.rb +174 -0
  30. data/spec/spec_helper.rb +23 -0
  31. data/spec/utils/colorize_texts_spec.rb +13 -0
  32. data/spec/utils/tags_spec.rb +53 -0
  33. metadata +329 -0
@@ -0,0 +1,98 @@
1
+ require 'spec_helper'
2
+
3
+ RSpec.describe Documentrix::Documents::MemoryCache do
4
+ let :prefix do
5
+ 'test-'
6
+ end
7
+
8
+ let :cache do
9
+ described_class.new prefix:
10
+ end
11
+
12
+ it 'can be instantiated' do
13
+ expect(cache).to be_a described_class
14
+ end
15
+
16
+ it 'can get/set a key' do
17
+ key, value = 'foo', { test: true }
18
+ expect {
19
+ cache[key] = value
20
+ }.to change {
21
+ cache[key]
22
+ }.from(nil).to(value)
23
+ end
24
+
25
+ it 'can determine if key exists' do
26
+ key, value = 'foo', { test: true }
27
+ expect {
28
+ cache[key] = value
29
+ }.to change {
30
+ cache.key?(key)
31
+ }.from(false).to(true)
32
+ end
33
+
34
+ it 'can set key with different prefixes' do
35
+ key, value = 'foo', { test: true }
36
+ expect {
37
+ cache[key] = value
38
+ }.to change {
39
+ cache.size
40
+ }.from(0).to(1)
41
+ cache2 = cache.dup
42
+ cache2.prefix = 'test2-'
43
+ expect {
44
+ cache2[key] = value
45
+ }.to change {
46
+ cache2.size
47
+ }.from(0).to(1)
48
+ expect(cache.size).to eq 1
49
+ s = 0
50
+ cache.full_each { s += 1 }
51
+ expect(s).to eq 2
52
+ end
53
+
54
+ it 'can delete' do
55
+ key, value = 'foo', { test: true }
56
+ expect(cache.delete(key)).to be_falsy
57
+ cache[key] = value
58
+ expect {
59
+ expect(cache.delete(key)).to eq true
60
+ }.to change {
61
+ cache.key?(key)
62
+ }.from(true).to(false)
63
+ expect(cache.delete(key)).to eq false
64
+ end
65
+
66
+ it 'can iterate over keys, values' do
67
+ key, value = 'foo', { test: true }
68
+ cache[key] = value
69
+ cache.each do |k, v|
70
+ expect(k).to eq prefix + key
71
+ expect(v).to eq value
72
+ end
73
+ end
74
+
75
+ it 'returns size' do
76
+ key, value = 'foo', { test: true }
77
+ expect {
78
+ cache[key] = value
79
+ }.to change {
80
+ cache.size
81
+ }.from(0).to(1)
82
+ end
83
+
84
+ it 'can clear' do
85
+ key, value = 'foo', { test: true }
86
+ cache[key] = value
87
+ expect {
88
+ expect(cache.clear).to eq cache
89
+ }.to change {
90
+ cache.size
91
+ }.from(1).to(0)
92
+ end
93
+
94
+ it 'can iterate over keys under a prefix' do
95
+ cache['foo'] = 'bar'
96
+ expect(cache.to_a).to eq [ %W[ #{prefix}foo bar ] ]
97
+ end
98
+ end
@@ -0,0 +1,121 @@
1
+ require 'spec_helper'
2
+
3
+ RSpec.describe Documentrix::Documents::RedisBackedMemoryCache do
4
+ let :prefix do
5
+ 'test-'
6
+ end
7
+
8
+ let :cache do
9
+ described_class.new prefix: 'test-', url: 'something'
10
+ end
11
+
12
+ it 'raises ArgumentError if url is missing' do
13
+ expect {
14
+ described_class.new prefix:, url: nil
15
+ }.to raise_error ArgumentError
16
+ end
17
+
18
+ context 'test redis interactions' do
19
+ let :data do
20
+ cache.instance_eval { @data }
21
+ end
22
+
23
+ let :redis_cache do
24
+ cache.instance_eval { @redis_cache }
25
+ end
26
+
27
+ let :redis do
28
+ double('Redis')
29
+ end
30
+
31
+ before do
32
+ allow_any_instance_of(Documentrix::Documents::RedisCache).to\
33
+ receive(:redis).and_return(redis)
34
+ allow(redis).to receive(:scan_each)
35
+ end
36
+
37
+ it 'can be instantiated and initialized' do
38
+ expect(cache).to be_a described_class
39
+ end
40
+
41
+ it 'defaults to nil object_class' do
42
+ expect(cache.object_class).to be_nil
43
+ end
44
+
45
+ it 'can be configured with object_class' do
46
+ object_class = Class.new(JSON::GenericObject)
47
+ cache = described_class.new(prefix: 'test-', url: 'something', object_class:)
48
+ expect(cache.object_class).to eq object_class
49
+ end
50
+
51
+ it 'has Redis client' do
52
+ expect(cache.redis).to eq redis
53
+ end
54
+
55
+ it 'can get a key' do
56
+ key = 'foo'
57
+ expect(data).to receive(:[]).with('test-' + key).and_return 666
58
+ expect(cache[key]).to eq 666
59
+ end
60
+
61
+ it 'can set a value for a key' do
62
+ key, value = 'foo', { test: true }
63
+ expect(data).to receive(:[]=).with('test-' + key, { test: true }).and_call_original
64
+ expect(redis).to receive(:set).with('test-' + key, JSON(value))
65
+ cache[key] = value
66
+ end
67
+
68
+ it 'can determine if key exists' do
69
+ key = 'foo'
70
+ expect(data).to receive(:key?).with('test-' + key).and_return(false, true)
71
+ expect(cache.key?('foo')).to eq false
72
+ expect(cache.key?('foo')).to eq true
73
+ end
74
+
75
+ it 'can delete' do
76
+ key = 'foo'
77
+ expect(data).to receive(:delete).with('test-' + key).and_return 'bar'
78
+ expect(redis).to receive(:del).with('test-' + key).and_return 1
79
+ expect(cache.delete(key)).to eq true
80
+ expect(data).to receive(:delete).with('test-' + key).and_return nil
81
+ expect(redis).to receive(:del).with(prefix + key).and_return 0
82
+ expect(cache.delete(key)).to eq false
83
+ end
84
+
85
+ it 'can iterate over keys, values' do
86
+ key, value = 'foo', { 'test' => true }
87
+ expect(redis).to receive(:set).with('test-' + key, JSON(value))
88
+ cache[key] = value
89
+ cache.each do |k, v|
90
+ expect(k).to eq prefix + key
91
+ expect(v).to eq value
92
+ end
93
+ end
94
+
95
+ it 'returns size' do
96
+ expect(cache).to receive(:count).and_return 3
97
+ expect(cache.size).to eq 3
98
+ end
99
+
100
+ it 'can clear' do
101
+ expect(redis).to receive(:scan_each).with(match: 'test-*').and_yield(
102
+ 'test-foo'
103
+ )
104
+ expect(redis).to receive(:del).with('test-foo')
105
+ expect(cache.clear).to eq cache
106
+ end
107
+
108
+ it 'can iterate over keys under a prefix' do
109
+ data['test-foo'] = 'bar'
110
+ expect(cache.to_a).to eq [ %w[ test-foo bar ] ]
111
+ end
112
+
113
+ it 'can compute prefix with pre' do
114
+ expect(cache.pre('foo')).to eq 'test-foo'
115
+ end
116
+
117
+ it 'can remove prefix with unpre' do
118
+ expect(cache.unpre('test-foo')).to eq 'foo'
119
+ end
120
+ end
121
+ end
@@ -0,0 +1,123 @@
1
+ require 'spec_helper'
2
+
3
+ RSpec.describe Documentrix::Documents::RedisCache do
4
+ let :prefix do
5
+ 'test-'
6
+ end
7
+
8
+ let :cache do
9
+ described_class.new prefix:, url: 'something'
10
+ end
11
+
12
+ it 'can be instantiated' do
13
+ expect(cache).to be_a described_class
14
+ end
15
+
16
+ it 'defaults to nil object_class' do
17
+ expect(cache.object_class).to be_nil
18
+ end
19
+
20
+ it 'can be configured with object_class' do
21
+ object_class = Class.new(JSON::GenericObject)
22
+ cache = described_class.new(prefix:, url: 'something', object_class:)
23
+ expect(cache.object_class).to eq object_class
24
+ end
25
+
26
+ it 'raises ArgumentError if url is missing' do
27
+ expect {
28
+ described_class.new prefix:, url: nil
29
+ }.to raise_error ArgumentError
30
+ end
31
+
32
+ context 'test redis interactions' do
33
+ let :redis do
34
+ double('Redis')
35
+ end
36
+
37
+ before do
38
+ allow_any_instance_of(described_class).to receive(:redis).and_return(redis)
39
+ end
40
+
41
+ it 'has Redis client' do
42
+ expect(cache.redis).to eq redis
43
+ end
44
+
45
+ it 'can get a key' do
46
+ key = 'foo'
47
+ expect(redis).to receive(:get).with(prefix + key).and_return '"some_json"'
48
+ expect(cache[key]).to eq 'some_json'
49
+ end
50
+
51
+ it 'can set a value for a key' do
52
+ key, value = 'foo', { test: true }
53
+ expect(redis).to receive(:set).with(prefix + key, JSON(value), ex: nil)
54
+ cache[key] = value
55
+ end
56
+
57
+ it 'can set a value for a key with ttl' do
58
+ cache = described_class.new prefix:, url: 'something', ex: 3_600
59
+ key, value = 'foo', { test: true }
60
+ expect(redis).to receive(:set).with(prefix + key, JSON(value), ex: 3_600)
61
+ cache[key] = value
62
+ expect(redis).to receive(:ttl).with(prefix + key).and_return 3_600
63
+ expect(cache.ttl(key)).to eq 3_600
64
+ end
65
+
66
+ it 'can determine if key exists' do
67
+ key = 'foo'
68
+ expect(redis).to receive(:exists?).with(prefix + key).and_return(false, true)
69
+ expect(cache.key?('foo')).to eq false
70
+ expect(cache.key?('foo')).to eq true
71
+ end
72
+
73
+ it 'can delete' do
74
+ key = 'foo'
75
+ expect(redis).to receive(:del).with(prefix + key).and_return 1
76
+ expect(cache.delete(key)).to eq true
77
+ expect(redis).to receive(:del).with(prefix + key).and_return 0
78
+ expect(cache.delete(key)).to eq false
79
+ end
80
+
81
+ it 'can iterate over keys, values' do
82
+ key, value = 'foo', { 'test' => true }
83
+ expect(redis).to receive(:set).with(prefix + key, JSON(value), ex: nil)
84
+ cache[key] = value
85
+ expect(redis).to receive(:scan_each).with(match: "#{prefix}*").
86
+ and_yield("#{prefix}foo")
87
+ expect(redis).to receive(:get).with(prefix + key).and_return(JSON(test: true))
88
+ cache.each do |k, v|
89
+ expect(k).to eq prefix + key
90
+ expect(v).to eq value
91
+ end
92
+ end
93
+
94
+ it 'returns size' do
95
+ expect(redis).to receive(:scan_each).with(match: "#{prefix}*").
96
+ and_yield("#{prefix}foo").
97
+ and_yield("#{prefix}bar").
98
+ and_yield("#{prefix}baz")
99
+ expect(cache.size).to eq 3
100
+ end
101
+
102
+ it 'can clear' do
103
+ expect(redis).to receive(:scan_each).with(match: 'test-*').and_yield(
104
+ 'test-foo'
105
+ )
106
+ expect(redis).to receive(:del).with('test-foo')
107
+ expect(cache.clear).to eq cache
108
+ end
109
+
110
+ it 'can iterate over keys under a prefix' do
111
+ expect(redis).to receive(:scan_each).with(match: 'test-*')
112
+ cache.to_a
113
+ end
114
+
115
+ it 'can compute prefix with pre' do
116
+ expect(cache.pre('foo')).to eq 'test-foo'
117
+ end
118
+
119
+ it 'can remove prefix with unpre' do
120
+ expect(cache.unpre('test-foo')).to eq 'foo'
121
+ end
122
+ end
123
+ end
@@ -0,0 +1,141 @@
1
+ require 'spec_helper'
2
+
3
+ RSpec.describe Documentrix::Documents::SQLiteCache do
4
+ let :prefix do
5
+ 'test-'
6
+ end
7
+
8
+ let :test_value do
9
+ {
10
+ key: 'test',
11
+ text: 'test text',
12
+ norm: 0.5,
13
+ source: 'for-test.txt',
14
+ tags: %w[ test ],
15
+ embedding: [ 0.5 ] * 1_024,
16
+ }
17
+ end
18
+
19
+ let :cache do
20
+ described_class.new prefix:
21
+ end
22
+
23
+ it 'can be instantiated' do
24
+ expect(cache).to be_a described_class
25
+ end
26
+
27
+ it 'defaults to :memory: mode' do
28
+ expect(cache.filename).to eq ':memory:'
29
+ end
30
+
31
+ it 'can be switched to file mode' do
32
+ expect(SQLite3::Database).to receive(:new).with('foo.sqlite').
33
+ and_return(double.as_null_object)
34
+ cache = described_class.new prefix:, filename: 'foo.sqlite'
35
+ expect(cache.filename).to eq 'foo.sqlite'
36
+ end
37
+
38
+ it 'can get/set a key' do
39
+ key, value = 'foo', test_value
40
+ queried_value = nil
41
+ expect {
42
+ cache[key] = value
43
+ }.to change {
44
+ queried_value = cache[key]
45
+ }.from(nil).to(Documentrix::Documents::Record[value])
46
+ expect(queried_value.embedding).to eq [ 0.5 ] * 1_024
47
+ end
48
+
49
+ it 'can determine if key exists' do
50
+ key, value = 'foo', test_value
51
+ expect {
52
+ cache[key] = value
53
+ }.to change {
54
+ cache.key?(key)
55
+ }.from(false).to(true)
56
+ end
57
+
58
+ it 'can set key with different prefixes' do
59
+ key, value = 'foo', test_value
60
+ expect {
61
+ cache[key] = value
62
+ }.to change {
63
+ cache.size
64
+ }.from(0).to(1)
65
+ cache2 = cache.dup
66
+ cache2.prefix = 'test2-'
67
+ expect {
68
+ cache2[key] = value
69
+ }.to change {
70
+ cache2.size
71
+ }.from(0).to(1)
72
+ expect(cache.size).to eq 1
73
+ s = 0
74
+ cache.full_each { s += 1 }
75
+ expect(s).to eq 2
76
+ end
77
+
78
+ it 'can delete' do
79
+ key, value = 'foo', test_value
80
+ expect(cache.delete(key)).to be_falsy
81
+ cache[key] = value
82
+ expect {
83
+ expect(cache.delete(key)).to eq true
84
+ }.to change {
85
+ cache.key?(key)
86
+ }.from(true).to(false)
87
+ end
88
+
89
+ it 'returns size' do
90
+ key, value = 'foo', test_value
91
+ expect {
92
+ cache[key] = value
93
+ }.to change {
94
+ cache.size
95
+ }.from(0).to(1)
96
+ end
97
+
98
+ it 'can convert_to_vector' do
99
+ vector = [ 23.0, 666.0 ]
100
+ expect(cache.convert_to_vector(vector)).to eq vector
101
+ end
102
+
103
+ it 'can clear' do
104
+ key, value = 'foo', { embedding: [ 0.5 ] * 1_024 }
105
+ cache[key] = value
106
+ expect {
107
+ expect(cache.clear).to eq cache
108
+ }.to change {
109
+ cache.size
110
+ }.from(1).to(0)
111
+ end
112
+
113
+ it 'can clear for tags' do
114
+ key, value = 'foo', { tags: %w[ foo ], embedding: [ 0.5 ] * 1_024 }
115
+ cache[key] = value
116
+ key, value = 'bar', { embedding: [ 0.5 ] * 1_024 }
117
+ cache[key] = value
118
+ expect {
119
+ expect(cache.clear_for_tags(%w[ #foo ])).to eq cache
120
+ }.to change {
121
+ cache.size
122
+ }.from(2).to(1)
123
+ expect(cache).not_to be_key 'foo'
124
+ expect(cache).to be_key 'bar'
125
+ end
126
+
127
+ it 'can return tags' do
128
+ key, value = 'foo', { tags: %w[ foo ], embedding: [ 0.5 ] * 1_024 }
129
+ cache[key] = value
130
+ key, value = 'bar', { tags: %w[ bar baz ], embedding: [ 0.5 ] * 1_024 }
131
+ cache[key] = value
132
+ tags = cache.tags
133
+ expect(tags).to be_a Documentrix::Utils::Tags
134
+ expect(tags.to_a).to eq %w[ bar baz foo ]
135
+ end
136
+
137
+ it 'can iterate over keys under a prefix' do
138
+ cache['foo'] = test_value
139
+ expect(cache.to_a).to eq [ [ 'test-foo', Documentrix::Documents::Record[test_value] ] ]
140
+ end
141
+ end
@@ -0,0 +1,110 @@
1
+ require 'spec_helper'
2
+
3
+ RSpec.describe Documentrix::Documents::Splitters::Character do
4
+ let :splitter do
5
+ described_class.new chunk_size: 23, combining_string: ''
6
+ end
7
+
8
+ it 'can be instantiated' do
9
+ expect(splitter).to be_a described_class
10
+ end
11
+
12
+ it 'can split' do
13
+ text = [ ?A * 10 ] * 10 * "\n\n"
14
+ result = splitter.split(text)
15
+ expect(result.count).to eq 5
16
+ expect(result.to_a.join('')).to eq ?A * 100
17
+ end
18
+
19
+ it 'can split combining with separation' do
20
+ splitter = described_class.new chunk_size: 25, include_separator: false,
21
+ combining_string: ?X
22
+ text = [ ?A * 10 ] * 10 * "\n\n"
23
+ result = splitter.split(text)
24
+ expect(result.count).to eq 5
25
+ expect(result.to_a.join(?B)).to eq\
26
+ "AAAAAAAAAAXAAAAAAAAAAXBAAAAAAAAAAAAAAAAAAAAXBAAAAAAAAAAAAAAAAAAAAXB"\
27
+ "AAAAAAAAAAAAAAAAAAAAXBAAAAAAAAAAAAAAAAAAAAX"
28
+ end
29
+
30
+ it 'can split including separator' do
31
+ splitter = described_class.new chunk_size: 25, include_separator: true,
32
+ combining_string: ''
33
+ text = [ ?A * 10 ] * 10 * "\n\n"
34
+ result = splitter.split(text)
35
+ expect(result.count).to eq 5
36
+ expect(result.to_a.join('')).to eq text
37
+ end
38
+
39
+ it 'cannot split' do
40
+ text = [ ?A * 10 ] * 10 * "\n"
41
+ result = splitter.split(text)
42
+ expect(result.count).to eq 1
43
+ expect(result.to_a.join('').count(?A)).to eq text.count(?A)
44
+ end
45
+
46
+ it 'cannot split2' do
47
+ text = ?A * 25
48
+ result = splitter.split(text)
49
+ expect(result.count).to eq 1
50
+ expect(result.to_a.join('')).to eq ?A * 25
51
+ end
52
+
53
+ it 'can split sentences' do
54
+ text = "foo.foo. bar!bar! baz?baz? quux.\nquux."
55
+ splitter = described_class.new(separator: /[.!?]\s*(?:\b|\z)/, chunk_size: 2)
56
+ result = splitter.split(text)
57
+ expect(result.to_a).to eq %w[ foo foo bar bar baz baz quux quux ]
58
+ end
59
+ end
60
+
61
+ RSpec.describe Documentrix::Documents::Splitters::RecursiveCharacter do
62
+ let :splitter do
63
+ described_class.new chunk_size: 23, combining_string: ''
64
+ end
65
+
66
+ it 'can be instantiated' do
67
+ expect(splitter).to be_a described_class
68
+ end
69
+
70
+ it 'can split' do
71
+ text = [ ?A * 10 ] * 10 * "\n\n"
72
+ result = splitter.split(text)
73
+ expect(result.count).to eq 5
74
+ expect(result.to_a.join('')).to eq ?A * 100
75
+ end
76
+
77
+ it 'cannot split' do
78
+ splitter = described_class.new chunk_size: 23, include_separator: true,
79
+ separators: described_class::DEFAULT_SEPARATORS[0..-2]
80
+ text = ?A * 25
81
+ result = splitter.split(text)
82
+ expect(result.count).to eq 1
83
+ expect(result.to_a.join('')).to eq ?A * 25
84
+ end
85
+
86
+ it 'can split including separator' do
87
+ splitter = described_class.new chunk_size: 25, include_separator: true,
88
+ combining_string: ''
89
+ text = [ ?A * 10 ] * 10 * "\n\n"
90
+ result = splitter.split(text)
91
+ expect(result.count).to eq 5
92
+ expect(result.to_a.join('')).to eq text
93
+ end
94
+
95
+ it 'can split single newline as well' do
96
+ text = [ ?A * 10 ] * 10 * "\n"
97
+ result = splitter.split(text)
98
+ expect(result.count).to eq 5
99
+ expect(result.to_a.join('')).to eq ?A * 100
100
+ end
101
+
102
+ it 'can split single newline as well including separator' do
103
+ splitter = described_class.new chunk_size: 25, include_separator: true,
104
+ combining_string: ''
105
+ text = [ ?A * 10 ] * 10 * "\n"
106
+ result = splitter.split(text)
107
+ expect(result.count).to eq 5
108
+ expect(result.to_a.join('')).to eq text
109
+ end
110
+ end
@@ -0,0 +1,56 @@
1
+ require 'spec_helper'
2
+
3
+ RSpec.describe Documentrix::Documents::Splitters::Semantic do
4
+ let :ollama do
5
+ double('Ollama::Client')
6
+ end
7
+
8
+ let :splitter do
9
+ described_class.new ollama:, model: 'mxbai-embed-large'
10
+ end
11
+
12
+ let :embeddings do
13
+ JSON(File.read(asset('embeddings.json')))
14
+ end
15
+
16
+ it 'can be instantiated' do
17
+ expect(splitter).to be_a described_class
18
+ end
19
+
20
+ before do
21
+ allow(ollama).to receive(:embed).and_return(double(embeddings:))
22
+ end
23
+
24
+ it 'can split with breakpoint :percentile' do
25
+ text = ([ "A" * 10 ] * 3 + [ "B" * 10 ] * 3 + [ "A" * 10 ] * 3) * ". "
26
+ result = splitter.split(text, breakpoint: :percentile, percentile: 75)
27
+ expect(result.count).to eq 3
28
+ expect(result.to_a.join('').count(?A)).to eq text.count(?A)
29
+ expect(result.to_a.join('').count(?B)).to eq text.count(?B)
30
+ end
31
+
32
+ it 'can split longer text with breakpoint :percentile' do
33
+ described_class.new ollama:, model: 'mxbai-embed-large', chunk_size: 50
34
+ text = ([ "A" * 10 ] * 6 + [ "B" * 10 ] * 3 + [ "A" * 10 ] * 3) * ". "
35
+ result = splitter.split(text, breakpoint: :percentile, percentile: 75)
36
+ expect(result.count).to eq 4
37
+ expect(result.to_a.join('').count(?A)).to eq text.count(?A)
38
+ expect(result.to_a.join('').count(?B)).to eq text.count(?B)
39
+ end
40
+
41
+ it 'can split with breakpoint :standard_deviation' do
42
+ text = ([ "A" * 10 ] * 3 + [ "B" * 10 ] * 3 + [ "A" * 10 ] * 3) * ". "
43
+ result = splitter.split(text, breakpoint: :standard_deviation, percentage: 100)
44
+ expect(result.count).to eq 3
45
+ expect(result.to_a.join('').count(?A)).to eq text.count(?A)
46
+ expect(result.to_a.join('').count(?B)).to eq text.count(?B)
47
+ end
48
+
49
+ it 'can split with breakpoint :interquartile' do
50
+ text = ([ "A" * 10 ] * 3 + [ "B" * 10 ] * 3 + [ "A" * 10 ] * 3) * ". "
51
+ result = splitter.split(text, breakpoint: :interquartile, percentage: 75)
52
+ expect(result.count).to eq 3
53
+ expect(result.to_a.join('').count(?A)).to eq text.count(?A)
54
+ expect(result.to_a.join('').count(?B)).to eq text.count(?B)
55
+ end
56
+ end