documentrix 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (33) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +1 -0
  3. data/Gemfile +5 -0
  4. data/README.md +71 -0
  5. data/Rakefile +48 -0
  6. data/documentrix.gemspec +41 -0
  7. data/lib/documentrix/documents/cache/common.rb +43 -0
  8. data/lib/documentrix/documents/cache/memory_cache.rb +91 -0
  9. data/lib/documentrix/documents/cache/records.rb +145 -0
  10. data/lib/documentrix/documents/cache/redis_backed_memory_cache.rb +64 -0
  11. data/lib/documentrix/documents/cache/redis_cache.rb +128 -0
  12. data/lib/documentrix/documents/cache/sqlite_cache.rb +335 -0
  13. data/lib/documentrix/documents/splitters/character.rb +72 -0
  14. data/lib/documentrix/documents/splitters/semantic.rb +91 -0
  15. data/lib/documentrix/documents.rb +328 -0
  16. data/lib/documentrix/utils/colorize_texts.rb +65 -0
  17. data/lib/documentrix/utils/math.rb +48 -0
  18. data/lib/documentrix/utils/tags.rb +112 -0
  19. data/lib/documentrix/utils.rb +5 -0
  20. data/lib/documentrix/version.rb +8 -0
  21. data/lib/documentrix.rb +11 -0
  22. data/spec/assets/embeddings.json +1 -0
  23. data/spec/documentrix/documents/cache/memory_cache_spec.rb +98 -0
  24. data/spec/documentrix/documents/cache/redis_backed_memory_cache_spec.rb +121 -0
  25. data/spec/documentrix/documents/cache/redis_cache_spec.rb +123 -0
  26. data/spec/documentrix/documents/cache/sqlite_cache_spec.rb +141 -0
  27. data/spec/documentrix/documents/splitters/character_spec.rb +110 -0
  28. data/spec/documentrix/documents/splitters/semantic_spec.rb +56 -0
  29. data/spec/documents_spec.rb +174 -0
  30. data/spec/spec_helper.rb +23 -0
  31. data/spec/utils/colorize_texts_spec.rb +13 -0
  32. data/spec/utils/tags_spec.rb +53 -0
  33. metadata +329 -0
@@ -0,0 +1,98 @@
1
+ require 'spec_helper'
2
+
3
+ RSpec.describe Documentrix::Documents::MemoryCache do
4
+ let :prefix do
5
+ 'test-'
6
+ end
7
+
8
+ let :cache do
9
+ described_class.new prefix:
10
+ end
11
+
12
+ it 'can be instantiated' do
13
+ expect(cache).to be_a described_class
14
+ end
15
+
16
+ it 'can get/set a key' do
17
+ key, value = 'foo', { test: true }
18
+ expect {
19
+ cache[key] = value
20
+ }.to change {
21
+ cache[key]
22
+ }.from(nil).to(value)
23
+ end
24
+
25
+ it 'can determine if key exists' do
26
+ key, value = 'foo', { test: true }
27
+ expect {
28
+ cache[key] = value
29
+ }.to change {
30
+ cache.key?(key)
31
+ }.from(false).to(true)
32
+ end
33
+
34
+ it 'can set key with different prefixes' do
35
+ key, value = 'foo', { test: true }
36
+ expect {
37
+ cache[key] = value
38
+ }.to change {
39
+ cache.size
40
+ }.from(0).to(1)
41
+ cache2 = cache.dup
42
+ cache2.prefix = 'test2-'
43
+ expect {
44
+ cache2[key] = value
45
+ }.to change {
46
+ cache2.size
47
+ }.from(0).to(1)
48
+ expect(cache.size).to eq 1
49
+ s = 0
50
+ cache.full_each { s += 1 }
51
+ expect(s).to eq 2
52
+ end
53
+
54
+ it 'can delete' do
55
+ key, value = 'foo', { test: true }
56
+ expect(cache.delete(key)).to be_falsy
57
+ cache[key] = value
58
+ expect {
59
+ expect(cache.delete(key)).to eq true
60
+ }.to change {
61
+ cache.key?(key)
62
+ }.from(true).to(false)
63
+ expect(cache.delete(key)).to eq false
64
+ end
65
+
66
+ it 'can iterate over keys, values' do
67
+ key, value = 'foo', { test: true }
68
+ cache[key] = value
69
+ cache.each do |k, v|
70
+ expect(k).to eq prefix + key
71
+ expect(v).to eq value
72
+ end
73
+ end
74
+
75
+ it 'returns size' do
76
+ key, value = 'foo', { test: true }
77
+ expect {
78
+ cache[key] = value
79
+ }.to change {
80
+ cache.size
81
+ }.from(0).to(1)
82
+ end
83
+
84
+ it 'can clear' do
85
+ key, value = 'foo', { test: true }
86
+ cache[key] = value
87
+ expect {
88
+ expect(cache.clear).to eq cache
89
+ }.to change {
90
+ cache.size
91
+ }.from(1).to(0)
92
+ end
93
+
94
+ it 'can iterate over keys under a prefix' do
95
+ cache['foo'] = 'bar'
96
+ expect(cache.to_a).to eq [ %W[ #{prefix}foo bar ] ]
97
+ end
98
+ end
@@ -0,0 +1,121 @@
1
+ require 'spec_helper'
2
+
3
+ RSpec.describe Documentrix::Documents::RedisBackedMemoryCache do
4
+ let :prefix do
5
+ 'test-'
6
+ end
7
+
8
+ let :cache do
9
+ described_class.new prefix: 'test-', url: 'something'
10
+ end
11
+
12
+ it 'raises ArgumentError if url is missing' do
13
+ expect {
14
+ described_class.new prefix:, url: nil
15
+ }.to raise_error ArgumentError
16
+ end
17
+
18
+ context 'test redis interactions' do
19
+ let :data do
20
+ cache.instance_eval { @data }
21
+ end
22
+
23
+ let :redis_cache do
24
+ cache.instance_eval { @redis_cache }
25
+ end
26
+
27
+ let :redis do
28
+ double('Redis')
29
+ end
30
+
31
+ before do
32
+ allow_any_instance_of(Documentrix::Documents::RedisCache).to\
33
+ receive(:redis).and_return(redis)
34
+ allow(redis).to receive(:scan_each)
35
+ end
36
+
37
+ it 'can be instantiated and initialized' do
38
+ expect(cache).to be_a described_class
39
+ end
40
+
41
+ it 'defaults to nil object_class' do
42
+ expect(cache.object_class).to be_nil
43
+ end
44
+
45
+ it 'can be configured with object_class' do
46
+ object_class = Class.new(JSON::GenericObject)
47
+ cache = described_class.new(prefix: 'test-', url: 'something', object_class:)
48
+ expect(cache.object_class).to eq object_class
49
+ end
50
+
51
+ it 'has Redis client' do
52
+ expect(cache.redis).to eq redis
53
+ end
54
+
55
+ it 'can get a key' do
56
+ key = 'foo'
57
+ expect(data).to receive(:[]).with('test-' + key).and_return 666
58
+ expect(cache[key]).to eq 666
59
+ end
60
+
61
+ it 'can set a value for a key' do
62
+ key, value = 'foo', { test: true }
63
+ expect(data).to receive(:[]=).with('test-' + key, { test: true }).and_call_original
64
+ expect(redis).to receive(:set).with('test-' + key, JSON(value))
65
+ cache[key] = value
66
+ end
67
+
68
+ it 'can determine if key exists' do
69
+ key = 'foo'
70
+ expect(data).to receive(:key?).with('test-' + key).and_return(false, true)
71
+ expect(cache.key?('foo')).to eq false
72
+ expect(cache.key?('foo')).to eq true
73
+ end
74
+
75
+ it 'can delete' do
76
+ key = 'foo'
77
+ expect(data).to receive(:delete).with('test-' + key).and_return 'bar'
78
+ expect(redis).to receive(:del).with('test-' + key).and_return 1
79
+ expect(cache.delete(key)).to eq true
80
+ expect(data).to receive(:delete).with('test-' + key).and_return nil
81
+ expect(redis).to receive(:del).with(prefix + key).and_return 0
82
+ expect(cache.delete(key)).to eq false
83
+ end
84
+
85
+ it 'can iterate over keys, values' do
86
+ key, value = 'foo', { 'test' => true }
87
+ expect(redis).to receive(:set).with('test-' + key, JSON(value))
88
+ cache[key] = value
89
+ cache.each do |k, v|
90
+ expect(k).to eq prefix + key
91
+ expect(v).to eq value
92
+ end
93
+ end
94
+
95
+ it 'returns size' do
96
+ expect(cache).to receive(:count).and_return 3
97
+ expect(cache.size).to eq 3
98
+ end
99
+
100
+ it 'can clear' do
101
+ expect(redis).to receive(:scan_each).with(match: 'test-*').and_yield(
102
+ 'test-foo'
103
+ )
104
+ expect(redis).to receive(:del).with('test-foo')
105
+ expect(cache.clear).to eq cache
106
+ end
107
+
108
+ it 'can iterate over keys under a prefix' do
109
+ data['test-foo'] = 'bar'
110
+ expect(cache.to_a).to eq [ %w[ test-foo bar ] ]
111
+ end
112
+
113
+ it 'can compute prefix with pre' do
114
+ expect(cache.pre('foo')).to eq 'test-foo'
115
+ end
116
+
117
+ it 'can remove prefix with unpre' do
118
+ expect(cache.unpre('test-foo')).to eq 'foo'
119
+ end
120
+ end
121
+ end
@@ -0,0 +1,123 @@
1
+ require 'spec_helper'
2
+
3
+ RSpec.describe Documentrix::Documents::RedisCache do
4
+ let :prefix do
5
+ 'test-'
6
+ end
7
+
8
+ let :cache do
9
+ described_class.new prefix:, url: 'something'
10
+ end
11
+
12
+ it 'can be instantiated' do
13
+ expect(cache).to be_a described_class
14
+ end
15
+
16
+ it 'defaults to nil object_class' do
17
+ expect(cache.object_class).to be_nil
18
+ end
19
+
20
+ it 'can be configured with object_class' do
21
+ object_class = Class.new(JSON::GenericObject)
22
+ cache = described_class.new(prefix:, url: 'something', object_class:)
23
+ expect(cache.object_class).to eq object_class
24
+ end
25
+
26
+ it 'raises ArgumentError if url is missing' do
27
+ expect {
28
+ described_class.new prefix:, url: nil
29
+ }.to raise_error ArgumentError
30
+ end
31
+
32
+ context 'test redis interactions' do
33
+ let :redis do
34
+ double('Redis')
35
+ end
36
+
37
+ before do
38
+ allow_any_instance_of(described_class).to receive(:redis).and_return(redis)
39
+ end
40
+
41
+ it 'has Redis client' do
42
+ expect(cache.redis).to eq redis
43
+ end
44
+
45
+ it 'can get a key' do
46
+ key = 'foo'
47
+ expect(redis).to receive(:get).with(prefix + key).and_return '"some_json"'
48
+ expect(cache[key]).to eq 'some_json'
49
+ end
50
+
51
+ it 'can set a value for a key' do
52
+ key, value = 'foo', { test: true }
53
+ expect(redis).to receive(:set).with(prefix + key, JSON(value), ex: nil)
54
+ cache[key] = value
55
+ end
56
+
57
+ it 'can set a value for a key with ttl' do
58
+ cache = described_class.new prefix:, url: 'something', ex: 3_600
59
+ key, value = 'foo', { test: true }
60
+ expect(redis).to receive(:set).with(prefix + key, JSON(value), ex: 3_600)
61
+ cache[key] = value
62
+ expect(redis).to receive(:ttl).with(prefix + key).and_return 3_600
63
+ expect(cache.ttl(key)).to eq 3_600
64
+ end
65
+
66
+ it 'can determine if key exists' do
67
+ key = 'foo'
68
+ expect(redis).to receive(:exists?).with(prefix + key).and_return(false, true)
69
+ expect(cache.key?('foo')).to eq false
70
+ expect(cache.key?('foo')).to eq true
71
+ end
72
+
73
+ it 'can delete' do
74
+ key = 'foo'
75
+ expect(redis).to receive(:del).with(prefix + key).and_return 1
76
+ expect(cache.delete(key)).to eq true
77
+ expect(redis).to receive(:del).with(prefix + key).and_return 0
78
+ expect(cache.delete(key)).to eq false
79
+ end
80
+
81
+ it 'can iterate over keys, values' do
82
+ key, value = 'foo', { 'test' => true }
83
+ expect(redis).to receive(:set).with(prefix + key, JSON(value), ex: nil)
84
+ cache[key] = value
85
+ expect(redis).to receive(:scan_each).with(match: "#{prefix}*").
86
+ and_yield("#{prefix}foo")
87
+ expect(redis).to receive(:get).with(prefix + key).and_return(JSON(test: true))
88
+ cache.each do |k, v|
89
+ expect(k).to eq prefix + key
90
+ expect(v).to eq value
91
+ end
92
+ end
93
+
94
+ it 'returns size' do
95
+ expect(redis).to receive(:scan_each).with(match: "#{prefix}*").
96
+ and_yield("#{prefix}foo").
97
+ and_yield("#{prefix}bar").
98
+ and_yield("#{prefix}baz")
99
+ expect(cache.size).to eq 3
100
+ end
101
+
102
+ it 'can clear' do
103
+ expect(redis).to receive(:scan_each).with(match: 'test-*').and_yield(
104
+ 'test-foo'
105
+ )
106
+ expect(redis).to receive(:del).with('test-foo')
107
+ expect(cache.clear).to eq cache
108
+ end
109
+
110
+ it 'can iterate over keys under a prefix' do
111
+ expect(redis).to receive(:scan_each).with(match: 'test-*')
112
+ cache.to_a
113
+ end
114
+
115
+ it 'can compute prefix with pre' do
116
+ expect(cache.pre('foo')).to eq 'test-foo'
117
+ end
118
+
119
+ it 'can remove prefix with unpre' do
120
+ expect(cache.unpre('test-foo')).to eq 'foo'
121
+ end
122
+ end
123
+ end
@@ -0,0 +1,141 @@
1
+ require 'spec_helper'
2
+
3
+ RSpec.describe Documentrix::Documents::SQLiteCache do
4
+ let :prefix do
5
+ 'test-'
6
+ end
7
+
8
+ let :test_value do
9
+ {
10
+ key: 'test',
11
+ text: 'test text',
12
+ norm: 0.5,
13
+ source: 'for-test.txt',
14
+ tags: %w[ test ],
15
+ embedding: [ 0.5 ] * 1_024,
16
+ }
17
+ end
18
+
19
+ let :cache do
20
+ described_class.new prefix:
21
+ end
22
+
23
+ it 'can be instantiated' do
24
+ expect(cache).to be_a described_class
25
+ end
26
+
27
+ it 'defaults to :memory: mode' do
28
+ expect(cache.filename).to eq ':memory:'
29
+ end
30
+
31
+ it 'can be switchted to file mode' do
32
+ expect(SQLite3::Database).to receive(:new).with('foo.sqlite').
33
+ and_return(double.as_null_object)
34
+ cache = described_class.new prefix:, filename: 'foo.sqlite'
35
+ expect(cache.filename).to eq 'foo.sqlite'
36
+ end
37
+
38
+ it 'can get/set a key' do
39
+ key, value = 'foo', test_value
40
+ queried_value = nil
41
+ expect {
42
+ cache[key] = value
43
+ }.to change {
44
+ queried_value = cache[key]
45
+ }.from(nil).to(Documentrix::Documents::Record[value])
46
+ expect(queried_value.embedding).to eq [ 0.5 ] * 1_024
47
+ end
48
+
49
+ it 'can determine if key exists' do
50
+ key, value = 'foo', test_value
51
+ expect {
52
+ cache[key] = value
53
+ }.to change {
54
+ cache.key?(key)
55
+ }.from(false).to(true)
56
+ end
57
+
58
+ it 'can set key with different prefixes' do
59
+ key, value = 'foo', test_value
60
+ expect {
61
+ cache[key] = value
62
+ }.to change {
63
+ cache.size
64
+ }.from(0).to(1)
65
+ cache2 = cache.dup
66
+ cache2.prefix = 'test2-'
67
+ expect {
68
+ cache2[key] = value
69
+ }.to change {
70
+ cache2.size
71
+ }.from(0).to(1)
72
+ expect(cache.size).to eq 1
73
+ s = 0
74
+ cache.full_each { s += 1 }
75
+ expect(s).to eq 2
76
+ end
77
+
78
+ it 'can delete' do
79
+ key, value = 'foo', test_value
80
+ expect(cache.delete(key)).to be_falsy
81
+ cache[key] = value
82
+ expect {
83
+ expect(cache.delete(key)).to eq true
84
+ }.to change {
85
+ cache.key?(key)
86
+ }.from(true).to(false)
87
+ end
88
+
89
+ it 'returns size' do
90
+ key, value = 'foo', test_value
91
+ expect {
92
+ cache[key] = value
93
+ }.to change {
94
+ cache.size
95
+ }.from(0).to(1)
96
+ end
97
+
98
+ it 'can convert_to_vector' do
99
+ vector = [ 23.0, 666.0 ]
100
+ expect(cache.convert_to_vector(vector)).to eq vector
101
+ end
102
+
103
+ it 'can clear' do
104
+ key, value = 'foo', { embedding: [ 0.5 ] * 1_024 }
105
+ cache[key] = value
106
+ expect {
107
+ expect(cache.clear).to eq cache
108
+ }.to change {
109
+ cache.size
110
+ }.from(1).to(0)
111
+ end
112
+
113
+ it 'can clear for tags' do
114
+ key, value = 'foo', { tags: %w[ foo ], embedding: [ 0.5 ] * 1_024 }
115
+ cache[key] = value
116
+ key, value = 'bar', { embedding: [ 0.5 ] * 1_024 }
117
+ cache[key] = value
118
+ expect {
119
+ expect(cache.clear_for_tags(%w[ #foo ])).to eq cache
120
+ }.to change {
121
+ cache.size
122
+ }.from(2).to(1)
123
+ expect(cache).not_to be_key 'foo'
124
+ expect(cache).to be_key 'bar'
125
+ end
126
+
127
+ it 'can return tags' do
128
+ key, value = 'foo', { tags: %w[ foo ], embedding: [ 0.5 ] * 1_024 }
129
+ cache[key] = value
130
+ key, value = 'bar', { tags: %w[ bar baz ], embedding: [ 0.5 ] * 1_024 }
131
+ cache[key] = value
132
+ tags = cache.tags
133
+ expect(tags).to be_a Documentrix::Utils::Tags
134
+ expect(tags.to_a).to eq %w[ bar baz foo ]
135
+ end
136
+
137
+ it 'can iterate over keys under a prefix' do
138
+ cache['foo'] = test_value
139
+ expect(cache.to_a).to eq [ [ 'test-foo', Documentrix::Documents::Record[test_value] ] ]
140
+ end
141
+ end
@@ -0,0 +1,110 @@
1
+ require 'spec_helper'
2
+
3
+ RSpec.describe Documentrix::Documents::Splitters::Character do
4
+ let :splitter do
5
+ described_class.new chunk_size: 23, combining_string: ''
6
+ end
7
+
8
+ it 'can be instantiated' do
9
+ expect(splitter).to be_a described_class
10
+ end
11
+
12
+ it 'can split' do
13
+ text = [ ?A * 10 ] * 10 * "\n\n"
14
+ result = splitter.split(text)
15
+ expect(result.count).to eq 5
16
+ expect(result.to_a.join('')).to eq ?A * 100
17
+ end
18
+
19
+ it 'can split combining with separation' do
20
+ splitter = described_class.new chunk_size: 25, include_separator: false,
21
+ combining_string: ?X
22
+ text = [ ?A * 10 ] * 10 * "\n\n"
23
+ result = splitter.split(text)
24
+ expect(result.count).to eq 5
25
+ expect(result.to_a.join(?B)).to eq\
26
+ "AAAAAAAAAAXAAAAAAAAAAXBAAAAAAAAAAAAAAAAAAAAXBAAAAAAAAAAAAAAAAAAAAXB"\
27
+ "AAAAAAAAAAAAAAAAAAAAXBAAAAAAAAAAAAAAAAAAAAX"
28
+ end
29
+
30
+ it 'can split including separator' do
31
+ splitter = described_class.new chunk_size: 25, include_separator: true,
32
+ combining_string: ''
33
+ text = [ ?A * 10 ] * 10 * "\n\n"
34
+ result = splitter.split(text)
35
+ expect(result.count).to eq 5
36
+ expect(result.to_a.join('')).to eq text
37
+ end
38
+
39
+ it 'cannot split' do
40
+ text = [ ?A * 10 ] * 10 * "\n"
41
+ result = splitter.split(text)
42
+ expect(result.count).to eq 1
43
+ expect(result.to_a.join('').count(?A)).to eq text.count(?A)
44
+ end
45
+
46
+ it 'cannot split2' do
47
+ text = ?A * 25
48
+ result = splitter.split(text)
49
+ expect(result.count).to eq 1
50
+ expect(result.to_a.join('')).to eq ?A * 25
51
+ end
52
+
53
+ it 'can split sentences' do
54
+ text = "foo.foo. bar!bar! baz?baz? quux.\nquux."
55
+ splitter = described_class.new(separator: /[.!?]\s*(?:\b|\z)/, chunk_size: 2)
56
+ result = splitter.split(text)
57
+ expect(result.to_a).to eq %w[ foo foo bar bar baz baz quux quux ]
58
+ end
59
+ end
60
+
61
+ RSpec.describe Documentrix::Documents::Splitters::RecursiveCharacter do
62
+ let :splitter do
63
+ described_class.new chunk_size: 23, combining_string: ''
64
+ end
65
+
66
+ it 'can be instantiated' do
67
+ expect(splitter).to be_a described_class
68
+ end
69
+
70
+ it 'can split' do
71
+ text = [ ?A * 10 ] * 10 * "\n\n"
72
+ result = splitter.split(text)
73
+ expect(result.count).to eq 5
74
+ expect(result.to_a.join('')).to eq ?A * 100
75
+ end
76
+
77
+ it 'cannot split' do
78
+ splitter = described_class.new chunk_size: 23, include_separator: true,
79
+ separators: described_class::DEFAULT_SEPARATORS[0..-2]
80
+ text = ?A * 25
81
+ result = splitter.split(text)
82
+ expect(result.count).to eq 1
83
+ expect(result.to_a.join('')).to eq ?A * 25
84
+ end
85
+
86
+ it 'can split including separator' do
87
+ splitter = described_class.new chunk_size: 25, include_separator: true,
88
+ combining_string: ''
89
+ text = [ ?A * 10 ] * 10 * "\n\n"
90
+ result = splitter.split(text)
91
+ expect(result.count).to eq 5
92
+ expect(result.to_a.join('')).to eq text
93
+ end
94
+
95
+ it 'can split single newline as well' do
96
+ text = [ ?A * 10 ] * 10 * "\n"
97
+ result = splitter.split(text)
98
+ expect(result.count).to eq 5
99
+ expect(result.to_a.join('')).to eq ?A * 100
100
+ end
101
+
102
+ it 'can split single newline as well including separator' do
103
+ splitter = described_class.new chunk_size: 25, include_separator: true,
104
+ combining_string: ''
105
+ text = [ ?A * 10 ] * 10 * "\n"
106
+ result = splitter.split(text)
107
+ expect(result.count).to eq 5
108
+ expect(result.to_a.join('')).to eq text
109
+ end
110
+ end
@@ -0,0 +1,56 @@
1
+ require 'spec_helper'
2
+
3
+ RSpec.describe Documentrix::Documents::Splitters::Semantic do
4
+ let :ollama do
5
+ double('Ollama::Client')
6
+ end
7
+
8
+ let :splitter do
9
+ described_class.new ollama:, model: 'mxbai-embed-large'
10
+ end
11
+
12
+ let :embeddings do
13
+ JSON(File.read(asset('embeddings.json')))
14
+ end
15
+
16
+ it 'can be instantiated' do
17
+ expect(splitter).to be_a described_class
18
+ end
19
+
20
+ before do
21
+ allow(ollama).to receive(:embed).and_return(double(embeddings:))
22
+ end
23
+
24
+ it 'can split with breakpoint :percentile' do
25
+ text = ([ "A" * 10 ] * 3 + [ "B" * 10 ] * 3 + [ "A" * 10 ] * 3) * ". "
26
+ result = splitter.split(text, breakpoint: :percentile, percentile: 75)
27
+ expect(result.count).to eq 3
28
+ expect(result.to_a.join('').count(?A)).to eq text.count(?A)
29
+ expect(result.to_a.join('').count(?B)).to eq text.count(?B)
30
+ end
31
+
32
+ it 'can split with breakpoint :percentile' do
33
+ described_class.new ollama:, model: 'mxbai-embed-large', chunk_size: 50
34
+ text = ([ "A" * 10 ] * 6 + [ "B" * 10 ] * 3 + [ "A" * 10 ] * 3) * ". "
35
+ result = splitter.split(text, breakpoint: :percentile, percentile: 75)
36
+ expect(result.count).to eq 4
37
+ expect(result.to_a.join('').count(?A)).to eq text.count(?A)
38
+ expect(result.to_a.join('').count(?B)).to eq text.count(?B)
39
+ end
40
+
41
+ it 'can split with breakpoint :standard_deviation' do
42
+ text = ([ "A" * 10 ] * 3 + [ "B" * 10 ] * 3 + [ "A" * 10 ] * 3) * ". "
43
+ result = splitter.split(text, breakpoint: :standard_deviation, percentage: 100)
44
+ expect(result.count).to eq 3
45
+ expect(result.to_a.join('').count(?A)).to eq text.count(?A)
46
+ expect(result.to_a.join('').count(?B)).to eq text.count(?B)
47
+ end
48
+
49
+ it 'can split with breakpoint :interquartile' do
50
+ text = ([ "A" * 10 ] * 3 + [ "B" * 10 ] * 3 + [ "A" * 10 ] * 3) * ". "
51
+ result = splitter.split(text, breakpoint: :interquartile, percentage: 75)
52
+ expect(result.count).to eq 3
53
+ expect(result.to_a.join('').count(?A)).to eq text.count(?A)
54
+ expect(result.to_a.join('').count(?B)).to eq text.count(?B)
55
+ end
56
+ end