ollama-ruby 0.12.0 → 0.13.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +1 -0
- data/CHANGES.md +11 -0
- data/README.md +2 -2
- data/Rakefile +4 -5
- data/bin/ollama_chat +27 -21
- data/lib/ollama/client/command.rb +2 -2
- data/lib/ollama/version.rb +1 -1
- data/lib/ollama.rb +0 -4
- data/ollama-ruby.gemspec +11 -12
- metadata +26 -88
- data/lib/ollama/documents/cache/common.rb +0 -36
- data/lib/ollama/documents/cache/memory_cache.rb +0 -44
- data/lib/ollama/documents/cache/records.rb +0 -87
- data/lib/ollama/documents/cache/redis_backed_memory_cache.rb +0 -39
- data/lib/ollama/documents/cache/redis_cache.rb +0 -68
- data/lib/ollama/documents/cache/sqlite_cache.rb +0 -215
- data/lib/ollama/documents/splitters/character.rb +0 -72
- data/lib/ollama/documents/splitters/semantic.rb +0 -91
- data/lib/ollama/documents.rb +0 -184
- data/lib/ollama/utils/colorize_texts.rb +0 -65
- data/lib/ollama/utils/math.rb +0 -48
- data/lib/ollama/utils/tags.rb +0 -67
- data/spec/assets/embeddings.json +0 -1
- data/spec/ollama/documents/cache/memory_cache_spec.rb +0 -97
- data/spec/ollama/documents/cache/redis_backed_memory_cache_spec.rb +0 -118
- data/spec/ollama/documents/cache/redis_cache_spec.rb +0 -121
- data/spec/ollama/documents/cache/sqlite_cache_spec.rb +0 -141
- data/spec/ollama/documents/splitters/character_spec.rb +0 -110
- data/spec/ollama/documents/splitters/semantic_spec.rb +0 -56
- data/spec/ollama/documents_spec.rb +0 -162
- data/spec/ollama/utils/colorize_texts_spec.rb +0 -13
- data/spec/ollama/utils/tags_spec.rb +0 -53
@@ -1,36 +0,0 @@
|
|
1
|
-
# Behaviour shared by the document caches: every key handed to the backing
# store is namespaced with a configurable prefix.
#
# Including classes are expected to provide `full_each`, which yields every
# key/value pair of the backing store regardless of the prefix.
module Ollama::Documents::Cache::Common
  include Ollama::Utils::Math

  # @param prefix [String] the prefix put in front of every key of this cache
  def initialize(prefix:)
    self.prefix = prefix
  end

  attr_accessor :prefix # current prefix defined for the cache

  # Returns an array of collection names that match the given prefix.
  #
  # Keys that do not match the prefix are skipped, and a String prefix is
  # matched literally (regexp metacharacters in it have no special meaning).
  #
  # @param prefix [String] a string to search for in collection names
  # @return [Array<Symbol>] an array of matching collection names
  def collections(prefix)
    # A Regexp prefix is still interpolated as a pattern, as before.
    source  = prefix.is_a?(Regexp) ? prefix : Regexp.escape(prefix.to_s)
    pattern = /\A#{source}(.*)-/
    names   = []
    full_each do |key, _|
      name = key[pattern, 1]
      names << name if name
    end
    names.uniq.map(&:to_sym)
  end

  # Returns a string representing the given `key` prefixed with the defined
  # prefix.
  #
  # @param key [String] the key to join with the prefix
  # @return [String] the joined string of prefix and key
  def pre(key)
    [ @prefix, key ].join
  end

  # Returns a string with the prefix removed from the given `key`.
  #
  # The prefix is compared literally, so a prefix containing regexp
  # metacharacters is handled correctly.
  #
  # @param key [String] the input string containing the prefix.
  # @return [String] the input string without the prefix.
  def unpre(key)
    key.delete_prefix(@prefix.to_s)
  end
end
|
@@ -1,44 +0,0 @@
|
|
1
|
-
require 'ollama/documents/cache/common'
|
2
|
-
|
3
|
-
# Cache that keeps all entries in a Hash held by the instance. Keys are
# stored with the cache prefix in front of them.
class Ollama::Documents::MemoryCache
  include Ollama::Documents::Cache::Common
  include Enumerable

  # @param prefix [String] prefix put in front of every stored key
  def initialize(prefix:)
    super(prefix:)
    @data = {}
  end

  # @param key [String] unprefixed key
  # @return [Object, nil] the value stored under the prefixed key, if any
  def [](key)
    prefixed = pre(key)
    @data[prefixed]
  end

  # Stores `value` under the prefixed `key`.
  def []=(key, value)
    prefixed = pre(key)
    @data[prefixed] = value
  end

  # @return [Boolean] whether the prefixed key is stored
  def key?(key)
    @data.has_key?(pre(key))
  end

  # Removes the prefixed key.
  #
  # @return [Object, nil] the removed value, if there was one
  def delete(key)
    prefixed = pre(key)
    @data.delete(prefixed)
  end

  # @return [Integer] number of entries yielded by #each
  def size = count

  # Removes every entry whose stored key starts with the prefix.
  #
  # @return [self]
  def clear
    @data.reject! { |stored_key, _| stored_key.start_with?(@prefix) }
    self
  end

  # Iterates over the entries whose stored key starts with the prefix,
  # yielding the stored (prefixed) key and the value.
  def each(&block)
    own_entries = @data.select { |stored_key, _| stored_key.start_with?(@prefix) }
    own_entries.each(&block)
  end

  # Iterates over all stored entries, regardless of the prefix.
  def full_each(&block)
    @data.each_pair(&block)
  end
end
|
@@ -1,87 +0,0 @@
|
|
1
|
-
module Ollama::Documents::Cache::Records
|
2
|
-
# A cached document record. Besides the attributes given on construction it
# always has a `text` (default `''`) and a `norm` (default `0.0`).
class Record < JSON::GenericObject
  def initialize(*a)
    super
    self.text ||= ''
    self.norm ||= 0.0
  end

  # @return [String] a short description containing the class, the text, the
  #   tags (if any) and the similarity (or 'n/a' if not set)
  def to_s
    tag_list   = tags_set
    tag_suffix = tag_list.empty? ? tag_list : " #{tag_list}"
    "#<#{self.class} #{text.inspect}#{tag_suffix} #{similarity || 'n/a'}>"
  end

  # @return [Ollama::Utils::Tags] the tags of this record, built from `tags`
  #   and `source`
  def tags_set
    Ollama::Utils::Tags.new(tags, source:)
  end

  # Two records are equal if their texts are equal. Objects that do not
  # respond to `text` (for example `nil`) are never equal to a record.
  #
  # @param other [Object] the object to compare with
  # @return [Boolean]
  def ==(other)
    other.respond_to?(:text) && text == other.text
  end

  alias inspect to_s
end
|
25
|
-
|
26
|
-
# Adds `full_each` to objects that expose a `redis` client.
module RedisFullEach
  # Scans all redis keys matching `Ollama::Documents-*` and yields each key
  # together with its value parsed into an Ollama::Documents::Record. Keys
  # for which redis returns no value are skipped.
  def full_each(&block)
    pattern = [ Ollama::Documents, '*' ].join('-')
    redis.scan_each(match: pattern) do |key|
      raw = redis.get(key)
      next unless raw
      record = JSON(raw, object_class: Ollama::Documents::Record)
      block.call(key, record)
    end
  end
end
|
35
|
-
|
36
|
-
# Similarity search for caches that are Enumerable over `[key, record]`
# pairs and provide `norm` and `cosine_similarity`.
module FindRecords
  # Ranks the records of this cache by cosine similarity to `needle`.
  #
  # As a side effect every ranked record gets its `key` and `similarity`
  # attributes assigned.
  #
  # @param needle [Object] the embedding vector to compare against
  # @param tags [Object, nil] if given, only records sharing at least one of
  #   these tags are considered; records without tags never match
  # @param max_records [Integer, nil] if given, at most this many records
  #   are returned
  # @return [Array] the records, most similar first
  def find_records(needle, tags: nil, max_records: nil)
    tags    = Ollama::Utils::Tags.new(Array(tags)).to_a
    records = self
    unless tags.empty?
      # `to_a` keeps records with nil tags from raising in Array#&.
      records = records.select { |_key, record| !(tags & record.tags.to_a).empty? }
    end
    needle_norm = norm(needle)
    ranked = records.sort_by { |key, record|
      record.key = key
      record.similarity = cosine_similarity(
        a: needle,
        b: record.embedding,
        a_norm: needle_norm,
        b_norm: record.norm,
      )
    }.map(&:last).reverse
    max_records ? ranked.first(max_records) : ranked
  end
end
|
56
|
-
|
57
|
-
# Tag-aware `clear` and `tags` for caches. Meant to sit in front of a cache
# implementation in the method lookup chain, because both methods fall back
# to `super`.
module Tags
  # Clears the cache, optionally restricted to records carrying certain tags.
  #
  # Without tags the original `clear` is called. With tags,
  # `clear_for_tags` is used if the cache provides it; otherwise every
  # record sharing at least one tag is deleted one by one.
  #
  # @param tags [Object, nil] the tags selecting the records to remove
  # @return [Object] `self` when tags were given, otherwise whatever the
  #   original `clear` returns
  def clear(tags: nil)
    tags = Ollama::Utils::Tags.new(tags).to_a
    return super() unless tags.present?

    if respond_to?(:clear_for_tags)
      clear_for_tags(tags)
    else
      each do |key, record|
        delete(unpre(key)) unless (tags & record.tags.to_a).empty?
      end
    end
    self
  end

  # Returns the tags of all records in the cache. Uses the cache's own
  # `tags` implementation if there is one; otherwise collects them by
  # iterating over all records. Records without tags are skipped.
  #
  # @return [Ollama::Utils::Tags]
  def tags
    return super if defined?(super)

    each_with_object(Ollama::Utils::Tags.new) do |(_, record), collected|
      record.tags.to_a.each do |tag|
        collected.add(tag, source: record.source)
      end
    end
  end
end
|
87
|
-
end
|
@@ -1,39 +0,0 @@
|
|
1
|
-
require 'redis'
|
2
|
-
|
3
|
-
class Ollama::Documents
|
4
|
-
# A MemoryCache that mirrors every write, delete and clear to redis and is
# filled from redis on construction.
class RedisBackedMemoryCache < MemoryCache
  # @param prefix [String] prefix put in front of every key
  # @param url [String] the redis url, defaults to ENV['REDIS_URL']
  # @param object_class [Class, nil] handed on to the underlying RedisCache
  # @raise [ArgumentError] if no redis url is available
  def initialize(prefix:, url: ENV['REDIS_URL'], object_class: nil)
    super(prefix:)
    url or raise ArgumentError, 'require redis url'
    @url, @object_class = url, object_class
    @redis_cache = Ollama::Documents::RedisCache.new(prefix:, url:, object_class:)
    @redis_cache.extend(Ollama::Documents::Cache::Records::RedisFullEach)
    # Keys yielded by full_each are the stored redis keys, so they are
    # copied into @data as they are.
    @redis_cache.full_each do |key, value|
      @data[key] = value
    end
  end

  attr_reader :object_class

  # @return [Object] the redis client of the underlying RedisCache
  def redis
    @redis_cache.redis
  end

  # Stores `value` in memory and writes its JSON representation to redis.
  #
  # JSON.generate is used (as in RedisCache#set) because `JSON(value)`
  # would try to parse, not generate, if the value is a String.
  def []=(key, value)
    super
    redis.set(pre(key), JSON.generate(value))
  end

  # Deletes the key from redis and from memory.
  #
  # @return [Object] the result of the redis `del` call
  def delete(key)
    result = redis.del(pre(key))
    super
    result
  end

  # Deletes all keys starting with the prefix from redis and from memory.
  #
  # @return [self]
  def clear
    redis.scan_each(match: "#@prefix*") { |key| redis.del(key) }
    super
    self
  end
end
|
39
|
-
end
|
@@ -1,68 +0,0 @@
|
|
1
|
-
require 'ollama/documents/cache/common'
|
2
|
-
require 'redis'
|
3
|
-
|
4
|
-
# Cache storing JSON serialised values in redis under prefixed keys.
class Ollama::Documents::RedisCache
  include Ollama::Documents::Cache::Common
  include Enumerable

  # @param prefix [String] prefix put in front of every key
  # @param url [String] the redis url, defaults to ENV['REDIS_URL']
  # @param object_class [Class, nil] if given, passed as `object_class` when
  #   parsing stored JSON
  # @param ex [Integer, nil] default expiry passed to redis `set`
  # @raise [ArgumentError] if no redis url is available
  def initialize(prefix:, url: ENV['REDIS_URL'], object_class: nil, ex: nil)
    super(prefix:)
    url or raise ArgumentError, 'require redis url'
    @url, @object_class, @ex = url, object_class, ex
  end

  attr_reader :object_class

  # @return [Redis] the lazily created redis client
  def redis
    @redis ||= Redis.new(url: @url)
  end

  # @param key [String] unprefixed key
  # @return [Object, nil] the parsed value, or nil if redis has no value
  def [](key)
    raw = redis.get(pre(key))
    return if raw.nil?

    object_class ? JSON(raw, object_class:) : JSON(raw)
  end

  # Stores `value` using the default expiry.
  def []=(key, value)
    set(key, value)
  end

  # Stores `value` as JSON under the prefixed key. An expiry smaller than 1
  # deletes the key instead of storing it.
  #
  # @param ex [Integer, nil] expiry, falls back to the default expiry
  # @return [Object] the given value
  def set(key, value, ex: nil)
    ex ||= @ex
    if !ex.nil? && ex < 1
      redis.del(pre(key))
    else
      redis.set(pre(key), JSON.generate(value), ex:)
    end
    value
  end

  # @return [Integer] the result of the redis `ttl` call for the key
  def ttl(key)
    redis.ttl(pre(key))
  end

  # @return [Boolean] whether redis has the prefixed key
  def key?(key)
    !!redis.exists?(pre(key))
  end

  # @return [Boolean] true if exactly one key was deleted
  def delete(key)
    redis.del(pre(key)) == 1
  end

  # @return [Integer] number of redis keys starting with the prefix
  def size
    total = 0
    redis.scan_each(match: "#@prefix*") { |_key| total += 1 }
    total
  end

  # Deletes all redis keys starting with the prefix.
  #
  # @return [self]
  def clear
    redis.scan_each(match: "#@prefix*") { |key| redis.del(key) }
    self
  end

  # Yields every stored (prefixed) key starting with the prefix together
  # with its parsed value.
  #
  # @return [self, Enumerator] an Enumerator if no block is given
  def each(&block)
    block or return enum_for(__method__)
    redis.scan_each(match: "#@prefix*") { |key| block.(key, self[unpre(key)]) }
    self
  end
end
|
@@ -1,215 +0,0 @@
|
|
1
|
-
require 'ollama/documents/cache/common'
|
2
|
-
require 'sqlite3'
|
3
|
-
require 'sqlite_vec'
|
4
|
-
require 'digest/md5'
|
5
|
-
|
6
|
-
# Cache storing records in an SQLite database. The record attributes live in
# the `records` table, the embedding vectors in the `embeddings` vec0
# virtual table.
class Ollama::Documents::Cache::SQLiteCache
  include Ollama::Documents::Cache::Common
  include Enumerable

  # Maximum number of keys bound to a single DELETE statement in
  # #clear_for_tags.
  DELETE_BATCH_SIZE = 500

  # @param prefix [String] prefix put in front of every key
  # @param embedding_length [Integer] length of the embedding vectors
  # @param filename [String] database file, `:memory:` is in memory
  # @param debug [Boolean] if true, every statement is explained on STDERR
  def initialize(prefix:, embedding_length: 1_024, filename: ':memory:', debug: false)
    super(prefix:)
    @embedding_length = embedding_length
    @filename         = filename
    @debug            = debug
    setup_database(filename)
  end

  attr_reader :filename # filename for the database, `:memory:` is in memory

  attr_reader :embedding_length # length of the embeddings vector

  # @param key [String] unprefixed key
  # @return [Ollama::Documents::Record, nil] the record stored under the
  #   prefixed key, or nil if there is none
  def [](key)
    row = execute(
      %{
        SELECT records.key, records.text, records.norm, records.source,
          records.tags, embeddings.embedding
        FROM records
        INNER JOIN embeddings ON records.embedding_id = embeddings.rowid
        WHERE records.key = ?
      },
      pre(key)
    )&.first or return
    row_to_record(*row)
  end

  # Stores `value` under the prefixed `key`. The embedding and the record
  # are inserted in one transaction, which is rolled back if either insert
  # raises.
  def []=(key, value)
    value     = convert_value_to_record(value)
    embedding = value.embedding.pack("f*")
    @database.transaction do
      execute(%{INSERT INTO embeddings(embedding) VALUES(?)}, [ embedding ])
      embedding_id, = execute(%{ SELECT last_insert_rowid() }).flatten
      execute(%{
        INSERT INTO records(key,text,embedding_id,norm,source,tags)
        VALUES(?,?,?,?,?,?)
      }, [ pre(key), value.text, embedding_id, value.norm, value.source, JSON(value.tags) ])
    end
  end

  # @return [Boolean] whether a record with the prefixed key exists
  def key?(key)
    execute(
      %{ SELECT count(records.key) FROM records WHERE records.key = ? },
      pre(key)
    ).flatten.first == 1
  end

  # Deletes the record with the prefixed key.
  #
  # NOTE(review): only the `records` row is deleted here; the row in
  # `embeddings` appears to stay behind — confirm whether that is intended.
  #
  # @return [String, nil] the prefixed key if it existed, nil otherwise
  def delete(key)
    result = key?(key) ? pre(key) : nil
    execute(
      %{ DELETE FROM records WHERE records.key = ? },
      pre(key)
    )
    result
  end

  # @return [Ollama::Utils::Tags] all tags of the records whose key starts
  #   with the prefix
  def tags
    result = Ollama::Utils::Tags.new
    execute(%{
      SELECT DISTINCT(tags) FROM records WHERE key LIKE ?
    }, [ "#@prefix%" ]
    ).flatten.each do |tags_json|
      JSON(tags_json).each { |t| result.add(t) }
    end
    result
  end

  # @return [Integer] number of records whose key starts with the prefix
  def size
    execute(%{SELECT COUNT(*) FROM records WHERE key LIKE ?}, [ "#@prefix%" ]).flatten.first
  end

  # Deletes all records carrying at least one of `tags`; without tags the
  # whole cache is cleared. Nothing happens if no record matches.
  #
  # @param tags [Object, nil] the tags selecting the records to delete
  # @return [self]
  def clear_for_tags(tags = nil)
    tags = Ollama::Utils::Tags.new(tags).to_a
    if tags.present?
      keys = find_records_for_tags(tags).map(&:first)
      # Keys are bound as parameters; batches keep the number of bound
      # variables per statement small.
      keys.each_slice(DELETE_BATCH_SIZE) do |batch|
        placeholders = Array.new(batch.size, '?').join(?,)
        execute(%{DELETE FROM records WHERE key IN (#{placeholders})}, batch)
      end
    else
      clear
    end
    self
  end

  # Deletes all records whose key starts with the prefix.
  #
  # @return [self]
  def clear
    execute(%{DELETE FROM records WHERE key LIKE ?}, [ "#@prefix%" ])
    self
  end

  # Yields the stored (prefixed) key and the record for every record whose
  # key matches the LIKE pattern `prefix`.
  #
  # @param prefix [String] an SQL LIKE pattern, defaults to the cache prefix
  #   followed by `%`
  # @return [Object, Enumerator] an Enumerator if no block is given
  def each(prefix: "#@prefix%", &block)
    block or return enum_for(__method__, prefix:)
    execute(%{
      SELECT records.key, records.text, records.norm, records.source,
        records.tags, embeddings.embedding
      FROM records
      INNER JOIN embeddings ON records.embedding_id = embeddings.rowid
      WHERE records.key LIKE ?
    }, [ prefix ]).each do |key, text, norm, source, tags, embedding|
      block.(key, row_to_record(key, text, norm, source, tags, embedding))
    end
  end

  # Yields every record of the database, regardless of the prefix.
  def full_each(&block)
    each(prefix: ?%, &block)
  end

  # @return [Object] the given vector, unchanged
  def convert_to_vector(vector)
    vector
  end

  # Returns the rows (key, tags, embedding_id) of the records whose key
  # starts with the prefix. If tags are given, only rows sharing at least
  # one tag with them are returned.
  #
  # The SQL LIKE conditions only pre-select candidates; the exact tag match
  # is done on the parsed JSON afterwards.
  #
  # @param tags [Object, nil] the tags to filter by
  # @return [Array<Array>] rows of key, tags JSON and embedding id
  def find_records_for_tags(tags)
    binds = [ "#@prefix%" ]
    if tags.present?
      tags_filter = Ollama::Utils::Tags.new(tags).to_a
      unless tags_filter.empty?
        tags_where = ' AND (%s)' % tags_filter.map { 'tags LIKE ?' }.join(' OR ')
        binds.concat(tags_filter.map { "%#{_1}%" })
      end
    end
    records = execute(%{
      SELECT key, tags, embedding_id
      FROM records
      WHERE key LIKE ?#{tags_where}
    }, binds)
    if tags_filter
      records = records.select { |_key, tags_json, _embedding_id|
        (tags_filter & JSON(tags_json.to_s).to_a).size >= 1
      }
    end
    records
  end

  # Finds the records whose embeddings match `needle`, restricted to the
  # records of this cache (and to `tags`, if given).
  #
  # @param needle [Array] embedding vector of length `embedding_length`
  # @param tags [Object, nil] only consider records with one of these tags
  # @param max_records [Integer, nil] upper bound for the number of records;
  #   the bound actually used is the minimum of this, #size and 4096
  # @return [Array<Ollama::Documents::Record>] records with unprefixed keys
  # @raise [ArgumentError] if the needle has the wrong length
  def find_records(needle, tags: nil, max_records: nil)
    needle.size != @embedding_length and
      raise ArgumentError, "needle embedding length != %s" % @embedding_length
    needle_binary = needle.pack("f*")
    max_records   = [ max_records, size, 4_096 ].compact.min
    records       = find_records_for_tags(tags)
    rowids_where  = '(%s)' % records.map(&:last).join(?,)
    execute(%{
      SELECT records.key, records.text, records.norm, records.source,
        records.tags, embeddings.embedding
      FROM records
      INNER JOIN embeddings ON records.embedding_id = embeddings.rowid
      WHERE embeddings.rowid IN #{rowids_where}
        AND embeddings.embedding MATCH ? AND embeddings.k = ?
    }, [ needle_binary, max_records ]).map do |key, text, norm, source, tags, embedding|
      row_to_record(unpre(key), text, norm, source, tags, embedding)
    end
  end

  private

  # Runs a statement on the database. In debug mode the statement and its
  # EXPLAIN output are written to STDERR first.
  def execute(*a)
    if @debug
      sql    = a[0].gsub(/^\s*\n/, '')
      indent = sql[/\A\s+/].to_s.size # width of the leading whitespace
      sql    = sql.sub(/\A\s+/, '')
      sql    = sql.gsub(/^\s{0,#{indent}}/, '').chomp
      STDERR.puts("EXPLANATION:\n%s\n%s" % [
        sql,
        @database.execute("EXPLAIN #{sql}", *a[1..-1]).pretty_inspect
      ])
    end
    @database.execute(*a)
  end

  # @return [String] the string with quotes escaped by SQLite3::Database.quote
  def quote(string)
    SQLite3::Database.quote(string)
  end

  # Opens the database, loads the sqlite-vec extension and creates the
  # `embeddings` and `records` tables unless they exist.
  def setup_database(filename)
    @database = SQLite3::Database.new(filename)
    @database.enable_load_extension(true)
    SqliteVec.load(@database)
    @database.enable_load_extension(false)
    execute %{
      CREATE VIRTUAL TABLE IF NOT EXISTS embeddings USING vec0(
        embedding float[#@embedding_length]
      )
    }
    execute %{
      CREATE TABLE IF NOT EXISTS records (
        key          text NOT NULL PRIMARY KEY ON CONFLICT REPLACE,
        text         text NOT NULL DEFAULT '',
        embedding_id integer,
        norm         float NOT NULL DEFAULT 0.0,
        source       text,
        tags         json NOT NULL DEFAULT [],
        FOREIGN KEY(embedding_id) REFERENCES embeddings(id) ON DELETE CASCADE
      )
    }
  end

  # Builds a record from the columns of a result row: the binary embedding
  # is unpacked into floats and the tags JSON is parsed.
  def row_to_record(key, text, norm, source, tags, embedding)
    embedding = embedding.unpack("f*")
    tags      = Ollama::Utils::Tags.new(JSON(tags.to_s).to_a, source:)
    convert_value_to_record(key:, text:, norm:, source:, tags:, embedding:)
  end

  # @return [Ollama::Documents::Record] the value itself if it already is a
  #   record, otherwise a record built from `value.to_hash`
  def convert_value_to_record(value)
    value.is_a?(Ollama::Documents::Record) and return value
    Ollama::Documents::Record[value.to_hash]
  end
end
|
@@ -1,72 +0,0 @@
|
|
1
|
-
module Ollama::Documents::Splitters
|
2
|
-
# Splits a text at a separator and recombines the pieces into chunks.
class Character
  DEFAULT_SEPARATOR = /(?:\r?\n){2,}/

  # @param separator [Regexp, String] where to split the text
  # @param include_separator [Boolean] if true, each separator is kept and
  #   appended to the piece in front of it
  # @param combining_string [String] appended after every piece that is
  #   added to a chunk that still has room
  # @param chunk_size [Integer] size limit used when combining pieces
  def initialize(separator: DEFAULT_SEPARATOR, include_separator: false, combining_string: "\n\n", chunk_size: 4096)
    @include_separator = include_separator
    @combining_string  = combining_string
    @chunk_size        = chunk_size
    # A capture group makes String#split yield the separators as well.
    @separator         = include_separator ? Regexp.new("(#{separator})") : separator
  end

  # @param text [String] the text to split
  # @return [Array<String>] the chunks
  def split(text)
    pieces = []
    text.split(@separator) do |piece|
      if @include_separator && @separator.match?(piece)
        pieces.last&.concat(piece)
      else
        pieces << piece
      end
    end

    chunks = []
    buffer = +''
    pieces.each do |piece|
      if buffer.size + piece.size < @chunk_size
        buffer << piece << @combining_string
        next
      end
      # The piece does not fit: flush the buffer and start a new one with
      # the piece itself (no combining string is appended here).
      chunks << buffer unless buffer.empty?
      buffer = piece
    end
    chunks << buffer unless buffer.empty?
    chunks
  end
end
|
36
|
-
|
37
|
-
# Splits a text with a list of separators: chunks that are still larger than
# the chunk size are split again with the next separator in the list.
class RecursiveCharacter
  DEFAULT_SEPARATORS = [
    /(?:\r?\n){2,}/,
    /\r?\n/,
    /\b/,
    //,
  ].freeze

  # @param separators [Array<Regexp, String>] separators, tried in order
  # @param include_separator [Boolean] passed on to Character
  # @param combining_string [String] passed on to Character
  # @param chunk_size [Integer] passed on to Character; also the size above
  #   which a chunk is split again
  # @raise [ArgumentError] if `separators` is empty
  def initialize(separators: DEFAULT_SEPARATORS, include_separator: false, combining_string: "\n\n", chunk_size: 4096)
    raise ArgumentError, "non-empty array of separators required" if separators.empty?

    @separators        = separators
    @include_separator = include_separator
    @combining_string  = combining_string
    @chunk_size        = chunk_size
  end

  # @param text [String] the text to split
  # @param separators [Array<Regexp, String>] the separators still to try
  # @return [Array<String>] the chunks; `[ text ]` if no separators are left
  #   or splitting produced nothing
  def split(text, separators: @separators)
    return [ text ] if separators.empty?

    separator, *remaining = separators
    splitter = Character.new(
      separator:,
      include_separator: @include_separator,
      combining_string: @combining_string,
      chunk_size: @chunk_size
    )
    pieces = splitter.split(text)
    return [ text ] if pieces.empty?

    pieces.flat_map do |piece|
      piece.size > @chunk_size ? split(piece, separators: remaining) : [ piece ]
    end
  end
end
|
72
|
-
end
|
@@ -1,91 +0,0 @@
|
|
1
|
-
module Ollama::Documents::Splitters
|
2
|
-
# Splits a text into sentences and regroups them into chunks at the places
# where the embeddings of neighbouring sentences are far apart.
class Semantic
  include Ollama::Utils::Math

  DEFAULT_SEPARATOR = /[.!?]\s*(?:\b|\z)/

  # @param ollama [Object] client used to compute the embeddings via `embed`
  # @param model [String] the embedding model
  # @param model_options [Object, nil] options passed to `embed`
  # @param separator [Regexp] sentence separator
  # @param chunk_size [Integer] size limit used when combining sentences
  def initialize(ollama:, model:, model_options: nil, separator: DEFAULT_SEPARATOR, chunk_size: 4096)
    @ollama        = ollama
    @model         = model
    @model_options = model_options
    @separator     = separator
    @chunk_size    = chunk_size
  end

  # @param text [String] the text to split
  # @param batch_size [Integer] number of sentences embedded per request
  # @param breakpoint [Symbol] threshold method: :percentile,
  #   :standard_deviation or :interquartile
  # @param opts [Hash] `:include_separator` (default true) and the options
  #   of the chosen threshold method (`:percentile` or `:percentage`)
  # @return [Array<String>] the chunks, or the sentences themselves if there
  #   are fewer than two embeddings or no distance exceeds the threshold
  def split(text, batch_size: 100, breakpoint: :percentile, **opts)
    sentences = Ollama::Documents::Splitters::Character.new(
      separator: @separator,
      include_separator: opts.fetch(:include_separator, true),
      chunk_size: 1,
    ).split(text)

    batches    = sentences.with_infobar(label: 'Split').each_slice(batch_size)
    embeddings = batches.each_with_object([]) do |batch, collected|
      collected.concat sentence_embeddings(batch)
      infobar.progress by: batch.size
    end
    infobar.newline
    return sentences if embeddings.size < 2

    distances = embeddings.each_cons(2).map do |left, right|
      1.0 - cosine_similarity(a: left, b: right)
    end
    threshold = calculate_breakpoint_threshold(breakpoint, distances, **opts)

    # Indices of the distances above the threshold mark the chunk borders.
    gaps = distances.each_index.select { |index| distances[index] > threshold }
    return sentences if gaps.empty?

    gaps << distances.size if gaps.last < distances.size
    gaps << sentences.size - 1 if gaps.last < sentences.size - 1

    chunks = []
    start  = 0
    buffer = +''
    gaps.each do |gap|
      (start..gap).each do |index|
        sentence = sentences[index]
        if buffer.size + sentence.size < @chunk_size
          buffer += sentence
        else
          chunks << buffer unless buffer.empty?
          buffer = sentence
        end
      end
      # A gap always ends the current chunk.
      unless buffer.empty?
        chunks << buffer
        buffer = +''
      end
      start = gap.succ
    end
    chunks << buffer unless buffer.empty?
    chunks
  end

  private

  # Computes the distance above which two neighbouring sentences are put
  # into different chunks.
  #
  # @raise [ArgumentError] if the breakpoint method is unknown
  def calculate_breakpoint_threshold(breakpoint_method, distances, **opts)
    sequence = MoreMath::Sequence.new(distances)
    case breakpoint_method
    when :percentile
      sequence.percentile(opts.fetch(:percentile, 95))
    when :standard_deviation
      factor = opts.fetch(:percentage, 100) / 100.0
      (sequence.mean + sequence.standard_deviation * factor).clamp(0, sequence.max)
    when :interquartile
      percentage = opts.fetch(:percentage, 100)
      iqr        = sequence.interquartile_range
      upper      = sequence.max
      (sequence.mean + iqr * (percentage / 100.0)).clamp(0, upper)
    else
      raise ArgumentError, "invalid breakpoint method #{breakpoint_method}"
    end
  end

  # @return [Array] one Numo::NArray per input, built from the embeddings
  #   returned by the client
  def sentence_embeddings(input)
    response = @ollama.embed(model: @model, input:, options: @model_options)
    response.embeddings.map! { |vector| Numo::NArray[*vector] }
  end
end
|
91
|
-
end
|