ollama-ruby 0.12.1 → 0.13.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,36 +0,0 @@
1
# Behaviour shared by the document cache implementations: every key is stored
# under a configurable string prefix, and these helpers add, strip and inspect
# that prefix.
#
# The including class must provide `full_each`, which yields every stored
# `key, value` pair regardless of prefix (used by #collections).
module Ollama::Documents::Cache::Common
  include Ollama::Utils::Math

  # @param prefix [String] the prefix prepended to every key of this cache
  def initialize(prefix:)
    self.prefix = prefix
  end

  attr_accessor :prefix # current prefix defined for the cache

  # Returns the unique collection names found in the stored keys that start
  # with the given prefix. A key contributes the text between the prefix and
  # the last following `-`; keys that do not match are skipped.
  #
  # @param prefix [String] a string to search for in collection names
  # @return [Array<Symbol>] an array of matching collection names
  def collections(prefix)
    # Escape the prefix so characters such as `.` or `(` are matched literally.
    pattern = /\A#{Regexp.escape(prefix.to_s)}(.*)-/
    names   = []
    full_each do |key, _|
      # Keys outside the prefix yield nil here and must not reach #to_sym.
      name = key[pattern, 1] and names << name.to_sym
    end
    names.uniq
  end

  # Returns a string representing the given `key` prefixed with the defined
  # prefix.
  #
  # @param key [String] the key to join with the prefix
  # @return [String] the joined string of prefix and key
  def pre(key)
    [ @prefix, key ].join
  end

  # Returns a string with the prefix removed from the given `key`. The prefix
  # is compared literally; a key that does not start with it is returned
  # unchanged (as a new string).
  #
  # @param key [String] the input string containing the prefix.
  # @return [String] the input string without the prefix.
  def unpre(key)
    key.delete_prefix(@prefix.to_s)
  end
end
@@ -1,44 +0,0 @@
1
- require 'ollama/documents/cache/common'
2
-
3
# In-process cache that keeps all entries in a plain Hash. Keys are stored
# with the cache prefix (see `pre`), so the prefix decides which entries
# belong to this cache instance.
class Ollama::Documents::MemoryCache
  include Ollama::Documents::Cache::Common
  include Enumerable

  # @param prefix [String] prefix prepended to every stored key
  def initialize(prefix:)
    super(prefix:)
    @data = {}
  end

  # @param key [String] unprefixed key
  # @return [Object, nil] the stored value, or nil when absent
  def [](key)
    @data[pre(key)]
  end

  # Stores `value` under the prefixed `key`.
  def []=(key, value)
    @data[pre(key)] = value
  end

  # @return [Boolean] whether the prefixed `key` is present
  def key?(key)
    @data.key?(pre(key))
  end

  # Removes the prefixed `key` and returns the value it held (nil if absent).
  def delete(key)
    @data.delete(pre(key))
  end

  # @return [Integer] number of entries whose key starts with the prefix
  #   (counted through #each via Enumerable#count)
  def size
    count
  end

  # Drops every entry whose key starts with the prefix.
  #
  # @return [self]
  def clear
    @data.delete_if { |stored_key, _| stored_key.start_with?(@prefix) }
    self
  end

  # Iterates over the entries whose key starts with the prefix. The matching
  # entries are copied into a new Hash first, so the underlying store may be
  # modified while iterating.
  def each(&block)
    own_entries = @data.select { |stored_key,| stored_key.start_with?(@prefix) }
    own_entries.each(&block)
  end

  # Iterates over every stored entry, regardless of prefix.
  def full_each(&block)
    @data.each(&block)
  end
end
@@ -1,87 +0,0 @@
1
- module Ollama::Documents::Cache::Records
2
- class Record < JSON::GenericObject
3
- def initialize(*a)
4
- super
5
- self.text ||= ''
6
- self.norm ||= 0.0
7
- end
8
-
9
- def to_s
10
- my_tags = tags_set
11
- my_tags.empty? or my_tags = " #{my_tags}"
12
- "#<#{self.class} #{text.inspect}#{my_tags} #{similarity || 'n/a'}>"
13
- end
14
-
15
- def tags_set
16
- Ollama::Utils::Tags.new(tags, source:)
17
- end
18
-
19
- def ==(other)
20
- text == other.text
21
- end
22
-
23
- alias inspect to_s
24
- end
25
-
26
- module RedisFullEach
27
- def full_each(&block)
28
- redis.scan_each(match: [ Ollama::Documents, ?* ] * ?-) do |key|
29
- value = redis.get(key) or next
30
- value = JSON(value, object_class: Ollama::Documents::Record)
31
- block.(key, value)
32
- end
33
- end
34
- end
35
-
36
- module FindRecords
37
- def find_records(needle, tags: nil, max_records: nil)
38
- tags = Ollama::Utils::Tags.new(Array(tags)).to_a
39
- records = self
40
- if tags.present?
41
- records = records.select { |_key, record| (tags & record.tags).size >= 1 }
42
- end
43
- needle_norm = norm(needle)
44
- records = records.sort_by { |key, record|
45
- record.key = key
46
- record.similarity = cosine_similarity(
47
- a: needle,
48
- b: record.embedding,
49
- a_norm: needle_norm,
50
- b_norm: record.norm,
51
- )
52
- }
53
- records.transpose.last&.reverse.to_a
54
- end
55
- end
56
-
57
- module Tags
58
- def clear(tags: nil)
59
- tags = Ollama::Utils::Tags.new(tags).to_a
60
- if tags.present?
61
- if respond_to?(:clear_for_tags)
62
- clear_for_tags(tags)
63
- else
64
- each do |key, record|
65
- if (tags & record.tags.to_a).size >= 1
66
- delete(unpre(key))
67
- end
68
- end
69
- end
70
- else
71
- super()
72
- end
73
- end
74
-
75
- def tags
76
- if defined? super
77
- super
78
- else
79
- each_with_object(Ollama::Utils::Tags.new) do |(_, record), t|
80
- record.tags.each do |tag|
81
- t.add(tag, source: record.source)
82
- end
83
- end
84
- end
85
- end
86
- end
87
- end
@@ -1,39 +0,0 @@
1
- require 'redis'
2
-
3
class Ollama::Documents
  # Memory cache that mirrors its writes, deletes and clears to Redis and is
  # filled from Redis when it is created.
  class RedisBackedMemoryCache < MemoryCache
    # @param prefix [String] prefix prepended to every key
    # @param url [String] Redis URL, defaults to ENV['REDIS_URL']
    # @param object_class [Class, nil] handed on to the underlying RedisCache
    # @raise [ArgumentError] when no Redis URL is available
    def initialize(prefix:, url: ENV['REDIS_URL'], object_class: nil)
      super(prefix:)
      raise ArgumentError, 'require redis url' unless url
      @url          = url
      @object_class = object_class
      @redis_cache  = Ollama::Documents::RedisCache.new(prefix:, url:, object_class:)
      @redis_cache.extend(Ollama::Documents::Cache::Records::RedisFullEach)
      # Preload every record found in Redis; keys are stored as they come,
      # i.e. already prefixed.
      @redis_cache.full_each { |stored_key, record| @data[stored_key] = record }
    end

    attr_reader :object_class

    # @return the Redis connection of the underlying RedisCache
    def redis
      @redis_cache.redis
    end

    # Stores the value in memory, then writes it to Redis as JSON.
    def []=(key, value)
      super
      redis.set(pre(key), JSON(value))
    end

    # Deletes the key from Redis and from memory.
    #
    # @return the result of the Redis `del` call
    def delete(key)
      deleted = redis.del(pre(key))
      super
      deleted
    end

    # Removes every key starting with the prefix from Redis and from memory.
    #
    # @return [self]
    def clear
      redis.scan_each(match: "#@prefix*") { |stored_key| redis.del(stored_key) }
      super
      self
    end
  end
end
@@ -1,68 +0,0 @@
1
- require 'ollama/documents/cache/common'
2
- require 'redis'
3
-
4
# Cache that stores its values as JSON strings in Redis, under keys carrying
# the cache prefix.
class Ollama::Documents::RedisCache
  include Ollama::Documents::Cache::Common
  include Enumerable

  # @param prefix [String] prefix prepended to every key
  # @param url [String] Redis URL, defaults to ENV['REDIS_URL']
  # @param object_class [Class, nil] class used for JSON objects when reading
  # @param ex [Integer, nil] default expiry passed to Redis on #set
  # @raise [ArgumentError] when no Redis URL is available
  def initialize(prefix:, url: ENV['REDIS_URL'], object_class: nil, ex: nil)
    super(prefix:)
    raise ArgumentError, 'require redis url' unless url
    @url          = url
    @object_class = object_class
    @ex           = ex
  end

  attr_reader :object_class

  # @return [Redis] the lazily created Redis connection
  def redis
    @redis ||= Redis.new(url: @url)
  end

  # Reads and parses the value stored for `key`.
  #
  # @return the parsed value, or nil when the key is absent
  def [](key)
    json = redis.get(pre(key))
    return if json.nil?
    if object_class
      JSON(json, object_class:)
    else
      JSON(json)
    end
  end

  # Stores `value` using the default expiry (see #set).
  def []=(key, value)
    set(key, value)
  end

  # Stores `value` as JSON under `key`. An expiry below 1 deletes the key
  # instead of writing it.
  #
  # @param ex [Integer, nil] expiry for this write; falls back to the
  #   default given to the constructor
  # @return the given value
  def set(key, value, ex: nil)
    expiry  = ex || @ex
    expired = !expiry.nil? && expiry < 1
    if expired
      redis.del(pre(key))
    else
      redis.set(pre(key), JSON.generate(value), ex: expiry)
    end
    value
  end

  # @return the result of the Redis `ttl` call for the prefixed key
  def ttl(key)
    redis.ttl(pre(key))
  end

  # @return [Boolean] whether the prefixed key exists in Redis
  def key?(key)
    !!redis.exists?(pre(key))
  end

  # @return [Boolean] true when exactly one key was deleted
  def delete(key)
    redis.del(pre(key)) == 1
  end

  # @return [Integer] number of keys starting with the prefix
  def size
    total = 0
    redis.scan_each(match: "#@prefix*") { |_stored_key| total += 1 }
    total
  end

  # Deletes every key starting with the prefix.
  #
  # @return [self]
  def clear
    redis.scan_each(match: "#@prefix*") { |stored_key| redis.del(stored_key) }
    self
  end

  # Passes each prefixed key and its parsed value to the block.
  #
  # @return [self]
  def each(&block)
    redis.scan_each(match: "#@prefix*") do |stored_key|
      block.(stored_key, self[unpre(stored_key)])
    end
    self
  end
end
@@ -1,215 +0,0 @@
1
- require 'ollama/documents/cache/common'
2
- require 'sqlite3'
3
- require 'sqlite_vec'
4
- require 'digest/md5'
5
-
6
# Cache that stores records in a SQLite database: the `records` table holds
# key, text, norm, source and JSON-encoded tags, and the `embeddings` vec0
# virtual table holds the embedding vectors as packed floats ("f*").
#
# NOTE(review): #delete, #clear and replacing an existing key only touch the
# `records` table; the referenced rows in `embeddings` appear to stay behind.
# Confirm whether that is intended.
class Ollama::Documents::Cache::SQLiteCache
  include Ollama::Documents::Cache::Common

  # Upper bound of keys bound to a single DELETE statement, kept well below
  # SQLite's limit on bound parameters.
  DELETE_BATCH_SIZE = 500

  # SELECT shared by all methods that load complete records.
  RECORD_SELECT = %{
    SELECT records.key, records.text, records.norm, records.source,
      records.tags, embeddings.embedding
    FROM records
    INNER JOIN embeddings ON records.embedding_id = embeddings.rowid
  }.freeze

  # @param prefix [String] prefix prepended to every key
  # @param embedding_length [Integer] length of the embeddings vector
  # @param filename [String] database file, `:memory:` is in memory
  # @param debug [Boolean] when true every statement is printed to STDERR
  #   together with its EXPLAIN output before it is executed
  def initialize(prefix:, embedding_length: 1_024, filename: ':memory:', debug: false)
    super(prefix:)
    @embedding_length = embedding_length
    @filename         = filename
    @debug            = debug
    setup_database(filename)
  end

  attr_reader :filename # filename for the database, `:memory:` is in memory

  attr_reader :embedding_length # length of the embeddings vector

  # Loads the record stored for `key`.
  #
  # @param key [String] unprefixed key
  # @return [Ollama::Documents::Record, nil] the record (its `key` attribute
  #   is the prefixed key), or nil when absent
  def [](key)
    row = execute(
      %{
        #{RECORD_SELECT}
        WHERE records.key = ?
      },
      pre(key)
    )&.first or return
    row_to_record(row)
  end

  # Stores `value` under `key`: inserts the embedding, then the record that
  # references it. Both inserts run in one transaction that is rolled back
  # if either of them fails, so no open transaction is left behind.
  #
  # @param key [String] unprefixed key
  # @param value [Ollama::Documents::Record, #to_hash] the record to store
  def []=(key, value)
    value = convert_value_to_record(value)
    embedding = value.embedding.pack("f*")
    execute(%{BEGIN})
    begin
      execute(%{INSERT INTO embeddings(embedding) VALUES(?)}, [ embedding ])
      embedding_id, = execute(%{ SELECT last_insert_rowid() }).flatten
      execute(%{
        INSERT INTO records(key,text,embedding_id,norm,source,tags)
        VALUES(?,?,?,?,?,?)
      }, [ pre(key), value.text, embedding_id, value.norm, value.source, JSON(value.tags) ])
      execute(%{COMMIT})
    rescue
      execute(%{ROLLBACK})
      raise
    end
  end

  # @return [Boolean] whether a record exists for the prefixed key
  def key?(key)
    execute(
      %{ SELECT count(records.key) FROM records WHERE records.key = ? },
      pre(key)
    ).flatten.first == 1
  end

  # Deletes the record stored for `key`.
  #
  # @return [String, nil] the prefixed key if a record existed, else nil
  def delete(key)
    result = key?(key) ? pre(key) : nil
    execute(
      %{ DELETE FROM records WHERE records.key = ? },
      pre(key)
    )
    result
  end

  # @return [Ollama::Utils::Tags] all tags of the records under the prefix
  def tags
    result = Ollama::Utils::Tags.new
    execute(
      %{ SELECT DISTINCT(tags) FROM records WHERE key LIKE ? },
      [ "#@prefix%" ]
    ).flatten.each do |tags_json|
      JSON(tags_json).each { |t| result.add(t) }
    end
    result
  end

  # @return [Integer] number of records under the prefix
  def size
    execute(%{SELECT COUNT(*) FROM records WHERE key LIKE ?}, [ "#@prefix%" ]).flatten.first
  end

  # Deletes the records carrying at least one of the given tags; without
  # tags the whole cache is cleared. Nothing is deleted when no record
  # matches.
  #
  # @param tags [Array, String, nil] tags selecting the records to delete
  # @return [self]
  def clear_for_tags(tags = nil)
    tags = Ollama::Utils::Tags.new(tags).to_a
    if tags.present?
      keys = find_records_for_tags(tags).map(&:first)
      # Keys are bound as parameters, in batches, instead of being quoted
      # into the statement.
      keys.each_slice(DELETE_BATCH_SIZE) do |batch|
        placeholders = ([ '?' ] * batch.size).join(?,)
        execute(%{DELETE FROM records WHERE key IN (#{placeholders})}, batch)
      end
    else
      clear
    end
    self
  end

  # Deletes every record under the prefix.
  #
  # @return [self]
  def clear
    execute(%{DELETE FROM records WHERE key LIKE ?}, [ "#@prefix%" ])
    self
  end

  # Passes the (prefixed) key and the record of every row whose key matches
  # the LIKE pattern `prefix` to the block.
  #
  # @param prefix [String] LIKE pattern for the key, defaults to all keys
  #   under the cache prefix
  # @return [Enumerator] when called without a block
  def each(prefix: "#@prefix%", &block)
    block or return enum_for(:each, prefix:)
    execute(%{
      #{RECORD_SELECT}
      WHERE records.key LIKE ?
    }, [ prefix ]).each do |row|
      block.(row.first, row_to_record(row))
    end
  end
  include Enumerable

  # Iterates over every record in the database, regardless of prefix.
  def full_each(&block)
    each(prefix: ?%, &block)
  end

  # Returns the vector unchanged.
  def convert_to_vector(vector)
    vector
  end

  # Returns `[key, tags, embedding_id]` rows of the records under the prefix.
  # When tags are given only rows sharing at least one tag are returned: the
  # LIKE conditions narrow the rows in SQL, the exact comparison against the
  # decoded tag list happens in Ruby afterwards.
  #
  # @param tags [Array, String, nil] tags to filter by
  # @return [Array<Array>] the matching rows
  def find_records_for_tags(tags)
    sql   = +"SELECT key, tags, embedding_id FROM records WHERE key LIKE ?"
    binds = [ "#@prefix%" ]
    if tags.present?
      tags_filter = Ollama::Utils::Tags.new(tags).to_a
      unless tags_filter.empty?
        conditions = tags_filter.map { 'tags LIKE ?' }.join(' OR ')
        sql << " AND (#{conditions})"
        binds.concat(tags_filter.map { "%#{_1}%" })
      end
    end
    records = execute(sql, binds)
    if tags_filter
      records = records.select { |_key, tags_json, _embedding_id|
        (tags_filter & JSON(tags_json.to_s).to_a).size >= 1
      }
    end
    records
  end

  # Finds the records nearest to `needle` using the vec0 MATCH query.
  #
  # @param needle [Array<Numeric>] embedding to search for
  # @param tags [Array, String, nil] restricts the search to records
  #   carrying at least one of these tags
  # @param max_records [Integer, nil] upper bound on the result size; the
  #   effective bound is the minimum of this value, #size and 4096
  # @return [Array<Ollama::Documents::Record>] records with unprefixed keys
  # @raise [ArgumentError] when the needle length differs from
  #   #embedding_length
  def find_records(needle, tags: nil, max_records: nil)
    needle.size != @embedding_length and
      raise ArgumentError, "needle embedding length != %s" % @embedding_length
    needle_binary = needle.pack("f*")
    max_records = [ max_records, size, 4_096 ].compact.min
    rowids = find_records_for_tags(tags).map(&:last).compact
    # No candidate rows: skip the query rather than sending `IN ()`.
    rowids.empty? and return []
    rowids_where = '(%s)' % rowids.join(?,)
    execute(%{
      #{RECORD_SELECT}
      WHERE embeddings.rowid IN #{rowids_where}
      AND embeddings.embedding MATCH ? AND embeddings.k = ?
    }, [ needle_binary, max_records ]).map do |row|
      row_to_record(row, strip_prefix: true)
    end
  end

  private

  # Builds a record from a row selected with RECORD_SELECT.
  #
  # @param row [Array] key, text, norm, source, tags (JSON), embedding (blob)
  # @param strip_prefix [Boolean] whether the cache prefix is removed from
  #   the record's key
  def row_to_record(row, strip_prefix: false)
    key, text, norm, source, tags, embedding = *row
    key = unpre(key) if strip_prefix
    embedding = embedding.unpack("f*")
    tags = Ollama::Utils::Tags.new(JSON(tags.to_s).to_a, source:)
    convert_value_to_record({ key:, text:, norm:, source:, tags:, embedding: })
  end

  # Executes a statement on the database. In debug mode the de-indented
  # statement and its EXPLAIN output are written to STDERR first.
  def execute(*a)
    if @debug
      e = a[0].gsub(/^\s*\n/, '')   # drop blank lines
      e = e.gsub(/\A\s+/, '')       # drop the leading indentation ...
      n = $&.to_s.size              # ... and remember how wide it was
      e = e.gsub(/^\s{0,#{n}}/, '') # remove up to that much from each line
      e = e.chomp
      STDERR.puts("EXPLANATION:\n%s\n%s" % [
        e,
        @database.execute("EXPLAIN #{e}", *a[1..-1]).pretty_inspect
      ])
    end
    @database.execute(*a)
  end

  # Escapes a string for use inside a single-quoted SQL literal.
  def quote(string)
    SQLite3::Database.quote(string)
  end

  # Opens the database, loads the sqlite-vec extension and creates the
  # `embeddings` and `records` tables unless they already exist.
  def setup_database(filename)
    @database = SQLite3::Database.new(filename)
    @database.enable_load_extension(true)
    SqliteVec.load(@database)
    @database.enable_load_extension(false)
    execute %{
      CREATE VIRTUAL TABLE IF NOT EXISTS embeddings USING vec0(
        embedding float[#@embedding_length]
      )
    }
    execute %{
      CREATE TABLE IF NOT EXISTS records (
        key          text NOT NULL PRIMARY KEY ON CONFLICT REPLACE,
        text         text NOT NULL DEFAULT '',
        embedding_id integer,
        norm         float NOT NULL DEFAULT 0.0,
        source       text,
        tags         json NOT NULL DEFAULT [],
        FOREIGN KEY(embedding_id) REFERENCES embeddings(id) ON DELETE CASCADE
      )
    }
  end

  # Returns `value` itself when it already is a Record, otherwise a Record
  # built from `value.to_hash`.
  def convert_value_to_record(value)
    value.is_a?(Ollama::Documents::Record) and return value
    Ollama::Documents::Record[value.to_hash]
  end
end
@@ -1,72 +0,0 @@
1
- module Ollama::Documents::Splitters
2
- class Character
3
- DEFAULT_SEPARATOR = /(?:\r?\n){2,}/
4
-
5
- def initialize(separator: DEFAULT_SEPARATOR, include_separator: false, combining_string: "\n\n", chunk_size: 4096)
6
- @separator, @include_separator, @combining_string, @chunk_size =
7
- separator, include_separator, combining_string, chunk_size
8
- if include_separator
9
- @separator = Regexp.new("(#@separator)")
10
- end
11
- end
12
-
13
- def split(text)
14
- texts = []
15
- text.split(@separator) do |t|
16
- if @include_separator && t =~ @separator
17
- texts.last&.concat t
18
- else
19
- texts.push(t)
20
- end
21
- end
22
- result = []
23
- current_text = +''
24
- texts.each do |t|
25
- if current_text.size + t.size < @chunk_size
26
- current_text << t << @combining_string
27
- else
28
- current_text.empty? or result << current_text
29
- current_text = t
30
- end
31
- end
32
- current_text.empty? or result << current_text
33
- result
34
- end
35
- end
36
-
37
- class RecursiveCharacter
38
- DEFAULT_SEPARATORS = [
39
- /(?:\r?\n){2,}/,
40
- /\r?\n/,
41
- /\b/,
42
- //,
43
- ].freeze
44
-
45
- def initialize(separators: DEFAULT_SEPARATORS, include_separator: false, combining_string: "\n\n", chunk_size: 4096)
46
- separators.empty? and
47
- raise ArgumentError, "non-empty array of separators required"
48
- @separators, @include_separator, @combining_string, @chunk_size =
49
- separators, include_separator, combining_string, chunk_size
50
- end
51
-
52
- def split(text, separators: @separators)
53
- separators.empty? and return [ text ]
54
- separators = separators.dup
55
- separator = separators.shift
56
- texts = Character.new(
57
- separator:,
58
- include_separator: @include_separator,
59
- combining_string: @combining_string,
60
- chunk_size: @chunk_size
61
- ).split(text)
62
- texts.count == 0 and return [ text ]
63
- texts.inject([]) do |r, t|
64
- if t.size > @chunk_size
65
- r.concat(split(t, separators:))
66
- else
67
- r.concat([ t ])
68
- end
69
- end
70
- end
71
- end
72
- end
@@ -1,91 +0,0 @@
1
- module Ollama::Documents::Splitters
2
- class Semantic
3
- include Ollama::Utils::Math
4
-
5
- DEFAULT_SEPARATOR = /[.!?]\s*(?:\b|\z)/
6
-
7
- def initialize(ollama:, model:, model_options: nil, separator: DEFAULT_SEPARATOR, chunk_size: 4096)
8
- @ollama, @model, @model_options, @separator, @chunk_size =
9
- ollama, model, model_options, separator, chunk_size
10
- end
11
-
12
- def split(text, batch_size: 100, breakpoint: :percentile, **opts)
13
- sentences = Ollama::Documents::Splitters::Character.new(
14
- separator: @separator,
15
- include_separator: opts.fetch(:include_separator, true),
16
- chunk_size: 1,
17
- ).split(text)
18
- embeddings = sentences.with_infobar(label: 'Split').each_slice(batch_size).inject([]) do |e, batch|
19
- e.concat sentence_embeddings(batch)
20
- infobar.progress by: batch.size
21
- e
22
- end
23
- infobar.newline
24
- embeddings.size < 2 and return sentences
25
- distances = embeddings.each_cons(2).map do |a, b|
26
- 1.0 - cosine_similarity(a:, b:)
27
- end
28
- max_distance = calculate_breakpoint_threshold(breakpoint, distances, **opts)
29
- gaps = distances.each_with_index.select do |d, i|
30
- d > max_distance
31
- end.transpose.last
32
- gaps or return sentences
33
- if gaps.last < distances.size
34
- gaps << distances.size
35
- end
36
- if gaps.last < sentences.size - 1
37
- gaps << sentences.size - 1
38
- end
39
- result = []
40
- sg = 0
41
- current_text = +''
42
- gaps.each do |g|
43
- sg.upto(g) do |i|
44
- sentence = sentences[i]
45
- if current_text.size + sentence.size < @chunk_size
46
- current_text += sentence
47
- else
48
- current_text.empty? or result << current_text
49
- current_text = sentence
50
- end
51
- end
52
- unless current_text.empty?
53
- result << current_text
54
- current_text = +''
55
- end
56
- sg = g.succ
57
- end
58
- current_text.empty? or result << current_text
59
- result
60
- end
61
-
62
- private
63
-
64
- def calculate_breakpoint_threshold(breakpoint_method, distances, **opts)
65
- sequence = MoreMath::Sequence.new(distances)
66
- case breakpoint_method
67
- when :percentile
68
- percentile = opts.fetch(:percentile, 95)
69
- sequence.percentile(percentile)
70
- when :standard_deviation
71
- percentage = opts.fetch(:percentage, 100)
72
- (
73
- sequence.mean + sequence.standard_deviation * (percentage / 100.0)
74
- ).clamp(0, sequence.max)
75
- when :interquartile
76
- percentage = opts.fetch(:percentage, 100)
77
- iqr = sequence.interquartile_range
78
- max = sequence.max
79
- (sequence.mean + iqr * (percentage / 100.0)).clamp(0, max)
80
- else
81
- raise ArgumentError, "invalid breakpoint method #{breakpoint_method}"
82
- end
83
- end
84
-
85
- def sentence_embeddings(input)
86
- @ollama.embed(model: @model, input:, options: @model_options).embeddings.map! {
87
- Numo::NArray[*_1]
88
- }
89
- end
90
- end
91
- end