ollama-ruby 0.12.1 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +1 -0
- data/CHANGES.md +39 -0
- data/README.md +70 -144
- data/Rakefile +5 -17
- data/bin/ollama_cli +37 -6
- data/lib/ollama/client/command.rb +2 -2
- data/lib/ollama/dto.rb +4 -0
- data/lib/ollama/version.rb +1 -1
- data/lib/ollama.rb +0 -11
- data/ollama-ruby.gemspec +11 -22
- data/spec/ollama/message_spec.rb +9 -0
- metadata +25 -255
- data/bin/ollama_chat +0 -1248
- data/config/redis.conf +0 -5
- data/docker-compose.yml +0 -10
- data/lib/ollama/documents/cache/common.rb +0 -36
- data/lib/ollama/documents/cache/memory_cache.rb +0 -44
- data/lib/ollama/documents/cache/records.rb +0 -87
- data/lib/ollama/documents/cache/redis_backed_memory_cache.rb +0 -39
- data/lib/ollama/documents/cache/redis_cache.rb +0 -68
- data/lib/ollama/documents/cache/sqlite_cache.rb +0 -215
- data/lib/ollama/documents/splitters/character.rb +0 -72
- data/lib/ollama/documents/splitters/semantic.rb +0 -91
- data/lib/ollama/documents.rb +0 -184
- data/lib/ollama/utils/cache_fetcher.rb +0 -38
- data/lib/ollama/utils/chooser.rb +0 -52
- data/lib/ollama/utils/colorize_texts.rb +0 -65
- data/lib/ollama/utils/fetcher.rb +0 -175
- data/lib/ollama/utils/file_argument.rb +0 -34
- data/lib/ollama/utils/math.rb +0 -48
- data/lib/ollama/utils/tags.rb +0 -67
- data/spec/assets/embeddings.json +0 -1
- data/spec/assets/prompt.txt +0 -1
- data/spec/ollama/documents/cache/memory_cache_spec.rb +0 -97
- data/spec/ollama/documents/cache/redis_backed_memory_cache_spec.rb +0 -118
- data/spec/ollama/documents/cache/redis_cache_spec.rb +0 -121
- data/spec/ollama/documents/cache/sqlite_cache_spec.rb +0 -141
- data/spec/ollama/documents/splitters/character_spec.rb +0 -110
- data/spec/ollama/documents/splitters/semantic_spec.rb +0 -56
- data/spec/ollama/documents_spec.rb +0 -162
- data/spec/ollama/utils/cache_fetcher_spec.rb +0 -43
- data/spec/ollama/utils/colorize_texts_spec.rb +0 -13
- data/spec/ollama/utils/fetcher_spec.rb +0 -137
- data/spec/ollama/utils/file_argument_spec.rb +0 -17
- data/spec/ollama/utils/tags_spec.rb +0 -53
data/config/redis.conf
DELETED
data/docker-compose.yml
DELETED
@@ -1,36 +0,0 @@
|
|
1
|
-
# Behaviour shared by the document caches: each cache owns a key prefix and
# translates between bare keys and prefixed storage keys.
#
# Classes including this module have to provide `full_each`, which
# `collections` calls with a block that receives `(key, value)` pairs.
module Ollama::Documents::Cache::Common
  include Ollama::Utils::Math

  # @param prefix [String] the prefix prepended to every key of this cache
  def initialize(prefix:)
    self.prefix = prefix
  end

  attr_accessor :prefix # current prefix defined for the cache

  # Returns an array of collection names that match the given prefix.
  #
  # A key contributes a name when it starts with `prefix`; the name is the
  # text between the prefix and the last `-` in the key. Keys that do not
  # match are skipped.
  #
  # @param prefix [String, Regexp] a string to search for in collection names;
  #   a String is matched literally, a Regexp is used as given
  # @return [Array<Symbol>] an array of matching collection names, without
  #   duplicates, in order of first appearance
  def collections(prefix)
    # Escape literal prefixes so characters like "." or "(" cannot change the
    # meaning of the pattern.
    prefix_pattern = prefix.is_a?(Regexp) ? prefix : Regexp.escape("#{prefix}")
    pattern        = /\A#{prefix_pattern}(.*)-/
    names          = []
    full_each do |key, _|
      name = key[pattern, 1]
      names << name.to_sym if name # non-matching keys yield nil
    end
    names.uniq
  end

  # Returns a string representing the given `key` prefixed with the defined
  # prefix.
  #
  # @param key [String] the key to join with the prefix
  # @return [String] the joined string of prefix and key
  def pre(key)
    [ @prefix, key ].join
  end

  # Returns a string with the prefix removed from the given `key`.
  #
  # The prefix is removed literally (it is not interpreted as a regular
  # expression), which makes this the inverse of `pre`.
  #
  # @param key [String] the input string containing the prefix.
  # @return [String] the input string without the prefix.
  def unpre(key)
    key.delete_prefix("#@prefix")
  end
end
|
@@ -1,44 +0,0 @@
|
|
1
|
-
require 'ollama/documents/cache/common'
|
2
|
-
|
3
|
-
# Cache that keeps its entries in a Hash held by the instance. Keys are stored
# with the cache prefix (see `pre`); iteration via `each` only visits entries
# whose stored key starts with that prefix.
class Ollama::Documents::MemoryCache
  include Ollama::Documents::Cache::Common
  include Enumerable

  # @param prefix [String] prefix for all keys stored by this cache
  def initialize(prefix:)
    super(prefix:)
    @data = {}
  end

  # @param key [String] bare key
  # @return [Object, nil] the stored value, nil if absent
  def [](key)
    full_key = pre(key)
    @data[full_key]
  end

  # Stores `value` under the prefixed `key`.
  def []=(key, value)
    full_key = pre(key)
    @data[full_key] = value
  end

  # @return [Boolean] whether the prefixed key is stored
  def key?(key)
    full_key = pre(key)
    @data.key?(full_key)
  end

  # Removes the prefixed key.
  #
  # @return [Object, nil] the removed value, nil if it was absent
  def delete(key)
    full_key = pre(key)
    @data.delete(full_key)
  end

  # @return [Integer] number of entries belonging to the current prefix
  def size
    count
  end

  # Drops every entry whose stored key starts with the current prefix.
  #
  # @return [self]
  def clear
    @data.reject! { |stored_key, _| stored_key.start_with?(@prefix) }
    self
  end

  # Iterates over the `(stored_key, value)` pairs of the current prefix. The
  # pairs are taken from a filtered copy of the underlying Hash.
  def each(&block)
    own_entries = @data.select { |stored_key, _| stored_key.start_with?(@prefix) }
    own_entries.each(&block)
  end

  # Iterates over all `(stored_key, value)` pairs, regardless of prefix.
  def full_each(&block)
    @data.each(&block)
  end
end
|
@@ -1,87 +0,0 @@
|
|
1
|
-
module Ollama::Documents::Cache::Records
|
2
|
-
# A cached document record. Besides the attributes given on construction it
# always has a `text` (default `''`) and a `norm` (default `0.0`).
class Record < JSON::GenericObject
  def initialize(*a)
    super
    self.text ||= ''
    self.norm ||= 0.0
  end

  # @return [String] a short description made of the class name, the
  #   inspected text, the tags (if any) and the similarity or `n/a`
  def to_s
    my_tags = tags_set
    my_tags = " #{my_tags}" unless my_tags.empty?
    "#<#{self.class} #{text.inspect}#{my_tags} #{similarity || 'n/a'}>"
  end

  # @return [Ollama::Utils::Tags] the record's tags wrapped in a Tags object
  #   that is also given the record's source
  def tags_set
    Ollama::Utils::Tags.new(tags, source:)
  end

  # Two records are equal when their texts are equal. Objects that do not
  # respond to `text` (nil included) are never equal to a record.
  #
  # @param other [Object]
  # @return [Boolean]
  def ==(other)
    other.respond_to?(:text) && text == other.text
  end

  alias inspect to_s
end
|
25
|
-
|
26
|
-
# Adds `full_each` to objects exposing a `redis` client.
module RedisFullEach
  # Scans redis for keys matching `Ollama::Documents-*` and calls the block
  # with each key and its value parsed from JSON into an
  # `Ollama::Documents::Record`. Keys for which redis returns no value are
  # skipped.
  def full_each(&block)
    pattern = [ Ollama::Documents, ?* ] * ?-
    redis.scan_each(match: pattern) do |key|
      raw = redis.get(key)
      next unless raw
      record = JSON(raw, object_class: Ollama::Documents::Record)
      block.(key, record)
    end
  end
end
|
35
|
-
|
36
|
-
# Similarity search for caches that are Enumerable over `(key, record)` pairs
# and provide `norm` and `cosine_similarity`.
module FindRecords
  # Returns the records ordered by descending cosine similarity to `needle`.
  # As a side effect each visited record gets its `key` and `similarity`
  # assigned.
  #
  # @param needle [Array<Numeric>] the embedding to compare against
  # @param tags [Object, nil] when present, only records sharing at least one
  #   of these tags are considered
  # @param max_records [Integer, nil] upper bound for the number of returned
  #   records, nil returns all of them
  # @return [Array] the matching records, most similar first
  def find_records(needle, tags: nil, max_records: nil)
    tags = Ollama::Utils::Tags.new(Array(tags)).to_a
    records = self
    if tags.present?
      # to_a tolerates records without tags, as Tags#clear does
      records = records.select { |_key, record| (tags & record.tags.to_a).size >= 1 }
    end
    needle_norm = norm(needle)
    records = records.sort_by { |key, record|
      record.key = key
      record.similarity = cosine_similarity(
        a: needle,
        b: record.embedding,
        a_norm: needle_norm,
        b_norm: record.norm,
      )
    }
    # sort_by orders ascending, the best matches are at the end
    found = records.reverse.map { |_key, record| record }
    max_records ? found.first(max_records) : found
  end
end
|
56
|
-
|
57
|
-
# Tag aware `clear` and `tags` for caches. `clear` calls `super()`, so the
# module has to sit in front of a `clear` implementation in the ancestor
# chain.
module Tags
  # Clears the cache. Without tags the original `clear` is called. With tags
  # only records sharing at least one tag are removed: via `clear_for_tags`
  # if the cache responds to it, otherwise by iterating over all records.
  #
  # @param tags [Object, nil] tags selecting the records to remove
  # @return [self] in the tag branches, otherwise whatever `super()` returns
  def clear(tags: nil)
    tags = Ollama::Utils::Tags.new(tags).to_a
    tags.present? or return super()
    if respond_to?(:clear_for_tags)
      clear_for_tags(tags)
    else
      each do |key, record|
        if (tags & record.tags.to_a).size >= 1
          delete(unpre(key))
        end
      end
    end
    self
  end

  # Returns the tags of the cache. If an ancestor implements `tags` that
  # implementation is used, otherwise the tags of all records are collected.
  #
  # @return [Ollama::Utils::Tags]
  def tags
    defined?(super) and return super
    each_with_object(Ollama::Utils::Tags.new) do |(_, record), t|
      # to_a tolerates records without tags
      record.tags.to_a.each do |tag|
        t.add(tag, source: record.source)
      end
    end
  end
end
|
87
|
-
end
|
@@ -1,39 +0,0 @@
|
|
1
|
-
require 'redis'
|
2
|
-
|
3
|
-
class Ollama::Documents
|
4
|
-
# MemoryCache that mirrors its writes to redis. On construction the in-memory
# data is filled with everything `RedisFullEach#full_each` yields.
class RedisBackedMemoryCache < MemoryCache
  # @param prefix [String] prefix for all keys of this cache
  # @param url [String] redis url, defaults to ENV['REDIS_URL']
  # @param object_class [Class, nil] passed on to the wrapped RedisCache
  # @raise [ArgumentError] if no redis url is available
  def initialize(prefix:, url: ENV['REDIS_URL'], object_class: nil)
    super(prefix:)
    raise ArgumentError, 'require redis url' unless url
    @url          = url
    @object_class = object_class
    @redis_cache  = Ollama::Documents::RedisCache.new(prefix:, url:, object_class:)
    @redis_cache.extend(Ollama::Documents::Cache::Records::RedisFullEach)
    @redis_cache.full_each { |stored_key, value| @data[stored_key] = value }
  end

  attr_reader :object_class

  # @return the redis client of the wrapped RedisCache
  def redis
    @redis_cache.redis
  end

  # Stores the value in memory and, serialized as JSON, in redis.
  def []=(key, value)
    super(key, value)
    redis.set(pre(key), JSON(value))
  end

  # Deletes the key from redis and from memory.
  #
  # @return the result of the redis `del` call
  def delete(key)
    removed = redis.del(pre(key))
    super(key)
    removed
  end

  # Deletes all redis keys matching the current prefix, then clears the
  # in-memory entries.
  #
  # @return [self]
  def clear
    redis.scan_each(match: "#@prefix*") do |stored_key|
      redis.del(stored_key)
    end
    super()
    self
  end
end
|
39
|
-
end
|
@@ -1,68 +0,0 @@
|
|
1
|
-
require 'ollama/documents/cache/common'
|
2
|
-
require 'redis'
|
3
|
-
|
4
|
-
# Cache storing JSON serialized values in redis under prefixed keys.
class Ollama::Documents::RedisCache
  include Ollama::Documents::Cache::Common
  include Enumerable

  # @param prefix [String] prefix for all keys of this cache
  # @param url [String] redis url, defaults to ENV['REDIS_URL']
  # @param object_class [Class, nil] class used for JSON objects when reading
  # @param ex [Integer, nil] default expiry passed to redis on `set`
  # @raise [ArgumentError] if no redis url is available
  def initialize(prefix:, url: ENV['REDIS_URL'], object_class: nil, ex: nil)
    super(prefix:)
    raise ArgumentError, 'require redis url' unless url
    @url          = url
    @object_class = object_class
    @ex           = ex
  end

  attr_reader :object_class

  # @return [Redis] the lazily created redis client
  def redis
    @redis ||= Redis.new(url: @url)
  end

  # @return the value parsed from JSON, nil if the key is not stored
  def [](key)
    raw = redis.get(pre(key))
    return if raw.nil?
    if object_class
      JSON(raw, object_class:)
    else
      JSON(raw)
    end
  end

  # Stores `value` using the default expiry.
  def []=(key, value)
    set(key, value)
  end

  # Stores `value` as JSON. An expiry below 1 deletes the key instead.
  #
  # @param ex [Integer, nil] expiry for this entry, falls back to the default
  # @return the given value
  def set(key, value, ex: nil)
    ex ||= @ex
    expired = !ex.nil? && ex < 1
    if expired
      redis.del(pre(key))
    else
      redis.set(pre(key), JSON.generate(value), ex:)
    end
    value
  end

  # @return the result of redis `ttl` for the prefixed key
  def ttl(key)
    redis.ttl(pre(key))
  end

  # @return [Boolean] whether the prefixed key exists in redis
  def key?(key)
    !!redis.exists?(pre(key))
  end

  # @return [Boolean] true if redis reports exactly one deleted key
  def delete(key)
    deleted = redis.del(pre(key))
    deleted == 1
  end

  # @return [Integer] number of redis keys matching the current prefix
  def size
    total = 0
    redis.scan_each(match: "#@prefix*") { |_stored_key| total += 1 }
    total
  end

  # Deletes all redis keys matching the current prefix.
  #
  # @return [self]
  def clear
    redis.scan_each(match: "#@prefix*") do |stored_key|
      redis.del(stored_key)
    end
    self
  end

  # Calls the block with the stored key and the parsed value of every redis
  # key matching the current prefix.
  #
  # @return [self]
  def each(&block)
    redis.scan_each(match: "#@prefix*") do |stored_key|
      block.(stored_key, self[unpre(stored_key)])
    end
    self
  end
end
|
@@ -1,215 +0,0 @@
|
|
1
|
-
require 'ollama/documents/cache/common'
|
2
|
-
require 'sqlite3'
|
3
|
-
require 'sqlite_vec'
|
4
|
-
require 'digest/md5'
|
5
|
-
|
6
|
-
# Cache storing records in a SQLite database. Record attributes live in the
# `records` table, the embedding vectors in the `embeddings` virtual table
# (vec0, loaded via SqliteVec); `records.embedding_id` points to the rowid of
# the embedding.
class Ollama::Documents::Cache::SQLiteCache
  include Ollama::Documents::Cache::Common
  include Enumerable

  # @param prefix [String] prefix for all keys of this cache
  # @param embedding_length [Integer] length of the embeddings vector
  # @param filename [String] database file, `:memory:` is in memory
  # @param debug [Boolean] if true every statement is explained on STDERR
  def initialize(prefix:, embedding_length: 1_024, filename: ':memory:', debug: false)
    super(prefix:)
    @embedding_length = embedding_length
    @filename         = filename
    @debug            = debug
    setup_database(filename)
  end

  attr_reader :filename # filename for the database, `:memory:` is in memory

  attr_reader :embedding_length # length of the embeddings vector

  # @param key [String] bare key
  # @return the record stored under the prefixed key, nil if there is none
  def [](key)
    result = execute(
      %{
        SELECT records.key, records.text, records.norm, records.source,
          records.tags, embeddings.embedding
        FROM records
        INNER JOIN embeddings ON records.embedding_id = embeddings.rowid
        WHERE records.key = ?
      },
      [ pre(key) ]
    )&.first or return
    key, text, norm, source, tags, embedding = *result
    embedding = embedding.unpack("f*")
    tags      = Ollama::Utils::Tags.new(JSON(tags.to_s).to_a, source:)
    convert_value_to_record(key:, text:, norm:, source:, tags:, embedding:)
  end

  # Stores `value` under the prefixed key. Embedding and record are inserted
  # in one transaction which is rolled back if one of the statements raises.
  def []=(key, value)
    value     = convert_value_to_record(value)
    embedding = value.embedding.pack("f*")
    @database.transaction do
      execute(%{INSERT INTO embeddings(embedding) VALUES(?)}, [ embedding ])
      embedding_id, = execute(%{ SELECT last_insert_rowid() }).flatten
      execute(%{
        INSERT INTO records(key,text,embedding_id,norm,source,tags)
        VALUES(?,?,?,?,?,?)
      }, [ pre(key), value.text, embedding_id, value.norm, value.source, JSON(value.tags) ])
    end
  end

  # @return [Boolean] whether a record exists for the prefixed key
  def key?(key)
    execute(
      %{ SELECT count(records.key) FROM records WHERE records.key = ? },
      [ pre(key) ]
    ).flatten.first == 1
  end

  # Deletes the record stored under the prefixed key.
  # NOTE(review): the row in `embeddings` is not removed here - confirm
  # whether that is intended.
  #
  # @return [String, nil] the prefixed key if a record existed, nil otherwise
  def delete(key)
    result = key?(key) ? pre(key) : nil
    execute(
      %{ DELETE FROM records WHERE records.key = ? },
      [ pre(key) ]
    )
    result
  end

  # @return [Ollama::Utils::Tags] the tags of all records of the current
  #   prefix
  def tags
    result = Ollama::Utils::Tags.new
    execute(%{
      SELECT DISTINCT(tags) FROM records WHERE key LIKE ?
    }, [ "#@prefix%" ]).flatten.each do |tags_json|
      JSON(tags_json).each { |t| result.add(t) }
    end
    result
  end

  # @return [Integer] number of records of the current prefix
  def size
    execute(%{SELECT COUNT(*) FROM records WHERE key LIKE ?}, [ "#@prefix%" ]).flatten.first
  end

  # Deletes the records of the current prefix that carry at least one of the
  # given tags; without tags all records of the prefix are deleted.
  #
  # @return [self]
  def clear_for_tags(tags = nil)
    tags = Ollama::Utils::Tags.new(tags).to_a
    if tags.present?
      keys = find_records_for_tags(tags).map(&:first)
      # Bound parameters instead of quoted literals; sliced to stay well below
      # SQLite's limit for the number of bind variables.
      keys.each_slice(500) do |slice|
        placeholders = ([ '?' ] * slice.size).join(?,)
        execute(%{DELETE FROM records WHERE key IN (#{placeholders})}, slice)
      end
    else
      clear
    end
    self
  end

  # Deletes all records of the current prefix.
  #
  # @return [self]
  def clear
    execute(%{DELETE FROM records WHERE key LIKE ?}, [ "#@prefix%" ])
    self
  end

  # Calls the block with the stored key and the record of every row whose key
  # matches the LIKE pattern `prefix`.
  #
  # @param prefix [String] LIKE pattern, defaults to the current prefix
  #   followed by `%`
  def each(prefix: "#@prefix%", &block)
    execute(%{
      SELECT records.key, records.text, records.norm, records.source,
        records.tags, embeddings.embedding
      FROM records
      INNER JOIN embeddings ON records.embedding_id = embeddings.rowid
      WHERE records.key LIKE ?
    }, [ prefix ]).each do |key, text, norm, source, tags, embedding|
      embedding = embedding.unpack("f*")
      tags      = Ollama::Utils::Tags.new(JSON(tags.to_s).to_a, source:)
      value     = convert_value_to_record(key:, text:, norm:, source:, tags:, embedding:)
      block.(key, value)
    end
  end

  # Like `each`, but for all rows regardless of prefix.
  def full_each(&block)
    each(prefix: ?%, &block)
  end

  # @return the given vector, unchanged
  def convert_to_vector(vector)
    vector
  end

  # Returns the `[key, tags, embedding_id]` rows of the current prefix. With
  # tags the rows are narrowed down by LIKE in SQL first and afterwards
  # filtered exactly on the parsed tags.
  #
  # @param tags [Object, nil] tags to filter for
  # @return [Array<Array>] the matching rows
  def find_records_for_tags(tags)
    tags_filter = nil
    tags_where  = ''
    binds       = [ "#@prefix%" ]
    if tags.present?
      tags_filter = Ollama::Utils::Tags.new(tags).to_a
      unless tags_filter.empty?
        # One bound LIKE pattern per tag, no tag text ends up in the SQL.
        tags_where = ' AND (%s)' % tags_filter.map { 'tags LIKE ?' }.join(' OR ')
        binds.concat(tags_filter.map { "%#{_1}%" })
      end
    end
    records = execute(%{
      SELECT key, tags, embedding_id
      FROM records
      WHERE key LIKE ?#{tags_where}
    }, binds)
    if tags_filter
      records = records.select { |_key, tags_json, _embedding_id|
        (tags_filter & JSON(tags_json.to_s).to_a).size >= 1
      }
    end
    records
  end

  # Searches the records of the current prefix (optionally restricted by
  # tags) for the embeddings matching `needle`.
  #
  # @param needle [Array<Numeric>] embedding of length `embedding_length`
  # @param tags [Object, nil] restrict the search to records with these tags
  # @param max_records [Integer, nil] upper bound for the result size, it is
  #   additionally capped by `size` and 4096
  # @return [Array] the found records with their bare keys
  # @raise [ArgumentError] if the needle has the wrong length
  def find_records(needle, tags: nil, max_records: nil)
    needle.size != @embedding_length and
      raise ArgumentError, "needle embedding length != %s" % @embedding_length
    needle_binary = needle.pack("f*")
    max_records   = [ max_records, size, 4_096 ].compact.min
    records       = find_records_for_tags(tags)
    records.empty? and return [] # nothing to search, skip the vector query
    rowids_where  = '(%s)' % records.map(&:last).join(?,)
    execute(%{
      SELECT records.key, records.text, records.norm, records.source,
        records.tags, embeddings.embedding
      FROM records
      INNER JOIN embeddings ON records.embedding_id = embeddings.rowid
      WHERE embeddings.rowid IN #{rowids_where}
        AND embeddings.embedding MATCH ? AND embeddings.k = ?
    }, [ needle_binary, max_records ]).map do |key, text, norm, source, tags, embedding|
      key       = unpre(key)
      embedding = embedding.unpack("f*")
      tags      = Ollama::Utils::Tags.new(JSON(tags.to_s).to_a, source:)
      convert_value_to_record(key:, text:, norm:, source:, tags:, embedding:)
    end
  end

  private

  # Executes a statement on the database. In debug mode the statement is
  # de-indented and printed to STDERR together with its EXPLAIN output first.
  def execute(*a)
    if @debug
      e = a[0].gsub(/^\s*\n/, '')
      n = e[/\A\s+/].to_s.size # width of the leading whitespace
      e = e.sub(/\A\s+/, '')
      e = e.gsub(/^\s{0,#{n}}/, '')
      e = e.chomp
      STDERR.puts("EXPLANATION:\n%s\n%s" % [
        e,
        @database.execute("EXPLAIN #{e}", *a[1..-1]).pretty_inspect
      ])
    end
    @database.execute(*a)
  end

  # @return [String] the string quoted via SQLite3::Database.quote
  def quote(string)
    SQLite3::Database.quote(string)
  end

  # Opens the database, loads the sqlite-vec extension and creates the
  # `embeddings` and `records` tables if they do not exist yet.
  def setup_database(filename)
    @database = SQLite3::Database.new(filename)
    @database.enable_load_extension(true)
    SqliteVec.load(@database)
    @database.enable_load_extension(false)
    execute %{
      CREATE VIRTUAL TABLE IF NOT EXISTS embeddings USING vec0(
        embedding float[#@embedding_length]
      )
    }
    execute %{
      CREATE TABLE IF NOT EXISTS records (
        key          text NOT NULL PRIMARY KEY ON CONFLICT REPLACE,
        text         text NOT NULL DEFAULT '',
        embedding_id integer,
        norm         float NOT NULL DEFAULT 0.0,
        source       text,
        tags         json NOT NULL DEFAULT '[]',
        FOREIGN KEY(embedding_id) REFERENCES embeddings(id) ON DELETE CASCADE
      )
    }
  end

  # @return [Ollama::Documents::Record] `value` itself if it already is a
  #   record, otherwise a record built from `value.to_hash`
  def convert_value_to_record(value)
    value.is_a?(Ollama::Documents::Record) and return value
    Ollama::Documents::Record[value.to_hash]
  end
end
|
@@ -1,72 +0,0 @@
|
|
1
|
-
module Ollama::Documents::Splitters
|
2
|
-
# Splits a text at a separator and recombines the pieces into chunks.
class Character
  DEFAULT_SEPARATOR = /(?:\r?\n){2,}/

  # @param separator [Regexp] where to split the text
  # @param include_separator [Boolean] if true the separator is wrapped into
  #   a capture group, so the matched separators stay attached to the piece
  #   in front of them
  # @param combining_string [String] appended to every piece that is added to
  #   a chunk
  # @param chunk_size [Integer] a piece is added to the current chunk while
  #   the size of chunk plus piece stays below this value
  def initialize(separator: DEFAULT_SEPARATOR, include_separator: false, combining_string: "\n\n", chunk_size: 4096)
    @separator, @include_separator, @combining_string, @chunk_size =
      separator, include_separator, combining_string, chunk_size
    if include_separator
      @separator = Regexp.new("(#@separator)")
    end
  end

  # @param text [String] the text to split
  # @return [Array<String>] the chunks
  def split(text)
    texts = []
    text.split(@separator) do |t|
      if @include_separator && t =~ @separator
        # a captured separator, attach it to the piece in front of it
        texts.last&.concat t
      else
        texts.push(t)
      end
    end
    result = []
    current_text = +''
    texts.each do |t|
      if current_text.size + t.size < @chunk_size
        # A chunk started in the else branch below has no combining string
        # yet; add it so its first two pieces are not fused together.
        unless current_text.empty? || current_text.end_with?(@combining_string)
          current_text << @combining_string
        end
        current_text << t << @combining_string
      else
        current_text.empty? or result << current_text
        current_text = t.dup
      end
    end
    current_text.empty? or result << current_text
    result
  end
end
|
36
|
-
|
37
|
-
# Splits a text with a list of separators: chunks that are still larger than
# the chunk size after splitting with one separator are split again with the
# remaining separators.
class RecursiveCharacter
  DEFAULT_SEPARATORS = [
    /(?:\r?\n){2,}/,
    /\r?\n/,
    /\b/,
    //,
  ].freeze

  # @param separators [Array<Regexp>] separators in the order they are tried
  # @param include_separator [Boolean] passed on to Character
  # @param combining_string [String] passed on to Character
  # @param chunk_size [Integer] passed on to Character, chunks larger than
  #   this are split further
  # @raise [ArgumentError] if `separators` is empty
  def initialize(separators: DEFAULT_SEPARATORS, include_separator: false, combining_string: "\n\n", chunk_size: 4096)
    if separators.empty?
      raise ArgumentError, "non-empty array of separators required"
    end
    @separators        = separators
    @include_separator = include_separator
    @combining_string  = combining_string
    @chunk_size        = chunk_size
  end

  # @param text [String] the text to split
  # @param separators [Array<Regexp>] separators still available for this
  #   text
  # @return [Array<String>] the chunks
  def split(text, separators: @separators)
    return [ text ] if separators.empty?
    separator, *remaining = separators
    splitter = Character.new(
      separator:,
      include_separator: @include_separator,
      combining_string: @combining_string,
      chunk_size: @chunk_size
    )
    pieces = splitter.split(text)
    return [ text ] if pieces.count == 0
    pieces.flat_map do |piece|
      if piece.size > @chunk_size
        split(piece, separators: remaining)
      else
        [ piece ]
      end
    end
  end
end
|
72
|
-
end
|
@@ -1,91 +0,0 @@
|
|
1
|
-
module Ollama::Documents::Splitters
|
2
|
-
# Splits a text into sentences and groups them into chunks at the positions
# where the cosine distance between the embeddings of neighbouring sentences
# exceeds a threshold.
class Semantic
  include Ollama::Utils::Math

  DEFAULT_SEPARATOR = /[.!?]\s*(?:\b|\z)/

  # @param ollama the client used to compute embeddings via `embed`
  # @param model the model passed to `embed`
  # @param model_options options passed to `embed`
  # @param separator [Regexp] separator used to split the text into sentences
  # @param chunk_size [Integer] sentences are added to a chunk while the size
  #   of chunk plus sentence stays below this value
  def initialize(ollama:, model:, model_options: nil, separator: DEFAULT_SEPARATOR, chunk_size: 4096)
    @ollama        = ollama
    @model         = model
    @model_options = model_options
    @separator     = separator
    @chunk_size    = chunk_size
  end

  # @param text [String] the text to split
  # @param batch_size [Integer] number of sentences embedded per request
  # @param breakpoint [Symbol] threshold method, one of `:percentile`,
  #   `:standard_deviation` or `:interquartile`
  # @param opts further options: `:include_separator` (default true) for the
  #   sentence splitter, `:percentile` / `:percentage` for the threshold
  # @return [Array<String>] the chunks; the plain sentences if there are
  #   fewer than two embeddings or no distance exceeds the threshold
  def split(text, batch_size: 100, breakpoint: :percentile, **opts)
    sentences = Ollama::Documents::Splitters::Character.new(
      separator: @separator,
      include_separator: opts.fetch(:include_separator, true),
      chunk_size: 1,
    ).split(text)
    batches    = sentences.with_infobar(label: 'Split').each_slice(batch_size)
    embeddings = batches.each_with_object([]) do |batch, collected|
      collected.concat sentence_embeddings(batch)
      infobar.progress by: batch.size
    end
    infobar.newline
    return sentences if embeddings.size < 2
    distances = embeddings.each_cons(2).map { |a, b| 1.0 - cosine_similarity(a:, b:) }
    max_distance = calculate_breakpoint_threshold(breakpoint, distances, **opts)
    # indexes of the distances above the threshold
    gaps = distances.each_index.select { |i| distances[i] > max_distance }
    return sentences if gaps.empty?
    gaps << distances.size if gaps.last < distances.size
    gaps << sentences.size - 1 if gaps.last < sentences.size - 1
    result = []
    start  = 0
    chunk  = +''
    gaps.each do |gap|
      (start..gap).each do |i|
        sentence = sentences[i]
        if chunk.size + sentence.size < @chunk_size
          chunk = chunk + sentence
        else
          result << chunk unless chunk.empty?
          chunk = sentence
        end
      end
      # a gap always closes the current chunk
      unless chunk.empty?
        result << chunk
        chunk = +''
      end
      start = gap.succ
    end
    result << chunk unless chunk.empty?
    result
  end

  private

  # Computes the distance threshold from `distances` using the given method.
  #
  # @raise [ArgumentError] for an unknown breakpoint method
  def calculate_breakpoint_threshold(breakpoint_method, distances, **opts)
    sequence = MoreMath::Sequence.new(distances)
    case breakpoint_method
    when :percentile
      sequence.percentile(opts.fetch(:percentile, 95))
    when :standard_deviation
      factor = opts.fetch(:percentage, 100) / 100.0
      (sequence.mean + sequence.standard_deviation * factor).clamp(0, sequence.max)
    when :interquartile
      factor = opts.fetch(:percentage, 100) / 100.0
      spread = sequence.interquartile_range
      upper  = sequence.max
      (sequence.mean + spread * factor).clamp(0, upper)
    else
      raise ArgumentError, "invalid breakpoint method #{breakpoint_method}"
    end
  end

  # @return [Array] one Numo::NArray per input, built from the embeddings
  #   returned by the client
  def sentence_embeddings(input)
    response = @ollama.embed(model: @model, input:, options: @model_options)
    response.embeddings.map! { |vector| Numo::NArray[*vector] }
  end
end
|
91
|
-
end
|