documentrix 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +1 -0
- data/Gemfile +5 -0
- data/README.md +71 -0
- data/Rakefile +48 -0
- data/documentrix.gemspec +41 -0
- data/lib/documentrix/documents/cache/common.rb +43 -0
- data/lib/documentrix/documents/cache/memory_cache.rb +91 -0
- data/lib/documentrix/documents/cache/records.rb +145 -0
- data/lib/documentrix/documents/cache/redis_backed_memory_cache.rb +64 -0
- data/lib/documentrix/documents/cache/redis_cache.rb +128 -0
- data/lib/documentrix/documents/cache/sqlite_cache.rb +335 -0
- data/lib/documentrix/documents/splitters/character.rb +72 -0
- data/lib/documentrix/documents/splitters/semantic.rb +91 -0
- data/lib/documentrix/documents.rb +328 -0
- data/lib/documentrix/utils/colorize_texts.rb +65 -0
- data/lib/documentrix/utils/math.rb +48 -0
- data/lib/documentrix/utils/tags.rb +112 -0
- data/lib/documentrix/utils.rb +5 -0
- data/lib/documentrix/version.rb +8 -0
- data/lib/documentrix.rb +11 -0
- data/spec/assets/embeddings.json +1 -0
- data/spec/documentrix/documents/cache/memory_cache_spec.rb +98 -0
- data/spec/documentrix/documents/cache/redis_backed_memory_cache_spec.rb +121 -0
- data/spec/documentrix/documents/cache/redis_cache_spec.rb +123 -0
- data/spec/documentrix/documents/cache/sqlite_cache_spec.rb +141 -0
- data/spec/documentrix/documents/splitters/character_spec.rb +110 -0
- data/spec/documentrix/documents/splitters/semantic_spec.rb +56 -0
- data/spec/documents_spec.rb +174 -0
- data/spec/spec_helper.rb +23 -0
- data/spec/utils/colorize_texts_spec.rb +13 -0
- data/spec/utils/tags_spec.rb +53 -0
- metadata +329 -0
@@ -0,0 +1,335 @@
|
|
1
|
+
require 'documentrix/documents/cache/common'
|
2
|
+
require 'sqlite3'
|
3
|
+
require 'sqlite_vec'
|
4
|
+
require 'digest/md5'
|
5
|
+
|
6
|
+
class Documentrix::Documents::Cache::SQLiteCache
|
7
|
+
include Documentrix::Documents::Cache::Common
|
8
|
+
|
9
|
+
# The initialize method sets up the cache by calling super and setting
|
10
|
+
# various instance variables.
|
11
|
+
#
|
12
|
+
# @param prefix [ String ] the prefix for keys
|
13
|
+
# @param embedding_length [ Integer ] the length of the embeddings vector
|
14
|
+
# @param filename [ String ] the name of the SQLite database file or ':memory:' for in-memory.
|
15
|
+
# @param debug [ FalseClass, TrueClass ] whether to enable debugging
|
16
|
+
#
|
17
|
+
# @return [ void ]
|
18
|
+
def initialize(prefix:, embedding_length: 1_024, filename: ':memory:', debug: false)
  # Hand the prefix up the ancestor chain (Common is included into this
  # class; presumably it stores the prefix -- confirm in common.rb).
  super(prefix:)
  # setup_database interpolates @embedding_length into the vec0 table
  # definition, so the configuration has to be stored before it runs.
  @embedding_length, @filename, @debug = embedding_length, filename, debug
  setup_database(@filename)
end
|
25
|
+
|
26
|
+
attr_reader :filename # filename for the database, `:memory:` is in memory
|
27
|
+
|
28
|
+
attr_reader :embedding_length # length of the embeddings vector
|
29
|
+
|
30
|
+
# The [](key) method retrieves the value associated with the given key from
|
31
|
+
# the cache.
|
32
|
+
#
|
33
|
+
# @param [String] key The key for which to retrieve the value.
|
34
|
+
#
|
35
|
+
# @return [Documentrix::Documents::Record, NilClass] The value associated
|
36
|
+
# with the key, or nil if it does not exist in the cache.
|
37
|
+
def [](key)
  rows = execute(
    %{
      SELECT records.key, records.text, records.norm, records.source,
        records.tags, embeddings.embedding
      FROM records
      INNER JOIN embeddings ON records.embedding_id = embeddings.rowid
      WHERE records.key = ?
    },
    pre(key)
  )
  row = rows&.first
  # No matching row: the key is not cached.
  return unless row
  stored_key, text, norm, source, encoded_tags, blob = row
  # The embedding is stored as packed single precision floats.
  embedding = blob.unpack("f*")
  # Tags are stored as a JSON array; rebuild the Tags object for this source.
  tags = Documentrix::Utils::Tags.new(JSON(encoded_tags.to_s).to_a, source:)
  # NOTE: the key handed on is the stored one, i.e. still carrying the prefix.
  convert_value_to_record(key: stored_key, text:, norm:, source:, tags:, embedding:)
end
|
53
|
+
|
54
|
+
# The []= method sets the value for a given key by inserting it into the
|
55
|
+
# database.
|
56
|
+
#
|
57
|
+
# @param [String] key the key to set
|
58
|
+
# @param [Hash, Documentrix::Documents::Record] value the hash or record
|
59
|
+
# containing the text, embedding, and other metadata
|
60
|
+
def []=(key, value)
  value     = convert_value_to_record(value)
  # Embeddings are stored as packed single precision floats.
  embedding = value.embedding.pack("f*")
  # Database#transaction begins a transaction, commits it when the block
  # returns and rolls it back when the block raises. A failing INSERT
  # therefore can no longer leave a transaction open, which would make
  # every following write fail on its BEGIN.
  @database.transaction do
    execute(%{INSERT INTO embeddings(embedding) VALUES(?)}, [ embedding ])
    # The rowid of the embedding just inserted links the record to it.
    embedding_id, = execute(%{ SELECT last_insert_rowid() }).flatten
    execute(%{
      INSERT INTO records(key,text,embedding_id,norm,source,tags)
      VALUES(?,?,?,?,?,?)
    }, [ pre(key), value.text, embedding_id, value.norm, value.source, JSON(value.tags) ])
  end
end
|
72
|
+
|
73
|
+
# The key? method checks if the given key exists in the cache by executing a
|
74
|
+
# SQL query.
|
75
|
+
#
|
76
|
+
# @param [String] key the key to check for existence
|
77
|
+
#
|
78
|
+
# @return [FalseClass, TrueClass] true if the key exists, false otherwise
|
79
|
+
def key?(key)
  # The key column is the primary key, so the count is either 0 or 1.
  count, = execute(
    %{ SELECT count(records.key) FROM records WHERE records.key = ? },
    pre(key)
  ).flatten
  count == 1
end
|
85
|
+
|
86
|
+
# The delete method removes a key from the cache by executing a SQL query.
|
87
|
+
#
|
88
|
+
# @param key [ String ] the key to be deleted
|
89
|
+
#
|
90
|
+
# @return [ FalseClass, TrueClass ] true if the key existed before it was deleted, false otherwise
|
91
|
+
def delete(key)
  # Check for the key first so the caller learns whether anything was
  # actually there; tap hands that answer back after the DELETE ran.
  key?(key).tap do
    execute(%{ DELETE FROM records WHERE records.key = ? }, pre(key))
  end
end
|
99
|
+
|
100
|
+
# The tags method returns an array of unique tags from the database.
|
101
|
+
#
|
102
|
+
# @return [Documentrix::Utils::Tags] An instance of Documentrix::Utils::Tags
|
103
|
+
# containing all unique tags found in the database.
|
104
|
+
def tags
  collected = Documentrix::Utils::Tags.new
  rows = execute(%{
    SELECT DISTINCT(tags) FROM records WHERE key LIKE ?
  }, [ "#@prefix%" ])
  # Each row holds one JSON encoded array of tags; merge them all into
  # the single Tags collection that is returned.
  rows.flatten.each_with_object(collected) do |encoded, set|
    JSON(encoded).each { |tag| set.add(tag) }
  end
end
|
114
|
+
|
115
|
+
# The size method returns the total number of records stored in the cache,
|
116
|
+
# that is the ones with prefix `prefix`.
|
117
|
+
#
|
118
|
+
# @return [ Integer ] the count of records
|
119
|
+
def size
  # Only keys starting with this cache's prefix are counted.
  count, = execute(%{SELECT COUNT(*) FROM records WHERE key LIKE ?}, [ "#@prefix%" ]).flatten
  count
end
|
122
|
+
|
123
|
+
# The clear_for_tags method clears the cache for specific tags by deleting
|
124
|
+
# records that match those tags and have the prefix `prefix`.
|
125
|
+
#
|
126
|
+
# @param tags [Array<String>, NilClass] An array of tag names to clear from
|
127
|
+
# the cache or nil for all records
|
128
|
+
#
|
129
|
+
# @return [Documentrix::Documents::Cache::SQLiteCache] The SQLiteCache instance
|
130
|
+
# after clearing the specified tags.
|
131
|
+
def clear_for_tags(tags = nil)
  tags = Documentrix::Utils::Tags.new(tags).to_a
  # Without any tags everything stored under this prefix is removed.
  tags.present? or return clear
  # find_records_for_tags yields [ key, tags, embedding_id ] rows; only the
  # key is needed, quoted for use inside a single quoted SQL literal.
  keys = find_records_for_tags(tags).map { |key, *| "'%s'" % quote(key) }
  # No record carries any of the tags: there is nothing to delete. (Taking
  # the first column of an empty result used to raise NoMethodError here.)
  keys.empty? or execute(%{DELETE FROM records WHERE key IN (#{keys.join(',')})})
  self
end
|
142
|
+
|
143
|
+
# The clear method deletes all records for prefix `prefix` from the cache by
|
144
|
+
# executing a SQL query.
|
145
|
+
#
|
146
|
+
# @return [ Documentrix::Documents::Cache::SQLiteCache ] self
|
147
|
+
def clear
  # Remove every record whose key starts with this cache's prefix, then
  # hand back the cache itself.
  tap { execute(%{DELETE FROM records WHERE key LIKE ?}, [ "#@prefix%" ]) }
end
|
151
|
+
|
152
|
+
# The each method iterates over records matching the given prefix and yields
|
153
|
+
# them to the block.
|
154
|
+
#
|
155
|
+
# @param prefix [ String ] the prefix to match
|
156
|
+
# @yield [ key, value ] where key is the record's key and value is the record itself
|
157
|
+
#
|
158
|
+
# @example
|
159
|
+
# cache.each do |key, value|
|
160
|
+
# puts "#{key}: #{value}"
|
161
|
+
# end
|
162
|
+
def each(prefix: "#@prefix%", &block)
  # Without a block return an Enumerator over the same records instead of
  # failing on the first row with a call on nil.
  block or return enum_for(__method__, prefix:)
  execute(%{
    SELECT records.key, records.text, records.norm, records.source,
      records.tags, embeddings.embedding
    FROM records
    INNER JOIN embeddings ON records.embedding_id = embeddings.rowid
    WHERE records.key LIKE ?
  }, [ prefix ]).each do |key, text, norm, source, tags, embedding|
    # Embeddings are stored as packed single precision floats.
    embedding = embedding.unpack("f*")
    # Tags are stored as a JSON array.
    tags = Documentrix::Utils::Tags.new(JSON(tags.to_s).to_a, source:)
    value = convert_value_to_record(key:, text:, norm:, source:, tags:, embedding:)
    # NOTE: the key is yielded as stored, i.e. including the prefix.
    block.(key, value)
  end
  self
end
|
177
|
+
include Enumerable
|
178
|
+
|
179
|
+
# The full_each method iterates over all keys and values in the cache,
|
180
|
+
# regardless of their prefix.
|
181
|
+
#
|
182
|
+
# @yield [ key, value ]
|
183
|
+
#
|
184
|
+
# @return [ Documentrix::Documents::Cache::SQLiteCache ] self
|
185
|
+
# A bare '%' LIKE pattern matches every key, whatever its prefix.
def full_each(&block) = each(prefix: '%', &block)
|
188
|
+
|
189
|
+
# The convert_to_vector method returns the input vector itself, because
|
190
|
+
# conversion isn't necessary for this cache class.
|
191
|
+
#
|
192
|
+
# @param vector [ Array ] the input vector
|
193
|
+
#
|
194
|
+
# @return [ Array ] the (not) converted vector
|
195
|
+
# Identity: the vector is handed back untouched.
def convert_to_vector(vector) = vector
|
198
|
+
|
199
|
+
# The find_records_for_tags method filters records based on the provided tags.
|
200
|
+
#
|
201
|
+
# @param tags [ Array ] an array of tag names
|
202
|
+
#
|
203
|
+
# @return [ Array ] an array of filtered records
|
204
|
+
def find_records_for_tags(tags)
  # tags_filter stays nil when no tags were asked for (no filtering).
  tags_filter = Documentrix::Utils::Tags.new(tags).to_a if tags.present?
  binds       = [ "#@prefix%" ]
  tags_where  = nil
  if tags_filter && !tags_filter.empty?
    # Tag values are bound as parameters rather than spliced into the
    # statement: the previous double quoted literal was not protected by
    # quote (which only escapes single quotes) and relied on SQLite
    # accepting double quoted strings.
    tags_where = ' AND (%s)' % tags_filter.map { 'tags LIKE ?' }.join(' OR ')
    binds.concat(tags_filter.map { '%' + _1 + '%' })
  end
  records = execute(%{
    SELECT key, tags, embedding_id
    FROM records
    WHERE key LIKE ?#{tags_where}
  }, binds)
  if tags_filter
    # LIKE is only a coarse substring prefilter; keep the rows whose decoded
    # tag list really shares at least one tag with the filter.
    records = records.select { |_key, encoded_tags, _embedding_id|
      (tags_filter & JSON(encoded_tags.to_s).to_a).size >= 1
    }
  end
  records
end
|
225
|
+
|
226
|
+
# The find_records method finds records that match the given needle and tags.
|
227
|
+
#
|
228
|
+
# @param needle [ Array ] the embedding vector
|
229
|
+
# @param tags [ Array ] the list of tags to filter by (optional)
|
230
|
+
# @param max_records [ Integer ] the maximum number of records to return (optional)
|
231
|
+
#
|
232
|
+
# @yield [ key, value ]
|
233
|
+
#
|
234
|
+
# @raise [ ArgumentError ] if needle size does not match embedding length
|
235
|
+
#
|
236
|
+
# @example
|
237
|
+
# documents.find_records([ 0.1 ] * 1_024, tags: %w[ test ])
|
238
|
+
#
|
239
|
+
# @return [ Array<Documentrix::Documents::Record> ] the list of matching records
|
240
|
+
def find_records(needle, tags: nil, max_records: nil)
  unless needle.size == @embedding_length
    raise ArgumentError, "needle embedding length != %s" % @embedding_length
  end
  # The vector is passed to the MATCH operator as packed single precision floats.
  needle_binary = needle.pack("f*")
  # Never ask for more neighbours than requested, stored, or 4096.
  limit = [ max_records, size, 4_096 ].compact.min
  # Restrict the search to the embeddings of records matching the tags;
  # the embedding id is the last column of each row.
  rowids = find_records_for_tags(tags).map(&:last)
  rows = execute(%{
    SELECT records.key, records.text, records.norm, records.source,
      records.tags, embeddings.embedding
    FROM records
    INNER JOIN embeddings ON records.embedding_id = embeddings.rowid
    WHERE embeddings.rowid IN (#{rowids.join(',')})
      AND embeddings.embedding MATCH ? AND embeddings.k = ?
  }, [ needle_binary, limit ])
  rows.map do |stored_key, text, norm, source, encoded_tags, blob|
    convert_value_to_record(
      key:       unpre(stored_key),
      text:,
      norm:,
      source:,
      tags:      Documentrix::Utils::Tags.new(JSON(encoded_tags.to_s).to_a, source:),
      embedding: blob.unpack("f*")
    )
  end
end
|
261
|
+
|
262
|
+
private
|
263
|
+
|
264
|
+
# The execute method executes an SQL query on the database by calling the
|
265
|
+
# \@database.execute method.
|
266
|
+
#
|
267
|
+
# @param [Array] a the arguments to be passed to the @database.execute method
|
268
|
+
#
|
269
|
+
# @return [Result] the result of the executed query
|
270
|
+
def execute(*a)
  if @debug
    # Tidy the statement for display: drop blank lines, then strip the
    # indentation of the first line from every line.
    sql    = a.first.gsub(/^\s*\n/, '')
    indent = sql[/\A\s+/].to_s
    sql    = sql.delete_prefix(indent).gsub(/^\s{0,#{indent.size}}/, '').chomp
    # Ask SQLite to EXPLAIN the statement with the same bind values.
    explanation = @database.execute("EXPLAIN #{sql}", *a.drop(1)).pretty_inspect
    STDERR.puts("EXPLANATION:\n%s\n%s" % [ sql, explanation ])
  end
  # The statement itself always runs with the arguments exactly as given.
  @database.execute(*a)
end
|
284
|
+
|
285
|
+
# The quote method returns the quoted string as per
|
286
|
+
# SQLite3::Database.quote(string).
|
287
|
+
#
|
288
|
+
# @param string [String] the input string
|
289
|
+
#
|
290
|
+
# @return [String] the quoted string
|
291
|
+
# Thin wrapper around the quoting helper of the sqlite3 gem.
def quote(string) = SQLite3::Database.quote(string)
|
294
|
+
|
295
|
+
# The setup_database method initializes the SQLite database by creating
|
296
|
+
# tables and loading extensions.
|
297
|
+
#
|
298
|
+
# @param filename [ String ] the name of the SQLite database file
|
299
|
+
#
|
300
|
+
# @return [ nil ]
|
301
|
+
def setup_database(filename)
  @database = SQLite3::Database.new(filename)
  # Extension loading is switched on only while sqlite-vec is loaded and
  # switched off again right afterwards.
  @database.enable_load_extension(true)
  SqliteVec.load(@database)
  @database.enable_load_extension(false)
  # Vector table: one embedding of @embedding_length floats per row.
  execute %{
    CREATE VIRTUAL TABLE IF NOT EXISTS embeddings USING vec0(
      embedding float[#@embedding_length]
    )
  }
  # Records table: ON CONFLICT REPLACE lets a write with an existing key
  # replace the old row. The tags default is the JSON text of an empty
  # array, given as a proper string constant (a bare [] is not one).
  # NOTE(review): the foreign key references embeddings(id), while this file
  # joins on embeddings.rowid everywhere -- confirm the constraint is intended.
  execute %{
    CREATE TABLE IF NOT EXISTS records (
      key          text NOT NULL PRIMARY KEY ON CONFLICT REPLACE,
      text         text NOT NULL DEFAULT '',
      embedding_id integer,
      norm         float NOT NULL DEFAULT 0.0,
      source       text,
      tags         json NOT NULL DEFAULT '[]',
      FOREIGN KEY(embedding_id) REFERENCES embeddings(id) ON DELETE CASCADE
    )
  }
  nil
end
|
324
|
+
|
325
|
+
# The convert_value_to_record method converts the given value into a
|
326
|
+
# Documentrix::Documents::Record object.
|
327
|
+
#
|
328
|
+
# @param value [ Documentrix::Documents::Record, Hash ] the value to be converted
|
329
|
+
#
|
330
|
+
# @return [ Documentrix::Documents::Record ] the converted record object
|
331
|
+
def convert_value_to_record(value)
  case value
  when Documentrix::Documents::Record
    # Already a record: pass it through unchanged.
    value
  else
    # Anything else is turned into a hash and wrapped in a Record.
    Documentrix::Documents::Record[value.to_hash]
  end
end
|
335
|
+
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
module Documentrix::Documents::Splitters
|
2
|
+
class Character
  DEFAULT_SEPARATOR = /(?:\r?\n){2,}/

  # The initialize method stores the splitter configuration.
  #
  # @param separator [ Regexp, String ] where the text is split
  # @param include_separator [ FalseClass, TrueClass ] whether each piece
  #   keeps the separator that followed it
  # @param combining_string [ String ] string appended after a piece when
  #   pieces are combined into a chunk
  # @param chunk_size [ Integer ] size limit used when combining pieces
  def initialize(separator: DEFAULT_SEPARATOR, include_separator: false, combining_string: "\n\n", chunk_size: 4096)
    @separator, @include_separator, @combining_string, @chunk_size =
      separator, include_separator, combining_string, chunk_size
    if include_separator
      # A capture group makes String#split return the separators as well. A
      # String separator is escaped first so that it keeps its literal
      # meaning, exactly as it has when it is passed to split directly.
      source = @separator.is_a?(String) ? Regexp.escape(@separator) : @separator
      @separator = Regexp.new("(#{source})")
    end
  end

  # The split method cuts the text at the separator and combines the
  # resulting pieces into chunks.
  #
  # @param text [ String ] the text to split
  #
  # @return [ Array<String> ] the chunks
  def split(text)
    texts = []
    text.split(@separator) do |t|
      if @include_separator && @separator.match?(t)
        # A captured separator is glued onto the piece preceding it.
        texts.last&.concat t
      else
        texts.push(t)
      end
    end
    result = []
    current_text = +''
    texts.each do |t|
      if current_text.size + t.size < @chunk_size
        current_text << t << @combining_string
      else
        # The piece does not fit any more: emit the chunk and start a new
        # one. NOTE: the piece starting a chunk is stored without the
        # combining string, and a single piece may exceed chunk_size.
        current_text.empty? or result << current_text
        current_text = t
      end
    end
    current_text.empty? or result << current_text
    result
  end
end
|
36
|
+
|
37
|
+
class RecursiveCharacter
  # Separators ordered from coarse to fine: blank lines, line breaks, word
  # boundaries and finally the empty pattern.
  DEFAULT_SEPARATORS = [ /(?:\r?\n){2,}/, /\r?\n/, /\b/, // ].freeze

  # The initialize method stores the splitter configuration.
  #
  # @param separators [ Array<Regexp, String> ] separators to try in order
  # @param include_separator [ FalseClass, TrueClass ] passed on to Character
  # @param combining_string [ String ] passed on to Character
  # @param chunk_size [ Integer ] pieces larger than this are split again
  #
  # @raise [ ArgumentError ] if separators is empty
  def initialize(separators: DEFAULT_SEPARATORS, include_separator: false, combining_string: "\n\n", chunk_size: 4096)
    if separators.empty?
      raise ArgumentError, "non-empty array of separators required"
    end
    @separators        = separators
    @include_separator = include_separator
    @combining_string  = combining_string
    @chunk_size        = chunk_size
  end

  # The split method splits the text with the first separator and splits
  # every piece that is still larger than the chunk size again with the
  # remaining separators.
  #
  # @param text [ String ] the text to split
  # @param separators [ Array<Regexp, String> ] separators still to be tried
  #
  # @return [ Array<String> ] the resulting pieces
  def split(text, separators: @separators)
    # Out of separators: the text cannot be broken down any further.
    return [ text ] if separators.empty?
    separator, *remaining = separators
    pieces = Character.new(
      separator:,
      include_separator: @include_separator,
      combining_string:  @combining_string,
      chunk_size:        @chunk_size
    ).split(text)
    return [ text ] if pieces.empty?
    pieces.flat_map do |piece|
      piece.size > @chunk_size ? split(piece, separators: remaining) : [ piece ]
    end
  end
end
|
72
|
+
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
module Documentrix::Documents::Splitters
|
2
|
+
class Semantic
  # cosine_similarity used below is not defined in this class; presumably
  # it comes from this mixin -- confirm in utils/math.rb.
  include Documentrix::Utils::Math

  DEFAULT_SEPARATOR = /[.!?]\s*(?:\b|\z)/

  # The initialize method stores the splitter configuration.
  #
  # @param ollama [ Object ] client whose embed(model:, input:, options:)
  #   result responds to embeddings
  # @param model [ String ] name of the embedding model handed to embed
  # @param model_options [ Object, NilClass ] options handed to embed
  # @param separator [ Regexp ] where sentences end
  # @param chunk_size [ Integer ] size limit used when joining sentences
  def initialize(ollama:, model:, model_options: nil, separator: DEFAULT_SEPARATOR, chunk_size: 4096)
    @ollama        = ollama
    @model         = model
    @model_options = model_options
    @separator     = separator
    @chunk_size    = chunk_size
  end

  # The split method cuts the text into sentences, embeds them and joins
  # neighbouring sentences again, starting a new chunk wherever the cosine
  # distance between two consecutive sentences exceeds the threshold.
  #
  # @param text [ String ] the text to split
  # @param batch_size [ Integer ] number of sentences embedded per request
  # @param breakpoint [ Symbol ] :percentile, :standard_deviation or
  #   :interquartile
  # @param opts [ Hash ] :include_separator for the sentence splitter, plus
  #   :percentile / :percentage for the threshold computation
  #
  # @return [ Array<String> ] the chunks, or the plain sentences if there
  #   are fewer than two embeddings or no distance exceeds the threshold
  def split(text, batch_size: 100, breakpoint: :percentile, **opts)
    sentences = Documentrix::Documents::Splitters::Character.new(
      separator: @separator,
      include_separator: opts.fetch(:include_separator, true),
      chunk_size: 1,
    ).split(text)
    # with_infobar / infobar are not defined in this file; presumably they
    # are provided by the infobar gem -- confirm.
    batches = sentences.with_infobar(label: 'Split').each_slice(batch_size)
    embeddings = batches.each_with_object([]) do |batch, collected|
      collected.concat sentence_embeddings(batch)
      infobar.progress by: batch.size
    end
    infobar.newline
    return sentences if embeddings.size < 2
    # distances[i] is the cosine distance between sentence i and i + 1.
    distances = embeddings.each_cons(2).map do |a, b|
      1.0 - cosine_similarity(a:, b:)
    end
    max_distance = calculate_breakpoint_threshold(breakpoint, distances, **opts)
    # Indices after which a new chunk starts.
    gaps = distances.each_index.select { |i| distances[i] > max_distance }
    return sentences if gaps.empty?
    # Make sure the final group reaches the last sentence.
    gaps << distances.size if gaps.last < distances.size
    gaps << sentences.size - 1 if gaps.last < sentences.size - 1
    result = []
    first  = 0
    gaps.each do |gap|
      chunk = +''
      (first..gap).each do |i|
        sentence = sentences[i]
        if chunk.size + sentence.size < @chunk_size
          chunk += sentence
        else
          # The group outgrew the chunk size: emit what was collected so
          # far and carry on with the current sentence.
          chunk.empty? or result << chunk
          chunk = sentence
        end
      end
      chunk.empty? or result << chunk
      first = gap.succ
    end
    result
  end

  private

  # The calculate_breakpoint_threshold method computes the distance above
  # which two consecutive sentences are put into different chunks.
  #
  # @param breakpoint_method [ Symbol ] :percentile, :standard_deviation or
  #   :interquartile
  # @param distances [ Array<Float> ] distances of consecutive sentences
  # @param opts [ Hash ] :percentile (default 95) for :percentile,
  #   :percentage (default 100) for the other two methods
  #
  # @raise [ ArgumentError ] if the breakpoint method is unknown
  #
  # @return [ Numeric ] the threshold
  def calculate_breakpoint_threshold(breakpoint_method, distances, **opts)
    sequence = MoreMath::Sequence.new(distances)
    case breakpoint_method
    when :percentile
      sequence.percentile(opts.fetch(:percentile, 95))
    when :standard_deviation
      factor = opts.fetch(:percentage, 100) / 100.0
      (sequence.mean + sequence.standard_deviation * factor).clamp(0, sequence.max)
    when :interquartile
      factor = opts.fetch(:percentage, 100) / 100.0
      (sequence.mean + sequence.interquartile_range * factor).clamp(0, sequence.max)
    else
      raise ArgumentError, "invalid breakpoint method #{breakpoint_method}"
    end
  end

  # The sentence_embeddings method embeds a batch of sentences.
  #
  # @param input [ Array<String> ] the sentences to embed
  #
  # @return [ Array ] one Numo::NArray per sentence
  def sentence_embeddings(input)
    response = @ollama.embed(model: @model, input:, options: @model_options)
    # map! converts the response's own embeddings array in place.
    response.embeddings.map! { |vector| Numo::NArray[*vector] }
  end
end
|
91
|
+
end
|