documentrix 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (33) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +1 -0
  3. data/Gemfile +5 -0
  4. data/README.md +71 -0
  5. data/Rakefile +48 -0
  6. data/documentrix.gemspec +41 -0
  7. data/lib/documentrix/documents/cache/common.rb +43 -0
  8. data/lib/documentrix/documents/cache/memory_cache.rb +91 -0
  9. data/lib/documentrix/documents/cache/records.rb +145 -0
  10. data/lib/documentrix/documents/cache/redis_backed_memory_cache.rb +64 -0
  11. data/lib/documentrix/documents/cache/redis_cache.rb +128 -0
  12. data/lib/documentrix/documents/cache/sqlite_cache.rb +335 -0
  13. data/lib/documentrix/documents/splitters/character.rb +72 -0
  14. data/lib/documentrix/documents/splitters/semantic.rb +91 -0
  15. data/lib/documentrix/documents.rb +328 -0
  16. data/lib/documentrix/utils/colorize_texts.rb +65 -0
  17. data/lib/documentrix/utils/math.rb +48 -0
  18. data/lib/documentrix/utils/tags.rb +112 -0
  19. data/lib/documentrix/utils.rb +5 -0
  20. data/lib/documentrix/version.rb +8 -0
  21. data/lib/documentrix.rb +11 -0
  22. data/spec/assets/embeddings.json +1 -0
  23. data/spec/documentrix/documents/cache/memory_cache_spec.rb +98 -0
  24. data/spec/documentrix/documents/cache/redis_backed_memory_cache_spec.rb +121 -0
  25. data/spec/documentrix/documents/cache/redis_cache_spec.rb +123 -0
  26. data/spec/documentrix/documents/cache/sqlite_cache_spec.rb +141 -0
  27. data/spec/documentrix/documents/splitters/character_spec.rb +110 -0
  28. data/spec/documentrix/documents/splitters/semantic_spec.rb +56 -0
  29. data/spec/documents_spec.rb +174 -0
  30. data/spec/spec_helper.rb +23 -0
  31. data/spec/utils/colorize_texts_spec.rb +13 -0
  32. data/spec/utils/tags_spec.rb +53 -0
  33. metadata +329 -0
@@ -0,0 +1,335 @@
1
+ require 'documentrix/documents/cache/common'
2
+ require 'sqlite3'
3
+ require 'sqlite_vec'
4
+ require 'digest/md5'
5
+
6
+ class Documentrix::Documents::Cache::SQLiteCache
7
+ include Documentrix::Documents::Cache::Common
8
+
9
# The initialize method passes the prefix on via super, remembers the given
# settings in instance variables and finally sets up the database.
#
# @param prefix [ String ] the prefix for keys
# @param embedding_length [ Integer ] the length of the embeddings vector
# @param filename [ String ] the name of the SQLite database file or
#   ':memory:' for in-memory.
# @param debug [ FalseClass, TrueClass ] whether to enable debugging
#
# @return [ void ]
def initialize(prefix:, embedding_length: 1_024, filename: ':memory:', debug: false)
  super(prefix: prefix)
  @debug            = debug
  @filename         = filename
  # has to be set before setup_database runs, the schema interpolates it
  @embedding_length = embedding_length
  setup_database(@filename)
end
25
+
26
+ attr_reader :filename # filename for the database, `:memory:` is in memory
27
+
28
+ attr_reader :embedding_length # length of the embeddings vector
29
+
30
# The [](key) method looks up the row stored under the prefixed key and turns
# it into a record. The embedding blob is unpacked into an array of floats
# and the JSON encoded tags become a Documentrix::Utils::Tags object.
#
# @param [String] key The key for which to retrieve the value.
#
# @return [Documentrix::Documents::Record, NilClass] The value associated
#   with the key, or nil if it does not exist in the cache.
def [](key)
  rows = execute(
    %{
      SELECT records.key, records.text, records.norm, records.source,
        records.tags, embeddings.embedding
      FROM records
      INNER JOIN embeddings ON records.embedding_id = embeddings.rowid
      WHERE records.key = ?
    },
    pre(key)
  )
  row = rows&.first
  return unless row
  stored_key, text, norm, source, json_tags, blob = *row
  embedding = blob.unpack("f*")
  tags      = Documentrix::Utils::Tags.new(JSON(json_tags.to_s).to_a, source: source)
  convert_value_to_record(
    key: stored_key, text: text, norm: norm, source: source, tags: tags, embedding: embedding
  )
end
53
+
54
# The []= method sets the value for a given key by inserting its embedding
# and its record into the database inside of a single transaction.
#
# If any of the statements fails, the transaction is rolled back (if it is
# still active) and the error is re-raised, so that a failed write does not
# leave an open transaction behind that would make the next BEGIN fail.
#
# @param [String] key the key to set
# @param [Hash, Documentrix::Documents::Record] value the hash or record
#   containing the text, embedding, and other metadata
#
# @raise [ StandardError ] whatever error a failing statement raised
def []=(key, value)
  value     = convert_value_to_record(value)
  embedding = value.embedding.pack("f*")
  execute(%{BEGIN})
  begin
    execute(%{INSERT INTO embeddings(embedding) VALUES(?)}, [ embedding ])
    # the rowid of the embedding just inserted links the record to it
    embedding_id, = execute(%{ SELECT last_insert_rowid() }).flatten
    execute(%{
      INSERT INTO records(key,text,embedding_id,norm,source,tags)
      VALUES(?,?,?,?,?,?)
    }, [ pre(key), value.text, embedding_id, value.norm, value.source, JSON(value.tags) ])
    execute(%{COMMIT})
  rescue
    # only roll back if the transaction is still open, otherwise the ROLLBACK
    # error would mask the original one
    execute(%{ROLLBACK}) if @database.transaction_active?
    raise
  end
end
72
+
73
# The key? method counts the rows stored under the prefixed key in order to
# find out if the key exists in the cache.
#
# @param [String] key the key to check for existence
#
# @return [FalseClass, TrueClass] true if the key exists, false otherwise
def key?(key)
  rows = execute(
    %{ SELECT count(records.key) FROM records WHERE records.key = ? },
    pre(key)
  )
  count, = rows.flatten
  count == 1
end
85
+
86
# The delete method removes the record stored under the prefixed key from
# the records table.
#
# @param key [ String ] the key to be deleted
#
# @return [ FalseClass, TrueClass ] the result of key? as determined before
#   the deletion was executed
def delete(key)
  key?(key).tap do
    execute(
      %{ DELETE FROM records WHERE records.key = ? },
      pre(key)
    )
  end
end
99
+
100
# The tags method collects the tags of all records whose key starts with the
# prefix of this cache.
#
# @return [Documentrix::Utils::Tags] An instance of Documentrix::Utils::Tags
#   containing all unique tags found in the database.
def tags
  rows = execute(%{
    SELECT DISTINCT(tags) FROM records WHERE key LIKE ?
  }, [ "#{@prefix}%" ])
  # every row holds a single JSON encoded array of tags
  rows.flatten.each_with_object(Documentrix::Utils::Tags.new) do |json, collected|
    JSON(json).each { |tag| collected.add(tag) }
  end
end
114
+
115
# The size method counts the records stored in the cache whose key starts
# with the prefix `prefix`.
#
# @return [ Integer ] the count of records
def size
  count, = execute(%{SELECT COUNT(*) FROM records WHERE key LIKE ?}, [ "#{@prefix}%" ]).flatten
  count
end
122
+
123
# The clear_for_tags method clears the cache for specific tags by deleting
# records that match those tags and have the prefix `prefix`.
#
# If tags were given, but no record matches them, nothing is deleted.
#
# @param tags [Array<String>, NilClass] An array of tag names to clear from
#   the cache or nil for all records
#
# @return [Documentrix::Documents::Cache::SQLiteCache] The SQLiteCache instance
#   after clearing the specified tags.
def clear_for_tags(tags = nil)
  tags = Documentrix::Utils::Tags.new(tags).to_a
  if tags.empty?
    clear
  else
    # the first column of every found record is its key
    keys = find_records_for_tags(tags).map(&:first)
    # without this guard an empty result would build no key list at all
    unless keys.empty?
      key_list = keys.map { "'%s'" % quote(_1) }.join(?,)
      execute(%{DELETE FROM records WHERE key IN (#{key_list})})
    end
  end
  self
end
142
+
143
# The clear method deletes all records whose key starts with the prefix
# `prefix` from the records table.
#
# @return [ Documentrix::Documents::Cache::SQLiteCache ] self
def clear
  like_prefix = "#{@prefix}%"
  execute(%{DELETE FROM records WHERE key LIKE ?}, [ like_prefix ])
  self
end
151
+
152
# The each method iterates over records whose key matches the given LIKE
# pattern and yields them to the block. If no block was given, an Enumerator
# for the same iteration is returned instead.
#
# @param prefix [ String ] the LIKE pattern the keys have to match, by
#   default the prefix of this cache followed by `%`
# @yield [ key, value ] where key is the record's key and value is the record itself
#
# @example
#   cache.each do |key, value|
#     puts "#{key}: #{value}"
#   end
#
# @return [ Documentrix::Documents::Cache::SQLiteCache, Enumerator ] self if
#   a block was given, an Enumerator otherwise
def each(prefix: "#@prefix%", &block)
  # calling the missing block would raise NoMethodError for the first row
  block or return enum_for(__method__, prefix:)
  execute(%{
    SELECT records.key, records.text, records.norm, records.source,
      records.tags, embeddings.embedding
    FROM records
    INNER JOIN embeddings ON records.embedding_id = embeddings.rowid
    WHERE records.key LIKE ?
  }, [ prefix ]).each do |key, text, norm, source, tags, embedding|
    embedding = embedding.unpack("f*")
    tags      = Documentrix::Utils::Tags.new(JSON(tags.to_s).to_a, source:)
    value     = convert_value_to_record(key:, text:, norm:, source:, tags:, embedding:)
    block.(key, value)
  end
  self
end
177
+ include Enumerable
178
+
179
# The full_each method calls each with the LIKE pattern `%`, that is it
# iterates over all keys and values in the cache, regardless of their prefix.
#
# @yield [ key, value ]
#
# @return [ Documentrix::Documents::Cache::SQLiteCache ] self
def full_each(&block)
  match_everything = '%'
  each(prefix: match_everything, &block)
end
188
+
189
# The convert_to_vector method hands back the very vector it was given, no
# conversion is done by this cache class.
#
# @param vector [ Array ] the input vector
#
# @return [ Array ] the (not) converted vector
def convert_to_vector(vector) = vector
198
+
199
# The find_records_for_tags method returns the rows of all records with the
# prefix `prefix`, optionally restricted to records having one of the given
# tags.
#
# The tags are passed to the database as bound parameters of LIKE
# conditions, they are never interpolated into the SQL text. These
# conditions are only a coarse filter on the JSON encoded tags column, the
# exact match is done afterwards by intersecting the decoded tags with the
# requested ones.
#
# @param tags [ Array, NilClass ] an array of tag names, if it is not
#   present the records are not filtered by tags
#
# @return [ Array ] an array of rows `[ key, tags, embedding_id ]`
def find_records_for_tags(tags)
  binds = [ "#{@prefix}%" ]
  if tags.present?
    tags_filter = Documentrix::Utils::Tags.new(tags).to_a
    unless tags_filter.empty?
      tags_where = ' AND (%s)' % tags_filter.map { 'tags LIKE ?' }.join(' OR ')
      binds.concat(tags_filter.map { "%#{_1}%" })
    end
  end
  records = execute(%{
    SELECT key, tags, embedding_id
    FROM records
    WHERE key LIKE ?#{tags_where}
  }, binds)
  if tags_filter
    # LIKE also matches substrings of other tags, so check for exact matches
    records = records.select { |_key, json_tags, _embedding_id|
      (tags_filter & JSON(json_tags.to_s).to_a).size >= 1
    }
  end
  records
end
225
+
226
# The find_records method queries the embeddings of the records found by
# find_records_for_tags for matches with the needle embedding.
#
# @param needle [ Array ] the embedding vector
# @param tags [ Array ] the list of tags to filter by (optional)
# @param max_records [ Integer ] the maximum number of records to return
#   (optional), never more than the size of the cache or 4096
#
# @raise [ ArgumentError ] if needle size does not match embedding length
#
# @example
#   documents.find_records([ 0.1 ] * 1_024, tags: %w[ test ])
#
# @return [ Array<Documentrix::Documents::Record> ] the list of matching records
def find_records(needle, tags: nil, max_records: nil)
  unless needle.size == @embedding_length
    raise ArgumentError, "needle embedding length != %s" % @embedding_length
  end
  needle_binary = needle.pack("f*")
  limit         = [ max_records, size, 4_096 ].compact.min
  candidates    = find_records_for_tags(tags)
  # the last column of the candidate rows is the embedding_id
  rowids        = candidates.transpose.last&.join(?,)
  rows = execute(%{
    SELECT records.key, records.text, records.norm, records.source,
      records.tags, embeddings.embedding
    FROM records
    INNER JOIN embeddings ON records.embedding_id = embeddings.rowid
    WHERE embeddings.rowid IN (#{rowids})
      AND embeddings.embedding MATCH ? AND embeddings.k = ?
  }, [ needle_binary, limit ])
  rows.map do |stored_key, text, norm, source, json_tags, blob|
    key         = unpre(stored_key)
    embedding   = blob.unpack("f*")
    record_tags = Documentrix::Utils::Tags.new(JSON(json_tags.to_s).to_a, source: source)
    convert_value_to_record(
      key: key, text: text, norm: norm, source: source, tags: record_tags, embedding: embedding
    )
  end
end
261
+
262
+ private
263
+
264
# The execute method runs an SQL statement by handing all arguments on to
# \@database.execute. In debug mode the dedented statement and the result of
# running EXPLAIN on it are written to STDERR first.
#
# @param [Array] a the arguments to be passed to the @database.execute method
#
# @return [Result] the result of the executed query
def execute(*a)
  if @debug
    sql, *binds = a
    # drop blank lines, then remove the indentation of the first line from
    # all lines of the statement
    statement = sql.gsub(/^\s*\n/, '')
    indent    = statement[/\A\s+/].to_s.size
    statement = statement.sub(/\A\s+/, '')
    statement = statement.gsub(/^\s{0,#{indent}}/, '').chomp
    explanation = @database.execute("EXPLAIN #{statement}", *binds).pretty_inspect
    STDERR.puts("EXPLANATION:\n%s\n%s" % [ statement, explanation ])
  end
  @database.execute(*a)
end
284
+
285
# The quote method delegates to SQLite3::Database.quote(string) and returns
# its result.
#
# @param string [String] the input string
#
# @return [String] the quoted string
def quote(string) = SQLite3::Database.quote(string)
294
+
295
# The setup_database method opens the SQLite database, loads the sqlite-vec
# extension into it and creates the embeddings and records tables unless
# they already exist.
#
# @param filename [ String ] the name of the SQLite database file
#
# @return [ nil ]
def setup_database(filename)
  @database = SQLite3::Database.new(filename)
  @database.enable_load_extension(true)
  begin
    SqliteVec.load(@database)
  ensure
    # extension loading is only allowed while sqlite-vec is being loaded,
    # even if loading it fails
    @database.enable_load_extension(false)
  end
  execute %{
    CREATE VIRTUAL TABLE IF NOT EXISTS embeddings USING vec0(
      embedding float[#@embedding_length]
    )
  }
  # The default for tags is the quoted JSON text '[]'; square brackets
  # without quotes are SQLite's identifier quoting, not a string constant.
  execute %{
    CREATE TABLE IF NOT EXISTS records (
      key text NOT NULL PRIMARY KEY ON CONFLICT REPLACE,
      text text NOT NULL DEFAULT '',
      embedding_id integer,
      norm float NOT NULL DEFAULT 0.0,
      source text,
      tags json NOT NULL DEFAULT '[]',
      FOREIGN KEY(embedding_id) REFERENCES embeddings(id) ON DELETE CASCADE
    )
  }
  nil
end
324
+
325
# The convert_value_to_record method returns records unchanged and builds a
# Documentrix::Documents::Record from the hash representation of any other
# value.
#
# @param value [ Documentrix::Documents::Record, Hash ] the value to be converted
#
# @return [ Documentrix::Documents::Record ] the converted record object
def convert_value_to_record(value)
  case value
  when Documentrix::Documents::Record
    value
  else
    Documentrix::Documents::Record[value.to_hash]
  end
end
335
+ end
@@ -0,0 +1,72 @@
1
+ module Documentrix::Documents::Splitters
2
# Splits a text at a separator and combines the resulting pieces into
# chunks again.
class Character
  # The separator used by default: two or more consecutive line breaks.
  DEFAULT_SEPARATOR = /(?:\r?\n){2,}/

  # @param separator [ Regexp, String ] where to split the text
  # @param include_separator [ FalseClass, TrueClass ] whether the separators
  #   are kept at the end of the pieces they follow
  # @param combining_string [ String ] appended to every piece that is added
  #   to a chunk that still has room
  # @param chunk_size [ Integer ] a piece is only added to the current chunk
  #   while the combined size stays below this value
  def initialize(separator: DEFAULT_SEPARATOR, include_separator: false, combining_string: "\n\n", chunk_size: 4096)
    @include_separator = include_separator
    @combining_string  = combining_string
    @chunk_size        = chunk_size
    # with a capture group String#split passes the separators on as well
    @separator = include_separator ? Regexp.new("(#{separator})") : separator
  end

  # Splits the text into pieces and combines them into chunks.
  #
  # @param text [ String ] the text to split
  #
  # @return [ Array<String> ] the chunks
  def split(text)
    pieces = []
    text.split(@separator) do |piece|
      if @include_separator && piece =~ @separator
        # a separator is glued to the piece before it, without such a piece
        # it is dropped
        pieces.last&.concat(piece)
      else
        pieces << piece
      end
    end
    chunks = []
    buffer = +''
    pieces.each do |piece|
      if buffer.size + piece.size >= @chunk_size
        # no room left: finish the current chunk and start over with piece
        chunks << buffer unless buffer.empty?
        buffer = piece
      else
        buffer << piece << @combining_string
      end
    end
    chunks << buffer unless buffer.empty?
    chunks
  end
end
36
+
37
# Splits a text with the first of a list of separators and splits chunks
# that are still larger than the chunk size again with the remaining
# separators.
class RecursiveCharacter
  # The separators tried by default, in this order: two or more line breaks,
  # a single line break, a word boundary, the empty pattern.
  DEFAULT_SEPARATORS = [
    /(?:\r?\n){2,}/,
    /\r?\n/,
    /\b/,
    //,
  ].freeze

  # @param separators [ Array<Regexp, String> ] the separators to try
  # @param include_separator [ FalseClass, TrueClass ] passed on to Character
  # @param combining_string [ String ] passed on to Character
  # @param chunk_size [ Integer ] passed on to Character, chunks larger than
  #   this are split again
  #
  # @raise [ ArgumentError ] if separators is empty
  def initialize(separators: DEFAULT_SEPARATORS, include_separator: false, combining_string: "\n\n", chunk_size: 4096)
    if separators.empty?
      raise ArgumentError, "non-empty array of separators required"
    end
    @separators        = separators
    @include_separator = include_separator
    @combining_string  = combining_string
    @chunk_size        = chunk_size
  end

  # Splits the text with the first separator and recurses with the remaining
  # ones for every chunk that is larger than the chunk size.
  #
  # @param text [ String ] the text to split
  # @param separators [ Array<Regexp, String> ] the separators still to try
  #
  # @return [ Array<String> ] the chunks
  def split(text, separators: @separators)
    return [ text ] if separators.empty?
    separator = separators.first
    remaining = separators.drop(1)
    splitter  = Character.new(
      separator:         separator,
      include_separator: @include_separator,
      combining_string:  @combining_string,
      chunk_size:        @chunk_size
    )
    chunks = splitter.split(text)
    return [ text ] if chunks.count == 0
    chunks.flat_map do |chunk|
      chunk.size > @chunk_size ? split(chunk, separators: remaining) : [ chunk ]
    end
  end
end
72
+ end
@@ -0,0 +1,91 @@
1
+ module Documentrix::Documents::Splitters
2
# Splits a text into sentences and combines them into chunks again, starting
# a new chunk wherever the cosine distance between the embeddings of two
# consecutive sentences is above a threshold.
class Semantic
  include Documentrix::Utils::Math

  # The separator used by default to split the text into sentences.
  DEFAULT_SEPARATOR = /[.!?]\s*(?:\b|\z)/

  # @param ollama [ Object ] the client whose embed method is called
  # @param model [ String ] the model passed to embed
  # @param model_options [ Object, NilClass ] the options passed to embed
  # @param separator [ Regexp ] where to split the text into sentences
  # @param chunk_size [ Integer ] sentences are only added to the current
  #   chunk while the combined size stays below this value
  def initialize(ollama:, model:, model_options: nil, separator: DEFAULT_SEPARATOR, chunk_size: 4096)
    @ollama        = ollama
    @model         = model
    @model_options = model_options
    @separator     = separator
    @chunk_size    = chunk_size
  end

  # Splits the text into chunks of sentences.
  #
  # @param text [ String ] the text to split
  # @param batch_size [ Integer ] the number of sentences embedded at once
  # @param breakpoint [ Symbol ] how the threshold is computed, one of
  #   :percentile, :standard_deviation or :interquartile
  # @param opts [ Hash ] :include_separator (default true) is passed to the
  #   sentence splitter, :percentile / :percentage tune the threshold
  #
  # @raise [ ArgumentError ] if the breakpoint method is invalid
  #
  # @return [ Array<String> ] the chunks, or the sentences themselves if
  #   there are less than two embeddings or no distance is above the threshold
  def split(text, batch_size: 100, breakpoint: :percentile, **opts)
    sentence_splitter = Documentrix::Documents::Splitters::Character.new(
      separator: @separator,
      include_separator: opts.fetch(:include_separator, true),
      chunk_size: 1,
    )
    sentences  = sentence_splitter.split(text)
    embeddings = sentences.with_infobar(label: 'Split').
      each_slice(batch_size).each_with_object([]) do |batch, all|
        all.concat sentence_embeddings(batch)
        infobar.progress by: batch.size
      end
    infobar.newline
    return sentences if embeddings.size < 2
    distances = embeddings.each_cons(2).map do |left, right|
      1.0 - cosine_similarity(a: left, b: right)
    end
    threshold = calculate_breakpoint_threshold(breakpoint, distances, **opts)
    # indexes of the distances that are above the threshold
    gaps = distances.each_index.select { |i| distances[i] > threshold }
    return sentences if gaps.empty?
    # make sure the last group reaches up to the last sentence
    gaps << distances.size if gaps.last < distances.size
    last_index = sentences.size - 1
    gaps << last_index if gaps.last < last_index
    chunks = []
    start  = 0
    buffer = +''
    gaps.each do |gap|
      (start..gap).each do |i|
        sentence = sentences[i]
        if buffer.size + sentence.size >= @chunk_size
          chunks << buffer unless buffer.empty?
          buffer = sentence
        else
          buffer = buffer + sentence
        end
      end
      # a gap always ends the current chunk
      unless buffer.empty?
        chunks << buffer
        buffer = +''
      end
      start = gap.succ
    end
    chunks << buffer unless buffer.empty?
    chunks
  end

  private

  # Computes the distance above which a new chunk is started.
  #
  # @param breakpoint_method [ Symbol ] :percentile, :standard_deviation or
  #   :interquartile
  # @param distances [ Array<Float> ] the distances between the sentences
  # @param opts [ Hash ] :percentile (default 95) for :percentile,
  #   :percentage (default 100) for the other methods
  #
  # @raise [ ArgumentError ] if the breakpoint method is invalid
  #
  # @return [ Numeric ] the threshold
  def calculate_breakpoint_threshold(breakpoint_method, distances, **opts)
    sequence = MoreMath::Sequence.new(distances)
    case breakpoint_method
    when :percentile
      sequence.percentile(opts.fetch(:percentile, 95))
    when :standard_deviation
      factor    = opts.fetch(:percentage, 100) / 100.0
      threshold = sequence.mean + sequence.standard_deviation * factor
      threshold.clamp(0, sequence.max)
    when :interquartile
      factor    = opts.fetch(:percentage, 100) / 100.0
      threshold = sequence.mean + sequence.interquartile_range * factor
      threshold.clamp(0, sequence.max)
    else
      raise ArgumentError, "invalid breakpoint method #{breakpoint_method}"
    end
  end

  # Fetches the embeddings for the given sentences and converts each of them
  # in place with Numo::NArray[].
  #
  # @param input [ Array<String> ] the sentences
  #
  # @return [ Array ] the converted embeddings
  def sentence_embeddings(input)
    response = @ollama.embed(model: @model, input: input, options: @model_options)
    response.embeddings.map! { |embedding| Numo::NArray[*embedding] }
  end
end
91
+ end