documentrix 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +1 -0
  3. data/Gemfile +5 -0
  4. data/README.md +71 -0
  5. data/Rakefile +48 -0
  6. data/documentrix.gemspec +41 -0
  7. data/lib/documentrix/documents/cache/common.rb +43 -0
  8. data/lib/documentrix/documents/cache/memory_cache.rb +91 -0
  9. data/lib/documentrix/documents/cache/records.rb +145 -0
  10. data/lib/documentrix/documents/cache/redis_backed_memory_cache.rb +64 -0
  11. data/lib/documentrix/documents/cache/redis_cache.rb +128 -0
  12. data/lib/documentrix/documents/cache/sqlite_cache.rb +335 -0
  13. data/lib/documentrix/documents/splitters/character.rb +72 -0
  14. data/lib/documentrix/documents/splitters/semantic.rb +91 -0
  15. data/lib/documentrix/documents.rb +328 -0
  16. data/lib/documentrix/utils/colorize_texts.rb +65 -0
  17. data/lib/documentrix/utils/math.rb +48 -0
  18. data/lib/documentrix/utils/tags.rb +112 -0
  19. data/lib/documentrix/utils.rb +5 -0
  20. data/lib/documentrix/version.rb +8 -0
  21. data/lib/documentrix.rb +11 -0
  22. data/spec/assets/embeddings.json +1 -0
  23. data/spec/documentrix/documents/cache/memory_cache_spec.rb +98 -0
  24. data/spec/documentrix/documents/cache/redis_backed_memory_cache_spec.rb +121 -0
  25. data/spec/documentrix/documents/cache/redis_cache_spec.rb +123 -0
  26. data/spec/documentrix/documents/cache/sqlite_cache_spec.rb +141 -0
  27. data/spec/documentrix/documents/splitters/character_spec.rb +110 -0
  28. data/spec/documentrix/documents/splitters/semantic_spec.rb +56 -0
  29. data/spec/documents_spec.rb +174 -0
  30. data/spec/spec_helper.rb +23 -0
  31. data/spec/utils/colorize_texts_spec.rb +13 -0
  32. data/spec/utils/tags_spec.rb +53 -0
  33. metadata +329 -0
@@ -0,0 +1,335 @@
1
+ require 'documentrix/documents/cache/common'
2
+ require 'sqlite3'
3
+ require 'sqlite_vec'
4
+ require 'digest/md5'
5
+
6
+ class Documentrix::Documents::Cache::SQLiteCache
7
+ include Documentrix::Documents::Cache::Common
8
+
9
# Creates a cache that stores records and their embeddings in an SQLite
# database.
#
# @param prefix [ String ] the prefix for keys, passed on to super
# @param embedding_length [ Integer ] the length of the embeddings vector
# @param filename [ String ] the name of the SQLite database file or
#   ':memory:' for an in-memory database
# @param debug [ FalseClass, TrueClass ] if true, every executed statement
#   is additionally EXPLAINed on STDERR
#
# @return [ void ]
def initialize(prefix:, embedding_length: 1_024, filename: ':memory:', debug: false)
  super(prefix:)
  @embedding_length, @filename, @debug = embedding_length, filename, debug
  setup_database(filename)
end
25
+
26
+ attr_reader :filename # filename for the database, `:memory:` is in memory
27
+
28
+ attr_reader :embedding_length # length of the embeddings vector
29
+
30
# Looks up the record stored under the given key (the cache prefix is
# applied via `pre`) together with its embedding.
#
# @param [String] key The key for which to retrieve the value.
#
# @return [Documentrix::Documents::Record, NilClass] The record for the
#   key, or nil if no row matches.
def [](key)
  row = execute(
    %{
      SELECT records.key, records.text, records.norm, records.source,
        records.tags, embeddings.embedding
      FROM records
      INNER JOIN embeddings ON records.embedding_id = embeddings.rowid
      WHERE records.key = ?
    },
    pre(key)
  )&.first
  row or return
  key, text, norm, source, tags_json, packed = *row
  # Embeddings are stored as packed single precision floats.
  embedding = packed.unpack("f*")
  tags      = Documentrix::Utils::Tags.new(JSON(tags_json.to_s).to_a, source:)
  convert_value_to_record(key:, text:, norm:, source:, tags:, embedding:)
end
53
+
54
# The []= method stores the value under the given key by inserting its
# embedding into the embeddings table and the remaining attributes into the
# records table. Both inserts run inside one transaction; if any statement
# fails the transaction is rolled back and the error is re-raised, so a
# failed write no longer leaves an open transaction behind.
#
# @param [String] key the key to set (the cache prefix is applied via `pre`)
# @param [Hash, Documentrix::Documents::Record] value the hash or record
#   containing the text, embedding, and other metadata
def []=(key, value)
  value     = convert_value_to_record(value)
  # Embeddings are stored as packed single precision floats.
  embedding = value.embedding.pack("f*")
  execute(%{BEGIN})
  begin
    execute(%{INSERT INTO embeddings(embedding) VALUES(?)}, [ embedding ])
    embedding_id, = execute(%{ SELECT last_insert_rowid() }).flatten
    execute(%{
      INSERT INTO records(key,text,embedding_id,norm,source,tags)
      VALUES(?,?,?,?,?,?)
    }, [ pre(key), value.text, embedding_id, value.norm, value.source, JSON(value.tags) ])
    execute(%{COMMIT})
  rescue
    execute(%{ROLLBACK})
    raise
  end
end
72
+
73
# Tells whether a record is stored under the given key by counting the
# matching rows.
#
# @param [String] key the key to check for existence
#
# @return [FalseClass, TrueClass] true if the key exists, false otherwise
def key?(key)
  count, = execute(
    %{ SELECT count(records.key) FROM records WHERE records.key = ? },
    pre(key)
  ).flatten
  count == 1
end
85
+
86
# Removes the record stored under the given key from the records table.
#
# @param key [ String ] the key to be deleted
#
# @return [ FalseClass, TrueClass ] true if the key existed before the
#   deletion, false otherwise
def delete(key)
  # The existence check has to run before the row is removed.
  key?(key).tap do
    execute(%{ DELETE FROM records WHERE records.key = ? }, pre(key))
  end
end
99
+
100
# Collects the tags of all records whose key starts with this cache's
# prefix.
#
# @return [Documentrix::Utils::Tags] An instance of Documentrix::Utils::Tags
#   to which every tag found in the matching records was added.
def tags
  all_tags = Documentrix::Utils::Tags.new
  rows = execute(%{
    SELECT DISTINCT(tags) FROM records WHERE key LIKE ?
  }, [ "#@prefix%" ])
  # Each row holds one JSON encoded array of tags.
  rows.flatten.each do |tags_json|
    JSON(tags_json).each { |tag| all_tags.add(tag) }
  end
  all_tags
end
114
+
115
# Counts the records stored in the cache, that is the ones whose key starts
# with prefix `prefix`.
#
# @return [ Integer ] the count of records
def size
  count, = execute(
    %{SELECT COUNT(*) FROM records WHERE key LIKE ?}, [ "#@prefix%" ]
  ).flatten
  count
end
122
+
123
# The clear_for_tags method clears the cache for specific tags by deleting
# records that match those tags and have the prefix `prefix`. Without tags
# all records with the prefix are deleted via `clear`. If no record matches
# the given tags nothing is deleted.
#
# @param tags [Array<String>, NilClass] An array of tag names to clear from
#   the cache or nil for all records
#
# @return [Documentrix::Documents::Cache::SQLiteCache] The SQLiteCache instance
#   after clearing the specified tags.
def clear_for_tags(tags = nil)
  tags = Documentrix::Utils::Tags.new(tags).to_a
  if tags.empty?
    clear
  else
    keys = find_records_for_tags(tags).map(&:first)
    # Keys are bound as parameters instead of being interpolated into the
    # statement; batches keep the number of bound values per statement small.
    keys.each_slice(500) do |batch|
      placeholders = ([ '?' ] * batch.size).join(',')
      execute(%{DELETE FROM records WHERE key IN (#{placeholders})}, batch)
    end
  end
  self
end
142
+
143
# Deletes every record whose key starts with prefix `prefix` from the
# records table.
#
# @return [ Documentrix::Documents::Cache::SQLiteCache ] self
def clear
  tap { execute(%{DELETE FROM records WHERE key LIKE ?}, [ "#@prefix%" ]) }
end
151
+
152
# The each method iterates over the records whose key matches the given LIKE
# pattern and yields them to the block. Called without a block it returns an
# Enumerator instead of failing.
#
# @param prefix [ String ] the SQL LIKE pattern keys have to match, by
#   default the cache's prefix followed by `%`
# @yield [ key, value ] where key is the record's key as stored in the
#   database and value is the record itself
#
# @example
#   cache.each do |key, value|
#     puts "#{key}: #{value}"
#   end
#
# @return [ Documentrix::Documents::Cache::SQLiteCache, Enumerator ] self
#   if a block was given, an Enumerator otherwise
def each(prefix: "#@prefix%", &block)
  block or return enum_for(__method__, prefix:)
  execute(%{
    SELECT records.key, records.text, records.norm, records.source,
      records.tags, embeddings.embedding
    FROM records
    INNER JOIN embeddings ON records.embedding_id = embeddings.rowid
    WHERE records.key LIKE ?
  }, [ prefix ]).each do |key, text, norm, source, tags, embedding|
    # Embeddings are stored as packed single precision floats.
    embedding = embedding.unpack("f*")
    tags      = Documentrix::Utils::Tags.new(JSON(tags.to_s).to_a, source:)
    value     = convert_value_to_record(key:, text:, norm:, source:, tags:, embedding:)
    block.(key, value)
  end
  self
end
177
+ include Enumerable
178
+
179
# Iterates over all keys and values in the cache regardless of their prefix
# by calling `each` with a pattern that matches every key.
#
# @yield [ key, value ]
#
# @return [ Documentrix::Documents::Cache::SQLiteCache ] self
def full_each(&block) = each(prefix: '%', &block)
188
+
189
# Returns the given vector unchanged, because this cache class needs no
# conversion.
#
# @param vector [ Array ] the input vector
#
# @return [ Array ] the very same vector
def convert_to_vector(vector) = vector
198
+
199
# The find_records_for_tags method returns the rows of all records with the
# cache's prefix, optionally restricted to records carrying at least one of
# the given tags.
#
# The tag values are bound as LIKE parameters instead of being interpolated
# into the statement. The LIKE match is only a coarse prefilter on the JSON
# text; the exact comparison against the decoded tags happens in Ruby
# afterwards.
#
# @param tags [ Array, NilClass ] an array of tag names, or nothing to get
#   all records with the prefix
#
# @return [ Array<Array> ] rows of the form [ key, tags, embedding_id ]
def find_records_for_tags(tags)
  tags_filter = tags.present? ? Documentrix::Utils::Tags.new(tags).to_a : nil
  binds       = [ "#@prefix%" ]
  tags_where  = nil
  if tags_filter && !tags_filter.empty?
    tags_where = ' AND (%s)' % ([ 'tags LIKE ?' ] * tags_filter.size).join(' OR ')
    binds.concat(tags_filter.map { "%#{_1}%" })
  end
  records = execute(%{
    SELECT key, tags, embedding_id
    FROM records
    WHERE key LIKE ?#{tags_where}
  }, binds)
  tags_filter or return records
  records.select { |_key, record_tags, _embedding_id|
    (tags_filter & JSON(record_tags.to_s).to_a).size >= 1
  }
end
225
+
226
# Runs a vector search for the given needle among the records with the
# cache's prefix, optionally restricted to records matching the given tags.
#
# @param needle [ Array ] the embedding vector
# @param tags [ Array ] the list of tags to filter by (optional)
# @param max_records [ Integer ] the maximum number of records to return
#   (optional), capped by the cache size and 4096
#
# @raise [ ArgumentError ] if needle size does not match embedding length
#
# @example
#   documents.find_records([ 0.1 ] * 1_024, tags: %w[ test ])
#
# @return [ Array<Documentrix::Documents::Record> ] the list of matching
#   records, with the prefix removed from their keys
def find_records(needle, tags: nil, max_records: nil)
  if needle.size != @embedding_length
    raise ArgumentError, "needle embedding length != %s" % @embedding_length
  end
  needle_binary = needle.pack("f*")
  max_records   = [ max_records, size, 4_096 ].compact.min
  # The last column of every candidate row is its embedding rowid.
  rowids        = find_records_for_tags(tags).transpose.last
  rowids_where  = '(%s)' % rowids&.join(',')
  rows = execute(%{
    SELECT records.key, records.text, records.norm, records.source,
      records.tags, embeddings.embedding
    FROM records
    INNER JOIN embeddings ON records.embedding_id = embeddings.rowid
    WHERE embeddings.rowid IN #{rowids_where}
      AND embeddings.embedding MATCH ? AND embeddings.k = ?
  }, [ needle_binary, max_records ])
  rows.map do |key, text, norm, source, tags_json, packed|
    key       = unpre(key)
    embedding = packed.unpack("f*")
    row_tags  = Documentrix::Utils::Tags.new(JSON(tags_json.to_s).to_a, source:)
    convert_value_to_record(key:, text:, norm:, source:, tags: row_tags, embedding:)
  end
end
261
+
262
+ private
263
+
264
# Executes an SQL statement on the database by calling the
# \@database.execute method. In debug mode the statement is dedented and
# printed to STDERR together with the result of its EXPLAIN beforehand.
#
# @param [Array] a the arguments to be passed to the @database.execute method
#
# @return [Result] the result of the executed query
def execute(*a)
  if @debug
    statement, *binds = a
    sql    = statement.gsub(/^\s*\n/, '')       # drop blank lines
    indent = sql[/\A\s+/].to_s.size             # width of the first line's indent
    sql    = sql.sub(/\A\s+/, '').gsub(/^\s{0,#{indent}}/, '').chomp
    plan   = @database.execute("EXPLAIN #{sql}", *binds).pretty_inspect
    STDERR.puts("EXPLANATION:\n%s\n%s" % [ sql, plan ])
  end
  @database.execute(*a)
end
284
+
285
# Quotes the given string by delegating to SQLite3::Database.quote(string).
#
# @param string [String] the input string
#
# @return [String] the quoted string
def quote(string) = SQLite3::Database.quote(string)
294
+
295
# Opens the SQLite database, loads the sqlite-vec extension into it and
# creates the `embeddings` virtual table and the `records` table unless they
# already exist.
#
# @param filename [ String ] the name of the SQLite database file
#
# @return [ nil ]
def setup_database(filename)
  @database = SQLite3::Database.new(filename)
  # Extension loading is only enabled while sqlite-vec is being loaded.
  @database.enable_load_extension(true)
  SqliteVec.load(@database)
  @database.enable_load_extension(false)
  [
    %{
      CREATE VIRTUAL TABLE IF NOT EXISTS embeddings USING vec0(
        embedding float[#@embedding_length]
      )
    },
    %{
      CREATE TABLE IF NOT EXISTS records (
        key          text NOT NULL PRIMARY KEY ON CONFLICT REPLACE,
        text         text NOT NULL DEFAULT '',
        embedding_id integer,
        norm         float NOT NULL DEFAULT 0.0,
        source       text,
        tags         json NOT NULL DEFAULT [],
        FOREIGN KEY(embedding_id) REFERENCES embeddings(id) ON DELETE CASCADE
      )
    },
  ].each { |statement| execute statement }
  nil
end
324
+
325
# Turns the given value into a Documentrix::Documents::Record object. A
# value that already is a record is returned as is, anything else is
# converted via its `to_hash` representation.
#
# @param value [ Documentrix::Documents::Record, Hash ] the value to be converted
#
# @return [ Documentrix::Documents::Record ] the converted record object
def convert_value_to_record(value)
  return value if value.is_a?(Documentrix::Documents::Record)
  Documentrix::Documents::Record[value.to_hash]
end
335
+ end
@@ -0,0 +1,72 @@
1
+ module Documentrix::Documents::Splitters
2
# Splits a text at a separator and recombines the resulting pieces into
# chunks of roughly `chunk_size` characters.
class Character
  # By default texts are split at two or more consecutive line breaks.
  DEFAULT_SEPARATOR = /(?:\r?\n){2,}/

  # @param separator [ Regexp, String ] where to split the text
  # @param include_separator [ FalseClass, TrueClass ] whether the matched
  #   separators are kept at the end of the piece preceding them
  # @param combining_string [ String ] the string appended after pieces that
  #   are combined into a chunk
  # @param chunk_size [ Integer ] the size limit used when combining pieces
  def initialize(separator: DEFAULT_SEPARATOR, include_separator: false, combining_string: "\n\n", chunk_size: 4096)
    @separator, @include_separator, @combining_string, @chunk_size =
      separator, include_separator, combining_string, chunk_size
    if include_separator
      # The capture group makes String#split yield the separators as well.
      @separator = Regexp.new("(#@separator)")
    end
  end

  # Splits the text into pieces and combines consecutive pieces as long as
  # the chunk built so far plus the next piece stays below the chunk size
  # (the combining string itself isn't counted). A piece that doesn't fit
  # starts a new chunk; pieces that are too large on their own are returned
  # unsplit.
  #
  # @param text [ String ] the text to split
  #
  # @return [ Array<String> ] the chunks
  def split(text)
    texts = []
    text.split(@separator) do |t|
      if @include_separator && t =~ @separator
        texts.last&.concat t
      else
        texts.push(t)
      end
    end
    result       = []
    current_text = +''
    # True while current_text ends with a piece that was not yet followed by
    # the combining string, i.e. right after a new chunk was started.
    unterminated = false
    texts.each do |t|
      if current_text.size + t.size < @chunk_size
        # Without this the piece that started the chunk and the next piece
        # would be glued together without anything in between.
        current_text << @combining_string if unterminated
        current_text << t << @combining_string
        unterminated = false
      else
        current_text.empty? or result << current_text
        current_text = t.dup
        unterminated = !t.empty?
      end
    end
    current_text.empty? or result << current_text
    result
  end
end
36
+
37
# Splits a text with a list of separators: chunks that are still larger than
# the chunk size after splitting with one separator are split again with the
# remaining separators.
class RecursiveCharacter
  # Separators from coarse to fine: paragraphs, lines, word boundaries and
  # finally single characters.
  DEFAULT_SEPARATORS = [
    /(?:\r?\n){2,}/,
    /\r?\n/,
    /\b/,
    //,
  ].freeze

  # @param separators [ Array<Regexp, String> ] the separators to try in order
  # @param include_separator [ FalseClass, TrueClass ] passed on to Character
  # @param combining_string [ String ] passed on to Character
  # @param chunk_size [ Integer ] the chunk size above which a chunk is split again
  #
  # @raise [ ArgumentError ] if separators is empty
  def initialize(separators: DEFAULT_SEPARATORS, include_separator: false, combining_string: "\n\n", chunk_size: 4096)
    if separators.empty?
      raise ArgumentError, "non-empty array of separators required"
    end
    @separators        = separators
    @include_separator = include_separator
    @combining_string  = combining_string
    @chunk_size        = chunk_size
  end

  # Splits the text with the first separator and recursively splits every
  # chunk larger than the chunk size with the remaining ones.
  #
  # @param text [ String ] the text to split
  # @param separators [ Array<Regexp, String> ] the separators still to be tried
  #
  # @return [ Array<String> ] the chunks
  def split(text, separators: @separators)
    return [ text ] if separators.empty?
    separator, *remaining = separators
    splitter = Character.new(
      separator:,
      include_separator: @include_separator,
      combining_string: @combining_string,
      chunk_size: @chunk_size
    )
    chunks = splitter.split(text)
    return [ text ] if chunks.empty?
    chunks.flat_map do |chunk|
      chunk.size > @chunk_size ? split(chunk, separators: remaining) : [ chunk ]
    end
  end
end
72
+ end
@@ -0,0 +1,91 @@
1
+ module Documentrix::Documents::Splitters
2
+ class Semantic
3
+ include Documentrix::Utils::Math
4
+
5
+ DEFAULT_SEPARATOR = /[.!?]\s*(?:\b|\z)/
6
+
7
# @param ollama [ Object ] the client whose `embed` method is used to
#   compute sentence embeddings
# @param model [ String ] the embedding model passed to `embed`
# @param model_options [ Object, NilClass ] the options passed to `embed`
# @param separator [ Regexp ] where to split the text into sentences
# @param chunk_size [ Integer ] the size limit used when combining sentences
def initialize(ollama:, model:, model_options: nil, separator: DEFAULT_SEPARATOR, chunk_size: 4096)
  @ollama        = ollama
  @model         = model
  @model_options = model_options
  @separator     = separator
  @chunk_size    = chunk_size
end
11
+
12
# Splits the text into sentences, computes an embedding for every sentence
# and starts a new chunk wherever the cosine distance between two
# consecutive sentences exceeds the breakpoint threshold. Within such a
# group sentences are combined as long as they stay below the chunk size.
#
# @param text [ String ] the text to split
# @param batch_size [ Integer ] the number of sentences embedded per request
# @param breakpoint [ Symbol ] the method used to compute the threshold, see
#   calculate_breakpoint_threshold
# @param opts [ Hash ] `include_separator` (default true) for the sentence
#   splitter, everything is also passed on to calculate_breakpoint_threshold
#
# @return [ Array<String> ] the chunks, or the sentences themselves if there
#   are fewer than two embeddings or no distance exceeds the threshold
def split(text, batch_size: 100, breakpoint: :percentile, **opts)
  # A chunk size of 1 keeps the splitter from combining sentences again.
  splitter = Documentrix::Documents::Splitters::Character.new(
    separator: @separator,
    include_separator: opts.fetch(:include_separator, true),
    chunk_size: 1,
  )
  sentences  = splitter.split(text)
  embeddings = []
  sentences.with_infobar(label: 'Split').each_slice(batch_size) do |batch|
    embeddings.concat sentence_embeddings(batch)
    infobar.progress by: batch.size
  end
  infobar.newline
  return sentences if embeddings.size < 2
  distances = embeddings.each_cons(2).map { |a, b| 1.0 - cosine_similarity(a:, b:) }
  max_distance = calculate_breakpoint_threshold(breakpoint, distances, **opts)
  # Index i is a gap if sentence i and sentence i + 1 are too far apart.
  gaps = distances.each_index.select { |i| distances[i] > max_distance }
  return sentences if gaps.empty?
  # Make sure the last group reaches up to the final sentence.
  gaps << distances.size if gaps.last < distances.size
  gaps << sentences.size - 1 if gaps.last < sentences.size - 1
  result = []
  first  = 0
  gaps.each do |gap|
    chunk = +''
    (first..gap).each do |i|
      sentence = sentences[i]
      if chunk.size + sentence.size < @chunk_size
        chunk += sentence
      else
        result << chunk unless chunk.empty?
        chunk = sentence
      end
    end
    result << chunk unless chunk.empty?
    first = gap.succ
  end
  result
end
61
+
62
+ private
63
+
64
# Computes the distance above which two consecutive sentences are
# considered to belong to different chunks.
#
# @param breakpoint_method [ Symbol ] one of :percentile,
#   :standard_deviation or :interquartile
# @param distances [ Array<Float> ] the distances between consecutive sentences
# @param opts [ Hash ] `percentile` (default 95) for :percentile,
#   `percentage` (default 100) for the other two methods
#
# @raise [ ArgumentError ] if the breakpoint method is unknown
#
# @return [ Numeric ] the threshold
def calculate_breakpoint_threshold(breakpoint_method, distances, **opts)
  sequence = MoreMath::Sequence.new(distances)
  # Mean plus a percentage of the given spread, limited to 0..max.
  scaled = -> spread do
    percentage = opts.fetch(:percentage, 100)
    (sequence.mean + spread * (percentage / 100.0)).clamp(0, sequence.max)
  end
  case breakpoint_method
  when :percentile         then sequence.percentile(opts.fetch(:percentile, 95))
  when :standard_deviation then scaled.(sequence.standard_deviation)
  when :interquartile      then scaled.(sequence.interquartile_range)
  else
    raise ArgumentError, "invalid breakpoint method #{breakpoint_method}"
  end
end
84
+
85
# Requests the embeddings for the given sentences from the client and
# converts each of them into a Numo::NArray.
#
# @param input [ Array<String> ] the sentences to embed
#
# @return [ Array<Numo::NArray> ] one vector per returned embedding
def sentence_embeddings(input)
  response = @ollama.embed(model: @model, input:, options: @model_options)
  response.embeddings.map! { |vector| Numo::NArray[*vector] }
end
90
+ end
91
+ end