documentrix 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 68c409476101e4632597c139494f3fc1fe67bc0af7a655e5e5b3e4ebbd58f58c
4
- data.tar.gz: fcd07ca7694b3fbed81c3a25f3fa9d9a675d120e5b82b43a13f5f71970012f3a
3
+ metadata.gz: 332c67275b90bcc797cdd8e8df75a751bcfa7a52c759c9bcbe4a55970e905401
4
+ data.tar.gz: c546f89c61613b11d18c2cd724cadae8c303a988f925ad1624c5121f511e88fd
5
5
  SHA512:
6
- metadata.gz: cf1e95f2d994bb130b89bff9d2e43062621b849b5d0f7fb4543092b1f491bab4b9dd9e51047c3dfb216090a4f9bc853c08ba35719dea7409c34a15758070fcba
7
- data.tar.gz: 59aa3347c07c2521f5661326a55d73733470ef30c62e86fc7d1ff53a4acb6cbc2da06e4c7814896b59ca16bdfab8933a7018f3f41ab30ce29593c5fac480f342
6
+ metadata.gz: c80a4c253b4fe367fae1dbb7e6aea50e274a495a0026a7b676e1e962b80c33806fe89976338c12931f11a5059e384fdfddcfe4665f68bde406b118edd8c04d15
7
+ data.tar.gz: '09292fd5d6aed6c939c0244467c743d515e68e5e253d7f443ef96a9c295a61689f1be5b9aec53e402287b6b6c7d305757eb26f79f5ade0161f7e82d534a7b3bf'
data/CHANGES.md CHANGED
@@ -1,5 +1,39 @@
1
1
  # Changes
2
2
 
3
+ ## 2026-05-20 v0.3.2
4
+
5
+ ### Performance Improvements
6
+
7
+ - Optimized collections lookup by memoizing
8
+ `Documentrix::Documents#collections` using `@collections_cache`.
9
+ - Added `Documentrix::Documents#invalidate_collections_cache!` to reset the
10
+ memoized list.
11
+ - Integrated `invalidate_collections_cache!` into `add`, `delete`, `clear`,
12
+ `source_remove`, and `rename_collection` to maintain cache consistency.
13
+
14
+ ### Database & Cache
15
+
16
+ - Standardized SQL keyword casing to uppercase for data types (e.g., `FLOAT`,
17
+ `TEXT`, `INTEGER`, `JSON`) within `SQLiteCache`.
18
+
19
+ ### Documentation & Testing
20
+
21
+ - Refined documentation and return type descriptions for
22
+ `Documentrix::Documents#find` and `Documentrix::Documents#prefix`.
23
+ - Updated `spec/documents_spec.rb` to ensure `invalidate_collections_cache!` is
24
+ correctly triggered during mutations.
25
+
26
+ ## 2026-05-18 v0.3.1
27
+
28
+ - Fixed scoping bugs in `clear_by_source` and `source_exist?` by implementing a
29
+ `key LIKE ?` constraint.
30
+ - Added regression tests for prefix isolation in
31
+ `spec/documentrix/documents/cache/sqlite_cache_spec.rb`.
32
+ - Introduced the `start_with_prefix` method in
33
+ `lib/documentrix/documents/cache/sqlite_cache.rb` to unify prefix patterns.
34
+ - Updated `tags`, `size`, `clear_all_with_prefix`, and `each` to utilize the
35
+ new `start_with_prefix` method.
36
+
3
37
  ## 2026-05-17 v0.3.0
4
38
 
5
39
  ### New Features
data/documentrix.gemspec CHANGED
@@ -1,9 +1,9 @@
1
1
  # -*- encoding: utf-8 -*-
2
- # stub: documentrix 0.3.0 ruby lib
2
+ # stub: documentrix 0.3.2 ruby lib
3
3
 
4
4
  Gem::Specification.new do |s|
5
5
  s.name = "documentrix".freeze
6
- s.version = "0.3.0".freeze
6
+ s.version = "0.3.2".freeze
7
7
 
8
8
  s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
9
9
  s.require_paths = ["lib".freeze]
@@ -114,7 +114,7 @@ class Documentrix::Documents::Cache::SQLiteCache
114
114
  result = Documentrix::Utils::Tags.new
115
115
  execute(%{
116
116
  SELECT DISTINCT(tags) FROM records WHERE key LIKE ?
117
- }, [ "#@prefix%" ]
117
+ }, [ start_with_prefix ]
118
118
  ).flatten.each do
119
119
  JSON(_1).each { |t| result.add(t) }
120
120
  end
@@ -126,7 +126,10 @@ class Documentrix::Documents::Cache::SQLiteCache
126
126
  #
127
127
  # @return [ Integer ] the count of records
128
128
  def size
129
- execute(%{SELECT COUNT(*) FROM records WHERE key LIKE ?}, [ "#@prefix%" ]).flatten.first
129
+ execute(
130
+ %{SELECT COUNT(*) FROM records WHERE key LIKE ?},
131
+ [ start_with_prefix ]
132
+ ).flatten.first
130
133
  end
131
134
 
132
135
  # The clear_for_tags method clears the cache for specific tags by deleting
@@ -154,7 +157,7 @@ class Documentrix::Documents::Cache::SQLiteCache
154
157
  #
155
158
  # @return [ Documentrix::Documents::Cache::SQLiteCache ] self
156
159
  def clear_all_with_prefix
157
- execute(%{DELETE FROM records WHERE key LIKE ?}, [ "#@prefix%" ])
160
+ execute(%{DELETE FROM records WHERE key LIKE ?}, [ start_with_prefix ])
158
161
  self
159
162
  end
160
163
 
@@ -172,9 +175,21 @@ class Documentrix::Documents::Cache::SQLiteCache
172
175
  def clear_by_source(source, digest: nil, operator: ?=)
173
176
  operator = '!=' if operator != ?=
174
177
  if digest
175
- execute(%{DELETE FROM records WHERE source = ? AND digest #{operator} ? }, [ source, digest ])
178
+ execute(
179
+ %{
180
+ DELETE FROM records
181
+ WHERE key LIKE ? AND source = ? AND digest #{operator} ?
182
+ },
183
+ [ start_with_prefix, source, digest ]
184
+ )
176
185
  else
177
- execute(%{DELETE FROM records WHERE source = ?}, [ source ])
186
+ execute(
187
+ %{
188
+ DELETE FROM records
189
+ WHERE key LIKE ? AND source = ?
190
+ },
191
+ [ start_with_prefix, source ]
192
+ )
178
193
  end
179
194
  self
180
195
  end
@@ -194,9 +209,19 @@ class Documentrix::Documents::Cache::SQLiteCache
194
209
  def source_exist?(source, digest: nil, operator: ?=)
195
210
  operator = '!=' if operator != ?=
196
211
  if digest
197
- !!execute(%{SELECT 1 FROM records WHERE source = ? AND digest #{operator} ? }, [ source, digest ]).first
212
+ !!execute(
213
+ %{
214
+ SELECT 1 FROM records WHERE key LIKE ? AND source = ? AND digest #{operator} ?
215
+ },
216
+ [ start_with_prefix, source, digest ]
217
+ ).first
198
218
  else
199
- !!execute(%{SELECT 1 FROM records WHERE source = ?}, [ source ]).first
219
+ !!execute(
220
+ %{
221
+ SELECT 1 FROM records WHERE key LIKE ? AND source = ?
222
+ },
223
+ [ start_with_prefix, source ]
224
+ ).first
200
225
  end
201
226
  end
202
227
 
@@ -211,10 +236,9 @@ class Documentrix::Documents::Cache::SQLiteCache
211
236
  block or return enum_for(__method__)
212
237
 
213
238
  execute(%{
214
- SELECT DISTINCT source
215
- FROM records
239
+ SELECT DISTINCT source FROM records
216
240
  WHERE key LIKE ? AND source IS NOT NULL
217
- }, [ "#@prefix%" ]).each do |source,|
241
+ }, [ start_with_prefix ]).each do |source,|
218
242
  source = source.full? or next
219
243
 
220
244
  block.(source)
@@ -257,7 +281,7 @@ class Documentrix::Documents::Cache::SQLiteCache
257
281
  # cache.each do |key, value|
258
282
  # puts "#{key}: #{value}"
259
283
  # end
260
- def each(prefix: "#@prefix%", &block)
284
+ def each(prefix: start_with_prefix, &block)
261
285
  block or return enum_for(__method__, prefix:)
262
286
 
263
287
  execute(%{
@@ -315,7 +339,7 @@ class Documentrix::Documents::Cache::SQLiteCache
315
339
  SELECT key, tags, embedding_id
316
340
  FROM records
317
341
  WHERE key LIKE ?#{tags_where}
318
- }, [ "#@prefix%" ])
342
+ }, [ start_with_prefix ])
319
343
  if tags_filter
320
344
  records = records.select { |key, tags, embedding_id|
321
345
  (tags_filter & JSON(tags.to_s).to_a).size >= 1
@@ -368,6 +392,13 @@ class Documentrix::Documents::Cache::SQLiteCache
368
392
 
369
393
  private
370
394
 
395
+ # Returns the SQL LIKE pattern for records starting with the current prefix.
396
+ #
397
+ # @return [ String ] the prefix pattern used in SQL WHERE clauses
398
+ def start_with_prefix
399
+ "#@prefix%"
400
+ end
401
+
371
402
  # The execute method executes an SQL query on the database by calling the
372
403
  # \@database.execute method.
373
404
  #
@@ -412,18 +443,18 @@ class Documentrix::Documents::Cache::SQLiteCache
412
443
  @database.enable_load_extension(false)
413
444
  execute %{
414
445
  CREATE VIRTUAL TABLE IF NOT EXISTS embeddings USING vec0(
415
- embedding float[#@embedding_length]
446
+ embedding FLOAT[#@embedding_length]
416
447
  )
417
448
  }
418
449
  execute %{
419
450
  CREATE TABLE IF NOT EXISTS records (
420
- key text NOT NULL PRIMARY KEY ON CONFLICT REPLACE,
421
- text text NOT NULL DEFAULT '',
422
- embedding_id integer,
423
- norm float NOT NULL DEFAULT 0.0,
424
- source text,
425
- digest text,
426
- tags json NOT NULL DEFAULT [],
451
+ key TEXT NOT NULL PRIMARY KEY ON CONFLICT REPLACE,
452
+ text TEXT NOT NULL DEFAULT '',
453
+ embedding_id INTEGER,
454
+ norm FLOAT NOT NULL DEFAULT 0.0,
455
+ source TEXT,
456
+ digest TEXT,
457
+ tags JSON NOT NULL DEFAULT [],
427
458
  FOREIGN KEY(embedding_id) REFERENCES embeddings(id) ON DELETE CASCADE
428
459
  )
429
460
  }
@@ -162,7 +162,7 @@ class Documentrix::Documents
162
162
  infobar.progress by: batch.size
163
163
  end
164
164
  infobar.newline
165
- self
165
+ invalidate_collections_cache!
166
166
  end
167
167
  alias << add
168
168
 
@@ -201,7 +201,9 @@ class Documentrix::Documents
201
201
  # @return [ FalseClass, TrueClass ] true if the text was removed, false
202
202
  # otherwise.
203
203
  def delete(text)
204
- @cache.delete(key(text))
204
+ res = @cache.delete(key(text))
205
+ invalidate_collections_cache! if res
206
+ res
205
207
  end
206
208
 
207
209
  # The size method returns the number of texts stored in the cache of this
@@ -220,7 +222,7 @@ class Documentrix::Documents
220
222
  # @return [ Documentrix::Documents ] self
221
223
  def clear(tags: nil)
222
224
  @cache.clear(tags:)
223
- self
225
+ invalidate_collections_cache!
224
226
  end
225
227
 
226
228
  # Normalizes the source identifier to a canonical form.
@@ -318,7 +320,7 @@ class Documentrix::Documents
318
320
  def source_remove(source, digest: nil)
319
321
  source = normalize_source(source)
320
322
  @cache.clear_by_source(source, digest:, operator: '!=')
321
- self
323
+ invalidate_collections_cache!
322
324
  end
323
325
 
324
326
  # The find method searches for strings within the cache by computing their
@@ -359,7 +361,7 @@ class Documentrix::Documents
359
361
  # @param text_count [Integer] the maximum number of records to return
360
362
  # @param opts [Hash] additional options passed to #find, such as:
361
363
  # * :tags [Array<String>] filter results by tags
362
- # * :prompt [String] a prompt to use for the search
364
+ # * :prompt [String] the prompt to use for the search
363
365
  # * :min_similarity [Numeric] minimum similarity score
364
366
  #
365
367
  # @example
@@ -379,7 +381,9 @@ class Documentrix::Documents
379
381
  #
380
382
  # @return [Array] An array of unique collection names
381
383
  def collections
382
- ([ default_collection ] + @cache.collections('%s-' % class_prefix)).uniq
384
+ @collections_cache ||= (
385
+ [ default_collection ] + @cache.collections('%s-' % class_prefix)
386
+ ).uniq
383
387
  end
384
388
 
385
389
  # Rename the current collection, moving all keys from the old prefix to a new
@@ -395,6 +399,7 @@ class Documentrix::Documents
395
399
  new_prefix = '%s-%s-' % [ class_prefix, new_collection ]
396
400
  @cache.move_prefix(prefix, new_prefix)
397
401
  self.collection = new_collection
402
+ invalidate_collections_cache!
398
403
  end
399
404
 
400
405
  # The tags method returns an array of unique tags from the cache.
@@ -406,6 +411,18 @@ class Documentrix::Documents
406
411
 
407
412
  private
408
413
 
414
+ # Resets the memoized list of collections.
415
+ #
416
+ # This is called whenever a mutation occurs that could change the set of
417
+ # existing collections, ensuring that the #collections method returns a
418
+ # fresh, accurate list on the next call.
419
+ #
420
+ # @return [ Documentrix::Documents ] self
421
+ def invalidate_collections_cache!
422
+ @collections_cache = nil
423
+ self
424
+ end
425
+
409
426
  # The connect_cache method initializes and returns an instance of the
410
427
  # specified cache class.
411
428
  #
@@ -482,7 +499,7 @@ class Documentrix::Documents
482
499
  # The prefix method returns a string that is used as the prefix for keys in
483
500
  # the cache of the currently configured collection.
484
501
  #
485
- # @return [ String ] The prefix string
502
+ # @return [ String ] the prefix string
486
503
  def prefix
487
504
  '%s-%s-' % [ class_prefix, @collection ]
488
505
  end
@@ -1,6 +1,6 @@
1
1
  module Documentrix
2
2
  # Documentrix version
3
- VERSION = '0.3.0'
3
+ VERSION = '0.3.2'
4
4
  VERSION_ARRAY = VERSION.split('.').map(&:to_i) # :nodoc:
5
5
  VERSION_MAJOR = VERSION_ARRAY[0] # :nodoc:
6
6
  VERSION_MINOR = VERSION_ARRAY[1] # :nodoc:
@@ -250,6 +250,47 @@ describe Documentrix::Documents::SQLiteCache do
250
250
  ]
251
251
  end
252
252
 
253
+ describe 'Prefix Isolation' do
254
+ let(:cache2) { cache.dup }
255
+
256
+ before do
257
+ cache2.prefix = 'other-'
258
+
259
+ # Setup shared sources and tags across prefixes
260
+ cache['foo'] = test_value.merge(source: 'shared.txt', tags: %w[ a ])
261
+ cache2['bar'] = test_value.merge(source: 'shared.txt', tags: %w[ a ])
262
+ end
263
+
264
+ it 'does not leak clear_by_source' do
265
+ expect {
266
+ cache.clear_by_source('shared.txt')
267
+ }.to change { cache.size }.from(1).to(0)
268
+
269
+ expect(cache2.size).to eq 1
270
+ expect(cache2.key?('bar')).to be true
271
+ end
272
+
273
+ it 'does not leak source_exist?' do
274
+ # Ensure we are checking a source that ONLY exists in the other prefix
275
+ cache.clear_all_with_prefix
276
+ cache2['baz'] = test_value.merge(source: 'only-in-2.txt')
277
+
278
+ expect(cache.source_exist?('only-in-2.txt')).to be false
279
+ expect(cache2.source_exist?('only-in-2.txt')).to be true
280
+ end
281
+
282
+ it 'does not leak tags' do
283
+ cache.clear_all_with_prefix
284
+ cache2.clear_all_with_prefix
285
+
286
+ cache['foo'] = test_value.merge(tags: %w[ prefix1 ])
287
+ cache2['bar'] = test_value.merge(tags: %w[ prefix2 ])
288
+
289
+ expect(cache.tags.to_a).to match_array(['prefix1'])
290
+ expect(cache2.tags.to_a).to match_array(['prefix2'])
291
+ end
292
+ end
293
+
253
294
  describe '#find_records' do
254
295
  let(:needle) { [ 0.5 ] * 1_024 }
255
296
 
@@ -23,6 +23,8 @@ describe Documentrix::Documents do
23
23
  expect(ollama).to receive(:embed).
24
24
  with(model:, input: %w[ foo bar ], options: nil).
25
25
  and_return(double(embeddings: [ [ 0.1 ], [ 0.2 ] ]))
26
+ expect(documents).to receive(:invalidate_collections_cache!).
27
+ and_call_original
26
28
  expect(documents.add(%w[ foo bar ])).to eq documents
27
29
  expect(documents.exist?('foo')).to eq true
28
30
  expect(documents.exist?('bar')).to eq true
@@ -33,6 +35,8 @@ describe Documentrix::Documents do
33
35
  expect(ollama).to receive(:embed).
34
36
  with(model:, input: %w[ foo ], options: nil).
35
37
  and_return(double(embeddings: [ [ 0.1 ] ]))
38
+ expect(documents).to receive(:invalidate_collections_cache!).
39
+ and_call_original
36
40
  expect(documents << 'foo').to eq documents
37
41
  expect(documents.exist?('foo')).to eq true
38
42
  expect(documents.exist?('bar')).to eq false
@@ -123,6 +127,8 @@ describe Documentrix::Documents do
123
127
 
124
128
  it 'can delete texts' do
125
129
  expect(documents << 'foo').to eq documents
130
+ expect(documents).to receive(:invalidate_collections_cache!).
131
+ and_call_original
126
132
  expect {
127
133
  documents.delete('foo')
128
134
  }.to change { documents.exist?('foo') }.from(true).to(false)
@@ -136,6 +142,8 @@ describe Documentrix::Documents do
136
142
 
137
143
  it 'can clear texts' do
138
144
  expect(documents << 'foo').to eq documents
145
+ expect(documents).to receive(:invalidate_collections_cache!).
146
+ and_call_original
139
147
  expect {
140
148
  documents.clear
141
149
  }.to change { documents.size }.from(1).to(0)
@@ -148,9 +156,13 @@ describe Documentrix::Documents do
148
156
  expect(documents.add('foo', tags: %w[ test ])).to eq documents
149
157
  expect(documents.add('bar', tags: %w[ test2 ])).to eq documents
150
158
  expect(documents.tags.to_a).to eq %w[ test test2 ]
159
+ expect(documents).to receive(:invalidate_collections_cache!).
160
+ and_call_original
151
161
  expect {
152
162
  documents.clear tags: 'test'
153
163
  }.to change { documents.size }.from(2).to(1)
164
+ expect(documents).to receive(:invalidate_collections_cache!).
165
+ and_call_original
154
166
  expect {
155
167
  documents.clear tags: :test2
156
168
  }.to change { documents.size }.from(1).to(0)
@@ -166,6 +178,8 @@ describe Documentrix::Documents do
166
178
 
167
179
  expect(documents.size).to eq 3
168
180
 
181
+ expect(documents).to receive(:invalidate_collections_cache!).
182
+ and_call_original
169
183
  documents.source_remove('source1')
170
184
 
171
185
  expect(documents.size).to eq 1
@@ -182,6 +196,8 @@ describe Documentrix::Documents do
182
196
  documents.collection = :foo
183
197
  documents << 'foo'
184
198
  expect(documents.collections).to eq %i[ default foo ]
199
+ expect(documents).to receive(:invalidate_collections_cache!).
200
+ and_call_original
185
201
  documents.rename_collection(:bar)
186
202
  expect(documents.collection).to eq :bar
187
203
  expect(documents.collections).to eq %i[ default bar ]
@@ -195,6 +211,8 @@ describe Documentrix::Documents do
195
211
  documents.collection = :bar
196
212
  documents << 'foo'
197
213
  expect(documents.collections).to eq %i[ default foo bar ]
214
+ expect(documents).not_to receive(:invalidate_collections_cache!).
215
+ and_call_original
198
216
  expect {
199
217
  documents.rename_collection(:foo)
200
218
  }.to raise_error(ArgumentError, 'new collection foo already exists!')
@@ -243,6 +261,8 @@ describe Documentrix::Documents do
243
261
  documents.add('foo', source: 's1')
244
262
 
245
263
  expect(ollama).not_to receive(:embed)
264
+ expect(documents).to receive(:invalidate_collections_cache!).
265
+ and_call_original
246
266
  documents.source_update(['foo'], source: 's1')
247
267
  expect(documents.exist?('foo')).to be true
248
268
  end
@@ -255,6 +275,8 @@ describe Documentrix::Documents do
255
275
  allow(documents.cache).to receive(:compute_file_digest).with('s1').and_return('d2')
256
276
 
257
277
  expect(ollama).to receive(:embed).once
278
+ expect(documents).to receive(:invalidate_collections_cache!).
279
+ at_least(1).and_call_original
258
280
  documents.source_update(['bar'], source: 's1')
259
281
 
260
282
  expect(documents.exist?('bar')).to be true
@@ -263,6 +285,8 @@ describe Documentrix::Documents do
263
285
 
264
286
  it 'updates the source if it is an URL' do
265
287
  expect(ollama).to receive(:embed).once
288
+ expect(documents).to receive(:invalidate_collections_cache!).
289
+ and_call_original
266
290
  documents.source_update('foo', source: 'https://www.example.com/s1')
267
291
  expect(documents.exist?('foo')).to be true
268
292
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: documentrix
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Florian Frank