documentrix 0.4.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e32a72c0a1f93a96f7c3cecd185f13a0f17a2629b2dc509ec3a29fd2d7b51a41
4
- data.tar.gz: 3f2c21125adf7061dcba94f1843456064245222839de0df6ddbcd8743a1aea13
3
+ metadata.gz: ced4bf69e3ae1b4251844dd7915f8d75e72d1c40410f34c14311f70dd55c91f4
4
+ data.tar.gz: 6fc0799e4559f50a22d211e630ea5f7deaac671f24805f5501c85a11c7a9b853
5
5
  SHA512:
6
- metadata.gz: 91dbf3ddfdeb124661ff78a4cbc1635dbb8bdf6606078aeeef77b4f9d14688d073261ad08b59f0333895c7391d83eff8a6c76467f19af4f04ca55172c1d934a5
7
- data.tar.gz: d40e5a53ceeda71c7a2be37d3bfb718ffc2ddb0fddc0c6eb2346c295d06713c895493854fab5f768af075bec2595e2f23c40e893d7f7266c6f61cd95c534d733
6
+ metadata.gz: 8023a4b4d7cad8948e6cbd2a65cefd8b3993ae3de38ae5520a2ebf178989a78efb4c1296b51c614c4066c547372f9ad8067b6c8e196caaa098876a3adf44f523
7
+ data.tar.gz: 0c4544a80ecde5c98c1d3da64ffeb97155324fdc063993caddf6d2d0780c490b0c8ec24dbc2ca392b3ceb6023d964f58bad099ca4c509a6e43c33c4f7406beb0
data/CHANGES.md CHANGED
@@ -1,5 +1,43 @@
1
1
  # Changes
2
2
 
3
+ ## 2026-06-17 v0.6.0
4
+
5
+ ### Changed
6
+
7
+ - Refactored collection discovery to ensure strong consistency across multiple
8
+ client instances by delegating `Documentrix::Documents#collections` directly
9
+ to the cache backend.
10
+ - Removed `@collections_cache` and the `invalidate_collections_cache!` method
11
+ from `lib/documentrix/documents.rb`.
12
+ - Implemented a high-performance `#collections` method in
13
+ `lib/documentrix/documents/cache/redis_cache.rb` utilizing `scan_each`.
14
+ - Implemented a high-performance `#collections` method in
15
+ `lib/documentrix/documents/cache/sqlite_cache.rb` using SQL `DISTINCT`.
16
+ - Added specialized unit tests for collection extraction and regex patterns
17
+ within `spec/documentrix/documents/cache/redis_cache_spec.rb` and
18
+ `spec/documentrix/documents/cache/sqlite_cache_spec.rb`.
19
+ - Updated `spec/documents_spec.rb` and
20
+ `spec/documentrix/documents/cache/interface_spec.rb` to remove dependencies
21
+ on the deleted invalidation method.
22
+
23
+ ## 2026-06-16 v0.5.0
24
+
25
+ ### Improvements
26
+
27
+ - Enhanced SQLite concurrency and prevented database locks:
28
+ - Added `database_busy_timeout` parameter to
29
+ `Documentrix::Documents#initialize`, defaulting to **5000**ms.
30
+ - Updated `Documentrix::Documents#connect_cache` to pass the timeout value
31
+ to the cache backend.
32
+ - Implemented `busy_timeout` support in
33
+ `Documentrix::Documents::Cache::SQLiteCache#initialize`.
34
+ - Configured `@database.busy_handler_timeout` in
35
+ `Documentrix::Documents::Cache::SQLiteCache#setup_database` to ensure
36
+ GVL-friendly waiting during lock contention.
37
+ - Prevented immediate `SQLITE_BUSY` errors on writes by updating
38
+ `Documentrix::Documents::Cache::SQLiteCache#[]=` to use `BEGIN IMMEDIATE`
39
+ instead of `BEGIN`, avoiding transaction upgrade failures.
40
+
3
41
  ## 2026-05-22 v0.4.0
4
42
 
5
43
  ### Added
data/documentrix.gemspec CHANGED
@@ -1,9 +1,9 @@
1
1
  # -*- encoding: utf-8 -*-
2
- # stub: documentrix 0.4.0 ruby lib
2
+ # stub: documentrix 0.6.0 ruby lib
3
3
 
4
4
  Gem::Specification.new do |s|
5
5
  s.name = "documentrix".freeze
6
- s.version = "0.4.0".freeze
6
+ s.version = "0.6.0".freeze
7
7
 
8
8
  s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
9
9
  s.require_paths = ["lib".freeze]
@@ -101,8 +101,23 @@ class Documentrix::Documents::RedisCache
101
101
  s
102
102
  end
103
103
 
104
- # The clear_all_with_prefix method removes all key-value pairs associated
105
- # with the given prefix from this cache instance.
104
+ # Returns the unique collection names extracted from keys starting with the given prefix.
105
+ # This is a high-performance override for Redis that only queries keys.
106
+ #
107
+ # @param prefix [String] the key prefix that precedes the collection name
108
+ # @return [Array<Symbol>] an array of matching collection names
109
+ def collections(prefix)
110
+ unique = Set.new
111
+ redis.scan_each(match: "#{prefix}*") do |key|
112
+ if key =~ /\A#{prefix}(.+)-/
113
+ unique << $1.to_sym
114
+ end
115
+ end
116
+ unique.to_a
117
+ end
118
+
119
+ # The clear_all_with_prefix method removes all key-value pairs associated with
120
+ # the given prefix from this cache instance.
106
121
  #
107
122
  # @return [Documentrix::Documents::RedisCache] self
108
123
  def clear_all_with_prefix
@@ -21,13 +21,15 @@ class Documentrix::Documents::Cache::SQLiteCache
21
21
  # @param embedding_length [ Integer ] the length of the embeddings vector
22
22
  # @param filename [ String ] the name of the SQLite database file or ':memory:' for in-memory.
23
23
  # @param debug [ FalseClass, TrueClass ] whether to enable debugging
24
+ # @param busy_timeout [ Integer ] the SQLite busy timeout in milliseconds (defaults to 5000)
24
25
  #
25
26
  # @return [ void ]
26
- def initialize(prefix:, embedding_length: 1_024, filename: ':memory:', debug: false)
27
+ def initialize(prefix:, embedding_length: 1_024, filename: ':memory:', debug: false, busy_timeout: 5000)
27
28
  super(prefix:)
28
29
  @embedding_length = embedding_length
29
30
  @filename = filename
30
31
  @debug = debug
32
+ @busy_timeout = busy_timeout
31
33
  setup_database(filename)
32
34
  end
33
35
 
@@ -69,7 +71,7 @@ class Documentrix::Documents::Cache::SQLiteCache
69
71
  value = convert_value_to_record(value)
70
72
  digest = compute_file_digest(value.source)
71
73
  embedding = value.embedding.pack("f*")
72
- execute(%{BEGIN})
74
+ execute(%{BEGIN IMMEDIATE})
73
75
  execute(%{INSERT INTO embeddings(embedding) VALUES(?)}, [ embedding ])
74
76
  embedding_id, = execute(%{ SELECT last_insert_rowid() }).flatten
75
77
  execute(%{
@@ -106,6 +108,22 @@ class Documentrix::Documents::Cache::SQLiteCache
106
108
  result
107
109
  end
108
110
 
111
+ # Returns the unique collection names extracted from keys starting with the given prefix.
112
+ # This is a high-performance override for SQLite that only queries keys.
113
+ #
114
+ # @param prefix [String] the key prefix that precedes the collection name
115
+ # @return [Array<Symbol>] an array of matching collection names
116
+ def collections(prefix)
117
+ execute(
118
+ %{ SELECT DISTINCT key FROM records WHERE key LIKE ? },
119
+ [ "#{prefix}%" ]
120
+ ).flatten.each_with_object(Set.new) do |key, set|
121
+ if key =~ /\A#{prefix}(.+)-/
122
+ set << $1.to_sym
123
+ end
124
+ end.to_a
125
+ end
126
+
109
127
  # The tags method returns an array of unique tags from the database.
110
128
  #
111
129
  # @return [Documentrix::Utils::Tags] An instance of Documentrix::Utils::Tags
@@ -170,6 +188,8 @@ class Documentrix::Documents::Cache::SQLiteCache
170
188
  # @param source [String] the source identifier used to filter records
171
189
  # @param digest [String, nil] the SHA256 hexadecimal digest of the source.
172
190
  # Records matching this digest will be preserved.
191
+ # @param operator [String] the operator to use for comparison ('=' or '!=').
192
+ # Defaults to '='.
173
193
  #
174
194
  # @return [self] the cache instance for method chaining
175
195
  def clear_by_source(source, digest: nil, operator: ?=)
@@ -438,6 +458,7 @@ class Documentrix::Documents::Cache::SQLiteCache
438
458
  # @return [ nil ]
439
459
  def setup_database(filename)
440
460
  @database = SQLite3::Database.new(filename)
461
+ @database.busy_handler_timeout = @busy_timeout
441
462
  @database.enable_load_extension(true)
442
463
  SqliteVec.load(@database)
443
464
  @database.enable_load_extension(false)
@@ -76,12 +76,13 @@ class Documentrix::Documents
76
76
  # @param database_filename [ String ] the filename of the SQLite database to use (defaults to ':memory:')
77
77
  # @param redis_url [ String ] the URL of the Redis server to use (defaults to nil)
78
78
  # @param debug [ FalseClass, TrueClass ] whether to enable debugging mode (defaults to false)
79
- def initialize(ollama:, model:, model_options: nil, collection: nil, embedding_length: 1_024, cache: MemoryCache, database_filename: nil, redis_url: nil, debug: false)
79
+ # @param database_busy_timeout [ Integer ] the SQLite busy timeout in milliseconds (defaults to 5000)
80
+ def initialize(ollama:, model:, model_options: nil, collection: nil, embedding_length: 1_024, cache: MemoryCache, database_filename: nil, redis_url: nil, debug: false, database_busy_timeout: 5000)
80
81
  collection ||= default_collection
81
82
  @ollama, @model, @model_options, @collection, @debug =
82
83
  ollama, model, model_options, collection.to_sym, debug
83
84
  database_filename ||= ':memory:'
84
- @cache = connect_cache(cache, redis_url, embedding_length, database_filename)
85
+ @cache = connect_cache(cache, redis_url, embedding_length, database_filename, database_busy_timeout)
85
86
  end
86
87
 
87
88
  # The default_collection method returns the default collection name.
@@ -104,7 +105,7 @@ class Documentrix::Documents
104
105
  # The prepare_texts method filters out existing texts from the input array
105
106
  # and returns the filtered array.
106
107
  #
107
- # @param texts [ Array ] an array of text strings or #read objects.
108
+ # @param texts [ Array ] an array of text strings
108
109
  #
109
110
  # @return [ Array ] the filtered array of text strings
110
111
  private def prepare_texts(texts)
@@ -162,7 +163,7 @@ class Documentrix::Documents
162
163
  infobar.progress by: batch.size
163
164
  end
164
165
  infobar.newline
165
- invalidate_collections_cache!
166
+ self
166
167
  end
167
168
  alias << add
168
169
 
@@ -201,9 +202,7 @@ class Documentrix::Documents
201
202
  # @return [ FalseClass, TrueClass ] true if the text was removed, false
202
203
  # otherwise.
203
204
  def delete(text)
204
- res = @cache.delete(key(text))
205
- invalidate_collections_cache! if res
206
- res
205
+ @cache.delete(key(text))
207
206
  end
208
207
 
209
208
  # The size method returns the number of texts stored in the cache of this
@@ -222,7 +221,7 @@ class Documentrix::Documents
222
221
  # @return [ Documentrix::Documents ] self
223
222
  def clear(tags: nil)
224
223
  @cache.clear(tags:)
225
- invalidate_collections_cache!
224
+ self
226
225
  end
227
226
 
228
227
  # Normalizes the source identifier to a canonical form.
@@ -320,7 +319,7 @@ class Documentrix::Documents
320
319
  def source_remove(source, digest: nil)
321
320
  source = normalize_source(source)
322
321
  @cache.clear_by_source(source, digest:, operator: '!=')
323
- invalidate_collections_cache!
322
+ self
324
323
  end
325
324
 
326
325
  # The find method searches for strings within the cache by computing their
@@ -381,9 +380,7 @@ class Documentrix::Documents
381
380
  #
382
381
  # @return [Array] An array of unique collection names
383
382
  def collections
384
- @collections_cache ||= (
385
- [ default_collection ] + @cache.collections('%s-' % class_prefix)
386
- ).uniq
383
+ [ default_collection ].concat(@cache.collections('%s-' % class_prefix)).uniq
387
384
  end
388
385
 
389
386
  # Rename the current collection, moving all keys from the old prefix to a new
@@ -399,7 +396,7 @@ class Documentrix::Documents
399
396
  new_prefix = '%s-%s-' % [ class_prefix, new_collection ]
400
397
  @cache.move_prefix(prefix, new_prefix)
401
398
  self.collection = new_collection
402
- invalidate_collections_cache!
399
+ self
403
400
  end
404
401
 
405
402
  # The tags method returns an array of unique tags from the cache.
@@ -427,18 +424,6 @@ class Documentrix::Documents
427
424
 
428
425
  private
429
426
 
430
- # Resets the memoized list of collections.
431
- #
432
- # This is called whenever a mutation occurs that could change the set of
433
- # existing collections, ensuring that the #collections method returns a
434
- # fresh, accurate list on the next call.
435
- #
436
- # @return [ Documentrix::Documents ] self
437
- def invalidate_collections_cache!
438
- @collections_cache = nil
439
- self
440
- end
441
-
442
427
  # The connect_cache method initializes and returns an instance of the
443
428
  # specified cache class.
444
429
  #
@@ -446,9 +431,10 @@ class Documentrix::Documents
446
431
  # @param redis_url [String] the URL of the Redis server
447
432
  # @param embedding_length [Integer] the length of the embeddings used in the cache
448
433
  # @param database_filename [String] the filename of the SQLite database file
434
+ # @param database_busy_timeout [Integer] the SQLite busy timeout in milliseconds
449
435
  #
450
436
  # @return [CacheInstance] an instance of the specified cache class
451
- def connect_cache(cache_class, redis_url, embedding_length, database_filename)
437
+ def connect_cache(cache_class, redis_url, embedding_length, database_filename, database_busy_timeout)
452
438
  cache = nil
453
439
  if (cache_class.instance_method(:redis) rescue nil)
454
440
  begin
@@ -465,6 +451,7 @@ class Documentrix::Documents
465
451
  prefix:,
466
452
  embedding_length:,
467
453
  filename: database_filename,
454
+ busy_timeout: database_busy_timeout,
468
455
  debug: @debug
469
456
  )
470
457
  end
@@ -1,6 +1,6 @@
1
1
  module Documentrix
2
2
  # Documentrix version
3
- VERSION = '0.4.0'
3
+ VERSION = '0.6.0'
4
4
  VERSION_ARRAY = VERSION.split('.').map(&:to_i) # :nodoc:
5
5
  VERSION_MAJOR = VERSION_ARRAY[0] # :nodoc:
6
6
  VERSION_MINOR = VERSION_ARRAY[1] # :nodoc:
@@ -101,7 +101,7 @@ describe 'Documentrix::Documents::Cache Interface' do
101
101
 
102
102
  # Common methods from Cache::Common
103
103
  expect(cache).to respond_to(:collections)
104
- expect(cache.method(:collections).owner).to eq Documentrix::Documents::Cache::Common
104
+ expect(cache.method(:collections).owner).to eq Documentrix::Documents::RedisCache
105
105
 
106
106
  expect(cache).to respond_to(:pre)
107
107
  expect(cache.method(:pre).owner).to eq Documentrix::Documents::Cache::Common
@@ -167,7 +167,7 @@ describe 'Documentrix::Documents::Cache Interface' do
167
167
 
168
168
  # Common methods from Cache::Common
169
169
  expect(cache).to respond_to(:collections)
170
- expect(cache.method(:collections).owner).to eq Documentrix::Documents::Cache::Common
170
+ expect(cache.method(:collections).owner).to eq Documentrix::Documents::Cache::SQLiteCache
171
171
 
172
172
  expect(cache).to respond_to(:pre)
173
173
  expect(cache.method(:pre).owner).to eq Documentrix::Documents::Cache::Common
@@ -184,5 +184,29 @@ describe Documentrix::Documents::RedisCache do
184
184
  expect(redis).to receive(:get).with("#{prefix}foo").and_return(JSON(source: 's1', digest: 'd1'))
185
185
  expect(cache.source_exist?('s1', digest: 'd2')).to be false
186
186
  end
187
+
188
+ describe '#collections' do
189
+ it 'extracts unique collection names from keys' do
190
+ expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
191
+ "#{prefix}col1-foo"
192
+ ).and_yield(
193
+ "#{prefix}col1-bar"
194
+ ).and_yield(
195
+ "#{prefix}col2-baz"
196
+ )
197
+
198
+ expect(cache.collections(prefix)).to match_array([:col1, :col2])
199
+ end
200
+
201
+ it 'ignores keys that do not follow the collection pattern' do
202
+ expect(redis).to receive(:scan_each).with(match: "#{prefix}*").and_yield(
203
+ "#{prefix}valid-foo"
204
+ ).and_yield(
205
+ "#{prefix}invalid" # No trailing dash after the name
206
+ )
207
+
208
+ expect(cache.collections(prefix)).to eq [:valid]
209
+ end
210
+ end
187
211
  end
188
212
  end
@@ -353,4 +353,29 @@ describe Documentrix::Documents::SQLiteCache do
353
353
  expect(cache.find_records(needle)).to eq []
354
354
  end
355
355
  end
356
+
357
+ describe '#collections' do
358
+ it 'extracts unique collection names matching the prefix' do
359
+ # Since cache['key'] = val stores as "#{prefix}#{key}",
360
+ # we can create keys like "col1-foo" to get "test-col1-foo"
361
+ cache['col1-foo'] = test_value
362
+ cache['col1-bar'] = test_value
363
+ cache['col2-baz'] = test_value
364
+ cache['justprefix'] = test_value # Matches prefix, but not the pattern "prefix(name)-"
365
+
366
+ expect(cache.collections('test-')).to match_array([:col1, :col2])
367
+ end
368
+
369
+ it 'returns empty array when no keys match the prefix' do
370
+ cache['foo'] = test_value
371
+ expect(cache.collections('nonexistent-')).to eq []
372
+ end
373
+
374
+ it 'returns empty array when keys start with prefix but lack a following hyphen' do
375
+ # We need a key that starts with "test-" but doesn't have another "-" later.
376
+ # Because cache['foo'] = val results in "test-foo", this is exactly what happens.
377
+ cache['foo'] = test_value
378
+ expect(cache.collections('test-')).to eq []
379
+ end
380
+ end
356
381
  end
@@ -23,8 +23,6 @@ describe Documentrix::Documents do
23
23
  expect(ollama).to receive(:embed).
24
24
  with(model:, input: %w[ foo bar ], options: nil).
25
25
  and_return(double(embeddings: [ [ 0.1 ], [ 0.2 ] ]))
26
- expect(documents).to receive(:invalidate_collections_cache!).
27
- and_call_original
28
26
  expect(documents.add(%w[ foo bar ])).to eq documents
29
27
  expect(documents.exist?('foo')).to eq true
30
28
  expect(documents.exist?('bar')).to eq true
@@ -35,8 +33,6 @@ describe Documentrix::Documents do
35
33
  expect(ollama).to receive(:embed).
36
34
  with(model:, input: %w[ foo ], options: nil).
37
35
  and_return(double(embeddings: [ [ 0.1 ] ]))
38
- expect(documents).to receive(:invalidate_collections_cache!).
39
- and_call_original
40
36
  expect(documents << 'foo').to eq documents
41
37
  expect(documents.exist?('foo')).to eq true
42
38
  expect(documents.exist?('bar')).to eq false
@@ -127,8 +123,6 @@ describe Documentrix::Documents do
127
123
 
128
124
  it 'can delete texts' do
129
125
  expect(documents << 'foo').to eq documents
130
- expect(documents).to receive(:invalidate_collections_cache!).
131
- and_call_original
132
126
  expect {
133
127
  documents.delete('foo')
134
128
  }.to change { documents.exist?('foo') }.from(true).to(false)
@@ -142,8 +136,6 @@ describe Documentrix::Documents do
142
136
 
143
137
  it 'can clear texts' do
144
138
  expect(documents << 'foo').to eq documents
145
- expect(documents).to receive(:invalidate_collections_cache!).
146
- and_call_original
147
139
  expect {
148
140
  documents.clear
149
141
  }.to change { documents.size }.from(1).to(0)
@@ -156,13 +148,9 @@ describe Documentrix::Documents do
156
148
  expect(documents.add('foo', tags: %w[ test ])).to eq documents
157
149
  expect(documents.add('bar', tags: %w[ test2 ])).to eq documents
158
150
  expect(documents.tags.to_a).to eq %w[ test test2 ]
159
- expect(documents).to receive(:invalidate_collections_cache!).
160
- and_call_original
161
151
  expect {
162
152
  documents.clear tags: 'test'
163
153
  }.to change { documents.size }.from(2).to(1)
164
- expect(documents).to receive(:invalidate_collections_cache!).
165
- and_call_original
166
154
  expect {
167
155
  documents.clear tags: :test2
168
156
  }.to change { documents.size }.from(1).to(0)
@@ -178,8 +166,6 @@ describe Documentrix::Documents do
178
166
 
179
167
  expect(documents.size).to eq 3
180
168
 
181
- expect(documents).to receive(:invalidate_collections_cache!).
182
- and_call_original
183
169
  documents.source_remove('source1')
184
170
 
185
171
  expect(documents.size).to eq 1
@@ -196,8 +182,6 @@ describe Documentrix::Documents do
196
182
  documents.collection = :foo
197
183
  documents << 'foo'
198
184
  expect(documents.collections).to eq %i[ default foo ]
199
- expect(documents).to receive(:invalidate_collections_cache!).
200
- and_call_original
201
185
  documents.rename_collection(:bar)
202
186
  expect(documents.collection).to eq :bar
203
187
  expect(documents.collections).to eq %i[ default bar ]
@@ -211,8 +195,6 @@ describe Documentrix::Documents do
211
195
  documents.collection = :bar
212
196
  documents << 'foo'
213
197
  expect(documents.collections).to eq %i[ default foo bar ]
214
- expect(documents).not_to receive(:invalidate_collections_cache!).
215
- and_call_original
216
198
  expect {
217
199
  documents.rename_collection(:foo)
218
200
  }.to raise_error(ArgumentError, 'new collection foo already exists!')
@@ -287,8 +269,6 @@ describe Documentrix::Documents do
287
269
  documents.add('foo', source: 's1')
288
270
 
289
271
  expect(ollama).not_to receive(:embed)
290
- expect(documents).to receive(:invalidate_collections_cache!).
291
- and_call_original
292
272
  documents.source_update(['foo'], source: 's1')
293
273
  expect(documents.exist?('foo')).to be true
294
274
  end
@@ -301,8 +281,6 @@ describe Documentrix::Documents do
301
281
  allow(documents.cache).to receive(:compute_file_digest).with('s1').and_return('d2')
302
282
 
303
283
  expect(ollama).to receive(:embed).once
304
- expect(documents).to receive(:invalidate_collections_cache!).
305
- at_least(1).and_call_original
306
284
  documents.source_update(['bar'], source: 's1')
307
285
 
308
286
  expect(documents.exist?('bar')).to be true
@@ -311,8 +289,6 @@ describe Documentrix::Documents do
311
289
 
312
290
  it 'updates the source if it is an URL' do
313
291
  expect(ollama).to receive(:embed).once
314
- expect(documents).to receive(:invalidate_collections_cache!).
315
- and_call_original
316
292
  documents.source_update('foo', source: 'https://www.example.com/s1')
317
293
  expect(documents.exist?('foo')).to be true
318
294
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: documentrix
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Florian Frank