documentrix 0.3.2 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 332c67275b90bcc797cdd8e8df75a751bcfa7a52c759c9bcbe4a55970e905401
4
- data.tar.gz: c546f89c61613b11d18c2cd724cadae8c303a988f925ad1624c5121f511e88fd
3
+ metadata.gz: 11ea07d3133f8de898211353a4bcff7f926b9f75114696af333e928862ee5aeb
4
+ data.tar.gz: b60d8606b1974bcb43e6584f690ecb21a7e54af6915b09d9adb904bc9269b417
5
5
  SHA512:
6
- metadata.gz: c80a4c253b4fe367fae1dbb7e6aea50e274a495a0026a7b676e1e962b80c33806fe89976338c12931f11a5059e384fdfddcfe4665f68bde406b118edd8c04d15
7
- data.tar.gz: '09292fd5d6aed6c939c0244467c743d515e68e5e253d7f443ef96a9c295a61689f1be5b9aec53e402287b6b6c7d305757eb26f79f5ade0161f7e82d534a7b3bf'
6
+ metadata.gz: ae907af900fc6932de6b2d022a4ca97367d6e1901442ed30e838eb7bf128b5e5d7dd572862f29cfe6196f58553d394e390062287b21cac6185e2c6845244ee5a
7
+ data.tar.gz: 75b95e852cbd6b412ae2651668db5cbffcb9f1c5845dd3bdfb2ea252380b9ab37d15f4b8381398f941c4c1dabeb1e7a174e9e6ef7dc4e0dc153455c4c0413b1e
data/CHANGES.md CHANGED
@@ -1,5 +1,41 @@
1
1
  # Changes
2
2
 
3
+ ## 2026-06-16 v0.5.0
4
+
5
+ ### Improvements
6
+
7
+ - Enhanced SQLite concurrency and prevented database locks:
8
+ - Added `database_busy_timeout` parameter to
9
+ `Documentrix::Documents#initialize`, defaulting to **5000**ms.
10
+ - Updated `Documentrix::Documents#connect_cache` to pass the timeout value
11
+ to the cache backend.
12
+ - Implemented `busy_timeout` support in
13
+ `Documentrix::Documents::Cache::SQLiteCache#initialize`.
14
+ - Configured `@database.busy_handler_timeout` in
15
+ `Documentrix::Documents::Cache::SQLiteCache#setup_database` to ensure
16
+ GVL-friendly waiting during lock contention.
17
+ - Prevented immediate `SQLITE_BUSY` errors on writes by updating
18
+ `Documentrix::Documents::Cache::SQLiteCache#[]=` to use `BEGIN IMMEDIATE`
19
+ instead of `BEGIN`, avoiding transaction upgrade failures.
20
+
21
+ ## 2026-05-22 v0.4.0
22
+
23
+ ### Added
24
+
25
+ - Added introspection methods to `Documents`:
26
+ - Implemented `sources` to return an array of unique source identifiers.
27
+ - Implemented `each_record` to iterate over records, returning an
28
+ `Enumerator` if no block is provided.
29
+
30
+ ### Changed
31
+
32
+ - Updated CI images from Alpine to Debian:
33
+ - Switched Ruby **4.0**, **3.4**, **3.3**, and **3.2** images to the
34
+ `trixie` distribution.
35
+ - Switched Ruby **3.1** image to the `bookworm` distribution.
36
+ - Updated the `dockerfile` in `.all_images.yml` to use `apt-get` instead of
37
+ `apk` for installing build dependencies.
38
+
3
39
  ## 2026-05-20 v0.3.2
4
40
 
5
41
  ### Performance Improvements
data/documentrix.gemspec CHANGED
@@ -1,9 +1,9 @@
1
1
  # -*- encoding: utf-8 -*-
2
- # stub: documentrix 0.3.2 ruby lib
2
+ # stub: documentrix 0.5.0 ruby lib
3
3
 
4
4
  Gem::Specification.new do |s|
5
5
  s.name = "documentrix".freeze
6
- s.version = "0.3.2".freeze
6
+ s.version = "0.5.0".freeze
7
7
 
8
8
  s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
9
9
  s.require_paths = ["lib".freeze]
@@ -21,13 +21,15 @@ class Documentrix::Documents::Cache::SQLiteCache
21
21
  # @param embedding_length [ Integer ] the length of the embeddings vector
22
22
  # @param filename [ String ] the name of the SQLite database file or ':memory:' for in-memory.
23
23
  # @param debug [ FalseClass, TrueClass ] whether to enable debugging
24
+ # @param busy_timeout [ Integer ] the SQLite busy timeout in milliseconds (defaults to 5000)
24
25
  #
25
26
  # @return [ void ]
26
- def initialize(prefix:, embedding_length: 1_024, filename: ':memory:', debug: false)
27
+ def initialize(prefix:, embedding_length: 1_024, filename: ':memory:', debug: false, busy_timeout: 5000)
27
28
  super(prefix:)
28
29
  @embedding_length = embedding_length
29
30
  @filename = filename
30
31
  @debug = debug
32
+ @busy_timeout = busy_timeout
31
33
  setup_database(filename)
32
34
  end
33
35
 
@@ -69,7 +71,7 @@ class Documentrix::Documents::Cache::SQLiteCache
69
71
  value = convert_value_to_record(value)
70
72
  digest = compute_file_digest(value.source)
71
73
  embedding = value.embedding.pack("f*")
72
- execute(%{BEGIN})
74
+ execute(%{BEGIN IMMEDIATE})
73
75
  execute(%{INSERT INTO embeddings(embedding) VALUES(?)}, [ embedding ])
74
76
  embedding_id, = execute(%{ SELECT last_insert_rowid() }).flatten
75
77
  execute(%{
@@ -170,6 +172,8 @@ class Documentrix::Documents::Cache::SQLiteCache
170
172
  # @param source [String] the source identifier used to filter records
171
173
  # @param digest [String, nil] the SHA256 hexadecimal digest of the source.
172
174
  # Records matching this digest will be preserved.
175
+ # @param operator [String] the operator to use for comparison ('=' or '!=').
176
+ # Defaults to '='.
173
177
  #
174
178
  # @return [self] the cache instance for method chaining
175
179
  def clear_by_source(source, digest: nil, operator: ?=)
@@ -438,6 +442,7 @@ class Documentrix::Documents::Cache::SQLiteCache
438
442
  # @return [ nil ]
439
443
  def setup_database(filename)
440
444
  @database = SQLite3::Database.new(filename)
445
+ @database.busy_handler_timeout = @busy_timeout
441
446
  @database.enable_load_extension(true)
442
447
  SqliteVec.load(@database)
443
448
  @database.enable_load_extension(false)
@@ -76,12 +76,13 @@ class Documentrix::Documents
76
76
  # @param database_filename [ String ] the filename of the SQLite database to use (defaults to ':memory:')
77
77
  # @param redis_url [ String ] the URL of the Redis server to use (defaults to nil)
78
78
  # @param debug [ FalseClass, TrueClass ] whether to enable debugging mode (defaults to false)
79
- def initialize(ollama:, model:, model_options: nil, collection: nil, embedding_length: 1_024, cache: MemoryCache, database_filename: nil, redis_url: nil, debug: false)
79
+ # @param database_busy_timeout [ Integer ] the SQLite busy timeout in milliseconds (defaults to 5000)
80
+ def initialize(ollama:, model:, model_options: nil, collection: nil, embedding_length: 1_024, cache: MemoryCache, database_filename: nil, redis_url: nil, debug: false, database_busy_timeout: 5000)
80
81
  collection ||= default_collection
81
82
  @ollama, @model, @model_options, @collection, @debug =
82
83
  ollama, model, model_options, collection.to_sym, debug
83
84
  database_filename ||= ':memory:'
84
- @cache = connect_cache(cache, redis_url, embedding_length, database_filename)
85
+ @cache = connect_cache(cache, redis_url, embedding_length, database_filename, database_busy_timeout)
85
86
  end
86
87
 
87
88
  # The default_collection method returns the default collection name.
@@ -104,7 +105,7 @@ class Documentrix::Documents
104
105
  # The prepare_texts method filters out existing texts from the input array
105
106
  # and returns the filtered array.
106
107
  #
107
- # @param texts [ Array ] an array of text strings or #read objects.
108
+ # @param texts [ Array ] an array of text strings
108
109
  #
109
110
  # @return [ Array ] the filtered array of text strings
110
111
  private def prepare_texts(texts)
@@ -409,6 +410,22 @@ class Documentrix::Documents
409
410
  @cache.tags
410
411
  end
411
412
 
413
+ # Returns an array of all unique sources stored in the cache.
414
+ #
415
+ # @return [Array<String>] An array of unique source identifiers.
416
+ def sources
417
+ @cache.each_source.to_a
418
+ end
419
+
420
+ # The each_record method iterates over all records stored in the cache.
421
+ #
422
+ # @yield [record] The record being iterated over.
423
+ # @return [ Enumerator ] an enumerator if no block is provided.
424
+ def each_record(&block)
425
+ block or return enum_for(__method__)
426
+ @cache.each { |_key, record| block.(record) }
427
+ end
428
+
412
429
  private
413
430
 
414
431
  # Resets the memoized list of collections.
@@ -430,9 +447,10 @@ class Documentrix::Documents
430
447
  # @param redis_url [String] the URL of the Redis server
431
448
  # @param embedding_length [Integer] the length of the embeddings used in the cache
432
449
  # @param database_filename [String] the filename of the SQLite database file
450
+ # @param database_busy_timeout [Integer] the SQLite busy timeout in milliseconds
433
451
  #
434
452
  # @return [CacheInstance] an instance of the specified cache class
435
- def connect_cache(cache_class, redis_url, embedding_length, database_filename)
453
+ def connect_cache(cache_class, redis_url, embedding_length, database_filename, database_busy_timeout)
436
454
  cache = nil
437
455
  if (cache_class.instance_method(:redis) rescue nil)
438
456
  begin
@@ -449,6 +467,7 @@ class Documentrix::Documents
449
467
  prefix:,
450
468
  embedding_length:,
451
469
  filename: database_filename,
470
+ busy_timeout: database_busy_timeout,
452
471
  debug: @debug
453
472
  )
454
473
  end
@@ -1,6 +1,6 @@
1
1
  module Documentrix
2
2
  # Documentrix version
3
- VERSION = '0.3.2'
3
+ VERSION = '0.5.0'
4
4
  VERSION_ARRAY = VERSION.split('.').map(&:to_i) # :nodoc:
5
5
  VERSION_MAJOR = VERSION_ARRAY[0] # :nodoc:
6
6
  VERSION_MINOR = VERSION_ARRAY[1] # :nodoc:
@@ -226,6 +226,32 @@ describe Documentrix::Documents do
226
226
  from(:default).
227
227
  to(:new_collection)
228
228
  end
229
+
230
+ it 'returns unique sources' do
231
+ allow(ollama).to receive(:embed).and_return(double(embeddings: [ [ 0.1 ] ]))
232
+ documents.add('foo', source: 's1')
233
+ documents.add('bar', source: 's1')
234
+ documents.add('baz', source: 's2')
235
+ expect(documents.sources).to match_array %w[ s1 s2 ]
236
+ end
237
+
238
+ it 'can iterate over records' do
239
+ allow(ollama).to receive(:embed).and_return(double(embeddings: [ [ 0.1 ] ]))
240
+ documents.add('foo')
241
+ documents.add('bar')
242
+ records = []
243
+ documents.each_record { |r| records << r }
244
+ expect(records.size).to eq 2
245
+ expect(records.map(&:text)).to match_array %w[ foo bar ]
246
+ end
247
+
248
+ it 'returns an enumerator for each_record' do
249
+ allow(ollama).to receive(:embed).and_return(double(embeddings: [ [ 0.1 ] ]))
250
+ documents.add('foo')
251
+ documents.add('bar')
252
+ expect(documents.each_record).to be_a Enumerator
253
+ expect(documents.each_record.map(&:text)).to match_array %w[ foo bar ]
254
+ end
229
255
  end
230
256
 
231
257
  context 'source management' do
@@ -233,7 +259,7 @@ describe Documentrix::Documents do
233
259
  allow(documents).to receive(:compute_file_digest).and_return('d1')
234
260
  allow(documents.cache).to receive(:compute_file_digest).and_return('d1')
235
261
 
236
- allow(ollama).to receive(:embed).and_return(double(embeddings: [[0.1]]))
262
+ allow(ollama).to receive(:embed).and_return(double(embeddings: [ [ 0.1 ] ]))
237
263
  end
238
264
 
239
265
  it 'can check if a source exists' do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: documentrix
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.2
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Florian Frank