documentrix 0.3.2 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGES.md +36 -0
- data/documentrix.gemspec +2 -2
- data/lib/documentrix/documents/cache/sqlite_cache.rb +7 -2
- data/lib/documentrix/documents.rb +23 -4
- data/lib/documentrix/version.rb +1 -1
- data/spec/documents_spec.rb +27 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 11ea07d3133f8de898211353a4bcff7f926b9f75114696af333e928862ee5aeb
|
|
4
|
+
data.tar.gz: b60d8606b1974bcb43e6584f690ecb21a7e54af6915b09d9adb904bc9269b417
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ae907af900fc6932de6b2d022a4ca97367d6e1901442ed30e838eb7bf128b5e5d7dd572862f29cfe6196f58553d394e390062287b21cac6185e2c6845244ee5a
|
|
7
|
+
data.tar.gz: 75b95e852cbd6b412ae2651668db5cbffcb9f1c5845dd3bdfb2ea252380b9ab37d15f4b8381398f941c4c1dabeb1e7a174e9e6ef7dc4e0dc153455c4c0413b1e
|
data/CHANGES.md
CHANGED
|
@@ -1,5 +1,41 @@
|
|
|
1
1
|
# Changes
|
|
2
2
|
|
|
3
|
+
## 2026-06-16 v0.5.0
|
|
4
|
+
|
|
5
|
+
### Improvements
|
|
6
|
+
|
|
7
|
+
- Enhanced SQLite concurrency and prevented database locks:
|
|
8
|
+
- Added `database_busy_timeout` parameter to
|
|
9
|
+
`Documentrix::Documents#initialize`, defaulting to **5000**ms.
|
|
10
|
+
- Updated `Documentrix::Documents#connect_cache` to pass the timeout value
|
|
11
|
+
to the cache backend.
|
|
12
|
+
- Implemented `busy_timeout` support in
|
|
13
|
+
`Documentrix::Documents::Cache::SQLiteCache#initialize`.
|
|
14
|
+
- Configured `@database.busy_handler_timeout` in
|
|
15
|
+
`Documentrix::Documents::Cache::SQLiteCache#setup_database` to ensure
|
|
16
|
+
GVL-friendly waiting during lock contention.
|
|
17
|
+
- Prevented immediate `SQLITE_BUSY` errors on writes by updating
|
|
18
|
+
`Documentrix::Documents::Cache::SQLiteCache#[]=` to use `BEGIN IMMEDIATE`
|
|
19
|
+
instead of `BEGIN`, avoiding transaction upgrade failures.
|
|
20
|
+
|
|
21
|
+
## 2026-05-22 v0.4.0
|
|
22
|
+
|
|
23
|
+
### Added
|
|
24
|
+
|
|
25
|
+
- Added introspection methods to `Documents`:
|
|
26
|
+
- Implemented `sources` to return an array of unique source identifiers.
|
|
27
|
+
- Implemented `each_record` to iterate over records, returning an
|
|
28
|
+
`Enumerator` if no block is provided.
|
|
29
|
+
|
|
30
|
+
### Changed
|
|
31
|
+
|
|
32
|
+
- Updated CI images from Alpine to Debian:
|
|
33
|
+
- Switched Ruby **4.0**, **3.4**, **3.3**, and **3.2** images to the
|
|
34
|
+
`trixie` distribution.
|
|
35
|
+
- Switched Ruby **3.1** image to the `bookworm` distribution.
|
|
36
|
+
- Updated the `dockerfile` in `.all_images.yml` to use `apt-get` instead of
|
|
37
|
+
`apk` for installing build dependencies.
|
|
38
|
+
|
|
3
39
|
## 2026-05-20 v0.3.2
|
|
4
40
|
|
|
5
41
|
### Performance Improvements
|
data/documentrix.gemspec
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
|
2
|
-
# stub: documentrix 0.3.2 ruby lib
|
|
2
|
+
# stub: documentrix 0.5.0 ruby lib
|
|
3
3
|
|
|
4
4
|
Gem::Specification.new do |s|
|
|
5
5
|
s.name = "documentrix".freeze
|
|
6
|
-
s.version = "0.3.2".freeze
|
|
6
|
+
s.version = "0.5.0".freeze
|
|
7
7
|
|
|
8
8
|
s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
|
|
9
9
|
s.require_paths = ["lib".freeze]
|
|
data/lib/documentrix/documents/cache/sqlite_cache.rb
CHANGED
|
@@ -21,13 +21,15 @@ class Documentrix::Documents::Cache::SQLiteCache
|
|
|
21
21
|
# @param embedding_length [ Integer ] the length of the embeddings vector
|
|
22
22
|
# @param filename [ String ] the name of the SQLite database file or ':memory:' for in-memory.
|
|
23
23
|
# @param debug [ FalseClass, TrueClass ] whether to enable debugging
|
|
24
|
+
# @param busy_timeout [ Integer ] the SQLite busy timeout in milliseconds (defaults to 5000)
|
|
24
25
|
#
|
|
25
26
|
# @return [ void ]
|
|
26
|
-
def initialize(prefix:, embedding_length: 1_024, filename: ':memory:', debug: false)
|
|
27
|
+
def initialize(prefix:, embedding_length: 1_024, filename: ':memory:', debug: false, busy_timeout: 5000)
|
|
27
28
|
super(prefix:)
|
|
28
29
|
@embedding_length = embedding_length
|
|
29
30
|
@filename = filename
|
|
30
31
|
@debug = debug
|
|
32
|
+
@busy_timeout = busy_timeout
|
|
31
33
|
setup_database(filename)
|
|
32
34
|
end
|
|
33
35
|
|
|
@@ -69,7 +71,7 @@ class Documentrix::Documents::Cache::SQLiteCache
|
|
|
69
71
|
value = convert_value_to_record(value)
|
|
70
72
|
digest = compute_file_digest(value.source)
|
|
71
73
|
embedding = value.embedding.pack("f*")
|
|
72
|
-
execute(%{BEGIN})
|
|
74
|
+
execute(%{BEGIN IMMEDIATE})
|
|
73
75
|
execute(%{INSERT INTO embeddings(embedding) VALUES(?)}, [ embedding ])
|
|
74
76
|
embedding_id, = execute(%{ SELECT last_insert_rowid() }).flatten
|
|
75
77
|
execute(%{
|
|
@@ -170,6 +172,8 @@ class Documentrix::Documents::Cache::SQLiteCache
|
|
|
170
172
|
# @param source [String] the source identifier used to filter records
|
|
171
173
|
# @param digest [String, nil] the SHA256 hexadecimal digest of the source.
|
|
172
174
|
# Records matching this digest will be preserved.
|
|
175
|
+
# @param operator [String] the operator to use for comparison ('=' or '!=').
|
|
176
|
+
# Defaults to '='.
|
|
173
177
|
#
|
|
174
178
|
# @return [self] the cache instance for method chaining
|
|
175
179
|
def clear_by_source(source, digest: nil, operator: ?=)
|
|
@@ -438,6 +442,7 @@ class Documentrix::Documents::Cache::SQLiteCache
|
|
|
438
442
|
# @return [ nil ]
|
|
439
443
|
def setup_database(filename)
|
|
440
444
|
@database = SQLite3::Database.new(filename)
|
|
445
|
+
@database.busy_handler_timeout = @busy_timeout
|
|
441
446
|
@database.enable_load_extension(true)
|
|
442
447
|
SqliteVec.load(@database)
|
|
443
448
|
@database.enable_load_extension(false)
|
|
data/lib/documentrix/documents.rb
CHANGED
|
@@ -76,12 +76,13 @@ class Documentrix::Documents
|
|
|
76
76
|
# @param database_filename [ String ] the filename of the SQLite database to use (defaults to ':memory:')
|
|
77
77
|
# @param redis_url [ String ] the URL of the Redis server to use (defaults to nil)
|
|
78
78
|
# @param debug [ FalseClass, TrueClass ] whether to enable debugging mode (defaults to false)
|
|
79
|
-
|
|
79
|
+
# @param database_busy_timeout [ Integer ] the SQLite busy timeout in milliseconds (defaults to 5000)
|
|
80
|
+
def initialize(ollama:, model:, model_options: nil, collection: nil, embedding_length: 1_024, cache: MemoryCache, database_filename: nil, redis_url: nil, debug: false, database_busy_timeout: 5000)
|
|
80
81
|
collection ||= default_collection
|
|
81
82
|
@ollama, @model, @model_options, @collection, @debug =
|
|
82
83
|
ollama, model, model_options, collection.to_sym, debug
|
|
83
84
|
database_filename ||= ':memory:'
|
|
84
|
-
@cache = connect_cache(cache, redis_url, embedding_length, database_filename)
|
|
85
|
+
@cache = connect_cache(cache, redis_url, embedding_length, database_filename, database_busy_timeout)
|
|
85
86
|
end
|
|
86
87
|
|
|
87
88
|
# The default_collection method returns the default collection name.
|
|
@@ -104,7 +105,7 @@ class Documentrix::Documents
|
|
|
104
105
|
# The prepare_texts method filters out existing texts from the input array
|
|
105
106
|
# and returns the filtered array.
|
|
106
107
|
#
|
|
107
|
-
# @param texts [ Array ] an array of text strings
|
|
108
|
+
# @param texts [ Array ] an array of text strings
|
|
108
109
|
#
|
|
109
110
|
# @return [ Array ] the filtered array of text strings
|
|
110
111
|
private def prepare_texts(texts)
|
|
@@ -409,6 +410,22 @@ class Documentrix::Documents
|
|
|
409
410
|
@cache.tags
|
|
410
411
|
end
|
|
411
412
|
|
|
413
|
+
# Returns an array of all unique sources stored in the cache.
|
|
414
|
+
#
|
|
415
|
+
# @return [Array<String>] An array of unique source identifiers.
|
|
416
|
+
def sources
|
|
417
|
+
@cache.each_source.to_a
|
|
418
|
+
end
|
|
419
|
+
|
|
420
|
+
# The each_record method iterates over all records stored in the cache.
|
|
421
|
+
#
|
|
422
|
+
# @yield [record] The record being iterated over.
|
|
423
|
+
# @return [ Enumerator ] an enumerator if no block is provided.
|
|
424
|
+
def each_record(&block)
|
|
425
|
+
block or return enum_for(__method__)
|
|
426
|
+
@cache.each { |_key, record| block.(record) }
|
|
427
|
+
end
|
|
428
|
+
|
|
412
429
|
private
|
|
413
430
|
|
|
414
431
|
# Resets the memoized list of collections.
|
|
@@ -430,9 +447,10 @@ class Documentrix::Documents
|
|
|
430
447
|
# @param redis_url [String] the URL of the Redis server
|
|
431
448
|
# @param embedding_length [Integer] the length of the embeddings used in the cache
|
|
432
449
|
# @param database_filename [String] the filename of the SQLite database file
|
|
450
|
+
# @param database_busy_timeout [Integer] the SQLite busy timeout in milliseconds
|
|
433
451
|
#
|
|
434
452
|
# @return [CacheInstance] an instance of the specified cache class
|
|
435
|
-
def connect_cache(cache_class, redis_url, embedding_length, database_filename)
|
|
453
|
+
def connect_cache(cache_class, redis_url, embedding_length, database_filename, database_busy_timeout)
|
|
436
454
|
cache = nil
|
|
437
455
|
if (cache_class.instance_method(:redis) rescue nil)
|
|
438
456
|
begin
|
|
@@ -449,6 +467,7 @@ class Documentrix::Documents
|
|
|
449
467
|
prefix:,
|
|
450
468
|
embedding_length:,
|
|
451
469
|
filename: database_filename,
|
|
470
|
+
busy_timeout: database_busy_timeout,
|
|
452
471
|
debug: @debug
|
|
453
472
|
)
|
|
454
473
|
end
|
data/lib/documentrix/version.rb
CHANGED
data/spec/documents_spec.rb
CHANGED
|
@@ -226,6 +226,32 @@ describe Documentrix::Documents do
|
|
|
226
226
|
from(:default).
|
|
227
227
|
to(:new_collection)
|
|
228
228
|
end
|
|
229
|
+
|
|
230
|
+
it 'returns unique sources' do
|
|
231
|
+
allow(ollama).to receive(:embed).and_return(double(embeddings: [ [ 0.1 ] ]))
|
|
232
|
+
documents.add('foo', source: 's1')
|
|
233
|
+
documents.add('bar', source: 's1')
|
|
234
|
+
documents.add('baz', source: 's2')
|
|
235
|
+
expect(documents.sources).to match_array %w[ s1 s2 ]
|
|
236
|
+
end
|
|
237
|
+
|
|
238
|
+
it 'can iterate over records' do
|
|
239
|
+
allow(ollama).to receive(:embed).and_return(double(embeddings: [ [ 0.1 ] ]))
|
|
240
|
+
documents.add('foo')
|
|
241
|
+
documents.add('bar')
|
|
242
|
+
records = []
|
|
243
|
+
documents.each_record { |r| records << r }
|
|
244
|
+
expect(records.size).to eq 2
|
|
245
|
+
expect(records.map(&:text)).to match_array %w[ foo bar ]
|
|
246
|
+
end
|
|
247
|
+
|
|
248
|
+
it 'returns an enumerator for each_record' do
|
|
249
|
+
allow(ollama).to receive(:embed).and_return(double(embeddings: [ [ 0.1 ] ]))
|
|
250
|
+
documents.add('foo')
|
|
251
|
+
documents.add('bar')
|
|
252
|
+
expect(documents.each_record).to be_a Enumerator
|
|
253
|
+
expect(documents.each_record.map(&:text)).to match_array %w[ foo bar ]
|
|
254
|
+
end
|
|
229
255
|
end
|
|
230
256
|
|
|
231
257
|
context 'source management' do
|
|
@@ -233,7 +259,7 @@ describe Documentrix::Documents do
|
|
|
233
259
|
allow(documents).to receive(:compute_file_digest).and_return('d1')
|
|
234
260
|
allow(documents.cache).to receive(:compute_file_digest).and_return('d1')
|
|
235
261
|
|
|
236
|
-
allow(ollama).to receive(:embed).and_return(double(embeddings: [[0.1]]))
|
|
262
|
+
allow(ollama).to receive(:embed).and_return(double(embeddings: [ [ 0.1 ] ]))
|
|
237
263
|
end
|
|
238
264
|
|
|
239
265
|
it 'can check if a source exists' do
|