documentrix 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGES.md +41 -0
- data/documentrix.gemspec +2 -2
- data/lib/documentrix/documents/cache/sqlite_cache.rb +8 -8
- data/lib/documentrix/documents.rb +40 -7
- data/lib/documentrix/version.rb +1 -1
- data/spec/documents_spec.rb +51 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e32a72c0a1f93a96f7c3cecd185f13a0f17a2629b2dc509ec3a29fd2d7b51a41
|
|
4
|
+
data.tar.gz: 3f2c21125adf7061dcba94f1843456064245222839de0df6ddbcd8743a1aea13
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 91dbf3ddfdeb124661ff78a4cbc1635dbb8bdf6606078aeeef77b4f9d14688d073261ad08b59f0333895c7391d83eff8a6c76467f19af4f04ca55172c1d934a5
|
|
7
|
+
data.tar.gz: d40e5a53ceeda71c7a2be37d3bfb718ffc2ddb0fddc0c6eb2346c295d06713c895493854fab5f768af075bec2595e2f23c40e893d7f7266c6f61cd95c534d733
|
data/CHANGES.md
CHANGED
|
@@ -1,5 +1,46 @@
|
|
|
1
1
|
# Changes
|
|
2
2
|
|
|
3
|
+
## 2026-05-22 v0.4.0
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
|
|
7
|
+
- Added introspection methods to `Documents`:
|
|
8
|
+
- Implemented `sources` to return an array of unique source identifiers.
|
|
9
|
+
- Implemented `each_record` to iterate over records, returning an
|
|
10
|
+
`Enumerator` if no block is provided.
|
|
11
|
+
|
|
12
|
+
### Changed
|
|
13
|
+
|
|
14
|
+
- Updated CI images from Alpine to Debian:
|
|
15
|
+
- Switched Ruby **4.0**, **3.4**, **3.3**, and **3.2** images to the
|
|
16
|
+
`trixie` distribution.
|
|
17
|
+
- Switched Ruby **3.1** image to the `bookworm` distribution.
|
|
18
|
+
- Updated the `dockerfile` in `.all_images.yml` to use `apt-get` instead of
|
|
19
|
+
`apk` for installing build dependencies.
|
|
20
|
+
|
|
21
|
+
## 2026-05-20 v0.3.2
|
|
22
|
+
|
|
23
|
+
### Performance Improvements
|
|
24
|
+
|
|
25
|
+
- Optimized collections lookup by memoizing
|
|
26
|
+
`Documentrix::Documents#collections` using `@collections_cache`.
|
|
27
|
+
- Added `Documentrix::Documents#invalidate_collections_cache!` to reset the
|
|
28
|
+
memoized list.
|
|
29
|
+
- Integrated `invalidate_collections_cache!` into `add`, `delete`, `clear`,
|
|
30
|
+
`source_remove`, and `rename_collection` to maintain cache consistency.
|
|
31
|
+
|
|
32
|
+
### Database & Cache
|
|
33
|
+
|
|
34
|
+
- Standardized SQL keyword casing to uppercase for data types (e.g., `FLOAT`,
|
|
35
|
+
`TEXT`, `INTEGER`, `JSON`) within `SQLiteCache`.
|
|
36
|
+
|
|
37
|
+
### Documentation & Testing
|
|
38
|
+
|
|
39
|
+
- Refined documentation and return type descriptions for
|
|
40
|
+
`Documentrix::Documents#find` and `Documentrix::Documents#prefix`.
|
|
41
|
+
- Updated `spec/documents_spec.rb` to ensure `invalidate_collections_cache!` is
|
|
42
|
+
correctly triggered during mutations.
|
|
43
|
+
|
|
3
44
|
## 2026-05-18 v0.3.1
|
|
4
45
|
|
|
5
46
|
- Fixed scoping bugs in `clear_by_source` and `source_exist?` by implementing a
|
data/documentrix.gemspec
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
|
2
|
-
# stub: documentrix 0.3.1 ruby lib
|
|
2
|
+
# stub: documentrix 0.4.0 ruby lib
|
|
3
3
|
|
|
4
4
|
Gem::Specification.new do |s|
|
|
5
5
|
s.name = "documentrix".freeze
|
|
6
|
-
s.version = "0.3.1".freeze
|
|
6
|
+
s.version = "0.4.0".freeze
|
|
7
7
|
|
|
8
8
|
s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
|
|
9
9
|
s.require_paths = ["lib".freeze]
|
|
@@ -443,18 +443,18 @@ class Documentrix::Documents::Cache::SQLiteCache
|
|
|
443
443
|
@database.enable_load_extension(false)
|
|
444
444
|
execute %{
|
|
445
445
|
CREATE VIRTUAL TABLE IF NOT EXISTS embeddings USING vec0(
|
|
446
|
-
embedding
|
|
446
|
+
embedding FLOAT[#@embedding_length]
|
|
447
447
|
)
|
|
448
448
|
}
|
|
449
449
|
execute %{
|
|
450
450
|
CREATE TABLE IF NOT EXISTS records (
|
|
451
|
-
key
|
|
452
|
-
text
|
|
453
|
-
embedding_id
|
|
454
|
-
norm
|
|
455
|
-
source
|
|
456
|
-
digest
|
|
457
|
-
tags
|
|
451
|
+
key TEXT NOT NULL PRIMARY KEY ON CONFLICT REPLACE,
|
|
452
|
+
text TEXT NOT NULL DEFAULT '',
|
|
453
|
+
embedding_id INTEGER,
|
|
454
|
+
norm FLOAT NOT NULL DEFAULT 0.0,
|
|
455
|
+
source TEXT,
|
|
456
|
+
digest TEXT,
|
|
457
|
+
tags JSON NOT NULL DEFAULT [],
|
|
458
458
|
FOREIGN KEY(embedding_id) REFERENCES embeddings(id) ON DELETE CASCADE
|
|
459
459
|
)
|
|
460
460
|
}
|
|
@@ -162,7 +162,7 @@ class Documentrix::Documents
|
|
|
162
162
|
infobar.progress by: batch.size
|
|
163
163
|
end
|
|
164
164
|
infobar.newline
|
|
165
|
-
|
|
165
|
+
invalidate_collections_cache!
|
|
166
166
|
end
|
|
167
167
|
alias << add
|
|
168
168
|
|
|
@@ -201,7 +201,9 @@ class Documentrix::Documents
|
|
|
201
201
|
# @return [ FalseClass, TrueClass ] true if the text was removed, false
|
|
202
202
|
# otherwise.
|
|
203
203
|
def delete(text)
|
|
204
|
-
@cache.delete(key(text))
|
|
204
|
+
res = @cache.delete(key(text))
|
|
205
|
+
invalidate_collections_cache! if res
|
|
206
|
+
res
|
|
205
207
|
end
|
|
206
208
|
|
|
207
209
|
# The size method returns the number of texts stored in the cache of this
|
|
@@ -220,7 +222,7 @@ class Documentrix::Documents
|
|
|
220
222
|
# @return [ Documentrix::Documents ] self
|
|
221
223
|
def clear(tags: nil)
|
|
222
224
|
@cache.clear(tags:)
|
|
223
|
-
|
|
225
|
+
invalidate_collections_cache!
|
|
224
226
|
end
|
|
225
227
|
|
|
226
228
|
# Normalizes the source identifier to a canonical form.
|
|
@@ -318,7 +320,7 @@ class Documentrix::Documents
|
|
|
318
320
|
def source_remove(source, digest: nil)
|
|
319
321
|
source = normalize_source(source)
|
|
320
322
|
@cache.clear_by_source(source, digest:, operator: '!=')
|
|
321
|
-
|
|
323
|
+
invalidate_collections_cache!
|
|
322
324
|
end
|
|
323
325
|
|
|
324
326
|
# The find method searches for strings within the cache by computing their
|
|
@@ -359,7 +361,7 @@ class Documentrix::Documents
|
|
|
359
361
|
# @param text_count [Integer] the maximum number of records to return
|
|
360
362
|
# @param opts [Hash] additional options passed to #find, such as:
|
|
361
363
|
# * :tags [Array<String>] filter results by tags
|
|
362
|
-
# * :prompt [String]
|
|
364
|
+
# * :prompt [String] use for the search
|
|
363
365
|
# * :min_similarity [Numeric] minimum similarity score
|
|
364
366
|
#
|
|
365
367
|
# @example
|
|
@@ -379,7 +381,9 @@ class Documentrix::Documents
|
|
|
379
381
|
#
|
|
380
382
|
# @return [Array] An array of unique collection names
|
|
381
383
|
def collections
|
|
382
|
-
|
|
384
|
+
@collections_cache ||= (
|
|
385
|
+
[ default_collection ] + @cache.collections('%s-' % class_prefix)
|
|
386
|
+
).uniq
|
|
383
387
|
end
|
|
384
388
|
|
|
385
389
|
# Rename the current collection, moving all keys from the old prefix to a new
|
|
@@ -395,6 +399,7 @@ class Documentrix::Documents
|
|
|
395
399
|
new_prefix = '%s-%s-' % [ class_prefix, new_collection ]
|
|
396
400
|
@cache.move_prefix(prefix, new_prefix)
|
|
397
401
|
self.collection = new_collection
|
|
402
|
+
invalidate_collections_cache!
|
|
398
403
|
end
|
|
399
404
|
|
|
400
405
|
# The tags method returns an array of unique tags from the cache.
|
|
@@ -404,8 +409,36 @@ class Documentrix::Documents
|
|
|
404
409
|
@cache.tags
|
|
405
410
|
end
|
|
406
411
|
|
|
412
|
+
# Returns an array of all unique sources stored in the cache.
|
|
413
|
+
#
|
|
414
|
+
# @return [Array<String>] An array of unique source identifiers.
|
|
415
|
+
def sources
|
|
416
|
+
@cache.each_source.to_a
|
|
417
|
+
end
|
|
418
|
+
|
|
419
|
+
# The each_record method iterates over all records stored in the cache.
|
|
420
|
+
#
|
|
421
|
+
# @yield [record] The record being iterated over.
|
|
422
|
+
# @return [ Enumerator ] an enumerator if no block is provided.
|
|
423
|
+
def each_record(&block)
|
|
424
|
+
block or return enum_for(__method__)
|
|
425
|
+
@cache.each { |_key, record| block.(record) }
|
|
426
|
+
end
|
|
427
|
+
|
|
407
428
|
private
|
|
408
429
|
|
|
430
|
+
# Resets the memoized list of collections.
|
|
431
|
+
#
|
|
432
|
+
# This is called whenever a mutation occurs that could change the set of
|
|
433
|
+
# existing collections, ensuring that the #collections method returns a
|
|
434
|
+
# fresh, accurate list on the next call.
|
|
435
|
+
#
|
|
436
|
+
# @return [ Documentrix::Documents ] self
|
|
437
|
+
def invalidate_collections_cache!
|
|
438
|
+
@collections_cache = nil
|
|
439
|
+
self
|
|
440
|
+
end
|
|
441
|
+
|
|
409
442
|
# The connect_cache method initializes and returns an instance of the
|
|
410
443
|
# specified cache class.
|
|
411
444
|
#
|
|
@@ -482,7 +515,7 @@ class Documentrix::Documents
|
|
|
482
515
|
# The prefix method returns a string that is used as the prefix for keys in
|
|
483
516
|
# the cache of the currently configured collection.
|
|
484
517
|
#
|
|
485
|
-
# @return [ String ]
|
|
518
|
+
# @return [ String ] the prefix string
|
|
486
519
|
def prefix
|
|
487
520
|
'%s-%s-' % [ class_prefix, @collection ]
|
|
488
521
|
end
|
data/lib/documentrix/version.rb
CHANGED
data/spec/documents_spec.rb
CHANGED
|
@@ -23,6 +23,8 @@ describe Documentrix::Documents do
|
|
|
23
23
|
expect(ollama).to receive(:embed).
|
|
24
24
|
with(model:, input: %w[ foo bar ], options: nil).
|
|
25
25
|
and_return(double(embeddings: [ [ 0.1 ], [ 0.2 ] ]))
|
|
26
|
+
expect(documents).to receive(:invalidate_collections_cache!).
|
|
27
|
+
and_call_original
|
|
26
28
|
expect(documents.add(%w[ foo bar ])).to eq documents
|
|
27
29
|
expect(documents.exist?('foo')).to eq true
|
|
28
30
|
expect(documents.exist?('bar')).to eq true
|
|
@@ -33,6 +35,8 @@ describe Documentrix::Documents do
|
|
|
33
35
|
expect(ollama).to receive(:embed).
|
|
34
36
|
with(model:, input: %w[ foo ], options: nil).
|
|
35
37
|
and_return(double(embeddings: [ [ 0.1 ] ]))
|
|
38
|
+
expect(documents).to receive(:invalidate_collections_cache!).
|
|
39
|
+
and_call_original
|
|
36
40
|
expect(documents << 'foo').to eq documents
|
|
37
41
|
expect(documents.exist?('foo')).to eq true
|
|
38
42
|
expect(documents.exist?('bar')).to eq false
|
|
@@ -123,6 +127,8 @@ describe Documentrix::Documents do
|
|
|
123
127
|
|
|
124
128
|
it 'can delete texts' do
|
|
125
129
|
expect(documents << 'foo').to eq documents
|
|
130
|
+
expect(documents).to receive(:invalidate_collections_cache!).
|
|
131
|
+
and_call_original
|
|
126
132
|
expect {
|
|
127
133
|
documents.delete('foo')
|
|
128
134
|
}.to change { documents.exist?('foo') }.from(true).to(false)
|
|
@@ -136,6 +142,8 @@ describe Documentrix::Documents do
|
|
|
136
142
|
|
|
137
143
|
it 'can clear texts' do
|
|
138
144
|
expect(documents << 'foo').to eq documents
|
|
145
|
+
expect(documents).to receive(:invalidate_collections_cache!).
|
|
146
|
+
and_call_original
|
|
139
147
|
expect {
|
|
140
148
|
documents.clear
|
|
141
149
|
}.to change { documents.size }.from(1).to(0)
|
|
@@ -148,9 +156,13 @@ describe Documentrix::Documents do
|
|
|
148
156
|
expect(documents.add('foo', tags: %w[ test ])).to eq documents
|
|
149
157
|
expect(documents.add('bar', tags: %w[ test2 ])).to eq documents
|
|
150
158
|
expect(documents.tags.to_a).to eq %w[ test test2 ]
|
|
159
|
+
expect(documents).to receive(:invalidate_collections_cache!).
|
|
160
|
+
and_call_original
|
|
151
161
|
expect {
|
|
152
162
|
documents.clear tags: 'test'
|
|
153
163
|
}.to change { documents.size }.from(2).to(1)
|
|
164
|
+
expect(documents).to receive(:invalidate_collections_cache!).
|
|
165
|
+
and_call_original
|
|
154
166
|
expect {
|
|
155
167
|
documents.clear tags: :test2
|
|
156
168
|
}.to change { documents.size }.from(1).to(0)
|
|
@@ -166,6 +178,8 @@ describe Documentrix::Documents do
|
|
|
166
178
|
|
|
167
179
|
expect(documents.size).to eq 3
|
|
168
180
|
|
|
181
|
+
expect(documents).to receive(:invalidate_collections_cache!).
|
|
182
|
+
and_call_original
|
|
169
183
|
documents.source_remove('source1')
|
|
170
184
|
|
|
171
185
|
expect(documents.size).to eq 1
|
|
@@ -182,6 +196,8 @@ describe Documentrix::Documents do
|
|
|
182
196
|
documents.collection = :foo
|
|
183
197
|
documents << 'foo'
|
|
184
198
|
expect(documents.collections).to eq %i[ default foo ]
|
|
199
|
+
expect(documents).to receive(:invalidate_collections_cache!).
|
|
200
|
+
and_call_original
|
|
185
201
|
documents.rename_collection(:bar)
|
|
186
202
|
expect(documents.collection).to eq :bar
|
|
187
203
|
expect(documents.collections).to eq %i[ default bar ]
|
|
@@ -195,6 +211,8 @@ describe Documentrix::Documents do
|
|
|
195
211
|
documents.collection = :bar
|
|
196
212
|
documents << 'foo'
|
|
197
213
|
expect(documents.collections).to eq %i[ default foo bar ]
|
|
214
|
+
expect(documents).not_to receive(:invalidate_collections_cache!).
|
|
215
|
+
and_call_original
|
|
198
216
|
expect {
|
|
199
217
|
documents.rename_collection(:foo)
|
|
200
218
|
}.to raise_error(ArgumentError, 'new collection foo already exists!')
|
|
@@ -208,6 +226,32 @@ describe Documentrix::Documents do
|
|
|
208
226
|
from(:default).
|
|
209
227
|
to(:new_collection)
|
|
210
228
|
end
|
|
229
|
+
|
|
230
|
+
it 'returns unique sources' do
|
|
231
|
+
allow(ollama).to receive(:embed).and_return(double(embeddings: [ [ 0.1 ] ]))
|
|
232
|
+
documents.add('foo', source: 's1')
|
|
233
|
+
documents.add('bar', source: 's1')
|
|
234
|
+
documents.add('baz', source: 's2')
|
|
235
|
+
expect(documents.sources).to match_array %w[ s1 s2 ]
|
|
236
|
+
end
|
|
237
|
+
|
|
238
|
+
it 'can iterate over records' do
|
|
239
|
+
allow(ollama).to receive(:embed).and_return(double(embeddings: [ [ 0.1 ] ]))
|
|
240
|
+
documents.add('foo')
|
|
241
|
+
documents.add('bar')
|
|
242
|
+
records = []
|
|
243
|
+
documents.each_record { |r| records << r }
|
|
244
|
+
expect(records.size).to eq 2
|
|
245
|
+
expect(records.map(&:text)).to match_array %w[ foo bar ]
|
|
246
|
+
end
|
|
247
|
+
|
|
248
|
+
it 'returns an enumerator for each_record' do
|
|
249
|
+
allow(ollama).to receive(:embed).and_return(double(embeddings: [ [ 0.1 ] ]))
|
|
250
|
+
documents.add('foo')
|
|
251
|
+
documents.add('bar')
|
|
252
|
+
expect(documents.each_record).to be_a Enumerator
|
|
253
|
+
expect(documents.each_record.map(&:text)).to match_array %w[ foo bar ]
|
|
254
|
+
end
|
|
211
255
|
end
|
|
212
256
|
|
|
213
257
|
context 'source management' do
|
|
@@ -215,7 +259,7 @@ describe Documentrix::Documents do
|
|
|
215
259
|
allow(documents).to receive(:compute_file_digest).and_return('d1')
|
|
216
260
|
allow(documents.cache).to receive(:compute_file_digest).and_return('d1')
|
|
217
261
|
|
|
218
|
-
allow(ollama).to receive(:embed).and_return(double(embeddings: [[0.1]]))
|
|
262
|
+
allow(ollama).to receive(:embed).and_return(double(embeddings: [ [ 0.1 ] ]))
|
|
219
263
|
end
|
|
220
264
|
|
|
221
265
|
it 'can check if a source exists' do
|
|
@@ -243,6 +287,8 @@ describe Documentrix::Documents do
|
|
|
243
287
|
documents.add('foo', source: 's1')
|
|
244
288
|
|
|
245
289
|
expect(ollama).not_to receive(:embed)
|
|
290
|
+
expect(documents).to receive(:invalidate_collections_cache!).
|
|
291
|
+
and_call_original
|
|
246
292
|
documents.source_update(['foo'], source: 's1')
|
|
247
293
|
expect(documents.exist?('foo')).to be true
|
|
248
294
|
end
|
|
@@ -255,6 +301,8 @@ describe Documentrix::Documents do
|
|
|
255
301
|
allow(documents.cache).to receive(:compute_file_digest).with('s1').and_return('d2')
|
|
256
302
|
|
|
257
303
|
expect(ollama).to receive(:embed).once
|
|
304
|
+
expect(documents).to receive(:invalidate_collections_cache!).
|
|
305
|
+
at_least(1).and_call_original
|
|
258
306
|
documents.source_update(['bar'], source: 's1')
|
|
259
307
|
|
|
260
308
|
expect(documents.exist?('bar')).to be true
|
|
@@ -263,6 +311,8 @@ describe Documentrix::Documents do
|
|
|
263
311
|
|
|
264
312
|
it 'updates the source if it is an URL' do
|
|
265
313
|
expect(ollama).to receive(:embed).once
|
|
314
|
+
expect(documents).to receive(:invalidate_collections_cache!).
|
|
315
|
+
and_call_original
|
|
266
316
|
documents.source_update('foo', source: 'https://www.example.com/s1')
|
|
267
317
|
expect(documents.exist?('foo')).to be true
|
|
268
318
|
end
|