documentrix 0.3.1 → 0.4.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c3ab97426ab9fbd4ec832a422a18d574f4c03b11f22391cbc6eded16b9ba0609
4
- data.tar.gz: 991215fb26b4165a4b3562075d9742e3756218e2a24f3f32a550e232f59fb92f
3
+ metadata.gz: e32a72c0a1f93a96f7c3cecd185f13a0f17a2629b2dc509ec3a29fd2d7b51a41
4
+ data.tar.gz: 3f2c21125adf7061dcba94f1843456064245222839de0df6ddbcd8743a1aea13
5
5
  SHA512:
6
- metadata.gz: d67e453e27428bcd8364349e988556af844caab5faa8a8edb3dcfba9650554b8dbfc66ad322a6f9f59d5ac64350d7006bf899a16b6c83f92909793301b6954d5
7
- data.tar.gz: 49706d4984b27ee7b3d0f8ef8a9efec4a8e74d8bef78d78f15ed5c76f5bacc0faf20c36cef739aa78c1f68e5733696d16999bbd14228ae7be8e6cf9c9baf0a75
6
+ metadata.gz: 91dbf3ddfdeb124661ff78a4cbc1635dbb8bdf6606078aeeef77b4f9d14688d073261ad08b59f0333895c7391d83eff8a6c76467f19af4f04ca55172c1d934a5
7
+ data.tar.gz: d40e5a53ceeda71c7a2be37d3bfb718ffc2ddb0fddc0c6eb2346c295d06713c895493854fab5f768af075bec2595e2f23c40e893d7f7266c6f61cd95c534d733
data/CHANGES.md CHANGED
@@ -1,5 +1,46 @@
1
1
  # Changes
2
2
 
3
+ ## 2026-05-22 v0.4.0
4
+
5
+ ### Added
6
+
7
+ - Added introspection methods to `Documents`:
8
+ - Implemented `sources` to return an array of unique source identifiers.
9
+ - Implemented `each_record` to iterate over records, returning an
10
+ `Enumerator` if no block is provided.
11
+
12
+ ### Changed
13
+
14
+ - Updated CI images from Alpine to Debian:
15
+ - Switched Ruby **4.0**, **3.4**, **3.3**, and **3.2** images to the
16
+ `trixie` distribution.
17
+ - Switched Ruby **3.1** image to the `bookworm` distribution.
18
+ - Updated the `dockerfile` in `.all_images.yml` to use `apt-get` instead of
19
+ `apk` for installing build dependencies.
20
+
21
+ ## 2026-05-20 v0.3.2
22
+
23
+ ### Performance Improvements
24
+
25
+ - Optimized collections lookup by memoizing
26
+ `Documentrix::Documents#collections` using `@collections_cache`.
27
+ - Added `Documentrix::Documents#invalidate_collections_cache!` to reset the
28
+ memoized list.
29
+ - Integrated `invalidate_collections_cache!` into `add`, `delete`, `clear`,
30
+ `source_remove`, and `rename_collection` to maintain cache consistency.
31
+
32
+ ### Database & Cache
33
+
34
+ - Standardized SQL keyword casing to uppercase for data types (e.g., `FLOAT`,
35
+ `TEXT`, `INTEGER`, `JSON`) within `SQLiteCache`.
36
+
37
+ ### Documentation & Testing
38
+
39
+ - Refined documentation and return type descriptions for
40
+ `Documentrix::Documents#find` and `Documentrix::Documents#prefix`.
41
+ - Updated `spec/documents_spec.rb` to ensure `invalidate_collections_cache!` is
42
+ correctly triggered during mutations.
43
+
3
44
  ## 2026-05-18 v0.3.1
4
45
 
5
46
  - Fixed scoping bugs in `clear_by_source` and `source_exist?` by implementing a
data/documentrix.gemspec CHANGED
@@ -1,9 +1,9 @@
1
1
  # -*- encoding: utf-8 -*-
2
- # stub: documentrix 0.3.1 ruby lib
2
+ # stub: documentrix 0.4.0 ruby lib
3
3
 
4
4
  Gem::Specification.new do |s|
5
5
  s.name = "documentrix".freeze
6
- s.version = "0.3.1".freeze
6
+ s.version = "0.4.0".freeze
7
7
 
8
8
  s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
9
9
  s.require_paths = ["lib".freeze]
@@ -443,18 +443,18 @@ class Documentrix::Documents::Cache::SQLiteCache
443
443
  @database.enable_load_extension(false)
444
444
  execute %{
445
445
  CREATE VIRTUAL TABLE IF NOT EXISTS embeddings USING vec0(
446
- embedding float[#@embedding_length]
446
+ embedding FLOAT[#@embedding_length]
447
447
  )
448
448
  }
449
449
  execute %{
450
450
  CREATE TABLE IF NOT EXISTS records (
451
- key text NOT NULL PRIMARY KEY ON CONFLICT REPLACE,
452
- text text NOT NULL DEFAULT '',
453
- embedding_id integer,
454
- norm float NOT NULL DEFAULT 0.0,
455
- source text,
456
- digest text,
457
- tags json NOT NULL DEFAULT [],
451
+ key TEXT NOT NULL PRIMARY KEY ON CONFLICT REPLACE,
452
+ text TEXT NOT NULL DEFAULT '',
453
+ embedding_id INTEGER,
454
+ norm FLOAT NOT NULL DEFAULT 0.0,
455
+ source TEXT,
456
+ digest TEXT,
457
+ tags JSON NOT NULL DEFAULT [],
458
458
  FOREIGN KEY(embedding_id) REFERENCES embeddings(id) ON DELETE CASCADE
459
459
  )
460
460
  }
@@ -162,7 +162,7 @@ class Documentrix::Documents
162
162
  infobar.progress by: batch.size
163
163
  end
164
164
  infobar.newline
165
- self
165
+ invalidate_collections_cache!
166
166
  end
167
167
  alias << add
168
168
 
@@ -201,7 +201,9 @@ class Documentrix::Documents
201
201
  # @return [ FalseClass, TrueClass ] true if the text was removed, false
202
202
  # otherwise.
203
203
  def delete(text)
204
- @cache.delete(key(text))
204
+ res = @cache.delete(key(text))
205
+ invalidate_collections_cache! if res
206
+ res
205
207
  end
206
208
 
207
209
  # The size method returns the number of texts stored in the cache of this
@@ -220,7 +222,7 @@ class Documentrix::Documents
220
222
  # @return [ Documentrix::Documents ] self
221
223
  def clear(tags: nil)
222
224
  @cache.clear(tags:)
223
- self
225
+ invalidate_collections_cache!
224
226
  end
225
227
 
226
228
  # Normalizes the source identifier to a canonical form.
@@ -318,7 +320,7 @@ class Documentrix::Documents
318
320
  def source_remove(source, digest: nil)
319
321
  source = normalize_source(source)
320
322
  @cache.clear_by_source(source, digest:, operator: '!=')
321
- self
323
+ invalidate_collections_cache!
322
324
  end
323
325
 
324
326
  # The find method searches for strings within the cache by computing their
@@ -359,7 +361,7 @@ class Documentrix::Documents
359
361
  # @param text_count [Integer] the maximum number of records to return
360
362
  # @param opts [Hash] additional options passed to #find, such as:
361
363
  # * :tags [Array<String>] filter results by tags
362
- # * :prompt [String] a prompt to use for the search
364
+ # * :prompt [String] the prompt to use for the search
363
365
  # * :min_similarity [Numeric] minimum similarity score
364
366
  #
365
367
  # @example
@@ -379,7 +381,9 @@ class Documentrix::Documents
379
381
  #
380
382
  # @return [Array] An array of unique collection names
381
383
  def collections
382
- ([ default_collection ] + @cache.collections('%s-' % class_prefix)).uniq
384
+ @collections_cache ||= (
385
+ [ default_collection ] + @cache.collections('%s-' % class_prefix)
386
+ ).uniq
383
387
  end
384
388
 
385
389
  # Rename the current collection, moving all keys from the old prefix to a new
@@ -395,6 +399,7 @@ class Documentrix::Documents
395
399
  new_prefix = '%s-%s-' % [ class_prefix, new_collection ]
396
400
  @cache.move_prefix(prefix, new_prefix)
397
401
  self.collection = new_collection
402
+ invalidate_collections_cache!
398
403
  end
399
404
 
400
405
  # The tags method returns an array of unique tags from the cache.
@@ -404,8 +409,36 @@ class Documentrix::Documents
404
409
  @cache.tags
405
410
  end
406
411
 
412
+ # Returns an array of all unique sources stored in the cache.
413
+ #
414
+ # @return [Array<String>] An array of unique source identifiers.
415
+ def sources
416
+ @cache.each_source.to_a
417
+ end
418
+
419
+ # The each_record method iterates over all records stored in the cache.
420
+ #
421
+ # @yield [record] The record being iterated over.
422
+ # @return [ Enumerator ] an enumerator if no block is provided.
423
+ def each_record(&block)
424
+ block or return enum_for(__method__)
425
+ @cache.each { |_key, record| block.(record) }
426
+ end
427
+
407
428
  private
408
429
 
430
+ # Resets the memoized list of collections.
431
+ #
432
+ # This is called whenever a mutation occurs that could change the set of
433
+ # existing collections, ensuring that the #collections method returns a
434
+ # fresh, accurate list on the next call.
435
+ #
436
+ # @return [ Documentrix::Documents ] self
437
+ def invalidate_collections_cache!
438
+ @collections_cache = nil
439
+ self
440
+ end
441
+
409
442
  # The connect_cache method initializes and returns an instance of the
410
443
  # specified cache class.
411
444
  #
@@ -482,7 +515,7 @@ class Documentrix::Documents
482
515
  # The prefix method returns a string that is used as the prefix for keys in
483
516
  # the cache of the currently configured collection.
484
517
  #
485
- # @return [ String ] The prefix string
518
+ # @return [ String ] the prefix string
486
519
  def prefix
487
520
  '%s-%s-' % [ class_prefix, @collection ]
488
521
  end
@@ -1,6 +1,6 @@
1
1
  module Documentrix
2
2
  # Documentrix version
3
- VERSION = '0.3.1'
3
+ VERSION = '0.4.0'
4
4
  VERSION_ARRAY = VERSION.split('.').map(&:to_i) # :nodoc:
5
5
  VERSION_MAJOR = VERSION_ARRAY[0] # :nodoc:
6
6
  VERSION_MINOR = VERSION_ARRAY[1] # :nodoc:
@@ -23,6 +23,8 @@ describe Documentrix::Documents do
23
23
  expect(ollama).to receive(:embed).
24
24
  with(model:, input: %w[ foo bar ], options: nil).
25
25
  and_return(double(embeddings: [ [ 0.1 ], [ 0.2 ] ]))
26
+ expect(documents).to receive(:invalidate_collections_cache!).
27
+ and_call_original
26
28
  expect(documents.add(%w[ foo bar ])).to eq documents
27
29
  expect(documents.exist?('foo')).to eq true
28
30
  expect(documents.exist?('bar')).to eq true
@@ -33,6 +35,8 @@ describe Documentrix::Documents do
33
35
  expect(ollama).to receive(:embed).
34
36
  with(model:, input: %w[ foo ], options: nil).
35
37
  and_return(double(embeddings: [ [ 0.1 ] ]))
38
+ expect(documents).to receive(:invalidate_collections_cache!).
39
+ and_call_original
36
40
  expect(documents << 'foo').to eq documents
37
41
  expect(documents.exist?('foo')).to eq true
38
42
  expect(documents.exist?('bar')).to eq false
@@ -123,6 +127,8 @@ describe Documentrix::Documents do
123
127
 
124
128
  it 'can delete texts' do
125
129
  expect(documents << 'foo').to eq documents
130
+ expect(documents).to receive(:invalidate_collections_cache!).
131
+ and_call_original
126
132
  expect {
127
133
  documents.delete('foo')
128
134
  }.to change { documents.exist?('foo') }.from(true).to(false)
@@ -136,6 +142,8 @@ describe Documentrix::Documents do
136
142
 
137
143
  it 'can clear texts' do
138
144
  expect(documents << 'foo').to eq documents
145
+ expect(documents).to receive(:invalidate_collections_cache!).
146
+ and_call_original
139
147
  expect {
140
148
  documents.clear
141
149
  }.to change { documents.size }.from(1).to(0)
@@ -148,9 +156,13 @@ describe Documentrix::Documents do
148
156
  expect(documents.add('foo', tags: %w[ test ])).to eq documents
149
157
  expect(documents.add('bar', tags: %w[ test2 ])).to eq documents
150
158
  expect(documents.tags.to_a).to eq %w[ test test2 ]
159
+ expect(documents).to receive(:invalidate_collections_cache!).
160
+ and_call_original
151
161
  expect {
152
162
  documents.clear tags: 'test'
153
163
  }.to change { documents.size }.from(2).to(1)
164
+ expect(documents).to receive(:invalidate_collections_cache!).
165
+ and_call_original
154
166
  expect {
155
167
  documents.clear tags: :test2
156
168
  }.to change { documents.size }.from(1).to(0)
@@ -166,6 +178,8 @@ describe Documentrix::Documents do
166
178
 
167
179
  expect(documents.size).to eq 3
168
180
 
181
+ expect(documents).to receive(:invalidate_collections_cache!).
182
+ and_call_original
169
183
  documents.source_remove('source1')
170
184
 
171
185
  expect(documents.size).to eq 1
@@ -182,6 +196,8 @@ describe Documentrix::Documents do
182
196
  documents.collection = :foo
183
197
  documents << 'foo'
184
198
  expect(documents.collections).to eq %i[ default foo ]
199
+ expect(documents).to receive(:invalidate_collections_cache!).
200
+ and_call_original
185
201
  documents.rename_collection(:bar)
186
202
  expect(documents.collection).to eq :bar
187
203
  expect(documents.collections).to eq %i[ default bar ]
@@ -195,6 +211,8 @@ describe Documentrix::Documents do
195
211
  documents.collection = :bar
196
212
  documents << 'foo'
197
213
  expect(documents.collections).to eq %i[ default foo bar ]
214
+ expect(documents).not_to receive(:invalidate_collections_cache!).
215
+ and_call_original
198
216
  expect {
199
217
  documents.rename_collection(:foo)
200
218
  }.to raise_error(ArgumentError, 'new collection foo already exists!')
@@ -208,6 +226,32 @@ describe Documentrix::Documents do
208
226
  from(:default).
209
227
  to(:new_collection)
210
228
  end
229
+
230
+ it 'returns unique sources' do
231
+ allow(ollama).to receive(:embed).and_return(double(embeddings: [ [ 0.1 ] ]))
232
+ documents.add('foo', source: 's1')
233
+ documents.add('bar', source: 's1')
234
+ documents.add('baz', source: 's2')
235
+ expect(documents.sources).to match_array %w[ s1 s2 ]
236
+ end
237
+
238
+ it 'can iterate over records' do
239
+ allow(ollama).to receive(:embed).and_return(double(embeddings: [ [ 0.1 ] ]))
240
+ documents.add('foo')
241
+ documents.add('bar')
242
+ records = []
243
+ documents.each_record { |r| records << r }
244
+ expect(records.size).to eq 2
245
+ expect(records.map(&:text)).to match_array %w[ foo bar ]
246
+ end
247
+
248
+ it 'returns an enumerator for each_record' do
249
+ allow(ollama).to receive(:embed).and_return(double(embeddings: [ [ 0.1 ] ]))
250
+ documents.add('foo')
251
+ documents.add('bar')
252
+ expect(documents.each_record).to be_a Enumerator
253
+ expect(documents.each_record.map(&:text)).to match_array %w[ foo bar ]
254
+ end
211
255
  end
212
256
 
213
257
  context 'source management' do
@@ -215,7 +259,7 @@ describe Documentrix::Documents do
215
259
  allow(documents).to receive(:compute_file_digest).and_return('d1')
216
260
  allow(documents.cache).to receive(:compute_file_digest).and_return('d1')
217
261
 
218
- allow(ollama).to receive(:embed).and_return(double(embeddings: [[0.1]]))
262
+ allow(ollama).to receive(:embed).and_return(double(embeddings: [ [ 0.1 ] ]))
219
263
  end
220
264
 
221
265
  it 'can check if a source exists' do
@@ -243,6 +287,8 @@ describe Documentrix::Documents do
243
287
  documents.add('foo', source: 's1')
244
288
 
245
289
  expect(ollama).not_to receive(:embed)
290
+ expect(documents).to receive(:invalidate_collections_cache!).
291
+ and_call_original
246
292
  documents.source_update(['foo'], source: 's1')
247
293
  expect(documents.exist?('foo')).to be true
248
294
  end
@@ -255,6 +301,8 @@ describe Documentrix::Documents do
255
301
  allow(documents.cache).to receive(:compute_file_digest).with('s1').and_return('d2')
256
302
 
257
303
  expect(ollama).to receive(:embed).once
304
+ expect(documents).to receive(:invalidate_collections_cache!).
305
+ at_least(1).and_call_original
258
306
  documents.source_update(['bar'], source: 's1')
259
307
 
260
308
  expect(documents.exist?('bar')).to be true
@@ -263,6 +311,8 @@ describe Documentrix::Documents do
263
311
 
264
312
  it 'updates the source if it is an URL' do
265
313
  expect(ollama).to receive(:embed).once
314
+ expect(documents).to receive(:invalidate_collections_cache!).
315
+ and_call_original
266
316
  documents.source_update('foo', source: 'https://www.example.com/s1')
267
317
  expect(documents.exist?('foo')).to be true
268
318
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: documentrix
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Florian Frank