documentrix 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,14 +1,60 @@
1
1
  module Documentrix::Documents::Splitters
2
+ # Semantic splitter that divides text based on thematic changes in meaning.
3
+ #
4
+ # It works by splitting text into sentences, computing embeddings for each,
5
+ # and then calculating the cosine distance between adjacent sentences.
6
+ # Where the distance exceeds a calculated threshold (the "breakpoint"),
7
+ # a semantic boundary is identified.
8
+ #
9
+ # @example
10
+ # splitter = Documentrix::Documents::Splitters::Semantic.new(
11
+ # ollama: ollama_client,
12
+ # model: 'mxbai-embed-large'
13
+ # )
14
+ # chunks = splitter.split(text, breakpoint: :percentile, percentile: 90)
2
15
  class Semantic
16
+ include Documentrix::Documents::Splitters::Common
3
17
  include Documentrix::Utils::Math
4
18
 
5
- DEFAULT_SEPARATOR = /[.!?]\s*(?:\b|\z)/
19
+ # The default regex used to identify sentence boundaries for semantic
20
+ # splitting. It matches a sentence or clause delimiter (one of . ! ? , ;)
21
+ # followed by optional whitespace at a word boundary or the end of the
22
+ # string.
23
+ #
24
+ # @return [Regexp]
25
+ DEFAULT_SEPARATOR = /[.!?,;]\s*(?:\b|\z)/
6
26
 
7
- def initialize(ollama:, model:, model_options: nil, separator: DEFAULT_SEPARATOR, chunk_size: 4096)
8
- @ollama, @model, @model_options, @separator, @chunk_size =
9
- ollama, model, model_options, separator, chunk_size
27
+ # Initializes a new Semantic splitter.
28
+ #
29
+ # @param ollama [Ollama::Client] the client used for generating embeddings
30
+ # @param model [String] the embedding model name
31
+ # @param model_options [Hash, nil] optional parameters passed to the embedding model
32
+ # @param separator [Regexp] the regex used to identify sentence boundaries
33
+ # @param chunk_size [Integer] the maximum character length of a resulting chunk
34
+ # @param force [Boolean] whether to force split chunks that exceed chunk_size (defaults to false)
35
+ def initialize(ollama:, model:, model_options: nil, separator: DEFAULT_SEPARATOR, chunk_size: 4096, force: false)
36
+ @ollama, @model, @model_options, @separator, @chunk_size, @force =
37
+ ollama, model, model_options, separator, chunk_size, force
10
38
  end
11
39
 
40
+ # Splits the given text into semantic chunks.
41
+ #
42
+ # The method first decomposes the text into sentences, then identifies gaps
43
+ # in semantic similarity. It then groups these sentences into chunks that
44
+ # respect both the semantic boundaries and the maximum chunk size.
45
+ #
46
+ # @param text [String] the text to be split
47
+ # @param batch_size [Integer] the number of sentences to embed in a single API call
48
+ # @param breakpoint [Symbol] the method used to determine the distance threshold
49
+ # * :percentile (default) - uses the N-th percentile of distances
50
+ # * :standard_deviation - uses mean + (std_dev * multiplier)
51
+ # * :interquartile - uses mean + (iqr * multiplier)
52
+ # @param opts [Hash] additional options for the splitting process:
53
+ # * :include_separator [Boolean] whether to keep the sentence separator in the result
54
+ # * :percentile [Integer] the percentile to use if breakpoint is :percentile (default: 95)
55
+ # * :percentage [Integer] the multiplier percentage for :standard_deviation or :interquartile (default: 100)
56
+ #
57
+ # @return [Array<String>] an array of semantically grouped text chunks
12
58
  def split(text, batch_size: 100, breakpoint: :percentile, **opts)
13
59
  sentences = Documentrix::Documents::Splitters::Character.new(
14
60
  separator: @separator,
@@ -45,22 +91,30 @@ module Documentrix::Documents::Splitters
45
91
  if current_text.size + sentence.size < @chunk_size
46
92
  current_text += sentence
47
93
  else
48
- current_text.empty? or result << current_text
94
+ result.concat(force_split(current_text))
49
95
  current_text = sentence
50
96
  end
51
97
  end
52
- unless current_text.empty?
53
- result << current_text
98
+ if current_text.present?
99
+ result.concat(force_split(current_text))
54
100
  current_text = +''
55
101
  end
56
102
  sg = g.succ
57
103
  end
58
- current_text.empty? or result << current_text
104
+ result.concat(force_split(current_text))
59
105
  result
60
106
  end
61
107
 
62
108
  private
63
109
 
110
+ # Calculates the distance threshold used to identify semantic boundaries.
111
+ #
112
+ # @param breakpoint_method [Symbol] the method to use (:percentile, :standard_deviation, :interquartile)
113
+ # @param distances [Array<Float>] the cosine distances between adjacent sentences
114
+ # @param opts [Hash] options specific to the chosen method (e.g., :percentile, :percentage)
115
+ #
116
+ # @return [Float] the distance threshold
117
+ # @raise [ArgumentError] if an unsupported breakpoint_method is provided
64
118
  def calculate_breakpoint_threshold(breakpoint_method, distances, **opts)
65
119
  sequence = MoreMath::Sequence.new(distances)
66
120
  case breakpoint_method
@@ -82,6 +136,11 @@ module Documentrix::Documents::Splitters
82
136
  end
83
137
  end
84
138
 
139
+ # Fetches embeddings for a batch of sentences and converts them to
140
+ # Numo::NArray.
141
+ #
142
+ # @param input [Array<String>] the batch of sentences to embed
143
+ # @return [Array<Numo::NArray>] an array of embeddings as Numo arrays
85
144
  def sentence_embeddings(input)
86
145
  @ollama.embed(model: @model, input:, options: @model_options).embeddings.map! {
87
146
  Numo::NArray[*_1]
@@ -1,5 +1,4 @@
1
1
  require 'numo/narray'
2
- require 'digest'
3
2
  require 'kramdown/ansi'
4
3
 
5
4
  class Documentrix::Documents
@@ -33,6 +32,7 @@ require 'documentrix/documents/cache/sqlite_cache'
33
32
  # to prepare text data for embedding and storage in vector databases.
34
33
  module Documentrix::Documents::Splitters
35
34
  end
35
+ require 'documentrix/documents/splitters/common'
36
36
  require 'documentrix/documents/splitters/character'
37
37
  require 'documentrix/documents/splitters/semantic'
38
38
 
@@ -59,6 +59,7 @@ require 'documentrix/documents/splitters/semantic'
59
59
  class Documentrix::Documents
60
60
  include Kramdown::ANSI::Width
61
61
  include Documentrix::Documents::Cache
62
+ include Documentrix::Utils::Digests
62
63
 
63
64
  # Shortcut for Documentrix::Documents::Cache::Records::Record
64
65
  Record = Class.new Documentrix::Documents::Cache::Records::Record
@@ -116,16 +117,16 @@ class Documentrix::Documents
116
117
  texts
117
118
  end
118
119
 
119
-
120
- # The method adds new texts `texts` to the documents collection by
120
+ # The add method adds new texts `texts` to the documents collection by
121
121
  # processing them through various stages. It first filters out existing texts
122
122
  # from the input array using the `prepare_texts` method, then fetches
123
123
  # embeddings for each text using the specified model and options. The fetched
124
124
  # embeddings are used to create a new record in the cache, which is
125
- # associated with the original text and tags (if any). The method processes
126
- # the texts in batches of size , displaying progress information
127
- # in the console. It also accepts an optional string to associate
128
- # with the added texts and an array of to attach to each record. Once
125
+ # associated with the original text, tags, and version digest (if any). The
126
+ # method processes the texts in batches of size `batch_size`, displaying
127
+ # progress information in the console. It also accepts an optional `source`
128
+ # string to associate with the added texts, an array of `tags` to attach to
129
+ # each record, and an optional `digest` string for version tracking. Once
129
130
  # all texts have been processed, it returns the `Documentrix::Documents`
130
131
  # instance itself, allowing for method chaining.
131
132
  #
@@ -133,14 +134,17 @@ class Documentrix::Documents
133
134
  # @param batch_size [Integer] the number of texts to process in one batch
134
135
  # @param source [String] the source URL for the added texts
135
136
  # @param tags [Array] an array of tags associated with the added texts
137
+ # @param digest [String, nil] the SHA256 hexadecimal digest of the source
136
138
  #
137
139
  # @example
138
140
  # documents.add(%w[ foo bar ], batch_size: 23, source: 'https://example.com', tags: %w[tag1 tag2])
139
141
  #
140
142
  # @return [Documentrix::Documents] self
141
- def add(texts, batch_size: nil, source: nil, tags: [])
142
- texts = prepare_texts(texts) or return self
143
- tags = Documentrix::Utils::Tags.new(tags, source:)
143
+ def add(texts, batch_size: nil, source: nil, tags: [], digest: nil)
144
+ texts = prepare_texts(texts) or return self
145
+ source = normalize_source(source)
146
+ tags = Documentrix::Utils::Tags.new(tags, source:)
147
+ digest ||= compute_file_digest(source)
144
148
  if source
145
149
  tags.add(File.basename(source).gsub(/\?.*/, ''), source:)
146
150
  end
@@ -153,7 +157,7 @@ class Documentrix::Documents
153
157
  embeddings = fetch_embeddings(model:, options: @model_options, input: batch)
154
158
  batch.zip(embeddings) do |text, embedding|
155
159
  norm = @cache.norm(embedding)
156
- self[text] = Record[text:, embedding:, norm:, source:, tags: tags.to_a]
160
+ self[text] = Record[text:, embedding:, norm:, source:, tags: tags.to_a, digest:]
157
161
  end
158
162
  infobar.progress by: batch.size
159
163
  end
@@ -219,13 +223,101 @@ class Documentrix::Documents
219
223
  self
220
224
  end
221
225
 
222
- # The remove method removes all documents associated with the given source.
226
+ # Normalizes the source identifier to a canonical form.
227
+ #
228
+ # If the source is blank, returns nil.
229
+ # If the source is an absolute URL, it is returned as-is.
230
+ # If the source is a local file path that exists, it is expanded to its real
231
+ # path, i.e. an absolute path with all symlinks resolved.
232
+ # Otherwise, the original source is returned.
233
+ #
234
+ # @param source [String, #to_s] the source identifier to normalize
235
+ # @return [String, nil] the normalized canonical path, the original source,
236
+ # or nil if blank
237
+ def normalize_source(source)
238
+ source.blank? and return
239
+ begin
240
+ URI::PARSER.parse(source).absolute? and return source
241
+ rescue
242
+ end
243
+ Pathname.new(source).realpath.to_path
244
+ rescue Errno::ENOENT
245
+ source
246
+ end
247
+
248
+ # The source_exist? method checks if any records associated with the given
249
+ # source exist in the cache. If a digest is provided, it verifies if the
250
+ # source exists and satisfies the comparison with the specified digest.
251
+ #
252
+ # @param source [#to_s] the source to check for existence
253
+ # @param digest [String, nil] the SHA256 hexadecimal digest to compare
254
+ # against the stored source digest (optional)
255
+ # @param operator [Symbol, String] the operator to compare the digest with
256
+ # (defaults to '=')
257
+ #
258
+ # @return [Boolean] true if the source exists (and satisfies the digest
259
+ # comparison if provided), false otherwise.
260
+ def source_exist?(source, digest: nil, operator: ?=)
261
+ source = normalize_source(source)
262
+ @cache.source_exist?(source, digest:, operator:)
263
+ end
264
+
265
+ # Checks if the content of the given source has been modified compared to
266
+ # the version stored in the cache, or if it is missing from the cache.
267
+ #
268
+ # The source is considered modified (the method returns true) if:
269
+ # 1. The source is blank or cannot be normalized.
270
+ # 2. The source is not a valid local file or its digest cannot be computed.
271
+ # 3. No records exist in the cache for this source.
272
+ # 4. Records exist in the cache for this source, but they have a different
273
+ # digest than the current version on disk.
274
+ #
275
+ # @param source [String, #to_s] the source identifier to check
276
+ # @return [Boolean] true if the source is modified, missing, or cannot be
277
+ # verified, false if it is up-to-date.
278
+ def source_modified?(source)
279
+ source = normalize_source(source) or return true
280
+ digest = compute_file_digest(source) or return true
281
+ !source_exist?(source) || source_exist?(source, digest:, operator: '!=')
282
+ end
283
+
284
+ # Updates the records associated with a given source.
285
+ #
286
+ # If the source already exists in the cache, this method computes its current
287
+ # digest and removes only the stale records that do not match this digest. If
288
+ # the source is new or has been modified, it adds the provided texts to the
289
+ # cache.
290
+ #
291
+ # @param texts [Array] the text strings to add if the source is new or modified
292
+ # @param opts [Hash] additional options passed to #add (e.g., :batch_size, :tags)
293
+ # * :source [#to_s] the source to update
294
+ #
295
+ # @return [Documentrix::Documents, nil] the instance itself if the source
296
+ # was added/updated, or nil if the source was already up-to-date.
297
+ def source_update(texts, **opts)
298
+ if source = normalize_source(opts[:source]) and source_exist?(source)
299
+ digest = compute_file_digest(source)
300
+ source_remove(source, digest:)
301
+ unless source_exist?(source, digest:, operator: ?=)
302
+ opts[:digest] = digest
303
+ add(texts, **opts)
304
+ end
305
+ else
306
+ add(texts, **opts)
307
+ end
308
+ end
309
+
310
+ # The source_remove method removes all documents associated with the given
311
+ # source.
223
312
  #
224
- # @param source [String] the source of the documents to remove
313
+ # @param source [#to_s] the source of the documents to remove
314
+ # @param digest [String, nil] the SHA256 hexadecimal digest for which records
315
+ # with this source are **not** to be removed if given.
225
316
  #
226
317
  # @return [Documentrix::Documents] self
227
- def remove(source)
228
- @cache.clear_by_source(source)
318
+ def source_remove(source, digest: nil)
319
+ source = normalize_source(source)
320
+ @cache.clear_by_source(source, digest:, operator: '!=')
229
321
  self
230
322
  end
231
323
 
@@ -236,14 +328,16 @@ class Documentrix::Documents
236
328
  # @param tags [Array<String>] an array of tags to filter results by (optional)
237
329
  # @param prompt [String] a prompt to use when searching for similar strings (optional)
238
330
  # @param max_records [Integer] the maximum number of records to return (optional)
331
+ # @param min_similarity [Numeric] the minimum similarity score to include in results (defaults to -1)
239
332
  #
240
333
  # @example
241
334
  # documents.find("foo")
242
335
  #
243
336
  # @return [Array<Documentrix::Documents::Record>]
244
- def find(string, tags: nil, prompt: nil, max_records: nil)
337
+ def find(string, tags: nil, prompt: nil, max_records: nil, min_similarity: nil)
338
+ min_similarity ||= -1
245
339
  needle = convert_to_vector(string, prompt:)
246
- @cache.find_records(needle, tags:, max_records: nil)
340
+ @cache.find_records(needle, tags:, max_records:, min_similarity:)
247
341
  end
248
342
 
249
343
  # The method filters the records returned by find based on text
@@ -256,20 +350,28 @@ class Documentrix::Documents
256
350
  # @example
257
351
  # documents.find_where('foo', text_size: 3, text_count: 1)
258
352
  # @return [Array<Documentrix::Documents::Record>] the filtered records
353
+
354
+ # The find_where method filters the records returned by find based on text
355
+ # size and count.
356
+ #
357
+ # @param string [String] the search query
358
+ # @param text_size [Integer] the maximum allowed total text size to return
359
+ # @param text_count [Integer] the maximum number of records to return
360
+ # @param opts [Hash] additional options passed to #find, such as:
361
+ # * :tags [Array<String>] filter results by tags
362
+ # * :prompt [String] a prompt to use for the search
363
+ # * :min_similarity [Numeric] minimum similarity score
364
+ #
365
+ # @example
366
+ # documents.find_where('foo', text_size: 1000, text_count: 5, tags: ['ruby'])
367
+ #
368
+ # @return [Array<Documentrix::Documents::Record>] the filtered records
259
369
  def find_where(string, text_size: nil, text_count: nil, **opts)
260
- if text_count
261
- opts[:max_records] = text_count
262
- end
370
+ text_count and opts[:max_records] = text_count
263
371
  records = find(string, **opts)
264
- size, count = 0, 0
372
+ size = 0
265
373
  records.take_while do |record|
266
- if text_size and (size += record.text.size) > text_size
267
- next false
268
- end
269
- if text_count and (count += 1) > text_count
270
- next false
271
- end
272
- true
374
+ !text_size || (size += record.text.size) <= text_size
273
375
  end
274
376
  end
275
377
 
@@ -333,6 +435,8 @@ class Documentrix::Documents
333
435
  debug: @debug
334
436
  )
335
437
  end
438
+ rescue => e
439
+ warn "Caught #{e.class}: #{e}"
336
440
  ensure
337
441
  cache ||= MemoryCache.new(prefix:,)
338
442
  return cache
@@ -389,6 +493,6 @@ class Documentrix::Documents
389
493
  #
390
494
  # @return [String] the SHA256 hash of the input string
391
495
  def key(input)
392
- Digest::SHA256.hexdigest(input)
496
+ compute_digest(input)
393
497
  end
394
498
  end
@@ -4,33 +4,37 @@ require 'kramdown/ansi'
4
4
  # A utility class for colorizing and formatting text output with ANSI color
5
5
  # codes and size information.
6
6
  #
7
- # The ColorizeTexts class takes an array of text strings and formats them with
8
- # different ANSI colors for visual distinction. It also appends the size of each
9
- # text block to the output, making it useful for debugging or displaying
10
- # information about text chunks in a visually appealing way.
7
+ # The ColorizeTexts class takes a collection of text strings and formats them
8
+ # with dynamically generated ANSI colors for visual distinction. Each text
9
+ # block is wrapped to fit the terminal width and appended with its size in
10
+ # bytes, making it ideal for debugging text-splitting pipelines.
11
11
  #
12
12
  # @example
13
- # colorizer = Documentrix::Utils::ColorizeTexts.new('foo', 'bar')
13
+ # colorizer = Documentrix::Utils::ColorizeTexts.new('First chunk', 'Second chunk')
14
14
  # puts colorizer.to_s
15
15
  class Documentrix::Utils::ColorizeTexts
16
16
  include Math
17
17
  include Term::ANSIColor
18
18
  include Kramdown::ANSI::Width
19
19
 
20
- # Initializes a new instance of Documentrix::::ColorizeTexts
20
+ # Initializes a new instance of ColorizeTexts.
21
21
  #
22
- # @param [Array<String>] texts the array of strings to be displayed with colors
22
+ # @param texts [String, Array<String>] a variable list of strings or an array
23
+ # of strings to be colorized.
23
24
  #
24
- # @return [Documentrix::::ColorizeTexts] an instance of Documentrix::::ColorizeTexts
25
+ # @return [Documentrix::Utils::ColorizeTexts] a new instance of ColorizeTexts
25
26
  def initialize(*texts)
26
- texts = texts.map(&:to_a)
27
- @texts = Array(texts.flatten)
27
+ @texts = texts.flatten
28
28
  end
29
29
 
30
- # Returns a string representation of the object, including all texts content,
31
- # colored differently and their sizes.
30
+ # Returns a formatted string representation of the texts.
32
31
  #
33
- # @return [String] The formatted string.
32
+ # Each text block is:
33
+ # 1. Assigned a color from a trigonometric RGB gradient.
34
+ # 2. Wrapped to 90% of the terminal width.
35
+ # 3. Appended with its size in bold text.
36
+ #
37
+ # @return [String] the colorized and formatted output string.
34
38
  def to_s
35
39
  result = +''
36
40
  @texts.each_with_index do |t, i|
@@ -45,14 +49,13 @@ class Documentrix::Utils::ColorizeTexts
45
49
 
46
50
  private
47
51
 
48
- # Returns the nearest RGB color to the given ANSI color
52
+ # Determines the optimal text color (black or white) for a given background
53
+ # color to ensure maximum readability based on contrast.
49
54
  #
50
- # @param [color] color The ANSI color attribute
55
+ # @param color [Symbol, Term::ANSIColor::Attribute] the ANSI color attribute
51
56
  #
52
- # @return [Array<RGBTriple>] An array containing two RGB colors, one for black and
53
- # one for white text, where the first is the closest match to the input color
54
- # when printed on a black background, and the second is the closest match
55
- # when printed on a white background.
57
+ # @return the single candidate color, chosen with max_by, whose distance
58
+ # to the given color is largest (i.e. the one with the best contrast).
56
59
  def text_color(color)
57
60
  color = Term::ANSIColor::Attribute[color]
58
61
  [
@@ -61,9 +64,10 @@ class Documentrix::Utils::ColorizeTexts
61
64
  ].max_by { |t| t.distance_to(color) }
62
65
  end
63
66
 
64
- # Returns an array of colors for each step in the gradient
67
+ # Generates a 256-color RGB gradient using sine wave oscillations.
65
68
  #
66
- # @return [Array<Array<Integer>>] An array of RGB color arrays
69
+ # @return [Array<Array<Integer>>] an array of 256 RGB color arrays,
70
+ # where each inner array contains [R, G, B] values from 0 to 255.
67
71
  def colors
68
72
  @colors ||= (0..255).map { |i|
69
73
  [
@@ -0,0 +1,78 @@
1
+ require 'digest'
2
+ require 'uri'
3
+
4
+ # Module for computing cryptographic digests used for tracking content changes.
5
+ module Documentrix::Utils::Digests
6
+ private
7
+
8
+ @@file_digest_cache = {}
9
+
10
+ # Computes the SHA256 hexadecimal digest of the given text.
11
+ #
12
+ # @param text [String] the text to be hashed
13
+ # @return [String] the SHA256 hexadecimal digest
14
+ def compute_digest(text)
15
+ Digest::SHA256.hexdigest(text)
16
+ end
17
+
18
+ # Stores a computed digest in the internal cache, keyed by the filename
19
+ # and the file's modification time.
20
+ #
21
+ # @param filename [String] the path to the file
22
+ # @param stat [File::Stat] the status information of the file
23
+ # @param digest [String] the SHA256 digest to store
24
+ # @return [void]
25
+ def file_digest_store(filename, stat, digest)
26
+ @@file_digest_cache[[filename, stat.mtime]] = digest
27
+ end
28
+
29
+ # Checks if a valid digest exists in the internal cache for the given
30
+ # filename and modification time.
31
+ #
32
+ # @param filename [String] the path to the file
33
+ # @param stat [File::Stat] the status information of the file
34
+ # @return [String, nil] the cached digest if found, nil otherwise
35
+ def file_digest_cached?(filename, stat)
36
+ @@file_digest_cache.fetch([filename, stat.mtime], nil)
37
+ end
38
+
39
+ # Clears the internal file digest cache.
40
+ #
41
+ # This removes all stored digests and their associated modification times,
42
+ # forcing subsequent calls to #compute_file_digest to re-read files from
43
+ # disk.
44
+ def file_digest_cache_clear
45
+ @@file_digest_cache&.clear
46
+ end
47
+
48
+ # Computes the SHA256 hexadecimal digest of a local file's content.
49
+ #
50
+ # This method first verifies that the provided filename is not an absolute
51
+ # URL and that the file actually exists on the filesystem before reading
52
+ # and hashing its content. It uses an internal cache to avoid re-reading
53
+ # the file if the modification time has not changed.
54
+ #
55
+ # @param filename [String, #to_s] the path to the local file
56
+ # @return [String, nil] the SHA256 hexadecimal digest if the file is a
57
+ # valid local file and exists, nil otherwise.
58
+ def compute_file_digest(filename)
59
+ filename = filename.to_s
60
+ case
61
+ when !filename.present?
62
+ nil
63
+ when (URI::PARSER.parse(filename).absolute? rescue nil)
64
+ nil
65
+ else
66
+ stat = begin
67
+ File.stat(filename)
68
+ rescue Errno::ENOENT
69
+ end
70
+ stat or return
71
+ if digest = file_digest_cached?(filename, stat)
72
+ digest
73
+ else
74
+ file_digest_store(filename, stat, compute_digest(File.read(filename)))
75
+ end
76
+ end
77
+ end
78
+ end
@@ -13,3 +13,4 @@ end
13
13
  require 'documentrix/utils/colorize_texts'
14
14
  require 'documentrix/utils/math'
15
15
  require 'documentrix/utils/tags'
16
+ require 'documentrix/utils/digests'
@@ -1,6 +1,6 @@
1
1
  module Documentrix
2
2
  # Documentrix version
3
- VERSION = '0.2.0'
3
+ VERSION = '0.3.0'
4
4
  VERSION_ARRAY = VERSION.split('.').map(&:to_i) # :nodoc:
5
5
  VERSION_MAJOR = VERSION_ARRAY[0] # :nodoc:
6
6
  VERSION_MINOR = VERSION_ARRAY[1] # :nodoc:
@@ -1,5 +1,3 @@
1
- require 'spec_helper'
2
-
3
1
  describe 'Documentrix::Documents::Cache Interface' do
4
2
  describe 'MemoryCache Interface' do
5
3
  let(:cache) { Documentrix::Documents::MemoryCache.new(prefix: 'test-') }
@@ -55,13 +53,22 @@ describe 'Documentrix::Documents::Cache Interface' do
55
53
  expect(cache).to respond_to(:clear_by_source)
56
54
  expect(cache.method(:clear_by_source).owner).to eq Documentrix::Documents::Cache::Common
57
55
 
56
+ expect(cache).to respond_to(:source_exist?)
57
+ expect(cache.method(:source_exist?).owner).to eq Documentrix::Documents::Cache::Common
58
+
58
59
  expect(cache).to respond_to(:clear)
59
60
  expect(cache.method(:clear).owner).to eq Documentrix::Documents::Cache::Common
60
61
  end
61
62
  end
62
63
 
63
64
  describe 'RedisCache Interface' do
64
- let(:cache) { Documentrix::Documents::RedisCache.new(prefix: 'test-', url: 'redis://localhost:6379') }
65
+ let :object_class do
66
+ Documentrix::Documents::Cache::Records::Record
67
+ end
68
+
69
+ let(:cache) do
70
+ Documentrix::Documents::RedisCache.new(prefix: 'test-', url: 'redis://localhost:6379', object_class:)
71
+ end
65
72
 
66
73
  it 'has proper method resolution' do
67
74
  # Basic cache operations
@@ -114,6 +121,9 @@ describe 'Documentrix::Documents::Cache Interface' do
114
121
  expect(cache).to respond_to(:clear_by_source)
115
122
  expect(cache.method(:clear_by_source).owner).to eq Documentrix::Documents::Cache::Common
116
123
 
124
+ expect(cache).to respond_to(:source_exist?)
125
+ expect(cache.method(:source_exist?).owner).to eq Documentrix::Documents::Cache::Common
126
+
117
127
  expect(cache).to respond_to(:clear)
118
128
  expect(cache.method(:clear).owner).to eq Documentrix::Documents::Cache::Common
119
129
 
@@ -177,6 +187,9 @@ describe 'Documentrix::Documents::Cache Interface' do
177
187
  expect(cache).to respond_to(:clear_by_source)
178
188
  expect(cache.method(:clear_by_source).owner).to eq Documentrix::Documents::Cache::SQLiteCache
179
189
 
190
+ expect(cache).to respond_to(:source_exist?)
191
+ expect(cache.method(:source_exist?).owner).to eq Documentrix::Documents::Cache::SQLiteCache
192
+
180
193
  expect(cache).to respond_to(:clear)
181
194
  expect(cache.method(:clear).owner).to eq Documentrix::Documents::Cache::Common
182
195
 
@@ -1,5 +1,3 @@
1
- require 'spec_helper'
2
-
3
1
  describe Documentrix::Documents::MemoryCache do
4
2
  let :prefix do
5
3
  'test-'
@@ -135,4 +133,68 @@ describe Documentrix::Documents::MemoryCache do
135
133
  cache['foo'] = 'bar'
136
134
  expect(cache.to_a).to eq [ %W[ #{prefix}foo bar ] ]
137
135
  end
136
+
137
+ it 'can iterate over unique sources' do
138
+ cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', embedding: [0.1]]
139
+ cache['bar'] = Documentrix::Documents::Record[text: 'bar', source: 's1', embedding: [0.1]]
140
+ cache['baz'] = Documentrix::Documents::Record[text: 'baz', source: 's2', embedding: [0.1]]
141
+
142
+ expect(cache.each_source.to_a).to match_array(['s1', 's2'])
143
+ end
144
+
145
+ it 'can retrieve all unique tags' do
146
+ cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', tags: ['a', 'b'], embedding: [0.1]]
147
+ cache['bar'] = Documentrix::Documents::Record[text: 'bar', source: 's2', tags: ['b', 'c'], embedding: [0.1]]
148
+
149
+ expect(cache.tags.to_a).to match_array(['a', 'b', 'c'])
150
+ end
151
+
152
+ it 'can clear records by tags' do
153
+ cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', tags: ['keep'], embedding: [0.1]]
154
+ cache['bar'] = Documentrix::Documents::Record[text: 'bar', source: 's1', tags: ['trash'], embedding: [0.1]]
155
+
156
+ expect {
157
+ cache.clear_for_tags(['trash'])
158
+ }.to change { cache.size }.from(2).to(1)
159
+ expect(cache.key?('foo')).to be true
160
+ expect(cache.key?('bar')).to be false
161
+ end
162
+
163
+ it 'can check if a source exists' do
164
+ cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', embedding: [0.1]]
165
+
166
+ expect(cache.source_exist?('s1')).to be true
167
+ expect(cache.source_exist?('s2')).to be false
168
+ end
169
+
170
+ it 'can clear by source with a specific digest' do
171
+ cache['f1'] = Documentrix::Documents::Record[text: 'v1', source: 's1', digest: 'd1', embedding: [0.1]]
172
+ cache['f2'] = Documentrix::Documents::Record[text: 'v2', source: 's1', digest: 'd2', embedding: [0.1]]
173
+
174
+ expect {
175
+ cache.clear_by_source('s1', digest: 'd1')
176
+ }.to change { cache.size }.from(2).to(1)
177
+ expect(cache.key?('f2')).to be true
178
+ expect(cache.key?('f1')).to be false
179
+ end
180
+
181
+ it 'can clear outdated versions of a source' do
182
+ cache['f1'] = Documentrix::Documents::Record[text: 'v1', source: 's1', digest: 'd1', embedding: [0.1]]
183
+ cache['f2'] = Documentrix::Documents::Record[text: 'v2', source: 's1', digest: 'd2', embedding: [0.1]]
184
+
185
+ expect {
186
+ cache.clear_by_source('s1', digest: 'd2', operator: '!=')
187
+ }.to change { cache.size }.from(2).to(1)
188
+ expect(cache.key?('f2')).to be true
189
+ expect(cache.key?('f1')).to be false
190
+ end
191
+
192
+ it 'can check if a source exists with a specific digest' do
193
+ cache['f1'] = Documentrix::Documents::Record[text: 'v1', source: 's1', digest: 'd1', embedding: [0.1]]
194
+
195
+ expect(cache.source_exist?('s1', digest: 'd1')).to be true
196
+ expect(cache.source_exist?('s1', digest: 'd2')).to be false
197
+ expect(cache.source_exist?('s1', digest: 'd1', operator: '!=')).to be false
198
+ expect(cache.source_exist?('s1', digest: 'd2', operator: '!=')).to be true
199
+ end
138
200
  end