RubyGems - documentrix - Versions diffs - 0.2.0 → 0.3.0 - Mend

documentrix 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

checksums.yaml +4 -4
data/CHANGES.md +69 -0
data/documentrix.gemspec +5 -5
data/lib/documentrix/documents/cache/common.rb +63 -11
data/lib/documentrix/documents/cache/records.rb +1 -1
data/lib/documentrix/documents/cache/redis_cache.rb +3 -3
data/lib/documentrix/documents/cache/sqlite_cache.rb +95 -27
data/lib/documentrix/documents/splitters/character.rb +56 -4
data/lib/documentrix/documents/splitters/common.rb +38 -0
data/lib/documentrix/documents/splitters/semantic.rb +67 -8
data/lib/documentrix/documents.rb +133 -29
data/lib/documentrix/utils/colorize_texts.rb +25 -21
data/lib/documentrix/utils/digests.rb +78 -0
data/lib/documentrix/utils.rb +1 -0
data/lib/documentrix/version.rb +1 -1
data/spec/documentrix/documents/cache/interface_spec.rb +16 -3
data/spec/documentrix/documents/cache/memory_cache_spec.rb +64 -2
data/spec/documentrix/documents/cache/redis_cache_spec.rb +68 -19
data/spec/documentrix/documents/cache/sqlite_cache_spec.rb +128 -2
data/spec/documentrix/documents/splitters/character_spec.rb +20 -2
data/spec/documentrix/documents/splitters/semantic_spec.rb +17 -5
data/spec/documents_spec.rb +59 -3
data/spec/utils/colorize_texts_spec.rb +0 -2
data/spec/utils/digests_spec.rb +97 -0
data/spec/utils/tags_spec.rb +0 -2
metadata +7 -1

data/lib/documentrix/documents/splitters/semantic.rb CHANGED Viewed

@@ -1,14 +1,60 @@
 module Documentrix::Documents::Splitters
+  # Semantic splitter that divides text based on thematic changes in meaning.
+  #
+  # It works by splitting text into sentences, computing embeddings for each,
+  # and then calculating the cosine distance between adjacent sentences.
+  # Where the distance exceeds a calculated threshold (the "breakpoint"),
+  # a semantic boundary is identified.
+  #
+  # @example
+  #   splitter = Documentrix::Documents::Splitters::Semantic.new(
+  #     ollama: ollama_client,
+  #     model: 'mxbai-embed-large'
+  #   )
+  #   chunks = splitter.split(text, breakpoint: :percentile, percentile: 90)
   class Semantic
+    include Documentrix::Documents::Splitters::Common
     include Documentrix::Utils::Math
-    DEFAULT_SEPARATOR = /[.!?]\s*(?:\b|\z)/
+    # The default regex used to identify split points for semantic
+    # splitting. It matches a sentence- or clause-ending punctuation mark
+    # (., !, ?, comma, or semicolon) followed by optional whitespace at a
+    # word boundary or the end of the string.
+    #
+    # @return [Regexp]
+    DEFAULT_SEPARATOR = /[.!?,;]\s*(?:\b|\z)/
-    def initialize(ollama:, model:, model_options: nil, separator: DEFAULT_SEPARATOR, chunk_size: 4096)
-      @ollama, @model, @model_options, @separator, @chunk_size =
-        ollama, model, model_options, separator, chunk_size
+    # Initializes a new Semantic splitter.
+    #
+    # @param ollama [Ollama::Client] the client used for generating embeddings
+    # @param model [String] the embedding model name
+    # @param model_options [Hash, nil] optional parameters passed to the embedding model
+    # @param separator [Regexp] the regex used to identify sentence boundaries
+    # @param chunk_size [Integer] the maximum character length of a resulting chunk
+    # @param force [Boolean] whether to force split chunks that exceed chunk_size (defaults to false)
+    def initialize(ollama:, model:, model_options: nil, separator: DEFAULT_SEPARATOR, chunk_size: 4096, force: false)
+      @ollama, @model, @model_options, @separator, @chunk_size, @force =
+        ollama, model, model_options, separator, chunk_size, force
     end
+    # Splits the given text into semantic chunks.
+    #
+    # The method first decomposes the text into sentences, then identifies gaps
+    # in semantic similarity. It then groups these sentences into chunks that
+    # respect both the semantic boundaries and the maximum chunk size.
+    #
+    # @param text [String] the text to be split
+    # @param batch_size [Integer] the number of sentences to embed in a single API call
+    # @param breakpoint [Symbol] the method used to determine the distance threshold
+    #   * :percentile (default) - uses the N-th percentile of distances
+    #   * :standard_deviation - uses mean + (std_dev * multiplier)
+    #   * :interquartile - uses mean + (iqr * multiplier)
+    # @param opts [Hash] additional options for the splitting process:
+    #   * :include_separator [Boolean] whether to keep the sentence separator in the result
+    #   * :percentile [Integer] the percentile to use if breakpoint is :percentile (default: 95)
+    #   * :percentage [Integer] the multiplier percentage for :standard_deviation or :interquartile (default: 100)
+    #
+    # @return [Array<String>] an array of semantically grouped text chunks
     def split(text, batch_size: 100, breakpoint: :percentile, **opts)
       sentences  = Documentrix::Documents::Splitters::Character.new(
         separator: @separator,
@@ -45,22 +91,30 @@ module Documentrix::Documents::Splitters
           if current_text.size + sentence.size < @chunk_size
             current_text += sentence
           else
-            current_text.empty? or result << current_text
+            result.concat(force_split(current_text))
             current_text = sentence
           end
         end
-        unless current_text.empty?
-          result << current_text
+        if current_text.present?
+          result.concat(force_split(current_text))
           current_text = +''
         end
         sg = g.succ
       end
-      current_text.empty? or result << current_text
+      result.concat(force_split(current_text))
       result
     end
     private
+    # Calculates the distance threshold used to identify semantic boundaries.
+    #
+    # @param breakpoint_method [Symbol] the method to use (:percentile, :standard_deviation, :interquartile)
+    # @param distances [Array<Float>] the cosine distances between adjacent sentences
+    # @param opts [Hash] options specific to the chosen method (e.g., :percentile, :percentage)
+    #
+    # @return [Float] the distance threshold
+    # @raise [ArgumentError] if an unsupported breakpoint_method is provided
     def calculate_breakpoint_threshold(breakpoint_method, distances, **opts)
       sequence = MoreMath::Sequence.new(distances)
       case breakpoint_method
@@ -82,6 +136,11 @@ module Documentrix::Documents::Splitters
       end
     end
+    # Fetches embeddings for a batch of sentences and converts them to
+    # Numo::NArray.
+    #
+    # @param input [Array<String>] the batch of sentences to embed
+    # @return [Array<Numo::NArray>] an array of embeddings as Numo arrays
     def sentence_embeddings(input)
       @ollama.embed(model: @model, input:, options: @model_options).embeddings.map! {
         Numo::NArray[*_1]

data/lib/documentrix/documents.rb CHANGED Viewed

@@ -1,5 +1,4 @@
 require 'numo/narray'
-require 'digest'
 require 'kramdown/ansi'
 class Documentrix::Documents
@@ -33,6 +32,7 @@ require 'documentrix/documents/cache/sqlite_cache'
 # to prepare text data for embedding and storage in vector databases.
 module Documentrix::Documents::Splitters
 end
+require 'documentrix/documents/splitters/common'
 require 'documentrix/documents/splitters/character'
 require 'documentrix/documents/splitters/semantic'
@@ -59,6 +59,7 @@ require 'documentrix/documents/splitters/semantic'
 class Documentrix::Documents
   include Kramdown::ANSI::Width
   include Documentrix::Documents::Cache
+  include Documentrix::Utils::Digests
   # Shortcut for Documentrix::Documents::Cache::Records::Record
   Record = Class.new Documentrix::Documents::Cache::Records::Record
@@ -116,16 +117,16 @@ class Documentrix::Documents
     texts
   end
-  # The  method adds new texts `texts` to the documents collection by
+  # The add method adds new texts `texts` to the documents collection by
   # processing them through various stages. It first filters out existing texts
   # from the input array using the `prepare_texts` method, then fetches
   # embeddings for each text using the specified model and options. The fetched
   # embeddings are used to create a new record in the cache, which is
-  # associated with the original text and tags (if any). The method processes
-  # the texts in batches of size , displaying progress information
-  # in the console. It also accepts an optional  string to associate
-  # with the added texts and an array of  to attach to each record. Once
+  # associated with the original text, tags, and version digest (if any). The
+  # method processes the texts in batches of size `batch_size`, displaying
+  # progress information in the console. It also accepts an optional `source`
+  # string to associate with the added texts, an array of `tags` to attach to
+  # each record, and an optional `digest` string for version tracking. Once
   # all texts have been processed, it returns the `Documentrix::Documents`
   # instance itself, allowing for method chaining.
   #
@@ -133,14 +134,17 @@ class Documentrix::Documents
   # @param batch_size [Integer] the number of texts to process in one batch
   # @param source [String] the source URL for the added texts
   # @param tags [Array] an array of tags associated with the added texts
+  # @param digest [String, nil] the SHA256 hexadecimal digest of the source
   #
   # @example
   #   documents.add(%w[ foo bar ], batch_size: 23, source: 'https://example.com', tags: %w[tag1 tag2])
   #
   # @return [Documentrix::Documents] self
-  def add(texts, batch_size: nil, source: nil, tags: [])
-    texts = prepare_texts(texts) or return self
-    tags = Documentrix::Utils::Tags.new(tags, source:)
+  def add(texts, batch_size: nil, source: nil, tags: [], digest: nil)
+    texts    = prepare_texts(texts) or return self
+    source   = normalize_source(source)
+    tags     = Documentrix::Utils::Tags.new(tags, source:)
+    digest ||= compute_file_digest(source)
     if source
       tags.add(File.basename(source).gsub(/\?.*/, ''), source:)
     end
@@ -153,7 +157,7 @@ class Documentrix::Documents
       embeddings = fetch_embeddings(model:, options: @model_options, input: batch)
       batch.zip(embeddings) do |text, embedding|
         norm       = @cache.norm(embedding)
-        self[text] = Record[text:, embedding:, norm:, source:, tags: tags.to_a]
+        self[text] = Record[text:, embedding:, norm:, source:, tags: tags.to_a, digest:]
       end
       infobar.progress by: batch.size
     end
@@ -219,13 +223,101 @@ class Documentrix::Documents
     self
   end
-  # The remove method removes all documents associated with the given source.
+  # Normalizes the source identifier to a canonical form.
+  #
+  # If the source is blank, returns nil.
+  # If the source is an absolute URL, it is returned as-is.
+  # If the source is a local file path that exists, it is expanded to its
+  # absolute real path, with all symlinks resolved.
+  # If the path does not exist (Errno::ENOENT), the original source is returned.
+  #
+  # @param source [String, #to_s] the source identifier to normalize
+  # @return [String, nil] the normalized canonical path, the original source,
+  #   or nil if blank
+  def normalize_source(source)
+    source.blank? and return
+    begin
+      URI::PARSER.parse(source).absolute? and return source
+    rescue
+    end
+    Pathname.new(source).realpath.to_path
+  rescue Errno::ENOENT
+    source
+  end
+  # The source_exist? method checks if any records associated with the given
+  # source exist in the cache. If a digest is provided, it verifies if the
+  # source exists and satisfies the comparison with the specified digest.
+  #
+  # @param source [#to_s] the source to check for existence
+  # @param digest [String, nil] the SHA256 hexadecimal digest to compare
+  #   against the stored source digest (optional)
+  # @param operator [Symbol, String] the operator to compare the digest with
+  #   (defaults to '=')
+  #
+  # @return [Boolean] true if the source exists (and satisfies the digest
+  #   comparison if provided), false otherwise.
+  def source_exist?(source, digest: nil, operator: ?=)
+    source = normalize_source(source)
+    @cache.source_exist?(source, digest:, operator:)
+  end
+  # Checks if the content of the given source has been modified compared to
+  # the version stored in the cache, or if it is missing from the cache.
+  #
+  # The source is considered modified (the method returns true) if:
+  # 1. The source is blank or cannot be normalized.
+  # 2. The source is not a valid local file or its digest cannot be computed.
+  # 3. No records exist in the cache for this source.
+  # 4. Records exist in the cache for this source, but they have a different
+  #    digest than the current version on disk.
+  #
+  # @param source [String, #to_s] the source identifier to check
+  # @return [Boolean] true if the source is modified, missing, or cannot be
+  #   verified, false if it is up-to-date.
+  def source_modified?(source)
+    source = normalize_source(source) or return true
+    digest = compute_file_digest(source) or return true
+    !source_exist?(source) || source_exist?(source, digest:, operator: '!=')
+  end
+  # Updates the records associated with a given source.
+  #
+  # If the source already exists in the cache, this method computes its current
+  # digest and removes only the stale records that do not match this digest. If
+  # the source is new or has been modified, it adds the provided texts to the
+  # cache.
+  #
+  # @param texts [Array] the text strings to add if the source is new or modified
+  # @param opts [Hash] additional options passed to #add (e.g., :batch_size, :tags)
+  #   * :source [#to_s] the source to update
+  #
+  # @return [Documentrix::Documents, nil] the instance itself if the source
+  #   was added/updated, or nil if the source was already up-to-date.
+  def source_update(texts, **opts)
+    if source = normalize_source(opts[:source]) and source_exist?(source)
+      digest = compute_file_digest(source)
+      source_remove(source, digest:)
+      unless source_exist?(source, digest:, operator: ?=)
+        opts[:digest] = digest
+        add(texts, **opts)
+      end
+    else
+      add(texts, **opts)
+    end
+  end
+  # The source_remove method removes all documents associated with the given
+  # source.
   #
-  # @param source [String] the source of the documents to remove
+  # @param source [#to_s] the source of the documents to remove
+  # @param digest [String, nil] the SHA256 hexadecimal digest for which records
+  #   with this source are **not** to be removed if given.
   #
   # @return [Documentrix::Documents] self
-  def remove(source)
-    @cache.clear_by_source(source)
+  def source_remove(source, digest: nil)
+    source = normalize_source(source)
+    @cache.clear_by_source(source, digest:, operator: '!=')
     self
   end
@@ -236,14 +328,16 @@ class Documentrix::Documents
   # @param tags [Array<String>] an array of tags to filter results by (optional)
   # @param prompt [String] a prompt to use when searching for similar strings (optional)
   # @param max_records [Integer] the maximum number of records to return (optional)
+  # @param min_similarity [Numeric] the minimum similarity score to include in results (defaults to -1)
   #
   # @example
   #   documents.find("foo")
   #
   # @return [Array<Documentrix::Documents::Record>]
-  def find(string, tags: nil, prompt: nil, max_records: nil)
+  def find(string, tags: nil, prompt: nil, max_records: nil, min_similarity: nil)
+    min_similarity ||= -1
     needle = convert_to_vector(string, prompt:)
-    @cache.find_records(needle, tags:, max_records: nil)
+    @cache.find_records(needle, tags:, max_records:, min_similarity:)
   end
  # The find_where method filters the records returned by find based on text
@@ -256,20 +350,28 @@ class Documentrix::Documents
   # @example
   #   documents.find_where('foo', text_size: 3, text_count: 1)
   # @return [Array<Documentrix::Documents::Record>] the filtered records
+  # The find_where method filters the records returned by find based on text
+  # size and count.
+  #
+  # @param string [String] the search query
+  # @param text_size [Integer] the maximum allowed total text size to return
+  # @param text_count [Integer] the maximum number of records to return
+  # @param opts [Hash] additional options passed to #find, such as:
+  #   * :tags [Array<String>] filter results by tags
+  #   * :prompt [String] a prompt to use for the search
+  #   * :min_similarity [Numeric] minimum similarity score
+  #
+  # @example
+  #   documents.find_where('foo', text_size: 1000, text_count: 5, tags: ['ruby'])
+  #
+  # @return [Array<Documentrix::Documents::Record>] the filtered records
   def find_where(string, text_size: nil, text_count: nil, **opts)
-    if text_count
-      opts[:max_records] =  text_count
-    end
+    text_count and opts[:max_records] =  text_count
     records = find(string, **opts)
-    size, count = 0, 0
+    size    = 0
     records.take_while do |record|
-      if text_size and (size += record.text.size) > text_size
-        next false
-      end
-      if text_count and (count += 1) > text_count
-        next false
-      end
-      true
+      !text_size || (size += record.text.size) <= text_size
     end
   end
@@ -333,6 +435,8 @@ class Documentrix::Documents
         debug: @debug
       )
     end
+  rescue => e
+    warn "Caught #{e.class}: #{e}"
   ensure
     cache ||= MemoryCache.new(prefix:,)
     return cache
@@ -389,6 +493,6 @@ class Documentrix::Documents
   #
   # @return [String] the SHA256 hash of the input string
   def key(input)
-    Digest::SHA256.hexdigest(input)
+    compute_digest(input)
   end
 end

data/lib/documentrix/utils/colorize_texts.rb CHANGED Viewed

@@ -4,33 +4,37 @@ require 'kramdown/ansi'
 # A utility class for colorizing and formatting text output with ANSI color
 # codes and size information.
 #
-# The ColorizeTexts class takes an array of text strings and formats them with
-# different ANSI colors for visual distinction. It also appends the size of each
-# text block to the output, making it useful for debugging or displaying
-# information about text chunks in a visually appealing way.
+# The ColorizeTexts class takes a collection of text strings and formats them
+# with dynamically generated ANSI colors for visual distinction. Each text
+# block is wrapped to fit the terminal width and appended with its size in
+# bytes, making it ideal for debugging text-splitting pipelines.
 #
 # @example
-#   colorizer = Documentrix::Utils::ColorizeTexts.new('foo', 'bar')
+#   colorizer = Documentrix::Utils::ColorizeTexts.new('First chunk', 'Second chunk')
 #   puts colorizer.to_s
 class Documentrix::Utils::ColorizeTexts
   include Math
   include Term::ANSIColor
   include Kramdown::ANSI::Width
-  # Initializes a new instance of Documentrix::::ColorizeTexts
+  # Initializes a new instance of ColorizeTexts.
   #
-  # @param [Array<String>] texts the array of strings to be displayed with colors
+  # @param texts [String, Array<String>] a variable list of strings or an array
+  #   of strings to be colorized.
   #
-  # @return [Documentrix::::ColorizeTexts] an instance of Documentrix::::ColorizeTexts
+  # @return [Documentrix::Utils::ColorizeTexts] a new instance of ColorizeTexts
   def initialize(*texts)
-    texts  = texts.map(&:to_a)
-    @texts = Array(texts.flatten)
+    @texts = texts.flatten
   end
-  # Returns a string representation of the object, including all texts content,
-  # colored differently and their sizes.
+  # Returns a formatted string representation of the texts.
   #
-  # @return [String] The formatted string.
+  # Each text block is:
+  # 1. Assigned a color from a trigonometric RGB gradient.
+  # 2. Wrapped to 90% of the terminal width.
+  # 3. Appended with its size in bold text.
+  #
+  # @return [String] the colorized and formatted output string.
   def to_s
     result = +''
     @texts.each_with_index do |t, i|
@@ -45,14 +49,13 @@ class Documentrix::Utils::ColorizeTexts
   private
-  # Returns the nearest RGB color to the given ANSI color
+  # Determines the optimal text color (black or white) for a given background
+  # color to ensure maximum readability based on contrast.
   #
-  # @param [color] color The ANSI color attribute
+  # @param color [Symbol, Term::ANSIColor::Attribute] the ANSI color attribute
   #
-  # @return [Array<RGBTriple>] An array containing two RGB colors, one for black and
-  #   one for white text, where the first is the closest match to the input color
-  #   when printed on a black background, and the second is the closest match
-  #   when printed on a white background.
+  # @return [Term::ANSIColor::RGBTriple] the single candidate color (not an
+  #   array) that has the greatest distance to the given color and therefore
+  #   provides the strongest contrast.
   def text_color(color)
     color = Term::ANSIColor::Attribute[color]
     [
@@ -61,9 +64,10 @@ class Documentrix::Utils::ColorizeTexts
     ].max_by { |t| t.distance_to(color) }
   end
-  # Returns an array of colors for each step in the gradient
+  # Generates a 256-color RGB gradient using sine wave oscillations.
   #
-  # @return [Array<Array<Integer>>] An array of RGB color arrays
+  # @return [Array<Array<Integer>>] an array of 256 RGB color arrays,
+  #   where each inner array contains [R, G, B] values from 0 to 255.
   def colors
     @colors ||= (0..255).map { |i|
       [

data/lib/documentrix/utils/digests.rb ADDED Viewed

@@ -0,0 +1,78 @@
+require 'digest'
+require 'uri'
+# Module for computing cryptographic digests used for tracking content changes.
+module Documentrix::Utils::Digests
+  private
+  @@file_digest_cache = {}
+  # Computes the SHA256 hexadecimal digest of the given text.
+  #
+  # @param text [String] the text to be hashed
+  # @return [String] the SHA256 hexadecimal digest
+  def compute_digest(text)
+    Digest::SHA256.hexdigest(text)
+  end
+  # Stores a computed digest in the internal cache, keyed by the filename
+  # and the file's modification time.
+  #
+  # @param filename [String] the path to the file
+  # @param stat [File::Stat] the status information of the file
+  # @param digest [String] the SHA256 digest to store
+  # @return [String] the digest that was stored
+  def file_digest_store(filename, stat, digest)
+    @@file_digest_cache[[filename, stat.mtime]] = digest
+  end
+  # Checks if a valid digest exists in the internal cache for the given
+  # filename and modification time.
+  #
+  # @param filename [String] the path to the file
+  # @param stat [File::Stat] the status information of the file
+  # @return [String, nil] the cached digest if found, nil otherwise
+  def file_digest_cached?(filename, stat)
+    @@file_digest_cache.fetch([filename, stat.mtime], nil)
+  end
+  # Clears the internal file digest cache.
+  #
+  # This removes all stored digests and their associated modification times,
+  # forcing subsequent calls to #compute_file_digest to re-read files from
+  # disk.
+  def file_digest_cache_clear
+    @@file_digest_cache&.clear
+  end
+  # Computes the SHA256 hexadecimal digest of a local file's content.
+  #
+  # This method first verifies that the provided filename is not an absolute
+  # URL and that the file actually exists on the filesystem before reading
+  # and hashing its content. It uses an internal cache to avoid re-reading
+  # the file if the modification time has not changed.
+  #
+  # @param filename [String, #to_s] the path to the local file
+  # @return [String, nil] the SHA256 hexadecimal digest if the file is a
+  #   valid local file and exists, nil otherwise.
+  def compute_file_digest(filename)
+    filename = filename.to_s
+    case
+    when !filename.present?
+      nil
+    when (URI::PARSER.parse(filename).absolute? rescue nil)
+      nil
+    else
+      stat = begin
+               File.stat(filename)
+             rescue Errno::ENOENT
+             end
+      stat or return
+      if digest = file_digest_cached?(filename, stat)
+        digest
+      else
+        file_digest_store(filename, stat, compute_digest(File.read(filename)))
+      end
+    end
+  end
+end

data/lib/documentrix/utils.rb CHANGED Viewed

@@ -13,3 +13,4 @@ end
 require 'documentrix/utils/colorize_texts'
 require 'documentrix/utils/math'
 require 'documentrix/utils/tags'
+require 'documentrix/utils/digests'

data/lib/documentrix/version.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module Documentrix
   # Documentrix version
-  VERSION         = '0.2.0'
+  VERSION         = '0.3.0'
   VERSION_ARRAY   = VERSION.split('.').map(&:to_i) # :nodoc:
   VERSION_MAJOR   = VERSION_ARRAY[0] # :nodoc:
   VERSION_MINOR   = VERSION_ARRAY[1] # :nodoc:

data/spec/documentrix/documents/cache/interface_spec.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-require 'spec_helper'
 describe 'Documentrix::Documents::Cache Interface' do
   describe 'MemoryCache Interface' do
     let(:cache) { Documentrix::Documents::MemoryCache.new(prefix: 'test-') }
@@ -55,13 +53,22 @@ describe 'Documentrix::Documents::Cache Interface' do
       expect(cache).to respond_to(:clear_by_source)
       expect(cache.method(:clear_by_source).owner).to eq Documentrix::Documents::Cache::Common
+      expect(cache).to respond_to(:source_exist?)
+      expect(cache.method(:source_exist?).owner).to eq Documentrix::Documents::Cache::Common
       expect(cache).to respond_to(:clear)
       expect(cache.method(:clear).owner).to eq Documentrix::Documents::Cache::Common
     end
   end
   describe 'RedisCache Interface' do
-    let(:cache) { Documentrix::Documents::RedisCache.new(prefix: 'test-', url: 'redis://localhost:6379') }
+    let :object_class do
+      Documentrix::Documents::Cache::Records::Record
+    end
+    let(:cache) do
+      Documentrix::Documents::RedisCache.new(prefix: 'test-', url: 'redis://localhost:6379', object_class:)
+    end
     it 'has proper method resolution' do
       # Basic cache operations
@@ -114,6 +121,9 @@ describe 'Documentrix::Documents::Cache Interface' do
       expect(cache).to respond_to(:clear_by_source)
       expect(cache.method(:clear_by_source).owner).to eq Documentrix::Documents::Cache::Common
+      expect(cache).to respond_to(:source_exist?)
+      expect(cache.method(:source_exist?).owner).to eq Documentrix::Documents::Cache::Common
       expect(cache).to respond_to(:clear)
       expect(cache.method(:clear).owner).to eq Documentrix::Documents::Cache::Common
@@ -177,6 +187,9 @@ describe 'Documentrix::Documents::Cache Interface' do
       expect(cache).to respond_to(:clear_by_source)
       expect(cache.method(:clear_by_source).owner).to eq Documentrix::Documents::Cache::SQLiteCache
+      expect(cache).to respond_to(:source_exist?)
+      expect(cache.method(:source_exist?).owner).to eq Documentrix::Documents::Cache::SQLiteCache
       expect(cache).to respond_to(:clear)
       expect(cache.method(:clear).owner).to eq Documentrix::Documents::Cache::Common

data/spec/documentrix/documents/cache/memory_cache_spec.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-require 'spec_helper'
 describe Documentrix::Documents::MemoryCache do
   let :prefix do
     'test-'
@@ -135,4 +133,68 @@ describe Documentrix::Documents::MemoryCache do
     cache['foo'] = 'bar'
     expect(cache.to_a).to eq [ %W[ #{prefix}foo bar ] ]
   end
+  it 'can iterate over unique sources' do
+    cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', embedding: [0.1]]
+    cache['bar'] = Documentrix::Documents::Record[text: 'bar', source: 's1', embedding: [0.1]]
+    cache['baz'] = Documentrix::Documents::Record[text: 'baz', source: 's2', embedding: [0.1]]
+    expect(cache.each_source.to_a).to match_array(['s1', 's2'])
+  end
+  it 'can retrieve all unique tags' do
+    cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', tags: ['a', 'b'], embedding: [0.1]]
+    cache['bar'] = Documentrix::Documents::Record[text: 'bar', source: 's2', tags: ['b', 'c'], embedding: [0.1]]
+    expect(cache.tags.to_a).to match_array(['a', 'b', 'c'])
+  end
+  it 'can clear records by tags' do
+    cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', tags: ['keep'], embedding: [0.1]]
+    cache['bar'] = Documentrix::Documents::Record[text: 'bar', source: 's1', tags: ['trash'], embedding: [0.1]]
+    expect {
+      cache.clear_for_tags(['trash'])
+    }.to change { cache.size }.from(2).to(1)
+    expect(cache.key?('foo')).to be true
+    expect(cache.key?('bar')).to be false
+  end
+  it 'can check if a source exists' do
+    cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', embedding: [0.1]]
+    expect(cache.source_exist?('s1')).to be true
+    expect(cache.source_exist?('s2')).to be false
+  end
+  it 'can clear by source with a specific digest' do
+    cache['f1'] = Documentrix::Documents::Record[text: 'v1', source: 's1', digest: 'd1', embedding: [0.1]]
+    cache['f2'] = Documentrix::Documents::Record[text: 'v2', source: 's1', digest: 'd2', embedding: [0.1]]
+    expect {
+      cache.clear_by_source('s1', digest: 'd1')
+    }.to change { cache.size }.from(2).to(1)
+    expect(cache.key?('f2')).to be true
+    expect(cache.key?('f1')).to be false
+  end
+  it 'can clear outdated versions of a source' do
+    cache['f1'] = Documentrix::Documents::Record[text: 'v1', source: 's1', digest: 'd1', embedding: [0.1]]
+    cache['f2'] = Documentrix::Documents::Record[text: 'v2', source: 's1', digest: 'd2', embedding: [0.1]]
+    expect {
+      cache.clear_by_source('s1', digest: 'd2', operator: '!=')
+    }.to change { cache.size }.from(2).to(1)
+    expect(cache.key?('f2')).to be true
+    expect(cache.key?('f1')).to be false
+  end
+  it 'can check if a source exists with a specific digest' do
+    cache['f1'] = Documentrix::Documents::Record[text: 'v1', source: 's1', digest: 'd1', embedding: [0.1]]
+    expect(cache.source_exist?('s1', digest: 'd1')).to be true
+    expect(cache.source_exist?('s1', digest: 'd2')).to be false
+    expect(cache.source_exist?('s1', digest: 'd1', operator: '!=')).to be false
+    expect(cache.source_exist?('s1', digest: 'd2', operator: '!=')).to be true
+  end
 end