RubyGems - ragnar-cli - Versions diffs - 0.1.0.pre.2 → 0.1.0.pre.4 - Mend

ragnar-cli 0.1.0.pre.2 → 0.1.0.pre.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

checksums.yaml +4 -4
data/README.md +187 -36
data/lib/ragnar/cli.rb +543 -172
data/lib/ragnar/cli_visualization.rb +184 -0
data/lib/ragnar/config.rb +226 -0
data/lib/ragnar/database.rb +94 -8
data/lib/ragnar/llm_manager.rb +4 -1
data/lib/ragnar/query_processor.rb +38 -20
data/lib/ragnar/topic_modeling.rb +13 -10
data/lib/ragnar/umap_processor.rb +190 -73
data/lib/ragnar/umap_transform_service.rb +169 -88
data/lib/ragnar/version.rb +1 -1
metadata +43 -22
data/lib/ragnar/topic_modeling/engine.rb +0 -221
data/lib/ragnar/topic_modeling/labeling_strategies.rb +0 -300
data/lib/ragnar/topic_modeling/llm_adapter.rb +0 -131
data/lib/ragnar/topic_modeling/metrics.rb +0 -186
data/lib/ragnar/topic_modeling/term_extractor.rb +0 -170
data/lib/ragnar/topic_modeling/topic.rb +0 -117
data/lib/ragnar/topic_modeling/topic_labeler.rb +0 -61

data/lib/ragnar/topic_modeling.rb CHANGED Viewed

@@ -1,24 +1,27 @@
-# Main entry point for topic modeling functionality
-# Designed for future extraction into a separate gem
+# frozen_string_literal: true
-require_relative 'topic_modeling/topic'
-require_relative 'topic_modeling/term_extractor'
-require_relative 'topic_modeling/metrics'
-require_relative 'topic_modeling/topic_labeler'
-require_relative 'topic_modeling/engine'
+# Topic modeling wrapper that delegates to the Topical gem
+# This maintains backward compatibility while using the extracted library
+require 'topical'
 module Ragnar
   module TopicModeling
+    # Re-export Topical classes for backward compatibility
+    Topic = Topical::Topic
+    Engine = Topical::Engine
+    # Re-export metrics module
+    Metrics = Topical::Metrics
     # Convenience method to create a new topic modeling engine
     def self.new(**options)
-      Engine.new(**options)
+      Topical::Engine.new(**options)
     end
     # Extract topics from embeddings and documents (simple interface)
     def self.extract(embeddings:, documents:, **options)
-      engine = Engine.new(**options)
-      engine.fit(embeddings: embeddings, documents: documents)
+      Topical.extract(embeddings: embeddings, documents: documents, **options)
     end
   end
 end

data/lib/ragnar/umap_processor.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 require 'json'
+require 'time'
 module Ragnar
   class UmapProcessor
@@ -28,6 +29,67 @@ module Ragnar
       puts "Found #{embeddings.size} embeddings"
+      # Validate embeddings
+      embedding_dims = embeddings.map(&:size).uniq
+      if embedding_dims.size > 1
+        puts "  ⚠️  Warning: Inconsistent embedding dimensions found: #{embedding_dims.inspect}"
+        puts "     This may cause errors during UMAP training."
+        # Filter to only embeddings with the most common dimension
+        most_common_dim = embedding_dims.max_by { |dim| embeddings.count { |e| e.size == dim } }
+        embeddings = embeddings.select { |e| e.size == most_common_dim }
+        puts "     Using only embeddings with #{most_common_dim} dimensions (#{embeddings.size} embeddings)"
+      end
+      # Check for nil or invalid values
+      invalid_count = 0
+      nan_count = 0
+      inf_count = 0
+      valid_embeddings = embeddings.select do |embedding|
+        if !embedding.is_a?(Array)
+          invalid_count += 1
+          false
+        elsif embedding.any? { |v| !v.is_a?(Numeric) }
+          invalid_count += 1
+          false
+        elsif embedding.any?(&:nan?)
+          nan_count += 1
+          false
+        elsif embedding.any? { |v| !v.finite? }
+          inf_count += 1
+          false
+        else
+          true
+        end
+      end
+      if valid_embeddings.size < embeddings.size
+        puts "\n  ⚠️  Data quality issues detected:"
+        puts "     • Invalid embeddings: #{invalid_count}" if invalid_count > 0
+        puts "     • Embeddings with NaN: #{nan_count}" if nan_count > 0
+        puts "     • Embeddings with Infinity: #{inf_count}" if inf_count > 0
+        puts "     • Total removed: #{embeddings.size - valid_embeddings.size}"
+        puts "     • Remaining valid: #{valid_embeddings.size}"
+        embeddings = valid_embeddings
+      end
+      if embeddings.empty?
+        raise "No valid embeddings found after validation.\n\n" \
+              "All embeddings contain invalid values (NaN, Infinity, or non-numeric).\n" \
+              "This suggests a problem with the embedding model or indexing process.\n\n" \
+              "Please try:\n" \
+              "  1. Re-indexing your documents: ragnar index <path> --force\n" \
+              "  2. Using a different embedding model\n" \
+              "  3. Checking your document content for unusual characters"
+      end
+      if embeddings.size < 10
+        raise "Too few valid embeddings (#{embeddings.size}) for UMAP training.\n\n" \
+              "UMAP requires at least 10 samples to work effectively.\n" \
+              "Please index more documents or check for data quality issues."
+      end
       # Adjust parameters based on the number of samples
       # UMAP requires n_neighbors < n_samples
       # Also, n_components should be less than n_samples for stability
@@ -55,6 +117,19 @@ module Ragnar
       embedding_matrix = embeddings
       original_dims = embeddings.first.size
+      # Ensure n_components is reasonable
+      if n_components >= original_dims
+        puts "  ⚠️  Warning: n_components (#{n_components}) >= original dimensions (#{original_dims})"
+        n_components = [original_dims / 2, 50].min
+        puts "     Reducing n_components to #{n_components}"
+      end
+      # For very high dimensional data, be more conservative
+      if original_dims > 500 && n_components > 50
+        puts "  ⚠️  Note: High dimensional data (#{original_dims}D) being reduced to #{n_components}D"
+        puts "     Consider using n_components <= 50 for stability"
+      end
       puts "\nTraining UMAP model..."
       puts "  Original dimensions: #{original_dims}"
       puts "  Target dimensions: #{n_components}"
@@ -64,14 +139,45 @@ module Ragnar
       # Perform the actual training using the class-based API
       puts "  Training UMAP model (this may take a moment)..."
-      @umap_instance = ClusterKit::Dimensionality::UMAP.new(
-        n_components: n_components,
-        n_neighbors: n_neighbors
-      )
-      @reduced_embeddings = @umap_instance.fit_transform(embedding_matrix)
-      puts "  ✓ UMAP training complete"
+      begin
+        @umap_instance = ClusterKit::Dimensionality::UMAP.new(
+          n_components: n_components,
+          n_neighbors: n_neighbors
+        )
+        @reduced_embeddings = @umap_instance.fit_transform(embedding_matrix)
+        puts "  ✓ UMAP training complete"
+      rescue => e
+        # Provide helpful error message without exposing internal stack trace
+        error_msg = "\n❌ UMAP training failed\n\n"
+        if e.message.include?("index out of bounds")
+          error_msg += "The UMAP algorithm encountered an index out of bounds error.\n\n"
+          error_msg += "This typically happens when:\n"
+          error_msg += "  • The embedding data contains invalid values (NaN, Infinity)\n"
+          error_msg += "  • The parameters are incompatible with your data\n"
+          error_msg += "  • There are duplicate or corrupted embeddings\n\n"
+          error_msg += "Suggested solutions:\n"
+          error_msg += "  1. Try with more conservative parameters:\n"
+          error_msg += "     ragnar train-umap --n-components 10 --n-neighbors 5\n\n"
+          error_msg += "  2. Re-index your documents to regenerate embeddings:\n"
+          error_msg += "     ragnar index <path> --force\n\n"
+          error_msg += "  3. Check your embedding model configuration\n\n"
+          error_msg += "Current parameters:\n"
+          error_msg += "  • n_components: #{n_components}\n"
+          error_msg += "  • n_neighbors: #{n_neighbors}\n"
+          error_msg += "  • embeddings: #{embeddings.size} samples\n"
+          error_msg += "  • dimensions: #{original_dims}\n"
+        else
+          error_msg += "Error: #{e.message}\n\n"
+          error_msg += "This may be due to incompatible parameters or data issues.\n"
+          error_msg += "Try using more conservative parameters:\n"
+          error_msg += "  ragnar train-umap --n-components 10 --n-neighbors 5\n"
+        end
+        raise RuntimeError, error_msg
+      end
       # Store the parameters for saving
       @model_params = {
@@ -91,10 +197,10 @@ module Ragnar
     end
     def apply(batch_size: 100)
-      # Load the trained UMAP model (reduced embeddings)
-      reduced_embeddings = load_model
+      # Load the trained UMAP model
+      umap_model = load_umap_model
-      puts "Applying saved UMAP embeddings to database..."
+      puts "Applying UMAP transformation to database documents..."
       # Get all embeddings from database
       all_docs = @database.get_embeddings
@@ -109,84 +215,95 @@ module Ragnar
       end
       puts "Found #{all_docs.size} documents in database"
-      puts "Loaded #{reduced_embeddings.size} reduced embeddings from model"
-      if all_docs.size != reduced_embeddings.size
-        puts "⚠️  Warning: Mismatch between database documents (#{all_docs.size}) and model embeddings (#{reduced_embeddings.size})"
-        puts "   This suggests the model was trained on a different dataset."
-        puts "   Please retrain the UMAP model after indexing all your documents."
-        return {
-          processed: 0,
-          skipped: 0,
-          errors: 1
-        }
+      # Process in batches for memory efficiency
+      processed_count = 0
+      error_count = 0
+      skipped_count = 0
+      all_docs.each_slice(batch_size) do |batch|
+        begin
+          # Extract embeddings
+          embeddings = batch.map { |d| d[:embedding] }
+          # Validate embeddings
+          valid_indices = []
+          embeddings_to_transform = []
+          embeddings.each_with_index do |emb, idx|
+            if emb.nil? || !emb.is_a?(Array) || emb.empty?
+              skipped_count += 1
+              next
+            end
+            if emb.any? { |v| !v.is_a?(Numeric) || v.nan? || !v.finite? }
+              skipped_count += 1
+              next
+            end
+            valid_indices << idx
+            embeddings_to_transform << emb
+          end
+          next if embeddings_to_transform.empty?
+          # Transform using the loaded UMAP model
+          reduced_embeddings = umap_model.transform(embeddings_to_transform)
+          # Prepare updates for valid documents
+          updates = valid_indices.map.with_index do |batch_idx, transform_idx|
+            {
+              id: batch[batch_idx][:id],
+              reduced_embedding: reduced_embeddings[transform_idx]
+            }
+          end
+          # Update database
+          @database.update_reduced_embeddings(updates)
+          processed_count += updates.size
+          puts "  Processed batch: #{updates.size} documents transformed"
+        rescue => e
+          puts "  ⚠️  Error processing batch: #{e.message}"
+          error_count += batch.size
+        end
       end
-      # Prepare updates - match document IDs to reduced embeddings
-      updates = all_docs.each_with_index.map do |doc, idx|
-        {
-          id: doc[:id],
-          reduced_embedding: reduced_embeddings[idx]
-        }
-      end
-      puts "Updating database with reduced embeddings..."
-      @database.update_reduced_embeddings(updates)
+      puts "\nUMAP application complete:"
+      puts "  ✓ Processed: #{processed_count} documents"
+      puts "  ⚠️  Skipped: #{skipped_count} documents (invalid embeddings)" if skipped_count > 0
+      puts "  ❌ Errors: #{error_count} documents" if error_count > 0
       {
-        processed: updates.size,
-        skipped: 0,
-        errors: 0
+        processed: processed_count,
+        skipped: skipped_count,
+        errors: error_count
       }
     end
     private
-    def process_batch(docs)
-      # Extract embeddings
-      embeddings = docs.map { |d| d[:embedding] }
-      # Transform using UMAP
-      # The transform method returns a 2D array where each row is a reduced embedding
-      reduced = @umap_model.transform(embeddings)
-      # Prepare updates
-      updates = docs.each_with_index.map do |doc, idx|
-        {
-          id: doc[:id],
-          reduced_embedding: reduced[idx]
-        }
-      end
-      # Update database
-      @database.update_reduced_embeddings(updates)
-    end
     def save_model
-      return unless @umap_instance && @reduced_embeddings
+      return unless @umap_instance
-      # Save the trained UMAP model for transforming new queries
+      # Save the trained UMAP model for transforming new data
       @umap_instance.save_model(@model_path)
       puts "UMAP model saved to: #{@model_path}"
-      # Also cache the reduced embeddings separately for the apply method
-      embeddings_path = @model_path.sub(/\.bin$/, '_embeddings.json')
-      ClusterKit::Dimensionality::UMAP.save_data(@reduced_embeddings, embeddings_path)
-      puts "Reduced embeddings cached to: #{embeddings_path}"
-    end
-    def load_model
-      return @reduced_embeddings if @reduced_embeddings
-      # For the apply method, we need the pre-computed embeddings
-      embeddings_path = @model_path.sub(/\.bin$/, '_embeddings.json')
-      unless File.exist?(embeddings_path)
-        raise "Cached embeddings not found at #{embeddings_path}. Please train a model first."
+      # Save metadata about the training if we have params
+      if @model_params
+        metadata_path = @model_path.sub(/\.bin$/, '_metadata.json')
+        metadata = {
+          trained_at: Time.now.iso8601,
+          n_components: @model_params[:n_components],
+          n_neighbors: @model_params[:n_neighbors],
+          min_dist: @model_params[:min_dist],
+          document_count: @database.get_embeddings.size,
+          model_version: 2  # Version 2: proper transform-based approach
+        }
+        File.write(metadata_path, JSON.pretty_generate(metadata))
+        puts "Model metadata saved to: #{metadata_path}"
       end
-      @reduced_embeddings = ClusterKit::Dimensionality::UMAP.load_data(embeddings_path)
-      puts "Cached embeddings loaded from: #{embeddings_path}"
-      @reduced_embeddings
     end
     def load_umap_model

data/lib/ragnar/umap_transform_service.rb CHANGED Viewed

@@ -1,124 +1,205 @@
+require 'json'
 require 'clusterkit'
 module Ragnar
+  # Service for applying UMAP transformations to embeddings
+  # Separates transformation logic from training (UmapProcessor)
   class UmapTransformService
-    include Singleton
+    attr_reader :model_path, :database
-    def initialize
+    def initialize(model_path: "umap_model.bin", database:)
+      @model_path = model_path
+      @database = database
       @umap_model = nil
-      @model_path = "umap_model.bin"
+      @model_metadata = nil
     end
-    # Transform a query embedding to reduced space using saved UMAP model
-    def transform_query(query_embedding, model_path = nil)
-      # Use the real UMAP model's transform capability
-      model_path ||= @model_path
+    # Transform embeddings for specific documents
+    # @param document_ids [Array<Integer>] IDs of documents to transform
+    # @return [Hash] Results with :processed, :skipped, :errors counts
+    def transform_documents(document_ids)
+      return { processed: 0, skipped: 0, errors: 0 } if document_ids.empty?
-      # Load the model if not already loaded
-      load_model(model_path) unless @umap_model
+      load_model!
-      # Transform the query embedding using the trained UMAP model
-      # The transform method expects a 2D array (even for a single embedding)
-      result = @umap_model.transform([query_embedding])
+      # Fetch documents
+      documents = @database.get_documents_by_ids(document_ids)
-      # Return the first (and only) transformed embedding
-      result.first
-    rescue => e
-      # Fall back to k-NN approximation if model loading fails
-      puts "Warning: Could not use UMAP model for transform: #{e.message}"
-      puts "Falling back to k-NN approximation..."
-      knn_approximate_transform(query_embedding)
-    end
-    # Check if we can do transforms
-    def model_available?(model_path = nil)
-      model_path ||= @model_path
+      if documents.empty?
+        return { processed: 0, skipped: 0, errors: 0 }
+      end
-      # First check if the actual UMAP model file exists
-      if File.exist?(model_path)
-        return true
+      # Extract and validate embeddings
+      valid_docs = []
+      embeddings_to_transform = []
+      skipped_count = 0
+      documents.each do |doc|
+        emb = doc[:embedding]
+        if emb.nil? || !emb.is_a?(Array) || emb.empty?
+          skipped_count += 1
+          next
+        end
+        if emb.any? { |v| !v.is_a?(Numeric) || v.nan? || !v.finite? }
+          skipped_count += 1
+          next
+        end
+        valid_docs << doc
+        embeddings_to_transform << emb
       end
-      # Fallback: check if the database has reduced embeddings for k-NN approximation
-      database = Database.new("./rag_database")
-      stats = database.get_stats
-      stats[:with_reduced_embeddings] > 0
+      return { processed: 0, skipped: skipped_count, errors: 0 } if embeddings_to_transform.empty?
+      # Transform using UMAP
+      begin
+        reduced_embeddings = @umap_model.transform(embeddings_to_transform)
+        # Prepare updates
+        updates = valid_docs.zip(reduced_embeddings).map do |doc, reduced_emb|
+          {
+            id: doc[:id],
+            reduced_embedding: reduced_emb,
+            umap_version: model_version
+          }
+        end
+        # Update database
+        @database.update_reduced_embeddings(updates)
+        { processed: updates.size, skipped: skipped_count, errors: 0 }
+      rescue => e
+        puts "Error transforming documents: #{e.message}"
+        { processed: 0, skipped: skipped_count, errors: valid_docs.size }
+      end
     end
-    private
-    def load_model(model_path)
-      unless File.exist?(model_path)
-        raise "UMAP model not found at #{model_path}. Please train a model first."
+    # Transform a single query embedding
+    # @param embedding [Array<Numeric>] Query embedding to transform
+    # @return [Array<Float>, nil] Reduced embedding or nil if error
+    def transform_query(embedding)
+      return nil if embedding.nil? || !embedding.is_a?(Array) || embedding.empty?
+      # Validate embedding
+      if embedding.any? { |v| !v.is_a?(Numeric) || v.nan? || !v.finite? }
+        puts "Warning: Invalid query embedding (contains NaN or Infinity)"
+        return nil
       end
-      @umap_model = ClusterKit::Dimensionality::UMAP.load_model(model_path)
-      puts "UMAP model loaded for query transformation"
+      load_model!
+      begin
+        # Transform returns array of arrays, get first (and only) result
+        @umap_model.transform([embedding]).first
+      rescue => e
+        puts "Error transforming query: #{e.message}"
+        nil
+      end
     end
-    def knn_approximate_transform(query_embedding)
-      # Fallback k-NN approximation method
-      # Get database stats to know dimensions
-      database = Database.new("./rag_database")
-      stats = database.get_stats
-      # If we don't have reduced embeddings, we can't transform
-      if stats[:with_reduced_embeddings] == 0
-        raise "No reduced embeddings available in database"
-      end
+    # Check if a UMAP model exists
+    # @return [Boolean] true if model file exists
+    def model_exists?
+      File.exist?(@model_path)
+    end
+    # Get metadata about the trained model
+    # @return [Hash, nil] Model metadata or nil if not found
+    def model_metadata
+      return @model_metadata if @model_metadata
-      # Get all documents with their embeddings
-      all_docs = database.get_embeddings
+      metadata_path = @model_path.sub(/\.bin$/, '_metadata.json')
+      return nil unless File.exist?(metadata_path)
-      # Find k nearest neighbors in full embedding space
-      k = 5
-      neighbors = []
+      @model_metadata = JSON.parse(File.read(metadata_path), symbolize_names: true)
+    rescue => e
+      puts "Error loading model metadata: #{e.message}"
+      nil
+    end
+    # Get the version of the current model
+    # @return [Integer] Model version (timestamp of file modification)
+    def model_version
+      return 0 unless File.exist?(@model_path)
+      File.mtime(@model_path).to_i
+    end
+    # Check if model needs retraining based on staleness
+    # @return [Hash] Staleness info with :needs_retraining, :coverage_percentage
+    def check_model_staleness
+      return { needs_retraining: true, coverage_percentage: 0, reason: "No model exists" } unless model_exists?
-      all_docs.each_with_index do |doc, idx|
-        next unless doc[:embedding] && doc[:reduced_embedding]
-        distance = euclidean_distance(query_embedding, doc[:embedding])
-        neighbors << { idx: idx, distance: distance, reduced: doc[:reduced_embedding] }
-      end
+      metadata = model_metadata
+      return { needs_retraining: true, coverage_percentage: 0, reason: "No metadata found" } unless metadata
-      # Sort by distance and take k nearest
-      neighbors.sort_by! { |n| n[:distance] }
-      k_nearest = neighbors.first(k)
+      trained_count = metadata[:document_count] || 0
+      current_count = @database.document_count
-      # Average the reduced embeddings of k nearest neighbors
-      # This is a simple approximation of the transform
-      if k_nearest.empty?
-        raise "No neighbors found for transform"
+      if current_count == 0
+        return { needs_retraining: false, coverage_percentage: 100, reason: "No documents" }
       end
-      reduced_dims = k_nearest.first[:reduced].size
-      averaged = Array.new(reduced_dims, 0.0)
+      coverage = (trained_count.to_f / current_count * 100).round(1)
+      staleness = 100 - coverage
+      {
+        needs_retraining: staleness > 30,
+        coverage_percentage: coverage,
+        trained_documents: trained_count,
+        current_documents: current_count,
+        staleness_percentage: staleness,
+        reason: staleness > 30 ? "Model covers only #{coverage}% of documents" : "Model is up to date"
+      }
+    end
+    private
+    def load_model!
+      return if @umap_model
-      # Weighted average based on inverse distance
-      total_weight = 0.0
-      k_nearest.each do |neighbor|
-        # Use inverse distance as weight (closer = higher weight)
-        weight = 1.0 / (neighbor[:distance] + 0.001) # Add small epsilon to avoid division by zero
-        total_weight += weight
-        neighbor[:reduced].each_with_index do |val, idx|
-          averaged[idx] += val * weight
-        end
+      unless File.exist?(@model_path)
+        raise "UMAP model not found at #{@model_path}. Please train a model first using 'ragnar train-umap'."
       end
-      # Normalize by total weight
-      averaged.map { |val| val / total_weight }
+      @umap_model = ClusterKit::Dimensionality::UMAP.load_model(@model_path)
     end
+  end
+  # Singleton service for backwards compatibility
+  # This allows the old UmapTransformService.instance pattern to work
+  class UmapTransformServiceSingleton
+    include Singleton
-    def euclidean_distance(vec1, vec2)
-      return Float::INFINITY if vec1.size != vec2.size
-      sum = 0.0
-      vec1.each_with_index do |val, idx|
-        diff = val - vec2[idx]
-        sum += diff * diff
+    def initialize
+      @database = Database.new(Config.instance.database_path)
+      @service = UmapTransformService.new(database: @database)
+    end
+    def transform_query(embedding, model_path = nil)
+      if model_path && model_path != @service.model_path
+        # Create a new service with different model path
+        service = UmapTransformService.new(model_path: model_path, database: @database)
+        service.transform_query(embedding)
+      else
+        @service.transform_query(embedding)
       end
-      Math.sqrt(sum)
+    end
+    def model_available?(model_path = nil)
+      if model_path
+        File.exist?(model_path)
+      else
+        @service.model_exists?
+      end
+    end
+  end
+  # For backwards compatibility - old code uses UmapTransformService.instance
+  class << UmapTransformService
+    def instance
+      UmapTransformServiceSingleton.instance
     end
   end
 end

data/lib/ragnar/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Ragnar
-  VERSION = "0.1.0.pre.2"
+  VERSION = "0.1.0.pre.4"
 end