RubyGems - ragnar-cli - Versions diffs - 0.1.0.pre.3 → 0.1.0.pre.4 - Mend

ragnar-cli 0.1.0.pre.3 → 0.1.0.pre.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

checksums.yaml +4 -4
data/README.md +187 -36
data/lib/ragnar/cli.rb +527 -172
data/lib/ragnar/cli_visualization.rb +184 -0
data/lib/ragnar/config.rb +226 -0
data/lib/ragnar/database.rb +94 -8
data/lib/ragnar/llm_manager.rb +4 -1
data/lib/ragnar/query_processor.rb +38 -20
data/lib/ragnar/topic_modeling.rb +13 -10
data/lib/ragnar/umap_processor.rb +77 -65
data/lib/ragnar/umap_transform_service.rb +169 -88
data/lib/ragnar/version.rb +1 -1
metadata +43 -22
data/lib/ragnar/topic_modeling/engine.rb +0 -301
data/lib/ragnar/topic_modeling/labeling_strategies.rb +0 -300
data/lib/ragnar/topic_modeling/llm_adapter.rb +0 -131
data/lib/ragnar/topic_modeling/metrics.rb +0 -186
data/lib/ragnar/topic_modeling/term_extractor.rb +0 -170
data/lib/ragnar/topic_modeling/topic.rb +0 -117
data/lib/ragnar/topic_modeling/topic_labeler.rb +0 -61

data/lib/ragnar/topic_modeling/engine.rb DELETED

@@ -1,301 +0,0 @@
-require 'json'
-module Ragnar
-  module TopicModeling
-    class Engine
-      attr_reader :topics, :clusterer, :term_extractor
-      def initialize(
-        min_cluster_size: 5,
-        min_samples: 3,
-        clustering_backend: nil,
-        reduce_dimensions: true,
-        n_components: 50,
-        labeling_method: :hybrid,
-        llm_client: nil,
-        verbose: false
-      )
-        @min_cluster_size = min_cluster_size
-        @min_samples = min_samples
-        @reduce_dimensions = reduce_dimensions
-        @n_components = n_components
-        @labeling_method = labeling_method
-        @verbose = verbose
-        @clusterer = clustering_backend || build_default_clusterer
-        @term_extractor = TermExtractor.new
-        @labeler = TopicLabeler.new(method: labeling_method, llm_client: llm_client)
-        @topics = []
-      end
-      def fit(embeddings:, documents:, metadata: nil)
-        raise ArgumentError, "Embeddings and documents must have same length" unless embeddings.length == documents.length
-        @embeddings = embeddings
-        @documents = documents
-        @metadata = metadata || Array.new(documents.length) { {} }
-        puts "Starting topic extraction..." if @verbose
-        # Step 1: Optionally reduce dimensions for better clustering
-        working_embeddings = @embeddings
-        if @reduce_dimensions && @embeddings.first.length > @n_components
-          puts "  Reducing dimensions from #{@embeddings.first.length} to #{@n_components}..." if @verbose
-          working_embeddings = reduce_dimensions(@embeddings)
-        end
-        # Step 2: Cluster embeddings
-        puts "  Clustering #{working_embeddings.length} documents..." if @verbose
-        cluster_ids = @clusterer.fit_predict(working_embeddings)
-        # Step 3: Build topics from clusters
-        puts "  Building topics..." if @verbose
-        @topics = build_topics(cluster_ids)
-        # Step 4: Extract terms for each topic
-        puts "  Extracting distinctive terms..." if @verbose
-        extract_topic_terms
-        # Step 5: Generate labels
-        puts "  Generating topic labels..." if @verbose
-        generate_topic_labels
-        puts "Found #{@topics.length} topics (plus #{count_outliers(cluster_ids)} outliers)" if @verbose
-        @topics
-      end
-      def transform(embeddings:, documents: nil)
-        # Assign new documents to existing topics
-        raise "Must call fit before transform" if @topics.empty?
-        # Use approximate prediction if available
-        if @clusterer.respond_to?(:approximate_predict)
-          @clusterer.approximate_predict(embeddings)
-        else
-          # Fallback: assign to nearest topic centroid
-          assign_to_nearest_topic(embeddings)
-        end
-      end
-      def get_topic(topic_id)
-        @topics.find { |t| t.id == topic_id }
-      end
-      def outliers
-        @outliers ||= @documents.each_with_index.select { |_, idx|
-          @cluster_ids && @cluster_ids[idx] == -1
-        }.map(&:first)
-      end
-      def save(path)
-        data = {
-          topics: @topics.map(&:to_h),
-          config: {
-            min_cluster_size: @min_cluster_size,
-            min_samples: @min_samples,
-            reduce_dimensions: @reduce_dimensions,
-            n_components: @n_components,
-            labeling_method: @labeling_method
-          }
-        }
-        File.write(path, JSON.pretty_generate(data))
-      end
-      def self.load(path)
-        data = JSON.parse(File.read(path), symbolize_names: true)
-        engine = new(**data[:config])
-        # Reconstruct topics
-        engine.instance_variable_set(:@topics, data[:topics].map { |t| Topic.from_h(t) })
-        engine
-      end
-      private
-      def build_default_clusterer
-        begin
-          require 'clusterkit'
-          ClusterKit::Clustering::HDBSCAN.new(
-            min_cluster_size: @min_cluster_size,
-            min_samples: @min_samples,
-            metric: 'euclidean'
-          )
-        rescue LoadError
-          raise "ClusterKit required for topic modeling. Add 'gem \"clusterkit\"' to your Gemfile."
-        end
-      end
-      def reduce_dimensions(embeddings)
-        require 'clusterkit'
-        # Validate embeddings before UMAP
-        valid_embeddings, invalid_indices = validate_embeddings_for_umap(embeddings)
-        if valid_embeddings.empty?
-          raise "No valid embeddings for dimensionality reduction.\n\n" \
-                "All embeddings contain invalid values (NaN, Infinity, or non-numeric).\n" \
-                "Try running without dimensionality reduction:\n" \
-                "  ragnar topics --reduce-dimensions false"
-        end
-        if invalid_indices.any? && @verbose
-          puts "  ⚠️  Warning: #{invalid_indices.size} embeddings with invalid values removed"
-        end
-        begin
-          # Adjust parameters based on data size
-          n_samples = valid_embeddings.size
-          n_components = [@n_components, n_samples - 1, 50].min
-          n_neighbors = [15, n_samples - 1].min
-          if @verbose && n_components != @n_components
-            puts "  Adjusted n_components to #{n_components} (was #{@n_components}) for #{n_samples} samples"
-          end
-          umap = ClusterKit::Dimensionality::UMAP.new(
-            n_components: n_components,
-            n_neighbors: n_neighbors,
-            random_seed: 42  # For reproducibility
-          )
-          # Convert to format UMAP expects
-          reduced = umap.fit_transform(valid_embeddings)
-          # If we had to remove invalid embeddings, reconstruct the full array
-          if invalid_indices.any?
-            full_reduced = []
-            valid_idx = 0
-            embeddings.size.times do |i|
-              if invalid_indices.include?(i)
-                # Use zeros for invalid embeddings (they'll be outliers anyway)
-                full_reduced << Array.new(n_components, 0.0)
-              else
-                full_reduced << reduced[valid_idx]
-                valid_idx += 1
-              end
-            end
-            full_reduced
-          else
-            reduced
-          end
-        rescue => e
-          if e.message.include?("index out of bounds")
-            error_msg = "\n❌ Dimensionality reduction failed\n\n"
-            error_msg += "The UMAP algorithm encountered an error with your data.\n\n"
-            error_msg += "This typically happens with:\n"
-            error_msg += "  • Embeddings containing invalid values\n"
-            error_msg += "  • Too few samples (#{valid_embeddings.size} valid embeddings)\n"
-            error_msg += "  • Incompatible parameters\n\n"
-            error_msg += "Solutions:\n"
-            error_msg += "  1. Run without dimensionality reduction:\n"
-            error_msg += "     ragnar topics --reduce-dimensions false\n\n"
-            error_msg += "  2. Use fewer dimensions:\n"
-            error_msg += "     ragnar topics --n-components 2\n\n"
-            error_msg += "  3. Re-index your documents:\n"
-            error_msg += "     ragnar index <path> --force\n"
-            raise error_msg
-          else
-            raise
-          end
-        end
-      rescue LoadError
-        puts "Warning: Dimensionality reduction requires ClusterKit. Using original embeddings." if @verbose
-        embeddings
-      end
-      private
-      def validate_embeddings_for_umap(embeddings)
-        valid = []
-        invalid_indices = []
-        embeddings.each_with_index do |embedding, idx|
-          if embedding.is_a?(Array) &&
-             embedding.all? { |v| v.is_a?(Numeric) && v.finite? }
-            valid << embedding
-          else
-            invalid_indices << idx
-          end
-        end
-        [valid, invalid_indices]
-      end
-      def build_topics(cluster_ids)
-        @cluster_ids = cluster_ids
-        # Group documents by cluster
-        clusters = {}
-        cluster_ids.each_with_index do |cluster_id, doc_idx|
-          next if cluster_id == -1  # Skip outliers
-          clusters[cluster_id] ||= []
-          clusters[cluster_id] << doc_idx
-        end
-        # Create Topic objects
-        clusters.map do |cluster_id, doc_indices|
-          Topic.new(
-            id: cluster_id,
-            document_indices: doc_indices,
-            documents: doc_indices.map { |i| @documents[i] },
-            embeddings: doc_indices.map { |i| @embeddings[i] },
-            metadata: doc_indices.map { |i| @metadata[i] }
-          )
-        end.sort_by(&:id)
-      end
-      def extract_topic_terms
-        # Extract distinctive terms for each topic
-        all_docs_text = @documents.join(" ")
-        @topics.each do |topic|
-          topic_docs_text = topic.documents.join(" ")
-          # Use c-TF-IDF to find distinctive terms
-          terms = @term_extractor.extract_distinctive_terms(
-            topic_docs: topic.documents,
-            all_docs: @documents,
-            top_n: 20
-          )
-          topic.set_terms(terms)
-        end
-      end
-      def generate_topic_labels
-        @topics.each do |topic|
-          result = @labeler.generate_label(
-            topic: topic,
-            terms: topic.terms,
-            documents: topic.documents.first(3)  # Use top 3 representative docs
-          )
-          # Set both label and description if available
-          topic.set_label(result[:label])
-          topic.instance_variable_set(:@description, result[:description]) if result[:description]
-          topic.instance_variable_set(:@label_confidence, result[:confidence])
-          topic.instance_variable_set(:@themes, result[:themes]) if result[:themes]
-        end
-      end
-      def count_outliers(cluster_ids)
-        cluster_ids.count { |id| id == -1 }
-      end
-      def assign_to_nearest_topic(embeddings)
-        # Simple nearest centroid assignment
-        topic_centroids = @topics.map(&:centroid)
-        embeddings.map do |embedding|
-          distances = topic_centroids.map do |centroid|
-            # Euclidean distance
-            Math.sqrt(embedding.zip(centroid).map { |a, b| (a - b) ** 2 }.sum)
-          end
-          min_idx = distances.index(distances.min)
-          @topics[min_idx].id
-        end
-      end
-    end
-  end
-end

data/lib/ragnar/topic_modeling/labeling_strategies.rb DELETED

@@ -1,300 +0,0 @@
-# Separate strategy classes for different labeling approaches
-module Ragnar
-  module TopicModeling
-    module LabelingStrategies
-      # Base strategy class
-      class Base
-        def generate_label(topic:, terms:, documents:)
-          raise NotImplementedError, "Subclasses must implement generate_label"
-        end
-        protected
-        def select_representative_docs(documents, k: 3)
-          return documents if documents.length <= k
-          # For now, just take first k
-          # Could be improved to select most central docs
-          documents.first(k)
-        end
-        def capitalize_phrase(phrase)
-          phrase.split(/[\s_-]/).map(&:capitalize).join(' ')
-        end
-      end
-      # Fast term-based labeling using c-TF-IDF terms
-      class TermBased < Base
-        def generate_label(topic:, terms:, documents:)
-          return { label: "Empty Topic", description: "No terms found" } if terms.empty?
-          # Take top distinctive terms
-          label_terms = terms.first(3).select { |t| t.length > 3 }
-          label = if label_terms.length >= 2
-            "#{capitalize_phrase(label_terms[0])} & #{capitalize_phrase(label_terms[1])}"
-          else
-            capitalize_phrase(label_terms.first || terms.first)
-          end
-          {
-            label: label,
-            description: "Documents about #{terms.first(5).join(', ')}",
-            method: :term_based,
-            confidence: calculate_confidence(terms)
-          }
-        end
-        private
-        def calculate_confidence(terms)
-          # Simple heuristic: more distinctive terms = higher confidence
-          return 0.0 if terms.empty?
-          # Assume terms come with scores if available
-          if terms.is_a?(Array) && terms.first.is_a?(Array)
-            # Terms are [word, score] pairs
-            avg_score = terms.first(5).map(&:last).sum / 5.0
-            [avg_score, 1.0].min
-          else
-            # Just have terms, use count as proxy
-            [terms.length / 20.0, 1.0].min
-          end
-        end
-      end
-      # Quality LLM-based labeling
-      class LLMBased < Base
-        def initialize(llm_client: nil)
-          @llm_client = llm_client
-        end
-        def generate_label(topic:, terms:, documents:)
-          unless llm_available?
-            # Fallback to term-based if LLM not available
-            return TermBased.new.generate_label(topic: topic, terms: terms, documents: documents)
-          end
-          # Select best documents to send to LLM
-          sample_docs = select_representative_docs(documents, k: 3)
-          # Generate comprehensive analysis
-          response = analyze_with_llm(sample_docs, terms)
-          {
-            label: response[:label],
-            description: response[:description],
-            themes: response[:themes],
-            method: :llm_based,
-            confidence: response[:confidence] || 0.8
-          }
-        rescue => e
-          # Fallback on error
-          puts "LLM labeling failed: #{e.message}" if ENV['DEBUG']
-          TermBased.new.generate_label(topic: topic, terms: terms, documents: documents)
-        end
-        private
-        def llm_available?
-          return true if @llm_client
-          # Try to create LLM adapter
-          begin
-            require_relative 'llm_adapter'
-            @llm_client = LLMAdapter.create(type: :auto)
-            @llm_client && @llm_client.available?
-          rescue LoadError, StandardError => e
-            puts "LLM not available: #{e.message}" if ENV['DEBUG']
-            false
-          end
-        end
-        def analyze_with_llm(documents, terms)
-          prompt = build_analysis_prompt(documents, terms)
-          response = @llm_client.generate(
-            prompt: prompt,
-            max_tokens: 150,
-            temperature: 0.3,
-            response_format: { type: "json_object" }
-          )
-          # Parse JSON response
-          result = JSON.parse(response, symbolize_names: true)
-          # Validate and clean
-          {
-            label: clean_label(result[:label]),
-            description: result[:description] || "Topic about #{result[:label]}",
-            themes: result[:themes] || [],
-            confidence: result[:confidence] || 0.8
-          }
-        end
-        def build_analysis_prompt(documents, terms)
-          doc_samples = documents.map.with_index do |doc, i|
-            preview = doc.length > 300 ? "#{doc[0..300]}..." : doc
-            "Document #{i + 1}:\n#{preview}"
-          end.join("\n\n")
-          <<~PROMPT
-            Analyze this cluster of related documents and provide a structured summary.
-            Distinctive terms found: #{terms.first(10).join(', ')}
-            Sample documents:
-            #{doc_samples}
-            Provide a JSON response with:
-            {
-              "label": "A 2-4 word topic label",
-              "description": "One sentence describing what connects these documents",
-              "themes": ["theme1", "theme2", "theme3"],
-              "confidence": 0.0-1.0 score of how coherent this topic is
-            }
-            Focus on what meaningfully connects these documents, not just common words.
-          PROMPT
-        end
-        def clean_label(label)
-          return "Unknown Topic" unless label
-          # Remove quotes, trim, limit length
-          cleaned = label.to_s.strip.gsub(/^["']|["']$/, '')
-          cleaned = cleaned.split("\n").first if cleaned.include?("\n")
-          # Limit to reasonable length
-          if cleaned.length > 50
-            cleaned[0..47] + "..."
-          else
-            cleaned
-          end
-        end
-      end
-      # Hybrid approach - uses terms to guide LLM for efficiency
-      class Hybrid < Base
-        def initialize(llm_client: nil)
-          @llm_client = llm_client
-          @term_strategy = TermBased.new
-        end
-        def generate_label(topic:, terms:, documents:)
-          # Start with term-based analysis
-          term_result = @term_strategy.generate_label(
-            topic: topic,
-            terms: terms,
-            documents: documents
-          )
-          # If no LLM available, return term-based result
-          unless llm_available?
-            return term_result.merge(method: :hybrid_fallback)
-          end
-          # Enhance with focused LLM call
-          enhanced = enhance_with_llm(term_result, terms, documents)
-          {
-            label: enhanced[:label] || term_result[:label],
-            description: enhanced[:description] || term_result[:description],
-            method: :hybrid,
-            confidence: (term_result[:confidence] + (enhanced[:confidence] || 0.5)) / 2,
-            term_label: term_result[:label],  # Keep original for comparison
-            themes: enhanced[:themes]
-          }
-        rescue => e
-          # Fallback to term-based
-          puts "Hybrid enhancement failed: #{e.message}" if ENV['DEBUG']
-          term_result.merge(method: :hybrid_fallback)
-        end
-        private
-        def llm_available?
-          return true if @llm_client
-          begin
-            require_relative 'llm_adapter'
-            @llm_client = LLMAdapter.create(type: :auto)
-            @llm_client && @llm_client.available?
-          rescue LoadError, StandardError => e
-            puts "LLM not available for hybrid: #{e.message}" if ENV['DEBUG']
-            false
-          end
-        end
-        def enhance_with_llm(term_result, terms, documents)
-          # Lighter-weight prompt using term analysis as starting point
-          prompt = build_enhancement_prompt(term_result[:label], terms, documents.first)
-          response = @llm_client.generate(
-            prompt: prompt,
-            max_tokens: 100,
-            temperature: 0.3
-          )
-          # Parse response (simpler format for speed)
-          parse_enhancement_response(response)
-        end
-        def build_enhancement_prompt(term_label, terms, sample_doc)
-          doc_preview = sample_doc.length > 200 ? "#{sample_doc[0..200]}..." : sample_doc
-          <<~PROMPT
-            Current topic label based on terms: "#{term_label}"
-            Key terms: #{terms.first(8).join(', ')}
-            Sample document:
-            #{doc_preview}
-            Provide a better topic label if possible (2-4 words), or confirm the current one.
-            Also provide a one-sentence description.
-            Format:
-            Label: [your label]
-            Description: [one sentence]
-            Themes: [comma-separated list]
-          PROMPT
-        end
-        def parse_enhancement_response(response)
-          result = {}
-          # Simple line-based parsing
-          response.lines.each do |line|
-            if line.start_with?("Label:")
-              result[:label] = line.sub("Label:", "").strip
-            elsif line.start_with?("Description:")
-              result[:description] = line.sub("Description:", "").strip
-            elsif line.start_with?("Themes:")
-              themes_str = line.sub("Themes:", "").strip
-              result[:themes] = themes_str.split(",").map(&:strip)
-            end
-          end
-          result[:confidence] = result[:label] ? 0.7 : 0.3
-          result
-        end
-      end
-      # Factory method to get appropriate strategy
-      def self.create(method, llm_client: nil)
-        case method.to_sym
-        when :fast, :term_based, :terms
-          TermBased.new
-        when :quality, :llm_based, :llm
-          LLMBased.new(llm_client: llm_client)
-        when :hybrid, :auto, :smart
-          Hybrid.new(llm_client: llm_client)
-        else
-          # Default to hybrid
-          Hybrid.new(llm_client: llm_client)
-        end
-      end
-    end
-  end
-end