RubyGems - ragnar-cli - Versions diffs - 0.1.0.pre.2 → 0.1.0.pre.4 - Mend

ragnar-cli 0.1.0.pre.2 → 0.1.0.pre.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

checksums.yaml +4 -4
data/README.md +187 -36
data/lib/ragnar/cli.rb +543 -172
data/lib/ragnar/cli_visualization.rb +184 -0
data/lib/ragnar/config.rb +226 -0
data/lib/ragnar/database.rb +94 -8
data/lib/ragnar/llm_manager.rb +4 -1
data/lib/ragnar/query_processor.rb +38 -20
data/lib/ragnar/topic_modeling.rb +13 -10
data/lib/ragnar/umap_processor.rb +190 -73
data/lib/ragnar/umap_transform_service.rb +169 -88
data/lib/ragnar/version.rb +1 -1
metadata +43 -22
data/lib/ragnar/topic_modeling/engine.rb +0 -221
data/lib/ragnar/topic_modeling/labeling_strategies.rb +0 -300
data/lib/ragnar/topic_modeling/llm_adapter.rb +0 -131
data/lib/ragnar/topic_modeling/metrics.rb +0 -186
data/lib/ragnar/topic_modeling/term_extractor.rb +0 -170
data/lib/ragnar/topic_modeling/topic.rb +0 -117
data/lib/ragnar/topic_modeling/topic_labeler.rb +0 -61

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: ragnar-cli
 version: !ruby/object:Gem::Version
-  version: 0.1.0.pre.2
+  version: 0.1.0.pre.4
 platform: ruby
 authors:
 - Chris Petersen
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2025-08-23 00:00:00.000000000 Z
+date: 2025-09-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: thor
@@ -30,14 +30,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.2'
+        version: 1.2.3
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.2'
+        version: 1.2.3
 - !ruby/object:Gem::Dependency
   name: lancelot
   requirement: !ruby/object:Gem::Requirement
@@ -47,7 +47,7 @@ dependencies:
         version: '0.3'
     - - ">="
       - !ruby/object:Gem::Version
-        version: 0.3.2
+        version: 0.3.3
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -57,21 +57,27 @@ dependencies:
         version: '0.3'
     - - ">="
       - !ruby/object:Gem::Version
-        version: 0.3.2
+        version: 0.3.3
 - !ruby/object:Gem::Dependency
-  name: clusterkit
+  name: topical
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.1.0.pre.2
+        version: 0.1.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.1.1
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.1.0.pre.2
+        version: 0.1.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.1.1
 - !ruby/object:Gem::Dependency
   name: baran
   requirement: !ruby/object:Gem::Requirement
@@ -92,14 +98,20 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.1.0.pre.1
+        version: '0.1'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.1.2
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.1.0.pre.1
+        version: '0.1'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.1.2
 - !ruby/object:Gem::Dependency
   name: tty-progressbar
   requirement: !ruby/object:Gem::Requirement
@@ -114,6 +126,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '0.18'
+- !ruby/object:Gem::Dependency
+  name: thor-interactive
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.1.0.pre.3
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.1.0.pre.3
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
@@ -186,6 +212,8 @@ files:
 - lib/ragnar.rb
 - lib/ragnar/chunker.rb
 - lib/ragnar/cli.rb
+- lib/ragnar/cli_visualization.rb
+- lib/ragnar/config.rb
 - lib/ragnar/context_repacker.rb
 - lib/ragnar/database.rb
 - lib/ragnar/embedder.rb
@@ -194,24 +222,17 @@ files:
 - lib/ragnar/query_processor.rb
 - lib/ragnar/query_rewriter.rb
 - lib/ragnar/topic_modeling.rb
-- lib/ragnar/topic_modeling/engine.rb
-- lib/ragnar/topic_modeling/labeling_strategies.rb
-- lib/ragnar/topic_modeling/llm_adapter.rb
-- lib/ragnar/topic_modeling/metrics.rb
-- lib/ragnar/topic_modeling/term_extractor.rb
-- lib/ragnar/topic_modeling/topic.rb
-- lib/ragnar/topic_modeling/topic_labeler.rb
 - lib/ragnar/umap_processor.rb
 - lib/ragnar/umap_transform_service.rb
 - lib/ragnar/version.rb
 - lib/ragnar_cli.rb
-homepage: https://github.com/cpetersen/ragnar
+homepage: https://github.com/scientist-labs/ragnar
 licenses:
 - MIT
 metadata:
-  homepage_uri: https://github.com/cpetersen/ragnar
-  source_code_uri: https://github.com/cpetersen/ragnar
-  changelog_uri: https://github.com/cpetersen/ragnar/blob/main/CHANGELOG.md
+  homepage_uri: https://github.com/scientist-labs/ragnar
+  source_code_uri: https://github.com/scientist-labs/ragnar
+  changelog_uri: https://github.com/scientist-labs/ragnar/blob/main/CHANGELOG.md
 post_install_message:
 rdoc_options: []
 require_paths:

data/lib/ragnar/topic_modeling/engine.rb DELETED Viewed

@@ -1,221 +0,0 @@
-require 'json'
-module Ragnar
-  module TopicModeling
-    class Engine
-      attr_reader :topics, :clusterer, :term_extractor
-      def initialize(
-        min_cluster_size: 5,
-        min_samples: 3,
-        clustering_backend: nil,
-        reduce_dimensions: true,
-        n_components: 50,
-        labeling_method: :hybrid,
-        llm_client: nil,
-        verbose: false
-      )
-        @min_cluster_size = min_cluster_size
-        @min_samples = min_samples
-        @reduce_dimensions = reduce_dimensions
-        @n_components = n_components
-        @labeling_method = labeling_method
-        @verbose = verbose
-        @clusterer = clustering_backend || build_default_clusterer
-        @term_extractor = TermExtractor.new
-        @labeler = TopicLabeler.new(method: labeling_method, llm_client: llm_client)
-        @topics = []
-      end
-      def fit(embeddings:, documents:, metadata: nil)
-        raise ArgumentError, "Embeddings and documents must have same length" unless embeddings.length == documents.length
-        @embeddings = embeddings
-        @documents = documents
-        @metadata = metadata || Array.new(documents.length) { {} }
-        puts "Starting topic extraction..." if @verbose
-        # Step 1: Optionally reduce dimensions for better clustering
-        working_embeddings = @embeddings
-        if @reduce_dimensions && @embeddings.first.length > @n_components
-          puts "  Reducing dimensions from #{@embeddings.first.length} to #{@n_components}..." if @verbose
-          working_embeddings = reduce_dimensions(@embeddings)
-        end
-        # Step 2: Cluster embeddings
-        puts "  Clustering #{working_embeddings.length} documents..." if @verbose
-        cluster_ids = @clusterer.fit_predict(working_embeddings)
-        # Step 3: Build topics from clusters
-        puts "  Building topics..." if @verbose
-        @topics = build_topics(cluster_ids)
-        # Step 4: Extract terms for each topic
-        puts "  Extracting distinctive terms..." if @verbose
-        extract_topic_terms
-        # Step 5: Generate labels
-        puts "  Generating topic labels..." if @verbose
-        generate_topic_labels
-        puts "Found #{@topics.length} topics (plus #{count_outliers(cluster_ids)} outliers)" if @verbose
-        @topics
-      end
-      def transform(embeddings:, documents: nil)
-        # Assign new documents to existing topics
-        raise "Must call fit before transform" if @topics.empty?
-        # Use approximate prediction if available
-        if @clusterer.respond_to?(:approximate_predict)
-          @clusterer.approximate_predict(embeddings)
-        else
-          # Fallback: assign to nearest topic centroid
-          assign_to_nearest_topic(embeddings)
-        end
-      end
-      def get_topic(topic_id)
-        @topics.find { |t| t.id == topic_id }
-      end
-      def outliers
-        @outliers ||= @documents.each_with_index.select { |_, idx|
-          @cluster_ids && @cluster_ids[idx] == -1
-        }.map(&:first)
-      end
-      def save(path)
-        data = {
-          topics: @topics.map(&:to_h),
-          config: {
-            min_cluster_size: @min_cluster_size,
-            min_samples: @min_samples,
-            reduce_dimensions: @reduce_dimensions,
-            n_components: @n_components,
-            labeling_method: @labeling_method
-          }
-        }
-        File.write(path, JSON.pretty_generate(data))
-      end
-      def self.load(path)
-        data = JSON.parse(File.read(path), symbolize_names: true)
-        engine = new(**data[:config])
-        # Reconstruct topics
-        engine.instance_variable_set(:@topics, data[:topics].map { |t| Topic.from_h(t) })
-        engine
-      end
-      private
-      def build_default_clusterer
-        begin
-          require 'clusterkit'
-          ClusterKit::Clustering::HDBSCAN.new(
-            min_cluster_size: @min_cluster_size,
-            min_samples: @min_samples,
-            metric: 'euclidean'
-          )
-        rescue LoadError
-          raise "ClusterKit required for topic modeling. Add 'gem \"clusterkit\"' to your Gemfile."
-        end
-      end
-      def reduce_dimensions(embeddings)
-        require 'clusterkit'
-        umap = ClusterKit::Dimensionality::UMAP.new(
-          n_components: @n_components,
-          n_neighbors: 15,
-          random_seed: 42  # For reproducibility
-        )
-        # Convert to format UMAP expects
-        umap.fit_transform(embeddings)
-      rescue LoadError
-        puts "Warning: Dimensionality reduction requires ClusterKit. Using original embeddings." if @verbose
-        embeddings
-      end
-      def build_topics(cluster_ids)
-        @cluster_ids = cluster_ids
-        # Group documents by cluster
-        clusters = {}
-        cluster_ids.each_with_index do |cluster_id, doc_idx|
-          next if cluster_id == -1  # Skip outliers
-          clusters[cluster_id] ||= []
-          clusters[cluster_id] << doc_idx
-        end
-        # Create Topic objects
-        clusters.map do |cluster_id, doc_indices|
-          Topic.new(
-            id: cluster_id,
-            document_indices: doc_indices,
-            documents: doc_indices.map { |i| @documents[i] },
-            embeddings: doc_indices.map { |i| @embeddings[i] },
-            metadata: doc_indices.map { |i| @metadata[i] }
-          )
-        end.sort_by(&:id)
-      end
-      def extract_topic_terms
-        # Extract distinctive terms for each topic
-        all_docs_text = @documents.join(" ")
-        @topics.each do |topic|
-          topic_docs_text = topic.documents.join(" ")
-          # Use c-TF-IDF to find distinctive terms
-          terms = @term_extractor.extract_distinctive_terms(
-            topic_docs: topic.documents,
-            all_docs: @documents,
-            top_n: 20
-          )
-          topic.set_terms(terms)
-        end
-      end
-      def generate_topic_labels
-        @topics.each do |topic|
-          result = @labeler.generate_label(
-            topic: topic,
-            terms: topic.terms,
-            documents: topic.documents.first(3)  # Use top 3 representative docs
-          )
-          # Set both label and description if available
-          topic.set_label(result[:label])
-          topic.instance_variable_set(:@description, result[:description]) if result[:description]
-          topic.instance_variable_set(:@label_confidence, result[:confidence])
-          topic.instance_variable_set(:@themes, result[:themes]) if result[:themes]
-        end
-      end
-      def count_outliers(cluster_ids)
-        cluster_ids.count { |id| id == -1 }
-      end
-      def assign_to_nearest_topic(embeddings)
-        # Simple nearest centroid assignment
-        topic_centroids = @topics.map(&:centroid)
-        embeddings.map do |embedding|
-          distances = topic_centroids.map do |centroid|
-            # Euclidean distance
-            Math.sqrt(embedding.zip(centroid).map { |a, b| (a - b) ** 2 }.sum)
-          end
-          min_idx = distances.index(distances.min)
-          @topics[min_idx].id
-        end
-      end
-    end
-  end
-end

data/lib/ragnar/topic_modeling/labeling_strategies.rb DELETED Viewed

@@ -1,300 +0,0 @@
-# Separate strategy classes for different labeling approaches
-module Ragnar
-  module TopicModeling
-    module LabelingStrategies
-      # Base strategy class
-      class Base
-        def generate_label(topic:, terms:, documents:)
-          raise NotImplementedError, "Subclasses must implement generate_label"
-        end
-        protected
-        def select_representative_docs(documents, k: 3)
-          return documents if documents.length <= k
-          # For now, just take first k
-          # Could be improved to select most central docs
-          documents.first(k)
-        end
-        def capitalize_phrase(phrase)
-          phrase.split(/[\s_-]/).map(&:capitalize).join(' ')
-        end
-      end
-      # Fast term-based labeling using c-TF-IDF terms
-      class TermBased < Base
-        def generate_label(topic:, terms:, documents:)
-          return { label: "Empty Topic", description: "No terms found" } if terms.empty?
-          # Take top distinctive terms
-          label_terms = terms.first(3).select { |t| t.length > 3 }
-          label = if label_terms.length >= 2
-            "#{capitalize_phrase(label_terms[0])} & #{capitalize_phrase(label_terms[1])}"
-          else
-            capitalize_phrase(label_terms.first || terms.first)
-          end
-          {
-            label: label,
-            description: "Documents about #{terms.first(5).join(', ')}",
-            method: :term_based,
-            confidence: calculate_confidence(terms)
-          }
-        end
-        private
-        def calculate_confidence(terms)
-          # Simple heuristic: more distinctive terms = higher confidence
-          return 0.0 if terms.empty?
-          # Assume terms come with scores if available
-          if terms.is_a?(Array) && terms.first.is_a?(Array)
-            # Terms are [word, score] pairs
-            avg_score = terms.first(5).map(&:last).sum / 5.0
-            [avg_score, 1.0].min
-          else
-            # Just have terms, use count as proxy
-            [terms.length / 20.0, 1.0].min
-          end
-        end
-      end
-      # Quality LLM-based labeling
-      class LLMBased < Base
-        def initialize(llm_client: nil)
-          @llm_client = llm_client
-        end
-        def generate_label(topic:, terms:, documents:)
-          unless llm_available?
-            # Fallback to term-based if LLM not available
-            return TermBased.new.generate_label(topic: topic, terms: terms, documents: documents)
-          end
-          # Select best documents to send to LLM
-          sample_docs = select_representative_docs(documents, k: 3)
-          # Generate comprehensive analysis
-          response = analyze_with_llm(sample_docs, terms)
-          {
-            label: response[:label],
-            description: response[:description],
-            themes: response[:themes],
-            method: :llm_based,
-            confidence: response[:confidence] || 0.8
-          }
-        rescue => e
-          # Fallback on error
-          puts "LLM labeling failed: #{e.message}" if ENV['DEBUG']
-          TermBased.new.generate_label(topic: topic, terms: terms, documents: documents)
-        end
-        private
-        def llm_available?
-          return true if @llm_client
-          # Try to create LLM adapter
-          begin
-            require_relative 'llm_adapter'
-            @llm_client = LLMAdapter.create(type: :auto)
-            @llm_client && @llm_client.available?
-          rescue LoadError, StandardError => e
-            puts "LLM not available: #{e.message}" if ENV['DEBUG']
-            false
-          end
-        end
-        def analyze_with_llm(documents, terms)
-          prompt = build_analysis_prompt(documents, terms)
-          response = @llm_client.generate(
-            prompt: prompt,
-            max_tokens: 150,
-            temperature: 0.3,
-            response_format: { type: "json_object" }
-          )
-          # Parse JSON response
-          result = JSON.parse(response, symbolize_names: true)
-          # Validate and clean
-          {
-            label: clean_label(result[:label]),
-            description: result[:description] || "Topic about #{result[:label]}",
-            themes: result[:themes] || [],
-            confidence: result[:confidence] || 0.8
-          }
-        end
-        def build_analysis_prompt(documents, terms)
-          doc_samples = documents.map.with_index do |doc, i|
-            preview = doc.length > 300 ? "#{doc[0..300]}..." : doc
-            "Document #{i + 1}:\n#{preview}"
-          end.join("\n\n")
-          <<~PROMPT
-            Analyze this cluster of related documents and provide a structured summary.
-            Distinctive terms found: #{terms.first(10).join(', ')}
-            Sample documents:
-            #{doc_samples}
-            Provide a JSON response with:
-            {
-              "label": "A 2-4 word topic label",
-              "description": "One sentence describing what connects these documents",
-              "themes": ["theme1", "theme2", "theme3"],
-              "confidence": 0.0-1.0 score of how coherent this topic is
-            }
-            Focus on what meaningfully connects these documents, not just common words.
-          PROMPT
-        end
-        def clean_label(label)
-          return "Unknown Topic" unless label
-          # Remove quotes, trim, limit length
-          cleaned = label.to_s.strip.gsub(/^["']|["']$/, '')
-          cleaned = cleaned.split("\n").first if cleaned.include?("\n")
-          # Limit to reasonable length
-          if cleaned.length > 50
-            cleaned[0..47] + "..."
-          else
-            cleaned
-          end
-        end
-      end
-      # Hybrid approach - uses terms to guide LLM for efficiency
-      class Hybrid < Base
-        def initialize(llm_client: nil)
-          @llm_client = llm_client
-          @term_strategy = TermBased.new
-        end
-        def generate_label(topic:, terms:, documents:)
-          # Start with term-based analysis
-          term_result = @term_strategy.generate_label(
-            topic: topic,
-            terms: terms,
-            documents: documents
-          )
-          # If no LLM available, return term-based result
-          unless llm_available?
-            return term_result.merge(method: :hybrid_fallback)
-          end
-          # Enhance with focused LLM call
-          enhanced = enhance_with_llm(term_result, terms, documents)
-          {
-            label: enhanced[:label] || term_result[:label],
-            description: enhanced[:description] || term_result[:description],
-            method: :hybrid,
-            confidence: (term_result[:confidence] + (enhanced[:confidence] || 0.5)) / 2,
-            term_label: term_result[:label],  # Keep original for comparison
-            themes: enhanced[:themes]
-          }
-        rescue => e
-          # Fallback to term-based
-          puts "Hybrid enhancement failed: #{e.message}" if ENV['DEBUG']
-          term_result.merge(method: :hybrid_fallback)
-        end
-        private
-        def llm_available?
-          return true if @llm_client
-          begin
-            require_relative 'llm_adapter'
-            @llm_client = LLMAdapter.create(type: :auto)
-            @llm_client && @llm_client.available?
-          rescue LoadError, StandardError => e
-            puts "LLM not available for hybrid: #{e.message}" if ENV['DEBUG']
-            false
-          end
-        end
-        def enhance_with_llm(term_result, terms, documents)
-          # Lighter-weight prompt using term analysis as starting point
-          prompt = build_enhancement_prompt(term_result[:label], terms, documents.first)
-          response = @llm_client.generate(
-            prompt: prompt,
-            max_tokens: 100,
-            temperature: 0.3
-          )
-          # Parse response (simpler format for speed)
-          parse_enhancement_response(response)
-        end
-        def build_enhancement_prompt(term_label, terms, sample_doc)
-          doc_preview = sample_doc.length > 200 ? "#{sample_doc[0..200]}..." : sample_doc
-          <<~PROMPT
-            Current topic label based on terms: "#{term_label}"
-            Key terms: #{terms.first(8).join(', ')}
-            Sample document:
-            #{doc_preview}
-            Provide a better topic label if possible (2-4 words), or confirm the current one.
-            Also provide a one-sentence description.
-            Format:
-            Label: [your label]
-            Description: [one sentence]
-            Themes: [comma-separated list]
-          PROMPT
-        end
-        def parse_enhancement_response(response)
-          result = {}
-          # Simple line-based parsing
-          response.lines.each do |line|
-            if line.start_with?("Label:")
-              result[:label] = line.sub("Label:", "").strip
-            elsif line.start_with?("Description:")
-              result[:description] = line.sub("Description:", "").strip
-            elsif line.start_with?("Themes:")
-              themes_str = line.sub("Themes:", "").strip
-              result[:themes] = themes_str.split(",").map(&:strip)
-            end
-          end
-          result[:confidence] = result[:label] ? 0.7 : 0.3
-          result
-        end
-      end
-      # Factory method to get appropriate strategy
-      def self.create(method, llm_client: nil)
-        case method.to_sym
-        when :fast, :term_based, :terms
-          TermBased.new
-        when :quality, :llm_based, :llm
-          LLMBased.new(llm_client: llm_client)
-        when :hybrid, :auto, :smart
-          Hybrid.new(llm_client: llm_client)
-        else
-          # Default to hybrid
-          Hybrid.new(llm_client: llm_client)
-        end
-      end
-    end
-  end
-end