RubyGems - topical - Versions diffs - 0.0.1.pre.1 → 0.1.1 - Mend

topical 0.0.1.pre.1 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

checksums.yaml +4 -4
data/README.md +159 -107
data/docs/assets/topical-wide.png +0 -0
data/examples/detect_new_topics.rb +190 -0
data/examples/quick_demo.rb +1 -1
data/examples/topic_summaries_with_llm.rb +128 -0
data/examples/verify_migration.rb +1 -1
data/lib/topical/clustering/adapter.rb +1 -1
data/lib/topical/clustering/hdbscan_adapter.rb +1 -1
data/lib/topical/clustering/kmeans_adapter.rb +1 -1
data/lib/topical/dimensionality_reducer.rb +96 -0
data/lib/topical/engine.rb +31 -126
data/lib/topical/extractors/term_extractor.rb +1 -1
data/lib/topical/labelers/base.rb +1 -1
data/lib/topical/labelers/term_based.rb +1 -1
data/lib/topical/metrics.rb +1 -1
data/lib/topical/model_serializer.rb +59 -0
data/lib/topical/topic.rb +1 -1
data/lib/topical/version.rb +1 -1
data/lib/topical.rb +6 -11
metadata +29 -13
data/lib/topical/labelers/hybrid.rb +0 -24
data/lib/topical/labelers/llm_adapter.rb +0 -126
data/lib/topical/labelers/llm_based.rb +0 -111

data/examples/topic_summaries_with_llm.rb ADDED Viewed

@@ -0,0 +1,128 @@
+#!/usr/bin/env ruby
+# Advanced example: Using Topical for clustering + red-candle for topic summaries
+require 'bundler/setup'
+require 'topical'
+require 'red-candle'
+puts "=== Advanced Topic Summaries Example ==="
+puts "Combining Topical clustering with red-candle LLM summarization"
+puts
+# Sample documents with clear topic clusters
+documents = [
+  # Finance/Economics
+  "The Federal Reserve raised interest rates to combat inflation pressures",
+  "Stock markets rallied on positive earnings reports from tech companies",
+  "Cryptocurrency markets experienced significant volatility this quarter",
+  "Central banks coordinate policy to address economic uncertainty",
+  "Corporate bond yields rise as investors seek safer assets",
+  # Technology/AI
+  "New AI breakthrough in natural language processing announced by researchers",
+  "Machine learning transforms healthcare diagnostics and treatment planning",
+  "Cloud computing adoption accelerates across enterprise sectors",
+  "Cybersecurity threats evolve with sophisticated ransomware attacks",
+  "Quantum computing reaches new milestone in error correction",
+  # Healthcare/Medical
+  "Clinical trials show promising results for new cancer immunotherapy",
+  "Telemedicine adoption continues to reshape patient care delivery",
+  "Gene editing techniques advance treatment for rare diseases",
+  "Mental health awareness campaigns gain momentum globally",
+  "Personalized medicine approaches show improved patient outcomes",
+  # Climate/Environment
+  "Renewable energy investments surpass fossil fuel spending globally",
+  "Climate scientists warn of accelerating Arctic ice melt",
+  "Carbon capture technology receives significant government funding",
+  "Electric vehicle adoption reaches record levels worldwide",
+  "Sustainable agriculture practices reduce environmental impact"
+]
+# Step 1: Generate embeddings using red-candle
+puts "1. Generating embeddings with red-candle..."
+embedder = Candle::EmbeddingModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+embeddings = documents.map { |doc| embedder.embedding(doc).first.to_a }
+# Step 2: Extract topics using Topical (term-based labeling only)
+puts "2. Extracting topics with Topical..."
+engine = Topical::Engine.new(
+  clustering_method: :hdbscan,
+  min_cluster_size: 4,
+  labeling_method: :term_based,
+  verbose: true
+)
+topics = engine.fit(embeddings: embeddings, documents: documents)
+# Step 3: Generate summaries using red-candle LLM
+puts "\n3. Generating topic summaries with LLM..."
+# Initialize LLM for summarization
+llm = Candle::LLM.from_pretrained(
+  "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
+  gguf_file: "tinyllama-1.1b-chat-v1.0.q4_k_m.gguf"
+)
+def summarize_topic(topic, llm)
+  # Get representative documents for context
+  sample_docs = topic.representative_docs(k: 3)
+  # Simple, clear prompt for summarization
+  prompt = <<~PROMPT
+    Summarize what connects these documents in 1-2 sentences:
+    Key terms: #{topic.terms.first(5).join(', ')}
+    Documents:
+    #{sample_docs.map.with_index { |doc, i| "#{i+1}. #{doc}" }.join("\n")}
+    Summary:
+  PROMPT
+  begin
+    summary = llm.generate(prompt).strip
+    # Clean up common artifacts
+    summary = summary.lines.first&.strip || "Related documents"
+    summary = summary.gsub(/^(Summary:|Topic:|Documents:)/i, '').strip
+    summary.empty? ? "Documents about #{topic.terms.first(2).join(' and ')}" : summary
+  rescue => e
+    "Documents about #{topic.terms.first(2).join(' and ')}"
+  end
+end
+# Step 4: Display results with summaries
+puts "\n=== Topics with LLM Summaries ==="
+topics.each_with_index do |topic, i|
+  puts "\n#{i + 1}. Topic: #{topic.label}"
+  # Generate summary using LLM
+  summary = summarize_topic(topic, llm)
+  puts "   Summary: #{summary}"
+  puts "   Size: #{topic.size} documents"
+  puts "   Key terms: #{topic.terms.first(8).join(', ')}"
+  puts "   Coherence: #{topic.coherence.round(3)}"
+  puts "   Sample documents:"
+  topic.representative_docs(k: 2).each do |doc|
+    puts "   • #{doc[0..80]}..."
+  end
+end
+# Step 5: Show outliers
+outliers = engine.outliers
+if outliers.any?
+  puts "\nOutliers (#{outliers.length} documents):"
+  outliers.each { |doc| puts "  • #{doc[0..60]}..." }
+end
+puts "\n=== Key Benefits of This Approach ==="
+puts "• Topical handles clustering expertly (fast, reliable)"
+puts "• Your application controls LLM integration completely"
+puts "• Domain-specific prompts for better summaries"
+puts "• Easy to swap LLM providers or models"
+puts "• Clean separation of concerns"
+puts "\nDone! 🎯"

data/examples/verify_migration.rb CHANGED Viewed

@@ -105,4 +105,4 @@ outliers = engine.outliers
 puts "  Outliers: #{outliers.length}"
 puts
-puts "=== All tests passed! Migration successful. ==="
+puts "=== All tests passed! Migration successful. ==="

data/lib/topical/clustering/adapter.rb CHANGED Viewed

@@ -27,4 +27,4 @@ module Topical
       end
     end
   end
-end
+end

data/lib/topical/clustering/hdbscan_adapter.rb CHANGED Viewed

@@ -51,4 +51,4 @@ module Topical
       end
     end
   end
-end
+end

data/lib/topical/clustering/kmeans_adapter.rb CHANGED Viewed

@@ -41,4 +41,4 @@ module Topical
       attr_reader :clusterer
     end
   end
-end
+end

data/lib/topical/dimensionality_reducer.rb ADDED Viewed

@@ -0,0 +1,96 @@
+# frozen_string_literal: true
+require 'logger'
+module Topical
+  # Handles dimensionality reduction for embeddings using UMAP
+  class DimensionalityReducer
+    def initialize(n_components: 50, logger: nil)
+      @n_components = n_components
+      @logger = logger || Logger.new(IO::NULL, level: Logger::FATAL)
+    end
+    # Reduce dimensionality of embeddings if needed
+    # @param embeddings [Array<Array<Float>>] Input embeddings
+    # @return [Array<Array<Float>>] Reduced embeddings
+    def reduce(embeddings)
+      return embeddings if embeddings.empty?
+      return embeddings if embeddings.first.length <= @n_components
+      begin
+        require 'clusterkit'
+        # Validate embeddings before UMAP
+        valid_embeddings, invalid_indices = validate_embeddings_for_umap(embeddings)
+        if valid_embeddings.empty?
+          raise "No valid embeddings for dimensionality reduction. " \
+                "All embeddings contain invalid values (NaN, Infinity, or non-numeric)."
+        end
+        if invalid_indices.any?
+          @logger.warn "  Warning: #{invalid_indices.size} embeddings with invalid values removed"
+        end
+        # Adjust parameters based on data size
+        n_samples = valid_embeddings.size
+        n_components = [@n_components, n_samples - 1, 50].min
+        n_neighbors = [15, n_samples - 1].min
+        if n_components != @n_components
+          @logger.info "  Adjusted n_components to #{n_components} (was #{@n_components}) for #{n_samples} samples"
+        end
+        umap = ClusterKit::Dimensionality::UMAP.new(
+          n_components: n_components,
+          n_neighbors: n_neighbors,
+          random_seed: 42
+        )
+        reduced = umap.fit_transform(valid_embeddings)
+        # If we had to remove invalid embeddings, reconstruct the full array
+        if invalid_indices.any?
+          full_reduced = []
+          valid_idx = 0
+          embeddings.size.times do |i|
+            if invalid_indices.include?(i)
+              # Use zeros for invalid embeddings (they'll be outliers anyway)
+              full_reduced << Array.new(n_components, 0.0)
+            else
+              full_reduced << reduced[valid_idx]
+              valid_idx += 1
+            end
+          end
+          full_reduced
+        else
+          reduced
+        end
+      rescue LoadError
+        @logger.warn "Warning: Dimensionality reduction requires ClusterKit. Using original embeddings."
+        embeddings
+      rescue => e
+        @logger.warn "Warning: Dimensionality reduction failed: #{e.message}"
+        embeddings
+      end
+    end
+    private
+    def validate_embeddings_for_umap(embeddings)
+      valid = []
+      invalid_indices = []
+      embeddings.each_with_index do |embedding, idx|
+        if embedding.is_a?(Array) &&
+           embedding.all? { |v| v.is_a?(Numeric) && v.finite? }
+          valid << embedding
+        else
+          invalid_indices << idx
+        end
+      end
+      [valid, invalid_indices]
+    end
+  end
+end

data/lib/topical/engine.rb CHANGED Viewed

@@ -1,5 +1,7 @@
 # frozen_string_literal: true
+require 'logger'
 module Topical
   # Main engine for topic modeling
   class Engine
@@ -11,9 +13,9 @@ module Topical
       min_samples: 3,
       reduce_dimensions: true,
       n_components: 50,
-      labeling_method: :hybrid,
-      llm_provider: nil,
+      labeling_method: :term_based,
       verbose: false,
+      logger: nil,
       k: nil,  # Add k as explicit parameter
       **options
     )
@@ -23,14 +25,18 @@ module Topical
       @reduce_dimensions = reduce_dimensions
       @n_components = n_components
       @labeling_method = labeling_method
-      @llm_provider = llm_provider
       @verbose = verbose
+      @logger = setup_logger(logger, verbose)
       @options = options
       @options[:k] = k if k  # Store k in options if provided
       @clustering_adapter = build_clustering_adapter
       @term_extractor = Extractors::TermExtractor.new
       @labeler = build_labeler
+      @dimensionality_reducer = DimensionalityReducer.new(
+        n_components: @n_components,
+        logger: @logger
+      )
       @topics = []
     end
@@ -46,34 +52,34 @@ module Topical
       @documents = documents
       @metadata = metadata || Array.new(documents.length) { {} }
-      puts "Starting topic extraction..." if @verbose
+      @logger.info "Starting topic extraction..."
       # Step 1: Optionally reduce dimensions
       working_embeddings = @embeddings
-      if @reduce_dimensions && @embeddings.first.length > @n_components
-        puts "  Reducing dimensions from #{@embeddings.first.length} to #{@n_components}..." if @verbose
-        working_embeddings = reduce_dimensions(@embeddings)
+      if @reduce_dimensions && !@embeddings.empty? && @embeddings.first.length > @n_components
+        @logger.info "  Reducing dimensions from #{@embeddings.first.length} to #{@n_components}..."
+        working_embeddings = @dimensionality_reducer.reduce(@embeddings)
       end
       # Step 2: Cluster embeddings
-      puts "  Clustering #{working_embeddings.length} documents..." if @verbose
+      @logger.info "  Clustering #{working_embeddings.length} documents..."
       @cluster_ids = @clustering_adapter.fit_predict(working_embeddings)
       # Step 3: Build topics from clusters
-      puts "  Building topics from clusters..." if @verbose
+      @logger.info "  Building topics from clusters..."
       @topics = build_topics(@cluster_ids)
       # Step 4: Extract terms for each topic
-      puts "  Extracting distinctive terms..." if @verbose
+      @logger.info "  Extracting distinctive terms..."
       extract_topic_terms
       # Step 5: Generate labels
-      puts "  Generating topic labels..." if @verbose
+      @logger.info "  Generating topic labels..."
       generate_topic_labels
       if @verbose
         n_noise = @cluster_ids.count(-1)
-        puts "Found #{@topics.length} topics (plus #{n_noise} outliers)"
+        @logger.info "Found #{@topics.length} topics (plus #{n_noise} outliers)"
       end
       @topics
@@ -105,124 +111,27 @@ module Topical
     # Save the model
     def save(path)
-      require 'json'
-      config = {
-        clustering_method: @clustering_method,
-        min_cluster_size: @min_cluster_size,
-        min_samples: @min_samples,
-        reduce_dimensions: @reduce_dimensions,
-        n_components: @n_components,
-        labeling_method: @labeling_method
-      }
-      # Include k for kmeans
-      if @clustering_method == :kmeans
-        config[:k] = @options[:k] || @topics.length
-      end
-      data = {
-        topics: @topics.map(&:to_h),
-        config: config
-      }
-      File.write(path, JSON.pretty_generate(data))
+      ModelSerializer.save(self, path)
     end
     # Load a model
     def self.load(path)
-      require 'json'
-      data = JSON.parse(File.read(path), symbolize_names: true)
-      # Make sure k is passed for kmeans and convert string keys to symbols
-      config = data[:config]
-      config[:clustering_method] = config[:clustering_method].to_sym if config[:clustering_method]
-      config[:labeling_method] = config[:labeling_method].to_sym if config[:labeling_method]
-      if config[:clustering_method] == :kmeans && !config[:k]
-        # Extract k from saved topics or use default
-        config[:k] = data[:topics]&.length || 5
-      end
-      engine = new(**config)
-      # Reconstruct topics
-      engine.instance_variable_set(:@topics, data[:topics].map { |t| Topic.from_h(t) })
-      engine
+      ModelSerializer.load(path)
     end
     private
-    def reduce_dimensions(embeddings)
-      begin
-        require 'clusterkit'
-        # Validate embeddings before UMAP
-        valid_embeddings, invalid_indices = validate_embeddings_for_umap(embeddings)
-        if valid_embeddings.empty?
-          raise "No valid embeddings for dimensionality reduction. " \
-                "All embeddings contain invalid values (NaN, Infinity, or non-numeric)."
-        end
-        if invalid_indices.any? && @verbose
-          puts "  Warning: #{invalid_indices.size} embeddings with invalid values removed"
-        end
-        # Adjust parameters based on data size
-        n_samples = valid_embeddings.size
-        n_components = [@n_components, n_samples - 1, 50].min
-        n_neighbors = [15, n_samples - 1].min
-        if @verbose && n_components != @n_components
-          puts "  Adjusted n_components to #{n_components} (was #{@n_components}) for #{n_samples} samples"
-        end
-        umap = ClusterKit::Dimensionality::UMAP.new(
-          n_components: n_components,
-          n_neighbors: n_neighbors,
-          random_seed: 42
-        )
-        reduced = umap.fit_transform(valid_embeddings)
-        # If we had to remove invalid embeddings, reconstruct the full array
-        if invalid_indices.any?
-          full_reduced = []
-          valid_idx = 0
-          embeddings.size.times do |i|
-            if invalid_indices.include?(i)
-              # Use zeros for invalid embeddings (they'll be outliers anyway)
-              full_reduced << Array.new(n_components, 0.0)
-            else
-              full_reduced << reduced[valid_idx]
-              valid_idx += 1
-            end
-          end
-          full_reduced
-        else
-          reduced
-        end
-      rescue LoadError
-        puts "Warning: Dimensionality reduction requires ClusterKit. Using original embeddings." if @verbose
-        embeddings
-      rescue => e
-        puts "Warning: Dimensionality reduction failed: #{e.message}" if @verbose
-        embeddings
-      end
-    end
-    def validate_embeddings_for_umap(embeddings)
-      valid = []
-      invalid_indices = []
+    def setup_logger(logger, verbose)
+      return logger if logger
-      embeddings.each_with_index do |embedding, idx|
-        if embedding.is_a?(Array) &&
-           embedding.all? { |v| v.is_a?(Numeric) && v.finite? }
-          valid << embedding
-        else
-          invalid_indices << idx
-        end
+      # Create default logger for backward compatibility
+      if verbose
+        require 'logger'
+        Logger.new($stdout, level: Logger::INFO)
+      else
+        # Null logger - doesn't output anything
+        Logger.new(IO::NULL, level: Logger::FATAL)
       end
-      [valid, invalid_indices]
     end
     def build_topics(cluster_ids)
@@ -283,12 +192,8 @@ module Topical
       case @labeling_method
       when :term_based
         Labelers::TermBased.new
-      when :llm_based
-        Labelers::LLMBased.new(provider: @llm_provider)
-      when :hybrid
-        Labelers::Hybrid.new(provider: @llm_provider)
       else
-        Labelers::TermBased.new  # Default fallback
+        Labelers::TermBased.new  # Only term-based labeling supported
       end
     end
@@ -307,4 +212,4 @@ module Topical
       end
     end
   end
-end
+end

data/lib/topical/extractors/term_extractor.rb CHANGED Viewed

@@ -95,4 +95,4 @@ module Topical
       end
     end
   end
-end
+end

data/lib/topical/labelers/base.rb CHANGED Viewed

@@ -20,4 +20,4 @@ module Topical
       end
     end
   end
-end
+end

data/lib/topical/labelers/term_based.rb CHANGED Viewed

@@ -19,4 +19,4 @@ module Topical
       end
     end
   end
-end
+end

data/lib/topical/metrics.rb CHANGED Viewed

@@ -185,4 +185,4 @@ module Topical
       )
     end
   end
-end
+end

data/lib/topical/model_serializer.rb ADDED Viewed

@@ -0,0 +1,59 @@
+# frozen_string_literal: true
+module Topical
+  # Handles saving and loading of topic models
+  class ModelSerializer
+    # Save a topic model to JSON file
+    # @param engine [Engine] The engine instance to save
+    # @param path [String] File path to save to
+    def self.save(engine, path)
+      require 'json'
+      config = {
+        clustering_method: engine.instance_variable_get(:@clustering_method),
+        min_cluster_size: engine.instance_variable_get(:@min_cluster_size),
+        min_samples: engine.instance_variable_get(:@min_samples),
+        reduce_dimensions: engine.instance_variable_get(:@reduce_dimensions),
+        n_components: engine.instance_variable_get(:@n_components),
+        labeling_method: engine.instance_variable_get(:@labeling_method)
+      }
+      # Include k for kmeans
+      options = engine.instance_variable_get(:@options)
+      if config[:clustering_method] == :kmeans
+        config[:k] = options[:k] || engine.topics.length
+      end
+      data = {
+        topics: engine.topics.map(&:to_h),
+        config: config
+      }
+      File.write(path, JSON.pretty_generate(data))
+    end
+    # Load a topic model from JSON file
+    # @param path [String] File path to load from
+    # @return [Engine] Loaded engine instance
+    def self.load(path)
+      require 'json'
+      data = JSON.parse(File.read(path), symbolize_names: true)
+      # Make sure k is passed for kmeans and convert string keys to symbols
+      config = data[:config]
+      config[:clustering_method] = config[:clustering_method].to_sym if config[:clustering_method]
+      config[:labeling_method] = config[:labeling_method].to_sym if config[:labeling_method]
+      if config[:clustering_method] == :kmeans && !config[:k]
+        # Extract k from saved topics or use default
+        config[:k] = data[:topics]&.length || 5
+      end
+      engine = Engine.new(**config)
+      # Reconstruct topics
+      engine.instance_variable_set(:@topics, data[:topics].map { |t| Topic.from_h(t) })
+      engine
+    end
+  end
+end

data/lib/topical/topic.rb CHANGED Viewed

@@ -111,4 +111,4 @@ module Topical
       )
     end
   end
-end
+end

data/lib/topical/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Topical
-  VERSION = "0.0.1.pre.1"
+  VERSION = "0.1.1"
 end

data/lib/topical.rb CHANGED Viewed

@@ -10,6 +10,8 @@ module Topical
   autoload :Engine, "topical/engine"
   autoload :Topic, "topical/topic"
   autoload :Metrics, "topical/metrics"
+  autoload :DimensionalityReducer, "topical/dimensionality_reducer"
+  autoload :ModelSerializer, "topical/model_serializer"
   module Clustering
     autoload :Adapter, "topical/clustering/adapter"
@@ -17,20 +19,13 @@ module Topical
     autoload :KMeansAdapter, "topical/clustering/kmeans_adapter"
   end
-  module Dimensionality
-    autoload :Reducer, "topical/dimensionality/reducer"
-  end
   module Extractors
     autoload :TermExtractor, "topical/extractors/term_extractor"
-    autoload :Stopwords, "topical/extractors/stopwords"
   end
   module Labelers
     autoload :Base, "topical/labelers/base"
     autoload :TermBased, "topical/labelers/term_based"
-    autoload :LLMBased, "topical/labelers/llm_based"
-    autoload :Hybrid, "topical/labelers/hybrid"
   end
   # Convenience method for simple topic extraction
@@ -43,13 +38,13 @@ module Topical
     engine.fit(embeddings: embeddings, documents: documents)
   end
-  # Check if red-candle is available for enhanced features
-  def self.llm_available?
-    @llm_available ||= begin
+  # Check if red-candle is available for embedding generation in examples
+  def self.embedding_model_available?
+    @embedding_model_available ||= begin
       require 'red-candle'
       true
     rescue LoadError
       false
     end
   end
-end
+end