topical 0.0.1.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,54 @@
+# frozen_string_literal: true
+
+require 'clusterkit'
+
+module Topical
+  module Clustering
+    # Adapter for ClusterKit's HDBSCAN implementation
+    class HDBSCANAdapter < Adapter
+      def initialize(min_cluster_size: 5, min_samples: 3, metric: 'euclidean')
+        @min_cluster_size = min_cluster_size
+        @min_samples = min_samples
+        @metric = metric
+
+        @clusterer = ClusterKit::Clustering::HDBSCAN.new(
+          min_cluster_size: min_cluster_size,
+          min_samples: min_samples,
+          metric: metric
+        )
+      end
+
+      def fit_predict(embeddings)
+        labels = @clusterer.fit_predict(embeddings)
+        update_stats(labels)
+        labels
+      end
+
+      def fit(embeddings)
+        @clusterer.fit(embeddings)
+        self
+      end
+
+      def predict(embeddings)
+        # HDBSCAN doesn't have a separate predict method
+        # For new points, we'd need to use approximate prediction
+        if @clusterer.respond_to?(:approximate_predict)
+          @clusterer.approximate_predict(embeddings)
+        else
+          raise NotImplementedError, "HDBSCAN does not support prediction on new data"
+        end
+      end
+
+      # Access to the underlying ClusterKit object if needed
+      attr_reader :clusterer
+
+      private
+
+      def update_stats(labels)
+        @n_noise_points = labels.count(-1)
+        unique_labels = labels.uniq.reject { |l| l == -1 }
+        @n_clusters = unique_labels.length
+      end
+    end
+  end
+end
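
For reference, a minimal usage sketch of the adapter above, assuming the clusterkit gem is installed; the toy 2-D points stand in for real document embeddings, and the expected output is illustrative:

    require 'topical'

    # Two tight groups plus one stray point; real inputs would be
    # high-dimensional document embeddings.
    embeddings = [
      [0.10, 0.20], [0.15, 0.22], [0.12, 0.18], [0.11, 0.21],
      [0.90, 0.80], [0.92, 0.83], [0.88, 0.79], [0.91, 0.81],
      [5.00, 5.00] # far from both groups, likely labeled -1 (noise)
    ]

    adapter = Topical::Clustering::HDBSCANAdapter.new(min_cluster_size: 3, min_samples: 2)
    labels = adapter.fit_predict(embeddings)
    # => e.g. [0, 0, 0, 0, 1, 1, 1, 1, -1]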
@@ -0,0 +1,44 @@
+# frozen_string_literal: true
+
+require 'clusterkit'
+
+module Topical
+  module Clustering
+    # Adapter for ClusterKit's K-means implementation
+    class KMeansAdapter < Adapter
+      def initialize(k: 5, random_seed: nil)
+        @k = k
+        @random_seed = random_seed
+
+        @clusterer = ClusterKit::Clustering::KMeans.new(
+          k: k,
+          random_seed: random_seed
+        )
+      end
+
+      def fit_predict(embeddings)
+        labels = @clusterer.fit_predict(embeddings)
+        @n_clusters = @k
+        @n_noise_points = 0 # K-means doesn't have noise points
+        labels
+      end
+
+      def fit(embeddings)
+        @clusterer.fit(embeddings)
+        self
+      end
+
+      def predict(embeddings)
+        @clusterer.predict(embeddings)
+      end
+
+      # Access cluster centers
+      def cluster_centers
+        @clusterer.cluster_centers
+      end
+
+      # Access to the underlying ClusterKit object if needed
+      attr_reader :clusterer
+    end
+  end
+end
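
Unlike HDBSCAN, K-means can assign new points after fitting. A quick sketch with toy 2-D points (again illustrative, not real embeddings):

    points = [[0.1, 0.2], [0.12, 0.18], [0.9, 0.8], [0.92, 0.83]]

    adapter = Topical::Clustering::KMeansAdapter.new(k: 2, random_seed: 42)
    adapter.fit_predict(points)    # labels fall in 0...k; no -1 noise label
    adapter.predict([[0.13, 0.2]]) # assign a new point to a fitted cluster
    adapter.cluster_centers        # the k centroid vectors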
@@ -0,0 +1,310 @@
+# frozen_string_literal: true
+
+module Topical
+  # Main engine for topic modeling
+  class Engine
+    attr_reader :topics, :clustering_adapter, :term_extractor, :labeler
+
+    def initialize(
+      clustering_method: :hdbscan,
+      min_cluster_size: 5,
+      min_samples: 3,
+      reduce_dimensions: true,
+      n_components: 50,
+      labeling_method: :hybrid,
+      llm_provider: nil,
+      verbose: false,
+      k: nil, # Add k as an explicit parameter
+      **options
+    )
+      @clustering_method = clustering_method
+      @min_cluster_size = min_cluster_size
+      @min_samples = min_samples
+      @reduce_dimensions = reduce_dimensions
+      @n_components = n_components
+      @labeling_method = labeling_method
+      @llm_provider = llm_provider
+      @verbose = verbose
+      @options = options
+      @options[:k] = k if k # Store k in options if provided
+
+      @clustering_adapter = build_clustering_adapter
+      @term_extractor = Extractors::TermExtractor.new
+      @labeler = build_labeler
+      @topics = []
+    end
+
+    # Fit the model to embeddings and documents
+    # @param embeddings [Array<Array<Float>>] Document embeddings
+    # @param documents [Array<String>] Document texts
+    # @param metadata [Array<Hash>] Optional metadata for each document
+    # @return [Array<Topic>] Extracted topics
+    def fit(embeddings:, documents:, metadata: nil)
+      raise ArgumentError, "Embeddings and documents must have the same length" unless embeddings.length == documents.length
+
+      @embeddings = embeddings
+      @documents = documents
+      @metadata = metadata || Array.new(documents.length) { {} }
+
+      puts "Starting topic extraction..." if @verbose
+
+      # Step 1: Optionally reduce dimensions
+      working_embeddings = @embeddings
+      if @reduce_dimensions && @embeddings.first.length > @n_components
+        puts " Reducing dimensions from #{@embeddings.first.length} to #{@n_components}..." if @verbose
+        working_embeddings = reduce_dimensions(@embeddings)
+      end
+
+      # Step 2: Cluster embeddings
+      puts " Clustering #{working_embeddings.length} documents..." if @verbose
+      @cluster_ids = @clustering_adapter.fit_predict(working_embeddings)
+
+      # Step 3: Build topics from clusters
+      puts " Building topics from clusters..." if @verbose
+      @topics = build_topics(@cluster_ids)
+
+      # Step 4: Extract terms for each topic
+      puts " Extracting distinctive terms..." if @verbose
+      extract_topic_terms
+
+      # Step 5: Generate labels
+      puts " Generating topic labels..." if @verbose
+      generate_topic_labels
+
+      if @verbose
+        n_noise = @cluster_ids.count(-1)
+        puts "Found #{@topics.length} topics (plus #{n_noise} outliers)"
+      end
+
+      @topics
+    end
+
+    # Transform new documents using the fitted model
+    def transform(embeddings:, documents: nil)
+      raise "Must call fit before transform" if @topics.empty?
+
+      # Use approximate prediction if available
+      if @clustering_adapter.respond_to?(:approximate_predict)
+        @clustering_adapter.approximate_predict(embeddings)
+      else
+        # Fallback: assign to the nearest topic centroid
+        assign_to_nearest_topic(embeddings: embeddings)
+      end
+    end
+
+    def get_topic(topic_id)
+      @topics.find { |t| t.id == topic_id }
+    end
+
+    def outliers
+      return [] unless @cluster_ids
+      @documents.each_with_index.select { |_, idx|
+        @cluster_ids[idx] == -1
+      }.map(&:first)
+    end
+
+    # Save the model
+    def save(path)
+      require 'json'
+      config = {
+        clustering_method: @clustering_method,
+        min_cluster_size: @min_cluster_size,
+        min_samples: @min_samples,
+        reduce_dimensions: @reduce_dimensions,
+        n_components: @n_components,
+        labeling_method: @labeling_method
+      }
+
+      # Include k for kmeans
+      if @clustering_method == :kmeans
+        config[:k] = @options[:k] || @topics.length
+      end
+
+      data = {
+        topics: @topics.map(&:to_h),
+        config: config
+      }
+      File.write(path, JSON.pretty_generate(data))
+    end
+
+    # Load a model
+    def self.load(path)
+      require 'json'
+      data = JSON.parse(File.read(path), symbolize_names: true)
+
+      # Make sure k is passed for kmeans, and convert string values back to symbols
+      config = data[:config]
+      config[:clustering_method] = config[:clustering_method].to_sym if config[:clustering_method]
+      config[:labeling_method] = config[:labeling_method].to_sym if config[:labeling_method]
+
+      if config[:clustering_method] == :kmeans && !config[:k]
+        # Extract k from the saved topics or use the default
+        config[:k] = data[:topics]&.length || 5
+      end
+
+      engine = new(**config)
+      # Reconstruct topics
+      engine.instance_variable_set(:@topics, data[:topics].map { |t| Topic.from_h(t) })
+      engine
+    end
+
+    private
+
+    def reduce_dimensions(embeddings)
+      begin
+        require 'clusterkit'
+
+        # Validate embeddings before UMAP
+        valid_embeddings, invalid_indices = validate_embeddings_for_umap(embeddings)
+
+        if valid_embeddings.empty?
+          raise "No valid embeddings for dimensionality reduction. " \
+                "All embeddings contain invalid values (NaN, Infinity, or non-numeric)."
+        end
+
+        if invalid_indices.any? && @verbose
+          puts " Warning: #{invalid_indices.size} embeddings with invalid values removed"
+        end
+
+        # Adjust parameters based on data size
+        n_samples = valid_embeddings.size
+        n_components = [@n_components, n_samples - 1, 50].min
+        n_neighbors = [15, n_samples - 1].min
+
+        if @verbose && n_components != @n_components
+          puts " Adjusted n_components to #{n_components} (was #{@n_components}) for #{n_samples} samples"
+        end
+
+        umap = ClusterKit::Dimensionality::UMAP.new(
+          n_components: n_components,
+          n_neighbors: n_neighbors,
+          random_seed: 42
+        )
+
+        reduced = umap.fit_transform(valid_embeddings)
+
+        # If we had to remove invalid embeddings, reconstruct the full array
+        if invalid_indices.any?
+          full_reduced = []
+          valid_idx = 0
+          embeddings.size.times do |i|
+            if invalid_indices.include?(i)
+              # Use zeros for invalid embeddings (they'll be outliers anyway)
+              full_reduced << Array.new(n_components, 0.0)
+            else
+              full_reduced << reduced[valid_idx]
+              valid_idx += 1
+            end
+          end
+          full_reduced
+        else
+          reduced
+        end
+      rescue LoadError
+        puts "Warning: Dimensionality reduction requires ClusterKit. Using original embeddings." if @verbose
+        embeddings
+      rescue => e
+        puts "Warning: Dimensionality reduction failed: #{e.message}" if @verbose
+        embeddings
+      end
+    end
+
+    def validate_embeddings_for_umap(embeddings)
+      valid = []
+      invalid_indices = []
+
+      embeddings.each_with_index do |embedding, idx|
+        if embedding.is_a?(Array) &&
+           embedding.all? { |v| v.is_a?(Numeric) && v.finite? }
+          valid << embedding
+        else
+          invalid_indices << idx
+        end
+      end
+
+      [valid, invalid_indices]
+    end
+
+    def build_topics(cluster_ids)
+      # Group documents by cluster
+      clusters = {}
+      cluster_ids.each_with_index do |cluster_id, doc_idx|
+        next if cluster_id == -1 # Skip outliers
+        clusters[cluster_id] ||= []
+        clusters[cluster_id] << doc_idx
+      end
+
+      # Create Topic objects
+      clusters.map do |cluster_id, doc_indices|
+        Topic.new(
+          id: cluster_id,
+          document_indices: doc_indices,
+          documents: doc_indices.map { |i| @documents[i] },
+          embeddings: doc_indices.map { |i| @embeddings[i] },
+          metadata: doc_indices.map { |i| @metadata[i] }
+        )
+      end.sort_by(&:id)
+    end
+
+    def extract_topic_terms
+      @topics.each do |topic|
+        # Extract distinctive terms using c-TF-IDF
+        terms = @term_extractor.extract_distinctive_terms(
+          topic_docs: topic.documents,
+          all_docs: @documents,
+          top_n: 20
+        )
+
+        topic.terms = terms
+      end
+    end
+
+    def generate_topic_labels
+      @topics.each do |topic|
+        topic.label = @labeler.generate_label(topic)
+      end
+    end
+
+    def build_clustering_adapter
+      case @clustering_method
+      when :hdbscan
+        Clustering::HDBSCANAdapter.new(
+          min_cluster_size: @min_cluster_size,
+          min_samples: @min_samples
+        )
+      when :kmeans
+        Clustering::KMeansAdapter.new(k: @options[:k] || 5)
+      else
+        raise ArgumentError, "Unknown clustering method: #{@clustering_method}"
+      end
+    end
+
+    def build_labeler
+      case @labeling_method
+      when :term_based
+        Labelers::TermBased.new
+      when :llm_based
+        Labelers::LLMBased.new(provider: @llm_provider)
+      when :hybrid
+        Labelers::Hybrid.new(provider: @llm_provider)
+      else
+        Labelers::TermBased.new # Default fallback
+      end
+    end
+
+    def assign_to_nearest_topic(embeddings:)
+      # Simple nearest centroid assignment
+      topic_centroids = @topics.map(&:centroid)
+
+      embeddings.map do |embedding|
+        distances = topic_centroids.map do |centroid|
+          # Euclidean distance
+          Math.sqrt(embedding.zip(centroid).map { |a, b| (a - b) ** 2 }.sum)
+        end
+
+        min_idx = distances.index(distances.min)
+        @topics[min_idx].id
+      end
+    end
+  end
+end
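
Putting the engine together, a hedged end-to-end sketch: the documents are toy data, the random vectors stand in for output from a real embedding model (so the resulting clusters are arbitrary), dimensionality reduction is disabled to keep it self-contained, and the label/documents readers on Topic are assumed from the engine code above:

    require 'topical'

    documents = [
      "Tuning garbage collection in Ruby",
      "Benchmarking YJIT on a Rails app",
      "Baking sourdough with a rye starter",
      "Hydration ratios for pizza dough"
    ]
    # Placeholder vectors; in practice use a sentence-embedding model.
    embeddings = documents.map { Array.new(384) { rand } }

    engine = Topical::Engine.new(
      clustering_method: :kmeans,
      k: 2,
      reduce_dimensions: false,
      labeling_method: :term_based
    )
    topics = engine.fit(embeddings: embeddings, documents: documents)
    topics.each { |t| puts "#{t.id}: #{t.label} (#{t.documents.length} docs)" }

    engine.save("topics.json")
    engine = Topical::Engine.load("topics.json")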
@@ -0,0 +1,98 @@
+# frozen_string_literal: true
+
+require 'set'
+
+module Topical
+  module Extractors
+    # Extracts distinctive terms from documents using c-TF-IDF
+    class TermExtractor
+      # Default English stop words
+      DEFAULT_STOP_WORDS = Set.new(%w[
+        the be to of and a in that have i it for not on with he as you do at
+        this but his by from they we say her she or an will my one all would
+        there their what so up out if about who get which go me when make can
+        like time no just him know take people into year your good some could
+        them see other than then now look only come its over think also back
+        after use two how our work first well way even new want because any
+        these give day most us is was are been has had were said did may
+      ])
+
+      def initialize(stop_words: DEFAULT_STOP_WORDS, min_word_length: 3, max_word_length: 20)
+        @stop_words = stop_words
+        @min_word_length = min_word_length
+        @max_word_length = max_word_length
+      end
+
+      # Extract distinctive terms using c-TF-IDF
+      # @param topic_docs [Array<String>] Documents in the topic
+      # @param all_docs [Array<String>] All documents in the corpus
+      # @param top_n [Integer] Number of top terms to return
+      # @return [Array<String>] Top distinctive terms
+      def extract_distinctive_terms(topic_docs:, all_docs:, top_n: 20)
+        # Tokenize and count terms in the topic
+        topic_terms = count_terms(topic_docs)
+
+        # Tokenize and count document frequency across all docs
+        doc_frequencies = compute_document_frequencies(all_docs)
+
+        # Compute c-TF-IDF scores
+        scores = {}
+        total_docs = all_docs.length.to_f
+
+        topic_terms.each do |term, tf|
+          # c-TF-IDF formula: tf * log(N / df)
+          df = doc_frequencies[term] || 1
+          idf = Math.log(total_docs / df)
+          scores[term] = tf * idf
+        end
+
+        # Return the top-scoring terms
+        scores.sort_by { |_, score| -score }
+              .first(top_n)
+              .map(&:first)
+      end
+
+      private
+
+      def tokenize(text)
+        # Simple tokenization
+        text.downcase
+            .split(/\W+/)
+            .select { |word| valid_word?(word) }
+      end
+
+      def valid_word?(word)
+        word.length >= @min_word_length &&
+          word.length <= @max_word_length &&
+          !@stop_words.include?(word) &&
+          !word.match?(/^\d+$/) # Not pure numbers
+      end
+
+      def count_terms(documents)
+        terms = Hash.new(0)
+
+        documents.each do |doc|
+          tokenize(doc).each do |word|
+            terms[word] += 1
+          end
+        end
+
+        terms
+      end
+
+      def compute_document_frequencies(documents)
+        doc_frequencies = Hash.new(0)
+
+        documents.each do |doc|
+          # Use a Set to count each term once per document
+          unique_terms = Set.new(tokenize(doc))
+          unique_terms.each do |term|
+            doc_frequencies[term] += 1
+          end
+        end
+
+        doc_frequencies
+      end
+    end
+  end
+end
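
To make the scoring concrete, a short worked example of the tf * log(N / df) line above: with a corpus of N = 10 documents, a term that occurs 8 times in the topic but in only 2 documents overall scores high, while an equally frequent term that appears in every document is zeroed out.

    tf = 8
    tf * Math.log(10.0 / 2)  # term in 2 of 10 docs  => 8 * ln(5)  ~= 12.9
    tf * Math.log(10.0 / 10) # term in all 10 docs   => 8 * ln(1)  == 0.0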
@@ -0,0 +1,23 @@
+# frozen_string_literal: true
+
+module Topical
+  module Labelers
+    # Base class for topic labeling strategies
+    class Base
+      def generate_label(topic)
+        raise NotImplementedError, "Subclasses must implement generate_label"
+      end
+
+      protected
+
+      def capitalize_phrase(phrase)
+        phrase.split(/[\s_-]/).map(&:capitalize).join(' ')
+      end
+
+      def select_representative_docs(documents, k: 3)
+        return documents if documents.length <= k
+        documents.first(k)
+      end
+    end
+  end
+end
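
The base class defines the labeler contract; a minimal hypothetical subclass for illustration (TopTerms is not part of the gem, and Topic#terms is assumed from the engine code, which sets it during fit):

    module Topical
      module Labelers
        # Hypothetical example: label a topic with its top two terms.
        class TopTerms < Base
          def generate_label(topic)
            capitalize_phrase(topic.terms.first(2).join(' '))
          end
        end
      end
    end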
@@ -0,0 +1,24 @@
+# frozen_string_literal: true
+
+module Topical
+  module Labelers
+    # Hybrid labeling that combines term-based and LLM approaches
+    class Hybrid < Base
+      def initialize(provider: nil)
+        @term_labeler = TermBased.new
+        @llm_labeler = LLMBased.new(provider: provider)
+      end
+
+      def generate_label(topic)
+        # Start with the term-based label
+        term_label = @term_labeler.generate_label(topic)
+
+        # Try to enhance it with the LLM if available
+        llm_label = @llm_labeler.generate_label(topic)
+
+        # For now, return the LLM label unless it is the fallback; otherwise the term label
+        llm_label != "LLM Topic #{topic.id}" ? llm_label : term_label
+      end
+    end
+  end
+end
@@ -0,0 +1,126 @@
+# frozen_string_literal: true
+
+module Topical
+  module Labelers
+    # Adapter to allow different LLM backends (red-candle, remote APIs, etc.)
+    class LLMAdapter
+      # Factory method to create the appropriate LLM client
+      def self.create(type: :auto, **options)
+        case type
+        when :red_candle
+          RedCandleAdapter.new(**options)
+        when :openai
+          # Future: OpenAIAdapter.new(**options)
+          raise NotImplementedError, "OpenAI adapter not yet implemented"
+        when :anthropic
+          # Future: AnthropicAdapter.new(**options)
+          raise NotImplementedError, "Anthropic adapter not yet implemented"
+        when :auto
+          # Try red-candle first, then fall back to others
+          begin
+            RedCandleAdapter.new(**options)
+          rescue LoadError
+            nil # No LLM available
+          end
+        else
+          raise ArgumentError, "Unknown LLM type: #{type}"
+        end
+      end
+    end
+
+    # Adapter for red-candle (local LLMs)
+    class RedCandleAdapter
+      def initialize(model: nil, **options)
+        require 'red-candle'
+
+        @model = model || default_model
+        @options = options
+        @llm = load_or_create_llm
+      end
+
+      def generate(prompt:, max_tokens: 100, temperature: 0.3, response_format: nil)
+        # Red-candle specific generation
+        response = @llm.generate(
+          prompt,
+          max_length: max_tokens,
+          temperature: temperature,
+          do_sample: temperature > 0
+        )
+
+        # Handle JSON response format if requested
+        if response_format && response_format[:type] == "json_object"
+          ensure_json_response(response)
+        else
+          response
+        end
+      end
+
+      def available?
+        true
+      end
+
+      private
+
+      def default_model
+        # Use a small, fast model by default for topic labeling
+        "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
+      end
+
+      def load_or_create_llm
+        # Create a new LLM instance with red-candle
+        RedCandle::Model.new(
+          model_id: @model,
+          model_type: :llama,
+          quantized: true
+        )
+      end
+
+      def ensure_json_response(response)
+        # Try to extract JSON from the response
+        begin
+          require 'json'
+          # Look for JSON-like content
+          json_match = response.match(/\{.*\}/m)
+          if json_match
+            JSON.parse(json_match[0])
+            json_match[0] # Return the JSON string if valid
+          else
+            # Generate a basic JSON response
+            generate_fallback_json(response)
+          end
+        rescue JSON::ParserError
+          generate_fallback_json(response)
+        end
+      end
+
+      def generate_fallback_json(text)
+        # Create a simple JSON object from a text response
+        require 'json'
+        label = text.lines.first&.strip || "Unknown"
+        {
+          label: label,
+          description: text,
+          confidence: 0.5
+        }.to_json
+      end
+    end
+
+    # Future adapter for remote LLMs
+    class RemoteAdapter
+      def initialize(api_key:, endpoint:, **options)
+        @api_key = api_key
+        @endpoint = endpoint
+        @options = options
+      end
+
+      def generate(prompt:, max_tokens: 100, temperature: 0.3, response_format: nil)
+        # Make the API call
+        raise NotImplementedError, "Remote LLM adapter coming soon"
+      end
+
+      def available?
+        !@api_key.nil?
+      end
+    end
+  end
+end
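
Finally, a hedged sketch of the factory in use. With type: :auto the factory returns nil when red-candle is not installed, so callers should guard for that; the prompt text is illustrative:

    llm = Topical::Labelers::LLMAdapter.create(type: :auto)

    if llm&.available?
      json = llm.generate(
        prompt: "Return JSON with a short label for: sourdough, starter, hydration",
        max_tokens: 40,
        response_format: { type: "json_object" } # returns a JSON string
      )
    end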