topical 0.0.1.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/topical/labelers/llm_based.rb ADDED
@@ -0,0 +1,111 @@
+ # frozen_string_literal: true
+
+ module Topical
+   module Labelers
+     # LLM-powered topic labeling (requires red-candle or other LLM provider)
+     class LLMBased < Base
+       def initialize(provider: nil)
+         @provider = provider
+       end
+
+       def generate_label(topic)
+         unless llm_available?
+           # Fallback to term-based if LLM not available
+           return TermBased.new.generate_label(topic)
+         end
+
+         # Select best documents to send to LLM
+         sample_docs = topic.representative_docs(k: 3)
+
+         # Generate comprehensive analysis
+         response = analyze_with_llm(sample_docs, topic.terms)
+
+         response[:label]
+       rescue => e
+         # Fallback on error
+         puts "LLM labeling failed: #{e.message}" if ENV['DEBUG']
+         TermBased.new.generate_label(topic)
+       end
+
+       private
+
+       def llm_available?
+         return true if @provider
+
+         # Try to create LLM adapter
+         begin
+           require_relative 'llm_adapter'
+           @provider = LLMAdapter.create(type: :auto)
+           @provider && @provider.available?
+         rescue LoadError, StandardError => e
+           puts "LLM not available: #{e.message}" if ENV['DEBUG']
+           false
+         end
+       end
+
+       def analyze_with_llm(documents, terms)
+         prompt = build_analysis_prompt(documents, terms)
+
+         response = @provider.generate(
+           prompt: prompt,
+           max_tokens: 150,
+           temperature: 0.3,
+           response_format: { type: "json_object" }
+         )
+
+         # Parse JSON response
+         require 'json'
+         result = JSON.parse(response, symbolize_names: true)
+
+         # Validate and clean
+         {
+           label: clean_label(result[:label]),
+           description: result[:description] || "Topic about #{result[:label]}",
+           themes: result[:themes] || [],
+           confidence: result[:confidence] || 0.8
+         }
+       end
+
+       def build_analysis_prompt(documents, terms)
+         doc_samples = documents.map.with_index do |doc, i|
+           preview = doc.length > 300 ? "#{doc[0..300]}..." : doc
+           "Document #{i + 1}:\n#{preview}"
+         end.join("\n\n")
+
+         <<~PROMPT
+           Analyze this cluster of related documents and provide a structured summary.
+
+           Distinctive terms found: #{terms.first(10).join(', ')}
+
+           Sample documents:
+           #{doc_samples}
+
+           Provide a JSON response with:
+           {
+             "label": "A 2-4 word topic label",
+             "description": "One sentence describing what connects these documents",
+             "themes": ["theme1", "theme2", "theme3"],
+             "confidence": 0.0-1.0 score of how coherent this topic is
+           }
+
+           Focus on what meaningfully connects these documents, not just common words.
+         PROMPT
+       end
+
+       def clean_label(label)
+         return "Unknown Topic" unless label
+
+         # Remove quotes, trim, limit length
+         cleaned = label.to_s.strip.gsub(/^["']|["']$/, '')
+         cleaned = cleaned.split("\n").first if cleaned.include?("\n")
+
+         # Limit to reasonable length
+         if cleaned.length > 50
+           cleaned[0..47] + "..."
+         else
+           cleaned
+         end
+       end
+     end
+   end
+ end
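
Because `llm_available?` short-circuits to true whenever a provider is injected, the labeler can be exercised without red-candle installed. A minimal sketch under that assumption; `StubProvider` is a hypothetical stand-in for whatever `LLMAdapter.create` would return, not part of the gem:

```ruby
require 'json'
require 'topical'

# Hypothetical provider: any object responding to #generate (and #available?)
# can be injected, since llm_available? returns true when @provider is set.
class StubProvider
  def available? = true

  def generate(prompt:, max_tokens:, temperature:, response_format:)
    # A real provider would call an LLM here; we return canned JSON instead.
    JSON.generate(
      label: "Ruby Topic Modeling",
      description: "Documents about clustering Ruby text embeddings",
      themes: %w[clustering embeddings labeling],
      confidence: 0.9
    )
  end
end

labeler = Topical::Labelers::LLMBased.new(provider: StubProvider.new)
# labeler.generate_label(topic)  # => "Ruby Topic Modeling"
```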
data/lib/topical/labelers/term_based.rb ADDED
@@ -0,0 +1,22 @@
+ # frozen_string_literal: true
+
+ module Topical
+   module Labelers
+     # Fast term-based labeling using top distinctive terms
+     class TermBased < Base
+       def generate_label(topic)
+         terms = topic.terms
+         return "Topic #{topic.id}" if terms.empty?
+
+         # Take top distinctive terms
+         label_terms = terms.first(3).select { |t| t.length > 3 }
+
+         if label_terms.length >= 2
+           "#{capitalize_phrase(label_terms[0])} & #{capitalize_phrase(label_terms[1])}"
+         else
+           capitalize_phrase(label_terms.first || terms.first)
+         end
+       end
+     end
+   end
+ end
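
The term-based fallback is deterministic and cheap. A sketch of its behavior, assuming `capitalize_phrase` (defined on the `Base` superclass, which is not part of this diff) title-cases its argument:

```ruby
require 'topical'

# Minimal stand-in for Topic: generate_label only touches #terms and #id.
FakeTopic = Struct.new(:id, :terms)

labeler = Topical::Labelers::TermBased.new

labeler.generate_label(FakeTopic.new(0, []))
# => "Topic 0"

# "ai" is filtered out (length <= 3); the top two survivors are joined.
labeler.generate_label(FakeTopic.new(1, ["neural", "network", "ai"]))
# => "Neural & Network"  (assuming capitalize_phrase title-cases each term)
```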
data/lib/topical/metrics.rb ADDED
@@ -0,0 +1,188 @@
+ # frozen_string_literal: true
+
+ require 'set'
+
+ module Topical
+   module Metrics
+     extend self
+
+     # Compute UMass Coherence for topic quality
+     # Higher coherence = more interpretable topic
+     def compute_coherence(terms, documents, top_n: 10)
+       return 0.0 if terms.empty? || documents.empty?
+
+       # Use top N terms
+       eval_terms = terms.first(top_n)
+       return 0.0 if eval_terms.length < 2
+
+       # Create document term matrix for co-occurrence
+       doc_term_counts = count_cooccurrences(eval_terms, documents)
+
+       # Compute UMass coherence
+       coherence_sum = 0.0
+       pairs_count = 0
+
+       eval_terms.each_with_index do |term_i, i|
+         eval_terms.each_with_index do |term_j, j|
+           next unless j < i # Only upper triangle
+
+           # P(term_i, term_j) = co-occurrence count
+           cooccur = doc_term_counts["#{term_i},#{term_j}"] || 0
+           # P(term_j) = document frequency
+           doc_freq_j = doc_term_counts[term_j] || 0
+
+           if cooccur > 0 && doc_freq_j > 0
+             # UMass: log((cooccur + 1) / doc_freq_j)
+             coherence_sum += Math.log((cooccur + 1.0) / doc_freq_j)
+             pairs_count += 1
+           end
+         end
+       end
+
+       return 0.0 if pairs_count == 0
+
+       # Normalize by number of pairs
+       coherence = coherence_sum / pairs_count
+
+       # Transform to 0-1 range (coherence is typically negative)
+       # More negative = less coherent, so we reverse and bound
+       normalized = 1.0 / (1.0 + Math.exp(-coherence))
+       normalized
+     end
+
+     # Compute how distinct a topic is from others
+     def compute_distinctiveness(topic, other_topics)
+       return 1.0 if other_topics.empty?
+
+       topic_terms = Set.new(topic.terms.first(20))
+
+       # Compare with other topics
+       overlaps = other_topics.map do |other|
+         next if other.id == topic.id
+
+         other_terms = Set.new(other.terms.first(20))
+         overlap = (topic_terms & other_terms).size.to_f
+
+         # Jaccard similarity
+         union_size = (topic_terms | other_terms).size
+         union_size > 0 ? overlap / union_size : 0
+       end.compact
+
+       return 1.0 if overlaps.empty?
+
+       # Distinctiveness = 1 - average overlap
+       1.0 - (overlaps.sum / overlaps.length)
+     end
+
+     # Compute diversity across all topics
+     def compute_diversity(topics)
+       return 0.0 if topics.length < 2
+
+       # Collect all term sets
+       term_sets = topics.map { |t| Set.new(t.terms.first(20)) }
+
+       # Compute pairwise Jaccard distances
+       distances = []
+       term_sets.each_with_index do |set_i, i|
+         term_sets.each_with_index do |set_j, j|
+           next unless j > i # Only upper triangle
+
+           intersection = (set_i & set_j).size.to_f
+           union = (set_i | set_j).size.to_f
+
+           # Jaccard distance = 1 - Jaccard similarity
+           distance = union > 0 ? 1.0 - (intersection / union) : 1.0
+           distances << distance
+         end
+       end
+
+       # Average distance = diversity
+       distances.sum / distances.length
+     end
+
+     # Compute coverage (what fraction of docs are in topics vs outliers)
+     def compute_coverage(topics, total_documents)
+       return 0.0 if total_documents == 0
+
+       docs_in_topics = topics.sum(&:size)
+       docs_in_topics.to_f / total_documents
+     end
+
+     # Silhouette score for cluster quality
+     def compute_silhouette_score(topic, all_topics, embeddings)
+       return 0.0 if topic.embeddings.empty?
+
+       silhouettes = []
+
+       topic.embeddings.each_with_index do |embedding, idx|
+         # a(i) = average distance to other points in same cluster
+         if topic.embeddings.length > 1
+           a_i = topic.embeddings.each_with_index
+                      .reject { |_, j| j == idx }
+                      .map { |other, _| euclidean_distance(embedding, other) }
+                      .sum.to_f / (topic.embeddings.length - 1)
+         else
+           a_i = 0.0
+         end
+
+         # b(i) = minimum average distance to points in other clusters
+         b_values = all_topics.reject { |t| t.id == topic.id }.map do |other_topic|
+           next if other_topic.embeddings.empty?
+
+           avg_dist = other_topic.embeddings
+                                 .map { |other| euclidean_distance(embedding, other) }
+                                 .sum.to_f / other_topic.embeddings.length
+           avg_dist
+         end.compact
+
+         b_i = b_values.min || a_i
+
+         # Silhouette coefficient
+         if a_i == 0 && b_i == 0
+           s_i = 0
+         else
+           s_i = (b_i - a_i) / [a_i, b_i].max
+         end
+
+         silhouettes << s_i
+       end
+
+       # Average silhouette score for topic
+       silhouettes.sum / silhouettes.length
+     end
+
+     private
+
+     def count_cooccurrences(terms, documents)
+       counts = Hash.new(0)
+
+       documents.each do |doc|
+         doc_lower = doc.downcase
+
+         # Count individual term occurrences
+         terms.each do |term|
+           counts[term] += 1 if doc_lower.include?(term.downcase)
+         end
+
+         # Count co-occurrences
+         terms.each_with_index do |term_i, i|
+           terms.each_with_index do |term_j, j|
+             next unless j < i
+
+             if doc_lower.include?(term_i.downcase) && doc_lower.include?(term_j.downcase)
+               counts["#{term_i},#{term_j}"] += 1
+             end
+           end
+         end
+       end
+
+       counts
+     end
+
+     def euclidean_distance(vec1, vec2)
+       Math.sqrt(
+         vec1.zip(vec2).map { |a, b| (a - b) ** 2 }.sum
+       )
+     end
+   end
+ end
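
In symbols: with D(w) the number of sampled documents containing term w and D(w_i, w_j) the number containing both, `compute_coherence` averages the UMass score over the |P| term pairs that actually co-occur, then squashes it through a logistic so the result lands in (0, 1); `compute_silhouette_score` is the standard silhouette coefficient, where a(i) is the mean intra-cluster distance and b(i) the smallest mean distance to another cluster:

```latex
C_{\text{UMass}} = \frac{1}{|P|} \sum_{(w_i,\, w_j) \in P} \log \frac{D(w_i, w_j) + 1}{D(w_j)},
\qquad
\text{score} = \sigma(C_{\text{UMass}}) = \frac{1}{1 + e^{-C_{\text{UMass}}}}

s(i) = \frac{b(i) - a(i)}{\max\{a(i),\, b(i)\}}
```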
data/lib/topical/topic.rb ADDED
@@ -0,0 +1,114 @@
+ # frozen_string_literal: true
+
+ module Topical
+   # Represents a discovered topic
+   class Topic
+     attr_reader :id, :document_indices, :documents, :embeddings, :metadata
+     attr_accessor :terms, :label, :description, :distinctiveness
+     attr_writer :coherence
+
+     def initialize(id:, document_indices:, documents:, embeddings:, metadata: nil)
+       @id = id
+       @document_indices = document_indices
+       @documents = documents
+       @embeddings = embeddings
+       @metadata = metadata || []
+       @terms = []
+       @label = nil
+       @description = nil
+       @coherence = nil
+       @distinctiveness = 0.0
+     end
+
+     # Number of documents in this topic
+     def size
+       @documents.length
+     end
+
+     # Compute the centroid of the topic
+     def centroid
+       @centroid ||= compute_centroid
+     end
+
+     # Get the most representative documents
+     # @param k [Integer] Number of documents to return
+     # @return [Array<String>] Representative documents
+     def representative_docs(k: 3)
+       return @documents if @documents.length <= k
+
+       # Find documents closest to centroid
+       distances = @embeddings.map { |embedding| distance_to_centroid(embedding) }
+
+       # Get indices of k smallest distances
+       top_indices = distances.each_with_index.sort_by(&:first).first(k).map(&:last)
+       top_indices.map { |i| @documents[i] }
+     end
+
+     # Compute topic coherence (simple PMI-based score)
+     def coherence
+       @coherence ||= compute_coherence
+     end
+
+     # Convert to hash for serialization
+     def to_h
+       {
+         id: @id,
+         label: @label,
+         description: @description,
+         size: size,
+         terms: @terms,
+         coherence: @coherence,
+         distinctiveness: @distinctiveness,
+         document_indices: @document_indices
+       }
+     end
+
+     # Create from hash
+     def self.from_h(hash)
+       topic = new(
+         id: hash[:id],
+         document_indices: hash[:document_indices],
+         documents: [], # Would need to be reconstructed
+         embeddings: [] # Would need to be reconstructed
+       )
+       topic.label = hash[:label]
+       topic.description = hash[:description]
+       topic.terms = hash[:terms]
+       topic.coherence = hash[:coherence] || 0.0
+       topic.distinctiveness = hash[:distinctiveness] || 0.0
+       topic
+     end
+
+     private
+
+     def compute_coherence
+       # Use the Metrics module for proper coherence calculation
+       return 0.0 if @terms.empty? || @documents.empty?
+
+       Metrics.compute_coherence(@terms, @documents, top_n: 10)
+     end
+
+     def compute_centroid
+       return [] if @embeddings.empty?
+
+       # Compute mean of all embeddings
+       dim = @embeddings.first.length
+       centroid = Array.new(dim, 0.0)
+
+       @embeddings.each do |embedding|
+         embedding.each_with_index do |val, idx|
+           centroid[idx] += val
+         end
+       end
+
+       centroid.map { |val| val / @embeddings.length }
+     end
+
+     def distance_to_centroid(embedding)
+       # Euclidean distance
+       Math.sqrt(
+         embedding.zip(centroid).map { |a, b| (a - b) ** 2 }.sum
+       )
+     end
+   end
+ end
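
A small sketch of the `Topic` API above, with hand-written 2-D vectors standing in for real embeddings (values chosen only for illustration):

```ruby
require 'topical'

topic = Topical::Topic.new(
  id: 0,
  document_indices: [0, 1, 2],
  documents: ["ruby gems", "ruby modules", "python packaging"],
  embeddings: [[0.0, 1.0], [0.1, 0.9], [0.9, 0.1]]
)

topic.centroid                  # => [0.333..., 0.666...] (mean of the vectors)
topic.representative_docs(k: 2) # the two documents nearest that centroid

# to_h / from_h round-trips labels and indices but not documents or embeddings:
restored = Topical::Topic.from_h(topic.to_h)
restored.size # => 0, until documents are reattached
```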
data/lib/topical/version.rb ADDED
@@ -0,0 +1,5 @@
+ # frozen_string_literal: true
+
+ module Topical
+   VERSION = "0.0.1.pre.1"
+ end
data/lib/topical.rb ADDED
@@ -0,0 +1,55 @@
+ # frozen_string_literal: true
+
+ require_relative "topical/version"
+
+ # Main module for topic modeling
+ module Topical
+   class Error < StandardError; end
+
+   # Autoload components for better performance
+   autoload :Engine, "topical/engine"
+   autoload :Topic, "topical/topic"
+   autoload :Metrics, "topical/metrics"
+
+   module Clustering
+     autoload :Adapter, "topical/clustering/adapter"
+     autoload :HDBSCANAdapter, "topical/clustering/hdbscan_adapter"
+     autoload :KMeansAdapter, "topical/clustering/kmeans_adapter"
+   end
+
+   module Dimensionality
+     autoload :Reducer, "topical/dimensionality/reducer"
+   end
+
+   module Extractors
+     autoload :TermExtractor, "topical/extractors/term_extractor"
+     autoload :Stopwords, "topical/extractors/stopwords"
+   end
+
+   module Labelers
+     autoload :Base, "topical/labelers/base"
+     autoload :TermBased, "topical/labelers/term_based"
+     autoload :LLMBased, "topical/labelers/llm_based"
+     autoload :Hybrid, "topical/labelers/hybrid"
+   end
+
+   # Convenience method for simple topic extraction
+   # @param embeddings [Array<Array<Float>>] Document embeddings
+   # @param documents [Array<String>] Document texts
+   # @param options [Hash] Additional options
+   # @return [Array<Topic>] Extracted topics
+   def self.extract(embeddings:, documents:, **options)
+     engine = Engine.new(**options)
+     engine.fit(embeddings: embeddings, documents: documents)
+   end
+
+   # Check if red-candle is available for enhanced features
+   def self.llm_available?
+     @llm_available ||= begin
+       require 'red-candle'
+       true
+     rescue LoadError
+       false
+     end
+   end
+ end
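
Tying the pieces together, a usage sketch for the `Topical.extract` convenience method. Embeddings would normally come from an embedding model (e.g. via red-candle); the hand-written vectors here only keep the sketch self-contained, and whether the default clustering settings form clusters from input this tiny is not something the sketch asserts:

```ruby
require 'topical'

# Hand-written 2-D vectors stand in for real document embeddings.
embeddings = [
  [0.9, 0.1], [0.8, 0.2], # packaging-flavored documents
  [0.1, 0.9], [0.2, 0.8]  # ML-flavored documents
]
documents = [
  "Ruby gems and bundler",
  "Packaging Ruby libraries",
  "Neural network training",
  "Deep learning optimizers"
]

topics = Topical.extract(embeddings: embeddings, documents: documents)
topics.each { |topic| puts "#{topic.label}: #{topic.size} docs" }
```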
data/sig/topical.rbs ADDED
@@ -0,0 +1,4 @@
+ module Topical
+   VERSION: String
+   # See the writing guide of rbs: https://github.com/ruby/rbs#guides
+ end
metadata ADDED
@@ -0,0 +1,142 @@
+ --- !ruby/object:Gem::Specification
+ name: topical
+ version: !ruby/object:Gem::Version
+   version: 0.0.1.pre.1
+ platform: ruby
+ authors:
+ - Chris Petersen
+ autorequire:
+ bindir: exe
+ cert_chain: []
+ date: 2025-08-30 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: clusterkit
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0.1'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0.1'
+ - !ruby/object:Gem::Dependency
+   name: red-candle
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '1.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '1.0'
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.0'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '13.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '13.0'
+ - !ruby/object:Gem::Dependency
+   name: standard
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.3'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.3'
+ description: Extract topics from document embeddings using HDBSCAN clustering and
+   c-TF-IDF term extraction. Provides automatic topic labeling, quality metrics, and
+   support for various clustering algorithms.
+ email:
+ - chris@petersen.io
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - ".rspec"
+ - ".standard.yml"
+ - CODE_OF_CONDUCT.md
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - examples/quick_demo.rb
+ - examples/verify_migration.rb
+ - lib/topical.rb
+ - lib/topical/clustering/adapter.rb
+ - lib/topical/clustering/hdbscan_adapter.rb
+ - lib/topical/clustering/kmeans_adapter.rb
+ - lib/topical/engine.rb
+ - lib/topical/extractors/term_extractor.rb
+ - lib/topical/labelers/base.rb
+ - lib/topical/labelers/hybrid.rb
+ - lib/topical/labelers/llm_adapter.rb
+ - lib/topical/labelers/llm_based.rb
+ - lib/topical/labelers/term_based.rb
+ - lib/topical/metrics.rb
+ - lib/topical/topic.rb
+ - lib/topical/version.rb
+ - sig/topical.rbs
+ homepage: https://github.com/cpetersen/topical
+ licenses:
+ - MIT
+ metadata:
+   homepage_uri: https://github.com/cpetersen/topical
+   source_code_uri: https://github.com/cpetersen/topical
+   changelog_uri: https://github.com/cpetersen/topical/blob/main/CHANGELOG.md
+   documentation_uri: https://rubydoc.info/gems/topical
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: 3.1.0
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.5.3
+ signing_key:
+ specification_version: 4
+ summary: Topic modeling for Ruby using modern clustering algorithms
+ test_files: []
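
Given the prerelease version and the gemspec above, pulling the gem into a project would look like the following sketch; the `clusterkit` runtime dependency comes along automatically, while red-candle is only a development dependency of the gem itself and must be added explicitly to enable LLM-based labeling:

```ruby
# Gemfile
gem "topical", "0.0.1.pre.1"

# Optional: enables LLM-based labeling at runtime.
gem "red-candle", ">= 1.0"
```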