topical 0.0.1.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/topical/labelers/llm_based.rb ADDED
@@ -0,0 +1,111 @@
+ # frozen_string_literal: true
+
+ module Topical
+   module Labelers
+     # LLM-powered topic labeling (requires red-candle or other LLM provider)
+     class LLMBased < Base
+       def initialize(provider: nil)
+         @provider = provider
+       end
+
+       def generate_label(topic)
+         unless llm_available?
+           # Fallback to term-based if LLM not available
+           return TermBased.new.generate_label(topic)
+         end
+
+         # Select best documents to send to LLM
+         sample_docs = topic.representative_docs(k: 3)
+
+         # Generate comprehensive analysis
+         response = analyze_with_llm(sample_docs, topic.terms)
+
+         response[:label]
+       rescue => e
+         # Fallback on error
+         puts "LLM labeling failed: #{e.message}" if ENV['DEBUG']
+         TermBased.new.generate_label(topic)
+       end
+
+       private
+
+       def llm_available?
+         return true if @provider
+
+         # Try to create LLM adapter
+         begin
+           require_relative 'llm_adapter'
+           @provider = LLMAdapter.create(type: :auto)
+           @provider && @provider.available?
+         rescue LoadError, StandardError => e
+           puts "LLM not available: #{e.message}" if ENV['DEBUG']
+           false
+         end
+       end
+
+       def analyze_with_llm(documents, terms)
+         prompt = build_analysis_prompt(documents, terms)
+
+         response = @provider.generate(
+           prompt: prompt,
+           max_tokens: 150,
+           temperature: 0.3,
+           response_format: { type: "json_object" }
+         )
+
+         # Parse JSON response
+         require 'json'
+         result = JSON.parse(response, symbolize_names: true)
+
+         # Validate and clean
+         {
+           label: clean_label(result[:label]),
+           description: result[:description] || "Topic about #{result[:label]}",
+           themes: result[:themes] || [],
+           confidence: result[:confidence] || 0.8
+         }
+       end
+
+       def build_analysis_prompt(documents, terms)
+         doc_samples = documents.map.with_index do |doc, i|
+           preview = doc.length > 300 ? "#{doc[0..300]}..." : doc
+           "Document #{i + 1}:\n#{preview}"
+         end.join("\n\n")
+
+         <<~PROMPT
+           Analyze this cluster of related documents and provide a structured summary.
+
+           Distinctive terms found: #{terms.first(10).join(', ')}
+
+           Sample documents:
+           #{doc_samples}
+
+           Provide a JSON response with:
+           {
+             "label": "A 2-4 word topic label",
+             "description": "One sentence describing what connects these documents",
+             "themes": ["theme1", "theme2", "theme3"],
+             "confidence": 0.0-1.0 score of how coherent this topic is
+           }
+
+           Focus on what meaningfully connects these documents, not just common words.
+         PROMPT
+       end
+
+       def clean_label(label)
+         return "Unknown Topic" unless label
+
+         # Remove quotes, trim, limit length
+         cleaned = label.to_s.strip.gsub(/^["']|["']$/, '')
+         cleaned = cleaned.split("\n").first if cleaned.include?("\n")
+
+         # Limit to reasonable length
+         if cleaned.length > 50
+           cleaned[0..47] + "..."
+         else
+           cleaned
+         end
+       end
+     end
+   end
+ end
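
Because `llm_available?` short-circuits to true whenever a provider is injected, the labeler can be exercised without red-candle installed. A minimal sketch under that assumption; `StubProvider` is a hypothetical stand-in for whatever `LLMAdapter.create` would return, not part of the gem:

```ruby
require 'json'
require 'topical'

# Hypothetical provider: any object responding to #generate (and #available?)
# can be injected, since llm_available? returns true when @provider is set.
class StubProvider
  def available? = true

  def generate(prompt:, max_tokens:, temperature:, response_format:)
    # A real provider would call an LLM here; we return canned JSON instead.
    JSON.generate(
      label: "Ruby Topic Modeling",
      description: "Documents about clustering Ruby text embeddings",
      themes: %w[clustering embeddings labeling],
      confidence: 0.9
    )
  end
end

labeler = Topical::Labelers::LLMBased.new(provider: StubProvider.new)
# labeler.generate_label(topic)  # => "Ruby Topic Modeling"
```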
data/lib/topical/labelers/term_based.rb ADDED
@@ -0,0 +1,22 @@
+ # frozen_string_literal: true
+
+ module Topical
+   module Labelers
+     # Fast term-based labeling using top distinctive terms
+     class TermBased < Base
+       def generate_label(topic)
+         terms = topic.terms
+         return "Topic #{topic.id}" if terms.empty?
+
+         # Take top distinctive terms
+         label_terms = terms.first(3).select { |t| t.length > 3 }
+
+         if label_terms.length >= 2
+           "#{capitalize_phrase(label_terms[0])} & #{capitalize_phrase(label_terms[1])}"
+         else
+           capitalize_phrase(label_terms.first || terms.first)
+         end
+       end
+     end
+   end
+ end
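
The term-based fallback is deterministic and cheap. A sketch of its behavior, assuming `capitalize_phrase` (defined on the `Base` superclass, which is not part of this diff) title-cases its argument:

```ruby
require 'topical'

# Minimal stand-in for Topic: generate_label only touches #terms and #id.
FakeTopic = Struct.new(:id, :terms)

labeler = Topical::Labelers::TermBased.new

labeler.generate_label(FakeTopic.new(0, []))
# => "Topic 0"

# "ai" is filtered out (length <= 3); the top two survivors are joined.
labeler.generate_label(FakeTopic.new(1, ["neural", "network", "ai"]))
# => "Neural & Network"  (assuming capitalize_phrase title-cases each term)
```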
data/lib/topical/metrics.rb ADDED
@@ -0,0 +1,188 @@
+ # frozen_string_literal: true
+
+ require 'set'
+
+ module Topical
+   module Metrics
+     extend self
+
+     # Compute UMass Coherence for topic quality
+     # Higher coherence = more interpretable topic
+     def compute_coherence(terms, documents, top_n: 10)
+       return 0.0 if terms.empty? || documents.empty?
+
+       # Use top N terms
+       eval_terms = terms.first(top_n)
+       return 0.0 if eval_terms.length < 2
+
+       # Create document term matrix for co-occurrence
+       doc_term_counts = count_cooccurrences(eval_terms, documents)
+
+       # Compute UMass coherence
+       coherence_sum = 0.0
+       pairs_count = 0
+
+       eval_terms.each_with_index do |term_i, i|
+         eval_terms.each_with_index do |term_j, j|
+           next unless j < i # Only upper triangle
+
+           # P(term_i, term_j) = co-occurrence count
+           cooccur = doc_term_counts["#{term_i},#{term_j}"] || 0
+           # P(term_j) = document frequency
+           doc_freq_j = doc_term_counts[term_j] || 0
+
+           if cooccur > 0 && doc_freq_j > 0
+             # UMass: log((cooccur + 1) / doc_freq_j)
+             coherence_sum += Math.log((cooccur + 1.0) / doc_freq_j)
+             pairs_count += 1
+           end
+         end
+       end
+
+       return 0.0 if pairs_count == 0
+
+       # Normalize by number of pairs
+       coherence = coherence_sum / pairs_count
+
+       # Transform to 0-1 range (coherence is typically negative)
+       # More negative = less coherent, so we reverse and bound
+       normalized = 1.0 / (1.0 + Math.exp(-coherence))
+       normalized
+     end
+
+     # Compute how distinct a topic is from others
+     def compute_distinctiveness(topic, other_topics)
+       return 1.0 if other_topics.empty?
+
+       topic_terms = Set.new(topic.terms.first(20))
+
+       # Compare with other topics
+       overlaps = other_topics.map do |other|
+         next if other.id == topic.id
+
+         other_terms = Set.new(other.terms.first(20))
+         overlap = (topic_terms & other_terms).size.to_f
+
+         # Jaccard similarity
+         union_size = (topic_terms | other_terms).size
+         union_size > 0 ? overlap / union_size : 0
+       end.compact
+
+       return 1.0 if overlaps.empty?
+
+       # Distinctiveness = 1 - average overlap
+       1.0 - (overlaps.sum / overlaps.length)
+     end
+
+     # Compute diversity across all topics
+     def compute_diversity(topics)
+       return 0.0 if topics.length < 2
+
+       # Collect all term sets
+       term_sets = topics.map { |t| Set.new(t.terms.first(20)) }
+
+       # Compute pairwise Jaccard distances
+       distances = []
+       term_sets.each_with_index do |set_i, i|
+         term_sets.each_with_index do |set_j, j|
+           next unless j > i # Only upper triangle
+
+           intersection = (set_i & set_j).size.to_f
+           union = (set_i | set_j).size.to_f
+
+           # Jaccard distance = 1 - Jaccard similarity
+           distance = union > 0 ? 1.0 - (intersection / union) : 1.0
+           distances << distance
+         end
+       end
+
+       # Average distance = diversity
+       distances.sum / distances.length
+     end
+
+     # Compute coverage (what fraction of docs are in topics vs outliers)
+     def compute_coverage(topics, total_documents)
+       return 0.0 if total_documents == 0
+
+       docs_in_topics = topics.sum(&:size)
+       docs_in_topics.to_f / total_documents
+     end
+
+     # Silhouette score for cluster quality
+     def compute_silhouette_score(topic, all_topics, embeddings)
+       return 0.0 if topic.embeddings.empty?
+
+       silhouettes = []
+
+       topic.embeddings.each_with_index do |embedding, idx|
+         # a(i) = average distance to other points in same cluster
+         if topic.embeddings.length > 1
+           a_i = topic.embeddings.each_with_index
+                      .reject { |_, j| j == idx }
+                      .map { |other, _| euclidean_distance(embedding, other) }
+                      .sum.to_f / (topic.embeddings.length - 1)
+         else
+           a_i = 0.0
+         end
+
+         # b(i) = minimum average distance to points in other clusters
+         b_values = all_topics.reject { |t| t.id == topic.id }.map do |other_topic|
+           next if other_topic.embeddings.empty?
+
+           avg_dist = other_topic.embeddings
+                                 .map { |other| euclidean_distance(embedding, other) }
+                                 .sum.to_f / other_topic.embeddings.length
+           avg_dist
+         end.compact
+
+         b_i = b_values.min || a_i
+
+         # Silhouette coefficient
+         if a_i == 0 && b_i == 0
+           s_i = 0
+         else
+           s_i = (b_i - a_i) / [a_i, b_i].max
+         end
+
+         silhouettes << s_i
+       end
+
+       # Average silhouette score for topic
+       silhouettes.sum / silhouettes.length
+     end
+
+     private
+
+     def count_cooccurrences(terms, documents)
+       counts = Hash.new(0)
+
+       documents.each do |doc|
+         doc_lower = doc.downcase
+
+         # Count individual term occurrences
+         terms.each do |term|
+           counts[term] += 1 if doc_lower.include?(term.downcase)
+         end
+
+         # Count co-occurrences
+         terms.each_with_index do |term_i, i|
+           terms.each_with_index do |term_j, j|
+             next unless j < i
+
+             if doc_lower.include?(term_i.downcase) && doc_lower.include?(term_j.downcase)
+               counts["#{term_i},#{term_j}"] += 1
+             end
+           end
+         end
+       end
+
+       counts
+     end
+
+     def euclidean_distance(vec1, vec2)
+       Math.sqrt(
+         vec1.zip(vec2).map { |a, b| (a - b) ** 2 }.sum
+       )
+     end
+   end
+ end
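
In symbols: with D(w) the number of sampled documents containing term w and D(w_i, w_j) the number containing both, `compute_coherence` averages the UMass score over the |P| term pairs that actually co-occur, then squashes it through a logistic so the result lands in (0, 1); `compute_silhouette_score` is the standard silhouette coefficient, where a(i) is the mean intra-cluster distance and b(i) the smallest mean distance to another cluster:

```latex
C_{\text{UMass}} = \frac{1}{|P|} \sum_{(w_i,\, w_j) \in P} \log \frac{D(w_i, w_j) + 1}{D(w_j)},
\qquad
\text{score} = \sigma(C_{\text{UMass}}) = \frac{1}{1 + e^{-C_{\text{UMass}}}}

s(i) = \frac{b(i) - a(i)}{\max\{a(i),\, b(i)\}}
```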
data/lib/topical/topic.rb ADDED
@@ -0,0 +1,114 @@
+ # frozen_string_literal: true
+
+ module Topical
+   # Represents a discovered topic
+   class Topic
+     attr_reader :id, :document_indices, :documents, :embeddings, :metadata
+     attr_accessor :terms, :label, :description, :distinctiveness
+     attr_writer :coherence
+
+     def initialize(id:, document_indices:, documents:, embeddings:, metadata: nil)
+       @id = id
+       @document_indices = document_indices
+       @documents = documents
+       @embeddings = embeddings
+       @metadata = metadata || []
+       @terms = []
+       @label = nil
+       @description = nil
+       @coherence = nil
+       @distinctiveness = 0.0
+     end
+
+     # Number of documents in this topic
+     def size
+       @documents.length
+     end
+
+     # Compute the centroid of the topic
+     def centroid
+       @centroid ||= compute_centroid
+     end
+
+     # Get the most representative documents
+     # @param k [Integer] Number of documents to return
+     # @return [Array<String>] Representative documents
+     def representative_docs(k: 3)
+       return @documents if @documents.length <= k
+
+       # Find documents closest to centroid
+       distances = @embeddings.map { |embedding| distance_to_centroid(embedding) }
+
+       # Get indices of k smallest distances
+       top_indices = distances.each_with_index.sort_by(&:first).first(k).map(&:last)
+       top_indices.map { |i| @documents[i] }
+     end
+
+     # Compute topic coherence (simple PMI-based score)
+     def coherence
+       @coherence ||= compute_coherence
+     end
+
+     # Convert to hash for serialization
+     def to_h
+       {
+         id: @id,
+         label: @label,
+         description: @description,
+         size: size,
+         terms: @terms,
+         coherence: @coherence,
+         distinctiveness: @distinctiveness,
+         document_indices: @document_indices
+       }
+     end
+
+     # Create from hash
+     def self.from_h(hash)
+       topic = new(
+         id: hash[:id],
+         document_indices: hash[:document_indices],
+         documents: [], # Would need to be reconstructed
+         embeddings: [] # Would need to be reconstructed
+       )
+       topic.label = hash[:label]
+       topic.description = hash[:description]
+       topic.terms = hash[:terms]
+       topic.coherence = hash[:coherence] || 0.0
+       topic.distinctiveness = hash[:distinctiveness] || 0.0
+       topic
+     end
+
+     private
+
+     def compute_coherence
+       # Use the Metrics module for proper coherence calculation
+       return 0.0 if @terms.empty? || @documents.empty?
+
+       Metrics.compute_coherence(@terms, @documents, top_n: 10)
+     end
+
+     def compute_centroid
+       return [] if @embeddings.empty?
+
+       # Compute mean of all embeddings
+       dim = @embeddings.first.length
+       centroid = Array.new(dim, 0.0)
+
+       @embeddings.each do |embedding|
+         embedding.each_with_index do |val, idx|
+           centroid[idx] += val
+         end
+       end
+
+       centroid.map { |val| val / @embeddings.length }
+     end
+
+     def distance_to_centroid(embedding)
+       # Euclidean distance
+       Math.sqrt(
+         embedding.zip(centroid).map { |a, b| (a - b) ** 2 }.sum
+       )
+     end
+   end
+ end
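
A small sketch of the `Topic` API above, with hand-written 2-D vectors standing in for real embeddings (values chosen only for illustration):

```ruby
require 'topical'

topic = Topical::Topic.new(
  id: 0,
  document_indices: [0, 1, 2],
  documents: ["ruby gems", "ruby modules", "python packaging"],
  embeddings: [[0.0, 1.0], [0.1, 0.9], [0.9, 0.1]]
)

topic.centroid                  # => [0.333..., 0.666...] (mean of the vectors)
topic.representative_docs(k: 2) # the two documents nearest that centroid

# to_h / from_h round-trips labels and indices but not documents or embeddings:
restored = Topical::Topic.from_h(topic.to_h)
restored.size # => 0, until documents are reattached
```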
data/lib/topical/version.rb ADDED
@@ -0,0 +1,5 @@
+ # frozen_string_literal: true
+
+ module Topical
+   VERSION = "0.0.1.pre.1"
+ end
data/lib/topical.rb ADDED
@@ -0,0 +1,55 @@
+ # frozen_string_literal: true
+
+ require_relative "topical/version"
+
+ # Main module for topic modeling
+ module Topical
+   class Error < StandardError; end
+
+   # Autoload components for better performance
+   autoload :Engine, "topical/engine"
+   autoload :Topic, "topical/topic"
+   autoload :Metrics, "topical/metrics"
+
+   module Clustering
+     autoload :Adapter, "topical/clustering/adapter"
+     autoload :HDBSCANAdapter, "topical/clustering/hdbscan_adapter"
+     autoload :KMeansAdapter, "topical/clustering/kmeans_adapter"
+   end
+
+   module Dimensionality
+     autoload :Reducer, "topical/dimensionality/reducer"
+   end
+
+   module Extractors
+     autoload :TermExtractor, "topical/extractors/term_extractor"
+     autoload :Stopwords, "topical/extractors/stopwords"
+   end
+
+   module Labelers
+     autoload :Base, "topical/labelers/base"
+     autoload :TermBased, "topical/labelers/term_based"
+     autoload :LLMBased, "topical/labelers/llm_based"
+     autoload :Hybrid, "topical/labelers/hybrid"
+   end
+
+   # Convenience method for simple topic extraction
+   # @param embeddings [Array<Array<Float>>] Document embeddings
+   # @param documents [Array<String>] Document texts
+   # @param options [Hash] Additional options
+   # @return [Array<Topic>] Extracted topics
+   def self.extract(embeddings:, documents:, **options)
+     engine = Engine.new(**options)
+     engine.fit(embeddings: embeddings, documents: documents)
+   end
+
+   # Check if red-candle is available for enhanced features
+   def self.llm_available?
+     @llm_available ||= begin
+       require 'red-candle'
+       true
+     rescue LoadError
+       false
+     end
+   end
+ end
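
Tying the pieces together, a usage sketch for the `Topical.extract` convenience method. Embeddings would normally come from an embedding model (e.g. via red-candle); the hand-written vectors here only keep the sketch self-contained, and whether the default clustering settings form clusters from input this tiny is not something the sketch asserts:

```ruby
require 'topical'

# Hand-written 2-D vectors stand in for real document embeddings.
embeddings = [
  [0.9, 0.1], [0.8, 0.2], # packaging-flavored documents
  [0.1, 0.9], [0.2, 0.8]  # ML-flavored documents
]
documents = [
  "Ruby gems and bundler",
  "Packaging Ruby libraries",
  "Neural network training",
  "Deep learning optimizers"
]

topics = Topical.extract(embeddings: embeddings, documents: documents)
topics.each { |topic| puts "#{topic.label}: #{topic.size} docs" }
```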
data/sig/topical.rbs ADDED
@@ -0,0 +1,4 @@
+ module Topical
+   VERSION: String
+   # See the writing guide of rbs: https://github.com/ruby/rbs#guides
+ end
metadata ADDED
@@ -0,0 +1,142 @@
+ --- !ruby/object:Gem::Specification
+ name: topical
+ version: !ruby/object:Gem::Version
+   version: 0.0.1.pre.1
+ platform: ruby
+ authors:
+ - Chris Petersen
+ autorequire:
+ bindir: exe
+ cert_chain: []
+ date: 2025-08-30 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: clusterkit
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0.1'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0.1'
+ - !ruby/object:Gem::Dependency
+   name: red-candle
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '1.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '1.0'
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.0'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '13.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '13.0'
+ - !ruby/object:Gem::Dependency
+   name: standard
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.3'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.3'
+ description: Extract topics from document embeddings using HDBSCAN clustering and
+   c-TF-IDF term extraction. Provides automatic topic labeling, quality metrics, and
+   support for various clustering algorithms.
+ email:
+ - chris@petersen.io
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - ".rspec"
+ - ".standard.yml"
+ - CODE_OF_CONDUCT.md
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - examples/quick_demo.rb
+ - examples/verify_migration.rb
+ - lib/topical.rb
+ - lib/topical/clustering/adapter.rb
+ - lib/topical/clustering/hdbscan_adapter.rb
+ - lib/topical/clustering/kmeans_adapter.rb
+ - lib/topical/engine.rb
+ - lib/topical/extractors/term_extractor.rb
+ - lib/topical/labelers/base.rb
+ - lib/topical/labelers/hybrid.rb
+ - lib/topical/labelers/llm_adapter.rb
+ - lib/topical/labelers/llm_based.rb
+ - lib/topical/labelers/term_based.rb
+ - lib/topical/metrics.rb
+ - lib/topical/topic.rb
+ - lib/topical/version.rb
+ - sig/topical.rbs
+ homepage: https://github.com/cpetersen/topical
+ licenses:
+ - MIT
+ metadata:
+   homepage_uri: https://github.com/cpetersen/topical
+   source_code_uri: https://github.com/cpetersen/topical
+   changelog_uri: https://github.com/cpetersen/topical/blob/main/CHANGELOG.md
+   documentation_uri: https://rubydoc.info/gems/topical
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: 3.1.0
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.5.3
+ signing_key:
+ specification_version: 4
+ summary: Topic modeling for Ruby using modern clustering algorithms
+ test_files: []
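
Given the prerelease version and the gemspec above, pulling the gem into a project would look like the following sketch; the `clusterkit` runtime dependency comes along automatically, while red-candle is only a development dependency of the gem itself and must be added explicitly to enable LLM-based labeling:

```ruby
# Gemfile
gem "topical", "0.0.1.pre.1"

# Optional: enables LLM-based labeling at runtime.
gem "red-candle", ">= 1.0"
```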