topical 0.0.1.pre.1 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,128 @@
1
+ #!/usr/bin/env ruby
2
+ # Advanced example: Using Topical for clustering + red-candle for topic summaries
3
+
4
+ require 'bundler/setup'
5
+ require 'topical'
6
+ require 'red-candle'
7
+
8
# Banner for the example run, followed by one blank line.
puts "=== Advanced Topic Summaries Example ===",
     "Combining Topical clustering with red-candle LLM summarization",
     ""

# Sample corpus with clear topic clusters, five documents per theme. The hash
# keys are labels for the reader only; the pipeline works on the flattened
# list of texts, in the order written here.
documents = {
  finance_economics: [
    "The Federal Reserve raised interest rates to combat inflation pressures",
    "Stock markets rallied on positive earnings reports from tech companies",
    "Cryptocurrency markets experienced significant volatility this quarter",
    "Central banks coordinate policy to address economic uncertainty",
    "Corporate bond yields rise as investors seek safer assets"
  ],
  technology_ai: [
    "New AI breakthrough in natural language processing announced by researchers",
    "Machine learning transforms healthcare diagnostics and treatment planning",
    "Cloud computing adoption accelerates across enterprise sectors",
    "Cybersecurity threats evolve with sophisticated ransomware attacks",
    "Quantum computing reaches new milestone in error correction"
  ],
  healthcare_medical: [
    "Clinical trials show promising results for new cancer immunotherapy",
    "Telemedicine adoption continues to reshape patient care delivery",
    "Gene editing techniques advance treatment for rare diseases",
    "Mental health awareness campaigns gain momentum globally",
    "Personalized medicine approaches show improved patient outcomes"
  ],
  climate_environment: [
    "Renewable energy investments surpass fossil fuel spending globally",
    "Climate scientists warn of accelerating Arctic ice melt",
    "Carbon capture technology receives significant government funding",
    "Electric vehicle adoption reaches record levels worldwide",
    "Sustainable agriculture practices reduce environmental impact"
  ]
}.values.flatten

# Step 1: one embedding per document, produced by red-candle. The first
# element of each model result is converted to a plain Ruby array.
puts "1. Generating embeddings with red-candle..."
embedding_model = Candle::EmbeddingModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
embeddings = documents.map do |text|
  embedding_model.embedding(text).first.to_a
end

# Step 2: cluster the embeddings with Topical; labels come from terms only,
# the LLM is not involved at this stage.
puts "2. Extracting topics with Topical..."
engine_options = {
  clustering_method: :hdbscan,
  min_cluster_size: 4,
  labeling_method: :term_based,
  verbose: true
}
engine = Topical::Engine.new(**engine_options)

topics = engine.fit(embeddings: embeddings, documents: documents)

# Step 3: load the LLM that will write the per-topic summaries.
puts "\n3. Generating topic summaries with LLM..."

llm_repo = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
llm = Candle::LLM.from_pretrained(llm_repo, gguf_file: "tinyllama-1.1b-chat-v1.0.q4_k_m.gguf")
67
+
68
# Produce a one-line summary of a topic with the help of an LLM.
#
# The prompt lists the topic's first five terms and up to three representative
# documents. Only the first line of the model's reply is kept, and a leading
# "Summary:", "Topic:" or "Documents:" echo is removed from it. When
# generation raises, or nothing is left after clean-up, a summary built from
# the topic's terms is returned instead.
#
# @param topic [#terms, #representative_docs] topic to describe
# @param llm [#generate] object whose +generate(prompt)+ returns a String
# @return [String] the summary; never empty
def summarize_topic(topic, llm)
  # Get representative documents for context
  sample_docs = topic.representative_docs(k: 3)
  numbered_docs = sample_docs.each_with_index.map { |doc, i| "#{i + 1}. #{doc}" }

  # Single fallback used by both the "nothing usable" and the error path.
  # A topic without terms gets the generic wording rather than a dangling
  # "Documents about ".
  lead_terms = topic.terms.first(2)
  fallback = lead_terms.empty? ? "Related documents" : "Documents about #{lead_terms.join(' and ')}"

  # Simple, clear prompt for summarization
  prompt = <<~PROMPT
    Summarize what connects these documents in 1-2 sentences:

    Key terms: #{topic.terms.first(5).join(', ')}

    Documents:
    #{numbered_docs.join("\n")}

    Summary:
  PROMPT

  begin
    # Keep only the first line; an empty reply yields the generic wording.
    first_line = llm.generate(prompt).strip.lines.first&.strip || "Related documents"
    # Drop a label the model may have echoed at the very start of the line.
    cleaned = first_line.sub(/\A(Summary:|Topic:|Documents:)/i, '').strip
    cleaned.empty? ? fallback : cleaned
  rescue StandardError
    # Best effort: the example keeps running even if the model fails.
    fallback
  end
end
94
+
95
# Return +text+ limited to its first +limit+ characters. The "..." marker is
# appended only when characters were actually cut off, so short documents are
# printed unchanged.
def excerpt(text, limit)
  text.length > limit ? "#{text[0, limit]}..." : text
end

# Step 4: Display results with summaries
puts "\n=== Topics with LLM Summaries ==="

topics.each_with_index do |topic, i|
  puts "\n#{i + 1}. Topic: #{topic.label}"

  # Generate summary using LLM
  summary = summarize_topic(topic, llm)
  puts " Summary: #{summary}"

  puts " Size: #{topic.size} documents"
  puts " Key terms: #{topic.terms.first(8).join(', ')}"
  puts " Coherence: #{topic.coherence.round(3)}"
  puts " Sample documents:"
  topic.representative_docs(k: 2).each do |doc|
    # Same cut point as before (81 characters), ellipsis only if truncated.
    puts " • #{excerpt(doc, 81)}"
  end
end

# Step 5: Show outliers
outliers = engine.outliers
if outliers.any?
  puts "\nOutliers (#{outliers.length} documents):"
  # Same cut point as before (61 characters), ellipsis only if truncated.
  outliers.each { |doc| puts " • #{excerpt(doc, 61)}" }
end

puts "\n=== Key Benefits of This Approach ==="
puts "• Topical handles clustering expertly (fast, reliable)"
puts "• Your application controls LLM integration completely"
puts "• Domain-specific prompts for better summaries"
puts "• Easy to swap LLM providers or models"
puts "• Clean separation of concerns"

puts "\nDone! 🎯"
@@ -105,4 +105,4 @@ outliers = engine.outliers
105
105
  puts " Outliers: #{outliers.length}"
106
106
  puts
107
107
 
108
- puts "=== All tests passed! Migration successful. ==="
108
+ puts "=== All tests passed! Migration successful. ==="
@@ -27,4 +27,4 @@ module Topical
27
27
  end
28
28
  end
29
29
  end
30
- end
30
+ end
@@ -51,4 +51,4 @@ module Topical
51
51
  end
52
52
  end
53
53
  end
54
- end
54
+ end
@@ -41,4 +41,4 @@ module Topical
41
41
  attr_reader :clusterer
42
42
  end
43
43
  end
44
- end
44
+ end
@@ -0,0 +1,96 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'logger'
4
+
5
module Topical
  # Handles dimensionality reduction for embeddings using UMAP.
  #
  # ClusterKit is loaded lazily on the first reduction. When it is missing, or
  # when the reduction fails for any other reason, the original embeddings are
  # returned unchanged and a warning is written to the logger.
  class DimensionalityReducer
    # @param n_components [Integer] requested number of output dimensions
    # @param logger [Logger, nil] destination for progress and warning
    #   messages; a silent logger is used when none is given
    def initialize(n_components: 50, logger: nil)
      @n_components = n_components
      @logger = logger || Logger.new(IO::NULL, level: Logger::FATAL)
    end

    # Reduce dimensionality of embeddings if needed
    #
    # Nothing is done when the input is empty or when the first embedding
    # already has at most +n_components+ dimensions.
    #
    # @param embeddings [Array<Array<Float>>] Input embeddings
    # @return [Array<Array<Float>>] Reduced embeddings, one row per input row;
    #   the input itself when no reduction was needed or possible
    def reduce(embeddings)
      return embeddings if embeddings.empty?
      return embeddings if embeddings.first.length <= @n_components

      begin
        require 'clusterkit'
        reduce_with_umap(embeddings)
      rescue LoadError
        @logger.warn "Warning: Dimensionality reduction requires ClusterKit. Using original embeddings."
        embeddings
      rescue StandardError => e
        @logger.warn "Warning: Dimensionality reduction failed: #{e.message}"
        embeddings
      end
    end

    private

    # Run UMAP over the usable embeddings and return one row per input row.
    # Raises when no embedding is usable; #reduce turns that into a fallback.
    def reduce_with_umap(embeddings)
      valid_embeddings, invalid_indices = validate_embeddings_for_umap(embeddings)

      if valid_embeddings.empty?
        raise "No valid embeddings for dimensionality reduction. " \
              "All embeddings contain invalid values (NaN, Infinity, or non-numeric)."
      end

      unless invalid_indices.empty?
        @logger.warn " Warning: #{invalid_indices.size} embeddings with invalid values removed"
      end

      n_components, n_neighbors = umap_parameters(valid_embeddings.size)

      umap = ClusterKit::Dimensionality::UMAP.new(
        n_components: n_components,
        n_neighbors: n_neighbors,
        random_seed: 42
      )
      reduced = umap.fit_transform(valid_embeddings)
      return reduced if invalid_indices.empty?

      restore_invalid_rows(embeddings.size, reduced, invalid_indices, n_components)
    end

    # Adjust UMAP parameters to the number of usable samples: both values are
    # kept below the sample count, and n_components is additionally capped
    # at 50.
    def umap_parameters(n_samples)
      n_components = [@n_components, n_samples - 1, 50].min
      n_neighbors = [15, n_samples - 1].min

      if n_components != @n_components
        @logger.info " Adjusted n_components to #{n_components} (was #{@n_components}) for #{n_samples} samples"
      end

      [n_components, n_neighbors]
    end

    # Rebuild an array with +total+ rows: rows that were excluded from UMAP
    # become zero vectors (they'll be outliers anyway), all others take the
    # next reduced row in order.
    def restore_invalid_rows(total, reduced, invalid_indices, width)
      # Hash lookup keeps the rebuild linear instead of scanning the index
      # list once per row.
      excluded = invalid_indices.to_h { |idx| [idx, true] }
      cursor = -1

      Array.new(total) do |row|
        if excluded.key?(row)
          Array.new(width, 0.0)
        else
          reduced[cursor += 1]
        end
      end
    end

    # Split embeddings into usable rows and the positions of unusable ones.
    # A row is usable when it is an Array whose values are all finite numbers.
    #
    # @return [Array(Array, Array<Integer>)] usable rows and invalid indices
    def validate_embeddings_for_umap(embeddings)
      valid = []
      invalid_indices = []

      embeddings.each_with_index do |embedding, idx|
        usable = embedding.is_a?(Array) &&
                 embedding.all? { |value| value.is_a?(Numeric) && value.finite? }

        if usable
          valid << embedding
        else
          invalid_indices << idx
        end
      end

      [valid, invalid_indices]
    end
  end
end
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'logger'
4
+
3
5
  module Topical
4
6
  # Main engine for topic modeling
5
7
  class Engine
@@ -11,9 +13,9 @@ module Topical
11
13
  min_samples: 3,
12
14
  reduce_dimensions: true,
13
15
  n_components: 50,
14
- labeling_method: :hybrid,
15
- llm_provider: nil,
16
+ labeling_method: :term_based,
16
17
  verbose: false,
18
+ logger: nil,
17
19
  k: nil, # Add k as explicit parameter
18
20
  **options
19
21
  )
@@ -23,14 +25,18 @@ module Topical
23
25
  @reduce_dimensions = reduce_dimensions
24
26
  @n_components = n_components
25
27
  @labeling_method = labeling_method
26
- @llm_provider = llm_provider
27
28
  @verbose = verbose
29
+ @logger = setup_logger(logger, verbose)
28
30
  @options = options
29
31
  @options[:k] = k if k # Store k in options if provided
30
32
 
31
33
  @clustering_adapter = build_clustering_adapter
32
34
  @term_extractor = Extractors::TermExtractor.new
33
35
  @labeler = build_labeler
36
+ @dimensionality_reducer = DimensionalityReducer.new(
37
+ n_components: @n_components,
38
+ logger: @logger
39
+ )
34
40
  @topics = []
35
41
  end
36
42
 
@@ -46,34 +52,34 @@ module Topical
46
52
  @documents = documents
47
53
  @metadata = metadata || Array.new(documents.length) { {} }
48
54
 
49
- puts "Starting topic extraction..." if @verbose
55
+ @logger.info "Starting topic extraction..."
50
56
 
51
57
  # Step 1: Optionally reduce dimensions
52
58
  working_embeddings = @embeddings
53
- if @reduce_dimensions && @embeddings.first.length > @n_components
54
- puts " Reducing dimensions from #{@embeddings.first.length} to #{@n_components}..." if @verbose
55
- working_embeddings = reduce_dimensions(@embeddings)
59
+ if @reduce_dimensions && !@embeddings.empty? && @embeddings.first.length > @n_components
60
+ @logger.info " Reducing dimensions from #{@embeddings.first.length} to #{@n_components}..."
61
+ working_embeddings = @dimensionality_reducer.reduce(@embeddings)
56
62
  end
57
63
 
58
64
  # Step 2: Cluster embeddings
59
- puts " Clustering #{working_embeddings.length} documents..." if @verbose
65
+ @logger.info " Clustering #{working_embeddings.length} documents..."
60
66
  @cluster_ids = @clustering_adapter.fit_predict(working_embeddings)
61
67
 
62
68
  # Step 3: Build topics from clusters
63
- puts " Building topics from clusters..." if @verbose
69
+ @logger.info " Building topics from clusters..."
64
70
  @topics = build_topics(@cluster_ids)
65
71
 
66
72
  # Step 4: Extract terms for each topic
67
- puts " Extracting distinctive terms..." if @verbose
73
+ @logger.info " Extracting distinctive terms..."
68
74
  extract_topic_terms
69
75
 
70
76
  # Step 5: Generate labels
71
- puts " Generating topic labels..." if @verbose
77
+ @logger.info " Generating topic labels..."
72
78
  generate_topic_labels
73
79
 
74
80
  if @verbose
75
81
  n_noise = @cluster_ids.count(-1)
76
- puts "Found #{@topics.length} topics (plus #{n_noise} outliers)"
82
+ @logger.info "Found #{@topics.length} topics (plus #{n_noise} outliers)"
77
83
  end
78
84
 
79
85
  @topics
@@ -105,124 +111,27 @@ module Topical
105
111
 
106
112
  # Save the model
107
113
  def save(path)
108
- require 'json'
109
- config = {
110
- clustering_method: @clustering_method,
111
- min_cluster_size: @min_cluster_size,
112
- min_samples: @min_samples,
113
- reduce_dimensions: @reduce_dimensions,
114
- n_components: @n_components,
115
- labeling_method: @labeling_method
116
- }
117
-
118
- # Include k for kmeans
119
- if @clustering_method == :kmeans
120
- config[:k] = @options[:k] || @topics.length
121
- end
122
-
123
- data = {
124
- topics: @topics.map(&:to_h),
125
- config: config
126
- }
127
- File.write(path, JSON.pretty_generate(data))
114
+ ModelSerializer.save(self, path)
128
115
  end
129
116
 
130
117
  # Load a model
131
118
  def self.load(path)
132
- require 'json'
133
- data = JSON.parse(File.read(path), symbolize_names: true)
134
-
135
- # Make sure k is passed for kmeans and convert string keys to symbols
136
- config = data[:config]
137
- config[:clustering_method] = config[:clustering_method].to_sym if config[:clustering_method]
138
- config[:labeling_method] = config[:labeling_method].to_sym if config[:labeling_method]
139
-
140
- if config[:clustering_method] == :kmeans && !config[:k]
141
- # Extract k from saved topics or use default
142
- config[:k] = data[:topics]&.length || 5
143
- end
144
-
145
- engine = new(**config)
146
- # Reconstruct topics
147
- engine.instance_variable_set(:@topics, data[:topics].map { |t| Topic.from_h(t) })
148
- engine
119
+ ModelSerializer.load(path)
149
120
  end
150
121
 
151
122
  private
152
123
 
153
- def reduce_dimensions(embeddings)
154
- begin
155
- require 'clusterkit'
156
-
157
- # Validate embeddings before UMAP
158
- valid_embeddings, invalid_indices = validate_embeddings_for_umap(embeddings)
159
-
160
- if valid_embeddings.empty?
161
- raise "No valid embeddings for dimensionality reduction. " \
162
- "All embeddings contain invalid values (NaN, Infinity, or non-numeric)."
163
- end
164
-
165
- if invalid_indices.any? && @verbose
166
- puts " Warning: #{invalid_indices.size} embeddings with invalid values removed"
167
- end
168
-
169
- # Adjust parameters based on data size
170
- n_samples = valid_embeddings.size
171
- n_components = [@n_components, n_samples - 1, 50].min
172
- n_neighbors = [15, n_samples - 1].min
173
-
174
- if @verbose && n_components != @n_components
175
- puts " Adjusted n_components to #{n_components} (was #{@n_components}) for #{n_samples} samples"
176
- end
177
-
178
- umap = ClusterKit::Dimensionality::UMAP.new(
179
- n_components: n_components,
180
- n_neighbors: n_neighbors,
181
- random_seed: 42
182
- )
183
-
184
- reduced = umap.fit_transform(valid_embeddings)
185
-
186
- # If we had to remove invalid embeddings, reconstruct the full array
187
- if invalid_indices.any?
188
- full_reduced = []
189
- valid_idx = 0
190
- embeddings.size.times do |i|
191
- if invalid_indices.include?(i)
192
- # Use zeros for invalid embeddings (they'll be outliers anyway)
193
- full_reduced << Array.new(n_components, 0.0)
194
- else
195
- full_reduced << reduced[valid_idx]
196
- valid_idx += 1
197
- end
198
- end
199
- full_reduced
200
- else
201
- reduced
202
- end
203
- rescue LoadError
204
- puts "Warning: Dimensionality reduction requires ClusterKit. Using original embeddings." if @verbose
205
- embeddings
206
- rescue => e
207
- puts "Warning: Dimensionality reduction failed: #{e.message}" if @verbose
208
- embeddings
209
- end
210
- end
211
-
212
- def validate_embeddings_for_umap(embeddings)
213
- valid = []
214
- invalid_indices = []
124
+ def setup_logger(logger, verbose)
125
+ return logger if logger
215
126
 
216
- embeddings.each_with_index do |embedding, idx|
217
- if embedding.is_a?(Array) &&
218
- embedding.all? { |v| v.is_a?(Numeric) && v.finite? }
219
- valid << embedding
220
- else
221
- invalid_indices << idx
222
- end
127
+ # Create default logger for backward compatibility
128
+ if verbose
129
+ require 'logger'
130
+ Logger.new($stdout, level: Logger::INFO)
131
+ else
132
+ # Null logger - doesn't output anything
133
+ Logger.new(IO::NULL, level: Logger::FATAL)
223
134
  end
224
-
225
- [valid, invalid_indices]
226
135
  end
227
136
 
228
137
  def build_topics(cluster_ids)
@@ -283,12 +192,8 @@ module Topical
283
192
  case @labeling_method
284
193
  when :term_based
285
194
  Labelers::TermBased.new
286
- when :llm_based
287
- Labelers::LLMBased.new(provider: @llm_provider)
288
- when :hybrid
289
- Labelers::Hybrid.new(provider: @llm_provider)
290
195
  else
291
- Labelers::TermBased.new # Default fallback
196
+ Labelers::TermBased.new # Only term-based labeling supported
292
197
  end
293
198
  end
294
199
 
@@ -307,4 +212,4 @@ module Topical
307
212
  end
308
213
  end
309
214
  end
310
- end
215
+ end
@@ -95,4 +95,4 @@ module Topical
95
95
  end
96
96
  end
97
97
  end
98
- end
98
+ end
@@ -20,4 +20,4 @@ module Topical
20
20
  end
21
21
  end
22
22
  end
23
- end
23
+ end
@@ -19,4 +19,4 @@ module Topical
19
19
  end
20
20
  end
21
21
  end
22
- end
22
+ end
@@ -185,4 +185,4 @@ module Topical
185
185
  )
186
186
  end
187
187
  end
188
- end
188
+ end
@@ -0,0 +1,59 @@
1
+ # frozen_string_literal: true
2
+
3
module Topical
  # Handles saving and loading of topic models
  class ModelSerializer
    # Engine settings written to the "config" section of a saved model. Each
    # one is read from the engine instance variable of the same name.
    CONFIG_KEYS = %i[
      clustering_method
      min_cluster_size
      min_samples
      reduce_dimensions
      n_components
      labeling_method
    ].freeze
    private_constant :CONFIG_KEYS

    # Save a topic model to JSON file
    #
    # The file holds the engine's topics (via +to_h+) and its configuration.
    # For k-means the cluster count is stored as well, taken from the engine
    # options or, failing that, from the number of topics.
    #
    # @param engine [Engine] The engine instance to save
    # @param path [String] File path to save to
    def self.save(engine, path)
      require 'json'

      config = CONFIG_KEYS.to_h do |key|
        [key, engine.instance_variable_get("@#{key}")]
      end

      # Include k for kmeans
      if config[:clustering_method] == :kmeans
        options = engine.instance_variable_get(:@options) || {}
        config[:k] = options[:k] || engine.topics.length
      end

      File.write(path, JSON.pretty_generate(topics: engine.topics.map(&:to_h), config: config))
    end

    # Load a topic model from JSON file
    #
    # A file without a "topics" or "config" section loads as an engine with
    # no topics or with default settings, respectively.
    #
    # @param path [String] File path to load from
    # @return [Engine] Loaded engine instance
    def self.load(path)
      require 'json'

      data = JSON.parse(File.read(path), symbolize_names: true)
      config = data[:config] || {}
      saved_topics = data[:topics]

      # JSON stores these two settings as strings; the engine expects symbols.
      %i[clustering_method labeling_method].each do |key|
        config[key] = config[key].to_sym if config[key]
      end

      # Make sure k is passed for kmeans: use the number of saved topics, or 5
      # when the file has no topics section at all.
      if config[:clustering_method] == :kmeans && !config[:k]
        config[:k] = saved_topics&.length || 5
      end

      engine = Engine.new(**config)
      # Reconstruct topics
      engine.instance_variable_set(:@topics, (saved_topics || []).map { |t| Topic.from_h(t) })
      engine
    end
  end
end
data/lib/topical/topic.rb CHANGED
@@ -111,4 +111,4 @@ module Topical
111
111
  )
112
112
  end
113
113
  end
114
- end
114
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Topical
4
- VERSION = "0.0.1.pre.1"
4
+ VERSION = "0.1.1"
5
5
  end
data/lib/topical.rb CHANGED
@@ -10,6 +10,8 @@ module Topical
10
10
  autoload :Engine, "topical/engine"
11
11
  autoload :Topic, "topical/topic"
12
12
  autoload :Metrics, "topical/metrics"
13
+ autoload :DimensionalityReducer, "topical/dimensionality_reducer"
14
+ autoload :ModelSerializer, "topical/model_serializer"
13
15
 
14
16
  module Clustering
15
17
  autoload :Adapter, "topical/clustering/adapter"
@@ -17,20 +19,13 @@ module Topical
17
19
  autoload :KMeansAdapter, "topical/clustering/kmeans_adapter"
18
20
  end
19
21
 
20
- module Dimensionality
21
- autoload :Reducer, "topical/dimensionality/reducer"
22
- end
23
-
24
22
  module Extractors
25
23
  autoload :TermExtractor, "topical/extractors/term_extractor"
26
- autoload :Stopwords, "topical/extractors/stopwords"
27
24
  end
28
25
 
29
26
  module Labelers
30
27
  autoload :Base, "topical/labelers/base"
31
28
  autoload :TermBased, "topical/labelers/term_based"
32
- autoload :LLMBased, "topical/labelers/llm_based"
33
- autoload :Hybrid, "topical/labelers/hybrid"
34
29
  end
35
30
 
36
31
  # Convenience method for simple topic extraction
@@ -43,13 +38,13 @@ module Topical
43
38
  engine.fit(embeddings: embeddings, documents: documents)
44
39
  end
45
40
 
46
- # Check if red-candle is available for enhanced features
47
- def self.llm_available?
48
- @llm_available ||= begin
41
+ # Check if red-candle is available for embedding generation in examples
42
# Report whether the red-candle gem can be loaded.
#
# The answer is computed once and cached. The cache is checked with
# +defined?+ rather than +||=+ because a +false+ result must be remembered
# too; otherwise the failing +require+ would be retried on every call.
#
# @return [Boolean] true when +require 'red-candle'+ succeeds
def self.embedding_model_available?
  return @embedding_model_available if defined?(@embedding_model_available)

  @embedding_model_available = begin
    require 'red-candle'
    true
  rescue LoadError
    false
  end
end
55
- end
50
+ end