topical 0.0.1.pre.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +158 -106
- data/docs/assets/topical-wide.png +0 -0
- data/examples/detect_new_topics.rb +190 -0
- data/examples/quick_demo.rb +1 -1
- data/examples/topic_summaries_with_llm.rb +128 -0
- data/examples/verify_migration.rb +1 -1
- data/lib/topical/clustering/adapter.rb +1 -1
- data/lib/topical/clustering/hdbscan_adapter.rb +1 -1
- data/lib/topical/clustering/kmeans_adapter.rb +1 -1
- data/lib/topical/dimensionality_reducer.rb +96 -0
- data/lib/topical/engine.rb +31 -126
- data/lib/topical/extractors/term_extractor.rb +1 -1
- data/lib/topical/labelers/base.rb +1 -1
- data/lib/topical/labelers/term_based.rb +1 -1
- data/lib/topical/metrics.rb +1 -1
- data/lib/topical/model_serializer.rb +59 -0
- data/lib/topical/topic.rb +1 -1
- data/lib/topical/version.rb +1 -1
- data/lib/topical.rb +6 -11
- metadata +27 -11
- data/lib/topical/labelers/hybrid.rb +0 -24
- data/lib/topical/labelers/llm_adapter.rb +0 -126
- data/lib/topical/labelers/llm_based.rb +0 -111
data/examples/topic_summaries_with_llm.rb
ADDED
@@ -0,0 +1,128 @@
+#!/usr/bin/env ruby
+# Advanced example: Using Topical for clustering + red-candle for topic summaries
+
+require 'bundler/setup'
+require 'topical'
+require 'red-candle'
+
+puts "=== Advanced Topic Summaries Example ==="
+puts "Combining Topical clustering with red-candle LLM summarization"
+puts
+
+# Sample documents with clear topic clusters
+documents = [
+  # Finance/Economics
+  "The Federal Reserve raised interest rates to combat inflation pressures",
+  "Stock markets rallied on positive earnings reports from tech companies",
+  "Cryptocurrency markets experienced significant volatility this quarter",
+  "Central banks coordinate policy to address economic uncertainty",
+  "Corporate bond yields rise as investors seek safer assets",
+
+  # Technology/AI
+  "New AI breakthrough in natural language processing announced by researchers",
+  "Machine learning transforms healthcare diagnostics and treatment planning",
+  "Cloud computing adoption accelerates across enterprise sectors",
+  "Cybersecurity threats evolve with sophisticated ransomware attacks",
+  "Quantum computing reaches new milestone in error correction",
+
+  # Healthcare/Medical
+  "Clinical trials show promising results for new cancer immunotherapy",
+  "Telemedicine adoption continues to reshape patient care delivery",
+  "Gene editing techniques advance treatment for rare diseases",
+  "Mental health awareness campaigns gain momentum globally",
+  "Personalized medicine approaches show improved patient outcomes",
+
+  # Climate/Environment
+  "Renewable energy investments surpass fossil fuel spending globally",
+  "Climate scientists warn of accelerating Arctic ice melt",
+  "Carbon capture technology receives significant government funding",
+  "Electric vehicle adoption reaches record levels worldwide",
+  "Sustainable agriculture practices reduce environmental impact"
+]
+
+# Step 1: Generate embeddings using red-candle
+puts "1. Generating embeddings with red-candle..."
+embedder = Candle::EmbeddingModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+embeddings = documents.map { |doc| embedder.embedding(doc).first.to_a }
+
+# Step 2: Extract topics using Topical (term-based labeling only)
+puts "2. Extracting topics with Topical..."
+engine = Topical::Engine.new(
+  clustering_method: :hdbscan,
+  min_cluster_size: 4,
+  labeling_method: :term_based,
+  verbose: true
+)
+
+topics = engine.fit(embeddings: embeddings, documents: documents)
+
+# Step 3: Generate summaries using red-candle LLM
+puts "\n3. Generating topic summaries with LLM..."
+
+# Initialize LLM for summarization
+llm = Candle::LLM.from_pretrained(
+  "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
+  gguf_file: "tinyllama-1.1b-chat-v1.0.q4_k_m.gguf"
+)
+
+def summarize_topic(topic, llm)
+  # Get representative documents for context
+  sample_docs = topic.representative_docs(k: 3)
+
+  # Simple, clear prompt for summarization
+  prompt = <<~PROMPT
+    Summarize what connects these documents in 1-2 sentences:
+
+    Key terms: #{topic.terms.first(5).join(', ')}
+
+    Documents:
+    #{sample_docs.map.with_index { |doc, i| "#{i+1}. #{doc}" }.join("\n")}
+
+    Summary:
+  PROMPT
+
+  begin
+    summary = llm.generate(prompt).strip
+    # Clean up common artifacts
+    summary = summary.lines.first&.strip || "Related documents"
+    summary = summary.gsub(/^(Summary:|Topic:|Documents:)/i, '').strip
+    summary.empty? ? "Documents about #{topic.terms.first(2).join(' and ')}" : summary
+  rescue => e
+    "Documents about #{topic.terms.first(2).join(' and ')}"
+  end
+end
+
+# Step 4: Display results with summaries
+puts "\n=== Topics with LLM Summaries ==="
+
+topics.each_with_index do |topic, i|
+  puts "\n#{i + 1}. Topic: #{topic.label}"
+
+  # Generate summary using LLM
+  summary = summarize_topic(topic, llm)
+  puts "   Summary: #{summary}"
+
+  puts "   Size: #{topic.size} documents"
+  puts "   Key terms: #{topic.terms.first(8).join(', ')}"
+  puts "   Coherence: #{topic.coherence.round(3)}"
+  puts "   Sample documents:"
+  topic.representative_docs(k: 2).each do |doc|
+    puts "     • #{doc[0..80]}..."
+  end
+end
+
+# Step 5: Show outliers
+outliers = engine.outliers
+if outliers.any?
+  puts "\nOutliers (#{outliers.length} documents):"
+  outliers.each { |doc| puts "  • #{doc[0..60]}..." }
+end
+
+puts "\n=== Key Benefits of This Approach ==="
+puts "• Topical handles clustering expertly (fast, reliable)"
+puts "• Your application controls LLM integration completely"
+puts "• Domain-specific prompts for better summaries"
+puts "• Easy to swap LLM providers or models"
+puts "• Clean separation of concerns"
+
+puts "\nDone! 🎯"
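The script's closing note about swapping models is straightforward to act on, since summarize_topic only needs an object that responds to generate(prompt). A minimal sketch reusing the same Candle::LLM.from_pretrained API shown above; the Mistral repo and GGUF file names are illustrative placeholders, not verified artifacts:

# Hypothetical swap: any GGUF chat model loadable by red-candle works here.
# The repo and file names below are placeholders for illustration only.
alt_llm = Candle::LLM.from_pretrained(
  "TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
  gguf_file: "mistral-7b-instruct-v0.2.Q4_K_M.gguf"
)
puts summarize_topic(topics.first, alt_llm)  # same call site, different model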
data/lib/topical/dimensionality_reducer.rb
ADDED
@@ -0,0 +1,96 @@
+# frozen_string_literal: true
+
+require 'logger'
+
+module Topical
+  # Handles dimensionality reduction for embeddings using UMAP
+  class DimensionalityReducer
+    def initialize(n_components: 50, logger: nil)
+      @n_components = n_components
+      @logger = logger || Logger.new(IO::NULL, level: Logger::FATAL)
+    end
+
+    # Reduce dimensionality of embeddings if needed
+    # @param embeddings [Array<Array<Float>>] Input embeddings
+    # @return [Array<Array<Float>>] Reduced embeddings
+    def reduce(embeddings)
+      return embeddings if embeddings.empty?
+      return embeddings if embeddings.first.length <= @n_components
+
+      begin
+        require 'clusterkit'
+
+        # Validate embeddings before UMAP
+        valid_embeddings, invalid_indices = validate_embeddings_for_umap(embeddings)
+
+        if valid_embeddings.empty?
+          raise "No valid embeddings for dimensionality reduction. " \
+                "All embeddings contain invalid values (NaN, Infinity, or non-numeric)."
+        end
+
+        if invalid_indices.any?
+          @logger.warn "  Warning: #{invalid_indices.size} embeddings with invalid values removed"
+        end
+
+        # Adjust parameters based on data size
+        n_samples = valid_embeddings.size
+        n_components = [@n_components, n_samples - 1, 50].min
+        n_neighbors = [15, n_samples - 1].min
+
+        if n_components != @n_components
+          @logger.info "  Adjusted n_components to #{n_components} (was #{@n_components}) for #{n_samples} samples"
+        end
+
+        umap = ClusterKit::Dimensionality::UMAP.new(
+          n_components: n_components,
+          n_neighbors: n_neighbors,
+          random_seed: 42
+        )
+
+        reduced = umap.fit_transform(valid_embeddings)
+
+        # If we had to remove invalid embeddings, reconstruct the full array
+        if invalid_indices.any?
+          full_reduced = []
+          valid_idx = 0
+          embeddings.size.times do |i|
+            if invalid_indices.include?(i)
+              # Use zeros for invalid embeddings (they'll be outliers anyway)
+              full_reduced << Array.new(n_components, 0.0)
+            else
+              full_reduced << reduced[valid_idx]
+              valid_idx += 1
+            end
+          end
+          full_reduced
+        else
+          reduced
+        end
+      rescue LoadError
+        @logger.warn "Warning: Dimensionality reduction requires ClusterKit. Using original embeddings."
+        embeddings
+      rescue => e
+        @logger.warn "Warning: Dimensionality reduction failed: #{e.message}"
+        embeddings
+      end
+    end
+
+    private
+
+    def validate_embeddings_for_umap(embeddings)
+      valid = []
+      invalid_indices = []
+
+      embeddings.each_with_index do |embedding, idx|
+        if embedding.is_a?(Array) &&
+           embedding.all? { |v| v.is_a?(Numeric) && v.finite? }
+          valid << embedding
+        else
+          invalid_indices << idx
+        end
+      end
+
+      [valid, invalid_indices]
+    end
+  end
+end
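Since DimensionalityReducer is autoloaded as part of the public API (see the topical.rb changes below), it can also be driven standalone. A minimal sketch against the interface above, using random vectors as stand-in embeddings:

require 'logger'
require 'topical'

# 100 stand-in embeddings of dimension 384 (the size all-MiniLM-L6-v2 emits)
embeddings = Array.new(100) { Array.new(384) { rand - 0.5 } }

reducer = Topical::DimensionalityReducer.new(
  n_components: 50,
  logger: Logger.new($stdout, level: Logger::INFO)  # pass a logger to see info/warn output
)

# Falls back to the original embeddings when the clusterkit gem is missing
reduced = reducer.reduce(embeddings)
puts reduced.first.length  # 50 with clusterkit installed, 384 otherwise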
data/lib/topical/engine.rb
CHANGED
@@ -1,5 +1,7 @@
 # frozen_string_literal: true
 
+require 'logger'
+
 module Topical
   # Main engine for topic modeling
   class Engine
@@ -11,9 +13,9 @@ module Topical
       min_samples: 3,
       reduce_dimensions: true,
       n_components: 50,
-      labeling_method: :
-      llm_provider: nil,
+      labeling_method: :term_based,
       verbose: false,
+      logger: nil,
       k: nil, # Add k as explicit parameter
       **options
     )
@@ -23,14 +25,18 @@ module Topical
      @reduce_dimensions = reduce_dimensions
      @n_components = n_components
      @labeling_method = labeling_method
-      @llm_provider = llm_provider
      @verbose = verbose
+      @logger = setup_logger(logger, verbose)
      @options = options
      @options[:k] = k if k # Store k in options if provided
 
      @clustering_adapter = build_clustering_adapter
      @term_extractor = Extractors::TermExtractor.new
      @labeler = build_labeler
+      @dimensionality_reducer = DimensionalityReducer.new(
+        n_components: @n_components,
+        logger: @logger
+      )
      @topics = []
    end
 
@@ -46,34 +52,34 @@ module Topical
      @documents = documents
      @metadata = metadata || Array.new(documents.length) { {} }
 
-
+      @logger.info "Starting topic extraction..."
 
      # Step 1: Optionally reduce dimensions
      working_embeddings = @embeddings
-      if @reduce_dimensions && @embeddings.first.length > @n_components
-
-      working_embeddings =
+      if @reduce_dimensions && !@embeddings.empty? && @embeddings.first.length > @n_components
+        @logger.info "  Reducing dimensions from #{@embeddings.first.length} to #{@n_components}..."
+        working_embeddings = @dimensionality_reducer.reduce(@embeddings)
      end
 
      # Step 2: Cluster embeddings
-
+      @logger.info "  Clustering #{working_embeddings.length} documents..."
      @cluster_ids = @clustering_adapter.fit_predict(working_embeddings)
 
      # Step 3: Build topics from clusters
-
+      @logger.info "  Building topics from clusters..."
      @topics = build_topics(@cluster_ids)
 
      # Step 4: Extract terms for each topic
-
+      @logger.info "  Extracting distinctive terms..."
      extract_topic_terms
 
      # Step 5: Generate labels
-
+      @logger.info "  Generating topic labels..."
      generate_topic_labels
 
      if @verbose
        n_noise = @cluster_ids.count(-1)
-
+        @logger.info "Found #{@topics.length} topics (plus #{n_noise} outliers)"
      end
 
      @topics
@@ -105,124 +111,27 @@ module Topical
 
    # Save the model
    def save(path)
-
-      config = {
-        clustering_method: @clustering_method,
-        min_cluster_size: @min_cluster_size,
-        min_samples: @min_samples,
-        reduce_dimensions: @reduce_dimensions,
-        n_components: @n_components,
-        labeling_method: @labeling_method
-      }
-
-      # Include k for kmeans
-      if @clustering_method == :kmeans
-        config[:k] = @options[:k] || @topics.length
-      end
-
-      data = {
-        topics: @topics.map(&:to_h),
-        config: config
-      }
-      File.write(path, JSON.pretty_generate(data))
+      ModelSerializer.save(self, path)
    end
 
    # Load a model
    def self.load(path)
-
-      data = JSON.parse(File.read(path), symbolize_names: true)
-
-      # Make sure k is passed for kmeans and convert string keys to symbols
-      config = data[:config]
-      config[:clustering_method] = config[:clustering_method].to_sym if config[:clustering_method]
-      config[:labeling_method] = config[:labeling_method].to_sym if config[:labeling_method]
-
-      if config[:clustering_method] == :kmeans && !config[:k]
-        # Extract k from saved topics or use default
-        config[:k] = data[:topics]&.length || 5
-      end
-
-      engine = new(**config)
-      # Reconstruct topics
-      engine.instance_variable_set(:@topics, data[:topics].map { |t| Topic.from_h(t) })
-      engine
+      ModelSerializer.load(path)
    end
 
    private
 
-    def
-
-      require 'clusterkit'
-
-      # Validate embeddings before UMAP
-      valid_embeddings, invalid_indices = validate_embeddings_for_umap(embeddings)
-
-      if valid_embeddings.empty?
-        raise "No valid embeddings for dimensionality reduction. " \
-              "All embeddings contain invalid values (NaN, Infinity, or non-numeric)."
-      end
-
-      if invalid_indices.any? && @verbose
-        puts "  Warning: #{invalid_indices.size} embeddings with invalid values removed"
-      end
-
-      # Adjust parameters based on data size
-      n_samples = valid_embeddings.size
-      n_components = [@n_components, n_samples - 1, 50].min
-      n_neighbors = [15, n_samples - 1].min
-
-      if @verbose && n_components != @n_components
-        puts "  Adjusted n_components to #{n_components} (was #{@n_components}) for #{n_samples} samples"
-      end
-
-      umap = ClusterKit::Dimensionality::UMAP.new(
-        n_components: n_components,
-        n_neighbors: n_neighbors,
-        random_seed: 42
-      )
-
-      reduced = umap.fit_transform(valid_embeddings)
-
-      # If we had to remove invalid embeddings, reconstruct the full array
-      if invalid_indices.any?
-        full_reduced = []
-        valid_idx = 0
-        embeddings.size.times do |i|
-          if invalid_indices.include?(i)
-            # Use zeros for invalid embeddings (they'll be outliers anyway)
-            full_reduced << Array.new(n_components, 0.0)
-          else
-            full_reduced << reduced[valid_idx]
-            valid_idx += 1
-          end
-        end
-        full_reduced
-      else
-        reduced
-      end
-    rescue LoadError
-      puts "Warning: Dimensionality reduction requires ClusterKit. Using original embeddings." if @verbose
-      embeddings
-    rescue => e
-      puts "Warning: Dimensionality reduction failed: #{e.message}" if @verbose
-      embeddings
-      end
-    end
-
-    def validate_embeddings_for_umap(embeddings)
-      valid = []
-      invalid_indices = []
+    def setup_logger(logger, verbose)
+      return logger if logger
 
-
-
-
-
-
-
-
+      # Create default logger for backward compatibility
+      if verbose
+        require 'logger'
+        Logger.new($stdout, level: Logger::INFO)
+      else
+        # Null logger - doesn't output anything
+        Logger.new(IO::NULL, level: Logger::FATAL)
      end
-
-      [valid, invalid_indices]
    end
 
    def build_topics(cluster_ids)
@@ -283,12 +192,8 @@ module Topical
      case @labeling_method
      when :term_based
        Labelers::TermBased.new
-      when :llm_based
-        Labelers::LLMBased.new(provider: @llm_provider)
-      when :hybrid
-        Labelers::Hybrid.new(provider: @llm_provider)
      else
-        Labelers::TermBased.new #
+        Labelers::TermBased.new # Only term-based labeling supported
      end
    end
 
@@ -307,4 +212,4 @@ module Topical
    end
  end
 end
-end
+end
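The constructor changes above boil down to: llm_provider: is gone, logger: is new, and verbose: true is kept for backward compatibility by building a $stdout INFO logger internally. A sketch of 0.1.0-style construction; the parameter values are arbitrary:

require 'logger'
require 'topical'

engine = Topical::Engine.new(
  clustering_method: :kmeans,
  k: 4,                          # k is an explicit parameter, stored in options
  labeling_method: :term_based,  # the only labeler shipped in 0.1.0
  logger: Logger.new($stderr, level: Logger::WARN)  # explicit logger wins over verbose
)

Note that :llm_based and :hybrid are no longer recognized by build_labeler; any value other than :term_based now falls through to Labelers::TermBased.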
data/lib/topical/metrics.rb
CHANGED

data/lib/topical/model_serializer.rb
ADDED
@@ -0,0 +1,59 @@
+# frozen_string_literal: true
+
+module Topical
+  # Handles saving and loading of topic models
+  class ModelSerializer
+    # Save a topic model to JSON file
+    # @param engine [Engine] The engine instance to save
+    # @param path [String] File path to save to
+    def self.save(engine, path)
+      require 'json'
+
+      config = {
+        clustering_method: engine.instance_variable_get(:@clustering_method),
+        min_cluster_size: engine.instance_variable_get(:@min_cluster_size),
+        min_samples: engine.instance_variable_get(:@min_samples),
+        reduce_dimensions: engine.instance_variable_get(:@reduce_dimensions),
+        n_components: engine.instance_variable_get(:@n_components),
+        labeling_method: engine.instance_variable_get(:@labeling_method)
+      }
+
+      # Include k for kmeans
+      options = engine.instance_variable_get(:@options)
+      if config[:clustering_method] == :kmeans
+        config[:k] = options[:k] || engine.topics.length
+      end
+
+      data = {
+        topics: engine.topics.map(&:to_h),
+        config: config
+      }
+
+      File.write(path, JSON.pretty_generate(data))
+    end
+
+    # Load a topic model from JSON file
+    # @param path [String] File path to load from
+    # @return [Engine] Loaded engine instance
+    def self.load(path)
+      require 'json'
+
+      data = JSON.parse(File.read(path), symbolize_names: true)
+
+      # Make sure k is passed for kmeans and convert string keys to symbols
+      config = data[:config]
+      config[:clustering_method] = config[:clustering_method].to_sym if config[:clustering_method]
+      config[:labeling_method] = config[:labeling_method].to_sym if config[:labeling_method]
+
+      if config[:clustering_method] == :kmeans && !config[:k]
+        # Extract k from saved topics or use default
+        config[:k] = data[:topics]&.length || 5
+      end
+
+      engine = Engine.new(**config)
+      # Reconstruct topics
+      engine.instance_variable_set(:@topics, data[:topics].map { |t| Topic.from_h(t) })
+      engine
+    end
+  end
+end
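Engine#save and Engine.load are now thin wrappers over this class, and the JSON payload still carries only topics and config (embeddings and raw documents are not persisted). A round-trip sketch, assuming engine was fitted as in the example script above and the path is arbitrary:

engine.save("topics.json")                      # delegates to ModelSerializer.save(self, path)

restored = Topical::Engine.load("topics.json")  # delegates to ModelSerializer.load(path)
restored.topics.each { |t| puts "#{t.label} (#{t.size} docs)" }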
data/lib/topical/topic.rb
CHANGED
data/lib/topical/version.rb
CHANGED
data/lib/topical.rb
CHANGED
@@ -10,6 +10,8 @@ module Topical
   autoload :Engine, "topical/engine"
   autoload :Topic, "topical/topic"
   autoload :Metrics, "topical/metrics"
+  autoload :DimensionalityReducer, "topical/dimensionality_reducer"
+  autoload :ModelSerializer, "topical/model_serializer"
 
   module Clustering
     autoload :Adapter, "topical/clustering/adapter"
@@ -17,20 +19,13 @@ module Topical
     autoload :KMeansAdapter, "topical/clustering/kmeans_adapter"
   end
 
-  module Dimensionality
-    autoload :Reducer, "topical/dimensionality/reducer"
-  end
-
   module Extractors
     autoload :TermExtractor, "topical/extractors/term_extractor"
-    autoload :Stopwords, "topical/extractors/stopwords"
   end
 
   module Labelers
     autoload :Base, "topical/labelers/base"
     autoload :TermBased, "topical/labelers/term_based"
-    autoload :LLMBased, "topical/labelers/llm_based"
-    autoload :Hybrid, "topical/labelers/hybrid"
   end
 
   # Convenience method for simple topic extraction
@@ -43,13 +38,13 @@ module Topical
     engine.fit(embeddings: embeddings, documents: documents)
   end
 
-  # Check if red-candle is available for
-  def self.
-    @
+  # Check if red-candle is available for embedding generation in examples
+  def self.embedding_model_available?
+    @embedding_model_available ||= begin
      require 'red-candle'
      true
    rescue LoadError
      false
    end
  end
-end
+end
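The renamed Topical.embedding_model_available? gives callers a clean guard for optional red-candle use (it performs the require itself, so Candle is loaded once it returns true). A short sketch:

require 'topical'

docs = ["interest rates rose", "new AI model released"]

if Topical.embedding_model_available?
  embedder = Candle::EmbeddingModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
  embeddings = docs.map { |d| embedder.embedding(d).first.to_a }
else
  warn "red-candle not installed; supply precomputed embeddings instead"
end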