topical 0.0.1.pre.1 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +159 -107
- data/docs/assets/topical-wide.png +0 -0
- data/examples/detect_new_topics.rb +190 -0
- data/examples/quick_demo.rb +1 -1
- data/examples/topic_summaries_with_llm.rb +128 -0
- data/examples/verify_migration.rb +1 -1
- data/lib/topical/clustering/adapter.rb +1 -1
- data/lib/topical/clustering/hdbscan_adapter.rb +1 -1
- data/lib/topical/clustering/kmeans_adapter.rb +1 -1
- data/lib/topical/dimensionality_reducer.rb +96 -0
- data/lib/topical/engine.rb +31 -126
- data/lib/topical/extractors/term_extractor.rb +1 -1
- data/lib/topical/labelers/base.rb +1 -1
- data/lib/topical/labelers/term_based.rb +1 -1
- data/lib/topical/metrics.rb +1 -1
- data/lib/topical/model_serializer.rb +59 -0
- data/lib/topical/topic.rb +1 -1
- data/lib/topical/version.rb +1 -1
- data/lib/topical.rb +6 -11
- metadata +29 -13
- data/lib/topical/labelers/hybrid.rb +0 -24
- data/lib/topical/labelers/llm_adapter.rb +0 -126
- data/lib/topical/labelers/llm_based.rb +0 -111
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 470b52a96cb237a1c1e159dc5987d58c07f38ee91c721f39ec36f2f607c7cae2
|
4
|
+
data.tar.gz: 7a4e2c14668795facc41b743146ad6f020830aa65290ac9377834c062fc2819d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 680f27ae0aef8c867e2c6232b5aaddf3ddb3564d8a6e32d0e03bec6796002e33e899d8bd81277fc293c1ce6bfa4208e9739feff7e410a99c5e1560f7d2aa1219
|
7
|
+
data.tar.gz: 045bc5d1bf65f0ef8d583ee78abb4af33dc822de61639c26f71e6da5691e31b28f53e733c53fffc61bd9c56b0fbf02acc5e45100ff78c27299eace93a680c74e
|
data/README.md
CHANGED
@@ -1,104 +1,77 @@
|
|
1
|
-
|
1
|
+
<img src="/docs/assets/topical-wide.png" alt="ragnar" height="80px">
|
2
2
|
|
3
3
|
Topic modeling for Ruby using modern clustering algorithms. Extract meaningful topics from document embeddings using HDBSCAN clustering and c-TF-IDF term extraction.
|
4
4
|
|
5
|
-
## Quick Start
|
5
|
+
## Quick Start (requires red-candle)
|
6
6
|
|
7
7
|
```bash
|
8
8
|
# Install the gem
|
9
9
|
gem install topical
|
10
10
|
|
11
|
+
# Install red-candle so we can generate embeddings
|
12
|
+
gem install red-candle
|
13
|
+
|
11
14
|
# Try it out immediately in IRB
|
12
15
|
irb
|
13
16
|
```
|
14
17
|
|
15
|
-
```ruby
|
16
|
-
require 'topical'
|
17
|
-
|
18
|
-
# Create some sample documents
|
19
|
-
documents = [
|
20
|
-
"Ruby is a dynamic programming language with elegant syntax",
|
21
|
-
"Rails is a web framework written in Ruby for building web applications",
|
22
|
-
"Python is great for machine learning and data science applications",
|
23
|
-
"TensorFlow and PyTorch are popular machine learning frameworks in Python",
|
24
|
-
"JavaScript runs in browsers and Node.js for full-stack development",
|
25
|
-
"React and Vue are modern JavaScript frameworks for building UIs",
|
26
|
-
"Machine learning models need training data and validation sets",
|
27
|
-
"Deep learning uses neural networks with multiple layers",
|
28
|
-
"Web development involves HTML, CSS, and JavaScript",
|
29
|
-
"Backend development often uses databases and APIs"
|
30
|
-
]
|
31
|
-
|
32
|
-
# Create simple mock embeddings (in practice, use real embeddings from red-candle or other embedding models)
|
33
|
-
# Here we create 3 distinct clusters based on keywords
|
34
|
-
embeddings = documents.map do |doc|
|
35
|
-
text = doc.downcase
|
36
|
-
[
|
37
|
-
text.include?("ruby") || text.include?("rails") ? 1.0 : 0.0, # Ruby cluster
|
38
|
-
text.include?("python") || text.include?("machine") || text.include?("learning") ? 1.0 : 0.0, # ML cluster
|
39
|
-
text.include?("javascript") || text.include?("web") || text.include?("css") ? 1.0 : 0.0, # Web cluster
|
40
|
-
rand(-0.1..0.1) # Small random noise
|
41
|
-
]
|
42
|
-
end
|
43
|
-
|
44
|
-
# Extract topics
|
45
|
-
topics = Topical.extract(
|
46
|
-
embeddings: embeddings,
|
47
|
-
documents: documents,
|
48
|
-
clustering_method: :kmeans,
|
49
|
-
k: 3
|
50
|
-
)
|
51
|
-
|
52
|
-
# Display results
|
53
|
-
topics.each do |topic|
|
54
|
-
puts "\nš #{topic.label}"
|
55
|
-
puts " Documents: #{topic.size}"
|
56
|
-
puts " Key terms: #{topic.terms.first(5).join(', ')}"
|
57
|
-
puts " Sample: \"#{topic.documents.first[0..80]}...\""
|
58
|
-
end
|
59
|
-
```
|
60
|
-
|
61
|
-
## Installation
|
62
|
-
|
63
|
-
Add this line to your application's Gemfile:
|
64
|
-
|
65
|
-
```ruby
|
66
|
-
gem 'topical'
|
67
|
-
|
68
|
-
# Optional but recommended: for generating real embeddings
|
69
|
-
gem 'red-candle'
|
70
|
-
```
|
71
|
-
|
72
|
-
And then execute:
|
73
|
-
|
74
|
-
$ bundle install
|
75
|
-
|
76
|
-
Or install it yourself as:
|
77
|
-
|
78
|
-
$ gem install topical
|
79
|
-
|
80
|
-
## Real-World Usage with Embeddings
|
81
|
-
|
82
|
-
### Using with red-candle (recommended)
|
83
|
-
|
84
18
|
```ruby
|
85
19
|
require 'topical'
|
86
20
|
require 'red-candle'
|
87
21
|
|
88
22
|
# Initialize embedding model
|
89
|
-
embedder =
|
23
|
+
embedder = Candle::EmbeddingModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
|
90
24
|
|
91
25
|
# Your documents
|
92
26
|
documents = [
|
93
|
-
|
94
|
-
"
|
95
|
-
"
|
96
|
-
"
|
97
|
-
|
27
|
+
# Finance/Economics Topic
|
28
|
+
"The Federal Reserve raised interest rates to combat inflation pressures",
|
29
|
+
"Stock markets rallied on positive earnings reports from tech companies",
|
30
|
+
"Global supply chain disruptions continue to affect consumer prices",
|
31
|
+
"Cryptocurrency markets experienced significant volatility this quarter",
|
32
|
+
"Central banks coordinate policy to address economic uncertainty",
|
33
|
+
"Corporate bond yields rise as investors seek safer assets",
|
34
|
+
"Emerging markets face capital outflows amid dollar strength",
|
35
|
+
|
36
|
+
# Technology/AI Topic
|
37
|
+
"New AI breakthrough in natural language processing announced by researchers",
|
38
|
+
"Machine learning transforms healthcare diagnostics and treatment planning",
|
39
|
+
"Quantum computing reaches new milestone in error correction",
|
40
|
+
"Open source community releases major updates to popular frameworks",
|
41
|
+
"Cloud computing adoption accelerates across enterprise sectors",
|
42
|
+
"Cybersecurity threats evolve with sophisticated ransomware attacks",
|
43
|
+
"Artificial intelligence ethics guidelines proposed by tech consortium",
|
44
|
+
|
45
|
+
# Healthcare/Medical Topic
|
46
|
+
"Clinical trials show promising results for new cancer immunotherapy",
|
47
|
+
"Telemedicine adoption continues to reshape patient care delivery",
|
48
|
+
"Gene editing techniques advance treatment for rare diseases",
|
49
|
+
"Mental health awareness campaigns gain momentum globally",
|
50
|
+
"Vaccine development accelerates using mRNA technology platforms",
|
51
|
+
"Healthcare systems invest in digital transformation initiatives",
|
52
|
+
"Personalized medicine approaches show improved patient outcomes",
|
53
|
+
|
54
|
+
# Climate/Environment Topic
|
55
|
+
"Renewable energy investments surpass fossil fuel spending globally",
|
56
|
+
"Climate scientists warn of accelerating Arctic ice melt",
|
57
|
+
"Carbon capture technology receives significant government funding",
|
58
|
+
"Sustainable agriculture practices reduce environmental impact",
|
59
|
+
"Electric vehicle adoption reaches record levels worldwide",
|
60
|
+
"Ocean conservation efforts expand marine protected areas",
|
61
|
+
"Green hydrogen emerges as key solution for industrial decarbonization",
|
62
|
+
|
63
|
+
# Sports Topic
|
64
|
+
"Championship team breaks decades-old winning streak record",
|
65
|
+
"Olympic athletes prepare for upcoming international competition",
|
66
|
+
"Sports analytics revolutionize player performance evaluation",
|
67
|
+
"Major league implements new rules to improve game pace",
|
68
|
+
"Youth sports participation increases following pandemic recovery",
|
69
|
+
"Stadium technology enhances fan experience with augmented reality",
|
70
|
+
"Professional athletes advocate for mental health support"
|
98
71
|
]
|
99
72
|
|
100
73
|
# Generate embeddings
|
101
|
-
embeddings = documents.map { |doc| embedder.
|
74
|
+
embeddings = documents.map { |doc| embedder.embedding(doc).first.to_a }
|
102
75
|
|
103
76
|
# Extract topics with HDBSCAN clustering
|
104
77
|
engine = Topical::Engine.new(
|
@@ -107,7 +80,7 @@ engine = Topical::Engine.new(
|
|
107
80
|
verbose: true
|
108
81
|
)
|
109
82
|
|
110
|
-
topics = engine.fit(embeddings, documents)
|
83
|
+
topics = engine.fit(embeddings: embeddings, documents: documents)
|
111
84
|
|
112
85
|
# Analyze results
|
113
86
|
topics.each do |topic|
|
@@ -131,36 +104,123 @@ puts "\nOutliers: #{outliers.length} documents"
|
|
131
104
|
engine = Topical::Engine.new(
|
132
105
|
# Clustering options
|
133
106
|
clustering_method: :hdbscan, # :hdbscan or :kmeans
|
134
|
-
min_cluster_size:
|
107
|
+
min_cluster_size: 3, # Minimum documents per topic (HDBSCAN)
|
135
108
|
min_samples: 5, # Core points needed (HDBSCAN)
|
136
109
|
k: 20, # Number of topics (K-means only)
|
137
|
-
|
110
|
+
|
138
111
|
# Dimensionality reduction
|
139
112
|
reduce_dimensions: true, # Auto-reduce high-dim embeddings with UMAP
|
140
113
|
n_components: 50, # Target dimensions for reduction
|
141
|
-
|
114
|
+
|
142
115
|
# Labeling options
|
143
|
-
labeling_method: :
|
144
|
-
|
145
|
-
|
116
|
+
labeling_method: :term_based, # Fast, reliable term-based labeling
|
117
|
+
|
146
118
|
# Other options
|
147
119
|
verbose: true # Show progress
|
148
120
|
)
|
149
121
|
|
150
122
|
# Fit the model
|
151
|
-
topics = engine.fit(embeddings, documents
|
123
|
+
topics = engine.fit(embeddings: embeddings, documents: documents)
|
152
124
|
|
153
125
|
# Save and load models
|
154
126
|
engine.save("topic_model.json")
|
155
127
|
loaded = Topical::Engine.load("topic_model.json")
|
156
128
|
|
157
|
-
# Transform new documents
|
158
|
-
|
129
|
+
# Transform: Assign new documents to existing topics
|
130
|
+
# Note: transform does NOT create new topics - it assigns documents to the closest existing topic
|
131
|
+
new_documents = [
|
132
|
+
# These will be assigned to existing topics based on similarity
|
133
|
+
"Stock market reaches all-time high amid economic recovery", # Should go to Finance
|
134
|
+
"New smartphone features AI-powered camera system", # Should go to Technology
|
135
|
+
"Clinical study reveals breakthrough in diabetes treatment", # Should go to Healthcare
|
136
|
+
"Record heat wave highlights climate change urgency" # Should go to Climate
|
137
|
+
]
|
138
|
+
new_embeddings = new_documents.map { |doc| embedder.embedding(doc).first.to_a }
|
139
|
+
|
140
|
+
# Returns array of topic IDs that each document was assigned to
|
141
|
+
assigned_topic_ids = engine.transform(embeddings: new_embeddings, documents: new_documents)
|
142
|
+
|
143
|
+
# See which topics the new documents were assigned to
|
144
|
+
assigned_topic_ids.each_with_index do |topic_id, idx|
|
145
|
+
topic = engine.get_topic(topic_id)
|
146
|
+
if topic
|
147
|
+
puts "Document #{idx}: Assigned to Topic '#{topic.label}'"
|
148
|
+
puts " Document: #{new_documents[idx]}"
|
149
|
+
else
|
150
|
+
puts "Document #{idx}: Marked as outlier (no matching topic)"
|
151
|
+
end
|
152
|
+
end
|
159
153
|
|
160
154
|
# Get specific topic
|
161
155
|
topic = engine.get_topic(0)
|
162
156
|
```
|
163
157
|
|
158
|
+
### Understanding Transform vs Fit
|
159
|
+
|
160
|
+
- **`fit`**: Discovers topics from your training documents. Creates new topic clusters.
|
161
|
+
- **`transform`**: Assigns new documents to existing topics discovered during `fit`. Does NOT create new topics.
|
162
|
+
|
163
|
+
If you have documents that represent a completely new topic not seen during training:
|
164
|
+
1. They may be assigned to the closest existing topic (even if not very similar)
|
165
|
+
2. They may be marked as outliers if using HDBSCAN (returned as topic_id -1)
|
166
|
+
3. To discover new topics, you need to analyze them separately or re-fit
|
167
|
+
|
168
|
+
### Detecting New Topics
|
169
|
+
|
170
|
+
You can run `fit` on just the new documents to discover their topics independently. This is useful for:
|
171
|
+
- Detecting topic drift over time
|
172
|
+
- Identifying emerging themes
|
173
|
+
- Validating if new content fits your existing model
|
174
|
+
|
175
|
+
See [examples/detect_new_topics.rb](examples/detect_new_topics.rb) for a complete example.
|
176
|
+
|
177
|
+
```ruby
|
178
|
+
# To discover new topics, you have several options:
|
179
|
+
|
180
|
+
# Option 1: Fit only on new documents to discover their topics
|
181
|
+
new_engine = Topical::Engine.new(
|
182
|
+
clustering_method: :hdbscan,
|
183
|
+
min_cluster_size: 3 # May need to adjust for small batches
|
184
|
+
)
|
185
|
+
new_topics = new_engine.fit(embeddings: new_embeddings, documents: new_documents)
|
186
|
+
puts "Found #{new_topics.size} topics in new documents"
|
187
|
+
|
188
|
+
# Option 2: Check if new documents are outliers (potential new topic)
|
189
|
+
assigned_ids = engine.transform(embeddings: new_embeddings)
|
190
|
+
outlier_indices = assigned_ids.each_index.select { |i| assigned_ids[i] == -1 }
|
191
|
+
if outlier_indices.size > 3 # If many outliers, might be new topic
|
192
|
+
puts "#{outlier_indices.size} documents don't fit existing topics - potential new topic!"
|
193
|
+
outlier_docs = outlier_indices.map { |i| new_documents[i] }
|
194
|
+
outlier_embeds = outlier_indices.map { |i| new_embeddings[i] }
|
195
|
+
|
196
|
+
# Cluster just the outliers
|
197
|
+
outlier_engine = Topical::Engine.new(min_cluster_size: 3)
|
198
|
+
outlier_topics = outlier_engine.fit(embeddings: outlier_embeds, documents: outlier_docs)
|
199
|
+
end
|
200
|
+
|
201
|
+
# Option 3: Incremental topic discovery (combine old + new and re-fit)
|
202
|
+
all_documents = original_documents + new_documents
|
203
|
+
all_embeddings = original_embeddings + new_embeddings
|
204
|
+
updated_topics = engine.fit(embeddings: all_embeddings, documents: all_documents)
|
205
|
+
|
206
|
+
# Option 4: Compare similarity scores to detect poor fits
|
207
|
+
assigned_ids = engine.transform(embeddings: new_embeddings)
|
208
|
+
similarities = new_embeddings.map.with_index do |embed, idx|
|
209
|
+
topic_id = assigned_ids[idx]
|
210
|
+
next nil if topic_id == -1
|
211
|
+
|
212
|
+
topic = engine.get_topic(topic_id)
|
213
|
+
# Calculate distance to topic centroid (simplified)
|
214
|
+
# In practice, you'd compute actual distance to topic center
|
215
|
+
{ document: new_documents[idx], topic: topic.label, similarity: rand(0.3..1.0) }
|
216
|
+
end
|
217
|
+
|
218
|
+
low_similarity = similarities.compact.select { |s| s[:similarity] < 0.5 }
|
219
|
+
if low_similarity.size > 3
|
220
|
+
puts "#{low_similarity.size} documents have low similarity - might be new topic"
|
221
|
+
end
|
222
|
+
```
|
223
|
+
|
164
224
|
### Topic Analysis
|
165
225
|
|
166
226
|
```ruby
|
@@ -181,7 +241,7 @@ topic.to_h
|
|
181
241
|
|
182
242
|
# Compute metrics across all topics
|
183
243
|
diversity = Topical::Metrics.compute_diversity(topics)
|
184
|
-
coverage = Topical::Metrics.compute_coverage(topics,
|
244
|
+
coverage = Topical::Metrics.compute_coverage(topics, documents.count + new_documents.count)
|
185
245
|
```
|
186
246
|
|
187
247
|
## Clustering Methods
|
@@ -203,24 +263,18 @@ Topical uses **c-TF-IDF** (class-based TF-IDF) to find distinctive terms for eac
|
|
203
263
|
- Automatically filters stop words
|
204
264
|
- Configurable minimum/maximum word lengths
|
205
265
|
|
206
|
-
## Topic Labeling
|
266
|
+
## Topic Labeling
|
267
|
+
|
268
|
+
Topical uses **term-based labeling** - fast, reliable labels generated from the most distinctive terms in each topic cluster. Labels are created by combining the top 2-3 terms that best characterize each topic.
|
269
|
+
|
270
|
+
### Advanced: LLM-Powered Summaries
|
207
271
|
|
208
|
-
|
209
|
-
- Fast, uses top distinctive terms
|
210
|
-
- No external dependencies
|
211
|
-
|
212
|
-
2. **LLM-based** (`:llm_based`)
|
213
|
-
- High quality, contextual labels
|
214
|
-
- Requires red-candle or API provider
|
215
|
-
|
216
|
-
3. **Hybrid** (`:hybrid`)
|
217
|
-
- Best of both: fast with LLM enhancement
|
218
|
-
- Falls back to term-based if LLM unavailable
|
272
|
+
For richer topic analysis, you can combine Topical's clustering with red-candle's LLM capabilities. See `examples/topic_summaries_with_llm.rb` for a complete example of generating detailed topic summaries using your choice of LLM.
|
219
273
|
|
220
274
|
## Dependencies
|
221
275
|
|
222
276
|
- **Required**: `clusterkit` - For HDBSCAN clustering and UMAP dimensionality reduction
|
223
|
-
- **Optional**: `red-candle` - For generating embeddings and LLM
|
277
|
+
- **Optional**: `red-candle` - For generating embeddings in examples and advanced LLM summaries
|
224
278
|
|
225
279
|
## Performance Tips
|
226
280
|
|
@@ -233,9 +287,7 @@ Topical uses **c-TF-IDF** (class-based TF-IDF) to find distinctive terms for eac
|
|
233
287
|
|
234
288
|
Check out the `examples/` directory for complete examples:
|
235
289
|
- `quick_demo.rb` - Simple demonstration with mock data
|
236
|
-
- `
|
237
|
-
- `customer_feedback.rb` - Analyzing customer feedback topics
|
238
|
-
- `research_papers.rb` - Organizing research papers by topic
|
290
|
+
- `topic_summaries_with_llm.rb` - Advanced example showing how to generate detailed topic summaries using red-candle LLM
|
239
291
|
|
240
292
|
## Development
|
241
293
|
|
@@ -245,8 +297,8 @@ To install this gem onto your local machine, run `bundle exec rake install`.
|
|
245
297
|
|
246
298
|
## Contributing
|
247
299
|
|
248
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
300
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/scientist-labs/topical.
|
249
301
|
|
250
302
|
## License
|
251
303
|
|
252
|
-
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
304
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
Binary file
|
@@ -0,0 +1,190 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# Example: Detecting emergence of new topics in document streams
|
3
|
+
|
4
|
+
require 'topical'
|
5
|
+
require 'candle'
|
6
|
+
require 'json'
|
7
|
+
|
8
|
+
puts "Loading embedding model..."
|
9
|
+
embedder = Candle::EmbeddingModel.from_pretrained("jinaai/jina-embeddings-v2-base-en") # Default ragnar model
|
10
|
+
puts "Model loaded!"
|
11
|
+
|
12
|
+
# Initial documents (3 clear topics)
|
13
|
+
initial_documents = [
|
14
|
+
# Finance
|
15
|
+
"Stock market reaches record highs amid economic recovery",
|
16
|
+
"Federal Reserve considers interest rate adjustments",
|
17
|
+
"Cryptocurrency adoption grows among institutional investors",
|
18
|
+
"Banking sector reports strong quarterly earnings",
|
19
|
+
"Global trade agreements impact currency markets",
|
20
|
+
|
21
|
+
# Technology
|
22
|
+
"Artificial intelligence breakthrough in natural language processing",
|
23
|
+
"Cloud computing services expand globally",
|
24
|
+
"Quantum computing achieves new milestone",
|
25
|
+
"Cybersecurity threats evolve with new techniques",
|
26
|
+
"Open source community releases major updates",
|
27
|
+
|
28
|
+
# Healthcare
|
29
|
+
"Clinical trials show promise for new cancer treatment",
|
30
|
+
"Telemedicine adoption continues post-pandemic growth",
|
31
|
+
"Gene therapy advances for rare diseases",
|
32
|
+
"Mental health awareness campaigns expand",
|
33
|
+
"Vaccine development using mRNA technology"
|
34
|
+
]
|
35
|
+
|
36
|
+
puts "=" * 60
|
37
|
+
puts "INITIAL TOPIC MODELING"
|
38
|
+
puts "=" * 60
|
39
|
+
|
40
|
+
# Train initial model
|
41
|
+
puts "\nGenerating embeddings for #{initial_documents.size} documents..."
|
42
|
+
initial_embeddings = initial_documents.map.with_index do |doc, i|
|
43
|
+
print "." if i % 5 == 0
|
44
|
+
embedder.embedding(doc).first.to_a
|
45
|
+
end
|
46
|
+
puts " done!"
|
47
|
+
|
48
|
+
puts "Creating topic model..."
|
49
|
+
engine = Topical::Engine.new(
|
50
|
+
clustering_method: :kmeans, # Use k-means for small dataset
|
51
|
+
k: 3, # We know we have 3 distinct topics
|
52
|
+
reduce_dimensions: false, # Don't reduce dimensions for small dataset
|
53
|
+
verbose: true # Show progress
|
54
|
+
)
|
55
|
+
|
56
|
+
puts "Fitting model..."
|
57
|
+
initial_topics = engine.fit(embeddings: initial_embeddings, documents: initial_documents)
|
58
|
+
puts "\nFound #{initial_topics.size} initial topics:"
|
59
|
+
initial_topics.each do |topic|
|
60
|
+
puts " Topic #{topic.id}: #{topic.terms.take(5).join(', ')}"
|
61
|
+
puts " Size: #{topic.size} documents"
|
62
|
+
end
|
63
|
+
|
64
|
+
# New documents arrive - including a new topic (Education)
|
65
|
+
new_documents = [
|
66
|
+
# More finance
|
67
|
+
"Market volatility increases ahead of earnings season",
|
68
|
+
|
69
|
+
# More tech
|
70
|
+
"Machine learning models improve prediction accuracy",
|
71
|
+
|
72
|
+
# NEW TOPIC: Education (not in original training)
|
73
|
+
"Online learning platforms transform education delivery",
|
74
|
+
"Universities adopt hybrid teaching models globally",
|
75
|
+
"STEM education initiatives target underserved communities",
|
76
|
+
"Educational technology startups receive record funding",
|
77
|
+
"Student debt crisis prompts policy discussions",
|
78
|
+
"Coding bootcamps address skills gap in workforce"
|
79
|
+
]
|
80
|
+
|
81
|
+
puts "\nGenerating embeddings for new documents..."
|
82
|
+
new_embeddings = new_documents.map.with_index do |doc, i|
|
83
|
+
print "."
|
84
|
+
embedder.embedding(doc).first.to_a
|
85
|
+
end
|
86
|
+
puts " done!"
|
87
|
+
|
88
|
+
puts "\n" + "=" * 60
|
89
|
+
puts "DETECTING NEW TOPICS IN INCOMING DOCUMENTS"
|
90
|
+
puts "=" * 60
|
91
|
+
|
92
|
+
# Method 1: Try to assign to existing topics
|
93
|
+
puts "\n1. Checking fit with existing topics..."
|
94
|
+
assigned_ids = engine.transform(embeddings: new_embeddings)
|
95
|
+
|
96
|
+
assigned_ids.each_with_index do |topic_id, idx|
|
97
|
+
doc_preview = new_documents[idx][0..50] + "..."
|
98
|
+
if topic_id == -1
|
99
|
+
puts " ā Outlier: #{doc_preview}"
|
100
|
+
else
|
101
|
+
topic = engine.get_topic(topic_id)
|
102
|
+
puts " ā Topic #{topic_id}: #{doc_preview}"
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
outlier_count = assigned_ids.count(-1)
|
107
|
+
puts "\nFound #{outlier_count} outliers (documents that don't fit existing topics)"
|
108
|
+
|
109
|
+
# Method 2: Cluster new documents independently
|
110
|
+
puts "\n2. Clustering new documents independently..."
|
111
|
+
if new_documents.size >= 5 # Need minimum documents for clustering
|
112
|
+
new_engine = Topical::Engine.new(
|
113
|
+
clustering_method: :kmeans, # Use kmeans for small datasets
|
114
|
+
k: 2, # Expect ~2 topics in new docs
|
115
|
+
verbose: false
|
116
|
+
)
|
117
|
+
|
118
|
+
begin
|
119
|
+
new_topics_only = new_engine.fit(embeddings: new_embeddings, documents: new_documents)
|
120
|
+
puts "Found #{new_topics_only.size} topics in new documents:"
|
121
|
+
|
122
|
+
new_topics_only.each do |topic|
|
123
|
+
puts " New Topic: #{topic.terms.take(5).join(', ')}"
|
124
|
+
puts " Documents: #{topic.size}"
|
125
|
+
puts " Sample: #{topic.documents.first[0..60]}..." if topic.documents.any?
|
126
|
+
end
|
127
|
+
rescue => e
|
128
|
+
puts "Could not cluster new documents alone: #{e.message}"
|
129
|
+
end
|
130
|
+
else
|
131
|
+
puts "Too few new documents (#{new_documents.size}) for independent clustering"
|
132
|
+
end
|
133
|
+
|
134
|
+
# Method 3: Identify potential new topic from outliers
|
135
|
+
if outlier_count >= 3
|
136
|
+
puts "\n3. Analyzing outliers for potential new topic..."
|
137
|
+
outlier_indices = assigned_ids.each_index.select { |i| assigned_ids[i] == -1 }
|
138
|
+
outlier_docs = outlier_indices.map { |i| new_documents[i] }
|
139
|
+
outlier_embeds = outlier_indices.map { |i| new_embeddings[i] }
|
140
|
+
|
141
|
+
# Check if outliers are similar to each other (potential new topic)
|
142
|
+
puts "Outlier documents:"
|
143
|
+
outlier_docs.each { |doc| puts " - #{doc[0..60]}..." }
|
144
|
+
|
145
|
+
# Try clustering just the outliers
|
146
|
+
if outlier_docs.size >= 3
|
147
|
+
# Check similarity among outliers to see if they form coherent group
|
148
|
+
puts "\n🎯 Multiple outliers detected - potential new topic!"
|
149
|
+
puts "Consider re-training model to discover new topics"
|
150
|
+
end
|
151
|
+
end
|
152
|
+
|
153
|
+
# Method 4: Re-fit with all documents to discover new structure
|
154
|
+
puts "\n4. Re-fitting with all documents..."
|
155
|
+
all_docs = initial_documents + new_documents
|
156
|
+
all_embeds = initial_embeddings + new_embeddings
|
157
|
+
|
158
|
+
updated_engine = Topical::Engine.new(
|
159
|
+
clustering_method: :kmeans,
|
160
|
+
k: 4, # Try 4 topics since we might have a new one
|
161
|
+
reduce_dimensions: false, # Don't reduce dimensions for small dataset
|
162
|
+
verbose: false
|
163
|
+
)
|
164
|
+
|
165
|
+
updated_topics = updated_engine.fit(embeddings: all_embeds, documents: all_docs)
|
166
|
+
puts "After re-fitting with all documents: #{updated_topics.size} topics"
|
167
|
+
|
168
|
+
if updated_topics.size > initial_topics.size
|
169
|
+
puts "✨ NEW TOPICS EMERGED! (#{updated_topics.size - initial_topics.size} new)"
|
170
|
+
updated_topics.each do |topic|
|
171
|
+
puts " Topic: #{topic.terms.take(5).join(', ')} (#{topic.size} docs)"
|
172
|
+
end
|
173
|
+
else
|
174
|
+
puts "No new topics detected after re-fitting"
|
175
|
+
end
|
176
|
+
|
177
|
+
puts "\n" + "=" * 60
|
178
|
+
puts "SUMMARY"
|
179
|
+
puts "=" * 60
|
180
|
+
puts "Initial topics: #{initial_topics.size}"
|
181
|
+
puts "Outliers in new batch: #{outlier_count}/#{new_documents.size}"
|
182
|
+
puts "Topics after re-fit: #{updated_topics.size}"
|
183
|
+
puts "\nRecommendation:"
|
184
|
+
if outlier_count > new_documents.size * 0.3
|
185
|
+
puts "⚠️ High outlier rate suggests emerging new topic(s)"
|
186
|
+
puts "Consider re-training your topic model with recent documents"
|
187
|
+
else
|
188
|
+
puts "ā New documents mostly fit existing topics"
|
189
|
+
puts "Current model appears adequate"
|
190
|
+
end
|
data/examples/quick_demo.rb
CHANGED
@@ -115,4 +115,4 @@ puts "🎉 Demo complete! Try it with your own documents and real embeddings."
|
|
115
115
|
puts
|
116
116
|
puts "💡 Tip: Install red-candle to generate real embeddings:"
|
117
117
|
puts " gem install red-candle"
|
118
|
-
puts " Then use: RedCandle::Embedding.new('model-name').embed(text)"
|
118
|
+
puts " Then use: RedCandle::Embedding.new('model-name').embed(text)"
|