topical 0.0.1.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,54 @@
+# frozen_string_literal: true
+
+require 'clusterkit'
+
+module Topical
+  module Clustering
+    # Adapter for ClusterKit's HDBSCAN implementation
+    class HDBSCANAdapter < Adapter
+      def initialize(min_cluster_size: 5, min_samples: 3, metric: 'euclidean')
+        @min_cluster_size = min_cluster_size
+        @min_samples = min_samples
+        @metric = metric
+
+        @clusterer = ClusterKit::Clustering::HDBSCAN.new(
+          min_cluster_size: min_cluster_size,
+          min_samples: min_samples,
+          metric: metric
+        )
+      end
+
+      def fit_predict(embeddings)
+        labels = @clusterer.fit_predict(embeddings)
+        update_stats(labels)
+        labels
+      end
+
+      def fit(embeddings)
+        @clusterer.fit(embeddings)
+        self
+      end
+
+      def predict(embeddings)
+        # HDBSCAN doesn't have a separate predict method
+        # For new points, we'd need to use approximate prediction
+        if @clusterer.respond_to?(:approximate_predict)
+          @clusterer.approximate_predict(embeddings)
+        else
+          raise NotImplementedError, "HDBSCAN does not support prediction on new data"
+        end
+      end
+
+      # Access to the underlying ClusterKit object if needed
+      attr_reader :clusterer
+
+      private
+
+      def update_stats(labels)
+        @n_noise_points = labels.count(-1)
+        unique_labels = labels.uniq.reject { |l| l == -1 }
+        @n_clusters = unique_labels.length
+      end
+    end
+  end
+end
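
For reference, a minimal usage sketch of the adapter above, assuming the clusterkit gem is installed; the toy 2-D points stand in for real document embeddings, and the expected output is illustrative:

    require 'topical'

    # Two tight groups plus one stray point; real inputs would be
    # high-dimensional document embeddings.
    embeddings = [
      [0.10, 0.20], [0.15, 0.22], [0.12, 0.18], [0.11, 0.21],
      [0.90, 0.80], [0.92, 0.83], [0.88, 0.79], [0.91, 0.81],
      [5.00, 5.00] # far from both groups, likely labeled -1 (noise)
    ]

    adapter = Topical::Clustering::HDBSCANAdapter.new(min_cluster_size: 3, min_samples: 2)
    labels = adapter.fit_predict(embeddings)
    # => e.g. [0, 0, 0, 0, 1, 1, 1, 1, -1]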
@@ -0,0 +1,44 @@
+# frozen_string_literal: true
+
+require 'clusterkit'
+
+module Topical
+  module Clustering
+    # Adapter for ClusterKit's K-means implementation
+    class KMeansAdapter < Adapter
+      def initialize(k: 5, random_seed: nil)
+        @k = k
+        @random_seed = random_seed
+
+        @clusterer = ClusterKit::Clustering::KMeans.new(
+          k: k,
+          random_seed: random_seed
+        )
+      end
+
+      def fit_predict(embeddings)
+        labels = @clusterer.fit_predict(embeddings)
+        @n_clusters = @k
+        @n_noise_points = 0 # K-means doesn't have noise points
+        labels
+      end
+
+      def fit(embeddings)
+        @clusterer.fit(embeddings)
+        self
+      end
+
+      def predict(embeddings)
+        @clusterer.predict(embeddings)
+      end
+
+      # Access cluster centers
+      def cluster_centers
+        @clusterer.cluster_centers
+      end
+
+      # Access to the underlying ClusterKit object if needed
+      attr_reader :clusterer
+    end
+  end
+end
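
Unlike HDBSCAN, K-means can assign new points after fitting. A quick sketch with toy 2-D points (again illustrative, not real embeddings):

    points = [[0.1, 0.2], [0.12, 0.18], [0.9, 0.8], [0.92, 0.83]]

    adapter = Topical::Clustering::KMeansAdapter.new(k: 2, random_seed: 42)
    adapter.fit_predict(points)    # labels fall in 0...k; no -1 noise label
    adapter.predict([[0.13, 0.2]]) # assign a new point to a fitted cluster
    adapter.cluster_centers        # the k centroid vectors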
@@ -0,0 +1,310 @@
+# frozen_string_literal: true
+
+module Topical
+  # Main engine for topic modeling
+  class Engine
+    attr_reader :topics, :clustering_adapter, :term_extractor, :labeler
+
+    def initialize(
+      clustering_method: :hdbscan,
+      min_cluster_size: 5,
+      min_samples: 3,
+      reduce_dimensions: true,
+      n_components: 50,
+      labeling_method: :hybrid,
+      llm_provider: nil,
+      verbose: false,
+      k: nil, # Add k as an explicit parameter
+      **options
+    )
+      @clustering_method = clustering_method
+      @min_cluster_size = min_cluster_size
+      @min_samples = min_samples
+      @reduce_dimensions = reduce_dimensions
+      @n_components = n_components
+      @labeling_method = labeling_method
+      @llm_provider = llm_provider
+      @verbose = verbose
+      @options = options
+      @options[:k] = k if k # Store k in options if provided
+
+      @clustering_adapter = build_clustering_adapter
+      @term_extractor = Extractors::TermExtractor.new
+      @labeler = build_labeler
+      @topics = []
+    end
+
+    # Fit the model to embeddings and documents
+    # @param embeddings [Array<Array<Float>>] Document embeddings
+    # @param documents [Array<String>] Document texts
+    # @param metadata [Array<Hash>] Optional metadata for each document
+    # @return [Array<Topic>] Extracted topics
+    def fit(embeddings:, documents:, metadata: nil)
+      raise ArgumentError, "Embeddings and documents must have the same length" unless embeddings.length == documents.length
+
+      @embeddings = embeddings
+      @documents = documents
+      @metadata = metadata || Array.new(documents.length) { {} }
+
+      puts "Starting topic extraction..." if @verbose
+
+      # Step 1: Optionally reduce dimensions
+      working_embeddings = @embeddings
+      if @reduce_dimensions && @embeddings.first.length > @n_components
+        puts " Reducing dimensions from #{@embeddings.first.length} to #{@n_components}..." if @verbose
+        working_embeddings = reduce_dimensions(@embeddings)
+      end
+
+      # Step 2: Cluster embeddings
+      puts " Clustering #{working_embeddings.length} documents..." if @verbose
+      @cluster_ids = @clustering_adapter.fit_predict(working_embeddings)
+
+      # Step 3: Build topics from clusters
+      puts " Building topics from clusters..." if @verbose
+      @topics = build_topics(@cluster_ids)
+
+      # Step 4: Extract terms for each topic
+      puts " Extracting distinctive terms..." if @verbose
+      extract_topic_terms
+
+      # Step 5: Generate labels
+      puts " Generating topic labels..." if @verbose
+      generate_topic_labels
+
+      if @verbose
+        n_noise = @cluster_ids.count(-1)
+        puts "Found #{@topics.length} topics (plus #{n_noise} outliers)"
+      end
+
+      @topics
+    end
+
+    # Transform new documents using the fitted model
+    def transform(embeddings:, documents: nil)
+      raise "Must call fit before transform" if @topics.empty?
+
+      # Use approximate prediction if available
+      if @clustering_adapter.respond_to?(:approximate_predict)
+        @clustering_adapter.approximate_predict(embeddings)
+      else
+        # Fallback: assign to the nearest topic centroid
+        assign_to_nearest_topic(embeddings: embeddings)
+      end
+    end
+
+    def get_topic(topic_id)
+      @topics.find { |t| t.id == topic_id }
+    end
+
+    def outliers
+      return [] unless @cluster_ids
+      @documents.each_with_index.select { |_, idx|
+        @cluster_ids[idx] == -1
+      }.map(&:first)
+    end
+
+    # Save the model
+    def save(path)
+      require 'json'
+      config = {
+        clustering_method: @clustering_method,
+        min_cluster_size: @min_cluster_size,
+        min_samples: @min_samples,
+        reduce_dimensions: @reduce_dimensions,
+        n_components: @n_components,
+        labeling_method: @labeling_method
+      }
+
+      # Include k for kmeans
+      if @clustering_method == :kmeans
+        config[:k] = @options[:k] || @topics.length
+      end
+
+      data = {
+        topics: @topics.map(&:to_h),
+        config: config
+      }
+      File.write(path, JSON.pretty_generate(data))
+    end
+
+    # Load a model
+    def self.load(path)
+      require 'json'
+      data = JSON.parse(File.read(path), symbolize_names: true)
+
+      # Make sure k is passed for kmeans, and convert string values back to symbols
+      config = data[:config]
+      config[:clustering_method] = config[:clustering_method].to_sym if config[:clustering_method]
+      config[:labeling_method] = config[:labeling_method].to_sym if config[:labeling_method]
+
+      if config[:clustering_method] == :kmeans && !config[:k]
+        # Extract k from the saved topics or use the default
+        config[:k] = data[:topics]&.length || 5
+      end
+
+      engine = new(**config)
+      # Reconstruct topics
+      engine.instance_variable_set(:@topics, data[:topics].map { |t| Topic.from_h(t) })
+      engine
+    end
+
+    private
+
+    def reduce_dimensions(embeddings)
+      begin
+        require 'clusterkit'
+
+        # Validate embeddings before UMAP
+        valid_embeddings, invalid_indices = validate_embeddings_for_umap(embeddings)
+
+        if valid_embeddings.empty?
+          raise "No valid embeddings for dimensionality reduction. " \
+                "All embeddings contain invalid values (NaN, Infinity, or non-numeric)."
+        end
+
+        if invalid_indices.any? && @verbose
+          puts " Warning: #{invalid_indices.size} embeddings with invalid values removed"
+        end
+
+        # Adjust parameters based on data size
+        n_samples = valid_embeddings.size
+        n_components = [@n_components, n_samples - 1, 50].min
+        n_neighbors = [15, n_samples - 1].min
+
+        if @verbose && n_components != @n_components
+          puts " Adjusted n_components to #{n_components} (was #{@n_components}) for #{n_samples} samples"
+        end
+
+        umap = ClusterKit::Dimensionality::UMAP.new(
+          n_components: n_components,
+          n_neighbors: n_neighbors,
+          random_seed: 42
+        )
+
+        reduced = umap.fit_transform(valid_embeddings)
+
+        # If we had to remove invalid embeddings, reconstruct the full array
+        if invalid_indices.any?
+          full_reduced = []
+          valid_idx = 0
+          embeddings.size.times do |i|
+            if invalid_indices.include?(i)
+              # Use zeros for invalid embeddings (they'll be outliers anyway)
+              full_reduced << Array.new(n_components, 0.0)
+            else
+              full_reduced << reduced[valid_idx]
+              valid_idx += 1
+            end
+          end
+          full_reduced
+        else
+          reduced
+        end
+      rescue LoadError
+        puts "Warning: Dimensionality reduction requires ClusterKit. Using original embeddings." if @verbose
+        embeddings
+      rescue => e
+        puts "Warning: Dimensionality reduction failed: #{e.message}" if @verbose
+        embeddings
+      end
+    end
+
+    def validate_embeddings_for_umap(embeddings)
+      valid = []
+      invalid_indices = []
+
+      embeddings.each_with_index do |embedding, idx|
+        if embedding.is_a?(Array) &&
+           embedding.all? { |v| v.is_a?(Numeric) && v.finite? }
+          valid << embedding
+        else
+          invalid_indices << idx
+        end
+      end
+
+      [valid, invalid_indices]
+    end
+
+    def build_topics(cluster_ids)
+      # Group documents by cluster
+      clusters = {}
+      cluster_ids.each_with_index do |cluster_id, doc_idx|
+        next if cluster_id == -1 # Skip outliers
+        clusters[cluster_id] ||= []
+        clusters[cluster_id] << doc_idx
+      end
+
+      # Create Topic objects
+      clusters.map do |cluster_id, doc_indices|
+        Topic.new(
+          id: cluster_id,
+          document_indices: doc_indices,
+          documents: doc_indices.map { |i| @documents[i] },
+          embeddings: doc_indices.map { |i| @embeddings[i] },
+          metadata: doc_indices.map { |i| @metadata[i] }
+        )
+      end.sort_by(&:id)
+    end
+
+    def extract_topic_terms
+      @topics.each do |topic|
+        # Extract distinctive terms using c-TF-IDF
+        terms = @term_extractor.extract_distinctive_terms(
+          topic_docs: topic.documents,
+          all_docs: @documents,
+          top_n: 20
+        )
+
+        topic.terms = terms
+      end
+    end
+
+    def generate_topic_labels
+      @topics.each do |topic|
+        topic.label = @labeler.generate_label(topic)
+      end
+    end
+
+    def build_clustering_adapter
+      case @clustering_method
+      when :hdbscan
+        Clustering::HDBSCANAdapter.new(
+          min_cluster_size: @min_cluster_size,
+          min_samples: @min_samples
+        )
+      when :kmeans
+        Clustering::KMeansAdapter.new(k: @options[:k] || 5)
+      else
+        raise ArgumentError, "Unknown clustering method: #{@clustering_method}"
+      end
+    end
+
+    def build_labeler
+      case @labeling_method
+      when :term_based
+        Labelers::TermBased.new
+      when :llm_based
+        Labelers::LLMBased.new(provider: @llm_provider)
+      when :hybrid
+        Labelers::Hybrid.new(provider: @llm_provider)
+      else
+        Labelers::TermBased.new # Default fallback
+      end
+    end
+
+    def assign_to_nearest_topic(embeddings:)
+      # Simple nearest centroid assignment
+      topic_centroids = @topics.map(&:centroid)
+
+      embeddings.map do |embedding|
+        distances = topic_centroids.map do |centroid|
+          # Euclidean distance
+          Math.sqrt(embedding.zip(centroid).map { |a, b| (a - b) ** 2 }.sum)
+        end
+
+        min_idx = distances.index(distances.min)
+        @topics[min_idx].id
+      end
+    end
+  end
+end
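
Putting the engine together, a hedged end-to-end sketch: the documents are toy data, the random vectors stand in for output from a real embedding model (so the resulting clusters are arbitrary), dimensionality reduction is disabled to keep it self-contained, and the label/documents readers on Topic are assumed from the engine code above:

    require 'topical'

    documents = [
      "Tuning garbage collection in Ruby",
      "Benchmarking YJIT on a Rails app",
      "Baking sourdough with a rye starter",
      "Hydration ratios for pizza dough"
    ]
    # Placeholder vectors; in practice use a sentence-embedding model.
    embeddings = documents.map { Array.new(384) { rand } }

    engine = Topical::Engine.new(
      clustering_method: :kmeans,
      k: 2,
      reduce_dimensions: false,
      labeling_method: :term_based
    )
    topics = engine.fit(embeddings: embeddings, documents: documents)
    topics.each { |t| puts "#{t.id}: #{t.label} (#{t.documents.length} docs)" }

    engine.save("topics.json")
    engine = Topical::Engine.load("topics.json")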
@@ -0,0 +1,98 @@
+# frozen_string_literal: true
+
+require 'set'
+
+module Topical
+  module Extractors
+    # Extracts distinctive terms from documents using c-TF-IDF
+    class TermExtractor
+      # Default English stop words
+      DEFAULT_STOP_WORDS = Set.new(%w[
+        the be to of and a in that have i it for not on with he as you do at
+        this but his by from they we say her she or an will my one all would
+        there their what so up out if about who get which go me when make can
+        like time no just him know take people into year your good some could
+        them see other than then now look only come its over think also back
+        after use two how our work first well way even new want because any
+        these give day most us is was are been has had were said did may
+      ])
+
+      def initialize(stop_words: DEFAULT_STOP_WORDS, min_word_length: 3, max_word_length: 20)
+        @stop_words = stop_words
+        @min_word_length = min_word_length
+        @max_word_length = max_word_length
+      end
+
+      # Extract distinctive terms using c-TF-IDF
+      # @param topic_docs [Array<String>] Documents in the topic
+      # @param all_docs [Array<String>] All documents in the corpus
+      # @param top_n [Integer] Number of top terms to return
+      # @return [Array<String>] Top distinctive terms
+      def extract_distinctive_terms(topic_docs:, all_docs:, top_n: 20)
+        # Tokenize and count terms in the topic
+        topic_terms = count_terms(topic_docs)
+
+        # Tokenize and count document frequency across all docs
+        doc_frequencies = compute_document_frequencies(all_docs)
+
+        # Compute c-TF-IDF scores
+        scores = {}
+        total_docs = all_docs.length.to_f
+
+        topic_terms.each do |term, tf|
+          # c-TF-IDF formula: tf * log(N / df)
+          df = doc_frequencies[term] || 1
+          idf = Math.log(total_docs / df)
+          scores[term] = tf * idf
+        end
+
+        # Return the top-scoring terms
+        scores.sort_by { |_, score| -score }
+              .first(top_n)
+              .map(&:first)
+      end
+
+      private
+
+      def tokenize(text)
+        # Simple tokenization
+        text.downcase
+            .split(/\W+/)
+            .select { |word| valid_word?(word) }
+      end
+
+      def valid_word?(word)
+        word.length >= @min_word_length &&
+          word.length <= @max_word_length &&
+          !@stop_words.include?(word) &&
+          !word.match?(/^\d+$/) # Not pure numbers
+      end
+
+      def count_terms(documents)
+        terms = Hash.new(0)
+
+        documents.each do |doc|
+          tokenize(doc).each do |word|
+            terms[word] += 1
+          end
+        end
+
+        terms
+      end
+
+      def compute_document_frequencies(documents)
+        doc_frequencies = Hash.new(0)
+
+        documents.each do |doc|
+          # Use a Set to count each term once per document
+          unique_terms = Set.new(tokenize(doc))
+          unique_terms.each do |term|
+            doc_frequencies[term] += 1
+          end
+        end
+
+        doc_frequencies
+      end
+    end
+  end
+end
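
To make the scoring concrete, a short worked example of the tf * log(N / df) line above: with a corpus of N = 10 documents, a term that occurs 8 times in the topic but in only 2 documents overall scores high, while an equally frequent term that appears in every document is zeroed out.

    tf = 8
    tf * Math.log(10.0 / 2)  # term in 2 of 10 docs  => 8 * ln(5)  ~= 12.9
    tf * Math.log(10.0 / 10) # term in all 10 docs   => 8 * ln(1)  == 0.0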
@@ -0,0 +1,23 @@
+# frozen_string_literal: true
+
+module Topical
+  module Labelers
+    # Base class for topic labeling strategies
+    class Base
+      def generate_label(topic)
+        raise NotImplementedError, "Subclasses must implement generate_label"
+      end
+
+      protected
+
+      def capitalize_phrase(phrase)
+        phrase.split(/[\s_-]/).map(&:capitalize).join(' ')
+      end
+
+      def select_representative_docs(documents, k: 3)
+        return documents if documents.length <= k
+        documents.first(k)
+      end
+    end
+  end
+end
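
The base class defines the labeler contract; a minimal hypothetical subclass for illustration (TopTerms is not part of the gem, and Topic#terms is assumed from the engine code, which sets it during fit):

    module Topical
      module Labelers
        # Hypothetical example: label a topic with its top two terms.
        class TopTerms < Base
          def generate_label(topic)
            capitalize_phrase(topic.terms.first(2).join(' '))
          end
        end
      end
    end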
@@ -0,0 +1,24 @@
+# frozen_string_literal: true
+
+module Topical
+  module Labelers
+    # Hybrid labeling that combines term-based and LLM approaches
+    class Hybrid < Base
+      def initialize(provider: nil)
+        @term_labeler = TermBased.new
+        @llm_labeler = LLMBased.new(provider: provider)
+      end
+
+      def generate_label(topic)
+        # Start with the term-based label
+        term_label = @term_labeler.generate_label(topic)
+
+        # Try to enhance it with the LLM if available
+        llm_label = @llm_labeler.generate_label(topic)
+
+        # For now, return the LLM label unless it is the fallback; otherwise the term label
+        llm_label != "LLM Topic #{topic.id}" ? llm_label : term_label
+      end
+    end
+  end
+end
@@ -0,0 +1,126 @@
+# frozen_string_literal: true
+
+module Topical
+  module Labelers
+    # Adapter to allow different LLM backends (red-candle, remote APIs, etc.)
+    class LLMAdapter
+      # Factory method to create the appropriate LLM client
+      def self.create(type: :auto, **options)
+        case type
+        when :red_candle
+          RedCandleAdapter.new(**options)
+        when :openai
+          # Future: OpenAIAdapter.new(**options)
+          raise NotImplementedError, "OpenAI adapter not yet implemented"
+        when :anthropic
+          # Future: AnthropicAdapter.new(**options)
+          raise NotImplementedError, "Anthropic adapter not yet implemented"
+        when :auto
+          # Try red-candle first, then fall back to others
+          begin
+            RedCandleAdapter.new(**options)
+          rescue LoadError
+            nil # No LLM available
+          end
+        else
+          raise ArgumentError, "Unknown LLM type: #{type}"
+        end
+      end
+    end
+
+    # Adapter for red-candle (local LLMs)
+    class RedCandleAdapter
+      def initialize(model: nil, **options)
+        require 'red-candle'
+
+        @model = model || default_model
+        @options = options
+        @llm = load_or_create_llm
+      end
+
+      def generate(prompt:, max_tokens: 100, temperature: 0.3, response_format: nil)
+        # Red-candle specific generation
+        response = @llm.generate(
+          prompt,
+          max_length: max_tokens,
+          temperature: temperature,
+          do_sample: temperature > 0
+        )
+
+        # Handle JSON response format if requested
+        if response_format && response_format[:type] == "json_object"
+          ensure_json_response(response)
+        else
+          response
+        end
+      end
+
+      def available?
+        true
+      end
+
+      private
+
+      def default_model
+        # Use a small, fast model by default for topic labeling
+        "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
+      end
+
+      def load_or_create_llm
+        # Create a new LLM instance with red-candle
+        RedCandle::Model.new(
+          model_id: @model,
+          model_type: :llama,
+          quantized: true
+        )
+      end
+
+      def ensure_json_response(response)
+        # Try to extract JSON from the response
+        begin
+          require 'json'
+          # Look for JSON-like content
+          json_match = response.match(/\{.*\}/m)
+          if json_match
+            JSON.parse(json_match[0])
+            json_match[0] # Return the JSON string if valid
+          else
+            # Generate a basic JSON response
+            generate_fallback_json(response)
+          end
+        rescue JSON::ParserError
+          generate_fallback_json(response)
+        end
+      end
+
+      def generate_fallback_json(text)
+        # Create a simple JSON object from a text response
+        require 'json'
+        label = text.lines.first&.strip || "Unknown"
+        {
+          label: label,
+          description: text,
+          confidence: 0.5
+        }.to_json
+      end
+    end
+
+    # Future adapter for remote LLMs
+    class RemoteAdapter
+      def initialize(api_key:, endpoint:, **options)
+        @api_key = api_key
+        @endpoint = endpoint
+        @options = options
+      end
+
+      def generate(prompt:, max_tokens: 100, temperature: 0.3, response_format: nil)
+        # Make the API call
+        raise NotImplementedError, "Remote LLM adapter coming soon"
+      end
+
+      def available?
+        !@api_key.nil?
+      end
+    end
+  end
+end
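
Finally, a hedged sketch of the factory in use. With type: :auto the factory returns nil when red-candle is not installed, so callers should guard for that; the prompt text is illustrative:

    llm = Topical::Labelers::LLMAdapter.create(type: :auto)

    if llm&.available?
      json = llm.generate(
        prompt: "Return JSON with a short label for: sourdough, starter, hydration",
        max_tokens: 40,
        response_format: { type: "json_object" } # returns a JSON string
      )
    end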