ragnar-cli 0.1.0.pre.3 → 0.1.0.pre.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,301 +0,0 @@
1
- require 'json'
2
-
3
- module Ragnar
4
- module TopicModeling
5
- class Engine
6
- attr_reader :topics, :clusterer, :term_extractor
7
-
8
# Builds an engine and its collaborators.
#
# @param min_cluster_size [Integer] stored and handed to the default clusterer
# @param min_samples [Integer] stored and handed to the default clusterer
# @param clustering_backend [Object, nil] clusterer to use instead of the default
# @param reduce_dimensions [Boolean] whether #fit reduces dimensions first
# @param n_components [Integer] target dimensionality for the reduction step
# @param labeling_method [Symbol] forwarded to TopicLabeler
# @param llm_client [Object, nil] forwarded to TopicLabeler
# @param verbose [Boolean] print progress messages when true
def initialize(min_cluster_size: 5, min_samples: 3, clustering_backend: nil,
               reduce_dimensions: true, n_components: 50,
               labeling_method: :hybrid, llm_client: nil, verbose: false)
  # The clustering settings must be stored first: build_default_clusterer reads them.
  @min_cluster_size, @min_samples = min_cluster_size, min_samples
  @reduce_dimensions, @n_components = reduce_dimensions, n_components
  @labeling_method = labeling_method
  @verbose = verbose

  @clusterer = clustering_backend || build_default_clusterer
  @term_extractor = TermExtractor.new
  @labeler = TopicLabeler.new(method: labeling_method, llm_client: llm_client)
  @topics = []
end
30
-
31
# Runs the pipeline: optional dimensionality reduction, clustering,
# topic construction, term extraction and labeling.
#
# @param embeddings [Array<Array<Numeric>>] one vector per document
# @param documents [Array<String>] document texts, parallel to embeddings
# @param metadata [Array<Hash>, nil] optional per-document metadata; defaults
#   to one empty hash per document
# @return [Array] the topics that were built (empty when no documents are given)
# @raise [ArgumentError] when embeddings and documents differ in length
def fit(embeddings:, documents:, metadata: nil)
  raise ArgumentError, "Embeddings and documents must have same length" unless embeddings.length == documents.length

  @embeddings = embeddings
  @documents = documents
  @metadata = metadata || Array.new(documents.length) { {} }

  # Drop state cached by a previous fit so #outliers never reports stale data.
  @outliers = nil
  @cluster_ids = nil

  # With nothing to cluster, @embeddings.first would be nil below; return
  # an empty topic list instead of crashing.
  if documents.empty?
    puts "No documents to cluster." if @verbose
    @cluster_ids = []
    return @topics = []
  end

  puts "Starting topic extraction..." if @verbose

  # Step 1: Optionally reduce dimensions for better clustering
  working_embeddings = @embeddings
  if @reduce_dimensions && @embeddings.first.length > @n_components
    puts " Reducing dimensions from #{@embeddings.first.length} to #{@n_components}..." if @verbose
    working_embeddings = reduce_dimensions(@embeddings)
  end

  # Step 2: Cluster embeddings
  puts " Clustering #{working_embeddings.length} documents..." if @verbose
  cluster_ids = @clusterer.fit_predict(working_embeddings)

  # Step 3: Build topics from clusters
  puts " Building topics..." if @verbose
  @topics = build_topics(cluster_ids)

  # Step 4: Extract terms for each topic
  puts " Extracting distinctive terms..." if @verbose
  extract_topic_terms

  # Step 5: Generate labels
  puts " Generating topic labels..." if @verbose
  generate_topic_labels

  puts "Found #{@topics.length} topics (plus #{count_outliers(cluster_ids)} outliers)" if @verbose

  @topics
end
67
-
68
# Assigns new embeddings to the topics found by #fit.
#
# Delegates to the clusterer's approximate_predict when it offers one,
# otherwise assigns each embedding to the nearest topic centroid.
#
# @param embeddings [Array<Array<Numeric>>] vectors to assign
# @param documents [Array<String>, nil] accepted but not used here
# @return [Array] one assignment per embedding
# @raise [RuntimeError] when no topics exist yet
def transform(embeddings:, documents: nil)
  raise "Must call fit before transform" if @topics.empty?

  return @clusterer.approximate_predict(embeddings) if @clusterer.respond_to?(:approximate_predict)

  assign_to_nearest_topic(embeddings)
end
80
-
81
# Looks up a topic by its id.
#
# @param topic_id [Object] id to match against each topic's #id
# @return [Object, nil] the first matching topic, or nil when none matches
def get_topic(topic_id)
  @topics.find do |candidate|
    candidate.id == topic_id
  end
end
84
-
85
# Documents whose cluster id is -1 (not assigned to any topic).
#
# Computed on every call rather than memoized, so the result always
# reflects the most recent clustering. Returns an empty array when no
# documents or cluster ids are available yet.
#
# @return [Array] the outlier documents, in their original order
def outliers
  return [] unless @documents && @cluster_ids

  @documents.select.with_index { |_, idx| @cluster_ids[idx] == -1 }
end
90
-
91
# Writes the topics and the engine configuration to a JSON file.
#
# @param path [String] destination file
# @return [Integer] whatever File.write returns (bytes written)
def save(path)
  config = {
    min_cluster_size: @min_cluster_size,
    min_samples: @min_samples,
    reduce_dimensions: @reduce_dimensions,
    n_components: @n_components,
    labeling_method: @labeling_method
  }
  payload = { topics: @topics.map(&:to_h), config: config }

  File.write(path, JSON.pretty_generate(payload))
end
104
-
105
# Rebuilds an engine from a file written by #save.
#
# JSON has no symbol type, so a labeling method saved as :hybrid comes back
# as "hybrid"; it is converted back to a Symbol before being passed to the
# constructor. Missing "config" or "topics" sections are treated as empty.
#
# @param path [String] file to read
# @return [Engine] engine with its topics restored via Topic.from_h
def self.load(path)
  data = JSON.parse(File.read(path), symbolize_names: true)

  config = (data[:config] || {}).dup
  labeling = config[:labeling_method]
  config[:labeling_method] = labeling.to_sym if labeling.respond_to?(:to_sym)

  engine = new(**config)
  restored = (data[:topics] || []).map { |t| Topic.from_h(t) }
  engine.instance_variable_set(:@topics, restored)
  engine
end
112
-
113
- private
114
-
115
# Creates the default HDBSCAN clusterer from the ClusterKit gem, using the
# configured min_cluster_size / min_samples and the euclidean metric.
#
# @return [Object] a ClusterKit::Clustering::HDBSCAN instance
# @raise [RuntimeError] when the clusterkit gem cannot be loaded
def build_default_clusterer
  require 'clusterkit'

  hdbscan_options = {
    min_cluster_size: @min_cluster_size,
    min_samples: @min_samples,
    metric: 'euclidean'
  }
  ClusterKit::Clustering::HDBSCAN.new(**hdbscan_options)
rescue LoadError
  raise "ClusterKit required for topic modeling. Add 'gem \"clusterkit\"' to your Gemfile."
end
127
-
128
# Reduces embeddings with ClusterKit's UMAP.
#
# Embeddings rejected by validate_embeddings_for_umap are left out of the
# UMAP input and re-inserted afterwards as all-zero vectors, so the result
# stays index-aligned with the input. n_components is capped by the
# configured value, by (valid samples - 1) and by 50; n_neighbors by 15
# and (valid samples - 1).
#
# @param embeddings [Array] one vector per document
# @return [Array] reduced vectors, or the original embeddings when the
#   clusterkit gem cannot be loaded
# @raise [RuntimeError] when no embedding is valid, or with a troubleshooting
#   message when UMAP fails with an "index out of bounds" error
def reduce_dimensions(embeddings)
  require 'clusterkit'

  # Validate embeddings before UMAP
  valid_embeddings, invalid_indices = validate_embeddings_for_umap(embeddings)

  if valid_embeddings.empty?
    raise "No valid embeddings for dimensionality reduction.\n\n" \
          "All embeddings contain invalid values (NaN, Infinity, or non-numeric).\n" \
          "Try running without dimensionality reduction:\n" \
          " ragnar topics --reduce-dimensions false"
  end

  if invalid_indices.any? && @verbose
    puts " ⚠️ Warning: #{invalid_indices.size} embeddings with invalid values removed"
  end

  begin
    # Adjust parameters based on data size
    n_samples = valid_embeddings.size
    n_components = [@n_components, n_samples - 1, 50].min
    n_neighbors = [15, n_samples - 1].min

    if @verbose && n_components != @n_components
      puts " Adjusted n_components to #{n_components} (was #{@n_components}) for #{n_samples} samples"
    end

    umap = ClusterKit::Dimensionality::UMAP.new(
      n_components: n_components,
      n_neighbors: n_neighbors,
      random_seed: 42 # For reproducibility
    )

    reduced = umap.fit_transform(valid_embeddings)

    if invalid_indices.empty?
      reduced
    else
      # Hash lookup keeps the rebuild linear; Array#include? inside the loop
      # rescanned the index list for every embedding.
      skipped = invalid_indices.each_with_object({}) { |i, seen| seen[i] = true }
      cursor = -1
      Array.new(embeddings.size) do |i|
        # Zero vectors stand in for the embeddings that were left out.
        skipped.key?(i) ? Array.new(n_components, 0.0) : reduced[cursor += 1]
      end
    end
  rescue => e
    # Only the known UMAP failure gets the long explanation; anything else
    # is re-raised untouched.
    raise unless e.message.include?("index out of bounds")

    raise [
      "\n❌ Dimensionality reduction failed\n\n",
      "The UMAP algorithm encountered an error with your data.\n\n",
      "This typically happens with:\n",
      " • Embeddings containing invalid values\n",
      " • Too few samples (#{valid_embeddings.size} valid embeddings)\n",
      " • Incompatible parameters\n\n",
      "Solutions:\n",
      " 1. Run without dimensionality reduction:\n",
      " ragnar topics --reduce-dimensions false\n\n",
      " 2. Use fewer dimensions:\n",
      " ragnar topics --n-components 2\n\n",
      " 3. Re-index your documents:\n",
      " ragnar index <path> --force\n"
    ].join
  end
rescue LoadError
  puts "Warning: Dimensionality reduction requires ClusterKit. Using original embeddings." if @verbose
  embeddings
end
205
-
206
- private
207
-
208
# Splits embeddings into those usable by UMAP and the positions of the rest.
#
# An embedding is usable when it is an Array whose elements are all finite
# Numerics (an empty Array therefore counts as usable).
#
# @param embeddings [Array] candidate vectors
# @return [Array(Array, Array<Integer>)] usable embeddings in input order,
#   and the indices of the rejected ones
def validate_embeddings_for_umap(embeddings)
  indexed = embeddings.each_with_index.to_a

  accepted, rejected = indexed.partition do |vector, _position|
    vector.is_a?(Array) && vector.all? { |x| x.is_a?(Numeric) && x.finite? }
  end

  [accepted.map(&:first), rejected.map(&:last)]
end
223
-
224
# Turns per-document cluster ids into Topic objects.
#
# Remembers the cluster ids, groups document indices by cluster while
# skipping id -1 (outliers), and builds one Topic per cluster from the
# matching documents, embeddings and metadata.
#
# @param cluster_ids [Array<Integer>] cluster id for each document
# @return [Array<Topic>] topics sorted by id
def build_topics(cluster_ids)
  @cluster_ids = cluster_ids

  members = Hash.new { |groups, label| groups[label] = [] }
  cluster_ids.each_with_index do |label, doc_idx|
    members[label] << doc_idx unless label == -1
  end

  topics = members.map do |label, doc_indices|
    Topic.new(
      id: label,
      document_indices: doc_indices,
      documents: doc_indices.map { |i| @documents[i] },
      embeddings: doc_indices.map { |i| @embeddings[i] },
      metadata: doc_indices.map { |i| @metadata[i] }
    )
  end

  topics.sort_by(&:id)
end
246
-
247
# Asks the term extractor for the 20 most distinctive terms of every topic
# (relative to the full document set) and stores them on the topic.
#
# The extractor receives the document arrays directly; no joined corpus
# string is built, since nothing consumed it.
#
# @return [Array] the topics that were processed
def extract_topic_terms
  @topics.each do |topic|
    terms = @term_extractor.extract_distinctive_terms(
      topic_docs: topic.documents,
      all_docs: @documents,
      top_n: 20
    )

    topic.set_terms(terms)
  end
end
264
-
265
# Labels every topic through the configured labeler.
#
# The labeler sees the topic, its terms and its first three documents.
# The label is set through Topic#set_label; description, confidence and
# themes are written straight into the topic's instance variables
# (description and themes only when the labeler returned them).
#
# @return [Array] the topics that were processed
def generate_topic_labels
  @topics.each do |topic|
    sample_docs = topic.documents.first(3)
    labeling = @labeler.generate_label(topic: topic, terms: topic.terms, documents: sample_docs)

    description = labeling[:description]
    themes = labeling[:themes]

    topic.set_label(labeling[:label])
    topic.instance_variable_set(:@description, description) if description
    topic.instance_variable_set(:@label_confidence, labeling[:confidence])
    topic.instance_variable_set(:@themes, themes) if themes
  end
end
280
-
281
# Counts the documents the clusterer marked with id -1.
#
# @param cluster_ids [Array<Integer>] cluster id for each document
# @return [Integer] number of entries equal to -1
def count_outliers(cluster_ids)
  cluster_ids.count(-1)
end
284
-
285
# Nearest-centroid assignment: each embedding gets the id of the topic
# whose centroid is closest in Euclidean distance (first topic wins ties).
#
# @param embeddings [Array<Array<Numeric>>] vectors to assign
# @return [Array] one topic id per embedding
def assign_to_nearest_topic(embeddings)
  centroids = @topics.map(&:centroid)

  embeddings.map do |vector|
    distances = centroids.map do |center|
      squared = vector.zip(center).sum { |a, b| (a - b)**2 }
      Math.sqrt(squared)
    end

    closest = distances.index(distances.min)
    @topics[closest].id
  end
end
299
- end
300
- end
301
- end
@@ -1,300 +0,0 @@
1
- # Separate strategy classes for different labeling approaches
2
- module Ragnar
3
- module TopicModeling
4
- module LabelingStrategies
5
-
6
- # Base strategy class
7
- class Base
8
# Strategy entry point; concrete strategies must override it.
#
# @param topic [Object] the topic being labeled
# @param terms [Array] distinctive terms for the topic
# @param documents [Array<String>] sample documents
# @raise [NotImplementedError] always (a ScriptError, so a bare `rescue`
#   does not catch it)
def generate_label(topic:, terms:, documents:)
  raise(NotImplementedError, "Subclasses must implement generate_label")
end
11
-
12
- protected
13
-
14
# Picks up to k documents to represent a topic.
#
# Currently just the first k; a list that already has k or fewer entries
# is returned as-is (same object).
#
# @param documents [Array] candidate documents
# @param k [Integer] maximum number of documents to return
# @return [Array] at most k documents
def select_representative_docs(documents, k: 3)
  documents.length > k ? documents.first(k) : documents
end
21
-
22
# Turns a term such as "machine_learning" into "Machine Learning".
#
# Words are split on whitespace, underscores and hyphens. Runs of
# separators and leading separators are collapsed so the result never
# contains empty words (which used to show up as stray spaces).
#
# @param phrase [#to_s] term to format
# @return [String] space-separated, capitalized words
def capitalize_phrase(phrase)
  phrase.to_s
        .split(/[\s_-]+/)
        .reject(&:empty?)
        .map(&:capitalize)
        .join(' ')
end
25
- end
26
-
27
- # Fast term-based labeling using c-TF-IDF terms
28
- class TermBased < Base
29
# Builds a label from the topic's most distinctive terms, without an LLM.
#
# Terms may be plain strings or [word, score] pairs (the form
# calculate_confidence also understands); only the words are used for the
# label and description. The label joins the first two of the top three
# terms that are longer than three characters, or falls back to a single
# term.
#
# @param topic [Object] unused here, part of the strategy interface
# @param terms [Array<String>, Array<Array>] distinctive terms, best first
# @param documents [Array<String>] unused here, part of the strategy interface
# @return [Hash] :label, :description, :method and :confidence
def generate_label(topic:, terms:, documents:)
  words = Array(terms).map { |term| (term.is_a?(Array) ? term.first : term).to_s }

  if words.empty?
    # Carry :method and :confidence here too, so callers that do arithmetic
    # on the confidence do not trip over nil.
    return {
      label: "Empty Topic",
      description: "No terms found",
      method: :term_based,
      confidence: 0.0
    }
  end

  # Take top distinctive terms
  label_terms = words.first(3).select { |word| word.length > 3 }

  label = if label_terms.length >= 2
            "#{capitalize_phrase(label_terms[0])} & #{capitalize_phrase(label_terms[1])}"
          else
            capitalize_phrase(label_terms.first || words.first)
          end

  {
    label: label,
    description: "Documents about #{words.first(5).join(', ')}",
    method: :term_based,
    confidence: calculate_confidence(terms)
  }
end
48
-
49
- private
50
-
51
# Heuristic confidence in the range up to 1.0.
#
# For [word, score] pairs: the sum of the top five scores divided by 5.0.
# For plain terms: the number of terms divided by 20.0.
# Either value is capped at 1.0; an empty list scores 0.0.
#
# @param terms [Array] plain terms or [word, score] pairs
# @return [Float] confidence estimate
def calculate_confidence(terms)
  return 0.0 if terms.empty?

  scored_pairs = terms.is_a?(Array) && terms.first.is_a?(Array)
  estimate = if scored_pairs
               terms.first(5).map(&:last).sum / 5.0
             else
               terms.length / 20.0
             end

  [estimate, 1.0].min
end
65
- end
66
-
67
- # Quality LLM-based labeling
68
- class LLMBased < Base
69
# @param llm_client [Object, nil] client used for generation; when nil,
#   one is looked up the first time availability is checked
def initialize(llm_client: nil)
  @llm_client = llm_client
end
72
-
73
# Labels a topic with the LLM, using up to three sample documents.
#
# Falls back to the term-based strategy when no LLM is available or when
# anything in the LLM path raises a StandardError (the error message is
# printed only when ENV['DEBUG'] is set).
#
# @param topic [Object] the topic being labeled
# @param terms [Array] distinctive terms for the topic
# @param documents [Array<String>] candidate sample documents
# @return [Hash] :label, :description, :themes, :method and :confidence on
#   the LLM path; the term-based result otherwise
def generate_label(topic:, terms:, documents:)
  term_based = -> { TermBased.new.generate_label(topic: topic, terms: terms, documents: documents) }

  return term_based.call unless llm_available?

  samples = select_representative_docs(documents, k: 3)
  analysis = analyze_with_llm(samples, terms)

  {
    label: analysis[:label],
    description: analysis[:description],
    themes: analysis[:themes],
    method: :llm_based,
    confidence: analysis[:confidence] || 0.8
  }
rescue => e
  puts "LLM labeling failed: #{e.message}" if ENV['DEBUG']
  term_based.call
end
97
-
98
- private
99
-
100
# Whether an LLM client can be used.
#
# A client supplied to the constructor is trusted as-is. Otherwise an
# adapter is auto-created and kept only if it reports itself available;
# an unavailable adapter is not cached, so later calls do not mistake it
# for a working client.
#
# @return [Boolean] true when @llm_client is usable
def llm_available?
  return true if @llm_client

  require_relative 'llm_adapter'
  candidate = LLMAdapter.create(type: :auto)
  return false unless candidate && candidate.available?

  @llm_client = candidate
  true
rescue LoadError, StandardError => e
  puts "LLM not available: #{e.message}" if ENV['DEBUG']
  false
end
113
-
114
# Sends the analysis prompt to the LLM and normalizes its JSON answer.
#
# The label is cleaned once and reused for the fallback description, themes
# are always returned as an array, and confidence is kept within 0.0..1.0
# (the range the prompt asks for); a missing or non-numeric confidence
# becomes 0.8.
#
# @param documents [Array<String>] sample documents for the prompt
# @param terms [Array] distinctive terms for the prompt
# @return [Hash] :label, :description, :themes and :confidence
# @raise [JSON::ParserError] when the response is not valid JSON
def analyze_with_llm(documents, terms)
  prompt = build_analysis_prompt(documents, terms)

  response = @llm_client.generate(
    prompt: prompt,
    max_tokens: 150,
    temperature: 0.3,
    response_format: { type: "json_object" }
  )

  result = JSON.parse(response, symbolize_names: true)

  label = clean_label(result[:label])
  confidence = result[:confidence]
  confidence = confidence.is_a?(Numeric) ? confidence.clamp(0.0, 1.0) : 0.8

  {
    label: label,
    description: result[:description] || "Topic about #{label}",
    themes: Array(result[:themes]),
    confidence: confidence
  }
end
135
-
136
# Builds the prompt asking the LLM for a JSON summary of a cluster.
#
# Each document is previewed with at most 300 characters (followed by
# "..." when truncated); the first ten terms are listed.
#
# @param documents [Array<#to_s>] sample documents
# @param terms [Array] distinctive terms, best first
# @return [String] the prompt text
def build_analysis_prompt(documents, terms)
  doc_samples = documents.each_with_index.map do |doc, i|
    text = doc.to_s
    # [0, 300] keeps exactly 300 characters; [0..300] kept 301.
    preview = text.length > 300 ? "#{text[0, 300]}..." : text
    "Document #{i + 1}:\n#{preview}"
  end.join("\n\n")

  <<~PROMPT
    Analyze this cluster of related documents and provide a structured summary.

    Distinctive terms found: #{terms.first(10).join(', ')}

    Sample documents:
    #{doc_samples}

    Provide a JSON response with:
    {
    "label": "A 2-4 word topic label",
    "description": "One sentence describing what connects these documents",
    "themes": ["theme1", "theme2", "theme3"],
    "confidence": 0.0-1.0 score of how coherent this topic is
    }

    Focus on what meaningfully connects these documents, not just common words.
  PROMPT
end
161
-
162
# Normalizes a label returned by the LLM.
#
# Keeps only the first line, trims whitespace and one surrounding quote
# character on each side, and limits the result to 50 characters (47 plus
# "..."). A nil or blank label becomes "Unknown Topic".
#
# @param label [#to_s, nil] raw label
# @return [String] cleaned label, never empty and at most 50 characters
def clean_label(label)
  return "Unknown Topic" unless label

  first_line = label.to_s.strip.lines.first.to_s.strip
  cleaned = first_line.gsub(/\A["']|["']\z/, '').strip
  return "Unknown Topic" if cleaned.empty?

  # 47 characters + "..." = 50; [0..47] kept 48 and overshot the limit.
  cleaned.length > 50 ? "#{cleaned[0, 47]}..." : cleaned
end
176
- end
177
-
178
- # Hybrid approach - uses terms to guide LLM for efficiency
179
- class Hybrid < Base
180
# @param llm_client [Object, nil] client used for the enhancement step;
#   when nil, one is looked up the first time availability is checked
def initialize(llm_client: nil)
  # The term-based strategy always runs first and doubles as the fallback.
  @term_strategy = TermBased.new
  @llm_client = llm_client
end
184
-
185
# Labels a topic with the term-based strategy, then refines it with one
# focused LLM call when an LLM is available.
#
# The returned confidence is the mean of the term-based confidence (0.0
# when the term result carries none) and the LLM confidence (0.5 when
# absent). If the LLM step raises, the term-based result is returned with
# method :hybrid_fallback; errors from the term strategy itself propagate.
#
# @param topic [Object] the topic being labeled
# @param terms [Array] distinctive terms for the topic
# @param documents [Array<String>] sample documents
# @return [Hash] :label, :description, :method, :confidence, :term_label
#   and :themes on the LLM path; the term result plus :method otherwise
def generate_label(topic:, terms:, documents:)
  # Start with term-based analysis
  term_result = @term_strategy.generate_label(
    topic: topic,
    terms: terms,
    documents: documents
  )

  # If no LLM available, return term-based result
  return term_result.merge(method: :hybrid_fallback) unless llm_available?

  begin
    enhanced = enhance_with_llm(term_result, terms, documents)
  rescue => e
    puts "Hybrid enhancement failed: #{e.message}" if ENV['DEBUG']
    return term_result.merge(method: :hybrid_fallback)
  end

  {
    label: enhanced[:label] || term_result[:label],
    description: enhanced[:description] || term_result[:description],
    method: :hybrid,
    # to_f: the "Empty Topic" term result may have no confidence at all.
    confidence: (term_result[:confidence].to_f + (enhanced[:confidence] || 0.5)) / 2,
    term_label: term_result[:label], # Keep original for comparison
    themes: enhanced[:themes]
  }
end
214
-
215
- private
216
-
217
# Whether an LLM client can be used for the enhancement step.
#
# A client supplied to the constructor is trusted as-is. Otherwise an
# adapter is auto-created and kept only if it reports itself available;
# an unavailable adapter is not cached, so later calls do not mistake it
# for a working client.
#
# @return [Boolean] true when @llm_client is usable
def llm_available?
  return true if @llm_client

  require_relative 'llm_adapter'
  candidate = LLMAdapter.create(type: :auto)
  return false unless candidate && candidate.available?

  @llm_client = candidate
  true
rescue LoadError, StandardError => e
  puts "LLM not available for hybrid: #{e.message}" if ENV['DEBUG']
  false
end
229
-
230
# Asks the LLM to improve on the term-based label.
#
# Builds a short prompt from the term label, the terms and the first
# document, requests up to 100 tokens at temperature 0.3, and parses the
# line-based answer.
#
# @param term_result [Hash] term-based result; its :label seeds the prompt
# @param terms [Array] distinctive terms for the topic
# @param documents [Array<String>] sample documents; only the first is used
# @return [Hash] whatever parse_enhancement_response extracts
def enhance_with_llm(term_result, terms, documents)
  llm_options = {
    prompt: build_enhancement_prompt(term_result[:label], terms, documents.first),
    max_tokens: 100,
    temperature: 0.3
  }

  parse_enhancement_response(@llm_client.generate(**llm_options))
end
243
-
244
# Builds the short prompt used to refine a term-based label.
#
# The sample document is previewed with at most 200 characters (followed
# by "..." when truncated); a missing document is rendered as an empty
# preview instead of raising.
#
# @param term_label [String] label produced by the term-based strategy
# @param terms [Array] distinctive terms; the first eight are listed
# @param sample_doc [#to_s, nil] one representative document
# @return [String] the prompt text
def build_enhancement_prompt(term_label, terms, sample_doc)
  text = sample_doc.to_s
  # [0, 200] keeps exactly 200 characters; [0..200] kept 201.
  doc_preview = text.length > 200 ? "#{text[0, 200]}..." : text

  <<~PROMPT
    Current topic label based on terms: "#{term_label}"
    Key terms: #{terms.first(8).join(', ')}

    Sample document:
    #{doc_preview}

    Provide a better topic label if possible (2-4 words), or confirm the current one.
    Also provide a one-sentence description.

    Format:
    Label: [your label]
    Description: [one sentence]
    Themes: [comma-separated list]
  PROMPT
end
263
-
264
# Parses the "Label: / Description: / Themes:" answer of the enhancement
# prompt.
#
# Field names are matched case-insensitively and may be indented or wrapped
# in markdown emphasis. Fields with an empty value are ignored, so a bare
# "Label:" line does not produce an empty label. Confidence is 0.7 when a
# label was found and 0.3 otherwise.
#
# @param response [#to_s, nil] raw LLM output
# @return [Hash] :confidence plus any of :label, :description, :themes
def parse_enhancement_response(response)
  result = {}

  response.to_s.each_line do |line|
    field, value = line.strip.split(":", 2)
    next unless value

    # Drop markdown emphasis the model may wrap around names or values.
    value = value.sub(/\A\*+/, '').strip
    next if value.empty?

    case field.gsub(/[^A-Za-z]/, '').downcase
    when "label"
      result[:label] = value
    when "description"
      result[:description] = value
    when "themes"
      result[:themes] = value.split(",").map(&:strip).reject(&:empty?)
    end
  end

  result[:confidence] = result[:label] ? 0.7 : 0.3
  result
end
282
- end
283
-
284
- # Factory method to get appropriate strategy
285
# Factory for labeling strategies.
#
# :fast / :term_based / :terms   -> TermBased
# :quality / :llm_based / :llm   -> LLMBased
# anything else (including :hybrid, :auto, :smart) -> Hybrid
#
# @param method [#to_sym] strategy name
# @param llm_client [Object, nil] client handed to the LLM-backed strategies
# @return [Object] a strategy instance
def self.create(method, llm_client: nil)
  case method.to_sym
  when :fast, :term_based, :terms then TermBased.new
  when :quality, :llm_based, :llm then LLMBased.new(llm_client: llm_client)
  else Hybrid.new(llm_client: llm_client) # hybrid is both a named choice and the default
  end
end
298
- end
299
- end
300
- end