ragnar-cli 0.1.0.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,300 @@
+ # Separate strategy classes for different labeling approaches
+ require 'json'
+
+ module Ragnar
+   module TopicModeling
+     module LabelingStrategies
+
+       # Base strategy class
+       class Base
+         def generate_label(topic:, terms:, documents:)
+           raise NotImplementedError, "Subclasses must implement generate_label"
+         end
+
+         protected
+
+         def select_representative_docs(documents, k: 3)
+           return documents if documents.length <= k
+
+           # For now, just take the first k
+           # Could be improved to select the most central docs
+           documents.first(k)
+         end
+
+         def capitalize_phrase(phrase)
+           phrase.split(/[\s_-]/).map(&:capitalize).join(' ')
+         end
+
+         def term_words(terms)
+           # Terms may arrive as plain words or as [word, score] pairs
+           terms.map { |t| t.is_a?(Array) ? t.first : t }
+         end
+       end
+
+       # Fast term-based labeling using c-TF-IDF terms
+       class TermBased < Base
+         def generate_label(topic:, terms:, documents:)
+           if terms.empty?
+             return { label: "Empty Topic", description: "No terms found",
+                      method: :term_based, confidence: 0.0 }
+           end
+
+           # Take the top distinctive terms
+           words = term_words(terms)
+           label_terms = words.first(3).select { |t| t.length > 3 }
+
+           label = if label_terms.length >= 2
+             "#{capitalize_phrase(label_terms[0])} & #{capitalize_phrase(label_terms[1])}"
+           else
+             capitalize_phrase(label_terms.first || words.first)
+           end
+
+           {
+             label: label,
+             description: "Documents about #{words.first(5).join(', ')}",
+             method: :term_based,
+             confidence: calculate_confidence(terms)
+           }
+         end
+
+         private
+
+         def calculate_confidence(terms)
+           # Simple heuristic: more distinctive terms = higher confidence
+           return 0.0 if terms.empty?
+
+           if terms.first.is_a?(Array)
+             # Terms are [word, score] pairs: average the top scores
+             top = terms.first(5)
+             avg_score = top.map(&:last).sum / top.length.to_f
+             [avg_score, 1.0].min
+           else
+             # Just have words, so use the count as a proxy
+             [terms.length / 20.0, 1.0].min
+           end
+         end
+       end
+
+       # Quality LLM-based labeling
+       class LLMBased < Base
+         def initialize(llm_client: nil)
+           @llm_client = llm_client
+         end
+
+         def generate_label(topic:, terms:, documents:)
+           unless llm_available?
+             # Fall back to term-based labeling if no LLM is available
+             return TermBased.new.generate_label(topic: topic, terms: terms, documents: documents)
+           end
+
+           # Select the best documents to send to the LLM
+           sample_docs = select_representative_docs(documents, k: 3)
+
+           # Generate a comprehensive analysis
+           response = analyze_with_llm(sample_docs, terms)
+
+           {
+             label: response[:label],
+             description: response[:description],
+             themes: response[:themes],
+             method: :llm_based,
+             confidence: response[:confidence] || 0.8
+           }
+         rescue => e
+           # Fall back on error
+           puts "LLM labeling failed: #{e.message}" if ENV['DEBUG']
+           TermBased.new.generate_label(topic: topic, terms: terms, documents: documents)
+         end
+
+         private
+
+         def llm_available?
+           return true if @llm_client
+
+           # Try to create an LLM adapter
+           begin
+             require_relative 'llm_adapter'
+             @llm_client = LLMAdapter.create(type: :auto)
+             @llm_client && @llm_client.available?
+           rescue LoadError, StandardError => e
+             puts "LLM not available: #{e.message}" if ENV['DEBUG']
+             false
+           end
+         end
+
+         def analyze_with_llm(documents, terms)
+           prompt = build_analysis_prompt(documents, terms)
+
+           response = @llm_client.generate(
+             prompt: prompt,
+             max_tokens: 150,
+             temperature: 0.3,
+             response_format: { type: "json_object" }
+           )
+
+           # Parse the JSON response
+           result = JSON.parse(response, symbolize_names: true)
+
+           # Validate and clean
+           {
+             label: clean_label(result[:label]),
+             description: result[:description] || "Topic about #{result[:label]}",
+             themes: result[:themes] || [],
+             confidence: result[:confidence] || 0.8
+           }
+         end
+
+         def build_analysis_prompt(documents, terms)
+           doc_samples = documents.map.with_index do |doc, i|
+             preview = doc.length > 300 ? "#{doc[0, 300]}..." : doc
+             "Document #{i + 1}:\n#{preview}"
+           end.join("\n\n")
+
+           <<~PROMPT
+             Analyze this cluster of related documents and provide a structured summary.
+
+             Distinctive terms found: #{term_words(terms).first(10).join(', ')}
+
+             Sample documents:
+             #{doc_samples}
+
+             Provide a JSON response with:
+             {
+               "label": "A 2-4 word topic label",
+               "description": "One sentence describing what connects these documents",
+               "themes": ["theme1", "theme2", "theme3"],
+               "confidence": 0.0-1.0 score of how coherent this topic is
+             }
+
+             Focus on what meaningfully connects these documents, not just common words.
+           PROMPT
+         end
+
+         def clean_label(label)
+           return "Unknown Topic" unless label
+
+           # Remove surrounding quotes, trim, and keep only the first line
+           cleaned = label.to_s.strip.gsub(/^["']|["']$/, '')
+           cleaned = cleaned.split("\n").first if cleaned.include?("\n")
+
+           # Limit to a reasonable length
+           if cleaned.length > 50
+             "#{cleaned[0, 47]}..."
+           else
+             cleaned
+           end
+         end
+       end
+
+       # Hybrid approach: uses terms to guide the LLM for efficiency
+       class Hybrid < Base
+         def initialize(llm_client: nil)
+           @llm_client = llm_client
+           @term_strategy = TermBased.new
+         end
+
+         def generate_label(topic:, terms:, documents:)
+           # Start with term-based analysis
+           term_result = @term_strategy.generate_label(
+             topic: topic,
+             terms: terms,
+             documents: documents
+           )
+
+           # If no LLM is available, return the term-based result
+           unless llm_available?
+             return term_result.merge(method: :hybrid_fallback)
+           end
+
+           # Enhance with a focused LLM call
+           enhanced = enhance_with_llm(term_result, terms, documents)
+
+           {
+             label: enhanced[:label] || term_result[:label],
+             description: enhanced[:description] || term_result[:description],
+             method: :hybrid,
+             confidence: (term_result[:confidence] + (enhanced[:confidence] || 0.5)) / 2.0,
+             term_label: term_result[:label], # Keep the original for comparison
+             themes: enhanced[:themes]
+           }
+         rescue => e
+           # Fall back to the term-based result
+           puts "Hybrid enhancement failed: #{e.message}" if ENV['DEBUG']
+           term_result.merge(method: :hybrid_fallback)
+         end
+
+         private
+
+         def llm_available?
+           return true if @llm_client
+
+           begin
+             require_relative 'llm_adapter'
+             @llm_client = LLMAdapter.create(type: :auto)
+             @llm_client && @llm_client.available?
+           rescue LoadError, StandardError => e
+             puts "LLM not available for hybrid: #{e.message}" if ENV['DEBUG']
+             false
+           end
+         end
+
+         def enhance_with_llm(term_result, terms, documents)
+           # Lighter-weight prompt using the term analysis as a starting point
+           prompt = build_enhancement_prompt(term_result[:label], terms, documents.first)
+
+           response = @llm_client.generate(
+             prompt: prompt,
+             max_tokens: 100,
+             temperature: 0.3
+           )
+
+           # Parse the response (simpler format for speed)
+           parse_enhancement_response(response)
+         end
+
+         def build_enhancement_prompt(term_label, terms, sample_doc)
+           doc_preview = sample_doc.length > 200 ? "#{sample_doc[0, 200]}..." : sample_doc
+
+           <<~PROMPT
+             Current topic label based on terms: "#{term_label}"
+             Key terms: #{term_words(terms).first(8).join(', ')}
+
+             Sample document:
+             #{doc_preview}
+
+             Provide a better topic label if possible (2-4 words), or confirm the current one.
+             Also provide a one-sentence description.
+
+             Format:
+             Label: [your label]
+             Description: [one sentence]
+             Themes: [comma-separated list]
+           PROMPT
+         end
+
+         def parse_enhancement_response(response)
+           result = {}
+
+           # Simple line-based parsing
+           response.lines.each do |line|
+             if line.start_with?("Label:")
+               result[:label] = line.sub("Label:", "").strip
+             elsif line.start_with?("Description:")
+               result[:description] = line.sub("Description:", "").strip
+             elsif line.start_with?("Themes:")
+               themes_str = line.sub("Themes:", "").strip
+               result[:themes] = themes_str.split(",").map(&:strip)
+             end
+           end
+
+           result[:confidence] = result[:label] ? 0.7 : 0.3
+           result
+         end
+       end
+
+       # Factory method to get the appropriate strategy
+       def self.create(method, llm_client: nil)
+         case method.to_sym
+         when :fast, :term_based, :terms
+           TermBased.new
+         when :quality, :llm_based, :llm
+           LLMBased.new(llm_client: llm_client)
+         when :hybrid, :auto, :smart
+           Hybrid.new(llm_client: llm_client)
+         else
+           # Default to hybrid
+           Hybrid.new(llm_client: llm_client)
+         end
+       end
+     end
+   end
+ end
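
For orientation, a minimal usage sketch of the strategy factory above. The document strings and terms are illustrative, and `require 'ragnar'` is assumed to be the gem's entry point; `topic` is unused by the term-based strategy, so `nil` is passed:

```ruby
require 'ragnar' # assumed entry point for the ragnar-cli gem

documents = [
  "Ruby makes metaprogramming approachable with define_method.",
  "Rails relies on Ruby metaprogramming throughout ActiveRecord.",
  "method_missing enables flexible DSLs in Ruby."
]
terms = ["metaprogramming", "ruby", "rails", "dsl"]

# :fast => TermBased, :quality => LLMBased, anything else => Hybrid
strategy = Ragnar::TopicModeling::LabelingStrategies.create(:fast)
result = strategy.generate_label(topic: nil, terms: terms, documents: documents)

result[:label]      # => "Metaprogramming & Ruby"
result[:method]     # => :term_based
result[:confidence] # => 0.2 (4 plain terms / 20.0, capped at 1.0)
```
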
@@ -0,0 +1,131 @@
+ # Adapter to allow different LLM backends (red-candle, remote APIs, etc.)
+ require 'json'
+
+ module Ragnar
+   module TopicModeling
+     class LLMAdapter
+       # Factory method to create the appropriate LLM client
+       def self.create(type: :auto, **options)
+         case type
+         when :red_candle
+           RedCandleAdapter.new(**options)
+         when :openai
+           # Future: OpenAIAdapter.new(**options)
+           raise NotImplementedError, "OpenAI adapter not yet implemented"
+         when :anthropic
+           # Future: AnthropicAdapter.new(**options)
+           raise NotImplementedError, "Anthropic adapter not yet implemented"
+         when :auto
+           # Try red-candle first, then fall back to others
+           begin
+             RedCandleAdapter.new(**options)
+           rescue LoadError
+             nil # No LLM available
+           end
+         else
+           raise ArgumentError, "Unknown LLM type: #{type}"
+         end
+       end
+     end
+
+     # Adapter for red-candle (local LLMs)
+     class RedCandleAdapter
+       def initialize(model: nil, **options)
+         require 'candle'
+
+         @model = model || default_model
+         @options = options
+         @llm = load_or_create_llm
+       end
+
+       def generate(prompt:, max_tokens: 100, temperature: 0.3, response_format: nil)
+         # Red-candle-specific generation
+         response = @llm.generate(
+           prompt,
+           max_length: max_tokens,
+           temperature: temperature,
+           do_sample: temperature > 0
+         )
+
+         # Handle a JSON response format if requested
+         if response_format && response_format[:type] == "json_object"
+           ensure_json_response(response)
+         else
+           response
+         end
+       end
+
+       def available?
+         true
+       end
+
+       private
+
+       def default_model
+         # Use a small, fast model by default for topic labeling
+         "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
+       end
+
+       def load_or_create_llm
+         # Reuse an instance if Ragnar's LLM manager has already loaded one
+         if defined?(Ragnar::LLMManager)
+           begin
+             return Ragnar::LLMManager.instance.get_llm(@model)
+           rescue StandardError
+             # Fall through and create a new instance
+           end
+         end
+
+         # Create a new LLM instance
+         Candle::Model.new(
+           model_id: @model,
+           model_type: :llama,
+           quantized: true
+         )
+       end
+
+       def ensure_json_response(response)
+         # Try to extract JSON from the response
+         begin
+           # Look for JSON-like content
+           json_match = response.match(/\{.*\}/m)
+           if json_match
+             JSON.parse(json_match[0]) # Validate; raises JSON::ParserError if malformed
+             json_match[0] # Return the JSON string if valid
+           else
+             # Generate a basic JSON response
+             generate_fallback_json(response)
+           end
+         rescue JSON::ParserError
+           generate_fallback_json(response)
+         end
+       end
+
+       def generate_fallback_json(text)
+         # Create a simple JSON object from a plain-text response
+         label = text.lines.first&.strip || "Unknown"
+         {
+           label: label,
+           description: text,
+           confidence: 0.5
+         }.to_json
+       end
+     end
+
+     # Future adapter for remote LLMs
+     class RemoteAdapter
+       def initialize(api_key:, endpoint:, **options)
+         @api_key = api_key
+         @endpoint = endpoint
+         @options = options
+       end
+
+       def generate(prompt:, max_tokens: 100, temperature: 0.3, response_format: nil)
+         # Make an API call
+         raise NotImplementedError, "Remote LLM adapter coming soon"
+       end
+
+       def available?
+         !@api_key.nil?
+       end
+     end
+   end
+ end
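
A sketch of how the adapter factory above is meant to be driven. With `type: :auto` the factory returns `nil` when the optional red-candle gem cannot be loaded, so callers must nil-check before use:

```ruby
llm = Ragnar::TopicModeling::LLMAdapter.create(type: :auto)

if llm && llm.available?
  text = llm.generate(
    prompt: "Summarize in one sentence: local LLM inference in Ruby.",
    max_tokens: 50,
    temperature: 0.3
  )
  puts text
else
  # No local backend; the labeling code falls back to the term-based strategy
  puts "No LLM backend available"
end
```
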
@@ -0,0 +1,186 @@
+ require 'set'
+
+ module Ragnar
+   module TopicModeling
+     module Metrics
+       extend self
+
+       # Compute UMass coherence for topic quality
+       # Higher coherence = more interpretable topic
+       def compute_coherence(terms, documents, top_n: 10)
+         return 0.0 if terms.empty? || documents.empty?
+
+         # Use the top N terms
+         eval_terms = terms.first(top_n)
+         return 0.0 if eval_terms.length < 2
+
+         # Count document frequencies and co-occurrences
+         doc_term_counts = count_cooccurrences(eval_terms, documents)
+
+         # Compute UMass coherence
+         coherence_sum = 0.0
+         pairs_count = 0
+
+         eval_terms.each_with_index do |term_i, i|
+           eval_terms.each_with_index do |term_j, j|
+             next unless j < i # Visit each unordered pair once
+
+             # D(term_i, term_j) = number of documents where both terms occur
+             cooccur = doc_term_counts["#{term_i},#{term_j}"] || 0
+             # D(term_j) = document frequency
+             doc_freq_j = doc_term_counts[term_j] || 0
+
+             if cooccur > 0 && doc_freq_j > 0
+               # UMass: log((cooccur + 1) / doc_freq_j)
+               coherence_sum += Math.log((cooccur + 1.0) / doc_freq_j)
+               pairs_count += 1
+             end
+           end
+         end
+
+         return 0.0 if pairs_count == 0
+
+         # Normalize by the number of pairs
+         coherence = coherence_sum / pairs_count
+
+         # Map to the 0-1 range with a sigmoid (raw coherence is typically
+         # negative; more negative = less coherent)
+         1.0 / (1.0 + Math.exp(-coherence))
+       end
+
+       # Compute how distinct a topic is from the others
+       def compute_distinctiveness(topic, other_topics)
+         return 1.0 if other_topics.empty?
+
+         topic_terms = Set.new(topic.terms.first(20))
+
+         # Compare with the other topics
+         overlaps = other_topics.map do |other|
+           next if other.id == topic.id
+
+           other_terms = Set.new(other.terms.first(20))
+           overlap = (topic_terms & other_terms).size.to_f
+
+           # Jaccard similarity
+           union_size = (topic_terms | other_terms).size
+           union_size > 0 ? overlap / union_size : 0.0
+         end.compact
+
+         return 1.0 if overlaps.empty?
+
+         # Distinctiveness = 1 - average overlap
+         1.0 - (overlaps.sum / overlaps.length)
+       end
+
+       # Compute term diversity across all topics
+       def compute_diversity(topics)
+         return 0.0 if topics.length < 2
+
+         # Collect all term sets
+         term_sets = topics.map { |t| Set.new(t.terms.first(20)) }
+
+         # Compute pairwise Jaccard distances
+         distances = []
+         term_sets.each_with_index do |set_i, i|
+           term_sets.each_with_index do |set_j, j|
+             next unless j > i # Visit each unordered pair once
+
+             intersection = (set_i & set_j).size.to_f
+             union = (set_i | set_j).size.to_f
+
+             # Jaccard distance = 1 - Jaccard similarity
+             distances << (union > 0 ? 1.0 - (intersection / union) : 1.0)
+           end
+         end
+
+         # Average distance = diversity
+         distances.sum / distances.length
+       end
+
+       # Compute coverage (the fraction of documents assigned to topics rather
+       # than left as outliers)
+       def compute_coverage(topics, total_documents)
+         return 0.0 if total_documents == 0
+
+         docs_in_topics = topics.sum(&:size)
+         docs_in_topics.to_f / total_documents
+       end
+
+       # Silhouette score for cluster quality
+       # (the third argument is unused but kept for API compatibility)
+       def compute_silhouette_score(topic, all_topics, _embeddings = nil)
+         return 0.0 if topic.embeddings.empty?
+
+         silhouettes = []
+
+         topic.embeddings.each_with_index do |embedding, idx|
+           # a(i) = average distance to the other points in the same cluster
+           if topic.embeddings.length > 1
+             a_i = topic.embeddings.each_with_index
+                        .reject { |_, j| j == idx }
+                        .map { |other, _| euclidean_distance(embedding, other) }
+                        .sum.to_f / (topic.embeddings.length - 1)
+           else
+             a_i = 0.0
+           end
+
+           # b(i) = minimum average distance to the points in another cluster
+           b_values = all_topics.reject { |t| t.id == topic.id }.map do |other_topic|
+             next if other_topic.embeddings.empty?
+
+             other_topic.embeddings
+                        .map { |other| euclidean_distance(embedding, other) }
+                        .sum.to_f / other_topic.embeddings.length
+           end.compact
+
+           b_i = b_values.min || a_i
+
+           # Silhouette coefficient
+           s_i = if a_i == 0 && b_i == 0
+             0.0
+           else
+             (b_i - a_i) / [a_i, b_i].max
+           end
+
+           silhouettes << s_i
+         end
+
+         # Average silhouette score for the topic
+         silhouettes.sum / silhouettes.length
+       end
+
+       private
+
+       def count_cooccurrences(terms, documents)
+         counts = Hash.new(0)
+
+         documents.each do |doc|
+           doc_lower = doc.downcase
+
+           # Count per-term document frequencies
+           terms.each do |term|
+             counts[term] += 1 if doc_lower.include?(term.downcase)
+           end
+
+           # Count pairwise co-occurrences
+           terms.each_with_index do |term_i, i|
+             terms.each_with_index do |term_j, j|
+               next unless j < i
+
+               if doc_lower.include?(term_i.downcase) && doc_lower.include?(term_j.downcase)
+                 counts["#{term_i},#{term_j}"] += 1
+               end
+             end
+           end
+         end
+
+         counts
+       end
+
+       def euclidean_distance(vec1, vec2)
+         Math.sqrt(
+           vec1.zip(vec2).map { |a, b| (a - b)**2 }.sum
+         )
+       end
+     end
+   end
+ end
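
A small worked example of the metrics above, hand-checked against the UMass term `log((D(w_i, w_j) + 1) / D(w_j))` that `compute_coherence` averages over term pairs and squashes through a sigmoid. `FakeTopic` is a hypothetical stand-in; these metrics only need objects responding to `#id` and `#terms`:

```ruby
# Stand-in for the real Topic class (only #id and #terms are used here)
FakeTopic = Struct.new(:id, :terms)

docs = ["cat dog", "dog park", "cat nap"]

# Single pair (dog, cat): co-occurrence count D = 1, document frequency
# D(cat) = 2, so the average is log((1 + 1) / 2) = 0.0, and the sigmoid
# maps 0.0 to 0.5
Ragnar::TopicModeling::Metrics.compute_coherence(["cat", "dog"], docs)
# => 0.5

# Disjoint term sets give the maximum average Jaccard distance
a = FakeTopic.new(1, %w[cat dog pet])
b = FakeTopic.new(2, %w[car bus road])
Ragnar::TopicModeling::Metrics.compute_diversity([a, b])
# => 1.0
```
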