ragnar-cli 0.1.0.pre.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +439 -0
- data/exe/ragnar +6 -0
- data/lib/ragnar/chunker.rb +97 -0
- data/lib/ragnar/cli.rb +542 -0
- data/lib/ragnar/context_repacker.rb +121 -0
- data/lib/ragnar/database.rb +267 -0
- data/lib/ragnar/embedder.rb +137 -0
- data/lib/ragnar/indexer.rb +234 -0
- data/lib/ragnar/llm_manager.rb +43 -0
- data/lib/ragnar/query_processor.rb +398 -0
- data/lib/ragnar/query_rewriter.rb +75 -0
- data/lib/ragnar/topic_modeling/engine.rb +221 -0
- data/lib/ragnar/topic_modeling/labeling_strategies.rb +300 -0
- data/lib/ragnar/topic_modeling/llm_adapter.rb +131 -0
- data/lib/ragnar/topic_modeling/metrics.rb +186 -0
- data/lib/ragnar/topic_modeling/term_extractor.rb +170 -0
- data/lib/ragnar/topic_modeling/topic.rb +117 -0
- data/lib/ragnar/topic_modeling/topic_labeler.rb +61 -0
- data/lib/ragnar/topic_modeling.rb +24 -0
- data/lib/ragnar/umap_processor.rb +228 -0
- data/lib/ragnar/umap_transform_service.rb +124 -0
- data/lib/ragnar/version.rb +5 -0
- data/lib/ragnar.rb +36 -0
- data/lib/ragnar_cli.rb +2 -0
- metadata +234 -0
data/lib/ragnar/topic_modeling/labeling_strategies.rb
@@ -0,0 +1,300 @@
```ruby
# Separate strategy classes for different labeling approaches
module Ragnar
  module TopicModeling
    module LabelingStrategies

      # Base strategy class
      class Base
        def generate_label(topic:, terms:, documents:)
          raise NotImplementedError, "Subclasses must implement generate_label"
        end

        protected

        def select_representative_docs(documents, k: 3)
          return documents if documents.length <= k

          # For now, just take first k
          # Could be improved to select most central docs
          documents.first(k)
        end

        def capitalize_phrase(phrase)
          phrase.split(/[\s_-]/).map(&:capitalize).join(' ')
        end
      end

      # Fast term-based labeling using c-TF-IDF terms
      class TermBased < Base
        def generate_label(topic:, terms:, documents:)
          return { label: "Empty Topic", description: "No terms found" } if terms.empty?

          # Take top distinctive terms
          label_terms = terms.first(3).select { |t| t.length > 3 }

          label = if label_terms.length >= 2
            "#{capitalize_phrase(label_terms[0])} & #{capitalize_phrase(label_terms[1])}"
          else
            capitalize_phrase(label_terms.first || terms.first)
          end

          {
            label: label,
            description: "Documents about #{terms.first(5).join(', ')}",
            method: :term_based,
            confidence: calculate_confidence(terms)
          }
        end

        private

        def calculate_confidence(terms)
          # Simple heuristic: more distinctive terms = higher confidence
          return 0.0 if terms.empty?

          # Assume terms come with scores if available
          if terms.is_a?(Array) && terms.first.is_a?(Array)
            # Terms are [word, score] pairs
            avg_score = terms.first(5).map(&:last).sum / 5.0
            [avg_score, 1.0].min
          else
            # Just have terms, use count as proxy
            [terms.length / 20.0, 1.0].min
          end
        end
      end

      # Quality LLM-based labeling
      class LLMBased < Base
        def initialize(llm_client: nil)
          @llm_client = llm_client
        end

        def generate_label(topic:, terms:, documents:)
          unless llm_available?
            # Fallback to term-based if LLM not available
            return TermBased.new.generate_label(topic: topic, terms: terms, documents: documents)
          end

          # Select best documents to send to LLM
          sample_docs = select_representative_docs(documents, k: 3)

          # Generate comprehensive analysis
          response = analyze_with_llm(sample_docs, terms)

          {
            label: response[:label],
            description: response[:description],
            themes: response[:themes],
            method: :llm_based,
            confidence: response[:confidence] || 0.8
          }
        rescue => e
          # Fallback on error
          puts "LLM labeling failed: #{e.message}" if ENV['DEBUG']
          TermBased.new.generate_label(topic: topic, terms: terms, documents: documents)
        end

        private

        def llm_available?
          return true if @llm_client

          # Try to create LLM adapter
          begin
            require_relative 'llm_adapter'
            @llm_client = LLMAdapter.create(type: :auto)
            @llm_client && @llm_client.available?
          rescue LoadError, StandardError => e
            puts "LLM not available: #{e.message}" if ENV['DEBUG']
            false
          end
        end

        def analyze_with_llm(documents, terms)
          prompt = build_analysis_prompt(documents, terms)

          response = @llm_client.generate(
            prompt: prompt,
            max_tokens: 150,
            temperature: 0.3,
            response_format: { type: "json_object" }
          )

          # Parse JSON response
          result = JSON.parse(response, symbolize_names: true)

          # Validate and clean
          {
            label: clean_label(result[:label]),
            description: result[:description] || "Topic about #{result[:label]}",
            themes: result[:themes] || [],
            confidence: result[:confidence] || 0.8
          }
        end

        def build_analysis_prompt(documents, terms)
          doc_samples = documents.map.with_index do |doc, i|
            preview = doc.length > 300 ? "#{doc[0..300]}..." : doc
            "Document #{i + 1}:\n#{preview}"
          end.join("\n\n")

          <<~PROMPT
            Analyze this cluster of related documents and provide a structured summary.

            Distinctive terms found: #{terms.first(10).join(', ')}

            Sample documents:
            #{doc_samples}

            Provide a JSON response with:
            {
              "label": "A 2-4 word topic label",
              "description": "One sentence describing what connects these documents",
              "themes": ["theme1", "theme2", "theme3"],
              "confidence": 0.0-1.0 score of how coherent this topic is
            }

            Focus on what meaningfully connects these documents, not just common words.
          PROMPT
        end

        def clean_label(label)
          return "Unknown Topic" unless label

          # Remove quotes, trim, limit length
          cleaned = label.to_s.strip.gsub(/^["']|["']$/, '')
          cleaned = cleaned.split("\n").first if cleaned.include?("\n")

          # Limit to reasonable length
          if cleaned.length > 50
            cleaned[0..47] + "..."
          else
            cleaned
          end
        end
      end

      # Hybrid approach - uses terms to guide LLM for efficiency
      class Hybrid < Base
        def initialize(llm_client: nil)
          @llm_client = llm_client
          @term_strategy = TermBased.new
        end

        def generate_label(topic:, terms:, documents:)
          # Start with term-based analysis
          term_result = @term_strategy.generate_label(
            topic: topic,
            terms: terms,
            documents: documents
          )

          # If no LLM available, return term-based result
          unless llm_available?
            return term_result.merge(method: :hybrid_fallback)
          end

          # Enhance with focused LLM call
          enhanced = enhance_with_llm(term_result, terms, documents)

          {
            label: enhanced[:label] || term_result[:label],
            description: enhanced[:description] || term_result[:description],
            method: :hybrid,
            confidence: (term_result[:confidence] + (enhanced[:confidence] || 0.5)) / 2,
            term_label: term_result[:label], # Keep original for comparison
            themes: enhanced[:themes]
          }
        rescue => e
          # Fallback to term-based
          puts "Hybrid enhancement failed: #{e.message}" if ENV['DEBUG']
          term_result.merge(method: :hybrid_fallback)
        end

        private

        def llm_available?
          return true if @llm_client

          begin
            require_relative 'llm_adapter'
            @llm_client = LLMAdapter.create(type: :auto)
            @llm_client && @llm_client.available?
          rescue LoadError, StandardError => e
            puts "LLM not available for hybrid: #{e.message}" if ENV['DEBUG']
            false
          end
        end

        def enhance_with_llm(term_result, terms, documents)
          # Lighter-weight prompt using term analysis as starting point
          prompt = build_enhancement_prompt(term_result[:label], terms, documents.first)

          response = @llm_client.generate(
            prompt: prompt,
            max_tokens: 100,
            temperature: 0.3
          )

          # Parse response (simpler format for speed)
          parse_enhancement_response(response)
        end

        def build_enhancement_prompt(term_label, terms, sample_doc)
          doc_preview = sample_doc.length > 200 ? "#{sample_doc[0..200]}..." : sample_doc

          <<~PROMPT
            Current topic label based on terms: "#{term_label}"
            Key terms: #{terms.first(8).join(', ')}

            Sample document:
            #{doc_preview}

            Provide a better topic label if possible (2-4 words), or confirm the current one.
            Also provide a one-sentence description.

            Format:
            Label: [your label]
            Description: [one sentence]
            Themes: [comma-separated list]
          PROMPT
        end

        def parse_enhancement_response(response)
          result = {}

          # Simple line-based parsing
          response.lines.each do |line|
            if line.start_with?("Label:")
              result[:label] = line.sub("Label:", "").strip
            elsif line.start_with?("Description:")
              result[:description] = line.sub("Description:", "").strip
            elsif line.start_with?("Themes:")
              themes_str = line.sub("Themes:", "").strip
              result[:themes] = themes_str.split(",").map(&:strip)
            end
          end

          result[:confidence] = result[:label] ? 0.7 : 0.3
          result
        end
      end

      # Factory method to get appropriate strategy
      def self.create(method, llm_client: nil)
        case method.to_sym
        when :fast, :term_based, :terms
          TermBased.new
        when :quality, :llm_based, :llm
          LLMBased.new(llm_client: llm_client)
        when :hybrid, :auto, :smart
          Hybrid.new(llm_client: llm_client)
        else
          # Default to hybrid
          Hybrid.new(llm_client: llm_client)
        end
      end
    end
  end
end
```
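The strategies share one entry point: `LabelingStrategies.create` returns an object whose `generate_label` takes `topic:`, `terms:`, and `documents:` keywords and returns a hash. A minimal usage sketch of the term-based path, with illustrative data and assuming the gem's `lib` directory is on the load path:

```ruby
require 'ragnar/topic_modeling/labeling_strategies'

# :fast maps to TermBased, which ignores the topic object and labels
# straight from the c-TF-IDF terms.
strategy = Ragnar::TopicModeling::LabelingStrategies.create(:fast)
result = strategy.generate_label(
  topic: nil,
  terms: ["embedding", "vector", "index", "search"],
  documents: ["Chunks are embedded into vectors before indexing."]
)
result[:label]      # => "Embedding & Vector"
result[:confidence] # => 0.2 (4 bare terms / 20.0, capped at 1.0)
```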
data/lib/ragnar/topic_modeling/llm_adapter.rb
@@ -0,0 +1,131 @@
```ruby
# Adapter to allow different LLM backends (red-candle, remote APIs, etc.)
module Ragnar
  module TopicModeling
    class LLMAdapter
      # Factory method to create appropriate LLM client
      def self.create(type: :auto, **options)
        case type
        when :red_candle
          RedCandleAdapter.new(**options)
        when :openai
          # Future: OpenAIAdapter.new(**options)
          raise NotImplementedError, "OpenAI adapter not yet implemented"
        when :anthropic
          # Future: AnthropicAdapter.new(**options)
          raise NotImplementedError, "Anthropic adapter not yet implemented"
        when :auto
          # Try red-candle first, then fall back to others
          begin
            RedCandleAdapter.new(**options)
          rescue LoadError
            nil # No LLM available
          end
        else
          raise ArgumentError, "Unknown LLM type: #{type}"
        end
      end
    end

    # Adapter for red-candle (local LLMs)
    class RedCandleAdapter
      def initialize(model: nil, **options)
        require 'candle'

        @model = model || default_model
        @options = options
        @llm = load_or_create_llm
      end

      def generate(prompt:, max_tokens: 100, temperature: 0.3, response_format: nil)
        # Red-candle specific generation
        response = @llm.generate(
          prompt,
          max_length: max_tokens,
          temperature: temperature,
          do_sample: temperature > 0
        )

        # Handle JSON response format if requested
        if response_format && response_format[:type] == "json_object"
          ensure_json_response(response)
        else
          response
        end
      end

      def available?
        true
      end

      private

      def default_model
        # Use a small, fast model by default for topic labeling
        "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
      end

      def load_or_create_llm
        # Check if already loaded in ruby-rag
        if defined?(Ragnar::LLMManager)
          begin
            return Ragnar::LLMManager.instance.get_llm(@model)
          rescue
            # Fall through to create new
          end
        end

        # Create new LLM instance
        Candle::Model.new(
          model_id: @model,
          model_type: :llama,
          quantized: true
        )
      end

      def ensure_json_response(response)
        # Try to extract JSON from response
        begin
          # Look for JSON-like content
          json_match = response.match(/\{.*\}/m)
          if json_match
            JSON.parse(json_match[0])
            json_match[0] # Return the JSON string if valid
          else
            # Generate a basic JSON response
            generate_fallback_json(response)
          end
        rescue JSON::ParserError
          generate_fallback_json(response)
        end
      end

      def generate_fallback_json(text)
        # Create a simple JSON from text response
        label = text.lines.first&.strip || "Unknown"
        {
          label: label,
          description: text,
          confidence: 0.5
        }.to_json
      end
    end

    # Future adapter for remote LLMs
    class RemoteAdapter
      def initialize(api_key:, endpoint:, **options)
        @api_key = api_key
        @endpoint = endpoint
        @options = options
      end

      def generate(prompt:, max_tokens: 100, temperature: 0.3, response_format: nil)
        # Make API call
        raise NotImplementedError, "Remote LLM adapter coming soon"
      end

      def available?
        !@api_key.nil?
      end
    end
  end
end
```
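The labeling strategies only ever call two methods on a client, `generate(prompt:, ...)` and `available?`, so any object with that duck type can stand in for these adapters. A hypothetical stub for exercising the strategies without a local model (`StubAdapter` is not part of the gem):

```ruby
require 'json'

# Hypothetical test double satisfying the same duck type as RedCandleAdapter.
class StubAdapter
  def generate(prompt:, max_tokens: 100, temperature: 0.3, response_format: nil)
    if response_format && response_format[:type] == "json_object"
      # Shape expected by LLMBased#analyze_with_llm
      { label: "Stub Topic", description: "Canned response", confidence: 0.9 }.to_json
    else
      # Line format expected by Hybrid#parse_enhancement_response
      "Label: Stub Topic\nDescription: Canned response\nThemes: testing, stubs"
    end
  end

  def available?
    true
  end
end

strategy = Ragnar::TopicModeling::LabelingStrategies.create(:hybrid, llm_client: StubAdapter.new)
```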
data/lib/ragnar/topic_modeling/metrics.rb
@@ -0,0 +1,186 @@
```ruby
module Ragnar
  module TopicModeling
    module Metrics
      extend self

      # Compute UMass Coherence for topic quality
      # Higher coherence = more interpretable topic
      def compute_coherence(terms, documents, top_n: 10)
        return 0.0 if terms.empty? || documents.empty?

        # Use top N terms
        eval_terms = terms.first(top_n)
        return 0.0 if eval_terms.length < 2

        # Create document term matrix for co-occurrence
        doc_term_counts = count_cooccurrences(eval_terms, documents)

        # Compute UMass coherence
        coherence_sum = 0.0
        pairs_count = 0

        eval_terms.each_with_index do |term_i, i|
          eval_terms.each_with_index do |term_j, j|
            next unless j < i # Only upper triangle

            # P(term_i, term_j) = co-occurrence count
            cooccur = doc_term_counts["#{term_i},#{term_j}"] || 0
            # P(term_j) = document frequency
            doc_freq_j = doc_term_counts[term_j] || 0

            if cooccur > 0 && doc_freq_j > 0
              # UMass: log((cooccur + 1) / doc_freq_j)
              coherence_sum += Math.log((cooccur + 1.0) / doc_freq_j)
              pairs_count += 1
            end
          end
        end

        return 0.0 if pairs_count == 0

        # Normalize by number of pairs
        coherence = coherence_sum / pairs_count

        # Transform to 0-1 range (coherence is typically negative)
        # More negative = less coherent, so we reverse and bound
        normalized = 1.0 / (1.0 + Math.exp(-coherence))
        normalized
      end

      # Compute how distinct a topic is from others
      def compute_distinctiveness(topic, other_topics)
        return 1.0 if other_topics.empty?

        topic_terms = Set.new(topic.terms.first(20))

        # Compare with other topics
        overlaps = other_topics.map do |other|
          next if other.id == topic.id

          other_terms = Set.new(other.terms.first(20))
          overlap = (topic_terms & other_terms).size.to_f

          # Jaccard similarity
          union_size = (topic_terms | other_terms).size
          union_size > 0 ? overlap / union_size : 0
        end.compact

        return 1.0 if overlaps.empty?

        # Distinctiveness = 1 - average overlap
        1.0 - (overlaps.sum / overlaps.length)
      end

      # Compute diversity across all topics
      def compute_diversity(topics)
        return 0.0 if topics.length < 2

        # Collect all term sets
        term_sets = topics.map { |t| Set.new(t.terms.first(20)) }

        # Compute pairwise Jaccard distances
        distances = []
        term_sets.each_with_index do |set_i, i|
          term_sets.each_with_index do |set_j, j|
            next unless j > i # Only upper triangle

            intersection = (set_i & set_j).size.to_f
            union = (set_i | set_j).size.to_f

            # Jaccard distance = 1 - Jaccard similarity
            distance = union > 0 ? 1.0 - (intersection / union) : 1.0
            distances << distance
          end
        end

        # Average distance = diversity
        distances.sum / distances.length
      end

      # Compute coverage (what fraction of docs are in topics vs outliers)
      def compute_coverage(topics, total_documents)
        return 0.0 if total_documents == 0

        docs_in_topics = topics.sum(&:size)
        docs_in_topics.to_f / total_documents
      end

      # Silhouette score for cluster quality
      def compute_silhouette_score(topic, all_topics, embeddings)
        return 0.0 if topic.embeddings.empty?

        silhouettes = []

        topic.embeddings.each_with_index do |embedding, idx|
          # a(i) = average distance to other points in same cluster
          if topic.embeddings.length > 1
            a_i = topic.embeddings.each_with_index
                       .reject { |_, j| j == idx }
                       .map { |other, _| euclidean_distance(embedding, other) }
                       .sum.to_f / (topic.embeddings.length - 1)
          else
            a_i = 0.0
          end

          # b(i) = minimum average distance to points in other clusters
          b_values = all_topics.reject { |t| t.id == topic.id }.map do |other_topic|
            next if other_topic.embeddings.empty?

            avg_dist = other_topic.embeddings
                                  .map { |other| euclidean_distance(embedding, other) }
                                  .sum.to_f / other_topic.embeddings.length
            avg_dist
          end.compact

          b_i = b_values.min || a_i

          # Silhouette coefficient
          if a_i == 0 && b_i == 0
            s_i = 0
          else
            s_i = (b_i - a_i) / [a_i, b_i].max
          end

          silhouettes << s_i
        end

        # Average silhouette score for topic
        silhouettes.sum / silhouettes.length
      end

      private

      def count_cooccurrences(terms, documents)
        counts = Hash.new(0)

        documents.each do |doc|
          doc_lower = doc.downcase

          # Count individual term occurrences
          terms.each do |term|
            counts[term] += 1 if doc_lower.include?(term.downcase)
          end

          # Count co-occurrences
          terms.each_with_index do |term_i, i|
            terms.each_with_index do |term_j, j|
              next unless j < i

              if doc_lower.include?(term_i.downcase) && doc_lower.include?(term_j.downcase)
                counts["#{term_i},#{term_j}"] += 1
              end
            end
          end
        end

        counts
      end

      def euclidean_distance(vec1, vec2)
        Math.sqrt(
          vec1.zip(vec2).map { |a, b| (a - b) ** 2 }.sum
        )
      end
    end
  end
end
```
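As a sanity check on the coherence normalization, a small worked example (made-up documents, substring matching as implemented above): two terms that each hit two documents and co-occur in one contribute log((1 + 1) / 2) = 0 for their single pair, and the logistic transform maps an average of 0 to 0.5.

```ruby
docs = [
  "ruby gems and bundler manage dependencies", # contains "ruby" and "gem"
  "bundler resolves gem versions",             # contains "gem"
  "rails is a ruby web framework"              # contains "ruby"
]

Ragnar::TopicModeling::Metrics.compute_coherence(["ruby", "gem"], docs)
# => 0.5 (single pair scores log(2/2) = 0; 1 / (1 + e^0) = 0.5)
```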