ragnar-cli 0.1.0.pre.2 → 0.1.0.pre.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +187 -36
- data/lib/ragnar/cli.rb +543 -172
- data/lib/ragnar/cli_visualization.rb +184 -0
- data/lib/ragnar/config.rb +226 -0
- data/lib/ragnar/database.rb +94 -8
- data/lib/ragnar/llm_manager.rb +4 -1
- data/lib/ragnar/query_processor.rb +38 -20
- data/lib/ragnar/topic_modeling.rb +13 -10
- data/lib/ragnar/umap_processor.rb +190 -73
- data/lib/ragnar/umap_transform_service.rb +169 -88
- data/lib/ragnar/version.rb +1 -1
- metadata +43 -22
- data/lib/ragnar/topic_modeling/engine.rb +0 -221
- data/lib/ragnar/topic_modeling/labeling_strategies.rb +0 -300
- data/lib/ragnar/topic_modeling/llm_adapter.rb +0 -131
- data/lib/ragnar/topic_modeling/metrics.rb +0 -186
- data/lib/ragnar/topic_modeling/term_extractor.rb +0 -170
- data/lib/ragnar/topic_modeling/topic.rb +0 -117
- data/lib/ragnar/topic_modeling/topic_labeler.rb +0 -61
--- data/lib/ragnar/topic_modeling/llm_adapter.rb
+++ /dev/null
@@ -1,131 +0,0 @@
-# Adapter to allow different LLM backends (red-candle, remote APIs, etc.)
-module Ragnar
-  module TopicModeling
-    class LLMAdapter
-      # Factory method to create appropriate LLM client
-      def self.create(type: :auto, **options)
-        case type
-        when :red_candle
-          RedCandleAdapter.new(**options)
-        when :openai
-          # Future: OpenAIAdapter.new(**options)
-          raise NotImplementedError, "OpenAI adapter not yet implemented"
-        when :anthropic
-          # Future: AnthropicAdapter.new(**options)
-          raise NotImplementedError, "Anthropic adapter not yet implemented"
-        when :auto
-          # Try red-candle first, then fall back to others
-          begin
-            RedCandleAdapter.new(**options)
-          rescue LoadError
-            nil # No LLM available
-          end
-        else
-          raise ArgumentError, "Unknown LLM type: #{type}"
-        end
-      end
-    end
-
-    # Adapter for red-candle (local LLMs)
-    class RedCandleAdapter
-      def initialize(model: nil, **options)
-        require 'candle'
-
-        @model = model || default_model
-        @options = options
-        @llm = load_or_create_llm
-      end
-
-      def generate(prompt:, max_tokens: 100, temperature: 0.3, response_format: nil)
-        # Red-candle specific generation
-        response = @llm.generate(
-          prompt,
-          max_length: max_tokens,
-          temperature: temperature,
-          do_sample: temperature > 0
-        )
-
-        # Handle JSON response format if requested
-        if response_format && response_format[:type] == "json_object"
-          ensure_json_response(response)
-        else
-          response
-        end
-      end
-
-      def available?
-        true
-      end
-
-      private
-
-      def default_model
-        # Use a small, fast model by default for topic labeling
-        "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
-      end
-
-      def load_or_create_llm
-        # Check if already loaded in ruby-rag
-        if defined?(Ragnar::LLMManager)
-          begin
-            return Ragnar::LLMManager.instance.get_llm(@model)
-          rescue
-            # Fall through to create new
-          end
-        end
-
-        # Create new LLM instance
-        Candle::Model.new(
-          model_id: @model,
-          model_type: :llama,
-          quantized: true
-        )
-      end
-
-      def ensure_json_response(response)
-        # Try to extract JSON from response
-        begin
-          # Look for JSON-like content
-          json_match = response.match(/\{.*\}/m)
-          if json_match
-            JSON.parse(json_match[0])
-            json_match[0] # Return the JSON string if valid
-          else
-            # Generate a basic JSON response
-            generate_fallback_json(response)
-          end
-        rescue JSON::ParserError
-          generate_fallback_json(response)
-        end
-      end
-
-      def generate_fallback_json(text)
-        # Create a simple JSON from text response
-        label = text.lines.first&.strip || "Unknown"
-        {
-          label: label,
-          description: text,
-          confidence: 0.5
-        }.to_json
-      end
-    end
-
-    # Future adapter for remote LLMs
-    class RemoteAdapter
-      def initialize(api_key:, endpoint:, **options)
-        @api_key = api_key
-        @endpoint = endpoint
-        @options = options
-      end
-
-      def generate(prompt:, max_tokens: 100, temperature: 0.3, response_format: nil)
-        # Make API call
-        raise NotImplementedError, "Remote LLM adapter coming soon"
-      end
-
-      def available?
-        !@api_key.nil?
-      end
-    end
-  end
-end
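
For context, a minimal sketch of how the removed adapter was typically driven, using only the factory and generate signatures visible in the deleted file above (the prompt text is illustrative):

  # Sketch only; :auto returns nil when red-candle cannot be loaded.
  adapter = Ragnar::TopicModeling::LLMAdapter.create(type: :auto)

  if adapter&.available?
    labeled = adapter.generate(
      prompt: "Suggest a short label for: bundler, gems, dependencies",
      max_tokens: 80,
      temperature: 0.2,
      response_format: { type: "json_object" } # JSON string, with a fallback object on parse failure
    )
  end
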
--- data/lib/ragnar/topic_modeling/metrics.rb
+++ /dev/null
@@ -1,186 +0,0 @@
-module Ragnar
-  module TopicModeling
-    module Metrics
-      extend self
-
-      # Compute UMass Coherence for topic quality
-      # Higher coherence = more interpretable topic
-      def compute_coherence(terms, documents, top_n: 10)
-        return 0.0 if terms.empty? || documents.empty?
-
-        # Use top N terms
-        eval_terms = terms.first(top_n)
-        return 0.0 if eval_terms.length < 2
-
-        # Create document term matrix for co-occurrence
-        doc_term_counts = count_cooccurrences(eval_terms, documents)
-
-        # Compute UMass coherence
-        coherence_sum = 0.0
-        pairs_count = 0
-
-        eval_terms.each_with_index do |term_i, i|
-          eval_terms.each_with_index do |term_j, j|
-            next unless j < i # Only upper triangle
-
-            # P(term_i, term_j) = co-occurrence count
-            cooccur = doc_term_counts["#{term_i},#{term_j}"] || 0
-            # P(term_j) = document frequency
-            doc_freq_j = doc_term_counts[term_j] || 0
-
-            if cooccur > 0 && doc_freq_j > 0
-              # UMass: log((cooccur + 1) / doc_freq_j)
-              coherence_sum += Math.log((cooccur + 1.0) / doc_freq_j)
-              pairs_count += 1
-            end
-          end
-        end
-
-        return 0.0 if pairs_count == 0
-
-        # Normalize by number of pairs
-        coherence = coherence_sum / pairs_count
-
-        # Transform to 0-1 range (coherence is typically negative)
-        # More negative = less coherent, so we reverse and bound
-        normalized = 1.0 / (1.0 + Math.exp(-coherence))
-        normalized
-      end
-
-      # Compute how distinct a topic is from others
-      def compute_distinctiveness(topic, other_topics)
-        return 1.0 if other_topics.empty?
-
-        topic_terms = Set.new(topic.terms.first(20))
-
-        # Compare with other topics
-        overlaps = other_topics.map do |other|
-          next if other.id == topic.id
-
-          other_terms = Set.new(other.terms.first(20))
-          overlap = (topic_terms & other_terms).size.to_f
-
-          # Jaccard similarity
-          union_size = (topic_terms | other_terms).size
-          union_size > 0 ? overlap / union_size : 0
-        end.compact
-
-        return 1.0 if overlaps.empty?
-
-        # Distinctiveness = 1 - average overlap
-        1.0 - (overlaps.sum / overlaps.length)
-      end
-
-      # Compute diversity across all topics
-      def compute_diversity(topics)
-        return 0.0 if topics.length < 2
-
-        # Collect all term sets
-        term_sets = topics.map { |t| Set.new(t.terms.first(20)) }
-
-        # Compute pairwise Jaccard distances
-        distances = []
-        term_sets.each_with_index do |set_i, i|
-          term_sets.each_with_index do |set_j, j|
-            next unless j > i # Only upper triangle
-
-            intersection = (set_i & set_j).size.to_f
-            union = (set_i | set_j).size.to_f
-
-            # Jaccard distance = 1 - Jaccard similarity
-            distance = union > 0 ? 1.0 - (intersection / union) : 1.0
-            distances << distance
-          end
-        end
-
-        # Average distance = diversity
-        distances.sum / distances.length
-      end
-
-      # Compute coverage (what fraction of docs are in topics vs outliers)
-      def compute_coverage(topics, total_documents)
-        return 0.0 if total_documents == 0
-
-        docs_in_topics = topics.sum(&:size)
-        docs_in_topics.to_f / total_documents
-      end
-
-      # Silhouette score for cluster quality
-      def compute_silhouette_score(topic, all_topics, embeddings)
-        return 0.0 if topic.embeddings.empty?
-
-        silhouettes = []
-
-        topic.embeddings.each_with_index do |embedding, idx|
-          # a(i) = average distance to other points in same cluster
-          if topic.embeddings.length > 1
-            a_i = topic.embeddings.each_with_index
-                       .reject { |_, j| j == idx }
-                       .map { |other, _| euclidean_distance(embedding, other) }
-                       .sum.to_f / (topic.embeddings.length - 1)
-          else
-            a_i = 0.0
-          end
-
-          # b(i) = minimum average distance to points in other clusters
-          b_values = all_topics.reject { |t| t.id == topic.id }.map do |other_topic|
-            next if other_topic.embeddings.empty?
-
-            avg_dist = other_topic.embeddings
-                         .map { |other| euclidean_distance(embedding, other) }
-                         .sum.to_f / other_topic.embeddings.length
-            avg_dist
-          end.compact
-
-          b_i = b_values.min || a_i
-
-          # Silhouette coefficient
-          if a_i == 0 && b_i == 0
-            s_i = 0
-          else
-            s_i = (b_i - a_i) / [a_i, b_i].max
-          end
-
-          silhouettes << s_i
-        end
-
-        # Average silhouette score for topic
-        silhouettes.sum / silhouettes.length
-      end
-
-      private
-
-      def count_cooccurrences(terms, documents)
-        counts = Hash.new(0)
-
-        documents.each do |doc|
-          doc_lower = doc.downcase
-
-          # Count individual term occurrences
-          terms.each do |term|
-            counts[term] += 1 if doc_lower.include?(term.downcase)
-          end
-
-          # Count co-occurrences
-          terms.each_with_index do |term_i, i|
-            terms.each_with_index do |term_j, j|
-              next unless j < i
-
-              if doc_lower.include?(term_i.downcase) && doc_lower.include?(term_j.downcase)
-                counts["#{term_i},#{term_j}"] += 1
-              end
-            end
-          end
-        end
-
-        counts
-      end
-
-      def euclidean_distance(vec1, vec2)
-        Math.sqrt(
-          vec1.zip(vec2).map { |a, b| (a - b) ** 2 }.sum
-        )
-      end
-    end
-  end
-end
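
Restated from the deleted compute_coherence above (no new behavior), the coherence score is the mean log co-occurrence ratio over the evaluated term pairs, squashed into (0, 1) with a logistic function:

  C = \sigma\left( \frac{1}{|P|} \sum_{(w_i, w_j) \in P} \log \frac{D(w_i, w_j) + 1}{D(w_j)} \right),
  \qquad \sigma(x) = \frac{1}{1 + e^{-x}}

where D(w_j) counts documents containing w_j, D(w_i, w_j) counts documents containing both terms, and P is the set of top-term pairs for which both counts are non-zero.
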
--- data/lib/ragnar/topic_modeling/term_extractor.rb
+++ /dev/null
@@ -1,170 +0,0 @@
-require 'set'
-
-module Ragnar
-  module TopicModeling
-    class TermExtractor
-      # Common English stop words to filter out
-      STOP_WORDS = Set.new(%w[
-        the be to of and a in that have i it for not on with he as you do at
-        this but his by from they we say her she or an will my one all would
-        there their what so up out if about who get which go me when make can
-        like time no just him know take people into year your good some could
-        them see other than then now look only come its over think also back
-        after use two how our work first well way even new want because any
-        these give day most us is was are been has had were said did get may
-      ])
-
-      def initialize(stop_words: STOP_WORDS, min_word_length: 3, max_word_length: 20)
-        @stop_words = stop_words
-        @min_word_length = min_word_length
-        @max_word_length = max_word_length
-      end
-
-      # Extract distinctive terms using c-TF-IDF
-      def extract_distinctive_terms(topic_docs:, all_docs:, top_n: 20)
-        # Tokenize and count terms in topic
-        topic_terms = count_terms(topic_docs)
-
-        # Tokenize and count document frequency across all docs
-        doc_frequencies = compute_document_frequencies(all_docs)
-
-        # Compute c-TF-IDF scores
-        scores = {}
-        total_docs = all_docs.length.to_f
-
-        topic_terms.each do |term, tf|
-          # c-TF-IDF formula: tf * log(N / df)
-          df = doc_frequencies[term] || 1
-          idf = Math.log(total_docs / df)
-          scores[term] = tf * idf
-        end
-
-        # Return top scoring terms
-        scores.sort_by { |_, score| -score }
-              .first(top_n)
-              .map(&:first)
-      end
-
-      # Standard TF-IDF implementation
-      def extract_tfidf_terms(documents:, top_n: 20)
-        # Document frequency
-        doc_frequencies = compute_document_frequencies(documents)
-        total_docs = documents.length.to_f
-
-        # Compute TF-IDF for each document
-        all_scores = []
-
-        documents.each do |doc|
-          terms = count_terms([doc])
-          doc_length = terms.values.sum.to_f
-
-          scores = {}
-          terms.each do |term, count|
-            tf = count / doc_length # Normalized term frequency
-            df = doc_frequencies[term] || 1
-            idf = Math.log(total_docs / df)
-            scores[term] = tf * idf
-          end
-
-          all_scores << scores
-        end
-
-        # Aggregate scores across all documents
-        aggregated = {}
-        all_scores.each do |doc_scores|
-          doc_scores.each do |term, score|
-            aggregated[term] ||= 0
-            aggregated[term] += score
-          end
-        end
-
-        # Return top terms
-        aggregated.sort_by { |_, score| -score }
-                  .first(top_n)
-                  .map(&:first)
-      end
-
-      # Simple term frequency extraction
-      def extract_frequent_terms(documents:, top_n: 20)
-        terms = count_terms(documents)
-        terms.sort_by { |_, count| -count }
-             .first(top_n)
-             .map(&:first)
-      end
-
-      private
-
-      def tokenize(text)
-        # Simple tokenization - can be improved with proper NLP tokenizer
-        text.downcase
-            .split(/\W+/)
-            .select { |word| valid_word?(word) }
-      end
-
-      def valid_word?(word)
-        word.length >= @min_word_length &&
-          word.length <= @max_word_length &&
-          !@stop_words.include?(word) &&
-          !word.match?(/^\d+$/) # Not pure numbers
-      end
-
-      def count_terms(documents)
-        terms = Hash.new(0)
-
-        documents.each do |doc|
-          tokenize(doc).each do |word|
-            terms[word] += 1
-          end
-        end
-
-        terms
-      end
-
-      def compute_document_frequencies(documents)
-        doc_frequencies = Hash.new(0)
-
-        documents.each do |doc|
-          # Use set to count each term once per document
-          unique_terms = Set.new(tokenize(doc))
-          unique_terms.each do |term|
-            doc_frequencies[term] += 1
-          end
-        end
-
-        doc_frequencies
-      end
-
-      # N-gram extraction for phrases
-      def extract_ngrams(text, n: 2)
-        words = tokenize(text)
-        ngrams = []
-
-        (0..words.length - n).each do |i|
-          ngram = words[i, n].join(" ")
-          ngrams << ngram
-        end
-
-        ngrams
-      end
-
-      # Extract both unigrams and bigrams
-      def extract_mixed_terms(documents:, top_n: 20)
-        all_terms = Hash.new(0)
-
-        documents.each do |doc|
-          # Unigrams
-          tokenize(doc).each { |word| all_terms[word] += 1 }
-
-          # Bigrams
-          extract_ngrams(doc, n: 2).each { |bigram| all_terms[bigram] += 1 }
-        end
-
-        # Filter and return top terms
-        all_terms.select { |term, count| count > 1 } # Appears more than once
-                 .sort_by { |_, count| -count }
-                 .first(top_n)
-                 .map(&:first)
-      end
-    end
-  end
-end
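
A minimal usage sketch of the removed extractor, assuming nothing beyond the deleted class above (the document strings are illustrative):

  extractor = Ragnar::TopicModeling::TermExtractor.new
  cluster   = ["bundler resolves gem dependencies", "gems are installed by bundler"]
  corpus    = cluster + ["postgres query planning", "rails database migrations"]

  # c-TF-IDF: topic term frequency weighted by log(total docs / document frequency)
  extractor.extract_distinctive_terms(topic_docs: cluster, all_docs: corpus, top_n: 5)
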
--- data/lib/ragnar/topic_modeling/topic.rb
+++ /dev/null
@@ -1,117 +0,0 @@
-module Ragnar
-  module TopicModeling
-    class Topic
-      attr_reader :id, :document_indices, :documents, :embeddings, :metadata
-      attr_accessor :terms, :label
-
-      def initialize(id:, document_indices:, documents:, embeddings:, metadata: nil)
-        @id = id
-        @document_indices = document_indices
-        @documents = documents
-        @embeddings = embeddings
-        @metadata = metadata || []
-        @terms = []
-        @label = nil
-      end
-
-      def size
-        @documents.length
-      end
-
-      def centroid
-        @centroid ||= compute_centroid
-      end
-
-      def representative_docs(k: 3)
-        return @documents if @documents.length <= k
-
-        # Find documents closest to centroid
-        distances = @embeddings.map do |embedding|
-          distance_to_centroid(embedding)
-        end
-
-        # Get indices of k smallest distances
-        top_indices = distances.each_with_index.sort_by(&:first).first(k).map(&:last)
-        top_indices.map { |i| @documents[i] }
-      end
-
-      def coherence
-        @coherence ||= Metrics.compute_coherence(@terms, @documents)
-      end
-
-      def distinctiveness(other_topics)
-        @distinctiveness ||= Metrics.compute_distinctiveness(self, other_topics)
-      end
-
-      def set_terms(terms)
-        @terms = terms
-        @centroid = nil # Reset centroid cache
-      end
-
-      def set_label(label)
-        @label = label
-      end
-
-      def summary
-        {
-          id: @id,
-          label: @label || "Topic #{@id}",
-          size: size,
-          terms: @terms.first(10),
-          coherence: coherence.round(3),
-          representative_docs: representative_docs(k: 2).map { |d| d[0..100] + "..." }
-        }
-      end
-
-      def to_h
-        {
-          id: @id,
-          label: @label,
-          document_indices: @document_indices,
-          terms: @terms,
-          centroid: centroid,
-          size: size,
-          coherence: coherence
-        }
-      end
-
-      def self.from_h(hash)
-        topic = new(
-          id: hash[:id],
-          document_indices: hash[:document_indices],
-          documents: [], # Would need to be reconstructed
-          embeddings: [], # Would need to be reconstructed
-          metadata: []
-        )
-        topic.set_label(hash[:label])
-        topic.set_terms(hash[:terms])
-        topic
-      end
-
-      private
-
-      def compute_centroid
-        return [] if @embeddings.empty?
-
-        # Compute mean of all embeddings
-        dim = @embeddings.first.length
-        centroid = Array.new(dim, 0.0)
-
-        @embeddings.each do |embedding|
-          embedding.each_with_index do |val, idx|
-            centroid[idx] += val
-          end
-        end
-
-        centroid.map { |val| val / @embeddings.length }
-      end
-
-      def distance_to_centroid(embedding)
-        # Euclidean distance
-        Math.sqrt(
-          embedding.zip(centroid).map { |a, b| (a - b) ** 2 }.sum
-        )
-      end
-    end
-  end
-end
--- data/lib/ragnar/topic_modeling/topic_labeler.rb
+++ /dev/null
@@ -1,61 +0,0 @@
-require_relative 'labeling_strategies'
-
-module Ragnar
-  module TopicModeling
-    class TopicLabeler
-      attr_reader :strategy
-
-      def initialize(method: :hybrid, llm_client: nil)
-        @method = method
-        @llm_client = llm_client
-        @strategy = LabelingStrategies.create(method, llm_client: llm_client)
-      end
-
-      # Generate a human-readable label for a topic
-      # Returns a hash with label, description, and metadata
-      def generate_label(topic: nil, terms:, documents: [], method: nil)
-        # Allow method override per call
-        if method && method != @method
-          strategy = LabelingStrategies.create(method, llm_client: @llm_client)
-        else
-          strategy = @strategy
-        end
-
-        # Generate label using selected strategy
-        result = strategy.generate_label(
-          topic: topic,
-          terms: terms,
-          documents: documents
-        )
-
-        # Ensure we always return a consistent structure
-        normalize_result(result)
-      end
-
-      # Convenience method for simple label string
-      def generate_simple_label(terms:, documents: [], method: nil)
-        result = generate_label(terms: terms, documents: documents, method: method)
-        result[:label]
-      end
-
-      # Change strategy at runtime
-      def set_strategy(method)
-        @method = method
-        @strategy = LabelingStrategies.create(method, llm_client: @llm_client)
-      end
-
-      private
-
-      def normalize_result(result)
-        {
-          label: result[:label] || "Unknown Topic",
-          description: result[:description] || nil,
-          method: result[:method] || @method,
-          confidence: result[:confidence] || 0.5,
-          themes: result[:themes] || [],
-          metadata: result.reject { |k, _| [:label, :description, :method, :confidence, :themes].include?(k) }
-        }
-      end
-    end
-  end
-end
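
Finally, a sketch of the removed labeler's call shape; the strategy classes live in the deleted labeling_strategies.rb (not reproduced in this diff), so only the interface shown above is assumed:

  labeler = Ragnar::TopicModeling::TopicLabeler.new # defaults to method: :hybrid, no LLM client
  result  = labeler.generate_label(terms: %w[bundler gems dependencies])

  result[:label]      # human-readable label, "Unknown Topic" if the strategy returns none
  result[:confidence] # defaults to 0.5 when the strategy omits it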