ragnar-cli 0.1.0.pre.3 → 0.1.0.pre.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +249 -41
- data/lib/ragnar/cli.rb +563 -219
- data/lib/ragnar/cli_umap.rb +86 -0
- data/lib/ragnar/cli_visualization.rb +184 -0
- data/lib/ragnar/config.rb +320 -0
- data/lib/ragnar/database.rb +94 -8
- data/lib/ragnar/embedder.rb +1 -1
- data/lib/ragnar/indexer.rb +4 -2
- data/lib/ragnar/llm_manager.rb +31 -27
- data/lib/ragnar/query_processor.rb +123 -70
- data/lib/ragnar/query_rewriter.rb +21 -18
- data/lib/ragnar/topic_modeling.rb +13 -10
- data/lib/ragnar/umap_processor.rb +131 -95
- data/lib/ragnar/umap_transform_service.rb +169 -88
- data/lib/ragnar/version.rb +1 -1
- data/lib/ragnar.rb +3 -1
- metadata +71 -30
- data/lib/ragnar/topic_modeling/engine.rb +0 -301
- data/lib/ragnar/topic_modeling/labeling_strategies.rb +0 -300
- data/lib/ragnar/topic_modeling/llm_adapter.rb +0 -131
- data/lib/ragnar/topic_modeling/metrics.rb +0 -186
- data/lib/ragnar/topic_modeling/term_extractor.rb +0 -170
- data/lib/ragnar/topic_modeling/topic.rb +0 -117
- data/lib/ragnar/topic_modeling/topic_labeler.rb +0 -61
|
@@ -1,170 +0,0 @@
|
|
|
1
|
-
require 'set'
|
|
2
|
-
|
|
3
|
-
module Ragnar
  module TopicModeling
    # Extracts representative terms from plain-text documents.
    #
    # Documents are lower-cased, split on non-word characters and filtered
    # (length bounds, stop words, purely numeric tokens) before being scored
    # by raw frequency, TF-IDF, or class-based TF-IDF (c-TF-IDF).
    class TermExtractor
      # Common English stop words to filter out
      STOP_WORDS = Set.new(%w[
        the be to of and a in that have i it for not on with he as you do at
        this but his by from they we say her she or an will my one all would
        there their what so up out if about who get which go me when make can
        like time no just him know take people into year your good some could
        them see other than then now look only come its over think also back
        after use two how our work first well way even new want because any
        these give day most us is was are been has had were said did get may
      ]).freeze

      # stop_words      - collection answering include? with words to discard
      # min_word_length - shortest token (in characters) that is kept
      # max_word_length - longest token (in characters) that is kept
      def initialize(stop_words: STOP_WORDS, min_word_length: 3, max_word_length: 20)
        @stop_words = stop_words
        @min_word_length = min_word_length
        @max_word_length = max_word_length
      end

      # Extract distinctive terms using c-TF-IDF: tf * log(N / df), where tf is
      # the term count inside topic_docs, N is the number of all_docs and df is
      # the number of all_docs containing the term.
      #
      # topic_docs - strings belonging to the topic
      # all_docs   - strings of the whole corpus
      # top_n      - maximum number of terms to return
      #
      # Returns an array of terms, highest score first.
      def extract_distinctive_terms(topic_docs:, all_docs:, top_n: 20)
        topic_terms = count_terms(topic_docs)
        doc_frequencies = compute_document_frequencies(all_docs)
        total_docs = all_docs.length.to_f

        scores = topic_terms.each_with_object({}) do |(term, tf), acc|
          acc[term] = tf * inverse_document_frequency(total_docs, doc_frequencies[term])
        end

        top_terms(scores, top_n)
      end

      # Standard TF-IDF implementation. Each document contributes
      # (count / document length) * log(N / df) per term; contributions are
      # summed across documents.
      #
      # Returns an array of at most top_n terms, highest total score first.
      def extract_tfidf_terms(documents:, top_n: 20)
        doc_frequencies = compute_document_frequencies(documents)
        total_docs = documents.length.to_f
        aggregated = Hash.new(0.0)

        documents.each do |doc|
          terms = count_terms([doc])
          doc_length = terms.values.sum.to_f

          terms.each do |term, count|
            tf = count / doc_length # Normalized term frequency
            aggregated[term] += tf * inverse_document_frequency(total_docs, doc_frequencies[term])
          end
        end

        top_terms(aggregated, top_n)
      end

      # Simple term frequency extraction: the top_n most frequent tokens.
      def extract_frequent_terms(documents:, top_n: 20)
        top_terms(count_terms(documents), top_n)
      end

      # Extract both unigrams and bigrams, keeping only those that appear more
      # than once, ordered by descending count.
      #
      # Public like the other extract_* entry points; it shares their keyword
      # interface and previously sat below `private`, making it uncallable.
      def extract_mixed_terms(documents:, top_n: 20)
        all_terms = Hash.new(0)

        documents.each do |doc|
          tokenize(doc).each { |word| all_terms[word] += 1 }
          extract_ngrams(doc, n: 2).each { |bigram| all_terms[bigram] += 1 }
        end

        repeated = all_terms.select { |_, count| count > 1 }
        top_terms(repeated, top_n)
      end

      private

      # Simple tokenization - can be improved with proper NLP tokenizer
      def tokenize(text)
        text.downcase
            .split(/\W+/)
            .select { |word| valid_word?(word) }
      end

      # A token is kept when it is within the length bounds, is not a stop
      # word and is not made of digits only.
      def valid_word?(word)
        word.length >= @min_word_length &&
          word.length <= @max_word_length &&
          !@stop_words.include?(word) &&
          !word.match?(/\A\d+\z/) # Not pure numbers
      end

      # Total occurrences of every token across the given documents.
      def count_terms(documents)
        documents.each_with_object(Hash.new(0)) do |doc, terms|
          tokenize(doc).each { |word| terms[word] += 1 }
        end
      end

      # Number of documents each token appears in. The returned hash yields 0
      # for tokens it has never seen.
      def compute_document_frequencies(documents)
        documents.each_with_object(Hash.new(0)) do |doc, doc_frequencies|
          # Use set to count each term once per document
          Set.new(tokenize(doc)).each { |term| doc_frequencies[term] += 1 }
        end
      end

      # log(N / df) kept finite: a document frequency of 0 (term absent from
      # the corpus) is treated as 1, and an empty corpus yields 0.0 rather
      # than NaN, which would make the later sort raise.
      def inverse_document_frequency(total_docs, doc_frequency)
        return 0.0 unless total_docs.positive?

        Math.log(total_docs / [doc_frequency, 1].max)
      end

      # Keys of a term => score hash, highest score first, limited to top_n.
      def top_terms(scores, top_n)
        scores.sort_by { |_, score| -score }
              .first(top_n)
              .map(&:first)
      end

      # N-gram extraction for phrases: every run of n consecutive tokens,
      # joined with a single space.
      def extract_ngrams(text, n: 2)
        tokenize(text).each_cons(n).map { |gram| gram.join(" ") }
      end
    end
  end
end
|
|
@@ -1,117 +0,0 @@
|
|
|
1
|
-
module Ragnar
  module TopicModeling
    # A cluster of documents together with their embeddings, the terms that
    # describe the cluster and an optional human-readable label.
    class Topic
      attr_reader :id, :document_indices, :documents, :embeddings, :metadata, :terms
      attr_accessor :label

      # id               - identifier of the topic
      # document_indices - positions of the topic's documents in the corpus
      # documents        - document strings belonging to the topic
      # embeddings       - one numeric vector per document
      # metadata         - optional extra data; defaults to an empty array
      def initialize(id:, document_indices:, documents:, embeddings:, metadata: nil)
        @id = id
        @document_indices = document_indices
        @documents = documents
        @embeddings = embeddings
        @metadata = metadata || []
        @terms = []
        @label = nil
      end

      # Number of documents in the topic.
      def size
        @documents.length
      end

      # Mean of all embeddings (memoized); [] when there are no embeddings.
      def centroid
        @centroid ||= compute_centroid
      end

      # The k documents whose embeddings are closest (Euclidean distance) to
      # the centroid, closest first. Returns all documents when there are k
      # or fewer.
      def representative_docs(k: 3)
        return @documents if @documents.length <= k

        distances = @embeddings.map { |embedding| distance_to_centroid(embedding) }
        closest = distances.each_with_index.sort_by(&:first).first(k).map(&:last)
        closest.map { |index| @documents[index] }
      end

      # Coherence as computed by Metrics.compute_coherence from the current
      # terms and documents (memoized until the terms change).
      def coherence
        @coherence ||= Metrics.compute_coherence(@terms, @documents)
      end

      # Distinctiveness as computed by Metrics.compute_distinctiveness.
      #
      # The result is cached for the most recent set of other topics only, so
      # calling with a different set recomputes instead of returning the value
      # from an earlier, unrelated call.
      def distinctiveness(other_topics)
        cache_key = Array(other_topics).map(&:object_id)
        unless @distinctiveness_key == cache_key
          @distinctiveness = Metrics.compute_distinctiveness(self, other_topics)
          @distinctiveness_key = cache_key
        end
        @distinctiveness
      end

      # Replace the topic's terms and drop the caches that take the terms (or
      # the topic itself) as input. The centroid is built from embeddings
      # only, so it is left untouched.
      def terms=(terms)
        @terms = terms
        @coherence = nil
        @distinctiveness = nil
        @distinctiveness_key = nil
      end

      # Method-style alias of #terms=.
      def set_terms(terms)
        self.terms = terms
      end

      # Method-style alias of #label=.
      def set_label(label)
        @label = label
      end

      # Compact overview: id, label (or "Topic <id>"), size, first ten terms,
      # rounded coherence and two truncated representative documents.
      def summary
        {
          id: @id,
          label: @label || "Topic #{@id}",
          size: size,
          terms: @terms.first(10),
          coherence: coherence.round(3),
          representative_docs: representative_docs(k: 2).map { |d| d[0..100] + "..." }
        }
      end

      # Serializable representation. Documents and embeddings are not
      # included; only their indices and the derived centroid are.
      def to_h
        {
          id: @id,
          label: @label,
          document_indices: @document_indices,
          terms: @terms,
          centroid: centroid,
          size: size,
          coherence: coherence
        }
      end

      # Rebuild a topic from the output of #to_h. Keys may be symbols or
      # strings. Documents and embeddings are not part of the hash and start
      # empty, so the serialized centroid and coherence are restored into the
      # caches instead of being recomputed from nothing.
      def self.from_h(hash)
        read = lambda do |key|
          value = hash[key]
          value.nil? ? hash[key.to_s] : value
        end

        topic = new(
          id: read.call(:id),
          document_indices: read.call(:document_indices),
          documents: [], # Would need to be reconstructed
          embeddings: [], # Would need to be reconstructed
          metadata: []
        )
        topic.set_label(read.call(:label))
        topic.set_terms(read.call(:terms) || [])
        topic.send(
          :restore_cached_metrics,
          centroid: read.call(:centroid),
          coherence: read.call(:coherence)
        )
        topic
      end

      private

      # Seed the memoized centroid/coherence with previously serialized
      # values; absent or empty values leave the caches unset.
      def restore_cached_metrics(centroid:, coherence:)
        @centroid = centroid if centroid.is_a?(Array) && !centroid.empty?
        @coherence = coherence unless coherence.nil?
      end

      # Component-wise mean of all embeddings; the dimension is taken from
      # the first embedding.
      def compute_centroid
        return [] if @embeddings.empty?

        sums = Array.new(@embeddings.first.length, 0.0)
        @embeddings.each do |embedding|
          embedding.each_with_index { |value, index| sums[index] += value }
        end

        count = @embeddings.length
        sums.map { |total| total / count }
      end

      # Euclidean distance
      def distance_to_centroid(embedding)
        Math.sqrt(embedding.zip(centroid).sum { |a, b| (a - b)**2 })
      end
    end
  end
end
|
|
@@ -1,61 +0,0 @@
|
|
|
1
|
-
require_relative 'labeling_strategies'
|
|
2
|
-
|
|
3
|
-
module Ragnar
  module TopicModeling
    # Produces human-readable labels for topics by delegating to a strategy
    # object obtained from LabelingStrategies.create.
    class TopicLabeler
      # Keys of a strategy result that are lifted to the top level of the
      # normalized hash; every other key ends up under :metadata.
      RESULT_KEYS = %i[label description method confidence themes].freeze

      attr_reader :strategy

      # method     - name of the default labeling strategy
      # llm_client - client handed to LabelingStrategies.create
      def initialize(method: :hybrid, llm_client: nil)
        @method = method
        @llm_client = llm_client
        @strategy = LabelingStrategies.create(method, llm_client: llm_client)
      end

      # Generate a human-readable label for a topic
      # Returns a hash with label, description, and metadata
      #
      # topic     - optional topic object passed through to the strategy
      # terms     - terms describing the topic
      # documents - documents passed through to the strategy
      # method    - optional per-call strategy override
      def generate_label(topic: nil, terms:, documents: [], method: nil)
        # Allow method override per call
        overridden = method && method != @method
        effective_method = overridden ? method : @method
        active_strategy =
          if overridden
            LabelingStrategies.create(method, llm_client: @llm_client)
          else
            @strategy
          end

        result = active_strategy.generate_label(
          topic: topic,
          terms: terms,
          documents: documents
        )

        # Ensure we always return a consistent structure, reporting the
        # strategy that actually ran when the result does not name one.
        normalize_result(result, effective_method)
      end

      # Convenience method for simple label string
      def generate_simple_label(terms:, documents: [], method: nil)
        generate_label(terms: terms, documents: documents, method: method)[:label]
      end

      # Change strategy at runtime
      def set_strategy(method)
        @method = method
        @strategy = LabelingStrategies.create(method, llm_client: @llm_client)
      end

      private

      # Fill in defaults for missing keys and collect unknown keys under
      # :metadata. A nil result is treated as an empty hash so callers still
      # receive the full structure.
      def normalize_result(result, effective_method = @method)
        result ||= {}

        {
          label: result[:label] || "Unknown Topic",
          description: result[:description],
          method: result[:method] || effective_method,
          confidence: result[:confidence] || 0.5,
          themes: result[:themes] || [],
          metadata: result.reject { |key, _| RESULT_KEYS.include?(key) }
        }
      end
    end
  end
end
|