ragnar-cli 0.1.0.pre.3 → 0.1.0.pre.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,170 +0,0 @@
1
- require 'set'
2
-
3
module Ragnar
  module TopicModeling
    # Extracts representative terms from plain-text documents.
    #
    # Documents are lower-cased, split into word tokens, filtered (length
    # bounds, stop words, pure numbers) and then ranked by raw frequency,
    # TF-IDF, or class-based TF-IDF (c-TF-IDF).
    class TermExtractor
      # Common English stop words to filter out
      STOP_WORDS = Set.new(%w[
        the be to of and a in that have i it for not on with he as you do at
        this but his by from they we say her she or an will my one all would
        there their what so up out if about who get which go me when make can
        like time no just him know take people into year your good some could
        them see other than then now look only come its over think also back
        after use two how our work first well way even new want because any
        these give day most us is was are been has had were said did get may
      ]).freeze

      # A token is a run of letters, digits or underscores. Unlike +\w+, the
      # +\p{Alnum}+ property also covers non-ASCII letters, so words such as
      # "café" are kept whole instead of being cut at the accented character.
      TOKEN_PATTERN = /[\p{Alnum}_]+/.freeze

      # @param stop_words [#include?] words to drop during tokenization
      # @param min_word_length [Integer] shortest token length kept (inclusive)
      # @param max_word_length [Integer] longest token length kept (inclusive)
      def initialize(stop_words: STOP_WORDS, min_word_length: 3, max_word_length: 20)
        @stop_words = stop_words
        @min_word_length = min_word_length
        @max_word_length = max_word_length
      end

      # Extract distinctive terms using c-TF-IDF: tf * log(N / df), where tf is
      # the term count inside the topic, df the number of documents in
      # +all_docs+ containing the term and N the size of +all_docs+.
      #
      # @param topic_docs [Array<String>] documents belonging to the topic
      # @param all_docs [Array<String>] the reference corpus
      # @param top_n [Integer] maximum number of terms returned
      # @return [Array<String>] terms ordered by descending score
      def extract_distinctive_terms(topic_docs:, all_docs:, top_n: 20)
        topic_terms = count_terms(topic_docs)
        doc_frequencies = compute_document_frequencies(all_docs)
        total_docs = all_docs.length.to_f

        scores = topic_terms.each_with_object({}) do |(term, tf), acc|
          acc[term] = tf * inverse_document_frequency(doc_frequencies[term], total_docs)
        end

        top_terms(scores, top_n)
      end

      # Standard TF-IDF: each document contributes its length-normalized term
      # frequency times the corpus IDF; contributions are summed per term.
      #
      # @param documents [Array<String>]
      # @param top_n [Integer] maximum number of terms returned
      # @return [Array<String>] terms ordered by descending aggregated score
      def extract_tfidf_terms(documents:, top_n: 20)
        doc_frequencies = compute_document_frequencies(documents)
        total_docs = documents.length.to_f
        aggregated = Hash.new(0.0)

        documents.each do |doc|
          terms = count_terms([doc])
          doc_length = terms.values.sum.to_f

          terms.each do |term, count|
            tf = count / doc_length # Normalized term frequency
            aggregated[term] += tf * inverse_document_frequency(doc_frequencies[term], total_docs)
          end
        end

        top_terms(aggregated, top_n)
      end

      # Simple term frequency extraction.
      #
      # @param documents [Array<String>]
      # @param top_n [Integer] maximum number of terms returned
      # @return [Array<String>] terms ordered by descending count
      def extract_frequent_terms(documents:, top_n: 20)
        top_terms(count_terms(documents), top_n)
      end

      # Extract both unigrams and bigrams, keeping only those that occur more
      # than once across +documents+.
      #
      # @param documents [Array<String>]
      # @param top_n [Integer] maximum number of terms returned
      # @return [Array<String>] words and two-word phrases by descending count
      def extract_mixed_terms(documents:, top_n: 20)
        counts = Hash.new(0)

        documents.each do |doc|
          words = tokenize(doc)
          words.each { |word| counts[word] += 1 }
          ngrams_from(words, 2).each { |bigram| counts[bigram] += 1 }
        end

        repeated = counts.select { |_, count| count > 1 } # Appears more than once
        top_terms(repeated, top_n)
      end

      private

      # Simple tokenization - can be improved with proper NLP tokenizer.
      # A nil document is treated as empty text.
      def tokenize(text)
        text.to_s.downcase.scan(TOKEN_PATTERN).select { |word| valid_word?(word) }
      end

      # A word is kept when its length is within bounds, it is not a stop word
      # and it is not made of digits only.
      def valid_word?(word)
        word.length.between?(@min_word_length, @max_word_length) &&
          !@stop_words.include?(word) &&
          !word.match?(/\A\d+\z/) # Not pure numbers
      end

      # Total occurrences of every token across +documents+.
      def count_terms(documents)
        documents.each_with_object(Hash.new(0)) do |doc, counts|
          tokenize(doc).each { |word| counts[word] += 1 }
        end
      end

      # Number of documents each token occurs in (a token counts once per
      # document). Unknown tokens read as 0.
      def compute_document_frequencies(documents)
        documents.each_with_object(Hash.new(0)) do |doc, frequencies|
          tokenize(doc).uniq.each { |term| frequencies[term] += 1 }
        end
      end

      # log(N / df) with both inputs clamped so the result is finite and
      # non-negative: a term absent from the corpus (df == 0) is treated as
      # occurring in one document, and N is never taken to be below df (which
      # also covers an empty corpus).
      def inverse_document_frequency(document_frequency, total_docs)
        df = [document_frequency, 1].max
        total = [total_docs, df].max
        Math.log(total / df.to_f)
      end

      # Keys of +scores+ ordered by descending value, limited to +top_n+.
      def top_terms(scores, top_n)
        scores.sort_by { |_, score| -score }
              .first(top_n)
              .map(&:first)
      end

      # N-gram extraction for phrases: tokenizes +text+ and joins every run of
      # +n+ consecutive tokens with a single space.
      def extract_ngrams(text, n: 2)
        ngrams_from(tokenize(text), n)
      end

      # Joins each window of +n+ consecutive words; returns [] when there are
      # fewer than +n+ words. Raises ArgumentError when +n+ is not positive.
      def ngrams_from(words, n)
        words.each_cons(n).map { |window| window.join(" ") }
      end
    end
  end
end
@@ -1,117 +0,0 @@
1
module Ragnar
  module TopicModeling
    # A cluster of documents with their embeddings, plus the terms and label
    # assigned to describe it. Centroid and coherence are computed lazily and
    # cached.
    class Topic
      # Number of leading characters of a document shown in #summary.
      SUMMARY_PREVIEW_LENGTH = 101

      attr_reader :id, :document_indices, :documents, :embeddings, :metadata, :terms
      attr_accessor :label

      # @param id [Object] topic identifier
      # @param document_indices [Array<Integer>] positions of the documents in the source corpus
      # @param documents [Array<String>] document texts
      # @param embeddings [Array<Array<Numeric>>] one vector per document
      # @param metadata [Array, nil] optional per-document metadata (defaults to [])
      def initialize(id:, document_indices:, documents:, embeddings:, metadata: nil)
        @id = id
        @document_indices = document_indices
        @documents = documents
        @embeddings = embeddings
        @metadata = metadata || []
        @terms = []
        @label = nil
      end

      # Assigns the topic's terms and drops the cached coherence, which is
      # derived from them.
      def terms=(terms)
        @terms = terms
        @coherence = nil
      end

      # @return [Integer] number of documents in the topic
      def size
        @documents.length
      end

      # @return [Array<Float>] mean of all embeddings ([] when there are none)
      def centroid
        @centroid ||= compute_centroid
      end

      # The +k+ documents whose embeddings lie closest (Euclidean distance) to
      # the centroid; all documents when there are at most +k+.
      #
      # @param k [Integer]
      # @return [Array<String>]
      def representative_docs(k: 3)
        return @documents if @documents.length <= k

        @embeddings.each_with_index
                   .sort_by { |embedding, _| distance_to_centroid(embedding) }
                   .first(k)
                   .map { |_, index| @documents[index] }
      end

      # Coherence of the current terms over the topic's documents, as computed
      # by Metrics.compute_coherence. Cached until the terms change.
      def coherence
        @coherence ||= Metrics.compute_coherence(@terms, @documents)
      end

      # Distinctiveness of this topic relative to +other_topics+, as computed
      # by Metrics.compute_distinctiveness. Not cached: the result depends on
      # the argument, so a single memoized value would be wrong for any later
      # call made with a different set of topics.
      def distinctiveness(other_topics)
        Metrics.compute_distinctiveness(self, other_topics)
      end

      # Same as #terms=; additionally drops the cached centroid, as this
      # method always has.
      def set_terms(terms)
        self.terms = terms
        @centroid = nil
      end

      def set_label(label)
        @label = label
      end

      # Compact, display-oriented description of the topic.
      #
      # @return [Hash] id, label (falls back to "Topic <id>"), size, up to ten
      #   terms, coherence rounded to 3 decimals and previews of the two most
      #   representative documents
      def summary
        {
          id: @id,
          label: @label || "Topic #{@id}",
          size: size,
          terms: @terms.first(10),
          coherence: coherence.round(3),
          representative_docs: representative_docs(k: 2).map { |doc| preview(doc) }
        }
      end

      # Serializable form. Documents and embeddings are not included.
      def to_h
        {
          id: @id,
          label: @label,
          document_indices: @document_indices,
          terms: @terms,
          centroid: centroid,
          size: size,
          coherence: coherence
        }
      end

      # Rebuilds a topic from the output of #to_h. Keys may be symbols or
      # strings. Documents and embeddings are not part of the hash, so they
      # start out empty; the serialized centroid and coherence (when present)
      # are restored as cached values because they can no longer be recomputed.
      def self.from_h(hash)
        read = ->(key) { hash.key?(key) ? hash[key] : hash[key.to_s] }

        topic = new(
          id: read.call(:id),
          document_indices: read.call(:document_indices),
          documents: [],  # Would need to be reconstructed
          embeddings: [], # Would need to be reconstructed
          metadata: []
        )
        topic.set_label(read.call(:label))
        topic.set_terms(read.call(:terms) || [])
        topic.send(:restore_cached_metrics, read.call(:centroid), read.call(:coherence))
        topic
      end

      private

      # Seeds the centroid/coherence caches; nil values leave a cache empty.
      def restore_cached_metrics(centroid, coherence)
        @centroid = centroid if centroid
        @coherence = coherence if coherence
      end

      # First SUMMARY_PREVIEW_LENGTH characters of +doc+, followed by "..."
      # only when something was actually cut off.
      def preview(doc)
        return doc if doc.length <= SUMMARY_PREVIEW_LENGTH

        "#{doc[0, SUMMARY_PREVIEW_LENGTH]}..."
      end

      # Component-wise mean of all embeddings; the dimension is taken from the
      # first embedding.
      def compute_centroid
        return [] if @embeddings.empty?

        sums = Array.new(@embeddings.first.length, 0.0)
        @embeddings.each do |embedding|
          embedding.each_with_index { |value, index| sums[index] += value }
        end

        sums.map { |total| total / @embeddings.length }
      end

      # Euclidean distance
      def distance_to_centroid(embedding)
        Math.sqrt(embedding.zip(centroid).sum { |a, b| (a - b)**2 })
      end
    end
  end
end
@@ -1,61 +0,0 @@
1
- require_relative 'labeling_strategies'
2
-
3
module Ragnar
  module TopicModeling
    # Produces human-readable labels for topics by delegating to a labeling
    # strategy obtained from LabelingStrategies.create, and normalizes the
    # strategy's output into a fixed hash shape.
    class TopicLabeler
      # Keys that normalize_result lifts to the top level; everything else a
      # strategy returns ends up under :metadata.
      NORMALIZED_KEYS = %i[label description method confidence themes].freeze

      attr_reader :strategy

      # @param method [Symbol] name of the default labeling strategy
      # @param llm_client [Object, nil] passed through to LabelingStrategies.create
      def initialize(method: :hybrid, llm_client: nil)
        @method = method
        @llm_client = llm_client
        @strategy = LabelingStrategies.create(method, llm_client: llm_client)
      end

      # Generate a human-readable label for a topic.
      #
      # @param topic [Object, nil] forwarded to the strategy
      # @param terms [Array] forwarded to the strategy
      # @param documents [Array] forwarded to the strategy
      # @param method [Symbol, nil] strategy to use for this call only;
      #   defaults to the labeler's configured strategy
      # @return [Hash] with :label, :description, :method, :confidence,
      #   :themes and :metadata
      def generate_label(topic: nil, terms:, documents: [], method: nil)
        # Allow method override per call
        effective_method = method || @method
        chosen =
          if effective_method == @method
            @strategy
          else
            LabelingStrategies.create(effective_method, llm_client: @llm_client)
          end

        result = chosen.generate_label(
          topic: topic,
          terms: terms,
          documents: documents
        )

        # Ensure we always return a consistent structure
        normalize_result(result, effective_method)
      end

      # Convenience method for simple label string.
      #
      # @return [String] the :label entry of #generate_label's result
      def generate_simple_label(terms:, documents: [], method: nil, topic: nil)
        generate_label(topic: topic, terms: terms, documents: documents, method: method)[:label]
      end

      # Change strategy at runtime
      def set_strategy(method)
        @method = method
        @strategy = LabelingStrategies.create(method, llm_client: @llm_client)
      end

      private

      # Fills in defaults for anything the strategy left out. +method+ is the
      # strategy name actually used for the call, so a per-call override is
      # reported correctly when the strategy does not name itself. A nil
      # result is treated as an empty one.
      def normalize_result(result, method = @method)
        result ||= {}

        {
          label: result[:label] || "Unknown Topic",
          description: result[:description],
          method: result[:method] || method,
          confidence: result[:confidence] || 0.5,
          themes: result[:themes] || [],
          metadata: result.reject { |key, _| NORMALIZED_KEYS.include?(key) }
        }
      end
    end
  end
end