ragnar-cli 0.1.0.pre.2 → 0.1.0.pre.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,131 +0,0 @@
-# Adapter to allow different LLM backends (red-candle, remote APIs, etc.)
-module Ragnar
-  module TopicModeling
-    class LLMAdapter
-      # Factory method to create appropriate LLM client
-      def self.create(type: :auto, **options)
-        case type
-        when :red_candle
-          RedCandleAdapter.new(**options)
-        when :openai
-          # Future: OpenAIAdapter.new(**options)
-          raise NotImplementedError, "OpenAI adapter not yet implemented"
-        when :anthropic
-          # Future: AnthropicAdapter.new(**options)
-          raise NotImplementedError, "Anthropic adapter not yet implemented"
-        when :auto
-          # Try red-candle first, then fall back to others
-          begin
-            RedCandleAdapter.new(**options)
-          rescue LoadError
-            nil # No LLM available
-          end
-        else
-          raise ArgumentError, "Unknown LLM type: #{type}"
-        end
-      end
-    end
-
-    # Adapter for red-candle (local LLMs)
-    class RedCandleAdapter
-      def initialize(model: nil, **options)
-        require 'candle'
-
-        @model = model || default_model
-        @options = options
-        @llm = load_or_create_llm
-      end
-
-      def generate(prompt:, max_tokens: 100, temperature: 0.3, response_format: nil)
-        # Red-candle specific generation
-        response = @llm.generate(
-          prompt,
-          max_length: max_tokens,
-          temperature: temperature,
-          do_sample: temperature > 0
-        )
-
-        # Handle JSON response format if requested
-        if response_format && response_format[:type] == "json_object"
-          ensure_json_response(response)
-        else
-          response
-        end
-      end
-
-      def available?
-        true
-      end
-
-      private
-
-      def default_model
-        # Use a small, fast model by default for topic labeling
-        "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
-      end
-
-      def load_or_create_llm
-        # Check if already loaded in ruby-rag
-        if defined?(Ragnar::LLMManager)
-          begin
-            return Ragnar::LLMManager.instance.get_llm(@model)
-          rescue
-            # Fall through to create new
-          end
-        end
-
-        # Create new LLM instance
-        Candle::Model.new(
-          model_id: @model,
-          model_type: :llama,
-          quantized: true
-        )
-      end
-
-      def ensure_json_response(response)
-        # Try to extract JSON from response
-        begin
-          # Look for JSON-like content
-          json_match = response.match(/\{.*\}/m)
-          if json_match
-            JSON.parse(json_match[0])
-            json_match[0] # Return the JSON string if valid
-          else
-            # Generate a basic JSON response
-            generate_fallback_json(response)
-          end
-        rescue JSON::ParserError
-          generate_fallback_json(response)
-        end
-      end
-
-      def generate_fallback_json(text)
-        # Create a simple JSON from text response
-        label = text.lines.first&.strip || "Unknown"
-        {
-          label: label,
-          description: text,
-          confidence: 0.5
-        }.to_json
-      end
-    end
-
-    # Future adapter for remote LLMs
-    class RemoteAdapter
-      def initialize(api_key:, endpoint:, **options)
-        @api_key = api_key
-        @endpoint = endpoint
-        @options = options
-      end
-
-      def generate(prompt:, max_tokens: 100, temperature: 0.3, response_format: nil)
-        # Make API call
-        raise NotImplementedError, "Remote LLM adapter coming soon"
-      end
-
-      def available?
-        !@api_key.nil?
-      end
-    end
-  end
-end
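
The removed LLMAdapter factory was driven roughly as below; a minimal usage sketch, assuming the gem's Ragnar::TopicModeling namespace is loaded (the prompt and variable names are illustrative, not part of the gem):

    require 'json'   # the adapter calls JSON.parse / #to_json but never requires 'json' itself
    require 'ragnar'

    # :auto tries the optional red-candle backend and returns nil when the
    # 'candle' gem is missing, so callers must handle a nil adapter.
    adapter = Ragnar::TopicModeling::LLMAdapter.create(type: :auto)

    if adapter&.available?
      json = adapter.generate(
        prompt: "Label this topic: ruby, gems, bundler",
        max_tokens: 60,
        response_format: { type: "json_object" } # falls back to stub JSON on parse failure
      )
      puts json
    end
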
@@ -1,186 +0,0 @@
-module Ragnar
-  module TopicModeling
-    module Metrics
-      extend self
-
-      # Compute UMass Coherence for topic quality
-      # Higher coherence = more interpretable topic
-      def compute_coherence(terms, documents, top_n: 10)
-        return 0.0 if terms.empty? || documents.empty?
-
-        # Use top N terms
-        eval_terms = terms.first(top_n)
-        return 0.0 if eval_terms.length < 2
-
-        # Create document term matrix for co-occurrence
-        doc_term_counts = count_cooccurrences(eval_terms, documents)
-
-        # Compute UMass coherence
-        coherence_sum = 0.0
-        pairs_count = 0
-
-        eval_terms.each_with_index do |term_i, i|
-          eval_terms.each_with_index do |term_j, j|
-            next unless j < i # Only upper triangle
-
-            # P(term_i, term_j) = co-occurrence count
-            cooccur = doc_term_counts["#{term_i},#{term_j}"] || 0
-            # P(term_j) = document frequency
-            doc_freq_j = doc_term_counts[term_j] || 0
-
-            if cooccur > 0 && doc_freq_j > 0
-              # UMass: log((cooccur + 1) / doc_freq_j)
-              coherence_sum += Math.log((cooccur + 1.0) / doc_freq_j)
-              pairs_count += 1
-            end
-          end
-        end
-
-        return 0.0 if pairs_count == 0
-
-        # Normalize by number of pairs
-        coherence = coherence_sum / pairs_count
-
-        # Transform to 0-1 range (coherence is typically negative)
-        # More negative = less coherent, so we reverse and bound
-        normalized = 1.0 / (1.0 + Math.exp(-coherence))
-        normalized
-      end
-
-      # Compute how distinct a topic is from others
-      def compute_distinctiveness(topic, other_topics)
-        return 1.0 if other_topics.empty?
-
-        topic_terms = Set.new(topic.terms.first(20))
-
-        # Compare with other topics
-        overlaps = other_topics.map do |other|
-          next if other.id == topic.id
-
-          other_terms = Set.new(other.terms.first(20))
-          overlap = (topic_terms & other_terms).size.to_f
-
-          # Jaccard similarity
-          union_size = (topic_terms | other_terms).size
-          union_size > 0 ? overlap / union_size : 0
-        end.compact
-
-        return 1.0 if overlaps.empty?
-
-        # Distinctiveness = 1 - average overlap
-        1.0 - (overlaps.sum / overlaps.length)
-      end
-
-      # Compute diversity across all topics
-      def compute_diversity(topics)
-        return 0.0 if topics.length < 2
-
-        # Collect all term sets
-        term_sets = topics.map { |t| Set.new(t.terms.first(20)) }
-
-        # Compute pairwise Jaccard distances
-        distances = []
-        term_sets.each_with_index do |set_i, i|
-          term_sets.each_with_index do |set_j, j|
-            next unless j > i # Only upper triangle
-
-            intersection = (set_i & set_j).size.to_f
-            union = (set_i | set_j).size.to_f
-
-            # Jaccard distance = 1 - Jaccard similarity
-            distance = union > 0 ? 1.0 - (intersection / union) : 1.0
-            distances << distance
-          end
-        end
-
-        # Average distance = diversity
-        distances.sum / distances.length
-      end
-
-      # Compute coverage (what fraction of docs are in topics vs outliers)
-      def compute_coverage(topics, total_documents)
-        return 0.0 if total_documents == 0
-
-        docs_in_topics = topics.sum(&:size)
-        docs_in_topics.to_f / total_documents
-      end
-
-      # Silhouette score for cluster quality
-      def compute_silhouette_score(topic, all_topics, embeddings)
-        return 0.0 if topic.embeddings.empty?
-
-        silhouettes = []
-
-        topic.embeddings.each_with_index do |embedding, idx|
-          # a(i) = average distance to other points in same cluster
-          if topic.embeddings.length > 1
-            a_i = topic.embeddings.each_with_index
-                       .reject { |_, j| j == idx }
-                       .map { |other, _| euclidean_distance(embedding, other) }
-                       .sum.to_f / (topic.embeddings.length - 1)
-          else
-            a_i = 0.0
-          end
-
-          # b(i) = minimum average distance to points in other clusters
-          b_values = all_topics.reject { |t| t.id == topic.id }.map do |other_topic|
-            next if other_topic.embeddings.empty?
-
-            avg_dist = other_topic.embeddings
-                                  .map { |other| euclidean_distance(embedding, other) }
-                                  .sum.to_f / other_topic.embeddings.length
-            avg_dist
-          end.compact
-
-          b_i = b_values.min || a_i
-
-          # Silhouette coefficient
-          if a_i == 0 && b_i == 0
-            s_i = 0
-          else
-            s_i = (b_i - a_i) / [a_i, b_i].max
-          end
-
-          silhouettes << s_i
-        end
-
-        # Average silhouette score for topic
-        silhouettes.sum / silhouettes.length
-      end
-
-      private
-
-      def count_cooccurrences(terms, documents)
-        counts = Hash.new(0)
-
-        documents.each do |doc|
-          doc_lower = doc.downcase
-
-          # Count individual term occurrences
-          terms.each do |term|
-            counts[term] += 1 if doc_lower.include?(term.downcase)
-          end
-
-          # Count co-occurrences
-          terms.each_with_index do |term_i, i|
-            terms.each_with_index do |term_j, j|
-              next unless j < i
-
-              if doc_lower.include?(term_i.downcase) && doc_lower.include?(term_j.downcase)
-                counts["#{term_i},#{term_j}"] += 1
-              end
-            end
-          end
-        end
-
-        counts
-      end
-
-      def euclidean_distance(vec1, vec2)
-        Math.sqrt(
-          vec1.zip(vec2).map { |a, b| (a - b) ** 2 }.sum
-        )
-      end
-    end
-  end
-end
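
The removed Metrics module is self-contained; a minimal sketch of calling the coherence score, assuming the module above is loaded (documents invented for illustration):

    require 'set' # the distinctiveness/diversity helpers build Sets from term lists

    docs = [
      "ruby gems are packaged with bundler",
      "bundler resolves ruby gem dependencies",
      "cats sleep most of the day"
    ]

    # UMass-style coherence over the top terms; the raw (typically negative)
    # average log ratio is squashed into (0, 1) by the logistic transform.
    score = Ragnar::TopicModeling::Metrics.compute_coherence(
      %w[ruby gems bundler], docs, top_n: 10
    )
    puts format("coherence: %.3f", score)
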
@@ -1,170 +0,0 @@
-require 'set'
-
-module Ragnar
-  module TopicModeling
-    class TermExtractor
-      # Common English stop words to filter out
-      STOP_WORDS = Set.new(%w[
-        the be to of and a in that have i it for not on with he as you do at
-        this but his by from they we say her she or an will my one all would
-        there their what so up out if about who get which go me when make can
-        like time no just him know take people into year your good some could
-        them see other than then now look only come its over think also back
-        after use two how our work first well way even new want because any
-        these give day most us is was are been has had were said did get may
-      ])
-
-      def initialize(stop_words: STOP_WORDS, min_word_length: 3, max_word_length: 20)
-        @stop_words = stop_words
-        @min_word_length = min_word_length
-        @max_word_length = max_word_length
-      end
-
-      # Extract distinctive terms using c-TF-IDF
-      def extract_distinctive_terms(topic_docs:, all_docs:, top_n: 20)
-        # Tokenize and count terms in topic
-        topic_terms = count_terms(topic_docs)
-
-        # Tokenize and count document frequency across all docs
-        doc_frequencies = compute_document_frequencies(all_docs)
-
-        # Compute c-TF-IDF scores
-        scores = {}
-        total_docs = all_docs.length.to_f
-
-        topic_terms.each do |term, tf|
-          # c-TF-IDF formula: tf * log(N / df)
-          df = doc_frequencies[term] || 1
-          idf = Math.log(total_docs / df)
-          scores[term] = tf * idf
-        end
-
-        # Return top scoring terms
-        scores.sort_by { |_, score| -score }
-              .first(top_n)
-              .map(&:first)
-      end
-
-      # Standard TF-IDF implementation
-      def extract_tfidf_terms(documents:, top_n: 20)
-        # Document frequency
-        doc_frequencies = compute_document_frequencies(documents)
-        total_docs = documents.length.to_f
-
-        # Compute TF-IDF for each document
-        all_scores = []
-
-        documents.each do |doc|
-          terms = count_terms([doc])
-          doc_length = terms.values.sum.to_f
-
-          scores = {}
-          terms.each do |term, count|
-            tf = count / doc_length # Normalized term frequency
-            df = doc_frequencies[term] || 1
-            idf = Math.log(total_docs / df)
-            scores[term] = tf * idf
-          end
-
-          all_scores << scores
-        end
-
-        # Aggregate scores across all documents
-        aggregated = {}
-        all_scores.each do |doc_scores|
-          doc_scores.each do |term, score|
-            aggregated[term] ||= 0
-            aggregated[term] += score
-          end
-        end
-
-        # Return top terms
-        aggregated.sort_by { |_, score| -score }
-                  .first(top_n)
-                  .map(&:first)
-      end
-
-      # Simple term frequency extraction
-      def extract_frequent_terms(documents:, top_n: 20)
-        terms = count_terms(documents)
-        terms.sort_by { |_, count| -count }
-             .first(top_n)
-             .map(&:first)
-      end
-
-      private
-
-      def tokenize(text)
-        # Simple tokenization - can be improved with proper NLP tokenizer
-        text.downcase
-            .split(/\W+/)
-            .select { |word| valid_word?(word) }
-      end
-
-      def valid_word?(word)
-        word.length >= @min_word_length &&
-          word.length <= @max_word_length &&
-          !@stop_words.include?(word) &&
-          !word.match?(/^\d+$/) # Not pure numbers
-      end
-
-      def count_terms(documents)
-        terms = Hash.new(0)
-
-        documents.each do |doc|
-          tokenize(doc).each do |word|
-            terms[word] += 1
-          end
-        end
-
-        terms
-      end
-
-      def compute_document_frequencies(documents)
-        doc_frequencies = Hash.new(0)
-
-        documents.each do |doc|
-          # Use set to count each term once per document
-          unique_terms = Set.new(tokenize(doc))
-          unique_terms.each do |term|
-            doc_frequencies[term] += 1
-          end
-        end
-
-        doc_frequencies
-      end
-
-      # N-gram extraction for phrases
-      def extract_ngrams(text, n: 2)
-        words = tokenize(text)
-        ngrams = []
-
-        (0..words.length - n).each do |i|
-          ngram = words[i, n].join(" ")
-          ngrams << ngram
-        end
-
-        ngrams
-      end
-
-      # Extract both unigrams and bigrams
-      def extract_mixed_terms(documents:, top_n: 20)
-        all_terms = Hash.new(0)
-
-        documents.each do |doc|
-          # Unigrams
-          tokenize(doc).each { |word| all_terms[word] += 1 }
-
-          # Bigrams
-          extract_ngrams(doc, n: 2).each { |bigram| all_terms[bigram] += 1 }
-        end
-
-        # Filter and return top terms
-        all_terms.select { |term, count| count > 1 } # Appears more than once
-                 .sort_by { |_, count| -count }
-                 .first(top_n)
-                 .map(&:first)
-      end
-    end
-  end
-end
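
The c-TF-IDF extraction above scores a term by its frequency inside the topic weighted by log(N / df) over the whole corpus, so words that appear everywhere rank low. A minimal sketch, assuming the class is loaded (documents invented for illustration):

    extractor = Ragnar::TopicModeling::TermExtractor.new

    topic_docs = [
      "bundler installs ruby gems",
      "gem dependencies are resolved by bundler"
    ]
    all_docs = topic_docs + ["cats and dogs", "sunny weather today"]

    # tf * log(N / df): "bundler" is frequent in the topic but absent from
    # the rest of the corpus, so it scores near the top.
    terms = extractor.extract_distinctive_terms(
      topic_docs: topic_docs, all_docs: all_docs, top_n: 5
    )
    p terms # => e.g. ["bundler", ...]
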
@@ -1,117 +0,0 @@
-module Ragnar
-  module TopicModeling
-    class Topic
-      attr_reader :id, :document_indices, :documents, :embeddings, :metadata
-      attr_accessor :terms, :label
-
-      def initialize(id:, document_indices:, documents:, embeddings:, metadata: nil)
-        @id = id
-        @document_indices = document_indices
-        @documents = documents
-        @embeddings = embeddings
-        @metadata = metadata || []
-        @terms = []
-        @label = nil
-      end
-
-      def size
-        @documents.length
-      end
-
-      def centroid
-        @centroid ||= compute_centroid
-      end
-
-      def representative_docs(k: 3)
-        return @documents if @documents.length <= k
-
-        # Find documents closest to centroid
-        distances = @embeddings.map do |embedding|
-          distance_to_centroid(embedding)
-        end
-
-        # Get indices of k smallest distances
-        top_indices = distances.each_with_index.sort_by(&:first).first(k).map(&:last)
-        top_indices.map { |i| @documents[i] }
-      end
-
-      def coherence
-        @coherence ||= Metrics.compute_coherence(@terms, @documents)
-      end
-
-      def distinctiveness(other_topics)
-        @distinctiveness ||= Metrics.compute_distinctiveness(self, other_topics)
-      end
-
-      def set_terms(terms)
-        @terms = terms
-        @centroid = nil # Reset centroid cache
-      end
-
-      def set_label(label)
-        @label = label
-      end
-
-      def summary
-        {
-          id: @id,
-          label: @label || "Topic #{@id}",
-          size: size,
-          terms: @terms.first(10),
-          coherence: coherence.round(3),
-          representative_docs: representative_docs(k: 2).map { |d| d[0..100] + "..." }
-        }
-      end
-
-      def to_h
-        {
-          id: @id,
-          label: @label,
-          document_indices: @document_indices,
-          terms: @terms,
-          centroid: centroid,
-          size: size,
-          coherence: coherence
-        }
-      end
-
-      def self.from_h(hash)
-        topic = new(
-          id: hash[:id],
-          document_indices: hash[:document_indices],
-          documents: [], # Would need to be reconstructed
-          embeddings: [], # Would need to be reconstructed
-          metadata: []
-        )
-        topic.set_label(hash[:label])
-        topic.set_terms(hash[:terms])
-        topic
-      end
-
-      private
-
-      def compute_centroid
-        return [] if @embeddings.empty?
-
-        # Compute mean of all embeddings
-        dim = @embeddings.first.length
-        centroid = Array.new(dim, 0.0)
-
-        @embeddings.each do |embedding|
-          embedding.each_with_index do |val, idx|
-            centroid[idx] += val
-          end
-        end
-
-        centroid.map { |val| val / @embeddings.length }
-      end
-
-      def distance_to_centroid(embedding)
-        # Euclidean distance
-        Math.sqrt(
-          embedding.zip(centroid).map { |a, b| (a - b) ** 2 }.sum
-        )
-      end
-    end
-  end
-end
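
Topic is a plain value object whose centroid is the element-wise mean of its embeddings; a minimal construction sketch (ids, documents, and 2-dimensional embeddings invented for illustration):

    topic = Ragnar::TopicModeling::Topic.new(
      id: 0,
      document_indices: [0, 1],
      documents: ["ruby gems and bundler", "bundler resolves gem dependencies"],
      embeddings: [[0.1, 0.2], [0.2, 0.1]]
    )
    topic.set_terms(%w[bundler gems ruby])
    topic.set_label("Ruby packaging")

    p topic.centroid     # element-wise mean, approximately [0.15, 0.15]
    p topic.to_h[:label] # => "Ruby packaging"
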
@@ -1,61 +0,0 @@
-require_relative 'labeling_strategies'
-
-module Ragnar
-  module TopicModeling
-    class TopicLabeler
-      attr_reader :strategy
-
-      def initialize(method: :hybrid, llm_client: nil)
-        @method = method
-        @llm_client = llm_client
-        @strategy = LabelingStrategies.create(method, llm_client: llm_client)
-      end
-
-      # Generate a human-readable label for a topic
-      # Returns a hash with label, description, and metadata
-      def generate_label(topic: nil, terms:, documents: [], method: nil)
-        # Allow method override per call
-        if method && method != @method
-          strategy = LabelingStrategies.create(method, llm_client: @llm_client)
-        else
-          strategy = @strategy
-        end
-
-        # Generate label using selected strategy
-        result = strategy.generate_label(
-          topic: topic,
-          terms: terms,
-          documents: documents
-        )
-
-        # Ensure we always return a consistent structure
-        normalize_result(result)
-      end
-
-      # Convenience method for simple label string
-      def generate_simple_label(terms:, documents: [], method: nil)
-        result = generate_label(terms: terms, documents: documents, method: method)
-        result[:label]
-      end
-
-      # Change strategy at runtime
-      def set_strategy(method)
-        @method = method
-        @strategy = LabelingStrategies.create(method, llm_client: @llm_client)
-      end
-
-      private
-
-      def normalize_result(result)
-        {
-          label: result[:label] || "Unknown Topic",
-          description: result[:description] || nil,
-          method: result[:method] || @method,
-          confidence: result[:confidence] || 0.5,
-          themes: result[:themes] || [],
-          metadata: result.reject { |k, _| [:label, :description, :method, :confidence, :themes].include?(k) }
-        }
-      end
-    end
-  end
-end
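
LabelingStrategies is defined in labeling_strategies.rb, which is not part of this diff; the sketch below assumes it still provides the :hybrid strategy that the constructor defaults to:

    labeler = Ragnar::TopicModeling::TopicLabeler.new(method: :hybrid)

    result = labeler.generate_label(
      terms: %w[ruby gems bundler],
      documents: ["bundler installs ruby gems"]
    )
    puts result[:label]      # normalized: never nil, defaults to "Unknown Topic"
    puts result[:confidence] # defaults to 0.5 when the strategy omits it

    # Or just the string:
    puts labeler.generate_simple_label(terms: %w[ruby gems bundler])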