ragnar-cli 0.1.0.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,170 @@
+ require 'set'
+
+ module Ragnar
+   module TopicModeling
+     class TermExtractor
+       # Common English stop words to filter out
+       STOP_WORDS = Set.new(%w[
+         the be to of and a in that have i it for not on with he as you do at
+         this but his by from they we say her she or an will my one all would
+         there their what so up out if about who get which go me when make can
+         like time no just him know take people into year your good some could
+         them see other than then now look only come its over think also back
+         after use two how our work first well way even new want because any
+         these give day most us is was are been has had were said did get may
+       ])
+
+       def initialize(stop_words: STOP_WORDS, min_word_length: 3, max_word_length: 20)
+         @stop_words = stop_words
+         @min_word_length = min_word_length
+         @max_word_length = max_word_length
+       end
+
+       # Extract distinctive terms using c-TF-IDF
+       def extract_distinctive_terms(topic_docs:, all_docs:, top_n: 20)
+         # Tokenize and count terms in topic
+         topic_terms = count_terms(topic_docs)
+
+         # Tokenize and count document frequency across all docs
+         doc_frequencies = compute_document_frequencies(all_docs)
+
+         # Compute c-TF-IDF scores
+         scores = {}
+         total_docs = all_docs.length.to_f
+
+         topic_terms.each do |term, tf|
+           # c-TF-IDF formula: tf * log(N / df)
+           df = doc_frequencies[term] || 1
+           idf = Math.log(total_docs / df)
+           scores[term] = tf * idf
+         end
+
+         # Return top scoring terms
+         scores.sort_by { |_, score| -score }
+               .first(top_n)
+               .map(&:first)
+       end
+
+       # Standard TF-IDF implementation
+       def extract_tfidf_terms(documents:, top_n: 20)
+         # Document frequency
+         doc_frequencies = compute_document_frequencies(documents)
+         total_docs = documents.length.to_f
+
+         # Compute TF-IDF for each document
+         all_scores = []
+
+         documents.each do |doc|
+           terms = count_terms([doc])
+           doc_length = terms.values.sum.to_f
+
+           scores = {}
+           terms.each do |term, count|
+             tf = count / doc_length # Normalized term frequency
+             df = doc_frequencies[term] || 1
+             idf = Math.log(total_docs / df)
+             scores[term] = tf * idf
+           end
+
+           all_scores << scores
+         end
+
+         # Aggregate scores across all documents
+         aggregated = {}
+         all_scores.each do |doc_scores|
+           doc_scores.each do |term, score|
+             aggregated[term] ||= 0
+             aggregated[term] += score
+           end
+         end
+
+         # Return top terms
+         aggregated.sort_by { |_, score| -score }
+                   .first(top_n)
+                   .map(&:first)
+       end
+
+       # Simple term frequency extraction
+       def extract_frequent_terms(documents:, top_n: 20)
+         terms = count_terms(documents)
+         terms.sort_by { |_, count| -count }
+              .first(top_n)
+              .map(&:first)
+       end
+
+       private
+
+       def tokenize(text)
+         # Simple tokenization - can be improved with proper NLP tokenizer
+         text.downcase
+             .split(/\W+/)
+             .select { |word| valid_word?(word) }
+       end
+
+       def valid_word?(word)
+         word.length >= @min_word_length &&
+           word.length <= @max_word_length &&
+           !@stop_words.include?(word) &&
+           !word.match?(/^\d+$/) # Not pure numbers
+       end
+
+       def count_terms(documents)
+         terms = Hash.new(0)
+
+         documents.each do |doc|
+           tokenize(doc).each do |word|
+             terms[word] += 1
+           end
+         end
+
+         terms
+       end
+
+       def compute_document_frequencies(documents)
+         doc_frequencies = Hash.new(0)
+
+         documents.each do |doc|
+           # Use set to count each term once per document
+           unique_terms = Set.new(tokenize(doc))
+           unique_terms.each do |term|
+             doc_frequencies[term] += 1
+           end
+         end
+
+         doc_frequencies
+       end
+
+       # N-gram extraction for phrases
+       def extract_ngrams(text, n: 2)
+         words = tokenize(text)
+         ngrams = []
+
+         (0..words.length - n).each do |i|
+           ngram = words[i, n].join(" ")
+           ngrams << ngram
+         end
+
+         ngrams
+       end
+
+       # Extract both unigrams and bigrams
+       def extract_mixed_terms(documents:, top_n: 20)
+         all_terms = Hash.new(0)
+
+         documents.each do |doc|
+           # Unigrams
+           tokenize(doc).each { |word| all_terms[word] += 1 }
+
+           # Bigrams
+           extract_ngrams(doc, n: 2).each { |bigram| all_terms[bigram] += 1 }
+         end
+
+         # Filter and return top terms
+         all_terms.select { |term, count| count > 1 } # Appears more than once
+                  .sort_by { |_, count| -count }
+                  .first(top_n)
+                  .map(&:first)
+       end
+     end
+   end
+ end
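The c-TF-IDF scoring above weights a term's frequency within a topic's documents by log(total_docs / document_frequency), so terms that are common in the topic but rare across the corpus rank highest. A minimal usage sketch (the sample documents and the expected output are illustrative, and it assumes the gem's files are already loaded):

    extractor = Ragnar::TopicModeling::TermExtractor.new

    topic_docs = [
      "Neural networks learn representations from training data",
      "Gradient descent tunes network weights during training"
    ]
    all_docs = topic_docs + [
      "The recipe calls for flour, sugar and butter",
      "Preheat the oven before baking the bread"
    ]

    # score(term) = tf_in_topic * log(total_docs / doc_frequency)
    extractor.extract_distinctive_terms(topic_docs: topic_docs, all_docs: all_docs, top_n: 5)
    # => e.g. ["training", "network", "learn", ...]
    # Stop words, pure numbers, and words shorter than three characters are
    # filtered out by valid_word? before counting.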
@@ -0,0 +1,117 @@
+ module Ragnar
+   module TopicModeling
+     class Topic
+       attr_reader :id, :document_indices, :documents, :embeddings, :metadata
+       attr_accessor :terms, :label
+
+       def initialize(id:, document_indices:, documents:, embeddings:, metadata: nil)
+         @id = id
+         @document_indices = document_indices
+         @documents = documents
+         @embeddings = embeddings
+         @metadata = metadata || []
+         @terms = []
+         @label = nil
+       end
+
+       def size
+         @documents.length
+       end
+
+       def centroid
+         @centroid ||= compute_centroid
+       end
+
+       def representative_docs(k: 3)
+         return @documents if @documents.length <= k
+
+         # Find documents closest to centroid
+         distances = @embeddings.map do |embedding|
+           distance_to_centroid(embedding)
+         end
+
+         # Get indices of k smallest distances
+         top_indices = distances.each_with_index.sort_by(&:first).first(k).map(&:last)
+         top_indices.map { |i| @documents[i] }
+       end
+
+       def coherence
+         @coherence ||= Metrics.compute_coherence(@terms, @documents)
+       end
+
+       def distinctiveness(other_topics)
+         @distinctiveness ||= Metrics.compute_distinctiveness(self, other_topics)
+       end
+
+       def set_terms(terms)
+         @terms = terms
+         @coherence = nil # Terms changed, so the cached coherence must be recomputed
+       end
+
+       def set_label(label)
+         @label = label
+       end
+
+       def summary
+         {
+           id: @id,
+           label: @label || "Topic #{@id}",
+           size: size,
+           terms: @terms.first(10),
+           coherence: coherence.round(3),
+           representative_docs: representative_docs(k: 2).map { |d| d[0..100] + "..." }
+         }
+       end
+
+       def to_h
+         {
+           id: @id,
+           label: @label,
+           document_indices: @document_indices,
+           terms: @terms,
+           centroid: centroid,
+           size: size,
+           coherence: coherence
+         }
+       end
+
+       def self.from_h(hash)
+         topic = new(
+           id: hash[:id],
+           document_indices: hash[:document_indices],
+           documents: [], # Would need to be reconstructed
+           embeddings: [], # Would need to be reconstructed
+           metadata: []
+         )
+         topic.set_label(hash[:label])
+         topic.set_terms(hash[:terms])
+         topic
+       end
+
+       private
+
+       def compute_centroid
+         return [] if @embeddings.empty?
+
+         # Compute mean of all embeddings
+         dim = @embeddings.first.length
+         centroid = Array.new(dim, 0.0)
+
+         @embeddings.each do |embedding|
+           embedding.each_with_index do |val, idx|
+             centroid[idx] += val
+           end
+         end
+
+         centroid.map { |val| val / @embeddings.length }
+       end
+
+       def distance_to_centroid(embedding)
+         # Euclidean distance
+         Math.sqrt(
+           embedding.zip(centroid).map { |a, b| (a - b) ** 2 }.sum
+         )
+       end
+     end
+   end
+ end
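Topic#centroid above is the element-wise mean of the member embeddings, and representative_docs returns the k members closest to that centroid by Euclidean distance. A small sketch with hypothetical two-dimensional embeddings (coherence and distinctiveness are not exercised here because they depend on the Metrics class, which is outside this diff):

    topic = Ragnar::TopicModeling::Topic.new(
      id: 0,
      document_indices: [0, 1, 2],
      documents: ["doc about cats", "doc about dogs", "doc about markets"],
      embeddings: [[0.9, 0.1], [0.8, 0.2], [0.1, 0.9]]
    )

    topic.centroid                   # => [0.6, 0.4]  (per-dimension mean)
    topic.representative_docs(k: 1)  # => ["doc about dogs"]  ([0.8, 0.2] lies nearest the centroid)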
@@ -0,0 +1,61 @@
+ require_relative 'labeling_strategies'
+
+ module Ragnar
+   module TopicModeling
+     class TopicLabeler
+       attr_reader :strategy
+
+       def initialize(method: :hybrid, llm_client: nil)
+         @method = method
+         @llm_client = llm_client
+         @strategy = LabelingStrategies.create(method, llm_client: llm_client)
+       end
+
+       # Generate a human-readable label for a topic
+       # Returns a hash with label, description, and metadata
+       def generate_label(topic: nil, terms:, documents: [], method: nil)
+         # Allow method override per call
+         if method && method != @method
+           strategy = LabelingStrategies.create(method, llm_client: @llm_client)
+         else
+           strategy = @strategy
+         end
+
+         # Generate label using selected strategy
+         result = strategy.generate_label(
+           topic: topic,
+           terms: terms,
+           documents: documents
+         )
+
+         # Ensure we always return a consistent structure
+         normalize_result(result)
+       end
+
+       # Convenience method for simple label string
+       def generate_simple_label(terms:, documents: [], method: nil)
+         result = generate_label(terms: terms, documents: documents, method: method)
+         result[:label]
+       end
+
+       # Change strategy at runtime
+       def set_strategy(method)
+         @method = method
+         @strategy = LabelingStrategies.create(method, llm_client: @llm_client)
+       end
+
+       private
+
+       def normalize_result(result)
+         {
+           label: result[:label] || "Unknown Topic",
+           description: result[:description],
+           method: result[:method] || @method,
+           confidence: result[:confidence] || 0.5,
+           themes: result[:themes] || [],
+           metadata: result.reject { |k, _| [:label, :description, :method, :confidence, :themes].include?(k) }
+         }
+       end
+     end
+   end
+ end
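TopicLabeler delegates to a strategy object and then normalizes whatever that strategy returns into a fixed hash shape. A hedged sketch of the calling convention (LabelingStrategies and the behaviour of the default :hybrid strategy are not part of this diff, so the concrete values shown are assumptions):

    labeler = Ragnar::TopicModeling::TopicLabeler.new   # default method: :hybrid

    result = labeler.generate_label(
      terms: ["training", "network", "gradient"],
      documents: ["Neural networks learn from training data"]
    )

    # normalize_result guarantees these keys whatever the strategy produced:
    result[:label]        # a String, "Unknown Topic" if the strategy returned none
    result[:confidence]   # defaults to 0.5
    result[:themes]       # defaults to []
    result[:metadata]     # any extra keys the strategy returned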
@@ -0,0 +1,24 @@
+ # Main entry point for topic modeling functionality
+ # Designed for future extraction into a separate gem
+
+ require_relative 'topic_modeling/topic'
+ require_relative 'topic_modeling/term_extractor'
+ require_relative 'topic_modeling/metrics'
+ require_relative 'topic_modeling/topic_labeler'
+ require_relative 'topic_modeling/engine'
+
+ module Ragnar
+   module TopicModeling
+
+     # Convenience method to create a new topic modeling engine
+     def self.new(**options)
+       Engine.new(**options)
+     end
+
+     # Extract topics from embeddings and documents (simple interface)
+     def self.extract(embeddings:, documents:, **options)
+       engine = Engine.new(**options)
+       engine.fit(embeddings: embeddings, documents: documents)
+     end
+   end
+ end
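The module methods above are thin wrappers around Engine, which is not included in this diff. Assuming Engine#fit returns the fitted topics, the simple interface would be used roughly like this (documents and embeddings are toy values):

    documents  = ["cats purr", "dogs bark", "stocks fell", "markets rallied"]
    embeddings = [[0.9, 0.1], [0.8, 0.2], [0.1, 0.9], [0.2, 0.8]]  # normally produced by an embedding model

    topics = Ragnar::TopicModeling.extract(embeddings: embeddings, documents: documents)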
@@ -0,0 +1,228 @@
+ require 'json'
+
+ module Ragnar
+   class UmapProcessor
+     attr_reader :database, :model_path
+
+     def initialize(db_path: Ragnar::DEFAULT_DB_PATH, model_path: "umap_model.bin")
+       @database = Database.new(db_path)
+       @model_path = model_path
+       @umap_model = nil
+     end
+
+     def train(n_components: Ragnar::DEFAULT_REDUCED_DIMENSIONS, n_neighbors: 15, min_dist: 0.1)
+       puts "Loading embeddings from database..."
+
+       # Get all embeddings
+       docs = @database.get_embeddings
+
+       if docs.empty?
+         raise "No embeddings found in database. Please index some documents first."
+       end
+
+       embeddings = docs.map { |d| d[:embedding] }.compact
+
+       if embeddings.empty?
+         raise "No valid embeddings found in database."
+       end
+
+       puts "Found #{embeddings.size} embeddings"
+
+       # Adjust parameters based on the number of samples
+       # UMAP requires n_neighbors < n_samples
+       # Also, n_components should be less than n_samples for stability
+       n_samples = embeddings.size
+
+       if n_neighbors >= n_samples
+         original_neighbors = n_neighbors
+         n_neighbors = [3, (n_samples - 1) / 2].max.to_i
+         puts " Adjusted n_neighbors to #{n_neighbors} (was #{original_neighbors}, but only have #{n_samples} samples)"
+       end
+
+       if n_components >= n_samples
+         original_components = n_components
+         n_components = [2, n_samples - 1].min
+         puts " Adjusted n_components to #{n_components} (was #{original_components}, but only have #{n_samples} samples)"
+       end
+
+       # Warn if we have very few samples
+       if n_samples < 100
+         puts "\n ⚠️ Warning: UMAP works best with at least 100 samples."
+         puts " You currently have #{n_samples} samples."
+         puts " Consider indexing more documents for better results."
+       end
+
+       # Convert to matrix format for ClusterKit
+       # ClusterKit expects a 2D array or Numo::NArray
+       embedding_matrix = embeddings
+       original_dims = embeddings.first.size
+
+       puts "\nTraining UMAP model..."
+       puts " Original dimensions: #{original_dims}"
+       puts " Target dimensions: #{n_components}"
+       puts " Neighbors: #{n_neighbors}"
+       puts " Min distance: #{min_dist}"
+
+       # Use the simple ClusterKit.umap method
+       progressbar = TTY::ProgressBar.new(
+         "Training UMAP [:bar] :percent",
+         total: 100,
+         bar_format: :block,
+         width: 30
+       )
+
+       # Start progress in background (ClusterKit doesn't provide callbacks)
+       progress_thread = Thread.new do
+         100.times do
+           sleep(0.05)
+           progressbar.advance
+           break if @training_complete
+         end
+       end
+
+       # Perform the actual training using the class-based API
+       @umap_instance = ClusterKit::Dimensionality::UMAP.new(
+         n_components: n_components,
+         n_neighbors: n_neighbors
+       )
+
+       @reduced_embeddings = @umap_instance.fit_transform(embedding_matrix)
+
+       @training_complete = true
+       progress_thread.join
+       progressbar.finish
+
+       # Store the parameters for saving
+       @model_params = {
+         n_components: n_components,
+         n_neighbors: n_neighbors,
+         min_dist: min_dist
+       }
+
+       # Save the model
+       save_model
+
+       {
+         embeddings_count: embeddings.size,
+         original_dims: original_dims,
+         reduced_dims: n_components
+       }
+     end
+
+     def apply(batch_size: 100)
+       # Load the trained UMAP model (reduced embeddings)
+       reduced_embeddings = load_model
+
+       puts "Applying saved UMAP embeddings to database..."
+
+       # Get all embeddings from database
+       all_docs = @database.get_embeddings
+
+       if all_docs.empty?
+         puts "No embeddings found in database."
+         return {
+           processed: 0,
+           skipped: 0,
+           errors: 0
+         }
+       end
+
+       puts "Found #{all_docs.size} documents in database"
+       puts "Loaded #{reduced_embeddings.size} reduced embeddings from model"
+
+       if all_docs.size != reduced_embeddings.size
+         puts "⚠️ Warning: Mismatch between database documents (#{all_docs.size}) and model embeddings (#{reduced_embeddings.size})"
+         puts " This suggests the model was trained on a different dataset."
+         puts " Please retrain the UMAP model after indexing all your documents."
+         return {
+           processed: 0,
+           skipped: 0,
+           errors: 1
+         }
+       end
+
+       # Prepare updates - match document IDs to reduced embeddings
+       updates = all_docs.each_with_index.map do |doc, idx|
+         {
+           id: doc[:id],
+           reduced_embedding: reduced_embeddings[idx]
+         }
+       end
+
+       puts "Updating database with reduced embeddings..."
+       @database.update_reduced_embeddings(updates)
+
+       {
+         processed: updates.size,
+         skipped: 0,
+         errors: 0
+       }
+     end
+
+     private
+
+     def process_batch(docs)
+       # Extract embeddings
+       embeddings = docs.map { |d| d[:embedding] }
+
+       # Transform using the loaded UMAP model
+       # The transform method returns a 2D array where each row is a reduced embedding
+       reduced = load_umap_model.transform(embeddings)
+
+       # Prepare updates
+       updates = docs.each_with_index.map do |doc, idx|
+         {
+           id: doc[:id],
+           reduced_embedding: reduced[idx]
+         }
+       end
+
+       # Update database
+       @database.update_reduced_embeddings(updates)
+     end
+
+     def save_model
+       return unless @umap_instance && @reduced_embeddings
+
+       # Save the trained UMAP model for transforming new queries
+       @umap_instance.save_model(@model_path)
+       puts "UMAP model saved to: #{@model_path}"
+
+       # Also cache the reduced embeddings separately for the apply method
+       embeddings_path = @model_path.sub(/\.bin$/, '_embeddings.json')
+       ClusterKit::Dimensionality::UMAP.save_data(@reduced_embeddings, embeddings_path)
+       puts "Reduced embeddings cached to: #{embeddings_path}"
+     end
+
+     def load_model
+       return @reduced_embeddings if @reduced_embeddings
+
+       # For the apply method, we need the pre-computed embeddings
+       embeddings_path = @model_path.sub(/\.bin$/, '_embeddings.json')
+       unless File.exist?(embeddings_path)
+         raise "Cached embeddings not found at #{embeddings_path}. Please train a model first."
+       end
+
+       @reduced_embeddings = ClusterKit::Dimensionality::UMAP.load_data(embeddings_path)
+       puts "Cached embeddings loaded from: #{embeddings_path}"
+       @reduced_embeddings
+     end
+
+     def load_umap_model
+       # Load the actual UMAP model for transforming new data
+       unless File.exist?(@model_path)
+         raise "UMAP model not found at #{@model_path}. Please train a model first."
+       end
+
+       @umap_instance ||= ClusterKit::Dimensionality::UMAP.load_model(@model_path)
+       puts "UMAP model loaded from: #{@model_path}"
+       @umap_instance
+     end
+
+     def self.optimal_dimensions(original_dims, target_ratio: 0.1)
+       # Suggest optimal number of dimensions for reduction
+       # Common heuristic: reduce to 10% of original dimensions
+       # but keep at least 50 dimensions for good quality
+       suggested = (original_dims * target_ratio).to_i
+       [suggested, 50].max
+     end
+   end
+ end
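The intended flow for UmapProcessor is to train once over everything stored in the database, then apply writes the cached reduced vectors back to the matching rows (apply refuses to run if the document count no longer matches the trained model). A sketch of that flow, assuming a database that has already been indexed (the option values are illustrative):

    processor = Ragnar::UmapProcessor.new(model_path: "umap_model.bin")

    stats = processor.train(n_components: 2, n_neighbors: 15, min_dist: 0.1)
    # => { embeddings_count: ..., original_dims: ..., reduced_dims: 2 }

    processor.apply
    # => { processed: ..., skipped: 0, errors: 0 }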