ragnar-cli 0.1.0.pre.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +439 -0
- data/exe/ragnar +6 -0
- data/lib/ragnar/chunker.rb +97 -0
- data/lib/ragnar/cli.rb +542 -0
- data/lib/ragnar/context_repacker.rb +121 -0
- data/lib/ragnar/database.rb +267 -0
- data/lib/ragnar/embedder.rb +137 -0
- data/lib/ragnar/indexer.rb +234 -0
- data/lib/ragnar/llm_manager.rb +43 -0
- data/lib/ragnar/query_processor.rb +398 -0
- data/lib/ragnar/query_rewriter.rb +75 -0
- data/lib/ragnar/topic_modeling/engine.rb +221 -0
- data/lib/ragnar/topic_modeling/labeling_strategies.rb +300 -0
- data/lib/ragnar/topic_modeling/llm_adapter.rb +131 -0
- data/lib/ragnar/topic_modeling/metrics.rb +186 -0
- data/lib/ragnar/topic_modeling/term_extractor.rb +170 -0
- data/lib/ragnar/topic_modeling/topic.rb +117 -0
- data/lib/ragnar/topic_modeling/topic_labeler.rb +61 -0
- data/lib/ragnar/topic_modeling.rb +24 -0
- data/lib/ragnar/umap_processor.rb +228 -0
- data/lib/ragnar/umap_transform_service.rb +124 -0
- data/lib/ragnar/version.rb +5 -0
- data/lib/ragnar.rb +36 -0
- data/lib/ragnar_cli.rb +2 -0
- metadata +234 -0
data/lib/ragnar/topic_modeling/term_extractor.rb

@@ -0,0 +1,170 @@

```ruby
require 'set'

module Ragnar
  module TopicModeling
    class TermExtractor
      # Common English stop words to filter out
      STOP_WORDS = Set.new(%w[
        the be to of and a in that have i it for not on with he as you do at
        this but his by from they we say her she or an will my one all would
        there their what so up out if about who get which go me when make can
        like time no just him know take people into year your good some could
        them see other than then now look only come its over think also back
        after use two how our work first well way even new want because any
        these give day most us is was are been has had were said did get may
      ])

      def initialize(stop_words: STOP_WORDS, min_word_length: 3, max_word_length: 20)
        @stop_words = stop_words
        @min_word_length = min_word_length
        @max_word_length = max_word_length
      end

      # Extract distinctive terms using c-TF-IDF
      def extract_distinctive_terms(topic_docs:, all_docs:, top_n: 20)
        # Tokenize and count terms in topic
        topic_terms = count_terms(topic_docs)

        # Tokenize and count document frequency across all docs
        doc_frequencies = compute_document_frequencies(all_docs)

        # Compute c-TF-IDF scores
        scores = {}
        total_docs = all_docs.length.to_f

        topic_terms.each do |term, tf|
          # c-TF-IDF formula: tf * log(N / df)
          df = doc_frequencies[term] || 1
          idf = Math.log(total_docs / df)
          scores[term] = tf * idf
        end

        # Return top scoring terms
        scores.sort_by { |_, score| -score }
              .first(top_n)
              .map(&:first)
      end

      # Standard TF-IDF implementation
      def extract_tfidf_terms(documents:, top_n: 20)
        # Document frequency
        doc_frequencies = compute_document_frequencies(documents)
        total_docs = documents.length.to_f

        # Compute TF-IDF for each document
        all_scores = []

        documents.each do |doc|
          terms = count_terms([doc])
          doc_length = terms.values.sum.to_f

          scores = {}
          terms.each do |term, count|
            tf = count / doc_length # Normalized term frequency
            df = doc_frequencies[term] || 1
            idf = Math.log(total_docs / df)
            scores[term] = tf * idf
          end

          all_scores << scores
        end

        # Aggregate scores across all documents
        aggregated = {}
        all_scores.each do |doc_scores|
          doc_scores.each do |term, score|
            aggregated[term] ||= 0
            aggregated[term] += score
          end
        end

        # Return top terms
        aggregated.sort_by { |_, score| -score }
                  .first(top_n)
                  .map(&:first)
      end

      # Simple term frequency extraction
      def extract_frequent_terms(documents:, top_n: 20)
        terms = count_terms(documents)
        terms.sort_by { |_, count| -count }
             .first(top_n)
             .map(&:first)
      end

      private

      def tokenize(text)
        # Simple tokenization - can be improved with proper NLP tokenizer
        text.downcase
            .split(/\W+/)
            .select { |word| valid_word?(word) }
      end

      def valid_word?(word)
        word.length >= @min_word_length &&
          word.length <= @max_word_length &&
          !@stop_words.include?(word) &&
          !word.match?(/^\d+$/) # Not pure numbers
      end

      def count_terms(documents)
        terms = Hash.new(0)

        documents.each do |doc|
          tokenize(doc).each do |word|
            terms[word] += 1
          end
        end

        terms
      end

      def compute_document_frequencies(documents)
        doc_frequencies = Hash.new(0)

        documents.each do |doc|
          # Use set to count each term once per document
          unique_terms = Set.new(tokenize(doc))
          unique_terms.each do |term|
            doc_frequencies[term] += 1
          end
        end

        doc_frequencies
      end

      # N-gram extraction for phrases
      def extract_ngrams(text, n: 2)
        words = tokenize(text)
        ngrams = []

        (0..words.length - n).each do |i|
          ngram = words[i, n].join(" ")
          ngrams << ngram
        end

        ngrams
      end

      # Extract both unigrams and bigrams
      def extract_mixed_terms(documents:, top_n: 20)
        all_terms = Hash.new(0)

        documents.each do |doc|
          # Unigrams
          tokenize(doc).each { |word| all_terms[word] += 1 }

          # Bigrams
          extract_ngrams(doc, n: 2).each { |bigram| all_terms[bigram] += 1 }
        end

        # Filter and return top terms
        all_terms.select { |term, count| count > 1 } # Appears more than once
                 .sort_by { |_, count| -count }
                 .first(top_n)
                 .map(&:first)
      end
    end
  end
end
```
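A minimal usage sketch of the extractor above, assuming the gem's top-level `require 'ragnar'` loads the topic modeling files; the sample documents and variable names are invented for illustration:

```ruby
require 'ragnar'

extractor = Ragnar::TopicModeling::TermExtractor.new(min_word_length: 3)

all_docs = [
  "Ruby gems package reusable libraries for Ruby projects",
  "UMAP reduces high dimensional embeddings to a few dimensions",
  "Topic modeling groups documents that share distinctive terms"
]
topic_docs = all_docs.last(1) # pretend the last doc forms one topic cluster

# c-TF-IDF: term frequency inside the topic, weighted by rarity across all docs
puts extractor.extract_distinctive_terms(topic_docs: topic_docs,
                                         all_docs: all_docs,
                                         top_n: 5).inspect

# Plain frequency ranking over the whole corpus, for comparison
puts extractor.extract_frequent_terms(documents: all_docs, top_n: 5).inspect
```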
data/lib/ragnar/topic_modeling/topic.rb

@@ -0,0 +1,117 @@

```ruby
module Ragnar
  module TopicModeling
    class Topic
      attr_reader :id, :document_indices, :documents, :embeddings, :metadata
      attr_accessor :terms, :label

      def initialize(id:, document_indices:, documents:, embeddings:, metadata: nil)
        @id = id
        @document_indices = document_indices
        @documents = documents
        @embeddings = embeddings
        @metadata = metadata || []
        @terms = []
        @label = nil
      end

      def size
        @documents.length
      end

      def centroid
        @centroid ||= compute_centroid
      end

      def representative_docs(k: 3)
        return @documents if @documents.length <= k

        # Find documents closest to centroid
        distances = @embeddings.map do |embedding|
          distance_to_centroid(embedding)
        end

        # Get indices of k smallest distances
        top_indices = distances.each_with_index.sort_by(&:first).first(k).map(&:last)
        top_indices.map { |i| @documents[i] }
      end

      def coherence
        @coherence ||= Metrics.compute_coherence(@terms, @documents)
      end

      def distinctiveness(other_topics)
        @distinctiveness ||= Metrics.compute_distinctiveness(self, other_topics)
      end

      def set_terms(terms)
        @terms = terms
        @centroid = nil # Reset centroid cache
      end

      def set_label(label)
        @label = label
      end

      def summary
        {
          id: @id,
          label: @label || "Topic #{@id}",
          size: size,
          terms: @terms.first(10),
          coherence: coherence.round(3),
          representative_docs: representative_docs(k: 2).map { |d| d[0..100] + "..." }
        }
      end

      def to_h
        {
          id: @id,
          label: @label,
          document_indices: @document_indices,
          terms: @terms,
          centroid: centroid,
          size: size,
          coherence: coherence
        }
      end

      def self.from_h(hash)
        topic = new(
          id: hash[:id],
          document_indices: hash[:document_indices],
          documents: [],  # Would need to be reconstructed
          embeddings: [], # Would need to be reconstructed
          metadata: []
        )
        topic.set_label(hash[:label])
        topic.set_terms(hash[:terms])
        topic
      end

      private

      def compute_centroid
        return [] if @embeddings.empty?

        # Compute mean of all embeddings
        dim = @embeddings.first.length
        centroid = Array.new(dim, 0.0)

        @embeddings.each do |embedding|
          embedding.each_with_index do |val, idx|
            centroid[idx] += val
          end
        end

        centroid.map { |val| val / @embeddings.length }
      end

      def distance_to_centroid(embedding)
        # Euclidean distance
        Math.sqrt(
          embedding.zip(centroid).map { |a, b| (a - b) ** 2 }.sum
        )
      end
    end
  end
end
```
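A small sketch of how a `Topic` might be built and inspected directly; the toy embeddings and documents are made up, and only the methods shown in the hunk above are exercised (`coherence`/`summary` are skipped because they depend on `Metrics` from another file):

```ruby
require 'ragnar'

topic = Ragnar::TopicModeling::Topic.new(
  id: 0,
  document_indices: [0, 1],
  documents: ["First doc about ruby gems", "Second doc about ruby tooling"],
  embeddings: [[0.1, 0.2, 0.3], [0.2, 0.1, 0.4]]
)
topic.set_terms(%w[ruby gems tooling])
topic.set_label("Ruby tooling")

p topic.size                      # => 2
p topic.centroid                  # mean of the two embedding vectors
p topic.representative_docs(k: 1) # the document closest to the centroid
```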
data/lib/ragnar/topic_modeling/topic_labeler.rb

@@ -0,0 +1,61 @@

```ruby
require_relative 'labeling_strategies'

module Ragnar
  module TopicModeling
    class TopicLabeler
      attr_reader :strategy

      def initialize(method: :hybrid, llm_client: nil)
        @method = method
        @llm_client = llm_client
        @strategy = LabelingStrategies.create(method, llm_client: llm_client)
      end

      # Generate a human-readable label for a topic
      # Returns a hash with label, description, and metadata
      def generate_label(topic: nil, terms:, documents: [], method: nil)
        # Allow method override per call
        if method && method != @method
          strategy = LabelingStrategies.create(method, llm_client: @llm_client)
        else
          strategy = @strategy
        end

        # Generate label using selected strategy
        result = strategy.generate_label(
          topic: topic,
          terms: terms,
          documents: documents
        )

        # Ensure we always return a consistent structure
        normalize_result(result)
      end

      # Convenience method for simple label string
      def generate_simple_label(terms:, documents: [], method: nil)
        result = generate_label(terms: terms, documents: documents, method: method)
        result[:label]
      end

      # Change strategy at runtime
      def set_strategy(method)
        @method = method
        @strategy = LabelingStrategies.create(method, llm_client: @llm_client)
      end

      private

      def normalize_result(result)
        {
          label: result[:label] || "Unknown Topic",
          description: result[:description] || nil,
          method: result[:method] || @method,
          confidence: result[:confidence] || 0.5,
          themes: result[:themes] || [],
          metadata: result.reject { |k, _| [:label, :description, :method, :confidence, :themes].include?(k) }
        }
      end
    end
  end
end
```
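A hedged sketch of driving the labeler above with its defaults. Which strategy symbols are valid, and whether the default `:hybrid` strategy works without an `llm_client`, is determined by labeling_strategies.rb, which is not part of this hunk; the terms and document below are invented:

```ruby
require 'ragnar'

labeler = Ragnar::TopicModeling::TopicLabeler.new # default method: :hybrid

result = labeler.generate_label(
  terms: %w[umap embedding cluster dimension neighbor],
  documents: ["UMAP projects embeddings into a low dimensional space"]
)

puts result[:label]      # falls back to "Unknown Topic" if the strategy returns none
puts result[:confidence] # defaults to 0.5 when the strategy does not report one

# Or just the label string:
puts labeler.generate_simple_label(terms: %w[umap embedding cluster])
```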
data/lib/ragnar/topic_modeling.rb

@@ -0,0 +1,24 @@

```ruby
# Main entry point for topic modeling functionality
# Designed for future extraction into a separate gem

require_relative 'topic_modeling/topic'
require_relative 'topic_modeling/term_extractor'
require_relative 'topic_modeling/metrics'
require_relative 'topic_modeling/topic_labeler'
require_relative 'topic_modeling/engine'

module Ragnar
  module TopicModeling

    # Convenience method to create a new topic modeling engine
    def self.new(**options)
      Engine.new(**options)
    end

    # Extract topics from embeddings and documents (simple interface)
    def self.extract(embeddings:, documents:, **options)
      engine = Engine.new(**options)
      engine.fit(embeddings: embeddings, documents: documents)
    end
  end
end
```
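A sketch of the module-level convenience interface shown above. The options accepted by `Engine.new` and the exact return value of `engine.fit` are defined in engine.rb (not in this hunk), so the example passes no options and only iterates defensively; the toy embeddings and documents are fabricated:

```ruby
require 'ragnar'

embeddings = Array.new(20) { Array.new(8) { rand } } # toy 8-dimensional vectors
documents  = Array.new(20) { |i| "document #{i} about ruby and embeddings" }

topics = Ragnar::TopicModeling.extract(embeddings: embeddings, documents: documents)

# Assuming fit returns a collection of Topic objects (see topic.rb above)
topics.each { |t| puts t.summary.inspect } if topics.respond_to?(:each)
```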
data/lib/ragnar/umap_processor.rb

@@ -0,0 +1,228 @@

```ruby
require 'json'

module Ragnar
  class UmapProcessor
    attr_reader :database, :model_path

    def initialize(db_path: Ragnar::DEFAULT_DB_PATH, model_path: "umap_model.bin")
      @database = Database.new(db_path)
      @model_path = model_path
      @umap_model = nil
    end

    def train(n_components: Ragnar::DEFAULT_REDUCED_DIMENSIONS, n_neighbors: 15, min_dist: 0.1)
      puts "Loading embeddings from database..."

      # Get all embeddings
      docs = @database.get_embeddings

      if docs.empty?
        raise "No embeddings found in database. Please index some documents first."
      end

      embeddings = docs.map { |d| d[:embedding] }.compact

      if embeddings.empty?
        raise "No valid embeddings found in database."
      end

      puts "Found #{embeddings.size} embeddings"

      # Adjust parameters based on the number of samples
      # UMAP requires n_neighbors < n_samples
      # Also, n_components should be less than n_samples for stability
      n_samples = embeddings.size

      if n_neighbors >= n_samples
        n_neighbors = [3, (n_samples - 1) / 2].max.to_i
        puts "  Adjusted n_neighbors to #{n_neighbors} (was #{15}, but only have #{n_samples} samples)"
      end

      if n_components >= n_samples
        n_components = [2, n_samples - 1].min
        puts "  Adjusted n_components to #{n_components} (was #{50}, but only have #{n_samples} samples)"
      end

      # Warn if we have very few samples
      if n_samples < 100
        puts "\n  ⚠️  Warning: UMAP works best with at least 100 samples."
        puts "  You currently have #{n_samples} samples."
        puts "  Consider indexing more documents for better results."
      end

      # Convert to matrix format for ClusterKit
      # ClusterKit expects a 2D array or Numo::NArray
      embedding_matrix = embeddings
      original_dims = embeddings.first.size

      puts "\nTraining UMAP model..."
      puts "  Original dimensions: #{original_dims}"
      puts "  Target dimensions: #{n_components}"
      puts "  Neighbors: #{n_neighbors}"
      puts "  Min distance: #{min_dist}"

      # Use the simple ClusterKit.umap method
      progressbar = TTY::ProgressBar.new(
        "Training UMAP [:bar] :percent",
        total: 100,
        bar_format: :block,
        width: 30
      )

      # Start progress in background (ClusterKit doesn't provide callbacks)
      progress_thread = Thread.new do
        100.times do
          sleep(0.05)
          progressbar.advance
          break if @training_complete
        end
      end

      # Perform the actual training using the class-based API
      @umap_instance = ClusterKit::Dimensionality::UMAP.new(
        n_components: n_components,
        n_neighbors: n_neighbors
      )

      @reduced_embeddings = @umap_instance.fit_transform(embedding_matrix)

      @training_complete = true
      progress_thread.join
      progressbar.finish

      # Store the parameters for saving
      @model_params = {
        n_components: n_components,
        n_neighbors: n_neighbors,
        min_dist: min_dist
      }

      # Save the model
      save_model

      {
        embeddings_count: embeddings.size,
        original_dims: original_dims,
        reduced_dims: n_components
      }
    end

    def apply(batch_size: 100)
      # Load the trained UMAP model (reduced embeddings)
      reduced_embeddings = load_model

      puts "Applying saved UMAP embeddings to database..."

      # Get all embeddings from database
      all_docs = @database.get_embeddings

      if all_docs.empty?
        puts "No embeddings found in database."
        return {
          processed: 0,
          skipped: 0,
          errors: 0
        }
      end

      puts "Found #{all_docs.size} documents in database"
      puts "Loaded #{reduced_embeddings.size} reduced embeddings from model"

      if all_docs.size != reduced_embeddings.size
        puts "⚠️  Warning: Mismatch between database documents (#{all_docs.size}) and model embeddings (#{reduced_embeddings.size})"
        puts "   This suggests the model was trained on a different dataset."
        puts "   Please retrain the UMAP model after indexing all your documents."
        return {
          processed: 0,
          skipped: 0,
          errors: 1
        }
      end

      # Prepare updates - match document IDs to reduced embeddings
      updates = all_docs.each_with_index.map do |doc, idx|
        {
          id: doc[:id],
          reduced_embedding: reduced_embeddings[idx]
        }
      end

      puts "Updating database with reduced embeddings..."
      @database.update_reduced_embeddings(updates)

      {
        processed: updates.size,
        skipped: 0,
        errors: 0
      }
    end

    private

    def process_batch(docs)
      # Extract embeddings
      embeddings = docs.map { |d| d[:embedding] }

      # Transform using UMAP
      # The transform method returns a 2D array where each row is a reduced embedding
      reduced = @umap_model.transform(embeddings)

      # Prepare updates
      updates = docs.each_with_index.map do |doc, idx|
        {
          id: doc[:id],
          reduced_embedding: reduced[idx]
        }
      end

      # Update database
      @database.update_reduced_embeddings(updates)
    end

    def save_model
      return unless @umap_instance && @reduced_embeddings

      # Save the trained UMAP model for transforming new queries
      @umap_instance.save_model(@model_path)
      puts "UMAP model saved to: #{@model_path}"

      # Also cache the reduced embeddings separately for the apply method
      embeddings_path = @model_path.sub(/\.bin$/, '_embeddings.json')
      ClusterKit::Dimensionality::UMAP.save_data(@reduced_embeddings, embeddings_path)
      puts "Reduced embeddings cached to: #{embeddings_path}"
    end

    def load_model
      return @reduced_embeddings if @reduced_embeddings

      # For the apply method, we need the pre-computed embeddings
      embeddings_path = @model_path.sub(/\.bin$/, '_embeddings.json')
      unless File.exist?(embeddings_path)
        raise "Cached embeddings not found at #{embeddings_path}. Please train a model first."
      end

      @reduced_embeddings = ClusterKit::Dimensionality::UMAP.load_data(embeddings_path)
      puts "Cached embeddings loaded from: #{embeddings_path}"
      @reduced_embeddings
    end

    def load_umap_model
      # Load the actual UMAP model for transforming new data
      unless File.exist?(@model_path)
        raise "UMAP model not found at #{@model_path}. Please train a model first."
      end

      @umap_instance ||= ClusterKit::Dimensionality::UMAP.load_model(@model_path)
      puts "UMAP model loaded from: #{@model_path}"
      @umap_instance
    end

    def self.optimal_dimensions(original_dims, target_ratio: 0.1)
      # Suggest optimal number of dimensions for reduction
      # Common heuristic: reduce to 10% of original dimensions
      # but keep at least 50 dimensions for good quality
      suggested = (original_dims * target_ratio).to_i
      [suggested, 50].max
    end
  end
end
```
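A hedged sketch of the train-then-apply flow exposed by the processor above. It assumes documents have already been indexed into the database (otherwise `train` raises); the `"ragnar.lance"` path is a placeholder, and `Ragnar::DEFAULT_DB_PATH` from ragnar.rb is the real default when `db_path:` is omitted:

```ruby
require 'ragnar'

processor = Ragnar::UmapProcessor.new(
  db_path: "ragnar.lance",     # placeholder; omit to use Ragnar::DEFAULT_DB_PATH
  model_path: "umap_model.bin"
)

# Fit UMAP on every stored embedding; parameters are auto-adjusted for small corpora
stats = processor.train(n_components: 2, n_neighbors: 15, min_dist: 0.1)
puts "Reduced #{stats[:embeddings_count]} embeddings " \
     "from #{stats[:original_dims]} to #{stats[:reduced_dims]} dimensions"

# Write the cached reduced embeddings back into the database
result = processor.apply
puts "Processed: #{result[:processed]}, errors: #{result[:errors]}"
```

Note that `apply` reuses the reduced embeddings cached at training time (`*_embeddings.json`), so it only succeeds when the database still holds the same number of documents the model was trained on.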