ragnar-cli 0.1.0.pre.2 → 0.1.0.pre.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +187 -36
- data/lib/ragnar/cli.rb +543 -172
- data/lib/ragnar/cli_visualization.rb +184 -0
- data/lib/ragnar/config.rb +226 -0
- data/lib/ragnar/database.rb +94 -8
- data/lib/ragnar/llm_manager.rb +4 -1
- data/lib/ragnar/query_processor.rb +38 -20
- data/lib/ragnar/topic_modeling.rb +13 -10
- data/lib/ragnar/umap_processor.rb +190 -73
- data/lib/ragnar/umap_transform_service.rb +169 -88
- data/lib/ragnar/version.rb +1 -1
- metadata +43 -22
- data/lib/ragnar/topic_modeling/engine.rb +0 -221
- data/lib/ragnar/topic_modeling/labeling_strategies.rb +0 -300
- data/lib/ragnar/topic_modeling/llm_adapter.rb +0 -131
- data/lib/ragnar/topic_modeling/metrics.rb +0 -186
- data/lib/ragnar/topic_modeling/term_extractor.rb +0 -170
- data/lib/ragnar/topic_modeling/topic.rb +0 -117
- data/lib/ragnar/topic_modeling/topic_labeler.rb +0 -61
@@ -1,24 +1,27 @@
|
|
1
|
-
#
|
2
|
-
# Designed for future extraction into a separate gem
|
1
|
+
# frozen_string_literal: true
|
3
2
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
require_relative 'topic_modeling/engine'
|
3
|
+
# Topic modeling wrapper that delegates to the Topical gem
|
4
|
+
# This maintains backward compatibility while using the extracted library
|
5
|
+
|
6
|
+
require 'topical'
|
9
7
|
|
10
8
|
module Ragnar
|
11
9
|
module TopicModeling
|
10
|
+
# Re-export Topical classes for backward compatibility
|
11
|
+
Topic = Topical::Topic
|
12
|
+
Engine = Topical::Engine
|
13
|
+
|
14
|
+
# Re-export metrics module
|
15
|
+
Metrics = Topical::Metrics
|
12
16
|
|
13
17
|
# Convenience method to create a new topic modeling engine
|
14
18
|
def self.new(**options)
|
15
|
-
Engine.new(**options)
|
19
|
+
Topical::Engine.new(**options)
|
16
20
|
end
|
17
21
|
|
18
22
|
# Extract topics from embeddings and documents (simple interface)
|
19
23
|
def self.extract(embeddings:, documents:, **options)
|
20
|
-
|
21
|
-
engine.fit(embeddings: embeddings, documents: documents)
|
24
|
+
Topical.extract(embeddings: embeddings, documents: documents, **options)
|
22
25
|
end
|
23
26
|
end
|
24
27
|
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'json'
|
2
|
+
require 'time'
|
2
3
|
|
3
4
|
module Ragnar
|
4
5
|
class UmapProcessor
|
@@ -28,6 +29,67 @@ module Ragnar
|
|
28
29
|
|
29
30
|
puts "Found #{embeddings.size} embeddings"
|
30
31
|
|
32
|
+
# Validate embeddings
|
33
|
+
embedding_dims = embeddings.map(&:size).uniq
|
34
|
+
if embedding_dims.size > 1
|
35
|
+
puts " ⚠️ Warning: Inconsistent embedding dimensions found: #{embedding_dims.inspect}"
|
36
|
+
puts " This may cause errors during UMAP training."
|
37
|
+
# Filter to only embeddings with the most common dimension
|
38
|
+
most_common_dim = embedding_dims.max_by { |dim| embeddings.count { |e| e.size == dim } }
|
39
|
+
embeddings = embeddings.select { |e| e.size == most_common_dim }
|
40
|
+
puts " Using only embeddings with #{most_common_dim} dimensions (#{embeddings.size} embeddings)"
|
41
|
+
end
|
42
|
+
|
43
|
+
# Check for nil or invalid values
|
44
|
+
invalid_count = 0
|
45
|
+
nan_count = 0
|
46
|
+
inf_count = 0
|
47
|
+
|
48
|
+
valid_embeddings = embeddings.select do |embedding|
|
49
|
+
if !embedding.is_a?(Array)
|
50
|
+
invalid_count += 1
|
51
|
+
false
|
52
|
+
elsif embedding.any? { |v| !v.is_a?(Numeric) }
|
53
|
+
invalid_count += 1
|
54
|
+
false
|
55
|
+
elsif embedding.any?(&:nan?)
|
56
|
+
nan_count += 1
|
57
|
+
false
|
58
|
+
elsif embedding.any? { |v| !v.finite? }
|
59
|
+
inf_count += 1
|
60
|
+
false
|
61
|
+
else
|
62
|
+
true
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
if valid_embeddings.size < embeddings.size
|
67
|
+
puts "\n ⚠️ Data quality issues detected:"
|
68
|
+
puts " • Invalid embeddings: #{invalid_count}" if invalid_count > 0
|
69
|
+
puts " • Embeddings with NaN: #{nan_count}" if nan_count > 0
|
70
|
+
puts " • Embeddings with Infinity: #{inf_count}" if inf_count > 0
|
71
|
+
puts " • Total removed: #{embeddings.size - valid_embeddings.size}"
|
72
|
+
puts " • Remaining valid: #{valid_embeddings.size}"
|
73
|
+
|
74
|
+
embeddings = valid_embeddings
|
75
|
+
end
|
76
|
+
|
77
|
+
if embeddings.empty?
|
78
|
+
raise "No valid embeddings found after validation.\n\n" \
|
79
|
+
"All embeddings contain invalid values (NaN, Infinity, or non-numeric).\n" \
|
80
|
+
"This suggests a problem with the embedding model or indexing process.\n\n" \
|
81
|
+
"Please try:\n" \
|
82
|
+
" 1. Re-indexing your documents: ragnar index <path> --force\n" \
|
83
|
+
" 2. Using a different embedding model\n" \
|
84
|
+
" 3. Checking your document content for unusual characters"
|
85
|
+
end
|
86
|
+
|
87
|
+
if embeddings.size < 10
|
88
|
+
raise "Too few valid embeddings (#{embeddings.size}) for UMAP training.\n\n" \
|
89
|
+
"UMAP requires at least 10 samples to work effectively.\n" \
|
90
|
+
"Please index more documents or check for data quality issues."
|
91
|
+
end
|
92
|
+
|
31
93
|
# Adjust parameters based on the number of samples
|
32
94
|
# UMAP requires n_neighbors < n_samples
|
33
95
|
# Also, n_components should be less than n_samples for stability
|
@@ -55,6 +117,19 @@ module Ragnar
|
|
55
117
|
embedding_matrix = embeddings
|
56
118
|
original_dims = embeddings.first.size
|
57
119
|
|
120
|
+
# Ensure n_components is reasonable
|
121
|
+
if n_components >= original_dims
|
122
|
+
puts " ⚠️ Warning: n_components (#{n_components}) >= original dimensions (#{original_dims})"
|
123
|
+
n_components = [original_dims / 2, 50].min
|
124
|
+
puts " Reducing n_components to #{n_components}"
|
125
|
+
end
|
126
|
+
|
127
|
+
# For very high dimensional data, be more conservative
|
128
|
+
if original_dims > 500 && n_components > 50
|
129
|
+
puts " ⚠️ Note: High dimensional data (#{original_dims}D) being reduced to #{n_components}D"
|
130
|
+
puts " Consider using n_components <= 50 for stability"
|
131
|
+
end
|
132
|
+
|
58
133
|
puts "\nTraining UMAP model..."
|
59
134
|
puts " Original dimensions: #{original_dims}"
|
60
135
|
puts " Target dimensions: #{n_components}"
|
@@ -64,14 +139,45 @@ module Ragnar
|
|
64
139
|
# Perform the actual training using the class-based API
|
65
140
|
puts " Training UMAP model (this may take a moment)..."
|
66
141
|
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
142
|
+
begin
|
143
|
+
@umap_instance = ClusterKit::Dimensionality::UMAP.new(
|
144
|
+
n_components: n_components,
|
145
|
+
n_neighbors: n_neighbors
|
146
|
+
)
|
147
|
+
|
148
|
+
@reduced_embeddings = @umap_instance.fit_transform(embedding_matrix)
|
149
|
+
|
150
|
+
puts " ✓ UMAP training complete"
|
151
|
+
rescue => e
|
152
|
+
# Provide helpful error message without exposing internal stack trace
|
153
|
+
error_msg = "\n❌ UMAP training failed\n\n"
|
154
|
+
|
155
|
+
if e.message.include?("index out of bounds")
|
156
|
+
error_msg += "The UMAP algorithm encountered an index out of bounds error.\n\n"
|
157
|
+
error_msg += "This typically happens when:\n"
|
158
|
+
error_msg += " • The embedding data contains invalid values (NaN, Infinity)\n"
|
159
|
+
error_msg += " • The parameters are incompatible with your data\n"
|
160
|
+
error_msg += " • There are duplicate or corrupted embeddings\n\n"
|
161
|
+
error_msg += "Suggested solutions:\n"
|
162
|
+
error_msg += " 1. Try with more conservative parameters:\n"
|
163
|
+
error_msg += " ragnar train-umap --n-components 10 --n-neighbors 5\n\n"
|
164
|
+
error_msg += " 2. Re-index your documents to regenerate embeddings:\n"
|
165
|
+
error_msg += " ragnar index <path> --force\n\n"
|
166
|
+
error_msg += " 3. Check your embedding model configuration\n\n"
|
167
|
+
error_msg += "Current parameters:\n"
|
168
|
+
error_msg += " • n_components: #{n_components}\n"
|
169
|
+
error_msg += " • n_neighbors: #{n_neighbors}\n"
|
170
|
+
error_msg += " • embeddings: #{embeddings.size} samples\n"
|
171
|
+
error_msg += " • dimensions: #{original_dims}\n"
|
172
|
+
else
|
173
|
+
error_msg += "Error: #{e.message}\n\n"
|
174
|
+
error_msg += "This may be due to incompatible parameters or data issues.\n"
|
175
|
+
error_msg += "Try using more conservative parameters:\n"
|
176
|
+
error_msg += " ragnar train-umap --n-components 10 --n-neighbors 5\n"
|
177
|
+
end
|
178
|
+
|
179
|
+
raise RuntimeError, error_msg
|
180
|
+
end
|
75
181
|
|
76
182
|
# Store the parameters for saving
|
77
183
|
@model_params = {
|
@@ -91,10 +197,10 @@ module Ragnar
|
|
91
197
|
end
|
92
198
|
|
93
199
|
def apply(batch_size: 100)
|
94
|
-
# Load the trained UMAP model
|
95
|
-
|
200
|
+
# Load the trained UMAP model
|
201
|
+
umap_model = load_umap_model
|
96
202
|
|
97
|
-
puts "Applying
|
203
|
+
puts "Applying UMAP transformation to database documents..."
|
98
204
|
|
99
205
|
# Get all embeddings from database
|
100
206
|
all_docs = @database.get_embeddings
|
@@ -109,84 +215,95 @@ module Ragnar
|
|
109
215
|
end
|
110
216
|
|
111
217
|
puts "Found #{all_docs.size} documents in database"
|
112
|
-
puts "Loaded #{reduced_embeddings.size} reduced embeddings from model"
|
113
218
|
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
219
|
+
# Process in batches for memory efficiency
|
220
|
+
processed_count = 0
|
221
|
+
error_count = 0
|
222
|
+
skipped_count = 0
|
223
|
+
|
224
|
+
all_docs.each_slice(batch_size) do |batch|
|
225
|
+
begin
|
226
|
+
# Extract embeddings
|
227
|
+
embeddings = batch.map { |d| d[:embedding] }
|
228
|
+
|
229
|
+
# Validate embeddings
|
230
|
+
valid_indices = []
|
231
|
+
embeddings_to_transform = []
|
232
|
+
|
233
|
+
embeddings.each_with_index do |emb, idx|
|
234
|
+
if emb.nil? || !emb.is_a?(Array) || emb.empty?
|
235
|
+
skipped_count += 1
|
236
|
+
next
|
237
|
+
end
|
238
|
+
|
239
|
+
if emb.any? { |v| !v.is_a?(Numeric) || v.nan? || !v.finite? }
|
240
|
+
skipped_count += 1
|
241
|
+
next
|
242
|
+
end
|
243
|
+
|
244
|
+
valid_indices << idx
|
245
|
+
embeddings_to_transform << emb
|
246
|
+
end
|
247
|
+
|
248
|
+
next if embeddings_to_transform.empty?
|
249
|
+
|
250
|
+
# Transform using the loaded UMAP model
|
251
|
+
reduced_embeddings = umap_model.transform(embeddings_to_transform)
|
252
|
+
|
253
|
+
# Prepare updates for valid documents
|
254
|
+
updates = valid_indices.map.with_index do |batch_idx, transform_idx|
|
255
|
+
{
|
256
|
+
id: batch[batch_idx][:id],
|
257
|
+
reduced_embedding: reduced_embeddings[transform_idx]
|
258
|
+
}
|
259
|
+
end
|
260
|
+
|
261
|
+
# Update database
|
262
|
+
@database.update_reduced_embeddings(updates)
|
263
|
+
processed_count += updates.size
|
264
|
+
|
265
|
+
puts " Processed batch: #{updates.size} documents transformed"
|
266
|
+
rescue => e
|
267
|
+
puts " ⚠️ Error processing batch: #{e.message}"
|
268
|
+
error_count += batch.size
|
269
|
+
end
|
123
270
|
end
|
124
271
|
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
reduced_embedding: reduced_embeddings[idx]
|
130
|
-
}
|
131
|
-
end
|
132
|
-
|
133
|
-
puts "Updating database with reduced embeddings..."
|
134
|
-
@database.update_reduced_embeddings(updates)
|
272
|
+
puts "\nUMAP application complete:"
|
273
|
+
puts " ✓ Processed: #{processed_count} documents"
|
274
|
+
puts " ⚠️ Skipped: #{skipped_count} documents (invalid embeddings)" if skipped_count > 0
|
275
|
+
puts " ❌ Errors: #{error_count} documents" if error_count > 0
|
135
276
|
|
136
277
|
{
|
137
|
-
processed:
|
138
|
-
skipped:
|
139
|
-
errors:
|
278
|
+
processed: processed_count,
|
279
|
+
skipped: skipped_count,
|
280
|
+
errors: error_count
|
140
281
|
}
|
141
282
|
end
|
142
283
|
|
143
284
|
private
|
144
285
|
|
145
|
-
def process_batch(docs)
|
146
|
-
# Extract embeddings
|
147
|
-
embeddings = docs.map { |d| d[:embedding] }
|
148
|
-
|
149
|
-
# Transform using UMAP
|
150
|
-
# The transform method returns a 2D array where each row is a reduced embedding
|
151
|
-
reduced = @umap_model.transform(embeddings)
|
152
|
-
|
153
|
-
# Prepare updates
|
154
|
-
updates = docs.each_with_index.map do |doc, idx|
|
155
|
-
{
|
156
|
-
id: doc[:id],
|
157
|
-
reduced_embedding: reduced[idx]
|
158
|
-
}
|
159
|
-
end
|
160
|
-
|
161
|
-
# Update database
|
162
|
-
@database.update_reduced_embeddings(updates)
|
163
|
-
end
|
164
|
-
|
165
286
|
def save_model
|
166
|
-
return unless @umap_instance
|
287
|
+
return unless @umap_instance
|
167
288
|
|
168
|
-
# Save the trained UMAP model for transforming new
|
289
|
+
# Save the trained UMAP model for transforming new data
|
169
290
|
@umap_instance.save_model(@model_path)
|
170
291
|
puts "UMAP model saved to: #{@model_path}"
|
171
292
|
|
172
|
-
#
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
293
|
+
# Save metadata about the training if we have params
|
294
|
+
if @model_params
|
295
|
+
metadata_path = @model_path.sub(/\.bin$/, '_metadata.json')
|
296
|
+
metadata = {
|
297
|
+
trained_at: Time.now.iso8601,
|
298
|
+
n_components: @model_params[:n_components],
|
299
|
+
n_neighbors: @model_params[:n_neighbors],
|
300
|
+
min_dist: @model_params[:min_dist],
|
301
|
+
document_count: @database.get_embeddings.size,
|
302
|
+
model_version: 2 # Version 2: proper transform-based approach
|
303
|
+
}
|
304
|
+
File.write(metadata_path, JSON.pretty_generate(metadata))
|
305
|
+
puts "Model metadata saved to: #{metadata_path}"
|
185
306
|
end
|
186
|
-
|
187
|
-
@reduced_embeddings = ClusterKit::Dimensionality::UMAP.load_data(embeddings_path)
|
188
|
-
puts "Cached embeddings loaded from: #{embeddings_path}"
|
189
|
-
@reduced_embeddings
|
190
307
|
end
|
191
308
|
|
192
309
|
def load_umap_model
|
@@ -1,124 +1,205 @@
|
|
1
|
+
require 'json'
|
1
2
|
require 'clusterkit'
|
2
3
|
|
3
4
|
module Ragnar
|
5
|
+
# Service for applying UMAP transformations to embeddings
|
6
|
+
# Separates transformation logic from training (UmapProcessor)
|
4
7
|
class UmapTransformService
|
5
|
-
|
8
|
+
attr_reader :model_path, :database
|
6
9
|
|
7
|
-
def initialize
|
10
|
+
def initialize(model_path: "umap_model.bin", database:)
|
11
|
+
@model_path = model_path
|
12
|
+
@database = database
|
8
13
|
@umap_model = nil
|
9
|
-
@
|
14
|
+
@model_metadata = nil
|
10
15
|
end
|
11
16
|
|
12
|
-
# Transform
|
13
|
-
|
14
|
-
|
15
|
-
|
17
|
+
# Transform embeddings for specific documents
|
18
|
+
# @param document_ids [Array<Integer>] IDs of documents to transform
|
19
|
+
# @return [Hash] Results with :processed, :skipped, :errors counts
|
20
|
+
def transform_documents(document_ids)
|
21
|
+
return { processed: 0, skipped: 0, errors: 0 } if document_ids.empty?
|
16
22
|
|
17
|
-
|
18
|
-
load_model(model_path) unless @umap_model
|
23
|
+
load_model!
|
19
24
|
|
20
|
-
#
|
21
|
-
|
22
|
-
result = @umap_model.transform([query_embedding])
|
25
|
+
# Fetch documents
|
26
|
+
documents = @database.get_documents_by_ids(document_ids)
|
23
27
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
# Fall back to k-NN approximation if model loading fails
|
28
|
-
puts "Warning: Could not use UMAP model for transform: #{e.message}"
|
29
|
-
puts "Falling back to k-NN approximation..."
|
30
|
-
knn_approximate_transform(query_embedding)
|
31
|
-
end
|
32
|
-
|
33
|
-
# Check if we can do transforms
|
34
|
-
def model_available?(model_path = nil)
|
35
|
-
model_path ||= @model_path
|
28
|
+
if documents.empty?
|
29
|
+
return { processed: 0, skipped: 0, errors: 0 }
|
30
|
+
end
|
36
31
|
|
37
|
-
#
|
38
|
-
|
39
|
-
|
32
|
+
# Extract and validate embeddings
|
33
|
+
valid_docs = []
|
34
|
+
embeddings_to_transform = []
|
35
|
+
skipped_count = 0
|
36
|
+
|
37
|
+
documents.each do |doc|
|
38
|
+
emb = doc[:embedding]
|
39
|
+
|
40
|
+
if emb.nil? || !emb.is_a?(Array) || emb.empty?
|
41
|
+
skipped_count += 1
|
42
|
+
next
|
43
|
+
end
|
44
|
+
|
45
|
+
if emb.any? { |v| !v.is_a?(Numeric) || v.nan? || !v.finite? }
|
46
|
+
skipped_count += 1
|
47
|
+
next
|
48
|
+
end
|
49
|
+
|
50
|
+
valid_docs << doc
|
51
|
+
embeddings_to_transform << emb
|
40
52
|
end
|
41
53
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
54
|
+
return { processed: 0, skipped: skipped_count, errors: 0 } if embeddings_to_transform.empty?
|
55
|
+
|
56
|
+
# Transform using UMAP
|
57
|
+
begin
|
58
|
+
reduced_embeddings = @umap_model.transform(embeddings_to_transform)
|
59
|
+
|
60
|
+
# Prepare updates
|
61
|
+
updates = valid_docs.zip(reduced_embeddings).map do |doc, reduced_emb|
|
62
|
+
{
|
63
|
+
id: doc[:id],
|
64
|
+
reduced_embedding: reduced_emb,
|
65
|
+
umap_version: model_version
|
66
|
+
}
|
67
|
+
end
|
68
|
+
|
69
|
+
# Update database
|
70
|
+
@database.update_reduced_embeddings(updates)
|
71
|
+
|
72
|
+
{ processed: updates.size, skipped: skipped_count, errors: 0 }
|
73
|
+
rescue => e
|
74
|
+
puts "Error transforming documents: #{e.message}"
|
75
|
+
{ processed: 0, skipped: skipped_count, errors: valid_docs.size }
|
76
|
+
end
|
46
77
|
end
|
47
78
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
79
|
+
# Transform a single query embedding
|
80
|
+
# @param embedding [Array<Numeric>] Query embedding to transform
|
81
|
+
# @return [Array<Float>, nil] Reduced embedding or nil if error
|
82
|
+
def transform_query(embedding)
|
83
|
+
return nil if embedding.nil? || !embedding.is_a?(Array) || embedding.empty?
|
84
|
+
|
85
|
+
# Validate embedding
|
86
|
+
if embedding.any? { |v| !v.is_a?(Numeric) || v.nan? || !v.finite? }
|
87
|
+
puts "Warning: Invalid query embedding (contains NaN or Infinity)"
|
88
|
+
return nil
|
53
89
|
end
|
54
90
|
|
55
|
-
|
56
|
-
|
91
|
+
load_model!
|
92
|
+
|
93
|
+
begin
|
94
|
+
# Transform returns array of arrays, get first (and only) result
|
95
|
+
@umap_model.transform([embedding]).first
|
96
|
+
rescue => e
|
97
|
+
puts "Error transforming query: #{e.message}"
|
98
|
+
nil
|
99
|
+
end
|
57
100
|
end
|
58
101
|
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
102
|
+
# Check if a UMAP model exists
|
103
|
+
# @return [Boolean] true if model file exists
|
104
|
+
def model_exists?
|
105
|
+
File.exist?(@model_path)
|
106
|
+
end
|
107
|
+
|
108
|
+
# Get metadata about the trained model
|
109
|
+
# @return [Hash, nil] Model metadata or nil if not found
|
110
|
+
def model_metadata
|
111
|
+
return @model_metadata if @model_metadata
|
69
112
|
|
70
|
-
|
71
|
-
|
113
|
+
metadata_path = @model_path.sub(/\.bin$/, '_metadata.json')
|
114
|
+
return nil unless File.exist?(metadata_path)
|
72
115
|
|
73
|
-
|
74
|
-
|
75
|
-
|
116
|
+
@model_metadata = JSON.parse(File.read(metadata_path), symbolize_names: true)
|
117
|
+
rescue => e
|
118
|
+
puts "Error loading model metadata: #{e.message}"
|
119
|
+
nil
|
120
|
+
end
|
121
|
+
|
122
|
+
# Get the version of the current model
|
123
|
+
# @return [Integer] Model version (timestamp of file modification)
|
124
|
+
def model_version
|
125
|
+
return 0 unless File.exist?(@model_path)
|
126
|
+
File.mtime(@model_path).to_i
|
127
|
+
end
|
128
|
+
|
129
|
+
# Check if model needs retraining based on staleness
|
130
|
+
# @return [Hash] Staleness info with :needs_retraining, :coverage_percentage
|
131
|
+
def check_model_staleness
|
132
|
+
return { needs_retraining: true, coverage_percentage: 0, reason: "No model exists" } unless model_exists?
|
76
133
|
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
distance = euclidean_distance(query_embedding, doc[:embedding])
|
81
|
-
neighbors << { idx: idx, distance: distance, reduced: doc[:reduced_embedding] }
|
82
|
-
end
|
134
|
+
metadata = model_metadata
|
135
|
+
return { needs_retraining: true, coverage_percentage: 0, reason: "No metadata found" } unless metadata
|
83
136
|
|
84
|
-
|
85
|
-
|
86
|
-
k_nearest = neighbors.first(k)
|
137
|
+
trained_count = metadata[:document_count] || 0
|
138
|
+
current_count = @database.document_count
|
87
139
|
|
88
|
-
|
89
|
-
|
90
|
-
if k_nearest.empty?
|
91
|
-
raise "No neighbors found for transform"
|
140
|
+
if current_count == 0
|
141
|
+
return { needs_retraining: false, coverage_percentage: 100, reason: "No documents" }
|
92
142
|
end
|
93
143
|
|
94
|
-
|
95
|
-
|
144
|
+
coverage = (trained_count.to_f / current_count * 100).round(1)
|
145
|
+
staleness = 100 - coverage
|
146
|
+
|
147
|
+
{
|
148
|
+
needs_retraining: staleness > 30,
|
149
|
+
coverage_percentage: coverage,
|
150
|
+
trained_documents: trained_count,
|
151
|
+
current_documents: current_count,
|
152
|
+
staleness_percentage: staleness,
|
153
|
+
reason: staleness > 30 ? "Model covers only #{coverage}% of documents" : "Model is up to date"
|
154
|
+
}
|
155
|
+
end
|
156
|
+
|
157
|
+
private
|
158
|
+
|
159
|
+
def load_model!
|
160
|
+
return if @umap_model
|
96
161
|
|
97
|
-
|
98
|
-
|
99
|
-
k_nearest.each do |neighbor|
|
100
|
-
# Use inverse distance as weight (closer = higher weight)
|
101
|
-
weight = 1.0 / (neighbor[:distance] + 0.001) # Add small epsilon to avoid division by zero
|
102
|
-
total_weight += weight
|
103
|
-
|
104
|
-
neighbor[:reduced].each_with_index do |val, idx|
|
105
|
-
averaged[idx] += val * weight
|
106
|
-
end
|
162
|
+
unless File.exist?(@model_path)
|
163
|
+
raise "UMAP model not found at #{@model_path}. Please train a model first using 'ragnar train-umap'."
|
107
164
|
end
|
108
165
|
|
109
|
-
|
110
|
-
averaged.map { |val| val / total_weight }
|
166
|
+
@umap_model = ClusterKit::Dimensionality::UMAP.load_model(@model_path)
|
111
167
|
end
|
168
|
+
end
|
169
|
+
|
170
|
+
# Singleton service for backwards compatibility
|
171
|
+
# This allows the old UmapTransformService.instance pattern to work
|
172
|
+
class UmapTransformServiceSingleton
|
173
|
+
include Singleton
|
112
174
|
|
113
|
-
def
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
175
|
+
def initialize
|
176
|
+
@database = Database.new(Config.instance.database_path)
|
177
|
+
@service = UmapTransformService.new(database: @database)
|
178
|
+
end
|
179
|
+
|
180
|
+
def transform_query(embedding, model_path = nil)
|
181
|
+
if model_path && model_path != @service.model_path
|
182
|
+
# Create a new service with different model path
|
183
|
+
service = UmapTransformService.new(model_path: model_path, database: @database)
|
184
|
+
service.transform_query(embedding)
|
185
|
+
else
|
186
|
+
@service.transform_query(embedding)
|
120
187
|
end
|
121
|
-
|
188
|
+
end
|
189
|
+
|
190
|
+
def model_available?(model_path = nil)
|
191
|
+
if model_path
|
192
|
+
File.exist?(model_path)
|
193
|
+
else
|
194
|
+
@service.model_exists?
|
195
|
+
end
|
196
|
+
end
|
197
|
+
end
|
198
|
+
|
199
|
+
# For backwards compatibility - old code uses UmapTransformService.instance
|
200
|
+
class << UmapTransformService
|
201
|
+
def instance
|
202
|
+
UmapTransformServiceSingleton.instance
|
122
203
|
end
|
123
204
|
end
|
124
205
|
end
|
data/lib/ragnar/version.rb
CHANGED