ragnar-cli 0.1.0.pre.3 → 0.1.0.pre.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +249 -41
- data/lib/ragnar/cli.rb +563 -219
- data/lib/ragnar/cli_umap.rb +86 -0
- data/lib/ragnar/cli_visualization.rb +184 -0
- data/lib/ragnar/config.rb +320 -0
- data/lib/ragnar/database.rb +94 -8
- data/lib/ragnar/embedder.rb +1 -1
- data/lib/ragnar/indexer.rb +4 -2
- data/lib/ragnar/llm_manager.rb +31 -27
- data/lib/ragnar/query_processor.rb +123 -70
- data/lib/ragnar/query_rewriter.rb +21 -18
- data/lib/ragnar/topic_modeling.rb +13 -10
- data/lib/ragnar/umap_processor.rb +131 -95
- data/lib/ragnar/umap_transform_service.rb +169 -88
- data/lib/ragnar/version.rb +1 -1
- data/lib/ragnar.rb +3 -1
- metadata +71 -30
- data/lib/ragnar/topic_modeling/engine.rb +0 -301
- data/lib/ragnar/topic_modeling/labeling_strategies.rb +0 -300
- data/lib/ragnar/topic_modeling/llm_adapter.rb +0 -131
- data/lib/ragnar/topic_modeling/metrics.rb +0 -186
- data/lib/ragnar/topic_modeling/term_extractor.rb +0 -170
- data/lib/ragnar/topic_modeling/topic.rb +0 -117
- data/lib/ragnar/topic_modeling/topic_labeler.rb +0 -61
|
@@ -1,24 +1,27 @@
|
|
|
1
|
-
#
|
|
2
|
-
# Designed for future extraction into a separate gem
|
|
1
|
+
# frozen_string_literal: true
|
|
3
2
|
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
require_relative 'topic_modeling/engine'
|
|
3
|
+
# Topic modeling wrapper that delegates to the Topical gem
|
|
4
|
+
# This maintains backward compatibility while using the extracted library
|
|
5
|
+
|
|
6
|
+
require 'topical'
|
|
9
7
|
|
|
10
8
|
module Ragnar
|
|
11
9
|
module TopicModeling
|
|
10
|
+
# Re-export Topical classes for backward compatibility
|
|
11
|
+
Topic = Topical::Topic
|
|
12
|
+
Engine = Topical::Engine
|
|
13
|
+
|
|
14
|
+
# Re-export metrics module
|
|
15
|
+
Metrics = Topical::Metrics
|
|
12
16
|
|
|
13
17
|
# Convenience method to create a new topic modeling engine
|
|
14
18
|
def self.new(**options)
|
|
15
|
-
Engine.new(**options)
|
|
19
|
+
Topical::Engine.new(**options)
|
|
16
20
|
end
|
|
17
21
|
|
|
18
22
|
# Extract topics from embeddings and documents (simple interface)
|
|
19
23
|
def self.extract(embeddings:, documents:, **options)
|
|
20
|
-
|
|
21
|
-
engine.fit(embeddings: embeddings, documents: documents)
|
|
24
|
+
Topical.extract(embeddings: embeddings, documents: documents, **options)
|
|
22
25
|
end
|
|
23
26
|
end
|
|
24
27
|
end
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
require 'json'
|
|
2
|
+
require 'time'
|
|
2
3
|
|
|
3
4
|
module Ragnar
|
|
4
5
|
class UmapProcessor
|
|
@@ -137,45 +138,69 @@ module Ragnar
|
|
|
137
138
|
|
|
138
139
|
# Perform the actual training using the class-based API
|
|
139
140
|
puts " Training UMAP model (this may take a moment)..."
|
|
140
|
-
|
|
141
|
+
|
|
142
|
+
attempts = 0
|
|
143
|
+
max_attempts = 3
|
|
144
|
+
|
|
141
145
|
begin
|
|
146
|
+
attempts += 1
|
|
142
147
|
@umap_instance = ClusterKit::Dimensionality::UMAP.new(
|
|
143
148
|
n_components: n_components,
|
|
144
149
|
n_neighbors: n_neighbors
|
|
145
150
|
)
|
|
146
|
-
|
|
151
|
+
|
|
147
152
|
@reduced_embeddings = @umap_instance.fit_transform(embedding_matrix)
|
|
148
|
-
|
|
153
|
+
|
|
149
154
|
puts " ✓ UMAP training complete"
|
|
150
|
-
rescue => e
|
|
151
|
-
#
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
155
|
+
rescue Exception => e
|
|
156
|
+
# Catch Exception (not just StandardError) because Rust panics from
|
|
157
|
+
# ClusterKit raise fatal errors that bypass the default rescue
|
|
158
|
+
if e.message.include?("LapackInvalidValue") || e.message.include?("SGESDD") || e.message.include?("illegal value")
|
|
159
|
+
if attempts < max_attempts
|
|
160
|
+
# LAPACK SVD can fail with certain dimension combinations — retry with fewer components
|
|
161
|
+
n_components = [n_components / 2, 2].max
|
|
162
|
+
n_neighbors = [n_neighbors, n_components - 1, 3].min
|
|
163
|
+
puts " ⚠️ LAPACK error, retrying with n_components=#{n_components}, n_neighbors=#{n_neighbors} (attempt #{attempts + 1}/#{max_attempts})..."
|
|
164
|
+
retry
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
raise RuntimeError, "\n❌ UMAP training failed due to a LAPACK numerical error.\n\n" \
|
|
168
|
+
"This can happen with certain data/dimension combinations.\n" \
|
|
169
|
+
"Try reducing n_components:\n" \
|
|
170
|
+
" ragnar umap train --n-components 10 --n-neighbors 5\n\n" \
|
|
171
|
+
"Current parameters:\n" \
|
|
172
|
+
" • n_components: #{n_components}\n" \
|
|
173
|
+
" • n_neighbors: #{n_neighbors}\n" \
|
|
174
|
+
" • embeddings: #{embeddings.size} samples\n" \
|
|
175
|
+
" • dimensions: #{original_dims}\n"
|
|
176
|
+
elsif e.message.include?("index out of bounds")
|
|
177
|
+
raise RuntimeError, "\n❌ UMAP training failed\n\n" \
|
|
178
|
+
"The UMAP algorithm encountered an index out of bounds error.\n\n" \
|
|
179
|
+
"This typically happens when:\n" \
|
|
180
|
+
" • The embedding data contains invalid values (NaN, Infinity)\n" \
|
|
181
|
+
" • The parameters are incompatible with your data\n" \
|
|
182
|
+
" • There are duplicate or corrupted embeddings\n\n" \
|
|
183
|
+
"Suggested solutions:\n" \
|
|
184
|
+
" 1. Try with more conservative parameters:\n" \
|
|
185
|
+
" ragnar umap train --n-components 10 --n-neighbors 5\n\n" \
|
|
186
|
+
" 2. Re-index your documents to regenerate embeddings:\n" \
|
|
187
|
+
" ragnar index <path> --force\n\n" \
|
|
188
|
+
" 3. Check your embedding model configuration\n\n" \
|
|
189
|
+
"Current parameters:\n" \
|
|
190
|
+
" • n_components: #{n_components}\n" \
|
|
191
|
+
" • n_neighbors: #{n_neighbors}\n" \
|
|
192
|
+
" • embeddings: #{embeddings.size} samples\n" \
|
|
193
|
+
" • dimensions: #{original_dims}\n"
|
|
194
|
+
elsif e.is_a?(StandardError) || e.message.include?("unwrap")
|
|
195
|
+
raise RuntimeError, "\n❌ UMAP training failed\n\n" \
|
|
196
|
+
"Error: #{e.message}\n\n" \
|
|
197
|
+
"This may be due to incompatible parameters or data issues.\n" \
|
|
198
|
+
"Try using more conservative parameters:\n" \
|
|
199
|
+
" ragnar umap train --n-components 10 --n-neighbors 5\n"
|
|
171
200
|
else
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
error_msg += "Try using more conservative parameters:\n"
|
|
175
|
-
error_msg += " ragnar train-umap --n-components 10 --n-neighbors 5\n"
|
|
201
|
+
# Re-raise non-application exceptions (Interrupt, SignalException, etc.)
|
|
202
|
+
raise
|
|
176
203
|
end
|
|
177
|
-
|
|
178
|
-
raise RuntimeError, error_msg
|
|
179
204
|
end
|
|
180
205
|
|
|
181
206
|
# Store the parameters for saving
|
|
@@ -196,10 +221,10 @@ module Ragnar
|
|
|
196
221
|
end
|
|
197
222
|
|
|
198
223
|
def apply(batch_size: 100)
|
|
199
|
-
# Load the trained UMAP model
|
|
200
|
-
|
|
224
|
+
# Load the trained UMAP model
|
|
225
|
+
umap_model = load_umap_model
|
|
201
226
|
|
|
202
|
-
puts "Applying
|
|
227
|
+
puts "Applying UMAP transformation to database documents..."
|
|
203
228
|
|
|
204
229
|
# Get all embeddings from database
|
|
205
230
|
all_docs = @database.get_embeddings
|
|
@@ -214,84 +239,95 @@ module Ragnar
|
|
|
214
239
|
end
|
|
215
240
|
|
|
216
241
|
puts "Found #{all_docs.size} documents in database"
|
|
217
|
-
puts "Loaded #{reduced_embeddings.size} reduced embeddings from model"
|
|
218
|
-
|
|
219
|
-
if all_docs.size != reduced_embeddings.size
|
|
220
|
-
puts "⚠️ Warning: Mismatch between database documents (#{all_docs.size}) and model embeddings (#{reduced_embeddings.size})"
|
|
221
|
-
puts " This suggests the model was trained on a different dataset."
|
|
222
|
-
puts " Please retrain the UMAP model after indexing all your documents."
|
|
223
|
-
return {
|
|
224
|
-
processed: 0,
|
|
225
|
-
skipped: 0,
|
|
226
|
-
errors: 1
|
|
227
|
-
}
|
|
228
|
-
end
|
|
229
242
|
|
|
230
|
-
#
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
243
|
+
# Process in batches for memory efficiency
|
|
244
|
+
processed_count = 0
|
|
245
|
+
error_count = 0
|
|
246
|
+
skipped_count = 0
|
|
247
|
+
|
|
248
|
+
all_docs.each_slice(batch_size) do |batch|
|
|
249
|
+
begin
|
|
250
|
+
# Extract embeddings
|
|
251
|
+
embeddings = batch.map { |d| d[:embedding] }
|
|
252
|
+
|
|
253
|
+
# Validate embeddings
|
|
254
|
+
valid_indices = []
|
|
255
|
+
embeddings_to_transform = []
|
|
256
|
+
|
|
257
|
+
embeddings.each_with_index do |emb, idx|
|
|
258
|
+
if emb.nil? || !emb.is_a?(Array) || emb.empty?
|
|
259
|
+
skipped_count += 1
|
|
260
|
+
next
|
|
261
|
+
end
|
|
262
|
+
|
|
263
|
+
if emb.any? { |v| !v.is_a?(Numeric) || v.nan? || !v.finite? }
|
|
264
|
+
skipped_count += 1
|
|
265
|
+
next
|
|
266
|
+
end
|
|
267
|
+
|
|
268
|
+
valid_indices << idx
|
|
269
|
+
embeddings_to_transform << emb
|
|
270
|
+
end
|
|
271
|
+
|
|
272
|
+
next if embeddings_to_transform.empty?
|
|
273
|
+
|
|
274
|
+
# Transform using the loaded UMAP model
|
|
275
|
+
reduced_embeddings = umap_model.transform(embeddings_to_transform)
|
|
276
|
+
|
|
277
|
+
# Prepare updates for valid documents
|
|
278
|
+
updates = valid_indices.map.with_index do |batch_idx, transform_idx|
|
|
279
|
+
{
|
|
280
|
+
id: batch[batch_idx][:id],
|
|
281
|
+
reduced_embedding: reduced_embeddings[transform_idx]
|
|
282
|
+
}
|
|
283
|
+
end
|
|
284
|
+
|
|
285
|
+
# Update database
|
|
286
|
+
@database.update_reduced_embeddings(updates)
|
|
287
|
+
processed_count += updates.size
|
|
288
|
+
|
|
289
|
+
puts " Processed batch: #{updates.size} documents transformed"
|
|
290
|
+
rescue => e
|
|
291
|
+
puts " ⚠️ Error processing batch: #{e.message}"
|
|
292
|
+
error_count += batch.size
|
|
293
|
+
end
|
|
236
294
|
end
|
|
237
295
|
|
|
238
|
-
puts "
|
|
239
|
-
|
|
296
|
+
puts "\nUMAP application complete:"
|
|
297
|
+
puts " ✓ Processed: #{processed_count} documents"
|
|
298
|
+
puts " ⚠️ Skipped: #{skipped_count} documents (invalid embeddings)" if skipped_count > 0
|
|
299
|
+
puts " ❌ Errors: #{error_count} documents" if error_count > 0
|
|
240
300
|
|
|
241
301
|
{
|
|
242
|
-
processed:
|
|
243
|
-
skipped:
|
|
244
|
-
errors:
|
|
302
|
+
processed: processed_count,
|
|
303
|
+
skipped: skipped_count,
|
|
304
|
+
errors: error_count
|
|
245
305
|
}
|
|
246
306
|
end
|
|
247
307
|
|
|
248
308
|
private
|
|
249
309
|
|
|
250
|
-
def process_batch(docs)
|
|
251
|
-
# Extract embeddings
|
|
252
|
-
embeddings = docs.map { |d| d[:embedding] }
|
|
253
|
-
|
|
254
|
-
# Transform using UMAP
|
|
255
|
-
# The transform method returns a 2D array where each row is a reduced embedding
|
|
256
|
-
reduced = @umap_model.transform(embeddings)
|
|
257
|
-
|
|
258
|
-
# Prepare updates
|
|
259
|
-
updates = docs.each_with_index.map do |doc, idx|
|
|
260
|
-
{
|
|
261
|
-
id: doc[:id],
|
|
262
|
-
reduced_embedding: reduced[idx]
|
|
263
|
-
}
|
|
264
|
-
end
|
|
265
|
-
|
|
266
|
-
# Update database
|
|
267
|
-
@database.update_reduced_embeddings(updates)
|
|
268
|
-
end
|
|
269
|
-
|
|
270
310
|
def save_model
|
|
271
|
-
return unless @umap_instance
|
|
311
|
+
return unless @umap_instance
|
|
272
312
|
|
|
273
|
-
# Save the trained UMAP model for transforming new
|
|
313
|
+
# Save the trained UMAP model for transforming new data
|
|
274
314
|
@umap_instance.save_model(@model_path)
|
|
275
315
|
puts "UMAP model saved to: #{@model_path}"
|
|
276
316
|
|
|
277
|
-
#
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
317
|
+
# Save metadata about the training if we have params
|
|
318
|
+
if @model_params
|
|
319
|
+
metadata_path = @model_path.sub(/\.bin$/, '_metadata.json')
|
|
320
|
+
metadata = {
|
|
321
|
+
trained_at: Time.now.iso8601,
|
|
322
|
+
n_components: @model_params[:n_components],
|
|
323
|
+
n_neighbors: @model_params[:n_neighbors],
|
|
324
|
+
min_dist: @model_params[:min_dist],
|
|
325
|
+
document_count: @database.get_embeddings.size,
|
|
326
|
+
model_version: 2 # Version 2: proper transform-based approach
|
|
327
|
+
}
|
|
328
|
+
File.write(metadata_path, JSON.pretty_generate(metadata))
|
|
329
|
+
puts "Model metadata saved to: #{metadata_path}"
|
|
290
330
|
end
|
|
291
|
-
|
|
292
|
-
@reduced_embeddings = ClusterKit::Dimensionality::UMAP.load_data(embeddings_path)
|
|
293
|
-
puts "Cached embeddings loaded from: #{embeddings_path}"
|
|
294
|
-
@reduced_embeddings
|
|
295
331
|
end
|
|
296
332
|
|
|
297
333
|
def load_umap_model
|
|
@@ -1,124 +1,205 @@
|
|
|
1
|
+
require 'json'
|
|
1
2
|
require 'clusterkit'
|
|
2
3
|
|
|
3
4
|
module Ragnar
|
|
5
|
+
# Service for applying UMAP transformations to embeddings
|
|
6
|
+
# Separates transformation logic from training (UmapProcessor)
|
|
4
7
|
class UmapTransformService
|
|
5
|
-
|
|
8
|
+
attr_reader :model_path, :database
|
|
6
9
|
|
|
7
|
-
def initialize
|
|
10
|
+
def initialize(model_path: "umap_model.bin", database:)
|
|
11
|
+
@model_path = model_path
|
|
12
|
+
@database = database
|
|
8
13
|
@umap_model = nil
|
|
9
|
-
@
|
|
14
|
+
@model_metadata = nil
|
|
10
15
|
end
|
|
11
16
|
|
|
12
|
-
# Transform
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
17
|
+
# Transform embeddings for specific documents
|
|
18
|
+
# @param document_ids [Array<Integer>] IDs of documents to transform
|
|
19
|
+
# @return [Hash] Results with :processed, :skipped, :errors counts
|
|
20
|
+
def transform_documents(document_ids)
|
|
21
|
+
return { processed: 0, skipped: 0, errors: 0 } if document_ids.empty?
|
|
16
22
|
|
|
17
|
-
|
|
18
|
-
load_model(model_path) unless @umap_model
|
|
23
|
+
load_model!
|
|
19
24
|
|
|
20
|
-
#
|
|
21
|
-
|
|
22
|
-
result = @umap_model.transform([query_embedding])
|
|
25
|
+
# Fetch documents
|
|
26
|
+
documents = @database.get_documents_by_ids(document_ids)
|
|
23
27
|
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
# Fall back to k-NN approximation if model loading fails
|
|
28
|
-
puts "Warning: Could not use UMAP model for transform: #{e.message}"
|
|
29
|
-
puts "Falling back to k-NN approximation..."
|
|
30
|
-
knn_approximate_transform(query_embedding)
|
|
31
|
-
end
|
|
32
|
-
|
|
33
|
-
# Check if we can do transforms
|
|
34
|
-
def model_available?(model_path = nil)
|
|
35
|
-
model_path ||= @model_path
|
|
28
|
+
if documents.empty?
|
|
29
|
+
return { processed: 0, skipped: 0, errors: 0 }
|
|
30
|
+
end
|
|
36
31
|
|
|
37
|
-
#
|
|
38
|
-
|
|
39
|
-
|
|
32
|
+
# Extract and validate embeddings
|
|
33
|
+
valid_docs = []
|
|
34
|
+
embeddings_to_transform = []
|
|
35
|
+
skipped_count = 0
|
|
36
|
+
|
|
37
|
+
documents.each do |doc|
|
|
38
|
+
emb = doc[:embedding]
|
|
39
|
+
|
|
40
|
+
if emb.nil? || !emb.is_a?(Array) || emb.empty?
|
|
41
|
+
skipped_count += 1
|
|
42
|
+
next
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
if emb.any? { |v| !v.is_a?(Numeric) || v.nan? || !v.finite? }
|
|
46
|
+
skipped_count += 1
|
|
47
|
+
next
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
valid_docs << doc
|
|
51
|
+
embeddings_to_transform << emb
|
|
40
52
|
end
|
|
41
53
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
54
|
+
return { processed: 0, skipped: skipped_count, errors: 0 } if embeddings_to_transform.empty?
|
|
55
|
+
|
|
56
|
+
# Transform using UMAP
|
|
57
|
+
begin
|
|
58
|
+
reduced_embeddings = @umap_model.transform(embeddings_to_transform)
|
|
59
|
+
|
|
60
|
+
# Prepare updates
|
|
61
|
+
updates = valid_docs.zip(reduced_embeddings).map do |doc, reduced_emb|
|
|
62
|
+
{
|
|
63
|
+
id: doc[:id],
|
|
64
|
+
reduced_embedding: reduced_emb,
|
|
65
|
+
umap_version: model_version
|
|
66
|
+
}
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
# Update database
|
|
70
|
+
@database.update_reduced_embeddings(updates)
|
|
71
|
+
|
|
72
|
+
{ processed: updates.size, skipped: skipped_count, errors: 0 }
|
|
73
|
+
rescue => e
|
|
74
|
+
puts "Error transforming documents: #{e.message}"
|
|
75
|
+
{ processed: 0, skipped: skipped_count, errors: valid_docs.size }
|
|
76
|
+
end
|
|
46
77
|
end
|
|
47
78
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
79
|
+
# Transform a single query embedding
|
|
80
|
+
# @param embedding [Array<Numeric>] Query embedding to transform
|
|
81
|
+
# @return [Array<Float>, nil] Reduced embedding or nil if error
|
|
82
|
+
def transform_query(embedding)
|
|
83
|
+
return nil if embedding.nil? || !embedding.is_a?(Array) || embedding.empty?
|
|
84
|
+
|
|
85
|
+
# Validate embedding
|
|
86
|
+
if embedding.any? { |v| !v.is_a?(Numeric) || v.nan? || !v.finite? }
|
|
87
|
+
puts "Warning: Invalid query embedding (contains NaN or Infinity)"
|
|
88
|
+
return nil
|
|
53
89
|
end
|
|
54
90
|
|
|
55
|
-
|
|
56
|
-
|
|
91
|
+
load_model!
|
|
92
|
+
|
|
93
|
+
begin
|
|
94
|
+
# Transform returns array of arrays, get first (and only) result
|
|
95
|
+
@umap_model.transform([embedding]).first
|
|
96
|
+
rescue => e
|
|
97
|
+
puts "Error transforming query: #{e.message}"
|
|
98
|
+
nil
|
|
99
|
+
end
|
|
57
100
|
end
|
|
58
101
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
102
|
+
# Check if a UMAP model exists
|
|
103
|
+
# @return [Boolean] true if model file exists
|
|
104
|
+
def model_exists?
|
|
105
|
+
File.exist?(@model_path)
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
# Get metadata about the trained model
|
|
109
|
+
# @return [Hash, nil] Model metadata or nil if not found
|
|
110
|
+
def model_metadata
|
|
111
|
+
return @model_metadata if @model_metadata
|
|
69
112
|
|
|
70
|
-
|
|
71
|
-
|
|
113
|
+
metadata_path = @model_path.sub(/\.bin$/, '_metadata.json')
|
|
114
|
+
return nil unless File.exist?(metadata_path)
|
|
72
115
|
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
116
|
+
@model_metadata = JSON.parse(File.read(metadata_path), symbolize_names: true)
|
|
117
|
+
rescue => e
|
|
118
|
+
puts "Error loading model metadata: #{e.message}"
|
|
119
|
+
nil
|
|
120
|
+
end
|
|
121
|
+
|
|
122
|
+
# Get the version of the current model
|
|
123
|
+
# @return [Integer] Model version (timestamp of file modification)
|
|
124
|
+
def model_version
|
|
125
|
+
return 0 unless File.exist?(@model_path)
|
|
126
|
+
File.mtime(@model_path).to_i
|
|
127
|
+
end
|
|
128
|
+
|
|
129
|
+
# Check if model needs retraining based on staleness
|
|
130
|
+
# @return [Hash] Staleness info with :needs_retraining, :coverage_percentage
|
|
131
|
+
def check_model_staleness
|
|
132
|
+
return { needs_retraining: true, coverage_percentage: 0, reason: "No model exists" } unless model_exists?
|
|
76
133
|
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
distance = euclidean_distance(query_embedding, doc[:embedding])
|
|
81
|
-
neighbors << { idx: idx, distance: distance, reduced: doc[:reduced_embedding] }
|
|
82
|
-
end
|
|
134
|
+
metadata = model_metadata
|
|
135
|
+
return { needs_retraining: true, coverage_percentage: 0, reason: "No metadata found" } unless metadata
|
|
83
136
|
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
k_nearest = neighbors.first(k)
|
|
137
|
+
trained_count = metadata[:document_count] || 0
|
|
138
|
+
current_count = @database.document_count
|
|
87
139
|
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
if k_nearest.empty?
|
|
91
|
-
raise "No neighbors found for transform"
|
|
140
|
+
if current_count == 0
|
|
141
|
+
return { needs_retraining: false, coverage_percentage: 100, reason: "No documents" }
|
|
92
142
|
end
|
|
93
143
|
|
|
94
|
-
|
|
95
|
-
|
|
144
|
+
coverage = (trained_count.to_f / current_count * 100).round(1)
|
|
145
|
+
staleness = 100 - coverage
|
|
146
|
+
|
|
147
|
+
{
|
|
148
|
+
needs_retraining: staleness > 30,
|
|
149
|
+
coverage_percentage: coverage,
|
|
150
|
+
trained_documents: trained_count,
|
|
151
|
+
current_documents: current_count,
|
|
152
|
+
staleness_percentage: staleness,
|
|
153
|
+
reason: staleness > 30 ? "Model covers only #{coverage}% of documents" : "Model is up to date"
|
|
154
|
+
}
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
private
|
|
158
|
+
|
|
159
|
+
def load_model!
|
|
160
|
+
return if @umap_model
|
|
96
161
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
k_nearest.each do |neighbor|
|
|
100
|
-
# Use inverse distance as weight (closer = higher weight)
|
|
101
|
-
weight = 1.0 / (neighbor[:distance] + 0.001) # Add small epsilon to avoid division by zero
|
|
102
|
-
total_weight += weight
|
|
103
|
-
|
|
104
|
-
neighbor[:reduced].each_with_index do |val, idx|
|
|
105
|
-
averaged[idx] += val * weight
|
|
106
|
-
end
|
|
162
|
+
unless File.exist?(@model_path)
|
|
163
|
+
raise "UMAP model not found at #{@model_path}. Please train a model first using 'ragnar umap train'."
|
|
107
164
|
end
|
|
108
165
|
|
|
109
|
-
|
|
110
|
-
averaged.map { |val| val / total_weight }
|
|
166
|
+
@umap_model = ClusterKit::Dimensionality::UMAP.load_model(@model_path)
|
|
111
167
|
end
|
|
168
|
+
end
|
|
169
|
+
|
|
170
|
+
# Singleton service for backwards compatibility
|
|
171
|
+
# This allows the old UmapTransformService.instance pattern to work
|
|
172
|
+
class UmapTransformServiceSingleton
|
|
173
|
+
include Singleton
|
|
112
174
|
|
|
113
|
-
def
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
175
|
+
def initialize
|
|
176
|
+
@database = Database.new(Config.instance.database_path)
|
|
177
|
+
@service = UmapTransformService.new(database: @database)
|
|
178
|
+
end
|
|
179
|
+
|
|
180
|
+
def transform_query(embedding, model_path = nil)
|
|
181
|
+
if model_path && model_path != @service.model_path
|
|
182
|
+
# Create a new service with different model path
|
|
183
|
+
service = UmapTransformService.new(model_path: model_path, database: @database)
|
|
184
|
+
service.transform_query(embedding)
|
|
185
|
+
else
|
|
186
|
+
@service.transform_query(embedding)
|
|
120
187
|
end
|
|
121
|
-
|
|
188
|
+
end
|
|
189
|
+
|
|
190
|
+
def model_available?(model_path = nil)
|
|
191
|
+
if model_path
|
|
192
|
+
File.exist?(model_path)
|
|
193
|
+
else
|
|
194
|
+
@service.model_exists?
|
|
195
|
+
end
|
|
196
|
+
end
|
|
197
|
+
end
|
|
198
|
+
|
|
199
|
+
# For backwards compatibility - old code uses UmapTransformService.instance
|
|
200
|
+
class << UmapTransformService
|
|
201
|
+
def instance
|
|
202
|
+
UmapTransformServiceSingleton.instance
|
|
122
203
|
end
|
|
123
204
|
end
|
|
124
205
|
end
|
data/lib/ragnar/version.rb
CHANGED
data/lib/ragnar.rb
CHANGED